diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/cpuset.c | 418 |
1 files changed, 281 insertions, 137 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cd54dba2be18..7491352276b2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -60,6 +60,9 @@ struct cpuset { | |||
| 60 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 60 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
| 61 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 61 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
| 62 | 62 | ||
| 63 | /* | ||
| 64 | * Count is atomic so can incr (fork) or decr (exit) without a lock. | ||
| 65 | */ | ||
| 63 | atomic_t count; /* count tasks using this cpuset */ | 66 | atomic_t count; /* count tasks using this cpuset */ |
| 64 | 67 | ||
| 65 | /* | 68 | /* |
| @@ -142,44 +145,91 @@ static struct vfsmount *cpuset_mount; | |||
| 142 | static struct super_block *cpuset_sb = NULL; | 145 | static struct super_block *cpuset_sb = NULL; |
| 143 | 146 | ||
| 144 | /* | 147 | /* |
| 145 | * cpuset_sem should be held by anyone who is depending on the children | 148 | * We have two global cpuset semaphores below. They can nest. |
| 146 | * or sibling lists of any cpuset, or performing non-atomic operations | 149 | * It is ok to first take manage_sem, then nest callback_sem. We also |
| 147 | * on the flags or *_allowed values of a cpuset, such as raising the | 150 | * require taking task_lock() when dereferencing a task's cpuset pointer. |
| 148 | * CS_REMOVED flag bit iff it is not already raised, or reading and | 151 | * See "The task_lock() exception", at the end of this comment. |
| 149 | * conditionally modifying the *_allowed values. One kernel global | 152 | * |
| 150 | * cpuset semaphore should be sufficient - these things don't change | 153 | * A task must hold both semaphores to modify cpusets. If a task |
| 151 | * that much. | 154 | * holds manage_sem, then it blocks others wanting that semaphore, |
| 152 | * | 155 | * ensuring that it is the only task able to also acquire callback_sem |
| 153 | * The code that modifies cpusets holds cpuset_sem across the entire | 156 | * and be able to modify cpusets. It can perform various checks on |
| 154 | * operation, from cpuset_common_file_write() down, single threading | 157 | * the cpuset structure first, knowing nothing will change. It can |
| 155 | * all cpuset modifications (except for counter manipulations from | 158 | * also allocate memory while just holding manage_sem. While it is |
| 156 | * fork and exit) across the system. This presumes that cpuset | 159 | * performing these checks, various callback routines can briefly |
| 157 | * modifications are rare - better kept simple and safe, even if slow. | 160 | * acquire callback_sem to query cpusets. Once it is ready to make |
| 158 | * | 161 | * the changes, it takes callback_sem, blocking everyone else. |
| 159 | * The code that reads cpusets, such as in cpuset_common_file_read() | 162 | * |
| 160 | * and below, only holds cpuset_sem across small pieces of code, such | 163 | * Calls to the kernel memory allocator can not be made while holding |
| 161 | * as when reading out possibly multi-word cpumasks and nodemasks, as | 164 | * callback_sem, as that would risk double tripping on callback_sem |
| 162 | * the risks are less, and the desire for performance a little greater. | 165 | * from one of the callbacks into the cpuset code from within |
| 163 | * The proc_cpuset_show() routine needs to hold cpuset_sem to insure | 166 | * __alloc_pages(). |
| 164 | * that no cs->dentry is NULL, as it walks up the cpuset tree to root. | 167 | * |
| 165 | * | 168 | * If a task is only holding callback_sem, then it has read-only |
| 166 | * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't | 169 | * access to cpusets. |
| 167 | * (usually) grab cpuset_sem. These are the two most performance | 170 | * |
| 168 | * critical pieces of code here. The exception occurs on exit(), | 171 | * The task_struct fields mems_allowed and mems_generation may only |
| 169 | * when a task in a notify_on_release cpuset exits. Then cpuset_sem | 172 | * be accessed in the context of that task, so require no locks. |
| 173 | * | ||
| 174 | * Any task can increment and decrement the count field without lock. | ||
| 175 | * So in general, code holding manage_sem or callback_sem can't rely | ||
| 176 | * on the count field not changing. However, if the count goes to | ||
| 177 | * zero, then only attach_task(), which holds both semaphores, can | ||
| 178 | * increment it again. Because a count of zero means that no tasks | ||
| 179 | * are currently attached, therefore there is no way a task attached | ||
| 180 | * to that cpuset can fork (the other way to increment the count). | ||
| 181 | * So code holding manage_sem or callback_sem can safely assume that | ||
| 182 | * if the count is zero, it will stay zero. Similarly, if a task | ||
| 183 | * holds manage_sem or callback_sem on a cpuset with zero count, it | ||
| 184 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs | ||
| 185 | * both of those semaphores. | ||
| 186 | * | ||
| 187 | * A possible optimization to improve parallelism would be to make | ||
| 188 | * callback_sem a R/W semaphore (rwsem), allowing the callback routines | ||
| 189 | * to proceed in parallel, with read access, until the holder of | ||
| 190 | * manage_sem needed to take this rwsem for exclusive write access | ||
| 191 | * and modify some cpusets. | ||
| 192 | * | ||
| 193 | * The cpuset_common_file_write handler for operations that modify | ||
| 194 | * the cpuset hierarchy holds manage_sem across the entire operation, | ||
| 195 | * single threading all such cpuset modifications across the system. | ||
| 196 | * | ||
| 197 | * The cpuset_common_file_read() handlers only hold callback_sem across | ||
| 198 | * small pieces of code, such as when reading out possibly multi-word | ||
| 199 | * cpumasks and nodemasks. | ||
| 200 | * | ||
| 201 | * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't | ||
| 202 | * (usually) take either semaphore. These are the two most performance | ||
| 203 | * critical pieces of code here. The exception occurs on cpuset_exit(), | ||
| 204 | * when a task in a notify_on_release cpuset exits. Then manage_sem | ||
| 170 | * is taken, and if the cpuset count is zero, a usermode call made | 205 | * is taken, and if the cpuset count is zero, a usermode call made |
| 171 | * to /sbin/cpuset_release_agent with the name of the cpuset (path | 206 | * to /sbin/cpuset_release_agent with the name of the cpuset (path |
| 172 | * relative to the root of cpuset file system) as the argument. | 207 | * relative to the root of cpuset file system) as the argument. |
| 173 | * | 208 | * |
| 174 | * A cpuset can only be deleted if both its 'count' of using tasks is | 209 | * A cpuset can only be deleted if both its 'count' of using tasks |
| 175 | * zero, and its list of 'children' cpusets is empty. Since all tasks | 210 | * is zero, and its list of 'children' cpusets is empty. Since all |
| 176 | * in the system use _some_ cpuset, and since there is always at least | 211 | * tasks in the system use _some_ cpuset, and since there is always at |
| 177 | * one task in the system (init, pid == 1), therefore, top_cpuset | 212 | * least one task in the system (init, pid == 1), therefore, top_cpuset |
| 178 | * always has either children cpusets and/or using tasks. So no need | 213 | * always has either children cpusets and/or using tasks. So we don't |
| 179 | * for any special hack to ensure that top_cpuset cannot be deleted. | 214 | * need a special hack to ensure that top_cpuset cannot be deleted. |
| 215 | * | ||
| 216 | * The above "Tale of Two Semaphores" would be complete, but for: | ||
| 217 | * | ||
| 218 | * The task_lock() exception | ||
| 219 | * | ||
| 220 | * The need for this exception arises from the action of attach_task(), | ||
| 221 | * which overwrites one task's cpuset pointer with another. It does | ||
| 222 | * so using both semaphores, however there are several performance | ||
| 223 | * critical places that need to reference task->cpuset without the | ||
| 224 | * expense of grabbing a system global semaphore. Therefore except as | ||
| 225 | * noted below, when dereferencing or, as in attach_task(), modifying | ||
| 226 | * a task's cpuset pointer we use task_lock(), which acts on a spinlock | ||
| 227 | * (task->alloc_lock) already in the task_struct routinely used for | ||
| 228 | * such matters. | ||
| 180 | */ | 229 | */ |
| 181 | 230 | ||
| 182 | static DECLARE_MUTEX(cpuset_sem); | 231 | static DECLARE_MUTEX(manage_sem); |
| 232 | static DECLARE_MUTEX(callback_sem); | ||
| 183 | 233 | ||
| 184 | /* | 234 | /* |
| 185 | * A couple of forward declarations required, due to cyclic reference loop: | 235 | * A couple of forward declarations required, due to cyclic reference loop: |
| @@ -354,7 +404,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) | |||
| 354 | } | 404 | } |
| 355 | 405 | ||
| 356 | /* | 406 | /* |
| 357 | * Call with cpuset_sem held. Writes path of cpuset into buf. | 407 | * Call with manage_sem held. Writes path of cpuset into buf. |
| 358 | * Returns 0 on success, -errno on error. | 408 | * Returns 0 on success, -errno on error. |
| 359 | */ | 409 | */ |
| 360 | 410 | ||
| @@ -406,10 +456,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) | |||
| 406 | * status of the /sbin/cpuset_release_agent task, so no sense holding | 456 | * status of the /sbin/cpuset_release_agent task, so no sense holding |
| 407 | * our caller up for that. | 457 | * our caller up for that. |
| 408 | * | 458 | * |
| 409 | * The simple act of forking that task might require more memory, | 459 | * When we had only one cpuset semaphore, we had to call this |
| 410 | * which might need cpuset_sem. So this routine must be called while | 460 | * without holding it, to avoid deadlock when call_usermodehelper() |
| 411 | * cpuset_sem is not held, to avoid a possible deadlock. See also | 461 | * allocated memory. With two locks, we could now call this while |
| 412 | * comments for check_for_release(), below. | 462 | * holding manage_sem, but we still don't, so as to minimize |
| 463 | * the time manage_sem is held. | ||
| 413 | */ | 464 | */ |
| 414 | 465 | ||
| 415 | static void cpuset_release_agent(const char *pathbuf) | 466 | static void cpuset_release_agent(const char *pathbuf) |
| @@ -441,15 +492,15 @@ static void cpuset_release_agent(const char *pathbuf) | |||
| 441 | * cs is notify_on_release() and now both the user count is zero and | 492 | * cs is notify_on_release() and now both the user count is zero and |
| 442 | * the list of children is empty, prepare cpuset path in a kmalloc'd | 493 | * the list of children is empty, prepare cpuset path in a kmalloc'd |
| 443 | * buffer, to be returned via ppathbuf, so that the caller can invoke | 494 | * buffer, to be returned via ppathbuf, so that the caller can invoke |
| 444 | * cpuset_release_agent() with it later on, once cpuset_sem is dropped. | 495 | * cpuset_release_agent() with it later on, once manage_sem is dropped. |
| 445 | * Call here with cpuset_sem held. | 496 | * Call here with manage_sem held. |
| 446 | * | 497 | * |
| 447 | * This check_for_release() routine is responsible for kmalloc'ing | 498 | * This check_for_release() routine is responsible for kmalloc'ing |
| 448 | * pathbuf. The above cpuset_release_agent() is responsible for | 499 | * pathbuf. The above cpuset_release_agent() is responsible for |
| 449 | * kfree'ing pathbuf. The caller of these routines is responsible | 500 | * kfree'ing pathbuf. The caller of these routines is responsible |
| 450 | * for providing a pathbuf pointer, initialized to NULL, then | 501 | * for providing a pathbuf pointer, initialized to NULL, then |
| 451 | * calling check_for_release() with cpuset_sem held and the address | 502 | * calling check_for_release() with manage_sem held and the address |
| 452 | * of the pathbuf pointer, then dropping cpuset_sem, then calling | 503 | * of the pathbuf pointer, then dropping manage_sem, then calling |
| 453 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). | 504 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). |
| 454 | */ | 505 | */ |
| 455 | 506 | ||
| @@ -480,7 +531,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) | |||
| 480 | * One way or another, we guarantee to return some non-empty subset | 531 | * One way or another, we guarantee to return some non-empty subset |
| 481 | * of cpu_online_map. | 532 | * of cpu_online_map. |
| 482 | * | 533 | * |
| 483 | * Call with cpuset_sem held. | 534 | * Call with callback_sem held. |
| 484 | */ | 535 | */ |
| 485 | 536 | ||
| 486 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | 537 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) |
| @@ -504,7 +555,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | |||
| 504 | * One way or another, we guarantee to return some non-empty subset | 555 | * One way or another, we guarantee to return some non-empty subset |
| 505 | * of node_online_map. | 556 | * of node_online_map. |
| 506 | * | 557 | * |
| 507 | * Call with cpuset_sem held. | 558 | * Call with callback_sem held. |
| 508 | */ | 559 | */ |
| 509 | 560 | ||
| 510 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 561 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
| @@ -519,31 +570,44 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
| 519 | } | 570 | } |
| 520 | 571 | ||
| 521 | /* | 572 | /* |
| 522 | * Refresh current tasks mems_allowed and mems_generation from | 573 | * Refresh current tasks mems_allowed and mems_generation from current |
| 523 | * current tasks cpuset. Call with cpuset_sem held. | 574 | * tasks cpuset. |
| 524 | * | 575 | * |
| 525 | * Be sure to call refresh_mems() on any cpuset operation which | 576 | * Call without callback_sem or task_lock() held. May be called with |
| 526 | * (1) holds cpuset_sem, and (2) might possibly alloc memory. | 577 | * or without manage_sem held. Will acquire task_lock() and might |
| 527 | * Call after obtaining cpuset_sem lock, before any possible | 578 | * acquire callback_sem during call. |
| 528 | * allocation. Otherwise one risks trying to allocate memory | 579 | * |
| 529 | * while the task cpuset_mems_generation is not the same as | 580 | * The task_lock() is required to dereference current->cpuset safely. |
| 530 | * the mems_generation in its cpuset, which would deadlock on | 581 | * Without it, we could pick up the pointer value of current->cpuset |
| 531 | * cpuset_sem in cpuset_update_current_mems_allowed(). | 582 | * in one instruction, and then attach_task could give us a different |
| 532 | * | 583 | * cpuset, and then the cpuset we had could be removed and freed, |
| 533 | * Since we hold cpuset_sem, once refresh_mems() is called, the | 584 | * and then on our next instruction, we could dereference a no longer |
| 534 | * test (current->cpuset_mems_generation != cs->mems_generation) | 585 | * valid cpuset pointer to get its mems_generation field. |
| 535 | * in cpuset_update_current_mems_allowed() will remain false, | 586 | * |
| 536 | * until we drop cpuset_sem. Anyone else who would change our | 587 | * This routine is needed to update the per-task mems_allowed data, |
| 537 | * cpusets mems_generation needs to lock cpuset_sem first. | 588 | * within the tasks context, when it is trying to allocate memory |
| 589 | * (in various mm/mempolicy.c routines) and notices that some other | ||
| 590 | * task has been modifying its cpuset. | ||
| 538 | */ | 591 | */ |
| 539 | 592 | ||
| 540 | static void refresh_mems(void) | 593 | static void refresh_mems(void) |
| 541 | { | 594 | { |
| 542 | struct cpuset *cs = current->cpuset; | 595 | int my_cpusets_mem_gen; |
| 596 | |||
| 597 | task_lock(current); | ||
| 598 | my_cpusets_mem_gen = current->cpuset->mems_generation; | ||
| 599 | task_unlock(current); | ||
| 543 | 600 | ||
| 544 | if (current->cpuset_mems_generation != cs->mems_generation) { | 601 | if (current->cpuset_mems_generation != my_cpusets_mem_gen) { |
| 602 | struct cpuset *cs; | ||
| 603 | |||
| 604 | down(&callback_sem); | ||
| 605 | task_lock(current); | ||
| 606 | cs = current->cpuset; | ||
| 545 | guarantee_online_mems(cs, &current->mems_allowed); | 607 | guarantee_online_mems(cs, &current->mems_allowed); |
| 546 | current->cpuset_mems_generation = cs->mems_generation; | 608 | current->cpuset_mems_generation = cs->mems_generation; |
| 609 | task_unlock(current); | ||
| 610 | up(&callback_sem); | ||
| 547 | } | 611 | } |
| 548 | } | 612 | } |
| 549 | 613 | ||
| @@ -552,7 +616,7 @@ static void refresh_mems(void) | |||
| 552 | * | 616 | * |
| 553 | * One cpuset is a subset of another if all its allowed CPUs and | 617 | * One cpuset is a subset of another if all its allowed CPUs and |
| 554 | * Memory Nodes are a subset of the other, and its exclusive flags | 618 | * Memory Nodes are a subset of the other, and its exclusive flags |
| 555 | * are only set if the other's are set. | 619 | * are only set if the other's are set. Call holding manage_sem. |
| 556 | */ | 620 | */ |
| 557 | 621 | ||
| 558 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 622 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
| @@ -570,7 +634,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
| 570 | * If we replaced the flag and mask values of the current cpuset | 634 | * If we replaced the flag and mask values of the current cpuset |
| 571 | * (cur) with those values in the trial cpuset (trial), would | 635 | * (cur) with those values in the trial cpuset (trial), would |
| 572 | * our various subset and exclusive rules still be valid? Presumes | 636 | * our various subset and exclusive rules still be valid? Presumes |
| 573 | * cpuset_sem held. | 637 | * manage_sem held. |
| 574 | * | 638 | * |
| 575 | * 'cur' is the address of an actual, in-use cpuset. Operations | 639 | * 'cur' is the address of an actual, in-use cpuset. Operations |
| 576 | * such as list traversal that depend on the actual address of the | 640 | * such as list traversal that depend on the actual address of the |
| @@ -624,7 +688,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 624 | * exclusive child cpusets | 688 | * exclusive child cpusets |
| 625 | * Build these two partitions by calling partition_sched_domains | 689 | * Build these two partitions by calling partition_sched_domains |
| 626 | * | 690 | * |
| 627 | * Call with cpuset_sem held. May nest a call to the | 691 | * Call with manage_sem held. May nest a call to the |
| 628 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 692 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. |
| 629 | */ | 693 | */ |
| 630 | 694 | ||
| @@ -669,6 +733,10 @@ static void update_cpu_domains(struct cpuset *cur) | |||
| 669 | unlock_cpu_hotplug(); | 733 | unlock_cpu_hotplug(); |
| 670 | } | 734 | } |
| 671 | 735 | ||
| 736 | /* | ||
| 737 | * Call with manage_sem held. May take callback_sem during call. | ||
| 738 | */ | ||
| 739 | |||
| 672 | static int update_cpumask(struct cpuset *cs, char *buf) | 740 | static int update_cpumask(struct cpuset *cs, char *buf) |
| 673 | { | 741 | { |
| 674 | struct cpuset trialcs; | 742 | struct cpuset trialcs; |
| @@ -685,12 +753,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
| 685 | if (retval < 0) | 753 | if (retval < 0) |
| 686 | return retval; | 754 | return retval; |
| 687 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | 755 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); |
| 756 | down(&callback_sem); | ||
| 688 | cs->cpus_allowed = trialcs.cpus_allowed; | 757 | cs->cpus_allowed = trialcs.cpus_allowed; |
| 758 | up(&callback_sem); | ||
| 689 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | 759 | if (is_cpu_exclusive(cs) && !cpus_unchanged) |
| 690 | update_cpu_domains(cs); | 760 | update_cpu_domains(cs); |
| 691 | return 0; | 761 | return 0; |
| 692 | } | 762 | } |
| 693 | 763 | ||
| 764 | /* | ||
| 765 | * Call with manage_sem held. May take callback_sem during call. | ||
| 766 | */ | ||
| 767 | |||
| 694 | static int update_nodemask(struct cpuset *cs, char *buf) | 768 | static int update_nodemask(struct cpuset *cs, char *buf) |
| 695 | { | 769 | { |
| 696 | struct cpuset trialcs; | 770 | struct cpuset trialcs; |
| @@ -705,9 +779,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
| 705 | return -ENOSPC; | 779 | return -ENOSPC; |
| 706 | retval = validate_change(cs, &trialcs); | 780 | retval = validate_change(cs, &trialcs); |
| 707 | if (retval == 0) { | 781 | if (retval == 0) { |
| 782 | down(&callback_sem); | ||
| 708 | cs->mems_allowed = trialcs.mems_allowed; | 783 | cs->mems_allowed = trialcs.mems_allowed; |
| 709 | atomic_inc(&cpuset_mems_generation); | 784 | atomic_inc(&cpuset_mems_generation); |
| 710 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 785 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
| 786 | up(&callback_sem); | ||
| 711 | } | 787 | } |
| 712 | return retval; | 788 | return retval; |
| 713 | } | 789 | } |
| @@ -718,6 +794,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
| 718 | * CS_NOTIFY_ON_RELEASE) | 794 | * CS_NOTIFY_ON_RELEASE) |
| 719 | * cs: the cpuset to update | 795 | * cs: the cpuset to update |
| 720 | * buf: the buffer where we read the 0 or 1 | 796 | * buf: the buffer where we read the 0 or 1 |
| 797 | * | ||
| 798 | * Call with manage_sem held. | ||
| 721 | */ | 799 | */ |
| 722 | 800 | ||
| 723 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | 801 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) |
| @@ -739,16 +817,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
| 739 | return err; | 817 | return err; |
| 740 | cpu_exclusive_changed = | 818 | cpu_exclusive_changed = |
| 741 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); | 819 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
| 820 | down(&callback_sem); | ||
| 742 | if (turning_on) | 821 | if (turning_on) |
| 743 | set_bit(bit, &cs->flags); | 822 | set_bit(bit, &cs->flags); |
| 744 | else | 823 | else |
| 745 | clear_bit(bit, &cs->flags); | 824 | clear_bit(bit, &cs->flags); |
| 825 | up(&callback_sem); | ||
| 746 | 826 | ||
| 747 | if (cpu_exclusive_changed) | 827 | if (cpu_exclusive_changed) |
| 748 | update_cpu_domains(cs); | 828 | update_cpu_domains(cs); |
| 749 | return 0; | 829 | return 0; |
| 750 | } | 830 | } |
| 751 | 831 | ||
| 832 | /* | ||
| 833 | * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly | ||
| 834 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | ||
| 835 | * notified on release. | ||
| 836 | * | ||
| 837 | * Call holding manage_sem. May take callback_sem and task_lock of | ||
| 838 | * the task 'pid' during call. | ||
| 839 | */ | ||
| 840 | |||
| 752 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | 841 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) |
| 753 | { | 842 | { |
| 754 | pid_t pid; | 843 | pid_t pid; |
| @@ -765,7 +854,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
| 765 | read_lock(&tasklist_lock); | 854 | read_lock(&tasklist_lock); |
| 766 | 855 | ||
| 767 | tsk = find_task_by_pid(pid); | 856 | tsk = find_task_by_pid(pid); |
| 768 | if (!tsk) { | 857 | if (!tsk || tsk->flags & PF_EXITING) { |
| 769 | read_unlock(&tasklist_lock); | 858 | read_unlock(&tasklist_lock); |
| 770 | return -ESRCH; | 859 | return -ESRCH; |
| 771 | } | 860 | } |
| @@ -783,10 +872,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
| 783 | get_task_struct(tsk); | 872 | get_task_struct(tsk); |
| 784 | } | 873 | } |
| 785 | 874 | ||
| 875 | down(&callback_sem); | ||
| 876 | |||
| 786 | task_lock(tsk); | 877 | task_lock(tsk); |
| 787 | oldcs = tsk->cpuset; | 878 | oldcs = tsk->cpuset; |
| 788 | if (!oldcs) { | 879 | if (!oldcs) { |
| 789 | task_unlock(tsk); | 880 | task_unlock(tsk); |
| 881 | up(&callback_sem); | ||
| 790 | put_task_struct(tsk); | 882 | put_task_struct(tsk); |
| 791 | return -ESRCH; | 883 | return -ESRCH; |
| 792 | } | 884 | } |
| @@ -797,6 +889,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
| 797 | guarantee_online_cpus(cs, &cpus); | 889 | guarantee_online_cpus(cs, &cpus); |
| 798 | set_cpus_allowed(tsk, cpus); | 890 | set_cpus_allowed(tsk, cpus); |
| 799 | 891 | ||
| 892 | up(&callback_sem); | ||
| 800 | put_task_struct(tsk); | 893 | put_task_struct(tsk); |
| 801 | if (atomic_dec_and_test(&oldcs->count)) | 894 | if (atomic_dec_and_test(&oldcs->count)) |
| 802 | check_for_release(oldcs, ppathbuf); | 895 | check_for_release(oldcs, ppathbuf); |
| @@ -840,7 +933,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
| 840 | } | 933 | } |
| 841 | buffer[nbytes] = 0; /* nul-terminate */ | 934 | buffer[nbytes] = 0; /* nul-terminate */ |
| 842 | 935 | ||
| 843 | down(&cpuset_sem); | 936 | down(&manage_sem); |
| 844 | 937 | ||
| 845 | if (is_removed(cs)) { | 938 | if (is_removed(cs)) { |
| 846 | retval = -ENODEV; | 939 | retval = -ENODEV; |
| @@ -874,7 +967,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
| 874 | if (retval == 0) | 967 | if (retval == 0) |
| 875 | retval = nbytes; | 968 | retval = nbytes; |
| 876 | out2: | 969 | out2: |
| 877 | up(&cpuset_sem); | 970 | up(&manage_sem); |
| 878 | cpuset_release_agent(pathbuf); | 971 | cpuset_release_agent(pathbuf); |
| 879 | out1: | 972 | out1: |
| 880 | kfree(buffer); | 973 | kfree(buffer); |
| @@ -914,9 +1007,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
| 914 | { | 1007 | { |
| 915 | cpumask_t mask; | 1008 | cpumask_t mask; |
| 916 | 1009 | ||
| 917 | down(&cpuset_sem); | 1010 | down(&callback_sem); |
| 918 | mask = cs->cpus_allowed; | 1011 | mask = cs->cpus_allowed; |
| 919 | up(&cpuset_sem); | 1012 | up(&callback_sem); |
| 920 | 1013 | ||
| 921 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | 1014 | return cpulist_scnprintf(page, PAGE_SIZE, mask); |
| 922 | } | 1015 | } |
| @@ -925,9 +1018,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
| 925 | { | 1018 | { |
| 926 | nodemask_t mask; | 1019 | nodemask_t mask; |
| 927 | 1020 | ||
| 928 | down(&cpuset_sem); | 1021 | down(&callback_sem); |
| 929 | mask = cs->mems_allowed; | 1022 | mask = cs->mems_allowed; |
| 930 | up(&cpuset_sem); | 1023 | up(&callback_sem); |
| 931 | 1024 | ||
| 932 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 1025 | return nodelist_scnprintf(page, PAGE_SIZE, mask); |
| 933 | } | 1026 | } |
| @@ -1135,7 +1228,9 @@ struct ctr_struct { | |||
| 1135 | 1228 | ||
| 1136 | /* | 1229 | /* |
| 1137 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. | 1230 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. |
| 1138 | * Return actual number of pids loaded. | 1231 | * Return actual number of pids loaded. No need to task_lock(p) |
| 1232 | * when reading out p->cpuset, as we don't really care if it changes | ||
| 1233 | * on the next cycle, and we are not going to try to dereference it. | ||
| 1139 | */ | 1234 | */ |
| 1140 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) | 1235 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) |
| 1141 | { | 1236 | { |
| @@ -1177,6 +1272,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |||
| 1177 | return cnt; | 1272 | return cnt; |
| 1178 | } | 1273 | } |
| 1179 | 1274 | ||
| 1275 | /* | ||
| 1276 | * Handle an open on 'tasks' file. Prepare a buffer listing the | ||
| 1277 | * process id's of tasks currently attached to the cpuset being opened. | ||
| 1278 | * | ||
| 1279 | * Does not require any specific cpuset semaphores, and does not take any. | ||
| 1280 | */ | ||
| 1180 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | 1281 | static int cpuset_tasks_open(struct inode *unused, struct file *file) |
| 1181 | { | 1282 | { |
| 1182 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1283 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); |
| @@ -1324,7 +1425,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
| 1324 | if (!cs) | 1425 | if (!cs) |
| 1325 | return -ENOMEM; | 1426 | return -ENOMEM; |
| 1326 | 1427 | ||
| 1327 | down(&cpuset_sem); | 1428 | down(&manage_sem); |
| 1328 | refresh_mems(); | 1429 | refresh_mems(); |
| 1329 | cs->flags = 0; | 1430 | cs->flags = 0; |
| 1330 | if (notify_on_release(parent)) | 1431 | if (notify_on_release(parent)) |
| @@ -1339,25 +1440,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
| 1339 | 1440 | ||
| 1340 | cs->parent = parent; | 1441 | cs->parent = parent; |
| 1341 | 1442 | ||
| 1443 | down(&callback_sem); | ||
| 1342 | list_add(&cs->sibling, &cs->parent->children); | 1444 | list_add(&cs->sibling, &cs->parent->children); |
| 1445 | up(&callback_sem); | ||
| 1343 | 1446 | ||
| 1344 | err = cpuset_create_dir(cs, name, mode); | 1447 | err = cpuset_create_dir(cs, name, mode); |
| 1345 | if (err < 0) | 1448 | if (err < 0) |
| 1346 | goto err; | 1449 | goto err; |
| 1347 | 1450 | ||
| 1348 | /* | 1451 | /* |
| 1349 | * Release cpuset_sem before cpuset_populate_dir() because it | 1452 | * Release manage_sem before cpuset_populate_dir() because it |
| 1350 | * will down() this new directory's i_sem and if we race with | 1453 | * will down() this new directory's i_sem and if we race with |
| 1351 | * another mkdir, we might deadlock. | 1454 | * another mkdir, we might deadlock. |
| 1352 | */ | 1455 | */ |
| 1353 | up(&cpuset_sem); | 1456 | up(&manage_sem); |
| 1354 | 1457 | ||
| 1355 | err = cpuset_populate_dir(cs->dentry); | 1458 | err = cpuset_populate_dir(cs->dentry); |
| 1356 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 1459 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
| 1357 | return 0; | 1460 | return 0; |
| 1358 | err: | 1461 | err: |
| 1359 | list_del(&cs->sibling); | 1462 | list_del(&cs->sibling); |
| 1360 | up(&cpuset_sem); | 1463 | up(&manage_sem); |
| 1361 | kfree(cs); | 1464 | kfree(cs); |
| 1362 | return err; | 1465 | return err; |
| 1363 | } | 1466 | } |
| @@ -1379,30 +1482,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 1379 | 1482 | ||
| 1380 | /* the vfs holds both inode->i_sem already */ | 1483 | /* the vfs holds both inode->i_sem already */ |
| 1381 | 1484 | ||
| 1382 | down(&cpuset_sem); | 1485 | down(&manage_sem); |
| 1383 | refresh_mems(); | 1486 | refresh_mems(); |
| 1384 | if (atomic_read(&cs->count) > 0) { | 1487 | if (atomic_read(&cs->count) > 0) { |
| 1385 | up(&cpuset_sem); | 1488 | up(&manage_sem); |
| 1386 | return -EBUSY; | 1489 | return -EBUSY; |
| 1387 | } | 1490 | } |
| 1388 | if (!list_empty(&cs->children)) { | 1491 | if (!list_empty(&cs->children)) { |
| 1389 | up(&cpuset_sem); | 1492 | up(&manage_sem); |
| 1390 | return -EBUSY; | 1493 | return -EBUSY; |
| 1391 | } | 1494 | } |
| 1392 | parent = cs->parent; | 1495 | parent = cs->parent; |
| 1496 | down(&callback_sem); | ||
| 1393 | set_bit(CS_REMOVED, &cs->flags); | 1497 | set_bit(CS_REMOVED, &cs->flags); |
| 1394 | if (is_cpu_exclusive(cs)) | 1498 | if (is_cpu_exclusive(cs)) |
| 1395 | update_cpu_domains(cs); | 1499 | update_cpu_domains(cs); |
| 1396 | list_del(&cs->sibling); /* delete my sibling from parent->children */ | 1500 | list_del(&cs->sibling); /* delete my sibling from parent->children */ |
| 1397 | if (list_empty(&parent->children)) | ||
| 1398 | check_for_release(parent, &pathbuf); | ||
| 1399 | spin_lock(&cs->dentry->d_lock); | 1501 | spin_lock(&cs->dentry->d_lock); |
| 1400 | d = dget(cs->dentry); | 1502 | d = dget(cs->dentry); |
| 1401 | cs->dentry = NULL; | 1503 | cs->dentry = NULL; |
| 1402 | spin_unlock(&d->d_lock); | 1504 | spin_unlock(&d->d_lock); |
| 1403 | cpuset_d_remove_dir(d); | 1505 | cpuset_d_remove_dir(d); |
| 1404 | dput(d); | 1506 | dput(d); |
| 1405 | up(&cpuset_sem); | 1507 | up(&callback_sem); |
| 1508 | if (list_empty(&parent->children)) | ||
| 1509 | check_for_release(parent, &pathbuf); | ||
| 1510 | up(&manage_sem); | ||
| 1406 | cpuset_release_agent(pathbuf); | 1511 | cpuset_release_agent(pathbuf); |
| 1407 | return 0; | 1512 | return 0; |
| 1408 | } | 1513 | } |
| @@ -1462,16 +1567,26 @@ void __init cpuset_init_smp(void) | |||
| 1462 | * cpuset_fork - attach newly forked task to its parent's cpuset. | 1567 | * cpuset_fork - attach newly forked task to its parent's cpuset. |
| 1463 | * @tsk: pointer to task_struct of forking parent process. | 1568 | * @child: pointer to task_struct of the newly forked child task. |
| 1464 | * | 1569 | * |
| 1465 | * Description: By default, on fork, a task inherits its | 1570 | * Description: A task inherits its parent's cpuset at fork(). |
| 1466 | * parent's cpuset. The pointer to the shared cpuset is | 1571 | * |
| 1467 | * automatically copied in fork.c by dup_task_struct(). | 1572 | * A pointer to the shared cpuset was automatically copied in fork.c |
| 1468 | * This cpuset_fork() routine need only increment the usage | 1573 | * by dup_task_struct(). However, we ignore that copy, since it was |
| 1469 | * counter in that cpuset. | 1574 | * not made under the protection of task_lock(), so might no longer be |
| 1575 | * a valid cpuset pointer. attach_task() might have already changed | ||
| 1576 | * current->cpuset, allowing the previously referenced cpuset to | ||
| 1577 | * be removed and freed. Instead, we task_lock(current) and copy | ||
| 1578 | * its present value of current->cpuset for our freshly forked child. | ||
| 1579 | * | ||
| 1580 | * At the point that cpuset_fork() is called, 'current' is the parent | ||
| 1581 | * task, and the passed argument 'child' points to the child task. | ||
| 1470 | **/ | 1582 | **/ |
| 1471 | 1583 | ||
| 1472 | void cpuset_fork(struct task_struct *tsk) | 1584 | void cpuset_fork(struct task_struct *child) |
| 1473 | { | 1585 | { |
| 1474 | atomic_inc(&tsk->cpuset->count); | 1586 | task_lock(current); |
| 1587 | child->cpuset = current->cpuset; | ||
| 1588 | atomic_inc(&child->cpuset->count); | ||
| 1589 | task_unlock(current); | ||
| 1475 | } | 1590 | } |
| 1476 | 1591 | ||
| 1477 | /** | 1592 | /** |
| @@ -1480,35 +1595,42 @@ void cpuset_fork(struct task_struct *tsk) | |||
| 1480 | * | 1595 | * |
| 1481 | * Description: Detach cpuset from @tsk and release it. | 1596 | * Description: Detach cpuset from @tsk and release it. |
| 1482 | * | 1597 | * |
| 1483 | * Note that cpusets marked notify_on_release force every task | 1598 | * Note that cpusets marked notify_on_release force every task in |
| 1484 | * in them to take the global cpuset_sem semaphore when exiting. | 1599 | * them to take the global manage_sem semaphore when exiting. |
| 1485 | * This could impact scaling on very large systems. Be reluctant | 1600 | * This could impact scaling on very large systems. Be reluctant to |
| 1486 | * to use notify_on_release cpusets where very high task exit | 1601 | * use notify_on_release cpusets where very high task exit scaling |
| 1487 | * scaling is required on large systems. | 1602 | * is required on large systems. |
| 1488 | * | 1603 | * |
| 1489 | * Don't even think about dereferencing 'cs' after the cpuset use | 1604 | * Don't even think about dereferencing 'cs' after the cpuset use count |
| 1490 | * count goes to zero, except inside a critical section guarded | 1605 | * goes to zero, except inside a critical section guarded by manage_sem |
| 1491 | * by the cpuset_sem semaphore. If you don't hold cpuset_sem, | 1606 | * or callback_sem. Otherwise a zero cpuset use count is a license to |
| 1492 | * then a zero cpuset use count is a license to any other task to | 1607 | * any other task to nuke the cpuset immediately, via cpuset_rmdir(). |
| 1493 | * nuke the cpuset immediately. | 1608 | * |
| 1609 | * This routine has to take manage_sem, not callback_sem, because | ||
| 1610 | * it is holding that semaphore while calling check_for_release(), | ||
| 1612 | * which calls kmalloc(), so can't be called holding callback_sem. | ||
| 1612 | * | ||
| 1613 | * We don't need to task_lock() this reference to tsk->cpuset, | ||
| 1614 | * because tsk is already marked PF_EXITING, so attach_task() won't | ||
| 1615 | * mess with it. | ||
| 1494 | **/ | 1616 | **/ |
| 1495 | 1617 | ||
| 1496 | void cpuset_exit(struct task_struct *tsk) | 1618 | void cpuset_exit(struct task_struct *tsk) |
| 1497 | { | 1619 | { |
| 1498 | struct cpuset *cs; | 1620 | struct cpuset *cs; |
| 1499 | 1621 | ||
| 1500 | task_lock(tsk); | 1622 | BUG_ON(!(tsk->flags & PF_EXITING)); |
| 1623 | |||
| 1501 | cs = tsk->cpuset; | 1624 | cs = tsk->cpuset; |
| 1502 | tsk->cpuset = NULL; | 1625 | tsk->cpuset = NULL; |
| 1503 | task_unlock(tsk); | ||
| 1504 | 1626 | ||
| 1505 | if (notify_on_release(cs)) { | 1627 | if (notify_on_release(cs)) { |
| 1506 | char *pathbuf = NULL; | 1628 | char *pathbuf = NULL; |
| 1507 | 1629 | ||
| 1508 | down(&cpuset_sem); | 1630 | down(&manage_sem); |
| 1509 | if (atomic_dec_and_test(&cs->count)) | 1631 | if (atomic_dec_and_test(&cs->count)) |
| 1510 | check_for_release(cs, &pathbuf); | 1632 | check_for_release(cs, &pathbuf); |
| 1511 | up(&cpuset_sem); | 1633 | up(&manage_sem); |
| 1512 | cpuset_release_agent(pathbuf); | 1634 | cpuset_release_agent(pathbuf); |
| 1513 | } else { | 1635 | } else { |
| 1514 | atomic_dec(&cs->count); | 1636 | atomic_dec(&cs->count); |
| @@ -1529,11 +1651,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | |||
| 1529 | { | 1651 | { |
| 1530 | cpumask_t mask; | 1652 | cpumask_t mask; |
| 1531 | 1653 | ||
| 1532 | down(&cpuset_sem); | 1654 | down(&callback_sem); |
| 1533 | task_lock((struct task_struct *)tsk); | 1655 | task_lock((struct task_struct *)tsk); |
| 1534 | guarantee_online_cpus(tsk->cpuset, &mask); | 1656 | guarantee_online_cpus(tsk->cpuset, &mask); |
| 1535 | task_unlock((struct task_struct *)tsk); | 1657 | task_unlock((struct task_struct *)tsk); |
| 1536 | up(&cpuset_sem); | 1658 | up(&callback_sem); |
| 1537 | 1659 | ||
| 1538 | return mask; | 1660 | return mask; |
| 1539 | } | 1661 | } |
| @@ -1549,19 +1671,28 @@ void cpuset_init_current_mems_allowed(void) | |||
| 1549 | * If the current task's cpuset's mems_allowed changed behind our backs, | 1671 | * If the current task's cpuset's mems_allowed changed behind our backs, |
| 1550 | * update current->mems_allowed and mems_generation to the new value. | 1672 | * update current->mems_allowed and mems_generation to the new value. |
| 1551 | * Do not call this routine if in_interrupt(). | 1673 | * Do not call this routine if in_interrupt(). |
| 1674 | * | ||
| 1675 | * Call without callback_sem or task_lock() held. May be called | ||
| 1676 | * with or without manage_sem held. Always acquires task_lock() | ||
| 1677 | * briefly. Unless the task is exiting, it also might acquire | ||
| 1678 | * callback_sem during the call to refresh_mems(). | ||
| 1552 | */ | 1679 | */ |
| 1553 | 1680 | ||
| 1554 | void cpuset_update_current_mems_allowed(void) | 1681 | void cpuset_update_current_mems_allowed(void) |
| 1555 | { | 1682 | { |
| 1556 | struct cpuset *cs = current->cpuset; | 1683 | struct cpuset *cs; |
| 1684 | int need_to_refresh = 0; | ||
| 1557 | 1685 | ||
| 1686 | task_lock(current); | ||
| 1687 | cs = current->cpuset; | ||
| 1558 | if (!cs) | 1688 | if (!cs) |
| 1559 | return; /* task is exiting */ | 1689 | goto done; |
| 1560 | if (current->cpuset_mems_generation != cs->mems_generation) { | 1690 | if (current->cpuset_mems_generation != cs->mems_generation) |
| 1561 | down(&cpuset_sem); | 1691 | need_to_refresh = 1; |
| 1692 | done: | ||
| 1693 | task_unlock(current); | ||
| 1694 | if (need_to_refresh) | ||
| 1562 | refresh_mems(); | 1695 | refresh_mems(); |
| 1563 | up(&cpuset_sem); | ||
| 1564 | } | ||
| 1565 | } | 1696 | } |
| 1566 | 1697 | ||
| 1567 | /** | 1698 | /** |
| @@ -1595,7 +1726,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
| 1595 | 1726 | ||
| 1596 | /* | 1727 | /* |
| 1597 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive | 1728 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive |
| 1598 | * ancestor to the specified cpuset. Call while holding cpuset_sem. | 1729 | * ancestor to the specified cpuset. Call holding callback_sem. |
| 1599 | * If no ancestor is mem_exclusive (an unusual configuration), then | 1730 | * If no ancestor is mem_exclusive (an unusual configuration), then |
| 1600 | * returns the root cpuset. | 1731 | * returns the root cpuset. |
| 1601 | */ | 1732 | */ |
| @@ -1622,12 +1753,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
| 1622 | * GFP_KERNEL allocations are not so marked, so can escape to the | 1753 | * GFP_KERNEL allocations are not so marked, so can escape to the |
| 1623 | * nearest mem_exclusive ancestor cpuset. | 1754 | * nearest mem_exclusive ancestor cpuset. |
| 1624 | * | 1755 | * |
| 1625 | * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() | 1756 | * Scanning up parent cpusets requires callback_sem. The __alloc_pages() |
| 1626 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | 1757 | * routine only calls here with __GFP_HARDWALL bit _not_ set if |
| 1627 | * it's a GFP_KERNEL allocation, and all nodes in the current task's | 1758 | * it's a GFP_KERNEL allocation, and all nodes in the current task's |
| 1628 | * mems_allowed came up empty on the first pass over the zonelist. | 1759 | * mems_allowed came up empty on the first pass over the zonelist. |
| 1629 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | 1760 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are |
| 1630 | * short of memory, might require taking the cpuset_sem semaphore. | 1761 | * short of memory, might require taking the callback_sem semaphore. |
| 1631 | * | 1762 | * |
| 1632 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() | 1763 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() |
| 1633 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing | 1764 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing |
| @@ -1659,14 +1790,16 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
| 1659 | return 0; | 1790 | return 0; |
| 1660 | 1791 | ||
| 1661 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 1792 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
| 1662 | down(&cpuset_sem); | 1793 | down(&callback_sem); |
| 1663 | cs = current->cpuset; | 1794 | |
| 1664 | if (!cs) | 1795 | if (current->flags & PF_EXITING) /* Let dying task have memory */ |
| 1665 | goto done; /* current task exiting */ | 1796 | return 1; |
| 1666 | cs = nearest_exclusive_ancestor(cs); | 1797 | task_lock(current); |
| 1798 | cs = nearest_exclusive_ancestor(current->cpuset); | ||
| 1799 | task_unlock(current); | ||
| 1800 | |||
| 1667 | allowed = node_isset(node, cs->mems_allowed); | 1801 | allowed = node_isset(node, cs->mems_allowed); |
| 1668 | done: | 1802 | up(&callback_sem); |
| 1669 | up(&cpuset_sem); | ||
| 1670 | return allowed; | 1803 | return allowed; |
| 1671 | } | 1804 | } |
| 1672 | 1805 | ||
| @@ -1679,7 +1812,7 @@ done: | |||
| 1679 | * determine if task @p's memory usage might impact the memory | 1812 | * determine if task @p's memory usage might impact the memory |
| 1680 | * available to the current task. | 1813 | * available to the current task. |
| 1681 | * | 1814 | * |
| 1682 | * Acquires cpuset_sem - not suitable for calling from a fast path. | 1815 | * Acquires callback_sem - not suitable for calling from a fast path. |
| 1683 | **/ | 1816 | **/ |
| 1684 | 1817 | ||
| 1685 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | 1818 | int cpuset_excl_nodes_overlap(const struct task_struct *p) |
| @@ -1687,18 +1820,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) | |||
| 1687 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ | 1820 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ |
| 1688 | int overlap = 0; /* do cpusets overlap? */ | 1821 | int overlap = 0; /* do cpusets overlap? */ |
| 1689 | 1822 | ||
| 1690 | down(&cpuset_sem); | 1823 | down(&callback_sem); |
| 1691 | cs1 = current->cpuset; | 1824 | |
| 1692 | if (!cs1) | 1825 | task_lock(current); |
| 1693 | goto done; /* current task exiting */ | 1826 | if (current->flags & PF_EXITING) { |
| 1694 | cs2 = p->cpuset; | 1827 | task_unlock(current); |
| 1695 | if (!cs2) | 1828 | goto done; |
| 1696 | goto done; /* task p is exiting */ | 1829 | } |
| 1697 | cs1 = nearest_exclusive_ancestor(cs1); | 1830 | cs1 = nearest_exclusive_ancestor(current->cpuset); |
| 1698 | cs2 = nearest_exclusive_ancestor(cs2); | 1831 | task_unlock(current); |
| 1832 | |||
| 1833 | task_lock((struct task_struct *)p); | ||
| 1834 | if (p->flags & PF_EXITING) { | ||
| 1835 | task_unlock((struct task_struct *)p); | ||
| 1836 | goto done; | ||
| 1837 | } | ||
| 1838 | cs2 = nearest_exclusive_ancestor(p->cpuset); | ||
| 1839 | task_unlock((struct task_struct *)p); | ||
| 1840 | |||
| 1699 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); | 1841 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); |
| 1700 | done: | 1842 | done: |
| 1701 | up(&cpuset_sem); | 1843 | up(&callback_sem); |
| 1702 | 1844 | ||
| 1703 | return overlap; | 1845 | return overlap; |
| 1704 | } | 1846 | } |
| @@ -1707,6 +1849,10 @@ done: | |||
| 1707 | * proc_cpuset_show() | 1849 | * proc_cpuset_show() |
| 1708 | * - Print task's cpuset path into seq_file. | 1850 | * - Print task's cpuset path into seq_file. |
| 1709 | * - Used for /proc/<pid>/cpuset. | 1851 | * - Used for /proc/<pid>/cpuset. |
| 1852 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | ||
| 1853 | * doesn't really matter if tsk->cpuset changes after we read it, | ||
| 1854 | * and we take manage_sem, keeping attach_task() from changing it | ||
| 1855 | * anyway. | ||
| 1710 | */ | 1856 | */ |
| 1711 | 1857 | ||
| 1712 | static int proc_cpuset_show(struct seq_file *m, void *v) | 1858 | static int proc_cpuset_show(struct seq_file *m, void *v) |
| @@ -1721,10 +1867,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
| 1721 | return -ENOMEM; | 1867 | return -ENOMEM; |
| 1722 | 1868 | ||
| 1723 | tsk = m->private; | 1869 | tsk = m->private; |
| 1724 | down(&cpuset_sem); | 1870 | down(&manage_sem); |
| 1725 | task_lock(tsk); | ||
| 1726 | cs = tsk->cpuset; | 1871 | cs = tsk->cpuset; |
| 1727 | task_unlock(tsk); | ||
| 1728 | if (!cs) { | 1872 | if (!cs) { |
| 1729 | retval = -EINVAL; | 1873 | retval = -EINVAL; |
| 1730 | goto out; | 1874 | goto out; |
| @@ -1736,7 +1880,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
| 1736 | seq_puts(m, buf); | 1880 | seq_puts(m, buf); |
| 1737 | seq_putc(m, '\n'); | 1881 | seq_putc(m, '\n'); |
| 1738 | out: | 1882 | out: |
| 1739 | up(&cpuset_sem); | 1883 | up(&manage_sem); |
| 1740 | kfree(buf); | 1884 | kfree(buf); |
| 1741 | return retval; | 1885 | return retval; |
| 1742 | } | 1886 | } |
