aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/sched.h2
-rw-r--r--kernel/cpuset.c418
2 files changed, 282 insertions, 138 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c30bc308ef1..b2d2dc14f0b9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1211,7 +1211,7 @@ extern void unhash_process(struct task_struct *p);
1211/* 1211/*
1212 * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring 1212 * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
1213 * subscriptions and synchronises with wait4(). Also used in procfs. Also 1213 * subscriptions and synchronises with wait4(). Also used in procfs. Also
1214 * pins the final release of task.io_context. 1214 * pins the final release of task.io_context. Also protects ->cpuset.
1215 * 1215 *
1216 * Nests both inside and outside of read_lock(&tasklist_lock). 1216 * Nests both inside and outside of read_lock(&tasklist_lock).
1217 * It must not be nested with write_lock_irq(&tasklist_lock), 1217 * It must not be nested with write_lock_irq(&tasklist_lock),
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cd54dba2be18..7491352276b2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -60,6 +60,9 @@ struct cpuset {
60 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 60 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
61 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 61 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
62 62
63 /*
64 * Count is atomic so can incr (fork) or decr (exit) without a lock.
65 */
63 atomic_t count; /* count tasks using this cpuset */ 66 atomic_t count; /* count tasks using this cpuset */
64 67
65 /* 68 /*
@@ -142,44 +145,91 @@ static struct vfsmount *cpuset_mount;
142static struct super_block *cpuset_sb = NULL; 145static struct super_block *cpuset_sb = NULL;
143 146
144/* 147/*
145 * cpuset_sem should be held by anyone who is depending on the children 148 * We have two global cpuset semaphores below. They can nest.
146 * or sibling lists of any cpuset, or performing non-atomic operations 149 * It is ok to first take manage_sem, then nest callback_sem. We also
147 * on the flags or *_allowed values of a cpuset, such as raising the 150 * require taking task_lock() when dereferencing a tasks cpuset pointer.
148 * CS_REMOVED flag bit iff it is not already raised, or reading and 151 * See "The task_lock() exception", at the end of this comment.
149 * conditionally modifying the *_allowed values. One kernel global 152 *
150 * cpuset semaphore should be sufficient - these things don't change 153 * A task must hold both semaphores to modify cpusets. If a task
151 * that much. 154 * holds manage_sem, then it blocks others wanting that semaphore,
152 * 155 * ensuring that it is the only task able to also acquire callback_sem
153 * The code that modifies cpusets holds cpuset_sem across the entire 156 * and be able to modify cpusets. It can perform various checks on
154 * operation, from cpuset_common_file_write() down, single threading 157 * the cpuset structure first, knowing nothing will change. It can
155 * all cpuset modifications (except for counter manipulations from 158 * also allocate memory while just holding manage_sem. While it is
156 * fork and exit) across the system. This presumes that cpuset 159 * performing these checks, various callback routines can briefly
157 * modifications are rare - better kept simple and safe, even if slow. 160 * acquire callback_sem to query cpusets. Once it is ready to make
158 * 161 * the changes, it takes callback_sem, blocking everyone else.
159 * The code that reads cpusets, such as in cpuset_common_file_read() 162 *
160 * and below, only holds cpuset_sem across small pieces of code, such 163 * Calls to the kernel memory allocator can not be made while holding
161 * as when reading out possibly multi-word cpumasks and nodemasks, as 164 * callback_sem, as that would risk double tripping on callback_sem
162 * the risks are less, and the desire for performance a little greater. 165 * from one of the callbacks into the cpuset code from within
163 * The proc_cpuset_show() routine needs to hold cpuset_sem to insure 166 * __alloc_pages().
164 * that no cs->dentry is NULL, as it walks up the cpuset tree to root. 167 *
165 * 168 * If a task is only holding callback_sem, then it has read-only
166 * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't 169 * access to cpusets.
167 * (usually) grab cpuset_sem. These are the two most performance 170 *
168 * critical pieces of code here. The exception occurs on exit(), 171 * The task_struct fields mems_allowed and mems_generation may only
169 * when a task in a notify_on_release cpuset exits. Then cpuset_sem 172 * be accessed in the context of that task, so require no locks.
173 *
174 * Any task can increment and decrement the count field without lock.
175 * So in general, code holding manage_sem or callback_sem can't rely
176 * on the count field not changing. However, if the count goes to
177 * zero, then only attach_task(), which holds both semaphores, can
178 * increment it again. Because a count of zero means that no tasks
179 * are currently attached, therefore there is no way a task attached
180 * to that cpuset can fork (the other way to increment the count).
181 * So code holding manage_sem or callback_sem can safely assume that
182 * if the count is zero, it will stay zero. Similarly, if a task
183 * holds manage_sem or callback_sem on a cpuset with zero count, it
184 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
185 * both of those semaphores.
186 *
187 * A possible optimization to improve parallelism would be to make
188 * callback_sem a R/W semaphore (rwsem), allowing the callback routines
189 * to proceed in parallel, with read access, until the holder of
190 * manage_sem needed to take this rwsem for exclusive write access
191 * and modify some cpusets.
192 *
193 * The cpuset_common_file_write handler for operations that modify
194 * the cpuset hierarchy holds manage_sem across the entire operation,
195 * single threading all such cpuset modifications across the system.
196 *
197 * The cpuset_common_file_read() handlers only hold callback_sem across
198 * small pieces of code, such as when reading out possibly multi-word
199 * cpumasks and nodemasks.
200 *
201 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
202 * (usually) take either semaphore. These are the two most performance
203 * critical pieces of code here. The exception occurs on cpuset_exit(),
204 * when a task in a notify_on_release cpuset exits. Then manage_sem
170 * is taken, and if the cpuset count is zero, a usermode call made 205 * is taken, and if the cpuset count is zero, a usermode call made
171 * to /sbin/cpuset_release_agent with the name of the cpuset (path 206 * to /sbin/cpuset_release_agent with the name of the cpuset (path
172 * relative to the root of cpuset file system) as the argument. 207 * relative to the root of cpuset file system) as the argument.
173 * 208 *
174 * A cpuset can only be deleted if both its 'count' of using tasks is 209 * A cpuset can only be deleted if both its 'count' of using tasks
175 * zero, and its list of 'children' cpusets is empty. Since all tasks 210 * is zero, and its list of 'children' cpusets is empty. Since all
176 * in the system use _some_ cpuset, and since there is always at least 211 * tasks in the system use _some_ cpuset, and since there is always at
177 * one task in the system (init, pid == 1), therefore, top_cpuset 212 * least one task in the system (init, pid == 1), therefore, top_cpuset
178 * always has either children cpusets and/or using tasks. So no need 213 * always has either children cpusets and/or using tasks. So we don't
179 * for any special hack to ensure that top_cpuset cannot be deleted. 214 * need a special hack to ensure that top_cpuset cannot be deleted.
215 *
216 * The above "Tale of Two Semaphores" would be complete, but for:
217 *
218 * The task_lock() exception
219 *
220 * The need for this exception arises from the action of attach_task(),
221 * which overwrites one tasks cpuset pointer with another. It does
222 * so using both semaphores, however there are several performance
223 * critical places that need to reference task->cpuset without the
224 * expense of grabbing a system global semaphore. Therefore except as
225 * noted below, when dereferencing or, as in attach_task(), modifying
226 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
227 * (task->alloc_lock) already in the task_struct routinely used for
228 * such matters.
180 */ 229 */
181 230
182static DECLARE_MUTEX(cpuset_sem); 231static DECLARE_MUTEX(manage_sem);
232static DECLARE_MUTEX(callback_sem);
183 233
184/* 234/*
185 * A couple of forward declarations required, due to cyclic reference loop: 235 * A couple of forward declarations required, due to cyclic reference loop:
@@ -354,7 +404,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
354} 404}
355 405
356/* 406/*
357 * Call with cpuset_sem held. Writes path of cpuset into buf. 407 * Call with manage_sem held. Writes path of cpuset into buf.
358 * Returns 0 on success, -errno on error. 408 * Returns 0 on success, -errno on error.
359 */ 409 */
360 410
@@ -406,10 +456,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
406 * status of the /sbin/cpuset_release_agent task, so no sense holding 456 * status of the /sbin/cpuset_release_agent task, so no sense holding
407 * our caller up for that. 457 * our caller up for that.
408 * 458 *
409 * The simple act of forking that task might require more memory, 459 * When we had only one cpuset semaphore, we had to call this
410 * which might need cpuset_sem. So this routine must be called while 460 * without holding it, to avoid deadlock when call_usermodehelper()
411 * cpuset_sem is not held, to avoid a possible deadlock. See also 461 * allocated memory. With two locks, we could now call this while
412 * comments for check_for_release(), below. 462 * holding manage_sem, but we still don't, so as to minimize
463 * the time manage_sem is held.
413 */ 464 */
414 465
415static void cpuset_release_agent(const char *pathbuf) 466static void cpuset_release_agent(const char *pathbuf)
@@ -441,15 +492,15 @@ static void cpuset_release_agent(const char *pathbuf)
441 * cs is notify_on_release() and now both the user count is zero and 492 * cs is notify_on_release() and now both the user count is zero and
442 * the list of children is empty, prepare cpuset path in a kmalloc'd 493 * the list of children is empty, prepare cpuset path in a kmalloc'd
443 * buffer, to be returned via ppathbuf, so that the caller can invoke 494 * buffer, to be returned via ppathbuf, so that the caller can invoke
444 * cpuset_release_agent() with it later on, once cpuset_sem is dropped. 495 * cpuset_release_agent() with it later on, once manage_sem is dropped.
445 * Call here with cpuset_sem held. 496 * Call here with manage_sem held.
446 * 497 *
447 * This check_for_release() routine is responsible for kmalloc'ing 498 * This check_for_release() routine is responsible for kmalloc'ing
448 * pathbuf. The above cpuset_release_agent() is responsible for 499 * pathbuf. The above cpuset_release_agent() is responsible for
449 * kfree'ing pathbuf. The caller of these routines is responsible 500 * kfree'ing pathbuf. The caller of these routines is responsible
450 * for providing a pathbuf pointer, initialized to NULL, then 501 * for providing a pathbuf pointer, initialized to NULL, then
451 * calling check_for_release() with cpuset_sem held and the address 502 * calling check_for_release() with manage_sem held and the address
452 * of the pathbuf pointer, then dropping cpuset_sem, then calling 503 * of the pathbuf pointer, then dropping manage_sem, then calling
453 * cpuset_release_agent() with pathbuf, as set by check_for_release(). 504 * cpuset_release_agent() with pathbuf, as set by check_for_release().
454 */ 505 */
455 506
@@ -480,7 +531,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
480 * One way or another, we guarantee to return some non-empty subset 531 * One way or another, we guarantee to return some non-empty subset
481 * of cpu_online_map. 532 * of cpu_online_map.
482 * 533 *
483 * Call with cpuset_sem held. 534 * Call with callback_sem held.
484 */ 535 */
485 536
486static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 537static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -504,7 +555,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
504 * One way or another, we guarantee to return some non-empty subset 555 * One way or another, we guarantee to return some non-empty subset
505 * of node_online_map. 556 * of node_online_map.
506 * 557 *
507 * Call with cpuset_sem held. 558 * Call with callback_sem held.
508 */ 559 */
509 560
510static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 561static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -519,31 +570,44 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
519} 570}
520 571
521/* 572/*
522 * Refresh current tasks mems_allowed and mems_generation from 573 * Refresh current tasks mems_allowed and mems_generation from current
523 * current tasks cpuset. Call with cpuset_sem held. 574 * tasks cpuset.
524 * 575 *
525 * Be sure to call refresh_mems() on any cpuset operation which 576 * Call without callback_sem or task_lock() held. May be called with
526 * (1) holds cpuset_sem, and (2) might possibly alloc memory. 577 * or without manage_sem held. Will acquire task_lock() and might
527 * Call after obtaining cpuset_sem lock, before any possible 578 * acquire callback_sem during call.
528 * allocation. Otherwise one risks trying to allocate memory 579 *
529 * while the task cpuset_mems_generation is not the same as 580 * The task_lock() is required to dereference current->cpuset safely.
530 * the mems_generation in its cpuset, which would deadlock on 581 * Without it, we could pick up the pointer value of current->cpuset
531 * cpuset_sem in cpuset_update_current_mems_allowed(). 582 * in one instruction, and then attach_task could give us a different
532 * 583 * cpuset, and then the cpuset we had could be removed and freed,
533 * Since we hold cpuset_sem, once refresh_mems() is called, the 584 * and then on our next instruction, we could dereference a no longer
534 * test (current->cpuset_mems_generation != cs->mems_generation) 585 * valid cpuset pointer to get its mems_generation field.
535 * in cpuset_update_current_mems_allowed() will remain false, 586 *
536 * until we drop cpuset_sem. Anyone else who would change our 587 * This routine is needed to update the per-task mems_allowed data,
537 * cpusets mems_generation needs to lock cpuset_sem first. 588 * within the tasks context, when it is trying to allocate memory
589 * (in various mm/mempolicy.c routines) and notices that some other
590 * task has been modifying its cpuset.
538 */ 591 */
539 592
540static void refresh_mems(void) 593static void refresh_mems(void)
541{ 594{
542 struct cpuset *cs = current->cpuset; 595 int my_cpusets_mem_gen;
596
597 task_lock(current);
598 my_cpusets_mem_gen = current->cpuset->mems_generation;
599 task_unlock(current);
543 600
544 if (current->cpuset_mems_generation != cs->mems_generation) { 601 if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
602 struct cpuset *cs;
603
604 down(&callback_sem);
605 task_lock(current);
606 cs = current->cpuset;
545 guarantee_online_mems(cs, &current->mems_allowed); 607 guarantee_online_mems(cs, &current->mems_allowed);
546 current->cpuset_mems_generation = cs->mems_generation; 608 current->cpuset_mems_generation = cs->mems_generation;
609 task_unlock(current);
610 up(&callback_sem);
547 } 611 }
548} 612}
549 613
@@ -552,7 +616,7 @@ static void refresh_mems(void)
552 * 616 *
553 * One cpuset is a subset of another if all its allowed CPUs and 617 * One cpuset is a subset of another if all its allowed CPUs and
554 * Memory Nodes are a subset of the other, and its exclusive flags 618 * Memory Nodes are a subset of the other, and its exclusive flags
555 * are only set if the other's are set. 619 * are only set if the other's are set. Call holding manage_sem.
556 */ 620 */
557 621
558static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 622static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -570,7 +634,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
570 * If we replaced the flag and mask values of the current cpuset 634 * If we replaced the flag and mask values of the current cpuset
571 * (cur) with those values in the trial cpuset (trial), would 635 * (cur) with those values in the trial cpuset (trial), would
572 * our various subset and exclusive rules still be valid? Presumes 636 * our various subset and exclusive rules still be valid? Presumes
573 * cpuset_sem held. 637 * manage_sem held.
574 * 638 *
575 * 'cur' is the address of an actual, in-use cpuset. Operations 639 * 'cur' is the address of an actual, in-use cpuset. Operations
576 * such as list traversal that depend on the actual address of the 640 * such as list traversal that depend on the actual address of the
@@ -624,7 +688,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
624 * exclusive child cpusets 688 * exclusive child cpusets
625 * Build these two partitions by calling partition_sched_domains 689 * Build these two partitions by calling partition_sched_domains
626 * 690 *
627 * Call with cpuset_sem held. May nest a call to the 691 * Call with manage_sem held. May nest a call to the
628 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 692 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
629 */ 693 */
630 694
@@ -669,6 +733,10 @@ static void update_cpu_domains(struct cpuset *cur)
669 unlock_cpu_hotplug(); 733 unlock_cpu_hotplug();
670} 734}
671 735
736/*
737 * Call with manage_sem held. May take callback_sem during call.
738 */
739
672static int update_cpumask(struct cpuset *cs, char *buf) 740static int update_cpumask(struct cpuset *cs, char *buf)
673{ 741{
674 struct cpuset trialcs; 742 struct cpuset trialcs;
@@ -685,12 +753,18 @@ static int update_cpumask(struct cpuset *cs, char *buf)
685 if (retval < 0) 753 if (retval < 0)
686 return retval; 754 return retval;
687 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); 755 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
756 down(&callback_sem);
688 cs->cpus_allowed = trialcs.cpus_allowed; 757 cs->cpus_allowed = trialcs.cpus_allowed;
758 up(&callback_sem);
689 if (is_cpu_exclusive(cs) && !cpus_unchanged) 759 if (is_cpu_exclusive(cs) && !cpus_unchanged)
690 update_cpu_domains(cs); 760 update_cpu_domains(cs);
691 return 0; 761 return 0;
692} 762}
693 763
764/*
765 * Call with manage_sem held. May take callback_sem during call.
766 */
767
694static int update_nodemask(struct cpuset *cs, char *buf) 768static int update_nodemask(struct cpuset *cs, char *buf)
695{ 769{
696 struct cpuset trialcs; 770 struct cpuset trialcs;
@@ -705,9 +779,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
705 return -ENOSPC; 779 return -ENOSPC;
706 retval = validate_change(cs, &trialcs); 780 retval = validate_change(cs, &trialcs);
707 if (retval == 0) { 781 if (retval == 0) {
782 down(&callback_sem);
708 cs->mems_allowed = trialcs.mems_allowed; 783 cs->mems_allowed = trialcs.mems_allowed;
709 atomic_inc(&cpuset_mems_generation); 784 atomic_inc(&cpuset_mems_generation);
710 cs->mems_generation = atomic_read(&cpuset_mems_generation); 785 cs->mems_generation = atomic_read(&cpuset_mems_generation);
786 up(&callback_sem);
711 } 787 }
712 return retval; 788 return retval;
713} 789}
@@ -718,6 +794,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
718 * CS_NOTIFY_ON_RELEASE) 794 * CS_NOTIFY_ON_RELEASE)
719 * cs: the cpuset to update 795 * cs: the cpuset to update
720 * buf: the buffer where we read the 0 or 1 796 * buf: the buffer where we read the 0 or 1
797 *
798 * Call with manage_sem held.
721 */ 799 */
722 800
723static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 801static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -739,16 +817,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
739 return err; 817 return err;
740 cpu_exclusive_changed = 818 cpu_exclusive_changed =
741 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 819 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
820 down(&callback_sem);
742 if (turning_on) 821 if (turning_on)
743 set_bit(bit, &cs->flags); 822 set_bit(bit, &cs->flags);
744 else 823 else
745 clear_bit(bit, &cs->flags); 824 clear_bit(bit, &cs->flags);
825 up(&callback_sem);
746 826
747 if (cpu_exclusive_changed) 827 if (cpu_exclusive_changed)
748 update_cpu_domains(cs); 828 update_cpu_domains(cs);
749 return 0; 829 return 0;
750} 830}
751 831
832/*
833 * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly
834 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
835 * notified on release.
836 *
837 * Call holding manage_sem. May take callback_sem and task_lock of
838 * the task 'pid' during call.
839 */
840
752static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) 841static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
753{ 842{
754 pid_t pid; 843 pid_t pid;
@@ -765,7 +854,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
765 read_lock(&tasklist_lock); 854 read_lock(&tasklist_lock);
766 855
767 tsk = find_task_by_pid(pid); 856 tsk = find_task_by_pid(pid);
768 if (!tsk) { 857 if (!tsk || tsk->flags & PF_EXITING) {
769 read_unlock(&tasklist_lock); 858 read_unlock(&tasklist_lock);
770 return -ESRCH; 859 return -ESRCH;
771 } 860 }
@@ -783,10 +872,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
783 get_task_struct(tsk); 872 get_task_struct(tsk);
784 } 873 }
785 874
875 down(&callback_sem);
876
786 task_lock(tsk); 877 task_lock(tsk);
787 oldcs = tsk->cpuset; 878 oldcs = tsk->cpuset;
788 if (!oldcs) { 879 if (!oldcs) {
789 task_unlock(tsk); 880 task_unlock(tsk);
881 up(&callback_sem);
790 put_task_struct(tsk); 882 put_task_struct(tsk);
791 return -ESRCH; 883 return -ESRCH;
792 } 884 }
@@ -797,6 +889,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
797 guarantee_online_cpus(cs, &cpus); 889 guarantee_online_cpus(cs, &cpus);
798 set_cpus_allowed(tsk, cpus); 890 set_cpus_allowed(tsk, cpus);
799 891
892 up(&callback_sem);
800 put_task_struct(tsk); 893 put_task_struct(tsk);
801 if (atomic_dec_and_test(&oldcs->count)) 894 if (atomic_dec_and_test(&oldcs->count))
802 check_for_release(oldcs, ppathbuf); 895 check_for_release(oldcs, ppathbuf);
@@ -840,7 +933,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
840 } 933 }
841 buffer[nbytes] = 0; /* nul-terminate */ 934 buffer[nbytes] = 0; /* nul-terminate */
842 935
843 down(&cpuset_sem); 936 down(&manage_sem);
844 937
845 if (is_removed(cs)) { 938 if (is_removed(cs)) {
846 retval = -ENODEV; 939 retval = -ENODEV;
@@ -874,7 +967,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
874 if (retval == 0) 967 if (retval == 0)
875 retval = nbytes; 968 retval = nbytes;
876out2: 969out2:
877 up(&cpuset_sem); 970 up(&manage_sem);
878 cpuset_release_agent(pathbuf); 971 cpuset_release_agent(pathbuf);
879out1: 972out1:
880 kfree(buffer); 973 kfree(buffer);
@@ -914,9 +1007,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
914{ 1007{
915 cpumask_t mask; 1008 cpumask_t mask;
916 1009
917 down(&cpuset_sem); 1010 down(&callback_sem);
918 mask = cs->cpus_allowed; 1011 mask = cs->cpus_allowed;
919 up(&cpuset_sem); 1012 up(&callback_sem);
920 1013
921 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1014 return cpulist_scnprintf(page, PAGE_SIZE, mask);
922} 1015}
@@ -925,9 +1018,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
925{ 1018{
926 nodemask_t mask; 1019 nodemask_t mask;
927 1020
928 down(&cpuset_sem); 1021 down(&callback_sem);
929 mask = cs->mems_allowed; 1022 mask = cs->mems_allowed;
930 up(&cpuset_sem); 1023 up(&callback_sem);
931 1024
932 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1025 return nodelist_scnprintf(page, PAGE_SIZE, mask);
933} 1026}
@@ -1135,7 +1228,9 @@ struct ctr_struct {
1135 1228
1136/* 1229/*
1137 * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. 1230 * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
1138 * Return actual number of pids loaded. 1231 * Return actual number of pids loaded. No need to task_lock(p)
1232 * when reading out p->cpuset, as we don't really care if it changes
1233 * on the next cycle, and we are not going to try to dereference it.
1139 */ 1234 */
1140static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) 1235static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
1141{ 1236{
@@ -1177,6 +1272,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1177 return cnt; 1272 return cnt;
1178} 1273}
1179 1274
1275/*
1276 * Handle an open on 'tasks' file. Prepare a buffer listing the
1277 * process id's of tasks currently attached to the cpuset being opened.
1278 *
1279 * Does not require any specific cpuset semaphores, and does not take any.
1280 */
1180static int cpuset_tasks_open(struct inode *unused, struct file *file) 1281static int cpuset_tasks_open(struct inode *unused, struct file *file)
1181{ 1282{
1182 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1283 struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
@@ -1324,7 +1425,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1324 if (!cs) 1425 if (!cs)
1325 return -ENOMEM; 1426 return -ENOMEM;
1326 1427
1327 down(&cpuset_sem); 1428 down(&manage_sem);
1328 refresh_mems(); 1429 refresh_mems();
1329 cs->flags = 0; 1430 cs->flags = 0;
1330 if (notify_on_release(parent)) 1431 if (notify_on_release(parent))
@@ -1339,25 +1440,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1339 1440
1340 cs->parent = parent; 1441 cs->parent = parent;
1341 1442
1443 down(&callback_sem);
1342 list_add(&cs->sibling, &cs->parent->children); 1444 list_add(&cs->sibling, &cs->parent->children);
1445 up(&callback_sem);
1343 1446
1344 err = cpuset_create_dir(cs, name, mode); 1447 err = cpuset_create_dir(cs, name, mode);
1345 if (err < 0) 1448 if (err < 0)
1346 goto err; 1449 goto err;
1347 1450
1348 /* 1451 /*
1349 * Release cpuset_sem before cpuset_populate_dir() because it 1452 * Release manage_sem before cpuset_populate_dir() because it
1350 * will down() this new directory's i_sem and if we race with 1453 * will down() this new directory's i_sem and if we race with
1351 * another mkdir, we might deadlock. 1454 * another mkdir, we might deadlock.
1352 */ 1455 */
1353 up(&cpuset_sem); 1456 up(&manage_sem);
1354 1457
1355 err = cpuset_populate_dir(cs->dentry); 1458 err = cpuset_populate_dir(cs->dentry);
1356 /* If err < 0, we have a half-filled directory - oh well ;) */ 1459 /* If err < 0, we have a half-filled directory - oh well ;) */
1357 return 0; 1460 return 0;
1358err: 1461err:
1359 list_del(&cs->sibling); 1462 list_del(&cs->sibling);
1360 up(&cpuset_sem); 1463 up(&manage_sem);
1361 kfree(cs); 1464 kfree(cs);
1362 return err; 1465 return err;
1363} 1466}
@@ -1379,30 +1482,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1379 1482
1380 /* the vfs holds both inode->i_sem already */ 1483 /* the vfs holds both inode->i_sem already */
1381 1484
1382 down(&cpuset_sem); 1485 down(&manage_sem);
1383 refresh_mems(); 1486 refresh_mems();
1384 if (atomic_read(&cs->count) > 0) { 1487 if (atomic_read(&cs->count) > 0) {
1385 up(&cpuset_sem); 1488 up(&manage_sem);
1386 return -EBUSY; 1489 return -EBUSY;
1387 } 1490 }
1388 if (!list_empty(&cs->children)) { 1491 if (!list_empty(&cs->children)) {
1389 up(&cpuset_sem); 1492 up(&manage_sem);
1390 return -EBUSY; 1493 return -EBUSY;
1391 } 1494 }
1392 parent = cs->parent; 1495 parent = cs->parent;
1496 down(&callback_sem);
1393 set_bit(CS_REMOVED, &cs->flags); 1497 set_bit(CS_REMOVED, &cs->flags);
1394 if (is_cpu_exclusive(cs)) 1498 if (is_cpu_exclusive(cs))
1395 update_cpu_domains(cs); 1499 update_cpu_domains(cs);
1396 list_del(&cs->sibling); /* delete my sibling from parent->children */ 1500 list_del(&cs->sibling); /* delete my sibling from parent->children */
1397 if (list_empty(&parent->children))
1398 check_for_release(parent, &pathbuf);
1399 spin_lock(&cs->dentry->d_lock); 1501 spin_lock(&cs->dentry->d_lock);
1400 d = dget(cs->dentry); 1502 d = dget(cs->dentry);
1401 cs->dentry = NULL; 1503 cs->dentry = NULL;
1402 spin_unlock(&d->d_lock); 1504 spin_unlock(&d->d_lock);
1403 cpuset_d_remove_dir(d); 1505 cpuset_d_remove_dir(d);
1404 dput(d); 1506 dput(d);
1405 up(&cpuset_sem); 1507 up(&callback_sem);
1508 if (list_empty(&parent->children))
1509 check_for_release(parent, &pathbuf);
1510 up(&manage_sem);
1406 cpuset_release_agent(pathbuf); 1511 cpuset_release_agent(pathbuf);
1407 return 0; 1512 return 0;
1408} 1513}
@@ -1462,16 +1567,26 @@ void __init cpuset_init_smp(void)
1462 * cpuset_fork - attach newly forked task to its parents cpuset. 1567 * cpuset_fork - attach newly forked task to its parents cpuset.
1463 * @tsk: pointer to task_struct of forking parent process. 1568 * @tsk: pointer to task_struct of forking parent process.
1464 * 1569 *
1465 * Description: By default, on fork, a task inherits its 1570 * Description: A task inherits its parent's cpuset at fork().
1466 * parent's cpuset. The pointer to the shared cpuset is 1571 *
1467 * automatically copied in fork.c by dup_task_struct(). 1572 * A pointer to the shared cpuset was automatically copied in fork.c
1468 * This cpuset_fork() routine need only increment the usage 1573 * by dup_task_struct(). However, we ignore that copy, since it was
1469 * counter in that cpuset. 1574 * not made under the protection of task_lock(), so might no longer be
1575 * a valid cpuset pointer. attach_task() might have already changed
1576 * current->cpuset, allowing the previously referenced cpuset to
1577 * be removed and freed. Instead, we task_lock(current) and copy
1578 * its present value of current->cpuset for our freshly forked child.
1579 *
1580 * At the point that cpuset_fork() is called, 'current' is the parent
1581 * task, and the passed argument 'child' points to the child task.
1470 **/ 1582 **/
1471 1583
1472void cpuset_fork(struct task_struct *tsk) 1584void cpuset_fork(struct task_struct *child)
1473{ 1585{
1474 atomic_inc(&tsk->cpuset->count); 1586 task_lock(current);
1587 child->cpuset = current->cpuset;
1588 atomic_inc(&child->cpuset->count);
1589 task_unlock(current);
1475} 1590}
1476 1591
1477/** 1592/**
@@ -1480,35 +1595,42 @@ void cpuset_fork(struct task_struct *tsk)
1480 * 1595 *
1481 * Description: Detach cpuset from @tsk and release it. 1596 * Description: Detach cpuset from @tsk and release it.
1482 * 1597 *
1483 * Note that cpusets marked notify_on_release force every task 1598 * Note that cpusets marked notify_on_release force every task in
1484 * in them to take the global cpuset_sem semaphore when exiting. 1599 * them to take the global manage_sem semaphore when exiting.
1485 * This could impact scaling on very large systems. Be reluctant 1600 * This could impact scaling on very large systems. Be reluctant to
1486 * to use notify_on_release cpusets where very high task exit 1601 * use notify_on_release cpusets where very high task exit scaling
1487 * scaling is required on large systems. 1602 * is required on large systems.
1488 * 1603 *
1489 * Don't even think about derefencing 'cs' after the cpuset use 1604 * Don't even think about derefencing 'cs' after the cpuset use count
1490 * count goes to zero, except inside a critical section guarded 1605 * goes to zero, except inside a critical section guarded by manage_sem
1491 * by the cpuset_sem semaphore. If you don't hold cpuset_sem, 1606 * or callback_sem. Otherwise a zero cpuset use count is a license to
1492 * then a zero cpuset use count is a license to any other task to 1607 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
1493 * nuke the cpuset immediately. 1608 *
1609 * This routine has to take manage_sem, not callback_sem, because
1610 * it is holding that semaphore while calling check_for_release(),
1611 * which calls kmalloc(), so can't be called holding callback__sem().
1612 *
1613 * We don't need to task_lock() this reference to tsk->cpuset,
1614 * because tsk is already marked PF_EXITING, so attach_task() won't
1615 * mess with it.
1494 **/ 1616 **/
1495 1617
1496void cpuset_exit(struct task_struct *tsk) 1618void cpuset_exit(struct task_struct *tsk)
1497{ 1619{
1498 struct cpuset *cs; 1620 struct cpuset *cs;
1499 1621
1500 task_lock(tsk); 1622 BUG_ON(!(tsk->flags & PF_EXITING));
1623
1501 cs = tsk->cpuset; 1624 cs = tsk->cpuset;
1502 tsk->cpuset = NULL; 1625 tsk->cpuset = NULL;
1503 task_unlock(tsk);
1504 1626
1505 if (notify_on_release(cs)) { 1627 if (notify_on_release(cs)) {
1506 char *pathbuf = NULL; 1628 char *pathbuf = NULL;
1507 1629
1508 down(&cpuset_sem); 1630 down(&manage_sem);
1509 if (atomic_dec_and_test(&cs->count)) 1631 if (atomic_dec_and_test(&cs->count))
1510 check_for_release(cs, &pathbuf); 1632 check_for_release(cs, &pathbuf);
1511 up(&cpuset_sem); 1633 up(&manage_sem);
1512 cpuset_release_agent(pathbuf); 1634 cpuset_release_agent(pathbuf);
1513 } else { 1635 } else {
1514 atomic_dec(&cs->count); 1636 atomic_dec(&cs->count);
@@ -1529,11 +1651,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
1529{ 1651{
1530 cpumask_t mask; 1652 cpumask_t mask;
1531 1653
1532 down(&cpuset_sem); 1654 down(&callback_sem);
1533 task_lock((struct task_struct *)tsk); 1655 task_lock((struct task_struct *)tsk);
1534 guarantee_online_cpus(tsk->cpuset, &mask); 1656 guarantee_online_cpus(tsk->cpuset, &mask);
1535 task_unlock((struct task_struct *)tsk); 1657 task_unlock((struct task_struct *)tsk);
1536 up(&cpuset_sem); 1658 up(&callback_sem);
1537 1659
1538 return mask; 1660 return mask;
1539} 1661}
@@ -1549,19 +1671,28 @@ void cpuset_init_current_mems_allowed(void)
1549 * If the current tasks cpusets mems_allowed changed behind our backs, 1671 * If the current tasks cpusets mems_allowed changed behind our backs,
1550 * update current->mems_allowed and mems_generation to the new value. 1672 * update current->mems_allowed and mems_generation to the new value.
1551 * Do not call this routine if in_interrupt(). 1673 * Do not call this routine if in_interrupt().
1674 *
1675 * Call without callback_sem or task_lock() held. May be called
1676 * with or without manage_sem held. Unless exiting, it will acquire
1677 * task_lock(). Also might acquire callback_sem during call to
1678 * refresh_mems().
1552 */ 1679 */
1553 1680
1554void cpuset_update_current_mems_allowed(void) 1681void cpuset_update_current_mems_allowed(void)
1555{ 1682{
1556 struct cpuset *cs = current->cpuset; 1683 struct cpuset *cs;
1684 int need_to_refresh = 0;
1557 1685
1686 task_lock(current);
1687 cs = current->cpuset;
1558 if (!cs) 1688 if (!cs)
1559 return; /* task is exiting */ 1689 goto done;
1560 if (current->cpuset_mems_generation != cs->mems_generation) { 1690 if (current->cpuset_mems_generation != cs->mems_generation)
1561 down(&cpuset_sem); 1691 need_to_refresh = 1;
1692done:
1693 task_unlock(current);
1694 if (need_to_refresh)
1562 refresh_mems(); 1695 refresh_mems();
1563 up(&cpuset_sem);
1564 }
1565} 1696}
1566 1697
1567/** 1698/**
@@ -1595,7 +1726,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1595 1726
1596/* 1727/*
1597 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 1728 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
1598 * ancestor to the specified cpuset. Call while holding cpuset_sem. 1729 * ancestor to the specified cpuset. Call holding callback_sem.
1599 * If no ancestor is mem_exclusive (an unusual configuration), then 1730 * If no ancestor is mem_exclusive (an unusual configuration), then
1600 * returns the root cpuset. 1731 * returns the root cpuset.
1601 */ 1732 */
@@ -1622,12 +1753,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1622 * GFP_KERNEL allocations are not so marked, so can escape to the 1753 * GFP_KERNEL allocations are not so marked, so can escape to the
1623 * nearest mem_exclusive ancestor cpuset. 1754 * nearest mem_exclusive ancestor cpuset.
1624 * 1755 *
1625 * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() 1756 * Scanning up parent cpusets requires callback_sem. The __alloc_pages()
1626 * routine only calls here with __GFP_HARDWALL bit _not_ set if 1757 * routine only calls here with __GFP_HARDWALL bit _not_ set if
1627 * it's a GFP_KERNEL allocation, and all nodes in the current tasks 1758 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
1628 * mems_allowed came up empty on the first pass over the zonelist. 1759 * mems_allowed came up empty on the first pass over the zonelist.
1629 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 1760 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
1630 * short of memory, might require taking the cpuset_sem semaphore. 1761 * short of memory, might require taking the callback_sem semaphore.
1631 * 1762 *
1632 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() 1763 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
1633 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing 1764 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
@@ -1659,14 +1790,16 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1659 return 0; 1790 return 0;
1660 1791
1661 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 1792 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1662 down(&cpuset_sem); 1793 down(&callback_sem);
1663 cs = current->cpuset; 1794
1664 if (!cs) 1795 if (current->flags & PF_EXITING) /* Let dying task have memory */
1665 goto done; /* current task exiting */ 1796 return 1;
1666 cs = nearest_exclusive_ancestor(cs); 1797 task_lock(current);
1798 cs = nearest_exclusive_ancestor(current->cpuset);
1799 task_unlock(current);
1800
1667 allowed = node_isset(node, cs->mems_allowed); 1801 allowed = node_isset(node, cs->mems_allowed);
1668done: 1802 up(&callback_sem);
1669 up(&cpuset_sem);
1670 return allowed; 1803 return allowed;
1671} 1804}
1672 1805
@@ -1679,7 +1812,7 @@ done:
1679 * determine if task @p's memory usage might impact the memory 1812 * determine if task @p's memory usage might impact the memory
1680 * available to the current task. 1813 * available to the current task.
1681 * 1814 *
1682 * Acquires cpuset_sem - not suitable for calling from a fast path. 1815 * Acquires callback_sem - not suitable for calling from a fast path.
1683 **/ 1816 **/
1684 1817
1685int cpuset_excl_nodes_overlap(const struct task_struct *p) 1818int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -1687,18 +1820,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1687 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ 1820 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1688 int overlap = 0; /* do cpusets overlap? */ 1821 int overlap = 0; /* do cpusets overlap? */
1689 1822
1690 down(&cpuset_sem); 1823 down(&callback_sem);
1691 cs1 = current->cpuset; 1824
1692 if (!cs1) 1825 task_lock(current);
1693 goto done; /* current task exiting */ 1826 if (current->flags & PF_EXITING) {
1694 cs2 = p->cpuset; 1827 task_unlock(current);
1695 if (!cs2) 1828 goto done;
1696 goto done; /* task p is exiting */ 1829 }
1697 cs1 = nearest_exclusive_ancestor(cs1); 1830 cs1 = nearest_exclusive_ancestor(current->cpuset);
1698 cs2 = nearest_exclusive_ancestor(cs2); 1831 task_unlock(current);
1832
1833 task_lock((struct task_struct *)p);
1834 if (p->flags & PF_EXITING) {
1835 task_unlock((struct task_struct *)p);
1836 goto done;
1837 }
1838 cs2 = nearest_exclusive_ancestor(p->cpuset);
1839 task_unlock((struct task_struct *)p);
1840
1699 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); 1841 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1700done: 1842done:
1701 up(&cpuset_sem); 1843 up(&callback_sem);
1702 1844
1703 return overlap; 1845 return overlap;
1704} 1846}
@@ -1707,6 +1849,10 @@ done:
1707 * proc_cpuset_show() 1849 * proc_cpuset_show()
1708 * - Print tasks cpuset path into seq_file. 1850 * - Print tasks cpuset path into seq_file.
1709 * - Used for /proc/<pid>/cpuset. 1851 * - Used for /proc/<pid>/cpuset.
1852 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
1853 * doesn't really matter if tsk->cpuset changes after we read it,
1854 * and we take manage_sem, keeping attach_task() from changing it
1855 * anyway.
1710 */ 1856 */
1711 1857
1712static int proc_cpuset_show(struct seq_file *m, void *v) 1858static int proc_cpuset_show(struct seq_file *m, void *v)
@@ -1721,10 +1867,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1721 return -ENOMEM; 1867 return -ENOMEM;
1722 1868
1723 tsk = m->private; 1869 tsk = m->private;
1724 down(&cpuset_sem); 1870 down(&manage_sem);
1725 task_lock(tsk);
1726 cs = tsk->cpuset; 1871 cs = tsk->cpuset;
1727 task_unlock(tsk);
1728 if (!cs) { 1872 if (!cs) {
1729 retval = -EINVAL; 1873 retval = -EINVAL;
1730 goto out; 1874 goto out;
@@ -1736,7 +1880,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1736 seq_puts(m, buf); 1880 seq_puts(m, buf);
1737 seq_putc(m, '\n'); 1881 seq_putc(m, '\n');
1738out: 1882out:
1739 up(&cpuset_sem); 1883 up(&manage_sem);
1740 kfree(buf); 1884 kfree(buf);
1741 return retval; 1885 return retval;
1742} 1886}