author    Paul Jackson <pj@sgi.com>                 2005-10-30 18:02:30 -0500
committer Linus Torvalds <torvalds@g5.osdl.org>     2005-10-30 20:37:21 -0500
commit    053199edf54f685e7dea765b60d4d5e9070dadec
tree      a2d12a8b7f07b59048da992e7ae9405bc4ee292b
parent    5aa15b5f27fc2c404530c6c8eabdb8437deb3163
[PATCH] cpusets: dual semaphore locking overhaul
Overhaul cpuset locking.  Replace the single semaphore with two semaphores.
The suggestion to use two locks was made by Roman Zippel.  Both locks are
global.

Code that wants to modify cpusets must first acquire the exclusive
manage_sem, which grants it read-only access to cpusets and holds off
other would-be modifiers.  Before making actual changes, the second
semaphore, callback_sem, must be acquired as well.  Code that needs only
to query cpusets must acquire callback_sem, which is also a global
exclusive lock.

The earlier problems with double tripping are avoided, because holders of
manage_sem are allowed to nest the second callback_sem lock, and only
callback_sem is needed by code called from within __alloc_pages(), where
the double tripping had been possible.

This is not quite the same as a normal read/write semaphore, because
obtaining read-only access with intent to change must hold off other such
attempts, while allowing read-only access without such intention.
Changing cpusets involves several related checks and changes, which must
be done while allowing read-only queries (to avoid the double trip), but
while ensuring nothing changes (holding off other would-be modifiers).

This overhaul of cpuset locking also makes careful use of task_lock() to
guard access to the task->cpuset pointer, closing a couple of race
conditions noticed while reading this code (thanks, Roman).  I've never
seen these races fail in any use or test.

See further the comments in the code.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
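To make the locking protocol above concrete, the sketch below shows the two
paths it describes: a modifier nesting callback_sem inside manage_sem, and a
reader taking only callback_sem plus task_lock() to follow current->cpuset
safely.  It is written as if it lived in kernel/cpuset.c and uses the 2.6-era
semaphore API; the two example_* functions are illustrative only and are not
part of this patch, and the DECLARE_MUTEX lines simply repeat the patch's own
declarations for readability.

	/*
	 * Illustrative sketch only, not code from this patch.  Written as
	 * if inside kernel/cpuset.c, where struct cpuset is defined.
	 */
	#include <linux/sched.h>	/* task_lock(), task_unlock(), current */
	#include <asm/semaphore.h>	/* DECLARE_MUTEX(), down(), up() */

	static DECLARE_MUTEX(manage_sem);	/* taken first, by modifiers only */
	static DECLARE_MUTEX(callback_sem);	/* taken by readers; nested by modifiers */

	/* Modify path: check and allocate under manage_sem, then nest callback_sem. */
	static void example_update_mems(struct cpuset *cs, nodemask_t newmems)
	{
		down(&manage_sem);		/* holds off other would-be modifiers */
		/*
		 * Validate the change and allocate memory here; readers may
		 * still take callback_sem, e.g. from within __alloc_pages().
		 */
		down(&callback_sem);		/* now block the readers as well */
		cs->mems_allowed = newmems;	/* the actual update */
		up(&callback_sem);
		up(&manage_sem);
	}

	/* Query path: callback_sem only, task_lock() to follow current->cpuset. */
	static nodemask_t example_query_mems(void)
	{
		nodemask_t mems;

		down(&callback_sem);
		task_lock(current);		/* guards the task->cpuset pointer */
		/* a real caller would also handle an exiting task's NULL cpuset */
		mems = current->cpuset->mems_allowed;
		task_unlock(current);
		up(&callback_sem);
		return mems;
	}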
-rw-r--r--  include/linux/sched.h  |   2
-rw-r--r--  kernel/cpuset.c        | 418
2 files changed, 282 insertions, 138 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c30bc308ef1..b2d2dc14f0b9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1211,7 +1211,7 @@ extern void unhash_process(struct task_struct *p);
1211/* 1211/*
1212 * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring 1212 * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
1213 * subscriptions and synchronises with wait4(). Also used in procfs. Also 1213 * subscriptions and synchronises with wait4(). Also used in procfs. Also
1214 * pins the final release of task.io_context. 1214 * pins the final release of task.io_context. Also protects ->cpuset.
1215 * 1215 *
1216 * Nests both inside and outside of read_lock(&tasklist_lock). 1216 * Nests both inside and outside of read_lock(&tasklist_lock).
1217 * It must not be nested with write_lock_irq(&tasklist_lock), 1217 * It must not be nested with write_lock_irq(&tasklist_lock),
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cd54dba2be18..7491352276b2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -60,6 +60,9 @@ struct cpuset {
60 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 60 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
61 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 61 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
62 62
63 /*
64 * Count is atomic so can incr (fork) or decr (exit) without a lock.
65 */
63 atomic_t count; /* count tasks using this cpuset */ 66 atomic_t count; /* count tasks using this cpuset */
64 67
65 /* 68 /*
@@ -142,44 +145,91 @@ static struct vfsmount *cpuset_mount;
142static struct super_block *cpuset_sb = NULL; 145static struct super_block *cpuset_sb = NULL;
143 146
144/* 147/*
145 * cpuset_sem should be held by anyone who is depending on the children 148 * We have two global cpuset semaphores below. They can nest.
146 * or sibling lists of any cpuset, or performing non-atomic operations 149 * It is ok to first take manage_sem, then nest callback_sem. We also
147 * on the flags or *_allowed values of a cpuset, such as raising the 150 * require taking task_lock() when dereferencing a tasks cpuset pointer.
148 * CS_REMOVED flag bit iff it is not already raised, or reading and 151 * See "The task_lock() exception", at the end of this comment.
149 * conditionally modifying the *_allowed values. One kernel global 152 *
150 * cpuset semaphore should be sufficient - these things don't change 153 * A task must hold both semaphores to modify cpusets. If a task
151 * that much. 154 * holds manage_sem, then it blocks others wanting that semaphore,
152 * 155 * ensuring that it is the only task able to also acquire callback_sem
153 * The code that modifies cpusets holds cpuset_sem across the entire 156 * and be able to modify cpusets. It can perform various checks on
154 * operation, from cpuset_common_file_write() down, single threading 157 * the cpuset structure first, knowing nothing will change. It can
155 * all cpuset modifications (except for counter manipulations from 158 * also allocate memory while just holding manage_sem. While it is
156 * fork and exit) across the system. This presumes that cpuset 159 * performing these checks, various callback routines can briefly
157 * modifications are rare - better kept simple and safe, even if slow. 160 * acquire callback_sem to query cpusets. Once it is ready to make
158 * 161 * the changes, it takes callback_sem, blocking everyone else.
159 * The code that reads cpusets, such as in cpuset_common_file_read() 162 *
160 * and below, only holds cpuset_sem across small pieces of code, such 163 * Calls to the kernel memory allocator can not be made while holding
161 * as when reading out possibly multi-word cpumasks and nodemasks, as 164 * callback_sem, as that would risk double tripping on callback_sem
162 * the risks are less, and the desire for performance a little greater. 165 * from one of the callbacks into the cpuset code from within
163 * The proc_cpuset_show() routine needs to hold cpuset_sem to insure 166 * __alloc_pages().
164 * that no cs->dentry is NULL, as it walks up the cpuset tree to root. 167 *
165 * 168 * If a task is only holding callback_sem, then it has read-only
166 * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't 169 * access to cpusets.
167 * (usually) grab cpuset_sem. These are the two most performance 170 *
168 * critical pieces of code here. The exception occurs on exit(), 171 * The task_struct fields mems_allowed and mems_generation may only
169 * when a task in a notify_on_release cpuset exits. Then cpuset_sem 172 * be accessed in the context of that task, so require no locks.
173 *
174 * Any task can increment and decrement the count field without lock.
175 * So in general, code holding manage_sem or callback_sem can't rely
176 * on the count field not changing. However, if the count goes to
177 * zero, then only attach_task(), which holds both semaphores, can
178 * increment it again. Because a count of zero means that no tasks
179 * are currently attached, therefore there is no way a task attached
180 * to that cpuset can fork (the other way to increment the count).
181 * So code holding manage_sem or callback_sem can safely assume that
182 * if the count is zero, it will stay zero. Similarly, if a task
183 * holds manage_sem or callback_sem on a cpuset with zero count, it
184 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
185 * both of those semaphores.
186 *
187 * A possible optimization to improve parallelism would be to make
188 * callback_sem a R/W semaphore (rwsem), allowing the callback routines
189 * to proceed in parallel, with read access, until the holder of
190 * manage_sem needed to take this rwsem for exclusive write access
191 * and modify some cpusets.
192 *
193 * The cpuset_common_file_write handler for operations that modify
194 * the cpuset hierarchy holds manage_sem across the entire operation,
195 * single threading all such cpuset modifications across the system.
196 *
197 * The cpuset_common_file_read() handlers only hold callback_sem across
198 * small pieces of code, such as when reading out possibly multi-word
199 * cpumasks and nodemasks.
200 *
201 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
202 * (usually) take either semaphore. These are the two most performance
203 * critical pieces of code here. The exception occurs on cpuset_exit(),
204 * when a task in a notify_on_release cpuset exits. Then manage_sem
170 * is taken, and if the cpuset count is zero, a usermode call made 205 * is taken, and if the cpuset count is zero, a usermode call made
171 * to /sbin/cpuset_release_agent with the name of the cpuset (path 206 * to /sbin/cpuset_release_agent with the name of the cpuset (path
172 * relative to the root of cpuset file system) as the argument. 207 * relative to the root of cpuset file system) as the argument.
173 * 208 *
174 * A cpuset can only be deleted if both its 'count' of using tasks is 209 * A cpuset can only be deleted if both its 'count' of using tasks
175 * zero, and its list of 'children' cpusets is empty. Since all tasks 210 * is zero, and its list of 'children' cpusets is empty. Since all
176 * in the system use _some_ cpuset, and since there is always at least 211 * tasks in the system use _some_ cpuset, and since there is always at
177 * one task in the system (init, pid == 1), therefore, top_cpuset 212 * least one task in the system (init, pid == 1), therefore, top_cpuset
178 * always has either children cpusets and/or using tasks. So no need 213 * always has either children cpusets and/or using tasks. So we don't
179 * for any special hack to ensure that top_cpuset cannot be deleted. 214 * need a special hack to ensure that top_cpuset cannot be deleted.
215 *
216 * The above "Tale of Two Semaphores" would be complete, but for:
217 *
218 * The task_lock() exception
219 *
220 * The need for this exception arises from the action of attach_task(),
221 * which overwrites one tasks cpuset pointer with another. It does
222 * so using both semaphores, however there are several performance
223 * critical places that need to reference task->cpuset without the
224 * expense of grabbing a system global semaphore. Therefore except as
225 * noted below, when dereferencing or, as in attach_task(), modifying
226 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
227 * (task->alloc_lock) already in the task_struct routinely used for
228 * such matters.
180 */ 229 */
181 230
182static DECLARE_MUTEX(cpuset_sem); 231static DECLARE_MUTEX(manage_sem);
232static DECLARE_MUTEX(callback_sem);
183 233
184/* 234/*
185 * A couple of forward declarations required, due to cyclic reference loop: 235 * A couple of forward declarations required, due to cyclic reference loop:
@@ -354,7 +404,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
354} 404}
355 405
356/* 406/*
357 * Call with cpuset_sem held. Writes path of cpuset into buf. 407 * Call with manage_sem held. Writes path of cpuset into buf.
358 * Returns 0 on success, -errno on error. 408 * Returns 0 on success, -errno on error.
359 */ 409 */
360 410
@@ -406,10 +456,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
406 * status of the /sbin/cpuset_release_agent task, so no sense holding 456 * status of the /sbin/cpuset_release_agent task, so no sense holding
407 * our caller up for that. 457 * our caller up for that.
408 * 458 *
409 * The simple act of forking that task might require more memory, 459 * When we had only one cpuset semaphore, we had to call this
410 * which might need cpuset_sem. So this routine must be called while 460 * without holding it, to avoid deadlock when call_usermodehelper()
411 * cpuset_sem is not held, to avoid a possible deadlock. See also 461 * allocated memory. With two locks, we could now call this while
412 * comments for check_for_release(), below. 462 * holding manage_sem, but we still don't, so as to minimize
463 * the time manage_sem is held.
413 */ 464 */
414 465
415static void cpuset_release_agent(const char *pathbuf) 466static void cpuset_release_agent(const char *pathbuf)
@@ -441,15 +492,15 @@ static void cpuset_release_agent(const char *pathbuf)
441 * cs is notify_on_release() and now both the user count is zero and 492 * cs is notify_on_release() and now both the user count is zero and
442 * the list of children is empty, prepare cpuset path in a kmalloc'd 493 * the list of children is empty, prepare cpuset path in a kmalloc'd
443 * buffer, to be returned via ppathbuf, so that the caller can invoke 494 * buffer, to be returned via ppathbuf, so that the caller can invoke
444 * cpuset_release_agent() with it later on, once cpuset_sem is dropped. 495 * cpuset_release_agent() with it later on, once manage_sem is dropped.
445 * Call here with cpuset_sem held. 496 * Call here with manage_sem held.
446 * 497 *
447 * This check_for_release() routine is responsible for kmalloc'ing 498 * This check_for_release() routine is responsible for kmalloc'ing
448 * pathbuf. The above cpuset_release_agent() is responsible for 499 * pathbuf. The above cpuset_release_agent() is responsible for
449 * kfree'ing pathbuf. The caller of these routines is responsible 500 * kfree'ing pathbuf. The caller of these routines is responsible
450 * for providing a pathbuf pointer, initialized to NULL, then 501 * for providing a pathbuf pointer, initialized to NULL, then
451 * calling check_for_release() with cpuset_sem held and the address 502 * calling check_for_release() with manage_sem held and the address
452 * of the pathbuf pointer, then dropping cpuset_sem, then calling 503 * of the pathbuf pointer, then dropping manage_sem, then calling
453 * cpuset_release_agent() with pathbuf, as set by check_for_release(). 504 * cpuset_release_agent() with pathbuf, as set by check_for_release().
454 */ 505 */
455 506
@@ -480,7 +531,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
480 * One way or another, we guarantee to return some non-empty subset 531 * One way or another, we guarantee to return some non-empty subset
481 * of cpu_online_map. 532 * of cpu_online_map.
482 * 533 *
483 * Call with cpuset_sem held. 534 * Call with callback_sem held.
484 */ 535 */
485 536
486static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 537static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -504,7 +555,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
504 * One way or another, we guarantee to return some non-empty subset 555 * One way or another, we guarantee to return some non-empty subset
505 * of node_online_map. 556 * of node_online_map.
506 * 557 *
507 * Call with cpuset_sem held. 558 * Call with callback_sem held.
508 */ 559 */
509 560
510static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 561static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -519,31 +570,44 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
519} 570}
520 571
521/* 572/*
522 * Refresh current tasks mems_allowed and mems_generation from 573 * Refresh current tasks mems_allowed and mems_generation from current
523 * current tasks cpuset. Call with cpuset_sem held. 574 * tasks cpuset.
524 * 575 *
525 * Be sure to call refresh_mems() on any cpuset operation which 576 * Call without callback_sem or task_lock() held. May be called with
526 * (1) holds cpuset_sem, and (2) might possibly alloc memory. 577 * or without manage_sem held. Will acquire task_lock() and might
527 * Call after obtaining cpuset_sem lock, before any possible 578 * acquire callback_sem during call.
528 * allocation. Otherwise one risks trying to allocate memory 579 *
529 * while the task cpuset_mems_generation is not the same as 580 * The task_lock() is required to dereference current->cpuset safely.
530 * the mems_generation in its cpuset, which would deadlock on 581 * Without it, we could pick up the pointer value of current->cpuset
531 * cpuset_sem in cpuset_update_current_mems_allowed(). 582 * in one instruction, and then attach_task could give us a different
532 * 583 * cpuset, and then the cpuset we had could be removed and freed,
533 * Since we hold cpuset_sem, once refresh_mems() is called, the 584 * and then on our next instruction, we could dereference a no longer
534 * test (current->cpuset_mems_generation != cs->mems_generation) 585 * valid cpuset pointer to get its mems_generation field.
535 * in cpuset_update_current_mems_allowed() will remain false, 586 *
536 * until we drop cpuset_sem. Anyone else who would change our 587 * This routine is needed to update the per-task mems_allowed data,
537 * cpusets mems_generation needs to lock cpuset_sem first. 588 * within the tasks context, when it is trying to allocate memory
589 * (in various mm/mempolicy.c routines) and notices that some other
590 * task has been modifying its cpuset.
538 */ 591 */
539 592
540static void refresh_mems(void) 593static void refresh_mems(void)
541{ 594{
542 struct cpuset *cs = current->cpuset; 595 int my_cpusets_mem_gen;
596
597 task_lock(current);
598 my_cpusets_mem_gen = current->cpuset->mems_generation;
599 task_unlock(current);
543 600
544 if (current->cpuset_mems_generation != cs->mems_generation) { 601 if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
602 struct cpuset *cs;
603
604 down(&callback_sem);
605 task_lock(current);
606 cs = current->cpuset;
545 guarantee_online_mems(cs, &current->mems_allowed); 607 guarantee_online_mems(cs, &current->mems_allowed);
546 current->cpuset_mems_generation = cs->mems_generation; 608 current->cpuset_mems_generation = cs->mems_generation;
609 task_unlock(current);
610 up(&callback_sem);
547 } 611 }
548} 612}
549 613
@@ -552,7 +616,7 @@ static void refresh_mems(void)
552 * 616 *
553 * One cpuset is a subset of another if all its allowed CPUs and 617 * One cpuset is a subset of another if all its allowed CPUs and
554 * Memory Nodes are a subset of the other, and its exclusive flags 618 * Memory Nodes are a subset of the other, and its exclusive flags
555 * are only set if the other's are set. 619 * are only set if the other's are set. Call holding manage_sem.
556 */ 620 */
557 621
558static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 622static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -570,7 +634,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
570 * If we replaced the flag and mask values of the current cpuset 634 * If we replaced the flag and mask values of the current cpuset
571 * (cur) with those values in the trial cpuset (trial), would 635 * (cur) with those values in the trial cpuset (trial), would
572 * our various subset and exclusive rules still be valid? Presumes 636 * our various subset and exclusive rules still be valid? Presumes
573 * cpuset_sem held. 637 * manage_sem held.
574 * 638 *
575 * 'cur' is the address of an actual, in-use cpuset. Operations 639 * 'cur' is the address of an actual, in-use cpuset. Operations
576 * such as list traversal that depend on the actual address of the 640 * such as list traversal that depend on the actual address of the
@@ -624,7 +688,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
624 * exclusive child cpusets 688 * exclusive child cpusets
625 * Build these two partitions by calling partition_sched_domains 689 * Build these two partitions by calling partition_sched_domains
626 * 690 *
627 * Call with cpuset_sem held. May nest a call to the 691 * Call with manage_sem held. May nest a call to the
628 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 692 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
629 */ 693 */
630 694
@@ -669,6 +733,10 @@ static void update_cpu_domains(struct cpuset *cur)
669 unlock_cpu_hotplug(); 733 unlock_cpu_hotplug();
670} 734}
671 735
736/*
737 * Call with manage_sem held. May take callback_sem during call.
738 */
739
672static int update_cpumask(struct cpuset *cs, char *buf) 740static int update_cpumask(struct cpuset *cs, char *buf)
673{ 741{
674 struct cpuset trialcs; 742 struct cpuset trialcs;
@@ -685,12 +753,18 @@ static int update_cpumask(struct cpuset *cs, char *buf)
685 if (retval < 0) 753 if (retval < 0)
686 return retval; 754 return retval;
687 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); 755 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
756 down(&callback_sem);
688 cs->cpus_allowed = trialcs.cpus_allowed; 757 cs->cpus_allowed = trialcs.cpus_allowed;
758 up(&callback_sem);
689 if (is_cpu_exclusive(cs) && !cpus_unchanged) 759 if (is_cpu_exclusive(cs) && !cpus_unchanged)
690 update_cpu_domains(cs); 760 update_cpu_domains(cs);
691 return 0; 761 return 0;
692} 762}
693 763
764/*
765 * Call with manage_sem held. May take callback_sem during call.
766 */
767
694static int update_nodemask(struct cpuset *cs, char *buf) 768static int update_nodemask(struct cpuset *cs, char *buf)
695{ 769{
696 struct cpuset trialcs; 770 struct cpuset trialcs;
@@ -705,9 +779,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
705 return -ENOSPC; 779 return -ENOSPC;
706 retval = validate_change(cs, &trialcs); 780 retval = validate_change(cs, &trialcs);
707 if (retval == 0) { 781 if (retval == 0) {
782 down(&callback_sem);
708 cs->mems_allowed = trialcs.mems_allowed; 783 cs->mems_allowed = trialcs.mems_allowed;
709 atomic_inc(&cpuset_mems_generation); 784 atomic_inc(&cpuset_mems_generation);
710 cs->mems_generation = atomic_read(&cpuset_mems_generation); 785 cs->mems_generation = atomic_read(&cpuset_mems_generation);
786 up(&callback_sem);
711 } 787 }
712 return retval; 788 return retval;
713} 789}
@@ -718,6 +794,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
718 * CS_NOTIFY_ON_RELEASE) 794 * CS_NOTIFY_ON_RELEASE)
719 * cs: the cpuset to update 795 * cs: the cpuset to update
720 * buf: the buffer where we read the 0 or 1 796 * buf: the buffer where we read the 0 or 1
797 *
798 * Call with manage_sem held.
721 */ 799 */
722 800
723static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 801static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -739,16 +817,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
739 return err; 817 return err;
740 cpu_exclusive_changed = 818 cpu_exclusive_changed =
741 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 819 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
820 down(&callback_sem);
742 if (turning_on) 821 if (turning_on)
743 set_bit(bit, &cs->flags); 822 set_bit(bit, &cs->flags);
744 else 823 else
745 clear_bit(bit, &cs->flags); 824 clear_bit(bit, &cs->flags);
825 up(&callback_sem);
746 826
747 if (cpu_exclusive_changed) 827 if (cpu_exclusive_changed)
748 update_cpu_domains(cs); 828 update_cpu_domains(cs);
749 return 0; 829 return 0;
750} 830}
751 831
832/*
 833 * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly
834 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
835 * notified on release.
836 *
837 * Call holding manage_sem. May take callback_sem and task_lock of
838 * the task 'pid' during call.
839 */
840
752static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) 841static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
753{ 842{
754 pid_t pid; 843 pid_t pid;
@@ -765,7 +854,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
765 read_lock(&tasklist_lock); 854 read_lock(&tasklist_lock);
766 855
767 tsk = find_task_by_pid(pid); 856 tsk = find_task_by_pid(pid);
768 if (!tsk) { 857 if (!tsk || tsk->flags & PF_EXITING) {
769 read_unlock(&tasklist_lock); 858 read_unlock(&tasklist_lock);
770 return -ESRCH; 859 return -ESRCH;
771 } 860 }
@@ -783,10 +872,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
783 get_task_struct(tsk); 872 get_task_struct(tsk);
784 } 873 }
785 874
875 down(&callback_sem);
876
786 task_lock(tsk); 877 task_lock(tsk);
787 oldcs = tsk->cpuset; 878 oldcs = tsk->cpuset;
788 if (!oldcs) { 879 if (!oldcs) {
789 task_unlock(tsk); 880 task_unlock(tsk);
881 up(&callback_sem);
790 put_task_struct(tsk); 882 put_task_struct(tsk);
791 return -ESRCH; 883 return -ESRCH;
792 } 884 }
@@ -797,6 +889,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
797 guarantee_online_cpus(cs, &cpus); 889 guarantee_online_cpus(cs, &cpus);
798 set_cpus_allowed(tsk, cpus); 890 set_cpus_allowed(tsk, cpus);
799 891
892 up(&callback_sem);
800 put_task_struct(tsk); 893 put_task_struct(tsk);
801 if (atomic_dec_and_test(&oldcs->count)) 894 if (atomic_dec_and_test(&oldcs->count))
802 check_for_release(oldcs, ppathbuf); 895 check_for_release(oldcs, ppathbuf);
@@ -840,7 +933,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
840 } 933 }
841 buffer[nbytes] = 0; /* nul-terminate */ 934 buffer[nbytes] = 0; /* nul-terminate */
842 935
843 down(&cpuset_sem); 936 down(&manage_sem);
844 937
845 if (is_removed(cs)) { 938 if (is_removed(cs)) {
846 retval = -ENODEV; 939 retval = -ENODEV;
@@ -874,7 +967,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
874 if (retval == 0) 967 if (retval == 0)
875 retval = nbytes; 968 retval = nbytes;
876out2: 969out2:
877 up(&cpuset_sem); 970 up(&manage_sem);
878 cpuset_release_agent(pathbuf); 971 cpuset_release_agent(pathbuf);
879out1: 972out1:
880 kfree(buffer); 973 kfree(buffer);
@@ -914,9 +1007,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
914{ 1007{
915 cpumask_t mask; 1008 cpumask_t mask;
916 1009
917 down(&cpuset_sem); 1010 down(&callback_sem);
918 mask = cs->cpus_allowed; 1011 mask = cs->cpus_allowed;
919 up(&cpuset_sem); 1012 up(&callback_sem);
920 1013
921 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1014 return cpulist_scnprintf(page, PAGE_SIZE, mask);
922} 1015}
@@ -925,9 +1018,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
925{ 1018{
926 nodemask_t mask; 1019 nodemask_t mask;
927 1020
928 down(&cpuset_sem); 1021 down(&callback_sem);
929 mask = cs->mems_allowed; 1022 mask = cs->mems_allowed;
930 up(&cpuset_sem); 1023 up(&callback_sem);
931 1024
932 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1025 return nodelist_scnprintf(page, PAGE_SIZE, mask);
933} 1026}
@@ -1135,7 +1228,9 @@ struct ctr_struct {
1135 1228
1136/* 1229/*
1137 * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. 1230 * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
1138 * Return actual number of pids loaded. 1231 * Return actual number of pids loaded. No need to task_lock(p)
1232 * when reading out p->cpuset, as we don't really care if it changes
1233 * on the next cycle, and we are not going to try to dereference it.
1139 */ 1234 */
1140static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) 1235static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
1141{ 1236{
@@ -1177,6 +1272,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1177 return cnt; 1272 return cnt;
1178} 1273}
1179 1274
1275/*
1276 * Handle an open on 'tasks' file. Prepare a buffer listing the
1277 * process id's of tasks currently attached to the cpuset being opened.
1278 *
1279 * Does not require any specific cpuset semaphores, and does not take any.
1280 */
1180static int cpuset_tasks_open(struct inode *unused, struct file *file) 1281static int cpuset_tasks_open(struct inode *unused, struct file *file)
1181{ 1282{
1182 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1283 struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
@@ -1324,7 +1425,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1324 if (!cs) 1425 if (!cs)
1325 return -ENOMEM; 1426 return -ENOMEM;
1326 1427
1327 down(&cpuset_sem); 1428 down(&manage_sem);
1328 refresh_mems(); 1429 refresh_mems();
1329 cs->flags = 0; 1430 cs->flags = 0;
1330 if (notify_on_release(parent)) 1431 if (notify_on_release(parent))
@@ -1339,25 +1440,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1339 1440
1340 cs->parent = parent; 1441 cs->parent = parent;
1341 1442
1443 down(&callback_sem);
1342 list_add(&cs->sibling, &cs->parent->children); 1444 list_add(&cs->sibling, &cs->parent->children);
1445 up(&callback_sem);
1343 1446
1344 err = cpuset_create_dir(cs, name, mode); 1447 err = cpuset_create_dir(cs, name, mode);
1345 if (err < 0) 1448 if (err < 0)
1346 goto err; 1449 goto err;
1347 1450
1348 /* 1451 /*
1349 * Release cpuset_sem before cpuset_populate_dir() because it 1452 * Release manage_sem before cpuset_populate_dir() because it
1350 * will down() this new directory's i_sem and if we race with 1453 * will down() this new directory's i_sem and if we race with
1351 * another mkdir, we might deadlock. 1454 * another mkdir, we might deadlock.
1352 */ 1455 */
1353 up(&cpuset_sem); 1456 up(&manage_sem);
1354 1457
1355 err = cpuset_populate_dir(cs->dentry); 1458 err = cpuset_populate_dir(cs->dentry);
1356 /* If err < 0, we have a half-filled directory - oh well ;) */ 1459 /* If err < 0, we have a half-filled directory - oh well ;) */
1357 return 0; 1460 return 0;
1358err: 1461err:
1359 list_del(&cs->sibling); 1462 list_del(&cs->sibling);
1360 up(&cpuset_sem); 1463 up(&manage_sem);
1361 kfree(cs); 1464 kfree(cs);
1362 return err; 1465 return err;
1363} 1466}
@@ -1379,30 +1482,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1379 1482
1380 /* the vfs holds both inode->i_sem already */ 1483 /* the vfs holds both inode->i_sem already */
1381 1484
1382 down(&cpuset_sem); 1485 down(&manage_sem);
1383 refresh_mems(); 1486 refresh_mems();
1384 if (atomic_read(&cs->count) > 0) { 1487 if (atomic_read(&cs->count) > 0) {
1385 up(&cpuset_sem); 1488 up(&manage_sem);
1386 return -EBUSY; 1489 return -EBUSY;
1387 } 1490 }
1388 if (!list_empty(&cs->children)) { 1491 if (!list_empty(&cs->children)) {
1389 up(&cpuset_sem); 1492 up(&manage_sem);
1390 return -EBUSY; 1493 return -EBUSY;
1391 } 1494 }
1392 parent = cs->parent; 1495 parent = cs->parent;
1496 down(&callback_sem);
1393 set_bit(CS_REMOVED, &cs->flags); 1497 set_bit(CS_REMOVED, &cs->flags);
1394 if (is_cpu_exclusive(cs)) 1498 if (is_cpu_exclusive(cs))
1395 update_cpu_domains(cs); 1499 update_cpu_domains(cs);
1396 list_del(&cs->sibling); /* delete my sibling from parent->children */ 1500 list_del(&cs->sibling); /* delete my sibling from parent->children */
1397 if (list_empty(&parent->children))
1398 check_for_release(parent, &pathbuf);
1399 spin_lock(&cs->dentry->d_lock); 1501 spin_lock(&cs->dentry->d_lock);
1400 d = dget(cs->dentry); 1502 d = dget(cs->dentry);
1401 cs->dentry = NULL; 1503 cs->dentry = NULL;
1402 spin_unlock(&d->d_lock); 1504 spin_unlock(&d->d_lock);
1403 cpuset_d_remove_dir(d); 1505 cpuset_d_remove_dir(d);
1404 dput(d); 1506 dput(d);
1405 up(&cpuset_sem); 1507 up(&callback_sem);
1508 if (list_empty(&parent->children))
1509 check_for_release(parent, &pathbuf);
1510 up(&manage_sem);
1406 cpuset_release_agent(pathbuf); 1511 cpuset_release_agent(pathbuf);
1407 return 0; 1512 return 0;
1408} 1513}
@@ -1462,16 +1567,26 @@ void __init cpuset_init_smp(void)
1462 * cpuset_fork - attach newly forked task to its parents cpuset. 1567 * cpuset_fork - attach newly forked task to its parents cpuset.
1463 * @tsk: pointer to task_struct of forking parent process. 1568 * @tsk: pointer to task_struct of forking parent process.
1464 * 1569 *
1465 * Description: By default, on fork, a task inherits its 1570 * Description: A task inherits its parent's cpuset at fork().
1466 * parent's cpuset. The pointer to the shared cpuset is 1571 *
1467 * automatically copied in fork.c by dup_task_struct(). 1572 * A pointer to the shared cpuset was automatically copied in fork.c
1468 * This cpuset_fork() routine need only increment the usage 1573 * by dup_task_struct(). However, we ignore that copy, since it was
1469 * counter in that cpuset. 1574 * not made under the protection of task_lock(), so might no longer be
1575 * a valid cpuset pointer. attach_task() might have already changed
1576 * current->cpuset, allowing the previously referenced cpuset to
1577 * be removed and freed. Instead, we task_lock(current) and copy
1578 * its present value of current->cpuset for our freshly forked child.
1579 *
1580 * At the point that cpuset_fork() is called, 'current' is the parent
1581 * task, and the passed argument 'child' points to the child task.
1470 **/ 1582 **/
1471 1583
1472void cpuset_fork(struct task_struct *tsk) 1584void cpuset_fork(struct task_struct *child)
1473{ 1585{
1474 atomic_inc(&tsk->cpuset->count); 1586 task_lock(current);
1587 child->cpuset = current->cpuset;
1588 atomic_inc(&child->cpuset->count);
1589 task_unlock(current);
1475} 1590}
1476 1591
1477/** 1592/**
@@ -1480,35 +1595,42 @@ void cpuset_fork(struct task_struct *tsk)
1480 * 1595 *
1481 * Description: Detach cpuset from @tsk and release it. 1596 * Description: Detach cpuset from @tsk and release it.
1482 * 1597 *
1483 * Note that cpusets marked notify_on_release force every task 1598 * Note that cpusets marked notify_on_release force every task in
1484 * in them to take the global cpuset_sem semaphore when exiting. 1599 * them to take the global manage_sem semaphore when exiting.
1485 * This could impact scaling on very large systems. Be reluctant 1600 * This could impact scaling on very large systems. Be reluctant to
1486 * to use notify_on_release cpusets where very high task exit 1601 * use notify_on_release cpusets where very high task exit scaling
1487 * scaling is required on large systems. 1602 * is required on large systems.
1488 * 1603 *
1489 * Don't even think about derefencing 'cs' after the cpuset use 1604 * Don't even think about derefencing 'cs' after the cpuset use count
1490 * count goes to zero, except inside a critical section guarded 1605 * goes to zero, except inside a critical section guarded by manage_sem
1491 * by the cpuset_sem semaphore. If you don't hold cpuset_sem, 1606 * or callback_sem. Otherwise a zero cpuset use count is a license to
1492 * then a zero cpuset use count is a license to any other task to 1607 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
1493 * nuke the cpuset immediately. 1608 *
1609 * This routine has to take manage_sem, not callback_sem, because
1610 * it is holding that semaphore while calling check_for_release(),
 1611 * which calls kmalloc(), so can't be called holding callback_sem.
1612 *
1613 * We don't need to task_lock() this reference to tsk->cpuset,
1614 * because tsk is already marked PF_EXITING, so attach_task() won't
1615 * mess with it.
1494 **/ 1616 **/
1495 1617
1496void cpuset_exit(struct task_struct *tsk) 1618void cpuset_exit(struct task_struct *tsk)
1497{ 1619{
1498 struct cpuset *cs; 1620 struct cpuset *cs;
1499 1621
1500 task_lock(tsk); 1622 BUG_ON(!(tsk->flags & PF_EXITING));
1623
1501 cs = tsk->cpuset; 1624 cs = tsk->cpuset;
1502 tsk->cpuset = NULL; 1625 tsk->cpuset = NULL;
1503 task_unlock(tsk);
1504 1626
1505 if (notify_on_release(cs)) { 1627 if (notify_on_release(cs)) {
1506 char *pathbuf = NULL; 1628 char *pathbuf = NULL;
1507 1629
1508 down(&cpuset_sem); 1630 down(&manage_sem);
1509 if (atomic_dec_and_test(&cs->count)) 1631 if (atomic_dec_and_test(&cs->count))
1510 check_for_release(cs, &pathbuf); 1632 check_for_release(cs, &pathbuf);
1511 up(&cpuset_sem); 1633 up(&manage_sem);
1512 cpuset_release_agent(pathbuf); 1634 cpuset_release_agent(pathbuf);
1513 } else { 1635 } else {
1514 atomic_dec(&cs->count); 1636 atomic_dec(&cs->count);
@@ -1529,11 +1651,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
1529{ 1651{
1530 cpumask_t mask; 1652 cpumask_t mask;
1531 1653
1532 down(&cpuset_sem); 1654 down(&callback_sem);
1533 task_lock((struct task_struct *)tsk); 1655 task_lock((struct task_struct *)tsk);
1534 guarantee_online_cpus(tsk->cpuset, &mask); 1656 guarantee_online_cpus(tsk->cpuset, &mask);
1535 task_unlock((struct task_struct *)tsk); 1657 task_unlock((struct task_struct *)tsk);
1536 up(&cpuset_sem); 1658 up(&callback_sem);
1537 1659
1538 return mask; 1660 return mask;
1539} 1661}
@@ -1549,19 +1671,28 @@ void cpuset_init_current_mems_allowed(void)
1549 * If the current tasks cpusets mems_allowed changed behind our backs, 1671 * If the current tasks cpusets mems_allowed changed behind our backs,
1550 * update current->mems_allowed and mems_generation to the new value. 1672 * update current->mems_allowed and mems_generation to the new value.
1551 * Do not call this routine if in_interrupt(). 1673 * Do not call this routine if in_interrupt().
1674 *
1675 * Call without callback_sem or task_lock() held. May be called
1676 * with or without manage_sem held. Unless exiting, it will acquire
1677 * task_lock(). Also might acquire callback_sem during call to
1678 * refresh_mems().
1552 */ 1679 */
1553 1680
1554void cpuset_update_current_mems_allowed(void) 1681void cpuset_update_current_mems_allowed(void)
1555{ 1682{
1556 struct cpuset *cs = current->cpuset; 1683 struct cpuset *cs;
1684 int need_to_refresh = 0;
1557 1685
1686 task_lock(current);
1687 cs = current->cpuset;
1558 if (!cs) 1688 if (!cs)
1559 return; /* task is exiting */ 1689 goto done;
1560 if (current->cpuset_mems_generation != cs->mems_generation) { 1690 if (current->cpuset_mems_generation != cs->mems_generation)
1561 down(&cpuset_sem); 1691 need_to_refresh = 1;
1692done:
1693 task_unlock(current);
1694 if (need_to_refresh)
1562 refresh_mems(); 1695 refresh_mems();
1563 up(&cpuset_sem);
1564 }
1565} 1696}
1566 1697
1567/** 1698/**
@@ -1595,7 +1726,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1595 1726
1596/* 1727/*
1597 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 1728 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
1598 * ancestor to the specified cpuset. Call while holding cpuset_sem. 1729 * ancestor to the specified cpuset. Call holding callback_sem.
1599 * If no ancestor is mem_exclusive (an unusual configuration), then 1730 * If no ancestor is mem_exclusive (an unusual configuration), then
1600 * returns the root cpuset. 1731 * returns the root cpuset.
1601 */ 1732 */
@@ -1622,12 +1753,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1622 * GFP_KERNEL allocations are not so marked, so can escape to the 1753 * GFP_KERNEL allocations are not so marked, so can escape to the
1623 * nearest mem_exclusive ancestor cpuset. 1754 * nearest mem_exclusive ancestor cpuset.
1624 * 1755 *
1625 * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() 1756 * Scanning up parent cpusets requires callback_sem. The __alloc_pages()
1626 * routine only calls here with __GFP_HARDWALL bit _not_ set if 1757 * routine only calls here with __GFP_HARDWALL bit _not_ set if
1627 * it's a GFP_KERNEL allocation, and all nodes in the current tasks 1758 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
1628 * mems_allowed came up empty on the first pass over the zonelist. 1759 * mems_allowed came up empty on the first pass over the zonelist.
1629 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 1760 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
1630 * short of memory, might require taking the cpuset_sem semaphore. 1761 * short of memory, might require taking the callback_sem semaphore.
1631 * 1762 *
1632 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() 1763 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
1633 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing 1764 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
@@ -1659,14 +1790,16 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1659 return 0; 1790 return 0;
1660 1791
1661 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 1792 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1662 down(&cpuset_sem); 1793 down(&callback_sem);
1663 cs = current->cpuset; 1794
1664 if (!cs) 1795 if (current->flags & PF_EXITING) /* Let dying task have memory */
1665 goto done; /* current task exiting */ 1796 return 1;
1666 cs = nearest_exclusive_ancestor(cs); 1797 task_lock(current);
1798 cs = nearest_exclusive_ancestor(current->cpuset);
1799 task_unlock(current);
1800
1667 allowed = node_isset(node, cs->mems_allowed); 1801 allowed = node_isset(node, cs->mems_allowed);
1668done: 1802 up(&callback_sem);
1669 up(&cpuset_sem);
1670 return allowed; 1803 return allowed;
1671} 1804}
1672 1805
@@ -1679,7 +1812,7 @@ done:
1679 * determine if task @p's memory usage might impact the memory 1812 * determine if task @p's memory usage might impact the memory
1680 * available to the current task. 1813 * available to the current task.
1681 * 1814 *
1682 * Acquires cpuset_sem - not suitable for calling from a fast path. 1815 * Acquires callback_sem - not suitable for calling from a fast path.
1683 **/ 1816 **/
1684 1817
1685int cpuset_excl_nodes_overlap(const struct task_struct *p) 1818int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -1687,18 +1820,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1687 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ 1820 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1688 int overlap = 0; /* do cpusets overlap? */ 1821 int overlap = 0; /* do cpusets overlap? */
1689 1822
1690 down(&cpuset_sem); 1823 down(&callback_sem);
1691 cs1 = current->cpuset; 1824
1692 if (!cs1) 1825 task_lock(current);
1693 goto done; /* current task exiting */ 1826 if (current->flags & PF_EXITING) {
1694 cs2 = p->cpuset; 1827 task_unlock(current);
1695 if (!cs2) 1828 goto done;
1696 goto done; /* task p is exiting */ 1829 }
1697 cs1 = nearest_exclusive_ancestor(cs1); 1830 cs1 = nearest_exclusive_ancestor(current->cpuset);
1698 cs2 = nearest_exclusive_ancestor(cs2); 1831 task_unlock(current);
1832
1833 task_lock((struct task_struct *)p);
1834 if (p->flags & PF_EXITING) {
1835 task_unlock((struct task_struct *)p);
1836 goto done;
1837 }
1838 cs2 = nearest_exclusive_ancestor(p->cpuset);
1839 task_unlock((struct task_struct *)p);
1840
1699 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); 1841 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1700done: 1842done:
1701 up(&cpuset_sem); 1843 up(&callback_sem);
1702 1844
1703 return overlap; 1845 return overlap;
1704} 1846}
@@ -1707,6 +1849,10 @@ done:
1707 * proc_cpuset_show() 1849 * proc_cpuset_show()
1708 * - Print tasks cpuset path into seq_file. 1850 * - Print tasks cpuset path into seq_file.
1709 * - Used for /proc/<pid>/cpuset. 1851 * - Used for /proc/<pid>/cpuset.
1852 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
1853 * doesn't really matter if tsk->cpuset changes after we read it,
1854 * and we take manage_sem, keeping attach_task() from changing it
1855 * anyway.
1710 */ 1856 */
1711 1857
1712static int proc_cpuset_show(struct seq_file *m, void *v) 1858static int proc_cpuset_show(struct seq_file *m, void *v)
@@ -1721,10 +1867,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1721 return -ENOMEM; 1867 return -ENOMEM;
1722 1868
1723 tsk = m->private; 1869 tsk = m->private;
1724 down(&cpuset_sem); 1870 down(&manage_sem);
1725 task_lock(tsk);
1726 cs = tsk->cpuset; 1871 cs = tsk->cpuset;
1727 task_unlock(tsk);
1728 if (!cs) { 1872 if (!cs) {
1729 retval = -EINVAL; 1873 retval = -EINVAL;
1730 goto out; 1874 goto out;
@@ -1736,7 +1880,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1736 seq_puts(m, buf); 1880 seq_puts(m, buf);
1737 seq_putc(m, '\n'); 1881 seq_putc(m, '\n');
1738out: 1882out:
1739 up(&cpuset_sem); 1883 up(&manage_sem);
1740 kfree(buf); 1884 kfree(buf);
1741 return retval; 1885 return retval;
1742} 1886}