about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- kernel/cpuset.c 212
1 files changed, 103 insertions, 109 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 12815d3f1a05..c86ee051b734 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -53,7 +53,7 @@
53 53
54#include <asm/uaccess.h> 54#include <asm/uaccess.h>
55#include <asm/atomic.h> 55#include <asm/atomic.h>
56#include <asm/semaphore.h> 56#include <linux/mutex.h>
57 57
58#define CPUSET_SUPER_MAGIC 0x27e0eb 58#define CPUSET_SUPER_MAGIC 0x27e0eb
59 59
@@ -168,63 +168,57 @@ static struct vfsmount *cpuset_mount;
168static struct super_block *cpuset_sb; 168static struct super_block *cpuset_sb;
169 169
170/* 170/*
171 * We have two global cpuset semaphores below. They can nest. 171 * We have two global cpuset mutexes below. They can nest.
172 * It is ok to first take manage_sem, then nest callback_sem. We also 172 * It is ok to first take manage_mutex, then nest callback_mutex. We also
173 * require taking task_lock() when dereferencing a task's cpuset pointer. 173 * require taking task_lock() when dereferencing a task's cpuset pointer.
174 * See "The task_lock() exception", at the end of this comment. 174 * See "The task_lock() exception", at the end of this comment.
175 * 175 *
176 * A task must hold both semaphores to modify cpusets. If a task 176 * A task must hold both mutexes to modify cpusets. If a task
177 * holds manage_sem, then it blocks others wanting that semaphore, 177 * holds manage_mutex, then it blocks others wanting that mutex,
178 * ensuring that it is the only task able to also acquire callback_sem 178 * ensuring that it is the only task able to also acquire callback_mutex
179 * and be able to modify cpusets. It can perform various checks on 179 * and be able to modify cpusets. It can perform various checks on
180 * the cpuset structure first, knowing nothing will change. It can 180 * the cpuset structure first, knowing nothing will change. It can
181 * also allocate memory while just holding manage_sem. While it is 181 * also allocate memory while just holding manage_mutex. While it is
182 * performing these checks, various callback routines can briefly 182 * performing these checks, various callback routines can briefly
183 * acquire callback_sem to query cpusets. Once it is ready to make 183 * acquire callback_mutex to query cpusets. Once it is ready to make
184 * the changes, it takes callback_sem, blocking everyone else. 184 * the changes, it takes callback_mutex, blocking everyone else.
185 * 185 *
186 * Calls to the kernel memory allocator can not be made while holding 186 * Calls to the kernel memory allocator can not be made while holding
187 * callback_sem, as that would risk double tripping on callback_sem 187 * callback_mutex, as that would risk double tripping on callback_mutex
188 * from one of the callbacks into the cpuset code from within 188 * from one of the callbacks into the cpuset code from within
189 * __alloc_pages(). 189 * __alloc_pages().
190 * 190 *
191 * If a task is only holding callback_sem, then it has read-only 191 * If a task is only holding callback_mutex, then it has read-only
192 * access to cpusets. 192 * access to cpusets.
193 * 193 *
194 * The task_struct fields mems_allowed and mems_generation may only 194 * The task_struct fields mems_allowed and mems_generation may only
195 * be accessed in the context of that task, so require no locks. 195 * be accessed in the context of that task, so require no locks.
196 * 196 *
197 * Any task can increment and decrement the count field without lock. 197 * Any task can increment and decrement the count field without lock.
198 * So in general, code holding manage_sem or callback_sem can't rely 198 * So in general, code holding manage_mutex or callback_mutex can't rely
199 * on the count field not changing. However, if the count goes to 199 * on the count field not changing. However, if the count goes to
200 * zero, then only attach_task(), which holds both semaphores, can 200 * zero, then only attach_task(), which holds both mutexes, can
201 * increment it again. Because a count of zero means that no tasks 201 * increment it again. Because a count of zero means that no tasks
202 * are currently attached, therefore there is no way a task attached 202 * are currently attached, therefore there is no way a task attached
203 * to that cpuset can fork (the other way to increment the count). 203 * to that cpuset can fork (the other way to increment the count).
204 * So code holding manage_sem or callback_sem can safely assume that 204 * So code holding manage_mutex or callback_mutex can safely assume that
205 * if the count is zero, it will stay zero. Similarly, if a task 205 * if the count is zero, it will stay zero. Similarly, if a task
206 * holds manage_sem or callback_sem on a cpuset with zero count, it 206 * holds manage_mutex or callback_mutex on a cpuset with zero count, it
207 * knows that the cpuset won't be removed, as cpuset_rmdir() needs 207 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
208 * both of those semaphores. 208 * both of those mutexes.
209 *
210 * A possible optimization to improve parallelism would be to make
211 * callback_sem a R/W semaphore (rwsem), allowing the callback routines
212 * to proceed in parallel, with read access, until the holder of
213 * manage_sem needed to take this rwsem for exclusive write access
214 * and modify some cpusets.
215 * 209 *
216 * The cpuset_common_file_write handler for operations that modify 210 * The cpuset_common_file_write handler for operations that modify
217 * the cpuset hierarchy holds manage_sem across the entire operation, 211 * the cpuset hierarchy holds manage_mutex across the entire operation,
218 * single threading all such cpuset modifications across the system. 212 * single threading all such cpuset modifications across the system.
219 * 213 *
220 * The cpuset_common_file_read() handlers only hold callback_sem across 214 * The cpuset_common_file_read() handlers only hold callback_mutex across
221 * small pieces of code, such as when reading out possibly multi-word 215 * small pieces of code, such as when reading out possibly multi-word
222 * cpumasks and nodemasks. 216 * cpumasks and nodemasks.
223 * 217 *
224 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't 218 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
225 * (usually) take either semaphore. These are the two most performance 219 * (usually) take either mutex. These are the two most performance
226 * critical pieces of code here. The exception occurs on cpuset_exit(), 220 * critical pieces of code here. The exception occurs on cpuset_exit(),
227 * when a task in a notify_on_release cpuset exits. Then manage_sem 221 * when a task in a notify_on_release cpuset exits. Then manage_mutex
228 * is taken, and if the cpuset count is zero, a usermode call made 222 * is taken, and if the cpuset count is zero, a usermode call made
229 * to /sbin/cpuset_release_agent with the name of the cpuset (path 223 * to /sbin/cpuset_release_agent with the name of the cpuset (path
230 * relative to the root of cpuset file system) as the argument. 224 * relative to the root of cpuset file system) as the argument.
@@ -242,9 +236,9 @@ static struct super_block *cpuset_sb;
242 * 236 *
243 * The need for this exception arises from the action of attach_task(), 237 * The need for this exception arises from the action of attach_task(),
244 * which overwrites one task's cpuset pointer with another. It does 238 * which overwrites one task's cpuset pointer with another. It does
245 * so using both semaphores, however there are several performance 239 * so using both mutexes, however there are several performance
246 * critical places that need to reference task->cpuset without the 240 * critical places that need to reference task->cpuset without the
247 * expense of grabbing a system global semaphore. Therefore except as 241 * expense of grabbing a system global mutex. Therefore except as
248 * noted below, when dereferencing or, as in attach_task(), modifying 242 * noted below, when dereferencing or, as in attach_task(), modifying
249 * a task's cpuset pointer we use task_lock(), which acts on a spinlock 243 * a task's cpuset pointer we use task_lock(), which acts on a spinlock
250 * (task->alloc_lock) already in the task_struct routinely used for 244 * (task->alloc_lock) already in the task_struct routinely used for
@@ -256,8 +250,8 @@ static struct super_block *cpuset_sb;
256 * the routine cpuset_update_task_memory_state(). 250 * the routine cpuset_update_task_memory_state().
257 */ 251 */
258 252
259static DECLARE_MUTEX(manage_sem); 253static DEFINE_MUTEX(manage_mutex);
260static DECLARE_MUTEX(callback_sem); 254static DEFINE_MUTEX(callback_mutex);
261 255
262/* 256/*
263 * A couple of forward declarations required, due to cyclic reference loop: 257 * A couple of forward declarations required, due to cyclic reference loop:
@@ -432,7 +426,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
432} 426}
433 427
434/* 428/*
435 * Call with manage_sem held. Writes path of cpuset into buf. 429 * Call with manage_mutex held. Writes path of cpuset into buf.
436 * Returns 0 on success, -errno on error. 430 * Returns 0 on success, -errno on error.
437 */ 431 */
438 432
@@ -484,11 +478,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
484 * status of the /sbin/cpuset_release_agent task, so no sense holding 478 * status of the /sbin/cpuset_release_agent task, so no sense holding
485 * our caller up for that. 479 * our caller up for that.
486 * 480 *
487 * When we had only one cpuset semaphore, we had to call this 481 * When we had only one cpuset mutex, we had to call this
488 * without holding it, to avoid deadlock when call_usermodehelper() 482 * without holding it, to avoid deadlock when call_usermodehelper()
489 * allocated memory. With two locks, we could now call this while 483 * allocated memory. With two locks, we could now call this while
490 * holding manage_sem, but we still don't, so as to minimize 484 * holding manage_mutex, but we still don't, so as to minimize
491 * the time manage_sem is held. 485 * the time manage_mutex is held.
492 */ 486 */
493 487
494static void cpuset_release_agent(const char *pathbuf) 488static void cpuset_release_agent(const char *pathbuf)
@@ -520,15 +514,15 @@ static void cpuset_release_agent(const char *pathbuf)
520 * cs is notify_on_release() and now both the user count is zero and 514 * cs is notify_on_release() and now both the user count is zero and
521 * the list of children is empty, prepare cpuset path in a kmalloc'd 515 * the list of children is empty, prepare cpuset path in a kmalloc'd
522 * buffer, to be returned via ppathbuf, so that the caller can invoke 516 * buffer, to be returned via ppathbuf, so that the caller can invoke
523 * cpuset_release_agent() with it later on, once manage_sem is dropped. 517 * cpuset_release_agent() with it later on, once manage_mutex is dropped.
524 * Call here with manage_sem held. 518 * Call here with manage_mutex held.
525 * 519 *
526 * This check_for_release() routine is responsible for kmalloc'ing 520 * This check_for_release() routine is responsible for kmalloc'ing
527 * pathbuf. The above cpuset_release_agent() is responsible for 521 * pathbuf. The above cpuset_release_agent() is responsible for
528 * kfree'ing pathbuf. The caller of these routines is responsible 522 * kfree'ing pathbuf. The caller of these routines is responsible
529 * for providing a pathbuf pointer, initialized to NULL, then 523 * for providing a pathbuf pointer, initialized to NULL, then
530 * calling check_for_release() with manage_sem held and the address 524 * calling check_for_release() with manage_mutex held and the address
531 * of the pathbuf pointer, then dropping manage_sem, then calling 525 * of the pathbuf pointer, then dropping manage_mutex, then calling
532 * cpuset_release_agent() with pathbuf, as set by check_for_release(). 526 * cpuset_release_agent() with pathbuf, as set by check_for_release().
533 */ 527 */
534 528
@@ -559,7 +553,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
559 * One way or another, we guarantee to return some non-empty subset 553 * One way or another, we guarantee to return some non-empty subset
560 * of cpu_online_map. 554 * of cpu_online_map.
561 * 555 *
562 * Call with callback_sem held. 556 * Call with callback_mutex held.
563 */ 557 */
564 558
565static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 559static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -583,7 +577,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
583 * One way or another, we guarantee to return some non-empty subset 577 * One way or another, we guarantee to return some non-empty subset
584 * of node_online_map. 578 * of node_online_map.
585 * 579 *
586 * Call with callback_sem held. 580 * Call with callback_mutex held.
587 */ 581 */
588 582
589static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 583static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -608,12 +602,12 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
608 * current->cpuset if a task has its memory placement changed. 602 * current->cpuset if a task has its memory placement changed.
609 * Do not call this routine if in_interrupt(). 603 * Do not call this routine if in_interrupt().
610 * 604 *
611 * Call without callback_sem or task_lock() held. May be called 605 * Call without callback_mutex or task_lock() held. May be called
612 * with or without manage_sem held. Doesn't need task_lock to guard 606 * with or without manage_mutex held. Doesn't need task_lock to guard
613 * against another task changing a non-NULL cpuset pointer to NULL, 607 * against another task changing a non-NULL cpuset pointer to NULL,
614 * as that is only done by a task on itself, and if the current task 608 * as that is only done by a task on itself, and if the current task
615 * is here, it is not simultaneously in the exit code NULL'ing its 609 * is here, it is not simultaneously in the exit code NULL'ing its
616 * cpuset pointer. This routine also might acquire callback_sem and 610 * cpuset pointer. This routine also might acquire callback_mutex and
617 * current->mm->mmap_sem during call. 611 * current->mm->mmap_sem during call.
618 * 612 *
619 * Reading current->cpuset->mems_generation doesn't need task_lock 613 * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -658,13 +652,13 @@ void cpuset_update_task_memory_state(void)
658 } 652 }
659 653
660 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 654 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
661 down(&callback_sem); 655 mutex_lock(&callback_mutex);
662 task_lock(tsk); 656 task_lock(tsk);
663 cs = tsk->cpuset; /* Maybe changed when task not locked */ 657 cs = tsk->cpuset; /* Maybe changed when task not locked */
664 guarantee_online_mems(cs, &tsk->mems_allowed); 658 guarantee_online_mems(cs, &tsk->mems_allowed);
665 tsk->cpuset_mems_generation = cs->mems_generation; 659 tsk->cpuset_mems_generation = cs->mems_generation;
666 task_unlock(tsk); 660 task_unlock(tsk);
667 up(&callback_sem); 661 mutex_unlock(&callback_mutex);
668 mpol_rebind_task(tsk, &tsk->mems_allowed); 662 mpol_rebind_task(tsk, &tsk->mems_allowed);
669 } 663 }
670} 664}
@@ -674,7 +668,7 @@ void cpuset_update_task_memory_state(void)
674 * 668 *
675 * One cpuset is a subset of another if all its allowed CPUs and 669 * One cpuset is a subset of another if all its allowed CPUs and
676 * Memory Nodes are a subset of the other, and its exclusive flags 670 * Memory Nodes are a subset of the other, and its exclusive flags
677 * are only set if the other's are set. Call holding manage_sem. 671 * are only set if the other's are set. Call holding manage_mutex.
678 */ 672 */
679 673
680static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 674static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -692,7 +686,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
692 * If we replaced the flag and mask values of the current cpuset 686 * If we replaced the flag and mask values of the current cpuset
693 * (cur) with those values in the trial cpuset (trial), would 687 * (cur) with those values in the trial cpuset (trial), would
694 * our various subset and exclusive rules still be valid? Presumes 688 * our various subset and exclusive rules still be valid? Presumes
695 * manage_sem held. 689 * manage_mutex held.
696 * 690 *
697 * 'cur' is the address of an actual, in-use cpuset. Operations 691 * 'cur' is the address of an actual, in-use cpuset. Operations
698 * such as list traversal that depend on the actual address of the 692 * such as list traversal that depend on the actual address of the
@@ -746,7 +740,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
746 * exclusive child cpusets 740 * exclusive child cpusets
747 * Build these two partitions by calling partition_sched_domains 741 * Build these two partitions by calling partition_sched_domains
748 * 742 *
749 * Call with manage_sem held. May nest a call to the 743 * Call with manage_mutex held. May nest a call to the
750 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 744 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
751 */ 745 */
752 746
@@ -792,7 +786,7 @@ static void update_cpu_domains(struct cpuset *cur)
792} 786}
793 787
794/* 788/*
795 * Call with manage_sem held. May take callback_sem during call. 789 * Call with manage_mutex held. May take callback_mutex during call.
796 */ 790 */
797 791
798static int update_cpumask(struct cpuset *cs, char *buf) 792static int update_cpumask(struct cpuset *cs, char *buf)
@@ -811,9 +805,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
811 if (retval < 0) 805 if (retval < 0)
812 return retval; 806 return retval;
813 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); 807 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
814 down(&callback_sem); 808 mutex_lock(&callback_mutex);
815 cs->cpus_allowed = trialcs.cpus_allowed; 809 cs->cpus_allowed = trialcs.cpus_allowed;
816 up(&callback_sem); 810 mutex_unlock(&callback_mutex);
817 if (is_cpu_exclusive(cs) && !cpus_unchanged) 811 if (is_cpu_exclusive(cs) && !cpus_unchanged)
818 update_cpu_domains(cs); 812 update_cpu_domains(cs);
819 return 0; 813 return 0;
@@ -827,7 +821,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
827 * the cpuset is marked 'memory_migrate', migrate the tasks 821 * the cpuset is marked 'memory_migrate', migrate the tasks
828 * pages to the new memory. 822 * pages to the new memory.
829 * 823 *
830 * Call with manage_sem held. May take callback_sem during call. 824 * Call with manage_mutex held. May take callback_mutex during call.
831 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 825 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
832 * lock each such task's mm->mmap_sem, scan its vmas and rebind 826 * lock each such task's mm->mmap_sem, scan its vmas and rebind
833 * their mempolicies to the cpuset's new mems_allowed. 827 * their mempolicies to the cpuset's new mems_allowed.
@@ -862,11 +856,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
862 if (retval < 0) 856 if (retval < 0)
863 goto done; 857 goto done;
864 858
865 down(&callback_sem); 859 mutex_lock(&callback_mutex);
866 cs->mems_allowed = trialcs.mems_allowed; 860 cs->mems_allowed = trialcs.mems_allowed;
867 atomic_inc(&cpuset_mems_generation); 861 atomic_inc(&cpuset_mems_generation);
868 cs->mems_generation = atomic_read(&cpuset_mems_generation); 862 cs->mems_generation = atomic_read(&cpuset_mems_generation);
869 up(&callback_sem); 863 mutex_unlock(&callback_mutex);
870 864
871 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ 865 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
872 866
@@ -922,7 +916,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
922 * tasklist_lock. Forks can happen again now - the mpol_copy() 916 * tasklist_lock. Forks can happen again now - the mpol_copy()
923 * cpuset_being_rebound check will catch such forks, and rebind 917 * cpuset_being_rebound check will catch such forks, and rebind
924 * their vma mempolicies too. Because we still hold the global 918 * their vma mempolicies too. Because we still hold the global
925 * cpuset manage_sem, we know that no other rebind effort will 919 * cpuset manage_mutex, we know that no other rebind effort will
926 * be contending for the global variable cpuset_being_rebound. 920 * be contending for the global variable cpuset_being_rebound.
927 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 921 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
928 * is idempotent. Also migrate pages in each mm to new nodes. 922 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -948,7 +942,7 @@ done:
948} 942}
949 943
950/* 944/*
951 * Call with manage_sem held. 945 * Call with manage_mutex held.
952 */ 946 */
953 947
954static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) 948static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -967,7 +961,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
967 * cs: the cpuset to update 961 * cs: the cpuset to update
968 * buf: the buffer where we read the 0 or 1 962 * buf: the buffer where we read the 0 or 1
969 * 963 *
970 * Call with manage_sem held. 964 * Call with manage_mutex held.
971 */ 965 */
972 966
973static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 967static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -989,12 +983,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
989 return err; 983 return err;
990 cpu_exclusive_changed = 984 cpu_exclusive_changed =
991 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 985 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
992 down(&callback_sem); 986 mutex_lock(&callback_mutex);
993 if (turning_on) 987 if (turning_on)
994 set_bit(bit, &cs->flags); 988 set_bit(bit, &cs->flags);
995 else 989 else
996 clear_bit(bit, &cs->flags); 990 clear_bit(bit, &cs->flags);
997 up(&callback_sem); 991 mutex_unlock(&callback_mutex);
998 992
999 if (cpu_exclusive_changed) 993 if (cpu_exclusive_changed)
1000 update_cpu_domains(cs); 994 update_cpu_domains(cs);
@@ -1104,7 +1098,7 @@ static int fmeter_getrate(struct fmeter *fmp)
1104 * writing the path of the old cpuset in 'ppathbuf' if it needs to be 1098 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
1105 * notified on release. 1099 * notified on release.
1106 * 1100 *
1107 * Call holding manage_sem. May take callback_sem and task_lock of 1101 * Call holding manage_mutex. May take callback_mutex and task_lock of
1108 * the task 'pid' during call. 1102 * the task 'pid' during call.
1109 */ 1103 */
1110 1104
@@ -1144,13 +1138,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1144 get_task_struct(tsk); 1138 get_task_struct(tsk);
1145 } 1139 }
1146 1140
1147 down(&callback_sem); 1141 mutex_lock(&callback_mutex);
1148 1142
1149 task_lock(tsk); 1143 task_lock(tsk);
1150 oldcs = tsk->cpuset; 1144 oldcs = tsk->cpuset;
1151 if (!oldcs) { 1145 if (!oldcs) {
1152 task_unlock(tsk); 1146 task_unlock(tsk);
1153 up(&callback_sem); 1147 mutex_unlock(&callback_mutex);
1154 put_task_struct(tsk); 1148 put_task_struct(tsk);
1155 return -ESRCH; 1149 return -ESRCH;
1156 } 1150 }
@@ -1164,7 +1158,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1164 from = oldcs->mems_allowed; 1158 from = oldcs->mems_allowed;
1165 to = cs->mems_allowed; 1159 to = cs->mems_allowed;
1166 1160
1167 up(&callback_sem); 1161 mutex_unlock(&callback_mutex);
1168 1162
1169 mm = get_task_mm(tsk); 1163 mm = get_task_mm(tsk);
1170 if (mm) { 1164 if (mm) {
@@ -1221,7 +1215,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1221 } 1215 }
1222 buffer[nbytes] = 0; /* nul-terminate */ 1216 buffer[nbytes] = 0; /* nul-terminate */
1223 1217
1224 down(&manage_sem); 1218 mutex_lock(&manage_mutex);
1225 1219
1226 if (is_removed(cs)) { 1220 if (is_removed(cs)) {
1227 retval = -ENODEV; 1221 retval = -ENODEV;
@@ -1264,7 +1258,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1264 if (retval == 0) 1258 if (retval == 0)
1265 retval = nbytes; 1259 retval = nbytes;
1266out2: 1260out2:
1267 up(&manage_sem); 1261 mutex_unlock(&manage_mutex);
1268 cpuset_release_agent(pathbuf); 1262 cpuset_release_agent(pathbuf);
1269out1: 1263out1:
1270 kfree(buffer); 1264 kfree(buffer);
@@ -1304,9 +1298,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1304{ 1298{
1305 cpumask_t mask; 1299 cpumask_t mask;
1306 1300
1307 down(&callback_sem); 1301 mutex_lock(&callback_mutex);
1308 mask = cs->cpus_allowed; 1302 mask = cs->cpus_allowed;
1309 up(&callback_sem); 1303 mutex_unlock(&callback_mutex);
1310 1304
1311 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1305 return cpulist_scnprintf(page, PAGE_SIZE, mask);
1312} 1306}
@@ -1315,9 +1309,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1315{ 1309{
1316 nodemask_t mask; 1310 nodemask_t mask;
1317 1311
1318 down(&callback_sem); 1312 mutex_lock(&callback_mutex);
1319 mask = cs->mems_allowed; 1313 mask = cs->mems_allowed;
1320 up(&callback_sem); 1314 mutex_unlock(&callback_mutex);
1321 1315
1322 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1316 return nodelist_scnprintf(page, PAGE_SIZE, mask);
1323} 1317}
@@ -1598,7 +1592,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1598 * Handle an open on 'tasks' file. Prepare a buffer listing the 1592 * Handle an open on 'tasks' file. Prepare a buffer listing the
1599 * process id's of tasks currently attached to the cpuset being opened. 1593 * process id's of tasks currently attached to the cpuset being opened.
1600 * 1594 *
1601 * Does not require any specific cpuset semaphores, and does not take any. 1595 * Does not require any specific cpuset mutexes, and does not take any.
1602 */ 1596 */
1603static int cpuset_tasks_open(struct inode *unused, struct file *file) 1597static int cpuset_tasks_open(struct inode *unused, struct file *file)
1604{ 1598{
@@ -1754,7 +1748,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1754 * name: name of the new cpuset. Will be strcpy'ed. 1748 * name: name of the new cpuset. Will be strcpy'ed.
1755 * mode: mode to set on new inode 1749 * mode: mode to set on new inode
1756 * 1750 *
1757 * Must be called with the semaphore on the parent inode held 1751 * Must be called with the mutex on the parent inode held
1758 */ 1752 */
1759 1753
1760static long cpuset_create(struct cpuset *parent, const char *name, int mode) 1754static long cpuset_create(struct cpuset *parent, const char *name, int mode)
@@ -1766,7 +1760,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1766 if (!cs) 1760 if (!cs)
1767 return -ENOMEM; 1761 return -ENOMEM;
1768 1762
1769 down(&manage_sem); 1763 mutex_lock(&manage_mutex);
1770 cpuset_update_task_memory_state(); 1764 cpuset_update_task_memory_state();
1771 cs->flags = 0; 1765 cs->flags = 0;
1772 if (notify_on_release(parent)) 1766 if (notify_on_release(parent))
@@ -1782,28 +1776,28 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1782 1776
1783 cs->parent = parent; 1777 cs->parent = parent;
1784 1778
1785 down(&callback_sem); 1779 mutex_lock(&callback_mutex);
1786 list_add(&cs->sibling, &cs->parent->children); 1780 list_add(&cs->sibling, &cs->parent->children);
1787 number_of_cpusets++; 1781 number_of_cpusets++;
1788 up(&callback_sem); 1782 mutex_unlock(&callback_mutex);
1789 1783
1790 err = cpuset_create_dir(cs, name, mode); 1784 err = cpuset_create_dir(cs, name, mode);
1791 if (err < 0) 1785 if (err < 0)
1792 goto err; 1786 goto err;
1793 1787
1794 /* 1788 /*
1795 * Release manage_sem before cpuset_populate_dir() because it 1789 * Release manage_mutex before cpuset_populate_dir() because it
1796 * will lock this new directory's i_mutex and if we race with 1790 * will lock this new directory's i_mutex and if we race with
1797 * another mkdir, we might deadlock. 1791 * another mkdir, we might deadlock.
1798 */ 1792 */
1799 up(&manage_sem); 1793 mutex_unlock(&manage_mutex);
1800 1794
1801 err = cpuset_populate_dir(cs->dentry); 1795 err = cpuset_populate_dir(cs->dentry);
1802 /* If err < 0, we have a half-filled directory - oh well ;) */ 1796 /* If err < 0, we have a half-filled directory - oh well ;) */
1803 return 0; 1797 return 0;
1804err: 1798err:
1805 list_del(&cs->sibling); 1799 list_del(&cs->sibling);
1806 up(&manage_sem); 1800 mutex_unlock(&manage_mutex);
1807 kfree(cs); 1801 kfree(cs);
1808 return err; 1802 return err;
1809} 1803}
@@ -1825,18 +1819,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1825 1819
1826 /* the vfs holds both inode->i_mutex already */ 1820 /* the vfs holds both inode->i_mutex already */
1827 1821
1828 down(&manage_sem); 1822 mutex_lock(&manage_mutex);
1829 cpuset_update_task_memory_state(); 1823 cpuset_update_task_memory_state();
1830 if (atomic_read(&cs->count) > 0) { 1824 if (atomic_read(&cs->count) > 0) {
1831 up(&manage_sem); 1825 mutex_unlock(&manage_mutex);
1832 return -EBUSY; 1826 return -EBUSY;
1833 } 1827 }
1834 if (!list_empty(&cs->children)) { 1828 if (!list_empty(&cs->children)) {
1835 up(&manage_sem); 1829 mutex_unlock(&manage_mutex);
1836 return -EBUSY; 1830 return -EBUSY;
1837 } 1831 }
1838 parent = cs->parent; 1832 parent = cs->parent;
1839 down(&callback_sem); 1833 mutex_lock(&callback_mutex);
1840 set_bit(CS_REMOVED, &cs->flags); 1834 set_bit(CS_REMOVED, &cs->flags);
1841 if (is_cpu_exclusive(cs)) 1835 if (is_cpu_exclusive(cs))
1842 update_cpu_domains(cs); 1836 update_cpu_domains(cs);
@@ -1848,10 +1842,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1848 cpuset_d_remove_dir(d); 1842 cpuset_d_remove_dir(d);
1849 dput(d); 1843 dput(d);
1850 number_of_cpusets--; 1844 number_of_cpusets--;
1851 up(&callback_sem); 1845 mutex_unlock(&callback_mutex);
1852 if (list_empty(&parent->children)) 1846 if (list_empty(&parent->children))
1853 check_for_release(parent, &pathbuf); 1847 check_for_release(parent, &pathbuf);
1854 up(&manage_sem); 1848 mutex_unlock(&manage_mutex);
1855 cpuset_release_agent(pathbuf); 1849 cpuset_release_agent(pathbuf);
1856 return 0; 1850 return 0;
1857} 1851}
@@ -1960,19 +1954,19 @@ void cpuset_fork(struct task_struct *child)
1960 * Description: Detach cpuset from @tsk and release it. 1954 * Description: Detach cpuset from @tsk and release it.
1961 * 1955 *
1962 * Note that cpusets marked notify_on_release force every task in 1956 * Note that cpusets marked notify_on_release force every task in
1963 * them to take the global manage_sem semaphore when exiting. 1957 * them to take the global manage_mutex mutex when exiting.
1964 * This could impact scaling on very large systems. Be reluctant to 1958 * This could impact scaling on very large systems. Be reluctant to
1965 * use notify_on_release cpusets where very high task exit scaling 1959 * use notify_on_release cpusets where very high task exit scaling
1966 * is required on large systems. 1960 * is required on large systems.
1967 * 1961 *
1968 * Don't even think about dereferencing 'cs' after the cpuset use count 1962 * Don't even think about dereferencing 'cs' after the cpuset use count
1969 * goes to zero, except inside a critical section guarded by manage_sem 1963 * goes to zero, except inside a critical section guarded by manage_mutex
1970 * or callback_sem. Otherwise a zero cpuset use count is a license to 1964 * or callback_mutex. Otherwise a zero cpuset use count is a license to
1971 * any other task to nuke the cpuset immediately, via cpuset_rmdir(). 1965 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
1972 * 1966 *
1973 * This routine has to take manage_sem, not callback_sem, because 1967 * This routine has to take manage_mutex, not callback_mutex, because
1974 * it is holding that semaphore while calling check_for_release(), 1968 * it is holding that mutex while calling check_for_release(),
1975 * which calls kmalloc(), so can't be called holding callback__sem(). 1969 * which calls kmalloc(), so can't be called holding callback_mutex.
1976 * 1970 *
1977 * We don't need to task_lock() this reference to tsk->cpuset, 1971 * We don't need to task_lock() this reference to tsk->cpuset,
1978 * because tsk is already marked PF_EXITING, so attach_task() won't 1972 * because tsk is already marked PF_EXITING, so attach_task() won't
@@ -2022,10 +2016,10 @@ void cpuset_exit(struct task_struct *tsk)
2022 if (notify_on_release(cs)) { 2016 if (notify_on_release(cs)) {
2023 char *pathbuf = NULL; 2017 char *pathbuf = NULL;
2024 2018
2025 down(&manage_sem); 2019 mutex_lock(&manage_mutex);
2026 if (atomic_dec_and_test(&cs->count)) 2020 if (atomic_dec_and_test(&cs->count))
2027 check_for_release(cs, &pathbuf); 2021 check_for_release(cs, &pathbuf);
2028 up(&manage_sem); 2022 mutex_unlock(&manage_mutex);
2029 cpuset_release_agent(pathbuf); 2023 cpuset_release_agent(pathbuf);
2030 } else { 2024 } else {
2031 atomic_dec(&cs->count); 2025 atomic_dec(&cs->count);
@@ -2046,11 +2040,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
2046{ 2040{
2047 cpumask_t mask; 2041 cpumask_t mask;
2048 2042
2049 down(&callback_sem); 2043 mutex_lock(&callback_mutex);
2050 task_lock(tsk); 2044 task_lock(tsk);
2051 guarantee_online_cpus(tsk->cpuset, &mask); 2045 guarantee_online_cpus(tsk->cpuset, &mask);
2052 task_unlock(tsk); 2046 task_unlock(tsk);
2053 up(&callback_sem); 2047 mutex_unlock(&callback_mutex);
2054 2048
2055 return mask; 2049 return mask;
2056} 2050}
@@ -2074,11 +2068,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2074{ 2068{
2075 nodemask_t mask; 2069 nodemask_t mask;
2076 2070
2077 down(&callback_sem); 2071 mutex_lock(&callback_mutex);
2078 task_lock(tsk); 2072 task_lock(tsk);
2079 guarantee_online_mems(tsk->cpuset, &mask); 2073 guarantee_online_mems(tsk->cpuset, &mask);
2080 task_unlock(tsk); 2074 task_unlock(tsk);
2081 up(&callback_sem); 2075 mutex_unlock(&callback_mutex);
2082 2076
2083 return mask; 2077 return mask;
2084} 2078}
@@ -2104,7 +2098,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
2104 2098
2105/* 2099/*
2106 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 2100 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
2107 * ancestor to the specified cpuset. Call holding callback_sem. 2101 * ancestor to the specified cpuset. Call holding callback_mutex.
2108 * If no ancestor is mem_exclusive (an unusual configuration), then 2102 * If no ancestor is mem_exclusive (an unusual configuration), then
2109 * returns the root cpuset. 2103 * returns the root cpuset.
2110 */ 2104 */
@@ -2131,12 +2125,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2131 * GFP_KERNEL allocations are not so marked, so can escape to the 2125 * GFP_KERNEL allocations are not so marked, so can escape to the
2132 * nearest mem_exclusive ancestor cpuset. 2126 * nearest mem_exclusive ancestor cpuset.
2133 * 2127 *
2134 * Scanning up parent cpusets requires callback_sem. The __alloc_pages() 2128 * Scanning up parent cpusets requires callback_mutex. The __alloc_pages()
2135 * routine only calls here with __GFP_HARDWALL bit _not_ set if 2129 * routine only calls here with __GFP_HARDWALL bit _not_ set if
2136 * it's a GFP_KERNEL allocation, and all nodes in the current tasks 2130 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
2137 * mems_allowed came up empty on the first pass over the zonelist. 2131 * mems_allowed came up empty on the first pass over the zonelist.
2138 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 2132 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
2139 * short of memory, might require taking the callback_sem semaphore. 2133 * short of memory, might require taking the callback_mutex mutex.
2140 * 2134 *
2141 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() 2135 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
2142 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing 2136 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
@@ -2171,31 +2165,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2171 return 1; 2165 return 1;
2172 2166
2173 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2167 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2174 down(&callback_sem); 2168 mutex_lock(&callback_mutex);
2175 2169
2176 task_lock(current); 2170 task_lock(current);
2177 cs = nearest_exclusive_ancestor(current->cpuset); 2171 cs = nearest_exclusive_ancestor(current->cpuset);
2178 task_unlock(current); 2172 task_unlock(current);
2179 2173
2180 allowed = node_isset(node, cs->mems_allowed); 2174 allowed = node_isset(node, cs->mems_allowed);
2181 up(&callback_sem); 2175 mutex_unlock(&callback_mutex);
2182 return allowed; 2176 return allowed;
2183} 2177}
2184 2178
2185/** 2179/**
2186 * cpuset_lock - lock out any changes to cpuset structures 2180 * cpuset_lock - lock out any changes to cpuset structures
2187 * 2181 *
2188 * The out of memory (oom) code needs to lock down cpusets 2182 * The out of memory (oom) code needs to mutex_lock cpusets
2189 * from being changed while it scans the tasklist looking for a 2183 * from being changed while it scans the tasklist looking for a
2190 * task in an overlapping cpuset. Expose callback_sem via this 2184 * task in an overlapping cpuset. Expose callback_mutex via this
2191 * cpuset_lock() routine, so the oom code can lock it, before 2185 * cpuset_lock() routine, so the oom code can lock it, before
2192 * locking the task list. The tasklist_lock is a spinlock, so 2186 * locking the task list. The tasklist_lock is a spinlock, so
2193 * must be taken inside callback_sem. 2187 * must be taken inside callback_mutex.
2194 */ 2188 */
2195 2189
2196void cpuset_lock(void) 2190void cpuset_lock(void)
2197{ 2191{
2198 down(&callback_sem); 2192 mutex_lock(&callback_mutex);
2199} 2193}
2200 2194
2201/** 2195/**
@@ -2206,7 +2200,7 @@ void cpuset_lock(void)
2206 2200
2207void cpuset_unlock(void) 2201void cpuset_unlock(void)
2208{ 2202{
2209 up(&callback_sem); 2203 mutex_unlock(&callback_mutex);
2210} 2204}
2211 2205
2212/** 2206/**
@@ -2218,7 +2212,7 @@ void cpuset_unlock(void)
2218 * determine if task @p's memory usage might impact the memory 2212 * determine if task @p's memory usage might impact the memory
2219 * available to the current task. 2213 * available to the current task.
2220 * 2214 *
2221 * Call while holding callback_sem. 2215 * Call while holding callback_mutex.
2222 **/ 2216 **/
2223 2217
2224int cpuset_excl_nodes_overlap(const struct task_struct *p) 2218int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -2289,7 +2283,7 @@ void __cpuset_memory_pressure_bump(void)
2289 * - Used for /proc/<pid>/cpuset. 2283 * - Used for /proc/<pid>/cpuset.
2290 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2284 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2291 * doesn't really matter if tsk->cpuset changes after we read it, 2285 * doesn't really matter if tsk->cpuset changes after we read it,
2292 * and we take manage_sem, keeping attach_task() from changing it 2286 * and we take manage_mutex, keeping attach_task() from changing it
2293 * anyway. 2287 * anyway.
2294 */ 2288 */
2295 2289
@@ -2305,7 +2299,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2305 return -ENOMEM; 2299 return -ENOMEM;
2306 2300
2307 tsk = m->private; 2301 tsk = m->private;
2308 down(&manage_sem); 2302 mutex_lock(&manage_mutex);
2309 cs = tsk->cpuset; 2303 cs = tsk->cpuset;
2310 if (!cs) { 2304 if (!cs) {
2311 retval = -EINVAL; 2305 retval = -EINVAL;
@@ -2318,7 +2312,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2318 seq_puts(m, buf); 2312 seq_puts(m, buf);
2319 seq_putc(m, '\n'); 2313 seq_putc(m, '\n');
2320out: 2314out:
2321 up(&manage_sem); 2315 mutex_unlock(&manage_mutex);
2322 kfree(buf); 2316 kfree(buf);
2323 return retval; 2317 return retval;
2324} 2318}