author     Paul Menage <menage@google.com>                        2007-10-19 02:39:39 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>   2007-10-19 14:53:36 -0400
commit     8793d854edbc2774943a4b0de3304dc73991159a (patch)
tree       380b3403a0fedfcce61d9af5af1ffbcc71017abf
parent     81a6a5cdd2c5cd70874b88afe524ab09e9e869af (diff)
Task Control Groups: make cpusets a client of cgroups
Remove the filesystem support logic from the cpusets system and make cpusets a cgroup subsystem. The "cpuset" filesystem becomes a dummy filesystem; attempts to mount it get passed through to the cgroup filesystem with the appropriate options to emulate the old cpuset filesystem behaviour.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
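Editor's note: the core of the conversion is that cpusets now register with the generic cgroup framework instead of carrying their own filesystem. The diff below shows the new cpuset_can_attach()/cpuset_attach() callbacks but is truncated before the registration itself, so the following is only a minimal sketch of what such a struct cgroup_subsys initializer looks like in kernels of this era; the exact callback set and the names cpuset_create, cpuset_destroy and cpuset_populate are assumptions reconstructed from the visible parts of the patch, not quoted from it.

/*
 * Sketch only: how a subsystem such as cpuset plugs into the cgroup core.
 * The real initializer sits further down in kernel/cpuset.c and is not
 * part of the excerpt shown on this page.
 */
struct cgroup_subsys cpuset_subsys = {
	.name		= "cpuset",
	.create		= cpuset_create,	/* allocate a struct cpuset for a new cgroup dir */
	.destroy	= cpuset_destroy,	/* free it again on rmdir */
	.can_attach	= cpuset_can_attach,	/* refuse attach if cpus/mems are empty */
	.attach		= cpuset_attach,	/* rebind the task's cpus_allowed/mems_allowed */
	.populate	= cpuset_populate,	/* create the cpuset-specific control files */
	.subsys_id	= cpuset_subsys_id,	/* generated from SUBSYS(cpuset) in cgroup_subsys.h */
	.early_init	= 1,			/* top_cpuset must exist very early in boot */
};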
-rw-r--r--   Documentation/cpusets.txt         93
-rw-r--r--   fs/proc/base.c                     4
-rw-r--r--   include/linux/cgroup_subsys.h      6
-rw-r--r--   include/linux/cpuset.h            12
-rw-r--r--   include/linux/mempolicy.h         12
-rw-r--r--   include/linux/sched.h              3
-rw-r--r--   init/Kconfig                       7
-rw-r--r--   kernel/cpuset.c                 1188
-rw-r--r--   kernel/exit.c                      2
-rw-r--r--   kernel/fork.c                      3
-rw-r--r--   mm/mempolicy.c                     2
11 files changed, 277 insertions, 1055 deletions
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index ec9de6917f01..85eeab5e7e32 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -7,6 +7,7 @@ Written by Simon.Derr@bull.net
7Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. 7Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
8Modified by Paul Jackson <pj@sgi.com> 8Modified by Paul Jackson <pj@sgi.com>
9Modified by Christoph Lameter <clameter@sgi.com> 9Modified by Christoph Lameter <clameter@sgi.com>
10Modified by Paul Menage <menage@google.com>
10 11
11CONTENTS: 12CONTENTS:
12========= 13=========
@@ -16,10 +17,9 @@ CONTENTS:
16 1.2 Why are cpusets needed ? 17 1.2 Why are cpusets needed ?
17 1.3 How are cpusets implemented ? 18 1.3 How are cpusets implemented ?
18 1.4 What are exclusive cpusets ? 19 1.4 What are exclusive cpusets ?
19 1.5 What does notify_on_release do ? 20 1.5 What is memory_pressure ?
20 1.6 What is memory_pressure ? 21 1.6 What is memory spread ?
21 1.7 What is memory spread ? 22 1.7 How do I use cpusets ?
22 1.8 How do I use cpusets ?
232. Usage Examples and Syntax 232. Usage Examples and Syntax
24 2.1 Basic Usage 24 2.1 Basic Usage
25 2.2 Adding/removing cpus 25 2.2 Adding/removing cpus
@@ -44,18 +44,19 @@ hierarchy visible in a virtual file system. These are the essential
44hooks, beyond what is already present, required to manage dynamic 44hooks, beyond what is already present, required to manage dynamic
45job placement on large systems. 45job placement on large systems.
46 46
47Each task has a pointer to a cpuset. Multiple tasks may reference 47Cpusets use the generic cgroup subsystem described in
48the same cpuset. Requests by a task, using the sched_setaffinity(2) 48Documentation/cgroup.txt.
49system call to include CPUs in its CPU affinity mask, and using the 49
50mbind(2) and set_mempolicy(2) system calls to include Memory Nodes 50Requests by a task, using the sched_setaffinity(2) system call to
51in its memory policy, are both filtered through that tasks cpuset, 51include CPUs in its CPU affinity mask, and using the mbind(2) and
52filtering out any CPUs or Memory Nodes not in that cpuset. The 52set_mempolicy(2) system calls to include Memory Nodes in its memory
53scheduler will not schedule a task on a CPU that is not allowed in 53policy, are both filtered through that tasks cpuset, filtering out any
54its cpus_allowed vector, and the kernel page allocator will not 54CPUs or Memory Nodes not in that cpuset. The scheduler will not
55allocate a page on a node that is not allowed in the requesting tasks 55schedule a task on a CPU that is not allowed in its cpus_allowed
56mems_allowed vector. 56vector, and the kernel page allocator will not allocate a page on a
57 57node that is not allowed in the requesting tasks mems_allowed vector.
58User level code may create and destroy cpusets by name in the cpuset 58
59User level code may create and destroy cpusets by name in the cgroup
59virtual file system, manage the attributes and permissions of these 60virtual file system, manage the attributes and permissions of these
60cpusets and which CPUs and Memory Nodes are assigned to each cpuset, 61cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
61specify and query to which cpuset a task is assigned, and list the 62specify and query to which cpuset a task is assigned, and list the
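Editor's note on the paragraph above: the "filtering" is observable from userspace. The snippet below is a small, hypothetical illustration and is not part of this patch; run from inside a cpuset that allows only a subset of the online CPUs, a task may request any affinity mask, but the mask it reads back is clipped to the cpuset's CPUs.

/* Hypothetical userspace demo of affinity filtering through a cpuset. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t want, got;
	int cpu;

	/* Ask for every CPU the mask can express... */
	CPU_ZERO(&want);
	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		CPU_SET(cpu, &want);
	if (sched_setaffinity(0, sizeof(want), &want))
		perror("sched_setaffinity");

	/* ...then read back the effective mask, filtered by the cpuset. */
	if (sched_getaffinity(0, sizeof(got), &got) == 0)
		for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
			if (CPU_ISSET(cpu, &got))
				printf("allowed: cpu %d\n", cpu);
	return 0;
}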
@@ -115,7 +116,7 @@ Cpusets extends these two mechanisms as follows:
115 - Cpusets are sets of allowed CPUs and Memory Nodes, known to the 116 - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
116 kernel. 117 kernel.
117 - Each task in the system is attached to a cpuset, via a pointer 118 - Each task in the system is attached to a cpuset, via a pointer
118 in the task structure to a reference counted cpuset structure. 119 in the task structure to a reference counted cgroup structure.
119 - Calls to sched_setaffinity are filtered to just those CPUs 120 - Calls to sched_setaffinity are filtered to just those CPUs
120 allowed in that tasks cpuset. 121 allowed in that tasks cpuset.
121 - Calls to mbind and set_mempolicy are filtered to just 122 - Calls to mbind and set_mempolicy are filtered to just
@@ -145,15 +146,10 @@ into the rest of the kernel, none in performance critical paths:
145 - in page_alloc.c, to restrict memory to allowed nodes. 146 - in page_alloc.c, to restrict memory to allowed nodes.
146 - in vmscan.c, to restrict page recovery to the current cpuset. 147 - in vmscan.c, to restrict page recovery to the current cpuset.
147 148
148In addition a new file system, of type "cpuset" may be mounted, 149You should mount the "cgroup" filesystem type in order to enable
149typically at /dev/cpuset, to enable browsing and modifying the cpusets 150browsing and modifying the cpusets presently known to the kernel. No
150presently known to the kernel. No new system calls are added for 151new system calls are added for cpusets - all support for querying and
151cpusets - all support for querying and modifying cpusets is via 152modifying cpusets is via this cpuset file system.
152this cpuset file system.
153
154Each task under /proc has an added file named 'cpuset', displaying
155the cpuset name, as the path relative to the root of the cpuset file
156system.
157 153
158The /proc/<pid>/status file for each task has two added lines, 154The /proc/<pid>/status file for each task has two added lines,
159displaying the tasks cpus_allowed (on which CPUs it may be scheduled) 155displaying the tasks cpus_allowed (on which CPUs it may be scheduled)
@@ -163,16 +159,15 @@ in the format seen in the following example:
163 Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff 159 Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff
164 Mems_allowed: ffffffff,ffffffff 160 Mems_allowed: ffffffff,ffffffff
165 161
166Each cpuset is represented by a directory in the cpuset file system 162Each cpuset is represented by a directory in the cgroup file system
167containing the following files describing that cpuset: 163containing (on top of the standard cgroup files) the following
164files describing that cpuset:
168 165
169 - cpus: list of CPUs in that cpuset 166 - cpus: list of CPUs in that cpuset
170 - mems: list of Memory Nodes in that cpuset 167 - mems: list of Memory Nodes in that cpuset
171 - memory_migrate flag: if set, move pages to cpusets nodes 168 - memory_migrate flag: if set, move pages to cpusets nodes
172 - cpu_exclusive flag: is cpu placement exclusive? 169 - cpu_exclusive flag: is cpu placement exclusive?
173 - mem_exclusive flag: is memory placement exclusive? 170 - mem_exclusive flag: is memory placement exclusive?
174 - tasks: list of tasks (by pid) attached to that cpuset
175 - notify_on_release flag: run /sbin/cpuset_release_agent on exit?
176 - memory_pressure: measure of how much paging pressure in cpuset 171 - memory_pressure: measure of how much paging pressure in cpuset
177 172
178In addition, the root cpuset only has the following file: 173In addition, the root cpuset only has the following file:
@@ -237,21 +232,7 @@ such as requests from interrupt handlers, is allowed to be taken
237outside even a mem_exclusive cpuset. 232outside even a mem_exclusive cpuset.
238 233
239 234
2401.5 What does notify_on_release do ? 2351.5 What is memory_pressure ?
241------------------------------------
242
243If the notify_on_release flag is enabled (1) in a cpuset, then whenever
244the last task in the cpuset leaves (exits or attaches to some other
245cpuset) and the last child cpuset of that cpuset is removed, then
246the kernel runs the command /sbin/cpuset_release_agent, supplying the
247pathname (relative to the mount point of the cpuset file system) of the
248abandoned cpuset. This enables automatic removal of abandoned cpusets.
249The default value of notify_on_release in the root cpuset at system
250boot is disabled (0). The default value of other cpusets at creation
251is the current value of their parents notify_on_release setting.
252
253
2541.6 What is memory_pressure ?
255----------------------------- 236-----------------------------
256The memory_pressure of a cpuset provides a simple per-cpuset metric 237The memory_pressure of a cpuset provides a simple per-cpuset metric
257of the rate that the tasks in a cpuset are attempting to free up in 238of the rate that the tasks in a cpuset are attempting to free up in
@@ -308,7 +289,7 @@ the tasks in the cpuset, in units of reclaims attempted per second,
308times 1000. 289times 1000.
309 290
310 291
3111.7 What is memory spread ? 2921.6 What is memory spread ?
312--------------------------- 293---------------------------
313There are two boolean flag files per cpuset that control where the 294There are two boolean flag files per cpuset that control where the
314kernel allocates pages for the file system buffers and related in 295kernel allocates pages for the file system buffers and related in
@@ -379,7 +360,7 @@ data set, the memory allocation across the nodes in the jobs cpuset
379can become very uneven. 360can become very uneven.
380 361
381 362
3821.8 How do I use cpusets ? 3631.7 How do I use cpusets ?
383-------------------------- 364--------------------------
384 365
385In order to minimize the impact of cpusets on critical kernel 366In order to minimize the impact of cpusets on critical kernel
@@ -469,7 +450,7 @@ than stress the kernel.
469To start a new job that is to be contained within a cpuset, the steps are: 450To start a new job that is to be contained within a cpuset, the steps are:
470 451
471 1) mkdir /dev/cpuset 452 1) mkdir /dev/cpuset
472 2) mount -t cpuset none /dev/cpuset 453 2) mount -t cgroup -ocpuset cpuset /dev/cpuset
473 3) Create the new cpuset by doing mkdir's and write's (or echo's) in 454 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
474 the /dev/cpuset virtual file system. 455 the /dev/cpuset virtual file system.
475 4) Start a task that will be the "founding father" of the new job. 456 4) Start a task that will be the "founding father" of the new job.
@@ -481,7 +462,7 @@ For example, the following sequence of commands will setup a cpuset
481named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, 462named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
482and then start a subshell 'sh' in that cpuset: 463and then start a subshell 'sh' in that cpuset:
483 464
484 mount -t cpuset none /dev/cpuset 465 mount -t cgroup -ocpuset cpuset /dev/cpuset
485 cd /dev/cpuset 466 cd /dev/cpuset
486 mkdir Charlie 467 mkdir Charlie
487 cd Charlie 468 cd Charlie
@@ -513,7 +494,7 @@ Creating, modifying, using the cpusets can be done through the cpuset
513virtual filesystem. 494virtual filesystem.
514 495
515To mount it, type: 496To mount it, type:
516# mount -t cpuset none /dev/cpuset 497# mount -t cgroup -o cpuset cpuset /dev/cpuset
517 498
518Then under /dev/cpuset you can find a tree that corresponds to the 499Then under /dev/cpuset you can find a tree that corresponds to the
519tree of the cpusets in the system. For instance, /dev/cpuset 500tree of the cpusets in the system. For instance, /dev/cpuset
@@ -556,6 +537,18 @@ To remove a cpuset, just use rmdir:
556This will fail if the cpuset is in use (has cpusets inside, or has 537This will fail if the cpuset is in use (has cpusets inside, or has
557processes attached). 538processes attached).
558 539
540Note that for legacy reasons, the "cpuset" filesystem exists as a
541wrapper around the cgroup filesystem.
542
543The command
544
545mount -t cpuset X /dev/cpuset
546
547is equivalent to
548
549mount -t cgroup -ocpuset X /dev/cpuset
550echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent
551
5592.2 Adding/removing cpus 5522.2 Adding/removing cpus
560------------------------ 553------------------------
561 554
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0e9a9aa9df64..fbff900fd5ad 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2131,7 +2131,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2131#ifdef CONFIG_SCHEDSTATS 2131#ifdef CONFIG_SCHEDSTATS
2132 INF("schedstat", S_IRUGO, pid_schedstat), 2132 INF("schedstat", S_IRUGO, pid_schedstat),
2133#endif 2133#endif
2134#ifdef CONFIG_CPUSETS 2134#ifdef CONFIG_PROC_PID_CPUSET
2135 REG("cpuset", S_IRUGO, cpuset), 2135 REG("cpuset", S_IRUGO, cpuset),
2136#endif 2136#endif
2137#ifdef CONFIG_CGROUPS 2137#ifdef CONFIG_CGROUPS
@@ -2420,7 +2420,7 @@ static const struct pid_entry tid_base_stuff[] = {
2420#ifdef CONFIG_SCHEDSTATS 2420#ifdef CONFIG_SCHEDSTATS
2421 INF("schedstat", S_IRUGO, pid_schedstat), 2421 INF("schedstat", S_IRUGO, pid_schedstat),
2422#endif 2422#endif
2423#ifdef CONFIG_CPUSETS 2423#ifdef CONFIG_PROC_PID_CPUSET
2424 REG("cpuset", S_IRUGO, cpuset), 2424 REG("cpuset", S_IRUGO, cpuset),
2425#endif 2425#endif
2426#ifdef CONFIG_CGROUPS 2426#ifdef CONFIG_CGROUPS
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index f8eddbbcad9a..b152b51a4367 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -7,4 +7,10 @@
7 7
8/* */ 8/* */
9 9
10#ifdef CONFIG_CPUSETS
11SUBSYS(cpuset)
12#endif
13
14/* */
15
10/* */ 16/* */
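Editor's note on this header: the SUBSYS() entries are not ordinary declarations; include/linux/cgroup.h re-includes cgroup_subsys.h with different definitions of SUBSYS() to stamp out per-subsystem identifiers and declarations. A minimal sketch of the id-enum pass is shown below; this is an assumption about how cgroup.h of this era works, not part of the diff on this page.

/* Sketch: each SUBSYS(x) line above becomes an x_subsys_id enumerator. */
#define SUBSYS(_x) _x ## _subsys_id,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
	CGROUP_SUBSYS_COUNT
};
#undef SUBSYS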
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index ea44d2e768a0..31adfde1c95f 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -11,6 +11,7 @@
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/cpumask.h> 12#include <linux/cpumask.h>
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/cgroup.h>
14 15
15#ifdef CONFIG_CPUSETS 16#ifdef CONFIG_CPUSETS
16 17
@@ -19,8 +20,6 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
19extern int cpuset_init_early(void); 20extern int cpuset_init_early(void);
20extern int cpuset_init(void); 21extern int cpuset_init(void);
21extern void cpuset_init_smp(void); 22extern void cpuset_init_smp(void);
22extern void cpuset_fork(struct task_struct *p);
23extern void cpuset_exit(struct task_struct *p);
24extern cpumask_t cpuset_cpus_allowed(struct task_struct *p); 23extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
25extern nodemask_t cpuset_mems_allowed(struct task_struct *p); 24extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
26#define cpuset_current_mems_allowed (current->mems_allowed) 25#define cpuset_current_mems_allowed (current->mems_allowed)
@@ -76,13 +75,13 @@ static inline int cpuset_do_slab_mem_spread(void)
76 75
77extern void cpuset_track_online_nodes(void); 76extern void cpuset_track_online_nodes(void);
78 77
78extern int current_cpuset_is_being_rebound(void);
79
79#else /* !CONFIG_CPUSETS */ 80#else /* !CONFIG_CPUSETS */
80 81
81static inline int cpuset_init_early(void) { return 0; } 82static inline int cpuset_init_early(void) { return 0; }
82static inline int cpuset_init(void) { return 0; } 83static inline int cpuset_init(void) { return 0; }
83static inline void cpuset_init_smp(void) {} 84static inline void cpuset_init_smp(void) {}
84static inline void cpuset_fork(struct task_struct *p) {}
85static inline void cpuset_exit(struct task_struct *p) {}
86 85
87static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p) 86static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
88{ 87{
@@ -148,6 +147,11 @@ static inline int cpuset_do_slab_mem_spread(void)
148 147
149static inline void cpuset_track_online_nodes(void) {} 148static inline void cpuset_track_online_nodes(void) {}
150 149
150static inline int current_cpuset_is_being_rebound(void)
151{
152 return 0;
153}
154
151#endif /* !CONFIG_CPUSETS */ 155#endif /* !CONFIG_CPUSETS */
152 156
153#endif /* _LINUX_CPUSET_H */ 157#endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 38c04d61ee06..59c4865bc85f 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -148,14 +148,6 @@ extern void mpol_rebind_task(struct task_struct *tsk,
148 const nodemask_t *new); 148 const nodemask_t *new);
149extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); 149extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
150extern void mpol_fix_fork_child_flag(struct task_struct *p); 150extern void mpol_fix_fork_child_flag(struct task_struct *p);
151#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
152
153#ifdef CONFIG_CPUSETS
154#define current_cpuset_is_being_rebound() \
155 (cpuset_being_rebound == current->cpuset)
156#else
157#define current_cpuset_is_being_rebound() 0
158#endif
159 151
160extern struct mempolicy default_policy; 152extern struct mempolicy default_policy;
161extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, 153extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
@@ -173,8 +165,6 @@ static inline void check_highest_zone(enum zone_type k)
173int do_migrate_pages(struct mm_struct *mm, 165int do_migrate_pages(struct mm_struct *mm,
174 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags); 166 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
175 167
176extern void *cpuset_being_rebound; /* Trigger mpol_copy vma rebind */
177
178#else 168#else
179 169
180struct mempolicy {}; 170struct mempolicy {};
@@ -248,8 +238,6 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p)
248{ 238{
249} 239}
250 240
251#define set_cpuset_being_rebound(x) do {} while (0)
252
253static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, 241static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
254 unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol) 242 unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol)
255{ 243{
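Editor's note: the macros removed above are replaced by a real current_cpuset_is_being_rebound() helper exported from kernel/cpuset.c (its declaration moves into cpuset.h earlier in this patch). Its consumer is the mempolicy copy path; the mm/mempolicy.c hunk is not shown in this excerpt, so the following is only a rough, assumed sketch of how that call site uses the helper to catch a cpuset rebind in progress.

/* Rough sketch of the assumed consumer in mm/mempolicy.c (not shown here). */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	if (current_cpuset_is_being_rebound()) {
		/* The cpuset's mems are changing; rebind before copying. */
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(old, &mems);
	}
	*new = *old;
	atomic_set(&new->refcnt, 1);
	return new;
}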
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1aa1cfa63b37..93a55f2e5ef6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -756,8 +756,6 @@ static inline int above_background_load(void)
756} 756}
757 757
758struct io_context; /* See blkdev.h */ 758struct io_context; /* See blkdev.h */
759struct cpuset;
760
761#define NGROUPS_SMALL 32 759#define NGROUPS_SMALL 32
762#define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) 760#define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t)))
763struct group_info { 761struct group_info {
@@ -1125,7 +1123,6 @@ struct task_struct {
1125 short il_next; 1123 short il_next;
1126#endif 1124#endif
1127#ifdef CONFIG_CPUSETS 1125#ifdef CONFIG_CPUSETS
1128 struct cpuset *cpuset;
1129 nodemask_t mems_allowed; 1126 nodemask_t mems_allowed;
1130 int cpuset_mems_generation; 1127 int cpuset_mems_generation;
1131 int cpuset_mem_spread_rotor; 1128 int cpuset_mem_spread_rotor;
diff --git a/init/Kconfig b/init/Kconfig
index 51b3d14f44f1..18b1abc677da 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -280,7 +280,7 @@ config CGROUPS
280 280
281config CPUSETS 281config CPUSETS
282 bool "Cpuset support" 282 bool "Cpuset support"
283 depends on SMP 283 depends on SMP && CGROUPS
284 help 284 help
285 This option will let you create and manage CPUSETs which 285 This option will let you create and manage CPUSETs which
286 allow dynamically partitioning a system into sets of CPUs and 286 allow dynamically partitioning a system into sets of CPUs and
@@ -330,6 +330,11 @@ config SYSFS_DEPRECATED
330 If you are using a distro that was released in 2006 or later, 330 If you are using a distro that was released in 2006 or later,
331 it should be safe to say N here. 331 it should be safe to say N here.
332 332
333config PROC_PID_CPUSET
334 bool "Include legacy /proc/<pid>/cpuset file"
335 depends on CPUSETS
336 default y
337
333config RELAY 338config RELAY
334 bool "Kernel->user space relay support (formerly relayfs)" 339 bool "Kernel->user space relay support (formerly relayfs)"
335 help 340 help
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a40a2c4384b3..1133062395e2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -5,6 +5,7 @@
5 * 5 *
6 * Copyright (C) 2003 BULL SA. 6 * Copyright (C) 2003 BULL SA.
7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 * Copyright (C) 2006 Google, Inc
8 * 9 *
9 * Portions derived from Patrick Mochel's sysfs code. 10 * Portions derived from Patrick Mochel's sysfs code.
10 * sysfs is Copyright (c) 2001-3 Patrick Mochel 11 * sysfs is Copyright (c) 2001-3 Patrick Mochel
@@ -12,6 +13,7 @@
12 * 2003-10-10 Written by Simon Derr. 13 * 2003-10-10 Written by Simon Derr.
13 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
14 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups
15 * 17 *
16 * This file is subject to the terms and conditions of the GNU General Public 18 * This file is subject to the terms and conditions of the GNU General Public
17 * License. See the file COPYING in the main directory of the Linux 19 * License. See the file COPYING in the main directory of the Linux
@@ -53,8 +55,6 @@
53#include <asm/atomic.h> 55#include <asm/atomic.h>
54#include <linux/mutex.h> 56#include <linux/mutex.h>
55 57
56#define CPUSET_SUPER_MAGIC 0x27e0eb
57
58/* 58/*
59 * Tracks how many cpusets are currently defined in system. 59 * Tracks how many cpusets are currently defined in system.
60 * When there is only one cpuset (the root cpuset) we can 60 * When there is only one cpuset (the root cpuset) we can
@@ -62,6 +62,10 @@
62 */ 62 */
63int number_of_cpusets __read_mostly; 63int number_of_cpusets __read_mostly;
64 64
65/* Retrieve the cpuset from a cgroup */
66struct cgroup_subsys cpuset_subsys;
67struct cpuset;
68
65/* See "Frequency meter" comments, below. */ 69/* See "Frequency meter" comments, below. */
66 70
67struct fmeter { 71struct fmeter {
@@ -72,24 +76,13 @@ struct fmeter {
72}; 76};
73 77
74struct cpuset { 78struct cpuset {
79 struct cgroup_subsys_state css;
80
75 unsigned long flags; /* "unsigned long" so bitops work */ 81 unsigned long flags; /* "unsigned long" so bitops work */
76 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 82 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
77 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 83 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
78 84
79 /*
80 * Count is atomic so can incr (fork) or decr (exit) without a lock.
81 */
82 atomic_t count; /* count tasks using this cpuset */
83
84 /*
85 * We link our 'sibling' struct into our parents 'children'.
86 * Our children link their 'sibling' into our 'children'.
87 */
88 struct list_head sibling; /* my parents children */
89 struct list_head children; /* my children */
90
91 struct cpuset *parent; /* my parent */ 85 struct cpuset *parent; /* my parent */
92 struct dentry *dentry; /* cpuset fs entry */
93 86
94 /* 87 /*
95 * Copy of global cpuset_mems_generation as of the most 88 * Copy of global cpuset_mems_generation as of the most
@@ -100,13 +93,26 @@ struct cpuset {
100 struct fmeter fmeter; /* memory_pressure filter */ 93 struct fmeter fmeter; /* memory_pressure filter */
101}; 94};
102 95
96/* Retrieve the cpuset for a cgroup */
97static inline struct cpuset *cgroup_cs(struct cgroup *cont)
98{
99 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
100 struct cpuset, css);
101}
102
103/* Retrieve the cpuset for a task */
104static inline struct cpuset *task_cs(struct task_struct *task)
105{
106 return container_of(task_subsys_state(task, cpuset_subsys_id),
107 struct cpuset, css);
108}
109
110
103/* bits in struct cpuset flags field */ 111/* bits in struct cpuset flags field */
104typedef enum { 112typedef enum {
105 CS_CPU_EXCLUSIVE, 113 CS_CPU_EXCLUSIVE,
106 CS_MEM_EXCLUSIVE, 114 CS_MEM_EXCLUSIVE,
107 CS_MEMORY_MIGRATE, 115 CS_MEMORY_MIGRATE,
108 CS_REMOVED,
109 CS_NOTIFY_ON_RELEASE,
110 CS_SPREAD_PAGE, 116 CS_SPREAD_PAGE,
111 CS_SPREAD_SLAB, 117 CS_SPREAD_SLAB,
112} cpuset_flagbits_t; 118} cpuset_flagbits_t;
@@ -122,16 +128,6 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
122 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 128 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
123} 129}
124 130
125static inline int is_removed(const struct cpuset *cs)
126{
127 return test_bit(CS_REMOVED, &cs->flags);
128}
129
130static inline int notify_on_release(const struct cpuset *cs)
131{
132 return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
133}
134
135static inline int is_memory_migrate(const struct cpuset *cs) 131static inline int is_memory_migrate(const struct cpuset *cs)
136{ 132{
137 return test_bit(CS_MEMORY_MIGRATE, &cs->flags); 133 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
@@ -172,14 +168,8 @@ static struct cpuset top_cpuset = {
172 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 168 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
173 .cpus_allowed = CPU_MASK_ALL, 169 .cpus_allowed = CPU_MASK_ALL,
174 .mems_allowed = NODE_MASK_ALL, 170 .mems_allowed = NODE_MASK_ALL,
175 .count = ATOMIC_INIT(0),
176 .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
177 .children = LIST_HEAD_INIT(top_cpuset.children),
178}; 171};
179 172
180static struct vfsmount *cpuset_mount;
181static struct super_block *cpuset_sb;
182
183/* 173/*
184 * We have two global cpuset mutexes below. They can nest. 174 * We have two global cpuset mutexes below. They can nest.
185 * It is ok to first take manage_mutex, then nest callback_mutex. We also 175 * It is ok to first take manage_mutex, then nest callback_mutex. We also
@@ -263,297 +253,33 @@ static struct super_block *cpuset_sb;
263 * the routine cpuset_update_task_memory_state(). 253 * the routine cpuset_update_task_memory_state().
264 */ 254 */
265 255
266static DEFINE_MUTEX(manage_mutex);
267static DEFINE_MUTEX(callback_mutex); 256static DEFINE_MUTEX(callback_mutex);
268 257
269/* 258/* This is ugly, but preserves the userspace API for existing cpuset
270 * A couple of forward declarations required, due to cyclic reference loop: 259 * users. If someone tries to mount the "cpuset" filesystem, we
271 * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file 260 * silently switch it to mount "cgroup" instead */
272 * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
273 */
274
275static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode);
276static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry);
277
278static struct backing_dev_info cpuset_backing_dev_info = {
279 .ra_pages = 0, /* No readahead */
280 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
281};
282
283static struct inode *cpuset_new_inode(mode_t mode)
284{
285 struct inode *inode = new_inode(cpuset_sb);
286
287 if (inode) {
288 inode->i_mode = mode;
289 inode->i_uid = current->fsuid;
290 inode->i_gid = current->fsgid;
291 inode->i_blocks = 0;
292 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
293 inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
294 }
295 return inode;
296}
297
298static void cpuset_diput(struct dentry *dentry, struct inode *inode)
299{
300 /* is dentry a directory ? if so, kfree() associated cpuset */
301 if (S_ISDIR(inode->i_mode)) {
302 struct cpuset *cs = dentry->d_fsdata;
303 BUG_ON(!(is_removed(cs)));
304 kfree(cs);
305 }
306 iput(inode);
307}
308
309static struct dentry_operations cpuset_dops = {
310 .d_iput = cpuset_diput,
311};
312
313static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
314{
315 struct dentry *d = lookup_one_len(name, parent, strlen(name));
316 if (!IS_ERR(d))
317 d->d_op = &cpuset_dops;
318 return d;
319}
320
321static void remove_dir(struct dentry *d)
322{
323 struct dentry *parent = dget(d->d_parent);
324
325 d_delete(d);
326 simple_rmdir(parent->d_inode, d);
327 dput(parent);
328}
329
330/*
331 * NOTE : the dentry must have been dget()'ed
332 */
333static void cpuset_d_remove_dir(struct dentry *dentry)
334{
335 struct list_head *node;
336
337 spin_lock(&dcache_lock);
338 node = dentry->d_subdirs.next;
339 while (node != &dentry->d_subdirs) {
340 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
341 list_del_init(node);
342 if (d->d_inode) {
343 d = dget_locked(d);
344 spin_unlock(&dcache_lock);
345 d_delete(d);
346 simple_unlink(dentry->d_inode, d);
347 dput(d);
348 spin_lock(&dcache_lock);
349 }
350 node = dentry->d_subdirs.next;
351 }
352 list_del_init(&dentry->d_u.d_child);
353 spin_unlock(&dcache_lock);
354 remove_dir(dentry);
355}
356
357static struct super_operations cpuset_ops = {
358 .statfs = simple_statfs,
359 .drop_inode = generic_delete_inode,
360};
361
362static int cpuset_fill_super(struct super_block *sb, void *unused_data,
363 int unused_silent)
364{
365 struct inode *inode;
366 struct dentry *root;
367
368 sb->s_blocksize = PAGE_CACHE_SIZE;
369 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
370 sb->s_magic = CPUSET_SUPER_MAGIC;
371 sb->s_op = &cpuset_ops;
372 cpuset_sb = sb;
373
374 inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR);
375 if (inode) {
376 inode->i_op = &simple_dir_inode_operations;
377 inode->i_fop = &simple_dir_operations;
378 /* directories start off with i_nlink == 2 (for "." entry) */
379 inc_nlink(inode);
380 } else {
381 return -ENOMEM;
382 }
383
384 root = d_alloc_root(inode);
385 if (!root) {
386 iput(inode);
387 return -ENOMEM;
388 }
389 sb->s_root = root;
390 return 0;
391}
392
393static int cpuset_get_sb(struct file_system_type *fs_type, 261static int cpuset_get_sb(struct file_system_type *fs_type,
394 int flags, const char *unused_dev_name, 262 int flags, const char *unused_dev_name,
395 void *data, struct vfsmount *mnt) 263 void *data, struct vfsmount *mnt)
396{ 264{
397 return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); 265 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
266 int ret = -ENODEV;
267 if (cgroup_fs) {
268 char mountopts[] =
269 "cpuset,noprefix,"
270 "release_agent=/sbin/cpuset_release_agent";
271 ret = cgroup_fs->get_sb(cgroup_fs, flags,
272 unused_dev_name, mountopts, mnt);
273 put_filesystem(cgroup_fs);
274 }
275 return ret;
398} 276}
399 277
400static struct file_system_type cpuset_fs_type = { 278static struct file_system_type cpuset_fs_type = {
401 .name = "cpuset", 279 .name = "cpuset",
402 .get_sb = cpuset_get_sb, 280 .get_sb = cpuset_get_sb,
403 .kill_sb = kill_litter_super,
404}; 281};
405 282
406/* struct cftype:
407 *
408 * The files in the cpuset filesystem mostly have a very simple read/write
409 * handling, some common function will take care of it. Nevertheless some cases
410 * (read tasks) are special and therefore I define this structure for every
411 * kind of file.
412 *
413 *
414 * When reading/writing to a file:
415 * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
416 * - the 'cftype' of the file is file->f_path.dentry->d_fsdata
417 */
418
419struct cftype {
420 char *name;
421 int private;
422 int (*open) (struct inode *inode, struct file *file);
423 ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes,
424 loff_t *ppos);
425 int (*write) (struct file *file, const char __user *buf, size_t nbytes,
426 loff_t *ppos);
427 int (*release) (struct inode *inode, struct file *file);
428};
429
430static inline struct cpuset *__d_cs(struct dentry *dentry)
431{
432 return dentry->d_fsdata;
433}
434
435static inline struct cftype *__d_cft(struct dentry *dentry)
436{
437 return dentry->d_fsdata;
438}
439
440/*
441 * Call with manage_mutex held. Writes path of cpuset into buf.
442 * Returns 0 on success, -errno on error.
443 */
444
445static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
446{
447 char *start;
448
449 start = buf + buflen;
450
451 *--start = '\0';
452 for (;;) {
453 int len = cs->dentry->d_name.len;
454 if ((start -= len) < buf)
455 return -ENAMETOOLONG;
456 memcpy(start, cs->dentry->d_name.name, len);
457 cs = cs->parent;
458 if (!cs)
459 break;
460 if (!cs->parent)
461 continue;
462 if (--start < buf)
463 return -ENAMETOOLONG;
464 *start = '/';
465 }
466 memmove(buf, start, buf + buflen - start);
467 return 0;
468}
469
470/*
471 * Notify userspace when a cpuset is released, by running
472 * /sbin/cpuset_release_agent with the name of the cpuset (path
473 * relative to the root of cpuset file system) as the argument.
474 *
475 * Most likely, this user command will try to rmdir this cpuset.
476 *
477 * This races with the possibility that some other task will be
478 * attached to this cpuset before it is removed, or that some other
479 * user task will 'mkdir' a child cpuset of this cpuset. That's ok.
480 * The presumed 'rmdir' will fail quietly if this cpuset is no longer
481 * unused, and this cpuset will be reprieved from its death sentence,
482 * to continue to serve a useful existence. Next time it's released,
483 * we will get notified again, if it still has 'notify_on_release' set.
484 *
485 * The final arg to call_usermodehelper() is 0, which means don't
486 * wait. The separate /sbin/cpuset_release_agent task is forked by
487 * call_usermodehelper(), then control in this thread returns here,
488 * without waiting for the release agent task. We don't bother to
489 * wait because the caller of this routine has no use for the exit
490 * status of the /sbin/cpuset_release_agent task, so no sense holding
491 * our caller up for that.
492 *
493 * When we had only one cpuset mutex, we had to call this
494 * without holding it, to avoid deadlock when call_usermodehelper()
495 * allocated memory. With two locks, we could now call this while
496 * holding manage_mutex, but we still don't, so as to minimize
497 * the time manage_mutex is held.
498 */
499
500static void cpuset_release_agent(const char *pathbuf)
501{
502 char *argv[3], *envp[3];
503 int i;
504
505 if (!pathbuf)
506 return;
507
508 i = 0;
509 argv[i++] = "/sbin/cpuset_release_agent";
510 argv[i++] = (char *)pathbuf;
511 argv[i] = NULL;
512
513 i = 0;
514 /* minimal command environment */
515 envp[i++] = "HOME=/";
516 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
517 envp[i] = NULL;
518
519 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
520 kfree(pathbuf);
521}
522
523/*
524 * Either cs->count of using tasks transitioned to zero, or the
525 * cs->children list of child cpusets just became empty. If this
526 * cs is notify_on_release() and now both the user count is zero and
527 * the list of children is empty, prepare cpuset path in a kmalloc'd
528 * buffer, to be returned via ppathbuf, so that the caller can invoke
529 * cpuset_release_agent() with it later on, once manage_mutex is dropped.
530 * Call here with manage_mutex held.
531 *
532 * This check_for_release() routine is responsible for kmalloc'ing
533 * pathbuf. The above cpuset_release_agent() is responsible for
534 * kfree'ing pathbuf. The caller of these routines is responsible
535 * for providing a pathbuf pointer, initialized to NULL, then
536 * calling check_for_release() with manage_mutex held and the address
537 * of the pathbuf pointer, then dropping manage_mutex, then calling
538 * cpuset_release_agent() with pathbuf, as set by check_for_release().
539 */
540
541static void check_for_release(struct cpuset *cs, char **ppathbuf)
542{
543 if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
544 list_empty(&cs->children)) {
545 char *buf;
546
547 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
548 if (!buf)
549 return;
550 if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
551 kfree(buf);
552 else
553 *ppathbuf = buf;
554 }
555}
556
557/* 283/*
558 * Return in *pmask the portion of a cpusets's cpus_allowed that 284 * Return in *pmask the portion of a cpusets's cpus_allowed that
559 * are online. If none are online, walk up the cpuset hierarchy 285 * are online. If none are online, walk up the cpuset hierarchy
@@ -653,20 +379,19 @@ void cpuset_update_task_memory_state(void)
653 struct task_struct *tsk = current; 379 struct task_struct *tsk = current;
654 struct cpuset *cs; 380 struct cpuset *cs;
655 381
656 if (tsk->cpuset == &top_cpuset) { 382 if (task_cs(tsk) == &top_cpuset) {
657 /* Don't need rcu for top_cpuset. It's never freed. */ 383 /* Don't need rcu for top_cpuset. It's never freed. */
658 my_cpusets_mem_gen = top_cpuset.mems_generation; 384 my_cpusets_mem_gen = top_cpuset.mems_generation;
659 } else { 385 } else {
660 rcu_read_lock(); 386 rcu_read_lock();
661 cs = rcu_dereference(tsk->cpuset); 387 my_cpusets_mem_gen = task_cs(current)->mems_generation;
662 my_cpusets_mem_gen = cs->mems_generation;
663 rcu_read_unlock(); 388 rcu_read_unlock();
664 } 389 }
665 390
666 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 391 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
667 mutex_lock(&callback_mutex); 392 mutex_lock(&callback_mutex);
668 task_lock(tsk); 393 task_lock(tsk);
669 cs = tsk->cpuset; /* Maybe changed when task not locked */ 394 cs = task_cs(tsk); /* Maybe changed when task not locked */
670 guarantee_online_mems(cs, &tsk->mems_allowed); 395 guarantee_online_mems(cs, &tsk->mems_allowed);
671 tsk->cpuset_mems_generation = cs->mems_generation; 396 tsk->cpuset_mems_generation = cs->mems_generation;
672 if (is_spread_page(cs)) 397 if (is_spread_page(cs))
@@ -721,11 +446,12 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
721 446
722static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 447static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
723{ 448{
449 struct cgroup *cont;
724 struct cpuset *c, *par; 450 struct cpuset *c, *par;
725 451
726 /* Each of our child cpusets must be a subset of us */ 452 /* Each of our child cpusets must be a subset of us */
727 list_for_each_entry(c, &cur->children, sibling) { 453 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
728 if (!is_cpuset_subset(c, trial)) 454 if (!is_cpuset_subset(cgroup_cs(cont), trial))
729 return -EBUSY; 455 return -EBUSY;
730 } 456 }
731 457
@@ -740,7 +466,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
740 return -EACCES; 466 return -EACCES;
741 467
742 /* If either I or some sibling (!= me) is exclusive, we can't overlap */ 468 /* If either I or some sibling (!= me) is exclusive, we can't overlap */
743 list_for_each_entry(c, &par->children, sibling) { 469 list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
470 c = cgroup_cs(cont);
744 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 471 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
745 c != cur && 472 c != cur &&
746 cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) 473 cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -783,7 +510,8 @@ static int update_cpumask(struct cpuset *cs, char *buf)
783 } 510 }
784 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); 511 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
785 /* cpus_allowed cannot be empty for a cpuset with attached tasks. */ 512 /* cpus_allowed cannot be empty for a cpuset with attached tasks. */
786 if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed)) 513 if (cgroup_task_count(cs->css.cgroup) &&
514 cpus_empty(trialcs.cpus_allowed))
787 return -ENOSPC; 515 return -ENOSPC;
788 retval = validate_change(cs, &trialcs); 516 retval = validate_change(cs, &trialcs);
789 if (retval < 0) 517 if (retval < 0)
@@ -839,7 +567,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
839 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 567 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
840 568
841 mutex_lock(&callback_mutex); 569 mutex_lock(&callback_mutex);
842 guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed); 570 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
843 mutex_unlock(&callback_mutex); 571 mutex_unlock(&callback_mutex);
844} 572}
845 573
@@ -857,16 +585,19 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
857 * their mempolicies to the cpusets new mems_allowed. 585 * their mempolicies to the cpusets new mems_allowed.
858 */ 586 */
859 587
588static void *cpuset_being_rebound;
589
860static int update_nodemask(struct cpuset *cs, char *buf) 590static int update_nodemask(struct cpuset *cs, char *buf)
861{ 591{
862 struct cpuset trialcs; 592 struct cpuset trialcs;
863 nodemask_t oldmem; 593 nodemask_t oldmem;
864 struct task_struct *g, *p; 594 struct task_struct *p;
865 struct mm_struct **mmarray; 595 struct mm_struct **mmarray;
866 int i, n, ntasks; 596 int i, n, ntasks;
867 int migrate; 597 int migrate;
868 int fudge; 598 int fudge;
869 int retval; 599 int retval;
600 struct cgroup_iter it;
870 601
871 /* 602 /*
872 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 603 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -909,7 +640,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
909 goto done; 640 goto done;
910 } 641 }
911 /* mems_allowed cannot be empty for a cpuset with attached tasks. */ 642 /* mems_allowed cannot be empty for a cpuset with attached tasks. */
912 if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) { 643 if (cgroup_task_count(cs->css.cgroup) &&
644 nodes_empty(trialcs.mems_allowed)) {
913 retval = -ENOSPC; 645 retval = -ENOSPC;
914 goto done; 646 goto done;
915 } 647 }
@@ -922,7 +654,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
922 cs->mems_generation = cpuset_mems_generation++; 654 cs->mems_generation = cpuset_mems_generation++;
923 mutex_unlock(&callback_mutex); 655 mutex_unlock(&callback_mutex);
924 656
925 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ 657 cpuset_being_rebound = cs; /* causes mpol_copy() rebind */
926 658
927 fudge = 10; /* spare mmarray[] slots */ 659 fudge = 10; /* spare mmarray[] slots */
928 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ 660 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
@@ -936,13 +668,13 @@ static int update_nodemask(struct cpuset *cs, char *buf)
936 * enough mmarray[] w/o using GFP_ATOMIC. 668 * enough mmarray[] w/o using GFP_ATOMIC.
937 */ 669 */
938 while (1) { 670 while (1) {
939 ntasks = atomic_read(&cs->count); /* guess */ 671 ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
940 ntasks += fudge; 672 ntasks += fudge;
941 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); 673 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
942 if (!mmarray) 674 if (!mmarray)
943 goto done; 675 goto done;
944 read_lock(&tasklist_lock); /* block fork */ 676 read_lock(&tasklist_lock); /* block fork */
945 if (atomic_read(&cs->count) <= ntasks) 677 if (cgroup_task_count(cs->css.cgroup) <= ntasks)
946 break; /* got enough */ 678 break; /* got enough */
947 read_unlock(&tasklist_lock); /* try again */ 679 read_unlock(&tasklist_lock); /* try again */
948 kfree(mmarray); 680 kfree(mmarray);
@@ -951,21 +683,21 @@ static int update_nodemask(struct cpuset *cs, char *buf)
951 n = 0; 683 n = 0;
952 684
953 /* Load up mmarray[] with mm reference for each task in cpuset. */ 685 /* Load up mmarray[] with mm reference for each task in cpuset. */
954 do_each_thread(g, p) { 686 cgroup_iter_start(cs->css.cgroup, &it);
687 while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
955 struct mm_struct *mm; 688 struct mm_struct *mm;
956 689
957 if (n >= ntasks) { 690 if (n >= ntasks) {
958 printk(KERN_WARNING 691 printk(KERN_WARNING
959 "Cpuset mempolicy rebind incomplete.\n"); 692 "Cpuset mempolicy rebind incomplete.\n");
960 continue; 693 break;
961 } 694 }
962 if (p->cpuset != cs)
963 continue;
964 mm = get_task_mm(p); 695 mm = get_task_mm(p);
965 if (!mm) 696 if (!mm)
966 continue; 697 continue;
967 mmarray[n++] = mm; 698 mmarray[n++] = mm;
968 } while_each_thread(g, p); 699 }
700 cgroup_iter_end(cs->css.cgroup, &it);
969 read_unlock(&tasklist_lock); 701 read_unlock(&tasklist_lock);
970 702
971 /* 703 /*
@@ -993,12 +725,17 @@ static int update_nodemask(struct cpuset *cs, char *buf)
993 725
994 /* We're done rebinding vma's to this cpusets new mems_allowed. */ 726 /* We're done rebinding vma's to this cpusets new mems_allowed. */
995 kfree(mmarray); 727 kfree(mmarray);
996 set_cpuset_being_rebound(NULL); 728 cpuset_being_rebound = NULL;
997 retval = 0; 729 retval = 0;
998done: 730done:
999 return retval; 731 return retval;
1000} 732}
1001 733
734int current_cpuset_is_being_rebound(void)
735{
736 return task_cs(current) == cpuset_being_rebound;
737}
738
1002/* 739/*
1003 * Call with manage_mutex held. 740 * Call with manage_mutex held.
1004 */ 741 */
@@ -1145,85 +882,34 @@ static int fmeter_getrate(struct fmeter *fmp)
1145 return val; 882 return val;
1146} 883}
1147 884
1148/* 885static int cpuset_can_attach(struct cgroup_subsys *ss,
1149 * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly 886 struct cgroup *cont, struct task_struct *tsk)
1150 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
1151 * notified on release.
1152 *
1153 * Call holding manage_mutex. May take callback_mutex and task_lock of
1154 * the task 'pid' during call.
1155 */
1156
1157static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1158{ 887{
1159 pid_t pid; 888 struct cpuset *cs = cgroup_cs(cont);
1160 struct task_struct *tsk;
1161 struct cpuset *oldcs;
1162 cpumask_t cpus;
1163 nodemask_t from, to;
1164 struct mm_struct *mm;
1165 int retval;
1166 889
1167 if (sscanf(pidbuf, "%d", &pid) != 1)
1168 return -EIO;
1169 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 890 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1170 return -ENOSPC; 891 return -ENOSPC;
1171 892
1172 if (pid) { 893 return security_task_setscheduler(tsk, 0, NULL);
1173 read_lock(&tasklist_lock); 894}
1174
1175 tsk = find_task_by_pid(pid);
1176 if (!tsk || tsk->flags & PF_EXITING) {
1177 read_unlock(&tasklist_lock);
1178 return -ESRCH;
1179 }
1180
1181 get_task_struct(tsk);
1182 read_unlock(&tasklist_lock);
1183
1184 if ((current->euid) && (current->euid != tsk->uid)
1185 && (current->euid != tsk->suid)) {
1186 put_task_struct(tsk);
1187 return -EACCES;
1188 }
1189 } else {
1190 tsk = current;
1191 get_task_struct(tsk);
1192 }
1193 895
1194 retval = security_task_setscheduler(tsk, 0, NULL); 896static void cpuset_attach(struct cgroup_subsys *ss,
1195 if (retval) { 897 struct cgroup *cont, struct cgroup *oldcont,
1196 put_task_struct(tsk); 898 struct task_struct *tsk)
1197 return retval; 899{
1198 } 900 cpumask_t cpus;
901 nodemask_t from, to;
902 struct mm_struct *mm;
903 struct cpuset *cs = cgroup_cs(cont);
904 struct cpuset *oldcs = cgroup_cs(oldcont);
1199 905
1200 mutex_lock(&callback_mutex); 906 mutex_lock(&callback_mutex);
1201
1202 task_lock(tsk);
1203 oldcs = tsk->cpuset;
1204 /*
1205 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
1206 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
1207 * then fail this attach_task(), to avoid breaking top_cpuset.count.
1208 */
1209 if (tsk->flags & PF_EXITING) {
1210 task_unlock(tsk);
1211 mutex_unlock(&callback_mutex);
1212 put_task_struct(tsk);
1213 return -ESRCH;
1214 }
1215 atomic_inc(&cs->count);
1216 rcu_assign_pointer(tsk->cpuset, cs);
1217 task_unlock(tsk);
1218
1219 guarantee_online_cpus(cs, &cpus); 907 guarantee_online_cpus(cs, &cpus);
1220 set_cpus_allowed(tsk, cpus); 908 set_cpus_allowed(tsk, cpus);
909 mutex_unlock(&callback_mutex);
1221 910
1222 from = oldcs->mems_allowed; 911 from = oldcs->mems_allowed;
1223 to = cs->mems_allowed; 912 to = cs->mems_allowed;
1224
1225 mutex_unlock(&callback_mutex);
1226
1227 mm = get_task_mm(tsk); 913 mm = get_task_mm(tsk);
1228 if (mm) { 914 if (mm) {
1229 mpol_rebind_mm(mm, &to); 915 mpol_rebind_mm(mm, &to);
@@ -1232,40 +918,31 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1232 mmput(mm); 918 mmput(mm);
1233 } 919 }
1234 920
1235 put_task_struct(tsk);
1236 synchronize_rcu();
1237 if (atomic_dec_and_test(&oldcs->count))
1238 check_for_release(oldcs, ppathbuf);
1239 return 0;
1240} 921}
1241 922
1242/* The various types of files and directories in a cpuset file system */ 923/* The various types of files and directories in a cpuset file system */
1243 924
1244typedef enum { 925typedef enum {
1245 FILE_ROOT,
1246 FILE_DIR,
1247 FILE_MEMORY_MIGRATE, 926 FILE_MEMORY_MIGRATE,
1248 FILE_CPULIST, 927 FILE_CPULIST,
1249 FILE_MEMLIST, 928 FILE_MEMLIST,
1250 FILE_CPU_EXCLUSIVE, 929 FILE_CPU_EXCLUSIVE,
1251 FILE_MEM_EXCLUSIVE, 930 FILE_MEM_EXCLUSIVE,
1252 FILE_NOTIFY_ON_RELEASE,
1253 FILE_MEMORY_PRESSURE_ENABLED, 931 FILE_MEMORY_PRESSURE_ENABLED,
1254 FILE_MEMORY_PRESSURE, 932 FILE_MEMORY_PRESSURE,
1255 FILE_SPREAD_PAGE, 933 FILE_SPREAD_PAGE,
1256 FILE_SPREAD_SLAB, 934 FILE_SPREAD_SLAB,
1257 FILE_TASKLIST,
1258} cpuset_filetype_t; 935} cpuset_filetype_t;
1259 936
1260static ssize_t cpuset_common_file_write(struct file *file, 937static ssize_t cpuset_common_file_write(struct cgroup *cont,
938 struct cftype *cft,
939 struct file *file,
1261 const char __user *userbuf, 940 const char __user *userbuf,
1262 size_t nbytes, loff_t *unused_ppos) 941 size_t nbytes, loff_t *unused_ppos)
1263{ 942{
1264 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); 943 struct cpuset *cs = cgroup_cs(cont);
1265 struct cftype *cft = __d_cft(file->f_path.dentry);
1266 cpuset_filetype_t type = cft->private; 944 cpuset_filetype_t type = cft->private;
1267 char *buffer; 945 char *buffer;
1268 char *pathbuf = NULL;
1269 int retval = 0; 946 int retval = 0;
1270 947
1271 /* Crude upper limit on largest legitimate cpulist user might write. */ 948 /* Crude upper limit on largest legitimate cpulist user might write. */
@@ -1282,9 +959,9 @@ static ssize_t cpuset_common_file_write(struct file *file,
1282 } 959 }
1283 buffer[nbytes] = 0; /* nul-terminate */ 960 buffer[nbytes] = 0; /* nul-terminate */
1284 961
1285 mutex_lock(&manage_mutex); 962 cgroup_lock();
1286 963
1287 if (is_removed(cs)) { 964 if (cgroup_is_removed(cont)) {
1288 retval = -ENODEV; 965 retval = -ENODEV;
1289 goto out2; 966 goto out2;
1290 } 967 }
@@ -1302,9 +979,6 @@ static ssize_t cpuset_common_file_write(struct file *file,
1302 case FILE_MEM_EXCLUSIVE: 979 case FILE_MEM_EXCLUSIVE:
1303 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); 980 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
1304 break; 981 break;
1305 case FILE_NOTIFY_ON_RELEASE:
1306 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
1307 break;
1308 case FILE_MEMORY_MIGRATE: 982 case FILE_MEMORY_MIGRATE:
1309 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); 983 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
1310 break; 984 break;
@@ -1322,9 +996,6 @@ static ssize_t cpuset_common_file_write(struct file *file,
1322 retval = update_flag(CS_SPREAD_SLAB, cs, buffer); 996 retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
1323 cs->mems_generation = cpuset_mems_generation++; 997 cs->mems_generation = cpuset_mems_generation++;
1324 break; 998 break;
1325 case FILE_TASKLIST:
1326 retval = attach_task(cs, buffer, &pathbuf);
1327 break;
1328 default: 999 default:
1329 retval = -EINVAL; 1000 retval = -EINVAL;
1330 goto out2; 1001 goto out2;
@@ -1333,30 +1004,12 @@ static ssize_t cpuset_common_file_write(struct file *file,
1333 if (retval == 0) 1004 if (retval == 0)
1334 retval = nbytes; 1005 retval = nbytes;
1335out2: 1006out2:
1336 mutex_unlock(&manage_mutex); 1007 cgroup_unlock();
1337 cpuset_release_agent(pathbuf);
1338out1: 1008out1:
1339 kfree(buffer); 1009 kfree(buffer);
1340 return retval; 1010 return retval;
1341} 1011}
1342 1012
1343static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
1344 size_t nbytes, loff_t *ppos)
1345{
1346 ssize_t retval = 0;
1347 struct cftype *cft = __d_cft(file->f_path.dentry);
1348 if (!cft)
1349 return -ENODEV;
1350
1351 /* special function ? */
1352 if (cft->write)
1353 retval = cft->write(file, buf, nbytes, ppos);
1354 else
1355 retval = cpuset_common_file_write(file, buf, nbytes, ppos);
1356
1357 return retval;
1358}
1359
1360/* 1013/*
1361 * These ascii lists should be read in a single call, by using a user 1014 * These ascii lists should be read in a single call, by using a user
1362 * buffer large enough to hold the entire map. If read in smaller 1015 * buffer large enough to hold the entire map. If read in smaller
@@ -1391,11 +1044,13 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1391 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1044 return nodelist_scnprintf(page, PAGE_SIZE, mask);
1392} 1045}
1393 1046
1394static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, 1047static ssize_t cpuset_common_file_read(struct cgroup *cont,
1395 size_t nbytes, loff_t *ppos) 1048 struct cftype *cft,
1049 struct file *file,
1050 char __user *buf,
1051 size_t nbytes, loff_t *ppos)
1396{ 1052{
1397 struct cftype *cft = __d_cft(file->f_path.dentry); 1053 struct cpuset *cs = cgroup_cs(cont);
1398 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1399 cpuset_filetype_t type = cft->private; 1054 cpuset_filetype_t type = cft->private;
1400 char *page; 1055 char *page;
1401 ssize_t retval = 0; 1056 ssize_t retval = 0;
@@ -1419,9 +1074,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1419 case FILE_MEM_EXCLUSIVE: 1074 case FILE_MEM_EXCLUSIVE:
1420 *s++ = is_mem_exclusive(cs) ? '1' : '0'; 1075 *s++ = is_mem_exclusive(cs) ? '1' : '0';
1421 break; 1076 break;
1422 case FILE_NOTIFY_ON_RELEASE:
1423 *s++ = notify_on_release(cs) ? '1' : '0';
1424 break;
1425 case FILE_MEMORY_MIGRATE: 1077 case FILE_MEMORY_MIGRATE:
1426 *s++ = is_memory_migrate(cs) ? '1' : '0'; 1078 *s++ = is_memory_migrate(cs) ? '1' : '0';
1427 break; 1079 break;
@@ -1449,390 +1101,141 @@ out:
1449 return retval; 1101 return retval;
1450} 1102}
1451 1103
1452static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbytes,
1453 loff_t *ppos)
1454{
1455 ssize_t retval = 0;
1456 struct cftype *cft = __d_cft(file->f_path.dentry);
1457 if (!cft)
1458 return -ENODEV;
1459 1104
1460 /* special function ? */
1461 if (cft->read)
1462 retval = cft->read(file, buf, nbytes, ppos);
1463 else
1464 retval = cpuset_common_file_read(file, buf, nbytes, ppos);
1465 1105
1466 return retval;
1467}
1468 1106
1469static int cpuset_file_open(struct inode *inode, struct file *file)
1470{
1471 int err;
1472 struct cftype *cft;
1473
1474 err = generic_file_open(inode, file);
1475 if (err)
1476 return err;
1477
1478 cft = __d_cft(file->f_path.dentry);
1479 if (!cft)
1480 return -ENODEV;
1481 if (cft->open)
1482 err = cft->open(inode, file);
1483 else
1484 err = 0;
1485
1486 return err;
1487}
1488
1489static int cpuset_file_release(struct inode *inode, struct file *file)
1490{
1491 struct cftype *cft = __d_cft(file->f_path.dentry);
1492 if (cft->release)
1493 return cft->release(inode, file);
1494 return 0;
1495}
1496
1497/*
1498 * cpuset_rename - Only allow simple rename of directories in place.
1499 */
1500static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
1501 struct inode *new_dir, struct dentry *new_dentry)
1502{
1503 if (!S_ISDIR(old_dentry->d_inode->i_mode))
1504 return -ENOTDIR;
1505 if (new_dentry->d_inode)
1506 return -EEXIST;
1507 if (old_dir != new_dir)
1508 return -EIO;
1509 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1510}
1511
1512static const struct file_operations cpuset_file_operations = {
1513 .read = cpuset_file_read,
1514 .write = cpuset_file_write,
1515 .llseek = generic_file_llseek,
1516 .open = cpuset_file_open,
1517 .release = cpuset_file_release,
1518};
1519
1520static const struct inode_operations cpuset_dir_inode_operations = {
1521 .lookup = simple_lookup,
1522 .mkdir = cpuset_mkdir,
1523 .rmdir = cpuset_rmdir,
1524 .rename = cpuset_rename,
1525};
1526
1527static int cpuset_create_file(struct dentry *dentry, int mode)
1528{
1529 struct inode *inode;
1530
1531 if (!dentry)
1532 return -ENOENT;
1533 if (dentry->d_inode)
1534 return -EEXIST;
1535
1536 inode = cpuset_new_inode(mode);
1537 if (!inode)
1538 return -ENOMEM;
1539
1540 if (S_ISDIR(mode)) {
1541 inode->i_op = &cpuset_dir_inode_operations;
1542 inode->i_fop = &simple_dir_operations;
1543
1544 /* start off with i_nlink == 2 (for "." entry) */
1545 inc_nlink(inode);
1546 } else if (S_ISREG(mode)) {
1547 inode->i_size = 0;
1548 inode->i_fop = &cpuset_file_operations;
1549 }
1550
1551 d_instantiate(dentry, inode);
1552 dget(dentry); /* Extra count - pin the dentry in core */
1553 return 0;
1554}
1555
1556/*
1557 * cpuset_create_dir - create a directory for an object.
1558 * cs: the cpuset we create the directory for.
1559 * It must have a valid ->parent field,
1560 * and we are going to fill in its ->dentry field.
1561 * name: The name to give to the cpuset directory. Will be copied.
1562 * mode: mode to set on new directory.
1563 */
1564
1565static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
1566{
1567 struct dentry *dentry = NULL;
1568 struct dentry *parent;
1569 int error = 0;
1570
1571 parent = cs->parent->dentry;
1572 dentry = cpuset_get_dentry(parent, name);
1573 if (IS_ERR(dentry))
1574 return PTR_ERR(dentry);
1575 error = cpuset_create_file(dentry, S_IFDIR | mode);
1576 if (!error) {
1577 dentry->d_fsdata = cs;
1578 inc_nlink(parent->d_inode);
1579 cs->dentry = dentry;
1580 }
1581 dput(dentry);
1582
1583 return error;
1584}
1585
1586static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
1587{
1588 struct dentry *dentry;
1589 int error;
1590
1591 mutex_lock(&dir->d_inode->i_mutex);
1592 dentry = cpuset_get_dentry(dir, cft->name);
1593 if (!IS_ERR(dentry)) {
1594 error = cpuset_create_file(dentry, 0644 | S_IFREG);
1595 if (!error)
1596 dentry->d_fsdata = (void *)cft;
1597 dput(dentry);
1598 } else
1599 error = PTR_ERR(dentry);
1600 mutex_unlock(&dir->d_inode->i_mutex);
1601 return error;
1602}
1603
1604/*
1605 * Stuff for reading the 'tasks' file.
1606 *
1607 * Reading this file can return large amounts of data if a cpuset has
1608 * *lots* of attached tasks. So it may need several calls to read(),
1609 * but we cannot guarantee that the information we produce is correct
1610 * unless we produce it entirely atomically.
1611 *
1612 * Upon tasks file open(), a struct ctr_struct is allocated, that
1613 * will have a pointer to an array (also allocated here). The struct
1614 * ctr_struct * is stored in file->private_data. Its resources will
1615 * be freed by release() when the file is closed. The array is used
1616 * to sprintf the PIDs and then used by read().
1617 */
1618
1619/* cpusets_tasks_read array */
1620
1621struct ctr_struct {
1622 char *buf;
1623 int bufsz;
1624};
1625
1626/*
1627 * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
1628 * Return actual number of pids loaded. No need to task_lock(p)
1629 * when reading out p->cpuset, as we don't really care if it changes
1630 * on the next cycle, and we are not going to try to dereference it.
1631 */
1632static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
1633{
1634 int n = 0;
1635 struct task_struct *g, *p;
1636
1637 read_lock(&tasklist_lock);
1638
1639 do_each_thread(g, p) {
1640 if (p->cpuset == cs) {
1641 pidarray[n++] = p->pid;
1642 if (unlikely(n == npids))
1643 goto array_full;
1644 }
1645 } while_each_thread(g, p);
1646
1647array_full:
1648 read_unlock(&tasklist_lock);
1649 return n;
1650}
1651
1652static int cmppid(const void *a, const void *b)
1653{
1654 return *(pid_t *)a - *(pid_t *)b;
1655}
1656
1657/*
1658 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
1659 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
1660 * count 'cnt' of how many chars would be written if buf were large enough.
1661 */
1662static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1663{
1664 int cnt = 0;
1665 int i;
1666
1667 for (i = 0; i < npids; i++)
1668 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
1669 return cnt;
1670}
1671
1672/*
1673 * Handle an open on 'tasks' file. Prepare a buffer listing the
1674 * process ids of tasks currently attached to the cpuset being opened.
1675 *
1676 * Does not require any specific cpuset mutexes, and does not take any.
1677 */
1678static int cpuset_tasks_open(struct inode *unused, struct file *file)
1679{
1680 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1681 struct ctr_struct *ctr;
1682 pid_t *pidarray;
1683 int npids;
1684 char c;
1685
1686 if (!(file->f_mode & FMODE_READ))
1687 return 0;
1688
1689 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
1690 if (!ctr)
1691 goto err0;
1692
1693 /*
1694 * If cpuset gets more users after we read count, we won't have
1695 * enough space - tough. This race is indistinguishable to the
1696 * caller from the case that the additional cpuset users didn't
1697 * show up until sometime later on.
1698 */
1699 npids = atomic_read(&cs->count);
1700 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
1701 if (!pidarray)
1702 goto err1;
1703
1704 npids = pid_array_load(pidarray, npids, cs);
1705 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
1706
1707 /* Call pid_array_to_buf() twice, first just to get bufsz */
1708 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
1709 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
1710 if (!ctr->buf)
1711 goto err2;
1712 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
1713
1714 kfree(pidarray);
1715 file->private_data = ctr;
1716 return 0;
1717
1718err2:
1719 kfree(pidarray);
1720err1:
1721 kfree(ctr);
1722err0:
1723 return -ENOMEM;
1724}
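
The "call pid_array_to_buf() twice" step above is the usual measure-then-allocate
snprintf pattern: the first pass gets a one-byte buffer and only its return value
(the length that would have been written) is used to size the real buffer, plus one
byte for the trailing NUL. A small user-space sketch of the same idea, with
illustrative names:

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/types.h>

	/* Render npids pids as newline-separated decimals in one heap buffer. */
	static char *pids_to_string(const pid_t *pids, int npids)
	{
		char c;
		char *buf, *p;
		int i, len = 1;			/* room for the trailing NUL */

		for (i = 0; i < npids; i++)
			len += snprintf(&c, sizeof(c), "%d\n", (int)pids[i]);
		buf = malloc(len);
		if (!buf)
			return NULL;
		for (p = buf, i = 0; i < npids; i++)
			p += sprintf(p, "%d\n", (int)pids[i]);
		return buf;
	}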
1725
1726static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
1727 size_t nbytes, loff_t *ppos)
1728{
1729 struct ctr_struct *ctr = file->private_data;
1730
1731 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1732}
1733
1734static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
1735{
1736 struct ctr_struct *ctr;
1737
1738 if (file->f_mode & FMODE_READ) {
1739 ctr = file->private_data;
1740 kfree(ctr->buf);
1741 kfree(ctr);
1742 }
1743 return 0;
1744}
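
Because the pid listing is rendered into ctr->buf once, at open() time, a reader
that loops over read() sees a single consistent snapshot even if tasks attach or
exit in between. A minimal user-space consumer of the tasks file; the mount point
and cpuset name are illustrative:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096];
		ssize_t n;
		int fd = open("/dev/cpuset/mycpuset/tasks", O_RDONLY);

		if (fd < 0)
			return 1;
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);	/* one pid per line */
		close(fd);
		return 0;
	}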
1745 1107
1746/* 1108/*
1747 * for the common functions, 'private' gives the type of file 1109 * for the common functions, 'private' gives the type of file
1748 */ 1110 */
1749 1111
1750static struct cftype cft_tasks = {
1751 .name = "tasks",
1752 .open = cpuset_tasks_open,
1753 .read = cpuset_tasks_read,
1754 .release = cpuset_tasks_release,
1755 .private = FILE_TASKLIST,
1756};
1757
1758static struct cftype cft_cpus = { 1112static struct cftype cft_cpus = {
1759 .name = "cpus", 1113 .name = "cpus",
1114 .read = cpuset_common_file_read,
1115 .write = cpuset_common_file_write,
1760 .private = FILE_CPULIST, 1116 .private = FILE_CPULIST,
1761}; 1117};
1762 1118
1763static struct cftype cft_mems = { 1119static struct cftype cft_mems = {
1764 .name = "mems", 1120 .name = "mems",
1121 .read = cpuset_common_file_read,
1122 .write = cpuset_common_file_write,
1765 .private = FILE_MEMLIST, 1123 .private = FILE_MEMLIST,
1766}; 1124};
1767 1125
1768static struct cftype cft_cpu_exclusive = { 1126static struct cftype cft_cpu_exclusive = {
1769 .name = "cpu_exclusive", 1127 .name = "cpu_exclusive",
1128 .read = cpuset_common_file_read,
1129 .write = cpuset_common_file_write,
1770 .private = FILE_CPU_EXCLUSIVE, 1130 .private = FILE_CPU_EXCLUSIVE,
1771}; 1131};
1772 1132
1773static struct cftype cft_mem_exclusive = { 1133static struct cftype cft_mem_exclusive = {
1774 .name = "mem_exclusive", 1134 .name = "mem_exclusive",
1135 .read = cpuset_common_file_read,
1136 .write = cpuset_common_file_write,
1775 .private = FILE_MEM_EXCLUSIVE, 1137 .private = FILE_MEM_EXCLUSIVE,
1776}; 1138};
1777 1139
1778static struct cftype cft_notify_on_release = {
1779 .name = "notify_on_release",
1780 .private = FILE_NOTIFY_ON_RELEASE,
1781};
1782
1783static struct cftype cft_memory_migrate = { 1140static struct cftype cft_memory_migrate = {
1784 .name = "memory_migrate", 1141 .name = "memory_migrate",
1142 .read = cpuset_common_file_read,
1143 .write = cpuset_common_file_write,
1785 .private = FILE_MEMORY_MIGRATE, 1144 .private = FILE_MEMORY_MIGRATE,
1786}; 1145};
1787 1146
1788static struct cftype cft_memory_pressure_enabled = { 1147static struct cftype cft_memory_pressure_enabled = {
1789 .name = "memory_pressure_enabled", 1148 .name = "memory_pressure_enabled",
1149 .read = cpuset_common_file_read,
1150 .write = cpuset_common_file_write,
1790 .private = FILE_MEMORY_PRESSURE_ENABLED, 1151 .private = FILE_MEMORY_PRESSURE_ENABLED,
1791}; 1152};
1792 1153
1793static struct cftype cft_memory_pressure = { 1154static struct cftype cft_memory_pressure = {
1794 .name = "memory_pressure", 1155 .name = "memory_pressure",
1156 .read = cpuset_common_file_read,
1157 .write = cpuset_common_file_write,
1795 .private = FILE_MEMORY_PRESSURE, 1158 .private = FILE_MEMORY_PRESSURE,
1796}; 1159};
1797 1160
1798static struct cftype cft_spread_page = { 1161static struct cftype cft_spread_page = {
1799 .name = "memory_spread_page", 1162 .name = "memory_spread_page",
1163 .read = cpuset_common_file_read,
1164 .write = cpuset_common_file_write,
1800 .private = FILE_SPREAD_PAGE, 1165 .private = FILE_SPREAD_PAGE,
1801}; 1166};
1802 1167
1803static struct cftype cft_spread_slab = { 1168static struct cftype cft_spread_slab = {
1804 .name = "memory_spread_slab", 1169 .name = "memory_spread_slab",
1170 .read = cpuset_common_file_read,
1171 .write = cpuset_common_file_write,
1805 .private = FILE_SPREAD_SLAB, 1172 .private = FILE_SPREAD_SLAB,
1806}; 1173};
1807 1174
1808static int cpuset_populate_dir(struct dentry *cs_dentry) 1175static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1809{ 1176{
1810 int err; 1177 int err;
1811 1178
1812 if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0) 1179 if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0)
1813 return err; 1180 return err;
1814 if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0) 1181 if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
1815 return err; 1182 return err;
1816 if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0) 1183 if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
1817 return err; 1184 return err;
1818 if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0) 1185 if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
1819 return err; 1186 return err;
1820 if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) 1187 if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
1821 return err; 1188 return err;
1822 if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) 1189 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1823 return err; 1190 return err;
1824 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) 1191 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
1825 return err; 1192 return err;
1826 if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0) 1193 if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
1827 return err;
1828 if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
1829 return err;
1830 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
1831 return err; 1194 return err;
1195 /* memory_pressure_enabled is in root cpuset only */
1196 if (err == 0 && !cont->parent)
1197 err = cgroup_add_file(cont, ss,
1198 &cft_memory_pressure_enabled);
1832 return 0; 1199 return 0;
1833} 1200}
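
The cftype handlers wired up above use the widened cgroup signatures: the cgroup
and the cftype are passed in explicitly instead of being dug out of the dentry as
the old cpuset_common_file_read() did. A sketch of what a trivial read-only handler
of that shape could look like (the value and the names are illustrative, not part
of this patch):

	static ssize_t example_file_read(struct cgroup *cont, struct cftype *cft,
					 struct file *file, char __user *buf,
					 size_t nbytes, loff_t *ppos)
	{
		char tmp[32];
		int len = scnprintf(tmp, sizeof(tmp), "%d\n", 42);

		return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
	}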
1834 1201
1835/* 1202/*
1203 * post_clone() is called at the end of cgroup_clone().
1204 * 'cgroup' was just created automatically as a result of
1205 * a cgroup_clone(), and the current task is about to
1206 * be moved into 'cgroup'.
1207 *
1208 * Currently we refuse to set up the cgroup - thereby
 1209 * refusing to let the task enter it, and as a result refusing
1210 * the sys_unshare() or clone() which initiated it - if any
1211 * sibling cpusets have exclusive cpus or mem.
1212 *
1213 * If this becomes a problem for some users who wish to
1214 * allow that scenario, then cpuset_post_clone() could be
1215 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1216 * (and likewise for mems) to the new cgroup.
1217 */
1218static void cpuset_post_clone(struct cgroup_subsys *ss,
1219 struct cgroup *cgroup)
1220{
1221 struct cgroup *parent, *child;
1222 struct cpuset *cs, *parent_cs;
1223
1224 parent = cgroup->parent;
1225 list_for_each_entry(child, &parent->children, sibling) {
1226 cs = cgroup_cs(child);
1227 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1228 return;
1229 }
1230 cs = cgroup_cs(cgroup);
1231 parent_cs = cgroup_cs(parent);
1232
1233 cs->mems_allowed = parent_cs->mems_allowed;
1234 cs->cpus_allowed = parent_cs->cpus_allowed;
1235 return;
1236}
1237
1238/*
1836 * cpuset_create - create a cpuset 1239 * cpuset_create - create a cpuset
1837 * parent: cpuset that will be parent of the new cpuset. 1240 * parent: cpuset that will be parent of the new cpuset.
1838 * name: name of the new cpuset. Will be strcpy'ed. 1241 * name: name of the new cpuset. Will be strcpy'ed.
@@ -1841,106 +1244,60 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1841 * Must be called with the mutex on the parent inode held 1244 * Must be called with the mutex on the parent inode held
1842 */ 1245 */
1843 1246
1844static long cpuset_create(struct cpuset *parent, const char *name, int mode) 1247static struct cgroup_subsys_state *cpuset_create(
1248 struct cgroup_subsys *ss,
1249 struct cgroup *cont)
1845{ 1250{
1846 struct cpuset *cs; 1251 struct cpuset *cs;
1847 int err; 1252 struct cpuset *parent;
1848 1253
1254 if (!cont->parent) {
1255 /* This is early initialization for the top cgroup */
1256 top_cpuset.mems_generation = cpuset_mems_generation++;
1257 return &top_cpuset.css;
1258 }
1259 parent = cgroup_cs(cont->parent);
1849 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1260 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1850 if (!cs) 1261 if (!cs)
1851 return -ENOMEM; 1262 return ERR_PTR(-ENOMEM);
1852 1263
1853 mutex_lock(&manage_mutex);
1854 cpuset_update_task_memory_state(); 1264 cpuset_update_task_memory_state();
1855 cs->flags = 0; 1265 cs->flags = 0;
1856 if (notify_on_release(parent))
1857 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
1858 if (is_spread_page(parent)) 1266 if (is_spread_page(parent))
1859 set_bit(CS_SPREAD_PAGE, &cs->flags); 1267 set_bit(CS_SPREAD_PAGE, &cs->flags);
1860 if (is_spread_slab(parent)) 1268 if (is_spread_slab(parent))
1861 set_bit(CS_SPREAD_SLAB, &cs->flags); 1269 set_bit(CS_SPREAD_SLAB, &cs->flags);
1862 cs->cpus_allowed = CPU_MASK_NONE; 1270 cs->cpus_allowed = CPU_MASK_NONE;
1863 cs->mems_allowed = NODE_MASK_NONE; 1271 cs->mems_allowed = NODE_MASK_NONE;
1864 atomic_set(&cs->count, 0);
1865 INIT_LIST_HEAD(&cs->sibling);
1866 INIT_LIST_HEAD(&cs->children);
1867 cs->mems_generation = cpuset_mems_generation++; 1272 cs->mems_generation = cpuset_mems_generation++;
1868 fmeter_init(&cs->fmeter); 1273 fmeter_init(&cs->fmeter);
1869 1274
1870 cs->parent = parent; 1275 cs->parent = parent;
1871
1872 mutex_lock(&callback_mutex);
1873 list_add(&cs->sibling, &cs->parent->children);
1874 number_of_cpusets++; 1276 number_of_cpusets++;
1875 mutex_unlock(&callback_mutex); 1277 return &cs->css;
1876
1877 err = cpuset_create_dir(cs, name, mode);
1878 if (err < 0)
1879 goto err;
1880
1881 /*
1882 * Release manage_mutex before cpuset_populate_dir() because it
1883 * will down() this new directory's i_mutex and if we race with
1884 * another mkdir, we might deadlock.
1885 */
1886 mutex_unlock(&manage_mutex);
1887
1888 err = cpuset_populate_dir(cs->dentry);
1889 /* If err < 0, we have a half-filled directory - oh well ;) */
1890 return 0;
1891err:
1892 list_del(&cs->sibling);
1893 mutex_unlock(&manage_mutex);
1894 kfree(cs);
1895 return err;
1896}
1897
1898static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1899{
1900 struct cpuset *c_parent = dentry->d_parent->d_fsdata;
1901
1902 /* the vfs holds inode->i_mutex already */
1903 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
1904} 1278}
1905 1279
1906static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) 1280static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1907{ 1281{
1908 struct cpuset *cs = dentry->d_fsdata; 1282 struct cpuset *cs = cgroup_cs(cont);
1909 struct dentry *d;
1910 struct cpuset *parent;
1911 char *pathbuf = NULL;
1912
1913 /* the vfs holds both inode->i_mutex already */
1914 1283
1915 mutex_lock(&manage_mutex);
1916 cpuset_update_task_memory_state(); 1284 cpuset_update_task_memory_state();
1917 if (atomic_read(&cs->count) > 0) {
1918 mutex_unlock(&manage_mutex);
1919 return -EBUSY;
1920 }
1921 if (!list_empty(&cs->children)) {
1922 mutex_unlock(&manage_mutex);
1923 return -EBUSY;
1924 }
1925 parent = cs->parent;
1926 mutex_lock(&callback_mutex);
1927 set_bit(CS_REMOVED, &cs->flags);
1928 list_del(&cs->sibling); /* delete my sibling from parent->children */
1929 spin_lock(&cs->dentry->d_lock);
1930 d = dget(cs->dentry);
1931 cs->dentry = NULL;
1932 spin_unlock(&d->d_lock);
1933 cpuset_d_remove_dir(d);
1934 dput(d);
1935 number_of_cpusets--; 1285 number_of_cpusets--;
1936 mutex_unlock(&callback_mutex); 1286 kfree(cs);
1937 if (list_empty(&parent->children))
1938 check_for_release(parent, &pathbuf);
1939 mutex_unlock(&manage_mutex);
1940 cpuset_release_agent(pathbuf);
1941 return 0;
1942} 1287}
1943 1288
1289struct cgroup_subsys cpuset_subsys = {
1290 .name = "cpuset",
1291 .create = cpuset_create,
1292 .destroy = cpuset_destroy,
1293 .can_attach = cpuset_can_attach,
1294 .attach = cpuset_attach,
1295 .populate = cpuset_populate,
1296 .post_clone = cpuset_post_clone,
1297 .subsys_id = cpuset_subsys_id,
1298 .early_init = 1,
1299};
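
For comparison, this is now all the glue a subsystem needs: per-cgroup state hangs
off a struct cgroup_subsys_state embedded in the subsystem's own structure, and the
create/destroy/populate callbacks replace the hand-rolled VFS code deleted above.
A rough sketch of a minimal additional client, assuming a SUBSYS(example) entry in
include/linux/cgroup_subsys.h providing example_subsys_id, and the
cgroup_subsys_state() accessor from include/linux/cgroup.h:

	#include <linux/cgroup.h>
	#include <linux/err.h>
	#include <linux/slab.h>

	struct example_state {
		struct cgroup_subsys_state css;
		u64 events;			/* whatever per-cgroup state is needed */
	};

	static inline struct example_state *example_from(struct cgroup *cont)
	{
		return container_of(cgroup_subsys_state(cont, example_subsys_id),
				    struct example_state, css);
	}

	static struct cgroup_subsys_state *example_create(struct cgroup_subsys *ss,
							  struct cgroup *cont)
	{
		struct example_state *es = kzalloc(sizeof(*es), GFP_KERNEL);

		if (!es)
			return ERR_PTR(-ENOMEM);
		return &es->css;
	}

	static void example_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
	{
		kfree(example_from(cont));
	}

	struct cgroup_subsys example_subsys = {
		.name = "example",
		.create = example_create,
		.destroy = example_destroy,
		.subsys_id = example_subsys_id,
	};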
1300
1944/* 1301/*
1945 * cpuset_init_early - just enough so that the calls to 1302 * cpuset_init_early - just enough so that the calls to
1946 * cpuset_update_task_memory_state() in early init code 1303 * cpuset_update_task_memory_state() in early init code
@@ -1949,13 +1306,11 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1949 1306
1950int __init cpuset_init_early(void) 1307int __init cpuset_init_early(void)
1951{ 1308{
1952 struct task_struct *tsk = current; 1309 top_cpuset.mems_generation = cpuset_mems_generation++;
1953
1954 tsk->cpuset = &top_cpuset;
1955 tsk->cpuset->mems_generation = cpuset_mems_generation++;
1956 return 0; 1310 return 0;
1957} 1311}
1958 1312
1313
1959/** 1314/**
1960 * cpuset_init - initialize cpusets at system boot 1315 * cpuset_init - initialize cpusets at system boot
1961 * 1316 *
@@ -1964,8 +1319,7 @@ int __init cpuset_init_early(void)
1964 1319
1965int __init cpuset_init(void) 1320int __init cpuset_init(void)
1966{ 1321{
1967 struct dentry *root; 1322 int err = 0;
1968 int err;
1969 1323
1970 top_cpuset.cpus_allowed = CPU_MASK_ALL; 1324 top_cpuset.cpus_allowed = CPU_MASK_ALL;
1971 top_cpuset.mems_allowed = NODE_MASK_ALL; 1325 top_cpuset.mems_allowed = NODE_MASK_ALL;
@@ -1973,30 +1327,12 @@ int __init cpuset_init(void)
1973 fmeter_init(&top_cpuset.fmeter); 1327 fmeter_init(&top_cpuset.fmeter);
1974 top_cpuset.mems_generation = cpuset_mems_generation++; 1328 top_cpuset.mems_generation = cpuset_mems_generation++;
1975 1329
1976 init_task.cpuset = &top_cpuset;
1977
1978 err = register_filesystem(&cpuset_fs_type); 1330 err = register_filesystem(&cpuset_fs_type);
1979 if (err < 0) 1331 if (err < 0)
1980 goto out; 1332 return err;
1981 cpuset_mount = kern_mount(&cpuset_fs_type); 1333
1982 if (IS_ERR(cpuset_mount)) {
1983 printk(KERN_ERR "cpuset: could not mount!\n");
1984 err = PTR_ERR(cpuset_mount);
1985 cpuset_mount = NULL;
1986 goto out;
1987 }
1988 root = cpuset_mount->mnt_sb->s_root;
1989 root->d_fsdata = &top_cpuset;
1990 inc_nlink(root->d_inode);
1991 top_cpuset.dentry = root;
1992 root->d_inode->i_op = &cpuset_dir_inode_operations;
1993 number_of_cpusets = 1; 1334 number_of_cpusets = 1;
1994 err = cpuset_populate_dir(root); 1335 return 0;
1995 /* memory_pressure_enabled is in root cpuset only */
1996 if (err == 0)
1997 err = cpuset_add_file(root, &cft_memory_pressure_enabled);
1998out:
1999 return err;
2000} 1336}
2001 1337
2002/* 1338/*
@@ -2022,10 +1358,12 @@ out:
2022 1358
2023static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) 1359static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
2024{ 1360{
1361 struct cgroup *cont;
2025 struct cpuset *c; 1362 struct cpuset *c;
2026 1363
2027 /* Each of our child cpusets mems must be online */ 1364 /* Each of our child cpusets mems must be online */
2028 list_for_each_entry(c, &cur->children, sibling) { 1365 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
1366 c = cgroup_cs(cont);
2029 guarantee_online_cpus_mems_in_subtree(c); 1367 guarantee_online_cpus_mems_in_subtree(c);
2030 if (!cpus_empty(c->cpus_allowed)) 1368 if (!cpus_empty(c->cpus_allowed))
2031 guarantee_online_cpus(c, &c->cpus_allowed); 1369 guarantee_online_cpus(c, &c->cpus_allowed);
@@ -2053,7 +1391,7 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
2053 1391
2054static void common_cpu_mem_hotplug_unplug(void) 1392static void common_cpu_mem_hotplug_unplug(void)
2055{ 1393{
2056 mutex_lock(&manage_mutex); 1394 cgroup_lock();
2057 mutex_lock(&callback_mutex); 1395 mutex_lock(&callback_mutex);
2058 1396
2059 guarantee_online_cpus_mems_in_subtree(&top_cpuset); 1397 guarantee_online_cpus_mems_in_subtree(&top_cpuset);
@@ -2061,7 +1399,7 @@ static void common_cpu_mem_hotplug_unplug(void)
2061 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 1399 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2062 1400
2063 mutex_unlock(&callback_mutex); 1401 mutex_unlock(&callback_mutex);
2064 mutex_unlock(&manage_mutex); 1402 cgroup_unlock();
2065} 1403}
2066 1404
2067/* 1405/*
@@ -2113,109 +1451,7 @@ void __init cpuset_init_smp(void)
2113} 1451}
2114 1452
2115/** 1453/**
2116 * cpuset_fork - attach newly forked task to its parent's cpuset.
2117 * @child: pointer to task_struct of the newly forked child process.
2118 *
2119 * Description: A task inherits its parent's cpuset at fork().
2120 *
2121 * A pointer to the shared cpuset was automatically copied in fork.c
2122 * by dup_task_struct(). However, we ignore that copy, since it was
2123 * not made under the protection of task_lock(), so might no longer be
2124 * a valid cpuset pointer. attach_task() might have already changed
2125 * current->cpuset, allowing the previously referenced cpuset to
2126 * be removed and freed. Instead, we task_lock(current) and copy
2127 * its present value of current->cpuset for our freshly forked child.
2128 *
2129 * At the point that cpuset_fork() is called, 'current' is the parent
2130 * task, and the passed argument 'child' points to the child task.
2131 **/
2132
2133void cpuset_fork(struct task_struct *child)
2134{
2135 task_lock(current);
2136 child->cpuset = current->cpuset;
2137 atomic_inc(&child->cpuset->count);
2138 task_unlock(current);
2139}
2140
2141/**
2142 * cpuset_exit - detach cpuset from exiting task
2143 * @tsk: pointer to task_struct of exiting process
2144 *
2145 * Description: Detach cpuset from @tsk and release it.
2146 *
2147 * Note that cpusets marked notify_on_release force every task in
2148 * them to take the global manage_mutex when exiting.
2149 * This could impact scaling on very large systems. Be reluctant to
2150 * use notify_on_release cpusets where very high task exit scaling
2151 * is required on large systems.
2152 *
2153 * Don't even think about dereferencing 'cs' after the cpuset use count
2154 * goes to zero, except inside a critical section guarded by manage_mutex
2155 * or callback_mutex. Otherwise a zero cpuset use count is a license to
2156 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
2157 *
2158 * This routine has to take manage_mutex, not callback_mutex, because
2159 * it is holding that mutex while calling check_for_release(),
2160 * which calls kmalloc(), so can't be called holding callback_mutex.
2161 *
2162 * the_top_cpuset_hack:
2163 *
2164 * Set the exiting task's cpuset to the root cpuset (top_cpuset).
2165 *
2166 * Don't leave a task unable to allocate memory, as that is an
2167 * accident waiting to happen should someone add a callout in
2168 * do_exit() after the cpuset_exit() call that might allocate.
2169 * If a task tries to allocate memory with an invalid cpuset,
2170 * it will oops in cpuset_update_task_memory_state().
2171 *
2172 * We call cpuset_exit() while the task is still competent to
2173 * handle notify_on_release(), then leave the task attached to
2174 * the root cpuset (top_cpuset) for the remainder of its exit.
2175 *
2176 * To do this properly, we would increment the reference count on
2177 * top_cpuset, and near the very end of the kernel/exit.c do_exit()
2178 * code we would add a second cpuset function call, to drop that
2179 * reference. This would just create an unnecessary hot spot on
2180 * the top_cpuset reference count, to no avail.
2181 *
2182 * Normally, holding a reference to a cpuset without bumping its
2183 * count is unsafe. The cpuset could go away, or someone could
2184 * attach us to a different cpuset, decrementing the count on
2185 * the first cpuset that we never incremented. But in this case,
2186 * top_cpuset isn't going away, and either task has PF_EXITING set,
2187 * which wards off any attach_task() attempts, or task is a failed
2188 * fork, never visible to attach_task.
2189 *
2190 * Another way to do this would be to set the cpuset pointer
2191 * to NULL here, and check in cpuset_update_task_memory_state()
2192 * for a NULL pointer. This hack avoids that NULL check, for no
2193 * cost (other than this way too long comment ;).
2194 **/
2195
2196void cpuset_exit(struct task_struct *tsk)
2197{
2198 struct cpuset *cs;
2199
2200 task_lock(current);
2201 cs = tsk->cpuset;
2202 tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
2203 task_unlock(current);
2204
2205 if (notify_on_release(cs)) {
2206 char *pathbuf = NULL;
2207 1454
2208 mutex_lock(&manage_mutex);
2209 if (atomic_dec_and_test(&cs->count))
2210 check_for_release(cs, &pathbuf);
2211 mutex_unlock(&manage_mutex);
2212 cpuset_release_agent(pathbuf);
2213 } else {
2214 atomic_dec(&cs->count);
2215 }
2216}
2217
2218/**
2219 * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset. 1455
2220 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 1456 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
2221 * 1457 *
@@ -2231,7 +1467,7 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
2231 1467
2232 mutex_lock(&callback_mutex); 1468 mutex_lock(&callback_mutex);
2233 task_lock(tsk); 1469 task_lock(tsk);
2234 guarantee_online_cpus(tsk->cpuset, &mask); 1470 guarantee_online_cpus(task_cs(tsk), &mask);
2235 task_unlock(tsk); 1471 task_unlock(tsk);
2236 mutex_unlock(&callback_mutex); 1472 mutex_unlock(&callback_mutex);
2237 1473
@@ -2259,7 +1495,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2259 1495
2260 mutex_lock(&callback_mutex); 1496 mutex_lock(&callback_mutex);
2261 task_lock(tsk); 1497 task_lock(tsk);
2262 guarantee_online_mems(tsk->cpuset, &mask); 1498 guarantee_online_mems(task_cs(tsk), &mask);
2263 task_unlock(tsk); 1499 task_unlock(tsk);
2264 mutex_unlock(&callback_mutex); 1500 mutex_unlock(&callback_mutex);
2265 1501
@@ -2390,7 +1626,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2390 mutex_lock(&callback_mutex); 1626 mutex_lock(&callback_mutex);
2391 1627
2392 task_lock(current); 1628 task_lock(current);
2393 cs = nearest_exclusive_ancestor(current->cpuset); 1629 cs = nearest_exclusive_ancestor(task_cs(current));
2394 task_unlock(current); 1630 task_unlock(current);
2395 1631
2396 allowed = node_isset(node, cs->mems_allowed); 1632 allowed = node_isset(node, cs->mems_allowed);
@@ -2550,14 +1786,12 @@ int cpuset_memory_pressure_enabled __read_mostly;
2550 1786
2551void __cpuset_memory_pressure_bump(void) 1787void __cpuset_memory_pressure_bump(void)
2552{ 1788{
2553 struct cpuset *cs;
2554
2555 task_lock(current); 1789 task_lock(current);
2556 cs = current->cpuset; 1790 fmeter_markevent(&task_cs(current)->fmeter);
2557 fmeter_markevent(&cs->fmeter);
2558 task_unlock(current); 1791 task_unlock(current);
2559} 1792}
2560 1793
1794#ifdef CONFIG_PROC_PID_CPUSET
2561/* 1795/*
2562 * proc_cpuset_show() 1796 * proc_cpuset_show()
2563 * - Print tasks cpuset path into seq_file. 1797 * - Print tasks cpuset path into seq_file.
@@ -2574,6 +1808,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2574 struct pid *pid; 1808 struct pid *pid;
2575 struct task_struct *tsk; 1809 struct task_struct *tsk;
2576 char *buf; 1810 char *buf;
1811 struct cgroup_subsys_state *css;
2577 int retval; 1812 int retval;
2578 1813
2579 retval = -ENOMEM; 1814 retval = -ENOMEM;
@@ -2588,15 +1823,15 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2588 goto out_free; 1823 goto out_free;
2589 1824
2590 retval = -EINVAL; 1825 retval = -EINVAL;
2591 mutex_lock(&manage_mutex); 1826 cgroup_lock();
2592 1827 css = task_subsys_state(tsk, cpuset_subsys_id);
2593 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); 1828 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2594 if (retval < 0) 1829 if (retval < 0)
2595 goto out_unlock; 1830 goto out_unlock;
2596 seq_puts(m, buf); 1831 seq_puts(m, buf);
2597 seq_putc(m, '\n'); 1832 seq_putc(m, '\n');
2598out_unlock: 1833out_unlock:
2599 mutex_unlock(&manage_mutex); 1834 cgroup_unlock();
2600 put_task_struct(tsk); 1835 put_task_struct(tsk);
2601out_free: 1836out_free:
2602 kfree(buf); 1837 kfree(buf);
@@ -2616,6 +1851,7 @@ const struct file_operations proc_cpuset_operations = {
2616 .llseek = seq_lseek, 1851 .llseek = seq_lseek,
2617 .release = single_release, 1852 .release = single_release,
2618}; 1853};
1854#endif /* CONFIG_PROC_PID_CPUSET */
2619 1855
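
proc_cpuset_show() keeps /proc/<pid>/cpuset working (now guarded by
CONFIG_PROC_PID_CPUSET), printing the cgroup path of the task's cpuset. A minimal
user-space check:

	#include <stdio.h>

	int main(void)
	{
		char path[4096];
		FILE *f = fopen("/proc/self/cpuset", "r");

		if (!f)
			return 1;
		if (fgets(path, sizeof(path), f))
			fputs(path, stdout);	/* e.g. "/" or "/mycpuset" */
		fclose(f);
		return 0;
	}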
2620/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 1856/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
2621char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) 1857char *cpuset_task_status_allowed(struct task_struct *task, char *buffer)
diff --git a/kernel/exit.c b/kernel/exit.c
index 44ff6147556a..cf03a52c3a9a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,7 +31,6 @@
31#include <linux/taskstats_kern.h> 31#include <linux/taskstats_kern.h>
32#include <linux/delayacct.h> 32#include <linux/delayacct.h>
33#include <linux/freezer.h> 33#include <linux/freezer.h>
34#include <linux/cpuset.h>
35#include <linux/cgroup.h> 34#include <linux/cgroup.h>
36#include <linux/syscalls.h> 35#include <linux/syscalls.h>
37#include <linux/signal.h> 36#include <linux/signal.h>
@@ -973,7 +972,6 @@ fastcall NORET_TYPE void do_exit(long code)
973 __exit_fs(tsk); 972 __exit_fs(tsk);
974 check_stack_usage(); 973 check_stack_usage();
975 exit_thread(); 974 exit_thread();
976 cpuset_exit(tsk);
977 cgroup_exit(tsk, 1); 975 cgroup_exit(tsk, 1);
978 exit_keys(tsk); 976 exit_keys(tsk);
979 977
diff --git a/kernel/fork.c b/kernel/fork.c
index fcac38929245..61516b89cb6c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -29,7 +29,6 @@
29#include <linux/nsproxy.h> 29#include <linux/nsproxy.h>
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/cpuset.h>
33#include <linux/cgroup.h> 32#include <linux/cgroup.h>
34#include <linux/security.h> 33#include <linux/security.h>
35#include <linux/swap.h> 34#include <linux/swap.h>
@@ -1089,7 +1088,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1089#endif 1088#endif
1090 p->io_context = NULL; 1089 p->io_context = NULL;
1091 p->audit_context = NULL; 1090 p->audit_context = NULL;
1092 cpuset_fork(p);
1093 cgroup_fork(p); 1091 cgroup_fork(p);
1094#ifdef CONFIG_NUMA 1092#ifdef CONFIG_NUMA
1095 p->mempolicy = mpol_copy(p->mempolicy); 1093 p->mempolicy = mpol_copy(p->mempolicy);
@@ -1330,7 +1328,6 @@ bad_fork_cleanup_policy:
1330 mpol_free(p->mempolicy); 1328 mpol_free(p->mempolicy);
1331bad_fork_cleanup_cgroup: 1329bad_fork_cleanup_cgroup:
1332#endif 1330#endif
1333 cpuset_exit(p);
1334 cgroup_exit(p, cgroup_callbacks_done); 1331 cgroup_exit(p, cgroup_callbacks_done);
1335bad_fork_cleanup_delays_binfmt: 1332bad_fork_cleanup_delays_binfmt:
1336 delayacct_tsk_free(p); 1333 delayacct_tsk_free(p);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 568152ae6caf..7fef5ebfaf13 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1388,7 +1388,6 @@ EXPORT_SYMBOL(alloc_pages_current);
1388 * keeps mempolicies cpuset relative after its cpuset moves. See 1388 * keeps mempolicies cpuset relative after its cpuset moves. See
1389 * further kernel/cpuset.c update_nodemask(). 1389 * further kernel/cpuset.c update_nodemask().
1390 */ 1390 */
1391void *cpuset_being_rebound;
1392 1391
1393/* Slow path of a mempolicy copy */ 1392/* Slow path of a mempolicy copy */
1394struct mempolicy *__mpol_copy(struct mempolicy *old) 1393struct mempolicy *__mpol_copy(struct mempolicy *old)
@@ -2019,4 +2018,3 @@ out:
2019 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; 2018 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2020 return 0; 2019 return 0;
2021} 2020}
2022