Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.instrumentation  |   49
-rw-r--r--  kernel/Makefile                 |    9
-rw-r--r--  kernel/capability.c             |   15
-rw-r--r--  kernel/cgroup.c                 | 2805
-rw-r--r--  kernel/cgroup_debug.c           |   97
-rw-r--r--  kernel/cpu.c                    |   12
-rw-r--r--  kernel/cpu_acct.c               |  186
-rw-r--r--  kernel/cpuset.c                 | 1601
-rw-r--r--  kernel/die_notifier.c           |   38
-rw-r--r--  kernel/exit.c                   |  157
-rw-r--r--  kernel/fork.c                   |  133
-rw-r--r--  kernel/futex.c                  |   26
-rw-r--r--  kernel/futex_compat.c           |    3
-rw-r--r--  kernel/kexec.c                  |  168
-rw-r--r--  kernel/lockdep.c                |   24
-rw-r--r--  kernel/marker.c                 |  525
-rw-r--r--  kernel/module.c                 |   30
-rw-r--r--  kernel/notifier.c               |  539
-rw-r--r--  kernel/ns_cgroup.c              |  100
-rw-r--r--  kernel/nsproxy.c                |   62
-rw-r--r--  kernel/pid.c                    |  353
-rw-r--r--  kernel/posix-cpu-timers.c       |   12
-rw-r--r--  kernel/posix-timers.c           |    4
-rw-r--r--  kernel/ptrace.c                 |    5
-rw-r--r--  kernel/rtmutex-debug.c          |   15
-rw-r--r--  kernel/rtmutex.c                |    2
-rw-r--r--  kernel/sched.c                  |  269
-rw-r--r--  kernel/signal.c                 |   66
-rw-r--r--  kernel/softlockup.c             |    2
-rw-r--r--  kernel/sys.c                    |  586
-rw-r--r--  kernel/sysctl.c                 |    4
-rw-r--r--  kernel/taskstats.c              |   67
-rw-r--r--  kernel/time/clocksource.c       |   22
-rw-r--r--  kernel/time/tick-sched.c        |   16
-rw-r--r--  kernel/timer.c                  |    7
-rw-r--r--  kernel/workqueue.c              |   38
36 files changed, 6170 insertions, 1877 deletions
diff --git a/kernel/Kconfig.instrumentation b/kernel/Kconfig.instrumentation
new file mode 100644
index 000000000000..f5f2c769d95e
--- /dev/null
+++ b/kernel/Kconfig.instrumentation
@@ -0,0 +1,49 @@
1menuconfig INSTRUMENTATION
2 bool "Instrumentation Support"
3 default y
4 ---help---
5 Say Y here to get to see options related to performance measurement,
6 system-wide debugging, and testing. This option alone does not add any
7 kernel code.
8
9 If you say N, all options in this submenu will be skipped and
10 disabled. If you're trying to debug the kernel itself, go see the
11 Kernel Hacking menu.
12
13if INSTRUMENTATION
14
15config PROFILING
16 bool "Profiling support (EXPERIMENTAL)"
17 help
18 Say Y here to enable the extended profiling support mechanisms used
19 by profilers such as OProfile.
20
21config OPROFILE
22 tristate "OProfile system profiling (EXPERIMENTAL)"
23 depends on PROFILING
24 depends on ALPHA || ARM || BLACKFIN || X86_32 || IA64 || M32R || MIPS || PARISC || PPC || S390 || SUPERH || SPARC || X86_64
25 help
26 OProfile is a profiling system capable of profiling the
27	  whole system, including the kernel, kernel modules, libraries,
28 and applications.
29
30 If unsure, say N.
31
32config KPROBES
33 bool "Kprobes"
34 depends on KALLSYMS && MODULES
35 depends on X86_32 || IA64 || PPC || S390 || SPARC64 || X86_64 || AVR32
36 help
37 Kprobes allows you to trap at almost any kernel address and
38 execute a callback function. register_kprobe() establishes
39 a probepoint and specifies the callback. Kprobes is useful
40 for kernel debugging, non-intrusive instrumentation and testing.
41 If in doubt, say "N".
42
43config MARKERS
44 bool "Activate markers"
45 help
46 Place an empty function call at each marker site. Can be
47 dynamically changed for a probe function.
48
49endif # INSTRUMENTATION
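
The MARKERS help text above describes placing an empty function call at each marker site that can later be redirected to a probe. As a rough illustration (not part of this patch) of what such a site looks like with the marker API added by this series (see kernel/marker.c in the diffstat), assuming the trace_mark()/marker_probe_register() interface; the subsystem name, format string and function below are made up:

#include <linux/marker.h>

/*
 * Illustrative only: a marker site compiled in under CONFIG_MARKERS.
 * trace_mark() stays a near-empty call until a probe is attached at
 * runtime, e.g. via marker_probe_register("subsystem_event", ...).
 */
static void submit_request(int cpu, unsigned long len)
{
	trace_mark(subsystem_event, "cpu %d len %lu", cpu, len);
}
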
diff --git a/kernel/Makefile b/kernel/Makefile
index d63fbb18798a..05c3e6df8597 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,8 +8,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
-	    utsname.o sysctl_check.o
+	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
+	    utsname.o sysctl_check.o notifier.o
 
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
@@ -36,7 +36,11 @@ obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
+obj-$(CONFIG_CGROUPS) += cgroup.o
+obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_CPUACCT) += cpu_acct.o
+obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -51,6 +55,7 @@ obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_MARKERS) += marker.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/capability.c b/kernel/capability.c
index cbc5fd60c0f3..efbd9cdce132 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
 
 /*
@@ -61,8 +62,8 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 	spin_lock(&task_capability_lock);
 	read_lock(&tasklist_lock);
 
-	if (pid && pid != current->pid) {
-		target = find_task_by_pid(pid);
+	if (pid && pid != task_pid_vnr(current)) {
+		target = find_task_by_vpid(pid);
 		if (!target) {
 			ret = -ESRCH;
 			goto out;
@@ -95,7 +96,7 @@ static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
 	int found = 0;
 	struct pid *pgrp;
 
-	pgrp = find_pid(pgrp_nr);
+	pgrp = find_vpid(pgrp_nr);
 	do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
 		target = g;
 		while_each_thread(g, target) {
@@ -129,7 +130,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
 	int found = 0;
 
 	do_each_thread(g, target) {
-		if (target == current || is_init(target))
+		if (target == current || is_container_init(target->group_leader))
 			continue;
 		found = 1;
 		if (security_capset_check(target, effective, inheritable,
@@ -184,7 +185,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (get_user(pid, &header->pid))
 		return -EFAULT;
 
-	if (pid && pid != current->pid && !capable(CAP_SETPCAP))
+	if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
 		return -EPERM;
 
 	if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
@@ -195,8 +196,8 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	spin_lock(&task_capability_lock);
 	read_lock(&tasklist_lock);
 
-	if (pid > 0 && pid != current->pid) {
-		target = find_task_by_pid(pid);
+	if (pid > 0 && pid != task_pid_vnr(current)) {
+		target = find_task_by_vpid(pid);
 		if (!target) {
 			ret = -ESRCH;
 			goto out;
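
The capability.c hunks above replace comparisons against current->pid and lookups via find_task_by_pid() with their pid-namespace-aware counterparts. A minimal sketch (not taken from this patch) of that lookup pattern, using the task_pid_vnr()/find_task_by_vpid() helpers the hunks rely on; the wrapper name is hypothetical:

#include <linux/sched.h>
#include <linux/rcupdate.h>

/*
 * Sketch only: resolve a pid supplied by userspace relative to the
 * caller's pid namespace and take a reference on the result.
 */
static struct task_struct *get_task_in_callers_ns(pid_t pid)
{
	struct task_struct *tsk;

	rcu_read_lock();
	if (!pid || pid == task_pid_vnr(current))
		tsk = current;			/* pid names the caller itself */
	else
		tsk = find_task_by_vpid(pid);	/* lookup in current's namespace */
	if (tsk)
		get_task_struct(tsk);		/* keep it valid past the RCU section */
	rcu_read_unlock();
	return tsk;
}
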
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
new file mode 100644
index 000000000000..5987dccdb2a0
--- /dev/null
+++ b/kernel/cgroup.c
@@ -0,0 +1,2805 @@
1/*
2 * kernel/cgroup.c
3 *
4 * Generic process-grouping system.
5 *
6 * Based originally on the cpuset system, extracted by Paul Menage
7 * Copyright (C) 2006 Google, Inc
8 *
9 * Copyright notices from the original cpuset code:
10 * --------------------------------------------------
11 * Copyright (C) 2003 BULL SA.
12 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
13 *
14 * Portions derived from Patrick Mochel's sysfs code.
15 * sysfs is Copyright (c) 2001-3 Patrick Mochel
16 *
17 * 2003-10-10 Written by Simon Derr.
18 * 2003-10-22 Updates by Stephen Hemminger.
19 * 2004 May-July Rework by Paul Jackson.
20 * ---------------------------------------------------
21 *
22 * This file is subject to the terms and conditions of the GNU General Public
23 * License. See the file COPYING in the main directory of the Linux
24 * distribution for more details.
25 */
26
27#include <linux/cgroup.h>
28#include <linux/errno.h>
29#include <linux/fs.h>
30#include <linux/kernel.h>
31#include <linux/list.h>
32#include <linux/mm.h>
33#include <linux/mutex.h>
34#include <linux/mount.h>
35#include <linux/pagemap.h>
36#include <linux/proc_fs.h>
37#include <linux/rcupdate.h>
38#include <linux/sched.h>
39#include <linux/backing-dev.h>
40#include <linux/seq_file.h>
41#include <linux/slab.h>
42#include <linux/magic.h>
43#include <linux/spinlock.h>
44#include <linux/string.h>
45#include <linux/sort.h>
46#include <linux/kmod.h>
47#include <linux/delayacct.h>
48#include <linux/cgroupstats.h>
49
50#include <asm/atomic.h>
51
52static DEFINE_MUTEX(cgroup_mutex);
53
54/* Generate an array of cgroup subsystem pointers */
55#define SUBSYS(_x) &_x ## _subsys,
56
57static struct cgroup_subsys *subsys[] = {
58#include <linux/cgroup_subsys.h>
59};
60
61/*
62 * A cgroupfs_root represents the root of a cgroup hierarchy,
63 * and may be associated with a superblock to form an active
64 * hierarchy
65 */
66struct cgroupfs_root {
67 struct super_block *sb;
68
69 /*
70 * The bitmask of subsystems intended to be attached to this
71 * hierarchy
72 */
73 unsigned long subsys_bits;
74
75 /* The bitmask of subsystems currently attached to this hierarchy */
76 unsigned long actual_subsys_bits;
77
78 /* A list running through the attached subsystems */
79 struct list_head subsys_list;
80
81 /* The root cgroup for this hierarchy */
82 struct cgroup top_cgroup;
83
84 /* Tracks how many cgroups are currently defined in hierarchy.*/
85 int number_of_cgroups;
86
87 /* A list running through the mounted hierarchies */
88 struct list_head root_list;
89
90 /* Hierarchy-specific flags */
91 unsigned long flags;
92
93 /* The path to use for release notifications. No locking
94 * between setting and use - so if userspace updates this
95 * while child cgroups exist, you could miss a
96 * notification. We ensure that it's always a valid
97 * NUL-terminated string */
98 char release_agent_path[PATH_MAX];
99};
100
101
102/*
103 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
104 * subsystems that are otherwise unattached - it never has more than a
105 * single cgroup, and all tasks are part of that cgroup.
106 */
107static struct cgroupfs_root rootnode;
108
109/* The list of hierarchy roots */
110
111static LIST_HEAD(roots);
112static int root_count;
113
114/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
115#define dummytop (&rootnode.top_cgroup)
116
117/* This flag indicates whether tasks in the fork and exit paths should
118 * take callback_mutex and check for fork/exit handlers to call. This
119 * avoids us having to do extra work in the fork/exit path if none of the
120 * subsystems need to be called.
121 */
122static int need_forkexit_callback;
123
124/* bits in struct cgroup flags field */
125enum {
126 /* Control Group is dead */
127 CGRP_REMOVED,
128 /* Control Group has previously had a child cgroup or a task,
129 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
130 CGRP_RELEASABLE,
131 /* Control Group requires release notifications to userspace */
132 CGRP_NOTIFY_ON_RELEASE,
133};
134
135/* convenient tests for these bits */
136inline int cgroup_is_removed(const struct cgroup *cgrp)
137{
138 return test_bit(CGRP_REMOVED, &cgrp->flags);
139}
140
141/* bits in struct cgroupfs_root flags field */
142enum {
143 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
144};
145
146inline int cgroup_is_releasable(const struct cgroup *cgrp)
147{
148 const int bits =
149 (1 << CGRP_RELEASABLE) |
150 (1 << CGRP_NOTIFY_ON_RELEASE);
151 return (cgrp->flags & bits) == bits;
152}
153
154inline int notify_on_release(const struct cgroup *cgrp)
155{
156 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
157}
158
159/*
160 * for_each_subsys() allows you to iterate on each subsystem attached to
161 * an active hierarchy
162 */
163#define for_each_subsys(_root, _ss) \
164list_for_each_entry(_ss, &_root->subsys_list, sibling)
165
166/* for_each_root() allows you to iterate across the active hierarchies */
167#define for_each_root(_root) \
168list_for_each_entry(_root, &roots, root_list)
169
170/* the list of cgroups eligible for automatic release. Protected by
171 * release_list_lock */
172static LIST_HEAD(release_list);
173static DEFINE_SPINLOCK(release_list_lock);
174static void cgroup_release_agent(struct work_struct *work);
175static DECLARE_WORK(release_agent_work, cgroup_release_agent);
176static void check_for_release(struct cgroup *cgrp);
177
178/* Link structure for associating css_set objects with cgroups */
179struct cg_cgroup_link {
180 /*
181 * List running through cg_cgroup_links associated with a
182 * cgroup, anchored on cgroup->css_sets
183 */
184 struct list_head cgrp_link_list;
185 /*
186 * List running through cg_cgroup_links pointing at a
187 * single css_set object, anchored on css_set->cg_links
188 */
189 struct list_head cg_link_list;
190 struct css_set *cg;
191};
192
193/* The default css_set - used by init and its children prior to any
194 * hierarchies being mounted. It contains a pointer to the root state
195 * for each subsystem. Also used to anchor the list of css_sets. Not
196 * reference-counted, to improve performance when child cgroups
197 * haven't been created.
198 */
199
200static struct css_set init_css_set;
201static struct cg_cgroup_link init_css_set_link;
202
203/* css_set_lock protects the list of css_set objects, and the
204 * chain of tasks off each css_set. Nests outside task->alloc_lock
205 * due to cgroup_iter_start() */
206static DEFINE_RWLOCK(css_set_lock);
207static int css_set_count;
208
209/* We don't maintain the lists running through each css_set to its
210 * task until after the first call to cgroup_iter_start(). This
211 * reduces the fork()/exit() overhead for people who have cgroups
212 * compiled into their kernel but not actually in use */
213static int use_task_css_set_links;
214
215/* When we create or destroy a css_set, the operation simply
216 * takes/releases a reference count on all the cgroups referenced
217 * by subsystems in this css_set. This can end up multiple-counting
218 * some cgroups, but that's OK - the ref-count is just a
219 * busy/not-busy indicator; ensuring that we only count each cgroup
220 * once would require taking a global lock to ensure that no
221 * subsystems moved between hierarchies while we were doing so.
222 *
223 * Possible TODO: decide at boot time based on the number of
224 * registered subsystems and the number of CPUs or NUMA nodes whether
225 * it's better for performance to ref-count every subsystem, or to
226 * take a global lock and only add one ref count to each hierarchy.
227 */
228
229/*
230 * unlink a css_set from the list and free it
231 */
232static void unlink_css_set(struct css_set *cg)
233{
234 write_lock(&css_set_lock);
235 list_del(&cg->list);
236 css_set_count--;
237 while (!list_empty(&cg->cg_links)) {
238 struct cg_cgroup_link *link;
239 link = list_entry(cg->cg_links.next,
240 struct cg_cgroup_link, cg_link_list);
241 list_del(&link->cg_link_list);
242 list_del(&link->cgrp_link_list);
243 kfree(link);
244 }
245 write_unlock(&css_set_lock);
246}
247
248static void __release_css_set(struct kref *k, int taskexit)
249{
250 int i;
251 struct css_set *cg = container_of(k, struct css_set, ref);
252
253 unlink_css_set(cg);
254
255 rcu_read_lock();
256 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
257 struct cgroup *cgrp = cg->subsys[i]->cgroup;
258 if (atomic_dec_and_test(&cgrp->count) &&
259 notify_on_release(cgrp)) {
260 if (taskexit)
261 set_bit(CGRP_RELEASABLE, &cgrp->flags);
262 check_for_release(cgrp);
263 }
264 }
265 rcu_read_unlock();
266 kfree(cg);
267}
268
269static void release_css_set(struct kref *k)
270{
271 __release_css_set(k, 0);
272}
273
274static void release_css_set_taskexit(struct kref *k)
275{
276 __release_css_set(k, 1);
277}
278
279/*
280 * refcounted get/put for css_set objects
281 */
282static inline void get_css_set(struct css_set *cg)
283{
284 kref_get(&cg->ref);
285}
286
287static inline void put_css_set(struct css_set *cg)
288{
289 kref_put(&cg->ref, release_css_set);
290}
291
292static inline void put_css_set_taskexit(struct css_set *cg)
293{
294 kref_put(&cg->ref, release_css_set_taskexit);
295}
296
297/*
298 * find_existing_css_set() is a helper for
299 * find_css_set(), and checks to see whether an existing
300 * css_set is suitable. This currently walks a linked-list for
301 * simplicity; a later patch will use a hash table for better
302 * performance
303 *
304 * oldcg: the cgroup group that we're using before the cgroup
305 * transition
306 *
307 * cgrp: the cgroup that we're moving into
308 *
309 * template: location in which to build the desired set of subsystem
310 * state objects for the new cgroup group
311 */
312
313static struct css_set *find_existing_css_set(
314 struct css_set *oldcg,
315 struct cgroup *cgrp,
316 struct cgroup_subsys_state *template[])
317{
318 int i;
319 struct cgroupfs_root *root = cgrp->root;
320 struct list_head *l = &init_css_set.list;
321
322 /* Built the set of subsystem state objects that we want to
323 * see in the new css_set */
324 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
325 if (root->subsys_bits & (1ull << i)) {
326 /* Subsystem is in this hierarchy. So we want
327 * the subsystem state from the new
328 * cgroup */
329 template[i] = cgrp->subsys[i];
330 } else {
331 /* Subsystem is not in this hierarchy, so we
332 * don't want to change the subsystem state */
333 template[i] = oldcg->subsys[i];
334 }
335 }
336
337 /* Look through existing cgroup groups to find one to reuse */
338 do {
339 struct css_set *cg =
340 list_entry(l, struct css_set, list);
341
342 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
343 /* All subsystems matched */
344 return cg;
345 }
346 /* Try the next cgroup group */
347 l = l->next;
348 } while (l != &init_css_set.list);
349
350 /* No existing cgroup group matched */
351 return NULL;
352}
353
354/*
355 * allocate_cg_links() allocates "count" cg_cgroup_link structures
356 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
357 * success or a negative error
358 */
359
360static int allocate_cg_links(int count, struct list_head *tmp)
361{
362 struct cg_cgroup_link *link;
363 int i;
364 INIT_LIST_HEAD(tmp);
365 for (i = 0; i < count; i++) {
366 link = kmalloc(sizeof(*link), GFP_KERNEL);
367 if (!link) {
368 while (!list_empty(tmp)) {
369 link = list_entry(tmp->next,
370 struct cg_cgroup_link,
371 cgrp_link_list);
372 list_del(&link->cgrp_link_list);
373 kfree(link);
374 }
375 return -ENOMEM;
376 }
377 list_add(&link->cgrp_link_list, tmp);
378 }
379 return 0;
380}
381
382static void free_cg_links(struct list_head *tmp)
383{
384 while (!list_empty(tmp)) {
385 struct cg_cgroup_link *link;
386 link = list_entry(tmp->next,
387 struct cg_cgroup_link,
388 cgrp_link_list);
389 list_del(&link->cgrp_link_list);
390 kfree(link);
391 }
392}
393
394/*
395 * find_css_set() takes an existing cgroup group and a
396 * cgroup object, and returns a css_set object that's
397 * equivalent to the old group, but with the given cgroup
398 * substituted into the appropriate hierarchy. Must be called with
399 * cgroup_mutex held
400 */
401
402static struct css_set *find_css_set(
403 struct css_set *oldcg, struct cgroup *cgrp)
404{
405 struct css_set *res;
406 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
407 int i;
408
409 struct list_head tmp_cg_links;
410 struct cg_cgroup_link *link;
411
412 /* First see if we already have a cgroup group that matches
413 * the desired set */
414 write_lock(&css_set_lock);
415 res = find_existing_css_set(oldcg, cgrp, template);
416 if (res)
417 get_css_set(res);
418 write_unlock(&css_set_lock);
419
420 if (res)
421 return res;
422
423 res = kmalloc(sizeof(*res), GFP_KERNEL);
424 if (!res)
425 return NULL;
426
427 /* Allocate all the cg_cgroup_link objects that we'll need */
428 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
429 kfree(res);
430 return NULL;
431 }
432
433 kref_init(&res->ref);
434 INIT_LIST_HEAD(&res->cg_links);
435 INIT_LIST_HEAD(&res->tasks);
436
437 /* Copy the set of subsystem state objects generated in
438 * find_existing_css_set() */
439 memcpy(res->subsys, template, sizeof(res->subsys));
440
441 write_lock(&css_set_lock);
442 /* Add reference counts and links from the new css_set. */
443 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
444 struct cgroup *cgrp = res->subsys[i]->cgroup;
445 struct cgroup_subsys *ss = subsys[i];
446 atomic_inc(&cgrp->count);
447 /*
448 * We want to add a link once per cgroup, so we
449 * only do it for the first subsystem in each
450 * hierarchy
451 */
452 if (ss->root->subsys_list.next == &ss->sibling) {
453 BUG_ON(list_empty(&tmp_cg_links));
454 link = list_entry(tmp_cg_links.next,
455 struct cg_cgroup_link,
456 cgrp_link_list);
457 list_del(&link->cgrp_link_list);
458 list_add(&link->cgrp_link_list, &cgrp->css_sets);
459 link->cg = res;
460 list_add(&link->cg_link_list, &res->cg_links);
461 }
462 }
463 if (list_empty(&rootnode.subsys_list)) {
464 link = list_entry(tmp_cg_links.next,
465 struct cg_cgroup_link,
466 cgrp_link_list);
467 list_del(&link->cgrp_link_list);
468 list_add(&link->cgrp_link_list, &dummytop->css_sets);
469 link->cg = res;
470 list_add(&link->cg_link_list, &res->cg_links);
471 }
472
473 BUG_ON(!list_empty(&tmp_cg_links));
474
475 /* Link this cgroup group into the list */
476 list_add(&res->list, &init_css_set.list);
477 css_set_count++;
478 INIT_LIST_HEAD(&res->tasks);
479 write_unlock(&css_set_lock);
480
481 return res;
482}
483
484/*
485 * There is one global cgroup mutex. We also require taking
486 * task_lock() when dereferencing a task's cgroup subsys pointers.
487 * See "The task_lock() exception", at the end of this comment.
488 *
489 * A task must hold cgroup_mutex to modify cgroups.
490 *
491 * Any task can increment and decrement the count field without lock.
492 * So in general, code holding cgroup_mutex can't rely on the count
493 * field not changing. However, if the count goes to zero, then only
494 * attach_task() can increment it again. Because a count of zero
495 * means that no tasks are currently attached, therefore there is no
496 * way a task attached to that cgroup can fork (the other way to
497 * increment the count). So code holding cgroup_mutex can safely
498 * assume that if the count is zero, it will stay zero. Similarly, if
499 * a task holds cgroup_mutex on a cgroup with zero count, it
500 * knows that the cgroup won't be removed, as cgroup_rmdir()
501 * needs that mutex.
502 *
503 * The cgroup_common_file_write handler for operations that modify
504 * the cgroup hierarchy holds cgroup_mutex across the entire operation,
505 * single threading all such cgroup modifications across the system.
506 *
507 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
508 * (usually) take cgroup_mutex. These are the two most performance
509 * critical pieces of code here. The exception occurs on cgroup_exit(),
510 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
511 * is taken, and if the cgroup count is zero, a usermode call made
512 * to /sbin/cgroup_release_agent with the name of the cgroup (path
513 * relative to the root of cgroup file system) as the argument.
514 *
515 * A cgroup can only be deleted if both its 'count' of using tasks
516 * is zero, and its list of 'children' cgroups is empty. Since all
517 * tasks in the system use _some_ cgroup, and since there is always at
518 * least one task in the system (init, pid == 1), therefore, top_cgroup
519 * always has either children cgroups and/or using tasks. So we don't
520 * need a special hack to ensure that top_cgroup cannot be deleted.
521 *
522 * The task_lock() exception
523 *
524 * The need for this exception arises from the action of
525 * attach_task(), which overwrites one task's cgroup pointer with
526 * another. It does so using cgroup_mutex, however there are
527 * several performance critical places that need to reference
528 * task->cgroup without the expense of grabbing a system global
529 * mutex. Therefore except as noted below, when dereferencing or, as
530 * in attach_task(), modifying a task's cgroup pointer we use
531 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
532 * the task_struct routinely used for such matters.
533 *
534 * P.S. One more locking exception. RCU is used to guard the
535 * update of a task's cgroup pointer by attach_task()
536 */
537
538/**
539 * cgroup_lock - lock out any changes to cgroup structures
540 *
541 */
542
543void cgroup_lock(void)
544{
545 mutex_lock(&cgroup_mutex);
546}
547
548/**
549 * cgroup_unlock - release lock on cgroup changes
550 *
551 * Undo the lock taken in a previous cgroup_lock() call.
552 */
553
554void cgroup_unlock(void)
555{
556 mutex_unlock(&cgroup_mutex);
557}
558
559/*
560 * A couple of forward declarations required, due to cyclic reference loop:
561 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
562 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
563 * -> cgroup_mkdir.
564 */
565
566static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
567static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
568static int cgroup_populate_dir(struct cgroup *cgrp);
569static struct inode_operations cgroup_dir_inode_operations;
570static struct file_operations proc_cgroupstats_operations;
571
572static struct backing_dev_info cgroup_backing_dev_info = {
573 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
574};
575
576static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
577{
578 struct inode *inode = new_inode(sb);
579
580 if (inode) {
581 inode->i_mode = mode;
582 inode->i_uid = current->fsuid;
583 inode->i_gid = current->fsgid;
584 inode->i_blocks = 0;
585 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
586 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
587 }
588 return inode;
589}
590
591static void cgroup_diput(struct dentry *dentry, struct inode *inode)
592{
593 /* is dentry a directory ? if so, kfree() associated cgroup */
594 if (S_ISDIR(inode->i_mode)) {
595 struct cgroup *cgrp = dentry->d_fsdata;
596 BUG_ON(!(cgroup_is_removed(cgrp)));
597 /* It's possible for external users to be holding css
598 * reference counts on a cgroup; css_put() needs to
599 * be able to access the cgroup after decrementing
600 * the reference count in order to know if it needs to
601 * queue the cgroup to be handled by the release
602 * agent */
603 synchronize_rcu();
604 kfree(cgrp);
605 }
606 iput(inode);
607}
608
609static void remove_dir(struct dentry *d)
610{
611 struct dentry *parent = dget(d->d_parent);
612
613 d_delete(d);
614 simple_rmdir(parent->d_inode, d);
615 dput(parent);
616}
617
618static void cgroup_clear_directory(struct dentry *dentry)
619{
620 struct list_head *node;
621
622 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
623 spin_lock(&dcache_lock);
624 node = dentry->d_subdirs.next;
625 while (node != &dentry->d_subdirs) {
626 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
627 list_del_init(node);
628 if (d->d_inode) {
629 /* This should never be called on a cgroup
630 * directory with child cgroups */
631 BUG_ON(d->d_inode->i_mode & S_IFDIR);
632 d = dget_locked(d);
633 spin_unlock(&dcache_lock);
634 d_delete(d);
635 simple_unlink(dentry->d_inode, d);
636 dput(d);
637 spin_lock(&dcache_lock);
638 }
639 node = dentry->d_subdirs.next;
640 }
641 spin_unlock(&dcache_lock);
642}
643
644/*
645 * NOTE : the dentry must have been dget()'ed
646 */
647static void cgroup_d_remove_dir(struct dentry *dentry)
648{
649 cgroup_clear_directory(dentry);
650
651 spin_lock(&dcache_lock);
652 list_del_init(&dentry->d_u.d_child);
653 spin_unlock(&dcache_lock);
654 remove_dir(dentry);
655}
656
657static int rebind_subsystems(struct cgroupfs_root *root,
658 unsigned long final_bits)
659{
660 unsigned long added_bits, removed_bits;
661 struct cgroup *cgrp = &root->top_cgroup;
662 int i;
663
664 removed_bits = root->actual_subsys_bits & ~final_bits;
665 added_bits = final_bits & ~root->actual_subsys_bits;
666 /* Check that any added subsystems are currently free */
667 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
668 unsigned long long bit = 1ull << i;
669 struct cgroup_subsys *ss = subsys[i];
670 if (!(bit & added_bits))
671 continue;
672 if (ss->root != &rootnode) {
673 /* Subsystem isn't free */
674 return -EBUSY;
675 }
676 }
677
678 /* Currently we don't handle adding/removing subsystems when
679 * any child cgroups exist. This is theoretically supportable
680 * but involves complex error handling, so it's being left until
681 * later */
682 if (!list_empty(&cgrp->children))
683 return -EBUSY;
684
685 /* Process each subsystem */
686 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
687 struct cgroup_subsys *ss = subsys[i];
688 unsigned long bit = 1UL << i;
689 if (bit & added_bits) {
690 /* We're binding this subsystem to this hierarchy */
691 BUG_ON(cgrp->subsys[i]);
692 BUG_ON(!dummytop->subsys[i]);
693 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
694 cgrp->subsys[i] = dummytop->subsys[i];
695 cgrp->subsys[i]->cgroup = cgrp;
696 list_add(&ss->sibling, &root->subsys_list);
697 rcu_assign_pointer(ss->root, root);
698 if (ss->bind)
699 ss->bind(ss, cgrp);
700
701 } else if (bit & removed_bits) {
702 /* We're removing this subsystem */
703 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
704 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
705 if (ss->bind)
706 ss->bind(ss, dummytop);
707 dummytop->subsys[i]->cgroup = dummytop;
708 cgrp->subsys[i] = NULL;
709 rcu_assign_pointer(subsys[i]->root, &rootnode);
710 list_del(&ss->sibling);
711 } else if (bit & final_bits) {
712 /* Subsystem state should already exist */
713 BUG_ON(!cgrp->subsys[i]);
714 } else {
715 /* Subsystem state shouldn't exist */
716 BUG_ON(cgrp->subsys[i]);
717 }
718 }
719 root->subsys_bits = root->actual_subsys_bits = final_bits;
720 synchronize_rcu();
721
722 return 0;
723}
724
725static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
726{
727 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
728 struct cgroup_subsys *ss;
729
730 mutex_lock(&cgroup_mutex);
731 for_each_subsys(root, ss)
732 seq_printf(seq, ",%s", ss->name);
733 if (test_bit(ROOT_NOPREFIX, &root->flags))
734 seq_puts(seq, ",noprefix");
735 if (strlen(root->release_agent_path))
736 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
737 mutex_unlock(&cgroup_mutex);
738 return 0;
739}
740
741struct cgroup_sb_opts {
742 unsigned long subsys_bits;
743 unsigned long flags;
744 char *release_agent;
745};
746
747/* Convert a hierarchy specifier into a bitmask of subsystems and
748 * flags. */
749static int parse_cgroupfs_options(char *data,
750 struct cgroup_sb_opts *opts)
751{
752 char *token, *o = data ?: "all";
753
754 opts->subsys_bits = 0;
755 opts->flags = 0;
756 opts->release_agent = NULL;
757
758 while ((token = strsep(&o, ",")) != NULL) {
759 if (!*token)
760 return -EINVAL;
761 if (!strcmp(token, "all")) {
762 opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
763 } else if (!strcmp(token, "noprefix")) {
764 set_bit(ROOT_NOPREFIX, &opts->flags);
765 } else if (!strncmp(token, "release_agent=", 14)) {
766 /* Specifying two release agents is forbidden */
767 if (opts->release_agent)
768 return -EINVAL;
769 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
770 if (!opts->release_agent)
771 return -ENOMEM;
772 strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
773 opts->release_agent[PATH_MAX - 1] = 0;
774 } else {
775 struct cgroup_subsys *ss;
776 int i;
777 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
778 ss = subsys[i];
779 if (!strcmp(token, ss->name)) {
780 set_bit(i, &opts->subsys_bits);
781 break;
782 }
783 }
784 if (i == CGROUP_SUBSYS_COUNT)
785 return -ENOENT;
786 }
787 }
788
789 /* We can't have an empty hierarchy */
790 if (!opts->subsys_bits)
791 return -EINVAL;
792
793 return 0;
794}
795
796static int cgroup_remount(struct super_block *sb, int *flags, char *data)
797{
798 int ret = 0;
799 struct cgroupfs_root *root = sb->s_fs_info;
800 struct cgroup *cgrp = &root->top_cgroup;
801 struct cgroup_sb_opts opts;
802
803 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
804 mutex_lock(&cgroup_mutex);
805
806 /* See what subsystems are wanted */
807 ret = parse_cgroupfs_options(data, &opts);
808 if (ret)
809 goto out_unlock;
810
811 /* Don't allow flags to change at remount */
812 if (opts.flags != root->flags) {
813 ret = -EINVAL;
814 goto out_unlock;
815 }
816
817 ret = rebind_subsystems(root, opts.subsys_bits);
818
819 /* (re)populate subsystem files */
820 if (!ret)
821 cgroup_populate_dir(cgrp);
822
823 if (opts.release_agent)
824 strcpy(root->release_agent_path, opts.release_agent);
825 out_unlock:
826 if (opts.release_agent)
827 kfree(opts.release_agent);
828 mutex_unlock(&cgroup_mutex);
829 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
830 return ret;
831}
832
833static struct super_operations cgroup_ops = {
834 .statfs = simple_statfs,
835 .drop_inode = generic_delete_inode,
836 .show_options = cgroup_show_options,
837 .remount_fs = cgroup_remount,
838};
839
840static void init_cgroup_root(struct cgroupfs_root *root)
841{
842 struct cgroup *cgrp = &root->top_cgroup;
843 INIT_LIST_HEAD(&root->subsys_list);
844 INIT_LIST_HEAD(&root->root_list);
845 root->number_of_cgroups = 1;
846 cgrp->root = root;
847 cgrp->top_cgroup = cgrp;
848 INIT_LIST_HEAD(&cgrp->sibling);
849 INIT_LIST_HEAD(&cgrp->children);
850 INIT_LIST_HEAD(&cgrp->css_sets);
851 INIT_LIST_HEAD(&cgrp->release_list);
852}
853
854static int cgroup_test_super(struct super_block *sb, void *data)
855{
856 struct cgroupfs_root *new = data;
857 struct cgroupfs_root *root = sb->s_fs_info;
858
859 /* First check subsystems */
860 if (new->subsys_bits != root->subsys_bits)
861 return 0;
862
863 /* Next check flags */
864 if (new->flags != root->flags)
865 return 0;
866
867 return 1;
868}
869
870static int cgroup_set_super(struct super_block *sb, void *data)
871{
872 int ret;
873 struct cgroupfs_root *root = data;
874
875 ret = set_anon_super(sb, NULL);
876 if (ret)
877 return ret;
878
879 sb->s_fs_info = root;
880 root->sb = sb;
881
882 sb->s_blocksize = PAGE_CACHE_SIZE;
883 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
884 sb->s_magic = CGROUP_SUPER_MAGIC;
885 sb->s_op = &cgroup_ops;
886
887 return 0;
888}
889
890static int cgroup_get_rootdir(struct super_block *sb)
891{
892 struct inode *inode =
893 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
894 struct dentry *dentry;
895
896 if (!inode)
897 return -ENOMEM;
898
899 inode->i_op = &simple_dir_inode_operations;
900 inode->i_fop = &simple_dir_operations;
901 inode->i_op = &cgroup_dir_inode_operations;
902 /* directories start off with i_nlink == 2 (for "." entry) */
903 inc_nlink(inode);
904 dentry = d_alloc_root(inode);
905 if (!dentry) {
906 iput(inode);
907 return -ENOMEM;
908 }
909 sb->s_root = dentry;
910 return 0;
911}
912
913static int cgroup_get_sb(struct file_system_type *fs_type,
914 int flags, const char *unused_dev_name,
915 void *data, struct vfsmount *mnt)
916{
917 struct cgroup_sb_opts opts;
918 int ret = 0;
919 struct super_block *sb;
920 struct cgroupfs_root *root;
921 struct list_head tmp_cg_links, *l;
922 INIT_LIST_HEAD(&tmp_cg_links);
923
924 /* First find the desired set of subsystems */
925 ret = parse_cgroupfs_options(data, &opts);
926 if (ret) {
927 if (opts.release_agent)
928 kfree(opts.release_agent);
929 return ret;
930 }
931
932 root = kzalloc(sizeof(*root), GFP_KERNEL);
933 if (!root)
934 return -ENOMEM;
935
936 init_cgroup_root(root);
937 root->subsys_bits = opts.subsys_bits;
938 root->flags = opts.flags;
939 if (opts.release_agent) {
940 strcpy(root->release_agent_path, opts.release_agent);
941 kfree(opts.release_agent);
942 }
943
944 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
945
946 if (IS_ERR(sb)) {
947 kfree(root);
948 return PTR_ERR(sb);
949 }
950
951 if (sb->s_fs_info != root) {
952 /* Reusing an existing superblock */
953 BUG_ON(sb->s_root == NULL);
954 kfree(root);
955 root = NULL;
956 } else {
957 /* New superblock */
958 struct cgroup *cgrp = &root->top_cgroup;
959 struct inode *inode;
960
961 BUG_ON(sb->s_root != NULL);
962
963 ret = cgroup_get_rootdir(sb);
964 if (ret)
965 goto drop_new_super;
966 inode = sb->s_root->d_inode;
967
968 mutex_lock(&inode->i_mutex);
969 mutex_lock(&cgroup_mutex);
970
971 /*
972 * We're accessing css_set_count without locking
973 * css_set_lock here, but that's OK - it can only be
974 * increased by someone holding cgroup_lock, and
975 * that's us. The worst that can happen is that we
976 * have some link structures left over
977 */
978 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
979 if (ret) {
980 mutex_unlock(&cgroup_mutex);
981 mutex_unlock(&inode->i_mutex);
982 goto drop_new_super;
983 }
984
985 ret = rebind_subsystems(root, root->subsys_bits);
986 if (ret == -EBUSY) {
987 mutex_unlock(&cgroup_mutex);
988 mutex_unlock(&inode->i_mutex);
989 goto drop_new_super;
990 }
991
992 /* EBUSY should be the only error here */
993 BUG_ON(ret);
994
995 list_add(&root->root_list, &roots);
996 root_count++;
997
998 sb->s_root->d_fsdata = &root->top_cgroup;
999 root->top_cgroup.dentry = sb->s_root;
1000
1001 /* Link the top cgroup in this hierarchy into all
1002 * the css_set objects */
1003 write_lock(&css_set_lock);
1004 l = &init_css_set.list;
1005 do {
1006 struct css_set *cg;
1007 struct cg_cgroup_link *link;
1008 cg = list_entry(l, struct css_set, list);
1009 BUG_ON(list_empty(&tmp_cg_links));
1010 link = list_entry(tmp_cg_links.next,
1011 struct cg_cgroup_link,
1012 cgrp_link_list);
1013 list_del(&link->cgrp_link_list);
1014 link->cg = cg;
1015 list_add(&link->cgrp_link_list,
1016 &root->top_cgroup.css_sets);
1017 list_add(&link->cg_link_list, &cg->cg_links);
1018 l = l->next;
1019 } while (l != &init_css_set.list);
1020 write_unlock(&css_set_lock);
1021
1022 free_cg_links(&tmp_cg_links);
1023
1024 BUG_ON(!list_empty(&cgrp->sibling));
1025 BUG_ON(!list_empty(&cgrp->children));
1026 BUG_ON(root->number_of_cgroups != 1);
1027
1028 cgroup_populate_dir(cgrp);
1029 mutex_unlock(&inode->i_mutex);
1030 mutex_unlock(&cgroup_mutex);
1031 }
1032
1033 return simple_set_mnt(mnt, sb);
1034
1035 drop_new_super:
1036 up_write(&sb->s_umount);
1037 deactivate_super(sb);
1038 free_cg_links(&tmp_cg_links);
1039 return ret;
1040}
1041
1042static void cgroup_kill_sb(struct super_block *sb) {
1043 struct cgroupfs_root *root = sb->s_fs_info;
1044 struct cgroup *cgrp = &root->top_cgroup;
1045 int ret;
1046
1047 BUG_ON(!root);
1048
1049 BUG_ON(root->number_of_cgroups != 1);
1050 BUG_ON(!list_empty(&cgrp->children));
1051 BUG_ON(!list_empty(&cgrp->sibling));
1052
1053 mutex_lock(&cgroup_mutex);
1054
1055 /* Rebind all subsystems back to the default hierarchy */
1056 ret = rebind_subsystems(root, 0);
1057 /* Shouldn't be able to fail ... */
1058 BUG_ON(ret);
1059
1060 /*
1061 * Release all the links from css_sets to this hierarchy's
1062 * root cgroup
1063 */
1064 write_lock(&css_set_lock);
1065 while (!list_empty(&cgrp->css_sets)) {
1066 struct cg_cgroup_link *link;
1067 link = list_entry(cgrp->css_sets.next,
1068 struct cg_cgroup_link, cgrp_link_list);
1069 list_del(&link->cg_link_list);
1070 list_del(&link->cgrp_link_list);
1071 kfree(link);
1072 }
1073 write_unlock(&css_set_lock);
1074
1075 if (!list_empty(&root->root_list)) {
1076 list_del(&root->root_list);
1077 root_count--;
1078 }
1079 mutex_unlock(&cgroup_mutex);
1080
1081 kfree(root);
1082 kill_litter_super(sb);
1083}
1084
1085static struct file_system_type cgroup_fs_type = {
1086 .name = "cgroup",
1087 .get_sb = cgroup_get_sb,
1088 .kill_sb = cgroup_kill_sb,
1089};
1090
1091static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1092{
1093 return dentry->d_fsdata;
1094}
1095
1096static inline struct cftype *__d_cft(struct dentry *dentry)
1097{
1098 return dentry->d_fsdata;
1099}
1100
1101/*
1102 * Called with cgroup_mutex held. Writes path of cgroup into buf.
1103 * Returns 0 on success, -errno on error.
1104 */
1105int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1106{
1107 char *start;
1108
1109 if (cgrp == dummytop) {
1110 /*
1111 * Inactive subsystems have no dentry for their root
1112 * cgroup
1113 */
1114 strcpy(buf, "/");
1115 return 0;
1116 }
1117
1118 start = buf + buflen;
1119
1120 *--start = '\0';
1121 for (;;) {
1122 int len = cgrp->dentry->d_name.len;
1123 if ((start -= len) < buf)
1124 return -ENAMETOOLONG;
1125 memcpy(start, cgrp->dentry->d_name.name, len);
1126 cgrp = cgrp->parent;
1127 if (!cgrp)
1128 break;
1129 if (!cgrp->parent)
1130 continue;
1131 if (--start < buf)
1132 return -ENAMETOOLONG;
1133 *start = '/';
1134 }
1135 memmove(buf, start, buf + buflen - start);
1136 return 0;
1137}
1138
1139/*
1140 * Return the first subsystem attached to a cgroup's hierarchy, and
1141 * its subsystem id.
1142 */
1143
1144static void get_first_subsys(const struct cgroup *cgrp,
1145 struct cgroup_subsys_state **css, int *subsys_id)
1146{
1147 const struct cgroupfs_root *root = cgrp->root;
1148 const struct cgroup_subsys *test_ss;
1149 BUG_ON(list_empty(&root->subsys_list));
1150 test_ss = list_entry(root->subsys_list.next,
1151 struct cgroup_subsys, sibling);
1152 if (css) {
1153 *css = cgrp->subsys[test_ss->subsys_id];
1154 BUG_ON(!*css);
1155 }
1156 if (subsys_id)
1157 *subsys_id = test_ss->subsys_id;
1158}
1159
1160/*
1161 * Attach task 'tsk' to cgroup 'cgrp'
1162 *
1163 * Call holding cgroup_mutex. May take task_lock of
1164 * the task 'pid' during call.
1165 */
1166static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1167{
1168 int retval = 0;
1169 struct cgroup_subsys *ss;
1170 struct cgroup *oldcgrp;
1171 struct css_set *cg = tsk->cgroups;
1172 struct css_set *newcg;
1173 struct cgroupfs_root *root = cgrp->root;
1174 int subsys_id;
1175
1176 get_first_subsys(cgrp, NULL, &subsys_id);
1177
1178 /* Nothing to do if the task is already in that cgroup */
1179 oldcgrp = task_cgroup(tsk, subsys_id);
1180 if (cgrp == oldcgrp)
1181 return 0;
1182
1183 for_each_subsys(root, ss) {
1184 if (ss->can_attach) {
1185 retval = ss->can_attach(ss, cgrp, tsk);
1186 if (retval) {
1187 return retval;
1188 }
1189 }
1190 }
1191
1192 /*
1193 * Locate or allocate a new css_set for this task,
1194 * based on its final set of cgroups
1195 */
1196 newcg = find_css_set(cg, cgrp);
1197 if (!newcg) {
1198 return -ENOMEM;
1199 }
1200
1201 task_lock(tsk);
1202 if (tsk->flags & PF_EXITING) {
1203 task_unlock(tsk);
1204 put_css_set(newcg);
1205 return -ESRCH;
1206 }
1207 rcu_assign_pointer(tsk->cgroups, newcg);
1208 task_unlock(tsk);
1209
1210 /* Update the css_set linked lists if we're using them */
1211 write_lock(&css_set_lock);
1212 if (!list_empty(&tsk->cg_list)) {
1213 list_del(&tsk->cg_list);
1214 list_add(&tsk->cg_list, &newcg->tasks);
1215 }
1216 write_unlock(&css_set_lock);
1217
1218 for_each_subsys(root, ss) {
1219 if (ss->attach) {
1220 ss->attach(ss, cgrp, oldcgrp, tsk);
1221 }
1222 }
1223 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1224 synchronize_rcu();
1225 put_css_set(cg);
1226 return 0;
1227}
1228
1229/*
1230 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with
1231 * cgroup_mutex, may take task_lock of task
1232 */
1233static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
1234{
1235 pid_t pid;
1236 struct task_struct *tsk;
1237 int ret;
1238
1239 if (sscanf(pidbuf, "%d", &pid) != 1)
1240 return -EIO;
1241
1242 if (pid) {
1243 rcu_read_lock();
1244 tsk = find_task_by_pid(pid);
1245 if (!tsk || tsk->flags & PF_EXITING) {
1246 rcu_read_unlock();
1247 return -ESRCH;
1248 }
1249 get_task_struct(tsk);
1250 rcu_read_unlock();
1251
1252 if ((current->euid) && (current->euid != tsk->uid)
1253 && (current->euid != tsk->suid)) {
1254 put_task_struct(tsk);
1255 return -EACCES;
1256 }
1257 } else {
1258 tsk = current;
1259 get_task_struct(tsk);
1260 }
1261
1262 ret = attach_task(cgrp, tsk);
1263 put_task_struct(tsk);
1264 return ret;
1265}
1266
1267/* The various types of files and directories in a cgroup file system */
1268
1269enum cgroup_filetype {
1270 FILE_ROOT,
1271 FILE_DIR,
1272 FILE_TASKLIST,
1273 FILE_NOTIFY_ON_RELEASE,
1274 FILE_RELEASABLE,
1275 FILE_RELEASE_AGENT,
1276};
1277
1278static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
1279 struct file *file,
1280 const char __user *userbuf,
1281 size_t nbytes, loff_t *unused_ppos)
1282{
1283 char buffer[64];
1284 int retval = 0;
1285 u64 val;
1286 char *end;
1287
1288 if (!nbytes)
1289 return -EINVAL;
1290 if (nbytes >= sizeof(buffer))
1291 return -E2BIG;
1292 if (copy_from_user(buffer, userbuf, nbytes))
1293 return -EFAULT;
1294
1295 buffer[nbytes] = 0; /* nul-terminate */
1296
1297 /* strip newline if necessary */
1298 if (nbytes && (buffer[nbytes-1] == '\n'))
1299 buffer[nbytes-1] = 0;
1300 val = simple_strtoull(buffer, &end, 0);
1301 if (*end)
1302 return -EINVAL;
1303
1304 /* Pass to subsystem */
1305 retval = cft->write_uint(cgrp, cft, val);
1306 if (!retval)
1307 retval = nbytes;
1308 return retval;
1309}
1310
1311static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
1312 struct cftype *cft,
1313 struct file *file,
1314 const char __user *userbuf,
1315 size_t nbytes, loff_t *unused_ppos)
1316{
1317 enum cgroup_filetype type = cft->private;
1318 char *buffer;
1319 int retval = 0;
1320
1321 if (nbytes >= PATH_MAX)
1322 return -E2BIG;
1323
1324 /* +1 for nul-terminator */
1325 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1326 if (buffer == NULL)
1327 return -ENOMEM;
1328
1329 if (copy_from_user(buffer, userbuf, nbytes)) {
1330 retval = -EFAULT;
1331 goto out1;
1332 }
1333 buffer[nbytes] = 0; /* nul-terminate */
1334
1335 mutex_lock(&cgroup_mutex);
1336
1337 if (cgroup_is_removed(cgrp)) {
1338 retval = -ENODEV;
1339 goto out2;
1340 }
1341
1342 switch (type) {
1343 case FILE_TASKLIST:
1344 retval = attach_task_by_pid(cgrp, buffer);
1345 break;
1346 case FILE_NOTIFY_ON_RELEASE:
1347 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
1348 if (simple_strtoul(buffer, NULL, 10) != 0)
1349 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1350 else
1351 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1352 break;
1353 case FILE_RELEASE_AGENT:
1354 {
1355 struct cgroupfs_root *root = cgrp->root;
1356 /* Strip trailing newline */
1357 if (nbytes && (buffer[nbytes-1] == '\n')) {
1358 buffer[nbytes-1] = 0;
1359 }
1360 if (nbytes < sizeof(root->release_agent_path)) {
1361 /* We never write anything other than '\0'
1362 * into the last char of release_agent_path,
1363 * so it always remains a NUL-terminated
1364 * string */
1365 strncpy(root->release_agent_path, buffer, nbytes);
1366 root->release_agent_path[nbytes] = 0;
1367 } else {
1368 retval = -ENOSPC;
1369 }
1370 break;
1371 }
1372 default:
1373 retval = -EINVAL;
1374 goto out2;
1375 }
1376
1377 if (retval == 0)
1378 retval = nbytes;
1379out2:
1380 mutex_unlock(&cgroup_mutex);
1381out1:
1382 kfree(buffer);
1383 return retval;
1384}
1385
1386static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1387 size_t nbytes, loff_t *ppos)
1388{
1389 struct cftype *cft = __d_cft(file->f_dentry);
1390 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1391
1392 if (!cft)
1393 return -ENODEV;
1394 if (cft->write)
1395 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1396 if (cft->write_uint)
1397 return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos);
1398 return -EINVAL;
1399}
1400
1401static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft,
1402 struct file *file,
1403 char __user *buf, size_t nbytes,
1404 loff_t *ppos)
1405{
1406 char tmp[64];
1407 u64 val = cft->read_uint(cgrp, cft);
1408 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1409
1410 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1411}
1412
1413static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1414 struct cftype *cft,
1415 struct file *file,
1416 char __user *buf,
1417 size_t nbytes, loff_t *ppos)
1418{
1419 enum cgroup_filetype type = cft->private;
1420 char *page;
1421 ssize_t retval = 0;
1422 char *s;
1423
1424 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
1425 return -ENOMEM;
1426
1427 s = page;
1428
1429 switch (type) {
1430 case FILE_RELEASE_AGENT:
1431 {
1432 struct cgroupfs_root *root;
1433 size_t n;
1434 mutex_lock(&cgroup_mutex);
1435 root = cgrp->root;
1436 n = strnlen(root->release_agent_path,
1437 sizeof(root->release_agent_path));
1438 n = min(n, (size_t) PAGE_SIZE);
1439 strncpy(s, root->release_agent_path, n);
1440 mutex_unlock(&cgroup_mutex);
1441 s += n;
1442 break;
1443 }
1444 default:
1445 retval = -EINVAL;
1446 goto out;
1447 }
1448 *s++ = '\n';
1449
1450 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1451out:
1452 free_page((unsigned long)page);
1453 return retval;
1454}
1455
1456static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1457 size_t nbytes, loff_t *ppos)
1458{
1459 struct cftype *cft = __d_cft(file->f_dentry);
1460 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1461
1462 if (!cft)
1463 return -ENODEV;
1464
1465 if (cft->read)
1466 return cft->read(cgrp, cft, file, buf, nbytes, ppos);
1467 if (cft->read_uint)
1468 return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos);
1469 return -EINVAL;
1470}
1471
1472static int cgroup_file_open(struct inode *inode, struct file *file)
1473{
1474 int err;
1475 struct cftype *cft;
1476
1477 err = generic_file_open(inode, file);
1478 if (err)
1479 return err;
1480
1481 cft = __d_cft(file->f_dentry);
1482 if (!cft)
1483 return -ENODEV;
1484 if (cft->open)
1485 err = cft->open(inode, file);
1486 else
1487 err = 0;
1488
1489 return err;
1490}
1491
1492static int cgroup_file_release(struct inode *inode, struct file *file)
1493{
1494 struct cftype *cft = __d_cft(file->f_dentry);
1495 if (cft->release)
1496 return cft->release(inode, file);
1497 return 0;
1498}
1499
1500/*
1501 * cgroup_rename - Only allow simple rename of directories in place.
1502 */
1503static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
1504 struct inode *new_dir, struct dentry *new_dentry)
1505{
1506 if (!S_ISDIR(old_dentry->d_inode->i_mode))
1507 return -ENOTDIR;
1508 if (new_dentry->d_inode)
1509 return -EEXIST;
1510 if (old_dir != new_dir)
1511 return -EIO;
1512 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1513}
1514
1515static struct file_operations cgroup_file_operations = {
1516 .read = cgroup_file_read,
1517 .write = cgroup_file_write,
1518 .llseek = generic_file_llseek,
1519 .open = cgroup_file_open,
1520 .release = cgroup_file_release,
1521};
1522
1523static struct inode_operations cgroup_dir_inode_operations = {
1524 .lookup = simple_lookup,
1525 .mkdir = cgroup_mkdir,
1526 .rmdir = cgroup_rmdir,
1527 .rename = cgroup_rename,
1528};
1529
1530static int cgroup_create_file(struct dentry *dentry, int mode,
1531 struct super_block *sb)
1532{
1533 static struct dentry_operations cgroup_dops = {
1534 .d_iput = cgroup_diput,
1535 };
1536
1537 struct inode *inode;
1538
1539 if (!dentry)
1540 return -ENOENT;
1541 if (dentry->d_inode)
1542 return -EEXIST;
1543
1544 inode = cgroup_new_inode(mode, sb);
1545 if (!inode)
1546 return -ENOMEM;
1547
1548 if (S_ISDIR(mode)) {
1549 inode->i_op = &cgroup_dir_inode_operations;
1550 inode->i_fop = &simple_dir_operations;
1551
1552 /* start off with i_nlink == 2 (for "." entry) */
1553 inc_nlink(inode);
1554
1555 /* start with the directory inode held, so that we can
1556 * populate it without racing with another mkdir */
1557 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1558 } else if (S_ISREG(mode)) {
1559 inode->i_size = 0;
1560 inode->i_fop = &cgroup_file_operations;
1561 }
1562 dentry->d_op = &cgroup_dops;
1563 d_instantiate(dentry, inode);
1564 dget(dentry); /* Extra count - pin the dentry in core */
1565 return 0;
1566}
1567
1568/*
1569 * cgroup_create_dir - create a directory for an object.
1570 * cgrp: the cgroup we create the directory for.
1571 * It must have a valid ->parent field
1572 * And we are going to fill its ->dentry field.
1573 * dentry: dentry of the new cgroup
1574 * mode: mode to set on new directory.
1575 */
1576static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1577 int mode)
1578{
1579 struct dentry *parent;
1580 int error = 0;
1581
1582 parent = cgrp->parent->dentry;
1583 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
1584 if (!error) {
1585 dentry->d_fsdata = cgrp;
1586 inc_nlink(parent->d_inode);
1587 cgrp->dentry = dentry;
1588 dget(dentry);
1589 }
1590 dput(dentry);
1591
1592 return error;
1593}
1594
1595int cgroup_add_file(struct cgroup *cgrp,
1596 struct cgroup_subsys *subsys,
1597 const struct cftype *cft)
1598{
1599 struct dentry *dir = cgrp->dentry;
1600 struct dentry *dentry;
1601 int error;
1602
1603 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
1604 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
1605 strcpy(name, subsys->name);
1606 strcat(name, ".");
1607 }
1608 strcat(name, cft->name);
1609 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
1610 dentry = lookup_one_len(name, dir, strlen(name));
1611 if (!IS_ERR(dentry)) {
1612 error = cgroup_create_file(dentry, 0644 | S_IFREG,
1613 cgrp->root->sb);
1614 if (!error)
1615 dentry->d_fsdata = (void *)cft;
1616 dput(dentry);
1617 } else
1618 error = PTR_ERR(dentry);
1619 return error;
1620}
1621
1622int cgroup_add_files(struct cgroup *cgrp,
1623 struct cgroup_subsys *subsys,
1624 const struct cftype cft[],
1625 int count)
1626{
1627 int i, err;
1628 for (i = 0; i < count; i++) {
1629 err = cgroup_add_file(cgrp, subsys, &cft[i]);
1630 if (err)
1631 return err;
1632 }
1633 return 0;
1634}
1635
1636/* Count the number of tasks in a cgroup. */
1637
1638int cgroup_task_count(const struct cgroup *cgrp)
1639{
1640 int count = 0;
1641 struct list_head *l;
1642
1643 read_lock(&css_set_lock);
1644 l = cgrp->css_sets.next;
1645 while (l != &cgrp->css_sets) {
1646 struct cg_cgroup_link *link =
1647 list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1648 count += atomic_read(&link->cg->ref.refcount);
1649 l = l->next;
1650 }
1651 read_unlock(&css_set_lock);
1652 return count;
1653}
1654
1655/*
1656 * Advance a list_head iterator. The iterator should be positioned at
1657 * the start of a css_set
1658 */
1659static void cgroup_advance_iter(struct cgroup *cgrp,
1660 struct cgroup_iter *it)
1661{
1662 struct list_head *l = it->cg_link;
1663 struct cg_cgroup_link *link;
1664 struct css_set *cg;
1665
1666 /* Advance to the next non-empty css_set */
1667 do {
1668 l = l->next;
1669 if (l == &cgrp->css_sets) {
1670 it->cg_link = NULL;
1671 return;
1672 }
1673 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1674 cg = link->cg;
1675 } while (list_empty(&cg->tasks));
1676 it->cg_link = l;
1677 it->task = cg->tasks.next;
1678}
1679
1680void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
1681{
1682 /*
1683 * The first time anyone tries to iterate across a cgroup,
1684 * we need to enable the list linking each css_set to its
1685 * tasks, and fix up all existing tasks.
1686 */
1687 if (!use_task_css_set_links) {
1688 struct task_struct *p, *g;
1689 write_lock(&css_set_lock);
1690 use_task_css_set_links = 1;
1691 do_each_thread(g, p) {
1692 task_lock(p);
1693 if (list_empty(&p->cg_list))
1694 list_add(&p->cg_list, &p->cgroups->tasks);
1695 task_unlock(p);
1696 } while_each_thread(g, p);
1697 write_unlock(&css_set_lock);
1698 }
1699 read_lock(&css_set_lock);
1700 it->cg_link = &cgrp->css_sets;
1701 cgroup_advance_iter(cgrp, it);
1702}
1703
1704struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1705 struct cgroup_iter *it)
1706{
1707 struct task_struct *res;
1708 struct list_head *l = it->task;
1709
1710 /* If the iterator's cg_link is NULL, we have no tasks */
1711 if (!it->cg_link)
1712 return NULL;
1713 res = list_entry(l, struct task_struct, cg_list);
1714 /* Advance iterator to find next entry */
1715 l = l->next;
1716 if (l == &res->cgroups->tasks) {
1717 /* We reached the end of this task list - move on to
1718 * the next cg_cgroup_link */
1719 cgroup_advance_iter(cgrp, it);
1720 } else {
1721 it->task = l;
1722 }
1723 return res;
1724}
1725
1726void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
1727{
1728 read_unlock(&css_set_lock);
1729}
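/*
 * Sketch of the intended calling pattern for the iterator API above (compare
 * pid_array_load() and cgroupstats_build() below). Since cgroup_iter_start()
 * leaves css_set_lock read-held until cgroup_iter_end(), the loop body must
 * not sleep. The function name is hypothetical.
 */
static void example_walk_tasks(struct cgroup *cgrp)
{
	struct cgroup_iter it;
	struct task_struct *tsk;

	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it))) {
		/* examine tsk here without blocking */
	}
	cgroup_iter_end(cgrp, &it);
}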
1730
1731/*
1732 * Stuff for reading the 'tasks' file.
1733 *
1734 * Reading this file can return large amounts of data if a cgroup has
1735 * *lots* of attached tasks. So it may need several calls to read(),
1736 * but we cannot guarantee that the information we produce is correct
1737 * unless we produce it entirely atomically.
1738 *
1739 * Upon tasks file open(), a struct ctr_struct is allocated, which
1740 * holds a pointer to an array (also allocated here). The struct
1741 * ctr_struct * is stored in file->private_data. Its resources will
1742 * be freed by release() when the file is closed. The array is used
1743 * to sprintf the PIDs and then used by read().
1744 */
1745struct ctr_struct {
1746 char *buf;
1747 int bufsz;
1748};
1749
1750/*
1751 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
1752 * 'cgrp'. Return actual number of pids loaded. No need to
1753 * task_lock(p) when reading out p->cgroup, since we're in an RCU
1754 * read section, so the css_set can't go away, and is
1755 * immutable after creation.
1756 */
1757static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
1758{
1759 int n = 0;
1760 struct cgroup_iter it;
1761 struct task_struct *tsk;
1762 cgroup_iter_start(cgrp, &it);
1763 while ((tsk = cgroup_iter_next(cgrp, &it))) {
1764 if (unlikely(n == npids))
1765 break;
1766 pidarray[n++] = task_pid_nr(tsk);
1767 }
1768 cgroup_iter_end(cgrp, &it);
1769 return n;
1770}
1771
1772/**
1773 * cgroupstats_build - build and fill cgroupstats so that taskstats can
1774 * export it to userspace.
1775 *
1776 * @stats: cgroupstats to fill information into
1777 * @dentry: a dentry entry belonging to the cgroup for which stats have
1778 * been requested.
1779 */
1780int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
1781{
1782 int ret = -EINVAL;
1783 struct cgroup *cgrp;
1784 struct cgroup_iter it;
1785 struct task_struct *tsk;
1786 /*
1787 * Validate dentry by checking the superblock operations
1788 */
1789 if (dentry->d_sb->s_op != &cgroup_ops)
1790 goto err;
1791
1792 ret = 0;
1793 cgrp = dentry->d_fsdata;
1794 rcu_read_lock();
1795
1796 cgroup_iter_start(cgrp, &it);
1797 while ((tsk = cgroup_iter_next(cgrp, &it))) {
1798 switch (tsk->state) {
1799 case TASK_RUNNING:
1800 stats->nr_running++;
1801 break;
1802 case TASK_INTERRUPTIBLE:
1803 stats->nr_sleeping++;
1804 break;
1805 case TASK_UNINTERRUPTIBLE:
1806 stats->nr_uninterruptible++;
1807 break;
1808 case TASK_STOPPED:
1809 stats->nr_stopped++;
1810 break;
1811 default:
1812 if (delayacct_is_task_waiting_on_io(tsk))
1813 stats->nr_io_wait++;
1814 break;
1815 }
1816 }
1817 cgroup_iter_end(cgrp, &it);
1818
1819 rcu_read_unlock();
1820err:
1821 return ret;
1822}
1823
1824static int cmppid(const void *a, const void *b)
1825{
1826 return *(pid_t *)a - *(pid_t *)b;
1827}
1828
1829/*
1830 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
1831 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
1832 * count 'cnt' of how many chars would be written if buf were large enough.
1833 */
1834static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1835{
1836 int cnt = 0;
1837 int i;
1838
1839 for (i = 0; i < npids; i++)
1840 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
1841 return cnt;
1842}
1843
1844/*
1845 * Handle an open on 'tasks' file. Prepare a buffer listing the
1846 * process IDs of tasks currently attached to the cgroup being opened.
1847 *
1848 * Does not require any specific cgroup mutexes, and does not take any.
1849 */
1850static int cgroup_tasks_open(struct inode *unused, struct file *file)
1851{
1852 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1853 struct ctr_struct *ctr;
1854 pid_t *pidarray;
1855 int npids;
1856 char c;
1857
1858 if (!(file->f_mode & FMODE_READ))
1859 return 0;
1860
1861 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
1862 if (!ctr)
1863 goto err0;
1864
1865 /*
1866 * If cgroup gets more users after we read count, we won't have
1867 * enough space - tough. This race is indistinguishable to the
1868 * caller from the case that the additional cgroup users didn't
1869 * show up until sometime later on.
1870 */
1871 npids = cgroup_task_count(cgrp);
1872 if (npids) {
1873 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
1874 if (!pidarray)
1875 goto err1;
1876
1877 npids = pid_array_load(pidarray, npids, cgrp);
1878 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
1879
1880 /* Call pid_array_to_buf() twice, first just to get bufsz */
1881 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
1882 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
1883 if (!ctr->buf)
1884 goto err2;
1885 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
1886
1887 kfree(pidarray);
1888 } else {
1889 ctr->buf = NULL;
1890 ctr->bufsz = 0;
1891 }
1892 file->private_data = ctr;
1893 return 0;
1894
1895err2:
1896 kfree(pidarray);
1897err1:
1898 kfree(ctr);
1899err0:
1900 return -ENOMEM;
1901}
1902
1903static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
1904 struct cftype *cft,
1905 struct file *file, char __user *buf,
1906 size_t nbytes, loff_t *ppos)
1907{
1908 struct ctr_struct *ctr = file->private_data;
1909
1910 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1911}
1912
1913static int cgroup_tasks_release(struct inode *unused_inode,
1914 struct file *file)
1915{
1916 struct ctr_struct *ctr;
1917
1918 if (file->f_mode & FMODE_READ) {
1919 ctr = file->private_data;
1920 kfree(ctr->buf);
1921 kfree(ctr);
1922 }
1923 return 0;
1924}
1925
1926static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
1927 struct cftype *cft)
1928{
1929 return notify_on_release(cgrp);
1930}
1931
1932static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
1933{
1934 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
1935}
1936
1937/*
1938 * for the common functions, 'private' gives the type of file
1939 */
1940static struct cftype files[] = {
1941 {
1942 .name = "tasks",
1943 .open = cgroup_tasks_open,
1944 .read = cgroup_tasks_read,
1945 .write = cgroup_common_file_write,
1946 .release = cgroup_tasks_release,
1947 .private = FILE_TASKLIST,
1948 },
1949
1950 {
1951 .name = "notify_on_release",
1952 .read_uint = cgroup_read_notify_on_release,
1953 .write = cgroup_common_file_write,
1954 .private = FILE_NOTIFY_ON_RELEASE,
1955 },
1956
1957 {
1958 .name = "releasable",
1959 .read_uint = cgroup_read_releasable,
1960 .private = FILE_RELEASABLE,
1961 }
1962};
1963
1964static struct cftype cft_release_agent = {
1965 .name = "release_agent",
1966 .read = cgroup_common_file_read,
1967 .write = cgroup_common_file_write,
1968 .private = FILE_RELEASE_AGENT,
1969};
1970
1971static int cgroup_populate_dir(struct cgroup *cgrp)
1972{
1973 int err;
1974 struct cgroup_subsys *ss;
1975
1976 /* First clear out any existing files */
1977 cgroup_clear_directory(cgrp->dentry);
1978
1979 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
1980 if (err < 0)
1981 return err;
1982
1983 if (cgrp == cgrp->top_cgroup) {
1984 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
1985 return err;
1986 }
1987
1988 for_each_subsys(cgrp->root, ss) {
1989 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
1990 return err;
1991 }
1992
1993 return 0;
1994}
1995
1996static void init_cgroup_css(struct cgroup_subsys_state *css,
1997 struct cgroup_subsys *ss,
1998 struct cgroup *cgrp)
1999{
2000 css->cgroup = cgrp;
2001 atomic_set(&css->refcnt, 0);
2002 css->flags = 0;
2003 if (cgrp == dummytop)
2004 set_bit(CSS_ROOT, &css->flags);
2005 BUG_ON(cgrp->subsys[ss->subsys_id]);
2006 cgrp->subsys[ss->subsys_id] = css;
2007}
2008
2009/*
2010 * cgroup_create - create a cgroup
2011 * parent: cgroup that will be parent of the new cgroup.
2012 * dentry: dentry of the new cgroup's directory.
2013 * mode: mode to set on new inode
2014 *
2015 * Must be called with the mutex on the parent inode held
2016 */
2017
2018static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2019 int mode)
2020{
2021 struct cgroup *cgrp;
2022 struct cgroupfs_root *root = parent->root;
2023 int err = 0;
2024 struct cgroup_subsys *ss;
2025 struct super_block *sb = root->sb;
2026
2027 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
2028 if (!cgrp)
2029 return -ENOMEM;
2030
2031 /* Grab a reference on the superblock so the hierarchy doesn't
2032 * get deleted on unmount if there are child cgroups. This
2033 * can be done outside cgroup_mutex, since the sb can't
2034 * disappear while someone has an open control file on the
2035 * fs */
2036 atomic_inc(&sb->s_active);
2037
2038 mutex_lock(&cgroup_mutex);
2039
2040 cgrp->flags = 0;
2041 INIT_LIST_HEAD(&cgrp->sibling);
2042 INIT_LIST_HEAD(&cgrp->children);
2043 INIT_LIST_HEAD(&cgrp->css_sets);
2044 INIT_LIST_HEAD(&cgrp->release_list);
2045
2046 cgrp->parent = parent;
2047 cgrp->root = parent->root;
2048 cgrp->top_cgroup = parent->top_cgroup;
2049
2050 for_each_subsys(root, ss) {
2051 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2052 if (IS_ERR(css)) {
2053 err = PTR_ERR(css);
2054 goto err_destroy;
2055 }
2056 init_cgroup_css(css, ss, cgrp);
2057 }
2058
2059 list_add(&cgrp->sibling, &cgrp->parent->children);
2060 root->number_of_cgroups++;
2061
2062 err = cgroup_create_dir(cgrp, dentry, mode);
2063 if (err < 0)
2064 goto err_remove;
2065
2066 /* The cgroup directory was pre-locked for us */
2067 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
2068
2069 err = cgroup_populate_dir(cgrp);
2070 /* If err < 0, we have a half-filled directory - oh well ;) */
2071
2072 mutex_unlock(&cgroup_mutex);
2073 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
2074
2075 return 0;
2076
2077 err_remove:
2078
2079 list_del(&cgrp->sibling);
2080 root->number_of_cgroups--;
2081
2082 err_destroy:
2083
2084 for_each_subsys(root, ss) {
2085 if (cgrp->subsys[ss->subsys_id])
2086 ss->destroy(ss, cgrp);
2087 }
2088
2089 mutex_unlock(&cgroup_mutex);
2090
2091 /* Release the reference count that we took on the superblock */
2092 deactivate_super(sb);
2093
2094 kfree(cgrp);
2095 return err;
2096}
2097
2098static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2099{
2100 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
2101
2102 /* the vfs holds inode->i_mutex already */
2103 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
2104}
2105
2106static inline int cgroup_has_css_refs(struct cgroup *cgrp)
2107{
2108 /* Check the reference count on each subsystem. Since we
2109 * already established that there are no tasks in the
2110 * cgroup, if the css refcount is also 0, then there should
2111 * be no outstanding references, so the subsystem is safe to
2112 * destroy. We scan across all subsystems rather than using
2113 * the per-hierarchy linked list of mounted subsystems since
2114 * we can be called via check_for_release() with no
2115 * synchronization other than RCU, and the subsystem linked
2116 * list isn't RCU-safe */
2117 int i;
2118 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2119 struct cgroup_subsys *ss = subsys[i];
2120 struct cgroup_subsys_state *css;
2121 /* Skip subsystems not in this hierarchy */
2122 if (ss->root != cgrp->root)
2123 continue;
2124 css = cgrp->subsys[ss->subsys_id];
2125 /* When called from check_for_release() it's possible
2126 * that by this point the cgroup has been removed
2127 * and the css deleted. But a false-positive doesn't
2128 * matter, since it can only happen if the cgroup
2129 * has been deleted and hence no longer needs the
2130 * release agent to be called anyway. */
2131 if (css && atomic_read(&css->refcnt)) {
2132 return 1;
2133 }
2134 }
2135 return 0;
2136}
2137
2138static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2139{
2140 struct cgroup *cgrp = dentry->d_fsdata;
2141 struct dentry *d;
2142 struct cgroup *parent;
2143 struct cgroup_subsys *ss;
2144 struct super_block *sb;
2145 struct cgroupfs_root *root;
2146
2147 /* the vfs already holds both the parent's and the target's inode->i_mutex */
2148
2149 mutex_lock(&cgroup_mutex);
2150 if (atomic_read(&cgrp->count) != 0) {
2151 mutex_unlock(&cgroup_mutex);
2152 return -EBUSY;
2153 }
2154 if (!list_empty(&cgrp->children)) {
2155 mutex_unlock(&cgroup_mutex);
2156 return -EBUSY;
2157 }
2158
2159 parent = cgrp->parent;
2160 root = cgrp->root;
2161 sb = root->sb;
2162
2163 if (cgroup_has_css_refs(cgrp)) {
2164 mutex_unlock(&cgroup_mutex);
2165 return -EBUSY;
2166 }
2167
2168 for_each_subsys(root, ss) {
2169 if (cgrp->subsys[ss->subsys_id])
2170 ss->destroy(ss, cgrp);
2171 }
2172
2173 spin_lock(&release_list_lock);
2174 set_bit(CGRP_REMOVED, &cgrp->flags);
2175 if (!list_empty(&cgrp->release_list))
2176 list_del(&cgrp->release_list);
2177 spin_unlock(&release_list_lock);
2178 /* delete my sibling from parent->children */
2179 list_del(&cgrp->sibling);
2180 spin_lock(&cgrp->dentry->d_lock);
2181 d = dget(cgrp->dentry);
2182 cgrp->dentry = NULL;
2183 spin_unlock(&d->d_lock);
2184
2185 cgroup_d_remove_dir(d);
2186 dput(d);
2187 root->number_of_cgroups--;
2188
2189 set_bit(CGRP_RELEASABLE, &parent->flags);
2190 check_for_release(parent);
2191
2192 mutex_unlock(&cgroup_mutex);
2193 /* Drop the active superblock reference that we took when we
2194 * created the cgroup */
2195 deactivate_super(sb);
2196 return 0;
2197}
2198
2199static void cgroup_init_subsys(struct cgroup_subsys *ss)
2200{
2201 struct cgroup_subsys_state *css;
2202 struct list_head *l;
2203 printk(KERN_ERR "Initializing cgroup subsys %s\n", ss->name);
2204
2205 /* Create the top cgroup state for this subsystem */
2206 ss->root = &rootnode;
2207 css = ss->create(ss, dummytop);
2208 /* We don't handle early failures gracefully */
2209 BUG_ON(IS_ERR(css));
2210 init_cgroup_css(css, ss, dummytop);
2211
2212 /* Update all css_sets to contain a subsys
2213 * pointer to this state - since the subsystem is
2214 * newly registered, all tasks and hence all
2215 * css_sets are in the subsystem's top cgroup. */
2216 write_lock(&css_set_lock);
2217 l = &init_css_set.list;
2218 do {
2219 struct css_set *cg =
2220 list_entry(l, struct css_set, list);
2221 cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2222 l = l->next;
2223 } while (l != &init_css_set.list);
2224 write_unlock(&css_set_lock);
2225
2226 /* If this subsystem requested that it be notified with fork
2227 * events, we should send it one now for every process in the
2228 * system */
2229 if (ss->fork) {
2230 struct task_struct *g, *p;
2231
2232 read_lock(&tasklist_lock);
2233 do_each_thread(g, p) {
2234 ss->fork(ss, p);
2235 } while_each_thread(g, p);
2236 read_unlock(&tasklist_lock);
2237 }
2238
2239 need_forkexit_callback |= ss->fork || ss->exit;
2240
2241 ss->active = 1;
2242}
2243
2244/**
2245 * cgroup_init_early - initialize cgroups at system boot, and
2246 * initialize any subsystems that request early init.
2247 */
2248int __init cgroup_init_early(void)
2249{
2250 int i;
2251 kref_init(&init_css_set.ref);
2252 kref_get(&init_css_set.ref);
2253 INIT_LIST_HEAD(&init_css_set.list);
2254 INIT_LIST_HEAD(&init_css_set.cg_links);
2255 INIT_LIST_HEAD(&init_css_set.tasks);
2256 css_set_count = 1;
2257 init_cgroup_root(&rootnode);
2258 list_add(&rootnode.root_list, &roots);
2259 root_count = 1;
2260 init_task.cgroups = &init_css_set;
2261
2262 init_css_set_link.cg = &init_css_set;
2263 list_add(&init_css_set_link.cgrp_link_list,
2264 &rootnode.top_cgroup.css_sets);
2265 list_add(&init_css_set_link.cg_link_list,
2266 &init_css_set.cg_links);
2267
2268 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2269 struct cgroup_subsys *ss = subsys[i];
2270
2271 BUG_ON(!ss->name);
2272 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
2273 BUG_ON(!ss->create);
2274 BUG_ON(!ss->destroy);
2275 if (ss->subsys_id != i) {
2276 printk(KERN_ERR "Subsys %s id == %d\n",
2277 ss->name, ss->subsys_id);
2278 BUG();
2279 }
2280
2281 if (ss->early_init)
2282 cgroup_init_subsys(ss);
2283 }
2284 return 0;
2285}
2286
2287/**
2288 * cgroup_init - register cgroup filesystem and /proc file, and
2289 * initialize any subsystems that didn't request early init.
2290 */
2291int __init cgroup_init(void)
2292{
2293 int err;
2294 int i;
2295 struct proc_dir_entry *entry;
2296
2297 err = bdi_init(&cgroup_backing_dev_info);
2298 if (err)
2299 return err;
2300
2301 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2302 struct cgroup_subsys *ss = subsys[i];
2303 if (!ss->early_init)
2304 cgroup_init_subsys(ss);
2305 }
2306
2307 err = register_filesystem(&cgroup_fs_type);
2308 if (err < 0)
2309 goto out;
2310
2311 entry = create_proc_entry("cgroups", 0, NULL);
2312 if (entry)
2313 entry->proc_fops = &proc_cgroupstats_operations;
2314
2315out:
2316 if (err)
2317 bdi_destroy(&cgroup_backing_dev_info);
2318
2319 return err;
2320}
2321
2322/*
2323 * proc_cgroup_show()
2324 * - Print task's cgroup paths into seq_file, one line for each hierarchy
2325 * - Used for /proc/<pid>/cgroup.
2326 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
2327 * doesn't really matter if tsk->cgroup changes after we read it,
2328 * and we take cgroup_mutex, keeping attach_task() from changing it
2329 * anyway. No need to check that tsk->cgroup != NULL, thanks to
2330 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
2331 * cgroup to top_cgroup.
2332 */
2333
2334/* TODO: Use a proper seq_file iterator */
2335static int proc_cgroup_show(struct seq_file *m, void *v)
2336{
2337 struct pid *pid;
2338 struct task_struct *tsk;
2339 char *buf;
2340 int retval;
2341 struct cgroupfs_root *root;
2342
2343 retval = -ENOMEM;
2344 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2345 if (!buf)
2346 goto out;
2347
2348 retval = -ESRCH;
2349 pid = m->private;
2350 tsk = get_pid_task(pid, PIDTYPE_PID);
2351 if (!tsk)
2352 goto out_free;
2353
2354 retval = 0;
2355
2356 mutex_lock(&cgroup_mutex);
2357
2358 for_each_root(root) {
2359 struct cgroup_subsys *ss;
2360 struct cgroup *cgrp;
2361 int subsys_id;
2362 int count = 0;
2363
2364 /* Skip this hierarchy if it has no active subsystems */
2365 if (!root->actual_subsys_bits)
2366 continue;
2367 for_each_subsys(root, ss)
2368 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
2369 seq_putc(m, ':');
2370 get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
2371 cgrp = task_cgroup(tsk, subsys_id);
2372 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
2373 if (retval < 0)
2374 goto out_unlock;
2375 seq_puts(m, buf);
2376 seq_putc(m, '\n');
2377 }
2378
2379out_unlock:
2380 mutex_unlock(&cgroup_mutex);
2381 put_task_struct(tsk);
2382out_free:
2383 kfree(buf);
2384out:
2385 return retval;
2386}
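/*
 * For illustration: the function above emits one line per active hierarchy
 * into /proc/<pid>/cgroup, of the form "<comma-separated subsystems>:<path>".
 * For a task in the /foo cgroup of a hierarchy with the cpuset and debug
 * subsystems attached (a hypothetical setup), the output would look like:
 *
 *	cpuset,debug:/foo
 */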
2387
2388static int cgroup_open(struct inode *inode, struct file *file)
2389{
2390 struct pid *pid = PROC_I(inode)->pid;
2391 return single_open(file, proc_cgroup_show, pid);
2392}
2393
2394struct file_operations proc_cgroup_operations = {
2395 .open = cgroup_open,
2396 .read = seq_read,
2397 .llseek = seq_lseek,
2398 .release = single_release,
2399};
2400
2401/* Display information about each subsystem and each hierarchy */
2402static int proc_cgroupstats_show(struct seq_file *m, void *v)
2403{
2404 int i;
2405 struct cgroupfs_root *root;
2406
2407 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n");
2408 mutex_lock(&cgroup_mutex);
2409 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2410 struct cgroup_subsys *ss = subsys[i];
2411 seq_printf(m, "%s\t%lu\t%d\n",
2412 ss->name, ss->root->subsys_bits,
2413 ss->root->number_of_cgroups);
2414 }
2415 mutex_unlock(&cgroup_mutex);
2416 return 0;
2417}
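/*
 * For illustration, /proc/cgroups as rendered by the function above; the
 * values are hypothetical. Note that in this version the "hierarchy" column
 * prints the root's subsys_bits bitmask rather than a hierarchy id.
 *
 *	#subsys_name	hierarchy	num_cgroups
 *	cpuset		1		4
 *	debug		0		1
 */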
2418
2419static int cgroupstats_open(struct inode *inode, struct file *file)
2420{
2421 return single_open(file, proc_cgroupstats_show, NULL);
2422}
2423
2424static struct file_operations proc_cgroupstats_operations = {
2425 .open = cgroupstats_open,
2426 .read = seq_read,
2427 .llseek = seq_lseek,
2428 .release = single_release,
2429};
2430
2431/**
2432 * cgroup_fork - attach a newly forked task to its parent's cgroup.
2433 * @child: pointer to task_struct of the newly forked child task.
2434 *
2435 * Description: A task inherits its parent's cgroup at fork().
2436 *
2437 * A pointer to the shared css_set was automatically copied in
2438 * fork.c by dup_task_struct(). However, we ignore that copy, since
2439 * it was not made under the protection of RCU or cgroup_mutex, so
2440 * might no longer be a valid cgroup pointer. attach_task() might
2441 * have already changed current->cgroups, allowing the previously
2442 * referenced css_set to be removed and freed.
2443 *
2444 * At the point that cgroup_fork() is called, 'current' is the parent
2445 * task, and the passed argument 'child' points to the child task.
2446 */
2447void cgroup_fork(struct task_struct *child)
2448{
2449 task_lock(current);
2450 child->cgroups = current->cgroups;
2451 get_css_set(child->cgroups);
2452 task_unlock(current);
2453 INIT_LIST_HEAD(&child->cg_list);
2454}
2455
2456/**
2457 * cgroup_fork_callbacks - called on a new task very soon before
2458 * adding it to the tasklist. No need to take any locks since no-one
2459 * can be operating on this task
2460 */
2461void cgroup_fork_callbacks(struct task_struct *child)
2462{
2463 if (need_forkexit_callback) {
2464 int i;
2465 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2466 struct cgroup_subsys *ss = subsys[i];
2467 if (ss->fork)
2468 ss->fork(ss, child);
2469 }
2470 }
2471}
2472
2473/**
2474 * cgroup_post_fork - called on a new task after adding it to the
2475 * task list. Adds the task to the list running through its css_set
2476 * if necessary. Has to be after the task is visible on the task list
2477 * in case we race with the first call to cgroup_iter_start() - to
2478 * guarantee that the new task ends up on its list. */
2479void cgroup_post_fork(struct task_struct *child)
2480{
2481 if (use_task_css_set_links) {
2482 write_lock(&css_set_lock);
2483 if (list_empty(&child->cg_list))
2484 list_add(&child->cg_list, &child->cgroups->tasks);
2485 write_unlock(&css_set_lock);
2486 }
2487}
2488/**
2489 * cgroup_exit - detach cgroup from exiting task
2490 * @tsk: pointer to task_struct of exiting process
2491 *
2492 * Description: Detach cgroup from @tsk and release it.
2493 *
2494 * Note that cgroups marked notify_on_release force every task in
2495 * them to take the global cgroup_mutex when exiting.
2496 * This could impact scaling on very large systems. Be reluctant to
2497 * use notify_on_release cgroups where very high task exit scaling
2498 * is required on large systems.
2499 *
2500 * the_top_cgroup_hack:
2501 *
2502 * Set the exiting task's cgroup to the root cgroup (top_cgroup).
2503 *
2504 * We call cgroup_exit() while the task is still competent to
2505 * handle notify_on_release(), then leave the task attached to the
2506 * root cgroup in each hierarchy for the remainder of its exit.
2507 *
2508 * To do this properly, we would increment the reference count on
2509 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
2510 * code we would add a second cgroup function call, to drop that
2511 * reference. This would just create an unnecessary hot spot on
2512 * the top_cgroup reference count, to no avail.
2513 *
2514 * Normally, holding a reference to a cgroup without bumping its
2515 * count is unsafe. The cgroup could go away, or someone could
2516 * attach us to a different cgroup, decrementing the count on
2517 * the first cgroup that we never incremented. But in this case,
2518 * top_cgroup isn't going away, and either the task has PF_EXITING set,
2519 * which wards off any attach_task() attempts, or the task is a failed
2520 * fork, never visible to attach_task().
2521 *
2522 */
2523void cgroup_exit(struct task_struct *tsk, int run_callbacks)
2524{
2525 int i;
2526 struct css_set *cg;
2527
2528 if (run_callbacks && need_forkexit_callback) {
2529 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2530 struct cgroup_subsys *ss = subsys[i];
2531 if (ss->exit)
2532 ss->exit(ss, tsk);
2533 }
2534 }
2535
2536 /*
2537 * Unlink from the css_set task list if necessary.
2538 * Optimistically check cg_list before taking
2539 * css_set_lock
2540 */
2541 if (!list_empty(&tsk->cg_list)) {
2542 write_lock(&css_set_lock);
2543 if (!list_empty(&tsk->cg_list))
2544 list_del(&tsk->cg_list);
2545 write_unlock(&css_set_lock);
2546 }
2547
2548 /* Reassign the task to the init_css_set. */
2549 task_lock(tsk);
2550 cg = tsk->cgroups;
2551 tsk->cgroups = &init_css_set;
2552 task_unlock(tsk);
2553 if (cg)
2554 put_css_set_taskexit(cg);
2555}
2556
2557/**
2558 * cgroup_clone - duplicate the current cgroup in the hierarchy
2559 * that the given subsystem is attached to, and move this task into
2560 * the new child
2561 */
2562int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
2563{
2564 struct dentry *dentry;
2565 int ret = 0;
2566 char nodename[MAX_CGROUP_TYPE_NAMELEN];
2567 struct cgroup *parent, *child;
2568 struct inode *inode;
2569 struct css_set *cg;
2570 struct cgroupfs_root *root;
2571 struct cgroup_subsys *ss;
2572
2573 /* We shouldn't be called by an unregistered subsystem */
2574 BUG_ON(!subsys->active);
2575
2576 /* First figure out what hierarchy and cgroup we're dealing
2577 * with, and pin them so we can drop cgroup_mutex */
2578 mutex_lock(&cgroup_mutex);
2579 again:
2580 root = subsys->root;
2581 if (root == &rootnode) {
2582 printk(KERN_INFO
2583 "Not cloning cgroup for unused subsystem %s\n",
2584 subsys->name);
2585 mutex_unlock(&cgroup_mutex);
2586 return 0;
2587 }
2588 cg = tsk->cgroups;
2589 parent = task_cgroup(tsk, subsys->subsys_id);
2590
2591 snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
2592
2593 /* Pin the hierarchy */
2594 atomic_inc(&parent->root->sb->s_active);
2595
2596 /* Keep the cgroup alive */
2597 get_css_set(cg);
2598 mutex_unlock(&cgroup_mutex);
2599
2600 /* Now do the VFS work to create a cgroup */
2601 inode = parent->dentry->d_inode;
2602
2603 /* Hold the parent directory mutex across this operation to
2604 * stop anyone else deleting the new cgroup */
2605 mutex_lock(&inode->i_mutex);
2606 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
2607 if (IS_ERR(dentry)) {
2608 printk(KERN_INFO
2609 "Couldn't allocate dentry for %s: %ld\n", nodename,
2610 PTR_ERR(dentry));
2611 ret = PTR_ERR(dentry);
2612 goto out_release;
2613 }
2614
2615 /* Create the cgroup directory, which also creates the cgroup */
2616 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
2617 child = __d_cgrp(dentry);
2618 dput(dentry);
2619 if (ret) {
2620 printk(KERN_INFO
2621 "Failed to create cgroup %s: %d\n", nodename,
2622 ret);
2623 goto out_release;
2624 }
2625
2626 if (!child) {
2627 printk(KERN_INFO
2628 "Couldn't find new cgroup %s\n", nodename);
2629 ret = -ENOMEM;
2630 goto out_release;
2631 }
2632
2633 /* The cgroup now exists. Retake cgroup_mutex and check
2634 * that we're still in the same state that we thought we
2635 * were. */
2636 mutex_lock(&cgroup_mutex);
2637 if ((root != subsys->root) ||
2638 (parent != task_cgroup(tsk, subsys->subsys_id))) {
2639 /* Aargh, we raced ... */
2640 mutex_unlock(&inode->i_mutex);
2641 put_css_set(cg);
2642
2643 deactivate_super(parent->root->sb);
2644 /* The cgroup is still accessible in the VFS, but
2645 * we're not going to try to rmdir() it at this
2646 * point. */
2647 printk(KERN_INFO
2648 "Race in cgroup_clone() - leaking cgroup %s\n",
2649 nodename);
2650 goto again;
2651 }
2652
2653 /* do any required auto-setup */
2654 for_each_subsys(root, ss) {
2655 if (ss->post_clone)
2656 ss->post_clone(ss, child);
2657 }
2658
2659 /* All seems fine. Finish by moving the task into the new cgroup */
2660 ret = attach_task(child, tsk);
2661 mutex_unlock(&cgroup_mutex);
2662
2663 out_release:
2664 mutex_unlock(&inode->i_mutex);
2665
2666 mutex_lock(&cgroup_mutex);
2667 put_css_set(cg);
2668 mutex_unlock(&cgroup_mutex);
2669 deactivate_super(parent->root->sb);
2670 return ret;
2671}
2672
2673/*
2674 * See if "cgrp" is a descendant of the current task's cgroup in
2675 * the appropriate hierarchy
2676 *
2677 * If we are sending in dummytop, then presumably we are creating
2678 * the top cgroup in the subsystem.
2679 *
2680 * Called only by the ns (nsproxy) cgroup.
2681 */
2682int cgroup_is_descendant(const struct cgroup *cgrp)
2683{
2684 int ret;
2685 struct cgroup *target;
2686 int subsys_id;
2687
2688 if (cgrp == dummytop)
2689 return 1;
2690
2691 get_first_subsys(cgrp, NULL, &subsys_id);
2692 target = task_cgroup(current, subsys_id);
2693 while (cgrp != target && cgrp != cgrp->top_cgroup)
2694 cgrp = cgrp->parent;
2695 ret = (cgrp == target);
2696 return ret;
2697}
2698
2699static void check_for_release(struct cgroup *cgrp)
2700{
2701 /* All of these checks rely on RCU to keep the cgroup
2702 * structure alive */
2703 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
2704 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
2705 /* Control Group is currently removable. If it's not
2706 * already queued for a userspace notification, queue
2707 * it now */
2708 int need_schedule_work = 0;
2709 spin_lock(&release_list_lock);
2710 if (!cgroup_is_removed(cgrp) &&
2711 list_empty(&cgrp->release_list)) {
2712 list_add(&cgrp->release_list, &release_list);
2713 need_schedule_work = 1;
2714 }
2715 spin_unlock(&release_list_lock);
2716 if (need_schedule_work)
2717 schedule_work(&release_agent_work);
2718 }
2719}
2720
2721void __css_put(struct cgroup_subsys_state *css)
2722{
2723 struct cgroup *cgrp = css->cgroup;
2724 rcu_read_lock();
2725 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
2726 set_bit(CGRP_RELEASABLE, &cgrp->flags);
2727 check_for_release(cgrp);
2728 }
2729 rcu_read_unlock();
2730}
2731
2732/*
2733 * Notify userspace when a cgroup is released, by running the
2734 * configured release agent with the name of the cgroup (path
2735 * relative to the root of cgroup file system) as the argument.
2736 *
2737 * Most likely, this user command will try to rmdir this cgroup.
2738 *
2739 * This races with the possibility that some other task will be
2740 * attached to this cgroup before it is removed, or that some other
2741 * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
2742 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
2743 * unused, and this cgroup will be reprieved from its death sentence,
2744 * to continue to serve a useful existence. Next time it's released,
2745 * we will get notified again, if it still has 'notify_on_release' set.
2746 *
2747 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
2748 * means only wait until the task is successfully execve()'d. The
2749 * separate release agent task is forked by call_usermodehelper(),
2750 * then control in this thread returns here, without waiting for the
2751 * release agent task. We don't bother to wait because the caller of
2752 * this routine has no use for the exit status of the release agent
2753 * task, so no sense holding our caller up for that.
2754 *
2755 */
2756
2757static void cgroup_release_agent(struct work_struct *work)
2758{
2759 BUG_ON(work != &release_agent_work);
2760 mutex_lock(&cgroup_mutex);
2761 spin_lock(&release_list_lock);
2762 while (!list_empty(&release_list)) {
2763 char *argv[3], *envp[3];
2764 int i;
2765 char *pathbuf;
2766 struct cgroup *cgrp = list_entry(release_list.next,
2767 struct cgroup,
2768 release_list);
2769 list_del_init(&cgrp->release_list);
2770 spin_unlock(&release_list_lock);
2771 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2772 if (!pathbuf) {
2773 spin_lock(&release_list_lock);
2774 continue;
2775 }
2776
2777 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) {
2778 kfree(pathbuf);
2779 spin_lock(&release_list_lock);
2780 continue;
2781 }
2782
2783 i = 0;
2784 argv[i++] = cgrp->root->release_agent_path;
2785 argv[i++] = (char *)pathbuf;
2786 argv[i] = NULL;
2787
2788 i = 0;
2789 /* minimal command environment */
2790 envp[i++] = "HOME=/";
2791 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
2792 envp[i] = NULL;
2793
2794 /* Drop the lock while we invoke the usermode helper,
2795 * since the exec could involve hitting disk and hence
2796 * be a slow process */
2797 mutex_unlock(&cgroup_mutex);
2798 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
2799 kfree(pathbuf);
2800 mutex_lock(&cgroup_mutex);
2801 spin_lock(&release_list_lock);
2802 }
2803 spin_unlock(&release_list_lock);
2804 mutex_unlock(&cgroup_mutex);
2805}
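/*
 * Hedged sketch, not part of this patch: a minimal userspace release agent
 * matching the calling convention described above. It receives the released
 * cgroup's path, relative to the cgroup mount root, as argv[1] and simply
 * tries to rmdir() it. The "/cgroup" mount point is an assumption made for
 * illustration only.
 */
#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	char path[4096];

	if (argc < 2)
		return 1;
	snprintf(path, sizeof(path), "/cgroup/%s", argv[1]);
	if (rmdir(path) < 0) {
		perror(path);	/* the cgroup may have been reused; that's ok */
		return 1;
	}
	return 0;
}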
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
new file mode 100644
index 000000000000..37301e877cb0
--- /dev/null
+++ b/kernel/cgroup_debug.c
@@ -0,0 +1,97 @@
1/*
2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info
4 *
5 * Copyright (C) Google Inc, 2007
6 *
7 * Developed by Paul Menage (menage@google.com)
8 *
9 */
10
11#include <linux/cgroup.h>
12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/rcupdate.h>
15
16#include <asm/atomic.h>
17
18static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
19 struct cgroup *cont)
20{
21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
22
23 if (!css)
24 return ERR_PTR(-ENOMEM);
25
26 return css;
27}
28
29static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
30{
31 kfree(cont->subsys[debug_subsys_id]);
32}
33
34static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
35{
36 return atomic_read(&cont->count);
37}
38
39static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{
41 u64 count;
42
43 cgroup_lock();
44 count = cgroup_task_count(cont);
45 cgroup_unlock();
46 return count;
47}
48
49static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
50{
51 return (u64)(long)current->cgroups;
52}
53
54static u64 current_css_set_refcount_read(struct cgroup *cont,
55 struct cftype *cft)
56{
57 u64 count;
58
59 rcu_read_lock();
60 count = atomic_read(&current->cgroups->ref.refcount);
61 rcu_read_unlock();
62 return count;
63}
64
65static struct cftype files[] = {
66 {
67 .name = "cgroup_refcount",
68 .read_uint = cgroup_refcount_read,
69 },
70 {
71 .name = "taskcount",
72 .read_uint = taskcount_read,
73 },
74
75 {
76 .name = "current_css_set",
77 .read_uint = current_css_set_read,
78 },
79
80 {
81 .name = "current_css_set_refcount",
82 .read_uint = current_css_set_refcount_read,
83 },
84};
85
86static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
87{
88 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
89}
90
91struct cgroup_subsys debug_subsys = {
92 .name = "debug",
93 .create = debug_create,
94 .destroy = debug_destroy,
95 .populate = debug_populate,
96 .subsys_id = debug_subsys_id,
97};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a21f71af9d81..6b3a0c15144f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -98,7 +98,8 @@ static inline void check_for_tasks(int cpu)
98 !cputime_eq(p->stime, cputime_zero))) 98 !cputime_eq(p->stime, cputime_zero)))
99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
100 (state = %ld, flags = %x) \n", 100 (state = %ld, flags = %x) \n",
101 p->comm, p->pid, cpu, p->state, p->flags); 101 p->comm, task_pid_nr(p), cpu,
102 p->state, p->flags);
102 } 103 }
103 write_unlock_irq(&tasklist_lock); 104 write_unlock_irq(&tasklist_lock);
104} 105}
@@ -264,6 +265,15 @@ out_notify:
264int __cpuinit cpu_up(unsigned int cpu) 265int __cpuinit cpu_up(unsigned int cpu)
265{ 266{
266 int err = 0; 267 int err = 0;
268 if (!cpu_isset(cpu, cpu_possible_map)) {
269 printk(KERN_ERR "can't online cpu %d because it is not "
270 "configured as may-hotadd at boot time\n", cpu);
271#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390)
272 printk(KERN_ERR "please check additional_cpus= boot "
273 "parameter\n");
274#endif
275 return -EINVAL;
276 }
267 277
268 mutex_lock(&cpu_add_remove_lock); 278 mutex_lock(&cpu_add_remove_lock);
269 if (cpu_hotplug_disabled) 279 if (cpu_hotplug_disabled)
diff --git a/kernel/cpu_acct.c b/kernel/cpu_acct.c
new file mode 100644
index 000000000000..731e47e7f164
--- /dev/null
+++ b/kernel/cpu_acct.c
@@ -0,0 +1,186 @@
1/*
2 * kernel/cpu_acct.c - CPU accounting cgroup subsystem
3 *
4 * Copyright (C) Google Inc, 2006
5 *
6 * Developed by Paul Menage (menage@google.com) and Balbir Singh
7 * (balbir@in.ibm.com)
8 *
9 */
10
11/*
12 * Example cgroup subsystem for reporting total CPU usage of tasks in a
13 * cgroup, along with percentage load over a time interval
14 */
15
16#include <linux/module.h>
17#include <linux/cgroup.h>
18#include <linux/fs.h>
19#include <linux/rcupdate.h>
20
21#include <asm/div64.h>
22
23struct cpuacct {
24 struct cgroup_subsys_state css;
25 spinlock_t lock;
26 /* total time used by this class */
27 cputime64_t time;
28
29 /* time when next load calculation occurs */
30 u64 next_interval_check;
31
32 /* time used in current period */
33 cputime64_t current_interval_time;
34
35 /* time used in last period */
36 cputime64_t last_interval_time;
37};
38
39struct cgroup_subsys cpuacct_subsys;
40
41static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
42{
43 return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
44 struct cpuacct, css);
45}
46
47static inline struct cpuacct *task_ca(struct task_struct *task)
48{
49 return container_of(task_subsys_state(task, cpuacct_subsys_id),
50 struct cpuacct, css);
51}
52
53#define INTERVAL (HZ * 10)
54
55static inline u64 next_interval_boundary(u64 now)
56{
57 /* calculate the next interval boundary beyond the
58 * current time */
59 do_div(now, INTERVAL);
60 return (now + 1) * INTERVAL;
61}
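/*
 * Worked example: with HZ == 1000, INTERVAL is 10000 jiffies. For
 * now == 23456, do_div(now, INTERVAL) leaves now == 2 (the quotient), so
 * the function returns (2 + 1) * 10000 == 30000, i.e. the next 10-second
 * boundary beyond the current time.
 */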
62
63static struct cgroup_subsys_state *cpuacct_create(
64 struct cgroup_subsys *ss, struct cgroup *cont)
65{
66 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
67
68 if (!ca)
69 return ERR_PTR(-ENOMEM);
70 spin_lock_init(&ca->lock);
71 ca->next_interval_check = next_interval_boundary(get_jiffies_64());
72 return &ca->css;
73}
74
75static void cpuacct_destroy(struct cgroup_subsys *ss,
76 struct cgroup *cont)
77{
78 kfree(cgroup_ca(cont));
79}
80
81/* Lazily update the load calculation if necessary. Called with ca locked */
82static void cpuusage_update(struct cpuacct *ca)
83{
84 u64 now = get_jiffies_64();
85
86 /* If we're not due for an update, return */
87 if (ca->next_interval_check > now)
88 return;
89
90 if (ca->next_interval_check <= (now - INTERVAL)) {
91 /* If it's been more than an interval since the last
92 * check, then catch up - the last interval must have
93 * been zero load */
94 ca->last_interval_time = 0;
95 ca->next_interval_check = next_interval_boundary(now);
96 } else {
97 /* If a steal takes the last interval time negative,
98 * then we just ignore it */
99 if ((s64)ca->current_interval_time > 0)
100 ca->last_interval_time = ca->current_interval_time;
101 else
102 ca->last_interval_time = 0;
103 ca->next_interval_check += INTERVAL;
104 }
105 ca->current_interval_time = 0;
106}
107
108static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
109{
110 struct cpuacct *ca = cgroup_ca(cont);
111 u64 time;
112
113 spin_lock_irq(&ca->lock);
114 cpuusage_update(ca);
115 time = cputime64_to_jiffies64(ca->time);
116 spin_unlock_irq(&ca->lock);
117
118 /* Convert 64-bit jiffies to milliseconds */
119 time *= 1000;
120 do_div(time, HZ);
121 return time;
122}
123
124static u64 load_read(struct cgroup *cont, struct cftype *cft)
125{
126 struct cpuacct *ca = cgroup_ca(cont);
127 u64 time;
128
129 /* Find the time used in the previous interval */
130 spin_lock_irq(&ca->lock);
131 cpuusage_update(ca);
132 time = cputime64_to_jiffies64(ca->last_interval_time);
133 spin_unlock_irq(&ca->lock);
134
135 /* Convert time to a percentage, to give the load in the
136 * previous period */
137 time *= 100;
138 do_div(time, INTERVAL);
139
140 return time;
141}
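/*
 * Worked example: if tasks in this cgroup consumed 2500 jiffies of CPU
 * during the previous 10000-jiffy interval (HZ == 1000 assumed), load_read()
 * returns 2500 * 100 / 10000 == 25, i.e. a 25% load over that period.
 */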
142
143static struct cftype files[] = {
144 {
145 .name = "usage",
146 .read_uint = cpuusage_read,
147 },
148 {
149 .name = "load",
150 .read_uint = load_read,
151 }
152};
153
154static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
155{
156 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
157}
158
159void cpuacct_charge(struct task_struct *task, cputime_t cputime)
160{
161
162 struct cpuacct *ca;
163 unsigned long flags;
164
165 if (!cpuacct_subsys.active)
166 return;
167 rcu_read_lock();
168 ca = task_ca(task);
169 if (ca) {
170 spin_lock_irqsave(&ca->lock, flags);
171 cpuusage_update(ca);
172 ca->time = cputime64_add(ca->time, cputime);
173 ca->current_interval_time =
174 cputime64_add(ca->current_interval_time, cputime);
175 spin_unlock_irqrestore(&ca->lock, flags);
176 }
177 rcu_read_unlock();
178}
179
180struct cgroup_subsys cpuacct_subsys = {
181 .name = "cpuacct",
182 .create = cpuacct_create,
183 .destroy = cpuacct_destroy,
184 .populate = cpuacct_populate,
185 .subsys_id = cpuacct_subsys_id,
186};
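/*
 * Hedged sketch of the expected caller: the scheduler accounting path
 * (kernel/sched.c is touched by this patch, but those hunks are not shown in
 * this section) charges the time a task has just consumed to its group. The
 * wrapper name below is hypothetical; only cpuacct_charge() itself is
 * defined in this file.
 */
static void example_account_time(struct task_struct *p, cputime_t cputime)
{
	/* ...existing per-task time accounting... */
	cpuacct_charge(p, cputime);
}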
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64950fa5d321..50f5dc463688 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,7 +4,8 @@
4 * Processor and Memory placement constraints for sets of tasks. 4 * Processor and Memory placement constraints for sets of tasks.
5 * 5 *
6 * Copyright (C) 2003 BULL SA. 6 * Copyright (C) 2003 BULL SA.
7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2007 Silicon Graphics, Inc.
8 * Copyright (C) 2006 Google, Inc
8 * 9 *
9 * Portions derived from Patrick Mochel's sysfs code. 10 * Portions derived from Patrick Mochel's sysfs code.
10 * sysfs is Copyright (c) 2001-3 Patrick Mochel 11 * sysfs is Copyright (c) 2001-3 Patrick Mochel
@@ -12,6 +13,7 @@
12 * 2003-10-10 Written by Simon Derr. 13 * 2003-10-10 Written by Simon Derr.
13 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
14 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups
15 * 17 *
16 * This file is subject to the terms and conditions of the GNU General Public 18 * This file is subject to the terms and conditions of the GNU General Public
17 * License. See the file COPYING in the main directory of the Linux 19 * License. See the file COPYING in the main directory of the Linux
@@ -36,6 +38,7 @@
36#include <linux/mount.h> 38#include <linux/mount.h>
37#include <linux/namei.h> 39#include <linux/namei.h>
38#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/prio_heap.h>
39#include <linux/proc_fs.h> 42#include <linux/proc_fs.h>
40#include <linux/rcupdate.h> 43#include <linux/rcupdate.h>
41#include <linux/sched.h> 44#include <linux/sched.h>
@@ -52,8 +55,7 @@
52#include <asm/uaccess.h> 55#include <asm/uaccess.h>
53#include <asm/atomic.h> 56#include <asm/atomic.h>
54#include <linux/mutex.h> 57#include <linux/mutex.h>
55 58#include <linux/kfifo.h>
56#define CPUSET_SUPER_MAGIC 0x27e0eb
57 59
58/* 60/*
59 * Tracks how many cpusets are currently defined in system. 61 * Tracks how many cpusets are currently defined in system.
@@ -62,6 +64,10 @@
62 */ 64 */
63int number_of_cpusets __read_mostly; 65int number_of_cpusets __read_mostly;
64 66
67/* Retrieve the cpuset from a cgroup */
68struct cgroup_subsys cpuset_subsys;
69struct cpuset;
70
65/* See "Frequency meter" comments, below. */ 71/* See "Frequency meter" comments, below. */
66 72
67struct fmeter { 73struct fmeter {
@@ -72,24 +78,13 @@ struct fmeter {
72}; 78};
73 79
74struct cpuset { 80struct cpuset {
81 struct cgroup_subsys_state css;
82
75 unsigned long flags; /* "unsigned long" so bitops work */ 83 unsigned long flags; /* "unsigned long" so bitops work */
76 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 84 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
77 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 85 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
78 86
79 /*
80 * Count is atomic so can incr (fork) or decr (exit) without a lock.
81 */
82 atomic_t count; /* count tasks using this cpuset */
83
84 /*
85 * We link our 'sibling' struct into our parents 'children'.
86 * Our children link their 'sibling' into our 'children'.
87 */
88 struct list_head sibling; /* my parents children */
89 struct list_head children; /* my children */
90
91 struct cpuset *parent; /* my parent */ 87 struct cpuset *parent; /* my parent */
92 struct dentry *dentry; /* cpuset fs entry */
93 88
94 /* 89 /*
95 * Copy of global cpuset_mems_generation as of the most 90 * Copy of global cpuset_mems_generation as of the most
@@ -98,15 +93,32 @@ struct cpuset {
98 int mems_generation; 93 int mems_generation;
99 94
100 struct fmeter fmeter; /* memory_pressure filter */ 95 struct fmeter fmeter; /* memory_pressure filter */
96
97 /* partition number for rebuild_sched_domains() */
98 int pn;
101}; 99};
102 100
101/* Retrieve the cpuset for a cgroup */
102static inline struct cpuset *cgroup_cs(struct cgroup *cont)
103{
104 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
105 struct cpuset, css);
106}
107
108/* Retrieve the cpuset for a task */
109static inline struct cpuset *task_cs(struct task_struct *task)
110{
111 return container_of(task_subsys_state(task, cpuset_subsys_id),
112 struct cpuset, css);
113}
114
115
103/* bits in struct cpuset flags field */ 116/* bits in struct cpuset flags field */
104typedef enum { 117typedef enum {
105 CS_CPU_EXCLUSIVE, 118 CS_CPU_EXCLUSIVE,
106 CS_MEM_EXCLUSIVE, 119 CS_MEM_EXCLUSIVE,
107 CS_MEMORY_MIGRATE, 120 CS_MEMORY_MIGRATE,
108 CS_REMOVED, 121 CS_SCHED_LOAD_BALANCE,
109 CS_NOTIFY_ON_RELEASE,
110 CS_SPREAD_PAGE, 122 CS_SPREAD_PAGE,
111 CS_SPREAD_SLAB, 123 CS_SPREAD_SLAB,
112} cpuset_flagbits_t; 124} cpuset_flagbits_t;
@@ -122,14 +134,9 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
122 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 134 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
123} 135}
124 136
125static inline int is_removed(const struct cpuset *cs) 137static inline int is_sched_load_balance(const struct cpuset *cs)
126{ 138{
127 return test_bit(CS_REMOVED, &cs->flags); 139 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
128}
129
130static inline int notify_on_release(const struct cpuset *cs)
131{
132 return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
133} 140}
134 141
135static inline int is_memory_migrate(const struct cpuset *cs) 142static inline int is_memory_migrate(const struct cpuset *cs)
@@ -172,14 +179,8 @@ static struct cpuset top_cpuset = {
172 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 179 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
173 .cpus_allowed = CPU_MASK_ALL, 180 .cpus_allowed = CPU_MASK_ALL,
174 .mems_allowed = NODE_MASK_ALL, 181 .mems_allowed = NODE_MASK_ALL,
175 .count = ATOMIC_INIT(0),
176 .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
177 .children = LIST_HEAD_INIT(top_cpuset.children),
178}; 182};
179 183
180static struct vfsmount *cpuset_mount;
181static struct super_block *cpuset_sb;
182
183/* 184/*
184 * We have two global cpuset mutexes below. They can nest. 185 * We have two global cpuset mutexes below. They can nest.
185 * It is ok to first take manage_mutex, then nest callback_mutex. We also 186 * It is ok to first take manage_mutex, then nest callback_mutex. We also
@@ -263,297 +264,33 @@ static struct super_block *cpuset_sb;
263 * the routine cpuset_update_task_memory_state(). 264 * the routine cpuset_update_task_memory_state().
264 */ 265 */
265 266
266static DEFINE_MUTEX(manage_mutex);
267static DEFINE_MUTEX(callback_mutex); 267static DEFINE_MUTEX(callback_mutex);
268 268
269/* 269/* This is ugly, but preserves the userspace API for existing cpuset
270 * A couple of forward declarations required, due to cyclic reference loop: 270 * users. If someone tries to mount the "cpuset" filesystem, we
271 * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file 271 * silently switch it to mount "cgroup" instead */
272 * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
273 */
274
275static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode);
276static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry);
277
278static struct backing_dev_info cpuset_backing_dev_info = {
279 .ra_pages = 0, /* No readahead */
280 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
281};
282
283static struct inode *cpuset_new_inode(mode_t mode)
284{
285 struct inode *inode = new_inode(cpuset_sb);
286
287 if (inode) {
288 inode->i_mode = mode;
289 inode->i_uid = current->fsuid;
290 inode->i_gid = current->fsgid;
291 inode->i_blocks = 0;
292 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
293 inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
294 }
295 return inode;
296}
297
298static void cpuset_diput(struct dentry *dentry, struct inode *inode)
299{
300 /* is dentry a directory ? if so, kfree() associated cpuset */
301 if (S_ISDIR(inode->i_mode)) {
302 struct cpuset *cs = dentry->d_fsdata;
303 BUG_ON(!(is_removed(cs)));
304 kfree(cs);
305 }
306 iput(inode);
307}
308
309static struct dentry_operations cpuset_dops = {
310 .d_iput = cpuset_diput,
311};
312
313static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
314{
315 struct dentry *d = lookup_one_len(name, parent, strlen(name));
316 if (!IS_ERR(d))
317 d->d_op = &cpuset_dops;
318 return d;
319}
320
321static void remove_dir(struct dentry *d)
322{
323 struct dentry *parent = dget(d->d_parent);
324
325 d_delete(d);
326 simple_rmdir(parent->d_inode, d);
327 dput(parent);
328}
329
330/*
331 * NOTE : the dentry must have been dget()'ed
332 */
333static void cpuset_d_remove_dir(struct dentry *dentry)
334{
335 struct list_head *node;
336
337 spin_lock(&dcache_lock);
338 node = dentry->d_subdirs.next;
339 while (node != &dentry->d_subdirs) {
340 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
341 list_del_init(node);
342 if (d->d_inode) {
343 d = dget_locked(d);
344 spin_unlock(&dcache_lock);
345 d_delete(d);
346 simple_unlink(dentry->d_inode, d);
347 dput(d);
348 spin_lock(&dcache_lock);
349 }
350 node = dentry->d_subdirs.next;
351 }
352 list_del_init(&dentry->d_u.d_child);
353 spin_unlock(&dcache_lock);
354 remove_dir(dentry);
355}
356
357static struct super_operations cpuset_ops = {
358 .statfs = simple_statfs,
359 .drop_inode = generic_delete_inode,
360};
361
362static int cpuset_fill_super(struct super_block *sb, void *unused_data,
363 int unused_silent)
364{
365 struct inode *inode;
366 struct dentry *root;
367
368 sb->s_blocksize = PAGE_CACHE_SIZE;
369 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
370 sb->s_magic = CPUSET_SUPER_MAGIC;
371 sb->s_op = &cpuset_ops;
372 cpuset_sb = sb;
373
374 inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR);
375 if (inode) {
376 inode->i_op = &simple_dir_inode_operations;
377 inode->i_fop = &simple_dir_operations;
378 /* directories start off with i_nlink == 2 (for "." entry) */
379 inc_nlink(inode);
380 } else {
381 return -ENOMEM;
382 }
383
384 root = d_alloc_root(inode);
385 if (!root) {
386 iput(inode);
387 return -ENOMEM;
388 }
389 sb->s_root = root;
390 return 0;
391}
392
393static int cpuset_get_sb(struct file_system_type *fs_type, 272static int cpuset_get_sb(struct file_system_type *fs_type,
394 int flags, const char *unused_dev_name, 273 int flags, const char *unused_dev_name,
395 void *data, struct vfsmount *mnt) 274 void *data, struct vfsmount *mnt)
396{ 275{
397 return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); 276 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
277 int ret = -ENODEV;
278 if (cgroup_fs) {
279 char mountopts[] =
280 "cpuset,noprefix,"
281 "release_agent=/sbin/cpuset_release_agent";
282 ret = cgroup_fs->get_sb(cgroup_fs, flags,
283 unused_dev_name, mountopts, mnt);
284 put_filesystem(cgroup_fs);
285 }
286 return ret;
398} 287}
399 288
400static struct file_system_type cpuset_fs_type = { 289static struct file_system_type cpuset_fs_type = {
401 .name = "cpuset", 290 .name = "cpuset",
402 .get_sb = cpuset_get_sb, 291 .get_sb = cpuset_get_sb,
403 .kill_sb = kill_litter_super,
404}; 292};
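/*
 * Net effect, for illustration (the command lines are a sketch, not part of
 * this patch): "mount -t cpuset none /dev/cpuset" now behaves like
 * "mount -t cgroup -o cpuset,noprefix,release_agent=/sbin/cpuset_release_agent none /dev/cpuset",
 * preserving the existing cpuset userspace API on top of the cgroup filesystem.
 */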
405 293
406/* struct cftype:
407 *
408 * The files in the cpuset filesystem mostly have a very simple read/write
409 * handling, some common function will take care of it. Nevertheless some cases
410 * (read tasks) are special and therefore I define this structure for every
411 * kind of file.
412 *
413 *
414 * When reading/writing to a file:
415 * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
416 * - the 'cftype' of the file is file->f_path.dentry->d_fsdata
417 */
418
419struct cftype {
420 char *name;
421 int private;
422 int (*open) (struct inode *inode, struct file *file);
423 ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes,
424 loff_t *ppos);
425 int (*write) (struct file *file, const char __user *buf, size_t nbytes,
426 loff_t *ppos);
427 int (*release) (struct inode *inode, struct file *file);
428};
429
430static inline struct cpuset *__d_cs(struct dentry *dentry)
431{
432 return dentry->d_fsdata;
433}
434
435static inline struct cftype *__d_cft(struct dentry *dentry)
436{
437 return dentry->d_fsdata;
438}
439
440/*
441 * Call with manage_mutex held. Writes path of cpuset into buf.
442 * Returns 0 on success, -errno on error.
443 */
444
445static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
446{
447 char *start;
448
449 start = buf + buflen;
450
451 *--start = '\0';
452 for (;;) {
453 int len = cs->dentry->d_name.len;
454 if ((start -= len) < buf)
455 return -ENAMETOOLONG;
456 memcpy(start, cs->dentry->d_name.name, len);
457 cs = cs->parent;
458 if (!cs)
459 break;
460 if (!cs->parent)
461 continue;
462 if (--start < buf)
463 return -ENAMETOOLONG;
464 *start = '/';
465 }
466 memmove(buf, start, buf + buflen - start);
467 return 0;
468}
469
470/*
471 * Notify userspace when a cpuset is released, by running
472 * /sbin/cpuset_release_agent with the name of the cpuset (path
473 * relative to the root of cpuset file system) as the argument.
474 *
475 * Most likely, this user command will try to rmdir this cpuset.
476 *
477 * This races with the possibility that some other task will be
478 * attached to this cpuset before it is removed, or that some other
479 * user task will 'mkdir' a child cpuset of this cpuset. That's ok.
480 * The presumed 'rmdir' will fail quietly if this cpuset is no longer
481 * unused, and this cpuset will be reprieved from its death sentence,
482 * to continue to serve a useful existence. Next time it's released,
483 * we will get notified again, if it still has 'notify_on_release' set.
484 *
485 * The final arg to call_usermodehelper() is 0, which means don't
486 * wait. The separate /sbin/cpuset_release_agent task is forked by
487 * call_usermodehelper(), then control in this thread returns here,
488 * without waiting for the release agent task. We don't bother to
489 * wait because the caller of this routine has no use for the exit
490 * status of the /sbin/cpuset_release_agent task, so no sense holding
491 * our caller up for that.
492 *
493 * When we had only one cpuset mutex, we had to call this
494 * without holding it, to avoid deadlock when call_usermodehelper()
495 * allocated memory. With two locks, we could now call this while
496 * holding manage_mutex, but we still don't, so as to minimize
497 * the time manage_mutex is held.
498 */
499
500static void cpuset_release_agent(const char *pathbuf)
501{
502 char *argv[3], *envp[3];
503 int i;
504
505 if (!pathbuf)
506 return;
507
508 i = 0;
509 argv[i++] = "/sbin/cpuset_release_agent";
510 argv[i++] = (char *)pathbuf;
511 argv[i] = NULL;
512
513 i = 0;
514 /* minimal command environment */
515 envp[i++] = "HOME=/";
516 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
517 envp[i] = NULL;
518
519 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
520 kfree(pathbuf);
521}
522
523/*
524 * Either cs->count of using tasks transitioned to zero, or the
525 * cs->children list of child cpusets just became empty. If this
526 * cs is notify_on_release() and now both the user count is zero and
527 * the list of children is empty, prepare cpuset path in a kmalloc'd
528 * buffer, to be returned via ppathbuf, so that the caller can invoke
529 * cpuset_release_agent() with it later on, once manage_mutex is dropped.
530 * Call here with manage_mutex held.
531 *
532 * This check_for_release() routine is responsible for kmalloc'ing
533 * pathbuf. The above cpuset_release_agent() is responsible for
534 * kfree'ing pathbuf. The caller of these routines is responsible
535 * for providing a pathbuf pointer, initialized to NULL, then
536 * calling check_for_release() with manage_mutex held and the address
537 * of the pathbuf pointer, then dropping manage_mutex, then calling
538 * cpuset_release_agent() with pathbuf, as set by check_for_release().
539 */
540
541static void check_for_release(struct cpuset *cs, char **ppathbuf)
542{
543 if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
544 list_empty(&cs->children)) {
545 char *buf;
546
547 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
548 if (!buf)
549 return;
550 if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
551 kfree(buf);
552 else
553 *ppathbuf = buf;
554 }
555}
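
The hand-off described above reduces to a simple pattern: decide and allocate while the lock is held, run the slow part only after dropping it. A minimal userspace sketch; manage_lock, check_for_notify() and run_release_agent() are invented names:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t manage_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called with manage_lock held: maybe hand back a malloc'd path. */
static void check_for_notify(int refcount, const char *path, char **ppathbuf)
{
        if (refcount == 0)
                *ppathbuf = strdup(path);       /* stays NULL on failure */
}

/* Called without manage_lock: consume and free the buffer, if any. */
static void run_release_agent(char *pathbuf)
{
        if (!pathbuf)
                return;
        printf("would exec the release agent for %s\n", pathbuf);
        free(pathbuf);
}

int main(void)
{
        char *pathbuf = NULL;

        pthread_mutex_lock(&manage_lock);
        check_for_notify(0, "/demo", &pathbuf);
        pthread_mutex_unlock(&manage_lock);

        run_release_agent(pathbuf);             /* slow path, lock dropped */
        return 0;
}
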
556
557/* 294/*
558 * Return in *pmask the portion of a cpusets's cpus_allowed that 295 * Return in *pmask the portion of a cpusets's cpus_allowed that
559 * are online. If none are online, walk up the cpuset hierarchy 296 * are online. If none are online, walk up the cpuset hierarchy
@@ -653,20 +390,19 @@ void cpuset_update_task_memory_state(void)
653 struct task_struct *tsk = current; 390 struct task_struct *tsk = current;
654 struct cpuset *cs; 391 struct cpuset *cs;
655 392
656 if (tsk->cpuset == &top_cpuset) { 393 if (task_cs(tsk) == &top_cpuset) {
657 /* Don't need rcu for top_cpuset. It's never freed. */ 394 /* Don't need rcu for top_cpuset. It's never freed. */
658 my_cpusets_mem_gen = top_cpuset.mems_generation; 395 my_cpusets_mem_gen = top_cpuset.mems_generation;
659 } else { 396 } else {
660 rcu_read_lock(); 397 rcu_read_lock();
661 cs = rcu_dereference(tsk->cpuset); 398 my_cpusets_mem_gen = task_cs(current)->mems_generation;
662 my_cpusets_mem_gen = cs->mems_generation;
663 rcu_read_unlock(); 399 rcu_read_unlock();
664 } 400 }
665 401
666 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 402 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
667 mutex_lock(&callback_mutex); 403 mutex_lock(&callback_mutex);
668 task_lock(tsk); 404 task_lock(tsk);
669 cs = tsk->cpuset; /* Maybe changed when task not locked */ 405 cs = task_cs(tsk); /* Maybe changed when task not locked */
670 guarantee_online_mems(cs, &tsk->mems_allowed); 406 guarantee_online_mems(cs, &tsk->mems_allowed);
671 tsk->cpuset_mems_generation = cs->mems_generation; 407 tsk->cpuset_mems_generation = cs->mems_generation;
672 if (is_spread_page(cs)) 408 if (is_spread_page(cs))
@@ -721,11 +457,12 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
721 457
722static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 458static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
723{ 459{
460 struct cgroup *cont;
724 struct cpuset *c, *par; 461 struct cpuset *c, *par;
725 462
726 /* Each of our child cpusets must be a subset of us */ 463 /* Each of our child cpusets must be a subset of us */
727 list_for_each_entry(c, &cur->children, sibling) { 464 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
728 if (!is_cpuset_subset(c, trial)) 465 if (!is_cpuset_subset(cgroup_cs(cont), trial))
729 return -EBUSY; 466 return -EBUSY;
730 } 467 }
731 468
@@ -740,7 +477,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
740 return -EACCES; 477 return -EACCES;
741 478
742 /* If either I or some sibling (!= me) is exclusive, we can't overlap */ 479 /* If either I or some sibling (!= me) is exclusive, we can't overlap */
743 list_for_each_entry(c, &par->children, sibling) { 480 list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
481 c = cgroup_cs(cont);
744 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 482 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
745 c != cur && 483 c != cur &&
746 cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) 484 cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -751,17 +489,265 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
751 return -EINVAL; 489 return -EINVAL;
752 } 490 }
753 491
492 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
493 if (cgroup_task_count(cur->css.cgroup)) {
494 if (cpus_empty(trial->cpus_allowed) ||
495 nodes_empty(trial->mems_allowed)) {
496 return -ENOSPC;
497 }
498 }
499
754 return 0; 500 return 0;
755} 501}
756 502
757/* 503/*
504 * Helper routine for rebuild_sched_domains().
505 * Do cpusets a, b have overlapping cpus_allowed masks?
506 */
507
508static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
509{
510 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
511}
512
513/*
514 * rebuild_sched_domains()
515 *
516 * If the flag 'sched_load_balance' of any cpuset with non-empty
517 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
518 * which has that flag enabled, or if any cpuset with a non-empty
519 * 'cpus' is removed, then call this routine to rebuild the
520 * scheduler's dynamic sched domains.
521 *
522 * This routine builds a partial partition of the system's CPUs
523 * (the set of non-overlapping cpumask_t's in the array 'doms'
524 * below), and passes that partial partition to the kernel/sched.c
525 * partition_sched_domains() routine, which will rebuild the
526 * scheduler's load balancing domains (sched domains) as specified
527 * by that partial partition. A 'partial partition' is a set of
528 * non-overlapping subsets whose union is a subset of that set.
529 *
530 * See "What is sched_load_balance" in Documentation/cpusets.txt
531 * for a background explanation of this.
532 *
533 * Does not return errors, on the theory that the callers of this
534 * routine would rather not worry about failures to rebuild sched
535 * domains when operating in the severe memory shortage situations
536 * that could cause allocation failures below.
537 *
538 * Call with cgroup_mutex held. May take callback_mutex during
539 * call due to the kfifo_alloc() and kmalloc() calls. May nest
540 * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
541 * Must not be called holding callback_mutex, because we must not
542 * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere
543 * the kernel nests callback_mutex inside lock_cpu_hotplug() calls.
544 * So the reverse nesting would risk an ABBA deadlock.
545 *
546 * The three key local variables below are:
547 * q - a kfifo queue of cpuset pointers, used to implement a
548 * top-down scan of all cpusets. This scan loads a pointer
549 * to each cpuset marked is_sched_load_balance into the
550 * array 'csa'. For our purposes, rebuilding the scheduler's
551 * sched domains, we can ignore !is_sched_load_balance cpusets.
552 * csa - (for CpuSet Array) Array of pointers to all the cpusets
553 * that need to be load balanced, for convenient iterative
554 * access by the subsequent code that finds the best partition,
555 * i.e. the set of domains (subsets) of CPUs such that the
556 * cpus_allowed of every cpuset marked is_sched_load_balance
557 * is a subset of one of these domains, while there are as
558 * many such domains as possible, each as small as possible.
559 * doms - Conversion of 'csa' to an array of cpumasks, for passing to
560 * the kernel/sched.c routine partition_sched_domains() in a
561 * convenient format, that can be easily compared to the prior
562 * value to determine what partition elements (sched domains)
563 * were changed (added or removed).
564 *
565 * Finding the best partition (set of domains):
566 * The triple nested loops below over i, j, k scan over the
567 * load balanced cpusets (using the array of cpuset pointers in
568 * csa[]) looking for pairs of cpusets that have overlapping
569 * cpus_allowed, but which don't have the same 'pn' partition
570 * number, and merges them into the same partition number. It keeps
571 * looping on the 'restart' label until it can no longer find
572 * any such pairs.
573 *
574 * The union of the cpus_allowed masks from the set of
575 * all cpusets having the same 'pn' value then form the one
576 * element of the partition (one sched domain) to be passed to
577 * partition_sched_domains().
578 */
579
580static void rebuild_sched_domains(void)
581{
582 struct kfifo *q; /* queue of cpusets to be scanned */
583 struct cpuset *cp; /* scans q */
584 struct cpuset **csa; /* array of all cpuset ptrs */
585 int csn; /* how many cpuset ptrs in csa so far */
586 int i, j, k; /* indices for partition finding loops */
587 cpumask_t *doms; /* resulting partition; i.e. sched domains */
588 int ndoms; /* number of sched domains in result */
589 int nslot; /* next empty doms[] cpumask_t slot */
590
591 q = NULL;
592 csa = NULL;
593 doms = NULL;
594
595 /* Special case for the 99% of systems with one, full, sched domain */
596 if (is_sched_load_balance(&top_cpuset)) {
597 ndoms = 1;
598 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
599 if (!doms)
600 goto rebuild;
601 *doms = top_cpuset.cpus_allowed;
602 goto rebuild;
603 }
604
605 q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
606 if (IS_ERR(q))
607 goto done;
608 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
609 if (!csa)
610 goto done;
611 csn = 0;
612
613 cp = &top_cpuset;
614 __kfifo_put(q, (void *)&cp, sizeof(cp));
615 while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
616 struct cgroup *cont;
617 struct cpuset *child; /* scans child cpusets of cp */
618 if (is_sched_load_balance(cp))
619 csa[csn++] = cp;
620 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
621 child = cgroup_cs(cont);
622 __kfifo_put(q, (void *)&child, sizeof(cp));
623 }
624 }
625
626 for (i = 0; i < csn; i++)
627 csa[i]->pn = i;
628 ndoms = csn;
629
630restart:
631 /* Find the best partition (set of sched domains) */
632 for (i = 0; i < csn; i++) {
633 struct cpuset *a = csa[i];
634 int apn = a->pn;
635
636 for (j = 0; j < csn; j++) {
637 struct cpuset *b = csa[j];
638 int bpn = b->pn;
639
640 if (apn != bpn && cpusets_overlap(a, b)) {
641 for (k = 0; k < csn; k++) {
642 struct cpuset *c = csa[k];
643
644 if (c->pn == bpn)
645 c->pn = apn;
646 }
647 ndoms--; /* one less element */
648 goto restart;
649 }
650 }
651 }
652
653 /* Convert <csn, csa> to <ndoms, doms> */
654 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
655 if (!doms)
656 goto rebuild;
657
658 for (nslot = 0, i = 0; i < csn; i++) {
659 struct cpuset *a = csa[i];
660 int apn = a->pn;
661
662 if (apn >= 0) {
663 cpumask_t *dp = doms + nslot;
664
665 if (nslot == ndoms) {
666 static int warnings = 10;
667 if (warnings) {
668 printk(KERN_WARNING
669 "rebuild_sched_domains confused:"
670 " nslot %d, ndoms %d, csn %d, i %d,"
671 " apn %d\n",
672 nslot, ndoms, csn, i, apn);
673 warnings--;
674 }
675 continue;
676 }
677
678 cpus_clear(*dp);
679 for (j = i; j < csn; j++) {
680 struct cpuset *b = csa[j];
681
682 if (apn == b->pn) {
683 cpus_or(*dp, *dp, b->cpus_allowed);
684 b->pn = -1;
685 }
686 }
687 nslot++;
688 }
689 }
690 BUG_ON(nslot != ndoms);
691
692rebuild:
693 /* Have scheduler rebuild sched domains */
694 lock_cpu_hotplug();
695 partition_sched_domains(ndoms, doms);
696 unlock_cpu_hotplug();
697
698done:
699 if (q && !IS_ERR(q))
700 kfifo_free(q);
701 kfree(csa);
702 /* Don't kfree(doms) -- partition_sched_domains() does that. */
703}
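
The pn-merging step of the partition search is easier to see on toy data. A standalone sketch in which plain unsigned long bitmasks stand in for cpumask_t and the four masks are invented:

#include <stdio.h>

#define NSETS 4

int main(void)
{
        /* cpus_allowed of four hypothetical load-balanced cpusets */
        unsigned long cpus[NSETS] = { 0x03, 0x06, 0x30, 0xc0 };
        int pn[NSETS];
        int ndoms = NSETS;
        int i, j, k;

        for (i = 0; i < NSETS; i++)
                pn[i] = i;

restart:
        /* Merge any two partitions whose members have overlapping masks. */
        for (i = 0; i < NSETS; i++) {
                for (j = 0; j < NSETS; j++) {
                        if (pn[i] != pn[j] && (cpus[i] & cpus[j])) {
                                for (k = 0; k < NSETS; k++)
                                        if (pn[k] == pn[j])
                                                pn[k] = pn[i];
                                ndoms--;        /* one less partition */
                                goto restart;
                        }
                }
        }

        printf("%d sched domains\n", ndoms);
        for (i = 0; i < NSETS; i++)
                printf("  cpuset %d -> partition %d (mask 0x%lx)\n",
                       i, pn[i], cpus[i]);
        return 0;
}

With these masks the first two cpusets overlap and collapse into one partition while the other two stay alone, giving three sched domains; the union of the masks in each partition is what the real routine copies into doms[].
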
704
705static inline int started_after_time(struct task_struct *t1,
706 struct timespec *time,
707 struct task_struct *t2)
708{
709 int start_diff = timespec_compare(&t1->start_time, time);
710 if (start_diff > 0) {
711 return 1;
712 } else if (start_diff < 0) {
713 return 0;
714 } else {
715 /*
716 * Arbitrarily, if two processes started at the same
717 * time, we'll say that the lower pointer value
718 * started first. Note that t2 may have exited by now
719 * so this may not be a valid pointer any longer, but
720 * that's fine - it still serves to distinguish
721 * between two tasks started (effectively)
722 * simultaneously.
723 */
724 return t1 > t2;
725 }
726}
727
728static inline int started_after(void *p1, void *p2)
729{
730 struct task_struct *t1 = p1;
731 struct task_struct *t2 = p2;
732 return started_after_time(t1, &t2->start_time, t2);
733}
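
The tie-break can be exercised on its own; a userspace sketch with an invented struct rec standing in for task_struct:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct rec {
        struct timespec start;
};

static int timespec_cmp(const struct timespec *a, const struct timespec *b)
{
        if (a->tv_sec != b->tv_sec)
                return a->tv_sec < b->tv_sec ? -1 : 1;
        if (a->tv_nsec != b->tv_nsec)
                return a->tv_nsec < b->tv_nsec ? -1 : 1;
        return 0;
}

/* Total order: later start time wins; equal times fall back to address. */
static int started_after(const struct rec *r1, const struct rec *r2)
{
        int d = timespec_cmp(&r1->start, &r2->start);

        if (d)
                return d > 0;
        return (uintptr_t)r1 > (uintptr_t)r2;
}

int main(void)
{
        struct rec a = { { 100, 0 } }, b = { { 100, 0 } };

        printf("a after b? %d\n", started_after(&a, &b));
        return 0;
}
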
734
735/*
758 * Call with manage_mutex held. May take callback_mutex during call. 736 * Call with manage_mutex held. May take callback_mutex during call.
759 */ 737 */
760 738
761static int update_cpumask(struct cpuset *cs, char *buf) 739static int update_cpumask(struct cpuset *cs, char *buf)
762{ 740{
763 struct cpuset trialcs; 741 struct cpuset trialcs;
764 int retval; 742 int retval, i;
743 int is_load_balanced;
744 struct cgroup_iter it;
745 struct cgroup *cgrp = cs->css.cgroup;
746 struct task_struct *p, *dropped;
747 /* Never dereference latest_task, since it's not refcounted */
748 struct task_struct *latest_task = NULL;
749 struct ptr_heap heap;
750 struct timespec latest_time = { 0, 0 };
765 751
766 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ 752 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
767 if (cs == &top_cpuset) 753 if (cs == &top_cpuset)
@@ -770,11 +756,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
770 trialcs = *cs; 756 trialcs = *cs;
771 757
772 /* 758 /*
773 * We allow a cpuset's cpus_allowed to be empty; if it has attached 759 * An empty cpus_allowed is ok iff there are no tasks in the cpuset.
774 * tasks, we'll catch it later when we validate the change and return 760 * Since cpulist_parse() fails on an empty mask, we special case
775 * -ENOSPC. 761 * that parsing. The validate_change() call ensures that cpusets
762 * with tasks have cpus.
776 */ 763 */
777 if (!buf[0] || (buf[0] == '\n' && !buf[1])) { 764 buf = strstrip(buf);
765 if (!*buf) {
778 cpus_clear(trialcs.cpus_allowed); 766 cpus_clear(trialcs.cpus_allowed);
779 } else { 767 } else {
780 retval = cpulist_parse(buf, trialcs.cpus_allowed); 768 retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -782,15 +770,79 @@ static int update_cpumask(struct cpuset *cs, char *buf)
782 return retval; 770 return retval;
783 } 771 }
784 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); 772 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
785 /* cpus_allowed cannot be empty for a cpuset with attached tasks. */
786 if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed))
787 return -ENOSPC;
788 retval = validate_change(cs, &trialcs); 773 retval = validate_change(cs, &trialcs);
789 if (retval < 0) 774 if (retval < 0)
790 return retval; 775 return retval;
776
777 /* Nothing to do if the cpus didn't change */
778 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
779 return 0;
780 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
781 if (retval)
782 return retval;
783
784 is_load_balanced = is_sched_load_balance(&trialcs);
785
791 mutex_lock(&callback_mutex); 786 mutex_lock(&callback_mutex);
792 cs->cpus_allowed = trialcs.cpus_allowed; 787 cs->cpus_allowed = trialcs.cpus_allowed;
793 mutex_unlock(&callback_mutex); 788 mutex_unlock(&callback_mutex);
789
790 again:
791 /*
792 * Scan tasks in the cpuset, and update the cpumasks of any
793 * that need an update. Since we can't call set_cpus_allowed()
794 * while holding tasklist_lock, gather tasks to be processed
795 * in a heap structure. If the statically-sized heap fills up,
796 * overflow tasks that started later, and in future iterations
797 * only consider tasks that started after the latest task in
798 * the previous pass. This guarantees forward progress and
799 * that we don't miss any tasks.
800 */
801 heap.size = 0;
802 cgroup_iter_start(cgrp, &it);
803 while ((p = cgroup_iter_next(cgrp, &it))) {
804 /* Only affect tasks that don't have the right cpus_allowed */
805 if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
806 continue;
807 /*
808 * Only process tasks that started after the last task
809 * we processed
810 */
811 if (!started_after_time(p, &latest_time, latest_task))
812 continue;
813 dropped = heap_insert(&heap, p);
814 if (dropped == NULL) {
815 get_task_struct(p);
816 } else if (dropped != p) {
817 get_task_struct(p);
818 put_task_struct(dropped);
819 }
820 }
821 cgroup_iter_end(cgrp, &it);
822 if (heap.size) {
823 for (i = 0; i < heap.size; i++) {
824 struct task_struct *p = heap.ptrs[i];
825 if (i == 0) {
826 latest_time = p->start_time;
827 latest_task = p;
828 }
829 set_cpus_allowed(p, cs->cpus_allowed);
830 put_task_struct(p);
831 }
832 /*
833 * If we had to process any tasks at all, scan again
834 * in case some of them were in the middle of forking
835 * children that didn't notice the new cpumask
836 * restriction. Not the most efficient way to do it,
837 * but it avoids having to take callback_mutex in the
838 * fork path.
839 */
840 goto again;
841 }
842 heap_free(&heap);
843 if (is_load_balanced)
844 rebuild_sched_domains();
845
794 return 0; 846 return 0;
795} 847}
796 848
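
The bounded-heap rescan added in the hunk above boils down to: handle at most a fixed batch of tasks per pass, remember the latest start time handled, and rescan for anything newer until a pass finds nothing. A toy userspace version; BATCH and the start-time array are invented:

#include <stdio.h>

#define BATCH 3

int main(void)
{
        /* hypothetical, distinct start times; the kernel keeps the order
         * total by breaking ties on the task pointer */
        int start[] = { 7, 2, 9, 4, 1, 8, 6 };
        int n = sizeof(start) / sizeof(start[0]);
        int watermark = 0;              /* latest start time already handled */

        for (;;) {
                int batch[BATCH], bsz = 0, i, j;

                /* Collect up to BATCH of the smallest values > watermark. */
                for (i = 0; i < n; i++) {
                        int v = start[i];

                        if (v <= watermark)
                                continue;
                        if (bsz < BATCH) {
                                batch[bsz++] = v;
                        } else {
                                int max = 0;

                                for (j = 1; j < BATCH; j++)
                                        if (batch[j] > batch[max])
                                                max = j;
                                if (v < batch[max])
                                        batch[max] = v; /* drop the later one */
                        }
                }
                if (!bsz)
                        break;                  /* every item handled */

                for (i = 0; i < bsz; i++) {
                        printf("process %d\n", batch[i]);
                        if (batch[i] > watermark)
                                watermark = batch[i];
                }
        }
        return 0;
}

An item dropped from one batch for starting too late is guaranteed to be picked up by a later pass, which is the forward-progress argument the comment relies on.
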
@@ -839,7 +891,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
839 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 891 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
840 892
841 mutex_lock(&callback_mutex); 893 mutex_lock(&callback_mutex);
842 guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed); 894 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
843 mutex_unlock(&callback_mutex); 895 mutex_unlock(&callback_mutex);
844} 896}
845 897
@@ -857,16 +909,19 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
857 * their mempolicies to the cpusets new mems_allowed. 909 * their mempolicies to the cpusets new mems_allowed.
858 */ 910 */
859 911
912static void *cpuset_being_rebound;
913
860static int update_nodemask(struct cpuset *cs, char *buf) 914static int update_nodemask(struct cpuset *cs, char *buf)
861{ 915{
862 struct cpuset trialcs; 916 struct cpuset trialcs;
863 nodemask_t oldmem; 917 nodemask_t oldmem;
864 struct task_struct *g, *p; 918 struct task_struct *p;
865 struct mm_struct **mmarray; 919 struct mm_struct **mmarray;
866 int i, n, ntasks; 920 int i, n, ntasks;
867 int migrate; 921 int migrate;
868 int fudge; 922 int fudge;
869 int retval; 923 int retval;
924 struct cgroup_iter it;
870 925
871 /* 926 /*
872 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 927 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -878,29 +933,19 @@ static int update_nodemask(struct cpuset *cs, char *buf)
878 trialcs = *cs; 933 trialcs = *cs;
879 934
880 /* 935 /*
881 * We allow a cpuset's mems_allowed to be empty; if it has attached 936 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
882 * tasks, we'll catch it later when we validate the change and return 937 * Since nodelist_parse() fails on an empty mask, we special case
883 * -ENOSPC. 938 * that parsing. The validate_change() call ensures that cpusets
939 * with tasks have memory.
884 */ 940 */
885 if (!buf[0] || (buf[0] == '\n' && !buf[1])) { 941 buf = strstrip(buf);
942 if (!*buf) {
886 nodes_clear(trialcs.mems_allowed); 943 nodes_clear(trialcs.mems_allowed);
887 } else { 944 } else {
888 retval = nodelist_parse(buf, trialcs.mems_allowed); 945 retval = nodelist_parse(buf, trialcs.mems_allowed);
889 if (retval < 0) 946 if (retval < 0)
890 goto done; 947 goto done;
891 if (!nodes_intersects(trialcs.mems_allowed,
892 node_states[N_HIGH_MEMORY])) {
893 /*
894 * error if only memoryless nodes specified.
895 */
896 retval = -ENOSPC;
897 goto done;
898 }
899 } 948 }
900 /*
901 * Exclude memoryless nodes. We know that trialcs.mems_allowed
902 * contains at least one node with memory.
903 */
904 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, 949 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed,
905 node_states[N_HIGH_MEMORY]); 950 node_states[N_HIGH_MEMORY]);
906 oldmem = cs->mems_allowed; 951 oldmem = cs->mems_allowed;
@@ -908,11 +953,6 @@ static int update_nodemask(struct cpuset *cs, char *buf)
908 retval = 0; /* Too easy - nothing to do */ 953 retval = 0; /* Too easy - nothing to do */
909 goto done; 954 goto done;
910 } 955 }
911 /* mems_allowed cannot be empty for a cpuset with attached tasks. */
912 if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) {
913 retval = -ENOSPC;
914 goto done;
915 }
916 retval = validate_change(cs, &trialcs); 956 retval = validate_change(cs, &trialcs);
917 if (retval < 0) 957 if (retval < 0)
918 goto done; 958 goto done;
@@ -922,7 +962,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
922 cs->mems_generation = cpuset_mems_generation++; 962 cs->mems_generation = cpuset_mems_generation++;
923 mutex_unlock(&callback_mutex); 963 mutex_unlock(&callback_mutex);
924 964
925 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ 965 cpuset_being_rebound = cs; /* causes mpol_copy() rebind */
926 966
927 fudge = 10; /* spare mmarray[] slots */ 967 fudge = 10; /* spare mmarray[] slots */
928 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ 968 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
@@ -936,13 +976,13 @@ static int update_nodemask(struct cpuset *cs, char *buf)
936 * enough mmarray[] w/o using GFP_ATOMIC. 976 * enough mmarray[] w/o using GFP_ATOMIC.
937 */ 977 */
938 while (1) { 978 while (1) {
939 ntasks = atomic_read(&cs->count); /* guess */ 979 ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
940 ntasks += fudge; 980 ntasks += fudge;
941 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); 981 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
942 if (!mmarray) 982 if (!mmarray)
943 goto done; 983 goto done;
944 read_lock(&tasklist_lock); /* block fork */ 984 read_lock(&tasklist_lock); /* block fork */
945 if (atomic_read(&cs->count) <= ntasks) 985 if (cgroup_task_count(cs->css.cgroup) <= ntasks)
946 break; /* got enough */ 986 break; /* got enough */
947 read_unlock(&tasklist_lock); /* try again */ 987 read_unlock(&tasklist_lock); /* try again */
948 kfree(mmarray); 988 kfree(mmarray);
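
The sizing loop above is a guess/allocate/re-check/retry pattern: estimate the count without the lock, allocate, then verify the estimate under the lock and retry if it grew. A userspace sketch with an invented live_count and count_lock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t count_lock = PTHREAD_MUTEX_INITIALIZER;
static int live_count = 42;             /* may grow between the two reads */

int main(void)
{
        int fudge = 10;
        int *slots;
        int nslots;

        for (;;) {
                nslots = live_count + fudge;    /* unlocked guess */
                slots = malloc(nslots * sizeof(*slots));
                if (!slots)
                        return 1;

                pthread_mutex_lock(&count_lock);
                if (live_count <= nslots)
                        break;                  /* enough room; keep the lock */
                pthread_mutex_unlock(&count_lock);
                free(slots);                    /* guess too small: retry */
        }

        printf("allocated %d slots for %d items\n", nslots, live_count);
        pthread_mutex_unlock(&count_lock);
        free(slots);
        return 0;
}
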
@@ -951,21 +991,21 @@ static int update_nodemask(struct cpuset *cs, char *buf)
951 n = 0; 991 n = 0;
952 992
953 /* Load up mmarray[] with mm reference for each task in cpuset. */ 993 /* Load up mmarray[] with mm reference for each task in cpuset. */
954 do_each_thread(g, p) { 994 cgroup_iter_start(cs->css.cgroup, &it);
995 while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
955 struct mm_struct *mm; 996 struct mm_struct *mm;
956 997
957 if (n >= ntasks) { 998 if (n >= ntasks) {
958 printk(KERN_WARNING 999 printk(KERN_WARNING
959 "Cpuset mempolicy rebind incomplete.\n"); 1000 "Cpuset mempolicy rebind incomplete.\n");
960 continue; 1001 break;
961 } 1002 }
962 if (p->cpuset != cs)
963 continue;
964 mm = get_task_mm(p); 1003 mm = get_task_mm(p);
965 if (!mm) 1004 if (!mm)
966 continue; 1005 continue;
967 mmarray[n++] = mm; 1006 mmarray[n++] = mm;
968 } while_each_thread(g, p); 1007 }
1008 cgroup_iter_end(cs->css.cgroup, &it);
969 read_unlock(&tasklist_lock); 1009 read_unlock(&tasklist_lock);
970 1010
971 /* 1011 /*
@@ -993,12 +1033,17 @@ static int update_nodemask(struct cpuset *cs, char *buf)
993 1033
994 /* We're done rebinding vma's to this cpusets new mems_allowed. */ 1034 /* We're done rebinding vma's to this cpusets new mems_allowed. */
995 kfree(mmarray); 1035 kfree(mmarray);
996 set_cpuset_being_rebound(NULL); 1036 cpuset_being_rebound = NULL;
997 retval = 0; 1037 retval = 0;
998done: 1038done:
999 return retval; 1039 return retval;
1000} 1040}
1001 1041
1042int current_cpuset_is_being_rebound(void)
1043{
1044 return task_cs(current) == cpuset_being_rebound;
1045}
1046
1002/* 1047/*
1003 * Call with manage_mutex held. 1048 * Call with manage_mutex held.
1004 */ 1049 */
@@ -1015,6 +1060,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1015/* 1060/*
1016 * update_flag - read a 0 or a 1 in a file and update associated flag 1061 * update_flag - read a 0 or a 1 in a file and update associated flag
1017 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 1062 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
1063 * CS_SCHED_LOAD_BALANCE,
1018 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, 1064 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
1019 * CS_SPREAD_PAGE, CS_SPREAD_SLAB) 1065 * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
1020 * cs: the cpuset to update 1066 * cs: the cpuset to update
@@ -1028,6 +1074,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1028 int turning_on; 1074 int turning_on;
1029 struct cpuset trialcs; 1075 struct cpuset trialcs;
1030 int err; 1076 int err;
1077 int cpus_nonempty, balance_flag_changed;
1031 1078
1032 turning_on = (simple_strtoul(buf, NULL, 10) != 0); 1079 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
1033 1080
@@ -1040,10 +1087,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1040 err = validate_change(cs, &trialcs); 1087 err = validate_change(cs, &trialcs);
1041 if (err < 0) 1088 if (err < 0)
1042 return err; 1089 return err;
1090
1091 cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
1092 balance_flag_changed = (is_sched_load_balance(cs) !=
1093 is_sched_load_balance(&trialcs));
1094
1043 mutex_lock(&callback_mutex); 1095 mutex_lock(&callback_mutex);
1044 cs->flags = trialcs.flags; 1096 cs->flags = trialcs.flags;
1045 mutex_unlock(&callback_mutex); 1097 mutex_unlock(&callback_mutex);
1046 1098
1099 if (cpus_nonempty && balance_flag_changed)
1100 rebuild_sched_domains();
1101
1047 return 0; 1102 return 0;
1048} 1103}
1049 1104
@@ -1145,85 +1200,34 @@ static int fmeter_getrate(struct fmeter *fmp)
1145 return val; 1200 return val;
1146} 1201}
1147 1202
1148/* 1203static int cpuset_can_attach(struct cgroup_subsys *ss,
1149 * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly 1204 struct cgroup *cont, struct task_struct *tsk)
1150 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
1151 * notified on release.
1152 *
1153 * Call holding manage_mutex. May take callback_mutex and task_lock of
1154 * the task 'pid' during call.
1155 */
1156
1157static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1158{ 1205{
1159 pid_t pid; 1206 struct cpuset *cs = cgroup_cs(cont);
1160 struct task_struct *tsk;
1161 struct cpuset *oldcs;
1162 cpumask_t cpus;
1163 nodemask_t from, to;
1164 struct mm_struct *mm;
1165 int retval;
1166 1207
1167 if (sscanf(pidbuf, "%d", &pid) != 1)
1168 return -EIO;
1169 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1208 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1170 return -ENOSPC; 1209 return -ENOSPC;
1171 1210
1172 if (pid) { 1211 return security_task_setscheduler(tsk, 0, NULL);
1173 read_lock(&tasklist_lock); 1212}
1174
1175 tsk = find_task_by_pid(pid);
1176 if (!tsk || tsk->flags & PF_EXITING) {
1177 read_unlock(&tasklist_lock);
1178 return -ESRCH;
1179 }
1180
1181 get_task_struct(tsk);
1182 read_unlock(&tasklist_lock);
1183
1184 if ((current->euid) && (current->euid != tsk->uid)
1185 && (current->euid != tsk->suid)) {
1186 put_task_struct(tsk);
1187 return -EACCES;
1188 }
1189 } else {
1190 tsk = current;
1191 get_task_struct(tsk);
1192 }
1193 1213
1194 retval = security_task_setscheduler(tsk, 0, NULL); 1214static void cpuset_attach(struct cgroup_subsys *ss,
1195 if (retval) { 1215 struct cgroup *cont, struct cgroup *oldcont,
1196 put_task_struct(tsk); 1216 struct task_struct *tsk)
1197 return retval; 1217{
1198 } 1218 cpumask_t cpus;
1219 nodemask_t from, to;
1220 struct mm_struct *mm;
1221 struct cpuset *cs = cgroup_cs(cont);
1222 struct cpuset *oldcs = cgroup_cs(oldcont);
1199 1223
1200 mutex_lock(&callback_mutex); 1224 mutex_lock(&callback_mutex);
1201
1202 task_lock(tsk);
1203 oldcs = tsk->cpuset;
1204 /*
1205 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
1206 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
1207 * then fail this attach_task(), to avoid breaking top_cpuset.count.
1208 */
1209 if (tsk->flags & PF_EXITING) {
1210 task_unlock(tsk);
1211 mutex_unlock(&callback_mutex);
1212 put_task_struct(tsk);
1213 return -ESRCH;
1214 }
1215 atomic_inc(&cs->count);
1216 rcu_assign_pointer(tsk->cpuset, cs);
1217 task_unlock(tsk);
1218
1219 guarantee_online_cpus(cs, &cpus); 1225 guarantee_online_cpus(cs, &cpus);
1220 set_cpus_allowed(tsk, cpus); 1226 set_cpus_allowed(tsk, cpus);
1227 mutex_unlock(&callback_mutex);
1221 1228
1222 from = oldcs->mems_allowed; 1229 from = oldcs->mems_allowed;
1223 to = cs->mems_allowed; 1230 to = cs->mems_allowed;
1224
1225 mutex_unlock(&callback_mutex);
1226
1227 mm = get_task_mm(tsk); 1231 mm = get_task_mm(tsk);
1228 if (mm) { 1232 if (mm) {
1229 mpol_rebind_mm(mm, &to); 1233 mpol_rebind_mm(mm, &to);
@@ -1232,44 +1236,36 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1232 mmput(mm); 1236 mmput(mm);
1233 } 1237 }
1234 1238
1235 put_task_struct(tsk);
1236 synchronize_rcu();
1237 if (atomic_dec_and_test(&oldcs->count))
1238 check_for_release(oldcs, ppathbuf);
1239 return 0;
1240} 1239}
1241 1240
1242/* The various types of files and directories in a cpuset file system */ 1241/* The various types of files and directories in a cpuset file system */
1243 1242
1244typedef enum { 1243typedef enum {
1245 FILE_ROOT,
1246 FILE_DIR,
1247 FILE_MEMORY_MIGRATE, 1244 FILE_MEMORY_MIGRATE,
1248 FILE_CPULIST, 1245 FILE_CPULIST,
1249 FILE_MEMLIST, 1246 FILE_MEMLIST,
1250 FILE_CPU_EXCLUSIVE, 1247 FILE_CPU_EXCLUSIVE,
1251 FILE_MEM_EXCLUSIVE, 1248 FILE_MEM_EXCLUSIVE,
1252 FILE_NOTIFY_ON_RELEASE, 1249 FILE_SCHED_LOAD_BALANCE,
1253 FILE_MEMORY_PRESSURE_ENABLED, 1250 FILE_MEMORY_PRESSURE_ENABLED,
1254 FILE_MEMORY_PRESSURE, 1251 FILE_MEMORY_PRESSURE,
1255 FILE_SPREAD_PAGE, 1252 FILE_SPREAD_PAGE,
1256 FILE_SPREAD_SLAB, 1253 FILE_SPREAD_SLAB,
1257 FILE_TASKLIST,
1258} cpuset_filetype_t; 1254} cpuset_filetype_t;
1259 1255
1260static ssize_t cpuset_common_file_write(struct file *file, 1256static ssize_t cpuset_common_file_write(struct cgroup *cont,
1257 struct cftype *cft,
1258 struct file *file,
1261 const char __user *userbuf, 1259 const char __user *userbuf,
1262 size_t nbytes, loff_t *unused_ppos) 1260 size_t nbytes, loff_t *unused_ppos)
1263{ 1261{
1264 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); 1262 struct cpuset *cs = cgroup_cs(cont);
1265 struct cftype *cft = __d_cft(file->f_path.dentry);
1266 cpuset_filetype_t type = cft->private; 1263 cpuset_filetype_t type = cft->private;
1267 char *buffer; 1264 char *buffer;
1268 char *pathbuf = NULL;
1269 int retval = 0; 1265 int retval = 0;
1270 1266
1271 /* Crude upper limit on largest legitimate cpulist user might write. */ 1267 /* Crude upper limit on largest legitimate cpulist user might write. */
1272 if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES)) 1268 if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
1273 return -E2BIG; 1269 return -E2BIG;
1274 1270
1275 /* +1 for nul-terminator */ 1271 /* +1 for nul-terminator */
@@ -1282,9 +1278,9 @@ static ssize_t cpuset_common_file_write(struct file *file,
1282 } 1278 }
1283 buffer[nbytes] = 0; /* nul-terminate */ 1279 buffer[nbytes] = 0; /* nul-terminate */
1284 1280
1285 mutex_lock(&manage_mutex); 1281 cgroup_lock();
1286 1282
1287 if (is_removed(cs)) { 1283 if (cgroup_is_removed(cont)) {
1288 retval = -ENODEV; 1284 retval = -ENODEV;
1289 goto out2; 1285 goto out2;
1290 } 1286 }
@@ -1302,8 +1298,8 @@ static ssize_t cpuset_common_file_write(struct file *file,
1302 case FILE_MEM_EXCLUSIVE: 1298 case FILE_MEM_EXCLUSIVE:
1303 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); 1299 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
1304 break; 1300 break;
1305 case FILE_NOTIFY_ON_RELEASE: 1301 case FILE_SCHED_LOAD_BALANCE:
1306 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); 1302 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
1307 break; 1303 break;
1308 case FILE_MEMORY_MIGRATE: 1304 case FILE_MEMORY_MIGRATE:
1309 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); 1305 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
@@ -1322,9 +1318,6 @@ static ssize_t cpuset_common_file_write(struct file *file,
1322 retval = update_flag(CS_SPREAD_SLAB, cs, buffer); 1318 retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
1323 cs->mems_generation = cpuset_mems_generation++; 1319 cs->mems_generation = cpuset_mems_generation++;
1324 break; 1320 break;
1325 case FILE_TASKLIST:
1326 retval = attach_task(cs, buffer, &pathbuf);
1327 break;
1328 default: 1321 default:
1329 retval = -EINVAL; 1322 retval = -EINVAL;
1330 goto out2; 1323 goto out2;
@@ -1333,30 +1326,12 @@ static ssize_t cpuset_common_file_write(struct file *file,
1333 if (retval == 0) 1326 if (retval == 0)
1334 retval = nbytes; 1327 retval = nbytes;
1335out2: 1328out2:
1336 mutex_unlock(&manage_mutex); 1329 cgroup_unlock();
1337 cpuset_release_agent(pathbuf);
1338out1: 1330out1:
1339 kfree(buffer); 1331 kfree(buffer);
1340 return retval; 1332 return retval;
1341} 1333}
1342 1334
1343static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
1344 size_t nbytes, loff_t *ppos)
1345{
1346 ssize_t retval = 0;
1347 struct cftype *cft = __d_cft(file->f_path.dentry);
1348 if (!cft)
1349 return -ENODEV;
1350
1351 /* special function ? */
1352 if (cft->write)
1353 retval = cft->write(file, buf, nbytes, ppos);
1354 else
1355 retval = cpuset_common_file_write(file, buf, nbytes, ppos);
1356
1357 return retval;
1358}
1359
1360/* 1335/*
1361 * These ascii lists should be read in a single call, by using a user 1336 * These ascii lists should be read in a single call, by using a user
1362 * buffer large enough to hold the entire map. If read in smaller 1337 * buffer large enough to hold the entire map. If read in smaller
@@ -1391,11 +1366,13 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1391 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1366 return nodelist_scnprintf(page, PAGE_SIZE, mask);
1392} 1367}
1393 1368
1394static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, 1369static ssize_t cpuset_common_file_read(struct cgroup *cont,
1395 size_t nbytes, loff_t *ppos) 1370 struct cftype *cft,
1371 struct file *file,
1372 char __user *buf,
1373 size_t nbytes, loff_t *ppos)
1396{ 1374{
1397 struct cftype *cft = __d_cft(file->f_path.dentry); 1375 struct cpuset *cs = cgroup_cs(cont);
1398 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1399 cpuset_filetype_t type = cft->private; 1376 cpuset_filetype_t type = cft->private;
1400 char *page; 1377 char *page;
1401 ssize_t retval = 0; 1378 ssize_t retval = 0;
@@ -1419,8 +1396,8 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1419 case FILE_MEM_EXCLUSIVE: 1396 case FILE_MEM_EXCLUSIVE:
1420 *s++ = is_mem_exclusive(cs) ? '1' : '0'; 1397 *s++ = is_mem_exclusive(cs) ? '1' : '0';
1421 break; 1398 break;
1422 case FILE_NOTIFY_ON_RELEASE: 1399 case FILE_SCHED_LOAD_BALANCE:
1423 *s++ = notify_on_release(cs) ? '1' : '0'; 1400 *s++ = is_sched_load_balance(cs) ? '1' : '0';
1424 break; 1401 break;
1425 case FILE_MEMORY_MIGRATE: 1402 case FILE_MEMORY_MIGRATE:
1426 *s++ = is_memory_migrate(cs) ? '1' : '0'; 1403 *s++ = is_memory_migrate(cs) ? '1' : '0';
@@ -1449,390 +1426,150 @@ out:
1449 return retval; 1426 return retval;
1450} 1427}
1451 1428
1452static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbytes,
1453 loff_t *ppos)
1454{
1455 ssize_t retval = 0;
1456 struct cftype *cft = __d_cft(file->f_path.dentry);
1457 if (!cft)
1458 return -ENODEV;
1459
1460 /* special function ? */
1461 if (cft->read)
1462 retval = cft->read(file, buf, nbytes, ppos);
1463 else
1464 retval = cpuset_common_file_read(file, buf, nbytes, ppos);
1465
1466 return retval;
1467}
1468
1469static int cpuset_file_open(struct inode *inode, struct file *file)
1470{
1471 int err;
1472 struct cftype *cft;
1473
1474 err = generic_file_open(inode, file);
1475 if (err)
1476 return err;
1477
1478 cft = __d_cft(file->f_path.dentry);
1479 if (!cft)
1480 return -ENODEV;
1481 if (cft->open)
1482 err = cft->open(inode, file);
1483 else
1484 err = 0;
1485
1486 return err;
1487}
1488
1489static int cpuset_file_release(struct inode *inode, struct file *file)
1490{
1491 struct cftype *cft = __d_cft(file->f_path.dentry);
1492 if (cft->release)
1493 return cft->release(inode, file);
1494 return 0;
1495}
1496
1497/*
1498 * cpuset_rename - Only allow simple rename of directories in place.
1499 */
1500static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
1501 struct inode *new_dir, struct dentry *new_dentry)
1502{
1503 if (!S_ISDIR(old_dentry->d_inode->i_mode))
1504 return -ENOTDIR;
1505 if (new_dentry->d_inode)
1506 return -EEXIST;
1507 if (old_dir != new_dir)
1508 return -EIO;
1509 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1510}
1511
1512static const struct file_operations cpuset_file_operations = {
1513 .read = cpuset_file_read,
1514 .write = cpuset_file_write,
1515 .llseek = generic_file_llseek,
1516 .open = cpuset_file_open,
1517 .release = cpuset_file_release,
1518};
1519
1520static const struct inode_operations cpuset_dir_inode_operations = {
1521 .lookup = simple_lookup,
1522 .mkdir = cpuset_mkdir,
1523 .rmdir = cpuset_rmdir,
1524 .rename = cpuset_rename,
1525};
1526
1527static int cpuset_create_file(struct dentry *dentry, int mode)
1528{
1529 struct inode *inode;
1530
1531 if (!dentry)
1532 return -ENOENT;
1533 if (dentry->d_inode)
1534 return -EEXIST;
1535
1536 inode = cpuset_new_inode(mode);
1537 if (!inode)
1538 return -ENOMEM;
1539
1540 if (S_ISDIR(mode)) {
1541 inode->i_op = &cpuset_dir_inode_operations;
1542 inode->i_fop = &simple_dir_operations;
1543
1544 /* start off with i_nlink == 2 (for "." entry) */
1545 inc_nlink(inode);
1546 } else if (S_ISREG(mode)) {
1547 inode->i_size = 0;
1548 inode->i_fop = &cpuset_file_operations;
1549 }
1550
1551 d_instantiate(dentry, inode);
1552 dget(dentry); /* Extra count - pin the dentry in core */
1553 return 0;
1554}
1555
1556/*
1557 * cpuset_create_dir - create a directory for an object.
1558 * cs: the cpuset we create the directory for.
1559 * It must have a valid ->parent field
1560 * And we are going to fill its ->dentry field.
1561 * name: The name to give to the cpuset directory. Will be copied.
1562 * mode: mode to set on new directory.
1563 */
1564
1565static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
1566{
1567 struct dentry *dentry = NULL;
1568 struct dentry *parent;
1569 int error = 0;
1570
1571 parent = cs->parent->dentry;
1572 dentry = cpuset_get_dentry(parent, name);
1573 if (IS_ERR(dentry))
1574 return PTR_ERR(dentry);
1575 error = cpuset_create_file(dentry, S_IFDIR | mode);
1576 if (!error) {
1577 dentry->d_fsdata = cs;
1578 inc_nlink(parent->d_inode);
1579 cs->dentry = dentry;
1580 }
1581 dput(dentry);
1582
1583 return error;
1584}
1585
1586static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
1587{
1588 struct dentry *dentry;
1589 int error;
1590
1591 mutex_lock(&dir->d_inode->i_mutex);
1592 dentry = cpuset_get_dentry(dir, cft->name);
1593 if (!IS_ERR(dentry)) {
1594 error = cpuset_create_file(dentry, 0644 | S_IFREG);
1595 if (!error)
1596 dentry->d_fsdata = (void *)cft;
1597 dput(dentry);
1598 } else
1599 error = PTR_ERR(dentry);
1600 mutex_unlock(&dir->d_inode->i_mutex);
1601 return error;
1602}
1603
1604/*
1605 * Stuff for reading the 'tasks' file.
1606 *
1607 * Reading this file can return large amounts of data if a cpuset has
1608 * *lots* of attached tasks. So it may need several calls to read(),
1609 * but we cannot guarantee that the information we produce is correct
1610 * unless we produce it entirely atomically.
1611 *
1612 * Upon tasks file open(), a struct ctr_struct is allocated, that
1613 * will have a pointer to an array (also allocated here). The struct
1614 * ctr_struct * is stored in file->private_data. Its resources will
1615 * be freed by release() when the file is closed. The array is used
1616 * to sprintf the PIDs and then used by read().
1617 */
1618
1619/* cpusets_tasks_read array */
1620
1621struct ctr_struct {
1622 char *buf;
1623 int bufsz;
1624};
1625
1626/*
1627 * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
1628 * Return actual number of pids loaded. No need to task_lock(p)
1629 * when reading out p->cpuset, as we don't really care if it changes
1630 * on the next cycle, and we are not going to try to dereference it.
1631 */
1632static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
1633{
1634 int n = 0;
1635 struct task_struct *g, *p;
1636
1637 read_lock(&tasklist_lock);
1638
1639 do_each_thread(g, p) {
1640 if (p->cpuset == cs) {
1641 if (unlikely(n == npids))
1642 goto array_full;
1643 pidarray[n++] = p->pid;
1644 }
1645 } while_each_thread(g, p);
1646
1647array_full:
1648 read_unlock(&tasklist_lock);
1649 return n;
1650}
1651
1652static int cmppid(const void *a, const void *b)
1653{
1654 return *(pid_t *)a - *(pid_t *)b;
1655}
1656
1657/*
1658 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
1659 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
1660 * count 'cnt' of how many chars would be written if buf were large enough.
1661 */
1662static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1663{
1664 int cnt = 0;
1665 int i;
1666
1667 for (i = 0; i < npids; i++)
1668 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
1669 return cnt;
1670}
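
The tasks-file code being removed here relies on a two-pass formatter: call it once only to measure, then again to fill. A userspace sketch of the same trick; ids_to_buf() is invented, and the measuring pass uses the NULL/0 form that userspace snprintf() permits:

#include <stdio.h>
#include <stdlib.h>

static int ids_to_buf(char *buf, int sz, const int *ids, int n)
{
        int cnt = 0, i;

        for (i = 0; i < n; i++) {
                int room = cnt < sz ? sz - cnt : 0;

                cnt += snprintf(room ? buf + cnt : NULL, room, "%d\n", ids[i]);
        }
        return cnt;             /* chars needed, even if buf was too small */
}

int main(void)
{
        int ids[] = { 314, 27, 9 };
        int n = sizeof(ids) / sizeof(ids[0]);
        char *buf;
        int bufsz;

        bufsz = ids_to_buf(NULL, 0, ids, n) + 1;        /* measuring pass */
        buf = malloc(bufsz);
        if (!buf)
                return 1;
        ids_to_buf(buf, bufsz, ids, n);                 /* real pass */
        fputs(buf, stdout);
        free(buf);
        return 0;
}
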
1671
1672/*
1673 * Handle an open on 'tasks' file. Prepare a buffer listing the
1674 * process id's of tasks currently attached to the cpuset being opened.
1675 *
1676 * Does not require any specific cpuset mutexes, and does not take any.
1677 */
1678static int cpuset_tasks_open(struct inode *unused, struct file *file)
1679{
1680 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1681 struct ctr_struct *ctr;
1682 pid_t *pidarray;
1683 int npids;
1684 char c;
1685
1686 if (!(file->f_mode & FMODE_READ))
1687 return 0;
1688
1689 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
1690 if (!ctr)
1691 goto err0;
1692
1693 /*
1694 * If cpuset gets more users after we read count, we won't have
1695 * enough space - tough. This race is indistinguishable to the
1696 * caller from the case that the additional cpuset users didn't
1697 * show up until sometime later on.
1698 */
1699 npids = atomic_read(&cs->count);
1700 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
1701 if (!pidarray)
1702 goto err1;
1703
1704 npids = pid_array_load(pidarray, npids, cs);
1705 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
1706
1707 /* Call pid_array_to_buf() twice, first just to get bufsz */
1708 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
1709 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
1710 if (!ctr->buf)
1711 goto err2;
1712 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
1713
1714 kfree(pidarray);
1715 file->private_data = ctr;
1716 return 0;
1717
1718err2:
1719 kfree(pidarray);
1720err1:
1721 kfree(ctr);
1722err0:
1723 return -ENOMEM;
1724}
1725
1726static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
1727 size_t nbytes, loff_t *ppos)
1728{
1729 struct ctr_struct *ctr = file->private_data;
1730 1429
1731 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1732}
1733 1430
1734static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
1735{
1736 struct ctr_struct *ctr;
1737 1431
1738 if (file->f_mode & FMODE_READ) {
1739 ctr = file->private_data;
1740 kfree(ctr->buf);
1741 kfree(ctr);
1742 }
1743 return 0;
1744}
1745 1432
1746/* 1433/*
1747 * for the common functions, 'private' gives the type of file 1434 * for the common functions, 'private' gives the type of file
1748 */ 1435 */
1749 1436
1750static struct cftype cft_tasks = {
1751 .name = "tasks",
1752 .open = cpuset_tasks_open,
1753 .read = cpuset_tasks_read,
1754 .release = cpuset_tasks_release,
1755 .private = FILE_TASKLIST,
1756};
1757
1758static struct cftype cft_cpus = { 1437static struct cftype cft_cpus = {
1759 .name = "cpus", 1438 .name = "cpus",
1439 .read = cpuset_common_file_read,
1440 .write = cpuset_common_file_write,
1760 .private = FILE_CPULIST, 1441 .private = FILE_CPULIST,
1761}; 1442};
1762 1443
1763static struct cftype cft_mems = { 1444static struct cftype cft_mems = {
1764 .name = "mems", 1445 .name = "mems",
1446 .read = cpuset_common_file_read,
1447 .write = cpuset_common_file_write,
1765 .private = FILE_MEMLIST, 1448 .private = FILE_MEMLIST,
1766}; 1449};
1767 1450
1768static struct cftype cft_cpu_exclusive = { 1451static struct cftype cft_cpu_exclusive = {
1769 .name = "cpu_exclusive", 1452 .name = "cpu_exclusive",
1453 .read = cpuset_common_file_read,
1454 .write = cpuset_common_file_write,
1770 .private = FILE_CPU_EXCLUSIVE, 1455 .private = FILE_CPU_EXCLUSIVE,
1771}; 1456};
1772 1457
1773static struct cftype cft_mem_exclusive = { 1458static struct cftype cft_mem_exclusive = {
1774 .name = "mem_exclusive", 1459 .name = "mem_exclusive",
1460 .read = cpuset_common_file_read,
1461 .write = cpuset_common_file_write,
1775 .private = FILE_MEM_EXCLUSIVE, 1462 .private = FILE_MEM_EXCLUSIVE,
1776}; 1463};
1777 1464
1778static struct cftype cft_notify_on_release = { 1465static struct cftype cft_sched_load_balance = {
1779 .name = "notify_on_release", 1466 .name = "sched_load_balance",
1780 .private = FILE_NOTIFY_ON_RELEASE, 1467 .read = cpuset_common_file_read,
1468 .write = cpuset_common_file_write,
1469 .private = FILE_SCHED_LOAD_BALANCE,
1781}; 1470};
1782 1471
1783static struct cftype cft_memory_migrate = { 1472static struct cftype cft_memory_migrate = {
1784 .name = "memory_migrate", 1473 .name = "memory_migrate",
1474 .read = cpuset_common_file_read,
1475 .write = cpuset_common_file_write,
1785 .private = FILE_MEMORY_MIGRATE, 1476 .private = FILE_MEMORY_MIGRATE,
1786}; 1477};
1787 1478
1788static struct cftype cft_memory_pressure_enabled = { 1479static struct cftype cft_memory_pressure_enabled = {
1789 .name = "memory_pressure_enabled", 1480 .name = "memory_pressure_enabled",
1481 .read = cpuset_common_file_read,
1482 .write = cpuset_common_file_write,
1790 .private = FILE_MEMORY_PRESSURE_ENABLED, 1483 .private = FILE_MEMORY_PRESSURE_ENABLED,
1791}; 1484};
1792 1485
1793static struct cftype cft_memory_pressure = { 1486static struct cftype cft_memory_pressure = {
1794 .name = "memory_pressure", 1487 .name = "memory_pressure",
1488 .read = cpuset_common_file_read,
1489 .write = cpuset_common_file_write,
1795 .private = FILE_MEMORY_PRESSURE, 1490 .private = FILE_MEMORY_PRESSURE,
1796}; 1491};
1797 1492
1798static struct cftype cft_spread_page = { 1493static struct cftype cft_spread_page = {
1799 .name = "memory_spread_page", 1494 .name = "memory_spread_page",
1495 .read = cpuset_common_file_read,
1496 .write = cpuset_common_file_write,
1800 .private = FILE_SPREAD_PAGE, 1497 .private = FILE_SPREAD_PAGE,
1801}; 1498};
1802 1499
1803static struct cftype cft_spread_slab = { 1500static struct cftype cft_spread_slab = {
1804 .name = "memory_spread_slab", 1501 .name = "memory_spread_slab",
1502 .read = cpuset_common_file_read,
1503 .write = cpuset_common_file_write,
1805 .private = FILE_SPREAD_SLAB, 1504 .private = FILE_SPREAD_SLAB,
1806}; 1505};
1807 1506
1808static int cpuset_populate_dir(struct dentry *cs_dentry) 1507static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1809{ 1508{
1810 int err; 1509 int err;
1811 1510
1812 if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0) 1511 if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0)
1813 return err; 1512 return err;
1814 if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0) 1513 if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
1815 return err; 1514 return err;
1816 if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0) 1515 if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
1817 return err; 1516 return err;
1818 if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0) 1517 if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
1819 return err; 1518 return err;
1820 if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) 1519 if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
1821 return err; 1520 return err;
1822 if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) 1521 if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
1823 return err; 1522 return err;
1824 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) 1523 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1825 return err; 1524 return err;
1826 if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0) 1525 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
1827 return err; 1526 return err;
1828 if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0) 1527 if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
1829 return err;
1830 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
1831 return err; 1528 return err;
1529 /* memory_pressure_enabled is in root cpuset only */
1530 if (err == 0 && !cont->parent)
1531 err = cgroup_add_file(cont, ss,
1532 &cft_memory_pressure_enabled);
1832 return 0; 1533 return 0;
1833} 1534}
1834 1535
1835/* 1536/*
1537 * post_clone() is called at the end of cgroup_clone().
1538 * 'cgroup' was just created automatically as a result of
1539 * a cgroup_clone(), and the current task is about to
1540 * be moved into 'cgroup'.
1541 *
1542 * Currently we refuse to set up the cgroup - thereby
1543 * refusing the task to be entered, and as a result refusing
1544 * the sys_unshare() or clone() which initiated it - if any
1545 * sibling cpusets have exclusive cpus or mem.
1546 *
1547 * If this becomes a problem for some users who wish to
1548 * allow that scenario, then cpuset_post_clone() could be
1549 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1550 * (and likewise for mems) to the new cgroup.
1551 */
1552static void cpuset_post_clone(struct cgroup_subsys *ss,
1553 struct cgroup *cgroup)
1554{
1555 struct cgroup *parent, *child;
1556 struct cpuset *cs, *parent_cs;
1557
1558 parent = cgroup->parent;
1559 list_for_each_entry(child, &parent->children, sibling) {
1560 cs = cgroup_cs(child);
1561 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1562 return;
1563 }
1564 cs = cgroup_cs(cgroup);
1565 parent_cs = cgroup_cs(parent);
1566
1567 cs->mems_allowed = parent_cs->mems_allowed;
1568 cs->cpus_allowed = parent_cs->cpus_allowed;
1569 return;
1570}
1571
1572/*
1836 * cpuset_create - create a cpuset 1573 * cpuset_create - create a cpuset
1837 * parent: cpuset that will be parent of the new cpuset. 1574 * parent: cpuset that will be parent of the new cpuset.
1838 * name: name of the new cpuset. Will be strcpy'ed. 1575 * name: name of the new cpuset. Will be strcpy'ed.
@@ -1841,106 +1578,77 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1841 * Must be called with the mutex on the parent inode held 1578 * Must be called with the mutex on the parent inode held
1842 */ 1579 */
1843 1580
1844static long cpuset_create(struct cpuset *parent, const char *name, int mode) 1581static struct cgroup_subsys_state *cpuset_create(
1582 struct cgroup_subsys *ss,
1583 struct cgroup *cont)
1845{ 1584{
1846 struct cpuset *cs; 1585 struct cpuset *cs;
1847 int err; 1586 struct cpuset *parent;
1848 1587
1588 if (!cont->parent) {
1589 /* This is early initialization for the top cgroup */
1590 top_cpuset.mems_generation = cpuset_mems_generation++;
1591 return &top_cpuset.css;
1592 }
1593 parent = cgroup_cs(cont->parent);
1849 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1594 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1850 if (!cs) 1595 if (!cs)
1851 return -ENOMEM; 1596 return ERR_PTR(-ENOMEM);
1852 1597
1853 mutex_lock(&manage_mutex);
1854 cpuset_update_task_memory_state(); 1598 cpuset_update_task_memory_state();
1855 cs->flags = 0; 1599 cs->flags = 0;
1856 if (notify_on_release(parent))
1857 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
1858 if (is_spread_page(parent)) 1600 if (is_spread_page(parent))
1859 set_bit(CS_SPREAD_PAGE, &cs->flags); 1601 set_bit(CS_SPREAD_PAGE, &cs->flags);
1860 if (is_spread_slab(parent)) 1602 if (is_spread_slab(parent))
1861 set_bit(CS_SPREAD_SLAB, &cs->flags); 1603 set_bit(CS_SPREAD_SLAB, &cs->flags);
1604 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1862 cs->cpus_allowed = CPU_MASK_NONE; 1605 cs->cpus_allowed = CPU_MASK_NONE;
1863 cs->mems_allowed = NODE_MASK_NONE; 1606 cs->mems_allowed = NODE_MASK_NONE;
1864 atomic_set(&cs->count, 0);
1865 INIT_LIST_HEAD(&cs->sibling);
1866 INIT_LIST_HEAD(&cs->children);
1867 cs->mems_generation = cpuset_mems_generation++; 1607 cs->mems_generation = cpuset_mems_generation++;
1868 fmeter_init(&cs->fmeter); 1608 fmeter_init(&cs->fmeter);
1869 1609
1870 cs->parent = parent; 1610 cs->parent = parent;
1871
1872 mutex_lock(&callback_mutex);
1873 list_add(&cs->sibling, &cs->parent->children);
1874 number_of_cpusets++; 1611 number_of_cpusets++;
1875 mutex_unlock(&callback_mutex); 1612 return &cs->css;
1876
1877 err = cpuset_create_dir(cs, name, mode);
1878 if (err < 0)
1879 goto err;
1880
1881 /*
1882 * Release manage_mutex before cpuset_populate_dir() because it
1883 * will down() this new directory's i_mutex and if we race with
1884 * another mkdir, we might deadlock.
1885 */
1886 mutex_unlock(&manage_mutex);
1887
1888 err = cpuset_populate_dir(cs->dentry);
1889 /* If err < 0, we have a half-filled directory - oh well ;) */
1890 return 0;
1891err:
1892 list_del(&cs->sibling);
1893 mutex_unlock(&manage_mutex);
1894 kfree(cs);
1895 return err;
1896} 1613}
1897 1614
1898static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) 1615/*
1899{ 1616 * Locking note on the strange update_flag() call below:
1900 struct cpuset *c_parent = dentry->d_parent->d_fsdata; 1617 *
1901 1618 * If the cpuset being removed has its flag 'sched_load_balance'
1902 /* the vfs holds inode->i_mutex already */ 1619 * enabled, then simulate turning sched_load_balance off, which
1903 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); 1620 * will call rebuild_sched_domains(). The lock_cpu_hotplug()
1904} 1621 * call in rebuild_sched_domains() must not be made while holding
1622 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1623 * lock_cpu_hotplug() calls. So the reverse nesting would risk an
1624 * ABBA deadlock.
1625 */
1905 1626
1906static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) 1627static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1907{ 1628{
1908 struct cpuset *cs = dentry->d_fsdata; 1629 struct cpuset *cs = cgroup_cs(cont);
1909 struct dentry *d;
1910 struct cpuset *parent;
1911 char *pathbuf = NULL;
1912 1630
1913 /* the vfs holds both inode->i_mutex already */
1914
1915 mutex_lock(&manage_mutex);
1916 cpuset_update_task_memory_state(); 1631 cpuset_update_task_memory_state();
1917 if (atomic_read(&cs->count) > 0) { 1632
1918 mutex_unlock(&manage_mutex); 1633 if (is_sched_load_balance(cs))
1919 return -EBUSY; 1634 update_flag(CS_SCHED_LOAD_BALANCE, cs, "0");
1920 } 1635
1921 if (!list_empty(&cs->children)) {
1922 mutex_unlock(&manage_mutex);
1923 return -EBUSY;
1924 }
1925 parent = cs->parent;
1926 mutex_lock(&callback_mutex);
1927 set_bit(CS_REMOVED, &cs->flags);
1928 list_del(&cs->sibling); /* delete my sibling from parent->children */
1929 spin_lock(&cs->dentry->d_lock);
1930 d = dget(cs->dentry);
1931 cs->dentry = NULL;
1932 spin_unlock(&d->d_lock);
1933 cpuset_d_remove_dir(d);
1934 dput(d);
1935 number_of_cpusets--; 1636 number_of_cpusets--;
1936 mutex_unlock(&callback_mutex); 1637 kfree(cs);
1937 if (list_empty(&parent->children))
1938 check_for_release(parent, &pathbuf);
1939 mutex_unlock(&manage_mutex);
1940 cpuset_release_agent(pathbuf);
1941 return 0;
1942} 1638}
1943 1639
1640struct cgroup_subsys cpuset_subsys = {
1641 .name = "cpuset",
1642 .create = cpuset_create,
1643 .destroy = cpuset_destroy,
1644 .can_attach = cpuset_can_attach,
1645 .attach = cpuset_attach,
1646 .populate = cpuset_populate,
1647 .post_clone = cpuset_post_clone,
1648 .subsys_id = cpuset_subsys_id,
1649 .early_init = 1,
1650};
1651
1944/* 1652/*
1945 * cpuset_init_early - just enough so that the calls to 1653 * cpuset_init_early - just enough so that the calls to
1946 * cpuset_update_task_memory_state() in early init code 1654 * cpuset_update_task_memory_state() in early init code
@@ -1949,13 +1657,11 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1949 1657
1950int __init cpuset_init_early(void) 1658int __init cpuset_init_early(void)
1951{ 1659{
1952 struct task_struct *tsk = current; 1660 top_cpuset.mems_generation = cpuset_mems_generation++;
1953
1954 tsk->cpuset = &top_cpuset;
1955 tsk->cpuset->mems_generation = cpuset_mems_generation++;
1956 return 0; 1661 return 0;
1957} 1662}
1958 1663
1664
1959/** 1665/**
1960 * cpuset_init - initialize cpusets at system boot 1666 * cpuset_init - initialize cpusets at system boot
1961 * 1667 *
@@ -1964,39 +1670,21 @@ int __init cpuset_init_early(void)
1964 1670
1965int __init cpuset_init(void) 1671int __init cpuset_init(void)
1966{ 1672{
1967 struct dentry *root; 1673 int err = 0;
1968 int err;
1969 1674
1970 top_cpuset.cpus_allowed = CPU_MASK_ALL; 1675 top_cpuset.cpus_allowed = CPU_MASK_ALL;
1971 top_cpuset.mems_allowed = NODE_MASK_ALL; 1676 top_cpuset.mems_allowed = NODE_MASK_ALL;
1972 1677
1973 fmeter_init(&top_cpuset.fmeter); 1678 fmeter_init(&top_cpuset.fmeter);
1974 top_cpuset.mems_generation = cpuset_mems_generation++; 1679 top_cpuset.mems_generation = cpuset_mems_generation++;
1975 1680 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1976 init_task.cpuset = &top_cpuset;
1977 1681
1978 err = register_filesystem(&cpuset_fs_type); 1682 err = register_filesystem(&cpuset_fs_type);
1979 if (err < 0) 1683 if (err < 0)
1980 goto out; 1684 return err;
1981 cpuset_mount = kern_mount(&cpuset_fs_type); 1685
1982 if (IS_ERR(cpuset_mount)) {
1983 printk(KERN_ERR "cpuset: could not mount!\n");
1984 err = PTR_ERR(cpuset_mount);
1985 cpuset_mount = NULL;
1986 goto out;
1987 }
1988 root = cpuset_mount->mnt_sb->s_root;
1989 root->d_fsdata = &top_cpuset;
1990 inc_nlink(root->d_inode);
1991 top_cpuset.dentry = root;
1992 root->d_inode->i_op = &cpuset_dir_inode_operations;
1993 number_of_cpusets = 1; 1686 number_of_cpusets = 1;
1994 err = cpuset_populate_dir(root); 1687 return 0;
1995 /* memory_pressure_enabled is in root cpuset only */
1996 if (err == 0)
1997 err = cpuset_add_file(root, &cft_memory_pressure_enabled);
1998out:
1999 return err;
2000} 1688}
2001 1689
2002/* 1690/*
@@ -2022,10 +1710,12 @@ out:
2022 1710
2023static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) 1711static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
2024{ 1712{
1713 struct cgroup *cont;
2025 struct cpuset *c; 1714 struct cpuset *c;
2026 1715
2027 /* Each of our child cpusets' mems must be online */ 1716
2028 list_for_each_entry(c, &cur->children, sibling) { 1717 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
1718 c = cgroup_cs(cont);
2029 guarantee_online_cpus_mems_in_subtree(c); 1719 guarantee_online_cpus_mems_in_subtree(c);
2030 if (!cpus_empty(c->cpus_allowed)) 1720 if (!cpus_empty(c->cpus_allowed))
2031 guarantee_online_cpus(c, &c->cpus_allowed); 1721 guarantee_online_cpus(c, &c->cpus_allowed);
@@ -2053,7 +1743,7 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
2053 1743
2054static void common_cpu_mem_hotplug_unplug(void) 1744static void common_cpu_mem_hotplug_unplug(void)
2055{ 1745{
2056 mutex_lock(&manage_mutex); 1746 cgroup_lock();
2057 mutex_lock(&callback_mutex); 1747 mutex_lock(&callback_mutex);
2058 1748
2059 guarantee_online_cpus_mems_in_subtree(&top_cpuset); 1749 guarantee_online_cpus_mems_in_subtree(&top_cpuset);
@@ -2061,7 +1751,7 @@ static void common_cpu_mem_hotplug_unplug(void)
2061 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 1751 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2062 1752
2063 mutex_unlock(&callback_mutex); 1753 mutex_unlock(&callback_mutex);
2064 mutex_unlock(&manage_mutex); 1754 cgroup_unlock();
2065} 1755}
2066 1756
2067/* 1757/*
@@ -2074,8 +1764,8 @@ static void common_cpu_mem_hotplug_unplug(void)
2074 * cpu_online_map on each CPU hotplug (cpuhp) event. 1764 * cpu_online_map on each CPU hotplug (cpuhp) event.
2075 */ 1765 */
2076 1766
2077static int cpuset_handle_cpuhp(struct notifier_block *nb, 1767static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
2078 unsigned long phase, void *cpu) 1768 unsigned long phase, void *unused_cpu)
2079{ 1769{
2080 if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) 1770 if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
2081 return NOTIFY_DONE; 1771 return NOTIFY_DONE;
@@ -2113,109 +1803,7 @@ void __init cpuset_init_smp(void)
2113} 1803}
2114 1804
2115/** 1805/**
2116 * cpuset_fork - attach newly forked task to its parent's cpuset.
2117 * @tsk: pointer to task_struct of forking parent process.
2118 *
2119 * Description: A task inherits its parent's cpuset at fork().
2120 *
2121 * A pointer to the shared cpuset was automatically copied in fork.c
2122 * by dup_task_struct(). However, we ignore that copy, since it was
2123 * not made under the protection of task_lock(), so might no longer be
2124 * a valid cpuset pointer. attach_task() might have already changed
2125 * current->cpuset, allowing the previously referenced cpuset to
2126 * be removed and freed. Instead, we task_lock(current) and copy
2127 * its present value of current->cpuset for our freshly forked child.
2128 *
2129 * At the point that cpuset_fork() is called, 'current' is the parent
2130 * task, and the passed argument 'child' points to the child task.
2131 **/
2132
2133void cpuset_fork(struct task_struct *child)
2134{
2135 task_lock(current);
2136 child->cpuset = current->cpuset;
2137 atomic_inc(&child->cpuset->count);
2138 task_unlock(current);
2139}
2140
2141/**
2142 * cpuset_exit - detach cpuset from exiting task
2143 * @tsk: pointer to task_struct of exiting process
2144 *
2145 * Description: Detach cpuset from @tsk and release it.
2146 *
2147 * Note that cpusets marked notify_on_release force every task in
2148 * them to take the global manage_mutex mutex when exiting.
2149 * This could impact scaling on very large systems. Be reluctant to
2150 * use notify_on_release cpusets where very high task exit scaling
2151 * is required on large systems.
2152 *
2153 * Don't even think about dereferencing 'cs' after the cpuset use count
2154 * goes to zero, except inside a critical section guarded by manage_mutex
2155 * or callback_mutex. Otherwise a zero cpuset use count is a license to
2156 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
2157 *
2158 * This routine has to take manage_mutex, not callback_mutex, because
2159 * it is holding that mutex while calling check_for_release(),
2160 * which calls kmalloc(), so can't be called holding callback_mutex().
2161 *
2162 * the_top_cpuset_hack:
2163 *
2164 * Set the exiting task's cpuset to the root cpuset (top_cpuset).
2165 *
2166 * Don't leave a task unable to allocate memory, as that is an
2167 * accident waiting to happen should someone add a callout in
2168 * do_exit() after the cpuset_exit() call that might allocate.
2169 * If a task tries to allocate memory with an invalid cpuset,
2170 * it will oops in cpuset_update_task_memory_state().
2171 *
2172 * We call cpuset_exit() while the task is still competent to
2173 * handle notify_on_release(), then leave the task attached to
2174 * the root cpuset (top_cpuset) for the remainder of its exit.
2175 *
2176 * To do this properly, we would increment the reference count on
2177 * top_cpuset, and near the very end of the kernel/exit.c do_exit()
2178 * code we would add a second cpuset function call, to drop that
2179 * reference. This would just create an unnecessary hot spot on
2180 * the top_cpuset reference count, to no avail.
2181 *
2182 * Normally, holding a reference to a cpuset without bumping its
2183 * count is unsafe. The cpuset could go away, or someone could
2184 * attach us to a different cpuset, decrementing the count on
2185 * the first cpuset that we never incremented. But in this case,
2186 * top_cpuset isn't going away, and either task has PF_EXITING set,
2187 * which wards off any attach_task() attempts, or task is a failed
2188 * fork, never visible to attach_task.
2189 *
2190 * Another way to do this would be to set the cpuset pointer
2191 * to NULL here, and check in cpuset_update_task_memory_state()
2192 * for a NULL pointer. This hack avoids that NULL check, for no
2193 * cost (other than this way too long comment ;).
2194 **/
2195 1806
2196void cpuset_exit(struct task_struct *tsk)
2197{
2198 struct cpuset *cs;
2199
2200 task_lock(current);
2201 cs = tsk->cpuset;
2202 tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
2203 task_unlock(current);
2204
2205 if (notify_on_release(cs)) {
2206 char *pathbuf = NULL;
2207
2208 mutex_lock(&manage_mutex);
2209 if (atomic_dec_and_test(&cs->count))
2210 check_for_release(cs, &pathbuf);
2211 mutex_unlock(&manage_mutex);
2212 cpuset_release_agent(pathbuf);
2213 } else {
2214 atomic_dec(&cs->count);
2215 }
2216}
2217
2218/**
2219 * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset. 1807
2220 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 1808 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
2221 * 1809 *
@@ -2230,10 +1818,23 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
2230 cpumask_t mask; 1818 cpumask_t mask;
2231 1819
2232 mutex_lock(&callback_mutex); 1820 mutex_lock(&callback_mutex);
1821 mask = cpuset_cpus_allowed_locked(tsk);
1822 mutex_unlock(&callback_mutex);
1823
1824 return mask;
1825}
1826
1827/**
 1828 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset.
1829 * Must be called with callback_mutex held.
1830 **/
1831cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk)
1832{
1833 cpumask_t mask;
1834
2233 task_lock(tsk); 1835 task_lock(tsk);
2234 guarantee_online_cpus(tsk->cpuset, &mask); 1836 guarantee_online_cpus(task_cs(tsk), &mask);
2235 task_unlock(tsk); 1837 task_unlock(tsk);
2236 mutex_unlock(&callback_mutex);
2237 1838
2238 return mask; 1839 return mask;
2239} 1840}
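For clarity, a hedged sketch of how the two helpers above are meant to be used (illustrative only, reusing names from this file; not part of the patch):

	cpumask_t mask;

	/* ordinary callers: the helper takes callback_mutex internally */
	mask = cpuset_cpus_allowed(tsk);

	/* callers already serialised by callback_mutex use the _locked form */
	mutex_lock(&callback_mutex);
	mask = cpuset_cpus_allowed_locked(tsk);
	mutex_unlock(&callback_mutex);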
@@ -2259,7 +1860,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2259 1860
2260 mutex_lock(&callback_mutex); 1861 mutex_lock(&callback_mutex);
2261 task_lock(tsk); 1862 task_lock(tsk);
2262 guarantee_online_mems(tsk->cpuset, &mask); 1863 guarantee_online_mems(task_cs(tsk), &mask);
2263 task_unlock(tsk); 1864 task_unlock(tsk);
2264 mutex_unlock(&callback_mutex); 1865 mutex_unlock(&callback_mutex);
2265 1866
@@ -2390,7 +1991,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2390 mutex_lock(&callback_mutex); 1991 mutex_lock(&callback_mutex);
2391 1992
2392 task_lock(current); 1993 task_lock(current);
2393 cs = nearest_exclusive_ancestor(current->cpuset); 1994 cs = nearest_exclusive_ancestor(task_cs(current));
2394 task_unlock(current); 1995 task_unlock(current);
2395 1996
2396 allowed = node_isset(node, cs->mems_allowed); 1997 allowed = node_isset(node, cs->mems_allowed);
@@ -2550,14 +2151,12 @@ int cpuset_memory_pressure_enabled __read_mostly;
2550 2151
2551void __cpuset_memory_pressure_bump(void) 2152void __cpuset_memory_pressure_bump(void)
2552{ 2153{
2553 struct cpuset *cs;
2554
2555 task_lock(current); 2154 task_lock(current);
2556 cs = current->cpuset; 2155 fmeter_markevent(&task_cs(current)->fmeter);
2557 fmeter_markevent(&cs->fmeter);
2558 task_unlock(current); 2156 task_unlock(current);
2559} 2157}
2560 2158
2159#ifdef CONFIG_PROC_PID_CPUSET
2561/* 2160/*
2562 * proc_cpuset_show() 2161 * proc_cpuset_show()
2563 * - Print task's cpuset path into seq_file. 2162
@@ -2569,11 +2168,12 @@ void __cpuset_memory_pressure_bump(void)
2569 * the_top_cpuset_hack in cpuset_exit(), which sets an exiting task's 2168
2570 * cpuset to top_cpuset. 2169 * cpuset to top_cpuset.
2571 */ 2170 */
2572static int proc_cpuset_show(struct seq_file *m, void *v) 2171static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2573{ 2172{
2574 struct pid *pid; 2173 struct pid *pid;
2575 struct task_struct *tsk; 2174 struct task_struct *tsk;
2576 char *buf; 2175 char *buf;
2176 struct cgroup_subsys_state *css;
2577 int retval; 2177 int retval;
2578 2178
2579 retval = -ENOMEM; 2179 retval = -ENOMEM;
@@ -2588,15 +2188,15 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2588 goto out_free; 2188 goto out_free;
2589 2189
2590 retval = -EINVAL; 2190 retval = -EINVAL;
2591 mutex_lock(&manage_mutex); 2191 cgroup_lock();
2592 2192 css = task_subsys_state(tsk, cpuset_subsys_id);
2593 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); 2193 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2594 if (retval < 0) 2194 if (retval < 0)
2595 goto out_unlock; 2195 goto out_unlock;
2596 seq_puts(m, buf); 2196 seq_puts(m, buf);
2597 seq_putc(m, '\n'); 2197 seq_putc(m, '\n');
2598out_unlock: 2198out_unlock:
2599 mutex_unlock(&manage_mutex); 2199 cgroup_unlock();
2600 put_task_struct(tsk); 2200 put_task_struct(tsk);
2601out_free: 2201out_free:
2602 kfree(buf); 2202 kfree(buf);
@@ -2616,6 +2216,7 @@ const struct file_operations proc_cpuset_operations = {
2616 .llseek = seq_lseek, 2216 .llseek = seq_lseek,
2617 .release = single_release, 2217 .release = single_release,
2618}; 2218};
2219#endif /* CONFIG_PROC_PID_CPUSET */
2619 2220
2620/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 2221/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
2621char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) 2222char *cpuset_task_status_allowed(struct task_struct *task, char *buffer)
diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c
deleted file mode 100644
index 0d98827887a7..000000000000
--- a/kernel/die_notifier.c
+++ /dev/null
@@ -1,38 +0,0 @@
1
2#include <linux/module.h>
3#include <linux/notifier.h>
4#include <linux/vmalloc.h>
5#include <linux/kdebug.h>
6
7
8static ATOMIC_NOTIFIER_HEAD(die_chain);
9
10int notify_die(enum die_val val, const char *str,
11 struct pt_regs *regs, long err, int trap, int sig)
12{
13 struct die_args args = {
14 .regs = regs,
15 .str = str,
16 .err = err,
17 .trapnr = trap,
18 .signr = sig,
19
20 };
21
22 return atomic_notifier_call_chain(&die_chain, val, &args);
23}
24
25int register_die_notifier(struct notifier_block *nb)
26{
27 vmalloc_sync_all();
28 return atomic_notifier_chain_register(&die_chain, nb);
29}
30EXPORT_SYMBOL_GPL(register_die_notifier);
31
32int unregister_die_notifier(struct notifier_block *nb)
33{
34 return atomic_notifier_chain_unregister(&die_chain, nb);
35}
36EXPORT_SYMBOL_GPL(unregister_die_notifier);
37
38
diff --git a/kernel/exit.c b/kernel/exit.c
index 2c704c86edb3..f1aec27f1df0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,7 +31,7 @@
31#include <linux/taskstats_kern.h> 31#include <linux/taskstats_kern.h>
32#include <linux/delayacct.h> 32#include <linux/delayacct.h>
33#include <linux/freezer.h> 33#include <linux/freezer.h>
34#include <linux/cpuset.h> 34#include <linux/cgroup.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/signal.h> 36#include <linux/signal.h>
37#include <linux/posix-timers.h> 37#include <linux/posix-timers.h>
@@ -148,6 +148,7 @@ void release_task(struct task_struct * p)
148 int zap_leader; 148 int zap_leader;
149repeat: 149repeat:
150 atomic_dec(&p->user->processes); 150 atomic_dec(&p->user->processes);
151 proc_flush_task(p);
151 write_lock_irq(&tasklist_lock); 152 write_lock_irq(&tasklist_lock);
152 ptrace_unlink(p); 153 ptrace_unlink(p);
153 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 154 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
@@ -175,7 +176,6 @@ repeat:
175 } 176 }
176 177
177 write_unlock_irq(&tasklist_lock); 178 write_unlock_irq(&tasklist_lock);
178 proc_flush_task(p);
179 release_thread(p); 179 release_thread(p);
180 call_rcu(&p->rcu, delayed_put_task_struct); 180 call_rcu(&p->rcu, delayed_put_task_struct);
181 181
@@ -221,7 +221,7 @@ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignor
221 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 221 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
222 if (p == ignored_task 222 if (p == ignored_task
223 || p->exit_state 223 || p->exit_state
224 || is_init(p->real_parent)) 224 || is_global_init(p->real_parent))
225 continue; 225 continue;
226 if (task_pgrp(p->real_parent) != pgrp && 226 if (task_pgrp(p->real_parent) != pgrp &&
227 task_session(p->real_parent) == task_session(p)) { 227 task_session(p->real_parent) == task_session(p)) {
@@ -299,14 +299,14 @@ void __set_special_pids(pid_t session, pid_t pgrp)
299{ 299{
300 struct task_struct *curr = current->group_leader; 300 struct task_struct *curr = current->group_leader;
301 301
302 if (process_session(curr) != session) { 302 if (task_session_nr(curr) != session) {
303 detach_pid(curr, PIDTYPE_SID); 303 detach_pid(curr, PIDTYPE_SID);
304 set_signal_session(curr->signal, session); 304 set_task_session(curr, session);
305 attach_pid(curr, PIDTYPE_SID, find_pid(session)); 305 attach_pid(curr, PIDTYPE_SID, find_pid(session));
306 } 306 }
307 if (process_group(curr) != pgrp) { 307 if (task_pgrp_nr(curr) != pgrp) {
308 detach_pid(curr, PIDTYPE_PGID); 308 detach_pid(curr, PIDTYPE_PGID);
309 curr->signal->pgrp = pgrp; 309 set_task_pgrp(curr, pgrp);
310 attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp)); 310 attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp));
311 } 311 }
312} 312}
@@ -400,11 +400,12 @@ void daemonize(const char *name, ...)
400 current->fs = fs; 400 current->fs = fs;
401 atomic_inc(&fs->count); 401 atomic_inc(&fs->count);
402 402
403 exit_task_namespaces(current); 403 if (current->nsproxy != init_task.nsproxy) {
404 current->nsproxy = init_task.nsproxy; 404 get_nsproxy(init_task.nsproxy);
405 get_task_namespaces(current); 405 switch_task_namespaces(current, init_task.nsproxy);
406 }
406 407
407 exit_files(current); 408 exit_files(current);
408 current->files = init_task.files; 409 current->files = init_task.files;
409 atomic_inc(&current->files->count); 410 atomic_inc(&current->files->count);
410 411
@@ -492,7 +493,7 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
492} 493}
493EXPORT_SYMBOL(reset_files_struct); 494EXPORT_SYMBOL(reset_files_struct);
494 495
495static inline void __exit_files(struct task_struct *tsk) 496static void __exit_files(struct task_struct *tsk)
496{ 497{
497 struct files_struct * files = tsk->files; 498 struct files_struct * files = tsk->files;
498 499
@@ -509,7 +510,7 @@ void exit_files(struct task_struct *tsk)
509 __exit_files(tsk); 510 __exit_files(tsk);
510} 511}
511 512
512static inline void __put_fs_struct(struct fs_struct *fs) 513static void __put_fs_struct(struct fs_struct *fs)
513{ 514{
514 /* No need to hold fs->lock if we are killing it */ 515 /* No need to hold fs->lock if we are killing it */
515 if (atomic_dec_and_test(&fs->count)) { 516 if (atomic_dec_and_test(&fs->count)) {
@@ -530,7 +531,7 @@ void put_fs_struct(struct fs_struct *fs)
530 __put_fs_struct(fs); 531 __put_fs_struct(fs);
531} 532}
532 533
533static inline void __exit_fs(struct task_struct *tsk) 534static void __exit_fs(struct task_struct *tsk)
534{ 535{
535 struct fs_struct * fs = tsk->fs; 536 struct fs_struct * fs = tsk->fs;
536 537
@@ -665,19 +666,22 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
665 * the child reaper process (ie "init") in our pid 666 * the child reaper process (ie "init") in our pid
666 * space. 667 * space.
667 */ 668 */
668static void 669static void forget_original_parent(struct task_struct *father)
669forget_original_parent(struct task_struct *father, struct list_head *to_release)
670{ 670{
671 struct task_struct *p, *reaper = father; 671 struct task_struct *p, *n, *reaper = father;
672 struct list_head *_p, *_n; 672 struct list_head ptrace_dead;
673
674 INIT_LIST_HEAD(&ptrace_dead);
675
676 write_lock_irq(&tasklist_lock);
673 677
674 do { 678 do {
675 reaper = next_thread(reaper); 679 reaper = next_thread(reaper);
676 if (reaper == father) { 680 if (reaper == father) {
677 reaper = child_reaper(father); 681 reaper = task_child_reaper(father);
678 break; 682 break;
679 } 683 }
680 } while (reaper->exit_state); 684 } while (reaper->flags & PF_EXITING);
681 685
682 /* 686 /*
683 * There are only two places where our children can be: 687 * There are only two places where our children can be:
@@ -687,9 +691,8 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
687 * 691 *
688 * Search them and reparent children. 692 * Search them and reparent children.
689 */ 693 */
690 list_for_each_safe(_p, _n, &father->children) { 694 list_for_each_entry_safe(p, n, &father->children, sibling) {
691 int ptrace; 695 int ptrace;
692 p = list_entry(_p, struct task_struct, sibling);
693 696
694 ptrace = p->ptrace; 697 ptrace = p->ptrace;
695 698
@@ -715,13 +718,23 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
715 * while it was being traced by us, to be able to see it in wait4. 718 * while it was being traced by us, to be able to see it in wait4.
716 */ 719 */
717 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) 720 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
718 list_add(&p->ptrace_list, to_release); 721 list_add(&p->ptrace_list, &ptrace_dead);
719 } 722 }
720 list_for_each_safe(_p, _n, &father->ptrace_children) { 723
721 p = list_entry(_p, struct task_struct, ptrace_list); 724 list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
722 p->real_parent = reaper; 725 p->real_parent = reaper;
723 reparent_thread(p, father, 1); 726 reparent_thread(p, father, 1);
724 } 727 }
728
729 write_unlock_irq(&tasklist_lock);
730 BUG_ON(!list_empty(&father->children));
731 BUG_ON(!list_empty(&father->ptrace_children));
732
733 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
734 list_del_init(&p->ptrace_list);
735 release_task(p);
736 }
737
725} 738}
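The rework above is an instance of a common pattern: gather work that must not be done under tasklist_lock onto a private list, then perform it after unlocking. A stripped-down sketch of that pattern, illustrative only rather than additional patch code:

	struct task_struct *p, *n;
	LIST_HEAD(ptrace_dead);

	write_lock_irq(&tasklist_lock);
	/* reparent children; queue doomed tracees on ptrace_dead instead
	 * of calling release_task() while the lock is held */
	write_unlock_irq(&tasklist_lock);

	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
		list_del_init(&p->ptrace_list);
		release_task(p);	/* safe: tasklist_lock is no longer held */
	}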
726 739
727/* 740/*
@@ -732,7 +745,6 @@ static void exit_notify(struct task_struct *tsk)
732{ 745{
733 int state; 746 int state;
734 struct task_struct *t; 747 struct task_struct *t;
735 struct list_head ptrace_dead, *_p, *_n;
736 struct pid *pgrp; 748 struct pid *pgrp;
737 749
738 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) 750 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
@@ -753,8 +765,6 @@ static void exit_notify(struct task_struct *tsk)
753 spin_unlock_irq(&tsk->sighand->siglock); 765 spin_unlock_irq(&tsk->sighand->siglock);
754 } 766 }
755 767
756 write_lock_irq(&tasklist_lock);
757
758 /* 768 /*
759 * This does two things: 769 * This does two things:
760 * 770 *
@@ -763,12 +773,10 @@ static void exit_notify(struct task_struct *tsk)
763 * as a result of our exiting, and if they have any stopped 773 * as a result of our exiting, and if they have any stopped
764 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 774 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
765 */ 775 */
776 forget_original_parent(tsk);
777 exit_task_namespaces(tsk);
766 778
767 INIT_LIST_HEAD(&ptrace_dead); 779 write_lock_irq(&tasklist_lock);
768 forget_original_parent(tsk, &ptrace_dead);
769 BUG_ON(!list_empty(&tsk->children));
770 BUG_ON(!list_empty(&tsk->ptrace_children));
771
772 /* 780 /*
773 * Check to see if any process groups have become orphaned 781 * Check to see if any process groups have become orphaned
774 * as a result of our exiting, and if they have any stopped 782 * as a result of our exiting, and if they have any stopped
@@ -792,7 +800,7 @@ static void exit_notify(struct task_struct *tsk)
792 /* Let father know we died 800 /* Let father know we died
793 * 801 *
794 * Thread signals are configurable, but you aren't going to use 802 * Thread signals are configurable, but you aren't going to use
795 * that to send signals to arbitrary processes. 803
796 * That stops right now. 804 * That stops right now.
797 * 805 *
798 * If the parent exec id doesn't match the exec id we saved 806 * If the parent exec id doesn't match the exec id we saved
@@ -833,12 +841,6 @@ static void exit_notify(struct task_struct *tsk)
833 841
834 write_unlock_irq(&tasklist_lock); 842 write_unlock_irq(&tasklist_lock);
835 843
836 list_for_each_safe(_p, _n, &ptrace_dead) {
837 list_del_init(_p);
838 t = list_entry(_p, struct task_struct, ptrace_list);
839 release_task(t);
840 }
841
842 /* If the process is dead, release it - nobody will wait for it */ 844 /* If the process is dead, release it - nobody will wait for it */
843 if (state == EXIT_DEAD) 845 if (state == EXIT_DEAD)
844 release_task(tsk); 846 release_task(tsk);
@@ -874,10 +876,35 @@ static inline void check_stack_usage(void) {}
874 876
875static inline void exit_child_reaper(struct task_struct *tsk) 877static inline void exit_child_reaper(struct task_struct *tsk)
876{ 878{
877 if (likely(tsk->group_leader != child_reaper(tsk))) 879 if (likely(tsk->group_leader != task_child_reaper(tsk)))
878 return; 880 return;
879 881
880 panic("Attempted to kill init!"); 882 if (tsk->nsproxy->pid_ns == &init_pid_ns)
883 panic("Attempted to kill init!");
884
885 /*
886 * @tsk is the last thread in the 'cgroup-init' and is exiting.
887 * Terminate all remaining processes in the namespace and reap them
888 * before exiting @tsk.
889 *
 890 * Note that @tsk (last thread of cgroup-init) may not necessarily
 891 * be the child-reaper (i.e. main thread of cgroup-init) of the
 892 * namespace, i.e. the child_reaper may have already exited.
 893 *
 894 * Even after a child_reaper exits, we let it inherit orphaned children,
 895 * because pid_ns->child_reaper remains valid as long as there is
 896 * at least one living sub-thread in the cgroup-init.
 897 *
 898 * This living sub-thread of the cgroup-init will be notified when
 899 * a child inherited by the 'child-reaper' exits (do_notify_parent()
 900 * uses __group_send_sig_info()). Further, when reaping child processes,
 901 * do_wait() iterates over children of all living sub-threads.
 902 *
 903 * I.e. even though the 'child_reaper' thread is listed as the parent of
 904 * the orphaned children, any living sub-thread in the cgroup-init can
 905 * perform the role of the child_reaper.
906 */
907 zap_pid_ns_processes(tsk->nsproxy->pid_ns);
881} 908}
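Condensed into the call path the comment above describes (a rough, illustrative summary, not patch code):

	/*
	 * do_exit()
	 *   exit_child_reaper(tsk)
	 *     pid_ns == &init_pid_ns   -> panic("Attempted to kill init!")
	 *     any other pid namespace  -> zap_pid_ns_processes(pid_ns):
	 *                                 terminate and reap whatever is left
	 *                                 in the namespace before tsk exits
	 */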
882 909
883fastcall NORET_TYPE void do_exit(long code) 910fastcall NORET_TYPE void do_exit(long code)
@@ -932,7 +959,7 @@ fastcall NORET_TYPE void do_exit(long code)
932 959
933 if (unlikely(in_atomic())) 960 if (unlikely(in_atomic()))
934 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 961 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
935 current->comm, current->pid, 962 current->comm, task_pid_nr(current),
936 preempt_count()); 963 preempt_count());
937 964
938 acct_update_integrals(tsk); 965 acct_update_integrals(tsk);
@@ -972,7 +999,7 @@ fastcall NORET_TYPE void do_exit(long code)
972 __exit_fs(tsk); 999 __exit_fs(tsk);
973 check_stack_usage(); 1000 check_stack_usage();
974 exit_thread(); 1001 exit_thread();
975 cpuset_exit(tsk); 1002 cgroup_exit(tsk, 1);
976 exit_keys(tsk); 1003 exit_keys(tsk);
977 1004
978 if (group_dead && tsk->signal->leader) 1005 if (group_dead && tsk->signal->leader)
@@ -983,7 +1010,6 @@ fastcall NORET_TYPE void do_exit(long code)
983 module_put(tsk->binfmt->module); 1010 module_put(tsk->binfmt->module);
984 1011
985 proc_exit_connector(tsk); 1012 proc_exit_connector(tsk);
986 exit_task_namespaces(tsk);
987 exit_notify(tsk); 1013 exit_notify(tsk);
988#ifdef CONFIG_NUMA 1014#ifdef CONFIG_NUMA
989 mpol_free(tsk->mempolicy); 1015 mpol_free(tsk->mempolicy);
@@ -1086,15 +1112,17 @@ asmlinkage void sys_exit_group(int error_code)
1086static int eligible_child(pid_t pid, int options, struct task_struct *p) 1112static int eligible_child(pid_t pid, int options, struct task_struct *p)
1087{ 1113{
1088 int err; 1114 int err;
1115 struct pid_namespace *ns;
1089 1116
1117 ns = current->nsproxy->pid_ns;
1090 if (pid > 0) { 1118 if (pid > 0) {
1091 if (p->pid != pid) 1119 if (task_pid_nr_ns(p, ns) != pid)
1092 return 0; 1120 return 0;
1093 } else if (!pid) { 1121 } else if (!pid) {
1094 if (process_group(p) != process_group(current)) 1122 if (task_pgrp_nr_ns(p, ns) != task_pgrp_vnr(current))
1095 return 0; 1123 return 0;
1096 } else if (pid != -1) { 1124 } else if (pid != -1) {
1097 if (process_group(p) != -pid) 1125 if (task_pgrp_nr_ns(p, ns) != -pid)
1098 return 0; 1126 return 0;
1099 } 1127 }
1100 1128
@@ -1164,9 +1192,12 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1164{ 1192{
1165 unsigned long state; 1193 unsigned long state;
1166 int retval, status, traced; 1194 int retval, status, traced;
1195 struct pid_namespace *ns;
1196
1197 ns = current->nsproxy->pid_ns;
1167 1198
1168 if (unlikely(noreap)) { 1199 if (unlikely(noreap)) {
1169 pid_t pid = p->pid; 1200 pid_t pid = task_pid_nr_ns(p, ns);
1170 uid_t uid = p->uid; 1201 uid_t uid = p->uid;
1171 int exit_code = p->exit_code; 1202 int exit_code = p->exit_code;
1172 int why, status; 1203 int why, status;
@@ -1285,11 +1316,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1285 retval = put_user(status, &infop->si_status); 1316 retval = put_user(status, &infop->si_status);
1286 } 1317 }
1287 if (!retval && infop) 1318 if (!retval && infop)
1288 retval = put_user(p->pid, &infop->si_pid); 1319 retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid);
1289 if (!retval && infop) 1320 if (!retval && infop)
1290 retval = put_user(p->uid, &infop->si_uid); 1321 retval = put_user(p->uid, &infop->si_uid);
1291 if (!retval) 1322 if (!retval)
1292 retval = p->pid; 1323 retval = task_pid_nr_ns(p, ns);
1293 1324
1294 if (traced) { 1325 if (traced) {
1295 write_lock_irq(&tasklist_lock); 1326 write_lock_irq(&tasklist_lock);
@@ -1326,6 +1357,7 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
1326 int __user *stat_addr, struct rusage __user *ru) 1357 int __user *stat_addr, struct rusage __user *ru)
1327{ 1358{
1328 int retval, exit_code; 1359 int retval, exit_code;
1360 struct pid_namespace *ns;
1329 1361
1330 if (!p->exit_code) 1362 if (!p->exit_code)
1331 return 0; 1363 return 0;
@@ -1344,11 +1376,12 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
1344 * keep holding onto the tasklist_lock while we call getrusage and 1376 * keep holding onto the tasklist_lock while we call getrusage and
1345 * possibly take page faults for user memory. 1377 * possibly take page faults for user memory.
1346 */ 1378 */
1379 ns = current->nsproxy->pid_ns;
1347 get_task_struct(p); 1380 get_task_struct(p);
1348 read_unlock(&tasklist_lock); 1381 read_unlock(&tasklist_lock);
1349 1382
1350 if (unlikely(noreap)) { 1383 if (unlikely(noreap)) {
1351 pid_t pid = p->pid; 1384 pid_t pid = task_pid_nr_ns(p, ns);
1352 uid_t uid = p->uid; 1385 uid_t uid = p->uid;
1353 int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; 1386 int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
1354 1387
@@ -1419,11 +1452,11 @@ bail_ref:
1419 if (!retval && infop) 1452 if (!retval && infop)
1420 retval = put_user(exit_code, &infop->si_status); 1453 retval = put_user(exit_code, &infop->si_status);
1421 if (!retval && infop) 1454 if (!retval && infop)
1422 retval = put_user(p->pid, &infop->si_pid); 1455 retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid);
1423 if (!retval && infop) 1456 if (!retval && infop)
1424 retval = put_user(p->uid, &infop->si_uid); 1457 retval = put_user(p->uid, &infop->si_uid);
1425 if (!retval) 1458 if (!retval)
1426 retval = p->pid; 1459 retval = task_pid_nr_ns(p, ns);
1427 put_task_struct(p); 1460 put_task_struct(p);
1428 1461
1429 BUG_ON(!retval); 1462 BUG_ON(!retval);
@@ -1443,6 +1476,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1443 int retval; 1476 int retval;
1444 pid_t pid; 1477 pid_t pid;
1445 uid_t uid; 1478 uid_t uid;
1479 struct pid_namespace *ns;
1446 1480
1447 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1481 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1448 return 0; 1482 return 0;
@@ -1457,7 +1491,8 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1457 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1491 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1458 spin_unlock_irq(&p->sighand->siglock); 1492 spin_unlock_irq(&p->sighand->siglock);
1459 1493
1460 pid = p->pid; 1494 ns = current->nsproxy->pid_ns;
1495 pid = task_pid_nr_ns(p, ns);
1461 uid = p->uid; 1496 uid = p->uid;
1462 get_task_struct(p); 1497 get_task_struct(p);
1463 read_unlock(&tasklist_lock); 1498 read_unlock(&tasklist_lock);
@@ -1468,7 +1503,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1468 if (!retval && stat_addr) 1503 if (!retval && stat_addr)
1469 retval = put_user(0xffff, stat_addr); 1504 retval = put_user(0xffff, stat_addr);
1470 if (!retval) 1505 if (!retval)
1471 retval = p->pid; 1506 retval = task_pid_nr_ns(p, ns);
1472 } else { 1507 } else {
1473 retval = wait_noreap_copyout(p, pid, uid, 1508 retval = wait_noreap_copyout(p, pid, uid,
1474 CLD_CONTINUED, SIGCONT, 1509 CLD_CONTINUED, SIGCONT,
@@ -1517,12 +1552,9 @@ repeat:
1517 tsk = current; 1552 tsk = current;
1518 do { 1553 do {
1519 struct task_struct *p; 1554 struct task_struct *p;
1520 struct list_head *_p;
1521 int ret; 1555 int ret;
1522 1556
1523 list_for_each(_p,&tsk->children) { 1557 list_for_each_entry(p, &tsk->children, sibling) {
1524 p = list_entry(_p, struct task_struct, sibling);
1525
1526 ret = eligible_child(pid, options, p); 1558 ret = eligible_child(pid, options, p);
1527 if (!ret) 1559 if (!ret)
1528 continue; 1560 continue;
@@ -1604,9 +1636,8 @@ check_continued:
1604 } 1636 }
1605 } 1637 }
1606 if (!flag) { 1638 if (!flag) {
1607 list_for_each(_p, &tsk->ptrace_children) { 1639 list_for_each_entry(p, &tsk->ptrace_children,
1608 p = list_entry(_p, struct task_struct, 1640 ptrace_list) {
1609 ptrace_list);
1610 if (!eligible_child(pid, options, p)) 1641 if (!eligible_child(pid, options, p))
1611 continue; 1642 continue;
1612 flag = 1; 1643 flag = 1;
diff --git a/kernel/fork.c b/kernel/fork.c
index 2ce28f165e31..ddafdfac9456 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -29,7 +29,7 @@
29#include <linux/nsproxy.h> 29#include <linux/nsproxy.h>
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/cpuset.h> 32#include <linux/cgroup.h>
33#include <linux/security.h> 33#include <linux/security.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
@@ -50,6 +50,7 @@
50#include <linux/taskstats_kern.h> 50#include <linux/taskstats_kern.h>
51#include <linux/random.h> 51#include <linux/random.h>
52#include <linux/tty.h> 52#include <linux/tty.h>
53#include <linux/proc_fs.h>
53 54
54#include <asm/pgtable.h> 55#include <asm/pgtable.h>
55#include <asm/pgalloc.h> 56#include <asm/pgalloc.h>
@@ -116,7 +117,7 @@ EXPORT_SYMBOL(free_task);
116 117
117void __put_task_struct(struct task_struct *tsk) 118void __put_task_struct(struct task_struct *tsk)
118{ 119{
119 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); 120 WARN_ON(!tsk->exit_state);
120 WARN_ON(atomic_read(&tsk->usage)); 121 WARN_ON(atomic_read(&tsk->usage));
121 WARN_ON(tsk == current); 122 WARN_ON(tsk == current);
122 123
@@ -205,7 +206,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
205} 206}
206 207
207#ifdef CONFIG_MMU 208#ifdef CONFIG_MMU
208static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 209static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
209{ 210{
210 struct vm_area_struct *mpnt, *tmp, **pprev; 211 struct vm_area_struct *mpnt, *tmp, **pprev;
211 struct rb_node **rb_link, *rb_parent; 212 struct rb_node **rb_link, *rb_parent;
@@ -583,7 +584,7 @@ fail_nomem:
583 return retval; 584 return retval;
584} 585}
585 586
586static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) 587static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
587{ 588{
588 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); 589 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
589 /* We don't need to lock fs - think why ;-) */ 590 /* We don't need to lock fs - think why ;-) */
@@ -615,7 +616,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
615 616
616EXPORT_SYMBOL_GPL(copy_fs_struct); 617EXPORT_SYMBOL_GPL(copy_fs_struct);
617 618
618static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) 619static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
619{ 620{
620 if (clone_flags & CLONE_FS) { 621 if (clone_flags & CLONE_FS) {
621 atomic_inc(&current->fs->count); 622 atomic_inc(&current->fs->count);
@@ -818,7 +819,7 @@ int unshare_files(void)
818 819
819EXPORT_SYMBOL(unshare_files); 820EXPORT_SYMBOL(unshare_files);
820 821
821static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) 822static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
822{ 823{
823 struct sighand_struct *sig; 824 struct sighand_struct *sig;
824 825
@@ -841,7 +842,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
841 kmem_cache_free(sighand_cachep, sighand); 842 kmem_cache_free(sighand_cachep, sighand);
842} 843}
843 844
844static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) 845static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
845{ 846{
846 struct signal_struct *sig; 847 struct signal_struct *sig;
847 int ret; 848 int ret;
@@ -923,7 +924,7 @@ void __cleanup_signal(struct signal_struct *sig)
923 kmem_cache_free(signal_cachep, sig); 924 kmem_cache_free(signal_cachep, sig);
924} 925}
925 926
926static inline void cleanup_signal(struct task_struct *tsk) 927static void cleanup_signal(struct task_struct *tsk)
927{ 928{
928 struct signal_struct *sig = tsk->signal; 929 struct signal_struct *sig = tsk->signal;
929 930
@@ -933,7 +934,7 @@ static inline void cleanup_signal(struct task_struct *tsk)
933 __cleanup_signal(sig); 934 __cleanup_signal(sig);
934} 935}
935 936
936static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) 937static void copy_flags(unsigned long clone_flags, struct task_struct *p)
937{ 938{
938 unsigned long new_flags = p->flags; 939 unsigned long new_flags = p->flags;
939 940
@@ -949,10 +950,10 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
949{ 950{
950 current->clear_child_tid = tidptr; 951 current->clear_child_tid = tidptr;
951 952
952 return current->pid; 953 return task_pid_vnr(current);
953} 954}
954 955
955static inline void rt_mutex_init_task(struct task_struct *p) 956static void rt_mutex_init_task(struct task_struct *p)
956{ 957{
957 spin_lock_init(&p->pi_lock); 958 spin_lock_init(&p->pi_lock);
958#ifdef CONFIG_RT_MUTEXES 959#ifdef CONFIG_RT_MUTEXES
@@ -973,12 +974,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
973 unsigned long stack_start, 974 unsigned long stack_start,
974 struct pt_regs *regs, 975 struct pt_regs *regs,
975 unsigned long stack_size, 976 unsigned long stack_size,
976 int __user *parent_tidptr,
977 int __user *child_tidptr, 977 int __user *child_tidptr,
978 struct pid *pid) 978 struct pid *pid)
979{ 979{
980 int retval; 980 int retval;
981 struct task_struct *p = NULL; 981 struct task_struct *p;
982 int cgroup_callbacks_done = 0;
982 983
983 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 984 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
984 return ERR_PTR(-EINVAL); 985 return ERR_PTR(-EINVAL);
@@ -1042,12 +1043,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1042 p->did_exec = 0; 1043 p->did_exec = 0;
1043 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1044 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1044 copy_flags(clone_flags, p); 1045 copy_flags(clone_flags, p);
1045 p->pid = pid_nr(pid);
1046 retval = -EFAULT;
1047 if (clone_flags & CLONE_PARENT_SETTID)
1048 if (put_user(p->pid, parent_tidptr))
1049 goto bad_fork_cleanup_delays_binfmt;
1050
1051 INIT_LIST_HEAD(&p->children); 1046 INIT_LIST_HEAD(&p->children);
1052 INIT_LIST_HEAD(&p->sibling); 1047 INIT_LIST_HEAD(&p->sibling);
1053 p->vfork_done = NULL; 1048 p->vfork_done = NULL;
@@ -1087,13 +1082,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1087#endif 1082#endif
1088 p->io_context = NULL; 1083 p->io_context = NULL;
1089 p->audit_context = NULL; 1084 p->audit_context = NULL;
1090 cpuset_fork(p); 1085 cgroup_fork(p);
1091#ifdef CONFIG_NUMA 1086#ifdef CONFIG_NUMA
1092 p->mempolicy = mpol_copy(p->mempolicy); 1087 p->mempolicy = mpol_copy(p->mempolicy);
1093 if (IS_ERR(p->mempolicy)) { 1088 if (IS_ERR(p->mempolicy)) {
1094 retval = PTR_ERR(p->mempolicy); 1089 retval = PTR_ERR(p->mempolicy);
1095 p->mempolicy = NULL; 1090 p->mempolicy = NULL;
1096 goto bad_fork_cleanup_cpuset; 1091 goto bad_fork_cleanup_cgroup;
1097 } 1092 }
1098 mpol_fix_fork_child_flag(p); 1093 mpol_fix_fork_child_flag(p);
1099#endif 1094#endif
@@ -1126,10 +1121,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1126 p->blocked_on = NULL; /* not blocked yet */ 1121 p->blocked_on = NULL; /* not blocked yet */
1127#endif 1122#endif
1128 1123
1129 p->tgid = p->pid;
1130 if (clone_flags & CLONE_THREAD)
1131 p->tgid = current->tgid;
1132
1133 if ((retval = security_task_alloc(p))) 1124 if ((retval = security_task_alloc(p)))
1134 goto bad_fork_cleanup_policy; 1125 goto bad_fork_cleanup_policy;
1135 if ((retval = audit_alloc(p))) 1126 if ((retval = audit_alloc(p)))
@@ -1155,6 +1146,24 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1155 if (retval) 1146 if (retval)
1156 goto bad_fork_cleanup_namespaces; 1147 goto bad_fork_cleanup_namespaces;
1157 1148
1149 if (pid != &init_struct_pid) {
1150 retval = -ENOMEM;
1151 pid = alloc_pid(task_active_pid_ns(p));
1152 if (!pid)
1153 goto bad_fork_cleanup_namespaces;
1154
1155 if (clone_flags & CLONE_NEWPID) {
1156 retval = pid_ns_prepare_proc(task_active_pid_ns(p));
1157 if (retval < 0)
1158 goto bad_fork_free_pid;
1159 }
1160 }
1161
1162 p->pid = pid_nr(pid);
1163 p->tgid = p->pid;
1164 if (clone_flags & CLONE_THREAD)
1165 p->tgid = current->tgid;
1166
1158 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1167 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1159 /* 1168 /*
1160 * Clear TID on mm_release()? 1169 * Clear TID on mm_release()?
@@ -1204,6 +1213,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1204 /* Perform scheduler related setup. Assign this task to a CPU. */ 1213 /* Perform scheduler related setup. Assign this task to a CPU. */
1205 sched_fork(p, clone_flags); 1214 sched_fork(p, clone_flags);
1206 1215
1216 /* Now that the task is set up, run cgroup callbacks if
1217 * necessary. We need to run them before the task is visible
1218 * on the tasklist. */
1219 cgroup_fork_callbacks(p);
1220 cgroup_callbacks_done = 1;
1221
1207 /* Need tasklist lock for parent etc handling! */ 1222 /* Need tasklist lock for parent etc handling! */
1208 write_lock_irq(&tasklist_lock); 1223 write_lock_irq(&tasklist_lock);
1209 1224
@@ -1246,7 +1261,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1246 spin_unlock(&current->sighand->siglock); 1261 spin_unlock(&current->sighand->siglock);
1247 write_unlock_irq(&tasklist_lock); 1262 write_unlock_irq(&tasklist_lock);
1248 retval = -ERESTARTNOINTR; 1263 retval = -ERESTARTNOINTR;
1249 goto bad_fork_cleanup_namespaces; 1264 goto bad_fork_free_pid;
1250 } 1265 }
1251 1266
1252 if (clone_flags & CLONE_THREAD) { 1267 if (clone_flags & CLONE_THREAD) {
@@ -1275,11 +1290,22 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1275 __ptrace_link(p, current->parent); 1290 __ptrace_link(p, current->parent);
1276 1291
1277 if (thread_group_leader(p)) { 1292 if (thread_group_leader(p)) {
1278 p->signal->tty = current->signal->tty; 1293 if (clone_flags & CLONE_NEWPID) {
1279 p->signal->pgrp = process_group(current); 1294 p->nsproxy->pid_ns->child_reaper = p;
1280 set_signal_session(p->signal, process_session(current)); 1295 p->signal->tty = NULL;
1281 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1296 set_task_pgrp(p, p->pid);
1282 attach_pid(p, PIDTYPE_SID, task_session(current)); 1297 set_task_session(p, p->pid);
1298 attach_pid(p, PIDTYPE_PGID, pid);
1299 attach_pid(p, PIDTYPE_SID, pid);
1300 } else {
1301 p->signal->tty = current->signal->tty;
1302 set_task_pgrp(p, task_pgrp_nr(current));
1303 set_task_session(p, task_session_nr(current));
1304 attach_pid(p, PIDTYPE_PGID,
1305 task_pgrp(current));
1306 attach_pid(p, PIDTYPE_SID,
1307 task_session(current));
1308 }
1283 1309
1284 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1310 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1285 __get_cpu_var(process_counts)++; 1311 __get_cpu_var(process_counts)++;
@@ -1292,8 +1318,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1292 spin_unlock(&current->sighand->siglock); 1318 spin_unlock(&current->sighand->siglock);
1293 write_unlock_irq(&tasklist_lock); 1319 write_unlock_irq(&tasklist_lock);
1294 proc_fork_connector(p); 1320 proc_fork_connector(p);
1321 cgroup_post_fork(p);
1295 return p; 1322 return p;
1296 1323
1324bad_fork_free_pid:
1325 if (pid != &init_struct_pid)
1326 free_pid(pid);
1297bad_fork_cleanup_namespaces: 1327bad_fork_cleanup_namespaces:
1298 exit_task_namespaces(p); 1328 exit_task_namespaces(p);
1299bad_fork_cleanup_keys: 1329bad_fork_cleanup_keys:
@@ -1318,10 +1348,9 @@ bad_fork_cleanup_security:
1318bad_fork_cleanup_policy: 1348bad_fork_cleanup_policy:
1319#ifdef CONFIG_NUMA 1349#ifdef CONFIG_NUMA
1320 mpol_free(p->mempolicy); 1350 mpol_free(p->mempolicy);
1321bad_fork_cleanup_cpuset: 1351bad_fork_cleanup_cgroup:
1322#endif 1352#endif
1323 cpuset_exit(p); 1353 cgroup_exit(p, cgroup_callbacks_done);
1324bad_fork_cleanup_delays_binfmt:
1325 delayacct_tsk_free(p); 1354 delayacct_tsk_free(p);
1326 if (p->binfmt) 1355 if (p->binfmt)
1327 module_put(p->binfmt->module); 1356 module_put(p->binfmt->module);
@@ -1348,7 +1377,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1348 struct task_struct *task; 1377 struct task_struct *task;
1349 struct pt_regs regs; 1378 struct pt_regs regs;
1350 1379
1351 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 1380 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1352 &init_struct_pid); 1381 &init_struct_pid);
1353 if (!IS_ERR(task)) 1382 if (!IS_ERR(task))
1354 init_idle(task, cpu); 1383 init_idle(task, cpu);
@@ -1356,7 +1385,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1356 return task; 1385 return task;
1357} 1386}
1358 1387
1359static inline int fork_traceflag (unsigned clone_flags) 1388static int fork_traceflag(unsigned clone_flags)
1360{ 1389{
1361 if (clone_flags & CLONE_UNTRACED) 1390 if (clone_flags & CLONE_UNTRACED)
1362 return 0; 1391 return 0;
@@ -1387,19 +1416,16 @@ long do_fork(unsigned long clone_flags,
1387{ 1416{
1388 struct task_struct *p; 1417 struct task_struct *p;
1389 int trace = 0; 1418 int trace = 0;
1390 struct pid *pid = alloc_pid();
1391 long nr; 1419 long nr;
1392 1420
1393 if (!pid)
1394 return -EAGAIN;
1395 nr = pid->nr;
1396 if (unlikely(current->ptrace)) { 1421 if (unlikely(current->ptrace)) {
1397 trace = fork_traceflag (clone_flags); 1422 trace = fork_traceflag (clone_flags);
1398 if (trace) 1423 if (trace)
1399 clone_flags |= CLONE_PTRACE; 1424 clone_flags |= CLONE_PTRACE;
1400 } 1425 }
1401 1426
1402 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); 1427 p = copy_process(clone_flags, stack_start, regs, stack_size,
1428 child_tidptr, NULL);
1403 /* 1429 /*
1404 * Do this prior to waking up the new thread - the thread pointer 1430
1405 * might become invalid after that point, if the thread exits quickly. 1431
@@ -1407,6 +1433,17 @@ long do_fork(unsigned long clone_flags,
1407 if (!IS_ERR(p)) { 1433 if (!IS_ERR(p)) {
1408 struct completion vfork; 1434 struct completion vfork;
1409 1435
1436 /*
 1437 * calling task_pid_nr_ns() would be enough here, but this check
 1438 * keeps the regular fork() path on the cheaper task_pid_vnr() call
1439 */
1440 nr = (clone_flags & CLONE_NEWPID) ?
1441 task_pid_nr_ns(p, current->nsproxy->pid_ns) :
1442 task_pid_vnr(p);
1443
1444 if (clone_flags & CLONE_PARENT_SETTID)
1445 put_user(nr, parent_tidptr);
1446
1410 if (clone_flags & CLONE_VFORK) { 1447 if (clone_flags & CLONE_VFORK) {
1411 p->vfork_done = &vfork; 1448 p->vfork_done = &vfork;
1412 init_completion(&vfork); 1449 init_completion(&vfork);
@@ -1440,7 +1477,6 @@ long do_fork(unsigned long clone_flags,
1440 } 1477 }
1441 } 1478 }
1442 } else { 1479 } else {
1443 free_pid(pid);
1444 nr = PTR_ERR(p); 1480 nr = PTR_ERR(p);
1445 } 1481 }
1446 return nr; 1482 return nr;
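On the pid-number comment earlier in this hunk: as it says, task_pid_nr_ns(p, current->nsproxy->pid_ns) would be correct in both cases; the CLONE_NEWPID test only keeps the common fork() path on the shorter call. An illustrative restatement, not patch code:

	/* both report the new task's pid as the caller (the parent) sees it;
	 * for a plain fork() the two are the same value */
	pid_t nr_general = task_pid_nr_ns(p, current->nsproxy->pid_ns);
	pid_t nr_fast = task_pid_vnr(p);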
@@ -1485,7 +1521,7 @@ void __init proc_caches_init(void)
1485 * Check constraints on flags passed to the unshare system call and 1521 * Check constraints on flags passed to the unshare system call and
1486 * force unsharing of additional process context as appropriate. 1522 * force unsharing of additional process context as appropriate.
1487 */ 1523 */
1488static inline void check_unshare_flags(unsigned long *flags_ptr) 1524static void check_unshare_flags(unsigned long *flags_ptr)
1489{ 1525{
1490 /* 1526 /*
1491 * If unsharing a thread from a thread group, must also 1527 * If unsharing a thread from a thread group, must also
@@ -1617,7 +1653,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1617 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1653 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1618 struct files_struct *fd, *new_fd = NULL; 1654 struct files_struct *fd, *new_fd = NULL;
1619 struct sem_undo_list *new_ulist = NULL; 1655 struct sem_undo_list *new_ulist = NULL;
1620 struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; 1656 struct nsproxy *new_nsproxy = NULL;
1621 1657
1622 check_unshare_flags(&unshare_flags); 1658 check_unshare_flags(&unshare_flags);
1623 1659
@@ -1647,14 +1683,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1647 1683
1648 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { 1684 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) {
1649 1685
1650 task_lock(current);
1651
1652 if (new_nsproxy) { 1686 if (new_nsproxy) {
1653 old_nsproxy = current->nsproxy; 1687 switch_task_namespaces(current, new_nsproxy);
1654 current->nsproxy = new_nsproxy; 1688 new_nsproxy = NULL;
1655 new_nsproxy = old_nsproxy;
1656 } 1689 }
1657 1690
1691 task_lock(current);
1692
1658 if (new_fs) { 1693 if (new_fs) {
1659 fs = current->fs; 1694 fs = current->fs;
1660 current->fs = new_fs; 1695 current->fs = new_fs;
diff --git a/kernel/futex.c b/kernel/futex.c
index e45a65e41686..32710451dc20 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -53,6 +53,9 @@
53#include <linux/signal.h> 53#include <linux/signal.h>
54#include <linux/module.h> 54#include <linux/module.h>
55#include <linux/magic.h> 55#include <linux/magic.h>
56#include <linux/pid.h>
57#include <linux/nsproxy.h>
58
56#include <asm/futex.h> 59#include <asm/futex.h>
57 60
58#include "rtmutex_common.h" 61#include "rtmutex_common.h"
@@ -443,8 +446,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
443 struct task_struct *p; 446 struct task_struct *p;
444 447
445 rcu_read_lock(); 448 rcu_read_lock();
446 p = find_task_by_pid(pid); 449 p = find_task_by_vpid(pid);
447
448 if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) 450 if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
449 p = ERR_PTR(-ESRCH); 451 p = ERR_PTR(-ESRCH);
450 else 452 else
@@ -653,7 +655,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
653 if (!(uval & FUTEX_OWNER_DIED)) { 655 if (!(uval & FUTEX_OWNER_DIED)) {
654 int ret = 0; 656 int ret = 0;
655 657
656 newval = FUTEX_WAITERS | new_owner->pid; 658 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
657 659
658 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 660 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
659 661
@@ -1106,7 +1108,7 @@ static void unqueue_me_pi(struct futex_q *q)
1106static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1108static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1107 struct task_struct *curr) 1109 struct task_struct *curr)
1108{ 1110{
1109 u32 newtid = curr->pid | FUTEX_WAITERS; 1111 u32 newtid = task_pid_vnr(curr) | FUTEX_WAITERS;
1110 struct futex_pi_state *pi_state = q->pi_state; 1112 struct futex_pi_state *pi_state = q->pi_state;
1111 u32 uval, curval, newval; 1113 u32 uval, curval, newval;
1112 int ret; 1114 int ret;
@@ -1368,7 +1370,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1368 * (by doing a 0 -> TID atomic cmpxchg), while holding all 1370 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1369 * the locks. It will most likely not succeed. 1371 * the locks. It will most likely not succeed.
1370 */ 1372 */
1371 newval = current->pid; 1373 newval = task_pid_vnr(current);
1372 1374
1373 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 1375 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1374 1376
@@ -1379,7 +1381,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1379 * Detect deadlocks. In case of REQUEUE_PI this is a valid 1381 * Detect deadlocks. In case of REQUEUE_PI this is a valid
1380 * situation and we return success to user space. 1382 * situation and we return success to user space.
1381 */ 1383 */
1382 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { 1384 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1383 ret = -EDEADLK; 1385 ret = -EDEADLK;
1384 goto out_unlock_release_sem; 1386 goto out_unlock_release_sem;
1385 } 1387 }
@@ -1408,7 +1410,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1408 */ 1410 */
1409 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 1411 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
1410 /* Keep the OWNER_DIED bit */ 1412 /* Keep the OWNER_DIED bit */
1411 newval = (curval & ~FUTEX_TID_MASK) | current->pid; 1413 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
1412 ownerdied = 0; 1414 ownerdied = 0;
1413 lock_taken = 1; 1415 lock_taken = 1;
1414 } 1416 }
@@ -1587,7 +1589,7 @@ retry:
1587 /* 1589 /*
1588 * We release only a lock we actually own: 1590 * We release only a lock we actually own:
1589 */ 1591 */
1590 if ((uval & FUTEX_TID_MASK) != current->pid) 1592 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1591 return -EPERM; 1593 return -EPERM;
1592 /* 1594 /*
1593 * First take all the futex related locks: 1595 * First take all the futex related locks:
@@ -1608,7 +1610,7 @@ retry_unlocked:
1608 * anyone else up: 1610 * anyone else up:
1609 */ 1611 */
1610 if (!(uval & FUTEX_OWNER_DIED)) 1612 if (!(uval & FUTEX_OWNER_DIED))
1611 uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0); 1613 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
1612 1614
1613 1615
1614 if (unlikely(uval == -EFAULT)) 1616 if (unlikely(uval == -EFAULT))
@@ -1617,7 +1619,7 @@ retry_unlocked:
1617 * Rare case: we managed to release the lock atomically, 1619 * Rare case: we managed to release the lock atomically,
1618 * no need to wake anyone else up: 1620 * no need to wake anyone else up:
1619 */ 1621 */
1620 if (unlikely(uval == current->pid)) 1622 if (unlikely(uval == task_pid_vnr(current)))
1621 goto out_unlock; 1623 goto out_unlock;
1622 1624
1623 /* 1625 /*
@@ -1854,7 +1856,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1854 1856
1855 ret = -ESRCH; 1857 ret = -ESRCH;
1856 rcu_read_lock(); 1858 rcu_read_lock();
1857 p = find_task_by_pid(pid); 1859 p = find_task_by_vpid(pid);
1858 if (!p) 1860 if (!p)
1859 goto err_unlock; 1861 goto err_unlock;
1860 ret = -EPERM; 1862 ret = -EPERM;
@@ -1887,7 +1889,7 @@ retry:
1887 if (get_user(uval, uaddr)) 1889 if (get_user(uval, uaddr))
1888 return -1; 1890 return -1;
1889 1891
1890 if ((uval & FUTEX_TID_MASK) == curr->pid) { 1892 if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
1891 /* 1893 /*
1892 * Ok, this dying thread is truly holding a futex 1894 * Ok, this dying thread is truly holding a futex
1893 * of interest. Set the OWNER_DIED bit atomically 1895 * of interest. Set the OWNER_DIED bit atomically
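
All of the futex hunks above apply the same conversion: the TID stored in the futex word is now compared against the caller's pid as seen from its own pid namespace. A minimal sketch of that pattern, assuming kernel context (<linux/futex.h>, <linux/sched.h>); the helper name is hypothetical, only FUTEX_TID_MASK and task_pid_vnr() come from the patch:

/*
 * Hypothetical helper, not part of the patch: does the futex word
 * 'uval' name the current task as owner, using the namespace-local
 * TID that user space wrote into the futex?
 */
static inline int futex_owned_by_current(u32 uval)
{
	return (uval & FUTEX_TID_MASK) == task_pid_vnr(current);
}
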
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 2c2e2954b713..00b572666cc7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <linux/compat.h> 10#include <linux/compat.h>
11#include <linux/nsproxy.h>
11#include <linux/futex.h> 12#include <linux/futex.h>
12 13
13#include <asm/uaccess.h> 14#include <asm/uaccess.h>
@@ -124,7 +125,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
124 125
125 ret = -ESRCH; 126 ret = -ESRCH;
126 read_lock(&tasklist_lock); 127 read_lock(&tasklist_lock);
127 p = find_task_by_pid(pid); 128 p = find_task_by_vpid(pid);
128 if (!p) 129 if (!p)
129 goto err_unlock; 130 goto err_unlock;
130 ret = -EPERM; 131 ret = -EPERM;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index e9f1b4ea504d..aa74a1ef2da8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -51,7 +51,7 @@ struct resource crashk_res = {
51 51
52int kexec_should_crash(struct task_struct *p) 52int kexec_should_crash(struct task_struct *p)
53{ 53{
54 if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops) 54 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
55 return 1; 55 return 1;
56 return 0; 56 return 0;
57} 57}
@@ -1146,6 +1146,172 @@ static int __init crash_notes_memory_init(void)
1146} 1146}
1147module_init(crash_notes_memory_init) 1147module_init(crash_notes_memory_init)
1148 1148
1149
1150/*
1151 * parsing the "crashkernel" commandline
1152 *
1153 * this code is intended to be called from architecture specific code
1154 */
1155
1156
1157/*
1158 * This function parses command lines in the format
1159 *
1160 * crashkernel=ramsize-range:size[,...][@offset]
1161 *
1162 * The function returns 0 on success and -EINVAL on failure.
1163 */
1164static int __init parse_crashkernel_mem(char *cmdline,
1165 unsigned long long system_ram,
1166 unsigned long long *crash_size,
1167 unsigned long long *crash_base)
1168{
1169 char *cur = cmdline, *tmp;
1170
1171 /* for each entry of the comma-separated list */
1172 do {
1173 unsigned long long start, end = ULLONG_MAX, size;
1174
1175 /* get the start of the range */
1176 start = memparse(cur, &tmp);
1177 if (cur == tmp) {
1178 pr_warning("crashkernel: Memory value expected\n");
1179 return -EINVAL;
1180 }
1181 cur = tmp;
1182 if (*cur != '-') {
1183 pr_warning("crashkernel: '-' expected\n");
1184 return -EINVAL;
1185 }
1186 cur++;
1187
 1188 /* if no ':' is here, then we read the end of the range */
1189 if (*cur != ':') {
1190 end = memparse(cur, &tmp);
1191 if (cur == tmp) {
1192 pr_warning("crashkernel: Memory "
1193 "value expected\n");
1194 return -EINVAL;
1195 }
1196 cur = tmp;
1197 if (end <= start) {
1198 pr_warning("crashkernel: end <= start\n");
1199 return -EINVAL;
1200 }
1201 }
1202
1203 if (*cur != ':') {
1204 pr_warning("crashkernel: ':' expected\n");
1205 return -EINVAL;
1206 }
1207 cur++;
1208
1209 size = memparse(cur, &tmp);
1210 if (cur == tmp) {
1211 pr_warning("Memory value expected\n");
1212 return -EINVAL;
1213 }
1214 cur = tmp;
1215 if (size >= system_ram) {
1216 pr_warning("crashkernel: invalid size\n");
1217 return -EINVAL;
1218 }
1219
1220 /* match ? */
1221 if (system_ram >= start && system_ram <= end) {
1222 *crash_size = size;
1223 break;
1224 }
1225 } while (*cur++ == ',');
1226
1227 if (*crash_size > 0) {
1228 while (*cur != ' ' && *cur != '@')
1229 cur++;
1230 if (*cur == '@') {
1231 cur++;
1232 *crash_base = memparse(cur, &tmp);
1233 if (cur == tmp) {
1234 pr_warning("Memory value expected "
1235 "after '@'\n");
1236 return -EINVAL;
1237 }
1238 }
1239 }
1240
1241 return 0;
1242}
1243
1244/*
 1245 * This function parses "simple" (old) crashkernel command lines like
1246 *
1247 * crashkernel=size[@offset]
1248 *
1249 * It returns 0 on success and -EINVAL on failure.
1250 */
1251static int __init parse_crashkernel_simple(char *cmdline,
1252 unsigned long long *crash_size,
1253 unsigned long long *crash_base)
1254{
1255 char *cur = cmdline;
1256
1257 *crash_size = memparse(cmdline, &cur);
1258 if (cmdline == cur) {
1259 pr_warning("crashkernel: memory value expected\n");
1260 return -EINVAL;
1261 }
1262
1263 if (*cur == '@')
1264 *crash_base = memparse(cur+1, &cur);
1265
1266 return 0;
1267}
1268
1269/*
 1270 * This function is the entry point for command line parsing and should be
1271 * called from the arch-specific code.
1272 */
1273int __init parse_crashkernel(char *cmdline,
1274 unsigned long long system_ram,
1275 unsigned long long *crash_size,
1276 unsigned long long *crash_base)
1277{
1278 char *p = cmdline, *ck_cmdline = NULL;
1279 char *first_colon, *first_space;
1280
1281 BUG_ON(!crash_size || !crash_base);
1282 *crash_size = 0;
1283 *crash_base = 0;
1284
1285 /* find crashkernel and use the last one if there are more */
1286 p = strstr(p, "crashkernel=");
1287 while (p) {
1288 ck_cmdline = p;
1289 p = strstr(p+1, "crashkernel=");
1290 }
1291
1292 if (!ck_cmdline)
1293 return -EINVAL;
1294
1295 ck_cmdline += 12; /* strlen("crashkernel=") */
1296
1297 /*
1298 * if the commandline contains a ':', then that's the extended
1299 * syntax -- if not, it must be the classic syntax
1300 */
1301 first_colon = strchr(ck_cmdline, ':');
1302 first_space = strchr(ck_cmdline, ' ');
1303 if (first_colon && (!first_space || first_colon < first_space))
1304 return parse_crashkernel_mem(ck_cmdline, system_ram,
1305 crash_size, crash_base);
1306 else
1307 return parse_crashkernel_simple(ck_cmdline, crash_size,
1308 crash_base);
1309
1310 return 0;
1311}
1312
1313
1314
1149void crash_save_vmcoreinfo(void) 1315void crash_save_vmcoreinfo(void)
1150{ 1316{
1151 u32 *buf; 1317 u32 *buf;
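
As a hedged illustration of how the new parser is meant to be consumed, the sketch below shows an arch-side caller; reserve_crashkernel() and the reservation step are invented for the example, only the parse_crashkernel() signature and the two command-line syntaxes come from the patch:

/* Sketch of an architecture's early setup code (assumed caller). */
static void __init reserve_crashkernel(char *cmdline,
				       unsigned long long total_mem)
{
	unsigned long long crash_size = 0, crash_base = 0;
	int ret;

	/*
	 * Handles both the classic "crashkernel=64M@16M" form and the
	 * extended "crashkernel=512M-2G:64M,2G-:128M@16M" form, where
	 * the range that matches total_mem selects the size.
	 */
	ret = parse_crashkernel(cmdline, total_mem, &crash_size, &crash_base);
	if (ret == 0 && crash_size > 0) {
		/* the architecture would now reserve
		 * [crash_base, crash_base + crash_size) for the crash kernel */
	}
}
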
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index a6f1ee9c92d9..55fe0c7cd95f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -511,11 +511,11 @@ static void lockdep_print_held_locks(struct task_struct *curr)
511 int i, depth = curr->lockdep_depth; 511 int i, depth = curr->lockdep_depth;
512 512
513 if (!depth) { 513 if (!depth) {
514 printk("no locks held by %s/%d.\n", curr->comm, curr->pid); 514 printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
515 return; 515 return;
516 } 516 }
517 printk("%d lock%s held by %s/%d:\n", 517 printk("%d lock%s held by %s/%d:\n",
518 depth, depth > 1 ? "s" : "", curr->comm, curr->pid); 518 depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr));
519 519
520 for (i = 0; i < depth; i++) { 520 for (i = 0; i < depth; i++) {
521 printk(" #%d: ", i); 521 printk(" #%d: ", i);
@@ -904,7 +904,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
904 print_kernel_version(); 904 print_kernel_version();
905 printk( "-------------------------------------------------------\n"); 905 printk( "-------------------------------------------------------\n");
906 printk("%s/%d is trying to acquire lock:\n", 906 printk("%s/%d is trying to acquire lock:\n",
907 curr->comm, curr->pid); 907 curr->comm, task_pid_nr(curr));
908 print_lock(check_source); 908 print_lock(check_source);
909 printk("\nbut task is already holding lock:\n"); 909 printk("\nbut task is already holding lock:\n");
910 print_lock(check_target); 910 print_lock(check_target);
@@ -1085,7 +1085,7 @@ print_bad_irq_dependency(struct task_struct *curr,
1085 print_kernel_version(); 1085 print_kernel_version();
1086 printk( "------------------------------------------------------\n"); 1086 printk( "------------------------------------------------------\n");
1087 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1087 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1088 curr->comm, curr->pid, 1088 curr->comm, task_pid_nr(curr),
1089 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1089 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
1090 curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, 1090 curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
1091 curr->hardirqs_enabled, 1091 curr->hardirqs_enabled,
@@ -1237,7 +1237,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1237 print_kernel_version(); 1237 print_kernel_version();
1238 printk( "---------------------------------------------\n"); 1238 printk( "---------------------------------------------\n");
1239 printk("%s/%d is trying to acquire lock:\n", 1239 printk("%s/%d is trying to acquire lock:\n",
1240 curr->comm, curr->pid); 1240 curr->comm, task_pid_nr(curr));
1241 print_lock(next); 1241 print_lock(next);
1242 printk("\nbut task is already holding lock:\n"); 1242 printk("\nbut task is already holding lock:\n");
1243 print_lock(prev); 1243 print_lock(prev);
@@ -1521,7 +1521,7 @@ cache_hit:
1521} 1521}
1522 1522
1523static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, 1523static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1524 struct held_lock *hlock, int chain_head, u64 chain_key) 1524 struct held_lock *hlock, int chain_head, u64 chain_key)
1525{ 1525{
1526 /* 1526 /*
1527 * Trylock needs to maintain the stack of held locks, but it 1527 * Trylock needs to maintain the stack of held locks, but it
@@ -1641,7 +1641,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
1641 usage_str[prev_bit], usage_str[new_bit]); 1641 usage_str[prev_bit], usage_str[new_bit]);
1642 1642
1643 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", 1643 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
1644 curr->comm, curr->pid, 1644 curr->comm, task_pid_nr(curr),
1645 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, 1645 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
1646 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, 1646 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
1647 trace_hardirqs_enabled(curr), 1647 trace_hardirqs_enabled(curr),
@@ -1694,7 +1694,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1694 print_kernel_version(); 1694 print_kernel_version();
1695 printk( "---------------------------------------------------------\n"); 1695 printk( "---------------------------------------------------------\n");
1696 printk("%s/%d just changed the state of lock:\n", 1696 printk("%s/%d just changed the state of lock:\n",
1697 curr->comm, curr->pid); 1697 curr->comm, task_pid_nr(curr));
1698 print_lock(this); 1698 print_lock(this);
1699 if (forwards) 1699 if (forwards)
1700 printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); 1700 printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
@@ -2487,7 +2487,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
2487 printk( "[ BUG: bad unlock balance detected! ]\n"); 2487 printk( "[ BUG: bad unlock balance detected! ]\n");
2488 printk( "-------------------------------------\n"); 2488 printk( "-------------------------------------\n");
2489 printk("%s/%d is trying to release lock (", 2489 printk("%s/%d is trying to release lock (",
2490 curr->comm, curr->pid); 2490 curr->comm, task_pid_nr(curr));
2491 print_lockdep_cache(lock); 2491 print_lockdep_cache(lock);
2492 printk(") at:\n"); 2492 printk(") at:\n");
2493 print_ip_sym(ip); 2493 print_ip_sym(ip);
@@ -2737,7 +2737,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
2737 printk( "[ BUG: bad contention detected! ]\n"); 2737 printk( "[ BUG: bad contention detected! ]\n");
2738 printk( "---------------------------------\n"); 2738 printk( "---------------------------------\n");
2739 printk("%s/%d is trying to contend lock (", 2739 printk("%s/%d is trying to contend lock (",
2740 curr->comm, curr->pid); 2740 curr->comm, task_pid_nr(curr));
2741 print_lockdep_cache(lock); 2741 print_lockdep_cache(lock);
2742 printk(") at:\n"); 2742 printk(") at:\n");
2743 print_ip_sym(ip); 2743 print_ip_sym(ip);
@@ -3072,7 +3072,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3072 printk( "[ BUG: held lock freed! ]\n"); 3072 printk( "[ BUG: held lock freed! ]\n");
3073 printk( "-------------------------\n"); 3073 printk( "-------------------------\n");
3074 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 3074 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
3075 curr->comm, curr->pid, mem_from, mem_to-1); 3075 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
3076 print_lock(hlock); 3076 print_lock(hlock);
3077 lockdep_print_held_locks(curr); 3077 lockdep_print_held_locks(curr);
3078 3078
@@ -3125,7 +3125,7 @@ static void print_held_locks_bug(struct task_struct *curr)
3125 printk( "[ BUG: lock held at task exit time! ]\n"); 3125 printk( "[ BUG: lock held at task exit time! ]\n");
3126 printk( "-------------------------------------\n"); 3126 printk( "-------------------------------------\n");
3127 printk("%s/%d is exiting with locks still held!\n", 3127 printk("%s/%d is exiting with locks still held!\n",
3128 curr->comm, curr->pid); 3128 curr->comm, task_pid_nr(curr));
3129 lockdep_print_held_locks(curr); 3129 lockdep_print_held_locks(curr);
3130 3130
3131 printk("\nstack backtrace:\n"); 3131 printk("\nstack backtrace:\n");
diff --git a/kernel/marker.c b/kernel/marker.c
new file mode 100644
index 000000000000..ccb48d9a3657
--- /dev/null
+++ b/kernel/marker.c
@@ -0,0 +1,525 @@
1/*
2 * Copyright (C) 2007 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/marker.h>
25#include <linux/err.h>
26
27extern struct marker __start___markers[];
28extern struct marker __stop___markers[];
29
30/*
31 * module_mutex nests inside markers_mutex. Markers mutex protects the builtin
32 * and module markers, the hash table and deferred_sync.
33 */
34static DEFINE_MUTEX(markers_mutex);
35
36/*
37 * Marker deferred synchronization.
38 * Upon marker probe_unregister, we delay call to synchronize_sched() to
39 * accelerate mass unregistration (only when there is no more reference to a
40 * given module do we call synchronize_sched()). However, we need to make sure
41 * every critical region has ended before we re-arm a marker that has been
42 * unregistered and then registered back with a different probe data.
43 */
44static int deferred_sync;
45
46/*
47 * Marker hash table, containing the active markers.
48 * Protected by module_mutex.
49 */
50#define MARKER_HASH_BITS 6
51#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
52
53struct marker_entry {
54 struct hlist_node hlist;
55 char *format;
56 marker_probe_func *probe;
57 void *private;
58 int refcount; /* Number of times armed. 0 if disarmed. */
59 char name[0]; /* Contains name'\0'format'\0' */
60};
61
62static struct hlist_head marker_table[MARKER_TABLE_SIZE];
63
64/**
65 * __mark_empty_function - Empty probe callback
66 * @mdata: pointer of type const struct marker
67 * @fmt: format string
68 * @...: variable argument list
69 *
70 * Empty callback provided as a probe to the markers. By providing this to a
71 * disabled marker, we make sure the execution flow is always valid even
72 * though the function pointer change and the marker enabling are two distinct
 73 * operations that modify the execution flow of preemptible code.
74 */
75void __mark_empty_function(const struct marker *mdata, void *private,
76 const char *fmt, ...)
77{
78}
79EXPORT_SYMBOL_GPL(__mark_empty_function);
80
81/*
82 * Get marker if the marker is present in the marker hash table.
83 * Must be called with markers_mutex held.
84 * Returns NULL if not present.
85 */
86static struct marker_entry *get_marker(const char *name)
87{
88 struct hlist_head *head;
89 struct hlist_node *node;
90 struct marker_entry *e;
91 u32 hash = jhash(name, strlen(name), 0);
92
93 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
94 hlist_for_each_entry(e, node, head, hlist) {
95 if (!strcmp(name, e->name))
96 return e;
97 }
98 return NULL;
99}
100
101/*
102 * Add the marker to the marker hash table. Must be called with markers_mutex
103 * held.
104 */
105static int add_marker(const char *name, const char *format,
106 marker_probe_func *probe, void *private)
107{
108 struct hlist_head *head;
109 struct hlist_node *node;
110 struct marker_entry *e;
111 size_t name_len = strlen(name) + 1;
112 size_t format_len = 0;
113 u32 hash = jhash(name, name_len-1, 0);
114
115 if (format)
116 format_len = strlen(format) + 1;
117 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
118 hlist_for_each_entry(e, node, head, hlist) {
119 if (!strcmp(name, e->name)) {
120 printk(KERN_NOTICE
121 "Marker %s busy, probe %p already installed\n",
122 name, e->probe);
123 return -EBUSY; /* Already there */
124 }
125 }
126 /*
127 * Using kmalloc here to allocate a variable length element. Could
128 * cause some memory fragmentation if overused.
129 */
130 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
131 GFP_KERNEL);
132 if (!e)
133 return -ENOMEM;
134 memcpy(&e->name[0], name, name_len);
135 if (format) {
136 e->format = &e->name[name_len];
137 memcpy(e->format, format, format_len);
138 trace_mark(core_marker_format, "name %s format %s",
139 e->name, e->format);
140 } else
141 e->format = NULL;
142 e->probe = probe;
143 e->private = private;
144 e->refcount = 0;
145 hlist_add_head(&e->hlist, head);
146 return 0;
147}
148
149/*
150 * Remove the marker from the marker hash table. Must be called with mutex_lock
151 * held.
152 */
153static void *remove_marker(const char *name)
154{
155 struct hlist_head *head;
156 struct hlist_node *node;
157 struct marker_entry *e;
158 int found = 0;
159 size_t len = strlen(name) + 1;
160 void *private = NULL;
161 u32 hash = jhash(name, len-1, 0);
162
163 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
164 hlist_for_each_entry(e, node, head, hlist) {
165 if (!strcmp(name, e->name)) {
166 found = 1;
167 break;
168 }
169 }
170 if (found) {
171 private = e->private;
172 hlist_del(&e->hlist);
173 kfree(e);
174 }
175 return private;
176}
177
178/*
179 * Set the mark_entry format to the format found in the element.
180 */
181static int marker_set_format(struct marker_entry **entry, const char *format)
182{
183 struct marker_entry *e;
184 size_t name_len = strlen((*entry)->name) + 1;
185 size_t format_len = strlen(format) + 1;
186
187 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
188 GFP_KERNEL);
189 if (!e)
190 return -ENOMEM;
191 memcpy(&e->name[0], (*entry)->name, name_len);
192 e->format = &e->name[name_len];
193 memcpy(e->format, format, format_len);
194 e->probe = (*entry)->probe;
195 e->private = (*entry)->private;
196 e->refcount = (*entry)->refcount;
197 hlist_add_before(&e->hlist, &(*entry)->hlist);
198 hlist_del(&(*entry)->hlist);
199 kfree(*entry);
200 *entry = e;
201 trace_mark(core_marker_format, "name %s format %s",
202 e->name, e->format);
203 return 0;
204}
205
206/*
207 * Sets the probe callback corresponding to one marker.
208 */
209static int set_marker(struct marker_entry **entry, struct marker *elem)
210{
211 int ret;
212 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
213
214 if ((*entry)->format) {
215 if (strcmp((*entry)->format, elem->format) != 0) {
216 printk(KERN_NOTICE
217 "Format mismatch for probe %s "
218 "(%s), marker (%s)\n",
219 (*entry)->name,
220 (*entry)->format,
221 elem->format);
222 return -EPERM;
223 }
224 } else {
225 ret = marker_set_format(entry, elem->format);
226 if (ret)
227 return ret;
228 }
229 elem->call = (*entry)->probe;
230 elem->private = (*entry)->private;
231 elem->state = 1;
232 return 0;
233}
234
235/*
236 * Disable a marker and its probe callback.
 237 * Note: only a synchronize_sched() issued after setting elem->call to the
 238 * empty function ensures that the original callback is no longer used; this
 239 * works because preemption is disabled around the call site.
240 */
241static void disable_marker(struct marker *elem)
242{
243 elem->state = 0;
244 elem->call = __mark_empty_function;
245 /*
246 * Leave the private data and id there, because removal is racy and
247 * should be done only after a synchronize_sched(). These are never used
248 * until the next initialization anyway.
249 */
250}
251
252/**
253 * marker_update_probe_range - Update a probe range
254 * @begin: beginning of the range
255 * @end: end of the range
256 * @probe_module: module address of the probe being updated
257 * @refcount: number of references left to the given probe_module (out)
258 *
259 * Updates the probe callback corresponding to a range of markers.
260 * Must be called with markers_mutex held.
261 */
262void marker_update_probe_range(struct marker *begin,
263 struct marker *end, struct module *probe_module,
264 int *refcount)
265{
266 struct marker *iter;
267 struct marker_entry *mark_entry;
268
269 for (iter = begin; iter < end; iter++) {
270 mark_entry = get_marker(iter->name);
271 if (mark_entry && mark_entry->refcount) {
272 set_marker(&mark_entry, iter);
273 /*
274 * ignore error, continue
275 */
276 if (probe_module)
277 if (probe_module ==
278 __module_text_address((unsigned long)mark_entry->probe))
279 (*refcount)++;
280 } else {
281 disable_marker(iter);
282 }
283 }
284}
285
286/*
287 * Update probes, removing the faulty probes.
288 * Issues a synchronize_sched() when no reference to the module passed
289 * as parameter is found in the probes so the probe module can be
290 * safely unloaded from now on.
291 */
292static void marker_update_probes(struct module *probe_module)
293{
294 int refcount = 0;
295
296 mutex_lock(&markers_mutex);
297 /* Core kernel markers */
298 marker_update_probe_range(__start___markers,
299 __stop___markers, probe_module, &refcount);
300 /* Markers in modules. */
301 module_update_markers(probe_module, &refcount);
302 if (probe_module && refcount == 0) {
303 synchronize_sched();
304 deferred_sync = 0;
305 }
306 mutex_unlock(&markers_mutex);
307}
308
309/**
310 * marker_probe_register - Connect a probe to a marker
311 * @name: marker name
312 * @format: format string
313 * @probe: probe handler
314 * @private: probe private data
315 *
316 * private data must be a valid allocated memory address, or NULL.
317 * Returns 0 if ok, error value on error.
318 */
319int marker_probe_register(const char *name, const char *format,
320 marker_probe_func *probe, void *private)
321{
322 struct marker_entry *entry;
323 int ret = 0, need_update = 0;
324
325 mutex_lock(&markers_mutex);
326 entry = get_marker(name);
327 if (entry && entry->refcount) {
328 ret = -EBUSY;
329 goto end;
330 }
331 if (deferred_sync) {
332 synchronize_sched();
333 deferred_sync = 0;
334 }
335 ret = add_marker(name, format, probe, private);
336 if (ret)
337 goto end;
338 need_update = 1;
339end:
340 mutex_unlock(&markers_mutex);
341 if (need_update)
342 marker_update_probes(NULL);
343 return ret;
344}
345EXPORT_SYMBOL_GPL(marker_probe_register);
346
347/**
348 * marker_probe_unregister - Disconnect a probe from a marker
349 * @name: marker name
350 *
351 * Returns the private data given to marker_probe_register, or an ERR_PTR().
352 */
353void *marker_probe_unregister(const char *name)
354{
355 struct module *probe_module;
356 struct marker_entry *entry;
357 void *private;
358 int need_update = 0;
359
360 mutex_lock(&markers_mutex);
361 entry = get_marker(name);
362 if (!entry) {
363 private = ERR_PTR(-ENOENT);
364 goto end;
365 }
366 entry->refcount = 0;
367 /* In what module is the probe handler ? */
368 probe_module = __module_text_address((unsigned long)entry->probe);
369 private = remove_marker(name);
370 deferred_sync = 1;
371 need_update = 1;
372end:
373 mutex_unlock(&markers_mutex);
374 if (need_update)
375 marker_update_probes(probe_module);
376 return private;
377}
378EXPORT_SYMBOL_GPL(marker_probe_unregister);
379
380/**
381 * marker_probe_unregister_private_data - Disconnect a probe from a marker
382 * @private: probe private data
383 *
384 * Unregister a marker by providing the registered private data.
385 * Returns the private data given to marker_probe_register, or an ERR_PTR().
386 */
387void *marker_probe_unregister_private_data(void *private)
388{
389 struct module *probe_module;
390 struct hlist_head *head;
391 struct hlist_node *node;
392 struct marker_entry *entry;
393 int found = 0;
394 unsigned int i;
395 int need_update = 0;
396
397 mutex_lock(&markers_mutex);
398 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
399 head = &marker_table[i];
400 hlist_for_each_entry(entry, node, head, hlist) {
401 if (entry->private == private) {
402 found = 1;
403 goto iter_end;
404 }
405 }
406 }
407iter_end:
408 if (!found) {
409 private = ERR_PTR(-ENOENT);
410 goto end;
411 }
412 entry->refcount = 0;
413 /* In what module is the probe handler ? */
414 probe_module = __module_text_address((unsigned long)entry->probe);
415 private = remove_marker(entry->name);
416 deferred_sync = 1;
417 need_update = 1;
418end:
419 mutex_unlock(&markers_mutex);
420 if (need_update)
421 marker_update_probes(probe_module);
422 return private;
423}
424EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
425
426/**
427 * marker_arm - Arm a marker
428 * @name: marker name
429 *
 430 * Activate a marker. It keeps a reference count of the number of times
 431 * the marker has been armed/disarmed.
432 * Returns 0 if ok, error value on error.
433 */
434int marker_arm(const char *name)
435{
436 struct marker_entry *entry;
437 int ret = 0, need_update = 0;
438
439 mutex_lock(&markers_mutex);
440 entry = get_marker(name);
441 if (!entry) {
442 ret = -ENOENT;
443 goto end;
444 }
445 /*
446 * Only need to update probes when refcount passes from 0 to 1.
447 */
448 if (entry->refcount++)
449 goto end;
450 need_update = 1;
451end:
452 mutex_unlock(&markers_mutex);
453 if (need_update)
454 marker_update_probes(NULL);
455 return ret;
456}
457EXPORT_SYMBOL_GPL(marker_arm);
458
459/**
460 * marker_disarm - Disarm a marker
461 * @name: marker name
462 *
 463 * Disarm a marker. It keeps a reference count of the number of times the
 464 * marker has been armed/disarmed.
465 * Returns 0 if ok, error value on error.
466 */
467int marker_disarm(const char *name)
468{
469 struct marker_entry *entry;
470 int ret = 0, need_update = 0;
471
472 mutex_lock(&markers_mutex);
473 entry = get_marker(name);
474 if (!entry) {
475 ret = -ENOENT;
476 goto end;
477 }
478 /*
479 * Only permit decrement refcount if higher than 0.
480 * Do probe update only on 1 -> 0 transition.
481 */
482 if (entry->refcount) {
483 if (--entry->refcount)
484 goto end;
485 } else {
486 ret = -EPERM;
487 goto end;
488 }
489 need_update = 1;
490end:
491 mutex_unlock(&markers_mutex);
492 if (need_update)
493 marker_update_probes(NULL);
494 return ret;
495}
496EXPORT_SYMBOL_GPL(marker_disarm);
497
498/**
499 * marker_get_private_data - Get a marker's probe private data
500 * @name: marker name
501 *
502 * Returns the private data pointer, or an ERR_PTR.
503 * The private data pointer should _only_ be dereferenced if the caller is the
504 * owner of the data, or its content could vanish. This is mostly used to
505 * confirm that a caller is the owner of a registered probe.
506 */
507void *marker_get_private_data(const char *name)
508{
509 struct hlist_head *head;
510 struct hlist_node *node;
511 struct marker_entry *e;
512 size_t name_len = strlen(name) + 1;
513 u32 hash = jhash(name, name_len-1, 0);
514 int found = 0;
515
516 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
517 hlist_for_each_entry(e, node, head, hlist) {
518 if (!strcmp(name, e->name)) {
519 found = 1;
520 return e->private;
521 }
522 }
523 return ERR_PTR(-ENOENT);
524}
525EXPORT_SYMBOL_GPL(marker_get_private_data);
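
A hedged usage sketch for the API exported by this file; the marker name, format string and probe body are invented, while marker_probe_register(), marker_arm(), marker_disarm(), marker_probe_unregister() and the probe prototype match the code above:

/* Probe with the same prototype as __mark_empty_function(). */
static void my_probe(const struct marker *mdata, void *private,
		     const char *fmt, ...)
{
	/* decode the arguments described by fmt via va_start()/va_end() */
}

static int __init my_probe_init(void)
{
	int ret;

	/*
	 * Connects to a site instrumented elsewhere with
	 * trace_mark(subsys_event, "value %d", some_value);
	 */
	ret = marker_probe_register("subsys_event", "value %d",
				    my_probe, NULL);
	if (ret)
		return ret;
	return marker_arm("subsys_event");	/* 0 -> 1 enables the site */
}

static void __exit my_probe_exit(void)
{
	marker_disarm("subsys_event");
	marker_probe_unregister("subsys_event");
}

module_init(my_probe_init);
module_exit(my_probe_exit);
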
diff --git a/kernel/module.c b/kernel/module.c
index 7734595bd329..3202c9950073 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1673,6 +1673,8 @@ static struct module *load_module(void __user *umod,
1673 unsigned int unusedcrcindex; 1673 unsigned int unusedcrcindex;
1674 unsigned int unusedgplindex; 1674 unsigned int unusedgplindex;
1675 unsigned int unusedgplcrcindex; 1675 unsigned int unusedgplcrcindex;
1676 unsigned int markersindex;
1677 unsigned int markersstringsindex;
1676 struct module *mod; 1678 struct module *mod;
1677 long err = 0; 1679 long err = 0;
1678 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1680 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1939,6 +1941,9 @@ static struct module *load_module(void __user *umod,
1939 add_taint_module(mod, TAINT_FORCED_MODULE); 1941 add_taint_module(mod, TAINT_FORCED_MODULE);
1940 } 1942 }
1941#endif 1943#endif
1944 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
1945 markersstringsindex = find_sec(hdr, sechdrs, secstrings,
1946 "__markers_strings");
1942 1947
1943 /* Now do relocations. */ 1948 /* Now do relocations. */
1944 for (i = 1; i < hdr->e_shnum; i++) { 1949 for (i = 1; i < hdr->e_shnum; i++) {
@@ -1961,6 +1966,11 @@ static struct module *load_module(void __user *umod,
1961 if (err < 0) 1966 if (err < 0)
1962 goto cleanup; 1967 goto cleanup;
1963 } 1968 }
1969#ifdef CONFIG_MARKERS
1970 mod->markers = (void *)sechdrs[markersindex].sh_addr;
1971 mod->num_markers =
1972 sechdrs[markersindex].sh_size / sizeof(*mod->markers);
1973#endif
1964 1974
1965 /* Find duplicate symbols */ 1975 /* Find duplicate symbols */
1966 err = verify_export_symbols(mod); 1976 err = verify_export_symbols(mod);
@@ -1979,6 +1989,11 @@ static struct module *load_module(void __user *umod,
1979 1989
1980 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 1990 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
1981 1991
1992#ifdef CONFIG_MARKERS
1993 if (!mod->taints)
1994 marker_update_probe_range(mod->markers,
1995 mod->markers + mod->num_markers, NULL, NULL);
1996#endif
1982 err = module_finalize(hdr, sechdrs, mod); 1997 err = module_finalize(hdr, sechdrs, mod);
1983 if (err < 0) 1998 if (err < 0)
1984 goto cleanup; 1999 goto cleanup;
@@ -2570,3 +2585,18 @@ EXPORT_SYMBOL(module_remove_driver);
2570void struct_module(struct module *mod) { return; } 2585void struct_module(struct module *mod) { return; }
2571EXPORT_SYMBOL(struct_module); 2586EXPORT_SYMBOL(struct_module);
2572#endif 2587#endif
2588
2589#ifdef CONFIG_MARKERS
2590void module_update_markers(struct module *probe_module, int *refcount)
2591{
2592 struct module *mod;
2593
2594 mutex_lock(&module_mutex);
2595 list_for_each_entry(mod, &modules, list)
2596 if (!mod->taints)
2597 marker_update_probe_range(mod->markers,
2598 mod->markers + mod->num_markers,
2599 probe_module, refcount);
2600 mutex_unlock(&module_mutex);
2601}
2602#endif
diff --git a/kernel/notifier.c b/kernel/notifier.c
new file mode 100644
index 000000000000..4253f472f060
--- /dev/null
+++ b/kernel/notifier.c
@@ -0,0 +1,539 @@
1#include <linux/kdebug.h>
2#include <linux/kprobes.h>
3#include <linux/module.h>
4#include <linux/notifier.h>
5#include <linux/rcupdate.h>
6#include <linux/vmalloc.h>
7
8/*
9 * Notifier list for kernel code which wants to be called
10 * at shutdown. This is used to stop any idling DMA operations
11 * and the like.
12 */
13BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
14
15/*
16 * Notifier chain core routines. The exported routines below
17 * are layered on top of these, with appropriate locking added.
18 */
19
20static int notifier_chain_register(struct notifier_block **nl,
21 struct notifier_block *n)
22{
23 while ((*nl) != NULL) {
24 if (n->priority > (*nl)->priority)
25 break;
26 nl = &((*nl)->next);
27 }
28 n->next = *nl;
29 rcu_assign_pointer(*nl, n);
30 return 0;
31}
32
33static int notifier_chain_unregister(struct notifier_block **nl,
34 struct notifier_block *n)
35{
36 while ((*nl) != NULL) {
37 if ((*nl) == n) {
38 rcu_assign_pointer(*nl, n->next);
39 return 0;
40 }
41 nl = &((*nl)->next);
42 }
43 return -ENOENT;
44}
45
46/**
47 * notifier_call_chain - Informs the registered notifiers about an event.
48 * @nl: Pointer to head of the blocking notifier chain
49 * @val: Value passed unmodified to notifier function
50 * @v: Pointer passed unmodified to notifier function
 51 * @nr_to_call: Number of notifier functions to be called. Pass -1 to
 52 * call all of them.
 53 * @nr_calls: Records the number of notifications sent. Pass NULL if this
 54 * count is not needed.
55 * @returns: notifier_call_chain returns the value returned by the
56 * last notifier function called.
57 */
58static int __kprobes notifier_call_chain(struct notifier_block **nl,
59 unsigned long val, void *v,
60 int nr_to_call, int *nr_calls)
61{
62 int ret = NOTIFY_DONE;
63 struct notifier_block *nb, *next_nb;
64
65 nb = rcu_dereference(*nl);
66
67 while (nb && nr_to_call) {
68 next_nb = rcu_dereference(nb->next);
69 ret = nb->notifier_call(nb, val, v);
70
71 if (nr_calls)
72 (*nr_calls)++;
73
74 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
75 break;
76 nb = next_nb;
77 nr_to_call--;
78 }
79 return ret;
80}
81
82/*
83 * Atomic notifier chain routines. Registration and unregistration
84 * use a spinlock, and call_chain is synchronized by RCU (no locks).
85 */
86
87/**
88 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
89 * @nh: Pointer to head of the atomic notifier chain
90 * @n: New entry in notifier chain
91 *
92 * Adds a notifier to an atomic notifier chain.
93 *
94 * Currently always returns zero.
95 */
96int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
97 struct notifier_block *n)
98{
99 unsigned long flags;
100 int ret;
101
102 spin_lock_irqsave(&nh->lock, flags);
103 ret = notifier_chain_register(&nh->head, n);
104 spin_unlock_irqrestore(&nh->lock, flags);
105 return ret;
106}
107EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
108
109/**
110 * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
111 * @nh: Pointer to head of the atomic notifier chain
112 * @n: Entry to remove from notifier chain
113 *
114 * Removes a notifier from an atomic notifier chain.
115 *
116 * Returns zero on success or %-ENOENT on failure.
117 */
118int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
119 struct notifier_block *n)
120{
121 unsigned long flags;
122 int ret;
123
124 spin_lock_irqsave(&nh->lock, flags);
125 ret = notifier_chain_unregister(&nh->head, n);
126 spin_unlock_irqrestore(&nh->lock, flags);
127 synchronize_rcu();
128 return ret;
129}
130EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
131
132/**
133 * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
134 * @nh: Pointer to head of the atomic notifier chain
135 * @val: Value passed unmodified to notifier function
136 * @v: Pointer passed unmodified to notifier function
137 * @nr_to_call: See the comment for notifier_call_chain.
138 * @nr_calls: See the comment for notifier_call_chain.
139 *
140 * Calls each function in a notifier chain in turn. The functions
141 * run in an atomic context, so they must not block.
142 * This routine uses RCU to synchronize with changes to the chain.
143 *
144 * If the return value of the notifier can be and'ed
145 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
146 * will return immediately, with the return value of
147 * the notifier function which halted execution.
148 * Otherwise the return value is the return value
149 * of the last notifier function called.
150 */
151int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
152 unsigned long val, void *v,
153 int nr_to_call, int *nr_calls)
154{
155 int ret;
156
157 rcu_read_lock();
158 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
159 rcu_read_unlock();
160 return ret;
161}
162EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
163
164int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
165 unsigned long val, void *v)
166{
167 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
168}
169EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
170
171/*
172 * Blocking notifier chain routines. All access to the chain is
173 * synchronized by an rwsem.
174 */
175
176/**
177 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
178 * @nh: Pointer to head of the blocking notifier chain
179 * @n: New entry in notifier chain
180 *
181 * Adds a notifier to a blocking notifier chain.
182 * Must be called in process context.
183 *
184 * Currently always returns zero.
185 */
186int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
187 struct notifier_block *n)
188{
189 int ret;
190
191 /*
192 * This code gets used during boot-up, when task switching is
193 * not yet working and interrupts must remain disabled. At
194 * such times we must not call down_write().
195 */
196 if (unlikely(system_state == SYSTEM_BOOTING))
197 return notifier_chain_register(&nh->head, n);
198
199 down_write(&nh->rwsem);
200 ret = notifier_chain_register(&nh->head, n);
201 up_write(&nh->rwsem);
202 return ret;
203}
204EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
205
206/**
207 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
208 * @nh: Pointer to head of the blocking notifier chain
209 * @n: Entry to remove from notifier chain
210 *
211 * Removes a notifier from a blocking notifier chain.
212 * Must be called from process context.
213 *
214 * Returns zero on success or %-ENOENT on failure.
215 */
216int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
217 struct notifier_block *n)
218{
219 int ret;
220
221 /*
222 * This code gets used during boot-up, when task switching is
223 * not yet working and interrupts must remain disabled. At
224 * such times we must not call down_write().
225 */
226 if (unlikely(system_state == SYSTEM_BOOTING))
227 return notifier_chain_unregister(&nh->head, n);
228
229 down_write(&nh->rwsem);
230 ret = notifier_chain_unregister(&nh->head, n);
231 up_write(&nh->rwsem);
232 return ret;
233}
234EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
235
236/**
237 * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
238 * @nh: Pointer to head of the blocking notifier chain
239 * @val: Value passed unmodified to notifier function
240 * @v: Pointer passed unmodified to notifier function
241 * @nr_to_call: See comment for notifier_call_chain.
242 * @nr_calls: See comment for notifier_call_chain.
243 *
244 * Calls each function in a notifier chain in turn. The functions
245 * run in a process context, so they are allowed to block.
246 *
247 * If the return value of the notifier can be and'ed
248 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
249 * will return immediately, with the return value of
250 * the notifier function which halted execution.
251 * Otherwise the return value is the return value
252 * of the last notifier function called.
253 */
254int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
255 unsigned long val, void *v,
256 int nr_to_call, int *nr_calls)
257{
258 int ret = NOTIFY_DONE;
259
260 /*
261 * We check the head outside the lock, but if this access is
262 * racy then it does not matter what the result of the test
263 * is, we re-check the list after having taken the lock anyway:
264 */
265 if (rcu_dereference(nh->head)) {
266 down_read(&nh->rwsem);
267 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
268 nr_calls);
269 up_read(&nh->rwsem);
270 }
271 return ret;
272}
273EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
274
275int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
276 unsigned long val, void *v)
277{
278 return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
279}
280EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
281
282/*
283 * Raw notifier chain routines. There is no protection;
284 * the caller must provide it. Use at your own risk!
285 */
286
287/**
288 * raw_notifier_chain_register - Add notifier to a raw notifier chain
289 * @nh: Pointer to head of the raw notifier chain
290 * @n: New entry in notifier chain
291 *
292 * Adds a notifier to a raw notifier chain.
293 * All locking must be provided by the caller.
294 *
295 * Currently always returns zero.
296 */
297int raw_notifier_chain_register(struct raw_notifier_head *nh,
298 struct notifier_block *n)
299{
300 return notifier_chain_register(&nh->head, n);
301}
302EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
303
304/**
305 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
306 * @nh: Pointer to head of the raw notifier chain
307 * @n: Entry to remove from notifier chain
308 *
309 * Removes a notifier from a raw notifier chain.
310 * All locking must be provided by the caller.
311 *
312 * Returns zero on success or %-ENOENT on failure.
313 */
314int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
315 struct notifier_block *n)
316{
317 return notifier_chain_unregister(&nh->head, n);
318}
319EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
320
321/**
322 * __raw_notifier_call_chain - Call functions in a raw notifier chain
323 * @nh: Pointer to head of the raw notifier chain
324 * @val: Value passed unmodified to notifier function
325 * @v: Pointer passed unmodified to notifier function
326 * @nr_to_call: See comment for notifier_call_chain.
327 * @nr_calls: See comment for notifier_call_chain
328 *
329 * Calls each function in a notifier chain in turn. The functions
330 * run in an undefined context.
331 * All locking must be provided by the caller.
332 *
333 * If the return value of the notifier can be and'ed
334 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
335 * will return immediately, with the return value of
336 * the notifier function which halted execution.
337 * Otherwise the return value is the return value
338 * of the last notifier function called.
339 */
340int __raw_notifier_call_chain(struct raw_notifier_head *nh,
341 unsigned long val, void *v,
342 int nr_to_call, int *nr_calls)
343{
344 return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
345}
346EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
347
348int raw_notifier_call_chain(struct raw_notifier_head *nh,
349 unsigned long val, void *v)
350{
351 return __raw_notifier_call_chain(nh, val, v, -1, NULL);
352}
353EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
354
355/*
356 * SRCU notifier chain routines. Registration and unregistration
357 * use a mutex, and call_chain is synchronized by SRCU (no locks).
358 */
359
360/**
361 * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
362 * @nh: Pointer to head of the SRCU notifier chain
363 * @n: New entry in notifier chain
364 *
365 * Adds a notifier to an SRCU notifier chain.
366 * Must be called in process context.
367 *
368 * Currently always returns zero.
369 */
370int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
371 struct notifier_block *n)
372{
373 int ret;
374
375 /*
376 * This code gets used during boot-up, when task switching is
377 * not yet working and interrupts must remain disabled. At
378 * such times we must not call mutex_lock().
379 */
380 if (unlikely(system_state == SYSTEM_BOOTING))
381 return notifier_chain_register(&nh->head, n);
382
383 mutex_lock(&nh->mutex);
384 ret = notifier_chain_register(&nh->head, n);
385 mutex_unlock(&nh->mutex);
386 return ret;
387}
388EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
389
390/**
391 * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
392 * @nh: Pointer to head of the SRCU notifier chain
393 * @n: Entry to remove from notifier chain
394 *
395 * Removes a notifier from an SRCU notifier chain.
396 * Must be called from process context.
397 *
398 * Returns zero on success or %-ENOENT on failure.
399 */
400int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
401 struct notifier_block *n)
402{
403 int ret;
404
405 /*
406 * This code gets used during boot-up, when task switching is
407 * not yet working and interrupts must remain disabled. At
408 * such times we must not call mutex_lock().
409 */
410 if (unlikely(system_state == SYSTEM_BOOTING))
411 return notifier_chain_unregister(&nh->head, n);
412
413 mutex_lock(&nh->mutex);
414 ret = notifier_chain_unregister(&nh->head, n);
415 mutex_unlock(&nh->mutex);
416 synchronize_srcu(&nh->srcu);
417 return ret;
418}
419EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
420
421/**
422 * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
423 * @nh: Pointer to head of the SRCU notifier chain
424 * @val: Value passed unmodified to notifier function
425 * @v: Pointer passed unmodified to notifier function
426 * @nr_to_call: See comment for notifier_call_chain.
427 * @nr_calls: See comment for notifier_call_chain
428 *
429 * Calls each function in a notifier chain in turn. The functions
430 * run in a process context, so they are allowed to block.
431 *
432 * If the return value of the notifier can be and'ed
433 * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain()
434 * will return immediately, with the return value of
435 * the notifier function which halted execution.
436 * Otherwise the return value is the return value
437 * of the last notifier function called.
438 */
439int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
440 unsigned long val, void *v,
441 int nr_to_call, int *nr_calls)
442{
443 int ret;
444 int idx;
445
446 idx = srcu_read_lock(&nh->srcu);
447 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
448 srcu_read_unlock(&nh->srcu, idx);
449 return ret;
450}
451EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
452
453int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
454 unsigned long val, void *v)
455{
456 return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
457}
458EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
459
460/**
461 * srcu_init_notifier_head - Initialize an SRCU notifier head
462 * @nh: Pointer to head of the srcu notifier chain
463 *
464 * Unlike other sorts of notifier heads, SRCU notifier heads require
465 * dynamic initialization. Be sure to call this routine before
466 * calling any of the other SRCU notifier routines for this head.
467 *
468 * If an SRCU notifier head is deallocated, it must first be cleaned
469 * up by calling srcu_cleanup_notifier_head(). Otherwise the head's
470 * per-cpu data (used by the SRCU mechanism) will leak.
471 */
472void srcu_init_notifier_head(struct srcu_notifier_head *nh)
473{
474 mutex_init(&nh->mutex);
475 if (init_srcu_struct(&nh->srcu) < 0)
476 BUG();
477 nh->head = NULL;
478}
479EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
480
481/**
482 * register_reboot_notifier - Register function to be called at reboot time
483 * @nb: Info about notifier function to be called
484 *
485 * Registers a function with the list of functions
486 * to be called at reboot time.
487 *
488 * Currently always returns zero, as blocking_notifier_chain_register()
489 * always returns zero.
490 */
491int register_reboot_notifier(struct notifier_block *nb)
492{
493 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
494}
495EXPORT_SYMBOL(register_reboot_notifier);
496
497/**
498 * unregister_reboot_notifier - Unregister previously registered reboot notifier
499 * @nb: Hook to be unregistered
500 *
501 * Unregisters a previously registered reboot
502 * notifier function.
503 *
504 * Returns zero on success, or %-ENOENT on failure.
505 */
506int unregister_reboot_notifier(struct notifier_block *nb)
507{
508 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
509}
510EXPORT_SYMBOL(unregister_reboot_notifier);
511
512static ATOMIC_NOTIFIER_HEAD(die_chain);
513
514int notify_die(enum die_val val, const char *str,
515 struct pt_regs *regs, long err, int trap, int sig)
516{
517 struct die_args args = {
518 .regs = regs,
519 .str = str,
520 .err = err,
521 .trapnr = trap,
522 .signr = sig,
523
524 };
525 return atomic_notifier_call_chain(&die_chain, val, &args);
526}
527
528int register_die_notifier(struct notifier_block *nb)
529{
530 vmalloc_sync_all();
531 return atomic_notifier_chain_register(&die_chain, nb);
532}
533EXPORT_SYMBOL_GPL(register_die_notifier);
534
535int unregister_die_notifier(struct notifier_block *nb)
536{
537 return atomic_notifier_chain_unregister(&die_chain, nb);
538}
539EXPORT_SYMBOL_GPL(unregister_die_notifier);
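
A hedged sketch of a typical consumer of the reboot chain declared at the top of this file; the callback body and priority value are illustrative:

static int my_reboot_handler(struct notifier_block *nb,
			     unsigned long action, void *data)
{
	/* quiesce hardware, flush state, etc. before the machine goes down */
	return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
	.notifier_call	= my_reboot_handler,
	.priority	= 0,	/* higher priority entries are called first */
};

static int __init my_driver_init(void)
{
	return register_reboot_notifier(&my_reboot_nb);
}

static void __exit my_driver_exit(void)
{
	unregister_reboot_notifier(&my_reboot_nb);
}
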
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
new file mode 100644
index 000000000000..aead4d69f62b
--- /dev/null
+++ b/kernel/ns_cgroup.c
@@ -0,0 +1,100 @@
1/*
2 * ns_cgroup.c - namespace cgroup subsystem
3 *
4 * Copyright 2006, 2007 IBM Corp
5 */
6
7#include <linux/module.h>
8#include <linux/cgroup.h>
9#include <linux/fs.h>
10
11struct ns_cgroup {
12 struct cgroup_subsys_state css;
13 spinlock_t lock;
14};
15
16struct cgroup_subsys ns_subsys;
17
18static inline struct ns_cgroup *cgroup_to_ns(
19 struct cgroup *cgroup)
20{
21 return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
22 struct ns_cgroup, css);
23}
24
25int ns_cgroup_clone(struct task_struct *task)
26{
27 return cgroup_clone(task, &ns_subsys);
28}
29
30/*
31 * Rules:
32 * 1. you can only enter a cgroup which is a child of your current
33 * cgroup
34 * 2. you can only place another process into a cgroup if
35 * a. you have CAP_SYS_ADMIN
36 * b. your cgroup is an ancestor of task's destination cgroup
37 * (hence either you are in the same cgroup as task, or in an
38 * ancestor cgroup thereof)
39 */
40static int ns_can_attach(struct cgroup_subsys *ss,
41 struct cgroup *new_cgroup, struct task_struct *task)
42{
43 struct cgroup *orig;
44
45 if (current != task) {
46 if (!capable(CAP_SYS_ADMIN))
47 return -EPERM;
48
49 if (!cgroup_is_descendant(new_cgroup))
50 return -EPERM;
51 }
52
53 if (atomic_read(&new_cgroup->count) != 0)
54 return -EPERM;
55
56 orig = task_cgroup(task, ns_subsys_id);
57 if (orig && orig != new_cgroup->parent)
58 return -EPERM;
59
60 return 0;
61}
62
63/*
64 * Rules: you can only create a cgroup if
65 * 1. you are capable(CAP_SYS_ADMIN)
66 * 2. the target cgroup is a descendant of your own cgroup
67 */
68static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
69 struct cgroup *cgroup)
70{
71 struct ns_cgroup *ns_cgroup;
72
73 if (!capable(CAP_SYS_ADMIN))
74 return ERR_PTR(-EPERM);
75 if (!cgroup_is_descendant(cgroup))
76 return ERR_PTR(-EPERM);
77
78 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
79 if (!ns_cgroup)
80 return ERR_PTR(-ENOMEM);
81 spin_lock_init(&ns_cgroup->lock);
82 return &ns_cgroup->css;
83}
84
85static void ns_destroy(struct cgroup_subsys *ss,
86 struct cgroup *cgroup)
87{
88 struct ns_cgroup *ns_cgroup;
89
90 ns_cgroup = cgroup_to_ns(cgroup);
91 kfree(ns_cgroup);
92}
93
94struct cgroup_subsys ns_subsys = {
95 .name = "ns",
96 .can_attach = ns_can_attach,
97 .create = ns_create,
98 .destroy = ns_destroy,
99 .subsys_id = ns_subsys_id,
100};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 049e7c0ac566..79f871bc0ef4 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,19 +26,6 @@ static struct kmem_cache *nsproxy_cachep;
26 26
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
28 28
29static inline void get_nsproxy(struct nsproxy *ns)
30{
31 atomic_inc(&ns->count);
32}
33
34void get_task_namespaces(struct task_struct *tsk)
35{
36 struct nsproxy *ns = tsk->nsproxy;
37 if (ns) {
38 get_nsproxy(ns);
39 }
40}
41
42/* 29/*
43 * creates a copy of "orig" with refcount 1. 30 * creates a copy of "orig" with refcount 1.
44 */ 31 */
@@ -87,7 +74,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
87 goto out_ipc; 74 goto out_ipc;
88 } 75 }
89 76
90 new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); 77 new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
91 if (IS_ERR(new_nsp->pid_ns)) { 78 if (IS_ERR(new_nsp->pid_ns)) {
92 err = PTR_ERR(new_nsp->pid_ns); 79 err = PTR_ERR(new_nsp->pid_ns);
93 goto out_pid; 80 goto out_pid;
@@ -142,7 +129,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
142 129
143 get_nsproxy(old_ns); 130 get_nsproxy(old_ns);
144 131
145 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) 132 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
133 CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET)))
146 return 0; 134 return 0;
147 135
148 if (!capable(CAP_SYS_ADMIN)) { 136 if (!capable(CAP_SYS_ADMIN)) {
@@ -156,7 +144,14 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
156 goto out; 144 goto out;
157 } 145 }
158 146
147 err = ns_cgroup_clone(tsk);
148 if (err) {
149 put_nsproxy(new_ns);
150 goto out;
151 }
152
159 tsk->nsproxy = new_ns; 153 tsk->nsproxy = new_ns;
154
160out: 155out:
161 put_nsproxy(old_ns); 156 put_nsproxy(old_ns);
162 return err; 157 return err;
@@ -196,11 +191,46 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
196 191
197 *new_nsp = create_new_namespaces(unshare_flags, current, 192 *new_nsp = create_new_namespaces(unshare_flags, current,
198 new_fs ? new_fs : current->fs); 193 new_fs ? new_fs : current->fs);
199 if (IS_ERR(*new_nsp)) 194 if (IS_ERR(*new_nsp)) {
200 err = PTR_ERR(*new_nsp); 195 err = PTR_ERR(*new_nsp);
196 goto out;
197 }
198
199 err = ns_cgroup_clone(current);
200 if (err)
201 put_nsproxy(*new_nsp);
202
203out:
201 return err; 204 return err;
202} 205}
203 206
207void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
208{
209 struct nsproxy *ns;
210
211 might_sleep();
212
213 ns = p->nsproxy;
214
215 rcu_assign_pointer(p->nsproxy, new);
216
217 if (ns && atomic_dec_and_test(&ns->count)) {
218 /*
219 * wait for others to get what they want from this nsproxy.
220 *
221 * cannot release this nsproxy via the call_rcu() since
222 * put_mnt_ns() will want to sleep
223 */
224 synchronize_rcu();
225 free_nsproxy(ns);
226 }
227}
228
229void exit_task_namespaces(struct task_struct *p)
230{
231 switch_task_namespaces(p, NULL);
232}
233
204static int __init nsproxy_cache_init(void) 234static int __init nsproxy_cache_init(void)
205{ 235{
206 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 236 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
diff --git a/kernel/pid.c b/kernel/pid.c
index c6e3f9ffff87..d1db36b94674 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -18,6 +18,12 @@
18 * allocation scenario when all but one out of 1 million PIDs possible are 18 * allocation scenario when all but one out of 1 million PIDs possible are
19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE 19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). 20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
21 *
22 * Pid namespaces:
23 * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
24 * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
25 * Many thanks to Oleg Nesterov for comments and help
26 *
21 */ 27 */
22 28
23#include <linux/mm.h> 29#include <linux/mm.h>
@@ -28,12 +34,14 @@
28#include <linux/hash.h> 34#include <linux/hash.h>
29#include <linux/pid_namespace.h> 35#include <linux/pid_namespace.h>
30#include <linux/init_task.h> 36#include <linux/init_task.h>
37#include <linux/syscalls.h>
31 38
32#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 39#define pid_hashfn(nr, ns) \
40 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
33static struct hlist_head *pid_hash; 41static struct hlist_head *pid_hash;
34static int pidhash_shift; 42static int pidhash_shift;
35static struct kmem_cache *pid_cachep;
36struct pid init_struct_pid = INIT_STRUCT_PID; 43struct pid init_struct_pid = INIT_STRUCT_PID;
44static struct kmem_cache *pid_ns_cachep;
37 45
38int pid_max = PID_MAX_DEFAULT; 46int pid_max = PID_MAX_DEFAULT;
39 47
@@ -68,8 +76,25 @@ struct pid_namespace init_pid_ns = {
68 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 76 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
69 }, 77 },
70 .last_pid = 0, 78 .last_pid = 0,
71 .child_reaper = &init_task 79 .level = 0,
80 .child_reaper = &init_task,
72}; 81};
82EXPORT_SYMBOL_GPL(init_pid_ns);
83
84int is_container_init(struct task_struct *tsk)
85{
86 int ret = 0;
87 struct pid *pid;
88
89 rcu_read_lock();
90 pid = task_pid(tsk);
91 if (pid != NULL && pid->numbers[pid->level].nr == 1)
92 ret = 1;
93 rcu_read_unlock();
94
95 return ret;
96}
97EXPORT_SYMBOL(is_container_init);
73 98
74/* 99/*
75 * Note: disable interrupts while the pidmap_lock is held as an 100 * Note: disable interrupts while the pidmap_lock is held as an
@@ -176,11 +201,17 @@ static int next_pidmap(struct pid_namespace *pid_ns, int last)
176 201
177fastcall void put_pid(struct pid *pid) 202fastcall void put_pid(struct pid *pid)
178{ 203{
204 struct pid_namespace *ns;
205
179 if (!pid) 206 if (!pid)
180 return; 207 return;
208
209 ns = pid->numbers[pid->level].ns;
181 if ((atomic_read(&pid->count) == 1) || 210 if ((atomic_read(&pid->count) == 1) ||
182 atomic_dec_and_test(&pid->count)) 211 atomic_dec_and_test(&pid->count)) {
183 kmem_cache_free(pid_cachep, pid); 212 kmem_cache_free(ns->pid_cachep, pid);
213 put_pid_ns(ns);
214 }
184} 215}
185EXPORT_SYMBOL_GPL(put_pid); 216EXPORT_SYMBOL_GPL(put_pid);
186 217
@@ -193,60 +224,94 @@ static void delayed_put_pid(struct rcu_head *rhp)
193fastcall void free_pid(struct pid *pid) 224fastcall void free_pid(struct pid *pid)
194{ 225{
195 /* We can be called with write_lock_irq(&tasklist_lock) held */ 226 /* We can be called with write_lock_irq(&tasklist_lock) held */
227 int i;
196 unsigned long flags; 228 unsigned long flags;
197 229
198 spin_lock_irqsave(&pidmap_lock, flags); 230 spin_lock_irqsave(&pidmap_lock, flags);
199 hlist_del_rcu(&pid->pid_chain); 231 for (i = 0; i <= pid->level; i++)
232 hlist_del_rcu(&pid->numbers[i].pid_chain);
200 spin_unlock_irqrestore(&pidmap_lock, flags); 233 spin_unlock_irqrestore(&pidmap_lock, flags);
201 234
202 free_pidmap(&init_pid_ns, pid->nr); 235 for (i = 0; i <= pid->level; i++)
236 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr);
237
203 call_rcu(&pid->rcu, delayed_put_pid); 238 call_rcu(&pid->rcu, delayed_put_pid);
204} 239}
205 240
206struct pid *alloc_pid(void) 241struct pid *alloc_pid(struct pid_namespace *ns)
207{ 242{
208 struct pid *pid; 243 struct pid *pid;
209 enum pid_type type; 244 enum pid_type type;
210 int nr = -1; 245 int i, nr;
246 struct pid_namespace *tmp;
247 struct upid *upid;
211 248
212 pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL); 249 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
213 if (!pid) 250 if (!pid)
214 goto out; 251 goto out;
215 252
216 nr = alloc_pidmap(current->nsproxy->pid_ns); 253 tmp = ns;
217 if (nr < 0) 254 for (i = ns->level; i >= 0; i--) {
218 goto out_free; 255 nr = alloc_pidmap(tmp);
256 if (nr < 0)
257 goto out_free;
258
259 pid->numbers[i].nr = nr;
260 pid->numbers[i].ns = tmp;
261 tmp = tmp->parent;
262 }
219 263
264 get_pid_ns(ns);
265 pid->level = ns->level;
220 atomic_set(&pid->count, 1); 266 atomic_set(&pid->count, 1);
221 pid->nr = nr;
222 for (type = 0; type < PIDTYPE_MAX; ++type) 267 for (type = 0; type < PIDTYPE_MAX; ++type)
223 INIT_HLIST_HEAD(&pid->tasks[type]); 268 INIT_HLIST_HEAD(&pid->tasks[type]);
224 269
225 spin_lock_irq(&pidmap_lock); 270 spin_lock_irq(&pidmap_lock);
226 hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]); 271 for (i = ns->level; i >= 0; i--) {
272 upid = &pid->numbers[i];
273 hlist_add_head_rcu(&upid->pid_chain,
274 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
275 }
227 spin_unlock_irq(&pidmap_lock); 276 spin_unlock_irq(&pidmap_lock);
228 277
229out: 278out:
230 return pid; 279 return pid;
231 280
232out_free: 281out_free:
233 kmem_cache_free(pid_cachep, pid); 282 for (i++; i <= ns->level; i++)
283 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr);
284
285 kmem_cache_free(ns->pid_cachep, pid);
234 pid = NULL; 286 pid = NULL;
235 goto out; 287 goto out;
236} 288}
237 289
238struct pid * fastcall find_pid(int nr) 290struct pid * fastcall find_pid_ns(int nr, struct pid_namespace *ns)
239{ 291{
240 struct hlist_node *elem; 292 struct hlist_node *elem;
241 struct pid *pid; 293 struct upid *pnr;
294
295 hlist_for_each_entry_rcu(pnr, elem,
296 &pid_hash[pid_hashfn(nr, ns)], pid_chain)
297 if (pnr->nr == nr && pnr->ns == ns)
298 return container_of(pnr, struct pid,
299 numbers[ns->level]);
242 300
243 hlist_for_each_entry_rcu(pid, elem,
244 &pid_hash[pid_hashfn(nr)], pid_chain) {
245 if (pid->nr == nr)
246 return pid;
247 }
248 return NULL; 301 return NULL;
249} 302}
303EXPORT_SYMBOL_GPL(find_pid_ns);
304
305struct pid *find_vpid(int nr)
306{
307 return find_pid_ns(nr, current->nsproxy->pid_ns);
308}
309EXPORT_SYMBOL_GPL(find_vpid);
310
311struct pid *find_pid(int nr)
312{
313 return find_pid_ns(nr, &init_pid_ns);
314}
250EXPORT_SYMBOL_GPL(find_pid); 315EXPORT_SYMBOL_GPL(find_pid);
251 316
252/* 317/*
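With the alloc_pid() changes above, a struct pid carries one struct upid per namespace level: numbers[0] is the id in the initial namespace and numbers[pid->level] the id in the task's own namespace. A sketch (kernel context, not part of the patch) that walks those entries:

#include <linux/kernel.h>
#include <linux/pid.h>

/* Dump every id attached to one struct pid, innermost level last. */
static void print_all_ids(struct pid *pid)
{
	int i;

	for (i = 0; i <= pid->level; i++)
		printk(KERN_DEBUG "level %d: nr %d (ns %p)\n",
		       i, pid->numbers[i].nr, pid->numbers[i].ns);
}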
@@ -307,12 +372,32 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
307/* 372/*
308 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 373 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
309 */ 374 */
310struct task_struct *find_task_by_pid_type(int type, int nr) 375struct task_struct *find_task_by_pid_type_ns(int type, int nr,
376 struct pid_namespace *ns)
311{ 377{
312 return pid_task(find_pid(nr), type); 378 return pid_task(find_pid_ns(nr, ns), type);
313} 379}
314 380
315EXPORT_SYMBOL(find_task_by_pid_type); 381EXPORT_SYMBOL(find_task_by_pid_type_ns);
382
383struct task_struct *find_task_by_pid(pid_t nr)
384{
385 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns);
386}
387EXPORT_SYMBOL(find_task_by_pid);
388
389struct task_struct *find_task_by_vpid(pid_t vnr)
390{
391 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
392 current->nsproxy->pid_ns);
393}
394EXPORT_SYMBOL(find_task_by_vpid);
395
396struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
397{
398 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
399}
400EXPORT_SYMBOL(find_task_by_pid_ns);
316 401
317struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 402struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
318{ 403{
@@ -339,45 +424,239 @@ struct pid *find_get_pid(pid_t nr)
339 struct pid *pid; 424 struct pid *pid;
340 425
341 rcu_read_lock(); 426 rcu_read_lock();
342 pid = get_pid(find_pid(nr)); 427 pid = get_pid(find_vpid(nr));
343 rcu_read_unlock(); 428 rcu_read_unlock();
344 429
345 return pid; 430 return pid;
346} 431}
347 432
433pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
434{
435 struct upid *upid;
436 pid_t nr = 0;
437
438 if (pid && ns->level <= pid->level) {
439 upid = &pid->numbers[ns->level];
440 if (upid->ns == ns)
441 nr = upid->nr;
442 }
443 return nr;
444}
445
446pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
447{
448 return pid_nr_ns(task_pid(tsk), ns);
449}
450EXPORT_SYMBOL(task_pid_nr_ns);
451
452pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
453{
454 return pid_nr_ns(task_tgid(tsk), ns);
455}
456EXPORT_SYMBOL(task_tgid_nr_ns);
457
458pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
459{
460 return pid_nr_ns(task_pgrp(tsk), ns);
461}
462EXPORT_SYMBOL(task_pgrp_nr_ns);
463
464pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
465{
466 return pid_nr_ns(task_session(tsk), ns);
467}
468EXPORT_SYMBOL(task_session_nr_ns);
469
348/* 470/*
349 * Used by proc to find the first pid that is greater than or equal to nr. 471
350 * 472 *
351 * If there is a pid at nr this function is exactly the same as find_pid. 473 * If there is a pid at nr this function is exactly the same as find_pid.
352 */ 474 */
353struct pid *find_ge_pid(int nr) 475struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
354{ 476{
355 struct pid *pid; 477 struct pid *pid;
356 478
357 do { 479 do {
358 pid = find_pid(nr); 480 pid = find_pid_ns(nr, ns);
359 if (pid) 481 if (pid)
360 break; 482 break;
361 nr = next_pidmap(current->nsproxy->pid_ns, nr); 483 nr = next_pidmap(ns, nr);
362 } while (nr > 0); 484 } while (nr > 0);
363 485
364 return pid; 486 return pid;
365} 487}
366EXPORT_SYMBOL_GPL(find_get_pid); 488EXPORT_SYMBOL_GPL(find_get_pid);
367 489
490struct pid_cache {
491 int nr_ids;
492 char name[16];
493 struct kmem_cache *cachep;
494 struct list_head list;
495};
496
497static LIST_HEAD(pid_caches_lh);
498static DEFINE_MUTEX(pid_caches_mutex);
499
500/*
501 * creates the kmem cache to allocate pids from.
502 * @nr_ids: the number of numerical ids this pid will have to carry
503 */
504
505static struct kmem_cache *create_pid_cachep(int nr_ids)
506{
507 struct pid_cache *pcache;
508 struct kmem_cache *cachep;
509
510 mutex_lock(&pid_caches_mutex);
511 list_for_each_entry (pcache, &pid_caches_lh, list)
512 if (pcache->nr_ids == nr_ids)
513 goto out;
514
515 pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
516 if (pcache == NULL)
517 goto err_alloc;
518
519 snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
520 cachep = kmem_cache_create(pcache->name,
521 sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
522 0, SLAB_HWCACHE_ALIGN, NULL);
523 if (cachep == NULL)
524 goto err_cachep;
525
526 pcache->nr_ids = nr_ids;
527 pcache->cachep = cachep;
528 list_add(&pcache->list, &pid_caches_lh);
529out:
530 mutex_unlock(&pid_caches_mutex);
531 return pcache->cachep;
532
533err_cachep:
534 kfree(pcache);
535err_alloc:
536 mutex_unlock(&pid_caches_mutex);
537 return NULL;
538}
539
540static struct pid_namespace *create_pid_namespace(int level)
541{
542 struct pid_namespace *ns;
543 int i;
544
545 ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
546 if (ns == NULL)
547 goto out;
548
549 ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
550 if (!ns->pidmap[0].page)
551 goto out_free;
552
553 ns->pid_cachep = create_pid_cachep(level + 1);
554 if (ns->pid_cachep == NULL)
555 goto out_free_map;
556
557 kref_init(&ns->kref);
558 ns->last_pid = 0;
559 ns->child_reaper = NULL;
560 ns->level = level;
561
562 set_bit(0, ns->pidmap[0].page);
563 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
564
565 for (i = 1; i < PIDMAP_ENTRIES; i++) {
566 ns->pidmap[i].page = 0;
567 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
568 }
569
570 return ns;
571
572out_free_map:
573 kfree(ns->pidmap[0].page);
574out_free:
575 kmem_cache_free(pid_ns_cachep, ns);
576out:
577 return ERR_PTR(-ENOMEM);
578}
579
580static void destroy_pid_namespace(struct pid_namespace *ns)
581{
582 int i;
583
584 for (i = 0; i < PIDMAP_ENTRIES; i++)
585 kfree(ns->pidmap[i].page);
586 kmem_cache_free(pid_ns_cachep, ns);
587}
588
368struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 589struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
369{ 590{
591 struct pid_namespace *new_ns;
592
370 BUG_ON(!old_ns); 593 BUG_ON(!old_ns);
371 get_pid_ns(old_ns); 594 new_ns = get_pid_ns(old_ns);
372 return old_ns; 595 if (!(flags & CLONE_NEWPID))
596 goto out;
597
598 new_ns = ERR_PTR(-EINVAL);
599 if (flags & CLONE_THREAD)
600 goto out_put;
601
602 new_ns = create_pid_namespace(old_ns->level + 1);
603 if (!IS_ERR(new_ns))
604 new_ns->parent = get_pid_ns(old_ns);
605
606out_put:
607 put_pid_ns(old_ns);
608out:
609 return new_ns;
373} 610}
374 611
375void free_pid_ns(struct kref *kref) 612void free_pid_ns(struct kref *kref)
376{ 613{
377 struct pid_namespace *ns; 614 struct pid_namespace *ns, *parent;
378 615
379 ns = container_of(kref, struct pid_namespace, kref); 616 ns = container_of(kref, struct pid_namespace, kref);
380 kfree(ns); 617
618 parent = ns->parent;
619 destroy_pid_namespace(ns);
620
621 if (parent != NULL)
622 put_pid_ns(parent);
623}
624
625void zap_pid_ns_processes(struct pid_namespace *pid_ns)
626{
627 int nr;
628 int rc;
629
630 /*
631 * The last thread in the cgroup-init thread group is terminating.
632 * Find remaining pids in the namespace, signal and wait for them
633 * to exit.
634 *
635 * Note: This signals each thread in the namespace - even those that
636 * belong to the same thread group. To avoid this, we would have
637 * to walk the entire tasklist looking for processes in this
638 * namespace, but that could be unnecessarily expensive if the
639 * pid namespace has just a few processes. Or we need to
640 * maintain a tasklist for each pid namespace.
641 *
642 */
643 read_lock(&tasklist_lock);
644 nr = next_pidmap(pid_ns, 1);
645 while (nr > 0) {
646 kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
647 nr = next_pidmap(pid_ns, nr);
648 }
649 read_unlock(&tasklist_lock);
650
651 do {
652 clear_thread_flag(TIF_SIGPENDING);
653 rc = sys_wait4(-1, NULL, __WALL, NULL);
654 } while (rc != -ECHILD);
655
656
657 /* Child reaper for the pid namespace is going away */
658 pid_ns->child_reaper = NULL;
659 return;
381} 660}
382 661
383/* 662/*
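A hypothetical worked example of the visibility rule that pid_nr_ns() in the hunk above implements (the task, namespaces ns_a/ns_b/ns_c and all numbers are made up):

/*
 * Task T lives at level 2, with
 *   pid->numbers[] = { {600, &init_pid_ns}, {35, ns_a}, {1, ns_b} }
 *
 *   pid_nr_ns(pid, &init_pid_ns) == 600   ancestor sees its own number
 *   pid_nr_ns(pid, ns_b)         ==   1   T is the init of its own ns
 *   pid_nr_ns(pid, ns_c)         ==   0   unrelated namespace: invisible
 */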
@@ -412,5 +691,9 @@ void __init pidmap_init(void)
412 set_bit(0, init_pid_ns.pidmap[0].page); 691 set_bit(0, init_pid_ns.pidmap[0].page);
413 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 692 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
414 693
415 pid_cachep = KMEM_CACHE(pid, SLAB_PANIC); 694 init_pid_ns.pid_cachep = create_pid_cachep(1);
695 if (init_pid_ns.pid_cachep == NULL)
696 panic("Can't create pid_1 cachep\n");
697
698 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
416} 699}
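The per-namespace pid caches created by create_pid_cachep() above grow with nesting depth: struct pid already embeds one struct upid, so a pid visible at nr_ids levels needs nr_ids - 1 extra slots, and pidmap_init() asks for a 1-id cache because the initial namespace sits at level 0. A sketch of that arithmetic (helper name is illustrative):

/* Object size used by create_pid_cachep(nr_ids) above. */
static size_t pid_object_size(int nr_ids)
{
	/* struct pid already contains numbers[1], i.e. one upid */
	return sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid);
}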
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index b53c8fcd9d82..68c96376e84a 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -21,8 +21,8 @@ static int check_clock(const clockid_t which_clock)
21 21
22 read_lock(&tasklist_lock); 22 read_lock(&tasklist_lock);
23 p = find_task_by_pid(pid); 23 p = find_task_by_pid(pid);
24 if (!p || (CPUCLOCK_PERTHREAD(which_clock) ? 24 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
25 p->tgid != current->tgid : p->tgid != pid)) { 25 same_thread_group(p, current) : thread_group_leader(p))) {
26 error = -EINVAL; 26 error = -EINVAL;
27 } 27 }
28 read_unlock(&tasklist_lock); 28 read_unlock(&tasklist_lock);
@@ -308,13 +308,13 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
308 p = find_task_by_pid(pid); 308 p = find_task_by_pid(pid);
309 if (p) { 309 if (p) {
310 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 if (CPUCLOCK_PERTHREAD(which_clock)) {
311 if (p->tgid == current->tgid) { 311 if (same_thread_group(p, current)) {
312 error = cpu_clock_sample(which_clock, 312 error = cpu_clock_sample(which_clock,
313 p, &rtn); 313 p, &rtn);
314 } 314 }
315 } else { 315 } else {
316 read_lock(&tasklist_lock); 316 read_lock(&tasklist_lock);
317 if (p->tgid == pid && p->signal) { 317 if (thread_group_leader(p) && p->signal) {
318 error = 318 error =
319 cpu_clock_sample_group(which_clock, 319 cpu_clock_sample_group(which_clock,
320 p, &rtn); 320 p, &rtn);
@@ -355,7 +355,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
355 p = current; 355 p = current;
356 } else { 356 } else {
357 p = find_task_by_pid(pid); 357 p = find_task_by_pid(pid);
358 if (p && p->tgid != current->tgid) 358 if (p && !same_thread_group(p, current))
359 p = NULL; 359 p = NULL;
360 } 360 }
361 } else { 361 } else {
@@ -363,7 +363,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
363 p = current->group_leader; 363 p = current->group_leader;
364 } else { 364 } else {
365 p = find_task_by_pid(pid); 365 p = find_task_by_pid(pid);
366 if (p && p->tgid != pid) 366 if (p && !thread_group_leader(p))
367 p = NULL; 367 p = NULL;
368 } 368 }
369 } 369 }
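The hunks above replace open-coded tgid tests with same_thread_group() and thread_group_leader(). Judging purely from the expressions they replace, the helpers amount to the following (a sketch only; the real definitions live in <linux/sched.h>):

#include <linux/sched.h>

static inline int same_thread_group_sketch(struct task_struct *p1,
					   struct task_struct *p2)
{
	return p1->tgid == p2->tgid;		/* same thread group? */
}

static inline int thread_group_leader_sketch(struct task_struct *p)
{
	return p->pid == p->tgid;		/* p leads its group? */
}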
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d11f579d189a..35b4bbfc78ff 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -404,7 +404,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
404 404
405 if ((event->sigev_notify & SIGEV_THREAD_ID ) && 405 if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
406 (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || 406 (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
407 rtn->tgid != current->tgid || 407 !same_thread_group(rtn, current) ||
408 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) 408 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
409 return NULL; 409 return NULL;
410 410
@@ -608,7 +608,7 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
608 spin_lock(&timr->it_lock); 608 spin_lock(&timr->it_lock);
609 609
610 if ((timr->it_id != timer_id) || !(timr->it_process) || 610 if ((timr->it_id != timer_id) || !(timr->it_process) ||
611 timr->it_process->tgid != current->tgid) { 611 !same_thread_group(timr->it_process, current)) {
612 spin_unlock(&timr->it_lock); 612 spin_unlock(&timr->it_lock);
613 spin_unlock_irqrestore(&idr_lock, *flags); 613 spin_unlock_irqrestore(&idr_lock, *flags);
614 timr = NULL; 614 timr = NULL;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a73ebd3b9d4c..7c76f2ffaeaa 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -19,6 +19,7 @@
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/signal.h> 20#include <linux/signal.h>
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/pid_namespace.h>
22 23
23#include <asm/pgtable.h> 24#include <asm/pgtable.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
@@ -168,7 +169,7 @@ int ptrace_attach(struct task_struct *task)
168 retval = -EPERM; 169 retval = -EPERM;
169 if (task->pid <= 1) 170 if (task->pid <= 1)
170 goto out; 171 goto out;
171 if (task->tgid == current->tgid) 172 if (same_thread_group(task, current))
172 goto out; 173 goto out;
173 174
174repeat: 175repeat:
@@ -443,7 +444,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
443 return ERR_PTR(-EPERM); 444 return ERR_PTR(-EPERM);
444 445
445 read_lock(&tasklist_lock); 446 read_lock(&tasklist_lock);
446 child = find_task_by_pid(pid); 447 child = find_task_by_vpid(pid);
447 if (child) 448 if (child)
448 get_task_struct(child); 449 get_task_struct(child);
449 450
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 6b0703db152d..56d73cb8826d 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -87,7 +87,7 @@ static int rt_trace_on = 1;
87static void printk_task(struct task_struct *p) 87static void printk_task(struct task_struct *p)
88{ 88{
89 if (p) 89 if (p)
90 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); 90 printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
91 else 91 else
92 printk("<none>"); 92 printk("<none>");
93} 93}
@@ -152,22 +152,25 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
152 printk( "[ BUG: circular locking deadlock detected! ]\n"); 152 printk( "[ BUG: circular locking deadlock detected! ]\n");
153 printk( "--------------------------------------------\n"); 153 printk( "--------------------------------------------\n");
154 printk("%s/%d is deadlocking current task %s/%d\n\n", 154 printk("%s/%d is deadlocking current task %s/%d\n\n",
155 task->comm, task->pid, current->comm, current->pid); 155 task->comm, task_pid_nr(task),
156 current->comm, task_pid_nr(current));
156 157
157 printk("\n1) %s/%d is trying to acquire this lock:\n", 158 printk("\n1) %s/%d is trying to acquire this lock:\n",
158 current->comm, current->pid); 159 current->comm, task_pid_nr(current));
159 printk_lock(waiter->lock, 1); 160 printk_lock(waiter->lock, 1);
160 161
161 printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); 162 printk("\n2) %s/%d is blocked on this lock:\n",
163 task->comm, task_pid_nr(task));
162 printk_lock(waiter->deadlock_lock, 1); 164 printk_lock(waiter->deadlock_lock, 1);
163 165
164 debug_show_held_locks(current); 166 debug_show_held_locks(current);
165 debug_show_held_locks(task); 167 debug_show_held_locks(task);
166 168
167 printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); 169 printk("\n%s/%d's [blocked] stackdump:\n\n",
170 task->comm, task_pid_nr(task));
168 show_stack(task, NULL); 171 show_stack(task, NULL);
169 printk("\n%s/%d's [current] stackdump:\n\n", 172 printk("\n%s/%d's [current] stackdump:\n\n",
170 current->comm, current->pid); 173 current->comm, task_pid_nr(current));
171 dump_stack(); 174 dump_stack();
172 debug_show_all_locks(); 175 debug_show_all_locks();
173 176
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 8cd9bd2cdb34..0deef71ff8d2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -185,7 +185,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
185 prev_max = max_lock_depth; 185 prev_max = max_lock_depth;
186 printk(KERN_WARNING "Maximum lock depth %d reached " 186 printk(KERN_WARNING "Maximum lock depth %d reached "
187 "task: %s (%d)\n", max_lock_depth, 187 "task: %s (%d)\n", max_lock_depth,
188 top_task->comm, top_task->pid); 188 top_task->comm, task_pid_nr(top_task));
189 } 189 }
190 put_task_struct(task); 190 put_task_struct(task);
191 191
diff --git a/kernel/sched.c b/kernel/sched.c
index ed90be46fb31..afe76ec2e7fe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -44,6 +44,7 @@
44#include <linux/vmalloc.h> 44#include <linux/vmalloc.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/delay.h> 46#include <linux/delay.h>
47#include <linux/pid_namespace.h>
47#include <linux/smp.h> 48#include <linux/smp.h>
48#include <linux/threads.h> 49#include <linux/threads.h>
49#include <linux/timer.h> 50#include <linux/timer.h>
@@ -51,6 +52,7 @@
51#include <linux/cpu.h> 52#include <linux/cpu.h>
52#include <linux/cpuset.h> 53#include <linux/cpuset.h>
53#include <linux/percpu.h> 54#include <linux/percpu.h>
55#include <linux/cpu_acct.h>
54#include <linux/kthread.h> 56#include <linux/kthread.h>
55#include <linux/seq_file.h> 57#include <linux/seq_file.h>
56#include <linux/sysctl.h> 58#include <linux/sysctl.h>
@@ -153,10 +155,15 @@ struct rt_prio_array {
153 155
154#ifdef CONFIG_FAIR_GROUP_SCHED 156#ifdef CONFIG_FAIR_GROUP_SCHED
155 157
158#include <linux/cgroup.h>
159
156struct cfs_rq; 160struct cfs_rq;
157 161
158/* task group related information */ 162/* task group related information */
159struct task_group { 163struct task_group {
164#ifdef CONFIG_FAIR_CGROUP_SCHED
165 struct cgroup_subsys_state css;
166#endif
160 /* schedulable entities of this group on each cpu */ 167 /* schedulable entities of this group on each cpu */
161 struct sched_entity **se; 168 struct sched_entity **se;
162 /* runqueue "owned" by this group on each cpu */ 169 /* runqueue "owned" by this group on each cpu */
@@ -197,6 +204,9 @@ static inline struct task_group *task_group(struct task_struct *p)
197 204
198#ifdef CONFIG_FAIR_USER_SCHED 205#ifdef CONFIG_FAIR_USER_SCHED
199 tg = p->user->tg; 206 tg = p->user->tg;
207#elif defined(CONFIG_FAIR_CGROUP_SCHED)
208 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
209 struct task_group, css);
200#else 210#else
201 tg = &init_task_group; 211 tg = &init_task_group;
202#endif 212#endif
@@ -1875,7 +1885,7 @@ asmlinkage void schedule_tail(struct task_struct *prev)
1875 preempt_enable(); 1885 preempt_enable();
1876#endif 1886#endif
1877 if (current->set_child_tid) 1887 if (current->set_child_tid)
1878 put_user(current->pid, current->set_child_tid); 1888 put_user(task_pid_vnr(current), current->set_child_tid);
1879} 1889}
1880 1890
1881/* 1891/*
@@ -3307,9 +3317,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
3307{ 3317{
3308 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3318 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3309 cputime64_t tmp; 3319 cputime64_t tmp;
3320 struct rq *rq = this_rq();
3310 3321
3311 p->utime = cputime_add(p->utime, cputime); 3322 p->utime = cputime_add(p->utime, cputime);
3312 3323
3324 if (p != rq->idle)
3325 cpuacct_charge(p, cputime);
3326
3313 /* Add user time to cpustat. */ 3327 /* Add user time to cpustat. */
3314 tmp = cputime_to_cputime64(cputime); 3328 tmp = cputime_to_cputime64(cputime);
3315 if (TASK_NICE(p) > 0) 3329 if (TASK_NICE(p) > 0)
@@ -3374,9 +3388,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3374 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3388 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3375 else if (softirq_count()) 3389 else if (softirq_count())
3376 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3390 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3377 else if (p != rq->idle) 3391 else if (p != rq->idle) {
3378 cpustat->system = cputime64_add(cpustat->system, tmp); 3392 cpustat->system = cputime64_add(cpustat->system, tmp);
3379 else if (atomic_read(&rq->nr_iowait) > 0) 3393 cpuacct_charge(p, cputime);
3394 } else if (atomic_read(&rq->nr_iowait) > 0)
3380 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3395 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3381 else 3396 else
3382 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3397 cpustat->idle = cputime64_add(cpustat->idle, tmp);
@@ -3412,8 +3427,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3412 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3427 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3413 else 3428 else
3414 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3429 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3415 } else 3430 } else {
3416 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3431 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3432 cpuacct_charge(p, -tmp);
3433 }
3417} 3434}
3418 3435
3419/* 3436/*
@@ -3493,7 +3510,7 @@ EXPORT_SYMBOL(sub_preempt_count);
3493static noinline void __schedule_bug(struct task_struct *prev) 3510static noinline void __schedule_bug(struct task_struct *prev)
3494{ 3511{
3495 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", 3512 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3496 prev->comm, preempt_count(), prev->pid); 3513 prev->comm, preempt_count(), task_pid_nr(prev));
3497 debug_show_held_locks(prev); 3514 debug_show_held_locks(prev);
3498 if (irqs_disabled()) 3515 if (irqs_disabled())
3499 print_irqtrace_events(prev); 3516 print_irqtrace_events(prev);
@@ -4159,7 +4176,7 @@ struct task_struct *idle_task(int cpu)
4159 */ 4176 */
4160static struct task_struct *find_process_by_pid(pid_t pid) 4177static struct task_struct *find_process_by_pid(pid_t pid)
4161{ 4178{
4162 return pid ? find_task_by_pid(pid) : current; 4179 return pid ? find_task_by_vpid(pid) : current;
4163} 4180}
4164 4181
4165/* Actually do priority change: must hold rq lock. */ 4182/* Actually do priority change: must hold rq lock. */
@@ -4462,8 +4479,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4462 4479
4463 cpus_allowed = cpuset_cpus_allowed(p); 4480 cpus_allowed = cpuset_cpus_allowed(p);
4464 cpus_and(new_mask, new_mask, cpus_allowed); 4481 cpus_and(new_mask, new_mask, cpus_allowed);
4482 again:
4465 retval = set_cpus_allowed(p, new_mask); 4483 retval = set_cpus_allowed(p, new_mask);
4466 4484
4485 if (!retval) {
4486 cpus_allowed = cpuset_cpus_allowed(p);
4487 if (!cpus_subset(new_mask, cpus_allowed)) {
4488 /*
4489 * We must have raced with a concurrent cpuset
4490 * update. Just reset the cpus_allowed to the
4491 * cpuset's cpus_allowed
4492 */
4493 new_mask = cpus_allowed;
4494 goto again;
4495 }
4496 }
4467out_unlock: 4497out_unlock:
4468 put_task_struct(p); 4498 put_task_struct(p);
4469 mutex_unlock(&sched_hotcpu_mutex); 4499 mutex_unlock(&sched_hotcpu_mutex);
@@ -4843,7 +4873,8 @@ static void show_task(struct task_struct *p)
4843 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4873 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4844 } 4874 }
4845#endif 4875#endif
4846 printk(KERN_CONT "%5lu %5d %6d\n", free, p->pid, p->parent->pid); 4876 printk(KERN_CONT "%5lu %5d %6d\n", free,
4877 task_pid_nr(p), task_pid_nr(p->parent));
4847 4878
4848 if (state != TASK_RUNNING) 4879 if (state != TASK_RUNNING)
4849 show_stack(p, NULL); 4880 show_stack(p, NULL);
@@ -5137,8 +5168,16 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5137 5168
5138 /* No more Mr. Nice Guy. */ 5169 /* No more Mr. Nice Guy. */
5139 if (dest_cpu == NR_CPUS) { 5170 if (dest_cpu == NR_CPUS) {
5171 cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p);
5172 /*
5173 * Try to stay on the same cpuset, where the
5174 * current cpuset may be a subset of all cpus.
5175 * The cpuset_cpus_allowed_locked() variant of
5176 * cpuset_cpus_allowed() will not block. It must be
5177 * called within calls to cpuset_lock/cpuset_unlock.
5178 */
5140 rq = task_rq_lock(p, &flags); 5179 rq = task_rq_lock(p, &flags);
5141 cpus_setall(p->cpus_allowed); 5180 p->cpus_allowed = cpus_allowed;
5142 dest_cpu = any_online_cpu(p->cpus_allowed); 5181 dest_cpu = any_online_cpu(p->cpus_allowed);
5143 task_rq_unlock(rq, &flags); 5182 task_rq_unlock(rq, &flags);
5144 5183
@@ -5150,7 +5189,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5150 if (p->mm && printk_ratelimit()) 5189 if (p->mm && printk_ratelimit())
5151 printk(KERN_INFO "process %d (%s) no " 5190 printk(KERN_INFO "process %d (%s) no "
5152 "longer affine to cpu%d\n", 5191 "longer affine to cpu%d\n",
5153 p->pid, p->comm, dead_cpu); 5192 task_pid_nr(p), p->comm, dead_cpu);
5154 } 5193 }
5155 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 5194 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
5156} 5195}
@@ -5257,7 +5296,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5257 struct rq *rq = cpu_rq(dead_cpu); 5296 struct rq *rq = cpu_rq(dead_cpu);
5258 5297
5259 /* Must be exiting, otherwise would be on tasklist. */ 5298 /* Must be exiting, otherwise would be on tasklist. */
5260 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); 5299 BUG_ON(!p->exit_state);
5261 5300
5262 /* Cannot have done final schedule yet: would have vanished. */ 5301 /* Cannot have done final schedule yet: would have vanished. */
5263 BUG_ON(p->state == TASK_DEAD); 5302 BUG_ON(p->state == TASK_DEAD);
@@ -5504,6 +5543,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5504 5543
5505 case CPU_DEAD: 5544 case CPU_DEAD:
5506 case CPU_DEAD_FROZEN: 5545 case CPU_DEAD_FROZEN:
5546 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5507 migrate_live_tasks(cpu); 5547 migrate_live_tasks(cpu);
5508 rq = cpu_rq(cpu); 5548 rq = cpu_rq(cpu);
5509 kthread_stop(rq->migration_thread); 5549 kthread_stop(rq->migration_thread);
@@ -5517,6 +5557,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5517 rq->idle->sched_class = &idle_sched_class; 5557 rq->idle->sched_class = &idle_sched_class;
5518 migrate_dead_tasks(cpu); 5558 migrate_dead_tasks(cpu);
5519 spin_unlock_irq(&rq->lock); 5559 spin_unlock_irq(&rq->lock);
5560 cpuset_unlock();
5520 migrate_nr_uninterruptible(rq); 5561 migrate_nr_uninterruptible(rq);
5521 BUG_ON(rq->nr_running != 0); 5562 BUG_ON(rq->nr_running != 0);
5522 5563
@@ -6367,26 +6408,31 @@ error:
6367 return -ENOMEM; 6408 return -ENOMEM;
6368#endif 6409#endif
6369} 6410}
6411
6412static cpumask_t *doms_cur; /* current sched domains */
6413static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6414
6415/*
6416 * Special case: If a kmalloc of a doms_cur partition (array of
6417 * cpumask_t) fails, then fall back to a single sched domain,
6418 * as determined by the single cpumask_t fallback_doms.
6419 */
6420static cpumask_t fallback_doms;
6421
6370/* 6422/*
6371 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6423 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6424 * For now this just excludes isolated cpus, but could be used to
6425 * exclude other special cases in the future.
6372 */ 6426 */
6373static int arch_init_sched_domains(const cpumask_t *cpu_map) 6427static int arch_init_sched_domains(const cpumask_t *cpu_map)
6374{ 6428{
6375 cpumask_t cpu_default_map; 6429 ndoms_cur = 1;
6376 int err; 6430 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6377 6431 if (!doms_cur)
6378 /* 6432 doms_cur = &fallback_doms;
6379 * Setup mask for cpus without special case scheduling requirements. 6433 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
6380 * For now this just excludes isolated cpus, but could be used to
6381 * exclude other special cases in the future.
6382 */
6383 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6384
6385 err = build_sched_domains(&cpu_default_map);
6386
6387 register_sched_domain_sysctl(); 6434 register_sched_domain_sysctl();
6388 6435 return build_sched_domains(doms_cur);
6389 return err;
6390} 6436}
6391 6437
6392static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6438static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
@@ -6410,6 +6456,68 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6410 arch_destroy_sched_domains(cpu_map); 6456 arch_destroy_sched_domains(cpu_map);
6411} 6457}
6412 6458
6459/*
6460 * Partition sched domains as specified by the 'ndoms_new'
6461 * cpumasks in the array doms_new[] of cpumasks. This compares
6462 * doms_new[] to the current sched domain partitioning, doms_cur[].
6463 * It destroys each deleted domain and builds each new domain.
6464 *
6465 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
6466 * The masks don't intersect (don't overlap). We should set up one
6467 * sched domain for each mask. CPUs not in any of the cpumasks will
6468 * not be load balanced. If the same cpumask appears both in the
6469 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6470 * it as it is.
6471 *
6472 * The passed in 'doms_new' should be kmalloc'd. This routine takes
6473 * ownership of it and will kfree it when done with it. If the caller
6474 * failed the kmalloc call, then it can pass in doms_new == NULL,
6475 * and partition_sched_domains() will fall back to the single partition
6476 * 'fallback_doms'.
6477 *
6478 * Call with hotplug lock held
6479 */
6480void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6481{
6482 int i, j;
6483
6484 if (doms_new == NULL) {
6485 ndoms_new = 1;
6486 doms_new = &fallback_doms;
6487 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
6488 }
6489
6490 /* Destroy deleted domains */
6491 for (i = 0; i < ndoms_cur; i++) {
6492 for (j = 0; j < ndoms_new; j++) {
6493 if (cpus_equal(doms_cur[i], doms_new[j]))
6494 goto match1;
6495 }
6496 /* no match - a current sched domain not in new doms_new[] */
6497 detach_destroy_domains(doms_cur + i);
6498match1:
6499 ;
6500 }
6501
6502 /* Build new domains */
6503 for (i = 0; i < ndoms_new; i++) {
6504 for (j = 0; j < ndoms_cur; j++) {
6505 if (cpus_equal(doms_new[i], doms_cur[j]))
6506 goto match2;
6507 }
6508 /* no match - add a new doms_new */
6509 build_sched_domains(doms_new + i);
6510match2:
6511 ;
6512 }
6513
6514 /* Remember the new sched domains */
6515 if (doms_cur != &fallback_doms)
6516 kfree(doms_cur);
6517 doms_cur = doms_new;
6518 ndoms_cur = ndoms_new;
6519}
6520
6413#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6521#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6414static int arch_reinit_sched_domains(void) 6522static int arch_reinit_sched_domains(void)
6415{ 6523{
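A hypothetical caller of partition_sched_domains() above, following the ownership rules spelled out in its comment (rebuild_two_partitions is a made-up name, not part of the patch):

#include <linux/cpumask.h>
#include <linux/slab.h>

/* Caller must hold the hotplug lock, as the comment above requires. */
static void rebuild_two_partitions(cpumask_t a, cpumask_t b)
{
	cpumask_t *doms = kmalloc(2 * sizeof(cpumask_t), GFP_KERNEL);

	if (!doms) {
		/* NULL: fall back to the single fallback_doms partition */
		partition_sched_domains(1, NULL);
		return;
	}
	doms[0] = a;
	doms[1] = b;
	partition_sched_domains(2, doms);	/* takes ownership of doms */
}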
@@ -6991,3 +7099,116 @@ unsigned long sched_group_shares(struct task_group *tg)
6991} 7099}
6992 7100
6993#endif /* CONFIG_FAIR_GROUP_SCHED */ 7101#endif /* CONFIG_FAIR_GROUP_SCHED */
7102
7103#ifdef CONFIG_FAIR_CGROUP_SCHED
7104
7105/* return corresponding task_group object of a cgroup */
7106static inline struct task_group *cgroup_tg(struct cgroup *cont)
7107{
7108 return container_of(cgroup_subsys_state(cont, cpu_cgroup_subsys_id),
7109 struct task_group, css);
7110}
7111
7112static struct cgroup_subsys_state *
7113cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
7114{
7115 struct task_group *tg;
7116
7117 if (!cont->parent) {
7118 /* This is early initialization for the top cgroup */
7119 init_task_group.css.cgroup = cont;
7120 return &init_task_group.css;
7121 }
7122
7123 /* we support only 1-level deep hierarchical scheduler atm */
7124 if (cont->parent->parent)
7125 return ERR_PTR(-EINVAL);
7126
7127 tg = sched_create_group();
7128 if (IS_ERR(tg))
7129 return ERR_PTR(-ENOMEM);
7130
7131 /* Bind the cgroup to task_group object we just created */
7132 tg->css.cgroup = cont;
7133
7134 return &tg->css;
7135}
7136
7137static void cpu_cgroup_destroy(struct cgroup_subsys *ss,
7138 struct cgroup *cont)
7139{
7140 struct task_group *tg = cgroup_tg(cont);
7141
7142 sched_destroy_group(tg);
7143}
7144
7145static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
7146 struct cgroup *cont, struct task_struct *tsk)
7147{
7148 /* We don't support RT-tasks being in separate groups */
7149 if (tsk->sched_class != &fair_sched_class)
7150 return -EINVAL;
7151
7152 return 0;
7153}
7154
7155static void
7156cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cont,
7157 struct cgroup *old_cont, struct task_struct *tsk)
7158{
7159 sched_move_task(tsk);
7160}
7161
7162static ssize_t cpu_shares_write(struct cgroup *cont, struct cftype *cftype,
7163 struct file *file, const char __user *userbuf,
7164 size_t nbytes, loff_t *ppos)
7165{
7166 unsigned long shareval;
7167 struct task_group *tg = cgroup_tg(cont);
7168 char buffer[2*sizeof(unsigned long) + 1];
7169 int rc;
7170
7171 if (nbytes > 2*sizeof(unsigned long)) /* safety check */
7172 return -E2BIG;
7173
7174 if (copy_from_user(buffer, userbuf, nbytes))
7175 return -EFAULT;
7176
7177 buffer[nbytes] = 0; /* nul-terminate */
7178 shareval = simple_strtoul(buffer, NULL, 10);
7179
7180 rc = sched_group_set_shares(tg, shareval);
7181
7182 return (rc < 0 ? rc : nbytes);
7183}
7184
7185static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft)
7186{
7187 struct task_group *tg = cgroup_tg(cont);
7188
7189 return (u64) tg->shares;
7190}
7191
7192static struct cftype cpu_shares = {
7193 .name = "shares",
7194 .read_uint = cpu_shares_read_uint,
7195 .write = cpu_shares_write,
7196};
7197
7198static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7199{
7200 return cgroup_add_file(cont, ss, &cpu_shares);
7201}
7202
7203struct cgroup_subsys cpu_cgroup_subsys = {
7204 .name = "cpu",
7205 .create = cpu_cgroup_create,
7206 .destroy = cpu_cgroup_destroy,
7207 .can_attach = cpu_cgroup_can_attach,
7208 .attach = cpu_cgroup_attach,
7209 .populate = cpu_cgroup_populate,
7210 .subsys_id = cpu_cgroup_subsys_id,
7211 .early_init = 1,
7212};
7213
7214#endif /* CONFIG_FAIR_CGROUP_SCHED */
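Once the "cpu" cgroup subsystem above is mounted, its shares knob is an ordinary control file. A userspace sketch, assuming the cgroup filesystem is mounted at /dev/cgroup and the file appears with the usual subsystem prefix as cpu.shares (both are assumptions about the local setup, not something this patch dictates):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int set_group_shares(const char *group, const char *shares)
{
	char path[256];
	ssize_t n;
	int fd;

	snprintf(path, sizeof(path), "/dev/cgroup/%s/cpu.shares", group);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	n = write(fd, shares, strlen(shares));
	close(fd);
	return n < 0 ? -1 : 0;
}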
diff --git a/kernel/signal.c b/kernel/signal.c
index e4f059cd9867..12006308c7eb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -256,7 +256,7 @@ flush_signal_handlers(struct task_struct *t, int force_default)
256 256
257int unhandled_signal(struct task_struct *tsk, int sig) 257int unhandled_signal(struct task_struct *tsk, int sig)
258{ 258{
259 if (is_init(tsk)) 259 if (is_global_init(tsk))
260 return 1; 260 return 1;
261 if (tsk->ptrace & PT_PTRACED) 261 if (tsk->ptrace & PT_PTRACED)
262 return 0; 262 return 0;
@@ -536,7 +536,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
536 return error; 536 return error;
537 error = -EPERM; 537 error = -EPERM;
538 if (((sig != SIGCONT) || 538 if (((sig != SIGCONT) ||
539 (process_session(current) != process_session(t))) 539 (task_session_nr(current) != task_session_nr(t)))
540 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 540 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
541 && (current->uid ^ t->suid) && (current->uid ^ t->uid) 541 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
542 && !capable(CAP_KILL)) 542 && !capable(CAP_KILL))
@@ -694,7 +694,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
694 q->info.si_signo = sig; 694 q->info.si_signo = sig;
695 q->info.si_errno = 0; 695 q->info.si_errno = 0;
696 q->info.si_code = SI_USER; 696 q->info.si_code = SI_USER;
697 q->info.si_pid = current->pid; 697 q->info.si_pid = task_pid_vnr(current);
698 q->info.si_uid = current->uid; 698 q->info.si_uid = current->uid;
699 break; 699 break;
700 case (unsigned long) SEND_SIG_PRIV: 700 case (unsigned long) SEND_SIG_PRIV:
@@ -730,7 +730,7 @@ int print_fatal_signals;
730static void print_fatal_signal(struct pt_regs *regs, int signr) 730static void print_fatal_signal(struct pt_regs *regs, int signr)
731{ 731{
732 printk("%s/%d: potentially unexpected fatal signal %d.\n", 732 printk("%s/%d: potentially unexpected fatal signal %d.\n",
733 current->comm, current->pid, signr); 733 current->comm, task_pid_nr(current), signr);
734 734
735#ifdef __i386__ 735#ifdef __i386__
736 printk("code at %08lx: ", regs->eip); 736 printk("code at %08lx: ", regs->eip);
@@ -1089,7 +1089,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1089{ 1089{
1090 int error; 1090 int error;
1091 rcu_read_lock(); 1091 rcu_read_lock();
1092 error = kill_pid_info(sig, info, find_pid(pid)); 1092 error = kill_pid_info(sig, info, find_vpid(pid));
1093 rcu_read_unlock(); 1093 rcu_read_unlock();
1094 return error; 1094 return error;
1095} 1095}
@@ -1150,7 +1150,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1150 1150
1151 read_lock(&tasklist_lock); 1151 read_lock(&tasklist_lock);
1152 for_each_process(p) { 1152 for_each_process(p) {
1153 if (p->pid > 1 && p->tgid != current->tgid) { 1153 if (p->pid > 1 && !same_thread_group(p, current)) {
1154 int err = group_send_sig_info(sig, info, p); 1154 int err = group_send_sig_info(sig, info, p);
1155 ++count; 1155 ++count;
1156 if (err != -EPERM) 1156 if (err != -EPERM)
@@ -1160,9 +1160,9 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1160 read_unlock(&tasklist_lock); 1160 read_unlock(&tasklist_lock);
1161 ret = count ? retval : -ESRCH; 1161 ret = count ? retval : -ESRCH;
1162 } else if (pid < 0) { 1162 } else if (pid < 0) {
1163 ret = kill_pgrp_info(sig, info, find_pid(-pid)); 1163 ret = kill_pgrp_info(sig, info, find_vpid(-pid));
1164 } else { 1164 } else {
1165 ret = kill_pid_info(sig, info, find_pid(pid)); 1165 ret = kill_pid_info(sig, info, find_vpid(pid));
1166 } 1166 }
1167 rcu_read_unlock(); 1167 rcu_read_unlock();
1168 return ret; 1168 return ret;
@@ -1266,7 +1266,12 @@ EXPORT_SYMBOL(kill_pid);
1266int 1266int
1267kill_proc(pid_t pid, int sig, int priv) 1267kill_proc(pid_t pid, int sig, int priv)
1268{ 1268{
1269 return kill_proc_info(sig, __si_special(priv), pid); 1269 int ret;
1270
1271 rcu_read_lock();
1272 ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
1273 rcu_read_unlock();
1274 return ret;
1270} 1275}
1271 1276
1272/* 1277/*
@@ -1443,7 +1448,22 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1443 1448
1444 info.si_signo = sig; 1449 info.si_signo = sig;
1445 info.si_errno = 0; 1450 info.si_errno = 0;
1446 info.si_pid = tsk->pid; 1451 /*
1452 * we are under tasklist_lock here so our parent is tied to
1453 * us and cannot exit and release its namespace.
1454 *
1455 * the only thing it can do is switch its nsproxy with sys_unshare,
1456 * but unsharing pid namespaces is not allowed, so we'll always
1457 * see the relevant namespace.
1458 *
1459 * write_lock() currently calls preempt_disable() which is the
1460 * same as rcu_read_lock(), but according to Oleg it is not
1461 * correct to rely on that.
1462 */
1463 rcu_read_lock();
1464 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1465 rcu_read_unlock();
1466
1447 info.si_uid = tsk->uid; 1467 info.si_uid = tsk->uid;
1448 1468
1449 /* FIXME: find out whether or not this is supposed to be c*time. */ 1469 /* FIXME: find out whether or not this is supposed to be c*time. */
@@ -1508,7 +1528,13 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1508 1528
1509 info.si_signo = SIGCHLD; 1529 info.si_signo = SIGCHLD;
1510 info.si_errno = 0; 1530 info.si_errno = 0;
1511 info.si_pid = tsk->pid; 1531 /*
1532 * see comment in do_notify_parent() about the following 3 lines
1533 */
1534 rcu_read_lock();
1535 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1536 rcu_read_unlock();
1537
1512 info.si_uid = tsk->uid; 1538 info.si_uid = tsk->uid;
1513 1539
1514 /* FIXME: find out whether or not this is supposed to be c*time. */ 1540 /* FIXME: find out whether or not this is supposed to be c*time. */
@@ -1634,7 +1660,7 @@ void ptrace_notify(int exit_code)
1634 memset(&info, 0, sizeof info); 1660 memset(&info, 0, sizeof info);
1635 info.si_signo = SIGTRAP; 1661 info.si_signo = SIGTRAP;
1636 info.si_code = exit_code; 1662 info.si_code = exit_code;
1637 info.si_pid = current->pid; 1663 info.si_pid = task_pid_vnr(current);
1638 info.si_uid = current->uid; 1664 info.si_uid = current->uid;
1639 1665
1640 /* Let the debugger run. */ 1666 /* Let the debugger run. */
@@ -1804,7 +1830,7 @@ relock:
1804 info->si_signo = signr; 1830 info->si_signo = signr;
1805 info->si_errno = 0; 1831 info->si_errno = 0;
1806 info->si_code = SI_USER; 1832 info->si_code = SI_USER;
1807 info->si_pid = current->parent->pid; 1833 info->si_pid = task_pid_vnr(current->parent);
1808 info->si_uid = current->parent->uid; 1834 info->si_uid = current->parent->uid;
1809 } 1835 }
1810 1836
@@ -1835,11 +1861,9 @@ relock:
1835 continue; 1861 continue;
1836 1862
1837 /* 1863 /*
1838 * Init of a pid space gets no signals it doesn't want from 1864 * Global init gets no signals it doesn't want.
1839 * within that pid space. It can of course get signals from
1840 * its parent pid space.
1841 */ 1865 */
1842 if (current == child_reaper(current)) 1866 if (is_global_init(current))
1843 continue; 1867 continue;
1844 1868
1845 if (sig_kernel_stop(signr)) { 1869 if (sig_kernel_stop(signr)) {
@@ -2193,7 +2217,7 @@ sys_kill(int pid, int sig)
2193 info.si_signo = sig; 2217 info.si_signo = sig;
2194 info.si_errno = 0; 2218 info.si_errno = 0;
2195 info.si_code = SI_USER; 2219 info.si_code = SI_USER;
2196 info.si_pid = current->tgid; 2220 info.si_pid = task_tgid_vnr(current);
2197 info.si_uid = current->uid; 2221 info.si_uid = current->uid;
2198 2222
2199 return kill_something_info(sig, &info, pid); 2223 return kill_something_info(sig, &info, pid);
@@ -2209,12 +2233,12 @@ static int do_tkill(int tgid, int pid, int sig)
2209 info.si_signo = sig; 2233 info.si_signo = sig;
2210 info.si_errno = 0; 2234 info.si_errno = 0;
2211 info.si_code = SI_TKILL; 2235 info.si_code = SI_TKILL;
2212 info.si_pid = current->tgid; 2236 info.si_pid = task_tgid_vnr(current);
2213 info.si_uid = current->uid; 2237 info.si_uid = current->uid;
2214 2238
2215 read_lock(&tasklist_lock); 2239 read_lock(&tasklist_lock);
2216 p = find_task_by_pid(pid); 2240 p = find_task_by_vpid(pid);
2217 if (p && (tgid <= 0 || p->tgid == tgid)) { 2241 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2218 error = check_kill_permission(sig, &info, p); 2242 error = check_kill_permission(sig, &info, p);
2219 /* 2243 /*
2220 * The null signal is a permissions and process existence 2244 * The null signal is a permissions and process existence
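The signal.c conversions above lean on the task_pid_nr()/task_pid_vnr() family. Semantically they are thin wrappers over pid_nr_ns() from the kernel/pid.c hunk earlier in this patch; a sketch of that equivalence (the _sketch suffix marks these as illustrations, not the real declarations):

#include <linux/nsproxy.h>
#include <linux/pid.h>
#include <linux/pid_namespace.h>
#include <linux/sched.h>

static inline pid_t task_pid_nr_sketch(struct task_struct *tsk)
{
	/* global id: the number the initial namespace sees */
	return pid_nr_ns(task_pid(tsk), &init_pid_ns);
}

static inline pid_t task_pid_vnr_sketch(struct task_struct *tsk)
{
	/* virtual id: the number the caller's own namespace sees */
	return pid_nr_ns(task_pid(tsk), current->nsproxy->pid_ns);
}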
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index edeeef3a6a32..11df812263c8 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -113,7 +113,7 @@ void softlockup_tick(void)
113 spin_lock(&print_lock); 113 spin_lock(&print_lock);
114 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", 114 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
115 this_cpu, now - touch_timestamp, 115 this_cpu, now - touch_timestamp,
116 current->comm, current->pid); 116 current->comm, task_pid_nr(current));
117 if (regs) 117 if (regs)
118 show_regs(regs); 118 show_regs(regs);
119 else 119 else
diff --git a/kernel/sys.c b/kernel/sys.c
index bc8879c822a5..304b5410d746 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -106,537 +106,6 @@ EXPORT_SYMBOL(cad_pid);
106 106
107void (*pm_power_off_prepare)(void); 107void (*pm_power_off_prepare)(void);
108 108
109/*
110 * Notifier list for kernel code which wants to be called
111 * at shutdown. This is used to stop any idling DMA operations
112 * and the like.
113 */
114
115static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
116
117/*
118 * Notifier chain core routines. The exported routines below
119 * are layered on top of these, with appropriate locking added.
120 */
121
122static int notifier_chain_register(struct notifier_block **nl,
123 struct notifier_block *n)
124{
125 while ((*nl) != NULL) {
126 if (n->priority > (*nl)->priority)
127 break;
128 nl = &((*nl)->next);
129 }
130 n->next = *nl;
131 rcu_assign_pointer(*nl, n);
132 return 0;
133}
134
135static int notifier_chain_unregister(struct notifier_block **nl,
136 struct notifier_block *n)
137{
138 while ((*nl) != NULL) {
139 if ((*nl) == n) {
140 rcu_assign_pointer(*nl, n->next);
141 return 0;
142 }
143 nl = &((*nl)->next);
144 }
145 return -ENOENT;
146}
147
148/**
149 * notifier_call_chain - Informs the registered notifiers about an event.
150 * @nl: Pointer to head of the blocking notifier chain
151 * @val: Value passed unmodified to notifier function
152 * @v: Pointer passed unmodified to notifier function
153 * @nr_to_call: Number of notifier functions to be called. Pass -1
154 * to call all of them.
155 * @nr_calls: Records the number of notifications sent. Pass NULL
156 * if the count is not needed.
157 * @returns: notifier_call_chain returns the value returned by the
158 * last notifier function called.
159 */
160
161static int __kprobes notifier_call_chain(struct notifier_block **nl,
162 unsigned long val, void *v,
163 int nr_to_call, int *nr_calls)
164{
165 int ret = NOTIFY_DONE;
166 struct notifier_block *nb, *next_nb;
167
168 nb = rcu_dereference(*nl);
169
170 while (nb && nr_to_call) {
171 next_nb = rcu_dereference(nb->next);
172 ret = nb->notifier_call(nb, val, v);
173
174 if (nr_calls)
175 (*nr_calls)++;
176
177 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
178 break;
179 nb = next_nb;
180 nr_to_call--;
181 }
182 return ret;
183}
184
185/*
186 * Atomic notifier chain routines. Registration and unregistration
187 * use a spinlock, and call_chain is synchronized by RCU (no locks).
188 */
189
190/**
191 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
192 * @nh: Pointer to head of the atomic notifier chain
193 * @n: New entry in notifier chain
194 *
195 * Adds a notifier to an atomic notifier chain.
196 *
197 * Currently always returns zero.
198 */
199
200int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
201 struct notifier_block *n)
202{
203 unsigned long flags;
204 int ret;
205
206 spin_lock_irqsave(&nh->lock, flags);
207 ret = notifier_chain_register(&nh->head, n);
208 spin_unlock_irqrestore(&nh->lock, flags);
209 return ret;
210}
211
212EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
213
214/**
215 * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
216 * @nh: Pointer to head of the atomic notifier chain
217 * @n: Entry to remove from notifier chain
218 *
219 * Removes a notifier from an atomic notifier chain.
220 *
221 * Returns zero on success or %-ENOENT on failure.
222 */
223int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
224 struct notifier_block *n)
225{
226 unsigned long flags;
227 int ret;
228
229 spin_lock_irqsave(&nh->lock, flags);
230 ret = notifier_chain_unregister(&nh->head, n);
231 spin_unlock_irqrestore(&nh->lock, flags);
232 synchronize_rcu();
233 return ret;
234}
235
236EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
237
238/**
239 * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
240 * @nh: Pointer to head of the atomic notifier chain
241 * @val: Value passed unmodified to notifier function
242 * @v: Pointer passed unmodified to notifier function
243 * @nr_to_call: See the comment for notifier_call_chain.
244 * @nr_calls: See the comment for notifier_call_chain.
245 *
246 * Calls each function in a notifier chain in turn. The functions
247 * run in an atomic context, so they must not block.
248 * This routine uses RCU to synchronize with changes to the chain.
249 *
250 * If the return value of the notifier can be and'ed
251 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
252 * will return immediately, with the return value of
253 * the notifier function which halted execution.
254 * Otherwise the return value is the return value
255 * of the last notifier function called.
256 */
257
258int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
259 unsigned long val, void *v,
260 int nr_to_call, int *nr_calls)
261{
262 int ret;
263
264 rcu_read_lock();
265 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
266 rcu_read_unlock();
267 return ret;
268}
269
270EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
271
272int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
273 unsigned long val, void *v)
274{
275 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
276}
277
278EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
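For reference, a minimal usage sketch of the atomic notifier API documented in the block above (the chain and callback names are invented):

#include <linux/notifier.h>

static ATOMIC_NOTIFIER_HEAD(demo_chain);

static int demo_event(struct notifier_block *nb, unsigned long action,
		      void *data)
{
	/* runs in atomic context: must not sleep */
	return NOTIFY_OK;
}

static struct notifier_block demo_nb = {
	.notifier_call = demo_event,
};

static void demo_fire(void)
{
	atomic_notifier_chain_register(&demo_chain, &demo_nb);
	atomic_notifier_call_chain(&demo_chain, 0, NULL);
	atomic_notifier_chain_unregister(&demo_chain, &demo_nb);
}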
279/*
280 * Blocking notifier chain routines. All access to the chain is
281 * synchronized by an rwsem.
282 */
283
284/**
285 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
286 * @nh: Pointer to head of the blocking notifier chain
287 * @n: New entry in notifier chain
288 *
289 * Adds a notifier to a blocking notifier chain.
290 * Must be called in process context.
291 *
292 * Currently always returns zero.
293 */
294
295int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
296 struct notifier_block *n)
297{
298 int ret;
299
300 /*
301 * This code gets used during boot-up, when task switching is
302 * not yet working and interrupts must remain disabled. At
303 * such times we must not call down_write().
304 */
305 if (unlikely(system_state == SYSTEM_BOOTING))
306 return notifier_chain_register(&nh->head, n);
307
308 down_write(&nh->rwsem);
309 ret = notifier_chain_register(&nh->head, n);
310 up_write(&nh->rwsem);
311 return ret;
312}
313
314EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
315
316/**
317 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
318 * @nh: Pointer to head of the blocking notifier chain
319 * @n: Entry to remove from notifier chain
320 *
321 * Removes a notifier from a blocking notifier chain.
322 * Must be called from process context.
323 *
324 * Returns zero on success or %-ENOENT on failure.
325 */
326int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
327 struct notifier_block *n)
328{
329 int ret;
330
331 /*
332 * This code gets used during boot-up, when task switching is
333 * not yet working and interrupts must remain disabled. At
334 * such times we must not call down_write().
335 */
336 if (unlikely(system_state == SYSTEM_BOOTING))
337 return notifier_chain_unregister(&nh->head, n);
338
339 down_write(&nh->rwsem);
340 ret = notifier_chain_unregister(&nh->head, n);
341 up_write(&nh->rwsem);
342 return ret;
343}
344
345EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
346
347/**
348 * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
349 * @nh: Pointer to head of the blocking notifier chain
350 * @val: Value passed unmodified to notifier function
351 * @v: Pointer passed unmodified to notifier function
352 * @nr_to_call: See comment for notifier_call_chain.
353 * @nr_calls: See comment for notifier_call_chain.
354 *
355 * Calls each function in a notifier chain in turn. The functions
356 * run in a process context, so they are allowed to block.
357 *
358 * If the return value of the notifier can be and'ed
359 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
360 * will return immediately, with the return value of
361 * the notifier function which halted execution.
362 * Otherwise the return value is the return value
363 * of the last notifier function called.
364 */
365
366int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
367 unsigned long val, void *v,
368 int nr_to_call, int *nr_calls)
369{
370 int ret = NOTIFY_DONE;
371
372 /*
373 * We check the head outside the lock, but if this access is
374 * racy then it does not matter what the result of the test
375 * is, we re-check the list after having taken the lock anyway:
376 */
377 if (rcu_dereference(nh->head)) {
378 down_read(&nh->rwsem);
379 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
380 nr_calls);
381 up_read(&nh->rwsem);
382 }
383 return ret;
384}
385EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
386
387int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
388 unsigned long val, void *v)
389{
390 return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
391}
392EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
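
The __ variant exposes nr_to_call/nr_calls so a caller can stop after a given number of callbacks and learn how many actually ran, which is useful for rolling back a partially notified chain. A hedged sketch, reusing the illustrative example_chain above and hypothetical event codes:

	#define EXAMPLE_PREPARE	0x0001	/* hypothetical event codes */
	#define EXAMPLE_CANCEL	0x0002

	static int example_notify_prepare(void *data)
	{
		int nr_calls = 0;
		int ret;

		ret = __blocking_notifier_call_chain(&example_chain,
						     EXAMPLE_PREPARE, data,
						     -1, &nr_calls);
		if (ret & NOTIFY_STOP_MASK)
			/* notify only the nr_calls - 1 callbacks that
			 * succeeded; the one that refused is excluded */
			__blocking_notifier_call_chain(&example_chain,
						       EXAMPLE_CANCEL, data,
						       nr_calls - 1, NULL);
		return ret;
	}
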
393
394/*
395 * Raw notifier chain routines. There is no protection;
396 * the caller must provide it. Use at your own risk!
397 */
398
399/**
400 * raw_notifier_chain_register - Add notifier to a raw notifier chain
401 * @nh: Pointer to head of the raw notifier chain
402 * @n: New entry in notifier chain
403 *
404 * Adds a notifier to a raw notifier chain.
405 * All locking must be provided by the caller.
406 *
407 * Currently always returns zero.
408 */
409
410int raw_notifier_chain_register(struct raw_notifier_head *nh,
411 struct notifier_block *n)
412{
413 return notifier_chain_register(&nh->head, n);
414}
415
416EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
417
418/**
419 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
420 * @nh: Pointer to head of the raw notifier chain
421 * @n: Entry to remove from notifier chain
422 *
423 * Removes a notifier from a raw notifier chain.
424 * All locking must be provided by the caller.
425 *
426 * Returns zero on success or %-ENOENT on failure.
427 */
428int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
429 struct notifier_block *n)
430{
431 return notifier_chain_unregister(&nh->head, n);
432}
433
434EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
435
436/**
437 * __raw_notifier_call_chain - Call functions in a raw notifier chain
438 * @nh: Pointer to head of the raw notifier chain
439 * @val: Value passed unmodified to notifier function
440 * @v: Pointer passed unmodified to notifier function
441 * @nr_to_call: See comment for notifier_call_chain.
442 * @nr_calls: See comment for notifier_call_chain.
443 *
444 * Calls each function in a notifier chain in turn. The functions
445 * run in an undefined context.
446 * All locking must be provided by the caller.
447 *
448 * If a notifier's return value has the %NOTIFY_STOP_MASK bit
449 * set, then raw_notifier_call_chain() will return immediately,
450 * with the return value of the notifier function which halted
451 * execution.
452 * Otherwise the return value is the return value of the last
453 * notifier function called.
454 */
455
456int __raw_notifier_call_chain(struct raw_notifier_head *nh,
457 unsigned long val, void *v,
458 int nr_to_call, int *nr_calls)
459{
460 return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
461}
462
463EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
464
465int raw_notifier_call_chain(struct raw_notifier_head *nh,
466 unsigned long val, void *v)
467{
468 return __raw_notifier_call_chain(nh, val, v, -1, NULL);
469}
470
471EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
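
Since raw chains do no locking of their own, a typical client wraps both registration and traversal in a lock it already owns; the lock, chain and helpers below are illustrative only:

	#include <linux/notifier.h>
	#include <linux/spinlock.h>

	static RAW_NOTIFIER_HEAD(example_raw_chain);
	static DEFINE_SPINLOCK(example_raw_lock);	/* caller-provided locking */

	static int example_raw_register(struct notifier_block *nb)
	{
		int ret;

		spin_lock(&example_raw_lock);
		ret = raw_notifier_chain_register(&example_raw_chain, nb);
		spin_unlock(&example_raw_lock);
		return ret;
	}

	static int example_raw_notify(unsigned long event, void *data)
	{
		int ret;

		/* the same lock serializes traversal against unregister,
		 * so the callbacks must not sleep here */
		spin_lock(&example_raw_lock);
		ret = raw_notifier_call_chain(&example_raw_chain, event, data);
		spin_unlock(&example_raw_lock);
		return ret;
	}
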
472
473/*
474 * SRCU notifier chain routines. Registration and unregistration
475 * use a mutex, and call_chain is synchronized by SRCU (no locks).
476 */
477
478/**
479 * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
480 * @nh: Pointer to head of the SRCU notifier chain
481 * @n: New entry in notifier chain
482 *
483 * Adds a notifier to an SRCU notifier chain.
484 * Must be called in process context.
485 *
486 * Currently always returns zero.
487 */
488
489int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
490 struct notifier_block *n)
491{
492 int ret;
493
494 /*
495 * This code gets used during boot-up, when task switching is
496 * not yet working and interrupts must remain disabled. At
497 * such times we must not call mutex_lock().
498 */
499 if (unlikely(system_state == SYSTEM_BOOTING))
500 return notifier_chain_register(&nh->head, n);
501
502 mutex_lock(&nh->mutex);
503 ret = notifier_chain_register(&nh->head, n);
504 mutex_unlock(&nh->mutex);
505 return ret;
506}
507
508EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
509
510/**
511 * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
512 * @nh: Pointer to head of the SRCU notifier chain
513 * @n: Entry to remove from notifier chain
514 *
515 * Removes a notifier from an SRCU notifier chain.
516 * Must be called from process context.
517 *
518 * Returns zero on success or %-ENOENT on failure.
519 */
520int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
521 struct notifier_block *n)
522{
523 int ret;
524
525 /*
526 * This code gets used during boot-up, when task switching is
527 * not yet working and interrupts must remain disabled. At
528 * such times we must not call mutex_lock().
529 */
530 if (unlikely(system_state == SYSTEM_BOOTING))
531 return notifier_chain_unregister(&nh->head, n);
532
533 mutex_lock(&nh->mutex);
534 ret = notifier_chain_unregister(&nh->head, n);
535 mutex_unlock(&nh->mutex);
536 synchronize_srcu(&nh->srcu);
537 return ret;
538}
539
540EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
541
542/**
543 * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
544 * @nh: Pointer to head of the SRCU notifier chain
545 * @val: Value passed unmodified to notifier function
546 * @v: Pointer passed unmodified to notifier function
547 * @nr_to_call: See comment for notifier_call_chain.
548 * @nr_calls: See comment for notifier_call_chain.
549 *
550 * Calls each function in a notifier chain in turn. The functions
551 * run in process context, so they are allowed to block.
552 *
553 * If a notifier's return value has the %NOTIFY_STOP_MASK bit
554 * set, then srcu_notifier_call_chain() will return immediately,
555 * with the return value of the notifier function which halted
556 * execution.
557 * Otherwise the return value is the return value of the last
558 * notifier function called.
559 */
560
561int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
562 unsigned long val, void *v,
563 int nr_to_call, int *nr_calls)
564{
565 int ret;
566 int idx;
567
568 idx = srcu_read_lock(&nh->srcu);
569 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
570 srcu_read_unlock(&nh->srcu, idx);
571 return ret;
572}
573EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
574
575int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
576 unsigned long val, void *v)
577{
578 return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
579}
580EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
581
582/**
583 * srcu_init_notifier_head - Initialize an SRCU notifier head
584 * @nh: Pointer to head of the srcu notifier chain
585 *
586 * Unlike other sorts of notifier heads, SRCU notifier heads require
587 * dynamic initialization. Be sure to call this routine before
588 * calling any of the other SRCU notifier routines for this head.
589 *
590 * If an SRCU notifier head is deallocated, it must first be cleaned
591 * up by calling srcu_cleanup_notifier_head(). Otherwise the head's
592 * per-cpu data (used by the SRCU mechanism) will leak.
593 */
594
595void srcu_init_notifier_head(struct srcu_notifier_head *nh)
596{
597 mutex_init(&nh->mutex);
598 if (init_srcu_struct(&nh->srcu) < 0)
599 BUG();
600 nh->head = NULL;
601}
602
603EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
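
A sketch of the lifecycle described above, with illustrative names (example_nb is the notifier_block from the earlier sketch): initialize the head at run time before first use, and clean it up before it is freed.

	#include <linux/init.h>
	#include <linux/notifier.h>

	static struct srcu_notifier_head example_srcu_chain;

	static int __init example_srcu_setup(void)
	{
		/* dynamic init is mandatory before any other use */
		srcu_init_notifier_head(&example_srcu_chain);
		return srcu_notifier_chain_register(&example_srcu_chain,
						    &example_nb);
	}

	static void __exit example_srcu_teardown(void)
	{
		srcu_notifier_chain_unregister(&example_srcu_chain,
					       &example_nb);
		/* release the per-cpu SRCU state before the head goes away */
		srcu_cleanup_notifier_head(&example_srcu_chain);
	}
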
604
605/**
606 * register_reboot_notifier - Register function to be called at reboot time
607 * @nb: Info about notifier function to be called
608 *
609 * Registers a function with the list of functions
610 * to be called at reboot time.
611 *
612 * Currently always returns zero, as blocking_notifier_chain_register()
613 * always returns zero.
614 */
615
616int register_reboot_notifier(struct notifier_block * nb)
617{
618 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
619}
620
621EXPORT_SYMBOL(register_reboot_notifier);
622
623/**
624 * unregister_reboot_notifier - Unregister previously registered reboot notifier
625 * @nb: Hook to be unregistered
626 *
627 * Unregisters a previously registered reboot
628 * notifier function.
629 *
630 * Returns zero on success, or %-ENOENT on failure.
631 */
632
633int unregister_reboot_notifier(struct notifier_block * nb)
634{
635 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
636}
637
638EXPORT_SYMBOL(unregister_reboot_notifier);
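
A typical client is a driver that quiesces its hardware before the machine goes down; everything below is illustrative, not part of this patch:

	#include <linux/notifier.h>
	#include <linux/reboot.h>

	static int example_reboot_event(struct notifier_block *nb,
					unsigned long action, void *unused)
	{
		if (action == SYS_RESTART || action == SYS_HALT ||
		    action == SYS_POWER_OFF)
			;	/* stop DMA, flush device state, etc. (hypothetical) */
		return NOTIFY_DONE;
	}

	static struct notifier_block example_reboot_nb = {
		.notifier_call	= example_reboot_event,
	};

	/* pair register_reboot_notifier(&example_reboot_nb) at probe time
	 * with unregister_reboot_notifier(&example_reboot_nb) at removal */
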
639
640static int set_one_prio(struct task_struct *p, int niceval, int error) 109static int set_one_prio(struct task_struct *p, int niceval, int error)
641{ 110{
642 int no_nice; 111 int no_nice;
@@ -683,7 +152,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
683 switch (which) { 152 switch (which) {
684 case PRIO_PROCESS: 153 case PRIO_PROCESS:
685 if (who) 154 if (who)
686 p = find_task_by_pid(who); 155 p = find_task_by_vpid(who);
687 else 156 else
688 p = current; 157 p = current;
689 if (p) 158 if (p)
@@ -691,7 +160,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
691 break; 160 break;
692 case PRIO_PGRP: 161 case PRIO_PGRP:
693 if (who) 162 if (who)
694 pgrp = find_pid(who); 163 pgrp = find_vpid(who);
695 else 164 else
696 pgrp = task_pgrp(current); 165 pgrp = task_pgrp(current);
697 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 166 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
@@ -740,7 +209,7 @@ asmlinkage long sys_getpriority(int which, int who)
740 switch (which) { 209 switch (which) {
741 case PRIO_PROCESS: 210 case PRIO_PROCESS:
742 if (who) 211 if (who)
743 p = find_task_by_pid(who); 212 p = find_task_by_vpid(who);
744 else 213 else
745 p = current; 214 p = current;
746 if (p) { 215 if (p) {
@@ -751,7 +220,7 @@ asmlinkage long sys_getpriority(int which, int who)
751 break; 220 break;
752 case PRIO_PGRP: 221 case PRIO_PGRP:
753 if (who) 222 if (who)
754 pgrp = find_pid(who); 223 pgrp = find_vpid(who);
755 else 224 else
756 pgrp = task_pgrp(current); 225 pgrp = task_pgrp(current);
757 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 226 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
@@ -1448,9 +917,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1448 struct task_struct *p; 917 struct task_struct *p;
1449 struct task_struct *group_leader = current->group_leader; 918 struct task_struct *group_leader = current->group_leader;
1450 int err = -EINVAL; 919 int err = -EINVAL;
920 struct pid_namespace *ns;
1451 921
1452 if (!pid) 922 if (!pid)
1453 pid = group_leader->pid; 923 pid = task_pid_vnr(group_leader);
1454 if (!pgid) 924 if (!pgid)
1455 pgid = pid; 925 pgid = pid;
1456 if (pgid < 0) 926 if (pgid < 0)
@@ -1459,10 +929,12 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1459 /* From this point forward we keep holding onto the tasklist lock 929 /* From this point forward we keep holding onto the tasklist lock
1460 * so that our parent does not change from under us. -DaveM 930 * so that our parent does not change from under us. -DaveM
1461 */ 931 */
932 ns = current->nsproxy->pid_ns;
933
1462 write_lock_irq(&tasklist_lock); 934 write_lock_irq(&tasklist_lock);
1463 935
1464 err = -ESRCH; 936 err = -ESRCH;
1465 p = find_task_by_pid(pid); 937 p = find_task_by_pid_ns(pid, ns);
1466 if (!p) 938 if (!p)
1467 goto out; 939 goto out;
1468 940
@@ -1488,9 +960,9 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1488 goto out; 960 goto out;
1489 961
1490 if (pgid != pid) { 962 if (pgid != pid) {
1491 struct task_struct *g = 963 struct task_struct *g;
1492 find_task_by_pid_type(PIDTYPE_PGID, pgid);
1493 964
965 g = find_task_by_pid_type_ns(PIDTYPE_PGID, pgid, ns);
1494 if (!g || task_session(g) != task_session(group_leader)) 966 if (!g || task_session(g) != task_session(group_leader))
1495 goto out; 967 goto out;
1496 } 968 }
@@ -1499,10 +971,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1499 if (err) 971 if (err)
1500 goto out; 972 goto out;
1501 973
1502 if (process_group(p) != pgid) { 974 if (task_pgrp_nr_ns(p, ns) != pgid) {
975 struct pid *pid;
976
1503 detach_pid(p, PIDTYPE_PGID); 977 detach_pid(p, PIDTYPE_PGID);
1504 p->signal->pgrp = pgid; 978 pid = find_vpid(pgid);
1505 attach_pid(p, PIDTYPE_PGID, find_pid(pgid)); 979 attach_pid(p, PIDTYPE_PGID, pid);
980 set_task_pgrp(p, pid_nr(pid));
1506 } 981 }
1507 982
1508 err = 0; 983 err = 0;
@@ -1515,19 +990,21 @@ out:
1515asmlinkage long sys_getpgid(pid_t pid) 990asmlinkage long sys_getpgid(pid_t pid)
1516{ 991{
1517 if (!pid) 992 if (!pid)
1518 return process_group(current); 993 return task_pgrp_vnr(current);
1519 else { 994 else {
1520 int retval; 995 int retval;
1521 struct task_struct *p; 996 struct task_struct *p;
997 struct pid_namespace *ns;
1522 998
1523 read_lock(&tasklist_lock); 999 ns = current->nsproxy->pid_ns;
1524 p = find_task_by_pid(pid);
1525 1000
1001 read_lock(&tasklist_lock);
1002 p = find_task_by_pid_ns(pid, ns);
1526 retval = -ESRCH; 1003 retval = -ESRCH;
1527 if (p) { 1004 if (p) {
1528 retval = security_task_getpgid(p); 1005 retval = security_task_getpgid(p);
1529 if (!retval) 1006 if (!retval)
1530 retval = process_group(p); 1007 retval = task_pgrp_nr_ns(p, ns);
1531 } 1008 }
1532 read_unlock(&tasklist_lock); 1009 read_unlock(&tasklist_lock);
1533 return retval; 1010 return retval;
@@ -1539,7 +1016,7 @@ asmlinkage long sys_getpgid(pid_t pid)
1539asmlinkage long sys_getpgrp(void) 1016asmlinkage long sys_getpgrp(void)
1540{ 1017{
1541 /* SMP - assuming writes are word atomic this is fine */ 1018 /* SMP - assuming writes are word atomic this is fine */
1542 return process_group(current); 1019 return task_pgrp_vnr(current);
1543} 1020}
1544 1021
1545#endif 1022#endif
@@ -1547,19 +1024,21 @@ asmlinkage long sys_getpgrp(void)
1547asmlinkage long sys_getsid(pid_t pid) 1024asmlinkage long sys_getsid(pid_t pid)
1548{ 1025{
1549 if (!pid) 1026 if (!pid)
1550 return process_session(current); 1027 return task_session_vnr(current);
1551 else { 1028 else {
1552 int retval; 1029 int retval;
1553 struct task_struct *p; 1030 struct task_struct *p;
1031 struct pid_namespace *ns;
1554 1032
1555 read_lock(&tasklist_lock); 1033 ns = current->nsproxy->pid_ns;
1556 p = find_task_by_pid(pid);
1557 1034
1035 read_lock(&tasklist_lock);
1036 p = find_task_by_pid_ns(pid, ns);
1558 retval = -ESRCH; 1037 retval = -ESRCH;
1559 if (p) { 1038 if (p) {
1560 retval = security_task_getsid(p); 1039 retval = security_task_getsid(p);
1561 if (!retval) 1040 if (!retval)
1562 retval = process_session(p); 1041 retval = task_session_nr_ns(p, ns);
1563 } 1042 }
1564 read_unlock(&tasklist_lock); 1043 read_unlock(&tasklist_lock);
1565 return retval; 1044 return retval;
@@ -1586,7 +1065,8 @@ asmlinkage long sys_setsid(void)
1586 * session id and so the check will always fail and make it so 1065 * session id and so the check will always fail and make it so
1587 * init cannot successfully call setsid. 1066 * init cannot successfully call setsid.
1588 */ 1067 */
1589 if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session)) 1068 if (session > 1 && find_task_by_pid_type_ns(PIDTYPE_PGID,
1069 session, &init_pid_ns))
1590 goto out; 1070 goto out;
1591 1071
1592 group_leader->signal->leader = 1; 1072 group_leader->signal->leader = 1;
@@ -1596,7 +1076,7 @@ asmlinkage long sys_setsid(void)
1596 group_leader->signal->tty = NULL; 1076 group_leader->signal->tty = NULL;
1597 spin_unlock(&group_leader->sighand->siglock); 1077 spin_unlock(&group_leader->sighand->siglock);
1598 1078
1599 err = process_group(group_leader); 1079 err = task_pgrp_vnr(group_leader);
1600out: 1080out:
1601 write_unlock_irq(&tasklist_lock); 1081 write_unlock_irq(&tasklist_lock);
1602 return err; 1082 return err;
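
The pattern across the hunks above is uniform: pid values crossing the user/kernel boundary are translated relative to the caller's pid namespace instead of being used as raw global numbers. A sketch of the helpers involved, with an illustrative function and printout (the caller must hold rcu_read_lock() or tasklist_lock, as in the hunks above):

	#include <linux/kernel.h>
	#include <linux/pid_namespace.h>
	#include <linux/sched.h>

	static void show_pid_views(pid_t vpid)		/* illustrative only */
	{
		struct pid_namespace *ns = current->nsproxy->pid_ns;
		struct task_struct *p = find_task_by_vpid(vpid);

		if (!p)
			return;
		printk(KERN_DEBUG "vnr=%d global=%d in-ns=%d\n",
		       task_pid_vnr(p),		/* as the caller sees it  */
		       task_pid_nr(p),		/* init-namespace value   */
		       task_pid_nr_ns(p, ns));	/* in an explicit ns      */
	}
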
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 067554bda8b7..3b4efbe26445 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1888,7 +1888,7 @@ int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp,
1888 return -EPERM; 1888 return -EPERM;
1889 } 1889 }
1890 1890
1891 op = is_init(current) ? OP_SET : OP_AND; 1891 op = is_global_init(current) ? OP_SET : OP_AND;
1892 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 1892 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
1893 do_proc_dointvec_bset_conv,&op); 1893 do_proc_dointvec_bset_conv,&op);
1894} 1894}
@@ -2278,7 +2278,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2278 pid_t tmp; 2278 pid_t tmp;
2279 int r; 2279 int r;
2280 2280
2281 tmp = pid_nr(cad_pid); 2281 tmp = pid_nr_ns(cad_pid, current->nsproxy->pid_ns);
2282 2282
2283 r = __do_proc_dointvec(&tmp, table, write, filp, buffer, 2283 r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
2284 lenp, ppos, NULL, NULL); 2284 lenp, ppos, NULL, NULL);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 7d4d7f9c1bb2..9f360f68aad6 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -22,6 +22,10 @@
22#include <linux/delayacct.h> 22#include <linux/delayacct.h>
23#include <linux/cpumask.h> 23#include <linux/cpumask.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/cgroupstats.h>
26#include <linux/cgroup.h>
27#include <linux/fs.h>
28#include <linux/file.h>
25#include <net/genetlink.h> 29#include <net/genetlink.h>
26#include <asm/atomic.h> 30#include <asm/atomic.h>
27 31
@@ -49,6 +53,11 @@ __read_mostly = {
49 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
50 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
51 55
56static struct nla_policy
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59};
60
52struct listener { 61struct listener {
53 struct list_head list; 62 struct list_head list;
54 pid_t pid; 63 pid_t pid;
@@ -372,6 +381,51 @@ err:
372 return NULL; 381 return NULL;
373} 382}
374 383
384static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
385{
386 int rc = 0;
387	struct sk_buff *rep_skb = NULL;	/* so the error path can free it safely */
388 struct cgroupstats *stats;
389 struct nlattr *na;
390 size_t size;
391 u32 fd;
392 struct file *file;
393 int fput_needed;
394
395 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
396 if (!na)
397 return -EINVAL;
398
399 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
400 file = fget_light(fd, &fput_needed);
401 if (file) {
402 size = nla_total_size(sizeof(struct cgroupstats));
403
404 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
405 size);
406 if (rc < 0)
407 goto err;
408
409 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
410 sizeof(struct cgroupstats));
411 stats = nla_data(na);
412 memset(stats, 0, sizeof(*stats));
413
414 rc = cgroupstats_build(stats, file->f_dentry);
415 if (rc < 0)
416 goto err;
417
418 fput_light(file, fput_needed);
419 return send_reply(rep_skb, info->snd_pid);
420 }
421
422err:
423 if (file)
424 fput_light(file, fput_needed);
425 nlmsg_free(rep_skb);
426 return rc;
427}
428
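
From userspace, CGROUPSTATS_CMD_GET is sent on the taskstats genetlink family with an open file descriptor for a cgroup directory attached as CGROUPSTATS_CMD_ATTR_FD. A rough sketch assuming libnl-3 (the library choice, function name and cgroup path are assumptions of this sketch; error handling, cleanup and reply parsing are omitted):

	#include <fcntl.h>
	#include <netlink/netlink.h>
	#include <netlink/genl/genl.h>
	#include <netlink/genl/ctrl.h>
	#include <linux/taskstats.h>
	#include <linux/cgroupstats.h>

	static int request_cgroupstats(const char *cgroup_dir)
	{
		struct nl_sock *sk = nl_socket_alloc();
		struct nl_msg *msg = nlmsg_alloc();
		int fd = open(cgroup_dir, O_RDONLY);
		int family;

		genl_connect(sk);
		family = genl_ctrl_resolve(sk, TASKSTATS_GENL_NAME);

		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
			    CGROUPSTATS_CMD_GET, TASKSTATS_GENL_VERSION);
		/* the kernel resolves this fd to the cgroup's dentry */
		nla_put_u32(msg, CGROUPSTATS_CMD_ATTR_FD, fd);

		/* the CGROUPSTATS_CMD_NEW reply carries a struct cgroupstats
		 * in its CGROUPSTATS_TYPE_CGROUP_STATS attribute */
		return nl_send_auto(sk, msg);
	}
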
375static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 429static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
376{ 430{
377 int rc = 0; 431 int rc = 0;
@@ -522,6 +576,12 @@ static struct genl_ops taskstats_ops = {
522 .policy = taskstats_cmd_get_policy, 576 .policy = taskstats_cmd_get_policy,
523}; 577};
524 578
579static struct genl_ops cgroupstats_ops = {
580 .cmd = CGROUPSTATS_CMD_GET,
581 .doit = cgroupstats_user_cmd,
582 .policy = cgroupstats_cmd_get_policy,
583};
584
525/* Needed early in initialization */ 585/* Needed early in initialization */
526void __init taskstats_init_early(void) 586void __init taskstats_init_early(void)
527{ 587{
@@ -546,8 +606,15 @@ static int __init taskstats_init(void)
546 if (rc < 0) 606 if (rc < 0)
547 goto err; 607 goto err;
548 608
609 rc = genl_register_ops(&family, &cgroupstats_ops);
610 if (rc < 0)
611 goto err_cgroup_ops;
612
549 family_registered = 1; 613 family_registered = 1;
614	printk(KERN_INFO "registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
550 return 0; 615 return 0;
616err_cgroup_ops:
617 genl_unregister_ops(&family, &taskstats_ops);
551err: 618err:
552 genl_unregister_family(&family); 619 genl_unregister_family(&family);
553 return rc; 620 return rc;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 51b6a6a6158c..c8a9d13874df 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -207,15 +207,12 @@ static inline void clocksource_resume_watchdog(void) { }
207 */ 207 */
208void clocksource_resume(void) 208void clocksource_resume(void)
209{ 209{
210 struct list_head *tmp; 210 struct clocksource *cs;
211 unsigned long flags; 211 unsigned long flags;
212 212
213 spin_lock_irqsave(&clocksource_lock, flags); 213 spin_lock_irqsave(&clocksource_lock, flags);
214 214
215 list_for_each(tmp, &clocksource_list) { 215 list_for_each_entry(cs, &clocksource_list, list) {
216 struct clocksource *cs;
217
218 cs = list_entry(tmp, struct clocksource, list);
219 if (cs->resume) 216 if (cs->resume)
220 cs->resume(); 217 cs->resume();
221 } 218 }
@@ -369,7 +366,6 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
369 const char *buf, size_t count) 366 const char *buf, size_t count)
370{ 367{
371 struct clocksource *ovr = NULL; 368 struct clocksource *ovr = NULL;
372 struct list_head *tmp;
373 size_t ret = count; 369 size_t ret = count;
374 int len; 370 int len;
375 371
@@ -389,12 +385,11 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
389 385
390 len = strlen(override_name); 386 len = strlen(override_name);
391 if (len) { 387 if (len) {
388 struct clocksource *cs;
389
392 ovr = clocksource_override; 390 ovr = clocksource_override;
393 /* try to select it: */ 391 /* try to select it: */
394 list_for_each(tmp, &clocksource_list) { 392 list_for_each_entry(cs, &clocksource_list, list) {
395 struct clocksource *cs;
396
397 cs = list_entry(tmp, struct clocksource, list);
398 if (strlen(cs->name) == len && 393 if (strlen(cs->name) == len &&
399 !strcmp(cs->name, override_name)) 394 !strcmp(cs->name, override_name))
400 ovr = cs; 395 ovr = cs;
@@ -422,14 +417,11 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
422static ssize_t 417static ssize_t
423sysfs_show_available_clocksources(struct sys_device *dev, char *buf) 418sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
424{ 419{
425 struct list_head *tmp; 420 struct clocksource *src;
426 char *curr = buf; 421 char *curr = buf;
427 422
428 spin_lock_irq(&clocksource_lock); 423 spin_lock_irq(&clocksource_lock);
429 list_for_each(tmp, &clocksource_list) { 424 list_for_each_entry(src, &clocksource_list, list) {
430 struct clocksource *src;
431
432 src = list_entry(tmp, struct clocksource, list);
433 curr += sprintf(curr, "%s ", src->name); 425 curr += sprintf(curr, "%s ", src->name);
434 } 426 }
435 spin_unlock_irq(&clocksource_lock); 427 spin_unlock_irq(&clocksource_lock);
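
The conversion above is mechanical: list_for_each_entry() folds the list_entry() container lookup into the iterator, so the temporary struct list_head cursor disappears. In general, with an illustrative container type:

	#include <linux/list.h>

	struct foo {				/* illustrative container */
		struct list_head list;
		int value;
	};

	static LIST_HEAD(foo_list);

	static int foo_sum(void)
	{
		struct foo *f;
		int sum = 0;

		/* the iterator variable is already the container; no
		 * separate struct list_head cursor or list_entry() call */
		list_for_each_entry(f, &foo_list, list)
			sum += f->value;
		return sum;
	}
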
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ce89ffb474d0..10a1347597fd 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -153,6 +153,7 @@ void tick_nohz_stop_sched_tick(void)
153 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 153 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
154 struct tick_sched *ts; 154 struct tick_sched *ts;
155 ktime_t last_update, expires, now, delta; 155 ktime_t last_update, expires, now, delta;
156 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
156 int cpu; 157 int cpu;
157 158
158 local_irq_save(flags); 159 local_irq_save(flags);
@@ -302,11 +303,26 @@ void tick_nohz_stop_sched_tick(void)
302out: 303out:
303 ts->next_jiffies = next_jiffies; 304 ts->next_jiffies = next_jiffies;
304 ts->last_jiffies = last_jiffies; 305 ts->last_jiffies = last_jiffies;
306 ts->sleep_length = ktime_sub(dev->next_event, now);
305end: 307end:
306 local_irq_restore(flags); 308 local_irq_restore(flags);
307} 309}
308 310
309/** 311/**
312 * tick_nohz_get_sleep_length - return the length of the current sleep
313 *
314 * Called from power state control code with interrupts disabled
315 */
316ktime_t tick_nohz_get_sleep_length(void)
317{
318 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
319
320 return ts->sleep_length;
321}
322
323EXPORT_SYMBOL_GPL(tick_nohz_get_sleep_length);
324
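
The intended consumer is the idle/power-management path, which asks, with interrupts still disabled, how long the tick will remain stopped and sizes its sleep state accordingly. A sketch with an illustrative helper and latency threshold:

	#include <linux/ktime.h>
	#include <linux/tick.h>

	/* inside an idle governor, with interrupts already disabled */
	static int choose_deep_state(void)	/* illustrative helper */
	{
		ktime_t sleep = tick_nohz_get_sleep_length();
		s64 sleep_us = ktime_to_us(sleep);

		/* only pick a deep state if the expected idle period can
		 * amortize a (hypothetical) 200us exit latency */
		return sleep_us > 200;
	}
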
325/**
310 * nohz_restart_sched_tick - restart the idle tick from the idle task 326 * nohz_restart_sched_tick - restart the idle tick from the idle task
311 * 327 *
312 * Restart the idle tick when the CPU is woken up from idle 328 * Restart the idle tick when the CPU is woken up from idle
diff --git a/kernel/timer.c b/kernel/timer.c
index 8521d10fbb27..fb4e67d5dd60 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -26,6 +26,7 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/swap.h> 28#include <linux/swap.h>
29#include <linux/pid_namespace.h>
29#include <linux/notifier.h> 30#include <linux/notifier.h>
30#include <linux/thread_info.h> 31#include <linux/thread_info.h>
31#include <linux/time.h> 32#include <linux/time.h>
@@ -956,7 +957,7 @@ asmlinkage unsigned long sys_alarm(unsigned int seconds)
956 */ 957 */
957asmlinkage long sys_getpid(void) 958asmlinkage long sys_getpid(void)
958{ 959{
959 return current->tgid; 960 return task_tgid_vnr(current);
960} 961}
961 962
962/* 963/*
@@ -970,7 +971,7 @@ asmlinkage long sys_getppid(void)
970 int pid; 971 int pid;
971 972
972 rcu_read_lock(); 973 rcu_read_lock();
973 pid = rcu_dereference(current->real_parent)->tgid; 974 pid = task_ppid_nr_ns(current, current->nsproxy->pid_ns);
974 rcu_read_unlock(); 975 rcu_read_unlock();
975 976
976 return pid; 977 return pid;
@@ -1102,7 +1103,7 @@ EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1102/* Thread ID - the internal kernel "pid" */ 1103/* Thread ID - the internal kernel "pid" */
1103asmlinkage long sys_gettid(void) 1104asmlinkage long sys_gettid(void)
1104{ 1105{
1105 return current->pid; 1106 return task_pid_vnr(current);
1106} 1107}
1107 1108
1108/** 1109/**
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e080d1d744cc..52d5e7c9a8e6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -32,6 +32,7 @@
32#include <linux/freezer.h> 32#include <linux/freezer.h>
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 34#include <linux/debug_locks.h>
35#include <linux/lockdep.h>
35 36
36/* 37/*
37 * The per-CPU workqueue (if single thread, we always use the first 38 * The per-CPU workqueue (if single thread, we always use the first
@@ -61,6 +62,9 @@ struct workqueue_struct {
61 const char *name; 62 const char *name;
62 int singlethread; 63 int singlethread;
63 int freezeable; /* Freeze threads during suspend */ 64 int freezeable; /* Freeze threads during suspend */
65#ifdef CONFIG_LOCKDEP
66 struct lockdep_map lockdep_map;
67#endif
64}; 68};
65 69
66/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 70/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
@@ -250,6 +254,17 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
250 struct work_struct *work = list_entry(cwq->worklist.next, 254 struct work_struct *work = list_entry(cwq->worklist.next,
251 struct work_struct, entry); 255 struct work_struct, entry);
252 work_func_t f = work->func; 256 work_func_t f = work->func;
257#ifdef CONFIG_LOCKDEP
258 /*
259		 * It is permissible to free the struct work_struct from
260		 * inside the function that is called from it; lockdep
261		 * needs to take this into account.
262		 * To avoid bogus "held lock freed" warnings, as well as
263		 * problems when looking into work->lockdep_map, make a
264		 * copy and use that here.
265 */
266 struct lockdep_map lockdep_map = work->lockdep_map;
267#endif
253 268
254 cwq->current_work = work; 269 cwq->current_work = work;
255 list_del_init(cwq->worklist.next); 270 list_del_init(cwq->worklist.next);
@@ -257,13 +272,17 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
257 272
258 BUG_ON(get_wq_data(work) != cwq); 273 BUG_ON(get_wq_data(work) != cwq);
259 work_clear_pending(work); 274 work_clear_pending(work);
275 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
276 lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_);
260 f(work); 277 f(work);
278 lock_release(&lockdep_map, 1, _THIS_IP_);
279 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
261 280
262 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 281 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
263 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 282 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
264 "%s/0x%08x/%d\n", 283 "%s/0x%08x/%d\n",
265 current->comm, preempt_count(), 284 current->comm, preempt_count(),
266 current->pid); 285 task_pid_nr(current));
267 printk(KERN_ERR " last function: "); 286 printk(KERN_ERR " last function: ");
268 print_symbol("%s\n", (unsigned long)f); 287 print_symbol("%s\n", (unsigned long)f);
269 debug_show_held_locks(current); 288 debug_show_held_locks(current);
@@ -376,6 +395,8 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
376 int cpu; 395 int cpu;
377 396
378 might_sleep(); 397 might_sleep();
398 lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
399 lock_release(&wq->lockdep_map, 1, _THIS_IP_);
379 for_each_cpu_mask(cpu, *cpu_map) 400 for_each_cpu_mask(cpu, *cpu_map)
380 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 401 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
381} 402}
@@ -446,6 +467,9 @@ static void wait_on_work(struct work_struct *work)
446 467
447 might_sleep(); 468 might_sleep();
448 469
470 lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
471 lock_release(&work->lockdep_map, 1, _THIS_IP_);
472
449 cwq = get_wq_data(work); 473 cwq = get_wq_data(work);
450 if (!cwq) 474 if (!cwq)
451 return; 475 return;
@@ -695,8 +719,10 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
695 } 719 }
696} 720}
697 721
698struct workqueue_struct *__create_workqueue(const char *name, 722struct workqueue_struct *__create_workqueue_key(const char *name,
699 int singlethread, int freezeable) 723 int singlethread,
724 int freezeable,
725 struct lock_class_key *key)
700{ 726{
701 struct workqueue_struct *wq; 727 struct workqueue_struct *wq;
702 struct cpu_workqueue_struct *cwq; 728 struct cpu_workqueue_struct *cwq;
@@ -713,6 +739,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
713 } 739 }
714 740
715 wq->name = name; 741 wq->name = name;
742 lockdep_init_map(&wq->lockdep_map, name, key, 0);
716 wq->singlethread = singlethread; 743 wq->singlethread = singlethread;
717 wq->freezeable = freezeable; 744 wq->freezeable = freezeable;
718 INIT_LIST_HEAD(&wq->list); 745 INIT_LIST_HEAD(&wq->list);
@@ -741,7 +768,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
741 } 768 }
742 return wq; 769 return wq;
743} 770}
744EXPORT_SYMBOL_GPL(__create_workqueue); 771EXPORT_SYMBOL_GPL(__create_workqueue_key);
745 772
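
Renaming the constructor to take a struct lock_class_key gives every workqueue-creating call site its own lockdep class. The header side (not shown in this hunk) would presumably keep the old names as wrapper macros that supply a static key per call site, roughly along these lines:

	#define __create_workqueue(name, singlethread, freezeable)	\
	({								\
		static struct lock_class_key __key;			\
		__create_workqueue_key((name), (singlethread),		\
				       (freezeable), &__key);		\
	})

	#define create_workqueue(name) __create_workqueue((name), 0, 0)
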
746static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 773static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
747{ 774{
@@ -752,6 +779,9 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
752 if (cwq->thread == NULL) 779 if (cwq->thread == NULL)
753 return; 780 return;
754 781
782 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
783 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
784
755 flush_cpu_workqueue(cwq); 785 flush_cpu_workqueue(cwq);
756 /* 786 /*
757 * If the caller is CPU_DEAD and cwq->worklist was not empty, 787 * If the caller is CPU_DEAD and cwq->worklist was not empty,