aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.instrumentation49
-rw-r--r--kernel/Makefile10
-rw-r--r--kernel/acct.c66
-rw-r--r--kernel/audit.c14
-rw-r--r--kernel/auditfilter.c4
-rw-r--r--kernel/auditsc.c10
-rw-r--r--kernel/capability.c200
-rw-r--r--kernel/cgroup.c2805
-rw-r--r--kernel/cgroup_debug.c97
-rw-r--r--kernel/compat.c117
-rw-r--r--kernel/cpu.c14
-rw-r--r--kernel/cpu_acct.c186
-rw-r--r--kernel/cpuset.c1613
-rw-r--r--kernel/delayacct.c6
-rw-r--r--kernel/die_notifier.c38
-rw-r--r--kernel/dma.c8
-rw-r--r--kernel/exec_domain.c2
-rw-r--r--kernel/exit.c157
-rw-r--r--kernel/fork.c157
-rw-r--r--kernel/futex.c30
-rw-r--r--kernel/futex_compat.c3
-rw-r--r--kernel/hrtimer.c33
-rw-r--r--kernel/itimer.c4
-rw-r--r--kernel/kexec.c172
-rw-r--r--kernel/lockdep.c24
-rw-r--r--kernel/marker.c525
-rw-r--r--kernel/module.c68
-rw-r--r--kernel/notifier.c539
-rw-r--r--kernel/ns_cgroup.c100
-rw-r--r--kernel/nsproxy.c62
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/params.c8
-rw-r--r--kernel/pid.c353
-rw-r--r--kernel/posix-cpu-timers.c12
-rw-r--r--kernel/posix-timers.c21
-rw-r--r--kernel/power/Kconfig11
-rw-r--r--kernel/power/disk.c156
-rw-r--r--kernel/power/main.c48
-rw-r--r--kernel/power/power.h21
-rw-r--r--kernel/power/process.c141
-rw-r--r--kernel/power/snapshot.c62
-rw-r--r--kernel/power/swsusp.c33
-rw-r--r--kernel/power/user.c4
-rw-r--r--kernel/printk.c16
-rw-r--r--kernel/ptrace.c5
-rw-r--r--kernel/relay.c6
-rw-r--r--kernel/rtmutex-debug.c15
-rw-r--r--kernel/rtmutex.c2
-rw-r--r--kernel/sched.c366
-rw-r--r--kernel/sched_debug.c2
-rw-r--r--kernel/sched_stats.h8
-rw-r--r--kernel/signal.c67
-rw-r--r--kernel/softlockup.c2
-rw-r--r--kernel/sys.c587
-rw-r--r--kernel/sysctl.c280
-rw-r--r--kernel/sysctl_check.c1588
-rw-r--r--kernel/taskstats.c69
-rw-r--r--kernel/time.c14
-rw-r--r--kernel/time/clocksource.c22
-rw-r--r--kernel/time/tick-sched.c16
-rw-r--r--kernel/timer.c16
-rw-r--r--kernel/tsacct.c4
-rw-r--r--kernel/workqueue.c38
63 files changed, 8611 insertions, 2507 deletions
diff --git a/kernel/Kconfig.instrumentation b/kernel/Kconfig.instrumentation
new file mode 100644
index 000000000000..f5f2c769d95e
--- /dev/null
+++ b/kernel/Kconfig.instrumentation
@@ -0,0 +1,49 @@
1menuconfig INSTRUMENTATION
2 bool "Instrumentation Support"
3 default y
4 ---help---
5 Say Y here to get to see options related to performance measurement,
6 system-wide debugging, and testing. This option alone does not add any
7 kernel code.
8
9 If you say N, all options in this submenu will be skipped and
10 disabled. If you're trying to debug the kernel itself, go see the
11 Kernel Hacking menu.
12
13if INSTRUMENTATION
14
15config PROFILING
16 bool "Profiling support (EXPERIMENTAL)"
17 help
18 Say Y here to enable the extended profiling support mechanisms used
19 by profilers such as OProfile.
20
21config OPROFILE
22 tristate "OProfile system profiling (EXPERIMENTAL)"
23 depends on PROFILING
24 depends on ALPHA || ARM || BLACKFIN || X86_32 || IA64 || M32R || MIPS || PARISC || PPC || S390 || SUPERH || SPARC || X86_64
25 help
26 OProfile is a profiling system capable of profiling the
27 whole system, include the kernel, kernel modules, libraries,
28 and applications.
29
30 If unsure, say N.
31
32config KPROBES
33 bool "Kprobes"
34 depends on KALLSYMS && MODULES
35 depends on X86_32 || IA64 || PPC || S390 || SPARC64 || X86_64 || AVR32
36 help
37 Kprobes allows you to trap at almost any kernel address and
38 execute a callback function. register_kprobe() establishes
39 a probepoint and specifies the callback. Kprobes is useful
40 for kernel debugging, non-intrusive instrumentation and testing.
41 If in doubt, say "N".
42
43config MARKERS
44 bool "Activate markers"
45 help
46 Place an empty function call at each marker site. Can be
47 dynamically changed for a probe function.
48
49endif # INSTRUMENTATION
diff --git a/kernel/Makefile b/kernel/Makefile
index 2a999836ca18..79f017e09fbd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,9 +8,10 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \ 11 hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
12 utsname.o 12 utsname.o notifier.o
13 13
14obj-$(CONFIG_SYSCTL) += sysctl_check.o
14obj-$(CONFIG_STACKTRACE) += stacktrace.o 15obj-$(CONFIG_STACKTRACE) += stacktrace.o
15obj-y += time/ 16obj-y += time/
16obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 17obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
@@ -36,7 +37,11 @@ obj-$(CONFIG_PM) += power/
36obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 37obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
37obj-$(CONFIG_KEXEC) += kexec.o 38obj-$(CONFIG_KEXEC) += kexec.o
38obj-$(CONFIG_COMPAT) += compat.o 39obj-$(CONFIG_COMPAT) += compat.o
40obj-$(CONFIG_CGROUPS) += cgroup.o
41obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
39obj-$(CONFIG_CPUSETS) += cpuset.o 42obj-$(CONFIG_CPUSETS) += cpuset.o
43obj-$(CONFIG_CGROUP_CPUACCT) += cpu_acct.o
44obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
40obj-$(CONFIG_IKCONFIG) += configs.o 45obj-$(CONFIG_IKCONFIG) += configs.o
41obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 46obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
42obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 47obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -51,6 +56,7 @@ obj-$(CONFIG_RELAY) += relay.o
51obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 56obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
52obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 57obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
53obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 58obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
59obj-$(CONFIG_MARKERS) += marker.o
54 60
55ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 61ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
56# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 62# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 24f0f8b2ba72..fce53d8df8a7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -329,16 +329,16 @@ static comp_t encode_comp_t(unsigned long value)
329 } 329 }
330 330
331 /* 331 /*
332 * If we need to round up, do it (and handle overflow correctly). 332 * If we need to round up, do it (and handle overflow correctly).
333 */ 333 */
334 if (rnd && (++value > MAXFRACT)) { 334 if (rnd && (++value > MAXFRACT)) {
335 value >>= EXPSIZE; 335 value >>= EXPSIZE;
336 exp++; 336 exp++;
337 } 337 }
338 338
339 /* 339 /*
340 * Clean it up and polish it off. 340 * Clean it up and polish it off.
341 */ 341 */
342 exp <<= MANTSIZE; /* Shift the exponent into place */ 342 exp <<= MANTSIZE; /* Shift the exponent into place */
343 exp += value; /* and add on the mantissa. */ 343 exp += value; /* and add on the mantissa. */
344 return exp; 344 return exp;
@@ -361,30 +361,30 @@ static comp_t encode_comp_t(unsigned long value)
361 361
362static comp2_t encode_comp2_t(u64 value) 362static comp2_t encode_comp2_t(u64 value)
363{ 363{
364 int exp, rnd; 364 int exp, rnd;
365 365
366 exp = (value > (MAXFRACT2>>1)); 366 exp = (value > (MAXFRACT2>>1));
367 rnd = 0; 367 rnd = 0;
368 while (value > MAXFRACT2) { 368 while (value > MAXFRACT2) {
369 rnd = value & 1; 369 rnd = value & 1;
370 value >>= 1; 370 value >>= 1;
371 exp++; 371 exp++;
372 } 372 }
373 373
374 /* 374 /*
375 * If we need to round up, do it (and handle overflow correctly). 375 * If we need to round up, do it (and handle overflow correctly).
376 */ 376 */
377 if (rnd && (++value > MAXFRACT2)) { 377 if (rnd && (++value > MAXFRACT2)) {
378 value >>= 1; 378 value >>= 1;
379 exp++; 379 exp++;
380 } 380 }
381 381
382 if (exp > MAXEXP2) { 382 if (exp > MAXEXP2) {
383 /* Overflow. Return largest representable number instead. */ 383 /* Overflow. Return largest representable number instead. */
384 return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1; 384 return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
385 } else { 385 } else {
386 return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1)); 386 return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
387 } 387 }
388} 388}
389#endif 389#endif
390 390
@@ -501,14 +501,14 @@ static void do_acct_process(struct file *file)
501 ac.ac_swaps = encode_comp_t(0); 501 ac.ac_swaps = encode_comp_t(0);
502 502
503 /* 503 /*
504 * Kernel segment override to datasegment and write it 504 * Kernel segment override to datasegment and write it
505 * to the accounting file. 505 * to the accounting file.
506 */ 506 */
507 fs = get_fs(); 507 fs = get_fs();
508 set_fs(KERNEL_DS); 508 set_fs(KERNEL_DS);
509 /* 509 /*
510 * Accounting records are not subject to resource limits. 510 * Accounting records are not subject to resource limits.
511 */ 511 */
512 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 512 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
513 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; 513 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
514 file->f_op->write(file, (char *)&ac, 514 file->f_op->write(file, (char *)&ac,
diff --git a/kernel/audit.c b/kernel/audit.c
index 2924251a6547..6977ea57a7e2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -664,11 +664,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
664 if (sid) { 664 if (sid) {
665 if (selinux_sid_to_string( 665 if (selinux_sid_to_string(
666 sid, &ctx, &len)) { 666 sid, &ctx, &len)) {
667 audit_log_format(ab, 667 audit_log_format(ab,
668 " ssid=%u", sid); 668 " ssid=%u", sid);
669 /* Maybe call audit_panic? */ 669 /* Maybe call audit_panic? */
670 } else 670 } else
671 audit_log_format(ab, 671 audit_log_format(ab,
672 " subj=%s", ctx); 672 " subj=%s", ctx);
673 kfree(ctx); 673 kfree(ctx);
674 } 674 }
@@ -769,7 +769,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
769 sig_data->pid = audit_sig_pid; 769 sig_data->pid = audit_sig_pid;
770 memcpy(sig_data->ctx, ctx, len); 770 memcpy(sig_data->ctx, ctx, len);
771 kfree(ctx); 771 kfree(ctx);
772 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 772 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
773 0, 0, sig_data, sizeof(*sig_data) + len); 773 0, 0, sig_data, sizeof(*sig_data) + len);
774 kfree(sig_data); 774 kfree(sig_data);
775 break; 775 break;
@@ -1005,7 +1005,7 @@ unsigned int audit_serial(void)
1005 return ret; 1005 return ret;
1006} 1006}
1007 1007
1008static inline void audit_get_stamp(struct audit_context *ctx, 1008static inline void audit_get_stamp(struct audit_context *ctx,
1009 struct timespec *t, unsigned int *serial) 1009 struct timespec *t, unsigned int *serial)
1010{ 1010{
1011 if (ctx) 1011 if (ctx)
@@ -1056,7 +1056,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1056 if (gfp_mask & __GFP_WAIT) 1056 if (gfp_mask & __GFP_WAIT)
1057 reserve = 0; 1057 reserve = 0;
1058 else 1058 else
1059 reserve = 5; /* Allow atomic callers to go up to five 1059 reserve = 5; /* Allow atomic callers to go up to five
1060 entries over the normal backlog limit */ 1060 entries over the normal backlog limit */
1061 1061
1062 while (audit_backlog_limit 1062 while (audit_backlog_limit
@@ -1319,7 +1319,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1319 if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ 1319 if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
1320 /* FIXME: can we save some information here? */ 1320 /* FIXME: can we save some information here? */
1321 audit_log_format(ab, "<too long>"); 1321 audit_log_format(ab, "<too long>");
1322 } else 1322 } else
1323 audit_log_untrustedstring(ab, p); 1323 audit_log_untrustedstring(ab, p);
1324 kfree(path); 1324 kfree(path);
1325} 1325}
@@ -1365,7 +1365,7 @@ void audit_log_end(struct audit_buffer *ab)
1365 * audit_log_vformat, and audit_log_end. It may be called 1365 * audit_log_vformat, and audit_log_end. It may be called
1366 * in any context. 1366 * in any context.
1367 */ 1367 */
1368void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, 1368void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
1369 const char *fmt, ...) 1369 const char *fmt, ...)
1370{ 1370{
1371 struct audit_buffer *ab; 1371 struct audit_buffer *ab;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 359645cff5b2..df66a21fb360 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1498,7 +1498,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1498 * auditctl to read from it... which isn't ever going to 1498 * auditctl to read from it... which isn't ever going to
1499 * happen if we're actually running in the context of auditctl 1499 * happen if we're actually running in the context of auditctl
1500 * trying to _send_ the stuff */ 1500 * trying to _send_ the stuff */
1501 1501
1502 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); 1502 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
1503 if (!dest) 1503 if (!dest)
1504 return -ENOMEM; 1504 return -ENOMEM;
@@ -1678,7 +1678,7 @@ int audit_filter_type(int type)
1678{ 1678{
1679 struct audit_entry *e; 1679 struct audit_entry *e;
1680 int result = 0; 1680 int result = 0;
1681 1681
1682 rcu_read_lock(); 1682 rcu_read_lock();
1683 if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) 1683 if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
1684 goto unlock_and_return; 1684 goto unlock_and_return;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 938e60a61882..e19b5a33aede 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -320,7 +320,7 @@ static int audit_filter_rules(struct task_struct *tsk,
320 result = audit_comparator(tsk->personality, f->op, f->val); 320 result = audit_comparator(tsk->personality, f->op, f->val);
321 break; 321 break;
322 case AUDIT_ARCH: 322 case AUDIT_ARCH:
323 if (ctx) 323 if (ctx)
324 result = audit_comparator(ctx->arch, f->op, f->val); 324 result = audit_comparator(ctx->arch, f->op, f->val);
325 break; 325 break;
326 326
@@ -898,7 +898,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
898 if (context->personality != PER_LINUX) 898 if (context->personality != PER_LINUX)
899 audit_log_format(ab, " per=%lx", context->personality); 899 audit_log_format(ab, " per=%lx", context->personality);
900 if (context->return_valid) 900 if (context->return_valid)
901 audit_log_format(ab, " success=%s exit=%ld", 901 audit_log_format(ab, " success=%s exit=%ld",
902 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 902 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
903 context->return_code); 903 context->return_code);
904 904
@@ -1135,8 +1135,8 @@ void audit_free(struct task_struct *tsk)
1135 return; 1135 return;
1136 1136
1137 /* Check for system calls that do not go through the exit 1137 /* Check for system calls that do not go through the exit
1138 * function (e.g., exit_group), then free context block. 1138 * function (e.g., exit_group), then free context block.
1139 * We use GFP_ATOMIC here because we might be doing this 1139 * We use GFP_ATOMIC here because we might be doing this
1140 * in the context of the idle thread */ 1140 * in the context of the idle thread */
1141 /* that can happen only if we are called from do_exit() */ 1141 /* that can happen only if we are called from do_exit() */
1142 if (context->in_syscall && context->auditable) 1142 if (context->in_syscall && context->auditable)
@@ -1316,7 +1316,7 @@ void __audit_getname(const char *name)
1316 context->pwdmnt = mntget(current->fs->pwdmnt); 1316 context->pwdmnt = mntget(current->fs->pwdmnt);
1317 read_unlock(&current->fs->lock); 1317 read_unlock(&current->fs->lock);
1318 } 1318 }
1319 1319
1320} 1320}
1321 1321
1322/* audit_putname - intercept a putname request 1322/* audit_putname - intercept a putname request
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e350a36ed6a..efbd9cdce132 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -3,20 +3,18 @@
3 * 3 *
4 * Copyright (C) 1997 Andrew Main <zefram@fysh.org> 4 * Copyright (C) 1997 Andrew Main <zefram@fysh.org>
5 * 5 *
6 * Integrated into 2.1.97+, Andrew G. Morgan <morgan@transmeta.com> 6 * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org>
7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> 7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
8 */ 8 */
9 9
10#include <linux/capability.h> 10#include <linux/capability.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/security.h> 13#include <linux/security.h>
14#include <linux/syscalls.h> 14#include <linux/syscalls.h>
15#include <linux/pid_namespace.h>
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16 17
17unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
18kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
19
20/* 18/*
21 * This lock protects task->cap_* for all tasks including current. 19 * This lock protects task->cap_* for all tasks including current.
22 * Locking rule: acquire this prior to tasklist_lock. 20 * Locking rule: acquire this prior to tasklist_lock.
@@ -40,49 +38,49 @@ static DEFINE_SPINLOCK(task_capability_lock);
40 */ 38 */
41asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) 39asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
42{ 40{
43 int ret = 0; 41 int ret = 0;
44 pid_t pid; 42 pid_t pid;
45 __u32 version; 43 __u32 version;
46 struct task_struct *target; 44 struct task_struct *target;
47 struct __user_cap_data_struct data; 45 struct __user_cap_data_struct data;
48 46
49 if (get_user(version, &header->version)) 47 if (get_user(version, &header->version))
50 return -EFAULT; 48 return -EFAULT;
51 49
52 if (version != _LINUX_CAPABILITY_VERSION) { 50 if (version != _LINUX_CAPABILITY_VERSION) {
53 if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) 51 if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
54 return -EFAULT; 52 return -EFAULT;
55 return -EINVAL; 53 return -EINVAL;
56 } 54 }
57 55
58 if (get_user(pid, &header->pid)) 56 if (get_user(pid, &header->pid))
59 return -EFAULT; 57 return -EFAULT;
60 58
61 if (pid < 0) 59 if (pid < 0)
62 return -EINVAL; 60 return -EINVAL;
63 61
64 spin_lock(&task_capability_lock); 62 spin_lock(&task_capability_lock);
65 read_lock(&tasklist_lock); 63 read_lock(&tasklist_lock);
66 64
67 if (pid && pid != current->pid) { 65 if (pid && pid != task_pid_vnr(current)) {
68 target = find_task_by_pid(pid); 66 target = find_task_by_vpid(pid);
69 if (!target) { 67 if (!target) {
70 ret = -ESRCH; 68 ret = -ESRCH;
71 goto out; 69 goto out;
72 } 70 }
73 } else 71 } else
74 target = current; 72 target = current;
75 73
76 ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted); 74 ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted);
77 75
78out: 76out:
79 read_unlock(&tasklist_lock); 77 read_unlock(&tasklist_lock);
80 spin_unlock(&task_capability_lock); 78 spin_unlock(&task_capability_lock);
81 79
82 if (!ret && copy_to_user(dataptr, &data, sizeof data)) 80 if (!ret && copy_to_user(dataptr, &data, sizeof data))
83 return -EFAULT; 81 return -EFAULT;
84 82
85 return ret; 83 return ret;
86} 84}
87 85
88/* 86/*
@@ -98,7 +96,7 @@ static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
98 int found = 0; 96 int found = 0;
99 struct pid *pgrp; 97 struct pid *pgrp;
100 98
101 pgrp = find_pid(pgrp_nr); 99 pgrp = find_vpid(pgrp_nr);
102 do_each_pid_task(pgrp, PIDTYPE_PGID, g) { 100 do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
103 target = g; 101 target = g;
104 while_each_thread(g, target) { 102 while_each_thread(g, target) {
@@ -115,7 +113,7 @@ static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
115 } while_each_pid_task(pgrp, PIDTYPE_PGID, g); 113 } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
116 114
117 if (!found) 115 if (!found)
118 ret = 0; 116 ret = 0;
119 return ret; 117 return ret;
120} 118}
121 119
@@ -132,7 +130,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
132 int found = 0; 130 int found = 0;
133 131
134 do_each_thread(g, target) { 132 do_each_thread(g, target) {
135 if (target == current || is_init(target)) 133 if (target == current || is_container_init(target->group_leader))
136 continue; 134 continue;
137 found = 1; 135 found = 1;
138 if (security_capset_check(target, effective, inheritable, 136 if (security_capset_check(target, effective, inheritable,
@@ -169,68 +167,68 @@ static inline int cap_set_all(kernel_cap_t *effective,
169 */ 167 */
170asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) 168asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
171{ 169{
172 kernel_cap_t inheritable, permitted, effective; 170 kernel_cap_t inheritable, permitted, effective;
173 __u32 version; 171 __u32 version;
174 struct task_struct *target; 172 struct task_struct *target;
175 int ret; 173 int ret;
176 pid_t pid; 174 pid_t pid;
177 175
178 if (get_user(version, &header->version)) 176 if (get_user(version, &header->version))
179 return -EFAULT; 177 return -EFAULT;
180 178
181 if (version != _LINUX_CAPABILITY_VERSION) { 179 if (version != _LINUX_CAPABILITY_VERSION) {
182 if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) 180 if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
183 return -EFAULT; 181 return -EFAULT;
184 return -EINVAL; 182 return -EINVAL;
185 } 183 }
186 184
187 if (get_user(pid, &header->pid)) 185 if (get_user(pid, &header->pid))
188 return -EFAULT; 186 return -EFAULT;
189 187
190 if (pid && pid != current->pid && !capable(CAP_SETPCAP)) 188 if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
191 return -EPERM; 189 return -EPERM;
192 190
193 if (copy_from_user(&effective, &data->effective, sizeof(effective)) || 191 if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
194 copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) || 192 copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) ||
195 copy_from_user(&permitted, &data->permitted, sizeof(permitted))) 193 copy_from_user(&permitted, &data->permitted, sizeof(permitted)))
196 return -EFAULT; 194 return -EFAULT;
197 195
198 spin_lock(&task_capability_lock); 196 spin_lock(&task_capability_lock);
199 read_lock(&tasklist_lock); 197 read_lock(&tasklist_lock);
200 198
201 if (pid > 0 && pid != current->pid) { 199 if (pid > 0 && pid != task_pid_vnr(current)) {
202 target = find_task_by_pid(pid); 200 target = find_task_by_vpid(pid);
203 if (!target) { 201 if (!target) {
204 ret = -ESRCH; 202 ret = -ESRCH;
205 goto out; 203 goto out;
206 } 204 }
207 } else 205 } else
208 target = current; 206 target = current;
209 207
210 ret = 0; 208 ret = 0;
211 209
212 /* having verified that the proposed changes are legal, 210 /* having verified that the proposed changes are legal,
213 we now put them into effect. */ 211 we now put them into effect. */
214 if (pid < 0) { 212 if (pid < 0) {
215 if (pid == -1) /* all procs other than current and init */ 213 if (pid == -1) /* all procs other than current and init */
216 ret = cap_set_all(&effective, &inheritable, &permitted); 214 ret = cap_set_all(&effective, &inheritable, &permitted);
217 215
218 else /* all procs in process group */ 216 else /* all procs in process group */
219 ret = cap_set_pg(-pid, &effective, &inheritable, 217 ret = cap_set_pg(-pid, &effective, &inheritable,
220 &permitted); 218 &permitted);
221 } else { 219 } else {
222 ret = security_capset_check(target, &effective, &inheritable, 220 ret = security_capset_check(target, &effective, &inheritable,
223 &permitted); 221 &permitted);
224 if (!ret) 222 if (!ret)
225 security_capset_set(target, &effective, &inheritable, 223 security_capset_set(target, &effective, &inheritable,
226 &permitted); 224 &permitted);
227 } 225 }
228 226
229out: 227out:
230 read_unlock(&tasklist_lock); 228 read_unlock(&tasklist_lock);
231 spin_unlock(&task_capability_lock); 229 spin_unlock(&task_capability_lock);
232 230
233 return ret; 231 return ret;
234} 232}
235 233
236int __capable(struct task_struct *t, int cap) 234int __capable(struct task_struct *t, int cap)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
new file mode 100644
index 000000000000..5987dccdb2a0
--- /dev/null
+++ b/kernel/cgroup.c
@@ -0,0 +1,2805 @@
1/*
2 * kernel/cgroup.c
3 *
4 * Generic process-grouping system.
5 *
6 * Based originally on the cpuset system, extracted by Paul Menage
7 * Copyright (C) 2006 Google, Inc
8 *
9 * Copyright notices from the original cpuset code:
10 * --------------------------------------------------
11 * Copyright (C) 2003 BULL SA.
12 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
13 *
14 * Portions derived from Patrick Mochel's sysfs code.
15 * sysfs is Copyright (c) 2001-3 Patrick Mochel
16 *
17 * 2003-10-10 Written by Simon Derr.
18 * 2003-10-22 Updates by Stephen Hemminger.
19 * 2004 May-July Rework by Paul Jackson.
20 * ---------------------------------------------------
21 *
22 * This file is subject to the terms and conditions of the GNU General Public
23 * License. See the file COPYING in the main directory of the Linux
24 * distribution for more details.
25 */
26
27#include <linux/cgroup.h>
28#include <linux/errno.h>
29#include <linux/fs.h>
30#include <linux/kernel.h>
31#include <linux/list.h>
32#include <linux/mm.h>
33#include <linux/mutex.h>
34#include <linux/mount.h>
35#include <linux/pagemap.h>
36#include <linux/proc_fs.h>
37#include <linux/rcupdate.h>
38#include <linux/sched.h>
39#include <linux/backing-dev.h>
40#include <linux/seq_file.h>
41#include <linux/slab.h>
42#include <linux/magic.h>
43#include <linux/spinlock.h>
44#include <linux/string.h>
45#include <linux/sort.h>
46#include <linux/kmod.h>
47#include <linux/delayacct.h>
48#include <linux/cgroupstats.h>
49
50#include <asm/atomic.h>
51
52static DEFINE_MUTEX(cgroup_mutex);
53
54/* Generate an array of cgroup subsystem pointers */
55#define SUBSYS(_x) &_x ## _subsys,
56
57static struct cgroup_subsys *subsys[] = {
58#include <linux/cgroup_subsys.h>
59};
60
61/*
62 * A cgroupfs_root represents the root of a cgroup hierarchy,
63 * and may be associated with a superblock to form an active
64 * hierarchy
65 */
66struct cgroupfs_root {
67 struct super_block *sb;
68
69 /*
70 * The bitmask of subsystems intended to be attached to this
71 * hierarchy
72 */
73 unsigned long subsys_bits;
74
75 /* The bitmask of subsystems currently attached to this hierarchy */
76 unsigned long actual_subsys_bits;
77
78 /* A list running through the attached subsystems */
79 struct list_head subsys_list;
80
81 /* The root cgroup for this hierarchy */
82 struct cgroup top_cgroup;
83
84 /* Tracks how many cgroups are currently defined in hierarchy.*/
85 int number_of_cgroups;
86
87 /* A list running through the mounted hierarchies */
88 struct list_head root_list;
89
90 /* Hierarchy-specific flags */
91 unsigned long flags;
92
93 /* The path to use for release notifications. No locking
94 * between setting and use - so if userspace updates this
95 * while child cgroups exist, you could miss a
96 * notification. We ensure that it's always a valid
97 * NUL-terminated string */
98 char release_agent_path[PATH_MAX];
99};
100
101
102/*
103 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
104 * subsystems that are otherwise unattached - it never has more than a
105 * single cgroup, and all tasks are part of that cgroup.
106 */
107static struct cgroupfs_root rootnode;
108
109/* The list of hierarchy roots */
110
111static LIST_HEAD(roots);
112static int root_count;
113
114/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
115#define dummytop (&rootnode.top_cgroup)
116
117/* This flag indicates whether tasks in the fork and exit paths should
118 * take callback_mutex and check for fork/exit handlers to call. This
119 * avoids us having to do extra work in the fork/exit path if none of the
120 * subsystems need to be called.
121 */
122static int need_forkexit_callback;
123
124/* bits in struct cgroup flags field */
125enum {
126 /* Control Group is dead */
127 CGRP_REMOVED,
128 /* Control Group has previously had a child cgroup or a task,
129 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
130 CGRP_RELEASABLE,
131 /* Control Group requires release notifications to userspace */
132 CGRP_NOTIFY_ON_RELEASE,
133};
134
135/* convenient tests for these bits */
136inline int cgroup_is_removed(const struct cgroup *cgrp)
137{
138 return test_bit(CGRP_REMOVED, &cgrp->flags);
139}
140
141/* bits in struct cgroupfs_root flags field */
142enum {
143 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
144};
145
146inline int cgroup_is_releasable(const struct cgroup *cgrp)
147{
148 const int bits =
149 (1 << CGRP_RELEASABLE) |
150 (1 << CGRP_NOTIFY_ON_RELEASE);
151 return (cgrp->flags & bits) == bits;
152}
153
154inline int notify_on_release(const struct cgroup *cgrp)
155{
156 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
157}
158
159/*
160 * for_each_subsys() allows you to iterate on each subsystem attached to
161 * an active hierarchy
162 */
163#define for_each_subsys(_root, _ss) \
164list_for_each_entry(_ss, &_root->subsys_list, sibling)
165
166/* for_each_root() allows you to iterate across the active hierarchies */
167#define for_each_root(_root) \
168list_for_each_entry(_root, &roots, root_list)
169
170/* the list of cgroups eligible for automatic release. Protected by
171 * release_list_lock */
172static LIST_HEAD(release_list);
173static DEFINE_SPINLOCK(release_list_lock);
174static void cgroup_release_agent(struct work_struct *work);
175static DECLARE_WORK(release_agent_work, cgroup_release_agent);
176static void check_for_release(struct cgroup *cgrp);
177
178/* Link structure for associating css_set objects with cgroups */
179struct cg_cgroup_link {
180 /*
181 * List running through cg_cgroup_links associated with a
182 * cgroup, anchored on cgroup->css_sets
183 */
184 struct list_head cgrp_link_list;
185 /*
186 * List running through cg_cgroup_links pointing at a
187 * single css_set object, anchored on css_set->cg_links
188 */
189 struct list_head cg_link_list;
190 struct css_set *cg;
191};
192
193/* The default css_set - used by init and its children prior to any
194 * hierarchies being mounted. It contains a pointer to the root state
195 * for each subsystem. Also used to anchor the list of css_sets. Not
196 * reference-counted, to improve performance when child cgroups
197 * haven't been created.
198 */
199
200static struct css_set init_css_set;
201static struct cg_cgroup_link init_css_set_link;
202
203/* css_set_lock protects the list of css_set objects, and the
204 * chain of tasks off each css_set. Nests outside task->alloc_lock
205 * due to cgroup_iter_start() */
206static DEFINE_RWLOCK(css_set_lock);
207static int css_set_count;
208
209/* We don't maintain the lists running through each css_set to its
210 * task until after the first call to cgroup_iter_start(). This
211 * reduces the fork()/exit() overhead for people who have cgroups
212 * compiled into their kernel but not actually in use */
213static int use_task_css_set_links;
214
215/* When we create or destroy a css_set, the operation simply
216 * takes/releases a reference count on all the cgroups referenced
217 * by subsystems in this css_set. This can end up multiple-counting
218 * some cgroups, but that's OK - the ref-count is just a
219 * busy/not-busy indicator; ensuring that we only count each cgroup
220 * once would require taking a global lock to ensure that no
221 * subsystems moved between hierarchies while we were doing so.
222 *
223 * Possible TODO: decide at boot time based on the number of
224 * registered subsystems and the number of CPUs or NUMA nodes whether
225 * it's better for performance to ref-count every subsystem, or to
226 * take a global lock and only add one ref count to each hierarchy.
227 */
228
229/*
230 * unlink a css_set from the list and free it
231 */
232static void unlink_css_set(struct css_set *cg)
233{
234 write_lock(&css_set_lock);
235 list_del(&cg->list);
236 css_set_count--;
237 while (!list_empty(&cg->cg_links)) {
238 struct cg_cgroup_link *link;
239 link = list_entry(cg->cg_links.next,
240 struct cg_cgroup_link, cg_link_list);
241 list_del(&link->cg_link_list);
242 list_del(&link->cgrp_link_list);
243 kfree(link);
244 }
245 write_unlock(&css_set_lock);
246}
247
248static void __release_css_set(struct kref *k, int taskexit)
249{
250 int i;
251 struct css_set *cg = container_of(k, struct css_set, ref);
252
253 unlink_css_set(cg);
254
255 rcu_read_lock();
256 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
257 struct cgroup *cgrp = cg->subsys[i]->cgroup;
258 if (atomic_dec_and_test(&cgrp->count) &&
259 notify_on_release(cgrp)) {
260 if (taskexit)
261 set_bit(CGRP_RELEASABLE, &cgrp->flags);
262 check_for_release(cgrp);
263 }
264 }
265 rcu_read_unlock();
266 kfree(cg);
267}
268
269static void release_css_set(struct kref *k)
270{
271 __release_css_set(k, 0);
272}
273
274static void release_css_set_taskexit(struct kref *k)
275{
276 __release_css_set(k, 1);
277}
278
279/*
280 * refcounted get/put for css_set objects
281 */
282static inline void get_css_set(struct css_set *cg)
283{
284 kref_get(&cg->ref);
285}
286
287static inline void put_css_set(struct css_set *cg)
288{
289 kref_put(&cg->ref, release_css_set);
290}
291
292static inline void put_css_set_taskexit(struct css_set *cg)
293{
294 kref_put(&cg->ref, release_css_set_taskexit);
295}
296
297/*
298 * find_existing_css_set() is a helper for
299 * find_css_set(), and checks to see whether an existing
300 * css_set is suitable. This currently walks a linked-list for
301 * simplicity; a later patch will use a hash table for better
302 * performance
303 *
304 * oldcg: the cgroup group that we're using before the cgroup
305 * transition
306 *
307 * cgrp: the cgroup that we're moving into
308 *
309 * template: location in which to build the desired set of subsystem
310 * state objects for the new cgroup group
311 */
312
313static struct css_set *find_existing_css_set(
314 struct css_set *oldcg,
315 struct cgroup *cgrp,
316 struct cgroup_subsys_state *template[])
317{
318 int i;
319 struct cgroupfs_root *root = cgrp->root;
320 struct list_head *l = &init_css_set.list;
321
322 /* Built the set of subsystem state objects that we want to
323 * see in the new css_set */
324 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
325 if (root->subsys_bits & (1ull << i)) {
326 /* Subsystem is in this hierarchy. So we want
327 * the subsystem state from the new
328 * cgroup */
329 template[i] = cgrp->subsys[i];
330 } else {
331 /* Subsystem is not in this hierarchy, so we
332 * don't want to change the subsystem state */
333 template[i] = oldcg->subsys[i];
334 }
335 }
336
337 /* Look through existing cgroup groups to find one to reuse */
338 do {
339 struct css_set *cg =
340 list_entry(l, struct css_set, list);
341
342 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
343 /* All subsystems matched */
344 return cg;
345 }
346 /* Try the next cgroup group */
347 l = l->next;
348 } while (l != &init_css_set.list);
349
350 /* No existing cgroup group matched */
351 return NULL;
352}
353
354/*
355 * allocate_cg_links() allocates "count" cg_cgroup_link structures
356 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
357 * success or a negative error
358 */
359
360static int allocate_cg_links(int count, struct list_head *tmp)
361{
362 struct cg_cgroup_link *link;
363 int i;
364 INIT_LIST_HEAD(tmp);
365 for (i = 0; i < count; i++) {
366 link = kmalloc(sizeof(*link), GFP_KERNEL);
367 if (!link) {
368 while (!list_empty(tmp)) {
369 link = list_entry(tmp->next,
370 struct cg_cgroup_link,
371 cgrp_link_list);
372 list_del(&link->cgrp_link_list);
373 kfree(link);
374 }
375 return -ENOMEM;
376 }
377 list_add(&link->cgrp_link_list, tmp);
378 }
379 return 0;
380}
381
382static void free_cg_links(struct list_head *tmp)
383{
384 while (!list_empty(tmp)) {
385 struct cg_cgroup_link *link;
386 link = list_entry(tmp->next,
387 struct cg_cgroup_link,
388 cgrp_link_list);
389 list_del(&link->cgrp_link_list);
390 kfree(link);
391 }
392}
393
394/*
395 * find_css_set() takes an existing cgroup group and a
396 * cgroup object, and returns a css_set object that's
397 * equivalent to the old group, but with the given cgroup
398 * substituted into the appropriate hierarchy. Must be called with
399 * cgroup_mutex held
400 */
401
402static struct css_set *find_css_set(
403 struct css_set *oldcg, struct cgroup *cgrp)
404{
405 struct css_set *res;
406 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
407 int i;
408
409 struct list_head tmp_cg_links;
410 struct cg_cgroup_link *link;
411
412 /* First see if we already have a cgroup group that matches
413 * the desired set */
414 write_lock(&css_set_lock);
415 res = find_existing_css_set(oldcg, cgrp, template);
416 if (res)
417 get_css_set(res);
418 write_unlock(&css_set_lock);
419
420 if (res)
421 return res;
422
423 res = kmalloc(sizeof(*res), GFP_KERNEL);
424 if (!res)
425 return NULL;
426
427 /* Allocate all the cg_cgroup_link objects that we'll need */
428 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
429 kfree(res);
430 return NULL;
431 }
432
433 kref_init(&res->ref);
434 INIT_LIST_HEAD(&res->cg_links);
435 INIT_LIST_HEAD(&res->tasks);
436
437 /* Copy the set of subsystem state objects generated in
438 * find_existing_css_set() */
439 memcpy(res->subsys, template, sizeof(res->subsys));
440
441 write_lock(&css_set_lock);
442 /* Add reference counts and links from the new css_set. */
443 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
444 struct cgroup *cgrp = res->subsys[i]->cgroup;
445 struct cgroup_subsys *ss = subsys[i];
446 atomic_inc(&cgrp->count);
447 /*
448 * We want to add a link once per cgroup, so we
449 * only do it for the first subsystem in each
450 * hierarchy
451 */
452 if (ss->root->subsys_list.next == &ss->sibling) {
453 BUG_ON(list_empty(&tmp_cg_links));
454 link = list_entry(tmp_cg_links.next,
455 struct cg_cgroup_link,
456 cgrp_link_list);
457 list_del(&link->cgrp_link_list);
458 list_add(&link->cgrp_link_list, &cgrp->css_sets);
459 link->cg = res;
460 list_add(&link->cg_link_list, &res->cg_links);
461 }
462 }
463 if (list_empty(&rootnode.subsys_list)) {
464 link = list_entry(tmp_cg_links.next,
465 struct cg_cgroup_link,
466 cgrp_link_list);
467 list_del(&link->cgrp_link_list);
468 list_add(&link->cgrp_link_list, &dummytop->css_sets);
469 link->cg = res;
470 list_add(&link->cg_link_list, &res->cg_links);
471 }
472
473 BUG_ON(!list_empty(&tmp_cg_links));
474
475 /* Link this cgroup group into the list */
476 list_add(&res->list, &init_css_set.list);
477 css_set_count++;
478 INIT_LIST_HEAD(&res->tasks);
479 write_unlock(&css_set_lock);
480
481 return res;
482}
483
484/*
485 * There is one global cgroup mutex. We also require taking
486 * task_lock() when dereferencing a task's cgroup subsys pointers.
487 * See "The task_lock() exception", at the end of this comment.
488 *
489 * A task must hold cgroup_mutex to modify cgroups.
490 *
491 * Any task can increment and decrement the count field without lock.
492 * So in general, code holding cgroup_mutex can't rely on the count
493 * field not changing. However, if the count goes to zero, then only
494 * attach_task() can increment it again. Because a count of zero
495 * means that no tasks are currently attached, therefore there is no
496 * way a task attached to that cgroup can fork (the other way to
497 * increment the count). So code holding cgroup_mutex can safely
498 * assume that if the count is zero, it will stay zero. Similarly, if
499 * a task holds cgroup_mutex on a cgroup with zero count, it
500 * knows that the cgroup won't be removed, as cgroup_rmdir()
501 * needs that mutex.
502 *
503 * The cgroup_common_file_write handler for operations that modify
504 * the cgroup hierarchy holds cgroup_mutex across the entire operation,
505 * single threading all such cgroup modifications across the system.
506 *
507 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
508 * (usually) take cgroup_mutex. These are the two most performance
509 * critical pieces of code here. The exception occurs on cgroup_exit(),
510 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
511 * is taken, and if the cgroup count is zero, a usermode call made
512 * to /sbin/cgroup_release_agent with the name of the cgroup (path
513 * relative to the root of cgroup file system) as the argument.
514 *
515 * A cgroup can only be deleted if both its 'count' of using tasks
516 * is zero, and its list of 'children' cgroups is empty. Since all
517 * tasks in the system use _some_ cgroup, and since there is always at
518 * least one task in the system (init, pid == 1), therefore, top_cgroup
519 * always has either children cgroups and/or using tasks. So we don't
520 * need a special hack to ensure that top_cgroup cannot be deleted.
521 *
522 * The task_lock() exception
523 *
524 * The need for this exception arises from the action of
525 * attach_task(), which overwrites one tasks cgroup pointer with
526 * another. It does so using cgroup_mutexe, however there are
527 * several performance critical places that need to reference
528 * task->cgroup without the expense of grabbing a system global
529 * mutex. Therefore except as noted below, when dereferencing or, as
530 * in attach_task(), modifying a task'ss cgroup pointer we use
531 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
532 * the task_struct routinely used for such matters.
533 *
534 * P.S. One more locking exception. RCU is used to guard the
535 * update of a tasks cgroup pointer by attach_task()
536 */
537
538/**
539 * cgroup_lock - lock out any changes to cgroup structures
540 *
541 */
542
543void cgroup_lock(void)
544{
545 mutex_lock(&cgroup_mutex);
546}
547
548/**
549 * cgroup_unlock - release lock on cgroup changes
550 *
551 * Undo the lock taken in a previous cgroup_lock() call.
552 */
553
554void cgroup_unlock(void)
555{
556 mutex_unlock(&cgroup_mutex);
557}
558
559/*
560 * A couple of forward declarations required, due to cyclic reference loop:
561 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
562 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
563 * -> cgroup_mkdir.
564 */
565
566static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
567static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
568static int cgroup_populate_dir(struct cgroup *cgrp);
569static struct inode_operations cgroup_dir_inode_operations;
570static struct file_operations proc_cgroupstats_operations;
571
572static struct backing_dev_info cgroup_backing_dev_info = {
573 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
574};
575
576static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
577{
578 struct inode *inode = new_inode(sb);
579
580 if (inode) {
581 inode->i_mode = mode;
582 inode->i_uid = current->fsuid;
583 inode->i_gid = current->fsgid;
584 inode->i_blocks = 0;
585 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
586 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
587 }
588 return inode;
589}
590
591static void cgroup_diput(struct dentry *dentry, struct inode *inode)
592{
593 /* is dentry a directory ? if so, kfree() associated cgroup */
594 if (S_ISDIR(inode->i_mode)) {
595 struct cgroup *cgrp = dentry->d_fsdata;
596 BUG_ON(!(cgroup_is_removed(cgrp)));
597 /* It's possible for external users to be holding css
598 * reference counts on a cgroup; css_put() needs to
599 * be able to access the cgroup after decrementing
600 * the reference count in order to know if it needs to
601 * queue the cgroup to be handled by the release
602 * agent */
603 synchronize_rcu();
604 kfree(cgrp);
605 }
606 iput(inode);
607}
608
609static void remove_dir(struct dentry *d)
610{
611 struct dentry *parent = dget(d->d_parent);
612
613 d_delete(d);
614 simple_rmdir(parent->d_inode, d);
615 dput(parent);
616}
617
618static void cgroup_clear_directory(struct dentry *dentry)
619{
620 struct list_head *node;
621
622 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
623 spin_lock(&dcache_lock);
624 node = dentry->d_subdirs.next;
625 while (node != &dentry->d_subdirs) {
626 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
627 list_del_init(node);
628 if (d->d_inode) {
629 /* This should never be called on a cgroup
630 * directory with child cgroups */
631 BUG_ON(d->d_inode->i_mode & S_IFDIR);
632 d = dget_locked(d);
633 spin_unlock(&dcache_lock);
634 d_delete(d);
635 simple_unlink(dentry->d_inode, d);
636 dput(d);
637 spin_lock(&dcache_lock);
638 }
639 node = dentry->d_subdirs.next;
640 }
641 spin_unlock(&dcache_lock);
642}
643
644/*
645 * NOTE : the dentry must have been dget()'ed
646 */
647static void cgroup_d_remove_dir(struct dentry *dentry)
648{
649 cgroup_clear_directory(dentry);
650
651 spin_lock(&dcache_lock);
652 list_del_init(&dentry->d_u.d_child);
653 spin_unlock(&dcache_lock);
654 remove_dir(dentry);
655}
656
657static int rebind_subsystems(struct cgroupfs_root *root,
658 unsigned long final_bits)
659{
660 unsigned long added_bits, removed_bits;
661 struct cgroup *cgrp = &root->top_cgroup;
662 int i;
663
664 removed_bits = root->actual_subsys_bits & ~final_bits;
665 added_bits = final_bits & ~root->actual_subsys_bits;
666 /* Check that any added subsystems are currently free */
667 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
668 unsigned long long bit = 1ull << i;
669 struct cgroup_subsys *ss = subsys[i];
670 if (!(bit & added_bits))
671 continue;
672 if (ss->root != &rootnode) {
673 /* Subsystem isn't free */
674 return -EBUSY;
675 }
676 }
677
678 /* Currently we don't handle adding/removing subsystems when
679 * any child cgroups exist. This is theoretically supportable
680 * but involves complex error handling, so it's being left until
681 * later */
682 if (!list_empty(&cgrp->children))
683 return -EBUSY;
684
685 /* Process each subsystem */
686 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
687 struct cgroup_subsys *ss = subsys[i];
688 unsigned long bit = 1UL << i;
689 if (bit & added_bits) {
690 /* We're binding this subsystem to this hierarchy */
691 BUG_ON(cgrp->subsys[i]);
692 BUG_ON(!dummytop->subsys[i]);
693 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
694 cgrp->subsys[i] = dummytop->subsys[i];
695 cgrp->subsys[i]->cgroup = cgrp;
696 list_add(&ss->sibling, &root->subsys_list);
697 rcu_assign_pointer(ss->root, root);
698 if (ss->bind)
699 ss->bind(ss, cgrp);
700
701 } else if (bit & removed_bits) {
702 /* We're removing this subsystem */
703 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
704 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
705 if (ss->bind)
706 ss->bind(ss, dummytop);
707 dummytop->subsys[i]->cgroup = dummytop;
708 cgrp->subsys[i] = NULL;
709 rcu_assign_pointer(subsys[i]->root, &rootnode);
710 list_del(&ss->sibling);
711 } else if (bit & final_bits) {
712 /* Subsystem state should already exist */
713 BUG_ON(!cgrp->subsys[i]);
714 } else {
715 /* Subsystem state shouldn't exist */
716 BUG_ON(cgrp->subsys[i]);
717 }
718 }
719 root->subsys_bits = root->actual_subsys_bits = final_bits;
720 synchronize_rcu();
721
722 return 0;
723}
724
725static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
726{
727 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
728 struct cgroup_subsys *ss;
729
730 mutex_lock(&cgroup_mutex);
731 for_each_subsys(root, ss)
732 seq_printf(seq, ",%s", ss->name);
733 if (test_bit(ROOT_NOPREFIX, &root->flags))
734 seq_puts(seq, ",noprefix");
735 if (strlen(root->release_agent_path))
736 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
737 mutex_unlock(&cgroup_mutex);
738 return 0;
739}
740
741struct cgroup_sb_opts {
742 unsigned long subsys_bits;
743 unsigned long flags;
744 char *release_agent;
745};
746
747/* Convert a hierarchy specifier into a bitmask of subsystems and
748 * flags. */
749static int parse_cgroupfs_options(char *data,
750 struct cgroup_sb_opts *opts)
751{
752 char *token, *o = data ?: "all";
753
754 opts->subsys_bits = 0;
755 opts->flags = 0;
756 opts->release_agent = NULL;
757
758 while ((token = strsep(&o, ",")) != NULL) {
759 if (!*token)
760 return -EINVAL;
761 if (!strcmp(token, "all")) {
762 opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
763 } else if (!strcmp(token, "noprefix")) {
764 set_bit(ROOT_NOPREFIX, &opts->flags);
765 } else if (!strncmp(token, "release_agent=", 14)) {
766 /* Specifying two release agents is forbidden */
767 if (opts->release_agent)
768 return -EINVAL;
769 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
770 if (!opts->release_agent)
771 return -ENOMEM;
772 strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
773 opts->release_agent[PATH_MAX - 1] = 0;
774 } else {
775 struct cgroup_subsys *ss;
776 int i;
777 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
778 ss = subsys[i];
779 if (!strcmp(token, ss->name)) {
780 set_bit(i, &opts->subsys_bits);
781 break;
782 }
783 }
784 if (i == CGROUP_SUBSYS_COUNT)
785 return -ENOENT;
786 }
787 }
788
789 /* We can't have an empty hierarchy */
790 if (!opts->subsys_bits)
791 return -EINVAL;
792
793 return 0;
794}
795
796static int cgroup_remount(struct super_block *sb, int *flags, char *data)
797{
798 int ret = 0;
799 struct cgroupfs_root *root = sb->s_fs_info;
800 struct cgroup *cgrp = &root->top_cgroup;
801 struct cgroup_sb_opts opts;
802
803 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
804 mutex_lock(&cgroup_mutex);
805
806 /* See what subsystems are wanted */
807 ret = parse_cgroupfs_options(data, &opts);
808 if (ret)
809 goto out_unlock;
810
811 /* Don't allow flags to change at remount */
812 if (opts.flags != root->flags) {
813 ret = -EINVAL;
814 goto out_unlock;
815 }
816
817 ret = rebind_subsystems(root, opts.subsys_bits);
818
819 /* (re)populate subsystem files */
820 if (!ret)
821 cgroup_populate_dir(cgrp);
822
823 if (opts.release_agent)
824 strcpy(root->release_agent_path, opts.release_agent);
825 out_unlock:
826 if (opts.release_agent)
827 kfree(opts.release_agent);
828 mutex_unlock(&cgroup_mutex);
829 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
830 return ret;
831}
832
833static struct super_operations cgroup_ops = {
834 .statfs = simple_statfs,
835 .drop_inode = generic_delete_inode,
836 .show_options = cgroup_show_options,
837 .remount_fs = cgroup_remount,
838};
839
840static void init_cgroup_root(struct cgroupfs_root *root)
841{
842 struct cgroup *cgrp = &root->top_cgroup;
843 INIT_LIST_HEAD(&root->subsys_list);
844 INIT_LIST_HEAD(&root->root_list);
845 root->number_of_cgroups = 1;
846 cgrp->root = root;
847 cgrp->top_cgroup = cgrp;
848 INIT_LIST_HEAD(&cgrp->sibling);
849 INIT_LIST_HEAD(&cgrp->children);
850 INIT_LIST_HEAD(&cgrp->css_sets);
851 INIT_LIST_HEAD(&cgrp->release_list);
852}
853
854static int cgroup_test_super(struct super_block *sb, void *data)
855{
856 struct cgroupfs_root *new = data;
857 struct cgroupfs_root *root = sb->s_fs_info;
858
859 /* First check subsystems */
860 if (new->subsys_bits != root->subsys_bits)
861 return 0;
862
863 /* Next check flags */
864 if (new->flags != root->flags)
865 return 0;
866
867 return 1;
868}
869
870static int cgroup_set_super(struct super_block *sb, void *data)
871{
872 int ret;
873 struct cgroupfs_root *root = data;
874
875 ret = set_anon_super(sb, NULL);
876 if (ret)
877 return ret;
878
879 sb->s_fs_info = root;
880 root->sb = sb;
881
882 sb->s_blocksize = PAGE_CACHE_SIZE;
883 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
884 sb->s_magic = CGROUP_SUPER_MAGIC;
885 sb->s_op = &cgroup_ops;
886
887 return 0;
888}
889
890static int cgroup_get_rootdir(struct super_block *sb)
891{
892 struct inode *inode =
893 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
894 struct dentry *dentry;
895
896 if (!inode)
897 return -ENOMEM;
898
899 inode->i_op = &simple_dir_inode_operations;
900 inode->i_fop = &simple_dir_operations;
901 inode->i_op = &cgroup_dir_inode_operations;
902 /* directories start off with i_nlink == 2 (for "." entry) */
903 inc_nlink(inode);
904 dentry = d_alloc_root(inode);
905 if (!dentry) {
906 iput(inode);
907 return -ENOMEM;
908 }
909 sb->s_root = dentry;
910 return 0;
911}
912
913static int cgroup_get_sb(struct file_system_type *fs_type,
914 int flags, const char *unused_dev_name,
915 void *data, struct vfsmount *mnt)
916{
917 struct cgroup_sb_opts opts;
918 int ret = 0;
919 struct super_block *sb;
920 struct cgroupfs_root *root;
921 struct list_head tmp_cg_links, *l;
922 INIT_LIST_HEAD(&tmp_cg_links);
923
924 /* First find the desired set of subsystems */
925 ret = parse_cgroupfs_options(data, &opts);
926 if (ret) {
927 if (opts.release_agent)
928 kfree(opts.release_agent);
929 return ret;
930 }
931
932 root = kzalloc(sizeof(*root), GFP_KERNEL);
933 if (!root)
934 return -ENOMEM;
935
936 init_cgroup_root(root);
937 root->subsys_bits = opts.subsys_bits;
938 root->flags = opts.flags;
939 if (opts.release_agent) {
940 strcpy(root->release_agent_path, opts.release_agent);
941 kfree(opts.release_agent);
942 }
943
944 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
945
946 if (IS_ERR(sb)) {
947 kfree(root);
948 return PTR_ERR(sb);
949 }
950
951 if (sb->s_fs_info != root) {
952 /* Reusing an existing superblock */
953 BUG_ON(sb->s_root == NULL);
954 kfree(root);
955 root = NULL;
956 } else {
957 /* New superblock */
958 struct cgroup *cgrp = &root->top_cgroup;
959 struct inode *inode;
960
961 BUG_ON(sb->s_root != NULL);
962
963 ret = cgroup_get_rootdir(sb);
964 if (ret)
965 goto drop_new_super;
966 inode = sb->s_root->d_inode;
967
968 mutex_lock(&inode->i_mutex);
969 mutex_lock(&cgroup_mutex);
970
971 /*
972 * We're accessing css_set_count without locking
973 * css_set_lock here, but that's OK - it can only be
974 * increased by someone holding cgroup_lock, and
975 * that's us. The worst that can happen is that we
976 * have some link structures left over
977 */
978 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
979 if (ret) {
980 mutex_unlock(&cgroup_mutex);
981 mutex_unlock(&inode->i_mutex);
982 goto drop_new_super;
983 }
984
985 ret = rebind_subsystems(root, root->subsys_bits);
986 if (ret == -EBUSY) {
987 mutex_unlock(&cgroup_mutex);
988 mutex_unlock(&inode->i_mutex);
989 goto drop_new_super;
990 }
991
992 /* EBUSY should be the only error here */
993 BUG_ON(ret);
994
995 list_add(&root->root_list, &roots);
996 root_count++;
997
998 sb->s_root->d_fsdata = &root->top_cgroup;
999 root->top_cgroup.dentry = sb->s_root;
1000
1001 /* Link the top cgroup in this hierarchy into all
1002 * the css_set objects */
1003 write_lock(&css_set_lock);
1004 l = &init_css_set.list;
1005 do {
1006 struct css_set *cg;
1007 struct cg_cgroup_link *link;
1008 cg = list_entry(l, struct css_set, list);
1009 BUG_ON(list_empty(&tmp_cg_links));
1010 link = list_entry(tmp_cg_links.next,
1011 struct cg_cgroup_link,
1012 cgrp_link_list);
1013 list_del(&link->cgrp_link_list);
1014 link->cg = cg;
1015 list_add(&link->cgrp_link_list,
1016 &root->top_cgroup.css_sets);
1017 list_add(&link->cg_link_list, &cg->cg_links);
1018 l = l->next;
1019 } while (l != &init_css_set.list);
1020 write_unlock(&css_set_lock);
1021
1022 free_cg_links(&tmp_cg_links);
1023
1024 BUG_ON(!list_empty(&cgrp->sibling));
1025 BUG_ON(!list_empty(&cgrp->children));
1026 BUG_ON(root->number_of_cgroups != 1);
1027
1028 cgroup_populate_dir(cgrp);
1029 mutex_unlock(&inode->i_mutex);
1030 mutex_unlock(&cgroup_mutex);
1031 }
1032
1033 return simple_set_mnt(mnt, sb);
1034
1035 drop_new_super:
1036 up_write(&sb->s_umount);
1037 deactivate_super(sb);
1038 free_cg_links(&tmp_cg_links);
1039 return ret;
1040}
1041
1042static void cgroup_kill_sb(struct super_block *sb) {
1043 struct cgroupfs_root *root = sb->s_fs_info;
1044 struct cgroup *cgrp = &root->top_cgroup;
1045 int ret;
1046
1047 BUG_ON(!root);
1048
1049 BUG_ON(root->number_of_cgroups != 1);
1050 BUG_ON(!list_empty(&cgrp->children));
1051 BUG_ON(!list_empty(&cgrp->sibling));
1052
1053 mutex_lock(&cgroup_mutex);
1054
1055 /* Rebind all subsystems back to the default hierarchy */
1056 ret = rebind_subsystems(root, 0);
1057 /* Shouldn't be able to fail ... */
1058 BUG_ON(ret);
1059
1060 /*
1061 * Release all the links from css_sets to this hierarchy's
1062 * root cgroup
1063 */
1064 write_lock(&css_set_lock);
1065 while (!list_empty(&cgrp->css_sets)) {
1066 struct cg_cgroup_link *link;
1067 link = list_entry(cgrp->css_sets.next,
1068 struct cg_cgroup_link, cgrp_link_list);
1069 list_del(&link->cg_link_list);
1070 list_del(&link->cgrp_link_list);
1071 kfree(link);
1072 }
1073 write_unlock(&css_set_lock);
1074
1075 if (!list_empty(&root->root_list)) {
1076 list_del(&root->root_list);
1077 root_count--;
1078 }
1079 mutex_unlock(&cgroup_mutex);
1080
1081 kfree(root);
1082 kill_litter_super(sb);
1083}
1084
1085static struct file_system_type cgroup_fs_type = {
1086 .name = "cgroup",
1087 .get_sb = cgroup_get_sb,
1088 .kill_sb = cgroup_kill_sb,
1089};
1090
1091static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1092{
1093 return dentry->d_fsdata;
1094}
1095
1096static inline struct cftype *__d_cft(struct dentry *dentry)
1097{
1098 return dentry->d_fsdata;
1099}
1100
1101/*
1102 * Called with cgroup_mutex held. Writes path of cgroup into buf.
1103 * Returns 0 on success, -errno on error.
1104 */
1105int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1106{
1107 char *start;
1108
1109 if (cgrp == dummytop) {
1110 /*
1111 * Inactive subsystems have no dentry for their root
1112 * cgroup
1113 */
1114 strcpy(buf, "/");
1115 return 0;
1116 }
1117
1118 start = buf + buflen;
1119
1120 *--start = '\0';
1121 for (;;) {
1122 int len = cgrp->dentry->d_name.len;
1123 if ((start -= len) < buf)
1124 return -ENAMETOOLONG;
1125 memcpy(start, cgrp->dentry->d_name.name, len);
1126 cgrp = cgrp->parent;
1127 if (!cgrp)
1128 break;
1129 if (!cgrp->parent)
1130 continue;
1131 if (--start < buf)
1132 return -ENAMETOOLONG;
1133 *start = '/';
1134 }
1135 memmove(buf, start, buf + buflen - start);
1136 return 0;
1137}
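/*
 * Editorial sketch (not part of the patch): one way a caller holding
 * cgroup_mutex might use cgroup_path() above. The helper name
 * print_cgrp_path() is hypothetical; the PAGE_SIZE buffer mirrors what
 * proc_cgroup_show() and cgroup_release_agent() below do.
 */
static void print_cgrp_path(struct cgroup *cgrp)
{
	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

	if (!buf)
		return;
	/* fills buf with the path relative to the hierarchy root, e.g. "/a/b" */
	if (cgroup_path(cgrp, buf, PAGE_SIZE) == 0)
		printk(KERN_DEBUG "cgroup path: %s\n", buf);
	kfree(buf);
}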
1138
1139/*
1140 * Return the first subsystem attached to a cgroup's hierarchy, and
1141 * its subsystem id.
1142 */
1143
1144static void get_first_subsys(const struct cgroup *cgrp,
1145 struct cgroup_subsys_state **css, int *subsys_id)
1146{
1147 const struct cgroupfs_root *root = cgrp->root;
1148 const struct cgroup_subsys *test_ss;
1149 BUG_ON(list_empty(&root->subsys_list));
1150 test_ss = list_entry(root->subsys_list.next,
1151 struct cgroup_subsys, sibling);
1152 if (css) {
1153 *css = cgrp->subsys[test_ss->subsys_id];
1154 BUG_ON(!*css);
1155 }
1156 if (subsys_id)
1157 *subsys_id = test_ss->subsys_id;
1158}
1159
1160/*
1161 * Attach task 'tsk' to cgroup 'cgrp'
1162 *
1163 * Call holding cgroup_mutex. May take task_lock of
1164 * the task 'tsk' during call.
1165 */
1166static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1167{
1168 int retval = 0;
1169 struct cgroup_subsys *ss;
1170 struct cgroup *oldcgrp;
1171 struct css_set *cg = tsk->cgroups;
1172 struct css_set *newcg;
1173 struct cgroupfs_root *root = cgrp->root;
1174 int subsys_id;
1175
1176 get_first_subsys(cgrp, NULL, &subsys_id);
1177
1178 /* Nothing to do if the task is already in that cgroup */
1179 oldcgrp = task_cgroup(tsk, subsys_id);
1180 if (cgrp == oldcgrp)
1181 return 0;
1182
1183 for_each_subsys(root, ss) {
1184 if (ss->can_attach) {
1185 retval = ss->can_attach(ss, cgrp, tsk);
1186 if (retval) {
1187 return retval;
1188 }
1189 }
1190 }
1191
1192 /*
1193 * Locate or allocate a new css_set for this task,
1194 * based on its final set of cgroups
1195 */
1196 newcg = find_css_set(cg, cgrp);
1197 if (!newcg) {
1198 return -ENOMEM;
1199 }
1200
1201 task_lock(tsk);
1202 if (tsk->flags & PF_EXITING) {
1203 task_unlock(tsk);
1204 put_css_set(newcg);
1205 return -ESRCH;
1206 }
1207 rcu_assign_pointer(tsk->cgroups, newcg);
1208 task_unlock(tsk);
1209
1210 /* Update the css_set linked lists if we're using them */
1211 write_lock(&css_set_lock);
1212 if (!list_empty(&tsk->cg_list)) {
1213 list_del(&tsk->cg_list);
1214 list_add(&tsk->cg_list, &newcg->tasks);
1215 }
1216 write_unlock(&css_set_lock);
1217
1218 for_each_subsys(root, ss) {
1219 if (ss->attach) {
1220 ss->attach(ss, cgrp, oldcgrp, tsk);
1221 }
1222 }
1223 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1224 synchronize_rcu();
1225 put_css_set(cg);
1226 return 0;
1227}
1228
1229/*
1230 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with
1231 * cgroup_mutex held; may take task_lock of the task.
1232 */
1233static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
1234{
1235 pid_t pid;
1236 struct task_struct *tsk;
1237 int ret;
1238
1239 if (sscanf(pidbuf, "%d", &pid) != 1)
1240 return -EIO;
1241
1242 if (pid) {
1243 rcu_read_lock();
1244 tsk = find_task_by_pid(pid);
1245 if (!tsk || tsk->flags & PF_EXITING) {
1246 rcu_read_unlock();
1247 return -ESRCH;
1248 }
1249 get_task_struct(tsk);
1250 rcu_read_unlock();
1251
1252 if ((current->euid) && (current->euid != tsk->uid)
1253 && (current->euid != tsk->suid)) {
1254 put_task_struct(tsk);
1255 return -EACCES;
1256 }
1257 } else {
1258 tsk = current;
1259 get_task_struct(tsk);
1260 }
1261
1262 ret = attach_task(cgrp, tsk);
1263 put_task_struct(tsk);
1264 return ret;
1265}
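/*
 * Editorial sketch (not part of the patch): attach_task_by_pid() is the
 * kernel side of a write to a cgroup's "tasks" file. From userspace the
 * same operation looks roughly like this; the mount point /dev/cgroup
 * and the group name "mygroup" are made-up examples.
 */
#if 0	/* userspace illustration only, not kernel code */
#include <stdio.h>
#include <sys/types.h>

static int move_pid_to_cgroup(pid_t pid)
{
	FILE *f = fopen("/dev/cgroup/mygroup/tasks", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", pid);	/* parsed above with sscanf(pidbuf, "%d", ...) */
	return fclose(f);		/* 0 on success */
}
#endif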
1266
1267/* The various types of files and directories in a cgroup file system */
1268
1269enum cgroup_filetype {
1270 FILE_ROOT,
1271 FILE_DIR,
1272 FILE_TASKLIST,
1273 FILE_NOTIFY_ON_RELEASE,
1274 FILE_RELEASABLE,
1275 FILE_RELEASE_AGENT,
1276};
1277
1278static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
1279 struct file *file,
1280 const char __user *userbuf,
1281 size_t nbytes, loff_t *unused_ppos)
1282{
1283 char buffer[64];
1284 int retval = 0;
1285 u64 val;
1286 char *end;
1287
1288 if (!nbytes)
1289 return -EINVAL;
1290 if (nbytes >= sizeof(buffer))
1291 return -E2BIG;
1292 if (copy_from_user(buffer, userbuf, nbytes))
1293 return -EFAULT;
1294
1295 buffer[nbytes] = 0; /* nul-terminate */
1296
1297 /* strip newline if necessary */
1298 if (nbytes && (buffer[nbytes-1] == '\n'))
1299 buffer[nbytes-1] = 0;
1300 val = simple_strtoull(buffer, &end, 0);
1301 if (*end)
1302 return -EINVAL;
1303
1304 /* Pass to subsystem */
1305 retval = cft->write_uint(cgrp, cft, val);
1306 if (!retval)
1307 retval = nbytes;
1308 return retval;
1309}
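/*
 * Editorial sketch (not part of the patch): cgroup_write_uint() strips a
 * trailing newline, parses the buffer with simple_strtoull() and hands the
 * value to the subsystem's ->write_uint() handler; cgroup_read_uint()
 * below is the matching read path. A control file built on these helpers
 * could look like this -- the file name "limit" (which cgroup_add_file()
 * will prefix with the subsystem name) and the global example_limit are
 * purely illustrative.
 */
static u64 example_limit;

static int example_limit_write(struct cgroup *cgrp, struct cftype *cft,
			       u64 val)
{
	example_limit = val;	/* val has already been parsed and validated */
	return 0;		/* 0 makes cgroup_write_uint() report success */
}

static u64 example_limit_read(struct cgroup *cgrp, struct cftype *cft)
{
	return example_limit;
}

static struct cftype example_limit_cft = {
	.name = "limit",
	.read_uint = example_limit_read,
	.write_uint = example_limit_write,
};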
1310
1311static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
1312 struct cftype *cft,
1313 struct file *file,
1314 const char __user *userbuf,
1315 size_t nbytes, loff_t *unused_ppos)
1316{
1317 enum cgroup_filetype type = cft->private;
1318 char *buffer;
1319 int retval = 0;
1320
1321 if (nbytes >= PATH_MAX)
1322 return -E2BIG;
1323
1324 /* +1 for nul-terminator */
1325 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1326 if (buffer == NULL)
1327 return -ENOMEM;
1328
1329 if (copy_from_user(buffer, userbuf, nbytes)) {
1330 retval = -EFAULT;
1331 goto out1;
1332 }
1333 buffer[nbytes] = 0; /* nul-terminate */
1334
1335 mutex_lock(&cgroup_mutex);
1336
1337 if (cgroup_is_removed(cgrp)) {
1338 retval = -ENODEV;
1339 goto out2;
1340 }
1341
1342 switch (type) {
1343 case FILE_TASKLIST:
1344 retval = attach_task_by_pid(cgrp, buffer);
1345 break;
1346 case FILE_NOTIFY_ON_RELEASE:
1347 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
1348 if (simple_strtoul(buffer, NULL, 10) != 0)
1349 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1350 else
1351 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1352 break;
1353 case FILE_RELEASE_AGENT:
1354 {
1355 struct cgroupfs_root *root = cgrp->root;
1356 /* Strip trailing newline */
1357 if (nbytes && (buffer[nbytes-1] == '\n')) {
1358 buffer[nbytes-1] = 0;
1359 }
1360 if (nbytes < sizeof(root->release_agent_path)) {
1361 /* We never write anything other than '\0'
1362 * into the last char of release_agent_path,
1363 * so it always remains a NUL-terminated
1364 * string */
1365 strncpy(root->release_agent_path, buffer, nbytes);
1366 root->release_agent_path[nbytes] = 0;
1367 } else {
1368 retval = -ENOSPC;
1369 }
1370 break;
1371 }
1372 default:
1373 retval = -EINVAL;
1374 goto out2;
1375 }
1376
1377 if (retval == 0)
1378 retval = nbytes;
1379out2:
1380 mutex_unlock(&cgroup_mutex);
1381out1:
1382 kfree(buffer);
1383 return retval;
1384}
1385
1386static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1387 size_t nbytes, loff_t *ppos)
1388{
1389 struct cftype *cft = __d_cft(file->f_dentry);
1390 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1391
1392 if (!cft)
1393 return -ENODEV;
1394 if (cft->write)
1395 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1396 if (cft->write_uint)
1397 return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos);
1398 return -EINVAL;
1399}
1400
1401static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft,
1402 struct file *file,
1403 char __user *buf, size_t nbytes,
1404 loff_t *ppos)
1405{
1406 char tmp[64];
1407 u64 val = cft->read_uint(cgrp, cft);
1408 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1409
1410 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1411}
1412
1413static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1414 struct cftype *cft,
1415 struct file *file,
1416 char __user *buf,
1417 size_t nbytes, loff_t *ppos)
1418{
1419 enum cgroup_filetype type = cft->private;
1420 char *page;
1421 ssize_t retval = 0;
1422 char *s;
1423
1424 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
1425 return -ENOMEM;
1426
1427 s = page;
1428
1429 switch (type) {
1430 case FILE_RELEASE_AGENT:
1431 {
1432 struct cgroupfs_root *root;
1433 size_t n;
1434 mutex_lock(&cgroup_mutex);
1435 root = cgrp->root;
1436 n = strnlen(root->release_agent_path,
1437 sizeof(root->release_agent_path));
1438 n = min(n, (size_t) PAGE_SIZE);
1439 strncpy(s, root->release_agent_path, n);
1440 mutex_unlock(&cgroup_mutex);
1441 s += n;
1442 break;
1443 }
1444 default:
1445 retval = -EINVAL;
1446 goto out;
1447 }
1448 *s++ = '\n';
1449
1450 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1451out:
1452 free_page((unsigned long)page);
1453 return retval;
1454}
1455
1456static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1457 size_t nbytes, loff_t *ppos)
1458{
1459 struct cftype *cft = __d_cft(file->f_dentry);
1460 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1461
1462 if (!cft)
1463 return -ENODEV;
1464
1465 if (cft->read)
1466 return cft->read(cgrp, cft, file, buf, nbytes, ppos);
1467 if (cft->read_uint)
1468 return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos);
1469 return -EINVAL;
1470}
1471
1472static int cgroup_file_open(struct inode *inode, struct file *file)
1473{
1474 int err;
1475 struct cftype *cft;
1476
1477 err = generic_file_open(inode, file);
1478 if (err)
1479 return err;
1480
1481 cft = __d_cft(file->f_dentry);
1482 if (!cft)
1483 return -ENODEV;
1484 if (cft->open)
1485 err = cft->open(inode, file);
1486 else
1487 err = 0;
1488
1489 return err;
1490}
1491
1492static int cgroup_file_release(struct inode *inode, struct file *file)
1493{
1494 struct cftype *cft = __d_cft(file->f_dentry);
1495 if (cft->release)
1496 return cft->release(inode, file);
1497 return 0;
1498}
1499
1500/*
1501 * cgroup_rename - Only allow simple rename of directories in place.
1502 */
1503static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
1504 struct inode *new_dir, struct dentry *new_dentry)
1505{
1506 if (!S_ISDIR(old_dentry->d_inode->i_mode))
1507 return -ENOTDIR;
1508 if (new_dentry->d_inode)
1509 return -EEXIST;
1510 if (old_dir != new_dir)
1511 return -EIO;
1512 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1513}
1514
1515static struct file_operations cgroup_file_operations = {
1516 .read = cgroup_file_read,
1517 .write = cgroup_file_write,
1518 .llseek = generic_file_llseek,
1519 .open = cgroup_file_open,
1520 .release = cgroup_file_release,
1521};
1522
1523static struct inode_operations cgroup_dir_inode_operations = {
1524 .lookup = simple_lookup,
1525 .mkdir = cgroup_mkdir,
1526 .rmdir = cgroup_rmdir,
1527 .rename = cgroup_rename,
1528};
1529
1530static int cgroup_create_file(struct dentry *dentry, int mode,
1531 struct super_block *sb)
1532{
1533 static struct dentry_operations cgroup_dops = {
1534 .d_iput = cgroup_diput,
1535 };
1536
1537 struct inode *inode;
1538
1539 if (!dentry)
1540 return -ENOENT;
1541 if (dentry->d_inode)
1542 return -EEXIST;
1543
1544 inode = cgroup_new_inode(mode, sb);
1545 if (!inode)
1546 return -ENOMEM;
1547
1548 if (S_ISDIR(mode)) {
1549 inode->i_op = &cgroup_dir_inode_operations;
1550 inode->i_fop = &simple_dir_operations;
1551
1552 /* start off with i_nlink == 2 (for "." entry) */
1553 inc_nlink(inode);
1554
1555 /* start with the directory inode held, so that we can
1556 * populate it without racing with another mkdir */
1557 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1558 } else if (S_ISREG(mode)) {
1559 inode->i_size = 0;
1560 inode->i_fop = &cgroup_file_operations;
1561 }
1562 dentry->d_op = &cgroup_dops;
1563 d_instantiate(dentry, inode);
1564 dget(dentry); /* Extra count - pin the dentry in core */
1565 return 0;
1566}
1567
1568/*
1569 * cgroup_create_dir - create a directory for an object.
1570 * cgrp: the cgroup we create the directory for.
1571 * It must have a valid ->parent field, and we are going
1572 * to fill its ->dentry field.
1573 * dentry: dentry of the new cgroup
1574 * mode: mode to set on new directory.
1575 */
1576static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1577 int mode)
1578{
1579 struct dentry *parent;
1580 int error = 0;
1581
1582 parent = cgrp->parent->dentry;
1583 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
1584 if (!error) {
1585 dentry->d_fsdata = cgrp;
1586 inc_nlink(parent->d_inode);
1587 cgrp->dentry = dentry;
1588 dget(dentry);
1589 }
1590 dput(dentry);
1591
1592 return error;
1593}
1594
1595int cgroup_add_file(struct cgroup *cgrp,
1596 struct cgroup_subsys *subsys,
1597 const struct cftype *cft)
1598{
1599 struct dentry *dir = cgrp->dentry;
1600 struct dentry *dentry;
1601 int error;
1602
1603 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
1604 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
1605 strcpy(name, subsys->name);
1606 strcat(name, ".");
1607 }
1608 strcat(name, cft->name);
1609 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
1610 dentry = lookup_one_len(name, dir, strlen(name));
1611 if (!IS_ERR(dentry)) {
1612 error = cgroup_create_file(dentry, 0644 | S_IFREG,
1613 cgrp->root->sb);
1614 if (!error)
1615 dentry->d_fsdata = (void *)cft;
1616 dput(dentry);
1617 } else
1618 error = PTR_ERR(dentry);
1619 return error;
1620}
1621
1622int cgroup_add_files(struct cgroup *cgrp,
1623 struct cgroup_subsys *subsys,
1624 const struct cftype cft[],
1625 int count)
1626{
1627 int i, err;
1628 for (i = 0; i < count; i++) {
1629 err = cgroup_add_file(cgrp, subsys, &cft[i]);
1630 if (err)
1631 return err;
1632 }
1633 return 0;
1634}
1635
1636/* Count the number of tasks in a cgroup. */
1637
1638int cgroup_task_count(const struct cgroup *cgrp)
1639{
1640 int count = 0;
1641 struct list_head *l;
1642
1643 read_lock(&css_set_lock);
1644 l = cgrp->css_sets.next;
1645 while (l != &cgrp->css_sets) {
1646 struct cg_cgroup_link *link =
1647 list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1648 count += atomic_read(&link->cg->ref.refcount);
1649 l = l->next;
1650 }
1651 read_unlock(&css_set_lock);
1652 return count;
1653}
1654
1655/*
1656 * Advance a list_head iterator. The iterator should be positioned at
1657 * the start of a css_set
1658 */
1659static void cgroup_advance_iter(struct cgroup *cgrp,
1660 struct cgroup_iter *it)
1661{
1662 struct list_head *l = it->cg_link;
1663 struct cg_cgroup_link *link;
1664 struct css_set *cg;
1665
1666 /* Advance to the next non-empty css_set */
1667 do {
1668 l = l->next;
1669 if (l == &cgrp->css_sets) {
1670 it->cg_link = NULL;
1671 return;
1672 }
1673 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1674 cg = link->cg;
1675 } while (list_empty(&cg->tasks));
1676 it->cg_link = l;
1677 it->task = cg->tasks.next;
1678}
1679
1680void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
1681{
1682 /*
1683 * The first time anyone tries to iterate across a cgroup,
1684 * we need to enable the list linking each css_set to its
1685 * tasks, and fix up all existing tasks.
1686 */
1687 if (!use_task_css_set_links) {
1688 struct task_struct *p, *g;
1689 write_lock(&css_set_lock);
1690 use_task_css_set_links = 1;
1691 do_each_thread(g, p) {
1692 task_lock(p);
1693 if (list_empty(&p->cg_list))
1694 list_add(&p->cg_list, &p->cgroups->tasks);
1695 task_unlock(p);
1696 } while_each_thread(g, p);
1697 write_unlock(&css_set_lock);
1698 }
1699 read_lock(&css_set_lock);
1700 it->cg_link = &cgrp->css_sets;
1701 cgroup_advance_iter(cgrp, it);
1702}
1703
1704struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1705 struct cgroup_iter *it)
1706{
1707 struct task_struct *res;
1708 struct list_head *l = it->task;
1709
1710 /* If the iterator cg is NULL, we have no tasks */
1711 if (!it->cg_link)
1712 return NULL;
1713 res = list_entry(l, struct task_struct, cg_list);
1714 /* Advance iterator to find next entry */
1715 l = l->next;
1716 if (l == &res->cgroups->tasks) {
1717 /* We reached the end of this task list - move on to
1718 * the next cg_cgroup_link */
1719 cgroup_advance_iter(cgrp, it);
1720 } else {
1721 it->task = l;
1722 }
1723 return res;
1724}
1725
1726void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
1727{
1728 read_unlock(&css_set_lock);
1729}
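/*
 * Editorial sketch (not part of the patch): the canonical way to walk the
 * tasks in a cgroup with the iterator above -- start, call next until it
 * returns NULL, then end (which drops css_set_lock). The helper name
 * count_cgrp_tasks() is hypothetical; pid_array_load() below uses exactly
 * the same pattern.
 */
static int count_cgrp_tasks(struct cgroup *cgrp)
{
	struct cgroup_iter it;
	struct task_struct *tsk;
	int n = 0;

	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it)))
		n++;
	cgroup_iter_end(cgrp, &it);
	return n;
}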
1730
1731/*
1732 * Stuff for reading the 'tasks' file.
1733 *
1734 * Reading this file can return large amounts of data if a cgroup has
1735 * *lots* of attached tasks. So it may need several calls to read(),
1736 * but we cannot guarantee that the information we produce is correct
1737 * unless we produce it entirely atomically.
1738 *
1739 * Upon tasks file open(), a struct ctr_struct is allocated, that
1740 * will have a pointer to an array (also allocated here). The struct
1741 * ctr_struct * is stored in file->private_data. Its resources will
1742 * be freed by release() when the file is closed. The array is used
1743 * to sprintf the PIDs and then used by read().
1744 */
1745struct ctr_struct {
1746 char *buf;
1747 int bufsz;
1748};
1749
1750/*
1751 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
1752 * 'cgrp'. Return actual number of pids loaded. No need to
1753 * task_lock(p) when reading out p->cgroup, since we're in an RCU
1754 * read section, so the css_set can't go away, and is
1755 * immutable after creation.
1756 */
1757static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
1758{
1759 int n = 0;
1760 struct cgroup_iter it;
1761 struct task_struct *tsk;
1762 cgroup_iter_start(cgrp, &it);
1763 while ((tsk = cgroup_iter_next(cgrp, &it))) {
1764 if (unlikely(n == npids))
1765 break;
1766 pidarray[n++] = task_pid_nr(tsk);
1767 }
1768 cgroup_iter_end(cgrp, &it);
1769 return n;
1770}
1771
1772/**
1773 * cgroupstats_build - build and fill cgroupstats so that taskstats can
1774 * export it to user space.
1775 *
1776 * @stats: cgroupstats to fill information into
1777 * @dentry: a dentry belonging to the cgroup for which stats have
1778 * been requested.
1779 */
1780int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
1781{
1782 int ret = -EINVAL;
1783 struct cgroup *cgrp;
1784 struct cgroup_iter it;
1785 struct task_struct *tsk;
1786 /*
1787 * Validate dentry by checking the superblock operations
1788 */
1789 if (dentry->d_sb->s_op != &cgroup_ops)
1790 goto err;
1791
1792 ret = 0;
1793 cgrp = dentry->d_fsdata;
1794 rcu_read_lock();
1795
1796 cgroup_iter_start(cgrp, &it);
1797 while ((tsk = cgroup_iter_next(cgrp, &it))) {
1798 switch (tsk->state) {
1799 case TASK_RUNNING:
1800 stats->nr_running++;
1801 break;
1802 case TASK_INTERRUPTIBLE:
1803 stats->nr_sleeping++;
1804 break;
1805 case TASK_UNINTERRUPTIBLE:
1806 stats->nr_uninterruptible++;
1807 break;
1808 case TASK_STOPPED:
1809 stats->nr_stopped++;
1810 break;
1811 default:
1812 if (delayacct_is_task_waiting_on_io(tsk))
1813 stats->nr_io_wait++;
1814 break;
1815 }
1816 }
1817 cgroup_iter_end(cgrp, &it);
1818
1819 rcu_read_unlock();
1820err:
1821 return ret;
1822}
1823
1824static int cmppid(const void *a, const void *b)
1825{
1826 return *(pid_t *)a - *(pid_t *)b;
1827}
1828
1829/*
1830 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
1831 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
1832 * count 'cnt' of how many chars would be written if buf were large enough.
1833 */
1834static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1835{
1836 int cnt = 0;
1837 int i;
1838
1839 for (i = 0; i < npids; i++)
1840 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
1841 return cnt;
1842}
1843
1844/*
1845 * Handle an open on 'tasks' file. Prepare a buffer listing the
1846 * process id's of tasks currently attached to the cgroup being opened.
1847 *
1848 * Does not require any specific cgroup mutexes, and does not take any.
1849 */
1850static int cgroup_tasks_open(struct inode *unused, struct file *file)
1851{
1852 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1853 struct ctr_struct *ctr;
1854 pid_t *pidarray;
1855 int npids;
1856 char c;
1857
1858 if (!(file->f_mode & FMODE_READ))
1859 return 0;
1860
1861 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
1862 if (!ctr)
1863 goto err0;
1864
1865 /*
1866 * If cgroup gets more users after we read count, we won't have
1867 * enough space - tough. This race is indistinguishable to the
1868 * caller from the case that the additional cgroup users didn't
1869 * show up until sometime later on.
1870 */
1871 npids = cgroup_task_count(cgrp);
1872 if (npids) {
1873 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
1874 if (!pidarray)
1875 goto err1;
1876
1877 npids = pid_array_load(pidarray, npids, cgrp);
1878 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
1879
1880 /* Call pid_array_to_buf() twice, first just to get bufsz */
1881 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
1882 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
1883 if (!ctr->buf)
1884 goto err2;
1885 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
1886
1887 kfree(pidarray);
1888 } else {
1889 ctr->buf = NULL;
1890 ctr->bufsz = 0;
1891 }
1892 file->private_data = ctr;
1893 return 0;
1894
1895err2:
1896 kfree(pidarray);
1897err1:
1898 kfree(ctr);
1899err0:
1900 return -ENOMEM;
1901}
1902
1903static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
1904 struct cftype *cft,
1905 struct file *file, char __user *buf,
1906 size_t nbytes, loff_t *ppos)
1907{
1908 struct ctr_struct *ctr = file->private_data;
1909
1910 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1911}
1912
1913static int cgroup_tasks_release(struct inode *unused_inode,
1914 struct file *file)
1915{
1916 struct ctr_struct *ctr;
1917
1918 if (file->f_mode & FMODE_READ) {
1919 ctr = file->private_data;
1920 kfree(ctr->buf);
1921 kfree(ctr);
1922 }
1923 return 0;
1924}
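/*
 * Editorial sketch (not part of the patch): the open/read/release trio
 * above means a reader of the "tasks" file sees one decimal pid per line,
 * sorted, snapshotted at open() time. A userspace consumer might look
 * like this; the mount point /dev/cgroup and group name "mygroup" are
 * made-up examples.
 */
#if 0	/* userspace illustration only, not kernel code */
#include <stdio.h>

static void list_cgroup_tasks(void)
{
	FILE *f = fopen("/dev/cgroup/mygroup/tasks", "r");
	int pid;

	if (!f)
		return;
	while (fscanf(f, "%d", &pid) == 1)
		printf("member pid: %d\n", pid);
	fclose(f);
}
#endif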
1925
1926static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
1927 struct cftype *cft)
1928{
1929 return notify_on_release(cgrp);
1930}
1931
1932static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
1933{
1934 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
1935}
1936
1937/*
1938 * for the common functions, 'private' gives the type of file
1939 */
1940static struct cftype files[] = {
1941 {
1942 .name = "tasks",
1943 .open = cgroup_tasks_open,
1944 .read = cgroup_tasks_read,
1945 .write = cgroup_common_file_write,
1946 .release = cgroup_tasks_release,
1947 .private = FILE_TASKLIST,
1948 },
1949
1950 {
1951 .name = "notify_on_release",
1952 .read_uint = cgroup_read_notify_on_release,
1953 .write = cgroup_common_file_write,
1954 .private = FILE_NOTIFY_ON_RELEASE,
1955 },
1956
1957 {
1958 .name = "releasable",
1959 .read_uint = cgroup_read_releasable,
1960 .private = FILE_RELEASABLE,
1961 }
1962};
1963
1964static struct cftype cft_release_agent = {
1965 .name = "release_agent",
1966 .read = cgroup_common_file_read,
1967 .write = cgroup_common_file_write,
1968 .private = FILE_RELEASE_AGENT,
1969};
1970
1971static int cgroup_populate_dir(struct cgroup *cgrp)
1972{
1973 int err;
1974 struct cgroup_subsys *ss;
1975
1976 /* First clear out any existing files */
1977 cgroup_clear_directory(cgrp->dentry);
1978
1979 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
1980 if (err < 0)
1981 return err;
1982
1983 if (cgrp == cgrp->top_cgroup) {
1984 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
1985 return err;
1986 }
1987
1988 for_each_subsys(cgrp->root, ss) {
1989 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
1990 return err;
1991 }
1992
1993 return 0;
1994}
1995
1996static void init_cgroup_css(struct cgroup_subsys_state *css,
1997 struct cgroup_subsys *ss,
1998 struct cgroup *cgrp)
1999{
2000 css->cgroup = cgrp;
2001 atomic_set(&css->refcnt, 0);
2002 css->flags = 0;
2003 if (cgrp == dummytop)
2004 set_bit(CSS_ROOT, &css->flags);
2005 BUG_ON(cgrp->subsys[ss->subsys_id]);
2006 cgrp->subsys[ss->subsys_id] = css;
2007}
2008
2009/*
2010 * cgroup_create - create a cgroup
2011 * parent: cgroup that will be parent of the new cgroup.
2012 * dentry: dentry of the new cgroup's directory.
2013 * mode: mode to set on new inode
2014 *
2015 * Must be called with the mutex on the parent inode held
2016 */
2017
2018static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2019 int mode)
2020{
2021 struct cgroup *cgrp;
2022 struct cgroupfs_root *root = parent->root;
2023 int err = 0;
2024 struct cgroup_subsys *ss;
2025 struct super_block *sb = root->sb;
2026
2027 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
2028 if (!cgrp)
2029 return -ENOMEM;
2030
2031 /* Grab a reference on the superblock so the hierarchy doesn't
2032 * get deleted on unmount if there are child cgroups. This
2033 * can be done outside cgroup_mutex, since the sb can't
2034 * disappear while someone has an open control file on the
2035 * fs */
2036 atomic_inc(&sb->s_active);
2037
2038 mutex_lock(&cgroup_mutex);
2039
2040 cgrp->flags = 0;
2041 INIT_LIST_HEAD(&cgrp->sibling);
2042 INIT_LIST_HEAD(&cgrp->children);
2043 INIT_LIST_HEAD(&cgrp->css_sets);
2044 INIT_LIST_HEAD(&cgrp->release_list);
2045
2046 cgrp->parent = parent;
2047 cgrp->root = parent->root;
2048 cgrp->top_cgroup = parent->top_cgroup;
2049
2050 for_each_subsys(root, ss) {
2051 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2052 if (IS_ERR(css)) {
2053 err = PTR_ERR(css);
2054 goto err_destroy;
2055 }
2056 init_cgroup_css(css, ss, cgrp);
2057 }
2058
2059 list_add(&cgrp->sibling, &cgrp->parent->children);
2060 root->number_of_cgroups++;
2061
2062 err = cgroup_create_dir(cgrp, dentry, mode);
2063 if (err < 0)
2064 goto err_remove;
2065
2066 /* The cgroup directory was pre-locked for us */
2067 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
2068
2069 err = cgroup_populate_dir(cgrp);
2070 /* If err < 0, we have a half-filled directory - oh well ;) */
2071
2072 mutex_unlock(&cgroup_mutex);
2073 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
2074
2075 return 0;
2076
2077 err_remove:
2078
2079 list_del(&cgrp->sibling);
2080 root->number_of_cgroups--;
2081
2082 err_destroy:
2083
2084 for_each_subsys(root, ss) {
2085 if (cgrp->subsys[ss->subsys_id])
2086 ss->destroy(ss, cgrp);
2087 }
2088
2089 mutex_unlock(&cgroup_mutex);
2090
2091 /* Release the reference count that we took on the superblock */
2092 deactivate_super(sb);
2093
2094 kfree(cgrp);
2095 return err;
2096}
2097
2098static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2099{
2100 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
2101
2102 /* the vfs holds inode->i_mutex already */
2103 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
2104}
2105
2106static inline int cgroup_has_css_refs(struct cgroup *cgrp)
2107{
2108 /* Check the reference count on each subsystem. Since we
2109 * already established that there are no tasks in the
2110 * cgroup, if the css refcount is also 0, then there should
2111 * be no outstanding references, so the subsystem is safe to
2112 * destroy. We scan across all subsystems rather than using
2113 * the per-hierarchy linked list of mounted subsystems since
2114 * we can be called via check_for_release() with no
2115 * synchronization other than RCU, and the subsystem linked
2116 * list isn't RCU-safe */
2117 int i;
2118 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2119 struct cgroup_subsys *ss = subsys[i];
2120 struct cgroup_subsys_state *css;
2121 /* Skip subsystems not in this hierarchy */
2122 if (ss->root != cgrp->root)
2123 continue;
2124 css = cgrp->subsys[ss->subsys_id];
2125 /* When called from check_for_release() it's possible
2126 * that by this point the cgroup has been removed
2127 * and the css deleted. But a false-positive doesn't
2128 * matter, since it can only happen if the cgroup
2129 * has been deleted and hence no longer needs the
2130 * release agent to be called anyway. */
2131 if (css && atomic_read(&css->refcnt)) {
2132 return 1;
2133 }
2134 }
2135 return 0;
2136}
2137
2138static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2139{
2140 struct cgroup *cgrp = dentry->d_fsdata;
2141 struct dentry *d;
2142 struct cgroup *parent;
2143 struct cgroup_subsys *ss;
2144 struct super_block *sb;
2145 struct cgroupfs_root *root;
2146
2147 /* the vfs holds both inode->i_mutex already */
2148
2149 mutex_lock(&cgroup_mutex);
2150 if (atomic_read(&cgrp->count) != 0) {
2151 mutex_unlock(&cgroup_mutex);
2152 return -EBUSY;
2153 }
2154 if (!list_empty(&cgrp->children)) {
2155 mutex_unlock(&cgroup_mutex);
2156 return -EBUSY;
2157 }
2158
2159 parent = cgrp->parent;
2160 root = cgrp->root;
2161 sb = root->sb;
2162
2163 if (cgroup_has_css_refs(cgrp)) {
2164 mutex_unlock(&cgroup_mutex);
2165 return -EBUSY;
2166 }
2167
2168 for_each_subsys(root, ss) {
2169 if (cgrp->subsys[ss->subsys_id])
2170 ss->destroy(ss, cgrp);
2171 }
2172
2173 spin_lock(&release_list_lock);
2174 set_bit(CGRP_REMOVED, &cgrp->flags);
2175 if (!list_empty(&cgrp->release_list))
2176 list_del(&cgrp->release_list);
2177 spin_unlock(&release_list_lock);
2178 /* delete my sibling from parent->children */
2179 list_del(&cgrp->sibling);
2180 spin_lock(&cgrp->dentry->d_lock);
2181 d = dget(cgrp->dentry);
2182 cgrp->dentry = NULL;
2183 spin_unlock(&d->d_lock);
2184
2185 cgroup_d_remove_dir(d);
2186 dput(d);
2187 root->number_of_cgroups--;
2188
2189 set_bit(CGRP_RELEASABLE, &parent->flags);
2190 check_for_release(parent);
2191
2192 mutex_unlock(&cgroup_mutex);
2193 /* Drop the active superblock reference that we took when we
2194 * created the cgroup */
2195 deactivate_super(sb);
2196 return 0;
2197}
2198
2199static void cgroup_init_subsys(struct cgroup_subsys *ss)
2200{
2201 struct cgroup_subsys_state *css;
2202 struct list_head *l;
2203 printk(KERN_ERR "Initializing cgroup subsys %s\n", ss->name);
2204
2205 /* Create the top cgroup state for this subsystem */
2206 ss->root = &rootnode;
2207 css = ss->create(ss, dummytop);
2208 /* We don't handle early failures gracefully */
2209 BUG_ON(IS_ERR(css));
2210 init_cgroup_css(css, ss, dummytop);
2211
2212 /* Update all cgroup groups to contain a subsys
2213 * pointer to this state - since the subsystem is
2214 * newly registered, all tasks and hence all cgroup
2215 * groups are in the subsystem's top cgroup. */
2216 write_lock(&css_set_lock);
2217 l = &init_css_set.list;
2218 do {
2219 struct css_set *cg =
2220 list_entry(l, struct css_set, list);
2221 cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2222 l = l->next;
2223 } while (l != &init_css_set.list);
2224 write_unlock(&css_set_lock);
2225
2226 /* If this subsystem requested that it be notified with fork
2227 * events, we should send it one now for every process in the
2228 * system */
2229 if (ss->fork) {
2230 struct task_struct *g, *p;
2231
2232 read_lock(&tasklist_lock);
2233 do_each_thread(g, p) {
2234 ss->fork(ss, p);
2235 } while_each_thread(g, p);
2236 read_unlock(&tasklist_lock);
2237 }
2238
2239 need_forkexit_callback |= ss->fork || ss->exit;
2240
2241 ss->active = 1;
2242}
2243
2244/**
2245 * cgroup_init_early - initialize cgroups at system boot, and
2246 * initialize any subsystems that request early init.
2247 */
2248int __init cgroup_init_early(void)
2249{
2250 int i;
2251 kref_init(&init_css_set.ref);
2252 kref_get(&init_css_set.ref);
2253 INIT_LIST_HEAD(&init_css_set.list);
2254 INIT_LIST_HEAD(&init_css_set.cg_links);
2255 INIT_LIST_HEAD(&init_css_set.tasks);
2256 css_set_count = 1;
2257 init_cgroup_root(&rootnode);
2258 list_add(&rootnode.root_list, &roots);
2259 root_count = 1;
2260 init_task.cgroups = &init_css_set;
2261
2262 init_css_set_link.cg = &init_css_set;
2263 list_add(&init_css_set_link.cgrp_link_list,
2264 &rootnode.top_cgroup.css_sets);
2265 list_add(&init_css_set_link.cg_link_list,
2266 &init_css_set.cg_links);
2267
2268 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2269 struct cgroup_subsys *ss = subsys[i];
2270
2271 BUG_ON(!ss->name);
2272 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
2273 BUG_ON(!ss->create);
2274 BUG_ON(!ss->destroy);
2275 if (ss->subsys_id != i) {
2276 printk(KERN_ERR "Subsys %s id == %d\n",
2277 ss->name, ss->subsys_id);
2278 BUG();
2279 }
2280
2281 if (ss->early_init)
2282 cgroup_init_subsys(ss);
2283 }
2284 return 0;
2285}
2286
2287/**
2288 * cgroup_init - register cgroup filesystem and /proc file, and
2289 * initialize any subsystems that didn't request early init.
2290 */
2291int __init cgroup_init(void)
2292{
2293 int err;
2294 int i;
2295 struct proc_dir_entry *entry;
2296
2297 err = bdi_init(&cgroup_backing_dev_info);
2298 if (err)
2299 return err;
2300
2301 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2302 struct cgroup_subsys *ss = subsys[i];
2303 if (!ss->early_init)
2304 cgroup_init_subsys(ss);
2305 }
2306
2307 err = register_filesystem(&cgroup_fs_type);
2308 if (err < 0)
2309 goto out;
2310
2311 entry = create_proc_entry("cgroups", 0, NULL);
2312 if (entry)
2313 entry->proc_fops = &proc_cgroupstats_operations;
2314
2315out:
2316 if (err)
2317 bdi_destroy(&cgroup_backing_dev_info);
2318
2319 return err;
2320}
2321
2322/*
2323 * proc_cgroup_show()
2324 * - Print task's cgroup paths into seq_file, one line for each hierarchy
2325 * - Used for /proc/<pid>/cgroup.
2326 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
2327 * doesn't really matter if tsk->cgroup changes after we read it,
2328 * and we take cgroup_mutex, keeping attach_task() from changing it
2329 * anyway. No need to check that tsk->cgroup != NULL, thanks to
2330 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
2331 * cgroup to top_cgroup.
2332 */
2333
2334/* TODO: Use a proper seq_file iterator */
2335static int proc_cgroup_show(struct seq_file *m, void *v)
2336{
2337 struct pid *pid;
2338 struct task_struct *tsk;
2339 char *buf;
2340 int retval;
2341 struct cgroupfs_root *root;
2342
2343 retval = -ENOMEM;
2344 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2345 if (!buf)
2346 goto out;
2347
2348 retval = -ESRCH;
2349 pid = m->private;
2350 tsk = get_pid_task(pid, PIDTYPE_PID);
2351 if (!tsk)
2352 goto out_free;
2353
2354 retval = 0;
2355
2356 mutex_lock(&cgroup_mutex);
2357
2358 for_each_root(root) {
2359 struct cgroup_subsys *ss;
2360 struct cgroup *cgrp;
2361 int subsys_id;
2362 int count = 0;
2363
2364 /* Skip this hierarchy if it has no active subsystems */
2365 if (!root->actual_subsys_bits)
2366 continue;
2367 for_each_subsys(root, ss)
2368 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
2369 seq_putc(m, ':');
2370 get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
2371 cgrp = task_cgroup(tsk, subsys_id);
2372 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
2373 if (retval < 0)
2374 goto out_unlock;
2375 seq_puts(m, buf);
2376 seq_putc(m, '\n');
2377 }
2378
2379out_unlock:
2380 mutex_unlock(&cgroup_mutex);
2381 put_task_struct(tsk);
2382out_free:
2383 kfree(buf);
2384out:
2385 return retval;
2386}
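/*
 * Editorial aside (not part of the patch): given the seq_printf() calls
 * above, each line of /proc/<pid>/cgroup has the form
 *
 *	<comma-separated subsystem names>:<path within that hierarchy>
 *
 * so a task in a hierarchy carrying the "cpu" and "cpuacct" subsystems,
 * sitting in a group called "mygroup", would show a line like
 * "cpu,cpuacct:/mygroup" (the subsystem and group names here are
 * illustrative, not taken from the patch).
 */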
2387
2388static int cgroup_open(struct inode *inode, struct file *file)
2389{
2390 struct pid *pid = PROC_I(inode)->pid;
2391 return single_open(file, proc_cgroup_show, pid);
2392}
2393
2394struct file_operations proc_cgroup_operations = {
2395 .open = cgroup_open,
2396 .read = seq_read,
2397 .llseek = seq_lseek,
2398 .release = single_release,
2399};
2400
2401/* Display information about each subsystem and each hierarchy */
2402static int proc_cgroupstats_show(struct seq_file *m, void *v)
2403{
2404 int i;
2405 struct cgroupfs_root *root;
2406
2407 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n");
2408 mutex_lock(&cgroup_mutex);
2409 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2410 struct cgroup_subsys *ss = subsys[i];
2411 seq_printf(m, "%s\t%lu\t%d\n",
2412 ss->name, ss->root->subsys_bits,
2413 ss->root->number_of_cgroups);
2414 }
2415 mutex_unlock(&cgroup_mutex);
2416 return 0;
2417}
2418
2419static int cgroupstats_open(struct inode *inode, struct file *file)
2420{
2421 return single_open(file, proc_cgroupstats_show, 0);
2422}
2423
2424static struct file_operations proc_cgroupstats_operations = {
2425 .open = cgroupstats_open,
2426 .read = seq_read,
2427 .llseek = seq_lseek,
2428 .release = single_release,
2429};
2430
2431/**
2432 * cgroup_fork - attach a newly forked task to its parent's cgroup.
2433 * @child: pointer to task_struct of the newly forked child process.
2434 *
2435 * Description: A task inherits its parent's cgroup at fork().
2436 *
2437 * A pointer to the shared css_set was automatically copied in
2438 * fork.c by dup_task_struct(). However, we ignore that copy, since
2439 * it was not made under the protection of RCU or cgroup_mutex, so
2440 * might no longer be a valid cgroup pointer. attach_task() might
2441 * have already changed current->cgroups, allowing the previously
2442 * referenced cgroup group to be removed and freed.
2443 *
2444 * At the point that cgroup_fork() is called, 'current' is the parent
2445 * task, and the passed argument 'child' points to the child task.
2446 */
2447void cgroup_fork(struct task_struct *child)
2448{
2449 task_lock(current);
2450 child->cgroups = current->cgroups;
2451 get_css_set(child->cgroups);
2452 task_unlock(current);
2453 INIT_LIST_HEAD(&child->cg_list);
2454}
2455
2456/**
2457 * cgroup_fork_callbacks - called on a new task very soon before
2458 * adding it to the tasklist. No need to take any locks since no-one
2459 * can be operating on this task
2460 */
2461void cgroup_fork_callbacks(struct task_struct *child)
2462{
2463 if (need_forkexit_callback) {
2464 int i;
2465 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2466 struct cgroup_subsys *ss = subsys[i];
2467 if (ss->fork)
2468 ss->fork(ss, child);
2469 }
2470 }
2471}
2472
2473/**
2474 * cgroup_post_fork - called on a new task after adding it to the
2475 * task list. Adds the task to the list running through its css_set
2476 * if necessary. Has to be after the task is visible on the task list
2477 * in case we race with the first call to cgroup_iter_start() - to
2478 * guarantee that the new task ends up on its list. */
2479void cgroup_post_fork(struct task_struct *child)
2480{
2481 if (use_task_css_set_links) {
2482 write_lock(&css_set_lock);
2483 if (list_empty(&child->cg_list))
2484 list_add(&child->cg_list, &child->cgroups->tasks);
2485 write_unlock(&css_set_lock);
2486 }
2487}
2488/**
2489 * cgroup_exit - detach cgroup from exiting task
2490 * @tsk: pointer to task_struct of exiting process
2491 *
2492 * Description: Detach cgroup from @tsk and release it.
2493 *
2494 * Note that cgroups marked notify_on_release force every task in
2495 * them to take the global cgroup_mutex mutex when exiting.
2496 * This could impact scaling on very large systems. Be reluctant to
2497 * use notify_on_release cgroups where very high task exit scaling
2498 * is required on large systems.
2499 *
2500 * the_top_cgroup_hack:
2501 *
2502 * Set the exiting task's cgroup to the root cgroup (top_cgroup).
2503 *
2504 * We call cgroup_exit() while the task is still competent to
2505 * handle notify_on_release(), then leave the task attached to the
2506 * root cgroup in each hierarchy for the remainder of its exit.
2507 *
2508 * To do this properly, we would increment the reference count on
2509 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
2510 * code we would add a second cgroup function call, to drop that
2511 * reference. This would just create an unnecessary hot spot on
2512 * the top_cgroup reference count, to no avail.
2513 *
2514 * Normally, holding a reference to a cgroup without bumping its
2515 * count is unsafe. The cgroup could go away, or someone could
2516 * attach us to a different cgroup, decrementing the count on
2517 * the first cgroup that we never incremented. But in this case,
2518 * top_cgroup isn't going away, and either task has PF_EXITING set,
2519 * which wards off any attach_task() attempts, or task is a failed
2520 * fork, never visible to attach_task.
2521 *
2522 */
2523void cgroup_exit(struct task_struct *tsk, int run_callbacks)
2524{
2525 int i;
2526 struct css_set *cg;
2527
2528 if (run_callbacks && need_forkexit_callback) {
2529 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2530 struct cgroup_subsys *ss = subsys[i];
2531 if (ss->exit)
2532 ss->exit(ss, tsk);
2533 }
2534 }
2535
2536 /*
2537 * Unlink from the css_set task list if necessary.
2538 * Optimistically check cg_list before taking
2539 * css_set_lock
2540 */
2541 if (!list_empty(&tsk->cg_list)) {
2542 write_lock(&css_set_lock);
2543 if (!list_empty(&tsk->cg_list))
2544 list_del(&tsk->cg_list);
2545 write_unlock(&css_set_lock);
2546 }
2547
2548 /* Reassign the task to the init_css_set. */
2549 task_lock(tsk);
2550 cg = tsk->cgroups;
2551 tsk->cgroups = &init_css_set;
2552 task_unlock(tsk);
2553 if (cg)
2554 put_css_set_taskexit(cg);
2555}
2556
2557/**
2558 * cgroup_clone - duplicate the current cgroup in the hierarchy
2559 * that the given subsystem is attached to, and move this task into
2560 * the new child
2561 */
2562int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
2563{
2564 struct dentry *dentry;
2565 int ret = 0;
2566 char nodename[MAX_CGROUP_TYPE_NAMELEN];
2567 struct cgroup *parent, *child;
2568 struct inode *inode;
2569 struct css_set *cg;
2570 struct cgroupfs_root *root;
2571 struct cgroup_subsys *ss;
2572
2573 /* We shouldn't be called by an unregistered subsystem */
2574 BUG_ON(!subsys->active);
2575
2576 /* First figure out what hierarchy and cgroup we're dealing
2577 * with, and pin them so we can drop cgroup_mutex */
2578 mutex_lock(&cgroup_mutex);
2579 again:
2580 root = subsys->root;
2581 if (root == &rootnode) {
2582 printk(KERN_INFO
2583 "Not cloning cgroup for unused subsystem %s\n",
2584 subsys->name);
2585 mutex_unlock(&cgroup_mutex);
2586 return 0;
2587 }
2588 cg = tsk->cgroups;
2589 parent = task_cgroup(tsk, subsys->subsys_id);
2590
2591 snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
2592
2593 /* Pin the hierarchy */
2594 atomic_inc(&parent->root->sb->s_active);
2595
2596 /* Keep the cgroup alive */
2597 get_css_set(cg);
2598 mutex_unlock(&cgroup_mutex);
2599
2600 /* Now do the VFS work to create a cgroup */
2601 inode = parent->dentry->d_inode;
2602
2603 /* Hold the parent directory mutex across this operation to
2604 * stop anyone else deleting the new cgroup */
2605 mutex_lock(&inode->i_mutex);
2606 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
2607 if (IS_ERR(dentry)) {
2608 printk(KERN_INFO
2609 "Couldn't allocate dentry for %s: %ld\n", nodename,
2610 PTR_ERR(dentry));
2611 ret = PTR_ERR(dentry);
2612 goto out_release;
2613 }
2614
2615 /* Create the cgroup directory, which also creates the cgroup */
2616 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
2617 child = __d_cgrp(dentry);
2618 dput(dentry);
2619 if (ret) {
2620 printk(KERN_INFO
2621 "Failed to create cgroup %s: %d\n", nodename,
2622 ret);
2623 goto out_release;
2624 }
2625
2626 if (!child) {
2627 printk(KERN_INFO
2628 "Couldn't find new cgroup %s\n", nodename);
2629 ret = -ENOMEM;
2630 goto out_release;
2631 }
2632
2633 /* The cgroup now exists. Retake cgroup_mutex and check
2634 * that we're still in the same state that we thought we
2635 * were. */
2636 mutex_lock(&cgroup_mutex);
2637 if ((root != subsys->root) ||
2638 (parent != task_cgroup(tsk, subsys->subsys_id))) {
2639 /* Aargh, we raced ... */
2640 mutex_unlock(&inode->i_mutex);
2641 put_css_set(cg);
2642
2643 deactivate_super(parent->root->sb);
2644 /* The cgroup is still accessible in the VFS, but
2645 * we're not going to try to rmdir() it at this
2646 * point. */
2647 printk(KERN_INFO
2648 "Race in cgroup_clone() - leaking cgroup %s\n",
2649 nodename);
2650 goto again;
2651 }
2652
2653 /* do any required auto-setup */
2654 for_each_subsys(root, ss) {
2655 if (ss->post_clone)
2656 ss->post_clone(ss, child);
2657 }
2658
2659 /* All seems fine. Finish by moving the task into the new cgroup */
2660 ret = attach_task(child, tsk);
2661 mutex_unlock(&cgroup_mutex);
2662
2663 out_release:
2664 mutex_unlock(&inode->i_mutex);
2665
2666 mutex_lock(&cgroup_mutex);
2667 put_css_set(cg);
2668 mutex_unlock(&cgroup_mutex);
2669 deactivate_super(parent->root->sb);
2670 return ret;
2671}
2672
2673/*
2674 * See if "cgrp" is a descendant of the current task's cgroup in
2675 * the appropriate hierarchy
2676 *
2677 * If we are sending in dummytop, then presumably we are creating
2678 * the top cgroup in the subsystem.
2679 *
2680 * Called only by the ns (nsproxy) cgroup.
2681 */
2682int cgroup_is_descendant(const struct cgroup *cgrp)
2683{
2684 int ret;
2685 struct cgroup *target;
2686 int subsys_id;
2687
2688 if (cgrp == dummytop)
2689 return 1;
2690
2691 get_first_subsys(cgrp, NULL, &subsys_id);
2692 target = task_cgroup(current, subsys_id);
2693 while (cgrp != target && cgrp != cgrp->top_cgroup)
2694 cgrp = cgrp->parent;
2695 ret = (cgrp == target);
2696 return ret;
2697}
2698
2699static void check_for_release(struct cgroup *cgrp)
2700{
2701 /* All of these checks rely on RCU to keep the cgroup
2702 * structure alive */
2703 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
2704 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
2705 * already removed. Control Group is currently removable. If it's not
2706 * already queued for a userspace notification, queue
2707 * it now */
2708 int need_schedule_work = 0;
2709 spin_lock(&release_list_lock);
2710 if (!cgroup_is_removed(cgrp) &&
2711 list_empty(&cgrp->release_list)) {
2712 list_add(&cgrp->release_list, &release_list);
2713 need_schedule_work = 1;
2714 }
2715 spin_unlock(&release_list_lock);
2716 if (need_schedule_work)
2717 schedule_work(&release_agent_work);
2718 }
2719}
2720
2721void __css_put(struct cgroup_subsys_state *css)
2722{
2723 struct cgroup *cgrp = css->cgroup;
2724 rcu_read_lock();
2725 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
2726 set_bit(CGRP_RELEASABLE, &cgrp->flags);
2727 check_for_release(cgrp);
2728 }
2729 rcu_read_unlock();
2730}
2731
2732/*
2733 * Notify userspace when a cgroup is released, by running the
2734 * configured release agent with the name of the cgroup (path
2735 * relative to the root of cgroup file system) as the argument.
2736 *
2737 * Most likely, this user command will try to rmdir this cgroup.
2738 *
2739 * This races with the possibility that some other task will be
2740 * attached to this cgroup before it is removed, or that some other
2741 * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
2742 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
2743 * unused, and this cgroup will be reprieved from its death sentence,
2744 * to continue to serve a useful existence. Next time it's released,
2745 * we will get notified again, if it still has 'notify_on_release' set.
2746 *
2747 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
2748 * means only wait until the task is successfully execve()'d. The
2749 * separate release agent task is forked by call_usermodehelper(),
2750 * then control in this thread returns here, without waiting for the
2751 * release agent task. We don't bother to wait because the caller of
2752 * this routine has no use for the exit status of the release agent
2753 * task, so no sense holding our caller up for that.
2754 *
2755 */
2756
2757static void cgroup_release_agent(struct work_struct *work)
2758{
2759 BUG_ON(work != &release_agent_work);
2760 mutex_lock(&cgroup_mutex);
2761 spin_lock(&release_list_lock);
2762 while (!list_empty(&release_list)) {
2763 char *argv[3], *envp[3];
2764 int i;
2765 char *pathbuf;
2766 struct cgroup *cgrp = list_entry(release_list.next,
2767 struct cgroup,
2768 release_list);
2769 list_del_init(&cgrp->release_list);
2770 spin_unlock(&release_list_lock);
2771 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2772 if (!pathbuf) {
2773 spin_lock(&release_list_lock);
2774 continue;
2775 }
2776
2777 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) {
2778 kfree(pathbuf);
2779 spin_lock(&release_list_lock);
2780 continue;
2781 }
2782
2783 i = 0;
2784 argv[i++] = cgrp->root->release_agent_path;
2785 argv[i++] = (char *)pathbuf;
2786 argv[i] = NULL;
2787
2788 i = 0;
2789 /* minimal command environment */
2790 envp[i++] = "HOME=/";
2791 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
2792 envp[i] = NULL;
2793
2794 /* Drop the lock while we invoke the usermode helper,
2795 * since the exec could involve hitting disk and hence
2796 * be a slow process */
2797 mutex_unlock(&cgroup_mutex);
2798 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
2799 kfree(pathbuf);
2800 mutex_lock(&cgroup_mutex);
2801 spin_lock(&release_list_lock);
2802 }
2803 spin_unlock(&release_list_lock);
2804 mutex_unlock(&cgroup_mutex);
2805}
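/*
 * Editorial sketch (not part of the patch): the usermode helper invoked
 * above receives the released cgroup's path, relative to the hierarchy
 * root, as argv[1]. A minimal release agent that just removes the now
 * empty directory could look like this; the mount point /dev/cgroup is a
 * made-up example and would have to match the mounted hierarchy.
 */
#if 0	/* userspace illustration only, not kernel code */
#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	char dir[4096];

	if (argc < 2)
		return 1;
	/* argv[1] is e.g. "/mygroup/child"; prepend the hierarchy mount point */
	snprintf(dir, sizeof(dir), "/dev/cgroup%s", argv[1]);
	return rmdir(dir) ? 1 : 0;
}
#endif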
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
new file mode 100644
index 000000000000..37301e877cb0
--- /dev/null
+++ b/kernel/cgroup_debug.c
@@ -0,0 +1,97 @@
1/*
2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info
4 *
5 * Copyright (C) Google Inc, 2007
6 *
7 * Developed by Paul Menage (menage@google.com)
8 *
9 */
10
11#include <linux/cgroup.h>
12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/rcupdate.h>
15
16#include <asm/atomic.h>
17
18static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
19 struct cgroup *cont)
20{
21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
22
23 if (!css)
24 return ERR_PTR(-ENOMEM);
25
26 return css;
27}
28
29static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
30{
31 kfree(cont->subsys[debug_subsys_id]);
32}
33
34static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
35{
36 return atomic_read(&cont->count);
37}
38
39static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{
41 u64 count;
42
43 cgroup_lock();
44 count = cgroup_task_count(cont);
45 cgroup_unlock();
46 return count;
47}
48
49static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
50{
51 return (u64)(long)current->cgroups;
52}
53
54static u64 current_css_set_refcount_read(struct cgroup *cont,
55 struct cftype *cft)
56{
57 u64 count;
58
59 rcu_read_lock();
60 count = atomic_read(&current->cgroups->ref.refcount);
61 rcu_read_unlock();
62 return count;
63}
64
65static struct cftype files[] = {
66 {
67 .name = "cgroup_refcount",
68 .read_uint = cgroup_refcount_read,
69 },
70 {
71 .name = "taskcount",
72 .read_uint = taskcount_read,
73 },
74
75 {
76 .name = "current_css_set",
77 .read_uint = current_css_set_read,
78 },
79
80 {
81 .name = "current_css_set_refcount",
82 .read_uint = current_css_set_refcount_read,
83 },
84};
85
86static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
87{
88 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
89}
90
91struct cgroup_subsys debug_subsys = {
92 .name = "debug",
93 .create = debug_create,
94 .destroy = debug_destroy,
95 .populate = debug_populate,
96 .subsys_id = debug_subsys_id,
97};
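/*
 * Editorial aside (not part of the patch): because cgroup_add_file()
 * prefixes each file with "<subsys>." unless ROOT_NOPREFIX is set on the
 * hierarchy, a hierarchy mounted with this subsystem exposes the controls
 * as debug.cgroup_refcount, debug.taskcount, debug.current_css_set and
 * debug.current_css_set_refcount in every cgroup directory; reads go
 * through cgroup_read_uint(), which prints "%llu\n". The mount point in
 * the sketch below is illustrative.
 */
#if 0	/* userspace illustration only, not kernel code */
#include <stdio.h>

static void show_debug_taskcount(void)
{
	unsigned long long count;
	FILE *f = fopen("/dev/cgroup/debug.taskcount", "r");

	if (!f)
		return;
	if (fscanf(f, "%llu", &count) == 1)
		printf("tasks in root cgroup: %llu\n", count);
	fclose(f);
}
#endif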
diff --git a/kernel/compat.c b/kernel/compat.c
index 3bae3742c2aa..42a1ed4b61b1 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -40,62 +40,27 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
 		__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
 }
 
-static long compat_nanosleep_restart(struct restart_block *restart)
-{
-	unsigned long expire = restart->arg0, now = jiffies;
-	struct compat_timespec __user *rmtp;
-
-	/* Did it expire while we handled signals? */
-	if (!time_after(expire, now))
-		return 0;
-
-	expire = schedule_timeout_interruptible(expire - now);
-	if (expire == 0)
-		return 0;
-
-	rmtp = (struct compat_timespec __user *)restart->arg1;
-	if (rmtp) {
-		struct compat_timespec ct;
-		struct timespec t;
-
-		jiffies_to_timespec(expire, &t);
-		ct.tv_sec = t.tv_sec;
-		ct.tv_nsec = t.tv_nsec;
-		if (copy_to_user(rmtp, &ct, sizeof(ct)))
-			return -EFAULT;
-	}
-	/* The 'restart' block is already filled in */
-	return -ERESTART_RESTARTBLOCK;
-}
-
 asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
 		struct compat_timespec __user *rmtp)
 {
-	struct timespec t;
-	struct restart_block *restart;
-	unsigned long expire;
+	struct timespec tu, rmt;
+	long ret;
 
-	if (get_compat_timespec(&t, rqtp))
+	if (get_compat_timespec(&tu, rqtp))
 		return -EFAULT;
 
-	if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
+	if (!timespec_valid(&tu))
 		return -EINVAL;
 
-	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
-	expire = schedule_timeout_interruptible(expire);
-	if (expire == 0)
-		return 0;
+	ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL,
+				CLOCK_MONOTONIC);
 
-	if (rmtp) {
-		jiffies_to_timespec(expire, &t);
-		if (put_compat_timespec(&t, rmtp))
+	if (ret && rmtp) {
+		if (put_compat_timespec(&rmt, rmtp))
 			return -EFAULT;
 	}
-	restart = &current_thread_info()->restart_block;
-	restart->fn = compat_nanosleep_restart;
-	restart->arg0 = jiffies + expire;
-	restart->arg1 = (unsigned long) rmtp;
-	return -ERESTART_RESTARTBLOCK;
+
+	return ret;
 }
 
 static inline long get_compat_itimerval(struct itimerval *o,
@@ -247,8 +212,8 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
 	int ret;
 	mm_segment_t old_fs = get_fs ();
 
 	if (resource >= RLIM_NLIMITS)
 		return -EINVAL;
 
 	if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
 	    __get_user(r.rlim_cur, &rlim->rlim_cur) ||
@@ -477,21 +442,21 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
 
 int get_compat_itimerspec(struct itimerspec *dst,
 			  const struct compat_itimerspec __user *src)
 {
 	if (get_compat_timespec(&dst->it_interval, &src->it_interval) ||
 	    get_compat_timespec(&dst->it_value, &src->it_value))
 		return -EFAULT;
 	return 0;
 }
 
 int put_compat_itimerspec(struct compat_itimerspec __user *dst,
 			  const struct itimerspec *src)
 {
 	if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
 	    put_compat_timespec(&src->it_value, &dst->it_value))
 		return -EFAULT;
 	return 0;
 }
 
 long compat_sys_timer_create(clockid_t which_clock,
 		struct compat_sigevent __user *timer_event_spec,
@@ -512,9 +477,9 @@ long compat_sys_timer_create(clockid_t which_clock,
 }
 
 long compat_sys_timer_settime(timer_t timer_id, int flags,
 			      struct compat_itimerspec __user *new,
 			      struct compat_itimerspec __user *old)
 {
 	long err;
 	mm_segment_t oldfs;
 	struct itimerspec newts, oldts;
@@ -522,58 +487,58 @@ long compat_sys_timer_settime(timer_t timer_id, int flags,
522 if (!new) 487 if (!new)
523 return -EINVAL; 488 return -EINVAL;
524 if (get_compat_itimerspec(&newts, new)) 489 if (get_compat_itimerspec(&newts, new))
525 return -EFAULT; 490 return -EFAULT;
526 oldfs = get_fs(); 491 oldfs = get_fs();
527 set_fs(KERNEL_DS); 492 set_fs(KERNEL_DS);
528 err = sys_timer_settime(timer_id, flags, 493 err = sys_timer_settime(timer_id, flags,
529 (struct itimerspec __user *) &newts, 494 (struct itimerspec __user *) &newts,
530 (struct itimerspec __user *) &oldts); 495 (struct itimerspec __user *) &oldts);
531 set_fs(oldfs); 496 set_fs(oldfs);
532 if (!err && old && put_compat_itimerspec(old, &oldts)) 497 if (!err && old && put_compat_itimerspec(old, &oldts))
533 return -EFAULT; 498 return -EFAULT;
534 return err; 499 return err;
535} 500}
536 501
537long compat_sys_timer_gettime(timer_t timer_id, 502long compat_sys_timer_gettime(timer_t timer_id,
538 struct compat_itimerspec __user *setting) 503 struct compat_itimerspec __user *setting)
539{ 504{
540 long err; 505 long err;
541 mm_segment_t oldfs; 506 mm_segment_t oldfs;
542 struct itimerspec ts; 507 struct itimerspec ts;
543 508
544 oldfs = get_fs(); 509 oldfs = get_fs();
545 set_fs(KERNEL_DS); 510 set_fs(KERNEL_DS);
546 err = sys_timer_gettime(timer_id, 511 err = sys_timer_gettime(timer_id,
547 (struct itimerspec __user *) &ts); 512 (struct itimerspec __user *) &ts);
548 set_fs(oldfs); 513 set_fs(oldfs);
549 if (!err && put_compat_itimerspec(setting, &ts)) 514 if (!err && put_compat_itimerspec(setting, &ts))
550 return -EFAULT; 515 return -EFAULT;
551 return err; 516 return err;
552} 517}
553 518
554long compat_sys_clock_settime(clockid_t which_clock, 519long compat_sys_clock_settime(clockid_t which_clock,
555 struct compat_timespec __user *tp) 520 struct compat_timespec __user *tp)
556{ 521{
557 long err; 522 long err;
558 mm_segment_t oldfs; 523 mm_segment_t oldfs;
559 struct timespec ts; 524 struct timespec ts;
560 525
561 if (get_compat_timespec(&ts, tp)) 526 if (get_compat_timespec(&ts, tp))
562 return -EFAULT; 527 return -EFAULT;
563 oldfs = get_fs(); 528 oldfs = get_fs();
564 set_fs(KERNEL_DS); 529 set_fs(KERNEL_DS);
565 err = sys_clock_settime(which_clock, 530 err = sys_clock_settime(which_clock,
566 (struct timespec __user *) &ts); 531 (struct timespec __user *) &ts);
567 set_fs(oldfs); 532 set_fs(oldfs);
568 return err; 533 return err;
569} 534}
570 535
571long compat_sys_clock_gettime(clockid_t which_clock, 536long compat_sys_clock_gettime(clockid_t which_clock,
572 struct compat_timespec __user *tp) 537 struct compat_timespec __user *tp)
573{ 538{
574 long err; 539 long err;
575 mm_segment_t oldfs; 540 mm_segment_t oldfs;
576 struct timespec ts; 541 struct timespec ts;
577 542
578 oldfs = get_fs(); 543 oldfs = get_fs();
579 set_fs(KERNEL_DS); 544 set_fs(KERNEL_DS);
@@ -581,16 +546,16 @@ long compat_sys_clock_gettime(clockid_t which_clock,
581 (struct timespec __user *) &ts); 546 (struct timespec __user *) &ts);
582 set_fs(oldfs); 547 set_fs(oldfs);
583 if (!err && put_compat_timespec(&ts, tp)) 548 if (!err && put_compat_timespec(&ts, tp))
584 return -EFAULT; 549 return -EFAULT;
585 return err; 550 return err;
586} 551}
587 552
588long compat_sys_clock_getres(clockid_t which_clock, 553long compat_sys_clock_getres(clockid_t which_clock,
589 struct compat_timespec __user *tp) 554 struct compat_timespec __user *tp)
590{ 555{
591 long err; 556 long err;
592 mm_segment_t oldfs; 557 mm_segment_t oldfs;
593 struct timespec ts; 558 struct timespec ts;
594 559
595 oldfs = get_fs(); 560 oldfs = get_fs();
596 set_fs(KERNEL_DS); 561 set_fs(KERNEL_DS);
@@ -598,9 +563,9 @@ long compat_sys_clock_getres(clockid_t which_clock,
598 (struct timespec __user *) &ts); 563 (struct timespec __user *) &ts);
599 set_fs(oldfs); 564 set_fs(oldfs);
600 if (!err && tp && put_compat_timespec(&ts, tp)) 565 if (!err && tp && put_compat_timespec(&ts, tp))
601 return -EFAULT; 566 return -EFAULT;
602 return err; 567 return err;
603} 568}
604 569
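
Most of the compat wrappers above share one shape: convert the 32-bit structure into the native one on the kernel stack, bracket the call to the native syscall with get_fs()/set_fs(KERNEL_DS) so a kernel pointer passes the __user checks, restore the old limit, then convert the result back. A loose userspace model of that bracket (the thread-local flag and copy helper are invented for illustration; this is not the kernel API):

    #include <assert.h>
    #include <stdbool.h>
    #include <string.h>

    /* Invented stand-ins: a per-thread "address limit" flag and a copy
     * helper that refuses kernel-side buffers unless the flag is raised. */
    static _Thread_local bool kernel_ds;

    static bool get_fs(void)       { return kernel_ds; }
    static void set_fs(bool limit) { kernel_ds = limit; }

    static int copy_from_user_model(void *dst, const void *src, size_t n,
                                    bool src_is_kernel_buffer)
    {
        if (src_is_kernel_buffer && !kernel_ds)
            return -1;              /* would fail the access_ok() check */
        memcpy(dst, src, n);
        return 0;
    }

    static int native_syscall_model(const int *arg, bool arg_is_kernel_buffer)
    {
        int local;
        return copy_from_user_model(&local, arg, sizeof(local),
                                    arg_is_kernel_buffer);
    }

    int main(void)
    {
        int converted = 42;         /* native-format copy built by the wrapper */
        bool oldfs = get_fs();

        set_fs(true);               /* KERNEL_DS: let kernel buffers through */
        int err = native_syscall_model(&converted, true);
        set_fs(oldfs);              /* always restore the previous limit */

        assert(err == 0);
        return 0;
    }
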
605static long compat_clock_nanosleep_restart(struct restart_block *restart) 570static long compat_clock_nanosleep_restart(struct restart_block *restart)
606{ 571{
@@ -632,10 +597,10 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
632{ 597{
633 long err; 598 long err;
634 mm_segment_t oldfs; 599 mm_segment_t oldfs;
635 struct timespec in, out; 600 struct timespec in, out;
636 struct restart_block *restart; 601 struct restart_block *restart;
637 602
638 if (get_compat_timespec(&in, rqtp)) 603 if (get_compat_timespec(&in, rqtp))
639 return -EFAULT; 604 return -EFAULT;
640 605
641 oldfs = get_fs(); 606 oldfs = get_fs();
@@ -654,8 +619,8 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
654 restart->fn = compat_clock_nanosleep_restart; 619 restart->fn = compat_clock_nanosleep_restart;
655 restart->arg1 = (unsigned long) rmtp; 620 restart->arg1 = (unsigned long) rmtp;
656 } 621 }
657 return err; 622 return err;
658} 623}
659 624
660/* 625/*
661 * We currently only need the following fields from the sigevent 626 * We currently only need the following fields from the sigevent
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 38033db8d8ec..6b3a0c15144f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -98,7 +98,8 @@ static inline void check_for_tasks(int cpu)
98 !cputime_eq(p->stime, cputime_zero))) 98 !cputime_eq(p->stime, cputime_zero)))
99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
100 (state = %ld, flags = %x) \n", 100 (state = %ld, flags = %x) \n",
101 p->comm, p->pid, cpu, p->state, p->flags); 101 p->comm, task_pid_nr(p), cpu,
102 p->state, p->flags);
102 } 103 }
103 write_unlock_irq(&tasklist_lock); 104 write_unlock_irq(&tasklist_lock);
104} 105}
@@ -150,6 +151,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
150 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 151 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
151 hcpu, -1, &nr_calls); 152 hcpu, -1, &nr_calls);
152 if (err == NOTIFY_BAD) { 153 if (err == NOTIFY_BAD) {
154 nr_calls--;
153 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 155 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
154 hcpu, nr_calls, NULL); 156 hcpu, nr_calls, NULL);
155 printk("%s: attempt to take down CPU %u failed\n", 157 printk("%s: attempt to take down CPU %u failed\n",
@@ -233,6 +235,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
233 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, 235 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
234 -1, &nr_calls); 236 -1, &nr_calls);
235 if (ret == NOTIFY_BAD) { 237 if (ret == NOTIFY_BAD) {
238 nr_calls--;
236 printk("%s: attempt to bring up CPU %u failed\n", 239 printk("%s: attempt to bring up CPU %u failed\n",
237 __FUNCTION__, cpu); 240 __FUNCTION__, cpu);
238 ret = -EINVAL; 241 ret = -EINVAL;
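
Both hotplug paths above gain a nr_calls-- before replaying the chain with the *_FAILED event: the callback that returned NOTIFY_BAD already knows it failed, so only the callbacks that succeeded before it should be rolled back. A small self-contained model of that bookkeeping (a plain function-pointer array stands in for the kernel's notifier chain):

    #include <assert.h>
    #include <stddef.h>

    #define NOTIFY_OK   0
    #define NOTIFY_BAD  1

    typedef int (*notifier_fn)(int event);

    /* Call callbacks in order until one fails or nr_to_call runs out;
     * count how many were actually invoked in *nr_calls. */
    static int call_chain(notifier_fn *chain, size_t len, int event,
                          int nr_to_call, int *nr_calls)
    {
        int ret = NOTIFY_OK;
        for (size_t i = 0; i < len && nr_to_call != 0; i++, nr_to_call--) {
            if (nr_calls)
                (*nr_calls)++;
            ret = chain[i](event);
            if (ret == NOTIFY_BAD)
                break;
        }
        return ret;
    }

    static int ok_cb(int event)  { (void)event; return NOTIFY_OK; }
    static int bad_cb(int event) { (void)event; return NOTIFY_BAD; }

    int main(void)
    {
        notifier_fn chain[] = { ok_cb, ok_cb, bad_cb };
        int nr_calls = 0;

        int ret = call_chain(chain, 3, /*PREPARE*/ 0, -1, &nr_calls);
        if (ret == NOTIFY_BAD) {
            nr_calls--;        /* don't replay the callback that failed */
            call_chain(chain, 3, /*FAILED*/ 1, nr_calls, NULL);
        }
        assert(nr_calls == 2); /* only the two that succeeded get rolled back */
        return 0;
    }
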
@@ -262,6 +265,15 @@ out_notify:
262int __cpuinit cpu_up(unsigned int cpu) 265int __cpuinit cpu_up(unsigned int cpu)
263{ 266{
264 int err = 0; 267 int err = 0;
268 if (!cpu_isset(cpu, cpu_possible_map)) {
269 printk(KERN_ERR "can't online cpu %d because it is not "
270 "configured as may-hotadd at boot time\n", cpu);
271#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390)
272 printk(KERN_ERR "please check additional_cpus= boot "
273 "parameter\n");
274#endif
275 return -EINVAL;
276 }
265 277
266 mutex_lock(&cpu_add_remove_lock); 278 mutex_lock(&cpu_add_remove_lock);
267 if (cpu_hotplug_disabled) 279 if (cpu_hotplug_disabled)
diff --git a/kernel/cpu_acct.c b/kernel/cpu_acct.c
new file mode 100644
index 000000000000..731e47e7f164
--- /dev/null
+++ b/kernel/cpu_acct.c
@@ -0,0 +1,186 @@
1/*
2 * kernel/cpu_acct.c - CPU accounting cgroup subsystem
3 *
4 * Copyright (C) Google Inc, 2006
5 *
6 * Developed by Paul Menage (menage@google.com) and Balbir Singh
7 * (balbir@in.ibm.com)
8 *
9 */
10
11/*
12 * Example cgroup subsystem for reporting total CPU usage of tasks in a
13 * cgroup, along with percentage load over a time interval
14 */
15
16#include <linux/module.h>
17#include <linux/cgroup.h>
18#include <linux/fs.h>
19#include <linux/rcupdate.h>
20
21#include <asm/div64.h>
22
23struct cpuacct {
24 struct cgroup_subsys_state css;
25 spinlock_t lock;
26 /* total time used by this class */
27 cputime64_t time;
28
29 /* time when next load calculation occurs */
30 u64 next_interval_check;
31
32 /* time used in current period */
33 cputime64_t current_interval_time;
34
35 /* time used in last period */
36 cputime64_t last_interval_time;
37};
38
39struct cgroup_subsys cpuacct_subsys;
40
41static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
42{
43 return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
44 struct cpuacct, css);
45}
46
47static inline struct cpuacct *task_ca(struct task_struct *task)
48{
49 return container_of(task_subsys_state(task, cpuacct_subsys_id),
50 struct cpuacct, css);
51}
52
53#define INTERVAL (HZ * 10)
54
55static inline u64 next_interval_boundary(u64 now)
56{
57 /* calculate the next interval boundary beyond the
58 * current time */
59 do_div(now, INTERVAL);
60 return (now + 1) * INTERVAL;
61}
62
63static struct cgroup_subsys_state *cpuacct_create(
64 struct cgroup_subsys *ss, struct cgroup *cont)
65{
66 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
67
68 if (!ca)
69 return ERR_PTR(-ENOMEM);
70 spin_lock_init(&ca->lock);
71 ca->next_interval_check = next_interval_boundary(get_jiffies_64());
72 return &ca->css;
73}
74
75static void cpuacct_destroy(struct cgroup_subsys *ss,
76 struct cgroup *cont)
77{
78 kfree(cgroup_ca(cont));
79}
80
81/* Lazily update the load calculation if necessary. Called with ca locked */
82static void cpuusage_update(struct cpuacct *ca)
83{
84 u64 now = get_jiffies_64();
85
86 /* If we're not due for an update, return */
87 if (ca->next_interval_check > now)
88 return;
89
90 if (ca->next_interval_check <= (now - INTERVAL)) {
91 /* If it's been more than an interval since the last
92 * check, then catch up - the last interval must have
93 * been zero load */
94 ca->last_interval_time = 0;
95 ca->next_interval_check = next_interval_boundary(now);
96 } else {
97 /* If a steal takes the last interval time negative,
98 * then we just ignore it */
99 if ((s64)ca->current_interval_time > 0)
100 ca->last_interval_time = ca->current_interval_time;
101 else
102 ca->last_interval_time = 0;
103 ca->next_interval_check += INTERVAL;
104 }
105 ca->current_interval_time = 0;
106}
107
108static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
109{
110 struct cpuacct *ca = cgroup_ca(cont);
111 u64 time;
112
113 spin_lock_irq(&ca->lock);
114 cpuusage_update(ca);
115 time = cputime64_to_jiffies64(ca->time);
116 spin_unlock_irq(&ca->lock);
117
118 /* Convert 64-bit jiffies to seconds */
119 time *= 1000;
120 do_div(time, HZ);
121 return time;
122}
123
124static u64 load_read(struct cgroup *cont, struct cftype *cft)
125{
126 struct cpuacct *ca = cgroup_ca(cont);
127 u64 time;
128
129 /* Find the time used in the previous interval */
130 spin_lock_irq(&ca->lock);
131 cpuusage_update(ca);
132 time = cputime64_to_jiffies64(ca->last_interval_time);
133 spin_unlock_irq(&ca->lock);
134
135 /* Convert time to a percentage, to give the load in the
136 * previous period */
137 time *= 100;
138 do_div(time, INTERVAL);
139
140 return time;
141}
142
143static struct cftype files[] = {
144 {
145 .name = "usage",
146 .read_uint = cpuusage_read,
147 },
148 {
149 .name = "load",
150 .read_uint = load_read,
151 }
152};
153
154static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
155{
156 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
157}
158
159void cpuacct_charge(struct task_struct *task, cputime_t cputime)
160{
161
162 struct cpuacct *ca;
163 unsigned long flags;
164
165 if (!cpuacct_subsys.active)
166 return;
167 rcu_read_lock();
168 ca = task_ca(task);
169 if (ca) {
170 spin_lock_irqsave(&ca->lock, flags);
171 cpuusage_update(ca);
172 ca->time = cputime64_add(ca->time, cputime);
173 ca->current_interval_time =
174 cputime64_add(ca->current_interval_time, cputime);
175 spin_unlock_irqrestore(&ca->lock, flags);
176 }
177 rcu_read_unlock();
178}
179
180struct cgroup_subsys cpuacct_subsys = {
181 .name = "cpuacct",
182 .create = cpuacct_create,
183 .destroy = cpuacct_destroy,
184 .populate = cpuacct_populate,
185 .subsys_id = cpuacct_subsys_id,
186};
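
The load calculation in the new cpu_acct.c is lazy: charged time is only folded into last_interval_time when a charge or a reader crosses an interval boundary, and a gap of more than one whole interval is treated as zero load. The arithmetic is easy to check in isolation; a minimal userspace sketch with unsigned long long standing in for cputime64_t/u64 and a fake clock (everything here is a stand-in, not the kernel types):

    #include <assert.h>

    #define HZ        100ULL
    #define INTERVAL  (HZ * 10)          /* ten seconds worth of jiffies */

    struct acct {
        unsigned long long next_check;   /* next interval boundary */
        unsigned long long cur;          /* time charged in current interval */
        unsigned long long last;         /* time charged in previous interval */
    };

    static unsigned long long next_boundary(unsigned long long now)
    {
        return (now / INTERVAL + 1) * INTERVAL;
    }

    static void update(struct acct *a, unsigned long long now)
    {
        if (a->next_check > now)
            return;                      /* still inside the same interval */
        if (a->next_check <= now - INTERVAL) {
            a->last = 0;                 /* skipped >= 1 full interval: idle */
            a->next_check = next_boundary(now);
        } else {
            a->last = a->cur;            /* roll current into "previous" */
            a->next_check += INTERVAL;
        }
        a->cur = 0;
    }

    int main(void)
    {
        struct acct a = { .next_check = next_boundary(0) };

        a.cur = 250;                     /* charge 2.5s during interval 0 */
        update(&a, INTERVAL + 1);        /* reader arrives in interval 1 */
        assert(a.last == 250 && a.cur == 0);

        update(&a, 5 * INTERVAL);        /* long idle gap: load resets */
        assert(a.last == 0);
        return 0;
    }

The cpuacct.load file then just scales last_interval_time by 100/INTERVAL, reporting the previous window as a percentage.
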
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2eb2e50db0d6..50f5dc463688 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,7 +4,8 @@
4 * Processor and Memory placement constraints for sets of tasks. 4 * Processor and Memory placement constraints for sets of tasks.
5 * 5 *
6 * Copyright (C) 2003 BULL SA. 6 * Copyright (C) 2003 BULL SA.
7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2007 Silicon Graphics, Inc.
8 * Copyright (C) 2006 Google, Inc
8 * 9 *
9 * Portions derived from Patrick Mochel's sysfs code. 10 * Portions derived from Patrick Mochel's sysfs code.
10 * sysfs is Copyright (c) 2001-3 Patrick Mochel 11 * sysfs is Copyright (c) 2001-3 Patrick Mochel
@@ -12,6 +13,7 @@
12 * 2003-10-10 Written by Simon Derr. 13 * 2003-10-10 Written by Simon Derr.
13 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
14 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups
15 * 17 *
16 * This file is subject to the terms and conditions of the GNU General Public 18 * This file is subject to the terms and conditions of the GNU General Public
17 * License. See the file COPYING in the main directory of the Linux 19 * License. See the file COPYING in the main directory of the Linux
@@ -36,6 +38,7 @@
36#include <linux/mount.h> 38#include <linux/mount.h>
37#include <linux/namei.h> 39#include <linux/namei.h>
38#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/prio_heap.h>
39#include <linux/proc_fs.h> 42#include <linux/proc_fs.h>
40#include <linux/rcupdate.h> 43#include <linux/rcupdate.h>
41#include <linux/sched.h> 44#include <linux/sched.h>
@@ -52,8 +55,7 @@
52#include <asm/uaccess.h> 55#include <asm/uaccess.h>
53#include <asm/atomic.h> 56#include <asm/atomic.h>
54#include <linux/mutex.h> 57#include <linux/mutex.h>
55 58#include <linux/kfifo.h>
56#define CPUSET_SUPER_MAGIC 0x27e0eb
57 59
58/* 60/*
59 * Tracks how many cpusets are currently defined in system. 61 * Tracks how many cpusets are currently defined in system.
@@ -62,6 +64,10 @@
62 */ 64 */
63int number_of_cpusets __read_mostly; 65int number_of_cpusets __read_mostly;
64 66
67/* Retrieve the cpuset from a cgroup */
68struct cgroup_subsys cpuset_subsys;
69struct cpuset;
70
65/* See "Frequency meter" comments, below. */ 71/* See "Frequency meter" comments, below. */
66 72
67struct fmeter { 73struct fmeter {
@@ -72,24 +78,13 @@ struct fmeter {
72}; 78};
73 79
74struct cpuset { 80struct cpuset {
81 struct cgroup_subsys_state css;
82
75 unsigned long flags; /* "unsigned long" so bitops work */ 83 unsigned long flags; /* "unsigned long" so bitops work */
76 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 84 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
77 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 85 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
78 86
79 /*
80 * Count is atomic so can incr (fork) or decr (exit) without a lock.
81 */
82 atomic_t count; /* count tasks using this cpuset */
83
84 /*
85 * We link our 'sibling' struct into our parents 'children'.
86 * Our children link their 'sibling' into our 'children'.
87 */
88 struct list_head sibling; /* my parents children */
89 struct list_head children; /* my children */
90
91 struct cpuset *parent; /* my parent */ 87 struct cpuset *parent; /* my parent */
92 struct dentry *dentry; /* cpuset fs entry */
93 88
94 /* 89 /*
95 * Copy of global cpuset_mems_generation as of the most 90 * Copy of global cpuset_mems_generation as of the most
@@ -98,15 +93,32 @@ struct cpuset {
98 int mems_generation; 93 int mems_generation;
99 94
100 struct fmeter fmeter; /* memory_pressure filter */ 95 struct fmeter fmeter; /* memory_pressure filter */
96
97 /* partition number for rebuild_sched_domains() */
98 int pn;
101}; 99};
102 100
101/* Retrieve the cpuset for a cgroup */
102static inline struct cpuset *cgroup_cs(struct cgroup *cont)
103{
104 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
105 struct cpuset, css);
106}
107
108/* Retrieve the cpuset for a task */
109static inline struct cpuset *task_cs(struct task_struct *task)
110{
111 return container_of(task_subsys_state(task, cpuset_subsys_id),
112 struct cpuset, css);
113}
114
115
103/* bits in struct cpuset flags field */ 116/* bits in struct cpuset flags field */
104typedef enum { 117typedef enum {
105 CS_CPU_EXCLUSIVE, 118 CS_CPU_EXCLUSIVE,
106 CS_MEM_EXCLUSIVE, 119 CS_MEM_EXCLUSIVE,
107 CS_MEMORY_MIGRATE, 120 CS_MEMORY_MIGRATE,
108 CS_REMOVED, 121 CS_SCHED_LOAD_BALANCE,
109 CS_NOTIFY_ON_RELEASE,
110 CS_SPREAD_PAGE, 122 CS_SPREAD_PAGE,
111 CS_SPREAD_SLAB, 123 CS_SPREAD_SLAB,
112} cpuset_flagbits_t; 124} cpuset_flagbits_t;
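
The cgroup_cs()/task_cs() helpers introduced above are the standard cgroup idiom: the subsystem embeds a struct cgroup_subsys_state in its private structure and recovers the outer structure with container_of(). A freestanding illustration of the same pointer arithmetic (struct names shortened, container_of spelled out with offsetof since this builds outside the kernel):

    #include <assert.h>
    #include <stddef.h>

    struct css { int refcnt; };          /* stand-in for cgroup_subsys_state */

    struct cpuset_like {
        struct css css;                  /* embedded subsystem state */
        unsigned long flags;
    };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    /* Given only the embedded css pointer, recover the enclosing structure. */
    static struct cpuset_like *css_to_cs(struct css *css)
    {
        return container_of(css, struct cpuset_like, css);
    }

    int main(void)
    {
        struct cpuset_like cs = { .flags = 0x1 };
        struct css *handle = &cs.css;    /* what the cgroup core hands back */

        assert(css_to_cs(handle) == &cs);
        assert(css_to_cs(handle)->flags == 0x1);
        return 0;
    }
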
@@ -122,14 +134,9 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
122 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 134 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
123} 135}
124 136
125static inline int is_removed(const struct cpuset *cs) 137static inline int is_sched_load_balance(const struct cpuset *cs)
126{ 138{
127 return test_bit(CS_REMOVED, &cs->flags); 139 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
128}
129
130static inline int notify_on_release(const struct cpuset *cs)
131{
132 return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
133} 140}
134 141
135static inline int is_memory_migrate(const struct cpuset *cs) 142static inline int is_memory_migrate(const struct cpuset *cs)
@@ -172,14 +179,8 @@ static struct cpuset top_cpuset = {
172 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 179 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
173 .cpus_allowed = CPU_MASK_ALL, 180 .cpus_allowed = CPU_MASK_ALL,
174 .mems_allowed = NODE_MASK_ALL, 181 .mems_allowed = NODE_MASK_ALL,
175 .count = ATOMIC_INIT(0),
176 .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
177 .children = LIST_HEAD_INIT(top_cpuset.children),
178}; 182};
179 183
180static struct vfsmount *cpuset_mount;
181static struct super_block *cpuset_sb;
182
183/* 184/*
184 * We have two global cpuset mutexes below. They can nest. 185 * We have two global cpuset mutexes below. They can nest.
185 * It is ok to first take manage_mutex, then nest callback_mutex. We also 186 * It is ok to first take manage_mutex, then nest callback_mutex. We also
@@ -263,297 +264,33 @@ static struct super_block *cpuset_sb;
263 * the routine cpuset_update_task_memory_state(). 264 * the routine cpuset_update_task_memory_state().
264 */ 265 */
265 266
266static DEFINE_MUTEX(manage_mutex);
267static DEFINE_MUTEX(callback_mutex); 267static DEFINE_MUTEX(callback_mutex);
268 268
269/* 269/* This is ugly, but preserves the userspace API for existing cpuset
270 * A couple of forward declarations required, due to cyclic reference loop: 270 * users. If someone tries to mount the "cpuset" filesystem, we
271 * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file 271 * silently switch it to mount "cgroup" instead */
272 * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
273 */
274
275static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode);
276static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry);
277
278static struct backing_dev_info cpuset_backing_dev_info = {
279 .ra_pages = 0, /* No readahead */
280 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
281};
282
283static struct inode *cpuset_new_inode(mode_t mode)
284{
285 struct inode *inode = new_inode(cpuset_sb);
286
287 if (inode) {
288 inode->i_mode = mode;
289 inode->i_uid = current->fsuid;
290 inode->i_gid = current->fsgid;
291 inode->i_blocks = 0;
292 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
293 inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
294 }
295 return inode;
296}
297
298static void cpuset_diput(struct dentry *dentry, struct inode *inode)
299{
300 /* is dentry a directory ? if so, kfree() associated cpuset */
301 if (S_ISDIR(inode->i_mode)) {
302 struct cpuset *cs = dentry->d_fsdata;
303 BUG_ON(!(is_removed(cs)));
304 kfree(cs);
305 }
306 iput(inode);
307}
308
309static struct dentry_operations cpuset_dops = {
310 .d_iput = cpuset_diput,
311};
312
313static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
314{
315 struct dentry *d = lookup_one_len(name, parent, strlen(name));
316 if (!IS_ERR(d))
317 d->d_op = &cpuset_dops;
318 return d;
319}
320
321static void remove_dir(struct dentry *d)
322{
323 struct dentry *parent = dget(d->d_parent);
324
325 d_delete(d);
326 simple_rmdir(parent->d_inode, d);
327 dput(parent);
328}
329
330/*
331 * NOTE : the dentry must have been dget()'ed
332 */
333static void cpuset_d_remove_dir(struct dentry *dentry)
334{
335 struct list_head *node;
336
337 spin_lock(&dcache_lock);
338 node = dentry->d_subdirs.next;
339 while (node != &dentry->d_subdirs) {
340 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
341 list_del_init(node);
342 if (d->d_inode) {
343 d = dget_locked(d);
344 spin_unlock(&dcache_lock);
345 d_delete(d);
346 simple_unlink(dentry->d_inode, d);
347 dput(d);
348 spin_lock(&dcache_lock);
349 }
350 node = dentry->d_subdirs.next;
351 }
352 list_del_init(&dentry->d_u.d_child);
353 spin_unlock(&dcache_lock);
354 remove_dir(dentry);
355}
356
357static struct super_operations cpuset_ops = {
358 .statfs = simple_statfs,
359 .drop_inode = generic_delete_inode,
360};
361
362static int cpuset_fill_super(struct super_block *sb, void *unused_data,
363 int unused_silent)
364{
365 struct inode *inode;
366 struct dentry *root;
367
368 sb->s_blocksize = PAGE_CACHE_SIZE;
369 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
370 sb->s_magic = CPUSET_SUPER_MAGIC;
371 sb->s_op = &cpuset_ops;
372 cpuset_sb = sb;
373
374 inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR);
375 if (inode) {
376 inode->i_op = &simple_dir_inode_operations;
377 inode->i_fop = &simple_dir_operations;
378 /* directories start off with i_nlink == 2 (for "." entry) */
379 inc_nlink(inode);
380 } else {
381 return -ENOMEM;
382 }
383
384 root = d_alloc_root(inode);
385 if (!root) {
386 iput(inode);
387 return -ENOMEM;
388 }
389 sb->s_root = root;
390 return 0;
391}
392
393static int cpuset_get_sb(struct file_system_type *fs_type, 272static int cpuset_get_sb(struct file_system_type *fs_type,
394 int flags, const char *unused_dev_name, 273 int flags, const char *unused_dev_name,
395 void *data, struct vfsmount *mnt) 274 void *data, struct vfsmount *mnt)
396{ 275{
397 return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); 276 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
277 int ret = -ENODEV;
278 if (cgroup_fs) {
279 char mountopts[] =
280 "cpuset,noprefix,"
281 "release_agent=/sbin/cpuset_release_agent";
282 ret = cgroup_fs->get_sb(cgroup_fs, flags,
283 unused_dev_name, mountopts, mnt);
284 put_filesystem(cgroup_fs);
285 }
286 return ret;
398} 287}
399 288
400static struct file_system_type cpuset_fs_type = { 289static struct file_system_type cpuset_fs_type = {
401 .name = "cpuset", 290 .name = "cpuset",
402 .get_sb = cpuset_get_sb, 291 .get_sb = cpuset_get_sb,
403 .kill_sb = kill_litter_super,
404}; 292};
405 293
406/* struct cftype:
407 *
408 * The files in the cpuset filesystem mostly have a very simple read/write
409 * handling, some common function will take care of it. Nevertheless some cases
410 * (read tasks) are special and therefore I define this structure for every
411 * kind of file.
412 *
413 *
414 * When reading/writing to a file:
415 * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
416 * - the 'cftype' of the file is file->f_path.dentry->d_fsdata
417 */
418
419struct cftype {
420 char *name;
421 int private;
422 int (*open) (struct inode *inode, struct file *file);
423 ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes,
424 loff_t *ppos);
425 int (*write) (struct file *file, const char __user *buf, size_t nbytes,
426 loff_t *ppos);
427 int (*release) (struct inode *inode, struct file *file);
428};
429
430static inline struct cpuset *__d_cs(struct dentry *dentry)
431{
432 return dentry->d_fsdata;
433}
434
435static inline struct cftype *__d_cft(struct dentry *dentry)
436{
437 return dentry->d_fsdata;
438}
439
440/*
441 * Call with manage_mutex held. Writes path of cpuset into buf.
442 * Returns 0 on success, -errno on error.
443 */
444
445static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
446{
447 char *start;
448
449 start = buf + buflen;
450
451 *--start = '\0';
452 for (;;) {
453 int len = cs->dentry->d_name.len;
454 if ((start -= len) < buf)
455 return -ENAMETOOLONG;
456 memcpy(start, cs->dentry->d_name.name, len);
457 cs = cs->parent;
458 if (!cs)
459 break;
460 if (!cs->parent)
461 continue;
462 if (--start < buf)
463 return -ENAMETOOLONG;
464 *start = '/';
465 }
466 memmove(buf, start, buf + buflen - start);
467 return 0;
468}
469
470/*
471 * Notify userspace when a cpuset is released, by running
472 * /sbin/cpuset_release_agent with the name of the cpuset (path
473 * relative to the root of cpuset file system) as the argument.
474 *
475 * Most likely, this user command will try to rmdir this cpuset.
476 *
477 * This races with the possibility that some other task will be
478 * attached to this cpuset before it is removed, or that some other
479 * user task will 'mkdir' a child cpuset of this cpuset. That's ok.
480 * The presumed 'rmdir' will fail quietly if this cpuset is no longer
481 * unused, and this cpuset will be reprieved from its death sentence,
482 * to continue to serve a useful existence. Next time it's released,
483 * we will get notified again, if it still has 'notify_on_release' set.
484 *
485 * The final arg to call_usermodehelper() is 0, which means don't
486 * wait. The separate /sbin/cpuset_release_agent task is forked by
487 * call_usermodehelper(), then control in this thread returns here,
488 * without waiting for the release agent task. We don't bother to
489 * wait because the caller of this routine has no use for the exit
490 * status of the /sbin/cpuset_release_agent task, so no sense holding
491 * our caller up for that.
492 *
493 * When we had only one cpuset mutex, we had to call this
494 * without holding it, to avoid deadlock when call_usermodehelper()
495 * allocated memory. With two locks, we could now call this while
496 * holding manage_mutex, but we still don't, so as to minimize
497 * the time manage_mutex is held.
498 */
499
500static void cpuset_release_agent(const char *pathbuf)
501{
502 char *argv[3], *envp[3];
503 int i;
504
505 if (!pathbuf)
506 return;
507
508 i = 0;
509 argv[i++] = "/sbin/cpuset_release_agent";
510 argv[i++] = (char *)pathbuf;
511 argv[i] = NULL;
512
513 i = 0;
514 /* minimal command environment */
515 envp[i++] = "HOME=/";
516 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
517 envp[i] = NULL;
518
519 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
520 kfree(pathbuf);
521}
522
523/*
524 * Either cs->count of using tasks transitioned to zero, or the
525 * cs->children list of child cpusets just became empty. If this
526 * cs is notify_on_release() and now both the user count is zero and
527 * the list of children is empty, prepare cpuset path in a kmalloc'd
528 * buffer, to be returned via ppathbuf, so that the caller can invoke
529 * cpuset_release_agent() with it later on, once manage_mutex is dropped.
530 * Call here with manage_mutex held.
531 *
532 * This check_for_release() routine is responsible for kmalloc'ing
533 * pathbuf. The above cpuset_release_agent() is responsible for
534 * kfree'ing pathbuf. The caller of these routines is responsible
535 * for providing a pathbuf pointer, initialized to NULL, then
536 * calling check_for_release() with manage_mutex held and the address
537 * of the pathbuf pointer, then dropping manage_mutex, then calling
538 * cpuset_release_agent() with pathbuf, as set by check_for_release().
539 */
540
541static void check_for_release(struct cpuset *cs, char **ppathbuf)
542{
543 if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
544 list_empty(&cs->children)) {
545 char *buf;
546
547 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
548 if (!buf)
549 return;
550 if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
551 kfree(buf);
552 else
553 *ppathbuf = buf;
554 }
555}
556
557/* 294/*
558 * Return in *pmask the portion of a cpusets's cpus_allowed that 295 * Return in *pmask the portion of a cpusets's cpus_allowed that
559 * are online. If none are online, walk up the cpuset hierarchy 296 * are online. If none are online, walk up the cpuset hierarchy
@@ -653,20 +390,19 @@ void cpuset_update_task_memory_state(void)
653 struct task_struct *tsk = current; 390 struct task_struct *tsk = current;
654 struct cpuset *cs; 391 struct cpuset *cs;
655 392
656 if (tsk->cpuset == &top_cpuset) { 393 if (task_cs(tsk) == &top_cpuset) {
657 /* Don't need rcu for top_cpuset. It's never freed. */ 394 /* Don't need rcu for top_cpuset. It's never freed. */
658 my_cpusets_mem_gen = top_cpuset.mems_generation; 395 my_cpusets_mem_gen = top_cpuset.mems_generation;
659 } else { 396 } else {
660 rcu_read_lock(); 397 rcu_read_lock();
661 cs = rcu_dereference(tsk->cpuset); 398 my_cpusets_mem_gen = task_cs(current)->mems_generation;
662 my_cpusets_mem_gen = cs->mems_generation;
663 rcu_read_unlock(); 399 rcu_read_unlock();
664 } 400 }
665 401
666 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 402 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
667 mutex_lock(&callback_mutex); 403 mutex_lock(&callback_mutex);
668 task_lock(tsk); 404 task_lock(tsk);
669 cs = tsk->cpuset; /* Maybe changed when task not locked */ 405 cs = task_cs(tsk); /* Maybe changed when task not locked */
670 guarantee_online_mems(cs, &tsk->mems_allowed); 406 guarantee_online_mems(cs, &tsk->mems_allowed);
671 tsk->cpuset_mems_generation = cs->mems_generation; 407 tsk->cpuset_mems_generation = cs->mems_generation;
672 if (is_spread_page(cs)) 408 if (is_spread_page(cs))
@@ -721,11 +457,12 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
721 457
722static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 458static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
723{ 459{
460 struct cgroup *cont;
724 struct cpuset *c, *par; 461 struct cpuset *c, *par;
725 462
726 /* Each of our child cpusets must be a subset of us */ 463 /* Each of our child cpusets must be a subset of us */
727 list_for_each_entry(c, &cur->children, sibling) { 464 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
728 if (!is_cpuset_subset(c, trial)) 465 if (!is_cpuset_subset(cgroup_cs(cont), trial))
729 return -EBUSY; 466 return -EBUSY;
730 } 467 }
731 468
@@ -740,7 +477,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
740 return -EACCES; 477 return -EACCES;
741 478
742 /* If either I or some sibling (!= me) is exclusive, we can't overlap */ 479 /* If either I or some sibling (!= me) is exclusive, we can't overlap */
743 list_for_each_entry(c, &par->children, sibling) { 480 list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
481 c = cgroup_cs(cont);
744 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 482 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
745 c != cur && 483 c != cur &&
746 cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) 484 cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -751,17 +489,265 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
751 return -EINVAL; 489 return -EINVAL;
752 } 490 }
753 491
492 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
493 if (cgroup_task_count(cur->css.cgroup)) {
494 if (cpus_empty(trial->cpus_allowed) ||
495 nodes_empty(trial->mems_allowed)) {
496 return -ENOSPC;
497 }
498 }
499
754 return 0; 500 return 0;
755} 501}
756 502
757/* 503/*
504 * Helper routine for rebuild_sched_domains().
505 * Do cpusets a, b have overlapping cpus_allowed masks?
506 */
507
508static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
509{
510 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
511}
512
513/*
514 * rebuild_sched_domains()
515 *
516 * If the flag 'sched_load_balance' of any cpuset with non-empty
517 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
518 * which has that flag enabled, or if any cpuset with a non-empty
519 * 'cpus' is removed, then call this routine to rebuild the
520 * scheduler's dynamic sched domains.
521 *
522 * This routine builds a partial partition of the systems CPUs
523 * (the set of non-overlappping cpumask_t's in the array 'part'
524 * below), and passes that partial partition to the kernel/sched.c
525 * partition_sched_domains() routine, which will rebuild the
526 * schedulers load balancing domains (sched domains) as specified
527 * by that partial partition. A 'partial partition' is a set of
528 * non-overlapping subsets whose union is a subset of that set.
529 *
530 * See "What is sched_load_balance" in Documentation/cpusets.txt
531 * for a background explanation of this.
532 *
533 * Does not return errors, on the theory that the callers of this
534 * routine would rather not worry about failures to rebuild sched
535 * domains when operating in the severe memory shortage situations
536 * that could cause allocation failures below.
537 *
538 * Call with cgroup_mutex held. May take callback_mutex during
539 * call due to the kfifo_alloc() and kmalloc() calls. May nest
540 * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
541 * Must not be called holding callback_mutex, because we must not
542 * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere
543 * the kernel nests callback_mutex inside lock_cpu_hotplug() calls.
544 * So the reverse nesting would risk an ABBA deadlock.
545 *
546 * The three key local variables below are:
547 * q - a kfifo queue of cpuset pointers, used to implement a
548 * top-down scan of all cpusets. This scan loads a pointer
549 * to each cpuset marked is_sched_load_balance into the
550 * array 'csa'. For our purposes, rebuilding the schedulers
551 * sched domains, we can ignore !is_sched_load_balance cpusets.
552 * csa - (for CpuSet Array) Array of pointers to all the cpusets
553 * that need to be load balanced, for convenient iterative
554 * access by the subsequent code that finds the best partition,
555 * i.e the set of domains (subsets) of CPUs such that the
556 * cpus_allowed of every cpuset marked is_sched_load_balance
557 * is a subset of one of these domains, while there are as
558 * many such domains as possible, each as small as possible.
559 * doms - Conversion of 'csa' to an array of cpumasks, for passing to
560 * the kernel/sched.c routine partition_sched_domains() in a
561 * convenient format, that can be easily compared to the prior
562 * value to determine what partition elements (sched domains)
563 * were changed (added or removed.)
564 *
565 * Finding the best partition (set of domains):
566 * The triple nested loops below over i, j, k scan over the
567 * load balanced cpusets (using the array of cpuset pointers in
568 * csa[]) looking for pairs of cpusets that have overlapping
569 * cpus_allowed, but which don't have the same 'pn' partition
570 * number and gives them in the same partition number. It keeps
571 * looping on the 'restart' label until it can no longer find
572 * any such pairs.
573 *
574 * The union of the cpus_allowed masks from the set of
575 * all cpusets having the same 'pn' value then form the one
576 * element of the partition (one sched domain) to be passed to
577 * partition_sched_domains().
578 */
579
580static void rebuild_sched_domains(void)
581{
582 struct kfifo *q; /* queue of cpusets to be scanned */
583 struct cpuset *cp; /* scans q */
584 struct cpuset **csa; /* array of all cpuset ptrs */
585 int csn; /* how many cpuset ptrs in csa so far */
586 int i, j, k; /* indices for partition finding loops */
587 cpumask_t *doms; /* resulting partition; i.e. sched domains */
588 int ndoms; /* number of sched domains in result */
589 int nslot; /* next empty doms[] cpumask_t slot */
590
591 q = NULL;
592 csa = NULL;
593 doms = NULL;
594
595 /* Special case for the 99% of systems with one, full, sched domain */
596 if (is_sched_load_balance(&top_cpuset)) {
597 ndoms = 1;
598 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
599 if (!doms)
600 goto rebuild;
601 *doms = top_cpuset.cpus_allowed;
602 goto rebuild;
603 }
604
605 q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
606 if (IS_ERR(q))
607 goto done;
608 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
609 if (!csa)
610 goto done;
611 csn = 0;
612
613 cp = &top_cpuset;
614 __kfifo_put(q, (void *)&cp, sizeof(cp));
615 while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
616 struct cgroup *cont;
617 struct cpuset *child; /* scans child cpusets of cp */
618 if (is_sched_load_balance(cp))
619 csa[csn++] = cp;
620 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
621 child = cgroup_cs(cont);
622 __kfifo_put(q, (void *)&child, sizeof(cp));
623 }
624 }
625
626 for (i = 0; i < csn; i++)
627 csa[i]->pn = i;
628 ndoms = csn;
629
630restart:
631 /* Find the best partition (set of sched domains) */
632 for (i = 0; i < csn; i++) {
633 struct cpuset *a = csa[i];
634 int apn = a->pn;
635
636 for (j = 0; j < csn; j++) {
637 struct cpuset *b = csa[j];
638 int bpn = b->pn;
639
640 if (apn != bpn && cpusets_overlap(a, b)) {
641 for (k = 0; k < csn; k++) {
642 struct cpuset *c = csa[k];
643
644 if (c->pn == bpn)
645 c->pn = apn;
646 }
647 ndoms--; /* one less element */
648 goto restart;
649 }
650 }
651 }
652
653 /* Convert <csn, csa> to <ndoms, doms> */
654 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
655 if (!doms)
656 goto rebuild;
657
658 for (nslot = 0, i = 0; i < csn; i++) {
659 struct cpuset *a = csa[i];
660 int apn = a->pn;
661
662 if (apn >= 0) {
663 cpumask_t *dp = doms + nslot;
664
665 if (nslot == ndoms) {
666 static int warnings = 10;
667 if (warnings) {
668 printk(KERN_WARNING
669 "rebuild_sched_domains confused:"
670 " nslot %d, ndoms %d, csn %d, i %d,"
671 " apn %d\n",
672 nslot, ndoms, csn, i, apn);
673 warnings--;
674 }
675 continue;
676 }
677
678 cpus_clear(*dp);
679 for (j = i; j < csn; j++) {
680 struct cpuset *b = csa[j];
681
682 if (apn == b->pn) {
683 cpus_or(*dp, *dp, b->cpus_allowed);
684 b->pn = -1;
685 }
686 }
687 nslot++;
688 }
689 }
690 BUG_ON(nslot != ndoms);
691
692rebuild:
693 /* Have scheduler rebuild sched domains */
694 lock_cpu_hotplug();
695 partition_sched_domains(ndoms, doms);
696 unlock_cpu_hotplug();
697
698done:
699 if (q && !IS_ERR(q))
700 kfifo_free(q);
701 kfree(csa);
702 /* Don't kfree(doms) -- partition_sched_domains() does that. */
703}
704
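
The heart of rebuild_sched_domains() above is the restart loop that merges partition numbers: any two load-balanced cpusets whose cpus_allowed overlap must end up in the same sched domain, so overlapping sets repeatedly adopt a common pn until no overlapping pair with different numbers remains, and each merge lowers ndoms by exactly one. The same merge reduced to plain bitmasks (a sketch of the algorithm only, not the kernel data structures):

    #include <assert.h>
    #include <stdio.h>

    #define NSETS 4

    int main(void)
    {
        /* cpus_allowed of four load-balanced "cpusets", as CPU bitmasks */
        unsigned long cpus[NSETS] = { 0x03, 0x06, 0x30, 0xc0 };
        int pn[NSETS];
        int ndoms = NSETS;

        for (int i = 0; i < NSETS; i++)
            pn[i] = i;

    restart:
        for (int i = 0; i < NSETS; i++) {
            for (int j = 0; j < NSETS; j++) {
                if (pn[i] != pn[j] && (cpus[i] & cpus[j])) {
                    int old = pn[j];
                    for (int k = 0; k < NSETS; k++)
                        if (pn[k] == old)
                            pn[k] = pn[i];   /* fold j's partition into i's */
                    ndoms--;
                    goto restart;
                }
            }
        }

        /* {0x03, 0x06} overlap and merge; 0x30 and 0xc0 stay separate */
        assert(ndoms == 3);
        assert(pn[0] == pn[1] && pn[1] != pn[2] && pn[2] != pn[3]);

        /* One sched domain per remaining pn: the union of its members' cpus */
        for (int i = 0; i < NSETS; i++) {
            if (pn[i] < 0)
                continue;
            unsigned long dom = 0;
            int apn = pn[i];
            for (int j = i; j < NSETS; j++)
                if (pn[j] == apn) {
                    dom |= cpus[j];
                    pn[j] = -1;              /* consumed */
                }
            printf("domain: 0x%lx\n", dom);
        }
        return 0;
    }
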
705static inline int started_after_time(struct task_struct *t1,
706 struct timespec *time,
707 struct task_struct *t2)
708{
709 int start_diff = timespec_compare(&t1->start_time, time);
710 if (start_diff > 0) {
711 return 1;
712 } else if (start_diff < 0) {
713 return 0;
714 } else {
715 /*
716 * Arbitrarily, if two processes started at the same
717 * time, we'll say that the lower pointer value
718 * started first. Note that t2 may have exited by now
719 * so this may not be a valid pointer any longer, but
720 * that's fine - it still serves to distinguish
721 * between two tasks started (effectively)
722 * simultaneously.
723 */
724 return t1 > t2;
725 }
726}
727
728static inline int started_after(void *p1, void *p2)
729{
730 struct task_struct *t1 = p1;
731 struct task_struct *t2 = p2;
732 return started_after_time(t1, &t2->start_time, t2);
733}
734
735/*
758 * Call with manage_mutex held. May take callback_mutex during call. 736 * Call with manage_mutex held. May take callback_mutex during call.
759 */ 737 */
760 738
761static int update_cpumask(struct cpuset *cs, char *buf) 739static int update_cpumask(struct cpuset *cs, char *buf)
762{ 740{
763 struct cpuset trialcs; 741 struct cpuset trialcs;
764 int retval; 742 int retval, i;
743 int is_load_balanced;
744 struct cgroup_iter it;
745 struct cgroup *cgrp = cs->css.cgroup;
746 struct task_struct *p, *dropped;
747 /* Never dereference latest_task, since it's not refcounted */
748 struct task_struct *latest_task = NULL;
749 struct ptr_heap heap;
750 struct timespec latest_time = { 0, 0 };
765 751
766 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ 752 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
767 if (cs == &top_cpuset) 753 if (cs == &top_cpuset)
@@ -770,11 +756,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
770 trialcs = *cs; 756 trialcs = *cs;
771 757
772 /* 758 /*
773 * We allow a cpuset's cpus_allowed to be empty; if it has attached 759 * An empty cpus_allowed is ok iff there are no tasks in the cpuset.
774 * tasks, we'll catch it later when we validate the change and return 760 * Since cpulist_parse() fails on an empty mask, we special case
775 * -ENOSPC. 761 * that parsing. The validate_change() call ensures that cpusets
762 * with tasks have cpus.
776 */ 763 */
777 if (!buf[0] || (buf[0] == '\n' && !buf[1])) { 764 buf = strstrip(buf);
765 if (!*buf) {
778 cpus_clear(trialcs.cpus_allowed); 766 cpus_clear(trialcs.cpus_allowed);
779 } else { 767 } else {
780 retval = cpulist_parse(buf, trialcs.cpus_allowed); 768 retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -782,15 +770,79 @@ static int update_cpumask(struct cpuset *cs, char *buf)
782 return retval; 770 return retval;
783 } 771 }
784 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); 772 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
785 /* cpus_allowed cannot be empty for a cpuset with attached tasks. */
786 if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed))
787 return -ENOSPC;
788 retval = validate_change(cs, &trialcs); 773 retval = validate_change(cs, &trialcs);
789 if (retval < 0) 774 if (retval < 0)
790 return retval; 775 return retval;
776
777 /* Nothing to do if the cpus didn't change */
778 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
779 return 0;
780 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
781 if (retval)
782 return retval;
783
784 is_load_balanced = is_sched_load_balance(&trialcs);
785
791 mutex_lock(&callback_mutex); 786 mutex_lock(&callback_mutex);
792 cs->cpus_allowed = trialcs.cpus_allowed; 787 cs->cpus_allowed = trialcs.cpus_allowed;
793 mutex_unlock(&callback_mutex); 788 mutex_unlock(&callback_mutex);
789
790 again:
791 /*
792 * Scan tasks in the cpuset, and update the cpumasks of any
793 * that need an update. Since we can't call set_cpus_allowed()
794 * while holding tasklist_lock, gather tasks to be processed
795 * in a heap structure. If the statically-sized heap fills up,
796 * overflow tasks that started later, and in future iterations
797 * only consider tasks that started after the latest task in
798 * the previous pass. This guarantees forward progress and
799 * that we don't miss any tasks
800 */
801 heap.size = 0;
802 cgroup_iter_start(cgrp, &it);
803 while ((p = cgroup_iter_next(cgrp, &it))) {
804 /* Only affect tasks that don't have the right cpus_allowed */
805 if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
806 continue;
807 /*
808 * Only process tasks that started after the last task
809 * we processed
810 */
811 if (!started_after_time(p, &latest_time, latest_task))
812 continue;
813 dropped = heap_insert(&heap, p);
814 if (dropped == NULL) {
815 get_task_struct(p);
816 } else if (dropped != p) {
817 get_task_struct(p);
818 put_task_struct(dropped);
819 }
820 }
821 cgroup_iter_end(cgrp, &it);
822 if (heap.size) {
823 for (i = 0; i < heap.size; i++) {
824 struct task_struct *p = heap.ptrs[i];
825 if (i == 0) {
826 latest_time = p->start_time;
827 latest_task = p;
828 }
829 set_cpus_allowed(p, cs->cpus_allowed);
830 put_task_struct(p);
831 }
832 /*
833 * If we had to process any tasks at all, scan again
834 * in case some of them were in the middle of forking
835 * children that didn't notice the new cpumask
836 * restriction. Not the most efficient way to do it,
837 * but it avoids having to take callback_mutex in the
838 * fork path
839 */
840 goto again;
841 }
842 heap_free(&heap);
843 if (is_load_balanced)
844 rebuild_sched_domains();
845
794 return 0; 846 return 0;
795} 847}
796 848
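
update_cpumask() above cannot call set_cpus_allowed() while holding tasklist_lock, so it gathers tasks into a fixed-size heap keyed by start time, processes that batch, and rescans considering only tasks started strictly after the newest one already handled, repeating until a pass finds nothing; the rescan also catches children forked during the update. A reduced model of that bounded-batch scan, with integers standing in for distinct start times and a tiny array in place of the kernel's ptr_heap:

    #include <assert.h>
    #include <stdbool.h>

    #define NTASKS 10
    #define BATCH  3                     /* heap capacity stands in for PAGE_SIZE */

    int main(void)
    {
        /* "start times" of the tasks in the cgroup (distinct, unsorted) */
        long start[NTASKS] = { 52, 17, 99, 3, 64, 41, 8, 75, 23, 90 };
        bool done[NTASKS] = { false };
        long latest = -1;                /* newest start time already handled */
        int processed = 0;

        for (;;) {
            int batch[BATCH], n = 0;

            /* one "scan": keep the BATCH earliest tasks newer than 'latest' */
            for (int i = 0; i < NTASKS; i++) {
                if (start[i] <= latest)
                    continue;            /* handled in an earlier pass */
                if (n < BATCH) {
                    batch[n++] = i;
                } else {
                    int worst = 0;       /* evict the newest candidate so far */
                    for (int k = 1; k < BATCH; k++)
                        if (start[batch[k]] > start[batch[worst]])
                            worst = k;
                    if (start[i] < start[batch[worst]])
                        batch[worst] = i;
                }
            }
            if (n == 0)
                break;                   /* a clean pass: every task handled */

            /* "process" the batch, remember the newest start time in it */
            for (int k = 0; k < n; k++) {
                done[batch[k]] = true;
                processed++;
                if (start[batch[k]] > latest)
                    latest = start[batch[k]];
            }
        }

        assert(processed == NTASKS);
        for (int i = 0; i < NTASKS; i++)
            assert(done[i]);
        return 0;
    }
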
@@ -839,7 +891,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
839 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 891 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
840 892
841 mutex_lock(&callback_mutex); 893 mutex_lock(&callback_mutex);
842 guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed); 894 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
843 mutex_unlock(&callback_mutex); 895 mutex_unlock(&callback_mutex);
844} 896}
845 897
@@ -857,16 +909,19 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
857 * their mempolicies to the cpusets new mems_allowed. 909 * their mempolicies to the cpusets new mems_allowed.
858 */ 910 */
859 911
912static void *cpuset_being_rebound;
913
860static int update_nodemask(struct cpuset *cs, char *buf) 914static int update_nodemask(struct cpuset *cs, char *buf)
861{ 915{
862 struct cpuset trialcs; 916 struct cpuset trialcs;
863 nodemask_t oldmem; 917 nodemask_t oldmem;
864 struct task_struct *g, *p; 918 struct task_struct *p;
865 struct mm_struct **mmarray; 919 struct mm_struct **mmarray;
866 int i, n, ntasks; 920 int i, n, ntasks;
867 int migrate; 921 int migrate;
868 int fudge; 922 int fudge;
869 int retval; 923 int retval;
924 struct cgroup_iter it;
870 925
871 /* 926 /*
872 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 927 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -878,29 +933,19 @@ static int update_nodemask(struct cpuset *cs, char *buf)
878 trialcs = *cs; 933 trialcs = *cs;
879 934
880 /* 935 /*
881 * We allow a cpuset's mems_allowed to be empty; if it has attached 936 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
882 * tasks, we'll catch it later when we validate the change and return 937 * Since nodelist_parse() fails on an empty mask, we special case
883 * -ENOSPC. 938 * that parsing. The validate_change() call ensures that cpusets
939 * with tasks have memory.
884 */ 940 */
885 if (!buf[0] || (buf[0] == '\n' && !buf[1])) { 941 buf = strstrip(buf);
942 if (!*buf) {
886 nodes_clear(trialcs.mems_allowed); 943 nodes_clear(trialcs.mems_allowed);
887 } else { 944 } else {
888 retval = nodelist_parse(buf, trialcs.mems_allowed); 945 retval = nodelist_parse(buf, trialcs.mems_allowed);
889 if (retval < 0) 946 if (retval < 0)
890 goto done; 947 goto done;
891 if (!nodes_intersects(trialcs.mems_allowed,
892 node_states[N_HIGH_MEMORY])) {
893 /*
894 * error if only memoryless nodes specified.
895 */
896 retval = -ENOSPC;
897 goto done;
898 }
899 } 948 }
900 /*
901 * Exclude memoryless nodes. We know that trialcs.mems_allowed
902 * contains at least one node with memory.
903 */
904 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, 949 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed,
905 node_states[N_HIGH_MEMORY]); 950 node_states[N_HIGH_MEMORY]);
906 oldmem = cs->mems_allowed; 951 oldmem = cs->mems_allowed;
@@ -908,11 +953,6 @@ static int update_nodemask(struct cpuset *cs, char *buf)
908 retval = 0; /* Too easy - nothing to do */ 953 retval = 0; /* Too easy - nothing to do */
909 goto done; 954 goto done;
910 } 955 }
911 /* mems_allowed cannot be empty for a cpuset with attached tasks. */
912 if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) {
913 retval = -ENOSPC;
914 goto done;
915 }
916 retval = validate_change(cs, &trialcs); 956 retval = validate_change(cs, &trialcs);
917 if (retval < 0) 957 if (retval < 0)
918 goto done; 958 goto done;
@@ -922,7 +962,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
922 cs->mems_generation = cpuset_mems_generation++; 962 cs->mems_generation = cpuset_mems_generation++;
923 mutex_unlock(&callback_mutex); 963 mutex_unlock(&callback_mutex);
924 964
925 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ 965 cpuset_being_rebound = cs; /* causes mpol_copy() rebind */
926 966
927 fudge = 10; /* spare mmarray[] slots */ 967 fudge = 10; /* spare mmarray[] slots */
928 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ 968 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
@@ -936,13 +976,13 @@ static int update_nodemask(struct cpuset *cs, char *buf)
936 * enough mmarray[] w/o using GFP_ATOMIC. 976 * enough mmarray[] w/o using GFP_ATOMIC.
937 */ 977 */
938 while (1) { 978 while (1) {
939 ntasks = atomic_read(&cs->count); /* guess */ 979 ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
940 ntasks += fudge; 980 ntasks += fudge;
941 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); 981 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
942 if (!mmarray) 982 if (!mmarray)
943 goto done; 983 goto done;
944 read_lock(&tasklist_lock); /* block fork */ 984 read_lock(&tasklist_lock); /* block fork */
945 if (atomic_read(&cs->count) <= ntasks) 985 if (cgroup_task_count(cs->css.cgroup) <= ntasks)
946 break; /* got enough */ 986 break; /* got enough */
947 read_unlock(&tasklist_lock); /* try again */ 987 read_unlock(&tasklist_lock); /* try again */
948 kfree(mmarray); 988 kfree(mmarray);
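
The mmarray sizing loop above is a guess/verify pattern: the allocation can sleep, so the array is sized from an unlocked guess (task count plus a fudge factor), the count is then rechecked under tasklist_lock, and the whole thing is retried with a bigger guess if tasks were forked in between. The shape of that loop, with a plain mutex and counter standing in for the tasklist lock and the cgroup task count:

    #include <assert.h>
    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static int task_count = 7;       /* may grow while the lock isn't held */

    int main(void)
    {
        int fudge = 2;
        void **array = NULL;
        int ntasks;

        for (;;) {
            ntasks = task_count + fudge;   /* unlocked guess */
            array = malloc(ntasks * sizeof(*array));
            if (!array)
                return 1;

            pthread_mutex_lock(&list_lock);
            if (task_count <= ntasks)
                break;                     /* guess was enough; keep the lock */
            pthread_mutex_unlock(&list_lock);

            free(array);                   /* raced with "forks": grow, retry */
            fudge += 2;
        }

        /* ... fill array[] from the task list while holding list_lock ... */
        pthread_mutex_unlock(&list_lock);

        assert(ntasks >= task_count);
        free(array);
        return 0;
    }
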
@@ -951,21 +991,21 @@ static int update_nodemask(struct cpuset *cs, char *buf)
951 n = 0; 991 n = 0;
952 992
953 /* Load up mmarray[] with mm reference for each task in cpuset. */ 993 /* Load up mmarray[] with mm reference for each task in cpuset. */
954 do_each_thread(g, p) { 994 cgroup_iter_start(cs->css.cgroup, &it);
995 while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
955 struct mm_struct *mm; 996 struct mm_struct *mm;
956 997
957 if (n >= ntasks) { 998 if (n >= ntasks) {
958 printk(KERN_WARNING 999 printk(KERN_WARNING
959 "Cpuset mempolicy rebind incomplete.\n"); 1000 "Cpuset mempolicy rebind incomplete.\n");
960 continue; 1001 break;
961 } 1002 }
962 if (p->cpuset != cs)
963 continue;
964 mm = get_task_mm(p); 1003 mm = get_task_mm(p);
965 if (!mm) 1004 if (!mm)
966 continue; 1005 continue;
967 mmarray[n++] = mm; 1006 mmarray[n++] = mm;
968 } while_each_thread(g, p); 1007 }
1008 cgroup_iter_end(cs->css.cgroup, &it);
969 read_unlock(&tasklist_lock); 1009 read_unlock(&tasklist_lock);
970 1010
971 /* 1011 /*
@@ -993,12 +1033,17 @@ static int update_nodemask(struct cpuset *cs, char *buf)
993 1033
994 /* We're done rebinding vma's to this cpusets new mems_allowed. */ 1034 /* We're done rebinding vma's to this cpusets new mems_allowed. */
995 kfree(mmarray); 1035 kfree(mmarray);
996 set_cpuset_being_rebound(NULL); 1036 cpuset_being_rebound = NULL;
997 retval = 0; 1037 retval = 0;
998done: 1038done:
999 return retval; 1039 return retval;
1000} 1040}
1001 1041
1042int current_cpuset_is_being_rebound(void)
1043{
1044 return task_cs(current) == cpuset_being_rebound;
1045}
1046
1002/* 1047/*
1003 * Call with manage_mutex held. 1048 * Call with manage_mutex held.
1004 */ 1049 */
@@ -1015,6 +1060,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1015/* 1060/*
1016 * update_flag - read a 0 or a 1 in a file and update associated flag 1061 * update_flag - read a 0 or a 1 in a file and update associated flag
1017 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 1062 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
1063 * CS_SCHED_LOAD_BALANCE,
1018 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, 1064 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
1019 * CS_SPREAD_PAGE, CS_SPREAD_SLAB) 1065 * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
1020 * cs: the cpuset to update 1066 * cs: the cpuset to update
@@ -1028,6 +1074,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1028 int turning_on; 1074 int turning_on;
1029 struct cpuset trialcs; 1075 struct cpuset trialcs;
1030 int err; 1076 int err;
1077 int cpus_nonempty, balance_flag_changed;
1031 1078
1032 turning_on = (simple_strtoul(buf, NULL, 10) != 0); 1079 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
1033 1080
@@ -1040,10 +1087,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1040 err = validate_change(cs, &trialcs); 1087 err = validate_change(cs, &trialcs);
1041 if (err < 0) 1088 if (err < 0)
1042 return err; 1089 return err;
1090
1091 cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
1092 balance_flag_changed = (is_sched_load_balance(cs) !=
1093 is_sched_load_balance(&trialcs));
1094
1043 mutex_lock(&callback_mutex); 1095 mutex_lock(&callback_mutex);
1044 cs->flags = trialcs.flags; 1096 cs->flags = trialcs.flags;
1045 mutex_unlock(&callback_mutex); 1097 mutex_unlock(&callback_mutex);
1046 1098
1099 if (cpus_nonempty && balance_flag_changed)
1100 rebuild_sched_domains();
1101
1047 return 0; 1102 return 0;
1048} 1103}
1049 1104
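
update_flag() above follows the same trial-copy discipline as the cpumask and nodemask updates: copy the cpuset, flip the bit in the copy, run validate_change() against the live hierarchy, and only then publish the copy's fields under callback_mutex, rebuilding sched domains only when the load-balance flag actually changed and cpus_allowed is non-empty. A compact model of that copy-validate-commit step (validate() here is a placeholder for whatever invariants apply; locking is elided):

    #include <assert.h>
    #include <stdbool.h>

    #define CS_SCHED_LOAD_BALANCE  (1UL << 0)

    struct cs { unsigned long flags; };

    /* Placeholder for validate_change(): accept everything in this sketch. */
    static int validate(const struct cs *cur, const struct cs *trial)
    {
        (void)cur; (void)trial;
        return 0;
    }

    static int update_flag(struct cs *cs, unsigned long bit, bool turning_on,
                           bool *balance_changed)
    {
        struct cs trial = *cs;               /* work on a copy */
        int err;

        if (turning_on)
            trial.flags |= bit;
        else
            trial.flags &= ~bit;

        err = validate(cs, &trial);
        if (err)
            return err;

        *balance_changed = (cs->flags & CS_SCHED_LOAD_BALANCE) !=
                           (trial.flags & CS_SCHED_LOAD_BALANCE);
        cs->flags = trial.flags;             /* commit (under a lock, in the kernel) */
        return 0;
    }

    int main(void)
    {
        struct cs cs = { .flags = 0 };
        bool changed = false;

        assert(update_flag(&cs, CS_SCHED_LOAD_BALANCE, true, &changed) == 0);
        assert(changed);                     /* would trigger a domain rebuild */
        assert(update_flag(&cs, CS_SCHED_LOAD_BALANCE, true, &changed) == 0);
        assert(!changed);                    /* no change: no rebuild needed */
        return 0;
    }
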
@@ -1145,85 +1200,34 @@ static int fmeter_getrate(struct fmeter *fmp)
1145 return val; 1200 return val;
1146} 1201}
1147 1202
1148/* 1203static int cpuset_can_attach(struct cgroup_subsys *ss,
1149 * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly 1204 struct cgroup *cont, struct task_struct *tsk)
1150 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
1151 * notified on release.
1152 *
1153 * Call holding manage_mutex. May take callback_mutex and task_lock of
1154 * the task 'pid' during call.
1155 */
1156
1157static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1158{ 1205{
1159 pid_t pid; 1206 struct cpuset *cs = cgroup_cs(cont);
1160 struct task_struct *tsk;
1161 struct cpuset *oldcs;
1162 cpumask_t cpus;
1163 nodemask_t from, to;
1164 struct mm_struct *mm;
1165 int retval;
1166 1207
1167 if (sscanf(pidbuf, "%d", &pid) != 1)
1168 return -EIO;
1169 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1208 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1170 return -ENOSPC; 1209 return -ENOSPC;
1171 1210
1172 if (pid) { 1211 return security_task_setscheduler(tsk, 0, NULL);
1173 read_lock(&tasklist_lock); 1212}
1174
1175 tsk = find_task_by_pid(pid);
1176 if (!tsk || tsk->flags & PF_EXITING) {
1177 read_unlock(&tasklist_lock);
1178 return -ESRCH;
1179 }
1180
1181 get_task_struct(tsk);
1182 read_unlock(&tasklist_lock);
1183
1184 if ((current->euid) && (current->euid != tsk->uid)
1185 && (current->euid != tsk->suid)) {
1186 put_task_struct(tsk);
1187 return -EACCES;
1188 }
1189 } else {
1190 tsk = current;
1191 get_task_struct(tsk);
1192 }
1193 1213
1194 retval = security_task_setscheduler(tsk, 0, NULL); 1214static void cpuset_attach(struct cgroup_subsys *ss,
1195 if (retval) { 1215 struct cgroup *cont, struct cgroup *oldcont,
1196 put_task_struct(tsk); 1216 struct task_struct *tsk)
1197 return retval; 1217{
1198 } 1218 cpumask_t cpus;
1219 nodemask_t from, to;
1220 struct mm_struct *mm;
1221 struct cpuset *cs = cgroup_cs(cont);
1222 struct cpuset *oldcs = cgroup_cs(oldcont);
1199 1223
1200 mutex_lock(&callback_mutex); 1224 mutex_lock(&callback_mutex);
1201
1202 task_lock(tsk);
1203 oldcs = tsk->cpuset;
1204 /*
1205 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
1206 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
1207 * then fail this attach_task(), to avoid breaking top_cpuset.count.
1208 */
1209 if (tsk->flags & PF_EXITING) {
1210 task_unlock(tsk);
1211 mutex_unlock(&callback_mutex);
1212 put_task_struct(tsk);
1213 return -ESRCH;
1214 }
1215 atomic_inc(&cs->count);
1216 rcu_assign_pointer(tsk->cpuset, cs);
1217 task_unlock(tsk);
1218
1219 guarantee_online_cpus(cs, &cpus); 1225 guarantee_online_cpus(cs, &cpus);
1220 set_cpus_allowed(tsk, cpus); 1226 set_cpus_allowed(tsk, cpus);
1227 mutex_unlock(&callback_mutex);
1221 1228
1222 from = oldcs->mems_allowed; 1229 from = oldcs->mems_allowed;
1223 to = cs->mems_allowed; 1230 to = cs->mems_allowed;
1224
1225 mutex_unlock(&callback_mutex);
1226
1227 mm = get_task_mm(tsk); 1231 mm = get_task_mm(tsk);
1228 if (mm) { 1232 if (mm) {
1229 mpol_rebind_mm(mm, &to); 1233 mpol_rebind_mm(mm, &to);
@@ -1232,44 +1236,36 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1232 mmput(mm); 1236 mmput(mm);
1233 } 1237 }
1234 1238
1235 put_task_struct(tsk);
1236 synchronize_rcu();
1237 if (atomic_dec_and_test(&oldcs->count))
1238 check_for_release(oldcs, ppathbuf);
1239 return 0;
1240} 1239}
1241 1240
1242/* The various types of files and directories in a cpuset file system */ 1241/* The various types of files and directories in a cpuset file system */
1243 1242
1244typedef enum { 1243typedef enum {
1245 FILE_ROOT,
1246 FILE_DIR,
1247 FILE_MEMORY_MIGRATE, 1244 FILE_MEMORY_MIGRATE,
1248 FILE_CPULIST, 1245 FILE_CPULIST,
1249 FILE_MEMLIST, 1246 FILE_MEMLIST,
1250 FILE_CPU_EXCLUSIVE, 1247 FILE_CPU_EXCLUSIVE,
1251 FILE_MEM_EXCLUSIVE, 1248 FILE_MEM_EXCLUSIVE,
1252 FILE_NOTIFY_ON_RELEASE, 1249 FILE_SCHED_LOAD_BALANCE,
1253 FILE_MEMORY_PRESSURE_ENABLED, 1250 FILE_MEMORY_PRESSURE_ENABLED,
1254 FILE_MEMORY_PRESSURE, 1251 FILE_MEMORY_PRESSURE,
1255 FILE_SPREAD_PAGE, 1252 FILE_SPREAD_PAGE,
1256 FILE_SPREAD_SLAB, 1253 FILE_SPREAD_SLAB,
1257 FILE_TASKLIST,
1258} cpuset_filetype_t; 1254} cpuset_filetype_t;
1259 1255
1260static ssize_t cpuset_common_file_write(struct file *file, 1256static ssize_t cpuset_common_file_write(struct cgroup *cont,
1257 struct cftype *cft,
1258 struct file *file,
1261 const char __user *userbuf, 1259 const char __user *userbuf,
1262 size_t nbytes, loff_t *unused_ppos) 1260 size_t nbytes, loff_t *unused_ppos)
1263{ 1261{
1264 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); 1262 struct cpuset *cs = cgroup_cs(cont);
1265 struct cftype *cft = __d_cft(file->f_path.dentry);
1266 cpuset_filetype_t type = cft->private; 1263 cpuset_filetype_t type = cft->private;
1267 char *buffer; 1264 char *buffer;
1268 char *pathbuf = NULL;
1269 int retval = 0; 1265 int retval = 0;
1270 1266
1271 /* Crude upper limit on largest legitimate cpulist user might write. */ 1267 /* Crude upper limit on largest legitimate cpulist user might write. */
1272 if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES)) 1268 if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
1273 return -E2BIG; 1269 return -E2BIG;
1274 1270
1275 /* +1 for nul-terminator */ 1271 /* +1 for nul-terminator */
@@ -1282,9 +1278,9 @@ static ssize_t cpuset_common_file_write(struct file *file,
1282 } 1278 }
1283 buffer[nbytes] = 0; /* nul-terminate */ 1279 buffer[nbytes] = 0; /* nul-terminate */
1284 1280
1285 mutex_lock(&manage_mutex); 1281 cgroup_lock();
1286 1282
1287 if (is_removed(cs)) { 1283 if (cgroup_is_removed(cont)) {
1288 retval = -ENODEV; 1284 retval = -ENODEV;
1289 goto out2; 1285 goto out2;
1290 } 1286 }
@@ -1302,8 +1298,8 @@ static ssize_t cpuset_common_file_write(struct file *file,
1302 case FILE_MEM_EXCLUSIVE: 1298 case FILE_MEM_EXCLUSIVE:
1303 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); 1299 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
1304 break; 1300 break;
1305 case FILE_NOTIFY_ON_RELEASE: 1301 case FILE_SCHED_LOAD_BALANCE:
1306 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); 1302 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
1307 break; 1303 break;
1308 case FILE_MEMORY_MIGRATE: 1304 case FILE_MEMORY_MIGRATE:
1309 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); 1305 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
@@ -1322,9 +1318,6 @@ static ssize_t cpuset_common_file_write(struct file *file,
1322 retval = update_flag(CS_SPREAD_SLAB, cs, buffer); 1318 retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
1323 cs->mems_generation = cpuset_mems_generation++; 1319 cs->mems_generation = cpuset_mems_generation++;
1324 break; 1320 break;
1325 case FILE_TASKLIST:
1326 retval = attach_task(cs, buffer, &pathbuf);
1327 break;
1328 default: 1321 default:
1329 retval = -EINVAL; 1322 retval = -EINVAL;
1330 goto out2; 1323 goto out2;
@@ -1333,30 +1326,12 @@ static ssize_t cpuset_common_file_write(struct file *file,
1333 if (retval == 0) 1326 if (retval == 0)
1334 retval = nbytes; 1327 retval = nbytes;
1335out2: 1328out2:
1336 mutex_unlock(&manage_mutex); 1329 cgroup_unlock();
1337 cpuset_release_agent(pathbuf);
1338out1: 1330out1:
1339 kfree(buffer); 1331 kfree(buffer);
1340 return retval; 1332 return retval;
1341} 1333}
1342 1334
1343static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
1344 size_t nbytes, loff_t *ppos)
1345{
1346 ssize_t retval = 0;
1347 struct cftype *cft = __d_cft(file->f_path.dentry);
1348 if (!cft)
1349 return -ENODEV;
1350
1351 /* special function ? */
1352 if (cft->write)
1353 retval = cft->write(file, buf, nbytes, ppos);
1354 else
1355 retval = cpuset_common_file_write(file, buf, nbytes, ppos);
1356
1357 return retval;
1358}
1359
1360/* 1335/*
1361 * These ascii lists should be read in a single call, by using a user 1336 * These ascii lists should be read in a single call, by using a user
1362 * buffer large enough to hold the entire map. If read in smaller 1337 * buffer large enough to hold the entire map. If read in smaller
@@ -1391,11 +1366,13 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1391 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1366 return nodelist_scnprintf(page, PAGE_SIZE, mask);
1392} 1367}
1393 1368
1394static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, 1369static ssize_t cpuset_common_file_read(struct cgroup *cont,
1395 size_t nbytes, loff_t *ppos) 1370 struct cftype *cft,
1371 struct file *file,
1372 char __user *buf,
1373 size_t nbytes, loff_t *ppos)
1396{ 1374{
1397 struct cftype *cft = __d_cft(file->f_path.dentry); 1375 struct cpuset *cs = cgroup_cs(cont);
1398 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1399 cpuset_filetype_t type = cft->private; 1376 cpuset_filetype_t type = cft->private;
1400 char *page; 1377 char *page;
1401 ssize_t retval = 0; 1378 ssize_t retval = 0;
@@ -1419,8 +1396,8 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1419 case FILE_MEM_EXCLUSIVE: 1396 case FILE_MEM_EXCLUSIVE:
1420 *s++ = is_mem_exclusive(cs) ? '1' : '0'; 1397 *s++ = is_mem_exclusive(cs) ? '1' : '0';
1421 break; 1398 break;
1422 case FILE_NOTIFY_ON_RELEASE: 1399 case FILE_SCHED_LOAD_BALANCE:
1423 *s++ = notify_on_release(cs) ? '1' : '0'; 1400 *s++ = is_sched_load_balance(cs) ? '1' : '0';
1424 break; 1401 break;
1425 case FILE_MEMORY_MIGRATE: 1402 case FILE_MEMORY_MIGRATE:
1426 *s++ = is_memory_migrate(cs) ? '1' : '0'; 1403 *s++ = is_memory_migrate(cs) ? '1' : '0';
@@ -1449,390 +1426,150 @@ out:
1449 return retval; 1426 return retval;
1450} 1427}
1451 1428
1452static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbytes,
1453 loff_t *ppos)
1454{
1455 ssize_t retval = 0;
1456 struct cftype *cft = __d_cft(file->f_path.dentry);
1457 if (!cft)
1458 return -ENODEV;
1459
1460 /* special function ? */
1461 if (cft->read)
1462 retval = cft->read(file, buf, nbytes, ppos);
1463 else
1464 retval = cpuset_common_file_read(file, buf, nbytes, ppos);
1465
1466 return retval;
1467}
1468 1429
1469static int cpuset_file_open(struct inode *inode, struct file *file)
1470{
1471 int err;
1472 struct cftype *cft;
1473 1430
1474 err = generic_file_open(inode, file);
1475 if (err)
1476 return err;
1477 1431
1478 cft = __d_cft(file->f_path.dentry);
1479 if (!cft)
1480 return -ENODEV;
1481 if (cft->open)
1482 err = cft->open(inode, file);
1483 else
1484 err = 0;
1485
1486 return err;
1487}
1488
1489static int cpuset_file_release(struct inode *inode, struct file *file)
1490{
1491 struct cftype *cft = __d_cft(file->f_path.dentry);
1492 if (cft->release)
1493 return cft->release(inode, file);
1494 return 0;
1495}
1496
1497/*
1498 * cpuset_rename - Only allow simple rename of directories in place.
1499 */
1500static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
1501 struct inode *new_dir, struct dentry *new_dentry)
1502{
1503 if (!S_ISDIR(old_dentry->d_inode->i_mode))
1504 return -ENOTDIR;
1505 if (new_dentry->d_inode)
1506 return -EEXIST;
1507 if (old_dir != new_dir)
1508 return -EIO;
1509 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1510}
1511
1512static const struct file_operations cpuset_file_operations = {
1513 .read = cpuset_file_read,
1514 .write = cpuset_file_write,
1515 .llseek = generic_file_llseek,
1516 .open = cpuset_file_open,
1517 .release = cpuset_file_release,
1518};
1519
1520static const struct inode_operations cpuset_dir_inode_operations = {
1521 .lookup = simple_lookup,
1522 .mkdir = cpuset_mkdir,
1523 .rmdir = cpuset_rmdir,
1524 .rename = cpuset_rename,
1525};
1526
1527static int cpuset_create_file(struct dentry *dentry, int mode)
1528{
1529 struct inode *inode;
1530
1531 if (!dentry)
1532 return -ENOENT;
1533 if (dentry->d_inode)
1534 return -EEXIST;
1535
1536 inode = cpuset_new_inode(mode);
1537 if (!inode)
1538 return -ENOMEM;
1539
1540 if (S_ISDIR(mode)) {
1541 inode->i_op = &cpuset_dir_inode_operations;
1542 inode->i_fop = &simple_dir_operations;
1543
1544 /* start off with i_nlink == 2 (for "." entry) */
1545 inc_nlink(inode);
1546 } else if (S_ISREG(mode)) {
1547 inode->i_size = 0;
1548 inode->i_fop = &cpuset_file_operations;
1549 }
1550
1551 d_instantiate(dentry, inode);
1552 dget(dentry); /* Extra count - pin the dentry in core */
1553 return 0;
1554}
1555
1556/*
1557 * cpuset_create_dir - create a directory for an object.
1558 * cs: the cpuset we create the directory for.
1559 * It must have a valid ->parent field
1560 * And we are going to fill its ->dentry field.
1561 * name: The name to give to the cpuset directory. Will be copied.
1562 * mode: mode to set on new directory.
1563 */
1564
1565static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
1566{
1567 struct dentry *dentry = NULL;
1568 struct dentry *parent;
1569 int error = 0;
1570
1571 parent = cs->parent->dentry;
1572 dentry = cpuset_get_dentry(parent, name);
1573 if (IS_ERR(dentry))
1574 return PTR_ERR(dentry);
1575 error = cpuset_create_file(dentry, S_IFDIR | mode);
1576 if (!error) {
1577 dentry->d_fsdata = cs;
1578 inc_nlink(parent->d_inode);
1579 cs->dentry = dentry;
1580 }
1581 dput(dentry);
1582
1583 return error;
1584}
1585
1586static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
1587{
1588 struct dentry *dentry;
1589 int error;
1590
1591 mutex_lock(&dir->d_inode->i_mutex);
1592 dentry = cpuset_get_dentry(dir, cft->name);
1593 if (!IS_ERR(dentry)) {
1594 error = cpuset_create_file(dentry, 0644 | S_IFREG);
1595 if (!error)
1596 dentry->d_fsdata = (void *)cft;
1597 dput(dentry);
1598 } else
1599 error = PTR_ERR(dentry);
1600 mutex_unlock(&dir->d_inode->i_mutex);
1601 return error;
1602}
1603
1604/*
1605 * Stuff for reading the 'tasks' file.
1606 *
1607 * Reading this file can return large amounts of data if a cpuset has
1608 * *lots* of attached tasks. So it may need several calls to read(),
1609 * but we cannot guarantee that the information we produce is correct
1610 * unless we produce it entirely atomically.
1611 *
1612 * Upon tasks file open(), a struct ctr_struct is allocated, that
1613 * will have a pointer to an array (also allocated here). The struct
1614 * ctr_struct * is stored in file->private_data. Its resources will
1615 * be freed by release() when the file is closed. The array is used
1616 * to sprintf the PIDs and then used by read().
1617 */
1618
1619/* cpusets_tasks_read array */
1620
1621struct ctr_struct {
1622 char *buf;
1623 int bufsz;
1624};
1625
1626/*
1627 * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
1628 * Return actual number of pids loaded. No need to task_lock(p)
1629 * when reading out p->cpuset, as we don't really care if it changes
1630 * on the next cycle, and we are not going to try to dereference it.
1631 */
1632static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
1633{
1634 int n = 0;
1635 struct task_struct *g, *p;
1636
1637 read_lock(&tasklist_lock);
1638
1639 do_each_thread(g, p) {
1640 if (p->cpuset == cs) {
1641 if (unlikely(n == npids))
1642 goto array_full;
1643 pidarray[n++] = p->pid;
1644 }
1645 } while_each_thread(g, p);
1646
1647array_full:
1648 read_unlock(&tasklist_lock);
1649 return n;
1650}
1651
1652static int cmppid(const void *a, const void *b)
1653{
1654 return *(pid_t *)a - *(pid_t *)b;
1655}
1656
1657/*
1658 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
1659 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
1660 * count 'cnt' of how many chars would be written if buf were large enough.
1661 */
1662static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1663{
1664 int cnt = 0;
1665 int i;
1666
1667 for (i = 0; i < npids; i++)
1668 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
1669 return cnt;
1670}
1671
1672/*
1673 * Handle an open on 'tasks' file. Prepare a buffer listing the
1674 * process id's of tasks currently attached to the cpuset being opened.
1675 *
1676 * Does not require any specific cpuset mutexes, and does not take any.
1677 */
1678static int cpuset_tasks_open(struct inode *unused, struct file *file)
1679{
1680 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1681 struct ctr_struct *ctr;
1682 pid_t *pidarray;
1683 int npids;
1684 char c;
1685
1686 if (!(file->f_mode & FMODE_READ))
1687 return 0;
1688
1689 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
1690 if (!ctr)
1691 goto err0;
1692
1693 /*
1694 * If cpuset gets more users after we read count, we won't have
1695 * enough space - tough. This race is indistinguishable to the
1696 * caller from the case that the additional cpuset users didn't
1697 * show up until sometime later on.
1698 */
1699 npids = atomic_read(&cs->count);
1700 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
1701 if (!pidarray)
1702 goto err1;
1703
1704 npids = pid_array_load(pidarray, npids, cs);
1705 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
1706
1707 /* Call pid_array_to_buf() twice, first just to get bufsz */
1708 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
1709 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
1710 if (!ctr->buf)
1711 goto err2;
1712 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
1713
1714 kfree(pidarray);
1715 file->private_data = ctr;
1716 return 0;
1717
1718err2:
1719 kfree(pidarray);
1720err1:
1721 kfree(ctr);
1722err0:
1723 return -ENOMEM;
1724}
1725
1726static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
1727 size_t nbytes, loff_t *ppos)
1728{
1729 struct ctr_struct *ctr = file->private_data;
1730
1731 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1732}
1733
1734static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
1735{
1736 struct ctr_struct *ctr;
1737
1738 if (file->f_mode & FMODE_READ) {
1739 ctr = file->private_data;
1740 kfree(ctr->buf);
1741 kfree(ctr);
1742 }
1743 return 0;
1744}
1745 1432
1746/* 1433/*
1747 * for the common functions, 'private' gives the type of file 1434 * for the common functions, 'private' gives the type of file
1748 */ 1435 */
1749 1436
1750static struct cftype cft_tasks = {
1751 .name = "tasks",
1752 .open = cpuset_tasks_open,
1753 .read = cpuset_tasks_read,
1754 .release = cpuset_tasks_release,
1755 .private = FILE_TASKLIST,
1756};
1757
1758static struct cftype cft_cpus = { 1437static struct cftype cft_cpus = {
1759 .name = "cpus", 1438 .name = "cpus",
1439 .read = cpuset_common_file_read,
1440 .write = cpuset_common_file_write,
1760 .private = FILE_CPULIST, 1441 .private = FILE_CPULIST,
1761}; 1442};
1762 1443
1763static struct cftype cft_mems = { 1444static struct cftype cft_mems = {
1764 .name = "mems", 1445 .name = "mems",
1446 .read = cpuset_common_file_read,
1447 .write = cpuset_common_file_write,
1765 .private = FILE_MEMLIST, 1448 .private = FILE_MEMLIST,
1766}; 1449};
1767 1450
1768static struct cftype cft_cpu_exclusive = { 1451static struct cftype cft_cpu_exclusive = {
1769 .name = "cpu_exclusive", 1452 .name = "cpu_exclusive",
1453 .read = cpuset_common_file_read,
1454 .write = cpuset_common_file_write,
1770 .private = FILE_CPU_EXCLUSIVE, 1455 .private = FILE_CPU_EXCLUSIVE,
1771}; 1456};
1772 1457
1773static struct cftype cft_mem_exclusive = { 1458static struct cftype cft_mem_exclusive = {
1774 .name = "mem_exclusive", 1459 .name = "mem_exclusive",
1460 .read = cpuset_common_file_read,
1461 .write = cpuset_common_file_write,
1775 .private = FILE_MEM_EXCLUSIVE, 1462 .private = FILE_MEM_EXCLUSIVE,
1776}; 1463};
1777 1464
1778static struct cftype cft_notify_on_release = { 1465static struct cftype cft_sched_load_balance = {
1779 .name = "notify_on_release", 1466 .name = "sched_load_balance",
1780 .private = FILE_NOTIFY_ON_RELEASE, 1467 .read = cpuset_common_file_read,
1468 .write = cpuset_common_file_write,
1469 .private = FILE_SCHED_LOAD_BALANCE,
1781}; 1470};
1782 1471
1783static struct cftype cft_memory_migrate = { 1472static struct cftype cft_memory_migrate = {
1784 .name = "memory_migrate", 1473 .name = "memory_migrate",
1474 .read = cpuset_common_file_read,
1475 .write = cpuset_common_file_write,
1785 .private = FILE_MEMORY_MIGRATE, 1476 .private = FILE_MEMORY_MIGRATE,
1786}; 1477};
1787 1478
1788static struct cftype cft_memory_pressure_enabled = { 1479static struct cftype cft_memory_pressure_enabled = {
1789 .name = "memory_pressure_enabled", 1480 .name = "memory_pressure_enabled",
1481 .read = cpuset_common_file_read,
1482 .write = cpuset_common_file_write,
1790 .private = FILE_MEMORY_PRESSURE_ENABLED, 1483 .private = FILE_MEMORY_PRESSURE_ENABLED,
1791}; 1484};
1792 1485
1793static struct cftype cft_memory_pressure = { 1486static struct cftype cft_memory_pressure = {
1794 .name = "memory_pressure", 1487 .name = "memory_pressure",
1488 .read = cpuset_common_file_read,
1489 .write = cpuset_common_file_write,
1795 .private = FILE_MEMORY_PRESSURE, 1490 .private = FILE_MEMORY_PRESSURE,
1796}; 1491};
1797 1492
1798static struct cftype cft_spread_page = { 1493static struct cftype cft_spread_page = {
1799 .name = "memory_spread_page", 1494 .name = "memory_spread_page",
1495 .read = cpuset_common_file_read,
1496 .write = cpuset_common_file_write,
1800 .private = FILE_SPREAD_PAGE, 1497 .private = FILE_SPREAD_PAGE,
1801}; 1498};
1802 1499
1803static struct cftype cft_spread_slab = { 1500static struct cftype cft_spread_slab = {
1804 .name = "memory_spread_slab", 1501 .name = "memory_spread_slab",
1502 .read = cpuset_common_file_read,
1503 .write = cpuset_common_file_write,
1805 .private = FILE_SPREAD_SLAB, 1504 .private = FILE_SPREAD_SLAB,
1806}; 1505};
1807 1506
1808static int cpuset_populate_dir(struct dentry *cs_dentry) 1507static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1809{ 1508{
1810 int err; 1509 int err;
1811 1510
1812 if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0) 1511 if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0)
1813 return err;
1814 if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0)
1815 return err; 1512 return err;
1816 if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0) 1513 if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
1817 return err; 1514 return err;
1818 if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0) 1515 if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
1819 return err; 1516 return err;
1820 if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) 1517 if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
1821 return err; 1518 return err;
1822 if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) 1519 if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
1823 return err; 1520 return err;
1824 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) 1521 if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
1825 return err; 1522 return err;
1826 if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0) 1523 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1827 return err; 1524 return err;
1828 if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0) 1525 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
1829 return err; 1526 return err;
1830 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) 1527 if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
1831 return err; 1528 return err;
1529 /* memory_pressure_enabled is in root cpuset only */
1530 if (err == 0 && !cont->parent)
1531 err = cgroup_add_file(cont, ss,
1532 &cft_memory_pressure_enabled);
1832 return 0; 1533 return 0;
1833} 1534}
1834 1535
1835/* 1536/*
1537 * post_clone() is called at the end of cgroup_clone().
1538 * 'cgroup' was just created automatically as a result of
1539 * a cgroup_clone(), and the current task is about to
1540 * be moved into 'cgroup'.
1541 *
1542 * Currently we refuse to set up the cgroup - thereby
1543 * refusing the task to be entered, and as a result refusing
1544 * the sys_unshare() or clone() which initiated it - if any
1545 * sibling cpusets have exclusive cpus or mem.
1546 *
1547 * If this becomes a problem for some users who wish to
1548 * allow that scenario, then cpuset_post_clone() could be
1549 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1550 * (and likewise for mems) to the new cgroup.
1551 */
1552static void cpuset_post_clone(struct cgroup_subsys *ss,
1553 struct cgroup *cgroup)
1554{
1555 struct cgroup *parent, *child;
1556 struct cpuset *cs, *parent_cs;
1557
1558 parent = cgroup->parent;
1559 list_for_each_entry(child, &parent->children, sibling) {
1560 cs = cgroup_cs(child);
1561 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1562 return;
1563 }
1564 cs = cgroup_cs(cgroup);
1565 parent_cs = cgroup_cs(parent);
1566
1567 cs->mems_allowed = parent_cs->mems_allowed;
1568 cs->cpus_allowed = parent_cs->cpus_allowed;
1569 return;
1570}
1571
1572/*
1836 * cpuset_create - create a cpuset 1573 * cpuset_create - create a cpuset
1837 * parent: cpuset that will be parent of the new cpuset. 1574 * parent: cpuset that will be parent of the new cpuset.
1838 * name: name of the new cpuset. Will be strcpy'ed. 1575 * name: name of the new cpuset. Will be strcpy'ed.
@@ -1841,106 +1578,77 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1841 * Must be called with the mutex on the parent inode held 1578 * Must be called with the mutex on the parent inode held
1842 */ 1579 */
1843 1580
1844static long cpuset_create(struct cpuset *parent, const char *name, int mode) 1581static struct cgroup_subsys_state *cpuset_create(
1582 struct cgroup_subsys *ss,
1583 struct cgroup *cont)
1845{ 1584{
1846 struct cpuset *cs; 1585 struct cpuset *cs;
1847 int err; 1586 struct cpuset *parent;
1848 1587
1588 if (!cont->parent) {
1589 /* This is early initialization for the top cgroup */
1590 top_cpuset.mems_generation = cpuset_mems_generation++;
1591 return &top_cpuset.css;
1592 }
1593 parent = cgroup_cs(cont->parent);
1849 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1594 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1850 if (!cs) 1595 if (!cs)
1851 return -ENOMEM; 1596 return ERR_PTR(-ENOMEM);
1852 1597
1853 mutex_lock(&manage_mutex);
1854 cpuset_update_task_memory_state(); 1598 cpuset_update_task_memory_state();
1855 cs->flags = 0; 1599 cs->flags = 0;
1856 if (notify_on_release(parent))
1857 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
1858 if (is_spread_page(parent)) 1600 if (is_spread_page(parent))
1859 set_bit(CS_SPREAD_PAGE, &cs->flags); 1601 set_bit(CS_SPREAD_PAGE, &cs->flags);
1860 if (is_spread_slab(parent)) 1602 if (is_spread_slab(parent))
1861 set_bit(CS_SPREAD_SLAB, &cs->flags); 1603 set_bit(CS_SPREAD_SLAB, &cs->flags);
1604 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1862 cs->cpus_allowed = CPU_MASK_NONE; 1605 cs->cpus_allowed = CPU_MASK_NONE;
1863 cs->mems_allowed = NODE_MASK_NONE; 1606 cs->mems_allowed = NODE_MASK_NONE;
1864 atomic_set(&cs->count, 0);
1865 INIT_LIST_HEAD(&cs->sibling);
1866 INIT_LIST_HEAD(&cs->children);
1867 cs->mems_generation = cpuset_mems_generation++; 1607 cs->mems_generation = cpuset_mems_generation++;
1868 fmeter_init(&cs->fmeter); 1608 fmeter_init(&cs->fmeter);
1869 1609
1870 cs->parent = parent; 1610 cs->parent = parent;
1871
1872 mutex_lock(&callback_mutex);
1873 list_add(&cs->sibling, &cs->parent->children);
1874 number_of_cpusets++; 1611 number_of_cpusets++;
1875 mutex_unlock(&callback_mutex); 1612 return &cs->css;
1876
1877 err = cpuset_create_dir(cs, name, mode);
1878 if (err < 0)
1879 goto err;
1880
1881 /*
1882 * Release manage_mutex before cpuset_populate_dir() because it
1883 * will down() this new directory's i_mutex and if we race with
1884 * another mkdir, we might deadlock.
1885 */
1886 mutex_unlock(&manage_mutex);
1887
1888 err = cpuset_populate_dir(cs->dentry);
1889 /* If err < 0, we have a half-filled directory - oh well ;) */
1890 return 0;
1891err:
1892 list_del(&cs->sibling);
1893 mutex_unlock(&manage_mutex);
1894 kfree(cs);
1895 return err;
1896} 1613}
1897 1614
1898static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) 1615/*
1899{ 1616 * Locking note on the strange update_flag() call below:
1900 struct cpuset *c_parent = dentry->d_parent->d_fsdata; 1617 *
1901 1618 * If the cpuset being removed has its flag 'sched_load_balance'
1902 /* the vfs holds inode->i_mutex already */ 1619 * enabled, then simulate turning sched_load_balance off, which
1903 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); 1620 * will call rebuild_sched_domains(). The lock_cpu_hotplug()
1904} 1621 * call in rebuild_sched_domains() must not be made while holding
1622 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1623 * lock_cpu_hotplug() calls. So the reverse nesting would risk an
1624 * ABBA deadlock.
1625 */
1905 1626
1906static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) 1627static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1907{ 1628{
1908 struct cpuset *cs = dentry->d_fsdata; 1629 struct cpuset *cs = cgroup_cs(cont);
1909 struct dentry *d;
1910 struct cpuset *parent;
1911 char *pathbuf = NULL;
1912 1630
1913 /* the vfs holds both inode->i_mutex already */
1914
1915 mutex_lock(&manage_mutex);
1916 cpuset_update_task_memory_state(); 1631 cpuset_update_task_memory_state();
1917 if (atomic_read(&cs->count) > 0) { 1632
1918 mutex_unlock(&manage_mutex); 1633 if (is_sched_load_balance(cs))
1919 return -EBUSY; 1634 update_flag(CS_SCHED_LOAD_BALANCE, cs, "0");
1920 } 1635
1921 if (!list_empty(&cs->children)) {
1922 mutex_unlock(&manage_mutex);
1923 return -EBUSY;
1924 }
1925 parent = cs->parent;
1926 mutex_lock(&callback_mutex);
1927 set_bit(CS_REMOVED, &cs->flags);
1928 list_del(&cs->sibling); /* delete my sibling from parent->children */
1929 spin_lock(&cs->dentry->d_lock);
1930 d = dget(cs->dentry);
1931 cs->dentry = NULL;
1932 spin_unlock(&d->d_lock);
1933 cpuset_d_remove_dir(d);
1934 dput(d);
1935 number_of_cpusets--; 1636 number_of_cpusets--;
1936 mutex_unlock(&callback_mutex); 1637 kfree(cs);
1937 if (list_empty(&parent->children))
1938 check_for_release(parent, &pathbuf);
1939 mutex_unlock(&manage_mutex);
1940 cpuset_release_agent(pathbuf);
1941 return 0;
1942} 1638}
1943 1639
1640struct cgroup_subsys cpuset_subsys = {
1641 .name = "cpuset",
1642 .create = cpuset_create,
1643 .destroy = cpuset_destroy,
1644 .can_attach = cpuset_can_attach,
1645 .attach = cpuset_attach,
1646 .populate = cpuset_populate,
1647 .post_clone = cpuset_post_clone,
1648 .subsys_id = cpuset_subsys_id,
1649 .early_init = 1,
1650};
1651
1944/* 1652/*
1945 * cpuset_init_early - just enough so that the calls to 1653 * cpuset_init_early - just enough so that the calls to
1946 * cpuset_update_task_memory_state() in early init code 1654 * cpuset_update_task_memory_state() in early init code
@@ -1949,13 +1657,11 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1949 1657
1950int __init cpuset_init_early(void) 1658int __init cpuset_init_early(void)
1951{ 1659{
1952 struct task_struct *tsk = current; 1660 top_cpuset.mems_generation = cpuset_mems_generation++;
1953
1954 tsk->cpuset = &top_cpuset;
1955 tsk->cpuset->mems_generation = cpuset_mems_generation++;
1956 return 0; 1661 return 0;
1957} 1662}
1958 1663
1664
1959/** 1665/**
1960 * cpuset_init - initialize cpusets at system boot 1666 * cpuset_init - initialize cpusets at system boot
1961 * 1667 *
@@ -1964,39 +1670,21 @@ int __init cpuset_init_early(void)
1964 1670
1965int __init cpuset_init(void) 1671int __init cpuset_init(void)
1966{ 1672{
1967 struct dentry *root; 1673 int err = 0;
1968 int err;
1969 1674
1970 top_cpuset.cpus_allowed = CPU_MASK_ALL; 1675 top_cpuset.cpus_allowed = CPU_MASK_ALL;
1971 top_cpuset.mems_allowed = NODE_MASK_ALL; 1676 top_cpuset.mems_allowed = NODE_MASK_ALL;
1972 1677
1973 fmeter_init(&top_cpuset.fmeter); 1678 fmeter_init(&top_cpuset.fmeter);
1974 top_cpuset.mems_generation = cpuset_mems_generation++; 1679 top_cpuset.mems_generation = cpuset_mems_generation++;
1975 1680 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1976 init_task.cpuset = &top_cpuset;
1977 1681
1978 err = register_filesystem(&cpuset_fs_type); 1682 err = register_filesystem(&cpuset_fs_type);
1979 if (err < 0) 1683 if (err < 0)
1980 goto out; 1684 return err;
1981 cpuset_mount = kern_mount(&cpuset_fs_type); 1685
1982 if (IS_ERR(cpuset_mount)) {
1983 printk(KERN_ERR "cpuset: could not mount!\n");
1984 err = PTR_ERR(cpuset_mount);
1985 cpuset_mount = NULL;
1986 goto out;
1987 }
1988 root = cpuset_mount->mnt_sb->s_root;
1989 root->d_fsdata = &top_cpuset;
1990 inc_nlink(root->d_inode);
1991 top_cpuset.dentry = root;
1992 root->d_inode->i_op = &cpuset_dir_inode_operations;
1993 number_of_cpusets = 1; 1686 number_of_cpusets = 1;
1994 err = cpuset_populate_dir(root); 1687 return 0;
1995 /* memory_pressure_enabled is in root cpuset only */
1996 if (err == 0)
1997 err = cpuset_add_file(root, &cft_memory_pressure_enabled);
1998out:
1999 return err;
2000} 1688}
2001 1689
2002/* 1690/*
@@ -2022,10 +1710,12 @@ out:
2022 1710
2023static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) 1711static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
2024{ 1712{
1713 struct cgroup *cont;
2025 struct cpuset *c; 1714 struct cpuset *c;
2026 1715
2027 /* Each of our child cpusets mems must be online */ 1716 /* Each of our child cpusets mems must be online */
2028 list_for_each_entry(c, &cur->children, sibling) { 1717 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
1718 c = cgroup_cs(cont);
2029 guarantee_online_cpus_mems_in_subtree(c); 1719 guarantee_online_cpus_mems_in_subtree(c);
2030 if (!cpus_empty(c->cpus_allowed)) 1720 if (!cpus_empty(c->cpus_allowed))
2031 guarantee_online_cpus(c, &c->cpus_allowed); 1721 guarantee_online_cpus(c, &c->cpus_allowed);
@@ -2053,7 +1743,7 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
2053 1743
2054static void common_cpu_mem_hotplug_unplug(void) 1744static void common_cpu_mem_hotplug_unplug(void)
2055{ 1745{
2056 mutex_lock(&manage_mutex); 1746 cgroup_lock();
2057 mutex_lock(&callback_mutex); 1747 mutex_lock(&callback_mutex);
2058 1748
2059 guarantee_online_cpus_mems_in_subtree(&top_cpuset); 1749 guarantee_online_cpus_mems_in_subtree(&top_cpuset);
@@ -2061,7 +1751,7 @@ static void common_cpu_mem_hotplug_unplug(void)
2061 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 1751 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2062 1752
2063 mutex_unlock(&callback_mutex); 1753 mutex_unlock(&callback_mutex);
2064 mutex_unlock(&manage_mutex); 1754 cgroup_unlock();
2065} 1755}
2066 1756
2067/* 1757/*
@@ -2074,8 +1764,8 @@ static void common_cpu_mem_hotplug_unplug(void)
2074 * cpu_online_map on each CPU hotplug (cpuhp) event. 1764 * cpu_online_map on each CPU hotplug (cpuhp) event.
2075 */ 1765 */
2076 1766
2077static int cpuset_handle_cpuhp(struct notifier_block *nb, 1767static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
2078 unsigned long phase, void *cpu) 1768 unsigned long phase, void *unused_cpu)
2079{ 1769{
2080 if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) 1770 if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
2081 return NOTIFY_DONE; 1771 return NOTIFY_DONE;
@@ -2113,109 +1803,7 @@ void __init cpuset_init_smp(void)
2113} 1803}
2114 1804
2115/** 1805/**
2116 * cpuset_fork - attach newly forked task to its parents cpuset.
2117 * @tsk: pointer to task_struct of forking parent process.
2118 *
2119 * Description: A task inherits its parent's cpuset at fork().
2120 *
2121 * A pointer to the shared cpuset was automatically copied in fork.c
2122 * by dup_task_struct(). However, we ignore that copy, since it was
2123 * not made under the protection of task_lock(), so might no longer be
2124 * a valid cpuset pointer. attach_task() might have already changed
2125 * current->cpuset, allowing the previously referenced cpuset to
2126 * be removed and freed. Instead, we task_lock(current) and copy
2127 * its present value of current->cpuset for our freshly forked child.
2128 *
2129 * At the point that cpuset_fork() is called, 'current' is the parent
2130 * task, and the passed argument 'child' points to the child task.
2131 **/
2132
2133void cpuset_fork(struct task_struct *child)
2134{
2135 task_lock(current);
2136 child->cpuset = current->cpuset;
2137 atomic_inc(&child->cpuset->count);
2138 task_unlock(current);
2139}
2140
2141/**
2142 * cpuset_exit - detach cpuset from exiting task
2143 * @tsk: pointer to task_struct of exiting process
2144 *
2145 * Description: Detach cpuset from @tsk and release it.
2146 *
2147 * Note that cpusets marked notify_on_release force every task in
2148 * them to take the global manage_mutex mutex when exiting.
2149 * This could impact scaling on very large systems. Be reluctant to
2150 * use notify_on_release cpusets where very high task exit scaling
2151 * is required on large systems.
2152 *
2153 * Don't even think about dereferencing 'cs' after the cpuset use count
2154 * goes to zero, except inside a critical section guarded by manage_mutex
2155 * or callback_mutex. Otherwise a zero cpuset use count is a license to
2156 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
2157 *
2158 * This routine has to take manage_mutex, not callback_mutex, because
2159 * it is holding that mutex while calling check_for_release(),
2160 * which calls kmalloc(), so can't be called holding callback_mutex().
2161 *
2162 * the_top_cpuset_hack:
2163 *
2164 * Set the exiting tasks cpuset to the root cpuset (top_cpuset).
2165 *
2166 * Don't leave a task unable to allocate memory, as that is an
2167 * accident waiting to happen should someone add a callout in
2168 * do_exit() after the cpuset_exit() call that might allocate.
2169 * If a task tries to allocate memory with an invalid cpuset,
2170 * it will oops in cpuset_update_task_memory_state().
2171 *
2172 * We call cpuset_exit() while the task is still competent to
2173 * handle notify_on_release(), then leave the task attached to
2174 * the root cpuset (top_cpuset) for the remainder of its exit.
2175 *
2176 * To do this properly, we would increment the reference count on
2177 * top_cpuset, and near the very end of the kernel/exit.c do_exit()
2178 * code we would add a second cpuset function call, to drop that
2179 * reference. This would just create an unnecessary hot spot on
2180 * the top_cpuset reference count, to no avail.
2181 *
2182 * Normally, holding a reference to a cpuset without bumping its
2183 * count is unsafe. The cpuset could go away, or someone could
2184 * attach us to a different cpuset, decrementing the count on
2185 * the first cpuset that we never incremented. But in this case,
2186 * top_cpuset isn't going away, and either task has PF_EXITING set,
2187 * which wards off any attach_task() attempts, or task is a failed
2188 * fork, never visible to attach_task.
2189 *
2190 * Another way to do this would be to set the cpuset pointer
2191 * to NULL here, and check in cpuset_update_task_memory_state()
2192 * for a NULL pointer. This hack avoids that NULL check, for no
2193 * cost (other than this way too long comment ;).
2194 **/
2195
2196void cpuset_exit(struct task_struct *tsk)
2197{
2198 struct cpuset *cs;
2199
2200 task_lock(current);
2201 cs = tsk->cpuset;
2202 tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
2203 task_unlock(current);
2204
2205 if (notify_on_release(cs)) {
2206 char *pathbuf = NULL;
2207 1806
2208 mutex_lock(&manage_mutex);
2209 if (atomic_dec_and_test(&cs->count))
2210 check_for_release(cs, &pathbuf);
2211 mutex_unlock(&manage_mutex);
2212 cpuset_release_agent(pathbuf);
2213 } else {
2214 atomic_dec(&cs->count);
2215 }
2216}
2217
2218/**
2219 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 1807 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
2220 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 1808 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
2221 * 1809 *
@@ -2230,10 +1818,23 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
2230 cpumask_t mask; 1818 cpumask_t mask;
2231 1819
2232 mutex_lock(&callback_mutex); 1820 mutex_lock(&callback_mutex);
1821 mask = cpuset_cpus_allowed_locked(tsk);
1822 mutex_unlock(&callback_mutex);
1823
1824 return mask;
1825}
1826
1827/**
1828 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
1829 * Must be called with callback_mutex held.
1830 **/
1831cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk)
1832{
1833 cpumask_t mask;
1834
2233 task_lock(tsk); 1835 task_lock(tsk);
2234 guarantee_online_cpus(tsk->cpuset, &mask); 1836 guarantee_online_cpus(task_cs(tsk), &mask);
2235 task_unlock(tsk); 1837 task_unlock(tsk);
2236 mutex_unlock(&callback_mutex);
2237 1838
2238 return mask; 1839 return mask;
2239} 1840}
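
The hunk above splits cpuset_cpus_allowed() into a thin wrapper that takes callback_mutex and a cpuset_cpus_allowed_locked() form for callers that already hold the mutex. A minimal sketch of the same wrapper/_locked convention in plain C; the names are made up and a pthread mutex stands in for callback_mutex:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static int state = 42;

/* For callers that already hold state_lock, like *_locked() above. */
static int get_state_locked(void)
{
	return state;
}

/* Public form: take the lock, delegate to the _locked helper. */
static int get_state(void)
{
	int val;

	pthread_mutex_lock(&state_lock);
	val = get_state_locked();
	pthread_mutex_unlock(&state_lock);
	return val;
}

int main(void)
{
	printf("%d\n", get_state());
	return 0;
}
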
@@ -2259,7 +1860,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2259 1860
2260 mutex_lock(&callback_mutex); 1861 mutex_lock(&callback_mutex);
2261 task_lock(tsk); 1862 task_lock(tsk);
2262 guarantee_online_mems(tsk->cpuset, &mask); 1863 guarantee_online_mems(task_cs(tsk), &mask);
2263 task_unlock(tsk); 1864 task_unlock(tsk);
2264 mutex_unlock(&callback_mutex); 1865 mutex_unlock(&callback_mutex);
2265 1866
@@ -2390,7 +1991,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2390 mutex_lock(&callback_mutex); 1991 mutex_lock(&callback_mutex);
2391 1992
2392 task_lock(current); 1993 task_lock(current);
2393 cs = nearest_exclusive_ancestor(current->cpuset); 1994 cs = nearest_exclusive_ancestor(task_cs(current));
2394 task_unlock(current); 1995 task_unlock(current);
2395 1996
2396 allowed = node_isset(node, cs->mems_allowed); 1997 allowed = node_isset(node, cs->mems_allowed);
@@ -2431,12 +2032,12 @@ int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
2431 node = zone_to_nid(z); 2032 node = zone_to_nid(z);
2432 if (node_isset(node, current->mems_allowed)) 2033 if (node_isset(node, current->mems_allowed))
2433 return 1; 2034 return 1;
2434 /* 2035 /*
2435 * Allow tasks that have access to memory reserves because they have 2036 * Allow tasks that have access to memory reserves because they have
2436 * been OOM killed to get memory anywhere. 2037 * been OOM killed to get memory anywhere.
2437 */ 2038 */
2438 if (unlikely(test_thread_flag(TIF_MEMDIE))) 2039 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2439 return 1; 2040 return 1;
2440 return 0; 2041 return 0;
2441} 2042}
2442 2043
@@ -2550,14 +2151,12 @@ int cpuset_memory_pressure_enabled __read_mostly;
2550 2151
2551void __cpuset_memory_pressure_bump(void) 2152void __cpuset_memory_pressure_bump(void)
2552{ 2153{
2553 struct cpuset *cs;
2554
2555 task_lock(current); 2154 task_lock(current);
2556 cs = current->cpuset; 2155 fmeter_markevent(&task_cs(current)->fmeter);
2557 fmeter_markevent(&cs->fmeter);
2558 task_unlock(current); 2156 task_unlock(current);
2559} 2157}
2560 2158
2159#ifdef CONFIG_PROC_PID_CPUSET
2561/* 2160/*
2562 * proc_cpuset_show() 2161 * proc_cpuset_show()
2563 * - Print tasks cpuset path into seq_file. 2162 * - Print tasks cpuset path into seq_file.
@@ -2569,11 +2168,12 @@ void __cpuset_memory_pressure_bump(void)
2569 * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks 2168 * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
2570 * cpuset to top_cpuset. 2169 * cpuset to top_cpuset.
2571 */ 2170 */
2572static int proc_cpuset_show(struct seq_file *m, void *v) 2171static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2573{ 2172{
2574 struct pid *pid; 2173 struct pid *pid;
2575 struct task_struct *tsk; 2174 struct task_struct *tsk;
2576 char *buf; 2175 char *buf;
2176 struct cgroup_subsys_state *css;
2577 int retval; 2177 int retval;
2578 2178
2579 retval = -ENOMEM; 2179 retval = -ENOMEM;
@@ -2588,15 +2188,15 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2588 goto out_free; 2188 goto out_free;
2589 2189
2590 retval = -EINVAL; 2190 retval = -EINVAL;
2591 mutex_lock(&manage_mutex); 2191 cgroup_lock();
2592 2192 css = task_subsys_state(tsk, cpuset_subsys_id);
2593 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); 2193 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2594 if (retval < 0) 2194 if (retval < 0)
2595 goto out_unlock; 2195 goto out_unlock;
2596 seq_puts(m, buf); 2196 seq_puts(m, buf);
2597 seq_putc(m, '\n'); 2197 seq_putc(m, '\n');
2598out_unlock: 2198out_unlock:
2599 mutex_unlock(&manage_mutex); 2199 cgroup_unlock();
2600 put_task_struct(tsk); 2200 put_task_struct(tsk);
2601out_free: 2201out_free:
2602 kfree(buf); 2202 kfree(buf);
@@ -2616,6 +2216,7 @@ const struct file_operations proc_cpuset_operations = {
2616 .llseek = seq_lseek, 2216 .llseek = seq_lseek,
2617 .release = single_release, 2217 .release = single_release,
2618}; 2218};
2219#endif /* CONFIG_PROC_PID_CPUSET */
2619 2220
2620/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 2221/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
2621char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) 2222char *cpuset_task_status_allowed(struct task_struct *task, char *buffer)
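
Taken together, the cpuset.c changes above replace the private cpuset filesystem with a set of cgroup_subsys callbacks (create, destroy, can_attach, attach, populate, post_clone) registered through struct cgroup_subsys. A minimal sketch of another subsystem wired up the same way; example_subsys and example_subsys_id are hypothetical, and cgroup_subsys_state() is assumed to be the generic accessor behind the cgroup_cs()/task_cs() helpers used above:

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>

/* Hypothetical per-cgroup state; cpuset embeds its css the same way. */
struct example_state {
	struct cgroup_subsys_state css;
	int value;
};

/* Assumed accessor, mirroring cgroup_cs()/task_cs() in the hunks above. */
static inline struct example_state *cgroup_ex(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont, example_subsys_id),
			    struct example_state, css);
}

static struct cgroup_subsys_state *example_create(struct cgroup_subsys *ss,
						  struct cgroup *cont)
{
	struct example_state *es = kzalloc(sizeof(*es), GFP_KERNEL);

	if (!es)
		return ERR_PTR(-ENOMEM);
	return &es->css;
}

static void example_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
{
	kfree(cgroup_ex(cont));
}

/* example_subsys_id is assumed to be declared alongside cpuset_subsys_id. */
struct cgroup_subsys example_subsys = {
	.name		= "example",
	.create		= example_create,
	.destroy	= example_destroy,
	.subsys_id	= example_subsys_id,
};
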
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 09e9574eeb26..10e43fd8b721 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -115,6 +115,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
115 tmp += timespec_to_ns(&ts); 115 tmp += timespec_to_ns(&ts);
116 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; 116 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
117 117
118 tmp = (s64)d->cpu_scaled_run_real_total;
119 cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
120 tmp += timespec_to_ns(&ts);
121 d->cpu_scaled_run_real_total =
122 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
123
118 /* 124 /*
119 * No locking available for sched_info (and too expensive to add one) 125 * No locking available for sched_info (and too expensive to add one)
120 * Mitigate by taking snapshot of values 126 * Mitigate by taking snapshot of values
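
The delayacct hunk above accumulates the new scaled CPU run time with the same wrap guard already used for cpu_run_real_total just before it: the total is rebuilt in a 64-bit temporary and reset to zero if the addition wrapped past the signed range. A small standalone C illustration of that guard; the function and values are made up for the demo, and the addition is done in unsigned arithmetic so the wrap itself stays well defined outside the kernel:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the "(tmp < (s64)total) ? 0 : tmp" idiom from the hunk above:
 * if adding delta pushed the running total past the signed 64-bit range,
 * restart the counter at zero instead of publishing a wrapped value. */
static uint64_t accumulate_saturating(uint64_t total, uint64_t delta)
{
	uint64_t tmp = total + delta;	/* unsigned add: wrap is defined */

	return ((int64_t)tmp < (int64_t)total) ? 0 : tmp;
}

int main(void)
{
	uint64_t t = 0;

	t = accumulate_saturating(t, 1500);		/* normal case: 1500 */
	printf("%llu\n", (unsigned long long)t);

	t = accumulate_saturating((uint64_t)INT64_MAX - 10, 100); /* wraps: 0 */
	printf("%llu\n", (unsigned long long)t);
	return 0;
}
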
diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c
deleted file mode 100644
index 0d98827887a7..000000000000
--- a/kernel/die_notifier.c
+++ /dev/null
@@ -1,38 +0,0 @@
1
2#include <linux/module.h>
3#include <linux/notifier.h>
4#include <linux/vmalloc.h>
5#include <linux/kdebug.h>
6
7
8static ATOMIC_NOTIFIER_HEAD(die_chain);
9
10int notify_die(enum die_val val, const char *str,
11 struct pt_regs *regs, long err, int trap, int sig)
12{
13 struct die_args args = {
14 .regs = regs,
15 .str = str,
16 .err = err,
17 .trapnr = trap,
18 .signr = sig,
19
20 };
21
22 return atomic_notifier_call_chain(&die_chain, val, &args);
23}
24
25int register_die_notifier(struct notifier_block *nb)
26{
27 vmalloc_sync_all();
28 return atomic_notifier_chain_register(&die_chain, nb);
29}
30EXPORT_SYMBOL_GPL(register_die_notifier);
31
32int unregister_die_notifier(struct notifier_block *nb)
33{
34 return atomic_notifier_chain_unregister(&die_chain, nb);
35}
36EXPORT_SYMBOL_GPL(unregister_die_notifier);
37
38
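
kernel/die_notifier.c is deleted here; per the diffstat the die-chain helpers move into the new kernel/notifier.c, so register_die_notifier(), unregister_die_notifier() and notify_die() remain available to callers. A hedged sketch of a module using that interface, with the callback reading the struct die_args that the deleted notify_die() builds; the module name and message are illustrative only:

#include <linux/init.h>
#include <linux/kdebug.h>
#include <linux/module.h>
#include <linux/notifier.h>

/* Called for every notify_die() event; val and data match the arguments
 * the deleted notify_die() passes down the chain. */
static int example_die_handler(struct notifier_block *self,
			       unsigned long val, void *data)
{
	struct die_args *args = data;

	printk(KERN_INFO "die event %lu, trap %d\n", val, args->trapnr);
	return NOTIFY_DONE;	/* let the other notifiers run too */
}

static struct notifier_block example_die_nb = {
	.notifier_call = example_die_handler,
};

static int __init example_init(void)
{
	return register_die_notifier(&example_die_nb);
}

static void __exit example_exit(void)
{
	unregister_die_notifier(&example_die_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
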
diff --git a/kernel/dma.c b/kernel/dma.c
index 937b13ca33ba..6a82bb716dac 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -20,7 +20,7 @@
20#include <asm/dma.h> 20#include <asm/dma.h>
21#include <asm/system.h> 21#include <asm/system.h>
22 22
23 23
24 24
25/* A note on resource allocation: 25/* A note on resource allocation:
26 * 26 *
@@ -95,7 +95,7 @@ void free_dma(unsigned int dmanr)
95 if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) { 95 if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
96 printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr); 96 printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr);
97 return; 97 return;
98 } 98 }
99 99
100} /* free_dma */ 100} /* free_dma */
101 101
@@ -121,8 +121,8 @@ static int proc_dma_show(struct seq_file *m, void *v)
121 121
122 for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) { 122 for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
123 if (dma_chan_busy[i].lock) { 123 if (dma_chan_busy[i].lock) {
124 seq_printf(m, "%2d: %s\n", i, 124 seq_printf(m, "%2d: %s\n", i,
125 dma_chan_busy[i].device_id); 125 dma_chan_busy[i].device_id);
126 } 126 }
127 } 127 }
128 return 0; 128 return 0;
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 3c2eaea66b1e..a9e6bad9f706 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -57,7 +57,7 @@ lookup_exec_domain(u_long personality)
57{ 57{
58 struct exec_domain * ep; 58 struct exec_domain * ep;
59 u_long pers = personality(personality); 59 u_long pers = personality(personality);
60 60
61 read_lock(&exec_domains_lock); 61 read_lock(&exec_domains_lock);
62 for (ep = exec_domains; ep; ep = ep->next) { 62 for (ep = exec_domains; ep; ep = ep->next) {
63 if (pers >= ep->pers_low && pers <= ep->pers_high) 63 if (pers >= ep->pers_low && pers <= ep->pers_high)
diff --git a/kernel/exit.c b/kernel/exit.c
index 2c704c86edb3..f1aec27f1df0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,7 +31,7 @@
31#include <linux/taskstats_kern.h> 31#include <linux/taskstats_kern.h>
32#include <linux/delayacct.h> 32#include <linux/delayacct.h>
33#include <linux/freezer.h> 33#include <linux/freezer.h>
34#include <linux/cpuset.h> 34#include <linux/cgroup.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/signal.h> 36#include <linux/signal.h>
37#include <linux/posix-timers.h> 37#include <linux/posix-timers.h>
@@ -148,6 +148,7 @@ void release_task(struct task_struct * p)
148 int zap_leader; 148 int zap_leader;
149repeat: 149repeat:
150 atomic_dec(&p->user->processes); 150 atomic_dec(&p->user->processes);
151 proc_flush_task(p);
151 write_lock_irq(&tasklist_lock); 152 write_lock_irq(&tasklist_lock);
152 ptrace_unlink(p); 153 ptrace_unlink(p);
153 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 154 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
@@ -175,7 +176,6 @@ repeat:
175 } 176 }
176 177
177 write_unlock_irq(&tasklist_lock); 178 write_unlock_irq(&tasklist_lock);
178 proc_flush_task(p);
179 release_thread(p); 179 release_thread(p);
180 call_rcu(&p->rcu, delayed_put_task_struct); 180 call_rcu(&p->rcu, delayed_put_task_struct);
181 181
@@ -221,7 +221,7 @@ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignor
221 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 221 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
222 if (p == ignored_task 222 if (p == ignored_task
223 || p->exit_state 223 || p->exit_state
224 || is_init(p->real_parent)) 224 || is_global_init(p->real_parent))
225 continue; 225 continue;
226 if (task_pgrp(p->real_parent) != pgrp && 226 if (task_pgrp(p->real_parent) != pgrp &&
227 task_session(p->real_parent) == task_session(p)) { 227 task_session(p->real_parent) == task_session(p)) {
@@ -299,14 +299,14 @@ void __set_special_pids(pid_t session, pid_t pgrp)
299{ 299{
300 struct task_struct *curr = current->group_leader; 300 struct task_struct *curr = current->group_leader;
301 301
302 if (process_session(curr) != session) { 302 if (task_session_nr(curr) != session) {
303 detach_pid(curr, PIDTYPE_SID); 303 detach_pid(curr, PIDTYPE_SID);
304 set_signal_session(curr->signal, session); 304 set_task_session(curr, session);
305 attach_pid(curr, PIDTYPE_SID, find_pid(session)); 305 attach_pid(curr, PIDTYPE_SID, find_pid(session));
306 } 306 }
307 if (process_group(curr) != pgrp) { 307 if (task_pgrp_nr(curr) != pgrp) {
308 detach_pid(curr, PIDTYPE_PGID); 308 detach_pid(curr, PIDTYPE_PGID);
309 curr->signal->pgrp = pgrp; 309 set_task_pgrp(curr, pgrp);
310 attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp)); 310 attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp));
311 } 311 }
312} 312}
@@ -400,11 +400,12 @@ void daemonize(const char *name, ...)
400 current->fs = fs; 400 current->fs = fs;
401 atomic_inc(&fs->count); 401 atomic_inc(&fs->count);
402 402
403 exit_task_namespaces(current); 403 if (current->nsproxy != init_task.nsproxy) {
404 current->nsproxy = init_task.nsproxy; 404 get_nsproxy(init_task.nsproxy);
405 get_task_namespaces(current); 405 switch_task_namespaces(current, init_task.nsproxy);
406 }
406 407
407 exit_files(current); 408 exit_files(current);
408 current->files = init_task.files; 409 current->files = init_task.files;
409 atomic_inc(&current->files->count); 410 atomic_inc(&current->files->count);
410 411
@@ -492,7 +493,7 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
492} 493}
493EXPORT_SYMBOL(reset_files_struct); 494EXPORT_SYMBOL(reset_files_struct);
494 495
495static inline void __exit_files(struct task_struct *tsk) 496static void __exit_files(struct task_struct *tsk)
496{ 497{
497 struct files_struct * files = tsk->files; 498 struct files_struct * files = tsk->files;
498 499
@@ -509,7 +510,7 @@ void exit_files(struct task_struct *tsk)
509 __exit_files(tsk); 510 __exit_files(tsk);
510} 511}
511 512
512static inline void __put_fs_struct(struct fs_struct *fs) 513static void __put_fs_struct(struct fs_struct *fs)
513{ 514{
514 /* No need to hold fs->lock if we are killing it */ 515 /* No need to hold fs->lock if we are killing it */
515 if (atomic_dec_and_test(&fs->count)) { 516 if (atomic_dec_and_test(&fs->count)) {
@@ -530,7 +531,7 @@ void put_fs_struct(struct fs_struct *fs)
530 __put_fs_struct(fs); 531 __put_fs_struct(fs);
531} 532}
532 533
533static inline void __exit_fs(struct task_struct *tsk) 534static void __exit_fs(struct task_struct *tsk)
534{ 535{
535 struct fs_struct * fs = tsk->fs; 536 struct fs_struct * fs = tsk->fs;
536 537
@@ -665,19 +666,22 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
665 * the child reaper process (ie "init") in our pid 666 * the child reaper process (ie "init") in our pid
666 * space. 667 * space.
667 */ 668 */
668static void 669static void forget_original_parent(struct task_struct *father)
669forget_original_parent(struct task_struct *father, struct list_head *to_release)
670{ 670{
671 struct task_struct *p, *reaper = father; 671 struct task_struct *p, *n, *reaper = father;
672 struct list_head *_p, *_n; 672 struct list_head ptrace_dead;
673
674 INIT_LIST_HEAD(&ptrace_dead);
675
676 write_lock_irq(&tasklist_lock);
673 677
674 do { 678 do {
675 reaper = next_thread(reaper); 679 reaper = next_thread(reaper);
676 if (reaper == father) { 680 if (reaper == father) {
677 reaper = child_reaper(father); 681 reaper = task_child_reaper(father);
678 break; 682 break;
679 } 683 }
680 } while (reaper->exit_state); 684 } while (reaper->flags & PF_EXITING);
681 685
682 /* 686 /*
683 * There are only two places where our children can be: 687 * There are only two places where our children can be:
@@ -687,9 +691,8 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
687 * 691 *
688 * Search them and reparent children. 692 * Search them and reparent children.
689 */ 693 */
690 list_for_each_safe(_p, _n, &father->children) { 694 list_for_each_entry_safe(p, n, &father->children, sibling) {
691 int ptrace; 695 int ptrace;
692 p = list_entry(_p, struct task_struct, sibling);
693 696
694 ptrace = p->ptrace; 697 ptrace = p->ptrace;
695 698
@@ -715,13 +718,23 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
715 * while it was being traced by us, to be able to see it in wait4. 718 * while it was being traced by us, to be able to see it in wait4.
716 */ 719 */
717 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) 720 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
718 list_add(&p->ptrace_list, to_release); 721 list_add(&p->ptrace_list, &ptrace_dead);
719 } 722 }
720 list_for_each_safe(_p, _n, &father->ptrace_children) { 723
721 p = list_entry(_p, struct task_struct, ptrace_list); 724 list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
722 p->real_parent = reaper; 725 p->real_parent = reaper;
723 reparent_thread(p, father, 1); 726 reparent_thread(p, father, 1);
724 } 727 }
728
729 write_unlock_irq(&tasklist_lock);
730 BUG_ON(!list_empty(&father->children));
731 BUG_ON(!list_empty(&father->ptrace_children));
732
733 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
734 list_del_init(&p->ptrace_list);
735 release_task(p);
736 }
737
725} 738}
726 739
727/* 740/*
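
forget_original_parent() above now uses list_for_each_entry_safe(), dropping the manual list_entry() step, and collects ptrace_dead locally so the release_task() calls happen after tasklist_lock is dropped. A short kernel-style sketch of the safe-iteration idiom itself; the item structure and helper are hypothetical:

#include <linux/list.h>
#include <linux/slab.h>

struct item {
	int val;
	struct list_head node;
};

/* Deleting entries while walking the list needs the _safe variant: 'n'
 * caches the next entry so freeing 'pos' cannot corrupt the walk, just
 * as the rewritten forget_original_parent() does with 'p' and 'n'. */
static void drain_items(struct list_head *head)
{
	struct item *pos, *n;

	list_for_each_entry_safe(pos, n, head, node) {
		list_del_init(&pos->node);
		kfree(pos);
	}
}
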
@@ -732,7 +745,6 @@ static void exit_notify(struct task_struct *tsk)
732{ 745{
733 int state; 746 int state;
734 struct task_struct *t; 747 struct task_struct *t;
735 struct list_head ptrace_dead, *_p, *_n;
736 struct pid *pgrp; 748 struct pid *pgrp;
737 749
738 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) 750 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
@@ -753,8 +765,6 @@ static void exit_notify(struct task_struct *tsk)
753 spin_unlock_irq(&tsk->sighand->siglock); 765 spin_unlock_irq(&tsk->sighand->siglock);
754 } 766 }
755 767
756 write_lock_irq(&tasklist_lock);
757
758 /* 768 /*
759 * This does two things: 769 * This does two things:
760 * 770 *
@@ -763,12 +773,10 @@ static void exit_notify(struct task_struct *tsk)
763 * as a result of our exiting, and if they have any stopped 773 * as a result of our exiting, and if they have any stopped
764 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 774 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
765 */ 775 */
776 forget_original_parent(tsk);
777 exit_task_namespaces(tsk);
766 778
767 INIT_LIST_HEAD(&ptrace_dead); 779 write_lock_irq(&tasklist_lock);
768 forget_original_parent(tsk, &ptrace_dead);
769 BUG_ON(!list_empty(&tsk->children));
770 BUG_ON(!list_empty(&tsk->ptrace_children));
771
772 /* 780 /*
773 * Check to see if any process groups have become orphaned 781 * Check to see if any process groups have become orphaned
774 * as a result of our exiting, and if they have any stopped 782 * as a result of our exiting, and if they have any stopped
@@ -792,7 +800,7 @@ static void exit_notify(struct task_struct *tsk)
792 /* Let father know we died 800 /* Let father know we died
793 * 801 *
794 * Thread signals are configurable, but you aren't going to use 802 * Thread signals are configurable, but you aren't going to use
795 * that to send signals to arbitrary processes. 803 * that to send signals to arbitrary processes.
796 * That stops right now. 804 * That stops right now.
797 * 805 *
798 * If the parent exec id doesn't match the exec id we saved 806 * If the parent exec id doesn't match the exec id we saved
@@ -833,12 +841,6 @@ static void exit_notify(struct task_struct *tsk)
833 841
834 write_unlock_irq(&tasklist_lock); 842 write_unlock_irq(&tasklist_lock);
835 843
836 list_for_each_safe(_p, _n, &ptrace_dead) {
837 list_del_init(_p);
838 t = list_entry(_p, struct task_struct, ptrace_list);
839 release_task(t);
840 }
841
842 /* If the process is dead, release it - nobody will wait for it */ 844 /* If the process is dead, release it - nobody will wait for it */
843 if (state == EXIT_DEAD) 845 if (state == EXIT_DEAD)
844 release_task(tsk); 846 release_task(tsk);
@@ -874,10 +876,35 @@ static inline void check_stack_usage(void) {}
874 876
875static inline void exit_child_reaper(struct task_struct *tsk) 877static inline void exit_child_reaper(struct task_struct *tsk)
876{ 878{
877 if (likely(tsk->group_leader != child_reaper(tsk))) 879 if (likely(tsk->group_leader != task_child_reaper(tsk)))
878 return; 880 return;
879 881
880 panic("Attempted to kill init!"); 882 if (tsk->nsproxy->pid_ns == &init_pid_ns)
883 panic("Attempted to kill init!");
884
885 /*
886 * @tsk is the last thread in the 'cgroup-init' and is exiting.
887 * Terminate all remaining processes in the namespace and reap them
888 * before exiting @tsk.
889 *
890 * Note that @tsk (last thread of cgroup-init) may not necessarily
891 * be the child-reaper (i.e main thread of cgroup-init) of the
892 * namespace i.e the child_reaper may have already exited.
893 *
894 * Even after a child_reaper exits, we let it inherit orphaned children,
895 * because, pid_ns->child_reaper remains valid as long as there is
896 * at least one living sub-thread in the cgroup init.
897
898 * This living sub-thread of the cgroup-init will be notified when
899 * a child inherited by the 'child-reaper' exits (do_notify_parent()
900 * uses __group_send_sig_info()). Further, when reaping child processes,
901 * do_wait() iterates over children of all living sub threads.
902
903 * i.e even though 'child_reaper' thread is listed as the parent of the
904 * orphaned children, any living sub-thread in the cgroup-init can
905 * perform the role of the child_reaper.
906 */
907 zap_pid_ns_processes(tsk->nsproxy->pid_ns);
881} 908}
882 909
883fastcall NORET_TYPE void do_exit(long code) 910fastcall NORET_TYPE void do_exit(long code)
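The comment above describes what happens when the last thread of a pid namespace's init exits: zap_pid_ns_processes() terminates whatever is left in that namespace. A minimal user-space sketch of that behaviour follows; it assumes a kernel carrying this patch set, CAP_SYS_ADMIN, and defines CLONE_NEWPID locally in case the libc headers do not know it yet.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef CLONE_NEWPID
#define CLONE_NEWPID 0x20000000
#endif

static char stack[64 * 1024];

static int ns_init(void *arg)
{
	printf("inside the namespace this is pid %d\n", getpid());	/* prints 1 */
	if (fork() == 0) {		/* pid 2 inside the namespace */
		pause();		/* killed once the namespace init is gone */
		_exit(0);
	}
	return 0;			/* namespace init exits -> zap_pid_ns_processes() */
}

int main(void)
{
	pid_t pid = clone(ns_init, stack + sizeof(stack),
			  CLONE_NEWPID | SIGCHLD, NULL);

	if (pid < 0) {
		perror("clone");
		return 1;
	}
	printf("parent sees the namespace init as pid %d\n", (int)pid);
	return waitpid(pid, NULL, 0) < 0;
}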
@@ -932,7 +959,7 @@ fastcall NORET_TYPE void do_exit(long code)
932 959
933 if (unlikely(in_atomic())) 960 if (unlikely(in_atomic()))
934 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 961 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
935 current->comm, current->pid, 962 current->comm, task_pid_nr(current),
936 preempt_count()); 963 preempt_count());
937 964
938 acct_update_integrals(tsk); 965 acct_update_integrals(tsk);
@@ -972,7 +999,7 @@ fastcall NORET_TYPE void do_exit(long code)
972 __exit_fs(tsk); 999 __exit_fs(tsk);
973 check_stack_usage(); 1000 check_stack_usage();
974 exit_thread(); 1001 exit_thread();
975 cpuset_exit(tsk); 1002 cgroup_exit(tsk, 1);
976 exit_keys(tsk); 1003 exit_keys(tsk);
977 1004
978 if (group_dead && tsk->signal->leader) 1005 if (group_dead && tsk->signal->leader)
@@ -983,7 +1010,6 @@ fastcall NORET_TYPE void do_exit(long code)
983 module_put(tsk->binfmt->module); 1010 module_put(tsk->binfmt->module);
984 1011
985 proc_exit_connector(tsk); 1012 proc_exit_connector(tsk);
986 exit_task_namespaces(tsk);
987 exit_notify(tsk); 1013 exit_notify(tsk);
988#ifdef CONFIG_NUMA 1014#ifdef CONFIG_NUMA
989 mpol_free(tsk->mempolicy); 1015 mpol_free(tsk->mempolicy);
@@ -1086,15 +1112,17 @@ asmlinkage void sys_exit_group(int error_code)
1086static int eligible_child(pid_t pid, int options, struct task_struct *p) 1112static int eligible_child(pid_t pid, int options, struct task_struct *p)
1087{ 1113{
1088 int err; 1114 int err;
1115 struct pid_namespace *ns;
1089 1116
1117 ns = current->nsproxy->pid_ns;
1090 if (pid > 0) { 1118 if (pid > 0) {
1091 if (p->pid != pid) 1119 if (task_pid_nr_ns(p, ns) != pid)
1092 return 0; 1120 return 0;
1093 } else if (!pid) { 1121 } else if (!pid) {
1094 if (process_group(p) != process_group(current)) 1122 if (task_pgrp_nr_ns(p, ns) != task_pgrp_vnr(current))
1095 return 0; 1123 return 0;
1096 } else if (pid != -1) { 1124 } else if (pid != -1) {
1097 if (process_group(p) != -pid) 1125 if (task_pgrp_nr_ns(p, ns) != -pid)
1098 return 0; 1126 return 0;
1099 } 1127 }
1100 1128
@@ -1164,9 +1192,12 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1164{ 1192{
1165 unsigned long state; 1193 unsigned long state;
1166 int retval, status, traced; 1194 int retval, status, traced;
1195 struct pid_namespace *ns;
1196
1197 ns = current->nsproxy->pid_ns;
1167 1198
1168 if (unlikely(noreap)) { 1199 if (unlikely(noreap)) {
1169 pid_t pid = p->pid; 1200 pid_t pid = task_pid_nr_ns(p, ns);
1170 uid_t uid = p->uid; 1201 uid_t uid = p->uid;
1171 int exit_code = p->exit_code; 1202 int exit_code = p->exit_code;
1172 int why, status; 1203 int why, status;
@@ -1285,11 +1316,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1285 retval = put_user(status, &infop->si_status); 1316 retval = put_user(status, &infop->si_status);
1286 } 1317 }
1287 if (!retval && infop) 1318 if (!retval && infop)
1288 retval = put_user(p->pid, &infop->si_pid); 1319 retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid);
1289 if (!retval && infop) 1320 if (!retval && infop)
1290 retval = put_user(p->uid, &infop->si_uid); 1321 retval = put_user(p->uid, &infop->si_uid);
1291 if (!retval) 1322 if (!retval)
1292 retval = p->pid; 1323 retval = task_pid_nr_ns(p, ns);
1293 1324
1294 if (traced) { 1325 if (traced) {
1295 write_lock_irq(&tasklist_lock); 1326 write_lock_irq(&tasklist_lock);
@@ -1326,6 +1357,7 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
1326 int __user *stat_addr, struct rusage __user *ru) 1357 int __user *stat_addr, struct rusage __user *ru)
1327{ 1358{
1328 int retval, exit_code; 1359 int retval, exit_code;
1360 struct pid_namespace *ns;
1329 1361
1330 if (!p->exit_code) 1362 if (!p->exit_code)
1331 return 0; 1363 return 0;
@@ -1344,11 +1376,12 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
1344 * keep holding onto the tasklist_lock while we call getrusage and 1376 * keep holding onto the tasklist_lock while we call getrusage and
1345 * possibly take page faults for user memory. 1377 * possibly take page faults for user memory.
1346 */ 1378 */
1379 ns = current->nsproxy->pid_ns;
1347 get_task_struct(p); 1380 get_task_struct(p);
1348 read_unlock(&tasklist_lock); 1381 read_unlock(&tasklist_lock);
1349 1382
1350 if (unlikely(noreap)) { 1383 if (unlikely(noreap)) {
1351 pid_t pid = p->pid; 1384 pid_t pid = task_pid_nr_ns(p, ns);
1352 uid_t uid = p->uid; 1385 uid_t uid = p->uid;
1353 int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; 1386 int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
1354 1387
@@ -1419,11 +1452,11 @@ bail_ref:
1419 if (!retval && infop) 1452 if (!retval && infop)
1420 retval = put_user(exit_code, &infop->si_status); 1453 retval = put_user(exit_code, &infop->si_status);
1421 if (!retval && infop) 1454 if (!retval && infop)
1422 retval = put_user(p->pid, &infop->si_pid); 1455 retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid);
1423 if (!retval && infop) 1456 if (!retval && infop)
1424 retval = put_user(p->uid, &infop->si_uid); 1457 retval = put_user(p->uid, &infop->si_uid);
1425 if (!retval) 1458 if (!retval)
1426 retval = p->pid; 1459 retval = task_pid_nr_ns(p, ns);
1427 put_task_struct(p); 1460 put_task_struct(p);
1428 1461
1429 BUG_ON(!retval); 1462 BUG_ON(!retval);
@@ -1443,6 +1476,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1443 int retval; 1476 int retval;
1444 pid_t pid; 1477 pid_t pid;
1445 uid_t uid; 1478 uid_t uid;
1479 struct pid_namespace *ns;
1446 1480
1447 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1481 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1448 return 0; 1482 return 0;
@@ -1457,7 +1491,8 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1457 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1491 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1458 spin_unlock_irq(&p->sighand->siglock); 1492 spin_unlock_irq(&p->sighand->siglock);
1459 1493
1460 pid = p->pid; 1494 ns = current->nsproxy->pid_ns;
1495 pid = task_pid_nr_ns(p, ns);
1461 uid = p->uid; 1496 uid = p->uid;
1462 get_task_struct(p); 1497 get_task_struct(p);
1463 read_unlock(&tasklist_lock); 1498 read_unlock(&tasklist_lock);
@@ -1468,7 +1503,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1468 if (!retval && stat_addr) 1503 if (!retval && stat_addr)
1469 retval = put_user(0xffff, stat_addr); 1504 retval = put_user(0xffff, stat_addr);
1470 if (!retval) 1505 if (!retval)
1471 retval = p->pid; 1506 retval = task_pid_nr_ns(p, ns);
1472 } else { 1507 } else {
1473 retval = wait_noreap_copyout(p, pid, uid, 1508 retval = wait_noreap_copyout(p, pid, uid,
1474 CLD_CONTINUED, SIGCONT, 1509 CLD_CONTINUED, SIGCONT,
@@ -1517,12 +1552,9 @@ repeat:
1517 tsk = current; 1552 tsk = current;
1518 do { 1553 do {
1519 struct task_struct *p; 1554 struct task_struct *p;
1520 struct list_head *_p;
1521 int ret; 1555 int ret;
1522 1556
1523 list_for_each(_p,&tsk->children) { 1557 list_for_each_entry(p, &tsk->children, sibling) {
1524 p = list_entry(_p, struct task_struct, sibling);
1525
1526 ret = eligible_child(pid, options, p); 1558 ret = eligible_child(pid, options, p);
1527 if (!ret) 1559 if (!ret)
1528 continue; 1560 continue;
@@ -1604,9 +1636,8 @@ check_continued:
1604 } 1636 }
1605 } 1637 }
1606 if (!flag) { 1638 if (!flag) {
1607 list_for_each(_p, &tsk->ptrace_children) { 1639 list_for_each_entry(p, &tsk->ptrace_children,
1608 p = list_entry(_p, struct task_struct, 1640 ptrace_list) {
1609 ptrace_list);
1610 if (!eligible_child(pid, options, p)) 1641 if (!eligible_child(pid, options, p))
1611 continue; 1642 continue;
1612 flag = 1; 1643 flag = 1;
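The wait path above now identifies and reports children relative to the waiting task's pid namespace: eligible_child(), wait_task_zombie(), wait_task_stopped() and wait_task_continued() all resolve numbers through the waiter's nsproxy->pid_ns. A small illustrative helper (not in-tree code) showing that translation:

#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/sched.h>

/* Illustrative only: the pid a waiter should be shown for a child. */
static pid_t pid_seen_by_waiter(struct task_struct *waiter,
				struct task_struct *child)
{
	struct pid_namespace *ns = waiter->nsproxy->pid_ns;

	/* A return of 0 would mean the child has no number in that namespace. */
	return task_pid_nr_ns(child, ns);
}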
diff --git a/kernel/fork.c b/kernel/fork.c
index 490495a39c7e..ddafdfac9456 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -29,7 +29,7 @@
29#include <linux/nsproxy.h> 29#include <linux/nsproxy.h>
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/cpuset.h> 32#include <linux/cgroup.h>
33#include <linux/security.h> 33#include <linux/security.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
@@ -50,6 +50,7 @@
50#include <linux/taskstats_kern.h> 50#include <linux/taskstats_kern.h>
51#include <linux/random.h> 51#include <linux/random.h>
52#include <linux/tty.h> 52#include <linux/tty.h>
53#include <linux/proc_fs.h>
53 54
54#include <asm/pgtable.h> 55#include <asm/pgtable.h>
55#include <asm/pgalloc.h> 56#include <asm/pgalloc.h>
@@ -116,7 +117,7 @@ EXPORT_SYMBOL(free_task);
116 117
117void __put_task_struct(struct task_struct *tsk) 118void __put_task_struct(struct task_struct *tsk)
118{ 119{
119 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); 120 WARN_ON(!tsk->exit_state);
120 WARN_ON(atomic_read(&tsk->usage)); 121 WARN_ON(atomic_read(&tsk->usage));
121 WARN_ON(tsk == current); 122 WARN_ON(tsk == current);
122 123
@@ -205,7 +206,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
205} 206}
206 207
207#ifdef CONFIG_MMU 208#ifdef CONFIG_MMU
208static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 209static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
209{ 210{
210 struct vm_area_struct *mpnt, *tmp, **pprev; 211 struct vm_area_struct *mpnt, *tmp, **pprev;
211 struct rb_node **rb_link, *rb_parent; 212 struct rb_node **rb_link, *rb_parent;
@@ -268,7 +269,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
268 get_file(file); 269 get_file(file);
269 if (tmp->vm_flags & VM_DENYWRITE) 270 if (tmp->vm_flags & VM_DENYWRITE)
270 atomic_dec(&inode->i_writecount); 271 atomic_dec(&inode->i_writecount);
271 272
272 /* insert tmp into the share list, just after mpnt */ 273 /* insert tmp into the share list, just after mpnt */
273 spin_lock(&file->f_mapping->i_mmap_lock); 274 spin_lock(&file->f_mapping->i_mmap_lock);
274 tmp->vm_truncate_count = mpnt->vm_truncate_count; 275 tmp->vm_truncate_count = mpnt->vm_truncate_count;
@@ -331,7 +332,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
331#define mm_free_pgd(mm) 332#define mm_free_pgd(mm)
332#endif /* CONFIG_MMU */ 333#endif /* CONFIG_MMU */
333 334
334 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); 335__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
335 336
336#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) 337#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
337#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 338#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
@@ -583,7 +584,7 @@ fail_nomem:
583 return retval; 584 return retval;
584} 585}
585 586
586static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) 587static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
587{ 588{
588 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); 589 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
589 /* We don't need to lock fs - think why ;-) */ 590 /* We don't need to lock fs - think why ;-) */
@@ -615,7 +616,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
615 616
616EXPORT_SYMBOL_GPL(copy_fs_struct); 617EXPORT_SYMBOL_GPL(copy_fs_struct);
617 618
618static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) 619static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
619{ 620{
620 if (clone_flags & CLONE_FS) { 621 if (clone_flags & CLONE_FS) {
621 atomic_inc(&current->fs->count); 622 atomic_inc(&current->fs->count);
@@ -738,8 +739,8 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
738 /* compute the remainder to be cleared */ 739 /* compute the remainder to be cleared */
739 size = (new_fdt->max_fds - open_files) * sizeof(struct file *); 740 size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
740 741
741 /* This is long word aligned thus could use a optimized version */ 742 /* This is long word aligned thus could use a optimized version */
742 memset(new_fds, 0, size); 743 memset(new_fds, 0, size);
743 744
744 if (new_fdt->max_fds > open_files) { 745 if (new_fdt->max_fds > open_files) {
745 int left = (new_fdt->max_fds-open_files)/8; 746 int left = (new_fdt->max_fds-open_files)/8;
@@ -818,7 +819,7 @@ int unshare_files(void)
818 819
819EXPORT_SYMBOL(unshare_files); 820EXPORT_SYMBOL(unshare_files);
820 821
821static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) 822static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
822{ 823{
823 struct sighand_struct *sig; 824 struct sighand_struct *sig;
824 825
@@ -841,7 +842,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
841 kmem_cache_free(sighand_cachep, sighand); 842 kmem_cache_free(sighand_cachep, sighand);
842} 843}
843 844
844static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) 845static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
845{ 846{
846 struct signal_struct *sig; 847 struct signal_struct *sig;
847 int ret; 848 int ret;
@@ -923,7 +924,7 @@ void __cleanup_signal(struct signal_struct *sig)
923 kmem_cache_free(signal_cachep, sig); 924 kmem_cache_free(signal_cachep, sig);
924} 925}
925 926
926static inline void cleanup_signal(struct task_struct *tsk) 927static void cleanup_signal(struct task_struct *tsk)
927{ 928{
928 struct signal_struct *sig = tsk->signal; 929 struct signal_struct *sig = tsk->signal;
929 930
@@ -933,7 +934,7 @@ static inline void cleanup_signal(struct task_struct *tsk)
933 __cleanup_signal(sig); 934 __cleanup_signal(sig);
934} 935}
935 936
936static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) 937static void copy_flags(unsigned long clone_flags, struct task_struct *p)
937{ 938{
938 unsigned long new_flags = p->flags; 939 unsigned long new_flags = p->flags;
939 940
@@ -942,16 +943,17 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
942 if (!(clone_flags & CLONE_PTRACE)) 943 if (!(clone_flags & CLONE_PTRACE))
943 p->ptrace = 0; 944 p->ptrace = 0;
944 p->flags = new_flags; 945 p->flags = new_flags;
946 clear_freeze_flag(p);
945} 947}
946 948
947asmlinkage long sys_set_tid_address(int __user *tidptr) 949asmlinkage long sys_set_tid_address(int __user *tidptr)
948{ 950{
949 current->clear_child_tid = tidptr; 951 current->clear_child_tid = tidptr;
950 952
951 return current->pid; 953 return task_pid_vnr(current);
952} 954}
953 955
954static inline void rt_mutex_init_task(struct task_struct *p) 956static void rt_mutex_init_task(struct task_struct *p)
955{ 957{
956 spin_lock_init(&p->pi_lock); 958 spin_lock_init(&p->pi_lock);
957#ifdef CONFIG_RT_MUTEXES 959#ifdef CONFIG_RT_MUTEXES
@@ -972,12 +974,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
972 unsigned long stack_start, 974 unsigned long stack_start,
973 struct pt_regs *regs, 975 struct pt_regs *regs,
974 unsigned long stack_size, 976 unsigned long stack_size,
975 int __user *parent_tidptr,
976 int __user *child_tidptr, 977 int __user *child_tidptr,
977 struct pid *pid) 978 struct pid *pid)
978{ 979{
979 int retval; 980 int retval;
980 struct task_struct *p = NULL; 981 struct task_struct *p;
982 int cgroup_callbacks_done = 0;
981 983
982 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 984 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
983 return ERR_PTR(-EINVAL); 985 return ERR_PTR(-EINVAL);
@@ -1041,12 +1043,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1041 p->did_exec = 0; 1043 p->did_exec = 0;
1042 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1044 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1043 copy_flags(clone_flags, p); 1045 copy_flags(clone_flags, p);
1044 p->pid = pid_nr(pid);
1045 retval = -EFAULT;
1046 if (clone_flags & CLONE_PARENT_SETTID)
1047 if (put_user(p->pid, parent_tidptr))
1048 goto bad_fork_cleanup_delays_binfmt;
1049
1050 INIT_LIST_HEAD(&p->children); 1046 INIT_LIST_HEAD(&p->children);
1051 INIT_LIST_HEAD(&p->sibling); 1047 INIT_LIST_HEAD(&p->sibling);
1052 p->vfork_done = NULL; 1048 p->vfork_done = NULL;
@@ -1058,6 +1054,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1058 p->utime = cputime_zero; 1054 p->utime = cputime_zero;
1059 p->stime = cputime_zero; 1055 p->stime = cputime_zero;
1060 p->gtime = cputime_zero; 1056 p->gtime = cputime_zero;
1057 p->utimescaled = cputime_zero;
1058 p->stimescaled = cputime_zero;
1061 1059
1062#ifdef CONFIG_TASK_XACCT 1060#ifdef CONFIG_TASK_XACCT
1063 p->rchar = 0; /* I/O counter: bytes read */ 1061 p->rchar = 0; /* I/O counter: bytes read */
@@ -1068,12 +1066,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1068 task_io_accounting_init(p); 1066 task_io_accounting_init(p);
1069 acct_clear_integrals(p); 1067 acct_clear_integrals(p);
1070 1068
1071 p->it_virt_expires = cputime_zero; 1069 p->it_virt_expires = cputime_zero;
1072 p->it_prof_expires = cputime_zero; 1070 p->it_prof_expires = cputime_zero;
1073 p->it_sched_expires = 0; 1071 p->it_sched_expires = 0;
1074 INIT_LIST_HEAD(&p->cpu_timers[0]); 1072 INIT_LIST_HEAD(&p->cpu_timers[0]);
1075 INIT_LIST_HEAD(&p->cpu_timers[1]); 1073 INIT_LIST_HEAD(&p->cpu_timers[1]);
1076 INIT_LIST_HEAD(&p->cpu_timers[2]); 1074 INIT_LIST_HEAD(&p->cpu_timers[2]);
1077 1075
1078 p->lock_depth = -1; /* -1 = no lock */ 1076 p->lock_depth = -1; /* -1 = no lock */
1079 do_posix_clock_monotonic_gettime(&p->start_time); 1077 do_posix_clock_monotonic_gettime(&p->start_time);
@@ -1083,15 +1081,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1083 p->security = NULL; 1081 p->security = NULL;
1084#endif 1082#endif
1085 p->io_context = NULL; 1083 p->io_context = NULL;
1086 p->io_wait = NULL;
1087 p->audit_context = NULL; 1084 p->audit_context = NULL;
1088 cpuset_fork(p); 1085 cgroup_fork(p);
1089#ifdef CONFIG_NUMA 1086#ifdef CONFIG_NUMA
1090 p->mempolicy = mpol_copy(p->mempolicy); 1087 p->mempolicy = mpol_copy(p->mempolicy);
1091 if (IS_ERR(p->mempolicy)) { 1088 if (IS_ERR(p->mempolicy)) {
1092 retval = PTR_ERR(p->mempolicy); 1089 retval = PTR_ERR(p->mempolicy);
1093 p->mempolicy = NULL; 1090 p->mempolicy = NULL;
1094 goto bad_fork_cleanup_cpuset; 1091 goto bad_fork_cleanup_cgroup;
1095 } 1092 }
1096 mpol_fix_fork_child_flag(p); 1093 mpol_fix_fork_child_flag(p);
1097#endif 1094#endif
@@ -1124,10 +1121,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1124 p->blocked_on = NULL; /* not blocked yet */ 1121 p->blocked_on = NULL; /* not blocked yet */
1125#endif 1122#endif
1126 1123
1127 p->tgid = p->pid;
1128 if (clone_flags & CLONE_THREAD)
1129 p->tgid = current->tgid;
1130
1131 if ((retval = security_task_alloc(p))) 1124 if ((retval = security_task_alloc(p)))
1132 goto bad_fork_cleanup_policy; 1125 goto bad_fork_cleanup_policy;
1133 if ((retval = audit_alloc(p))) 1126 if ((retval = audit_alloc(p)))
@@ -1153,6 +1146,24 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1153 if (retval) 1146 if (retval)
1154 goto bad_fork_cleanup_namespaces; 1147 goto bad_fork_cleanup_namespaces;
1155 1148
1149 if (pid != &init_struct_pid) {
1150 retval = -ENOMEM;
1151 pid = alloc_pid(task_active_pid_ns(p));
1152 if (!pid)
1153 goto bad_fork_cleanup_namespaces;
1154
1155 if (clone_flags & CLONE_NEWPID) {
1156 retval = pid_ns_prepare_proc(task_active_pid_ns(p));
1157 if (retval < 0)
1158 goto bad_fork_free_pid;
1159 }
1160 }
1161
1162 p->pid = pid_nr(pid);
1163 p->tgid = p->pid;
1164 if (clone_flags & CLONE_THREAD)
1165 p->tgid = current->tgid;
1166
1156 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1167 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1157 /* 1168 /*
1158 * Clear TID on mm_release()? 1169 * Clear TID on mm_release()?
@@ -1202,6 +1213,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1202 /* Perform scheduler related setup. Assign this task to a CPU. */ 1213 /* Perform scheduler related setup. Assign this task to a CPU. */
1203 sched_fork(p, clone_flags); 1214 sched_fork(p, clone_flags);
1204 1215
1216 /* Now that the task is set up, run cgroup callbacks if
1217 * necessary. We need to run them before the task is visible
1218 * on the tasklist. */
1219 cgroup_fork_callbacks(p);
1220 cgroup_callbacks_done = 1;
1221
1205 /* Need tasklist lock for parent etc handling! */ 1222 /* Need tasklist lock for parent etc handling! */
1206 write_lock_irq(&tasklist_lock); 1223 write_lock_irq(&tasklist_lock);
1207 1224
@@ -1239,12 +1256,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1239 * A fatal signal pending means that current will exit, so the new 1256 * A fatal signal pending means that current will exit, so the new
1240 * thread can't slip out of an OOM kill (or normal SIGKILL). 1257 * thread can't slip out of an OOM kill (or normal SIGKILL).
1241 */ 1258 */
1242 recalc_sigpending(); 1259 recalc_sigpending();
1243 if (signal_pending(current)) { 1260 if (signal_pending(current)) {
1244 spin_unlock(&current->sighand->siglock); 1261 spin_unlock(&current->sighand->siglock);
1245 write_unlock_irq(&tasklist_lock); 1262 write_unlock_irq(&tasklist_lock);
1246 retval = -ERESTARTNOINTR; 1263 retval = -ERESTARTNOINTR;
1247 goto bad_fork_cleanup_namespaces; 1264 goto bad_fork_free_pid;
1248 } 1265 }
1249 1266
1250 if (clone_flags & CLONE_THREAD) { 1267 if (clone_flags & CLONE_THREAD) {
@@ -1273,11 +1290,22 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1273 __ptrace_link(p, current->parent); 1290 __ptrace_link(p, current->parent);
1274 1291
1275 if (thread_group_leader(p)) { 1292 if (thread_group_leader(p)) {
1276 p->signal->tty = current->signal->tty; 1293 if (clone_flags & CLONE_NEWPID) {
1277 p->signal->pgrp = process_group(current); 1294 p->nsproxy->pid_ns->child_reaper = p;
1278 set_signal_session(p->signal, process_session(current)); 1295 p->signal->tty = NULL;
1279 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1296 set_task_pgrp(p, p->pid);
1280 attach_pid(p, PIDTYPE_SID, task_session(current)); 1297 set_task_session(p, p->pid);
1298 attach_pid(p, PIDTYPE_PGID, pid);
1299 attach_pid(p, PIDTYPE_SID, pid);
1300 } else {
1301 p->signal->tty = current->signal->tty;
1302 set_task_pgrp(p, task_pgrp_nr(current));
1303 set_task_session(p, task_session_nr(current));
1304 attach_pid(p, PIDTYPE_PGID,
1305 task_pgrp(current));
1306 attach_pid(p, PIDTYPE_SID,
1307 task_session(current));
1308 }
1281 1309
1282 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1310 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1283 __get_cpu_var(process_counts)++; 1311 __get_cpu_var(process_counts)++;
@@ -1290,8 +1318,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1290 spin_unlock(&current->sighand->siglock); 1318 spin_unlock(&current->sighand->siglock);
1291 write_unlock_irq(&tasklist_lock); 1319 write_unlock_irq(&tasklist_lock);
1292 proc_fork_connector(p); 1320 proc_fork_connector(p);
1321 cgroup_post_fork(p);
1293 return p; 1322 return p;
1294 1323
1324bad_fork_free_pid:
1325 if (pid != &init_struct_pid)
1326 free_pid(pid);
1295bad_fork_cleanup_namespaces: 1327bad_fork_cleanup_namespaces:
1296 exit_task_namespaces(p); 1328 exit_task_namespaces(p);
1297bad_fork_cleanup_keys: 1329bad_fork_cleanup_keys:
@@ -1316,10 +1348,9 @@ bad_fork_cleanup_security:
1316bad_fork_cleanup_policy: 1348bad_fork_cleanup_policy:
1317#ifdef CONFIG_NUMA 1349#ifdef CONFIG_NUMA
1318 mpol_free(p->mempolicy); 1350 mpol_free(p->mempolicy);
1319bad_fork_cleanup_cpuset: 1351bad_fork_cleanup_cgroup:
1320#endif 1352#endif
1321 cpuset_exit(p); 1353 cgroup_exit(p, cgroup_callbacks_done);
1322bad_fork_cleanup_delays_binfmt:
1323 delayacct_tsk_free(p); 1354 delayacct_tsk_free(p);
1324 if (p->binfmt) 1355 if (p->binfmt)
1325 module_put(p->binfmt->module); 1356 module_put(p->binfmt->module);
@@ -1346,7 +1377,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1346 struct task_struct *task; 1377 struct task_struct *task;
1347 struct pt_regs regs; 1378 struct pt_regs regs;
1348 1379
1349 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 1380 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1350 &init_struct_pid); 1381 &init_struct_pid);
1351 if (!IS_ERR(task)) 1382 if (!IS_ERR(task))
1352 init_idle(task, cpu); 1383 init_idle(task, cpu);
@@ -1354,7 +1385,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1354 return task; 1385 return task;
1355} 1386}
1356 1387
1357static inline int fork_traceflag (unsigned clone_flags) 1388static int fork_traceflag(unsigned clone_flags)
1358{ 1389{
1359 if (clone_flags & CLONE_UNTRACED) 1390 if (clone_flags & CLONE_UNTRACED)
1360 return 0; 1391 return 0;
@@ -1385,19 +1416,16 @@ long do_fork(unsigned long clone_flags,
1385{ 1416{
1386 struct task_struct *p; 1417 struct task_struct *p;
1387 int trace = 0; 1418 int trace = 0;
1388 struct pid *pid = alloc_pid();
1389 long nr; 1419 long nr;
1390 1420
1391 if (!pid)
1392 return -EAGAIN;
1393 nr = pid->nr;
1394 if (unlikely(current->ptrace)) { 1421 if (unlikely(current->ptrace)) {
1395 trace = fork_traceflag (clone_flags); 1422 trace = fork_traceflag (clone_flags);
1396 if (trace) 1423 if (trace)
1397 clone_flags |= CLONE_PTRACE; 1424 clone_flags |= CLONE_PTRACE;
1398 } 1425 }
1399 1426
1400 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); 1427 p = copy_process(clone_flags, stack_start, regs, stack_size,
1428 child_tidptr, NULL);
1401 /* 1429 /*
1402 * Do this prior waking up the new thread - the thread pointer 1430 * Do this prior waking up the new thread - the thread pointer
1403 * might get invalid after that point, if the thread exits quickly. 1431 * might get invalid after that point, if the thread exits quickly.
@@ -1405,6 +1433,17 @@ long do_fork(unsigned long clone_flags,
1405 if (!IS_ERR(p)) { 1433 if (!IS_ERR(p)) {
1406 struct completion vfork; 1434 struct completion vfork;
1407 1435
1436 /*
1437 * this is enough to call pid_nr_ns here, but this if
1438 * improves optimisation of regular fork()
1439 */
1440 nr = (clone_flags & CLONE_NEWPID) ?
1441 task_pid_nr_ns(p, current->nsproxy->pid_ns) :
1442 task_pid_vnr(p);
1443
1444 if (clone_flags & CLONE_PARENT_SETTID)
1445 put_user(nr, parent_tidptr);
1446
1408 if (clone_flags & CLONE_VFORK) { 1447 if (clone_flags & CLONE_VFORK) {
1409 p->vfork_done = &vfork; 1448 p->vfork_done = &vfork;
1410 init_completion(&vfork); 1449 init_completion(&vfork);
@@ -1438,7 +1477,6 @@ long do_fork(unsigned long clone_flags,
1438 } 1477 }
1439 } 1478 }
1440 } else { 1479 } else {
1441 free_pid(pid);
1442 nr = PTR_ERR(p); 1480 nr = PTR_ERR(p);
1443 } 1481 }
1444 return nr; 1482 return nr;
@@ -1483,7 +1521,7 @@ void __init proc_caches_init(void)
1483 * Check constraints on flags passed to the unshare system call and 1521 * Check constraints on flags passed to the unshare system call and
1484 * force unsharing of additional process context as appropriate. 1522 * force unsharing of additional process context as appropriate.
1485 */ 1523 */
1486static inline void check_unshare_flags(unsigned long *flags_ptr) 1524static void check_unshare_flags(unsigned long *flags_ptr)
1487{ 1525{
1488 /* 1526 /*
1489 * If unsharing a thread from a thread group, must also 1527 * If unsharing a thread from a thread group, must also
@@ -1615,7 +1653,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1615 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1653 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1616 struct files_struct *fd, *new_fd = NULL; 1654 struct files_struct *fd, *new_fd = NULL;
1617 struct sem_undo_list *new_ulist = NULL; 1655 struct sem_undo_list *new_ulist = NULL;
1618 struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; 1656 struct nsproxy *new_nsproxy = NULL;
1619 1657
1620 check_unshare_flags(&unshare_flags); 1658 check_unshare_flags(&unshare_flags);
1621 1659
@@ -1645,14 +1683,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1645 1683
1646 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { 1684 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) {
1647 1685
1648 task_lock(current);
1649
1650 if (new_nsproxy) { 1686 if (new_nsproxy) {
1651 old_nsproxy = current->nsproxy; 1687 switch_task_namespaces(current, new_nsproxy);
1652 current->nsproxy = new_nsproxy; 1688 new_nsproxy = NULL;
1653 new_nsproxy = old_nsproxy;
1654 } 1689 }
1655 1690
1691 task_lock(current);
1692
1656 if (new_fs) { 1693 if (new_fs) {
1657 fs = current->fs; 1694 fs = current->fs;
1658 current->fs = new_fs; 1695 current->fs = new_fs;
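The recurring conversion in fork.c and the files that follow is the same one: any pid value that crosses the user/kernel boundary is taken from the pid namespace of the task that will see it, via task_pid_vnr() or task_pid_nr_ns(), while the raw tsk->pid keeps its meaning only in the initial namespace. A compact sketch of the three views these helpers provide (the function name here is made up):

#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/sched.h>

/* Sketch only: the three ways a task's pid can now be read. */
static void pid_views(struct task_struct *p)
{
	pid_t global_nr = task_pid_nr(p);	/* number in the initial namespace */
	pid_t virt_nr   = task_pid_vnr(p);	/* number as current's namespace sees it */
	pid_t ns_nr     = task_pid_nr_ns(p, current->nsproxy->pid_ns);
						/* same value as virt_nr, spelled out */

	(void)global_nr; (void)virt_nr; (void)ns_nr;
}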
diff --git a/kernel/futex.c b/kernel/futex.c
index d725676d84f3..32710451dc20 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -53,6 +53,9 @@
53#include <linux/signal.h> 53#include <linux/signal.h>
54#include <linux/module.h> 54#include <linux/module.h>
55#include <linux/magic.h> 55#include <linux/magic.h>
56#include <linux/pid.h>
57#include <linux/nsproxy.h>
58
56#include <asm/futex.h> 59#include <asm/futex.h>
57 60
58#include "rtmutex_common.h" 61#include "rtmutex_common.h"
@@ -293,7 +296,7 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
293 */ 296 */
294void drop_futex_key_refs(union futex_key *key) 297void drop_futex_key_refs(union futex_key *key)
295{ 298{
296 if (key->both.ptr == 0) 299 if (!key->both.ptr)
297 return; 300 return;
298 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 301 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
299 case FUT_OFF_INODE: 302 case FUT_OFF_INODE:
@@ -443,8 +446,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
443 struct task_struct *p; 446 struct task_struct *p;
444 447
445 rcu_read_lock(); 448 rcu_read_lock();
446 p = find_task_by_pid(pid); 449 p = find_task_by_vpid(pid);
447
448 if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) 450 if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
449 p = ERR_PTR(-ESRCH); 451 p = ERR_PTR(-ESRCH);
450 else 452 else
@@ -653,7 +655,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
653 if (!(uval & FUTEX_OWNER_DIED)) { 655 if (!(uval & FUTEX_OWNER_DIED)) {
654 int ret = 0; 656 int ret = 0;
655 657
656 newval = FUTEX_WAITERS | new_owner->pid; 658 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
657 659
658 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 660 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
659 661
@@ -1046,7 +1048,7 @@ static int unqueue_me(struct futex_q *q)
1046 retry: 1048 retry:
1047 lock_ptr = q->lock_ptr; 1049 lock_ptr = q->lock_ptr;
1048 barrier(); 1050 barrier();
1049 if (lock_ptr != 0) { 1051 if (lock_ptr != NULL) {
1050 spin_lock(lock_ptr); 1052 spin_lock(lock_ptr);
1051 /* 1053 /*
1052 * q->lock_ptr can change between reading it and 1054 * q->lock_ptr can change between reading it and
@@ -1106,7 +1108,7 @@ static void unqueue_me_pi(struct futex_q *q)
1106static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1108static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1107 struct task_struct *curr) 1109 struct task_struct *curr)
1108{ 1110{
1109 u32 newtid = curr->pid | FUTEX_WAITERS; 1111 u32 newtid = task_pid_vnr(curr) | FUTEX_WAITERS;
1110 struct futex_pi_state *pi_state = q->pi_state; 1112 struct futex_pi_state *pi_state = q->pi_state;
1111 u32 uval, curval, newval; 1113 u32 uval, curval, newval;
1112 int ret; 1114 int ret;
@@ -1368,7 +1370,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1368 * (by doing a 0 -> TID atomic cmpxchg), while holding all 1370 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1369 * the locks. It will most likely not succeed. 1371 * the locks. It will most likely not succeed.
1370 */ 1372 */
1371 newval = current->pid; 1373 newval = task_pid_vnr(current);
1372 1374
1373 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 1375 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1374 1376
@@ -1379,7 +1381,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1379 * Detect deadlocks. In case of REQUEUE_PI this is a valid 1381 * Detect deadlocks. In case of REQUEUE_PI this is a valid
1380 * situation and we return success to user space. 1382 * situation and we return success to user space.
1381 */ 1383 */
1382 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { 1384 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1383 ret = -EDEADLK; 1385 ret = -EDEADLK;
1384 goto out_unlock_release_sem; 1386 goto out_unlock_release_sem;
1385 } 1387 }
@@ -1408,7 +1410,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1408 */ 1410 */
1409 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 1411 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
1410 /* Keep the OWNER_DIED bit */ 1412 /* Keep the OWNER_DIED bit */
1411 newval = (curval & ~FUTEX_TID_MASK) | current->pid; 1413 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
1412 ownerdied = 0; 1414 ownerdied = 0;
1413 lock_taken = 1; 1415 lock_taken = 1;
1414 } 1416 }
@@ -1587,7 +1589,7 @@ retry:
1587 /* 1589 /*
1588 * We release only a lock we actually own: 1590 * We release only a lock we actually own:
1589 */ 1591 */
1590 if ((uval & FUTEX_TID_MASK) != current->pid) 1592 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1591 return -EPERM; 1593 return -EPERM;
1592 /* 1594 /*
1593 * First take all the futex related locks: 1595 * First take all the futex related locks:
@@ -1608,7 +1610,7 @@ retry_unlocked:
1608 * anyone else up: 1610 * anyone else up:
1609 */ 1611 */
1610 if (!(uval & FUTEX_OWNER_DIED)) 1612 if (!(uval & FUTEX_OWNER_DIED))
1611 uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0); 1613 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
1612 1614
1613 1615
1614 if (unlikely(uval == -EFAULT)) 1616 if (unlikely(uval == -EFAULT))
@@ -1617,7 +1619,7 @@ retry_unlocked:
1617 * Rare case: we managed to release the lock atomically, 1619 * Rare case: we managed to release the lock atomically,
1618 * no need to wake anyone else up: 1620 * no need to wake anyone else up:
1619 */ 1621 */
1620 if (unlikely(uval == current->pid)) 1622 if (unlikely(uval == task_pid_vnr(current)))
1621 goto out_unlock; 1623 goto out_unlock;
1622 1624
1623 /* 1625 /*
@@ -1854,7 +1856,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1854 1856
1855 ret = -ESRCH; 1857 ret = -ESRCH;
1856 rcu_read_lock(); 1858 rcu_read_lock();
1857 p = find_task_by_pid(pid); 1859 p = find_task_by_vpid(pid);
1858 if (!p) 1860 if (!p)
1859 goto err_unlock; 1861 goto err_unlock;
1860 ret = -EPERM; 1862 ret = -EPERM;
@@ -1887,7 +1889,7 @@ retry:
1887 if (get_user(uval, uaddr)) 1889 if (get_user(uval, uaddr))
1888 return -1; 1890 return -1;
1889 1891
1890 if ((uval & FUTEX_TID_MASK) == curr->pid) { 1892 if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
1891 /* 1893 /*
1892 * Ok, this dying thread is truly holding a futex 1894 * Ok, this dying thread is truly holding a futex
1893 * of interest. Set the OWNER_DIED bit atomically 1895 * of interest. Set the OWNER_DIED bit atomically
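In futex.c every thread id that ends up in a futex word is now task_pid_vnr() of the owner, so PI futexes keep working inside a pid namespace as long as all users share that namespace. The user-space half of the same convention is sketched below with a made-up helper name: the fast path stores gettid(), the slow path lets the kernel maintain FUTEX_WAITERS and FUTEX_OWNER_DIED.

#define _GNU_SOURCE
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static int pi_lock(uint32_t *futex_word)
{
	uint32_t tid = syscall(SYS_gettid);	/* namespace-local thread id */

	/* Uncontended fast path: 0 -> tid takes the lock without the kernel. */
	if (__sync_bool_compare_and_swap(futex_word, 0, tid))
		return 0;

	/* Contended: the kernel queues us and rewrites the owner field. */
	return syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}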
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 2c2e2954b713..00b572666cc7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <linux/compat.h> 10#include <linux/compat.h>
11#include <linux/nsproxy.h>
11#include <linux/futex.h> 12#include <linux/futex.h>
12 13
13#include <asm/uaccess.h> 14#include <asm/uaccess.h>
@@ -124,7 +125,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
124 125
125 ret = -ESRCH; 126 ret = -ESRCH;
126 read_lock(&tasklist_lock); 127 read_lock(&tasklist_lock);
127 p = find_task_by_pid(pid); 128 p = find_task_by_vpid(pid);
128 if (!p) 129 if (!p)
129 goto err_unlock; 130 goto err_unlock;
130 ret = -EPERM; 131 ret = -EPERM;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index dc8a4451d79b..b6d2ff7e37ee 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -412,7 +412,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
412 /* 412 /*
413 * When the callback is running, we do not reprogram the clock event 413 * When the callback is running, we do not reprogram the clock event
414 * device. The timer callback is either running on a different CPU or 414 * device. The timer callback is either running on a different CPU or
415 * the callback is executed in the hrtimer_interupt context. The 415 * the callback is executed in the hrtimer_interrupt context. The
416 * reprogramming is handled either by the softirq, which called the 416 * reprogramming is handled either by the softirq, which called the
417 * callback or at the end of the hrtimer_interrupt. 417 * callback or at the end of the hrtimer_interrupt.
418 */ 418 */
@@ -638,7 +638,7 @@ void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
638#endif 638#endif
639 639
640/* 640/*
641 * Counterpart to lock_timer_base above: 641 * Counterpart to lock_hrtimer_base above:
642 */ 642 */
643static inline 643static inline
644void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 644void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
@@ -1286,8 +1286,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1286long __sched hrtimer_nanosleep_restart(struct restart_block *restart) 1286long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1287{ 1287{
1288 struct hrtimer_sleeper t; 1288 struct hrtimer_sleeper t;
1289 struct timespec __user *rmtp; 1289 struct timespec *rmtp;
1290 struct timespec tu;
1291 ktime_t time; 1290 ktime_t time;
1292 1291
1293 restart->fn = do_no_restart_syscall; 1292 restart->fn = do_no_restart_syscall;
@@ -1298,14 +1297,12 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1298 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1297 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1299 return 0; 1298 return 0;
1300 1299
1301 rmtp = (struct timespec __user *) restart->arg1; 1300 rmtp = (struct timespec *)restart->arg1;
1302 if (rmtp) { 1301 if (rmtp) {
1303 time = ktime_sub(t.timer.expires, t.timer.base->get_time()); 1302 time = ktime_sub(t.timer.expires, t.timer.base->get_time());
1304 if (time.tv64 <= 0) 1303 if (time.tv64 <= 0)
1305 return 0; 1304 return 0;
1306 tu = ktime_to_timespec(time); 1305 *rmtp = ktime_to_timespec(time);
1307 if (copy_to_user(rmtp, &tu, sizeof(tu)))
1308 return -EFAULT;
1309 } 1306 }
1310 1307
1311 restart->fn = hrtimer_nanosleep_restart; 1308 restart->fn = hrtimer_nanosleep_restart;
@@ -1314,12 +1311,11 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1314 return -ERESTART_RESTARTBLOCK; 1311 return -ERESTART_RESTARTBLOCK;
1315} 1312}
1316 1313
1317long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, 1314long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp,
1318 const enum hrtimer_mode mode, const clockid_t clockid) 1315 const enum hrtimer_mode mode, const clockid_t clockid)
1319{ 1316{
1320 struct restart_block *restart; 1317 struct restart_block *restart;
1321 struct hrtimer_sleeper t; 1318 struct hrtimer_sleeper t;
1322 struct timespec tu;
1323 ktime_t rem; 1319 ktime_t rem;
1324 1320
1325 hrtimer_init(&t.timer, clockid, mode); 1321 hrtimer_init(&t.timer, clockid, mode);
@@ -1335,9 +1331,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1335 rem = ktime_sub(t.timer.expires, t.timer.base->get_time()); 1331 rem = ktime_sub(t.timer.expires, t.timer.base->get_time());
1336 if (rem.tv64 <= 0) 1332 if (rem.tv64 <= 0)
1337 return 0; 1333 return 0;
1338 tu = ktime_to_timespec(rem); 1334 *rmtp = ktime_to_timespec(rem);
1339 if (copy_to_user(rmtp, &tu, sizeof(tu)))
1340 return -EFAULT;
1341 } 1335 }
1342 1336
1343 restart = &current_thread_info()->restart_block; 1337 restart = &current_thread_info()->restart_block;
@@ -1353,7 +1347,8 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1353asmlinkage long 1347asmlinkage long
1354sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) 1348sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1355{ 1349{
1356 struct timespec tu; 1350 struct timespec tu, rmt;
1351 int ret;
1357 1352
1358 if (copy_from_user(&tu, rqtp, sizeof(tu))) 1353 if (copy_from_user(&tu, rqtp, sizeof(tu)))
1359 return -EFAULT; 1354 return -EFAULT;
@@ -1361,7 +1356,15 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1361 if (!timespec_valid(&tu)) 1356 if (!timespec_valid(&tu))
1362 return -EINVAL; 1357 return -EINVAL;
1363 1358
1364 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); 1359 ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL,
1360 CLOCK_MONOTONIC);
1361
1362 if (ret && rmtp) {
1363 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
1364 return -EFAULT;
1365 }
1366
1367 return ret;
1365} 1368}
1366 1369
1367/* 1370/*
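hrtimer_nanosleep() now works on kernel-space timespecs and leaves the copy_to_user() of the remaining time to its callers, which is why sys_nanosleep() grows the rmt/copy_to_user tail above. From user space nothing changes; the remainder is still what makes an interrupted sleep resumable, e.g.:

#include <errno.h>
#include <time.h>

/* Sketch: resume a sleep that was interrupted by a signal. */
static void sleep_fully(struct timespec req)
{
	struct timespec rem;

	while (nanosleep(&req, &rem) == -1 && errno == EINTR)
		req = rem;		/* continue with the unslept remainder */
}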
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 3205e8e114fa..2fab344dbf56 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -130,7 +130,7 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
130enum hrtimer_restart it_real_fn(struct hrtimer *timer) 130enum hrtimer_restart it_real_fn(struct hrtimer *timer)
131{ 131{
132 struct signal_struct *sig = 132 struct signal_struct *sig =
133 container_of(timer, struct signal_struct, real_timer); 133 container_of(timer, struct signal_struct, real_timer);
134 134
135 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); 135 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
136 136
@@ -291,6 +291,6 @@ asmlinkage long sys_setitimer(int which,
291 return error; 291 return error;
292 292
293 if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) 293 if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
294 return -EFAULT; 294 return -EFAULT;
295 return 0; 295 return 0;
296} 296}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7885269b0da2..aa74a1ef2da8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -51,7 +51,7 @@ struct resource crashk_res = {
51 51
52int kexec_should_crash(struct task_struct *p) 52int kexec_should_crash(struct task_struct *p)
53{ 53{
54 if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops) 54 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
55 return 1; 55 return 1;
56 return 0; 56 return 0;
57} 57}
@@ -785,7 +785,7 @@ static int kimage_load_normal_segment(struct kimage *image,
785 size_t uchunk, mchunk; 785 size_t uchunk, mchunk;
786 786
787 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); 787 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
788 if (page == 0) { 788 if (!page) {
789 result = -ENOMEM; 789 result = -ENOMEM;
790 goto out; 790 goto out;
791 } 791 }
@@ -844,7 +844,7 @@ static int kimage_load_crash_segment(struct kimage *image,
844 size_t uchunk, mchunk; 844 size_t uchunk, mchunk;
845 845
846 page = pfn_to_page(maddr >> PAGE_SHIFT); 846 page = pfn_to_page(maddr >> PAGE_SHIFT);
847 if (page == 0) { 847 if (!page) {
848 result = -ENOMEM; 848 result = -ENOMEM;
849 goto out; 849 goto out;
850 } 850 }
@@ -1146,6 +1146,172 @@ static int __init crash_notes_memory_init(void)
1146} 1146}
1147module_init(crash_notes_memory_init) 1147module_init(crash_notes_memory_init)
1148 1148
1149
1150/*
1151 * parsing the "crashkernel" commandline
1152 *
1153 * this code is intended to be called from architecture specific code
1154 */
1155
1156
1157/*
1158 * This function parses command lines in the format
1159 *
1160 * crashkernel=ramsize-range:size[,...][@offset]
1161 *
1162 * The function returns 0 on success and -EINVAL on failure.
1163 */
1164static int __init parse_crashkernel_mem(char *cmdline,
1165 unsigned long long system_ram,
1166 unsigned long long *crash_size,
1167 unsigned long long *crash_base)
1168{
1169 char *cur = cmdline, *tmp;
1170
1171 /* for each entry of the comma-separated list */
1172 do {
1173 unsigned long long start, end = ULLONG_MAX, size;
1174
1175 /* get the start of the range */
1176 start = memparse(cur, &tmp);
1177 if (cur == tmp) {
1178 pr_warning("crashkernel: Memory value expected\n");
1179 return -EINVAL;
1180 }
1181 cur = tmp;
1182 if (*cur != '-') {
1183 pr_warning("crashkernel: '-' expected\n");
1184 return -EINVAL;
1185 }
1186 cur++;
1187
1188 /* if no ':' is here, than we read the end */
1189 if (*cur != ':') {
1190 end = memparse(cur, &tmp);
1191 if (cur == tmp) {
1192 pr_warning("crashkernel: Memory "
1193 "value expected\n");
1194 return -EINVAL;
1195 }
1196 cur = tmp;
1197 if (end <= start) {
1198 pr_warning("crashkernel: end <= start\n");
1199 return -EINVAL;
1200 }
1201 }
1202
1203 if (*cur != ':') {
1204 pr_warning("crashkernel: ':' expected\n");
1205 return -EINVAL;
1206 }
1207 cur++;
1208
1209 size = memparse(cur, &tmp);
1210 if (cur == tmp) {
1211 pr_warning("Memory value expected\n");
1212 return -EINVAL;
1213 }
1214 cur = tmp;
1215 if (size >= system_ram) {
1216 pr_warning("crashkernel: invalid size\n");
1217 return -EINVAL;
1218 }
1219
1220 /* match ? */
1221 if (system_ram >= start && system_ram <= end) {
1222 *crash_size = size;
1223 break;
1224 }
1225 } while (*cur++ == ',');
1226
1227 if (*crash_size > 0) {
1228 while (*cur != ' ' && *cur != '@')
1229 cur++;
1230 if (*cur == '@') {
1231 cur++;
1232 *crash_base = memparse(cur, &tmp);
1233 if (cur == tmp) {
1234 pr_warning("Memory value expected "
1235 "after '@'\n");
1236 return -EINVAL;
1237 }
1238 }
1239 }
1240
1241 return 0;
1242}
1243
1244/*
1245 * That function parses "simple" (old) crashkernel command lines like
1246 *
1247 * crashkernel=size[@offset]
1248 *
1249 * It returns 0 on success and -EINVAL on failure.
1250 */
1251static int __init parse_crashkernel_simple(char *cmdline,
1252 unsigned long long *crash_size,
1253 unsigned long long *crash_base)
1254{
1255 char *cur = cmdline;
1256
1257 *crash_size = memparse(cmdline, &cur);
1258 if (cmdline == cur) {
1259 pr_warning("crashkernel: memory value expected\n");
1260 return -EINVAL;
1261 }
1262
1263 if (*cur == '@')
1264 *crash_base = memparse(cur+1, &cur);
1265
1266 return 0;
1267}
1268
1269/*
1270 * That function is the entry point for command line parsing and should be
1271 * called from the arch-specific code.
1272 */
1273int __init parse_crashkernel(char *cmdline,
1274 unsigned long long system_ram,
1275 unsigned long long *crash_size,
1276 unsigned long long *crash_base)
1277{
1278 char *p = cmdline, *ck_cmdline = NULL;
1279 char *first_colon, *first_space;
1280
1281 BUG_ON(!crash_size || !crash_base);
1282 *crash_size = 0;
1283 *crash_base = 0;
1284
1285 /* find crashkernel and use the last one if there are more */
1286 p = strstr(p, "crashkernel=");
1287 while (p) {
1288 ck_cmdline = p;
1289 p = strstr(p+1, "crashkernel=");
1290 }
1291
1292 if (!ck_cmdline)
1293 return -EINVAL;
1294
1295 ck_cmdline += 12; /* strlen("crashkernel=") */
1296
1297 /*
1298 * if the commandline contains a ':', then that's the extended
1299 * syntax -- if not, it must be the classic syntax
1300 */
1301 first_colon = strchr(ck_cmdline, ':');
1302 first_space = strchr(ck_cmdline, ' ');
1303 if (first_colon && (!first_space || first_colon < first_space))
1304 return parse_crashkernel_mem(ck_cmdline, system_ram,
1305 crash_size, crash_base);
1306 else
1307 return parse_crashkernel_simple(ck_cmdline, crash_size,
1308 crash_base);
1309
1310 return 0;
1311}
1312
1313
1314
1149void crash_save_vmcoreinfo(void) 1315void crash_save_vmcoreinfo(void)
1150{ 1316{
1151 u32 *buf; 1317 u32 *buf;
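parse_crashkernel() gives architectures a single entry point for both command-line syntaxes, the classic crashkernel=size[@offset] and the extended crashkernel=range:size[,...][@offset]. A hedged sketch of an arch-side caller; boot_command_line stands in for the architecture's saved command line and the actual memory reservation is elided:

#include <linux/init.h>
#include <linux/kexec.h>

static void __init reserve_crashkernel(unsigned long long system_ram)
{
	unsigned long long crash_size = 0, crash_base = 0;

	/* e.g. "crashkernel=64M@16M" or "crashkernel=512M-2G:64M,2G-:128M" */
	if (parse_crashkernel(boot_command_line, system_ram,
			      &crash_size, &crash_base) == 0 && crash_size) {
		/* reserve [crash_base, crash_base + crash_size) here */
	}
}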
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index a6f1ee9c92d9..55fe0c7cd95f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -511,11 +511,11 @@ static void lockdep_print_held_locks(struct task_struct *curr)
511 int i, depth = curr->lockdep_depth; 511 int i, depth = curr->lockdep_depth;
512 512
513 if (!depth) { 513 if (!depth) {
514 printk("no locks held by %s/%d.\n", curr->comm, curr->pid); 514 printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
515 return; 515 return;
516 } 516 }
517 printk("%d lock%s held by %s/%d:\n", 517 printk("%d lock%s held by %s/%d:\n",
518 depth, depth > 1 ? "s" : "", curr->comm, curr->pid); 518 depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr));
519 519
520 for (i = 0; i < depth; i++) { 520 for (i = 0; i < depth; i++) {
521 printk(" #%d: ", i); 521 printk(" #%d: ", i);
@@ -904,7 +904,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
904 print_kernel_version(); 904 print_kernel_version();
905 printk( "-------------------------------------------------------\n"); 905 printk( "-------------------------------------------------------\n");
906 printk("%s/%d is trying to acquire lock:\n", 906 printk("%s/%d is trying to acquire lock:\n",
907 curr->comm, curr->pid); 907 curr->comm, task_pid_nr(curr));
908 print_lock(check_source); 908 print_lock(check_source);
909 printk("\nbut task is already holding lock:\n"); 909 printk("\nbut task is already holding lock:\n");
910 print_lock(check_target); 910 print_lock(check_target);
@@ -1085,7 +1085,7 @@ print_bad_irq_dependency(struct task_struct *curr,
1085 print_kernel_version(); 1085 print_kernel_version();
1086 printk( "------------------------------------------------------\n"); 1086 printk( "------------------------------------------------------\n");
1087 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1087 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1088 curr->comm, curr->pid, 1088 curr->comm, task_pid_nr(curr),
1089 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1089 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
1090 curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, 1090 curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
1091 curr->hardirqs_enabled, 1091 curr->hardirqs_enabled,
@@ -1237,7 +1237,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1237 print_kernel_version(); 1237 print_kernel_version();
1238 printk( "---------------------------------------------\n"); 1238 printk( "---------------------------------------------\n");
1239 printk("%s/%d is trying to acquire lock:\n", 1239 printk("%s/%d is trying to acquire lock:\n",
1240 curr->comm, curr->pid); 1240 curr->comm, task_pid_nr(curr));
1241 print_lock(next); 1241 print_lock(next);
1242 printk("\nbut task is already holding lock:\n"); 1242 printk("\nbut task is already holding lock:\n");
1243 print_lock(prev); 1243 print_lock(prev);
@@ -1521,7 +1521,7 @@ cache_hit:
1521} 1521}
1522 1522
1523static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, 1523static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1524 struct held_lock *hlock, int chain_head, u64 chain_key) 1524 struct held_lock *hlock, int chain_head, u64 chain_key)
1525{ 1525{
1526 /* 1526 /*
1527 * Trylock needs to maintain the stack of held locks, but it 1527 * Trylock needs to maintain the stack of held locks, but it
@@ -1641,7 +1641,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
1641 usage_str[prev_bit], usage_str[new_bit]); 1641 usage_str[prev_bit], usage_str[new_bit]);
1642 1642
1643 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", 1643 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
1644 curr->comm, curr->pid, 1644 curr->comm, task_pid_nr(curr),
1645 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, 1645 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
1646 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, 1646 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
1647 trace_hardirqs_enabled(curr), 1647 trace_hardirqs_enabled(curr),
@@ -1694,7 +1694,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1694 print_kernel_version(); 1694 print_kernel_version();
1695 printk( "---------------------------------------------------------\n"); 1695 printk( "---------------------------------------------------------\n");
1696 printk("%s/%d just changed the state of lock:\n", 1696 printk("%s/%d just changed the state of lock:\n",
1697 curr->comm, curr->pid); 1697 curr->comm, task_pid_nr(curr));
1698 print_lock(this); 1698 print_lock(this);
1699 if (forwards) 1699 if (forwards)
1700 printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); 1700 printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
@@ -2487,7 +2487,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
2487 printk( "[ BUG: bad unlock balance detected! ]\n"); 2487 printk( "[ BUG: bad unlock balance detected! ]\n");
2488 printk( "-------------------------------------\n"); 2488 printk( "-------------------------------------\n");
2489 printk("%s/%d is trying to release lock (", 2489 printk("%s/%d is trying to release lock (",
2490 curr->comm, curr->pid); 2490 curr->comm, task_pid_nr(curr));
2491 print_lockdep_cache(lock); 2491 print_lockdep_cache(lock);
2492 printk(") at:\n"); 2492 printk(") at:\n");
2493 print_ip_sym(ip); 2493 print_ip_sym(ip);
@@ -2737,7 +2737,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
2737 printk( "[ BUG: bad contention detected! ]\n"); 2737 printk( "[ BUG: bad contention detected! ]\n");
2738 printk( "---------------------------------\n"); 2738 printk( "---------------------------------\n");
2739 printk("%s/%d is trying to contend lock (", 2739 printk("%s/%d is trying to contend lock (",
2740 curr->comm, curr->pid); 2740 curr->comm, task_pid_nr(curr));
2741 print_lockdep_cache(lock); 2741 print_lockdep_cache(lock);
2742 printk(") at:\n"); 2742 printk(") at:\n");
2743 print_ip_sym(ip); 2743 print_ip_sym(ip);
@@ -3072,7 +3072,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3072 printk( "[ BUG: held lock freed! ]\n"); 3072 printk( "[ BUG: held lock freed! ]\n");
3073 printk( "-------------------------\n"); 3073 printk( "-------------------------\n");
3074 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 3074 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
3075 curr->comm, curr->pid, mem_from, mem_to-1); 3075 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
3076 print_lock(hlock); 3076 print_lock(hlock);
3077 lockdep_print_held_locks(curr); 3077 lockdep_print_held_locks(curr);
3078 3078
@@ -3125,7 +3125,7 @@ static void print_held_locks_bug(struct task_struct *curr)
3125 printk( "[ BUG: lock held at task exit time! ]\n"); 3125 printk( "[ BUG: lock held at task exit time! ]\n");
3126 printk( "-------------------------------------\n"); 3126 printk( "-------------------------------------\n");
3127 printk("%s/%d is exiting with locks still held!\n", 3127 printk("%s/%d is exiting with locks still held!\n",
3128 curr->comm, curr->pid); 3128 curr->comm, task_pid_nr(curr));
3129 lockdep_print_held_locks(curr); 3129 lockdep_print_held_locks(curr);
3130 3130
3131 printk("\nstack backtrace:\n"); 3131 printk("\nstack backtrace:\n");
diff --git a/kernel/marker.c b/kernel/marker.c
new file mode 100644
index 000000000000..ccb48d9a3657
--- /dev/null
+++ b/kernel/marker.c
@@ -0,0 +1,525 @@
1/*
2 * Copyright (C) 2007 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/marker.h>
25#include <linux/err.h>
26
27extern struct marker __start___markers[];
28extern struct marker __stop___markers[];
29
30/*
31 * module_mutex nests inside markers_mutex. Markers mutex protects the builtin
32 * and module markers, the hash table and deferred_sync.
33 */
34static DEFINE_MUTEX(markers_mutex);
35
36/*
37 * Marker deferred synchronization.
38 * Upon marker probe_unregister, we delay call to synchronize_sched() to
39 * accelerate mass unregistration (only when there is no more reference to a
40 * given module do we call synchronize_sched()). However, we need to make sure
41 * every critical region has ended before we re-arm a marker that has been
42 * unregistered and then registered back with a different probe data.
43 */
44static int deferred_sync;
45
46/*
47 * Marker hash table, containing the active markers.
48 * Protected by module_mutex.
49 */
50#define MARKER_HASH_BITS 6
51#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
52
53struct marker_entry {
54 struct hlist_node hlist;
55 char *format;
56 marker_probe_func *probe;
57 void *private;
58 int refcount; /* Number of times armed. 0 if disarmed. */
59 char name[0]; /* Contains name'\0'format'\0' */
60};
61
62static struct hlist_head marker_table[MARKER_TABLE_SIZE];
63
64/**
65 * __mark_empty_function - Empty probe callback
66 * @mdata: pointer of type const struct marker
67 * @fmt: format string
68 * @...: variable argument list
69 *
70 * Empty callback provided as a probe to the markers. By providing this to a
71 * disabled marker, we make sure the execution flow is always valid even
72 * though the function pointer change and the marker enabling are two distinct
 73 * operations that modify the execution flow of preemptible code.
74 */
75void __mark_empty_function(const struct marker *mdata, void *private,
76 const char *fmt, ...)
77{
78}
79EXPORT_SYMBOL_GPL(__mark_empty_function);
80
81/*
82 * Get marker if the marker is present in the marker hash table.
83 * Must be called with markers_mutex held.
84 * Returns NULL if not present.
85 */
86static struct marker_entry *get_marker(const char *name)
87{
88 struct hlist_head *head;
89 struct hlist_node *node;
90 struct marker_entry *e;
91 u32 hash = jhash(name, strlen(name), 0);
92
93 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
94 hlist_for_each_entry(e, node, head, hlist) {
95 if (!strcmp(name, e->name))
96 return e;
97 }
98 return NULL;
99}
100
101/*
102 * Add the marker to the marker hash table. Must be called with markers_mutex
103 * held.
104 */
105static int add_marker(const char *name, const char *format,
106 marker_probe_func *probe, void *private)
107{
108 struct hlist_head *head;
109 struct hlist_node *node;
110 struct marker_entry *e;
111 size_t name_len = strlen(name) + 1;
112 size_t format_len = 0;
113 u32 hash = jhash(name, name_len-1, 0);
114
115 if (format)
116 format_len = strlen(format) + 1;
117 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
118 hlist_for_each_entry(e, node, head, hlist) {
119 if (!strcmp(name, e->name)) {
120 printk(KERN_NOTICE
121 "Marker %s busy, probe %p already installed\n",
122 name, e->probe);
123 return -EBUSY; /* Already there */
124 }
125 }
126 /*
127 * Using kmalloc here to allocate a variable length element. Could
128 * cause some memory fragmentation if overused.
129 */
130 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
131 GFP_KERNEL);
132 if (!e)
133 return -ENOMEM;
134 memcpy(&e->name[0], name, name_len);
135 if (format) {
136 e->format = &e->name[name_len];
137 memcpy(e->format, format, format_len);
138 trace_mark(core_marker_format, "name %s format %s",
139 e->name, e->format);
140 } else
141 e->format = NULL;
142 e->probe = probe;
143 e->private = private;
144 e->refcount = 0;
145 hlist_add_head(&e->hlist, head);
146 return 0;
147}
148
149/*
150 * Remove the marker from the marker hash table. Must be called with
151 * markers_mutex held.
152 */
153static void *remove_marker(const char *name)
154{
155 struct hlist_head *head;
156 struct hlist_node *node;
157 struct marker_entry *e;
158 int found = 0;
159 size_t len = strlen(name) + 1;
160 void *private = NULL;
161 u32 hash = jhash(name, len-1, 0);
162
163 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
164 hlist_for_each_entry(e, node, head, hlist) {
165 if (!strcmp(name, e->name)) {
166 found = 1;
167 break;
168 }
169 }
170 if (found) {
171 private = e->private;
172 hlist_del(&e->hlist);
173 kfree(e);
174 }
175 return private;
176}
177
178/*
179 * Set the mark_entry format to the format found in the element.
180 */
181static int marker_set_format(struct marker_entry **entry, const char *format)
182{
183 struct marker_entry *e;
184 size_t name_len = strlen((*entry)->name) + 1;
185 size_t format_len = strlen(format) + 1;
186
187 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
188 GFP_KERNEL);
189 if (!e)
190 return -ENOMEM;
191 memcpy(&e->name[0], (*entry)->name, name_len);
192 e->format = &e->name[name_len];
193 memcpy(e->format, format, format_len);
194 e->probe = (*entry)->probe;
195 e->private = (*entry)->private;
196 e->refcount = (*entry)->refcount;
197 hlist_add_before(&e->hlist, &(*entry)->hlist);
198 hlist_del(&(*entry)->hlist);
199 kfree(*entry);
200 *entry = e;
201 trace_mark(core_marker_format, "name %s format %s",
202 e->name, e->format);
203 return 0;
204}
205
206/*
207 * Sets the probe callback corresponding to one marker.
208 */
209static int set_marker(struct marker_entry **entry, struct marker *elem)
210{
211 int ret;
212 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
213
214 if ((*entry)->format) {
215 if (strcmp((*entry)->format, elem->format) != 0) {
216 printk(KERN_NOTICE
217 "Format mismatch for probe %s "
218 "(%s), marker (%s)\n",
219 (*entry)->name,
220 (*entry)->format,
221 elem->format);
222 return -EPERM;
223 }
224 } else {
225 ret = marker_set_format(entry, elem->format);
226 if (ret)
227 return ret;
228 }
229 elem->call = (*entry)->probe;
230 elem->private = (*entry)->private;
231 elem->state = 1;
232 return 0;
233}
234
235/*
236 * Disable a marker and its probe callback.
237 * Note: only a synchronize_sched() issued after setting elem->call to the
238 * empty function ensures that the original callback is not used anymore.
239 * This works because preemption is disabled around the call site.
240 */
241static void disable_marker(struct marker *elem)
242{
243 elem->state = 0;
244 elem->call = __mark_empty_function;
245 /*
246 * Leave the private data and id there, because removal is racy and
247 * should be done only after a synchronize_sched(). These are never used
248 * until the next initialization anyway.
249 */
250}
251
252/**
253 * marker_update_probe_range - Update a probe range
254 * @begin: beginning of the range
255 * @end: end of the range
256 * @probe_module: module address of the probe being updated
257 * @refcount: number of references left to the given probe_module (out)
258 *
259 * Updates the probe callback corresponding to a range of markers.
260 * Must be called with markers_mutex held.
261 */
262void marker_update_probe_range(struct marker *begin,
263 struct marker *end, struct module *probe_module,
264 int *refcount)
265{
266 struct marker *iter;
267 struct marker_entry *mark_entry;
268
269 for (iter = begin; iter < end; iter++) {
270 mark_entry = get_marker(iter->name);
271 if (mark_entry && mark_entry->refcount) {
272 set_marker(&mark_entry, iter);
273 /*
274 * ignore error, continue
275 */
276 if (probe_module)
277 if (probe_module ==
278 __module_text_address((unsigned long)mark_entry->probe))
279 (*refcount)++;
280 } else {
281 disable_marker(iter);
282 }
283 }
284}
285
286/*
287 * Update probes, removing the faulty probes.
288 * Issues a synchronize_sched() when no reference to the module passed
289 * as parameter is found in the probes so the probe module can be
290 * safely unloaded from now on.
291 */
292static void marker_update_probes(struct module *probe_module)
293{
294 int refcount = 0;
295
296 mutex_lock(&markers_mutex);
297 /* Core kernel markers */
298 marker_update_probe_range(__start___markers,
299 __stop___markers, probe_module, &refcount);
300 /* Markers in modules. */
301 module_update_markers(probe_module, &refcount);
302 if (probe_module && refcount == 0) {
303 synchronize_sched();
304 deferred_sync = 0;
305 }
306 mutex_unlock(&markers_mutex);
307}
308
309/**
310 * marker_probe_register - Connect a probe to a marker
311 * @name: marker name
312 * @format: format string
313 * @probe: probe handler
314 * @private: probe private data
315 *
316 * private data must be a valid allocated memory address, or NULL.
317 * Returns 0 if ok, error value on error.
318 */
319int marker_probe_register(const char *name, const char *format,
320 marker_probe_func *probe, void *private)
321{
322 struct marker_entry *entry;
323 int ret = 0, need_update = 0;
324
325 mutex_lock(&markers_mutex);
326 entry = get_marker(name);
327 if (entry && entry->refcount) {
328 ret = -EBUSY;
329 goto end;
330 }
331 if (deferred_sync) {
332 synchronize_sched();
333 deferred_sync = 0;
334 }
335 ret = add_marker(name, format, probe, private);
336 if (ret)
337 goto end;
338 need_update = 1;
339end:
340 mutex_unlock(&markers_mutex);
341 if (need_update)
342 marker_update_probes(NULL);
343 return ret;
344}
345EXPORT_SYMBOL_GPL(marker_probe_register);
346
347/**
348 * marker_probe_unregister - Disconnect a probe from a marker
349 * @name: marker name
350 *
351 * Returns the private data given to marker_probe_register, or an ERR_PTR().
352 */
353void *marker_probe_unregister(const char *name)
354{
355 struct module *probe_module;
356 struct marker_entry *entry;
357 void *private;
358 int need_update = 0;
359
360 mutex_lock(&markers_mutex);
361 entry = get_marker(name);
362 if (!entry) {
363 private = ERR_PTR(-ENOENT);
364 goto end;
365 }
366 entry->refcount = 0;
367 /* In what module is the probe handler ? */
368 probe_module = __module_text_address((unsigned long)entry->probe);
369 private = remove_marker(name);
370 deferred_sync = 1;
371 need_update = 1;
372end:
373 mutex_unlock(&markers_mutex);
374 if (need_update)
375 marker_update_probes(probe_module);
376 return private;
377}
378EXPORT_SYMBOL_GPL(marker_probe_unregister);
379
380/**
381 * marker_probe_unregister_private_data - Disconnect a probe from a marker
382 * @private: probe private data
383 *
384 * Unregister a marker by providing the registered private data.
385 * Returns the private data given to marker_probe_register, or an ERR_PTR().
386 */
387void *marker_probe_unregister_private_data(void *private)
388{
389 struct module *probe_module;
390 struct hlist_head *head;
391 struct hlist_node *node;
392 struct marker_entry *entry;
393 int found = 0;
394 unsigned int i;
395 int need_update = 0;
396
397 mutex_lock(&markers_mutex);
398 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
399 head = &marker_table[i];
400 hlist_for_each_entry(entry, node, head, hlist) {
401 if (entry->private == private) {
402 found = 1;
403 goto iter_end;
404 }
405 }
406 }
407iter_end:
408 if (!found) {
409 private = ERR_PTR(-ENOENT);
410 goto end;
411 }
412 entry->refcount = 0;
413 /* In what module is the probe handler ? */
414 probe_module = __module_text_address((unsigned long)entry->probe);
415 private = remove_marker(entry->name);
416 deferred_sync = 1;
417 need_update = 1;
418end:
419 mutex_unlock(&markers_mutex);
420 if (need_update)
421 marker_update_probes(probe_module);
422 return private;
423}
424EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
425
426/**
427 * marker_arm - Arm a marker
428 * @name: marker name
429 *
430 * Activate a marker. It keeps a reference count of the number of
431 * arming/disarming done.
432 * Returns 0 if ok, error value on error.
433 */
434int marker_arm(const char *name)
435{
436 struct marker_entry *entry;
437 int ret = 0, need_update = 0;
438
439 mutex_lock(&markers_mutex);
440 entry = get_marker(name);
441 if (!entry) {
442 ret = -ENOENT;
443 goto end;
444 }
445 /*
446 * Only need to update probes when refcount passes from 0 to 1.
447 */
448 if (entry->refcount++)
449 goto end;
450 need_update = 1;
451end:
452 mutex_unlock(&markers_mutex);
453 if (need_update)
454 marker_update_probes(NULL);
455 return ret;
456}
457EXPORT_SYMBOL_GPL(marker_arm);
458
459/**
460 * marker_disarm - Disarm a marker
461 * @name: marker name
462 *
463 * Disarm a marker. It keeps a reference count of the number of arming/disarming
464 * done.
465 * Returns 0 if ok, error value on error.
466 */
467int marker_disarm(const char *name)
468{
469 struct marker_entry *entry;
470 int ret = 0, need_update = 0;
471
472 mutex_lock(&markers_mutex);
473 entry = get_marker(name);
474 if (!entry) {
475 ret = -ENOENT;
476 goto end;
477 }
478 /*
479 * Only permit decrement refcount if higher than 0.
480 * Do probe update only on 1 -> 0 transition.
481 */
482 if (entry->refcount) {
483 if (--entry->refcount)
484 goto end;
485 } else {
486 ret = -EPERM;
487 goto end;
488 }
489 need_update = 1;
490end:
491 mutex_unlock(&markers_mutex);
492 if (need_update)
493 marker_update_probes(NULL);
494 return ret;
495}
496EXPORT_SYMBOL_GPL(marker_disarm);
497
498/**
499 * marker_get_private_data - Get a marker's probe private data
500 * @name: marker name
501 *
502 * Returns the private data pointer, or an ERR_PTR.
503 * The private data pointer should _only_ be dereferenced if the caller is the
504 * owner of the data, or its content could vanish. This is mostly used to
505 * confirm that a caller is the owner of a registered probe.
506 */
507void *marker_get_private_data(const char *name)
508{
509 struct hlist_head *head;
510 struct hlist_node *node;
511 struct marker_entry *e;
512 size_t name_len = strlen(name) + 1;
513 u32 hash = jhash(name, name_len-1, 0);
514 int found = 0;
515
516 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
517 hlist_for_each_entry(e, node, head, hlist) {
518 if (!strcmp(name, e->name)) {
519 found = 1;
520 return e->private;
521 }
522 }
523 return ERR_PTR(-ENOENT);
524}
525EXPORT_SYMBOL_GPL(marker_get_private_data);
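The interfaces exported by kernel/marker.c are meant to be driven from two sides: a trace_mark() call compiled into the instrumented code, and a probe registered and armed later, typically from a module. The following sketch is based only on the declarations above; the marker name, format string, probe body and module names are illustrative, not part of this patch:

/*
 * Hypothetical instrumentation site elsewhere in the kernel:
 *
 *      trace_mark(subsys_event, "value %d", value);
 */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/marker.h>

static void my_probe(const struct marker *mdata, void *private,
                     const char *fmt, ...)
{
        /*
         * A real probe would va_start()/va_end() over the arguments
         * described by fmt; this stub only notes that the site fired.
         */
        printk(KERN_DEBUG "marker %s hit, format \"%s\"\n",
               mdata->name, fmt);
}

static int __init my_probe_init(void)
{
        int ret;

        ret = marker_probe_register("subsys_event", "value %d",
                                    my_probe, NULL);
        if (ret)
                return ret;
        return marker_arm("subsys_event");      /* 0 -> 1 arms the site */
}

static void __exit my_probe_exit(void)
{
        marker_disarm("subsys_event");
        marker_probe_unregister("subsys_event");
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");

marker_probe_unregister() hands back the private pointer supplied at registration time, so a probe that allocated per-probe state can reclaim it on unload.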
diff --git a/kernel/module.c b/kernel/module.c
index a389b423c279..3202c9950073 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -105,7 +105,7 @@ void __module_put_and_exit(struct module *mod, long code)
105 do_exit(code); 105 do_exit(code);
106} 106}
107EXPORT_SYMBOL(__module_put_and_exit); 107EXPORT_SYMBOL(__module_put_and_exit);
108 108
109/* Find a module section: 0 means not found. */ 109/* Find a module section: 0 means not found. */
110static unsigned int find_sec(Elf_Ehdr *hdr, 110static unsigned int find_sec(Elf_Ehdr *hdr,
111 Elf_Shdr *sechdrs, 111 Elf_Shdr *sechdrs,
@@ -179,7 +179,7 @@ static unsigned long __find_symbol(const char *name,
179 struct module *mod; 179 struct module *mod;
180 const struct kernel_symbol *ks; 180 const struct kernel_symbol *ks;
181 181
182 /* Core kernel first. */ 182 /* Core kernel first. */
183 *owner = NULL; 183 *owner = NULL;
184 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); 184 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
185 if (ks) { 185 if (ks) {
@@ -231,7 +231,7 @@ static unsigned long __find_symbol(const char *name,
231 return ks->value; 231 return ks->value;
232 } 232 }
233 233
234 /* Now try modules. */ 234 /* Now try modules. */
235 list_for_each_entry(mod, &modules, list) { 235 list_for_each_entry(mod, &modules, list) {
236 *owner = mod; 236 *owner = mod;
237 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); 237 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
@@ -285,7 +285,7 @@ static unsigned long __find_symbol(const char *name,
285 } 285 }
286 } 286 }
287 DEBUGP("Failed to find symbol %s\n", name); 287 DEBUGP("Failed to find symbol %s\n", name);
288 return 0; 288 return 0;
289} 289}
290 290
291/* Search for module by name: must hold module_mutex. */ 291/* Search for module by name: must hold module_mutex. */
@@ -441,7 +441,7 @@ static int percpu_modinit(void)
441 } 441 }
442 442
443 return 0; 443 return 0;
444} 444}
445__initcall(percpu_modinit); 445__initcall(percpu_modinit);
446#else /* ... !CONFIG_SMP */ 446#else /* ... !CONFIG_SMP */
447static inline void *percpu_modalloc(unsigned long size, unsigned long align, 447static inline void *percpu_modalloc(unsigned long size, unsigned long align,
@@ -483,8 +483,8 @@ static int modinfo_##field##_exists(struct module *mod) \
483} \ 483} \
484static void free_modinfo_##field(struct module *mod) \ 484static void free_modinfo_##field(struct module *mod) \
485{ \ 485{ \
486 kfree(mod->field); \ 486 kfree(mod->field); \
487 mod->field = NULL; \ 487 mod->field = NULL; \
488} \ 488} \
489static struct module_attribute modinfo_##field = { \ 489static struct module_attribute modinfo_##field = { \
490 .attr = { .name = __stringify(field), .mode = 0444 }, \ 490 .attr = { .name = __stringify(field), .mode = 0444 }, \
@@ -990,7 +990,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
990 struct module_sect_attrs *sect_attrs; 990 struct module_sect_attrs *sect_attrs;
991 struct module_sect_attr *sattr; 991 struct module_sect_attr *sattr;
992 struct attribute **gattr; 992 struct attribute **gattr;
993 993
994 /* Count loaded sections and allocate structures */ 994 /* Count loaded sections and allocate structures */
995 for (i = 0; i < nsect; i++) 995 for (i = 0; i < nsect; i++)
996 if (sechdrs[i].sh_flags & SHF_ALLOC) 996 if (sechdrs[i].sh_flags & SHF_ALLOC)
@@ -1348,14 +1348,14 @@ static int verify_export_symbols(struct module *mod)
1348 const unsigned long *crc; 1348 const unsigned long *crc;
1349 1349
1350 for (i = 0; i < mod->num_syms; i++) 1350 for (i = 0; i < mod->num_syms; i++)
1351 if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) { 1351 if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) {
1352 name = mod->syms[i].name; 1352 name = mod->syms[i].name;
1353 ret = -ENOEXEC; 1353 ret = -ENOEXEC;
1354 goto dup; 1354 goto dup;
1355 } 1355 }
1356 1356
1357 for (i = 0; i < mod->num_gpl_syms; i++) 1357 for (i = 0; i < mod->num_gpl_syms; i++)
1358 if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) { 1358 if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) {
1359 name = mod->gpl_syms[i].name; 1359 name = mod->gpl_syms[i].name;
1360 ret = -ENOEXEC; 1360 ret = -ENOEXEC;
1361 goto dup; 1361 goto dup;
@@ -1673,6 +1673,8 @@ static struct module *load_module(void __user *umod,
1673 unsigned int unusedcrcindex; 1673 unsigned int unusedcrcindex;
1674 unsigned int unusedgplindex; 1674 unsigned int unusedgplindex;
1675 unsigned int unusedgplcrcindex; 1675 unsigned int unusedgplcrcindex;
1676 unsigned int markersindex;
1677 unsigned int markersstringsindex;
1676 struct module *mod; 1678 struct module *mod;
1677 long err = 0; 1679 long err = 0;
1678 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1680 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1929,7 +1931,7 @@ static struct module *load_module(void __user *umod,
1929 mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; 1931 mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr;
1930 1932
1931#ifdef CONFIG_MODVERSIONS 1933#ifdef CONFIG_MODVERSIONS
1932 if ((mod->num_syms && !crcindex) || 1934 if ((mod->num_syms && !crcindex) ||
1933 (mod->num_gpl_syms && !gplcrcindex) || 1935 (mod->num_gpl_syms && !gplcrcindex) ||
1934 (mod->num_gpl_future_syms && !gplfuturecrcindex) || 1936 (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
1935 (mod->num_unused_syms && !unusedcrcindex) || 1937 (mod->num_unused_syms && !unusedcrcindex) ||
@@ -1939,6 +1941,9 @@ static struct module *load_module(void __user *umod,
1939 add_taint_module(mod, TAINT_FORCED_MODULE); 1941 add_taint_module(mod, TAINT_FORCED_MODULE);
1940 } 1942 }
1941#endif 1943#endif
1944 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
1945 markersstringsindex = find_sec(hdr, sechdrs, secstrings,
1946 "__markers_strings");
1942 1947
1943 /* Now do relocations. */ 1948 /* Now do relocations. */
1944 for (i = 1; i < hdr->e_shnum; i++) { 1949 for (i = 1; i < hdr->e_shnum; i++) {
@@ -1961,6 +1966,11 @@ static struct module *load_module(void __user *umod,
1961 if (err < 0) 1966 if (err < 0)
1962 goto cleanup; 1967 goto cleanup;
1963 } 1968 }
1969#ifdef CONFIG_MARKERS
1970 mod->markers = (void *)sechdrs[markersindex].sh_addr;
1971 mod->num_markers =
1972 sechdrs[markersindex].sh_size / sizeof(*mod->markers);
1973#endif
1964 1974
1965 /* Find duplicate symbols */ 1975 /* Find duplicate symbols */
1966 err = verify_export_symbols(mod); 1976 err = verify_export_symbols(mod);
@@ -1979,6 +1989,11 @@ static struct module *load_module(void __user *umod,
1979 1989
1980 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 1990 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
1981 1991
1992#ifdef CONFIG_MARKERS
1993 if (!mod->taints)
1994 marker_update_probe_range(mod->markers,
1995 mod->markers + mod->num_markers, NULL, NULL);
1996#endif
1982 err = module_finalize(hdr, sechdrs, mod); 1997 err = module_finalize(hdr, sechdrs, mod);
1983 if (err < 0) 1998 if (err < 0)
1984 goto cleanup; 1999 goto cleanup;
@@ -2016,7 +2031,7 @@ static struct module *load_module(void __user *umod,
2016 if (err < 0) 2031 if (err < 0)
2017 goto arch_cleanup; 2032 goto arch_cleanup;
2018 2033
2019 err = mod_sysfs_setup(mod, 2034 err = mod_sysfs_setup(mod,
2020 (struct kernel_param *) 2035 (struct kernel_param *)
2021 sechdrs[setupindex].sh_addr, 2036 sechdrs[setupindex].sh_addr,
2022 sechdrs[setupindex].sh_size 2037 sechdrs[setupindex].sh_size
@@ -2028,8 +2043,8 @@ static struct module *load_module(void __user *umod,
2028 2043
2029 /* Size of section 0 is 0, so this works well if no unwind info. */ 2044 /* Size of section 0 is 0, so this works well if no unwind info. */
2030 mod->unwind_info = unwind_add_table(mod, 2045 mod->unwind_info = unwind_add_table(mod,
2031 (void *)sechdrs[unwindex].sh_addr, 2046 (void *)sechdrs[unwindex].sh_addr,
2032 sechdrs[unwindex].sh_size); 2047 sechdrs[unwindex].sh_size);
2033 2048
2034 /* Get rid of temporary copy */ 2049 /* Get rid of temporary copy */
2035 vfree(hdr); 2050 vfree(hdr);
@@ -2146,7 +2161,7 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
2146 */ 2161 */
2147static inline int is_arm_mapping_symbol(const char *str) 2162static inline int is_arm_mapping_symbol(const char *str)
2148{ 2163{
2149 return str[0] == '$' && strchr("atd", str[1]) 2164 return str[0] == '$' && strchr("atd", str[1])
2150 && (str[2] == '\0' || str[2] == '.'); 2165 && (str[2] == '\0' || str[2] == '.');
2151} 2166}
2152 2167
@@ -2161,11 +2176,11 @@ static const char *get_ksymbol(struct module *mod,
2161 /* At worse, next value is at end of module */ 2176 /* At worse, next value is at end of module */
2162 if (within(addr, mod->module_init, mod->init_size)) 2177 if (within(addr, mod->module_init, mod->init_size))
2163 nextval = (unsigned long)mod->module_init+mod->init_text_size; 2178 nextval = (unsigned long)mod->module_init+mod->init_text_size;
2164 else 2179 else
2165 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2180 nextval = (unsigned long)mod->module_core+mod->core_text_size;
2166 2181
2167 /* Scan for closest preceeding symbol, and next symbol. (ELF 2182 /* Scan for closest preceeding symbol, and next symbol. (ELF
2168 starts real symbols at 1). */ 2183 starts real symbols at 1). */
2169 for (i = 1; i < mod->num_symtab; i++) { 2184 for (i = 1; i < mod->num_symtab; i++) {
2170 if (mod->symtab[i].st_shndx == SHN_UNDEF) 2185 if (mod->symtab[i].st_shndx == SHN_UNDEF)
2171 continue; 2186 continue;
@@ -2407,7 +2422,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2407 list_for_each_entry(mod, &modules, list) { 2422 list_for_each_entry(mod, &modules, list) {
2408 if (mod->num_exentries == 0) 2423 if (mod->num_exentries == 0)
2409 continue; 2424 continue;
2410 2425
2411 e = search_extable(mod->extable, 2426 e = search_extable(mod->extable,
2412 mod->extable + mod->num_exentries - 1, 2427 mod->extable + mod->num_exentries - 1,
2413 addr); 2428 addr);
@@ -2417,7 +2432,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2417 preempt_enable(); 2432 preempt_enable();
2418 2433
2419 /* Now, if we found one, we are running inside it now, hence 2434 /* Now, if we found one, we are running inside it now, hence
2420 we cannot unload the module, hence no refcnt needed. */ 2435 we cannot unload the module, hence no refcnt needed. */
2421 return e; 2436 return e;
2422} 2437}
2423 2438
@@ -2570,3 +2585,18 @@ EXPORT_SYMBOL(module_remove_driver);
2570void struct_module(struct module *mod) { return; } 2585void struct_module(struct module *mod) { return; }
2571EXPORT_SYMBOL(struct_module); 2586EXPORT_SYMBOL(struct_module);
2572#endif 2587#endif
2588
2589#ifdef CONFIG_MARKERS
2590void module_update_markers(struct module *probe_module, int *refcount)
2591{
2592 struct module *mod;
2593
2594 mutex_lock(&module_mutex);
2595 list_for_each_entry(mod, &modules, list)
2596 if (!mod->taints)
2597 marker_update_probe_range(mod->markers,
2598 mod->markers + mod->num_markers,
2599 probe_module, refcount);
2600 mutex_unlock(&module_mutex);
2601}
2602#endif
diff --git a/kernel/notifier.c b/kernel/notifier.c
new file mode 100644
index 000000000000..4253f472f060
--- /dev/null
+++ b/kernel/notifier.c
@@ -0,0 +1,539 @@
1#include <linux/kdebug.h>
2#include <linux/kprobes.h>
3#include <linux/module.h>
4#include <linux/notifier.h>
5#include <linux/rcupdate.h>
6#include <linux/vmalloc.h>
7
8/*
9 * Notifier list for kernel code which wants to be called
10 * at shutdown. This is used to stop any idling DMA operations
11 * and the like.
12 */
13BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
14
15/*
16 * Notifier chain core routines. The exported routines below
17 * are layered on top of these, with appropriate locking added.
18 */
19
20static int notifier_chain_register(struct notifier_block **nl,
21 struct notifier_block *n)
22{
23 while ((*nl) != NULL) {
24 if (n->priority > (*nl)->priority)
25 break;
26 nl = &((*nl)->next);
27 }
28 n->next = *nl;
29 rcu_assign_pointer(*nl, n);
30 return 0;
31}
32
33static int notifier_chain_unregister(struct notifier_block **nl,
34 struct notifier_block *n)
35{
36 while ((*nl) != NULL) {
37 if ((*nl) == n) {
38 rcu_assign_pointer(*nl, n->next);
39 return 0;
40 }
41 nl = &((*nl)->next);
42 }
43 return -ENOENT;
44}
45
46/**
47 * notifier_call_chain - Informs the registered notifiers about an event.
48 * @nl: Pointer to head of the blocking notifier chain
49 * @val: Value passed unmodified to notifier function
50 * @v: Pointer passed unmodified to notifier function
 51 * @nr_to_call: Number of notifier functions to be called. Pass -1 to
 52 * call every function on the chain.
 53 * @nr_calls: Records the number of notifications sent. Pass NULL if the
 54 * caller does not need the count.
55 * @returns: notifier_call_chain returns the value returned by the
56 * last notifier function called.
57 */
58static int __kprobes notifier_call_chain(struct notifier_block **nl,
59 unsigned long val, void *v,
60 int nr_to_call, int *nr_calls)
61{
62 int ret = NOTIFY_DONE;
63 struct notifier_block *nb, *next_nb;
64
65 nb = rcu_dereference(*nl);
66
67 while (nb && nr_to_call) {
68 next_nb = rcu_dereference(nb->next);
69 ret = nb->notifier_call(nb, val, v);
70
71 if (nr_calls)
72 (*nr_calls)++;
73
74 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
75 break;
76 nb = next_nb;
77 nr_to_call--;
78 }
79 return ret;
80}
81
82/*
83 * Atomic notifier chain routines. Registration and unregistration
84 * use a spinlock, and call_chain is synchronized by RCU (no locks).
85 */
86
87/**
88 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
89 * @nh: Pointer to head of the atomic notifier chain
90 * @n: New entry in notifier chain
91 *
92 * Adds a notifier to an atomic notifier chain.
93 *
94 * Currently always returns zero.
95 */
96int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
97 struct notifier_block *n)
98{
99 unsigned long flags;
100 int ret;
101
102 spin_lock_irqsave(&nh->lock, flags);
103 ret = notifier_chain_register(&nh->head, n);
104 spin_unlock_irqrestore(&nh->lock, flags);
105 return ret;
106}
107EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
108
109/**
110 * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
111 * @nh: Pointer to head of the atomic notifier chain
112 * @n: Entry to remove from notifier chain
113 *
114 * Removes a notifier from an atomic notifier chain.
115 *
116 * Returns zero on success or %-ENOENT on failure.
117 */
118int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
119 struct notifier_block *n)
120{
121 unsigned long flags;
122 int ret;
123
124 spin_lock_irqsave(&nh->lock, flags);
125 ret = notifier_chain_unregister(&nh->head, n);
126 spin_unlock_irqrestore(&nh->lock, flags);
127 synchronize_rcu();
128 return ret;
129}
130EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
131
132/**
133 * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
134 * @nh: Pointer to head of the atomic notifier chain
135 * @val: Value passed unmodified to notifier function
136 * @v: Pointer passed unmodified to notifier function
137 * @nr_to_call: See the comment for notifier_call_chain.
138 * @nr_calls: See the comment for notifier_call_chain.
139 *
140 * Calls each function in a notifier chain in turn. The functions
141 * run in an atomic context, so they must not block.
142 * This routine uses RCU to synchronize with changes to the chain.
143 *
144 * If the return value of the notifier can be and'ed
145 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
146 * will return immediately, with the return value of
147 * the notifier function which halted execution.
148 * Otherwise the return value is the return value
149 * of the last notifier function called.
150 */
151int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
152 unsigned long val, void *v,
153 int nr_to_call, int *nr_calls)
154{
155 int ret;
156
157 rcu_read_lock();
158 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
159 rcu_read_unlock();
160 return ret;
161}
162EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
163
164int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
165 unsigned long val, void *v)
166{
167 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
168}
169EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
170
171/*
172 * Blocking notifier chain routines. All access to the chain is
173 * synchronized by an rwsem.
174 */
175
176/**
177 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
178 * @nh: Pointer to head of the blocking notifier chain
179 * @n: New entry in notifier chain
180 *
181 * Adds a notifier to a blocking notifier chain.
182 * Must be called in process context.
183 *
184 * Currently always returns zero.
185 */
186int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
187 struct notifier_block *n)
188{
189 int ret;
190
191 /*
192 * This code gets used during boot-up, when task switching is
193 * not yet working and interrupts must remain disabled. At
194 * such times we must not call down_write().
195 */
196 if (unlikely(system_state == SYSTEM_BOOTING))
197 return notifier_chain_register(&nh->head, n);
198
199 down_write(&nh->rwsem);
200 ret = notifier_chain_register(&nh->head, n);
201 up_write(&nh->rwsem);
202 return ret;
203}
204EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
205
206/**
207 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
208 * @nh: Pointer to head of the blocking notifier chain
209 * @n: Entry to remove from notifier chain
210 *
211 * Removes a notifier from a blocking notifier chain.
212 * Must be called from process context.
213 *
214 * Returns zero on success or %-ENOENT on failure.
215 */
216int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
217 struct notifier_block *n)
218{
219 int ret;
220
221 /*
222 * This code gets used during boot-up, when task switching is
223 * not yet working and interrupts must remain disabled. At
224 * such times we must not call down_write().
225 */
226 if (unlikely(system_state == SYSTEM_BOOTING))
227 return notifier_chain_unregister(&nh->head, n);
228
229 down_write(&nh->rwsem);
230 ret = notifier_chain_unregister(&nh->head, n);
231 up_write(&nh->rwsem);
232 return ret;
233}
234EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
235
236/**
237 * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
238 * @nh: Pointer to head of the blocking notifier chain
239 * @val: Value passed unmodified to notifier function
240 * @v: Pointer passed unmodified to notifier function
241 * @nr_to_call: See comment for notifier_call_chain.
242 * @nr_calls: See comment for notifier_call_chain.
243 *
244 * Calls each function in a notifier chain in turn. The functions
245 * run in a process context, so they are allowed to block.
246 *
247 * If the return value of the notifier can be and'ed
248 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
249 * will return immediately, with the return value of
250 * the notifier function which halted execution.
251 * Otherwise the return value is the return value
252 * of the last notifier function called.
253 */
254int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
255 unsigned long val, void *v,
256 int nr_to_call, int *nr_calls)
257{
258 int ret = NOTIFY_DONE;
259
260 /*
261 * We check the head outside the lock, but if this access is
262 * racy then it does not matter what the result of the test
263 * is, we re-check the list after having taken the lock anyway:
264 */
265 if (rcu_dereference(nh->head)) {
266 down_read(&nh->rwsem);
267 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
268 nr_calls);
269 up_read(&nh->rwsem);
270 }
271 return ret;
272}
273EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
274
275int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
276 unsigned long val, void *v)
277{
278 return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
279}
280EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
281
282/*
283 * Raw notifier chain routines. There is no protection;
284 * the caller must provide it. Use at your own risk!
285 */
286
287/**
288 * raw_notifier_chain_register - Add notifier to a raw notifier chain
289 * @nh: Pointer to head of the raw notifier chain
290 * @n: New entry in notifier chain
291 *
292 * Adds a notifier to a raw notifier chain.
293 * All locking must be provided by the caller.
294 *
295 * Currently always returns zero.
296 */
297int raw_notifier_chain_register(struct raw_notifier_head *nh,
298 struct notifier_block *n)
299{
300 return notifier_chain_register(&nh->head, n);
301}
302EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
303
304/**
305 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
306 * @nh: Pointer to head of the raw notifier chain
307 * @n: Entry to remove from notifier chain
308 *
309 * Removes a notifier from a raw notifier chain.
310 * All locking must be provided by the caller.
311 *
312 * Returns zero on success or %-ENOENT on failure.
313 */
314int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
315 struct notifier_block *n)
316{
317 return notifier_chain_unregister(&nh->head, n);
318}
319EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
320
321/**
322 * __raw_notifier_call_chain - Call functions in a raw notifier chain
323 * @nh: Pointer to head of the raw notifier chain
324 * @val: Value passed unmodified to notifier function
325 * @v: Pointer passed unmodified to notifier function
326 * @nr_to_call: See comment for notifier_call_chain.
327 * @nr_calls: See comment for notifier_call_chain
328 *
329 * Calls each function in a notifier chain in turn. The functions
330 * run in an undefined context.
331 * All locking must be provided by the caller.
332 *
333 * If the return value of the notifier can be and'ed
334 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
335 * will return immediately, with the return value of
336 * the notifier function which halted execution.
337 * Otherwise the return value is the return value
338 * of the last notifier function called.
339 */
340int __raw_notifier_call_chain(struct raw_notifier_head *nh,
341 unsigned long val, void *v,
342 int nr_to_call, int *nr_calls)
343{
344 return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
345}
346EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
347
348int raw_notifier_call_chain(struct raw_notifier_head *nh,
349 unsigned long val, void *v)
350{
351 return __raw_notifier_call_chain(nh, val, v, -1, NULL);
352}
353EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
354
355/*
356 * SRCU notifier chain routines. Registration and unregistration
357 * use a mutex, and call_chain is synchronized by SRCU (no locks).
358 */
359
360/**
361 * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
362 * @nh: Pointer to head of the SRCU notifier chain
363 * @n: New entry in notifier chain
364 *
365 * Adds a notifier to an SRCU notifier chain.
366 * Must be called in process context.
367 *
368 * Currently always returns zero.
369 */
370int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
371 struct notifier_block *n)
372{
373 int ret;
374
375 /*
376 * This code gets used during boot-up, when task switching is
377 * not yet working and interrupts must remain disabled. At
378 * such times we must not call mutex_lock().
379 */
380 if (unlikely(system_state == SYSTEM_BOOTING))
381 return notifier_chain_register(&nh->head, n);
382
383 mutex_lock(&nh->mutex);
384 ret = notifier_chain_register(&nh->head, n);
385 mutex_unlock(&nh->mutex);
386 return ret;
387}
388EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
389
390/**
391 * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
392 * @nh: Pointer to head of the SRCU notifier chain
393 * @n: Entry to remove from notifier chain
394 *
395 * Removes a notifier from an SRCU notifier chain.
396 * Must be called from process context.
397 *
398 * Returns zero on success or %-ENOENT on failure.
399 */
400int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
401 struct notifier_block *n)
402{
403 int ret;
404
405 /*
406 * This code gets used during boot-up, when task switching is
407 * not yet working and interrupts must remain disabled. At
408 * such times we must not call mutex_lock().
409 */
410 if (unlikely(system_state == SYSTEM_BOOTING))
411 return notifier_chain_unregister(&nh->head, n);
412
413 mutex_lock(&nh->mutex);
414 ret = notifier_chain_unregister(&nh->head, n);
415 mutex_unlock(&nh->mutex);
416 synchronize_srcu(&nh->srcu);
417 return ret;
418}
419EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
420
421/**
422 * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
423 * @nh: Pointer to head of the SRCU notifier chain
424 * @val: Value passed unmodified to notifier function
425 * @v: Pointer passed unmodified to notifier function
426 * @nr_to_call: See comment for notifier_call_chain.
427 * @nr_calls: See comment for notifier_call_chain
428 *
429 * Calls each function in a notifier chain in turn. The functions
430 * run in a process context, so they are allowed to block.
431 *
432 * If the return value of the notifier can be and'ed
433 * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain()
434 * will return immediately, with the return value of
435 * the notifier function which halted execution.
436 * Otherwise the return value is the return value
437 * of the last notifier function called.
438 */
439int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
440 unsigned long val, void *v,
441 int nr_to_call, int *nr_calls)
442{
443 int ret;
444 int idx;
445
446 idx = srcu_read_lock(&nh->srcu);
447 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
448 srcu_read_unlock(&nh->srcu, idx);
449 return ret;
450}
451EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
452
453int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
454 unsigned long val, void *v)
455{
456 return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
457}
458EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
459
460/**
461 * srcu_init_notifier_head - Initialize an SRCU notifier head
462 * @nh: Pointer to head of the srcu notifier chain
463 *
464 * Unlike other sorts of notifier heads, SRCU notifier heads require
465 * dynamic initialization. Be sure to call this routine before
466 * calling any of the other SRCU notifier routines for this head.
467 *
468 * If an SRCU notifier head is deallocated, it must first be cleaned
469 * up by calling srcu_cleanup_notifier_head(). Otherwise the head's
470 * per-cpu data (used by the SRCU mechanism) will leak.
471 */
472void srcu_init_notifier_head(struct srcu_notifier_head *nh)
473{
474 mutex_init(&nh->mutex);
475 if (init_srcu_struct(&nh->srcu) < 0)
476 BUG();
477 nh->head = NULL;
478}
479EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
480
481/**
482 * register_reboot_notifier - Register function to be called at reboot time
483 * @nb: Info about notifier function to be called
484 *
485 * Registers a function with the list of functions
486 * to be called at reboot time.
487 *
488 * Currently always returns zero, as blocking_notifier_chain_register()
489 * always returns zero.
490 */
491int register_reboot_notifier(struct notifier_block *nb)
492{
493 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
494}
495EXPORT_SYMBOL(register_reboot_notifier);
496
497/**
498 * unregister_reboot_notifier - Unregister previously registered reboot notifier
499 * @nb: Hook to be unregistered
500 *
501 * Unregisters a previously registered reboot
502 * notifier function.
503 *
504 * Returns zero on success, or %-ENOENT on failure.
505 */
506int unregister_reboot_notifier(struct notifier_block *nb)
507{
508 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
509}
510EXPORT_SYMBOL(unregister_reboot_notifier);
511
512static ATOMIC_NOTIFIER_HEAD(die_chain);
513
514int notify_die(enum die_val val, const char *str,
515 struct pt_regs *regs, long err, int trap, int sig)
516{
517 struct die_args args = {
518 .regs = regs,
519 .str = str,
520 .err = err,
521 .trapnr = trap,
522 .signr = sig,
523
524 };
525 return atomic_notifier_call_chain(&die_chain, val, &args);
526}
527
528int register_die_notifier(struct notifier_block *nb)
529{
530 vmalloc_sync_all();
531 return atomic_notifier_chain_register(&die_chain, nb);
532}
533EXPORT_SYMBOL_GPL(register_die_notifier);
534
535int unregister_die_notifier(struct notifier_block *nb)
536{
537 return atomic_notifier_chain_unregister(&die_chain, nb);
538}
539EXPORT_SYMBOL_GPL(unregister_die_notifier);
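The consolidated notifier code keeps the existing calling convention, so users look the same after the move: fill in a notifier_block, register it on the chain flavour that matches the calling context, and return NOTIFY_DONE (or NOTIFY_STOP to cut the chain short). A small sketch against the reboot list defined at the top of this file; the callback name and message are illustrative only:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_reboot_event(struct notifier_block *nb,
                           unsigned long action, void *data)
{
        printk(KERN_INFO "my_driver: quiescing before reboot (action %lu)\n",
               action);
        return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
        .notifier_call  = my_reboot_event,
        .priority       = 0,    /* higher priority callbacks run first */
};

static int __init my_driver_init(void)
{
        return register_reboot_notifier(&my_reboot_nb);
}

static void __exit my_driver_exit(void)
{
        unregister_reboot_notifier(&my_reboot_nb);
}

module_init(my_driver_init);
module_exit(my_driver_exit);
MODULE_LICENSE("GPL");

Atomic, blocking, raw and SRCU chains all take the same notifier_block; what differs is the context the callbacks may run in and the locking the caller gets for free.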
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
new file mode 100644
index 000000000000..aead4d69f62b
--- /dev/null
+++ b/kernel/ns_cgroup.c
@@ -0,0 +1,100 @@
1/*
2 * ns_cgroup.c - namespace cgroup subsystem
3 *
4 * Copyright 2006, 2007 IBM Corp
5 */
6
7#include <linux/module.h>
8#include <linux/cgroup.h>
9#include <linux/fs.h>
10
11struct ns_cgroup {
12 struct cgroup_subsys_state css;
13 spinlock_t lock;
14};
15
16struct cgroup_subsys ns_subsys;
17
18static inline struct ns_cgroup *cgroup_to_ns(
19 struct cgroup *cgroup)
20{
21 return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
22 struct ns_cgroup, css);
23}
24
25int ns_cgroup_clone(struct task_struct *task)
26{
27 return cgroup_clone(task, &ns_subsys);
28}
29
30/*
31 * Rules:
32 * 1. you can only enter a cgroup which is a child of your current
33 * cgroup
34 * 2. you can only place another process into a cgroup if
35 * a. you have CAP_SYS_ADMIN
36 * b. your cgroup is an ancestor of task's destination cgroup
37 * (hence either you are in the same cgroup as task, or in an
38 * ancestor cgroup thereof)
39 */
40static int ns_can_attach(struct cgroup_subsys *ss,
41 struct cgroup *new_cgroup, struct task_struct *task)
42{
43 struct cgroup *orig;
44
45 if (current != task) {
46 if (!capable(CAP_SYS_ADMIN))
47 return -EPERM;
48
49 if (!cgroup_is_descendant(new_cgroup))
50 return -EPERM;
51 }
52
53 if (atomic_read(&new_cgroup->count) != 0)
54 return -EPERM;
55
56 orig = task_cgroup(task, ns_subsys_id);
57 if (orig && orig != new_cgroup->parent)
58 return -EPERM;
59
60 return 0;
61}
62
63/*
64 * Rules: you can only create a cgroup if
65 * 1. you are capable(CAP_SYS_ADMIN)
66 * 2. the target cgroup is a descendant of your own cgroup
67 */
68static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
69 struct cgroup *cgroup)
70{
71 struct ns_cgroup *ns_cgroup;
72
73 if (!capable(CAP_SYS_ADMIN))
74 return ERR_PTR(-EPERM);
75 if (!cgroup_is_descendant(cgroup))
76 return ERR_PTR(-EPERM);
77
78 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
79 if (!ns_cgroup)
80 return ERR_PTR(-ENOMEM);
81 spin_lock_init(&ns_cgroup->lock);
82 return &ns_cgroup->css;
83}
84
85static void ns_destroy(struct cgroup_subsys *ss,
86 struct cgroup *cgroup)
87{
88 struct ns_cgroup *ns_cgroup;
89
90 ns_cgroup = cgroup_to_ns(cgroup);
91 kfree(ns_cgroup);
92}
93
94struct cgroup_subsys ns_subsys = {
95 .name = "ns",
96 .can_attach = ns_can_attach,
97 .create = ns_create,
98 .destroy = ns_destroy,
99 .subsys_id = ns_subsys_id,
100};
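On the user-visible side, the ns subsystem turns every namespace unshare or CLONE_NEW* clone into a fresh child cgroup, subject to the rules in ns_can_attach()/ns_create() above. A hypothetical userspace check, assuming the "ns" subsystem has been mounted somewhere such as /cgroup; the path and the exact output are illustrative, not mandated by this patch:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        /* needs CAP_SYS_ADMIN; triggers ns_cgroup_clone() in the kernel */
        if (unshare(CLONE_NEWUTS) < 0) {
                perror("unshare");
                return 1;
        }
        /* the "ns" hierarchy entry should now point at a new child cgroup */
        return system("cat /proc/self/cgroup");
}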
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 049e7c0ac566..79f871bc0ef4 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,19 +26,6 @@ static struct kmem_cache *nsproxy_cachep;
26 26
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
28 28
29static inline void get_nsproxy(struct nsproxy *ns)
30{
31 atomic_inc(&ns->count);
32}
33
34void get_task_namespaces(struct task_struct *tsk)
35{
36 struct nsproxy *ns = tsk->nsproxy;
37 if (ns) {
38 get_nsproxy(ns);
39 }
40}
41
42/* 29/*
43 * creates a copy of "orig" with refcount 1. 30 * creates a copy of "orig" with refcount 1.
44 */ 31 */
@@ -87,7 +74,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
87 goto out_ipc; 74 goto out_ipc;
88 } 75 }
89 76
90 new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); 77 new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
91 if (IS_ERR(new_nsp->pid_ns)) { 78 if (IS_ERR(new_nsp->pid_ns)) {
92 err = PTR_ERR(new_nsp->pid_ns); 79 err = PTR_ERR(new_nsp->pid_ns);
93 goto out_pid; 80 goto out_pid;
@@ -142,7 +129,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
142 129
143 get_nsproxy(old_ns); 130 get_nsproxy(old_ns);
144 131
145 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) 132 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
133 CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET)))
146 return 0; 134 return 0;
147 135
148 if (!capable(CAP_SYS_ADMIN)) { 136 if (!capable(CAP_SYS_ADMIN)) {
@@ -156,7 +144,14 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
156 goto out; 144 goto out;
157 } 145 }
158 146
147 err = ns_cgroup_clone(tsk);
148 if (err) {
149 put_nsproxy(new_ns);
150 goto out;
151 }
152
159 tsk->nsproxy = new_ns; 153 tsk->nsproxy = new_ns;
154
160out: 155out:
161 put_nsproxy(old_ns); 156 put_nsproxy(old_ns);
162 return err; 157 return err;
@@ -196,11 +191,46 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
196 191
197 *new_nsp = create_new_namespaces(unshare_flags, current, 192 *new_nsp = create_new_namespaces(unshare_flags, current,
198 new_fs ? new_fs : current->fs); 193 new_fs ? new_fs : current->fs);
199 if (IS_ERR(*new_nsp)) 194 if (IS_ERR(*new_nsp)) {
200 err = PTR_ERR(*new_nsp); 195 err = PTR_ERR(*new_nsp);
196 goto out;
197 }
198
199 err = ns_cgroup_clone(current);
200 if (err)
201 put_nsproxy(*new_nsp);
202
203out:
201 return err; 204 return err;
202} 205}
203 206
207void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
208{
209 struct nsproxy *ns;
210
211 might_sleep();
212
213 ns = p->nsproxy;
214
215 rcu_assign_pointer(p->nsproxy, new);
216
217 if (ns && atomic_dec_and_test(&ns->count)) {
218 /*
219 * wait for others to get what they want from this nsproxy.
220 *
221 * cannot release this nsproxy via the call_rcu() since
222 * put_mnt_ns() will want to sleep
223 */
224 synchronize_rcu();
225 free_nsproxy(ns);
226 }
227}
228
229void exit_task_namespaces(struct task_struct *p)
230{
231 switch_task_namespaces(p, NULL);
232}
233
204static int __init nsproxy_cache_init(void) 234static int __init nsproxy_cache_init(void)
205{ 235{
206 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 236 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
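switch_task_namespaces() publishes the replacement nsproxy with rcu_assign_pointer() and then waits with synchronize_rcu() instead of using call_rcu(), because free_nsproxy() ends up in put_mnt_ns(), which may sleep. The read side it has to respect looks roughly like the hypothetical helper below (not part of this patch): a lockless reader pins whatever it needs before leaving the RCU section.

#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static struct pid_namespace *grab_task_pid_ns(struct task_struct *tsk)
{
        struct nsproxy *nsp;
        struct pid_namespace *pid_ns = NULL;

        rcu_read_lock();
        nsp = rcu_dereference(tsk->nsproxy);
        if (nsp)
                pid_ns = get_pid_ns(nsp->pid_ns);  /* pin before unlocking */
        rcu_read_unlock();

        return pid_ns;  /* caller releases with put_pid_ns() */
}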
diff --git a/kernel/panic.c b/kernel/panic.c
index f64f4c1ac11f..6f6e03e91595 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -56,14 +56,14 @@ EXPORT_SYMBOL(panic_blink);
56 * 56 *
57 * This function never returns. 57 * This function never returns.
58 */ 58 */
59 59
60NORET_TYPE void panic(const char * fmt, ...) 60NORET_TYPE void panic(const char * fmt, ...)
61{ 61{
62 long i; 62 long i;
63 static char buf[1024]; 63 static char buf[1024];
64 va_list args; 64 va_list args;
65#if defined(CONFIG_S390) 65#if defined(CONFIG_S390)
66 unsigned long caller = (unsigned long) __builtin_return_address(0); 66 unsigned long caller = (unsigned long) __builtin_return_address(0);
67#endif 67#endif
68 68
69 /* 69 /*
@@ -128,7 +128,7 @@ NORET_TYPE void panic(const char * fmt, ...)
128 } 128 }
129#endif 129#endif
130#if defined(CONFIG_S390) 130#if defined(CONFIG_S390)
131 disabled_wait(caller); 131 disabled_wait(caller);
132#endif 132#endif
133 local_irq_enable(); 133 local_irq_enable();
134 for (i = 0;;) { 134 for (i = 0;;) {
@@ -148,13 +148,13 @@ EXPORT_SYMBOL(panic);
148 * 'F' - Module has been forcibly loaded. 148 * 'F' - Module has been forcibly loaded.
149 * 'S' - SMP with CPUs not designed for SMP. 149 * 'S' - SMP with CPUs not designed for SMP.
150 * 'R' - User forced a module unload. 150 * 'R' - User forced a module unload.
151 * 'M' - Machine had a machine check experience. 151 * 'M' - System experienced a machine check exception.
152 * 'B' - System has hit bad_page. 152 * 'B' - System has hit bad_page.
153 * 'U' - Userspace-defined naughtiness. 153 * 'U' - Userspace-defined naughtiness.
154 * 154 *
155 * The string is overwritten by the next call to print_taint(). 155 * The string is overwritten by the next call to print_taint().
156 */ 156 */
157 157
158const char *print_tainted(void) 158const char *print_tainted(void)
159{ 159{
160 static char buf[20]; 160 static char buf[20];
@@ -164,7 +164,7 @@ const char *print_tainted(void)
164 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 164 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
165 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 165 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
166 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', 166 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
167 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 167 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
168 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 168 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
169 tainted & TAINT_USER ? 'U' : ' ', 169 tainted & TAINT_USER ? 'U' : ' ',
170 tainted & TAINT_DIE ? 'D' : ' '); 170 tainted & TAINT_DIE ? 'D' : ' ');
diff --git a/kernel/params.c b/kernel/params.c
index 1d6aca288cdc..16f269e9ddc9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -592,11 +592,17 @@ static void __init param_sysfs_builtin(void)
592 592
593 for (i=0; i < __stop___param - __start___param; i++) { 593 for (i=0; i < __stop___param - __start___param; i++) {
594 char *dot; 594 char *dot;
595 size_t kplen;
595 596
596 kp = &__start___param[i]; 597 kp = &__start___param[i];
598 kplen = strlen(kp->name);
597 599
598 /* We do not handle args without periods. */ 600 /* We do not handle args without periods. */
599 dot = memchr(kp->name, '.', MAX_KBUILD_MODNAME); 601 if (kplen > MAX_KBUILD_MODNAME) {
602 DEBUGP("kernel parameter name is too long: %s\n", kp->name);
603 continue;
604 }
605 dot = memchr(kp->name, '.', kplen);
600 if (!dot) { 606 if (!dot) {
601 DEBUGP("couldn't find period in %s\n", kp->name); 607 DEBUGP("couldn't find period in %s\n", kp->name);
602 continue; 608 continue;
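The params.c hunk is a bounds fix: the old code let memchr() scan a fixed MAX_KBUILD_MODNAME-byte window for the '.', which can read past the terminating NUL of a short parameter name, while the new code measures the name first, skips anything too long to carry a "module.param" prefix, and bounds the scan by the real length. A reduced excerpt of the loop body in param_sysfs_builtin(), with a hypothetical kp->name value:

/* e.g. kp->name == "printk.time", so kplen == 11 */
size_t kplen = strlen(kp->name);

/* old: dot = memchr(kp->name, '.', MAX_KBUILD_MODNAME);
 *      may inspect bytes beyond the string when the name is short */

/* new: reject oversized names, then stay inside the string */
if (kplen > MAX_KBUILD_MODNAME) {
        DEBUGP("kernel parameter name is too long: %s\n", kp->name);
        continue;
}
dot = memchr(kp->name, '.', kplen);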
diff --git a/kernel/pid.c b/kernel/pid.c
index c6e3f9ffff87..d1db36b94674 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -18,6 +18,12 @@
18 * allocation scenario when all but one out of 1 million PIDs possible are 18 * allocation scenario when all but one out of 1 million PIDs possible are
19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE 19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). 20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
21 *
22 * Pid namespaces:
23 * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
24 * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
25 * Many thanks to Oleg Nesterov for comments and help
26 *
21 */ 27 */
22 28
23#include <linux/mm.h> 29#include <linux/mm.h>
@@ -28,12 +34,14 @@
28#include <linux/hash.h> 34#include <linux/hash.h>
29#include <linux/pid_namespace.h> 35#include <linux/pid_namespace.h>
30#include <linux/init_task.h> 36#include <linux/init_task.h>
37#include <linux/syscalls.h>
31 38
32#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 39#define pid_hashfn(nr, ns) \
40 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
33static struct hlist_head *pid_hash; 41static struct hlist_head *pid_hash;
34static int pidhash_shift; 42static int pidhash_shift;
35static struct kmem_cache *pid_cachep;
36struct pid init_struct_pid = INIT_STRUCT_PID; 43struct pid init_struct_pid = INIT_STRUCT_PID;
44static struct kmem_cache *pid_ns_cachep;
37 45
38int pid_max = PID_MAX_DEFAULT; 46int pid_max = PID_MAX_DEFAULT;
39 47
@@ -68,8 +76,25 @@ struct pid_namespace init_pid_ns = {
68 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 76 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
69 }, 77 },
70 .last_pid = 0, 78 .last_pid = 0,
71 .child_reaper = &init_task 79 .level = 0,
80 .child_reaper = &init_task,
72}; 81};
82EXPORT_SYMBOL_GPL(init_pid_ns);
83
84int is_container_init(struct task_struct *tsk)
85{
86 int ret = 0;
87 struct pid *pid;
88
89 rcu_read_lock();
90 pid = task_pid(tsk);
91 if (pid != NULL && pid->numbers[pid->level].nr == 1)
92 ret = 1;
93 rcu_read_unlock();
94
95 return ret;
96}
97EXPORT_SYMBOL(is_container_init);
73 98
74/* 99/*
75 * Note: disable interrupts while the pidmap_lock is held as an 100 * Note: disable interrupts while the pidmap_lock is held as an
@@ -176,11 +201,17 @@ static int next_pidmap(struct pid_namespace *pid_ns, int last)
176 201
177fastcall void put_pid(struct pid *pid) 202fastcall void put_pid(struct pid *pid)
178{ 203{
204 struct pid_namespace *ns;
205
179 if (!pid) 206 if (!pid)
180 return; 207 return;
208
209 ns = pid->numbers[pid->level].ns;
181 if ((atomic_read(&pid->count) == 1) || 210 if ((atomic_read(&pid->count) == 1) ||
182 atomic_dec_and_test(&pid->count)) 211 atomic_dec_and_test(&pid->count)) {
183 kmem_cache_free(pid_cachep, pid); 212 kmem_cache_free(ns->pid_cachep, pid);
213 put_pid_ns(ns);
214 }
184} 215}
185EXPORT_SYMBOL_GPL(put_pid); 216EXPORT_SYMBOL_GPL(put_pid);
186 217
@@ -193,60 +224,94 @@ static void delayed_put_pid(struct rcu_head *rhp)
193fastcall void free_pid(struct pid *pid) 224fastcall void free_pid(struct pid *pid)
194{ 225{
195 /* We can be called with write_lock_irq(&tasklist_lock) held */ 226 /* We can be called with write_lock_irq(&tasklist_lock) held */
227 int i;
196 unsigned long flags; 228 unsigned long flags;
197 229
198 spin_lock_irqsave(&pidmap_lock, flags); 230 spin_lock_irqsave(&pidmap_lock, flags);
199 hlist_del_rcu(&pid->pid_chain); 231 for (i = 0; i <= pid->level; i++)
232 hlist_del_rcu(&pid->numbers[i].pid_chain);
200 spin_unlock_irqrestore(&pidmap_lock, flags); 233 spin_unlock_irqrestore(&pidmap_lock, flags);
201 234
202 free_pidmap(&init_pid_ns, pid->nr); 235 for (i = 0; i <= pid->level; i++)
236 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr);
237
203 call_rcu(&pid->rcu, delayed_put_pid); 238 call_rcu(&pid->rcu, delayed_put_pid);
204} 239}
205 240
206struct pid *alloc_pid(void) 241struct pid *alloc_pid(struct pid_namespace *ns)
207{ 242{
208 struct pid *pid; 243 struct pid *pid;
209 enum pid_type type; 244 enum pid_type type;
210 int nr = -1; 245 int i, nr;
246 struct pid_namespace *tmp;
247 struct upid *upid;
211 248
212 pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL); 249 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
213 if (!pid) 250 if (!pid)
214 goto out; 251 goto out;
215 252
216 nr = alloc_pidmap(current->nsproxy->pid_ns); 253 tmp = ns;
217 if (nr < 0) 254 for (i = ns->level; i >= 0; i--) {
218 goto out_free; 255 nr = alloc_pidmap(tmp);
256 if (nr < 0)
257 goto out_free;
258
259 pid->numbers[i].nr = nr;
260 pid->numbers[i].ns = tmp;
261 tmp = tmp->parent;
262 }
219 263
264 get_pid_ns(ns);
265 pid->level = ns->level;
220 atomic_set(&pid->count, 1); 266 atomic_set(&pid->count, 1);
221 pid->nr = nr;
222 for (type = 0; type < PIDTYPE_MAX; ++type) 267 for (type = 0; type < PIDTYPE_MAX; ++type)
223 INIT_HLIST_HEAD(&pid->tasks[type]); 268 INIT_HLIST_HEAD(&pid->tasks[type]);
224 269
225 spin_lock_irq(&pidmap_lock); 270 spin_lock_irq(&pidmap_lock);
226 hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]); 271 for (i = ns->level; i >= 0; i--) {
272 upid = &pid->numbers[i];
273 hlist_add_head_rcu(&upid->pid_chain,
274 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
275 }
227 spin_unlock_irq(&pidmap_lock); 276 spin_unlock_irq(&pidmap_lock);
228 277
229out: 278out:
230 return pid; 279 return pid;
231 280
232out_free: 281out_free:
233 kmem_cache_free(pid_cachep, pid); 282 for (i++; i <= ns->level; i++)
283 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr);
284
285 kmem_cache_free(ns->pid_cachep, pid);
234 pid = NULL; 286 pid = NULL;
235 goto out; 287 goto out;
236} 288}
237 289
238struct pid * fastcall find_pid(int nr) 290struct pid * fastcall find_pid_ns(int nr, struct pid_namespace *ns)
239{ 291{
240 struct hlist_node *elem; 292 struct hlist_node *elem;
241 struct pid *pid; 293 struct upid *pnr;
294
295 hlist_for_each_entry_rcu(pnr, elem,
296 &pid_hash[pid_hashfn(nr, ns)], pid_chain)
297 if (pnr->nr == nr && pnr->ns == ns)
298 return container_of(pnr, struct pid,
299 numbers[ns->level]);
242 300
243 hlist_for_each_entry_rcu(pid, elem,
244 &pid_hash[pid_hashfn(nr)], pid_chain) {
245 if (pid->nr == nr)
246 return pid;
247 }
248 return NULL; 301 return NULL;
249} 302}
303EXPORT_SYMBOL_GPL(find_pid_ns);
304
305struct pid *find_vpid(int nr)
306{
307 return find_pid_ns(nr, current->nsproxy->pid_ns);
308}
309EXPORT_SYMBOL_GPL(find_vpid);
310
311struct pid *find_pid(int nr)
312{
313 return find_pid_ns(nr, &init_pid_ns);
314}
250EXPORT_SYMBOL_GPL(find_pid); 315EXPORT_SYMBOL_GPL(find_pid);
251 316
252/* 317/*
@@ -307,12 +372,32 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
307/* 372/*
308 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 373 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
309 */ 374 */
310struct task_struct *find_task_by_pid_type(int type, int nr) 375struct task_struct *find_task_by_pid_type_ns(int type, int nr,
376 struct pid_namespace *ns)
311{ 377{
312 return pid_task(find_pid(nr), type); 378 return pid_task(find_pid_ns(nr, ns), type);
313} 379}
314 380
315EXPORT_SYMBOL(find_task_by_pid_type); 381EXPORT_SYMBOL(find_task_by_pid_type_ns);
382
383struct task_struct *find_task_by_pid(pid_t nr)
384{
385 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns);
386}
387EXPORT_SYMBOL(find_task_by_pid);
388
389struct task_struct *find_task_by_vpid(pid_t vnr)
390{
391 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
392 current->nsproxy->pid_ns);
393}
394EXPORT_SYMBOL(find_task_by_vpid);
395
396struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
397{
398 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
399}
400EXPORT_SYMBOL(find_task_by_pid_ns);
316 401
317struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 402struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
318{ 403{
@@ -339,45 +424,239 @@ struct pid *find_get_pid(pid_t nr)
339 struct pid *pid; 424 struct pid *pid;
340 425
341 rcu_read_lock(); 426 rcu_read_lock();
342 pid = get_pid(find_pid(nr)); 427 pid = get_pid(find_vpid(nr));
343 rcu_read_unlock(); 428 rcu_read_unlock();
344 429
345 return pid; 430 return pid;
346} 431}
347 432
433pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
434{
435 struct upid *upid;
436 pid_t nr = 0;
437
438 if (pid && ns->level <= pid->level) {
439 upid = &pid->numbers[ns->level];
440 if (upid->ns == ns)
441 nr = upid->nr;
442 }
443 return nr;
444}
445
446pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
447{
448 return pid_nr_ns(task_pid(tsk), ns);
449}
450EXPORT_SYMBOL(task_pid_nr_ns);
451
452pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
453{
454 return pid_nr_ns(task_tgid(tsk), ns);
455}
456EXPORT_SYMBOL(task_tgid_nr_ns);
457
458pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
459{
460 return pid_nr_ns(task_pgrp(tsk), ns);
461}
462EXPORT_SYMBOL(task_pgrp_nr_ns);
463
464pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
465{
466 return pid_nr_ns(task_session(tsk), ns);
467}
468EXPORT_SYMBOL(task_session_nr_ns);
469
348/* 470/*
349 * Used by proc to find the first pid that is greater than or equal to nr. 471 * Used by proc to find the first pid that is greater than or equal to nr.
350 * 472 *
351 * If there is a pid at nr this function is exactly the same as find_pid. 473 * If there is a pid at nr this function is exactly the same as find_pid.
352 */ 474 */
353struct pid *find_ge_pid(int nr) 475struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
354{ 476{
355 struct pid *pid; 477 struct pid *pid;
356 478
357 do { 479 do {
358 pid = find_pid(nr); 480 pid = find_pid_ns(nr, ns);
359 if (pid) 481 if (pid)
360 break; 482 break;
361 nr = next_pidmap(current->nsproxy->pid_ns, nr); 483 nr = next_pidmap(ns, nr);
362 } while (nr > 0); 484 } while (nr > 0);
363 485
364 return pid; 486 return pid;
365} 487}
366EXPORT_SYMBOL_GPL(find_get_pid); 488EXPORT_SYMBOL_GPL(find_get_pid);
367 489
490struct pid_cache {
491 int nr_ids;
492 char name[16];
493 struct kmem_cache *cachep;
494 struct list_head list;
495};
496
497static LIST_HEAD(pid_caches_lh);
498static DEFINE_MUTEX(pid_caches_mutex);
499
500/*
501 * creates the kmem cache to allocate pids from.
502 * @nr_ids: the number of numerical ids this pid will have to carry
503 */
504
505static struct kmem_cache *create_pid_cachep(int nr_ids)
506{
507 struct pid_cache *pcache;
508 struct kmem_cache *cachep;
509
510 mutex_lock(&pid_caches_mutex);
511 list_for_each_entry (pcache, &pid_caches_lh, list)
512 if (pcache->nr_ids == nr_ids)
513 goto out;
514
515 pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
516 if (pcache == NULL)
517 goto err_alloc;
518
519 snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
520 cachep = kmem_cache_create(pcache->name,
521 sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
522 0, SLAB_HWCACHE_ALIGN, NULL);
523 if (cachep == NULL)
524 goto err_cachep;
525
526 pcache->nr_ids = nr_ids;
527 pcache->cachep = cachep;
528 list_add(&pcache->list, &pid_caches_lh);
529out:
530 mutex_unlock(&pid_caches_mutex);
531 return pcache->cachep;
532
533err_cachep:
534 kfree(pcache);
535err_alloc:
536 mutex_unlock(&pid_caches_mutex);
537 return NULL;
538}
539
540static struct pid_namespace *create_pid_namespace(int level)
541{
542 struct pid_namespace *ns;
543 int i;
544
545 ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
546 if (ns == NULL)
547 goto out;
548
549 ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
550 if (!ns->pidmap[0].page)
551 goto out_free;
552
553 ns->pid_cachep = create_pid_cachep(level + 1);
554 if (ns->pid_cachep == NULL)
555 goto out_free_map;
556
557 kref_init(&ns->kref);
558 ns->last_pid = 0;
559 ns->child_reaper = NULL;
560 ns->level = level;
561
562 set_bit(0, ns->pidmap[0].page);
563 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
564
565 for (i = 1; i < PIDMAP_ENTRIES; i++) {
566 ns->pidmap[i].page = 0;
567 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
568 }
569
570 return ns;
571
572out_free_map:
573 kfree(ns->pidmap[0].page);
574out_free:
575 kmem_cache_free(pid_ns_cachep, ns);
576out:
577 return ERR_PTR(-ENOMEM);
578}
579
580static void destroy_pid_namespace(struct pid_namespace *ns)
581{
582 int i;
583
584 for (i = 0; i < PIDMAP_ENTRIES; i++)
585 kfree(ns->pidmap[i].page);
586 kmem_cache_free(pid_ns_cachep, ns);
587}
588
368struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 589struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
369{ 590{
591 struct pid_namespace *new_ns;
592
370 BUG_ON(!old_ns); 593 BUG_ON(!old_ns);
371 get_pid_ns(old_ns); 594 new_ns = get_pid_ns(old_ns);
372 return old_ns; 595 if (!(flags & CLONE_NEWPID))
596 goto out;
597
598 new_ns = ERR_PTR(-EINVAL);
599 if (flags & CLONE_THREAD)
600 goto out_put;
601
602 new_ns = create_pid_namespace(old_ns->level + 1);
603 if (!IS_ERR(new_ns))
604 new_ns->parent = get_pid_ns(old_ns);
605
606out_put:
607 put_pid_ns(old_ns);
608out:
609 return new_ns;
373} 610}
374 611
375void free_pid_ns(struct kref *kref) 612void free_pid_ns(struct kref *kref)
376{ 613{
377 struct pid_namespace *ns; 614 struct pid_namespace *ns, *parent;
378 615
379 ns = container_of(kref, struct pid_namespace, kref); 616 ns = container_of(kref, struct pid_namespace, kref);
380 kfree(ns); 617
618 parent = ns->parent;
619 destroy_pid_namespace(ns);
620
621 if (parent != NULL)
622 put_pid_ns(parent);
623}
624
625void zap_pid_ns_processes(struct pid_namespace *pid_ns)
626{
627 int nr;
628 int rc;
629
630 /*
631 * The last thread in the cgroup-init thread group is terminating.
632 * Find remaining pid_ts in the namespace, signal and wait for them
633 * to exit.
634 *
635 * Note: This signals each thread in the namespace - even those that
636 * belong to the same thread group. To avoid this, we would have
637 * to walk the entire tasklist looking for processes in this
638 * namespace, but that could be unnecessarily expensive if the
639 * pid namespace has just a few processes. Or we need to
640 * maintain a tasklist for each pid namespace.
641 *
642 */
643 read_lock(&tasklist_lock);
644 nr = next_pidmap(pid_ns, 1);
645 while (nr > 0) {
646 kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
647 nr = next_pidmap(pid_ns, nr);
648 }
649 read_unlock(&tasklist_lock);
650
651 do {
652 clear_thread_flag(TIF_SIGPENDING);
653 rc = sys_wait4(-1, NULL, __WALL, NULL);
654 } while (rc != -ECHILD);
655
656
657 /* Child reaper for the pid namespace is going away */
658 pid_ns->child_reaper = NULL;
659 return;
381} 660}
382 661
383/* 662/*
@@ -412,5 +691,9 @@ void __init pidmap_init(void)
412 set_bit(0, init_pid_ns.pidmap[0].page); 691 set_bit(0, init_pid_ns.pidmap[0].page);
413 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 692 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
414 693
415 pid_cachep = KMEM_CACHE(pid, SLAB_PANIC); 694 init_pid_ns.pid_cachep = create_pid_cachep(1);
695 if (init_pid_ns.pid_cachep == NULL)
696 panic("Can't create pid_1 cachep\n");
697
698 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
416} 699}
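For context, a minimal sketch of how the namespace-aware lookup helpers introduced above are used from other kernel code. The lookup_and_report() wrapper is illustrative only (not part of this patch); it assumes the caller wants the pid as seen from the initial namespace:

#include <linux/sched.h>
#include <linux/pid.h>
#include <linux/pid_namespace.h>
#include <linux/rcupdate.h>

/* Resolve a pid in the caller's namespace and return its global value. */
static pid_t lookup_and_report(pid_t vnr)
{
	struct task_struct *tsk;
	pid_t global_nr = 0;

	rcu_read_lock();		/* lookups must run under RCU or tasklist_lock */
	tsk = find_task_by_vpid(vnr);	/* vnr interpreted in current's pid namespace */
	if (tsk)
		global_nr = task_pid_nr_ns(tsk, &init_pid_ns);
	rcu_read_unlock();

	return global_nr;		/* 0 if no such task is visible */
}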
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index b53c8fcd9d82..68c96376e84a 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -21,8 +21,8 @@ static int check_clock(const clockid_t which_clock)
21 21
22 read_lock(&tasklist_lock); 22 read_lock(&tasklist_lock);
23 p = find_task_by_pid(pid); 23 p = find_task_by_pid(pid);
24 if (!p || (CPUCLOCK_PERTHREAD(which_clock) ? 24 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
25 p->tgid != current->tgid : p->tgid != pid)) { 25 same_thread_group(p, current) : thread_group_leader(p))) {
26 error = -EINVAL; 26 error = -EINVAL;
27 } 27 }
28 read_unlock(&tasklist_lock); 28 read_unlock(&tasklist_lock);
@@ -308,13 +308,13 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
308 p = find_task_by_pid(pid); 308 p = find_task_by_pid(pid);
309 if (p) { 309 if (p) {
310 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 if (CPUCLOCK_PERTHREAD(which_clock)) {
311 if (p->tgid == current->tgid) { 311 if (same_thread_group(p, current)) {
312 error = cpu_clock_sample(which_clock, 312 error = cpu_clock_sample(which_clock,
313 p, &rtn); 313 p, &rtn);
314 } 314 }
315 } else { 315 } else {
316 read_lock(&tasklist_lock); 316 read_lock(&tasklist_lock);
317 if (p->tgid == pid && p->signal) { 317 if (thread_group_leader(p) && p->signal) {
318 error = 318 error =
319 cpu_clock_sample_group(which_clock, 319 cpu_clock_sample_group(which_clock,
320 p, &rtn); 320 p, &rtn);
@@ -355,7 +355,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
355 p = current; 355 p = current;
356 } else { 356 } else {
357 p = find_task_by_pid(pid); 357 p = find_task_by_pid(pid);
358 if (p && p->tgid != current->tgid) 358 if (p && !same_thread_group(p, current))
359 p = NULL; 359 p = NULL;
360 } 360 }
361 } else { 361 } else {
@@ -363,7 +363,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
363 p = current->group_leader; 363 p = current->group_leader;
364 } else { 364 } else {
365 p = find_task_by_pid(pid); 365 p = find_task_by_pid(pid);
366 if (p && p->tgid != pid) 366 if (p && !thread_group_leader(p))
367 p = NULL; 367 p = NULL;
368 } 368 }
369 } 369 }
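The open-coded p->tgid comparisons above are replaced by the same_thread_group() and thread_group_leader() helpers from <linux/sched.h>. As a rough sketch of their semantics only (the header definitions may differ in detail):

/* Illustrative approximations, not the actual header definitions. */
static inline int same_thread_group_sketch(struct task_struct *p1,
					   struct task_struct *p2)
{
	return p1->tgid == p2->tgid;	/* both tasks share one thread group */
}

static inline int thread_group_leader_sketch(struct task_struct *p)
{
	return p == p->group_leader;	/* p is the group's leader thread */
}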
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d71ed09fe1dd..35b4bbfc78ff 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -404,7 +404,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
404 404
405 if ((event->sigev_notify & SIGEV_THREAD_ID ) && 405 if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
406 (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || 406 (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
407 rtn->tgid != current->tgid || 407 !same_thread_group(rtn, current) ||
408 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) 408 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
409 return NULL; 409 return NULL;
410 410
@@ -608,7 +608,7 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
608 spin_lock(&timr->it_lock); 608 spin_lock(&timr->it_lock);
609 609
610 if ((timr->it_id != timer_id) || !(timr->it_process) || 610 if ((timr->it_id != timer_id) || !(timr->it_process) ||
611 timr->it_process->tgid != current->tgid) { 611 !same_thread_group(timr->it_process, current)) {
612 spin_unlock(&timr->it_lock); 612 spin_unlock(&timr->it_lock);
613 spin_unlock_irqrestore(&idr_lock, *flags); 613 spin_unlock_irqrestore(&idr_lock, *flags);
614 timr = NULL; 614 timr = NULL;
@@ -981,9 +981,20 @@ sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp)
981static int common_nsleep(const clockid_t which_clock, int flags, 981static int common_nsleep(const clockid_t which_clock, int flags,
982 struct timespec *tsave, struct timespec __user *rmtp) 982 struct timespec *tsave, struct timespec __user *rmtp)
983{ 983{
984 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 984 struct timespec rmt;
985 HRTIMER_MODE_ABS : HRTIMER_MODE_REL, 985 int ret;
986 which_clock); 986
987 ret = hrtimer_nanosleep(tsave, rmtp ? &rmt : NULL,
988 flags & TIMER_ABSTIME ?
989 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
990 which_clock);
991
992 if (ret && rmtp) {
993 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
994 return -EFAULT;
995 }
996
997 return ret;
987} 998}
988 999
989asmlinkage long 1000asmlinkage long
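The hunk above makes sys_clock_nanosleep() copy the unslept time back to user space only when the sleep is interrupted. From user space the resulting semantics look roughly like this (illustrative POSIX usage, not kernel code):

#include <time.h>
#include <errno.h>

/* Sleep for one second, resuming the remainder after any signal. */
static void sleep_one_second(void)
{
	struct timespec req = { .tv_sec = 1, .tv_nsec = 0 };
	struct timespec rem;

	/* clock_nanosleep() returns the error number directly; on EINTR
	 * the kernel has filled 'rem' with the time still to sleep. */
	while (clock_nanosleep(CLOCK_MONOTONIC, 0, &req, &rem) == EINTR)
		req = rem;
}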
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 14b0e10dc95c..8e186c678149 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -44,17 +44,6 @@ config PM_VERBOSE
44 ---help--- 44 ---help---
45 This option enables verbose messages from the Power Management code. 45 This option enables verbose messages from the Power Management code.
46 46
47config DISABLE_CONSOLE_SUSPEND
48 bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
49 depends on PM_DEBUG && PM_SLEEP
50 default n
51 ---help---
52 This option turns off the console suspend mechanism that prevents
53 debug messages from reaching the console during the suspend/resume
54 operations. This may be helpful when debugging device drivers'
55 suspend/resume routines, but may itself lead to problems, for example
56 if netconsole is used.
57
58config PM_TRACE 47config PM_TRACE
59 bool "Suspend/resume event tracing" 48 bool "Suspend/resume event tracing"
60 depends on PM_DEBUG && X86 && PM_SLEEP && EXPERIMENTAL 49 depends on PM_DEBUG && X86 && PM_SLEEP && EXPERIMENTAL
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index eb72255b5c86..8b15f777010a 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -45,17 +45,18 @@ enum {
45 45
46static int hibernation_mode = HIBERNATION_SHUTDOWN; 46static int hibernation_mode = HIBERNATION_SHUTDOWN;
47 47
48static struct hibernation_ops *hibernation_ops; 48static struct platform_hibernation_ops *hibernation_ops;
49 49
50/** 50/**
51 * hibernation_set_ops - set the global hibernate operations 51 * hibernation_set_ops - set the global hibernate operations
52 * @ops: the hibernation operations to use in subsequent hibernation transitions 52 * @ops: the hibernation operations to use in subsequent hibernation transitions
53 */ 53 */
54 54
55void hibernation_set_ops(struct hibernation_ops *ops) 55void hibernation_set_ops(struct platform_hibernation_ops *ops)
56{ 56{
57 if (ops && !(ops->prepare && ops->enter && ops->finish 57 if (ops && !(ops->start && ops->pre_snapshot && ops->finish
58 && ops->pre_restore && ops->restore_cleanup)) { 58 && ops->prepare && ops->enter && ops->pre_restore
59 && ops->restore_cleanup)) {
59 WARN_ON(1); 60 WARN_ON(1);
60 return; 61 return;
61 } 62 }
@@ -69,16 +70,37 @@ void hibernation_set_ops(struct hibernation_ops *ops)
69 mutex_unlock(&pm_mutex); 70 mutex_unlock(&pm_mutex);
70} 71}
71 72
73/**
74 * platform_start - tell the platform driver that we're starting
75 * hibernation
76 */
77
78static int platform_start(int platform_mode)
79{
80 return (platform_mode && hibernation_ops) ?
81 hibernation_ops->start() : 0;
82}
72 83
73/** 84/**
74 * platform_prepare - prepare the machine for hibernation using the 85 * platform_pre_snapshot - prepare the machine for hibernation using the
75 * platform driver if so configured and return an error code if it fails 86 * platform driver if so configured and return an error code if it fails
76 */ 87 */
77 88
78static int platform_prepare(int platform_mode) 89static int platform_pre_snapshot(int platform_mode)
79{ 90{
80 return (platform_mode && hibernation_ops) ? 91 return (platform_mode && hibernation_ops) ?
81 hibernation_ops->prepare() : 0; 92 hibernation_ops->pre_snapshot() : 0;
93}
94
95/**
96 * platform_leave - prepare the machine for switching to the normal mode
97 * of operation using the platform driver (called with interrupts disabled)
98 */
99
100static void platform_leave(int platform_mode)
101{
102 if (platform_mode && hibernation_ops)
103 hibernation_ops->leave();
82} 104}
83 105
84/** 106/**
@@ -118,6 +140,51 @@ static void platform_restore_cleanup(int platform_mode)
118} 140}
119 141
120/** 142/**
143 * create_image - freeze devices that need to be frozen with interrupts
144 * off, create the hibernation image and thaw those devices. Control
145 * reappears in this routine after a restore.
146 */
147
148int create_image(int platform_mode)
149{
150 int error;
151
152 error = arch_prepare_suspend();
153 if (error)
154 return error;
155
156 local_irq_disable();
157 /* At this point, device_suspend() has been called, but *not*
158 * device_power_down(). We *must* call device_power_down() now.
159 * Otherwise, drivers for some devices (e.g. interrupt controllers)
160 * become desynchronized with the actual state of the hardware
161 * at resume time, and evil weirdness ensues.
162 */
163 error = device_power_down(PMSG_FREEZE);
164 if (error) {
165 printk(KERN_ERR "Some devices failed to power down, "
166 "aborting suspend\n");
167 goto Enable_irqs;
168 }
169
170 save_processor_state();
171 error = swsusp_arch_suspend();
172 if (error)
173 printk(KERN_ERR "Error %d while creating the image\n", error);
174 /* Restore control flow magically appears here */
175 restore_processor_state();
176 if (!in_suspend)
177 platform_leave(platform_mode);
178 /* NOTE: device_power_up() is just a resume() for devices
179 * that suspended with irqs off ... no overall powerup.
180 */
181 device_power_up();
182 Enable_irqs:
183 local_irq_enable();
184 return error;
185}
186
187/**
121 * hibernation_snapshot - quiesce devices and create the hibernation 188 * hibernation_snapshot - quiesce devices and create the hibernation
122 * snapshot image. 189 * snapshot image.
123 * @platform_mode - if set, use the platform driver, if available, to 190 * @platform_mode - if set, use the platform driver, if available, to
@@ -135,12 +202,16 @@ int hibernation_snapshot(int platform_mode)
135 if (error) 202 if (error)
136 return error; 203 return error;
137 204
205 error = platform_start(platform_mode);
206 if (error)
207 return error;
208
138 suspend_console(); 209 suspend_console();
139 error = device_suspend(PMSG_FREEZE); 210 error = device_suspend(PMSG_FREEZE);
140 if (error) 211 if (error)
141 goto Resume_console; 212 goto Resume_console;
142 213
143 error = platform_prepare(platform_mode); 214 error = platform_pre_snapshot(platform_mode);
144 if (error) 215 if (error)
145 goto Resume_devices; 216 goto Resume_devices;
146 217
@@ -148,7 +219,7 @@ int hibernation_snapshot(int platform_mode)
148 if (!error) { 219 if (!error) {
149 if (hibernation_mode != HIBERNATION_TEST) { 220 if (hibernation_mode != HIBERNATION_TEST) {
150 in_suspend = 1; 221 in_suspend = 1;
151 error = swsusp_suspend(); 222 error = create_image(platform_mode);
152 /* Control returns here after successful restore */ 223 /* Control returns here after successful restore */
153 } else { 224 } else {
154 printk("swsusp debug: Waiting for 5 seconds.\n"); 225 printk("swsusp debug: Waiting for 5 seconds.\n");
@@ -207,21 +278,50 @@ int hibernation_platform_enter(void)
207{ 278{
208 int error; 279 int error;
209 280
210 if (hibernation_ops) { 281 if (!hibernation_ops)
211 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 282 return -ENOSYS;
212 /* 283
213 * We have cancelled the power transition by running 284 /*
214 * hibernation_ops->finish() before saving the image, so we 285 * We have cancelled the power transition by running
215 * should let the firmware know that we're going to enter the 286 * hibernation_ops->finish() before saving the image, so we should let
216 * sleep state after all 287 * the firmware know that we're going to enter the sleep state after all
217 */ 288 */
218 error = hibernation_ops->prepare(); 289 error = hibernation_ops->start();
219 sysdev_shutdown(); 290 if (error)
220 if (!error) 291 return error;
221 error = hibernation_ops->enter(); 292
222 } else { 293 suspend_console();
223 error = -ENOSYS; 294 error = device_suspend(PMSG_SUSPEND);
295 if (error)
296 goto Resume_console;
297
298 error = hibernation_ops->prepare();
299 if (error)
300 goto Resume_devices;
301
302 error = disable_nonboot_cpus();
303 if (error)
304 goto Finish;
305
306 local_irq_disable();
307 error = device_power_down(PMSG_SUSPEND);
308 if (!error) {
309 hibernation_ops->enter();
310 /* We should never get here */
311 while (1);
224 } 312 }
313 local_irq_enable();
314
315 /*
316 * We don't need to reenable the nonboot CPUs or resume consoles, since
317 * the system is going to be halted anyway.
318 */
319 Finish:
320 hibernation_ops->finish();
321 Resume_devices:
322 device_resume();
323 Resume_console:
324 resume_console();
225 return error; 325 return error;
226} 326}
227 327
@@ -238,14 +338,14 @@ static void power_down(void)
238 case HIBERNATION_TEST: 338 case HIBERNATION_TEST:
239 case HIBERNATION_TESTPROC: 339 case HIBERNATION_TESTPROC:
240 break; 340 break;
241 case HIBERNATION_SHUTDOWN:
242 kernel_power_off();
243 break;
244 case HIBERNATION_REBOOT: 341 case HIBERNATION_REBOOT:
245 kernel_restart(NULL); 342 kernel_restart(NULL);
246 break; 343 break;
247 case HIBERNATION_PLATFORM: 344 case HIBERNATION_PLATFORM:
248 hibernation_platform_enter(); 345 hibernation_platform_enter();
346 case HIBERNATION_SHUTDOWN:
347 kernel_power_off();
348 break;
249 } 349 }
250 kernel_halt(); 350 kernel_halt();
251 /* 351 /*
@@ -298,6 +398,10 @@ int hibernate(void)
298 if (error) 398 if (error)
299 goto Exit; 399 goto Exit;
300 400
401 printk("Syncing filesystems ... ");
402 sys_sync();
403 printk("done.\n");
404
301 error = prepare_processes(); 405 error = prepare_processes();
302 if (error) 406 if (error)
303 goto Finish; 407 goto Finish;
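A platform driver hooks into the reworked hibernation sequence by registering a platform_hibernation_ops table. A minimal sketch, assuming the callback signatures implied by the calls above (the acme_* names are hypothetical); note that hibernation_set_ops() rejects a table missing start, pre_snapshot, finish, prepare, enter, pre_restore or restore_cleanup:

#include <linux/suspend.h>

static int acme_start(void)		{ return 0; }	/* before the snapshot sequence */
static int acme_pre_snapshot(void)	{ return 0; }	/* devices suspended, pre-image */
static void acme_leave(void)		{ }		/* back to normal mode, irqs off */
static int acme_prepare(void)		{ return 0; }	/* before ->enter() */
static int acme_enter(void)		{ return 0; }	/* enter the sleep state */
static void acme_finish(void)		{ }		/* undo ->prepare() */
static int acme_pre_restore(void)	{ return 0; }	/* before loading the image */
static void acme_restore_cleanup(void)	{ }		/* undo ->pre_restore() */

static struct platform_hibernation_ops acme_hibernation_ops = {
	.start		 = acme_start,
	.pre_snapshot	 = acme_pre_snapshot,
	.leave		 = acme_leave,
	.prepare	 = acme_prepare,
	.enter		 = acme_enter,
	.finish		 = acme_finish,
	.pre_restore	 = acme_pre_restore,
	.restore_cleanup = acme_restore_cleanup,
};

/* typically called from the platform driver's init path: */
/* hibernation_set_ops(&acme_hibernation_ops); */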
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 350b485b3b60..3cdf95b1dc92 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -20,6 +20,7 @@
20#include <linux/resume-trace.h> 20#include <linux/resume-trace.h>
21#include <linux/freezer.h> 21#include <linux/freezer.h>
22#include <linux/vmstat.h> 22#include <linux/vmstat.h>
23#include <linux/syscalls.h>
23 24
24#include "power.h" 25#include "power.h"
25 26
@@ -32,39 +33,32 @@ DEFINE_MUTEX(pm_mutex);
32/* This is just an arbitrary number */ 33/* This is just an arbitrary number */
33#define FREE_PAGE_NUMBER (100) 34#define FREE_PAGE_NUMBER (100)
34 35
35struct pm_ops *pm_ops; 36static struct platform_suspend_ops *suspend_ops;
36 37
37/** 38/**
38 * pm_set_ops - Set the global power method table. 39 * suspend_set_ops - Set the global suspend method table.
39 * @ops: Pointer to ops structure. 40 * @ops: Pointer to ops structure.
40 */ 41 */
41 42
42void pm_set_ops(struct pm_ops * ops) 43void suspend_set_ops(struct platform_suspend_ops *ops)
43{ 44{
44 mutex_lock(&pm_mutex); 45 mutex_lock(&pm_mutex);
45 pm_ops = ops; 46 suspend_ops = ops;
46 mutex_unlock(&pm_mutex); 47 mutex_unlock(&pm_mutex);
47} 48}
48 49
49/** 50/**
50 * pm_valid_only_mem - generic memory-only valid callback 51 * suspend_valid_only_mem - generic memory-only valid callback
51 * 52 *
52 * pm_ops drivers that implement mem suspend only and only need 53 * Platform drivers that implement mem suspend only and only need
53 * to check for that in their .valid callback can use this instead 54 * to check for that in their .valid callback can use this instead
54 * of rolling their own .valid callback. 55 * of rolling their own .valid callback.
55 */ 56 */
56int pm_valid_only_mem(suspend_state_t state) 57int suspend_valid_only_mem(suspend_state_t state)
57{ 58{
58 return state == PM_SUSPEND_MEM; 59 return state == PM_SUSPEND_MEM;
59} 60}
60 61
61
62static inline void pm_finish(suspend_state_t state)
63{
64 if (pm_ops->finish)
65 pm_ops->finish(state);
66}
67
68/** 62/**
69 * suspend_prepare - Do prep work before entering low-power state. 63 * suspend_prepare - Do prep work before entering low-power state.
70 * 64 *
@@ -76,7 +70,7 @@ static int suspend_prepare(void)
76 int error; 70 int error;
77 unsigned int free_pages; 71 unsigned int free_pages;
78 72
79 if (!pm_ops || !pm_ops->enter) 73 if (!suspend_ops || !suspend_ops->enter)
80 return -EPERM; 74 return -EPERM;
81 75
82 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); 76 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
@@ -128,7 +122,7 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
128 * 122 *
129 * This function should be called after devices have been suspended. 123 * This function should be called after devices have been suspended.
130 */ 124 */
131int suspend_enter(suspend_state_t state) 125static int suspend_enter(suspend_state_t state)
132{ 126{
133 int error = 0; 127 int error = 0;
134 128
@@ -139,7 +133,7 @@ int suspend_enter(suspend_state_t state)
139 printk(KERN_ERR "Some devices failed to power down\n"); 133 printk(KERN_ERR "Some devices failed to power down\n");
140 goto Done; 134 goto Done;
141 } 135 }
142 error = pm_ops->enter(state); 136 error = suspend_ops->enter(state);
143 device_power_up(); 137 device_power_up();
144 Done: 138 Done:
145 arch_suspend_enable_irqs(); 139 arch_suspend_enable_irqs();
@@ -156,11 +150,11 @@ int suspend_devices_and_enter(suspend_state_t state)
156{ 150{
157 int error; 151 int error;
158 152
159 if (!pm_ops) 153 if (!suspend_ops)
160 return -ENOSYS; 154 return -ENOSYS;
161 155
162 if (pm_ops->set_target) { 156 if (suspend_ops->set_target) {
163 error = pm_ops->set_target(state); 157 error = suspend_ops->set_target(state);
164 if (error) 158 if (error)
165 return error; 159 return error;
166 } 160 }
@@ -170,8 +164,8 @@ int suspend_devices_and_enter(suspend_state_t state)
170 printk(KERN_ERR "Some devices failed to suspend\n"); 164 printk(KERN_ERR "Some devices failed to suspend\n");
171 goto Resume_console; 165 goto Resume_console;
172 } 166 }
173 if (pm_ops->prepare) { 167 if (suspend_ops->prepare) {
174 error = pm_ops->prepare(state); 168 error = suspend_ops->prepare();
175 if (error) 169 if (error)
176 goto Resume_devices; 170 goto Resume_devices;
177 } 171 }
@@ -180,7 +174,8 @@ int suspend_devices_and_enter(suspend_state_t state)
180 suspend_enter(state); 174 suspend_enter(state);
181 175
182 enable_nonboot_cpus(); 176 enable_nonboot_cpus();
183 pm_finish(state); 177 if (suspend_ops->finish)
178 suspend_ops->finish();
184 Resume_devices: 179 Resume_devices:
185 device_resume(); 180 device_resume();
186 Resume_console: 181 Resume_console:
@@ -214,7 +209,7 @@ static inline int valid_state(suspend_state_t state)
214 /* All states need lowlevel support and need to be valid 209 /* All states need lowlevel support and need to be valid
215 * to the lowlevel implementation, no valid callback 210 * to the lowlevel implementation, no valid callback
216 * implies that none are valid. */ 211 * implies that none are valid. */
217 if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state)) 212 if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state))
218 return 0; 213 return 0;
219 return 1; 214 return 1;
220} 215}
@@ -236,9 +231,14 @@ static int enter_state(suspend_state_t state)
236 231
237 if (!valid_state(state)) 232 if (!valid_state(state))
238 return -ENODEV; 233 return -ENODEV;
234
239 if (!mutex_trylock(&pm_mutex)) 235 if (!mutex_trylock(&pm_mutex))
240 return -EBUSY; 236 return -EBUSY;
241 237
238 printk("Syncing filesystems ... ");
239 sys_sync();
240 printk("done.\n");
241
242 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 242 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
243 if ((error = suspend_prepare())) 243 if ((error = suspend_prepare()))
244 goto Unlock; 244 goto Unlock;
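The suspend side follows the same pattern: pm_set_ops()/struct pm_ops become suspend_set_ops()/struct platform_suspend_ops. A minimal sketch for a mem-only platform (acme_* names are hypothetical); .valid and .enter are the two hooks the core actually requires, the rest are optional:

#include <linux/suspend.h>

static int acme_suspend_enter(suspend_state_t state)
{
	/* program the hardware for the requested sleep state */
	return 0;
}

static struct platform_suspend_ops acme_suspend_ops = {
	.valid	= suspend_valid_only_mem,	/* only PM_SUSPEND_MEM is supported */
	.enter	= acme_suspend_enter,
	/* .set_target, .prepare and .finish are optional */
};

/* suspend_set_ops(&acme_suspend_ops); */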
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 95fbf2dd3fe3..195dc4611764 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -11,14 +11,32 @@ struct swsusp_info {
11 unsigned long size; 11 unsigned long size;
12} __attribute__((aligned(PAGE_SIZE))); 12} __attribute__((aligned(PAGE_SIZE)));
13 13
14#ifdef CONFIG_HIBERNATION
15#ifdef CONFIG_ARCH_HIBERNATION_HEADER
16/* Maximum size of architecture specific data in a hibernation header */
17#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
14 18
19extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
20extern int arch_hibernation_header_restore(void *addr);
21
22static inline int init_header_complete(struct swsusp_info *info)
23{
24 return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
25}
26
27static inline char *check_image_kernel(struct swsusp_info *info)
28{
29 return arch_hibernation_header_restore(info) ?
30 "architecture specific data" : NULL;
31}
32#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
15 33
16#ifdef CONFIG_HIBERNATION
17/* 34/*
18 * Keep some memory free so that I/O operations can succeed without paging 35 * Keep some memory free so that I/O operations can succeed without paging
19 * [Might this be more than 4 MB?] 36 * [Might this be more than 4 MB?]
20 */ 37 */
21#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT) 38#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT)
39
22/* 40/*
23 * Keep 1 MB of memory free so that device drivers can allocate some pages in 41 * Keep 1 MB of memory free so that device drivers can allocate some pages in
24 * their .suspend() routines without breaking the suspend to disk. 42 * their .suspend() routines without breaking the suspend to disk.
@@ -165,7 +183,6 @@ extern int swsusp_swap_in_use(void);
165extern int swsusp_check(void); 183extern int swsusp_check(void);
166extern int swsusp_shrink_memory(void); 184extern int swsusp_shrink_memory(void);
167extern void swsusp_free(void); 185extern void swsusp_free(void);
168extern int swsusp_suspend(void);
169extern int swsusp_resume(void); 186extern int swsusp_resume(void);
170extern int swsusp_read(unsigned int *flags_p); 187extern int swsusp_read(unsigned int *flags_p);
171extern int swsusp_write(unsigned int flags); 188extern int swsusp_write(unsigned int flags);
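With CONFIG_ARCH_HIBERNATION_HEADER set, the architecture provides the two hooks declared above and may stash up to MAX_ARCH_HEADER_SIZE bytes of its own data in the image header. A hedged sketch of such an implementation; the acme_arch_header layout is made up for illustration:

#include <linux/types.h>
#include <linux/string.h>
#include <linux/utsname.h>
#include <linux/errno.h>

struct acme_arch_header {		/* hypothetical arch payload */
	struct new_utsname uts;
	u32 boot_mode;
};

int arch_hibernation_header_save(void *addr, unsigned int max_size)
{
	struct acme_arch_header *hdr = addr;

	if (max_size < sizeof(*hdr))
		return -EOVERFLOW;
	memcpy(&hdr->uts, init_utsname(), sizeof(hdr->uts));
	hdr->boot_mode = 0;		/* whatever the platform needs to record */
	return 0;
}

int arch_hibernation_header_restore(void *addr)
{
	const struct acme_arch_header *hdr = addr;

	/* refuse to resume an image written by a different kernel build */
	if (memcmp(&hdr->uts, init_utsname(), sizeof(hdr->uts)))
		return -EINVAL;
	return 0;
}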
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 3434940a3df1..6533923e711b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -75,21 +75,79 @@ void refrigerator(void)
75 __set_current_state(save); 75 __set_current_state(save);
76} 76}
77 77
78static void freeze_task(struct task_struct *p) 78static void fake_signal_wake_up(struct task_struct *p, int resume)
79{ 79{
80 unsigned long flags; 80 unsigned long flags;
81 81
82 if (!freezing(p)) { 82 spin_lock_irqsave(&p->sighand->siglock, flags);
83 signal_wake_up(p, resume);
84 spin_unlock_irqrestore(&p->sighand->siglock, flags);
85}
86
87static void send_fake_signal(struct task_struct *p)
88{
89 if (p->state == TASK_STOPPED)
90 force_sig_specific(SIGSTOP, p);
91 fake_signal_wake_up(p, p->state == TASK_STOPPED);
92}
93
94static int has_mm(struct task_struct *p)
95{
96 return (p->mm && !(p->flags & PF_BORROWED_MM));
97}
98
99/**
100 * freeze_task - send a freeze request to given task
101 * @p: task to send the request to
102 * @with_mm_only: if set, the request will only be sent if the task has its
103 * own mm
104 * Return value: 0, if @with_mm_only is set and the task has no mm of its
105 * own or the task is frozen, 1, otherwise
106 *
107 * The freeze request is sent by setting the task's TIF_FREEZE flag and
108 * either sending a fake signal to it or waking it up, depending on whether
109 * or not it has its own mm (ie. it is a user land task). If @with_mm_only
110 * is set and the task has no mm of its own (ie. it is a kernel thread),
111 * its TIF_FREEZE flag should not be set.
112 *
113 * The task_lock() is necessary to prevent races with exit_mm() or
114 * use_mm()/unuse_mm() from occurring.
115 */
116static int freeze_task(struct task_struct *p, int with_mm_only)
117{
118 int ret = 1;
119
120 task_lock(p);
121 if (freezing(p)) {
122 if (has_mm(p)) {
123 if (!signal_pending(p))
124 fake_signal_wake_up(p, 0);
125 } else {
126 if (with_mm_only)
127 ret = 0;
128 else
129 wake_up_state(p, TASK_INTERRUPTIBLE);
130 }
131 } else {
83 rmb(); 132 rmb();
84 if (!frozen(p)) { 133 if (frozen(p)) {
85 set_freeze_flag(p); 134 ret = 0;
86 if (p->state == TASK_STOPPED) 135 } else {
87 force_sig_specific(SIGSTOP, p); 136 if (has_mm(p)) {
88 spin_lock_irqsave(&p->sighand->siglock, flags); 137 set_freeze_flag(p);
89 signal_wake_up(p, p->state == TASK_STOPPED); 138 send_fake_signal(p);
90 spin_unlock_irqrestore(&p->sighand->siglock, flags); 139 } else {
140 if (with_mm_only) {
141 ret = 0;
142 } else {
143 set_freeze_flag(p);
144 wake_up_state(p, TASK_INTERRUPTIBLE);
145 }
146 }
91 } 147 }
92 } 148 }
149 task_unlock(p);
150 return ret;
93} 151}
94 152
95static void cancel_freezing(struct task_struct *p) 153static void cancel_freezing(struct task_struct *p)
@@ -110,6 +168,11 @@ static int try_to_freeze_tasks(int freeze_user_space)
110 struct task_struct *g, *p; 168 struct task_struct *g, *p;
111 unsigned long end_time; 169 unsigned long end_time;
112 unsigned int todo; 170 unsigned int todo;
171 struct timeval start, end;
172 s64 elapsed_csecs64;
173 unsigned int elapsed_csecs;
174
175 do_gettimeofday(&start);
113 176
114 end_time = jiffies + TIMEOUT; 177 end_time = jiffies + TIMEOUT;
115 do { 178 do {
@@ -119,31 +182,14 @@ static int try_to_freeze_tasks(int freeze_user_space)
119 if (frozen(p) || !freezeable(p)) 182 if (frozen(p) || !freezeable(p))
120 continue; 183 continue;
121 184
122 if (freeze_user_space) { 185 if (p->state == TASK_TRACED && frozen(p->parent)) {
123 if (p->state == TASK_TRACED && 186 cancel_freezing(p);
124 frozen(p->parent)) { 187 continue;
125 cancel_freezing(p);
126 continue;
127 }
128 /*
129 * Kernel threads should not have TIF_FREEZE set
130 * at this point, so we must ensure that either
131 * p->mm is not NULL *and* PF_BORROWED_MM is
132 * unset, or TIF_FRREZE is left unset.
133 * The task_lock() is necessary to prevent races
134 * with exit_mm() or use_mm()/unuse_mm() from
135 * occuring.
136 */
137 task_lock(p);
138 if (!p->mm || (p->flags & PF_BORROWED_MM)) {
139 task_unlock(p);
140 continue;
141 }
142 freeze_task(p);
143 task_unlock(p);
144 } else {
145 freeze_task(p);
146 } 188 }
189
190 if (!freeze_task(p, freeze_user_space))
191 continue;
192
147 if (!freezer_should_skip(p)) 193 if (!freezer_should_skip(p))
148 todo++; 194 todo++;
149 } while_each_thread(g, p); 195 } while_each_thread(g, p);
@@ -153,6 +199,11 @@ static int try_to_freeze_tasks(int freeze_user_space)
153 break; 199 break;
154 } while (todo); 200 } while (todo);
155 201
202 do_gettimeofday(&end);
203 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
204 do_div(elapsed_csecs64, NSEC_PER_SEC / 100);
205 elapsed_csecs = elapsed_csecs64;
206
156 if (todo) { 207 if (todo) {
157 /* This does not unfreeze processes that are already frozen 208 /* This does not unfreeze processes that are already frozen
158 * (we have slightly ugly calling convention in that respect, 209 * (we have slightly ugly calling convention in that respect,
@@ -160,10 +211,9 @@ static int try_to_freeze_tasks(int freeze_user_space)
160 * but it cleans up leftover PF_FREEZE requests. 211 * but it cleans up leftover PF_FREEZE requests.
161 */ 212 */
162 printk("\n"); 213 printk("\n");
163 printk(KERN_ERR "Freezing of %s timed out after %d seconds " 214 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
164 "(%d tasks refusing to freeze):\n", 215 "(%d tasks refusing to freeze):\n",
165 freeze_user_space ? "user space " : "tasks ", 216 elapsed_csecs / 100, elapsed_csecs % 100, todo);
166 TIMEOUT / HZ, todo);
167 show_state(); 217 show_state();
168 read_lock(&tasklist_lock); 218 read_lock(&tasklist_lock);
169 do_each_thread(g, p) { 219 do_each_thread(g, p) {
@@ -174,6 +224,9 @@ static int try_to_freeze_tasks(int freeze_user_space)
174 task_unlock(p); 224 task_unlock(p);
175 } while_each_thread(g, p); 225 } while_each_thread(g, p);
176 read_unlock(&tasklist_lock); 226 read_unlock(&tasklist_lock);
227 } else {
228 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
229 elapsed_csecs % 100);
177 } 230 }
178 231
179 return todo ? -EBUSY : 0; 232 return todo ? -EBUSY : 0;
@@ -186,19 +239,21 @@ int freeze_processes(void)
186{ 239{
187 int error; 240 int error;
188 241
189 printk("Stopping tasks ... "); 242 printk("Freezing user space processes ... ");
190 error = try_to_freeze_tasks(FREEZER_USER_SPACE); 243 error = try_to_freeze_tasks(FREEZER_USER_SPACE);
191 if (error) 244 if (error)
192 return error; 245 goto Exit;
246 printk("done.\n");
193 247
194 sys_sync(); 248 printk("Freezing remaining freezable tasks ... ");
195 error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); 249 error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
196 if (error) 250 if (error)
197 return error; 251 goto Exit;
198 252 printk("done.");
199 printk("done.\n"); 253 Exit:
200 BUG_ON(in_atomic()); 254 BUG_ON(in_atomic());
201 return 0; 255 printk("\n");
256 return error;
202} 257}
203 258
204static void thaw_tasks(int thaw_user_space) 259static void thaw_tasks(int thaw_user_space)
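freeze_task() only marks the target with TIF_FREEZE and pokes it; the task itself still has to enter the refrigerator. For a kernel thread the usual cooperation pattern is roughly the following sketch (not code from this patch):

#include <linux/kthread.h>
#include <linux/freezer.h>

static int acme_worker(void *unused)	/* hypothetical kernel thread */
{
	set_freezable();		/* opt in; kthreads are nofreeze by default */

	while (!kthread_should_stop()) {
		try_to_freeze();	/* parks in refrigerator() while TIF_FREEZE is set */

		/* ... do one unit of work ... */
	}
	return 0;
}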
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index a686590d88c1..78039b477d2b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1005,11 +1005,12 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1005 } 1005 }
1006 memory_bm_position_reset(orig_bm); 1006 memory_bm_position_reset(orig_bm);
1007 memory_bm_position_reset(copy_bm); 1007 memory_bm_position_reset(copy_bm);
1008 do { 1008 for(;;) {
1009 pfn = memory_bm_next_pfn(orig_bm); 1009 pfn = memory_bm_next_pfn(orig_bm);
1010 if (likely(pfn != BM_END_OF_MAP)) 1010 if (unlikely(pfn == BM_END_OF_MAP))
1011 copy_data_page(memory_bm_next_pfn(copy_bm), pfn); 1011 break;
1012 } while (pfn != BM_END_OF_MAP); 1012 copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
1013 }
1013} 1014}
1014 1015
1015/* Total number of image pages */ 1016/* Total number of image pages */
@@ -1239,17 +1240,39 @@ asmlinkage int swsusp_save(void)
1239 return 0; 1240 return 0;
1240} 1241}
1241 1242
1242static void init_header(struct swsusp_info *info) 1243#ifndef CONFIG_ARCH_HIBERNATION_HEADER
1244static int init_header_complete(struct swsusp_info *info)
1243{ 1245{
1244 memset(info, 0, sizeof(struct swsusp_info)); 1246 memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
1245 info->version_code = LINUX_VERSION_CODE; 1247 info->version_code = LINUX_VERSION_CODE;
1248 return 0;
1249}
1250
1251static char *check_image_kernel(struct swsusp_info *info)
1252{
1253 if (info->version_code != LINUX_VERSION_CODE)
1254 return "kernel version";
1255 if (strcmp(info->uts.sysname,init_utsname()->sysname))
1256 return "system type";
1257 if (strcmp(info->uts.release,init_utsname()->release))
1258 return "kernel release";
1259 if (strcmp(info->uts.version,init_utsname()->version))
1260 return "version";
1261 if (strcmp(info->uts.machine,init_utsname()->machine))
1262 return "machine";
1263 return NULL;
1264}
1265#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
1266
1267static int init_header(struct swsusp_info *info)
1268{
1269 memset(info, 0, sizeof(struct swsusp_info));
1246 info->num_physpages = num_physpages; 1270 info->num_physpages = num_physpages;
1247 memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
1248 info->cpus = num_online_cpus();
1249 info->image_pages = nr_copy_pages; 1271 info->image_pages = nr_copy_pages;
1250 info->pages = nr_copy_pages + nr_meta_pages + 1; 1272 info->pages = nr_copy_pages + nr_meta_pages + 1;
1251 info->size = info->pages; 1273 info->size = info->pages;
1252 info->size <<= PAGE_SHIFT; 1274 info->size <<= PAGE_SHIFT;
1275 return init_header_complete(info);
1253} 1276}
1254 1277
1255/** 1278/**
@@ -1303,7 +1326,11 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1303 return -ENOMEM; 1326 return -ENOMEM;
1304 } 1327 }
1305 if (!handle->offset) { 1328 if (!handle->offset) {
1306 init_header((struct swsusp_info *)buffer); 1329 int error;
1330
1331 error = init_header((struct swsusp_info *)buffer);
1332 if (error)
1333 return error;
1307 handle->buffer = buffer; 1334 handle->buffer = buffer;
1308 memory_bm_position_reset(&orig_bm); 1335 memory_bm_position_reset(&orig_bm);
1309 memory_bm_position_reset(&copy_bm); 1336 memory_bm_position_reset(&copy_bm);
@@ -1394,22 +1421,13 @@ duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
1394 } 1421 }
1395} 1422}
1396 1423
1397static inline int check_header(struct swsusp_info *info) 1424static int check_header(struct swsusp_info *info)
1398{ 1425{
1399 char *reason = NULL; 1426 char *reason;
1400 1427
1401 if (info->version_code != LINUX_VERSION_CODE) 1428 reason = check_image_kernel(info);
1402 reason = "kernel version"; 1429 if (!reason && info->num_physpages != num_physpages)
1403 if (info->num_physpages != num_physpages)
1404 reason = "memory size"; 1430 reason = "memory size";
1405 if (strcmp(info->uts.sysname,init_utsname()->sysname))
1406 reason = "system type";
1407 if (strcmp(info->uts.release,init_utsname()->release))
1408 reason = "kernel release";
1409 if (strcmp(info->uts.version,init_utsname()->version))
1410 reason = "version";
1411 if (strcmp(info->uts.machine,init_utsname()->machine))
1412 reason = "machine";
1413 if (reason) { 1431 if (reason) {
1414 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); 1432 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
1415 return -EPERM; 1433 return -EPERM;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 5da304c8f1f6..e1722d3155f1 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -270,39 +270,6 @@ int swsusp_shrink_memory(void)
270 return 0; 270 return 0;
271} 271}
272 272
273int swsusp_suspend(void)
274{
275 int error;
276
277 if ((error = arch_prepare_suspend()))
278 return error;
279
280 local_irq_disable();
281 /* At this point, device_suspend() has been called, but *not*
282 * device_power_down(). We *must* device_power_down() now.
283 * Otherwise, drivers for some devices (e.g. interrupt controllers)
284 * become desynchronized with the actual state of the hardware
285 * at resume time, and evil weirdness ensues.
286 */
287 if ((error = device_power_down(PMSG_FREEZE))) {
288 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
289 goto Enable_irqs;
290 }
291
292 save_processor_state();
293 if ((error = swsusp_arch_suspend()))
294 printk(KERN_ERR "Error %d suspending\n", error);
295 /* Restore control flow magically appears here */
296 restore_processor_state();
297 /* NOTE: device_power_up() is just a resume() for devices
298 * that suspended with irqs off ... no overall powerup.
299 */
300 device_power_up();
301 Enable_irqs:
302 local_irq_enable();
303 return error;
304}
305
306int swsusp_resume(void) 273int swsusp_resume(void)
307{ 274{
308 int error; 275 int error;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bd0723a7df3f..5bd321bcbb75 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -153,6 +153,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
153 mutex_lock(&pm_mutex); 153 mutex_lock(&pm_mutex);
154 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 154 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
155 if (!error) { 155 if (!error) {
156 printk("Syncing filesystems ... ");
157 sys_sync();
158 printk("done.\n");
159
156 error = freeze_processes(); 160 error = freeze_processes();
157 if (error) 161 if (error)
158 thaw_processes(); 162 thaw_processes();
diff --git a/kernel/printk.c b/kernel/printk.c
index 52493474f0ab..a30fe33de395 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -862,7 +862,16 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
862 return -1; 862 return -1;
863} 863}
864 864
865#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND 865int console_suspend_enabled = 1;
866EXPORT_SYMBOL(console_suspend_enabled);
867
868static int __init console_suspend_disable(char *str)
869{
870 console_suspend_enabled = 0;
871 return 1;
872}
873__setup("no_console_suspend", console_suspend_disable);
874
866/** 875/**
867 * suspend_console - suspend the console subsystem 876 * suspend_console - suspend the console subsystem
868 * 877 *
@@ -870,6 +879,8 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
870 */ 879 */
871void suspend_console(void) 880void suspend_console(void)
872{ 881{
882 if (!console_suspend_enabled)
883 return;
873 printk("Suspending console(s)\n"); 884 printk("Suspending console(s)\n");
874 acquire_console_sem(); 885 acquire_console_sem();
875 console_suspended = 1; 886 console_suspended = 1;
@@ -877,10 +888,11 @@ void suspend_console(void)
877 888
878void resume_console(void) 889void resume_console(void)
879{ 890{
891 if (!console_suspend_enabled)
892 return;
880 console_suspended = 0; 893 console_suspended = 0;
881 release_console_sem(); 894 release_console_sem();
882} 895}
883#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */
884 896
885/** 897/**
886 * acquire_console_sem - lock the console system for exclusive use. 898 * acquire_console_sem - lock the console system for exclusive use.
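Keeping consoles alive across suspend is now a runtime decision rather than a build-time one; booting with the new parameter, for example (the serial console setup here is only an example):

	console=ttyS0,115200 no_console_suspend

With console_suspend_enabled cleared, suspend_console() and resume_console() turn into no-ops, so printk output keeps reaching the console throughout the suspend/resume path.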
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a73ebd3b9d4c..7c76f2ffaeaa 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -19,6 +19,7 @@
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/signal.h> 20#include <linux/signal.h>
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/pid_namespace.h>
22 23
23#include <asm/pgtable.h> 24#include <asm/pgtable.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
@@ -168,7 +169,7 @@ int ptrace_attach(struct task_struct *task)
168 retval = -EPERM; 169 retval = -EPERM;
169 if (task->pid <= 1) 170 if (task->pid <= 1)
170 goto out; 171 goto out;
171 if (task->tgid == current->tgid) 172 if (same_thread_group(task, current))
172 goto out; 173 goto out;
173 174
174repeat: 175repeat:
@@ -443,7 +444,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
443 return ERR_PTR(-EPERM); 444 return ERR_PTR(-EPERM);
444 445
445 read_lock(&tasklist_lock); 446 read_lock(&tasklist_lock);
446 child = find_task_by_pid(pid); 447 child = find_task_by_vpid(pid);
447 if (child) 448 if (child)
448 get_task_struct(child); 449 get_task_struct(child);
449 450
diff --git a/kernel/relay.c b/kernel/relay.c
index ad855017bc59..61134eb7a0c8 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -370,7 +370,7 @@ void relay_reset(struct rchan *chan)
370 if (!chan) 370 if (!chan)
371 return; 371 return;
372 372
373 if (chan->is_global && chan->buf[0]) { 373 if (chan->is_global && chan->buf[0]) {
374 __relay_reset(chan->buf[0], 0); 374 __relay_reset(chan->buf[0], 0);
375 return; 375 return;
376 } 376 }
@@ -850,13 +850,13 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
850 buf->subbufs_consumed = consumed; 850 buf->subbufs_consumed = consumed;
851 buf->bytes_consumed = 0; 851 buf->bytes_consumed = 0;
852 } 852 }
853 853
854 produced = (produced % n_subbufs) * subbuf_size + buf->offset; 854 produced = (produced % n_subbufs) * subbuf_size + buf->offset;
855 consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed; 855 consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
856 856
857 if (consumed > produced) 857 if (consumed > produced)
858 produced += n_subbufs * subbuf_size; 858 produced += n_subbufs * subbuf_size;
859 859
860 if (consumed == produced) 860 if (consumed == produced)
861 return 0; 861 return 0;
862 862
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 6b0703db152d..56d73cb8826d 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -87,7 +87,7 @@ static int rt_trace_on = 1;
87static void printk_task(struct task_struct *p) 87static void printk_task(struct task_struct *p)
88{ 88{
89 if (p) 89 if (p)
90 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); 90 printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
91 else 91 else
92 printk("<none>"); 92 printk("<none>");
93} 93}
@@ -152,22 +152,25 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
152 printk( "[ BUG: circular locking deadlock detected! ]\n"); 152 printk( "[ BUG: circular locking deadlock detected! ]\n");
153 printk( "--------------------------------------------\n"); 153 printk( "--------------------------------------------\n");
154 printk("%s/%d is deadlocking current task %s/%d\n\n", 154 printk("%s/%d is deadlocking current task %s/%d\n\n",
155 task->comm, task->pid, current->comm, current->pid); 155 task->comm, task_pid_nr(task),
156 current->comm, task_pid_nr(current));
156 157
157 printk("\n1) %s/%d is trying to acquire this lock:\n", 158 printk("\n1) %s/%d is trying to acquire this lock:\n",
158 current->comm, current->pid); 159 current->comm, task_pid_nr(current));
159 printk_lock(waiter->lock, 1); 160 printk_lock(waiter->lock, 1);
160 161
161 printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); 162 printk("\n2) %s/%d is blocked on this lock:\n",
163 task->comm, task_pid_nr(task));
162 printk_lock(waiter->deadlock_lock, 1); 164 printk_lock(waiter->deadlock_lock, 1);
163 165
164 debug_show_held_locks(current); 166 debug_show_held_locks(current);
165 debug_show_held_locks(task); 167 debug_show_held_locks(task);
166 168
167 printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); 169 printk("\n%s/%d's [blocked] stackdump:\n\n",
170 task->comm, task_pid_nr(task));
168 show_stack(task, NULL); 171 show_stack(task, NULL);
169 printk("\n%s/%d's [current] stackdump:\n\n", 172 printk("\n%s/%d's [current] stackdump:\n\n",
170 current->comm, current->pid); 173 current->comm, task_pid_nr(current));
171 dump_stack(); 174 dump_stack();
172 debug_show_all_locks(); 175 debug_show_all_locks();
173 176
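The conversion pattern used in these printk changes: report pids through task_pid_nr() instead of dereferencing ->pid directly, e.g. (illustrative caller, not part of the patch):

#include <linux/kernel.h>
#include <linux/sched.h>

static void report_owner(struct task_struct *p)
{
	/* task_pid_nr() yields the pid as seen from the initial namespace */
	printk(KERN_INFO "%s/%d is holding the lock\n",
	       p->comm, task_pid_nr(p));
}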
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 8cd9bd2cdb34..0deef71ff8d2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -185,7 +185,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
185 prev_max = max_lock_depth; 185 prev_max = max_lock_depth;
186 printk(KERN_WARNING "Maximum lock depth %d reached " 186 printk(KERN_WARNING "Maximum lock depth %d reached "
187 "task: %s (%d)\n", max_lock_depth, 187 "task: %s (%d)\n", max_lock_depth,
188 top_task->comm, top_task->pid); 188 top_task->comm, task_pid_nr(top_task));
189 } 189 }
190 put_task_struct(task); 190 put_task_struct(task);
191 191
diff --git a/kernel/sched.c b/kernel/sched.c
index 92721d1534b8..7581e331b139 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -44,6 +44,7 @@
44#include <linux/vmalloc.h> 44#include <linux/vmalloc.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/delay.h> 46#include <linux/delay.h>
47#include <linux/pid_namespace.h>
47#include <linux/smp.h> 48#include <linux/smp.h>
48#include <linux/threads.h> 49#include <linux/threads.h>
49#include <linux/timer.h> 50#include <linux/timer.h>
@@ -51,6 +52,7 @@
51#include <linux/cpu.h> 52#include <linux/cpu.h>
52#include <linux/cpuset.h> 53#include <linux/cpuset.h>
53#include <linux/percpu.h> 54#include <linux/percpu.h>
55#include <linux/cpu_acct.h>
54#include <linux/kthread.h> 56#include <linux/kthread.h>
55#include <linux/seq_file.h> 57#include <linux/seq_file.h>
56#include <linux/sysctl.h> 58#include <linux/sysctl.h>
@@ -153,10 +155,15 @@ struct rt_prio_array {
153 155
154#ifdef CONFIG_FAIR_GROUP_SCHED 156#ifdef CONFIG_FAIR_GROUP_SCHED
155 157
158#include <linux/cgroup.h>
159
156struct cfs_rq; 160struct cfs_rq;
157 161
158/* task group related information */ 162/* task group related information */
159struct task_group { 163struct task_group {
164#ifdef CONFIG_FAIR_CGROUP_SCHED
165 struct cgroup_subsys_state css;
166#endif
160 /* schedulable entities of this group on each cpu */ 167 /* schedulable entities of this group on each cpu */
161 struct sched_entity **se; 168 struct sched_entity **se;
162 /* runqueue "owned" by this group on each cpu */ 169 /* runqueue "owned" by this group on each cpu */
@@ -197,6 +204,9 @@ static inline struct task_group *task_group(struct task_struct *p)
197 204
198#ifdef CONFIG_FAIR_USER_SCHED 205#ifdef CONFIG_FAIR_USER_SCHED
199 tg = p->user->tg; 206 tg = p->user->tg;
207#elif defined(CONFIG_FAIR_CGROUP_SCHED)
208 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
209 struct task_group, css);
200#else 210#else
201 tg = &init_task_group; 211 tg = &init_task_group;
202#endif 212#endif
@@ -266,7 +276,8 @@ struct rt_rq {
266 * acquire operations must be ordered by ascending &runqueue. 276 * acquire operations must be ordered by ascending &runqueue.
267 */ 277 */
268struct rq { 278struct rq {
269 spinlock_t lock; /* runqueue lock */ 279 /* runqueue lock: */
280 spinlock_t lock;
270 281
271 /* 282 /*
272 * nr_running and cpu_load should be in the same cacheline because 283 * nr_running and cpu_load should be in the same cacheline because
@@ -279,13 +290,15 @@ struct rq {
279#ifdef CONFIG_NO_HZ 290#ifdef CONFIG_NO_HZ
280 unsigned char in_nohz_recently; 291 unsigned char in_nohz_recently;
281#endif 292#endif
282 struct load_weight load; /* capture load from *all* tasks on this cpu */ 293 /* capture load from *all* tasks on this cpu: */
294 struct load_weight load;
283 unsigned long nr_load_updates; 295 unsigned long nr_load_updates;
284 u64 nr_switches; 296 u64 nr_switches;
285 297
286 struct cfs_rq cfs; 298 struct cfs_rq cfs;
287#ifdef CONFIG_FAIR_GROUP_SCHED 299#ifdef CONFIG_FAIR_GROUP_SCHED
288 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */ 300 /* list of leaf cfs_rq on this cpu: */
301 struct list_head leaf_cfs_rq_list;
289#endif 302#endif
290 struct rt_rq rt; 303 struct rt_rq rt;
291 304
@@ -317,7 +330,8 @@ struct rq {
317 /* For active balancing */ 330 /* For active balancing */
318 int active_balance; 331 int active_balance;
319 int push_cpu; 332 int push_cpu;
320 int cpu; /* cpu of this runqueue */ 333 /* cpu of this runqueue: */
334 int cpu;
321 335
322 struct task_struct *migration_thread; 336 struct task_struct *migration_thread;
323 struct list_head migration_queue; 337 struct list_head migration_queue;
@@ -328,22 +342,22 @@ struct rq {
328 struct sched_info rq_sched_info; 342 struct sched_info rq_sched_info;
329 343
330 /* sys_sched_yield() stats */ 344 /* sys_sched_yield() stats */
331 unsigned long yld_exp_empty; 345 unsigned int yld_exp_empty;
332 unsigned long yld_act_empty; 346 unsigned int yld_act_empty;
333 unsigned long yld_both_empty; 347 unsigned int yld_both_empty;
334 unsigned long yld_count; 348 unsigned int yld_count;
335 349
336 /* schedule() stats */ 350 /* schedule() stats */
337 unsigned long sched_switch; 351 unsigned int sched_switch;
338 unsigned long sched_count; 352 unsigned int sched_count;
339 unsigned long sched_goidle; 353 unsigned int sched_goidle;
340 354
341 /* try_to_wake_up() stats */ 355 /* try_to_wake_up() stats */
342 unsigned long ttwu_count; 356 unsigned int ttwu_count;
343 unsigned long ttwu_local; 357 unsigned int ttwu_local;
344 358
345 /* BKL stats */ 359 /* BKL stats */
346 unsigned long bkl_count; 360 unsigned int bkl_count;
347#endif 361#endif
348 struct lock_class_key rq_lock_key; 362 struct lock_class_key rq_lock_key;
349}; 363};
@@ -449,12 +463,12 @@ enum {
449}; 463};
450 464
451const_debug unsigned int sysctl_sched_features = 465const_debug unsigned int sysctl_sched_features =
452 SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | 466 SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
453 SCHED_FEAT_START_DEBIT *1 | 467 SCHED_FEAT_START_DEBIT * 1 |
454 SCHED_FEAT_TREE_AVG *0 | 468 SCHED_FEAT_TREE_AVG * 0 |
455 SCHED_FEAT_APPROX_AVG *0 | 469 SCHED_FEAT_APPROX_AVG * 0 |
456 SCHED_FEAT_WAKEUP_PREEMPT *1 | 470 SCHED_FEAT_WAKEUP_PREEMPT * 1 |
457 SCHED_FEAT_PREEMPT_RESTRICT *1; 471 SCHED_FEAT_PREEMPT_RESTRICT * 1;
458 472
459#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) 473#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
460 474
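The sched_feat() macro above gates scheduler behaviour on bits in sysctl_sched_features; each SCHED_FEAT_* flag is multiplied by 0 or 1 in the initializer to enable or disable it. A minimal, hedged illustration of how such a test reads at a call site (the surrounding wakeup code and resched_task() call are assumptions, not shown in this hunk):

	/* illustrative only: preempt on wakeup when the feature bit is set */
	if (sched_feat(WAKEUP_PREEMPT))
		resched_task(rq->curr);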
@@ -1871,7 +1885,7 @@ asmlinkage void schedule_tail(struct task_struct *prev)
1871 preempt_enable(); 1885 preempt_enable();
1872#endif 1886#endif
1873 if (current->set_child_tid) 1887 if (current->set_child_tid)
1874 put_user(current->pid, current->set_child_tid); 1888 put_user(task_pid_vnr(current), current->set_child_tid);
1875} 1889}
1876 1890
1877/* 1891/*
@@ -3296,16 +3310,19 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3296/* 3310/*
3297 * Account user cpu time to a process. 3311 * Account user cpu time to a process.
3298 * @p: the process that the cpu time gets accounted to 3312 * @p: the process that the cpu time gets accounted to
3299 * @hardirq_offset: the offset to subtract from hardirq_count()
3300 * @cputime: the cpu time spent in user space since the last update 3313 * @cputime: the cpu time spent in user space since the last update
3301 */ 3314 */
3302void account_user_time(struct task_struct *p, cputime_t cputime) 3315void account_user_time(struct task_struct *p, cputime_t cputime)
3303{ 3316{
3304 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3317 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3305 cputime64_t tmp; 3318 cputime64_t tmp;
3319 struct rq *rq = this_rq();
3306 3320
3307 p->utime = cputime_add(p->utime, cputime); 3321 p->utime = cputime_add(p->utime, cputime);
3308 3322
3323 if (p != rq->idle)
3324 cpuacct_charge(p, cputime);
3325
3309 /* Add user time to cpustat. */ 3326 /* Add user time to cpustat. */
3310 tmp = cputime_to_cputime64(cputime); 3327 tmp = cputime_to_cputime64(cputime);
3311 if (TASK_NICE(p) > 0) 3328 if (TASK_NICE(p) > 0)
@@ -3334,6 +3351,16 @@ void account_guest_time(struct task_struct *p, cputime_t cputime)
3334} 3351}
3335 3352
3336/* 3353/*
3354 * Account scaled user cpu time to a process.
3355 * @p: the process that the cpu time gets accounted to
3356 * @cputime: the cpu time spent in user space since the last update
3357 */
3358void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
3359{
3360 p->utimescaled = cputime_add(p->utimescaled, cputime);
3361}
3362
3363/*
3337 * Account system cpu time to a process. 3364 * Account system cpu time to a process.
3338 * @p: the process that the cpu time gets accounted to 3365 * @p: the process that the cpu time gets accounted to
3339 * @hardirq_offset: the offset to subtract from hardirq_count() 3366 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3360,9 +3387,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3360 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3387 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3361 else if (softirq_count()) 3388 else if (softirq_count())
3362 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3389 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3363 else if (p != rq->idle) 3390 else if (p != rq->idle) {
3364 cpustat->system = cputime64_add(cpustat->system, tmp); 3391 cpustat->system = cputime64_add(cpustat->system, tmp);
3365 else if (atomic_read(&rq->nr_iowait) > 0) 3392 cpuacct_charge(p, cputime);
3393 } else if (atomic_read(&rq->nr_iowait) > 0)
3366 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3394 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3367 else 3395 else
3368 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3396 cpustat->idle = cputime64_add(cpustat->idle, tmp);
@@ -3371,6 +3399,17 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3371} 3399}
3372 3400
3373/* 3401/*
3402 * Account scaled system cpu time to a process.
3403 * @p: the process that the cpu time gets accounted to
3405 * @cputime: the cpu time spent in kernel space since the last update
3406 */
3407void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
3408{
3409 p->stimescaled = cputime_add(p->stimescaled, cputime);
3410}
3411
3412/*
3374 * Account for involuntary wait time. 3413 * Account for involuntary wait time.
3375 * @p: the process from which the cpu time has been stolen 3414 * @p: the process from which the cpu time has been stolen
3376 * @steal: the cpu time spent in involuntary wait 3415 * @steal: the cpu time spent in involuntary wait
@@ -3387,8 +3426,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3387 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3426 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3388 else 3427 else
3389 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3428 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3390 } else 3429 } else {
3391 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3430 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3431 cpuacct_charge(p, -tmp);
3432 }
3392} 3433}
3393 3434
3394/* 3435/*
@@ -3468,7 +3509,7 @@ EXPORT_SYMBOL(sub_preempt_count);
3468static noinline void __schedule_bug(struct task_struct *prev) 3509static noinline void __schedule_bug(struct task_struct *prev)
3469{ 3510{
3470 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", 3511 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3471 prev->comm, preempt_count(), prev->pid); 3512 prev->comm, preempt_count(), task_pid_nr(prev));
3472 debug_show_held_locks(prev); 3513 debug_show_held_locks(prev);
3473 if (irqs_disabled()) 3514 if (irqs_disabled())
3474 print_irqtrace_events(prev); 3515 print_irqtrace_events(prev);
@@ -3859,7 +3900,10 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
3859 3900
3860int __sched wait_for_completion_interruptible(struct completion *x) 3901int __sched wait_for_completion_interruptible(struct completion *x)
3861{ 3902{
3862 return wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 3903 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3904 if (t == -ERESTARTSYS)
3905 return t;
3906 return 0;
3863} 3907}
3864EXPORT_SYMBOL(wait_for_completion_interruptible); 3908EXPORT_SYMBOL(wait_for_completion_interruptible);
3865 3909
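With the change above, wait_for_completion_interruptible() returns only 0 on success or -ERESTARTSYS when the wait is interrupted, rather than leaking the internal timeout value to callers. A hedged caller sketch (the my_dev structure and function name are hypothetical):

#include <linux/completion.h>

struct my_dev {
	struct completion ready;
};

static int my_dev_wait_ready(struct my_dev *dev)
{
	int ret;

	/* 0 on completion, -ERESTARTSYS if a signal interrupted the wait */
	ret = wait_for_completion_interruptible(&dev->ready);
	if (ret)
		return ret;	/* propagate so the syscall can be restarted */

	/* device signalled completion; continue */
	return 0;
}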
@@ -4131,7 +4175,7 @@ struct task_struct *idle_task(int cpu)
4131 */ 4175 */
4132static struct task_struct *find_process_by_pid(pid_t pid) 4176static struct task_struct *find_process_by_pid(pid_t pid)
4133{ 4177{
4134 return pid ? find_task_by_pid(pid) : current; 4178 return pid ? find_task_by_vpid(pid) : current;
4135} 4179}
4136 4180
4137/* Actually do priority change: must hold rq lock. */ 4181/* Actually do priority change: must hold rq lock. */
@@ -4434,8 +4478,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4434 4478
4435 cpus_allowed = cpuset_cpus_allowed(p); 4479 cpus_allowed = cpuset_cpus_allowed(p);
4436 cpus_and(new_mask, new_mask, cpus_allowed); 4480 cpus_and(new_mask, new_mask, cpus_allowed);
4481 again:
4437 retval = set_cpus_allowed(p, new_mask); 4482 retval = set_cpus_allowed(p, new_mask);
4438 4483
4484 if (!retval) {
4485 cpus_allowed = cpuset_cpus_allowed(p);
4486 if (!cpus_subset(new_mask, cpus_allowed)) {
4487 /*
4488 * We must have raced with a concurrent cpuset
4489 * update. Just reset the cpus_allowed to the
4490 * cpuset's cpus_allowed
4491 */
4492 new_mask = cpus_allowed;
4493 goto again;
4494 }
4495 }
4439out_unlock: 4496out_unlock:
4440 put_task_struct(p); 4497 put_task_struct(p);
4441 mutex_unlock(&sched_hotcpu_mutex); 4498 mutex_unlock(&sched_hotcpu_mutex);
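The retry loop added to sched_setaffinity() above handles a cpuset update racing with set_cpus_allowed(): after a successful call it re-reads the cpuset's mask and starts over if the applied mask is no longer a subset of it. The same read-back-and-verify idea, shown from userspace via the sched_setaffinity(2)/sched_getaffinity(2) syscalls (illustrative only; the CPU numbers are arbitrary):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t want, got;

	CPU_ZERO(&want);
	CPU_SET(0, &want);
	CPU_SET(1, &want);		/* request CPUs 0 and 1 */

	if (sched_setaffinity(0, sizeof(want), &want) != 0)
		perror("sched_setaffinity");

	/* read back what was actually applied; cpusets may have narrowed it */
	if (sched_getaffinity(0, sizeof(got), &got) == 0)
		printf("cpu0=%d cpu1=%d\n",
		       CPU_ISSET(0, &got), CPU_ISSET(1, &got));
	return 0;
}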
@@ -4794,18 +4851,18 @@ static void show_task(struct task_struct *p)
4794 unsigned state; 4851 unsigned state;
4795 4852
4796 state = p->state ? __ffs(p->state) + 1 : 0; 4853 state = p->state ? __ffs(p->state) + 1 : 0;
4797 printk("%-13.13s %c", p->comm, 4854 printk(KERN_INFO "%-13.13s %c", p->comm,
4798 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4855 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4799#if BITS_PER_LONG == 32 4856#if BITS_PER_LONG == 32
4800 if (state == TASK_RUNNING) 4857 if (state == TASK_RUNNING)
4801 printk(" running "); 4858 printk(KERN_CONT " running ");
4802 else 4859 else
4803 printk(" %08lx ", thread_saved_pc(p)); 4860 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4804#else 4861#else
4805 if (state == TASK_RUNNING) 4862 if (state == TASK_RUNNING)
4806 printk(" running task "); 4863 printk(KERN_CONT " running task ");
4807 else 4864 else
4808 printk(" %016lx ", thread_saved_pc(p)); 4865 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4809#endif 4866#endif
4810#ifdef CONFIG_DEBUG_STACK_USAGE 4867#ifdef CONFIG_DEBUG_STACK_USAGE
4811 { 4868 {
@@ -4815,7 +4872,8 @@ static void show_task(struct task_struct *p)
4815 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4872 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4816 } 4873 }
4817#endif 4874#endif
4818 printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid); 4875 printk(KERN_CONT "%5lu %5d %6d\n", free,
4876 task_pid_nr(p), task_pid_nr(p->parent));
4819 4877
4820 if (state != TASK_RUNNING) 4878 if (state != TASK_RUNNING)
4821 show_stack(p, NULL); 4879 show_stack(p, NULL);
@@ -5087,7 +5145,7 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5087} 5145}
5088 5146
5089/* 5147/*
5090 * Figure out where task on dead CPU should go, use force if neccessary. 5148 * Figure out where task on dead CPU should go, use force if necessary.
5091 * NOTE: interrupts should be disabled by the caller 5149 * NOTE: interrupts should be disabled by the caller
5092 */ 5150 */
5093static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5151static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
@@ -5109,8 +5167,16 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5109 5167
5110 /* No more Mr. Nice Guy. */ 5168 /* No more Mr. Nice Guy. */
5111 if (dest_cpu == NR_CPUS) { 5169 if (dest_cpu == NR_CPUS) {
5170 cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p);
5171 /*
5172 * Try to stay on the same cpuset, where the
5173 * current cpuset may be a subset of all cpus.
5174 * The cpuset_cpus_allowed_locked() variant of
5175 * cpuset_cpus_allowed() will not block. It must be
5176 * called within calls to cpuset_lock/cpuset_unlock.
5177 */
5112 rq = task_rq_lock(p, &flags); 5178 rq = task_rq_lock(p, &flags);
5113 cpus_setall(p->cpus_allowed); 5179 p->cpus_allowed = cpus_allowed;
5114 dest_cpu = any_online_cpu(p->cpus_allowed); 5180 dest_cpu = any_online_cpu(p->cpus_allowed);
5115 task_rq_unlock(rq, &flags); 5181 task_rq_unlock(rq, &flags);
5116 5182
@@ -5122,7 +5188,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5122 if (p->mm && printk_ratelimit()) 5188 if (p->mm && printk_ratelimit())
5123 printk(KERN_INFO "process %d (%s) no " 5189 printk(KERN_INFO "process %d (%s) no "
5124 "longer affine to cpu%d\n", 5190 "longer affine to cpu%d\n",
5125 p->pid, p->comm, dead_cpu); 5191 task_pid_nr(p), p->comm, dead_cpu);
5126 } 5192 }
5127 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 5193 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
5128} 5194}
@@ -5229,7 +5295,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5229 struct rq *rq = cpu_rq(dead_cpu); 5295 struct rq *rq = cpu_rq(dead_cpu);
5230 5296
5231 /* Must be exiting, otherwise would be on tasklist. */ 5297 /* Must be exiting, otherwise would be on tasklist. */
5232 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); 5298 BUG_ON(!p->exit_state);
5233 5299
5234 /* Cannot have done final schedule yet: would have vanished. */ 5300 /* Cannot have done final schedule yet: would have vanished. */
5235 BUG_ON(p->state == TASK_DEAD); 5301 BUG_ON(p->state == TASK_DEAD);
@@ -5364,7 +5430,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5364 return table; 5430 return table;
5365} 5431}
5366 5432
5367static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5433static ctl_table * sd_alloc_ctl_cpu_table(int cpu)
5368{ 5434{
5369 struct ctl_table *entry, *table; 5435 struct ctl_table *entry, *table;
5370 struct sched_domain *sd; 5436 struct sched_domain *sd;
@@ -5458,7 +5524,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5458 5524
5459 case CPU_ONLINE: 5525 case CPU_ONLINE:
5460 case CPU_ONLINE_FROZEN: 5526 case CPU_ONLINE_FROZEN:
5461 /* Strictly unneccessary, as first user will wake it. */ 5527 /* Strictly unnecessary, as first user will wake it. */
5462 wake_up_process(cpu_rq(cpu)->migration_thread); 5528 wake_up_process(cpu_rq(cpu)->migration_thread);
5463 break; 5529 break;
5464 5530
@@ -5476,6 +5542,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5476 5542
5477 case CPU_DEAD: 5543 case CPU_DEAD:
5478 case CPU_DEAD_FROZEN: 5544 case CPU_DEAD_FROZEN:
5545 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5479 migrate_live_tasks(cpu); 5546 migrate_live_tasks(cpu);
5480 rq = cpu_rq(cpu); 5547 rq = cpu_rq(cpu);
5481 kthread_stop(rq->migration_thread); 5548 kthread_stop(rq->migration_thread);
@@ -5489,6 +5556,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5489 rq->idle->sched_class = &idle_sched_class; 5556 rq->idle->sched_class = &idle_sched_class;
5490 migrate_dead_tasks(cpu); 5557 migrate_dead_tasks(cpu);
5491 spin_unlock_irq(&rq->lock); 5558 spin_unlock_irq(&rq->lock);
5559 cpuset_unlock();
5492 migrate_nr_uninterruptible(rq); 5560 migrate_nr_uninterruptible(rq);
5493 BUG_ON(rq->nr_running != 0); 5561 BUG_ON(rq->nr_running != 0);
5494 5562
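The comment added in move_task_off_dead_cpu() and the cpuset_lock()/cpuset_unlock() pair added around the CPU_DEAD case together spell out a locking contract: cpuset_cpus_allowed_locked() never blocks, but it may only be called between cpuset_lock() and cpuset_unlock(). A minimal sketch of that calling pattern (kernel-internal, not a complete handler):

	cpuset_lock();		/* hold across all _locked() cpuset queries */
	/* ... for each task p still affine to the dead CPU ... */
	p->cpus_allowed = cpuset_cpus_allowed_locked(p);	/* never blocks */
	/* ... pick any_online_cpu(p->cpus_allowed) and migrate p there ... */
	cpuset_unlock();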
@@ -5598,20 +5666,20 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5598 } 5666 }
5599 5667
5600 if (!group->__cpu_power) { 5668 if (!group->__cpu_power) {
5601 printk("\n"); 5669 printk(KERN_CONT "\n");
5602 printk(KERN_ERR "ERROR: domain->cpu_power not " 5670 printk(KERN_ERR "ERROR: domain->cpu_power not "
5603 "set\n"); 5671 "set\n");
5604 break; 5672 break;
5605 } 5673 }
5606 5674
5607 if (!cpus_weight(group->cpumask)) { 5675 if (!cpus_weight(group->cpumask)) {
5608 printk("\n"); 5676 printk(KERN_CONT "\n");
5609 printk(KERN_ERR "ERROR: empty group\n"); 5677 printk(KERN_ERR "ERROR: empty group\n");
5610 break; 5678 break;
5611 } 5679 }
5612 5680
5613 if (cpus_intersects(groupmask, group->cpumask)) { 5681 if (cpus_intersects(groupmask, group->cpumask)) {
5614 printk("\n"); 5682 printk(KERN_CONT "\n");
5615 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5683 printk(KERN_ERR "ERROR: repeated CPUs\n");
5616 break; 5684 break;
5617 } 5685 }
@@ -5619,11 +5687,11 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5619 cpus_or(groupmask, groupmask, group->cpumask); 5687 cpus_or(groupmask, groupmask, group->cpumask);
5620 5688
5621 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 5689 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5622 printk(" %s", str); 5690 printk(KERN_CONT " %s", str);
5623 5691
5624 group = group->next; 5692 group = group->next;
5625 } while (group != sd->groups); 5693 } while (group != sd->groups);
5626 printk("\n"); 5694 printk(KERN_CONT "\n");
5627 5695
5628 if (!cpus_equal(sd->span, groupmask)) 5696 if (!cpus_equal(sd->span, groupmask))
5629 printk(KERN_ERR "ERROR: groups don't span " 5697 printk(KERN_ERR "ERROR: groups don't span "
@@ -6339,26 +6407,31 @@ error:
6339 return -ENOMEM; 6407 return -ENOMEM;
6340#endif 6408#endif
6341} 6409}
6410
6411static cpumask_t *doms_cur; /* current sched domains */
6412static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6413
6414/*
6415 * Special case: If a kmalloc of a doms_cur partition (array of
6416 * cpumask_t) fails, then fallback to a single sched domain,
6417 * as determined by the single cpumask_t fallback_doms.
6418 */
6419static cpumask_t fallback_doms;
6420
6342/* 6421/*
6343 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6422 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6423 * For now this just excludes isolated cpus, but could be used to
6424 * exclude other special cases in the future.
6344 */ 6425 */
6345static int arch_init_sched_domains(const cpumask_t *cpu_map) 6426static int arch_init_sched_domains(const cpumask_t *cpu_map)
6346{ 6427{
6347 cpumask_t cpu_default_map; 6428 ndoms_cur = 1;
6348 int err; 6429 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6349 6430 if (!doms_cur)
6350 /* 6431 doms_cur = &fallback_doms;
6351 * Setup mask for cpus without special case scheduling requirements. 6432 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
6352 * For now this just excludes isolated cpus, but could be used to
6353 * exclude other special cases in the future.
6354 */
6355 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6356
6357 err = build_sched_domains(&cpu_default_map);
6358
6359 register_sched_domain_sysctl(); 6433 register_sched_domain_sysctl();
6360 6434 return build_sched_domains(doms_cur);
6361 return err;
6362} 6435}
6363 6436
6364static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6437static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
@@ -6382,6 +6455,68 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6382 arch_destroy_sched_domains(cpu_map); 6455 arch_destroy_sched_domains(cpu_map);
6383} 6456}
6384 6457
6458/*
6459 * Partition sched domains as specified by the 'ndoms_new'
6460 * cpumasks in the array doms_new[] of cpumasks. This compares
6461 * doms_new[] to the current sched domain partitioning, doms_cur[].
6462 * It destroys each deleted domain and builds each new domain.
6463 *
6464 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
6465 * The masks must not intersect (overlap). We should set up one
6466 * sched domain for each mask. CPUs not in any of the cpumasks will
6467 * not be load balanced. If the same cpumask appears both in the
6468 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6469 * it as it is.
6470 *
6471 * The passed in 'doms_new' should be kmalloc'd. This routine takes
6472 * ownership of it and will kfree it when done with it. If the caller
6473 * failed the kmalloc call, then it can pass in doms_new == NULL,
6474 * and partition_sched_domains() will fall back to the single partition
6475 * 'fallback_doms'.
6476 *
6477 * Call with hotplug lock held
6478 */
6479void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6480{
6481 int i, j;
6482
6483 if (doms_new == NULL) {
6484 ndoms_new = 1;
6485 doms_new = &fallback_doms;
6486 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
6487 }
6488
6489 /* Destroy deleted domains */
6490 for (i = 0; i < ndoms_cur; i++) {
6491 for (j = 0; j < ndoms_new; j++) {
6492 if (cpus_equal(doms_cur[i], doms_new[j]))
6493 goto match1;
6494 }
6495 /* no match - a current sched domain not in new doms_new[] */
6496 detach_destroy_domains(doms_cur + i);
6497match1:
6498 ;
6499 }
6500
6501 /* Build new domains */
6502 for (i = 0; i < ndoms_new; i++) {
6503 for (j = 0; j < ndoms_cur; j++) {
6504 if (cpus_equal(doms_new[i], doms_cur[j]))
6505 goto match2;
6506 }
6507 /* no match - add a new doms_new */
6508 build_sched_domains(doms_new + i);
6509match2:
6510 ;
6511 }
6512
6513 /* Remember the new sched domains */
6514 if (doms_cur != &fallback_doms)
6515 kfree(doms_cur);
6516 doms_cur = doms_new;
6517 ndoms_cur = ndoms_new;
6518}
6519
6385#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6520#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6386static int arch_reinit_sched_domains(void) 6521static int arch_reinit_sched_domains(void)
6387{ 6522{
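partition_sched_domains(), added above, takes ownership of a kmalloc'ed array of non-overlapping cpumasks and rebuilds only the domains that changed; passing NULL falls back to the single fallback_doms partition. A hedged caller sketch under those rules (the two-way split is an assumption, the usual slab/cpumask headers are assumed, and the caller must hold the hotplug lock as the comment above requires):

static int rebuild_two_partitions(cpumask_t half_a, cpumask_t half_b)
{
	cpumask_t *doms = kmalloc(2 * sizeof(cpumask_t), GFP_KERNEL);

	if (!doms) {
		/* NULL => fall back to one domain spanning the online CPUs */
		partition_sched_domains(1, NULL);
		return -ENOMEM;
	}
	doms[0] = half_a;
	doms[1] = half_b;	/* the two masks must not intersect */

	/* ownership of 'doms' passes to the scheduler; do not kfree() it */
	partition_sched_domains(2, doms);
	return 0;
}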
@@ -6963,3 +7098,116 @@ unsigned long sched_group_shares(struct task_group *tg)
6963} 7098}
6964 7099
6965#endif /* CONFIG_FAIR_GROUP_SCHED */ 7100#endif /* CONFIG_FAIR_GROUP_SCHED */
7101
7102#ifdef CONFIG_FAIR_CGROUP_SCHED
7103
7104/* return corresponding task_group object of a cgroup */
7105static inline struct task_group *cgroup_tg(struct cgroup *cont)
7106{
7107 return container_of(cgroup_subsys_state(cont, cpu_cgroup_subsys_id),
7108 struct task_group, css);
7109}
7110
7111static struct cgroup_subsys_state *
7112cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
7113{
7114 struct task_group *tg;
7115
7116 if (!cont->parent) {
7117 /* This is early initialization for the top cgroup */
7118 init_task_group.css.cgroup = cont;
7119 return &init_task_group.css;
7120 }
7121
7122 /* we support only 1-level deep hierarchical scheduler atm */
7123 if (cont->parent->parent)
7124 return ERR_PTR(-EINVAL);
7125
7126 tg = sched_create_group();
7127 if (IS_ERR(tg))
7128 return ERR_PTR(-ENOMEM);
7129
7130 /* Bind the cgroup to task_group object we just created */
7131 tg->css.cgroup = cont;
7132
7133 return &tg->css;
7134}
7135
7136static void cpu_cgroup_destroy(struct cgroup_subsys *ss,
7137 struct cgroup *cont)
7138{
7139 struct task_group *tg = cgroup_tg(cont);
7140
7141 sched_destroy_group(tg);
7142}
7143
7144static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
7145 struct cgroup *cont, struct task_struct *tsk)
7146{
7147 /* We don't support RT-tasks being in separate groups */
7148 if (tsk->sched_class != &fair_sched_class)
7149 return -EINVAL;
7150
7151 return 0;
7152}
7153
7154static void
7155cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cont,
7156 struct cgroup *old_cont, struct task_struct *tsk)
7157{
7158 sched_move_task(tsk);
7159}
7160
7161static ssize_t cpu_shares_write(struct cgroup *cont, struct cftype *cftype,
7162 struct file *file, const char __user *userbuf,
7163 size_t nbytes, loff_t *ppos)
7164{
7165 unsigned long shareval;
7166 struct task_group *tg = cgroup_tg(cont);
7167 char buffer[2*sizeof(unsigned long) + 1];
7168 int rc;
7169
7170 if (nbytes > 2*sizeof(unsigned long)) /* safety check */
7171 return -E2BIG;
7172
7173 if (copy_from_user(buffer, userbuf, nbytes))
7174 return -EFAULT;
7175
7176 buffer[nbytes] = 0; /* nul-terminate */
7177 shareval = simple_strtoul(buffer, NULL, 10);
7178
7179 rc = sched_group_set_shares(tg, shareval);
7180
7181 return (rc < 0 ? rc : nbytes);
7182}
7183
7184static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft)
7185{
7186 struct task_group *tg = cgroup_tg(cont);
7187
7188 return (u64) tg->shares;
7189}
7190
7191static struct cftype cpu_shares = {
7192 .name = "shares",
7193 .read_uint = cpu_shares_read_uint,
7194 .write = cpu_shares_write,
7195};
7196
7197static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7198{
7199 return cgroup_add_file(cont, ss, &cpu_shares);
7200}
7201
7202struct cgroup_subsys cpu_cgroup_subsys = {
7203 .name = "cpu",
7204 .create = cpu_cgroup_create,
7205 .destroy = cpu_cgroup_destroy,
7206 .can_attach = cpu_cgroup_can_attach,
7207 .attach = cpu_cgroup_attach,
7208 .populate = cpu_cgroup_populate,
7209 .subsys_id = cpu_cgroup_subsys_id,
7210 .early_init = 1,
7211};
7212
7213#endif /* CONFIG_FAIR_CGROUP_SCHED */
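The cpu cgroup subsystem above exposes a single per-group file built from the cftype named "shares"; its write handler parses a decimal value with simple_strtoul() and hands it to sched_group_set_shares(). A hedged userspace illustration of driving that interface (the mount point and group path are assumptions; with the subsystem-name prefix the file typically appears as cpu.shares):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical cgroup mount and group */
	const char *path = "/dev/cpuctl/mygroup/cpu.shares";
	const char *val = "2048\n";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) != (ssize_t)strlen(val))
		perror("write");
	close(fd);
	return 0;
}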
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index a5e517ec07c3..e6fb392e5164 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -137,7 +137,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
137 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 137 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
138 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 138 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
139#ifdef CONFIG_SCHEDSTATS 139#ifdef CONFIG_SCHEDSTATS
140 SEQ_printf(m, " .%-30s: %ld\n", "bkl_count", 140 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
141 rq->bkl_count); 141 rq->bkl_count);
142#endif 142#endif
143 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", 143 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 1c084842c3e7..ef1a7df80ea2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -21,7 +21,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
21 21
22 /* runqueue-specific stats */ 22 /* runqueue-specific stats */
23 seq_printf(seq, 23 seq_printf(seq,
24 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", 24 "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu",
25 cpu, rq->yld_both_empty, 25 cpu, rq->yld_both_empty,
26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, 26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
27 rq->sched_switch, rq->sched_count, rq->sched_goidle, 27 rq->sched_switch, rq->sched_count, rq->sched_goidle,
@@ -42,8 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
42 seq_printf(seq, "domain%d %s", dcount++, mask_str); 42 seq_printf(seq, "domain%d %s", dcount++, mask_str);
43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
44 itype++) { 44 itype++) {
45 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " 45 seq_printf(seq, " %u %u %u %u %u %u %u %u",
46 "%lu",
47 sd->lb_count[itype], 46 sd->lb_count[itype],
48 sd->lb_balanced[itype], 47 sd->lb_balanced[itype],
49 sd->lb_failed[itype], 48 sd->lb_failed[itype],
@@ -53,8 +52,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
53 sd->lb_nobusyq[itype], 52 sd->lb_nobusyq[itype],
54 sd->lb_nobusyg[itype]); 53 sd->lb_nobusyg[itype]);
55 } 54 }
56 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" 55 seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u %u\n",
57 " %lu %lu %lu\n",
58 sd->alb_count, sd->alb_failed, sd->alb_pushed, 56 sd->alb_count, sd->alb_failed, sd->alb_pushed,
59 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, 57 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
60 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, 58 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
diff --git a/kernel/signal.c b/kernel/signal.c
index 2124ffadcfde..12006308c7eb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -99,7 +99,6 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
99static int recalc_sigpending_tsk(struct task_struct *t) 99static int recalc_sigpending_tsk(struct task_struct *t)
100{ 100{
101 if (t->signal->group_stop_count > 0 || 101 if (t->signal->group_stop_count > 0 ||
102 (freezing(t)) ||
103 PENDING(&t->pending, &t->blocked) || 102 PENDING(&t->pending, &t->blocked) ||
104 PENDING(&t->signal->shared_pending, &t->blocked)) { 103 PENDING(&t->signal->shared_pending, &t->blocked)) {
105 set_tsk_thread_flag(t, TIF_SIGPENDING); 104 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -257,7 +256,7 @@ flush_signal_handlers(struct task_struct *t, int force_default)
257 256
258int unhandled_signal(struct task_struct *tsk, int sig) 257int unhandled_signal(struct task_struct *tsk, int sig)
259{ 258{
260 if (is_init(tsk)) 259 if (is_global_init(tsk))
261 return 1; 260 return 1;
262 if (tsk->ptrace & PT_PTRACED) 261 if (tsk->ptrace & PT_PTRACED)
263 return 0; 262 return 0;
@@ -537,7 +536,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
537 return error; 536 return error;
538 error = -EPERM; 537 error = -EPERM;
539 if (((sig != SIGCONT) || 538 if (((sig != SIGCONT) ||
540 (process_session(current) != process_session(t))) 539 (task_session_nr(current) != task_session_nr(t)))
541 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 540 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
542 && (current->uid ^ t->suid) && (current->uid ^ t->uid) 541 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
543 && !capable(CAP_KILL)) 542 && !capable(CAP_KILL))
@@ -695,7 +694,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
695 q->info.si_signo = sig; 694 q->info.si_signo = sig;
696 q->info.si_errno = 0; 695 q->info.si_errno = 0;
697 q->info.si_code = SI_USER; 696 q->info.si_code = SI_USER;
698 q->info.si_pid = current->pid; 697 q->info.si_pid = task_pid_vnr(current);
699 q->info.si_uid = current->uid; 698 q->info.si_uid = current->uid;
700 break; 699 break;
701 case (unsigned long) SEND_SIG_PRIV: 700 case (unsigned long) SEND_SIG_PRIV:
@@ -731,7 +730,7 @@ int print_fatal_signals;
731static void print_fatal_signal(struct pt_regs *regs, int signr) 730static void print_fatal_signal(struct pt_regs *regs, int signr)
732{ 731{
733 printk("%s/%d: potentially unexpected fatal signal %d.\n", 732 printk("%s/%d: potentially unexpected fatal signal %d.\n",
734 current->comm, current->pid, signr); 733 current->comm, task_pid_nr(current), signr);
735 734
736#ifdef __i386__ 735#ifdef __i386__
737 printk("code at %08lx: ", regs->eip); 736 printk("code at %08lx: ", regs->eip);
@@ -1090,7 +1089,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1090{ 1089{
1091 int error; 1090 int error;
1092 rcu_read_lock(); 1091 rcu_read_lock();
1093 error = kill_pid_info(sig, info, find_pid(pid)); 1092 error = kill_pid_info(sig, info, find_vpid(pid));
1094 rcu_read_unlock(); 1093 rcu_read_unlock();
1095 return error; 1094 return error;
1096} 1095}
@@ -1151,7 +1150,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1151 1150
1152 read_lock(&tasklist_lock); 1151 read_lock(&tasklist_lock);
1153 for_each_process(p) { 1152 for_each_process(p) {
1154 if (p->pid > 1 && p->tgid != current->tgid) { 1153 if (p->pid > 1 && !same_thread_group(p, current)) {
1155 int err = group_send_sig_info(sig, info, p); 1154 int err = group_send_sig_info(sig, info, p);
1156 ++count; 1155 ++count;
1157 if (err != -EPERM) 1156 if (err != -EPERM)
@@ -1161,9 +1160,9 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1161 read_unlock(&tasklist_lock); 1160 read_unlock(&tasklist_lock);
1162 ret = count ? retval : -ESRCH; 1161 ret = count ? retval : -ESRCH;
1163 } else if (pid < 0) { 1162 } else if (pid < 0) {
1164 ret = kill_pgrp_info(sig, info, find_pid(-pid)); 1163 ret = kill_pgrp_info(sig, info, find_vpid(-pid));
1165 } else { 1164 } else {
1166 ret = kill_pid_info(sig, info, find_pid(pid)); 1165 ret = kill_pid_info(sig, info, find_vpid(pid));
1167 } 1166 }
1168 rcu_read_unlock(); 1167 rcu_read_unlock();
1169 return ret; 1168 return ret;
@@ -1267,7 +1266,12 @@ EXPORT_SYMBOL(kill_pid);
1267int 1266int
1268kill_proc(pid_t pid, int sig, int priv) 1267kill_proc(pid_t pid, int sig, int priv)
1269{ 1268{
1270 return kill_proc_info(sig, __si_special(priv), pid); 1269 int ret;
1270
1271 rcu_read_lock();
1272 ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
1273 rcu_read_unlock();
1274 return ret;
1271} 1275}
1272 1276
1273/* 1277/*
@@ -1444,7 +1448,22 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1444 1448
1445 info.si_signo = sig; 1449 info.si_signo = sig;
1446 info.si_errno = 0; 1450 info.si_errno = 0;
1447 info.si_pid = tsk->pid; 1451 /*
1452 * we are under tasklist_lock here so our parent is tied to
1453 * us and cannot exit and release its namespace.
1454 *
1455 * the only thing it can do is switch its nsproxy with sys_unshare,
1456 * but unsharing pid namespaces is not allowed, so we'll always
1457 * see the relevant namespace.
1458 *
1459 * write_lock() currently calls preempt_disable() which is the
1460 * same as rcu_read_lock(), but according to Oleg it is not
1461 * correct to rely on this.
1462 */
1463 rcu_read_lock();
1464 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1465 rcu_read_unlock();
1466
1448 info.si_uid = tsk->uid; 1467 info.si_uid = tsk->uid;
1449 1468
1450 /* FIXME: find out whether or not this is supposed to be c*time. */ 1469 /* FIXME: find out whether or not this is supposed to be c*time. */
@@ -1509,7 +1528,13 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1509 1528
1510 info.si_signo = SIGCHLD; 1529 info.si_signo = SIGCHLD;
1511 info.si_errno = 0; 1530 info.si_errno = 0;
1512 info.si_pid = tsk->pid; 1531 /*
1532 * see comment in do_notify_parent() about the following 3 lines
1533 */
1534 rcu_read_lock();
1535 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1536 rcu_read_unlock();
1537
1513 info.si_uid = tsk->uid; 1538 info.si_uid = tsk->uid;
1514 1539
1515 /* FIXME: find out whether or not this is supposed to be c*time. */ 1540 /* FIXME: find out whether or not this is supposed to be c*time. */
@@ -1635,7 +1660,7 @@ void ptrace_notify(int exit_code)
1635 memset(&info, 0, sizeof info); 1660 memset(&info, 0, sizeof info);
1636 info.si_signo = SIGTRAP; 1661 info.si_signo = SIGTRAP;
1637 info.si_code = exit_code; 1662 info.si_code = exit_code;
1638 info.si_pid = current->pid; 1663 info.si_pid = task_pid_vnr(current);
1639 info.si_uid = current->uid; 1664 info.si_uid = current->uid;
1640 1665
1641 /* Let the debugger run. */ 1666 /* Let the debugger run. */
@@ -1805,7 +1830,7 @@ relock:
1805 info->si_signo = signr; 1830 info->si_signo = signr;
1806 info->si_errno = 0; 1831 info->si_errno = 0;
1807 info->si_code = SI_USER; 1832 info->si_code = SI_USER;
1808 info->si_pid = current->parent->pid; 1833 info->si_pid = task_pid_vnr(current->parent);
1809 info->si_uid = current->parent->uid; 1834 info->si_uid = current->parent->uid;
1810 } 1835 }
1811 1836
@@ -1836,11 +1861,9 @@ relock:
1836 continue; 1861 continue;
1837 1862
1838 /* 1863 /*
1839 * Init of a pid space gets no signals it doesn't want from 1864 * Global init gets no signals it doesn't want.
1840 * within that pid space. It can of course get signals from
1841 * its parent pid space.
1842 */ 1865 */
1843 if (current == child_reaper(current)) 1866 if (is_global_init(current))
1844 continue; 1867 continue;
1845 1868
1846 if (sig_kernel_stop(signr)) { 1869 if (sig_kernel_stop(signr)) {
@@ -2194,7 +2217,7 @@ sys_kill(int pid, int sig)
2194 info.si_signo = sig; 2217 info.si_signo = sig;
2195 info.si_errno = 0; 2218 info.si_errno = 0;
2196 info.si_code = SI_USER; 2219 info.si_code = SI_USER;
2197 info.si_pid = current->tgid; 2220 info.si_pid = task_tgid_vnr(current);
2198 info.si_uid = current->uid; 2221 info.si_uid = current->uid;
2199 2222
2200 return kill_something_info(sig, &info, pid); 2223 return kill_something_info(sig, &info, pid);
@@ -2210,12 +2233,12 @@ static int do_tkill(int tgid, int pid, int sig)
2210 info.si_signo = sig; 2233 info.si_signo = sig;
2211 info.si_errno = 0; 2234 info.si_errno = 0;
2212 info.si_code = SI_TKILL; 2235 info.si_code = SI_TKILL;
2213 info.si_pid = current->tgid; 2236 info.si_pid = task_tgid_vnr(current);
2214 info.si_uid = current->uid; 2237 info.si_uid = current->uid;
2215 2238
2216 read_lock(&tasklist_lock); 2239 read_lock(&tasklist_lock);
2217 p = find_task_by_pid(pid); 2240 p = find_task_by_vpid(pid);
2218 if (p && (tgid <= 0 || p->tgid == tgid)) { 2241 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2219 error = check_kill_permission(sig, &info, p); 2242 error = check_kill_permission(sig, &info, p);
2220 /* 2243 /*
2221 * The null signal is a permissions and process existence 2244 * The null signal is a permissions and process existence
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index edeeef3a6a32..11df812263c8 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -113,7 +113,7 @@ void softlockup_tick(void)
113 spin_lock(&print_lock); 113 spin_lock(&print_lock);
114 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", 114 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
115 this_cpu, now - touch_timestamp, 115 this_cpu, now - touch_timestamp,
116 current->comm, current->pid); 116 current->comm, task_pid_nr(current));
117 if (regs) 117 if (regs)
118 show_regs(regs); 118 show_regs(regs);
119 else 119 else
diff --git a/kernel/sys.c b/kernel/sys.c
index 8ae2e636eb1b..304b5410d746 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -105,538 +105,6 @@ EXPORT_SYMBOL(cad_pid);
105 */ 105 */
106 106
107void (*pm_power_off_prepare)(void); 107void (*pm_power_off_prepare)(void);
108EXPORT_SYMBOL(pm_power_off_prepare);
109
110/*
111 * Notifier list for kernel code which wants to be called
112 * at shutdown. This is used to stop any idling DMA operations
113 * and the like.
114 */
115
116static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
117
118/*
119 * Notifier chain core routines. The exported routines below
120 * are layered on top of these, with appropriate locking added.
121 */
122
123static int notifier_chain_register(struct notifier_block **nl,
124 struct notifier_block *n)
125{
126 while ((*nl) != NULL) {
127 if (n->priority > (*nl)->priority)
128 break;
129 nl = &((*nl)->next);
130 }
131 n->next = *nl;
132 rcu_assign_pointer(*nl, n);
133 return 0;
134}
135
136static int notifier_chain_unregister(struct notifier_block **nl,
137 struct notifier_block *n)
138{
139 while ((*nl) != NULL) {
140 if ((*nl) == n) {
141 rcu_assign_pointer(*nl, n->next);
142 return 0;
143 }
144 nl = &((*nl)->next);
145 }
146 return -ENOENT;
147}
148
149/**
150 * notifier_call_chain - Informs the registered notifiers about an event.
151 * @nl: Pointer to head of the blocking notifier chain
152 * @val: Value passed unmodified to notifier function
153 * @v: Pointer passed unmodified to notifier function
154 * @nr_to_call: Number of notifier functions to be called. Don't care
155 * value of this parameter is -1.
156 * @nr_calls: Records the number of notifications sent. Don't care
157 * value of this field is NULL.
158 * @returns: notifier_call_chain returns the value returned by the
159 * last notifier function called.
160 */
161
162static int __kprobes notifier_call_chain(struct notifier_block **nl,
163 unsigned long val, void *v,
164 int nr_to_call, int *nr_calls)
165{
166 int ret = NOTIFY_DONE;
167 struct notifier_block *nb, *next_nb;
168
169 nb = rcu_dereference(*nl);
170
171 while (nb && nr_to_call) {
172 next_nb = rcu_dereference(nb->next);
173 ret = nb->notifier_call(nb, val, v);
174
175 if (nr_calls)
176 (*nr_calls)++;
177
178 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
179 break;
180 nb = next_nb;
181 nr_to_call--;
182 }
183 return ret;
184}
185
186/*
187 * Atomic notifier chain routines. Registration and unregistration
188 * use a spinlock, and call_chain is synchronized by RCU (no locks).
189 */
190
191/**
192 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
193 * @nh: Pointer to head of the atomic notifier chain
194 * @n: New entry in notifier chain
195 *
196 * Adds a notifier to an atomic notifier chain.
197 *
198 * Currently always returns zero.
199 */
200
201int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
202 struct notifier_block *n)
203{
204 unsigned long flags;
205 int ret;
206
207 spin_lock_irqsave(&nh->lock, flags);
208 ret = notifier_chain_register(&nh->head, n);
209 spin_unlock_irqrestore(&nh->lock, flags);
210 return ret;
211}
212
213EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
214
215/**
216 * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
217 * @nh: Pointer to head of the atomic notifier chain
218 * @n: Entry to remove from notifier chain
219 *
220 * Removes a notifier from an atomic notifier chain.
221 *
222 * Returns zero on success or %-ENOENT on failure.
223 */
224int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
225 struct notifier_block *n)
226{
227 unsigned long flags;
228 int ret;
229
230 spin_lock_irqsave(&nh->lock, flags);
231 ret = notifier_chain_unregister(&nh->head, n);
232 spin_unlock_irqrestore(&nh->lock, flags);
233 synchronize_rcu();
234 return ret;
235}
236
237EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
238
239/**
240 * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
241 * @nh: Pointer to head of the atomic notifier chain
242 * @val: Value passed unmodified to notifier function
243 * @v: Pointer passed unmodified to notifier function
244 * @nr_to_call: See the comment for notifier_call_chain.
245 * @nr_calls: See the comment for notifier_call_chain.
246 *
247 * Calls each function in a notifier chain in turn. The functions
248 * run in an atomic context, so they must not block.
249 * This routine uses RCU to synchronize with changes to the chain.
250 *
251 * If the return value of the notifier can be and'ed
252 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
253 * will return immediately, with the return value of
254 * the notifier function which halted execution.
255 * Otherwise the return value is the return value
256 * of the last notifier function called.
257 */
258
259int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
260 unsigned long val, void *v,
261 int nr_to_call, int *nr_calls)
262{
263 int ret;
264
265 rcu_read_lock();
266 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
267 rcu_read_unlock();
268 return ret;
269}
270
271EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
272
273int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
274 unsigned long val, void *v)
275{
276 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
277}
278
279EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
280/*
281 * Blocking notifier chain routines. All access to the chain is
282 * synchronized by an rwsem.
283 */
284
285/**
286 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
287 * @nh: Pointer to head of the blocking notifier chain
288 * @n: New entry in notifier chain
289 *
290 * Adds a notifier to a blocking notifier chain.
291 * Must be called in process context.
292 *
293 * Currently always returns zero.
294 */
295
296int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
297 struct notifier_block *n)
298{
299 int ret;
300
301 /*
302 * This code gets used during boot-up, when task switching is
303 * not yet working and interrupts must remain disabled. At
304 * such times we must not call down_write().
305 */
306 if (unlikely(system_state == SYSTEM_BOOTING))
307 return notifier_chain_register(&nh->head, n);
308
309 down_write(&nh->rwsem);
310 ret = notifier_chain_register(&nh->head, n);
311 up_write(&nh->rwsem);
312 return ret;
313}
314
315EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
316
317/**
318 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
319 * @nh: Pointer to head of the blocking notifier chain
320 * @n: Entry to remove from notifier chain
321 *
322 * Removes a notifier from a blocking notifier chain.
323 * Must be called from process context.
324 *
325 * Returns zero on success or %-ENOENT on failure.
326 */
327int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
328 struct notifier_block *n)
329{
330 int ret;
331
332 /*
333 * This code gets used during boot-up, when task switching is
334 * not yet working and interrupts must remain disabled. At
335 * such times we must not call down_write().
336 */
337 if (unlikely(system_state == SYSTEM_BOOTING))
338 return notifier_chain_unregister(&nh->head, n);
339
340 down_write(&nh->rwsem);
341 ret = notifier_chain_unregister(&nh->head, n);
342 up_write(&nh->rwsem);
343 return ret;
344}
345
346EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
347
348/**
349 * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
350 * @nh: Pointer to head of the blocking notifier chain
351 * @val: Value passed unmodified to notifier function
352 * @v: Pointer passed unmodified to notifier function
353 * @nr_to_call: See comment for notifier_call_chain.
354 * @nr_calls: See comment for notifier_call_chain.
355 *
356 * Calls each function in a notifier chain in turn. The functions
357 * run in a process context, so they are allowed to block.
358 *
359 * If the return value of the notifier can be and'ed
360 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
361 * will return immediately, with the return value of
362 * the notifier function which halted execution.
363 * Otherwise the return value is the return value
364 * of the last notifier function called.
365 */
366
367int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
368 unsigned long val, void *v,
369 int nr_to_call, int *nr_calls)
370{
371 int ret = NOTIFY_DONE;
372
373 /*
374 * We check the head outside the lock, but if this access is
375 * racy then it does not matter what the result of the test
376 * is, we re-check the list after having taken the lock anyway:
377 */
378 if (rcu_dereference(nh->head)) {
379 down_read(&nh->rwsem);
380 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
381 nr_calls);
382 up_read(&nh->rwsem);
383 }
384 return ret;
385}
386EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
387
388int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
389 unsigned long val, void *v)
390{
391 return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
392}
393EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
394
395/*
396 * Raw notifier chain routines. There is no protection;
397 * the caller must provide it. Use at your own risk!
398 */
399
400/**
401 * raw_notifier_chain_register - Add notifier to a raw notifier chain
402 * @nh: Pointer to head of the raw notifier chain
403 * @n: New entry in notifier chain
404 *
405 * Adds a notifier to a raw notifier chain.
406 * All locking must be provided by the caller.
407 *
408 * Currently always returns zero.
409 */
410
411int raw_notifier_chain_register(struct raw_notifier_head *nh,
412 struct notifier_block *n)
413{
414 return notifier_chain_register(&nh->head, n);
415}
416
417EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
418
419/**
420 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
421 * @nh: Pointer to head of the raw notifier chain
422 * @n: Entry to remove from notifier chain
423 *
424 * Removes a notifier from a raw notifier chain.
425 * All locking must be provided by the caller.
426 *
427 * Returns zero on success or %-ENOENT on failure.
428 */
429int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
430 struct notifier_block *n)
431{
432 return notifier_chain_unregister(&nh->head, n);
433}
434
435EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
436
437/**
438 * __raw_notifier_call_chain - Call functions in a raw notifier chain
439 * @nh: Pointer to head of the raw notifier chain
440 * @val: Value passed unmodified to notifier function
441 * @v: Pointer passed unmodified to notifier function
442 * @nr_to_call: See comment for notifier_call_chain.
443 * @nr_calls: See comment for notifier_call_chain
444 *
445 * Calls each function in a notifier chain in turn. The functions
446 * run in an undefined context.
447 * All locking must be provided by the caller.
448 *
449 * If the return value of the notifier can be and'ed
450 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
451 * will return immediately, with the return value of
452 * the notifier function which halted execution.
453 * Otherwise the return value is the return value
454 * of the last notifier function called.
455 */
456
457int __raw_notifier_call_chain(struct raw_notifier_head *nh,
458 unsigned long val, void *v,
459 int nr_to_call, int *nr_calls)
460{
461 return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
462}
463
464EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
465
466int raw_notifier_call_chain(struct raw_notifier_head *nh,
467 unsigned long val, void *v)
468{
469 return __raw_notifier_call_chain(nh, val, v, -1, NULL);
470}
471
472EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
473
474/*
475 * SRCU notifier chain routines. Registration and unregistration
476 * use a mutex, and call_chain is synchronized by SRCU (no locks).
477 */
478
479/**
480 * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
481 * @nh: Pointer to head of the SRCU notifier chain
482 * @n: New entry in notifier chain
483 *
484 * Adds a notifier to an SRCU notifier chain.
485 * Must be called in process context.
486 *
487 * Currently always returns zero.
488 */
489
490int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
491 struct notifier_block *n)
492{
493 int ret;
494
495 /*
496 * This code gets used during boot-up, when task switching is
497 * not yet working and interrupts must remain disabled. At
498 * such times we must not call mutex_lock().
499 */
500 if (unlikely(system_state == SYSTEM_BOOTING))
501 return notifier_chain_register(&nh->head, n);
502
503 mutex_lock(&nh->mutex);
504 ret = notifier_chain_register(&nh->head, n);
505 mutex_unlock(&nh->mutex);
506 return ret;
507}
508
509EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
510
511/**
512 * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
513 * @nh: Pointer to head of the SRCU notifier chain
514 * @n: Entry to remove from notifier chain
515 *
516 * Removes a notifier from an SRCU notifier chain.
517 * Must be called from process context.
518 *
519 * Returns zero on success or %-ENOENT on failure.
520 */
521int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
522 struct notifier_block *n)
523{
524 int ret;
525
526 /*
527 * This code gets used during boot-up, when task switching is
528 * not yet working and interrupts must remain disabled. At
529 * such times we must not call mutex_lock().
530 */
531 if (unlikely(system_state == SYSTEM_BOOTING))
532 return notifier_chain_unregister(&nh->head, n);
533
534 mutex_lock(&nh->mutex);
535 ret = notifier_chain_unregister(&nh->head, n);
536 mutex_unlock(&nh->mutex);
537 synchronize_srcu(&nh->srcu);
538 return ret;
539}
540
541EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
542
543/**
544 * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
545 * @nh: Pointer to head of the SRCU notifier chain
546 * @val: Value passed unmodified to notifier function
547 * @v: Pointer passed unmodified to notifier function
548 * @nr_to_call: See comment for notifier_call_chain.
549 * @nr_calls: See comment for notifier_call_chain
550 *
551 * Calls each function in a notifier chain in turn. The functions
552 * run in a process context, so they are allowed to block.
553 *
554 * If the return value of the notifier can be and'ed
555 * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain()
556 * will return immediately, with the return value of
557 * the notifier function which halted execution.
558 * Otherwise the return value is the return value
559 * of the last notifier function called.
560 */
561
562int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
563 unsigned long val, void *v,
564 int nr_to_call, int *nr_calls)
565{
566 int ret;
567 int idx;
568
569 idx = srcu_read_lock(&nh->srcu);
570 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
571 srcu_read_unlock(&nh->srcu, idx);
572 return ret;
573}
574EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
575
576int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
577 unsigned long val, void *v)
578{
579 return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
580}
581EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
582
583/**
584 * srcu_init_notifier_head - Initialize an SRCU notifier head
585 * @nh: Pointer to head of the srcu notifier chain
586 *
587 * Unlike other sorts of notifier heads, SRCU notifier heads require
588 * dynamic initialization. Be sure to call this routine before
589 * calling any of the other SRCU notifier routines for this head.
590 *
591 * If an SRCU notifier head is deallocated, it must first be cleaned
592 * up by calling srcu_cleanup_notifier_head(). Otherwise the head's
593 * per-cpu data (used by the SRCU mechanism) will leak.
594 */
595
596void srcu_init_notifier_head(struct srcu_notifier_head *nh)
597{
598 mutex_init(&nh->mutex);
599 if (init_srcu_struct(&nh->srcu) < 0)
600 BUG();
601 nh->head = NULL;
602}
603
604EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
605
606/**
607 * register_reboot_notifier - Register function to be called at reboot time
608 * @nb: Info about notifier function to be called
609 *
610 * Registers a function with the list of functions
611 * to be called at reboot time.
612 *
613 * Currently always returns zero, as blocking_notifier_chain_register()
614 * always returns zero.
615 */
616
617int register_reboot_notifier(struct notifier_block * nb)
618{
619 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
620}
621
622EXPORT_SYMBOL(register_reboot_notifier);
623
624/**
625 * unregister_reboot_notifier - Unregister previously registered reboot notifier
626 * @nb: Hook to be unregistered
627 *
628 * Unregisters a previously registered reboot
629 * notifier function.
630 *
631 * Returns zero on success, or %-ENOENT on failure.
632 */
633
634int unregister_reboot_notifier(struct notifier_block * nb)
635{
636 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
637}
638
639EXPORT_SYMBOL(unregister_reboot_notifier);
640 108
641static int set_one_prio(struct task_struct *p, int niceval, int error) 109static int set_one_prio(struct task_struct *p, int niceval, int error)
642{ 110{
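The block removed from sys.c above is the notifier-chain implementation together with its kernel-doc; the blocking-chain flavour backs the reboot notifier list. A hedged sketch of the API those comments describe, from a client's point of view (the callback body is illustrative):

#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_reboot_event(struct notifier_block *nb,
			   unsigned long action, void *data)
{
	/* quiesce hypothetical hardware before reboot/halt/power-off */
	return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
	.notifier_call	= my_reboot_event,
	.priority	= 0,	/* higher-priority callbacks run first */
};

/* registration/unregistration, e.g. from module init/exit: */
/*	register_reboot_notifier(&my_reboot_nb);	*/
/*	unregister_reboot_notifier(&my_reboot_nb);	*/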
@@ -684,7 +152,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
684 switch (which) { 152 switch (which) {
685 case PRIO_PROCESS: 153 case PRIO_PROCESS:
686 if (who) 154 if (who)
687 p = find_task_by_pid(who); 155 p = find_task_by_vpid(who);
688 else 156 else
689 p = current; 157 p = current;
690 if (p) 158 if (p)
@@ -692,7 +160,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
692 break; 160 break;
693 case PRIO_PGRP: 161 case PRIO_PGRP:
694 if (who) 162 if (who)
695 pgrp = find_pid(who); 163 pgrp = find_vpid(who);
696 else 164 else
697 pgrp = task_pgrp(current); 165 pgrp = task_pgrp(current);
698 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 166 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
@@ -741,7 +209,7 @@ asmlinkage long sys_getpriority(int which, int who)
741 switch (which) { 209 switch (which) {
742 case PRIO_PROCESS: 210 case PRIO_PROCESS:
743 if (who) 211 if (who)
744 p = find_task_by_pid(who); 212 p = find_task_by_vpid(who);
745 else 213 else
746 p = current; 214 p = current;
747 if (p) { 215 if (p) {
@@ -752,7 +220,7 @@ asmlinkage long sys_getpriority(int which, int who)
752 break; 220 break;
753 case PRIO_PGRP: 221 case PRIO_PGRP:
754 if (who) 222 if (who)
755 pgrp = find_pid(who); 223 pgrp = find_vpid(who);
756 else 224 else
757 pgrp = task_pgrp(current); 225 pgrp = task_pgrp(current);
758 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 226 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
@@ -1449,9 +917,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1449 struct task_struct *p; 917 struct task_struct *p;
1450 struct task_struct *group_leader = current->group_leader; 918 struct task_struct *group_leader = current->group_leader;
1451 int err = -EINVAL; 919 int err = -EINVAL;
920 struct pid_namespace *ns;
1452 921
1453 if (!pid) 922 if (!pid)
1454 pid = group_leader->pid; 923 pid = task_pid_vnr(group_leader);
1455 if (!pgid) 924 if (!pgid)
1456 pgid = pid; 925 pgid = pid;
1457 if (pgid < 0) 926 if (pgid < 0)
@@ -1460,10 +929,12 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1460 /* From this point forward we keep holding onto the tasklist lock 929 /* From this point forward we keep holding onto the tasklist lock
1461 * so that our parent does not change from under us. -DaveM 930 * so that our parent does not change from under us. -DaveM
1462 */ 931 */
932 ns = current->nsproxy->pid_ns;
933
1463 write_lock_irq(&tasklist_lock); 934 write_lock_irq(&tasklist_lock);
1464 935
1465 err = -ESRCH; 936 err = -ESRCH;
1466 p = find_task_by_pid(pid); 937 p = find_task_by_pid_ns(pid, ns);
1467 if (!p) 938 if (!p)
1468 goto out; 939 goto out;
1469 940
@@ -1489,9 +960,9 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1489 goto out; 960 goto out;
1490 961
1491 if (pgid != pid) { 962 if (pgid != pid) {
1492 struct task_struct *g = 963 struct task_struct *g;
1493 find_task_by_pid_type(PIDTYPE_PGID, pgid);
1494 964
965 g = find_task_by_pid_type_ns(PIDTYPE_PGID, pgid, ns);
1495 if (!g || task_session(g) != task_session(group_leader)) 966 if (!g || task_session(g) != task_session(group_leader))
1496 goto out; 967 goto out;
1497 } 968 }
@@ -1500,10 +971,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1500 if (err) 971 if (err)
1501 goto out; 972 goto out;
1502 973
1503 if (process_group(p) != pgid) { 974 if (task_pgrp_nr_ns(p, ns) != pgid) {
975 struct pid *pid;
976
1504 detach_pid(p, PIDTYPE_PGID); 977 detach_pid(p, PIDTYPE_PGID);
1505 p->signal->pgrp = pgid; 978 pid = find_vpid(pgid);
1506 attach_pid(p, PIDTYPE_PGID, find_pid(pgid)); 979 attach_pid(p, PIDTYPE_PGID, pid);
980 set_task_pgrp(p, pid_nr(pid));
1507 } 981 }
1508 982
1509 err = 0; 983 err = 0;
@@ -1516,19 +990,21 @@ out:
1516asmlinkage long sys_getpgid(pid_t pid) 990asmlinkage long sys_getpgid(pid_t pid)
1517{ 991{
1518 if (!pid) 992 if (!pid)
1519 return process_group(current); 993 return task_pgrp_vnr(current);
1520 else { 994 else {
1521 int retval; 995 int retval;
1522 struct task_struct *p; 996 struct task_struct *p;
997 struct pid_namespace *ns;
1523 998
1524 read_lock(&tasklist_lock); 999 ns = current->nsproxy->pid_ns;
1525 p = find_task_by_pid(pid);
1526 1000
1001 read_lock(&tasklist_lock);
1002 p = find_task_by_pid_ns(pid, ns);
1527 retval = -ESRCH; 1003 retval = -ESRCH;
1528 if (p) { 1004 if (p) {
1529 retval = security_task_getpgid(p); 1005 retval = security_task_getpgid(p);
1530 if (!retval) 1006 if (!retval)
1531 retval = process_group(p); 1007 retval = task_pgrp_nr_ns(p, ns);
1532 } 1008 }
1533 read_unlock(&tasklist_lock); 1009 read_unlock(&tasklist_lock);
1534 return retval; 1010 return retval;
@@ -1540,7 +1016,7 @@ asmlinkage long sys_getpgid(pid_t pid)
1540asmlinkage long sys_getpgrp(void) 1016asmlinkage long sys_getpgrp(void)
1541{ 1017{
1542 /* SMP - assuming writes are word atomic this is fine */ 1018 /* SMP - assuming writes are word atomic this is fine */
1543 return process_group(current); 1019 return task_pgrp_vnr(current);
1544} 1020}
1545 1021
1546#endif 1022#endif
@@ -1548,19 +1024,21 @@ asmlinkage long sys_getpgrp(void)
1548asmlinkage long sys_getsid(pid_t pid) 1024asmlinkage long sys_getsid(pid_t pid)
1549{ 1025{
1550 if (!pid) 1026 if (!pid)
1551 return process_session(current); 1027 return task_session_vnr(current);
1552 else { 1028 else {
1553 int retval; 1029 int retval;
1554 struct task_struct *p; 1030 struct task_struct *p;
1031 struct pid_namespace *ns;
1555 1032
1556 read_lock(&tasklist_lock); 1033 ns = current->nsproxy->pid_ns;
1557 p = find_task_by_pid(pid);
1558 1034
1035 read_lock(&tasklist_lock);
1036 p = find_task_by_pid_ns(pid, ns);
1559 retval = -ESRCH; 1037 retval = -ESRCH;
1560 if (p) { 1038 if (p) {
1561 retval = security_task_getsid(p); 1039 retval = security_task_getsid(p);
1562 if (!retval) 1040 if (!retval)
1563 retval = process_session(p); 1041 retval = task_session_nr_ns(p, ns);
1564 } 1042 }
1565 read_unlock(&tasklist_lock); 1043 read_unlock(&tasklist_lock);
1566 return retval; 1044 return retval;
@@ -1587,7 +1065,8 @@ asmlinkage long sys_setsid(void)
1587 * session id and so the check will always fail and make it so 1065 * session id and so the check will always fail and make it so
1588 * init cannot successfully call setsid. 1066 * init cannot successfully call setsid.
1589 */ 1067 */
1590 if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session)) 1068 if (session > 1 && find_task_by_pid_type_ns(PIDTYPE_PGID,
1069 session, &init_pid_ns))
1591 goto out; 1070 goto out;
1592 1071
1593 group_leader->signal->leader = 1; 1072 group_leader->signal->leader = 1;
@@ -1597,7 +1076,7 @@ asmlinkage long sys_setsid(void)
1597 group_leader->signal->tty = NULL; 1076 group_leader->signal->tty = NULL;
1598 spin_unlock(&group_leader->sighand->siglock); 1077 spin_unlock(&group_leader->sighand->siglock);
1599 1078
1600 err = process_group(group_leader); 1079 err = task_pgrp_vnr(group_leader);
1601out: 1080out:
1602 write_unlock_irq(&tasklist_lock); 1081 write_unlock_irq(&tasklist_lock);
1603 return err; 1082 return err;
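The kernel/sys.c changes above all follow one pattern: a numeric PID coming from userspace is now interpreted in the caller's pid namespace, so the plain find_pid()/find_task_by_pid() lookups and the process_group()/process_session() reads are replaced either by the *_vpid/*_vnr helpers or by the explicit *_ns forms with ns = current->nsproxy->pid_ns. Roughly, and only as a sketch of the relationship rather than a quote of kernel/pid.c:

        /* Illustrative only: how the v-helpers relate to the _ns forms. */
        static inline struct pid *find_vpid_sketch(int nr)
        {
                return find_pid_ns(nr, current->nsproxy->pid_ns);
        }

        static inline struct task_struct *find_task_by_vpid_sketch(pid_t vnr)
        {
                return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
        }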
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dde3d53e8adc..3b4efbe26445 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,7 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/capability.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
@@ -55,6 +55,8 @@
55#include <asm/stacktrace.h> 55#include <asm/stacktrace.h>
56#endif 56#endif
57 57
58static int deprecated_sysctl_warning(struct __sysctl_args *args);
59
58#if defined(CONFIG_SYSCTL) 60#if defined(CONFIG_SYSCTL)
59 61
60/* External variables not in a header file. */ 62/* External variables not in a header file. */
@@ -142,32 +144,29 @@ extern int max_lock_depth;
142 144
143#ifdef CONFIG_SYSCTL_SYSCALL 145#ifdef CONFIG_SYSCTL_SYSCALL
144static int parse_table(int __user *, int, void __user *, size_t __user *, 146static int parse_table(int __user *, int, void __user *, size_t __user *,
145 void __user *, size_t, ctl_table *); 147 void __user *, size_t, struct ctl_table *);
146#endif 148#endif
147 149
148 150
149#ifdef CONFIG_PROC_SYSCTL 151#ifdef CONFIG_PROC_SYSCTL
150static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 152static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
151 void __user *buffer, size_t *lenp, loff_t *ppos); 153 void __user *buffer, size_t *lenp, loff_t *ppos);
152static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp, 154static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp,
153 void __user *buffer, size_t *lenp, loff_t *ppos); 155 void __user *buffer, size_t *lenp, loff_t *ppos);
154#endif 156#endif
155 157
156static ctl_table root_table[]; 158static struct ctl_table root_table[];
157static struct ctl_table_header root_table_header = 159static struct ctl_table_header root_table_header =
158 { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; 160 { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) };
159 161
160static ctl_table kern_table[]; 162static struct ctl_table kern_table[];
161static ctl_table vm_table[]; 163static struct ctl_table vm_table[];
162static ctl_table fs_table[]; 164static struct ctl_table fs_table[];
163static ctl_table debug_table[]; 165static struct ctl_table debug_table[];
164static ctl_table dev_table[]; 166static struct ctl_table dev_table[];
165extern ctl_table random_table[]; 167extern struct ctl_table random_table[];
166#ifdef CONFIG_UNIX98_PTYS
167extern ctl_table pty_table[];
168#endif
169#ifdef CONFIG_INOTIFY_USER 168#ifdef CONFIG_INOTIFY_USER
170extern ctl_table inotify_table[]; 169extern struct ctl_table inotify_table[];
171#endif 170#endif
172 171
173#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 172#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
@@ -179,7 +178,7 @@ extern int lock_stat;
179 178
180/* The default sysctl tables: */ 179/* The default sysctl tables: */
181 180
182static ctl_table root_table[] = { 181static struct ctl_table root_table[] = {
183 { 182 {
184 .ctl_name = CTL_KERN, 183 .ctl_name = CTL_KERN,
185 .procname = "kernel", 184 .procname = "kernel",
@@ -232,7 +231,7 @@ static unsigned long min_wakeup_granularity_ns; /* 0 usecs */
232static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */ 231static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */
233#endif 232#endif
234 233
235static ctl_table kern_table[] = { 234static struct ctl_table kern_table[] = {
236#ifdef CONFIG_SCHED_DEBUG 235#ifdef CONFIG_SCHED_DEBUG
237 { 236 {
238 .ctl_name = CTL_UNNUMBERED, 237 .ctl_name = CTL_UNNUMBERED,
@@ -365,7 +364,6 @@ static ctl_table kern_table[] = {
365 }, 364 },
366#ifdef CONFIG_PROC_SYSCTL 365#ifdef CONFIG_PROC_SYSCTL
367 { 366 {
368 .ctl_name = KERN_TAINTED,
369 .procname = "tainted", 367 .procname = "tainted",
370 .data = &tainted, 368 .data = &tainted,
371 .maxlen = sizeof(int), 369 .maxlen = sizeof(int),
@@ -373,14 +371,15 @@ static ctl_table kern_table[] = {
373 .proc_handler = &proc_dointvec_taint, 371 .proc_handler = &proc_dointvec_taint,
374 }, 372 },
375#endif 373#endif
374#ifdef CONFIG_SECURITY_CAPABILITIES
376 { 375 {
377 .ctl_name = KERN_CAP_BSET,
378 .procname = "cap-bound", 376 .procname = "cap-bound",
379 .data = &cap_bset, 377 .data = &cap_bset,
380 .maxlen = sizeof(kernel_cap_t), 378 .maxlen = sizeof(kernel_cap_t),
381 .mode = 0600, 379 .mode = 0600,
382 .proc_handler = &proc_dointvec_bset, 380 .proc_handler = &proc_dointvec_bset,
383 }, 381 },
382#endif /* def CONFIG_SECURITY_CAPABILITIES */
384#ifdef CONFIG_BLK_DEV_INITRD 383#ifdef CONFIG_BLK_DEV_INITRD
385 { 384 {
386 .ctl_name = KERN_REALROOTDEV, 385 .ctl_name = KERN_REALROOTDEV,
@@ -514,7 +513,6 @@ static ctl_table kern_table[] = {
514#endif 513#endif
515#ifdef CONFIG_PROC_SYSCTL 514#ifdef CONFIG_PROC_SYSCTL
516 { 515 {
517 .ctl_name = KERN_CADPID,
518 .procname = "cad_pid", 516 .procname = "cad_pid",
519 .data = NULL, 517 .data = NULL,
520 .maxlen = sizeof (int), 518 .maxlen = sizeof (int),
@@ -536,14 +534,6 @@ static ctl_table kern_table[] = {
536 .mode = 0555, 534 .mode = 0555,
537 .child = random_table, 535 .child = random_table,
538 }, 536 },
539#ifdef CONFIG_UNIX98_PTYS
540 {
541 .ctl_name = KERN_PTY,
542 .procname = "pty",
543 .mode = 0555,
544 .child = pty_table,
545 },
546#endif
547 { 537 {
548 .ctl_name = KERN_OVERFLOWUID, 538 .ctl_name = KERN_OVERFLOWUID,
549 .procname = "overflowuid", 539 .procname = "overflowuid",
@@ -650,7 +640,6 @@ static ctl_table kern_table[] = {
650 .proc_handler = &proc_dointvec, 640 .proc_handler = &proc_dointvec,
651 }, 641 },
652 { 642 {
653 .ctl_name = KERN_NMI_WATCHDOG,
654 .procname = "nmi_watchdog", 643 .procname = "nmi_watchdog",
655 .data = &nmi_watchdog_enabled, 644 .data = &nmi_watchdog_enabled,
656 .maxlen = sizeof (int), 645 .maxlen = sizeof (int),
@@ -706,7 +695,6 @@ static ctl_table kern_table[] = {
706#endif 695#endif
707#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) 696#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
708 { 697 {
709 .ctl_name = KERN_ACPI_VIDEO_FLAGS,
710 .procname = "acpi_video_flags", 698 .procname = "acpi_video_flags",
711 .data = &acpi_realmode_flags, 699 .data = &acpi_realmode_flags,
712 .maxlen = sizeof (unsigned long), 700 .maxlen = sizeof (unsigned long),
@@ -783,7 +771,7 @@ static ctl_table kern_table[] = {
783 { .ctl_name = 0 } 771 { .ctl_name = 0 }
784}; 772};
785 773
786static ctl_table vm_table[] = { 774static struct ctl_table vm_table[] = {
787 { 775 {
788 .ctl_name = VM_OVERCOMMIT_MEMORY, 776 .ctl_name = VM_OVERCOMMIT_MEMORY,
789 .procname = "overcommit_memory", 777 .procname = "overcommit_memory",
@@ -847,7 +835,6 @@ static ctl_table vm_table[] = {
847 .extra2 = &one_hundred, 835 .extra2 = &one_hundred,
848 }, 836 },
849 { 837 {
850 .ctl_name = VM_DIRTY_WB_CS,
851 .procname = "dirty_writeback_centisecs", 838 .procname = "dirty_writeback_centisecs",
852 .data = &dirty_writeback_interval, 839 .data = &dirty_writeback_interval,
853 .maxlen = sizeof(dirty_writeback_interval), 840 .maxlen = sizeof(dirty_writeback_interval),
@@ -855,7 +842,6 @@ static ctl_table vm_table[] = {
855 .proc_handler = &dirty_writeback_centisecs_handler, 842 .proc_handler = &dirty_writeback_centisecs_handler,
856 }, 843 },
857 { 844 {
858 .ctl_name = VM_DIRTY_EXPIRE_CS,
859 .procname = "dirty_expire_centisecs", 845 .procname = "dirty_expire_centisecs",
860 .data = &dirty_expire_interval, 846 .data = &dirty_expire_interval,
861 .maxlen = sizeof(dirty_expire_interval), 847 .maxlen = sizeof(dirty_expire_interval),
@@ -883,7 +869,6 @@ static ctl_table vm_table[] = {
883 }, 869 },
884#ifdef CONFIG_HUGETLB_PAGE 870#ifdef CONFIG_HUGETLB_PAGE
885 { 871 {
886 .ctl_name = VM_HUGETLB_PAGES,
887 .procname = "nr_hugepages", 872 .procname = "nr_hugepages",
888 .data = &max_huge_pages, 873 .data = &max_huge_pages,
889 .maxlen = sizeof(unsigned long), 874 .maxlen = sizeof(unsigned long),
@@ -1093,12 +1078,12 @@ static ctl_table vm_table[] = {
1093}; 1078};
1094 1079
1095#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1080#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1096static ctl_table binfmt_misc_table[] = { 1081static struct ctl_table binfmt_misc_table[] = {
1097 { .ctl_name = 0 } 1082 { .ctl_name = 0 }
1098}; 1083};
1099#endif 1084#endif
1100 1085
1101static ctl_table fs_table[] = { 1086static struct ctl_table fs_table[] = {
1102 { 1087 {
1103 .ctl_name = FS_NRINODE, 1088 .ctl_name = FS_NRINODE,
1104 .procname = "inode-nr", 1089 .procname = "inode-nr",
@@ -1116,7 +1101,6 @@ static ctl_table fs_table[] = {
1116 .proc_handler = &proc_dointvec, 1101 .proc_handler = &proc_dointvec,
1117 }, 1102 },
1118 { 1103 {
1119 .ctl_name = FS_NRFILE,
1120 .procname = "file-nr", 1104 .procname = "file-nr",
1121 .data = &files_stat, 1105 .data = &files_stat,
1122 .maxlen = 3*sizeof(int), 1106 .maxlen = 3*sizeof(int),
@@ -1192,7 +1176,6 @@ static ctl_table fs_table[] = {
1192 .extra2 = &two, 1176 .extra2 = &two,
1193 }, 1177 },
1194 { 1178 {
1195 .ctl_name = FS_AIO_NR,
1196 .procname = "aio-nr", 1179 .procname = "aio-nr",
1197 .data = &aio_nr, 1180 .data = &aio_nr,
1198 .maxlen = sizeof(aio_nr), 1181 .maxlen = sizeof(aio_nr),
@@ -1200,7 +1183,6 @@ static ctl_table fs_table[] = {
1200 .proc_handler = &proc_doulongvec_minmax, 1183 .proc_handler = &proc_doulongvec_minmax,
1201 }, 1184 },
1202 { 1185 {
1203 .ctl_name = FS_AIO_MAX_NR,
1204 .procname = "aio-max-nr", 1186 .procname = "aio-max-nr",
1205 .data = &aio_max_nr, 1187 .data = &aio_max_nr,
1206 .maxlen = sizeof(aio_max_nr), 1188 .maxlen = sizeof(aio_max_nr),
@@ -1239,7 +1221,7 @@ static ctl_table fs_table[] = {
1239 { .ctl_name = 0 } 1221 { .ctl_name = 0 }
1240}; 1222};
1241 1223
1242static ctl_table debug_table[] = { 1224static struct ctl_table debug_table[] = {
1243#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1225#if defined(CONFIG_X86) || defined(CONFIG_PPC)
1244 { 1226 {
1245 .ctl_name = CTL_UNNUMBERED, 1227 .ctl_name = CTL_UNNUMBERED,
@@ -1253,7 +1235,7 @@ static ctl_table debug_table[] = {
1253 { .ctl_name = 0 } 1235 { .ctl_name = 0 }
1254}; 1236};
1255 1237
1256static ctl_table dev_table[] = { 1238static struct ctl_table dev_table[] = {
1257 { .ctl_name = 0 } 1239 { .ctl_name = 0 }
1258}; 1240};
1259 1241
@@ -1369,10 +1351,15 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
1369 if (copy_from_user(&tmp, args, sizeof(tmp))) 1351 if (copy_from_user(&tmp, args, sizeof(tmp)))
1370 return -EFAULT; 1352 return -EFAULT;
1371 1353
1354 error = deprecated_sysctl_warning(&tmp);
1355 if (error)
1356 goto out;
1357
1372 lock_kernel(); 1358 lock_kernel();
1373 error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, 1359 error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
1374 tmp.newval, tmp.newlen); 1360 tmp.newval, tmp.newlen);
1375 unlock_kernel(); 1361 unlock_kernel();
1362out:
1376 return error; 1363 return error;
1377} 1364}
1378#endif /* CONFIG_SYSCTL_SYSCALL */ 1365#endif /* CONFIG_SYSCTL_SYSCALL */
@@ -1393,7 +1380,7 @@ static int test_perm(int mode, int op)
1393 return -EACCES; 1380 return -EACCES;
1394} 1381}
1395 1382
1396int sysctl_perm(ctl_table *table, int op) 1383int sysctl_perm(struct ctl_table *table, int op)
1397{ 1384{
1398 int error; 1385 int error;
1399 error = security_sysctl(table, op); 1386 error = security_sysctl(table, op);
@@ -1406,7 +1393,7 @@ int sysctl_perm(ctl_table *table, int op)
1406static int parse_table(int __user *name, int nlen, 1393static int parse_table(int __user *name, int nlen,
1407 void __user *oldval, size_t __user *oldlenp, 1394 void __user *oldval, size_t __user *oldlenp,
1408 void __user *newval, size_t newlen, 1395 void __user *newval, size_t newlen,
1409 ctl_table *table) 1396 struct ctl_table *table)
1410{ 1397{
1411 int n; 1398 int n;
1412repeat: 1399repeat:
@@ -1437,13 +1424,12 @@ repeat:
1437} 1424}
1438 1425
1439/* Perform the actual read/write of a sysctl table entry. */ 1426/* Perform the actual read/write of a sysctl table entry. */
1440int do_sysctl_strategy (ctl_table *table, 1427int do_sysctl_strategy (struct ctl_table *table,
1441 int __user *name, int nlen, 1428 int __user *name, int nlen,
1442 void __user *oldval, size_t __user *oldlenp, 1429 void __user *oldval, size_t __user *oldlenp,
1443 void __user *newval, size_t newlen) 1430 void __user *newval, size_t newlen)
1444{ 1431{
1445 int op = 0, rc; 1432 int op = 0, rc;
1446 size_t len;
1447 1433
1448 if (oldval) 1434 if (oldval)
1449 op |= 004; 1435 op |= 004;
@@ -1464,25 +1450,10 @@ int do_sysctl_strategy (ctl_table *table,
1464 /* If there is no strategy routine, or if the strategy returns 1450 /* If there is no strategy routine, or if the strategy returns
1465 * zero, proceed with automatic r/w */ 1451 * zero, proceed with automatic r/w */
1466 if (table->data && table->maxlen) { 1452 if (table->data && table->maxlen) {
1467 if (oldval && oldlenp) { 1453 rc = sysctl_data(table, name, nlen, oldval, oldlenp,
1468 if (get_user(len, oldlenp)) 1454 newval, newlen);
1469 return -EFAULT; 1455 if (rc < 0)
1470 if (len) { 1456 return rc;
1471 if (len > table->maxlen)
1472 len = table->maxlen;
1473 if(copy_to_user(oldval, table->data, len))
1474 return -EFAULT;
1475 if(put_user(len, oldlenp))
1476 return -EFAULT;
1477 }
1478 }
1479 if (newval && newlen) {
1480 len = newlen;
1481 if (len > table->maxlen)
1482 len = table->maxlen;
1483 if(copy_from_user(table->data, newval, len))
1484 return -EFAULT;
1485 }
1486 } 1457 }
1487 return 0; 1458 return 0;
1488} 1459}
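The comment above spells out the strategy-routine convention that the new sysctl_data() helper (added further down in this patch) also follows: a negative return value is an error, zero means "let the generic code copy table->data", and a positive value means the routine handled the request itself. A minimal custom strategy under those rules might look like this (my_clamp_strategy and the 0..100 range are assumptions for illustration only, not kernel code):

        #include <linux/errno.h>
        #include <linux/sysctl.h>
        #include <linux/uaccess.h>

        /* Illustrative strategy routine: reject out-of-range writes, then
         * return 0 so the generic sysctl_data() copy does the rest. */
        static int my_clamp_strategy(struct ctl_table *table, int __user *name,
                                     int nlen,
                                     void __user *oldval, size_t __user *oldlenp,
                                     void __user *newval, size_t newlen)
        {
                if (newval && newlen == sizeof(int)) {
                        int val;

                        if (copy_from_user(&val, newval, sizeof(val)))
                                return -EFAULT;
                        if (val < 0 || val > 100)
                                return -EINVAL;
                }
                return 0;
        }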
@@ -1499,7 +1470,9 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1499 1470
1500static __init int sysctl_init(void) 1471static __init int sysctl_init(void)
1501{ 1472{
1473 int err;
1502 sysctl_set_parent(NULL, root_table); 1474 sysctl_set_parent(NULL, root_table);
1475 err = sysctl_check_table(root_table);
1503 return 0; 1476 return 0;
1504} 1477}
1505 1478
@@ -1512,7 +1485,7 @@ core_initcall(sysctl_init);
1512 * Register a sysctl table hierarchy. @table should be a filled in ctl_table 1485 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1513 * array. An entry with a ctl_name of 0 terminates the table. 1486 * array. An entry with a ctl_name of 0 terminates the table.
1514 * 1487 *
1515 * The members of the &ctl_table structure are used as follows: 1488 * The members of the &struct ctl_table structure are used as follows:
1516 * 1489 *
1517 * ctl_name - This is the numeric sysctl value used by sysctl(2). The number 1490 * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
1518 * must be unique within that level of sysctl 1491 * must be unique within that level of sysctl
@@ -1573,7 +1546,7 @@ core_initcall(sysctl_init);
1573 * This routine returns %NULL on a failure to register, and a pointer 1546 * This routine returns %NULL on a failure to register, and a pointer
1574 * to the table header on success. 1547 * to the table header on success.
1575 */ 1548 */
1576struct ctl_table_header *register_sysctl_table(ctl_table * table) 1549struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
1577{ 1550{
1578 struct ctl_table_header *tmp; 1551 struct ctl_table_header *tmp;
1579 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); 1552 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
@@ -1584,6 +1557,10 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table)
1584 tmp->used = 0; 1557 tmp->used = 0;
1585 tmp->unregistering = NULL; 1558 tmp->unregistering = NULL;
1586 sysctl_set_parent(NULL, table); 1559 sysctl_set_parent(NULL, table);
1560 if (sysctl_check_table(tmp->ctl_table)) {
1561 kfree(tmp);
1562 return NULL;
1563 }
1587 spin_lock(&sysctl_lock); 1564 spin_lock(&sysctl_lock);
1588 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); 1565 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
1589 spin_unlock(&sysctl_lock); 1566 spin_unlock(&sysctl_lock);
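Putting the registration interface described above together (my_value, my_table and my_table_header are illustrative names): a caller fills in a struct ctl_table array terminated by an entry with ctl_name 0 and passes it to register_sysctl_table(); with this patch, registration also returns NULL when sysctl_check_table() rejects the table. The returned header is later passed to unregister_sysctl_table().

        #include <linux/errno.h>
        #include <linux/init.h>
        #include <linux/sysctl.h>

        static int my_value;

        static struct ctl_table my_table[] = {
                {
                        .ctl_name       = CTL_UNNUMBERED, /* /proc/sys only */
                        .procname       = "my_value",
                        .data           = &my_value,
                        .maxlen         = sizeof(int),
                        .mode           = 0644,
                        .proc_handler   = &proc_dointvec,
                },
                { .ctl_name = 0 }       /* terminator */
        };

        static struct ctl_table_header *my_table_header;

        static int __init my_table_init(void)
        {
                my_table_header = register_sysctl_table(my_table);
                /* NULL on allocation failure or sysctl_check_table() rejection */
                if (!my_table_header)
                        return -ENOMEM;
                return 0;
        }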
@@ -1607,7 +1584,7 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1607} 1584}
1608 1585
1609#else /* !CONFIG_SYSCTL */ 1586#else /* !CONFIG_SYSCTL */
1610struct ctl_table_header *register_sysctl_table(ctl_table * table) 1587struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
1611{ 1588{
1612 return NULL; 1589 return NULL;
1613} 1590}
@@ -1700,7 +1677,7 @@ static int _proc_do_string(void* data, int maxlen, int write,
1700 * 1677 *
1701 * Returns 0 on success. 1678 * Returns 0 on success.
1702 */ 1679 */
1703int proc_dostring(ctl_table *table, int write, struct file *filp, 1680int proc_dostring(struct ctl_table *table, int write, struct file *filp,
1704 void __user *buffer, size_t *lenp, loff_t *ppos) 1681 void __user *buffer, size_t *lenp, loff_t *ppos)
1705{ 1682{
1706 return _proc_do_string(table->data, table->maxlen, write, filp, 1683 return _proc_do_string(table->data, table->maxlen, write, filp,
@@ -1727,7 +1704,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
1727 return 0; 1704 return 0;
1728} 1705}
1729 1706
1730static int __do_proc_dointvec(void *tbl_data, ctl_table *table, 1707static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
1731 int write, struct file *filp, void __user *buffer, 1708 int write, struct file *filp, void __user *buffer,
1732 size_t *lenp, loff_t *ppos, 1709 size_t *lenp, loff_t *ppos,
1733 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 1710 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
@@ -1837,7 +1814,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
1837#undef TMPBUFLEN 1814#undef TMPBUFLEN
1838} 1815}
1839 1816
1840static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, 1817static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp,
1841 void __user *buffer, size_t *lenp, loff_t *ppos, 1818 void __user *buffer, size_t *lenp, loff_t *ppos,
1842 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 1819 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
1843 int write, void *data), 1820 int write, void *data),
@@ -1861,7 +1838,7 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
1861 * 1838 *
1862 * Returns 0 on success. 1839 * Returns 0 on success.
1863 */ 1840 */
1864int proc_dointvec(ctl_table *table, int write, struct file *filp, 1841int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
1865 void __user *buffer, size_t *lenp, loff_t *ppos) 1842 void __user *buffer, size_t *lenp, loff_t *ppos)
1866{ 1843{
1867 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 1844 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
@@ -1897,11 +1874,12 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1897 return 0; 1874 return 0;
1898} 1875}
1899 1876
1877#ifdef CONFIG_SECURITY_CAPABILITIES
1900/* 1878/*
1901 * init may raise the set. 1879 * init may raise the set.
1902 */ 1880 */
1903 1881
1904int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, 1882int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp,
1905 void __user *buffer, size_t *lenp, loff_t *ppos) 1883 void __user *buffer, size_t *lenp, loff_t *ppos)
1906{ 1884{
1907 int op; 1885 int op;
@@ -1910,15 +1888,16 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
1910 return -EPERM; 1888 return -EPERM;
1911 } 1889 }
1912 1890
1913 op = is_init(current) ? OP_SET : OP_AND; 1891 op = is_global_init(current) ? OP_SET : OP_AND;
1914 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 1892 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
1915 do_proc_dointvec_bset_conv,&op); 1893 do_proc_dointvec_bset_conv,&op);
1916} 1894}
1895#endif /* def CONFIG_SECURITY_CAPABILITIES */
1917 1896
1918/* 1897/*
1919 * Taint values can only be increased 1898 * Taint values can only be increased
1920 */ 1899 */
1921static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp, 1900static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp,
1922 void __user *buffer, size_t *lenp, loff_t *ppos) 1901 void __user *buffer, size_t *lenp, loff_t *ppos)
1923{ 1902{
1924 int op; 1903 int op;
@@ -1977,7 +1956,7 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
1977 * 1956 *
1978 * Returns 0 on success. 1957 * Returns 0 on success.
1979 */ 1958 */
1980int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, 1959int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
1981 void __user *buffer, size_t *lenp, loff_t *ppos) 1960 void __user *buffer, size_t *lenp, loff_t *ppos)
1982{ 1961{
1983 struct do_proc_dointvec_minmax_conv_param param = { 1962 struct do_proc_dointvec_minmax_conv_param param = {
@@ -1988,7 +1967,7 @@ int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
1988 do_proc_dointvec_minmax_conv, &param); 1967 do_proc_dointvec_minmax_conv, &param);
1989} 1968}
1990 1969
1991static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write, 1970static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
1992 struct file *filp, 1971 struct file *filp,
1993 void __user *buffer, 1972 void __user *buffer,
1994 size_t *lenp, loff_t *ppos, 1973 size_t *lenp, loff_t *ppos,
@@ -2093,7 +2072,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
2093#undef TMPBUFLEN 2072#undef TMPBUFLEN
2094} 2073}
2095 2074
2096static int do_proc_doulongvec_minmax(ctl_table *table, int write, 2075static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2097 struct file *filp, 2076 struct file *filp,
2098 void __user *buffer, 2077 void __user *buffer,
2099 size_t *lenp, loff_t *ppos, 2078 size_t *lenp, loff_t *ppos,
@@ -2121,7 +2100,7 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write,
2121 * 2100 *
2122 * Returns 0 on success. 2101 * Returns 0 on success.
2123 */ 2102 */
2124int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, 2103int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
2125 void __user *buffer, size_t *lenp, loff_t *ppos) 2104 void __user *buffer, size_t *lenp, loff_t *ppos)
2126{ 2105{
2127 return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); 2106 return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l);
@@ -2145,7 +2124,7 @@ int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
2145 * 2124 *
2146 * Returns 0 on success. 2125 * Returns 0 on success.
2147 */ 2126 */
2148int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, 2127int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2149 struct file *filp, 2128 struct file *filp,
2150 void __user *buffer, 2129 void __user *buffer,
2151 size_t *lenp, loff_t *ppos) 2130 size_t *lenp, loff_t *ppos)
@@ -2238,7 +2217,7 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2238 * 2217 *
2239 * Returns 0 on success. 2218 * Returns 0 on success.
2240 */ 2219 */
2241int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, 2220int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2242 void __user *buffer, size_t *lenp, loff_t *ppos) 2221 void __user *buffer, size_t *lenp, loff_t *ppos)
2243{ 2222{
2244 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2223 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
@@ -2261,7 +2240,7 @@ int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
2261 * 2240 *
2262 * Returns 0 on success. 2241 * Returns 0 on success.
2263 */ 2242 */
2264int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, 2243int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
2265 void __user *buffer, size_t *lenp, loff_t *ppos) 2244 void __user *buffer, size_t *lenp, loff_t *ppos)
2266{ 2245{
2267 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2246 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
@@ -2285,21 +2264,21 @@ int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
2285 * 2264 *
2286 * Returns 0 on success. 2265 * Returns 0 on success.
2287 */ 2266 */
2288int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, 2267int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
2289 void __user *buffer, size_t *lenp, loff_t *ppos) 2268 void __user *buffer, size_t *lenp, loff_t *ppos)
2290{ 2269{
2291 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2270 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
2292 do_proc_dointvec_ms_jiffies_conv, NULL); 2271 do_proc_dointvec_ms_jiffies_conv, NULL);
2293} 2272}
2294 2273
2295static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 2274static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
2296 void __user *buffer, size_t *lenp, loff_t *ppos) 2275 void __user *buffer, size_t *lenp, loff_t *ppos)
2297{ 2276{
2298 struct pid *new_pid; 2277 struct pid *new_pid;
2299 pid_t tmp; 2278 pid_t tmp;
2300 int r; 2279 int r;
2301 2280
2302 tmp = pid_nr(cad_pid); 2281 tmp = pid_nr_ns(cad_pid, current->nsproxy->pid_ns);
2303 2282
2304 r = __do_proc_dointvec(&tmp, table, write, filp, buffer, 2283 r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
2305 lenp, ppos, NULL, NULL); 2284 lenp, ppos, NULL, NULL);
@@ -2316,55 +2295,55 @@ static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
2316 2295
2317#else /* CONFIG_PROC_FS */ 2296#else /* CONFIG_PROC_FS */
2318 2297
2319int proc_dostring(ctl_table *table, int write, struct file *filp, 2298int proc_dostring(struct ctl_table *table, int write, struct file *filp,
2320 void __user *buffer, size_t *lenp, loff_t *ppos) 2299 void __user *buffer, size_t *lenp, loff_t *ppos)
2321{ 2300{
2322 return -ENOSYS; 2301 return -ENOSYS;
2323} 2302}
2324 2303
2325int proc_dointvec(ctl_table *table, int write, struct file *filp, 2304int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2326 void __user *buffer, size_t *lenp, loff_t *ppos) 2305 void __user *buffer, size_t *lenp, loff_t *ppos)
2327{ 2306{
2328 return -ENOSYS; 2307 return -ENOSYS;
2329} 2308}
2330 2309
2331int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, 2310int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp,
2332 void __user *buffer, size_t *lenp, loff_t *ppos) 2311 void __user *buffer, size_t *lenp, loff_t *ppos)
2333{ 2312{
2334 return -ENOSYS; 2313 return -ENOSYS;
2335} 2314}
2336 2315
2337int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, 2316int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
2338 void __user *buffer, size_t *lenp, loff_t *ppos) 2317 void __user *buffer, size_t *lenp, loff_t *ppos)
2339{ 2318{
2340 return -ENOSYS; 2319 return -ENOSYS;
2341} 2320}
2342 2321
2343int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, 2322int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2344 void __user *buffer, size_t *lenp, loff_t *ppos) 2323 void __user *buffer, size_t *lenp, loff_t *ppos)
2345{ 2324{
2346 return -ENOSYS; 2325 return -ENOSYS;
2347} 2326}
2348 2327
2349int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, 2328int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
2350 void __user *buffer, size_t *lenp, loff_t *ppos) 2329 void __user *buffer, size_t *lenp, loff_t *ppos)
2351{ 2330{
2352 return -ENOSYS; 2331 return -ENOSYS;
2353} 2332}
2354 2333
2355int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, 2334int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
2356 void __user *buffer, size_t *lenp, loff_t *ppos) 2335 void __user *buffer, size_t *lenp, loff_t *ppos)
2357{ 2336{
2358 return -ENOSYS; 2337 return -ENOSYS;
2359} 2338}
2360 2339
2361int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, 2340int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
2362 void __user *buffer, size_t *lenp, loff_t *ppos) 2341 void __user *buffer, size_t *lenp, loff_t *ppos)
2363{ 2342{
2364 return -ENOSYS; 2343 return -ENOSYS;
2365} 2344}
2366 2345
2367int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, 2346int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2368 struct file *filp, 2347 struct file *filp,
2369 void __user *buffer, 2348 void __user *buffer,
2370 size_t *lenp, loff_t *ppos) 2349 size_t *lenp, loff_t *ppos)
@@ -2381,8 +2360,42 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2381 * General sysctl support routines 2360 * General sysctl support routines
2382 */ 2361 */
2383 2362
2363/* The generic sysctl data routine (used if no strategy routine supplied) */
2364int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
2365 void __user *oldval, size_t __user *oldlenp,
2366 void __user *newval, size_t newlen)
2367{
2368 size_t len;
2369
2370 /* Get out of I don't have a variable */
2371 if (!table->data || !table->maxlen)
2372 return -ENOTDIR;
2373
2374 if (oldval && oldlenp) {
2375 if (get_user(len, oldlenp))
2376 return -EFAULT;
2377 if (len) {
2378 if (len > table->maxlen)
2379 len = table->maxlen;
2380 if (copy_to_user(oldval, table->data, len))
2381 return -EFAULT;
2382 if (put_user(len, oldlenp))
2383 return -EFAULT;
2384 }
2385 }
2386
2387 if (newval && newlen) {
2388 if (newlen > table->maxlen)
2389 newlen = table->maxlen;
2390
2391 if (copy_from_user(table->data, newval, newlen))
2392 return -EFAULT;
2393 }
2394 return 1;
2395}
2396
2384/* The generic string strategy routine: */ 2397/* The generic string strategy routine: */
2385int sysctl_string(ctl_table *table, int __user *name, int nlen, 2398int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
2386 void __user *oldval, size_t __user *oldlenp, 2399 void __user *oldval, size_t __user *oldlenp,
2387 void __user *newval, size_t newlen) 2400 void __user *newval, size_t newlen)
2388{ 2401{
@@ -2428,7 +2441,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
2428 * are between the minimum and maximum values given in the arrays 2441 * are between the minimum and maximum values given in the arrays
2429 * table->extra1 and table->extra2, respectively. 2442 * table->extra1 and table->extra2, respectively.
2430 */ 2443 */
2431int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2444int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
2432 void __user *oldval, size_t __user *oldlenp, 2445 void __user *oldval, size_t __user *oldlenp,
2433 void __user *newval, size_t newlen) 2446 void __user *newval, size_t newlen)
2434{ 2447{
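The bounds handling described just above is driven by the table entry itself: both sysctl_intvec (binary interface) and proc_dointvec_minmax (/proc/sys interface) take the limits from table->extra1 and table->extra2 as pointers to the minimum and maximum values. An illustrative entry, mirroring the &one_hundred pattern used by vm_table earlier in this file (the variable and procname are assumptions):

        #include <linux/sysctl.h>

        static int zero;
        static int one_hundred = 100;
        static int my_ratio = 20;

        static struct ctl_table my_ratio_entry[] = {
                {
                        .ctl_name       = CTL_UNNUMBERED,
                        .procname       = "my_ratio",
                        .data           = &my_ratio,
                        .maxlen         = sizeof(int),
                        .mode           = 0644,
                        .proc_handler   = &proc_dointvec_minmax,
                        .extra1         = &zero,        /* minimum */
                        .extra2         = &one_hundred, /* maximum */
                },
                { .ctl_name = 0 }
        };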
@@ -2464,7 +2477,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2464} 2477}
2465 2478
2466/* Strategy function to convert jiffies to seconds */ 2479/* Strategy function to convert jiffies to seconds */
2467int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2480int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
2468 void __user *oldval, size_t __user *oldlenp, 2481 void __user *oldval, size_t __user *oldlenp,
2469 void __user *newval, size_t newlen) 2482 void __user *newval, size_t newlen)
2470{ 2483{
@@ -2498,7 +2511,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2498} 2511}
2499 2512
2500/* Strategy function to convert jiffies to seconds */ 2513/* Strategy function to convert jiffies to seconds */
2501int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2514int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen,
2502 void __user *oldval, size_t __user *oldlenp, 2515 void __user *oldval, size_t __user *oldlenp,
2503 void __user *newval, size_t newlen) 2516 void __user *newval, size_t newlen)
2504{ 2517{
@@ -2538,59 +2551,50 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2538 2551
2539asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 2552asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2540{ 2553{
2541 static int msg_count;
2542 struct __sysctl_args tmp; 2554 struct __sysctl_args tmp;
2543 int name[CTL_MAXNAME]; 2555 int error;
2544 int i;
2545 2556
2546 /* Read in the sysctl name for better debug message logging */
2547 if (copy_from_user(&tmp, args, sizeof(tmp))) 2557 if (copy_from_user(&tmp, args, sizeof(tmp)))
2548 return -EFAULT; 2558 return -EFAULT;
2549 if (tmp.nlen <= 0 || tmp.nlen >= CTL_MAXNAME)
2550 return -ENOTDIR;
2551 for (i = 0; i < tmp.nlen; i++)
2552 if (get_user(name[i], tmp.name + i))
2553 return -EFAULT;
2554 2559
2555 /* Ignore accesses to kernel.version */ 2560 error = deprecated_sysctl_warning(&tmp);
2556 if ((tmp.nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
2557 goto out;
2558 2561
2559 if (msg_count < 5) { 2562 /* If no error reading the parameters then just -ENOSYS ... */
2560 msg_count++; 2563 if (!error)
2561 printk(KERN_INFO 2564 error = -ENOSYS;
2562 "warning: process `%s' used the removed sysctl " 2565
2563 "system call with ", current->comm); 2566 return error;
2564 for (i = 0; i < tmp.nlen; i++) 2567}
2565 printk("%d.", name[i]); 2568
2566 printk("\n"); 2569int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
2567 } 2570 void __user *oldval, size_t __user *oldlenp,
2568out: 2571 void __user *newval, size_t newlen)
2572{
2569 return -ENOSYS; 2573 return -ENOSYS;
2570} 2574}
2571 2575
2572int sysctl_string(ctl_table *table, int __user *name, int nlen, 2576int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
2573 void __user *oldval, size_t __user *oldlenp, 2577 void __user *oldval, size_t __user *oldlenp,
2574 void __user *newval, size_t newlen) 2578 void __user *newval, size_t newlen)
2575{ 2579{
2576 return -ENOSYS; 2580 return -ENOSYS;
2577} 2581}
2578 2582
2579int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2583int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
2580 void __user *oldval, size_t __user *oldlenp, 2584 void __user *oldval, size_t __user *oldlenp,
2581 void __user *newval, size_t newlen) 2585 void __user *newval, size_t newlen)
2582{ 2586{
2583 return -ENOSYS; 2587 return -ENOSYS;
2584} 2588}
2585 2589
2586int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2590int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
2587 void __user *oldval, size_t __user *oldlenp, 2591 void __user *oldval, size_t __user *oldlenp,
2588 void __user *newval, size_t newlen) 2592 void __user *newval, size_t newlen)
2589{ 2593{
2590 return -ENOSYS; 2594 return -ENOSYS;
2591} 2595}
2592 2596
2593int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2597int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen,
2594 void __user *oldval, size_t __user *oldlenp, 2598 void __user *oldval, size_t __user *oldlenp,
2595 void __user *newval, size_t newlen) 2599 void __user *newval, size_t newlen)
2596{ 2600{
@@ -2599,6 +2603,33 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2599 2603
2600#endif /* CONFIG_SYSCTL_SYSCALL */ 2604#endif /* CONFIG_SYSCTL_SYSCALL */
2601 2605
2606static int deprecated_sysctl_warning(struct __sysctl_args *args)
2607{
2608 static int msg_count;
2609 int name[CTL_MAXNAME];
2610 int i;
2611
2612 /* Read in the sysctl name for better debug message logging */
2613 for (i = 0; i < args->nlen; i++)
2614 if (get_user(name[i], args->name + i))
2615 return -EFAULT;
2616
2617 /* Ignore accesses to kernel.version */
2618 if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
2619 return 0;
2620
2621 if (msg_count < 5) {
2622 msg_count++;
2623 printk(KERN_INFO
2624 "warning: process `%s' used the deprecated sysctl "
2625 "system call with ", current->comm);
2626 for (i = 0; i < args->nlen; i++)
2627 printk("%d.", name[i]);
2628 printk("\n");
2629 }
2630 return 0;
2631}
2632
2602/* 2633/*
2603 * No sense putting this after each symbol definition, twice, 2634 * No sense putting this after each symbol definition, twice,
2604 * exception granted :-) 2635 * exception granted :-)
@@ -2616,4 +2647,5 @@ EXPORT_SYMBOL(sysctl_intvec);
2616EXPORT_SYMBOL(sysctl_jiffies); 2647EXPORT_SYMBOL(sysctl_jiffies);
2617EXPORT_SYMBOL(sysctl_ms_jiffies); 2648EXPORT_SYMBOL(sysctl_ms_jiffies);
2618EXPORT_SYMBOL(sysctl_string); 2649EXPORT_SYMBOL(sysctl_string);
2650EXPORT_SYMBOL(sysctl_data);
2619EXPORT_SYMBOL(unregister_sysctl_table); 2651EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
new file mode 100644
index 000000000000..3c9ef5a7d575
--- /dev/null
+++ b/kernel/sysctl_check.c
@@ -0,0 +1,1588 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../arch/s390/appldata/appldata.h"
4#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
5#include <linux/sunrpc/debug.h>
6#include <linux/string.h>
7#include <net/ip_vs.h>
8
9struct trans_ctl_table {
10 int ctl_name;
11 const char *procname;
12 struct trans_ctl_table *child;
13};
14
15static struct trans_ctl_table trans_random_table[] = {
16 { RANDOM_POOLSIZE, "poolsize" },
17 { RANDOM_ENTROPY_COUNT, "entropy_avail" },
18 { RANDOM_READ_THRESH, "read_wakeup_threshold" },
19 { RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
20 { RANDOM_BOOT_ID, "boot_id" },
21 { RANDOM_UUID, "uuid" },
22 {}
23};
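The trans_ctl_table arrays that make up the rest of this new file form a tree keyed by the old binary sysctl numbers, mirroring the procname hierarchy so the checker can verify name/number pairs. As a rough illustration of how one level of such a tree can be searched (this helper is not the file's actual checking code, and it ignores the wildcard { 0, NULL, child } entries that appear in a few tables below):

        /* Illustrative: resolve one binary ctl_name at a given tree level. */
        static const char *trans_lookup(const struct trans_ctl_table *level,
                                        int ctl_name,
                                        struct trans_ctl_table **childp)
        {
                const struct trans_ctl_table *t;

                for (t = level; t->ctl_name || t->procname; t++) {
                        if (t->ctl_name == ctl_name) {
                                *childp = t->child;     /* NULL for leaf entries */
                                return t->procname;     /* e.g. "poolsize" for RANDOM_POOLSIZE */
                        }
                }
                return NULL;    /* number not known at this level */
        }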
24
25static struct trans_ctl_table trans_pty_table[] = {
26 { PTY_MAX, "max" },
27 { PTY_NR, "nr" },
28 {}
29};
30
31static struct trans_ctl_table trans_kern_table[] = {
32 { KERN_OSTYPE, "ostype" },
33 { KERN_OSRELEASE, "osrelease" },
34 /* KERN_OSREV not used */
35 { KERN_VERSION, "version" },
36 /* KERN_SECUREMASK not used */
37 /* KERN_PROF not used */
38 { KERN_NODENAME, "hostname" },
39 { KERN_DOMAINNAME, "domainname" },
40
41#ifdef CONFIG_SECURITY_CAPABILITIES
42 { KERN_CAP_BSET, "cap-bound" },
43#endif /* def CONFIG_SECURITY_CAPABILITIES */
44
45 { KERN_PANIC, "panic" },
46 { KERN_REALROOTDEV, "real-root-dev" },
47
48 { KERN_SPARC_REBOOT, "reboot-cmd" },
49 { KERN_CTLALTDEL, "ctrl-alt-del" },
50 { KERN_PRINTK, "printk" },
51
52 /* KERN_NAMETRANS not used */
53 /* KERN_PPC_HTABRECLAIM not used */
54 /* KERN_PPC_ZEROPAGED not used */
55 { KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
56
57 { KERN_MODPROBE, "modprobe" },
58 { KERN_SG_BIG_BUFF, "sg-big-buff" },
59 { KERN_ACCT, "acct" },
60 { KERN_PPC_L2CR, "l2cr" },
61
62 /* KERN_RTSIGNR not used */
63 /* KERN_RTSIGMAX not used */
64
65 { KERN_SHMMAX, "shmmax" },
66 { KERN_MSGMAX, "msgmax" },
67 { KERN_MSGMNB, "msgmnb" },
68 /* KERN_MSGPOOL not used*/
69 { KERN_SYSRQ, "sysrq" },
70 { KERN_MAX_THREADS, "threads-max" },
71 { KERN_RANDOM, "random", trans_random_table },
72 { KERN_SHMALL, "shmall" },
73 { KERN_MSGMNI, "msgmni" },
74 { KERN_SEM, "sem" },
75 { KERN_SPARC_STOP_A, "stop-a" },
76 { KERN_SHMMNI, "shmmni" },
77
78 { KERN_OVERFLOWUID, "overflowuid" },
79 { KERN_OVERFLOWGID, "overflowgid" },
80
81 { KERN_HOTPLUG, "hotplug", },
82 { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
83
84 { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
85 { KERN_CORE_USES_PID, "core_uses_pid" },
86 { KERN_TAINTED, "tainted" },
87 { KERN_CADPID, "cad_pid" },
88 { KERN_PIDMAX, "pid_max" },
89 { KERN_CORE_PATTERN, "core_pattern" },
90 { KERN_PANIC_ON_OOPS, "panic_on_oops" },
91 { KERN_HPPA_PWRSW, "soft-power" },
92 { KERN_HPPA_UNALIGNED, "unaligned-trap" },
93
94 { KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
95 { KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
96
97 { KERN_PTY, "pty", trans_pty_table },
98 { KERN_NGROUPS_MAX, "ngroups_max" },
99 { KERN_SPARC_SCONS_PWROFF, "scons_poweroff" },
100 { KERN_HZ_TIMER, "hz_timer" },
101 { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
102 { KERN_BOOTLOADER_TYPE, "bootloader_type" },
103 { KERN_RANDOMIZE, "randomize_va_space" },
104
105 { KERN_SPIN_RETRY, "spin_retry" },
106 { KERN_ACPI_VIDEO_FLAGS, "acpi_video_flags" },
107 { KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
108 { KERN_COMPAT_LOG, "compat-log" },
109 { KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
110 { KERN_NMI_WATCHDOG, "nmi_watchdog" },
111 { KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
112 {}
113};
114
115static struct trans_ctl_table trans_vm_table[] = {
116 { VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
117 { VM_PAGE_CLUSTER, "page-cluster" },
118 { VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
119 { VM_DIRTY_RATIO, "dirty_ratio" },
120 { VM_DIRTY_WB_CS, "dirty_writeback_centisecs" },
121 { VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" },
122 { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
123 { VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
124 /* VM_PAGEBUF unused */
125 { VM_HUGETLB_PAGES, "nr_hugepages" },
126 { VM_SWAPPINESS, "swappiness" },
127 { VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
128 { VM_MIN_FREE_KBYTES, "min_free_kbytes" },
129 { VM_MAX_MAP_COUNT, "max_map_count" },
130 { VM_LAPTOP_MODE, "laptop_mode" },
131 { VM_BLOCK_DUMP, "block_dump" },
132 { VM_HUGETLB_GROUP, "hugetlb_shm_group" },
133 { VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
134 { VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
135 /* VM_SWAP_TOKEN_TIMEOUT unused */
136 { VM_DROP_PAGECACHE, "drop_caches" },
137 { VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
138 { VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
139 { VM_MIN_UNMAPPED, "min_unmapped_ratio" },
140 { VM_PANIC_ON_OOM, "panic_on_oom" },
141 { VM_VDSO_ENABLED, "vdso_enabled" },
142 { VM_MIN_SLAB, "min_slab_ratio" },
143 { VM_CMM_PAGES, "cmm_pages" },
144 { VM_CMM_TIMED_PAGES, "cmm_timed_pages" },
145 { VM_CMM_TIMEOUT, "cmm_timeout" },
146
147 {}
148};
149
150static struct trans_ctl_table trans_net_core_table[] = {
151 { NET_CORE_WMEM_MAX, "wmem_max" },
152 { NET_CORE_RMEM_MAX, "rmem_max" },
153 { NET_CORE_WMEM_DEFAULT, "wmem_default" },
154 { NET_CORE_RMEM_DEFAULT, "rmem_default" },
155 /* NET_CORE_DESTROY_DELAY unused */
156 { NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
157 /* NET_CORE_FASTROUTE unused */
158 { NET_CORE_MSG_COST, "message_cost" },
159 { NET_CORE_MSG_BURST, "message_burst" },
160 { NET_CORE_OPTMEM_MAX, "optmem_max" },
161 /* NET_CORE_HOT_LIST_LENGTH unused */
162 /* NET_CORE_DIVERT_VERSION unused */
163 /* NET_CORE_NO_CONG_THRESH unused */
164 /* NET_CORE_NO_CONG unused */
165 /* NET_CORE_LO_CONG unused */
166 /* NET_CORE_MOD_CONG unused */
167 { NET_CORE_DEV_WEIGHT, "dev_weight" },
168 { NET_CORE_SOMAXCONN, "somaxconn" },
169 { NET_CORE_BUDGET, "netdev_budget" },
170 { NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
171 { NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
172 { NET_CORE_WARNINGS, "warnings" },
173 {},
174};
175
176static struct trans_ctl_table trans_net_unix_table[] = {
177 /* NET_UNIX_DESTROY_DELAY unused */
178 /* NET_UNIX_DELETE_DELAY unused */
179 { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
180 {}
181};
182
183static struct trans_ctl_table trans_net_ipv4_route_table[] = {
184 { NET_IPV4_ROUTE_FLUSH, "flush" },
185 { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" },
186 { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" },
187 { NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
188 { NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
189 { NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
190 { NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
191 { NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
192 { NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
193 { NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
194 { NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
195 { NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
196 { NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
197 { NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
198 { NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
199 { NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
200 { NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
201 { NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
202 { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
203 {}
204};
205
206static struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
207 { NET_IPV4_CONF_FORWARDING, "forwarding" },
208 { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
209
210 { NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
211 { NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
212 { NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
213 { NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
214 { NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
215 { NET_IPV4_CONF_RP_FILTER, "rp_filter" },
216 { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
217 { NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
218 { NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
219 { NET_IPV4_CONF_TAG, "tag" },
220 { NET_IPV4_CONF_ARPFILTER, "arp_filter" },
221 { NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
222 { NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
223 { NET_IPV4_CONF_NOPOLICY, "disable_policy" },
224 { NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
225
226 { NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
227 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
228 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
229 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
230 {}
231};
232
233static struct trans_ctl_table trans_net_ipv4_conf_table[] = {
234 { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table },
235 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table },
236 { 0, NULL, trans_net_ipv4_conf_vars_table },
237 {}
238};
239
240
241static struct trans_ctl_table trans_net_ipv4_vs_table[] = {
242 { NET_IPV4_VS_AMEMTHRESH, "amemthresh" },
243 { NET_IPV4_VS_DEBUG_LEVEL, "debug_level" },
244 { NET_IPV4_VS_AMDROPRATE, "am_droprate" },
245 { NET_IPV4_VS_DROP_ENTRY, "drop_entry" },
246 { NET_IPV4_VS_DROP_PACKET, "drop_packet" },
247 { NET_IPV4_VS_SECURE_TCP, "secure_tcp" },
248 { NET_IPV4_VS_TO_ES, "timeout_established" },
249 { NET_IPV4_VS_TO_SS, "timeout_synsent" },
250 { NET_IPV4_VS_TO_SR, "timeout_synrecv" },
251 { NET_IPV4_VS_TO_FW, "timeout_finwait" },
252 { NET_IPV4_VS_TO_TW, "timeout_timewait" },
253 { NET_IPV4_VS_TO_CL, "timeout_close" },
254 { NET_IPV4_VS_TO_CW, "timeout_closewait" },
255 { NET_IPV4_VS_TO_LA, "timeout_lastack" },
256 { NET_IPV4_VS_TO_LI, "timeout_listen" },
257 { NET_IPV4_VS_TO_SA, "timeout_synack" },
258 { NET_IPV4_VS_TO_UDP, "timeout_udp" },
259 { NET_IPV4_VS_TO_ICMP, "timeout_icmp" },
260 { NET_IPV4_VS_CACHE_BYPASS, "cache_bypass" },
261 { NET_IPV4_VS_EXPIRE_NODEST_CONN, "expire_nodest_conn" },
262 { NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE, "expire_quiescent_template" },
263 { NET_IPV4_VS_SYNC_THRESHOLD, "sync_threshold" },
264 { NET_IPV4_VS_NAT_ICMP_SEND, "nat_icmp_send" },
265 { NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration" },
266 { NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration" },
267 {}
268};
269
270static struct trans_ctl_table trans_net_neigh_vars_table[] = {
271 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
272 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
273 { NET_NEIGH_APP_SOLICIT, "app_solicit" },
274 { NET_NEIGH_RETRANS_TIME, "retrans_time" },
275 { NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
276 { NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
277 { NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
278 { NET_NEIGH_UNRES_QLEN, "unres_qlen" },
279 { NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
280 { NET_NEIGH_ANYCAST_DELAY, "anycast_delay" },
281 { NET_NEIGH_PROXY_DELAY, "proxy_delay" },
282 { NET_NEIGH_LOCKTIME, "locktime" },
283 { NET_NEIGH_GC_INTERVAL, "gc_interval" },
284 { NET_NEIGH_GC_THRESH1, "gc_thresh1" },
285 { NET_NEIGH_GC_THRESH2, "gc_thresh2" },
286 { NET_NEIGH_GC_THRESH3, "gc_thresh3" },
287 { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
288 { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
289 {}
290};
291
292static struct trans_ctl_table trans_net_neigh_table[] = {
293 { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
294 { 0, NULL, trans_net_neigh_vars_table },
295 {}
296};
297
298static struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
299 { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
300
301 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" },
302 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "ip_conntrack_tcp_timeout_syn_recv" },
303 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "ip_conntrack_tcp_timeout_established" },
304 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "ip_conntrack_tcp_timeout_fin_wait" },
305 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "ip_conntrack_tcp_timeout_close_wait" },
306 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "ip_conntrack_tcp_timeout_last_ack" },
307 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "ip_conntrack_tcp_timeout_time_wait" },
308 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "ip_conntrack_tcp_timeout_close" },
309
310 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, "ip_conntrack_udp_timeout" },
311 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "ip_conntrack_udp_timeout_stream" },
312 { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, "ip_conntrack_icmp_timeout" },
313 { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, "ip_conntrack_generic_timeout" },
314
315 { NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
316 { NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
317 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "ip_conntrack_tcp_timeout_max_retrans" },
318 { NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
319 { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
320 { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
321
322 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "ip_conntrack_sctp_timeout_closed" },
323 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "ip_conntrack_sctp_timeout_cookie_wait" },
324 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "ip_conntrack_sctp_timeout_cookie_echoed" },
325 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "ip_conntrack_sctp_timeout_established" },
326 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "ip_conntrack_sctp_timeout_shutdown_sent" },
327 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "ip_conntrack_sctp_timeout_shutdown_recd" },
328 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" },
329
330 { NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
331 { NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
332 {}
333};
334
335static struct trans_ctl_table trans_net_ipv4_table[] = {
336 { NET_IPV4_FORWARD, "ip_forward" },
337 { NET_IPV4_DYNADDR, "ip_dynaddr" },
338
339 { NET_IPV4_CONF, "conf", trans_net_ipv4_conf_table },
340 { NET_IPV4_NEIGH, "neigh", trans_net_neigh_table },
341 { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table },
342 /* NET_IPV4_FIB_HASH unused */
343 { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table },
344 { NET_IPV4_VS, "vs", trans_net_ipv4_vs_table },
345
346 { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
347 { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
348 { NET_IPV4_TCP_SACK, "tcp_sack" },
349 { NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
350 { NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
351 /* NET_IPV4_AUTOCONFIG unused */
352 { NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
353 { NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
354 { NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
355 { NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
356 { NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
357 /* NET_IPV4_TCP_MAX_KA_PROBES unused */
358 { NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
359 { NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
360 { NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
361 { NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
362 { NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
363 /* NET_IPV4_IP_MASQ_DEBUG unused */
364 { NET_TCP_SYNCOOKIES, "tcp_syncookies" },
365 { NET_TCP_STDURG, "tcp_stdurg" },
366 { NET_TCP_RFC1337, "tcp_rfc1337" },
367 /* NET_TCP_SYN_TAILDROP unused */
368 { NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
369 { NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
370 { NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
371 { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
372 /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
373 /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
374 /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
375 /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
376 /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
377 { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
378 { NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
379 { NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
380 /* NET_IPV4_ALWAYS_DEFRAG unused */
381 { NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
382 { NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
383 { NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
384 { NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
385 { NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
386 { NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
387 { NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
388 { NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
389 { NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
390 { NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
391 { NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
392 { NET_TCP_FACK, "tcp_fack" },
393 { NET_TCP_REORDERING, "tcp_reordering" },
394 { NET_TCP_ECN, "tcp_ecn" },
395 { NET_TCP_DSACK, "tcp_dsack" },
396 { NET_TCP_MEM, "tcp_mem" },
397 { NET_TCP_WMEM, "tcp_wmem" },
398 { NET_TCP_RMEM, "tcp_rmem" },
399 { NET_TCP_APP_WIN, "tcp_app_win" },
400 { NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
401 { NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
402 { NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
403 { NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
404 { NET_TCP_TW_REUSE, "tcp_tw_reuse" },
405 { NET_TCP_FRTO, "tcp_frto" },
406 { NET_TCP_LOW_LATENCY, "tcp_low_latency" },
407 { NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
408 { NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
409 { NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
410 /* NET_TCP_DEFAULT_WIN_SCALE unused */
411 { NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
412 { NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
413 /* NET_TCP_BIC_BETA unused */
414 { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
415 { NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
416 { NET_TCP_ABC, "tcp_abc" },
417 { NET_IPV4_IPFRAG_MAX_DIST, "ipfrag_max_dist" },
418 { NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
419 { NET_TCP_BASE_MSS, "tcp_base_mss" },
420 { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
421 { NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
422 { NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
423 { NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
424 { NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
425 { NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
426 { NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
427 { NET_TCP_AVAIL_CONG_CONTROL, "tcp_available_congestion_control" },
428 { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
429 { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
430 { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
431 { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
432 {}
433};
434
435static struct trans_ctl_table trans_net_ipx_table[] = {
436 { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
437 /* NET_IPX_FORWARDING unused */
438 {}
439};
440
441static struct trans_ctl_table trans_net_atalk_table[] = {
442 { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
443 { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
444 { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
445 { NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
446 {},
447};
448
449static struct trans_ctl_table trans_net_netrom_table[] = {
450 { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
451 { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
452 { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
453 { NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
454 { NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
455 { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
456 { NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
457 { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
458 { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
459 { NET_NETROM_ROUTING_CONTROL, "routing_control" },
460 { NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
461 { NET_NETROM_RESET, "reset" },
462 {}
463};
464
465static struct trans_ctl_table trans_net_ax25_table[] = {
466 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
467 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
468 { NET_AX25_BACKOFF_TYPE, "backoff_type" },
469 { NET_AX25_CONNECT_MODE, "connect_mode" },
470 { NET_AX25_STANDARD_WINDOW, "standard_window_size" },
471 { NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
472 { NET_AX25_T1_TIMEOUT, "t1_timeout" },
473 { NET_AX25_T2_TIMEOUT, "t2_timeout" },
474 { NET_AX25_T3_TIMEOUT, "t3_timeout" },
475 { NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
476 { NET_AX25_N2, "maximum_retry_count" },
477 { NET_AX25_PACLEN, "maximum_packet_length" },
478 { NET_AX25_PROTOCOL, "protocol" },
479 { NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
480 {}
481};
482
483static struct trans_ctl_table trans_net_bridge_table[] = {
484 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
485 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
486 { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" },
487 { NET_BRIDGE_NF_FILTER_VLAN_TAGGED, "bridge-nf-filter-vlan-tagged" },
488 { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, "bridge-nf-filter-pppoe-tagged" },
489 {}
490};
491
492static struct trans_ctl_table trans_net_rose_table[] = {
493 { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
494 { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
495 { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
496 { NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
497 { NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
498 { NET_ROSE_ROUTING_CONTROL, "routing_control" },
499 { NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
500 { NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
501 { NET_ROSE_WINDOW_SIZE, "window_size" },
502 { NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
503 {}
504};
505
506static struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
507 { NET_IPV6_FORWARDING, "forwarding" },
508 { NET_IPV6_HOP_LIMIT, "hop_limit" },
509 { NET_IPV6_MTU, "mtu" },
510 { NET_IPV6_ACCEPT_RA, "accept_ra" },
511 { NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
512 { NET_IPV6_AUTOCONF, "autoconf" },
513 { NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
514 { NET_IPV6_RTR_SOLICITS, "router_solicitations" },
515 { NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
516 { NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
517 { NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
518 { NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
519 { NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
520 { NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
521 { NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
522 { NET_IPV6_MAX_ADDRESSES, "max_addresses" },
523 { NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
524 { NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
525 { NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
526 { NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
527 { NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
528 { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
529 { NET_IPV6_PROXY_NDP, "proxy_ndp" },
530 { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
531 {}
532};
533
534static struct trans_ctl_table trans_net_ipv6_conf_table[] = {
535 { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table },
536 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table },
537 { 0, NULL, trans_net_ipv6_conf_var_table },
538 {}
539};
540
541static struct trans_ctl_table trans_net_ipv6_route_table[] = {
542 { NET_IPV6_ROUTE_FLUSH, "flush" },
543 { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
544 { NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
545 { NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
546 { NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
547 { NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
548 { NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
549 { NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
550 { NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
551 { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
552 {}
553};
554
555static struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
556 { NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
557 {}
558};
559
560static struct trans_ctl_table trans_net_ipv6_table[] = {
561 { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table },
562 { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table },
563 { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table },
564 { NET_IPV6_ICMP, "icmp", trans_net_ipv6_icmp_table },
565 { NET_IPV6_BINDV6ONLY, "bindv6only" },
566 { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
567 { NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
568 { NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
569 { NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
570 { NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
571 { 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
572 {}
573};
574
575static struct trans_ctl_table trans_net_x25_table[] = {
576 { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
577 { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
578 { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
579 { NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
580 { NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
581 { NET_X25_FORWARD, "x25_forward" },
582 {}
583};
584
585static struct trans_ctl_table trans_net_tr_table[] = {
586 { NET_TR_RIF_TIMEOUT, "rif_timeout" },
587 {}
588};
589
590
591static struct trans_ctl_table trans_net_decnet_conf_vars[] = {
592 { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
593 { NET_DECNET_CONF_DEV_PRIORITY, "priority" },
594 { NET_DECNET_CONF_DEV_T2, "t2" },
595 { NET_DECNET_CONF_DEV_T3, "t3" },
596 {}
597};
598
599static struct trans_ctl_table trans_net_decnet_conf[] = {
600 { 0, NULL, trans_net_decnet_conf_vars },
601 {}
602};
603
604static struct trans_ctl_table trans_net_decnet_table[] = {
605 { NET_DECNET_CONF, "conf", trans_net_decnet_conf },
606 { NET_DECNET_NODE_ADDRESS, "node_address" },
607 { NET_DECNET_NODE_NAME, "node_name" },
608 { NET_DECNET_DEFAULT_DEVICE, "default_device" },
609 { NET_DECNET_TIME_WAIT, "time_wait" },
610 { NET_DECNET_DN_COUNT, "dn_count" },
611 { NET_DECNET_DI_COUNT, "di_count" },
612 { NET_DECNET_DR_COUNT, "dr_count" },
613 { NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
614 { NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
615 { NET_DECNET_MEM, "decnet_mem" },
616 { NET_DECNET_RMEM, "decnet_rmem" },
617 { NET_DECNET_WMEM, "decnet_wmem" },
618 { NET_DECNET_DEBUG_LEVEL, "debug" },
619 {}
620};
621
622static struct trans_ctl_table trans_net_sctp_table[] = {
623 { NET_SCTP_RTO_INITIAL, "rto_initial" },
624 { NET_SCTP_RTO_MIN, "rto_min" },
625 { NET_SCTP_RTO_MAX, "rto_max" },
626 { NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
627 { NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
628 { NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
629 { NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
630 { NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
631 { NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
632 { NET_SCTP_HB_INTERVAL, "hb_interval" },
633 { NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
634 { NET_SCTP_MAX_BURST, "max_burst" },
635 { NET_SCTP_ADDIP_ENABLE, "addip_enable" },
636 { NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
637 { NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
638 { NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
639 { NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
640 {}
641};
642
643static struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
644 { NET_LLC2_ACK_TIMEOUT, "ack" },
645 { NET_LLC2_P_TIMEOUT, "p" },
646 { NET_LLC2_REJ_TIMEOUT, "rej" },
647 { NET_LLC2_BUSY_TIMEOUT, "busy" },
648 {}
649};
650
651static struct trans_ctl_table trans_net_llc_station_table[] = {
652 { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
653 {}
654};
655
656static struct trans_ctl_table trans_net_llc_llc2_table[] = {
657 { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table },
658 {}
659};
660
661static struct trans_ctl_table trans_net_llc_table[] = {
662 { NET_LLC2, "llc2", trans_net_llc_llc2_table },
663 { NET_LLC_STATION, "station", trans_net_llc_station_table },
664 {}
665};
666
667static struct trans_ctl_table trans_net_netfilter_table[] = {
668 { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
669 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" },
670 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" },
671 { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "nf_conntrack_tcp_timeout_established" },
672 { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "nf_conntrack_tcp_timeout_fin_wait" },
673 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "nf_conntrack_tcp_timeout_close_wait" },
674 { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "nf_conntrack_tcp_timeout_last_ack" },
675 { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "nf_conntrack_tcp_timeout_time_wait" },
676 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "nf_conntrack_tcp_timeout_close" },
677 { NET_NF_CONNTRACK_UDP_TIMEOUT, "nf_conntrack_udp_timeout" },
678 { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "nf_conntrack_udp_timeout_stream" },
679 { NET_NF_CONNTRACK_ICMP_TIMEOUT, "nf_conntrack_icmp_timeout" },
680 { NET_NF_CONNTRACK_GENERIC_TIMEOUT, "nf_conntrack_generic_timeout" },
681 { NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
682 { NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
683 { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "nf_conntrack_tcp_timeout_max_retrans" },
684 { NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
685 { NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
686 { NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
687 { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "nf_conntrack_sctp_timeout_closed" },
688 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "nf_conntrack_sctp_timeout_cookie_wait" },
689 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "nf_conntrack_sctp_timeout_cookie_echoed" },
690 { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "nf_conntrack_sctp_timeout_established" },
691 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "nf_conntrack_sctp_timeout_shutdown_sent" },
692 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "nf_conntrack_sctp_timeout_shutdown_recd" },
693 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "nf_conntrack_sctp_timeout_shutdown_ack_sent" },
694 { NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
695 { NET_NF_CONNTRACK_ICMPV6_TIMEOUT, "nf_conntrack_icmpv6_timeout" },
696 { NET_NF_CONNTRACK_FRAG6_TIMEOUT, "nf_conntrack_frag6_timeout" },
697 { NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
698 { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
699 { NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
700
701 {}
702};
703
704static struct trans_ctl_table trans_net_dccp_table[] = {
705 { NET_DCCP_DEFAULT, "default" },
706 {}
707};
708
709static struct trans_ctl_table trans_net_irda_table[] = {
710 { NET_IRDA_DISCOVERY, "discovery" },
711 { NET_IRDA_DEVNAME, "devname" },
712 { NET_IRDA_DEBUG, "debug" },
713 { NET_IRDA_FAST_POLL, "fast_poll_increase" },
714 { NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
715 { NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
716 { NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
717 { NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
718 { NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
719 { NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
720 { NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
721 { NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
722 { NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
723 { NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
724 {}
725};
726
727static struct trans_ctl_table trans_net_table[] = {
728 { NET_CORE, "core", trans_net_core_table },
729 /* NET_ETHER not used */
730 /* NET_802 not used */
731 { NET_UNIX, "unix", trans_net_unix_table },
732 { NET_IPV4, "ipv4", trans_net_ipv4_table },
733 { NET_IPX, "ipx", trans_net_ipx_table },
734 { NET_ATALK, "atalk", trans_net_atalk_table },
735 { NET_NETROM, "netrom", trans_net_netrom_table },
736 { NET_AX25, "ax25", trans_net_ax25_table },
737 { NET_BRIDGE, "bridge", trans_net_bridge_table },
738 { NET_ROSE, "rose", trans_net_rose_table },
739 { NET_IPV6, "ipv6", trans_net_ipv6_table },
740 { NET_X25, "x25", trans_net_x25_table },
741 { NET_TR, "tr", trans_net_tr_table },
742 { NET_DECNET, "decnet", trans_net_decnet_table },
743 /* NET_ECONET not used */
744 { NET_SCTP, "sctp", trans_net_sctp_table },
745 { NET_LLC, "llc", trans_net_llc_table },
746 { NET_NETFILTER, "netfilter", trans_net_netfilter_table },
747 { NET_DCCP, "dccp", trans_net_dccp_table },
748 { NET_IRDA, "irda", trans_net_irda_table },
749 { 2089, "nf_conntrack_max" },
750 {}
751};
752
753static struct trans_ctl_table trans_fs_quota_table[] = {
754 { FS_DQ_LOOKUPS, "lookups" },
755 { FS_DQ_DROPS, "drops" },
756 { FS_DQ_READS, "reads" },
757 { FS_DQ_WRITES, "writes" },
758 { FS_DQ_CACHE_HITS, "cache_hits" },
759 { FS_DQ_ALLOCATED, "allocated_dquots" },
760 { FS_DQ_FREE, "free_dquots" },
761 { FS_DQ_SYNCS, "syncs" },
762 { FS_DQ_WARNINGS, "warnings" },
763 {}
764};
765
766static struct trans_ctl_table trans_fs_xfs_table[] = {
767 { XFS_RESTRICT_CHOWN, "restrict_chown" },
768 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
769 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
770 { XFS_PANIC_MASK, "panic_mask" },
771
772 { XFS_ERRLEVEL, "error_level" },
773 { XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
774 { XFS_INHERIT_SYNC, "inherit_sync" },
775 { XFS_INHERIT_NODUMP, "inherit_nodump" },
776 { XFS_INHERIT_NOATIME, "inherit_noatime" },
777 { XFS_BUF_TIMER, "xfsbufd_centisecs" },
778 { XFS_BUF_AGE, "age_buffer_centisecs" },
779 { XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
780 { XFS_ROTORSTEP, "rotorstep" },
781 { XFS_INHERIT_NODFRG, "inherit_nodefrag" },
782 { XFS_FILESTREAM_TIMER, "filestream_centisecs" },
783 { XFS_STATS_CLEAR, "stats_clear" },
784 {}
785};
786
787static struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
788 { 1, "hb_ctl_path" },
789 {}
790};
791
792static struct trans_ctl_table trans_fs_ocfs2_table[] = {
793 { 1, "nm", trans_fs_ocfs2_nm_table },
794 {}
795};
796
797static struct trans_ctl_table trans_inotify_table[] = {
798 { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
799 { INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
800 { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
801 {}
802};
803
804static struct trans_ctl_table trans_fs_table[] = {
805 { FS_NRINODE, "inode-nr" },
806 { FS_STATINODE, "inode-state" },
807 /* FS_MAXINODE unused */
808 /* FS_NRDQUOT unused */
809 /* FS_MAXDQUOT unused */
810 { FS_NRFILE, "file-nr" },
811 { FS_MAXFILE, "file-max" },
812 { FS_DENTRY, "dentry-state" },
813 /* FS_NRSUPER unused */
814 /* FS_MAXUPSER unused */
815 { FS_OVERFLOWUID, "overflowuid" },
816 { FS_OVERFLOWGID, "overflowgid" },
817 { FS_LEASES, "leases-enable" },
818 { FS_DIR_NOTIFY, "dir-notify-enable" },
819 { FS_LEASE_TIME, "lease-break-time" },
820 { FS_DQSTATS, "quota", trans_fs_quota_table },
821 { FS_XFS, "xfs", trans_fs_xfs_table },
822 { FS_AIO_NR, "aio-nr" },
823 { FS_AIO_MAX_NR, "aio-max-nr" },
824 { FS_INOTIFY, "inotify", trans_inotify_table },
825 { FS_OCFS2, "ocfs2", trans_fs_ocfs2_table },
826 { KERN_SETUID_DUMPABLE, "suid_dumpable" },
827 {}
828};
829
830static struct trans_ctl_table trans_debug_table[] = {
831 {}
832};
833
834static struct trans_ctl_table trans_cdrom_table[] = {
835 { DEV_CDROM_INFO, "info" },
836 { DEV_CDROM_AUTOCLOSE, "autoclose" },
837 { DEV_CDROM_AUTOEJECT, "autoeject" },
838 { DEV_CDROM_DEBUG, "debug" },
839 { DEV_CDROM_LOCK, "lock" },
840 { DEV_CDROM_CHECK_MEDIA, "check_media" },
841 {}
842};
843
844static struct trans_ctl_table trans_ipmi_table[] = {
845 { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
846 {}
847};
848
849static struct trans_ctl_table trans_mac_hid_files[] = {
850 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
851 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
852 { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
853 { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
854 { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
855 /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
856 {}
857};
858
859static struct trans_ctl_table trans_raid_table[] = {
860 { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
861 { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
862 {}
863};
864
865static struct trans_ctl_table trans_scsi_table[] = {
866 { DEV_SCSI_LOGGING_LEVEL, "logging_level" },
867 {}
868};
869
870static struct trans_ctl_table trans_parport_default_table[] = {
871 { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" },
872 { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" },
873 {}
874};
875
876static struct trans_ctl_table trans_parport_device_table[] = {
877 { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" },
878 {}
879};
880
881static struct trans_ctl_table trans_parport_devices_table[] = {
882 { DEV_PARPORT_DEVICES_ACTIVE, "active" },
883 { 0, NULL, trans_parport_device_table },
884 {}
885};
886
887static struct trans_ctl_table trans_parport_parport_table[] = {
888 { DEV_PARPORT_SPINTIME, "spintime" },
889 { DEV_PARPORT_BASE_ADDR, "base-addr" },
890 { DEV_PARPORT_IRQ, "irq" },
891 { DEV_PARPORT_DMA, "dma" },
892 { DEV_PARPORT_MODES, "modes" },
893 { DEV_PARPORT_DEVICES, "devices", trans_parport_devices_table },
894 { DEV_PARPORT_AUTOPROBE, "autoprobe" },
895 { DEV_PARPORT_AUTOPROBE + 1, "autoprobe0" },
896 { DEV_PARPORT_AUTOPROBE + 2, "autoprobe1" },
897 { DEV_PARPORT_AUTOPROBE + 3, "autoprobe2" },
898 { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" },
899 {}
900};
901static struct trans_ctl_table trans_parport_table[] = {
902 { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table },
903 { 0, NULL, trans_parport_parport_table },
904 {}
905};
906
907static struct trans_ctl_table trans_dev_table[] = {
908 { DEV_CDROM, "cdrom", trans_cdrom_table },
909 /* DEV_HWMON unused */
910 { DEV_PARPORT, "parport", trans_parport_table },
911 { DEV_RAID, "raid", trans_raid_table },
912 { DEV_MAC_HID, "mac_hid", trans_mac_hid_files },
913 { DEV_SCSI, "scsi", trans_scsi_table },
914 { DEV_IPMI, "ipmi", trans_ipmi_table },
915 {}
916};
917
918static struct trans_ctl_table trans_bus_isa_table[] = {
919 { BUS_ISA_MEM_BASE, "membase" },
920 { BUS_ISA_PORT_BASE, "portbase" },
921 { BUS_ISA_PORT_SHIFT, "portshift" },
922 {}
923};
924
925static struct trans_ctl_table trans_bus_table[] = {
926 { CTL_BUS_ISA, "isa", trans_bus_isa_table },
927 {}
928};
929
930static struct trans_ctl_table trans_arlan_conf_table0[] = {
931 { 1, "spreadingCode" },
932 { 2, "channelNumber" },
933 { 3, "scramblingDisable" },
934 { 4, "txAttenuation" },
935 { 5, "systemId" },
936 { 6, "maxDatagramSize" },
937 { 7, "maxFrameSize" },
938 { 8, "maxRetries" },
939 { 9, "receiveMode" },
940 { 10, "priority" },
941 { 11, "rootOrRepeater" },
942 { 12, "SID" },
943 { 13, "registrationMode" },
944 { 14, "registrationFill" },
945 { 15, "localTalkAddress" },
946 { 16, "codeFormat" },
947 { 17, "numChannels" },
948 { 18, "channel1" },
949 { 19, "channel2" },
950 { 20, "channel3" },
951 { 21, "channel4" },
952 { 22, "txClear" },
953 { 23, "txRetries" },
954 { 24, "txRouting" },
955 { 25, "txScrambled" },
956 { 26, "rxParameter" },
957 { 27, "txTimeoutMs" },
958 { 28, "waitCardTimeout" },
959 { 29, "channelSet" },
960 { 30, "name" },
961 { 31, "waitTime" },
962 { 32, "lParameter" },
963 { 33, "_15" },
964 { 34, "headerSize" },
965 { 36, "tx_delay_ms" },
966 { 37, "retries" },
967 { 38, "ReTransmitPacketMaxSize" },
968 { 39, "waitReTransmitPacketMaxSize" },
969 { 40, "fastReTransCount" },
970 { 41, "driverRetransmissions" },
971 { 42, "txAckTimeoutMs" },
972 { 43, "registrationInterrupts" },
973 { 44, "hardwareType" },
974 { 45, "radioType" },
975 { 46, "writeEEPROM" },
976 { 47, "writeRadioType" },
977 { 48, "entry_exit_debug" },
978 { 49, "debug" },
979 { 50, "in_speed" },
980 { 51, "out_speed" },
981 { 52, "in_speed10" },
982 { 53, "out_speed10" },
983 { 54, "in_speed_max" },
984 { 55, "out_speed_max" },
985 { 56, "measure_rate" },
986 { 57, "pre_Command_Wait" },
987 { 58, "rx_tweak1" },
988 { 59, "rx_tweak2" },
989 { 60, "tx_queue_len" },
990
991 { 150, "arlan0-txRing" },
992 { 151, "arlan0-rxRing" },
993 { 152, "arlan0-18" },
994 { 153, "arlan0-ring" },
995 { 154, "arlan0-shm-cpy" },
996 { 155, "config0" },
997 { 156, "reset0" },
998 {}
999};
1000
1001static struct trans_ctl_table trans_arlan_conf_table1[] = {
1002 { 1, "spreadingCode" },
1003 { 2, "channelNumber" },
1004 { 3, "scramblingDisable" },
1005 { 4, "txAttenuation" },
1006 { 5, "systemId" },
1007 { 6, "maxDatagramSize" },
1008 { 7, "maxFrameSize" },
1009 { 8, "maxRetries" },
1010 { 9, "receiveMode" },
1011 { 10, "priority" },
1012 { 11, "rootOrRepeater" },
1013 { 12, "SID" },
1014 { 13, "registrationMode" },
1015 { 14, "registrationFill" },
1016 { 15, "localTalkAddress" },
1017 { 16, "codeFormat" },
1018 { 17, "numChannels" },
1019 { 18, "channel1" },
1020 { 19, "channel2" },
1021 { 20, "channel3" },
1022 { 21, "channel4" },
1023 { 22, "txClear" },
1024 { 23, "txRetries" },
1025 { 24, "txRouting" },
1026 { 25, "txScrambled" },
1027 { 26, "rxParameter" },
1028 { 27, "txTimeoutMs" },
1029 { 28, "waitCardTimeout" },
1030 { 29, "channelSet" },
1031 { 30, "name" },
1032 { 31, "waitTime" },
1033 { 32, "lParameter" },
1034 { 33, "_15" },
1035 { 34, "headerSize" },
1036 { 36, "tx_delay_ms" },
1037 { 37, "retries" },
1038 { 38, "ReTransmitPacketMaxSize" },
1039 { 39, "waitReTransmitPacketMaxSize" },
1040 { 40, "fastReTransCount" },
1041 { 41, "driverRetransmissions" },
1042 { 42, "txAckTimeoutMs" },
1043 { 43, "registrationInterrupts" },
1044 { 44, "hardwareType" },
1045 { 45, "radioType" },
1046 { 46, "writeEEPROM" },
1047 { 47, "writeRadioType" },
1048 { 48, "entry_exit_debug" },
1049 { 49, "debug" },
1050 { 50, "in_speed" },
1051 { 51, "out_speed" },
1052 { 52, "in_speed10" },
1053 { 53, "out_speed10" },
1054 { 54, "in_speed_max" },
1055 { 55, "out_speed_max" },
1056 { 56, "measure_rate" },
1057 { 57, "pre_Command_Wait" },
1058 { 58, "rx_tweak1" },
1059 { 59, "rx_tweak2" },
1060 { 60, "tx_queue_len" },
1061
1062 { 150, "arlan1-txRing" },
1063 { 151, "arlan1-rxRing" },
1064 { 152, "arlan1-18" },
1065 { 153, "arlan1-ring" },
1066 { 154, "arlan1-shm-cpy" },
1067 { 155, "config1" },
1068 { 156, "reset1" },
1069 {}
1070};
1071
1072static struct trans_ctl_table trans_arlan_conf_table2[] = {
1073 { 1, "spreadingCode" },
1074 { 2, "channelNumber" },
1075 { 3, "scramblingDisable" },
1076 { 4, "txAttenuation" },
1077 { 5, "systemId" },
1078 { 6, "maxDatagramSize" },
1079 { 7, "maxFrameSize" },
1080 { 8, "maxRetries" },
1081 { 9, "receiveMode" },
1082 { 10, "priority" },
1083 { 11, "rootOrRepeater" },
1084 { 12, "SID" },
1085 { 13, "registrationMode" },
1086 { 14, "registrationFill" },
1087 { 15, "localTalkAddress" },
1088 { 16, "codeFormat" },
1089 { 17, "numChannels" },
1090 { 18, "channel1" },
1091 { 19, "channel2" },
1092 { 20, "channel3" },
1093 { 21, "channel4" },
1094 { 22, "txClear" },
1095 { 23, "txRetries" },
1096 { 24, "txRouting" },
1097 { 25, "txScrambled" },
1098 { 26, "rxParameter" },
1099 { 27, "txTimeoutMs" },
1100 { 28, "waitCardTimeout" },
1101 { 29, "channelSet" },
1102 { 30, "name" },
1103 { 31, "waitTime" },
1104 { 32, "lParameter" },
1105 { 33, "_15" },
1106 { 34, "headerSize" },
1107 { 36, "tx_delay_ms" },
1108 { 37, "retries" },
1109 { 38, "ReTransmitPacketMaxSize" },
1110 { 39, "waitReTransmitPacketMaxSize" },
1111 { 40, "fastReTransCount" },
1112 { 41, "driverRetransmissions" },
1113 { 42, "txAckTimeoutMs" },
1114 { 43, "registrationInterrupts" },
1115 { 44, "hardwareType" },
1116 { 45, "radioType" },
1117 { 46, "writeEEPROM" },
1118 { 47, "writeRadioType" },
1119 { 48, "entry_exit_debug" },
1120 { 49, "debug" },
1121 { 50, "in_speed" },
1122 { 51, "out_speed" },
1123 { 52, "in_speed10" },
1124 { 53, "out_speed10" },
1125 { 54, "in_speed_max" },
1126 { 55, "out_speed_max" },
1127 { 56, "measure_rate" },
1128 { 57, "pre_Command_Wait" },
1129 { 58, "rx_tweak1" },
1130 { 59, "rx_tweak2" },
1131 { 60, "tx_queue_len" },
1132
1133 { 150, "arlan2-txRing" },
1134 { 151, "arlan2-rxRing" },
1135 { 152, "arlan2-18" },
1136 { 153, "arlan2-ring" },
1137 { 154, "arlan2-shm-cpy" },
1138 { 155, "config2" },
1139 { 156, "reset2" },
1140 {}
1141};
1142
1143static struct trans_ctl_table trans_arlan_conf_table3[] = {
1144 { 1, "spreadingCode" },
1145 { 2, "channelNumber" },
1146 { 3, "scramblingDisable" },
1147 { 4, "txAttenuation" },
1148 { 5, "systemId" },
1149 { 6, "maxDatagramSize" },
1150 { 7, "maxFrameSize" },
1151 { 8, "maxRetries" },
1152 { 9, "receiveMode" },
1153 { 10, "priority" },
1154 { 11, "rootOrRepeater" },
1155 { 12, "SID" },
1156 { 13, "registrationMode" },
1157 { 14, "registrationFill" },
1158 { 15, "localTalkAddress" },
1159 { 16, "codeFormat" },
1160 { 17, "numChannels" },
1161 { 18, "channel1" },
1162 { 19, "channel2" },
1163 { 20, "channel3" },
1164 { 21, "channel4" },
1165 { 22, "txClear" },
1166 { 23, "txRetries" },
1167 { 24, "txRouting" },
1168 { 25, "txScrambled" },
1169 { 26, "rxParameter" },
1170 { 27, "txTimeoutMs" },
1171 { 28, "waitCardTimeout" },
1172 { 29, "channelSet" },
1173 { 30, "name" },
1174 { 31, "waitTime" },
1175 { 32, "lParameter" },
1176 { 33, "_15" },
1177 { 34, "headerSize" },
1178 { 36, "tx_delay_ms" },
1179 { 37, "retries" },
1180 { 38, "ReTransmitPacketMaxSize" },
1181 { 39, "waitReTransmitPacketMaxSize" },
1182 { 40, "fastReTransCount" },
1183 { 41, "driverRetransmissions" },
1184 { 42, "txAckTimeoutMs" },
1185 { 43, "registrationInterrupts" },
1186 { 44, "hardwareType" },
1187 { 45, "radioType" },
1188 { 46, "writeEEPROM" },
1189 { 47, "writeRadioType" },
1190 { 48, "entry_exit_debug" },
1191 { 49, "debug" },
1192 { 50, "in_speed" },
1193 { 51, "out_speed" },
1194 { 52, "in_speed10" },
1195 { 53, "out_speed10" },
1196 { 54, "in_speed_max" },
1197 { 55, "out_speed_max" },
1198 { 56, "measure_rate" },
1199 { 57, "pre_Command_Wait" },
1200 { 58, "rx_tweak1" },
1201 { 59, "rx_tweak2" },
1202 { 60, "tx_queue_len" },
1203
1204 { 150, "arlan3-txRing" },
1205 { 151, "arlan3-rxRing" },
1206 { 152, "arlan3-18" },
1207 { 153, "arlan3-ring" },
1208 { 154, "arlan3-shm-cpy" },
1209 { 155, "config3" },
1210 { 156, "reset3" },
1211 {}
1212};
1213
1214static struct trans_ctl_table trans_arlan_table[] = {
1215 { 1, "arlan0", trans_arlan_conf_table0 },
1216 { 2, "arlan1", trans_arlan_conf_table1 },
1217 { 3, "arlan2", trans_arlan_conf_table2 },
1218 { 4, "arlan3", trans_arlan_conf_table3 },
1219 {}
1220};
1221
1222static struct trans_ctl_table trans_appldata_table[] = {
1223 { CTL_APPLDATA_TIMER, "timer" },
1224 { CTL_APPLDATA_INTERVAL, "interval" },
1225 { CTL_APPLDATA_OS, "os" },
1226 { CTL_APPLDATA_NET_SUM, "net_sum" },
1227 { CTL_APPLDATA_MEM, "mem" },
1228 {}
1229
1230};
1231
1232static struct trans_ctl_table trans_s390dbf_table[] = {
1233 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
1234 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
1235 {}
1236};
1237
1238static struct trans_ctl_table trans_sunrpc_table[] = {
1239 { CTL_RPCDEBUG, "rpc_debug" },
1240 { CTL_NFSDEBUG, "nfs_debug" },
1241 { CTL_NFSDDEBUG, "nfsd_debug" },
1242 { CTL_NLMDEBUG, "nlm_debug" },
1243 { CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
1244 { CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
1245 { CTL_MIN_RESVPORT, "min_resvport" },
1246 { CTL_MAX_RESVPORT, "max_resvport" },
1247 {}
1248};
1249
1250static struct trans_ctl_table trans_pm_table[] = {
1251 { 1 /* CTL_PM_SUSPEND */, "suspend" },
1252 { 2 /* CTL_PM_CMODE */, "cmode" },
1253 { 3 /* CTL_PM_P0 */, "p0" },
1254 { 4 /* CTL_PM_CM */, "cm" },
1255 {}
1256};
1257
1258static struct trans_ctl_table trans_frv_table[] = {
1259 { 1, "cache-mode" },
1260 { 2, "pin-cxnr" },
1261 {}
1262};
1263
1264static struct trans_ctl_table trans_root_table[] = {
1265 { CTL_KERN, "kernel", trans_kern_table },
1266 { CTL_VM, "vm", trans_vm_table },
1267 { CTL_NET, "net", trans_net_table },
1268 /* CTL_PROC not used */
1269 { CTL_FS, "fs", trans_fs_table },
1270 { CTL_DEBUG, "debug", trans_debug_table },
1271 { CTL_DEV, "dev", trans_dev_table },
1272 { CTL_BUS, "bus", trans_bus_table },
1273 { CTL_ABI, "abi" },
1274 /* CTL_CPU not used */
1275 { CTL_ARLAN, "arlan", trans_arlan_table },
1276 { CTL_APPLDATA, "appldata", trans_appldata_table },
1277 { CTL_S390DBF, "s390dbf", trans_s390dbf_table },
1278 { CTL_SUNRPC, "sunrpc", trans_sunrpc_table },
1279 { CTL_PM, "pm", trans_pm_table },
1280 { CTL_FRV, "frv", trans_frv_table },
1281 {}
1282};
1283
1284
1285
1286
1287static int sysctl_depth(struct ctl_table *table)
1288{
1289 struct ctl_table *tmp;
1290 int depth;
1291
1292 depth = 0;
1293 for (tmp = table; tmp->parent; tmp = tmp->parent)
1294 depth++;
1295
1296 return depth;
1297}
1298
1299static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
1300{
1301 int i;
1302
1303 for (i = 0; table && i < n; i++)
1304 table = table->parent;
1305
1306 return table;
1307}
1308
1309static struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
1310{
1311 struct ctl_table *test;
1312 struct trans_ctl_table *ref;
1313 int depth, cur_depth;
1314
1315 depth = sysctl_depth(table);
1316
1317 cur_depth = depth;
1318 ref = trans_root_table;
1319repeat:
1320 test = sysctl_parent(table, cur_depth);
1321 for (; ref->ctl_name || ref->procname || ref->child; ref++) {
1322 int match = 0;
1323
1324 if (cur_depth && !ref->child)
1325 continue;
1326
1327 if (test->procname && ref->procname &&
1328 (strcmp(test->procname, ref->procname) == 0))
1329 match++;
1330
1331 if (test->ctl_name && ref->ctl_name &&
1332 (test->ctl_name == ref->ctl_name))
1333 match++;
1334
1335 if (!ref->ctl_name && !ref->procname)
1336 match++;
1337
1338 if (match) {
1339 if (cur_depth != 0) {
1340 cur_depth--;
1341 ref = ref->child;
1342 goto repeat;
1343 }
1344 goto out;
1345 }
1346 }
1347 ref = NULL;
1348out:
1349 return ref;
1350}
1351
1352static void sysctl_print_path(struct ctl_table *table)
1353{
1354 struct ctl_table *tmp;
1355 int depth, i;
1356 depth = sysctl_depth(table);
1357 if (table->procname) {
1358 for (i = depth; i >= 0; i--) {
1359 tmp = sysctl_parent(table, i);
1360 printk("/%s", tmp->procname?tmp->procname:"");
1361 }
1362 }
1363 printk(" ");
1364 if (table->ctl_name) {
1365 for (i = depth; i >= 0; i--) {
1366 tmp = sysctl_parent(table, i);
1367 printk(".%d", tmp->ctl_name);
1368 }
1369 }
1370}
1371
1372static void sysctl_repair_table(struct ctl_table *table)
1373{
1374 /* Don't complain about the classic default
1375 * sysctl strategy routine. Maybe later we
1376 * can get the tables fixed and complain about
1377 * this.
1378 */
1379 if (table->ctl_name && table->procname &&
1380 (table->proc_handler == proc_dointvec) &&
1381 (!table->strategy)) {
1382 table->strategy = sysctl_data;
1383 }
1384}
1385
1386static struct ctl_table *sysctl_check_lookup(struct ctl_table *table)
1387{
1388 struct ctl_table_header *head;
1389 struct ctl_table *ref, *test;
1390 int depth, cur_depth;
1391
1392 depth = sysctl_depth(table);
1393
1394 for (head = sysctl_head_next(NULL); head;
1395 head = sysctl_head_next(head)) {
1396 cur_depth = depth;
1397 ref = head->ctl_table;
1398repeat:
1399 test = sysctl_parent(table, cur_depth);
1400 for (; ref->ctl_name || ref->procname; ref++) {
1401 int match = 0;
1402 if (cur_depth && !ref->child)
1403 continue;
1404
1405 if (test->procname && ref->procname &&
1406 (strcmp(test->procname, ref->procname) == 0))
1407 match++;
1408
1409 if (test->ctl_name && ref->ctl_name &&
1410 (test->ctl_name == ref->ctl_name))
1411 match++;
1412
1413 if (match) {
1414 if (cur_depth != 0) {
1415 cur_depth--;
1416 ref = ref->child;
1417 goto repeat;
1418 }
1419 goto out;
1420 }
1421 }
1422 }
1423 ref = NULL;
1424out:
1425 sysctl_head_finish(head);
1426 return ref;
1427}
1428
1429static void set_fail(const char **fail, struct ctl_table *table, const char *str)
1430{
1431 if (*fail) {
1432 printk(KERN_ERR "sysctl table check failed: ");
1433 sysctl_print_path(table);
1434 printk(" %s\n", *fail);
1435 }
1436 *fail = str;
1437}
1438
1439static int sysctl_check_dir(struct ctl_table *table)
1440{
1441 struct ctl_table *ref;
1442 int error;
1443
1444 error = 0;
1445 ref = sysctl_check_lookup(table);
1446 if (ref) {
1447 int match = 0;
1448 if ((!table->procname && !ref->procname) ||
1449 (table->procname && ref->procname &&
1450 (strcmp(table->procname, ref->procname) == 0)))
1451 match++;
1452
1453 if ((!table->ctl_name && !ref->ctl_name) ||
1454 (table->ctl_name && ref->ctl_name &&
1455 (table->ctl_name == ref->ctl_name)))
1456 match++;
1457
1458 if (match != 2) {
1459 printk(KERN_ERR "%s: failed: ", __func__);
1460 sysctl_print_path(table);
1461 printk(" ref: ");
1462 sysctl_print_path(ref);
1463 printk("\n");
1464 error = -EINVAL;
1465 }
1466 }
1467 return error;
1468}
1469
1470static void sysctl_check_leaf(struct ctl_table *table, const char **fail)
1471{
1472 struct ctl_table *ref;
1473
1474 ref = sysctl_check_lookup(table);
1475 if (ref && (ref != table))
1476 set_fail(fail, table, "Sysctl already exists");
1477}
1478
1479static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
1480{
1481 struct trans_ctl_table *ref;
1482
1483 ref = sysctl_binary_lookup(table);
1484 if (table->ctl_name && !ref)
1485 set_fail(fail, table, "Unknown sysctl binary path");
1486 if (ref) {
1487 if (ref->procname &&
1488 (!table->procname ||
1489 (strcmp(table->procname, ref->procname) != 0)))
1490 set_fail(fail, table, "procname does not match binary path procname");
1491
1492 if (ref->ctl_name && table->ctl_name &&
1493 (table->ctl_name != ref->ctl_name))
1494 set_fail(fail, table, "ctl_name does not match binary path ctl_name");
1495 }
1496}
1497
1498int sysctl_check_table(struct ctl_table *table)
1499{
1500 int error = 0;
1501 for (; table->ctl_name || table->procname; table++) {
1502 const char *fail = NULL;
1503
1504 sysctl_repair_table(table);
1505 if (table->parent) {
1506 if (table->procname && !table->parent->procname)
1507 set_fail(&fail, table, "Parent without procname");
1508 if (table->ctl_name && !table->parent->ctl_name)
1509 set_fail(&fail, table, "Parent without ctl_name");
1510 }
1511 if (!table->procname)
1512 set_fail(&fail, table, "No procname");
1513 if (table->child) {
1514 if (table->data)
1515 set_fail(&fail, table, "Directory with data?");
1516 if (table->maxlen)
1517 set_fail(&fail, table, "Directory with maxlen?");
1518 if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
1519 set_fail(&fail, table, "Writable sysctl directory");
1520 if (table->proc_handler)
1521 set_fail(&fail, table, "Directory with proc_handler");
1522 if (table->strategy)
1523 set_fail(&fail, table, "Directory with strategy");
1524 if (table->extra1)
1525 set_fail(&fail, table, "Directory with extra1");
1526 if (table->extra2)
1527 set_fail(&fail, table, "Directory with extra2");
1528 if (sysctl_check_dir(table))
1529 set_fail(&fail, table, "Inconsistent directory names");
1530 } else {
1531 if ((table->strategy == sysctl_data) ||
1532 (table->strategy == sysctl_string) ||
1533 (table->strategy == sysctl_intvec) ||
1534 (table->strategy == sysctl_jiffies) ||
1535 (table->strategy == sysctl_ms_jiffies) ||
1536 (table->proc_handler == proc_dostring) ||
1537 (table->proc_handler == proc_dointvec) ||
1538#ifdef CONFIG_SECURITY_CAPABILITIES
1539 (table->proc_handler == proc_dointvec_bset) ||
1540#endif /* def CONFIG_SECURITY_CAPABILITIES */
1541 (table->proc_handler == proc_dointvec_minmax) ||
1542 (table->proc_handler == proc_dointvec_jiffies) ||
1543 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
1544 (table->proc_handler == proc_dointvec_ms_jiffies) ||
1545 (table->proc_handler == proc_doulongvec_minmax) ||
1546 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
1547 if (!table->data)
1548 set_fail(&fail, table, "No data");
1549 if (!table->maxlen)
1550 set_fail(&fail, table, "No maxlen");
1551 }
1552 if ((table->proc_handler == proc_doulongvec_minmax) ||
1553 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
1554 if (table->maxlen > sizeof (unsigned long)) {
1555 if (!table->extra1)
1556 set_fail(&fail, table, "No min");
1557 if (!table->extra2)
1558 set_fail(&fail, table, "No max");
1559 }
1560 }
1561#ifdef CONFIG_SYSCTL_SYSCALL
1562 if (table->ctl_name && !table->strategy)
1563 set_fail(&fail, table, "Missing strategy");
1564#endif
1565#if 0
1566 if (!table->ctl_name && table->strategy)
1567 set_fail(&fail, table, "Strategy without ctl_name");
1568#endif
1569#ifdef CONFIG_PROC_FS
1570 if (table->procname && !table->proc_handler)
1571 set_fail(&fail, table, "No proc_handler");
1572#endif
1573#if 0
1574 if (!table->procname && table->proc_handler)
1575 set_fail(&fail, table, "proc_handler without procname");
1576#endif
1577 sysctl_check_leaf(table, &fail);
1578 }
1579 sysctl_check_bin_path(table, &fail);
1580 if (fail) {
1581 set_fail(&fail, table, NULL);
1582 error = -EINVAL;
1583 }
1584 if (table->child)
1585 error |= sysctl_check_table(table->child);
1586 }
1587 return error;
1588}
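
The sysctl_binary_lookup() walk above descends trans_root_table one path component per level, accepting an entry if its ctl_name or procname matches, or if it is a catch-all entry with neither, and then following the child pointer. A rough stand-alone sketch of that matching loop, with simplified types and invented sample tables rather than the kernel's, is:

/* Minimal user-space sketch of the binary-path translation walk.
 * The table contents and numbers here are illustrative only. */
#include <stdio.h>

struct trans_tbl {
	int ctl_name;
	const char *procname;
	const struct trans_tbl *child;
};

static const struct trans_tbl demo_kern[] = {
	{ 1, "ostype" },
	{ 7, "hostname" },
	{ }
};

static const struct trans_tbl demo_root[] = {
	{ 1, "kernel", demo_kern },
	{ }
};

/* Walk one path component per level: match by number or by a catch-all
 * entry with neither name nor number, as sysctl_binary_lookup() does. */
static const struct trans_tbl *lookup(const struct trans_tbl *ref,
				      const int *path, int depth)
{
	int level;

	for (level = 0; level < depth; level++) {
		for (; ref->ctl_name || ref->procname || ref->child; ref++) {
			if (level < depth - 1 && !ref->child)
				continue;	/* need a directory here */
			if (ref->ctl_name == path[level] ||
			    (!ref->ctl_name && !ref->procname))
				break;		/* matched this component */
		}
		if (!ref->ctl_name && !ref->procname && !ref->child)
			return NULL;		/* hit the terminator */
		if (level < depth - 1)
			ref = ref->child;
	}
	return ref;
}

int main(void)
{
	int path[] = { 1, 7 };			/* "kernel.hostname" */
	const struct trans_tbl *hit = lookup(demo_root, path, 2);

	printf("%s\n", hit ? hit->procname : "(not found)");
	return 0;
}

The real lookup additionally matches by procname when walking an already registered ctl_table; the sketch keeps only the numeric path to show the shape of the descent.
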
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 7d4d7f9c1bb2..354e74bc17c1 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -22,6 +22,10 @@
22#include <linux/delayacct.h> 22#include <linux/delayacct.h>
23#include <linux/cpumask.h> 23#include <linux/cpumask.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/cgroupstats.h>
26#include <linux/cgroup.h>
27#include <linux/fs.h>
28#include <linux/file.h>
25#include <net/genetlink.h> 29#include <net/genetlink.h>
26#include <asm/atomic.h> 30#include <asm/atomic.h>
27 31
@@ -49,6 +53,11 @@ __read_mostly = {
49 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
50 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
51 55
56static struct nla_policy
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59};
60
52struct listener { 61struct listener {
53 struct list_head list; 62 struct list_head list;
54 pid_t pid; 63 pid_t pid;
@@ -254,7 +263,7 @@ out:
254 263
255 stats->version = TASKSTATS_VERSION; 264 stats->version = TASKSTATS_VERSION;
256 /* 265 /*
257 * Accounting subsytems can also add calls here to modify 266 * Accounting subsystems can also add calls here to modify
258 * fields of taskstats. 267 * fields of taskstats.
259 */ 268 */
260 return rc; 269 return rc;
@@ -372,6 +381,51 @@ err:
372 return NULL; 381 return NULL;
373} 382}
374 383
384static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
385{
386 int rc = 0;
387 struct sk_buff *rep_skb;
388 struct cgroupstats *stats;
389 struct nlattr *na;
390 size_t size;
391 u32 fd;
392 struct file *file;
393 int fput_needed;
394
395 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
396 if (!na)
397 return -EINVAL;
398
399 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
400 file = fget_light(fd, &fput_needed);
401 if (file) {
402 size = nla_total_size(sizeof(struct cgroupstats));
403
404 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
405 size);
406 if (rc < 0)
407 goto err;
408
409 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
410 sizeof(struct cgroupstats));
411 stats = nla_data(na);
412 memset(stats, 0, sizeof(*stats));
413
414 rc = cgroupstats_build(stats, file->f_dentry);
415 if (rc < 0)
416 goto err;
417
418 fput_light(file, fput_needed);
419 return send_reply(rep_skb, info->snd_pid);
420 }
421
422err:
423 if (file)
424 fput_light(file, fput_needed);
425 nlmsg_free(rep_skb);
426 return rc;
427}
428
375static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 429static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
376{ 430{
377 int rc = 0; 431 int rc = 0;
@@ -522,6 +576,12 @@ static struct genl_ops taskstats_ops = {
522 .policy = taskstats_cmd_get_policy, 576 .policy = taskstats_cmd_get_policy,
523}; 577};
524 578
579static struct genl_ops cgroupstats_ops = {
580 .cmd = CGROUPSTATS_CMD_GET,
581 .doit = cgroupstats_user_cmd,
582 .policy = cgroupstats_cmd_get_policy,
583};
584
525/* Needed early in initialization */ 585/* Needed early in initialization */
526void __init taskstats_init_early(void) 586void __init taskstats_init_early(void)
527{ 587{
@@ -546,8 +606,15 @@ static int __init taskstats_init(void)
546 if (rc < 0) 606 if (rc < 0)
547 goto err; 607 goto err;
548 608
609 rc = genl_register_ops(&family, &cgroupstats_ops);
610 if (rc < 0)
611 goto err_cgroup_ops;
612
549 family_registered = 1; 613 family_registered = 1;
614 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
550 return 0; 615 return 0;
616err_cgroup_ops:
617 genl_unregister_ops(&family, &taskstats_ops);
551err: 618err:
552 genl_unregister_family(&family); 619 genl_unregister_family(&family);
553 return rc; 620 return rc;
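
The cgroupstats command added above reuses the taskstats genetlink family and leans on a one-entry nla_policy so that CGROUPSTATS_CMD_ATTR_FD is validated as a u32 before cgroupstats_user_cmd() runs. As a toy model of what such a policy table expresses (simplified types, not the real netlink attribute code in lib/nlattr.c):

/* Toy model of an nla_policy check: each attribute id maps to an expected
 * type, and a payload is rejected if its length does not fit that type. */
#include <stdio.h>
#include <stdint.h>

enum { DEMO_ATTR_UNSPEC, DEMO_ATTR_FD, DEMO_ATTR_MAX = DEMO_ATTR_FD };
enum { TYPE_UNSPEC, TYPE_U32 };

struct demo_policy { int type; };

static const struct demo_policy policy[DEMO_ATTR_MAX + 1] = {
	[DEMO_ATTR_FD] = { .type = TYPE_U32 },
};

struct demo_attr { int id; size_t len; };

static int validate(const struct demo_attr *a)
{
	if (a->id > DEMO_ATTR_MAX)
		return -1;			/* unknown attribute */
	if (policy[a->id].type == TYPE_U32 && a->len != sizeof(uint32_t))
		return -1;			/* wrong payload size */
	return 0;
}

int main(void)
{
	struct demo_attr ok  = { DEMO_ATTR_FD, sizeof(uint32_t) };
	struct demo_attr bad = { DEMO_ATTR_FD, 2 };

	printf("fd attr, 4 bytes: %s\n", validate(&ok)  ? "rejected" : "accepted");
	printf("fd attr, 2 bytes: %s\n", validate(&bad) ? "rejected" : "accepted");
	return 0;
}

Passing an open cgroup directory descriptor as that u32 is the design choice here: the handler turns the fd back into a dentry and builds the stats for exactly the cgroup the caller can already see.
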
diff --git a/kernel/time.c b/kernel/time.c
index 2d5b6a682138..09d3c45c4da7 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -9,9 +9,9 @@
9 */ 9 */
10/* 10/*
11 * Modification history kernel/time.c 11 * Modification history kernel/time.c
12 * 12 *
13 * 1993-09-02 Philip Gladstone 13 * 1993-09-02 Philip Gladstone
14 * Created file with time related functions from sched.c and adjtimex() 14 * Created file with time related functions from sched.c and adjtimex()
15 * 1993-10-08 Torsten Duwe 15 * 1993-10-08 Torsten Duwe
16 * adjtime interface update and CMOS clock write code 16 * adjtime interface update and CMOS clock write code
17 * 1995-08-13 Torsten Duwe 17 * 1995-08-13 Torsten Duwe
@@ -30,6 +30,7 @@
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/clocksource.h>
33#include <linux/errno.h> 34#include <linux/errno.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
35#include <linux/security.h> 36#include <linux/security.h>
@@ -38,7 +39,7 @@
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39#include <asm/unistd.h> 40#include <asm/unistd.h>
40 41
41/* 42/*
42 * The timezone where the local system is located. Used as a default by some 43 * The timezone where the local system is located. Used as a default by some
43 * programs who obtain this value by using gettimeofday. 44 * programs who obtain this value by using gettimeofday.
44 */ 45 */
@@ -71,7 +72,7 @@ asmlinkage long sys_time(time_t __user * tloc)
71 * why not move it into the appropriate arch directory (for those 72 * why not move it into the appropriate arch directory (for those
72 * architectures that need it). 73 * architectures that need it).
73 */ 74 */
74 75
75asmlinkage long sys_stime(time_t __user *tptr) 76asmlinkage long sys_stime(time_t __user *tptr)
76{ 77{
77 struct timespec tv; 78 struct timespec tv;
@@ -110,10 +111,10 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us
110/* 111/*
111 * Adjust the time obtained from the CMOS to be UTC time instead of 112 * Adjust the time obtained from the CMOS to be UTC time instead of
112 * local time. 113 * local time.
113 * 114 *
114 * This is ugly, but preferable to the alternatives. Otherwise we 115 * This is ugly, but preferable to the alternatives. Otherwise we
115 * would either need to write a program to do it in /etc/rc (and risk 116 * would either need to write a program to do it in /etc/rc (and risk
116 * confusion if the program gets run more than once; it would also be 117 * confusion if the program gets run more than once; it would also be
117 * hard to make the program warp the clock precisely n hours) or 118 * hard to make the program warp the clock precisely n hours) or
118 * compile in the timezone information into the kernel. Bad, bad.... 119 * compile in the timezone information into the kernel. Bad, bad....
119 * 120 *
@@ -158,6 +159,7 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
158 if (tz) { 159 if (tz) {
159 /* SMP safe, global irq locking makes it work. */ 160 /* SMP safe, global irq locking makes it work. */
160 sys_tz = *tz; 161 sys_tz = *tz;
162 update_vsyscall_tz();
161 if (firsttime) { 163 if (firsttime) {
162 firsttime = 0; 164 firsttime = 0;
163 if (!tv) 165 if (!tv)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 51b6a6a6158c..c8a9d13874df 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -207,15 +207,12 @@ static inline void clocksource_resume_watchdog(void) { }
207 */ 207 */
208void clocksource_resume(void) 208void clocksource_resume(void)
209{ 209{
210 struct list_head *tmp; 210 struct clocksource *cs;
211 unsigned long flags; 211 unsigned long flags;
212 212
213 spin_lock_irqsave(&clocksource_lock, flags); 213 spin_lock_irqsave(&clocksource_lock, flags);
214 214
215 list_for_each(tmp, &clocksource_list) { 215 list_for_each_entry(cs, &clocksource_list, list) {
216 struct clocksource *cs;
217
218 cs = list_entry(tmp, struct clocksource, list);
219 if (cs->resume) 216 if (cs->resume)
220 cs->resume(); 217 cs->resume();
221 } 218 }
@@ -369,7 +366,6 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
369 const char *buf, size_t count) 366 const char *buf, size_t count)
370{ 367{
371 struct clocksource *ovr = NULL; 368 struct clocksource *ovr = NULL;
372 struct list_head *tmp;
373 size_t ret = count; 369 size_t ret = count;
374 int len; 370 int len;
375 371
@@ -389,12 +385,11 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
389 385
390 len = strlen(override_name); 386 len = strlen(override_name);
391 if (len) { 387 if (len) {
388 struct clocksource *cs;
389
392 ovr = clocksource_override; 390 ovr = clocksource_override;
393 /* try to select it: */ 391 /* try to select it: */
394 list_for_each(tmp, &clocksource_list) { 392 list_for_each_entry(cs, &clocksource_list, list) {
395 struct clocksource *cs;
396
397 cs = list_entry(tmp, struct clocksource, list);
398 if (strlen(cs->name) == len && 393 if (strlen(cs->name) == len &&
399 !strcmp(cs->name, override_name)) 394 !strcmp(cs->name, override_name))
400 ovr = cs; 395 ovr = cs;
@@ -422,14 +417,11 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
422static ssize_t 417static ssize_t
423sysfs_show_available_clocksources(struct sys_device *dev, char *buf) 418sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
424{ 419{
425 struct list_head *tmp; 420 struct clocksource *src;
426 char *curr = buf; 421 char *curr = buf;
427 422
428 spin_lock_irq(&clocksource_lock); 423 spin_lock_irq(&clocksource_lock);
429 list_for_each(tmp, &clocksource_list) { 424 list_for_each_entry(src, &clocksource_list, list) {
430 struct clocksource *src;
431
432 src = list_entry(tmp, struct clocksource, list);
433 curr += sprintf(curr, "%s ", src->name); 425 curr += sprintf(curr, "%s ", src->name);
434 } 426 }
435 spin_unlock_irq(&clocksource_lock); 427 spin_unlock_irq(&clocksource_lock);
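
The clocksource changes above replace open-coded list_for_each() plus list_entry() pairs with list_for_each_entry(), which folds the container lookup into the iterator and drops the temporary list_head pointer. A minimal user-space rendition of that idiom, with container_of and the macro re-implemented here for illustration (gcc typeof extension assumed), looks like:

/* Stand-alone sketch of the list_for_each_entry() idiom used above. */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

#define list_for_each_entry(pos, head, member)				\
	for (pos = container_of((head)->next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = container_of(pos->member.next, typeof(*pos), member))

struct clocksource_demo {
	const char *name;
	struct list_head list;
};

static void list_add_tail_demo(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

int main(void)
{
	struct list_head head = { &head, &head };
	struct clocksource_demo a = { "jiffies" }, b = { "tsc" };
	struct clocksource_demo *cs;

	list_add_tail_demo(&a.list, &head);
	list_add_tail_demo(&b.list, &head);

	/* Equivalent to the old list_for_each() + list_entry() pair,
	 * but without the extra local variable and explicit cast. */
	list_for_each_entry(cs, &head, list)
		printf("%s\n", cs->name);

	return 0;
}
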
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ce89ffb474d0..10a1347597fd 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -153,6 +153,7 @@ void tick_nohz_stop_sched_tick(void)
153 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 153 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
154 struct tick_sched *ts; 154 struct tick_sched *ts;
155 ktime_t last_update, expires, now, delta; 155 ktime_t last_update, expires, now, delta;
156 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
156 int cpu; 157 int cpu;
157 158
158 local_irq_save(flags); 159 local_irq_save(flags);
@@ -302,11 +303,26 @@ void tick_nohz_stop_sched_tick(void)
302out: 303out:
303 ts->next_jiffies = next_jiffies; 304 ts->next_jiffies = next_jiffies;
304 ts->last_jiffies = last_jiffies; 305 ts->last_jiffies = last_jiffies;
306 ts->sleep_length = ktime_sub(dev->next_event, now);
305end: 307end:
306 local_irq_restore(flags); 308 local_irq_restore(flags);
307} 309}
308 310
309/** 311/**
312 * tick_nohz_get_sleep_length - return the length of the current sleep
313 *
314 * Called from power state control code with interrupts disabled
315 */
316ktime_t tick_nohz_get_sleep_length(void)
317{
318 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
319
320 return ts->sleep_length;
321}
322
323EXPORT_SYMBOL_GPL(tick_nohz_get_sleep_length);
324
325/**
310 * nohz_restart_sched_tick - restart the idle tick from the idle task 326 * nohz_restart_sched_tick - restart the idle tick from the idle task
311 * 327 *
312 * Restart the idle tick when the CPU is woken up from idle 328 * Restart the idle tick when the CPU is woken up from idle
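
The sleep_length recorded above is simply the delta between the programmed clock-event expiry and the current time, and tick_nohz_get_sleep_length() exposes it so that idle/power code can pick an appropriate sleep state. In plain terms, with ktime_t modelled as signed nanoseconds and values invented for illustration:

/* Toy arithmetic behind ts->sleep_length = ktime_sub(dev->next_event, now).
 * ktime_t is modelled as signed nanoseconds; the numbers are made up. */
#include <stdio.h>
#include <stdint.h>

typedef int64_t ktime_ns;

static ktime_ns ktime_sub_demo(ktime_ns a, ktime_ns b) { return a - b; }

int main(void)
{
	ktime_ns now        = 1000000000LL;	/* 1.000000000 s */
	ktime_ns next_event = 1003500000LL;	/* 1.003500000 s */
	ktime_ns sleep_len  = ktime_sub_demo(next_event, now);

	/* A caller would compare this against a sleep state's exit
	 * latency before committing to a deep sleep. */
	printf("expected idle time: %lld ns\n", (long long)sleep_len);
	return 0;
}
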
diff --git a/kernel/timer.c b/kernel/timer.c
index 6ce1952eea7d..fb4e67d5dd60 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -26,6 +26,7 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/swap.h> 28#include <linux/swap.h>
29#include <linux/pid_namespace.h>
29#include <linux/notifier.h> 30#include <linux/notifier.h>
30#include <linux/thread_info.h> 31#include <linux/thread_info.h>
31#include <linux/time.h> 32#include <linux/time.h>
@@ -817,7 +818,7 @@ unsigned long next_timer_interrupt(void)
817#endif 818#endif
818 819
819/* 820/*
820 * Called from the timer interrupt handler to charge one tick to the current 821 * Called from the timer interrupt handler to charge one tick to the current
821 * process. user_tick is 1 if the tick is user time, 0 for system. 822 * process. user_tick is 1 if the tick is user time, 0 for system.
822 */ 823 */
823void update_process_times(int user_tick) 824void update_process_times(int user_tick)
@@ -826,10 +827,13 @@ void update_process_times(int user_tick)
826 int cpu = smp_processor_id(); 827 int cpu = smp_processor_id();
827 828
828 /* Note: this timer irq context must be accounted for as well. */ 829 /* Note: this timer irq context must be accounted for as well. */
829 if (user_tick) 830 if (user_tick) {
830 account_user_time(p, jiffies_to_cputime(1)); 831 account_user_time(p, jiffies_to_cputime(1));
831 else 832 account_user_time_scaled(p, jiffies_to_cputime(1));
833 } else {
832 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); 834 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
835 account_system_time_scaled(p, jiffies_to_cputime(1));
836 }
833 run_local_timers(); 837 run_local_timers();
834 if (rcu_pending(cpu)) 838 if (rcu_pending(cpu))
835 rcu_check_callbacks(cpu, user_tick); 839 rcu_check_callbacks(cpu, user_tick);
@@ -953,7 +957,7 @@ asmlinkage unsigned long sys_alarm(unsigned int seconds)
953 */ 957 */
954asmlinkage long sys_getpid(void) 958asmlinkage long sys_getpid(void)
955{ 959{
956 return current->tgid; 960 return task_tgid_vnr(current);
957} 961}
958 962
959/* 963/*
@@ -967,7 +971,7 @@ asmlinkage long sys_getppid(void)
967 int pid; 971 int pid;
968 972
969 rcu_read_lock(); 973 rcu_read_lock();
970 pid = rcu_dereference(current->real_parent)->tgid; 974 pid = task_ppid_nr_ns(current, current->nsproxy->pid_ns);
971 rcu_read_unlock(); 975 rcu_read_unlock();
972 976
973 return pid; 977 return pid;
@@ -1099,7 +1103,7 @@ EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1099/* Thread ID - the internal kernel "pid" */ 1103/* Thread ID - the internal kernel "pid" */
1100asmlinkage long sys_gettid(void) 1104asmlinkage long sys_gettid(void)
1101{ 1105{
1102 return current->pid; 1106 return task_pid_vnr(current);
1103} 1107}
1104 1108
1105/** 1109/**
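
The sys_getpid()/sys_getppid()/sys_gettid() changes above return namespace-relative values via task_*_vnr() and task_ppid_nr_ns() instead of the raw global IDs. A small hypothetical user-space demonstration of why that matters: the first process in a new PID namespace must see itself as PID 1 even though its global PID differs. This sketch assumes a kernel with PID-namespace support and needs root to run:

/* Demonstrates namespace-relative PIDs: the child created with CLONE_NEWPID
 * sees getpid() == 1 while the parent sees the child's global PID.
 * Illustrative only; requires CAP_SYS_ADMIN. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

static char child_stack[64 * 1024];

static int child(void *arg)
{
	/* With the task_pid_vnr() conversion this prints 1. */
	printf("child: getpid() inside new pid namespace = %d\n",
	       (int)getpid());
	return 0;
}

int main(void)
{
	pid_t global;

	global = clone(child, child_stack + sizeof(child_stack),
		       CLONE_NEWPID | SIGCHLD, NULL);
	if (global < 0) {
		perror("clone(CLONE_NEWPID)");	/* likely not root */
		return 1;
	}
	printf("parent: child's global pid = %d\n", (int)global);
	waitpid(global, NULL, 0);
	return 0;
}
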
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index c122131a122f..4ab1b584961b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -62,6 +62,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
62 rcu_read_unlock(); 62 rcu_read_unlock();
63 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; 63 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
64 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; 64 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
65 stats->ac_utimescaled =
66 cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC;
67 stats->ac_stimescaled =
68 cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
65 stats->ac_minflt = tsk->min_flt; 69 stats->ac_minflt = tsk->min_flt;
66 stats->ac_majflt = tsk->maj_flt; 70 stats->ac_majflt = tsk->maj_flt;
67 71
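
bacct_add_tsk() now exports the scaled times with the same unit convention as ac_utime/ac_stime: cputime is first truncated to milliseconds and then expressed in microseconds. A hypothetical helper (not part of the patch) makes that conversion explicit:

#include <linux/sched.h>	/* cputime_to_msecs() */
#include <linux/time.h>		/* USEC_PER_MSEC */

/*
 * Hypothetical helper, for illustration: the taskstats ac_*time and
 * ac_*timescaled fields carry microseconds, but only with millisecond
 * granularity because of the intermediate cputime_to_msecs() step.
 */
static inline u64 ac_usec_from_cputime(cputime_t t)
{
	return (u64)cputime_to_msecs(t) * USEC_PER_MSEC;
}
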
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e080d1d744cc..52d5e7c9a8e6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -32,6 +32,7 @@
32#include <linux/freezer.h> 32#include <linux/freezer.h>
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 34#include <linux/debug_locks.h>
35#include <linux/lockdep.h>
35 36
36/* 37/*
37 * The per-CPU workqueue (if single thread, we always use the first 38 * The per-CPU workqueue (if single thread, we always use the first
@@ -61,6 +62,9 @@ struct workqueue_struct {
61 const char *name; 62 const char *name;
62 int singlethread; 63 int singlethread;
63 int freezeable; /* Freeze threads during suspend */ 64 int freezeable; /* Freeze threads during suspend */
65#ifdef CONFIG_LOCKDEP
66 struct lockdep_map lockdep_map;
67#endif
64}; 68};
65 69
66/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 70/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
@@ -250,6 +254,17 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
250 struct work_struct *work = list_entry(cwq->worklist.next, 254 struct work_struct *work = list_entry(cwq->worklist.next,
251 struct work_struct, entry); 255 struct work_struct, entry);
252 work_func_t f = work->func; 256 work_func_t f = work->func;
257#ifdef CONFIG_LOCKDEP
258 /*
259 * It is permissible to free the struct work_struct
260 * from inside the function that is called from it,
261 * this we need to take into account for lockdep too.
262 * To avoid bogus "held lock freed" warnings as well
263 * as problems when looking into work->lockdep_map,
264 * make a copy and use that here.
265 */
266 struct lockdep_map lockdep_map = work->lockdep_map;
267#endif
253 268
254 cwq->current_work = work; 269 cwq->current_work = work;
255 list_del_init(cwq->worklist.next); 270 list_del_init(cwq->worklist.next);
@@ -257,13 +272,17 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
257 272
258 BUG_ON(get_wq_data(work) != cwq); 273 BUG_ON(get_wq_data(work) != cwq);
259 work_clear_pending(work); 274 work_clear_pending(work);
275 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
276 lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_);
260 f(work); 277 f(work);
278 lock_release(&lockdep_map, 1, _THIS_IP_);
279 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
261 280
262 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 281 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
263 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 282 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
264 "%s/0x%08x/%d\n", 283 "%s/0x%08x/%d\n",
265 current->comm, preempt_count(), 284 current->comm, preempt_count(),
266 current->pid); 285 task_pid_nr(current));
267 printk(KERN_ERR " last function: "); 286 printk(KERN_ERR " last function: ");
268 print_symbol("%s\n", (unsigned long)f); 287 print_symbol("%s\n", (unsigned long)f);
269 debug_show_held_locks(current); 288 debug_show_held_locks(current);
@@ -376,6 +395,8 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
376 int cpu; 395 int cpu;
377 396
378 might_sleep(); 397 might_sleep();
398 lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
399 lock_release(&wq->lockdep_map, 1, _THIS_IP_);
379 for_each_cpu_mask(cpu, *cpu_map) 400 for_each_cpu_mask(cpu, *cpu_map)
380 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 401 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
381} 402}
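
With the lock_acquire()/lock_release() pair above, flush_workqueue() takes part in lockdep's dependency graph even when the queue happens to be empty, so inversions between a flusher and the work it waits for are reported without having to hit the deadlock at run time. A sketch of the pattern this catches (demo_lock, demo_wq and demo_work_fn are hypothetical names, not kernel API):

#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(demo_lock);
static struct workqueue_struct *demo_wq;	/* assumed created with create_workqueue() */

static void demo_work_fn(struct work_struct *work)
{
	mutex_lock(&demo_lock);		/* the work item takes demo_lock ... */
	mutex_unlock(&demo_lock);
}
static DECLARE_WORK(demo_work, demo_work_fn);

static void buggy_flush(void)
{
	queue_work(demo_wq, &demo_work);
	mutex_lock(&demo_lock);
	flush_workqueue(demo_wq);	/* ... and we flush while holding it:
					 * lockdep now flags the inversion. */
	mutex_unlock(&demo_lock);
}
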
@@ -446,6 +467,9 @@ static void wait_on_work(struct work_struct *work)
446 467
447 might_sleep(); 468 might_sleep();
448 469
470 lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
471 lock_release(&work->lockdep_map, 1, _THIS_IP_);
472
449 cwq = get_wq_data(work); 473 cwq = get_wq_data(work);
450 if (!cwq) 474 if (!cwq)
451 return; 475 return;
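
wait_on_work() gets the same treatment at the granularity of a single work item, using the lockdep_map that run_workqueue() copies out of the work_struct before invoking it. That makes self-waits detectable; a minimal hypothetical example:

/*
 * Hypothetical example: waiting for a work item from within that same
 * work item can never complete. Because run_workqueue() "holds" the
 * work's lockdep_map while wait_on_work() re-acquires it, lockdep
 * reports the recursion instead of the system silently hanging.
 */
static void self_waiting_work_fn(struct work_struct *work)
{
	cancel_work_sync(work);		/* waits on ourselves */
}
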
@@ -695,8 +719,10 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
695 } 719 }
696} 720}
697 721
698struct workqueue_struct *__create_workqueue(const char *name, 722struct workqueue_struct *__create_workqueue_key(const char *name,
699 int singlethread, int freezeable) 723 int singlethread,
724 int freezeable,
725 struct lock_class_key *key)
700{ 726{
701 struct workqueue_struct *wq; 727 struct workqueue_struct *wq;
702 struct cpu_workqueue_struct *cwq; 728 struct cpu_workqueue_struct *cwq;
@@ -713,6 +739,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
713 } 739 }
714 740
715 wq->name = name; 741 wq->name = name;
742 lockdep_init_map(&wq->lockdep_map, name, key, 0);
716 wq->singlethread = singlethread; 743 wq->singlethread = singlethread;
717 wq->freezeable = freezeable; 744 wq->freezeable = freezeable;
718 INIT_LIST_HEAD(&wq->list); 745 INIT_LIST_HEAD(&wq->list);
@@ -741,7 +768,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
741 } 768 }
742 return wq; 769 return wq;
743} 770}
744EXPORT_SYMBOL_GPL(__create_workqueue); 771EXPORT_SYMBOL_GPL(__create_workqueue_key);
745 772
746static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 773static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
747{ 774{
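
Renaming the constructor to __create_workqueue_key() and threading a struct lock_class_key through it lets every create_workqueue() call site carry its own lockdep class, so dependencies of unrelated workqueues are not conflated. The matching header-side wrapper is not part of this hunk; a plausible shape for it, assuming the usual static-key-per-call-site trick, would be:

/* Assumed include/linux/workqueue.h wrapper, shown for illustration only. */
#ifdef CONFIG_LOCKDEP
#define __create_workqueue(name, singlethread, freezeable)		\
({									\
	static struct lock_class_key __key;				\
									\
	__create_workqueue_key((name), (singlethread),			\
			       (freezeable), &__key);			\
})
#else
#define __create_workqueue(name, singlethread, freezeable)		\
	__create_workqueue_key((name), (singlethread), (freezeable), NULL)
#endif
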
@@ -752,6 +779,9 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
752 if (cwq->thread == NULL) 779 if (cwq->thread == NULL)
753 return; 780 return;
754 781
782 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
783 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
784
755 flush_cpu_workqueue(cwq); 785 flush_cpu_workqueue(cwq);
756 /* 786 /*
757 * If the caller is CPU_DEAD and cwq->worklist was not empty, 787 * If the caller is CPU_DEAD and cwq->worklist was not empty,