Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.instrumentation | 49
-rw-r--r--  kernel/Kconfig.preempt | 3
-rw-r--r--  kernel/Makefile | 9
-rw-r--r--  kernel/acct.c | 66
-rw-r--r--  kernel/audit.c | 30
-rw-r--r--  kernel/auditfilter.c | 4
-rw-r--r--  kernel/auditsc.c | 12
-rw-r--r--  kernel/capability.c | 204
-rw-r--r--  kernel/cgroup.c | 2805
-rw-r--r--  kernel/cgroup_debug.c | 97
-rw-r--r--  kernel/compat.c | 117
-rw-r--r--  kernel/cpu.c | 14
-rw-r--r--  kernel/cpu_acct.c | 186
-rw-r--r--  kernel/cpuset.c | 1736
-rw-r--r--  kernel/delayacct.c | 8
-rw-r--r--  kernel/die_notifier.c | 38
-rw-r--r--  kernel/dma.c | 8
-rw-r--r--  kernel/exec_domain.c | 2
-rw-r--r--  kernel/exit.c | 273
-rw-r--r--  kernel/fork.c | 181
-rw-r--r--  kernel/futex.c | 33
-rw-r--r--  kernel/futex_compat.c | 3
-rw-r--r--  kernel/hrtimer.c | 53
-rw-r--r--  kernel/irq/chip.c | 3
-rw-r--r--  kernel/irq/manage.c | 31
-rw-r--r--  kernel/itimer.c | 4
-rw-r--r--  kernel/kexec.c | 285
-rw-r--r--  kernel/kprobes.c | 62
-rw-r--r--  kernel/ksysfs.c | 18
-rw-r--r--  kernel/lockdep.c | 48
-rw-r--r--  kernel/lockdep_proc.c | 61
-rw-r--r--  kernel/marker.c | 525
-rw-r--r--  kernel/module.c | 198
-rw-r--r--  kernel/mutex.c | 35
-rw-r--r--  kernel/notifier.c | 539
-rw-r--r--  kernel/ns_cgroup.c | 100
-rw-r--r--  kernel/nsproxy.c | 78
-rw-r--r--  kernel/panic.c | 10
-rw-r--r--  kernel/params.c | 25
-rw-r--r--  kernel/pid.c | 353
-rw-r--r--  kernel/posix-cpu-timers.c | 12
-rw-r--r--  kernel/posix-timers.c | 30
-rw-r--r--  kernel/power/Kconfig | 11
-rw-r--r--  kernel/power/disk.c | 156
-rw-r--r--  kernel/power/main.c | 48
-rw-r--r--  kernel/power/power.h | 21
-rw-r--r--  kernel/power/process.c | 141
-rw-r--r--  kernel/power/snapshot.c | 53
-rw-r--r--  kernel/power/swsusp.c | 33
-rw-r--r--  kernel/power/user.c | 4
-rw-r--r--  kernel/printk.c | 127
-rw-r--r--  kernel/profile.c | 6
-rw-r--r--  kernel/ptrace.c | 18
-rw-r--r--  kernel/rcupdate.c | 9
-rw-r--r--  kernel/rcutorture.c | 10
-rw-r--r--  kernel/relay.c | 6
-rw-r--r--  kernel/resource.c | 26
-rw-r--r--  kernel/rtmutex-debug.c | 22
-rw-r--r--  kernel/rtmutex.c | 2
-rw-r--r--  kernel/sched.c | 1847
-rw-r--r--  kernel/sched_debug.c | 282
-rw-r--r--  kernel/sched_fair.c | 809
-rw-r--r--  kernel/sched_idletask.c | 8
-rw-r--r--  kernel/sched_rt.c | 19
-rw-r--r--  kernel/sched_stats.h | 36
-rw-r--r--  kernel/signal.c | 91
-rw-r--r--  kernel/softirq.c | 4
-rw-r--r--  kernel/softlockup.c | 54
-rw-r--r--  kernel/sys.c | 587
-rw-r--r--  kernel/sys_ni.c | 4
-rw-r--r--  kernel/sysctl.c | 373
-rw-r--r--  kernel/sysctl_check.c | 1588
-rw-r--r--  kernel/taskstats.c | 68
-rw-r--r--  kernel/time.c | 21
-rw-r--r--  kernel/time/Kconfig | 5
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/clockevents.c | 3
-rw-r--r--  kernel/time/clocksource.c | 22
-rw-r--r--  kernel/time/tick-broadcast.c | 51
-rw-r--r--  kernel/time/tick-common.c | 5
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 12
-rw-r--r--  kernel/timer.c | 16
-rw-r--r--  kernel/tsacct.c | 4
-rw-r--r--  kernel/user.c | 260
-rw-r--r--  kernel/workqueue.c | 38
86 files changed, 11050 insertions, 4202 deletions
diff --git a/kernel/Kconfig.instrumentation b/kernel/Kconfig.instrumentation
new file mode 100644
index 000000000000..f5f2c769d95e
--- /dev/null
+++ b/kernel/Kconfig.instrumentation
@@ -0,0 +1,49 @@
+menuconfig INSTRUMENTATION
+	bool "Instrumentation Support"
+	default y
+	---help---
+	  Say Y here to get to see options related to performance measurement,
+	  system-wide debugging, and testing. This option alone does not add any
+	  kernel code.
+
+	  If you say N, all options in this submenu will be skipped and
+	  disabled. If you're trying to debug the kernel itself, go see the
+	  Kernel Hacking menu.
+
+if INSTRUMENTATION
+
+config PROFILING
+	bool "Profiling support (EXPERIMENTAL)"
+	help
+	  Say Y here to enable the extended profiling support mechanisms used
+	  by profilers such as OProfile.
+
+config OPROFILE
+	tristate "OProfile system profiling (EXPERIMENTAL)"
+	depends on PROFILING
+	depends on ALPHA || ARM || BLACKFIN || X86_32 || IA64 || M32R || MIPS || PARISC || PPC || S390 || SUPERH || SPARC || X86_64
+	help
+	  OProfile is a profiling system capable of profiling the
+	  whole system, including the kernel, kernel modules, libraries,
+	  and applications.
+
+	  If unsure, say N.
+
+config KPROBES
+	bool "Kprobes"
+	depends on KALLSYMS && MODULES
+	depends on X86_32 || IA64 || PPC || S390 || SPARC64 || X86_64 || AVR32
+	help
+	  Kprobes allows you to trap at almost any kernel address and
+	  execute a callback function.  register_kprobe() establishes
+	  a probepoint and specifies the callback.  Kprobes is useful
+	  for kernel debugging, non-intrusive instrumentation and testing.
+	  If in doubt, say "N".
+
+config MARKERS
+	bool "Activate markers"
+	help
+	  Place an empty function call at each marker site. Can be
+	  dynamically changed for a probe function.
+
+endif # INSTRUMENTATION
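
[Editor's note] The KPROBES help text above refers to register_kprobe(). As an illustration only (not part of this patch), a minimal probe module of this era, following Documentation/kprobes.txt, looks roughly like the sketch below; the probed symbol ("do_fork") and the printk text are arbitrary choices for the example.

    #include <linux/module.h>
    #include <linux/kernel.h>
    #include <linux/kprobes.h>

    /* Pre-handler: runs just before the probed instruction executes. */
    static int handler_pre(struct kprobe *p, struct pt_regs *regs)
    {
        printk(KERN_INFO "kprobe hit at %p\n", p->addr);
        return 0;
    }

    static struct kprobe kp = {
        .symbol_name = "do_fork",   /* arbitrary example target */
        .pre_handler = handler_pre,
    };

    static int __init kprobe_example_init(void)
    {
        return register_kprobe(&kp);   /* 0 on success, negative errno on failure */
    }

    static void __exit kprobe_example_exit(void)
    {
        unregister_kprobe(&kp);
    }

    module_init(kprobe_example_init);
    module_exit(kprobe_example_exit);
    MODULE_LICENSE("GPL");
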
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 6b066632e40c..c64ce9c14207 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -63,6 +63,3 @@ config PREEMPT_BKL
 	  Say Y here if you are building a kernel for a desktop system.
 	  Say N if you are unsure.
 
-config PREEMPT_NOTIFIERS
-	bool
-
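
[Editor's note] This hunk drops the PREEMPT_NOTIFIERS entry from Kconfig.preempt. For context, the interface that option gates lets a subsystem receive callbacks when its task is scheduled in or out. The sketch below is an illustration based on include/linux/preempt.h of this era, not part of the patch; the function and variable names are made up.

    #include <linux/preempt.h>
    #include <linux/sched.h>

    /* current is being scheduled back onto 'cpu' */
    static void my_sched_in(struct preempt_notifier *notifier, int cpu)
    {
    }

    /* current is being preempted in favour of 'next' */
    static void my_sched_out(struct preempt_notifier *notifier,
                             struct task_struct *next)
    {
    }

    static struct preempt_ops my_preempt_ops = {
        .sched_in  = my_sched_in,
        .sched_out = my_sched_out,
    };

    static struct preempt_notifier my_notifier;

    static void my_attach_to_current(void)
    {
        preempt_notifier_init(&my_notifier, &my_preempt_ops);
        preempt_notifier_register(&my_notifier);   /* registers for the current task */
    }
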
diff --git a/kernel/Makefile b/kernel/Makefile
index 2a999836ca18..05c3e6df8597 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,8 +8,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
-	    utsname.o
+	    hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
+	    utsname.o sysctl_check.o notifier.o
 
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
@@ -36,7 +36,11 @@ obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
+obj-$(CONFIG_CGROUPS) += cgroup.o
+obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_CPUACCT) += cpu_acct.o
+obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -51,6 +55,7 @@ obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_MARKERS) += marker.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
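
[Editor's note] The new obj-$(CONFIG_MARKERS) += marker.o line builds the marker infrastructure added by this series. At an instrumentation site, a marker is placed with trace_mark(); the snippet below is only a usage illustration (the marker name and arguments are invented for the example), not code from this patch.

    #include <linux/marker.h>

    static void some_subsystem_event(int value, const char *name)
    {
        /*
         * With CONFIG_MARKERS and no probe connected this is essentially
         * an empty call site; a registered probe receives the format
         * string and the arguments.
         */
        trace_mark(subsystem_eventname, "%d %s", value, name);
    }
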
diff --git a/kernel/acct.c b/kernel/acct.c
index 24f0f8b2ba72..fce53d8df8a7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -329,16 +329,16 @@ static comp_t encode_comp_t(unsigned long value)
 	}
 
 	/*
 	 * If we need to round up, do it (and handle overflow correctly).
 	 */
 	if (rnd && (++value > MAXFRACT)) {
 		value >>= EXPSIZE;
 		exp++;
 	}
 
 	/*
 	 * Clean it up and polish it off.
 	 */
 	exp <<= MANTSIZE;		/* Shift the exponent into place */
 	exp += value;			/* and add on the mantissa. */
 	return exp;
@@ -361,30 +361,30 @@ static comp_t encode_comp_t(unsigned long value)
 
 static comp2_t encode_comp2_t(u64 value)
 {
 	int exp, rnd;
 
 	exp = (value > (MAXFRACT2>>1));
 	rnd = 0;
 	while (value > MAXFRACT2) {
 		rnd = value & 1;
 		value >>= 1;
 		exp++;
 	}
 
 	/*
 	 * If we need to round up, do it (and handle overflow correctly).
 	 */
 	if (rnd && (++value > MAXFRACT2)) {
 		value >>= 1;
 		exp++;
 	}
 
 	if (exp > MAXEXP2) {
 		/* Overflow. Return largest representable number instead. */
 		return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
 	} else {
 		return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
 	}
 }
 #endif
 
@@ -501,14 +501,14 @@ static void do_acct_process(struct file *file)
 	ac.ac_swaps = encode_comp_t(0);
 
 	/*
 	 * Kernel segment override to datasegment and write it
 	 * to the accounting file.
 	 */
 	fs = get_fs();
 	set_fs(KERNEL_DS);
 	/*
 	 * Accounting records are not subject to resource limits.
 	 */
 	flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
 	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 	file->f_op->write(file, (char *)&ac,
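
[Editor's note] The encode_comp_t()/encode_comp2_t() context above packs accounting values into the BSD-style pseudo floating point comp_t format (13-bit mantissa, 3-bit base-8 exponent, per the MANTSIZE/EXPSIZE/MAXFRACT constants). As an aside, and assuming that classic layout, a matching userspace decoder can be sketched as:

    #include <stdint.h>

    /* Decode a classic comp_t: low 13 bits are the mantissa, high 3 bits the
     * exponent, and each exponent step scales by 8 (a shift of 3 bits). */
    static unsigned long decode_comp_t(uint16_t c)
    {
        unsigned long mantissa = c & 0x1fff;
        unsigned int exponent = (c >> 13) & 0x7;

        return mantissa << (exponent * 3);
    }
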
diff --git a/kernel/audit.c b/kernel/audit.c
index eb0f9165b401..6977ea57a7e2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -664,11 +664,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			if (sid) {
 				if (selinux_sid_to_string(
 						sid, &ctx, &len)) {
 					audit_log_format(ab,
 						" ssid=%u", sid);
 					/* Maybe call audit_panic? */
 				} else
 					audit_log_format(ab,
 						" subj=%s", ctx);
 				kfree(ctx);
 			}
@@ -769,7 +769,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		sig_data->pid = audit_sig_pid;
 		memcpy(sig_data->ctx, ctx, len);
 		kfree(ctx);
 		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
 				0, 0, sig_data, sizeof(*sig_data) + len);
 		kfree(sig_data);
 		break;
@@ -847,18 +847,10 @@ static void audit_receive_skb(struct sk_buff *skb)
 }
 
 /* Receive messages from netlink socket. */
-static void audit_receive(struct sock *sk, int length)
+static void audit_receive(struct sk_buff *skb)
 {
-	struct sk_buff *skb;
-	unsigned int qlen;
-
 	mutex_lock(&audit_cmd_mutex);
-
-	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
-		skb = skb_dequeue(&sk->sk_receive_queue);
-		audit_receive_skb(skb);
-		kfree_skb(skb);
-	}
+	audit_receive_skb(skb);
 	mutex_unlock(&audit_cmd_mutex);
 }
 
@@ -876,8 +868,8 @@ static int __init audit_init(void)
 
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
-	audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
-					   NULL, THIS_MODULE);
+	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
+					   audit_receive, NULL, THIS_MODULE);
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 	else
@@ -1013,7 +1005,7 @@ unsigned int audit_serial(void)
 	return ret;
 }
 
 static inline void audit_get_stamp(struct audit_context *ctx,
 				    struct timespec *t, unsigned int *serial)
 {
 	if (ctx)
@@ -1064,7 +1056,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 	if (gfp_mask & __GFP_WAIT)
 		reserve = 0;
 	else
 		reserve = 5; /* Allow atomic callers to go up to five
 				entries over the normal backlog limit */
 
 	while (audit_backlog_limit
@@ -1327,7 +1319,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 	if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
 		/* FIXME: can we save some information here? */
 		audit_log_format(ab, "<too long>");
 	} else
 		audit_log_untrustedstring(ab, p);
 	kfree(path);
 }
@@ -1373,7 +1365,7 @@ void audit_log_end(struct audit_buffer *ab)
  * audit_log_vformat, and audit_log_end.  It may be called
  * in any context.
  */
 void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
 	       const char *fmt, ...)
 {
 	struct audit_buffer *ab;
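
[Editor's note] The audit_receive()/netlink_kernel_create() hunks above track the netlink API change in this release: the kernel socket is created against a network namespace and delivers one sk_buff per message to the input callback, instead of handing over a socket to drain. As a generic, hypothetical illustration of that shape (names invented, not from this patch):

    #include <linux/module.h>
    #include <linux/netlink.h>
    #include <net/net_namespace.h>
    #include <net/sock.h>

    static struct sock *my_nl_sock;

    /* Called once per queued skb; no manual walk of sk_receive_queue. */
    static void my_nl_input(struct sk_buff *skb)
    {
        /* parse the nlmsghdr(s) carried in skb here */
    }

    static int __init my_nl_init(void)
    {
        my_nl_sock = netlink_kernel_create(&init_net, NETLINK_USERSOCK, 0,
                                           my_nl_input, NULL, THIS_MODULE);
        return my_nl_sock ? 0 : -ENOMEM;
    }
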
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 359645cff5b2..df66a21fb360 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1498,7 +1498,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 	 * auditctl to read from it... which isn't ever going to
 	 * happen if we're actually running in the context of auditctl
 	 * trying to _send_ the stuff */
 
 	dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
 	if (!dest)
 		return -ENOMEM;
@@ -1678,7 +1678,7 @@ int audit_filter_type(int type)
 {
 	struct audit_entry *e;
 	int result = 0;
 
 	rcu_read_lock();
 	if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
 		goto unlock_and_return;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 04f3ffb8d9d4..e19b5a33aede 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -45,7 +45,6 @@
 #include <linux/init.h>
 #include <asm/types.h>
 #include <asm/atomic.h>
-#include <asm/types.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include <linux/mm.h>
@@ -321,7 +320,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 			result = audit_comparator(tsk->personality, f->op, f->val);
 			break;
 		case AUDIT_ARCH:
 			if (ctx)
 				result = audit_comparator(ctx->arch, f->op, f->val);
 			break;
 
@@ -899,7 +898,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	if (context->personality != PER_LINUX)
 		audit_log_format(ab, " per=%lx", context->personality);
 	if (context->return_valid)
 		audit_log_format(ab, " success=%s exit=%ld",
 				 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
 				 context->return_code);
 
@@ -1136,8 +1135,8 @@ void audit_free(struct task_struct *tsk)
 		return;
 
 	/* Check for system calls that do not go through the exit
 	 * function (e.g., exit_group), then free context block.
 	 * We use GFP_ATOMIC here because we might be doing this
 	 * in the context of the idle thread */
 	/* that can happen only if we are called from do_exit() */
 	if (context->in_syscall && context->auditable)
@@ -1317,7 +1316,7 @@ void __audit_getname(const char *name)
 		context->pwdmnt = mntget(current->fs->pwdmnt);
 		read_unlock(&current->fs->lock);
 	}
 
 }
 
 /* audit_putname - intercept a putname request
@@ -1525,6 +1524,7 @@ add_names:
 			context->names[idx].ino = (unsigned long)-1;
 	}
 }
+EXPORT_SYMBOL_GPL(__audit_inode_child);
 
 /**
  * auditsc_get_stamp - get local copies of audit_context values
diff --git a/kernel/capability.c b/kernel/capability.c
index c8d3c7762034..efbd9cdce132 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -3,23 +3,18 @@
  *
  * Copyright (C) 1997 Andrew Main <zefram@fysh.org>
  *
- * Integrated into 2.1.97+, Andrew G. Morgan <morgan@transmeta.com>
+ * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org>
  * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
  */
 
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
 
-unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
-kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
-
-EXPORT_SYMBOL(securebits);
-EXPORT_SYMBOL(cap_bset);
-
 /*
  * This lock protects task->cap_* for all tasks including current.
  * Locking rule: acquire this prior to tasklist_lock.
@@ -43,49 +38,49 @@ static DEFINE_SPINLOCK(task_capability_lock);
  */
 asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 {
 	int ret = 0;
 	pid_t pid;
 	__u32 version;
 	struct task_struct *target;
 	struct __user_cap_data_struct data;
 
 	if (get_user(version, &header->version))
 		return -EFAULT;
 
 	if (version != _LINUX_CAPABILITY_VERSION) {
 		if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
 			return -EFAULT;
 		return -EINVAL;
 	}
 
 	if (get_user(pid, &header->pid))
 		return -EFAULT;
 
 	if (pid < 0)
 		return -EINVAL;
 
 	spin_lock(&task_capability_lock);
 	read_lock(&tasklist_lock);
 
-	if (pid && pid != current->pid) {
-		target = find_task_by_pid(pid);
+	if (pid && pid != task_pid_vnr(current)) {
+		target = find_task_by_vpid(pid);
 		if (!target) {
 			ret = -ESRCH;
 			goto out;
 		}
 	} else
 		target = current;
 
 	ret = security_capget(target, &data.effective, &data.inheritable, &data.permitted);
 
 out:
 	read_unlock(&tasklist_lock);
 	spin_unlock(&task_capability_lock);
 
 	if (!ret && copy_to_user(dataptr, &data, sizeof data))
 		return -EFAULT;
 
 	return ret;
 }
 
 /*
@@ -101,7 +96,7 @@ static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
 	int found = 0;
 	struct pid *pgrp;
 
-	pgrp = find_pid(pgrp_nr);
+	pgrp = find_vpid(pgrp_nr);
 	do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
 		target = g;
 		while_each_thread(g, target) {
@@ -118,7 +113,7 @@ static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
 	} while_each_pid_task(pgrp, PIDTYPE_PGID, g);
 
 	if (!found)
 		ret = 0;
 	return ret;
 }
 
@@ -135,7 +130,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
 	int found = 0;
 
 	do_each_thread(g, target) {
-		if (target == current || is_init(target))
+		if (target == current || is_container_init(target->group_leader))
 			continue;
 		found = 1;
 		if (security_capset_check(target, effective, inheritable,
@@ -172,68 +167,68 @@ static inline int cap_set_all(kernel_cap_t *effective,
  */
 asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 {
 	kernel_cap_t inheritable, permitted, effective;
 	__u32 version;
 	struct task_struct *target;
 	int ret;
 	pid_t pid;
 
 	if (get_user(version, &header->version))
 		return -EFAULT;
 
 	if (version != _LINUX_CAPABILITY_VERSION) {
 		if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
 			return -EFAULT;
 		return -EINVAL;
 	}
 
 	if (get_user(pid, &header->pid))
 		return -EFAULT;
 
-	if (pid && pid != current->pid && !capable(CAP_SETPCAP))
+	if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
 		return -EPERM;
 
 	if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
 	    copy_from_user(&inheritable, &data->inheritable, sizeof(inheritable)) ||
 	    copy_from_user(&permitted, &data->permitted, sizeof(permitted)))
 		return -EFAULT;
 
 	spin_lock(&task_capability_lock);
 	read_lock(&tasklist_lock);
 
-	if (pid > 0 && pid != current->pid) {
-		target = find_task_by_pid(pid);
+	if (pid > 0 && pid != task_pid_vnr(current)) {
+		target = find_task_by_vpid(pid);
 		if (!target) {
 			ret = -ESRCH;
 			goto out;
 		}
 	} else
 		target = current;
 
 	ret = 0;
 
 	/* having verified that the proposed changes are legal,
 	   we now put them into effect. */
 	if (pid < 0) {
 		if (pid == -1)	/* all procs other than current and init */
 			ret = cap_set_all(&effective, &inheritable, &permitted);
 
 		else		/* all procs in process group */
 			ret = cap_set_pg(-pid, &effective, &inheritable,
 					 &permitted);
 	} else {
 		ret = security_capset_check(target, &effective, &inheritable,
 					    &permitted);
 		if (!ret)
 			security_capset_set(target, &effective, &inheritable,
 					    &permitted);
 	}
 
 out:
 	read_unlock(&tasklist_lock);
 	spin_unlock(&task_capability_lock);
 
 	return ret;
 }
 
 int __capable(struct task_struct *t, int cap)
@@ -244,7 +239,6 @@ int __capable(struct task_struct *t, int cap)
 	}
 	return 0;
 }
-EXPORT_SYMBOL(__capable);
 
 int capable(int cap)
 {
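
[Editor's note] The sys_capget()/sys_capset() hunks above make the PID handling pid-namespace aware: current->pid becomes task_pid_vnr(current) and find_task_by_pid() becomes find_task_by_vpid(). As a generic illustration of that idiom, and assuming the same tasklist_lock protection the patch itself uses, a lookup helper might be sketched as follows (the helper name is hypothetical, not from this patch):

    #include <linux/sched.h>
    #include <linux/pid.h>

    /* Look up a task by a pid number supplied from userspace, interpreting
     * it in the calling task's pid namespace. */
    static struct task_struct *get_caller_visible_task(pid_t nr)
    {
        struct task_struct *tsk;

        read_lock(&tasklist_lock);
        tsk = find_task_by_vpid(nr);
        if (tsk)
            get_task_struct(tsk);   /* pin it before dropping the lock */
        read_unlock(&tasklist_lock);

        return tsk;
    }
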
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
new file mode 100644
index 000000000000..5987dccdb2a0
--- /dev/null
+++ b/kernel/cgroup.c
@@ -0,0 +1,2805 @@
1/*
2 * kernel/cgroup.c
3 *
4 * Generic process-grouping system.
5 *
6 * Based originally on the cpuset system, extracted by Paul Menage
7 * Copyright (C) 2006 Google, Inc
8 *
9 * Copyright notices from the original cpuset code:
10 * --------------------------------------------------
11 * Copyright (C) 2003 BULL SA.
12 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
13 *
14 * Portions derived from Patrick Mochel's sysfs code.
15 * sysfs is Copyright (c) 2001-3 Patrick Mochel
16 *
17 * 2003-10-10 Written by Simon Derr.
18 * 2003-10-22 Updates by Stephen Hemminger.
19 * 2004 May-July Rework by Paul Jackson.
20 * ---------------------------------------------------
21 *
22 * This file is subject to the terms and conditions of the GNU General Public
23 * License. See the file COPYING in the main directory of the Linux
24 * distribution for more details.
25 */
26
27#include <linux/cgroup.h>
28#include <linux/errno.h>
29#include <linux/fs.h>
30#include <linux/kernel.h>
31#include <linux/list.h>
32#include <linux/mm.h>
33#include <linux/mutex.h>
34#include <linux/mount.h>
35#include <linux/pagemap.h>
36#include <linux/proc_fs.h>
37#include <linux/rcupdate.h>
38#include <linux/sched.h>
39#include <linux/backing-dev.h>
40#include <linux/seq_file.h>
41#include <linux/slab.h>
42#include <linux/magic.h>
43#include <linux/spinlock.h>
44#include <linux/string.h>
45#include <linux/sort.h>
46#include <linux/kmod.h>
47#include <linux/delayacct.h>
48#include <linux/cgroupstats.h>
49
50#include <asm/atomic.h>
51
52static DEFINE_MUTEX(cgroup_mutex);
53
54/* Generate an array of cgroup subsystem pointers */
55#define SUBSYS(_x) &_x ## _subsys,
56
57static struct cgroup_subsys *subsys[] = {
58#include <linux/cgroup_subsys.h>
59};
60
61/*
62 * A cgroupfs_root represents the root of a cgroup hierarchy,
63 * and may be associated with a superblock to form an active
64 * hierarchy
65 */
66struct cgroupfs_root {
67 struct super_block *sb;
68
69 /*
70 * The bitmask of subsystems intended to be attached to this
71 * hierarchy
72 */
73 unsigned long subsys_bits;
74
75 /* The bitmask of subsystems currently attached to this hierarchy */
76 unsigned long actual_subsys_bits;
77
78 /* A list running through the attached subsystems */
79 struct list_head subsys_list;
80
81 /* The root cgroup for this hierarchy */
82 struct cgroup top_cgroup;
83
84 /* Tracks how many cgroups are currently defined in hierarchy.*/
85 int number_of_cgroups;
86
87 /* A list running through the mounted hierarchies */
88 struct list_head root_list;
89
90 /* Hierarchy-specific flags */
91 unsigned long flags;
92
93 /* The path to use for release notifications. No locking
94 * between setting and use - so if userspace updates this
95 * while child cgroups exist, you could miss a
96 * notification. We ensure that it's always a valid
97 * NUL-terminated string */
98 char release_agent_path[PATH_MAX];
99};
100
101
102/*
103 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
104 * subsystems that are otherwise unattached - it never has more than a
105 * single cgroup, and all tasks are part of that cgroup.
106 */
107static struct cgroupfs_root rootnode;
108
109/* The list of hierarchy roots */
110
111static LIST_HEAD(roots);
112static int root_count;
113
114/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
115#define dummytop (&rootnode.top_cgroup)
116
117/* This flag indicates whether tasks in the fork and exit paths should
118 * take callback_mutex and check for fork/exit handlers to call. This
119 * avoids us having to do extra work in the fork/exit path if none of the
120 * subsystems need to be called.
121 */
122static int need_forkexit_callback;
123
124/* bits in struct cgroup flags field */
125enum {
126 /* Control Group is dead */
127 CGRP_REMOVED,
128 /* Control Group has previously had a child cgroup or a task,
129 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
130 CGRP_RELEASABLE,
131 /* Control Group requires release notifications to userspace */
132 CGRP_NOTIFY_ON_RELEASE,
133};
134
135/* convenient tests for these bits */
136inline int cgroup_is_removed(const struct cgroup *cgrp)
137{
138 return test_bit(CGRP_REMOVED, &cgrp->flags);
139}
140
141/* bits in struct cgroupfs_root flags field */
142enum {
143 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
144};
145
146inline int cgroup_is_releasable(const struct cgroup *cgrp)
147{
148 const int bits =
149 (1 << CGRP_RELEASABLE) |
150 (1 << CGRP_NOTIFY_ON_RELEASE);
151 return (cgrp->flags & bits) == bits;
152}
153
154inline int notify_on_release(const struct cgroup *cgrp)
155{
156 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
157}
158
159/*
160 * for_each_subsys() allows you to iterate on each subsystem attached to
161 * an active hierarchy
162 */
163#define for_each_subsys(_root, _ss) \
164list_for_each_entry(_ss, &_root->subsys_list, sibling)
165
166/* for_each_root() allows you to iterate across the active hierarchies */
167#define for_each_root(_root) \
168list_for_each_entry(_root, &roots, root_list)
169
170/* the list of cgroups eligible for automatic release. Protected by
171 * release_list_lock */
172static LIST_HEAD(release_list);
173static DEFINE_SPINLOCK(release_list_lock);
174static void cgroup_release_agent(struct work_struct *work);
175static DECLARE_WORK(release_agent_work, cgroup_release_agent);
176static void check_for_release(struct cgroup *cgrp);
177
178/* Link structure for associating css_set objects with cgroups */
179struct cg_cgroup_link {
180 /*
181 * List running through cg_cgroup_links associated with a
182 * cgroup, anchored on cgroup->css_sets
183 */
184 struct list_head cgrp_link_list;
185 /*
186 * List running through cg_cgroup_links pointing at a
187 * single css_set object, anchored on css_set->cg_links
188 */
189 struct list_head cg_link_list;
190 struct css_set *cg;
191};
192
193/* The default css_set - used by init and its children prior to any
194 * hierarchies being mounted. It contains a pointer to the root state
195 * for each subsystem. Also used to anchor the list of css_sets. Not
196 * reference-counted, to improve performance when child cgroups
197 * haven't been created.
198 */
199
200static struct css_set init_css_set;
201static struct cg_cgroup_link init_css_set_link;
202
203/* css_set_lock protects the list of css_set objects, and the
204 * chain of tasks off each css_set. Nests outside task->alloc_lock
205 * due to cgroup_iter_start() */
206static DEFINE_RWLOCK(css_set_lock);
207static int css_set_count;
208
209/* We don't maintain the lists running through each css_set to its
210 * task until after the first call to cgroup_iter_start(). This
211 * reduces the fork()/exit() overhead for people who have cgroups
212 * compiled into their kernel but not actually in use */
213static int use_task_css_set_links;
214
215/* When we create or destroy a css_set, the operation simply
216 * takes/releases a reference count on all the cgroups referenced
217 * by subsystems in this css_set. This can end up multiple-counting
218 * some cgroups, but that's OK - the ref-count is just a
219 * busy/not-busy indicator; ensuring that we only count each cgroup
220 * once would require taking a global lock to ensure that no
221 * subsystems moved between hierarchies while we were doing so.
222 *
223 * Possible TODO: decide at boot time based on the number of
224 * registered subsystems and the number of CPUs or NUMA nodes whether
225 * it's better for performance to ref-count every subsystem, or to
226 * take a global lock and only add one ref count to each hierarchy.
227 */
228
229/*
230 * unlink a css_set from the list and free it
231 */
232static void unlink_css_set(struct css_set *cg)
233{
234 write_lock(&css_set_lock);
235 list_del(&cg->list);
236 css_set_count--;
237 while (!list_empty(&cg->cg_links)) {
238 struct cg_cgroup_link *link;
239 link = list_entry(cg->cg_links.next,
240 struct cg_cgroup_link, cg_link_list);
241 list_del(&link->cg_link_list);
242 list_del(&link->cgrp_link_list);
243 kfree(link);
244 }
245 write_unlock(&css_set_lock);
246}
247
248static void __release_css_set(struct kref *k, int taskexit)
249{
250 int i;
251 struct css_set *cg = container_of(k, struct css_set, ref);
252
253 unlink_css_set(cg);
254
255 rcu_read_lock();
256 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
257 struct cgroup *cgrp = cg->subsys[i]->cgroup;
258 if (atomic_dec_and_test(&cgrp->count) &&
259 notify_on_release(cgrp)) {
260 if (taskexit)
261 set_bit(CGRP_RELEASABLE, &cgrp->flags);
262 check_for_release(cgrp);
263 }
264 }
265 rcu_read_unlock();
266 kfree(cg);
267}
268
269static void release_css_set(struct kref *k)
270{
271 __release_css_set(k, 0);
272}
273
274static void release_css_set_taskexit(struct kref *k)
275{
276 __release_css_set(k, 1);
277}
278
279/*
280 * refcounted get/put for css_set objects
281 */
282static inline void get_css_set(struct css_set *cg)
283{
284 kref_get(&cg->ref);
285}
286
287static inline void put_css_set(struct css_set *cg)
288{
289 kref_put(&cg->ref, release_css_set);
290}
291
292static inline void put_css_set_taskexit(struct css_set *cg)
293{
294 kref_put(&cg->ref, release_css_set_taskexit);
295}
296
297/*
298 * find_existing_css_set() is a helper for
299 * find_css_set(), and checks to see whether an existing
300 * css_set is suitable. This currently walks a linked-list for
301 * simplicity; a later patch will use a hash table for better
302 * performance
303 *
304 * oldcg: the cgroup group that we're using before the cgroup
305 * transition
306 *
307 * cgrp: the cgroup that we're moving into
308 *
309 * template: location in which to build the desired set of subsystem
310 * state objects for the new cgroup group
311 */
312
313static struct css_set *find_existing_css_set(
314 struct css_set *oldcg,
315 struct cgroup *cgrp,
316 struct cgroup_subsys_state *template[])
317{
318 int i;
319 struct cgroupfs_root *root = cgrp->root;
320 struct list_head *l = &init_css_set.list;
321
322 /* Built the set of subsystem state objects that we want to
323 * see in the new css_set */
324 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
325 if (root->subsys_bits & (1ull << i)) {
326 /* Subsystem is in this hierarchy. So we want
327 * the subsystem state from the new
328 * cgroup */
329 template[i] = cgrp->subsys[i];
330 } else {
331 /* Subsystem is not in this hierarchy, so we
332 * don't want to change the subsystem state */
333 template[i] = oldcg->subsys[i];
334 }
335 }
336
337 /* Look through existing cgroup groups to find one to reuse */
338 do {
339 struct css_set *cg =
340 list_entry(l, struct css_set, list);
341
342 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
343 /* All subsystems matched */
344 return cg;
345 }
346 /* Try the next cgroup group */
347 l = l->next;
348 } while (l != &init_css_set.list);
349
350 /* No existing cgroup group matched */
351 return NULL;
352}
353
354/*
355 * allocate_cg_links() allocates "count" cg_cgroup_link structures
356 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
357 * success or a negative error
358 */
359
360static int allocate_cg_links(int count, struct list_head *tmp)
361{
362 struct cg_cgroup_link *link;
363 int i;
364 INIT_LIST_HEAD(tmp);
365 for (i = 0; i < count; i++) {
366 link = kmalloc(sizeof(*link), GFP_KERNEL);
367 if (!link) {
368 while (!list_empty(tmp)) {
369 link = list_entry(tmp->next,
370 struct cg_cgroup_link,
371 cgrp_link_list);
372 list_del(&link->cgrp_link_list);
373 kfree(link);
374 }
375 return -ENOMEM;
376 }
377 list_add(&link->cgrp_link_list, tmp);
378 }
379 return 0;
380}
381
382static void free_cg_links(struct list_head *tmp)
383{
384 while (!list_empty(tmp)) {
385 struct cg_cgroup_link *link;
386 link = list_entry(tmp->next,
387 struct cg_cgroup_link,
388 cgrp_link_list);
389 list_del(&link->cgrp_link_list);
390 kfree(link);
391 }
392}
393
394/*
395 * find_css_set() takes an existing cgroup group and a
396 * cgroup object, and returns a css_set object that's
397 * equivalent to the old group, but with the given cgroup
398 * substituted into the appropriate hierarchy. Must be called with
399 * cgroup_mutex held
400 */
401
402static struct css_set *find_css_set(
403 struct css_set *oldcg, struct cgroup *cgrp)
404{
405 struct css_set *res;
406 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
407 int i;
408
409 struct list_head tmp_cg_links;
410 struct cg_cgroup_link *link;
411
412 /* First see if we already have a cgroup group that matches
413 * the desired set */
414 write_lock(&css_set_lock);
415 res = find_existing_css_set(oldcg, cgrp, template);
416 if (res)
417 get_css_set(res);
418 write_unlock(&css_set_lock);
419
420 if (res)
421 return res;
422
423 res = kmalloc(sizeof(*res), GFP_KERNEL);
424 if (!res)
425 return NULL;
426
427 /* Allocate all the cg_cgroup_link objects that we'll need */
428 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
429 kfree(res);
430 return NULL;
431 }
432
433 kref_init(&res->ref);
434 INIT_LIST_HEAD(&res->cg_links);
435 INIT_LIST_HEAD(&res->tasks);
436
437 /* Copy the set of subsystem state objects generated in
438 * find_existing_css_set() */
439 memcpy(res->subsys, template, sizeof(res->subsys));
440
441 write_lock(&css_set_lock);
442 /* Add reference counts and links from the new css_set. */
443 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
444 struct cgroup *cgrp = res->subsys[i]->cgroup;
445 struct cgroup_subsys *ss = subsys[i];
446 atomic_inc(&cgrp->count);
447 /*
448 * We want to add a link once per cgroup, so we
449 * only do it for the first subsystem in each
450 * hierarchy
451 */
452 if (ss->root->subsys_list.next == &ss->sibling) {
453 BUG_ON(list_empty(&tmp_cg_links));
454 link = list_entry(tmp_cg_links.next,
455 struct cg_cgroup_link,
456 cgrp_link_list);
457 list_del(&link->cgrp_link_list);
458 list_add(&link->cgrp_link_list, &cgrp->css_sets);
459 link->cg = res;
460 list_add(&link->cg_link_list, &res->cg_links);
461 }
462 }
463 if (list_empty(&rootnode.subsys_list)) {
464 link = list_entry(tmp_cg_links.next,
465 struct cg_cgroup_link,
466 cgrp_link_list);
467 list_del(&link->cgrp_link_list);
468 list_add(&link->cgrp_link_list, &dummytop->css_sets);
469 link->cg = res;
470 list_add(&link->cg_link_list, &res->cg_links);
471 }
472
473 BUG_ON(!list_empty(&tmp_cg_links));
474
475 /* Link this cgroup group into the list */
476 list_add(&res->list, &init_css_set.list);
477 css_set_count++;
478 INIT_LIST_HEAD(&res->tasks);
479 write_unlock(&css_set_lock);
480
481 return res;
482}
483
484/*
485 * There is one global cgroup mutex. We also require taking
486 * task_lock() when dereferencing a task's cgroup subsys pointers.
487 * See "The task_lock() exception", at the end of this comment.
488 *
489 * A task must hold cgroup_mutex to modify cgroups.
490 *
491 * Any task can increment and decrement the count field without lock.
492 * So in general, code holding cgroup_mutex can't rely on the count
493 * field not changing. However, if the count goes to zero, then only
494 * attach_task() can increment it again. Because a count of zero
495 * means that no tasks are currently attached, therefore there is no
496 * way a task attached to that cgroup can fork (the other way to
497 * increment the count). So code holding cgroup_mutex can safely
498 * assume that if the count is zero, it will stay zero. Similarly, if
499 * a task holds cgroup_mutex on a cgroup with zero count, it
500 * knows that the cgroup won't be removed, as cgroup_rmdir()
501 * needs that mutex.
502 *
503 * The cgroup_common_file_write handler for operations that modify
504 * the cgroup hierarchy holds cgroup_mutex across the entire operation,
505 * single threading all such cgroup modifications across the system.
506 *
507 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
508 * (usually) take cgroup_mutex. These are the two most performance
509 * critical pieces of code here. The exception occurs on cgroup_exit(),
510 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
511 * is taken, and if the cgroup count is zero, a usermode call made
512 * to /sbin/cgroup_release_agent with the name of the cgroup (path
513 * relative to the root of cgroup file system) as the argument.
514 *
515 * A cgroup can only be deleted if both its 'count' of using tasks
516 * is zero, and its list of 'children' cgroups is empty. Since all
517 * tasks in the system use _some_ cgroup, and since there is always at
518 * least one task in the system (init, pid == 1), therefore, top_cgroup
519 * always has either children cgroups and/or using tasks. So we don't
520 * need a special hack to ensure that top_cgroup cannot be deleted.
521 *
522 * The task_lock() exception
523 *
524 * The need for this exception arises from the action of
 525 * attach_task(), which overwrites one task's cgroup pointer with
 526 * another. It does so using cgroup_mutex, however there are
 527 * several performance critical places that need to reference
 528 * task->cgroup without the expense of grabbing a system global
 529 * mutex. Therefore except as noted below, when dereferencing or, as
 530 * in attach_task(), modifying a task's cgroup pointer we use
 531 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
 532 * the task_struct routinely used for such matters.
 533 *
 534 * P.S. One more locking exception. RCU is used to guard the
 535 * update of a task's cgroup pointer by attach_task()
536 */
537
538/**
539 * cgroup_lock - lock out any changes to cgroup structures
540 *
541 */
542
543void cgroup_lock(void)
544{
545 mutex_lock(&cgroup_mutex);
546}
547
548/**
549 * cgroup_unlock - release lock on cgroup changes
550 *
551 * Undo the lock taken in a previous cgroup_lock() call.
552 */
553
554void cgroup_unlock(void)
555{
556 mutex_unlock(&cgroup_mutex);
557}
558
559/*
560 * A couple of forward declarations required, due to cyclic reference loop:
561 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
562 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
563 * -> cgroup_mkdir.
564 */
565
566static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
567static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
568static int cgroup_populate_dir(struct cgroup *cgrp);
569static struct inode_operations cgroup_dir_inode_operations;
570static struct file_operations proc_cgroupstats_operations;
571
572static struct backing_dev_info cgroup_backing_dev_info = {
573 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
574};
575
576static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
577{
578 struct inode *inode = new_inode(sb);
579
580 if (inode) {
581 inode->i_mode = mode;
582 inode->i_uid = current->fsuid;
583 inode->i_gid = current->fsgid;
584 inode->i_blocks = 0;
585 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
586 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
587 }
588 return inode;
589}
590
591static void cgroup_diput(struct dentry *dentry, struct inode *inode)
592{
593 /* is dentry a directory ? if so, kfree() associated cgroup */
594 if (S_ISDIR(inode->i_mode)) {
595 struct cgroup *cgrp = dentry->d_fsdata;
596 BUG_ON(!(cgroup_is_removed(cgrp)));
597 /* It's possible for external users to be holding css
598 * reference counts on a cgroup; css_put() needs to
599 * be able to access the cgroup after decrementing
600 * the reference count in order to know if it needs to
601 * queue the cgroup to be handled by the release
602 * agent */
603 synchronize_rcu();
604 kfree(cgrp);
605 }
606 iput(inode);
607}
608
609static void remove_dir(struct dentry *d)
610{
611 struct dentry *parent = dget(d->d_parent);
612
613 d_delete(d);
614 simple_rmdir(parent->d_inode, d);
615 dput(parent);
616}
617
618static void cgroup_clear_directory(struct dentry *dentry)
619{
620 struct list_head *node;
621
622 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
623 spin_lock(&dcache_lock);
624 node = dentry->d_subdirs.next;
625 while (node != &dentry->d_subdirs) {
626 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
627 list_del_init(node);
628 if (d->d_inode) {
629 /* This should never be called on a cgroup
630 * directory with child cgroups */
631 BUG_ON(d->d_inode->i_mode & S_IFDIR);
632 d = dget_locked(d);
633 spin_unlock(&dcache_lock);
634 d_delete(d);
635 simple_unlink(dentry->d_inode, d);
636 dput(d);
637 spin_lock(&dcache_lock);
638 }
639 node = dentry->d_subdirs.next;
640 }
641 spin_unlock(&dcache_lock);
642}
643
644/*
645 * NOTE : the dentry must have been dget()'ed
646 */
647static void cgroup_d_remove_dir(struct dentry *dentry)
648{
649 cgroup_clear_directory(dentry);
650
651 spin_lock(&dcache_lock);
652 list_del_init(&dentry->d_u.d_child);
653 spin_unlock(&dcache_lock);
654 remove_dir(dentry);
655}
656
657static int rebind_subsystems(struct cgroupfs_root *root,
658 unsigned long final_bits)
659{
660 unsigned long added_bits, removed_bits;
661 struct cgroup *cgrp = &root->top_cgroup;
662 int i;
663
664 removed_bits = root->actual_subsys_bits & ~final_bits;
665 added_bits = final_bits & ~root->actual_subsys_bits;
666 /* Check that any added subsystems are currently free */
667 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
668 unsigned long long bit = 1ull << i;
669 struct cgroup_subsys *ss = subsys[i];
670 if (!(bit & added_bits))
671 continue;
672 if (ss->root != &rootnode) {
673 /* Subsystem isn't free */
674 return -EBUSY;
675 }
676 }
677
678 /* Currently we don't handle adding/removing subsystems when
679 * any child cgroups exist. This is theoretically supportable
680 * but involves complex error handling, so it's being left until
681 * later */
682 if (!list_empty(&cgrp->children))
683 return -EBUSY;
684
685 /* Process each subsystem */
686 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
687 struct cgroup_subsys *ss = subsys[i];
688 unsigned long bit = 1UL << i;
689 if (bit & added_bits) {
690 /* We're binding this subsystem to this hierarchy */
691 BUG_ON(cgrp->subsys[i]);
692 BUG_ON(!dummytop->subsys[i]);
693 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
694 cgrp->subsys[i] = dummytop->subsys[i];
695 cgrp->subsys[i]->cgroup = cgrp;
696 list_add(&ss->sibling, &root->subsys_list);
697 rcu_assign_pointer(ss->root, root);
698 if (ss->bind)
699 ss->bind(ss, cgrp);
700
701 } else if (bit & removed_bits) {
702 /* We're removing this subsystem */
703 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
704 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
705 if (ss->bind)
706 ss->bind(ss, dummytop);
707 dummytop->subsys[i]->cgroup = dummytop;
708 cgrp->subsys[i] = NULL;
709 rcu_assign_pointer(subsys[i]->root, &rootnode);
710 list_del(&ss->sibling);
711 } else if (bit & final_bits) {
712 /* Subsystem state should already exist */
713 BUG_ON(!cgrp->subsys[i]);
714 } else {
715 /* Subsystem state shouldn't exist */
716 BUG_ON(cgrp->subsys[i]);
717 }
718 }
719 root->subsys_bits = root->actual_subsys_bits = final_bits;
720 synchronize_rcu();
721
722 return 0;
723}
724
725static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
726{
727 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
728 struct cgroup_subsys *ss;
729
730 mutex_lock(&cgroup_mutex);
731 for_each_subsys(root, ss)
732 seq_printf(seq, ",%s", ss->name);
733 if (test_bit(ROOT_NOPREFIX, &root->flags))
734 seq_puts(seq, ",noprefix");
735 if (strlen(root->release_agent_path))
736 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
737 mutex_unlock(&cgroup_mutex);
738 return 0;
739}
740
741struct cgroup_sb_opts {
742 unsigned long subsys_bits;
743 unsigned long flags;
744 char *release_agent;
745};
746
747/* Convert a hierarchy specifier into a bitmask of subsystems and
748 * flags. */
749static int parse_cgroupfs_options(char *data,
750 struct cgroup_sb_opts *opts)
751{
752 char *token, *o = data ?: "all";
753
754 opts->subsys_bits = 0;
755 opts->flags = 0;
756 opts->release_agent = NULL;
757
758 while ((token = strsep(&o, ",")) != NULL) {
759 if (!*token)
760 return -EINVAL;
761 if (!strcmp(token, "all")) {
762 opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
763 } else if (!strcmp(token, "noprefix")) {
764 set_bit(ROOT_NOPREFIX, &opts->flags);
765 } else if (!strncmp(token, "release_agent=", 14)) {
766 /* Specifying two release agents is forbidden */
767 if (opts->release_agent)
768 return -EINVAL;
769 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
770 if (!opts->release_agent)
771 return -ENOMEM;
772 strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
773 opts->release_agent[PATH_MAX - 1] = 0;
774 } else {
775 struct cgroup_subsys *ss;
776 int i;
777 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
778 ss = subsys[i];
779 if (!strcmp(token, ss->name)) {
780 set_bit(i, &opts->subsys_bits);
781 break;
782 }
783 }
784 if (i == CGROUP_SUBSYS_COUNT)
785 return -ENOENT;
786 }
787 }
788
789 /* We can't have an empty hierarchy */
790 if (!opts->subsys_bits)
791 return -EINVAL;
792
793 return 0;
794}
795
796static int cgroup_remount(struct super_block *sb, int *flags, char *data)
797{
798 int ret = 0;
799 struct cgroupfs_root *root = sb->s_fs_info;
800 struct cgroup *cgrp = &root->top_cgroup;
801 struct cgroup_sb_opts opts;
802
803 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
804 mutex_lock(&cgroup_mutex);
805
806 /* See what subsystems are wanted */
807 ret = parse_cgroupfs_options(data, &opts);
808 if (ret)
809 goto out_unlock;
810
811 /* Don't allow flags to change at remount */
812 if (opts.flags != root->flags) {
813 ret = -EINVAL;
814 goto out_unlock;
815 }
816
817 ret = rebind_subsystems(root, opts.subsys_bits);
818
819 /* (re)populate subsystem files */
820 if (!ret)
821 cgroup_populate_dir(cgrp);
822
823 if (opts.release_agent)
824 strcpy(root->release_agent_path, opts.release_agent);
825 out_unlock:
826 if (opts.release_agent)
827 kfree(opts.release_agent);
828 mutex_unlock(&cgroup_mutex);
829 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
830 return ret;
831}
832
833static struct super_operations cgroup_ops = {
834 .statfs = simple_statfs,
835 .drop_inode = generic_delete_inode,
836 .show_options = cgroup_show_options,
837 .remount_fs = cgroup_remount,
838};
839
840static void init_cgroup_root(struct cgroupfs_root *root)
841{
842 struct cgroup *cgrp = &root->top_cgroup;
843 INIT_LIST_HEAD(&root->subsys_list);
844 INIT_LIST_HEAD(&root->root_list);
845 root->number_of_cgroups = 1;
846 cgrp->root = root;
847 cgrp->top_cgroup = cgrp;
848 INIT_LIST_HEAD(&cgrp->sibling);
849 INIT_LIST_HEAD(&cgrp->children);
850 INIT_LIST_HEAD(&cgrp->css_sets);
851 INIT_LIST_HEAD(&cgrp->release_list);
852}
853
854static int cgroup_test_super(struct super_block *sb, void *data)
855{
856 struct cgroupfs_root *new = data;
857 struct cgroupfs_root *root = sb->s_fs_info;
858
859 /* First check subsystems */
860 if (new->subsys_bits != root->subsys_bits)
861 return 0;
862
863 /* Next check flags */
864 if (new->flags != root->flags)
865 return 0;
866
867 return 1;
868}
869
870static int cgroup_set_super(struct super_block *sb, void *data)
871{
872 int ret;
873 struct cgroupfs_root *root = data;
874
875 ret = set_anon_super(sb, NULL);
876 if (ret)
877 return ret;
878
879 sb->s_fs_info = root;
880 root->sb = sb;
881
882 sb->s_blocksize = PAGE_CACHE_SIZE;
883 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
884 sb->s_magic = CGROUP_SUPER_MAGIC;
885 sb->s_op = &cgroup_ops;
886
887 return 0;
888}
889
890static int cgroup_get_rootdir(struct super_block *sb)
891{
892 struct inode *inode =
893 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
894 struct dentry *dentry;
895
896 if (!inode)
897 return -ENOMEM;
898
899 inode->i_op = &simple_dir_inode_operations;
900 inode->i_fop = &simple_dir_operations;
901 inode->i_op = &cgroup_dir_inode_operations;
902 /* directories start off with i_nlink == 2 (for "." entry) */
903 inc_nlink(inode);
904 dentry = d_alloc_root(inode);
905 if (!dentry) {
906 iput(inode);
907 return -ENOMEM;
908 }
909 sb->s_root = dentry;
910 return 0;
911}
912
913static int cgroup_get_sb(struct file_system_type *fs_type,
914 int flags, const char *unused_dev_name,
915 void *data, struct vfsmount *mnt)
916{
917 struct cgroup_sb_opts opts;
918 int ret = 0;
919 struct super_block *sb;
920 struct cgroupfs_root *root;
921 struct list_head tmp_cg_links, *l;
922 INIT_LIST_HEAD(&tmp_cg_links);
923
924 /* First find the desired set of subsystems */
925 ret = parse_cgroupfs_options(data, &opts);
926 if (ret) {
927 if (opts.release_agent)
928 kfree(opts.release_agent);
929 return ret;
930 }
931
932 root = kzalloc(sizeof(*root), GFP_KERNEL);
933 if (!root)
934 return -ENOMEM;
935
936 init_cgroup_root(root);
937 root->subsys_bits = opts.subsys_bits;
938 root->flags = opts.flags;
939 if (opts.release_agent) {
940 strcpy(root->release_agent_path, opts.release_agent);
941 kfree(opts.release_agent);
942 }
943
944 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
945
946 if (IS_ERR(sb)) {
947 kfree(root);
948 return PTR_ERR(sb);
949 }
950
951 if (sb->s_fs_info != root) {
952 /* Reusing an existing superblock */
953 BUG_ON(sb->s_root == NULL);
954 kfree(root);
955 root = NULL;
956 } else {
957 /* New superblock */
958 struct cgroup *cgrp = &root->top_cgroup;
959 struct inode *inode;
960
961 BUG_ON(sb->s_root != NULL);
962
963 ret = cgroup_get_rootdir(sb);
964 if (ret)
965 goto drop_new_super;
966 inode = sb->s_root->d_inode;
967
968 mutex_lock(&inode->i_mutex);
969 mutex_lock(&cgroup_mutex);
970
971 /*
972 * We're accessing css_set_count without locking
973 * css_set_lock here, but that's OK - it can only be
974 * increased by someone holding cgroup_lock, and
975 * that's us. The worst that can happen is that we
976 * have some link structures left over
977 */
978 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
979 if (ret) {
980 mutex_unlock(&cgroup_mutex);
981 mutex_unlock(&inode->i_mutex);
982 goto drop_new_super;
983 }
984
985 ret = rebind_subsystems(root, root->subsys_bits);
986 if (ret == -EBUSY) {
987 mutex_unlock(&cgroup_mutex);
988 mutex_unlock(&inode->i_mutex);
989 goto drop_new_super;
990 }
991
992 /* EBUSY should be the only error here */
993 BUG_ON(ret);
994
995 list_add(&root->root_list, &roots);
996 root_count++;
997
998 sb->s_root->d_fsdata = &root->top_cgroup;
999 root->top_cgroup.dentry = sb->s_root;
1000
1001 /* Link the top cgroup in this hierarchy into all
1002 * the css_set objects */
1003 write_lock(&css_set_lock);
1004 l = &init_css_set.list;
1005 do {
1006 struct css_set *cg;
1007 struct cg_cgroup_link *link;
1008 cg = list_entry(l, struct css_set, list);
1009 BUG_ON(list_empty(&tmp_cg_links));
1010 link = list_entry(tmp_cg_links.next,
1011 struct cg_cgroup_link,
1012 cgrp_link_list);
1013 list_del(&link->cgrp_link_list);
1014 link->cg = cg;
1015 list_add(&link->cgrp_link_list,
1016 &root->top_cgroup.css_sets);
1017 list_add(&link->cg_link_list, &cg->cg_links);
1018 l = l->next;
1019 } while (l != &init_css_set.list);
1020 write_unlock(&css_set_lock);
1021
1022 free_cg_links(&tmp_cg_links);
1023
1024 BUG_ON(!list_empty(&cgrp->sibling));
1025 BUG_ON(!list_empty(&cgrp->children));
1026 BUG_ON(root->number_of_cgroups != 1);
1027
1028 cgroup_populate_dir(cgrp);
1029 mutex_unlock(&inode->i_mutex);
1030 mutex_unlock(&cgroup_mutex);
1031 }
1032
1033 return simple_set_mnt(mnt, sb);
1034
1035 drop_new_super:
1036 up_write(&sb->s_umount);
1037 deactivate_super(sb);
1038 free_cg_links(&tmp_cg_links);
1039 return ret;
1040}
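From userspace, a hierarchy served by cgroup_get_sb() is mounted like any other filesystem. A minimal sketch, assuming a /dev/cgroup mount point and the cpuset subsystem; both names are illustrative and not part of this patch:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/*
	 * The option string is handed to parse_cgroupfs_options(): a
	 * comma-separated list of subsystem names plus the flag options
	 * that function accepts (e.g. a release_agent= path).
	 */
	if (mount("cgroup", "/dev/cgroup", "cgroup", 0, "cpuset"))
		perror("mount cgroup");
	return 0;
}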
1041
1042static void cgroup_kill_sb(struct super_block *sb) {
1043 struct cgroupfs_root *root = sb->s_fs_info;
1044 struct cgroup *cgrp = &root->top_cgroup;
1045 int ret;
1046
1047 BUG_ON(!root);
1048
1049 BUG_ON(root->number_of_cgroups != 1);
1050 BUG_ON(!list_empty(&cgrp->children));
1051 BUG_ON(!list_empty(&cgrp->sibling));
1052
1053 mutex_lock(&cgroup_mutex);
1054
1055 /* Rebind all subsystems back to the default hierarchy */
1056 ret = rebind_subsystems(root, 0);
1057 /* Shouldn't be able to fail ... */
1058 BUG_ON(ret);
1059
1060 /*
1061 * Release all the links from css_sets to this hierarchy's
1062 * root cgroup
1063 */
1064 write_lock(&css_set_lock);
1065 while (!list_empty(&cgrp->css_sets)) {
1066 struct cg_cgroup_link *link;
1067 link = list_entry(cgrp->css_sets.next,
1068 struct cg_cgroup_link, cgrp_link_list);
1069 list_del(&link->cg_link_list);
1070 list_del(&link->cgrp_link_list);
1071 kfree(link);
1072 }
1073 write_unlock(&css_set_lock);
1074
1075 if (!list_empty(&root->root_list)) {
1076 list_del(&root->root_list);
1077 root_count--;
1078 }
1079 mutex_unlock(&cgroup_mutex);
1080
1081 kfree(root);
1082 kill_litter_super(sb);
1083}
1084
1085static struct file_system_type cgroup_fs_type = {
1086 .name = "cgroup",
1087 .get_sb = cgroup_get_sb,
1088 .kill_sb = cgroup_kill_sb,
1089};
1090
1091static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1092{
1093 return dentry->d_fsdata;
1094}
1095
1096static inline struct cftype *__d_cft(struct dentry *dentry)
1097{
1098 return dentry->d_fsdata;
1099}
1100
1101/*
1102 * Called with cgroup_mutex held. Writes path of cgroup into buf.
1103 * Returns 0 on success, -errno on error.
1104 */
1105int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1106{
1107 char *start;
1108
1109 if (cgrp == dummytop) {
1110 /*
1111 * Inactive subsystems have no dentry for their root
1112 * cgroup
1113 */
1114 strcpy(buf, "/");
1115 return 0;
1116 }
1117
1118 start = buf + buflen;
1119
1120 *--start = '\0';
1121 for (;;) {
1122 int len = cgrp->dentry->d_name.len;
1123 if ((start -= len) < buf)
1124 return -ENAMETOOLONG;
1125 memcpy(start, cgrp->dentry->d_name.name, len);
1126 cgrp = cgrp->parent;
1127 if (!cgrp)
1128 break;
1129 if (!cgrp->parent)
1130 continue;
1131 if (--start < buf)
1132 return -ENAMETOOLONG;
1133 *start = '/';
1134 }
1135 memmove(buf, start, buf + buflen - start);
1136 return 0;
1137}
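A minimal sketch of a caller of cgroup_path(); the print_cgroup_path() helper is hypothetical and exists only to show the calling convention (caller-supplied buffer, cgroup_mutex held). cgroup_release_agent() later in this file uses the same pattern with a PAGE_SIZE buffer:

static void print_cgroup_path(struct cgroup *cgrp)
{
	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

	if (!buf)
		return;
	/* cgroup_mutex must already be held, as noted above */
	if (!cgroup_path(cgrp, buf, PAGE_SIZE))
		printk(KERN_DEBUG "cgroup path: %s\n", buf);
	kfree(buf);
}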
1138
1139/*
1140 * Return the first subsystem attached to a cgroup's hierarchy, and
1141 * its subsystem id.
1142 */
1143
1144static void get_first_subsys(const struct cgroup *cgrp,
1145 struct cgroup_subsys_state **css, int *subsys_id)
1146{
1147 const struct cgroupfs_root *root = cgrp->root;
1148 const struct cgroup_subsys *test_ss;
1149 BUG_ON(list_empty(&root->subsys_list));
1150 test_ss = list_entry(root->subsys_list.next,
1151 struct cgroup_subsys, sibling);
1152 if (css) {
1153 *css = cgrp->subsys[test_ss->subsys_id];
1154 BUG_ON(!*css);
1155 }
1156 if (subsys_id)
1157 *subsys_id = test_ss->subsys_id;
1158}
1159
1160/*
1161 * Attach task 'tsk' to cgroup 'cgrp'
1162 *
1163 * Call holding cgroup_mutex. May take task_lock of
1164 * the task 'tsk' during call.
1165 */
1166static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1167{
1168 int retval = 0;
1169 struct cgroup_subsys *ss;
1170 struct cgroup *oldcgrp;
1171 struct css_set *cg = tsk->cgroups;
1172 struct css_set *newcg;
1173 struct cgroupfs_root *root = cgrp->root;
1174 int subsys_id;
1175
1176 get_first_subsys(cgrp, NULL, &subsys_id);
1177
1178 /* Nothing to do if the task is already in that cgroup */
1179 oldcgrp = task_cgroup(tsk, subsys_id);
1180 if (cgrp == oldcgrp)
1181 return 0;
1182
1183 for_each_subsys(root, ss) {
1184 if (ss->can_attach) {
1185 retval = ss->can_attach(ss, cgrp, tsk);
1186 if (retval) {
1187 return retval;
1188 }
1189 }
1190 }
1191
1192 /*
1193 * Locate or allocate a new css_set for this task,
1194 * based on its final set of cgroups
1195 */
1196 newcg = find_css_set(cg, cgrp);
1197 if (!newcg) {
1198 return -ENOMEM;
1199 }
1200
1201 task_lock(tsk);
1202 if (tsk->flags & PF_EXITING) {
1203 task_unlock(tsk);
1204 put_css_set(newcg);
1205 return -ESRCH;
1206 }
1207 rcu_assign_pointer(tsk->cgroups, newcg);
1208 task_unlock(tsk);
1209
1210 /* Update the css_set linked lists if we're using them */
1211 write_lock(&css_set_lock);
1212 if (!list_empty(&tsk->cg_list)) {
1213 list_del(&tsk->cg_list);
1214 list_add(&tsk->cg_list, &newcg->tasks);
1215 }
1216 write_unlock(&css_set_lock);
1217
1218 for_each_subsys(root, ss) {
1219 if (ss->attach) {
1220 ss->attach(ss, cgrp, oldcgrp, tsk);
1221 }
1222 }
1223 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1224 synchronize_rcu();
1225 put_css_set(cg);
1226 return 0;
1227}
1228
1229/*
1230 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with
1231 * cgroup_mutex, may take task_lock of task
1232 */
1233static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
1234{
1235 pid_t pid;
1236 struct task_struct *tsk;
1237 int ret;
1238
1239 if (sscanf(pidbuf, "%d", &pid) != 1)
1240 return -EIO;
1241
1242 if (pid) {
1243 rcu_read_lock();
1244 tsk = find_task_by_pid(pid);
1245 if (!tsk || tsk->flags & PF_EXITING) {
1246 rcu_read_unlock();
1247 return -ESRCH;
1248 }
1249 get_task_struct(tsk);
1250 rcu_read_unlock();
1251
1252 if ((current->euid) && (current->euid != tsk->uid)
1253 && (current->euid != tsk->suid)) {
1254 put_task_struct(tsk);
1255 return -EACCES;
1256 }
1257 } else {
1258 tsk = current;
1259 get_task_struct(tsk);
1260 }
1261
1262 ret = attach_task(cgrp, tsk);
1263 put_task_struct(tsk);
1264 return ret;
1265}
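The kernel side above is driven from userspace by writing a PID into a cgroup's tasks file. A minimal sketch, assuming a hierarchy mounted at /dev/cgroup with a child cgroup named foo (both names illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	int fd = open("/dev/cgroup/foo/tasks", O_WRONLY);
	int n;

	if (fd < 0)
		return 1;
	/* writing "0" instead would attach the writing task itself */
	n = snprintf(buf, sizeof(buf), "%d", getpid());
	if (write(fd, buf, n) != n)
		perror("write tasks");
	close(fd);
	return 0;
}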
1266
1267/* The various types of files and directories in a cgroup file system */
1268
1269enum cgroup_filetype {
1270 FILE_ROOT,
1271 FILE_DIR,
1272 FILE_TASKLIST,
1273 FILE_NOTIFY_ON_RELEASE,
1274 FILE_RELEASABLE,
1275 FILE_RELEASE_AGENT,
1276};
1277
1278static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
1279 struct file *file,
1280 const char __user *userbuf,
1281 size_t nbytes, loff_t *unused_ppos)
1282{
1283 char buffer[64];
1284 int retval = 0;
1285 u64 val;
1286 char *end;
1287
1288 if (!nbytes)
1289 return -EINVAL;
1290 if (nbytes >= sizeof(buffer))
1291 return -E2BIG;
1292 if (copy_from_user(buffer, userbuf, nbytes))
1293 return -EFAULT;
1294
1295 buffer[nbytes] = 0; /* nul-terminate */
1296
1297 /* strip newline if necessary */
1298 if (nbytes && (buffer[nbytes-1] == '\n'))
1299 buffer[nbytes-1] = 0;
1300 val = simple_strtoull(buffer, &end, 0);
1301 if (*end)
1302 return -EINVAL;
1303
1304 /* Pass to subsystem */
1305 retval = cft->write_uint(cgrp, cft, val);
1306 if (!retval)
1307 retval = nbytes;
1308 return retval;
1309}
1310
1311static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
1312 struct cftype *cft,
1313 struct file *file,
1314 const char __user *userbuf,
1315 size_t nbytes, loff_t *unused_ppos)
1316{
1317 enum cgroup_filetype type = cft->private;
1318 char *buffer;
1319 int retval = 0;
1320
1321 if (nbytes >= PATH_MAX)
1322 return -E2BIG;
1323
1324 /* +1 for nul-terminator */
1325 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1326 if (buffer == NULL)
1327 return -ENOMEM;
1328
1329 if (copy_from_user(buffer, userbuf, nbytes)) {
1330 retval = -EFAULT;
1331 goto out1;
1332 }
1333 buffer[nbytes] = 0; /* nul-terminate */
1334
1335 mutex_lock(&cgroup_mutex);
1336
1337 if (cgroup_is_removed(cgrp)) {
1338 retval = -ENODEV;
1339 goto out2;
1340 }
1341
1342 switch (type) {
1343 case FILE_TASKLIST:
1344 retval = attach_task_by_pid(cgrp, buffer);
1345 break;
1346 case FILE_NOTIFY_ON_RELEASE:
1347 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
1348 if (simple_strtoul(buffer, NULL, 10) != 0)
1349 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1350 else
1351 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1352 break;
1353 case FILE_RELEASE_AGENT:
1354 {
1355 struct cgroupfs_root *root = cgrp->root;
1356 /* Strip trailing newline */
1357 if (nbytes && (buffer[nbytes-1] == '\n')) {
1358 buffer[nbytes-1] = 0;
1359 }
1360 if (nbytes < sizeof(root->release_agent_path)) {
1361 /* We never write anything other than '\0'
1362 * into the last char of release_agent_path,
1363 * so it always remains a NUL-terminated
1364 * string */
1365 strncpy(root->release_agent_path, buffer, nbytes);
1366 root->release_agent_path[nbytes] = 0;
1367 } else {
1368 retval = -ENOSPC;
1369 }
1370 break;
1371 }
1372 default:
1373 retval = -EINVAL;
1374 goto out2;
1375 }
1376
1377 if (retval == 0)
1378 retval = nbytes;
1379out2:
1380 mutex_unlock(&cgroup_mutex);
1381out1:
1382 kfree(buffer);
1383 return retval;
1384}
1385
1386static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1387 size_t nbytes, loff_t *ppos)
1388{
1389 struct cftype *cft = __d_cft(file->f_dentry);
1390 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1391
1392 if (!cft)
1393 return -ENODEV;
1394 if (cft->write)
1395 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1396 if (cft->write_uint)
1397 return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos);
1398 return -EINVAL;
1399}
1400
1401static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft,
1402 struct file *file,
1403 char __user *buf, size_t nbytes,
1404 loff_t *ppos)
1405{
1406 char tmp[64];
1407 u64 val = cft->read_uint(cgrp, cft);
1408 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1409
1410 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1411}
1412
1413static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1414 struct cftype *cft,
1415 struct file *file,
1416 char __user *buf,
1417 size_t nbytes, loff_t *ppos)
1418{
1419 enum cgroup_filetype type = cft->private;
1420 char *page;
1421 ssize_t retval = 0;
1422 char *s;
1423
1424 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
1425 return -ENOMEM;
1426
1427 s = page;
1428
1429 switch (type) {
1430 case FILE_RELEASE_AGENT:
1431 {
1432 struct cgroupfs_root *root;
1433 size_t n;
1434 mutex_lock(&cgroup_mutex);
1435 root = cgrp->root;
1436 n = strnlen(root->release_agent_path,
1437 sizeof(root->release_agent_path));
1438 n = min(n, (size_t) PAGE_SIZE);
1439 strncpy(s, root->release_agent_path, n);
1440 mutex_unlock(&cgroup_mutex);
1441 s += n;
1442 break;
1443 }
1444 default:
1445 retval = -EINVAL;
1446 goto out;
1447 }
1448 *s++ = '\n';
1449
1450 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1451out:
1452 free_page((unsigned long)page);
1453 return retval;
1454}
1455
1456static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1457 size_t nbytes, loff_t *ppos)
1458{
1459 struct cftype *cft = __d_cft(file->f_dentry);
1460 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1461
1462 if (!cft)
1463 return -ENODEV;
1464
1465 if (cft->read)
1466 return cft->read(cgrp, cft, file, buf, nbytes, ppos);
1467 if (cft->read_uint)
1468 return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos);
1469 return -EINVAL;
1470}
1471
1472static int cgroup_file_open(struct inode *inode, struct file *file)
1473{
1474 int err;
1475 struct cftype *cft;
1476
1477 err = generic_file_open(inode, file);
1478 if (err)
1479 return err;
1480
1481 cft = __d_cft(file->f_dentry);
1482 if (!cft)
1483 return -ENODEV;
1484 if (cft->open)
1485 err = cft->open(inode, file);
1486 else
1487 err = 0;
1488
1489 return err;
1490}
1491
1492static int cgroup_file_release(struct inode *inode, struct file *file)
1493{
1494 struct cftype *cft = __d_cft(file->f_dentry);
1495 if (cft->release)
1496 return cft->release(inode, file);
1497 return 0;
1498}
1499
1500/*
1501 * cgroup_rename - Only allow simple rename of directories in place.
1502 */
1503static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
1504 struct inode *new_dir, struct dentry *new_dentry)
1505{
1506 if (!S_ISDIR(old_dentry->d_inode->i_mode))
1507 return -ENOTDIR;
1508 if (new_dentry->d_inode)
1509 return -EEXIST;
1510 if (old_dir != new_dir)
1511 return -EIO;
1512 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1513}
1514
1515static struct file_operations cgroup_file_operations = {
1516 .read = cgroup_file_read,
1517 .write = cgroup_file_write,
1518 .llseek = generic_file_llseek,
1519 .open = cgroup_file_open,
1520 .release = cgroup_file_release,
1521};
1522
1523static struct inode_operations cgroup_dir_inode_operations = {
1524 .lookup = simple_lookup,
1525 .mkdir = cgroup_mkdir,
1526 .rmdir = cgroup_rmdir,
1527 .rename = cgroup_rename,
1528};
1529
1530static int cgroup_create_file(struct dentry *dentry, int mode,
1531 struct super_block *sb)
1532{
1533 static struct dentry_operations cgroup_dops = {
1534 .d_iput = cgroup_diput,
1535 };
1536
1537 struct inode *inode;
1538
1539 if (!dentry)
1540 return -ENOENT;
1541 if (dentry->d_inode)
1542 return -EEXIST;
1543
1544 inode = cgroup_new_inode(mode, sb);
1545 if (!inode)
1546 return -ENOMEM;
1547
1548 if (S_ISDIR(mode)) {
1549 inode->i_op = &cgroup_dir_inode_operations;
1550 inode->i_fop = &simple_dir_operations;
1551
1552 /* start off with i_nlink == 2 (for "." entry) */
1553 inc_nlink(inode);
1554
1555 /* start with the directory inode held, so that we can
1556 * populate it without racing with another mkdir */
1557 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1558 } else if (S_ISREG(mode)) {
1559 inode->i_size = 0;
1560 inode->i_fop = &cgroup_file_operations;
1561 }
1562 dentry->d_op = &cgroup_dops;
1563 d_instantiate(dentry, inode);
1564 dget(dentry); /* Extra count - pin the dentry in core */
1565 return 0;
1566}
1567
1568/*
1569 * cgroup_create_dir - create a directory for an object.
1570 * cgrp: the cgroup we create the directory for.
1571 * It must have a valid ->parent field,
1572 * and we are going to fill its ->dentry field.
1573 * dentry: dentry of the new cgroup
1574 * mode: mode to set on new directory.
1575 */
1576static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1577 int mode)
1578{
1579 struct dentry *parent;
1580 int error = 0;
1581
1582 parent = cgrp->parent->dentry;
1583 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
1584 if (!error) {
1585 dentry->d_fsdata = cgrp;
1586 inc_nlink(parent->d_inode);
1587 cgrp->dentry = dentry;
1588 dget(dentry);
1589 }
1590 dput(dentry);
1591
1592 return error;
1593}
1594
1595int cgroup_add_file(struct cgroup *cgrp,
1596 struct cgroup_subsys *subsys,
1597 const struct cftype *cft)
1598{
1599 struct dentry *dir = cgrp->dentry;
1600 struct dentry *dentry;
1601 int error;
1602
1603 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
1604 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
1605 strcpy(name, subsys->name);
1606 strcat(name, ".");
1607 }
1608 strcat(name, cft->name);
1609 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
1610 dentry = lookup_one_len(name, dir, strlen(name));
1611 if (!IS_ERR(dentry)) {
1612 error = cgroup_create_file(dentry, 0644 | S_IFREG,
1613 cgrp->root->sb);
1614 if (!error)
1615 dentry->d_fsdata = (void *)cft;
1616 dput(dentry);
1617 } else
1618 error = PTR_ERR(dentry);
1619 return error;
1620}
1621
1622int cgroup_add_files(struct cgroup *cgrp,
1623 struct cgroup_subsys *subsys,
1624 const struct cftype cft[],
1625 int count)
1626{
1627 int i, err;
1628 for (i = 0; i < count; i++) {
1629 err = cgroup_add_file(cgrp, subsys, &cft[i]);
1630 if (err)
1631 return err;
1632 }
1633 return 0;
1634}
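For reference, cgroup_add_files() is what a subsystem's populate() callback is expected to call. A sketch of a hypothetical subsystem exposing one counter through the read_uint/write_uint helpers above; every foo_* name is illustrative only:

static u64 foo_limit_read(struct cgroup *cgrp, struct cftype *cft)
{
	/* foo_css() would map the cgroup to the subsystem's private state */
	return foo_css(cgrp)->limit;
}

static int foo_limit_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	foo_css(cgrp)->limit = val;
	return 0;
}

static struct cftype foo_files[] = {
	{
		.name = "limit",	/* shows up as "foo.limit" unless noprefix is used */
		.read_uint = foo_limit_read,
		.write_uint = foo_limit_write,
	},
};

static int foo_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	return cgroup_add_files(cgrp, ss, foo_files, ARRAY_SIZE(foo_files));
}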
1635
1636/* Count the number of tasks in a cgroup. */
1637
1638int cgroup_task_count(const struct cgroup *cgrp)
1639{
1640 int count = 0;
1641 struct list_head *l;
1642
1643 read_lock(&css_set_lock);
1644 l = cgrp->css_sets.next;
1645 while (l != &cgrp->css_sets) {
1646 struct cg_cgroup_link *link =
1647 list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1648 count += atomic_read(&link->cg->ref.refcount);
1649 l = l->next;
1650 }
1651 read_unlock(&css_set_lock);
1652 return count;
1653}
1654
1655/*
1656 * Advance a list_head iterator. The iterator should be positioned at
1657 * the start of a css_set
1658 */
1659static void cgroup_advance_iter(struct cgroup *cgrp,
1660 struct cgroup_iter *it)
1661{
1662 struct list_head *l = it->cg_link;
1663 struct cg_cgroup_link *link;
1664 struct css_set *cg;
1665
1666 /* Advance to the next non-empty css_set */
1667 do {
1668 l = l->next;
1669 if (l == &cgrp->css_sets) {
1670 it->cg_link = NULL;
1671 return;
1672 }
1673 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1674 cg = link->cg;
1675 } while (list_empty(&cg->tasks));
1676 it->cg_link = l;
1677 it->task = cg->tasks.next;
1678}
1679
1680void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
1681{
1682 /*
1683 * The first time anyone tries to iterate across a cgroup,
1684 * we need to enable the list linking each css_set to its
1685 * tasks, and fix up all existing tasks.
1686 */
1687 if (!use_task_css_set_links) {
1688 struct task_struct *p, *g;
1689 write_lock(&css_set_lock);
1690 use_task_css_set_links = 1;
1691 do_each_thread(g, p) {
1692 task_lock(p);
1693 if (list_empty(&p->cg_list))
1694 list_add(&p->cg_list, &p->cgroups->tasks);
1695 task_unlock(p);
1696 } while_each_thread(g, p);
1697 write_unlock(&css_set_lock);
1698 }
1699 read_lock(&css_set_lock);
1700 it->cg_link = &cgrp->css_sets;
1701 cgroup_advance_iter(cgrp, it);
1702}
1703
1704struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1705 struct cgroup_iter *it)
1706{
1707 struct task_struct *res;
1708 struct list_head *l = it->task;
1709
1710 /* If the iterator cg is NULL, we have no tasks */
1711 if (!it->cg_link)
1712 return NULL;
1713 res = list_entry(l, struct task_struct, cg_list);
1714 /* Advance iterator to find next entry */
1715 l = l->next;
1716 if (l == &res->cgroups->tasks) {
1717 /* We reached the end of this task list - move on to
1718 * the next cg_cgroup_link */
1719 cgroup_advance_iter(cgrp, it);
1720 } else {
1721 it->task = l;
1722 }
1723 return res;
1724}
1725
1726void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
1727{
1728 read_unlock(&css_set_lock);
1729}
1730
1731/*
1732 * Stuff for reading the 'tasks' file.
1733 *
1734 * Reading this file can return large amounts of data if a cgroup has
1735 * *lots* of attached tasks. So it may need several calls to read(),
1736 * but we cannot guarantee that the information we produce is correct
1737 * unless we produce it entirely atomically.
1738 *
1739 * Upon tasks file open(), a struct ctr_struct is allocated, that
1740 * will have a pointer to an array (also allocated here). The struct
1741 * ctr_struct * is stored in file->private_data. Its resources will
1742 * be freed by release() when the file is closed. The array is used
1743 * to sprintf the PIDs and then used by read().
1744 */
1745struct ctr_struct {
1746 char *buf;
1747 int bufsz;
1748};
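From userspace, the snapshot described above means a single open()/read() loop sees one consistent PID list. A minimal sketch, again assuming a /dev/cgroup mount point:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/dev/cgroup/tasks", O_RDONLY);

	if (fd < 0)
		return 1;
	/* the PID array was built and sorted at open() time */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}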
1749
1750/*
1751 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
1752 * 'cgrp'. Return actual number of pids loaded. No need to
1753 * task_lock(p) when reading out p->cgroup, since we're in an RCU
1754 * read section, so the css_set can't go away, and is
1755 * immutable after creation.
1756 */
1757static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
1758{
1759 int n = 0;
1760 struct cgroup_iter it;
1761 struct task_struct *tsk;
1762 cgroup_iter_start(cgrp, &it);
1763 while ((tsk = cgroup_iter_next(cgrp, &it))) {
1764 if (unlikely(n == npids))
1765 break;
1766 pidarray[n++] = task_pid_nr(tsk);
1767 }
1768 cgroup_iter_end(cgrp, &it);
1769 return n;
1770}
1771
1772/**
1773 * Build and fill cgroupstats so that taskstats can export it to user
1774 * space.
1775 *
1776 * @stats: cgroupstats to fill information into
1777 * @dentry: A dentry entry belonging to the cgroup for which stats have
1778 * been requested.
1779 */
1780int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
1781{
1782 int ret = -EINVAL;
1783 struct cgroup *cgrp;
1784 struct cgroup_iter it;
1785 struct task_struct *tsk;
1786 /*
1787 * Validate dentry by checking the superblock operations
1788 */
1789 if (dentry->d_sb->s_op != &cgroup_ops)
1790 goto err;
1791
1792 ret = 0;
1793 cgrp = dentry->d_fsdata;
1794 rcu_read_lock();
1795
1796 cgroup_iter_start(cgrp, &it);
1797 while ((tsk = cgroup_iter_next(cgrp, &it))) {
1798 switch (tsk->state) {
1799 case TASK_RUNNING:
1800 stats->nr_running++;
1801 break;
1802 case TASK_INTERRUPTIBLE:
1803 stats->nr_sleeping++;
1804 break;
1805 case TASK_UNINTERRUPTIBLE:
1806 stats->nr_uninterruptible++;
1807 break;
1808 case TASK_STOPPED:
1809 stats->nr_stopped++;
1810 break;
1811 default:
1812 if (delayacct_is_task_waiting_on_io(tsk))
1813 stats->nr_io_wait++;
1814 break;
1815 }
1816 }
1817 cgroup_iter_end(cgrp, &it);
1818
1819 rcu_read_unlock();
1820err:
1821 return ret;
1822}
1823
1824static int cmppid(const void *a, const void *b)
1825{
1826 return *(pid_t *)a - *(pid_t *)b;
1827}
1828
1829/*
1830 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
1831 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
1832 * count 'cnt' of how many chars would be written if buf were large enough.
1833 */
1834static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1835{
1836 int cnt = 0;
1837 int i;
1838
1839 for (i = 0; i < npids; i++)
1840 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
1841 return cnt;
1842}
1843
1844/*
1845 * Handle an open on 'tasks' file. Prepare a buffer listing the
1846 * process id's of tasks currently attached to the cgroup being opened.
1847 *
1848 * Does not require any specific cgroup mutexes, and does not take any.
1849 */
1850static int cgroup_tasks_open(struct inode *unused, struct file *file)
1851{
1852 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1853 struct ctr_struct *ctr;
1854 pid_t *pidarray;
1855 int npids;
1856 char c;
1857
1858 if (!(file->f_mode & FMODE_READ))
1859 return 0;
1860
1861 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
1862 if (!ctr)
1863 goto err0;
1864
1865 /*
1866 * If cgroup gets more users after we read count, we won't have
1867 * enough space - tough. This race is indistinguishable to the
1868 * caller from the case that the additional cgroup users didn't
1869 * show up until sometime later on.
1870 */
1871 npids = cgroup_task_count(cgrp);
1872 if (npids) {
1873 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
1874 if (!pidarray)
1875 goto err1;
1876
1877 npids = pid_array_load(pidarray, npids, cgrp);
1878 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
1879
1880 /* Call pid_array_to_buf() twice, first just to get bufsz */
1881 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
1882 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
1883 if (!ctr->buf)
1884 goto err2;
1885 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
1886
1887 kfree(pidarray);
1888 } else {
1889 ctr->buf = 0;
1890 ctr->bufsz = 0;
1891 }
1892 file->private_data = ctr;
1893 return 0;
1894
1895err2:
1896 kfree(pidarray);
1897err1:
1898 kfree(ctr);
1899err0:
1900 return -ENOMEM;
1901}
1902
1903static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
1904 struct cftype *cft,
1905 struct file *file, char __user *buf,
1906 size_t nbytes, loff_t *ppos)
1907{
1908 struct ctr_struct *ctr = file->private_data;
1909
1910 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1911}
1912
1913static int cgroup_tasks_release(struct inode *unused_inode,
1914 struct file *file)
1915{
1916 struct ctr_struct *ctr;
1917
1918 if (file->f_mode & FMODE_READ) {
1919 ctr = file->private_data;
1920 kfree(ctr->buf);
1921 kfree(ctr);
1922 }
1923 return 0;
1924}
1925
1926static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
1927 struct cftype *cft)
1928{
1929 return notify_on_release(cgrp);
1930}
1931
1932static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
1933{
1934 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
1935}
1936
1937/*
1938 * for the common functions, 'private' gives the type of file
1939 */
1940static struct cftype files[] = {
1941 {
1942 .name = "tasks",
1943 .open = cgroup_tasks_open,
1944 .read = cgroup_tasks_read,
1945 .write = cgroup_common_file_write,
1946 .release = cgroup_tasks_release,
1947 .private = FILE_TASKLIST,
1948 },
1949
1950 {
1951 .name = "notify_on_release",
1952 .read_uint = cgroup_read_notify_on_release,
1953 .write = cgroup_common_file_write,
1954 .private = FILE_NOTIFY_ON_RELEASE,
1955 },
1956
1957 {
1958 .name = "releasable",
1959 .read_uint = cgroup_read_releasable,
1960 .private = FILE_RELEASABLE,
1961 }
1962};
1963
1964static struct cftype cft_release_agent = {
1965 .name = "release_agent",
1966 .read = cgroup_common_file_read,
1967 .write = cgroup_common_file_write,
1968 .private = FILE_RELEASE_AGENT,
1969};
1970
1971static int cgroup_populate_dir(struct cgroup *cgrp)
1972{
1973 int err;
1974 struct cgroup_subsys *ss;
1975
1976 /* First clear out any existing files */
1977 cgroup_clear_directory(cgrp->dentry);
1978
1979 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
1980 if (err < 0)
1981 return err;
1982
1983 if (cgrp == cgrp->top_cgroup) {
1984 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
1985 return err;
1986 }
1987
1988 for_each_subsys(cgrp->root, ss) {
1989 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
1990 return err;
1991 }
1992
1993 return 0;
1994}
1995
1996static void init_cgroup_css(struct cgroup_subsys_state *css,
1997 struct cgroup_subsys *ss,
1998 struct cgroup *cgrp)
1999{
2000 css->cgroup = cgrp;
2001 atomic_set(&css->refcnt, 0);
2002 css->flags = 0;
2003 if (cgrp == dummytop)
2004 set_bit(CSS_ROOT, &css->flags);
2005 BUG_ON(cgrp->subsys[ss->subsys_id]);
2006 cgrp->subsys[ss->subsys_id] = css;
2007}
2008
2009/*
2010 * cgroup_create - create a cgroup
2011 * parent: cgroup that will be parent of the new cgroup.
2012 * dentry: dentry of the new cgroup.
2013 * mode: mode to set on new inode
2014 *
2015 * Must be called with the mutex on the parent inode held
2016 */
2017
2018static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2019 int mode)
2020{
2021 struct cgroup *cgrp;
2022 struct cgroupfs_root *root = parent->root;
2023 int err = 0;
2024 struct cgroup_subsys *ss;
2025 struct super_block *sb = root->sb;
2026
2027 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
2028 if (!cgrp)
2029 return -ENOMEM;
2030
2031 /* Grab a reference on the superblock so the hierarchy doesn't
2032 * get deleted on unmount if there are child cgroups. This
2033 * can be done outside cgroup_mutex, since the sb can't
2034 * disappear while someone has an open control file on the
2035 * fs */
2036 atomic_inc(&sb->s_active);
2037
2038 mutex_lock(&cgroup_mutex);
2039
2040 cgrp->flags = 0;
2041 INIT_LIST_HEAD(&cgrp->sibling);
2042 INIT_LIST_HEAD(&cgrp->children);
2043 INIT_LIST_HEAD(&cgrp->css_sets);
2044 INIT_LIST_HEAD(&cgrp->release_list);
2045
2046 cgrp->parent = parent;
2047 cgrp->root = parent->root;
2048 cgrp->top_cgroup = parent->top_cgroup;
2049
2050 for_each_subsys(root, ss) {
2051 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2052 if (IS_ERR(css)) {
2053 err = PTR_ERR(css);
2054 goto err_destroy;
2055 }
2056 init_cgroup_css(css, ss, cgrp);
2057 }
2058
2059 list_add(&cgrp->sibling, &cgrp->parent->children);
2060 root->number_of_cgroups++;
2061
2062 err = cgroup_create_dir(cgrp, dentry, mode);
2063 if (err < 0)
2064 goto err_remove;
2065
2066 /* The cgroup directory was pre-locked for us */
2067 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
2068
2069 err = cgroup_populate_dir(cgrp);
2070 /* If err < 0, we have a half-filled directory - oh well ;) */
2071
2072 mutex_unlock(&cgroup_mutex);
2073 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
2074
2075 return 0;
2076
2077 err_remove:
2078
2079 list_del(&cgrp->sibling);
2080 root->number_of_cgroups--;
2081
2082 err_destroy:
2083
2084 for_each_subsys(root, ss) {
2085 if (cgrp->subsys[ss->subsys_id])
2086 ss->destroy(ss, cgrp);
2087 }
2088
2089 mutex_unlock(&cgroup_mutex);
2090
2091 /* Release the reference count that we took on the superblock */
2092 deactivate_super(sb);
2093
2094 kfree(cgrp);
2095 return err;
2096}
2097
2098static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2099{
2100 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
2101
2102 /* the vfs holds inode->i_mutex already */
2103 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
2104}
2105
2106static inline int cgroup_has_css_refs(struct cgroup *cgrp)
2107{
2108 /* Check the reference count on each subsystem. Since we
2109 * already established that there are no tasks in the
2110 * cgroup, if the css refcount is also 0, then there should
2111 * be no outstanding references, so the subsystem is safe to
2112 * destroy. We scan across all subsystems rather than using
2113 * the per-hierarchy linked list of mounted subsystems since
2114 * we can be called via check_for_release() with no
2115 * synchronization other than RCU, and the subsystem linked
2116 * list isn't RCU-safe */
2117 int i;
2118 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2119 struct cgroup_subsys *ss = subsys[i];
2120 struct cgroup_subsys_state *css;
2121 /* Skip subsystems not in this hierarchy */
2122 if (ss->root != cgrp->root)
2123 continue;
2124 css = cgrp->subsys[ss->subsys_id];
2125 /* When called from check_for_release() it's possible
2126 * that by this point the cgroup has been removed
2127 * and the css deleted. But a false-positive doesn't
2128 * matter, since it can only happen if the cgroup
2129 * has been deleted and hence no longer needs the
2130 * release agent to be called anyway. */
2131 if (css && atomic_read(&css->refcnt)) {
2132 return 1;
2133 }
2134 }
2135 return 0;
2136}
2137
2138static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2139{
2140 struct cgroup *cgrp = dentry->d_fsdata;
2141 struct dentry *d;
2142 struct cgroup *parent;
2143 struct cgroup_subsys *ss;
2144 struct super_block *sb;
2145 struct cgroupfs_root *root;
2146
2147 /* the vfs holds both inode->i_mutex already */
2148
2149 mutex_lock(&cgroup_mutex);
2150 if (atomic_read(&cgrp->count) != 0) {
2151 mutex_unlock(&cgroup_mutex);
2152 return -EBUSY;
2153 }
2154 if (!list_empty(&cgrp->children)) {
2155 mutex_unlock(&cgroup_mutex);
2156 return -EBUSY;
2157 }
2158
2159 parent = cgrp->parent;
2160 root = cgrp->root;
2161 sb = root->sb;
2162
2163 if (cgroup_has_css_refs(cgrp)) {
2164 mutex_unlock(&cgroup_mutex);
2165 return -EBUSY;
2166 }
2167
2168 for_each_subsys(root, ss) {
2169 if (cgrp->subsys[ss->subsys_id])
2170 ss->destroy(ss, cgrp);
2171 }
2172
2173 spin_lock(&release_list_lock);
2174 set_bit(CGRP_REMOVED, &cgrp->flags);
2175 if (!list_empty(&cgrp->release_list))
2176 list_del(&cgrp->release_list);
2177 spin_unlock(&release_list_lock);
2178 /* delete my sibling from parent->children */
2179 list_del(&cgrp->sibling);
2180 spin_lock(&cgrp->dentry->d_lock);
2181 d = dget(cgrp->dentry);
2182 cgrp->dentry = NULL;
2183 spin_unlock(&d->d_lock);
2184
2185 cgroup_d_remove_dir(d);
2186 dput(d);
2187 root->number_of_cgroups--;
2188
2189 set_bit(CGRP_RELEASABLE, &parent->flags);
2190 check_for_release(parent);
2191
2192 mutex_unlock(&cgroup_mutex);
2193 /* Drop the active superblock reference that we took when we
2194 * created the cgroup */
2195 deactivate_super(sb);
2196 return 0;
2197}
2198
2199static void cgroup_init_subsys(struct cgroup_subsys *ss)
2200{
2201 struct cgroup_subsys_state *css;
2202 struct list_head *l;
2203 printk(KERN_ERR "Initializing cgroup subsys %s\n", ss->name);
2204
2205 /* Create the top cgroup state for this subsystem */
2206 ss->root = &rootnode;
2207 css = ss->create(ss, dummytop);
2208 /* We don't handle early failures gracefully */
2209 BUG_ON(IS_ERR(css));
2210 init_cgroup_css(css, ss, dummytop);
2211
2212 /* Update all cgroup groups to contain a subsys
2213 * pointer to this state - since the subsystem is
2214 * newly registered, all tasks and hence all cgroup
2215 * groups are in the subsystem's top cgroup. */
2216 write_lock(&css_set_lock);
2217 l = &init_css_set.list;
2218 do {
2219 struct css_set *cg =
2220 list_entry(l, struct css_set, list);
2221 cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2222 l = l->next;
2223 } while (l != &init_css_set.list);
2224 write_unlock(&css_set_lock);
2225
2226 /* If this subsystem requested that it be notified with fork
2227 * events, we should send it one now for every process in the
2228 * system */
2229 if (ss->fork) {
2230 struct task_struct *g, *p;
2231
2232 read_lock(&tasklist_lock);
2233 do_each_thread(g, p) {
2234 ss->fork(ss, p);
2235 } while_each_thread(g, p);
2236 read_unlock(&tasklist_lock);
2237 }
2238
2239 need_forkexit_callback |= ss->fork || ss->exit;
2240
2241 ss->active = 1;
2242}
2243
2244/**
2245 * cgroup_init_early - initialize cgroups at system boot, and
2246 * initialize any subsystems that request early init.
2247 */
2248int __init cgroup_init_early(void)
2249{
2250 int i;
2251 kref_init(&init_css_set.ref);
2252 kref_get(&init_css_set.ref);
2253 INIT_LIST_HEAD(&init_css_set.list);
2254 INIT_LIST_HEAD(&init_css_set.cg_links);
2255 INIT_LIST_HEAD(&init_css_set.tasks);
2256 css_set_count = 1;
2257 init_cgroup_root(&rootnode);
2258 list_add(&rootnode.root_list, &roots);
2259 root_count = 1;
2260 init_task.cgroups = &init_css_set;
2261
2262 init_css_set_link.cg = &init_css_set;
2263 list_add(&init_css_set_link.cgrp_link_list,
2264 &rootnode.top_cgroup.css_sets);
2265 list_add(&init_css_set_link.cg_link_list,
2266 &init_css_set.cg_links);
2267
2268 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2269 struct cgroup_subsys *ss = subsys[i];
2270
2271 BUG_ON(!ss->name);
2272 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
2273 BUG_ON(!ss->create);
2274 BUG_ON(!ss->destroy);
2275 if (ss->subsys_id != i) {
2276 printk(KERN_ERR "Subsys %s id == %d\n",
2277 ss->name, ss->subsys_id);
2278 BUG();
2279 }
2280
2281 if (ss->early_init)
2282 cgroup_init_subsys(ss);
2283 }
2284 return 0;
2285}
2286
2287/**
2288 * cgroup_init - register cgroup filesystem and /proc file, and
2289 * initialize any subsystems that didn't request early init.
2290 */
2291int __init cgroup_init(void)
2292{
2293 int err;
2294 int i;
2295 struct proc_dir_entry *entry;
2296
2297 err = bdi_init(&cgroup_backing_dev_info);
2298 if (err)
2299 return err;
2300
2301 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2302 struct cgroup_subsys *ss = subsys[i];
2303 if (!ss->early_init)
2304 cgroup_init_subsys(ss);
2305 }
2306
2307 err = register_filesystem(&cgroup_fs_type);
2308 if (err < 0)
2309 goto out;
2310
2311 entry = create_proc_entry("cgroups", 0, NULL);
2312 if (entry)
2313 entry->proc_fops = &proc_cgroupstats_operations;
2314
2315out:
2316 if (err)
2317 bdi_destroy(&cgroup_backing_dev_info);
2318
2319 return err;
2320}
2321
2322/*
2323 * proc_cgroup_show()
2324 * - Print task's cgroup paths into seq_file, one line for each hierarchy
2325 * - Used for /proc/<pid>/cgroup.
2326 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
2327 * doesn't really matter if tsk->cgroup changes after we read it,
2328 * and we take cgroup_mutex, keeping attach_task() from changing it
2329 * anyway. No need to check that tsk->cgroup != NULL, thanks to
2330 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
2331 * cgroup to top_cgroup.
2332 */
2333
2334/* TODO: Use a proper seq_file iterator */
2335static int proc_cgroup_show(struct seq_file *m, void *v)
2336{
2337 struct pid *pid;
2338 struct task_struct *tsk;
2339 char *buf;
2340 int retval;
2341 struct cgroupfs_root *root;
2342
2343 retval = -ENOMEM;
2344 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2345 if (!buf)
2346 goto out;
2347
2348 retval = -ESRCH;
2349 pid = m->private;
2350 tsk = get_pid_task(pid, PIDTYPE_PID);
2351 if (!tsk)
2352 goto out_free;
2353
2354 retval = 0;
2355
2356 mutex_lock(&cgroup_mutex);
2357
2358 for_each_root(root) {
2359 struct cgroup_subsys *ss;
2360 struct cgroup *cgrp;
2361 int subsys_id;
2362 int count = 0;
2363
2364 /* Skip this hierarchy if it has no active subsystems */
2365 if (!root->actual_subsys_bits)
2366 continue;
2367 for_each_subsys(root, ss)
2368 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
2369 seq_putc(m, ':');
2370 get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
2371 cgrp = task_cgroup(tsk, subsys_id);
2372 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
2373 if (retval < 0)
2374 goto out_unlock;
2375 seq_puts(m, buf);
2376 seq_putc(m, '\n');
2377 }
2378
2379out_unlock:
2380 mutex_unlock(&cgroup_mutex);
2381 put_task_struct(tsk);
2382out_free:
2383 kfree(buf);
2384out:
2385 return retval;
2386}
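Given the seq_printf() format above, each /proc/<pid>/cgroup line takes the form <subsystem list>:<path>. For example (illustrative), a task sitting in a child cgroup of a cpuset-only hierarchy would show a line such as

	cpuset:/child

and a hierarchy with several bound subsystems lists their names comma-separated before the colon.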
2387
2388static int cgroup_open(struct inode *inode, struct file *file)
2389{
2390 struct pid *pid = PROC_I(inode)->pid;
2391 return single_open(file, proc_cgroup_show, pid);
2392}
2393
2394struct file_operations proc_cgroup_operations = {
2395 .open = cgroup_open,
2396 .read = seq_read,
2397 .llseek = seq_lseek,
2398 .release = single_release,
2399};
2400
2401/* Display information about each subsystem and each hierarchy */
2402static int proc_cgroupstats_show(struct seq_file *m, void *v)
2403{
2404 int i;
2405 struct cgroupfs_root *root;
2406
2407 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n");
2408 mutex_lock(&cgroup_mutex);
2409 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2410 struct cgroup_subsys *ss = subsys[i];
2411 seq_printf(m, "%s\t%lu\t%d\n",
2412 ss->name, ss->root->subsys_bits,
2413 ss->root->number_of_cgroups);
2414 }
2415 mutex_unlock(&cgroup_mutex);
2416 return 0;
2417}
2418
2419static int cgroupstats_open(struct inode *inode, struct file *file)
2420{
2421 return single_open(file, proc_cgroupstats_show, 0);
2422}
2423
2424static struct file_operations proc_cgroupstats_operations = {
2425 .open = cgroupstats_open,
2426 .read = seq_read,
2427 .llseek = seq_lseek,
2428 .release = single_release,
2429};
2430
2431/**
2432 * cgroup_fork - attach newly forked task to its parent's cgroup.
2433 * @child: pointer to task_struct of the newly forked child process.
2434 *
2435 * Description: A task inherits its parent's cgroup at fork().
2436 *
2437 * A pointer to the shared css_set was automatically copied in
2438 * fork.c by dup_task_struct(). However, we ignore that copy, since
2439 * it was not made under the protection of RCU or cgroup_mutex, so
2440 * might no longer be a valid cgroup pointer. attach_task() might
2441 * have already changed current->cgroups, allowing the previously
2442 * referenced cgroup group to be removed and freed.
2443 *
2444 * At the point that cgroup_fork() is called, 'current' is the parent
2445 * task, and the passed argument 'child' points to the child task.
2446 */
2447void cgroup_fork(struct task_struct *child)
2448{
2449 task_lock(current);
2450 child->cgroups = current->cgroups;
2451 get_css_set(child->cgroups);
2452 task_unlock(current);
2453 INIT_LIST_HEAD(&child->cg_list);
2454}
2455
2456/**
2457 * cgroup_fork_callbacks - called on a new task very soon before
2458 * adding it to the tasklist. No need to take any locks since no-one
2459 * can be operating on this task
2460 */
2461void cgroup_fork_callbacks(struct task_struct *child)
2462{
2463 if (need_forkexit_callback) {
2464 int i;
2465 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2466 struct cgroup_subsys *ss = subsys[i];
2467 if (ss->fork)
2468 ss->fork(ss, child);
2469 }
2470 }
2471}
2472
2473/**
2474 * cgroup_post_fork - called on a new task after adding it to the
2475 * task list. Adds the task to the list running through its css_set
2476 * if necessary. Has to be after the task is visible on the task list
2477 * in case we race with the first call to cgroup_iter_start() - to
2478 * guarantee that the new task ends up on its list. */
2479void cgroup_post_fork(struct task_struct *child)
2480{
2481 if (use_task_css_set_links) {
2482 write_lock(&css_set_lock);
2483 if (list_empty(&child->cg_list))
2484 list_add(&child->cg_list, &child->cgroups->tasks);
2485 write_unlock(&css_set_lock);
2486 }
2487}
2488/**
2489 * cgroup_exit - detach cgroup from exiting task
2490 * @tsk: pointer to task_struct of exiting process
2491 *
2492 * Description: Detach cgroup from @tsk and release it.
2493 *
2494 * Note that cgroups marked notify_on_release force every task in
2495 * them to take the global cgroup_mutex mutex when exiting.
2496 * This could impact scaling on very large systems. Be reluctant to
2497 * use notify_on_release cgroups where very high task exit scaling
2498 * is required on large systems.
2499 *
2500 * the_top_cgroup_hack:
2501 *
2502 * Set the exiting task's cgroup to the root cgroup (top_cgroup).
2503 *
2504 * We call cgroup_exit() while the task is still competent to
2505 * handle notify_on_release(), then leave the task attached to the
2506 * root cgroup in each hierarchy for the remainder of its exit.
2507 *
2508 * To do this properly, we would increment the reference count on
2509 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
2510 * code we would add a second cgroup function call, to drop that
2511 * reference. This would just create an unnecessary hot spot on
2512 * the top_cgroup reference count, to no avail.
2513 *
2514 * Normally, holding a reference to a cgroup without bumping its
2515 * count is unsafe. The cgroup could go away, or someone could
2516 * attach us to a different cgroup, decrementing the count on
2517 * the first cgroup that we never incremented. But in this case,
2518 * top_cgroup isn't going away, and either task has PF_EXITING set,
2519 * which wards off any attach_task() attempts, or task is a failed
2520 * fork, never visible to attach_task.
2521 *
2522 */
2523void cgroup_exit(struct task_struct *tsk, int run_callbacks)
2524{
2525 int i;
2526 struct css_set *cg;
2527
2528 if (run_callbacks && need_forkexit_callback) {
2529 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2530 struct cgroup_subsys *ss = subsys[i];
2531 if (ss->exit)
2532 ss->exit(ss, tsk);
2533 }
2534 }
2535
2536 /*
2537 * Unlink from the css_set task list if necessary.
2538 * Optimistically check cg_list before taking
2539 * css_set_lock
2540 */
2541 if (!list_empty(&tsk->cg_list)) {
2542 write_lock(&css_set_lock);
2543 if (!list_empty(&tsk->cg_list))
2544 list_del(&tsk->cg_list);
2545 write_unlock(&css_set_lock);
2546 }
2547
2548 /* Reassign the task to the init_css_set. */
2549 task_lock(tsk);
2550 cg = tsk->cgroups;
2551 tsk->cgroups = &init_css_set;
2552 task_unlock(tsk);
2553 if (cg)
2554 put_css_set_taskexit(cg);
2555}
2556
2557/**
2558 * cgroup_clone - duplicate the current cgroup in the hierarchy
2559 * that the given subsystem is attached to, and move this task into
2560 * the new child
2561 */
2562int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
2563{
2564 struct dentry *dentry;
2565 int ret = 0;
2566 char nodename[MAX_CGROUP_TYPE_NAMELEN];
2567 struct cgroup *parent, *child;
2568 struct inode *inode;
2569 struct css_set *cg;
2570 struct cgroupfs_root *root;
2571 struct cgroup_subsys *ss;
2572
2573 /* We shouldn't be called by an unregistered subsystem */
2574 BUG_ON(!subsys->active);
2575
2576 /* First figure out what hierarchy and cgroup we're dealing
2577 * with, and pin them so we can drop cgroup_mutex */
2578 mutex_lock(&cgroup_mutex);
2579 again:
2580 root = subsys->root;
2581 if (root == &rootnode) {
2582 printk(KERN_INFO
2583 "Not cloning cgroup for unused subsystem %s\n",
2584 subsys->name);
2585 mutex_unlock(&cgroup_mutex);
2586 return 0;
2587 }
2588 cg = tsk->cgroups;
2589 parent = task_cgroup(tsk, subsys->subsys_id);
2590
2591 snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
2592
2593 /* Pin the hierarchy */
2594 atomic_inc(&parent->root->sb->s_active);
2595
2596 /* Keep the cgroup alive */
2597 get_css_set(cg);
2598 mutex_unlock(&cgroup_mutex);
2599
2600 /* Now do the VFS work to create a cgroup */
2601 inode = parent->dentry->d_inode;
2602
2603 /* Hold the parent directory mutex across this operation to
2604 * stop anyone else deleting the new cgroup */
2605 mutex_lock(&inode->i_mutex);
2606 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
2607 if (IS_ERR(dentry)) {
2608 printk(KERN_INFO
2609 "Couldn't allocate dentry for %s: %ld\n", nodename,
2610 PTR_ERR(dentry));
2611 ret = PTR_ERR(dentry);
2612 goto out_release;
2613 }
2614
2615 /* Create the cgroup directory, which also creates the cgroup */
2616 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
2617 child = __d_cgrp(dentry);
2618 dput(dentry);
2619 if (ret) {
2620 printk(KERN_INFO
2621 "Failed to create cgroup %s: %d\n", nodename,
2622 ret);
2623 goto out_release;
2624 }
2625
2626 if (!child) {
2627 printk(KERN_INFO
2628 "Couldn't find new cgroup %s\n", nodename);
2629 ret = -ENOMEM;
2630 goto out_release;
2631 }
2632
2633 /* The cgroup now exists. Retake cgroup_mutex and check
2634 * that we're still in the same state that we thought we
2635 * were. */
2636 mutex_lock(&cgroup_mutex);
2637 if ((root != subsys->root) ||
2638 (parent != task_cgroup(tsk, subsys->subsys_id))) {
2639 /* Aargh, we raced ... */
2640 mutex_unlock(&inode->i_mutex);
2641 put_css_set(cg);
2642
2643 deactivate_super(parent->root->sb);
2644 /* The cgroup is still accessible in the VFS, but
2645 * we're not going to try to rmdir() it at this
2646 * point. */
2647 printk(KERN_INFO
2648 "Race in cgroup_clone() - leaking cgroup %s\n",
2649 nodename);
2650 goto again;
2651 }
2652
2653 /* do any required auto-setup */
2654 for_each_subsys(root, ss) {
2655 if (ss->post_clone)
2656 ss->post_clone(ss, child);
2657 }
2658
2659 /* All seems fine. Finish by moving the task into the new cgroup */
2660 ret = attach_task(child, tsk);
2661 mutex_unlock(&cgroup_mutex);
2662
2663 out_release:
2664 mutex_unlock(&inode->i_mutex);
2665
2666 mutex_lock(&cgroup_mutex);
2667 put_css_set(cg);
2668 mutex_unlock(&cgroup_mutex);
2669 deactivate_super(parent->root->sb);
2670 return ret;
2671}
2672
2673/*
2674 * See if "cgrp" is a descendant of the current task's cgroup in
2675 * the appropriate hierarchy
2676 *
2677 * If we are sending in dummytop, then presumably we are creating
2678 * the top cgroup in the subsystem.
2679 *
2680 * Called only by the ns (nsproxy) cgroup.
2681 */
2682int cgroup_is_descendant(const struct cgroup *cgrp)
2683{
2684 int ret;
2685 struct cgroup *target;
2686 int subsys_id;
2687
2688 if (cgrp == dummytop)
2689 return 1;
2690
2691 get_first_subsys(cgrp, NULL, &subsys_id);
2692 target = task_cgroup(current, subsys_id);
2693	while (cgrp != target && cgrp != cgrp->top_cgroup)
2694 cgrp = cgrp->parent;
2695 ret = (cgrp == target);
2696 return ret;
2697}
2698
2699static void check_for_release(struct cgroup *cgrp)
2700{
2701 /* All of these checks rely on RCU to keep the cgroup
2702 * structure alive */
2703 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
2704 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
2705 /* Control Group is currently removeable. If it's not
2706 * already queued for a userspace notification, queue
2707 * it now */
2708 int need_schedule_work = 0;
2709 spin_lock(&release_list_lock);
2710 if (!cgroup_is_removed(cgrp) &&
2711 list_empty(&cgrp->release_list)) {
2712 list_add(&cgrp->release_list, &release_list);
2713 need_schedule_work = 1;
2714 }
2715 spin_unlock(&release_list_lock);
2716 if (need_schedule_work)
2717 schedule_work(&release_agent_work);
2718 }
2719}
2720
2721void __css_put(struct cgroup_subsys_state *css)
2722{
2723 struct cgroup *cgrp = css->cgroup;
2724 rcu_read_lock();
2725 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
2726 set_bit(CGRP_RELEASABLE, &cgrp->flags);
2727 check_for_release(cgrp);
2728 }
2729 rcu_read_unlock();
2730}
2731
2732/*
2733 * Notify userspace when a cgroup is released, by running the
2734 * configured release agent with the name of the cgroup (path
2735 * relative to the root of cgroup file system) as the argument.
2736 *
2737 * Most likely, this user command will try to rmdir this cgroup.
2738 *
2739 * This races with the possibility that some other task will be
2740 * attached to this cgroup before it is removed, or that some other
2741 * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
2742 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
2743 * unused, and this cgroup will be reprieved from its death sentence,
2744 * to continue to serve a useful existence. Next time it's released,
2745 * we will get notified again, if it still has 'notify_on_release' set.
2746 *
2747 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
2748 * means only wait until the task is successfully execve()'d. The
2749 * separate release agent task is forked by call_usermodehelper(),
2750 * then control in this thread returns here, without waiting for the
2751 * release agent task. We don't bother to wait because the caller of
2752 * this routine has no use for the exit status of the release agent
2753 * task, so no sense holding our caller up for that.
2754 *
2755 */
2756
2757static void cgroup_release_agent(struct work_struct *work)
2758{
2759 BUG_ON(work != &release_agent_work);
2760 mutex_lock(&cgroup_mutex);
2761 spin_lock(&release_list_lock);
2762 while (!list_empty(&release_list)) {
2763 char *argv[3], *envp[3];
2764 int i;
2765 char *pathbuf;
2766 struct cgroup *cgrp = list_entry(release_list.next,
2767 struct cgroup,
2768 release_list);
2769 list_del_init(&cgrp->release_list);
2770 spin_unlock(&release_list_lock);
2771 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2772 if (!pathbuf) {
2773 spin_lock(&release_list_lock);
2774 continue;
2775 }
2776
2777 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) {
2778 kfree(pathbuf);
2779 spin_lock(&release_list_lock);
2780 continue;
2781 }
2782
2783 i = 0;
2784 argv[i++] = cgrp->root->release_agent_path;
2785 argv[i++] = (char *)pathbuf;
2786 argv[i] = NULL;
2787
2788 i = 0;
2789 /* minimal command environment */
2790 envp[i++] = "HOME=/";
2791 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
2792 envp[i] = NULL;
2793
2794 /* Drop the lock while we invoke the usermode helper,
2795 * since the exec could involve hitting disk and hence
2796 * be a slow process */
2797 mutex_unlock(&cgroup_mutex);
2798 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
2799 kfree(pathbuf);
2800 mutex_lock(&cgroup_mutex);
2801 spin_lock(&release_list_lock);
2802 }
2803 spin_unlock(&release_list_lock);
2804 mutex_unlock(&cgroup_mutex);
2805}
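To make the calling convention above concrete, a minimal release agent could look like the sketch below; the kernel only supplies the released cgroup's path (relative to the hierarchy root) as argv[1], so the /dev/cgroup mount point here is an assumption:

#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	char path[4096];

	if (argc < 2)
		return 1;
	snprintf(path, sizeof(path), "/dev/cgroup%s", argv[1]);
	/* rmdir() fails harmlessly if the cgroup has picked up new users */
	return rmdir(path) ? 1 : 0;
}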
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
new file mode 100644
index 000000000000..37301e877cb0
--- /dev/null
+++ b/kernel/cgroup_debug.c
@@ -0,0 +1,97 @@
1/*
2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info
4 *
5 * Copyright (C) Google Inc, 2007
6 *
7 * Developed by Paul Menage (menage@google.com)
8 *
9 */
10
11#include <linux/cgroup.h>
12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/rcupdate.h>
15
16#include <asm/atomic.h>
17
18static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
19 struct cgroup *cont)
20{
21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
22
23 if (!css)
24 return ERR_PTR(-ENOMEM);
25
26 return css;
27}
28
29static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
30{
31 kfree(cont->subsys[debug_subsys_id]);
32}
33
34static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
35{
36 return atomic_read(&cont->count);
37}
38
39static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{
41 u64 count;
42
43 cgroup_lock();
44 count = cgroup_task_count(cont);
45 cgroup_unlock();
46 return count;
47}
48
49static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
50{
51 return (u64)(long)current->cgroups;
52}
53
54static u64 current_css_set_refcount_read(struct cgroup *cont,
55 struct cftype *cft)
56{
57 u64 count;
58
59 rcu_read_lock();
60 count = atomic_read(&current->cgroups->ref.refcount);
61 rcu_read_unlock();
62 return count;
63}
64
65static struct cftype files[] = {
66 {
67 .name = "cgroup_refcount",
68 .read_uint = cgroup_refcount_read,
69 },
70 {
71 .name = "taskcount",
72 .read_uint = taskcount_read,
73 },
74
75 {
76 .name = "current_css_set",
77 .read_uint = current_css_set_read,
78 },
79
80 {
81 .name = "current_css_set_refcount",
82 .read_uint = current_css_set_refcount_read,
83 },
84};
85
86static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
87{
88 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
89}
90
91struct cgroup_subsys debug_subsys = {
92 .name = "debug",
93 .create = debug_create,
94 .destroy = debug_destroy,
95 .populate = debug_populate,
96 .subsys_id = debug_subsys_id,
97};
diff --git a/kernel/compat.c b/kernel/compat.c
index 3bae3742c2aa..42a1ed4b61b1 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -40,62 +40,27 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
 		__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
 }
 
-static long compat_nanosleep_restart(struct restart_block *restart)
-{
-	unsigned long expire = restart->arg0, now = jiffies;
-	struct compat_timespec __user *rmtp;
-
-	/* Did it expire while we handled signals? */
-	if (!time_after(expire, now))
-		return 0;
-
-	expire = schedule_timeout_interruptible(expire - now);
-	if (expire == 0)
-		return 0;
-
-	rmtp = (struct compat_timespec __user *)restart->arg1;
-	if (rmtp) {
-		struct compat_timespec ct;
-		struct timespec t;
-
-		jiffies_to_timespec(expire, &t);
-		ct.tv_sec = t.tv_sec;
-		ct.tv_nsec = t.tv_nsec;
-		if (copy_to_user(rmtp, &ct, sizeof(ct)))
-			return -EFAULT;
-	}
-	/* The 'restart' block is already filled in */
-	return -ERESTART_RESTARTBLOCK;
-}
-
 asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
 				     struct compat_timespec __user *rmtp)
 {
-	struct timespec t;
-	struct restart_block *restart;
-	unsigned long expire;
+	struct timespec tu, rmt;
+	long ret;
 
-	if (get_compat_timespec(&t, rqtp))
+	if (get_compat_timespec(&tu, rqtp))
 		return -EFAULT;
 
-	if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
+	if (!timespec_valid(&tu))
 		return -EINVAL;
 
-	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
-	expire = schedule_timeout_interruptible(expire);
-	if (expire == 0)
-		return 0;
+	ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL,
+				CLOCK_MONOTONIC);
 
-	if (rmtp) {
-		jiffies_to_timespec(expire, &t);
-		if (put_compat_timespec(&t, rmtp))
+	if (ret && rmtp) {
+		if (put_compat_timespec(&rmt, rmtp))
 			return -EFAULT;
 	}
-	restart = &current_thread_info()->restart_block;
-	restart->fn = compat_nanosleep_restart;
-	restart->arg0 = jiffies + expire;
-	restart->arg1 = (unsigned long) rmtp;
-	return -ERESTART_RESTARTBLOCK;
+
+	return ret;
 }
 
 static inline long get_compat_itimerval(struct itimerval *o,
@@ -247,8 +212,8 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
247 int ret; 212 int ret;
248 mm_segment_t old_fs = get_fs (); 213 mm_segment_t old_fs = get_fs ();
249 214
250 if (resource >= RLIM_NLIMITS) 215 if (resource >= RLIM_NLIMITS)
251 return -EINVAL; 216 return -EINVAL;
252 217
253 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || 218 if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
254 __get_user(r.rlim_cur, &rlim->rlim_cur) || 219 __get_user(r.rlim_cur, &rlim->rlim_cur) ||
@@ -477,21 +442,21 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
477 442
478int get_compat_itimerspec(struct itimerspec *dst, 443int get_compat_itimerspec(struct itimerspec *dst,
479 const struct compat_itimerspec __user *src) 444 const struct compat_itimerspec __user *src)
480{ 445{
481 if (get_compat_timespec(&dst->it_interval, &src->it_interval) || 446 if (get_compat_timespec(&dst->it_interval, &src->it_interval) ||
482 get_compat_timespec(&dst->it_value, &src->it_value)) 447 get_compat_timespec(&dst->it_value, &src->it_value))
483 return -EFAULT; 448 return -EFAULT;
484 return 0; 449 return 0;
485} 450}
486 451
487int put_compat_itimerspec(struct compat_itimerspec __user *dst, 452int put_compat_itimerspec(struct compat_itimerspec __user *dst,
488 const struct itimerspec *src) 453 const struct itimerspec *src)
489{ 454{
490 if (put_compat_timespec(&src->it_interval, &dst->it_interval) || 455 if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
491 put_compat_timespec(&src->it_value, &dst->it_value)) 456 put_compat_timespec(&src->it_value, &dst->it_value))
492 return -EFAULT; 457 return -EFAULT;
493 return 0; 458 return 0;
494} 459}
495 460
496long compat_sys_timer_create(clockid_t which_clock, 461long compat_sys_timer_create(clockid_t which_clock,
497 struct compat_sigevent __user *timer_event_spec, 462 struct compat_sigevent __user *timer_event_spec,
@@ -512,9 +477,9 @@ long compat_sys_timer_create(clockid_t which_clock,
512} 477}
513 478
514long compat_sys_timer_settime(timer_t timer_id, int flags, 479long compat_sys_timer_settime(timer_t timer_id, int flags,
515 struct compat_itimerspec __user *new, 480 struct compat_itimerspec __user *new,
516 struct compat_itimerspec __user *old) 481 struct compat_itimerspec __user *old)
517{ 482{
518 long err; 483 long err;
519 mm_segment_t oldfs; 484 mm_segment_t oldfs;
520 struct itimerspec newts, oldts; 485 struct itimerspec newts, oldts;
@@ -522,58 +487,58 @@ long compat_sys_timer_settime(timer_t timer_id, int flags,
522 if (!new) 487 if (!new)
523 return -EINVAL; 488 return -EINVAL;
524 if (get_compat_itimerspec(&newts, new)) 489 if (get_compat_itimerspec(&newts, new))
525 return -EFAULT; 490 return -EFAULT;
526 oldfs = get_fs(); 491 oldfs = get_fs();
527 set_fs(KERNEL_DS); 492 set_fs(KERNEL_DS);
528 err = sys_timer_settime(timer_id, flags, 493 err = sys_timer_settime(timer_id, flags,
529 (struct itimerspec __user *) &newts, 494 (struct itimerspec __user *) &newts,
530 (struct itimerspec __user *) &oldts); 495 (struct itimerspec __user *) &oldts);
531 set_fs(oldfs); 496 set_fs(oldfs);
532 if (!err && old && put_compat_itimerspec(old, &oldts)) 497 if (!err && old && put_compat_itimerspec(old, &oldts))
533 return -EFAULT; 498 return -EFAULT;
534 return err; 499 return err;
535} 500}
536 501
537long compat_sys_timer_gettime(timer_t timer_id, 502long compat_sys_timer_gettime(timer_t timer_id,
538 struct compat_itimerspec __user *setting) 503 struct compat_itimerspec __user *setting)
539{ 504{
540 long err; 505 long err;
541 mm_segment_t oldfs; 506 mm_segment_t oldfs;
542 struct itimerspec ts; 507 struct itimerspec ts;
543 508
544 oldfs = get_fs(); 509 oldfs = get_fs();
545 set_fs(KERNEL_DS); 510 set_fs(KERNEL_DS);
546 err = sys_timer_gettime(timer_id, 511 err = sys_timer_gettime(timer_id,
547 (struct itimerspec __user *) &ts); 512 (struct itimerspec __user *) &ts);
548 set_fs(oldfs); 513 set_fs(oldfs);
549 if (!err && put_compat_itimerspec(setting, &ts)) 514 if (!err && put_compat_itimerspec(setting, &ts))
550 return -EFAULT; 515 return -EFAULT;
551 return err; 516 return err;
552} 517}
553 518
554long compat_sys_clock_settime(clockid_t which_clock, 519long compat_sys_clock_settime(clockid_t which_clock,
555 struct compat_timespec __user *tp) 520 struct compat_timespec __user *tp)
556{ 521{
557 long err; 522 long err;
558 mm_segment_t oldfs; 523 mm_segment_t oldfs;
559 struct timespec ts; 524 struct timespec ts;
560 525
561 if (get_compat_timespec(&ts, tp)) 526 if (get_compat_timespec(&ts, tp))
562 return -EFAULT; 527 return -EFAULT;
563 oldfs = get_fs(); 528 oldfs = get_fs();
564 set_fs(KERNEL_DS); 529 set_fs(KERNEL_DS);
565 err = sys_clock_settime(which_clock, 530 err = sys_clock_settime(which_clock,
566 (struct timespec __user *) &ts); 531 (struct timespec __user *) &ts);
567 set_fs(oldfs); 532 set_fs(oldfs);
568 return err; 533 return err;
569} 534}
570 535
571long compat_sys_clock_gettime(clockid_t which_clock, 536long compat_sys_clock_gettime(clockid_t which_clock,
572 struct compat_timespec __user *tp) 537 struct compat_timespec __user *tp)
573{ 538{
574 long err; 539 long err;
575 mm_segment_t oldfs; 540 mm_segment_t oldfs;
576 struct timespec ts; 541 struct timespec ts;
577 542
578 oldfs = get_fs(); 543 oldfs = get_fs();
579 set_fs(KERNEL_DS); 544 set_fs(KERNEL_DS);
@@ -581,16 +546,16 @@ long compat_sys_clock_gettime(clockid_t which_clock,
581 (struct timespec __user *) &ts); 546 (struct timespec __user *) &ts);
582 set_fs(oldfs); 547 set_fs(oldfs);
583 if (!err && put_compat_timespec(&ts, tp)) 548 if (!err && put_compat_timespec(&ts, tp))
584 return -EFAULT; 549 return -EFAULT;
585 return err; 550 return err;
586} 551}
587 552
588long compat_sys_clock_getres(clockid_t which_clock, 553long compat_sys_clock_getres(clockid_t which_clock,
589 struct compat_timespec __user *tp) 554 struct compat_timespec __user *tp)
590{ 555{
591 long err; 556 long err;
592 mm_segment_t oldfs; 557 mm_segment_t oldfs;
593 struct timespec ts; 558 struct timespec ts;
594 559
595 oldfs = get_fs(); 560 oldfs = get_fs();
596 set_fs(KERNEL_DS); 561 set_fs(KERNEL_DS);
@@ -598,9 +563,9 @@ long compat_sys_clock_getres(clockid_t which_clock,
598 (struct timespec __user *) &ts); 563 (struct timespec __user *) &ts);
599 set_fs(oldfs); 564 set_fs(oldfs);
600 if (!err && tp && put_compat_timespec(&ts, tp)) 565 if (!err && tp && put_compat_timespec(&ts, tp))
601 return -EFAULT; 566 return -EFAULT;
602 return err; 567 return err;
603} 568}
604 569
605static long compat_clock_nanosleep_restart(struct restart_block *restart) 570static long compat_clock_nanosleep_restart(struct restart_block *restart)
606{ 571{
@@ -632,10 +597,10 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
632{ 597{
633 long err; 598 long err;
634 mm_segment_t oldfs; 599 mm_segment_t oldfs;
635 struct timespec in, out; 600 struct timespec in, out;
636 struct restart_block *restart; 601 struct restart_block *restart;
637 602
638 if (get_compat_timespec(&in, rqtp)) 603 if (get_compat_timespec(&in, rqtp))
639 return -EFAULT; 604 return -EFAULT;
640 605
641 oldfs = get_fs(); 606 oldfs = get_fs();
@@ -654,8 +619,8 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
654 restart->fn = compat_clock_nanosleep_restart; 619 restart->fn = compat_clock_nanosleep_restart;
655 restart->arg1 = (unsigned long) rmtp; 620 restart->arg1 = (unsigned long) rmtp;
656 } 621 }
657 return err; 622 return err;
658} 623}
659 624
660/* 625/*
661 * We currently only need the following fields from the sigevent 626 * We currently only need the following fields from the sigevent
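
Two patterns recur in the compat code above: compat_sys_nanosleep() now validates the user-supplied timespec with timespec_valid() and delegates to hrtimer_nanosleep() instead of open-coding jiffies-based sleeping and restart handling, while the remaining wrappers convert 32-bit structures and call the native syscall under set_fs(KERNEL_DS) so the native copy routines operate on kernel buffers. A minimal user-space sketch of the validity rule (an illustration of the check, not the kernel's timespec_valid() itself):

#include <stdio.h>
#include <time.h>

/* Sketch of the rule the compat path now relies on: seconds must be
 * non-negative and nanoseconds must fall in [0, 1e9). This mirrors
 * the open-coded range check that the patch removes from
 * compat_sys_nanosleep() in favour of timespec_valid(). */
static int timespec_is_valid(const struct timespec *ts)
{
	return ts->tv_sec >= 0 &&
	       ts->tv_nsec >= 0 && ts->tv_nsec < 1000000000L;
}

int main(void)
{
	struct timespec ok  = { .tv_sec = 1, .tv_nsec = 500000000L };
	struct timespec bad = { .tv_sec = 0, .tv_nsec = 1000000000L };

	printf("ok: %d, bad: %d\n",
	       timespec_is_valid(&ok), timespec_is_valid(&bad));
	return 0;
}
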
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 38033db8d8ec..6b3a0c15144f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -98,7 +98,8 @@ static inline void check_for_tasks(int cpu)
98 !cputime_eq(p->stime, cputime_zero))) 98 !cputime_eq(p->stime, cputime_zero)))
99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
100 (state = %ld, flags = %x) \n", 100 (state = %ld, flags = %x) \n",
101 p->comm, p->pid, cpu, p->state, p->flags); 101 p->comm, task_pid_nr(p), cpu,
102 p->state, p->flags);
102 } 103 }
103 write_unlock_irq(&tasklist_lock); 104 write_unlock_irq(&tasklist_lock);
104} 105}
@@ -150,6 +151,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
150 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 151 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
151 hcpu, -1, &nr_calls); 152 hcpu, -1, &nr_calls);
152 if (err == NOTIFY_BAD) { 153 if (err == NOTIFY_BAD) {
154 nr_calls--;
153 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 155 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
154 hcpu, nr_calls, NULL); 156 hcpu, nr_calls, NULL);
155 printk("%s: attempt to take down CPU %u failed\n", 157 printk("%s: attempt to take down CPU %u failed\n",
@@ -233,6 +235,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
233 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, 235 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
234 -1, &nr_calls); 236 -1, &nr_calls);
235 if (ret == NOTIFY_BAD) { 237 if (ret == NOTIFY_BAD) {
238 nr_calls--;
236 printk("%s: attempt to bring up CPU %u failed\n", 239 printk("%s: attempt to bring up CPU %u failed\n",
237 __FUNCTION__, cpu); 240 __FUNCTION__, cpu);
238 ret = -EINVAL; 241 ret = -EINVAL;
@@ -262,6 +265,15 @@ out_notify:
262int __cpuinit cpu_up(unsigned int cpu) 265int __cpuinit cpu_up(unsigned int cpu)
263{ 266{
264 int err = 0; 267 int err = 0;
268 if (!cpu_isset(cpu, cpu_possible_map)) {
269 printk(KERN_ERR "can't online cpu %d because it is not "
270 "configured as may-hotadd at boot time\n", cpu);
271#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390)
272 printk(KERN_ERR "please check additional_cpus= boot "
273 "parameter\n");
274#endif
275 return -EINVAL;
276 }
265 277
266 mutex_lock(&cpu_add_remove_lock); 278 mutex_lock(&cpu_add_remove_lock);
267 if (cpu_hotplug_disabled) 279 if (cpu_hotplug_disabled)
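
The nr_calls-- added before the roll-back calls matters because __raw_notifier_call_chain() counts the callback that returned NOTIFY_BAD in nr_calls; without the decrement, that same callback would also receive CPU_DOWN_FAILED or CPU_UP_CANCELED even though its prepare step never succeeded. A stand-alone sketch of that roll-back rule, using plain function pointers rather than the kernel's notifier API (illustrative only; event values and callbacks are made up):

#include <stdio.h>

#define NOTIFY_OK  0
#define NOTIFY_BAD 1

typedef int (*notifier_fn)(int event);

/* Call up to n callbacks in order; count every callback invoked,
 * including a failing one, mirroring nr_calls in the kernel chain. */
static int call_chain(notifier_fn *fns, int n, int event, int *nr_calls)
{
	int i;

	for (i = 0; i < n; i++) {
		(*nr_calls)++;
		if (fns[i](event) == NOTIFY_BAD)
			return NOTIFY_BAD;
	}
	return NOTIFY_OK;
}

static int ok_cb(int event)  { printf("ok callback saw event %d\n", event);  return NOTIFY_OK; }
static int bad_cb(int event) { printf("bad callback saw event %d\n", event); return NOTIFY_BAD; }

int main(void)
{
	notifier_fn fns[] = { ok_cb, ok_cb, bad_cb, ok_cb };
	int nr_calls = 0, dummy = 0;

	if (call_chain(fns, 4, 1, &nr_calls) == NOTIFY_BAD) {
		/* Exclude the failing callback from the roll-back pass,
		 * just like the nr_calls-- added in _cpu_down()/_cpu_up(). */
		nr_calls--;
		call_chain(fns, nr_calls, 2, &dummy);
	}
	return 0;
}
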
diff --git a/kernel/cpu_acct.c b/kernel/cpu_acct.c
new file mode 100644
index 000000000000..731e47e7f164
--- /dev/null
+++ b/kernel/cpu_acct.c
@@ -0,0 +1,186 @@
1/*
2 * kernel/cpu_acct.c - CPU accounting cgroup subsystem
3 *
4 * Copyright (C) Google Inc, 2006
5 *
6 * Developed by Paul Menage (menage@google.com) and Balbir Singh
7 * (balbir@in.ibm.com)
8 *
9 */
10
11/*
12 * Example cgroup subsystem for reporting total CPU usage of tasks in a
13 * cgroup, along with percentage load over a time interval
14 */
15
16#include <linux/module.h>
17#include <linux/cgroup.h>
18#include <linux/fs.h>
19#include <linux/rcupdate.h>
20
21#include <asm/div64.h>
22
23struct cpuacct {
24 struct cgroup_subsys_state css;
25 spinlock_t lock;
26 /* total time used by this class */
27 cputime64_t time;
28
29 /* time when next load calculation occurs */
30 u64 next_interval_check;
31
32 /* time used in current period */
33 cputime64_t current_interval_time;
34
35 /* time used in last period */
36 cputime64_t last_interval_time;
37};
38
39struct cgroup_subsys cpuacct_subsys;
40
41static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
42{
43 return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
44 struct cpuacct, css);
45}
46
47static inline struct cpuacct *task_ca(struct task_struct *task)
48{
49 return container_of(task_subsys_state(task, cpuacct_subsys_id),
50 struct cpuacct, css);
51}
52
53#define INTERVAL (HZ * 10)
54
55static inline u64 next_interval_boundary(u64 now)
56{
57 /* calculate the next interval boundary beyond the
58 * current time */
59 do_div(now, INTERVAL);
60 return (now + 1) * INTERVAL;
61}
62
63static struct cgroup_subsys_state *cpuacct_create(
64 struct cgroup_subsys *ss, struct cgroup *cont)
65{
66 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
67
68 if (!ca)
69 return ERR_PTR(-ENOMEM);
70 spin_lock_init(&ca->lock);
71 ca->next_interval_check = next_interval_boundary(get_jiffies_64());
72 return &ca->css;
73}
74
75static void cpuacct_destroy(struct cgroup_subsys *ss,
76 struct cgroup *cont)
77{
78 kfree(cgroup_ca(cont));
79}
80
81/* Lazily update the load calculation if necessary. Called with ca locked */
82static void cpuusage_update(struct cpuacct *ca)
83{
84 u64 now = get_jiffies_64();
85
86 /* If we're not due for an update, return */
87 if (ca->next_interval_check > now)
88 return;
89
90 if (ca->next_interval_check <= (now - INTERVAL)) {
91 /* If it's been more than an interval since the last
92 * check, then catch up - the last interval must have
93 * been zero load */
94 ca->last_interval_time = 0;
95 ca->next_interval_check = next_interval_boundary(now);
96 } else {
97 /* If a steal takes the last interval time negative,
98 * then we just ignore it */
99 if ((s64)ca->current_interval_time > 0)
100 ca->last_interval_time = ca->current_interval_time;
101 else
102 ca->last_interval_time = 0;
103 ca->next_interval_check += INTERVAL;
104 }
105 ca->current_interval_time = 0;
106}
107
108static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
109{
110 struct cpuacct *ca = cgroup_ca(cont);
111 u64 time;
112
113 spin_lock_irq(&ca->lock);
114 cpuusage_update(ca);
115 time = cputime64_to_jiffies64(ca->time);
116 spin_unlock_irq(&ca->lock);
117
118	/* Convert 64-bit jiffies to milliseconds */
119 time *= 1000;
120 do_div(time, HZ);
121 return time;
122}
123
124static u64 load_read(struct cgroup *cont, struct cftype *cft)
125{
126 struct cpuacct *ca = cgroup_ca(cont);
127 u64 time;
128
129 /* Find the time used in the previous interval */
130 spin_lock_irq(&ca->lock);
131 cpuusage_update(ca);
132 time = cputime64_to_jiffies64(ca->last_interval_time);
133 spin_unlock_irq(&ca->lock);
134
135 /* Convert time to a percentage, to give the load in the
136 * previous period */
137 time *= 100;
138 do_div(time, INTERVAL);
139
140 return time;
141}
142
143static struct cftype files[] = {
144 {
145 .name = "usage",
146 .read_uint = cpuusage_read,
147 },
148 {
149 .name = "load",
150 .read_uint = load_read,
151 }
152};
153
154static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
155{
156 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
157}
158
159void cpuacct_charge(struct task_struct *task, cputime_t cputime)
160{
161
162 struct cpuacct *ca;
163 unsigned long flags;
164
165 if (!cpuacct_subsys.active)
166 return;
167 rcu_read_lock();
168 ca = task_ca(task);
169 if (ca) {
170 spin_lock_irqsave(&ca->lock, flags);
171 cpuusage_update(ca);
172 ca->time = cputime64_add(ca->time, cputime);
173 ca->current_interval_time =
174 cputime64_add(ca->current_interval_time, cputime);
175 spin_unlock_irqrestore(&ca->lock, flags);
176 }
177 rcu_read_unlock();
178}
179
180struct cgroup_subsys cpuacct_subsys = {
181 .name = "cpuacct",
182 .create = cpuacct_create,
183 .destroy = cpuacct_destroy,
184 .populate = cpuacct_populate,
185 .subsys_id = cpuacct_subsys_id,
186};
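
The accounting in cpu_acct.c splits time into fixed windows of HZ*10 jiffies, lazily rolls current_interval_time into last_interval_time whenever a window boundary has been crossed, and reports load as the previous window's usage scaled to a percentage. The following stand-alone sketch shows that bookkeeping with plain unsigned arithmetic standing in for cputime64_t, jiffies and do_div (the 100 Hz tick and the charged values are assumptions for illustration):

#include <stdio.h>

#define HZ       100ULL
#define INTERVAL (HZ * 10)              /* 10-second accounting window */

struct acct {
	unsigned long long time;                  /* total usage */
	unsigned long long next_interval_check;   /* next window boundary */
	unsigned long long current_interval_time; /* usage in this window */
	unsigned long long last_interval_time;    /* usage in last window */
};

static unsigned long long next_boundary(unsigned long long now)
{
	return (now / INTERVAL + 1) * INTERVAL;
}

/* Lazily roll the accounting window forward, as cpuusage_update() does. */
static void usage_update(struct acct *a, unsigned long long now)
{
	if (a->next_interval_check > now)
		return;
	if (a->next_interval_check <= now - INTERVAL) {
		/* More than one full window elapsed: last window was idle. */
		a->last_interval_time = 0;
		a->next_interval_check = next_boundary(now);
	} else {
		a->last_interval_time = a->current_interval_time;
		a->next_interval_check += INTERVAL;
	}
	a->current_interval_time = 0;
}

static void charge(struct acct *a, unsigned long long now,
		   unsigned long long ticks)
{
	usage_update(a, now);
	a->time += ticks;
	a->current_interval_time += ticks;
}

int main(void)
{
	struct acct a = { .next_interval_check = INTERVAL };

	charge(&a, 900, 300);           /* 300 ticks (3 s) in window 0 */
	usage_update(&a, INTERVAL + 1); /* cross into window 1 */
	printf("load: %llu%%\n", a.last_interval_time * 100 / INTERVAL);
	return 0;
}
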
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 57e6448b171e..50f5dc463688 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,7 +4,8 @@
4 * Processor and Memory placement constraints for sets of tasks. 4 * Processor and Memory placement constraints for sets of tasks.
5 * 5 *
6 * Copyright (C) 2003 BULL SA. 6 * Copyright (C) 2003 BULL SA.
7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2007 Silicon Graphics, Inc.
8 * Copyright (C) 2006 Google, Inc
8 * 9 *
9 * Portions derived from Patrick Mochel's sysfs code. 10 * Portions derived from Patrick Mochel's sysfs code.
10 * sysfs is Copyright (c) 2001-3 Patrick Mochel 11 * sysfs is Copyright (c) 2001-3 Patrick Mochel
@@ -12,6 +13,7 @@
12 * 2003-10-10 Written by Simon Derr. 13 * 2003-10-10 Written by Simon Derr.
13 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
14 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups
15 * 17 *
16 * This file is subject to the terms and conditions of the GNU General Public 18 * This file is subject to the terms and conditions of the GNU General Public
17 * License. See the file COPYING in the main directory of the Linux 19 * License. See the file COPYING in the main directory of the Linux
@@ -36,6 +38,7 @@
36#include <linux/mount.h> 38#include <linux/mount.h>
37#include <linux/namei.h> 39#include <linux/namei.h>
38#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/prio_heap.h>
39#include <linux/proc_fs.h> 42#include <linux/proc_fs.h>
40#include <linux/rcupdate.h> 43#include <linux/rcupdate.h>
41#include <linux/sched.h> 44#include <linux/sched.h>
@@ -52,8 +55,7 @@
52#include <asm/uaccess.h> 55#include <asm/uaccess.h>
53#include <asm/atomic.h> 56#include <asm/atomic.h>
54#include <linux/mutex.h> 57#include <linux/mutex.h>
55 58#include <linux/kfifo.h>
56#define CPUSET_SUPER_MAGIC 0x27e0eb
57 59
58/* 60/*
59 * Tracks how many cpusets are currently defined in system. 61 * Tracks how many cpusets are currently defined in system.
@@ -62,6 +64,10 @@
62 */ 64 */
63int number_of_cpusets __read_mostly; 65int number_of_cpusets __read_mostly;
64 66
67/* Retrieve the cpuset from a cgroup */
68struct cgroup_subsys cpuset_subsys;
69struct cpuset;
70
65/* See "Frequency meter" comments, below. */ 71/* See "Frequency meter" comments, below. */
66 72
67struct fmeter { 73struct fmeter {
@@ -72,24 +78,13 @@ struct fmeter {
72}; 78};
73 79
74struct cpuset { 80struct cpuset {
81 struct cgroup_subsys_state css;
82
75 unsigned long flags; /* "unsigned long" so bitops work */ 83 unsigned long flags; /* "unsigned long" so bitops work */
76 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 84 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
77 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 85 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
78 86
79 /*
80 * Count is atomic so can incr (fork) or decr (exit) without a lock.
81 */
82 atomic_t count; /* count tasks using this cpuset */
83
84 /*
85 * We link our 'sibling' struct into our parents 'children'.
86 * Our children link their 'sibling' into our 'children'.
87 */
88 struct list_head sibling; /* my parents children */
89 struct list_head children; /* my children */
90
91 struct cpuset *parent; /* my parent */ 87 struct cpuset *parent; /* my parent */
92 struct dentry *dentry; /* cpuset fs entry */
93 88
94 /* 89 /*
95 * Copy of global cpuset_mems_generation as of the most 90 * Copy of global cpuset_mems_generation as of the most
@@ -98,15 +93,32 @@ struct cpuset {
98 int mems_generation; 93 int mems_generation;
99 94
100 struct fmeter fmeter; /* memory_pressure filter */ 95 struct fmeter fmeter; /* memory_pressure filter */
96
97 /* partition number for rebuild_sched_domains() */
98 int pn;
101}; 99};
102 100
101/* Retrieve the cpuset for a cgroup */
102static inline struct cpuset *cgroup_cs(struct cgroup *cont)
103{
104 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
105 struct cpuset, css);
106}
107
108/* Retrieve the cpuset for a task */
109static inline struct cpuset *task_cs(struct task_struct *task)
110{
111 return container_of(task_subsys_state(task, cpuset_subsys_id),
112 struct cpuset, css);
113}
114
115
103/* bits in struct cpuset flags field */ 116/* bits in struct cpuset flags field */
104typedef enum { 117typedef enum {
105 CS_CPU_EXCLUSIVE, 118 CS_CPU_EXCLUSIVE,
106 CS_MEM_EXCLUSIVE, 119 CS_MEM_EXCLUSIVE,
107 CS_MEMORY_MIGRATE, 120 CS_MEMORY_MIGRATE,
108 CS_REMOVED, 121 CS_SCHED_LOAD_BALANCE,
109 CS_NOTIFY_ON_RELEASE,
110 CS_SPREAD_PAGE, 122 CS_SPREAD_PAGE,
111 CS_SPREAD_SLAB, 123 CS_SPREAD_SLAB,
112} cpuset_flagbits_t; 124} cpuset_flagbits_t;
@@ -122,14 +134,9 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
122 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 134 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
123} 135}
124 136
125static inline int is_removed(const struct cpuset *cs) 137static inline int is_sched_load_balance(const struct cpuset *cs)
126{ 138{
127 return test_bit(CS_REMOVED, &cs->flags); 139 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
128}
129
130static inline int notify_on_release(const struct cpuset *cs)
131{
132 return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
133} 140}
134 141
135static inline int is_memory_migrate(const struct cpuset *cs) 142static inline int is_memory_migrate(const struct cpuset *cs)
@@ -172,14 +179,8 @@ static struct cpuset top_cpuset = {
172 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 179 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
173 .cpus_allowed = CPU_MASK_ALL, 180 .cpus_allowed = CPU_MASK_ALL,
174 .mems_allowed = NODE_MASK_ALL, 181 .mems_allowed = NODE_MASK_ALL,
175 .count = ATOMIC_INIT(0),
176 .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
177 .children = LIST_HEAD_INIT(top_cpuset.children),
178}; 182};
179 183
180static struct vfsmount *cpuset_mount;
181static struct super_block *cpuset_sb;
182
183/* 184/*
184 * We have two global cpuset mutexes below. They can nest. 185 * We have two global cpuset mutexes below. They can nest.
185 * It is ok to first take manage_mutex, then nest callback_mutex. We also 186 * It is ok to first take manage_mutex, then nest callback_mutex. We also
@@ -263,297 +264,33 @@ static struct super_block *cpuset_sb;
263 * the routine cpuset_update_task_memory_state(). 264 * the routine cpuset_update_task_memory_state().
264 */ 265 */
265 266
266static DEFINE_MUTEX(manage_mutex);
267static DEFINE_MUTEX(callback_mutex); 267static DEFINE_MUTEX(callback_mutex);
268 268
269/* 269/* This is ugly, but preserves the userspace API for existing cpuset
270 * A couple of forward declarations required, due to cyclic reference loop: 270 * users. If someone tries to mount the "cpuset" filesystem, we
271 * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file 271 * silently switch it to mount "cgroup" instead */
272 * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
273 */
274
275static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode);
276static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry);
277
278static struct backing_dev_info cpuset_backing_dev_info = {
279 .ra_pages = 0, /* No readahead */
280 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
281};
282
283static struct inode *cpuset_new_inode(mode_t mode)
284{
285 struct inode *inode = new_inode(cpuset_sb);
286
287 if (inode) {
288 inode->i_mode = mode;
289 inode->i_uid = current->fsuid;
290 inode->i_gid = current->fsgid;
291 inode->i_blocks = 0;
292 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
293 inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
294 }
295 return inode;
296}
297
298static void cpuset_diput(struct dentry *dentry, struct inode *inode)
299{
300 /* is dentry a directory ? if so, kfree() associated cpuset */
301 if (S_ISDIR(inode->i_mode)) {
302 struct cpuset *cs = dentry->d_fsdata;
303 BUG_ON(!(is_removed(cs)));
304 kfree(cs);
305 }
306 iput(inode);
307}
308
309static struct dentry_operations cpuset_dops = {
310 .d_iput = cpuset_diput,
311};
312
313static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
314{
315 struct dentry *d = lookup_one_len(name, parent, strlen(name));
316 if (!IS_ERR(d))
317 d->d_op = &cpuset_dops;
318 return d;
319}
320
321static void remove_dir(struct dentry *d)
322{
323 struct dentry *parent = dget(d->d_parent);
324
325 d_delete(d);
326 simple_rmdir(parent->d_inode, d);
327 dput(parent);
328}
329
330/*
331 * NOTE : the dentry must have been dget()'ed
332 */
333static void cpuset_d_remove_dir(struct dentry *dentry)
334{
335 struct list_head *node;
336
337 spin_lock(&dcache_lock);
338 node = dentry->d_subdirs.next;
339 while (node != &dentry->d_subdirs) {
340 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
341 list_del_init(node);
342 if (d->d_inode) {
343 d = dget_locked(d);
344 spin_unlock(&dcache_lock);
345 d_delete(d);
346 simple_unlink(dentry->d_inode, d);
347 dput(d);
348 spin_lock(&dcache_lock);
349 }
350 node = dentry->d_subdirs.next;
351 }
352 list_del_init(&dentry->d_u.d_child);
353 spin_unlock(&dcache_lock);
354 remove_dir(dentry);
355}
356
357static struct super_operations cpuset_ops = {
358 .statfs = simple_statfs,
359 .drop_inode = generic_delete_inode,
360};
361
362static int cpuset_fill_super(struct super_block *sb, void *unused_data,
363 int unused_silent)
364{
365 struct inode *inode;
366 struct dentry *root;
367
368 sb->s_blocksize = PAGE_CACHE_SIZE;
369 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
370 sb->s_magic = CPUSET_SUPER_MAGIC;
371 sb->s_op = &cpuset_ops;
372 cpuset_sb = sb;
373
374 inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR);
375 if (inode) {
376 inode->i_op = &simple_dir_inode_operations;
377 inode->i_fop = &simple_dir_operations;
378 /* directories start off with i_nlink == 2 (for "." entry) */
379 inc_nlink(inode);
380 } else {
381 return -ENOMEM;
382 }
383
384 root = d_alloc_root(inode);
385 if (!root) {
386 iput(inode);
387 return -ENOMEM;
388 }
389 sb->s_root = root;
390 return 0;
391}
392
393static int cpuset_get_sb(struct file_system_type *fs_type, 272static int cpuset_get_sb(struct file_system_type *fs_type,
394 int flags, const char *unused_dev_name, 273 int flags, const char *unused_dev_name,
395 void *data, struct vfsmount *mnt) 274 void *data, struct vfsmount *mnt)
396{ 275{
397 return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); 276 struct file_system_type *cgroup_fs = get_fs_type("cgroup");
277 int ret = -ENODEV;
278 if (cgroup_fs) {
279 char mountopts[] =
280 "cpuset,noprefix,"
281 "release_agent=/sbin/cpuset_release_agent";
282 ret = cgroup_fs->get_sb(cgroup_fs, flags,
283 unused_dev_name, mountopts, mnt);
284 put_filesystem(cgroup_fs);
285 }
286 return ret;
398} 287}
399 288
400static struct file_system_type cpuset_fs_type = { 289static struct file_system_type cpuset_fs_type = {
401 .name = "cpuset", 290 .name = "cpuset",
402 .get_sb = cpuset_get_sb, 291 .get_sb = cpuset_get_sb,
403 .kill_sb = kill_litter_super,
404};
405
406/* struct cftype:
407 *
408 * The files in the cpuset filesystem mostly have a very simple read/write
409 * handling, some common function will take care of it. Nevertheless some cases
410 * (read tasks) are special and therefore I define this structure for every
411 * kind of file.
412 *
413 *
414 * When reading/writing to a file:
415 * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
416 * - the 'cftype' of the file is file->f_path.dentry->d_fsdata
417 */
418
419struct cftype {
420 char *name;
421 int private;
422 int (*open) (struct inode *inode, struct file *file);
423 ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes,
424 loff_t *ppos);
425 int (*write) (struct file *file, const char __user *buf, size_t nbytes,
426 loff_t *ppos);
427 int (*release) (struct inode *inode, struct file *file);
428}; 292};
429 293
430static inline struct cpuset *__d_cs(struct dentry *dentry)
431{
432 return dentry->d_fsdata;
433}
434
435static inline struct cftype *__d_cft(struct dentry *dentry)
436{
437 return dentry->d_fsdata;
438}
439
440/*
441 * Call with manage_mutex held. Writes path of cpuset into buf.
442 * Returns 0 on success, -errno on error.
443 */
444
445static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
446{
447 char *start;
448
449 start = buf + buflen;
450
451 *--start = '\0';
452 for (;;) {
453 int len = cs->dentry->d_name.len;
454 if ((start -= len) < buf)
455 return -ENAMETOOLONG;
456 memcpy(start, cs->dentry->d_name.name, len);
457 cs = cs->parent;
458 if (!cs)
459 break;
460 if (!cs->parent)
461 continue;
462 if (--start < buf)
463 return -ENAMETOOLONG;
464 *start = '/';
465 }
466 memmove(buf, start, buf + buflen - start);
467 return 0;
468}
469
470/*
471 * Notify userspace when a cpuset is released, by running
472 * /sbin/cpuset_release_agent with the name of the cpuset (path
473 * relative to the root of cpuset file system) as the argument.
474 *
475 * Most likely, this user command will try to rmdir this cpuset.
476 *
477 * This races with the possibility that some other task will be
478 * attached to this cpuset before it is removed, or that some other
479 * user task will 'mkdir' a child cpuset of this cpuset. That's ok.
480 * The presumed 'rmdir' will fail quietly if this cpuset is no longer
481 * unused, and this cpuset will be reprieved from its death sentence,
482 * to continue to serve a useful existence. Next time it's released,
483 * we will get notified again, if it still has 'notify_on_release' set.
484 *
485 * The final arg to call_usermodehelper() is 0, which means don't
486 * wait. The separate /sbin/cpuset_release_agent task is forked by
487 * call_usermodehelper(), then control in this thread returns here,
488 * without waiting for the release agent task. We don't bother to
489 * wait because the caller of this routine has no use for the exit
490 * status of the /sbin/cpuset_release_agent task, so no sense holding
491 * our caller up for that.
492 *
493 * When we had only one cpuset mutex, we had to call this
494 * without holding it, to avoid deadlock when call_usermodehelper()
495 * allocated memory. With two locks, we could now call this while
496 * holding manage_mutex, but we still don't, so as to minimize
497 * the time manage_mutex is held.
498 */
499
500static void cpuset_release_agent(const char *pathbuf)
501{
502 char *argv[3], *envp[3];
503 int i;
504
505 if (!pathbuf)
506 return;
507
508 i = 0;
509 argv[i++] = "/sbin/cpuset_release_agent";
510 argv[i++] = (char *)pathbuf;
511 argv[i] = NULL;
512
513 i = 0;
514 /* minimal command environment */
515 envp[i++] = "HOME=/";
516 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
517 envp[i] = NULL;
518
519 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
520 kfree(pathbuf);
521}
522
523/*
524 * Either cs->count of using tasks transitioned to zero, or the
525 * cs->children list of child cpusets just became empty. If this
526 * cs is notify_on_release() and now both the user count is zero and
527 * the list of children is empty, prepare cpuset path in a kmalloc'd
528 * buffer, to be returned via ppathbuf, so that the caller can invoke
529 * cpuset_release_agent() with it later on, once manage_mutex is dropped.
530 * Call here with manage_mutex held.
531 *
532 * This check_for_release() routine is responsible for kmalloc'ing
533 * pathbuf. The above cpuset_release_agent() is responsible for
534 * kfree'ing pathbuf. The caller of these routines is responsible
535 * for providing a pathbuf pointer, initialized to NULL, then
536 * calling check_for_release() with manage_mutex held and the address
537 * of the pathbuf pointer, then dropping manage_mutex, then calling
538 * cpuset_release_agent() with pathbuf, as set by check_for_release().
539 */
540
541static void check_for_release(struct cpuset *cs, char **ppathbuf)
542{
543 if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
544 list_empty(&cs->children)) {
545 char *buf;
546
547 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
548 if (!buf)
549 return;
550 if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
551 kfree(buf);
552 else
553 *ppathbuf = buf;
554 }
555}
556
557/* 294/*
558 * Return in *pmask the portion of a cpusets's cpus_allowed that 295 * Return in *pmask the portion of a cpusets's cpus_allowed that
559 * are online. If none are online, walk up the cpuset hierarchy 296 * are online. If none are online, walk up the cpuset hierarchy
@@ -581,26 +318,28 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
581 318
582/* 319/*
583 * Return in *pmask the portion of a cpusets's mems_allowed that 320 * Return in *pmask the portion of a cpusets's mems_allowed that
584 * are online. If none are online, walk up the cpuset hierarchy 321 * are online, with memory. If none are online with memory, walk
585 * until we find one that does have some online mems. If we get 322 * up the cpuset hierarchy until we find one that does have some
586 * all the way to the top and still haven't found any online mems, 323 * online mems. If we get all the way to the top and still haven't
587 * return node_online_map. 324 * found any online mems, return node_states[N_HIGH_MEMORY].
588 * 325 *
589 * One way or another, we guarantee to return some non-empty subset 326 * One way or another, we guarantee to return some non-empty subset
590 * of node_online_map. 327 * of node_states[N_HIGH_MEMORY].
591 * 328 *
592 * Call with callback_mutex held. 329 * Call with callback_mutex held.
593 */ 330 */
594 331
595static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 332static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
596{ 333{
597 while (cs && !nodes_intersects(cs->mems_allowed, node_online_map)) 334 while (cs && !nodes_intersects(cs->mems_allowed,
335 node_states[N_HIGH_MEMORY]))
598 cs = cs->parent; 336 cs = cs->parent;
599 if (cs) 337 if (cs)
600 nodes_and(*pmask, cs->mems_allowed, node_online_map); 338 nodes_and(*pmask, cs->mems_allowed,
339 node_states[N_HIGH_MEMORY]);
601 else 340 else
602 *pmask = node_online_map; 341 *pmask = node_states[N_HIGH_MEMORY];
603 BUG_ON(!nodes_intersects(*pmask, node_online_map)); 342 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
604} 343}
605 344
606/** 345/**
@@ -651,20 +390,19 @@ void cpuset_update_task_memory_state(void)
651 struct task_struct *tsk = current; 390 struct task_struct *tsk = current;
652 struct cpuset *cs; 391 struct cpuset *cs;
653 392
654 if (tsk->cpuset == &top_cpuset) { 393 if (task_cs(tsk) == &top_cpuset) {
655 /* Don't need rcu for top_cpuset. It's never freed. */ 394 /* Don't need rcu for top_cpuset. It's never freed. */
656 my_cpusets_mem_gen = top_cpuset.mems_generation; 395 my_cpusets_mem_gen = top_cpuset.mems_generation;
657 } else { 396 } else {
658 rcu_read_lock(); 397 rcu_read_lock();
659 cs = rcu_dereference(tsk->cpuset); 398 my_cpusets_mem_gen = task_cs(current)->mems_generation;
660 my_cpusets_mem_gen = cs->mems_generation;
661 rcu_read_unlock(); 399 rcu_read_unlock();
662 } 400 }
663 401
664 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 402 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
665 mutex_lock(&callback_mutex); 403 mutex_lock(&callback_mutex);
666 task_lock(tsk); 404 task_lock(tsk);
667 cs = tsk->cpuset; /* Maybe changed when task not locked */ 405 cs = task_cs(tsk); /* Maybe changed when task not locked */
668 guarantee_online_mems(cs, &tsk->mems_allowed); 406 guarantee_online_mems(cs, &tsk->mems_allowed);
669 tsk->cpuset_mems_generation = cs->mems_generation; 407 tsk->cpuset_mems_generation = cs->mems_generation;
670 if (is_spread_page(cs)) 408 if (is_spread_page(cs))
@@ -719,11 +457,12 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
719 457
720static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 458static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
721{ 459{
460 struct cgroup *cont;
722 struct cpuset *c, *par; 461 struct cpuset *c, *par;
723 462
724 /* Each of our child cpusets must be a subset of us */ 463 /* Each of our child cpusets must be a subset of us */
725 list_for_each_entry(c, &cur->children, sibling) { 464 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
726 if (!is_cpuset_subset(c, trial)) 465 if (!is_cpuset_subset(cgroup_cs(cont), trial))
727 return -EBUSY; 466 return -EBUSY;
728 } 467 }
729 468
@@ -738,7 +477,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
738 return -EACCES; 477 return -EACCES;
739 478
740 /* If either I or some sibling (!= me) is exclusive, we can't overlap */ 479 /* If either I or some sibling (!= me) is exclusive, we can't overlap */
741 list_for_each_entry(c, &par->children, sibling) { 480 list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
481 c = cgroup_cs(cont);
742 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 482 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
743 c != cur && 483 c != cur &&
744 cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) 484 cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -749,62 +489,247 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
749 return -EINVAL; 489 return -EINVAL;
750 } 490 }
751 491
492 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
493 if (cgroup_task_count(cur->css.cgroup)) {
494 if (cpus_empty(trial->cpus_allowed) ||
495 nodes_empty(trial->mems_allowed)) {
496 return -ENOSPC;
497 }
498 }
499
752 return 0; 500 return 0;
753} 501}
754 502
755/* 503/*
756 * For a given cpuset cur, partition the system as follows 504 * Helper routine for rebuild_sched_domains().
757 * a. All cpus in the parent cpuset's cpus_allowed that are not part of any 505 * Do cpusets a, b have overlapping cpus_allowed masks?
758 * exclusive child cpusets
759 * b. All cpus in the current cpuset's cpus_allowed that are not part of any
760 * exclusive child cpusets
761 * Build these two partitions by calling partition_sched_domains
762 *
763 * Call with manage_mutex held. May nest a call to the
764 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
765 * Must not be called holding callback_mutex, because we must
766 * not call lock_cpu_hotplug() while holding callback_mutex.
767 */ 506 */
768 507
769static void update_cpu_domains(struct cpuset *cur) 508static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
770{ 509{
771 struct cpuset *c, *par = cur->parent; 510 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
772 cpumask_t pspan, cspan; 511}
773 512
774 if (par == NULL || cpus_empty(cur->cpus_allowed)) 513/*
775 return; 514 * rebuild_sched_domains()
515 *
516 * If the flag 'sched_load_balance' of any cpuset with non-empty
517 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
518 * which has that flag enabled, or if any cpuset with a non-empty
519 * 'cpus' is removed, then call this routine to rebuild the
520 * scheduler's dynamic sched domains.
521 *
522 * This routine builds a partial partition of the systems CPUs
 523 * (the set of non-overlapping cpumask_t's in the array 'part'
524 * below), and passes that partial partition to the kernel/sched.c
525 * partition_sched_domains() routine, which will rebuild the
526 * schedulers load balancing domains (sched domains) as specified
527 * by that partial partition. A 'partial partition' is a set of
528 * non-overlapping subsets whose union is a subset of that set.
529 *
530 * See "What is sched_load_balance" in Documentation/cpusets.txt
531 * for a background explanation of this.
532 *
533 * Does not return errors, on the theory that the callers of this
534 * routine would rather not worry about failures to rebuild sched
535 * domains when operating in the severe memory shortage situations
536 * that could cause allocation failures below.
537 *
538 * Call with cgroup_mutex held. May take callback_mutex during
539 * call due to the kfifo_alloc() and kmalloc() calls. May nest
540 * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
541 * Must not be called holding callback_mutex, because we must not
542 * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere
543 * the kernel nests callback_mutex inside lock_cpu_hotplug() calls.
544 * So the reverse nesting would risk an ABBA deadlock.
545 *
546 * The three key local variables below are:
547 * q - a kfifo queue of cpuset pointers, used to implement a
548 * top-down scan of all cpusets. This scan loads a pointer
549 * to each cpuset marked is_sched_load_balance into the
 550 * array 'csa'. For our purposes, rebuilding the scheduler's
551 * sched domains, we can ignore !is_sched_load_balance cpusets.
552 * csa - (for CpuSet Array) Array of pointers to all the cpusets
553 * that need to be load balanced, for convenient iterative
554 * access by the subsequent code that finds the best partition,
555 * i.e the set of domains (subsets) of CPUs such that the
556 * cpus_allowed of every cpuset marked is_sched_load_balance
557 * is a subset of one of these domains, while there are as
558 * many such domains as possible, each as small as possible.
559 * doms - Conversion of 'csa' to an array of cpumasks, for passing to
560 * the kernel/sched.c routine partition_sched_domains() in a
561 * convenient format, that can be easily compared to the prior
562 * value to determine what partition elements (sched domains)
563 * were changed (added or removed.)
564 *
565 * Finding the best partition (set of domains):
566 * The triple nested loops below over i, j, k scan over the
567 * load balanced cpusets (using the array of cpuset pointers in
568 * csa[]) looking for pairs of cpusets that have overlapping
569 * cpus_allowed, but which don't have the same 'pn' partition
 570 * number, and puts them in the same partition number. It keeps
571 * looping on the 'restart' label until it can no longer find
572 * any such pairs.
573 *
574 * The union of the cpus_allowed masks from the set of
575 * all cpusets having the same 'pn' value then form the one
576 * element of the partition (one sched domain) to be passed to
577 * partition_sched_domains().
578 */
776 579
777 /* 580static void rebuild_sched_domains(void)
778 * Get all cpus from parent's cpus_allowed not part of exclusive 581{
779 * children 582 struct kfifo *q; /* queue of cpusets to be scanned */
780 */ 583 struct cpuset *cp; /* scans q */
781 pspan = par->cpus_allowed; 584 struct cpuset **csa; /* array of all cpuset ptrs */
782 list_for_each_entry(c, &par->children, sibling) { 585 int csn; /* how many cpuset ptrs in csa so far */
783 if (is_cpu_exclusive(c)) 586 int i, j, k; /* indices for partition finding loops */
784 cpus_andnot(pspan, pspan, c->cpus_allowed); 587 cpumask_t *doms; /* resulting partition; i.e. sched domains */
588 int ndoms; /* number of sched domains in result */
589 int nslot; /* next empty doms[] cpumask_t slot */
590
591 q = NULL;
592 csa = NULL;
593 doms = NULL;
594
595 /* Special case for the 99% of systems with one, full, sched domain */
596 if (is_sched_load_balance(&top_cpuset)) {
597 ndoms = 1;
598 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
599 if (!doms)
600 goto rebuild;
601 *doms = top_cpuset.cpus_allowed;
602 goto rebuild;
785 } 603 }
786 if (!is_cpu_exclusive(cur)) { 604
787 cpus_or(pspan, pspan, cur->cpus_allowed); 605 q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
788 if (cpus_equal(pspan, cur->cpus_allowed)) 606 if (IS_ERR(q))
789 return; 607 goto done;
790 cspan = CPU_MASK_NONE; 608 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
791 } else { 609 if (!csa)
792 if (cpus_empty(pspan)) 610 goto done;
793 return; 611 csn = 0;
794 cspan = cur->cpus_allowed; 612
795 /* 613 cp = &top_cpuset;
796 * Get all cpus from current cpuset's cpus_allowed not part 614 __kfifo_put(q, (void *)&cp, sizeof(cp));
797 * of exclusive children 615 while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
798 */ 616 struct cgroup *cont;
799 list_for_each_entry(c, &cur->children, sibling) { 617 struct cpuset *child; /* scans child cpusets of cp */
800 if (is_cpu_exclusive(c)) 618 if (is_sched_load_balance(cp))
801 cpus_andnot(cspan, cspan, c->cpus_allowed); 619 csa[csn++] = cp;
620 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
621 child = cgroup_cs(cont);
622 __kfifo_put(q, (void *)&child, sizeof(cp));
623 }
624 }
625
626 for (i = 0; i < csn; i++)
627 csa[i]->pn = i;
628 ndoms = csn;
629
630restart:
631 /* Find the best partition (set of sched domains) */
632 for (i = 0; i < csn; i++) {
633 struct cpuset *a = csa[i];
634 int apn = a->pn;
635
636 for (j = 0; j < csn; j++) {
637 struct cpuset *b = csa[j];
638 int bpn = b->pn;
639
640 if (apn != bpn && cpusets_overlap(a, b)) {
641 for (k = 0; k < csn; k++) {
642 struct cpuset *c = csa[k];
643
644 if (c->pn == bpn)
645 c->pn = apn;
646 }
647 ndoms--; /* one less element */
648 goto restart;
649 }
650 }
651 }
652
653 /* Convert <csn, csa> to <ndoms, doms> */
654 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
655 if (!doms)
656 goto rebuild;
657
658 for (nslot = 0, i = 0; i < csn; i++) {
659 struct cpuset *a = csa[i];
660 int apn = a->pn;
661
662 if (apn >= 0) {
663 cpumask_t *dp = doms + nslot;
664
665 if (nslot == ndoms) {
666 static int warnings = 10;
667 if (warnings) {
668 printk(KERN_WARNING
669 "rebuild_sched_domains confused:"
670 " nslot %d, ndoms %d, csn %d, i %d,"
671 " apn %d\n",
672 nslot, ndoms, csn, i, apn);
673 warnings--;
674 }
675 continue;
676 }
677
678 cpus_clear(*dp);
679 for (j = i; j < csn; j++) {
680 struct cpuset *b = csa[j];
681
682 if (apn == b->pn) {
683 cpus_or(*dp, *dp, b->cpus_allowed);
684 b->pn = -1;
685 }
686 }
687 nslot++;
802 } 688 }
803 } 689 }
690 BUG_ON(nslot != ndoms);
804 691
692rebuild:
693 /* Have scheduler rebuild sched domains */
805 lock_cpu_hotplug(); 694 lock_cpu_hotplug();
806 partition_sched_domains(&pspan, &cspan); 695 partition_sched_domains(ndoms, doms);
807 unlock_cpu_hotplug(); 696 unlock_cpu_hotplug();
697
698done:
699 if (q && !IS_ERR(q))
700 kfifo_free(q);
701 kfree(csa);
702 /* Don't kfree(doms) -- partition_sched_domains() does that. */
703}
704
705static inline int started_after_time(struct task_struct *t1,
706 struct timespec *time,
707 struct task_struct *t2)
708{
709 int start_diff = timespec_compare(&t1->start_time, time);
710 if (start_diff > 0) {
711 return 1;
712 } else if (start_diff < 0) {
713 return 0;
714 } else {
715 /*
716 * Arbitrarily, if two processes started at the same
717 * time, we'll say that the lower pointer value
718 * started first. Note that t2 may have exited by now
719 * so this may not be a valid pointer any longer, but
720 * that's fine - it still serves to distinguish
721 * between two tasks started (effectively)
722 * simultaneously.
723 */
724 return t1 > t2;
725 }
726}
727
728static inline int started_after(void *p1, void *p2)
729{
730 struct task_struct *t1 = p1;
731 struct task_struct *t2 = p2;
732 return started_after_time(t1, &t2->start_time, t2);
808} 733}
809 734
810/* 735/*
@@ -814,7 +739,15 @@ static void update_cpu_domains(struct cpuset *cur)
814static int update_cpumask(struct cpuset *cs, char *buf) 739static int update_cpumask(struct cpuset *cs, char *buf)
815{ 740{
816 struct cpuset trialcs; 741 struct cpuset trialcs;
817 int retval, cpus_unchanged; 742 int retval, i;
743 int is_load_balanced;
744 struct cgroup_iter it;
745 struct cgroup *cgrp = cs->css.cgroup;
746 struct task_struct *p, *dropped;
747 /* Never dereference latest_task, since it's not refcounted */
748 struct task_struct *latest_task = NULL;
749 struct ptr_heap heap;
750 struct timespec latest_time = { 0, 0 };
818 751
819 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ 752 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
820 if (cs == &top_cpuset) 753 if (cs == &top_cpuset)
@@ -823,11 +756,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
823 trialcs = *cs; 756 trialcs = *cs;
824 757
825 /* 758 /*
826 * We allow a cpuset's cpus_allowed to be empty; if it has attached 759 * An empty cpus_allowed is ok iff there are no tasks in the cpuset.
827 * tasks, we'll catch it later when we validate the change and return 760 * Since cpulist_parse() fails on an empty mask, we special case
828 * -ENOSPC. 761 * that parsing. The validate_change() call ensures that cpusets
762 * with tasks have cpus.
829 */ 763 */
830 if (!buf[0] || (buf[0] == '\n' && !buf[1])) { 764 buf = strstrip(buf);
765 if (!*buf) {
831 cpus_clear(trialcs.cpus_allowed); 766 cpus_clear(trialcs.cpus_allowed);
832 } else { 767 } else {
833 retval = cpulist_parse(buf, trialcs.cpus_allowed); 768 retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -835,18 +770,79 @@ static int update_cpumask(struct cpuset *cs, char *buf)
835 return retval; 770 return retval;
836 } 771 }
837 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); 772 cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
838 /* cpus_allowed cannot be empty for a cpuset with attached tasks. */
839 if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed))
840 return -ENOSPC;
841 retval = validate_change(cs, &trialcs); 773 retval = validate_change(cs, &trialcs);
842 if (retval < 0) 774 if (retval < 0)
843 return retval; 775 return retval;
844 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); 776
777 /* Nothing to do if the cpus didn't change */
778 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
779 return 0;
780 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
781 if (retval)
782 return retval;
783
784 is_load_balanced = is_sched_load_balance(&trialcs);
785
845 mutex_lock(&callback_mutex); 786 mutex_lock(&callback_mutex);
846 cs->cpus_allowed = trialcs.cpus_allowed; 787 cs->cpus_allowed = trialcs.cpus_allowed;
847 mutex_unlock(&callback_mutex); 788 mutex_unlock(&callback_mutex);
848 if (is_cpu_exclusive(cs) && !cpus_unchanged) 789
849 update_cpu_domains(cs); 790 again:
791 /*
792 * Scan tasks in the cpuset, and update the cpumasks of any
793 * that need an update. Since we can't call set_cpus_allowed()
794 * while holding tasklist_lock, gather tasks to be processed
795 * in a heap structure. If the statically-sized heap fills up,
796 * overflow tasks that started later, and in future iterations
797 * only consider tasks that started after the latest task in
798 * the previous pass. This guarantees forward progress and
799	 * that we don't miss any tasks.
800 */
801 heap.size = 0;
802 cgroup_iter_start(cgrp, &it);
803 while ((p = cgroup_iter_next(cgrp, &it))) {
804 /* Only affect tasks that don't have the right cpus_allowed */
805 if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
806 continue;
807 /*
808 * Only process tasks that started after the last task
809 * we processed
810 */
811 if (!started_after_time(p, &latest_time, latest_task))
812 continue;
813 dropped = heap_insert(&heap, p);
814 if (dropped == NULL) {
815 get_task_struct(p);
816 } else if (dropped != p) {
817 get_task_struct(p);
818 put_task_struct(dropped);
819 }
820 }
821 cgroup_iter_end(cgrp, &it);
822 if (heap.size) {
823 for (i = 0; i < heap.size; i++) {
824 struct task_struct *p = heap.ptrs[i];
825 if (i == 0) {
826 latest_time = p->start_time;
827 latest_task = p;
828 }
829 set_cpus_allowed(p, cs->cpus_allowed);
830 put_task_struct(p);
831 }
832 /*
833 * If we had to process any tasks at all, scan again
834 * in case some of them were in the middle of forking
835 * children that didn't notice the new cpumask
836 * restriction. Not the most efficient way to do it,
837 * but it avoids having to take callback_mutex in the
838 * fork path
839 */
840 goto again;
841 }
842 heap_free(&heap);
843 if (is_load_balanced)
844 rebuild_sched_domains();
845
850 return 0; 846 return 0;
851} 847}
852 848
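
The heap-based scan in update_cpumask() works around the fact that set_cpus_allowed() cannot be called under tasklist_lock: tasks are gathered into a bounded heap ordered by start time, one batch is processed at a time, and each new pass only considers tasks that started after the latest task already handled, which guarantees forward progress even while new tasks keep forking. A stand-alone sketch of that batching idea, with a plain array standing in for the kernel's ptr_heap (illustrative only; the task list, batch size and start times are invented):

#include <stdio.h>

#define NTASKS 7
#define BATCH  3

struct task {
	int id;
	int start_time;
	int done;
};

/* Pick up to BATCH unprocessed tasks that started after latest_time,
 * preferring the earliest starters; a linear stand-in for the bounded
 * ptr_heap that update_cpumask() fills under the cgroup iterator. */
static int pick_batch(struct task *t, int n, int latest_time,
		      struct task **out)
{
	int count = 0, i, j, slot;

	for (i = 0; i < n; i++) {
		if (t[i].done || t[i].start_time <= latest_time)
			continue;
		if (count < BATCH) {
			out[count++] = &t[i];
			continue;
		}
		/* Batch full: displace the latest starter if this task
		 * started earlier ("overflow tasks that started later"). */
		slot = 0;
		for (j = 1; j < BATCH; j++)
			if (out[j]->start_time > out[slot]->start_time)
				slot = j;
		if (t[i].start_time < out[slot]->start_time)
			out[slot] = &t[i];
	}
	return count;
}

int main(void)
{
	struct task tasks[NTASKS] = {
		{ 1, 10, 0 }, { 2, 40, 0 }, { 3, 20, 0 }, { 4, 70, 0 },
		{ 5, 50, 0 }, { 6, 30, 0 }, { 7, 60, 0 },
	};
	struct task *batch[BATCH];
	int latest_time = 0, pass = 0, n, i;

	while ((n = pick_batch(tasks, NTASKS, latest_time, batch)) > 0) {
		pass++;
		for (i = 0; i < n; i++) {
			printf("pass %d: task %d (started at %d)\n",
			       pass, batch[i]->id, batch[i]->start_time);
			batch[i]->done = 1;
			if (batch[i]->start_time > latest_time)
				latest_time = batch[i]->start_time;
		}
	}
	return 0;
}
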
@@ -895,7 +891,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
895 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 891 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
896 892
897 mutex_lock(&callback_mutex); 893 mutex_lock(&callback_mutex);
898 guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed); 894 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
899 mutex_unlock(&callback_mutex); 895 mutex_unlock(&callback_mutex);
900} 896}
901 897
@@ -913,46 +909,50 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
913 * their mempolicies to the cpusets new mems_allowed. 909 * their mempolicies to the cpusets new mems_allowed.
914 */ 910 */
915 911
912static void *cpuset_being_rebound;
913
916static int update_nodemask(struct cpuset *cs, char *buf) 914static int update_nodemask(struct cpuset *cs, char *buf)
917{ 915{
918 struct cpuset trialcs; 916 struct cpuset trialcs;
919 nodemask_t oldmem; 917 nodemask_t oldmem;
920 struct task_struct *g, *p; 918 struct task_struct *p;
921 struct mm_struct **mmarray; 919 struct mm_struct **mmarray;
922 int i, n, ntasks; 920 int i, n, ntasks;
923 int migrate; 921 int migrate;
924 int fudge; 922 int fudge;
925 int retval; 923 int retval;
924 struct cgroup_iter it;
926 925
927 /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */ 926 /*
927	 * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
928 * it's read-only
929 */
928 if (cs == &top_cpuset) 930 if (cs == &top_cpuset)
929 return -EACCES; 931 return -EACCES;
930 932
931 trialcs = *cs; 933 trialcs = *cs;
932 934
933 /* 935 /*
934 * We allow a cpuset's mems_allowed to be empty; if it has attached 936 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
935 * tasks, we'll catch it later when we validate the change and return 937 * Since nodelist_parse() fails on an empty mask, we special case
936 * -ENOSPC. 938 * that parsing. The validate_change() call ensures that cpusets
939 * with tasks have memory.
937 */ 940 */
938 if (!buf[0] || (buf[0] == '\n' && !buf[1])) { 941 buf = strstrip(buf);
942 if (!*buf) {
939 nodes_clear(trialcs.mems_allowed); 943 nodes_clear(trialcs.mems_allowed);
940 } else { 944 } else {
941 retval = nodelist_parse(buf, trialcs.mems_allowed); 945 retval = nodelist_parse(buf, trialcs.mems_allowed);
942 if (retval < 0) 946 if (retval < 0)
943 goto done; 947 goto done;
944 } 948 }
945 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); 949 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed,
950 node_states[N_HIGH_MEMORY]);
946 oldmem = cs->mems_allowed; 951 oldmem = cs->mems_allowed;
947 if (nodes_equal(oldmem, trialcs.mems_allowed)) { 952 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
948 retval = 0; /* Too easy - nothing to do */ 953 retval = 0; /* Too easy - nothing to do */
949 goto done; 954 goto done;
950 } 955 }
951 /* mems_allowed cannot be empty for a cpuset with attached tasks. */
952 if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) {
953 retval = -ENOSPC;
954 goto done;
955 }
956 retval = validate_change(cs, &trialcs); 956 retval = validate_change(cs, &trialcs);
957 if (retval < 0) 957 if (retval < 0)
958 goto done; 958 goto done;
@@ -962,7 +962,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
962 cs->mems_generation = cpuset_mems_generation++; 962 cs->mems_generation = cpuset_mems_generation++;
963 mutex_unlock(&callback_mutex); 963 mutex_unlock(&callback_mutex);
964 964
965 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ 965 cpuset_being_rebound = cs; /* causes mpol_copy() rebind */
966 966
967 fudge = 10; /* spare mmarray[] slots */ 967 fudge = 10; /* spare mmarray[] slots */
968 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ 968 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
@@ -976,13 +976,13 @@ static int update_nodemask(struct cpuset *cs, char *buf)
976 * enough mmarray[] w/o using GFP_ATOMIC. 976 * enough mmarray[] w/o using GFP_ATOMIC.
977 */ 977 */
978 while (1) { 978 while (1) {
979 ntasks = atomic_read(&cs->count); /* guess */ 979 ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
980 ntasks += fudge; 980 ntasks += fudge;
981 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); 981 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
982 if (!mmarray) 982 if (!mmarray)
983 goto done; 983 goto done;
984 read_lock(&tasklist_lock); /* block fork */ 984 read_lock(&tasklist_lock); /* block fork */
985 if (atomic_read(&cs->count) <= ntasks) 985 if (cgroup_task_count(cs->css.cgroup) <= ntasks)
986 break; /* got enough */ 986 break; /* got enough */
987 read_unlock(&tasklist_lock); /* try again */ 987 read_unlock(&tasklist_lock); /* try again */
988 kfree(mmarray); 988 kfree(mmarray);
@@ -991,21 +991,21 @@ static int update_nodemask(struct cpuset *cs, char *buf)
991 n = 0; 991 n = 0;
992 992
993 /* Load up mmarray[] with mm reference for each task in cpuset. */ 993 /* Load up mmarray[] with mm reference for each task in cpuset. */
994 do_each_thread(g, p) { 994 cgroup_iter_start(cs->css.cgroup, &it);
995 while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
995 struct mm_struct *mm; 996 struct mm_struct *mm;
996 997
997 if (n >= ntasks) { 998 if (n >= ntasks) {
998 printk(KERN_WARNING 999 printk(KERN_WARNING
999 "Cpuset mempolicy rebind incomplete.\n"); 1000 "Cpuset mempolicy rebind incomplete.\n");
1000 continue; 1001 break;
1001 } 1002 }
1002 if (p->cpuset != cs)
1003 continue;
1004 mm = get_task_mm(p); 1003 mm = get_task_mm(p);
1005 if (!mm) 1004 if (!mm)
1006 continue; 1005 continue;
1007 mmarray[n++] = mm; 1006 mmarray[n++] = mm;
1008 } while_each_thread(g, p); 1007 }
1008 cgroup_iter_end(cs->css.cgroup, &it);
1009 read_unlock(&tasklist_lock); 1009 read_unlock(&tasklist_lock);
1010 1010
1011 /* 1011 /*
@@ -1033,12 +1033,17 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1033 1033
1034 /* We're done rebinding vma's to this cpusets new mems_allowed. */ 1034 /* We're done rebinding vma's to this cpusets new mems_allowed. */
1035 kfree(mmarray); 1035 kfree(mmarray);
1036 set_cpuset_being_rebound(NULL); 1036 cpuset_being_rebound = NULL;
1037 retval = 0; 1037 retval = 0;
1038done: 1038done:
1039 return retval; 1039 return retval;
1040} 1040}
1041 1041
1042int current_cpuset_is_being_rebound(void)
1043{
1044 return task_cs(current) == cpuset_being_rebound;
1045}
1046
1042/* 1047/*
1043 * Call with manage_mutex held. 1048 * Call with manage_mutex held.
1044 */ 1049 */
@@ -1055,6 +1060,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1055/* 1060/*
1056 * update_flag - read a 0 or a 1 in a file and update associated flag 1061 * update_flag - read a 0 or a 1 in a file and update associated flag
1057 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 1062 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
1063 * CS_SCHED_LOAD_BALANCE,
1058 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, 1064 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
1059 * CS_SPREAD_PAGE, CS_SPREAD_SLAB) 1065 * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
1060 * cs: the cpuset to update 1066 * cs: the cpuset to update
@@ -1067,7 +1073,8 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1067{ 1073{
1068 int turning_on; 1074 int turning_on;
1069 struct cpuset trialcs; 1075 struct cpuset trialcs;
1070 int err, cpu_exclusive_changed; 1076 int err;
1077 int cpus_nonempty, balance_flag_changed;
1071 1078
1072 turning_on = (simple_strtoul(buf, NULL, 10) != 0); 1079 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
1073 1080
@@ -1080,14 +1087,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1080 err = validate_change(cs, &trialcs); 1087 err = validate_change(cs, &trialcs);
1081 if (err < 0) 1088 if (err < 0)
1082 return err; 1089 return err;
1083 cpu_exclusive_changed = 1090
1084 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 1091 cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
1092 balance_flag_changed = (is_sched_load_balance(cs) !=
1093 is_sched_load_balance(&trialcs));
1094
1085 mutex_lock(&callback_mutex); 1095 mutex_lock(&callback_mutex);
1086 cs->flags = trialcs.flags; 1096 cs->flags = trialcs.flags;
1087 mutex_unlock(&callback_mutex); 1097 mutex_unlock(&callback_mutex);
1088 1098
1089 if (cpu_exclusive_changed) 1099 if (cpus_nonempty && balance_flag_changed)
1090 update_cpu_domains(cs); 1100 rebuild_sched_domains();
1101
1091 return 0; 1102 return 0;
1092} 1103}
1093 1104
@@ -1189,85 +1200,34 @@ static int fmeter_getrate(struct fmeter *fmp)
1189 return val; 1200 return val;
1190} 1201}
1191 1202
1192/* 1203static int cpuset_can_attach(struct cgroup_subsys *ss,
1193 * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly 1204 struct cgroup *cont, struct task_struct *tsk)
1194 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
1195 * notified on release.
1196 *
1197 * Call holding manage_mutex. May take callback_mutex and task_lock of
1198 * the task 'pid' during call.
1199 */
1200
1201static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1202{ 1205{
1203 pid_t pid; 1206 struct cpuset *cs = cgroup_cs(cont);
1204 struct task_struct *tsk;
1205 struct cpuset *oldcs;
1206 cpumask_t cpus;
1207 nodemask_t from, to;
1208 struct mm_struct *mm;
1209 int retval;
1210 1207
1211 if (sscanf(pidbuf, "%d", &pid) != 1)
1212 return -EIO;
1213 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1208 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1214 return -ENOSPC; 1209 return -ENOSPC;
1215 1210
1216 if (pid) { 1211 return security_task_setscheduler(tsk, 0, NULL);
1217 read_lock(&tasklist_lock); 1212}
1218
1219 tsk = find_task_by_pid(pid);
1220 if (!tsk || tsk->flags & PF_EXITING) {
1221 read_unlock(&tasklist_lock);
1222 return -ESRCH;
1223 }
1224
1225 get_task_struct(tsk);
1226 read_unlock(&tasklist_lock);
1227
1228 if ((current->euid) && (current->euid != tsk->uid)
1229 && (current->euid != tsk->suid)) {
1230 put_task_struct(tsk);
1231 return -EACCES;
1232 }
1233 } else {
1234 tsk = current;
1235 get_task_struct(tsk);
1236 }
1237 1213
1238 retval = security_task_setscheduler(tsk, 0, NULL); 1214static void cpuset_attach(struct cgroup_subsys *ss,
1239 if (retval) { 1215 struct cgroup *cont, struct cgroup *oldcont,
1240 put_task_struct(tsk); 1216 struct task_struct *tsk)
1241 return retval; 1217{
1242 } 1218 cpumask_t cpus;
1219 nodemask_t from, to;
1220 struct mm_struct *mm;
1221 struct cpuset *cs = cgroup_cs(cont);
1222 struct cpuset *oldcs = cgroup_cs(oldcont);
1243 1223
1244 mutex_lock(&callback_mutex); 1224 mutex_lock(&callback_mutex);
1245
1246 task_lock(tsk);
1247 oldcs = tsk->cpuset;
1248 /*
1249 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
1250 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
1251 * then fail this attach_task(), to avoid breaking top_cpuset.count.
1252 */
1253 if (tsk->flags & PF_EXITING) {
1254 task_unlock(tsk);
1255 mutex_unlock(&callback_mutex);
1256 put_task_struct(tsk);
1257 return -ESRCH;
1258 }
1259 atomic_inc(&cs->count);
1260 rcu_assign_pointer(tsk->cpuset, cs);
1261 task_unlock(tsk);
1262
1263 guarantee_online_cpus(cs, &cpus); 1225 guarantee_online_cpus(cs, &cpus);
1264 set_cpus_allowed(tsk, cpus); 1226 set_cpus_allowed(tsk, cpus);
1227 mutex_unlock(&callback_mutex);
1265 1228
1266 from = oldcs->mems_allowed; 1229 from = oldcs->mems_allowed;
1267 to = cs->mems_allowed; 1230 to = cs->mems_allowed;
1268
1269 mutex_unlock(&callback_mutex);
1270
1271 mm = get_task_mm(tsk); 1231 mm = get_task_mm(tsk);
1272 if (mm) { 1232 if (mm) {
1273 mpol_rebind_mm(mm, &to); 1233 mpol_rebind_mm(mm, &to);
@@ -1276,44 +1236,36 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1276 mmput(mm); 1236 mmput(mm);
1277 } 1237 }
1278 1238
1279 put_task_struct(tsk);
1280 synchronize_rcu();
1281 if (atomic_dec_and_test(&oldcs->count))
1282 check_for_release(oldcs, ppathbuf);
1283 return 0;
1284} 1239}
1285 1240
1286/* The various types of files and directories in a cpuset file system */ 1241/* The various types of files and directories in a cpuset file system */
1287 1242
1288typedef enum { 1243typedef enum {
1289 FILE_ROOT,
1290 FILE_DIR,
1291 FILE_MEMORY_MIGRATE, 1244 FILE_MEMORY_MIGRATE,
1292 FILE_CPULIST, 1245 FILE_CPULIST,
1293 FILE_MEMLIST, 1246 FILE_MEMLIST,
1294 FILE_CPU_EXCLUSIVE, 1247 FILE_CPU_EXCLUSIVE,
1295 FILE_MEM_EXCLUSIVE, 1248 FILE_MEM_EXCLUSIVE,
1296 FILE_NOTIFY_ON_RELEASE, 1249 FILE_SCHED_LOAD_BALANCE,
1297 FILE_MEMORY_PRESSURE_ENABLED, 1250 FILE_MEMORY_PRESSURE_ENABLED,
1298 FILE_MEMORY_PRESSURE, 1251 FILE_MEMORY_PRESSURE,
1299 FILE_SPREAD_PAGE, 1252 FILE_SPREAD_PAGE,
1300 FILE_SPREAD_SLAB, 1253 FILE_SPREAD_SLAB,
1301 FILE_TASKLIST,
1302} cpuset_filetype_t; 1254} cpuset_filetype_t;
1303 1255
1304static ssize_t cpuset_common_file_write(struct file *file, 1256static ssize_t cpuset_common_file_write(struct cgroup *cont,
1257 struct cftype *cft,
1258 struct file *file,
1305 const char __user *userbuf, 1259 const char __user *userbuf,
1306 size_t nbytes, loff_t *unused_ppos) 1260 size_t nbytes, loff_t *unused_ppos)
1307{ 1261{
1308 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); 1262 struct cpuset *cs = cgroup_cs(cont);
1309 struct cftype *cft = __d_cft(file->f_path.dentry);
1310 cpuset_filetype_t type = cft->private; 1263 cpuset_filetype_t type = cft->private;
1311 char *buffer; 1264 char *buffer;
1312 char *pathbuf = NULL;
1313 int retval = 0; 1265 int retval = 0;
1314 1266
1315 /* Crude upper limit on largest legitimate cpulist user might write. */ 1267 /* Crude upper limit on largest legitimate cpulist user might write. */
1316 if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES)) 1268 if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
1317 return -E2BIG; 1269 return -E2BIG;
1318 1270
1319 /* +1 for nul-terminator */ 1271 /* +1 for nul-terminator */
@@ -1326,9 +1278,9 @@ static ssize_t cpuset_common_file_write(struct file *file,
1326 } 1278 }
1327 buffer[nbytes] = 0; /* nul-terminate */ 1279 buffer[nbytes] = 0; /* nul-terminate */
1328 1280
1329 mutex_lock(&manage_mutex); 1281 cgroup_lock();
1330 1282
1331 if (is_removed(cs)) { 1283 if (cgroup_is_removed(cont)) {
1332 retval = -ENODEV; 1284 retval = -ENODEV;
1333 goto out2; 1285 goto out2;
1334 } 1286 }
@@ -1346,8 +1298,8 @@ static ssize_t cpuset_common_file_write(struct file *file,
1346 case FILE_MEM_EXCLUSIVE: 1298 case FILE_MEM_EXCLUSIVE:
1347 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); 1299 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
1348 break; 1300 break;
1349 case FILE_NOTIFY_ON_RELEASE: 1301 case FILE_SCHED_LOAD_BALANCE:
1350 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); 1302 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
1351 break; 1303 break;
1352 case FILE_MEMORY_MIGRATE: 1304 case FILE_MEMORY_MIGRATE:
1353 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); 1305 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
@@ -1366,9 +1318,6 @@ static ssize_t cpuset_common_file_write(struct file *file,
1366 retval = update_flag(CS_SPREAD_SLAB, cs, buffer); 1318 retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
1367 cs->mems_generation = cpuset_mems_generation++; 1319 cs->mems_generation = cpuset_mems_generation++;
1368 break; 1320 break;
1369 case FILE_TASKLIST:
1370 retval = attach_task(cs, buffer, &pathbuf);
1371 break;
1372 default: 1321 default:
1373 retval = -EINVAL; 1322 retval = -EINVAL;
1374 goto out2; 1323 goto out2;
@@ -1377,30 +1326,12 @@ static ssize_t cpuset_common_file_write(struct file *file,
1377 if (retval == 0) 1326 if (retval == 0)
1378 retval = nbytes; 1327 retval = nbytes;
1379out2: 1328out2:
1380 mutex_unlock(&manage_mutex); 1329 cgroup_unlock();
1381 cpuset_release_agent(pathbuf);
1382out1: 1330out1:
1383 kfree(buffer); 1331 kfree(buffer);
1384 return retval; 1332 return retval;
1385} 1333}
1386 1334
1387static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
1388 size_t nbytes, loff_t *ppos)
1389{
1390 ssize_t retval = 0;
1391 struct cftype *cft = __d_cft(file->f_path.dentry);
1392 if (!cft)
1393 return -ENODEV;
1394
1395 /* special function ? */
1396 if (cft->write)
1397 retval = cft->write(file, buf, nbytes, ppos);
1398 else
1399 retval = cpuset_common_file_write(file, buf, nbytes, ppos);
1400
1401 return retval;
1402}
1403
1404/* 1335/*
1405 * These ascii lists should be read in a single call, by using a user 1336 * These ascii lists should be read in a single call, by using a user
1406 * buffer large enough to hold the entire map. If read in smaller 1337 * buffer large enough to hold the entire map. If read in smaller
@@ -1435,17 +1366,19 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1435 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1366 return nodelist_scnprintf(page, PAGE_SIZE, mask);
1436} 1367}
1437 1368
1438static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, 1369static ssize_t cpuset_common_file_read(struct cgroup *cont,
1439 size_t nbytes, loff_t *ppos) 1370 struct cftype *cft,
1371 struct file *file,
1372 char __user *buf,
1373 size_t nbytes, loff_t *ppos)
1440{ 1374{
1441 struct cftype *cft = __d_cft(file->f_path.dentry); 1375 struct cpuset *cs = cgroup_cs(cont);
1442 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1443 cpuset_filetype_t type = cft->private; 1376 cpuset_filetype_t type = cft->private;
1444 char *page; 1377 char *page;
1445 ssize_t retval = 0; 1378 ssize_t retval = 0;
1446 char *s; 1379 char *s;
1447 1380
1448 if (!(page = (char *)__get_free_page(GFP_KERNEL))) 1381 if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1449 return -ENOMEM; 1382 return -ENOMEM;
1450 1383
1451 s = page; 1384 s = page;
@@ -1463,8 +1396,8 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1463 case FILE_MEM_EXCLUSIVE: 1396 case FILE_MEM_EXCLUSIVE:
1464 *s++ = is_mem_exclusive(cs) ? '1' : '0'; 1397 *s++ = is_mem_exclusive(cs) ? '1' : '0';
1465 break; 1398 break;
1466 case FILE_NOTIFY_ON_RELEASE: 1399 case FILE_SCHED_LOAD_BALANCE:
1467 *s++ = notify_on_release(cs) ? '1' : '0'; 1400 *s++ = is_sched_load_balance(cs) ? '1' : '0';
1468 break; 1401 break;
1469 case FILE_MEMORY_MIGRATE: 1402 case FILE_MEMORY_MIGRATE:
1470 *s++ = is_memory_migrate(cs) ? '1' : '0'; 1403 *s++ = is_memory_migrate(cs) ? '1' : '0';
@@ -1493,390 +1426,150 @@ out:
1493 return retval; 1426 return retval;
1494} 1427}
1495 1428
1496static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbytes,
1497 loff_t *ppos)
1498{
1499 ssize_t retval = 0;
1500 struct cftype *cft = __d_cft(file->f_path.dentry);
1501 if (!cft)
1502 return -ENODEV;
1503 1429
1504 /* special function ? */
1505 if (cft->read)
1506 retval = cft->read(file, buf, nbytes, ppos);
1507 else
1508 retval = cpuset_common_file_read(file, buf, nbytes, ppos);
1509 1430
1510 return retval;
1511}
1512 1431
1513static int cpuset_file_open(struct inode *inode, struct file *file)
1514{
1515 int err;
1516 struct cftype *cft;
1517
1518 err = generic_file_open(inode, file);
1519 if (err)
1520 return err;
1521
1522 cft = __d_cft(file->f_path.dentry);
1523 if (!cft)
1524 return -ENODEV;
1525 if (cft->open)
1526 err = cft->open(inode, file);
1527 else
1528 err = 0;
1529
1530 return err;
1531}
1532
1533static int cpuset_file_release(struct inode *inode, struct file *file)
1534{
1535 struct cftype *cft = __d_cft(file->f_path.dentry);
1536 if (cft->release)
1537 return cft->release(inode, file);
1538 return 0;
1539}
1540
1541/*
1542 * cpuset_rename - Only allow simple rename of directories in place.
1543 */
1544static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
1545 struct inode *new_dir, struct dentry *new_dentry)
1546{
1547 if (!S_ISDIR(old_dentry->d_inode->i_mode))
1548 return -ENOTDIR;
1549 if (new_dentry->d_inode)
1550 return -EEXIST;
1551 if (old_dir != new_dir)
1552 return -EIO;
1553 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1554}
1555
1556static const struct file_operations cpuset_file_operations = {
1557 .read = cpuset_file_read,
1558 .write = cpuset_file_write,
1559 .llseek = generic_file_llseek,
1560 .open = cpuset_file_open,
1561 .release = cpuset_file_release,
1562};
1563
1564static const struct inode_operations cpuset_dir_inode_operations = {
1565 .lookup = simple_lookup,
1566 .mkdir = cpuset_mkdir,
1567 .rmdir = cpuset_rmdir,
1568 .rename = cpuset_rename,
1569};
1570
1571static int cpuset_create_file(struct dentry *dentry, int mode)
1572{
1573 struct inode *inode;
1574
1575 if (!dentry)
1576 return -ENOENT;
1577 if (dentry->d_inode)
1578 return -EEXIST;
1579
1580 inode = cpuset_new_inode(mode);
1581 if (!inode)
1582 return -ENOMEM;
1583
1584 if (S_ISDIR(mode)) {
1585 inode->i_op = &cpuset_dir_inode_operations;
1586 inode->i_fop = &simple_dir_operations;
1587
1588 /* start off with i_nlink == 2 (for "." entry) */
1589 inc_nlink(inode);
1590 } else if (S_ISREG(mode)) {
1591 inode->i_size = 0;
1592 inode->i_fop = &cpuset_file_operations;
1593 }
1594
1595 d_instantiate(dentry, inode);
1596 dget(dentry); /* Extra count - pin the dentry in core */
1597 return 0;
1598}
1599
1600/*
1601 * cpuset_create_dir - create a directory for an object.
1602 * cs: the cpuset we create the directory for.
1603 * It must have a valid ->parent field
1604 * And we are going to fill its ->dentry field.
1605 * name: The name to give to the cpuset directory. Will be copied.
1606 * mode: mode to set on new directory.
1607 */
1608
1609static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
1610{
1611 struct dentry *dentry = NULL;
1612 struct dentry *parent;
1613 int error = 0;
1614
1615 parent = cs->parent->dentry;
1616 dentry = cpuset_get_dentry(parent, name);
1617 if (IS_ERR(dentry))
1618 return PTR_ERR(dentry);
1619 error = cpuset_create_file(dentry, S_IFDIR | mode);
1620 if (!error) {
1621 dentry->d_fsdata = cs;
1622 inc_nlink(parent->d_inode);
1623 cs->dentry = dentry;
1624 }
1625 dput(dentry);
1626
1627 return error;
1628}
1629
1630static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
1631{
1632 struct dentry *dentry;
1633 int error;
1634
1635 mutex_lock(&dir->d_inode->i_mutex);
1636 dentry = cpuset_get_dentry(dir, cft->name);
1637 if (!IS_ERR(dentry)) {
1638 error = cpuset_create_file(dentry, 0644 | S_IFREG);
1639 if (!error)
1640 dentry->d_fsdata = (void *)cft;
1641 dput(dentry);
1642 } else
1643 error = PTR_ERR(dentry);
1644 mutex_unlock(&dir->d_inode->i_mutex);
1645 return error;
1646}
1647
1648/*
1649 * Stuff for reading the 'tasks' file.
1650 *
1651 * Reading this file can return large amounts of data if a cpuset has
1652 * *lots* of attached tasks. So it may need several calls to read(),
1653 * but we cannot guarantee that the information we produce is correct
1654 * unless we produce it entirely atomically.
1655 *
1656 * Upon tasks file open(), a struct ctr_struct is allocated, that
1657 * will have a pointer to an array (also allocated here). The struct
1658 * ctr_struct * is stored in file->private_data. Its resources will
1659 * be freed by release() when the file is closed. The array is used
1660 * to sprintf the PIDs and then used by read().
1661 */
1662
1663/* cpusets_tasks_read array */
1664
1665struct ctr_struct {
1666 char *buf;
1667 int bufsz;
1668};
1669
1670/*
1671 * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
1672 * Return actual number of pids loaded. No need to task_lock(p)
1673 * when reading out p->cpuset, as we don't really care if it changes
1674 * on the next cycle, and we are not going to try to dereference it.
1675 */
1676static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
1677{
1678 int n = 0;
1679 struct task_struct *g, *p;
1680
1681 read_lock(&tasklist_lock);
1682
1683 do_each_thread(g, p) {
1684 if (p->cpuset == cs) {
1685 if (unlikely(n == npids))
1686 goto array_full;
1687 pidarray[n++] = p->pid;
1688 }
1689 } while_each_thread(g, p);
1690
1691array_full:
1692 read_unlock(&tasklist_lock);
1693 return n;
1694}
1695
1696static int cmppid(const void *a, const void *b)
1697{
1698 return *(pid_t *)a - *(pid_t *)b;
1699}
1700
1701/*
1702 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
1703 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
1704 * count 'cnt' of how many chars would be written if buf were large enough.
1705 */
1706static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1707{
1708 int cnt = 0;
1709 int i;
1710
1711 for (i = 0; i < npids; i++)
1712 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
1713 return cnt;
1714}
1715
1716/*
1717 * Handle an open on 'tasks' file. Prepare a buffer listing the
1718 * process id's of tasks currently attached to the cpuset being opened.
1719 *
1720 * Does not require any specific cpuset mutexes, and does not take any.
1721 */
1722static int cpuset_tasks_open(struct inode *unused, struct file *file)
1723{
1724 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1725 struct ctr_struct *ctr;
1726 pid_t *pidarray;
1727 int npids;
1728 char c;
1729
1730 if (!(file->f_mode & FMODE_READ))
1731 return 0;
1732
1733 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
1734 if (!ctr)
1735 goto err0;
1736
1737 /*
1738 * If cpuset gets more users after we read count, we won't have
1739 * enough space - tough. This race is indistinguishable to the
1740 * caller from the case that the additional cpuset users didn't
1741 * show up until sometime later on.
1742 */
1743 npids = atomic_read(&cs->count);
1744 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
1745 if (!pidarray)
1746 goto err1;
1747
1748 npids = pid_array_load(pidarray, npids, cs);
1749 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
1750
1751 /* Call pid_array_to_buf() twice, first just to get bufsz */
1752 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
1753 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
1754 if (!ctr->buf)
1755 goto err2;
1756 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
1757
1758 kfree(pidarray);
1759 file->private_data = ctr;
1760 return 0;
1761
1762err2:
1763 kfree(pidarray);
1764err1:
1765 kfree(ctr);
1766err0:
1767 return -ENOMEM;
1768}
1769
1770static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
1771 size_t nbytes, loff_t *ppos)
1772{
1773 struct ctr_struct *ctr = file->private_data;
1774
1775 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1776}
1777
1778static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
1779{
1780 struct ctr_struct *ctr;
1781
1782 if (file->f_mode & FMODE_READ) {
1783 ctr = file->private_data;
1784 kfree(ctr->buf);
1785 kfree(ctr);
1786 }
1787 return 0;
1788}
1789 1432
1790/* 1433/*
1791 * for the common functions, 'private' gives the type of file 1434 * for the common functions, 'private' gives the type of file
1792 */ 1435 */
1793 1436
1794static struct cftype cft_tasks = {
1795 .name = "tasks",
1796 .open = cpuset_tasks_open,
1797 .read = cpuset_tasks_read,
1798 .release = cpuset_tasks_release,
1799 .private = FILE_TASKLIST,
1800};
1801
1802static struct cftype cft_cpus = { 1437static struct cftype cft_cpus = {
1803 .name = "cpus", 1438 .name = "cpus",
1439 .read = cpuset_common_file_read,
1440 .write = cpuset_common_file_write,
1804 .private = FILE_CPULIST, 1441 .private = FILE_CPULIST,
1805}; 1442};
1806 1443
1807static struct cftype cft_mems = { 1444static struct cftype cft_mems = {
1808 .name = "mems", 1445 .name = "mems",
1446 .read = cpuset_common_file_read,
1447 .write = cpuset_common_file_write,
1809 .private = FILE_MEMLIST, 1448 .private = FILE_MEMLIST,
1810}; 1449};
1811 1450
1812static struct cftype cft_cpu_exclusive = { 1451static struct cftype cft_cpu_exclusive = {
1813 .name = "cpu_exclusive", 1452 .name = "cpu_exclusive",
1453 .read = cpuset_common_file_read,
1454 .write = cpuset_common_file_write,
1814 .private = FILE_CPU_EXCLUSIVE, 1455 .private = FILE_CPU_EXCLUSIVE,
1815}; 1456};
1816 1457
1817static struct cftype cft_mem_exclusive = { 1458static struct cftype cft_mem_exclusive = {
1818 .name = "mem_exclusive", 1459 .name = "mem_exclusive",
1460 .read = cpuset_common_file_read,
1461 .write = cpuset_common_file_write,
1819 .private = FILE_MEM_EXCLUSIVE, 1462 .private = FILE_MEM_EXCLUSIVE,
1820}; 1463};
1821 1464
1822static struct cftype cft_notify_on_release = { 1465static struct cftype cft_sched_load_balance = {
1823 .name = "notify_on_release", 1466 .name = "sched_load_balance",
1824 .private = FILE_NOTIFY_ON_RELEASE, 1467 .read = cpuset_common_file_read,
1468 .write = cpuset_common_file_write,
1469 .private = FILE_SCHED_LOAD_BALANCE,
1825}; 1470};
1826 1471
1827static struct cftype cft_memory_migrate = { 1472static struct cftype cft_memory_migrate = {
1828 .name = "memory_migrate", 1473 .name = "memory_migrate",
1474 .read = cpuset_common_file_read,
1475 .write = cpuset_common_file_write,
1829 .private = FILE_MEMORY_MIGRATE, 1476 .private = FILE_MEMORY_MIGRATE,
1830}; 1477};
1831 1478
1832static struct cftype cft_memory_pressure_enabled = { 1479static struct cftype cft_memory_pressure_enabled = {
1833 .name = "memory_pressure_enabled", 1480 .name = "memory_pressure_enabled",
1481 .read = cpuset_common_file_read,
1482 .write = cpuset_common_file_write,
1834 .private = FILE_MEMORY_PRESSURE_ENABLED, 1483 .private = FILE_MEMORY_PRESSURE_ENABLED,
1835}; 1484};
1836 1485
1837static struct cftype cft_memory_pressure = { 1486static struct cftype cft_memory_pressure = {
1838 .name = "memory_pressure", 1487 .name = "memory_pressure",
1488 .read = cpuset_common_file_read,
1489 .write = cpuset_common_file_write,
1839 .private = FILE_MEMORY_PRESSURE, 1490 .private = FILE_MEMORY_PRESSURE,
1840}; 1491};
1841 1492
1842static struct cftype cft_spread_page = { 1493static struct cftype cft_spread_page = {
1843 .name = "memory_spread_page", 1494 .name = "memory_spread_page",
1495 .read = cpuset_common_file_read,
1496 .write = cpuset_common_file_write,
1844 .private = FILE_SPREAD_PAGE, 1497 .private = FILE_SPREAD_PAGE,
1845}; 1498};
1846 1499
1847static struct cftype cft_spread_slab = { 1500static struct cftype cft_spread_slab = {
1848 .name = "memory_spread_slab", 1501 .name = "memory_spread_slab",
1502 .read = cpuset_common_file_read,
1503 .write = cpuset_common_file_write,
1849 .private = FILE_SPREAD_SLAB, 1504 .private = FILE_SPREAD_SLAB,
1850}; 1505};
1851 1506
1852static int cpuset_populate_dir(struct dentry *cs_dentry) 1507static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1853{ 1508{
1854 int err; 1509 int err;
1855 1510
1856 if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0) 1511 if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0)
1857 return err;
1858 if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0)
1859 return err; 1512 return err;
1860 if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0) 1513 if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
1861 return err; 1514 return err;
1862 if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0) 1515 if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
1863 return err; 1516 return err;
1864 if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) 1517 if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
1865 return err; 1518 return err;
1866 if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) 1519 if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
1867 return err; 1520 return err;
1868 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) 1521 if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
1869 return err; 1522 return err;
1870 if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0) 1523 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1871 return err; 1524 return err;
1872 if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0) 1525 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
1873 return err; 1526 return err;
1874 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) 1527 if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
1875 return err; 1528 return err;
1529 /* memory_pressure_enabled is in root cpuset only */
1530 if (err == 0 && !cont->parent)
1531 err = cgroup_add_file(cont, ss,
1532 &cft_memory_pressure_enabled);
1876 return 0; 1533 return 0;
1877} 1534}
1878 1535
1879/* 1536/*
1537 * post_clone() is called at the end of cgroup_clone().
1538 * 'cgroup' was just created automatically as a result of
1539 * a cgroup_clone(), and the current task is about to
1540 * be moved into 'cgroup'.
1541 *
1542 * Currently we refuse to set up the cgroup - thereby
1543 * refusing the task to be entered, and as a result refusing
1544 * the sys_unshare() or clone() which initiated it - if any
1545 * sibling cpusets have exclusive cpus or mem.
1546 *
1547 * If this becomes a problem for some users who wish to
1548 * allow that scenario, then cpuset_post_clone() could be
1549 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1550 * (and likewise for mems) to the new cgroup.
1551 */
1552static void cpuset_post_clone(struct cgroup_subsys *ss,
1553 struct cgroup *cgroup)
1554{
1555 struct cgroup *parent, *child;
1556 struct cpuset *cs, *parent_cs;
1557
1558 parent = cgroup->parent;
1559 list_for_each_entry(child, &parent->children, sibling) {
1560 cs = cgroup_cs(child);
1561 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1562 return;
1563 }
1564 cs = cgroup_cs(cgroup);
1565 parent_cs = cgroup_cs(parent);
1566
1567 cs->mems_allowed = parent_cs->mems_allowed;
1568 cs->cpus_allowed = parent_cs->cpus_allowed;
1569 return;
1570}
1571
1572/*
1880 * cpuset_create - create a cpuset 1573 * cpuset_create - create a cpuset
1881 * parent: cpuset that will be parent of the new cpuset. 1574 * parent: cpuset that will be parent of the new cpuset.
1882 * name: name of the new cpuset. Will be strcpy'ed. 1575 * name: name of the new cpuset. Will be strcpy'ed.
@@ -1885,124 +1578,77 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1885 * Must be called with the mutex on the parent inode held 1578 * Must be called with the mutex on the parent inode held
1886 */ 1579 */
1887 1580
1888static long cpuset_create(struct cpuset *parent, const char *name, int mode) 1581static struct cgroup_subsys_state *cpuset_create(
1582 struct cgroup_subsys *ss,
1583 struct cgroup *cont)
1889{ 1584{
1890 struct cpuset *cs; 1585 struct cpuset *cs;
1891 int err; 1586 struct cpuset *parent;
1892 1587
1588 if (!cont->parent) {
1589 /* This is early initialization for the top cgroup */
1590 top_cpuset.mems_generation = cpuset_mems_generation++;
1591 return &top_cpuset.css;
1592 }
1593 parent = cgroup_cs(cont->parent);
1893 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1594 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1894 if (!cs) 1595 if (!cs)
1895 return -ENOMEM; 1596 return ERR_PTR(-ENOMEM);
1896 1597
1897 mutex_lock(&manage_mutex);
1898 cpuset_update_task_memory_state(); 1598 cpuset_update_task_memory_state();
1899 cs->flags = 0; 1599 cs->flags = 0;
1900 if (notify_on_release(parent))
1901 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
1902 if (is_spread_page(parent)) 1600 if (is_spread_page(parent))
1903 set_bit(CS_SPREAD_PAGE, &cs->flags); 1601 set_bit(CS_SPREAD_PAGE, &cs->flags);
1904 if (is_spread_slab(parent)) 1602 if (is_spread_slab(parent))
1905 set_bit(CS_SPREAD_SLAB, &cs->flags); 1603 set_bit(CS_SPREAD_SLAB, &cs->flags);
1604 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1906 cs->cpus_allowed = CPU_MASK_NONE; 1605 cs->cpus_allowed = CPU_MASK_NONE;
1907 cs->mems_allowed = NODE_MASK_NONE; 1606 cs->mems_allowed = NODE_MASK_NONE;
1908 atomic_set(&cs->count, 0);
1909 INIT_LIST_HEAD(&cs->sibling);
1910 INIT_LIST_HEAD(&cs->children);
1911 cs->mems_generation = cpuset_mems_generation++; 1607 cs->mems_generation = cpuset_mems_generation++;
1912 fmeter_init(&cs->fmeter); 1608 fmeter_init(&cs->fmeter);
1913 1609
1914 cs->parent = parent; 1610 cs->parent = parent;
1915
1916 mutex_lock(&callback_mutex);
1917 list_add(&cs->sibling, &cs->parent->children);
1918 number_of_cpusets++; 1611 number_of_cpusets++;
1919 mutex_unlock(&callback_mutex); 1612 return &cs->css;
1920
1921 err = cpuset_create_dir(cs, name, mode);
1922 if (err < 0)
1923 goto err;
1924
1925 /*
1926 * Release manage_mutex before cpuset_populate_dir() because it
1927 * will down() this new directory's i_mutex and if we race with
1928 * another mkdir, we might deadlock.
1929 */
1930 mutex_unlock(&manage_mutex);
1931
1932 err = cpuset_populate_dir(cs->dentry);
1933 /* If err < 0, we have a half-filled directory - oh well ;) */
1934 return 0;
1935err:
1936 list_del(&cs->sibling);
1937 mutex_unlock(&manage_mutex);
1938 kfree(cs);
1939 return err;
1940}
1941
1942static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1943{
1944 struct cpuset *c_parent = dentry->d_parent->d_fsdata;
1945
1946 /* the vfs holds inode->i_mutex already */
1947 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
1948} 1613}
1949 1614
1950/* 1615/*
1951 * Locking note on the strange update_flag() call below: 1616 * Locking note on the strange update_flag() call below:
1952 * 1617 *
1953 * If the cpuset being removed is marked cpu_exclusive, then simulate 1618 * If the cpuset being removed has its flag 'sched_load_balance'
1954 * turning cpu_exclusive off, which will call update_cpu_domains(). 1619 * enabled, then simulate turning sched_load_balance off, which
1955 * The lock_cpu_hotplug() call in update_cpu_domains() must not be 1620 * will call rebuild_sched_domains(). The lock_cpu_hotplug()
1956 * made while holding callback_mutex. Elsewhere the kernel nests 1621 * call in rebuild_sched_domains() must not be made while holding
1957 * callback_mutex inside lock_cpu_hotplug() calls. So the reverse 1622 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1958 * nesting would risk an ABBA deadlock. 1623 * lock_cpu_hotplug() calls. So the reverse nesting would risk an
1624 * ABBA deadlock.
1959 */ 1625 */
1960 1626
1961static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) 1627static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1962{ 1628{
1963 struct cpuset *cs = dentry->d_fsdata; 1629 struct cpuset *cs = cgroup_cs(cont);
1964 struct dentry *d;
1965 struct cpuset *parent;
1966 char *pathbuf = NULL;
1967
1968 /* the vfs holds both inode->i_mutex already */
1969 1630
1970 mutex_lock(&manage_mutex);
1971 cpuset_update_task_memory_state(); 1631 cpuset_update_task_memory_state();
1972 if (atomic_read(&cs->count) > 0) { 1632
1973 mutex_unlock(&manage_mutex); 1633 if (is_sched_load_balance(cs))
1974 return -EBUSY; 1634 update_flag(CS_SCHED_LOAD_BALANCE, cs, "0");
1975 } 1635
1976 if (!list_empty(&cs->children)) {
1977 mutex_unlock(&manage_mutex);
1978 return -EBUSY;
1979 }
1980 if (is_cpu_exclusive(cs)) {
1981 int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
1982 if (retval < 0) {
1983 mutex_unlock(&manage_mutex);
1984 return retval;
1985 }
1986 }
1987 parent = cs->parent;
1988 mutex_lock(&callback_mutex);
1989 set_bit(CS_REMOVED, &cs->flags);
1990 list_del(&cs->sibling); /* delete my sibling from parent->children */
1991 spin_lock(&cs->dentry->d_lock);
1992 d = dget(cs->dentry);
1993 cs->dentry = NULL;
1994 spin_unlock(&d->d_lock);
1995 cpuset_d_remove_dir(d);
1996 dput(d);
1997 number_of_cpusets--; 1636 number_of_cpusets--;
1998 mutex_unlock(&callback_mutex); 1637 kfree(cs);
1999 if (list_empty(&parent->children))
2000 check_for_release(parent, &pathbuf);
2001 mutex_unlock(&manage_mutex);
2002 cpuset_release_agent(pathbuf);
2003 return 0;
2004} 1638}
2005 1639
1640struct cgroup_subsys cpuset_subsys = {
1641 .name = "cpuset",
1642 .create = cpuset_create,
1643 .destroy = cpuset_destroy,
1644 .can_attach = cpuset_can_attach,
1645 .attach = cpuset_attach,
1646 .populate = cpuset_populate,
1647 .post_clone = cpuset_post_clone,
1648 .subsys_id = cpuset_subsys_id,
1649 .early_init = 1,
1650};
1651
2006/* 1652/*
2007 * cpuset_init_early - just enough so that the calls to 1653 * cpuset_init_early - just enough so that the calls to
2008 * cpuset_update_task_memory_state() in early init code 1654 * cpuset_update_task_memory_state() in early init code
@@ -2011,13 +1657,11 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
2011 1657
2012int __init cpuset_init_early(void) 1658int __init cpuset_init_early(void)
2013{ 1659{
2014 struct task_struct *tsk = current; 1660 top_cpuset.mems_generation = cpuset_mems_generation++;
2015
2016 tsk->cpuset = &top_cpuset;
2017 tsk->cpuset->mems_generation = cpuset_mems_generation++;
2018 return 0; 1661 return 0;
2019} 1662}
2020 1663
1664
2021/** 1665/**
2022 * cpuset_init - initialize cpusets at system boot 1666 * cpuset_init - initialize cpusets at system boot
2023 * 1667 *
@@ -2026,39 +1670,21 @@ int __init cpuset_init_early(void)
2026 1670
2027int __init cpuset_init(void) 1671int __init cpuset_init(void)
2028{ 1672{
2029 struct dentry *root; 1673 int err = 0;
2030 int err;
2031 1674
2032 top_cpuset.cpus_allowed = CPU_MASK_ALL; 1675 top_cpuset.cpus_allowed = CPU_MASK_ALL;
2033 top_cpuset.mems_allowed = NODE_MASK_ALL; 1676 top_cpuset.mems_allowed = NODE_MASK_ALL;
2034 1677
2035 fmeter_init(&top_cpuset.fmeter); 1678 fmeter_init(&top_cpuset.fmeter);
2036 top_cpuset.mems_generation = cpuset_mems_generation++; 1679 top_cpuset.mems_generation = cpuset_mems_generation++;
2037 1680 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2038 init_task.cpuset = &top_cpuset;
2039 1681
2040 err = register_filesystem(&cpuset_fs_type); 1682 err = register_filesystem(&cpuset_fs_type);
2041 if (err < 0) 1683 if (err < 0)
2042 goto out; 1684 return err;
2043 cpuset_mount = kern_mount(&cpuset_fs_type); 1685
2044 if (IS_ERR(cpuset_mount)) {
2045 printk(KERN_ERR "cpuset: could not mount!\n");
2046 err = PTR_ERR(cpuset_mount);
2047 cpuset_mount = NULL;
2048 goto out;
2049 }
2050 root = cpuset_mount->mnt_sb->s_root;
2051 root->d_fsdata = &top_cpuset;
2052 inc_nlink(root->d_inode);
2053 top_cpuset.dentry = root;
2054 root->d_inode->i_op = &cpuset_dir_inode_operations;
2055 number_of_cpusets = 1; 1686 number_of_cpusets = 1;
2056 err = cpuset_populate_dir(root); 1687 return 0;
2057 /* memory_pressure_enabled is in root cpuset only */
2058 if (err == 0)
2059 err = cpuset_add_file(root, &cft_memory_pressure_enabled);
2060out:
2061 return err;
2062} 1688}
2063 1689
2064/* 1690/*
@@ -2084,10 +1710,12 @@ out:
2084 1710
2085static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) 1711static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
2086{ 1712{
1713 struct cgroup *cont;
2087 struct cpuset *c; 1714 struct cpuset *c;
2088 1715
2089 /* Each of our child cpusets mems must be online */ 1716 /* Each of our child cpusets mems must be online */
2090 list_for_each_entry(c, &cur->children, sibling) { 1717 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
1718 c = cgroup_cs(cont);
2091 guarantee_online_cpus_mems_in_subtree(c); 1719 guarantee_online_cpus_mems_in_subtree(c);
2092 if (!cpus_empty(c->cpus_allowed)) 1720 if (!cpus_empty(c->cpus_allowed))
2093 guarantee_online_cpus(c, &c->cpus_allowed); 1721 guarantee_online_cpus(c, &c->cpus_allowed);
@@ -2098,8 +1726,9 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
2098 1726
2099/* 1727/*
2100 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track 1728 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
2101 * cpu_online_map and node_online_map. Force the top cpuset to track 1729 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
2102 * whats online after any CPU or memory node hotplug or unplug event. 1730 * track what's online after any CPU or memory node hotplug or unplug
1731 * event.
2103 * 1732 *
2104 * To ensure that we don't remove a CPU or node from the top cpuset 1733 * To ensure that we don't remove a CPU or node from the top cpuset
2105 * that is currently in use by a child cpuset (which would violate 1734 * that is currently in use by a child cpuset (which would violate
@@ -2114,15 +1743,15 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
2114 1743
2115static void common_cpu_mem_hotplug_unplug(void) 1744static void common_cpu_mem_hotplug_unplug(void)
2116{ 1745{
2117 mutex_lock(&manage_mutex); 1746 cgroup_lock();
2118 mutex_lock(&callback_mutex); 1747 mutex_lock(&callback_mutex);
2119 1748
2120 guarantee_online_cpus_mems_in_subtree(&top_cpuset); 1749 guarantee_online_cpus_mems_in_subtree(&top_cpuset);
2121 top_cpuset.cpus_allowed = cpu_online_map; 1750 top_cpuset.cpus_allowed = cpu_online_map;
2122 top_cpuset.mems_allowed = node_online_map; 1751 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2123 1752
2124 mutex_unlock(&callback_mutex); 1753 mutex_unlock(&callback_mutex);
2125 mutex_unlock(&manage_mutex); 1754 cgroup_unlock();
2126} 1755}
2127 1756
2128/* 1757/*
@@ -2135,8 +1764,8 @@ static void common_cpu_mem_hotplug_unplug(void)
2135 * cpu_online_map on each CPU hotplug (cpuhp) event. 1764 * cpu_online_map on each CPU hotplug (cpuhp) event.
2136 */ 1765 */
2137 1766
2138static int cpuset_handle_cpuhp(struct notifier_block *nb, 1767static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
2139 unsigned long phase, void *cpu) 1768 unsigned long phase, void *unused_cpu)
2140{ 1769{
2141 if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) 1770 if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
2142 return NOTIFY_DONE; 1771 return NOTIFY_DONE;
@@ -2147,8 +1776,9 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
2147 1776
2148#ifdef CONFIG_MEMORY_HOTPLUG 1777#ifdef CONFIG_MEMORY_HOTPLUG
2149/* 1778/*
2150 * Keep top_cpuset.mems_allowed tracking node_online_map. 1779 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
2151 * Call this routine anytime after you change node_online_map. 1780 * Call this routine anytime after you change
1781 * node_states[N_HIGH_MEMORY].
2152 * See also the previous routine cpuset_handle_cpuhp(). 1782 * See also the previous routine cpuset_handle_cpuhp().
2153 */ 1783 */
2154 1784
@@ -2167,115 +1797,13 @@ void cpuset_track_online_nodes(void)
2167void __init cpuset_init_smp(void) 1797void __init cpuset_init_smp(void)
2168{ 1798{
2169 top_cpuset.cpus_allowed = cpu_online_map; 1799 top_cpuset.cpus_allowed = cpu_online_map;
2170 top_cpuset.mems_allowed = node_online_map; 1800 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2171 1801
2172 hotcpu_notifier(cpuset_handle_cpuhp, 0); 1802 hotcpu_notifier(cpuset_handle_cpuhp, 0);
2173} 1803}
2174 1804
2175/** 1805/**
2176 * cpuset_fork - attach newly forked task to its parents cpuset.
2177 * @tsk: pointer to task_struct of forking parent process.
2178 *
2179 * Description: A task inherits its parent's cpuset at fork().
2180 *
2181 * A pointer to the shared cpuset was automatically copied in fork.c
2182 * by dup_task_struct(). However, we ignore that copy, since it was
2183 * not made under the protection of task_lock(), so might no longer be
2184 * a valid cpuset pointer. attach_task() might have already changed
2185 * current->cpuset, allowing the previously referenced cpuset to
2186 * be removed and freed. Instead, we task_lock(current) and copy
2187 * its present value of current->cpuset for our freshly forked child.
2188 *
2189 * At the point that cpuset_fork() is called, 'current' is the parent
2190 * task, and the passed argument 'child' points to the child task.
2191 **/
2192
2193void cpuset_fork(struct task_struct *child)
2194{
2195 task_lock(current);
2196 child->cpuset = current->cpuset;
2197 atomic_inc(&child->cpuset->count);
2198 task_unlock(current);
2199}
2200
2201/**
2202 * cpuset_exit - detach cpuset from exiting task
2203 * @tsk: pointer to task_struct of exiting process
2204 *
2205 * Description: Detach cpuset from @tsk and release it.
2206 *
2207 * Note that cpusets marked notify_on_release force every task in
2208 * them to take the global manage_mutex mutex when exiting.
2209 * This could impact scaling on very large systems. Be reluctant to
2210 * use notify_on_release cpusets where very high task exit scaling
2211 * is required on large systems.
2212 *
2213 * Don't even think about dereferencing 'cs' after the cpuset use count
2214 * goes to zero, except inside a critical section guarded by manage_mutex
2215 * or callback_mutex. Otherwise a zero cpuset use count is a license to
2216 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
2217 *
2218 * This routine has to take manage_mutex, not callback_mutex, because
2219 * it is holding that mutex while calling check_for_release(),
2220 * which calls kmalloc(), so can't be called holding callback_mutex().
2221 *
2222 * the_top_cpuset_hack:
2223 *
2224 * Set the exiting tasks cpuset to the root cpuset (top_cpuset).
2225 *
2226 * Don't leave a task unable to allocate memory, as that is an
2227 * accident waiting to happen should someone add a callout in
2228 * do_exit() after the cpuset_exit() call that might allocate.
2229 * If a task tries to allocate memory with an invalid cpuset,
2230 * it will oops in cpuset_update_task_memory_state().
2231 *
2232 * We call cpuset_exit() while the task is still competent to
2233 * handle notify_on_release(), then leave the task attached to
2234 * the root cpuset (top_cpuset) for the remainder of its exit.
2235 *
2236 * To do this properly, we would increment the reference count on
2237 * top_cpuset, and near the very end of the kernel/exit.c do_exit()
2238 * code we would add a second cpuset function call, to drop that
2239 * reference. This would just create an unnecessary hot spot on
2240 * the top_cpuset reference count, to no avail.
2241 *
2242 * Normally, holding a reference to a cpuset without bumping its
2243 * count is unsafe. The cpuset could go away, or someone could
2244 * attach us to a different cpuset, decrementing the count on
2245 * the first cpuset that we never incremented. But in this case,
2246 * top_cpuset isn't going away, and either task has PF_EXITING set,
2247 * which wards off any attach_task() attempts, or task is a failed
2248 * fork, never visible to attach_task.
2249 *
2250 * Another way to do this would be to set the cpuset pointer
2251 * to NULL here, and check in cpuset_update_task_memory_state()
2252 * for a NULL pointer. This hack avoids that NULL check, for no
2253 * cost (other than this way too long comment ;).
2254 **/
2255
2256void cpuset_exit(struct task_struct *tsk)
2257{
2258 struct cpuset *cs;
2259
2260 task_lock(current);
2261 cs = tsk->cpuset;
2262 tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
2263 task_unlock(current);
2264
2265 if (notify_on_release(cs)) {
2266 char *pathbuf = NULL;
2267
2268 mutex_lock(&manage_mutex);
2269 if (atomic_dec_and_test(&cs->count))
2270 check_for_release(cs, &pathbuf);
2271 mutex_unlock(&manage_mutex);
2272 cpuset_release_agent(pathbuf);
2273 } else {
2274 atomic_dec(&cs->count);
2275 }
2276}
2277 1806
2278/**
2279 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 1807 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
2280 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 1808 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
2281 * 1809 *
@@ -2290,10 +1818,23 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
2290 cpumask_t mask; 1818 cpumask_t mask;
2291 1819
2292 mutex_lock(&callback_mutex); 1820 mutex_lock(&callback_mutex);
1821 mask = cpuset_cpus_allowed_locked(tsk);
1822 mutex_unlock(&callback_mutex);
1823
1824 return mask;
1825}
1826
1827/**
1828 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
1829 * Must be called with callback_mutex held.
1830 **/
1831cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk)
1832{
1833 cpumask_t mask;
1834
2293 task_lock(tsk); 1835 task_lock(tsk);
2294 guarantee_online_cpus(tsk->cpuset, &mask); 1836 guarantee_online_cpus(task_cs(tsk), &mask);
2295 task_unlock(tsk); 1837 task_unlock(tsk);
2296 mutex_unlock(&callback_mutex);
2297 1838
2298 return mask; 1839 return mask;
2299} 1840}
@@ -2309,7 +1850,7 @@ void cpuset_init_current_mems_allowed(void)
2309 * 1850 *
2310 * Description: Returns the nodemask_t mems_allowed of the cpuset 1851 * Description: Returns the nodemask_t mems_allowed of the cpuset
2311 * attached to the specified @tsk. Guaranteed to return some non-empty 1852 * attached to the specified @tsk. Guaranteed to return some non-empty
2312 * subset of node_online_map, even if this means going outside the 1853 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
2313 * tasks cpuset. 1854 * tasks cpuset.
2314 **/ 1855 **/
2315 1856
@@ -2319,7 +1860,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2319 1860
2320 mutex_lock(&callback_mutex); 1861 mutex_lock(&callback_mutex);
2321 task_lock(tsk); 1862 task_lock(tsk);
2322 guarantee_online_mems(tsk->cpuset, &mask); 1863 guarantee_online_mems(task_cs(tsk), &mask);
2323 task_unlock(tsk); 1864 task_unlock(tsk);
2324 mutex_unlock(&callback_mutex); 1865 mutex_unlock(&callback_mutex);
2325 1866
@@ -2450,7 +1991,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2450 mutex_lock(&callback_mutex); 1991 mutex_lock(&callback_mutex);
2451 1992
2452 task_lock(current); 1993 task_lock(current);
2453 cs = nearest_exclusive_ancestor(current->cpuset); 1994 cs = nearest_exclusive_ancestor(task_cs(current));
2454 task_unlock(current); 1995 task_unlock(current);
2455 1996
2456 allowed = node_isset(node, cs->mems_allowed); 1997 allowed = node_isset(node, cs->mems_allowed);
@@ -2491,12 +2032,12 @@ int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
2491 node = zone_to_nid(z); 2032 node = zone_to_nid(z);
2492 if (node_isset(node, current->mems_allowed)) 2033 if (node_isset(node, current->mems_allowed))
2493 return 1; 2034 return 1;
2494 /* 2035 /*
2495 * Allow tasks that have access to memory reserves because they have 2036 * Allow tasks that have access to memory reserves because they have
2496 * been OOM killed to get memory anywhere. 2037 * been OOM killed to get memory anywhere.
2497 */ 2038 */
2498 if (unlikely(test_thread_flag(TIF_MEMDIE))) 2039 if (unlikely(test_thread_flag(TIF_MEMDIE)))
2499 return 1; 2040 return 1;
2500 return 0; 2041 return 0;
2501} 2042}
2502 2043
@@ -2566,41 +2107,20 @@ int cpuset_mem_spread_node(void)
2566EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); 2107EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2567 2108
2568/** 2109/**
2569 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? 2110 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
2570 * @p: pointer to task_struct of some other task. 2111 * @tsk1: pointer to task_struct of some task.
2571 * 2112 * @tsk2: pointer to task_struct of some other task.
2572 * Description: Return true if the nearest mem_exclusive ancestor 2113 *
2573 * cpusets of tasks @p and current overlap. Used by oom killer to 2114 * Description: Return true if @tsk1's mems_allowed intersects the
2574 * determine if task @p's memory usage might impact the memory 2115 * mems_allowed of @tsk2. Used by the OOM killer to determine if
2575 * available to the current task. 2116 * one of the task's memory usage might impact the memory available
2576 * 2117 * to the other.
2577 * Call while holding callback_mutex.
2578 **/ 2118 **/
2579 2119
2580int cpuset_excl_nodes_overlap(const struct task_struct *p) 2120int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2121 const struct task_struct *tsk2)
2581{ 2122{
2582 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ 2123 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2583 int overlap = 1; /* do cpusets overlap? */
2584
2585 task_lock(current);
2586 if (current->flags & PF_EXITING) {
2587 task_unlock(current);
2588 goto done;
2589 }
2590 cs1 = nearest_exclusive_ancestor(current->cpuset);
2591 task_unlock(current);
2592
2593 task_lock((struct task_struct *)p);
2594 if (p->flags & PF_EXITING) {
2595 task_unlock((struct task_struct *)p);
2596 goto done;
2597 }
2598 cs2 = nearest_exclusive_ancestor(p->cpuset);
2599 task_unlock((struct task_struct *)p);
2600
2601 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
2602done:
2603 return overlap;
2604} 2124}
2605 2125
2606/* 2126/*
@@ -2631,14 +2151,12 @@ int cpuset_memory_pressure_enabled __read_mostly;
2631 2151
2632void __cpuset_memory_pressure_bump(void) 2152void __cpuset_memory_pressure_bump(void)
2633{ 2153{
2634 struct cpuset *cs;
2635
2636 task_lock(current); 2154 task_lock(current);
2637 cs = current->cpuset; 2155 fmeter_markevent(&task_cs(current)->fmeter);
2638 fmeter_markevent(&cs->fmeter);
2639 task_unlock(current); 2156 task_unlock(current);
2640} 2157}
2641 2158
2159#ifdef CONFIG_PROC_PID_CPUSET
2642/* 2160/*
2643 * proc_cpuset_show() 2161 * proc_cpuset_show()
2644 * - Print tasks cpuset path into seq_file. 2162 * - Print tasks cpuset path into seq_file.
@@ -2650,11 +2168,12 @@ void __cpuset_memory_pressure_bump(void)
2650 * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks 2168 * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
2651 * cpuset to top_cpuset. 2169 * cpuset to top_cpuset.
2652 */ 2170 */
2653static int proc_cpuset_show(struct seq_file *m, void *v) 2171static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2654{ 2172{
2655 struct pid *pid; 2173 struct pid *pid;
2656 struct task_struct *tsk; 2174 struct task_struct *tsk;
2657 char *buf; 2175 char *buf;
2176 struct cgroup_subsys_state *css;
2658 int retval; 2177 int retval;
2659 2178
2660 retval = -ENOMEM; 2179 retval = -ENOMEM;
@@ -2669,15 +2188,15 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
2669 goto out_free; 2188 goto out_free;
2670 2189
2671 retval = -EINVAL; 2190 retval = -EINVAL;
2672 mutex_lock(&manage_mutex); 2191 cgroup_lock();
2673 2192 css = task_subsys_state(tsk, cpuset_subsys_id);
2674 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); 2193 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2675 if (retval < 0) 2194 if (retval < 0)
2676 goto out_unlock; 2195 goto out_unlock;
2677 seq_puts(m, buf); 2196 seq_puts(m, buf);
2678 seq_putc(m, '\n'); 2197 seq_putc(m, '\n');
2679out_unlock: 2198out_unlock:
2680 mutex_unlock(&manage_mutex); 2199 cgroup_unlock();
2681 put_task_struct(tsk); 2200 put_task_struct(tsk);
2682out_free: 2201out_free:
2683 kfree(buf); 2202 kfree(buf);
@@ -2697,6 +2216,7 @@ const struct file_operations proc_cpuset_operations = {
2697 .llseek = seq_lseek, 2216 .llseek = seq_lseek,
2698 .release = single_release, 2217 .release = single_release,
2699}; 2218};
2219#endif /* CONFIG_PROC_PID_CPUSET */
2700 2220
2701/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ 2221/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
2702char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) 2222char *cpuset_task_status_allowed(struct task_struct *task, char *buffer)
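With attach_task() and the cpuset-specific tasks-file machinery removed above, task placement now goes through the generic cgroup files, gated by cpuset_can_attach(). A minimal userspace sketch of that flow follows; it is illustrative only, and the /dev/cpuset mount point and "child" directory are assumptions, not part of the patch.

/*
 * Illustrative only -- not part of the patch.  cpuset_can_attach() above
 * refuses the move unless the target cpuset has both cpus and mems, so
 * populate those first, then write the pid to the cgroup "tasks" file.
 */
#include <stdio.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fputs(val, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	char pid[16];

	if (write_str("/dev/cpuset/child/cpus", "0") ||	/* cpu 0 only */
	    write_str("/dev/cpuset/child/mems", "0"))	/* node 0 only */
		return 1;
	snprintf(pid, sizeof(pid), "%d", getpid());
	return write_str("/dev/cpuset/child/tasks", pid) ? 1 : 0;
}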
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 81e697829633..10e43fd8b721 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -115,11 +115,17 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
115 tmp += timespec_to_ns(&ts); 115 tmp += timespec_to_ns(&ts);
116 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; 116 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
117 117
118 tmp = (s64)d->cpu_scaled_run_real_total;
119 cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
120 tmp += timespec_to_ns(&ts);
121 d->cpu_scaled_run_real_total =
122 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
123
118 /* 124 /*
119 * No locking available for sched_info (and too expensive to add one) 125 * No locking available for sched_info (and too expensive to add one)
120 * Mitigate by taking snapshot of values 126 * Mitigate by taking snapshot of values
121 */ 127 */
122 t1 = tsk->sched_info.pcnt; 128 t1 = tsk->sched_info.pcount;
123 t2 = tsk->sched_info.run_delay; 129 t2 = tsk->sched_info.run_delay;
124 t3 = tsk->sched_info.cpu_time; 130 t3 = tsk->sched_info.cpu_time;
125 131
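The cpu_scaled_run_real_total accumulation added above reuses delayacct's existing guard: add the new sample to the running total and reset it to zero if the signed value wrapped. A stand-alone sketch of that pattern, with illustrative names:

/*
 * Illustrative only -- the accumulate-and-guard pattern used by
 * __delayacct_add_tsk() above: add the new sample, and fall back to 0
 * if the signed total wrapped rather than report a garbage value.
 */
#include <stdint.h>

static uint64_t accumulate_ns(uint64_t total, uint64_t sample_ns)
{
	uint64_t tmp = total + sample_ns;	/* may wrap */

	return ((int64_t)tmp < (int64_t)total) ? 0 : tmp;
}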
diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c
deleted file mode 100644
index 0d98827887a7..000000000000
--- a/kernel/die_notifier.c
+++ /dev/null
@@ -1,38 +0,0 @@
1
2#include <linux/module.h>
3#include <linux/notifier.h>
4#include <linux/vmalloc.h>
5#include <linux/kdebug.h>
6
7
8static ATOMIC_NOTIFIER_HEAD(die_chain);
9
10int notify_die(enum die_val val, const char *str,
11 struct pt_regs *regs, long err, int trap, int sig)
12{
13 struct die_args args = {
14 .regs = regs,
15 .str = str,
16 .err = err,
17 .trapnr = trap,
18 .signr = sig,
19
20 };
21
22 return atomic_notifier_call_chain(&die_chain, val, &args);
23}
24
25int register_die_notifier(struct notifier_block *nb)
26{
27 vmalloc_sync_all();
28 return atomic_notifier_chain_register(&die_chain, nb);
29}
30EXPORT_SYMBOL_GPL(register_die_notifier);
31
32int unregister_die_notifier(struct notifier_block *nb)
33{
34 return atomic_notifier_chain_unregister(&die_chain, nb);
35}
36EXPORT_SYMBOL_GPL(unregister_die_notifier);
37
38
diff --git a/kernel/dma.c b/kernel/dma.c
index 937b13ca33ba..6a82bb716dac 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -20,7 +20,7 @@
20#include <asm/dma.h> 20#include <asm/dma.h>
21#include <asm/system.h> 21#include <asm/system.h>
22 22
23 23
24 24
25/* A note on resource allocation: 25/* A note on resource allocation:
26 * 26 *
@@ -95,7 +95,7 @@ void free_dma(unsigned int dmanr)
95 if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) { 95 if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
96 printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr); 96 printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr);
97 return; 97 return;
98 } 98 }
99 99
100} /* free_dma */ 100} /* free_dma */
101 101
@@ -121,8 +121,8 @@ static int proc_dma_show(struct seq_file *m, void *v)
121 121
122 for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) { 122 for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
123 if (dma_chan_busy[i].lock) { 123 if (dma_chan_busy[i].lock) {
124 seq_printf(m, "%2d: %s\n", i, 124 seq_printf(m, "%2d: %s\n", i,
125 dma_chan_busy[i].device_id); 125 dma_chan_busy[i].device_id);
126 } 126 }
127 } 127 }
128 return 0; 128 return 0;
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 3c2eaea66b1e..a9e6bad9f706 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -57,7 +57,7 @@ lookup_exec_domain(u_long personality)
57{ 57{
58 struct exec_domain * ep; 58 struct exec_domain * ep;
59 u_long pers = personality(personality); 59 u_long pers = personality(personality);
60 60
61 read_lock(&exec_domains_lock); 61 read_lock(&exec_domains_lock);
62 for (ep = exec_domains; ep; ep = ep->next) { 62 for (ep = exec_domains; ep; ep = ep->next) {
63 if (pers >= ep->pers_low && pers <= ep->pers_high) 63 if (pers >= ep->pers_low && pers <= ep->pers_high)
diff --git a/kernel/exit.c b/kernel/exit.c
index 993369ee94d1..f1aec27f1df0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,7 +31,7 @@
31#include <linux/taskstats_kern.h> 31#include <linux/taskstats_kern.h>
32#include <linux/delayacct.h> 32#include <linux/delayacct.h>
33#include <linux/freezer.h> 33#include <linux/freezer.h>
34#include <linux/cpuset.h> 34#include <linux/cgroup.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/signal.h> 36#include <linux/signal.h>
37#include <linux/posix-timers.h> 37#include <linux/posix-timers.h>
@@ -44,7 +44,6 @@
44#include <linux/resource.h> 44#include <linux/resource.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/task_io_accounting_ops.h> 46#include <linux/task_io_accounting_ops.h>
47#include <linux/freezer.h>
48 47
49#include <asm/uaccess.h> 48#include <asm/uaccess.h>
50#include <asm/unistd.h> 49#include <asm/unistd.h>
@@ -93,10 +92,9 @@ static void __exit_signal(struct task_struct *tsk)
93 * If there is any task waiting for the group exit 92 * If there is any task waiting for the group exit
94 * then notify it: 93 * then notify it:
95 */ 94 */
96 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) { 95 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count)
97 wake_up_process(sig->group_exit_task); 96 wake_up_process(sig->group_exit_task);
98 sig->group_exit_task = NULL; 97
99 }
100 if (tsk == sig->curr_target) 98 if (tsk == sig->curr_target)
101 sig->curr_target = next_thread(tsk); 99 sig->curr_target = next_thread(tsk);
102 /* 100 /*
@@ -111,6 +109,7 @@ static void __exit_signal(struct task_struct *tsk)
111 */ 109 */
112 sig->utime = cputime_add(sig->utime, tsk->utime); 110 sig->utime = cputime_add(sig->utime, tsk->utime);
113 sig->stime = cputime_add(sig->stime, tsk->stime); 111 sig->stime = cputime_add(sig->stime, tsk->stime);
112 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
114 sig->min_flt += tsk->min_flt; 113 sig->min_flt += tsk->min_flt;
115 sig->maj_flt += tsk->maj_flt; 114 sig->maj_flt += tsk->maj_flt;
116 sig->nvcsw += tsk->nvcsw; 115 sig->nvcsw += tsk->nvcsw;
@@ -149,6 +148,7 @@ void release_task(struct task_struct * p)
149 int zap_leader; 148 int zap_leader;
150repeat: 149repeat:
151 atomic_dec(&p->user->processes); 150 atomic_dec(&p->user->processes);
151 proc_flush_task(p);
152 write_lock_irq(&tasklist_lock); 152 write_lock_irq(&tasklist_lock);
153 ptrace_unlink(p); 153 ptrace_unlink(p);
154 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 154 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
@@ -176,7 +176,6 @@ repeat:
176 } 176 }
177 177
178 write_unlock_irq(&tasklist_lock); 178 write_unlock_irq(&tasklist_lock);
179 proc_flush_task(p);
180 release_thread(p); 179 release_thread(p);
181 call_rcu(&p->rcu, delayed_put_task_struct); 180 call_rcu(&p->rcu, delayed_put_task_struct);
182 181
@@ -222,7 +221,7 @@ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignor
222 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 221 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
223 if (p == ignored_task 222 if (p == ignored_task
224 || p->exit_state 223 || p->exit_state
225 || is_init(p->real_parent)) 224 || is_global_init(p->real_parent))
226 continue; 225 continue;
227 if (task_pgrp(p->real_parent) != pgrp && 226 if (task_pgrp(p->real_parent) != pgrp &&
228 task_session(p->real_parent) == task_session(p)) { 227 task_session(p->real_parent) == task_session(p)) {
@@ -300,14 +299,14 @@ void __set_special_pids(pid_t session, pid_t pgrp)
300{ 299{
301 struct task_struct *curr = current->group_leader; 300 struct task_struct *curr = current->group_leader;
302 301
303 if (process_session(curr) != session) { 302 if (task_session_nr(curr) != session) {
304 detach_pid(curr, PIDTYPE_SID); 303 detach_pid(curr, PIDTYPE_SID);
305 set_signal_session(curr->signal, session); 304 set_task_session(curr, session);
306 attach_pid(curr, PIDTYPE_SID, find_pid(session)); 305 attach_pid(curr, PIDTYPE_SID, find_pid(session));
307 } 306 }
308 if (process_group(curr) != pgrp) { 307 if (task_pgrp_nr(curr) != pgrp) {
309 detach_pid(curr, PIDTYPE_PGID); 308 detach_pid(curr, PIDTYPE_PGID);
310 curr->signal->pgrp = pgrp; 309 set_task_pgrp(curr, pgrp);
311 attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp)); 310 attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp));
312 } 311 }
313} 312}
@@ -401,11 +400,12 @@ void daemonize(const char *name, ...)
401 current->fs = fs; 400 current->fs = fs;
402 atomic_inc(&fs->count); 401 atomic_inc(&fs->count);
403 402
404 exit_task_namespaces(current); 403 if (current->nsproxy != init_task.nsproxy) {
405 current->nsproxy = init_task.nsproxy; 404 get_nsproxy(init_task.nsproxy);
406 get_task_namespaces(current); 405 switch_task_namespaces(current, init_task.nsproxy);
406 }
407 407
408 exit_files(current); 408 exit_files(current);
409 current->files = init_task.files; 409 current->files = init_task.files;
410 atomic_inc(&current->files->count); 410 atomic_inc(&current->files->count);
411 411
@@ -493,7 +493,7 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
493} 493}
494EXPORT_SYMBOL(reset_files_struct); 494EXPORT_SYMBOL(reset_files_struct);
495 495
496static inline void __exit_files(struct task_struct *tsk) 496static void __exit_files(struct task_struct *tsk)
497{ 497{
498 struct files_struct * files = tsk->files; 498 struct files_struct * files = tsk->files;
499 499
@@ -510,7 +510,7 @@ void exit_files(struct task_struct *tsk)
510 __exit_files(tsk); 510 __exit_files(tsk);
511} 511}
512 512
513static inline void __put_fs_struct(struct fs_struct *fs) 513static void __put_fs_struct(struct fs_struct *fs)
514{ 514{
515 /* No need to hold fs->lock if we are killing it */ 515 /* No need to hold fs->lock if we are killing it */
516 if (atomic_dec_and_test(&fs->count)) { 516 if (atomic_dec_and_test(&fs->count)) {
@@ -531,7 +531,7 @@ void put_fs_struct(struct fs_struct *fs)
531 __put_fs_struct(fs); 531 __put_fs_struct(fs);
532} 532}
533 533
534static inline void __exit_fs(struct task_struct *tsk) 534static void __exit_fs(struct task_struct *tsk)
535{ 535{
536 struct fs_struct * fs = tsk->fs; 536 struct fs_struct * fs = tsk->fs;
537 537
@@ -592,17 +592,6 @@ static void exit_mm(struct task_struct * tsk)
592 mmput(mm); 592 mmput(mm);
593} 593}
594 594
595static inline void
596choose_new_parent(struct task_struct *p, struct task_struct *reaper)
597{
598 /*
599 * Make sure we're not reparenting to ourselves and that
600 * the parent is not a zombie.
601 */
602 BUG_ON(p == reaper || reaper->exit_state);
603 p->real_parent = reaper;
604}
605
606static void 595static void
607reparent_thread(struct task_struct *p, struct task_struct *father, int traced) 596reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
608{ 597{
@@ -677,19 +666,22 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
677 * the child reaper process (ie "init") in our pid 666 * the child reaper process (ie "init") in our pid
678 * space. 667 * space.
679 */ 668 */
680static void 669static void forget_original_parent(struct task_struct *father)
681forget_original_parent(struct task_struct *father, struct list_head *to_release)
682{ 670{
683 struct task_struct *p, *reaper = father; 671 struct task_struct *p, *n, *reaper = father;
684 struct list_head *_p, *_n; 672 struct list_head ptrace_dead;
673
674 INIT_LIST_HEAD(&ptrace_dead);
675
676 write_lock_irq(&tasklist_lock);
685 677
686 do { 678 do {
687 reaper = next_thread(reaper); 679 reaper = next_thread(reaper);
688 if (reaper == father) { 680 if (reaper == father) {
689 reaper = child_reaper(father); 681 reaper = task_child_reaper(father);
690 break; 682 break;
691 } 683 }
692 } while (reaper->exit_state); 684 } while (reaper->flags & PF_EXITING);
693 685
694 /* 686 /*
695 * There are only two places where our children can be: 687 * There are only two places where our children can be:
@@ -699,9 +691,8 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
699 * 691 *
700 * Search them and reparent children. 692 * Search them and reparent children.
701 */ 693 */
702 list_for_each_safe(_p, _n, &father->children) { 694 list_for_each_entry_safe(p, n, &father->children, sibling) {
703 int ptrace; 695 int ptrace;
704 p = list_entry(_p, struct task_struct, sibling);
705 696
706 ptrace = p->ptrace; 697 ptrace = p->ptrace;
707 698
@@ -710,7 +701,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
710 701
711 if (father == p->real_parent) { 702 if (father == p->real_parent) {
712 /* reparent with a reaper, real father it's us */ 703 /* reparent with a reaper, real father it's us */
713 choose_new_parent(p, reaper); 704 p->real_parent = reaper;
714 reparent_thread(p, father, 0); 705 reparent_thread(p, father, 0);
715 } else { 706 } else {
716 /* reparent ptraced task to its real parent */ 707 /* reparent ptraced task to its real parent */
@@ -727,13 +718,23 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
727 * while it was being traced by us, to be able to see it in wait4. 718 * while it was being traced by us, to be able to see it in wait4.
728 */ 719 */
729 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) 720 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
730 list_add(&p->ptrace_list, to_release); 721 list_add(&p->ptrace_list, &ptrace_dead);
731 } 722 }
732 list_for_each_safe(_p, _n, &father->ptrace_children) { 723
733 p = list_entry(_p, struct task_struct, ptrace_list); 724 list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
734 choose_new_parent(p, reaper); 725 p->real_parent = reaper;
735 reparent_thread(p, father, 1); 726 reparent_thread(p, father, 1);
736 } 727 }
728
729 write_unlock_irq(&tasklist_lock);
730 BUG_ON(!list_empty(&father->children));
731 BUG_ON(!list_empty(&father->ptrace_children));
732
733 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
734 list_del_init(&p->ptrace_list);
735 release_task(p);
736 }
737
737} 738}
738 739
739/* 740/*
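
The rewrite above switches the open-coded list_for_each() + list_entry() walks to list_for_each_entry_safe(), which is what makes it legal for reparent_thread() and the ptrace_dead handling to unlink entries while the loop is still running. A self-contained sketch of the same idiom (structure and function names are illustrative):

/* Sketch: deleting selected entries while walking a list requires the
 * _safe variant, because the loop cursor itself may be unlinked. */
#include <linux/list.h>
#include <linux/slab.h>

struct item {
	int			key;
	struct list_head	node;
};

static void drop_matching(struct list_head *head, int key)
{
	struct item *it, *next;

	list_for_each_entry_safe(it, next, head, node) {
		if (it->key == key) {
			list_del(&it->node);	/* safe: 'next' was saved */
			kfree(it);
		}
	}
}
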
@@ -744,7 +745,6 @@ static void exit_notify(struct task_struct *tsk)
744{ 745{
745 int state; 746 int state;
746 struct task_struct *t; 747 struct task_struct *t;
747 struct list_head ptrace_dead, *_p, *_n;
748 struct pid *pgrp; 748 struct pid *pgrp;
749 749
750 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) 750 if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
@@ -758,17 +758,13 @@ static void exit_notify(struct task_struct *tsk)
758 * Now we'll wake all the threads in the group just to make 758 * Now we'll wake all the threads in the group just to make
759 * sure someone gets all the pending signals. 759 * sure someone gets all the pending signals.
760 */ 760 */
761 read_lock(&tasklist_lock);
762 spin_lock_irq(&tsk->sighand->siglock); 761 spin_lock_irq(&tsk->sighand->siglock);
763 for (t = next_thread(tsk); t != tsk; t = next_thread(t)) 762 for (t = next_thread(tsk); t != tsk; t = next_thread(t))
764 if (!signal_pending(t) && !(t->flags & PF_EXITING)) 763 if (!signal_pending(t) && !(t->flags & PF_EXITING))
765 recalc_sigpending_and_wake(t); 764 recalc_sigpending_and_wake(t);
766 spin_unlock_irq(&tsk->sighand->siglock); 765 spin_unlock_irq(&tsk->sighand->siglock);
767 read_unlock(&tasklist_lock);
768 } 766 }
769 767
770 write_lock_irq(&tasklist_lock);
771
772 /* 768 /*
773 * This does two things: 769 * This does two things:
774 * 770 *
@@ -777,12 +773,10 @@ static void exit_notify(struct task_struct *tsk)
777 * as a result of our exiting, and if they have any stopped 773 * as a result of our exiting, and if they have any stopped
778 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 774 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
779 */ 775 */
776 forget_original_parent(tsk);
777 exit_task_namespaces(tsk);
780 778
781 INIT_LIST_HEAD(&ptrace_dead); 779 write_lock_irq(&tasklist_lock);
782 forget_original_parent(tsk, &ptrace_dead);
783 BUG_ON(!list_empty(&tsk->children));
784 BUG_ON(!list_empty(&tsk->ptrace_children));
785
786 /* 780 /*
787 * Check to see if any process groups have become orphaned 781 * Check to see if any process groups have become orphaned
788 * as a result of our exiting, and if they have any stopped 782 * as a result of our exiting, and if they have any stopped
@@ -792,9 +786,8 @@ static void exit_notify(struct task_struct *tsk)
792 * and we were the only connection outside, so our pgrp 786 * and we were the only connection outside, so our pgrp
793 * is about to become orphaned. 787 * is about to become orphaned.
794 */ 788 */
795
796 t = tsk->real_parent; 789 t = tsk->real_parent;
797 790
798 pgrp = task_pgrp(tsk); 791 pgrp = task_pgrp(tsk);
799 if ((task_pgrp(t) != pgrp) && 792 if ((task_pgrp(t) != pgrp) &&
800 (task_session(t) == task_session(tsk)) && 793 (task_session(t) == task_session(tsk)) &&
@@ -807,7 +800,7 @@ static void exit_notify(struct task_struct *tsk)
807 /* Let father know we died 800 /* Let father know we died
808 * 801 *
809 * Thread signals are configurable, but you aren't going to use 802 * Thread signals are configurable, but you aren't going to use
810 * that to send signals to arbitrary processes. 803
811 * That stops right now. 804 * That stops right now.
812 * 805 *
813 * If the parent exec id doesn't match the exec id we saved 806 * If the parent exec id doesn't match the exec id we saved
@@ -841,13 +834,12 @@ static void exit_notify(struct task_struct *tsk)
841 state = EXIT_DEAD; 834 state = EXIT_DEAD;
842 tsk->exit_state = state; 835 tsk->exit_state = state;
843 836
844 write_unlock_irq(&tasklist_lock); 837 if (thread_group_leader(tsk) &&
838 tsk->signal->notify_count < 0 &&
839 tsk->signal->group_exit_task)
840 wake_up_process(tsk->signal->group_exit_task);
845 841
846 list_for_each_safe(_p, _n, &ptrace_dead) { 842 write_unlock_irq(&tasklist_lock);
847 list_del_init(_p);
848 t = list_entry(_p, struct task_struct, ptrace_list);
849 release_task(t);
850 }
851 843
852 /* If the process is dead, release it - nobody will wait for it */ 844 /* If the process is dead, release it - nobody will wait for it */
853 if (state == EXIT_DEAD) 845 if (state == EXIT_DEAD)
@@ -882,6 +874,39 @@ static void check_stack_usage(void)
882static inline void check_stack_usage(void) {} 874static inline void check_stack_usage(void) {}
883#endif 875#endif
884 876
877static inline void exit_child_reaper(struct task_struct *tsk)
878{
879 if (likely(tsk->group_leader != task_child_reaper(tsk)))
880 return;
881
882 if (tsk->nsproxy->pid_ns == &init_pid_ns)
883 panic("Attempted to kill init!");
884
885 /*
886 * @tsk is the last thread in the 'cgroup-init' and is exiting.
887 * Terminate all remaining processes in the namespace and reap them
888 * before exiting @tsk.
889 *
890 * Note that @tsk (last thread of cgroup-init) may not necessarily
891 * be the child-reaper (i.e main thread of cgroup-init) of the
892 * namespace i.e the child_reaper may have already exited.
893 *
894 * Even after a child_reaper exits, we let it inherit orphaned children,
895 * because, pid_ns->child_reaper remains valid as long as there is
896 * at least one living sub-thread in the cgroup init.
897
898 * This living sub-thread of the cgroup-init will be notified when
899 * a child inherited by the 'child-reaper' exits (do_notify_parent()
900 * uses __group_send_sig_info()). Further, when reaping child processes,
901 * do_wait() iterates over children of all living sub threads.
902
903 * i.e even though 'child_reaper' thread is listed as the parent of the
904 * orphaned children, any living sub-thread in the cgroup-init can
905 * perform the role of the child_reaper.
906 */
907 zap_pid_ns_processes(tsk->nsproxy->pid_ns);
908}
909
885fastcall NORET_TYPE void do_exit(long code) 910fastcall NORET_TYPE void do_exit(long code)
886{ 911{
887 struct task_struct *tsk = current; 912 struct task_struct *tsk = current;
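
exit_child_reaper() only fires when the exiting thread group is the "init" of a pid namespace; in that case zap_pid_ns_processes() kills whatever is left in the namespace before the reaper itself goes away. The userspace-visible side of this is clone(2) with CLONE_NEWPID — a hedged sketch, assuming CLONE_NEWPID is exposed by <sched.h> and the caller has CAP_SYS_ADMIN; stack setup assumes a downward-growing stack:

/* Illustrative only: a child created with CLONE_NEWPID becomes pid 1 of
 * the new namespace; when it exits, the namespace is torn down. */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static char child_stack[64 * 1024];

static int ns_init(void *arg)
{
	printf("in-namespace pid: %d\n", getpid());	/* prints 1 */
	return 0;
}

int main(void)
{
	pid_t pid = clone(ns_init, child_stack + sizeof(child_stack),
			  CLONE_NEWPID | SIGCHLD, NULL);

	if (pid < 0) {
		perror("clone");	/* needs CAP_SYS_ADMIN */
		return EXIT_FAILURE;
	}
	printf("pid as seen by the parent: %d\n", pid);
	waitpid(pid, NULL, 0);
	return EXIT_SUCCESS;
}
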
@@ -895,13 +920,6 @@ fastcall NORET_TYPE void do_exit(long code)
895 panic("Aiee, killing interrupt handler!"); 920 panic("Aiee, killing interrupt handler!");
896 if (unlikely(!tsk->pid)) 921 if (unlikely(!tsk->pid))
897 panic("Attempted to kill the idle task!"); 922 panic("Attempted to kill the idle task!");
898 if (unlikely(tsk == child_reaper(tsk))) {
899 if (tsk->nsproxy->pid_ns != &init_pid_ns)
900 tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper;
901 else
902 panic("Attempted to kill init!");
903 }
904
905 923
906 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 924 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
907 current->ptrace_message = code; 925 current->ptrace_message = code;
@@ -931,17 +949,17 @@ fastcall NORET_TYPE void do_exit(long code)
931 schedule(); 949 schedule();
932 } 950 }
933 951
952 tsk->flags |= PF_EXITING;
934 /* 953 /*
935 * tsk->flags are checked in the futex code to protect against 954 * tsk->flags are checked in the futex code to protect against
936 * an exiting task cleaning up the robust pi futexes. 955 * an exiting task cleaning up the robust pi futexes.
937 */ 956 */
938 spin_lock_irq(&tsk->pi_lock); 957 smp_mb();
939 tsk->flags |= PF_EXITING; 958 spin_unlock_wait(&tsk->pi_lock);
940 spin_unlock_irq(&tsk->pi_lock);
941 959
942 if (unlikely(in_atomic())) 960 if (unlikely(in_atomic()))
943 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 961 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
944 current->comm, current->pid, 962 current->comm, task_pid_nr(current),
945 preempt_count()); 963 preempt_count());
946 964
947 acct_update_integrals(tsk); 965 acct_update_integrals(tsk);
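
Instead of taking pi_lock around setting PF_EXITING, the exit path now publishes the flag first, issues a full barrier, and then waits with spin_unlock_wait() for any pi_lock section that might have sampled the old value. A sketch of that handshake in isolation (structure, field, and function names are illustrative; the lock is assumed to be initialized elsewhere with spin_lock_init()):

/* Sketch of the publish-then-drain handshake used above: the writer
 * sets a flag and waits out any critical section that may have missed
 * it, instead of taking the lock itself. */
#include <linux/spinlock.h>

struct victim {
	unsigned long	flags;		/* PF_EXITING-style flag */
	spinlock_t	lock;		/* pi_lock in the real code */
};

#define VICTIM_EXITING	0x1UL

/* exit side */
static void victim_mark_exiting(struct victim *v)
{
	v->flags |= VICTIM_EXITING;
	smp_mb();			/* flag visible before sampling the lock */
	spin_unlock_wait(&v->lock);	/* drain current critical sections */
}

/* attach side, e.g. a waiter latching onto the owner */
static int victim_try_attach(struct victim *v)
{
	int ret = 0;

	spin_lock(&v->lock);
	if (v->flags & VICTIM_EXITING)
		ret = -EAGAIN;		/* too late, owner is going away */
	/* ... otherwise attach state protected by v->lock ... */
	spin_unlock(&v->lock);
	return ret;
}
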
@@ -951,16 +969,19 @@ fastcall NORET_TYPE void do_exit(long code)
951 } 969 }
952 group_dead = atomic_dec_and_test(&tsk->signal->live); 970 group_dead = atomic_dec_and_test(&tsk->signal->live);
953 if (group_dead) { 971 if (group_dead) {
972 exit_child_reaper(tsk);
954 hrtimer_cancel(&tsk->signal->real_timer); 973 hrtimer_cancel(&tsk->signal->real_timer);
955 exit_itimers(tsk->signal); 974 exit_itimers(tsk->signal);
956 } 975 }
957 acct_collect(code, group_dead); 976 acct_collect(code, group_dead);
977#ifdef CONFIG_FUTEX
958 if (unlikely(tsk->robust_list)) 978 if (unlikely(tsk->robust_list))
959 exit_robust_list(tsk); 979 exit_robust_list(tsk);
960#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT) 980#ifdef CONFIG_COMPAT
961 if (unlikely(tsk->compat_robust_list)) 981 if (unlikely(tsk->compat_robust_list))
962 compat_exit_robust_list(tsk); 982 compat_exit_robust_list(tsk);
963#endif 983#endif
984#endif
964 if (group_dead) 985 if (group_dead)
965 tty_audit_exit(); 986 tty_audit_exit();
966 if (unlikely(tsk->audit_context)) 987 if (unlikely(tsk->audit_context))
@@ -978,7 +999,7 @@ fastcall NORET_TYPE void do_exit(long code)
978 __exit_fs(tsk); 999 __exit_fs(tsk);
979 check_stack_usage(); 1000 check_stack_usage();
980 exit_thread(); 1001 exit_thread();
981 cpuset_exit(tsk); 1002 cgroup_exit(tsk, 1);
982 exit_keys(tsk); 1003 exit_keys(tsk);
983 1004
984 if (group_dead && tsk->signal->leader) 1005 if (group_dead && tsk->signal->leader)
@@ -989,12 +1010,12 @@ fastcall NORET_TYPE void do_exit(long code)
989 module_put(tsk->binfmt->module); 1010 module_put(tsk->binfmt->module);
990 1011
991 proc_exit_connector(tsk); 1012 proc_exit_connector(tsk);
992 exit_task_namespaces(tsk);
993 exit_notify(tsk); 1013 exit_notify(tsk);
994#ifdef CONFIG_NUMA 1014#ifdef CONFIG_NUMA
995 mpol_free(tsk->mempolicy); 1015 mpol_free(tsk->mempolicy);
996 tsk->mempolicy = NULL; 1016 tsk->mempolicy = NULL;
997#endif 1017#endif
1018#ifdef CONFIG_FUTEX
998 /* 1019 /*
999 * This must happen late, after the PID is not 1020 * This must happen late, after the PID is not
1000 * hashed anymore: 1021 * hashed anymore:
@@ -1003,6 +1024,7 @@ fastcall NORET_TYPE void do_exit(long code)
1003 exit_pi_state_list(tsk); 1024 exit_pi_state_list(tsk);
1004 if (unlikely(current->pi_state_cache)) 1025 if (unlikely(current->pi_state_cache))
1005 kfree(current->pi_state_cache); 1026 kfree(current->pi_state_cache);
1027#endif
1006 /* 1028 /*
1007 * Make sure we are holding no locks: 1029 * Make sure we are holding no locks:
1008 */ 1030 */
@@ -1090,15 +1112,17 @@ asmlinkage void sys_exit_group(int error_code)
1090static int eligible_child(pid_t pid, int options, struct task_struct *p) 1112static int eligible_child(pid_t pid, int options, struct task_struct *p)
1091{ 1113{
1092 int err; 1114 int err;
1115 struct pid_namespace *ns;
1093 1116
1117 ns = current->nsproxy->pid_ns;
1094 if (pid > 0) { 1118 if (pid > 0) {
1095 if (p->pid != pid) 1119 if (task_pid_nr_ns(p, ns) != pid)
1096 return 0; 1120 return 0;
1097 } else if (!pid) { 1121 } else if (!pid) {
1098 if (process_group(p) != process_group(current)) 1122 if (task_pgrp_nr_ns(p, ns) != task_pgrp_vnr(current))
1099 return 0; 1123 return 0;
1100 } else if (pid != -1) { 1124 } else if (pid != -1) {
1101 if (process_group(p) != -pid) 1125 if (task_pgrp_nr_ns(p, ns) != -pid)
1102 return 0; 1126 return 0;
1103 } 1127 }
1104 1128
@@ -1167,11 +1191,13 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1167 int __user *stat_addr, struct rusage __user *ru) 1191 int __user *stat_addr, struct rusage __user *ru)
1168{ 1192{
1169 unsigned long state; 1193 unsigned long state;
1170 int retval; 1194 int retval, status, traced;
1171 int status; 1195 struct pid_namespace *ns;
1196
1197 ns = current->nsproxy->pid_ns;
1172 1198
1173 if (unlikely(noreap)) { 1199 if (unlikely(noreap)) {
1174 pid_t pid = p->pid; 1200 pid_t pid = task_pid_nr_ns(p, ns);
1175 uid_t uid = p->uid; 1201 uid_t uid = p->uid;
1176 int exit_code = p->exit_code; 1202 int exit_code = p->exit_code;
1177 int why, status; 1203 int why, status;
@@ -1202,15 +1228,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1202 BUG_ON(state != EXIT_DEAD); 1228 BUG_ON(state != EXIT_DEAD);
1203 return 0; 1229 return 0;
1204 } 1230 }
1205 if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) {
1206 /*
1207 * This can only happen in a race with a ptraced thread
1208 * dying on another processor.
1209 */
1210 return 0;
1211 }
1212 1231
1213 if (likely(p->real_parent == p->parent) && likely(p->signal)) { 1232 /* traced means p->ptrace, but not vice versa */
1233 traced = (p->real_parent != p->parent);
1234
1235 if (likely(!traced)) {
1214 struct signal_struct *psig; 1236 struct signal_struct *psig;
1215 struct signal_struct *sig; 1237 struct signal_struct *sig;
1216 1238
@@ -1242,6 +1264,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1242 cputime_add(p->stime, 1264 cputime_add(p->stime,
1243 cputime_add(sig->stime, 1265 cputime_add(sig->stime,
1244 sig->cstime))); 1266 sig->cstime)));
1267 psig->cgtime =
1268 cputime_add(psig->cgtime,
1269 cputime_add(p->gtime,
1270 cputime_add(sig->gtime,
1271 sig->cgtime)));
1245 psig->cmin_flt += 1272 psig->cmin_flt +=
1246 p->min_flt + sig->min_flt + sig->cmin_flt; 1273 p->min_flt + sig->min_flt + sig->cmin_flt;
1247 psig->cmaj_flt += 1274 psig->cmaj_flt +=
@@ -1289,38 +1316,33 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1289 retval = put_user(status, &infop->si_status); 1316 retval = put_user(status, &infop->si_status);
1290 } 1317 }
1291 if (!retval && infop) 1318 if (!retval && infop)
1292 retval = put_user(p->pid, &infop->si_pid); 1319 retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid);
1293 if (!retval && infop) 1320 if (!retval && infop)
1294 retval = put_user(p->uid, &infop->si_uid); 1321 retval = put_user(p->uid, &infop->si_uid);
1295 if (retval) { 1322 if (!retval)
1296 // TODO: is this safe? 1323 retval = task_pid_nr_ns(p, ns);
1297 p->exit_state = EXIT_ZOMBIE; 1324
1298 return retval; 1325 if (traced) {
1299 }
1300 retval = p->pid;
1301 if (p->real_parent != p->parent) {
1302 write_lock_irq(&tasklist_lock); 1326 write_lock_irq(&tasklist_lock);
1303 /* Double-check with lock held. */ 1327 /* We dropped tasklist, ptracer could die and untrace */
1304 if (p->real_parent != p->parent) { 1328 ptrace_unlink(p);
1305 __ptrace_unlink(p); 1329 /*
1306 // TODO: is this safe? 1330 * If this is not a detached task, notify the parent.
1307 p->exit_state = EXIT_ZOMBIE; 1331 * If it's still not detached after that, don't release
1308 /* 1332 * it now.
1309 * If this is not a detached task, notify the parent. 1333 */
1310 * If it's still not detached after that, don't release 1334 if (p->exit_signal != -1) {
1311 * it now. 1335 do_notify_parent(p, p->exit_signal);
1312 */
1313 if (p->exit_signal != -1) { 1336 if (p->exit_signal != -1) {
1314 do_notify_parent(p, p->exit_signal); 1337 p->exit_state = EXIT_ZOMBIE;
1315 if (p->exit_signal != -1) 1338 p = NULL;
1316 p = NULL;
1317 } 1339 }
1318 } 1340 }
1319 write_unlock_irq(&tasklist_lock); 1341 write_unlock_irq(&tasklist_lock);
1320 } 1342 }
1321 if (p != NULL) 1343 if (p != NULL)
1322 release_task(p); 1344 release_task(p);
1323 BUG_ON(!retval); 1345
1324 return retval; 1346 return retval;
1325} 1347}
1326 1348
@@ -1335,11 +1357,12 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
1335 int __user *stat_addr, struct rusage __user *ru) 1357 int __user *stat_addr, struct rusage __user *ru)
1336{ 1358{
1337 int retval, exit_code; 1359 int retval, exit_code;
1360 struct pid_namespace *ns;
1338 1361
1339 if (!p->exit_code) 1362 if (!p->exit_code)
1340 return 0; 1363 return 0;
1341 if (delayed_group_leader && !(p->ptrace & PT_PTRACED) && 1364 if (delayed_group_leader && !(p->ptrace & PT_PTRACED) &&
1342 p->signal && p->signal->group_stop_count > 0) 1365 p->signal->group_stop_count > 0)
1343 /* 1366 /*
1344 * A group stop is in progress and this is the group leader. 1367 * A group stop is in progress and this is the group leader.
1345 * We won't report until all threads have stopped. 1368 * We won't report until all threads have stopped.
@@ -1353,11 +1376,12 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
1353 * keep holding onto the tasklist_lock while we call getrusage and 1376 * keep holding onto the tasklist_lock while we call getrusage and
1354 * possibly take page faults for user memory. 1377 * possibly take page faults for user memory.
1355 */ 1378 */
1379 ns = current->nsproxy->pid_ns;
1356 get_task_struct(p); 1380 get_task_struct(p);
1357 read_unlock(&tasklist_lock); 1381 read_unlock(&tasklist_lock);
1358 1382
1359 if (unlikely(noreap)) { 1383 if (unlikely(noreap)) {
1360 pid_t pid = p->pid; 1384 pid_t pid = task_pid_nr_ns(p, ns);
1361 uid_t uid = p->uid; 1385 uid_t uid = p->uid;
1362 int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; 1386 int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
1363 1387
@@ -1428,11 +1452,11 @@ bail_ref:
1428 if (!retval && infop) 1452 if (!retval && infop)
1429 retval = put_user(exit_code, &infop->si_status); 1453 retval = put_user(exit_code, &infop->si_status);
1430 if (!retval && infop) 1454 if (!retval && infop)
1431 retval = put_user(p->pid, &infop->si_pid); 1455 retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid);
1432 if (!retval && infop) 1456 if (!retval && infop)
1433 retval = put_user(p->uid, &infop->si_uid); 1457 retval = put_user(p->uid, &infop->si_uid);
1434 if (!retval) 1458 if (!retval)
1435 retval = p->pid; 1459 retval = task_pid_nr_ns(p, ns);
1436 put_task_struct(p); 1460 put_task_struct(p);
1437 1461
1438 BUG_ON(!retval); 1462 BUG_ON(!retval);
@@ -1452,9 +1476,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1452 int retval; 1476 int retval;
1453 pid_t pid; 1477 pid_t pid;
1454 uid_t uid; 1478 uid_t uid;
1455 1479 struct pid_namespace *ns;
1456 if (unlikely(!p->signal))
1457 return 0;
1458 1480
1459 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1481 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1460 return 0; 1482 return 0;
@@ -1469,7 +1491,8 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1469 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1491 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1470 spin_unlock_irq(&p->sighand->siglock); 1492 spin_unlock_irq(&p->sighand->siglock);
1471 1493
1472 pid = p->pid; 1494 ns = current->nsproxy->pid_ns;
1495 pid = task_pid_nr_ns(p, ns);
1473 uid = p->uid; 1496 uid = p->uid;
1474 get_task_struct(p); 1497 get_task_struct(p);
1475 read_unlock(&tasklist_lock); 1498 read_unlock(&tasklist_lock);
@@ -1480,7 +1503,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1480 if (!retval && stat_addr) 1503 if (!retval && stat_addr)
1481 retval = put_user(0xffff, stat_addr); 1504 retval = put_user(0xffff, stat_addr);
1482 if (!retval) 1505 if (!retval)
1483 retval = p->pid; 1506 retval = task_pid_nr_ns(p, ns);
1484 } else { 1507 } else {
1485 retval = wait_noreap_copyout(p, pid, uid, 1508 retval = wait_noreap_copyout(p, pid, uid,
1486 CLD_CONTINUED, SIGCONT, 1509 CLD_CONTINUED, SIGCONT,
@@ -1529,12 +1552,9 @@ repeat:
1529 tsk = current; 1552 tsk = current;
1530 do { 1553 do {
1531 struct task_struct *p; 1554 struct task_struct *p;
1532 struct list_head *_p;
1533 int ret; 1555 int ret;
1534 1556
1535 list_for_each(_p,&tsk->children) { 1557 list_for_each_entry(p, &tsk->children, sibling) {
1536 p = list_entry(_p, struct task_struct, sibling);
1537
1538 ret = eligible_child(pid, options, p); 1558 ret = eligible_child(pid, options, p);
1539 if (!ret) 1559 if (!ret)
1540 continue; 1560 continue;
@@ -1616,9 +1636,8 @@ check_continued:
1616 } 1636 }
1617 } 1637 }
1618 if (!flag) { 1638 if (!flag) {
1619 list_for_each(_p, &tsk->ptrace_children) { 1639 list_for_each_entry(p, &tsk->ptrace_children,
1620 p = list_entry(_p, struct task_struct, 1640 ptrace_list) {
1621 ptrace_list);
1622 if (!eligible_child(pid, options, p)) 1641 if (!eligible_child(pid, options, p))
1623 continue; 1642 continue;
1624 flag = 1; 1643 flag = 1;
diff --git a/kernel/fork.c b/kernel/fork.c
index 33f12f48684a..ddafdfac9456 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -29,7 +29,7 @@
29#include <linux/nsproxy.h> 29#include <linux/nsproxy.h>
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/cpuset.h> 32#include <linux/cgroup.h>
33#include <linux/security.h> 33#include <linux/security.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
@@ -50,6 +50,7 @@
50#include <linux/taskstats_kern.h> 50#include <linux/taskstats_kern.h>
51#include <linux/random.h> 51#include <linux/random.h>
52#include <linux/tty.h> 52#include <linux/tty.h>
53#include <linux/proc_fs.h>
53 54
54#include <asm/pgtable.h> 55#include <asm/pgtable.h>
55#include <asm/pgalloc.h> 56#include <asm/pgalloc.h>
@@ -107,6 +108,7 @@ static struct kmem_cache *mm_cachep;
107 108
108void free_task(struct task_struct *tsk) 109void free_task(struct task_struct *tsk)
109{ 110{
111 prop_local_destroy_single(&tsk->dirties);
110 free_thread_info(tsk->stack); 112 free_thread_info(tsk->stack);
111 rt_mutex_debug_task_free(tsk); 113 rt_mutex_debug_task_free(tsk);
112 free_task_struct(tsk); 114 free_task_struct(tsk);
@@ -115,7 +117,7 @@ EXPORT_SYMBOL(free_task);
115 117
116void __put_task_struct(struct task_struct *tsk) 118void __put_task_struct(struct task_struct *tsk)
117{ 119{
118 WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); 120 WARN_ON(!tsk->exit_state);
119 WARN_ON(atomic_read(&tsk->usage)); 121 WARN_ON(atomic_read(&tsk->usage));
120 WARN_ON(tsk == current); 122 WARN_ON(tsk == current);
121 123
@@ -163,6 +165,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
163{ 165{
164 struct task_struct *tsk; 166 struct task_struct *tsk;
165 struct thread_info *ti; 167 struct thread_info *ti;
168 int err;
166 169
167 prepare_to_copy(orig); 170 prepare_to_copy(orig);
168 171
@@ -178,6 +181,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
178 181
179 *tsk = *orig; 182 *tsk = *orig;
180 tsk->stack = ti; 183 tsk->stack = ti;
184
185 err = prop_local_init_single(&tsk->dirties);
186 if (err) {
187 free_thread_info(ti);
188 free_task_struct(tsk);
189 return NULL;
190 }
191
181 setup_thread_stack(tsk, orig); 192 setup_thread_stack(tsk, orig);
182 193
183#ifdef CONFIG_CC_STACKPROTECTOR 194#ifdef CONFIG_CC_STACKPROTECTOR
@@ -195,7 +206,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
195} 206}
196 207
197#ifdef CONFIG_MMU 208#ifdef CONFIG_MMU
198static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 209static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
199{ 210{
200 struct vm_area_struct *mpnt, *tmp, **pprev; 211 struct vm_area_struct *mpnt, *tmp, **pprev;
201 struct rb_node **rb_link, *rb_parent; 212 struct rb_node **rb_link, *rb_parent;
@@ -258,7 +269,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
258 get_file(file); 269 get_file(file);
259 if (tmp->vm_flags & VM_DENYWRITE) 270 if (tmp->vm_flags & VM_DENYWRITE)
260 atomic_dec(&inode->i_writecount); 271 atomic_dec(&inode->i_writecount);
261 272
262 /* insert tmp into the share list, just after mpnt */ 273 /* insert tmp into the share list, just after mpnt */
263 spin_lock(&file->f_mapping->i_mmap_lock); 274 spin_lock(&file->f_mapping->i_mmap_lock);
264 tmp->vm_truncate_count = mpnt->vm_truncate_count; 275 tmp->vm_truncate_count = mpnt->vm_truncate_count;
@@ -321,7 +332,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
321#define mm_free_pgd(mm) 332#define mm_free_pgd(mm)
322#endif /* CONFIG_MMU */ 333#endif /* CONFIG_MMU */
323 334
324 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); 335__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
325 336
326#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) 337#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
327#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 338#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
@@ -573,7 +584,7 @@ fail_nomem:
573 return retval; 584 return retval;
574} 585}
575 586
576static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) 587static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
577{ 588{
578 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); 589 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
579 /* We don't need to lock fs - think why ;-) */ 590 /* We don't need to lock fs - think why ;-) */
@@ -605,7 +616,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
605 616
606EXPORT_SYMBOL_GPL(copy_fs_struct); 617EXPORT_SYMBOL_GPL(copy_fs_struct);
607 618
608static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) 619static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
609{ 620{
610 if (clone_flags & CLONE_FS) { 621 if (clone_flags & CLONE_FS) {
611 atomic_inc(&current->fs->count); 622 atomic_inc(&current->fs->count);
@@ -728,8 +739,8 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
728 /* compute the remainder to be cleared */ 739 /* compute the remainder to be cleared */
729 size = (new_fdt->max_fds - open_files) * sizeof(struct file *); 740 size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
730 741
731 /* This is long word aligned thus could use an optimized version */ 742 /* This is long word aligned thus could use an optimized version */
732 memset(new_fds, 0, size); 743 memset(new_fds, 0, size);
733 744
734 if (new_fdt->max_fds > open_files) { 745 if (new_fdt->max_fds > open_files) {
735 int left = (new_fdt->max_fds-open_files)/8; 746 int left = (new_fdt->max_fds-open_files)/8;
@@ -808,7 +819,7 @@ int unshare_files(void)
808 819
809EXPORT_SYMBOL(unshare_files); 820EXPORT_SYMBOL(unshare_files);
810 821
811static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) 822static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
812{ 823{
813 struct sighand_struct *sig; 824 struct sighand_struct *sig;
814 825
@@ -831,7 +842,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
831 kmem_cache_free(sighand_cachep, sighand); 842 kmem_cache_free(sighand_cachep, sighand);
832} 843}
833 844
834static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) 845static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
835{ 846{
836 struct signal_struct *sig; 847 struct signal_struct *sig;
837 int ret; 848 int ret;
@@ -877,6 +888,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
877 sig->tty_old_pgrp = NULL; 888 sig->tty_old_pgrp = NULL;
878 889
879 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 890 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
891 sig->gtime = cputime_zero;
892 sig->cgtime = cputime_zero;
880 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 893 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
881 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 894 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
882 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 895 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
@@ -911,7 +924,7 @@ void __cleanup_signal(struct signal_struct *sig)
911 kmem_cache_free(signal_cachep, sig); 924 kmem_cache_free(signal_cachep, sig);
912} 925}
913 926
914static inline void cleanup_signal(struct task_struct *tsk) 927static void cleanup_signal(struct task_struct *tsk)
915{ 928{
916 struct signal_struct *sig = tsk->signal; 929 struct signal_struct *sig = tsk->signal;
917 930
@@ -921,7 +934,7 @@ static inline void cleanup_signal(struct task_struct *tsk)
921 __cleanup_signal(sig); 934 __cleanup_signal(sig);
922} 935}
923 936
924static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) 937static void copy_flags(unsigned long clone_flags, struct task_struct *p)
925{ 938{
926 unsigned long new_flags = p->flags; 939 unsigned long new_flags = p->flags;
927 940
@@ -930,16 +943,17 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
930 if (!(clone_flags & CLONE_PTRACE)) 943 if (!(clone_flags & CLONE_PTRACE))
931 p->ptrace = 0; 944 p->ptrace = 0;
932 p->flags = new_flags; 945 p->flags = new_flags;
946 clear_freeze_flag(p);
933} 947}
934 948
935asmlinkage long sys_set_tid_address(int __user *tidptr) 949asmlinkage long sys_set_tid_address(int __user *tidptr)
936{ 950{
937 current->clear_child_tid = tidptr; 951 current->clear_child_tid = tidptr;
938 952
939 return current->pid; 953 return task_pid_vnr(current);
940} 954}
941 955
942static inline void rt_mutex_init_task(struct task_struct *p) 956static void rt_mutex_init_task(struct task_struct *p)
943{ 957{
944 spin_lock_init(&p->pi_lock); 958 spin_lock_init(&p->pi_lock);
945#ifdef CONFIG_RT_MUTEXES 959#ifdef CONFIG_RT_MUTEXES
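
sys_set_tid_address() now reports task_pid_vnr(), i.e. the id the caller sees inside its own pid namespace, which matches gettid(). A small userspace check (illustrative only; uses raw syscall numbers from <sys/syscall.h>):

/* Illustrative: set_tid_address(2) always returns the caller's thread
 * id; after this patch that value is the namespace-local one. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long tid = syscall(SYS_set_tid_address, NULL);

	printf("set_tid_address returned %ld, gettid() says %ld\n",
	       tid, (long)syscall(SYS_gettid));
	return 0;
}
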
@@ -960,12 +974,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
960 unsigned long stack_start, 974 unsigned long stack_start,
961 struct pt_regs *regs, 975 struct pt_regs *regs,
962 unsigned long stack_size, 976 unsigned long stack_size,
963 int __user *parent_tidptr,
964 int __user *child_tidptr, 977 int __user *child_tidptr,
965 struct pid *pid) 978 struct pid *pid)
966{ 979{
967 int retval; 980 int retval;
968 struct task_struct *p = NULL; 981 struct task_struct *p;
982 int cgroup_callbacks_done = 0;
969 983
970 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 984 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
971 return ERR_PTR(-EINVAL); 985 return ERR_PTR(-EINVAL);
@@ -1029,12 +1043,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1029 p->did_exec = 0; 1043 p->did_exec = 0;
1030 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1044 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1031 copy_flags(clone_flags, p); 1045 copy_flags(clone_flags, p);
1032 p->pid = pid_nr(pid);
1033 retval = -EFAULT;
1034 if (clone_flags & CLONE_PARENT_SETTID)
1035 if (put_user(p->pid, parent_tidptr))
1036 goto bad_fork_cleanup_delays_binfmt;
1037
1038 INIT_LIST_HEAD(&p->children); 1046 INIT_LIST_HEAD(&p->children);
1039 INIT_LIST_HEAD(&p->sibling); 1047 INIT_LIST_HEAD(&p->sibling);
1040 p->vfork_done = NULL; 1048 p->vfork_done = NULL;
@@ -1045,6 +1053,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1045 1053
1046 p->utime = cputime_zero; 1054 p->utime = cputime_zero;
1047 p->stime = cputime_zero; 1055 p->stime = cputime_zero;
1056 p->gtime = cputime_zero;
1057 p->utimescaled = cputime_zero;
1058 p->stimescaled = cputime_zero;
1048 1059
1049#ifdef CONFIG_TASK_XACCT 1060#ifdef CONFIG_TASK_XACCT
1050 p->rchar = 0; /* I/O counter: bytes read */ 1061 p->rchar = 0; /* I/O counter: bytes read */
@@ -1055,28 +1066,29 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1055 task_io_accounting_init(p); 1066 task_io_accounting_init(p);
1056 acct_clear_integrals(p); 1067 acct_clear_integrals(p);
1057 1068
1058 p->it_virt_expires = cputime_zero; 1069 p->it_virt_expires = cputime_zero;
1059 p->it_prof_expires = cputime_zero; 1070 p->it_prof_expires = cputime_zero;
1060 p->it_sched_expires = 0; 1071 p->it_sched_expires = 0;
1061 INIT_LIST_HEAD(&p->cpu_timers[0]); 1072 INIT_LIST_HEAD(&p->cpu_timers[0]);
1062 INIT_LIST_HEAD(&p->cpu_timers[1]); 1073 INIT_LIST_HEAD(&p->cpu_timers[1]);
1063 INIT_LIST_HEAD(&p->cpu_timers[2]); 1074 INIT_LIST_HEAD(&p->cpu_timers[2]);
1064 1075
1065 p->lock_depth = -1; /* -1 = no lock */ 1076 p->lock_depth = -1; /* -1 = no lock */
1066 do_posix_clock_monotonic_gettime(&p->start_time); 1077 do_posix_clock_monotonic_gettime(&p->start_time);
1067 p->real_start_time = p->start_time; 1078 p->real_start_time = p->start_time;
1068 monotonic_to_bootbased(&p->real_start_time); 1079 monotonic_to_bootbased(&p->real_start_time);
1080#ifdef CONFIG_SECURITY
1069 p->security = NULL; 1081 p->security = NULL;
1082#endif
1070 p->io_context = NULL; 1083 p->io_context = NULL;
1071 p->io_wait = NULL;
1072 p->audit_context = NULL; 1084 p->audit_context = NULL;
1073 cpuset_fork(p); 1085 cgroup_fork(p);
1074#ifdef CONFIG_NUMA 1086#ifdef CONFIG_NUMA
1075 p->mempolicy = mpol_copy(p->mempolicy); 1087 p->mempolicy = mpol_copy(p->mempolicy);
1076 if (IS_ERR(p->mempolicy)) { 1088 if (IS_ERR(p->mempolicy)) {
1077 retval = PTR_ERR(p->mempolicy); 1089 retval = PTR_ERR(p->mempolicy);
1078 p->mempolicy = NULL; 1090 p->mempolicy = NULL;
1079 goto bad_fork_cleanup_cpuset; 1091 goto bad_fork_cleanup_cgroup;
1080 } 1092 }
1081 mpol_fix_fork_child_flag(p); 1093 mpol_fix_fork_child_flag(p);
1082#endif 1094#endif
@@ -1109,10 +1121,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1109 p->blocked_on = NULL; /* not blocked yet */ 1121 p->blocked_on = NULL; /* not blocked yet */
1110#endif 1122#endif
1111 1123
1112 p->tgid = p->pid;
1113 if (clone_flags & CLONE_THREAD)
1114 p->tgid = current->tgid;
1115
1116 if ((retval = security_task_alloc(p))) 1124 if ((retval = security_task_alloc(p)))
1117 goto bad_fork_cleanup_policy; 1125 goto bad_fork_cleanup_policy;
1118 if ((retval = audit_alloc(p))) 1126 if ((retval = audit_alloc(p)))
@@ -1138,18 +1146,37 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1138 if (retval) 1146 if (retval)
1139 goto bad_fork_cleanup_namespaces; 1147 goto bad_fork_cleanup_namespaces;
1140 1148
1149 if (pid != &init_struct_pid) {
1150 retval = -ENOMEM;
1151 pid = alloc_pid(task_active_pid_ns(p));
1152 if (!pid)
1153 goto bad_fork_cleanup_namespaces;
1154
1155 if (clone_flags & CLONE_NEWPID) {
1156 retval = pid_ns_prepare_proc(task_active_pid_ns(p));
1157 if (retval < 0)
1158 goto bad_fork_free_pid;
1159 }
1160 }
1161
1162 p->pid = pid_nr(pid);
1163 p->tgid = p->pid;
1164 if (clone_flags & CLONE_THREAD)
1165 p->tgid = current->tgid;
1166
1141 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1167 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1142 /* 1168 /*
1143 * Clear TID on mm_release()? 1169 * Clear TID on mm_release()?
1144 */ 1170 */
1145 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1171 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
1172#ifdef CONFIG_FUTEX
1146 p->robust_list = NULL; 1173 p->robust_list = NULL;
1147#ifdef CONFIG_COMPAT 1174#ifdef CONFIG_COMPAT
1148 p->compat_robust_list = NULL; 1175 p->compat_robust_list = NULL;
1149#endif 1176#endif
1150 INIT_LIST_HEAD(&p->pi_state_list); 1177 INIT_LIST_HEAD(&p->pi_state_list);
1151 p->pi_state_cache = NULL; 1178 p->pi_state_cache = NULL;
1152 1179#endif
1153 /* 1180 /*
1154 * sigaltstack should be cleared when sharing the same VM 1181 * sigaltstack should be cleared when sharing the same VM
1155 */ 1182 */
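
With the pid now allocated inside copy_process() against task_active_pid_ns(p), a single task can carry a different numeric id at each pid-namespace level, and the rest of this series converts callers to say which view they want. A sketch that simply exercises the three helpers already used in the hunks above (only the wrapper function itself is illustrative):

/* Report a task's pid from three viewpoints; all three helpers appear
 * in the hunks above. */
#include <linux/sched.h>
#include <linux/pid_namespace.h>

static void report_pids(struct task_struct *p)
{
	struct pid_namespace *ns = task_active_pid_ns(current);

	printk(KERN_DEBUG "global %d, caller-relative %d, explicit-ns %d\n",
	       task_pid_nr(p), task_pid_vnr(p), task_pid_nr_ns(p, ns));
}
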
@@ -1186,6 +1213,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1186 /* Perform scheduler related setup. Assign this task to a CPU. */ 1213 /* Perform scheduler related setup. Assign this task to a CPU. */
1187 sched_fork(p, clone_flags); 1214 sched_fork(p, clone_flags);
1188 1215
1216 /* Now that the task is set up, run cgroup callbacks if
1217 * necessary. We need to run them before the task is visible
1218 * on the tasklist. */
1219 cgroup_fork_callbacks(p);
1220 cgroup_callbacks_done = 1;
1221
1189 /* Need tasklist lock for parent etc handling! */ 1222 /* Need tasklist lock for parent etc handling! */
1190 write_lock_irq(&tasklist_lock); 1223 write_lock_irq(&tasklist_lock);
1191 1224
@@ -1223,12 +1256,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1223 * A fatal signal pending means that current will exit, so the new 1256 * A fatal signal pending means that current will exit, so the new
1224 * thread can't slip out of an OOM kill (or normal SIGKILL). 1257 * thread can't slip out of an OOM kill (or normal SIGKILL).
1225 */ 1258 */
1226 recalc_sigpending(); 1259 recalc_sigpending();
1227 if (signal_pending(current)) { 1260 if (signal_pending(current)) {
1228 spin_unlock(&current->sighand->siglock); 1261 spin_unlock(&current->sighand->siglock);
1229 write_unlock_irq(&tasklist_lock); 1262 write_unlock_irq(&tasklist_lock);
1230 retval = -ERESTARTNOINTR; 1263 retval = -ERESTARTNOINTR;
1231 goto bad_fork_cleanup_namespaces; 1264 goto bad_fork_free_pid;
1232 } 1265 }
1233 1266
1234 if (clone_flags & CLONE_THREAD) { 1267 if (clone_flags & CLONE_THREAD) {
@@ -1257,11 +1290,22 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1257 __ptrace_link(p, current->parent); 1290 __ptrace_link(p, current->parent);
1258 1291
1259 if (thread_group_leader(p)) { 1292 if (thread_group_leader(p)) {
1260 p->signal->tty = current->signal->tty; 1293 if (clone_flags & CLONE_NEWPID) {
1261 p->signal->pgrp = process_group(current); 1294 p->nsproxy->pid_ns->child_reaper = p;
1262 set_signal_session(p->signal, process_session(current)); 1295 p->signal->tty = NULL;
1263 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1296 set_task_pgrp(p, p->pid);
1264 attach_pid(p, PIDTYPE_SID, task_session(current)); 1297 set_task_session(p, p->pid);
1298 attach_pid(p, PIDTYPE_PGID, pid);
1299 attach_pid(p, PIDTYPE_SID, pid);
1300 } else {
1301 p->signal->tty = current->signal->tty;
1302 set_task_pgrp(p, task_pgrp_nr(current));
1303 set_task_session(p, task_session_nr(current));
1304 attach_pid(p, PIDTYPE_PGID,
1305 task_pgrp(current));
1306 attach_pid(p, PIDTYPE_SID,
1307 task_session(current));
1308 }
1265 1309
1266 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1310 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1267 __get_cpu_var(process_counts)++; 1311 __get_cpu_var(process_counts)++;
@@ -1274,8 +1318,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1274 spin_unlock(&current->sighand->siglock); 1318 spin_unlock(&current->sighand->siglock);
1275 write_unlock_irq(&tasklist_lock); 1319 write_unlock_irq(&tasklist_lock);
1276 proc_fork_connector(p); 1320 proc_fork_connector(p);
1321 cgroup_post_fork(p);
1277 return p; 1322 return p;
1278 1323
1324bad_fork_free_pid:
1325 if (pid != &init_struct_pid)
1326 free_pid(pid);
1279bad_fork_cleanup_namespaces: 1327bad_fork_cleanup_namespaces:
1280 exit_task_namespaces(p); 1328 exit_task_namespaces(p);
1281bad_fork_cleanup_keys: 1329bad_fork_cleanup_keys:
@@ -1300,10 +1348,9 @@ bad_fork_cleanup_security:
1300bad_fork_cleanup_policy: 1348bad_fork_cleanup_policy:
1301#ifdef CONFIG_NUMA 1349#ifdef CONFIG_NUMA
1302 mpol_free(p->mempolicy); 1350 mpol_free(p->mempolicy);
1303bad_fork_cleanup_cpuset: 1351bad_fork_cleanup_cgroup:
1304#endif 1352#endif
1305 cpuset_exit(p); 1353 cgroup_exit(p, cgroup_callbacks_done);
1306bad_fork_cleanup_delays_binfmt:
1307 delayacct_tsk_free(p); 1354 delayacct_tsk_free(p);
1308 if (p->binfmt) 1355 if (p->binfmt)
1309 module_put(p->binfmt->module); 1356 module_put(p->binfmt->module);
@@ -1330,7 +1377,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1330 struct task_struct *task; 1377 struct task_struct *task;
1331 struct pt_regs regs; 1378 struct pt_regs regs;
1332 1379
1333 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 1380 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1334 &init_struct_pid); 1381 &init_struct_pid);
1335 if (!IS_ERR(task)) 1382 if (!IS_ERR(task))
1336 init_idle(task, cpu); 1383 init_idle(task, cpu);
@@ -1338,7 +1385,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1338 return task; 1385 return task;
1339} 1386}
1340 1387
1341static inline int fork_traceflag (unsigned clone_flags) 1388static int fork_traceflag(unsigned clone_flags)
1342{ 1389{
1343 if (clone_flags & CLONE_UNTRACED) 1390 if (clone_flags & CLONE_UNTRACED)
1344 return 0; 1391 return 0;
@@ -1369,19 +1416,16 @@ long do_fork(unsigned long clone_flags,
1369{ 1416{
1370 struct task_struct *p; 1417 struct task_struct *p;
1371 int trace = 0; 1418 int trace = 0;
1372 struct pid *pid = alloc_pid();
1373 long nr; 1419 long nr;
1374 1420
1375 if (!pid)
1376 return -EAGAIN;
1377 nr = pid->nr;
1378 if (unlikely(current->ptrace)) { 1421 if (unlikely(current->ptrace)) {
1379 trace = fork_traceflag (clone_flags); 1422 trace = fork_traceflag (clone_flags);
1380 if (trace) 1423 if (trace)
1381 clone_flags |= CLONE_PTRACE; 1424 clone_flags |= CLONE_PTRACE;
1382 } 1425 }
1383 1426
1384 p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); 1427 p = copy_process(clone_flags, stack_start, regs, stack_size,
1428 child_tidptr, NULL);
1385 /* 1429 /*
1386 * Do this prior to waking up the new thread - the thread pointer 1430
1387 * might get invalid after that point, if the thread exits quickly. 1431 * might get invalid after that point, if the thread exits quickly.
@@ -1389,6 +1433,17 @@ long do_fork(unsigned long clone_flags,
1389 if (!IS_ERR(p)) { 1433 if (!IS_ERR(p)) {
1390 struct completion vfork; 1434 struct completion vfork;
1391 1435
1436 /*
1437 * this is enough to call pid_nr_ns here, but this if
1438 * improves optimisation of regular fork()
1439 */
1440 nr = (clone_flags & CLONE_NEWPID) ?
1441 task_pid_nr_ns(p, current->nsproxy->pid_ns) :
1442 task_pid_vnr(p);
1443
1444 if (clone_flags & CLONE_PARENT_SETTID)
1445 put_user(nr, parent_tidptr);
1446
1392 if (clone_flags & CLONE_VFORK) { 1447 if (clone_flags & CLONE_VFORK) {
1393 p->vfork_done = &vfork; 1448 p->vfork_done = &vfork;
1394 init_completion(&vfork); 1449 init_completion(&vfork);
@@ -1422,7 +1477,6 @@ long do_fork(unsigned long clone_flags,
1422 } 1477 }
1423 } 1478 }
1424 } else { 1479 } else {
1425 free_pid(pid);
1426 nr = PTR_ERR(p); 1480 nr = PTR_ERR(p);
1427 } 1481 }
1428 return nr; 1482 return nr;
@@ -1432,8 +1486,7 @@ long do_fork(unsigned long clone_flags,
1432#define ARCH_MIN_MMSTRUCT_ALIGN 0 1486#define ARCH_MIN_MMSTRUCT_ALIGN 0
1433#endif 1487#endif
1434 1488
1435static void sighand_ctor(void *data, struct kmem_cache *cachep, 1489static void sighand_ctor(struct kmem_cache *cachep, void *data)
1436 unsigned long flags)
1437{ 1490{
1438 struct sighand_struct *sighand = data; 1491 struct sighand_struct *sighand = data;
1439 1492
@@ -1468,7 +1521,7 @@ void __init proc_caches_init(void)
1468 * Check constraints on flags passed to the unshare system call and 1521 * Check constraints on flags passed to the unshare system call and
1469 * force unsharing of additional process context as appropriate. 1522 * force unsharing of additional process context as appropriate.
1470 */ 1523 */
1471static inline void check_unshare_flags(unsigned long *flags_ptr) 1524static void check_unshare_flags(unsigned long *flags_ptr)
1472{ 1525{
1473 /* 1526 /*
1474 * If unsharing a thread from a thread group, must also 1527 * If unsharing a thread from a thread group, must also
@@ -1600,7 +1653,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1600 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1653 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1601 struct files_struct *fd, *new_fd = NULL; 1654 struct files_struct *fd, *new_fd = NULL;
1602 struct sem_undo_list *new_ulist = NULL; 1655 struct sem_undo_list *new_ulist = NULL;
1603 struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; 1656 struct nsproxy *new_nsproxy = NULL;
1604 1657
1605 check_unshare_flags(&unshare_flags); 1658 check_unshare_flags(&unshare_flags);
1606 1659
@@ -1608,7 +1661,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1608 err = -EINVAL; 1661 err = -EINVAL;
1609 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1662 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1610 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1663 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1611 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER)) 1664 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER|
1665 CLONE_NEWNET))
1612 goto bad_unshare_out; 1666 goto bad_unshare_out;
1613 1667
1614 if ((err = unshare_thread(unshare_flags))) 1668 if ((err = unshare_thread(unshare_flags)))
@@ -1629,14 +1683,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1629 1683
1630 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { 1684 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) {
1631 1685
1632 task_lock(current);
1633
1634 if (new_nsproxy) { 1686 if (new_nsproxy) {
1635 old_nsproxy = current->nsproxy; 1687 switch_task_namespaces(current, new_nsproxy);
1636 current->nsproxy = new_nsproxy; 1688 new_nsproxy = NULL;
1637 new_nsproxy = old_nsproxy;
1638 } 1689 }
1639 1690
1691 task_lock(current);
1692
1640 if (new_fs) { 1693 if (new_fs) {
1641 fs = current->fs; 1694 fs = current->fs;
1642 current->fs = new_fs; 1695 current->fs = new_fs;
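
sys_unshare() now installs the freshly built nsproxy via switch_task_namespaces() and accepts CLONE_NEWNET alongside the existing namespace flags. The observable behaviour is ordinary unshare(2); a minimal demo with a UTS namespace, which avoids depending on the new network-namespace plumbing (illustrative only, needs CAP_SYS_ADMIN):

/* Illustrative only: unshare(CLONE_NEWUTS) gives this process a private
 * hostname, exercising the nsproxy switch performed in sys_unshare(). */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char name[64];

	if (unshare(CLONE_NEWUTS) < 0) {	/* needs CAP_SYS_ADMIN */
		perror("unshare");
		return 1;
	}
	sethostname("sandbox", strlen("sandbox"));
	gethostname(name, sizeof(name));
	printf("hostname inside the new UTS namespace: %s\n", name);
	return 0;	/* the parent shell's hostname is unchanged */
}
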
diff --git a/kernel/futex.c b/kernel/futex.c
index fcc94e7b4086..32710451dc20 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -52,6 +52,10 @@
52#include <linux/syscalls.h> 52#include <linux/syscalls.h>
53#include <linux/signal.h> 53#include <linux/signal.h>
54#include <linux/module.h> 54#include <linux/module.h>
55#include <linux/magic.h>
56#include <linux/pid.h>
57#include <linux/nsproxy.h>
58
55#include <asm/futex.h> 59#include <asm/futex.h>
56 60
57#include "rtmutex_common.h" 61#include "rtmutex_common.h"
@@ -292,7 +296,7 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
292 */ 296 */
293void drop_futex_key_refs(union futex_key *key) 297void drop_futex_key_refs(union futex_key *key)
294{ 298{
295 if (key->both.ptr == 0) 299 if (!key->both.ptr)
296 return; 300 return;
297 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 301 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
298 case FUT_OFF_INODE: 302 case FUT_OFF_INODE:
@@ -442,8 +446,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
442 struct task_struct *p; 446 struct task_struct *p;
443 447
444 rcu_read_lock(); 448 rcu_read_lock();
445 p = find_task_by_pid(pid); 449 p = find_task_by_vpid(pid);
446
447 if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) 450 if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
448 p = ERR_PTR(-ESRCH); 451 p = ERR_PTR(-ESRCH);
449 else 452 else
@@ -652,7 +655,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
652 if (!(uval & FUTEX_OWNER_DIED)) { 655 if (!(uval & FUTEX_OWNER_DIED)) {
653 int ret = 0; 656 int ret = 0;
654 657
655 newval = FUTEX_WAITERS | new_owner->pid; 658 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
656 659
657 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 660 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
658 661
@@ -1045,7 +1048,7 @@ static int unqueue_me(struct futex_q *q)
1045 retry: 1048 retry:
1046 lock_ptr = q->lock_ptr; 1049 lock_ptr = q->lock_ptr;
1047 barrier(); 1050 barrier();
1048 if (lock_ptr != 0) { 1051 if (lock_ptr != NULL) {
1049 spin_lock(lock_ptr); 1052 spin_lock(lock_ptr);
1050 /* 1053 /*
1051 * q->lock_ptr can change between reading it and 1054 * q->lock_ptr can change between reading it and
@@ -1105,7 +1108,7 @@ static void unqueue_me_pi(struct futex_q *q)
1105static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1108static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1106 struct task_struct *curr) 1109 struct task_struct *curr)
1107{ 1110{
1108 u32 newtid = curr->pid | FUTEX_WAITERS; 1111 u32 newtid = task_pid_vnr(curr) | FUTEX_WAITERS;
1109 struct futex_pi_state *pi_state = q->pi_state; 1112 struct futex_pi_state *pi_state = q->pi_state;
1110 u32 uval, curval, newval; 1113 u32 uval, curval, newval;
1111 int ret; 1114 int ret;
@@ -1367,7 +1370,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1367 * (by doing a 0 -> TID atomic cmpxchg), while holding all 1370 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1368 * the locks. It will most likely not succeed. 1371 * the locks. It will most likely not succeed.
1369 */ 1372 */
1370 newval = current->pid; 1373 newval = task_pid_vnr(current);
1371 1374
1372 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 1375 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1373 1376
@@ -1378,7 +1381,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1378 * Detect deadlocks. In case of REQUEUE_PI this is a valid 1381 * Detect deadlocks. In case of REQUEUE_PI this is a valid
1379 * situation and we return success to user space. 1382 * situation and we return success to user space.
1380 */ 1383 */
1381 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { 1384 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1382 ret = -EDEADLK; 1385 ret = -EDEADLK;
1383 goto out_unlock_release_sem; 1386 goto out_unlock_release_sem;
1384 } 1387 }
@@ -1407,7 +1410,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1407 */ 1410 */
1408 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 1411 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
1409 /* Keep the OWNER_DIED bit */ 1412 /* Keep the OWNER_DIED bit */
1410 newval = (curval & ~FUTEX_TID_MASK) | current->pid; 1413 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
1411 ownerdied = 0; 1414 ownerdied = 0;
1412 lock_taken = 1; 1415 lock_taken = 1;
1413 } 1416 }
@@ -1586,7 +1589,7 @@ retry:
1586 /* 1589 /*
1587 * We release only a lock we actually own: 1590 * We release only a lock we actually own:
1588 */ 1591 */
1589 if ((uval & FUTEX_TID_MASK) != current->pid) 1592 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1590 return -EPERM; 1593 return -EPERM;
1591 /* 1594 /*
1592 * First take all the futex related locks: 1595 * First take all the futex related locks:
@@ -1607,7 +1610,7 @@ retry_unlocked:
1607 * anyone else up: 1610 * anyone else up:
1608 */ 1611 */
1609 if (!(uval & FUTEX_OWNER_DIED)) 1612 if (!(uval & FUTEX_OWNER_DIED))
1610 uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0); 1613 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
1611 1614
1612 1615
1613 if (unlikely(uval == -EFAULT)) 1616 if (unlikely(uval == -EFAULT))
@@ -1616,7 +1619,7 @@ retry_unlocked:
1616 * Rare case: we managed to release the lock atomically, 1619 * Rare case: we managed to release the lock atomically,
1617 * no need to wake anyone else up: 1620 * no need to wake anyone else up:
1618 */ 1621 */
1619 if (unlikely(uval == current->pid)) 1622 if (unlikely(uval == task_pid_vnr(current)))
1620 goto out_unlock; 1623 goto out_unlock;
1621 1624
1622 /* 1625 /*
@@ -1853,7 +1856,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1853 1856
1854 ret = -ESRCH; 1857 ret = -ESRCH;
1855 rcu_read_lock(); 1858 rcu_read_lock();
1856 p = find_task_by_pid(pid); 1859 p = find_task_by_vpid(pid);
1857 if (!p) 1860 if (!p)
1858 goto err_unlock; 1861 goto err_unlock;
1859 ret = -EPERM; 1862 ret = -EPERM;
@@ -1886,7 +1889,7 @@ retry:
1886 if (get_user(uval, uaddr)) 1889 if (get_user(uval, uaddr))
1887 return -1; 1890 return -1;
1888 1891
1889 if ((uval & FUTEX_TID_MASK) == curr->pid) { 1892 if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
1890 /* 1893 /*
1891 * Ok, this dying thread is truly holding a futex 1894 * Ok, this dying thread is truly holding a futex
1892 * of interest. Set the OWNER_DIED bit atomically 1895 * of interest. Set the OWNER_DIED bit atomically
@@ -2080,7 +2083,7 @@ static int futexfs_get_sb(struct file_system_type *fs_type,
2080 int flags, const char *dev_name, void *data, 2083 int flags, const char *dev_name, void *data,
2081 struct vfsmount *mnt) 2084 struct vfsmount *mnt)
2082{ 2085{
2083 return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt); 2086 return get_sb_pseudo(fs_type, "futex", NULL, FUTEXFS_SUPER_MAGIC, mnt);
2084} 2087}
2085 2088
2086static struct file_system_type futex_fs_type = { 2089static struct file_system_type futex_fs_type = {
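
The futex changes above all follow one pattern: wherever the futex value or a
task lookup used current->pid, the code now uses task_pid_vnr() or
find_task_by_vpid(), i.e. the TID as seen from the caller's pid namespace,
which is also the value user space writes into a PI futex word. A hedged
user-space sketch of that protocol (pi_lock and the GCC atomic builtin are
illustrative, not part of this patch):

	#include <stdint.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/futex.h>

	/* Fast path of PI-futex locking: store our TID (gettid(), i.e. the
	 * namespace-local id) into the futex word; under contention the
	 * kernel compares that word against task_pid_vnr(). */
	static void pi_lock(uint32_t *futex_word)
	{
		uint32_t expected = 0;
		uint32_t tid = (uint32_t)syscall(SYS_gettid);

		if (!__atomic_compare_exchange_n(futex_word, &expected, tid, 0,
						 __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
			syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
	}
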
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 2c2e2954b713..00b572666cc7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <linux/compat.h> 10#include <linux/compat.h>
11#include <linux/nsproxy.h>
11#include <linux/futex.h> 12#include <linux/futex.h>
12 13
13#include <asm/uaccess.h> 14#include <asm/uaccess.h>
@@ -124,7 +125,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
124 125
125 ret = -ESRCH; 126 ret = -ESRCH;
126 read_lock(&tasklist_lock); 127 read_lock(&tasklist_lock);
127 p = find_task_by_pid(pid); 128 p = find_task_by_vpid(pid);
128 if (!p) 129 if (!p)
129 goto err_unlock; 130 goto err_unlock;
130 ret = -EPERM; 131 ret = -EPERM;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c21ca6bfaa66..b2b2c2b0a49b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -277,6 +277,30 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
277} 277}
278 278
279EXPORT_SYMBOL_GPL(ktime_add_ns); 279EXPORT_SYMBOL_GPL(ktime_add_ns);
280
281/**
282 * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
283 * @kt: minuend
284 * @nsec: the scalar nsec value to subtract
285 *
286 * Returns the subtraction of @nsec from @kt in ktime_t format
287 */
288ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
289{
290 ktime_t tmp;
291
292 if (likely(nsec < NSEC_PER_SEC)) {
293 tmp.tv64 = nsec;
294 } else {
295 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
296
297 tmp = ktime_set((long)nsec, rem);
298 }
299
300 return ktime_sub(kt, tmp);
301}
302
303EXPORT_SYMBOL_GPL(ktime_sub_ns);
280# endif /* !CONFIG_KTIME_SCALAR */ 304# endif /* !CONFIG_KTIME_SCALAR */
281 305
282/* 306/*
@@ -1262,8 +1286,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1262long __sched hrtimer_nanosleep_restart(struct restart_block *restart) 1286long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1263{ 1287{
1264 struct hrtimer_sleeper t; 1288 struct hrtimer_sleeper t;
1265 struct timespec __user *rmtp; 1289 struct timespec *rmtp;
1266 struct timespec tu;
1267 ktime_t time; 1290 ktime_t time;
1268 1291
1269 restart->fn = do_no_restart_syscall; 1292 restart->fn = do_no_restart_syscall;
@@ -1274,14 +1297,12 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1274 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1297 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1275 return 0; 1298 return 0;
1276 1299
1277 rmtp = (struct timespec __user *) restart->arg1; 1300 rmtp = (struct timespec *)restart->arg1;
1278 if (rmtp) { 1301 if (rmtp) {
1279 time = ktime_sub(t.timer.expires, t.timer.base->get_time()); 1302 time = ktime_sub(t.timer.expires, t.timer.base->get_time());
1280 if (time.tv64 <= 0) 1303 if (time.tv64 <= 0)
1281 return 0; 1304 return 0;
1282 tu = ktime_to_timespec(time); 1305 *rmtp = ktime_to_timespec(time);
1283 if (copy_to_user(rmtp, &tu, sizeof(tu)))
1284 return -EFAULT;
1285 } 1306 }
1286 1307
1287 restart->fn = hrtimer_nanosleep_restart; 1308 restart->fn = hrtimer_nanosleep_restart;
@@ -1290,12 +1311,11 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1290 return -ERESTART_RESTARTBLOCK; 1311 return -ERESTART_RESTARTBLOCK;
1291} 1312}
1292 1313
1293long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, 1314long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp,
1294 const enum hrtimer_mode mode, const clockid_t clockid) 1315 const enum hrtimer_mode mode, const clockid_t clockid)
1295{ 1316{
1296 struct restart_block *restart; 1317 struct restart_block *restart;
1297 struct hrtimer_sleeper t; 1318 struct hrtimer_sleeper t;
1298 struct timespec tu;
1299 ktime_t rem; 1319 ktime_t rem;
1300 1320
1301 hrtimer_init(&t.timer, clockid, mode); 1321 hrtimer_init(&t.timer, clockid, mode);
@@ -1311,9 +1331,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1311 rem = ktime_sub(t.timer.expires, t.timer.base->get_time()); 1331 rem = ktime_sub(t.timer.expires, t.timer.base->get_time());
1312 if (rem.tv64 <= 0) 1332 if (rem.tv64 <= 0)
1313 return 0; 1333 return 0;
1314 tu = ktime_to_timespec(rem); 1334 *rmtp = ktime_to_timespec(rem);
1315 if (copy_to_user(rmtp, &tu, sizeof(tu)))
1316 return -EFAULT;
1317 } 1335 }
1318 1336
1319 restart = &current_thread_info()->restart_block; 1337 restart = &current_thread_info()->restart_block;
@@ -1329,7 +1347,8 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1329asmlinkage long 1347asmlinkage long
1330sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) 1348sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1331{ 1349{
1332 struct timespec tu; 1350 struct timespec tu, rmt;
1351 int ret;
1333 1352
1334 if (copy_from_user(&tu, rqtp, sizeof(tu))) 1353 if (copy_from_user(&tu, rqtp, sizeof(tu)))
1335 return -EFAULT; 1354 return -EFAULT;
@@ -1337,7 +1356,15 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1337 if (!timespec_valid(&tu)) 1356 if (!timespec_valid(&tu))
1338 return -EINVAL; 1357 return -EINVAL;
1339 1358
1340 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); 1359 ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL,
1360 CLOCK_MONOTONIC);
1361
1362 if (ret && rmtp) {
1363 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
1364 return -EFAULT;
1365 }
1366
1367 return ret;
1341} 1368}
1342 1369
1343/* 1370/*
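
The hrtimer rework above moves the copy_to_user() of the remaining time out of
hrtimer_nanosleep() and into the outermost syscall, and adds ktime_sub_ns() as
the counterpart of ktime_add_ns() for the !CONFIG_KTIME_SCALAR case. A hedged
sketch of a caller of the new interface, mirroring the sys_nanosleep() pattern
above (example_sleep and user_rmtp are illustrative names):

	static long example_sleep(struct timespec *req,
				  struct timespec __user *user_rmtp)
	{
		struct timespec rem;
		long ret;

		/* The remainder now stays in a kernel-space timespec... */
		ret = hrtimer_nanosleep(req, user_rmtp ? &rem : NULL,
					HRTIMER_MODE_REL, CLOCK_MONOTONIC);
		/* ...and is copied out only at the syscall boundary. */
		if (ret && user_rmtp &&
		    copy_to_user(user_rmtp, &rem, sizeof(rem)))
			return -EFAULT;
		return ret;
	}
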
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f1a73f0b54e7..9b5dff6b3f6a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -503,7 +503,6 @@ out_unlock:
503 spin_unlock(&desc->lock); 503 spin_unlock(&desc->lock);
504} 504}
505 505
506#ifdef CONFIG_SMP
507/** 506/**
508 * handle_percpu_IRQ - Per CPU local irq handler 507 * handle_percpu_IRQ - Per CPU local irq handler
509 * @irq: the interrupt number 508 * @irq: the interrupt number
@@ -529,8 +528,6 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
529 desc->chip->eoi(irq); 528 desc->chip->eoi(irq);
530} 529}
531 530
532#endif /* CONFIG_SMP */
533
534void 531void
535__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 532__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
536 const char *name) 533 const char *name)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7230d914eaa2..80eab7a04205 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -405,7 +405,6 @@ void free_irq(unsigned int irq, void *dev_id)
405 struct irq_desc *desc; 405 struct irq_desc *desc;
406 struct irqaction **p; 406 struct irqaction **p;
407 unsigned long flags; 407 unsigned long flags;
408 irqreturn_t (*handler)(int, void *) = NULL;
409 408
410 WARN_ON(in_interrupt()); 409 WARN_ON(in_interrupt());
411 if (irq >= NR_IRQS) 410 if (irq >= NR_IRQS)
@@ -445,8 +444,21 @@ void free_irq(unsigned int irq, void *dev_id)
445 444
446 /* Make sure it's not being used on another CPU */ 445 /* Make sure it's not being used on another CPU */
447 synchronize_irq(irq); 446 synchronize_irq(irq);
448 if (action->flags & IRQF_SHARED) 447#ifdef CONFIG_DEBUG_SHIRQ
449 handler = action->handler; 448 /*
449 * It's a shared IRQ -- the driver ought to be
450 * prepared for it to happen even now that it's
451 * being freed, so let's make sure.... We do
452 * this after actually deregistering it, to
453 * make sure that a 'real' IRQ doesn't run in
454 * parallel with our fake
455 */
456 if (action->flags & IRQF_SHARED) {
457 local_irq_save(flags);
458 action->handler(irq, dev_id);
459 local_irq_restore(flags);
460 }
461#endif
450 kfree(action); 462 kfree(action);
451 return; 463 return;
452 } 464 }
@@ -454,19 +466,6 @@ void free_irq(unsigned int irq, void *dev_id)
454 spin_unlock_irqrestore(&desc->lock, flags); 466 spin_unlock_irqrestore(&desc->lock, flags);
455 return; 467 return;
456 } 468 }
457#ifdef CONFIG_DEBUG_SHIRQ
458 if (handler) {
459 /*
460 * It's a shared IRQ -- the driver ought to be prepared for it
461 * to happen even now it's being freed, so let's make sure....
462 * We do this after actually deregistering it, to make sure that
463 * a 'real' IRQ doesn't run in parallel with our fake
464 */
465 local_irq_save(flags);
466 handler(irq, dev_id);
467 local_irq_restore(flags);
468 }
469#endif
470} 469}
471EXPORT_SYMBOL(free_irq); 470EXPORT_SYMBOL(free_irq);
472 471
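
With the CONFIG_DEBUG_SHIRQ hunk above, the single "fake" invocation of a
shared handler now happens right after the action has been unlinked, with
local interrupts disabled. The rule it exercises is unchanged: a handler
registered with IRQF_SHARED must cope with being called while its device has
nothing pending. A hedged sketch of that defensive pattern (struct foo_dev and
FOO_IRQ_STATUS are illustrative):

	#include <linux/interrupt.h>
	#include <linux/io.h>

	struct foo_dev {
		void __iomem *regs;
	};
	#define FOO_IRQ_STATUS	0x04	/* illustrative register offset */

	static irqreturn_t foo_interrupt(int irq, void *dev_id)
	{
		struct foo_dev *dev = dev_id;
		u32 status = readl(dev->regs + FOO_IRQ_STATUS);

		if (!status)		/* not ours -- also covers the fake call */
			return IRQ_NONE;

		writel(status, dev->regs + FOO_IRQ_STATUS);	/* ack and handle */
		return IRQ_HANDLED;
	}

	/* Registered with: request_irq(irq, foo_interrupt, IRQF_SHARED, "foo", dev); */
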
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 3205e8e114fa..2fab344dbf56 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -130,7 +130,7 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
130enum hrtimer_restart it_real_fn(struct hrtimer *timer) 130enum hrtimer_restart it_real_fn(struct hrtimer *timer)
131{ 131{
132 struct signal_struct *sig = 132 struct signal_struct *sig =
133 container_of(timer, struct signal_struct, real_timer); 133 container_of(timer, struct signal_struct, real_timer);
134 134
135 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); 135 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
136 136
@@ -291,6 +291,6 @@ asmlinkage long sys_setitimer(int which,
291 return error; 291 return error;
292 292
293 if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) 293 if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
294 return -EFAULT; 294 return -EFAULT;
295 return 0; 295 return 0;
296} 296}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 25db14b89e82..aa74a1ef2da8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -17,21 +17,30 @@
17#include <linux/highmem.h> 17#include <linux/highmem.h>
18#include <linux/syscalls.h> 18#include <linux/syscalls.h>
19#include <linux/reboot.h> 19#include <linux/reboot.h>
20#include <linux/syscalls.h>
21#include <linux/ioport.h> 20#include <linux/ioport.h>
22#include <linux/hardirq.h> 21#include <linux/hardirq.h>
23#include <linux/elf.h> 22#include <linux/elf.h>
24#include <linux/elfcore.h> 23#include <linux/elfcore.h>
24#include <linux/utsrelease.h>
25#include <linux/utsname.h>
26#include <linux/numa.h>
25 27
26#include <asm/page.h> 28#include <asm/page.h>
27#include <asm/uaccess.h> 29#include <asm/uaccess.h>
28#include <asm/io.h> 30#include <asm/io.h>
29#include <asm/system.h> 31#include <asm/system.h>
30#include <asm/semaphore.h> 32#include <asm/semaphore.h>
33#include <asm/sections.h>
31 34
32/* Per cpu memory for storing cpu states in case of system crash. */ 35/* Per cpu memory for storing cpu states in case of system crash. */
33note_buf_t* crash_notes; 36note_buf_t* crash_notes;
34 37
38/* vmcoreinfo stuff */
39unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
40u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
41size_t vmcoreinfo_size;
42size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
43
35/* Location of the reserved area for the crash kernel */ 44/* Location of the reserved area for the crash kernel */
36struct resource crashk_res = { 45struct resource crashk_res = {
37 .name = "Crash kernel", 46 .name = "Crash kernel",
@@ -42,7 +51,7 @@ struct resource crashk_res = {
42 51
43int kexec_should_crash(struct task_struct *p) 52int kexec_should_crash(struct task_struct *p)
44{ 53{
45 if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops) 54 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
46 return 1; 55 return 1;
47 return 0; 56 return 0;
48} 57}
@@ -776,7 +785,7 @@ static int kimage_load_normal_segment(struct kimage *image,
776 size_t uchunk, mchunk; 785 size_t uchunk, mchunk;
777 786
778 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); 787 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
779 if (page == 0) { 788 if (!page) {
780 result = -ENOMEM; 789 result = -ENOMEM;
781 goto out; 790 goto out;
782 } 791 }
@@ -835,7 +844,7 @@ static int kimage_load_crash_segment(struct kimage *image,
835 size_t uchunk, mchunk; 844 size_t uchunk, mchunk;
836 845
837 page = pfn_to_page(maddr >> PAGE_SHIFT); 846 page = pfn_to_page(maddr >> PAGE_SHIFT);
838 if (page == 0) { 847 if (!page) {
839 result = -ENOMEM; 848 result = -ENOMEM;
840 goto out; 849 goto out;
841 } 850 }
@@ -1061,6 +1070,7 @@ void crash_kexec(struct pt_regs *regs)
1061 if (kexec_crash_image) { 1070 if (kexec_crash_image) {
1062 struct pt_regs fixed_regs; 1071 struct pt_regs fixed_regs;
1063 crash_setup_regs(&fixed_regs, regs); 1072 crash_setup_regs(&fixed_regs, regs);
1073 crash_save_vmcoreinfo();
1064 machine_crash_shutdown(&fixed_regs); 1074 machine_crash_shutdown(&fixed_regs);
1065 machine_kexec(kexec_crash_image); 1075 machine_kexec(kexec_crash_image);
1066 } 1076 }
@@ -1135,3 +1145,270 @@ static int __init crash_notes_memory_init(void)
1135 return 0; 1145 return 0;
1136} 1146}
1137module_init(crash_notes_memory_init) 1147module_init(crash_notes_memory_init)
1148
1149
1150/*
1151 * parsing the "crashkernel" commandline
1152 *
1153 * this code is intended to be called from architecture specific code
1154 */
1155
1156
1157/*
1158 * This function parses command lines in the format
1159 *
1160 * crashkernel=ramsize-range:size[,...][@offset]
1161 *
1162 * The function returns 0 on success and -EINVAL on failure.
1163 */
1164static int __init parse_crashkernel_mem(char *cmdline,
1165 unsigned long long system_ram,
1166 unsigned long long *crash_size,
1167 unsigned long long *crash_base)
1168{
1169 char *cur = cmdline, *tmp;
1170
1171 /* for each entry of the comma-separated list */
1172 do {
1173 unsigned long long start, end = ULLONG_MAX, size;
1174
1175 /* get the start of the range */
1176 start = memparse(cur, &tmp);
1177 if (cur == tmp) {
1178 pr_warning("crashkernel: Memory value expected\n");
1179 return -EINVAL;
1180 }
1181 cur = tmp;
1182 if (*cur != '-') {
1183 pr_warning("crashkernel: '-' expected\n");
1184 return -EINVAL;
1185 }
1186 cur++;
1187
1188		/* if no ':' is here, then we read the end */
1189 if (*cur != ':') {
1190 end = memparse(cur, &tmp);
1191 if (cur == tmp) {
1192 pr_warning("crashkernel: Memory "
1193 "value expected\n");
1194 return -EINVAL;
1195 }
1196 cur = tmp;
1197 if (end <= start) {
1198 pr_warning("crashkernel: end <= start\n");
1199 return -EINVAL;
1200 }
1201 }
1202
1203 if (*cur != ':') {
1204 pr_warning("crashkernel: ':' expected\n");
1205 return -EINVAL;
1206 }
1207 cur++;
1208
1209 size = memparse(cur, &tmp);
1210 if (cur == tmp) {
1211 pr_warning("Memory value expected\n");
1212 return -EINVAL;
1213 }
1214 cur = tmp;
1215 if (size >= system_ram) {
1216 pr_warning("crashkernel: invalid size\n");
1217 return -EINVAL;
1218 }
1219
1220 /* match ? */
1221 if (system_ram >= start && system_ram <= end) {
1222 *crash_size = size;
1223 break;
1224 }
1225 } while (*cur++ == ',');
1226
1227 if (*crash_size > 0) {
1228 while (*cur != ' ' && *cur != '@')
1229 cur++;
1230 if (*cur == '@') {
1231 cur++;
1232 *crash_base = memparse(cur, &tmp);
1233 if (cur == tmp) {
1234 pr_warning("Memory value expected "
1235 "after '@'\n");
1236 return -EINVAL;
1237 }
1238 }
1239 }
1240
1241 return 0;
1242}
1243
1244/*
1245 * This function parses "simple" (old) crashkernel command lines like
1246 *
1247 * crashkernel=size[@offset]
1248 *
1249 * It returns 0 on success and -EINVAL on failure.
1250 */
1251static int __init parse_crashkernel_simple(char *cmdline,
1252 unsigned long long *crash_size,
1253 unsigned long long *crash_base)
1254{
1255 char *cur = cmdline;
1256
1257 *crash_size = memparse(cmdline, &cur);
1258 if (cmdline == cur) {
1259 pr_warning("crashkernel: memory value expected\n");
1260 return -EINVAL;
1261 }
1262
1263 if (*cur == '@')
1264 *crash_base = memparse(cur+1, &cur);
1265
1266 return 0;
1267}
1268
1269/*
1270 * This function is the entry point for command line parsing and should be
1271 * called from the arch-specific code.
1272 */
1273int __init parse_crashkernel(char *cmdline,
1274 unsigned long long system_ram,
1275 unsigned long long *crash_size,
1276 unsigned long long *crash_base)
1277{
1278 char *p = cmdline, *ck_cmdline = NULL;
1279 char *first_colon, *first_space;
1280
1281 BUG_ON(!crash_size || !crash_base);
1282 *crash_size = 0;
1283 *crash_base = 0;
1284
1285 /* find crashkernel and use the last one if there are more */
1286 p = strstr(p, "crashkernel=");
1287 while (p) {
1288 ck_cmdline = p;
1289 p = strstr(p+1, "crashkernel=");
1290 }
1291
1292 if (!ck_cmdline)
1293 return -EINVAL;
1294
1295 ck_cmdline += 12; /* strlen("crashkernel=") */
1296
1297 /*
1298 * if the commandline contains a ':', then that's the extended
1299 * syntax -- if not, it must be the classic syntax
1300 */
1301 first_colon = strchr(ck_cmdline, ':');
1302 first_space = strchr(ck_cmdline, ' ');
1303 if (first_colon && (!first_space || first_colon < first_space))
1304 return parse_crashkernel_mem(ck_cmdline, system_ram,
1305 crash_size, crash_base);
1306 else
1307 return parse_crashkernel_simple(ck_cmdline, crash_size,
1308 crash_base);
1309
1310 return 0;
1311}
1312
1313
1314
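
Two concrete command lines, as examples of what the parsers above accept:
"crashkernel=64M@16M" goes through parse_crashkernel_simple(), while
"crashkernel=512M-2G:64M,2G-:128M" goes through parse_crashkernel_mem(), which
picks the entry whose range contains the system RAM size. A hedged sketch of
the arch-side caller (reserve_crashkernel and total_ram_bytes are illustrative;
only parse_crashkernel() and boot_command_line are existing kernel names):

	static void __init reserve_crashkernel(unsigned long long total_ram_bytes)
	{
		unsigned long long crash_size, crash_base;
		int ret;

		ret = parse_crashkernel(boot_command_line, total_ram_bytes,
					&crash_size, &crash_base);
		if (ret != 0 || crash_size == 0)
			return;		/* no (valid) crashkernel= option */

		/* The architecture would now reserve the region and publish
		 * it through crashk_res.start / crashk_res.end. */
	}
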
1315void crash_save_vmcoreinfo(void)
1316{
1317 u32 *buf;
1318
1319 if (!vmcoreinfo_size)
1320 return;
1321
1322 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1323
1324 buf = (u32 *)vmcoreinfo_note;
1325
1326 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1327 vmcoreinfo_size);
1328
1329 final_note(buf);
1330}
1331
1332void vmcoreinfo_append_str(const char *fmt, ...)
1333{
1334 va_list args;
1335 char buf[0x50];
1336 int r;
1337
1338 va_start(args, fmt);
1339 r = vsnprintf(buf, sizeof(buf), fmt, args);
1340 va_end(args);
1341
1342 if (r + vmcoreinfo_size > vmcoreinfo_max_size)
1343 r = vmcoreinfo_max_size - vmcoreinfo_size;
1344
1345 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1346
1347 vmcoreinfo_size += r;
1348}
1349
1350/*
1351 * provide an empty default implementation here -- architecture
1352 * code may override this
1353 */
1354void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
1355{}
1356
1357unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
1358{
1359 return __pa((unsigned long)(char *)&vmcoreinfo_note);
1360}
1361
1362static int __init crash_save_vmcoreinfo_init(void)
1363{
1364 vmcoreinfo_append_str("OSRELEASE=%s\n", init_uts_ns.name.release);
1365 vmcoreinfo_append_str("PAGESIZE=%ld\n", PAGE_SIZE);
1366
1367 VMCOREINFO_SYMBOL(init_uts_ns);
1368 VMCOREINFO_SYMBOL(node_online_map);
1369 VMCOREINFO_SYMBOL(swapper_pg_dir);
1370 VMCOREINFO_SYMBOL(_stext);
1371
1372#ifndef CONFIG_NEED_MULTIPLE_NODES
1373 VMCOREINFO_SYMBOL(mem_map);
1374 VMCOREINFO_SYMBOL(contig_page_data);
1375#endif
1376#ifdef CONFIG_SPARSEMEM
1377 VMCOREINFO_SYMBOL(mem_section);
1378 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1379 VMCOREINFO_SIZE(mem_section);
1380 VMCOREINFO_OFFSET(mem_section, section_mem_map);
1381#endif
1382 VMCOREINFO_SIZE(page);
1383 VMCOREINFO_SIZE(pglist_data);
1384 VMCOREINFO_SIZE(zone);
1385 VMCOREINFO_SIZE(free_area);
1386 VMCOREINFO_SIZE(list_head);
1387 VMCOREINFO_TYPEDEF_SIZE(nodemask_t);
1388 VMCOREINFO_OFFSET(page, flags);
1389 VMCOREINFO_OFFSET(page, _count);
1390 VMCOREINFO_OFFSET(page, mapping);
1391 VMCOREINFO_OFFSET(page, lru);
1392 VMCOREINFO_OFFSET(pglist_data, node_zones);
1393 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1394#ifdef CONFIG_FLAT_NODE_MEM_MAP
1395 VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1396#endif
1397 VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1398 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1399 VMCOREINFO_OFFSET(pglist_data, node_id);
1400 VMCOREINFO_OFFSET(zone, free_area);
1401 VMCOREINFO_OFFSET(zone, vm_stat);
1402 VMCOREINFO_OFFSET(zone, spanned_pages);
1403 VMCOREINFO_OFFSET(free_area, free_list);
1404 VMCOREINFO_OFFSET(list_head, next);
1405 VMCOREINFO_OFFSET(list_head, prev);
1406 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1407 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1408
1409 arch_crash_save_vmcoreinfo();
1410
1411 return 0;
1412}
1413
1414module_init(crash_save_vmcoreinfo_init)
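
crash_save_vmcoreinfo_init() above records the generic symbols, sizes and
offsets for the dump tools; architectures contribute their own entries through
the weak arch_crash_save_vmcoreinfo() hook. A hedged sketch of such an
override (the NUMA symbols are one example of what an architecture might
export, not a requirement of this patch):

	void arch_crash_save_vmcoreinfo(void)
	{
	#ifdef CONFIG_NUMA
		VMCOREINFO_SYMBOL(node_data);
		VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
	#endif
	}
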
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 4b8a4493c541..e3a5d817ac9b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -64,7 +64,6 @@
64 64
65static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 65static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
66static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 66static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
67static atomic_t kprobe_count;
68 67
69/* NOTE: change this value only with kprobe_mutex held */ 68/* NOTE: change this value only with kprobe_mutex held */
70static bool kprobe_enabled; 69static bool kprobe_enabled;
@@ -73,11 +72,6 @@ DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
73DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 72DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
74static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
75 74
76static struct notifier_block kprobe_page_fault_nb = {
77 .notifier_call = kprobe_exceptions_notify,
78 .priority = 0x7fffffff /* we need to notified first */
79};
80
81#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 75#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
82/* 76/*
83 * kprobe->ainsn.insn points to the copy of the instruction to be 77 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -556,8 +550,6 @@ static int __kprobes __register_kprobe(struct kprobe *p,
556 old_p = get_kprobe(p->addr); 550 old_p = get_kprobe(p->addr);
557 if (old_p) { 551 if (old_p) {
558 ret = register_aggr_kprobe(old_p, p); 552 ret = register_aggr_kprobe(old_p, p);
559 if (!ret)
560 atomic_inc(&kprobe_count);
561 goto out; 553 goto out;
562 } 554 }
563 555
@@ -569,13 +561,9 @@ static int __kprobes __register_kprobe(struct kprobe *p,
569 hlist_add_head_rcu(&p->hlist, 561 hlist_add_head_rcu(&p->hlist,
570 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 562 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
571 563
572 if (kprobe_enabled) { 564 if (kprobe_enabled)
573 if (atomic_add_return(1, &kprobe_count) == \
574 (ARCH_INACTIVE_KPROBE_COUNT + 1))
575 register_page_fault_notifier(&kprobe_page_fault_nb);
576
577 arch_arm_kprobe(p); 565 arch_arm_kprobe(p);
578 } 566
579out: 567out:
580 mutex_unlock(&kprobe_mutex); 568 mutex_unlock(&kprobe_mutex);
581 569
@@ -658,16 +646,6 @@ valid_p:
658 } 646 }
659 mutex_unlock(&kprobe_mutex); 647 mutex_unlock(&kprobe_mutex);
660 } 648 }
661
662 /* Call unregister_page_fault_notifier()
663 * if no probes are active
664 */
665 mutex_lock(&kprobe_mutex);
666 if (atomic_add_return(-1, &kprobe_count) == \
667 ARCH_INACTIVE_KPROBE_COUNT)
668 unregister_page_fault_notifier(&kprobe_page_fault_nb);
669 mutex_unlock(&kprobe_mutex);
670 return;
671} 649}
672 650
673static struct notifier_block kprobe_exceptions_nb = { 651static struct notifier_block kprobe_exceptions_nb = {
@@ -738,6 +716,18 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
738 int ret = 0; 716 int ret = 0;
739 struct kretprobe_instance *inst; 717 struct kretprobe_instance *inst;
740 int i; 718 int i;
719 void *addr = rp->kp.addr;
720
721 if (kretprobe_blacklist_size) {
722 if (addr == NULL)
723 kprobe_lookup_name(rp->kp.symbol_name, addr);
724 addr += rp->kp.offset;
725
726 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
727 if (kretprobe_blacklist[i].addr == addr)
728 return -EINVAL;
729 }
730 }
741 731
742 rp->kp.pre_handler = pre_handler_kretprobe; 732 rp->kp.pre_handler = pre_handler_kretprobe;
743 rp->kp.post_handler = NULL; 733 rp->kp.post_handler = NULL;
@@ -815,7 +805,17 @@ static int __init init_kprobes(void)
815 INIT_HLIST_HEAD(&kprobe_table[i]); 805 INIT_HLIST_HEAD(&kprobe_table[i]);
816 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 806 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
817 } 807 }
818 atomic_set(&kprobe_count, 0); 808
809 if (kretprobe_blacklist_size) {
810 /* lookup the function address from its name */
811 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
812 kprobe_lookup_name(kretprobe_blacklist[i].name,
813 kretprobe_blacklist[i].addr);
814 if (!kretprobe_blacklist[i].addr)
815 printk("kretprobe: lookup failed: %s\n",
816 kretprobe_blacklist[i].name);
817 }
818 }
819 819
820 /* By default, kprobes are enabled */ 820 /* By default, kprobes are enabled */
821 kprobe_enabled = true; 821 kprobe_enabled = true;
@@ -921,13 +921,6 @@ static void __kprobes enable_all_kprobes(void)
921 if (kprobe_enabled) 921 if (kprobe_enabled)
922 goto already_enabled; 922 goto already_enabled;
923 923
924 /*
925 * Re-register the page fault notifier only if there are any
926 * active probes at the time of enabling kprobes globally
927 */
928 if (atomic_read(&kprobe_count) > ARCH_INACTIVE_KPROBE_COUNT)
929 register_page_fault_notifier(&kprobe_page_fault_nb);
930
931 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 924 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
932 head = &kprobe_table[i]; 925 head = &kprobe_table[i];
933 hlist_for_each_entry_rcu(p, node, head, hlist) 926 hlist_for_each_entry_rcu(p, node, head, hlist)
@@ -968,10 +961,7 @@ static void __kprobes disable_all_kprobes(void)
968 mutex_unlock(&kprobe_mutex); 961 mutex_unlock(&kprobe_mutex);
969 /* Allow all currently running kprobes to complete */ 962 /* Allow all currently running kprobes to complete */
970 synchronize_sched(); 963 synchronize_sched();
971 964 return;
972 mutex_lock(&kprobe_mutex);
973 /* Unconditionally unregister the page_fault notifier */
974 unregister_page_fault_notifier(&kprobe_page_fault_nb);
975 965
976already_disabled: 966already_disabled:
977 mutex_unlock(&kprobe_mutex); 967 mutex_unlock(&kprobe_mutex);
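
The kprobes changes above drop the kprobe_count/page-fault-notifier
bookkeeping and introduce a per-architecture kretprobe blacklist that
init_kprobes() resolves to addresses and register_kretprobe() consults. A
hedged sketch of the arch-side table (the struct tag and the blacklisted
symbol are assumptions; only the .name/.addr fields and the NULL-name
terminator are implied by the code above):

	#include <linux/kprobes.h>

	struct kretprobe_blackpoint kretprobe_blacklist[] = {
		{"__switch_to", NULL},	/* e.g. a stack-switching path */
		{NULL, NULL}		/* terminator for the lookup loops */
	};
	int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
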
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d0e5c48e18c7..65daa5373ca6 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/sched.h>
17 18
18#define KERNEL_ATTR_RO(_name) \ 19#define KERNEL_ATTR_RO(_name) \
19static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 20static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -60,6 +61,15 @@ static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page)
60 return sprintf(page, "%d\n", !!kexec_crash_image); 61 return sprintf(page, "%d\n", !!kexec_crash_image);
61} 62}
62KERNEL_ATTR_RO(kexec_crash_loaded); 63KERNEL_ATTR_RO(kexec_crash_loaded);
64
65static ssize_t vmcoreinfo_show(struct kset *kset, char *page)
66{
67 return sprintf(page, "%lx %x\n",
68 paddr_vmcoreinfo_note(),
69 (unsigned int)vmcoreinfo_max_size);
70}
71KERNEL_ATTR_RO(vmcoreinfo);
72
63#endif /* CONFIG_KEXEC */ 73#endif /* CONFIG_KEXEC */
64 74
65/* 75/*
@@ -95,6 +105,7 @@ static struct attribute * kernel_attrs[] = {
95#ifdef CONFIG_KEXEC 105#ifdef CONFIG_KEXEC
96 &kexec_loaded_attr.attr, 106 &kexec_loaded_attr.attr,
97 &kexec_crash_loaded_attr.attr, 107 &kexec_crash_loaded_attr.attr,
108 &vmcoreinfo_attr.attr,
98#endif 109#endif
99 NULL 110 NULL
100}; 111};
@@ -116,6 +127,13 @@ static int __init ksysfs_init(void)
116 &notes_attr); 127 &notes_attr);
117 } 128 }
118 129
130 /*
131 * Create "/sys/kernel/uids" directory and corresponding root user's
132 * directory under it.
133 */
134 if (!error)
135 error = uids_kobject_init();
136
119 return error; 137 return error;
120} 138}
121 139
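
The new /sys/kernel/vmcoreinfo attribute exports the physical address and the
maximum size of the note in the "%lx %x" format shown above. A small
user-space sketch of a consumer (a crash-dump tool would read this to locate
the note; the program itself is illustrative):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long paddr;
		unsigned int size;
		FILE *f = fopen("/sys/kernel/vmcoreinfo", "r");

		if (!f)
			return 1;
		if (fscanf(f, "%llx %x", &paddr, &size) != 2) {
			fclose(f);
			return 1;
		}
		fclose(f);
		printf("vmcoreinfo note at 0x%llx, max %u bytes\n", paddr, size);
		return 0;
	}
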
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 734da579ad13..55fe0c7cd95f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -511,11 +511,11 @@ static void lockdep_print_held_locks(struct task_struct *curr)
511 int i, depth = curr->lockdep_depth; 511 int i, depth = curr->lockdep_depth;
512 512
513 if (!depth) { 513 if (!depth) {
514 printk("no locks held by %s/%d.\n", curr->comm, curr->pid); 514 printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
515 return; 515 return;
516 } 516 }
517 printk("%d lock%s held by %s/%d:\n", 517 printk("%d lock%s held by %s/%d:\n",
518 depth, depth > 1 ? "s" : "", curr->comm, curr->pid); 518 depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr));
519 519
520 for (i = 0; i < depth; i++) { 520 for (i = 0; i < depth; i++) {
521 printk(" #%d: ", i); 521 printk(" #%d: ", i);
@@ -904,7 +904,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
904 print_kernel_version(); 904 print_kernel_version();
905 printk( "-------------------------------------------------------\n"); 905 printk( "-------------------------------------------------------\n");
906 printk("%s/%d is trying to acquire lock:\n", 906 printk("%s/%d is trying to acquire lock:\n",
907 curr->comm, curr->pid); 907 curr->comm, task_pid_nr(curr));
908 print_lock(check_source); 908 print_lock(check_source);
909 printk("\nbut task is already holding lock:\n"); 909 printk("\nbut task is already holding lock:\n");
910 print_lock(check_target); 910 print_lock(check_target);
@@ -1085,7 +1085,7 @@ print_bad_irq_dependency(struct task_struct *curr,
1085 print_kernel_version(); 1085 print_kernel_version();
1086 printk( "------------------------------------------------------\n"); 1086 printk( "------------------------------------------------------\n");
1087 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1087 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1088 curr->comm, curr->pid, 1088 curr->comm, task_pid_nr(curr),
1089 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1089 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
1090 curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, 1090 curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
1091 curr->hardirqs_enabled, 1091 curr->hardirqs_enabled,
@@ -1237,7 +1237,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1237 print_kernel_version(); 1237 print_kernel_version();
1238 printk( "---------------------------------------------\n"); 1238 printk( "---------------------------------------------\n");
1239 printk("%s/%d is trying to acquire lock:\n", 1239 printk("%s/%d is trying to acquire lock:\n",
1240 curr->comm, curr->pid); 1240 curr->comm, task_pid_nr(curr));
1241 print_lock(next); 1241 print_lock(next);
1242 printk("\nbut task is already holding lock:\n"); 1242 printk("\nbut task is already holding lock:\n");
1243 print_lock(prev); 1243 print_lock(prev);
@@ -1521,7 +1521,7 @@ cache_hit:
1521} 1521}
1522 1522
1523static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, 1523static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1524 struct held_lock *hlock, int chain_head) 1524 struct held_lock *hlock, int chain_head, u64 chain_key)
1525{ 1525{
1526 /* 1526 /*
1527 * Trylock needs to maintain the stack of held locks, but it 1527 * Trylock needs to maintain the stack of held locks, but it
@@ -1534,7 +1534,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1534 * graph_lock for us) 1534 * graph_lock for us)
1535 */ 1535 */
1536 if (!hlock->trylock && (hlock->check == 2) && 1536 if (!hlock->trylock && (hlock->check == 2) &&
1537 lookup_chain_cache(curr->curr_chain_key, hlock->class)) { 1537 lookup_chain_cache(chain_key, hlock->class)) {
1538 /* 1538 /*
1539 * Check whether last held lock: 1539 * Check whether last held lock:
1540 * 1540 *
@@ -1576,7 +1576,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1576#else 1576#else
1577static inline int validate_chain(struct task_struct *curr, 1577static inline int validate_chain(struct task_struct *curr,
1578 struct lockdep_map *lock, struct held_lock *hlock, 1578 struct lockdep_map *lock, struct held_lock *hlock,
1579 int chain_head) 1579 int chain_head, u64 chain_key)
1580{ 1580{
1581 return 1; 1581 return 1;
1582} 1582}
@@ -1641,7 +1641,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
1641 usage_str[prev_bit], usage_str[new_bit]); 1641 usage_str[prev_bit], usage_str[new_bit]);
1642 1642
1643 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", 1643 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
1644 curr->comm, curr->pid, 1644 curr->comm, task_pid_nr(curr),
1645 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, 1645 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
1646 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, 1646 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
1647 trace_hardirqs_enabled(curr), 1647 trace_hardirqs_enabled(curr),
@@ -1694,7 +1694,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1694 print_kernel_version(); 1694 print_kernel_version();
1695 printk( "---------------------------------------------------------\n"); 1695 printk( "---------------------------------------------------------\n");
1696 printk("%s/%d just changed the state of lock:\n", 1696 printk("%s/%d just changed the state of lock:\n",
1697 curr->comm, curr->pid); 1697 curr->comm, task_pid_nr(curr));
1698 print_lock(this); 1698 print_lock(this);
1699 if (forwards) 1699 if (forwards)
1700 printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); 1700 printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
@@ -2450,11 +2450,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2450 chain_head = 1; 2450 chain_head = 1;
2451 } 2451 }
2452 chain_key = iterate_chain_key(chain_key, id); 2452 chain_key = iterate_chain_key(chain_key, id);
2453 curr->curr_chain_key = chain_key;
2454 2453
2455 if (!validate_chain(curr, lock, hlock, chain_head)) 2454 if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
2456 return 0; 2455 return 0;
2457 2456
2457 curr->curr_chain_key = chain_key;
2458 curr->lockdep_depth++; 2458 curr->lockdep_depth++;
2459 check_chain_key(curr); 2459 check_chain_key(curr);
2460#ifdef CONFIG_DEBUG_LOCKDEP 2460#ifdef CONFIG_DEBUG_LOCKDEP
@@ -2487,7 +2487,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
2487 printk( "[ BUG: bad unlock balance detected! ]\n"); 2487 printk( "[ BUG: bad unlock balance detected! ]\n");
2488 printk( "-------------------------------------\n"); 2488 printk( "-------------------------------------\n");
2489 printk("%s/%d is trying to release lock (", 2489 printk("%s/%d is trying to release lock (",
2490 curr->comm, curr->pid); 2490 curr->comm, task_pid_nr(curr));
2491 print_lockdep_cache(lock); 2491 print_lockdep_cache(lock);
2492 printk(") at:\n"); 2492 printk(") at:\n");
2493 print_ip_sym(ip); 2493 print_ip_sym(ip);
@@ -2737,7 +2737,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
2737 printk( "[ BUG: bad contention detected! ]\n"); 2737 printk( "[ BUG: bad contention detected! ]\n");
2738 printk( "---------------------------------\n"); 2738 printk( "---------------------------------\n");
2739 printk("%s/%d is trying to contend lock (", 2739 printk("%s/%d is trying to contend lock (",
2740 curr->comm, curr->pid); 2740 curr->comm, task_pid_nr(curr));
2741 print_lockdep_cache(lock); 2741 print_lockdep_cache(lock);
2742 printk(") at:\n"); 2742 printk(") at:\n");
2743 print_ip_sym(ip); 2743 print_ip_sym(ip);
@@ -3072,7 +3072,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3072 printk( "[ BUG: held lock freed! ]\n"); 3072 printk( "[ BUG: held lock freed! ]\n");
3073 printk( "-------------------------\n"); 3073 printk( "-------------------------\n");
3074 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 3074 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
3075 curr->comm, curr->pid, mem_from, mem_to-1); 3075 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
3076 print_lock(hlock); 3076 print_lock(hlock);
3077 lockdep_print_held_locks(curr); 3077 lockdep_print_held_locks(curr);
3078 3078
@@ -3125,7 +3125,7 @@ static void print_held_locks_bug(struct task_struct *curr)
3125 printk( "[ BUG: lock held at task exit time! ]\n"); 3125 printk( "[ BUG: lock held at task exit time! ]\n");
3126 printk( "-------------------------------------\n"); 3126 printk( "-------------------------------------\n");
3127 printk("%s/%d is exiting with locks still held!\n", 3127 printk("%s/%d is exiting with locks still held!\n",
3128 curr->comm, curr->pid); 3128 curr->comm, task_pid_nr(curr));
3129 lockdep_print_held_locks(curr); 3129 lockdep_print_held_locks(curr);
3130 3130
3131 printk("\nstack backtrace:\n"); 3131 printk("\nstack backtrace:\n");
@@ -3199,3 +3199,19 @@ void debug_show_held_locks(struct task_struct *task)
3199} 3199}
3200 3200
3201EXPORT_SYMBOL_GPL(debug_show_held_locks); 3201EXPORT_SYMBOL_GPL(debug_show_held_locks);
3202
3203void lockdep_sys_exit(void)
3204{
3205 struct task_struct *curr = current;
3206
3207 if (unlikely(curr->lockdep_depth)) {
3208 if (!debug_locks_off())
3209 return;
3210 printk("\n================================================\n");
3211 printk( "[ BUG: lock held when returning to user space! ]\n");
3212 printk( "------------------------------------------------\n");
3213 printk("%s/%d is leaving the kernel with locks still held!\n",
3214 curr->comm, curr->pid);
3215 lockdep_print_held_locks(curr);
3216 }
3217}
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index c851b2dcc685..8a135bd163c2 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,28 +25,38 @@
25 25
26static void *l_next(struct seq_file *m, void *v, loff_t *pos) 26static void *l_next(struct seq_file *m, void *v, loff_t *pos)
27{ 27{
28 struct lock_class *class = v; 28 struct lock_class *class;
29 29
30 (*pos)++; 30 (*pos)++;
31 31
32 if (class->lock_entry.next != &all_lock_classes) 32 if (v == SEQ_START_TOKEN)
33 class = list_entry(class->lock_entry.next, struct lock_class, 33 class = m->private;
34 lock_entry); 34 else {
35 else 35 class = v;
36 class = NULL; 36
37 m->private = class; 37 if (class->lock_entry.next != &all_lock_classes)
38 class = list_entry(class->lock_entry.next,
39 struct lock_class, lock_entry);
40 else
41 class = NULL;
42 }
38 43
39 return class; 44 return class;
40} 45}
41 46
42static void *l_start(struct seq_file *m, loff_t *pos) 47static void *l_start(struct seq_file *m, loff_t *pos)
43{ 48{
44 struct lock_class *class = m->private; 49 struct lock_class *class;
50 loff_t i = 0;
45 51
46 if (&class->lock_entry == all_lock_classes.next) 52 if (*pos == 0)
47 seq_printf(m, "all lock classes:\n"); 53 return SEQ_START_TOKEN;
48 54
49 return class; 55 list_for_each_entry(class, &all_lock_classes, lock_entry) {
56 if (++i == *pos)
57 return class;
58 }
59 return NULL;
50} 60}
51 61
52static void l_stop(struct seq_file *m, void *v) 62static void l_stop(struct seq_file *m, void *v)
@@ -101,10 +111,15 @@ static void print_name(struct seq_file *m, struct lock_class *class)
101static int l_show(struct seq_file *m, void *v) 111static int l_show(struct seq_file *m, void *v)
102{ 112{
103 unsigned long nr_forward_deps, nr_backward_deps; 113 unsigned long nr_forward_deps, nr_backward_deps;
104 struct lock_class *class = m->private; 114 struct lock_class *class = v;
105 struct lock_list *entry; 115 struct lock_list *entry;
106 char c1, c2, c3, c4; 116 char c1, c2, c3, c4;
107 117
118 if (v == SEQ_START_TOKEN) {
119 seq_printf(m, "all lock classes:\n");
120 return 0;
121 }
122
108 seq_printf(m, "%p", class->key); 123 seq_printf(m, "%p", class->key);
109#ifdef CONFIG_DEBUG_LOCKDEP 124#ifdef CONFIG_DEBUG_LOCKDEP
110 seq_printf(m, " OPS:%8ld", class->ops); 125 seq_printf(m, " OPS:%8ld", class->ops);
@@ -523,10 +538,11 @@ static void *ls_start(struct seq_file *m, loff_t *pos)
523{ 538{
524 struct lock_stat_seq *data = m->private; 539 struct lock_stat_seq *data = m->private;
525 540
526 if (data->iter == data->stats) 541 if (*pos == 0)
527 seq_header(m); 542 return SEQ_START_TOKEN;
528 543
529 if (data->iter == data->iter_end) 544 data->iter = data->stats + *pos;
545 if (data->iter >= data->iter_end)
530 data->iter = NULL; 546 data->iter = NULL;
531 547
532 return data->iter; 548 return data->iter;
@@ -538,8 +554,13 @@ static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
538 554
539 (*pos)++; 555 (*pos)++;
540 556
541 data->iter = v; 557 if (v == SEQ_START_TOKEN)
542 data->iter++; 558 data->iter = data->stats;
559 else {
560 data->iter = v;
561 data->iter++;
562 }
563
543 if (data->iter == data->iter_end) 564 if (data->iter == data->iter_end)
544 data->iter = NULL; 565 data->iter = NULL;
545 566
@@ -552,9 +573,11 @@ static void ls_stop(struct seq_file *m, void *v)
552 573
553static int ls_show(struct seq_file *m, void *v) 574static int ls_show(struct seq_file *m, void *v)
554{ 575{
555 struct lock_stat_seq *data = m->private; 576 if (v == SEQ_START_TOKEN)
577 seq_header(m);
578 else
579 seq_stats(m, v);
556 580
557 seq_stats(m, data->iter);
558 return 0; 581 return 0;
559} 582}
560 583
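
Both lockdep_proc iterators are converted to the standard seq_file idiom:
->start() returns SEQ_START_TOKEN for position 0 so that the header is emitted
by ->show() instead of being special-cased on a data pointer. A minimal
stand-alone sketch of that idiom (the demo_ names are illustrative):

	#include <linux/seq_file.h>

	static void *demo_start(struct seq_file *m, loff_t *pos)
	{
		if (*pos == 0)
			return SEQ_START_TOKEN;	/* ->show() prints the header */
		return NULL;			/* header-only example */
	}

	static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
	{
		(*pos)++;
		return NULL;
	}

	static void demo_stop(struct seq_file *m, void *v)
	{
	}

	static int demo_show(struct seq_file *m, void *v)
	{
		if (v == SEQ_START_TOKEN)
			seq_printf(m, "demo header\n");
		return 0;
	}

	static const struct seq_operations demo_ops = {
		.start	= demo_start,
		.next	= demo_next,
		.stop	= demo_stop,
		.show	= demo_show,
	};
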
diff --git a/kernel/marker.c b/kernel/marker.c
new file mode 100644
index 000000000000..ccb48d9a3657
--- /dev/null
+++ b/kernel/marker.c
@@ -0,0 +1,525 @@
1/*
2 * Copyright (C) 2007 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/marker.h>
25#include <linux/err.h>
26
27extern struct marker __start___markers[];
28extern struct marker __stop___markers[];
29
30/*
31 * module_mutex nests inside markers_mutex. Markers mutex protects the builtin
32 * and module markers, the hash table and deferred_sync.
33 */
34static DEFINE_MUTEX(markers_mutex);
35
36/*
37 * Marker deferred synchronization.
38 * Upon marker probe_unregister, we delay the call to synchronize_sched() to
39 * accelerate mass unregistration (only when there is no more reference to a
40 * given module do we call synchronize_sched()). However, we need to make sure
41 * every critical region has ended before we re-arm a marker that has been
42 * unregistered and then registered back with a different probe data.
43 */
44static int deferred_sync;
45
46/*
47 * Marker hash table, containing the active markers.
48 * Protected by module_mutex.
49 */
50#define MARKER_HASH_BITS 6
51#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
52
53struct marker_entry {
54 struct hlist_node hlist;
55 char *format;
56 marker_probe_func *probe;
57 void *private;
58 int refcount; /* Number of times armed. 0 if disarmed. */
59 char name[0]; /* Contains name'\0'format'\0' */
60};
61
62static struct hlist_head marker_table[MARKER_TABLE_SIZE];
63
64/**
65 * __mark_empty_function - Empty probe callback
66 * @mdata: pointer of type const struct marker
67 * @fmt: format string
68 * @...: variable argument list
69 *
70 * Empty callback provided as a probe to the markers. By providing this to a
71 * disabled marker, we make sure the execution flow is always valid even
72 * though the function pointer change and the marker enabling are two distinct
73 * operations that modify the execution flow of preemptible code.
74 */
75void __mark_empty_function(const struct marker *mdata, void *private,
76 const char *fmt, ...)
77{
78}
79EXPORT_SYMBOL_GPL(__mark_empty_function);
80
81/*
82 * Get marker if the marker is present in the marker hash table.
83 * Must be called with markers_mutex held.
84 * Returns NULL if not present.
85 */
86static struct marker_entry *get_marker(const char *name)
87{
88 struct hlist_head *head;
89 struct hlist_node *node;
90 struct marker_entry *e;
91 u32 hash = jhash(name, strlen(name), 0);
92
93 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
94 hlist_for_each_entry(e, node, head, hlist) {
95 if (!strcmp(name, e->name))
96 return e;
97 }
98 return NULL;
99}
100
101/*
102 * Add the marker to the marker hash table. Must be called with markers_mutex
103 * held.
104 */
105static int add_marker(const char *name, const char *format,
106 marker_probe_func *probe, void *private)
107{
108 struct hlist_head *head;
109 struct hlist_node *node;
110 struct marker_entry *e;
111 size_t name_len = strlen(name) + 1;
112 size_t format_len = 0;
113 u32 hash = jhash(name, name_len-1, 0);
114
115 if (format)
116 format_len = strlen(format) + 1;
117 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
118 hlist_for_each_entry(e, node, head, hlist) {
119 if (!strcmp(name, e->name)) {
120 printk(KERN_NOTICE
121 "Marker %s busy, probe %p already installed\n",
122 name, e->probe);
123 return -EBUSY; /* Already there */
124 }
125 }
126 /*
127 * Using kmalloc here to allocate a variable length element. Could
128 * cause some memory fragmentation if overused.
129 */
130 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
131 GFP_KERNEL);
132 if (!e)
133 return -ENOMEM;
134 memcpy(&e->name[0], name, name_len);
135 if (format) {
136 e->format = &e->name[name_len];
137 memcpy(e->format, format, format_len);
138 trace_mark(core_marker_format, "name %s format %s",
139 e->name, e->format);
140 } else
141 e->format = NULL;
142 e->probe = probe;
143 e->private = private;
144 e->refcount = 0;
145 hlist_add_head(&e->hlist, head);
146 return 0;
147}
148
149/*
150 * Remove the marker from the marker hash table. Must be called with mutex_lock
151 * held.
152 */
153static void *remove_marker(const char *name)
154{
155 struct hlist_head *head;
156 struct hlist_node *node;
157 struct marker_entry *e;
158 int found = 0;
159 size_t len = strlen(name) + 1;
160 void *private = NULL;
161 u32 hash = jhash(name, len-1, 0);
162
163 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
164 hlist_for_each_entry(e, node, head, hlist) {
165 if (!strcmp(name, e->name)) {
166 found = 1;
167 break;
168 }
169 }
170 if (found) {
171 private = e->private;
172 hlist_del(&e->hlist);
173 kfree(e);
174 }
175 return private;
176}
177
178/*
179 * Set the mark_entry format to the format found in the element.
180 */
181static int marker_set_format(struct marker_entry **entry, const char *format)
182{
183 struct marker_entry *e;
184 size_t name_len = strlen((*entry)->name) + 1;
185 size_t format_len = strlen(format) + 1;
186
187 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
188 GFP_KERNEL);
189 if (!e)
190 return -ENOMEM;
191 memcpy(&e->name[0], (*entry)->name, name_len);
192 e->format = &e->name[name_len];
193 memcpy(e->format, format, format_len);
194 e->probe = (*entry)->probe;
195 e->private = (*entry)->private;
196 e->refcount = (*entry)->refcount;
197 hlist_add_before(&e->hlist, &(*entry)->hlist);
198 hlist_del(&(*entry)->hlist);
199 kfree(*entry);
200 *entry = e;
201 trace_mark(core_marker_format, "name %s format %s",
202 e->name, e->format);
203 return 0;
204}
205
206/*
207 * Sets the probe callback corresponding to one marker.
208 */
209static int set_marker(struct marker_entry **entry, struct marker *elem)
210{
211 int ret;
212 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
213
214 if ((*entry)->format) {
215 if (strcmp((*entry)->format, elem->format) != 0) {
216 printk(KERN_NOTICE
217 "Format mismatch for probe %s "
218 "(%s), marker (%s)\n",
219 (*entry)->name,
220 (*entry)->format,
221 elem->format);
222 return -EPERM;
223 }
224 } else {
225 ret = marker_set_format(entry, elem->format);
226 if (ret)
227 return ret;
228 }
229 elem->call = (*entry)->probe;
230 elem->private = (*entry)->private;
231 elem->state = 1;
232 return 0;
233}
234
235/*
236 * Disable a marker and its probe callback.
237 * Note: only a synchronize_sched() issued after setting elem->call to the
238 * empty function ensures that the original callback is not used anymore.
239 * This is guaranteed by the preemption disabling around the call site.
240 */
241static void disable_marker(struct marker *elem)
242{
243 elem->state = 0;
244 elem->call = __mark_empty_function;
245 /*
246 * Leave the private data and id there, because removal is racy and
247 * should be done only after a synchronize_sched(). These are never used
248 * until the next initialization anyway.
249 */
250}
251
252/**
253 * marker_update_probe_range - Update a probe range
254 * @begin: beginning of the range
255 * @end: end of the range
256 * @probe_module: module address of the probe being updated
257 * @refcount: number of references left to the given probe_module (out)
258 *
259 * Updates the probe callback corresponding to a range of markers.
260 * Must be called with markers_mutex held.
261 */
262void marker_update_probe_range(struct marker *begin,
263 struct marker *end, struct module *probe_module,
264 int *refcount)
265{
266 struct marker *iter;
267 struct marker_entry *mark_entry;
268
269 for (iter = begin; iter < end; iter++) {
270 mark_entry = get_marker(iter->name);
271 if (mark_entry && mark_entry->refcount) {
272 set_marker(&mark_entry, iter);
273 /*
274 * ignore error, continue
275 */
276 if (probe_module)
277 if (probe_module ==
278 __module_text_address((unsigned long)mark_entry->probe))
279 (*refcount)++;
280 } else {
281 disable_marker(iter);
282 }
283 }
284}
285
286/*
287 * Update probes, removing the faulty probes.
288 * Issues a synchronize_sched() when no reference to the module passed
289 * as parameter is found in the probes so the probe module can be
290 * safely unloaded from now on.
291 */
292static void marker_update_probes(struct module *probe_module)
293{
294 int refcount = 0;
295
296 mutex_lock(&markers_mutex);
297 /* Core kernel markers */
298 marker_update_probe_range(__start___markers,
299 __stop___markers, probe_module, &refcount);
300 /* Markers in modules. */
301 module_update_markers(probe_module, &refcount);
302 if (probe_module && refcount == 0) {
303 synchronize_sched();
304 deferred_sync = 0;
305 }
306 mutex_unlock(&markers_mutex);
307}
308
309/**
310 * marker_probe_register - Connect a probe to a marker
311 * @name: marker name
312 * @format: format string
313 * @probe: probe handler
314 * @private: probe private data
315 *
316 * The private data must be a valid allocated memory address, or NULL.
317 * Returns 0 if ok, error value on error.
318 */
319int marker_probe_register(const char *name, const char *format,
320 marker_probe_func *probe, void *private)
321{
322 struct marker_entry *entry;
323 int ret = 0, need_update = 0;
324
325 mutex_lock(&markers_mutex);
326 entry = get_marker(name);
327 if (entry && entry->refcount) {
328 ret = -EBUSY;
329 goto end;
330 }
331 if (deferred_sync) {
332 synchronize_sched();
333 deferred_sync = 0;
334 }
335 ret = add_marker(name, format, probe, private);
336 if (ret)
337 goto end;
338 need_update = 1;
339end:
340 mutex_unlock(&markers_mutex);
341 if (need_update)
342 marker_update_probes(NULL);
343 return ret;
344}
345EXPORT_SYMBOL_GPL(marker_probe_register);
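
A minimal sketch of how a module might use this interface; the marker name
"subsys_event", the format string, and the probe body are illustrative
assumptions, and the probe prototype is the marker_probe_func signature
assumed from this series (marker metadata, private data, format, varargs):

#include <linux/module.h>
#include <linux/marker.h>

static atomic_t hits = ATOMIC_INIT(0);

/* Called from armed trace_mark() sites; may run in atomic context. */
static void probe_subsys_event(const struct marker *mdata, void *private,
			       const char *fmt, ...)
{
	atomic_inc(&hits);
}

static int __init probe_init(void)
{
	int ret;

	/* The format must match the trace_mark() site exactly. */
	ret = marker_probe_register("subsys_event", "value %d",
				    probe_subsys_event, &hits);
	if (ret)
		return ret;
	ret = marker_arm("subsys_event");	/* 0 -> 1 enables the call */
	if (ret)
		marker_probe_unregister("subsys_event");
	return ret;
}

static void __exit probe_exit(void)
{
	marker_disarm("subsys_event");
	marker_probe_unregister("subsys_event");
}

module_init(probe_init);
module_exit(probe_exit);
MODULE_LICENSE("GPL");

The instrumented code would contain a matching site such as
trace_mark(subsys_event, "value %d", val);, mirroring the
trace_mark(core_marker_format, ...) call used in marker_set_format() above.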
346
347/**
348 * marker_probe_unregister - Disconnect a probe from a marker
349 * @name: marker name
350 *
351 * Returns the private data given to marker_probe_register, or an ERR_PTR().
352 */
353void *marker_probe_unregister(const char *name)
354{
355 struct module *probe_module;
356 struct marker_entry *entry;
357 void *private;
358 int need_update = 0;
359
360 mutex_lock(&markers_mutex);
361 entry = get_marker(name);
362 if (!entry) {
363 private = ERR_PTR(-ENOENT);
364 goto end;
365 }
366 entry->refcount = 0;
367 /* In which module is the probe handler? */
368 probe_module = __module_text_address((unsigned long)entry->probe);
369 private = remove_marker(name);
370 deferred_sync = 1;
371 need_update = 1;
372end:
373 mutex_unlock(&markers_mutex);
374 if (need_update)
375 marker_update_probes(probe_module);
376 return private;
377}
378EXPORT_SYMBOL_GPL(marker_probe_unregister);
379
380/**
381 * marker_probe_unregister_private_data - Disconnect a probe from a marker
382 * @private: probe private data
383 *
384 * Unregister a marker by providing the registered private data.
385 * Returns the private data given to marker_probe_register, or an ERR_PTR().
386 */
387void *marker_probe_unregister_private_data(void *private)
388{
389 struct module *probe_module;
390 struct hlist_head *head;
391 struct hlist_node *node;
392 struct marker_entry *entry;
393 int found = 0;
394 unsigned int i;
395 int need_update = 0;
396
397 mutex_lock(&markers_mutex);
398 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
399 head = &marker_table[i];
400 hlist_for_each_entry(entry, node, head, hlist) {
401 if (entry->private == private) {
402 found = 1;
403 goto iter_end;
404 }
405 }
406 }
407iter_end:
408 if (!found) {
409 private = ERR_PTR(-ENOENT);
410 goto end;
411 }
412 entry->refcount = 0;
413 /* In which module is the probe handler? */
414 probe_module = __module_text_address((unsigned long)entry->probe);
415 private = remove_marker(entry->name);
416 deferred_sync = 1;
417 need_update = 1;
418end:
419 mutex_unlock(&markers_mutex);
420 if (need_update)
421 marker_update_probes(probe_module);
422 return private;
423}
424EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
425
426/**
427 * marker_arm - Arm a marker
428 * @name: marker name
429 *
430 * Activate a marker. A reference count tracks how many times the marker
431 * has been armed and disarmed.
432 * Returns 0 if ok, error value on error.
433 */
434int marker_arm(const char *name)
435{
436 struct marker_entry *entry;
437 int ret = 0, need_update = 0;
438
439 mutex_lock(&markers_mutex);
440 entry = get_marker(name);
441 if (!entry) {
442 ret = -ENOENT;
443 goto end;
444 }
445 /*
446 * Only need to update probes when refcount passes from 0 to 1.
447 */
448 if (entry->refcount++)
449 goto end;
450 need_update = 1;
451end:
452 mutex_unlock(&markers_mutex);
453 if (need_update)
454 marker_update_probes(NULL);
455 return ret;
456}
457EXPORT_SYMBOL_GPL(marker_arm);
458
459/**
460 * marker_disarm - Disarm a marker
461 * @name: marker name
462 *
463 * Disarm a marker. A reference count tracks how many times the marker has
464 * been armed and disarmed.
465 * Returns 0 if ok, error value on error.
466 */
467int marker_disarm(const char *name)
468{
469 struct marker_entry *entry;
470 int ret = 0, need_update = 0;
471
472 mutex_lock(&markers_mutex);
473 entry = get_marker(name);
474 if (!entry) {
475 ret = -ENOENT;
476 goto end;
477 }
478 /*
479 * Only permit decrementing the refcount if it is higher than 0.
480 * Do probe update only on 1 -> 0 transition.
481 */
482 if (entry->refcount) {
483 if (--entry->refcount)
484 goto end;
485 } else {
486 ret = -EPERM;
487 goto end;
488 }
489 need_update = 1;
490end:
491 mutex_unlock(&markers_mutex);
492 if (need_update)
493 marker_update_probes(NULL);
494 return ret;
495}
496EXPORT_SYMBOL_GPL(marker_disarm);
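
Since arming is reference counted, independent users can arm the same marker
without coordinating; a hypothetical sequence for the marker used in the
earlier sketch:

	marker_arm("subsys_event");	/* refcount 0 -> 1: probes updated  */
	marker_arm("subsys_event");	/* refcount 1 -> 2: no probe update */
	marker_disarm("subsys_event");	/* refcount 2 -> 1: still armed     */
	marker_disarm("subsys_event");	/* refcount 1 -> 0: marker disabled */

A further marker_disarm() at refcount 0 returns -EPERM, matching the check
above.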
497
498/**
499 * marker_get_private_data - Get a marker's probe private data
500 * @name: marker name
501 *
502 * Returns the private data pointer, or an ERR_PTR.
503 * The private data pointer should _only_ be dereferenced if the caller is the
504 * owner of the data, or its content could vanish. This is mostly used to
505 * confirm that a caller is the owner of a registered probe.
506 */
507void *marker_get_private_data(const char *name)
508{
509 struct hlist_head *head;
510 struct hlist_node *node;
511 struct marker_entry *e;
512 size_t name_len = strlen(name) + 1;
513 u32 hash = jhash(name, name_len-1, 0);
514 int found = 0;
515
516 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
517 hlist_for_each_entry(e, node, head, hlist) {
518 if (!strcmp(name, e->name)) {
519 found = 1;
520 return e->private;
521 }
522 }
523 return ERR_PTR(-ENOENT);
524}
525EXPORT_SYMBOL_GPL(marker_get_private_data);
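
As the comment above stresses, the returned pointer is mainly an identity
token; a hedged sketch of an owner verifying its own registration, where
my_private is whatever was passed to marker_probe_register() and the helper
name is an assumption:

#include <linux/err.h>
#include <linux/marker.h>

static int subsys_owns_marker(const char *name, void *my_private)
{
	void *data = marker_get_private_data(name);

	if (IS_ERR(data))
		return 0;		/* marker not registered */
	return data == my_private;	/* compare only, never dereference */
}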
diff --git a/kernel/module.c b/kernel/module.c
index db0ead0363e2..3202c9950073 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -20,6 +20,7 @@
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/kallsyms.h> 22#include <linux/kallsyms.h>
23#include <linux/sysfs.h>
23#include <linux/kernel.h> 24#include <linux/kernel.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
25#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
@@ -104,7 +105,7 @@ void __module_put_and_exit(struct module *mod, long code)
104 do_exit(code); 105 do_exit(code);
105} 106}
106EXPORT_SYMBOL(__module_put_and_exit); 107EXPORT_SYMBOL(__module_put_and_exit);
107 108
108/* Find a module section: 0 means not found. */ 109/* Find a module section: 0 means not found. */
109static unsigned int find_sec(Elf_Ehdr *hdr, 110static unsigned int find_sec(Elf_Ehdr *hdr,
110 Elf_Shdr *sechdrs, 111 Elf_Shdr *sechdrs,
@@ -178,7 +179,7 @@ static unsigned long __find_symbol(const char *name,
178 struct module *mod; 179 struct module *mod;
179 const struct kernel_symbol *ks; 180 const struct kernel_symbol *ks;
180 181
181 /* Core kernel first. */ 182 /* Core kernel first. */
182 *owner = NULL; 183 *owner = NULL;
183 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); 184 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
184 if (ks) { 185 if (ks) {
@@ -230,7 +231,7 @@ static unsigned long __find_symbol(const char *name,
230 return ks->value; 231 return ks->value;
231 } 232 }
232 233
233 /* Now try modules. */ 234 /* Now try modules. */
234 list_for_each_entry(mod, &modules, list) { 235 list_for_each_entry(mod, &modules, list) {
235 *owner = mod; 236 *owner = mod;
236 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); 237 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
@@ -284,7 +285,7 @@ static unsigned long __find_symbol(const char *name,
284 } 285 }
285 } 286 }
286 DEBUGP("Failed to find symbol %s\n", name); 287 DEBUGP("Failed to find symbol %s\n", name);
287 return 0; 288 return 0;
288} 289}
289 290
290/* Search for module by name: must hold module_mutex. */ 291/* Search for module by name: must hold module_mutex. */
@@ -440,7 +441,7 @@ static int percpu_modinit(void)
440 } 441 }
441 442
442 return 0; 443 return 0;
443} 444}
444__initcall(percpu_modinit); 445__initcall(percpu_modinit);
445#else /* ... !CONFIG_SMP */ 446#else /* ... !CONFIG_SMP */
446static inline void *percpu_modalloc(unsigned long size, unsigned long align, 447static inline void *percpu_modalloc(unsigned long size, unsigned long align,
@@ -482,8 +483,8 @@ static int modinfo_##field##_exists(struct module *mod) \
482} \ 483} \
483static void free_modinfo_##field(struct module *mod) \ 484static void free_modinfo_##field(struct module *mod) \
484{ \ 485{ \
485 kfree(mod->field); \ 486 kfree(mod->field); \
486 mod->field = NULL; \ 487 mod->field = NULL; \
487} \ 488} \
488static struct module_attribute modinfo_##field = { \ 489static struct module_attribute modinfo_##field = { \
489 .attr = { .name = __stringify(field), .mode = 0444 }, \ 490 .attr = { .name = __stringify(field), .mode = 0444 }, \
@@ -692,8 +693,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
692 } 693 }
693 694
694 /* If it has an init func, it must have an exit func to unload */ 695 /* If it has an init func, it must have an exit func to unload */
695 if ((mod->init != NULL && mod->exit == NULL) 696 if (mod->init && !mod->exit) {
696 || mod->unsafe) {
697 forced = try_force_unload(flags); 697 forced = try_force_unload(flags);
698 if (!forced) { 698 if (!forced) {
699 /* This module can't be removed */ 699 /* This module can't be removed */
@@ -741,11 +741,6 @@ static void print_unload_info(struct seq_file *m, struct module *mod)
741 seq_printf(m, "%s,", use->module_which_uses->name); 741 seq_printf(m, "%s,", use->module_which_uses->name);
742 } 742 }
743 743
744 if (mod->unsafe) {
745 printed_something = 1;
746 seq_printf(m, "[unsafe],");
747 }
748
749 if (mod->init != NULL && mod->exit == NULL) { 744 if (mod->init != NULL && mod->exit == NULL) {
750 printed_something = 1; 745 printed_something = 1;
751 seq_printf(m, "[permanent],"); 746 seq_printf(m, "[permanent],");
@@ -995,7 +990,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
995 struct module_sect_attrs *sect_attrs; 990 struct module_sect_attrs *sect_attrs;
996 struct module_sect_attr *sattr; 991 struct module_sect_attr *sattr;
997 struct attribute **gattr; 992 struct attribute **gattr;
998 993
999 /* Count loaded sections and allocate structures */ 994 /* Count loaded sections and allocate structures */
1000 for (i = 0; i < nsect; i++) 995 for (i = 0; i < nsect; i++)
1001 if (sechdrs[i].sh_flags & SHF_ALLOC) 996 if (sechdrs[i].sh_flags & SHF_ALLOC)
@@ -1053,6 +1048,100 @@ static void remove_sect_attrs(struct module *mod)
1053 } 1048 }
1054} 1049}
1055 1050
1051/*
1052 * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
1053 */
1054
1055struct module_notes_attrs {
1056 struct kobject *dir;
1057 unsigned int notes;
1058 struct bin_attribute attrs[0];
1059};
1060
1061static ssize_t module_notes_read(struct kobject *kobj,
1062 struct bin_attribute *bin_attr,
1063 char *buf, loff_t pos, size_t count)
1064{
1065 /*
1066 * The caller checked the pos and count against our size.
1067 */
1068 memcpy(buf, bin_attr->private + pos, count);
1069 return count;
1070}
1071
1072static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
1073 unsigned int i)
1074{
1075 if (notes_attrs->dir) {
1076 while (i-- > 0)
1077 sysfs_remove_bin_file(notes_attrs->dir,
1078 &notes_attrs->attrs[i]);
1079 kobject_del(notes_attrs->dir);
1080 }
1081 kfree(notes_attrs);
1082}
1083
1084static void add_notes_attrs(struct module *mod, unsigned int nsect,
1085 char *secstrings, Elf_Shdr *sechdrs)
1086{
1087 unsigned int notes, loaded, i;
1088 struct module_notes_attrs *notes_attrs;
1089 struct bin_attribute *nattr;
1090
1091 /* Count notes sections and allocate structures. */
1092 notes = 0;
1093 for (i = 0; i < nsect; i++)
1094 if ((sechdrs[i].sh_flags & SHF_ALLOC) &&
1095 (sechdrs[i].sh_type == SHT_NOTE))
1096 ++notes;
1097
1098 if (notes == 0)
1099 return;
1100
1101 notes_attrs = kzalloc(sizeof(*notes_attrs)
1102 + notes * sizeof(notes_attrs->attrs[0]),
1103 GFP_KERNEL);
1104 if (notes_attrs == NULL)
1105 return;
1106
1107 notes_attrs->notes = notes;
1108 nattr = &notes_attrs->attrs[0];
1109 for (loaded = i = 0; i < nsect; ++i) {
1110 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
1111 continue;
1112 if (sechdrs[i].sh_type == SHT_NOTE) {
1113 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1114 nattr->attr.mode = S_IRUGO;
1115 nattr->size = sechdrs[i].sh_size;
1116 nattr->private = (void *) sechdrs[i].sh_addr;
1117 nattr->read = module_notes_read;
1118 ++nattr;
1119 }
1120 ++loaded;
1121 }
1122
1123 notes_attrs->dir = kobject_add_dir(&mod->mkobj.kobj, "notes");
1124 if (!notes_attrs->dir)
1125 goto out;
1126
1127 for (i = 0; i < notes; ++i)
1128 if (sysfs_create_bin_file(notes_attrs->dir,
1129 &notes_attrs->attrs[i]))
1130 goto out;
1131
1132 mod->notes_attrs = notes_attrs;
1133 return;
1134
1135 out:
1136 free_notes_attrs(notes_attrs, i);
1137}
1138
1139static void remove_notes_attrs(struct module *mod)
1140{
1141 if (mod->notes_attrs)
1142 free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
1143}
1144
1056#else 1145#else
1057 1146
1058static inline void add_sect_attrs(struct module *mod, unsigned int nsect, 1147static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
@@ -1063,6 +1152,15 @@ static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
1063static inline void remove_sect_attrs(struct module *mod) 1152static inline void remove_sect_attrs(struct module *mod)
1064{ 1153{
1065} 1154}
1155
1156static inline void add_notes_attrs(struct module *mod, unsigned int nsect,
1157 char *sectstrings, Elf_Shdr *sechdrs)
1158{
1159}
1160
1161static inline void remove_notes_attrs(struct module *mod)
1162{
1163}
1066#endif /* CONFIG_KALLSYMS */ 1164#endif /* CONFIG_KALLSYMS */
1067 1165
1068#ifdef CONFIG_SYSFS 1166#ifdef CONFIG_SYSFS
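
The notes directory added in the hunk above exposes each allocated SHT_NOTE
section of a loaded module as a read-only binary file under
/sys/module/<name>/notes/. A small userspace sketch that hex-dumps one such
note; the module name and the .note.gnu.build-id file are assumptions (the
latter only exists when the module was linked with a build ID):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path = "/sys/module/usbcore/notes/.note.gnu.build-id";
	FILE *f = fopen(path, "rb");
	int c;

	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	while ((c = fgetc(f)) != EOF)
		printf("%02x", c);
	putchar('\n');
	fclose(f);
	return EXIT_SUCCESS;
}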
@@ -1197,6 +1295,7 @@ static void free_module(struct module *mod)
1197{ 1295{
1198 /* Delete from various lists */ 1296 /* Delete from various lists */
1199 stop_machine_run(__unlink_module, mod, NR_CPUS); 1297 stop_machine_run(__unlink_module, mod, NR_CPUS);
1298 remove_notes_attrs(mod);
1200 remove_sect_attrs(mod); 1299 remove_sect_attrs(mod);
1201 mod_kobject_remove(mod); 1300 mod_kobject_remove(mod);
1202 1301
@@ -1249,14 +1348,14 @@ static int verify_export_symbols(struct module *mod)
1249 const unsigned long *crc; 1348 const unsigned long *crc;
1250 1349
1251 for (i = 0; i < mod->num_syms; i++) 1350 for (i = 0; i < mod->num_syms; i++)
1252 if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) { 1351 if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) {
1253 name = mod->syms[i].name; 1352 name = mod->syms[i].name;
1254 ret = -ENOEXEC; 1353 ret = -ENOEXEC;
1255 goto dup; 1354 goto dup;
1256 } 1355 }
1257 1356
1258 for (i = 0; i < mod->num_gpl_syms; i++) 1357 for (i = 0; i < mod->num_gpl_syms; i++)
1259 if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) { 1358 if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) {
1260 name = mod->gpl_syms[i].name; 1359 name = mod->gpl_syms[i].name;
1261 ret = -ENOEXEC; 1360 ret = -ENOEXEC;
1262 goto dup; 1361 goto dup;
@@ -1574,6 +1673,8 @@ static struct module *load_module(void __user *umod,
1574 unsigned int unusedcrcindex; 1673 unsigned int unusedcrcindex;
1575 unsigned int unusedgplindex; 1674 unsigned int unusedgplindex;
1576 unsigned int unusedgplcrcindex; 1675 unsigned int unusedgplcrcindex;
1676 unsigned int markersindex;
1677 unsigned int markersstringsindex;
1577 struct module *mod; 1678 struct module *mod;
1578 long err = 0; 1679 long err = 0;
1579 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1680 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1782,7 +1883,8 @@ static struct module *load_module(void __user *umod,
1782 module_unload_init(mod); 1883 module_unload_init(mod);
1783 1884
1784 /* Initialize kobject, so we can reference it. */ 1885 /* Initialize kobject, so we can reference it. */
1785 if (mod_sysfs_init(mod) != 0) 1886 err = mod_sysfs_init(mod);
1887 if (err)
1786 goto cleanup; 1888 goto cleanup;
1787 1889
1788 /* Set up license info based on the info section */ 1890 /* Set up license info based on the info section */
@@ -1829,7 +1931,7 @@ static struct module *load_module(void __user *umod,
1829 mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; 1931 mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr;
1830 1932
1831#ifdef CONFIG_MODVERSIONS 1933#ifdef CONFIG_MODVERSIONS
1832 if ((mod->num_syms && !crcindex) || 1934 if ((mod->num_syms && !crcindex) ||
1833 (mod->num_gpl_syms && !gplcrcindex) || 1935 (mod->num_gpl_syms && !gplcrcindex) ||
1834 (mod->num_gpl_future_syms && !gplfuturecrcindex) || 1936 (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
1835 (mod->num_unused_syms && !unusedcrcindex) || 1937 (mod->num_unused_syms && !unusedcrcindex) ||
@@ -1839,6 +1941,9 @@ static struct module *load_module(void __user *umod,
1839 add_taint_module(mod, TAINT_FORCED_MODULE); 1941 add_taint_module(mod, TAINT_FORCED_MODULE);
1840 } 1942 }
1841#endif 1943#endif
1944 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
1945 markersstringsindex = find_sec(hdr, sechdrs, secstrings,
1946 "__markers_strings");
1842 1947
1843 /* Now do relocations. */ 1948 /* Now do relocations. */
1844 for (i = 1; i < hdr->e_shnum; i++) { 1949 for (i = 1; i < hdr->e_shnum; i++) {
@@ -1861,6 +1966,11 @@ static struct module *load_module(void __user *umod,
1861 if (err < 0) 1966 if (err < 0)
1862 goto cleanup; 1967 goto cleanup;
1863 } 1968 }
1969#ifdef CONFIG_MARKERS
1970 mod->markers = (void *)sechdrs[markersindex].sh_addr;
1971 mod->num_markers =
1972 sechdrs[markersindex].sh_size / sizeof(*mod->markers);
1973#endif
1864 1974
1865 /* Find duplicate symbols */ 1975 /* Find duplicate symbols */
1866 err = verify_export_symbols(mod); 1976 err = verify_export_symbols(mod);
@@ -1879,6 +1989,11 @@ static struct module *load_module(void __user *umod,
1879 1989
1880 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 1990 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
1881 1991
1992#ifdef CONFIG_MARKERS
1993 if (!mod->taints)
1994 marker_update_probe_range(mod->markers,
1995 mod->markers + mod->num_markers, NULL, NULL);
1996#endif
1882 err = module_finalize(hdr, sechdrs, mod); 1997 err = module_finalize(hdr, sechdrs, mod);
1883 if (err < 0) 1998 if (err < 0)
1884 goto cleanup; 1999 goto cleanup;
@@ -1916,7 +2031,7 @@ static struct module *load_module(void __user *umod,
1916 if (err < 0) 2031 if (err < 0)
1917 goto arch_cleanup; 2032 goto arch_cleanup;
1918 2033
1919 err = mod_sysfs_setup(mod, 2034 err = mod_sysfs_setup(mod,
1920 (struct kernel_param *) 2035 (struct kernel_param *)
1921 sechdrs[setupindex].sh_addr, 2036 sechdrs[setupindex].sh_addr,
1922 sechdrs[setupindex].sh_size 2037 sechdrs[setupindex].sh_size
@@ -1924,11 +2039,12 @@ static struct module *load_module(void __user *umod,
1924 if (err < 0) 2039 if (err < 0)
1925 goto arch_cleanup; 2040 goto arch_cleanup;
1926 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2041 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2042 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
1927 2043
1928 /* Size of section 0 is 0, so this works well if no unwind info. */ 2044 /* Size of section 0 is 0, so this works well if no unwind info. */
1929 mod->unwind_info = unwind_add_table(mod, 2045 mod->unwind_info = unwind_add_table(mod,
1930 (void *)sechdrs[unwindex].sh_addr, 2046 (void *)sechdrs[unwindex].sh_addr,
1931 sechdrs[unwindex].sh_size); 2047 sechdrs[unwindex].sh_size);
1932 2048
1933 /* Get rid of temporary copy */ 2049 /* Get rid of temporary copy */
1934 vfree(hdr); 2050 vfree(hdr);
@@ -2011,15 +2127,10 @@ sys_init_module(void __user *umod,
2011 buggy refcounters. */ 2127 buggy refcounters. */
2012 mod->state = MODULE_STATE_GOING; 2128 mod->state = MODULE_STATE_GOING;
2013 synchronize_sched(); 2129 synchronize_sched();
2014 if (mod->unsafe) 2130 module_put(mod);
2015 printk(KERN_ERR "%s: module is now stuck!\n", 2131 mutex_lock(&module_mutex);
2016 mod->name); 2132 free_module(mod);
2017 else { 2133 mutex_unlock(&module_mutex);
2018 module_put(mod);
2019 mutex_lock(&module_mutex);
2020 free_module(mod);
2021 mutex_unlock(&module_mutex);
2022 }
2023 return ret; 2134 return ret;
2024 } 2135 }
2025 2136
@@ -2050,7 +2161,7 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
2050 */ 2161 */
2051static inline int is_arm_mapping_symbol(const char *str) 2162static inline int is_arm_mapping_symbol(const char *str)
2052{ 2163{
2053 return str[0] == '$' && strchr("atd", str[1]) 2164 return str[0] == '$' && strchr("atd", str[1])
2054 && (str[2] == '\0' || str[2] == '.'); 2165 && (str[2] == '\0' || str[2] == '.');
2055} 2166}
2056 2167
@@ -2065,11 +2176,11 @@ static const char *get_ksymbol(struct module *mod,
2065 /* At worst, next value is at end of module */ 2176 /* At worst, next value is at end of module */
2066 if (within(addr, mod->module_init, mod->init_size)) 2177 if (within(addr, mod->module_init, mod->init_size))
2067 nextval = (unsigned long)mod->module_init+mod->init_text_size; 2178 nextval = (unsigned long)mod->module_init+mod->init_text_size;
2068 else 2179 else
2069 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2180 nextval = (unsigned long)mod->module_core+mod->core_text_size;
2070 2181
2071 /* Scan for closest preceding symbol, and next symbol. (ELF 2182 /* Scan for closest preceding symbol, and next symbol. (ELF
2072 starts real symbols at 1). */ 2183 starts real symbols at 1). */
2073 for (i = 1; i < mod->num_symtab; i++) { 2184 for (i = 1; i < mod->num_symtab; i++) {
2074 if (mod->symtab[i].st_shndx == SHN_UNDEF) 2185 if (mod->symtab[i].st_shndx == SHN_UNDEF)
2075 continue; 2186 continue;
@@ -2311,7 +2422,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2311 list_for_each_entry(mod, &modules, list) { 2422 list_for_each_entry(mod, &modules, list) {
2312 if (mod->num_exentries == 0) 2423 if (mod->num_exentries == 0)
2313 continue; 2424 continue;
2314 2425
2315 e = search_extable(mod->extable, 2426 e = search_extable(mod->extable,
2316 mod->extable + mod->num_exentries - 1, 2427 mod->extable + mod->num_exentries - 1,
2317 addr); 2428 addr);
@@ -2321,7 +2432,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2321 preempt_enable(); 2432 preempt_enable();
2322 2433
2323 /* Now, if we found one, we are running inside it now, hence 2434 /* Now, if we found one, we are running inside it now, hence
2324 we cannot unload the module, hence no refcnt needed. */ 2435 we cannot unload the module, hence no refcnt needed. */
2325 return e; 2436 return e;
2326} 2437}
2327 2438
@@ -2474,3 +2585,18 @@ EXPORT_SYMBOL(module_remove_driver);
2474void struct_module(struct module *mod) { return; } 2585void struct_module(struct module *mod) { return; }
2475EXPORT_SYMBOL(struct_module); 2586EXPORT_SYMBOL(struct_module);
2476#endif 2587#endif
2588
2589#ifdef CONFIG_MARKERS
2590void module_update_markers(struct module *probe_module, int *refcount)
2591{
2592 struct module *mod;
2593
2594 mutex_lock(&module_mutex);
2595 list_for_each_entry(mod, &modules, list)
2596 if (!mod->taints)
2597 marker_update_probe_range(mod->markers,
2598 mod->markers + mod->num_markers,
2599 probe_module, refcount);
2600 mutex_unlock(&module_mutex);
2601}
2602#endif
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 691b86564dd9..d7fe50cc556f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -51,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
51 51
52EXPORT_SYMBOL(__mutex_init); 52EXPORT_SYMBOL(__mutex_init);
53 53
54#ifndef CONFIG_DEBUG_LOCK_ALLOC
54/* 55/*
55 * We split the mutex lock/unlock logic into separate fastpath and 56 * We split the mutex lock/unlock logic into separate fastpath and
56 * slowpath functions, to reduce the register pressure on the fastpath. 57 * slowpath functions, to reduce the register pressure on the fastpath.
@@ -92,6 +93,7 @@ void inline fastcall __sched mutex_lock(struct mutex *lock)
92} 93}
93 94
94EXPORT_SYMBOL(mutex_lock); 95EXPORT_SYMBOL(mutex_lock);
96#endif
95 97
96static void fastcall noinline __sched 98static void fastcall noinline __sched
97__mutex_unlock_slowpath(atomic_t *lock_count); 99__mutex_unlock_slowpath(atomic_t *lock_count);
@@ -122,7 +124,8 @@ EXPORT_SYMBOL(mutex_unlock);
122 * Lock a mutex (possibly interruptible), slowpath: 124 * Lock a mutex (possibly interruptible), slowpath:
123 */ 125 */
124static inline int __sched 126static inline int __sched
125__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) 127__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
128 unsigned long ip)
126{ 129{
127 struct task_struct *task = current; 130 struct task_struct *task = current;
128 struct mutex_waiter waiter; 131 struct mutex_waiter waiter;
@@ -132,7 +135,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
132 spin_lock_mutex(&lock->wait_lock, flags); 135 spin_lock_mutex(&lock->wait_lock, flags);
133 136
134 debug_mutex_lock_common(lock, &waiter); 137 debug_mutex_lock_common(lock, &waiter);
135 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 138 mutex_acquire(&lock->dep_map, subclass, 0, ip);
136 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); 139 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
137 140
138 /* add waiting tasks to the end of the waitqueue (FIFO): */ 141 /* add waiting tasks to the end of the waitqueue (FIFO): */
@@ -143,7 +146,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
143 if (old_val == 1) 146 if (old_val == 1)
144 goto done; 147 goto done;
145 148
146 lock_contended(&lock->dep_map, _RET_IP_); 149 lock_contended(&lock->dep_map, ip);
147 150
148 for (;;) { 151 for (;;) {
149 /* 152 /*
@@ -166,7 +169,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
166 if (unlikely(state == TASK_INTERRUPTIBLE && 169 if (unlikely(state == TASK_INTERRUPTIBLE &&
167 signal_pending(task))) { 170 signal_pending(task))) {
168 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 171 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
169 mutex_release(&lock->dep_map, 1, _RET_IP_); 172 mutex_release(&lock->dep_map, 1, ip);
170 spin_unlock_mutex(&lock->wait_lock, flags); 173 spin_unlock_mutex(&lock->wait_lock, flags);
171 174
172 debug_mutex_free_waiter(&waiter); 175 debug_mutex_free_waiter(&waiter);
@@ -197,20 +200,12 @@ done:
197 return 0; 200 return 0;
198} 201}
199 202
200static void fastcall noinline __sched
201__mutex_lock_slowpath(atomic_t *lock_count)
202{
203 struct mutex *lock = container_of(lock_count, struct mutex, count);
204
205 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
206}
207
208#ifdef CONFIG_DEBUG_LOCK_ALLOC 203#ifdef CONFIG_DEBUG_LOCK_ALLOC
209void __sched 204void __sched
210mutex_lock_nested(struct mutex *lock, unsigned int subclass) 205mutex_lock_nested(struct mutex *lock, unsigned int subclass)
211{ 206{
212 might_sleep(); 207 might_sleep();
213 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); 208 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_);
214} 209}
215 210
216EXPORT_SYMBOL_GPL(mutex_lock_nested); 211EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -219,7 +214,7 @@ int __sched
219mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) 214mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
220{ 215{
221 might_sleep(); 216 might_sleep();
222 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass); 217 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_);
223} 218}
224 219
225EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 220EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -271,6 +266,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count)
271 __mutex_unlock_common_slowpath(lock_count, 1); 266 __mutex_unlock_common_slowpath(lock_count, 1);
272} 267}
273 268
269#ifndef CONFIG_DEBUG_LOCK_ALLOC
274/* 270/*
275 * Here come the less common (and hence less performance-critical) APIs: 271 * Here come the less common (and hence less performance-critical) APIs:
276 * mutex_lock_interruptible() and mutex_trylock(). 272 * mutex_lock_interruptible() and mutex_trylock().
@@ -298,13 +294,22 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
298 294
299EXPORT_SYMBOL(mutex_lock_interruptible); 295EXPORT_SYMBOL(mutex_lock_interruptible);
300 296
297static void fastcall noinline __sched
298__mutex_lock_slowpath(atomic_t *lock_count)
299{
300 struct mutex *lock = container_of(lock_count, struct mutex, count);
301
302 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_);
303}
304
301static int fastcall noinline __sched 305static int fastcall noinline __sched
302__mutex_lock_interruptible_slowpath(atomic_t *lock_count) 306__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
303{ 307{
304 struct mutex *lock = container_of(lock_count, struct mutex, count); 308 struct mutex *lock = container_of(lock_count, struct mutex, count);
305 309
306 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); 310 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_);
307} 311}
312#endif
308 313
309/* 314/*
310 * Spinlock based trylock, we take the spinlock and check whether we 315 * Spinlock based trylock, we take the spinlock and check whether we
diff --git a/kernel/notifier.c b/kernel/notifier.c
new file mode 100644
index 000000000000..4253f472f060
--- /dev/null
+++ b/kernel/notifier.c
@@ -0,0 +1,539 @@
1#include <linux/kdebug.h>
2#include <linux/kprobes.h>
3#include <linux/module.h>
4#include <linux/notifier.h>
5#include <linux/rcupdate.h>
6#include <linux/vmalloc.h>
7
8/*
9 * Notifier list for kernel code which wants to be called
10 * at shutdown. This is used to stop any idling DMA operations
11 * and the like.
12 */
13BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
14
15/*
16 * Notifier chain core routines. The exported routines below
17 * are layered on top of these, with appropriate locking added.
18 */
19
20static int notifier_chain_register(struct notifier_block **nl,
21 struct notifier_block *n)
22{
23 while ((*nl) != NULL) {
24 if (n->priority > (*nl)->priority)
25 break;
26 nl = &((*nl)->next);
27 }
28 n->next = *nl;
29 rcu_assign_pointer(*nl, n);
30 return 0;
31}
32
33static int notifier_chain_unregister(struct notifier_block **nl,
34 struct notifier_block *n)
35{
36 while ((*nl) != NULL) {
37 if ((*nl) == n) {
38 rcu_assign_pointer(*nl, n->next);
39 return 0;
40 }
41 nl = &((*nl)->next);
42 }
43 return -ENOENT;
44}
45
46/**
47 * notifier_call_chain - Informs the registered notifiers about an event.
48 * @nl: Pointer to head of the blocking notifier chain
49 * @val: Value passed unmodified to notifier function
50 * @v: Pointer passed unmodified to notifier function
51 * @nr_to_call: Number of notifier functions to be called. Pass -1 to
52 * call every function in the chain.
53 * @nr_calls: Records the number of notifications sent. Pass NULL if
54 * the count is not needed.
55 * @returns: notifier_call_chain returns the value returned by the
56 * last notifier function called.
57 */
58static int __kprobes notifier_call_chain(struct notifier_block **nl,
59 unsigned long val, void *v,
60 int nr_to_call, int *nr_calls)
61{
62 int ret = NOTIFY_DONE;
63 struct notifier_block *nb, *next_nb;
64
65 nb = rcu_dereference(*nl);
66
67 while (nb && nr_to_call) {
68 next_nb = rcu_dereference(nb->next);
69 ret = nb->notifier_call(nb, val, v);
70
71 if (nr_calls)
72 (*nr_calls)++;
73
74 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
75 break;
76 nb = next_nb;
77 nr_to_call--;
78 }
79 return ret;
80}
81
82/*
83 * Atomic notifier chain routines. Registration and unregistration
84 * use a spinlock, and call_chain is synchronized by RCU (no locks).
85 */
86
87/**
88 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
89 * @nh: Pointer to head of the atomic notifier chain
90 * @n: New entry in notifier chain
91 *
92 * Adds a notifier to an atomic notifier chain.
93 *
94 * Currently always returns zero.
95 */
96int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
97 struct notifier_block *n)
98{
99 unsigned long flags;
100 int ret;
101
102 spin_lock_irqsave(&nh->lock, flags);
103 ret = notifier_chain_register(&nh->head, n);
104 spin_unlock_irqrestore(&nh->lock, flags);
105 return ret;
106}
107EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
108
109/**
110 * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
111 * @nh: Pointer to head of the atomic notifier chain
112 * @n: Entry to remove from notifier chain
113 *
114 * Removes a notifier from an atomic notifier chain.
115 *
116 * Returns zero on success or %-ENOENT on failure.
117 */
118int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
119 struct notifier_block *n)
120{
121 unsigned long flags;
122 int ret;
123
124 spin_lock_irqsave(&nh->lock, flags);
125 ret = notifier_chain_unregister(&nh->head, n);
126 spin_unlock_irqrestore(&nh->lock, flags);
127 synchronize_rcu();
128 return ret;
129}
130EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
131
132/**
133 * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
134 * @nh: Pointer to head of the atomic notifier chain
135 * @val: Value passed unmodified to notifier function
136 * @v: Pointer passed unmodified to notifier function
137 * @nr_to_call: See the comment for notifier_call_chain.
138 * @nr_calls: See the comment for notifier_call_chain.
139 *
140 * Calls each function in a notifier chain in turn. The functions
141 * run in an atomic context, so they must not block.
142 * This routine uses RCU to synchronize with changes to the chain.
143 *
144 * If the return value of the notifier can be and'ed
145 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
146 * will return immediately, with the return value of
147 * the notifier function which halted execution.
148 * Otherwise the return value is the return value
149 * of the last notifier function called.
150 */
151int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
152 unsigned long val, void *v,
153 int nr_to_call, int *nr_calls)
154{
155 int ret;
156
157 rcu_read_lock();
158 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
159 rcu_read_unlock();
160 return ret;
161}
162EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
163
164int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
165 unsigned long val, void *v)
166{
167 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
168}
169EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
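
A hedged sketch of a subsystem keeping its own atomic chain; the chain name,
event value, and callback are illustrative, not part of this patch:

#include <linux/notifier.h>

static ATOMIC_NOTIFIER_HEAD(sample_atomic_chain);

static int sample_event(struct notifier_block *nb, unsigned long action,
			void *data)
{
	/* Callers may be in atomic context: no sleeping here. */
	return NOTIFY_OK;
}

static struct notifier_block sample_nb = {
	.notifier_call	= sample_event,
	.priority	= 0,		/* higher priority runs first */
};

static int __init sample_init(void)
{
	return atomic_notifier_chain_register(&sample_atomic_chain,
					      &sample_nb);
}

Event producers would then fire the chain, even from interrupt context, with
atomic_notifier_call_chain(&sample_atomic_chain, action, data).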
170
171/*
172 * Blocking notifier chain routines. All access to the chain is
173 * synchronized by an rwsem.
174 */
175
176/**
177 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
178 * @nh: Pointer to head of the blocking notifier chain
179 * @n: New entry in notifier chain
180 *
181 * Adds a notifier to a blocking notifier chain.
182 * Must be called in process context.
183 *
184 * Currently always returns zero.
185 */
186int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
187 struct notifier_block *n)
188{
189 int ret;
190
191 /*
192 * This code gets used during boot-up, when task switching is
193 * not yet working and interrupts must remain disabled. At
194 * such times we must not call down_write().
195 */
196 if (unlikely(system_state == SYSTEM_BOOTING))
197 return notifier_chain_register(&nh->head, n);
198
199 down_write(&nh->rwsem);
200 ret = notifier_chain_register(&nh->head, n);
201 up_write(&nh->rwsem);
202 return ret;
203}
204EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
205
206/**
207 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
208 * @nh: Pointer to head of the blocking notifier chain
209 * @n: Entry to remove from notifier chain
210 *
211 * Removes a notifier from a blocking notifier chain.
212 * Must be called from process context.
213 *
214 * Returns zero on success or %-ENOENT on failure.
215 */
216int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
217 struct notifier_block *n)
218{
219 int ret;
220
221 /*
222 * This code gets used during boot-up, when task switching is
223 * not yet working and interrupts must remain disabled. At
224 * such times we must not call down_write().
225 */
226 if (unlikely(system_state == SYSTEM_BOOTING))
227 return notifier_chain_unregister(&nh->head, n);
228
229 down_write(&nh->rwsem);
230 ret = notifier_chain_unregister(&nh->head, n);
231 up_write(&nh->rwsem);
232 return ret;
233}
234EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
235
236/**
237 * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
238 * @nh: Pointer to head of the blocking notifier chain
239 * @val: Value passed unmodified to notifier function
240 * @v: Pointer passed unmodified to notifier function
241 * @nr_to_call: See comment for notifier_call_chain.
242 * @nr_calls: See comment for notifier_call_chain.
243 *
244 * Calls each function in a notifier chain in turn. The functions
245 * run in a process context, so they are allowed to block.
246 *
247 * If the return value of the notifier can be and'ed
248 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
249 * will return immediately, with the return value of
250 * the notifier function which halted execution.
251 * Otherwise the return value is the return value
252 * of the last notifier function called.
253 */
254int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
255 unsigned long val, void *v,
256 int nr_to_call, int *nr_calls)
257{
258 int ret = NOTIFY_DONE;
259
260 /*
261 * We check the head outside the lock, but if this access is
262 * racy then it does not matter what the result of the test
263 * is, we re-check the list after having taken the lock anyway:
264 */
265 if (rcu_dereference(nh->head)) {
266 down_read(&nh->rwsem);
267 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
268 nr_calls);
269 up_read(&nh->rwsem);
270 }
271 return ret;
272}
273EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
274
275int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
276 unsigned long val, void *v)
277{
278 return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
279}
280EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
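
For comparison, a blocking chain lets callbacks sleep because the chain is
walked under the rwsem in process context; a sketch with hypothetical names:

#include <linux/notifier.h>

static BLOCKING_NOTIFIER_HEAD(policy_chain);

static int policy_cb(struct notifier_block *nb, unsigned long event, void *p)
{
	/* Process context: GFP_KERNEL allocations, mutexes etc. are fine. */
	return NOTIFY_DONE;
}

static struct notifier_block policy_nb = { .notifier_call = policy_cb };

int announce_policy_change(unsigned long event, void *policy)
{
	/* Listeners added via blocking_notifier_chain_register(&policy_chain,
	 * &policy_nb) are called in priority order. */
	return blocking_notifier_call_chain(&policy_chain, event, policy);
}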
281
282/*
283 * Raw notifier chain routines. There is no protection;
284 * the caller must provide it. Use at your own risk!
285 */
286
287/**
288 * raw_notifier_chain_register - Add notifier to a raw notifier chain
289 * @nh: Pointer to head of the raw notifier chain
290 * @n: New entry in notifier chain
291 *
292 * Adds a notifier to a raw notifier chain.
293 * All locking must be provided by the caller.
294 *
295 * Currently always returns zero.
296 */
297int raw_notifier_chain_register(struct raw_notifier_head *nh,
298 struct notifier_block *n)
299{
300 return notifier_chain_register(&nh->head, n);
301}
302EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
303
304/**
305 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
306 * @nh: Pointer to head of the raw notifier chain
307 * @n: Entry to remove from notifier chain
308 *
309 * Removes a notifier from a raw notifier chain.
310 * All locking must be provided by the caller.
311 *
312 * Returns zero on success or %-ENOENT on failure.
313 */
314int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
315 struct notifier_block *n)
316{
317 return notifier_chain_unregister(&nh->head, n);
318}
319EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
320
321/**
322 * __raw_notifier_call_chain - Call functions in a raw notifier chain
323 * @nh: Pointer to head of the raw notifier chain
324 * @val: Value passed unmodified to notifier function
325 * @v: Pointer passed unmodified to notifier function
326 * @nr_to_call: See comment for notifier_call_chain.
327 * @nr_calls: See comment for notifier_call_chain.
328 *
329 * Calls each function in a notifier chain in turn. The functions
330 * run in an undefined context.
331 * All locking must be provided by the caller.
332 *
333 * If the return value of the notifier can be and'ed
334 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
335 * will return immediately, with the return value of
336 * the notifier function which halted execution.
337 * Otherwise the return value is the return value
338 * of the last notifier function called.
339 */
340int __raw_notifier_call_chain(struct raw_notifier_head *nh,
341 unsigned long val, void *v,
342 int nr_to_call, int *nr_calls)
343{
344 return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
345}
346EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
347
348int raw_notifier_call_chain(struct raw_notifier_head *nh,
349 unsigned long val, void *v)
350{
351 return __raw_notifier_call_chain(nh, val, v, -1, NULL);
352}
353EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
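
Raw chains leave all serialization to the caller; a sketch where a subsystem
reuses its own mutex (both the lock and the chain below are hypothetical):

#include <linux/mutex.h>
#include <linux/notifier.h>

static RAW_NOTIFIER_HEAD(sample_raw_chain);
static DEFINE_MUTEX(sample_lock);

int sample_register(struct notifier_block *nb)
{
	int err;

	mutex_lock(&sample_lock);
	err = raw_notifier_chain_register(&sample_raw_chain, nb);
	mutex_unlock(&sample_lock);
	return err;
}

int sample_notify(unsigned long event, void *v)
{
	int ret;

	mutex_lock(&sample_lock);	/* caller-provided protection */
	ret = raw_notifier_call_chain(&sample_raw_chain, event, v);
	mutex_unlock(&sample_lock);
	return ret;
}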
354
355/*
356 * SRCU notifier chain routines. Registration and unregistration
357 * use a mutex, and call_chain is synchronized by SRCU (no locks).
358 */
359
360/**
361 * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
362 * @nh: Pointer to head of the SRCU notifier chain
363 * @n: New entry in notifier chain
364 *
365 * Adds a notifier to an SRCU notifier chain.
366 * Must be called in process context.
367 *
368 * Currently always returns zero.
369 */
370int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
371 struct notifier_block *n)
372{
373 int ret;
374
375 /*
376 * This code gets used during boot-up, when task switching is
377 * not yet working and interrupts must remain disabled. At
378 * such times we must not call mutex_lock().
379 */
380 if (unlikely(system_state == SYSTEM_BOOTING))
381 return notifier_chain_register(&nh->head, n);
382
383 mutex_lock(&nh->mutex);
384 ret = notifier_chain_register(&nh->head, n);
385 mutex_unlock(&nh->mutex);
386 return ret;
387}
388EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
389
390/**
391 * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
392 * @nh: Pointer to head of the SRCU notifier chain
393 * @n: Entry to remove from notifier chain
394 *
395 * Removes a notifier from an SRCU notifier chain.
396 * Must be called from process context.
397 *
398 * Returns zero on success or %-ENOENT on failure.
399 */
400int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
401 struct notifier_block *n)
402{
403 int ret;
404
405 /*
406 * This code gets used during boot-up, when task switching is
407 * not yet working and interrupts must remain disabled. At
408 * such times we must not call mutex_lock().
409 */
410 if (unlikely(system_state == SYSTEM_BOOTING))
411 return notifier_chain_unregister(&nh->head, n);
412
413 mutex_lock(&nh->mutex);
414 ret = notifier_chain_unregister(&nh->head, n);
415 mutex_unlock(&nh->mutex);
416 synchronize_srcu(&nh->srcu);
417 return ret;
418}
419EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
420
421/**
422 * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
423 * @nh: Pointer to head of the SRCU notifier chain
424 * @val: Value passed unmodified to notifier function
425 * @v: Pointer passed unmodified to notifier function
426 * @nr_to_call: See comment for notifier_call_chain.
427 * @nr_calls: See comment for notifier_call_chain.
428 *
429 * Calls each function in a notifier chain in turn. The functions
430 * run in a process context, so they are allowed to block.
431 *
432 * If the return value of the notifier can be and'ed
433 * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain()
434 * will return immediately, with the return value of
435 * the notifier function which halted execution.
436 * Otherwise the return value is the return value
437 * of the last notifier function called.
438 */
439int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
440 unsigned long val, void *v,
441 int nr_to_call, int *nr_calls)
442{
443 int ret;
444 int idx;
445
446 idx = srcu_read_lock(&nh->srcu);
447 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
448 srcu_read_unlock(&nh->srcu, idx);
449 return ret;
450}
451EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
452
453int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
454 unsigned long val, void *v)
455{
456 return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
457}
458EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
459
460/**
461 * srcu_init_notifier_head - Initialize an SRCU notifier head
462 * @nh: Pointer to head of the srcu notifier chain
463 *
464 * Unlike other sorts of notifier heads, SRCU notifier heads require
465 * dynamic initialization. Be sure to call this routine before
466 * calling any of the other SRCU notifier routines for this head.
467 *
468 * If an SRCU notifier head is deallocated, it must first be cleaned
469 * up by calling srcu_cleanup_notifier_head(). Otherwise the head's
470 * per-cpu data (used by the SRCU mechanism) will leak.
471 */
472void srcu_init_notifier_head(struct srcu_notifier_head *nh)
473{
474 mutex_init(&nh->mutex);
475 if (init_srcu_struct(&nh->srcu) < 0)
476 BUG();
477 nh->head = NULL;
478}
479EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
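
Because SRCU heads cannot be initialized statically, a user typically sets
one up at init time; the chain name below is hypothetical:

#include <linux/init.h>
#include <linux/notifier.h>

static struct srcu_notifier_head sample_srcu_chain;

static int __init sample_chain_init(void)
{
	/* Must run before any register/unregister/call on this head. */
	srcu_init_notifier_head(&sample_srcu_chain);
	return 0;
}
core_initcall(sample_chain_init);

Afterwards srcu_notifier_chain_register(), srcu_notifier_call_chain() and,
on teardown, srcu_cleanup_notifier_head() apply as described above.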
480
481/**
482 * register_reboot_notifier - Register function to be called at reboot time
483 * @nb: Info about notifier function to be called
484 *
485 * Registers a function with the list of functions
486 * to be called at reboot time.
487 *
488 * Currently always returns zero, as blocking_notifier_chain_register()
489 * always returns zero.
490 */
491int register_reboot_notifier(struct notifier_block *nb)
492{
493 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
494}
495EXPORT_SYMBOL(register_reboot_notifier);
496
497/**
498 * unregister_reboot_notifier - Unregister previously registered reboot notifier
499 * @nb: Hook to be unregistered
500 *
501 * Unregisters a previously registered reboot
502 * notifier function.
503 *
504 * Returns zero on success, or %-ENOENT on failure.
505 */
506int unregister_reboot_notifier(struct notifier_block *nb)
507{
508 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
509}
510EXPORT_SYMBOL(unregister_reboot_notifier);
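
A sketch of a driver hooking the reboot chain to quiesce hardware; the
handler body is an assumption:

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int quiesce_hw(struct notifier_block *nb, unsigned long event, void *p)
{
	/* event is SYS_DOWN, SYS_HALT or SYS_POWER_OFF */
	/* stop DMA engines, flush write caches, ... */
	return NOTIFY_DONE;
}

static struct notifier_block quiesce_nb = {
	.notifier_call = quiesce_hw,
};

static int __init quiesce_init(void)
{
	return register_reboot_notifier(&quiesce_nb);
}

static void __exit quiesce_exit(void)
{
	unregister_reboot_notifier(&quiesce_nb);
}

module_init(quiesce_init);
module_exit(quiesce_exit);
MODULE_LICENSE("GPL");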
511
512static ATOMIC_NOTIFIER_HEAD(die_chain);
513
514int notify_die(enum die_val val, const char *str,
515 struct pt_regs *regs, long err, int trap, int sig)
516{
517 struct die_args args = {
518 .regs = regs,
519 .str = str,
520 .err = err,
521 .trapnr = trap,
522 .signr = sig,
523
524 };
525 return atomic_notifier_call_chain(&die_chain, val, &args);
526}
527
528int register_die_notifier(struct notifier_block *nb)
529{
530 vmalloc_sync_all();
531 return atomic_notifier_chain_register(&die_chain, nb);
532}
533EXPORT_SYMBOL_GPL(register_die_notifier);
534
535int unregister_die_notifier(struct notifier_block *nb)
536{
537 return atomic_notifier_chain_unregister(&die_chain, nb);
538}
539EXPORT_SYMBOL_GPL(unregister_die_notifier);
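
A hedged sketch of a die-chain consumer; die_val values are architecture
specific (DIE_OOPS on x86, for instance), so the handler below only logs the
generic struct die_args fields:

#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

static int sample_die_handler(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct die_args *args = data;

	printk(KERN_INFO "die event %lu: %s (trap %d, err %ld)\n",
	       val, args->str ? args->str : "?", args->trapnr, args->err);
	return NOTIFY_DONE;
}

static struct notifier_block sample_die_nb = {
	.notifier_call = sample_die_handler,
};

static int __init sample_die_init(void)
{
	return register_die_notifier(&sample_die_nb);
}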
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
new file mode 100644
index 000000000000..aead4d69f62b
--- /dev/null
+++ b/kernel/ns_cgroup.c
@@ -0,0 +1,100 @@
1/*
2 * ns_cgroup.c - namespace cgroup subsystem
3 *
4 * Copyright 2006, 2007 IBM Corp
5 */
6
7#include <linux/module.h>
8#include <linux/cgroup.h>
9#include <linux/fs.h>
10
11struct ns_cgroup {
12 struct cgroup_subsys_state css;
13 spinlock_t lock;
14};
15
16struct cgroup_subsys ns_subsys;
17
18static inline struct ns_cgroup *cgroup_to_ns(
19 struct cgroup *cgroup)
20{
21 return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
22 struct ns_cgroup, css);
23}
24
25int ns_cgroup_clone(struct task_struct *task)
26{
27 return cgroup_clone(task, &ns_subsys);
28}
29
30/*
31 * Rules:
32 * 1. you can only enter a cgroup which is a child of your current
33 * cgroup
34 * 2. you can only place another process into a cgroup if
35 * a. you have CAP_SYS_ADMIN
36 * b. your cgroup is an ancestor of task's destination cgroup
37 * (hence either you are in the same cgroup as task, or in an
38 * ancestor cgroup thereof)
39 */
40static int ns_can_attach(struct cgroup_subsys *ss,
41 struct cgroup *new_cgroup, struct task_struct *task)
42{
43 struct cgroup *orig;
44
45 if (current != task) {
46 if (!capable(CAP_SYS_ADMIN))
47 return -EPERM;
48
49 if (!cgroup_is_descendant(new_cgroup))
50 return -EPERM;
51 }
52
53 if (atomic_read(&new_cgroup->count) != 0)
54 return -EPERM;
55
56 orig = task_cgroup(task, ns_subsys_id);
57 if (orig && orig != new_cgroup->parent)
58 return -EPERM;
59
60 return 0;
61}
62
63/*
64 * Rules: you can only create a cgroup if
65 * 1. you are capable(CAP_SYS_ADMIN)
66 * 2. the target cgroup is a descendant of your own cgroup
67 */
68static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
69 struct cgroup *cgroup)
70{
71 struct ns_cgroup *ns_cgroup;
72
73 if (!capable(CAP_SYS_ADMIN))
74 return ERR_PTR(-EPERM);
75 if (!cgroup_is_descendant(cgroup))
76 return ERR_PTR(-EPERM);
77
78 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
79 if (!ns_cgroup)
80 return ERR_PTR(-ENOMEM);
81 spin_lock_init(&ns_cgroup->lock);
82 return &ns_cgroup->css;
83}
84
85static void ns_destroy(struct cgroup_subsys *ss,
86 struct cgroup *cgroup)
87{
88 struct ns_cgroup *ns_cgroup;
89
90 ns_cgroup = cgroup_to_ns(cgroup);
91 kfree(ns_cgroup);
92}
93
94struct cgroup_subsys ns_subsys = {
95 .name = "ns",
96 .can_attach = ns_can_attach,
97 .create = ns_create,
98 .destroy = ns_destroy,
99 .subsys_id = ns_subsys_id,
100};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a4fb7d46971f..79f871bc0ef4 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -20,24 +20,12 @@
20#include <linux/mnt_namespace.h> 20#include <linux/mnt_namespace.h>
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h>
23 24
24static struct kmem_cache *nsproxy_cachep; 25static struct kmem_cache *nsproxy_cachep;
25 26
26struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
27 28
28static inline void get_nsproxy(struct nsproxy *ns)
29{
30 atomic_inc(&ns->count);
31}
32
33void get_task_namespaces(struct task_struct *tsk)
34{
35 struct nsproxy *ns = tsk->nsproxy;
36 if (ns) {
37 get_nsproxy(ns);
38 }
39}
40
41/* 29/*
42 * creates a copy of "orig" with refcount 1. 30 * creates a copy of "orig" with refcount 1.
43 */ 31 */
@@ -86,7 +74,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
86 goto out_ipc; 74 goto out_ipc;
87 } 75 }
88 76
89 new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); 77 new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
90 if (IS_ERR(new_nsp->pid_ns)) { 78 if (IS_ERR(new_nsp->pid_ns)) {
91 err = PTR_ERR(new_nsp->pid_ns); 79 err = PTR_ERR(new_nsp->pid_ns);
92 goto out_pid; 80 goto out_pid;
@@ -98,8 +86,17 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
98 goto out_user; 86 goto out_user;
99 } 87 }
100 88
89 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
90 if (IS_ERR(new_nsp->net_ns)) {
91 err = PTR_ERR(new_nsp->net_ns);
92 goto out_net;
93 }
94
101 return new_nsp; 95 return new_nsp;
102 96
97out_net:
98 if (new_nsp->user_ns)
99 put_user_ns(new_nsp->user_ns);
103out_user: 100out_user:
104 if (new_nsp->pid_ns) 101 if (new_nsp->pid_ns)
105 put_pid_ns(new_nsp->pid_ns); 102 put_pid_ns(new_nsp->pid_ns);
@@ -132,7 +129,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
132 129
133 get_nsproxy(old_ns); 130 get_nsproxy(old_ns);
134 131
135 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER))) 132 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
133 CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET)))
136 return 0; 134 return 0;
137 135
138 if (!capable(CAP_SYS_ADMIN)) { 136 if (!capable(CAP_SYS_ADMIN)) {
@@ -146,7 +144,14 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
146 goto out; 144 goto out;
147 } 145 }
148 146
147 err = ns_cgroup_clone(tsk);
148 if (err) {
149 put_nsproxy(new_ns);
150 goto out;
151 }
152
149 tsk->nsproxy = new_ns; 153 tsk->nsproxy = new_ns;
154
150out: 155out:
151 put_nsproxy(old_ns); 156 put_nsproxy(old_ns);
152 return err; 157 return err;
@@ -164,6 +169,7 @@ void free_nsproxy(struct nsproxy *ns)
164 put_pid_ns(ns->pid_ns); 169 put_pid_ns(ns->pid_ns);
165 if (ns->user_ns) 170 if (ns->user_ns)
166 put_user_ns(ns->user_ns); 171 put_user_ns(ns->user_ns);
172 put_net(ns->net_ns);
167 kmem_cache_free(nsproxy_cachep, ns); 173 kmem_cache_free(nsproxy_cachep, ns);
168} 174}
169 175
@@ -177,7 +183,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
177 int err = 0; 183 int err = 0;
178 184
179 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 185 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
180 CLONE_NEWUSER))) 186 CLONE_NEWUSER | CLONE_NEWNET)))
181 return 0; 187 return 0;
182 188
183 if (!capable(CAP_SYS_ADMIN)) 189 if (!capable(CAP_SYS_ADMIN))
@@ -185,15 +191,49 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
185 191
186 *new_nsp = create_new_namespaces(unshare_flags, current, 192 *new_nsp = create_new_namespaces(unshare_flags, current,
187 new_fs ? new_fs : current->fs); 193 new_fs ? new_fs : current->fs);
188 if (IS_ERR(*new_nsp)) 194 if (IS_ERR(*new_nsp)) {
189 err = PTR_ERR(*new_nsp); 195 err = PTR_ERR(*new_nsp);
196 goto out;
197 }
198
199 err = ns_cgroup_clone(current);
200 if (err)
201 put_nsproxy(*new_nsp);
202
203out:
190 return err; 204 return err;
191} 205}
192 206
207void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
208{
209 struct nsproxy *ns;
210
211 might_sleep();
212
213 ns = p->nsproxy;
214
215 rcu_assign_pointer(p->nsproxy, new);
216
217 if (ns && atomic_dec_and_test(&ns->count)) {
218 /*
219 * wait for others to get what they want from this nsproxy.
220 *
221 * cannot release this nsproxy via the call_rcu() since
222 * put_mnt_ns() will want to sleep
223 */
224 synchronize_rcu();
225 free_nsproxy(ns);
226 }
227}
228
229void exit_task_namespaces(struct task_struct *p)
230{
231 switch_task_namespaces(p, NULL);
232}
233
193static int __init nsproxy_cache_init(void) 234static int __init nsproxy_cache_init(void)
194{ 235{
195 nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), 236 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
196 0, SLAB_PANIC, NULL);
197 return 0; 237 return 0;
198} 238}
199 239
diff --git a/kernel/panic.c b/kernel/panic.c
index f64f4c1ac11f..3886bd8230fe 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -56,14 +56,14 @@ EXPORT_SYMBOL(panic_blink);
56 * 56 *
57 * This function never returns. 57 * This function never returns.
58 */ 58 */
59 59
60NORET_TYPE void panic(const char * fmt, ...) 60NORET_TYPE void panic(const char * fmt, ...)
61{ 61{
62 long i; 62 long i;
63 static char buf[1024]; 63 static char buf[1024];
64 va_list args; 64 va_list args;
65#if defined(CONFIG_S390) 65#if defined(CONFIG_S390)
66 unsigned long caller = (unsigned long) __builtin_return_address(0); 66 unsigned long caller = (unsigned long) __builtin_return_address(0);
67#endif 67#endif
68 68
69 /* 69 /*
@@ -128,7 +128,7 @@ NORET_TYPE void panic(const char * fmt, ...)
128 } 128 }
129#endif 129#endif
130#if defined(CONFIG_S390) 130#if defined(CONFIG_S390)
131 disabled_wait(caller); 131 disabled_wait(caller);
132#endif 132#endif
133 local_irq_enable(); 133 local_irq_enable();
134 for (i = 0;;) { 134 for (i = 0;;) {
@@ -154,7 +154,7 @@ EXPORT_SYMBOL(panic);
154 * 154 *
155 * The string is overwritten by the next call to print_taint(). 155 * The string is overwritten by the next call to print_taint().
156 */ 156 */
157 157
158const char *print_tainted(void) 158const char *print_tainted(void)
159{ 159{
160 static char buf[20]; 160 static char buf[20];
@@ -164,7 +164,7 @@ const char *print_tainted(void)
164 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 164 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
165 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 165 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
166 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', 166 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
167 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 167 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
168 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 168 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
169 tainted & TAINT_USER ? 'U' : ' ', 169 tainted & TAINT_USER ? 'U' : ' ',
170 tainted & TAINT_DIE ? 'D' : ' '); 170 tainted & TAINT_DIE ? 'D' : ' ');
diff --git a/kernel/params.c b/kernel/params.c
index 4e57732fcfb4..16f269e9ddc9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -252,8 +252,9 @@ int param_get_bool(char *buffer, struct kernel_param *kp)
252int param_set_invbool(const char *val, struct kernel_param *kp) 252int param_set_invbool(const char *val, struct kernel_param *kp)
253{ 253{
254 int boolval, ret; 254 int boolval, ret;
255 struct kernel_param dummy = { .arg = &boolval }; 255 struct kernel_param dummy;
256 256
257 dummy.arg = &boolval;
257 ret = param_set_bool(val, &dummy); 258 ret = param_set_bool(val, &dummy);
258 if (ret == 0) 259 if (ret == 0)
259 *(int *)kp->arg = !boolval; 260 *(int *)kp->arg = !boolval;
@@ -262,11 +263,7 @@ int param_set_invbool(const char *val, struct kernel_param *kp)
262 263
263int param_get_invbool(char *buffer, struct kernel_param *kp) 264int param_get_invbool(char *buffer, struct kernel_param *kp)
264{ 265{
265 int val; 266 return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y');
266 struct kernel_param dummy = { .arg = &val };
267
268 val = !*(int *)kp->arg;
269 return param_get_bool(buffer, &dummy);
270} 267}
271 268
272/* We break the rule and mangle the string. */ 269/* We break the rule and mangle the string. */
@@ -325,7 +322,7 @@ static int param_array(const char *name,
325 322
326int param_array_set(const char *val, struct kernel_param *kp) 323int param_array_set(const char *val, struct kernel_param *kp)
327{ 324{
328 struct kparam_array *arr = kp->arg; 325 const struct kparam_array *arr = kp->arr;
329 unsigned int temp_num; 326 unsigned int temp_num;
330 327
331 return param_array(kp->name, val, 1, arr->max, arr->elem, 328 return param_array(kp->name, val, 1, arr->max, arr->elem,
@@ -335,7 +332,7 @@ int param_array_set(const char *val, struct kernel_param *kp)
335int param_array_get(char *buffer, struct kernel_param *kp) 332int param_array_get(char *buffer, struct kernel_param *kp)
336{ 333{
337 int i, off, ret; 334 int i, off, ret;
338 struct kparam_array *arr = kp->arg; 335 const struct kparam_array *arr = kp->arr;
339 struct kernel_param p; 336 struct kernel_param p;
340 337
341 p = *kp; 338 p = *kp;
@@ -354,7 +351,7 @@ int param_array_get(char *buffer, struct kernel_param *kp)
354 351
355int param_set_copystring(const char *val, struct kernel_param *kp) 352int param_set_copystring(const char *val, struct kernel_param *kp)
356{ 353{
357 struct kparam_string *kps = kp->arg; 354 const struct kparam_string *kps = kp->str;
358 355
359 if (!val) { 356 if (!val) {
360 printk(KERN_ERR "%s: missing param set value\n", kp->name); 357 printk(KERN_ERR "%s: missing param set value\n", kp->name);
@@ -371,7 +368,7 @@ int param_set_copystring(const char *val, struct kernel_param *kp)
371 368
372int param_get_string(char *buffer, struct kernel_param *kp) 369int param_get_string(char *buffer, struct kernel_param *kp)
373{ 370{
374 struct kparam_string *kps = kp->arg; 371 const struct kparam_string *kps = kp->str;
375 return strlcpy(buffer, kps->string, kps->maxlen); 372 return strlcpy(buffer, kps->string, kps->maxlen);
376} 373}
377 374
@@ -595,11 +592,17 @@ static void __init param_sysfs_builtin(void)
595 592
596 for (i=0; i < __stop___param - __start___param; i++) { 593 for (i=0; i < __stop___param - __start___param; i++) {
597 char *dot; 594 char *dot;
595 size_t kplen;
598 596
599 kp = &__start___param[i]; 597 kp = &__start___param[i];
598 kplen = strlen(kp->name);
600 599
601 /* We do not handle args without periods. */ 600 /* We do not handle args without periods. */
602 dot = memchr(kp->name, '.', MAX_KBUILD_MODNAME); 601 if (kplen > MAX_KBUILD_MODNAME) {
602 DEBUGP("kernel parameter name is too long: %s\n", kp->name);
603 continue;
604 }
605 dot = memchr(kp->name, '.', kplen);
603 if (!dot) { 606 if (!dot) {
604 DEBUGP("couldn't find period in %s\n", kp->name); 607 DEBUGP("couldn't find period in %s\n", kp->name);
605 continue; 608 continue;
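
The param_set_invbool()/param_get_invbool() rework above only changes how the inverted value is staged and printed; the declaration side is untouched. For reference, a hedged sketch of how a driver declares such a parameter (the module and parameter names here are invented for illustration):

#include <linux/module.h>
#include <linux/moduleparam.h>

/* Stored inverted: writing Y to disable_foo clears foo_enabled. */
static int foo_enabled = 1;
module_param_named(disable_foo, foo_enabled, invbool, 0644);
MODULE_PARM_DESC(disable_foo, "Y disables the (hypothetical) foo feature");
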
diff --git a/kernel/pid.c b/kernel/pid.c
index c6e3f9ffff87..d1db36b94674 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -18,6 +18,12 @@
18 * allocation scenario when all but one out of 1 million PIDs possible are 18 * allocation scenario when all but one out of 1 million PIDs possible are
19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE 19 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). 20 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
21 *
22 * Pid namespaces:
23 * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
24 * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
25 * Many thanks to Oleg Nesterov for comments and help
26 *
21 */ 27 */
22 28
23#include <linux/mm.h> 29#include <linux/mm.h>
@@ -28,12 +34,14 @@
28#include <linux/hash.h> 34#include <linux/hash.h>
29#include <linux/pid_namespace.h> 35#include <linux/pid_namespace.h>
30#include <linux/init_task.h> 36#include <linux/init_task.h>
37#include <linux/syscalls.h>
31 38
32#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 39#define pid_hashfn(nr, ns) \
40 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
33static struct hlist_head *pid_hash; 41static struct hlist_head *pid_hash;
34static int pidhash_shift; 42static int pidhash_shift;
35static struct kmem_cache *pid_cachep;
36struct pid init_struct_pid = INIT_STRUCT_PID; 43struct pid init_struct_pid = INIT_STRUCT_PID;
44static struct kmem_cache *pid_ns_cachep;
37 45
38int pid_max = PID_MAX_DEFAULT; 46int pid_max = PID_MAX_DEFAULT;
39 47
@@ -68,8 +76,25 @@ struct pid_namespace init_pid_ns = {
68 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 76 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
69 }, 77 },
70 .last_pid = 0, 78 .last_pid = 0,
71 .child_reaper = &init_task 79 .level = 0,
80 .child_reaper = &init_task,
72}; 81};
82EXPORT_SYMBOL_GPL(init_pid_ns);
83
84int is_container_init(struct task_struct *tsk)
85{
86 int ret = 0;
87 struct pid *pid;
88
89 rcu_read_lock();
90 pid = task_pid(tsk);
91 if (pid != NULL && pid->numbers[pid->level].nr == 1)
92 ret = 1;
93 rcu_read_unlock();
94
95 return ret;
96}
97EXPORT_SYMBOL(is_container_init);
73 98
74/* 99/*
75 * Note: disable interrupts while the pidmap_lock is held as an 100 * Note: disable interrupts while the pidmap_lock is held as an
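
is_container_init() above is the namespace-aware replacement for the old "pid == 1" special case: it looks at the task's pid number at its own (innermost) namespace level. A hedged usage sketch, not taken from this patch:

/*
 * Illustrative only: protect a namespace's init the way the global init
 * used to be protected by an explicit tsk->pid == 1 test.
 */
static int may_kill_task_sketch(struct task_struct *tsk)
{
        if (is_container_init(tsk))
                return 0;       /* init of its pid namespace is off limits */
        return 1;
}
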
@@ -176,11 +201,17 @@ static int next_pidmap(struct pid_namespace *pid_ns, int last)
176 201
177fastcall void put_pid(struct pid *pid) 202fastcall void put_pid(struct pid *pid)
178{ 203{
204 struct pid_namespace *ns;
205
179 if (!pid) 206 if (!pid)
180 return; 207 return;
208
209 ns = pid->numbers[pid->level].ns;
181 if ((atomic_read(&pid->count) == 1) || 210 if ((atomic_read(&pid->count) == 1) ||
182 atomic_dec_and_test(&pid->count)) 211 atomic_dec_and_test(&pid->count)) {
183 kmem_cache_free(pid_cachep, pid); 212 kmem_cache_free(ns->pid_cachep, pid);
213 put_pid_ns(ns);
214 }
184} 215}
185EXPORT_SYMBOL_GPL(put_pid); 216EXPORT_SYMBOL_GPL(put_pid);
186 217
@@ -193,60 +224,94 @@ static void delayed_put_pid(struct rcu_head *rhp)
193fastcall void free_pid(struct pid *pid) 224fastcall void free_pid(struct pid *pid)
194{ 225{
195 /* We can be called with write_lock_irq(&tasklist_lock) held */ 226 /* We can be called with write_lock_irq(&tasklist_lock) held */
227 int i;
196 unsigned long flags; 228 unsigned long flags;
197 229
198 spin_lock_irqsave(&pidmap_lock, flags); 230 spin_lock_irqsave(&pidmap_lock, flags);
199 hlist_del_rcu(&pid->pid_chain); 231 for (i = 0; i <= pid->level; i++)
232 hlist_del_rcu(&pid->numbers[i].pid_chain);
200 spin_unlock_irqrestore(&pidmap_lock, flags); 233 spin_unlock_irqrestore(&pidmap_lock, flags);
201 234
202 free_pidmap(&init_pid_ns, pid->nr); 235 for (i = 0; i <= pid->level; i++)
236 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr);
237
203 call_rcu(&pid->rcu, delayed_put_pid); 238 call_rcu(&pid->rcu, delayed_put_pid);
204} 239}
205 240
206struct pid *alloc_pid(void) 241struct pid *alloc_pid(struct pid_namespace *ns)
207{ 242{
208 struct pid *pid; 243 struct pid *pid;
209 enum pid_type type; 244 enum pid_type type;
210 int nr = -1; 245 int i, nr;
246 struct pid_namespace *tmp;
247 struct upid *upid;
211 248
212 pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL); 249 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
213 if (!pid) 250 if (!pid)
214 goto out; 251 goto out;
215 252
216 nr = alloc_pidmap(current->nsproxy->pid_ns); 253 tmp = ns;
217 if (nr < 0) 254 for (i = ns->level; i >= 0; i--) {
218 goto out_free; 255 nr = alloc_pidmap(tmp);
256 if (nr < 0)
257 goto out_free;
258
259 pid->numbers[i].nr = nr;
260 pid->numbers[i].ns = tmp;
261 tmp = tmp->parent;
262 }
219 263
264 get_pid_ns(ns);
265 pid->level = ns->level;
220 atomic_set(&pid->count, 1); 266 atomic_set(&pid->count, 1);
221 pid->nr = nr;
222 for (type = 0; type < PIDTYPE_MAX; ++type) 267 for (type = 0; type < PIDTYPE_MAX; ++type)
223 INIT_HLIST_HEAD(&pid->tasks[type]); 268 INIT_HLIST_HEAD(&pid->tasks[type]);
224 269
225 spin_lock_irq(&pidmap_lock); 270 spin_lock_irq(&pidmap_lock);
226 hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]); 271 for (i = ns->level; i >= 0; i--) {
272 upid = &pid->numbers[i];
273 hlist_add_head_rcu(&upid->pid_chain,
274 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
275 }
227 spin_unlock_irq(&pidmap_lock); 276 spin_unlock_irq(&pidmap_lock);
228 277
229out: 278out:
230 return pid; 279 return pid;
231 280
232out_free: 281out_free:
233 kmem_cache_free(pid_cachep, pid); 282 for (i++; i <= ns->level; i++)
283 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr);
284
285 kmem_cache_free(ns->pid_cachep, pid);
234 pid = NULL; 286 pid = NULL;
235 goto out; 287 goto out;
236} 288}
237 289
238struct pid * fastcall find_pid(int nr) 290struct pid * fastcall find_pid_ns(int nr, struct pid_namespace *ns)
239{ 291{
240 struct hlist_node *elem; 292 struct hlist_node *elem;
241 struct pid *pid; 293 struct upid *pnr;
294
295 hlist_for_each_entry_rcu(pnr, elem,
296 &pid_hash[pid_hashfn(nr, ns)], pid_chain)
297 if (pnr->nr == nr && pnr->ns == ns)
298 return container_of(pnr, struct pid,
299 numbers[ns->level]);
242 300
243 hlist_for_each_entry_rcu(pid, elem,
244 &pid_hash[pid_hashfn(nr)], pid_chain) {
245 if (pid->nr == nr)
246 return pid;
247 }
248 return NULL; 301 return NULL;
249} 302}
303EXPORT_SYMBOL_GPL(find_pid_ns);
304
305struct pid *find_vpid(int nr)
306{
307 return find_pid_ns(nr, current->nsproxy->pid_ns);
308}
309EXPORT_SYMBOL_GPL(find_vpid);
310
311struct pid *find_pid(int nr)
312{
313 return find_pid_ns(nr, &init_pid_ns);
314}
250EXPORT_SYMBOL_GPL(find_pid); 315EXPORT_SYMBOL_GPL(find_pid);
251 316
252/* 317/*
@@ -307,12 +372,32 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
307/* 372/*
308 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 373 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
309 */ 374 */
310struct task_struct *find_task_by_pid_type(int type, int nr) 375struct task_struct *find_task_by_pid_type_ns(int type, int nr,
376 struct pid_namespace *ns)
311{ 377{
312 return pid_task(find_pid(nr), type); 378 return pid_task(find_pid_ns(nr, ns), type);
313} 379}
314 380
315EXPORT_SYMBOL(find_task_by_pid_type); 381EXPORT_SYMBOL(find_task_by_pid_type_ns);
382
383struct task_struct *find_task_by_pid(pid_t nr)
384{
385 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns);
386}
387EXPORT_SYMBOL(find_task_by_pid);
388
389struct task_struct *find_task_by_vpid(pid_t vnr)
390{
391 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
392 current->nsproxy->pid_ns);
393}
394EXPORT_SYMBOL(find_task_by_vpid);
395
396struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
397{
398 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
399}
400EXPORT_SYMBOL(find_task_by_pid_ns);
316 401
317struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 402struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
318{ 403{
@@ -339,45 +424,239 @@ struct pid *find_get_pid(pid_t nr)
339 struct pid *pid; 424 struct pid *pid;
340 425
341 rcu_read_lock(); 426 rcu_read_lock();
342 pid = get_pid(find_pid(nr)); 427 pid = get_pid(find_vpid(nr));
343 rcu_read_unlock(); 428 rcu_read_unlock();
344 429
345 return pid; 430 return pid;
346} 431}
347 432
433pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
434{
435 struct upid *upid;
436 pid_t nr = 0;
437
438 if (pid && ns->level <= pid->level) {
439 upid = &pid->numbers[ns->level];
440 if (upid->ns == ns)
441 nr = upid->nr;
442 }
443 return nr;
444}
445
446pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
447{
448 return pid_nr_ns(task_pid(tsk), ns);
449}
450EXPORT_SYMBOL(task_pid_nr_ns);
451
452pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
453{
454 return pid_nr_ns(task_tgid(tsk), ns);
455}
456EXPORT_SYMBOL(task_tgid_nr_ns);
457
458pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
459{
460 return pid_nr_ns(task_pgrp(tsk), ns);
461}
462EXPORT_SYMBOL(task_pgrp_nr_ns);
463
464pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
465{
466 return pid_nr_ns(task_session(tsk), ns);
467}
468EXPORT_SYMBOL(task_session_nr_ns);
469
348/* 470/*
349 * Used by proc to find the first pid that is greater than or equal to nr. 471 * Used by proc to find the first pid that is greater than or equal to nr.
350 * 472 *
351 * If there is a pid at nr this function is exactly the same as find_pid. 473 * If there is a pid at nr this function is exactly the same as find_pid.
352 */ 474 */
353struct pid *find_ge_pid(int nr) 475struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
354{ 476{
355 struct pid *pid; 477 struct pid *pid;
356 478
357 do { 479 do {
358 pid = find_pid(nr); 480 pid = find_pid_ns(nr, ns);
359 if (pid) 481 if (pid)
360 break; 482 break;
361 nr = next_pidmap(current->nsproxy->pid_ns, nr); 483 nr = next_pidmap(ns, nr);
362 } while (nr > 0); 484 } while (nr > 0);
363 485
364 return pid; 486 return pid;
365} 487}
366EXPORT_SYMBOL_GPL(find_get_pid); 488EXPORT_SYMBOL_GPL(find_get_pid);
367 489
490struct pid_cache {
491 int nr_ids;
492 char name[16];
493 struct kmem_cache *cachep;
494 struct list_head list;
495};
496
497static LIST_HEAD(pid_caches_lh);
498static DEFINE_MUTEX(pid_caches_mutex);
499
500/*
501 * creates the kmem cache to allocate pids from.
502 * @nr_ids: the number of numerical ids this pid will have to carry
503 */
504
505static struct kmem_cache *create_pid_cachep(int nr_ids)
506{
507 struct pid_cache *pcache;
508 struct kmem_cache *cachep;
509
510 mutex_lock(&pid_caches_mutex);
511 list_for_each_entry (pcache, &pid_caches_lh, list)
512 if (pcache->nr_ids == nr_ids)
513 goto out;
514
515 pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
516 if (pcache == NULL)
517 goto err_alloc;
518
519 snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
520 cachep = kmem_cache_create(pcache->name,
521 sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
522 0, SLAB_HWCACHE_ALIGN, NULL);
523 if (cachep == NULL)
524 goto err_cachep;
525
526 pcache->nr_ids = nr_ids;
527 pcache->cachep = cachep;
528 list_add(&pcache->list, &pid_caches_lh);
529out:
530 mutex_unlock(&pid_caches_mutex);
531 return pcache->cachep;
532
533err_cachep:
534 kfree(pcache);
535err_alloc:
536 mutex_unlock(&pid_caches_mutex);
537 return NULL;
538}
539
540static struct pid_namespace *create_pid_namespace(int level)
541{
542 struct pid_namespace *ns;
543 int i;
544
545 ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
546 if (ns == NULL)
547 goto out;
548
549 ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
550 if (!ns->pidmap[0].page)
551 goto out_free;
552
553 ns->pid_cachep = create_pid_cachep(level + 1);
554 if (ns->pid_cachep == NULL)
555 goto out_free_map;
556
557 kref_init(&ns->kref);
558 ns->last_pid = 0;
559 ns->child_reaper = NULL;
560 ns->level = level;
561
562 set_bit(0, ns->pidmap[0].page);
563 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
564
565 for (i = 1; i < PIDMAP_ENTRIES; i++) {
566 ns->pidmap[i].page = 0;
567 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
568 }
569
570 return ns;
571
572out_free_map:
573 kfree(ns->pidmap[0].page);
574out_free:
575 kmem_cache_free(pid_ns_cachep, ns);
576out:
577 return ERR_PTR(-ENOMEM);
578}
579
580static void destroy_pid_namespace(struct pid_namespace *ns)
581{
582 int i;
583
584 for (i = 0; i < PIDMAP_ENTRIES; i++)
585 kfree(ns->pidmap[i].page);
586 kmem_cache_free(pid_ns_cachep, ns);
587}
588
368struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 589struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
369{ 590{
591 struct pid_namespace *new_ns;
592
370 BUG_ON(!old_ns); 593 BUG_ON(!old_ns);
371 get_pid_ns(old_ns); 594 new_ns = get_pid_ns(old_ns);
372 return old_ns; 595 if (!(flags & CLONE_NEWPID))
596 goto out;
597
598 new_ns = ERR_PTR(-EINVAL);
599 if (flags & CLONE_THREAD)
600 goto out_put;
601
602 new_ns = create_pid_namespace(old_ns->level + 1);
603 if (!IS_ERR(new_ns))
604 new_ns->parent = get_pid_ns(old_ns);
605
606out_put:
607 put_pid_ns(old_ns);
608out:
609 return new_ns;
373} 610}
374 611
375void free_pid_ns(struct kref *kref) 612void free_pid_ns(struct kref *kref)
376{ 613{
377 struct pid_namespace *ns; 614 struct pid_namespace *ns, *parent;
378 615
379 ns = container_of(kref, struct pid_namespace, kref); 616 ns = container_of(kref, struct pid_namespace, kref);
380 kfree(ns); 617
618 parent = ns->parent;
619 destroy_pid_namespace(ns);
620
621 if (parent != NULL)
622 put_pid_ns(parent);
623}
624
625void zap_pid_ns_processes(struct pid_namespace *pid_ns)
626{
627 int nr;
628 int rc;
629
630 /*
631 * The last thread in the cgroup-init thread group is terminating.
632	 * Find remaining pids in the namespace, signal and wait for them
633 * to exit.
634 *
635	 * Note: This signals each thread in the namespace - even those that
636	 * belong to the same thread group. To avoid this, we would have
637	 * to walk the entire tasklist looking for processes in this
638 * namespace, but that could be unnecessarily expensive if the
639 * pid namespace has just a few processes. Or we need to
640 * maintain a tasklist for each pid namespace.
641 *
642 */
643 read_lock(&tasklist_lock);
644 nr = next_pidmap(pid_ns, 1);
645 while (nr > 0) {
646 kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
647 nr = next_pidmap(pid_ns, nr);
648 }
649 read_unlock(&tasklist_lock);
650
651 do {
652 clear_thread_flag(TIF_SIGPENDING);
653 rc = sys_wait4(-1, NULL, __WALL, NULL);
654 } while (rc != -ECHILD);
655
656
657 /* Child reaper for the pid namespace is going away */
658 pid_ns->child_reaper = NULL;
659 return;
381} 660}
382 661
383/* 662/*
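
pid_nr_ns() and the task_*_nr_ns() wrappers above are what make one task show up under different numbers in different namespaces: struct pid carries one numeric id per level, and the helper returns 0 when the pid is not visible from the namespace asked about. A hedged sketch of a /proc-style caller (illustrative; the nsproxy dereference is left unlocked for brevity):

/* Report target's pid as seen by observer, or 0 if it is not visible there. */
static pid_t pid_as_seen_by_sketch(struct task_struct *target,
                                   struct task_struct *observer)
{
        struct pid_namespace *ns = observer->nsproxy->pid_ns;

        return task_pid_nr_ns(target, ns);
}
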
@@ -412,5 +691,9 @@ void __init pidmap_init(void)
412 set_bit(0, init_pid_ns.pidmap[0].page); 691 set_bit(0, init_pid_ns.pidmap[0].page);
413 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 692 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
414 693
415 pid_cachep = KMEM_CACHE(pid, SLAB_PANIC); 694 init_pid_ns.pid_cachep = create_pid_cachep(1);
695 if (init_pid_ns.pid_cachep == NULL)
696 panic("Can't create pid_1 cachep\n");
697
698 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
416} 699}
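
The per-depth caches created by create_pid_cachep() above exist because struct pid now ends in a one-element numbers[] array and needs one extra struct upid per additional namespace level, hence the sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid) object size. A small stand-alone illustration of the same sizing idiom (plain user-space C with stand-in types, not kernel code):

#include <stdio.h>
#include <stdlib.h>

struct upid_demo { int nr; void *ns; };
struct pid_demo {
        int level;
        struct upid_demo numbers[1];    /* really numbers[level + 1] */
};

static struct pid_demo *alloc_pid_demo(int level)
{
        /* Same arithmetic as create_pid_cachep(): base object + extra upids. */
        size_t sz = sizeof(struct pid_demo) + level * sizeof(struct upid_demo);
        struct pid_demo *p = calloc(1, sz);

        if (p)
                p->level = level;
        return p;
}

int main(void)
{
        struct pid_demo *p = alloc_pid_demo(2);         /* three namespace levels */

        printf("level-2 object size: %zu bytes\n",
               sizeof(struct pid_demo) + 2 * sizeof(struct upid_demo));
        free(p);
        return 0;
}
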
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index b53c8fcd9d82..68c96376e84a 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -21,8 +21,8 @@ static int check_clock(const clockid_t which_clock)
21 21
22 read_lock(&tasklist_lock); 22 read_lock(&tasklist_lock);
23 p = find_task_by_pid(pid); 23 p = find_task_by_pid(pid);
24 if (!p || (CPUCLOCK_PERTHREAD(which_clock) ? 24 if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
25 p->tgid != current->tgid : p->tgid != pid)) { 25 same_thread_group(p, current) : thread_group_leader(p))) {
26 error = -EINVAL; 26 error = -EINVAL;
27 } 27 }
28 read_unlock(&tasklist_lock); 28 read_unlock(&tasklist_lock);
@@ -308,13 +308,13 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
308 p = find_task_by_pid(pid); 308 p = find_task_by_pid(pid);
309 if (p) { 309 if (p) {
310 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 if (CPUCLOCK_PERTHREAD(which_clock)) {
311 if (p->tgid == current->tgid) { 311 if (same_thread_group(p, current)) {
312 error = cpu_clock_sample(which_clock, 312 error = cpu_clock_sample(which_clock,
313 p, &rtn); 313 p, &rtn);
314 } 314 }
315 } else { 315 } else {
316 read_lock(&tasklist_lock); 316 read_lock(&tasklist_lock);
317 if (p->tgid == pid && p->signal) { 317 if (thread_group_leader(p) && p->signal) {
318 error = 318 error =
319 cpu_clock_sample_group(which_clock, 319 cpu_clock_sample_group(which_clock,
320 p, &rtn); 320 p, &rtn);
@@ -355,7 +355,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
355 p = current; 355 p = current;
356 } else { 356 } else {
357 p = find_task_by_pid(pid); 357 p = find_task_by_pid(pid);
358 if (p && p->tgid != current->tgid) 358 if (p && !same_thread_group(p, current))
359 p = NULL; 359 p = NULL;
360 } 360 }
361 } else { 361 } else {
@@ -363,7 +363,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
363 p = current->group_leader; 363 p = current->group_leader;
364 } else { 364 } else {
365 p = find_task_by_pid(pid); 365 p = find_task_by_pid(pid);
366 if (p && p->tgid != pid) 366 if (p && !thread_group_leader(p))
367 p = NULL; 367 p = NULL;
368 } 368 }
369 } 369 }
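
The posix-cpu-timers conversions above replace raw p->tgid comparisons with same_thread_group() and thread_group_leader() so the checks compare task identity rather than user-visible numbers, which are no longer a safe proxy once pid namespaces are in the picture. Roughly, the intended semantics are as follows (a sketch only; the real helpers live in <linux/sched.h> and may be written differently):

static inline int same_thread_group_sketch(struct task_struct *p1,
                                           struct task_struct *p2)
{
        return p1->group_leader == p2->group_leader;    /* same process */
}

static inline int thread_group_leader_sketch(struct task_struct *p)
{
        return p == p->group_leader;                    /* the main thread */
}
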
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 7a15afb73ed0..35b4bbfc78ff 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -241,7 +241,8 @@ static __init int init_posix_timers(void)
241 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 241 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
242 242
243 posix_timers_cache = kmem_cache_create("posix_timers_cache", 243 posix_timers_cache = kmem_cache_create("posix_timers_cache",
244 sizeof (struct k_itimer), 0, 0, NULL); 244 sizeof (struct k_itimer), 0, SLAB_PANIC,
245 NULL);
245 idr_init(&posix_timers_id); 246 idr_init(&posix_timers_id);
246 return 0; 247 return 0;
247} 248}
@@ -403,7 +404,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
403 404
404 if ((event->sigev_notify & SIGEV_THREAD_ID ) && 405 if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
405 (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || 406 (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
406 rtn->tgid != current->tgid || 407 !same_thread_group(rtn, current) ||
407 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) 408 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
408 return NULL; 409 return NULL;
409 410
@@ -607,7 +608,7 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
607 spin_lock(&timr->it_lock); 608 spin_lock(&timr->it_lock);
608 609
609 if ((timr->it_id != timer_id) || !(timr->it_process) || 610 if ((timr->it_id != timer_id) || !(timr->it_process) ||
610 timr->it_process->tgid != current->tgid) { 611 !same_thread_group(timr->it_process, current)) {
611 spin_unlock(&timr->it_lock); 612 spin_unlock(&timr->it_lock);
612 spin_unlock_irqrestore(&idr_lock, *flags); 613 spin_unlock_irqrestore(&idr_lock, *flags);
613 timr = NULL; 614 timr = NULL;
@@ -712,7 +713,7 @@ sys_timer_getoverrun(timer_t timer_id)
712{ 713{
713 struct k_itimer *timr; 714 struct k_itimer *timr;
714 int overrun; 715 int overrun;
715 long flags; 716 unsigned long flags;
716 717
717 timr = lock_timer(timer_id, &flags); 718 timr = lock_timer(timer_id, &flags);
718 if (!timr) 719 if (!timr)
@@ -784,7 +785,7 @@ sys_timer_settime(timer_t timer_id, int flags,
784 struct k_itimer *timr; 785 struct k_itimer *timr;
785 struct itimerspec new_spec, old_spec; 786 struct itimerspec new_spec, old_spec;
786 int error = 0; 787 int error = 0;
787 long flag; 788 unsigned long flag;
788 struct itimerspec *rtn = old_setting ? &old_spec : NULL; 789 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
789 790
790 if (!new_setting) 791 if (!new_setting)
@@ -836,7 +837,7 @@ asmlinkage long
836sys_timer_delete(timer_t timer_id) 837sys_timer_delete(timer_t timer_id)
837{ 838{
838 struct k_itimer *timer; 839 struct k_itimer *timer;
839 long flags; 840 unsigned long flags;
840 841
841retry_delete: 842retry_delete:
842 timer = lock_timer(timer_id, &flags); 843 timer = lock_timer(timer_id, &flags);
@@ -980,9 +981,20 @@ sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp)
980static int common_nsleep(const clockid_t which_clock, int flags, 981static int common_nsleep(const clockid_t which_clock, int flags,
981 struct timespec *tsave, struct timespec __user *rmtp) 982 struct timespec *tsave, struct timespec __user *rmtp)
982{ 983{
983 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 984 struct timespec rmt;
984 HRTIMER_MODE_ABS : HRTIMER_MODE_REL, 985 int ret;
985 which_clock); 986
987 ret = hrtimer_nanosleep(tsave, rmtp ? &rmt : NULL,
988 flags & TIMER_ABSTIME ?
989 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
990 which_clock);
991
992 if (ret && rmtp) {
993 if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
994 return -EFAULT;
995 }
996
997 return ret;
986} 998}
987 999
988asmlinkage long 1000asmlinkage long
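
Two independent fixes sit in the posix-timers hunks above: common_nsleep() now stages the remaining time in a kernel-side struct timespec and copies it to the user pointer only when the sleep was interrupted, and the lock_timer() callers switch their flags variables to unsigned long, the type the IRQ-saving lock helpers are defined on. A minimal sketch of that locking pattern (illustrative only):

static void poke_timer_sketch(spinlock_t *lock)
{
        unsigned long flags;    /* not "long": saved IRQ state is an unsigned long */

        spin_lock_irqsave(lock, flags);
        /* ... touch timer state ... */
        spin_unlock_irqrestore(lock, flags);
}
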
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 14b0e10dc95c..8e186c678149 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -44,17 +44,6 @@ config PM_VERBOSE
44 ---help--- 44 ---help---
45 This option enables verbose messages from the Power Management code. 45 This option enables verbose messages from the Power Management code.
46 46
47config DISABLE_CONSOLE_SUSPEND
48 bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
49 depends on PM_DEBUG && PM_SLEEP
50 default n
51 ---help---
52 This option turns off the console suspend mechanism that prevents
53 debug messages from reaching the console during the suspend/resume
54 operations. This may be helpful when debugging device drivers'
55 suspend/resume routines, but may itself lead to problems, for example
56 if netconsole is used.
57
58config PM_TRACE 47config PM_TRACE
59 bool "Suspend/resume event tracing" 48 bool "Suspend/resume event tracing"
60 depends on PM_DEBUG && X86 && PM_SLEEP && EXPERIMENTAL 49 depends on PM_DEBUG && X86 && PM_SLEEP && EXPERIMENTAL
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index eb72255b5c86..8b15f777010a 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -45,17 +45,18 @@ enum {
45 45
46static int hibernation_mode = HIBERNATION_SHUTDOWN; 46static int hibernation_mode = HIBERNATION_SHUTDOWN;
47 47
48static struct hibernation_ops *hibernation_ops; 48static struct platform_hibernation_ops *hibernation_ops;
49 49
50/** 50/**
51 * hibernation_set_ops - set the global hibernate operations 51 * hibernation_set_ops - set the global hibernate operations
52 * @ops: the hibernation operations to use in subsequent hibernation transitions 52 * @ops: the hibernation operations to use in subsequent hibernation transitions
53 */ 53 */
54 54
55void hibernation_set_ops(struct hibernation_ops *ops) 55void hibernation_set_ops(struct platform_hibernation_ops *ops)
56{ 56{
57 if (ops && !(ops->prepare && ops->enter && ops->finish 57 if (ops && !(ops->start && ops->pre_snapshot && ops->finish
58 && ops->pre_restore && ops->restore_cleanup)) { 58 && ops->prepare && ops->enter && ops->pre_restore
59 && ops->restore_cleanup)) {
59 WARN_ON(1); 60 WARN_ON(1);
60 return; 61 return;
61 } 62 }
@@ -69,16 +70,37 @@ void hibernation_set_ops(struct hibernation_ops *ops)
69 mutex_unlock(&pm_mutex); 70 mutex_unlock(&pm_mutex);
70} 71}
71 72
73/**
74 * platform_start - tell the platform driver that we're starting
75 * hibernation
76 */
77
78static int platform_start(int platform_mode)
79{
80 return (platform_mode && hibernation_ops) ?
81 hibernation_ops->start() : 0;
82}
72 83
73/** 84/**
74 * platform_prepare - prepare the machine for hibernation using the 85 * platform_pre_snapshot - prepare the machine for hibernation using the
75 * platform driver if so configured and return an error code if it fails 86 * platform driver if so configured and return an error code if it fails
76 */ 87 */
77 88
78static int platform_prepare(int platform_mode) 89static int platform_pre_snapshot(int platform_mode)
79{ 90{
80 return (platform_mode && hibernation_ops) ? 91 return (platform_mode && hibernation_ops) ?
81 hibernation_ops->prepare() : 0; 92 hibernation_ops->pre_snapshot() : 0;
93}
94
95/**
96 * platform_leave - prepare the machine for switching to the normal mode
97 * of operation using the platform driver (called with interrupts disabled)
98 */
99
100static void platform_leave(int platform_mode)
101{
102 if (platform_mode && hibernation_ops)
103 hibernation_ops->leave();
82} 104}
83 105
84/** 106/**
@@ -118,6 +140,51 @@ static void platform_restore_cleanup(int platform_mode)
118} 140}
119 141
120/** 142/**
143 * create_image - freeze devices that need to be frozen with interrupts
144 * off, create the hibernation image and thaw those devices. Control
145 * reappears in this routine after a restore.
146 */
147
148int create_image(int platform_mode)
149{
150 int error;
151
152 error = arch_prepare_suspend();
153 if (error)
154 return error;
155
156 local_irq_disable();
157 /* At this point, device_suspend() has been called, but *not*
158 * device_power_down(). We *must* call device_power_down() now.
159 * Otherwise, drivers for some devices (e.g. interrupt controllers)
160 * become desynchronized with the actual state of the hardware
161 * at resume time, and evil weirdness ensues.
162 */
163 error = device_power_down(PMSG_FREEZE);
164 if (error) {
165 printk(KERN_ERR "Some devices failed to power down, "
166			"aborting suspend\n");
167 goto Enable_irqs;
168 }
169
170 save_processor_state();
171 error = swsusp_arch_suspend();
172 if (error)
173 printk(KERN_ERR "Error %d while creating the image\n", error);
174 /* Restore control flow magically appears here */
175 restore_processor_state();
176 if (!in_suspend)
177 platform_leave(platform_mode);
178 /* NOTE: device_power_up() is just a resume() for devices
179 * that suspended with irqs off ... no overall powerup.
180 */
181 device_power_up();
182 Enable_irqs:
183 local_irq_enable();
184 return error;
185}
186
187/**
121 * hibernation_snapshot - quiesce devices and create the hibernation 188 * hibernation_snapshot - quiesce devices and create the hibernation
122 * snapshot image. 189 * snapshot image.
123 * @platform_mode - if set, use the platform driver, if available, to 190 * @platform_mode - if set, use the platform driver, if available, to
@@ -135,12 +202,16 @@ int hibernation_snapshot(int platform_mode)
135 if (error) 202 if (error)
136 return error; 203 return error;
137 204
205 error = platform_start(platform_mode);
206 if (error)
207 return error;
208
138 suspend_console(); 209 suspend_console();
139 error = device_suspend(PMSG_FREEZE); 210 error = device_suspend(PMSG_FREEZE);
140 if (error) 211 if (error)
141 goto Resume_console; 212 goto Resume_console;
142 213
143 error = platform_prepare(platform_mode); 214 error = platform_pre_snapshot(platform_mode);
144 if (error) 215 if (error)
145 goto Resume_devices; 216 goto Resume_devices;
146 217
@@ -148,7 +219,7 @@ int hibernation_snapshot(int platform_mode)
148 if (!error) { 219 if (!error) {
149 if (hibernation_mode != HIBERNATION_TEST) { 220 if (hibernation_mode != HIBERNATION_TEST) {
150 in_suspend = 1; 221 in_suspend = 1;
151 error = swsusp_suspend(); 222 error = create_image(platform_mode);
152 /* Control returns here after successful restore */ 223 /* Control returns here after successful restore */
153 } else { 224 } else {
154 printk("swsusp debug: Waiting for 5 seconds.\n"); 225 printk("swsusp debug: Waiting for 5 seconds.\n");
@@ -207,21 +278,50 @@ int hibernation_platform_enter(void)
207{ 278{
208 int error; 279 int error;
209 280
210 if (hibernation_ops) { 281 if (!hibernation_ops)
211 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 282 return -ENOSYS;
212 /* 283
213 * We have cancelled the power transition by running 284 /*
214 * hibernation_ops->finish() before saving the image, so we 285 * We have cancelled the power transition by running
215 * should let the firmware know that we're going to enter the 286 * hibernation_ops->finish() before saving the image, so we should let
216 * sleep state after all 287 * the firmware know that we're going to enter the sleep state after all
217 */ 288 */
218 error = hibernation_ops->prepare(); 289 error = hibernation_ops->start();
219 sysdev_shutdown(); 290 if (error)
220 if (!error) 291 return error;
221 error = hibernation_ops->enter(); 292
222 } else { 293 suspend_console();
223 error = -ENOSYS; 294 error = device_suspend(PMSG_SUSPEND);
295 if (error)
296 goto Resume_console;
297
298 error = hibernation_ops->prepare();
299 if (error)
300 goto Resume_devices;
301
302 error = disable_nonboot_cpus();
303 if (error)
304 goto Finish;
305
306 local_irq_disable();
307 error = device_power_down(PMSG_SUSPEND);
308 if (!error) {
309 hibernation_ops->enter();
310 /* We should never get here */
311 while (1);
224 } 312 }
313 local_irq_enable();
314
315 /*
316 * We don't need to reenable the nonboot CPUs or resume consoles, since
317 * the system is going to be halted anyway.
318 */
319 Finish:
320 hibernation_ops->finish();
321 Resume_devices:
322 device_resume();
323 Resume_console:
324 resume_console();
225 return error; 325 return error;
226} 326}
227 327
@@ -238,14 +338,14 @@ static void power_down(void)
238 case HIBERNATION_TEST: 338 case HIBERNATION_TEST:
239 case HIBERNATION_TESTPROC: 339 case HIBERNATION_TESTPROC:
240 break; 340 break;
241 case HIBERNATION_SHUTDOWN:
242 kernel_power_off();
243 break;
244 case HIBERNATION_REBOOT: 341 case HIBERNATION_REBOOT:
245 kernel_restart(NULL); 342 kernel_restart(NULL);
246 break; 343 break;
247 case HIBERNATION_PLATFORM: 344 case HIBERNATION_PLATFORM:
248 hibernation_platform_enter(); 345 hibernation_platform_enter();
346 case HIBERNATION_SHUTDOWN:
347 kernel_power_off();
348 break;
249 } 349 }
250 kernel_halt(); 350 kernel_halt();
251 /* 351 /*
@@ -298,6 +398,10 @@ int hibernate(void)
298 if (error) 398 if (error)
299 goto Exit; 399 goto Exit;
300 400
401 printk("Syncing filesystems ... ");
402 sys_sync();
403 printk("done.\n");
404
301 error = prepare_processes(); 405 error = prepare_processes();
302 if (error) 406 if (error)
303 goto Finish; 407 goto Finish;
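
The disk.c rework above splits the old prepare/enter/finish trio into a richer platform_hibernation_ops set (start, pre_snapshot, leave, and so on) and folds swsusp_suspend() into create_image(). For orientation, a hedged sketch of a platform driver registering against the new interface; every acme_* name is hypothetical and the callback prototypes are inferred from how hibernation_ops is used above:

static int  acme_hib_start(void)           { return 0; } /* hibernation begins */
static int  acme_hib_pre_snapshot(void)    { return 0; } /* before the image is made */
static void acme_hib_finish(void)          { }
static int  acme_hib_prepare(void)         { return 0; } /* before powering down */
static int  acme_hib_enter(void)           { return 0; } /* enter the sleep state */
static void acme_hib_leave(void)           { }           /* IRQs off, after snapshot */
static int  acme_hib_pre_restore(void)     { return 0; }
static void acme_hib_restore_cleanup(void) { }

static struct platform_hibernation_ops acme_hibernation_ops = {
        .start           = acme_hib_start,
        .pre_snapshot    = acme_hib_pre_snapshot,
        .finish          = acme_hib_finish,
        .prepare         = acme_hib_prepare,
        .enter           = acme_hib_enter,
        .leave           = acme_hib_leave,
        .pre_restore     = acme_hib_pre_restore,
        .restore_cleanup = acme_hib_restore_cleanup,
};

static int __init acme_pm_init(void)
{
        hibernation_set_ops(&acme_hibernation_ops);
        return 0;
}
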
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 350b485b3b60..3cdf95b1dc92 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -20,6 +20,7 @@
20#include <linux/resume-trace.h> 20#include <linux/resume-trace.h>
21#include <linux/freezer.h> 21#include <linux/freezer.h>
22#include <linux/vmstat.h> 22#include <linux/vmstat.h>
23#include <linux/syscalls.h>
23 24
24#include "power.h" 25#include "power.h"
25 26
@@ -32,39 +33,32 @@ DEFINE_MUTEX(pm_mutex);
32/* This is just an arbitrary number */ 33/* This is just an arbitrary number */
33#define FREE_PAGE_NUMBER (100) 34#define FREE_PAGE_NUMBER (100)
34 35
35struct pm_ops *pm_ops; 36static struct platform_suspend_ops *suspend_ops;
36 37
37/** 38/**
38 * pm_set_ops - Set the global power method table. 39 * suspend_set_ops - Set the global suspend method table.
39 * @ops: Pointer to ops structure. 40 * @ops: Pointer to ops structure.
40 */ 41 */
41 42
42void pm_set_ops(struct pm_ops * ops) 43void suspend_set_ops(struct platform_suspend_ops *ops)
43{ 44{
44 mutex_lock(&pm_mutex); 45 mutex_lock(&pm_mutex);
45 pm_ops = ops; 46 suspend_ops = ops;
46 mutex_unlock(&pm_mutex); 47 mutex_unlock(&pm_mutex);
47} 48}
48 49
49/** 50/**
50 * pm_valid_only_mem - generic memory-only valid callback 51 * suspend_valid_only_mem - generic memory-only valid callback
51 * 52 *
52 * pm_ops drivers that implement mem suspend only and only need 53 * Platform drivers that implement mem suspend only and only need
53 * to check for that in their .valid callback can use this instead 54 * to check for that in their .valid callback can use this instead
54 * of rolling their own .valid callback. 55 * of rolling their own .valid callback.
55 */ 56 */
56int pm_valid_only_mem(suspend_state_t state) 57int suspend_valid_only_mem(suspend_state_t state)
57{ 58{
58 return state == PM_SUSPEND_MEM; 59 return state == PM_SUSPEND_MEM;
59} 60}
60 61
61
62static inline void pm_finish(suspend_state_t state)
63{
64 if (pm_ops->finish)
65 pm_ops->finish(state);
66}
67
68/** 62/**
69 * suspend_prepare - Do prep work before entering low-power state. 63 * suspend_prepare - Do prep work before entering low-power state.
70 * 64 *
@@ -76,7 +70,7 @@ static int suspend_prepare(void)
76 int error; 70 int error;
77 unsigned int free_pages; 71 unsigned int free_pages;
78 72
79 if (!pm_ops || !pm_ops->enter) 73 if (!suspend_ops || !suspend_ops->enter)
80 return -EPERM; 74 return -EPERM;
81 75
82 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); 76 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
@@ -128,7 +122,7 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
128 * 122 *
129 * This function should be called after devices have been suspended. 123 * This function should be called after devices have been suspended.
130 */ 124 */
131int suspend_enter(suspend_state_t state) 125static int suspend_enter(suspend_state_t state)
132{ 126{
133 int error = 0; 127 int error = 0;
134 128
@@ -139,7 +133,7 @@ int suspend_enter(suspend_state_t state)
139 printk(KERN_ERR "Some devices failed to power down\n"); 133 printk(KERN_ERR "Some devices failed to power down\n");
140 goto Done; 134 goto Done;
141 } 135 }
142 error = pm_ops->enter(state); 136 error = suspend_ops->enter(state);
143 device_power_up(); 137 device_power_up();
144 Done: 138 Done:
145 arch_suspend_enable_irqs(); 139 arch_suspend_enable_irqs();
@@ -156,11 +150,11 @@ int suspend_devices_and_enter(suspend_state_t state)
156{ 150{
157 int error; 151 int error;
158 152
159 if (!pm_ops) 153 if (!suspend_ops)
160 return -ENOSYS; 154 return -ENOSYS;
161 155
162 if (pm_ops->set_target) { 156 if (suspend_ops->set_target) {
163 error = pm_ops->set_target(state); 157 error = suspend_ops->set_target(state);
164 if (error) 158 if (error)
165 return error; 159 return error;
166 } 160 }
@@ -170,8 +164,8 @@ int suspend_devices_and_enter(suspend_state_t state)
170 printk(KERN_ERR "Some devices failed to suspend\n"); 164 printk(KERN_ERR "Some devices failed to suspend\n");
171 goto Resume_console; 165 goto Resume_console;
172 } 166 }
173 if (pm_ops->prepare) { 167 if (suspend_ops->prepare) {
174 error = pm_ops->prepare(state); 168 error = suspend_ops->prepare();
175 if (error) 169 if (error)
176 goto Resume_devices; 170 goto Resume_devices;
177 } 171 }
@@ -180,7 +174,8 @@ int suspend_devices_and_enter(suspend_state_t state)
180 suspend_enter(state); 174 suspend_enter(state);
181 175
182 enable_nonboot_cpus(); 176 enable_nonboot_cpus();
183 pm_finish(state); 177 if (suspend_ops->finish)
178 suspend_ops->finish();
184 Resume_devices: 179 Resume_devices:
185 device_resume(); 180 device_resume();
186 Resume_console: 181 Resume_console:
@@ -214,7 +209,7 @@ static inline int valid_state(suspend_state_t state)
214 /* All states need lowlevel support and need to be valid 209 /* All states need lowlevel support and need to be valid
215 * to the lowlevel implementation, no valid callback 210 * to the lowlevel implementation, no valid callback
216 * implies that none are valid. */ 211 * implies that none are valid. */
217 if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state)) 212 if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state))
218 return 0; 213 return 0;
219 return 1; 214 return 1;
220} 215}
@@ -236,9 +231,14 @@ static int enter_state(suspend_state_t state)
236 231
237 if (!valid_state(state)) 232 if (!valid_state(state))
238 return -ENODEV; 233 return -ENODEV;
234
239 if (!mutex_trylock(&pm_mutex)) 235 if (!mutex_trylock(&pm_mutex))
240 return -EBUSY; 236 return -EBUSY;
241 237
238 printk("Syncing filesystems ... ");
239 sys_sync();
240 printk("done.\n");
241
242 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 242 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
243 if ((error = suspend_prepare())) 243 if ((error = suspend_prepare()))
244 goto Unlock; 244 goto Unlock;
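
The main.c changes above are mostly the pm_ops to platform_suspend_ops rename plus the argument-free .prepare()/.finish() callbacks and the shared suspend_valid_only_mem() helper. A hedged sketch of a driver on the new interface (acme_* is made up; only .enter is mandatory per the suspend_prepare() check above):

static int acme_suspend_enter(suspend_state_t state)
{
        /* program the hardware for PM_SUSPEND_MEM here */
        return 0;
}

static struct platform_suspend_ops acme_suspend_ops = {
        .valid  = suspend_valid_only_mem,       /* only "mem" is supported */
        .enter  = acme_suspend_enter,
};

static int __init acme_suspend_init(void)
{
        suspend_set_ops(&acme_suspend_ops);
        return 0;
}
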
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 95fbf2dd3fe3..195dc4611764 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -11,14 +11,32 @@ struct swsusp_info {
11 unsigned long size; 11 unsigned long size;
12} __attribute__((aligned(PAGE_SIZE))); 12} __attribute__((aligned(PAGE_SIZE)));
13 13
14#ifdef CONFIG_HIBERNATION
15#ifdef CONFIG_ARCH_HIBERNATION_HEADER
16/* Maximum size of architecture specific data in a hibernation header */
17#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
14 18
19extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
20extern int arch_hibernation_header_restore(void *addr);
21
22static inline int init_header_complete(struct swsusp_info *info)
23{
24 return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
25}
26
27static inline char *check_image_kernel(struct swsusp_info *info)
28{
29 return arch_hibernation_header_restore(info) ?
30 "architecture specific data" : NULL;
31}
32#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
15 33
16#ifdef CONFIG_HIBERNATION
17/* 34/*
18 * Keep some memory free so that I/O operations can succeed without paging 35 * Keep some memory free so that I/O operations can succeed without paging
19 * [Might this be more than 4 MB?] 36 * [Might this be more than 4 MB?]
20 */ 37 */
21#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT) 38#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT)
39
22/* 40/*
23 * Keep 1 MB of memory free so that device drivers can allocate some pages in 41 * Keep 1 MB of memory free so that device drivers can allocate some pages in
24 * their .suspend() routines without breaking the suspend to disk. 42 * their .suspend() routines without breaking the suspend to disk.
@@ -165,7 +183,6 @@ extern int swsusp_swap_in_use(void);
165extern int swsusp_check(void); 183extern int swsusp_check(void);
166extern int swsusp_shrink_memory(void); 184extern int swsusp_shrink_memory(void);
167extern void swsusp_free(void); 185extern void swsusp_free(void);
168extern int swsusp_suspend(void);
169extern int swsusp_resume(void); 186extern int swsusp_resume(void);
170extern int swsusp_read(unsigned int *flags_p); 187extern int swsusp_read(unsigned int *flags_p);
171extern int swsusp_write(unsigned int flags); 188extern int swsusp_write(unsigned int flags);
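
The power.h hunk above lets an architecture that selects CONFIG_ARCH_HIBERNATION_HEADER stash its own data in the image header instead of relying on the generic utsname check. A hedged sketch of the pair of hooks such an architecture would provide (payload and error codes are illustrative only):

struct acme_arch_hdr {
        unsigned long magic;
};

int arch_hibernation_header_save(void *addr, unsigned int max_size)
{
        struct acme_arch_hdr *hdr = addr;

        if (max_size < sizeof(*hdr))
                return -EOVERFLOW;
        hdr->magic = 0x41434d45UL;      /* "ACME": whatever must match on resume */
        return 0;
}

int arch_hibernation_header_restore(void *addr)
{
        const struct acme_arch_hdr *hdr = addr;

        /* non-zero makes check_image_kernel() above reject the image */
        return hdr->magic == 0x41434d45UL ? 0 : -EINVAL;
}
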
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 3434940a3df1..6533923e711b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -75,21 +75,79 @@ void refrigerator(void)
75 __set_current_state(save); 75 __set_current_state(save);
76} 76}
77 77
78static void freeze_task(struct task_struct *p) 78static void fake_signal_wake_up(struct task_struct *p, int resume)
79{ 79{
80 unsigned long flags; 80 unsigned long flags;
81 81
82 if (!freezing(p)) { 82 spin_lock_irqsave(&p->sighand->siglock, flags);
83 signal_wake_up(p, resume);
84 spin_unlock_irqrestore(&p->sighand->siglock, flags);
85}
86
87static void send_fake_signal(struct task_struct *p)
88{
89 if (p->state == TASK_STOPPED)
90 force_sig_specific(SIGSTOP, p);
91 fake_signal_wake_up(p, p->state == TASK_STOPPED);
92}
93
94static int has_mm(struct task_struct *p)
95{
96 return (p->mm && !(p->flags & PF_BORROWED_MM));
97}
98
99/**
100 * freeze_task - send a freeze request to given task
101 * @p: task to send the request to
102 * @with_mm_only: if set, the request will only be sent if the task has its
103 * own mm
104 * Return value: 0 if @with_mm_only is set and the task has no mm of its
105 * own or the task is frozen; 1 otherwise
106 *
107 * The freeze request is sent by setting the task's TIF_FREEZE flag and
108 * either sending a fake signal to it or waking it up, depending on whether
109 * or not it has its own mm (i.e. it is a userland task). If @with_mm_only
110 * is set and the task has no mm of its own (i.e. it is a kernel thread),
111 * its TIF_FREEZE flag should not be set.
112 *
113 * The task_lock() is necessary to prevent races with exit_mm() or
114 * use_mm()/unuse_mm() from occurring.
115 */
116static int freeze_task(struct task_struct *p, int with_mm_only)
117{
118 int ret = 1;
119
120 task_lock(p);
121 if (freezing(p)) {
122 if (has_mm(p)) {
123 if (!signal_pending(p))
124 fake_signal_wake_up(p, 0);
125 } else {
126 if (with_mm_only)
127 ret = 0;
128 else
129 wake_up_state(p, TASK_INTERRUPTIBLE);
130 }
131 } else {
83 rmb(); 132 rmb();
84 if (!frozen(p)) { 133 if (frozen(p)) {
85 set_freeze_flag(p); 134 ret = 0;
86 if (p->state == TASK_STOPPED) 135 } else {
87 force_sig_specific(SIGSTOP, p); 136 if (has_mm(p)) {
88 spin_lock_irqsave(&p->sighand->siglock, flags); 137 set_freeze_flag(p);
89 signal_wake_up(p, p->state == TASK_STOPPED); 138 send_fake_signal(p);
90 spin_unlock_irqrestore(&p->sighand->siglock, flags); 139 } else {
140 if (with_mm_only) {
141 ret = 0;
142 } else {
143 set_freeze_flag(p);
144 wake_up_state(p, TASK_INTERRUPTIBLE);
145 }
146 }
91 } 147 }
92 } 148 }
149 task_unlock(p);
150 return ret;
93} 151}
94 152
95static void cancel_freezing(struct task_struct *p) 153static void cancel_freezing(struct task_struct *p)
@@ -110,6 +168,11 @@ static int try_to_freeze_tasks(int freeze_user_space)
110 struct task_struct *g, *p; 168 struct task_struct *g, *p;
111 unsigned long end_time; 169 unsigned long end_time;
112 unsigned int todo; 170 unsigned int todo;
171 struct timeval start, end;
172 s64 elapsed_csecs64;
173 unsigned int elapsed_csecs;
174
175 do_gettimeofday(&start);
113 176
114 end_time = jiffies + TIMEOUT; 177 end_time = jiffies + TIMEOUT;
115 do { 178 do {
@@ -119,31 +182,14 @@ static int try_to_freeze_tasks(int freeze_user_space)
119 if (frozen(p) || !freezeable(p)) 182 if (frozen(p) || !freezeable(p))
120 continue; 183 continue;
121 184
122 if (freeze_user_space) { 185 if (p->state == TASK_TRACED && frozen(p->parent)) {
123 if (p->state == TASK_TRACED && 186 cancel_freezing(p);
124 frozen(p->parent)) { 187 continue;
125 cancel_freezing(p);
126 continue;
127 }
128 /*
129 * Kernel threads should not have TIF_FREEZE set
130 * at this point, so we must ensure that either
131 * p->mm is not NULL *and* PF_BORROWED_MM is
132 * unset, or TIF_FRREZE is left unset.
133 * The task_lock() is necessary to prevent races
134 * with exit_mm() or use_mm()/unuse_mm() from
135 * occuring.
136 */
137 task_lock(p);
138 if (!p->mm || (p->flags & PF_BORROWED_MM)) {
139 task_unlock(p);
140 continue;
141 }
142 freeze_task(p);
143 task_unlock(p);
144 } else {
145 freeze_task(p);
146 } 188 }
189
190 if (!freeze_task(p, freeze_user_space))
191 continue;
192
147 if (!freezer_should_skip(p)) 193 if (!freezer_should_skip(p))
148 todo++; 194 todo++;
149 } while_each_thread(g, p); 195 } while_each_thread(g, p);
@@ -153,6 +199,11 @@ static int try_to_freeze_tasks(int freeze_user_space)
153 break; 199 break;
154 } while (todo); 200 } while (todo);
155 201
202 do_gettimeofday(&end);
203 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
204 do_div(elapsed_csecs64, NSEC_PER_SEC / 100);
205 elapsed_csecs = elapsed_csecs64;
206
156 if (todo) { 207 if (todo) {
157 /* This does not unfreeze processes that are already frozen 208 /* This does not unfreeze processes that are already frozen
158 * (we have slightly ugly calling convention in that respect, 209 * (we have slightly ugly calling convention in that respect,
@@ -160,10 +211,9 @@ static int try_to_freeze_tasks(int freeze_user_space)
160 * but it cleans up leftover PF_FREEZE requests. 211 * but it cleans up leftover PF_FREEZE requests.
161 */ 212 */
162 printk("\n"); 213 printk("\n");
163 printk(KERN_ERR "Freezing of %s timed out after %d seconds " 214 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
164 "(%d tasks refusing to freeze):\n", 215 "(%d tasks refusing to freeze):\n",
165 freeze_user_space ? "user space " : "tasks ", 216 elapsed_csecs / 100, elapsed_csecs % 100, todo);
166 TIMEOUT / HZ, todo);
167 show_state(); 217 show_state();
168 read_lock(&tasklist_lock); 218 read_lock(&tasklist_lock);
169 do_each_thread(g, p) { 219 do_each_thread(g, p) {
@@ -174,6 +224,9 @@ static int try_to_freeze_tasks(int freeze_user_space)
174 task_unlock(p); 224 task_unlock(p);
175 } while_each_thread(g, p); 225 } while_each_thread(g, p);
176 read_unlock(&tasklist_lock); 226 read_unlock(&tasklist_lock);
227 } else {
228 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
229 elapsed_csecs % 100);
177 } 230 }
178 231
179 return todo ? -EBUSY : 0; 232 return todo ? -EBUSY : 0;
@@ -186,19 +239,21 @@ int freeze_processes(void)
186{ 239{
187 int error; 240 int error;
188 241
189 printk("Stopping tasks ... "); 242 printk("Freezing user space processes ... ");
190 error = try_to_freeze_tasks(FREEZER_USER_SPACE); 243 error = try_to_freeze_tasks(FREEZER_USER_SPACE);
191 if (error) 244 if (error)
192 return error; 245 goto Exit;
246 printk("done.\n");
193 247
194 sys_sync(); 248 printk("Freezing remaining freezable tasks ... ");
195 error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); 249 error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
196 if (error) 250 if (error)
197 return error; 251 goto Exit;
198 252 printk("done.");
199 printk("done.\n"); 253 Exit:
200 BUG_ON(in_atomic()); 254 BUG_ON(in_atomic());
201 return 0; 255 printk("\n");
256 return error;
202} 257}
203 258
204static void thaw_tasks(int thaw_user_space) 259static void thaw_tasks(int thaw_user_space)
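
The timing added to try_to_freeze_tasks() above keeps the elapsed time in hundredths of a second so it can be printed as seconds with two decimals without floating point: the timeval difference is converted to nanoseconds, divided by NSEC_PER_SEC/100 with do_div(), and split with /100 and %100. A stand-alone illustration of the same arithmetic (plain C, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL

int main(void)
{
        /* Pretend freezing took 1.234567890 s between the gettimeofday() calls. */
        int64_t elapsed_ns = 1234567890LL;
        int64_t csecs = elapsed_ns / (NSEC_PER_SEC / 100);      /* 123 centiseconds */

        printf("(elapsed %lld.%02lld seconds)\n",
               (long long)(csecs / 100), (long long)(csecs % 100));     /* 1.23 */
        return 0;
}
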
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index a686590d88c1..ccc95ac07bed 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1239,17 +1239,39 @@ asmlinkage int swsusp_save(void)
1239 return 0; 1239 return 0;
1240} 1240}
1241 1241
1242static void init_header(struct swsusp_info *info) 1242#ifndef CONFIG_ARCH_HIBERNATION_HEADER
1243static int init_header_complete(struct swsusp_info *info)
1243{ 1244{
1244 memset(info, 0, sizeof(struct swsusp_info)); 1245 memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
1245 info->version_code = LINUX_VERSION_CODE; 1246 info->version_code = LINUX_VERSION_CODE;
1247 return 0;
1248}
1249
1250static char *check_image_kernel(struct swsusp_info *info)
1251{
1252 if (info->version_code != LINUX_VERSION_CODE)
1253 return "kernel version";
1254 if (strcmp(info->uts.sysname,init_utsname()->sysname))
1255 return "system type";
1256 if (strcmp(info->uts.release,init_utsname()->release))
1257 return "kernel release";
1258 if (strcmp(info->uts.version,init_utsname()->version))
1259 return "version";
1260 if (strcmp(info->uts.machine,init_utsname()->machine))
1261 return "machine";
1262 return NULL;
1263}
1264#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
1265
1266static int init_header(struct swsusp_info *info)
1267{
1268 memset(info, 0, sizeof(struct swsusp_info));
1246 info->num_physpages = num_physpages; 1269 info->num_physpages = num_physpages;
1247 memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
1248 info->cpus = num_online_cpus();
1249 info->image_pages = nr_copy_pages; 1270 info->image_pages = nr_copy_pages;
1250 info->pages = nr_copy_pages + nr_meta_pages + 1; 1271 info->pages = nr_copy_pages + nr_meta_pages + 1;
1251 info->size = info->pages; 1272 info->size = info->pages;
1252 info->size <<= PAGE_SHIFT; 1273 info->size <<= PAGE_SHIFT;
1274 return init_header_complete(info);
1253} 1275}
1254 1276
1255/** 1277/**
@@ -1303,7 +1325,11 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1303 return -ENOMEM; 1325 return -ENOMEM;
1304 } 1326 }
1305 if (!handle->offset) { 1327 if (!handle->offset) {
1306 init_header((struct swsusp_info *)buffer); 1328 int error;
1329
1330 error = init_header((struct swsusp_info *)buffer);
1331 if (error)
1332 return error;
1307 handle->buffer = buffer; 1333 handle->buffer = buffer;
1308 memory_bm_position_reset(&orig_bm); 1334 memory_bm_position_reset(&orig_bm);
1309 memory_bm_position_reset(&copy_bm); 1335 memory_bm_position_reset(&copy_bm);
@@ -1394,22 +1420,13 @@ duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
1394 } 1420 }
1395} 1421}
1396 1422
1397static inline int check_header(struct swsusp_info *info) 1423static int check_header(struct swsusp_info *info)
1398{ 1424{
1399 char *reason = NULL; 1425 char *reason;
1400 1426
1401 if (info->version_code != LINUX_VERSION_CODE) 1427 reason = check_image_kernel(info);
1402 reason = "kernel version"; 1428 if (!reason && info->num_physpages != num_physpages)
1403 if (info->num_physpages != num_physpages)
1404 reason = "memory size"; 1429 reason = "memory size";
1405 if (strcmp(info->uts.sysname,init_utsname()->sysname))
1406 reason = "system type";
1407 if (strcmp(info->uts.release,init_utsname()->release))
1408 reason = "kernel release";
1409 if (strcmp(info->uts.version,init_utsname()->version))
1410 reason = "version";
1411 if (strcmp(info->uts.machine,init_utsname()->machine))
1412 reason = "machine";
1413 if (reason) { 1430 if (reason) {
1414 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); 1431 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
1415 return -EPERM; 1432 return -EPERM;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 5da304c8f1f6..e1722d3155f1 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -270,39 +270,6 @@ int swsusp_shrink_memory(void)
270 return 0; 270 return 0;
271} 271}
272 272
273int swsusp_suspend(void)
274{
275 int error;
276
277 if ((error = arch_prepare_suspend()))
278 return error;
279
280 local_irq_disable();
281 /* At this point, device_suspend() has been called, but *not*
282 * device_power_down(). We *must* device_power_down() now.
283 * Otherwise, drivers for some devices (e.g. interrupt controllers)
284 * become desynchronized with the actual state of the hardware
285 * at resume time, and evil weirdness ensues.
286 */
287 if ((error = device_power_down(PMSG_FREEZE))) {
288 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
289 goto Enable_irqs;
290 }
291
292 save_processor_state();
293 if ((error = swsusp_arch_suspend()))
294 printk(KERN_ERR "Error %d suspending\n", error);
295 /* Restore control flow magically appears here */
296 restore_processor_state();
297 /* NOTE: device_power_up() is just a resume() for devices
298 * that suspended with irqs off ... no overall powerup.
299 */
300 device_power_up();
301 Enable_irqs:
302 local_irq_enable();
303 return error;
304}
305
306int swsusp_resume(void) 273int swsusp_resume(void)
307{ 274{
308 int error; 275 int error;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bd0723a7df3f..5bd321bcbb75 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -153,6 +153,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
153 mutex_lock(&pm_mutex); 153 mutex_lock(&pm_mutex);
154 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 154 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
155 if (!error) { 155 if (!error) {
156 printk("Syncing filesystems ... ");
157 sys_sync();
158 printk("done.\n");
159
156 error = freeze_processes(); 160 error = freeze_processes();
157 if (error) 161 if (error)
158 thaw_processes(); 162 thaw_processes();
diff --git a/kernel/printk.c b/kernel/printk.c
index 8451dfc31d25..a30fe33de395 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -22,6 +22,8 @@
22#include <linux/tty_driver.h> 22#include <linux/tty_driver.h>
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/init.h> 24#include <linux/init.h>
25#include <linux/jiffies.h>
26#include <linux/nmi.h>
25#include <linux/module.h> 27#include <linux/module.h>
26#include <linux/moduleparam.h> 28#include <linux/moduleparam.h>
27#include <linux/interrupt.h> /* For in_interrupt() */ 29#include <linux/interrupt.h> /* For in_interrupt() */
@@ -162,6 +164,113 @@ out:
162 164
163__setup("log_buf_len=", log_buf_len_setup); 165__setup("log_buf_len=", log_buf_len_setup);
164 166
167#ifdef CONFIG_BOOT_PRINTK_DELAY
168
169static unsigned int boot_delay; /* msecs delay after each printk during bootup */
170static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */
171
172static int __init boot_delay_setup(char *str)
173{
174 unsigned long lpj;
175 unsigned long long loops_per_msec;
176
177 lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
178 loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
179
180 get_option(&str, &boot_delay);
181 if (boot_delay > 10 * 1000)
182 boot_delay = 0;
183
184 printk_delay_msec = loops_per_msec;
185 printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
186 "HZ: %d, printk_delay_msec: %llu\n",
187 boot_delay, preset_lpj, lpj, HZ, printk_delay_msec);
188 return 1;
189}
190__setup("boot_delay=", boot_delay_setup);
191
192static void boot_delay_msec(void)
193{
194 unsigned long long k;
195 unsigned long timeout;
196
197 if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
198 return;
199
200 k = (unsigned long long)printk_delay_msec * boot_delay;
201
202 timeout = jiffies + msecs_to_jiffies(boot_delay);
203 while (k) {
204 k--;
205 cpu_relax();
206 /*
207 * use (volatile) jiffies to prevent
208 * compiler reduction; loop termination via jiffies
209 * is secondary and may or may not happen.
210 */
211 if (time_after(jiffies, timeout))
212 break;
213 touch_nmi_watchdog();
214 }
215}
216#else
217static inline void boot_delay_msec(void)
218{
219}
220#endif
221
222/*
223 * Return the number of unread characters in the log buffer.
224 */
225int log_buf_get_len(void)
226{
227 return logged_chars;
228}
229
230/*
231 * Copy a range of characters from the log buffer.
232 */
233int log_buf_copy(char *dest, int idx, int len)
234{
235 int ret, max;
236 bool took_lock = false;
237
238 if (!oops_in_progress) {
239 spin_lock_irq(&logbuf_lock);
240 took_lock = true;
241 }
242
243 max = log_buf_get_len();
244 if (idx < 0 || idx >= max) {
245 ret = -1;
246 } else {
247 if (len > max)
248 len = max;
249 ret = len;
250 idx += (log_end - max);
251 while (len-- > 0)
252 dest[len] = LOG_BUF(idx + len);
253 }
254
255 if (took_lock)
256 spin_unlock_irq(&logbuf_lock);
257
258 return ret;
259}
260
261/*
262 * Extract a single character from the log buffer.
263 */
264int log_buf_read(int idx)
265{
266 char ret;
267
268 if (log_buf_copy(&ret, idx, 1) == 1)
269 return ret;
270 else
271 return -1;
272}
273
165/* 274/*
166 * Commands to do_syslog: 275 * Commands to do_syslog:
167 * 276 *
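A note on the new helpers above: log_buf_get_len(), log_buf_copy() and log_buf_read() expose read-only access to the printk ring buffer for in-kernel users. A minimal, hedged sketch of a hypothetical consumer (the function name and buffer size below are illustrative, not part of this patch):

/* Hypothetical caller: capture the first bytes of the kernel log. */
static void sketch_peek_log(void)
{
	char buf[64];
	int copied = log_buf_copy(buf, 0, sizeof(buf));	/* -1 if nothing logged */

	if (copied > 0)
		printk(KERN_INFO "captured %d logged bytes\n", copied);
}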
@@ -527,6 +636,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
527 static char printk_buf[1024]; 636 static char printk_buf[1024];
528 static int log_level_unknown = 1; 637 static int log_level_unknown = 1;
529 638
639 boot_delay_msec();
640
530 preempt_disable(); 641 preempt_disable();
531 if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) 642 if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
532 /* If a crash is occurring during printk() on this CPU, 643 /* If a crash is occurring during printk() on this CPU,
@@ -751,7 +862,16 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
751 return -1; 862 return -1;
752} 863}
753 864
754#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND 865int console_suspend_enabled = 1;
866EXPORT_SYMBOL(console_suspend_enabled);
867
868static int __init console_suspend_disable(char *str)
869{
870 console_suspend_enabled = 0;
871 return 1;
872}
873__setup("no_console_suspend", console_suspend_disable);
874
755/** 875/**
756 * suspend_console - suspend the console subsystem 876 * suspend_console - suspend the console subsystem
757 * 877 *
@@ -759,6 +879,8 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
759 */ 879 */
760void suspend_console(void) 880void suspend_console(void)
761{ 881{
882 if (!console_suspend_enabled)
883 return;
762 printk("Suspending console(s)\n"); 884 printk("Suspending console(s)\n");
763 acquire_console_sem(); 885 acquire_console_sem();
764 console_suspended = 1; 886 console_suspended = 1;
@@ -766,10 +888,11 @@ void suspend_console(void)
766 888
767void resume_console(void) 889void resume_console(void)
768{ 890{
891 if (!console_suspend_enabled)
892 return;
769 console_suspended = 0; 893 console_suspended = 0;
770 release_console_sem(); 894 release_console_sem();
771} 895}
772#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */
773 896
774/** 897/**
775 * acquire_console_sem - lock the console system for exclusive use. 898 * acquire_console_sem - lock the console system for exclusive use.
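The hunks above replace the compile-time CONFIG_DISABLE_CONSOLE_SUSPEND switch with a runtime flag: booting with "no_console_suspend" clears console_suspend_enabled and turns suspend_console()/resume_console() into no-ops. A hedged sketch of how a suspend path sees this (the caller below is hypothetical):

/* Hypothetical PM path: the helpers are called unconditionally,
 * the gating now happens inside them. */
static void sketch_enter_suspend(void)
{
	suspend_console();	/* returns early when console_suspend_enabled == 0 */
	/* ... power down devices, enter the sleep state ... */
	resume_console();	/* likewise a no-op when suspend was skipped */
}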
diff --git a/kernel/profile.c b/kernel/profile.c
index cb1e37d2dac3..631b75c25d7e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -37,7 +37,7 @@ struct profile_hit {
37#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) 37#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
38 38
39/* Oprofile timer tick hook */ 39/* Oprofile timer tick hook */
40int (*timer_hook)(struct pt_regs *) __read_mostly; 40static int (*timer_hook)(struct pt_regs *) __read_mostly;
41 41
42static atomic_t *prof_buffer; 42static atomic_t *prof_buffer;
43static unsigned long prof_len, prof_shift; 43static unsigned long prof_len, prof_shift;
@@ -346,7 +346,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
346 per_cpu(cpu_profile_flip, cpu) = 0; 346 per_cpu(cpu_profile_flip, cpu) = 0;
347 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 347 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
348 page = alloc_pages_node(node, 348 page = alloc_pages_node(node,
349 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 349 GFP_KERNEL | __GFP_ZERO,
350 0); 350 0);
351 if (!page) 351 if (!page)
352 return NOTIFY_BAD; 352 return NOTIFY_BAD;
@@ -354,7 +354,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
354 } 354 }
355 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 355 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
356 page = alloc_pages_node(node, 356 page = alloc_pages_node(node,
357 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 357 GFP_KERNEL | __GFP_ZERO,
358 0); 358 0);
359 if (!page) 359 if (!page)
360 goto out_free; 360 goto out_free;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 3eca7a55f2ee..7c76f2ffaeaa 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -19,6 +19,7 @@
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/signal.h> 20#include <linux/signal.h>
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/pid_namespace.h>
22 23
23#include <asm/pgtable.h> 24#include <asm/pgtable.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
@@ -168,7 +169,7 @@ int ptrace_attach(struct task_struct *task)
168 retval = -EPERM; 169 retval = -EPERM;
169 if (task->pid <= 1) 170 if (task->pid <= 1)
170 goto out; 171 goto out;
171 if (task->tgid == current->tgid) 172 if (same_thread_group(task, current))
172 goto out; 173 goto out;
173 174
174repeat: 175repeat:
@@ -386,6 +387,9 @@ int ptrace_request(struct task_struct *child, long request,
386 case PTRACE_SETSIGINFO: 387 case PTRACE_SETSIGINFO:
387 ret = ptrace_setsiginfo(child, (siginfo_t __user *) data); 388 ret = ptrace_setsiginfo(child, (siginfo_t __user *) data);
388 break; 389 break;
390 case PTRACE_DETACH: /* detach a process that was attached. */
391 ret = ptrace_detach(child, data);
392 break;
389 default: 393 default:
390 break; 394 break;
391 } 395 }
@@ -440,7 +444,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
440 return ERR_PTR(-EPERM); 444 return ERR_PTR(-EPERM);
441 445
442 read_lock(&tasklist_lock); 446 read_lock(&tasklist_lock);
443 child = find_task_by_pid(pid); 447 child = find_task_by_vpid(pid);
444 if (child) 448 if (child)
445 get_task_struct(child); 449 get_task_struct(child);
446 450
@@ -450,6 +454,10 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
450 return child; 454 return child;
451} 455}
452 456
457#ifndef arch_ptrace_attach
458#define arch_ptrace_attach(child) do { } while (0)
459#endif
460
453#ifndef __ARCH_SYS_PTRACE 461#ifndef __ARCH_SYS_PTRACE
454asmlinkage long sys_ptrace(long request, long pid, long addr, long data) 462asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
455{ 463{
@@ -473,6 +481,12 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
473 481
474 if (request == PTRACE_ATTACH) { 482 if (request == PTRACE_ATTACH) {
475 ret = ptrace_attach(child); 483 ret = ptrace_attach(child);
484 /*
485 * Some architectures need to do book-keeping after
486 * a ptrace attach.
487 */
488 if (!ret)
489 arch_ptrace_attach(child);
476 goto out_put_task_struct; 490 goto out_put_task_struct;
477 } 491 }
478 492
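arch_ptrace_attach() is introduced above as a per-architecture hook that defaults to a no-op and runs only after a successful PTRACE_ATTACH. A hedged sketch of what an architecture override might look like (the names and the flushing detail are assumptions, not taken from this patch):

/* Hypothetical <asm/ptrace.h> snippet overriding the default no-op. */
static inline void sketch_sync_debug_state(struct task_struct *child)
{
	/* e.g. flush or synchronize per-task hardware debug state so the
	 * freshly attached tracer sees consistent register contents */
}
#define arch_ptrace_attach(child) sketch_sync_debug_state(child)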
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2c2dd8410dc4..a66d4d1615f7 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,10 +45,17 @@
45#include <linux/moduleparam.h> 45#include <linux/moduleparam.h>
46#include <linux/percpu.h> 46#include <linux/percpu.h>
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/rcupdate.h>
49#include <linux/cpu.h> 48#include <linux/cpu.h>
50#include <linux/mutex.h> 49#include <linux/mutex.h>
51 50
51#ifdef CONFIG_DEBUG_LOCK_ALLOC
52static struct lock_class_key rcu_lock_key;
53struct lockdep_map rcu_lock_map =
54 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
55
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
52/* Definition for rcupdate control block. */ 59/* Definition for rcupdate control block. */
53static struct rcu_ctrlblk rcu_ctrlblk = { 60static struct rcu_ctrlblk rcu_ctrlblk = {
54 .cur = -300, 61 .cur = -300,
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index ddff33247785..c3e165c2318f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -35,14 +35,12 @@
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <asm/atomic.h> 36#include <asm/atomic.h>
37#include <linux/bitops.h> 37#include <linux/bitops.h>
38#include <linux/module.h>
39#include <linux/completion.h> 38#include <linux/completion.h>
40#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
41#include <linux/percpu.h> 40#include <linux/percpu.h>
42#include <linux/notifier.h> 41#include <linux/notifier.h>
43#include <linux/freezer.h> 42#include <linux/freezer.h>
44#include <linux/cpu.h> 43#include <linux/cpu.h>
45#include <linux/random.h>
46#include <linux/delay.h> 44#include <linux/delay.h>
47#include <linux/byteorder/swabb.h> 45#include <linux/byteorder/swabb.h>
48#include <linux/stat.h> 46#include <linux/stat.h>
@@ -166,16 +164,14 @@ struct rcu_random_state {
166 164
167/* 165/*
168 * Crude but fast random-number generator. Uses a linear congruential 166 * Crude but fast random-number generator. Uses a linear congruential
169 * generator, with occasional help from get_random_bytes(). 167 * generator, with occasional help from cpu_clock().
170 */ 168 */
171static unsigned long 169static unsigned long
172rcu_random(struct rcu_random_state *rrsp) 170rcu_random(struct rcu_random_state *rrsp)
173{ 171{
174 long refresh;
175
176 if (--rrsp->rrs_count < 0) { 172 if (--rrsp->rrs_count < 0) {
177 get_random_bytes(&refresh, sizeof(refresh)); 173 rrsp->rrs_state +=
178 rrsp->rrs_state += refresh; 174 (unsigned long)cpu_clock(raw_smp_processor_id());
179 rrsp->rrs_count = RCU_RANDOM_REFRESH; 175 rrsp->rrs_count = RCU_RANDOM_REFRESH;
180 } 176 }
181 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; 177 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
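rcu_random() above keeps its linear congruential generator but reseeds it from cpu_clock() instead of get_random_bytes(), which is cheaper inside the torture loop. A self-contained sketch of the same scheme; the constants are hypothetical, since RCU_RANDOM_MULT/ADD/REFRESH are defined elsewhere in this file and not shown in the hunk:

/* Sketch: LCG that folds in a cheap clock sample every REFRESH draws. */
struct sketch_rnd_state {
	unsigned long state;
	int count;
};

static unsigned long sketch_rnd(struct sketch_rnd_state *s,
				unsigned long long clock_now)
{
	const unsigned long MULT = 39916801;	/* illustrative constants */
	const unsigned long ADD = 479001701;
	const int REFRESH = 10000;

	if (--s->count < 0) {
		s->state += (unsigned long)clock_now;	/* cheap reseed */
		s->count = REFRESH;
	}
	s->state = s->state * MULT + ADD;
	return s->state;
}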
diff --git a/kernel/relay.c b/kernel/relay.c
index ad855017bc59..61134eb7a0c8 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -370,7 +370,7 @@ void relay_reset(struct rchan *chan)
370 if (!chan) 370 if (!chan)
371 return; 371 return;
372 372
373 if (chan->is_global && chan->buf[0]) { 373 if (chan->is_global && chan->buf[0]) {
374 __relay_reset(chan->buf[0], 0); 374 __relay_reset(chan->buf[0], 0);
375 return; 375 return;
376 } 376 }
@@ -850,13 +850,13 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
850 buf->subbufs_consumed = consumed; 850 buf->subbufs_consumed = consumed;
851 buf->bytes_consumed = 0; 851 buf->bytes_consumed = 0;
852 } 852 }
853 853
854 produced = (produced % n_subbufs) * subbuf_size + buf->offset; 854 produced = (produced % n_subbufs) * subbuf_size + buf->offset;
855 consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed; 855 consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
856 856
857 if (consumed > produced) 857 if (consumed > produced)
858 produced += n_subbufs * subbuf_size; 858 produced += n_subbufs * subbuf_size;
859 859
860 if (consumed == produced) 860 if (consumed == produced)
861 return 0; 861 return 0;
862 862
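The context above maps sub-buffer counters back to byte positions to decide whether a reader has data available. A worked sketch of that arithmetic with generic names (this helper is illustrative, not part of relay.c):

/* Sketch: bytes available to read in a ring of n_subbufs * subbuf_size. */
static size_t sketch_relay_avail(size_t produced, size_t consumed,
				 size_t n_subbufs, size_t subbuf_size,
				 size_t prod_offset, size_t cons_bytes)
{
	size_t p = (produced % n_subbufs) * subbuf_size + prod_offset;
	size_t c = (consumed % n_subbufs) * subbuf_size + cons_bytes;

	if (c > p)				/* producer has wrapped around */
		p += n_subbufs * subbuf_size;
	return p - c;				/* 0 means nothing to read yet */
}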
diff --git a/kernel/resource.c b/kernel/resource.c
index 9bd14fd3e6de..a358142ff48f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -234,7 +234,7 @@ EXPORT_SYMBOL(release_resource);
234 * the caller must specify res->start, res->end, res->flags. 234 * the caller must specify res->start, res->end, res->flags.
235 * If found, returns 0, res is overwritten, if not found, returns -1. 235 * If found, returns 0, res is overwritten, if not found, returns -1.
236 */ 236 */
237int find_next_system_ram(struct resource *res) 237static int find_next_system_ram(struct resource *res)
238{ 238{
239 resource_size_t start, end; 239 resource_size_t start, end;
240 struct resource *p; 240 struct resource *p;
@@ -267,6 +267,30 @@ int find_next_system_ram(struct resource *res)
267 res->end = p->end; 267 res->end = p->end;
268 return 0; 268 return 0;
269} 269}
270int
271walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
272 int (*func)(unsigned long, unsigned long, void *))
273{
274 struct resource res;
275 unsigned long pfn, len;
276 u64 orig_end;
277 int ret = -1;
278 res.start = (u64) start_pfn << PAGE_SHIFT;
279 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
280 res.flags = IORESOURCE_MEM;
281 orig_end = res.end;
282 while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
283 pfn = (unsigned long)(res.start >> PAGE_SHIFT);
284 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT);
285 ret = (*func)(pfn, len, arg);
286 if (ret)
287 break;
288 res.start = res.end + 1;
289 res.end = orig_end;
290 }
291 return ret;
292}
293
270#endif 294#endif
271 295
272/* 296/*
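walk_memory_resource() added above walks the System RAM resources overlapping a PFN range and calls func() once per contiguous chunk, stopping at the first non-zero return. A minimal sketch of a hypothetical caller (callback and argument are illustrative):

/* Hypothetical callback: count how many RAM pages fall inside the range. */
static int sketch_count_ram(unsigned long start_pfn, unsigned long nr_pages,
			    void *arg)
{
	unsigned long *total = arg;

	*total += nr_pages;
	return 0;			/* non-zero would abort the walk */
}

static unsigned long sketch_ram_pages_in(unsigned long start_pfn,
					 unsigned long nr_pages)
{
	unsigned long total = 0;

	walk_memory_resource(start_pfn, nr_pages, &total, sketch_count_ram);
	return total;
}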
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 5aedbee014df..56d73cb8826d 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -82,17 +82,12 @@ do { \
82 * into the tracing code when doing error printk or 82 * into the tracing code when doing error printk or
83 * executing a BUG(): 83 * executing a BUG():
84 */ 84 */
85int rt_trace_on = 1; 85static int rt_trace_on = 1;
86
87void deadlock_trace_off(void)
88{
89 rt_trace_on = 0;
90}
91 86
92static void printk_task(struct task_struct *p) 87static void printk_task(struct task_struct *p)
93{ 88{
94 if (p) 89 if (p)
95 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); 90 printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
96 else 91 else
97 printk("<none>"); 92 printk("<none>");
98} 93}
@@ -157,22 +152,25 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
157 printk( "[ BUG: circular locking deadlock detected! ]\n"); 152 printk( "[ BUG: circular locking deadlock detected! ]\n");
158 printk( "--------------------------------------------\n"); 153 printk( "--------------------------------------------\n");
159 printk("%s/%d is deadlocking current task %s/%d\n\n", 154 printk("%s/%d is deadlocking current task %s/%d\n\n",
160 task->comm, task->pid, current->comm, current->pid); 155 task->comm, task_pid_nr(task),
156 current->comm, task_pid_nr(current));
161 157
162 printk("\n1) %s/%d is trying to acquire this lock:\n", 158 printk("\n1) %s/%d is trying to acquire this lock:\n",
163 current->comm, current->pid); 159 current->comm, task_pid_nr(current));
164 printk_lock(waiter->lock, 1); 160 printk_lock(waiter->lock, 1);
165 161
166 printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); 162 printk("\n2) %s/%d is blocked on this lock:\n",
163 task->comm, task_pid_nr(task));
167 printk_lock(waiter->deadlock_lock, 1); 164 printk_lock(waiter->deadlock_lock, 1);
168 165
169 debug_show_held_locks(current); 166 debug_show_held_locks(current);
170 debug_show_held_locks(task); 167 debug_show_held_locks(task);
171 168
172 printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); 169 printk("\n%s/%d's [blocked] stackdump:\n\n",
170 task->comm, task_pid_nr(task));
173 show_stack(task, NULL); 171 show_stack(task, NULL);
174 printk("\n%s/%d's [current] stackdump:\n\n", 172 printk("\n%s/%d's [current] stackdump:\n\n",
175 current->comm, current->pid); 173 current->comm, task_pid_nr(current));
176 dump_stack(); 174 dump_stack();
177 debug_show_all_locks(); 175 debug_show_all_locks();
178 176
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 8cd9bd2cdb34..0deef71ff8d2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -185,7 +185,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
185 prev_max = max_lock_depth; 185 prev_max = max_lock_depth;
186 printk(KERN_WARNING "Maximum lock depth %d reached " 186 printk(KERN_WARNING "Maximum lock depth %d reached "
187 "task: %s (%d)\n", max_lock_depth, 187 "task: %s (%d)\n", max_lock_depth,
188 top_task->comm, top_task->pid); 188 top_task->comm, task_pid_nr(top_task));
189 } 189 }
190 put_task_struct(task); 190 put_task_struct(task);
191 191
diff --git a/kernel/sched.c b/kernel/sched.c
index 6107a0cd6325..afe76ec2e7fe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -44,6 +44,7 @@
44#include <linux/vmalloc.h> 44#include <linux/vmalloc.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/delay.h> 46#include <linux/delay.h>
47#include <linux/pid_namespace.h>
47#include <linux/smp.h> 48#include <linux/smp.h>
48#include <linux/threads.h> 49#include <linux/threads.h>
49#include <linux/timer.h> 50#include <linux/timer.h>
@@ -51,6 +52,7 @@
51#include <linux/cpu.h> 52#include <linux/cpu.h>
52#include <linux/cpuset.h> 53#include <linux/cpuset.h>
53#include <linux/percpu.h> 54#include <linux/percpu.h>
55#include <linux/cpu_acct.h>
54#include <linux/kthread.h> 56#include <linux/kthread.h>
55#include <linux/seq_file.h> 57#include <linux/seq_file.h>
56#include <linux/sysctl.h> 58#include <linux/sysctl.h>
@@ -61,6 +63,7 @@
61#include <linux/delayacct.h> 63#include <linux/delayacct.h>
62#include <linux/reciprocal_div.h> 64#include <linux/reciprocal_div.h>
63#include <linux/unistd.h> 65#include <linux/unistd.h>
66#include <linux/pagemap.h>
64 67
65#include <asm/tlb.h> 68#include <asm/tlb.h>
66 69
@@ -95,7 +98,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
95/* 98/*
96 * Some helpers for converting nanosecond timing to jiffy resolution 99 * Some helpers for converting nanosecond timing to jiffy resolution
97 */ 100 */
98#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 101#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ))
99#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 102#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
100 103
101#define NICE_0_LOAD SCHED_LOAD_SCALE 104#define NICE_0_LOAD SCHED_LOAD_SCALE
@@ -104,11 +107,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
104/* 107/*
105 * These are the 'tuning knobs' of the scheduler: 108 * These are the 'tuning knobs' of the scheduler:
106 * 109 *
107 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), 110 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
108 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
109 * Timeslices get refilled after they expire. 111 * Timeslices get refilled after they expire.
110 */ 112 */
111#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
112#define DEF_TIMESLICE (100 * HZ / 1000) 113#define DEF_TIMESLICE (100 * HZ / 1000)
113 114
114#ifdef CONFIG_SMP 115#ifdef CONFIG_SMP
@@ -132,24 +133,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
132} 133}
133#endif 134#endif
134 135
135#define SCALE_PRIO(x, prio) \
136 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
137
138/*
139 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
140 * to time slice values: [800ms ... 100ms ... 5ms]
141 */
142static unsigned int static_prio_timeslice(int static_prio)
143{
144 if (static_prio == NICE_TO_PRIO(19))
145 return 1;
146
147 if (static_prio < NICE_TO_PRIO(0))
148 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
149 else
150 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
151}
152
153static inline int rt_policy(int policy) 136static inline int rt_policy(int policy)
154{ 137{
155 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) 138 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
@@ -170,31 +153,99 @@ struct rt_prio_array {
170 struct list_head queue[MAX_RT_PRIO]; 153 struct list_head queue[MAX_RT_PRIO];
171}; 154};
172 155
173struct load_stat { 156#ifdef CONFIG_FAIR_GROUP_SCHED
174 struct load_weight load; 157
175 u64 load_update_start, load_update_last; 158#include <linux/cgroup.h>
176 unsigned long delta_fair, delta_exec, delta_stat; 159
160struct cfs_rq;
161
162/* task group related information */
163struct task_group {
164#ifdef CONFIG_FAIR_CGROUP_SCHED
165 struct cgroup_subsys_state css;
166#endif
167 /* schedulable entities of this group on each cpu */
168 struct sched_entity **se;
169 /* runqueue "owned" by this group on each cpu */
170 struct cfs_rq **cfs_rq;
171 unsigned long shares;
172 /* spinlock to serialize modification to shares */
173 spinlock_t lock;
174};
175
176/* Default task group's sched entity on each cpu */
177static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
178/* Default task group's cfs_rq on each cpu */
179static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
180
181static struct sched_entity *init_sched_entity_p[NR_CPUS];
182static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
183
184/* Default task group.
 185 * Every task in the system belongs to this group at bootup.
186 */
187struct task_group init_task_group = {
188 .se = init_sched_entity_p,
189 .cfs_rq = init_cfs_rq_p,
177}; 190};
178 191
192#ifdef CONFIG_FAIR_USER_SCHED
193# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD
194#else
195# define INIT_TASK_GRP_LOAD NICE_0_LOAD
196#endif
197
198static int init_task_group_load = INIT_TASK_GRP_LOAD;
199
200/* return group to which a task belongs */
201static inline struct task_group *task_group(struct task_struct *p)
202{
203 struct task_group *tg;
204
205#ifdef CONFIG_FAIR_USER_SCHED
206 tg = p->user->tg;
207#elif defined(CONFIG_FAIR_CGROUP_SCHED)
208 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
209 struct task_group, css);
210#else
211 tg = &init_task_group;
212#endif
213
214 return tg;
215}
216
217/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
218static inline void set_task_cfs_rq(struct task_struct *p)
219{
220 p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)];
221 p->se.parent = task_group(p)->se[task_cpu(p)];
222}
223
224#else
225
226static inline void set_task_cfs_rq(struct task_struct *p) { }
227
228#endif /* CONFIG_FAIR_GROUP_SCHED */
229
179/* CFS-related fields in a runqueue */ 230/* CFS-related fields in a runqueue */
180struct cfs_rq { 231struct cfs_rq {
181 struct load_weight load; 232 struct load_weight load;
182 unsigned long nr_running; 233 unsigned long nr_running;
183 234
184 s64 fair_clock;
185 u64 exec_clock; 235 u64 exec_clock;
186 s64 wait_runtime; 236 u64 min_vruntime;
187 u64 sleeper_bonus;
188 unsigned long wait_runtime_overruns, wait_runtime_underruns;
189 237
190 struct rb_root tasks_timeline; 238 struct rb_root tasks_timeline;
191 struct rb_node *rb_leftmost; 239 struct rb_node *rb_leftmost;
192 struct rb_node *rb_load_balance_curr; 240 struct rb_node *rb_load_balance_curr;
193#ifdef CONFIG_FAIR_GROUP_SCHED
194 /* 'curr' points to currently running entity on this cfs_rq. 241 /* 'curr' points to currently running entity on this cfs_rq.
 195 * It is set to NULL otherwise (i.e. when none are currently running). 242 * It is set to NULL otherwise (i.e. when none are currently running).
196 */ 243 */
197 struct sched_entity *curr; 244 struct sched_entity *curr;
245
246 unsigned long nr_spread_over;
247
248#ifdef CONFIG_FAIR_GROUP_SCHED
198 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 249 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
199 250
200 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 251 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
@@ -205,6 +256,8 @@ struct cfs_rq {
205 * list is used during load balance. 256 * list is used during load balance.
206 */ 257 */
207 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ 258 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
259 struct task_group *tg; /* group that "owns" this runqueue */
260 struct rcu_head rcu;
208#endif 261#endif
209}; 262};
210 263
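With CONFIG_FAIR_GROUP_SCHED each task group now owns one sched_entity and one cfs_rq per CPU, and task_group() maps a task to its group through p->user, the cgroup subsystem state, or init_task_group. A hedged sketch of how the per-CPU arrays are typically traversed (the helper is hypothetical and ignores locking):

/* Hypothetical helper: sum a group's queued load across online CPUs. */
static unsigned long sketch_group_load(struct task_group *tg)
{
	unsigned long total = 0;
	int cpu;

	for_each_online_cpu(cpu)
		total += tg->cfs_rq[cpu]->load.weight;	/* this CPU's slice */

	return total;
}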
@@ -223,7 +276,8 @@ struct rt_rq {
223 * acquire operations must be ordered by ascending &runqueue. 276 * acquire operations must be ordered by ascending &runqueue.
224 */ 277 */
225struct rq { 278struct rq {
226 spinlock_t lock; /* runqueue lock */ 279 /* runqueue lock: */
280 spinlock_t lock;
227 281
228 /* 282 /*
229 * nr_running and cpu_load should be in the same cacheline because 283 * nr_running and cpu_load should be in the same cacheline because
@@ -236,13 +290,15 @@ struct rq {
236#ifdef CONFIG_NO_HZ 290#ifdef CONFIG_NO_HZ
237 unsigned char in_nohz_recently; 291 unsigned char in_nohz_recently;
238#endif 292#endif
239 struct load_stat ls; /* capture load from *all* tasks on this cpu */ 293 /* capture load from *all* tasks on this cpu: */
294 struct load_weight load;
240 unsigned long nr_load_updates; 295 unsigned long nr_load_updates;
241 u64 nr_switches; 296 u64 nr_switches;
242 297
243 struct cfs_rq cfs; 298 struct cfs_rq cfs;
244#ifdef CONFIG_FAIR_GROUP_SCHED 299#ifdef CONFIG_FAIR_GROUP_SCHED
245 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */ 300 /* list of leaf cfs_rq on this cpu: */
301 struct list_head leaf_cfs_rq_list;
246#endif 302#endif
247 struct rt_rq rt; 303 struct rt_rq rt;
248 304
@@ -274,7 +330,8 @@ struct rq {
274 /* For active balancing */ 330 /* For active balancing */
275 int active_balance; 331 int active_balance;
276 int push_cpu; 332 int push_cpu;
277 int cpu; /* cpu of this runqueue */ 333 /* cpu of this runqueue: */
334 int cpu;
278 335
279 struct task_struct *migration_thread; 336 struct task_struct *migration_thread;
280 struct list_head migration_queue; 337 struct list_head migration_queue;
@@ -285,19 +342,22 @@ struct rq {
285 struct sched_info rq_sched_info; 342 struct sched_info rq_sched_info;
286 343
287 /* sys_sched_yield() stats */ 344 /* sys_sched_yield() stats */
288 unsigned long yld_exp_empty; 345 unsigned int yld_exp_empty;
289 unsigned long yld_act_empty; 346 unsigned int yld_act_empty;
290 unsigned long yld_both_empty; 347 unsigned int yld_both_empty;
291 unsigned long yld_cnt; 348 unsigned int yld_count;
292 349
293 /* schedule() stats */ 350 /* schedule() stats */
294 unsigned long sched_switch; 351 unsigned int sched_switch;
295 unsigned long sched_cnt; 352 unsigned int sched_count;
296 unsigned long sched_goidle; 353 unsigned int sched_goidle;
297 354
298 /* try_to_wake_up() stats */ 355 /* try_to_wake_up() stats */
299 unsigned long ttwu_cnt; 356 unsigned int ttwu_count;
300 unsigned long ttwu_local; 357 unsigned int ttwu_local;
358
359 /* BKL stats */
360 unsigned int bkl_count;
301#endif 361#endif
302 struct lock_class_key rq_lock_key; 362 struct lock_class_key rq_lock_key;
303}; 363};
@@ -382,6 +442,37 @@ static void update_rq_clock(struct rq *rq)
382#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 442#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
383 443
384/* 444/*
445 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
446 */
447#ifdef CONFIG_SCHED_DEBUG
448# define const_debug __read_mostly
449#else
450# define const_debug static const
451#endif
452
453/*
454 * Debugging: various feature bits
455 */
456enum {
457 SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
458 SCHED_FEAT_START_DEBIT = 2,
459 SCHED_FEAT_TREE_AVG = 4,
460 SCHED_FEAT_APPROX_AVG = 8,
461 SCHED_FEAT_WAKEUP_PREEMPT = 16,
462 SCHED_FEAT_PREEMPT_RESTRICT = 32,
463};
464
465const_debug unsigned int sysctl_sched_features =
466 SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
467 SCHED_FEAT_START_DEBIT * 1 |
468 SCHED_FEAT_TREE_AVG * 0 |
469 SCHED_FEAT_APPROX_AVG * 0 |
470 SCHED_FEAT_WAKEUP_PREEMPT * 1 |
471 SCHED_FEAT_PREEMPT_RESTRICT * 1;
472
473#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
474
475/*
385 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 476 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
386 * clock constructed from sched_clock(): 477 * clock constructed from sched_clock():
387 */ 478 */
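sysctl_sched_features above packs the debugging feature bits, and sched_feat(x) tests a single bit; with CONFIG_SCHED_DEBUG the word stays __read_mostly and tunable, otherwise const_debug folds it into a constant the compiler can eliminate. A sketch of the intended call pattern (the wrapper function is hypothetical):

/* Hypothetical use: gate an optional wakeup-preemption check on a bit. */
static void sketch_maybe_preempt(struct rq *rq, struct task_struct *p)
{
	if (!sched_feat(WAKEUP_PREEMPT))
		return;			/* feature switched off */

	check_preempt_curr(rq, p);
}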
@@ -399,18 +490,7 @@ unsigned long long cpu_clock(int cpu)
399 490
400 return now; 491 return now;
401} 492}
402 493EXPORT_SYMBOL_GPL(cpu_clock);
403#ifdef CONFIG_FAIR_GROUP_SCHED
404/* Change a task's ->cfs_rq if it moves across CPUs */
405static inline void set_task_cfs_rq(struct task_struct *p)
406{
407 p->se.cfs_rq = &task_rq(p)->cfs;
408}
409#else
410static inline void set_task_cfs_rq(struct task_struct *p)
411{
412}
413#endif
414 494
415#ifndef prepare_arch_switch 495#ifndef prepare_arch_switch
416# define prepare_arch_switch(next) do { } while (0) 496# define prepare_arch_switch(next) do { } while (0)
@@ -496,16 +576,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
496static inline struct rq *__task_rq_lock(struct task_struct *p) 576static inline struct rq *__task_rq_lock(struct task_struct *p)
497 __acquires(rq->lock) 577 __acquires(rq->lock)
498{ 578{
499 struct rq *rq; 579 for (;;) {
500 580 struct rq *rq = task_rq(p);
501repeat_lock_task: 581 spin_lock(&rq->lock);
502 rq = task_rq(p); 582 if (likely(rq == task_rq(p)))
503 spin_lock(&rq->lock); 583 return rq;
504 if (unlikely(rq != task_rq(p))) {
505 spin_unlock(&rq->lock); 584 spin_unlock(&rq->lock);
506 goto repeat_lock_task;
507 } 585 }
508 return rq;
509} 586}
510 587
511/* 588/*
@@ -518,18 +595,17 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
518{ 595{
519 struct rq *rq; 596 struct rq *rq;
520 597
521repeat_lock_task: 598 for (;;) {
522 local_irq_save(*flags); 599 local_irq_save(*flags);
523 rq = task_rq(p); 600 rq = task_rq(p);
524 spin_lock(&rq->lock); 601 spin_lock(&rq->lock);
525 if (unlikely(rq != task_rq(p))) { 602 if (likely(rq == task_rq(p)))
603 return rq;
526 spin_unlock_irqrestore(&rq->lock, *flags); 604 spin_unlock_irqrestore(&rq->lock, *flags);
527 goto repeat_lock_task;
528 } 605 }
529 return rq;
530} 606}
531 607
532static inline void __task_rq_unlock(struct rq *rq) 608static void __task_rq_unlock(struct rq *rq)
533 __releases(rq->lock) 609 __releases(rq->lock)
534{ 610{
535 spin_unlock(&rq->lock); 611 spin_unlock(&rq->lock);
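Both __task_rq_lock() and task_rq_lock() are rewritten above as a lock-then-revalidate loop: sample the task's runqueue without the lock, take the lock, and re-check, because the task may have migrated in between. The same idiom in a generic, hedged sketch (names are illustrative):

/* Sketch of the lock-and-revalidate loop used above. */
struct sketch_obj {
	spinlock_t *home;		/* may change while unlocked */
};

static spinlock_t *sketch_lock_home(struct sketch_obj *obj)
{
	for (;;) {
		spinlock_t *lock = obj->home;	/* unlocked snapshot */

		spin_lock(lock);
		if (likely(lock == obj->home))
			return lock;		/* still valid: keep it held */
		spin_unlock(lock);		/* it moved: retry */
	}
}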
@@ -544,7 +620,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
544/* 620/*
545 * this_rq_lock - lock this runqueue and disable interrupts. 621 * this_rq_lock - lock this runqueue and disable interrupts.
546 */ 622 */
547static inline struct rq *this_rq_lock(void) 623static struct rq *this_rq_lock(void)
548 __acquires(rq->lock) 624 __acquires(rq->lock)
549{ 625{
550 struct rq *rq; 626 struct rq *rq;
@@ -644,19 +720,6 @@ static inline void resched_task(struct task_struct *p)
644} 720}
645#endif 721#endif
646 722
647static u64 div64_likely32(u64 divident, unsigned long divisor)
648{
649#if BITS_PER_LONG == 32
650 if (likely(divident <= 0xffffffffULL))
651 return (u32)divident / divisor;
652 do_div(divident, divisor);
653
654 return divident;
655#else
656 return divident / divisor;
657#endif
658}
659
660#if BITS_PER_LONG == 32 723#if BITS_PER_LONG == 32
661# define WMULT_CONST (~0UL) 724# define WMULT_CONST (~0UL)
662#else 725#else
@@ -698,16 +761,14 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
698 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); 761 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
699} 762}
700 763
701static void update_load_add(struct load_weight *lw, unsigned long inc) 764static inline void update_load_add(struct load_weight *lw, unsigned long inc)
702{ 765{
703 lw->weight += inc; 766 lw->weight += inc;
704 lw->inv_weight = 0;
705} 767}
706 768
707static void update_load_sub(struct load_weight *lw, unsigned long dec) 769static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
708{ 770{
709 lw->weight -= dec; 771 lw->weight -= dec;
710 lw->inv_weight = 0;
711} 772}
712 773
713/* 774/*
@@ -783,29 +844,20 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
783 int *this_best_prio, struct rq_iterator *iterator); 844 int *this_best_prio, struct rq_iterator *iterator);
784 845
785#include "sched_stats.h" 846#include "sched_stats.h"
786#include "sched_rt.c"
787#include "sched_fair.c"
788#include "sched_idletask.c" 847#include "sched_idletask.c"
848#include "sched_fair.c"
849#include "sched_rt.c"
789#ifdef CONFIG_SCHED_DEBUG 850#ifdef CONFIG_SCHED_DEBUG
790# include "sched_debug.c" 851# include "sched_debug.c"
791#endif 852#endif
792 853
793#define sched_class_highest (&rt_sched_class) 854#define sched_class_highest (&rt_sched_class)
794 855
795static void __update_curr_load(struct rq *rq, struct load_stat *ls)
796{
797 if (rq->curr != rq->idle && ls->load.weight) {
798 ls->delta_exec += ls->delta_stat;
799 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
800 ls->delta_stat = 0;
801 }
802}
803
804/* 856/*
805 * Update delta_exec, delta_fair fields for rq. 857 * Update delta_exec, delta_fair fields for rq.
806 * 858 *
807 * delta_fair clock advances at a rate inversely proportional to 859 * delta_fair clock advances at a rate inversely proportional to
808 * total load (rq->ls.load.weight) on the runqueue, while 860 * total load (rq->load.weight) on the runqueue, while
809 * delta_exec advances at the same rate as wall-clock (provided 861 * delta_exec advances at the same rate as wall-clock (provided
810 * cpu is not idle). 862 * cpu is not idle).
811 * 863 *
@@ -813,35 +865,17 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls)
813 * runqueue over any given interval. This (smoothened) load is used 865 * runqueue over any given interval. This (smoothened) load is used
814 * during load balance. 866 * during load balance.
815 * 867 *
816 * This function is called /before/ updating rq->ls.load 868 * This function is called /before/ updating rq->load
817 * and when switching tasks. 869 * and when switching tasks.
818 */ 870 */
819static void update_curr_load(struct rq *rq)
820{
821 struct load_stat *ls = &rq->ls;
822 u64 start;
823
824 start = ls->load_update_start;
825 ls->load_update_start = rq->clock;
826 ls->delta_stat += rq->clock - start;
827 /*
828 * Stagger updates to ls->delta_fair. Very frequent updates
829 * can be expensive.
830 */
831 if (ls->delta_stat >= sysctl_sched_stat_granularity)
832 __update_curr_load(rq, ls);
833}
834
835static inline void inc_load(struct rq *rq, const struct task_struct *p) 871static inline void inc_load(struct rq *rq, const struct task_struct *p)
836{ 872{
837 update_curr_load(rq); 873 update_load_add(&rq->load, p->se.load.weight);
838 update_load_add(&rq->ls.load, p->se.load.weight);
839} 874}
840 875
841static inline void dec_load(struct rq *rq, const struct task_struct *p) 876static inline void dec_load(struct rq *rq, const struct task_struct *p)
842{ 877{
843 update_curr_load(rq); 878 update_load_sub(&rq->load, p->se.load.weight);
844 update_load_sub(&rq->ls.load, p->se.load.weight);
845} 879}
846 880
847static void inc_nr_running(struct task_struct *p, struct rq *rq) 881static void inc_nr_running(struct task_struct *p, struct rq *rq)
@@ -858,8 +892,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq)
858 892
859static void set_load_weight(struct task_struct *p) 893static void set_load_weight(struct task_struct *p)
860{ 894{
861 p->se.wait_runtime = 0;
862
863 if (task_has_rt_policy(p)) { 895 if (task_has_rt_policy(p)) {
864 p->se.load.weight = prio_to_weight[0] * 2; 896 p->se.load.weight = prio_to_weight[0] * 2;
865 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 897 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
@@ -951,20 +983,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
951} 983}
952 984
953/* 985/*
954 * activate_idle_task - move idle task to the _front_ of runqueue.
955 */
956static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
957{
958 update_rq_clock(rq);
959
960 if (p->state == TASK_UNINTERRUPTIBLE)
961 rq->nr_uninterruptible--;
962
963 enqueue_task(rq, p, 0);
964 inc_nr_running(p, rq);
965}
966
967/*
968 * deactivate_task - remove a task from the runqueue. 986 * deactivate_task - remove a task from the runqueue.
969 */ 987 */
970static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 988static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
@@ -988,32 +1006,50 @@ inline int task_curr(const struct task_struct *p)
988/* Used instead of source_load when we know the type == 0 */ 1006/* Used instead of source_load when we know the type == 0 */
989unsigned long weighted_cpuload(const int cpu) 1007unsigned long weighted_cpuload(const int cpu)
990{ 1008{
991 return cpu_rq(cpu)->ls.load.weight; 1009 return cpu_rq(cpu)->load.weight;
992} 1010}
993 1011
994static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1012static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
995{ 1013{
996#ifdef CONFIG_SMP 1014#ifdef CONFIG_SMP
997 task_thread_info(p)->cpu = cpu; 1015 task_thread_info(p)->cpu = cpu;
998 set_task_cfs_rq(p);
999#endif 1016#endif
1017 set_task_cfs_rq(p);
1000} 1018}
1001 1019
1002#ifdef CONFIG_SMP 1020#ifdef CONFIG_SMP
1003 1021
1022/*
1023 * Is this task likely cache-hot:
1024 */
1025static inline int
1026task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1027{
1028 s64 delta;
1029
1030 if (p->sched_class != &fair_sched_class)
1031 return 0;
1032
1033 if (sysctl_sched_migration_cost == -1)
1034 return 1;
1035 if (sysctl_sched_migration_cost == 0)
1036 return 0;
1037
1038 delta = now - p->se.exec_start;
1039
1040 return delta < (s64)sysctl_sched_migration_cost;
1041}
1042
1043
1004void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1044void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1005{ 1045{
1006 int old_cpu = task_cpu(p); 1046 int old_cpu = task_cpu(p);
1007 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 1047 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1008 u64 clock_offset, fair_clock_offset; 1048 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1049 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1050 u64 clock_offset;
1009 1051
1010 clock_offset = old_rq->clock - new_rq->clock; 1052 clock_offset = old_rq->clock - new_rq->clock;
1011 fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
1012
1013 if (p->se.wait_start_fair)
1014 p->se.wait_start_fair -= fair_clock_offset;
1015 if (p->se.sleep_start_fair)
1016 p->se.sleep_start_fair -= fair_clock_offset;
1017 1053
1018#ifdef CONFIG_SCHEDSTATS 1054#ifdef CONFIG_SCHEDSTATS
1019 if (p->se.wait_start) 1055 if (p->se.wait_start)
@@ -1022,7 +1058,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1022 p->se.sleep_start -= clock_offset; 1058 p->se.sleep_start -= clock_offset;
1023 if (p->se.block_start) 1059 if (p->se.block_start)
1024 p->se.block_start -= clock_offset; 1060 p->se.block_start -= clock_offset;
1061 if (old_cpu != new_cpu) {
1062 schedstat_inc(p, se.nr_migrations);
1063 if (task_hot(p, old_rq->clock, NULL))
1064 schedstat_inc(p, se.nr_forced2_migrations);
1065 }
1025#endif 1066#endif
1067 p->se.vruntime -= old_cfsrq->min_vruntime -
1068 new_cfsrq->min_vruntime;
1026 1069
1027 __set_task_cpu(p, new_cpu); 1070 __set_task_cpu(p, new_cpu);
1028} 1071}
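set_task_cpu() now rebases p->se.vruntime by the difference between the source and destination cfs_rq min_vruntime, so the task keeps its relative position on the new timeline. A worked example with made-up numbers:

/* Worked example (hypothetical values):
 *   old_cfsrq->min_vruntime = 1000000
 *   new_cfsrq->min_vruntime =  400000
 *   p->se.vruntime          = 1000500      (500 ahead of the old minimum)
 *
 *   p->se.vruntime -= 1000000 - 400000;    => 400500
 *
 * The task is still 500 ahead of the minimum, now measured against the
 * destination runqueue's vruntime clock.
 */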
@@ -1077,69 +1120,71 @@ void wait_task_inactive(struct task_struct *p)
1077 int running, on_rq; 1120 int running, on_rq;
1078 struct rq *rq; 1121 struct rq *rq;
1079 1122
1080repeat: 1123 for (;;) {
1081 /* 1124 /*
1082 * We do the initial early heuristics without holding 1125 * We do the initial early heuristics without holding
1083 * any task-queue locks at all. We'll only try to get 1126 * any task-queue locks at all. We'll only try to get
1084 * the runqueue lock when things look like they will 1127 * the runqueue lock when things look like they will
1085 * work out! 1128 * work out!
1086 */ 1129 */
1087 rq = task_rq(p); 1130 rq = task_rq(p);
1088 1131
1089 /* 1132 /*
1090 * If the task is actively running on another CPU 1133 * If the task is actively running on another CPU
1091 * still, just relax and busy-wait without holding 1134 * still, just relax and busy-wait without holding
1092 * any locks. 1135 * any locks.
1093 * 1136 *
1094 * NOTE! Since we don't hold any locks, it's not 1137 * NOTE! Since we don't hold any locks, it's not
1095 * even sure that "rq" stays as the right runqueue! 1138 * even sure that "rq" stays as the right runqueue!
1096 * But we don't care, since "task_running()" will 1139 * But we don't care, since "task_running()" will
1097 * return false if the runqueue has changed and p 1140 * return false if the runqueue has changed and p
1098 * is actually now running somewhere else! 1141 * is actually now running somewhere else!
1099 */ 1142 */
1100 while (task_running(rq, p)) 1143 while (task_running(rq, p))
1101 cpu_relax(); 1144 cpu_relax();
1102 1145
1103 /* 1146 /*
1104 * Ok, time to look more closely! We need the rq 1147 * Ok, time to look more closely! We need the rq
1105 * lock now, to be *sure*. If we're wrong, we'll 1148 * lock now, to be *sure*. If we're wrong, we'll
1106 * just go back and repeat. 1149 * just go back and repeat.
1107 */ 1150 */
1108 rq = task_rq_lock(p, &flags); 1151 rq = task_rq_lock(p, &flags);
1109 running = task_running(rq, p); 1152 running = task_running(rq, p);
1110 on_rq = p->se.on_rq; 1153 on_rq = p->se.on_rq;
1111 task_rq_unlock(rq, &flags); 1154 task_rq_unlock(rq, &flags);
1112 1155
1113 /* 1156 /*
1114 * Was it really running after all now that we 1157 * Was it really running after all now that we
1115 * checked with the proper locks actually held? 1158 * checked with the proper locks actually held?
1116 * 1159 *
1117 * Oops. Go back and try again.. 1160 * Oops. Go back and try again..
1118 */ 1161 */
1119 if (unlikely(running)) { 1162 if (unlikely(running)) {
1120 cpu_relax(); 1163 cpu_relax();
1121 goto repeat; 1164 continue;
1122 } 1165 }
1123 1166
1124 /* 1167 /*
1125 * It's not enough that it's not actively running, 1168 * It's not enough that it's not actively running,
1126 * it must be off the runqueue _entirely_, and not 1169 * it must be off the runqueue _entirely_, and not
1127 * preempted! 1170 * preempted!
1128 * 1171 *
1129 * So if it wa still runnable (but just not actively 1172 * So if it wa still runnable (but just not actively
1130 * running right now), it's preempted, and we should 1173 * running right now), it's preempted, and we should
1131 * yield - it could be a while. 1174 * yield - it could be a while.
1132 */ 1175 */
1133 if (unlikely(on_rq)) { 1176 if (unlikely(on_rq)) {
1134 yield(); 1177 schedule_timeout_uninterruptible(1);
1135 goto repeat; 1178 continue;
1136 } 1179 }
1137 1180
1138 /* 1181 /*
1139 * Ahh, all good. It wasn't running, and it wasn't 1182 * Ahh, all good. It wasn't running, and it wasn't
1140 * runnable, which means that it will never become 1183 * runnable, which means that it will never become
1141 * running in the future either. We're all done! 1184 * running in the future either. We're all done!
1142 */ 1185 */
1186 break;
1187 }
1143} 1188}
1144 1189
1145/*** 1190/***
@@ -1173,7 +1218,7 @@ void kick_process(struct task_struct *p)
1173 * We want to under-estimate the load of migration sources, to 1218 * We want to under-estimate the load of migration sources, to
1174 * balance conservatively. 1219 * balance conservatively.
1175 */ 1220 */
1176static inline unsigned long source_load(int cpu, int type) 1221static unsigned long source_load(int cpu, int type)
1177{ 1222{
1178 struct rq *rq = cpu_rq(cpu); 1223 struct rq *rq = cpu_rq(cpu);
1179 unsigned long total = weighted_cpuload(cpu); 1224 unsigned long total = weighted_cpuload(cpu);
@@ -1188,7 +1233,7 @@ static inline unsigned long source_load(int cpu, int type)
1188 * Return a high guess at the load of a migration-target cpu weighted 1233 * Return a high guess at the load of a migration-target cpu weighted
1189 * according to the scheduling class and "nice" value. 1234 * according to the scheduling class and "nice" value.
1190 */ 1235 */
1191static inline unsigned long target_load(int cpu, int type) 1236static unsigned long target_load(int cpu, int type)
1192{ 1237{
1193 struct rq *rq = cpu_rq(cpu); 1238 struct rq *rq = cpu_rq(cpu);
1194 unsigned long total = weighted_cpuload(cpu); 1239 unsigned long total = weighted_cpuload(cpu);
@@ -1230,7 +1275,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1230 1275
1231 /* Skip over this group if it has no CPUs allowed */ 1276 /* Skip over this group if it has no CPUs allowed */
1232 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 1277 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1233 goto nextgroup; 1278 continue;
1234 1279
1235 local_group = cpu_isset(this_cpu, group->cpumask); 1280 local_group = cpu_isset(this_cpu, group->cpumask);
1236 1281
@@ -1258,9 +1303,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1258 min_load = avg_load; 1303 min_load = avg_load;
1259 idlest = group; 1304 idlest = group;
1260 } 1305 }
1261nextgroup: 1306 } while (group = group->next, group != sd->groups);
1262 group = group->next;
1263 } while (group != sd->groups);
1264 1307
1265 if (!idlest || 100*this_load < imbalance*min_load) 1308 if (!idlest || 100*this_load < imbalance*min_load)
1266 return NULL; 1309 return NULL;
@@ -1392,8 +1435,13 @@ static int wake_idle(int cpu, struct task_struct *p)
1392 if (sd->flags & SD_WAKE_IDLE) { 1435 if (sd->flags & SD_WAKE_IDLE) {
1393 cpus_and(tmp, sd->span, p->cpus_allowed); 1436 cpus_and(tmp, sd->span, p->cpus_allowed);
1394 for_each_cpu_mask(i, tmp) { 1437 for_each_cpu_mask(i, tmp) {
1395 if (idle_cpu(i)) 1438 if (idle_cpu(i)) {
1439 if (i != task_cpu(p)) {
1440 schedstat_inc(p,
1441 se.nr_wakeups_idle);
1442 }
1396 return i; 1443 return i;
1444 }
1397 } 1445 }
1398 } else { 1446 } else {
1399 break; 1447 break;
@@ -1424,7 +1472,7 @@ static inline int wake_idle(int cpu, struct task_struct *p)
1424 */ 1472 */
1425static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 1473static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1426{ 1474{
1427 int cpu, this_cpu, success = 0; 1475 int cpu, orig_cpu, this_cpu, success = 0;
1428 unsigned long flags; 1476 unsigned long flags;
1429 long old_state; 1477 long old_state;
1430 struct rq *rq; 1478 struct rq *rq;
@@ -1443,6 +1491,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1443 goto out_running; 1491 goto out_running;
1444 1492
1445 cpu = task_cpu(p); 1493 cpu = task_cpu(p);
1494 orig_cpu = cpu;
1446 this_cpu = smp_processor_id(); 1495 this_cpu = smp_processor_id();
1447 1496
1448#ifdef CONFIG_SMP 1497#ifdef CONFIG_SMP
@@ -1451,7 +1500,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1451 1500
1452 new_cpu = cpu; 1501 new_cpu = cpu;
1453 1502
1454 schedstat_inc(rq, ttwu_cnt); 1503 schedstat_inc(rq, ttwu_count);
1455 if (cpu == this_cpu) { 1504 if (cpu == this_cpu) {
1456 schedstat_inc(rq, ttwu_local); 1505 schedstat_inc(rq, ttwu_local);
1457 goto out_set_cpu; 1506 goto out_set_cpu;
@@ -1486,6 +1535,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1486 unsigned long tl = this_load; 1535 unsigned long tl = this_load;
1487 unsigned long tl_per_task; 1536 unsigned long tl_per_task;
1488 1537
1538 /*
1539 * Attract cache-cold tasks on sync wakeups:
1540 */
1541 if (sync && !task_hot(p, rq->clock, this_sd))
1542 goto out_set_cpu;
1543
1544 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1489 tl_per_task = cpu_avg_load_per_task(this_cpu); 1545 tl_per_task = cpu_avg_load_per_task(this_cpu);
1490 1546
1491 /* 1547 /*
@@ -1505,6 +1561,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1505 * there is no bad imbalance. 1561 * there is no bad imbalance.
1506 */ 1562 */
1507 schedstat_inc(this_sd, ttwu_move_affine); 1563 schedstat_inc(this_sd, ttwu_move_affine);
1564 schedstat_inc(p, se.nr_wakeups_affine);
1508 goto out_set_cpu; 1565 goto out_set_cpu;
1509 } 1566 }
1510 } 1567 }
@@ -1516,6 +1573,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1516 if (this_sd->flags & SD_WAKE_BALANCE) { 1573 if (this_sd->flags & SD_WAKE_BALANCE) {
1517 if (imbalance*this_load <= 100*load) { 1574 if (imbalance*this_load <= 100*load) {
1518 schedstat_inc(this_sd, ttwu_move_balance); 1575 schedstat_inc(this_sd, ttwu_move_balance);
1576 schedstat_inc(p, se.nr_wakeups_passive);
1519 goto out_set_cpu; 1577 goto out_set_cpu;
1520 } 1578 }
1521 } 1579 }
@@ -1541,18 +1599,18 @@ out_set_cpu:
1541 1599
1542out_activate: 1600out_activate:
1543#endif /* CONFIG_SMP */ 1601#endif /* CONFIG_SMP */
1602 schedstat_inc(p, se.nr_wakeups);
1603 if (sync)
1604 schedstat_inc(p, se.nr_wakeups_sync);
1605 if (orig_cpu != cpu)
1606 schedstat_inc(p, se.nr_wakeups_migrate);
1607 if (cpu == this_cpu)
1608 schedstat_inc(p, se.nr_wakeups_local);
1609 else
1610 schedstat_inc(p, se.nr_wakeups_remote);
1544 update_rq_clock(rq); 1611 update_rq_clock(rq);
1545 activate_task(rq, p, 1); 1612 activate_task(rq, p, 1);
1546 /* 1613 check_preempt_curr(rq, p);
1547 * Sync wakeups (i.e. those types of wakeups where the waker
1548 * has indicated that it will leave the CPU in short order)
1549 * don't trigger a preemption, if the woken up task will run on
1550 * this cpu. (in this case the 'I will reschedule' promise of
1551 * the waker guarantees that the freshly woken up task is going
1552 * to be considered on this CPU.)
1553 */
1554 if (!sync || cpu != this_cpu)
1555 check_preempt_curr(rq, p);
1556 success = 1; 1614 success = 1;
1557 1615
1558out_running: 1616out_running:
@@ -1583,28 +1641,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1583 */ 1641 */
1584static void __sched_fork(struct task_struct *p) 1642static void __sched_fork(struct task_struct *p)
1585{ 1643{
1586 p->se.wait_start_fair = 0;
1587 p->se.exec_start = 0; 1644 p->se.exec_start = 0;
1588 p->se.sum_exec_runtime = 0; 1645 p->se.sum_exec_runtime = 0;
1589 p->se.prev_sum_exec_runtime = 0; 1646 p->se.prev_sum_exec_runtime = 0;
1590 p->se.delta_exec = 0;
1591 p->se.delta_fair_run = 0;
1592 p->se.delta_fair_sleep = 0;
1593 p->se.wait_runtime = 0;
1594 p->se.sleep_start_fair = 0;
1595 1647
1596#ifdef CONFIG_SCHEDSTATS 1648#ifdef CONFIG_SCHEDSTATS
1597 p->se.wait_start = 0; 1649 p->se.wait_start = 0;
1598 p->se.sum_wait_runtime = 0;
1599 p->se.sum_sleep_runtime = 0; 1650 p->se.sum_sleep_runtime = 0;
1600 p->se.sleep_start = 0; 1651 p->se.sleep_start = 0;
1601 p->se.block_start = 0; 1652 p->se.block_start = 0;
1602 p->se.sleep_max = 0; 1653 p->se.sleep_max = 0;
1603 p->se.block_max = 0; 1654 p->se.block_max = 0;
1604 p->se.exec_max = 0; 1655 p->se.exec_max = 0;
1656 p->se.slice_max = 0;
1605 p->se.wait_max = 0; 1657 p->se.wait_max = 0;
1606 p->se.wait_runtime_overruns = 0;
1607 p->se.wait_runtime_underruns = 0;
1608#endif 1658#endif
1609 1659
1610 INIT_LIST_HEAD(&p->run_list); 1660 INIT_LIST_HEAD(&p->run_list);
@@ -1635,12 +1685,14 @@ void sched_fork(struct task_struct *p, int clone_flags)
1635#ifdef CONFIG_SMP 1685#ifdef CONFIG_SMP
1636 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1686 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1637#endif 1687#endif
1638 __set_task_cpu(p, cpu); 1688 set_task_cpu(p, cpu);
1639 1689
1640 /* 1690 /*
1641 * Make sure we do not leak PI boosting priority to the child: 1691 * Make sure we do not leak PI boosting priority to the child:
1642 */ 1692 */
1643 p->prio = current->normal_prio; 1693 p->prio = current->normal_prio;
1694 if (!rt_prio(p->prio))
1695 p->sched_class = &fair_sched_class;
1644 1696
1645#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1697#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1646 if (likely(sched_info_on())) 1698 if (likely(sched_info_on()))
@@ -1657,12 +1709,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
1657} 1709}
1658 1710
1659/* 1711/*
1660 * After fork, child runs first. (default) If set to 0 then
1661 * parent will (try to) run first.
1662 */
1663unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1664
1665/*
1666 * wake_up_new_task - wake up a newly created task for the first time. 1712 * wake_up_new_task - wake up a newly created task for the first time.
1667 * 1713 *
1668 * This function will do some initial scheduler statistics housekeeping 1714 * This function will do some initial scheduler statistics housekeeping
@@ -1673,24 +1719,14 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1673{ 1719{
1674 unsigned long flags; 1720 unsigned long flags;
1675 struct rq *rq; 1721 struct rq *rq;
1676 int this_cpu;
1677 1722
1678 rq = task_rq_lock(p, &flags); 1723 rq = task_rq_lock(p, &flags);
1679 BUG_ON(p->state != TASK_RUNNING); 1724 BUG_ON(p->state != TASK_RUNNING);
1680 this_cpu = smp_processor_id(); /* parent's CPU */
1681 update_rq_clock(rq); 1725 update_rq_clock(rq);
1682 1726
1683 p->prio = effective_prio(p); 1727 p->prio = effective_prio(p);
1684 1728
1685 if (rt_prio(p->prio)) 1729 if (!p->sched_class->task_new || !current->se.on_rq) {
1686 p->sched_class = &rt_sched_class;
1687 else
1688 p->sched_class = &fair_sched_class;
1689
1690 if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
1691 (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
1692 !current->se.on_rq) {
1693
1694 activate_task(rq, p, 0); 1730 activate_task(rq, p, 0);
1695 } else { 1731 } else {
1696 /* 1732 /*
@@ -1799,7 +1835,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1799 * with the lock held can cause deadlocks; see schedule() for 1835 * with the lock held can cause deadlocks; see schedule() for
1800 * details.) 1836 * details.)
1801 */ 1837 */
1802static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) 1838static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1803 __releases(rq->lock) 1839 __releases(rq->lock)
1804{ 1840{
1805 struct mm_struct *mm = rq->prev_mm; 1841 struct mm_struct *mm = rq->prev_mm;
@@ -1849,7 +1885,7 @@ asmlinkage void schedule_tail(struct task_struct *prev)
1849 preempt_enable(); 1885 preempt_enable();
1850#endif 1886#endif
1851 if (current->set_child_tid) 1887 if (current->set_child_tid)
1852 put_user(current->pid, current->set_child_tid); 1888 put_user(task_pid_vnr(current), current->set_child_tid);
1853} 1889}
1854 1890
1855/* 1891/*
@@ -1981,42 +2017,10 @@ unsigned long nr_active(void)
1981 */ 2017 */
1982static void update_cpu_load(struct rq *this_rq) 2018static void update_cpu_load(struct rq *this_rq)
1983{ 2019{
1984 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; 2020 unsigned long this_load = this_rq->load.weight;
1985 unsigned long total_load = this_rq->ls.load.weight;
1986 unsigned long this_load = total_load;
1987 struct load_stat *ls = &this_rq->ls;
1988 int i, scale; 2021 int i, scale;
1989 2022
1990 this_rq->nr_load_updates++; 2023 this_rq->nr_load_updates++;
1991 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1992 goto do_avg;
1993
1994 /* Update delta_fair/delta_exec fields first */
1995 update_curr_load(this_rq);
1996
1997 fair_delta64 = ls->delta_fair + 1;
1998 ls->delta_fair = 0;
1999
2000 exec_delta64 = ls->delta_exec + 1;
2001 ls->delta_exec = 0;
2002
2003 sample_interval64 = this_rq->clock - ls->load_update_last;
2004 ls->load_update_last = this_rq->clock;
2005
2006 if ((s64)sample_interval64 < (s64)TICK_NSEC)
2007 sample_interval64 = TICK_NSEC;
2008
2009 if (exec_delta64 > sample_interval64)
2010 exec_delta64 = sample_interval64;
2011
2012 idle_delta64 = sample_interval64 - exec_delta64;
2013
2014 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
2015 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
2016
2017 this_load = (unsigned long)tmp64;
2018
2019do_avg:
2020 2024
2021 /* Update our load: */ 2025 /* Update our load: */
2022 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2026 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2026,7 +2030,13 @@ do_avg:
2026 2030
2027 old_load = this_rq->cpu_load[i]; 2031 old_load = this_rq->cpu_load[i];
2028 new_load = this_load; 2032 new_load = this_load;
2029 2033 /*
2034 * Round up the averaging division if load is increasing. This
2035 * prevents us from getting stuck on 9 if the load is 10, for
2036 * example.
2037 */
2038 if (new_load > old_load)
2039 new_load += scale-1;
2030 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 2040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2031 } 2041 }
2032} 2042}
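
The comment above ("stuck on 9 if the load is 10") is easy to verify. The following stand-alone sketch is not kernel code, it just reuses the same arithmetic with illustrative constants: with the scale-1 round-up every cpu_load[i] horizon converges to a steady load of 10, whereas plain truncation would leave the longer horizons parked at 9.

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

/*
 * Illustrative model of the decaying cpu_load[] update in the hunk above:
 * scale is 2^i, and rounding the division up while the load is rising lets
 * the average actually reach the new value instead of stalling one below it.
 */
static void update_load(unsigned long cpu_load[], unsigned long this_load)
{
	unsigned long scale;
	int i;

	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load = cpu_load[i];
		unsigned long new_load = this_load;

		if (new_load > old_load)
			new_load += scale - 1;		/* round up */
		cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}
}

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
	int tick, i;

	for (tick = 0; tick < 32; tick++)
		update_load(cpu_load, 10);	/* constant load of 10 */

	for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
		printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
	return 0;
}
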
@@ -2178,13 +2188,38 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2178 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2188 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2179 * 3) are cache-hot on their current CPU. 2189 * 3) are cache-hot on their current CPU.
2180 */ 2190 */
2181 if (!cpu_isset(this_cpu, p->cpus_allowed)) 2191 if (!cpu_isset(this_cpu, p->cpus_allowed)) {
2192 schedstat_inc(p, se.nr_failed_migrations_affine);
2182 return 0; 2193 return 0;
2194 }
2183 *all_pinned = 0; 2195 *all_pinned = 0;
2184 2196
2185 if (task_running(rq, p)) 2197 if (task_running(rq, p)) {
2198 schedstat_inc(p, se.nr_failed_migrations_running);
2186 return 0; 2199 return 0;
2200 }
2201
2202 /*
2203 * Aggressive migration if:
2204 * 1) task is cache cold, or
2205 * 2) too many balance attempts have failed.
2206 */
2207
2208 if (!task_hot(p, rq->clock, sd) ||
2209 sd->nr_balance_failed > sd->cache_nice_tries) {
2210#ifdef CONFIG_SCHEDSTATS
2211 if (task_hot(p, rq->clock, sd)) {
2212 schedstat_inc(sd, lb_hot_gained[idle]);
2213 schedstat_inc(p, se.nr_forced_migrations);
2214 }
2215#endif
2216 return 1;
2217 }
2187 2218
2219 if (task_hot(p, rq->clock, sd)) {
2220 schedstat_inc(p, se.nr_failed_migrations_hot);
2221 return 0;
2222 }
2188 return 1; 2223 return 1;
2189} 2224}
2190 2225
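
For reference, here is a compact user-space model of the reordered filter in this hunk: each early exit bumps its own failure counter, and a cache-hot task is still migrated once the domain has failed to balance more often than cache_nice_tries. Field and function names are simplified stand-ins, not the kernel's.

#include <stdio.h>

struct stats {
	unsigned nr_failed_affine, nr_failed_running, nr_failed_hot;
	unsigned nr_forced_migrations;
};

struct task {
	unsigned allowed_mask;	/* one bit per CPU */
	int running;
	int cache_hot;
};

/* Simplified stand-in for can_migrate_task(): same check order as above. */
static int can_migrate(struct task *p, int this_cpu,
		       int nr_balance_failed, int cache_nice_tries,
		       struct stats *st)
{
	if (!(p->allowed_mask & (1u << this_cpu))) {
		st->nr_failed_affine++;
		return 0;
	}
	if (p->running) {
		st->nr_failed_running++;
		return 0;
	}
	/* aggressive migration: cache-cold, or too many failed attempts */
	if (!p->cache_hot || nr_balance_failed > cache_nice_tries) {
		if (p->cache_hot)
			st->nr_forced_migrations++;
		return 1;
	}
	st->nr_failed_hot++;
	return 0;
}

int main(void)
{
	struct stats st = { 0 };
	struct task hot = { .allowed_mask = 0x3, .running = 0, .cache_hot = 1 };

	printf("balanced ok, hot task: %d\n", can_migrate(&hot, 0, 0, 2, &st));
	printf("after 3 failures:      %d\n", can_migrate(&hot, 0, 3, 2, &st));
	printf("forced migrations:     %u\n", st.nr_forced_migrations);
	return 0;
}
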
@@ -2263,7 +2298,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2263 struct sched_domain *sd, enum cpu_idle_type idle, 2298 struct sched_domain *sd, enum cpu_idle_type idle,
2264 int *all_pinned) 2299 int *all_pinned)
2265{ 2300{
2266 struct sched_class *class = sched_class_highest; 2301 const struct sched_class *class = sched_class_highest;
2267 unsigned long total_load_moved = 0; 2302 unsigned long total_load_moved = 0;
2268 int this_best_prio = this_rq->curr->prio; 2303 int this_best_prio = this_rq->curr->prio;
2269 2304
@@ -2288,7 +2323,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2288static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 2323static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2289 struct sched_domain *sd, enum cpu_idle_type idle) 2324 struct sched_domain *sd, enum cpu_idle_type idle)
2290{ 2325{
2291 struct sched_class *class; 2326 const struct sched_class *class;
2292 int this_best_prio = MAX_PRIO; 2327 int this_best_prio = MAX_PRIO;
2293 2328
2294 for (class = sched_class_highest; class; class = class->next) 2329 for (class = sched_class_highest; class; class = class->next)
@@ -2315,7 +2350,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2315 unsigned long max_pull; 2350 unsigned long max_pull;
2316 unsigned long busiest_load_per_task, busiest_nr_running; 2351 unsigned long busiest_load_per_task, busiest_nr_running;
2317 unsigned long this_load_per_task, this_nr_running; 2352 unsigned long this_load_per_task, this_nr_running;
2318 int load_idx; 2353 int load_idx, group_imb = 0;
2319#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2354#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2320 int power_savings_balance = 1; 2355 int power_savings_balance = 1;
2321 unsigned long leader_nr_running = 0, min_load_per_task = 0; 2356 unsigned long leader_nr_running = 0, min_load_per_task = 0;
@@ -2334,9 +2369,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2334 load_idx = sd->idle_idx; 2369 load_idx = sd->idle_idx;
2335 2370
2336 do { 2371 do {
2337 unsigned long load, group_capacity; 2372 unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
2338 int local_group; 2373 int local_group;
2339 int i; 2374 int i;
2375 int __group_imb = 0;
2340 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2376 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2341 unsigned long sum_nr_running, sum_weighted_load; 2377 unsigned long sum_nr_running, sum_weighted_load;
2342 2378
@@ -2347,6 +2383,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2347 2383
2348 /* Tally up the load of all CPUs in the group */ 2384 /* Tally up the load of all CPUs in the group */
2349 sum_weighted_load = sum_nr_running = avg_load = 0; 2385 sum_weighted_load = sum_nr_running = avg_load = 0;
2386 max_cpu_load = 0;
2387 min_cpu_load = ~0UL;
2350 2388
2351 for_each_cpu_mask(i, group->cpumask) { 2389 for_each_cpu_mask(i, group->cpumask) {
2352 struct rq *rq; 2390 struct rq *rq;
@@ -2367,8 +2405,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2367 } 2405 }
2368 2406
2369 load = target_load(i, load_idx); 2407 load = target_load(i, load_idx);
2370 } else 2408 } else {
2371 load = source_load(i, load_idx); 2409 load = source_load(i, load_idx);
2410 if (load > max_cpu_load)
2411 max_cpu_load = load;
2412 if (min_cpu_load > load)
2413 min_cpu_load = load;
2414 }
2372 2415
2373 avg_load += load; 2416 avg_load += load;
2374 sum_nr_running += rq->nr_running; 2417 sum_nr_running += rq->nr_running;
@@ -2394,6 +2437,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2394 avg_load = sg_div_cpu_power(group, 2437 avg_load = sg_div_cpu_power(group,
2395 avg_load * SCHED_LOAD_SCALE); 2438 avg_load * SCHED_LOAD_SCALE);
2396 2439
2440 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
2441 __group_imb = 1;
2442
2397 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 2443 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2398 2444
2399 if (local_group) { 2445 if (local_group) {
@@ -2402,11 +2448,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2402 this_nr_running = sum_nr_running; 2448 this_nr_running = sum_nr_running;
2403 this_load_per_task = sum_weighted_load; 2449 this_load_per_task = sum_weighted_load;
2404 } else if (avg_load > max_load && 2450 } else if (avg_load > max_load &&
2405 sum_nr_running > group_capacity) { 2451 (sum_nr_running > group_capacity || __group_imb)) {
2406 max_load = avg_load; 2452 max_load = avg_load;
2407 busiest = group; 2453 busiest = group;
2408 busiest_nr_running = sum_nr_running; 2454 busiest_nr_running = sum_nr_running;
2409 busiest_load_per_task = sum_weighted_load; 2455 busiest_load_per_task = sum_weighted_load;
2456 group_imb = __group_imb;
2410 } 2457 }
2411 2458
2412#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2459#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2478,6 +2525,9 @@ group_next:
2478 goto out_balanced; 2525 goto out_balanced;
2479 2526
2480 busiest_load_per_task /= busiest_nr_running; 2527 busiest_load_per_task /= busiest_nr_running;
2528 if (group_imb)
2529 busiest_load_per_task = min(busiest_load_per_task, avg_load);
2530
2481 /* 2531 /*
2482 * We're trying to get all the cpus to the average_load, so we don't 2532 * We're trying to get all the cpus to the average_load, so we don't
2483 * want to push ourselves above the average load, nor do we wish to 2533 * want to push ourselves above the average load, nor do we wish to
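
The new max_cpu_load/min_cpu_load bookkeeping lets a group be treated as busiest even when its task count does not exceed its capacity, as long as the spread between its most and least loaded CPU exceeds SCHED_LOAD_SCALE; busiest_load_per_task is then clipped to avg_load so the imbalance estimate stays sane. A toy stand-alone version of the detection, using plain arrays and a 1024 load unit rather than kernel types:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

/*
 * Toy version of the per-group scan above: track the spread between the
 * most and least loaded CPU and flag the group as internally imbalanced
 * when it exceeds one full unit of load.
 */
static int group_imbalanced(const unsigned long *load, int n)
{
	unsigned long max_cpu_load = 0, min_cpu_load = ~0UL;
	int i;

	for (i = 0; i < n; i++) {
		if (load[i] > max_cpu_load)
			max_cpu_load = load[i];
		if (load[i] < min_cpu_load)
			min_cpu_load = load[i];
	}
	return (max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE;
}

int main(void)
{
	/* two nice-0 tasks on cpu0, nothing on cpu1: spread is 2048 */
	unsigned long skewed[] = { 2 * SCHED_LOAD_SCALE, 0 };
	/* one nice-0 task on each cpu: spread is 0 */
	unsigned long even[]   = { SCHED_LOAD_SCALE, SCHED_LOAD_SCALE };

	printf("skewed group imbalanced: %d\n", group_imbalanced(skewed, 2));
	printf("even group imbalanced:   %d\n", group_imbalanced(even, 2));
	return 0;
}
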
@@ -2652,7 +2702,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2652 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2702 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2653 sd_idle = 1; 2703 sd_idle = 1;
2654 2704
2655 schedstat_inc(sd, lb_cnt[idle]); 2705 schedstat_inc(sd, lb_count[idle]);
2656 2706
2657redo: 2707redo:
2658 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2708 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
@@ -2805,7 +2855,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2805 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2855 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2806 sd_idle = 1; 2856 sd_idle = 1;
2807 2857
2808 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); 2858 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
2809redo: 2859redo:
2810 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 2860 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2811 &sd_idle, &cpus, NULL); 2861 &sd_idle, &cpus, NULL);
@@ -2939,7 +2989,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2939 } 2989 }
2940 2990
2941 if (likely(sd)) { 2991 if (likely(sd)) {
2942 schedstat_inc(sd, alb_cnt); 2992 schedstat_inc(sd, alb_count);
2943 2993
2944 if (move_one_task(target_rq, target_cpu, busiest_rq, 2994 if (move_one_task(target_rq, target_cpu, busiest_rq,
2945 sd, CPU_IDLE)) 2995 sd, CPU_IDLE))
@@ -3032,7 +3082,7 @@ static DEFINE_SPINLOCK(balancing);
3032 * 3082 *
3033 * Balancing parameters are set up in arch_init_sched_domains. 3083 * Balancing parameters are set up in arch_init_sched_domains.
3034 */ 3084 */
3035static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) 3085static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3036{ 3086{
3037 int balance = 1; 3087 int balance = 1;
3038 struct rq *rq = cpu_rq(cpu); 3088 struct rq *rq = cpu_rq(cpu);
@@ -3267,9 +3317,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
3267{ 3317{
3268 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3318 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3269 cputime64_t tmp; 3319 cputime64_t tmp;
3320 struct rq *rq = this_rq();
3270 3321
3271 p->utime = cputime_add(p->utime, cputime); 3322 p->utime = cputime_add(p->utime, cputime);
3272 3323
3324 if (p != rq->idle)
3325 cpuacct_charge(p, cputime);
3326
3273 /* Add user time to cpustat. */ 3327 /* Add user time to cpustat. */
3274 tmp = cputime_to_cputime64(cputime); 3328 tmp = cputime_to_cputime64(cputime);
3275 if (TASK_NICE(p) > 0) 3329 if (TASK_NICE(p) > 0)
@@ -3279,6 +3333,35 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
3279} 3333}
3280 3334
3281/* 3335/*
3336 * Account guest cpu time to a process.
3337 * @p: the process that the cpu time gets accounted to
3338 * @cputime: the cpu time spent in virtual machine since the last update
3339 */
3340void account_guest_time(struct task_struct *p, cputime_t cputime)
3341{
3342 cputime64_t tmp;
3343 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3344
3345 tmp = cputime_to_cputime64(cputime);
3346
3347 p->utime = cputime_add(p->utime, cputime);
3348 p->gtime = cputime_add(p->gtime, cputime);
3349
3350 cpustat->user = cputime64_add(cpustat->user, tmp);
3351 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3352}
3353
3354/*
3355 * Account scaled user cpu time to a process.
3356 * @p: the process that the cpu time gets accounted to
3357 * @cputime: the cpu time spent in user space since the last update
3358 */
3359void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
3360{
3361 p->utimescaled = cputime_add(p->utimescaled, cputime);
3362}
3363
3364/*
3282 * Account system cpu time to a process. 3365 * Account system cpu time to a process.
3283 * @p: the process that the cpu time gets accounted to 3366 * @p: the process that the cpu time gets accounted to
3284 * @hardirq_offset: the offset to subtract from hardirq_count() 3367 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3291,6 +3374,12 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3291 struct rq *rq = this_rq(); 3374 struct rq *rq = this_rq();
3292 cputime64_t tmp; 3375 cputime64_t tmp;
3293 3376
3377 if (p->flags & PF_VCPU) {
3378 account_guest_time(p, cputime);
3379 p->flags &= ~PF_VCPU;
3380 return;
3381 }
3382
3294 p->stime = cputime_add(p->stime, cputime); 3383 p->stime = cputime_add(p->stime, cputime);
3295 3384
3296 /* Add system time to cpustat. */ 3385 /* Add system time to cpustat. */
@@ -3299,9 +3388,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3299 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3388 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3300 else if (softirq_count()) 3389 else if (softirq_count())
3301 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3390 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3302 else if (p != rq->idle) 3391 else if (p != rq->idle) {
3303 cpustat->system = cputime64_add(cpustat->system, tmp); 3392 cpustat->system = cputime64_add(cpustat->system, tmp);
3304 else if (atomic_read(&rq->nr_iowait) > 0) 3393 cpuacct_charge(p, cputime);
3394 } else if (atomic_read(&rq->nr_iowait) > 0)
3305 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3395 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3306 else 3396 else
3307 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3397 cpustat->idle = cputime64_add(cpustat->idle, tmp);
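
The PF_VCPU branch routes time spent in a guest into utime/gtime and the user/guest cpustat buckets before any of the ordinary system-time accounting runs, and cpuacct_charge() now also feeds per-group accounting for non-idle tasks. A small stand-alone model of just the routing, with simplified field and flag names:

#include <stdio.h>

#define PF_VCPU 0x1	/* "was running a guest" flag in this sketch */

struct task {
	unsigned flags;
	unsigned long utime, stime, gtime;	/* ticks, for simplicity */
};

struct cpu_usage {
	unsigned long user, system, guest;
};

static void account_guest(struct task *p, struct cpu_usage *c, unsigned long t)
{
	p->utime += t;
	p->gtime += t;
	c->user  += t;
	c->guest += t;
}

/*
 * Same shape as the hunk above: guest time is split out before any of the
 * normal system-time buckets are touched.
 */
static void account_system(struct task *p, struct cpu_usage *c, unsigned long t)
{
	if (p->flags & PF_VCPU) {
		account_guest(p, c, t);
		p->flags &= ~PF_VCPU;
		return;
	}
	p->stime  += t;
	c->system += t;
}

int main(void)
{
	struct task vcpu = { .flags = PF_VCPU };
	struct cpu_usage cpustat = { 0 };

	account_system(&vcpu, &cpustat, 5);	/* guest exit: goes to guest */
	account_system(&vcpu, &cpustat, 3);	/* ordinary kernel time */
	printf("utime=%lu stime=%lu gtime=%lu guest=%lu system=%lu\n",
	       vcpu.utime, vcpu.stime, vcpu.gtime,
	       cpustat.guest, cpustat.system);
	return 0;
}
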
@@ -3310,6 +3400,17 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3310} 3400}
3311 3401
3312/* 3402/*
3403 * Account scaled system cpu time to a process.
3404 * @p: the process that the cpu time gets accounted to
3405 * @hardirq_offset: the offset to subtract from hardirq_count()
3406 * @cputime: the cpu time spent in kernel space since the last update
3407 */
3408void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
3409{
3410 p->stimescaled = cputime_add(p->stimescaled, cputime);
3411}
3412
3413/*
3313 * Account for involuntary wait time. 3414 * Account for involuntary wait time.
3314 * @p: the process from which the cpu time has been stolen 3415 * @p: the process from which the cpu time has been stolen
3315 * @steal: the cpu time spent in involuntary wait 3416 * @steal: the cpu time spent in involuntary wait
@@ -3326,8 +3427,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3326 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3427 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3327 else 3428 else
3328 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3429 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3329 } else 3430 } else {
3330 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3431 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3432 cpuacct_charge(p, -tmp);
3433 }
3331} 3434}
3332 3435
3333/* 3436/*
@@ -3407,7 +3510,7 @@ EXPORT_SYMBOL(sub_preempt_count);
3407static noinline void __schedule_bug(struct task_struct *prev) 3510static noinline void __schedule_bug(struct task_struct *prev)
3408{ 3511{
3409 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", 3512 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3410 prev->comm, preempt_count(), prev->pid); 3513 prev->comm, preempt_count(), task_pid_nr(prev));
3411 debug_show_held_locks(prev); 3514 debug_show_held_locks(prev);
3412 if (irqs_disabled()) 3515 if (irqs_disabled())
3413 print_irqtrace_events(prev); 3516 print_irqtrace_events(prev);
@@ -3429,7 +3532,13 @@ static inline void schedule_debug(struct task_struct *prev)
3429 3532
3430 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3533 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3431 3534
3432 schedstat_inc(this_rq(), sched_cnt); 3535 schedstat_inc(this_rq(), sched_count);
3536#ifdef CONFIG_SCHEDSTATS
3537 if (unlikely(prev->lock_depth >= 0)) {
3538 schedstat_inc(this_rq(), bkl_count);
3539 schedstat_inc(prev, sched_info.bkl_count);
3540 }
3541#endif
3433} 3542}
3434 3543
3435/* 3544/*
@@ -3438,7 +3547,7 @@ static inline void schedule_debug(struct task_struct *prev)
3438static inline struct task_struct * 3547static inline struct task_struct *
3439pick_next_task(struct rq *rq, struct task_struct *prev) 3548pick_next_task(struct rq *rq, struct task_struct *prev)
3440{ 3549{
3441 struct sched_class *class; 3550 const struct sched_class *class;
3442 struct task_struct *p; 3551 struct task_struct *p;
3443 3552
3444 /* 3553 /*
@@ -3487,9 +3596,13 @@ need_resched_nonpreemptible:
3487 3596
3488 schedule_debug(prev); 3597 schedule_debug(prev);
3489 3598
3490 spin_lock_irq(&rq->lock); 3599 /*
3491 clear_tsk_need_resched(prev); 3600 * Do the rq-clock update outside the rq lock:
3601 */
3602 local_irq_disable();
3492 __update_rq_clock(rq); 3603 __update_rq_clock(rq);
3604 spin_lock(&rq->lock);
3605 clear_tsk_need_resched(prev);
3493 3606
3494 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3607 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3495 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 3608 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
@@ -3549,27 +3662,30 @@ asmlinkage void __sched preempt_schedule(void)
3549 if (likely(ti->preempt_count || irqs_disabled())) 3662 if (likely(ti->preempt_count || irqs_disabled()))
3550 return; 3663 return;
3551 3664
3552need_resched: 3665 do {
3553 add_preempt_count(PREEMPT_ACTIVE); 3666 add_preempt_count(PREEMPT_ACTIVE);
3554 /* 3667
3555 * We keep the big kernel semaphore locked, but we 3668 /*
3556 * clear ->lock_depth so that schedule() doesn't 3669 * We keep the big kernel semaphore locked, but we
3557 * auto-release the semaphore: 3670 * clear ->lock_depth so that schedule() doesn't

3558 */ 3671 * auto-release the semaphore:
3672 */
3559#ifdef CONFIG_PREEMPT_BKL 3673#ifdef CONFIG_PREEMPT_BKL
3560 saved_lock_depth = task->lock_depth; 3674 saved_lock_depth = task->lock_depth;
3561 task->lock_depth = -1; 3675 task->lock_depth = -1;
3562#endif 3676#endif
3563 schedule(); 3677 schedule();
3564#ifdef CONFIG_PREEMPT_BKL 3678#ifdef CONFIG_PREEMPT_BKL
3565 task->lock_depth = saved_lock_depth; 3679 task->lock_depth = saved_lock_depth;
3566#endif 3680#endif
3567 sub_preempt_count(PREEMPT_ACTIVE); 3681 sub_preempt_count(PREEMPT_ACTIVE);
3568 3682
3569 /* we could miss a preemption opportunity between schedule and now */ 3683 /*
3570 barrier(); 3684 * Check again in case we missed a preemption opportunity
3571 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3685 * between schedule and now.
3572 goto need_resched; 3686 */
3687 barrier();
3688 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
3573} 3689}
3574EXPORT_SYMBOL(preempt_schedule); 3690EXPORT_SYMBOL(preempt_schedule);
3575 3691
@@ -3589,29 +3705,32 @@ asmlinkage void __sched preempt_schedule_irq(void)
3589 /* Catch callers which need to be fixed */ 3705 /* Catch callers which need to be fixed */
3590 BUG_ON(ti->preempt_count || !irqs_disabled()); 3706 BUG_ON(ti->preempt_count || !irqs_disabled());
3591 3707
3592need_resched: 3708 do {
3593 add_preempt_count(PREEMPT_ACTIVE); 3709 add_preempt_count(PREEMPT_ACTIVE);
3594 /* 3710
3595 * We keep the big kernel semaphore locked, but we 3711 /*
3596 * clear ->lock_depth so that schedule() doesn't 3712 * We keep the big kernel semaphore locked, but we
3597 * auto-release the semaphore: 3713 * clear ->lock_depth so that schedule() doesn't
3598 */ 3714 * auto-release the semaphore:
3715 */
3599#ifdef CONFIG_PREEMPT_BKL 3716#ifdef CONFIG_PREEMPT_BKL
3600 saved_lock_depth = task->lock_depth; 3717 saved_lock_depth = task->lock_depth;
3601 task->lock_depth = -1; 3718 task->lock_depth = -1;
3602#endif 3719#endif
3603 local_irq_enable(); 3720 local_irq_enable();
3604 schedule(); 3721 schedule();
3605 local_irq_disable(); 3722 local_irq_disable();
3606#ifdef CONFIG_PREEMPT_BKL 3723#ifdef CONFIG_PREEMPT_BKL
3607 task->lock_depth = saved_lock_depth; 3724 task->lock_depth = saved_lock_depth;
3608#endif 3725#endif
3609 sub_preempt_count(PREEMPT_ACTIVE); 3726 sub_preempt_count(PREEMPT_ACTIVE);
3610 3727
3611 /* we could miss a preemption opportunity between schedule and now */ 3728 /*
3612 barrier(); 3729 * Check again in case we missed a preemption opportunity
3613 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3730 * between schedule and now.
3614 goto need_resched; 3731 */
3732 barrier();
3733 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
3615} 3734}
3616 3735
3617#endif /* CONFIG_PREEMPT */ 3736#endif /* CONFIG_PREEMPT */
@@ -3635,10 +3754,9 @@ EXPORT_SYMBOL(default_wake_function);
3635static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3754static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3636 int nr_exclusive, int sync, void *key) 3755 int nr_exclusive, int sync, void *key)
3637{ 3756{
3638 struct list_head *tmp, *next; 3757 wait_queue_t *curr, *next;
3639 3758
3640 list_for_each_safe(tmp, next, &q->task_list) { 3759 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3641 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3642 unsigned flags = curr->flags; 3760 unsigned flags = curr->flags;
3643 3761
3644 if (curr->func(curr, mode, sync, key) && 3762 if (curr->func(curr, mode, sync, key) &&
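
The wakeup loop now uses list_for_each_entry_safe(), which caches the next waiter before invoking the callback so the current one may remove itself (autoremove wakeups do exactly that). A minimal stand-alone illustration of the same "grab next before touching current" idea, using an ordinary singly linked list rather than the kernel's list_head:

#include <stdio.h>
#include <stdlib.h>

struct waiter {
	int id;
	int exclusive;
	struct waiter *next;
};

static struct waiter *head;

static void add_waiter(int id, int exclusive)
{
	struct waiter *w = malloc(sizeof(*w));

	w->id = id;
	w->exclusive = exclusive;
	w->next = head;
	head = w;
}

/*
 * "Safe" iteration: remember the next node before handling the current
 * one, so the handler is free to unlink and free it.
 */
static void wake_up_all(void)
{
	struct waiter *curr, *next;

	for (curr = head; curr; curr = next) {
		next = curr->next;	/* grab it before curr goes away */
		printf("waking %d%s\n", curr->id,
		       curr->exclusive ? " (exclusive)" : "");
		free(curr);		/* the waiter removes itself */
	}
	head = NULL;
}

int main(void)
{
	add_waiter(1, 0);
	add_waiter(2, 1);
	add_waiter(3, 0);
	wake_up_all();
	return 0;
}
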
@@ -3728,206 +3846,119 @@ void fastcall complete_all(struct completion *x)
3728} 3846}
3729EXPORT_SYMBOL(complete_all); 3847EXPORT_SYMBOL(complete_all);
3730 3848
3731void fastcall __sched wait_for_completion(struct completion *x) 3849static inline long __sched
3850do_wait_for_common(struct completion *x, long timeout, int state)
3732{ 3851{
3733 might_sleep();
3734
3735 spin_lock_irq(&x->wait.lock);
3736 if (!x->done) { 3852 if (!x->done) {
3737 DECLARE_WAITQUEUE(wait, current); 3853 DECLARE_WAITQUEUE(wait, current);
3738 3854
3739 wait.flags |= WQ_FLAG_EXCLUSIVE; 3855 wait.flags |= WQ_FLAG_EXCLUSIVE;
3740 __add_wait_queue_tail(&x->wait, &wait); 3856 __add_wait_queue_tail(&x->wait, &wait);
3741 do { 3857 do {
3742 __set_current_state(TASK_UNINTERRUPTIBLE); 3858 if (state == TASK_INTERRUPTIBLE &&
3743 spin_unlock_irq(&x->wait.lock); 3859 signal_pending(current)) {
3744 schedule(); 3860 __remove_wait_queue(&x->wait, &wait);
3745 spin_lock_irq(&x->wait.lock); 3861 return -ERESTARTSYS;
3746 } while (!x->done); 3862 }
3747 __remove_wait_queue(&x->wait, &wait); 3863 __set_current_state(state);
3748 }
3749 x->done--;
3750 spin_unlock_irq(&x->wait.lock);
3751}
3752EXPORT_SYMBOL(wait_for_completion);
3753
3754unsigned long fastcall __sched
3755wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3756{
3757 might_sleep();
3758
3759 spin_lock_irq(&x->wait.lock);
3760 if (!x->done) {
3761 DECLARE_WAITQUEUE(wait, current);
3762
3763 wait.flags |= WQ_FLAG_EXCLUSIVE;
3764 __add_wait_queue_tail(&x->wait, &wait);
3765 do {
3766 __set_current_state(TASK_UNINTERRUPTIBLE);
3767 spin_unlock_irq(&x->wait.lock); 3864 spin_unlock_irq(&x->wait.lock);
3768 timeout = schedule_timeout(timeout); 3865 timeout = schedule_timeout(timeout);
3769 spin_lock_irq(&x->wait.lock); 3866 spin_lock_irq(&x->wait.lock);
3770 if (!timeout) { 3867 if (!timeout) {
3771 __remove_wait_queue(&x->wait, &wait); 3868 __remove_wait_queue(&x->wait, &wait);
3772 goto out; 3869 return timeout;
3773 } 3870 }
3774 } while (!x->done); 3871 } while (!x->done);
3775 __remove_wait_queue(&x->wait, &wait); 3872 __remove_wait_queue(&x->wait, &wait);
3776 } 3873 }
3777 x->done--; 3874 x->done--;
3778out:
3779 spin_unlock_irq(&x->wait.lock);
3780 return timeout; 3875 return timeout;
3781} 3876}
3782EXPORT_SYMBOL(wait_for_completion_timeout);
3783 3877
3784int fastcall __sched wait_for_completion_interruptible(struct completion *x) 3878static long __sched
3879wait_for_common(struct completion *x, long timeout, int state)
3785{ 3880{
3786 int ret = 0;
3787
3788 might_sleep(); 3881 might_sleep();
3789 3882
3790 spin_lock_irq(&x->wait.lock); 3883 spin_lock_irq(&x->wait.lock);
3791 if (!x->done) { 3884 timeout = do_wait_for_common(x, timeout, state);
3792 DECLARE_WAITQUEUE(wait, current);
3793
3794 wait.flags |= WQ_FLAG_EXCLUSIVE;
3795 __add_wait_queue_tail(&x->wait, &wait);
3796 do {
3797 if (signal_pending(current)) {
3798 ret = -ERESTARTSYS;
3799 __remove_wait_queue(&x->wait, &wait);
3800 goto out;
3801 }
3802 __set_current_state(TASK_INTERRUPTIBLE);
3803 spin_unlock_irq(&x->wait.lock);
3804 schedule();
3805 spin_lock_irq(&x->wait.lock);
3806 } while (!x->done);
3807 __remove_wait_queue(&x->wait, &wait);
3808 }
3809 x->done--;
3810out:
3811 spin_unlock_irq(&x->wait.lock); 3885 spin_unlock_irq(&x->wait.lock);
3886 return timeout;
3887}
3812 3888
3813 return ret; 3889void fastcall __sched wait_for_completion(struct completion *x)
3890{
3891 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3814} 3892}
3815EXPORT_SYMBOL(wait_for_completion_interruptible); 3893EXPORT_SYMBOL(wait_for_completion);
3816 3894
3817unsigned long fastcall __sched 3895unsigned long fastcall __sched
3818wait_for_completion_interruptible_timeout(struct completion *x, 3896wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3819 unsigned long timeout)
3820{ 3897{
3821 might_sleep(); 3898 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3822
3823 spin_lock_irq(&x->wait.lock);
3824 if (!x->done) {
3825 DECLARE_WAITQUEUE(wait, current);
3826
3827 wait.flags |= WQ_FLAG_EXCLUSIVE;
3828 __add_wait_queue_tail(&x->wait, &wait);
3829 do {
3830 if (signal_pending(current)) {
3831 timeout = -ERESTARTSYS;
3832 __remove_wait_queue(&x->wait, &wait);
3833 goto out;
3834 }
3835 __set_current_state(TASK_INTERRUPTIBLE);
3836 spin_unlock_irq(&x->wait.lock);
3837 timeout = schedule_timeout(timeout);
3838 spin_lock_irq(&x->wait.lock);
3839 if (!timeout) {
3840 __remove_wait_queue(&x->wait, &wait);
3841 goto out;
3842 }
3843 } while (!x->done);
3844 __remove_wait_queue(&x->wait, &wait);
3845 }
3846 x->done--;
3847out:
3848 spin_unlock_irq(&x->wait.lock);
3849 return timeout;
3850} 3899}
3851EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3900EXPORT_SYMBOL(wait_for_completion_timeout);
3852 3901
3853static inline void 3902int __sched wait_for_completion_interruptible(struct completion *x)
3854sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3855{ 3903{
3856 spin_lock_irqsave(&q->lock, *flags); 3904 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3857 __add_wait_queue(q, wait); 3905 if (t == -ERESTARTSYS)
3858 spin_unlock(&q->lock); 3906 return t;
3907 return 0;
3859} 3908}
3909EXPORT_SYMBOL(wait_for_completion_interruptible);
3860 3910
3861static inline void 3911unsigned long fastcall __sched
3862sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) 3912wait_for_completion_interruptible_timeout(struct completion *x,
3913 unsigned long timeout)
3863{ 3914{
3864 spin_lock_irq(&q->lock); 3915 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3865 __remove_wait_queue(q, wait);
3866 spin_unlock_irqrestore(&q->lock, *flags);
3867} 3916}
3917EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3868 3918
3869void __sched interruptible_sleep_on(wait_queue_head_t *q) 3919static long __sched
3920sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3870{ 3921{
3871 unsigned long flags; 3922 unsigned long flags;
3872 wait_queue_t wait; 3923 wait_queue_t wait;
3873 3924
3874 init_waitqueue_entry(&wait, current); 3925 init_waitqueue_entry(&wait, current);
3875 3926
3876 current->state = TASK_INTERRUPTIBLE; 3927 __set_current_state(state);
3877 3928
3878 sleep_on_head(q, &wait, &flags); 3929 spin_lock_irqsave(&q->lock, flags);
3879 schedule(); 3930 __add_wait_queue(q, &wait);
3880 sleep_on_tail(q, &wait, &flags); 3931 spin_unlock(&q->lock);
3932 timeout = schedule_timeout(timeout);
3933 spin_lock_irq(&q->lock);
3934 __remove_wait_queue(q, &wait);
3935 spin_unlock_irqrestore(&q->lock, flags);
3936
3937 return timeout;
3938}
3939
3940void __sched interruptible_sleep_on(wait_queue_head_t *q)
3941{
3942 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3881} 3943}
3882EXPORT_SYMBOL(interruptible_sleep_on); 3944EXPORT_SYMBOL(interruptible_sleep_on);
3883 3945
3884long __sched 3946long __sched
3885interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3947interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3886{ 3948{
3887 unsigned long flags; 3949 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3888 wait_queue_t wait;
3889
3890 init_waitqueue_entry(&wait, current);
3891
3892 current->state = TASK_INTERRUPTIBLE;
3893
3894 sleep_on_head(q, &wait, &flags);
3895 timeout = schedule_timeout(timeout);
3896 sleep_on_tail(q, &wait, &flags);
3897
3898 return timeout;
3899} 3950}
3900EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3951EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3901 3952
3902void __sched sleep_on(wait_queue_head_t *q) 3953void __sched sleep_on(wait_queue_head_t *q)
3903{ 3954{
3904 unsigned long flags; 3955 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3905 wait_queue_t wait;
3906
3907 init_waitqueue_entry(&wait, current);
3908
3909 current->state = TASK_UNINTERRUPTIBLE;
3910
3911 sleep_on_head(q, &wait, &flags);
3912 schedule();
3913 sleep_on_tail(q, &wait, &flags);
3914} 3956}
3915EXPORT_SYMBOL(sleep_on); 3957EXPORT_SYMBOL(sleep_on);
3916 3958
3917long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3959long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3918{ 3960{
3919 unsigned long flags; 3961 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3920 wait_queue_t wait;
3921
3922 init_waitqueue_entry(&wait, current);
3923
3924 current->state = TASK_UNINTERRUPTIBLE;
3925
3926 sleep_on_head(q, &wait, &flags);
3927 timeout = schedule_timeout(timeout);
3928 sleep_on_tail(q, &wait, &flags);
3929
3930 return timeout;
3931} 3962}
3932EXPORT_SYMBOL(sleep_on_timeout); 3963EXPORT_SYMBOL(sleep_on_timeout);
3933 3964
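
All four wait_for_completion*() variants now funnel into a single wait_for_common(x, timeout, state) core, so the exported functions shrink to one-line wrappers. The user-space sketch below mirrors that shape with a pthread condition variable; the names, the millisecond timeout convention and the omission of the interruptible variants are ours, not the kernel's.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	unsigned int    done;
};

static void complete(struct completion *x)
{
	pthread_mutex_lock(&x->lock);
	x->done++;
	pthread_cond_signal(&x->cond);
	pthread_mutex_unlock(&x->lock);
}

/* The one parameterized core: returns 1 if completed, 0 on timeout. */
static int wait_for_common(struct completion *x, long timeout_ms)
{
	struct timespec deadline;
	int completed = 1;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec  += timeout_ms / 1000;
	deadline.tv_nsec += (timeout_ms % 1000) * 1000000L;
	if (deadline.tv_nsec >= 1000000000L) {
		deadline.tv_sec++;
		deadline.tv_nsec -= 1000000000L;
	}

	pthread_mutex_lock(&x->lock);
	while (!x->done) {
		if (pthread_cond_timedwait(&x->cond, &x->lock, &deadline)) {
			completed = 0;		/* timed out */
			break;
		}
	}
	if (completed)
		x->done--;
	pthread_mutex_unlock(&x->lock);
	return completed;
}

/* The public variants stay as thin wrappers, as in the hunk above. */
static void wait_for_completion(struct completion *x)
{
	wait_for_common(x, 60 * 60 * 1000L);	/* "forever" for this sketch */
}

static int wait_for_completion_timeout(struct completion *x, long ms)
{
	return wait_for_common(x, ms);
}

static void *worker(void *arg)
{
	complete(arg);
	return NULL;
}

int main(void)
{
	struct completion c = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
	};
	pthread_t t;

	pthread_create(&t, NULL, worker, &c);
	wait_for_completion(&c);
	pthread_join(t, NULL);
	printf("second wait (expect timeout): %d\n",
	       wait_for_completion_timeout(&c, 50));
	return 0;
}
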
@@ -3946,7 +3977,7 @@ EXPORT_SYMBOL(sleep_on_timeout);
3946void rt_mutex_setprio(struct task_struct *p, int prio) 3977void rt_mutex_setprio(struct task_struct *p, int prio)
3947{ 3978{
3948 unsigned long flags; 3979 unsigned long flags;
3949 int oldprio, on_rq; 3980 int oldprio, on_rq, running;
3950 struct rq *rq; 3981 struct rq *rq;
3951 3982
3952 BUG_ON(prio < 0 || prio > MAX_PRIO); 3983 BUG_ON(prio < 0 || prio > MAX_PRIO);
@@ -3956,8 +3987,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3956 3987
3957 oldprio = p->prio; 3988 oldprio = p->prio;
3958 on_rq = p->se.on_rq; 3989 on_rq = p->se.on_rq;
3959 if (on_rq) 3990 running = task_running(rq, p);
3991 if (on_rq) {
3960 dequeue_task(rq, p, 0); 3992 dequeue_task(rq, p, 0);
3993 if (running)
3994 p->sched_class->put_prev_task(rq, p);
3995 }
3961 3996
3962 if (rt_prio(prio)) 3997 if (rt_prio(prio))
3963 p->sched_class = &rt_sched_class; 3998 p->sched_class = &rt_sched_class;
@@ -3967,13 +4002,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3967 p->prio = prio; 4002 p->prio = prio;
3968 4003
3969 if (on_rq) { 4004 if (on_rq) {
4005 if (running)
4006 p->sched_class->set_curr_task(rq);
3970 enqueue_task(rq, p, 0); 4007 enqueue_task(rq, p, 0);
3971 /* 4008 /*
3972 * Reschedule if we are currently running on this runqueue and 4009 * Reschedule if we are currently running on this runqueue and
3973 * our priority decreased, or if we are not currently running on 4010 * our priority decreased, or if we are not currently running on
3974 * this runqueue and our priority is higher than the current's 4011 * this runqueue and our priority is higher than the current's
3975 */ 4012 */
3976 if (task_running(rq, p)) { 4013 if (running) {
3977 if (p->prio > oldprio) 4014 if (p->prio > oldprio)
3978 resched_task(rq->curr); 4015 resched_task(rq->curr);
3979 } else { 4016 } else {
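
Both rt_mutex_setprio() here and sched_setscheduler() in the following hunks now record whether the task is the one currently running and bracket the change with put_prev_task() on the old class and set_curr_task() on the new one. A stand-alone model of that ordering using function pointers; the class methods only log, and all names are simplified stand-ins:

#include <stdio.h>

struct task;

struct sched_class {
	const char *name;
	void (*put_prev_task)(struct task *p);
	void (*set_curr_task)(struct task *p);
};

struct task {
	const char *comm;
	int on_rq;
	int running;
	const struct sched_class *class;
};

static void fair_put_prev(struct task *p) { printf("fair: put_prev %s\n", p->comm); }
static void fair_set_curr(struct task *p) { printf("fair: set_curr %s\n", p->comm); }
static void rt_put_prev(struct task *p)   { printf("rt:   put_prev %s\n", p->comm); }
static void rt_set_curr(struct task *p)   { printf("rt:   set_curr %s\n", p->comm); }

static const struct sched_class fair_class = { "fair", fair_put_prev, fair_set_curr };
static const struct sched_class rt_class   = { "rt",   rt_put_prev,   rt_set_curr };

/*
 * Same shape as rt_mutex_setprio()/sched_setscheduler() above: dequeue,
 * let the old class put the running task away, switch class, let the new
 * class pick it up as current, then re-enqueue.
 */
static void change_class(struct task *p, const struct sched_class *new_class)
{
	int on_rq = p->on_rq, running = p->running;

	if (on_rq) {
		printf("dequeue %s from %s\n", p->comm, p->class->name);
		if (running)
			p->class->put_prev_task(p);
	}

	p->class = new_class;

	if (on_rq) {
		if (running)
			p->class->set_curr_task(p);
		printf("enqueue %s into %s\n", p->comm, p->class->name);
	}
}

int main(void)
{
	struct task t = { "boosted", 1, 1, &fair_class };

	change_class(&t, &rt_class);	/* e.g. a PI boost to an rt priority */
	return 0;
}
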
@@ -4137,9 +4174,9 @@ struct task_struct *idle_task(int cpu)
4137 * find_process_by_pid - find a process with a matching PID value. 4174 * find_process_by_pid - find a process with a matching PID value.
4138 * @pid: the pid in question. 4175 * @pid: the pid in question.
4139 */ 4176 */
4140static inline struct task_struct *find_process_by_pid(pid_t pid) 4177static struct task_struct *find_process_by_pid(pid_t pid)
4141{ 4178{
4142 return pid ? find_task_by_pid(pid) : current; 4179 return pid ? find_task_by_vpid(pid) : current;
4143} 4180}
4144 4181
4145/* Actually do priority change: must hold rq lock. */ 4182/* Actually do priority change: must hold rq lock. */
@@ -4179,7 +4216,7 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4179int sched_setscheduler(struct task_struct *p, int policy, 4216int sched_setscheduler(struct task_struct *p, int policy,
4180 struct sched_param *param) 4217 struct sched_param *param)
4181{ 4218{
4182 int retval, oldprio, oldpolicy = -1, on_rq; 4219 int retval, oldprio, oldpolicy = -1, on_rq, running;
4183 unsigned long flags; 4220 unsigned long flags;
4184 struct rq *rq; 4221 struct rq *rq;
4185 4222
@@ -4261,18 +4298,26 @@ recheck:
4261 } 4298 }
4262 update_rq_clock(rq); 4299 update_rq_clock(rq);
4263 on_rq = p->se.on_rq; 4300 on_rq = p->se.on_rq;
4264 if (on_rq) 4301 running = task_running(rq, p);
4302 if (on_rq) {
4265 deactivate_task(rq, p, 0); 4303 deactivate_task(rq, p, 0);
4304 if (running)
4305 p->sched_class->put_prev_task(rq, p);
4306 }
4307
4266 oldprio = p->prio; 4308 oldprio = p->prio;
4267 __setscheduler(rq, p, policy, param->sched_priority); 4309 __setscheduler(rq, p, policy, param->sched_priority);
4310
4268 if (on_rq) { 4311 if (on_rq) {
4312 if (running)
4313 p->sched_class->set_curr_task(rq);
4269 activate_task(rq, p, 0); 4314 activate_task(rq, p, 0);
4270 /* 4315 /*
4271 * Reschedule if we are currently running on this runqueue and 4316 * Reschedule if we are currently running on this runqueue and
4272 * our priority decreased, or if we are not currently running on 4317 * our priority decreased, or if we are not currently running on
4273 * this runqueue and our priority is higher than the current's 4318 * this runqueue and our priority is higher than the current's
4274 */ 4319 */
4275 if (task_running(rq, p)) { 4320 if (running) {
4276 if (p->prio > oldprio) 4321 if (p->prio > oldprio)
4277 resched_task(rq->curr); 4322 resched_task(rq->curr);
4278 } else { 4323 } else {
@@ -4343,10 +4388,10 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4343asmlinkage long sys_sched_getscheduler(pid_t pid) 4388asmlinkage long sys_sched_getscheduler(pid_t pid)
4344{ 4389{
4345 struct task_struct *p; 4390 struct task_struct *p;
4346 int retval = -EINVAL; 4391 int retval;
4347 4392
4348 if (pid < 0) 4393 if (pid < 0)
4349 goto out_nounlock; 4394 return -EINVAL;
4350 4395
4351 retval = -ESRCH; 4396 retval = -ESRCH;
4352 read_lock(&tasklist_lock); 4397 read_lock(&tasklist_lock);
@@ -4357,8 +4402,6 @@ asmlinkage long sys_sched_getscheduler(pid_t pid)
4357 retval = p->policy; 4402 retval = p->policy;
4358 } 4403 }
4359 read_unlock(&tasklist_lock); 4404 read_unlock(&tasklist_lock);
4360
4361out_nounlock:
4362 return retval; 4405 return retval;
4363} 4406}
4364 4407
@@ -4371,10 +4414,10 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4371{ 4414{
4372 struct sched_param lp; 4415 struct sched_param lp;
4373 struct task_struct *p; 4416 struct task_struct *p;
4374 int retval = -EINVAL; 4417 int retval;
4375 4418
4376 if (!param || pid < 0) 4419 if (!param || pid < 0)
4377 goto out_nounlock; 4420 return -EINVAL;
4378 4421
4379 read_lock(&tasklist_lock); 4422 read_lock(&tasklist_lock);
4380 p = find_process_by_pid(pid); 4423 p = find_process_by_pid(pid);
@@ -4394,7 +4437,6 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4394 */ 4437 */
4395 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4438 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4396 4439
4397out_nounlock:
4398 return retval; 4440 return retval;
4399 4441
4400out_unlock: 4442out_unlock:
@@ -4437,8 +4479,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4437 4479
4438 cpus_allowed = cpuset_cpus_allowed(p); 4480 cpus_allowed = cpuset_cpus_allowed(p);
4439 cpus_and(new_mask, new_mask, cpus_allowed); 4481 cpus_and(new_mask, new_mask, cpus_allowed);
4482 again:
4440 retval = set_cpus_allowed(p, new_mask); 4483 retval = set_cpus_allowed(p, new_mask);
4441 4484
4485 if (!retval) {
4486 cpus_allowed = cpuset_cpus_allowed(p);
4487 if (!cpus_subset(new_mask, cpus_allowed)) {
4488 /*
4489 * We must have raced with a concurrent cpuset
4490 * update. Just reset the cpus_allowed to the
4491 * cpuset's cpus_allowed
4492 */
4493 new_mask = cpus_allowed;
4494 goto again;
4495 }
4496 }
4442out_unlock: 4497out_unlock:
4443 put_task_struct(p); 4498 put_task_struct(p);
4444 mutex_unlock(&sched_hotcpu_mutex); 4499 mutex_unlock(&sched_hotcpu_mutex);
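
The again: loop closes a race with a concurrent cpuset update: after applying the requested mask, the cpuset's allowed mask is re-read and, if the request is no longer a subset of it, the task falls back to the cpuset's mask and the update is retried. A toy single-threaded model of the same retry, with the race simulated inside the setter and all names made up:

#include <stdio.h>

static unsigned long cpuset_allowed = 0x0f;	/* cpus 0-3 */
static unsigned long task_mask;

static int set_cpus_allowed(unsigned long mask)
{
	task_mask = mask;

	/* simulate a cpuset shrink racing with the affinity update */
	cpuset_allowed = 0x03;			/* now only cpus 0-1 */
	return 0;
}

/*
 * Same shape as the hunk above: apply the mask, re-check it against the
 * (possibly concurrently updated) allowed set, retry if it no longer fits.
 */
static void setaffinity_with_retry(unsigned long new_mask)
{
	unsigned long allowed = cpuset_allowed;

	new_mask &= allowed;
again:
	set_cpus_allowed(new_mask);

	allowed = cpuset_allowed;
	if (new_mask & ~allowed) {
		/* raced with a concurrent cpuset update: use its mask */
		new_mask = allowed;
		goto again;
	}
}

int main(void)
{
	setaffinity_with_retry(0x0c);		/* ask for cpus 2-3 */
	printf("final mask: 0x%lx\n", task_mask);
	return 0;
}
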
@@ -4554,8 +4609,8 @@ asmlinkage long sys_sched_yield(void)
4554{ 4609{
4555 struct rq *rq = this_rq_lock(); 4610 struct rq *rq = this_rq_lock();
4556 4611
4557 schedstat_inc(rq, yld_cnt); 4612 schedstat_inc(rq, yld_count);
4558 current->sched_class->yield_task(rq, current); 4613 current->sched_class->yield_task(rq);
4559 4614
4560 /* 4615 /*
4561 * Since we are going to call schedule() anyway, there's 4616 * Since we are going to call schedule() anyway, there's
@@ -4749,11 +4804,12 @@ asmlinkage
4749long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 4804long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4750{ 4805{
4751 struct task_struct *p; 4806 struct task_struct *p;
4752 int retval = -EINVAL; 4807 unsigned int time_slice;
4808 int retval;
4753 struct timespec t; 4809 struct timespec t;
4754 4810
4755 if (pid < 0) 4811 if (pid < 0)
4756 goto out_nounlock; 4812 return -EINVAL;
4757 4813
4758 retval = -ESRCH; 4814 retval = -ESRCH;
4759 read_lock(&tasklist_lock); 4815 read_lock(&tasklist_lock);
@@ -4765,12 +4821,24 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4765 if (retval) 4821 if (retval)
4766 goto out_unlock; 4822 goto out_unlock;
4767 4823
4768 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4824 if (p->policy == SCHED_FIFO)
4769 0 : static_prio_timeslice(p->static_prio), &t); 4825 time_slice = 0;
4826 else if (p->policy == SCHED_RR)
4827 time_slice = DEF_TIMESLICE;
4828 else {
4829 struct sched_entity *se = &p->se;
4830 unsigned long flags;
4831 struct rq *rq;
4832
4833 rq = task_rq_lock(p, &flags);
4834 time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
4835 task_rq_unlock(rq, &flags);
4836 }
4770 read_unlock(&tasklist_lock); 4837 read_unlock(&tasklist_lock);
4838 jiffies_to_timespec(time_slice, &t);
4771 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4839 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4772out_nounlock:
4773 return retval; 4840 return retval;
4841
4774out_unlock: 4842out_unlock:
4775 read_unlock(&tasklist_lock); 4843 read_unlock(&tasklist_lock);
4776 return retval; 4844 return retval;
@@ -4784,18 +4852,18 @@ static void show_task(struct task_struct *p)
4784 unsigned state; 4852 unsigned state;
4785 4853
4786 state = p->state ? __ffs(p->state) + 1 : 0; 4854 state = p->state ? __ffs(p->state) + 1 : 0;
4787 printk("%-13.13s %c", p->comm, 4855 printk(KERN_INFO "%-13.13s %c", p->comm,
4788 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4856 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4789#if BITS_PER_LONG == 32 4857#if BITS_PER_LONG == 32
4790 if (state == TASK_RUNNING) 4858 if (state == TASK_RUNNING)
4791 printk(" running "); 4859 printk(KERN_CONT " running ");
4792 else 4860 else
4793 printk(" %08lx ", thread_saved_pc(p)); 4861 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4794#else 4862#else
4795 if (state == TASK_RUNNING) 4863 if (state == TASK_RUNNING)
4796 printk(" running task "); 4864 printk(KERN_CONT " running task ");
4797 else 4865 else
4798 printk(" %016lx ", thread_saved_pc(p)); 4866 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4799#endif 4867#endif
4800#ifdef CONFIG_DEBUG_STACK_USAGE 4868#ifdef CONFIG_DEBUG_STACK_USAGE
4801 { 4869 {
@@ -4805,7 +4873,8 @@ static void show_task(struct task_struct *p)
4805 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4873 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4806 } 4874 }
4807#endif 4875#endif
4808 printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid); 4876 printk(KERN_CONT "%5lu %5d %6d\n", free,
4877 task_pid_nr(p), task_pid_nr(p->parent));
4809 4878
4810 if (state != TASK_RUNNING) 4879 if (state != TASK_RUNNING)
4811 show_stack(p, NULL); 4880 show_stack(p, NULL);
@@ -4899,32 +4968,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4899 */ 4968 */
4900cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4969cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4901 4970
4902/*
4903 * Increase the granularity value when there are more CPUs,
4904 * because with more CPUs the 'effective latency' as visible
4905 * to users decreases. But the relationship is not linear,
4906 * so pick a second-best guess by going with the log2 of the
4907 * number of CPUs.
4908 *
4909 * This idea comes from the SD scheduler of Con Kolivas:
4910 */
4911static inline void sched_init_granularity(void)
4912{
4913 unsigned int factor = 1 + ilog2(num_online_cpus());
4914 const unsigned long limit = 100000000;
4915
4916 sysctl_sched_min_granularity *= factor;
4917 if (sysctl_sched_min_granularity > limit)
4918 sysctl_sched_min_granularity = limit;
4919
4920 sysctl_sched_latency *= factor;
4921 if (sysctl_sched_latency > limit)
4922 sysctl_sched_latency = limit;
4923
4924 sysctl_sched_runtime_limit = sysctl_sched_latency;
4925 sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2;
4926}
4927
4928#ifdef CONFIG_SMP 4971#ifdef CONFIG_SMP
4929/* 4972/*
4930 * This is how migration works: 4973 * This is how migration works:
@@ -5091,6 +5134,17 @@ wait_to_die:
5091} 5134}
5092 5135
5093#ifdef CONFIG_HOTPLUG_CPU 5136#ifdef CONFIG_HOTPLUG_CPU
5137
5138static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5139{
5140 int ret;
5141
5142 local_irq_disable();
5143 ret = __migrate_task(p, src_cpu, dest_cpu);
5144 local_irq_enable();
5145 return ret;
5146}
5147
5094/* 5148/*
5095 * Figure out where task on dead CPU should go, use force if necessary. 5149 * Figure out where task on dead CPU should go, use force if necessary.
5096 * NOTE: interrupts should be disabled by the caller 5150 * NOTE: interrupts should be disabled by the caller
@@ -5102,35 +5156,42 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5102 struct rq *rq; 5156 struct rq *rq;
5103 int dest_cpu; 5157 int dest_cpu;
5104 5158
5105restart: 5159 do {
5106 /* On same node? */ 5160 /* On same node? */
5107 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 5161 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5108 cpus_and(mask, mask, p->cpus_allowed); 5162 cpus_and(mask, mask, p->cpus_allowed);
5109 dest_cpu = any_online_cpu(mask); 5163 dest_cpu = any_online_cpu(mask);
5110 5164
5111 /* On any allowed CPU? */ 5165 /* On any allowed CPU? */
5112 if (dest_cpu == NR_CPUS) 5166 if (dest_cpu == NR_CPUS)
5113 dest_cpu = any_online_cpu(p->cpus_allowed); 5167 dest_cpu = any_online_cpu(p->cpus_allowed);
5114 5168
5115 /* No more Mr. Nice Guy. */ 5169 /* No more Mr. Nice Guy. */
5116 if (dest_cpu == NR_CPUS) { 5170 if (dest_cpu == NR_CPUS) {
5117 rq = task_rq_lock(p, &flags); 5171 cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p);
5118 cpus_setall(p->cpus_allowed); 5172 /*
5119 dest_cpu = any_online_cpu(p->cpus_allowed); 5173 * Try to stay on the same cpuset, where the
5120 task_rq_unlock(rq, &flags); 5174 * current cpuset may be a subset of all cpus.
5175 * The cpuset_cpus_allowed_locked() variant of
5176 * cpuset_cpus_allowed() will not block. It must be
5177 * called within calls to cpuset_lock/cpuset_unlock.
5178 */
5179 rq = task_rq_lock(p, &flags);
5180 p->cpus_allowed = cpus_allowed;
5181 dest_cpu = any_online_cpu(p->cpus_allowed);
5182 task_rq_unlock(rq, &flags);
5121 5183
5122 /* 5184 /*
5123 * Don't tell them about moving exiting tasks or 5185 * Don't tell them about moving exiting tasks or
5124 * kernel threads (both mm NULL), since they never 5186 * kernel threads (both mm NULL), since they never
5125 * leave kernel. 5187 * leave kernel.
5126 */ 5188 */
5127 if (p->mm && printk_ratelimit()) 5189 if (p->mm && printk_ratelimit())
5128 printk(KERN_INFO "process %d (%s) no " 5190 printk(KERN_INFO "process %d (%s) no "
5129 "longer affine to cpu%d\n", 5191 "longer affine to cpu%d\n",
5130 p->pid, p->comm, dead_cpu); 5192 task_pid_nr(p), p->comm, dead_cpu);
5131 } 5193 }
5132 if (!__migrate_task(p, dead_cpu, dest_cpu)) 5194 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
5133 goto restart;
5134} 5195}
5135 5196
5136/* 5197/*
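
move_task_off_dead_cpu() now loops instead of using goto restart, but the fallback order is unchanged: prefer a CPU on the dead CPU's node, then any CPU in the task's own mask, and only as a last resort widen to the cpuset's CPUs (rather than cpus_setall(), as before). A toy bitmask model of that cascade; the helper names are ours:

#include <stdio.h>

#define NO_CPU -1

static int first_cpu(unsigned long mask)
{
	int cpu;

	for (cpu = 0; cpu < (int)(8 * sizeof(mask)); cpu++)
		if (mask & (1UL << cpu))
			return cpu;
	return NO_CPU;
}

/*
 * Same fallback order as the hunk above: same node, then any allowed CPU,
 * then the cpuset's CPUs, always intersected with the online mask.
 */
static int pick_dest_cpu(unsigned long node_mask, unsigned long allowed,
			 unsigned long cpuset_mask, unsigned long online)
{
	int dest;

	/* On the same node? */
	dest = first_cpu(node_mask & allowed & online);
	if (dest != NO_CPU)
		return dest;

	/* On any allowed CPU? */
	dest = first_cpu(allowed & online);
	if (dest != NO_CPU)
		return dest;

	/* No more Mr. Nice Guy: widen to the cpuset's CPUs. */
	return first_cpu(cpuset_mask & online);
}

int main(void)
{
	/* cpu1 just died; the task was allowed only on cpu1, cpuset spans 0-3 */
	int dest = pick_dest_cpu(0x2 /* node */, 0x2 /* allowed */,
				 0xf /* cpuset */, 0xd /* online, cpu1 gone */);

	printf("fallback destination: cpu%d\n", dest);
	return 0;
}
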
@@ -5158,7 +5219,7 @@ static void migrate_live_tasks(int src_cpu)
5158{ 5219{
5159 struct task_struct *p, *t; 5220 struct task_struct *p, *t;
5160 5221
5161 write_lock_irq(&tasklist_lock); 5222 read_lock(&tasklist_lock);
5162 5223
5163 do_each_thread(t, p) { 5224 do_each_thread(t, p) {
5164 if (p == current) 5225 if (p == current)
@@ -5168,7 +5229,21 @@ static void migrate_live_tasks(int src_cpu)
5168 move_task_off_dead_cpu(src_cpu, p); 5229 move_task_off_dead_cpu(src_cpu, p);
5169 } while_each_thread(t, p); 5230 } while_each_thread(t, p);
5170 5231
5171 write_unlock_irq(&tasklist_lock); 5232 read_unlock(&tasklist_lock);
5233}
5234
5235/*
5236 * activate_idle_task - move idle task to the _front_ of runqueue.
5237 */
5238static void activate_idle_task(struct task_struct *p, struct rq *rq)
5239{
5240 update_rq_clock(rq);
5241
5242 if (p->state == TASK_UNINTERRUPTIBLE)
5243 rq->nr_uninterruptible--;
5244
5245 enqueue_task(rq, p, 0);
5246 inc_nr_running(p, rq);
5172} 5247}
5173 5248
5174/* 5249/*
@@ -5221,7 +5296,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5221 struct rq *rq = cpu_rq(dead_cpu); 5296 struct rq *rq = cpu_rq(dead_cpu);
5222 5297
5223 /* Must be exiting, otherwise would be on tasklist. */ 5298 /* Must be exiting, otherwise would be on tasklist. */
5224 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); 5299 BUG_ON(!p->exit_state);
5225 5300
5226 /* Cannot have done final schedule yet: would have vanished. */ 5301 /* Cannot have done final schedule yet: would have vanished. */
5227 BUG_ON(p->state == TASK_DEAD); 5302 BUG_ON(p->state == TASK_DEAD);
@@ -5232,11 +5307,10 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5232 * Drop lock around migration; if someone else moves it, 5307 * Drop lock around migration; if someone else moves it,
5233 * that's OK. No task can be added to this CPU, so iteration is 5308 * that's OK. No task can be added to this CPU, so iteration is
5234 * fine. 5309 * fine.
5235 * NOTE: interrupts should be left disabled --dev@
5236 */ 5310 */
5237 spin_unlock(&rq->lock); 5311 spin_unlock_irq(&rq->lock);
5238 move_task_off_dead_cpu(dead_cpu, p); 5312 move_task_off_dead_cpu(dead_cpu, p);
5239 spin_lock(&rq->lock); 5313 spin_lock_irq(&rq->lock);
5240 5314
5241 put_task_struct(p); 5315 put_task_struct(p);
5242} 5316}
@@ -5283,14 +5357,32 @@ static struct ctl_table sd_ctl_root[] = {
5283static struct ctl_table *sd_alloc_ctl_entry(int n) 5357static struct ctl_table *sd_alloc_ctl_entry(int n)
5284{ 5358{
5285 struct ctl_table *entry = 5359 struct ctl_table *entry =
5286 kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); 5360 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5287
5288 BUG_ON(!entry);
5289 memset(entry, 0, n * sizeof(struct ctl_table));
5290 5361
5291 return entry; 5362 return entry;
5292} 5363}
5293 5364
5365static void sd_free_ctl_entry(struct ctl_table **tablep)
5366{
5367 struct ctl_table *entry;
5368
5369 /*
5370 * In the intermediate directories, both the child directory and
5371 * procname are dynamically allocated and could fail but the mode
5372 * will always be set. In the lowest directory the names are
5373 * static strings and all have proc handlers.
5374 */
5375 for (entry = *tablep; entry->mode; entry++) {
5376 if (entry->child)
5377 sd_free_ctl_entry(&entry->child);
5378 if (entry->proc_handler == NULL)
5379 kfree(entry->procname);
5380 }
5381
5382 kfree(*tablep);
5383 *tablep = NULL;
5384}
5385
5294static void 5386static void
5295set_table_entry(struct ctl_table *entry, 5387set_table_entry(struct ctl_table *entry,
5296 const char *procname, void *data, int maxlen, 5388 const char *procname, void *data, int maxlen,
@@ -5306,7 +5398,10 @@ set_table_entry(struct ctl_table *entry,
5306static struct ctl_table * 5398static struct ctl_table *
5307sd_alloc_ctl_domain_table(struct sched_domain *sd) 5399sd_alloc_ctl_domain_table(struct sched_domain *sd)
5308{ 5400{
5309 struct ctl_table *table = sd_alloc_ctl_entry(14); 5401 struct ctl_table *table = sd_alloc_ctl_entry(12);
5402
5403 if (table == NULL)
5404 return NULL;
5310 5405
5311 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5406 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5312 sizeof(long), 0644, proc_doulongvec_minmax); 5407 sizeof(long), 0644, proc_doulongvec_minmax);
@@ -5326,16 +5421,17 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5326 sizeof(int), 0644, proc_dointvec_minmax); 5421 sizeof(int), 0644, proc_dointvec_minmax);
5327 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5422 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5328 sizeof(int), 0644, proc_dointvec_minmax); 5423 sizeof(int), 0644, proc_dointvec_minmax);
5329 set_table_entry(&table[10], "cache_nice_tries", 5424 set_table_entry(&table[9], "cache_nice_tries",
5330 &sd->cache_nice_tries, 5425 &sd->cache_nice_tries,
5331 sizeof(int), 0644, proc_dointvec_minmax); 5426 sizeof(int), 0644, proc_dointvec_minmax);
5332 set_table_entry(&table[12], "flags", &sd->flags, 5427 set_table_entry(&table[10], "flags", &sd->flags,
5333 sizeof(int), 0644, proc_dointvec_minmax); 5428 sizeof(int), 0644, proc_dointvec_minmax);
5429 /* &table[11] is terminator */
5334 5430
5335 return table; 5431 return table;
5336} 5432}
5337 5433
5338static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5434static ctl_table * sd_alloc_ctl_cpu_table(int cpu)
5339{ 5435{
5340 struct ctl_table *entry, *table; 5436 struct ctl_table *entry, *table;
5341 struct sched_domain *sd; 5437 struct sched_domain *sd;
@@ -5345,6 +5441,8 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5345 for_each_domain(cpu, sd) 5441 for_each_domain(cpu, sd)
5346 domain_num++; 5442 domain_num++;
5347 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5443 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5444 if (table == NULL)
5445 return NULL;
5348 5446
5349 i = 0; 5447 i = 0;
5350 for_each_domain(cpu, sd) { 5448 for_each_domain(cpu, sd) {
@@ -5359,24 +5457,38 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5359} 5457}
5360 5458
5361static struct ctl_table_header *sd_sysctl_header; 5459static struct ctl_table_header *sd_sysctl_header;
5362static void init_sched_domain_sysctl(void) 5460static void register_sched_domain_sysctl(void)
5363{ 5461{
5364 int i, cpu_num = num_online_cpus(); 5462 int i, cpu_num = num_online_cpus();
5365 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5463 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5366 char buf[32]; 5464 char buf[32];
5367 5465
5466 if (entry == NULL)
5467 return;
5468
5368 sd_ctl_dir[0].child = entry; 5469 sd_ctl_dir[0].child = entry;
5369 5470
5370 for (i = 0; i < cpu_num; i++, entry++) { 5471 for_each_online_cpu(i) {
5371 snprintf(buf, 32, "cpu%d", i); 5472 snprintf(buf, 32, "cpu%d", i);
5372 entry->procname = kstrdup(buf, GFP_KERNEL); 5473 entry->procname = kstrdup(buf, GFP_KERNEL);
5373 entry->mode = 0555; 5474 entry->mode = 0555;
5374 entry->child = sd_alloc_ctl_cpu_table(i); 5475 entry->child = sd_alloc_ctl_cpu_table(i);
5476 entry++;
5375 } 5477 }
5376 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5478 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5377} 5479}
5480
5481static void unregister_sched_domain_sysctl(void)
5482{
5483 unregister_sysctl_table(sd_sysctl_header);
5484 sd_sysctl_header = NULL;
5485 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5486}
5378#else 5487#else
5379static void init_sched_domain_sysctl(void) 5488static void register_sched_domain_sysctl(void)
5489{
5490}
5491static void unregister_sched_domain_sysctl(void)
5380{ 5492{
5381} 5493}
5382#endif 5494#endif
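
sd_alloc_ctl_entry() now uses kcalloc() and every caller tolerates a NULL return, and unregister_sched_domain_sysctl() tears the dynamically built tree back down with sd_free_ctl_entry(), relying on the conventions spelled out in its comment (mode always set; dynamically allocated procname only where there is no proc handler). A user-space sketch of the same build-then-recursively-free pattern, with a simplified struct rather than the real ctl_table:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Simplified stand-in for struct ctl_table: a zeroed entry (mode == 0)
 * terminates each table, directories have a child and a strdup'd name,
 * and leaves have a static name plus a handler.
 */
struct entry {
	char *name;
	int mode;
	struct entry *child;
	void *handler;		/* non-NULL for leaves with static names */
};

static struct entry *alloc_entries(int n)
{
	return calloc(n, sizeof(struct entry));	/* zeroed, like kcalloc */
}

static void free_entries(struct entry **tablep)
{
	struct entry *entry;

	for (entry = *tablep; entry->mode; entry++) {
		if (entry->child)
			free_entries(&entry->child);
		if (entry->handler == NULL)
			free(entry->name);	/* dynamically built directory name */
	}
	free(*tablep);
	*tablep = NULL;
}

int main(void)
{
	static int dummy_handler;
	struct entry *root = alloc_entries(2);		/* one dir + terminator */
	struct entry *leaves = alloc_entries(2);	/* one leaf + terminator */

	if (!root || !leaves)
		return 1;

	leaves[0].name = "min_interval";	/* static string, has a handler */
	leaves[0].mode = 0644;
	leaves[0].handler = &dummy_handler;

	root[0].name = strdup("cpu0");		/* dynamic, no handler */
	root[0].mode = 0555;
	root[0].child = leaves;

	free_entries(&root);
	printf("tree freed, root=%p\n", (void *)root);
	return 0;
}
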
@@ -5431,19 +5543,21 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5431 5543
5432 case CPU_DEAD: 5544 case CPU_DEAD:
5433 case CPU_DEAD_FROZEN: 5545 case CPU_DEAD_FROZEN:
5546 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5434 migrate_live_tasks(cpu); 5547 migrate_live_tasks(cpu);
5435 rq = cpu_rq(cpu); 5548 rq = cpu_rq(cpu);
5436 kthread_stop(rq->migration_thread); 5549 kthread_stop(rq->migration_thread);
5437 rq->migration_thread = NULL; 5550 rq->migration_thread = NULL;
5438 /* Idle task back to normal (off runqueue, low prio) */ 5551 /* Idle task back to normal (off runqueue, low prio) */
5439 rq = task_rq_lock(rq->idle, &flags); 5552 spin_lock_irq(&rq->lock);
5440 update_rq_clock(rq); 5553 update_rq_clock(rq);
5441 deactivate_task(rq, rq->idle, 0); 5554 deactivate_task(rq, rq->idle, 0);
5442 rq->idle->static_prio = MAX_PRIO; 5555 rq->idle->static_prio = MAX_PRIO;
5443 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5556 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5444 rq->idle->sched_class = &idle_sched_class; 5557 rq->idle->sched_class = &idle_sched_class;
5445 migrate_dead_tasks(cpu); 5558 migrate_dead_tasks(cpu);
5446 task_rq_unlock(rq, &flags); 5559 spin_unlock_irq(&rq->lock);
5560 cpuset_unlock();
5447 migrate_nr_uninterruptible(rq); 5561 migrate_nr_uninterruptible(rq);
5448 BUG_ON(rq->nr_running != 0); 5562 BUG_ON(rq->nr_running != 0);
5449 5563
@@ -5498,8 +5612,7 @@ int __init migration_init(void)
5498int nr_cpu_ids __read_mostly = NR_CPUS; 5612int nr_cpu_ids __read_mostly = NR_CPUS;
5499EXPORT_SYMBOL(nr_cpu_ids); 5613EXPORT_SYMBOL(nr_cpu_ids);
5500 5614
5501#undef SCHED_DOMAIN_DEBUG 5615#ifdef CONFIG_SCHED_DEBUG
5502#ifdef SCHED_DOMAIN_DEBUG
5503static void sched_domain_debug(struct sched_domain *sd, int cpu) 5616static void sched_domain_debug(struct sched_domain *sd, int cpu)
5504{ 5617{
5505 int level = 0; 5618 int level = 0;
@@ -5554,29 +5667,32 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5554 } 5667 }
5555 5668
5556 if (!group->__cpu_power) { 5669 if (!group->__cpu_power) {
5557 printk("\n"); 5670 printk(KERN_CONT "\n");
5558 printk(KERN_ERR "ERROR: domain->cpu_power not " 5671 printk(KERN_ERR "ERROR: domain->cpu_power not "
5559 "set\n"); 5672 "set\n");
5673 break;
5560 } 5674 }
5561 5675
5562 if (!cpus_weight(group->cpumask)) { 5676 if (!cpus_weight(group->cpumask)) {
5563 printk("\n"); 5677 printk(KERN_CONT "\n");
5564 printk(KERN_ERR "ERROR: empty group\n"); 5678 printk(KERN_ERR "ERROR: empty group\n");
5679 break;
5565 } 5680 }
5566 5681
5567 if (cpus_intersects(groupmask, group->cpumask)) { 5682 if (cpus_intersects(groupmask, group->cpumask)) {
5568 printk("\n"); 5683 printk(KERN_CONT "\n");
5569 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5684 printk(KERN_ERR "ERROR: repeated CPUs\n");
5685 break;
5570 } 5686 }
5571 5687
5572 cpus_or(groupmask, groupmask, group->cpumask); 5688 cpus_or(groupmask, groupmask, group->cpumask);
5573 5689
5574 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 5690 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5575 printk(" %s", str); 5691 printk(KERN_CONT " %s", str);
5576 5692
5577 group = group->next; 5693 group = group->next;
5578 } while (group != sd->groups); 5694 } while (group != sd->groups);
5579 printk("\n"); 5695 printk(KERN_CONT "\n");
5580 5696
5581 if (!cpus_equal(sd->span, groupmask)) 5697 if (!cpus_equal(sd->span, groupmask))
5582 printk(KERN_ERR "ERROR: groups don't span " 5698 printk(KERN_ERR "ERROR: groups don't span "
@@ -5700,7 +5816,7 @@ static int __init isolated_cpu_setup(char *str)
5700 return 1; 5816 return 1;
5701} 5817}
5702 5818
5703__setup ("isolcpus=", isolated_cpu_setup); 5819__setup("isolcpus=", isolated_cpu_setup);
5704 5820
5705/* 5821/*
5706 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 5822 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
@@ -5856,7 +5972,7 @@ static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5856 struct sched_group **sg) 5972 struct sched_group **sg)
5857{ 5973{
5858 int group; 5974 int group;
5859 cpumask_t mask = cpu_sibling_map[cpu]; 5975 cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
5860 cpus_and(mask, mask, *cpu_map); 5976 cpus_and(mask, mask, *cpu_map);
5861 group = first_cpu(mask); 5977 group = first_cpu(mask);
5862 if (sg) 5978 if (sg)
@@ -5885,7 +6001,7 @@ static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5885 cpus_and(mask, mask, *cpu_map); 6001 cpus_and(mask, mask, *cpu_map);
5886 group = first_cpu(mask); 6002 group = first_cpu(mask);
5887#elif defined(CONFIG_SCHED_SMT) 6003#elif defined(CONFIG_SCHED_SMT)
5888 cpumask_t mask = cpu_sibling_map[cpu]; 6004 cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
5889 cpus_and(mask, mask, *cpu_map); 6005 cpus_and(mask, mask, *cpu_map);
5890 group = first_cpu(mask); 6006 group = first_cpu(mask);
5891#else 6007#else
@@ -5929,24 +6045,23 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
5929 6045
5930 if (!sg) 6046 if (!sg)
5931 return; 6047 return;
5932next_sg: 6048 do {
5933 for_each_cpu_mask(j, sg->cpumask) { 6049 for_each_cpu_mask(j, sg->cpumask) {
5934 struct sched_domain *sd; 6050 struct sched_domain *sd;
5935 6051
5936 sd = &per_cpu(phys_domains, j); 6052 sd = &per_cpu(phys_domains, j);
5937 if (j != first_cpu(sd->groups->cpumask)) { 6053 if (j != first_cpu(sd->groups->cpumask)) {
5938 /* 6054 /*
5939 * Only add "power" once for each 6055 * Only add "power" once for each
5940 * physical package. 6056 * physical package.
5941 */ 6057 */
5942 continue; 6058 continue;
5943 } 6059 }
5944 6060
5945 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 6061 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5946 } 6062 }
5947 sg = sg->next; 6063 sg = sg->next;
5948 if (sg != group_head) 6064 } while (sg != group_head);
5949 goto next_sg;
5950} 6065}
5951#endif 6066#endif
5952 6067
@@ -6057,7 +6172,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6057 /* 6172 /*
6058 * Allocate the per-node list of sched groups 6173 * Allocate the per-node list of sched groups
6059 */ 6174 */
6060 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, 6175 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
6061 GFP_KERNEL); 6176 GFP_KERNEL);
6062 if (!sched_group_nodes) { 6177 if (!sched_group_nodes) {
6063 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6178 printk(KERN_WARNING "Can not alloc sched group node list\n");
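kzalloc(sizeof(struct sched_group *) * MAX_NUMNODES, ...) becomes kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), ...), which states the intent (an array of MAX_NUMNODES pointers) and lets the allocator check the count * size multiplication for overflow. The same distinction exists in userspace between malloc() and calloc(); a small illustrative program (not from the patch):

/* calloc_demo.c - sketch: array allocation with an overflow-checked count*size */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t count = 16;		/* plays the role of MAX_NUMNODES */
	void **nodes;

	/* calloc(count, size) verifies that count * size does not overflow
	 * and returns zeroed memory, much like kcalloc() in the kernel. */
	nodes = calloc(count, sizeof(*nodes));
	if (!nodes) {
		fprintf(stderr, "cannot allocate node list\n");
		return 1;
	}

	printf("allocated %zu zeroed pointer slots\n", count);
	free(nodes);
	return 0;
}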
@@ -6120,7 +6235,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6120 p = sd; 6235 p = sd;
6121 sd = &per_cpu(cpu_domains, i); 6236 sd = &per_cpu(cpu_domains, i);
6122 *sd = SD_SIBLING_INIT; 6237 *sd = SD_SIBLING_INIT;
6123 sd->span = cpu_sibling_map[i]; 6238 sd->span = per_cpu(cpu_sibling_map, i);
6124 cpus_and(sd->span, sd->span, *cpu_map); 6239 cpus_and(sd->span, sd->span, *cpu_map);
6125 sd->parent = p; 6240 sd->parent = p;
6126 p->child = sd; 6241 p->child = sd;
@@ -6131,7 +6246,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6131#ifdef CONFIG_SCHED_SMT 6246#ifdef CONFIG_SCHED_SMT
6132 /* Set up CPU (sibling) groups */ 6247 /* Set up CPU (sibling) groups */
6133 for_each_cpu_mask(i, *cpu_map) { 6248 for_each_cpu_mask(i, *cpu_map) {
6134 cpumask_t this_sibling_map = cpu_sibling_map[i]; 6249 cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i);
6135 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 6250 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
6136 if (i != first_cpu(this_sibling_map)) 6251 if (i != first_cpu(this_sibling_map))
6137 continue; 6252 continue;
@@ -6293,24 +6408,31 @@ error:
6293 return -ENOMEM; 6408 return -ENOMEM;
6294#endif 6409#endif
6295} 6410}
6411
6412static cpumask_t *doms_cur; /* current sched domains */
6413static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6414
6415/*
6416 * Special case: If a kmalloc of a doms_cur partition (array of
6417 * cpumask_t) fails, then fallback to a single sched domain,
6418 * as determined by the single cpumask_t fallback_doms.
6419 */
6420static cpumask_t fallback_doms;
6421
6296/* 6422/*
6297 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6423 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6424 * For now this just excludes isolated cpus, but could be used to
6425 * exclude other special cases in the future.
6298 */ 6426 */
6299static int arch_init_sched_domains(const cpumask_t *cpu_map) 6427static int arch_init_sched_domains(const cpumask_t *cpu_map)
6300{ 6428{
6301 cpumask_t cpu_default_map; 6429 ndoms_cur = 1;
6302 int err; 6430 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6303 6431 if (!doms_cur)
6304 /* 6432 doms_cur = &fallback_doms;
6305 * Setup mask for cpus without special case scheduling requirements. 6433 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
6306 * For now this just excludes isolated cpus, but could be used to 6434 register_sched_domain_sysctl();
6307 * exclude other special cases in the future. 6435 return build_sched_domains(doms_cur);
6308 */
6309 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6310
6311 err = build_sched_domains(&cpu_default_map);
6312
6313 return err;
6314} 6436}
6315 6437
6316static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6438static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
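arch_init_sched_domains() now records the current partitioning in doms_cur: it kmallocs a one-element cpumask array and, if that allocation fails, falls back to the statically allocated fallback_doms instead of failing domain setup altogether. The allocate-or-fall-back-to-static pattern in a stand-alone userspace sketch (names and mask type invented for illustration):

/* fallback_alloc_demo.c - sketch: dynamic buffer with a static fallback */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned long mask_t;		/* stands in for cpumask_t */

static mask_t fallback_doms;		/* used only if the allocation fails */

int main(void)
{
	int ndoms_cur = 1;
	mask_t *doms_cur;

	doms_cur = malloc(sizeof(*doms_cur));
	if (!doms_cur)
		doms_cur = &fallback_doms;	/* degrade to one static domain */

	*doms_cur = 0xful;			/* pretend CPUs 0-3 form the domain */
	printf("%d domain(s), first mask %#lx\n", ndoms_cur, *doms_cur);

	if (doms_cur != &fallback_doms)
		free(doms_cur);
	return 0;
}

The kernel keeps the same rule on the release side: partition_sched_domains() only kfrees doms_cur when it does not point at fallback_doms.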
@@ -6326,6 +6448,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6326{ 6448{
6327 int i; 6449 int i;
6328 6450
6451 unregister_sched_domain_sysctl();
6452
6329 for_each_cpu_mask(i, *cpu_map) 6453 for_each_cpu_mask(i, *cpu_map)
6330 cpu_attach_domain(NULL, i); 6454 cpu_attach_domain(NULL, i);
6331 synchronize_sched(); 6455 synchronize_sched();
@@ -6333,30 +6457,65 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6333} 6457}
6334 6458
6335/* 6459/*
6336 * Partition sched domains as specified by the cpumasks below. 6460 * Partition sched domains as specified by the 'ndoms_new'
6337 * This attaches all cpus from the cpumasks to the NULL domain, 6461 * cpumasks in the array doms_new[] of cpumasks. This compares
6338 * waits for a RCU quiescent period, recalculates sched 6462 * doms_new[] to the current sched domain partitioning, doms_cur[].
6339 * domain information and then attaches them back to the 6463 * It destroys each deleted domain and builds each new domain.
6340 * correct sched domains 6464 *
6465 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
6466 * The masks don't intersect (don't overlap.) We should setup one
6467 * sched domain for each mask. CPUs not in any of the cpumasks will
6468 * not be load balanced. If the same cpumask appears both in the
6469 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6470 * it as it is.
6471 *
6472 * The passed in 'doms_new' should be kmalloc'd. This routine takes
6473 * ownership of it and will kfree it when done with it. If the caller
6474 * failed the kmalloc call, then it can pass in doms_new == NULL,
6475 * and partition_sched_domains() will fallback to the single partition
6476 * 'fallback_doms'.
6477 *
6341 * Call with hotplug lock held 6478 * Call with hotplug lock held
6342 */ 6479 */
6343int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 6480void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6344{ 6481{
6345 cpumask_t change_map; 6482 int i, j;
6346 int err = 0;
6347 6483
6348 cpus_and(*partition1, *partition1, cpu_online_map); 6484 if (doms_new == NULL) {
6349 cpus_and(*partition2, *partition2, cpu_online_map); 6485 ndoms_new = 1;
6350 cpus_or(change_map, *partition1, *partition2); 6486 doms_new = &fallback_doms;
6487 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
6488 }
6351 6489
6352 /* Detach sched domains from all of the affected cpus */ 6490 /* Destroy deleted domains */
6353 detach_destroy_domains(&change_map); 6491 for (i = 0; i < ndoms_cur; i++) {
6354 if (!cpus_empty(*partition1)) 6492 for (j = 0; j < ndoms_new; j++) {
6355 err = build_sched_domains(partition1); 6493 if (cpus_equal(doms_cur[i], doms_new[j]))
6356 if (!err && !cpus_empty(*partition2)) 6494 goto match1;
6357 err = build_sched_domains(partition2); 6495 }
6496 /* no match - a current sched domain not in new doms_new[] */
6497 detach_destroy_domains(doms_cur + i);
6498match1:
6499 ;
6500 }
6358 6501
6359 return err; 6502 /* Build new domains */
6503 for (i = 0; i < ndoms_new; i++) {
6504 for (j = 0; j < ndoms_cur; j++) {
6505 if (cpus_equal(doms_new[i], doms_cur[j]))
6506 goto match2;
6507 }
6508 /* no match - add a new doms_new */
6509 build_sched_domains(doms_new + i);
6510match2:
6511 ;
6512 }
6513
6514 /* Remember the new sched domains */
6515 if (doms_cur != &fallback_doms)
6516 kfree(doms_cur);
6517 doms_cur = doms_new;
6518 ndoms_cur = ndoms_new;
6360} 6519}
6361 6520
6362#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6521#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
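The reworked partition_sched_domains() no longer takes exactly two partitions. It reconciles the previous doms_cur[] array against the caller-supplied doms_new[]: an old mask with no equal mask in the new set is detached and destroyed, a new mask with no equal mask in the old set is built, and masks present in both are left untouched. A stand-alone sketch of that two-pass comparison, with plain integers standing in for cpumask_t and printf standing in for detach_destroy_domains()/build_sched_domains() (illustrative only):

/* domain_reconcile_demo.c - sketch of the doms_cur vs. doms_new comparison */
#include <stdio.h>

static void detach_destroy(unsigned long mask) { printf("destroy %#lx\n", mask); }
static void build(unsigned long mask)          { printf("build   %#lx\n", mask); }

int main(void)
{
	unsigned long doms_cur[] = { 0x3, 0xc };	/* current partitioning   */
	unsigned long doms_new[] = { 0x3, 0x30 };	/* requested partitioning */
	int ndoms_cur = 2, ndoms_new = 2, i, j;

	/* Pass 1: destroy current domains that are not in doms_new[] */
	for (i = 0; i < ndoms_cur; i++) {
		for (j = 0; j < ndoms_new; j++)
			if (doms_cur[i] == doms_new[j])
				goto match1;
		detach_destroy(doms_cur[i]);
match1:
		;
	}

	/* Pass 2: build new domains that are not in doms_cur[] */
	for (i = 0; i < ndoms_new; i++) {
		for (j = 0; j < ndoms_cur; j++)
			if (doms_new[i] == doms_cur[j])
				goto match2;
		build(doms_new[i]);
match2:
		;
	}
	return 0;
}

Here 0x3 is common to both sets and is left alone, 0xc is torn down and 0x30 is built, which is how a cpuset reconfiguration can reshape the domain layout without rebuilding partitions that did not change.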
@@ -6487,17 +6646,13 @@ void __init sched_init_smp(void)
6487 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6646 /* XXX: Theoretical race here - CPU may be hotplugged now */
6488 hotcpu_notifier(update_sched_domains, 0); 6647 hotcpu_notifier(update_sched_domains, 0);
6489 6648
6490 init_sched_domain_sysctl();
6491
6492 /* Move init over to a non-isolated CPU */ 6649 /* Move init over to a non-isolated CPU */
6493 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6650 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6494 BUG(); 6651 BUG();
6495 sched_init_granularity();
6496} 6652}
6497#else 6653#else
6498void __init sched_init_smp(void) 6654void __init sched_init_smp(void)
6499{ 6655{
6500 sched_init_granularity();
6501} 6656}
6502#endif /* CONFIG_SMP */ 6657#endif /* CONFIG_SMP */
6503 6658
@@ -6511,28 +6666,20 @@ int in_sched_functions(unsigned long addr)
6511 && addr < (unsigned long)__sched_text_end); 6666 && addr < (unsigned long)__sched_text_end);
6512} 6667}
6513 6668
6514static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 6669static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6515{ 6670{
6516 cfs_rq->tasks_timeline = RB_ROOT; 6671 cfs_rq->tasks_timeline = RB_ROOT;
6517 cfs_rq->fair_clock = 1;
6518#ifdef CONFIG_FAIR_GROUP_SCHED 6672#ifdef CONFIG_FAIR_GROUP_SCHED
6519 cfs_rq->rq = rq; 6673 cfs_rq->rq = rq;
6520#endif 6674#endif
6675 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
6521} 6676}
6522 6677
6523void __init sched_init(void) 6678void __init sched_init(void)
6524{ 6679{
6525 u64 now = sched_clock();
6526 int highest_cpu = 0; 6680 int highest_cpu = 0;
6527 int i, j; 6681 int i, j;
6528 6682
6529 /*
6530 * Link up the scheduling class hierarchy:
6531 */
6532 rt_sched_class.next = &fair_sched_class;
6533 fair_sched_class.next = &idle_sched_class;
6534 idle_sched_class.next = NULL;
6535
6536 for_each_possible_cpu(i) { 6683 for_each_possible_cpu(i) {
6537 struct rt_prio_array *array; 6684 struct rt_prio_array *array;
6538 struct rq *rq; 6685 struct rq *rq;
@@ -6545,10 +6692,28 @@ void __init sched_init(void)
6545 init_cfs_rq(&rq->cfs, rq); 6692 init_cfs_rq(&rq->cfs, rq);
6546#ifdef CONFIG_FAIR_GROUP_SCHED 6693#ifdef CONFIG_FAIR_GROUP_SCHED
6547 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6694 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6548 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 6695 {
6696 struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
6697 struct sched_entity *se =
6698 &per_cpu(init_sched_entity, i);
6699
6700 init_cfs_rq_p[i] = cfs_rq;
6701 init_cfs_rq(cfs_rq, rq);
6702 cfs_rq->tg = &init_task_group;
6703 list_add(&cfs_rq->leaf_cfs_rq_list,
6704 &rq->leaf_cfs_rq_list);
6705
6706 init_sched_entity_p[i] = se;
6707 se->cfs_rq = &rq->cfs;
6708 se->my_q = cfs_rq;
6709 se->load.weight = init_task_group_load;
6710 se->load.inv_weight =
6711 div64_64(1ULL<<32, init_task_group_load);
6712 se->parent = NULL;
6713 }
6714 init_task_group.shares = init_task_group_load;
6715 spin_lock_init(&init_task_group.lock);
6549#endif 6716#endif
6550 rq->ls.load_update_last = now;
6551 rq->ls.load_update_start = now;
6552 6717
6553 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6718 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6554 rq->cpu_load[j] = 0; 6719 rq->cpu_load[j] = 0;
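Besides wiring each CPU's init_cfs_rq/init_sched_entity into the default task group, sched_init() converts the group load into the scheduler's fixed-point form: load.weight keeps the raw weight while load.inv_weight caches 2^32/weight (the div64_64() call), so a later division by the weight can be approximated by a multiply and a 32-bit shift. A small userspace sketch of that reciprocal trick (the weight value is illustrative, not taken from the kernel's tables, and the real code also guards the intermediate product against 64-bit overflow):

/* inv_weight_demo.c - sketch: divide-by-weight via a 2^32/weight reciprocal */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t weight = 820;				/* some task/group weight  */
	uint64_t inv_weight = (1ULL << 32) / weight;	/* what div64_64() computes */
	uint64_t x = 3072000000ULL;			/* a runtime product to scale */

	uint64_t exact  = x / weight;			/* true division      */
	uint64_t approx = (x * inv_weight) >> 32;	/* multiply-and-shift */

	printf("inv_weight=%llu exact=%llu approx=%llu\n",
	       (unsigned long long)inv_weight,
	       (unsigned long long)exact,
	       (unsigned long long)approx);
	return 0;
}

The approximation agrees with the exact quotient up to rounding while avoiding a 64-bit division in the scheduler fast path.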
@@ -6633,26 +6798,40 @@ EXPORT_SYMBOL(__might_sleep);
6633#endif 6798#endif
6634 6799
6635#ifdef CONFIG_MAGIC_SYSRQ 6800#ifdef CONFIG_MAGIC_SYSRQ
6801static void normalize_task(struct rq *rq, struct task_struct *p)
6802{
6803 int on_rq;
6804 update_rq_clock(rq);
6805 on_rq = p->se.on_rq;
6806 if (on_rq)
6807 deactivate_task(rq, p, 0);
6808 __setscheduler(rq, p, SCHED_NORMAL, 0);
6809 if (on_rq) {
6810 activate_task(rq, p, 0);
6811 resched_task(rq->curr);
6812 }
6813}
6814
6636void normalize_rt_tasks(void) 6815void normalize_rt_tasks(void)
6637{ 6816{
6638 struct task_struct *g, *p; 6817 struct task_struct *g, *p;
6639 unsigned long flags; 6818 unsigned long flags;
6640 struct rq *rq; 6819 struct rq *rq;
6641 int on_rq;
6642 6820
6643 read_lock_irq(&tasklist_lock); 6821 read_lock_irq(&tasklist_lock);
6644 do_each_thread(g, p) { 6822 do_each_thread(g, p) {
6645 p->se.fair_key = 0; 6823 /*
6646 p->se.wait_runtime = 0; 6824 * Only normalize user tasks:
6825 */
6826 if (!p->mm)
6827 continue;
6828
6647 p->se.exec_start = 0; 6829 p->se.exec_start = 0;
6648 p->se.wait_start_fair = 0;
6649 p->se.sleep_start_fair = 0;
6650#ifdef CONFIG_SCHEDSTATS 6830#ifdef CONFIG_SCHEDSTATS
6651 p->se.wait_start = 0; 6831 p->se.wait_start = 0;
6652 p->se.sleep_start = 0; 6832 p->se.sleep_start = 0;
6653 p->se.block_start = 0; 6833 p->se.block_start = 0;
6654#endif 6834#endif
6655 task_rq(p)->cfs.fair_clock = 0;
6656 task_rq(p)->clock = 0; 6835 task_rq(p)->clock = 0;
6657 6836
6658 if (!rt_task(p)) { 6837 if (!rt_task(p)) {
@@ -6667,26 +6846,9 @@ void normalize_rt_tasks(void)
6667 6846
6668 spin_lock_irqsave(&p->pi_lock, flags); 6847 spin_lock_irqsave(&p->pi_lock, flags);
6669 rq = __task_rq_lock(p); 6848 rq = __task_rq_lock(p);
6670#ifdef CONFIG_SMP
6671 /*
6672 * Do not touch the migration thread:
6673 */
6674 if (p == rq->migration_thread)
6675 goto out_unlock;
6676#endif
6677 6849
6678 update_rq_clock(rq); 6850 normalize_task(rq, p);
6679 on_rq = p->se.on_rq; 6851
6680 if (on_rq)
6681 deactivate_task(rq, p, 0);
6682 __setscheduler(rq, p, SCHED_NORMAL, 0);
6683 if (on_rq) {
6684 activate_task(rq, p, 0);
6685 resched_task(rq->curr);
6686 }
6687#ifdef CONFIG_SMP
6688 out_unlock:
6689#endif
6690 __task_rq_unlock(rq); 6852 __task_rq_unlock(rq);
6691 spin_unlock_irqrestore(&p->pi_lock, flags); 6853 spin_unlock_irqrestore(&p->pi_lock, flags);
6692 } while_each_thread(g, p); 6854 } while_each_thread(g, p);
@@ -6739,3 +6901,314 @@ void set_curr_task(int cpu, struct task_struct *p)
6739} 6901}
6740 6902
6741#endif 6903#endif
6904
6905#ifdef CONFIG_FAIR_GROUP_SCHED
6906
6907/* allocate runqueue etc for a new task group */
6908struct task_group *sched_create_group(void)
6909{
6910 struct task_group *tg;
6911 struct cfs_rq *cfs_rq;
6912 struct sched_entity *se;
6913 struct rq *rq;
6914 int i;
6915
6916 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
6917 if (!tg)
6918 return ERR_PTR(-ENOMEM);
6919
6920 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
6921 if (!tg->cfs_rq)
6922 goto err;
6923 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
6924 if (!tg->se)
6925 goto err;
6926
6927 for_each_possible_cpu(i) {
6928 rq = cpu_rq(i);
6929
6930 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
6931 cpu_to_node(i));
6932 if (!cfs_rq)
6933 goto err;
6934
6935 se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
6936 cpu_to_node(i));
6937 if (!se)
6938 goto err;
6939
6940 memset(cfs_rq, 0, sizeof(struct cfs_rq));
6941 memset(se, 0, sizeof(struct sched_entity));
6942
6943 tg->cfs_rq[i] = cfs_rq;
6944 init_cfs_rq(cfs_rq, rq);
6945 cfs_rq->tg = tg;
6946
6947 tg->se[i] = se;
6948 se->cfs_rq = &rq->cfs;
6949 se->my_q = cfs_rq;
6950 se->load.weight = NICE_0_LOAD;
6951 se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
6952 se->parent = NULL;
6953 }
6954
6955 for_each_possible_cpu(i) {
6956 rq = cpu_rq(i);
6957 cfs_rq = tg->cfs_rq[i];
6958 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6959 }
6960
6961 tg->shares = NICE_0_LOAD;
6962 spin_lock_init(&tg->lock);
6963
6964 return tg;
6965
6966err:
6967 for_each_possible_cpu(i) {
6968 if (tg->cfs_rq)
6969 kfree(tg->cfs_rq[i]);
6970 if (tg->se)
6971 kfree(tg->se[i]);
6972 }
6973 kfree(tg->cfs_rq);
6974 kfree(tg->se);
6975 kfree(tg);
6976
6977 return ERR_PTR(-ENOMEM);
6978}
6979
6980/* rcu callback to free various structures associated with a task group */
6981static void free_sched_group(struct rcu_head *rhp)
6982{
6983 struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu);
6984 struct task_group *tg = cfs_rq->tg;
6985 struct sched_entity *se;
6986 int i;
6987
6988 /* now it should be safe to free those cfs_rqs */
6989 for_each_possible_cpu(i) {
6990 cfs_rq = tg->cfs_rq[i];
6991 kfree(cfs_rq);
6992
6993 se = tg->se[i];
6994 kfree(se);
6995 }
6996
6997 kfree(tg->cfs_rq);
6998 kfree(tg->se);
6999 kfree(tg);
7000}
7001
7002/* Destroy runqueue etc associated with a task group */
7003void sched_destroy_group(struct task_group *tg)
7004{
7005 struct cfs_rq *cfs_rq;
7006 int i;
7007
7008 for_each_possible_cpu(i) {
7009 cfs_rq = tg->cfs_rq[i];
7010 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7011 }
7012
7013 cfs_rq = tg->cfs_rq[0];
7014
7015 /* wait for possible concurrent references to cfs_rqs complete */
7016 call_rcu(&cfs_rq->rcu, free_sched_group);
7017}
7018
7019/* change task's runqueue when it moves between groups.
7020 * The caller of this function should have put the task in its new group
7021 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7022 * reflect its new group.
7023 */
7024void sched_move_task(struct task_struct *tsk)
7025{
7026 int on_rq, running;
7027 unsigned long flags;
7028 struct rq *rq;
7029
7030 rq = task_rq_lock(tsk, &flags);
7031
7032 if (tsk->sched_class != &fair_sched_class)
7033 goto done;
7034
7035 update_rq_clock(rq);
7036
7037 running = task_running(rq, tsk);
7038 on_rq = tsk->se.on_rq;
7039
7040 if (on_rq) {
7041 dequeue_task(rq, tsk, 0);
7042 if (unlikely(running))
7043 tsk->sched_class->put_prev_task(rq, tsk);
7044 }
7045
7046 set_task_cfs_rq(tsk);
7047
7048 if (on_rq) {
7049 if (unlikely(running))
7050 tsk->sched_class->set_curr_task(rq);
7051 enqueue_task(rq, tsk, 0);
7052 }
7053
7054done:
7055 task_rq_unlock(rq, &flags);
7056}
7057
7058static void set_se_shares(struct sched_entity *se, unsigned long shares)
7059{
7060 struct cfs_rq *cfs_rq = se->cfs_rq;
7061 struct rq *rq = cfs_rq->rq;
7062 int on_rq;
7063
7064 spin_lock_irq(&rq->lock);
7065
7066 on_rq = se->on_rq;
7067 if (on_rq)
7068 dequeue_entity(cfs_rq, se, 0);
7069
7070 se->load.weight = shares;
7071 se->load.inv_weight = div64_64((1ULL<<32), shares);
7072
7073 if (on_rq)
7074 enqueue_entity(cfs_rq, se, 0);
7075
7076 spin_unlock_irq(&rq->lock);
7077}
7078
7079int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7080{
7081 int i;
7082
7083 spin_lock(&tg->lock);
7084 if (tg->shares == shares)
7085 goto done;
7086
7087 tg->shares = shares;
7088 for_each_possible_cpu(i)
7089 set_se_shares(tg->se[i], shares);
7090
7091done:
7092 spin_unlock(&tg->lock);
7093 return 0;
7094}
7095
7096unsigned long sched_group_shares(struct task_group *tg)
7097{
7098 return tg->shares;
7099}
7100
7101#endif /* CONFIG_FAIR_GROUP_SCHED */
7102
7103#ifdef CONFIG_FAIR_CGROUP_SCHED
7104
7105/* return corresponding task_group object of a cgroup */
7106static inline struct task_group *cgroup_tg(struct cgroup *cont)
7107{
7108 return container_of(cgroup_subsys_state(cont, cpu_cgroup_subsys_id),
7109 struct task_group, css);
7110}
7111
7112static struct cgroup_subsys_state *
7113cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
7114{
7115 struct task_group *tg;
7116
7117 if (!cont->parent) {
7118 /* This is early initialization for the top cgroup */
7119 init_task_group.css.cgroup = cont;
7120 return &init_task_group.css;
7121 }
7122
7123 /* we support only 1-level deep hierarchical scheduler atm */
7124 if (cont->parent->parent)
7125 return ERR_PTR(-EINVAL);
7126
7127 tg = sched_create_group();
7128 if (IS_ERR(tg))
7129 return ERR_PTR(-ENOMEM);
7130
7131 /* Bind the cgroup to task_group object we just created */
7132 tg->css.cgroup = cont;
7133
7134 return &tg->css;
7135}
7136
7137static void cpu_cgroup_destroy(struct cgroup_subsys *ss,
7138 struct cgroup *cont)
7139{
7140 struct task_group *tg = cgroup_tg(cont);
7141
7142 sched_destroy_group(tg);
7143}
7144
7145static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
7146 struct cgroup *cont, struct task_struct *tsk)
7147{
7148 /* We don't support RT-tasks being in separate groups */
7149 if (tsk->sched_class != &fair_sched_class)
7150 return -EINVAL;
7151
7152 return 0;
7153}
7154
7155static void
7156cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cont,
7157 struct cgroup *old_cont, struct task_struct *tsk)
7158{
7159 sched_move_task(tsk);
7160}
7161
7162static ssize_t cpu_shares_write(struct cgroup *cont, struct cftype *cftype,
7163 struct file *file, const char __user *userbuf,
7164 size_t nbytes, loff_t *ppos)
7165{
7166 unsigned long shareval;
7167 struct task_group *tg = cgroup_tg(cont);
7168 char buffer[2*sizeof(unsigned long) + 1];
7169 int rc;
7170
7171 if (nbytes > 2*sizeof(unsigned long)) /* safety check */
7172 return -E2BIG;
7173
7174 if (copy_from_user(buffer, userbuf, nbytes))
7175 return -EFAULT;
7176
7177 buffer[nbytes] = 0; /* nul-terminate */
7178 shareval = simple_strtoul(buffer, NULL, 10);
7179
7180 rc = sched_group_set_shares(tg, shareval);
7181
7182 return (rc < 0 ? rc : nbytes);
7183}
7184
7185static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft)
7186{
7187 struct task_group *tg = cgroup_tg(cont);
7188
7189 return (u64) tg->shares;
7190}
7191
7192static struct cftype cpu_shares = {
7193 .name = "shares",
7194 .read_uint = cpu_shares_read_uint,
7195 .write = cpu_shares_write,
7196};
7197
7198static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7199{
7200 return cgroup_add_file(cont, ss, &cpu_shares);
7201}
7202
7203struct cgroup_subsys cpu_cgroup_subsys = {
7204 .name = "cpu",
7205 .create = cpu_cgroup_create,
7206 .destroy = cpu_cgroup_destroy,
7207 .can_attach = cpu_cgroup_can_attach,
7208 .attach = cpu_cgroup_attach,
7209 .populate = cpu_cgroup_populate,
7210 .subsys_id = cpu_cgroup_subsys_id,
7211 .early_init = 1,
7212};
7213
7214#endif /* CONFIG_FAIR_CGROUP_SCHED */
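The cgroup glue above exposes one "shares" file per group: cpu_shares_write() copies at most 2*sizeof(unsigned long) bytes into a small buffer, nul-terminates it, parses it with simple_strtoul() and hands the value to sched_group_set_shares(), which fans it out to the group's per-CPU scheduling entities through set_se_shares(). A stand-alone model of that fan-out (all structures and sizes invented for illustration; a real implementation would also validate the parsed value):

/* shares_fanout_demo.c - sketch: one group-wide shares value, per-CPU weights */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define NR_CPUS_DEMO 4

struct demo_entity {
	unsigned long weight;
	uint64_t inv_weight;
};

struct demo_group {
	unsigned long shares;
	struct demo_entity se[NR_CPUS_DEMO];
};

static void set_entity_shares(struct demo_entity *se, unsigned long shares)
{
	se->weight = shares;
	se->inv_weight = (1ULL << 32) / shares;	/* mirrors div64_64(1<<32, shares) */
}

static void group_set_shares(struct demo_group *tg, unsigned long shares)
{
	int i;

	if (tg->shares == shares)
		return;
	tg->shares = shares;
	for (i = 0; i < NR_CPUS_DEMO; i++)	/* for_each_possible_cpu() in the kernel */
		set_entity_shares(&tg->se[i], shares);
}

int main(void)
{
	struct demo_group tg = { .shares = 1024 };
	char buf[] = "2048";			/* what a write to cpu.shares might carry */

	group_set_shares(&tg, strtoul(buf, NULL, 10));
	printf("group shares=%lu, cpu0 weight=%lu\n", tg.shares, tg.se[0].weight);
	return 0;
}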
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index c3ee38bd3426..e6fb392e5164 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -28,6 +28,31 @@
28 printk(x); \ 28 printk(x); \
29 } while (0) 29 } while (0)
30 30
31/*
32 * Ease the printing of nsec fields:
33 */
34static long long nsec_high(long long nsec)
35{
36 if (nsec < 0) {
37 nsec = -nsec;
38 do_div(nsec, 1000000);
39 return -nsec;
40 }
41 do_div(nsec, 1000000);
42
43 return nsec;
44}
45
46static unsigned long nsec_low(long long nsec)
47{
48 if (nsec < 0)
49 nsec = -nsec;
50
51 return do_div(nsec, 1000000);
52}
53
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55
31static void 56static void
32print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 57print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
33{ 58{
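nsec_high()/nsec_low() split a signed nanosecond count into whole milliseconds and a six-digit remainder so that SPLIT_NS(x) can feed one "%Ld.%06ld" conversion, with the sign kept on the integer part. A userspace restatement of the same helpers (plain / and % instead of do_div()):

/* split_ns_demo.c - sketch: print nanosecond counts as msec.remainder fields */
#include <stdio.h>

static long long nsec_high(long long nsec)
{
	if (nsec < 0)
		return -(-nsec / 1000000);	/* keep the sign on the ms part */
	return nsec / 1000000;
}

static long nsec_low(long long nsec)
{
	if (nsec < 0)
		nsec = -nsec;
	return (long)(nsec % 1000000);		/* what do_div() returns in the kernel */
}

#define SPLIT_NS(x) nsec_high(x), nsec_low(x)

int main(void)
{
	long long samples[] = { 20000000LL, -1234567LL, 999999LL };
	int i;

	for (i = 0; i < (int)(sizeof(samples) / sizeof(samples[0])); i++)
		printf("%lld ns -> %lld.%06ld\n", samples[i], SPLIT_NS(samples[i]));
	return 0;
}

20000000 ns prints as 20.000000, -1234567 ns as -1.234567 and 999999 ns as 0.999999, which is the format used throughout the reworked debug output.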
@@ -36,23 +61,19 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
36 else 61 else
37 SEQ_printf(m, " "); 62 SEQ_printf(m, " ");
38 63
39 SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", 64 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
40 p->comm, p->pid, 65 p->comm, p->pid,
41 (long long)p->se.fair_key, 66 SPLIT_NS(p->se.vruntime),
42 (long long)(p->se.fair_key - rq->cfs.fair_clock),
43 (long long)p->se.wait_runtime,
44 (long long)(p->nvcsw + p->nivcsw), 67 (long long)(p->nvcsw + p->nivcsw),
45 p->prio); 68 p->prio);
46#ifdef CONFIG_SCHEDSTATS 69#ifdef CONFIG_SCHEDSTATS
47 SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", 70 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n",
48 (long long)p->se.sum_exec_runtime, 71 SPLIT_NS(p->se.vruntime),
49 (long long)p->se.sum_wait_runtime, 72 SPLIT_NS(p->se.sum_exec_runtime),
50 (long long)p->se.sum_sleep_runtime, 73 SPLIT_NS(p->se.sum_sleep_runtime));
51 (long long)p->se.wait_runtime_overruns,
52 (long long)p->se.wait_runtime_underruns);
53#else 74#else
54 SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", 75 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n",
55 0LL, 0LL, 0LL, 0LL, 0LL); 76 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
56#endif 77#endif
57} 78}
58 79
@@ -62,14 +83,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
62 83
63 SEQ_printf(m, 84 SEQ_printf(m,
64 "\nrunnable tasks:\n" 85 "\nrunnable tasks:\n"
65 " task PID tree-key delta waiting" 86 " task PID tree-key switches prio"
66 " switches prio" 87 " exec-runtime sum-exec sum-sleep\n"
67 " sum-exec sum-wait sum-sleep" 88 "------------------------------------------------------"
68 " wait-overrun wait-underrun\n" 89 "----------------------------------------------------\n");
69 "------------------------------------------------------------------"
70 "----------------"
71 "------------------------------------------------"
72 "--------------------------------\n");
73 90
74 read_lock_irq(&tasklist_lock); 91 read_lock_irq(&tasklist_lock);
75 92
@@ -83,45 +100,48 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
83 read_unlock_irq(&tasklist_lock); 100 read_unlock_irq(&tasklist_lock);
84} 101}
85 102
86static void 103void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
87print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
88{ 104{
89 s64 wait_runtime_rq_sum = 0; 105 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
90 struct task_struct *p; 106 spread, rq0_min_vruntime, spread0;
91 struct rb_node *curr;
92 unsigned long flags;
93 struct rq *rq = &per_cpu(runqueues, cpu); 107 struct rq *rq = &per_cpu(runqueues, cpu);
108 struct sched_entity *last;
109 unsigned long flags;
94 110
95 spin_lock_irqsave(&rq->lock, flags);
96 curr = first_fair(cfs_rq);
97 while (curr) {
98 p = rb_entry(curr, struct task_struct, se.run_node);
99 wait_runtime_rq_sum += p->se.wait_runtime;
100
101 curr = rb_next(curr);
102 }
103 spin_unlock_irqrestore(&rq->lock, flags);
104
105 SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum",
106 (long long)wait_runtime_rq_sum);
107}
108
109void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
110{
111 SEQ_printf(m, "\ncfs_rq\n"); 111 SEQ_printf(m, "\ncfs_rq\n");
112 112
113#define P(x) \ 113 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
114 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) 114 SPLIT_NS(cfs_rq->exec_clock));
115
116 P(fair_clock);
117 P(exec_clock);
118 P(wait_runtime);
119 P(wait_runtime_overruns);
120 P(wait_runtime_underruns);
121 P(sleeper_bonus);
122#undef P
123 115
124 print_cfs_rq_runtime_sum(m, cpu, cfs_rq); 116 spin_lock_irqsave(&rq->lock, flags);
117 if (cfs_rq->rb_leftmost)
118 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
119 last = __pick_last_entity(cfs_rq);
120 if (last)
121 max_vruntime = last->vruntime;
122 min_vruntime = rq->cfs.min_vruntime;
123 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
124 spin_unlock_irqrestore(&rq->lock, flags);
125 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
126 SPLIT_NS(MIN_vruntime));
127 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
128 SPLIT_NS(min_vruntime));
129 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
130 SPLIT_NS(max_vruntime));
131 spread = max_vruntime - MIN_vruntime;
132 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
133 SPLIT_NS(spread));
134 spread0 = min_vruntime - rq0_min_vruntime;
135 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
136 SPLIT_NS(spread0));
137 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
138 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
139#ifdef CONFIG_SCHEDSTATS
140 SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
141 rq->bkl_count);
142#endif
143 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
144 cfs_rq->nr_spread_over);
125} 145}
126 146
127static void print_cpu(struct seq_file *m, int cpu) 147static void print_cpu(struct seq_file *m, int cpu)
@@ -141,31 +161,32 @@ static void print_cpu(struct seq_file *m, int cpu)
141 161
142#define P(x) \ 162#define P(x) \
143 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 163 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
164#define PN(x) \
165 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
144 166
145 P(nr_running); 167 P(nr_running);
146 SEQ_printf(m, " .%-30s: %lu\n", "load", 168 SEQ_printf(m, " .%-30s: %lu\n", "load",
147 rq->ls.load.weight); 169 rq->load.weight);
148 P(ls.delta_fair);
149 P(ls.delta_exec);
150 P(nr_switches); 170 P(nr_switches);
151 P(nr_load_updates); 171 P(nr_load_updates);
152 P(nr_uninterruptible); 172 P(nr_uninterruptible);
153 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); 173 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
154 P(next_balance); 174 PN(next_balance);
155 P(curr->pid); 175 P(curr->pid);
156 P(clock); 176 PN(clock);
157 P(idle_clock); 177 PN(idle_clock);
158 P(prev_clock_raw); 178 PN(prev_clock_raw);
159 P(clock_warps); 179 P(clock_warps);
160 P(clock_overflows); 180 P(clock_overflows);
161 P(clock_deep_idle_events); 181 P(clock_deep_idle_events);
162 P(clock_max_delta); 182 PN(clock_max_delta);
163 P(cpu_load[0]); 183 P(cpu_load[0]);
164 P(cpu_load[1]); 184 P(cpu_load[1]);
165 P(cpu_load[2]); 185 P(cpu_load[2]);
166 P(cpu_load[3]); 186 P(cpu_load[3]);
167 P(cpu_load[4]); 187 P(cpu_load[4]);
168#undef P 188#undef P
189#undef PN
169 190
170 print_cfs_stats(m, cpu); 191 print_cfs_stats(m, cpu);
171 192
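Both the existing P() macro and the new PN() companion rely on the preprocessor's #x stringification, so a single macro argument produces both the field name and its value, and PN() additionally routes the value through SPLIT_NS() for nanosecond-resolution fields. A small userspace sketch of the pattern (the struct, fields and simplified SPLIT_NS are invented for illustration and ignore negative values):

/* stringify_demo.c - sketch: #x stringification for name/value debug rows */
#include <stdio.h>

struct demo_rq {
	long nr_running;
	long long clock;	/* nanoseconds */
};

#define SPLIT_NS(x) ((long long)(x) / 1000000), (long)((x) % 1000000)

#define P(x)  printf("  .%-30s: %lld\n", #x, (long long)(rq->x))
#define PN(x) printf("  .%-30s: %lld.%06ld\n", #x, SPLIT_NS(rq->x))

int main(void)
{
	struct demo_rq demo = { .nr_running = 3, .clock = 123456789LL };
	struct demo_rq *rq = &demo;

	P(nr_running);		/* ".nr_running   : 3"          */
	PN(clock);		/* ".clock        : 123.456789" */
	return 0;
}

Undefining the macros right after use, as the hunk does with #undef P / #undef PN, keeps the short names from leaking into the rest of the file.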
@@ -177,12 +198,25 @@ static int sched_debug_show(struct seq_file *m, void *v)
177 u64 now = ktime_to_ns(ktime_get()); 198 u64 now = ktime_to_ns(ktime_get());
178 int cpu; 199 int cpu;
179 200
180 SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n", 201 SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n",
181 init_utsname()->release, 202 init_utsname()->release,
182 (int)strcspn(init_utsname()->version, " "), 203 (int)strcspn(init_utsname()->version, " "),
183 init_utsname()->version); 204 init_utsname()->version);
184 205
185 SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); 206 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
207
208#define P(x) \
209 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
210#define PN(x) \
211 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
212 PN(sysctl_sched_latency);
213 PN(sysctl_sched_nr_latency);
214 PN(sysctl_sched_wakeup_granularity);
215 PN(sysctl_sched_batch_wakeup_granularity);
216 PN(sysctl_sched_child_runs_first);
217 P(sysctl_sched_features);
218#undef PN
219#undef P
186 220
187 for_each_online_cpu(cpu) 221 for_each_online_cpu(cpu)
188 print_cpu(m, cpu); 222 print_cpu(m, cpu);
@@ -202,7 +236,7 @@ static int sched_debug_open(struct inode *inode, struct file *filp)
202 return single_open(filp, sched_debug_show, NULL); 236 return single_open(filp, sched_debug_show, NULL);
203} 237}
204 238
205static struct file_operations sched_debug_fops = { 239static const struct file_operations sched_debug_fops = {
206 .open = sched_debug_open, 240 .open = sched_debug_open,
207 .read = seq_read, 241 .read = seq_read,
208 .llseek = seq_lseek, 242 .llseek = seq_lseek,
@@ -226,6 +260,7 @@ __initcall(init_sched_debug_procfs);
226 260
227void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 261void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
228{ 262{
263 unsigned long nr_switches;
229 unsigned long flags; 264 unsigned long flags;
230 int num_threads = 1; 265 int num_threads = 1;
231 266
@@ -237,41 +272,89 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
237 rcu_read_unlock(); 272 rcu_read_unlock();
238 273
239 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 274 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
240 SEQ_printf(m, "----------------------------------------------\n"); 275 SEQ_printf(m,
276 "---------------------------------------------------------\n");
277#define __P(F) \
278 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
241#define P(F) \ 279#define P(F) \
242 SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) 280 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
281#define __PN(F) \
282 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
283#define PN(F) \
284 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
243 285
244 P(se.wait_runtime); 286 PN(se.exec_start);
245 P(se.wait_start_fair); 287 PN(se.vruntime);
246 P(se.exec_start); 288 PN(se.sum_exec_runtime);
247 P(se.sleep_start_fair); 289
248 P(se.sum_exec_runtime); 290 nr_switches = p->nvcsw + p->nivcsw;
249 291
250#ifdef CONFIG_SCHEDSTATS 292#ifdef CONFIG_SCHEDSTATS
251 P(se.wait_start); 293 PN(se.wait_start);
252 P(se.sleep_start); 294 PN(se.sleep_start);
253 P(se.block_start); 295 PN(se.block_start);
254 P(se.sleep_max); 296 PN(se.sleep_max);
255 P(se.block_max); 297 PN(se.block_max);
256 P(se.exec_max); 298 PN(se.exec_max);
257 P(se.wait_max); 299 PN(se.slice_max);
258 P(se.wait_runtime_overruns); 300 PN(se.wait_max);
259 P(se.wait_runtime_underruns); 301 P(sched_info.bkl_count);
260 P(se.sum_wait_runtime); 302 P(se.nr_migrations);
303 P(se.nr_migrations_cold);
304 P(se.nr_failed_migrations_affine);
305 P(se.nr_failed_migrations_running);
306 P(se.nr_failed_migrations_hot);
307 P(se.nr_forced_migrations);
308 P(se.nr_forced2_migrations);
309 P(se.nr_wakeups);
310 P(se.nr_wakeups_sync);
311 P(se.nr_wakeups_migrate);
312 P(se.nr_wakeups_local);
313 P(se.nr_wakeups_remote);
314 P(se.nr_wakeups_affine);
315 P(se.nr_wakeups_affine_attempts);
316 P(se.nr_wakeups_passive);
317 P(se.nr_wakeups_idle);
318
319 {
320 u64 avg_atom, avg_per_cpu;
321
322 avg_atom = p->se.sum_exec_runtime;
323 if (nr_switches)
324 do_div(avg_atom, nr_switches);
325 else
326 avg_atom = -1LL;
327
328 avg_per_cpu = p->se.sum_exec_runtime;
329 if (p->se.nr_migrations)
330 avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations);
331 else
332 avg_per_cpu = -1LL;
333
334 __PN(avg_atom);
335 __PN(avg_per_cpu);
336 }
261#endif 337#endif
262 SEQ_printf(m, "%-25s:%20Ld\n", 338 __P(nr_switches);
263 "nr_switches", (long long)(p->nvcsw + p->nivcsw)); 339 SEQ_printf(m, "%-35s:%21Ld\n",
340 "nr_voluntary_switches", (long long)p->nvcsw);
341 SEQ_printf(m, "%-35s:%21Ld\n",
342 "nr_involuntary_switches", (long long)p->nivcsw);
343
264 P(se.load.weight); 344 P(se.load.weight);
265 P(policy); 345 P(policy);
266 P(prio); 346 P(prio);
347#undef PN
348#undef __PN
267#undef P 349#undef P
350#undef __P
268 351
269 { 352 {
270 u64 t0, t1; 353 u64 t0, t1;
271 354
272 t0 = sched_clock(); 355 t0 = sched_clock();
273 t1 = sched_clock(); 356 t1 = sched_clock();
274 SEQ_printf(m, "%-25s:%20Ld\n", 357 SEQ_printf(m, "%-35s:%21Ld\n",
275 "clock-delta", (long long)(t1-t0)); 358 "clock-delta", (long long)(t1-t0));
276 } 359 }
277} 360}
@@ -279,9 +362,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
279void proc_sched_set_task(struct task_struct *p) 362void proc_sched_set_task(struct task_struct *p)
280{ 363{
281#ifdef CONFIG_SCHEDSTATS 364#ifdef CONFIG_SCHEDSTATS
282 p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; 365 p->se.wait_max = 0;
283 p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; 366 p->se.sleep_max = 0;
367 p->se.sum_sleep_runtime = 0;
368 p->se.block_max = 0;
369 p->se.exec_max = 0;
370 p->se.slice_max = 0;
371 p->se.nr_migrations = 0;
372 p->se.nr_migrations_cold = 0;
373 p->se.nr_failed_migrations_affine = 0;
374 p->se.nr_failed_migrations_running = 0;
375 p->se.nr_failed_migrations_hot = 0;
376 p->se.nr_forced_migrations = 0;
377 p->se.nr_forced2_migrations = 0;
378 p->se.nr_wakeups = 0;
379 p->se.nr_wakeups_sync = 0;
380 p->se.nr_wakeups_migrate = 0;
381 p->se.nr_wakeups_local = 0;
382 p->se.nr_wakeups_remote = 0;
383 p->se.nr_wakeups_affine = 0;
384 p->se.nr_wakeups_affine_attempts = 0;
385 p->se.nr_wakeups_passive = 0;
386 p->se.nr_wakeups_idle = 0;
387 p->sched_info.bkl_count = 0;
284#endif 388#endif
285 p->se.sum_exec_runtime = 0; 389 p->se.sum_exec_runtime = 0;
286 p->se.prev_sum_exec_runtime = 0; 390 p->se.prev_sum_exec_runtime = 0;
391 p->nvcsw = 0;
392 p->nivcsw = 0;
287} 393}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 67c67a87146e..166ed6db600b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,22 +25,26 @@
25 * (default: 20ms, units: nanoseconds) 25 * (default: 20ms, units: nanoseconds)
26 * 26 *
27 * NOTE: this latency value is not the same as the concept of 27 * NOTE: this latency value is not the same as the concept of
28 * 'timeslice length' - timeslices in CFS are of variable length. 28 * 'timeslice length' - timeslices in CFS are of variable length
29 * (to see the precise effective timeslice length of your workload, 29 * and have no persistent notion like in traditional, time-slice
30 * run vmstat and monitor the context-switches field) 30 * based scheduling concepts.
31 * 31 *
32 * On SMP systems the value of this is multiplied by the log2 of the 32 * (to see the precise effective timeslice length of your workload,
33 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way 33 * run vmstat and monitor the context-switches (cs) field)
34 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
35 * Targeted preemption latency for CPU-bound tasks:
36 */ 34 */
37unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; 35const_debug unsigned int sysctl_sched_latency = 20000000ULL;
36
37/*
38 * After fork, child runs first. (default) If set to 0 then
39 * parent will (try to) run first.
40 */
41const_debug unsigned int sysctl_sched_child_runs_first = 1;
38 42
39/* 43/*
40 * Minimal preemption granularity for CPU-bound tasks: 44 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 2 msec, units: nanoseconds) 45 * (default: 2 msec, units: nanoseconds)
42 */ 46 */
43unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; 47const_debug unsigned int sysctl_sched_nr_latency = 20;
44 48
45/* 49/*
46 * sys_sched_yield() compat mode 50 * sys_sched_yield() compat mode
@@ -52,52 +56,25 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
52 56
53/* 57/*
54 * SCHED_BATCH wake-up granularity. 58 * SCHED_BATCH wake-up granularity.
55 * (default: 25 msec, units: nanoseconds) 59 * (default: 10 msec, units: nanoseconds)
56 * 60 *
57 * This option delays the preemption effects of decoupled workloads 61 * This option delays the preemption effects of decoupled workloads
58 * and reduces their over-scheduling. Synchronous workloads will still 62 * and reduces their over-scheduling. Synchronous workloads will still
59 * have immediate wakeup/sleep latencies. 63 * have immediate wakeup/sleep latencies.
60 */ 64 */
61unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; 65const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
62 66
63/* 67/*
64 * SCHED_OTHER wake-up granularity. 68 * SCHED_OTHER wake-up granularity.
65 * (default: 1 msec, units: nanoseconds) 69 * (default: 10 msec, units: nanoseconds)
66 * 70 *
67 * This option delays the preemption effects of decoupled workloads 71 * This option delays the preemption effects of decoupled workloads
68 * and reduces their over-scheduling. Synchronous workloads will still 72 * and reduces their over-scheduling. Synchronous workloads will still
69 * have immediate wakeup/sleep latencies. 73 * have immediate wakeup/sleep latencies.
70 */ 74 */
71unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; 75const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
72
73unsigned int sysctl_sched_stat_granularity __read_mostly;
74
75/*
76 * Initialized in sched_init_granularity() [to 5 times the base granularity]:
77 */
78unsigned int sysctl_sched_runtime_limit __read_mostly;
79
80/*
81 * Debugging: various feature bits
82 */
83enum {
84 SCHED_FEAT_FAIR_SLEEPERS = 1,
85 SCHED_FEAT_SLEEPER_AVG = 2,
86 SCHED_FEAT_SLEEPER_LOAD_AVG = 4,
87 SCHED_FEAT_PRECISE_CPU_LOAD = 8,
88 SCHED_FEAT_START_DEBIT = 16,
89 SCHED_FEAT_SKIP_INITIAL = 32,
90};
91 76
92unsigned int sysctl_sched_features __read_mostly = 77const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
93 SCHED_FEAT_FAIR_SLEEPERS *1 |
94 SCHED_FEAT_SLEEPER_AVG *0 |
95 SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
96 SCHED_FEAT_PRECISE_CPU_LOAD *1 |
97 SCHED_FEAT_START_DEBIT *1 |
98 SCHED_FEAT_SKIP_INITIAL *0;
99
100extern struct sched_class fair_sched_class;
101 78
102/************************************************************** 79/**************************************************************
103 * CFS operations on generic schedulable entities: 80 * CFS operations on generic schedulable entities:
@@ -111,21 +88,9 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
111 return cfs_rq->rq; 88 return cfs_rq->rq;
112} 89}
113 90
114/* currently running entity (if any) on this cfs_rq */
115static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
116{
117 return cfs_rq->curr;
118}
119
120/* An entity is a task if it doesn't "own" a runqueue */ 91/* An entity is a task if it doesn't "own" a runqueue */
121#define entity_is_task(se) (!se->my_q) 92#define entity_is_task(se) (!se->my_q)
122 93
123static inline void
124set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
125{
126 cfs_rq->curr = se;
127}
128
129#else /* CONFIG_FAIR_GROUP_SCHED */ 94#else /* CONFIG_FAIR_GROUP_SCHED */
130 95
131static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 96static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -133,21 +98,8 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
133 return container_of(cfs_rq, struct rq, cfs); 98 return container_of(cfs_rq, struct rq, cfs);
134} 99}
135 100
136static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
137{
138 struct rq *rq = rq_of(cfs_rq);
139
140 if (unlikely(rq->curr->sched_class != &fair_sched_class))
141 return NULL;
142
143 return &rq->curr->se;
144}
145
146#define entity_is_task(se) 1 101#define entity_is_task(se) 1
147 102
148static inline void
149set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
150
151#endif /* CONFIG_FAIR_GROUP_SCHED */ 103#endif /* CONFIG_FAIR_GROUP_SCHED */
152 104
153static inline struct task_struct *task_of(struct sched_entity *se) 105static inline struct task_struct *task_of(struct sched_entity *se)
@@ -160,16 +112,38 @@ static inline struct task_struct *task_of(struct sched_entity *se)
160 * Scheduling class tree data structure manipulation methods: 112 * Scheduling class tree data structure manipulation methods:
161 */ 113 */
162 114
115static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
116{
117 s64 delta = (s64)(vruntime - min_vruntime);
118 if (delta > 0)
119 min_vruntime = vruntime;
120
121 return min_vruntime;
122}
123
124static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
125{
126 s64 delta = (s64)(vruntime - min_vruntime);
127 if (delta < 0)
128 min_vruntime = vruntime;
129
130 return min_vruntime;
131}
132
133static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
134{
135 return se->vruntime - cfs_rq->min_vruntime;
136}
137
163/* 138/*
164 * Enqueue an entity into the rb-tree: 139 * Enqueue an entity into the rb-tree:
165 */ 140 */
166static inline void 141static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
167__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
168{ 142{
169 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 143 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
170 struct rb_node *parent = NULL; 144 struct rb_node *parent = NULL;
171 struct sched_entity *entry; 145 struct sched_entity *entry;
172 s64 key = se->fair_key; 146 s64 key = entity_key(cfs_rq, se);
173 int leftmost = 1; 147 int leftmost = 1;
174 148
175 /* 149 /*
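max_vruntime()/min_vruntime() compare two u64 virtual runtimes through a signed 64-bit delta, so the result stays correct even after the vruntime counters wrap around 2^64, and entity_key() keys the rb-tree on vruntime relative to cfs_rq->min_vruntime for the same reason. A stand-alone demonstration of why the signed delta matters (values chosen right at the wrap point):

/* vruntime_wrap_demo.c - sketch: wraparound-safe u64 comparison via s64 delta */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static uint64_t max_vruntime(uint64_t min_vruntime, uint64_t vruntime)
{
	int64_t delta = (int64_t)(vruntime - min_vruntime);

	if (delta > 0)		/* vruntime is "later", even across a wrap */
		min_vruntime = vruntime;
	return min_vruntime;
}

int main(void)
{
	uint64_t just_before_wrap = UINT64_MAX - 100;	/* old value, about to wrap */
	uint64_t just_after_wrap  = 50;			/* newer, but numerically tiny */

	/* a plain '>' picks the huge pre-wrap value ... */
	printf("naive max : %" PRIu64 "\n",
	       just_after_wrap > just_before_wrap ? just_after_wrap : just_before_wrap);
	/* ... the signed delta correctly treats 50 as the later time */
	printf("signed max: %" PRIu64 "\n",
	       max_vruntime(just_before_wrap, just_after_wrap));
	return 0;
}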
@@ -182,7 +156,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
182 * We dont care about collisions. Nodes with 156 * We dont care about collisions. Nodes with
183 * the same key stay together. 157 * the same key stay together.
184 */ 158 */
185 if (key - entry->fair_key < 0) { 159 if (key < entity_key(cfs_rq, entry)) {
186 link = &parent->rb_left; 160 link = &parent->rb_left;
187 } else { 161 } else {
188 link = &parent->rb_right; 162 link = &parent->rb_right;
@@ -199,24 +173,14 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
199 173
200 rb_link_node(&se->run_node, parent, link); 174 rb_link_node(&se->run_node, parent, link);
201 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); 175 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
202 update_load_add(&cfs_rq->load, se->load.weight);
203 cfs_rq->nr_running++;
204 se->on_rq = 1;
205
206 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
207} 176}
208 177
209static inline void 178static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
210__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
211{ 179{
212 if (cfs_rq->rb_leftmost == &se->run_node) 180 if (cfs_rq->rb_leftmost == &se->run_node)
213 cfs_rq->rb_leftmost = rb_next(&se->run_node); 181 cfs_rq->rb_leftmost = rb_next(&se->run_node);
214 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
215 update_load_sub(&cfs_rq->load, se->load.weight);
216 cfs_rq->nr_running--;
217 se->on_rq = 0;
218 182
219 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); 183 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
220} 184}
221 185
222static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) 186static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
@@ -229,118 +193,86 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
229 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); 193 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
230} 194}
231 195
196static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
197{
198 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
199 struct sched_entity *se = NULL;
200 struct rb_node *parent;
201
202 while (*link) {
203 parent = *link;
204 se = rb_entry(parent, struct sched_entity, run_node);
205 link = &parent->rb_right;
206 }
207
208 return se;
209}
210
232/************************************************************** 211/**************************************************************
233 * Scheduling class statistics methods: 212 * Scheduling class statistics methods:
234 */ 213 */
235 214
215
236/* 216/*
237 * Calculate the preemption granularity needed to schedule every 217 * The idea is to set a period in which each task runs once.
238 * runnable task once per sysctl_sched_latency amount of time.
239 * (down to a sensible low limit on granularity)
240 *
241 * For example, if there are 2 tasks running and latency is 10 msecs,
242 * we switch tasks every 5 msecs. If we have 3 tasks running, we have
243 * to switch tasks every 3.33 msecs to get a 10 msecs observed latency
244 * for each task. We do finer and finer scheduling up to until we
245 * reach the minimum granularity value.
246 * 218 *
247 * To achieve this we use the following dynamic-granularity rule: 219 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
220 * this period because otherwise the slices get too small.
248 * 221 *
249 * gran = lat/nr - lat/nr/nr 222 * p = (nr <= nl) ? l : l*nr/nl
250 *
251 * This comes out of the following equations:
252 *
253 * kA1 + gran = kB1
254 * kB2 + gran = kA2
255 * kA2 = kA1
256 * kB2 = kB1 - d + d/nr
257 * lat = d * nr
258 *
259 * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running),
260 * '1' is start of time, '2' is end of time, 'd' is delay between
261 * 1 and 2 (during which task B was running), 'nr' is number of tasks
262 * running, 'lat' is the the period of each task. ('lat' is the
262 * running, 'lat' is the period of each task. ('lat' is the
263 * sched_latency that we aim for.)
264 */ 223 */
265static long 224static u64 __sched_period(unsigned long nr_running)
266sched_granularity(struct cfs_rq *cfs_rq)
267{ 225{
268 unsigned int gran = sysctl_sched_latency; 226 u64 period = sysctl_sched_latency;
269 unsigned int nr = cfs_rq->nr_running; 227 unsigned long nr_latency = sysctl_sched_nr_latency;
270 228
271 if (nr > 1) { 229 if (unlikely(nr_running > nr_latency)) {
272 gran = gran/nr - gran/nr/nr; 230 period *= nr_running;
273 gran = max(gran, sysctl_sched_min_granularity); 231 do_div(period, nr_latency);
274 } 232 }
275 233
276 return gran; 234 return period;
277} 235}
278 236
279/* 237/*
280 * We rescale the rescheduling granularity of tasks according to their 238 * We calculate the wall-time slice from the period by taking a part
281 * nice level, but only linearly, not exponentially: 239 * proportional to the weight.
240 *
241 * s = p*w/rw
282 */ 242 */
283static long 243static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
284niced_granularity(struct sched_entity *curr, unsigned long granularity)
285{ 244{
286 u64 tmp; 245 u64 slice = __sched_period(cfs_rq->nr_running);
287 246
288 if (likely(curr->load.weight == NICE_0_LOAD)) 247 slice *= se->load.weight;
289 return granularity; 248 do_div(slice, cfs_rq->load.weight);
290 /*
291 * Positive nice levels get the same granularity as nice-0:
292 */
293 if (likely(curr->load.weight < NICE_0_LOAD)) {
294 tmp = curr->load.weight * (u64)granularity;
295 return (long) (tmp >> NICE_0_SHIFT);
296 }
297 /*
298 * Negative nice level tasks get linearly finer
299 * granularity:
300 */
301 tmp = curr->load.inv_weight * (u64)granularity;
302 249
303 /* 250 return slice;
304 * It will always fit into 'long':
305 */
306 return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT));
307} 251}
308 252
309static inline void 253/*
310limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) 254 * We calculate the vruntime slice.
255 *
256 * vs = s/w = p/rw
257 */
258static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
311{ 259{
312 long limit = sysctl_sched_runtime_limit; 260 u64 vslice = __sched_period(nr_running);
313 261
314 /* 262 do_div(vslice, rq_weight);
315 * Niced tasks have the same history dynamic range as 263
316 * non-niced tasks: 264 return vslice;
317 */
318 if (unlikely(se->wait_runtime > limit)) {
319 se->wait_runtime = limit;
320 schedstat_inc(se, wait_runtime_overruns);
321 schedstat_inc(cfs_rq, wait_runtime_overruns);
322 }
323 if (unlikely(se->wait_runtime < -limit)) {
324 se->wait_runtime = -limit;
325 schedstat_inc(se, wait_runtime_underruns);
326 schedstat_inc(cfs_rq, wait_runtime_underruns);
327 }
328} 265}
329 266
330static inline void 267static u64 sched_vslice(struct cfs_rq *cfs_rq)
331__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
332{ 268{
333 se->wait_runtime += delta; 269 return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
334 schedstat_add(se, sum_wait_runtime, delta);
335 limit_wait_runtime(cfs_rq, se);
336} 270}
337 271
338static void 272static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
339add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
340{ 273{
341 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); 274 return __sched_vslice(cfs_rq->load.weight + se->load.weight,
342 __add_wait_runtime(cfs_rq, se, delta); 275 cfs_rq->nr_running + 1);
343 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
344} 276}
345 277
346/* 278/*
@@ -348,46 +280,41 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
348 * are not in our scheduling class. 280 * are not in our scheduling class.
349 */ 281 */
350static inline void 282static inline void
351__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) 283__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
284 unsigned long delta_exec)
352{ 285{
353 unsigned long delta, delta_exec, delta_fair, delta_mine; 286 unsigned long delta_exec_weighted;
354 struct load_weight *lw = &cfs_rq->load; 287 u64 vruntime;
355 unsigned long load = lw->weight;
356 288
357 delta_exec = curr->delta_exec;
358 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 289 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
359 290
360 curr->sum_exec_runtime += delta_exec; 291 curr->sum_exec_runtime += delta_exec;
361 cfs_rq->exec_clock += delta_exec; 292 schedstat_add(cfs_rq, exec_clock, delta_exec);
362 293 delta_exec_weighted = delta_exec;
363 if (unlikely(!load)) 294 if (unlikely(curr->load.weight != NICE_0_LOAD)) {
364 return; 295 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
365 296 &curr->load);
366 delta_fair = calc_delta_fair(delta_exec, lw);
367 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
368
369 if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) {
370 delta = min((u64)delta_mine, cfs_rq->sleeper_bonus);
371 delta = min(delta, (unsigned long)(
372 (long)sysctl_sched_runtime_limit - curr->wait_runtime));
373 cfs_rq->sleeper_bonus -= delta;
374 delta_mine -= delta;
375 } 297 }
298 curr->vruntime += delta_exec_weighted;
376 299
377 cfs_rq->fair_clock += delta_fair;
378 /* 300 /*
379 * We executed delta_exec amount of time on the CPU, 301 * maintain cfs_rq->min_vruntime to be a monotonic increasing
380 * but we were only entitled to delta_mine amount of 302 * value tracking the leftmost vruntime in the tree.
381 * time during that period (if nr_running == 1 then
382 * the two values are equal)
383 * [Note: delta_mine - delta_exec is negative]:
384 */ 303 */
385 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); 304 if (first_fair(cfs_rq)) {
305 vruntime = min_vruntime(curr->vruntime,
306 __pick_next_entity(cfs_rq)->vruntime);
307 } else
308 vruntime = curr->vruntime;
309
310 cfs_rq->min_vruntime =
311 max_vruntime(cfs_rq->min_vruntime, vruntime);
386} 312}
387 313
388static void update_curr(struct cfs_rq *cfs_rq) 314static void update_curr(struct cfs_rq *cfs_rq)
389{ 315{
390 struct sched_entity *curr = cfs_rq_curr(cfs_rq); 316 struct sched_entity *curr = cfs_rq->curr;
317 u64 now = rq_of(cfs_rq)->clock;
391 unsigned long delta_exec; 318 unsigned long delta_exec;
392 319
393 if (unlikely(!curr)) 320 if (unlikely(!curr))
@@ -398,135 +325,47 @@ static void update_curr(struct cfs_rq *cfs_rq)
398 * since the last time we changed load (this cannot 325 * since the last time we changed load (this cannot
399 * overflow on 32 bits): 326 * overflow on 32 bits):
400 */ 327 */
401 delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); 328 delta_exec = (unsigned long)(now - curr->exec_start);
402
403 curr->delta_exec += delta_exec;
404 329
405 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { 330 __update_curr(cfs_rq, curr, delta_exec);
406 __update_curr(cfs_rq, curr); 331 curr->exec_start = now;
407 curr->delta_exec = 0;
408 }
409 curr->exec_start = rq_of(cfs_rq)->clock;
410} 332}
411 333
412static inline void 334static inline void
413update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 335update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
414{ 336{
415 se->wait_start_fair = cfs_rq->fair_clock;
416 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 337 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
417} 338}
418 339
419/* 340/*
420 * We calculate fair deltas here, so protect against the random effects
421 * of a multiplication overflow by capping it to the runtime limit:
422 */
423#if BITS_PER_LONG == 32
424static inline unsigned long
425calc_weighted(unsigned long delta, unsigned long weight, int shift)
426{
427 u64 tmp = (u64)delta * weight >> shift;
428
429 if (unlikely(tmp > sysctl_sched_runtime_limit*2))
430 return sysctl_sched_runtime_limit*2;
431 return tmp;
432}
433#else
434static inline unsigned long
435calc_weighted(unsigned long delta, unsigned long weight, int shift)
436{
437 return delta * weight >> shift;
438}
439#endif
440
441/*
442 * Task is being enqueued - update stats: 341 * Task is being enqueued - update stats:
443 */ 342 */
444static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 343static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
445{ 344{
446 s64 key;
447
448 /* 345 /*
449 * Are we enqueueing a waiting task? (for current tasks 346 * Are we enqueueing a waiting task? (for current tasks
450 * a dequeue/enqueue event is a NOP) 347 * a dequeue/enqueue event is a NOP)
451 */ 348 */
452 if (se != cfs_rq_curr(cfs_rq)) 349 if (se != cfs_rq->curr)
453 update_stats_wait_start(cfs_rq, se); 350 update_stats_wait_start(cfs_rq, se);
454 /*
455 * Update the key:
456 */
457 key = cfs_rq->fair_clock;
458
459 /*
460 * Optimize the common nice 0 case:
461 */
462 if (likely(se->load.weight == NICE_0_LOAD)) {
463 key -= se->wait_runtime;
464 } else {
465 u64 tmp;
466
467 if (se->wait_runtime < 0) {
468 tmp = -se->wait_runtime;
469 key += (tmp * se->load.inv_weight) >>
470 (WMULT_SHIFT - NICE_0_SHIFT);
471 } else {
472 tmp = se->wait_runtime;
473 key -= (tmp * se->load.inv_weight) >>
474 (WMULT_SHIFT - NICE_0_SHIFT);
475 }
476 }
477
478 se->fair_key = key;
479}
480
481/*
482 * Note: must be called with a freshly updated rq->fair_clock.
483 */
484static inline void
485__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
486{
487 unsigned long delta_fair = se->delta_fair_run;
488
489 schedstat_set(se->wait_max, max(se->wait_max,
490 rq_of(cfs_rq)->clock - se->wait_start));
491
492 if (unlikely(se->load.weight != NICE_0_LOAD))
493 delta_fair = calc_weighted(delta_fair, se->load.weight,
494 NICE_0_SHIFT);
495
496 add_wait_runtime(cfs_rq, se, delta_fair);
497} 351}
498 352
499static void 353static void
500update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 354update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
501{ 355{
502 unsigned long delta_fair; 356 schedstat_set(se->wait_max, max(se->wait_max,
503 357 rq_of(cfs_rq)->clock - se->wait_start));
504 if (unlikely(!se->wait_start_fair))
505 return;
506
507 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
508 (u64)(cfs_rq->fair_clock - se->wait_start_fair));
509
510 se->delta_fair_run += delta_fair;
511 if (unlikely(abs(se->delta_fair_run) >=
512 sysctl_sched_stat_granularity)) {
513 __update_stats_wait_end(cfs_rq, se);
514 se->delta_fair_run = 0;
515 }
516
517 se->wait_start_fair = 0;
518 schedstat_set(se->wait_start, 0); 358 schedstat_set(se->wait_start, 0);
519} 359}
520 360
521static inline void 361static inline void
522update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 362update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
523{ 363{
524 update_curr(cfs_rq);
525 /* 364 /*
526 * Mark the end of the wait period if dequeueing a 365 * Mark the end of the wait period if dequeueing a
527 * waiting task: 366 * waiting task:
528 */ 367 */
529 if (se != cfs_rq_curr(cfs_rq)) 368 if (se != cfs_rq->curr)
530 update_stats_wait_end(cfs_rq, se); 369 update_stats_wait_end(cfs_rq, se);
531} 370}
532 371
@@ -542,79 +381,28 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
542 se->exec_start = rq_of(cfs_rq)->clock; 381 se->exec_start = rq_of(cfs_rq)->clock;
543} 382}
544 383
545/*
546 * We are descheduling a task - update its stats:
547 */
548static inline void
549update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{
551 se->exec_start = 0;
552}
553
554/************************************************** 384/**************************************************
555 * Scheduling class queueing methods: 385 * Scheduling class queueing methods:
556 */ 386 */
557 387
558static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 388static void
389account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
559{ 390{
560 unsigned long load = cfs_rq->load.weight, delta_fair; 391 update_load_add(&cfs_rq->load, se->load.weight);
561 long prev_runtime; 392 cfs_rq->nr_running++;
562 393 se->on_rq = 1;
563 /* 394}
564 * Do not boost sleepers if there's too much bonus 'in flight'
565 * already:
566 */
567 if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
568 return;
569
570 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
571 load = rq_of(cfs_rq)->cpu_load[2];
572
573 delta_fair = se->delta_fair_sleep;
574
575 /*
576 * Fix up delta_fair with the effect of us running
577 * during the whole sleep period:
578 */
579 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
580 delta_fair = div64_likely32((u64)delta_fair * load,
581 load + se->load.weight);
582
583 if (unlikely(se->load.weight != NICE_0_LOAD))
584 delta_fair = calc_weighted(delta_fair, se->load.weight,
585 NICE_0_SHIFT);
586
587 prev_runtime = se->wait_runtime;
588 __add_wait_runtime(cfs_rq, se, delta_fair);
589 delta_fair = se->wait_runtime - prev_runtime;
590 395
591 /* 396static void
592 * Track the amount of bonus we've given to sleepers: 397account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
593 */ 398{
594 cfs_rq->sleeper_bonus += delta_fair; 399 update_load_sub(&cfs_rq->load, se->load.weight);
400 cfs_rq->nr_running--;
401 se->on_rq = 0;
595} 402}
596 403
597static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 404static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
598{ 405{
599 struct task_struct *tsk = task_of(se);
600 unsigned long delta_fair;
601
602 if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
603 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS))
604 return;
605
606 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
607 (u64)(cfs_rq->fair_clock - se->sleep_start_fair));
608
609 se->delta_fair_sleep += delta_fair;
610 if (unlikely(abs(se->delta_fair_sleep) >=
611 sysctl_sched_stat_granularity)) {
612 __enqueue_sleeper(cfs_rq, se);
613 se->delta_fair_sleep = 0;
614 }
615
616 se->sleep_start_fair = 0;
617
618#ifdef CONFIG_SCHEDSTATS 406#ifdef CONFIG_SCHEDSTATS
619 if (se->sleep_start) { 407 if (se->sleep_start) {
620 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 408 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
@@ -646,6 +434,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
646 * time that the task spent sleeping: 434 * time that the task spent sleeping:
647 */ 435 */
648 if (unlikely(prof_on == SLEEP_PROFILING)) { 436 if (unlikely(prof_on == SLEEP_PROFILING)) {
437 struct task_struct *tsk = task_of(se);
438
649 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 439 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
650 delta >> 20); 440 delta >> 20);
651 } 441 }
@@ -653,27 +443,81 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
653#endif 443#endif
654} 444}
655 445
446static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
447{
448#ifdef CONFIG_SCHED_DEBUG
449 s64 d = se->vruntime - cfs_rq->min_vruntime;
450
451 if (d < 0)
452 d = -d;
453
454 if (d > 3*sysctl_sched_latency)
455 schedstat_inc(cfs_rq, nr_spread_over);
456#endif
457}
458
459static void
460place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
461{
462 u64 vruntime;
463
464 vruntime = cfs_rq->min_vruntime;
465
466 if (sched_feat(TREE_AVG)) {
467 struct sched_entity *last = __pick_last_entity(cfs_rq);
468 if (last) {
469 vruntime += last->vruntime;
470 vruntime >>= 1;
471 }
472 } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
473 vruntime += sched_vslice(cfs_rq)/2;
474
475 if (initial && sched_feat(START_DEBIT))
476 vruntime += sched_vslice_add(cfs_rq, se);
477
478 if (!initial) {
479 if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
480 task_of(se)->policy != SCHED_BATCH)
481 vruntime -= sysctl_sched_latency;
482
483 vruntime = max_t(s64, vruntime, se->vruntime);
484 }
485
486 se->vruntime = vruntime;
487
488}
489
656static void 490static void
657enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) 491enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
658{ 492{
659 /* 493 /*
660 * Update the fair clock. 494 * Update run-time statistics of the 'current'.
661 */ 495 */
662 update_curr(cfs_rq); 496 update_curr(cfs_rq);
663 497
664 if (wakeup) 498 if (wakeup) {
499 place_entity(cfs_rq, se, 0);
665 enqueue_sleeper(cfs_rq, se); 500 enqueue_sleeper(cfs_rq, se);
501 }
666 502
667 update_stats_enqueue(cfs_rq, se); 503 update_stats_enqueue(cfs_rq, se);
668 __enqueue_entity(cfs_rq, se); 504 check_spread(cfs_rq, se);
505 if (se != cfs_rq->curr)
506 __enqueue_entity(cfs_rq, se);
507 account_entity_enqueue(cfs_rq, se);
669} 508}
670 509
671static void 510static void
672dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 511dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
673{ 512{
513 /*
514 * Update run-time statistics of the 'current'.
515 */
516 update_curr(cfs_rq);
517
674 update_stats_dequeue(cfs_rq, se); 518 update_stats_dequeue(cfs_rq, se);
675 if (sleep) { 519 if (sleep) {
676 se->sleep_start_fair = cfs_rq->fair_clock; 520 se->peer_preempt = 0;
677#ifdef CONFIG_SCHEDSTATS 521#ifdef CONFIG_SCHEDSTATS
678 if (entity_is_task(se)) { 522 if (entity_is_task(se)) {
679 struct task_struct *tsk = task_of(se); 523 struct task_struct *tsk = task_of(se);
@@ -685,68 +529,66 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
685 } 529 }
686#endif 530#endif
687 } 531 }
688 __dequeue_entity(cfs_rq, se); 532
533 if (se != cfs_rq->curr)
534 __dequeue_entity(cfs_rq, se);
535 account_entity_dequeue(cfs_rq, se);
689} 536}
690 537
691/* 538/*
692 * Preempt the current task with a newly woken task if needed: 539 * Preempt the current task with a newly woken task if needed:
693 */ 540 */
694static void 541static void
695__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, 542check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
696 struct sched_entity *curr, unsigned long granularity)
697{ 543{
698 s64 __delta = curr->fair_key - se->fair_key;
699 unsigned long ideal_runtime, delta_exec; 544 unsigned long ideal_runtime, delta_exec;
700 545
701 /* 546 ideal_runtime = sched_slice(cfs_rq, curr);
702 * ideal_runtime is compared against sum_exec_runtime, which is
703 * walltime, hence do not scale.
704 */
705 ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running,
706 (unsigned long)sysctl_sched_min_granularity);
707
708 /*
709 * If we executed more than what the latency constraint suggests,
710 * reduce the rescheduling granularity. This way the total latency
711 * of how much a task is not scheduled converges to
712 * sysctl_sched_latency:
713 */
714 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 547 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
715 if (delta_exec > ideal_runtime) 548 if (delta_exec > ideal_runtime ||
716 granularity = 0; 549 (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt))
717
718 /*
719 * Take scheduling granularity into account - do not
720 * preempt the current task unless the best task has
721 * a larger than sched_granularity fairness advantage:
722 *
723 * scale granularity as key space is in fair_clock.
724 */
725 if (__delta > niced_granularity(curr, granularity))
726 resched_task(rq_of(cfs_rq)->curr); 550 resched_task(rq_of(cfs_rq)->curr);
551 curr->peer_preempt = 0;
727} 552}
728 553
729static inline void 554static void
730set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 555set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
731{ 556{
557 /* 'current' is not kept within the tree. */
558 if (se->on_rq) {
559 /*
 560 * Any task has to be enqueued before it gets to execute on
561 * a CPU. So account for the time it spent waiting on the
562 * runqueue.
563 */
564 update_stats_wait_end(cfs_rq, se);
565 __dequeue_entity(cfs_rq, se);
566 }
567
568 update_stats_curr_start(cfs_rq, se);
569 cfs_rq->curr = se;
570#ifdef CONFIG_SCHEDSTATS
732 /* 571 /*
733 * Any task has to be enqueued before it get to execute on 572 * Track our maximum slice length, if the CPU's load is at
734 * a CPU. So account for the time it spent waiting on the 573 * least twice that of our own weight (i.e. dont track it
735 * runqueue. (note, here we rely on pick_next_task() having 574 * when there are only lesser-weight tasks around):
736 * done a put_prev_task_fair() shortly before this, which
737 * updated rq->fair_clock - used by update_stats_wait_end())
738 */ 575 */
739 update_stats_wait_end(cfs_rq, se); 576 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
740 update_stats_curr_start(cfs_rq, se); 577 se->slice_max = max(se->slice_max,
741 set_cfs_rq_curr(cfs_rq, se); 578 se->sum_exec_runtime - se->prev_sum_exec_runtime);
579 }
580#endif
742 se->prev_sum_exec_runtime = se->sum_exec_runtime; 581 se->prev_sum_exec_runtime = se->sum_exec_runtime;
743} 582}
744 583
745static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 584static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
746{ 585{
747 struct sched_entity *se = __pick_next_entity(cfs_rq); 586 struct sched_entity *se = NULL;
748 587
749 set_next_entity(cfs_rq, se); 588 if (first_fair(cfs_rq)) {
589 se = __pick_next_entity(cfs_rq);
590 set_next_entity(cfs_rq, se);
591 }
750 592
751 return se; 593 return se;
752} 594}
@@ -760,33 +602,24 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
760 if (prev->on_rq) 602 if (prev->on_rq)
761 update_curr(cfs_rq); 603 update_curr(cfs_rq);
762 604
763 update_stats_curr_end(cfs_rq, prev); 605 check_spread(cfs_rq, prev);
764 606 if (prev->on_rq) {
765 if (prev->on_rq)
766 update_stats_wait_start(cfs_rq, prev); 607 update_stats_wait_start(cfs_rq, prev);
767 set_cfs_rq_curr(cfs_rq, NULL); 608 /* Put 'current' back into the tree. */
609 __enqueue_entity(cfs_rq, prev);
610 }
611 cfs_rq->curr = NULL;
768} 612}
769 613
770static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 614static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
771{ 615{
772 struct sched_entity *next;
773
774 /* 616 /*
775 * Dequeue and enqueue the task to update its 617 * Update run-time statistics of the 'current'.
776 * position within the tree:
777 */ 618 */
778 dequeue_entity(cfs_rq, curr, 0); 619 update_curr(cfs_rq);
779 enqueue_entity(cfs_rq, curr, 0);
780
781 /*
782 * Reschedule if another task tops the current one.
783 */
784 next = __pick_next_entity(cfs_rq);
785 if (next == curr)
786 return;
787 620
788 __check_preempt_curr_fair(cfs_rq, next, curr, 621 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
789 sched_granularity(cfs_rq)); 622 check_preempt_tick(cfs_rq, curr);
790} 623}
791 624
792/************************************************** 625/**************************************************
@@ -821,23 +654,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
821 */ 654 */
822static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) 655static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
823{ 656{
824 /* A later patch will take group into account */ 657 return cfs_rq->tg->cfs_rq[this_cpu];
825 return &cpu_rq(this_cpu)->cfs;
826} 658}
827 659
828/* Iterate thr' all leaf cfs_rq's on a runqueue */ 660/* Iterate thr' all leaf cfs_rq's on a runqueue */
829#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 661#define for_each_leaf_cfs_rq(rq, cfs_rq) \
830 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 662 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
831 663
832/* Do the two (enqueued) tasks belong to the same group ? */ 664/* Do the two (enqueued) entities belong to the same group ? */
833static inline int is_same_group(struct task_struct *curr, struct task_struct *p) 665static inline int
666is_same_group(struct sched_entity *se, struct sched_entity *pse)
834{ 667{
835 if (curr->se.cfs_rq == p->se.cfs_rq) 668 if (se->cfs_rq == pse->cfs_rq)
836 return 1; 669 return 1;
837 670
838 return 0; 671 return 0;
839} 672}
840 673
674static inline struct sched_entity *parent_entity(struct sched_entity *se)
675{
676 return se->parent;
677}
678
841#else /* CONFIG_FAIR_GROUP_SCHED */ 679#else /* CONFIG_FAIR_GROUP_SCHED */
842 680
843#define for_each_sched_entity(se) \ 681#define for_each_sched_entity(se) \
@@ -870,11 +708,17 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
870#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 708#define for_each_leaf_cfs_rq(rq, cfs_rq) \
871 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 709 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
872 710
873static inline int is_same_group(struct task_struct *curr, struct task_struct *p) 711static inline int
712is_same_group(struct sched_entity *se, struct sched_entity *pse)
874{ 713{
875 return 1; 714 return 1;
876} 715}
877 716
717static inline struct sched_entity *parent_entity(struct sched_entity *se)
718{
719 return NULL;
720}
721
878#endif /* CONFIG_FAIR_GROUP_SCHED */ 722#endif /* CONFIG_FAIR_GROUP_SCHED */
879 723
880/* 724/*
@@ -892,6 +736,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
892 break; 736 break;
893 cfs_rq = cfs_rq_of(se); 737 cfs_rq = cfs_rq_of(se);
894 enqueue_entity(cfs_rq, se, wakeup); 738 enqueue_entity(cfs_rq, se, wakeup);
739 wakeup = 1;
895 } 740 }
896} 741}
897 742
@@ -911,6 +756,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
911 /* Don't dequeue parent if it has other entities besides us */ 756 /* Don't dequeue parent if it has other entities besides us */
912 if (cfs_rq->load.weight) 757 if (cfs_rq->load.weight)
913 break; 758 break;
759 sleep = 1;
914 } 760 }
915} 761}
916 762
@@ -919,12 +765,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
919 * 765 *
920 * If compat_yield is turned on then we requeue to the end of the tree. 766 * If compat_yield is turned on then we requeue to the end of the tree.
921 */ 767 */
922static void yield_task_fair(struct rq *rq, struct task_struct *p) 768static void yield_task_fair(struct rq *rq)
923{ 769{
924 struct cfs_rq *cfs_rq = task_cfs_rq(p); 770 struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr);
925 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 771 struct sched_entity *rightmost, *se = &rq->curr->se;
926 struct sched_entity *rightmost, *se = &p->se;
927 struct rb_node *parent;
928 772
929 /* 773 /*
930 * Are we the only task in the tree? 774 * Are we the only task in the tree?
@@ -935,52 +779,39 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p)
935 if (likely(!sysctl_sched_compat_yield)) { 779 if (likely(!sysctl_sched_compat_yield)) {
936 __update_rq_clock(rq); 780 __update_rq_clock(rq);
937 /* 781 /*
938 * Dequeue and enqueue the task to update its 782 * Update run-time statistics of the 'current'.
939 * position within the tree:
940 */ 783 */
941 dequeue_entity(cfs_rq, &p->se, 0); 784 update_curr(cfs_rq);
942 enqueue_entity(cfs_rq, &p->se, 0);
943 785
944 return; 786 return;
945 } 787 }
946 /* 788 /*
947 * Find the rightmost entry in the rbtree: 789 * Find the rightmost entry in the rbtree:
948 */ 790 */
949 do { 791 rightmost = __pick_last_entity(cfs_rq);
950 parent = *link;
951 link = &parent->rb_right;
952 } while (*link);
953
954 rightmost = rb_entry(parent, struct sched_entity, run_node);
955 /* 792 /*
956 * Already in the rightmost position? 793 * Already in the rightmost position?
957 */ 794 */
958 if (unlikely(rightmost == se)) 795 if (unlikely(rightmost->vruntime < se->vruntime))
959 return; 796 return;
960 797
961 /* 798 /*
962 * Minimally necessary key value to be last in the tree: 799 * Minimally necessary key value to be last in the tree:
800 * Upon rescheduling, sched_class::put_prev_task() will place
801 * 'current' within the tree based on its new key value.
963 */ 802 */
964 se->fair_key = rightmost->fair_key + 1; 803 se->vruntime = rightmost->vruntime + 1;
965
966 if (cfs_rq->rb_leftmost == &se->run_node)
967 cfs_rq->rb_leftmost = rb_next(&se->run_node);
968 /*
969 * Relink the task to the rightmost position:
970 */
971 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
972 rb_link_node(&se->run_node, parent, link);
973 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
974} 804}
975 805
976/* 806/*
977 * Preempt the current task with a newly woken task if needed: 807 * Preempt the current task with a newly woken task if needed:
978 */ 808 */
979static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) 809static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
980{ 810{
981 struct task_struct *curr = rq->curr; 811 struct task_struct *curr = rq->curr;
982 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 812 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
983 unsigned long gran; 813 struct sched_entity *se = &curr->se, *pse = &p->se;
814 s64 delta, gran;
984 815
985 if (unlikely(rt_prio(p->prio))) { 816 if (unlikely(rt_prio(p->prio))) {
986 update_rq_clock(rq); 817 update_rq_clock(rq);
@@ -988,16 +819,31 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
988 resched_task(curr); 819 resched_task(curr);
989 return; 820 return;
990 } 821 }
991
992 gran = sysctl_sched_wakeup_granularity;
993 /* 822 /*
994 * Batch tasks prefer throughput over latency: 823 * Batch tasks do not preempt (their preemption is driven by
824 * the tick):
995 */ 825 */
996 if (unlikely(p->policy == SCHED_BATCH)) 826 if (unlikely(p->policy == SCHED_BATCH))
997 gran = sysctl_sched_batch_wakeup_granularity; 827 return;
828
829 if (sched_feat(WAKEUP_PREEMPT)) {
830 while (!is_same_group(se, pse)) {
831 se = parent_entity(se);
832 pse = parent_entity(pse);
833 }
834
835 delta = se->vruntime - pse->vruntime;
836 gran = sysctl_sched_wakeup_granularity;
837 if (unlikely(se->load.weight != NICE_0_LOAD))
838 gran = calc_delta_fair(gran, &se->load);
839
840 if (delta > gran) {
841 int now = !sched_feat(PREEMPT_RESTRICT);
998 842
999 if (is_same_group(curr, p)) 843 if (now || p->prio < curr->prio || !se->peer_preempt++)
1000 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); 844 resched_task(curr);
845 }
846 }
1001} 847}
1002 848
1003static struct task_struct *pick_next_task_fair(struct rq *rq) 849static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1041,7 +887,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1041 * achieve that by always pre-iterating before returning 887 * achieve that by always pre-iterating before returning
1042 * the current task: 888 * the current task:
1043 */ 889 */
1044static inline struct task_struct * 890static struct task_struct *
1045__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) 891__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
1046{ 892{
1047 struct task_struct *p; 893 struct task_struct *p;
@@ -1078,7 +924,10 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
1078 if (!cfs_rq->nr_running) 924 if (!cfs_rq->nr_running)
1079 return MAX_PRIO; 925 return MAX_PRIO;
1080 926
1081 curr = __pick_next_entity(cfs_rq); 927 curr = cfs_rq->curr;
928 if (!curr)
929 curr = __pick_next_entity(cfs_rq);
930
1082 p = task_of(curr); 931 p = task_of(curr);
1083 932
1084 return p->prio; 933 return p->prio;
@@ -1153,6 +1002,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1153 } 1002 }
1154} 1003}
1155 1004
1005#define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
1006
1156/* 1007/*
1157 * Share the fairness runtime between parent and child, thus the 1008 * Share the fairness runtime between parent and child, thus the
1158 * total amount of pressure for CPU stays equal - new tasks 1009 * total amount of pressure for CPU stays equal - new tasks
@@ -1163,37 +1014,28 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1163static void task_new_fair(struct rq *rq, struct task_struct *p) 1014static void task_new_fair(struct rq *rq, struct task_struct *p)
1164{ 1015{
1165 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1016 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1166 struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); 1017 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1018 int this_cpu = smp_processor_id();
1167 1019
1168 sched_info_queued(p); 1020 sched_info_queued(p);
1169 1021
1170 update_curr(cfs_rq); 1022 update_curr(cfs_rq);
1171 update_stats_enqueue(cfs_rq, se); 1023 place_entity(cfs_rq, se, 1);
1172 /*
1173 * Child runs first: we let it run before the parent
1174 * until it reschedules once. We set up the key so that
1175 * it will preempt the parent:
1176 */
1177 se->fair_key = curr->fair_key -
1178 niced_granularity(curr, sched_granularity(cfs_rq)) - 1;
1179 /*
1180 * The first wait is dominated by the child-runs-first logic,
1181 * so do not credit it with that waiting time yet:
1182 */
1183 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
1184 se->wait_start_fair = 0;
1185 1024
1186 /* 1025 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1187 * The statistical average of wait_runtime is about 1026 curr->vruntime < se->vruntime) {
1188 * -granularity/2, so initialize the task with that: 1027 /*
1189 */ 1028 * Upon rescheduling, sched_class::put_prev_task() will place
1190 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) 1029 * 'current' within the tree based on its new key value.
1191 se->wait_runtime = -(sched_granularity(cfs_rq) / 2); 1030 */
1031 swap(curr->vruntime, se->vruntime);
1032 }
1192 1033
1193 __enqueue_entity(cfs_rq, se); 1034 se->peer_preempt = 0;
1035 enqueue_task_fair(rq, p, 0);
1036 resched_task(rq->curr);
1194} 1037}
1195 1038
1196#ifdef CONFIG_FAIR_GROUP_SCHED
1197/* Account for a task changing its policy or group. 1039/* Account for a task changing its policy or group.
1198 * 1040 *
1199 * This routine is mostly called to set cfs_rq->curr field when a task 1041 * This routine is mostly called to set cfs_rq->curr field when a task
@@ -1206,21 +1048,17 @@ static void set_curr_task_fair(struct rq *rq)
1206 for_each_sched_entity(se) 1048 for_each_sched_entity(se)
1207 set_next_entity(cfs_rq_of(se), se); 1049 set_next_entity(cfs_rq_of(se), se);
1208} 1050}
1209#else
1210static void set_curr_task_fair(struct rq *rq)
1211{
1212}
1213#endif
1214 1051
1215/* 1052/*
1216 * All the scheduling class methods: 1053 * All the scheduling class methods:
1217 */ 1054 */
1218struct sched_class fair_sched_class __read_mostly = { 1055static const struct sched_class fair_sched_class = {
1056 .next = &idle_sched_class,
1219 .enqueue_task = enqueue_task_fair, 1057 .enqueue_task = enqueue_task_fair,
1220 .dequeue_task = dequeue_task_fair, 1058 .dequeue_task = dequeue_task_fair,
1221 .yield_task = yield_task_fair, 1059 .yield_task = yield_task_fair,
1222 1060
1223 .check_preempt_curr = check_preempt_curr_fair, 1061 .check_preempt_curr = check_preempt_wakeup,
1224 1062
1225 .pick_next_task = pick_next_task_fair, 1063 .pick_next_task = pick_next_task_fair,
1226 .put_prev_task = put_prev_task_fair, 1064 .put_prev_task = put_prev_task_fair,
@@ -1237,6 +1075,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1237{ 1075{
1238 struct cfs_rq *cfs_rq; 1076 struct cfs_rq *cfs_rq;
1239 1077
1078#ifdef CONFIG_FAIR_GROUP_SCHED
1079 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
1080#endif
1240 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1081 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1241 print_cfs_rq(m, cpu, cfs_rq); 1082 print_cfs_rq(m, cpu, cfs_rq);
1242} 1083}
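
The sched_fair.c changes above replace the old fair_clock/wait_runtime bookkeeping with per-entity vruntime: runtime is scaled by NICE_0_LOAD/weight as it is accounted, and the entity with the smallest vruntime (the leftmost rbtree node) runs next. The stand-alone user-space sketch below illustrates that accounting rule only; it is not kernel code, and the task names, the 1ms tick and the 3121 weight chosen for the "nice -5" task are illustrative values.

/* vruntime_sketch.c -- toy illustration of CFS-style virtual runtime.
 * Not kernel code: task set, weights and tick length are invented for
 * demonstration.  Build with: cc -o vruntime_sketch vruntime_sketch.c
 */
#include <stdio.h>

#define NICE_0_WEIGHT 1024        /* weight of a nice-0 task */
#define TICK_NS       1000000ULL  /* pretend each pick runs the task for 1ms */

struct toy_task {
        const char *name;
        unsigned long weight;          /* higher weight => slower vruntime */
        unsigned long long vruntime;   /* weighted virtual runtime, in ns */
        unsigned long long sum_exec;   /* real CPU time received, in ns */
};

/* Advance the running task's clocks: real time by delta, virtual time by
 * delta scaled inversely with the task's weight (nice-0 runs at wall speed).
 */
static void account(struct toy_task *t, unsigned long long delta_ns)
{
        t->sum_exec += delta_ns;
        t->vruntime += delta_ns * NICE_0_WEIGHT / t->weight;
}

/* Pick the task with the smallest vruntime -- the "leftmost" entity. */
static struct toy_task *pick_next(struct toy_task *rq, int nr)
{
        struct toy_task *best = &rq[0];
        int i;

        for (i = 1; i < nr; i++)
                if (rq[i].vruntime < best->vruntime)
                        best = &rq[i];
        return best;
}

int main(void)
{
        struct toy_task rq[] = {
                { "nice0-a", 1024, 0, 0 },
                { "nice0-b", 1024, 0, 0 },
                { "nice-5",  3121, 0, 0 },  /* heavier weight => bigger share */
        };
        int nr = sizeof(rq) / sizeof(rq[0]);
        int tick, i;

        for (tick = 0; tick < 3000; tick++)
                account(pick_next(rq, nr), TICK_NS);

        for (i = 0; i < nr; i++)
                printf("%-8s weight=%4lu cpu=%6llums vruntime=%llums\n",
                       rq[i].name, rq[i].weight,
                       rq[i].sum_exec / 1000000ULL,
                       rq[i].vruntime / 1000000ULL);
        return 0;
}

Running it for a few simulated seconds shows the heavier-weighted task receiving roughly three times the CPU of each nice-0 task while all three finish with nearly equal vruntime, which is the invariant the rbtree ordering preserves.
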
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3503fb2d9f96..6e2ead41516e 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -50,10 +50,15 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr)
50{ 50{
51} 51}
52 52
53static void set_curr_task_idle(struct rq *rq)
54{
55}
56
53/* 57/*
54 * Simple, special scheduling class for the per-CPU idle tasks: 58 * Simple, special scheduling class for the per-CPU idle tasks:
55 */ 59 */
56static struct sched_class idle_sched_class __read_mostly = { 60const struct sched_class idle_sched_class = {
61 /* .next is NULL */
57 /* no enqueue/yield_task for idle tasks */ 62 /* no enqueue/yield_task for idle tasks */
58 63
59 /* dequeue is not valid, we print a debug message there: */ 64 /* dequeue is not valid, we print a debug message there: */
@@ -66,6 +71,7 @@ static struct sched_class idle_sched_class __read_mostly = {
66 71
67 .load_balance = load_balance_idle, 72 .load_balance = load_balance_idle,
68 73
74 .set_curr_task = set_curr_task_idle,
69 .task_tick = task_tick_idle, 75 .task_tick = task_tick_idle,
70 /* no .task_new for idle tasks */ 76 /* no .task_new for idle tasks */
71}; 77};
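
With set_curr_task_idle() added and .next wired up, the scheduling classes now form an explicit priority chain (rt -> fair -> idle) that the core walks until some class yields a runnable task. A minimal user-space sketch of that dispatch pattern follows; the toy_class type, the pick functions and the returned task names are invented for illustration and are not the kernel's pick_next_task() implementation.

/* class_chain.c -- toy illustration of the sched_class .next chain. */
#include <stdio.h>
#include <stddef.h>

struct toy_class {
        const char *name;
        const struct toy_class *next;    /* next lower-priority class */
        const char *(*pick_next)(void);  /* NULL result: nothing runnable */
};

/* Pretend only the fair class has a runnable task at the moment. */
static const char *pick_rt(void)   { return NULL; }
static const char *pick_fair(void) { return "some-user-task"; }
static const char *pick_idle(void) { return "swapper/0"; }  /* never fails */

static const struct toy_class idle_class = { "idle", NULL,        pick_idle };
static const struct toy_class fair_class = { "fair", &idle_class, pick_fair };
static const struct toy_class rt_class   = { "rt",   &fair_class, pick_rt   };

int main(void)
{
        const struct toy_class *class;
        const char *task;

        /* Walk the chain from the highest-priority class downwards. */
        for (class = &rt_class; class; class = class->next) {
                task = class->pick_next();
                if (task) {
                        printf("picked '%s' from the %s class\n",
                               task, class->name);
                        break;
                }
        }
        return 0;
}
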
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 4b87476a02d0..d0097a0634e5 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -7,7 +7,7 @@
7 * Update the current task's runtime statistics. Skip current tasks that 7 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class. 8 * are not in our scheduling class.
9 */ 9 */
10static inline void update_curr_rt(struct rq *rq) 10static void update_curr_rt(struct rq *rq)
11{ 11{
12 struct task_struct *curr = rq->curr; 12 struct task_struct *curr = rq->curr;
13 u64 delta_exec; 13 u64 delta_exec;
@@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p)
59} 59}
60 60
61static void 61static void
62yield_task_rt(struct rq *rq, struct task_struct *p) 62yield_task_rt(struct rq *rq)
63{ 63{
64 requeue_task_rt(rq, p); 64 requeue_task_rt(rq, rq->curr);
65} 65}
66 66
67/* 67/*
@@ -206,7 +206,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
206 if (--p->time_slice) 206 if (--p->time_slice)
207 return; 207 return;
208 208
209 p->time_slice = static_prio_timeslice(p->static_prio); 209 p->time_slice = DEF_TIMESLICE;
210 210
211 /* 211 /*
212 * Requeue to the end of queue if we are not the only element 212 * Requeue to the end of queue if we are not the only element
@@ -218,7 +218,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
218 } 218 }
219} 219}
220 220
221static struct sched_class rt_sched_class __read_mostly = { 221static void set_curr_task_rt(struct rq *rq)
222{
223 struct task_struct *p = rq->curr;
224
225 p->se.exec_start = rq->clock;
226}
227
228const struct sched_class rt_sched_class = {
229 .next = &fair_sched_class,
222 .enqueue_task = enqueue_task_rt, 230 .enqueue_task = enqueue_task_rt,
223 .dequeue_task = dequeue_task_rt, 231 .dequeue_task = dequeue_task_rt,
224 .yield_task = yield_task_rt, 232 .yield_task = yield_task_rt,
@@ -230,5 +238,6 @@ static struct sched_class rt_sched_class __read_mostly = {
230 238
231 .load_balance = load_balance_rt, 239 .load_balance = load_balance_rt,
232 240
241 .set_curr_task = set_curr_task_rt,
233 .task_tick = task_tick_rt, 242 .task_tick = task_tick_rt,
234}; 243};
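
task_tick_rt() above now refills an expired slice with the fixed DEF_TIMESLICE and requeues the task behind its equal-priority peers. The round-robin effect can be sketched in user space as below; the slice length in ticks, the queue size and the task names are made up for the demo.

/* rt_rr_sketch.c -- toy round-robin at one RT priority level.
 * DEF_SLICE_TICKS and the queue size are invented; the kernel's
 * DEF_TIMESLICE is defined elsewhere in sched.c.
 */
#include <stdio.h>

#define DEF_SLICE_TICKS 4
#define NR_TASKS        3

struct rr_task {
        const char *name;
        int time_slice;
};

int main(void)
{
        struct rr_task queue[NR_TASKS] = {
                { "rt-a", DEF_SLICE_TICKS },
                { "rt-b", DEF_SLICE_TICKS },
                { "rt-c", DEF_SLICE_TICKS },
        };
        int head = 0, tick;

        for (tick = 0; tick < 20; tick++) {
                struct rr_task *curr = &queue[head];

                printf("tick %2d: running %s (slice left %d)\n",
                       tick, curr->name, curr->time_slice);

                /* Mirror task_tick_rt(): only act when the slice is used up. */
                if (--curr->time_slice)
                        continue;

                /* Refill the slice and "requeue to the end" of this
                 * priority level by moving on to the next task. */
                curr->time_slice = DEF_SLICE_TICKS;
                head = (head + 1) % NR_TASKS;
        }
        return 0;
}
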
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index c20a94dda61e..ef1a7df80ea2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -16,18 +16,18 @@ static int show_schedstat(struct seq_file *seq, void *v)
16 struct rq *rq = cpu_rq(cpu); 16 struct rq *rq = cpu_rq(cpu);
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 struct sched_domain *sd; 18 struct sched_domain *sd;
19 int dcnt = 0; 19 int dcount = 0;
20#endif 20#endif
21 21
22 /* runqueue-specific stats */ 22 /* runqueue-specific stats */
23 seq_printf(seq, 23 seq_printf(seq,
24 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", 24 "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu",
25 cpu, rq->yld_both_empty, 25 cpu, rq->yld_both_empty,
26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, 26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
27 rq->sched_switch, rq->sched_cnt, rq->sched_goidle, 27 rq->sched_switch, rq->sched_count, rq->sched_goidle,
28 rq->ttwu_cnt, rq->ttwu_local, 28 rq->ttwu_count, rq->ttwu_local,
29 rq->rq_sched_info.cpu_time, 29 rq->rq_sched_info.cpu_time,
30 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); 30 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
31 31
32 seq_printf(seq, "\n"); 32 seq_printf(seq, "\n");
33 33
@@ -39,12 +39,11 @@ static int show_schedstat(struct seq_file *seq, void *v)
39 char mask_str[NR_CPUS]; 39 char mask_str[NR_CPUS];
40 40
41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span); 41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
42 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 42 seq_printf(seq, "domain%d %s", dcount++, mask_str);
43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
44 itype++) { 44 itype++) {
45 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " 45 seq_printf(seq, " %u %u %u %u %u %u %u %u",
46 "%lu", 46 sd->lb_count[itype],
47 sd->lb_cnt[itype],
48 sd->lb_balanced[itype], 47 sd->lb_balanced[itype],
49 sd->lb_failed[itype], 48 sd->lb_failed[itype],
50 sd->lb_imbalance[itype], 49 sd->lb_imbalance[itype],
@@ -53,11 +52,10 @@ static int show_schedstat(struct seq_file *seq, void *v)
53 sd->lb_nobusyq[itype], 52 sd->lb_nobusyq[itype],
54 sd->lb_nobusyg[itype]); 53 sd->lb_nobusyg[itype]);
55 } 54 }
56 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" 55 seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u %u\n",
57 " %lu %lu %lu\n", 56 sd->alb_count, sd->alb_failed, sd->alb_pushed,
58 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 57 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
59 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 58 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
60 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
61 sd->ttwu_wake_remote, sd->ttwu_move_affine, 59 sd->ttwu_wake_remote, sd->ttwu_move_affine,
62 sd->ttwu_move_balance); 60 sd->ttwu_move_balance);
63 } 61 }
@@ -101,7 +99,7 @@ rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
101{ 99{
102 if (rq) { 100 if (rq) {
103 rq->rq_sched_info.run_delay += delta; 101 rq->rq_sched_info.run_delay += delta;
104 rq->rq_sched_info.pcnt++; 102 rq->rq_sched_info.pcount++;
105 } 103 }
106} 104}
107 105
@@ -129,7 +127,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
129# define schedstat_set(var, val) do { } while (0) 127# define schedstat_set(var, val) do { } while (0)
130#endif 128#endif
131 129
132#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 130#ifdef CONFIG_SCHEDSTATS
133/* 131/*
134 * Called when a process is dequeued from the active array and given 132 * Called when a process is dequeued from the active array and given
135 * the cpu. We should note that with the exception of interactive 133 * the cpu. We should note that with the exception of interactive
@@ -164,7 +162,7 @@ static void sched_info_arrive(struct task_struct *t)
164 sched_info_dequeued(t); 162 sched_info_dequeued(t);
165 t->sched_info.run_delay += delta; 163 t->sched_info.run_delay += delta;
166 t->sched_info.last_arrival = now; 164 t->sched_info.last_arrival = now;
167 t->sched_info.pcnt++; 165 t->sched_info.pcount++;
168 166
169 rq_sched_info_arrive(task_rq(t), delta); 167 rq_sched_info_arrive(task_rq(t), delta);
170} 168}
@@ -233,5 +231,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
233#else 231#else
234#define sched_info_queued(t) do { } while (0) 232#define sched_info_queued(t) do { } while (0)
235#define sched_info_switch(t, next) do { } while (0) 233#define sched_info_switch(t, next) do { } while (0)
236#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 234#endif /* CONFIG_SCHEDSTATS */
237 235
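
show_schedstat() now prints the renamed *_count fields with %u instead of %lu. A user-space reader only sees whitespace-separated integers, so parsing is unaffected by the width change; the sketch below pulls cpu_time, run_delay and pcount from each cpu line, assuming the field order printed above (other schedstat versions may lay the line out differently).

/* schedstat_read.c -- print cpu_time, run_delay and pcount per CPU.
 * Field positions follow the "cpu%d ..." format shown in the diff above
 * (nine counters, then cpu_time, run_delay, pcount); other kernel
 * versions may differ.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *fp = fopen("/proc/schedstat", "r");
        char line[512];

        if (!fp) {
                perror("/proc/schedstat");
                return 1;
        }

        while (fgets(line, sizeof(line), fp)) {
                int cpu;
                unsigned long long cpu_time, run_delay, pcount;

                if (strncmp(line, "cpu", 3))
                        continue;  /* skip version/timestamp/domain lines */

                if (sscanf(line,
                           "cpu%d %*u %*u %*u %*u %*u %*u %*u %*u %*u %llu %llu %llu",
                           &cpu, &cpu_time, &run_delay, &pcount) == 4)
                        printf("cpu%d: cpu_time=%llu run_delay=%llu pcount=%llu\n",
                               cpu, cpu_time, run_delay, pcount);
        }

        fclose(fp);
        return 0;
}
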
diff --git a/kernel/signal.c b/kernel/signal.c
index 792952381092..12006308c7eb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -99,7 +99,6 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
99static int recalc_sigpending_tsk(struct task_struct *t) 99static int recalc_sigpending_tsk(struct task_struct *t)
100{ 100{
101 if (t->signal->group_stop_count > 0 || 101 if (t->signal->group_stop_count > 0 ||
102 (freezing(t)) ||
103 PENDING(&t->pending, &t->blocked) || 102 PENDING(&t->pending, &t->blocked) ||
104 PENDING(&t->signal->shared_pending, &t->blocked)) { 103 PENDING(&t->signal->shared_pending, &t->blocked)) {
105 set_tsk_thread_flag(t, TIF_SIGPENDING); 104 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -257,7 +256,7 @@ flush_signal_handlers(struct task_struct *t, int force_default)
257 256
258int unhandled_signal(struct task_struct *tsk, int sig) 257int unhandled_signal(struct task_struct *tsk, int sig)
259{ 258{
260 if (is_init(tsk)) 259 if (is_global_init(tsk))
261 return 1; 260 return 1;
262 if (tsk->ptrace & PT_PTRACED) 261 if (tsk->ptrace & PT_PTRACED)
263 return 0; 262 return 0;
@@ -537,7 +536,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
537 return error; 536 return error;
538 error = -EPERM; 537 error = -EPERM;
539 if (((sig != SIGCONT) || 538 if (((sig != SIGCONT) ||
540 (process_session(current) != process_session(t))) 539 (task_session_nr(current) != task_session_nr(t)))
541 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 540 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
542 && (current->uid ^ t->suid) && (current->uid ^ t->uid) 541 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
543 && !capable(CAP_KILL)) 542 && !capable(CAP_KILL))
@@ -695,7 +694,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
695 q->info.si_signo = sig; 694 q->info.si_signo = sig;
696 q->info.si_errno = 0; 695 q->info.si_errno = 0;
697 q->info.si_code = SI_USER; 696 q->info.si_code = SI_USER;
698 q->info.si_pid = current->pid; 697 q->info.si_pid = task_pid_vnr(current);
699 q->info.si_uid = current->uid; 698 q->info.si_uid = current->uid;
700 break; 699 break;
701 case (unsigned long) SEND_SIG_PRIV: 700 case (unsigned long) SEND_SIG_PRIV:
@@ -731,7 +730,7 @@ int print_fatal_signals;
731static void print_fatal_signal(struct pt_regs *regs, int signr) 730static void print_fatal_signal(struct pt_regs *regs, int signr)
732{ 731{
733 printk("%s/%d: potentially unexpected fatal signal %d.\n", 732 printk("%s/%d: potentially unexpected fatal signal %d.\n",
734 current->comm, current->pid, signr); 733 current->comm, task_pid_nr(current), signr);
735 734
736#ifdef __i386__ 735#ifdef __i386__
737 printk("code at %08lx: ", regs->eip); 736 printk("code at %08lx: ", regs->eip);
@@ -909,8 +908,7 @@ __group_complete_signal(int sig, struct task_struct *p)
909 do { 908 do {
910 sigaddset(&t->pending.signal, SIGKILL); 909 sigaddset(&t->pending.signal, SIGKILL);
911 signal_wake_up(t, 1); 910 signal_wake_up(t, 1);
912 t = next_thread(t); 911 } while_each_thread(p, t);
913 } while (t != p);
914 return; 912 return;
915 } 913 }
916 914
@@ -928,13 +926,11 @@ __group_complete_signal(int sig, struct task_struct *p)
928 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 926 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
929 p->signal->group_stop_count = 0; 927 p->signal->group_stop_count = 0;
930 p->signal->group_exit_task = t; 928 p->signal->group_exit_task = t;
931 t = p; 929 p = t;
932 do { 930 do {
933 p->signal->group_stop_count++; 931 p->signal->group_stop_count++;
934 signal_wake_up(t, 0); 932 signal_wake_up(t, t == p);
935 t = next_thread(t); 933 } while_each_thread(p, t);
936 } while (t != p);
937 wake_up_process(p->signal->group_exit_task);
938 return; 934 return;
939 } 935 }
940 936
@@ -985,9 +981,6 @@ void zap_other_threads(struct task_struct *p)
985 p->signal->flags = SIGNAL_GROUP_EXIT; 981 p->signal->flags = SIGNAL_GROUP_EXIT;
986 p->signal->group_stop_count = 0; 982 p->signal->group_stop_count = 0;
987 983
988 if (thread_group_empty(p))
989 return;
990
991 for (t = next_thread(p); t != p; t = next_thread(t)) { 984 for (t = next_thread(p); t != p; t = next_thread(t)) {
992 /* 985 /*
993 * Don't bother with already dead threads 986 * Don't bother with already dead threads
@@ -1096,7 +1089,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1096{ 1089{
1097 int error; 1090 int error;
1098 rcu_read_lock(); 1091 rcu_read_lock();
1099 error = kill_pid_info(sig, info, find_pid(pid)); 1092 error = kill_pid_info(sig, info, find_vpid(pid));
1100 rcu_read_unlock(); 1093 rcu_read_unlock();
1101 return error; 1094 return error;
1102} 1095}
@@ -1157,7 +1150,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1157 1150
1158 read_lock(&tasklist_lock); 1151 read_lock(&tasklist_lock);
1159 for_each_process(p) { 1152 for_each_process(p) {
1160 if (p->pid > 1 && p->tgid != current->tgid) { 1153 if (p->pid > 1 && !same_thread_group(p, current)) {
1161 int err = group_send_sig_info(sig, info, p); 1154 int err = group_send_sig_info(sig, info, p);
1162 ++count; 1155 ++count;
1163 if (err != -EPERM) 1156 if (err != -EPERM)
@@ -1167,9 +1160,9 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1167 read_unlock(&tasklist_lock); 1160 read_unlock(&tasklist_lock);
1168 ret = count ? retval : -ESRCH; 1161 ret = count ? retval : -ESRCH;
1169 } else if (pid < 0) { 1162 } else if (pid < 0) {
1170 ret = kill_pgrp_info(sig, info, find_pid(-pid)); 1163 ret = kill_pgrp_info(sig, info, find_vpid(-pid));
1171 } else { 1164 } else {
1172 ret = kill_pid_info(sig, info, find_pid(pid)); 1165 ret = kill_pid_info(sig, info, find_vpid(pid));
1173 } 1166 }
1174 rcu_read_unlock(); 1167 rcu_read_unlock();
1175 return ret; 1168 return ret;
@@ -1273,7 +1266,12 @@ EXPORT_SYMBOL(kill_pid);
1273int 1266int
1274kill_proc(pid_t pid, int sig, int priv) 1267kill_proc(pid_t pid, int sig, int priv)
1275{ 1268{
1276 return kill_proc_info(sig, __si_special(priv), pid); 1269 int ret;
1270
1271 rcu_read_lock();
1272 ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
1273 rcu_read_unlock();
1274 return ret;
1277} 1275}
1278 1276
1279/* 1277/*
@@ -1450,7 +1448,22 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1450 1448
1451 info.si_signo = sig; 1449 info.si_signo = sig;
1452 info.si_errno = 0; 1450 info.si_errno = 0;
1453 info.si_pid = tsk->pid; 1451 /*
1452 * we are under tasklist_lock here so our parent is tied to
1453 * us and cannot exit and release its namespace.
1454 *
 1455 * the only thing it can do is switch its nsproxy with sys_unshare,
 1456 * but unsharing pid namespaces is not allowed, so we'll always
 1457 * see the relevant namespace.
1458 *
1459 * write_lock() currently calls preempt_disable() which is the
 1460 * same as rcu_read_lock(), but according to Oleg it is not
 1461 * correct to rely on this.
1462 */
1463 rcu_read_lock();
1464 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1465 rcu_read_unlock();
1466
1454 info.si_uid = tsk->uid; 1467 info.si_uid = tsk->uid;
1455 1468
1456 /* FIXME: find out whether or not this is supposed to be c*time. */ 1469 /* FIXME: find out whether or not this is supposed to be c*time. */
@@ -1515,7 +1528,13 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1515 1528
1516 info.si_signo = SIGCHLD; 1529 info.si_signo = SIGCHLD;
1517 info.si_errno = 0; 1530 info.si_errno = 0;
1518 info.si_pid = tsk->pid; 1531 /*
 1532 * see the comment in do_notify_parent() about the following 3 lines
1533 */
1534 rcu_read_lock();
1535 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1536 rcu_read_unlock();
1537
1519 info.si_uid = tsk->uid; 1538 info.si_uid = tsk->uid;
1520 1539
1521 /* FIXME: find out whether or not this is supposed to be c*time. */ 1540 /* FIXME: find out whether or not this is supposed to be c*time. */
@@ -1641,7 +1660,7 @@ void ptrace_notify(int exit_code)
1641 memset(&info, 0, sizeof info); 1660 memset(&info, 0, sizeof info);
1642 info.si_signo = SIGTRAP; 1661 info.si_signo = SIGTRAP;
1643 info.si_code = exit_code; 1662 info.si_code = exit_code;
1644 info.si_pid = current->pid; 1663 info.si_pid = task_pid_vnr(current);
1645 info.si_uid = current->uid; 1664 info.si_uid = current->uid;
1646 1665
1647 /* Let the debugger run. */ 1666 /* Let the debugger run. */
@@ -1811,7 +1830,7 @@ relock:
1811 info->si_signo = signr; 1830 info->si_signo = signr;
1812 info->si_errno = 0; 1831 info->si_errno = 0;
1813 info->si_code = SI_USER; 1832 info->si_code = SI_USER;
1814 info->si_pid = current->parent->pid; 1833 info->si_pid = task_pid_vnr(current->parent);
1815 info->si_uid = current->parent->uid; 1834 info->si_uid = current->parent->uid;
1816 } 1835 }
1817 1836
@@ -1842,11 +1861,9 @@ relock:
1842 continue; 1861 continue;
1843 1862
1844 /* 1863 /*
1845 * Init of a pid space gets no signals it doesn't want from 1864 * Global init gets no signals it doesn't want.
1846 * within that pid space. It can of course get signals from
1847 * its parent pid space.
1848 */ 1865 */
1849 if (current == child_reaper(current)) 1866 if (is_global_init(current))
1850 continue; 1867 continue;
1851 1868
1852 if (sig_kernel_stop(signr)) { 1869 if (sig_kernel_stop(signr)) {
@@ -2200,7 +2217,7 @@ sys_kill(int pid, int sig)
2200 info.si_signo = sig; 2217 info.si_signo = sig;
2201 info.si_errno = 0; 2218 info.si_errno = 0;
2202 info.si_code = SI_USER; 2219 info.si_code = SI_USER;
2203 info.si_pid = current->tgid; 2220 info.si_pid = task_tgid_vnr(current);
2204 info.si_uid = current->uid; 2221 info.si_uid = current->uid;
2205 2222
2206 return kill_something_info(sig, &info, pid); 2223 return kill_something_info(sig, &info, pid);
@@ -2216,12 +2233,12 @@ static int do_tkill(int tgid, int pid, int sig)
2216 info.si_signo = sig; 2233 info.si_signo = sig;
2217 info.si_errno = 0; 2234 info.si_errno = 0;
2218 info.si_code = SI_TKILL; 2235 info.si_code = SI_TKILL;
2219 info.si_pid = current->tgid; 2236 info.si_pid = task_tgid_vnr(current);
2220 info.si_uid = current->uid; 2237 info.si_uid = current->uid;
2221 2238
2222 read_lock(&tasklist_lock); 2239 read_lock(&tasklist_lock);
2223 p = find_task_by_pid(pid); 2240 p = find_task_by_vpid(pid);
2224 if (p && (tgid <= 0 || p->tgid == tgid)) { 2241 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2225 error = check_kill_permission(sig, &info, p); 2242 error = check_kill_permission(sig, &info, p);
2226 /* 2243 /*
2227 * The null signal is a permissions and process existence 2244 * The null signal is a permissions and process existence
@@ -2300,15 +2317,6 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2300 k = &current->sighand->action[sig-1]; 2317 k = &current->sighand->action[sig-1];
2301 2318
2302 spin_lock_irq(&current->sighand->siglock); 2319 spin_lock_irq(&current->sighand->siglock);
2303 if (signal_pending(current)) {
2304 /*
2305 * If there might be a fatal signal pending on multiple
2306 * threads, make sure we take it before changing the action.
2307 */
2308 spin_unlock_irq(&current->sighand->siglock);
2309 return -ERESTARTNOINTR;
2310 }
2311
2312 if (oact) 2320 if (oact)
2313 *oact = *k; 2321 *oact = *k;
2314 2322
@@ -2335,7 +2343,6 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2335 rm_from_queue_full(&mask, &t->signal->shared_pending); 2343 rm_from_queue_full(&mask, &t->signal->shared_pending);
2336 do { 2344 do {
2337 rm_from_queue_full(&mask, &t->pending); 2345 rm_from_queue_full(&mask, &t->pending);
2338 recalc_sigpending_and_wake(t);
2339 t = next_thread(t); 2346 t = next_thread(t);
2340 } while (t != current); 2347 } while (t != current);
2341 } 2348 }
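
Most of the signal.c hunks switch si_pid reporting to namespace-relative values (task_pid_vnr() and friends), so a receiver sees the sender's pid as it is visible in the receiver's own pid namespace. From user space that value arrives in siginfo_t.si_pid; the plain-POSIX demo below shows where it surfaces and does not depend on the kernel-internal helpers named in the diff.

/* si_pid_demo.c -- observe the sender's pid via siginfo_t.si_pid. */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static volatile sig_atomic_t sender_pid;

static void handler(int sig, siginfo_t *info, void *ctx)
{
        (void)sig;
        (void)ctx;
        sender_pid = info->si_pid;   /* sender's pid, filled in by the kernel */
}

int main(void)
{
        struct sigaction sa;
        sigset_t block, old;
        pid_t child;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = handler;
        sa.sa_flags = SA_SIGINFO;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGUSR1, &sa, NULL);

        /* Block SIGUSR1 so the child cannot signal us before sigsuspend(). */
        sigemptyset(&block);
        sigaddset(&block, SIGUSR1);
        sigprocmask(SIG_BLOCK, &block, &old);

        child = fork();
        if (child < 0) {
                perror("fork");
                return 1;
        }
        if (child == 0) {            /* child: signal the parent and exit */
                kill(getppid(), SIGUSR1);
                _exit(0);
        }

        while (!sender_pid)
                sigsuspend(&old);    /* atomically unblock SIGUSR1 and wait */

        waitpid(child, NULL, 0);
        printf("SIGUSR1 came from pid %d (child was %d)\n",
               (int)sender_pid, (int)child);
        return 0;
}
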
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0f546ddea43d..bd89bc4eb0b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -271,8 +271,6 @@ asmlinkage void do_softirq(void)
271 local_irq_restore(flags); 271 local_irq_restore(flags);
272} 272}
273 273
274EXPORT_SYMBOL(do_softirq);
275
276#endif 274#endif
277 275
278/* 276/*
@@ -332,8 +330,6 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr)
332 wakeup_softirqd(); 330 wakeup_softirqd();
333} 331}
334 332
335EXPORT_SYMBOL(raise_softirq_irqoff);
336
337void fastcall raise_softirq(unsigned int nr) 333void fastcall raise_softirq(unsigned int nr)
338{ 334{
339 unsigned long flags; 335 unsigned long flags;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 708d4882c0c3..11df812263c8 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -15,13 +15,16 @@
15#include <linux/notifier.h> 15#include <linux/notifier.h>
16#include <linux/module.h> 16#include <linux/module.h>
17 17
18#include <asm/irq_regs.h>
19
18static DEFINE_SPINLOCK(print_lock); 20static DEFINE_SPINLOCK(print_lock);
19 21
20static DEFINE_PER_CPU(unsigned long, touch_timestamp); 22static DEFINE_PER_CPU(unsigned long, touch_timestamp);
21static DEFINE_PER_CPU(unsigned long, print_timestamp); 23static DEFINE_PER_CPU(unsigned long, print_timestamp);
22static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 24static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
23 25
24static int did_panic = 0; 26static int did_panic;
27int softlockup_thresh = 10;
25 28
26static int 29static int
27softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) 30softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -40,14 +43,16 @@ static struct notifier_block panic_block = {
40 * resolution, and we don't need to waste time with a big divide when 43 * resolution, and we don't need to waste time with a big divide when
41 * 2^30ns == 1.074s. 44 * 2^30ns == 1.074s.
42 */ 45 */
43static unsigned long get_timestamp(void) 46static unsigned long get_timestamp(int this_cpu)
44{ 47{
45 return sched_clock() >> 30; /* 2^30 ~= 10^9 */ 48 return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */
46} 49}
47 50
48void touch_softlockup_watchdog(void) 51void touch_softlockup_watchdog(void)
49{ 52{
50 __raw_get_cpu_var(touch_timestamp) = get_timestamp(); 53 int this_cpu = raw_smp_processor_id();
54
55 __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu);
51} 56}
52EXPORT_SYMBOL(touch_softlockup_watchdog); 57EXPORT_SYMBOL(touch_softlockup_watchdog);
53 58
@@ -70,6 +75,7 @@ void softlockup_tick(void)
70 int this_cpu = smp_processor_id(); 75 int this_cpu = smp_processor_id();
71 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); 76 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
72 unsigned long print_timestamp; 77 unsigned long print_timestamp;
78 struct pt_regs *regs = get_irq_regs();
73 unsigned long now; 79 unsigned long now;
74 80
75 if (touch_timestamp == 0) { 81 if (touch_timestamp == 0) {
@@ -80,10 +86,11 @@ void softlockup_tick(void)
80 print_timestamp = per_cpu(print_timestamp, this_cpu); 86 print_timestamp = per_cpu(print_timestamp, this_cpu);
81 87
82 /* report at most once a second */ 88 /* report at most once a second */
83 if (print_timestamp < (touch_timestamp + 1) || 89 if ((print_timestamp >= touch_timestamp &&
84 did_panic || 90 print_timestamp < (touch_timestamp + 1)) ||
85 !per_cpu(watchdog_task, this_cpu)) 91 did_panic || !per_cpu(watchdog_task, this_cpu)) {
86 return; 92 return;
93 }
87 94
88 /* do not print during early bootup: */ 95 /* do not print during early bootup: */
89 if (unlikely(system_state != SYSTEM_RUNNING)) { 96 if (unlikely(system_state != SYSTEM_RUNNING)) {
@@ -91,28 +98,33 @@ void softlockup_tick(void)
91 return; 98 return;
92 } 99 }
93 100
94 now = get_timestamp(); 101 now = get_timestamp(this_cpu);
95 102
96 /* Wake up the high-prio watchdog task every second: */ 103 /* Wake up the high-prio watchdog task every second: */
97 if (now > (touch_timestamp + 1)) 104 if (now > (touch_timestamp + 1))
98 wake_up_process(per_cpu(watchdog_task, this_cpu)); 105 wake_up_process(per_cpu(watchdog_task, this_cpu));
99 106
100 /* Warn about unreasonable 10+ seconds delays: */ 107 /* Warn about unreasonable 10+ seconds delays: */
101 if (now > (touch_timestamp + 10)) { 108 if (now <= (touch_timestamp + softlockup_thresh))
102 per_cpu(print_timestamp, this_cpu) = touch_timestamp; 109 return;
103 110
104 spin_lock(&print_lock); 111 per_cpu(print_timestamp, this_cpu) = touch_timestamp;
105 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", 112
106 this_cpu); 113 spin_lock(&print_lock);
114 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
115 this_cpu, now - touch_timestamp,
116 current->comm, task_pid_nr(current));
117 if (regs)
118 show_regs(regs);
119 else
107 dump_stack(); 120 dump_stack();
108 spin_unlock(&print_lock); 121 spin_unlock(&print_lock);
109 }
110} 122}
111 123
112/* 124/*
113 * The watchdog thread - runs every second and touches the timestamp. 125 * The watchdog thread - runs every second and touches the timestamp.
114 */ 126 */
115static int watchdog(void * __bind_cpu) 127static int watchdog(void *__bind_cpu)
116{ 128{
117 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 129 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
118 130
@@ -150,13 +162,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
150 BUG_ON(per_cpu(watchdog_task, hotcpu)); 162 BUG_ON(per_cpu(watchdog_task, hotcpu));
151 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); 163 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
152 if (IS_ERR(p)) { 164 if (IS_ERR(p)) {
153 printk("watchdog for %i failed\n", hotcpu); 165 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
154 return NOTIFY_BAD; 166 return NOTIFY_BAD;
155 } 167 }
156 per_cpu(touch_timestamp, hotcpu) = 0; 168 per_cpu(touch_timestamp, hotcpu) = 0;
157 per_cpu(watchdog_task, hotcpu) = p; 169 per_cpu(watchdog_task, hotcpu) = p;
158 kthread_bind(p, hotcpu); 170 kthread_bind(p, hotcpu);
159 break; 171 break;
160 case CPU_ONLINE: 172 case CPU_ONLINE:
161 case CPU_ONLINE_FROZEN: 173 case CPU_ONLINE_FROZEN:
162 wake_up_process(per_cpu(watchdog_task, hotcpu)); 174 wake_up_process(per_cpu(watchdog_task, hotcpu));
@@ -176,7 +188,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
176 kthread_stop(p); 188 kthread_stop(p);
177 break; 189 break;
178#endif /* CONFIG_HOTPLUG_CPU */ 190#endif /* CONFIG_HOTPLUG_CPU */
179 } 191 }
180 return NOTIFY_OK; 192 return NOTIFY_OK;
181} 193}
182 194
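
softlockup_tick() above compares a per-CPU touch timestamp against the current clock and reports a soft lockup once it is more than softlockup_thresh seconds stale, printing the stuck task and its registers. The same watchdog pattern can be sketched in user space with a worker thread touching a timestamp and a monitor checking it; the 3-second threshold and the deliberate 5-second stall below are invented for the demo (the kernel default above is 10 seconds).

/* watchdog_sketch.c -- user-space analogue of the softlockup watchdog.
 * Build with: cc -pthread -o watchdog_sketch watchdog_sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static volatile time_t touch_timestamp;
static const int softlockup_thresh = 3;   /* seconds */

static void touch_watchdog(void)
{
        touch_timestamp = time(NULL);
}

static void *worker(void *arg)
{
        (void)arg;
        touch_watchdog();
        sleep(1);
        touch_watchdog();
        sleep(5);                 /* pretend we are stuck for a while */
        touch_watchdog();
        return NULL;
}

int main(void)
{
        pthread_t tid;
        int i;

        touch_watchdog();
        pthread_create(&tid, NULL, worker, NULL);

        for (i = 0; i < 8; i++) { /* the "tick": check once per second */
                time_t now = time(NULL);

                if (now - touch_timestamp > softlockup_thresh)
                        printf("BUG: soft lockup - worker stuck for %lds!\n",
                               (long)(now - touch_timestamp));
                sleep(1);
        }

        pthread_join(tid, NULL);
        return 0;
}
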
diff --git a/kernel/sys.c b/kernel/sys.c
index 8ae2e636eb1b..304b5410d746 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -105,538 +105,6 @@ EXPORT_SYMBOL(cad_pid);
105 */ 105 */
106 106
107void (*pm_power_off_prepare)(void); 107void (*pm_power_off_prepare)(void);
108EXPORT_SYMBOL(pm_power_off_prepare);
109
110/*
111 * Notifier list for kernel code which wants to be called
112 * at shutdown. This is used to stop any idling DMA operations
113 * and the like.
114 */
115
116static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
117
118/*
119 * Notifier chain core routines. The exported routines below
120 * are layered on top of these, with appropriate locking added.
121 */
122
123static int notifier_chain_register(struct notifier_block **nl,
124 struct notifier_block *n)
125{
126 while ((*nl) != NULL) {
127 if (n->priority > (*nl)->priority)
128 break;
129 nl = &((*nl)->next);
130 }
131 n->next = *nl;
132 rcu_assign_pointer(*nl, n);
133 return 0;
134}
135
136static int notifier_chain_unregister(struct notifier_block **nl,
137 struct notifier_block *n)
138{
139 while ((*nl) != NULL) {
140 if ((*nl) == n) {
141 rcu_assign_pointer(*nl, n->next);
142 return 0;
143 }
144 nl = &((*nl)->next);
145 }
146 return -ENOENT;
147}
148
149/**
150 * notifier_call_chain - Informs the registered notifiers about an event.
151 * @nl: Pointer to head of the blocking notifier chain
152 * @val: Value passed unmodified to notifier function
153 * @v: Pointer passed unmodified to notifier function
154 * @nr_to_call: Number of notifier functions to be called. Don't care
155 * value of this parameter is -1.
156 * @nr_calls: Records the number of notifications sent. Don't care
157 * value of this field is NULL.
158 * @returns: notifier_call_chain returns the value returned by the
159 * last notifier function called.
160 */
161
162static int __kprobes notifier_call_chain(struct notifier_block **nl,
163 unsigned long val, void *v,
164 int nr_to_call, int *nr_calls)
165{
166 int ret = NOTIFY_DONE;
167 struct notifier_block *nb, *next_nb;
168
169 nb = rcu_dereference(*nl);
170
171 while (nb && nr_to_call) {
172 next_nb = rcu_dereference(nb->next);
173 ret = nb->notifier_call(nb, val, v);
174
175 if (nr_calls)
176 (*nr_calls)++;
177
178 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
179 break;
180 nb = next_nb;
181 nr_to_call--;
182 }
183 return ret;
184}
185
186/*
187 * Atomic notifier chain routines. Registration and unregistration
188 * use a spinlock, and call_chain is synchronized by RCU (no locks).
189 */
190
191/**
192 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
193 * @nh: Pointer to head of the atomic notifier chain
194 * @n: New entry in notifier chain
195 *
196 * Adds a notifier to an atomic notifier chain.
197 *
198 * Currently always returns zero.
199 */
200
201int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
202 struct notifier_block *n)
203{
204 unsigned long flags;
205 int ret;
206
207 spin_lock_irqsave(&nh->lock, flags);
208 ret = notifier_chain_register(&nh->head, n);
209 spin_unlock_irqrestore(&nh->lock, flags);
210 return ret;
211}
212
213EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
214
215/**
216 * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
217 * @nh: Pointer to head of the atomic notifier chain
218 * @n: Entry to remove from notifier chain
219 *
220 * Removes a notifier from an atomic notifier chain.
221 *
222 * Returns zero on success or %-ENOENT on failure.
223 */
224int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
225 struct notifier_block *n)
226{
227 unsigned long flags;
228 int ret;
229
230 spin_lock_irqsave(&nh->lock, flags);
231 ret = notifier_chain_unregister(&nh->head, n);
232 spin_unlock_irqrestore(&nh->lock, flags);
233 synchronize_rcu();
234 return ret;
235}
236
237EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
238
239/**
240 * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
241 * @nh: Pointer to head of the atomic notifier chain
242 * @val: Value passed unmodified to notifier function
243 * @v: Pointer passed unmodified to notifier function
244 * @nr_to_call: See the comment for notifier_call_chain.
245 * @nr_calls: See the comment for notifier_call_chain.
246 *
247 * Calls each function in a notifier chain in turn. The functions
248 * run in an atomic context, so they must not block.
249 * This routine uses RCU to synchronize with changes to the chain.
250 *
251 * If the return value of the notifier can be and'ed
252 * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
253 * will return immediately, with the return value of
254 * the notifier function which halted execution.
255 * Otherwise the return value is the return value
256 * of the last notifier function called.
257 */
258
259int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
260 unsigned long val, void *v,
261 int nr_to_call, int *nr_calls)
262{
263 int ret;
264
265 rcu_read_lock();
266 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
267 rcu_read_unlock();
268 return ret;
269}
270
271EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
272
273int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
274 unsigned long val, void *v)
275{
276 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
277}
278
279EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
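/*
 * Usage sketch (illustrative, not part of this patch): declaring an atomic
 * chain, registering a callback, and firing it. All names are hypothetical;
 * callbacks on an atomic chain must not sleep.
 */
#include <linux/notifier.h>

static ATOMIC_NOTIFIER_HEAD(example_atomic_chain);

static int example_atomic_event(struct notifier_block *nb,
				unsigned long action, void *data)
{
	return NOTIFY_OK;
}

static struct notifier_block example_atomic_nb = {
	.notifier_call	= example_atomic_event,
	.priority	= 0,	/* higher priority entries are called first */
};

static void example_atomic_use(void)
{
	atomic_notifier_chain_register(&example_atomic_chain,
				       &example_atomic_nb);
	/* safe from atomic context; traversal is protected by RCU */
	atomic_notifier_call_chain(&example_atomic_chain, 0, NULL);
}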
280/*
281 * Blocking notifier chain routines. All access to the chain is
282 * synchronized by an rwsem.
283 */
284
285/**
286 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
287 * @nh: Pointer to head of the blocking notifier chain
288 * @n: New entry in notifier chain
289 *
290 * Adds a notifier to a blocking notifier chain.
291 * Must be called in process context.
292 *
293 * Currently always returns zero.
294 */
295
296int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
297 struct notifier_block *n)
298{
299 int ret;
300
301 /*
302 * This code gets used during boot-up, when task switching is
303 * not yet working and interrupts must remain disabled. At
304 * such times we must not call down_write().
305 */
306 if (unlikely(system_state == SYSTEM_BOOTING))
307 return notifier_chain_register(&nh->head, n);
308
309 down_write(&nh->rwsem);
310 ret = notifier_chain_register(&nh->head, n);
311 up_write(&nh->rwsem);
312 return ret;
313}
314
315EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
316
317/**
318 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
319 * @nh: Pointer to head of the blocking notifier chain
320 * @n: Entry to remove from notifier chain
321 *
322 * Removes a notifier from a blocking notifier chain.
323 * Must be called from process context.
324 *
325 * Returns zero on success or %-ENOENT on failure.
326 */
327int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
328 struct notifier_block *n)
329{
330 int ret;
331
332 /*
333 * This code gets used during boot-up, when task switching is
334 * not yet working and interrupts must remain disabled. At
335 * such times we must not call down_write().
336 */
337 if (unlikely(system_state == SYSTEM_BOOTING))
338 return notifier_chain_unregister(&nh->head, n);
339
340 down_write(&nh->rwsem);
341 ret = notifier_chain_unregister(&nh->head, n);
342 up_write(&nh->rwsem);
343 return ret;
344}
345
346EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
347
348/**
349 * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
350 * @nh: Pointer to head of the blocking notifier chain
351 * @val: Value passed unmodified to notifier function
352 * @v: Pointer passed unmodified to notifier function
353 * @nr_to_call: See comment for notifier_call_chain.
354 * @nr_calls: See comment for notifier_call_chain.
355 *
356 * Calls each function in a notifier chain in turn. The functions
357 * run in a process context, so they are allowed to block.
358 *
359 * If the return value of the notifier can be and'ed
360 * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
361 * will return immediately, with the return value of
362 * the notifier function which halted execution.
363 * Otherwise the return value is the return value
364 * of the last notifier function called.
365 */
366
367int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
368 unsigned long val, void *v,
369 int nr_to_call, int *nr_calls)
370{
371 int ret = NOTIFY_DONE;
372
373 /*
374 * We check the head outside the lock; if that access races with an
375 * update, the result of the test does not matter, because we
376 * re-check the list after taking the lock anyway:
377 */
378 if (rcu_dereference(nh->head)) {
379 down_read(&nh->rwsem);
380 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
381 nr_calls);
382 up_read(&nh->rwsem);
383 }
384 return ret;
385}
386EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
387
388int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
389 unsigned long val, void *v)
390{
391 return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
392}
393EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
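/*
 * Usage sketch (illustrative, not part of this patch): a blocking chain is
 * declared statically and used from process context only, so its callbacks
 * may sleep. Names are hypothetical.
 */
#include <linux/notifier.h>

static BLOCKING_NOTIFIER_HEAD(example_blocking_chain);

static int example_blocking_event(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	/* may sleep, e.g. allocate memory with GFP_KERNEL */
	return NOTIFY_DONE;
}

static struct notifier_block example_blocking_nb = {
	.notifier_call = example_blocking_event,
};

static void example_blocking_use(void)
{
	blocking_notifier_chain_register(&example_blocking_chain,
					 &example_blocking_nb);
	blocking_notifier_call_chain(&example_blocking_chain, 0, NULL);
	blocking_notifier_chain_unregister(&example_blocking_chain,
					   &example_blocking_nb);
}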
394
395/*
396 * Raw notifier chain routines. There is no protection;
397 * the caller must provide it. Use at your own risk!
398 */
399
400/**
401 * raw_notifier_chain_register - Add notifier to a raw notifier chain
402 * @nh: Pointer to head of the raw notifier chain
403 * @n: New entry in notifier chain
404 *
405 * Adds a notifier to a raw notifier chain.
406 * All locking must be provided by the caller.
407 *
408 * Currently always returns zero.
409 */
410
411int raw_notifier_chain_register(struct raw_notifier_head *nh,
412 struct notifier_block *n)
413{
414 return notifier_chain_register(&nh->head, n);
415}
416
417EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
418
419/**
420 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
421 * @nh: Pointer to head of the raw notifier chain
422 * @n: Entry to remove from notifier chain
423 *
424 * Removes a notifier from a raw notifier chain.
425 * All locking must be provided by the caller.
426 *
427 * Returns zero on success or %-ENOENT on failure.
428 */
429int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
430 struct notifier_block *n)
431{
432 return notifier_chain_unregister(&nh->head, n);
433}
434
435EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
436
437/**
438 * __raw_notifier_call_chain - Call functions in a raw notifier chain
439 * @nh: Pointer to head of the raw notifier chain
440 * @val: Value passed unmodified to notifier function
441 * @v: Pointer passed unmodified to notifier function
442 * @nr_to_call: See comment for notifier_call_chain.
443 * @nr_calls: See comment for notifier_call_chain
444 *
445 * Calls each function in a notifier chain in turn. The functions
446 * run in an undefined context.
447 * All locking must be provided by the caller.
448 *
449 * If the return value of the notifier can be and'ed
450 * with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
451 * will return immediately, with the return value of
452 * the notifier function which halted execution.
453 * Otherwise the return value is the return value
454 * of the last notifier function called.
455 */
456
457int __raw_notifier_call_chain(struct raw_notifier_head *nh,
458 unsigned long val, void *v,
459 int nr_to_call, int *nr_calls)
460{
461 return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
462}
463
464EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
465
466int raw_notifier_call_chain(struct raw_notifier_head *nh,
467 unsigned long val, void *v)
468{
469 return __raw_notifier_call_chain(nh, val, v, -1, NULL);
470}
471
472EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
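/*
 * Usage sketch (illustrative, not part of this patch): a raw chain relies
 * entirely on caller-provided locking; here a hypothetical mutex serializes
 * both registration and traversal.
 */
#include <linux/notifier.h>
#include <linux/mutex.h>

static RAW_NOTIFIER_HEAD(example_raw_chain);
static DEFINE_MUTEX(example_raw_lock);

static void example_raw_register(struct notifier_block *nb)
{
	mutex_lock(&example_raw_lock);
	raw_notifier_chain_register(&example_raw_chain, nb);
	mutex_unlock(&example_raw_lock);
}

static void example_raw_fire(unsigned long event, void *data)
{
	mutex_lock(&example_raw_lock);
	raw_notifier_call_chain(&example_raw_chain, event, data);
	mutex_unlock(&example_raw_lock);
}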
473
474/*
475 * SRCU notifier chain routines. Registration and unregistration
476 * use a mutex, and call_chain is synchronized by SRCU (no locks).
477 */
478
479/**
480 * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
481 * @nh: Pointer to head of the SRCU notifier chain
482 * @n: New entry in notifier chain
483 *
484 * Adds a notifier to an SRCU notifier chain.
485 * Must be called in process context.
486 *
487 * Currently always returns zero.
488 */
489
490int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
491 struct notifier_block *n)
492{
493 int ret;
494
495 /*
496 * This code gets used during boot-up, when task switching is
497 * not yet working and interrupts must remain disabled. At
498 * such times we must not call mutex_lock().
499 */
500 if (unlikely(system_state == SYSTEM_BOOTING))
501 return notifier_chain_register(&nh->head, n);
502
503 mutex_lock(&nh->mutex);
504 ret = notifier_chain_register(&nh->head, n);
505 mutex_unlock(&nh->mutex);
506 return ret;
507}
508
509EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
510
511/**
512 * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
513 * @nh: Pointer to head of the SRCU notifier chain
514 * @n: Entry to remove from notifier chain
515 *
516 * Removes a notifier from an SRCU notifier chain.
517 * Must be called from process context.
518 *
519 * Returns zero on success or %-ENOENT on failure.
520 */
521int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
522 struct notifier_block *n)
523{
524 int ret;
525
526 /*
527 * This code gets used during boot-up, when task switching is
528 * not yet working and interrupts must remain disabled. At
529 * such times we must not call mutex_lock().
530 */
531 if (unlikely(system_state == SYSTEM_BOOTING))
532 return notifier_chain_unregister(&nh->head, n);
533
534 mutex_lock(&nh->mutex);
535 ret = notifier_chain_unregister(&nh->head, n);
536 mutex_unlock(&nh->mutex);
537 synchronize_srcu(&nh->srcu);
538 return ret;
539}
540
541EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
542
543/**
544 * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
545 * @nh: Pointer to head of the SRCU notifier chain
546 * @val: Value passed unmodified to notifier function
547 * @v: Pointer passed unmodified to notifier function
548 * @nr_to_call: See comment for notifier_call_chain.
549 * @nr_calls: See comment for notifier_call_chain
550 *
551 * Calls each function in a notifier chain in turn. The functions
552 * run in a process context, so they are allowed to block.
553 *
554 * If the return value of the notifier can be and'ed
555 * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain()
556 * will return immediately, with the return value of
557 * the notifier function which halted execution.
558 * Otherwise the return value is the return value
559 * of the last notifier function called.
560 */
561
562int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
563 unsigned long val, void *v,
564 int nr_to_call, int *nr_calls)
565{
566 int ret;
567 int idx;
568
569 idx = srcu_read_lock(&nh->srcu);
570 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
571 srcu_read_unlock(&nh->srcu, idx);
572 return ret;
573}
574EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
575
576int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
577 unsigned long val, void *v)
578{
579 return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
580}
581EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
582
583/**
584 * srcu_init_notifier_head - Initialize an SRCU notifier head
585 * @nh: Pointer to head of the srcu notifier chain
586 *
587 * Unlike other sorts of notifier heads, SRCU notifier heads require
588 * dynamic initialization. Be sure to call this routine before
589 * calling any of the other SRCU notifier routines for this head.
590 *
591 * If an SRCU notifier head is deallocated, it must first be cleaned
592 * up by calling srcu_cleanup_notifier_head(). Otherwise the head's
593 * per-cpu data (used by the SRCU mechanism) will leak.
594 */
595
596void srcu_init_notifier_head(struct srcu_notifier_head *nh)
597{
598 mutex_init(&nh->mutex);
599 if (init_srcu_struct(&nh->srcu) < 0)
600 BUG();
601 nh->head = NULL;
602}
603
604EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
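/*
 * Usage sketch (illustrative, not part of this patch): unlike the other
 * chain types, an SRCU head needs run-time initialization before use and
 * srcu_cleanup_notifier_head() before its memory is freed. Names are
 * hypothetical.
 */
#include <linux/notifier.h>
#include <linux/init.h>

static struct srcu_notifier_head example_srcu_chain;

static int __init example_srcu_setup(void)
{
	srcu_init_notifier_head(&example_srcu_chain);
	return 0;
}

static void example_srcu_use(struct notifier_block *nb)
{
	srcu_notifier_chain_register(&example_srcu_chain, nb);
	srcu_notifier_call_chain(&example_srcu_chain, 0, NULL);
	srcu_notifier_chain_unregister(&example_srcu_chain, nb);
	/* call srcu_cleanup_notifier_head(&example_srcu_chain) before freeing */
}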
605
606/**
607 * register_reboot_notifier - Register function to be called at reboot time
608 * @nb: Info about notifier function to be called
609 *
610 * Registers a function with the list of functions
611 * to be called at reboot time.
612 *
613 * Currently always returns zero, as blocking_notifier_chain_register()
614 * always returns zero.
615 */
616
617int register_reboot_notifier(struct notifier_block * nb)
618{
619 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
620}
621
622EXPORT_SYMBOL(register_reboot_notifier);
623
624/**
625 * unregister_reboot_notifier - Unregister previously registered reboot notifier
626 * @nb: Hook to be unregistered
627 *
628 * Unregisters a previously registered reboot
629 * notifier function.
630 *
631 * Returns zero on success, or %-ENOENT on failure.
632 */
633
634int unregister_reboot_notifier(struct notifier_block * nb)
635{
636 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
637}
638
639EXPORT_SYMBOL(unregister_reboot_notifier);
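/*
 * Usage sketch (illustrative, not part of this patch): a reboot notifier
 * that reacts to the standard shutdown codes. The handler and message are
 * hypothetical.
 */
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/kernel.h>
#include <linux/init.h>

static int example_reboot_event(struct notifier_block *nb,
				unsigned long code, void *cmd)
{
	if (code == SYS_HALT || code == SYS_POWER_OFF)
		printk(KERN_INFO "example: system is going down\n");
	return NOTIFY_DONE;
}

static struct notifier_block example_reboot_nb = {
	.notifier_call = example_reboot_event,
};

static int __init example_reboot_setup(void)
{
	return register_reboot_notifier(&example_reboot_nb);
}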
640 108
641static int set_one_prio(struct task_struct *p, int niceval, int error) 109static int set_one_prio(struct task_struct *p, int niceval, int error)
642{ 110{
@@ -684,7 +152,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
684 switch (which) { 152 switch (which) {
685 case PRIO_PROCESS: 153 case PRIO_PROCESS:
686 if (who) 154 if (who)
687 p = find_task_by_pid(who); 155 p = find_task_by_vpid(who);
688 else 156 else
689 p = current; 157 p = current;
690 if (p) 158 if (p)
@@ -692,7 +160,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
692 break; 160 break;
693 case PRIO_PGRP: 161 case PRIO_PGRP:
694 if (who) 162 if (who)
695 pgrp = find_pid(who); 163 pgrp = find_vpid(who);
696 else 164 else
697 pgrp = task_pgrp(current); 165 pgrp = task_pgrp(current);
698 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 166 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
@@ -741,7 +209,7 @@ asmlinkage long sys_getpriority(int which, int who)
741 switch (which) { 209 switch (which) {
742 case PRIO_PROCESS: 210 case PRIO_PROCESS:
743 if (who) 211 if (who)
744 p = find_task_by_pid(who); 212 p = find_task_by_vpid(who);
745 else 213 else
746 p = current; 214 p = current;
747 if (p) { 215 if (p) {
@@ -752,7 +220,7 @@ asmlinkage long sys_getpriority(int which, int who)
752 break; 220 break;
753 case PRIO_PGRP: 221 case PRIO_PGRP:
754 if (who) 222 if (who)
755 pgrp = find_pid(who); 223 pgrp = find_vpid(who);
756 else 224 else
757 pgrp = task_pgrp(current); 225 pgrp = task_pgrp(current);
758 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 226 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
@@ -1449,9 +917,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1449 struct task_struct *p; 917 struct task_struct *p;
1450 struct task_struct *group_leader = current->group_leader; 918 struct task_struct *group_leader = current->group_leader;
1451 int err = -EINVAL; 919 int err = -EINVAL;
920 struct pid_namespace *ns;
1452 921
1453 if (!pid) 922 if (!pid)
1454 pid = group_leader->pid; 923 pid = task_pid_vnr(group_leader);
1455 if (!pgid) 924 if (!pgid)
1456 pgid = pid; 925 pgid = pid;
1457 if (pgid < 0) 926 if (pgid < 0)
@@ -1460,10 +929,12 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1460 /* From this point forward we keep holding onto the tasklist lock 929 /* From this point forward we keep holding onto the tasklist lock
1461 * so that our parent does not change from under us. -DaveM 930 * so that our parent does not change from under us. -DaveM
1462 */ 931 */
932 ns = current->nsproxy->pid_ns;
933
1463 write_lock_irq(&tasklist_lock); 934 write_lock_irq(&tasklist_lock);
1464 935
1465 err = -ESRCH; 936 err = -ESRCH;
1466 p = find_task_by_pid(pid); 937 p = find_task_by_pid_ns(pid, ns);
1467 if (!p) 938 if (!p)
1468 goto out; 939 goto out;
1469 940
@@ -1489,9 +960,9 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1489 goto out; 960 goto out;
1490 961
1491 if (pgid != pid) { 962 if (pgid != pid) {
1492 struct task_struct *g = 963 struct task_struct *g;
1493 find_task_by_pid_type(PIDTYPE_PGID, pgid);
1494 964
965 g = find_task_by_pid_type_ns(PIDTYPE_PGID, pgid, ns);
1495 if (!g || task_session(g) != task_session(group_leader)) 966 if (!g || task_session(g) != task_session(group_leader))
1496 goto out; 967 goto out;
1497 } 968 }
@@ -1500,10 +971,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1500 if (err) 971 if (err)
1501 goto out; 972 goto out;
1502 973
1503 if (process_group(p) != pgid) { 974 if (task_pgrp_nr_ns(p, ns) != pgid) {
975 struct pid *pid;
976
1504 detach_pid(p, PIDTYPE_PGID); 977 detach_pid(p, PIDTYPE_PGID);
1505 p->signal->pgrp = pgid; 978 pid = find_vpid(pgid);
1506 attach_pid(p, PIDTYPE_PGID, find_pid(pgid)); 979 attach_pid(p, PIDTYPE_PGID, pid);
980 set_task_pgrp(p, pid_nr(pid));
1507 } 981 }
1508 982
1509 err = 0; 983 err = 0;
@@ -1516,19 +990,21 @@ out:
1516asmlinkage long sys_getpgid(pid_t pid) 990asmlinkage long sys_getpgid(pid_t pid)
1517{ 991{
1518 if (!pid) 992 if (!pid)
1519 return process_group(current); 993 return task_pgrp_vnr(current);
1520 else { 994 else {
1521 int retval; 995 int retval;
1522 struct task_struct *p; 996 struct task_struct *p;
997 struct pid_namespace *ns;
1523 998
1524 read_lock(&tasklist_lock); 999 ns = current->nsproxy->pid_ns;
1525 p = find_task_by_pid(pid);
1526 1000
1001 read_lock(&tasklist_lock);
1002 p = find_task_by_pid_ns(pid, ns);
1527 retval = -ESRCH; 1003 retval = -ESRCH;
1528 if (p) { 1004 if (p) {
1529 retval = security_task_getpgid(p); 1005 retval = security_task_getpgid(p);
1530 if (!retval) 1006 if (!retval)
1531 retval = process_group(p); 1007 retval = task_pgrp_nr_ns(p, ns);
1532 } 1008 }
1533 read_unlock(&tasklist_lock); 1009 read_unlock(&tasklist_lock);
1534 return retval; 1010 return retval;
@@ -1540,7 +1016,7 @@ asmlinkage long sys_getpgid(pid_t pid)
1540asmlinkage long sys_getpgrp(void) 1016asmlinkage long sys_getpgrp(void)
1541{ 1017{
1542 /* SMP - assuming writes are word atomic this is fine */ 1018 /* SMP - assuming writes are word atomic this is fine */
1543 return process_group(current); 1019 return task_pgrp_vnr(current);
1544} 1020}
1545 1021
1546#endif 1022#endif
@@ -1548,19 +1024,21 @@ asmlinkage long sys_getpgrp(void)
1548asmlinkage long sys_getsid(pid_t pid) 1024asmlinkage long sys_getsid(pid_t pid)
1549{ 1025{
1550 if (!pid) 1026 if (!pid)
1551 return process_session(current); 1027 return task_session_vnr(current);
1552 else { 1028 else {
1553 int retval; 1029 int retval;
1554 struct task_struct *p; 1030 struct task_struct *p;
1031 struct pid_namespace *ns;
1555 1032
1556 read_lock(&tasklist_lock); 1033 ns = current->nsproxy->pid_ns;
1557 p = find_task_by_pid(pid);
1558 1034
1035 read_lock(&tasklist_lock);
1036 p = find_task_by_pid_ns(pid, ns);
1559 retval = -ESRCH; 1037 retval = -ESRCH;
1560 if (p) { 1038 if (p) {
1561 retval = security_task_getsid(p); 1039 retval = security_task_getsid(p);
1562 if (!retval) 1040 if (!retval)
1563 retval = process_session(p); 1041 retval = task_session_nr_ns(p, ns);
1564 } 1042 }
1565 read_unlock(&tasklist_lock); 1043 read_unlock(&tasklist_lock);
1566 return retval; 1044 return retval;
@@ -1587,7 +1065,8 @@ asmlinkage long sys_setsid(void)
1587 * session id and so the check will always fail and make it so 1065 * session id and so the check will always fail and make it so
1588 * init cannot successfully call setsid. 1066 * init cannot successfully call setsid.
1589 */ 1067 */
1590 if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session)) 1068 if (session > 1 && find_task_by_pid_type_ns(PIDTYPE_PGID,
1069 session, &init_pid_ns))
1591 goto out; 1070 goto out;
1592 1071
1593 group_leader->signal->leader = 1; 1072 group_leader->signal->leader = 1;
@@ -1597,7 +1076,7 @@ asmlinkage long sys_setsid(void)
1597 group_leader->signal->tty = NULL; 1076 group_leader->signal->tty = NULL;
1598 spin_unlock(&group_leader->sighand->siglock); 1077 spin_unlock(&group_leader->sighand->siglock);
1599 1078
1600 err = process_group(group_leader); 1079 err = task_pgrp_vnr(group_leader);
1601out: 1080out:
1602 write_unlock_irq(&tasklist_lock); 1081 write_unlock_irq(&tasklist_lock);
1603 return err; 1082 return err;
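/*
 * Sketch of the namespace-aware lookup used in the hunks above
 * (illustrative, not part of this patch): find_task_by_vpid() resolves a
 * pid number in the caller's own pid namespace, which is what the explicit
 * find_task_by_pid_ns() calls above spell out. The helper name below is
 * hypothetical; the lookup must run under tasklist_lock or RCU.
 */
#include <linux/sched.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>

static struct task_struct *example_lookup_vpid(pid_t nr)
{
	struct pid_namespace *ns = current->nsproxy->pid_ns;

	return find_task_by_pid_ns(nr, ns);	/* same as find_task_by_vpid(nr) */
}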
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index b0ec498a18d9..52c7a151e298 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -4,6 +4,10 @@
4 4
5#include <asm/unistd.h> 5#include <asm/unistd.h>
6 6
7/* we can't #include <linux/syscalls.h> here,
8 but we tell gcc not to warn with -Wmissing-prototypes */
9asmlinkage long sys_ni_syscall(void);
10
7/* 11/*
8 * Non-implemented system calls get redirected here. 12 * Non-implemented system calls get redirected here.
9 */ 13 */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 53a456ebf6d5..3b4efbe26445 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,7 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/capability.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
@@ -55,6 +55,8 @@
55#include <asm/stacktrace.h> 55#include <asm/stacktrace.h>
56#endif 56#endif
57 57
58static int deprecated_sysctl_warning(struct __sysctl_args *args);
59
58#if defined(CONFIG_SYSCTL) 60#if defined(CONFIG_SYSCTL)
59 61
60/* External variables not in a header file. */ 62/* External variables not in a header file. */
@@ -63,6 +65,7 @@ extern int print_fatal_signals;
63extern int sysctl_overcommit_memory; 65extern int sysctl_overcommit_memory;
64extern int sysctl_overcommit_ratio; 66extern int sysctl_overcommit_ratio;
65extern int sysctl_panic_on_oom; 67extern int sysctl_panic_on_oom;
68extern int sysctl_oom_kill_allocating_task;
66extern int max_threads; 69extern int max_threads;
67extern int core_uses_pid; 70extern int core_uses_pid;
68extern int suid_dumpable; 71extern int suid_dumpable;
@@ -79,6 +82,19 @@ extern int maps_protect;
79extern int sysctl_stat_interval; 82extern int sysctl_stat_interval;
80extern int audit_argv_kb; 83extern int audit_argv_kb;
81 84
85/* Constants used for minimum and maximum */
86#ifdef CONFIG_DETECT_SOFTLOCKUP
87static int one = 1;
88static int sixty = 60;
89#endif
90
91#ifdef CONFIG_MMU
92static int two = 2;
93#endif
94
95static int zero;
96static int one_hundred = 100;
97
82/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 98/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
83static int maxolduid = 65535; 99static int maxolduid = 65535;
84static int minolduid; 100static int minolduid;
@@ -128,32 +144,29 @@ extern int max_lock_depth;
128 144
129#ifdef CONFIG_SYSCTL_SYSCALL 145#ifdef CONFIG_SYSCTL_SYSCALL
130static int parse_table(int __user *, int, void __user *, size_t __user *, 146static int parse_table(int __user *, int, void __user *, size_t __user *,
131 void __user *, size_t, ctl_table *); 147 void __user *, size_t, struct ctl_table *);
132#endif 148#endif
133 149
134 150
135#ifdef CONFIG_PROC_SYSCTL 151#ifdef CONFIG_PROC_SYSCTL
136static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 152static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
137 void __user *buffer, size_t *lenp, loff_t *ppos); 153 void __user *buffer, size_t *lenp, loff_t *ppos);
138static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp, 154static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp,
139 void __user *buffer, size_t *lenp, loff_t *ppos); 155 void __user *buffer, size_t *lenp, loff_t *ppos);
140#endif 156#endif
141 157
142static ctl_table root_table[]; 158static struct ctl_table root_table[];
143static struct ctl_table_header root_table_header = 159static struct ctl_table_header root_table_header =
144 { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; 160 { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) };
145 161
146static ctl_table kern_table[]; 162static struct ctl_table kern_table[];
147static ctl_table vm_table[]; 163static struct ctl_table vm_table[];
148static ctl_table fs_table[]; 164static struct ctl_table fs_table[];
149static ctl_table debug_table[]; 165static struct ctl_table debug_table[];
150static ctl_table dev_table[]; 166static struct ctl_table dev_table[];
151extern ctl_table random_table[]; 167extern struct ctl_table random_table[];
152#ifdef CONFIG_UNIX98_PTYS
153extern ctl_table pty_table[];
154#endif
155#ifdef CONFIG_INOTIFY_USER 168#ifdef CONFIG_INOTIFY_USER
156extern ctl_table inotify_table[]; 169extern struct ctl_table inotify_table[];
157#endif 170#endif
158 171
159#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 172#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
@@ -165,7 +178,7 @@ extern int lock_stat;
165 178
166/* The default sysctl tables: */ 179/* The default sysctl tables: */
167 180
168static ctl_table root_table[] = { 181static struct ctl_table root_table[] = {
169 { 182 {
170 .ctl_name = CTL_KERN, 183 .ctl_name = CTL_KERN,
171 .procname = "kernel", 184 .procname = "kernel",
@@ -218,18 +231,15 @@ static unsigned long min_wakeup_granularity_ns; /* 0 usecs */
218static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */ 231static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */
219#endif 232#endif
220 233
221static ctl_table kern_table[] = { 234static struct ctl_table kern_table[] = {
222#ifdef CONFIG_SCHED_DEBUG 235#ifdef CONFIG_SCHED_DEBUG
223 { 236 {
224 .ctl_name = CTL_UNNUMBERED, 237 .ctl_name = CTL_UNNUMBERED,
225 .procname = "sched_min_granularity_ns", 238 .procname = "sched_nr_latency",
226 .data = &sysctl_sched_min_granularity, 239 .data = &sysctl_sched_nr_latency,
227 .maxlen = sizeof(unsigned int), 240 .maxlen = sizeof(unsigned int),
228 .mode = 0644, 241 .mode = 0644,
229 .proc_handler = &proc_dointvec_minmax, 242 .proc_handler = &proc_dointvec,
230 .strategy = &sysctl_intvec,
231 .extra1 = &min_sched_granularity_ns,
232 .extra2 = &max_sched_granularity_ns,
233 }, 243 },
234 { 244 {
235 .ctl_name = CTL_UNNUMBERED, 245 .ctl_name = CTL_UNNUMBERED,
@@ -266,38 +276,24 @@ static ctl_table kern_table[] = {
266 }, 276 },
267 { 277 {
268 .ctl_name = CTL_UNNUMBERED, 278 .ctl_name = CTL_UNNUMBERED,
269 .procname = "sched_stat_granularity_ns", 279 .procname = "sched_child_runs_first",
270 .data = &sysctl_sched_stat_granularity, 280 .data = &sysctl_sched_child_runs_first,
271 .maxlen = sizeof(unsigned int),
272 .mode = 0644,
273 .proc_handler = &proc_dointvec_minmax,
274 .strategy = &sysctl_intvec,
275 .extra1 = &min_wakeup_granularity_ns,
276 .extra2 = &max_wakeup_granularity_ns,
277 },
278 {
279 .ctl_name = CTL_UNNUMBERED,
280 .procname = "sched_runtime_limit_ns",
281 .data = &sysctl_sched_runtime_limit,
282 .maxlen = sizeof(unsigned int), 281 .maxlen = sizeof(unsigned int),
283 .mode = 0644, 282 .mode = 0644,
284 .proc_handler = &proc_dointvec_minmax, 283 .proc_handler = &proc_dointvec,
285 .strategy = &sysctl_intvec,
286 .extra1 = &min_sched_granularity_ns,
287 .extra2 = &max_sched_granularity_ns,
288 }, 284 },
289 { 285 {
290 .ctl_name = CTL_UNNUMBERED, 286 .ctl_name = CTL_UNNUMBERED,
291 .procname = "sched_child_runs_first", 287 .procname = "sched_features",
292 .data = &sysctl_sched_child_runs_first, 288 .data = &sysctl_sched_features,
293 .maxlen = sizeof(unsigned int), 289 .maxlen = sizeof(unsigned int),
294 .mode = 0644, 290 .mode = 0644,
295 .proc_handler = &proc_dointvec, 291 .proc_handler = &proc_dointvec,
296 }, 292 },
297 { 293 {
298 .ctl_name = CTL_UNNUMBERED, 294 .ctl_name = CTL_UNNUMBERED,
299 .procname = "sched_features", 295 .procname = "sched_migration_cost",
300 .data = &sysctl_sched_features, 296 .data = &sysctl_sched_migration_cost,
301 .maxlen = sizeof(unsigned int), 297 .maxlen = sizeof(unsigned int),
302 .mode = 0644, 298 .mode = 0644,
303 .proc_handler = &proc_dointvec, 299 .proc_handler = &proc_dointvec,
@@ -368,7 +364,6 @@ static ctl_table kern_table[] = {
368 }, 364 },
369#ifdef CONFIG_PROC_SYSCTL 365#ifdef CONFIG_PROC_SYSCTL
370 { 366 {
371 .ctl_name = KERN_TAINTED,
372 .procname = "tainted", 367 .procname = "tainted",
373 .data = &tainted, 368 .data = &tainted,
374 .maxlen = sizeof(int), 369 .maxlen = sizeof(int),
@@ -376,14 +371,15 @@ static ctl_table kern_table[] = {
376 .proc_handler = &proc_dointvec_taint, 371 .proc_handler = &proc_dointvec_taint,
377 }, 372 },
378#endif 373#endif
374#ifdef CONFIG_SECURITY_CAPABILITIES
379 { 375 {
380 .ctl_name = KERN_CAP_BSET,
381 .procname = "cap-bound", 376 .procname = "cap-bound",
382 .data = &cap_bset, 377 .data = &cap_bset,
383 .maxlen = sizeof(kernel_cap_t), 378 .maxlen = sizeof(kernel_cap_t),
384 .mode = 0600, 379 .mode = 0600,
385 .proc_handler = &proc_dointvec_bset, 380 .proc_handler = &proc_dointvec_bset,
386 }, 381 },
382#endif /* def CONFIG_SECURITY_CAPABILITIES */
387#ifdef CONFIG_BLK_DEV_INITRD 383#ifdef CONFIG_BLK_DEV_INITRD
388 { 384 {
389 .ctl_name = KERN_REALROOTDEV, 385 .ctl_name = KERN_REALROOTDEV,
@@ -517,7 +513,6 @@ static ctl_table kern_table[] = {
517#endif 513#endif
518#ifdef CONFIG_PROC_SYSCTL 514#ifdef CONFIG_PROC_SYSCTL
519 { 515 {
520 .ctl_name = KERN_CADPID,
521 .procname = "cad_pid", 516 .procname = "cad_pid",
522 .data = NULL, 517 .data = NULL,
523 .maxlen = sizeof (int), 518 .maxlen = sizeof (int),
@@ -539,14 +534,6 @@ static ctl_table kern_table[] = {
539 .mode = 0555, 534 .mode = 0555,
540 .child = random_table, 535 .child = random_table,
541 }, 536 },
542#ifdef CONFIG_UNIX98_PTYS
543 {
544 .ctl_name = KERN_PTY,
545 .procname = "pty",
546 .mode = 0555,
547 .child = pty_table,
548 },
549#endif
550 { 537 {
551 .ctl_name = KERN_OVERFLOWUID, 538 .ctl_name = KERN_OVERFLOWUID,
552 .procname = "overflowuid", 539 .procname = "overflowuid",
@@ -653,7 +640,6 @@ static ctl_table kern_table[] = {
653 .proc_handler = &proc_dointvec, 640 .proc_handler = &proc_dointvec,
654 }, 641 },
655 { 642 {
656 .ctl_name = KERN_NMI_WATCHDOG,
657 .procname = "nmi_watchdog", 643 .procname = "nmi_watchdog",
658 .data = &nmi_watchdog_enabled, 644 .data = &nmi_watchdog_enabled,
659 .maxlen = sizeof (int), 645 .maxlen = sizeof (int),
@@ -709,7 +695,6 @@ static ctl_table kern_table[] = {
709#endif 695#endif
710#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) 696#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
711 { 697 {
712 .ctl_name = KERN_ACPI_VIDEO_FLAGS,
713 .procname = "acpi_video_flags", 698 .procname = "acpi_video_flags",
714 .data = &acpi_realmode_flags, 699 .data = &acpi_realmode_flags,
715 .maxlen = sizeof (unsigned long), 700 .maxlen = sizeof (unsigned long),
@@ -727,6 +712,19 @@ static ctl_table kern_table[] = {
727 .proc_handler = &proc_dointvec, 712 .proc_handler = &proc_dointvec,
728 }, 713 },
729#endif 714#endif
715#ifdef CONFIG_DETECT_SOFTLOCKUP
716 {
717 .ctl_name = CTL_UNNUMBERED,
718 .procname = "softlockup_thresh",
719 .data = &softlockup_thresh,
720 .maxlen = sizeof(int),
721 .mode = 0644,
722 .proc_handler = &proc_dointvec_minmax,
723 .strategy = &sysctl_intvec,
724 .extra1 = &one,
725 .extra2 = &sixty,
726 },
727#endif
730#ifdef CONFIG_COMPAT 728#ifdef CONFIG_COMPAT
731 { 729 {
732 .ctl_name = KERN_COMPAT_LOG, 730 .ctl_name = KERN_COMPAT_LOG,
@@ -773,14 +771,7 @@ static ctl_table kern_table[] = {
773 { .ctl_name = 0 } 771 { .ctl_name = 0 }
774}; 772};
775 773
776/* Constants for minimum and maximum testing in vm_table. 774static struct ctl_table vm_table[] = {
777 We use these as one-element integer vectors. */
778static int zero;
779static int two = 2;
780static int one_hundred = 100;
781
782
783static ctl_table vm_table[] = {
784 { 775 {
785 .ctl_name = VM_OVERCOMMIT_MEMORY, 776 .ctl_name = VM_OVERCOMMIT_MEMORY,
786 .procname = "overcommit_memory", 777 .procname = "overcommit_memory",
@@ -798,6 +789,14 @@ static ctl_table vm_table[] = {
798 .proc_handler = &proc_dointvec, 789 .proc_handler = &proc_dointvec,
799 }, 790 },
800 { 791 {
792 .ctl_name = CTL_UNNUMBERED,
793 .procname = "oom_kill_allocating_task",
794 .data = &sysctl_oom_kill_allocating_task,
795 .maxlen = sizeof(sysctl_oom_kill_allocating_task),
796 .mode = 0644,
797 .proc_handler = &proc_dointvec,
798 },
799 {
801 .ctl_name = VM_OVERCOMMIT_RATIO, 800 .ctl_name = VM_OVERCOMMIT_RATIO,
802 .procname = "overcommit_ratio", 801 .procname = "overcommit_ratio",
803 .data = &sysctl_overcommit_ratio, 802 .data = &sysctl_overcommit_ratio,
@@ -830,13 +829,12 @@ static ctl_table vm_table[] = {
830 .data = &vm_dirty_ratio, 829 .data = &vm_dirty_ratio,
831 .maxlen = sizeof(vm_dirty_ratio), 830 .maxlen = sizeof(vm_dirty_ratio),
832 .mode = 0644, 831 .mode = 0644,
833 .proc_handler = &proc_dointvec_minmax, 832 .proc_handler = &dirty_ratio_handler,
834 .strategy = &sysctl_intvec, 833 .strategy = &sysctl_intvec,
835 .extra1 = &zero, 834 .extra1 = &zero,
836 .extra2 = &one_hundred, 835 .extra2 = &one_hundred,
837 }, 836 },
838 { 837 {
839 .ctl_name = VM_DIRTY_WB_CS,
840 .procname = "dirty_writeback_centisecs", 838 .procname = "dirty_writeback_centisecs",
841 .data = &dirty_writeback_interval, 839 .data = &dirty_writeback_interval,
842 .maxlen = sizeof(dirty_writeback_interval), 840 .maxlen = sizeof(dirty_writeback_interval),
@@ -844,7 +842,6 @@ static ctl_table vm_table[] = {
844 .proc_handler = &dirty_writeback_centisecs_handler, 842 .proc_handler = &dirty_writeback_centisecs_handler,
845 }, 843 },
846 { 844 {
847 .ctl_name = VM_DIRTY_EXPIRE_CS,
848 .procname = "dirty_expire_centisecs", 845 .procname = "dirty_expire_centisecs",
849 .data = &dirty_expire_interval, 846 .data = &dirty_expire_interval,
850 .maxlen = sizeof(dirty_expire_interval), 847 .maxlen = sizeof(dirty_expire_interval),
@@ -872,7 +869,6 @@ static ctl_table vm_table[] = {
872 }, 869 },
873#ifdef CONFIG_HUGETLB_PAGE 870#ifdef CONFIG_HUGETLB_PAGE
874 { 871 {
875 .ctl_name = VM_HUGETLB_PAGES,
876 .procname = "nr_hugepages", 872 .procname = "nr_hugepages",
877 .data = &max_huge_pages, 873 .data = &max_huge_pages,
878 .maxlen = sizeof(unsigned long), 874 .maxlen = sizeof(unsigned long),
@@ -897,6 +893,14 @@ static ctl_table vm_table[] = {
897 .mode = 0644, 893 .mode = 0644,
898 .proc_handler = &hugetlb_treat_movable_handler, 894 .proc_handler = &hugetlb_treat_movable_handler,
899 }, 895 },
896 {
897 .ctl_name = CTL_UNNUMBERED,
898 .procname = "hugetlb_dynamic_pool",
899 .data = &hugetlb_dynamic_pool,
900 .maxlen = sizeof(hugetlb_dynamic_pool),
901 .mode = 0644,
902 .proc_handler = &proc_dointvec,
903 },
900#endif 904#endif
901 { 905 {
902 .ctl_name = VM_LOWMEM_RESERVE_RATIO, 906 .ctl_name = VM_LOWMEM_RESERVE_RATIO,
@@ -1053,7 +1057,7 @@ static ctl_table vm_table[] = {
1053 .strategy = &sysctl_string, 1057 .strategy = &sysctl_string,
1054 }, 1058 },
1055#endif 1059#endif
1056#if defined(CONFIG_X86_32) || \ 1060#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
1057 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1061 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
1058 { 1062 {
1059 .ctl_name = VM_VDSO_ENABLED, 1063 .ctl_name = VM_VDSO_ENABLED,
@@ -1074,12 +1078,12 @@ static ctl_table vm_table[] = {
1074}; 1078};
1075 1079
1076#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1080#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1077static ctl_table binfmt_misc_table[] = { 1081static struct ctl_table binfmt_misc_table[] = {
1078 { .ctl_name = 0 } 1082 { .ctl_name = 0 }
1079}; 1083};
1080#endif 1084#endif
1081 1085
1082static ctl_table fs_table[] = { 1086static struct ctl_table fs_table[] = {
1083 { 1087 {
1084 .ctl_name = FS_NRINODE, 1088 .ctl_name = FS_NRINODE,
1085 .procname = "inode-nr", 1089 .procname = "inode-nr",
@@ -1097,7 +1101,6 @@ static ctl_table fs_table[] = {
1097 .proc_handler = &proc_dointvec, 1101 .proc_handler = &proc_dointvec,
1098 }, 1102 },
1099 { 1103 {
1100 .ctl_name = FS_NRFILE,
1101 .procname = "file-nr", 1104 .procname = "file-nr",
1102 .data = &files_stat, 1105 .data = &files_stat,
1103 .maxlen = 3*sizeof(int), 1106 .maxlen = 3*sizeof(int),
@@ -1173,7 +1176,6 @@ static ctl_table fs_table[] = {
1173 .extra2 = &two, 1176 .extra2 = &two,
1174 }, 1177 },
1175 { 1178 {
1176 .ctl_name = FS_AIO_NR,
1177 .procname = "aio-nr", 1179 .procname = "aio-nr",
1178 .data = &aio_nr, 1180 .data = &aio_nr,
1179 .maxlen = sizeof(aio_nr), 1181 .maxlen = sizeof(aio_nr),
@@ -1181,7 +1183,6 @@ static ctl_table fs_table[] = {
1181 .proc_handler = &proc_doulongvec_minmax, 1183 .proc_handler = &proc_doulongvec_minmax,
1182 }, 1184 },
1183 { 1185 {
1184 .ctl_name = FS_AIO_MAX_NR,
1185 .procname = "aio-max-nr", 1186 .procname = "aio-max-nr",
1186 .data = &aio_max_nr, 1187 .data = &aio_max_nr,
1187 .maxlen = sizeof(aio_max_nr), 1188 .maxlen = sizeof(aio_max_nr),
@@ -1220,8 +1221,8 @@ static ctl_table fs_table[] = {
1220 { .ctl_name = 0 } 1221 { .ctl_name = 0 }
1221}; 1222};
1222 1223
1223static ctl_table debug_table[] = { 1224static struct ctl_table debug_table[] = {
1224#ifdef CONFIG_X86 1225#if defined(CONFIG_X86) || defined(CONFIG_PPC)
1225 { 1226 {
1226 .ctl_name = CTL_UNNUMBERED, 1227 .ctl_name = CTL_UNNUMBERED,
1227 .procname = "exception-trace", 1228 .procname = "exception-trace",
@@ -1234,7 +1235,7 @@ static ctl_table debug_table[] = {
1234 { .ctl_name = 0 } 1235 { .ctl_name = 0 }
1235}; 1236};
1236 1237
1237static ctl_table dev_table[] = { 1238static struct ctl_table dev_table[] = {
1238 { .ctl_name = 0 } 1239 { .ctl_name = 0 }
1239}; 1240};
1240 1241
@@ -1350,10 +1351,15 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
1350 if (copy_from_user(&tmp, args, sizeof(tmp))) 1351 if (copy_from_user(&tmp, args, sizeof(tmp)))
1351 return -EFAULT; 1352 return -EFAULT;
1352 1353
1354 error = deprecated_sysctl_warning(&tmp);
1355 if (error)
1356 goto out;
1357
1353 lock_kernel(); 1358 lock_kernel();
1354 error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, 1359 error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
1355 tmp.newval, tmp.newlen); 1360 tmp.newval, tmp.newlen);
1356 unlock_kernel(); 1361 unlock_kernel();
1362out:
1357 return error; 1363 return error;
1358} 1364}
1359#endif /* CONFIG_SYSCTL_SYSCALL */ 1365#endif /* CONFIG_SYSCTL_SYSCALL */
@@ -1374,7 +1380,7 @@ static int test_perm(int mode, int op)
1374 return -EACCES; 1380 return -EACCES;
1375} 1381}
1376 1382
1377int sysctl_perm(ctl_table *table, int op) 1383int sysctl_perm(struct ctl_table *table, int op)
1378{ 1384{
1379 int error; 1385 int error;
1380 error = security_sysctl(table, op); 1386 error = security_sysctl(table, op);
@@ -1387,7 +1393,7 @@ int sysctl_perm(ctl_table *table, int op)
1387static int parse_table(int __user *name, int nlen, 1393static int parse_table(int __user *name, int nlen,
1388 void __user *oldval, size_t __user *oldlenp, 1394 void __user *oldval, size_t __user *oldlenp,
1389 void __user *newval, size_t newlen, 1395 void __user *newval, size_t newlen,
1390 ctl_table *table) 1396 struct ctl_table *table)
1391{ 1397{
1392 int n; 1398 int n;
1393repeat: 1399repeat:
@@ -1418,13 +1424,12 @@ repeat:
1418} 1424}
1419 1425
1420/* Perform the actual read/write of a sysctl table entry. */ 1426/* Perform the actual read/write of a sysctl table entry. */
1421int do_sysctl_strategy (ctl_table *table, 1427int do_sysctl_strategy (struct ctl_table *table,
1422 int __user *name, int nlen, 1428 int __user *name, int nlen,
1423 void __user *oldval, size_t __user *oldlenp, 1429 void __user *oldval, size_t __user *oldlenp,
1424 void __user *newval, size_t newlen) 1430 void __user *newval, size_t newlen)
1425{ 1431{
1426 int op = 0, rc; 1432 int op = 0, rc;
1427 size_t len;
1428 1433
1429 if (oldval) 1434 if (oldval)
1430 op |= 004; 1435 op |= 004;
@@ -1445,25 +1450,10 @@ int do_sysctl_strategy (ctl_table *table,
1445 /* If there is no strategy routine, or if the strategy returns 1450 /* If there is no strategy routine, or if the strategy returns
1446 * zero, proceed with automatic r/w */ 1451 * zero, proceed with automatic r/w */
1447 if (table->data && table->maxlen) { 1452 if (table->data && table->maxlen) {
1448 if (oldval && oldlenp) { 1453 rc = sysctl_data(table, name, nlen, oldval, oldlenp,
1449 if (get_user(len, oldlenp)) 1454 newval, newlen);
1450 return -EFAULT; 1455 if (rc < 0)
1451 if (len) { 1456 return rc;
1452 if (len > table->maxlen)
1453 len = table->maxlen;
1454 if(copy_to_user(oldval, table->data, len))
1455 return -EFAULT;
1456 if(put_user(len, oldlenp))
1457 return -EFAULT;
1458 }
1459 }
1460 if (newval && newlen) {
1461 len = newlen;
1462 if (len > table->maxlen)
1463 len = table->maxlen;
1464 if(copy_from_user(table->data, newval, len))
1465 return -EFAULT;
1466 }
1467 } 1457 }
1468 return 0; 1458 return 0;
1469} 1459}
@@ -1480,7 +1470,9 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1480 1470
1481static __init int sysctl_init(void) 1471static __init int sysctl_init(void)
1482{ 1472{
1473 int err;
1483 sysctl_set_parent(NULL, root_table); 1474 sysctl_set_parent(NULL, root_table);
1475 err = sysctl_check_table(root_table);
1484 return 0; 1476 return 0;
1485} 1477}
1486 1478
@@ -1493,7 +1485,7 @@ core_initcall(sysctl_init);
1493 * Register a sysctl table hierarchy. @table should be a filled in ctl_table 1485 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1494 * array. An entry with a ctl_name of 0 terminates the table. 1486 * array. An entry with a ctl_name of 0 terminates the table.
1495 * 1487 *
1496 * The members of the &ctl_table structure are used as follows: 1488 * The members of the &struct ctl_table structure are used as follows:
1497 * 1489 *
1498 * ctl_name - This is the numeric sysctl value used by sysctl(2). The number 1490 * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
1499 * must be unique within that level of sysctl 1491 * must be unique within that level of sysctl
@@ -1554,7 +1546,7 @@ core_initcall(sysctl_init);
1554 * This routine returns %NULL on a failure to register, and a pointer 1546 * This routine returns %NULL on a failure to register, and a pointer
1555 * to the table header on success. 1547 * to the table header on success.
1556 */ 1548 */
1557struct ctl_table_header *register_sysctl_table(ctl_table * table) 1549struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
1558{ 1550{
1559 struct ctl_table_header *tmp; 1551 struct ctl_table_header *tmp;
1560 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); 1552 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
@@ -1565,6 +1557,10 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table)
1565 tmp->used = 0; 1557 tmp->used = 0;
1566 tmp->unregistering = NULL; 1558 tmp->unregistering = NULL;
1567 sysctl_set_parent(NULL, table); 1559 sysctl_set_parent(NULL, table);
1560 if (sysctl_check_table(tmp->ctl_table)) {
1561 kfree(tmp);
1562 return NULL;
1563 }
1568 spin_lock(&sysctl_lock); 1564 spin_lock(&sysctl_lock);
1569 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); 1565 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
1570 spin_unlock(&sysctl_lock); 1566 spin_unlock(&sysctl_lock);
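/*
 * Usage sketch (illustrative, not part of this patch): registering a small
 * table whose entry is range-checked by proc_dointvec_minmax/sysctl_intvec.
 * All names and values here are hypothetical.
 */
#include <linux/sysctl.h>
#include <linux/errno.h>
#include <linux/init.h>

static int example_value = 10;
static int example_min = 1;
static int example_max = 100;

static struct ctl_table example_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "example_value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_minmax,
		.strategy	= &sysctl_intvec,
		.extra1		= &example_min,
		.extra2		= &example_max,
	},
	{ .ctl_name = 0 }
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_setup(void)
{
	/* returns NULL if allocation fails or sysctl_check_table() rejects it */
	example_header = register_sysctl_table(example_table);
	return example_header ? 0 : -ENOMEM;
}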
@@ -1588,7 +1584,7 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1588} 1584}
1589 1585
1590#else /* !CONFIG_SYSCTL */ 1586#else /* !CONFIG_SYSCTL */
1591struct ctl_table_header *register_sysctl_table(ctl_table * table) 1587struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
1592{ 1588{
1593 return NULL; 1589 return NULL;
1594} 1590}
@@ -1681,7 +1677,7 @@ static int _proc_do_string(void* data, int maxlen, int write,
1681 * 1677 *
1682 * Returns 0 on success. 1678 * Returns 0 on success.
1683 */ 1679 */
1684int proc_dostring(ctl_table *table, int write, struct file *filp, 1680int proc_dostring(struct ctl_table *table, int write, struct file *filp,
1685 void __user *buffer, size_t *lenp, loff_t *ppos) 1681 void __user *buffer, size_t *lenp, loff_t *ppos)
1686{ 1682{
1687 return _proc_do_string(table->data, table->maxlen, write, filp, 1683 return _proc_do_string(table->data, table->maxlen, write, filp,
@@ -1708,7 +1704,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
1708 return 0; 1704 return 0;
1709} 1705}
1710 1706
1711static int __do_proc_dointvec(void *tbl_data, ctl_table *table, 1707static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
1712 int write, struct file *filp, void __user *buffer, 1708 int write, struct file *filp, void __user *buffer,
1713 size_t *lenp, loff_t *ppos, 1709 size_t *lenp, loff_t *ppos,
1714 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 1710 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
@@ -1818,7 +1814,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
1818#undef TMPBUFLEN 1814#undef TMPBUFLEN
1819} 1815}
1820 1816
1821static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, 1817static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp,
1822 void __user *buffer, size_t *lenp, loff_t *ppos, 1818 void __user *buffer, size_t *lenp, loff_t *ppos,
1823 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 1819 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
1824 int write, void *data), 1820 int write, void *data),
@@ -1842,7 +1838,7 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
1842 * 1838 *
1843 * Returns 0 on success. 1839 * Returns 0 on success.
1844 */ 1840 */
1845int proc_dointvec(ctl_table *table, int write, struct file *filp, 1841int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
1846 void __user *buffer, size_t *lenp, loff_t *ppos) 1842 void __user *buffer, size_t *lenp, loff_t *ppos)
1847{ 1843{
1848 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 1844 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
@@ -1878,11 +1874,12 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1878 return 0; 1874 return 0;
1879} 1875}
1880 1876
1877#ifdef CONFIG_SECURITY_CAPABILITIES
1881/* 1878/*
1882 * init may raise the set. 1879 * init may raise the set.
1883 */ 1880 */
1884 1881
1885int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, 1882int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp,
1886 void __user *buffer, size_t *lenp, loff_t *ppos) 1883 void __user *buffer, size_t *lenp, loff_t *ppos)
1887{ 1884{
1888 int op; 1885 int op;
@@ -1891,15 +1888,16 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
1891 return -EPERM; 1888 return -EPERM;
1892 } 1889 }
1893 1890
1894 op = is_init(current) ? OP_SET : OP_AND; 1891 op = is_global_init(current) ? OP_SET : OP_AND;
1895 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 1892 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
1896 do_proc_dointvec_bset_conv,&op); 1893 do_proc_dointvec_bset_conv,&op);
1897} 1894}
1895#endif /* def CONFIG_SECURITY_CAPABILITIES */
1898 1896
1899/* 1897/*
1900 * Taint values can only be increased 1898 * Taint values can only be increased
1901 */ 1899 */
1902static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp, 1900static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp,
1903 void __user *buffer, size_t *lenp, loff_t *ppos) 1901 void __user *buffer, size_t *lenp, loff_t *ppos)
1904{ 1902{
1905 int op; 1903 int op;
@@ -1958,7 +1956,7 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
1958 * 1956 *
1959 * Returns 0 on success. 1957 * Returns 0 on success.
1960 */ 1958 */
1961int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, 1959int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
1962 void __user *buffer, size_t *lenp, loff_t *ppos) 1960 void __user *buffer, size_t *lenp, loff_t *ppos)
1963{ 1961{
1964 struct do_proc_dointvec_minmax_conv_param param = { 1962 struct do_proc_dointvec_minmax_conv_param param = {
@@ -1969,7 +1967,7 @@ int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
1969 do_proc_dointvec_minmax_conv, &param); 1967 do_proc_dointvec_minmax_conv, &param);
1970} 1968}
1971 1969
1972static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write, 1970static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
1973 struct file *filp, 1971 struct file *filp,
1974 void __user *buffer, 1972 void __user *buffer,
1975 size_t *lenp, loff_t *ppos, 1973 size_t *lenp, loff_t *ppos,
@@ -2074,7 +2072,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
2074#undef TMPBUFLEN 2072#undef TMPBUFLEN
2075} 2073}
2076 2074
2077static int do_proc_doulongvec_minmax(ctl_table *table, int write, 2075static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2078 struct file *filp, 2076 struct file *filp,
2079 void __user *buffer, 2077 void __user *buffer,
2080 size_t *lenp, loff_t *ppos, 2078 size_t *lenp, loff_t *ppos,
@@ -2102,7 +2100,7 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write,
2102 * 2100 *
2103 * Returns 0 on success. 2101 * Returns 0 on success.
2104 */ 2102 */
2105int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, 2103int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
2106 void __user *buffer, size_t *lenp, loff_t *ppos) 2104 void __user *buffer, size_t *lenp, loff_t *ppos)
2107{ 2105{
2108 return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); 2106 return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l);
@@ -2126,7 +2124,7 @@ int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
2126 * 2124 *
2127 * Returns 0 on success. 2125 * Returns 0 on success.
2128 */ 2126 */
2129int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, 2127int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2130 struct file *filp, 2128 struct file *filp,
2131 void __user *buffer, 2129 void __user *buffer,
2132 size_t *lenp, loff_t *ppos) 2130 size_t *lenp, loff_t *ppos)
@@ -2219,7 +2217,7 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2219 * 2217 *
2220 * Returns 0 on success. 2218 * Returns 0 on success.
2221 */ 2219 */
2222int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, 2220int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2223 void __user *buffer, size_t *lenp, loff_t *ppos) 2221 void __user *buffer, size_t *lenp, loff_t *ppos)
2224{ 2222{
2225 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2223 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
@@ -2242,7 +2240,7 @@ int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
2242 * 2240 *
2243 * Returns 0 on success. 2241 * Returns 0 on success.
2244 */ 2242 */
2245int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, 2243int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
2246 void __user *buffer, size_t *lenp, loff_t *ppos) 2244 void __user *buffer, size_t *lenp, loff_t *ppos)
2247{ 2245{
2248 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2246 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
@@ -2266,21 +2264,21 @@ int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
2266 * 2264 *
2267 * Returns 0 on success. 2265 * Returns 0 on success.
2268 */ 2266 */
2269int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, 2267int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
2270 void __user *buffer, size_t *lenp, loff_t *ppos) 2268 void __user *buffer, size_t *lenp, loff_t *ppos)
2271{ 2269{
2272 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2270 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
2273 do_proc_dointvec_ms_jiffies_conv, NULL); 2271 do_proc_dointvec_ms_jiffies_conv, NULL);
2274} 2272}
2275 2273
2276static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 2274static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
2277 void __user *buffer, size_t *lenp, loff_t *ppos) 2275 void __user *buffer, size_t *lenp, loff_t *ppos)
2278{ 2276{
2279 struct pid *new_pid; 2277 struct pid *new_pid;
2280 pid_t tmp; 2278 pid_t tmp;
2281 int r; 2279 int r;
2282 2280
2283 tmp = pid_nr(cad_pid); 2281 tmp = pid_nr_ns(cad_pid, current->nsproxy->pid_ns);
2284 2282
2285 r = __do_proc_dointvec(&tmp, table, write, filp, buffer, 2283 r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
2286 lenp, ppos, NULL, NULL); 2284 lenp, ppos, NULL, NULL);
@@ -2297,55 +2295,55 @@ static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
2297 2295
2298#else /* CONFIG_PROC_FS */ 2296#else /* CONFIG_PROC_FS */
2299 2297
2300int proc_dostring(ctl_table *table, int write, struct file *filp, 2298int proc_dostring(struct ctl_table *table, int write, struct file *filp,
2301 void __user *buffer, size_t *lenp, loff_t *ppos) 2299 void __user *buffer, size_t *lenp, loff_t *ppos)
2302{ 2300{
2303 return -ENOSYS; 2301 return -ENOSYS;
2304} 2302}
2305 2303
2306int proc_dointvec(ctl_table *table, int write, struct file *filp, 2304int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2307 void __user *buffer, size_t *lenp, loff_t *ppos) 2305 void __user *buffer, size_t *lenp, loff_t *ppos)
2308{ 2306{
2309 return -ENOSYS; 2307 return -ENOSYS;
2310} 2308}
2311 2309
2312int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, 2310int proc_dointvec_bset(struct ctl_table *table, int write, struct file *filp,
2313 void __user *buffer, size_t *lenp, loff_t *ppos) 2311 void __user *buffer, size_t *lenp, loff_t *ppos)
2314{ 2312{
2315 return -ENOSYS; 2313 return -ENOSYS;
2316} 2314}
2317 2315
2318int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, 2316int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
2319 void __user *buffer, size_t *lenp, loff_t *ppos) 2317 void __user *buffer, size_t *lenp, loff_t *ppos)
2320{ 2318{
2321 return -ENOSYS; 2319 return -ENOSYS;
2322} 2320}
2323 2321
2324int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, 2322int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2325 void __user *buffer, size_t *lenp, loff_t *ppos) 2323 void __user *buffer, size_t *lenp, loff_t *ppos)
2326{ 2324{
2327 return -ENOSYS; 2325 return -ENOSYS;
2328} 2326}
2329 2327
2330int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, 2328int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
2331 void __user *buffer, size_t *lenp, loff_t *ppos) 2329 void __user *buffer, size_t *lenp, loff_t *ppos)
2332{ 2330{
2333 return -ENOSYS; 2331 return -ENOSYS;
2334} 2332}
2335 2333
2336int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, 2334int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
2337 void __user *buffer, size_t *lenp, loff_t *ppos) 2335 void __user *buffer, size_t *lenp, loff_t *ppos)
2338{ 2336{
2339 return -ENOSYS; 2337 return -ENOSYS;
2340} 2338}
2341 2339
2342int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, 2340int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
2343 void __user *buffer, size_t *lenp, loff_t *ppos) 2341 void __user *buffer, size_t *lenp, loff_t *ppos)
2344{ 2342{
2345 return -ENOSYS; 2343 return -ENOSYS;
2346} 2344}
2347 2345
2348int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, 2346int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2349 struct file *filp, 2347 struct file *filp,
2350 void __user *buffer, 2348 void __user *buffer,
2351 size_t *lenp, loff_t *ppos) 2349 size_t *lenp, loff_t *ppos)
@@ -2362,8 +2360,42 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2362 * General sysctl support routines 2360 * General sysctl support routines
2363 */ 2361 */
2364 2362
2363/* The generic sysctl data routine (used if no strategy routine supplied) */
2364int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
2365 void __user *oldval, size_t __user *oldlenp,
2366 void __user *newval, size_t newlen)
2367{
2368 size_t len;
2369
2370 /* Bail out if this entry has no backing variable */
2371 if (!table->data || !table->maxlen)
2372 return -ENOTDIR;
2373
2374 if (oldval && oldlenp) {
2375 if (get_user(len, oldlenp))
2376 return -EFAULT;
2377 if (len) {
2378 if (len > table->maxlen)
2379 len = table->maxlen;
2380 if (copy_to_user(oldval, table->data, len))
2381 return -EFAULT;
2382 if (put_user(len, oldlenp))
2383 return -EFAULT;
2384 }
2385 }
2386
2387 if (newval && newlen) {
2388 if (newlen > table->maxlen)
2389 newlen = table->maxlen;
2390
2391 if (copy_from_user(table->data, newval, newlen))
2392 return -EFAULT;
2393 }
2394 return 1;
2395}
2396
2365/* The generic string strategy routine: */ 2397/* The generic string strategy routine: */
2366int sysctl_string(ctl_table *table, int __user *name, int nlen, 2398int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
2367 void __user *oldval, size_t __user *oldlenp, 2399 void __user *oldval, size_t __user *oldlenp,
2368 void __user *newval, size_t newlen) 2400 void __user *newval, size_t newlen)
2369{ 2401{
@@ -2409,7 +2441,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
2409 * are between the minimum and maximum values given in the arrays 2441 * are between the minimum and maximum values given in the arrays
2410 * table->extra1 and table->extra2, respectively. 2442 * table->extra1 and table->extra2, respectively.
2411 */ 2443 */
2412int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2444int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
2413 void __user *oldval, size_t __user *oldlenp, 2445 void __user *oldval, size_t __user *oldlenp,
2414 void __user *newval, size_t newlen) 2446 void __user *newval, size_t newlen)
2415{ 2447{
@@ -2445,7 +2477,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2445} 2477}
2446 2478
2447/* Strategy function to convert jiffies to seconds */ 2479/* Strategy function to convert jiffies to seconds */
2448int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2480int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
2449 void __user *oldval, size_t __user *oldlenp, 2481 void __user *oldval, size_t __user *oldlenp,
2450 void __user *newval, size_t newlen) 2482 void __user *newval, size_t newlen)
2451{ 2483{
@@ -2479,7 +2511,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2479} 2511}
2480 2512
2481/* Strategy function to convert jiffies to seconds */ 2513/* Strategy function to convert jiffies to seconds */
2482int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2514int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen,
2483 void __user *oldval, size_t __user *oldlenp, 2515 void __user *oldval, size_t __user *oldlenp,
2484 void __user *newval, size_t newlen) 2516 void __user *newval, size_t newlen)
2485{ 2517{
@@ -2519,59 +2551,50 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2519 2551
2520asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 2552asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2521{ 2553{
2522 static int msg_count;
2523 struct __sysctl_args tmp; 2554 struct __sysctl_args tmp;
2524 int name[CTL_MAXNAME]; 2555 int error;
2525 int i;
2526 2556
2527 /* Read in the sysctl name for better debug message logging */
2528 if (copy_from_user(&tmp, args, sizeof(tmp))) 2557 if (copy_from_user(&tmp, args, sizeof(tmp)))
2529 return -EFAULT; 2558 return -EFAULT;
2530 if (tmp.nlen <= 0 || tmp.nlen >= CTL_MAXNAME)
2531 return -ENOTDIR;
2532 for (i = 0; i < tmp.nlen; i++)
2533 if (get_user(name[i], tmp.name + i))
2534 return -EFAULT;
2535 2559
2536 /* Ignore accesses to kernel.version */ 2560 error = deprecated_sysctl_warning(&tmp);
2537 if ((tmp.nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
2538 goto out;
2539 2561
2540 if (msg_count < 5) { 2562 /* If no error reading the parameters then just -ENOSYS ... */
2541 msg_count++; 2563 if (!error)
2542 printk(KERN_INFO 2564 error = -ENOSYS;
2543 "warning: process `%s' used the removed sysctl " 2565
2544 "system call with ", current->comm); 2566 return error;
2545 for (i = 0; i < tmp.nlen; i++) 2567}
2546 printk("%d.", name[i]); 2568
2547 printk("\n"); 2569int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
2548 } 2570 void __user *oldval, size_t __user *oldlenp,
2549out: 2571 void __user *newval, size_t newlen)
2572{
2550 return -ENOSYS; 2573 return -ENOSYS;
2551} 2574}
2552 2575
2553int sysctl_string(ctl_table *table, int __user *name, int nlen, 2576int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
2554 void __user *oldval, size_t __user *oldlenp, 2577 void __user *oldval, size_t __user *oldlenp,
2555 void __user *newval, size_t newlen) 2578 void __user *newval, size_t newlen)
2556{ 2579{
2557 return -ENOSYS; 2580 return -ENOSYS;
2558} 2581}
2559 2582
2560int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2583int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
2561 void __user *oldval, size_t __user *oldlenp, 2584 void __user *oldval, size_t __user *oldlenp,
2562 void __user *newval, size_t newlen) 2585 void __user *newval, size_t newlen)
2563{ 2586{
2564 return -ENOSYS; 2587 return -ENOSYS;
2565} 2588}
2566 2589
2567int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2590int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
2568 void __user *oldval, size_t __user *oldlenp, 2591 void __user *oldval, size_t __user *oldlenp,
2569 void __user *newval, size_t newlen) 2592 void __user *newval, size_t newlen)
2570{ 2593{
2571 return -ENOSYS; 2594 return -ENOSYS;
2572} 2595}
2573 2596
2574int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2597int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen,
2575 void __user *oldval, size_t __user *oldlenp, 2598 void __user *oldval, size_t __user *oldlenp,
2576 void __user *newval, size_t newlen) 2599 void __user *newval, size_t newlen)
2577{ 2600{
@@ -2580,6 +2603,33 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2580 2603
2581#endif /* CONFIG_SYSCTL_SYSCALL */ 2604#endif /* CONFIG_SYSCTL_SYSCALL */
2582 2605
2606static int deprecated_sysctl_warning(struct __sysctl_args *args)
2607{
2608 static int msg_count;
2609 int name[CTL_MAXNAME];
2610 int i;
2611
	if (args->nlen <= 0 || args->nlen >= CTL_MAXNAME)
		return 0;

2612	/* Read in the sysctl name for better debug message logging */
2613	for (i = 0; i < args->nlen; i++)
2614 if (get_user(name[i], args->name + i))
2615 return -EFAULT;
2616
2617 /* Ignore accesses to kernel.version */
2618 if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
2619 return 0;
2620
2621 if (msg_count < 5) {
2622 msg_count++;
2623 printk(KERN_INFO
2624 "warning: process `%s' used the deprecated sysctl "
2625 "system call with ", current->comm);
2626 for (i = 0; i < args->nlen; i++)
2627 printk("%d.", name[i]);
2628 printk("\n");
2629 }
2630 return 0;
2631}
2632
2583/* 2633/*
2584 * No sense putting this after each symbol definition, twice, 2634 * No sense putting this after each symbol definition, twice,
2585 * exception granted :-) 2635 * exception granted :-)
@@ -2597,4 +2647,5 @@ EXPORT_SYMBOL(sysctl_intvec);
2597EXPORT_SYMBOL(sysctl_jiffies); 2647EXPORT_SYMBOL(sysctl_jiffies);
2598EXPORT_SYMBOL(sysctl_ms_jiffies); 2648EXPORT_SYMBOL(sysctl_ms_jiffies);
2599EXPORT_SYMBOL(sysctl_string); 2649EXPORT_SYMBOL(sysctl_string);
2650EXPORT_SYMBOL(sysctl_data);
2600EXPORT_SYMBOL(unregister_sysctl_table); 2651EXPORT_SYMBOL(unregister_sysctl_table);
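
For orientation, here is a minimal sketch (not part of this patch; every name and number below is invented) of how the new generic sysctl_data() strategy routine added above could be wired up. A hypothetical ctl_table entry exposes an int both through /proc/sys via proc_dointvec() and through the binary sysctl(2) interface via sysctl_data(), which simply copies .data in and out subject to .maxlen:

#include <linux/sysctl.h>

static int example_value;			/* invented for illustration */

static struct ctl_table example_table[] = {
	{
		.ctl_name	= 9999,			/* hypothetical binary id */
		.procname	= "example_value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
		.strategy	= &sysctl_data,		/* generic copy of .data */
	},
	{ .ctl_name = 0 }
};

Note that sysctl_repair_table() in the new sysctl_check.c below performs the same assignment automatically for tables that use proc_dointvec but leave .strategy unset.
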
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
new file mode 100644
index 000000000000..3c9ef5a7d575
--- /dev/null
+++ b/kernel/sysctl_check.c
@@ -0,0 +1,1588 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../arch/s390/appldata/appldata.h"
4#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
5#include <linux/sunrpc/debug.h>
6#include <linux/string.h>
7#include <net/ip_vs.h>
8
9struct trans_ctl_table {
10 int ctl_name;
11 const char *procname;
12 struct trans_ctl_table *child;
13};
14
15static struct trans_ctl_table trans_random_table[] = {
16 { RANDOM_POOLSIZE, "poolsize" },
17 { RANDOM_ENTROPY_COUNT, "entropy_avail" },
18 { RANDOM_READ_THRESH, "read_wakeup_threshold" },
19 { RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
20 { RANDOM_BOOT_ID, "boot_id" },
21 { RANDOM_UUID, "uuid" },
22 {}
23};
24
25static struct trans_ctl_table trans_pty_table[] = {
26 { PTY_MAX, "max" },
27 { PTY_NR, "nr" },
28 {}
29};
30
31static struct trans_ctl_table trans_kern_table[] = {
32 { KERN_OSTYPE, "ostype" },
33 { KERN_OSRELEASE, "osrelease" },
34 /* KERN_OSREV not used */
35 { KERN_VERSION, "version" },
36 /* KERN_SECUREMASK not used */
37 /* KERN_PROF not used */
38 { KERN_NODENAME, "hostname" },
39 { KERN_DOMAINNAME, "domainname" },
40
41#ifdef CONFIG_SECURITY_CAPABILITIES
42 { KERN_CAP_BSET, "cap-bound" },
43#endif /* def CONFIG_SECURITY_CAPABILITIES */
44
45 { KERN_PANIC, "panic" },
46 { KERN_REALROOTDEV, "real-root-dev" },
47
48 { KERN_SPARC_REBOOT, "reboot-cmd" },
49 { KERN_CTLALTDEL, "ctrl-alt-del" },
50 { KERN_PRINTK, "printk" },
51
52 /* KERN_NAMETRANS not used */
53 /* KERN_PPC_HTABRECLAIM not used */
54 /* KERN_PPC_ZEROPAGED not used */
55 { KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
56
57 { KERN_MODPROBE, "modprobe" },
58 { KERN_SG_BIG_BUFF, "sg-big-buff" },
59 { KERN_ACCT, "acct" },
60 { KERN_PPC_L2CR, "l2cr" },
61
62 /* KERN_RTSIGNR not used */
63 /* KERN_RTSIGMAX not used */
64
65 { KERN_SHMMAX, "shmmax" },
66 { KERN_MSGMAX, "msgmax" },
67 { KERN_MSGMNB, "msgmnb" },
68	/* KERN_MSGPOOL not used */
69 { KERN_SYSRQ, "sysrq" },
70 { KERN_MAX_THREADS, "threads-max" },
71 { KERN_RANDOM, "random", trans_random_table },
72 { KERN_SHMALL, "shmall" },
73 { KERN_MSGMNI, "msgmni" },
74 { KERN_SEM, "sem" },
75 { KERN_SPARC_STOP_A, "stop-a" },
76 { KERN_SHMMNI, "shmmni" },
77
78 { KERN_OVERFLOWUID, "overflowuid" },
79 { KERN_OVERFLOWGID, "overflowgid" },
80
81 { KERN_HOTPLUG, "hotplug", },
82 { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
83
84 { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
85 { KERN_CORE_USES_PID, "core_uses_pid" },
86 { KERN_TAINTED, "tainted" },
87 { KERN_CADPID, "cad_pid" },
88 { KERN_PIDMAX, "pid_max" },
89 { KERN_CORE_PATTERN, "core_pattern" },
90 { KERN_PANIC_ON_OOPS, "panic_on_oops" },
91 { KERN_HPPA_PWRSW, "soft-power" },
92 { KERN_HPPA_UNALIGNED, "unaligned-trap" },
93
94 { KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
95 { KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
96
97 { KERN_PTY, "pty", trans_pty_table },
98 { KERN_NGROUPS_MAX, "ngroups_max" },
99 { KERN_SPARC_SCONS_PWROFF, "scons_poweroff" },
100 { KERN_HZ_TIMER, "hz_timer" },
101 { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
102 { KERN_BOOTLOADER_TYPE, "bootloader_type" },
103 { KERN_RANDOMIZE, "randomize_va_space" },
104
105 { KERN_SPIN_RETRY, "spin_retry" },
106 { KERN_ACPI_VIDEO_FLAGS, "acpi_video_flags" },
107 { KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
108 { KERN_COMPAT_LOG, "compat-log" },
109 { KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
110 { KERN_NMI_WATCHDOG, "nmi_watchdog" },
111 { KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
112 {}
113};
114
115static struct trans_ctl_table trans_vm_table[] = {
116 { VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
117 { VM_PAGE_CLUSTER, "page-cluster" },
118 { VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
119 { VM_DIRTY_RATIO, "dirty_ratio" },
120 { VM_DIRTY_WB_CS, "dirty_writeback_centisecs" },
121 { VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" },
122 { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" },
123 { VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
124 /* VM_PAGEBUF unused */
125 { VM_HUGETLB_PAGES, "nr_hugepages" },
126 { VM_SWAPPINESS, "swappiness" },
127 { VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
128 { VM_MIN_FREE_KBYTES, "min_free_kbytes" },
129 { VM_MAX_MAP_COUNT, "max_map_count" },
130 { VM_LAPTOP_MODE, "laptop_mode" },
131 { VM_BLOCK_DUMP, "block_dump" },
132 { VM_HUGETLB_GROUP, "hugetlb_shm_group" },
133 { VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
134 { VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
135 /* VM_SWAP_TOKEN_TIMEOUT unused */
136 { VM_DROP_PAGECACHE, "drop_caches" },
137 { VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
138 { VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
139 { VM_MIN_UNMAPPED, "min_unmapped_ratio" },
140 { VM_PANIC_ON_OOM, "panic_on_oom" },
141 { VM_VDSO_ENABLED, "vdso_enabled" },
142 { VM_MIN_SLAB, "min_slab_ratio" },
143 { VM_CMM_PAGES, "cmm_pages" },
144 { VM_CMM_TIMED_PAGES, "cmm_timed_pages" },
145 { VM_CMM_TIMEOUT, "cmm_timeout" },
146
147 {}
148};
149
150static struct trans_ctl_table trans_net_core_table[] = {
151 { NET_CORE_WMEM_MAX, "wmem_max" },
152 { NET_CORE_RMEM_MAX, "rmem_max" },
153 { NET_CORE_WMEM_DEFAULT, "wmem_default" },
154 { NET_CORE_RMEM_DEFAULT, "rmem_default" },
155 /* NET_CORE_DESTROY_DELAY unused */
156 { NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
157 /* NET_CORE_FASTROUTE unused */
158 { NET_CORE_MSG_COST, "message_cost" },
159 { NET_CORE_MSG_BURST, "message_burst" },
160 { NET_CORE_OPTMEM_MAX, "optmem_max" },
161 /* NET_CORE_HOT_LIST_LENGTH unused */
162 /* NET_CORE_DIVERT_VERSION unused */
163 /* NET_CORE_NO_CONG_THRESH unused */
164 /* NET_CORE_NO_CONG unused */
165 /* NET_CORE_LO_CONG unused */
166 /* NET_CORE_MOD_CONG unused */
167 { NET_CORE_DEV_WEIGHT, "dev_weight" },
168 { NET_CORE_SOMAXCONN, "somaxconn" },
169 { NET_CORE_BUDGET, "netdev_budget" },
170 { NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
171 { NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
172 { NET_CORE_WARNINGS, "warnings" },
173 {},
174};
175
176static struct trans_ctl_table trans_net_unix_table[] = {
177 /* NET_UNIX_DESTROY_DELAY unused */
178 /* NET_UNIX_DELETE_DELAY unused */
179 { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
180 {}
181};
182
183static struct trans_ctl_table trans_net_ipv4_route_table[] = {
184 { NET_IPV4_ROUTE_FLUSH, "flush" },
185 { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" },
186 { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" },
187 { NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
188 { NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
189 { NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
190 { NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
191 { NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
192 { NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
193 { NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
194 { NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
195 { NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
196 { NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
197 { NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
198 { NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
199 { NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
200 { NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
201 { NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
202 { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
203 {}
204};
205
206static struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
207 { NET_IPV4_CONF_FORWARDING, "forwarding" },
208 { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
209
210 { NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
211 { NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
212 { NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
213 { NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
214 { NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
215 { NET_IPV4_CONF_RP_FILTER, "rp_filter" },
216 { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
217 { NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
218 { NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
219 { NET_IPV4_CONF_TAG, "tag" },
220 { NET_IPV4_CONF_ARPFILTER, "arp_filter" },
221 { NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
222 { NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
223 { NET_IPV4_CONF_NOPOLICY, "disable_policy" },
224 { NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
225
226 { NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
227 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
228 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
229 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
230 {}
231};
232
233static struct trans_ctl_table trans_net_ipv4_conf_table[] = {
234 { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table },
235 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table },
236 { 0, NULL, trans_net_ipv4_conf_vars_table },
237 {}
238};
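
The trailing { 0, NULL, trans_net_ipv4_conf_vars_table } entry carries neither a binary id nor a procname; judging by the match rules in sysctl_binary_lookup() further down, such entries appear to act as catch-alls, so the per-interface conf directories resolve to the same variable table as "all" and "default". An illustrative comment, not part of the patch:

/*
 * net.ipv4.conf.all.*      -> trans_net_ipv4_conf_vars_table
 * net.ipv4.conf.default.*  -> trans_net_ipv4_conf_vars_table
 * net.ipv4.conf.<device>.* -> trans_net_ipv4_conf_vars_table (via the
 *                             { 0, NULL, ... } catch-all entry)
 */
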
239
240
241static struct trans_ctl_table trans_net_ipv4_vs_table[] = {
242 { NET_IPV4_VS_AMEMTHRESH, "amemthresh" },
243 { NET_IPV4_VS_DEBUG_LEVEL, "debug_level" },
244 { NET_IPV4_VS_AMDROPRATE, "am_droprate" },
245 { NET_IPV4_VS_DROP_ENTRY, "drop_entry" },
246 { NET_IPV4_VS_DROP_PACKET, "drop_packet" },
247 { NET_IPV4_VS_SECURE_TCP, "secure_tcp" },
248 { NET_IPV4_VS_TO_ES, "timeout_established" },
249 { NET_IPV4_VS_TO_SS, "timeout_synsent" },
250 { NET_IPV4_VS_TO_SR, "timeout_synrecv" },
251 { NET_IPV4_VS_TO_FW, "timeout_finwait" },
252 { NET_IPV4_VS_TO_TW, "timeout_timewait" },
253 { NET_IPV4_VS_TO_CL, "timeout_close" },
254 { NET_IPV4_VS_TO_CW, "timeout_closewait" },
255 { NET_IPV4_VS_TO_LA, "timeout_lastack" },
256 { NET_IPV4_VS_TO_LI, "timeout_listen" },
257 { NET_IPV4_VS_TO_SA, "timeout_synack" },
258 { NET_IPV4_VS_TO_UDP, "timeout_udp" },
259 { NET_IPV4_VS_TO_ICMP, "timeout_icmp" },
260 { NET_IPV4_VS_CACHE_BYPASS, "cache_bypass" },
261 { NET_IPV4_VS_EXPIRE_NODEST_CONN, "expire_nodest_conn" },
262 { NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE, "expire_quiescent_template" },
263 { NET_IPV4_VS_SYNC_THRESHOLD, "sync_threshold" },
264 { NET_IPV4_VS_NAT_ICMP_SEND, "nat_icmp_send" },
265 { NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration" },
266 { NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration" },
267 {}
268};
269
270static struct trans_ctl_table trans_net_neigh_vars_table[] = {
271 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
272 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
273 { NET_NEIGH_APP_SOLICIT, "app_solicit" },
274 { NET_NEIGH_RETRANS_TIME, "retrans_time" },
275 { NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
276 { NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
277 { NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
278 { NET_NEIGH_UNRES_QLEN, "unres_qlen" },
279 { NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
280 { NET_NEIGH_ANYCAST_DELAY, "anycast_delay" },
281 { NET_NEIGH_PROXY_DELAY, "proxy_delay" },
282 { NET_NEIGH_LOCKTIME, "locktime" },
283 { NET_NEIGH_GC_INTERVAL, "gc_interval" },
284 { NET_NEIGH_GC_THRESH1, "gc_thresh1" },
285 { NET_NEIGH_GC_THRESH2, "gc_thresh2" },
286 { NET_NEIGH_GC_THRESH3, "gc_thresh3" },
287 { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
288 { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
289 {}
290};
291
292static struct trans_ctl_table trans_net_neigh_table[] = {
293 { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
294 { 0, NULL, trans_net_neigh_vars_table },
295 {}
296};
297
298static struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
299 { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
300
301 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" },
302 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "ip_conntrack_tcp_timeout_syn_recv" },
303 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "ip_conntrack_tcp_timeout_established" },
304 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "ip_conntrack_tcp_timeout_fin_wait" },
305 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "ip_conntrack_tcp_timeout_close_wait" },
306 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "ip_conntrack_tcp_timeout_last_ack" },
307 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "ip_conntrack_tcp_timeout_time_wait" },
308 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "ip_conntrack_tcp_timeout_close" },
309
310 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, "ip_conntrack_udp_timeout" },
311 { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "ip_conntrack_udp_timeout_stream" },
312 { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, "ip_conntrack_icmp_timeout" },
313 { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, "ip_conntrack_generic_timeout" },
314
315 { NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
316 { NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
317 { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "ip_conntrack_tcp_timeout_max_retrans" },
318 { NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
319 { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
320 { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
321
322 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "ip_conntrack_sctp_timeout_closed" },
323 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "ip_conntrack_sctp_timeout_cookie_wait" },
324 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "ip_conntrack_sctp_timeout_cookie_echoed" },
325 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "ip_conntrack_sctp_timeout_established" },
326 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "ip_conntrack_sctp_timeout_shutdown_sent" },
327 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "ip_conntrack_sctp_timeout_shutdown_recd" },
328 { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" },
329
330 { NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
331 { NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
332 {}
333};
334
335static struct trans_ctl_table trans_net_ipv4_table[] = {
336 { NET_IPV4_FORWARD, "ip_forward" },
337 { NET_IPV4_DYNADDR, "ip_dynaddr" },
338
339 { NET_IPV4_CONF, "conf", trans_net_ipv4_conf_table },
340 { NET_IPV4_NEIGH, "neigh", trans_net_neigh_table },
341 { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table },
342 /* NET_IPV4_FIB_HASH unused */
343 { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table },
344 { NET_IPV4_VS, "vs", trans_net_ipv4_vs_table },
345
346 { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
347 { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
348 { NET_IPV4_TCP_SACK, "tcp_sack" },
349 { NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
350 { NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
351 /* NET_IPV4_AUTOCONFIG unused */
352 { NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
353 { NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
354 { NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
355 { NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
356 { NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
357 /* NET_IPV4_TCP_MAX_KA_PROBES unused */
358 { NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
359 { NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
360 { NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
361 { NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
362 { NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
363 /* NET_IPV4_IP_MASQ_DEBUG unused */
364 { NET_TCP_SYNCOOKIES, "tcp_syncookies" },
365 { NET_TCP_STDURG, "tcp_stdurg" },
366 { NET_TCP_RFC1337, "tcp_rfc1337" },
367 /* NET_TCP_SYN_TAILDROP unused */
368 { NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
369 { NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
370 { NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
371 { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
372 /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
373 /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
374 /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
375 /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
376 /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
377 { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
378 { NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
379 { NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
380 /* NET_IPV4_ALWAYS_DEFRAG unused */
381 { NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
382 { NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
383 { NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
384 { NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
385 { NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
386 { NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
387 { NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
388 { NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
389 { NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
390 { NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
391 { NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
392 { NET_TCP_FACK, "tcp_fack" },
393 { NET_TCP_REORDERING, "tcp_reordering" },
394 { NET_TCP_ECN, "tcp_ecn" },
395 { NET_TCP_DSACK, "tcp_dsack" },
396 { NET_TCP_MEM, "tcp_mem" },
397 { NET_TCP_WMEM, "tcp_wmem" },
398 { NET_TCP_RMEM, "tcp_rmem" },
399 { NET_TCP_APP_WIN, "tcp_app_win" },
400 { NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
401 { NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
402 { NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
403 { NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
404 { NET_TCP_TW_REUSE, "tcp_tw_reuse" },
405 { NET_TCP_FRTO, "tcp_frto" },
406 { NET_TCP_LOW_LATENCY, "tcp_low_latency" },
407 { NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
408 { NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
409 { NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
410 /* NET_TCP_DEFAULT_WIN_SCALE unused */
411 { NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
412 { NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
413 /* NET_TCP_BIC_BETA unused */
414 { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
415 { NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
416 { NET_TCP_ABC, "tcp_abc" },
417 { NET_IPV4_IPFRAG_MAX_DIST, "ipfrag_max_dist" },
418 { NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
419 { NET_TCP_BASE_MSS, "tcp_base_mss" },
420 { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
421 { NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
422 { NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
423 { NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
424 { NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
425 { NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
426 { NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
427 { NET_TCP_AVAIL_CONG_CONTROL, "tcp_available_congestion_control" },
428 { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
429 { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
430 { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
431 { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
432 {}
433};
434
435static struct trans_ctl_table trans_net_ipx_table[] = {
436 { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
437 /* NET_IPX_FORWARDING unused */
438 {}
439};
440
441static struct trans_ctl_table trans_net_atalk_table[] = {
442 { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
443 { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
444 { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
445 { NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
446 {},
447};
448
449static struct trans_ctl_table trans_net_netrom_table[] = {
450 { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
451 { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
452 { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
453 { NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
454 { NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
455 { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
456 { NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
457 { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
458 { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
459 { NET_NETROM_ROUTING_CONTROL, "routing_control" },
460 { NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
461 { NET_NETROM_RESET, "reset" },
462 {}
463};
464
465static struct trans_ctl_table trans_net_ax25_table[] = {
466 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
467 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
468 { NET_AX25_BACKOFF_TYPE, "backoff_type" },
469 { NET_AX25_CONNECT_MODE, "connect_mode" },
470 { NET_AX25_STANDARD_WINDOW, "standard_window_size" },
471 { NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
472 { NET_AX25_T1_TIMEOUT, "t1_timeout" },
473 { NET_AX25_T2_TIMEOUT, "t2_timeout" },
474 { NET_AX25_T3_TIMEOUT, "t3_timeout" },
475 { NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
476 { NET_AX25_N2, "maximum_retry_count" },
477 { NET_AX25_PACLEN, "maximum_packet_length" },
478 { NET_AX25_PROTOCOL, "protocol" },
479 { NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
480 {}
481};
482
483static struct trans_ctl_table trans_net_bridge_table[] = {
484 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
485 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
486 { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" },
487 { NET_BRIDGE_NF_FILTER_VLAN_TAGGED, "bridge-nf-filter-vlan-tagged" },
488 { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, "bridge-nf-filter-pppoe-tagged" },
489 {}
490};
491
492static struct trans_ctl_table trans_net_rose_table[] = {
493 { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
494 { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
495 { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
496 { NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
497 { NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
498 { NET_ROSE_ROUTING_CONTROL, "routing_control" },
499 { NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
500 { NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
501 { NET_ROSE_WINDOW_SIZE, "window_size" },
502 { NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
503 {}
504};
505
506static struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
507 { NET_IPV6_FORWARDING, "forwarding" },
508 { NET_IPV6_HOP_LIMIT, "hop_limit" },
509 { NET_IPV6_MTU, "mtu" },
510 { NET_IPV6_ACCEPT_RA, "accept_ra" },
511 { NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
512 { NET_IPV6_AUTOCONF, "autoconf" },
513 { NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
514 { NET_IPV6_RTR_SOLICITS, "router_solicitations" },
515 { NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
516 { NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
517 { NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
518 { NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
519 { NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
520 { NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
521 { NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
522 { NET_IPV6_MAX_ADDRESSES, "max_addresses" },
523 { NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
524 { NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
525 { NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
526 { NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
527 { NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
528 { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
529 { NET_IPV6_PROXY_NDP, "proxy_ndp" },
530 { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
531 {}
532};
533
534static struct trans_ctl_table trans_net_ipv6_conf_table[] = {
535 { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table },
536 { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table },
537 { 0, NULL, trans_net_ipv6_conf_var_table },
538 {}
539};
540
541static struct trans_ctl_table trans_net_ipv6_route_table[] = {
542 { NET_IPV6_ROUTE_FLUSH, "flush" },
543 { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
544 { NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
545 { NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
546 { NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
547 { NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
548 { NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
549 { NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
550 { NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
551 { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
552 {}
553};
554
555static struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
556 { NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
557 {}
558};
559
560static struct trans_ctl_table trans_net_ipv6_table[] = {
561 { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table },
562 { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table },
563 { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table },
564 { NET_IPV6_ICMP, "icmp", trans_net_ipv6_icmp_table },
565 { NET_IPV6_BINDV6ONLY, "bindv6only" },
566 { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
567 { NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
568 { NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
569 { NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
570 { NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
571 { 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
572 {}
573};
574
575static struct trans_ctl_table trans_net_x25_table[] = {
576 { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
577 { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
578 { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
579 { NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
580 { NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
581 { NET_X25_FORWARD, "x25_forward" },
582 {}
583};
584
585static struct trans_ctl_table trans_net_tr_table[] = {
586 { NET_TR_RIF_TIMEOUT, "rif_timeout" },
587 {}
588};
589
590
591static struct trans_ctl_table trans_net_decnet_conf_vars[] = {
592 { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
593 { NET_DECNET_CONF_DEV_PRIORITY, "priority" },
594 { NET_DECNET_CONF_DEV_T2, "t2" },
595 { NET_DECNET_CONF_DEV_T3, "t3" },
596 {}
597};
598
599static struct trans_ctl_table trans_net_decnet_conf[] = {
600 { 0, NULL, trans_net_decnet_conf_vars },
601 {}
602};
603
604static struct trans_ctl_table trans_net_decnet_table[] = {
605 { NET_DECNET_CONF, "conf", trans_net_decnet_conf },
606 { NET_DECNET_NODE_ADDRESS, "node_address" },
607 { NET_DECNET_NODE_NAME, "node_name" },
608 { NET_DECNET_DEFAULT_DEVICE, "default_device" },
609 { NET_DECNET_TIME_WAIT, "time_wait" },
610 { NET_DECNET_DN_COUNT, "dn_count" },
611 { NET_DECNET_DI_COUNT, "di_count" },
612 { NET_DECNET_DR_COUNT, "dr_count" },
613 { NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
614 { NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
615 { NET_DECNET_MEM, "decnet_mem" },
616 { NET_DECNET_RMEM, "decnet_rmem" },
617 { NET_DECNET_WMEM, "decnet_wmem" },
618 { NET_DECNET_DEBUG_LEVEL, "debug" },
619 {}
620};
621
622static struct trans_ctl_table trans_net_sctp_table[] = {
623 { NET_SCTP_RTO_INITIAL, "rto_initial" },
624 { NET_SCTP_RTO_MIN, "rto_min" },
625 { NET_SCTP_RTO_MAX, "rto_max" },
626 { NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
627 { NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
628 { NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
629 { NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
630 { NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
631 { NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
632 { NET_SCTP_HB_INTERVAL, "hb_interval" },
633 { NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
634 { NET_SCTP_MAX_BURST, "max_burst" },
635 { NET_SCTP_ADDIP_ENABLE, "addip_enable" },
636 { NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
637 { NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
638 { NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
639 { NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
640 {}
641};
642
643static struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
644 { NET_LLC2_ACK_TIMEOUT, "ack" },
645 { NET_LLC2_P_TIMEOUT, "p" },
646 { NET_LLC2_REJ_TIMEOUT, "rej" },
647 { NET_LLC2_BUSY_TIMEOUT, "busy" },
648 {}
649};
650
651static struct trans_ctl_table trans_net_llc_station_table[] = {
652 { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
653 {}
654};
655
656static struct trans_ctl_table trans_net_llc_llc2_table[] = {
657 { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table },
658 {}
659};
660
661static struct trans_ctl_table trans_net_llc_table[] = {
662 { NET_LLC2, "llc2", trans_net_llc_llc2_table },
663 { NET_LLC_STATION, "station", trans_net_llc_station_table },
664 {}
665};
666
667static struct trans_ctl_table trans_net_netfilter_table[] = {
668 { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
669 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" },
670 { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" },
671 { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "nf_conntrack_tcp_timeout_established" },
672 { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "nf_conntrack_tcp_timeout_fin_wait" },
673 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "nf_conntrack_tcp_timeout_close_wait" },
674 { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "nf_conntrack_tcp_timeout_last_ack" },
675 { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "nf_conntrack_tcp_timeout_time_wait" },
676 { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "nf_conntrack_tcp_timeout_close" },
677 { NET_NF_CONNTRACK_UDP_TIMEOUT, "nf_conntrack_udp_timeout" },
678 { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "nf_conntrack_udp_timeout_stream" },
679 { NET_NF_CONNTRACK_ICMP_TIMEOUT, "nf_conntrack_icmp_timeout" },
680 { NET_NF_CONNTRACK_GENERIC_TIMEOUT, "nf_conntrack_generic_timeout" },
681 { NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
682 { NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
683 { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "nf_conntrack_tcp_timeout_max_retrans" },
684 { NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
685 { NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
686 { NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
687 { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "nf_conntrack_sctp_timeout_closed" },
688 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "nf_conntrack_sctp_timeout_cookie_wait" },
689 { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "nf_conntrack_sctp_timeout_cookie_echoed" },
690 { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "nf_conntrack_sctp_timeout_established" },
691 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "nf_conntrack_sctp_timeout_shutdown_sent" },
692 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "nf_conntrack_sctp_timeout_shutdown_recd" },
693 { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "nf_conntrack_sctp_timeout_shutdown_ack_sent" },
694 { NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
695 { NET_NF_CONNTRACK_ICMPV6_TIMEOUT, "nf_conntrack_icmpv6_timeout" },
696 { NET_NF_CONNTRACK_FRAG6_TIMEOUT, "nf_conntrack_frag6_timeout" },
697 { NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
698 { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
699 { NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
700
701 {}
702};
703
704static struct trans_ctl_table trans_net_dccp_table[] = {
705 { NET_DCCP_DEFAULT, "default" },
706 {}
707};
708
709static struct trans_ctl_table trans_net_irda_table[] = {
710 { NET_IRDA_DISCOVERY, "discovery" },
711 { NET_IRDA_DEVNAME, "devname" },
712 { NET_IRDA_DEBUG, "debug" },
713 { NET_IRDA_FAST_POLL, "fast_poll_increase" },
714 { NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
715 { NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
716 { NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
717 { NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
718 { NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
719 { NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
720 { NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
721 { NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
722 { NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
723 { NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
724 {}
725};
726
727static struct trans_ctl_table trans_net_table[] = {
728 { NET_CORE, "core", trans_net_core_table },
729 /* NET_ETHER not used */
730 /* NET_802 not used */
731 { NET_UNIX, "unix", trans_net_unix_table },
732 { NET_IPV4, "ipv4", trans_net_ipv4_table },
733 { NET_IPX, "ipx", trans_net_ipx_table },
734 { NET_ATALK, "atalk", trans_net_atalk_table },
735 { NET_NETROM, "netrom", trans_net_netrom_table },
736 { NET_AX25, "ax25", trans_net_ax25_table },
737 { NET_BRIDGE, "bridge", trans_net_bridge_table },
738 { NET_ROSE, "rose", trans_net_rose_table },
739 { NET_IPV6, "ipv6", trans_net_ipv6_table },
740 { NET_X25, "x25", trans_net_x25_table },
741 { NET_TR, "tr", trans_net_tr_table },
742 { NET_DECNET, "decnet", trans_net_decnet_table },
743 /* NET_ECONET not used */
744 { NET_SCTP, "sctp", trans_net_sctp_table },
745 { NET_LLC, "llc", trans_net_llc_table },
746 { NET_NETFILTER, "netfilter", trans_net_netfilter_table },
747 { NET_DCCP, "dccp", trans_net_dccp_table },
748 { NET_IRDA, "irda", trans_net_irda_table },
749 { 2089, "nf_conntrack_max" },
750 {}
751};
752
753static struct trans_ctl_table trans_fs_quota_table[] = {
754 { FS_DQ_LOOKUPS, "lookups" },
755 { FS_DQ_DROPS, "drops" },
756 { FS_DQ_READS, "reads" },
757 { FS_DQ_WRITES, "writes" },
758 { FS_DQ_CACHE_HITS, "cache_hits" },
759 { FS_DQ_ALLOCATED, "allocated_dquots" },
760 { FS_DQ_FREE, "free_dquots" },
761 { FS_DQ_SYNCS, "syncs" },
762 { FS_DQ_WARNINGS, "warnings" },
763 {}
764};
765
766static struct trans_ctl_table trans_fs_xfs_table[] = {
767 { XFS_RESTRICT_CHOWN, "restrict_chown" },
768 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
769 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
770 { XFS_PANIC_MASK, "panic_mask" },
771
772 { XFS_ERRLEVEL, "error_level" },
773 { XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
774 { XFS_INHERIT_SYNC, "inherit_sync" },
775 { XFS_INHERIT_NODUMP, "inherit_nodump" },
776 { XFS_INHERIT_NOATIME, "inherit_noatime" },
777 { XFS_BUF_TIMER, "xfsbufd_centisecs" },
778 { XFS_BUF_AGE, "age_buffer_centisecs" },
779 { XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
780 { XFS_ROTORSTEP, "rotorstep" },
781 { XFS_INHERIT_NODFRG, "inherit_nodefrag" },
782 { XFS_FILESTREAM_TIMER, "filestream_centisecs" },
783 { XFS_STATS_CLEAR, "stats_clear" },
784 {}
785};
786
787static struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
788 { 1, "hb_ctl_path" },
789 {}
790};
791
792static struct trans_ctl_table trans_fs_ocfs2_table[] = {
793 { 1, "nm", trans_fs_ocfs2_nm_table },
794 {}
795};
796
797static struct trans_ctl_table trans_inotify_table[] = {
798 { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
799 { INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
800 { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
801 {}
802};
803
804static struct trans_ctl_table trans_fs_table[] = {
805 { FS_NRINODE, "inode-nr" },
806 { FS_STATINODE, "inode-state" },
807 /* FS_MAXINODE unused */
808 /* FS_NRDQUOT unused */
809 /* FS_MAXDQUOT unused */
810 { FS_NRFILE, "file-nr" },
811 { FS_MAXFILE, "file-max" },
812 { FS_DENTRY, "dentry-state" },
813 /* FS_NRSUPER unused */
814	/* FS_MAXSUPER unused */
815 { FS_OVERFLOWUID, "overflowuid" },
816 { FS_OVERFLOWGID, "overflowgid" },
817 { FS_LEASES, "leases-enable" },
818 { FS_DIR_NOTIFY, "dir-notify-enable" },
819 { FS_LEASE_TIME, "lease-break-time" },
820 { FS_DQSTATS, "quota", trans_fs_quota_table },
821 { FS_XFS, "xfs", trans_fs_xfs_table },
822 { FS_AIO_NR, "aio-nr" },
823 { FS_AIO_MAX_NR, "aio-max-nr" },
824 { FS_INOTIFY, "inotify", trans_inotify_table },
825 { FS_OCFS2, "ocfs2", trans_fs_ocfs2_table },
826 { KERN_SETUID_DUMPABLE, "suid_dumpable" },
827 {}
828};
829
830static struct trans_ctl_table trans_debug_table[] = {
831 {}
832};
833
834static struct trans_ctl_table trans_cdrom_table[] = {
835 { DEV_CDROM_INFO, "info" },
836 { DEV_CDROM_AUTOCLOSE, "autoclose" },
837 { DEV_CDROM_AUTOEJECT, "autoeject" },
838 { DEV_CDROM_DEBUG, "debug" },
839 { DEV_CDROM_LOCK, "lock" },
840 { DEV_CDROM_CHECK_MEDIA, "check_media" },
841 {}
842};
843
844static struct trans_ctl_table trans_ipmi_table[] = {
845 { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
846 {}
847};
848
849static struct trans_ctl_table trans_mac_hid_files[] = {
850 /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
851 /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
852 { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
853 { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
854 { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
855 /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
856 {}
857};
858
859static struct trans_ctl_table trans_raid_table[] = {
860 { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
861 { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
862 {}
863};
864
865static struct trans_ctl_table trans_scsi_table[] = {
866 { DEV_SCSI_LOGGING_LEVEL, "logging_level" },
867 {}
868};
869
870static struct trans_ctl_table trans_parport_default_table[] = {
871 { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" },
872 { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" },
873 {}
874};
875
876static struct trans_ctl_table trans_parport_device_table[] = {
877 { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" },
878 {}
879};
880
881static struct trans_ctl_table trans_parport_devices_table[] = {
882 { DEV_PARPORT_DEVICES_ACTIVE, "active" },
883 { 0, NULL, trans_parport_device_table },
884 {}
885};
886
887static struct trans_ctl_table trans_parport_parport_table[] = {
888 { DEV_PARPORT_SPINTIME, "spintime" },
889 { DEV_PARPORT_BASE_ADDR, "base-addr" },
890 { DEV_PARPORT_IRQ, "irq" },
891 { DEV_PARPORT_DMA, "dma" },
892 { DEV_PARPORT_MODES, "modes" },
893 { DEV_PARPORT_DEVICES, "devices", trans_parport_devices_table },
894 { DEV_PARPORT_AUTOPROBE, "autoprobe" },
895 { DEV_PARPORT_AUTOPROBE + 1, "autoprobe0" },
896 { DEV_PARPORT_AUTOPROBE + 2, "autoprobe1" },
897 { DEV_PARPORT_AUTOPROBE + 3, "autoprobe2" },
898 { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" },
899 {}
900};
901static struct trans_ctl_table trans_parport_table[] = {
902 { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table },
903 { 0, NULL, trans_parport_parport_table },
904 {}
905};
906
907static struct trans_ctl_table trans_dev_table[] = {
908 { DEV_CDROM, "cdrom", trans_cdrom_table },
909 /* DEV_HWMON unused */
910 { DEV_PARPORT, "parport", trans_parport_table },
911 { DEV_RAID, "raid", trans_raid_table },
912 { DEV_MAC_HID, "mac_hid", trans_mac_hid_files },
913 { DEV_SCSI, "scsi", trans_scsi_table },
914 { DEV_IPMI, "ipmi", trans_ipmi_table },
915 {}
916};
917
918static struct trans_ctl_table trans_bus_isa_table[] = {
919 { BUS_ISA_MEM_BASE, "membase" },
920 { BUS_ISA_PORT_BASE, "portbase" },
921 { BUS_ISA_PORT_SHIFT, "portshift" },
922 {}
923};
924
925static struct trans_ctl_table trans_bus_table[] = {
926 { CTL_BUS_ISA, "isa", trans_bus_isa_table },
927 {}
928};
929
930static struct trans_ctl_table trans_arlan_conf_table0[] = {
931 { 1, "spreadingCode" },
932 { 2, "channelNumber" },
933 { 3, "scramblingDisable" },
934 { 4, "txAttenuation" },
935 { 5, "systemId" },
936 { 6, "maxDatagramSize" },
937 { 7, "maxFrameSize" },
938 { 8, "maxRetries" },
939 { 9, "receiveMode" },
940 { 10, "priority" },
941 { 11, "rootOrRepeater" },
942 { 12, "SID" },
943 { 13, "registrationMode" },
944 { 14, "registrationFill" },
945 { 15, "localTalkAddress" },
946 { 16, "codeFormat" },
947 { 17, "numChannels" },
948 { 18, "channel1" },
949 { 19, "channel2" },
950 { 20, "channel3" },
951 { 21, "channel4" },
952 { 22, "txClear" },
953 { 23, "txRetries" },
954 { 24, "txRouting" },
955 { 25, "txScrambled" },
956 { 26, "rxParameter" },
957 { 27, "txTimeoutMs" },
958 { 28, "waitCardTimeout" },
959 { 29, "channelSet" },
960 { 30, "name" },
961 { 31, "waitTime" },
962 { 32, "lParameter" },
963 { 33, "_15" },
964 { 34, "headerSize" },
965 { 36, "tx_delay_ms" },
966 { 37, "retries" },
967 { 38, "ReTransmitPacketMaxSize" },
968 { 39, "waitReTransmitPacketMaxSize" },
969 { 40, "fastReTransCount" },
970 { 41, "driverRetransmissions" },
971 { 42, "txAckTimeoutMs" },
972 { 43, "registrationInterrupts" },
973 { 44, "hardwareType" },
974 { 45, "radioType" },
975 { 46, "writeEEPROM" },
976 { 47, "writeRadioType" },
977 { 48, "entry_exit_debug" },
978 { 49, "debug" },
979 { 50, "in_speed" },
980 { 51, "out_speed" },
981 { 52, "in_speed10" },
982 { 53, "out_speed10" },
983 { 54, "in_speed_max" },
984 { 55, "out_speed_max" },
985 { 56, "measure_rate" },
986 { 57, "pre_Command_Wait" },
987 { 58, "rx_tweak1" },
988 { 59, "rx_tweak2" },
989 { 60, "tx_queue_len" },
990
991 { 150, "arlan0-txRing" },
992 { 151, "arlan0-rxRing" },
993 { 152, "arlan0-18" },
994 { 153, "arlan0-ring" },
995 { 154, "arlan0-shm-cpy" },
996 { 155, "config0" },
997 { 156, "reset0" },
998 {}
999};
1000
1001static struct trans_ctl_table trans_arlan_conf_table1[] = {
1002 { 1, "spreadingCode" },
1003 { 2, "channelNumber" },
1004 { 3, "scramblingDisable" },
1005 { 4, "txAttenuation" },
1006 { 5, "systemId" },
1007 { 6, "maxDatagramSize" },
1008 { 7, "maxFrameSize" },
1009 { 8, "maxRetries" },
1010 { 9, "receiveMode" },
1011 { 10, "priority" },
1012 { 11, "rootOrRepeater" },
1013 { 12, "SID" },
1014 { 13, "registrationMode" },
1015 { 14, "registrationFill" },
1016 { 15, "localTalkAddress" },
1017 { 16, "codeFormat" },
1018 { 17, "numChannels" },
1019 { 18, "channel1" },
1020 { 19, "channel2" },
1021 { 20, "channel3" },
1022 { 21, "channel4" },
1023 { 22, "txClear" },
1024 { 23, "txRetries" },
1025 { 24, "txRouting" },
1026 { 25, "txScrambled" },
1027 { 26, "rxParameter" },
1028 { 27, "txTimeoutMs" },
1029 { 28, "waitCardTimeout" },
1030 { 29, "channelSet" },
1031 { 30, "name" },
1032 { 31, "waitTime" },
1033 { 32, "lParameter" },
1034 { 33, "_15" },
1035 { 34, "headerSize" },
1036 { 36, "tx_delay_ms" },
1037 { 37, "retries" },
1038 { 38, "ReTransmitPacketMaxSize" },
1039 { 39, "waitReTransmitPacketMaxSize" },
1040 { 40, "fastReTransCount" },
1041 { 41, "driverRetransmissions" },
1042 { 42, "txAckTimeoutMs" },
1043 { 43, "registrationInterrupts" },
1044 { 44, "hardwareType" },
1045 { 45, "radioType" },
1046 { 46, "writeEEPROM" },
1047 { 47, "writeRadioType" },
1048 { 48, "entry_exit_debug" },
1049 { 49, "debug" },
1050 { 50, "in_speed" },
1051 { 51, "out_speed" },
1052 { 52, "in_speed10" },
1053 { 53, "out_speed10" },
1054 { 54, "in_speed_max" },
1055 { 55, "out_speed_max" },
1056 { 56, "measure_rate" },
1057 { 57, "pre_Command_Wait" },
1058 { 58, "rx_tweak1" },
1059 { 59, "rx_tweak2" },
1060 { 60, "tx_queue_len" },
1061
1062 { 150, "arlan1-txRing" },
1063 { 151, "arlan1-rxRing" },
1064 { 152, "arlan1-18" },
1065 { 153, "arlan1-ring" },
1066 { 154, "arlan1-shm-cpy" },
1067 { 155, "config1" },
1068 { 156, "reset1" },
1069 {}
1070};
1071
1072static struct trans_ctl_table trans_arlan_conf_table2[] = {
1073 { 1, "spreadingCode" },
1074 { 2, "channelNumber" },
1075 { 3, "scramblingDisable" },
1076 { 4, "txAttenuation" },
1077 { 5, "systemId" },
1078 { 6, "maxDatagramSize" },
1079 { 7, "maxFrameSize" },
1080 { 8, "maxRetries" },
1081 { 9, "receiveMode" },
1082 { 10, "priority" },
1083 { 11, "rootOrRepeater" },
1084 { 12, "SID" },
1085 { 13, "registrationMode" },
1086 { 14, "registrationFill" },
1087 { 15, "localTalkAddress" },
1088 { 16, "codeFormat" },
1089 { 17, "numChannels" },
1090 { 18, "channel1" },
1091 { 19, "channel2" },
1092 { 20, "channel3" },
1093 { 21, "channel4" },
1094 { 22, "txClear" },
1095 { 23, "txRetries" },
1096 { 24, "txRouting" },
1097 { 25, "txScrambled" },
1098 { 26, "rxParameter" },
1099 { 27, "txTimeoutMs" },
1100 { 28, "waitCardTimeout" },
1101 { 29, "channelSet" },
1102 { 30, "name" },
1103 { 31, "waitTime" },
1104 { 32, "lParameter" },
1105 { 33, "_15" },
1106 { 34, "headerSize" },
1107 { 36, "tx_delay_ms" },
1108 { 37, "retries" },
1109 { 38, "ReTransmitPacketMaxSize" },
1110 { 39, "waitReTransmitPacketMaxSize" },
1111 { 40, "fastReTransCount" },
1112 { 41, "driverRetransmissions" },
1113 { 42, "txAckTimeoutMs" },
1114 { 43, "registrationInterrupts" },
1115 { 44, "hardwareType" },
1116 { 45, "radioType" },
1117 { 46, "writeEEPROM" },
1118 { 47, "writeRadioType" },
1119 { 48, "entry_exit_debug" },
1120 { 49, "debug" },
1121 { 50, "in_speed" },
1122 { 51, "out_speed" },
1123 { 52, "in_speed10" },
1124 { 53, "out_speed10" },
1125 { 54, "in_speed_max" },
1126 { 55, "out_speed_max" },
1127 { 56, "measure_rate" },
1128 { 57, "pre_Command_Wait" },
1129 { 58, "rx_tweak1" },
1130 { 59, "rx_tweak2" },
1131 { 60, "tx_queue_len" },
1132
1133 { 150, "arlan2-txRing" },
1134 { 151, "arlan2-rxRing" },
1135 { 152, "arlan2-18" },
1136 { 153, "arlan2-ring" },
1137 { 154, "arlan2-shm-cpy" },
1138 { 155, "config2" },
1139 { 156, "reset2" },
1140 {}
1141};
1142
1143static struct trans_ctl_table trans_arlan_conf_table3[] = {
1144 { 1, "spreadingCode" },
1145 { 2, "channelNumber" },
1146 { 3, "scramblingDisable" },
1147 { 4, "txAttenuation" },
1148 { 5, "systemId" },
1149 { 6, "maxDatagramSize" },
1150 { 7, "maxFrameSize" },
1151 { 8, "maxRetries" },
1152 { 9, "receiveMode" },
1153 { 10, "priority" },
1154 { 11, "rootOrRepeater" },
1155 { 12, "SID" },
1156 { 13, "registrationMode" },
1157 { 14, "registrationFill" },
1158 { 15, "localTalkAddress" },
1159 { 16, "codeFormat" },
1160 { 17, "numChannels" },
1161 { 18, "channel1" },
1162 { 19, "channel2" },
1163 { 20, "channel3" },
1164 { 21, "channel4" },
1165 { 22, "txClear" },
1166 { 23, "txRetries" },
1167 { 24, "txRouting" },
1168 { 25, "txScrambled" },
1169 { 26, "rxParameter" },
1170 { 27, "txTimeoutMs" },
1171 { 28, "waitCardTimeout" },
1172 { 29, "channelSet" },
1173 { 30, "name" },
1174 { 31, "waitTime" },
1175 { 32, "lParameter" },
1176 { 33, "_15" },
1177 { 34, "headerSize" },
1178 { 36, "tx_delay_ms" },
1179 { 37, "retries" },
1180 { 38, "ReTransmitPacketMaxSize" },
1181 { 39, "waitReTransmitPacketMaxSize" },
1182 { 40, "fastReTransCount" },
1183 { 41, "driverRetransmissions" },
1184 { 42, "txAckTimeoutMs" },
1185 { 43, "registrationInterrupts" },
1186 { 44, "hardwareType" },
1187 { 45, "radioType" },
1188 { 46, "writeEEPROM" },
1189 { 47, "writeRadioType" },
1190 { 48, "entry_exit_debug" },
1191 { 49, "debug" },
1192 { 50, "in_speed" },
1193 { 51, "out_speed" },
1194 { 52, "in_speed10" },
1195 { 53, "out_speed10" },
1196 { 54, "in_speed_max" },
1197 { 55, "out_speed_max" },
1198 { 56, "measure_rate" },
1199 { 57, "pre_Command_Wait" },
1200 { 58, "rx_tweak1" },
1201 { 59, "rx_tweak2" },
1202 { 60, "tx_queue_len" },
1203
1204 { 150, "arlan3-txRing" },
1205 { 151, "arlan3-rxRing" },
1206 { 152, "arlan3-18" },
1207 { 153, "arlan3-ring" },
1208 { 154, "arlan3-shm-cpy" },
1209 { 155, "config3" },
1210 { 156, "reset3" },
1211 {}
1212};
1213
1214static struct trans_ctl_table trans_arlan_table[] = {
1215 { 1, "arlan0", trans_arlan_conf_table0 },
1216 { 2, "arlan1", trans_arlan_conf_table1 },
1217 { 3, "arlan2", trans_arlan_conf_table2 },
1218 { 4, "arlan3", trans_arlan_conf_table3 },
1219 {}
1220};
1221
1222static struct trans_ctl_table trans_appldata_table[] = {
1223 { CTL_APPLDATA_TIMER, "timer" },
1224 { CTL_APPLDATA_INTERVAL, "interval" },
1225 { CTL_APPLDATA_OS, "os" },
1226 { CTL_APPLDATA_NET_SUM, "net_sum" },
1227 { CTL_APPLDATA_MEM, "mem" },
1228 {}
1229
1230};
1231
1232static struct trans_ctl_table trans_s390dbf_table[] = {
1233 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
1234 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
1235 {}
1236};
1237
1238static struct trans_ctl_table trans_sunrpc_table[] = {
1239 { CTL_RPCDEBUG, "rpc_debug" },
1240 { CTL_NFSDEBUG, "nfs_debug" },
1241 { CTL_NFSDDEBUG, "nfsd_debug" },
1242 { CTL_NLMDEBUG, "nlm_debug" },
1243 { CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
1244 { CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
1245 { CTL_MIN_RESVPORT, "min_resvport" },
1246 { CTL_MAX_RESVPORT, "max_resvport" },
1247 {}
1248};
1249
1250static struct trans_ctl_table trans_pm_table[] = {
1251 { 1 /* CTL_PM_SUSPEND */, "suspend" },
1252 { 2 /* CTL_PM_CMODE */, "cmode" },
1253 { 3 /* CTL_PM_P0 */, "p0" },
1254 { 4 /* CTL_PM_CM */, "cm" },
1255 {}
1256};
1257
1258static struct trans_ctl_table trans_frv_table[] = {
1259 { 1, "cache-mode" },
1260 { 2, "pin-cxnr" },
1261 {}
1262};
1263
1264static struct trans_ctl_table trans_root_table[] = {
1265 { CTL_KERN, "kernel", trans_kern_table },
1266 { CTL_VM, "vm", trans_vm_table },
1267 { CTL_NET, "net", trans_net_table },
1268 /* CTL_PROC not used */
1269 { CTL_FS, "fs", trans_fs_table },
1270 { CTL_DEBUG, "debug", trans_debug_table },
1271 { CTL_DEV, "dev", trans_dev_table },
1272 { CTL_BUS, "bus", trans_bus_table },
1273 { CTL_ABI, "abi" },
1274 /* CTL_CPU not used */
1275 { CTL_ARLAN, "arlan", trans_arlan_table },
1276 { CTL_APPLDATA, "appldata", trans_appldata_table },
1277 { CTL_S390DBF, "s390dbf", trans_s390dbf_table },
1278 { CTL_SUNRPC, "sunrpc", trans_sunrpc_table },
1279 { CTL_PM, "pm", trans_pm_table },
1280 { CTL_FRV, "frv", trans_frv_table },
1281 {}
1282};
1283
1284
1285
1286
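
Taken together, the tables above translate a binary sysctl name vector into its /proc/sys path; the helpers that follow perform the actual walk. A worked example as an illustrative comment (not part of the patch):

/*
 * The binary name { CTL_NET, NET_IPV4, NET_IPV4_TCP_SACK } resolves as
 *   trans_root_table:     CTL_NET           -> "net"      (child: trans_net_table)
 *   trans_net_table:      NET_IPV4          -> "ipv4"     (child: trans_net_ipv4_table)
 *   trans_net_ipv4_table: NET_IPV4_TCP_SACK -> "tcp_sack"
 * i.e. the procfs path /proc/sys/net/ipv4/tcp_sack.
 */
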
1287static int sysctl_depth(struct ctl_table *table)
1288{
1289 struct ctl_table *tmp;
1290 int depth;
1291
1292 depth = 0;
1293 for (tmp = table; tmp->parent; tmp = tmp->parent)
1294 depth++;
1295
1296 return depth;
1297}
1298
1299static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
1300{
1301 int i;
1302
1303 for (i = 0; table && i < n; i++)
1304 table = table->parent;
1305
1306 return table;
1307}
1308
1309static struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
1310{
1311 struct ctl_table *test;
1312 struct trans_ctl_table *ref;
1313 int depth, cur_depth;
1314
1315 depth = sysctl_depth(table);
1316
1317 cur_depth = depth;
1318 ref = trans_root_table;
1319repeat:
1320 test = sysctl_parent(table, cur_depth);
1321 for (; ref->ctl_name || ref->procname || ref->child; ref++) {
1322 int match = 0;
1323
1324 if (cur_depth && !ref->child)
1325 continue;
1326
1327 if (test->procname && ref->procname &&
1328 (strcmp(test->procname, ref->procname) == 0))
1329 match++;
1330
1331 if (test->ctl_name && ref->ctl_name &&
1332 (test->ctl_name == ref->ctl_name))
1333 match++;
1334
1335 if (!ref->ctl_name && !ref->procname)
1336 match++;
1337
1338 if (match) {
1339 if (cur_depth != 0) {
1340 cur_depth--;
1341 ref = ref->child;
1342 goto repeat;
1343 }
1344 goto out;
1345 }
1346 }
1347 ref = NULL;
1348out:
1349 return ref;
1350}
1351
1352static void sysctl_print_path(struct ctl_table *table)
1353{
1354 struct ctl_table *tmp;
1355 int depth, i;
1356 depth = sysctl_depth(table);
1357 if (table->procname) {
1358 for (i = depth; i >= 0; i--) {
1359 tmp = sysctl_parent(table, i);
1360 printk("/%s", tmp->procname?tmp->procname:"");
1361 }
1362 }
1363 printk(" ");
1364 if (table->ctl_name) {
1365 for (i = depth; i >= 0; i--) {
1366 tmp = sysctl_parent(table, i);
1367 printk(".%d", tmp->ctl_name);
1368 }
1369 }
1370}
1371
1372static void sysctl_repair_table(struct ctl_table *table)
1373{
1374 /* Don't complain about the classic default
1375 * sysctl strategy routine. Maybe later we
1376 * can get the tables fixed and complain about
1377 * this.
1378 */
1379 if (table->ctl_name && table->procname &&
1380 (table->proc_handler == proc_dointvec) &&
1381 (!table->strategy)) {
1382 table->strategy = sysctl_data;
1383 }
1384}
1385
1386static struct ctl_table *sysctl_check_lookup(struct ctl_table *table)
1387{
1388 struct ctl_table_header *head;
1389 struct ctl_table *ref, *test;
1390 int depth, cur_depth;
1391
1392 depth = sysctl_depth(table);
1393
1394 for (head = sysctl_head_next(NULL); head;
1395 head = sysctl_head_next(head)) {
1396 cur_depth = depth;
1397 ref = head->ctl_table;
1398repeat:
1399 test = sysctl_parent(table, cur_depth);
1400 for (; ref->ctl_name || ref->procname; ref++) {
1401 int match = 0;
1402 if (cur_depth && !ref->child)
1403 continue;
1404
1405 if (test->procname && ref->procname &&
1406 (strcmp(test->procname, ref->procname) == 0))
1407 match++;
1408
1409 if (test->ctl_name && ref->ctl_name &&
1410 (test->ctl_name == ref->ctl_name))
1411 match++;
1412
1413 if (match) {
1414 if (cur_depth != 0) {
1415 cur_depth--;
1416 ref = ref->child;
1417 goto repeat;
1418 }
1419 goto out;
1420 }
1421 }
1422 }
1423 ref = NULL;
1424out:
1425 sysctl_head_finish(head);
1426 return ref;
1427}
1428
1429static void set_fail(const char **fail, struct ctl_table *table, const char *str)
1430{
1431 if (*fail) {
1432 printk(KERN_ERR "sysctl table check failed: ");
1433 sysctl_print_path(table);
1434 printk(" %s\n", *fail);
1435 }
1436 *fail = str;
1437}
1438
1439static int sysctl_check_dir(struct ctl_table *table)
1440{
1441 struct ctl_table *ref;
1442 int error;
1443
1444 error = 0;
1445 ref = sysctl_check_lookup(table);
1446 if (ref) {
1447 int match = 0;
1448 if ((!table->procname && !ref->procname) ||
1449 (table->procname && ref->procname &&
1450 (strcmp(table->procname, ref->procname) == 0)))
1451 match++;
1452
1453 if ((!table->ctl_name && !ref->ctl_name) ||
1454 (table->ctl_name && ref->ctl_name &&
1455 (table->ctl_name == ref->ctl_name)))
1456 match++;
1457
1458 if (match != 2) {
1459 printk(KERN_ERR "%s: failed: ", __func__);
1460 sysctl_print_path(table);
1461 printk(" ref: ");
1462 sysctl_print_path(ref);
1463 printk("\n");
1464 error = -EINVAL;
1465 }
1466 }
1467 return error;
1468}
1469
1470static void sysctl_check_leaf(struct ctl_table *table, const char **fail)
1471{
1472 struct ctl_table *ref;
1473
1474 ref = sysctl_check_lookup(table);
1475 if (ref && (ref != table))
1476 set_fail(fail, table, "Sysctl already exists");
1477}
1478
1479static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
1480{
1481 struct trans_ctl_table *ref;
1482
1483 ref = sysctl_binary_lookup(table);
1484 if (table->ctl_name && !ref)
1485 set_fail(fail, table, "Unknown sysctl binary path");
1486 if (ref) {
1487 if (ref->procname &&
1488 (!table->procname ||
1489 (strcmp(table->procname, ref->procname) != 0)))
1490 set_fail(fail, table, "procname does not match binary path procname");
1491
1492 if (ref->ctl_name && table->ctl_name &&
1493 (table->ctl_name != ref->ctl_name))
1494 set_fail(fail, table, "ctl_name does not match binary path ctl_name");
1495 }
1496}
1497
1498int sysctl_check_table(struct ctl_table *table)
1499{
1500 int error = 0;
1501 for (; table->ctl_name || table->procname; table++) {
1502 const char *fail = NULL;
1503
1504 sysctl_repair_table(table);
1505 if (table->parent) {
1506 if (table->procname && !table->parent->procname)
1507 set_fail(&fail, table, "Parent without procname");
1508 if (table->ctl_name && !table->parent->ctl_name)
1509 set_fail(&fail, table, "Parent without ctl_name");
1510 }
1511 if (!table->procname)
1512 set_fail(&fail, table, "No procname");
1513 if (table->child) {
1514 if (table->data)
1515 set_fail(&fail, table, "Directory with data?");
1516 if (table->maxlen)
1517 set_fail(&fail, table, "Directory with maxlen?");
1518 if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
1519 set_fail(&fail, table, "Writable sysctl directory");
1520 if (table->proc_handler)
1521 set_fail(&fail, table, "Directory with proc_handler");
1522 if (table->strategy)
1523 set_fail(&fail, table, "Directory with strategy");
1524 if (table->extra1)
1525 set_fail(&fail, table, "Directory with extra1");
1526 if (table->extra2)
1527 set_fail(&fail, table, "Directory with extra2");
1528 if (sysctl_check_dir(table))
1529 set_fail(&fail, table, "Inconsistent directory names");
1530 } else {
1531 if ((table->strategy == sysctl_data) ||
1532 (table->strategy == sysctl_string) ||
1533 (table->strategy == sysctl_intvec) ||
1534 (table->strategy == sysctl_jiffies) ||
1535 (table->strategy == sysctl_ms_jiffies) ||
1536 (table->proc_handler == proc_dostring) ||
1537 (table->proc_handler == proc_dointvec) ||
1538#ifdef CONFIG_SECURITY_CAPABILITIES
1539 (table->proc_handler == proc_dointvec_bset) ||
1540#endif /* def CONFIG_SECURITY_CAPABILITIES */
1541 (table->proc_handler == proc_dointvec_minmax) ||
1542 (table->proc_handler == proc_dointvec_jiffies) ||
1543 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
1544 (table->proc_handler == proc_dointvec_ms_jiffies) ||
1545 (table->proc_handler == proc_doulongvec_minmax) ||
1546 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
1547 if (!table->data)
1548 set_fail(&fail, table, "No data");
1549 if (!table->maxlen)
1550 set_fail(&fail, table, "No maxlen");
1551 }
1552 if ((table->proc_handler == proc_doulongvec_minmax) ||
1553 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
1554 if (table->maxlen > sizeof (unsigned long)) {
1555 if (!table->extra1)
1556 set_fail(&fail, table, "No min");
1557 if (!table->extra2)
1558 set_fail(&fail, table, "No max");
1559 }
1560 }
1561#ifdef CONFIG_SYSCTL_SYSCALL
1562 if (table->ctl_name && !table->strategy)
1563 set_fail(&fail, table, "Missing strategy");
1564#endif
1565#if 0
1566 if (!table->ctl_name && table->strategy)
1567 set_fail(&fail, table, "Strategy without ctl_name");
1568#endif
1569#ifdef CONFIG_PROC_FS
1570 if (table->procname && !table->proc_handler)
1571 set_fail(&fail, table, "No proc_handler");
1572#endif
1573#if 0
1574 if (!table->procname && table->proc_handler)
1575 set_fail(&fail, table, "proc_handler without procname");
1576#endif
1577 sysctl_check_leaf(table, &fail);
1578 }
1579 sysctl_check_bin_path(table, &fail);
1580 if (fail) {
1581 set_fail(&fail, table, NULL);
1582 error = -EINVAL;
1583 }
1584 if (table->child)
1585 error |= sysctl_check_table(table->child);
1586 }
1587 return error;
1588}
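For reference, a minimal sketch of a table that satisfies the checks implemented above. It is illustrative only: the example_* names are made up, the field layout assumes the 2.6.24-era struct ctl_table this file operates on, and the leaf deliberately carries no binary sysctl number so sysctl_check_bin_path() has nothing to verify.

	#include <linux/sysctl.h>

	static int example_value;

	/* leaf: procname + data + maxlen + proc_handler, mode 0644 */
	static struct ctl_table example_child[] = {
		{
			.ctl_name	= CTL_UNNUMBERED,	/* no binary path */
			.procname	= "example_value",
			.data		= &example_value,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= &proc_dointvec,
		},
		{}
	};

	/* directory: only mode and child, read/execute permissions */
	static struct ctl_table example_root[] = {
		{
			.ctl_name	= CTL_KERN,
			.procname	= "kernel",
			.mode		= 0555,
			.child		= example_child,
		},
		{}
	};

Registering example_root with register_sysctl_table() would run sysctl_check_table() over both entries when the sysctl checking debug option is enabled; a directory carrying data, maxlen, handlers or write permission, or a leaf lacking data/maxlen for one of the standard handlers, would be reported through set_fail().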
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 059431ed67db..9f360f68aad6 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -20,9 +20,12 @@
20#include <linux/taskstats_kern.h> 20#include <linux/taskstats_kern.h>
21#include <linux/tsacct_kern.h> 21#include <linux/tsacct_kern.h>
22#include <linux/delayacct.h> 22#include <linux/delayacct.h>
23#include <linux/tsacct_kern.h>
24#include <linux/cpumask.h> 23#include <linux/cpumask.h>
25#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/cgroupstats.h>
26#include <linux/cgroup.h>
27#include <linux/fs.h>
28#include <linux/file.h>
26#include <net/genetlink.h> 29#include <net/genetlink.h>
27#include <asm/atomic.h> 30#include <asm/atomic.h>
28 31
@@ -50,6 +53,11 @@ __read_mostly = {
50 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
51 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
52 55
56static struct nla_policy
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59};
60
53struct listener { 61struct listener {
54 struct list_head list; 62 struct list_head list;
55 pid_t pid; 63 pid_t pid;
@@ -373,6 +381,51 @@ err:
373 return NULL; 381 return NULL;
374} 382}
375 383
384static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
385{
386 int rc = 0;
387 struct sk_buff *rep_skb;
388 struct cgroupstats *stats;
389 struct nlattr *na;
390 size_t size;
391 u32 fd;
392 struct file *file;
393 int fput_needed;
394
395 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
396 if (!na)
397 return -EINVAL;
398
399 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
400 file = fget_light(fd, &fput_needed);
401 if (file) {
402 size = nla_total_size(sizeof(struct cgroupstats));
403
404 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
405 size);
406 if (rc < 0)
407 goto err;
408
409 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
410 sizeof(struct cgroupstats));
411 stats = nla_data(na);
412 memset(stats, 0, sizeof(*stats));
413
414 rc = cgroupstats_build(stats, file->f_dentry);
415 if (rc < 0)
416 goto err;
417
418 fput_light(file, fput_needed);
419 return send_reply(rep_skb, info->snd_pid);
420 }
421
422err:
423 if (file)
424 fput_light(file, fput_needed);
425 nlmsg_free(rep_skb);
426 return rc;
427}
428
376static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 429static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
377{ 430{
378 int rc = 0; 431 int rc = 0;
@@ -523,6 +576,12 @@ static struct genl_ops taskstats_ops = {
523 .policy = taskstats_cmd_get_policy, 576 .policy = taskstats_cmd_get_policy,
524}; 577};
525 578
579static struct genl_ops cgroupstats_ops = {
580 .cmd = CGROUPSTATS_CMD_GET,
581 .doit = cgroupstats_user_cmd,
582 .policy = cgroupstats_cmd_get_policy,
583};
584
526/* Needed early in initialization */ 585/* Needed early in initialization */
527void __init taskstats_init_early(void) 586void __init taskstats_init_early(void)
528{ 587{
@@ -547,8 +606,15 @@ static int __init taskstats_init(void)
547 if (rc < 0) 606 if (rc < 0)
548 goto err; 607 goto err;
549 608
609 rc = genl_register_ops(&family, &cgroupstats_ops);
610 if (rc < 0)
611 goto err_cgroup_ops;
612
550 family_registered = 1; 613 family_registered = 1;
614 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
551 return 0; 615 return 0;
616err_cgroup_ops:
617 genl_unregister_ops(&family, &taskstats_ops);
552err: 618err:
553 genl_unregister_family(&family); 619 genl_unregister_family(&family);
554 return rc; 620 return rc;
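On the userspace side, the new CGROUPSTATS_CMD_GET command is answered with a CGROUPSTATS_CMD_NEW message carrying a single CGROUPSTATS_TYPE_CGROUP_STATS attribute that wraps struct cgroupstats. A rough sketch of parsing such a reply, assuming the <linux/cgroupstats.h> header added by this series is installed and that resolving the genetlink family id and sending the request (both omitted here) have already been done:

	#include <stdio.h>
	#include <linux/netlink.h>
	#include <linux/genetlink.h>
	#include <linux/cgroupstats.h>

	/* nlh points at a complete CGROUPSTATS_CMD_NEW reply */
	static void print_cgroupstats(struct nlmsghdr *nlh)
	{
		/* attributes start right after the genetlink header */
		struct nlattr *na = (struct nlattr *)
			((char *)NLMSG_DATA(nlh) + GENL_HDRLEN);
		struct cgroupstats *cs;

		if (na->nla_type != CGROUPSTATS_TYPE_CGROUP_STATS)
			return;

		cs = (struct cgroupstats *)((char *)na + NLA_HDRLEN);
		printf("sleeping %llu running %llu stopped %llu "
		       "uninterruptible %llu io_wait %llu\n",
		       (unsigned long long)cs->nr_sleeping,
		       (unsigned long long)cs->nr_running,
		       (unsigned long long)cs->nr_stopped,
		       (unsigned long long)cs->nr_uninterruptible,
		       (unsigned long long)cs->nr_io_wait);
	}

The fd passed in CGROUPSTATS_CMD_ATTR_FD is expected to be an open descriptor for a cgroup directory, which is why cgroupstats_user_cmd() above resolves it with fget_light() and hands file->f_dentry to cgroupstats_build().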
diff --git a/kernel/time.c b/kernel/time.c
index 2289a8d68314..09d3c45c4da7 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -9,9 +9,9 @@
9 */ 9 */
10/* 10/*
11 * Modification history kernel/time.c 11 * Modification history kernel/time.c
12 * 12 *
13 * 1993-09-02 Philip Gladstone 13 * 1993-09-02 Philip Gladstone
14 * Created file with time related functions from sched.c and adjtimex() 14 * Created file with time related functions from sched.c and adjtimex()
15 * 1993-10-08 Torsten Duwe 15 * 1993-10-08 Torsten Duwe
16 * adjtime interface update and CMOS clock write code 16 * adjtime interface update and CMOS clock write code
17 * 1995-08-13 Torsten Duwe 17 * 1995-08-13 Torsten Duwe
@@ -30,16 +30,16 @@
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/clocksource.h>
33#include <linux/errno.h> 34#include <linux/errno.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
35#include <linux/security.h> 36#include <linux/security.h>
36#include <linux/fs.h> 37#include <linux/fs.h>
37#include <linux/module.h>
38 38
39#include <asm/uaccess.h> 39#include <asm/uaccess.h>
40#include <asm/unistd.h> 40#include <asm/unistd.h>
41 41
42/* 42/*
43 * The timezone where the local system is located. Used as a default by some 43 * The timezone where the local system is located. Used as a default by some
44 * programs who obtain this value by using gettimeofday. 44 * programs who obtain this value by using gettimeofday.
45 */ 45 */
@@ -57,11 +57,7 @@ EXPORT_SYMBOL(sys_tz);
57 */ 57 */
58asmlinkage long sys_time(time_t __user * tloc) 58asmlinkage long sys_time(time_t __user * tloc)
59{ 59{
60 time_t i; 60 time_t i = get_seconds();
61 struct timespec tv;
62
63 getnstimeofday(&tv);
64 i = tv.tv_sec;
65 61
66 if (tloc) { 62 if (tloc) {
67 if (put_user(i,tloc)) 63 if (put_user(i,tloc))
@@ -76,7 +72,7 @@ asmlinkage long sys_time(time_t __user * tloc)
76 * why not move it into the appropriate arch directory (for those 72 * why not move it into the appropriate arch directory (for those
77 * architectures that need it). 73 * architectures that need it).
78 */ 74 */
79 75
80asmlinkage long sys_stime(time_t __user *tptr) 76asmlinkage long sys_stime(time_t __user *tptr)
81{ 77{
82 struct timespec tv; 78 struct timespec tv;
@@ -115,10 +111,10 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us
115/* 111/*
116 * Adjust the time obtained from the CMOS to be UTC time instead of 112 * Adjust the time obtained from the CMOS to be UTC time instead of
117 * local time. 113 * local time.
118 * 114 *
119 * This is ugly, but preferable to the alternatives. Otherwise we 115 * This is ugly, but preferable to the alternatives. Otherwise we
120 * would either need to write a program to do it in /etc/rc (and risk 116 * would either need to write a program to do it in /etc/rc (and risk
121 * confusion if the program gets run more than once; it would also be 117 * confusion if the program gets run more than once; it would also be
122 * hard to make the program warp the clock precisely n hours) or 118 * hard to make the program warp the clock precisely n hours) or
123 * compile in the timezone information into the kernel. Bad, bad.... 119 * compile in the timezone information into the kernel. Bad, bad....
124 * 120 *
@@ -163,6 +159,7 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
163 if (tz) { 159 if (tz) {
164 /* SMP safe, global irq locking makes it work. */ 160 /* SMP safe, global irq locking makes it work. */
165 sys_tz = *tz; 161 sys_tz = *tz;
162 update_vsyscall_tz();
166 if (firsttime) { 163 if (firsttime) {
167 firsttime = 0; 164 firsttime = 0;
168 if (!tv) 165 if (!tv)
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f66351126544..8d53106a0a92 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -23,3 +23,8 @@ config HIGH_RES_TIMERS
23 hardware is not capable then this option only increases 23 hardware is not capable then this option only increases
24 the size of the kernel image. 24 the size of the kernel image.
25 25
26config GENERIC_CLOCKEVENTS_BUILD
27 bool
28 default y
29 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
30
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 99b6034fc86b..905b0b50792d 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,6 +1,6 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o 6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 41dd3105ce7f..822beebe664a 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -194,6 +194,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
194 local_irq_restore(flags); 194 local_irq_restore(flags);
195} 195}
196 196
197#ifdef CONFIG_GENERIC_CLOCKEVENTS
197/** 198/**
198 * clockevents_notify - notification about relevant events 199 * clockevents_notify - notification about relevant events
199 */ 200 */
@@ -222,4 +223,4 @@ void clockevents_notify(unsigned long reason, void *arg)
222 spin_unlock(&clockevents_lock); 223 spin_unlock(&clockevents_lock);
223} 224}
224EXPORT_SYMBOL_GPL(clockevents_notify); 225EXPORT_SYMBOL_GPL(clockevents_notify);
225 226#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 51b6a6a6158c..c8a9d13874df 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -207,15 +207,12 @@ static inline void clocksource_resume_watchdog(void) { }
207 */ 207 */
208void clocksource_resume(void) 208void clocksource_resume(void)
209{ 209{
210 struct list_head *tmp; 210 struct clocksource *cs;
211 unsigned long flags; 211 unsigned long flags;
212 212
213 spin_lock_irqsave(&clocksource_lock, flags); 213 spin_lock_irqsave(&clocksource_lock, flags);
214 214
215 list_for_each(tmp, &clocksource_list) { 215 list_for_each_entry(cs, &clocksource_list, list) {
216 struct clocksource *cs;
217
218 cs = list_entry(tmp, struct clocksource, list);
219 if (cs->resume) 216 if (cs->resume)
220 cs->resume(); 217 cs->resume();
221 } 218 }
@@ -369,7 +366,6 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
369 const char *buf, size_t count) 366 const char *buf, size_t count)
370{ 367{
371 struct clocksource *ovr = NULL; 368 struct clocksource *ovr = NULL;
372 struct list_head *tmp;
373 size_t ret = count; 369 size_t ret = count;
374 int len; 370 int len;
375 371
@@ -389,12 +385,11 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
389 385
390 len = strlen(override_name); 386 len = strlen(override_name);
391 if (len) { 387 if (len) {
388 struct clocksource *cs;
389
392 ovr = clocksource_override; 390 ovr = clocksource_override;
393 /* try to select it: */ 391 /* try to select it: */
394 list_for_each(tmp, &clocksource_list) { 392 list_for_each_entry(cs, &clocksource_list, list) {
395 struct clocksource *cs;
396
397 cs = list_entry(tmp, struct clocksource, list);
398 if (strlen(cs->name) == len && 393 if (strlen(cs->name) == len &&
399 !strcmp(cs->name, override_name)) 394 !strcmp(cs->name, override_name))
400 ovr = cs; 395 ovr = cs;
@@ -422,14 +417,11 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
422static ssize_t 417static ssize_t
423sysfs_show_available_clocksources(struct sys_device *dev, char *buf) 418sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
424{ 419{
425 struct list_head *tmp; 420 struct clocksource *src;
426 char *curr = buf; 421 char *curr = buf;
427 422
428 spin_lock_irq(&clocksource_lock); 423 spin_lock_irq(&clocksource_lock);
429 list_for_each(tmp, &clocksource_list) { 424 list_for_each_entry(src, &clocksource_list, list) {
430 struct clocksource *src;
431
432 src = list_entry(tmp, struct clocksource, list);
433 curr += sprintf(curr, "%s ", src->name); 425 curr += sprintf(curr, "%s ", src->name);
434 } 426 }
435 spin_unlock_irq(&clocksource_lock); 427 spin_unlock_irq(&clocksource_lock);
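The conversions above replace open-coded list_entry() walks with list_for_each_entry(). For reference, a sketch of the open-coded equivalent of the clocksource_resume() loop (the real macro additionally prefetches the next element); the .list and .resume members are the same ones used above:

	#include <linux/clocksource.h>
	#include <linux/list.h>

	static void resume_all(struct list_head *head)
	{
		struct clocksource *cs;

		/* what list_for_each_entry(cs, head, list) boils down to */
		for (cs = list_entry(head->next, struct clocksource, list);
		     &cs->list != head;
		     cs = list_entry(cs->list.next, struct clocksource, list)) {
			if (cs->resume)
				cs->resume();
		}
	}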
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 0962e0577660..8cfb8b2ce773 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -64,8 +64,9 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
64 */ 64 */
65int tick_check_broadcast_device(struct clock_event_device *dev) 65int tick_check_broadcast_device(struct clock_event_device *dev)
66{ 66{
67 if (tick_broadcast_device.evtdev || 67 if ((tick_broadcast_device.evtdev &&
68 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 68 tick_broadcast_device.evtdev->rating >= dev->rating) ||
69 (dev->features & CLOCK_EVT_FEAT_C3STOP))
69 return 0; 70 return 0;
70 71
71 clockevents_exchange_device(NULL, dev); 72 clockevents_exchange_device(NULL, dev);
@@ -176,8 +177,6 @@ static void tick_do_periodic_broadcast(void)
176 */ 177 */
177static void tick_handle_periodic_broadcast(struct clock_event_device *dev) 178static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
178{ 179{
179 dev->next_event.tv64 = KTIME_MAX;
180
181 tick_do_periodic_broadcast(); 180 tick_do_periodic_broadcast();
182 181
183 /* 182 /*
@@ -218,26 +217,33 @@ static void tick_do_broadcast_on_off(void *why)
218 bc = tick_broadcast_device.evtdev; 217 bc = tick_broadcast_device.evtdev;
219 218
220 /* 219 /*
221 * Is the device in broadcast mode forever or is it not 220 * Is the device not affected by the powerstate ?
222 * affected by the powerstate ?
223 */ 221 */
224 if (!dev || !tick_device_is_functional(dev) || 222 if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
225 !(dev->features & CLOCK_EVT_FEAT_C3STOP)) 223 goto out;
224
225 if (!tick_device_is_functional(dev))
226 goto out; 226 goto out;
227 227
228 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) { 228 switch (*reason) {
229 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
230 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
229 if (!cpu_isset(cpu, tick_broadcast_mask)) { 231 if (!cpu_isset(cpu, tick_broadcast_mask)) {
230 cpu_set(cpu, tick_broadcast_mask); 232 cpu_set(cpu, tick_broadcast_mask);
231 if (td->mode == TICKDEV_MODE_PERIODIC) 233 if (td->mode == TICKDEV_MODE_PERIODIC)
232 clockevents_set_mode(dev, 234 clockevents_set_mode(dev,
233 CLOCK_EVT_MODE_SHUTDOWN); 235 CLOCK_EVT_MODE_SHUTDOWN);
234 } 236 }
235 } else { 237 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
238 dev->features |= CLOCK_EVT_FEAT_DUMMY;
239 break;
240 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
236 if (cpu_isset(cpu, tick_broadcast_mask)) { 241 if (cpu_isset(cpu, tick_broadcast_mask)) {
237 cpu_clear(cpu, tick_broadcast_mask); 242 cpu_clear(cpu, tick_broadcast_mask);
238 if (td->mode == TICKDEV_MODE_PERIODIC) 243 if (td->mode == TICKDEV_MODE_PERIODIC)
239 tick_setup_periodic(dev, 0); 244 tick_setup_periodic(dev, 0);
240 } 245 }
246 break;
241 } 247 }
242 248
243 if (cpus_empty(tick_broadcast_mask)) 249 if (cpus_empty(tick_broadcast_mask))
@@ -258,21 +264,12 @@ out:
258 */ 264 */
259void tick_broadcast_on_off(unsigned long reason, int *oncpu) 265void tick_broadcast_on_off(unsigned long reason, int *oncpu)
260{ 266{
261 int cpu = get_cpu(); 267 if (!cpu_isset(*oncpu, cpu_online_map))
262
263 if (!cpu_isset(*oncpu, cpu_online_map)) {
264 printk(KERN_ERR "tick-braodcast: ignoring broadcast for " 268 printk(KERN_ERR "tick-braodcast: ignoring broadcast for "
265 "offline CPU #%d\n", *oncpu); 269 "offline CPU #%d\n", *oncpu);
266 } else { 270 else
267 271 smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
268 if (cpu == *oncpu) 272 &reason, 1, 1);
269 tick_do_broadcast_on_off(&reason);
270 else
271 smp_call_function_single(*oncpu,
272 tick_do_broadcast_on_off,
273 &reason, 1, 1);
274 }
275 put_cpu();
276} 273}
277 274
278/* 275/*
@@ -515,11 +512,9 @@ static void tick_broadcast_clear_oneshot(int cpu)
515 */ 512 */
516void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 513void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
517{ 514{
518 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { 515 bc->event_handler = tick_handle_oneshot_broadcast;
519 bc->event_handler = tick_handle_oneshot_broadcast; 516 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
520 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 517 bc->next_event.tv64 = KTIME_MAX;
521 bc->next_event.tv64 = KTIME_MAX;
522 }
523} 518}
524 519
525/* 520/*
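A new CLOCK_EVT_NOTIFY_BROADCAST_FORCE case is handled above alongside BROADCAST_ON, additionally tagging the local device CLOCK_EVT_FEAT_DUMMY. A hedged sketch of how a caller that knows its per-CPU tick device stops in deep C-states might push itself onto the broadcast device through the notifier path shown in clockevents.c and tick-common.c (the actual arch call sites are not part of these hunks):

	#include <linux/clockchips.h>
	#include <linux/smp.h>

	static void force_local_tick_to_broadcast(void)
	{
		int cpu = get_cpu();

		/* routed via tick_notify() -> tick_broadcast_on_off() */
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu);
		put_cpu();
	}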
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 77a21abc8716..1bea399a9ef0 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -200,7 +200,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
200 200
201 cpu = smp_processor_id(); 201 cpu = smp_processor_id();
202 if (!cpu_isset(cpu, newdev->cpumask)) 202 if (!cpu_isset(cpu, newdev->cpumask))
203 goto out; 203 goto out_bc;
204 204
205 td = &per_cpu(tick_cpu_device, cpu); 205 td = &per_cpu(tick_cpu_device, cpu);
206 curdev = td->evtdev; 206 curdev = td->evtdev;
@@ -265,7 +265,7 @@ out_bc:
265 */ 265 */
266 if (tick_check_broadcast_device(newdev)) 266 if (tick_check_broadcast_device(newdev))
267 ret = NOTIFY_STOP; 267 ret = NOTIFY_STOP;
268out: 268
269 spin_unlock_irqrestore(&tick_device_lock, flags); 269 spin_unlock_irqrestore(&tick_device_lock, flags);
270 270
271 return ret; 271 return ret;
@@ -345,6 +345,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
345 345
346 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 346 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
347 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 347 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
348 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
348 tick_broadcast_on_off(reason, dev); 349 tick_broadcast_on_off(reason, dev);
349 break; 350 break;
350 351
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 637519af6151..10a1347597fd 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -586,7 +586,7 @@ void tick_setup_sched_timer(void)
586 /* Get the next period (per cpu) */ 586 /* Get the next period (per cpu) */
587 ts->sched_timer.expires = tick_init_jiffy_update(); 587 ts->sched_timer.expires = tick_init_jiffy_update();
588 offset = ktime_to_ns(tick_period) >> 1; 588 offset = ktime_to_ns(tick_period) >> 1;
589 do_div(offset, NR_CPUS); 589 do_div(offset, num_possible_cpus());
590 offset *= smp_processor_id(); 590 offset *= smp_processor_id();
591 ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); 591 ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset);
592 592
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4ad79f6bdec6..e5e466b27598 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -24,9 +24,7 @@
24 * This read-write spinlock protects us from races in SMP while 24 * This read-write spinlock protects us from races in SMP while
25 * playing with xtime and avenrun. 25 * playing with xtime and avenrun.
26 */ 26 */
27__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); 27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
28
29EXPORT_SYMBOL(xtime_lock);
30 28
31 29
32/* 30/*
@@ -47,21 +45,13 @@ EXPORT_SYMBOL(xtime_lock);
47struct timespec xtime __attribute__ ((aligned (16))); 45struct timespec xtime __attribute__ ((aligned (16)));
48struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 46struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
49static unsigned long total_sleep_time; /* seconds */ 47static unsigned long total_sleep_time; /* seconds */
50EXPORT_SYMBOL(xtime);
51
52 48
53#ifdef CONFIG_NO_HZ
54static struct timespec xtime_cache __attribute__ ((aligned (16))); 49static struct timespec xtime_cache __attribute__ ((aligned (16)));
55static inline void update_xtime_cache(u64 nsec) 50static inline void update_xtime_cache(u64 nsec)
56{ 51{
57 xtime_cache = xtime; 52 xtime_cache = xtime;
58 timespec_add_ns(&xtime_cache, nsec); 53 timespec_add_ns(&xtime_cache, nsec);
59} 54}
60#else
61#define xtime_cache xtime
62/* We do *not* want to evaluate the argument for this case */
63#define update_xtime_cache(n) do { } while (0)
64#endif
65 55
66static struct clocksource *clock; /* pointer to current clocksource */ 56static struct clocksource *clock; /* pointer to current clocksource */
67 57
diff --git a/kernel/timer.c b/kernel/timer.c
index 6ce1952eea7d..fb4e67d5dd60 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -26,6 +26,7 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/swap.h> 28#include <linux/swap.h>
29#include <linux/pid_namespace.h>
29#include <linux/notifier.h> 30#include <linux/notifier.h>
30#include <linux/thread_info.h> 31#include <linux/thread_info.h>
31#include <linux/time.h> 32#include <linux/time.h>
@@ -817,7 +818,7 @@ unsigned long next_timer_interrupt(void)
817#endif 818#endif
818 819
819/* 820/*
820 * Called from the timer interrupt handler to charge one tick to the current 821 * Called from the timer interrupt handler to charge one tick to the current
821 * process. user_tick is 1 if the tick is user time, 0 for system. 822 * process. user_tick is 1 if the tick is user time, 0 for system.
822 */ 823 */
823void update_process_times(int user_tick) 824void update_process_times(int user_tick)
@@ -826,10 +827,13 @@ void update_process_times(int user_tick)
826 int cpu = smp_processor_id(); 827 int cpu = smp_processor_id();
827 828
828 /* Note: this timer irq context must be accounted for as well. */ 829 /* Note: this timer irq context must be accounted for as well. */
829 if (user_tick) 830 if (user_tick) {
830 account_user_time(p, jiffies_to_cputime(1)); 831 account_user_time(p, jiffies_to_cputime(1));
831 else 832 account_user_time_scaled(p, jiffies_to_cputime(1));
833 } else {
832 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); 834 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
835 account_system_time_scaled(p, jiffies_to_cputime(1));
836 }
833 run_local_timers(); 837 run_local_timers();
834 if (rcu_pending(cpu)) 838 if (rcu_pending(cpu))
835 rcu_check_callbacks(cpu, user_tick); 839 rcu_check_callbacks(cpu, user_tick);
@@ -953,7 +957,7 @@ asmlinkage unsigned long sys_alarm(unsigned int seconds)
953 */ 957 */
954asmlinkage long sys_getpid(void) 958asmlinkage long sys_getpid(void)
955{ 959{
956 return current->tgid; 960 return task_tgid_vnr(current);
957} 961}
958 962
959/* 963/*
@@ -967,7 +971,7 @@ asmlinkage long sys_getppid(void)
967 int pid; 971 int pid;
968 972
969 rcu_read_lock(); 973 rcu_read_lock();
970 pid = rcu_dereference(current->real_parent)->tgid; 974 pid = task_ppid_nr_ns(current, current->nsproxy->pid_ns);
971 rcu_read_unlock(); 975 rcu_read_unlock();
972 976
973 return pid; 977 return pid;
@@ -1099,7 +1103,7 @@ EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1099/* Thread ID - the internal kernel "pid" */ 1103/* Thread ID - the internal kernel "pid" */
1100asmlinkage long sys_gettid(void) 1104asmlinkage long sys_gettid(void)
1101{ 1105{
1102 return current->pid; 1106 return task_pid_vnr(current);
1103} 1107}
1104 1108
1105/** 1109/**
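sys_getpid(), sys_getppid() and sys_gettid() above now report ids relative to the caller's pid namespace via the task_*_vnr()/task_ppid_nr_ns() helpers instead of reading ->pid/->tgid directly. A small sketch of the distinction, using helpers present in this tree (the printed wording is illustrative):

	#include <linux/kernel.h>
	#include <linux/sched.h>

	static void show_current_pid_views(void)
	{
		/* *_nr(): id in the initial namespace; *_vnr(): id as the
		 * task itself observes it, which is what the syscalls
		 * above now return. */
		printk(KERN_INFO "%s: pid %d (virtual %d), tgid %d (virtual %d)\n",
		       current->comm,
		       task_pid_nr(current), task_pid_vnr(current),
		       task_tgid_nr(current), task_tgid_vnr(current));
	}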
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index c122131a122f..4ab1b584961b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -62,6 +62,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
62 rcu_read_unlock(); 62 rcu_read_unlock();
63 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; 63 stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
64 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; 64 stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
65 stats->ac_utimescaled =
66 cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC;
67 stats->ac_stimescaled =
68 cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
65 stats->ac_minflt = tsk->min_flt; 69 stats->ac_minflt = tsk->min_flt;
66 stats->ac_majflt = tsk->maj_flt; 70 stats->ac_majflt = tsk->maj_flt;
67 71
diff --git a/kernel/user.c b/kernel/user.c
index 9ca2848fc356..e91331c457e2 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -44,34 +44,36 @@ struct user_struct root_user = {
44 .processes = ATOMIC_INIT(1), 44 .processes = ATOMIC_INIT(1),
45 .files = ATOMIC_INIT(0), 45 .files = ATOMIC_INIT(0),
46 .sigpending = ATOMIC_INIT(0), 46 .sigpending = ATOMIC_INIT(0),
47 .mq_bytes = 0,
48 .locked_shm = 0, 47 .locked_shm = 0,
49#ifdef CONFIG_KEYS 48#ifdef CONFIG_KEYS
50 .uid_keyring = &root_user_keyring, 49 .uid_keyring = &root_user_keyring,
51 .session_keyring = &root_session_keyring, 50 .session_keyring = &root_session_keyring,
52#endif 51#endif
52#ifdef CONFIG_FAIR_USER_SCHED
53 .tg = &init_task_group,
54#endif
53}; 55};
54 56
55/* 57/*
56 * These routines must be called with the uidhash spinlock held! 58 * These routines must be called with the uidhash spinlock held!
57 */ 59 */
58static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) 60static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
59{ 61{
60 hlist_add_head(&up->uidhash_node, hashent); 62 hlist_add_head(&up->uidhash_node, hashent);
61} 63}
62 64
63static inline void uid_hash_remove(struct user_struct *up) 65static void uid_hash_remove(struct user_struct *up)
64{ 66{
65 hlist_del_init(&up->uidhash_node); 67 hlist_del_init(&up->uidhash_node);
66} 68}
67 69
68static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 70static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
69{ 71{
70 struct user_struct *user; 72 struct user_struct *user;
71 struct hlist_node *h; 73 struct hlist_node *h;
72 74
73 hlist_for_each_entry(user, h, hashent, uidhash_node) { 75 hlist_for_each_entry(user, h, hashent, uidhash_node) {
74 if(user->uid == uid) { 76 if (user->uid == uid) {
75 atomic_inc(&user->__count); 77 atomic_inc(&user->__count);
76 return user; 78 return user;
77 } 79 }
@@ -80,6 +82,210 @@ static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *ha
80 return NULL; 82 return NULL;
81} 83}
82 84
85#ifdef CONFIG_FAIR_USER_SCHED
86
87static void sched_destroy_user(struct user_struct *up)
88{
89 sched_destroy_group(up->tg);
90}
91
92static int sched_create_user(struct user_struct *up)
93{
94 int rc = 0;
95
96 up->tg = sched_create_group();
97 if (IS_ERR(up->tg))
98 rc = -ENOMEM;
99
100 return rc;
101}
102
103static void sched_switch_user(struct task_struct *p)
104{
105 sched_move_task(p);
106}
107
108#else /* CONFIG_FAIR_USER_SCHED */
109
110static void sched_destroy_user(struct user_struct *up) { }
111static int sched_create_user(struct user_struct *up) { return 0; }
112static void sched_switch_user(struct task_struct *p) { }
113
114#endif /* CONFIG_FAIR_USER_SCHED */
115
116#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS)
117
118static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
119static DEFINE_MUTEX(uids_mutex);
120
121static inline void uids_mutex_lock(void)
122{
123 mutex_lock(&uids_mutex);
124}
125
126static inline void uids_mutex_unlock(void)
127{
128 mutex_unlock(&uids_mutex);
129}
130
131/* return cpu shares held by the user */
132ssize_t cpu_shares_show(struct kset *kset, char *buffer)
133{
134 struct user_struct *up = container_of(kset, struct user_struct, kset);
135
136 return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
137}
138
139/* modify cpu shares held by the user */
140ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
141{
142 struct user_struct *up = container_of(kset, struct user_struct, kset);
143 unsigned long shares;
144 int rc;
145
146 sscanf(buffer, "%lu", &shares);
147
148 rc = sched_group_set_shares(up->tg, shares);
149
150 return (rc ? rc : size);
151}
152
153static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
154{
155 sa->attr.name = name;
156 sa->attr.mode = mode;
157 sa->show = cpu_shares_show;
158 sa->store = cpu_shares_store;
159}
160
161/* Create "/sys/kernel/uids/<uid>" directory and
162 * "/sys/kernel/uids/<uid>/cpu_share" file for this user.
163 */
164static int user_kobject_create(struct user_struct *up)
165{
166 struct kset *kset = &up->kset;
167 struct kobject *kobj = &kset->kobj;
168 int error;
169
170 memset(kset, 0, sizeof(struct kset));
171 kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */
172 kobject_set_name(kobj, "%d", up->uid);
173 kset_init(kset);
174 user_attr_init(&up->user_attr, "cpu_share", 0644);
175
176 error = kobject_add(kobj);
177 if (error)
178 goto done;
179
180 error = sysfs_create_file(kobj, &up->user_attr.attr);
181 if (error)
182 kobject_del(kobj);
183
184 kobject_uevent(kobj, KOBJ_ADD);
185
186done:
187 return error;
188}
189
190/* create these in sysfs filesystem:
191 * "/sys/kernel/uids" directory
192 * "/sys/kernel/uids/0" directory (for root user)
193 * "/sys/kernel/uids/0/cpu_share" file (for root user)
194 */
195int __init uids_kobject_init(void)
196{
197 int error;
198
199 /* create under /sys/kernel dir */
200 uids_kobject.parent = &kernel_subsys.kobj;
201 uids_kobject.kset = &kernel_subsys;
202 kobject_set_name(&uids_kobject, "uids");
203 kobject_init(&uids_kobject);
204
205 error = kobject_add(&uids_kobject);
206 if (!error)
207 error = user_kobject_create(&root_user);
208
209 return error;
210}
211
212/* work function to remove sysfs directory for a user and free up
213 * corresponding structures.
214 */
215static void remove_user_sysfs_dir(struct work_struct *w)
216{
217 struct user_struct *up = container_of(w, struct user_struct, work);
218 struct kobject *kobj = &up->kset.kobj;
219 unsigned long flags;
220 int remove_user = 0;
221
222 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
223 * atomic.
224 */
225 uids_mutex_lock();
226
227 local_irq_save(flags);
228
229 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
230 uid_hash_remove(up);
231 remove_user = 1;
232 spin_unlock_irqrestore(&uidhash_lock, flags);
233 } else {
234 local_irq_restore(flags);
235 }
236
237 if (!remove_user)
238 goto done;
239
240 sysfs_remove_file(kobj, &up->user_attr.attr);
241 kobject_uevent(kobj, KOBJ_REMOVE);
242 kobject_del(kobj);
243
244 sched_destroy_user(up);
245 key_put(up->uid_keyring);
246 key_put(up->session_keyring);
247 kmem_cache_free(uid_cachep, up);
248
249done:
250 uids_mutex_unlock();
251}
252
253/* IRQs are disabled and uidhash_lock is held upon function entry.
254 * IRQ state (as stored in flags) is restored and uidhash_lock released
255 * upon function exit.
256 */
257static inline void free_user(struct user_struct *up, unsigned long flags)
258{
259 /* restore back the count */
260 atomic_inc(&up->__count);
261 spin_unlock_irqrestore(&uidhash_lock, flags);
262
263 INIT_WORK(&up->work, remove_user_sysfs_dir);
264 schedule_work(&up->work);
265}
266
267#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */
268
269static inline int user_kobject_create(struct user_struct *up) { return 0; }
270static inline void uids_mutex_lock(void) { }
271static inline void uids_mutex_unlock(void) { }
272
273/* IRQs are disabled and uidhash_lock is held upon function entry.
274 * IRQ state (as stored in flags) is restored and uidhash_lock released
275 * upon function exit.
276 */
277static inline void free_user(struct user_struct *up, unsigned long flags)
278{
279 uid_hash_remove(up);
280 spin_unlock_irqrestore(&uidhash_lock, flags);
281 sched_destroy_user(up);
282 key_put(up->uid_keyring);
283 key_put(up->session_keyring);
284 kmem_cache_free(uid_cachep, up);
285}
286
287#endif
288
83/* 289/*
84 * Locate the user_struct for the passed UID. If found, take a ref on it. The 290 * Locate the user_struct for the passed UID. If found, take a ref on it. The
85 * caller must undo that ref with free_uid(). 291 * caller must undo that ref with free_uid().
@@ -106,15 +312,10 @@ void free_uid(struct user_struct *up)
106 return; 312 return;
107 313
108 local_irq_save(flags); 314 local_irq_save(flags);
109 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { 315 if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
110 uid_hash_remove(up); 316 free_user(up, flags);
111 spin_unlock_irqrestore(&uidhash_lock, flags); 317 else
112 key_put(up->uid_keyring);
113 key_put(up->session_keyring);
114 kmem_cache_free(uid_cachep, up);
115 } else {
116 local_irq_restore(flags); 318 local_irq_restore(flags);
117 }
118} 319}
119 320
120struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 321struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
@@ -122,6 +323,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
122 struct hlist_head *hashent = uidhashentry(ns, uid); 323 struct hlist_head *hashent = uidhashentry(ns, uid);
123 struct user_struct *up; 324 struct user_struct *up;
124 325
326 /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert()
327 * atomic.
328 */
329 uids_mutex_lock();
330
125 spin_lock_irq(&uidhash_lock); 331 spin_lock_irq(&uidhash_lock);
126 up = uid_hash_find(uid, hashent); 332 up = uid_hash_find(uid, hashent);
127 spin_unlock_irq(&uidhash_lock); 333 spin_unlock_irq(&uidhash_lock);
@@ -141,8 +347,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
141 atomic_set(&new->inotify_watches, 0); 347 atomic_set(&new->inotify_watches, 0);
142 atomic_set(&new->inotify_devs, 0); 348 atomic_set(&new->inotify_devs, 0);
143#endif 349#endif
144 350#ifdef CONFIG_POSIX_MQUEUE
145 new->mq_bytes = 0; 351 new->mq_bytes = 0;
352#endif
146 new->locked_shm = 0; 353 new->locked_shm = 0;
147 354
148 if (alloc_uid_keyring(new, current) < 0) { 355 if (alloc_uid_keyring(new, current) < 0) {
@@ -150,6 +357,22 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
150 return NULL; 357 return NULL;
151 } 358 }
152 359
360 if (sched_create_user(new) < 0) {
361 key_put(new->uid_keyring);
362 key_put(new->session_keyring);
363 kmem_cache_free(uid_cachep, new);
364 return NULL;
365 }
366
367 if (user_kobject_create(new)) {
368 sched_destroy_user(new);
369 key_put(new->uid_keyring);
370 key_put(new->session_keyring);
371 kmem_cache_free(uid_cachep, new);
372 uids_mutex_unlock();
373 return NULL;
374 }
375
153 /* 376 /*
154 * Before adding this, check whether we raced 377 * Before adding this, check whether we raced
155 * on adding the same user already.. 378 * on adding the same user already..
@@ -157,6 +380,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
157 spin_lock_irq(&uidhash_lock); 380 spin_lock_irq(&uidhash_lock);
158 up = uid_hash_find(uid, hashent); 381 up = uid_hash_find(uid, hashent);
159 if (up) { 382 if (up) {
383 /* This case is not possible when CONFIG_FAIR_USER_SCHED
384 * is defined, since we serialize alloc_uid() using
385 * uids_mutex. Hence no need to call
386 * sched_destroy_user() or remove_user_sysfs_dir().
387 */
160 key_put(new->uid_keyring); 388 key_put(new->uid_keyring);
161 key_put(new->session_keyring); 389 key_put(new->session_keyring);
162 kmem_cache_free(uid_cachep, new); 390 kmem_cache_free(uid_cachep, new);
@@ -167,6 +395,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
167 spin_unlock_irq(&uidhash_lock); 395 spin_unlock_irq(&uidhash_lock);
168 396
169 } 397 }
398
399 uids_mutex_unlock();
400
170 return up; 401 return up;
171} 402}
172 403
@@ -184,6 +415,7 @@ void switch_uid(struct user_struct *new_user)
184 atomic_dec(&old_user->processes); 415 atomic_dec(&old_user->processes);
185 switch_uid_keyring(new_user); 416 switch_uid_keyring(new_user);
186 current->user = new_user; 417 current->user = new_user;
418 sched_switch_user(current);
187 419
188 /* 420 /*
189 * We need to synchronize with __sigqueue_alloc() 421 * We need to synchronize with __sigqueue_alloc()
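The sysfs variant of free_user() above cannot tear the user down directly: the final put can happen with uidhash_lock held and IRQs off, while sysfs_remove_file()/kobject_del() may sleep, so the teardown is punted to a work_struct. A generic sketch of that deferred-release pattern, with made-up foo_* names:

	#include <linux/kernel.h>
	#include <linux/slab.h>
	#include <linux/workqueue.h>
	#include <asm/atomic.h>

	struct foo {
		atomic_t		refcnt;
		struct work_struct	work;
	};

	static void foo_release(struct work_struct *w)
	{
		struct foo *f = container_of(w, struct foo, work);

		/* process context: sleeping teardown (sysfs, kobjects,
		 * keys, ...) is safe here */
		kfree(f);
	}

	static void foo_put(struct foo *f)
	{
		/* may be called from atomic context */
		if (atomic_dec_and_test(&f->refcnt)) {
			INIT_WORK(&f->work, foo_release);
			schedule_work(&f->work);
		}
	}

remove_user_sysfs_dir() additionally re-takes uidhash_lock and re-checks the count, since a new reference can appear between scheduling the work and running it; the sketch above omits that race handling.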
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e080d1d744cc..52d5e7c9a8e6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -32,6 +32,7 @@
32#include <linux/freezer.h> 32#include <linux/freezer.h>
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 34#include <linux/debug_locks.h>
35#include <linux/lockdep.h>
35 36
36/* 37/*
37 * The per-CPU workqueue (if single thread, we always use the first 38 * The per-CPU workqueue (if single thread, we always use the first
@@ -61,6 +62,9 @@ struct workqueue_struct {
61 const char *name; 62 const char *name;
62 int singlethread; 63 int singlethread;
63 int freezeable; /* Freeze threads during suspend */ 64 int freezeable; /* Freeze threads during suspend */
65#ifdef CONFIG_LOCKDEP
66 struct lockdep_map lockdep_map;
67#endif
64}; 68};
65 69
66/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 70/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
@@ -250,6 +254,17 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
250 struct work_struct *work = list_entry(cwq->worklist.next, 254 struct work_struct *work = list_entry(cwq->worklist.next,
251 struct work_struct, entry); 255 struct work_struct, entry);
252 work_func_t f = work->func; 256 work_func_t f = work->func;
257#ifdef CONFIG_LOCKDEP
258 /*
259 * It is permissible to free the struct work_struct
260 * from inside the function that is called from it,
261 * this we need to take into account for lockdep too.
262 * To avoid bogus "held lock freed" warnings as well
263 * as problems when looking into work->lockdep_map,
264 * make a copy and use that here.
265 */
266 struct lockdep_map lockdep_map = work->lockdep_map;
267#endif
253 268
254 cwq->current_work = work; 269 cwq->current_work = work;
255 list_del_init(cwq->worklist.next); 270 list_del_init(cwq->worklist.next);
@@ -257,13 +272,17 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
257 272
258 BUG_ON(get_wq_data(work) != cwq); 273 BUG_ON(get_wq_data(work) != cwq);
259 work_clear_pending(work); 274 work_clear_pending(work);
275 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
276 lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_);
260 f(work); 277 f(work);
278 lock_release(&lockdep_map, 1, _THIS_IP_);
279 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
261 280
262 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 281 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
263 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 282 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
264 "%s/0x%08x/%d\n", 283 "%s/0x%08x/%d\n",
265 current->comm, preempt_count(), 284 current->comm, preempt_count(),
266 current->pid); 285 task_pid_nr(current));
267 printk(KERN_ERR " last function: "); 286 printk(KERN_ERR " last function: ");
268 print_symbol("%s\n", (unsigned long)f); 287 print_symbol("%s\n", (unsigned long)f);
269 debug_show_held_locks(current); 288 debug_show_held_locks(current);
@@ -376,6 +395,8 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
376 int cpu; 395 int cpu;
377 396
378 might_sleep(); 397 might_sleep();
398 lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
399 lock_release(&wq->lockdep_map, 1, _THIS_IP_);
379 for_each_cpu_mask(cpu, *cpu_map) 400 for_each_cpu_mask(cpu, *cpu_map)
380 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 401 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
381} 402}
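The lock_acquire()/lock_release() pairs added to flush_workqueue() above, together with the ones around each work function in run_workqueue(), give every workqueue and every work item a lockdep pseudo-lock so flush-related deadlocks are reported. A hedged sketch of the classic pattern these annotations are meant to catch (the my_* names are illustrative):

	#include <linux/mutex.h>
	#include <linux/workqueue.h>

	static DEFINE_MUTEX(lock_a);
	static struct workqueue_struct *my_wq;

	static void my_work_fn(struct work_struct *work)
	{
		mutex_lock(&lock_a);	/* work item depends on lock_a */
		/* ... */
		mutex_unlock(&lock_a);
	}

	static void my_teardown(void)
	{
		mutex_lock(&lock_a);
		/* flushing while holding lock_a inverts the ordering: the
		 * flush "acquires" my_wq's and the work item's lockdep
		 * maps, which in turn depend on lock_a, so lockdep flags
		 * this even if it never actually hangs. */
		flush_workqueue(my_wq);
		mutex_unlock(&lock_a);
	}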
@@ -446,6 +467,9 @@ static void wait_on_work(struct work_struct *work)
446 467
447 might_sleep(); 468 might_sleep();
448 469
470 lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
471 lock_release(&work->lockdep_map, 1, _THIS_IP_);
472
449 cwq = get_wq_data(work); 473 cwq = get_wq_data(work);
450 if (!cwq) 474 if (!cwq)
451 return; 475 return;
@@ -695,8 +719,10 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
695 } 719 }
696} 720}
697 721
698struct workqueue_struct *__create_workqueue(const char *name, 722struct workqueue_struct *__create_workqueue_key(const char *name,
699 int singlethread, int freezeable) 723 int singlethread,
724 int freezeable,
725 struct lock_class_key *key)
700{ 726{
701 struct workqueue_struct *wq; 727 struct workqueue_struct *wq;
702 struct cpu_workqueue_struct *cwq; 728 struct cpu_workqueue_struct *cwq;
@@ -713,6 +739,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
713 } 739 }
714 740
715 wq->name = name; 741 wq->name = name;
742 lockdep_init_map(&wq->lockdep_map, name, key, 0);
716 wq->singlethread = singlethread; 743 wq->singlethread = singlethread;
717 wq->freezeable = freezeable; 744 wq->freezeable = freezeable;
718 INIT_LIST_HEAD(&wq->list); 745 INIT_LIST_HEAD(&wq->list);
@@ -741,7 +768,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
741 } 768 }
742 return wq; 769 return wq;
743} 770}
744EXPORT_SYMBOL_GPL(__create_workqueue); 771EXPORT_SYMBOL_GPL(__create_workqueue_key);
745 772
746static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 773static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
747{ 774{
@@ -752,6 +779,9 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
752 if (cwq->thread == NULL) 779 if (cwq->thread == NULL)
753 return; 780 return;
754 781
782 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
783 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
784
755 flush_cpu_workqueue(cwq); 785 flush_cpu_workqueue(cwq);
756 /* 786 /*
757 * If the caller is CPU_DEAD and cwq->worklist was not empty, 787 * If the caller is CPU_DEAD and cwq->worklist was not empty,