aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/acct.c23
-rw-r--r--kernel/audit.c30
-rw-r--r--kernel/audit.h25
-rw-r--r--kernel/auditfilter.c99
-rw-r--r--kernel/auditsc.c74
-rw-r--r--kernel/bounds.c23
-rw-r--r--kernel/cgroup.c54
-rw-r--r--kernel/compat.c17
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/cpuset.c125
-rw-r--r--kernel/exit.c39
-rw-r--r--kernel/fork.c103
-rw-r--r--kernel/futex.c6
-rw-r--r--kernel/futex_compat.c2
-rw-r--r--kernel/hrtimer.c119
-rw-r--r--kernel/irq/chip.c2
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kgdb.c1700
-rw-r--r--kernel/kmod.c2
-rw-r--r--kernel/kprobes.c349
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/latencytop.c27
-rw-r--r--kernel/marker.c40
-rw-r--r--kernel/module.c3
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/posix-cpu-timers.c30
-rw-r--r--kernel/posix-timers.c1
-rw-r--r--kernel/power/Kconfig10
-rw-r--r--kernel/power/Makefile1
-rw-r--r--kernel/power/console.c27
-rw-r--r--kernel/power/pm.c205
-rw-r--r--kernel/printk.c96
-rw-r--r--kernel/profile.c1
-rw-r--r--kernel/ptrace.c55
-rw-r--r--kernel/rcupreempt.c4
-rw-r--r--kernel/rcutorture.c15
-rw-r--r--kernel/relay.c7
-rw-r--r--kernel/resource.c18
-rw-r--r--kernel/sched.c1908
-rw-r--r--kernel/sched_debug.c36
-rw-r--r--kernel/sched_fair.c578
-rw-r--r--kernel/sched_features.h10
-rw-r--r--kernel/sched_rt.c227
-rw-r--r--kernel/sched_stats.h8
-rw-r--r--kernel/semaphore.c264
-rw-r--r--kernel/signal.c75
-rw-r--r--kernel/softirq.c63
-rw-r--r--kernel/stop_machine.c6
-rw-r--r--kernel/sys.c40
-rw-r--r--kernel/sysctl.c15
-rw-r--r--kernel/time.c1
-rw-r--r--kernel/time/clocksource.c32
-rw-r--r--kernel/time/tick-broadcast.c4
-rw-r--r--kernel/time/tick-common.c4
-rw-r--r--kernel/time/tick-oneshot.c2
-rw-r--r--kernel/time/tick-sched.c12
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/timer.c26
-rw-r--r--kernel/uid16.c22
-rw-r--r--kernel/user.c30
-rw-r--r--kernel/workqueue.c2
62 files changed, 5249 insertions, 1467 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c584c55a6e9..6c5f081132a4 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o 12 notifier.o ksysfs.o pm_qos_params.o
13 13
14obj-$(CONFIG_SYSCTL) += sysctl_check.o 14obj-$(CONFIG_SYSCTL) += sysctl_check.o
@@ -53,6 +53,7 @@ obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
53obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 53obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
54obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 54obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
55obj-$(CONFIG_KPROBES) += kprobes.o 55obj-$(CONFIG_KPROBES) += kprobes.o
56obj-$(CONFIG_KGDB) += kgdb.o
56obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o 57obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
57obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 58obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
58obj-$(CONFIG_SECCOMP) += seccomp.o 59obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 521dfa53cb99..91e1cfd734d2 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -58,6 +58,7 @@
58#include <asm/uaccess.h> 58#include <asm/uaccess.h>
59#include <asm/div64.h> 59#include <asm/div64.h>
60#include <linux/blkdev.h> /* sector_div */ 60#include <linux/blkdev.h> /* sector_div */
61#include <linux/pid_namespace.h>
61 62
62/* 63/*
63 * These constants control the amount of freespace that suspend and 64 * These constants control the amount of freespace that suspend and
@@ -74,7 +75,7 @@ int acct_parm[3] = {4, 2, 30};
74/* 75/*
75 * External references and all of the globals. 76 * External references and all of the globals.
76 */ 77 */
77static void do_acct_process(struct file *); 78static void do_acct_process(struct pid_namespace *ns, struct file *);
78 79
79/* 80/*
80 * This structure is used so that all the data protected by lock 81 * This structure is used so that all the data protected by lock
@@ -86,6 +87,7 @@ struct acct_glbs {
86 volatile int active; 87 volatile int active;
87 volatile int needcheck; 88 volatile int needcheck;
88 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns;
89 struct timer_list timer; 91 struct timer_list timer;
90}; 92};
91 93
@@ -175,9 +177,11 @@ out:
175static void acct_file_reopen(struct file *file) 177static void acct_file_reopen(struct file *file)
176{ 178{
177 struct file *old_acct = NULL; 179 struct file *old_acct = NULL;
180 struct pid_namespace *old_ns = NULL;
178 181
179 if (acct_globals.file) { 182 if (acct_globals.file) {
180 old_acct = acct_globals.file; 183 old_acct = acct_globals.file;
184 old_ns = acct_globals.ns;
181 del_timer(&acct_globals.timer); 185 del_timer(&acct_globals.timer);
182 acct_globals.active = 0; 186 acct_globals.active = 0;
183 acct_globals.needcheck = 0; 187 acct_globals.needcheck = 0;
@@ -185,6 +189,7 @@ static void acct_file_reopen(struct file *file)
185 } 189 }
186 if (file) { 190 if (file) {
187 acct_globals.file = file; 191 acct_globals.file = file;
192 acct_globals.ns = get_pid_ns(task_active_pid_ns(current));
188 acct_globals.needcheck = 0; 193 acct_globals.needcheck = 0;
189 acct_globals.active = 1; 194 acct_globals.active = 1;
190 /* It's been deleted if it was used before so this is safe */ 195 /* It's been deleted if it was used before so this is safe */
@@ -196,8 +201,9 @@ static void acct_file_reopen(struct file *file)
196 if (old_acct) { 201 if (old_acct) {
197 mnt_unpin(old_acct->f_path.mnt); 202 mnt_unpin(old_acct->f_path.mnt);
198 spin_unlock(&acct_globals.lock); 203 spin_unlock(&acct_globals.lock);
199 do_acct_process(old_acct); 204 do_acct_process(old_ns, old_acct);
200 filp_close(old_acct, NULL); 205 filp_close(old_acct, NULL);
206 put_pid_ns(old_ns);
201 spin_lock(&acct_globals.lock); 207 spin_lock(&acct_globals.lock);
202 } 208 }
203} 209}
@@ -419,7 +425,7 @@ static u32 encode_float(u64 value)
419/* 425/*
420 * do_acct_process does all actual work. Caller holds the reference to file. 426 * do_acct_process does all actual work. Caller holds the reference to file.
421 */ 427 */
422static void do_acct_process(struct file *file) 428static void do_acct_process(struct pid_namespace *ns, struct file *file)
423{ 429{
424 struct pacct_struct *pacct = &current->signal->pacct; 430 struct pacct_struct *pacct = &current->signal->pacct;
425 acct_t ac; 431 acct_t ac;
@@ -481,8 +487,10 @@ static void do_acct_process(struct file *file)
481 ac.ac_gid16 = current->gid; 487 ac.ac_gid16 = current->gid;
482#endif 488#endif
483#if ACCT_VERSION==3 489#if ACCT_VERSION==3
484 ac.ac_pid = current->tgid; 490 ac.ac_pid = task_tgid_nr_ns(current, ns);
485 ac.ac_ppid = current->real_parent->tgid; 491 rcu_read_lock();
492 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
493 rcu_read_unlock();
486#endif 494#endif
487 495
488 spin_lock_irq(&current->sighand->siglock); 496 spin_lock_irq(&current->sighand->siglock);
@@ -578,6 +586,7 @@ void acct_collect(long exitcode, int group_dead)
578void acct_process(void) 586void acct_process(void)
579{ 587{
580 struct file *file = NULL; 588 struct file *file = NULL;
589 struct pid_namespace *ns;
581 590
582 /* 591 /*
583 * accelerate the common fastpath: 592 * accelerate the common fastpath:
@@ -592,8 +601,10 @@ void acct_process(void)
592 return; 601 return;
593 } 602 }
594 get_file(file); 603 get_file(file);
604 ns = get_pid_ns(acct_globals.ns);
595 spin_unlock(&acct_globals.lock); 605 spin_unlock(&acct_globals.lock);
596 606
597 do_acct_process(file); 607 do_acct_process(ns, file);
598 fput(file); 608 fput(file);
609 put_pid_ns(ns);
599} 610}
diff --git a/kernel/audit.c b/kernel/audit.c
index be55cb503633..a7b16086d36f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -21,7 +21,7 @@
21 * 21 *
22 * Written by Rickard E. (Rik) Faith <faith@redhat.com> 22 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
23 * 23 *
24 * Goals: 1) Integrate fully with SELinux. 24 * Goals: 1) Integrate fully with Security Modules.
25 * 2) Minimal run-time overhead: 25 * 2) Minimal run-time overhead:
26 * a) Minimal when syscall auditing is disabled (audit_enable=0). 26 * a) Minimal when syscall auditing is disabled (audit_enable=0).
27 * b) Small when syscall auditing is enabled and no audit record 27 * b) Small when syscall auditing is enabled and no audit record
@@ -55,7 +55,6 @@
55#include <net/netlink.h> 55#include <net/netlink.h>
56#include <linux/skbuff.h> 56#include <linux/skbuff.h>
57#include <linux/netlink.h> 57#include <linux/netlink.h>
58#include <linux/selinux.h>
59#include <linux/inotify.h> 58#include <linux/inotify.h>
60#include <linux/freezer.h> 59#include <linux/freezer.h>
61#include <linux/tty.h> 60#include <linux/tty.h>
@@ -265,13 +264,13 @@ static int audit_log_config_change(char *function_name, int new, int old,
265 char *ctx = NULL; 264 char *ctx = NULL;
266 u32 len; 265 u32 len;
267 266
268 rc = selinux_sid_to_string(sid, &ctx, &len); 267 rc = security_secid_to_secctx(sid, &ctx, &len);
269 if (rc) { 268 if (rc) {
270 audit_log_format(ab, " sid=%u", sid); 269 audit_log_format(ab, " sid=%u", sid);
271 allow_changes = 0; /* Something weird, deny request */ 270 allow_changes = 0; /* Something weird, deny request */
272 } else { 271 } else {
273 audit_log_format(ab, " subj=%s", ctx); 272 audit_log_format(ab, " subj=%s", ctx);
274 kfree(ctx); 273 security_release_secctx(ctx, len);
275 } 274 }
276 } 275 }
277 audit_log_format(ab, " res=%d", allow_changes); 276 audit_log_format(ab, " res=%d", allow_changes);
@@ -550,12 +549,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
550 audit_log_format(*ab, "user pid=%d uid=%u auid=%u", 549 audit_log_format(*ab, "user pid=%d uid=%u auid=%u",
551 pid, uid, auid); 550 pid, uid, auid);
552 if (sid) { 551 if (sid) {
553 rc = selinux_sid_to_string(sid, &ctx, &len); 552 rc = security_secid_to_secctx(sid, &ctx, &len);
554 if (rc) 553 if (rc)
555 audit_log_format(*ab, " ssid=%u", sid); 554 audit_log_format(*ab, " ssid=%u", sid);
556 else 555 else {
557 audit_log_format(*ab, " subj=%s", ctx); 556 audit_log_format(*ab, " subj=%s", ctx);
558 kfree(ctx); 557 security_release_secctx(ctx, len);
558 }
559 } 559 }
560 560
561 return rc; 561 return rc;
@@ -758,18 +758,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
758 break; 758 break;
759 } 759 }
760 case AUDIT_SIGNAL_INFO: 760 case AUDIT_SIGNAL_INFO:
761 err = selinux_sid_to_string(audit_sig_sid, &ctx, &len); 761 err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
762 if (err) 762 if (err)
763 return err; 763 return err;
764 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); 764 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
765 if (!sig_data) { 765 if (!sig_data) {
766 kfree(ctx); 766 security_release_secctx(ctx, len);
767 return -ENOMEM; 767 return -ENOMEM;
768 } 768 }
769 sig_data->uid = audit_sig_uid; 769 sig_data->uid = audit_sig_uid;
770 sig_data->pid = audit_sig_pid; 770 sig_data->pid = audit_sig_pid;
771 memcpy(sig_data->ctx, ctx, len); 771 memcpy(sig_data->ctx, ctx, len);
772 kfree(ctx); 772 security_release_secctx(ctx, len);
773 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 773 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
774 0, 0, sig_data, sizeof(*sig_data) + len); 774 0, 0, sig_data, sizeof(*sig_data) + len);
775 kfree(sig_data); 775 kfree(sig_data);
@@ -881,10 +881,6 @@ static int __init audit_init(void)
881 audit_enabled = audit_default; 881 audit_enabled = audit_default;
882 audit_ever_enabled |= !!audit_default; 882 audit_ever_enabled |= !!audit_default;
883 883
884 /* Register the callback with selinux. This callback will be invoked
885 * when a new policy is loaded. */
886 selinux_audit_set_callback(&selinux_audit_rule_update);
887
888 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 884 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
889 885
890#ifdef CONFIG_AUDITSYSCALL 886#ifdef CONFIG_AUDITSYSCALL
@@ -1269,8 +1265,8 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1269 1265
1270/** 1266/**
1271 * audit_string_contains_control - does a string need to be logged in hex 1267 * audit_string_contains_control - does a string need to be logged in hex
1272 * @string - string to be checked 1268 * @string: string to be checked
1273 * @len - max length of the string to check 1269 * @len: max length of the string to check
1274 */ 1270 */
1275int audit_string_contains_control(const char *string, size_t len) 1271int audit_string_contains_control(const char *string, size_t len)
1276{ 1272{
@@ -1285,7 +1281,7 @@ int audit_string_contains_control(const char *string, size_t len)
1285/** 1281/**
1286 * audit_log_n_untrustedstring - log a string that may contain random characters 1282 * audit_log_n_untrustedstring - log a string that may contain random characters
1287 * @ab: audit_buffer 1283 * @ab: audit_buffer
1288 * @len: lenth of string (not including trailing null) 1284 * @len: length of string (not including trailing null)
1289 * @string: string to be logged 1285 * @string: string to be logged
1290 * 1286 *
1291 * This code will escape a string that is passed to it if the string 1287 * This code will escape a string that is passed to it if the string
diff --git a/kernel/audit.h b/kernel/audit.h
index 2554bd524fd1..3cfc54ee3e1f 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -65,34 +65,9 @@ struct audit_watch {
65 struct list_head rules; /* associated rules */ 65 struct list_head rules; /* associated rules */
66}; 66};
67 67
68struct audit_field {
69 u32 type;
70 u32 val;
71 u32 op;
72 char *se_str;
73 struct selinux_audit_rule *se_rule;
74};
75
76struct audit_tree; 68struct audit_tree;
77struct audit_chunk; 69struct audit_chunk;
78 70
79struct audit_krule {
80 int vers_ops;
81 u32 flags;
82 u32 listnr;
83 u32 action;
84 u32 mask[AUDIT_BITMASK_SIZE];
85 u32 buflen; /* for data alloc on list rules */
86 u32 field_count;
87 char *filterkey; /* ties events to rules */
88 struct audit_field *fields;
89 struct audit_field *arch_f; /* quick access to arch field */
90 struct audit_field *inode_f; /* quick access to an inode field */
91 struct audit_watch *watch; /* associated watch */
92 struct audit_tree *tree; /* associated watched tree */
93 struct list_head rlist; /* entry in audit_{watch,tree}.rules list */
94};
95
96struct audit_entry { 71struct audit_entry {
97 struct list_head list; 72 struct list_head list;
98 struct rcu_head rcu; 73 struct rcu_head rcu;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 2f2914b7cc30..28fef6bf8534 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -28,7 +28,7 @@
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/inotify.h> 30#include <linux/inotify.h>
31#include <linux/selinux.h> 31#include <linux/security.h>
32#include "audit.h" 32#include "audit.h"
33 33
34/* 34/*
@@ -38,7 +38,7 @@
38 * Synchronizes writes and blocking reads of audit's filterlist 38 * Synchronizes writes and blocking reads of audit's filterlist
39 * data. Rcu is used to traverse the filterlist and access 39 * data. Rcu is used to traverse the filterlist and access
40 * contents of structs audit_entry, audit_watch and opaque 40 * contents of structs audit_entry, audit_watch and opaque
41 * selinux rules during filtering. If modified, these structures 41 * LSM rules during filtering. If modified, these structures
42 * must be copied and replace their counterparts in the filterlist. 42 * must be copied and replace their counterparts in the filterlist.
43 * An audit_parent struct is not accessed during filtering, so may 43 * An audit_parent struct is not accessed during filtering, so may
44 * be written directly provided audit_filter_mutex is held. 44 * be written directly provided audit_filter_mutex is held.
@@ -139,8 +139,8 @@ static inline void audit_free_rule(struct audit_entry *e)
139 if (e->rule.fields) 139 if (e->rule.fields)
140 for (i = 0; i < e->rule.field_count; i++) { 140 for (i = 0; i < e->rule.field_count; i++) {
141 struct audit_field *f = &e->rule.fields[i]; 141 struct audit_field *f = &e->rule.fields[i];
142 kfree(f->se_str); 142 kfree(f->lsm_str);
143 selinux_audit_rule_free(f->se_rule); 143 security_audit_rule_free(f->lsm_rule);
144 } 144 }
145 kfree(e->rule.fields); 145 kfree(e->rule.fields);
146 kfree(e->rule.filterkey); 146 kfree(e->rule.filterkey);
@@ -554,8 +554,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
554 f->op = data->fieldflags[i] & AUDIT_OPERATORS; 554 f->op = data->fieldflags[i] & AUDIT_OPERATORS;
555 f->type = data->fields[i]; 555 f->type = data->fields[i];
556 f->val = data->values[i]; 556 f->val = data->values[i];
557 f->se_str = NULL; 557 f->lsm_str = NULL;
558 f->se_rule = NULL; 558 f->lsm_rule = NULL;
559 switch(f->type) { 559 switch(f->type) {
560 case AUDIT_PID: 560 case AUDIT_PID:
561 case AUDIT_UID: 561 case AUDIT_UID:
@@ -597,12 +597,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
597 goto exit_free; 597 goto exit_free;
598 entry->rule.buflen += f->val; 598 entry->rule.buflen += f->val;
599 599
600 err = selinux_audit_rule_init(f->type, f->op, str, 600 err = security_audit_rule_init(f->type, f->op, str,
601 &f->se_rule); 601 (void **)&f->lsm_rule);
602 /* Keep currently invalid fields around in case they 602 /* Keep currently invalid fields around in case they
603 * become valid after a policy reload. */ 603 * become valid after a policy reload. */
604 if (err == -EINVAL) { 604 if (err == -EINVAL) {
605 printk(KERN_WARNING "audit rule for selinux " 605 printk(KERN_WARNING "audit rule for LSM "
606 "\'%s\' is invalid\n", str); 606 "\'%s\' is invalid\n", str);
607 err = 0; 607 err = 0;
608 } 608 }
@@ -610,7 +610,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
610 kfree(str); 610 kfree(str);
611 goto exit_free; 611 goto exit_free;
612 } else 612 } else
613 f->se_str = str; 613 f->lsm_str = str;
614 break; 614 break;
615 case AUDIT_WATCH: 615 case AUDIT_WATCH:
616 str = audit_unpack_string(&bufp, &remain, f->val); 616 str = audit_unpack_string(&bufp, &remain, f->val);
@@ -754,7 +754,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
754 case AUDIT_OBJ_LEV_LOW: 754 case AUDIT_OBJ_LEV_LOW:
755 case AUDIT_OBJ_LEV_HIGH: 755 case AUDIT_OBJ_LEV_HIGH:
756 data->buflen += data->values[i] = 756 data->buflen += data->values[i] =
757 audit_pack_string(&bufp, f->se_str); 757 audit_pack_string(&bufp, f->lsm_str);
758 break; 758 break;
759 case AUDIT_WATCH: 759 case AUDIT_WATCH:
760 data->buflen += data->values[i] = 760 data->buflen += data->values[i] =
@@ -806,7 +806,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
806 case AUDIT_OBJ_TYPE: 806 case AUDIT_OBJ_TYPE:
807 case AUDIT_OBJ_LEV_LOW: 807 case AUDIT_OBJ_LEV_LOW:
808 case AUDIT_OBJ_LEV_HIGH: 808 case AUDIT_OBJ_LEV_HIGH:
809 if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) 809 if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str))
810 return 1; 810 return 1;
811 break; 811 break;
812 case AUDIT_WATCH: 812 case AUDIT_WATCH:
@@ -862,28 +862,28 @@ out:
862 return new; 862 return new;
863} 863}
864 864
865/* Duplicate selinux field information. The se_rule is opaque, so must be 865/* Duplicate LSM field information. The lsm_rule is opaque, so must be
866 * re-initialized. */ 866 * re-initialized. */
867static inline int audit_dupe_selinux_field(struct audit_field *df, 867static inline int audit_dupe_lsm_field(struct audit_field *df,
868 struct audit_field *sf) 868 struct audit_field *sf)
869{ 869{
870 int ret = 0; 870 int ret = 0;
871 char *se_str; 871 char *lsm_str;
872 872
873 /* our own copy of se_str */ 873 /* our own copy of lsm_str */
874 se_str = kstrdup(sf->se_str, GFP_KERNEL); 874 lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL);
875 if (unlikely(!se_str)) 875 if (unlikely(!lsm_str))
876 return -ENOMEM; 876 return -ENOMEM;
877 df->se_str = se_str; 877 df->lsm_str = lsm_str;
878 878
879 /* our own (refreshed) copy of se_rule */ 879 /* our own (refreshed) copy of lsm_rule */
880 ret = selinux_audit_rule_init(df->type, df->op, df->se_str, 880 ret = security_audit_rule_init(df->type, df->op, df->lsm_str,
881 &df->se_rule); 881 (void **)&df->lsm_rule);
882 /* Keep currently invalid fields around in case they 882 /* Keep currently invalid fields around in case they
883 * become valid after a policy reload. */ 883 * become valid after a policy reload. */
884 if (ret == -EINVAL) { 884 if (ret == -EINVAL) {
885 printk(KERN_WARNING "audit rule for selinux \'%s\' is " 885 printk(KERN_WARNING "audit rule for LSM \'%s\' is "
886 "invalid\n", df->se_str); 886 "invalid\n", df->lsm_str);
887 ret = 0; 887 ret = 0;
888 } 888 }
889 889
@@ -891,7 +891,7 @@ static inline int audit_dupe_selinux_field(struct audit_field *df,
891} 891}
892 892
893/* Duplicate an audit rule. This will be a deep copy with the exception 893/* Duplicate an audit rule. This will be a deep copy with the exception
894 * of the watch - that pointer is carried over. The selinux specific fields 894 * of the watch - that pointer is carried over. The LSM specific fields
895 * will be updated in the copy. The point is to be able to replace the old 895 * will be updated in the copy. The point is to be able to replace the old
896 * rule with the new rule in the filterlist, then free the old rule. 896 * rule with the new rule in the filterlist, then free the old rule.
897 * The rlist element is undefined; list manipulations are handled apart from 897 * The rlist element is undefined; list manipulations are handled apart from
@@ -930,7 +930,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
930 new->tree = old->tree; 930 new->tree = old->tree;
931 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); 931 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
932 932
933 /* deep copy this information, updating the se_rule fields, because 933 /* deep copy this information, updating the lsm_rule fields, because
934 * the originals will all be freed when the old rule is freed. */ 934 * the originals will all be freed when the old rule is freed. */
935 for (i = 0; i < fcount; i++) { 935 for (i = 0; i < fcount; i++) {
936 switch (new->fields[i].type) { 936 switch (new->fields[i].type) {
@@ -944,7 +944,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
944 case AUDIT_OBJ_TYPE: 944 case AUDIT_OBJ_TYPE:
945 case AUDIT_OBJ_LEV_LOW: 945 case AUDIT_OBJ_LEV_LOW:
946 case AUDIT_OBJ_LEV_HIGH: 946 case AUDIT_OBJ_LEV_HIGH:
947 err = audit_dupe_selinux_field(&new->fields[i], 947 err = audit_dupe_lsm_field(&new->fields[i],
948 &old->fields[i]); 948 &old->fields[i]);
949 break; 949 break;
950 case AUDIT_FILTERKEY: 950 case AUDIT_FILTERKEY:
@@ -1515,11 +1515,12 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1515 if (sid) { 1515 if (sid) {
1516 char *ctx = NULL; 1516 char *ctx = NULL;
1517 u32 len; 1517 u32 len;
1518 if (selinux_sid_to_string(sid, &ctx, &len)) 1518 if (security_secid_to_secctx(sid, &ctx, &len))
1519 audit_log_format(ab, " ssid=%u", sid); 1519 audit_log_format(ab, " ssid=%u", sid);
1520 else 1520 else {
1521 audit_log_format(ab, " subj=%s", ctx); 1521 audit_log_format(ab, " subj=%s", ctx);
1522 kfree(ctx); 1522 security_release_secctx(ctx, len);
1523 }
1523 } 1524 }
1524 audit_log_format(ab, " op=%s rule key=", action); 1525 audit_log_format(ab, " op=%s rule key=", action);
1525 if (rule->filterkey) 1526 if (rule->filterkey)
@@ -1761,38 +1762,12 @@ unlock_and_return:
1761 return result; 1762 return result;
1762} 1763}
1763 1764
1764/* Check to see if the rule contains any selinux fields. Returns 1 if there 1765/* This function will re-initialize the lsm_rule field of all applicable rules.
1765 are selinux fields specified in the rule, 0 otherwise. */ 1766 * It will traverse the filter lists serarching for rules that contain LSM
1766static inline int audit_rule_has_selinux(struct audit_krule *rule)
1767{
1768 int i;
1769
1770 for (i = 0; i < rule->field_count; i++) {
1771 struct audit_field *f = &rule->fields[i];
1772 switch (f->type) {
1773 case AUDIT_SUBJ_USER:
1774 case AUDIT_SUBJ_ROLE:
1775 case AUDIT_SUBJ_TYPE:
1776 case AUDIT_SUBJ_SEN:
1777 case AUDIT_SUBJ_CLR:
1778 case AUDIT_OBJ_USER:
1779 case AUDIT_OBJ_ROLE:
1780 case AUDIT_OBJ_TYPE:
1781 case AUDIT_OBJ_LEV_LOW:
1782 case AUDIT_OBJ_LEV_HIGH:
1783 return 1;
1784 }
1785 }
1786
1787 return 0;
1788}
1789
1790/* This function will re-initialize the se_rule field of all applicable rules.
1791 * It will traverse the filter lists serarching for rules that contain selinux
1792 * specific filter fields. When such a rule is found, it is copied, the 1767 * specific filter fields. When such a rule is found, it is copied, the
1793 * selinux field is re-initialized, and the old rule is replaced with the 1768 * LSM field is re-initialized, and the old rule is replaced with the
1794 * updated rule. */ 1769 * updated rule. */
1795int selinux_audit_rule_update(void) 1770int audit_update_lsm_rules(void)
1796{ 1771{
1797 struct audit_entry *entry, *n, *nentry; 1772 struct audit_entry *entry, *n, *nentry;
1798 struct audit_watch *watch; 1773 struct audit_watch *watch;
@@ -1804,7 +1779,7 @@ int selinux_audit_rule_update(void)
1804 1779
1805 for (i = 0; i < AUDIT_NR_FILTERS; i++) { 1780 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
1806 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { 1781 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
1807 if (!audit_rule_has_selinux(&entry->rule)) 1782 if (!security_audit_rule_known(&entry->rule))
1808 continue; 1783 continue;
1809 1784
1810 watch = entry->rule.watch; 1785 watch = entry->rule.watch;
@@ -1815,7 +1790,7 @@ int selinux_audit_rule_update(void)
1815 * return value */ 1790 * return value */
1816 if (!err) 1791 if (!err)
1817 err = PTR_ERR(nentry); 1792 err = PTR_ERR(nentry);
1818 audit_panic("error updating selinux filters"); 1793 audit_panic("error updating LSM filters");
1819 if (watch) 1794 if (watch)
1820 list_del(&entry->rule.rlist); 1795 list_del(&entry->rule.rlist);
1821 list_del_rcu(&entry->list); 1796 list_del_rcu(&entry->list);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 782262e4107d..56e56ed594a8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -61,7 +61,6 @@
61#include <linux/security.h> 61#include <linux/security.h>
62#include <linux/list.h> 62#include <linux/list.h>
63#include <linux/tty.h> 63#include <linux/tty.h>
64#include <linux/selinux.h>
65#include <linux/binfmts.h> 64#include <linux/binfmts.h>
66#include <linux/highmem.h> 65#include <linux/highmem.h>
67#include <linux/syscalls.h> 66#include <linux/syscalls.h>
@@ -528,14 +527,14 @@ static int audit_filter_rules(struct task_struct *tsk,
528 match for now to avoid losing information that 527 match for now to avoid losing information that
529 may be wanted. An error message will also be 528 may be wanted. An error message will also be
530 logged upon error */ 529 logged upon error */
531 if (f->se_rule) { 530 if (f->lsm_rule) {
532 if (need_sid) { 531 if (need_sid) {
533 selinux_get_task_sid(tsk, &sid); 532 security_task_getsecid(tsk, &sid);
534 need_sid = 0; 533 need_sid = 0;
535 } 534 }
536 result = selinux_audit_rule_match(sid, f->type, 535 result = security_audit_rule_match(sid, f->type,
537 f->op, 536 f->op,
538 f->se_rule, 537 f->lsm_rule,
539 ctx); 538 ctx);
540 } 539 }
541 break; 540 break;
@@ -546,18 +545,18 @@ static int audit_filter_rules(struct task_struct *tsk,
546 case AUDIT_OBJ_LEV_HIGH: 545 case AUDIT_OBJ_LEV_HIGH:
547 /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR 546 /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR
548 also applies here */ 547 also applies here */
549 if (f->se_rule) { 548 if (f->lsm_rule) {
550 /* Find files that match */ 549 /* Find files that match */
551 if (name) { 550 if (name) {
552 result = selinux_audit_rule_match( 551 result = security_audit_rule_match(
553 name->osid, f->type, f->op, 552 name->osid, f->type, f->op,
554 f->se_rule, ctx); 553 f->lsm_rule, ctx);
555 } else if (ctx) { 554 } else if (ctx) {
556 for (j = 0; j < ctx->name_count; j++) { 555 for (j = 0; j < ctx->name_count; j++) {
557 if (selinux_audit_rule_match( 556 if (security_audit_rule_match(
558 ctx->names[j].osid, 557 ctx->names[j].osid,
559 f->type, f->op, 558 f->type, f->op,
560 f->se_rule, ctx)) { 559 f->lsm_rule, ctx)) {
561 ++result; 560 ++result;
562 break; 561 break;
563 } 562 }
@@ -570,7 +569,7 @@ static int audit_filter_rules(struct task_struct *tsk,
570 aux = aux->next) { 569 aux = aux->next) {
571 if (aux->type == AUDIT_IPC) { 570 if (aux->type == AUDIT_IPC) {
572 struct audit_aux_data_ipcctl *axi = (void *)aux; 571 struct audit_aux_data_ipcctl *axi = (void *)aux;
573 if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { 572 if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) {
574 ++result; 573 ++result;
575 break; 574 break;
576 } 575 }
@@ -885,11 +884,11 @@ void audit_log_task_context(struct audit_buffer *ab)
885 int error; 884 int error;
886 u32 sid; 885 u32 sid;
887 886
888 selinux_get_task_sid(current, &sid); 887 security_task_getsecid(current, &sid);
889 if (!sid) 888 if (!sid)
890 return; 889 return;
891 890
892 error = selinux_sid_to_string(sid, &ctx, &len); 891 error = security_secid_to_secctx(sid, &ctx, &len);
893 if (error) { 892 if (error) {
894 if (error != -EINVAL) 893 if (error != -EINVAL)
895 goto error_path; 894 goto error_path;
@@ -897,7 +896,7 @@ void audit_log_task_context(struct audit_buffer *ab)
897 } 896 }
898 897
899 audit_log_format(ab, " subj=%s", ctx); 898 audit_log_format(ab, " subj=%s", ctx);
900 kfree(ctx); 899 security_release_secctx(ctx, len);
901 return; 900 return;
902 901
903error_path: 902error_path:
@@ -941,7 +940,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
941 u32 sid, char *comm) 940 u32 sid, char *comm)
942{ 941{
943 struct audit_buffer *ab; 942 struct audit_buffer *ab;
944 char *s = NULL; 943 char *ctx = NULL;
945 u32 len; 944 u32 len;
946 int rc = 0; 945 int rc = 0;
947 946
@@ -951,15 +950,16 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
951 950
952 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, 951 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid,
953 uid, sessionid); 952 uid, sessionid);
954 if (selinux_sid_to_string(sid, &s, &len)) { 953 if (security_secid_to_secctx(sid, &ctx, &len)) {
955 audit_log_format(ab, " obj=(none)"); 954 audit_log_format(ab, " obj=(none)");
956 rc = 1; 955 rc = 1;
957 } else 956 } else {
958 audit_log_format(ab, " obj=%s", s); 957 audit_log_format(ab, " obj=%s", ctx);
958 security_release_secctx(ctx, len);
959 }
959 audit_log_format(ab, " ocomm="); 960 audit_log_format(ab, " ocomm=");
960 audit_log_untrustedstring(ab, comm); 961 audit_log_untrustedstring(ab, comm);
961 audit_log_end(ab); 962 audit_log_end(ab);
962 kfree(s);
963 963
964 return rc; 964 return rc;
965} 965}
@@ -1271,14 +1271,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1271 if (axi->osid != 0) { 1271 if (axi->osid != 0) {
1272 char *ctx = NULL; 1272 char *ctx = NULL;
1273 u32 len; 1273 u32 len;
1274 if (selinux_sid_to_string( 1274 if (security_secid_to_secctx(
1275 axi->osid, &ctx, &len)) { 1275 axi->osid, &ctx, &len)) {
1276 audit_log_format(ab, " osid=%u", 1276 audit_log_format(ab, " osid=%u",
1277 axi->osid); 1277 axi->osid);
1278 call_panic = 1; 1278 call_panic = 1;
1279 } else 1279 } else {
1280 audit_log_format(ab, " obj=%s", ctx); 1280 audit_log_format(ab, " obj=%s", ctx);
1281 kfree(ctx); 1281 security_release_secctx(ctx, len);
1282 }
1282 } 1283 }
1283 break; } 1284 break; }
1284 1285
@@ -1392,13 +1393,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1392 if (n->osid != 0) { 1393 if (n->osid != 0) {
1393 char *ctx = NULL; 1394 char *ctx = NULL;
1394 u32 len; 1395 u32 len;
1395 if (selinux_sid_to_string( 1396 if (security_secid_to_secctx(
1396 n->osid, &ctx, &len)) { 1397 n->osid, &ctx, &len)) {
1397 audit_log_format(ab, " osid=%u", n->osid); 1398 audit_log_format(ab, " osid=%u", n->osid);
1398 call_panic = 2; 1399 call_panic = 2;
1399 } else 1400 } else {
1400 audit_log_format(ab, " obj=%s", ctx); 1401 audit_log_format(ab, " obj=%s", ctx);
1401 kfree(ctx); 1402 security_release_secctx(ctx, len);
1403 }
1402 } 1404 }
1403 1405
1404 audit_log_end(ab); 1406 audit_log_end(ab);
@@ -1775,7 +1777,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode
1775 name->uid = inode->i_uid; 1777 name->uid = inode->i_uid;
1776 name->gid = inode->i_gid; 1778 name->gid = inode->i_gid;
1777 name->rdev = inode->i_rdev; 1779 name->rdev = inode->i_rdev;
1778 selinux_get_inode_sid(inode, &name->osid); 1780 security_inode_getsecid(inode, &name->osid);
1779} 1781}
1780 1782
1781/** 1783/**
@@ -2190,8 +2192,7 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2190 ax->uid = ipcp->uid; 2192 ax->uid = ipcp->uid;
2191 ax->gid = ipcp->gid; 2193 ax->gid = ipcp->gid;
2192 ax->mode = ipcp->mode; 2194 ax->mode = ipcp->mode;
2193 selinux_get_ipc_sid(ipcp, &ax->osid); 2195 security_ipc_getsecid(ipcp, &ax->osid);
2194
2195 ax->d.type = AUDIT_IPC; 2196 ax->d.type = AUDIT_IPC;
2196 ax->d.next = context->aux; 2197 ax->d.next = context->aux;
2197 context->aux = (void *)ax; 2198 context->aux = (void *)ax;
@@ -2343,7 +2344,7 @@ void __audit_ptrace(struct task_struct *t)
2343 context->target_auid = audit_get_loginuid(t); 2344 context->target_auid = audit_get_loginuid(t);
2344 context->target_uid = t->uid; 2345 context->target_uid = t->uid;
2345 context->target_sessionid = audit_get_sessionid(t); 2346 context->target_sessionid = audit_get_sessionid(t);
2346 selinux_get_task_sid(t, &context->target_sid); 2347 security_task_getsecid(t, &context->target_sid);
2347 memcpy(context->target_comm, t->comm, TASK_COMM_LEN); 2348 memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
2348} 2349}
2349 2350
@@ -2371,7 +2372,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2371 audit_sig_uid = tsk->loginuid; 2372 audit_sig_uid = tsk->loginuid;
2372 else 2373 else
2373 audit_sig_uid = tsk->uid; 2374 audit_sig_uid = tsk->uid;
2374 selinux_get_task_sid(tsk, &audit_sig_sid); 2375 security_task_getsecid(tsk, &audit_sig_sid);
2375 } 2376 }
2376 if (!audit_signals || audit_dummy_context()) 2377 if (!audit_signals || audit_dummy_context())
2377 return 0; 2378 return 0;
@@ -2384,7 +2385,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2384 ctx->target_auid = audit_get_loginuid(t); 2385 ctx->target_auid = audit_get_loginuid(t);
2385 ctx->target_uid = t->uid; 2386 ctx->target_uid = t->uid;
2386 ctx->target_sessionid = audit_get_sessionid(t); 2387 ctx->target_sessionid = audit_get_sessionid(t);
2387 selinux_get_task_sid(t, &ctx->target_sid); 2388 security_task_getsecid(t, &ctx->target_sid);
2388 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); 2389 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
2389 return 0; 2390 return 0;
2390 } 2391 }
@@ -2405,7 +2406,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2405 axp->target_auid[axp->pid_count] = audit_get_loginuid(t); 2406 axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
2406 axp->target_uid[axp->pid_count] = t->uid; 2407 axp->target_uid[axp->pid_count] = t->uid;
2407 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); 2408 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
2408 selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); 2409 security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
2409 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); 2410 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
2410 axp->pid_count++; 2411 axp->pid_count++;
2411 2412
@@ -2435,16 +2436,17 @@ void audit_core_dumps(long signr)
2435 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2436 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2436 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2437 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2437 auid, current->uid, current->gid, sessionid); 2438 auid, current->uid, current->gid, sessionid);
2438 selinux_get_task_sid(current, &sid); 2439 security_task_getsecid(current, &sid);
2439 if (sid) { 2440 if (sid) {
2440 char *ctx = NULL; 2441 char *ctx = NULL;
2441 u32 len; 2442 u32 len;
2442 2443
2443 if (selinux_sid_to_string(sid, &ctx, &len)) 2444 if (security_secid_to_secctx(sid, &ctx, &len))
2444 audit_log_format(ab, " ssid=%u", sid); 2445 audit_log_format(ab, " ssid=%u", sid);
2445 else 2446 else {
2446 audit_log_format(ab, " subj=%s", ctx); 2447 audit_log_format(ab, " subj=%s", ctx);
2447 kfree(ctx); 2448 security_release_secctx(ctx, len);
2449 }
2448 } 2450 }
2449 audit_log_format(ab, " pid=%d comm=", current->pid); 2451 audit_log_format(ab, " pid=%d comm=", current->pid);
2450 audit_log_untrustedstring(ab, current->comm); 2452 audit_log_untrustedstring(ab, current->comm);
diff --git a/kernel/bounds.c b/kernel/bounds.c
new file mode 100644
index 000000000000..c3c55544db2f
--- /dev/null
+++ b/kernel/bounds.c
@@ -0,0 +1,23 @@
1/*
2 * Generate definitions needed by the preprocessor.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#define __GENERATING_BOUNDS_H
8/* Include headers that define the enum constants of interest */
9#include <linux/page-flags.h>
10#include <linux/mmzone.h>
11
12#define DEFINE(sym, val) \
13 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
14
15#define BLANK() asm volatile("\n->" : : )
16
17void foo(void)
18{
19 /* The enum constants to put into include/linux/bounds.h */
20 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
21 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
22 /* End of constants */
23}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e9c2fb01e89b..6d8de051382b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -782,7 +782,14 @@ static int parse_cgroupfs_options(char *data,
782 if (!*token) 782 if (!*token)
783 return -EINVAL; 783 return -EINVAL;
784 if (!strcmp(token, "all")) { 784 if (!strcmp(token, "all")) {
785 opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1; 785 /* Add all non-disabled subsystems */
786 int i;
787 opts->subsys_bits = 0;
788 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
789 struct cgroup_subsys *ss = subsys[i];
790 if (!ss->disabled)
791 opts->subsys_bits |= 1ul << i;
792 }
786 } else if (!strcmp(token, "noprefix")) { 793 } else if (!strcmp(token, "noprefix")) {
787 set_bit(ROOT_NOPREFIX, &opts->flags); 794 set_bit(ROOT_NOPREFIX, &opts->flags);
788 } else if (!strncmp(token, "release_agent=", 14)) { 795 } else if (!strncmp(token, "release_agent=", 14)) {
@@ -800,7 +807,8 @@ static int parse_cgroupfs_options(char *data,
800 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 807 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
801 ss = subsys[i]; 808 ss = subsys[i];
802 if (!strcmp(token, ss->name)) { 809 if (!strcmp(token, ss->name)) {
803 set_bit(i, &opts->subsys_bits); 810 if (!ss->disabled)
811 set_bit(i, &opts->subsys_bits);
804 break; 812 break;
805 } 813 }
806 } 814 }
@@ -1714,7 +1722,12 @@ void cgroup_enable_task_cg_lists(void)
1714 use_task_css_set_links = 1; 1722 use_task_css_set_links = 1;
1715 do_each_thread(g, p) { 1723 do_each_thread(g, p) {
1716 task_lock(p); 1724 task_lock(p);
1717 if (list_empty(&p->cg_list)) 1725 /*
1726 * We should check if the process is exiting, otherwise
1727 * it will race with cgroup_exit() in that the list
1728 * entry won't be deleted though the process has exited.
1729 */
1730 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
1718 list_add(&p->cg_list, &p->cgroups->tasks); 1731 list_add(&p->cg_list, &p->cgroups->tasks);
1719 task_unlock(p); 1732 task_unlock(p);
1720 } while_each_thread(g, p); 1733 } while_each_thread(g, p);
@@ -2082,7 +2095,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2082 2095
2083 kfree(pidarray); 2096 kfree(pidarray);
2084 } else { 2097 } else {
2085 ctr->buf = 0; 2098 ctr->buf = NULL;
2086 ctr->bufsz = 0; 2099 ctr->bufsz = 0;
2087 } 2100 }
2088 file->private_data = ctr; 2101 file->private_data = ctr;
@@ -2561,6 +2574,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2561 /* Skip this hierarchy if it has no active subsystems */ 2574 /* Skip this hierarchy if it has no active subsystems */
2562 if (!root->actual_subsys_bits) 2575 if (!root->actual_subsys_bits)
2563 continue; 2576 continue;
2577 seq_printf(m, "%lu:", root->subsys_bits);
2564 for_each_subsys(root, ss) 2578 for_each_subsys(root, ss)
2565 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 2579 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
2566 seq_putc(m, ':'); 2580 seq_putc(m, ':');
@@ -2600,13 +2614,13 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
2600{ 2614{
2601 int i; 2615 int i;
2602 2616
2603 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n"); 2617 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
2604 mutex_lock(&cgroup_mutex); 2618 mutex_lock(&cgroup_mutex);
2605 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2619 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2606 struct cgroup_subsys *ss = subsys[i]; 2620 struct cgroup_subsys *ss = subsys[i];
2607 seq_printf(m, "%s\t%lu\t%d\n", 2621 seq_printf(m, "%s\t%lu\t%d\t%d\n",
2608 ss->name, ss->root->subsys_bits, 2622 ss->name, ss->root->subsys_bits,
2609 ss->root->number_of_cgroups); 2623 ss->root->number_of_cgroups, !ss->disabled);
2610 } 2624 }
2611 mutex_unlock(&cgroup_mutex); 2625 mutex_unlock(&cgroup_mutex);
2612 return 0; 2626 return 0;
@@ -2614,7 +2628,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
2614 2628
2615static int cgroupstats_open(struct inode *inode, struct file *file) 2629static int cgroupstats_open(struct inode *inode, struct file *file)
2616{ 2630{
2617 return single_open(file, proc_cgroupstats_show, 0); 2631 return single_open(file, proc_cgroupstats_show, NULL);
2618} 2632}
2619 2633
2620static struct file_operations proc_cgroupstats_operations = { 2634static struct file_operations proc_cgroupstats_operations = {
@@ -3010,3 +3024,27 @@ static void cgroup_release_agent(struct work_struct *work)
3010 spin_unlock(&release_list_lock); 3024 spin_unlock(&release_list_lock);
3011 mutex_unlock(&cgroup_mutex); 3025 mutex_unlock(&cgroup_mutex);
3012} 3026}
3027
3028static int __init cgroup_disable(char *str)
3029{
3030 int i;
3031 char *token;
3032
3033 while ((token = strsep(&str, ",")) != NULL) {
3034 if (!*token)
3035 continue;
3036
3037 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3038 struct cgroup_subsys *ss = subsys[i];
3039
3040 if (!strcmp(token, ss->name)) {
3041 ss->disabled = 1;
3042 printk(KERN_INFO "Disabling %s control group"
3043 " subsystem\n", ss->name);
3044 break;
3045 }
3046 }
3047 }
3048 return 1;
3049}
3050__setup("cgroup_disable=", cgroup_disable);
diff --git a/kernel/compat.c b/kernel/compat.c
index 5f0e201bcfd3..e1ef04870c2a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -47,15 +47,14 @@ static long compat_nanosleep_restart(struct restart_block *restart)
47 mm_segment_t oldfs; 47 mm_segment_t oldfs;
48 long ret; 48 long ret;
49 49
50 rmtp = (struct compat_timespec __user *)(restart->arg1); 50 restart->nanosleep.rmtp = (struct timespec __user *) &rmt;
51 restart->arg1 = (unsigned long)&rmt;
52 oldfs = get_fs(); 51 oldfs = get_fs();
53 set_fs(KERNEL_DS); 52 set_fs(KERNEL_DS);
54 ret = hrtimer_nanosleep_restart(restart); 53 ret = hrtimer_nanosleep_restart(restart);
55 set_fs(oldfs); 54 set_fs(oldfs);
56 55
57 if (ret) { 56 if (ret) {
58 restart->arg1 = (unsigned long)rmtp; 57 rmtp = restart->nanosleep.compat_rmtp;
59 58
60 if (rmtp && put_compat_timespec(&rmt, rmtp)) 59 if (rmtp && put_compat_timespec(&rmt, rmtp))
61 return -EFAULT; 60 return -EFAULT;
@@ -89,7 +88,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
89 = &current_thread_info()->restart_block; 88 = &current_thread_info()->restart_block;
90 89
91 restart->fn = compat_nanosleep_restart; 90 restart->fn = compat_nanosleep_restart;
92 restart->arg1 = (unsigned long)rmtp; 91 restart->nanosleep.compat_rmtp = rmtp;
93 92
94 if (rmtp && put_compat_timespec(&rmt, rmtp)) 93 if (rmtp && put_compat_timespec(&rmt, rmtp))
95 return -EFAULT; 94 return -EFAULT;
@@ -446,7 +445,7 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
446 if (retval) 445 if (retval)
447 return retval; 446 return retval;
448 447
449 return sched_setaffinity(pid, new_mask); 448 return sched_setaffinity(pid, &new_mask);
450} 449}
451 450
452asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, 451asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
@@ -607,9 +606,9 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
607 long err; 606 long err;
608 mm_segment_t oldfs; 607 mm_segment_t oldfs;
609 struct timespec tu; 608 struct timespec tu;
610 struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1); 609 struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp;
611 610
612 restart->arg1 = (unsigned long) &tu; 611 restart->nanosleep.rmtp = (struct timespec __user *) &tu;
613 oldfs = get_fs(); 612 oldfs = get_fs();
614 set_fs(KERNEL_DS); 613 set_fs(KERNEL_DS);
615 err = clock_nanosleep_restart(restart); 614 err = clock_nanosleep_restart(restart);
@@ -621,7 +620,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
621 620
622 if (err == -ERESTART_RESTARTBLOCK) { 621 if (err == -ERESTART_RESTARTBLOCK) {
623 restart->fn = compat_clock_nanosleep_restart; 622 restart->fn = compat_clock_nanosleep_restart;
624 restart->arg1 = (unsigned long) rmtp; 623 restart->nanosleep.compat_rmtp = rmtp;
625 } 624 }
626 return err; 625 return err;
627} 626}
@@ -652,7 +651,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
652 if (err == -ERESTART_RESTARTBLOCK) { 651 if (err == -ERESTART_RESTARTBLOCK) {
653 restart = &current_thread_info()->restart_block; 652 restart = &current_thread_info()->restart_block;
654 restart->fn = compat_clock_nanosleep_restart; 653 restart->fn = compat_clock_nanosleep_restart;
655 restart->arg1 = (unsigned long) rmtp; 654 restart->nanosleep.compat_rmtp = rmtp;
656 } 655 }
657 return err; 656 return err;
658} 657}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2eff3f63abed..2011ad8d2697 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -232,9 +232,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
232 232
233 /* Ensure that we are not runnable on dying cpu */ 233 /* Ensure that we are not runnable on dying cpu */
234 old_allowed = current->cpus_allowed; 234 old_allowed = current->cpus_allowed;
235 tmp = CPU_MASK_ALL; 235 cpus_setall(tmp);
236 cpu_clear(cpu, tmp); 236 cpu_clear(cpu, tmp);
237 set_cpus_allowed(current, tmp); 237 set_cpus_allowed_ptr(current, &tmp);
238 238
239 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); 239 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
240 240
@@ -268,7 +268,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
268out_thread: 268out_thread:
269 err = kthread_stop(p); 269 err = kthread_stop(p);
270out_allowed: 270out_allowed:
271 set_cpus_allowed(current, old_allowed); 271 set_cpus_allowed_ptr(current, &old_allowed);
272out_release: 272out_release:
273 cpu_hotplug_done(); 273 cpu_hotplug_done();
274 return err; 274 return err;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a1b61f414228..48a976c52cf5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -98,6 +98,9 @@ struct cpuset {
98 /* partition number for rebuild_sched_domains() */ 98 /* partition number for rebuild_sched_domains() */
99 int pn; 99 int pn;
100 100
101 /* for custom sched domain */
102 int relax_domain_level;
103
101 /* used for walking a cpuset heirarchy */ 104 /* used for walking a cpuset heirarchy */
102 struct list_head stack_list; 105 struct list_head stack_list;
103}; 106};
@@ -478,6 +481,16 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
478 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 481 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
479} 482}
480 483
484static void
485update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
486{
487 if (!dattr)
488 return;
489 if (dattr->relax_domain_level < c->relax_domain_level)
490 dattr->relax_domain_level = c->relax_domain_level;
491 return;
492}
493
481/* 494/*
482 * rebuild_sched_domains() 495 * rebuild_sched_domains()
483 * 496 *
@@ -553,12 +566,14 @@ static void rebuild_sched_domains(void)
553 int csn; /* how many cpuset ptrs in csa so far */ 566 int csn; /* how many cpuset ptrs in csa so far */
554 int i, j, k; /* indices for partition finding loops */ 567 int i, j, k; /* indices for partition finding loops */
555 cpumask_t *doms; /* resulting partition; i.e. sched domains */ 568 cpumask_t *doms; /* resulting partition; i.e. sched domains */
569 struct sched_domain_attr *dattr; /* attributes for custom domains */
556 int ndoms; /* number of sched domains in result */ 570 int ndoms; /* number of sched domains in result */
557 int nslot; /* next empty doms[] cpumask_t slot */ 571 int nslot; /* next empty doms[] cpumask_t slot */
558 572
559 q = NULL; 573 q = NULL;
560 csa = NULL; 574 csa = NULL;
561 doms = NULL; 575 doms = NULL;
576 dattr = NULL;
562 577
563 /* Special case for the 99% of systems with one, full, sched domain */ 578 /* Special case for the 99% of systems with one, full, sched domain */
564 if (is_sched_load_balance(&top_cpuset)) { 579 if (is_sched_load_balance(&top_cpuset)) {
@@ -566,6 +581,11 @@ static void rebuild_sched_domains(void)
566 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 581 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
567 if (!doms) 582 if (!doms)
568 goto rebuild; 583 goto rebuild;
584 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
585 if (dattr) {
586 *dattr = SD_ATTR_INIT;
587 update_domain_attr(dattr, &top_cpuset);
588 }
569 *doms = top_cpuset.cpus_allowed; 589 *doms = top_cpuset.cpus_allowed;
570 goto rebuild; 590 goto rebuild;
571 } 591 }
@@ -622,6 +642,7 @@ restart:
622 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 642 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
623 if (!doms) 643 if (!doms)
624 goto rebuild; 644 goto rebuild;
645 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
625 646
626 for (nslot = 0, i = 0; i < csn; i++) { 647 for (nslot = 0, i = 0; i < csn; i++) {
627 struct cpuset *a = csa[i]; 648 struct cpuset *a = csa[i];
@@ -644,12 +665,15 @@ restart:
644 } 665 }
645 666
646 cpus_clear(*dp); 667 cpus_clear(*dp);
668 if (dattr)
669 *(dattr + nslot) = SD_ATTR_INIT;
647 for (j = i; j < csn; j++) { 670 for (j = i; j < csn; j++) {
648 struct cpuset *b = csa[j]; 671 struct cpuset *b = csa[j];
649 672
650 if (apn == b->pn) { 673 if (apn == b->pn) {
651 cpus_or(*dp, *dp, b->cpus_allowed); 674 cpus_or(*dp, *dp, b->cpus_allowed);
652 b->pn = -1; 675 b->pn = -1;
676 update_domain_attr(dattr, b);
653 } 677 }
654 } 678 }
655 nslot++; 679 nslot++;
@@ -660,7 +684,7 @@ restart:
660rebuild: 684rebuild:
661 /* Have scheduler rebuild sched domains */ 685 /* Have scheduler rebuild sched domains */
662 get_online_cpus(); 686 get_online_cpus();
663 partition_sched_domains(ndoms, doms); 687 partition_sched_domains(ndoms, doms, dattr);
664 put_online_cpus(); 688 put_online_cpus();
665 689
666done: 690done:
@@ -668,6 +692,7 @@ done:
668 kfifo_free(q); 692 kfifo_free(q);
669 kfree(csa); 693 kfree(csa);
670 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 694 /* Don't kfree(doms) -- partition_sched_domains() does that. */
695 /* Don't kfree(dattr) -- partition_sched_domains() does that. */
671} 696}
672 697
673static inline int started_after_time(struct task_struct *t1, 698static inline int started_after_time(struct task_struct *t1,
@@ -729,7 +754,7 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
729 */ 754 */
730void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) 755void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
731{ 756{
732 set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); 757 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
733} 758}
734 759
735/** 760/**
@@ -916,7 +941,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
916 cs->mems_generation = cpuset_mems_generation++; 941 cs->mems_generation = cpuset_mems_generation++;
917 mutex_unlock(&callback_mutex); 942 mutex_unlock(&callback_mutex);
918 943
919 cpuset_being_rebound = cs; /* causes mpol_copy() rebind */ 944 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
920 945
921 fudge = 10; /* spare mmarray[] slots */ 946 fudge = 10; /* spare mmarray[] slots */
922 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ 947 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
@@ -967,7 +992,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
967 * rebind the vma mempolicies of each mm in mmarray[] to their 992 * rebind the vma mempolicies of each mm in mmarray[] to their
968 * new cpuset, and release that mm. The mpol_rebind_mm() 993 * new cpuset, and release that mm. The mpol_rebind_mm()
969 * call takes mmap_sem, which we couldn't take while holding 994 * call takes mmap_sem, which we couldn't take while holding
970 * tasklist_lock. Forks can happen again now - the mpol_copy() 995 * tasklist_lock. Forks can happen again now - the mpol_dup()
971 * cpuset_being_rebound check will catch such forks, and rebind 996 * cpuset_being_rebound check will catch such forks, and rebind
972 * their vma mempolicies too. Because we still hold the global 997 * their vma mempolicies too. Because we still hold the global
973 * cgroup_mutex, we know that no other rebind effort will 998 * cgroup_mutex, we know that no other rebind effort will
@@ -1011,6 +1036,21 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1011 return 0; 1036 return 0;
1012} 1037}
1013 1038
1039static int update_relax_domain_level(struct cpuset *cs, char *buf)
1040{
1041 int val = simple_strtol(buf, NULL, 10);
1042
1043 if (val < 0)
1044 val = -1;
1045
1046 if (val != cs->relax_domain_level) {
1047 cs->relax_domain_level = val;
1048 rebuild_sched_domains();
1049 }
1050
1051 return 0;
1052}
1053
1014/* 1054/*
1015 * update_flag - read a 0 or a 1 in a file and update associated flag 1055 * update_flag - read a 0 or a 1 in a file and update associated flag
1016 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 1056 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
@@ -1178,7 +1218,7 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1178 1218
1179 mutex_lock(&callback_mutex); 1219 mutex_lock(&callback_mutex);
1180 guarantee_online_cpus(cs, &cpus); 1220 guarantee_online_cpus(cs, &cpus);
1181 set_cpus_allowed(tsk, cpus); 1221 set_cpus_allowed_ptr(tsk, &cpus);
1182 mutex_unlock(&callback_mutex); 1222 mutex_unlock(&callback_mutex);
1183 1223
1184 from = oldcs->mems_allowed; 1224 from = oldcs->mems_allowed;
@@ -1202,6 +1242,7 @@ typedef enum {
1202 FILE_CPU_EXCLUSIVE, 1242 FILE_CPU_EXCLUSIVE,
1203 FILE_MEM_EXCLUSIVE, 1243 FILE_MEM_EXCLUSIVE,
1204 FILE_SCHED_LOAD_BALANCE, 1244 FILE_SCHED_LOAD_BALANCE,
1245 FILE_SCHED_RELAX_DOMAIN_LEVEL,
1205 FILE_MEMORY_PRESSURE_ENABLED, 1246 FILE_MEMORY_PRESSURE_ENABLED,
1206 FILE_MEMORY_PRESSURE, 1247 FILE_MEMORY_PRESSURE,
1207 FILE_SPREAD_PAGE, 1248 FILE_SPREAD_PAGE,
@@ -1224,7 +1265,8 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
1224 return -E2BIG; 1265 return -E2BIG;
1225 1266
1226 /* +1 for nul-terminator */ 1267 /* +1 for nul-terminator */
1227 if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0) 1268 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1269 if (!buffer)
1228 return -ENOMEM; 1270 return -ENOMEM;
1229 1271
1230 if (copy_from_user(buffer, userbuf, nbytes)) { 1272 if (copy_from_user(buffer, userbuf, nbytes)) {
@@ -1256,6 +1298,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
1256 case FILE_SCHED_LOAD_BALANCE: 1298 case FILE_SCHED_LOAD_BALANCE:
1257 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); 1299 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
1258 break; 1300 break;
1301 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1302 retval = update_relax_domain_level(cs, buffer);
1303 break;
1259 case FILE_MEMORY_MIGRATE: 1304 case FILE_MEMORY_MIGRATE:
1260 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); 1305 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
1261 break; 1306 break;
@@ -1354,6 +1399,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
1354 case FILE_SCHED_LOAD_BALANCE: 1399 case FILE_SCHED_LOAD_BALANCE:
1355 *s++ = is_sched_load_balance(cs) ? '1' : '0'; 1400 *s++ = is_sched_load_balance(cs) ? '1' : '0';
1356 break; 1401 break;
1402 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1403 s += sprintf(s, "%d", cs->relax_domain_level);
1404 break;
1357 case FILE_MEMORY_MIGRATE: 1405 case FILE_MEMORY_MIGRATE:
1358 *s++ = is_memory_migrate(cs) ? '1' : '0'; 1406 *s++ = is_memory_migrate(cs) ? '1' : '0';
1359 break; 1407 break;
@@ -1424,6 +1472,13 @@ static struct cftype cft_sched_load_balance = {
1424 .private = FILE_SCHED_LOAD_BALANCE, 1472 .private = FILE_SCHED_LOAD_BALANCE,
1425}; 1473};
1426 1474
1475static struct cftype cft_sched_relax_domain_level = {
1476 .name = "sched_relax_domain_level",
1477 .read = cpuset_common_file_read,
1478 .write = cpuset_common_file_write,
1479 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1480};
1481
1427static struct cftype cft_memory_migrate = { 1482static struct cftype cft_memory_migrate = {
1428 .name = "memory_migrate", 1483 .name = "memory_migrate",
1429 .read = cpuset_common_file_read, 1484 .read = cpuset_common_file_read,
@@ -1475,6 +1530,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1475 return err; 1530 return err;
1476 if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) 1531 if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
1477 return err; 1532 return err;
1533 if ((err = cgroup_add_file(cont, ss,
1534 &cft_sched_relax_domain_level)) < 0)
1535 return err;
1478 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) 1536 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1479 return err; 1537 return err;
1480 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) 1538 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
@@ -1555,10 +1613,11 @@ static struct cgroup_subsys_state *cpuset_create(
1555 if (is_spread_slab(parent)) 1613 if (is_spread_slab(parent))
1556 set_bit(CS_SPREAD_SLAB, &cs->flags); 1614 set_bit(CS_SPREAD_SLAB, &cs->flags);
1557 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1615 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1558 cs->cpus_allowed = CPU_MASK_NONE; 1616 cpus_clear(cs->cpus_allowed);
1559 cs->mems_allowed = NODE_MASK_NONE; 1617 nodes_clear(cs->mems_allowed);
1560 cs->mems_generation = cpuset_mems_generation++; 1618 cs->mems_generation = cpuset_mems_generation++;
1561 fmeter_init(&cs->fmeter); 1619 fmeter_init(&cs->fmeter);
1620 cs->relax_domain_level = -1;
1562 1621
1563 cs->parent = parent; 1622 cs->parent = parent;
1564 number_of_cpusets++; 1623 number_of_cpusets++;
@@ -1625,12 +1684,13 @@ int __init cpuset_init(void)
1625{ 1684{
1626 int err = 0; 1685 int err = 0;
1627 1686
1628 top_cpuset.cpus_allowed = CPU_MASK_ALL; 1687 cpus_setall(top_cpuset.cpus_allowed);
1629 top_cpuset.mems_allowed = NODE_MASK_ALL; 1688 nodes_setall(top_cpuset.mems_allowed);
1630 1689
1631 fmeter_init(&top_cpuset.fmeter); 1690 fmeter_init(&top_cpuset.fmeter);
1632 top_cpuset.mems_generation = cpuset_mems_generation++; 1691 top_cpuset.mems_generation = cpuset_mems_generation++;
1633 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 1692 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1693 top_cpuset.relax_domain_level = -1;
1634 1694
1635 err = register_filesystem(&cpuset_fs_type); 1695 err = register_filesystem(&cpuset_fs_type);
1636 if (err < 0) 1696 if (err < 0)
@@ -1844,6 +1904,7 @@ void __init cpuset_init_smp(void)
1844 1904
1845 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 1905 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
1846 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 1906 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
1907 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
1847 * 1908 *
1848 * Description: Returns the cpumask_t cpus_allowed of the cpuset 1909 * Description: Returns the cpumask_t cpus_allowed of the cpuset
1849 * attached to the specified @tsk. Guaranteed to return some non-empty 1910 * attached to the specified @tsk. Guaranteed to return some non-empty
@@ -1851,35 +1912,27 @@ void __init cpuset_init_smp(void)
1851 * tasks cpuset. 1912 * tasks cpuset.
1852 **/ 1913 **/
1853 1914
1854cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) 1915void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
1855{ 1916{
1856 cpumask_t mask;
1857
1858 mutex_lock(&callback_mutex); 1917 mutex_lock(&callback_mutex);
1859 mask = cpuset_cpus_allowed_locked(tsk); 1918 cpuset_cpus_allowed_locked(tsk, pmask);
1860 mutex_unlock(&callback_mutex); 1919 mutex_unlock(&callback_mutex);
1861
1862 return mask;
1863} 1920}
1864 1921
1865/** 1922/**
1866 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. 1923 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
1867 * Must be called with callback_mutex held. 1924 * Must be called with callback_mutex held.
1868 **/ 1925 **/
1869cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) 1926void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask)
1870{ 1927{
1871 cpumask_t mask;
1872
1873 task_lock(tsk); 1928 task_lock(tsk);
1874 guarantee_online_cpus(task_cs(tsk), &mask); 1929 guarantee_online_cpus(task_cs(tsk), pmask);
1875 task_unlock(tsk); 1930 task_unlock(tsk);
1876
1877 return mask;
1878} 1931}
1879 1932
1880void cpuset_init_current_mems_allowed(void) 1933void cpuset_init_current_mems_allowed(void)
1881{ 1934{
1882 current->mems_allowed = NODE_MASK_ALL; 1935 nodes_setall(current->mems_allowed);
1883} 1936}
1884 1937
1885/** 1938/**
@@ -1906,22 +1959,14 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
1906} 1959}
1907 1960
1908/** 1961/**
1909 * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed 1962 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
1910 * @zl: the zonelist to be checked 1963 * @nodemask: the nodemask to be checked
1911 * 1964 *
1912 * Are any of the nodes on zonelist zl allowed in current->mems_allowed? 1965 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
1913 */ 1966 */
1914int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) 1967int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
1915{ 1968{
1916 int i; 1969 return nodes_intersects(*nodemask, current->mems_allowed);
1917
1918 for (i = 0; zl->zones[i]; i++) {
1919 int nid = zone_to_nid(zl->zones[i]);
1920
1921 if (node_isset(nid, current->mems_allowed))
1922 return 1;
1923 }
1924 return 0;
1925} 1970}
1926 1971
1927/* 1972/*
@@ -2261,8 +2306,16 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2261 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, 2306 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count,
2262 task->cpus_allowed); 2307 task->cpus_allowed);
2263 seq_printf(m, "\n"); 2308 seq_printf(m, "\n");
2309 seq_printf(m, "Cpus_allowed_list:\t");
2310 m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count,
2311 task->cpus_allowed);
2312 seq_printf(m, "\n");
2264 seq_printf(m, "Mems_allowed:\t"); 2313 seq_printf(m, "Mems_allowed:\t");
2265 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, 2314 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count,
2266 task->mems_allowed); 2315 task->mems_allowed);
2267 seq_printf(m, "\n"); 2316 seq_printf(m, "\n");
2317 seq_printf(m, "Mems_allowed_list:\t");
2318 m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count,
2319 task->mems_allowed);
2320 seq_printf(m, "\n");
2268} 2321}
diff --git a/kernel/exit.c b/kernel/exit.c
index 53872bf993fa..2a9d98c641ac 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -507,10 +507,9 @@ void put_files_struct(struct files_struct *files)
507 } 507 }
508} 508}
509 509
510EXPORT_SYMBOL(put_files_struct); 510void reset_files_struct(struct files_struct *files)
511
512void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
513{ 511{
512 struct task_struct *tsk = current;
514 struct files_struct *old; 513 struct files_struct *old;
515 514
516 old = tsk->files; 515 old = tsk->files;
@@ -519,9 +518,8 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
519 task_unlock(tsk); 518 task_unlock(tsk);
520 put_files_struct(old); 519 put_files_struct(old);
521} 520}
522EXPORT_SYMBOL(reset_files_struct);
523 521
524static void __exit_files(struct task_struct *tsk) 522void exit_files(struct task_struct *tsk)
525{ 523{
526 struct files_struct * files = tsk->files; 524 struct files_struct * files = tsk->files;
527 525
@@ -533,12 +531,7 @@ static void __exit_files(struct task_struct *tsk)
533 } 531 }
534} 532}
535 533
536void exit_files(struct task_struct *tsk) 534void put_fs_struct(struct fs_struct *fs)
537{
538 __exit_files(tsk);
539}
540
541static void __put_fs_struct(struct fs_struct *fs)
542{ 535{
543 /* No need to hold fs->lock if we are killing it */ 536 /* No need to hold fs->lock if we are killing it */
544 if (atomic_dec_and_test(&fs->count)) { 537 if (atomic_dec_and_test(&fs->count)) {
@@ -550,12 +543,7 @@ static void __put_fs_struct(struct fs_struct *fs)
550 } 543 }
551} 544}
552 545
553void put_fs_struct(struct fs_struct *fs) 546void exit_fs(struct task_struct *tsk)
554{
555 __put_fs_struct(fs);
556}
557
558static void __exit_fs(struct task_struct *tsk)
559{ 547{
560 struct fs_struct * fs = tsk->fs; 548 struct fs_struct * fs = tsk->fs;
561 549
@@ -563,15 +551,10 @@ static void __exit_fs(struct task_struct *tsk)
563 task_lock(tsk); 551 task_lock(tsk);
564 tsk->fs = NULL; 552 tsk->fs = NULL;
565 task_unlock(tsk); 553 task_unlock(tsk);
566 __put_fs_struct(fs); 554 put_fs_struct(fs);
567 } 555 }
568} 556}
569 557
570void exit_fs(struct task_struct *tsk)
571{
572 __exit_fs(tsk);
573}
574
575EXPORT_SYMBOL_GPL(exit_fs); 558EXPORT_SYMBOL_GPL(exit_fs);
576 559
577/* 560/*
@@ -967,8 +950,8 @@ NORET_TYPE void do_exit(long code)
967 if (group_dead) 950 if (group_dead)
968 acct_process(); 951 acct_process();
969 exit_sem(tsk); 952 exit_sem(tsk);
970 __exit_files(tsk); 953 exit_files(tsk);
971 __exit_fs(tsk); 954 exit_fs(tsk);
972 check_stack_usage(); 955 check_stack_usage();
973 exit_thread(); 956 exit_thread();
974 cgroup_exit(tsk, 1); 957 cgroup_exit(tsk, 1);
@@ -984,7 +967,7 @@ NORET_TYPE void do_exit(long code)
984 proc_exit_connector(tsk); 967 proc_exit_connector(tsk);
985 exit_notify(tsk, group_dead); 968 exit_notify(tsk, group_dead);
986#ifdef CONFIG_NUMA 969#ifdef CONFIG_NUMA
987 mpol_free(tsk->mempolicy); 970 mpol_put(tsk->mempolicy);
988 tsk->mempolicy = NULL; 971 tsk->mempolicy = NULL;
989#endif 972#endif
990#ifdef CONFIG_FUTEX 973#ifdef CONFIG_FUTEX
@@ -1608,7 +1591,7 @@ asmlinkage long sys_waitid(int which, pid_t upid,
1608 put_pid(pid); 1591 put_pid(pid);
1609 1592
1610 /* avoid REGPARM breakage on x86: */ 1593 /* avoid REGPARM breakage on x86: */
1611 prevent_tail_call(ret); 1594 asmlinkage_protect(5, ret, which, upid, infop, options, ru);
1612 return ret; 1595 return ret;
1613} 1596}
1614 1597
@@ -1640,7 +1623,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
1640 put_pid(pid); 1623 put_pid(pid);
1641 1624
1642 /* avoid REGPARM breakage on x86: */ 1625 /* avoid REGPARM breakage on x86: */
1643 prevent_tail_call(ret); 1626 asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
1644 return ret; 1627 return ret;
1645} 1628}
1646 1629
diff --git a/kernel/fork.c b/kernel/fork.c
index dd249c37b3a3..6067e429f281 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -132,6 +132,14 @@ void __put_task_struct(struct task_struct *tsk)
132 free_task(tsk); 132 free_task(tsk);
133} 133}
134 134
135/*
136 * macro override instead of weak attribute alias, to workaround
137 * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
138 */
139#ifndef arch_task_cache_init
140#define arch_task_cache_init()
141#endif
142
135void __init fork_init(unsigned long mempages) 143void __init fork_init(unsigned long mempages)
136{ 144{
137#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 145#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
@@ -144,6 +152,9 @@ void __init fork_init(unsigned long mempages)
144 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); 152 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
145#endif 153#endif
146 154
155 /* do the arch specific task caches init */
156 arch_task_cache_init();
157
147 /* 158 /*
148 * The default maximum number of threads is set to a safe 159 * The default maximum number of threads is set to a safe
149 * value: the thread structures can take up at most half 160 * value: the thread structures can take up at most half
@@ -163,6 +174,13 @@ void __init fork_init(unsigned long mempages)
163 init_task.signal->rlim[RLIMIT_NPROC]; 174 init_task.signal->rlim[RLIMIT_NPROC];
164} 175}
165 176
177int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
178 struct task_struct *src)
179{
180 *dst = *src;
181 return 0;
182}
183
166static struct task_struct *dup_task_struct(struct task_struct *orig) 184static struct task_struct *dup_task_struct(struct task_struct *orig)
167{ 185{
168 struct task_struct *tsk; 186 struct task_struct *tsk;
@@ -181,15 +199,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
181 return NULL; 199 return NULL;
182 } 200 }
183 201
184 *tsk = *orig; 202 err = arch_dup_task_struct(tsk, orig);
203 if (err)
204 goto out;
205
185 tsk->stack = ti; 206 tsk->stack = ti;
186 207
187 err = prop_local_init_single(&tsk->dirties); 208 err = prop_local_init_single(&tsk->dirties);
188 if (err) { 209 if (err)
189 free_thread_info(ti); 210 goto out;
190 free_task_struct(tsk);
191 return NULL;
192 }
193 211
194 setup_thread_stack(tsk, orig); 212 setup_thread_stack(tsk, orig);
195 213
@@ -205,6 +223,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
205#endif 223#endif
206 tsk->splice_pipe = NULL; 224 tsk->splice_pipe = NULL;
207 return tsk; 225 return tsk;
226
227out:
228 free_thread_info(ti);
229 free_task_struct(tsk);
230 return NULL;
208} 231}
209 232
210#ifdef CONFIG_MMU 233#ifdef CONFIG_MMU
@@ -256,7 +279,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
256 if (!tmp) 279 if (!tmp)
257 goto fail_nomem; 280 goto fail_nomem;
258 *tmp = *mpnt; 281 *tmp = *mpnt;
259 pol = mpol_copy(vma_policy(mpnt)); 282 pol = mpol_dup(vma_policy(mpnt));
260 retval = PTR_ERR(pol); 283 retval = PTR_ERR(pol);
261 if (IS_ERR(pol)) 284 if (IS_ERR(pol))
262 goto fail_nomem_policy; 285 goto fail_nomem_policy;
@@ -394,7 +417,6 @@ void __mmdrop(struct mm_struct *mm)
394{ 417{
395 BUG_ON(mm == &init_mm); 418 BUG_ON(mm == &init_mm);
396 mm_free_pgd(mm); 419 mm_free_pgd(mm);
397 mm_free_cgroup(mm);
398 destroy_context(mm); 420 destroy_context(mm);
399 free_mm(mm); 421 free_mm(mm);
400} 422}
@@ -416,6 +438,7 @@ void mmput(struct mm_struct *mm)
416 spin_unlock(&mmlist_lock); 438 spin_unlock(&mmlist_lock);
417 } 439 }
418 put_swap_token(mm); 440 put_swap_token(mm);
441 mm_free_cgroup(mm);
419 mmdrop(mm); 442 mmdrop(mm);
420 } 443 }
421} 444}
@@ -498,7 +521,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
498 * Allocate a new mm structure and copy contents from the 521 * Allocate a new mm structure and copy contents from the
499 * mm structure of the passed in task structure. 522 * mm structure of the passed in task structure.
500 */ 523 */
501static struct mm_struct *dup_mm(struct task_struct *tsk) 524struct mm_struct *dup_mm(struct task_struct *tsk)
502{ 525{
503 struct mm_struct *mm, *oldmm = current->mm; 526 struct mm_struct *mm, *oldmm = current->mm;
504 int err; 527 int err;
@@ -782,12 +805,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
782 goto out; 805 goto out;
783 } 806 }
784 807
785 /*
786 * Note: we may be using current for both targets (See exec.c)
787 * This works because we cache current->files (old) as oldf. Don't
788 * break this.
789 */
790 tsk->files = NULL;
791 newf = dup_fd(oldf, &error); 808 newf = dup_fd(oldf, &error);
792 if (!newf) 809 if (!newf)
793 goto out; 810 goto out;
@@ -823,34 +840,6 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
823 return 0; 840 return 0;
824} 841}
825 842
826/*
827 * Helper to unshare the files of the current task.
828 * We don't want to expose copy_files internals to
829 * the exec layer of the kernel.
830 */
831
832int unshare_files(void)
833{
834 struct files_struct *files = current->files;
835 int rc;
836
837 BUG_ON(!files);
838
839 /* This can race but the race causes us to copy when we don't
840 need to and drop the copy */
841 if(atomic_read(&files->count) == 1)
842 {
843 atomic_inc(&files->count);
844 return 0;
845 }
846 rc = copy_files(0, current);
847 if(rc)
848 current->files = files;
849 return rc;
850}
851
852EXPORT_SYMBOL(unshare_files);
853
854static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) 843static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
855{ 844{
856 struct sighand_struct *sig; 845 struct sighand_struct *sig;
@@ -1127,7 +1116,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1127 p->audit_context = NULL; 1116 p->audit_context = NULL;
1128 cgroup_fork(p); 1117 cgroup_fork(p);
1129#ifdef CONFIG_NUMA 1118#ifdef CONFIG_NUMA
1130 p->mempolicy = mpol_copy(p->mempolicy); 1119 p->mempolicy = mpol_dup(p->mempolicy);
1131 if (IS_ERR(p->mempolicy)) { 1120 if (IS_ERR(p->mempolicy)) {
1132 retval = PTR_ERR(p->mempolicy); 1121 retval = PTR_ERR(p->mempolicy);
1133 p->mempolicy = NULL; 1122 p->mempolicy = NULL;
@@ -1385,7 +1374,7 @@ bad_fork_cleanup_security:
1385 security_task_free(p); 1374 security_task_free(p);
1386bad_fork_cleanup_policy: 1375bad_fork_cleanup_policy:
1387#ifdef CONFIG_NUMA 1376#ifdef CONFIG_NUMA
1388 mpol_free(p->mempolicy); 1377 mpol_put(p->mempolicy);
1389bad_fork_cleanup_cgroup: 1378bad_fork_cleanup_cgroup:
1390#endif 1379#endif
1391 cgroup_exit(p, cgroup_callbacks_done); 1380 cgroup_exit(p, cgroup_callbacks_done);
@@ -1788,3 +1777,27 @@ bad_unshare_cleanup_thread:
1788bad_unshare_out: 1777bad_unshare_out:
1789 return err; 1778 return err;
1790} 1779}
1780
1781/*
1782 * Helper to unshare the files of the current task.
1783 * We don't want to expose copy_files internals to
1784 * the exec layer of the kernel.
1785 */
1786
1787int unshare_files(struct files_struct **displaced)
1788{
1789 struct task_struct *task = current;
1790 struct files_struct *copy = NULL;
1791 int error;
1792
1793 error = unshare_fd(CLONE_FILES, &copy);
1794 if (error || !copy) {
1795 *displaced = NULL;
1796 return error;
1797 }
1798 *displaced = task->files;
1799 task_lock(task);
1800 task->files = copy;
1801 task_unlock(task);
1802 return 0;
1803}
diff --git a/kernel/futex.c b/kernel/futex.c
index 06968cd79200..e43945e995f5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -281,7 +281,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
281 */ 281 */
282static void get_futex_key_refs(union futex_key *key) 282static void get_futex_key_refs(union futex_key *key)
283{ 283{
284 if (key->both.ptr == 0) 284 if (key->both.ptr == NULL)
285 return; 285 return;
286 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 286 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
287 case FUT_OFF_INODE: 287 case FUT_OFF_INODE:
@@ -2158,7 +2158,7 @@ static struct file_system_type futex_fs_type = {
2158 .kill_sb = kill_anon_super, 2158 .kill_sb = kill_anon_super,
2159}; 2159};
2160 2160
2161static int __init init(void) 2161static int __init futex_init(void)
2162{ 2162{
2163 u32 curval; 2163 u32 curval;
2164 int i; 2164 int i;
@@ -2194,4 +2194,4 @@ static int __init init(void)
2194 2194
2195 return 0; 2195 return 0;
2196} 2196}
2197__initcall(init); 2197__initcall(futex_init);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index ff90f049f8f6..04ac3a9e42cf 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -30,7 +30,7 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
30 return 0; 30 return 0;
31} 31}
32 32
33static void __user *futex_uaddr(struct robust_list *entry, 33static void __user *futex_uaddr(struct robust_list __user *entry,
34 compat_long_t futex_offset) 34 compat_long_t futex_offset)
35{ 35{
36 compat_uptr_t base = ptr_to_compat(entry); 36 compat_uptr_t base = ptr_to_compat(entry);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 98bee013f71f..dea4c9124ac8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -590,7 +590,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
590 list_add_tail(&timer->cb_entry, 590 list_add_tail(&timer->cb_entry,
591 &base->cpu_base->cb_pending); 591 &base->cpu_base->cb_pending);
592 timer->state = HRTIMER_STATE_PENDING; 592 timer->state = HRTIMER_STATE_PENDING;
593 raise_softirq(HRTIMER_SOFTIRQ);
594 return 1; 593 return 1;
595 default: 594 default:
596 BUG(); 595 BUG();
@@ -633,6 +632,11 @@ static int hrtimer_switch_to_hres(void)
633 return 1; 632 return 1;
634} 633}
635 634
635static inline void hrtimer_raise_softirq(void)
636{
637 raise_softirq(HRTIMER_SOFTIRQ);
638}
639
636#else 640#else
637 641
638static inline int hrtimer_hres_active(void) { return 0; } 642static inline int hrtimer_hres_active(void) { return 0; }
@@ -651,6 +655,7 @@ static inline int hrtimer_reprogram(struct hrtimer *timer,
651{ 655{
652 return 0; 656 return 0;
653} 657}
658static inline void hrtimer_raise_softirq(void) { }
654 659
655#endif /* CONFIG_HIGH_RES_TIMERS */ 660#endif /* CONFIG_HIGH_RES_TIMERS */
656 661
@@ -850,7 +855,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
850{ 855{
851 struct hrtimer_clock_base *base, *new_base; 856 struct hrtimer_clock_base *base, *new_base;
852 unsigned long flags; 857 unsigned long flags;
853 int ret; 858 int ret, raise;
854 859
855 base = lock_hrtimer_base(timer, &flags); 860 base = lock_hrtimer_base(timer, &flags);
856 861
@@ -884,8 +889,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
884 enqueue_hrtimer(timer, new_base, 889 enqueue_hrtimer(timer, new_base,
885 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 890 new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
886 891
892 /*
893 * The timer may be expired and moved to the cb_pending
894 * list. We can not raise the softirq with base lock held due
895 * to a possible deadlock with runqueue lock.
896 */
897 raise = timer->state == HRTIMER_STATE_PENDING;
898
887 unlock_hrtimer_base(timer, &flags); 899 unlock_hrtimer_base(timer, &flags);
888 900
901 if (raise)
902 hrtimer_raise_softirq();
903
889 return ret; 904 return ret;
890} 905}
891EXPORT_SYMBOL_GPL(hrtimer_start); 906EXPORT_SYMBOL_GPL(hrtimer_start);
@@ -1080,8 +1095,19 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1080 * If the timer was rearmed on another CPU, reprogram 1095 * If the timer was rearmed on another CPU, reprogram
1081 * the event device. 1096 * the event device.
1082 */ 1097 */
1083 if (timer->base->first == &timer->node) 1098 struct hrtimer_clock_base *base = timer->base;
1084 hrtimer_reprogram(timer, timer->base); 1099
1100 if (base->first == &timer->node &&
1101 hrtimer_reprogram(timer, base)) {
1102 /*
1103 * Timer is expired. Thus move it from tree to
1104 * pending list again.
1105 */
1106 __remove_hrtimer(timer, base,
1107 HRTIMER_STATE_PENDING, 0);
1108 list_add_tail(&timer->cb_entry,
1109 &base->cpu_base->cb_pending);
1110 }
1085 } 1111 }
1086 } 1112 }
1087 spin_unlock_irq(&cpu_base->lock); 1113 spin_unlock_irq(&cpu_base->lock);
@@ -1238,51 +1264,50 @@ void hrtimer_run_pending(void)
1238/* 1264/*
1239 * Called from hardirq context every jiffy 1265 * Called from hardirq context every jiffy
1240 */ 1266 */
1241static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, 1267void hrtimer_run_queues(void)
1242 int index)
1243{ 1268{
1244 struct rb_node *node; 1269 struct rb_node *node;
1245 struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; 1270 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1271 struct hrtimer_clock_base *base;
1272 int index, gettime = 1;
1246 1273
1247 if (!base->first) 1274 if (hrtimer_hres_active())
1248 return; 1275 return;
1249 1276
1250 if (base->get_softirq_time) 1277 for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
1251 base->softirq_time = base->get_softirq_time(); 1278 base = &cpu_base->clock_base[index];
1252
1253 spin_lock(&cpu_base->lock);
1254
1255 while ((node = base->first)) {
1256 struct hrtimer *timer;
1257
1258 timer = rb_entry(node, struct hrtimer, node);
1259 if (base->softirq_time.tv64 <= timer->expires.tv64)
1260 break;
1261 1279
1262 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { 1280 if (!base->first)
1263 __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
1264 list_add_tail(&timer->cb_entry,
1265 &base->cpu_base->cb_pending);
1266 continue; 1281 continue;
1282
1283 if (base->get_softirq_time)
1284 base->softirq_time = base->get_softirq_time();
1285 else if (gettime) {
1286 hrtimer_get_softirq_time(cpu_base);
1287 gettime = 0;
1267 } 1288 }
1268 1289
1269 __run_hrtimer(timer); 1290 spin_lock(&cpu_base->lock);
1270 }
1271 spin_unlock(&cpu_base->lock);
1272}
1273 1291
1274void hrtimer_run_queues(void) 1292 while ((node = base->first)) {
1275{ 1293 struct hrtimer *timer;
1276 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1277 int i;
1278 1294
1279 if (hrtimer_hres_active()) 1295 timer = rb_entry(node, struct hrtimer, node);
1280 return; 1296 if (base->softirq_time.tv64 <= timer->expires.tv64)
1297 break;
1281 1298
1282 hrtimer_get_softirq_time(cpu_base); 1299 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1300 __remove_hrtimer(timer, base,
1301 HRTIMER_STATE_PENDING, 0);
1302 list_add_tail(&timer->cb_entry,
1303 &base->cpu_base->cb_pending);
1304 continue;
1305 }
1283 1306
1284 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1307 __run_hrtimer(timer);
1285 run_hrtimer_queue(cpu_base, i); 1308 }
1309 spin_unlock(&cpu_base->lock);
1310 }
1286} 1311}
1287 1312
1288/* 1313/*
@@ -1354,13 +1379,13 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1354 struct hrtimer_sleeper t; 1379 struct hrtimer_sleeper t;
1355 struct timespec __user *rmtp; 1380 struct timespec __user *rmtp;
1356 1381
1357 hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); 1382 hrtimer_init(&t.timer, restart->nanosleep.index, HRTIMER_MODE_ABS);
1358 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; 1383 t.timer.expires.tv64 = restart->nanosleep.expires;
1359 1384
1360 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1385 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1361 return 0; 1386 return 0;
1362 1387
1363 rmtp = (struct timespec __user *)restart->arg1; 1388 rmtp = restart->nanosleep.rmtp;
1364 if (rmtp) { 1389 if (rmtp) {
1365 int ret = update_rmtp(&t.timer, rmtp); 1390 int ret = update_rmtp(&t.timer, rmtp);
1366 if (ret <= 0) 1391 if (ret <= 0)
@@ -1394,10 +1419,9 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1394 1419
1395 restart = &current_thread_info()->restart_block; 1420 restart = &current_thread_info()->restart_block;
1396 restart->fn = hrtimer_nanosleep_restart; 1421 restart->fn = hrtimer_nanosleep_restart;
1397 restart->arg0 = (unsigned long) t.timer.base->index; 1422 restart->nanosleep.index = t.timer.base->index;
1398 restart->arg1 = (unsigned long) rmtp; 1423 restart->nanosleep.rmtp = rmtp;
1399 restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF; 1424 restart->nanosleep.expires = t.timer.expires.tv64;
1400 restart->arg3 = t.timer.expires.tv64 >> 32;
1401 1425
1402 return -ERESTART_RESTARTBLOCK; 1426 return -ERESTART_RESTARTBLOCK;
1403} 1427}
@@ -1425,7 +1449,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1425 int i; 1449 int i;
1426 1450
1427 spin_lock_init(&cpu_base->lock); 1451 spin_lock_init(&cpu_base->lock);
1428 lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key);
1429 1452
1430 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1453 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1431 cpu_base->clock_base[i].cpu_base = cpu_base; 1454 cpu_base->clock_base[i].cpu_base = cpu_base;
@@ -1466,16 +1489,16 @@ static void migrate_hrtimers(int cpu)
1466 tick_cancel_sched_timer(cpu); 1489 tick_cancel_sched_timer(cpu);
1467 1490
1468 local_irq_disable(); 1491 local_irq_disable();
1469 double_spin_lock(&new_base->lock, &old_base->lock, 1492 spin_lock(&new_base->lock);
1470 smp_processor_id() < cpu); 1493 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1471 1494
1472 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1495 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1473 migrate_hrtimer_list(&old_base->clock_base[i], 1496 migrate_hrtimer_list(&old_base->clock_base[i],
1474 &new_base->clock_base[i]); 1497 &new_base->clock_base[i]);
1475 } 1498 }
1476 1499
1477 double_spin_unlock(&new_base->lock, &old_base->lock, 1500 spin_unlock(&old_base->lock);
1478 smp_processor_id() < cpu); 1501 spin_unlock(&new_base->lock);
1479 local_irq_enable(); 1502 local_irq_enable();
1480 put_cpu_var(hrtimer_bases); 1503 put_cpu_var(hrtimer_bases);
1481} 1504}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fdb3fbe2b0c4..964964baefa2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -47,7 +47,7 @@ void dynamic_irq_init(unsigned int irq)
47 desc->irq_count = 0; 47 desc->irq_count = 0;
48 desc->irqs_unhandled = 0; 48 desc->irqs_unhandled = 0;
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50 desc->affinity = CPU_MASK_ALL; 50 cpus_setall(desc->affinity);
51#endif 51#endif
52 spin_unlock_irqrestore(&desc->lock, flags); 52 spin_unlock_irqrestore(&desc->lock, flags);
53} 53}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 06a0e2775651..cb85c79989b4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -29,7 +29,6 @@
29#include <asm/uaccess.h> 29#include <asm/uaccess.h>
30#include <asm/io.h> 30#include <asm/io.h>
31#include <asm/system.h> 31#include <asm/system.h>
32#include <asm/semaphore.h>
33#include <asm/sections.h> 32#include <asm/sections.h>
34 33
35/* Per cpu memory for storing cpu states in case of system crash. */ 34/* Per cpu memory for storing cpu states in case of system crash. */
@@ -1406,6 +1405,9 @@ static int __init crash_save_vmcoreinfo_init(void)
1406 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1405 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1407 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1406 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1408 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1407 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1408 VMCOREINFO_NUMBER(PG_lru);
1409 VMCOREINFO_NUMBER(PG_private);
1410 VMCOREINFO_NUMBER(PG_swapcache);
1409 1411
1410 arch_crash_save_vmcoreinfo(); 1412 arch_crash_save_vmcoreinfo();
1411 1413
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
new file mode 100644
index 000000000000..1bd0ec1c80b2
--- /dev/null
+++ b/kernel/kgdb.c
@@ -0,0 +1,1700 @@
1/*
2 * KGDB stub.
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2008 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/reboot.h>
41#include <linux/string.h>
42#include <linux/delay.h>
43#include <linux/sched.h>
44#include <linux/sysrq.h>
45#include <linux/init.h>
46#include <linux/kgdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55
56static int kgdb_break_asap;
57
58struct kgdb_state {
59 int ex_vector;
60 int signo;
61 int err_code;
62 int cpu;
63 int pass_exception;
64 long threadid;
65 long kgdb_usethreadid;
66 struct pt_regs *linux_regs;
67};
68
69static struct debuggerinfo_struct {
70 void *debuggerinfo;
71 struct task_struct *task;
72} kgdb_info[NR_CPUS];
73
74/**
75 * kgdb_connected - Is a host GDB connected to us?
76 */
77int kgdb_connected;
78EXPORT_SYMBOL_GPL(kgdb_connected);
79
80/* All the KGDB handlers are installed */
81static int kgdb_io_module_registered;
82
83/* Guard for recursive entry */
84static int exception_level;
85
86static struct kgdb_io *kgdb_io_ops;
87static DEFINE_SPINLOCK(kgdb_registration_lock);
88
89/* kgdb console driver is loaded */
90static int kgdb_con_registered;
91/* determine if kgdb console output should be used */
92static int kgdb_use_con;
93
94static int __init opt_kgdb_con(char *str)
95{
96 kgdb_use_con = 1;
97 return 0;
98}
99
100early_param("kgdbcon", opt_kgdb_con);
101
102module_param(kgdb_use_con, int, 0644);
103
104/*
105 * Holds information about breakpoints in a kernel. These breakpoints are
106 * added and removed by gdb.
107 */
108static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
109 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
110};
111
112/*
113 * The CPU# of the active CPU, or -1 if none:
114 */
115atomic_t kgdb_active = ATOMIC_INIT(-1);
116
117/*
118 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
119 * bootup code (which might not have percpu set up yet):
120 */
121static atomic_t passive_cpu_wait[NR_CPUS];
122static atomic_t cpu_in_kgdb[NR_CPUS];
123atomic_t kgdb_setting_breakpoint;
124
125struct task_struct *kgdb_usethread;
126struct task_struct *kgdb_contthread;
127
128int kgdb_single_step;
129
130/* Our I/O buffers. */
131static char remcom_in_buffer[BUFMAX];
132static char remcom_out_buffer[BUFMAX];
133
134/* Storage for the registers, in GDB format. */
135static unsigned long gdb_regs[(NUMREGBYTES +
136 sizeof(unsigned long) - 1) /
137 sizeof(unsigned long)];
138
139/* to keep track of the CPU which is doing the single stepping*/
140atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
141
142/*
143 * If you are debugging a problem where roundup (the collection of
144 * all other CPUs) is a problem [this should be extremely rare],
145 * then use the nokgdbroundup option to avoid roundup. In that case
146 * the other CPUs might interfere with your debugging context, so
147 * use this with care:
148 */
149int kgdb_do_roundup = 1;
150
151static int __init opt_nokgdbroundup(char *str)
152{
153 kgdb_do_roundup = 0;
154
155 return 0;
156}
157
158early_param("nokgdbroundup", opt_nokgdbroundup);
159
160/*
161 * Finally, some KGDB code :-)
162 */
163
164/*
165 * Weak aliases for breakpoint management,
166 * can be overriden by architectures when needed:
167 */
168int __weak kgdb_validate_break_address(unsigned long addr)
169{
170 char tmp_variable[BREAK_INSTR_SIZE];
171
172 return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE);
173}
174
175int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
176{
177 int err;
178
179 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
180 if (err)
181 return err;
182
183 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
184 BREAK_INSTR_SIZE);
185}
186
187int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
188{
189 return probe_kernel_write((char *)addr,
190 (char *)bundle, BREAK_INSTR_SIZE);
191}
192
193unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
194{
195 return instruction_pointer(regs);
196}
197
198int __weak kgdb_arch_init(void)
199{
200 return 0;
201}
202
203int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
204{
205 return 0;
206}
207
208void __weak
209kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
210{
211 return;
212}
213
214/**
215 * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
216 * @regs: Current &struct pt_regs.
217 *
218 * This function will be called if the particular architecture must
219 * disable hardware debugging while it is processing gdb packets or
220 * handling exception.
221 */
222void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
223{
224}
225
226/*
227 * GDB remote protocol parser:
228 */
229
230static const char hexchars[] = "0123456789abcdef";
231
/*
 * Convert one ASCII hexadecimal digit (either case) to its value.
 * Returns 0..15 for a valid digit and -1 for anything else.
 */
static int hex(char ch)
{
	if (ch >= '0' && ch <= '9')
		return ch - '0';

	if (ch >= 'a' && ch <= 'f')
		return 10 + (ch - 'a');

	if (ch >= 'A' && ch <= 'F')
		return 10 + (ch - 'A');

	return -1;
}
242
/*
 * scan for the sequence $<data>#<checksum>
 *
 * Reads characters through kgdb_io_ops->read_char() until a complete
 * packet whose transmitted checksum matches the computed one has been
 * received.  The payload (without '$', '#' and the checksum digits) is
 * stored NUL-terminated in @buffer, at most BUFMAX - 1 characters.
 * Each checksummed packet is answered with '+' (match) or '-'
 * (mismatch); on mismatch the whole receive is retried.
 *
 * Side effect: kgdb_connected is set to 1 as soon as a '$' is seen.
 */
static void get_packet(char *buffer)
{
	unsigned char checksum;
	unsigned char xmitcsum;
	int count;
	char ch;

	do {
		/*
		 * Spin and wait around for the start character, ignore all
		 * other characters:
		 */
		while ((ch = (kgdb_io_ops->read_char())) != '$')
			/* nothing */;

		kgdb_connected = 1;
		checksum = 0;
		/* 0xff until a real checksum has been read from the wire */
		xmitcsum = -1;

		count = 0;

		/*
		 * now, read until a # or end of buffer is found:
		 */
		while (count < (BUFMAX - 1)) {
			ch = kgdb_io_ops->read_char();
			if (ch == '#')
				break;
			checksum = checksum + ch;
			buffer[count] = ch;
			count = count + 1;
		}
		buffer[count] = 0;

		/*
		 * NOTE(review): if the buffer filled up before a '#' arrived,
		 * no ack is sent and the loop exit depends on whether the
		 * running checksum happens to equal 0xff - confirm that this
		 * is acceptable for over-long packets.
		 */
		if (ch == '#') {
			/*
			 * Two hex digits follow the '#'.  hex() yields -1 for
			 * a non-hex character; that value is folded into the
			 * unsigned char as-is.
			 */
			xmitcsum = hex(kgdb_io_ops->read_char()) << 4;
			xmitcsum += hex(kgdb_io_ops->read_char());

			if (checksum != xmitcsum)
				/* failed checksum */
				kgdb_io_ops->write_char('-');
			else
				/* successful transfer */
				kgdb_io_ops->write_char('+');
			/* flush is an optional I/O driver callback */
			if (kgdb_io_ops->flush)
				kgdb_io_ops->flush();
		}
	} while (checksum != xmitcsum);
}
293
294/*
295 * Send the packet in buffer.
296 * Check for gdb connection if asked for.
297 */
298static void put_packet(char *buffer)
299{
300 unsigned char checksum;
301 int count;
302 char ch;
303
304 /*
305 * $<packet info>#<checksum>.
306 */
307 while (1) {
308 kgdb_io_ops->write_char('$');
309 checksum = 0;
310 count = 0;
311
312 while ((ch = buffer[count])) {
313 kgdb_io_ops->write_char(ch);
314 checksum += ch;
315 count++;
316 }
317
318 kgdb_io_ops->write_char('#');
319 kgdb_io_ops->write_char(hexchars[checksum >> 4]);
320 kgdb_io_ops->write_char(hexchars[checksum & 0xf]);
321 if (kgdb_io_ops->flush)
322 kgdb_io_ops->flush();
323
324 /* Now see what we get in reply. */
325 ch = kgdb_io_ops->read_char();
326
327 if (ch == 3)
328 ch = kgdb_io_ops->read_char();
329
330 /* If we get an ACK, we are done. */
331 if (ch == '+')
332 return;
333
334 /*
335 * If we get the start of another packet, this means
336 * that GDB is attempting to reconnect. We will NAK
337 * the packet being sent, and stop trying to send this
338 * packet.
339 */
340 if (ch == '$') {
341 kgdb_io_ops->write_char('-');
342 if (kgdb_io_ops->flush)
343 kgdb_io_ops->flush();
344 return;
345 }
346 }
347}
348
349static char *pack_hex_byte(char *pkt, u8 byte)
350{
351 *pkt++ = hexchars[byte >> 4];
352 *pkt++ = hexchars[byte & 0xf];
353
354 return pkt;
355}
356
/*
 * Convert @count bytes of memory at @mem into hex, placing the
 * NUL-terminated result (2 * @count digits) in @buf.
 *
 * Returns 0 on success or the error from probe_kernel_read(); on error
 * no hex digits are written to @buf.
 */
int kgdb_mem2hex(char *mem, char *buf, int count)
{
	/*
	 * The upper half of buf serves as the intermediate buffer for the
	 * raw memory copy; the hex conversion then reads from there while
	 * filling buf from the front.
	 */
	char *raw = buf + count;
	int err = probe_kernel_read(raw, mem, count);
	int i;

	if (err)
		return err;

	for (i = 0; i < count; i++)
		buf = pack_hex_byte(buf, raw[i]);
	*buf = 0;

	return 0;
}
385
/*
 * Copy @count bytes of escaped binary data from @buf into @mem, one
 * byte at a time.  A 0x7d byte in the input is an escape marker: it is
 * dropped and the byte following it is XORed with 0x20 to recover the
 * original value.
 *
 * Returns 0 on success, or the first error from probe_kernel_write()
 * (bytes written before the failure stay written).
 */
static int kgdb_ebin2mem(char *buf, char *mem, int count)
{
	int err = 0;
	char byte;

	for (; count > 0; count--, mem++) {
		byte = *buf++;
		if (byte == 0x7d)
			byte = *buf++ ^ 0x20;

		err = probe_kernel_write(mem, &byte, 1);
		if (err)
			break;
	}

	return err;
}
410
/*
 * Convert the 2 * @count hex digits at @buf into @count raw bytes and
 * write them to @mem.
 *
 * Returns the result of probe_kernel_write().  @buf is modified: the
 * decoded bytes are built up in its upper half.
 */
int kgdb_hex2mem(char *buf, char *mem, int count)
{
	/*
	 * Decode back to front: each byte is stored at or behind the
	 * digit pair it came from, so digits not yet consumed are never
	 * overwritten.
	 */
	char *raw = buf + count * 2;
	char *digit;

	for (digit = raw - 1; digit >= buf; digit -= 2) {
		raw--;
		*raw = hex(digit[0]);
		*raw |= hex(digit[-1]) << 4;
	}

	return probe_kernel_write(mem, raw, count);
}
436
/*
 * Parse a run of hex digits starting at *@ptr into *@long_val.
 *
 * *@ptr is advanced past every digit consumed and is left pointing at
 * the first non-hex character (or the terminating NUL).  *@long_val is
 * 0 when no digit was found.  Returns the number of digits consumed.
 */
int kgdb_hex2long(char **ptr, long *long_val)
{
	int consumed = 0;
	int digit;

	*long_val = 0;

	for (; **ptr; (*ptr)++, consumed++) {
		digit = hex(**ptr);
		if (digit < 0)
			break;

		*long_val = (*long_val << 4) | digit;
	}

	return consumed;
}
460
461/* Write memory due to an 'M' or 'X' packet. */
462static int write_mem_msg(int binary)
463{
464 char *ptr = &remcom_in_buffer[1];
465 unsigned long addr;
466 unsigned long length;
467 int err;
468
469 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
470 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
471 if (binary)
472 err = kgdb_ebin2mem(ptr, (char *)addr, length);
473 else
474 err = kgdb_hex2mem(ptr, (char *)addr, length);
475 if (err)
476 return err;
477 if (CACHE_FLUSH_IS_SAFE)
478 flush_icache_range(addr, addr + length + 1);
479 return 0;
480 }
481
482 return -EINVAL;
483}
484
485static void error_packet(char *pkt, int error)
486{
487 error = -error;
488 pkt[0] = 'E';
489 pkt[1] = hexchars[(error / 10)];
490 pkt[2] = hexchars[(error % 10)];
491 pkt[3] = '\0';
492}
493
494/*
495 * Thread ID accessors. We represent a flat TID space to GDB, where
496 * the per CPU idle threads (which under Linux all have PID 0) are
497 * remapped to negative TIDs.
498 */
499
/* Number of hex characters one thread id occupies in a packet. */
#define BUF_THREAD_ID_SIZE	16

/*
 * Append the thread reference @id to @pkt as BUF_THREAD_ID_SIZE hex
 * characters (two per byte of @id).  Returns the position just past
 * the characters written; no NUL is appended.
 */
static char *pack_threadid(char *pkt, unsigned char *id)
{
	char *end = pkt + BUF_THREAD_ID_SIZE;

	for (; pkt < end; id++)
		pkt = pack_hex_byte(pkt, *id);

	return pkt;
}
512
/*
 * Encode @value into the 8-byte thread reference @id: four zero bytes
 * followed by the 32 bits of @value, most significant byte first.
 */
static void int_to_threadref(unsigned char *id, int value)
{
	int shift;
	int i;

	for (i = 0; i < 4; i++)
		*id++ = 0;

	for (shift = 24; shift >= 0; shift -= 8)
		*id++ = (value >> shift) & 0xff;
}
526
527static struct task_struct *getthread(struct pt_regs *regs, int tid)
528{
529 /*
530 * Non-positive TIDs are remapped idle tasks:
531 */
532 if (tid <= 0)
533 return idle_task(-tid);
534
535 /*
536 * find_task_by_pid_ns() does not take the tasklist lock anymore
537 * but is nicely RCU locked - hence is a pretty resilient
538 * thing to use:
539 */
540 return find_task_by_pid_ns(tid, &init_pid_ns);
541}
542
543/*
544 * CPU debug state control:
545 */
546
#ifdef CONFIG_SMP
/*
 * Park the calling CPU while another CPU runs the debugger.
 *
 * With local interrupts disabled, publishes @regs and current in this
 * CPU's kgdb_info[] slot, marks the CPU in cpu_in_kgdb[] and then spins
 * until its passive_cpu_wait[] flag is cleared.  On the way out the
 * kgdb_info[] slot is cleared, the optional correct_hw_break() arch hook
 * is run and the cpu_in_kgdb[] flag is dropped again.
 */
static void kgdb_wait(struct pt_regs *regs)
{
	unsigned long flags;
	int cpu;

	local_irq_save(flags);
	cpu = raw_smp_processor_id();
	kgdb_info[cpu].debuggerinfo = regs;
	kgdb_info[cpu].task = current;
	/*
	 * Make sure the above info reaches the primary CPU before
	 * our cpu_in_kgdb[] flag setting does:
	 */
	smp_wmb();
	atomic_set(&cpu_in_kgdb[cpu], 1);

	/* Wait till primary CPU is done with debugging */
	while (atomic_read(&passive_cpu_wait[cpu]))
		cpu_relax();

	kgdb_info[cpu].debuggerinfo = NULL;
	kgdb_info[cpu].task = NULL;

	/* fix up hardware debug registers on local cpu */
	if (arch_kgdb_ops.correct_hw_break)
		arch_kgdb_ops.correct_hw_break();

	/* Signal the primary CPU that we are done: */
	atomic_set(&cpu_in_kgdb[cpu], 0);
	clocksource_touch_watchdog();
	local_irq_restore(flags);
}
#endif
581
582/*
583 * Some architectures need cache flushes when we set/clear a
584 * breakpoint:
585 */
586static void kgdb_flush_swbreak_addr(unsigned long addr)
587{
588 if (!CACHE_FLUSH_IS_SAFE)
589 return;
590
591 if (current->mm && current->mm->mmap_cache) {
592 flush_cache_range(current->mm->mmap_cache,
593 addr, addr + BREAK_INSTR_SIZE);
594 }
595 /* Force flush instruction cache if it was outside the mm */
596 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
597}
598
599/*
600 * SW breakpoint management:
601 */
602static int kgdb_activate_sw_breakpoints(void)
603{
604 unsigned long addr;
605 int error = 0;
606 int i;
607
608 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
609 if (kgdb_break[i].state != BP_SET)
610 continue;
611
612 addr = kgdb_break[i].bpt_addr;
613 error = kgdb_arch_set_breakpoint(addr,
614 kgdb_break[i].saved_instr);
615 if (error)
616 return error;
617
618 kgdb_flush_swbreak_addr(addr);
619 kgdb_break[i].state = BP_ACTIVE;
620 }
621 return 0;
622}
623
624static int kgdb_set_sw_break(unsigned long addr)
625{
626 int err = kgdb_validate_break_address(addr);
627 int breakno = -1;
628 int i;
629
630 if (err)
631 return err;
632
633 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
634 if ((kgdb_break[i].state == BP_SET) &&
635 (kgdb_break[i].bpt_addr == addr))
636 return -EEXIST;
637 }
638 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
639 if (kgdb_break[i].state == BP_REMOVED &&
640 kgdb_break[i].bpt_addr == addr) {
641 breakno = i;
642 break;
643 }
644 }
645
646 if (breakno == -1) {
647 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
648 if (kgdb_break[i].state == BP_UNDEFINED) {
649 breakno = i;
650 break;
651 }
652 }
653 }
654
655 if (breakno == -1)
656 return -E2BIG;
657
658 kgdb_break[breakno].state = BP_SET;
659 kgdb_break[breakno].type = BP_BREAKPOINT;
660 kgdb_break[breakno].bpt_addr = addr;
661
662 return 0;
663}
664
665static int kgdb_deactivate_sw_breakpoints(void)
666{
667 unsigned long addr;
668 int error = 0;
669 int i;
670
671 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
672 if (kgdb_break[i].state != BP_ACTIVE)
673 continue;
674 addr = kgdb_break[i].bpt_addr;
675 error = kgdb_arch_remove_breakpoint(addr,
676 kgdb_break[i].saved_instr);
677 if (error)
678 return error;
679
680 kgdb_flush_swbreak_addr(addr);
681 kgdb_break[i].state = BP_SET;
682 }
683 return 0;
684}
685
686static int kgdb_remove_sw_break(unsigned long addr)
687{
688 int i;
689
690 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
691 if ((kgdb_break[i].state == BP_SET) &&
692 (kgdb_break[i].bpt_addr == addr)) {
693 kgdb_break[i].state = BP_REMOVED;
694 return 0;
695 }
696 }
697 return -ENOENT;
698}
699
700int kgdb_isremovedbreak(unsigned long addr)
701{
702 int i;
703
704 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
705 if ((kgdb_break[i].state == BP_REMOVED) &&
706 (kgdb_break[i].bpt_addr == addr))
707 return 1;
708 }
709 return 0;
710}
711
712int remove_all_break(void)
713{
714 unsigned long addr;
715 int error;
716 int i;
717
718 /* Clear memory breakpoints. */
719 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
720 if (kgdb_break[i].state != BP_ACTIVE)
721 goto setundefined;
722 addr = kgdb_break[i].bpt_addr;
723 error = kgdb_arch_remove_breakpoint(addr,
724 kgdb_break[i].saved_instr);
725 if (error)
726 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
727 addr);
728setundefined:
729 kgdb_break[i].state = BP_UNDEFINED;
730 }
731
732 /* Clear hardware breakpoints. */
733 if (arch_kgdb_ops.remove_all_hw_break)
734 arch_kgdb_ops.remove_all_hw_break();
735
736 return 0;
737}
738
/*
 * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs:
 * a non-zero @realpid is returned as is, PID 0 becomes -1 minus the id
 * of the CPU this runs on.
 */
static inline int shadow_pid(int realpid)
{
	return realpid ? realpid : -1 - raw_smp_processor_id();
}
749
750static char gdbmsgbuf[BUFMAX + 1];
751
752static void kgdb_msg_write(const char *s, int len)
753{
754 char *bufptr;
755 int wcount;
756 int i;
757
758 /* 'O'utput */
759 gdbmsgbuf[0] = 'O';
760
761 /* Fill and send buffers... */
762 while (len > 0) {
763 bufptr = gdbmsgbuf + 1;
764
765 /* Calculate how many this time */
766 if ((len << 1) > (BUFMAX - 2))
767 wcount = (BUFMAX - 2) >> 1;
768 else
769 wcount = len;
770
771 /* Pack in hex chars */
772 for (i = 0; i < wcount; i++)
773 bufptr = pack_hex_byte(bufptr, s[i]);
774 *bufptr = '\0';
775
776 /* Move up */
777 s += wcount;
778 len -= wcount;
779
780 /* Write packet */
781 put_packet(gdbmsgbuf);
782 }
783}
784
785/*
786 * Return true if there is a valid kgdb I/O module. Also if no
787 * debugger is attached a message can be printed to the console about
788 * waiting for the debugger to attach.
789 *
790 * The print_wait argument is only to be true when called from inside
791 * the core kgdb_handle_exception, because it will wait for the
792 * debugger to attach.
793 */
794static int kgdb_io_ready(int print_wait)
795{
796 if (!kgdb_io_ops)
797 return 0;
798 if (kgdb_connected)
799 return 1;
800 if (atomic_read(&kgdb_setting_breakpoint))
801 return 1;
802 if (print_wait)
803 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
804 return 1;
805}
806
807/*
808 * All the functions that start with gdb_cmd are the various
809 * operations to implement the handlers for the gdbserial protocol
810 * where KGDB is communicating with an external debugger
811 */
812
813/* Handle the '?' status packets */
814static void gdb_cmd_status(struct kgdb_state *ks)
815{
816 /*
817 * We know that this packet is only sent
818 * during initial connect. So to be safe,
819 * we clear out our breakpoints now in case
820 * GDB is reconnecting.
821 */
822 remove_all_break();
823
824 remcom_out_buffer[0] = 'S';
825 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
826}
827
828/* Handle the 'g' get registers request */
829static void gdb_cmd_getregs(struct kgdb_state *ks)
830{
831 struct task_struct *thread;
832 void *local_debuggerinfo;
833 int i;
834
835 thread = kgdb_usethread;
836 if (!thread) {
837 thread = kgdb_info[ks->cpu].task;
838 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
839 } else {
840 local_debuggerinfo = NULL;
841 for (i = 0; i < NR_CPUS; i++) {
842 /*
843 * Try to find the task on some other
844 * or possibly this node if we do not
845 * find the matching task then we try
846 * to approximate the results.
847 */
848 if (thread == kgdb_info[i].task)
849 local_debuggerinfo = kgdb_info[i].debuggerinfo;
850 }
851 }
852
853 /*
854 * All threads that don't have debuggerinfo should be
855 * in __schedule() sleeping, since all other CPUs
856 * are in kgdb_wait, and thus have debuggerinfo.
857 */
858 if (local_debuggerinfo) {
859 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
860 } else {
861 /*
862 * Pull stuff saved during switch_to; nothing
863 * else is accessible (or even particularly
864 * relevant).
865 *
866 * This should be enough for a stack trace.
867 */
868 sleeping_thread_to_gdb_regs(gdb_regs, thread);
869 }
870 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
871}
872
873/* Handle the 'G' set registers request */
874static void gdb_cmd_setregs(struct kgdb_state *ks)
875{
876 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
877
878 if (kgdb_usethread && kgdb_usethread != current) {
879 error_packet(remcom_out_buffer, -EINVAL);
880 } else {
881 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
882 strcpy(remcom_out_buffer, "OK");
883 }
884}
885
886/* Handle the 'm' memory read bytes */
887static void gdb_cmd_memread(struct kgdb_state *ks)
888{
889 char *ptr = &remcom_in_buffer[1];
890 unsigned long length;
891 unsigned long addr;
892 int err;
893
894 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
895 kgdb_hex2long(&ptr, &length) > 0) {
896 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
897 if (err)
898 error_packet(remcom_out_buffer, err);
899 } else {
900 error_packet(remcom_out_buffer, -EINVAL);
901 }
902}
903
904/* Handle the 'M' memory write bytes */
905static void gdb_cmd_memwrite(struct kgdb_state *ks)
906{
907 int err = write_mem_msg(0);
908
909 if (err)
910 error_packet(remcom_out_buffer, err);
911 else
912 strcpy(remcom_out_buffer, "OK");
913}
914
915/* Handle the 'X' memory binary write bytes */
916static void gdb_cmd_binwrite(struct kgdb_state *ks)
917{
918 int err = write_mem_msg(1);
919
920 if (err)
921 error_packet(remcom_out_buffer, err);
922 else
923 strcpy(remcom_out_buffer, "OK");
924}
925
926/* Handle the 'D' or 'k', detach or kill packets */
927static void gdb_cmd_detachkill(struct kgdb_state *ks)
928{
929 int error;
930
931 /* The detach case */
932 if (remcom_in_buffer[0] == 'D') {
933 error = remove_all_break();
934 if (error < 0) {
935 error_packet(remcom_out_buffer, error);
936 } else {
937 strcpy(remcom_out_buffer, "OK");
938 kgdb_connected = 0;
939 }
940 put_packet(remcom_out_buffer);
941 } else {
942 /*
943 * Assume the kill case, with no exit code checking,
944 * trying to force detach the debugger:
945 */
946 remove_all_break();
947 kgdb_connected = 0;
948 }
949}
950
951/* Handle the 'R' reboot packets */
952static int gdb_cmd_reboot(struct kgdb_state *ks)
953{
954 /* For now, only honor R0 */
955 if (strcmp(remcom_in_buffer, "R0") == 0) {
956 printk(KERN_CRIT "Executing emergency reboot\n");
957 strcpy(remcom_out_buffer, "OK");
958 put_packet(remcom_out_buffer);
959
960 /*
961 * Execution should not return from
962 * machine_emergency_restart()
963 */
964 machine_emergency_restart();
965 kgdb_connected = 0;
966
967 return 1;
968 }
969 return 0;
970}
971
972/* Handle the 'q' query packets */
973static void gdb_cmd_query(struct kgdb_state *ks)
974{
975 struct task_struct *thread;
976 unsigned char thref[8];
977 char *ptr;
978 int i;
979
980 switch (remcom_in_buffer[1]) {
981 case 's':
982 case 'f':
983 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
984 error_packet(remcom_out_buffer, -EINVAL);
985 break;
986 }
987
988 if (remcom_in_buffer[1] == 'f')
989 ks->threadid = 1;
990
991 remcom_out_buffer[0] = 'm';
992 ptr = remcom_out_buffer + 1;
993
994 for (i = 0; i < 17; ks->threadid++) {
995 thread = getthread(ks->linux_regs, ks->threadid);
996 if (thread) {
997 int_to_threadref(thref, ks->threadid);
998 pack_threadid(ptr, thref);
999 ptr += BUF_THREAD_ID_SIZE;
1000 *(ptr++) = ',';
1001 i++;
1002 }
1003 }
1004 *(--ptr) = '\0';
1005 break;
1006
1007 case 'C':
1008 /* Current thread id */
1009 strcpy(remcom_out_buffer, "QC");
1010 ks->threadid = shadow_pid(current->pid);
1011 int_to_threadref(thref, ks->threadid);
1012 pack_threadid(remcom_out_buffer + 2, thref);
1013 break;
1014 case 'T':
1015 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
1016 error_packet(remcom_out_buffer, -EINVAL);
1017 break;
1018 }
1019 ks->threadid = 0;
1020 ptr = remcom_in_buffer + 17;
1021 kgdb_hex2long(&ptr, &ks->threadid);
1022 if (!getthread(ks->linux_regs, ks->threadid)) {
1023 error_packet(remcom_out_buffer, -EINVAL);
1024 break;
1025 }
1026 if (ks->threadid > 0) {
1027 kgdb_mem2hex(getthread(ks->linux_regs,
1028 ks->threadid)->comm,
1029 remcom_out_buffer, 16);
1030 } else {
1031 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
1032
1033 sprintf(tmpstr, "Shadow task %d for pid 0",
1034 (int)(-ks->threadid-1));
1035 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
1036 }
1037 break;
1038 }
1039}
1040
1041/* Handle the 'H' task query packets */
1042static void gdb_cmd_task(struct kgdb_state *ks)
1043{
1044 struct task_struct *thread;
1045 char *ptr;
1046
1047 switch (remcom_in_buffer[1]) {
1048 case 'g':
1049 ptr = &remcom_in_buffer[2];
1050 kgdb_hex2long(&ptr, &ks->threadid);
1051 thread = getthread(ks->linux_regs, ks->threadid);
1052 if (!thread && ks->threadid > 0) {
1053 error_packet(remcom_out_buffer, -EINVAL);
1054 break;
1055 }
1056 kgdb_usethread = thread;
1057 ks->kgdb_usethreadid = ks->threadid;
1058 strcpy(remcom_out_buffer, "OK");
1059 break;
1060 case 'c':
1061 ptr = &remcom_in_buffer[2];
1062 kgdb_hex2long(&ptr, &ks->threadid);
1063 if (!ks->threadid) {
1064 kgdb_contthread = NULL;
1065 } else {
1066 thread = getthread(ks->linux_regs, ks->threadid);
1067 if (!thread && ks->threadid > 0) {
1068 error_packet(remcom_out_buffer, -EINVAL);
1069 break;
1070 }
1071 kgdb_contthread = thread;
1072 }
1073 strcpy(remcom_out_buffer, "OK");
1074 break;
1075 }
1076}
1077
1078/* Handle the 'T' thread query packets */
1079static void gdb_cmd_thread(struct kgdb_state *ks)
1080{
1081 char *ptr = &remcom_in_buffer[1];
1082 struct task_struct *thread;
1083
1084 kgdb_hex2long(&ptr, &ks->threadid);
1085 thread = getthread(ks->linux_regs, ks->threadid);
1086 if (thread)
1087 strcpy(remcom_out_buffer, "OK");
1088 else
1089 error_packet(remcom_out_buffer, -EINVAL);
1090}
1091
1092/* Handle the 'z' or 'Z' breakpoint remove or set packets */
1093static void gdb_cmd_break(struct kgdb_state *ks)
1094{
1095 /*
1096 * Since GDB-5.3, it's been drafted that '0' is a software
1097 * breakpoint, '1' is a hardware breakpoint, so let's do that.
1098 */
1099 char *bpt_type = &remcom_in_buffer[1];
1100 char *ptr = &remcom_in_buffer[2];
1101 unsigned long addr;
1102 unsigned long length;
1103 int error = 0;
1104
1105 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
1106 /* Unsupported */
1107 if (*bpt_type > '4')
1108 return;
1109 } else {
1110 if (*bpt_type != '0' && *bpt_type != '1')
1111 /* Unsupported. */
1112 return;
1113 }
1114
1115 /*
1116 * Test if this is a hardware breakpoint, and
1117 * if we support it:
1118 */
1119 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
1120 /* Unsupported. */
1121 return;
1122
1123 if (*(ptr++) != ',') {
1124 error_packet(remcom_out_buffer, -EINVAL);
1125 return;
1126 }
1127 if (!kgdb_hex2long(&ptr, &addr)) {
1128 error_packet(remcom_out_buffer, -EINVAL);
1129 return;
1130 }
1131 if (*(ptr++) != ',' ||
1132 !kgdb_hex2long(&ptr, &length)) {
1133 error_packet(remcom_out_buffer, -EINVAL);
1134 return;
1135 }
1136
1137 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
1138 error = kgdb_set_sw_break(addr);
1139 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
1140 error = kgdb_remove_sw_break(addr);
1141 else if (remcom_in_buffer[0] == 'Z')
1142 error = arch_kgdb_ops.set_hw_breakpoint(addr,
1143 (int)length, *bpt_type - '0');
1144 else if (remcom_in_buffer[0] == 'z')
1145 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
1146 (int) length, *bpt_type - '0');
1147
1148 if (error == 0)
1149 strcpy(remcom_out_buffer, "OK");
1150 else
1151 error_packet(remcom_out_buffer, error);
1152}
1153
1154/* Handle the 'C' signal / exception passing packets */
1155static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1156{
1157 /* C09 == pass exception
1158 * C15 == detach kgdb, pass exception
1159 */
1160 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
1161
1162 ks->pass_exception = 1;
1163 remcom_in_buffer[0] = 'c';
1164
1165 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
1166
1167 ks->pass_exception = 1;
1168 remcom_in_buffer[0] = 'D';
1169 remove_all_break();
1170 kgdb_connected = 0;
1171 return 1;
1172
1173 } else {
1174 error_packet(remcom_out_buffer, -EINVAL);
1175 return 0;
1176 }
1177
1178 /* Indicate fall through */
1179 return -1;
1180}
1181
/*
 * This function performs all gdbserial command processing
 *
 * If a debugger is already connected, a 'T' stop reply carrying the
 * signal number and the current thread id is sent first.  The function
 * then loops: receive one packet, dispatch on its first character, send
 * the reply.  The loop is left through the arch handler (default_handle)
 * when that returns >= 0 or the packet was a detach/kill.
 *
 * Returns 1 if the exception is to be passed on (ks->pass_exception),
 * 0 otherwise.
 */
static int gdb_serial_stub(struct kgdb_state *ks)
{
	int error = 0;
	int tmp;

	/* Clear the out buffer. */
	memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));

	if (kgdb_connected) {
		unsigned char thref[8];
		char *ptr;

		/* Reply to host that an exception has occurred */
		ptr = remcom_out_buffer;
		*ptr++ = 'T';
		ptr = pack_hex_byte(ptr, ks->signo);
		ptr += strlen(strcpy(ptr, "thread:"));
		int_to_threadref(thref, shadow_pid(current->pid));
		ptr = pack_threadid(ptr, thref);
		*ptr++ = ';';
		put_packet(remcom_out_buffer);
	}

	/* Default the register-access thread to the task that trapped. */
	kgdb_usethread = kgdb_info[ks->cpu].task;
	ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
	ks->pass_exception = 0;

	while (1) {
		error = 0;

		/* Clear the out buffer. */
		memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));

		get_packet(remcom_in_buffer);

		switch (remcom_in_buffer[0]) {
		case '?': /* gdbserial status */
			gdb_cmd_status(ks);
			break;
		case 'g': /* return the value of the CPU registers */
			gdb_cmd_getregs(ks);
			break;
		case 'G': /* set the value of the CPU registers - return OK */
			gdb_cmd_setregs(ks);
			break;
		case 'm': /* mAA..AA,LLLL  Read LLLL bytes at address AA..AA */
			gdb_cmd_memread(ks);
			break;
		case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
			gdb_cmd_memwrite(ks);
			break;
		case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
			gdb_cmd_binwrite(ks);
			break;
			/* kill or detach. KGDB should treat this like a
			 * continue.
			 */
		case 'D': /* Debugger detach */
		case 'k': /* Debugger detach via kill */
			gdb_cmd_detachkill(ks);
			goto default_handle;
		case 'R': /* Reboot */
			if (gdb_cmd_reboot(ks))
				goto default_handle;
			break;
		case 'q': /* query command */
			gdb_cmd_query(ks);
			break;
		case 'H': /* task related */
			gdb_cmd_task(ks);
			break;
		case 'T': /* Query thread status */
			gdb_cmd_thread(ks);
			break;
		case 'z': /* Break point remove */
		case 'Z': /* Break point set */
			gdb_cmd_break(ks);
			break;
		case 'C': /* Exception passing */
			/*
			 * tmp > 0: detach and pass; tmp == 0: bad packet,
			 * just reply; tmp < 0: packet was rewritten to 'c'.
			 */
			tmp = gdb_cmd_exception_pass(ks);
			if (tmp > 0)
				goto default_handle;
			if (tmp == 0)
				break;
			/* Fall through on tmp < 0 */
		case 'c': /* Continue packet */
		case 's': /* Single step packet */
			if (kgdb_contthread && kgdb_contthread != current) {
				/* Can't switch threads in kgdb */
				error_packet(remcom_out_buffer, -EINVAL);
				break;
			}
			kgdb_activate_sw_breakpoints();
			/* Fall through to default processing */
		default:
default_handle:
			error = kgdb_arch_handle_exception(ks->ex_vector,
						ks->signo,
						ks->err_code,
						remcom_in_buffer,
						remcom_out_buffer,
						ks->linux_regs);
			/*
			 * Leave cmd processing on error, detach,
			 * kill, continue, or single step.
			 */
			if (error >= 0 || remcom_in_buffer[0] == 'D' ||
			    remcom_in_buffer[0] == 'k') {
				error = 0;
				goto kgdb_exit;
			}

		}

		/* reply to the request */
		put_packet(remcom_out_buffer);
	}

kgdb_exit:
	if (ks->pass_exception)
		error = 1;
	return error;
}
1308
/*
 * Detect an exception raised on the CPU that already owns the debugger
 * (kgdb_active equals this CPU).
 *
 * Returns 0 when this is not a re-entry.  On re-entry, if a BP_SET
 * breakpoint sits at the faulting address it is marked removed, the
 * remaining breakpoints are re-activated, a warning is logged and 1 is
 * returned so the caller can back out.  Otherwise all breakpoints are
 * dropped and the kernel panics.
 */
static int kgdb_reenter_check(struct kgdb_state *ks)
{
	unsigned long addr;

	if (atomic_read(&kgdb_active) != raw_smp_processor_id())
		return 0;

	/* Panic on recursive debugger calls: */
	exception_level++;
	addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
	/* Back to BP_SET so the offending entry can be looked up below. */
	kgdb_deactivate_sw_breakpoints();

	/*
	 * If the break point removed ok at the place exception
	 * occurred, try to recover and print a warning to the end
	 * user because the user planted a breakpoint in a place that
	 * KGDB needs in order to function.
	 */
	if (kgdb_remove_sw_break(addr) == 0) {
		exception_level = 0;
		kgdb_skipexception(ks->ex_vector, ks->linux_regs);
		kgdb_activate_sw_breakpoints();
		printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
			addr);
		WARN_ON_ONCE(1);

		return 1;
	}
	remove_all_break();
	kgdb_skipexception(ks->ex_vector, ks->linux_regs);

	/* Second nested failure: panic without the extra message. */
	if (exception_level > 1) {
		dump_stack();
		panic("Recursive entry to debugger");
	}

	printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
	dump_stack();
	panic("Recursive entry to debugger");

	return 1;
}
1351
1352/*
1353 * kgdb_handle_exception() - main entry point from a kernel exception
1354 *
1355 * Locking hierarchy:
1356 * interface locks, if any (begin_session)
1357 * kgdb lock (kgdb_active)
1358 */
1359int
1360kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1361{
1362 struct kgdb_state kgdb_var;
1363 struct kgdb_state *ks = &kgdb_var;
1364 unsigned long flags;
1365 int error = 0;
1366 int i, cpu;
1367
1368 ks->cpu = raw_smp_processor_id();
1369 ks->ex_vector = evector;
1370 ks->signo = signo;
1371 ks->ex_vector = evector;
1372 ks->err_code = ecode;
1373 ks->kgdb_usethreadid = 0;
1374 ks->linux_regs = regs;
1375
1376 if (kgdb_reenter_check(ks))
1377 return 0; /* Ouch, double exception ! */
1378
1379acquirelock:
1380 /*
1381 * Interrupts will be restored by the 'trap return' code, except when
1382 * single stepping.
1383 */
1384 local_irq_save(flags);
1385
1386 cpu = raw_smp_processor_id();
1387
1388 /*
1389 * Acquire the kgdb_active lock:
1390 */
1391 while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1)
1392 cpu_relax();
1393
1394 /*
1395 * Do not start the debugger connection on this CPU if the last
1396 * instance of the exception handler wanted to come into the
1397 * debugger on a different CPU via a single step
1398 */
1399 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1400 atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
1401
1402 atomic_set(&kgdb_active, -1);
1403 clocksource_touch_watchdog();
1404 local_irq_restore(flags);
1405
1406 goto acquirelock;
1407 }
1408
1409 if (!kgdb_io_ready(1)) {
1410 error = 1;
1411 goto kgdb_restore; /* No I/O connection, so resume the system */
1412 }
1413
1414 /*
1415 * Don't enter if we have hit a removed breakpoint.
1416 */
1417 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
1418 goto kgdb_restore;
1419
1420 /* Call the I/O driver's pre_exception routine */
1421 if (kgdb_io_ops->pre_exception)
1422 kgdb_io_ops->pre_exception();
1423
1424 kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
1425 kgdb_info[ks->cpu].task = current;
1426
1427 kgdb_disable_hw_debug(ks->linux_regs);
1428
1429 /*
1430 * Get the passive CPU lock which will hold all the non-primary
1431 * CPU in a spin state while the debugger is active
1432 */
1433 if (!kgdb_single_step || !kgdb_contthread) {
1434 for (i = 0; i < NR_CPUS; i++)
1435 atomic_set(&passive_cpu_wait[i], 1);
1436 }
1437
1438 /*
1439 * spin_lock code is good enough as a barrier so we don't
1440 * need one here:
1441 */
1442 atomic_set(&cpu_in_kgdb[ks->cpu], 1);
1443
1444#ifdef CONFIG_SMP
1445 /* Signal the other CPUs to enter kgdb_wait() */
1446 if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup)
1447 kgdb_roundup_cpus(flags);
1448#endif
1449
1450 /*
1451 * Wait for the other CPUs to be notified and be waiting for us:
1452 */
1453 for_each_online_cpu(i) {
1454 while (!atomic_read(&cpu_in_kgdb[i]))
1455 cpu_relax();
1456 }
1457
1458 /*
1459 * At this point the primary processor is completely
1460 * in the debugger and all secondary CPUs are quiescent
1461 */
1462 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1463 kgdb_deactivate_sw_breakpoints();
1464 kgdb_single_step = 0;
1465 kgdb_contthread = NULL;
1466 exception_level = 0;
1467
1468 /* Talk to debugger with gdbserial protocol */
1469 error = gdb_serial_stub(ks);
1470
1471 /* Call the I/O driver's post_exception routine */
1472 if (kgdb_io_ops->post_exception)
1473 kgdb_io_ops->post_exception();
1474
1475 kgdb_info[ks->cpu].debuggerinfo = NULL;
1476 kgdb_info[ks->cpu].task = NULL;
1477 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1478
1479 if (!kgdb_single_step || !kgdb_contthread) {
1480 for (i = NR_CPUS-1; i >= 0; i--)
1481 atomic_set(&passive_cpu_wait[i], 0);
1482 /*
1483 * Wait till all the CPUs have quit
1484 * from the debugger.
1485 */
1486 for_each_online_cpu(i) {
1487 while (atomic_read(&cpu_in_kgdb[i]))
1488 cpu_relax();
1489 }
1490 }
1491
1492kgdb_restore:
1493 /* Free kgdb_active */
1494 atomic_set(&kgdb_active, -1);
1495 clocksource_touch_watchdog();
1496 local_irq_restore(flags);
1497
1498 return error;
1499}
1500
/*
 * NMI-side entry for secondary CPUs.
 *
 * On SMP, if another CPU currently owns the debugger and has flagged
 * itself in cpu_in_kgdb[], and @cpu is not parked yet, park @cpu in
 * kgdb_wait() with @regs and return 0.  In every other case (including
 * !CONFIG_SMP) return 1 to tell the caller nothing was done.
 */
int kgdb_nmicallback(int cpu, void *regs)
{
#ifdef CONFIG_SMP
	/*
	 * Sample the owner once: kgdb_active is -1 while nobody is in the
	 * debugger, and that value must never be used as an index into
	 * cpu_in_kgdb[].
	 */
	int owner = atomic_read(&kgdb_active);

	if (owner >= 0 && owner != cpu &&
	    !atomic_read(&cpu_in_kgdb[cpu]) &&
	    atomic_read(&cpu_in_kgdb[owner])) {
		kgdb_wait((struct pt_regs *)regs);
		return 0;
	}
#endif
	return 1;
}
1513
1514void kgdb_console_write(struct console *co, const char *s, unsigned count)
1515{
1516 unsigned long flags;
1517
1518 /* If we're debugging, or KGDB has not connected, don't try
1519 * and print. */
1520 if (!kgdb_connected || atomic_read(&kgdb_active) != -1)
1521 return;
1522
1523 local_irq_save(flags);
1524 kgdb_msg_write(s, count);
1525 local_irq_restore(flags);
1526}
1527
1528static struct console kgdbcons = {
1529 .name = "kgdb",
1530 .write = kgdb_console_write,
1531 .flags = CON_PRINTBUFFER | CON_ENABLED,
1532 .index = -1,
1533};
1534
#ifdef CONFIG_MAGIC_SYSRQ
/*
 * SysRq handler (registered on 'g' by kgdb_register_callbacks()): drop
 * into the debugger.  Refuses, with a message, when no I/O driver is set.
 */
static void sysrq_handle_gdb(int key, struct tty_struct *tty)
{
	if (kgdb_io_ops) {
		/* Only announce the entry while no debugger is attached. */
		if (!kgdb_connected)
			printk(KERN_CRIT "Entering KGDB\n");

		kgdb_breakpoint();
		return;
	}

	printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
}

/* SysRq operation descriptor wrapping sysrq_handle_gdb(). */
static struct sysrq_key_op sysrq_gdb_op = {
	.handler	= sysrq_handle_gdb,
	.action_msg	= "GDB",
	.help_msg	= "Gdb",
};
#endif
1554
1555static void kgdb_register_callbacks(void)
1556{
1557 if (!kgdb_io_module_registered) {
1558 kgdb_io_module_registered = 1;
1559 kgdb_arch_init();
1560#ifdef CONFIG_MAGIC_SYSRQ
1561 register_sysrq_key('g', &sysrq_gdb_op);
1562#endif
1563 if (kgdb_use_con && !kgdb_con_registered) {
1564 register_console(&kgdbcons);
1565 kgdb_con_registered = 1;
1566 }
1567 }
1568}
1569
1570static void kgdb_unregister_callbacks(void)
1571{
1572 /*
1573 * When this routine is called KGDB should unregister from the
1574 * panic handler and clean up, making sure it is not handling any
1575 * break exceptions at the time.
1576 */
1577 if (kgdb_io_module_registered) {
1578 kgdb_io_module_registered = 0;
1579 kgdb_arch_exit();
1580#ifdef CONFIG_MAGIC_SYSRQ
1581 unregister_sysrq_key('g', &sysrq_gdb_op);
1582#endif
1583 if (kgdb_con_registered) {
1584 unregister_console(&kgdbcons);
1585 kgdb_con_registered = 0;
1586 }
1587 }
1588}
1589
1590static void kgdb_initial_breakpoint(void)
1591{
1592 kgdb_break_asap = 0;
1593
1594 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
1595 kgdb_breakpoint();
1596}
1597
1598/**
1599 * kgdb_register_io_module - register KGDB IO module
1600 * @new_kgdb_io_ops: the io ops vector
1601 *
1602 * Register it with the KGDB core.
1603 */
1604int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops)
1605{
1606 int err;
1607
1608 spin_lock(&kgdb_registration_lock);
1609
1610 if (kgdb_io_ops) {
1611 spin_unlock(&kgdb_registration_lock);
1612
1613 printk(KERN_ERR "kgdb: Another I/O driver is already "
1614 "registered with KGDB.\n");
1615 return -EBUSY;
1616 }
1617
1618 if (new_kgdb_io_ops->init) {
1619 err = new_kgdb_io_ops->init();
1620 if (err) {
1621 spin_unlock(&kgdb_registration_lock);
1622 return err;
1623 }
1624 }
1625
1626 kgdb_io_ops = new_kgdb_io_ops;
1627
1628 spin_unlock(&kgdb_registration_lock);
1629
1630 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
1631 new_kgdb_io_ops->name);
1632
1633 /* Arm KGDB now. */
1634 kgdb_register_callbacks();
1635
1636 if (kgdb_break_asap)
1637 kgdb_initial_breakpoint();
1638
1639 return 0;
1640}
1641EXPORT_SYMBOL_GPL(kgdb_register_io_module);
1642
1643/**
1644 * kgdb_unregister_io_module - unregister KGDB IO module
1645 * @old_kgdb_io_ops: the io ops vector
1646 *
1647 * Unregister it with the KGDB core.
1648 */
1649void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops)
1650{
1651 BUG_ON(kgdb_connected);
1652
1653 /*
1654 * KGDB is no longer able to communicate out, so
1655 * unregister our callbacks and reset state.
1656 */
1657 kgdb_unregister_callbacks();
1658
1659 spin_lock(&kgdb_registration_lock);
1660
1661 WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops);
1662 kgdb_io_ops = NULL;
1663
1664 spin_unlock(&kgdb_registration_lock);
1665
1666 printk(KERN_INFO
1667 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
1668 old_kgdb_io_ops->name);
1669}
1670EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
1671
1672/**
1673 * kgdb_breakpoint - generate breakpoint exception
1674 *
1675 * This function will generate a breakpoint exception. It is used at the
1676 * beginning of a program to sync up with a debugger and can be used
1677 * otherwise as a quick means to stop program execution and "break" into
1678 * the debugger.
1679 */
1680void kgdb_breakpoint(void)
1681{
1682 atomic_set(&kgdb_setting_breakpoint, 1);
1683 wmb(); /* Sync point before breakpoint */
1684 arch_kgdb_breakpoint();
1685 wmb(); /* Sync point after breakpoint */
1686 atomic_set(&kgdb_setting_breakpoint, 0);
1687}
1688EXPORT_SYMBOL_GPL(kgdb_breakpoint);
1689
1690static int __init opt_kgdb_wait(char *str)
1691{
1692 kgdb_break_asap = 1;
1693
1694 if (kgdb_io_module_registered)
1695 kgdb_initial_breakpoint();
1696
1697 return 0;
1698}
1699
1700early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 22be3ff3f363..e2764047ec03 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -165,7 +165,7 @@ static int ____call_usermodehelper(void *data)
165 } 165 }
166 166
167 /* We can run anywhere, unlike our parent keventd(). */ 167 /* We can run anywhere, unlike our parent keventd(). */
168 set_cpus_allowed(current, CPU_MASK_ALL); 168 set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR);
169 169
170 /* 170 /*
171 * Our parent is keventd, which runs with elevated scheduling priority. 171 * Our parent is keventd, which runs with elevated scheduling priority.
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fcfb580c3afc..1e0250cb9486 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -72,6 +72,18 @@ DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
72DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 72DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74 74
75/*
76 * Normally, functions that we'd want to prohibit kprobes in, are marked
77 * __kprobes. But, there are cases where such functions already belong to
78 * a different section (__sched for preempt_schedule)
79 *
80 * For such cases, we now have a blacklist
81 */
82struct kprobe_blackpoint kprobe_blacklist[] = {
83 {"preempt_schedule",},
84 {NULL} /* Terminator */
85};
86
75#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 87#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
76/* 88/*
77 * kprobe->ainsn.insn points to the copy of the instruction to be 89 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -417,6 +429,21 @@ static inline void free_rp_inst(struct kretprobe *rp)
417 } 429 }
418} 430}
419 431
432static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
433{
434 unsigned long flags;
435 struct kretprobe_instance *ri;
436 struct hlist_node *pos, *next;
437 /* No race here */
438 spin_lock_irqsave(&kretprobe_lock, flags);
439 hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
440 ri->rp = NULL;
441 hlist_del(&ri->uflist);
442 }
443 spin_unlock_irqrestore(&kretprobe_lock, flags);
444 free_rp_inst(rp);
445}
446
420/* 447/*
421 * Keep all fields in the kprobe consistent 448 * Keep all fields in the kprobe consistent
422 */ 449 */
@@ -492,9 +519,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
492 519
493static int __kprobes in_kprobes_functions(unsigned long addr) 520static int __kprobes in_kprobes_functions(unsigned long addr)
494{ 521{
522 struct kprobe_blackpoint *kb;
523
495 if (addr >= (unsigned long)__kprobes_text_start && 524 if (addr >= (unsigned long)__kprobes_text_start &&
496 addr < (unsigned long)__kprobes_text_end) 525 addr < (unsigned long)__kprobes_text_end)
497 return -EINVAL; 526 return -EINVAL;
527 /*
528 * If there exists a kprobe_blacklist, verify and
529 * fail any probe registration in the prohibited area
530 */
531 for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
532 if (kb->start_addr) {
533 if (addr >= kb->start_addr &&
534 addr < (kb->start_addr + kb->range))
535 return -EINVAL;
536 }
537 }
498 return 0; 538 return 0;
499} 539}
500 540
@@ -555,6 +595,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
555 } 595 }
556 596
557 p->nmissed = 0; 597 p->nmissed = 0;
598 INIT_LIST_HEAD(&p->list);
558 mutex_lock(&kprobe_mutex); 599 mutex_lock(&kprobe_mutex);
559 old_p = get_kprobe(p->addr); 600 old_p = get_kprobe(p->addr);
560 if (old_p) { 601 if (old_p) {
@@ -581,35 +622,28 @@ out:
581 return ret; 622 return ret;
582} 623}
583 624
584int __kprobes register_kprobe(struct kprobe *p) 625/*
585{ 626 * Unregister a kprobe without a scheduler synchronization.
586 return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); 627 */
587} 628static int __kprobes __unregister_kprobe_top(struct kprobe *p)
588
589void __kprobes unregister_kprobe(struct kprobe *p)
590{ 629{
591 struct module *mod;
592 struct kprobe *old_p, *list_p; 630 struct kprobe *old_p, *list_p;
593 int cleanup_p;
594 631
595 mutex_lock(&kprobe_mutex);
596 old_p = get_kprobe(p->addr); 632 old_p = get_kprobe(p->addr);
597 if (unlikely(!old_p)) { 633 if (unlikely(!old_p))
598 mutex_unlock(&kprobe_mutex); 634 return -EINVAL;
599 return; 635
600 }
601 if (p != old_p) { 636 if (p != old_p) {
602 list_for_each_entry_rcu(list_p, &old_p->list, list) 637 list_for_each_entry_rcu(list_p, &old_p->list, list)
603 if (list_p == p) 638 if (list_p == p)
604 /* kprobe p is a valid probe */ 639 /* kprobe p is a valid probe */
605 goto valid_p; 640 goto valid_p;
606 mutex_unlock(&kprobe_mutex); 641 return -EINVAL;
607 return;
608 } 642 }
609valid_p: 643valid_p:
610 if (old_p == p || 644 if (old_p == p ||
611 (old_p->pre_handler == aggr_pre_handler && 645 (old_p->pre_handler == aggr_pre_handler &&
612 p->list.next == &old_p->list && p->list.prev == &old_p->list)) { 646 list_is_singular(&old_p->list))) {
613 /* 647 /*
614 * Only probe on the hash list. Disarm only if kprobes are 648 * Only probe on the hash list. Disarm only if kprobes are
615 * enabled - otherwise, the breakpoint would already have 649 * enabled - otherwise, the breakpoint would already have
@@ -618,43 +652,97 @@ valid_p:
618 if (kprobe_enabled) 652 if (kprobe_enabled)
619 arch_disarm_kprobe(p); 653 arch_disarm_kprobe(p);
620 hlist_del_rcu(&old_p->hlist); 654 hlist_del_rcu(&old_p->hlist);
621 cleanup_p = 1;
622 } else { 655 } else {
656 if (p->break_handler)
657 old_p->break_handler = NULL;
658 if (p->post_handler) {
659 list_for_each_entry_rcu(list_p, &old_p->list, list) {
660 if ((list_p != p) && (list_p->post_handler))
661 goto noclean;
662 }
663 old_p->post_handler = NULL;
664 }
665noclean:
623 list_del_rcu(&p->list); 666 list_del_rcu(&p->list);
624 cleanup_p = 0;
625 } 667 }
668 return 0;
669}
626 670
627 mutex_unlock(&kprobe_mutex); 671static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
672{
673 struct module *mod;
674 struct kprobe *old_p;
628 675
629 synchronize_sched();
630 if (p->mod_refcounted) { 676 if (p->mod_refcounted) {
631 mod = module_text_address((unsigned long)p->addr); 677 mod = module_text_address((unsigned long)p->addr);
632 if (mod) 678 if (mod)
633 module_put(mod); 679 module_put(mod);
634 } 680 }
635 681
636 if (cleanup_p) { 682 if (list_empty(&p->list) || list_is_singular(&p->list)) {
637 if (p != old_p) { 683 if (!list_empty(&p->list)) {
638 list_del_rcu(&p->list); 684 /* "p" is the last child of an aggr_kprobe */
685 old_p = list_entry(p->list.next, struct kprobe, list);
686 list_del(&p->list);
639 kfree(old_p); 687 kfree(old_p);
640 } 688 }
641 arch_remove_kprobe(p); 689 arch_remove_kprobe(p);
642 } else { 690 }
643 mutex_lock(&kprobe_mutex); 691}
644 if (p->break_handler) 692
645 old_p->break_handler = NULL; 693static int __register_kprobes(struct kprobe **kps, int num,
646 if (p->post_handler){ 694 unsigned long called_from)
647 list_for_each_entry_rcu(list_p, &old_p->list, list){ 695{
648 if (list_p->post_handler){ 696 int i, ret = 0;
649 cleanup_p = 2; 697
650 break; 698 if (num <= 0)
651 } 699 return -EINVAL;
652 } 700 for (i = 0; i < num; i++) {
653 if (cleanup_p == 0) 701 ret = __register_kprobe(kps[i], called_from);
654 old_p->post_handler = NULL; 702 if (ret < 0 && i > 0) {
703 unregister_kprobes(kps, i);
704 break;
655 } 705 }
656 mutex_unlock(&kprobe_mutex);
657 } 706 }
707 return ret;
708}
709
710/*
711 * Registration and unregistration functions for kprobe.
712 */
713int __kprobes register_kprobe(struct kprobe *p)
714{
715 return __register_kprobes(&p, 1,
716 (unsigned long)__builtin_return_address(0));
717}
718
719void __kprobes unregister_kprobe(struct kprobe *p)
720{
721 unregister_kprobes(&p, 1);
722}
723
724int __kprobes register_kprobes(struct kprobe **kps, int num)
725{
726 return __register_kprobes(kps, num,
727 (unsigned long)__builtin_return_address(0));
728}
729
730void __kprobes unregister_kprobes(struct kprobe **kps, int num)
731{
732 int i;
733
734 if (num <= 0)
735 return;
736 mutex_lock(&kprobe_mutex);
737 for (i = 0; i < num; i++)
738 if (__unregister_kprobe_top(kps[i]) < 0)
739 kps[i]->addr = NULL;
740 mutex_unlock(&kprobe_mutex);
741
742 synchronize_sched();
743 for (i = 0; i < num; i++)
744 if (kps[i]->addr)
745 __unregister_kprobe_bottom(kps[i]);
658} 746}
659 747
660static struct notifier_block kprobe_exceptions_nb = { 748static struct notifier_block kprobe_exceptions_nb = {
@@ -667,24 +755,69 @@ unsigned long __weak arch_deref_entry_point(void *entry)
667 return (unsigned long)entry; 755 return (unsigned long)entry;
668} 756}
669 757
670int __kprobes register_jprobe(struct jprobe *jp) 758static int __register_jprobes(struct jprobe **jps, int num,
759 unsigned long called_from)
671{ 760{
672 unsigned long addr = arch_deref_entry_point(jp->entry); 761 struct jprobe *jp;
762 int ret = 0, i;
673 763
674 if (!kernel_text_address(addr)) 764 if (num <= 0)
675 return -EINVAL; 765 return -EINVAL;
766 for (i = 0; i < num; i++) {
767 unsigned long addr;
768 jp = jps[i];
769 addr = arch_deref_entry_point(jp->entry);
770
771 if (!kernel_text_address(addr))
772 ret = -EINVAL;
773 else {
774 /* Todo: Verify probepoint is a function entry point */
775 jp->kp.pre_handler = setjmp_pre_handler;
776 jp->kp.break_handler = longjmp_break_handler;
777 ret = __register_kprobe(&jp->kp, called_from);
778 }
779 if (ret < 0 && i > 0) {
780 unregister_jprobes(jps, i);
781 break;
782 }
783 }
784 return ret;
785}
676 786
677 /* Todo: Verify probepoint is a function entry point */ 787int __kprobes register_jprobe(struct jprobe *jp)
678 jp->kp.pre_handler = setjmp_pre_handler; 788{
679 jp->kp.break_handler = longjmp_break_handler; 789 return __register_jprobes(&jp, 1,
680
681 return __register_kprobe(&jp->kp,
682 (unsigned long)__builtin_return_address(0)); 790 (unsigned long)__builtin_return_address(0));
683} 791}
684 792
685void __kprobes unregister_jprobe(struct jprobe *jp) 793void __kprobes unregister_jprobe(struct jprobe *jp)
686{ 794{
687 unregister_kprobe(&jp->kp); 795 unregister_jprobes(&jp, 1);
796}
797
798int __kprobes register_jprobes(struct jprobe **jps, int num)
799{
800 return __register_jprobes(jps, num,
801 (unsigned long)__builtin_return_address(0));
802}
803
804void __kprobes unregister_jprobes(struct jprobe **jps, int num)
805{
806 int i;
807
808 if (num <= 0)
809 return;
810 mutex_lock(&kprobe_mutex);
811 for (i = 0; i < num; i++)
812 if (__unregister_kprobe_top(&jps[i]->kp) < 0)
813 jps[i]->kp.addr = NULL;
814 mutex_unlock(&kprobe_mutex);
815
816 synchronize_sched();
817 for (i = 0; i < num; i++) {
818 if (jps[i]->kp.addr)
819 __unregister_kprobe_bottom(&jps[i]->kp);
820 }
688} 821}
689 822
690#ifdef CONFIG_KRETPROBES 823#ifdef CONFIG_KRETPROBES
@@ -725,7 +858,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
725 return 0; 858 return 0;
726} 859}
727 860
728int __kprobes register_kretprobe(struct kretprobe *rp) 861static int __kprobes __register_kretprobe(struct kretprobe *rp,
862 unsigned long called_from)
729{ 863{
730 int ret = 0; 864 int ret = 0;
731 struct kretprobe_instance *inst; 865 struct kretprobe_instance *inst;
@@ -771,46 +905,101 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
771 905
772 rp->nmissed = 0; 906 rp->nmissed = 0;
773 /* Establish function entry probe point */ 907 /* Establish function entry probe point */
774 if ((ret = __register_kprobe(&rp->kp, 908 ret = __register_kprobe(&rp->kp, called_from);
775 (unsigned long)__builtin_return_address(0))) != 0) 909 if (ret != 0)
776 free_rp_inst(rp); 910 free_rp_inst(rp);
777 return ret; 911 return ret;
778} 912}
779 913
914static int __register_kretprobes(struct kretprobe **rps, int num,
915 unsigned long called_from)
916{
917 int ret = 0, i;
918
919 if (num <= 0)
920 return -EINVAL;
921 for (i = 0; i < num; i++) {
922 ret = __register_kretprobe(rps[i], called_from);
923 if (ret < 0 && i > 0) {
924 unregister_kretprobes(rps, i);
925 break;
926 }
927 }
928 return ret;
929}
930
931int __kprobes register_kretprobe(struct kretprobe *rp)
932{
933 return __register_kretprobes(&rp, 1,
934 (unsigned long)__builtin_return_address(0));
935}
936
937void __kprobes unregister_kretprobe(struct kretprobe *rp)
938{
939 unregister_kretprobes(&rp, 1);
940}
941
942int __kprobes register_kretprobes(struct kretprobe **rps, int num)
943{
944 return __register_kretprobes(rps, num,
945 (unsigned long)__builtin_return_address(0));
946}
947
948void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
949{
950 int i;
951
952 if (num <= 0)
953 return;
954 mutex_lock(&kprobe_mutex);
955 for (i = 0; i < num; i++)
956 if (__unregister_kprobe_top(&rps[i]->kp) < 0)
957 rps[i]->kp.addr = NULL;
958 mutex_unlock(&kprobe_mutex);
959
960 synchronize_sched();
961 for (i = 0; i < num; i++) {
962 if (rps[i]->kp.addr) {
963 __unregister_kprobe_bottom(&rps[i]->kp);
964 cleanup_rp_inst(rps[i]);
965 }
966 }
967}
968
780#else /* CONFIG_KRETPROBES */ 969#else /* CONFIG_KRETPROBES */
781int __kprobes register_kretprobe(struct kretprobe *rp) 970int __kprobes register_kretprobe(struct kretprobe *rp)
782{ 971{
783 return -ENOSYS; 972 return -ENOSYS;
784} 973}
785 974
786static int __kprobes pre_handler_kretprobe(struct kprobe *p, 975int __kprobes register_kretprobes(struct kretprobe **rps, int num)
787 struct pt_regs *regs)
788{ 976{
789 return 0; 977 return -ENOSYS;
790} 978}
791#endif /* CONFIG_KRETPROBES */
792
793void __kprobes unregister_kretprobe(struct kretprobe *rp) 979void __kprobes unregister_kretprobe(struct kretprobe *rp)
794{ 980{
795 unsigned long flags; 981}
796 struct kretprobe_instance *ri;
797 struct hlist_node *pos, *next;
798 982
799 unregister_kprobe(&rp->kp); 983void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
984{
985}
800 986
801 /* No race here */ 987static int __kprobes pre_handler_kretprobe(struct kprobe *p,
802 spin_lock_irqsave(&kretprobe_lock, flags); 988 struct pt_regs *regs)
803 hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { 989{
804 ri->rp = NULL; 990 return 0;
805 hlist_del(&ri->uflist);
806 }
807 spin_unlock_irqrestore(&kretprobe_lock, flags);
808 free_rp_inst(rp);
809} 991}
810 992
993#endif /* CONFIG_KRETPROBES */
994
811static int __init init_kprobes(void) 995static int __init init_kprobes(void)
812{ 996{
813 int i, err = 0; 997 int i, err = 0;
998 unsigned long offset = 0, size = 0;
999 char *modname, namebuf[128];
1000 const char *symbol_name;
1001 void *addr;
1002 struct kprobe_blackpoint *kb;
814 1003
815 /* FIXME allocate the probe table, currently defined statically */ 1004 /* FIXME allocate the probe table, currently defined statically */
816 /* initialize all list heads */ 1005 /* initialize all list heads */
@@ -819,6 +1008,28 @@ static int __init init_kprobes(void)
819 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1008 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
820 } 1009 }
821 1010
1011 /*
1012 * Lookup and populate the kprobe_blacklist.
1013 *
1014 * Unlike the kretprobe blacklist, we'll need to determine
1015 * the range of addresses that belong to the said functions,
1016 * since a kprobe need not necessarily be at the beginning
1017 * of a function.
1018 */
1019 for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
1020 kprobe_lookup_name(kb->name, addr);
1021 if (!addr)
1022 continue;
1023
1024 kb->start_addr = (unsigned long)addr;
1025 symbol_name = kallsyms_lookup(kb->start_addr,
1026 &size, &offset, &modname, namebuf);
1027 if (!symbol_name)
1028 kb->range = 0;
1029 else
1030 kb->range = size;
1031 }
1032
822 if (kretprobe_blacklist_size) { 1033 if (kretprobe_blacklist_size) {
823 /* lookup the function address from its name */ 1034 /* lookup the function address from its name */
824 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 1035 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
@@ -1066,8 +1277,12 @@ module_init(init_kprobes);
1066 1277
1067EXPORT_SYMBOL_GPL(register_kprobe); 1278EXPORT_SYMBOL_GPL(register_kprobe);
1068EXPORT_SYMBOL_GPL(unregister_kprobe); 1279EXPORT_SYMBOL_GPL(unregister_kprobe);
1280EXPORT_SYMBOL_GPL(register_kprobes);
1281EXPORT_SYMBOL_GPL(unregister_kprobes);
1069EXPORT_SYMBOL_GPL(register_jprobe); 1282EXPORT_SYMBOL_GPL(register_jprobe);
1070EXPORT_SYMBOL_GPL(unregister_jprobe); 1283EXPORT_SYMBOL_GPL(unregister_jprobe);
1284EXPORT_SYMBOL_GPL(register_jprobes);
1285EXPORT_SYMBOL_GPL(unregister_jprobes);
1071#ifdef CONFIG_KPROBES 1286#ifdef CONFIG_KPROBES
1072EXPORT_SYMBOL_GPL(jprobe_return); 1287EXPORT_SYMBOL_GPL(jprobe_return);
1073#endif 1288#endif
@@ -1075,4 +1290,6 @@ EXPORT_SYMBOL_GPL(jprobe_return);
1075#ifdef CONFIG_KPROBES 1290#ifdef CONFIG_KPROBES
1076EXPORT_SYMBOL_GPL(register_kretprobe); 1291EXPORT_SYMBOL_GPL(register_kretprobe);
1077EXPORT_SYMBOL_GPL(unregister_kretprobe); 1292EXPORT_SYMBOL_GPL(unregister_kretprobe);
1293EXPORT_SYMBOL_GPL(register_kretprobes);
1294EXPORT_SYMBOL_GPL(unregister_kretprobes);
1078#endif 1295#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 0ac887882f90..92cf6930ab51 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -13,7 +13,6 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <asm/semaphore.h>
17 16
18#define KTHREAD_NICE_LEVEL (-5) 17#define KTHREAD_NICE_LEVEL (-5)
19 18
@@ -180,6 +179,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
180 wait_task_inactive(k); 179 wait_task_inactive(k);
181 set_task_cpu(k, cpu); 180 set_task_cpu(k, cpu);
182 k->cpus_allowed = cpumask_of_cpu(cpu); 181 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1;
183} 183}
184EXPORT_SYMBOL(kthread_bind); 184EXPORT_SYMBOL(kthread_bind);
185 185
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index b4e3c85abe74..7c74dab0d21b 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -64,8 +64,8 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
64 return; 64 return;
65 65
66 for (i = 0; i < MAXLR; i++) { 66 for (i = 0; i < MAXLR; i++) {
67 int q; 67 int q, same = 1;
68 int same = 1; 68
69 /* Nothing stored: */ 69 /* Nothing stored: */
70 if (!latency_record[i].backtrace[0]) { 70 if (!latency_record[i].backtrace[0]) {
71 if (firstnonnull > i) 71 if (firstnonnull > i)
@@ -73,12 +73,15 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
73 continue; 73 continue;
74 } 74 }
75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { 75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
76 if (latency_record[i].backtrace[q] != 76 unsigned long record = lat->backtrace[q];
77 lat->backtrace[q]) 77
78 if (latency_record[i].backtrace[q] != record) {
78 same = 0; 79 same = 0;
79 if (same && lat->backtrace[q] == 0)
80 break; 80 break;
81 if (same && lat->backtrace[q] == ULONG_MAX) 81 }
82
83 /* 0 and ULONG_MAX entries mean end of backtrace: */
84 if (record == 0 || record == ULONG_MAX)
82 break; 85 break;
83 } 86 }
84 if (same) { 87 if (same) {
@@ -143,14 +146,18 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
143 for (i = 0; i < LT_SAVECOUNT ; i++) { 146 for (i = 0; i < LT_SAVECOUNT ; i++) {
144 struct latency_record *mylat; 147 struct latency_record *mylat;
145 int same = 1; 148 int same = 1;
149
146 mylat = &tsk->latency_record[i]; 150 mylat = &tsk->latency_record[i];
147 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { 151 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
148 if (mylat->backtrace[q] != 152 unsigned long record = lat.backtrace[q];
149 lat.backtrace[q]) 153
154 if (mylat->backtrace[q] != record) {
150 same = 0; 155 same = 0;
151 if (same && lat.backtrace[q] == 0)
152 break; 156 break;
153 if (same && lat.backtrace[q] == ULONG_MAX) 157 }
158
159 /* 0 and ULONG_MAX entries mean end of backtrace: */
160 if (record == 0 || record == ULONG_MAX)
154 break; 161 break;
155 } 162 }
156 if (same) { 163 if (same) {
diff --git a/kernel/marker.c b/kernel/marker.c
index 48a4ea5afffd..005b95954593 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -104,18 +104,18 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
104 char ptype; 104 char ptype;
105 105
106 /* 106 /*
107 * disabling preemption to make sure the teardown of the callbacks can 107 * preempt_disable does two things : disabling preemption to make sure
108 * be done correctly when they are in modules and they insure RCU read 108 * the teardown of the callbacks can be done correctly when they are in
109 * coherency. 109 * modules and they insure RCU read coherency.
110 */ 110 */
111 preempt_disable(); 111 preempt_disable();
112 ptype = ACCESS_ONCE(mdata->ptype); 112 ptype = mdata->ptype;
113 if (likely(!ptype)) { 113 if (likely(!ptype)) {
114 marker_probe_func *func; 114 marker_probe_func *func;
115 /* Must read the ptype before ptr. They are not data dependant, 115 /* Must read the ptype before ptr. They are not data dependant,
116 * so we put an explicit smp_rmb() here. */ 116 * so we put an explicit smp_rmb() here. */
117 smp_rmb(); 117 smp_rmb();
118 func = ACCESS_ONCE(mdata->single.func); 118 func = mdata->single.func;
119 /* Must read the ptr before private data. They are not data 119 /* Must read the ptr before private data. They are not data
120 * dependant, so we put an explicit smp_rmb() here. */ 120 * dependant, so we put an explicit smp_rmb() here. */
121 smp_rmb(); 121 smp_rmb();
@@ -133,7 +133,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
133 * in the fast path, so put the explicit barrier here. 133 * in the fast path, so put the explicit barrier here.
134 */ 134 */
135 smp_read_barrier_depends(); 135 smp_read_barrier_depends();
136 multi = ACCESS_ONCE(mdata->multi); 136 multi = mdata->multi;
137 for (i = 0; multi[i].func; i++) { 137 for (i = 0; multi[i].func; i++) {
138 va_start(args, fmt); 138 va_start(args, fmt);
139 multi[i].func(multi[i].probe_private, call_private, fmt, 139 multi[i].func(multi[i].probe_private, call_private, fmt,
@@ -161,13 +161,13 @@ void marker_probe_cb_noarg(const struct marker *mdata,
161 char ptype; 161 char ptype;
162 162
163 preempt_disable(); 163 preempt_disable();
164 ptype = ACCESS_ONCE(mdata->ptype); 164 ptype = mdata->ptype;
165 if (likely(!ptype)) { 165 if (likely(!ptype)) {
166 marker_probe_func *func; 166 marker_probe_func *func;
167 /* Must read the ptype before ptr. They are not data dependant, 167 /* Must read the ptype before ptr. They are not data dependant,
168 * so we put an explicit smp_rmb() here. */ 168 * so we put an explicit smp_rmb() here. */
169 smp_rmb(); 169 smp_rmb();
170 func = ACCESS_ONCE(mdata->single.func); 170 func = mdata->single.func;
171 /* Must read the ptr before private data. They are not data 171 /* Must read the ptr before private data. They are not data
172 * dependant, so we put an explicit smp_rmb() here. */ 172 * dependant, so we put an explicit smp_rmb() here. */
173 smp_rmb(); 173 smp_rmb();
@@ -183,7 +183,7 @@ void marker_probe_cb_noarg(const struct marker *mdata,
183 * in the fast path, so put the explicit barrier here. 183 * in the fast path, so put the explicit barrier here.
184 */ 184 */
185 smp_read_barrier_depends(); 185 smp_read_barrier_depends();
186 multi = ACCESS_ONCE(mdata->multi); 186 multi = mdata->multi;
187 for (i = 0; multi[i].func; i++) 187 for (i = 0; multi[i].func; i++)
188 multi[i].func(multi[i].probe_private, call_private, fmt, 188 multi[i].func(multi[i].probe_private, call_private, fmt,
189 &args); 189 &args);
@@ -551,9 +551,9 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
551 551
552/* 552/*
553 * Disable a marker and its probe callback. 553 * Disable a marker and its probe callback.
554 * Note: only after a synchronize_sched() issued after setting elem->call to the 554 * Note: only waiting an RCU period after setting elem->call to the empty
555 * empty function insures that the original callback is not used anymore. This 555 * function insures that the original callback is not used anymore. This insured
556 * insured by preemption disabling around the call site. 556 * by preempt_disable around the call site.
557 */ 557 */
558static void disable_marker(struct marker *elem) 558static void disable_marker(struct marker *elem)
559{ 559{
@@ -565,8 +565,8 @@ static void disable_marker(struct marker *elem)
565 elem->ptype = 0; /* single probe */ 565 elem->ptype = 0; /* single probe */
566 /* 566 /*
567 * Leave the private data and id there, because removal is racy and 567 * Leave the private data and id there, because removal is racy and
568 * should be done only after a synchronize_sched(). These are never used 568 * should be done only after an RCU period. These are never used until
569 * until the next initialization anyway. 569 * the next initialization anyway.
570 */ 570 */
571} 571}
572 572
@@ -601,9 +601,6 @@ void marker_update_probe_range(struct marker *begin,
601 601
602/* 602/*
603 * Update probes, removing the faulty probes. 603 * Update probes, removing the faulty probes.
604 * Issues a synchronize_sched() when no reference to the module passed
605 * as parameter is found in the probes so the probe module can be
606 * safely unloaded from now on.
607 * 604 *
608 * Internal callback only changed before the first probe is connected to it. 605 * Internal callback only changed before the first probe is connected to it.
609 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 606 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
@@ -674,6 +671,9 @@ int marker_probe_register(const char *name, const char *format,
674 entry->rcu_pending = 1; 671 entry->rcu_pending = 1;
675 /* write rcu_pending before calling the RCU callback */ 672 /* write rcu_pending before calling the RCU callback */
676 smp_wmb(); 673 smp_wmb();
674#ifdef CONFIG_PREEMPT_RCU
675 synchronize_sched(); /* Until we have the call_rcu_sched() */
676#endif
677 call_rcu(&entry->rcu, free_old_closure); 677 call_rcu(&entry->rcu, free_old_closure);
678end: 678end:
679 mutex_unlock(&markers_mutex); 679 mutex_unlock(&markers_mutex);
@@ -717,6 +717,9 @@ int marker_probe_unregister(const char *name,
717 entry->rcu_pending = 1; 717 entry->rcu_pending = 1;
718 /* write rcu_pending before calling the RCU callback */ 718 /* write rcu_pending before calling the RCU callback */
719 smp_wmb(); 719 smp_wmb();
720#ifdef CONFIG_PREEMPT_RCU
721 synchronize_sched(); /* Until we have the call_rcu_sched() */
722#endif
720 call_rcu(&entry->rcu, free_old_closure); 723 call_rcu(&entry->rcu, free_old_closure);
721 remove_marker(name); /* Ignore busy error message */ 724 remove_marker(name); /* Ignore busy error message */
722 ret = 0; 725 ret = 0;
@@ -795,6 +798,9 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
795 entry->rcu_pending = 1; 798 entry->rcu_pending = 1;
796 /* write rcu_pending before calling the RCU callback */ 799 /* write rcu_pending before calling the RCU callback */
797 smp_wmb(); 800 smp_wmb();
801#ifdef CONFIG_PREEMPT_RCU
802 synchronize_sched(); /* Until we have the call_rcu_sched() */
803#endif
798 call_rcu(&entry->rcu, free_old_closure); 804 call_rcu(&entry->rcu, free_old_closure);
799 remove_marker(entry->name); /* Ignore busy error message */ 805 remove_marker(entry->name); /* Ignore busy error message */
800end: 806end:
diff --git a/kernel/module.c b/kernel/module.c
index 5d437bffd8dc..8d6cccc6c3cf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -43,7 +43,6 @@
43#include <linux/mutex.h> 43#include <linux/mutex.h>
44#include <linux/unwind.h> 44#include <linux/unwind.h>
45#include <asm/uaccess.h> 45#include <asm/uaccess.h>
46#include <asm/semaphore.h>
47#include <asm/cacheflush.h> 46#include <asm/cacheflush.h>
48#include <linux/license.h> 47#include <linux/license.h>
49#include <asm/sections.h> 48#include <asm/sections.h>
@@ -664,7 +663,7 @@ static void free_module(struct module *mod);
664 663
665static void wait_for_zero_refcount(struct module *mod) 664static void wait_for_zero_refcount(struct module *mod)
666{ 665{
667 /* Since we might sleep for some time, drop the semaphore first */ 666 /* Since we might sleep for some time, release the mutex first */
668 mutex_unlock(&module_mutex); 667 mutex_unlock(&module_mutex);
669 for (;;) { 668 for (;;) {
670 DEBUGP("Looking at refcount...\n"); 669 DEBUGP("Looking at refcount...\n");
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6d792b66d854..5ca37fa50beb 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -92,7 +92,7 @@ static struct pid_namespace *create_pid_namespace(int level)
92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
93 93
94 for (i = 1; i < PIDMAP_ENTRIES; i++) { 94 for (i = 1; i < PIDMAP_ENTRIES; i++) {
95 ns->pidmap[i].page = 0; 95 ns->pidmap[i].page = NULL;
96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
97 } 97 }
98 98
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 2eae91f954ca..ae5c6c147c4b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1087,45 +1087,45 @@ static void check_process_timers(struct task_struct *tsk,
1087 maxfire = 20; 1087 maxfire = 20;
1088 prof_expires = cputime_zero; 1088 prof_expires = cputime_zero;
1089 while (!list_empty(timers)) { 1089 while (!list_empty(timers)) {
1090 struct cpu_timer_list *t = list_first_entry(timers, 1090 struct cpu_timer_list *tl = list_first_entry(timers,
1091 struct cpu_timer_list, 1091 struct cpu_timer_list,
1092 entry); 1092 entry);
1093 if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { 1093 if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) {
1094 prof_expires = t->expires.cpu; 1094 prof_expires = tl->expires.cpu;
1095 break; 1095 break;
1096 } 1096 }
1097 t->firing = 1; 1097 tl->firing = 1;
1098 list_move_tail(&t->entry, firing); 1098 list_move_tail(&tl->entry, firing);
1099 } 1099 }
1100 1100
1101 ++timers; 1101 ++timers;
1102 maxfire = 20; 1102 maxfire = 20;
1103 virt_expires = cputime_zero; 1103 virt_expires = cputime_zero;
1104 while (!list_empty(timers)) { 1104 while (!list_empty(timers)) {
1105 struct cpu_timer_list *t = list_first_entry(timers, 1105 struct cpu_timer_list *tl = list_first_entry(timers,
1106 struct cpu_timer_list, 1106 struct cpu_timer_list,
1107 entry); 1107 entry);
1108 if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { 1108 if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) {
1109 virt_expires = t->expires.cpu; 1109 virt_expires = tl->expires.cpu;
1110 break; 1110 break;
1111 } 1111 }
1112 t->firing = 1; 1112 tl->firing = 1;
1113 list_move_tail(&t->entry, firing); 1113 list_move_tail(&tl->entry, firing);
1114 } 1114 }
1115 1115
1116 ++timers; 1116 ++timers;
1117 maxfire = 20; 1117 maxfire = 20;
1118 sched_expires = 0; 1118 sched_expires = 0;
1119 while (!list_empty(timers)) { 1119 while (!list_empty(timers)) {
1120 struct cpu_timer_list *t = list_first_entry(timers, 1120 struct cpu_timer_list *tl = list_first_entry(timers,
1121 struct cpu_timer_list, 1121 struct cpu_timer_list,
1122 entry); 1122 entry);
1123 if (!--maxfire || sum_sched_runtime < t->expires.sched) { 1123 if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
1124 sched_expires = t->expires.sched; 1124 sched_expires = tl->expires.sched;
1125 break; 1125 break;
1126 } 1126 }
1127 t->firing = 1; 1127 tl->firing = 1;
1128 list_move_tail(&t->entry, firing); 1128 list_move_tail(&tl->entry, firing);
1129 } 1129 }
1130 1130
1131 /* 1131 /*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a9b04203a66d..8476956ffd92 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -37,7 +37,6 @@
37#include <linux/mutex.h> 37#include <linux/mutex.h>
38 38
39#include <asm/uaccess.h> 39#include <asm/uaccess.h>
40#include <asm/semaphore.h>
41#include <linux/list.h> 40#include <linux/list.h>
42#include <linux/init.h> 41#include <linux/init.h>
43#include <linux/compiler.h> 42#include <linux/compiler.h>
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 6233f3b4ae66..b45da40e8d25 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -19,16 +19,6 @@ config PM
19 will issue the hlt instruction if nothing is to be done, thereby 19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power. 20 sending the processor to sleep and saving power.
21 21
22config PM_LEGACY
23 bool "Legacy Power Management API (DEPRECATED)"
24 depends on PM
25 default n
26 ---help---
27 Support for pm_register() and friends. This old API is obsoleted
28 by the driver model.
29
30 If unsure, say N.
31
32config PM_DEBUG 22config PM_DEBUG
33 bool "Power Management Debug Support" 23 bool "Power Management Debug Support"
34 depends on PM 24 depends on PM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f7dfff28ecdb..597823b5b700 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -4,7 +4,6 @@ EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6obj-y := main.o 6obj-y := main.o
7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_PM_SLEEP) += process.o console.o 7obj-$(CONFIG_PM_SLEEP) += process.o console.o
9obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o 8obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o
10 9
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 89bcf4973ee5..b8628be2a465 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -7,17 +7,39 @@
7#include <linux/vt_kern.h> 7#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 8#include <linux/kbd_kern.h>
9#include <linux/console.h> 9#include <linux/console.h>
10#include <linux/module.h>
10#include "power.h" 11#include "power.h"
11 12
12#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) 13#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
13#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
14 15
15static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
17static int disable_vt_switch;
18
19/*
20 * Normally during a suspend, we allocate a new console and switch to it.
21 * When we resume, we switch back to the original console. This switch
22 * can be slow, so on systems where the framebuffer can handle restoration
23 * of video registers anyways, there's little point in doing the console
24 * switch. This function allows you to disable it by passing it '0'.
25 */
26void pm_set_vt_switch(int do_switch)
27{
28 acquire_console_sem();
29 disable_vt_switch = !do_switch;
30 release_console_sem();
31}
32EXPORT_SYMBOL(pm_set_vt_switch);
16 33
17int pm_prepare_console(void) 34int pm_prepare_console(void)
18{ 35{
19 acquire_console_sem(); 36 acquire_console_sem();
20 37
38 if (disable_vt_switch) {
39 release_console_sem();
40 return 0;
41 }
42
21 orig_fgconsole = fg_console; 43 orig_fgconsole = fg_console;
22 44
23 if (vc_allocate(SUSPEND_CONSOLE)) { 45 if (vc_allocate(SUSPEND_CONSOLE)) {
@@ -50,9 +72,12 @@ int pm_prepare_console(void)
50void pm_restore_console(void) 72void pm_restore_console(void)
51{ 73{
52 acquire_console_sem(); 74 acquire_console_sem();
75 if (disable_vt_switch) {
76 release_console_sem();
77 return;
78 }
53 set_console(orig_fgconsole); 79 set_console(orig_fgconsole);
54 release_console_sem(); 80 release_console_sem();
55 kmsg_redirect = orig_kmsg; 81 kmsg_redirect = orig_kmsg;
56 return;
57} 82}
58#endif 83#endif
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
deleted file mode 100644
index 60c73fa670d5..000000000000
--- a/kernel/power/pm.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/*
2 * pm.c - Power management interface
3 *
4 * Copyright (C) 2000 Andrew Henroid
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20#include <linux/init.h>
21#include <linux/module.h>
22#include <linux/spinlock.h>
23#include <linux/mm.h>
24#include <linux/slab.h>
25#include <linux/pm.h>
26#include <linux/pm_legacy.h>
27#include <linux/interrupt.h>
28#include <linux/mutex.h>
29
30/*
31 * Locking notes:
32 * pm_devs_lock can be a semaphore providing pm ops are not called
33 * from an interrupt handler (already a bad idea so no change here). Each
34 * change must be protected so that an unlink of an entry doesn't clash
35 * with a pm send - which is permitted to sleep in the current architecture
36 *
37 * Module unloads clashing with pm events now work out safely, the module
38 * unload path will block until the event has been sent. It may well block
39 * until a resume but that will be fine.
40 */
41
42static DEFINE_MUTEX(pm_devs_lock);
43static LIST_HEAD(pm_devs);
44
45/**
46 * pm_register - register a device with power management
47 * @type: device type
48 * @id: device ID
49 * @callback: callback function
50 *
51 * Add a device to the list of devices that wish to be notified about
52 * power management events. A &pm_dev structure is returned on success,
53 * on failure the return is %NULL.
54 *
55 * The callback function will be called in process context and
56 * it may sleep.
57 */
58
59struct pm_dev *pm_register(pm_dev_t type,
60 unsigned long id,
61 pm_callback callback)
62{
63 struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
64 if (dev) {
65 dev->type = type;
66 dev->id = id;
67 dev->callback = callback;
68
69 mutex_lock(&pm_devs_lock);
70 list_add(&dev->entry, &pm_devs);
71 mutex_unlock(&pm_devs_lock);
72 }
73 return dev;
74}
75
76/**
77 * pm_send - send request to a single device
78 * @dev: device to send to
79 * @rqst: power management request
80 * @data: data for the callback
81 *
82 * Issue a power management request to a given device. The
83 * %PM_SUSPEND and %PM_RESUME events are handled specially. The
84 * data field must hold the intended next state. No call is made
85 * if the state matches.
86 *
87 * BUGS: what stops two power management requests occurring in parallel
88 * and conflicting.
89 *
90 * WARNING: Calling pm_send directly is not generally recommended, in
91 * particular there is no locking against the pm_dev going away. The
92 * caller must maintain all needed locking or have 'inside knowledge'
93 * on the safety. Also remember that this function is not locked against
94 * pm_unregister. This means that you must handle SMP races on callback
95 * execution and unload yourself.
96 */
97
98static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
99{
100 int status = 0;
101 unsigned long prev_state, next_state;
102
103 if (in_interrupt())
104 BUG();
105
106 switch (rqst) {
107 case PM_SUSPEND:
108 case PM_RESUME:
109 prev_state = dev->state;
110 next_state = (unsigned long) data;
111 if (prev_state != next_state) {
112 if (dev->callback)
113 status = (*dev->callback)(dev, rqst, data);
114 if (!status) {
115 dev->state = next_state;
116 dev->prev_state = prev_state;
117 }
118 }
119 else {
120 dev->prev_state = prev_state;
121 }
122 break;
123 default:
124 if (dev->callback)
125 status = (*dev->callback)(dev, rqst, data);
126 break;
127 }
128 return status;
129}
130
131/*
132 * Undo incomplete request
133 */
134static void pm_undo_all(struct pm_dev *last)
135{
136 struct list_head *entry = last->entry.prev;
137 while (entry != &pm_devs) {
138 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
139 if (dev->state != dev->prev_state) {
140 /* previous state was zero (running) resume or
141 * previous state was non-zero (suspended) suspend
142 */
143 pm_request_t undo = (dev->prev_state
144 ? PM_SUSPEND:PM_RESUME);
145 pm_send(dev, undo, (void*) dev->prev_state);
146 }
147 entry = entry->prev;
148 }
149}
150
151/**
152 * pm_send_all - send request to all managed devices
153 * @rqst: power management request
154 * @data: data for the callback
155 *
156 * Issue a power management request to a all devices. The
157 * %PM_SUSPEND events are handled specially. Any device is
158 * permitted to fail a suspend by returning a non zero (error)
159 * value from its callback function. If any device vetoes a
160 * suspend request then all other devices that have suspended
161 * during the processing of this request are restored to their
162 * previous state.
163 *
164 * WARNING: This function takes the pm_devs_lock. The lock is not dropped until
165 * the callbacks have completed. This prevents races against pm locking
166 * functions, races against module unload pm_unregister code. It does
167 * mean however that you must not issue pm_ functions within the callback
168 * or you will deadlock and users will hate you.
169 *
170 * Zero is returned on success. If a suspend fails then the status
171 * from the device that vetoes the suspend is returned.
172 *
173 * BUGS: what stops two power management requests occurring in parallel
174 * and conflicting.
175 */
176
177int pm_send_all(pm_request_t rqst, void *data)
178{
179 struct list_head *entry;
180
181 mutex_lock(&pm_devs_lock);
182 entry = pm_devs.next;
183 while (entry != &pm_devs) {
184 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
185 if (dev->callback) {
186 int status = pm_send(dev, rqst, data);
187 if (status) {
188 /* return devices to previous state on
189 * failed suspend request
190 */
191 if (rqst == PM_SUSPEND)
192 pm_undo_all(dev);
193 mutex_unlock(&pm_devs_lock);
194 return status;
195 }
196 }
197 entry = entry->next;
198 }
199 mutex_unlock(&pm_devs_lock);
200 return 0;
201}
202
203EXPORT_SYMBOL(pm_register);
204EXPORT_SYMBOL(pm_send_all);
205
diff --git a/kernel/printk.c b/kernel/printk.c
index 9adc2a473e6e..bdd4ea8c3f2b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -616,6 +616,53 @@ asmlinkage int printk(const char *fmt, ...)
616/* cpu currently holding logbuf_lock */ 616/* cpu currently holding logbuf_lock */
617static volatile unsigned int printk_cpu = UINT_MAX; 617static volatile unsigned int printk_cpu = UINT_MAX;
618 618
619/*
620 * Can we actually use the console at this time on this cpu?
621 *
622 * Console drivers may assume that per-cpu resources have
623 * been allocated. So unless they're explicitly marked as
624 * being able to cope (CON_ANYTIME) don't call them until
625 * this CPU is officially up.
626 */
627static inline int can_use_console(unsigned int cpu)
628{
629 return cpu_online(cpu) || have_callable_console();
630}
631
632/*
633 * Try to get console ownership to actually show the kernel
634 * messages from a 'printk'. Return true (and with the
635 * console_semaphore held, and 'console_locked' set) if it
636 * is successful, false otherwise.
637 *
638 * This gets called with the 'logbuf_lock' spinlock held and
639 * interrupts disabled. It should return with 'lockbuf_lock'
640 * released but interrupts still disabled.
641 */
642static int acquire_console_semaphore_for_printk(unsigned int cpu)
643{
644 int retval = 0;
645
646 if (!try_acquire_console_sem()) {
647 retval = 1;
648
649 /*
650 * If we can't use the console, we need to release
651 * the console semaphore by hand to avoid flushing
652 * the buffer. We need to hold the console semaphore
653 * in order to do this test safely.
654 */
655 if (!can_use_console(cpu)) {
656 console_locked = 0;
657 up(&console_sem);
658 retval = 0;
659 }
660 }
661 printk_cpu = UINT_MAX;
662 spin_unlock(&logbuf_lock);
663 return retval;
664}
665
619const char printk_recursion_bug_msg [] = 666const char printk_recursion_bug_msg [] =
620 KERN_CRIT "BUG: recent printk recursion!\n"; 667 KERN_CRIT "BUG: recent printk recursion!\n";
621static int printk_recursion_bug; 668static int printk_recursion_bug;
@@ -725,43 +772,22 @@ asmlinkage int vprintk(const char *fmt, va_list args)
725 log_level_unknown = 1; 772 log_level_unknown = 1;
726 } 773 }
727 774
728 if (!down_trylock(&console_sem)) { 775 /*
729 /* 776 * Try to acquire and then immediately release the
730 * We own the drivers. We can drop the spinlock and 777 * console semaphore. The release will do all the
731 * let release_console_sem() print the text, maybe ... 778 * actual magic (print out buffers, wake up klogd,
732 */ 779 * etc).
733 console_locked = 1; 780 *
734 printk_cpu = UINT_MAX; 781 * The acquire_console_semaphore_for_printk() function
735 spin_unlock(&logbuf_lock); 782 * will release 'logbuf_lock' regardless of whether it
783 * actually gets the semaphore or not.
784 */
785 if (acquire_console_semaphore_for_printk(this_cpu))
786 release_console_sem();
736 787
737 /* 788 lockdep_on();
738 * Console drivers may assume that per-cpu resources have
739 * been allocated. So unless they're explicitly marked as
740 * being able to cope (CON_ANYTIME) don't call them until
741 * this CPU is officially up.
742 */
743 if (cpu_online(smp_processor_id()) || have_callable_console()) {
744 console_may_schedule = 0;
745 release_console_sem();
746 } else {
747 /* Release by hand to avoid flushing the buffer. */
748 console_locked = 0;
749 up(&console_sem);
750 }
751 lockdep_on();
752 raw_local_irq_restore(flags);
753 } else {
754 /*
755 * Someone else owns the drivers. We drop the spinlock, which
756 * allows the semaphore holder to proceed and to call the
757 * console drivers with the output which we just produced.
758 */
759 printk_cpu = UINT_MAX;
760 spin_unlock(&logbuf_lock);
761 lockdep_on();
762out_restore_irqs: 789out_restore_irqs:
763 raw_local_irq_restore(flags); 790 raw_local_irq_restore(flags);
764 }
765 791
766 preempt_enable(); 792 preempt_enable();
767 return printed_len; 793 return printed_len;
diff --git a/kernel/profile.c b/kernel/profile.c
index 3b7a1b055122..606d7387265c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -23,7 +23,6 @@
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <asm/sections.h> 25#include <asm/sections.h>
26#include <asm/semaphore.h>
27#include <asm/irq_regs.h> 26#include <asm/irq_regs.h>
28#include <asm/ptrace.h> 27#include <asm/ptrace.h>
29 28
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index fdb34e86f923..dac4b4e57293 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -323,9 +323,8 @@ static int ptrace_setoptions(struct task_struct *child, long data)
323 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; 323 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
324} 324}
325 325
326static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) 326static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
327{ 327{
328 siginfo_t lastinfo;
329 int error = -ESRCH; 328 int error = -ESRCH;
330 329
331 read_lock(&tasklist_lock); 330 read_lock(&tasklist_lock);
@@ -333,31 +332,25 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data)
333 error = -EINVAL; 332 error = -EINVAL;
334 spin_lock_irq(&child->sighand->siglock); 333 spin_lock_irq(&child->sighand->siglock);
335 if (likely(child->last_siginfo != NULL)) { 334 if (likely(child->last_siginfo != NULL)) {
336 lastinfo = *child->last_siginfo; 335 *info = *child->last_siginfo;
337 error = 0; 336 error = 0;
338 } 337 }
339 spin_unlock_irq(&child->sighand->siglock); 338 spin_unlock_irq(&child->sighand->siglock);
340 } 339 }
341 read_unlock(&tasklist_lock); 340 read_unlock(&tasklist_lock);
342 if (!error)
343 return copy_siginfo_to_user(data, &lastinfo);
344 return error; 341 return error;
345} 342}
346 343
347static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) 344static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
348{ 345{
349 siginfo_t newinfo;
350 int error = -ESRCH; 346 int error = -ESRCH;
351 347
352 if (copy_from_user(&newinfo, data, sizeof (siginfo_t)))
353 return -EFAULT;
354
355 read_lock(&tasklist_lock); 348 read_lock(&tasklist_lock);
356 if (likely(child->sighand != NULL)) { 349 if (likely(child->sighand != NULL)) {
357 error = -EINVAL; 350 error = -EINVAL;
358 spin_lock_irq(&child->sighand->siglock); 351 spin_lock_irq(&child->sighand->siglock);
359 if (likely(child->last_siginfo != NULL)) { 352 if (likely(child->last_siginfo != NULL)) {
360 *child->last_siginfo = newinfo; 353 *child->last_siginfo = *info;
361 error = 0; 354 error = 0;
362 } 355 }
363 spin_unlock_irq(&child->sighand->siglock); 356 spin_unlock_irq(&child->sighand->siglock);
@@ -424,6 +417,7 @@ int ptrace_request(struct task_struct *child, long request,
424 long addr, long data) 417 long addr, long data)
425{ 418{
426 int ret = -EIO; 419 int ret = -EIO;
420 siginfo_t siginfo;
427 421
428 switch (request) { 422 switch (request) {
429 case PTRACE_PEEKTEXT: 423 case PTRACE_PEEKTEXT:
@@ -442,12 +436,22 @@ int ptrace_request(struct task_struct *child, long request,
442 case PTRACE_GETEVENTMSG: 436 case PTRACE_GETEVENTMSG:
443 ret = put_user(child->ptrace_message, (unsigned long __user *) data); 437 ret = put_user(child->ptrace_message, (unsigned long __user *) data);
444 break; 438 break;
439
445 case PTRACE_GETSIGINFO: 440 case PTRACE_GETSIGINFO:
446 ret = ptrace_getsiginfo(child, (siginfo_t __user *) data); 441 ret = ptrace_getsiginfo(child, &siginfo);
442 if (!ret)
443 ret = copy_siginfo_to_user((siginfo_t __user *) data,
444 &siginfo);
447 break; 445 break;
446
448 case PTRACE_SETSIGINFO: 447 case PTRACE_SETSIGINFO:
449 ret = ptrace_setsiginfo(child, (siginfo_t __user *) data); 448 if (copy_from_user(&siginfo, (siginfo_t __user *) data,
449 sizeof siginfo))
450 ret = -EFAULT;
451 else
452 ret = ptrace_setsiginfo(child, &siginfo);
450 break; 453 break;
454
451 case PTRACE_DETACH: /* detach a process that was attached. */ 455 case PTRACE_DETACH: /* detach a process that was attached. */
452 ret = ptrace_detach(child, data); 456 ret = ptrace_detach(child, data);
453 break; 457 break;
@@ -608,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
608 return (copied == sizeof(data)) ? 0 : -EIO; 612 return (copied == sizeof(data)) ? 0 : -EIO;
609} 613}
610 614
611#ifdef CONFIG_COMPAT 615#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE
612#include <linux/compat.h> 616#include <linux/compat.h>
613 617
614int compat_ptrace_request(struct task_struct *child, compat_long_t request, 618int compat_ptrace_request(struct task_struct *child, compat_long_t request,
@@ -616,6 +620,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
616{ 620{
617 compat_ulong_t __user *datap = compat_ptr(data); 621 compat_ulong_t __user *datap = compat_ptr(data);
618 compat_ulong_t word; 622 compat_ulong_t word;
623 siginfo_t siginfo;
619 int ret; 624 int ret;
620 625
621 switch (request) { 626 switch (request) {
@@ -638,6 +643,23 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
638 ret = put_user((compat_ulong_t) child->ptrace_message, datap); 643 ret = put_user((compat_ulong_t) child->ptrace_message, datap);
639 break; 644 break;
640 645
646 case PTRACE_GETSIGINFO:
647 ret = ptrace_getsiginfo(child, &siginfo);
648 if (!ret)
649 ret = copy_siginfo_to_user32(
650 (struct compat_siginfo __user *) datap,
651 &siginfo);
652 break;
653
654 case PTRACE_SETSIGINFO:
655 memset(&siginfo, 0, sizeof siginfo);
656 if (copy_siginfo_from_user32(
657 &siginfo, (struct compat_siginfo __user *) datap))
658 ret = -EFAULT;
659 else
660 ret = ptrace_setsiginfo(child, &siginfo);
661 break;
662
641 default: 663 default:
642 ret = ptrace_request(child, request, addr, data); 664 ret = ptrace_request(child, request, addr, data);
643 } 665 }
@@ -645,7 +667,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
645 return ret; 667 return ret;
646} 668}
647 669
648#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
649asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, 670asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
650 compat_long_t addr, compat_long_t data) 671 compat_long_t addr, compat_long_t data)
651{ 672{
@@ -688,6 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
688 unlock_kernel(); 709 unlock_kernel();
689 return ret; 710 return ret;
690} 711}
691#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ 712#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */
692
693#endif /* CONFIG_COMPAT */
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e9517014b57c..e1cdf196a515 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1007,10 +1007,10 @@ void __synchronize_sched(void)
1007 if (sched_getaffinity(0, &oldmask) < 0) 1007 if (sched_getaffinity(0, &oldmask) < 0)
1008 oldmask = cpu_possible_map; 1008 oldmask = cpu_possible_map;
1009 for_each_online_cpu(cpu) { 1009 for_each_online_cpu(cpu) {
1010 sched_setaffinity(0, cpumask_of_cpu(cpu)); 1010 sched_setaffinity(0, &cpumask_of_cpu(cpu));
1011 schedule(); 1011 schedule();
1012 } 1012 }
1013 sched_setaffinity(0, oldmask); 1013 sched_setaffinity(0, &oldmask);
1014} 1014}
1015EXPORT_SYMBOL_GPL(__synchronize_sched); 1015EXPORT_SYMBOL_GPL(__synchronize_sched);
1016 1016
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index fd599829e72a..47894f919d4e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -723,9 +723,10 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
723 */ 723 */
724static void rcu_torture_shuffle_tasks(void) 724static void rcu_torture_shuffle_tasks(void)
725{ 725{
726 cpumask_t tmp_mask = CPU_MASK_ALL; 726 cpumask_t tmp_mask;
727 int i; 727 int i;
728 728
729 cpus_setall(tmp_mask);
729 get_online_cpus(); 730 get_online_cpus();
730 731
731 /* No point in shuffling if there is only one online CPU (ex: UP) */ 732 /* No point in shuffling if there is only one online CPU (ex: UP) */
@@ -737,25 +738,27 @@ static void rcu_torture_shuffle_tasks(void)
737 if (rcu_idle_cpu != -1) 738 if (rcu_idle_cpu != -1)
738 cpu_clear(rcu_idle_cpu, tmp_mask); 739 cpu_clear(rcu_idle_cpu, tmp_mask);
739 740
740 set_cpus_allowed(current, tmp_mask); 741 set_cpus_allowed_ptr(current, &tmp_mask);
741 742
742 if (reader_tasks) { 743 if (reader_tasks) {
743 for (i = 0; i < nrealreaders; i++) 744 for (i = 0; i < nrealreaders; i++)
744 if (reader_tasks[i]) 745 if (reader_tasks[i])
745 set_cpus_allowed(reader_tasks[i], tmp_mask); 746 set_cpus_allowed_ptr(reader_tasks[i],
747 &tmp_mask);
746 } 748 }
747 749
748 if (fakewriter_tasks) { 750 if (fakewriter_tasks) {
749 for (i = 0; i < nfakewriters; i++) 751 for (i = 0; i < nfakewriters; i++)
750 if (fakewriter_tasks[i]) 752 if (fakewriter_tasks[i])
751 set_cpus_allowed(fakewriter_tasks[i], tmp_mask); 753 set_cpus_allowed_ptr(fakewriter_tasks[i],
754 &tmp_mask);
752 } 755 }
753 756
754 if (writer_task) 757 if (writer_task)
755 set_cpus_allowed(writer_task, tmp_mask); 758 set_cpus_allowed_ptr(writer_task, &tmp_mask);
756 759
757 if (stats_task) 760 if (stats_task)
758 set_cpus_allowed(stats_task, tmp_mask); 761 set_cpus_allowed_ptr(stats_task, &tmp_mask);
759 762
760 if (rcu_idle_cpu == -1) 763 if (rcu_idle_cpu == -1)
761 rcu_idle_cpu = num_online_cpus() - 1; 764 rcu_idle_cpu = num_online_cpus() - 1;
diff --git a/kernel/relay.c b/kernel/relay.c
index 4c035a8a248c..d6204a485818 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -736,7 +736,7 @@ static int relay_file_open(struct inode *inode, struct file *filp)
736 kref_get(&buf->kref); 736 kref_get(&buf->kref);
737 filp->private_data = buf; 737 filp->private_data = buf;
738 738
739 return 0; 739 return nonseekable_open(inode, filp);
740} 740}
741 741
742/** 742/**
@@ -1056,6 +1056,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = {
1056 .get = generic_pipe_buf_get, 1056 .get = generic_pipe_buf_get,
1057}; 1057};
1058 1058
1059static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1060{
1061}
1062
1059/* 1063/*
1060 * subbuf_splice_actor - splice up to one subbuf's worth of data 1064 * subbuf_splice_actor - splice up to one subbuf's worth of data
1061 */ 1065 */
@@ -1083,6 +1087,7 @@ static int subbuf_splice_actor(struct file *in,
1083 .partial = partial, 1087 .partial = partial,
1084 .flags = flags, 1088 .flags = flags,
1085 .ops = &relay_pipe_buf_ops, 1089 .ops = &relay_pipe_buf_ops,
1090 .spd_release = relay_page_release,
1086 }; 1091 };
1087 1092
1088 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1093 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
diff --git a/kernel/resource.c b/kernel/resource.c
index 82aea814d409..cee12cc47cab 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -486,6 +486,24 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
486 486
487EXPORT_SYMBOL(adjust_resource); 487EXPORT_SYMBOL(adjust_resource);
488 488
489/**
490 * resource_alignment - calculate resource's alignment
491 * @res: resource pointer
492 *
493 * Returns alignment on success, 0 (invalid alignment) on failure.
494 */
495resource_size_t resource_alignment(struct resource *res)
496{
497 switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
498 case IORESOURCE_SIZEALIGN:
499 return res->end - res->start + 1;
500 case IORESOURCE_STARTALIGN:
501 return res->start;
502 default:
503 return 0;
504 }
505}
506
489/* 507/*
490 * This is compatibility stuff for IO resources. 508 * This is compatibility stuff for IO resources.
491 * 509 *
diff --git a/kernel/sched.c b/kernel/sched.c
index 28c73f07efb2..740fb409e5bb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -66,6 +66,10 @@
66#include <linux/unistd.h> 66#include <linux/unistd.h>
67#include <linux/pagemap.h> 67#include <linux/pagemap.h>
68#include <linux/hrtimer.h> 68#include <linux/hrtimer.h>
69#include <linux/tick.h>
70#include <linux/bootmem.h>
71#include <linux/debugfs.h>
72#include <linux/ctype.h>
69 73
70#include <asm/tlb.h> 74#include <asm/tlb.h>
71#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
@@ -114,6 +118,11 @@ unsigned long long __attribute__((weak)) sched_clock(void)
114 */ 118 */
115#define DEF_TIMESLICE (100 * HZ / 1000) 119#define DEF_TIMESLICE (100 * HZ / 1000)
116 120
121/*
122 * single value that denotes runtime == period, ie unlimited time.
123 */
124#define RUNTIME_INF ((u64)~0ULL)
125
117#ifdef CONFIG_SMP 126#ifdef CONFIG_SMP
118/* 127/*
119 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -155,6 +164,84 @@ struct rt_prio_array {
155 struct list_head queue[MAX_RT_PRIO]; 164 struct list_head queue[MAX_RT_PRIO];
156}; 165};
157 166
/*
 * Runtime-per-period bookkeeping together with the hrtimer that is
 * forwarded by rt_period; set up by init_rt_bandwidth().
 */
167struct rt_bandwidth {
168 /* nests inside the rq lock: */
169 spinlock_t rt_runtime_lock;
/* timer interval; init_rt_bandwidth() converts it from nanoseconds */
170 ktime_t rt_period;
/* RUNTIME_INF here keeps start_rt_bandwidth() from arming the timer */
171 u64 rt_runtime;
/* callback is sched_rt_period_timer() */
172 struct hrtimer rt_period_timer;
173};
174
175static struct rt_bandwidth def_rt_bandwidth;
176
177static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
178
179static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
180{
181 struct rt_bandwidth *rt_b =
182 container_of(timer, struct rt_bandwidth, rt_period_timer);
183 ktime_t now;
184 int overrun;
185 int idle = 0;
186
187 for (;;) {
188 now = hrtimer_cb_get_time(timer);
189 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
190
191 if (!overrun)
192 break;
193
194 idle = do_sched_rt_period_timer(rt_b, overrun);
195 }
196
197 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
198}
199
200static
201void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
202{
203 rt_b->rt_period = ns_to_ktime(period);
204 rt_b->rt_runtime = runtime;
205
206 spin_lock_init(&rt_b->rt_runtime_lock);
207
208 hrtimer_init(&rt_b->rt_period_timer,
209 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
210 rt_b->rt_period_timer.function = sched_rt_period_timer;
211 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
212}
213
214static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
215{
216 ktime_t now;
217
218 if (rt_b->rt_runtime == RUNTIME_INF)
219 return;
220
221 if (hrtimer_active(&rt_b->rt_period_timer))
222 return;
223
224 spin_lock(&rt_b->rt_runtime_lock);
225 for (;;) {
226 if (hrtimer_active(&rt_b->rt_period_timer))
227 break;
228
229 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
230 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
231 hrtimer_start(&rt_b->rt_period_timer,
232 rt_b->rt_period_timer.expires,
233 HRTIMER_MODE_ABS);
234 }
235 spin_unlock(&rt_b->rt_runtime_lock);
236}
237
238#ifdef CONFIG_RT_GROUP_SCHED
239static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
240{
241 hrtimer_cancel(&rt_b->rt_period_timer);
242}
243#endif
244
158#ifdef CONFIG_GROUP_SCHED 245#ifdef CONFIG_GROUP_SCHED
159 246
160#include <linux/cgroup.h> 247#include <linux/cgroup.h>
@@ -181,29 +268,39 @@ struct task_group {
181 struct sched_rt_entity **rt_se; 268 struct sched_rt_entity **rt_se;
182 struct rt_rq **rt_rq; 269 struct rt_rq **rt_rq;
183 270
184 u64 rt_runtime; 271 struct rt_bandwidth rt_bandwidth;
185#endif 272#endif
186 273
187 struct rcu_head rcu; 274 struct rcu_head rcu;
188 struct list_head list; 275 struct list_head list;
276
277 struct task_group *parent;
278 struct list_head siblings;
279 struct list_head children;
189}; 280};
190 281
282#ifdef CONFIG_USER_SCHED
283
284/*
285 * Root task group.
286 * Every UID task group (including init_task_group aka UID-0) will
287 * be a child to this group.
288 */
289struct task_group root_task_group;
290
191#ifdef CONFIG_FAIR_GROUP_SCHED 291#ifdef CONFIG_FAIR_GROUP_SCHED
192/* Default task group's sched entity on each cpu */ 292/* Default task group's sched entity on each cpu */
193static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 293static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
194/* Default task group's cfs_rq on each cpu */ 294/* Default task group's cfs_rq on each cpu */
195static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 295static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
196
197static struct sched_entity *init_sched_entity_p[NR_CPUS];
198static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
199#endif 296#endif
200 297
201#ifdef CONFIG_RT_GROUP_SCHED 298#ifdef CONFIG_RT_GROUP_SCHED
202static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 299static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
203static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 300static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
204 301#endif
205static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; 302#else
206static struct rt_rq *init_rt_rq_p[NR_CPUS]; 303#define root_task_group init_task_group
207#endif 304#endif
208 305
209/* task_group_lock serializes add/remove of task groups and also changes to 306/* task_group_lock serializes add/remove of task groups and also changes to
@@ -221,23 +318,15 @@ static DEFINE_MUTEX(doms_cur_mutex);
221# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 318# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
222#endif 319#endif
223 320
321#define MIN_SHARES 2
322
224static int init_task_group_load = INIT_TASK_GROUP_LOAD; 323static int init_task_group_load = INIT_TASK_GROUP_LOAD;
225#endif 324#endif
226 325
227/* Default task group. 326/* Default task group.
228 * Every task in system belong to this group at bootup. 327 * Every task in system belong to this group at bootup.
229 */ 328 */
230struct task_group init_task_group = { 329struct task_group init_task_group;
231#ifdef CONFIG_FAIR_GROUP_SCHED
232 .se = init_sched_entity_p,
233 .cfs_rq = init_cfs_rq_p,
234#endif
235
236#ifdef CONFIG_RT_GROUP_SCHED
237 .rt_se = init_sched_rt_entity_p,
238 .rt_rq = init_rt_rq_p,
239#endif
240};
241 330
242/* return group to which a task belongs */ 331/* return group to which a task belongs */
243static inline struct task_group *task_group(struct task_struct *p) 332static inline struct task_group *task_group(struct task_struct *p)
@@ -297,8 +386,12 @@ struct cfs_rq {
297 386
298 struct rb_root tasks_timeline; 387 struct rb_root tasks_timeline;
299 struct rb_node *rb_leftmost; 388 struct rb_node *rb_leftmost;
300 struct rb_node *rb_load_balance_curr; 389
301 /* 'curr' points to currently running entity on this cfs_rq. 390 struct list_head tasks;
391 struct list_head *balance_iterator;
392
393 /*
394 * 'curr' points to currently running entity on this cfs_rq.
302 * It is set to NULL otherwise (i.e when none are currently running). 395 * It is set to NULL otherwise (i.e when none are currently running).
303 */ 396 */
304 struct sched_entity *curr, *next; 397 struct sched_entity *curr, *next;
@@ -318,6 +411,43 @@ struct cfs_rq {
318 */ 411 */
319 struct list_head leaf_cfs_rq_list; 412 struct list_head leaf_cfs_rq_list;
320 struct task_group *tg; /* group that "owns" this runqueue */ 413 struct task_group *tg; /* group that "owns" this runqueue */
414
415#ifdef CONFIG_SMP
416 unsigned long task_weight;
417 unsigned long shares;
418 /*
419 * We need space to build a sched_domain wide view of the full task
420 * group tree, in order to avoid depending on dynamic memory allocation
421 * during the load balancing we place this in the per cpu task group
422 * hierarchy. This limits the load balancing to one instance per cpu,
423 * but more should not be needed anyway.
424 */
425 struct aggregate_struct {
426 /*
427 * load = weight(cpus) * f(tg)
428 *
429 * Where f(tg) is the recursive weight fraction assigned to
430 * this group.
431 */
432 unsigned long load;
433
434 /*
435 * part of the group weight distributed to this span.
436 */
437 unsigned long shares;
438
439 /*
440 * The sum of all runqueue weights within this span.
441 */
442 unsigned long rq_weight;
443
444 /*
445 * Weight contributed by tasks; this is the part we can
446 * influence by moving tasks around.
447 */
448 unsigned long task_weight;
449 } aggregate;
450#endif
321#endif 451#endif
322}; 452};
323 453
@@ -334,6 +464,9 @@ struct rt_rq {
334#endif 464#endif
335 int rt_throttled; 465 int rt_throttled;
336 u64 rt_time; 466 u64 rt_time;
467 u64 rt_runtime;
468 /* Nests inside the rq lock: */
469 spinlock_t rt_runtime_lock;
337 470
338#ifdef CONFIG_RT_GROUP_SCHED 471#ifdef CONFIG_RT_GROUP_SCHED
339 unsigned long rt_nr_boosted; 472 unsigned long rt_nr_boosted;
@@ -396,6 +529,7 @@ struct rq {
396 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 529 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
397 unsigned char idle_at_tick; 530 unsigned char idle_at_tick;
398#ifdef CONFIG_NO_HZ 531#ifdef CONFIG_NO_HZ
532 unsigned long last_tick_seen;
399 unsigned char in_nohz_recently; 533 unsigned char in_nohz_recently;
400#endif 534#endif
401 /* capture load from *all* tasks on this cpu: */ 535 /* capture load from *all* tasks on this cpu: */
@@ -405,8 +539,6 @@ struct rq {
405 539
406 struct cfs_rq cfs; 540 struct cfs_rq cfs;
407 struct rt_rq rt; 541 struct rt_rq rt;
408 u64 rt_period_expire;
409 int rt_throttled;
410 542
411#ifdef CONFIG_FAIR_GROUP_SCHED 543#ifdef CONFIG_FAIR_GROUP_SCHED
412 /* list of leaf cfs_rq on this cpu: */ 544 /* list of leaf cfs_rq on this cpu: */
@@ -499,6 +631,32 @@ static inline int cpu_of(struct rq *rq)
499#endif 631#endif
500} 632}
501 633
634#ifdef CONFIG_NO_HZ
635static inline bool nohz_on(int cpu)
636{
637 return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
638}
639
640static inline u64 max_skipped_ticks(struct rq *rq)
641{
642 return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
643}
644
645static inline void update_last_tick_seen(struct rq *rq)
646{
647 rq->last_tick_seen = jiffies;
648}
649#else
650static inline u64 max_skipped_ticks(struct rq *rq)
651{
652 return 1;
653}
654
655static inline void update_last_tick_seen(struct rq *rq)
656{
657}
658#endif
659
502/* 660/*
503 * Update the per-runqueue clock, as finegrained as the platform can give 661 * Update the per-runqueue clock, as finegrained as the platform can give
504 * us, but without assuming monotonicity, etc.: 662 * us, but without assuming monotonicity, etc.:
@@ -523,9 +681,12 @@ static void __update_rq_clock(struct rq *rq)
523 /* 681 /*
524 * Catch too large forward jumps too: 682 * Catch too large forward jumps too:
525 */ 683 */
526 if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { 684 u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
527 if (clock < rq->tick_timestamp + TICK_NSEC) 685 u64 max_time = rq->tick_timestamp + max_jump;
528 clock = rq->tick_timestamp + TICK_NSEC; 686
687 if (unlikely(clock + delta > max_time)) {
688 if (clock < max_time)
689 clock = max_time;
529 else 690 else
530 clock++; 691 clock++;
531 rq->clock_overflows++; 692 rq->clock_overflows++;
@@ -561,23 +722,6 @@ static void update_rq_clock(struct rq *rq)
561#define task_rq(p) cpu_rq(task_cpu(p)) 722#define task_rq(p) cpu_rq(task_cpu(p))
562#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 723#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
563 724
564unsigned long rt_needs_cpu(int cpu)
565{
566 struct rq *rq = cpu_rq(cpu);
567 u64 delta;
568
569 if (!rq->rt_throttled)
570 return 0;
571
572 if (rq->clock > rq->rt_period_expire)
573 return 1;
574
575 delta = rq->rt_period_expire - rq->clock;
576 do_div(delta, NSEC_PER_SEC / HZ);
577
578 return (unsigned long)delta;
579}
580
581/* 725/*
582 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 726 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
583 */ 727 */
@@ -590,22 +734,137 @@ unsigned long rt_needs_cpu(int cpu)
590/* 734/*
591 * Debugging: various feature bits 735 * Debugging: various feature bits
592 */ 736 */
737
738#define SCHED_FEAT(name, enabled) \
739 __SCHED_FEAT_##name ,
740
593enum { 741enum {
594 SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, 742#include "sched_features.h"
595 SCHED_FEAT_WAKEUP_PREEMPT = 2,
596 SCHED_FEAT_START_DEBIT = 4,
597 SCHED_FEAT_HRTICK = 8,
598 SCHED_FEAT_DOUBLE_TICK = 16,
599}; 743};
600 744
745#undef SCHED_FEAT
746
747#define SCHED_FEAT(name, enabled) \
748 (1UL << __SCHED_FEAT_##name) * enabled |
749
601const_debug unsigned int sysctl_sched_features = 750const_debug unsigned int sysctl_sched_features =
602 SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | 751#include "sched_features.h"
603 SCHED_FEAT_WAKEUP_PREEMPT * 1 | 752 0;
604 SCHED_FEAT_START_DEBIT * 1 | 753
605 SCHED_FEAT_HRTICK * 1 | 754#undef SCHED_FEAT
606 SCHED_FEAT_DOUBLE_TICK * 0; 755
756#ifdef CONFIG_SCHED_DEBUG
/*
 * Expand sched_features.h once more, this time stringifying each feature
 * name.  The result is a NULL-terminated table whose index i corresponds
 * to bit (1UL << i) of sysctl_sched_features, as used by the debugfs
 * read/write handlers below.
 */
757#define SCHED_FEAT(name, enabled) \
758 #name ,
759
760__read_mostly char *sched_feat_names[] = {
761#include "sched_features.h"
762 NULL
763};
764
765#undef SCHED_FEAT
766
767int sched_feat_open(struct inode *inode, struct file *filp)
768{
769 filp->private_data = inode->i_private;
770 return 0;
771}
772
/*
 * Read handler for the sched_features file.
 *
 * Builds one line listing every feature name, prefixed with "NO_" when its
 * bit in sysctl_sched_features is clear, and copies the requested part of
 * it to user space.
 *
 * Returns the result of simple_read_from_buffer(), or -ENOMEM if the
 * temporary buffer cannot be allocated.
 */
773static ssize_t
774sched_feat_read(struct file *filp, char __user *ubuf,
775 size_t cnt, loff_t *ppos)
776{
777 char *buf;
778 int r = 0;
779 int len = 0;
780 int i;
607 781
/* worst case per feature: "NO_" + name + " " = strlen(name) + 4 */
608#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) 782 for (i = 0; sched_feat_names[i]; i++) {
783 len += strlen(sched_feat_names[i]);
784 len += 4;
785 }
786
/* + 2 for the trailing newline and the terminating NUL */
787 buf = kmalloc(len + 2, GFP_KERNEL);
788 if (!buf)
789 return -ENOMEM;
790
791 for (i = 0; sched_feat_names[i]; i++) {
792 if (sysctl_sched_features & (1UL << i))
793 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
794 else
795 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
796 }
797
798 r += sprintf(buf + r, "\n");
/* sanity check: the formatted text must have fit the allocation */
799 WARN_ON(r >= len + 2);
800
801 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
802
803 kfree(buf);
804
805 return r;
806}
807
808static ssize_t
809sched_feat_write(struct file *filp, const char __user *ubuf,
810 size_t cnt, loff_t *ppos)
811{
812 char buf[64];
813 char *cmp = buf;
814 int neg = 0;
815 int i;
816
817 if (cnt > 63)
818 cnt = 63;
819
820 if (copy_from_user(&buf, ubuf, cnt))
821 return -EFAULT;
822
823 buf[cnt] = 0;
824
825 if (strncmp(buf, "NO_", 3) == 0) {
826 neg = 1;
827 cmp += 3;
828 }
829
830 for (i = 0; sched_feat_names[i]; i++) {
831 int len = strlen(sched_feat_names[i]);
832
833 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
834 if (neg)
835 sysctl_sched_features &= ~(1UL << i);
836 else
837 sysctl_sched_features |= (1UL << i);
838 break;
839 }
840 }
841
842 if (!sched_feat_names[i])
843 return -EINVAL;
844
845 filp->f_pos += cnt;
846
847 return cnt;
848}
849
850static struct file_operations sched_feat_fops = {
851 .open = sched_feat_open,
852 .read = sched_feat_read,
853 .write = sched_feat_write,
854};
855
856static __init int sched_init_debug(void)
857{
858 debugfs_create_file("sched_features", 0644, NULL, NULL,
859 &sched_feat_fops);
860
861 return 0;
862}
863late_initcall(sched_init_debug);
864
865#endif
866
867#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
609 868
610/* 869/*
611 * Number of tasks to iterate in a single balance run. 870 * Number of tasks to iterate in a single balance run.
@@ -627,16 +886,52 @@ static __read_mostly int scheduler_running;
627 */ 886 */
628int sysctl_sched_rt_runtime = 950000; 887int sysctl_sched_rt_runtime = 950000;
629 888
630/* 889static inline u64 global_rt_period(void)
631 * single value that denotes runtime == period, ie unlimited time. 890{
632 */ 891 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
633#define RUNTIME_INF ((u64)~0ULL) 892}
893
894static inline u64 global_rt_runtime(void)
895{
896 if (sysctl_sched_rt_period < 0)
897 return RUNTIME_INF;
898
899 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
900}
901
902static const unsigned long long time_sync_thresh = 100000;
903
904static DEFINE_PER_CPU(unsigned long long, time_offset);
905static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
634 906
635/* 907/*
636 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 908 * Global lock which we take every now and then to synchronize
637 * clock constructed from sched_clock(): 909 * the CPUs time. This method is not warp-safe, but it's good
910 * enough to synchronize slowly diverging time sources and thus
911 * it's good enough for tracing:
638 */ 912 */
639unsigned long long cpu_clock(int cpu) 913static DEFINE_SPINLOCK(time_sync_lock);
914static unsigned long long prev_global_time;
915
916static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
917{
918 unsigned long flags;
919
920 spin_lock_irqsave(&time_sync_lock, flags);
921
922 if (time < prev_global_time) {
923 per_cpu(time_offset, cpu) += prev_global_time - time;
924 time = prev_global_time;
925 } else {
926 prev_global_time = time;
927 }
928
929 spin_unlock_irqrestore(&time_sync_lock, flags);
930
931 return time;
932}
933
934static unsigned long long __cpu_clock(int cpu)
640{ 935{
641 unsigned long long now; 936 unsigned long long now;
642 unsigned long flags; 937 unsigned long flags;
@@ -657,6 +952,24 @@ unsigned long long cpu_clock(int cpu)
657 952
658 return now; 953 return now;
659} 954}
955
956/*
957 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
958 * clock constructed from sched_clock():
959 */
960unsigned long long cpu_clock(int cpu)
961{
962 unsigned long long prev_cpu_time, time, delta_time;
963
964 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
965 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
966 delta_time = time-prev_cpu_time;
967
968 if (unlikely(delta_time > time_sync_thresh))
969 time = __sync_cpu_clock(time, cpu);
970
971 return time;
972}
660EXPORT_SYMBOL_GPL(cpu_clock); 973EXPORT_SYMBOL_GPL(cpu_clock);
661 974
662#ifndef prepare_arch_switch 975#ifndef prepare_arch_switch
@@ -1052,6 +1365,49 @@ static void resched_cpu(int cpu)
1052 resched_task(cpu_curr(cpu)); 1365 resched_task(cpu_curr(cpu));
1053 spin_unlock_irqrestore(&rq->lock, flags); 1366 spin_unlock_irqrestore(&rq->lock, flags);
1054} 1367}
1368
1369#ifdef CONFIG_NO_HZ
1370/*
1371 * When add_timer_on() enqueues a timer into the timer wheel of an
1372 * idle CPU then this timer might expire before the next timer event
1373 * which is scheduled to wake up that CPU. In case of a completely
1374 * idle system the next event might even be infinite time into the
1375 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1376 * leaves the inner idle loop so the newly added timer is taken into
1377 * account when the CPU goes back to idle and evaluates the timer
1378 * wheel for the next timer event.
1379 */
1380void wake_up_idle_cpu(int cpu)
1381{
1382 struct rq *rq = cpu_rq(cpu);
1383
1384 if (cpu == smp_processor_id())
1385 return;
1386
1387 /*
1388 * This is safe, as this function is called with the timer
1389 * wheel base lock of (cpu) held. When the CPU is on the way
1390 * to idle and has not yet set rq->curr to idle then it will
1391 * be serialized on the timer wheel base lock and take the new
1392 * timer into account automatically.
1393 */
1394 if (rq->curr != rq->idle)
1395 return;
1396
1397 /*
1398 * We can set TIF_RESCHED on the idle task of the other CPU
1399 * lockless. The worst case is that the other CPU runs the
1400 * idle task through an additional NOOP schedule()
1401 */
1402 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
1403
1404 /* NEED_RESCHED must be visible before we test polling */
1405 smp_mb();
1406 if (!tsk_is_polling(rq->idle))
1407 smp_send_reschedule(cpu);
1408}
1409#endif
1410
1055#else 1411#else
1056static void __resched_task(struct task_struct *p, int tif_bit) 1412static void __resched_task(struct task_struct *p, int tif_bit)
1057{ 1413{
@@ -1073,6 +1429,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
1073 */ 1429 */
1074#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1430#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1075 1431
1432/*
1433 * delta *= weight / lw
1434 */
1076static unsigned long 1435static unsigned long
1077calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1436calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1078 struct load_weight *lw) 1437 struct load_weight *lw)
@@ -1095,12 +1454,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1095 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1454 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1096} 1455}
1097 1456
1098static inline unsigned long
1099calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1100{
1101 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1102}
1103
1104static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1457static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1105{ 1458{
1106 lw->weight += inc; 1459 lw->weight += inc;
@@ -1198,11 +1551,347 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1198static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1551static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1199#endif 1552#endif
1200 1553
1554static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1555{
1556 update_load_add(&rq->load, load);
1557}
1558
1559static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1560{
1561 update_load_sub(&rq->load, load);
1562}
1563
1201#ifdef CONFIG_SMP 1564#ifdef CONFIG_SMP
1202static unsigned long source_load(int cpu, int type); 1565static unsigned long source_load(int cpu, int type);
1203static unsigned long target_load(int cpu, int type); 1566static unsigned long target_load(int cpu, int type);
1204static unsigned long cpu_avg_load_per_task(int cpu); 1567static unsigned long cpu_avg_load_per_task(int cpu);
1205static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1568static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1569
1570#ifdef CONFIG_FAIR_GROUP_SCHED
1571
1572/*
1573 * Group load balancing.
1574 *
1575 * We calculate a few balance domain wide aggregate numbers; load and weight.
1576 * Given the pictures below, and assuming each item has equal weight:
1577 *
1578 * root 1 - thread
1579 * / | \ A - group
1580 * A 1 B
1581 * /|\ / \
1582 * C 2 D 3 4
1583 * | |
1584 * 5 6
1585 *
1586 * load:
1587 * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
1588 * which equals 1/9-th of the total load.
1589 *
1590 * shares:
1591 * The weight of this group on the selected cpus.
1592 *
1593 * rq_weight:
1594 * Direct sum of the rq weights of all the cpus, e.g. A would get 3 while
1595 * B would get 2.
1596 *
1597 * task_weight:
1598 * Part of the rq_weight contributed by tasks; all groups except B would
1599 * get 1, B gets 2.
1600 */
1601
1602static inline struct aggregate_struct *
1603aggregate(struct task_group *tg, struct sched_domain *sd)
1604{
1605 return &tg->cfs_rq[sd->first_cpu]->aggregate;
1606}
1607
1608typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
1609
1610/*
1611 * Iterate the full tree, calling @down when first entering a node and @up when
1612 * leaving it for the final time.
1613 */
/*
 * The walk starts at root_task_group and runs under rcu_read_lock().  It is
 * done without recursion: descending is a jump to the "down" label with
 * @parent set to the child, and returning to the parent is a jump to the
 * "up" label, which sits inside the child-list loop so that the iteration
 * resumes after the child we just came from.
 */
1614static
1615void aggregate_walk_tree(aggregate_func down, aggregate_func up,
1616 struct sched_domain *sd)
1617{
1618 struct task_group *parent, *child;
1619
1620 rcu_read_lock();
1621 parent = &root_task_group;
1622down:
1623 (*down)(parent, sd);
1624 list_for_each_entry_rcu(child, &parent->children, siblings) {
/* descend: the child becomes the node being visited */
1625 parent = child;
1626 goto down;
1627
/* re-entry point when coming back up; child/parent were restored below */
1628up:
1629 continue;
1630 }
/* all children of this node are done */
1631 (*up)(parent, sd);
1632
/* step back to the parent and continue with this node's next sibling */
1633 child = parent;
1634 parent = parent->parent;
1635 if (parent)
1636 goto up;
1637 rcu_read_unlock();
1638}
1639
1640/*
1641 * Calculate the aggregate runqueue weight.
1642 */
1643static
1644void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
1645{
1646 unsigned long rq_weight = 0;
1647 unsigned long task_weight = 0;
1648 int i;
1649
1650 for_each_cpu_mask(i, sd->span) {
1651 rq_weight += tg->cfs_rq[i]->load.weight;
1652 task_weight += tg->cfs_rq[i]->task_weight;
1653 }
1654
1655 aggregate(tg, sd)->rq_weight = rq_weight;
1656 aggregate(tg, sd)->task_weight = task_weight;
1657}
1658
1659/*
1660 * Compute the weight of this group on the given cpus.
1661 */
1662static
1663void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
1664{
1665 unsigned long shares = 0;
1666 int i;
1667
1668 for_each_cpu_mask(i, sd->span)
1669 shares += tg->cfs_rq[i]->shares;
1670
1671 if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
1672 shares = tg->shares;
1673
1674 aggregate(tg, sd)->shares = shares;
1675}
1676
1677/*
1678 * Compute the load fraction assigned to this group, relies on the aggregate
1679 * weight and this group's parent's load, i.e. top-down.
1680 */
1681static
1682void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
1683{
1684 unsigned long load;
1685
1686 if (!tg->parent) {
1687 int i;
1688
1689 load = 0;
1690 for_each_cpu_mask(i, sd->span)
1691 load += cpu_rq(i)->load.weight;
1692
1693 } else {
1694 load = aggregate(tg->parent, sd)->load;
1695
1696 /*
1697 * shares is our weight in the parent's rq so
1698 * shares/parent->rq_weight gives our fraction of the load
1699 */
1700 load *= aggregate(tg, sd)->shares;
1701 load /= aggregate(tg->parent, sd)->rq_weight + 1;
1702 }
1703
1704 aggregate(tg, sd)->load = load;
1705}
1706
1707static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1708
1709/*
1710 * Calculate and set the cpu's group shares.
1711 */
1712static void
1713__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
1714 int tcpu)
1715{
1716 int boost = 0;
1717 unsigned long shares;
1718 unsigned long rq_weight;
1719
1720 if (!tg->se[tcpu])
1721 return;
1722
1723 rq_weight = tg->cfs_rq[tcpu]->load.weight;
1724
1725 /*
1726 * If there are currently no tasks on the cpu pretend there is one of
1727 * average load so that when a new task gets to run here it will not
1728 * get delayed by group starvation.
1729 */
1730 if (!rq_weight) {
1731 boost = 1;
1732 rq_weight = NICE_0_LOAD;
1733 }
1734
1735 /*
1736 * \Sum shares * rq_weight
1737 * shares = -----------------------
1738 * \Sum rq_weight
1739 *
1740 */
1741 shares = aggregate(tg, sd)->shares * rq_weight;
1742 shares /= aggregate(tg, sd)->rq_weight + 1;
1743
1744 /*
1745 * record the actual number of shares, not the boosted amount.
1746 */
1747 tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
1748
1749 if (shares < MIN_SHARES)
1750 shares = MIN_SHARES;
1751
1752 __set_se_shares(tg->se[tcpu], shares);
1753}
1754
1755/*
1756 * Re-adjust the weights on the cpu the task came from and on the cpu the
1757 * task went to.
1758 */
1759static void
1760__move_group_shares(struct task_group *tg, struct sched_domain *sd,
1761 int scpu, int dcpu)
1762{
1763 unsigned long shares;
1764
1765 shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
1766
1767 __update_group_shares_cpu(tg, sd, scpu);
1768 __update_group_shares_cpu(tg, sd, dcpu);
1769
1770 /*
1771 * ensure we never loose shares due to rounding errors in the
1772 * above redistribution.
1773 */
1774 shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
1775 if (shares)
1776 tg->cfs_rq[dcpu]->shares += shares;
1777}
1778
1779/*
1780 * Because changing a group's shares changes the weight of the super-group
1781 * we need to walk up the tree and change all shares until we hit the root.
1782 */
1783static void
1784move_group_shares(struct task_group *tg, struct sched_domain *sd,
1785 int scpu, int dcpu)
1786{
1787 while (tg) {
1788 __move_group_shares(tg, sd, scpu, dcpu);
1789 tg = tg->parent;
1790 }
1791}
1792
1793static
1794void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
1795{
1796 unsigned long shares = aggregate(tg, sd)->shares;
1797 int i;
1798
1799 for_each_cpu_mask(i, sd->span) {
1800 struct rq *rq = cpu_rq(i);
1801 unsigned long flags;
1802
1803 spin_lock_irqsave(&rq->lock, flags);
1804 __update_group_shares_cpu(tg, sd, i);
1805 spin_unlock_irqrestore(&rq->lock, flags);
1806 }
1807
1808 aggregate_group_shares(tg, sd);
1809
1810 /*
1811 * ensure we never loose shares due to rounding errors in the
1812 * above redistribution.
1813 */
1814 shares -= aggregate(tg, sd)->shares;
1815 if (shares) {
1816 tg->cfs_rq[sd->first_cpu]->shares += shares;
1817 aggregate(tg, sd)->shares += shares;
1818 }
1819}
1820
/*
 * "down" callback of the tree walk: compute weight, shares and load of
 * @tg, in that order (the load depends on the shares computed before it).
 */
static
void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
{
	aggregate_group_weight(tg, sd);
	aggregate_group_shares(tg, sd);
	aggregate_group_load(tg, sd);
}
1832
/*
 * "up" callback of the tree walk: redistribute the cpu shares of @tg.
 */
static
void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
{
	aggregate_group_set_shares(tg, sd);
}
1841
1842static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
1843
1844static void __init init_aggregate(void)
1845{
1846 int i;
1847
1848 for_each_possible_cpu(i)
1849 spin_lock_init(&per_cpu(aggregate_lock, i));
1850}
1851
1852static int get_aggregate(struct sched_domain *sd)
1853{
1854 if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
1855 return 0;
1856
1857 aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
1858 return 1;
1859}
1860
1861static void put_aggregate(struct sched_domain *sd)
1862{
1863 spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
1864}
1865
1866static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1867{
1868 cfs_rq->shares = shares;
1869}
1870
1871#else
1872
/* No aggregate state to set up in this configuration. */
static inline void init_aggregate(void)
{
}

/* Always reports that no aggregate was taken. */
static inline int get_aggregate(struct sched_domain *sd)
{
	return 0;
}

/* Nothing to release in this configuration. */
static inline void put_aggregate(struct sched_domain *sd)
{
}
1886
1887#else /* CONFIG_SMP */
1888
1889#ifdef CONFIG_FAIR_GROUP_SCHED
/* !CONFIG_SMP variant: intentionally does nothing. */
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
}
1893#endif
1894
1206#endif /* CONFIG_SMP */ 1895#endif /* CONFIG_SMP */
1207 1896
1208#include "sched_stats.h" 1897#include "sched_stats.h"
@@ -1215,26 +1904,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1215 1904
1216#define sched_class_highest (&rt_sched_class) 1905#define sched_class_highest (&rt_sched_class)
1217 1906
1218static inline void inc_load(struct rq *rq, const struct task_struct *p) 1907static void inc_nr_running(struct rq *rq)
1219{
1220 update_load_add(&rq->load, p->se.load.weight);
1221}
1222
1223static inline void dec_load(struct rq *rq, const struct task_struct *p)
1224{
1225 update_load_sub(&rq->load, p->se.load.weight);
1226}
1227
1228static void inc_nr_running(struct task_struct *p, struct rq *rq)
1229{ 1908{
1230 rq->nr_running++; 1909 rq->nr_running++;
1231 inc_load(rq, p);
1232} 1910}
1233 1911
1234static void dec_nr_running(struct task_struct *p, struct rq *rq) 1912static void dec_nr_running(struct rq *rq)
1235{ 1913{
1236 rq->nr_running--; 1914 rq->nr_running--;
1237 dec_load(rq, p);
1238} 1915}
1239 1916
1240static void set_load_weight(struct task_struct *p) 1917static void set_load_weight(struct task_struct *p)
@@ -1326,7 +2003,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1326 rq->nr_uninterruptible--; 2003 rq->nr_uninterruptible--;
1327 2004
1328 enqueue_task(rq, p, wakeup); 2005 enqueue_task(rq, p, wakeup);
1329 inc_nr_running(p, rq); 2006 inc_nr_running(rq);
1330} 2007}
1331 2008
1332/* 2009/*
@@ -1338,7 +2015,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1338 rq->nr_uninterruptible++; 2015 rq->nr_uninterruptible++;
1339 2016
1340 dequeue_task(rq, p, sleep); 2017 dequeue_task(rq, p, sleep);
1341 dec_nr_running(p, rq); 2018 dec_nr_running(rq);
1342} 2019}
1343 2020
1344/** 2021/**
@@ -1395,7 +2072,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1395 /* 2072 /*
1396 * Buddy candidates are cache hot: 2073 * Buddy candidates are cache hot:
1397 */ 2074 */
1398 if (&p->se == cfs_rq_of(&p->se)->next) 2075 if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
1399 return 1; 2076 return 1;
1400 2077
1401 if (p->sched_class != &fair_sched_class) 2078 if (p->sched_class != &fair_sched_class)
@@ -1685,17 +2362,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1685 * find_idlest_cpu - find the idlest cpu among the cpus in group. 2362 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1686 */ 2363 */
1687static int 2364static int
1688find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 2365find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
2366 cpumask_t *tmp)
1689{ 2367{
1690 cpumask_t tmp;
1691 unsigned long load, min_load = ULONG_MAX; 2368 unsigned long load, min_load = ULONG_MAX;
1692 int idlest = -1; 2369 int idlest = -1;
1693 int i; 2370 int i;
1694 2371
1695 /* Traverse only the allowed CPUs */ 2372 /* Traverse only the allowed CPUs */
1696 cpus_and(tmp, group->cpumask, p->cpus_allowed); 2373 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
1697 2374
1698 for_each_cpu_mask(i, tmp) { 2375 for_each_cpu_mask(i, *tmp) {
1699 load = weighted_cpuload(i); 2376 load = weighted_cpuload(i);
1700 2377
1701 if (load < min_load || (load == min_load && i == this_cpu)) { 2378 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1734,7 +2411,7 @@ static int sched_balance_self(int cpu, int flag)
1734 } 2411 }
1735 2412
1736 while (sd) { 2413 while (sd) {
1737 cpumask_t span; 2414 cpumask_t span, tmpmask;
1738 struct sched_group *group; 2415 struct sched_group *group;
1739 int new_cpu, weight; 2416 int new_cpu, weight;
1740 2417
@@ -1750,7 +2427,7 @@ static int sched_balance_self(int cpu, int flag)
1750 continue; 2427 continue;
1751 } 2428 }
1752 2429
1753 new_cpu = find_idlest_cpu(group, t, cpu); 2430 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
1754 if (new_cpu == -1 || new_cpu == cpu) { 2431 if (new_cpu == -1 || new_cpu == cpu) {
1755 /* Now try balancing at a lower domain level of cpu */ 2432 /* Now try balancing at a lower domain level of cpu */
1756 sd = sd->child; 2433 sd = sd->child;
@@ -1796,6 +2473,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1796 long old_state; 2473 long old_state;
1797 struct rq *rq; 2474 struct rq *rq;
1798 2475
2476 if (!sched_feat(SYNC_WAKEUPS))
2477 sync = 0;
2478
1799 smp_wmb(); 2479 smp_wmb();
1800 rq = task_rq_lock(p, &flags); 2480 rq = task_rq_lock(p, &flags);
1801 old_state = p->state; 2481 old_state = p->state;
@@ -1912,6 +2592,7 @@ static void __sched_fork(struct task_struct *p)
1912 2592
1913 INIT_LIST_HEAD(&p->rt.run_list); 2593 INIT_LIST_HEAD(&p->rt.run_list);
1914 p->se.on_rq = 0; 2594 p->se.on_rq = 0;
2595 INIT_LIST_HEAD(&p->se.group_node);
1915 2596
1916#ifdef CONFIG_PREEMPT_NOTIFIERS 2597#ifdef CONFIG_PREEMPT_NOTIFIERS
1917 INIT_HLIST_HEAD(&p->preempt_notifiers); 2598 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -1987,7 +2668,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1987 * management (if any): 2668 * management (if any):
1988 */ 2669 */
1989 p->sched_class->task_new(rq, p); 2670 p->sched_class->task_new(rq, p);
1990 inc_nr_running(p, rq); 2671 inc_nr_running(rq);
1991 } 2672 }
1992 check_preempt_curr(rq, p); 2673 check_preempt_curr(rq, p);
1993#ifdef CONFIG_SMP 2674#ifdef CONFIG_SMP
@@ -2631,7 +3312,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2631static struct sched_group * 3312static struct sched_group *
2632find_busiest_group(struct sched_domain *sd, int this_cpu, 3313find_busiest_group(struct sched_domain *sd, int this_cpu,
2633 unsigned long *imbalance, enum cpu_idle_type idle, 3314 unsigned long *imbalance, enum cpu_idle_type idle,
2634 int *sd_idle, cpumask_t *cpus, int *balance) 3315 int *sd_idle, const cpumask_t *cpus, int *balance)
2635{ 3316{
2636 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3317 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2637 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3318 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2932,7 +3613,7 @@ ret:
2932 */ 3613 */
2933static struct rq * 3614static struct rq *
2934find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 3615find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2935 unsigned long imbalance, cpumask_t *cpus) 3616 unsigned long imbalance, const cpumask_t *cpus)
2936{ 3617{
2937 struct rq *busiest = NULL, *rq; 3618 struct rq *busiest = NULL, *rq;
2938 unsigned long max_load = 0; 3619 unsigned long max_load = 0;
@@ -2971,14 +3652,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2971 */ 3652 */
2972static int load_balance(int this_cpu, struct rq *this_rq, 3653static int load_balance(int this_cpu, struct rq *this_rq,
2973 struct sched_domain *sd, enum cpu_idle_type idle, 3654 struct sched_domain *sd, enum cpu_idle_type idle,
2974 int *balance) 3655 int *balance, cpumask_t *cpus)
2975{ 3656{
2976 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3657 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2977 struct sched_group *group; 3658 struct sched_group *group;
2978 unsigned long imbalance; 3659 unsigned long imbalance;
2979 struct rq *busiest; 3660 struct rq *busiest;
2980 cpumask_t cpus = CPU_MASK_ALL;
2981 unsigned long flags; 3661 unsigned long flags;
3662 int unlock_aggregate;
3663
3664 cpus_setall(*cpus);
3665
3666 unlock_aggregate = get_aggregate(sd);
2982 3667
2983 /* 3668 /*
2984 * When power savings policy is enabled for the parent domain, idle 3669 * When power savings policy is enabled for the parent domain, idle
@@ -2994,7 +3679,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2994 3679
2995redo: 3680redo:
2996 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3681 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2997 &cpus, balance); 3682 cpus, balance);
2998 3683
2999 if (*balance == 0) 3684 if (*balance == 0)
3000 goto out_balanced; 3685 goto out_balanced;
@@ -3004,7 +3689,7 @@ redo:
3004 goto out_balanced; 3689 goto out_balanced;
3005 } 3690 }
3006 3691
3007 busiest = find_busiest_queue(group, idle, imbalance, &cpus); 3692 busiest = find_busiest_queue(group, idle, imbalance, cpus);
3008 if (!busiest) { 3693 if (!busiest) {
3009 schedstat_inc(sd, lb_nobusyq[idle]); 3694 schedstat_inc(sd, lb_nobusyq[idle]);
3010 goto out_balanced; 3695 goto out_balanced;
@@ -3037,8 +3722,8 @@ redo:
3037 3722
3038 /* All tasks on this runqueue were pinned by CPU affinity */ 3723 /* All tasks on this runqueue were pinned by CPU affinity */
3039 if (unlikely(all_pinned)) { 3724 if (unlikely(all_pinned)) {
3040 cpu_clear(cpu_of(busiest), cpus); 3725 cpu_clear(cpu_of(busiest), *cpus);
3041 if (!cpus_empty(cpus)) 3726 if (!cpus_empty(*cpus))
3042 goto redo; 3727 goto redo;
3043 goto out_balanced; 3728 goto out_balanced;
3044 } 3729 }
@@ -3095,8 +3780,9 @@ redo:
3095 3780
3096 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3781 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3097 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3782 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3098 return -1; 3783 ld_moved = -1;
3099 return ld_moved; 3784
3785 goto out;
3100 3786
3101out_balanced: 3787out_balanced:
3102 schedstat_inc(sd, lb_balanced[idle]); 3788 schedstat_inc(sd, lb_balanced[idle]);
@@ -3111,8 +3797,13 @@ out_one_pinned:
3111 3797
3112 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3798 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3113 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3799 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3114 return -1; 3800 ld_moved = -1;
3115 return 0; 3801 else
3802 ld_moved = 0;
3803out:
3804 if (unlock_aggregate)
3805 put_aggregate(sd);
3806 return ld_moved;
3116} 3807}
3117 3808
3118/* 3809/*
@@ -3123,7 +3814,8 @@ out_one_pinned:
3123 * this_rq is locked. 3814 * this_rq is locked.
3124 */ 3815 */
3125static int 3816static int
3126load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) 3817load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3818 cpumask_t *cpus)
3127{ 3819{
3128 struct sched_group *group; 3820 struct sched_group *group;
3129 struct rq *busiest = NULL; 3821 struct rq *busiest = NULL;
@@ -3131,7 +3823,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3131 int ld_moved = 0; 3823 int ld_moved = 0;
3132 int sd_idle = 0; 3824 int sd_idle = 0;
3133 int all_pinned = 0; 3825 int all_pinned = 0;
3134 cpumask_t cpus = CPU_MASK_ALL; 3826
3827 cpus_setall(*cpus);
3135 3828
3136 /* 3829 /*
3137 * When power savings policy is enabled for the parent domain, idle 3830 * When power savings policy is enabled for the parent domain, idle
@@ -3146,14 +3839,13 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3146 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3839 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3147redo: 3840redo:
3148 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3841 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3149 &sd_idle, &cpus, NULL); 3842 &sd_idle, cpus, NULL);
3150 if (!group) { 3843 if (!group) {
3151 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); 3844 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
3152 goto out_balanced; 3845 goto out_balanced;
3153 } 3846 }
3154 3847
3155 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, 3848 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
3156 &cpus);
3157 if (!busiest) { 3849 if (!busiest) {
3158 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); 3850 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
3159 goto out_balanced; 3851 goto out_balanced;
@@ -3175,8 +3867,8 @@ redo:
3175 spin_unlock(&busiest->lock); 3867 spin_unlock(&busiest->lock);
3176 3868
3177 if (unlikely(all_pinned)) { 3869 if (unlikely(all_pinned)) {
3178 cpu_clear(cpu_of(busiest), cpus); 3870 cpu_clear(cpu_of(busiest), *cpus);
3179 if (!cpus_empty(cpus)) 3871 if (!cpus_empty(*cpus))
3180 goto redo; 3872 goto redo;
3181 } 3873 }
3182 } 3874 }
@@ -3210,6 +3902,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3210 struct sched_domain *sd; 3902 struct sched_domain *sd;
3211 int pulled_task = -1; 3903 int pulled_task = -1;
3212 unsigned long next_balance = jiffies + HZ; 3904 unsigned long next_balance = jiffies + HZ;
3905 cpumask_t tmpmask;
3213 3906
3214 for_each_domain(this_cpu, sd) { 3907 for_each_domain(this_cpu, sd) {
3215 unsigned long interval; 3908 unsigned long interval;
@@ -3219,8 +3912,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3219 3912
3220 if (sd->flags & SD_BALANCE_NEWIDLE) 3913 if (sd->flags & SD_BALANCE_NEWIDLE)
3221 /* If we've pulled tasks over stop searching: */ 3914 /* If we've pulled tasks over stop searching: */
3222 pulled_task = load_balance_newidle(this_cpu, 3915 pulled_task = load_balance_newidle(this_cpu, this_rq,
3223 this_rq, sd); 3916 sd, &tmpmask);
3224 3917
3225 interval = msecs_to_jiffies(sd->balance_interval); 3918 interval = msecs_to_jiffies(sd->balance_interval);
3226 if (time_after(next_balance, sd->last_balance + interval)) 3919 if (time_after(next_balance, sd->last_balance + interval))
@@ -3379,6 +4072,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3379 /* Earliest time when we have to do rebalance again */ 4072 /* Earliest time when we have to do rebalance again */
3380 unsigned long next_balance = jiffies + 60*HZ; 4073 unsigned long next_balance = jiffies + 60*HZ;
3381 int update_next_balance = 0; 4074 int update_next_balance = 0;
4075 cpumask_t tmp;
3382 4076
3383 for_each_domain(cpu, sd) { 4077 for_each_domain(cpu, sd) {
3384 if (!(sd->flags & SD_LOAD_BALANCE)) 4078 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3402,7 +4096,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3402 } 4096 }
3403 4097
3404 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4098 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3405 if (load_balance(cpu, rq, sd, idle, &balance)) { 4099 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
3406 /* 4100 /*
3407 * We've pulled tasks over so either we're no 4101 * We've pulled tasks over so either we're no
3408 * longer idle, or one of our SMT siblings is 4102 * longer idle, or one of our SMT siblings is
@@ -3518,7 +4212,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
3518 */ 4212 */
3519 int ilb = first_cpu(nohz.cpu_mask); 4213 int ilb = first_cpu(nohz.cpu_mask);
3520 4214
3521 if (ilb != NR_CPUS) 4215 if (ilb < nr_cpu_ids)
3522 resched_cpu(ilb); 4216 resched_cpu(ilb);
3523 } 4217 }
3524 } 4218 }
@@ -3722,9 +4416,9 @@ void scheduler_tick(void)
3722 rq->clock_underflows++; 4416 rq->clock_underflows++;
3723 } 4417 }
3724 rq->tick_timestamp = rq->clock; 4418 rq->tick_timestamp = rq->clock;
4419 update_last_tick_seen(rq);
3725 update_cpu_load(rq); 4420 update_cpu_load(rq);
3726 curr->sched_class->task_tick(rq, curr, 0); 4421 curr->sched_class->task_tick(rq, curr, 0);
3727 update_sched_rt_period(rq);
3728 spin_unlock(&rq->lock); 4422 spin_unlock(&rq->lock);
3729 4423
3730#ifdef CONFIG_SMP 4424#ifdef CONFIG_SMP
@@ -4324,10 +5018,8 @@ void set_user_nice(struct task_struct *p, long nice)
4324 goto out_unlock; 5018 goto out_unlock;
4325 } 5019 }
4326 on_rq = p->se.on_rq; 5020 on_rq = p->se.on_rq;
4327 if (on_rq) { 5021 if (on_rq)
4328 dequeue_task(rq, p, 0); 5022 dequeue_task(rq, p, 0);
4329 dec_load(rq, p);
4330 }
4331 5023
4332 p->static_prio = NICE_TO_PRIO(nice); 5024 p->static_prio = NICE_TO_PRIO(nice);
4333 set_load_weight(p); 5025 set_load_weight(p);
@@ -4337,7 +5029,6 @@ void set_user_nice(struct task_struct *p, long nice)
4337 5029
4338 if (on_rq) { 5030 if (on_rq) {
4339 enqueue_task(rq, p, 0); 5031 enqueue_task(rq, p, 0);
4340 inc_load(rq, p);
4341 /* 5032 /*
4342 * If the task increased its priority or is running and 5033 * If the task increased its priority or is running and
4343 * lowered its priority, then reschedule its CPU: 5034 * lowered its priority, then reschedule its CPU:
@@ -4559,7 +5250,7 @@ recheck:
4559 * Do not allow realtime tasks into groups that have no runtime 5250 * Do not allow realtime tasks into groups that have no runtime
4560 * assigned. 5251 * assigned.
4561 */ 5252 */
4562 if (rt_policy(policy) && task_group(p)->rt_runtime == 0) 5253 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
4563 return -EPERM; 5254 return -EPERM;
4564#endif 5255#endif
4565 5256
@@ -4721,9 +5412,10 @@ out_unlock:
4721 return retval; 5412 return retval;
4722} 5413}
4723 5414
4724long sched_setaffinity(pid_t pid, cpumask_t new_mask) 5415long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
4725{ 5416{
4726 cpumask_t cpus_allowed; 5417 cpumask_t cpus_allowed;
5418 cpumask_t new_mask = *in_mask;
4727 struct task_struct *p; 5419 struct task_struct *p;
4728 int retval; 5420 int retval;
4729 5421
@@ -4754,13 +5446,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4754 if (retval) 5446 if (retval)
4755 goto out_unlock; 5447 goto out_unlock;
4756 5448
4757 cpus_allowed = cpuset_cpus_allowed(p); 5449 cpuset_cpus_allowed(p, &cpus_allowed);
4758 cpus_and(new_mask, new_mask, cpus_allowed); 5450 cpus_and(new_mask, new_mask, cpus_allowed);
4759 again: 5451 again:
4760 retval = set_cpus_allowed(p, new_mask); 5452 retval = set_cpus_allowed_ptr(p, &new_mask);
4761 5453
4762 if (!retval) { 5454 if (!retval) {
4763 cpus_allowed = cpuset_cpus_allowed(p); 5455 cpuset_cpus_allowed(p, &cpus_allowed);
4764 if (!cpus_subset(new_mask, cpus_allowed)) { 5456 if (!cpus_subset(new_mask, cpus_allowed)) {
4765 /* 5457 /*
4766 * We must have raced with a concurrent cpuset 5458 * We must have raced with a concurrent cpuset
@@ -4804,7 +5496,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4804 if (retval) 5496 if (retval)
4805 return retval; 5497 return retval;
4806 5498
4807 return sched_setaffinity(pid, new_mask); 5499 return sched_setaffinity(pid, &new_mask);
4808} 5500}
4809 5501
4810/* 5502/*
@@ -5266,7 +5958,6 @@ static inline void sched_init_granularity(void)
5266 sysctl_sched_latency = limit; 5958 sysctl_sched_latency = limit;
5267 5959
5268 sysctl_sched_wakeup_granularity *= factor; 5960 sysctl_sched_wakeup_granularity *= factor;
5269 sysctl_sched_batch_wakeup_granularity *= factor;
5270} 5961}
5271 5962
5272#ifdef CONFIG_SMP 5963#ifdef CONFIG_SMP
@@ -5295,7 +5986,7 @@ static inline void sched_init_granularity(void)
5295 * task must not exit() & deallocate itself prematurely. The 5986 * task must not exit() & deallocate itself prematurely. The
5296 * call is not atomic; no spinlocks may be held. 5987 * call is not atomic; no spinlocks may be held.
5297 */ 5988 */
5298int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 5989int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5299{ 5990{
5300 struct migration_req req; 5991 struct migration_req req;
5301 unsigned long flags; 5992 unsigned long flags;
@@ -5303,23 +5994,23 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5303 int ret = 0; 5994 int ret = 0;
5304 5995
5305 rq = task_rq_lock(p, &flags); 5996 rq = task_rq_lock(p, &flags);
5306 if (!cpus_intersects(new_mask, cpu_online_map)) { 5997 if (!cpus_intersects(*new_mask, cpu_online_map)) {
5307 ret = -EINVAL; 5998 ret = -EINVAL;
5308 goto out; 5999 goto out;
5309 } 6000 }
5310 6001
5311 if (p->sched_class->set_cpus_allowed) 6002 if (p->sched_class->set_cpus_allowed)
5312 p->sched_class->set_cpus_allowed(p, &new_mask); 6003 p->sched_class->set_cpus_allowed(p, new_mask);
5313 else { 6004 else {
5314 p->cpus_allowed = new_mask; 6005 p->cpus_allowed = *new_mask;
5315 p->rt.nr_cpus_allowed = cpus_weight(new_mask); 6006 p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
5316 } 6007 }
5317 6008
5318 /* Can the task run on the task's current CPU? If so, we're done */ 6009 /* Can the task run on the task's current CPU? If so, we're done */
5319 if (cpu_isset(task_cpu(p), new_mask)) 6010 if (cpu_isset(task_cpu(p), *new_mask))
5320 goto out; 6011 goto out;
5321 6012
5322 if (migrate_task(p, any_online_cpu(new_mask), &req)) { 6013 if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
5323 /* Need help from migration thread: drop lock and wait. */ 6014 /* Need help from migration thread: drop lock and wait. */
5324 task_rq_unlock(rq, &flags); 6015 task_rq_unlock(rq, &flags);
5325 wake_up_process(rq->migration_thread); 6016 wake_up_process(rq->migration_thread);
@@ -5332,7 +6023,7 @@ out:
5332 6023
5333 return ret; 6024 return ret;
5334} 6025}
5335EXPORT_SYMBOL_GPL(set_cpus_allowed); 6026EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5336 6027
5337/* 6028/*
5338 * Move (not current) task off this cpu, onto dest cpu. We're doing 6029 * Move (not current) task off this cpu, onto dest cpu. We're doing
@@ -5470,12 +6161,14 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5470 dest_cpu = any_online_cpu(mask); 6161 dest_cpu = any_online_cpu(mask);
5471 6162
5472 /* On any allowed CPU? */ 6163 /* On any allowed CPU? */
5473 if (dest_cpu == NR_CPUS) 6164 if (dest_cpu >= nr_cpu_ids)
5474 dest_cpu = any_online_cpu(p->cpus_allowed); 6165 dest_cpu = any_online_cpu(p->cpus_allowed);
5475 6166
5476 /* No more Mr. Nice Guy. */ 6167 /* No more Mr. Nice Guy. */
5477 if (dest_cpu == NR_CPUS) { 6168 if (dest_cpu >= nr_cpu_ids) {
5478 cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); 6169 cpumask_t cpus_allowed;
6170
6171 cpuset_cpus_allowed_locked(p, &cpus_allowed);
5479 /* 6172 /*
5480 * Try to stay on the same cpuset, where the 6173 * Try to stay on the same cpuset, where the
5481 * current cpuset may be a subset of all cpus. 6174 * current cpuset may be a subset of all cpus.
@@ -5511,7 +6204,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5511 */ 6204 */
5512static void migrate_nr_uninterruptible(struct rq *rq_src) 6205static void migrate_nr_uninterruptible(struct rq *rq_src)
5513{ 6206{
5514 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 6207 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
5515 unsigned long flags; 6208 unsigned long flags;
5516 6209
5517 local_irq_save(flags); 6210 local_irq_save(flags);
@@ -5923,20 +6616,16 @@ void __init migration_init(void)
5923 6616
5924#ifdef CONFIG_SMP 6617#ifdef CONFIG_SMP
5925 6618
5926/* Number of possible processor ids */
5927int nr_cpu_ids __read_mostly = NR_CPUS;
5928EXPORT_SYMBOL(nr_cpu_ids);
5929
5930#ifdef CONFIG_SCHED_DEBUG 6619#ifdef CONFIG_SCHED_DEBUG
5931 6620
5932static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) 6621static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6622 cpumask_t *groupmask)
5933{ 6623{
5934 struct sched_group *group = sd->groups; 6624 struct sched_group *group = sd->groups;
5935 cpumask_t groupmask; 6625 char str[256];
5936 char str[NR_CPUS];
5937 6626
5938 cpumask_scnprintf(str, NR_CPUS, sd->span); 6627 cpulist_scnprintf(str, sizeof(str), sd->span);
5939 cpus_clear(groupmask); 6628 cpus_clear(*groupmask);
5940 6629
5941 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6630 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5942 6631
@@ -5980,25 +6669,25 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
5980 break; 6669 break;
5981 } 6670 }
5982 6671
5983 if (cpus_intersects(groupmask, group->cpumask)) { 6672 if (cpus_intersects(*groupmask, group->cpumask)) {
5984 printk(KERN_CONT "\n"); 6673 printk(KERN_CONT "\n");
5985 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6674 printk(KERN_ERR "ERROR: repeated CPUs\n");
5986 break; 6675 break;
5987 } 6676 }
5988 6677
5989 cpus_or(groupmask, groupmask, group->cpumask); 6678 cpus_or(*groupmask, *groupmask, group->cpumask);
5990 6679
5991 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 6680 cpulist_scnprintf(str, sizeof(str), group->cpumask);
5992 printk(KERN_CONT " %s", str); 6681 printk(KERN_CONT " %s", str);
5993 6682
5994 group = group->next; 6683 group = group->next;
5995 } while (group != sd->groups); 6684 } while (group != sd->groups);
5996 printk(KERN_CONT "\n"); 6685 printk(KERN_CONT "\n");
5997 6686
5998 if (!cpus_equal(sd->span, groupmask)) 6687 if (!cpus_equal(sd->span, *groupmask))
5999 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6688 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6000 6689
6001 if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) 6690 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
6002 printk(KERN_ERR "ERROR: parent span is not a superset " 6691 printk(KERN_ERR "ERROR: parent span is not a superset "
6003 "of domain->span\n"); 6692 "of domain->span\n");
6004 return 0; 6693 return 0;
@@ -6006,6 +6695,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
6006 6695
6007static void sched_domain_debug(struct sched_domain *sd, int cpu) 6696static void sched_domain_debug(struct sched_domain *sd, int cpu)
6008{ 6697{
6698 cpumask_t *groupmask;
6009 int level = 0; 6699 int level = 0;
6010 6700
6011 if (!sd) { 6701 if (!sd) {
@@ -6015,14 +6705,21 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6015 6705
6016 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6706 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6017 6707
6708 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6709 if (!groupmask) {
6710 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6711 return;
6712 }
6713
6018 for (;;) { 6714 for (;;) {
6019 if (sched_domain_debug_one(sd, cpu, level)) 6715 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6020 break; 6716 break;
6021 level++; 6717 level++;
6022 sd = sd->parent; 6718 sd = sd->parent;
6023 if (!sd) 6719 if (!sd)
6024 break; 6720 break;
6025 } 6721 }
6722 kfree(groupmask);
6026} 6723}
6027#else 6724#else
6028# define sched_domain_debug(sd, cpu) do { } while (0) 6725# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6210,30 +6907,33 @@ __setup("isolcpus=", isolated_cpu_setup);
6210 * and ->cpu_power to 0. 6907 * and ->cpu_power to 0.
6211 */ 6908 */
6212static void 6909static void
6213init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, 6910init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6214 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 6911 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
6215 struct sched_group **sg)) 6912 struct sched_group **sg,
6913 cpumask_t *tmpmask),
6914 cpumask_t *covered, cpumask_t *tmpmask)
6216{ 6915{
6217 struct sched_group *first = NULL, *last = NULL; 6916 struct sched_group *first = NULL, *last = NULL;
6218 cpumask_t covered = CPU_MASK_NONE;
6219 int i; 6917 int i;
6220 6918
6221 for_each_cpu_mask(i, span) { 6919 cpus_clear(*covered);
6920
6921 for_each_cpu_mask(i, *span) {
6222 struct sched_group *sg; 6922 struct sched_group *sg;
6223 int group = group_fn(i, cpu_map, &sg); 6923 int group = group_fn(i, cpu_map, &sg, tmpmask);
6224 int j; 6924 int j;
6225 6925
6226 if (cpu_isset(i, covered)) 6926 if (cpu_isset(i, *covered))
6227 continue; 6927 continue;
6228 6928
6229 sg->cpumask = CPU_MASK_NONE; 6929 cpus_clear(sg->cpumask);
6230 sg->__cpu_power = 0; 6930 sg->__cpu_power = 0;
6231 6931
6232 for_each_cpu_mask(j, span) { 6932 for_each_cpu_mask(j, *span) {
6233 if (group_fn(j, cpu_map, NULL) != group) 6933 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6234 continue; 6934 continue;
6235 6935
6236 cpu_set(j, covered); 6936 cpu_set(j, *covered);
6237 cpu_set(j, sg->cpumask); 6937 cpu_set(j, sg->cpumask);
6238 } 6938 }
6239 if (!first) 6939 if (!first)
@@ -6259,7 +6959,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
6259 * 6959 *
6260 * Should use nodemask_t. 6960 * Should use nodemask_t.
6261 */ 6961 */
6262static int find_next_best_node(int node, unsigned long *used_nodes) 6962static int find_next_best_node(int node, nodemask_t *used_nodes)
6263{ 6963{
6264 int i, n, val, min_val, best_node = 0; 6964 int i, n, val, min_val, best_node = 0;
6265 6965
@@ -6273,7 +6973,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
6273 continue; 6973 continue;
6274 6974
6275 /* Skip already used nodes */ 6975 /* Skip already used nodes */
6276 if (test_bit(n, used_nodes)) 6976 if (node_isset(n, *used_nodes))
6277 continue; 6977 continue;
6278 6978
6279 /* Simple min distance search */ 6979 /* Simple min distance search */
@@ -6285,40 +6985,37 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
6285 } 6985 }
6286 } 6986 }
6287 6987
6288 set_bit(best_node, used_nodes); 6988 node_set(best_node, *used_nodes);
6289 return best_node; 6989 return best_node;
6290} 6990}
6291 6991
6292/** 6992/**
6293 * sched_domain_node_span - get a cpumask for a node's sched_domain 6993 * sched_domain_node_span - get a cpumask for a node's sched_domain
6294 * @node: node whose cpumask we're constructing 6994 * @node: node whose cpumask we're constructing
6295 * @size: number of nodes to include in this span 6995 * @span: resulting cpumask
6296 * 6996 *
6297 * Given a node, construct a good cpumask for its sched_domain to span. It 6997 * Given a node, construct a good cpumask for its sched_domain to span. It
6298 * should be one that prevents unnecessary balancing, but also spreads tasks 6998 * should be one that prevents unnecessary balancing, but also spreads tasks
6299 * out optimally. 6999 * out optimally.
6300 */ 7000 */
6301static cpumask_t sched_domain_node_span(int node) 7001static void sched_domain_node_span(int node, cpumask_t *span)
6302{ 7002{
6303 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 7003 nodemask_t used_nodes;
6304 cpumask_t span, nodemask; 7004 node_to_cpumask_ptr(nodemask, node);
6305 int i; 7005 int i;
6306 7006
6307 cpus_clear(span); 7007 cpus_clear(*span);
6308 bitmap_zero(used_nodes, MAX_NUMNODES); 7008 nodes_clear(used_nodes);
6309 7009
6310 nodemask = node_to_cpumask(node); 7010 cpus_or(*span, *span, *nodemask);
6311 cpus_or(span, span, nodemask); 7011 node_set(node, used_nodes);
6312 set_bit(node, used_nodes);
6313 7012
6314 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7013 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6315 int next_node = find_next_best_node(node, used_nodes); 7014 int next_node = find_next_best_node(node, &used_nodes);
6316 7015
6317 nodemask = node_to_cpumask(next_node); 7016 node_to_cpumask_ptr_next(nodemask, next_node);
6318 cpus_or(span, span, nodemask); 7017 cpus_or(*span, *span, *nodemask);
6319 } 7018 }
6320
6321 return span;
6322} 7019}
6323#endif 7020#endif
6324 7021
@@ -6332,7 +7029,8 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6332static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 7029static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6333 7030
6334static int 7031static int
6335cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 7032cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7033 cpumask_t *unused)
6336{ 7034{
6337 if (sg) 7035 if (sg)
6338 *sg = &per_cpu(sched_group_cpus, cpu); 7036 *sg = &per_cpu(sched_group_cpus, cpu);
@@ -6350,19 +7048,22 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6350 7048
6351#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7049#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6352static int 7050static int
6353cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 7051cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7052 cpumask_t *mask)
6354{ 7053{
6355 int group; 7054 int group;
6356 cpumask_t mask = per_cpu(cpu_sibling_map, cpu); 7055
6357 cpus_and(mask, mask, *cpu_map); 7056 *mask = per_cpu(cpu_sibling_map, cpu);
6358 group = first_cpu(mask); 7057 cpus_and(*mask, *mask, *cpu_map);
7058 group = first_cpu(*mask);
6359 if (sg) 7059 if (sg)
6360 *sg = &per_cpu(sched_group_core, group); 7060 *sg = &per_cpu(sched_group_core, group);
6361 return group; 7061 return group;
6362} 7062}
6363#elif defined(CONFIG_SCHED_MC) 7063#elif defined(CONFIG_SCHED_MC)
6364static int 7064static int
6365cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 7065cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7066 cpumask_t *unused)
6366{ 7067{
6367 if (sg) 7068 if (sg)
6368 *sg = &per_cpu(sched_group_core, cpu); 7069 *sg = &per_cpu(sched_group_core, cpu);
@@ -6374,17 +7075,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6374static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 7075static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6375 7076
6376static int 7077static int
6377cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) 7078cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7079 cpumask_t *mask)
6378{ 7080{
6379 int group; 7081 int group;
6380#ifdef CONFIG_SCHED_MC 7082#ifdef CONFIG_SCHED_MC
6381 cpumask_t mask = cpu_coregroup_map(cpu); 7083 *mask = cpu_coregroup_map(cpu);
6382 cpus_and(mask, mask, *cpu_map); 7084 cpus_and(*mask, *mask, *cpu_map);
6383 group = first_cpu(mask); 7085 group = first_cpu(*mask);
6384#elif defined(CONFIG_SCHED_SMT) 7086#elif defined(CONFIG_SCHED_SMT)
6385 cpumask_t mask = per_cpu(cpu_sibling_map, cpu); 7087 *mask = per_cpu(cpu_sibling_map, cpu);
6386 cpus_and(mask, mask, *cpu_map); 7088 cpus_and(*mask, *mask, *cpu_map);
6387 group = first_cpu(mask); 7089 group = first_cpu(*mask);
6388#else 7090#else
6389 group = cpu; 7091 group = cpu;
6390#endif 7092#endif
@@ -6400,19 +7102,19 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6400 * gets dynamically allocated. 7102 * gets dynamically allocated.
6401 */ 7103 */
6402static DEFINE_PER_CPU(struct sched_domain, node_domains); 7104static DEFINE_PER_CPU(struct sched_domain, node_domains);
6403static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 7105static struct sched_group ***sched_group_nodes_bycpu;
6404 7106
6405static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 7107static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6406static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 7108static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6407 7109
6408static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 7110static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6409 struct sched_group **sg) 7111 struct sched_group **sg, cpumask_t *nodemask)
6410{ 7112{
6411 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6412 int group; 7113 int group;
6413 7114
6414 cpus_and(nodemask, nodemask, *cpu_map); 7115 *nodemask = node_to_cpumask(cpu_to_node(cpu));
6415 group = first_cpu(nodemask); 7116 cpus_and(*nodemask, *nodemask, *cpu_map);
7117 group = first_cpu(*nodemask);
6416 7118
6417 if (sg) 7119 if (sg)
6418 *sg = &per_cpu(sched_group_allnodes, group); 7120 *sg = &per_cpu(sched_group_allnodes, group);
@@ -6448,7 +7150,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6448 7150
6449#ifdef CONFIG_NUMA 7151#ifdef CONFIG_NUMA
6450/* Free memory allocated for various sched_group structures */ 7152/* Free memory allocated for various sched_group structures */
6451static void free_sched_groups(const cpumask_t *cpu_map) 7153static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6452{ 7154{
6453 int cpu, i; 7155 int cpu, i;
6454 7156
@@ -6460,11 +7162,11 @@ static void free_sched_groups(const cpumask_t *cpu_map)
6460 continue; 7162 continue;
6461 7163
6462 for (i = 0; i < MAX_NUMNODES; i++) { 7164 for (i = 0; i < MAX_NUMNODES; i++) {
6463 cpumask_t nodemask = node_to_cpumask(i);
6464 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7165 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6465 7166
6466 cpus_and(nodemask, nodemask, *cpu_map); 7167 *nodemask = node_to_cpumask(i);
6467 if (cpus_empty(nodemask)) 7168 cpus_and(*nodemask, *nodemask, *cpu_map);
7169 if (cpus_empty(*nodemask))
6468 continue; 7170 continue;
6469 7171
6470 if (sg == NULL) 7172 if (sg == NULL)
@@ -6482,7 +7184,7 @@ next_sg:
6482 } 7184 }
6483} 7185}
6484#else 7186#else
6485static void free_sched_groups(const cpumask_t *cpu_map) 7187static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6486{ 7188{
6487} 7189}
6488#endif 7190#endif
@@ -6540,13 +7242,106 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6540} 7242}
6541 7243
6542/* 7244/*
7245 * Initializers for schedule domains
7246 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7247 */
7248
7249#define SD_INIT(sd, type) sd_init_##type(sd)
7250#define SD_INIT_FUNC(type) \
7251static noinline void sd_init_##type(struct sched_domain *sd) \
7252{ \
7253 memset(sd, 0, sizeof(*sd)); \
7254 *sd = SD_##type##_INIT; \
7255 sd->level = SD_LV_##type; \
7256}
7257
7258SD_INIT_FUNC(CPU)
7259#ifdef CONFIG_NUMA
7260 SD_INIT_FUNC(ALLNODES)
7261 SD_INIT_FUNC(NODE)
7262#endif
7263#ifdef CONFIG_SCHED_SMT
7264 SD_INIT_FUNC(SIBLING)
7265#endif
7266#ifdef CONFIG_SCHED_MC
7267 SD_INIT_FUNC(MC)
7268#endif
7269
7270/*
7271 * To minimize stack usage kmalloc room for cpumasks and share the
7272 * space as the usage in build_sched_domains() dictates. Used only
7273 * if the amount of space is significant.
7274 */
7275struct allmasks {
7276 cpumask_t tmpmask; /* make this one first */
7277 union {
7278 cpumask_t nodemask;
7279 cpumask_t this_sibling_map;
7280 cpumask_t this_core_map;
7281 };
7282 cpumask_t send_covered;
7283
7284#ifdef CONFIG_NUMA
7285 cpumask_t domainspan;
7286 cpumask_t covered;
7287 cpumask_t notcovered;
7288#endif
7289};
7290
7291#if NR_CPUS > 128
7292#define SCHED_CPUMASK_ALLOC 1
7293#define SCHED_CPUMASK_FREE(v) kfree(v)
7294#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7295#else
7296#define SCHED_CPUMASK_ALLOC 0
7297#define SCHED_CPUMASK_FREE(v)
7298#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7299#endif
7300
7301#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7302 ((unsigned long)(a) + offsetof(struct allmasks, v))
7303
7304static int default_relax_domain_level = -1;
7305
7306static int __init setup_relax_domain_level(char *str)
7307{
7308 default_relax_domain_level = simple_strtoul(str, NULL, 0);
7309 return 1;
7310}
7311__setup("relax_domain_level=", setup_relax_domain_level);
7312
7313static void set_domain_attribute(struct sched_domain *sd,
7314 struct sched_domain_attr *attr)
7315{
7316 int request;
7317
7318 if (!attr || attr->relax_domain_level < 0) {
7319 if (default_relax_domain_level < 0)
7320 return;
7321 else
7322 request = default_relax_domain_level;
7323 } else
7324 request = attr->relax_domain_level;
7325 if (request < sd->level) {
7326 /* turn off idle balance on this domain */
7327 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
7328 } else {
7329 /* turn on idle balance on this domain */
7330 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
7331 }
7332}
7333
7334/*
6543 * Build sched domains for a given set of cpus and attach the sched domains 7335 * Build sched domains for a given set of cpus and attach the sched domains
6544 * to the individual cpus 7336 * to the individual cpus
6545 */ 7337 */
6546static int build_sched_domains(const cpumask_t *cpu_map) 7338static int __build_sched_domains(const cpumask_t *cpu_map,
7339 struct sched_domain_attr *attr)
6547{ 7340{
6548 int i; 7341 int i;
6549 struct root_domain *rd; 7342 struct root_domain *rd;
7343 SCHED_CPUMASK_DECLARE(allmasks);
7344 cpumask_t *tmpmask;
6550#ifdef CONFIG_NUMA 7345#ifdef CONFIG_NUMA
6551 struct sched_group **sched_group_nodes = NULL; 7346 struct sched_group **sched_group_nodes = NULL;
6552 int sd_allnodes = 0; 7347 int sd_allnodes = 0;
@@ -6560,39 +7355,65 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6560 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7355 printk(KERN_WARNING "Can not alloc sched group node list\n");
6561 return -ENOMEM; 7356 return -ENOMEM;
6562 } 7357 }
6563 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6564#endif 7358#endif
6565 7359
6566 rd = alloc_rootdomain(); 7360 rd = alloc_rootdomain();
6567 if (!rd) { 7361 if (!rd) {
6568 printk(KERN_WARNING "Cannot alloc root domain\n"); 7362 printk(KERN_WARNING "Cannot alloc root domain\n");
7363#ifdef CONFIG_NUMA
7364 kfree(sched_group_nodes);
7365#endif
6569 return -ENOMEM; 7366 return -ENOMEM;
6570 } 7367 }
6571 7368
7369#if SCHED_CPUMASK_ALLOC
7370 /* get space for all scratch cpumask variables */
7371 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
7372 if (!allmasks) {
7373 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7374 kfree(rd);
7375#ifdef CONFIG_NUMA
7376 kfree(sched_group_nodes);
7377#endif
7378 return -ENOMEM;
7379 }
7380#endif
7381 tmpmask = (cpumask_t *)allmasks;
7382
7383
7384#ifdef CONFIG_NUMA
7385 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
7386#endif
7387
6572 /* 7388 /*
6573 * Set up domains for cpus specified by the cpu_map. 7389 * Set up domains for cpus specified by the cpu_map.
6574 */ 7390 */
6575 for_each_cpu_mask(i, *cpu_map) { 7391 for_each_cpu_mask(i, *cpu_map) {
6576 struct sched_domain *sd = NULL, *p; 7392 struct sched_domain *sd = NULL, *p;
6577 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 7393 SCHED_CPUMASK_VAR(nodemask, allmasks);
6578 7394
6579 cpus_and(nodemask, nodemask, *cpu_map); 7395 *nodemask = node_to_cpumask(cpu_to_node(i));
7396 cpus_and(*nodemask, *nodemask, *cpu_map);
6580 7397
6581#ifdef CONFIG_NUMA 7398#ifdef CONFIG_NUMA
6582 if (cpus_weight(*cpu_map) > 7399 if (cpus_weight(*cpu_map) >
6583 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 7400 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
6584 sd = &per_cpu(allnodes_domains, i); 7401 sd = &per_cpu(allnodes_domains, i);
6585 *sd = SD_ALLNODES_INIT; 7402 SD_INIT(sd, ALLNODES);
7403 set_domain_attribute(sd, attr);
6586 sd->span = *cpu_map; 7404 sd->span = *cpu_map;
6587 cpu_to_allnodes_group(i, cpu_map, &sd->groups); 7405 sd->first_cpu = first_cpu(sd->span);
7406 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
6588 p = sd; 7407 p = sd;
6589 sd_allnodes = 1; 7408 sd_allnodes = 1;
6590 } else 7409 } else
6591 p = NULL; 7410 p = NULL;
6592 7411
6593 sd = &per_cpu(node_domains, i); 7412 sd = &per_cpu(node_domains, i);
6594 *sd = SD_NODE_INIT; 7413 SD_INIT(sd, NODE);
6595 sd->span = sched_domain_node_span(cpu_to_node(i)); 7414 set_domain_attribute(sd, attr);
7415 sched_domain_node_span(cpu_to_node(i), &sd->span);
7416 sd->first_cpu = first_cpu(sd->span);
6596 sd->parent = p; 7417 sd->parent = p;
6597 if (p) 7418 if (p)
6598 p->child = sd; 7419 p->child = sd;
@@ -6601,94 +7422,120 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6601 7422
6602 p = sd; 7423 p = sd;
6603 sd = &per_cpu(phys_domains, i); 7424 sd = &per_cpu(phys_domains, i);
6604 *sd = SD_CPU_INIT; 7425 SD_INIT(sd, CPU);
6605 sd->span = nodemask; 7426 set_domain_attribute(sd, attr);
7427 sd->span = *nodemask;
7428 sd->first_cpu = first_cpu(sd->span);
6606 sd->parent = p; 7429 sd->parent = p;
6607 if (p) 7430 if (p)
6608 p->child = sd; 7431 p->child = sd;
6609 cpu_to_phys_group(i, cpu_map, &sd->groups); 7432 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
6610 7433
6611#ifdef CONFIG_SCHED_MC 7434#ifdef CONFIG_SCHED_MC
6612 p = sd; 7435 p = sd;
6613 sd = &per_cpu(core_domains, i); 7436 sd = &per_cpu(core_domains, i);
6614 *sd = SD_MC_INIT; 7437 SD_INIT(sd, MC);
7438 set_domain_attribute(sd, attr);
6615 sd->span = cpu_coregroup_map(i); 7439 sd->span = cpu_coregroup_map(i);
7440 sd->first_cpu = first_cpu(sd->span);
6616 cpus_and(sd->span, sd->span, *cpu_map); 7441 cpus_and(sd->span, sd->span, *cpu_map);
6617 sd->parent = p; 7442 sd->parent = p;
6618 p->child = sd; 7443 p->child = sd;
6619 cpu_to_core_group(i, cpu_map, &sd->groups); 7444 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
6620#endif 7445#endif
6621 7446
6622#ifdef CONFIG_SCHED_SMT 7447#ifdef CONFIG_SCHED_SMT
6623 p = sd; 7448 p = sd;
6624 sd = &per_cpu(cpu_domains, i); 7449 sd = &per_cpu(cpu_domains, i);
6625 *sd = SD_SIBLING_INIT; 7450 SD_INIT(sd, SIBLING);
7451 set_domain_attribute(sd, attr);
6626 sd->span = per_cpu(cpu_sibling_map, i); 7452 sd->span = per_cpu(cpu_sibling_map, i);
7453 sd->first_cpu = first_cpu(sd->span);
6627 cpus_and(sd->span, sd->span, *cpu_map); 7454 cpus_and(sd->span, sd->span, *cpu_map);
6628 sd->parent = p; 7455 sd->parent = p;
6629 p->child = sd; 7456 p->child = sd;
6630 cpu_to_cpu_group(i, cpu_map, &sd->groups); 7457 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
6631#endif 7458#endif
6632 } 7459 }
6633 7460
6634#ifdef CONFIG_SCHED_SMT 7461#ifdef CONFIG_SCHED_SMT
6635 /* Set up CPU (sibling) groups */ 7462 /* Set up CPU (sibling) groups */
6636 for_each_cpu_mask(i, *cpu_map) { 7463 for_each_cpu_mask(i, *cpu_map) {
6637 cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); 7464 SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
6638 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 7465 SCHED_CPUMASK_VAR(send_covered, allmasks);
6639 if (i != first_cpu(this_sibling_map)) 7466
7467 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7468 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7469 if (i != first_cpu(*this_sibling_map))
6640 continue; 7470 continue;
6641 7471
6642 init_sched_build_groups(this_sibling_map, cpu_map, 7472 init_sched_build_groups(this_sibling_map, cpu_map,
6643 &cpu_to_cpu_group); 7473 &cpu_to_cpu_group,
7474 send_covered, tmpmask);
6644 } 7475 }
6645#endif 7476#endif
6646 7477
6647#ifdef CONFIG_SCHED_MC 7478#ifdef CONFIG_SCHED_MC
6648 /* Set up multi-core groups */ 7479 /* Set up multi-core groups */
6649 for_each_cpu_mask(i, *cpu_map) { 7480 for_each_cpu_mask(i, *cpu_map) {
6650 cpumask_t this_core_map = cpu_coregroup_map(i); 7481 SCHED_CPUMASK_VAR(this_core_map, allmasks);
6651 cpus_and(this_core_map, this_core_map, *cpu_map); 7482 SCHED_CPUMASK_VAR(send_covered, allmasks);
6652 if (i != first_cpu(this_core_map)) 7483
7484 *this_core_map = cpu_coregroup_map(i);
7485 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7486 if (i != first_cpu(*this_core_map))
6653 continue; 7487 continue;
7488
6654 init_sched_build_groups(this_core_map, cpu_map, 7489 init_sched_build_groups(this_core_map, cpu_map,
6655 &cpu_to_core_group); 7490 &cpu_to_core_group,
7491 send_covered, tmpmask);
6656 } 7492 }
6657#endif 7493#endif
6658 7494
6659 /* Set up physical groups */ 7495 /* Set up physical groups */
6660 for (i = 0; i < MAX_NUMNODES; i++) { 7496 for (i = 0; i < MAX_NUMNODES; i++) {
6661 cpumask_t nodemask = node_to_cpumask(i); 7497 SCHED_CPUMASK_VAR(nodemask, allmasks);
7498 SCHED_CPUMASK_VAR(send_covered, allmasks);
6662 7499
6663 cpus_and(nodemask, nodemask, *cpu_map); 7500 *nodemask = node_to_cpumask(i);
6664 if (cpus_empty(nodemask)) 7501 cpus_and(*nodemask, *nodemask, *cpu_map);
7502 if (cpus_empty(*nodemask))
6665 continue; 7503 continue;
6666 7504
6667 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); 7505 init_sched_build_groups(nodemask, cpu_map,
7506 &cpu_to_phys_group,
7507 send_covered, tmpmask);
6668 } 7508 }
6669 7509
6670#ifdef CONFIG_NUMA 7510#ifdef CONFIG_NUMA
6671 /* Set up node groups */ 7511 /* Set up node groups */
6672 if (sd_allnodes) 7512 if (sd_allnodes) {
6673 init_sched_build_groups(*cpu_map, cpu_map, 7513 SCHED_CPUMASK_VAR(send_covered, allmasks);
6674 &cpu_to_allnodes_group); 7514
7515 init_sched_build_groups(cpu_map, cpu_map,
7516 &cpu_to_allnodes_group,
7517 send_covered, tmpmask);
7518 }
6675 7519
6676 for (i = 0; i < MAX_NUMNODES; i++) { 7520 for (i = 0; i < MAX_NUMNODES; i++) {
6677 /* Set up node groups */ 7521 /* Set up node groups */
6678 struct sched_group *sg, *prev; 7522 struct sched_group *sg, *prev;
6679 cpumask_t nodemask = node_to_cpumask(i); 7523 SCHED_CPUMASK_VAR(nodemask, allmasks);
6680 cpumask_t domainspan; 7524 SCHED_CPUMASK_VAR(domainspan, allmasks);
6681 cpumask_t covered = CPU_MASK_NONE; 7525 SCHED_CPUMASK_VAR(covered, allmasks);
6682 int j; 7526 int j;
6683 7527
6684 cpus_and(nodemask, nodemask, *cpu_map); 7528 *nodemask = node_to_cpumask(i);
6685 if (cpus_empty(nodemask)) { 7529 cpus_clear(*covered);
7530
7531 cpus_and(*nodemask, *nodemask, *cpu_map);
7532 if (cpus_empty(*nodemask)) {
6686 sched_group_nodes[i] = NULL; 7533 sched_group_nodes[i] = NULL;
6687 continue; 7534 continue;
6688 } 7535 }
6689 7536
6690 domainspan = sched_domain_node_span(i); 7537 sched_domain_node_span(i, domainspan);
6691 cpus_and(domainspan, domainspan, *cpu_map); 7538 cpus_and(*domainspan, *domainspan, *cpu_map);
6692 7539
6693 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 7540 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6694 if (!sg) { 7541 if (!sg) {
@@ -6697,31 +7544,31 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6697 goto error; 7544 goto error;
6698 } 7545 }
6699 sched_group_nodes[i] = sg; 7546 sched_group_nodes[i] = sg;
6700 for_each_cpu_mask(j, nodemask) { 7547 for_each_cpu_mask(j, *nodemask) {
6701 struct sched_domain *sd; 7548 struct sched_domain *sd;
6702 7549
6703 sd = &per_cpu(node_domains, j); 7550 sd = &per_cpu(node_domains, j);
6704 sd->groups = sg; 7551 sd->groups = sg;
6705 } 7552 }
6706 sg->__cpu_power = 0; 7553 sg->__cpu_power = 0;
6707 sg->cpumask = nodemask; 7554 sg->cpumask = *nodemask;
6708 sg->next = sg; 7555 sg->next = sg;
6709 cpus_or(covered, covered, nodemask); 7556 cpus_or(*covered, *covered, *nodemask);
6710 prev = sg; 7557 prev = sg;
6711 7558
6712 for (j = 0; j < MAX_NUMNODES; j++) { 7559 for (j = 0; j < MAX_NUMNODES; j++) {
6713 cpumask_t tmp, notcovered; 7560 SCHED_CPUMASK_VAR(notcovered, allmasks);
6714 int n = (i + j) % MAX_NUMNODES; 7561 int n = (i + j) % MAX_NUMNODES;
7562 node_to_cpumask_ptr(pnodemask, n);
6715 7563
6716 cpus_complement(notcovered, covered); 7564 cpus_complement(*notcovered, *covered);
6717 cpus_and(tmp, notcovered, *cpu_map); 7565 cpus_and(*tmpmask, *notcovered, *cpu_map);
6718 cpus_and(tmp, tmp, domainspan); 7566 cpus_and(*tmpmask, *tmpmask, *domainspan);
6719 if (cpus_empty(tmp)) 7567 if (cpus_empty(*tmpmask))
6720 break; 7568 break;
6721 7569
6722 nodemask = node_to_cpumask(n); 7570 cpus_and(*tmpmask, *tmpmask, *pnodemask);
6723 cpus_and(tmp, tmp, nodemask); 7571 if (cpus_empty(*tmpmask))
6724 if (cpus_empty(tmp))
6725 continue; 7572 continue;
6726 7573
6727 sg = kmalloc_node(sizeof(struct sched_group), 7574 sg = kmalloc_node(sizeof(struct sched_group),
@@ -6732,9 +7579,9 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6732 goto error; 7579 goto error;
6733 } 7580 }
6734 sg->__cpu_power = 0; 7581 sg->__cpu_power = 0;
6735 sg->cpumask = tmp; 7582 sg->cpumask = *tmpmask;
6736 sg->next = prev->next; 7583 sg->next = prev->next;
6737 cpus_or(covered, covered, tmp); 7584 cpus_or(*covered, *covered, *tmpmask);
6738 prev->next = sg; 7585 prev->next = sg;
6739 prev = sg; 7586 prev = sg;
6740 } 7587 }
@@ -6770,7 +7617,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6770 if (sd_allnodes) { 7617 if (sd_allnodes) {
6771 struct sched_group *sg; 7618 struct sched_group *sg;
6772 7619
6773 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); 7620 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
7621 tmpmask);
6774 init_numa_sched_groups_power(sg); 7622 init_numa_sched_groups_power(sg);
6775 } 7623 }
6776#endif 7624#endif
@@ -6788,17 +7636,26 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6788 cpu_attach_domain(sd, rd, i); 7636 cpu_attach_domain(sd, rd, i);
6789 } 7637 }
6790 7638
7639 SCHED_CPUMASK_FREE((void *)allmasks);
6791 return 0; 7640 return 0;
6792 7641
6793#ifdef CONFIG_NUMA 7642#ifdef CONFIG_NUMA
6794error: 7643error:
6795 free_sched_groups(cpu_map); 7644 free_sched_groups(cpu_map, tmpmask);
7645 SCHED_CPUMASK_FREE((void *)allmasks);
6796 return -ENOMEM; 7646 return -ENOMEM;
6797#endif 7647#endif
6798} 7648}
6799 7649
7650static int build_sched_domains(const cpumask_t *cpu_map)
7651{
7652 return __build_sched_domains(cpu_map, NULL);
7653}
7654
6800static cpumask_t *doms_cur; /* current sched domains */ 7655static cpumask_t *doms_cur; /* current sched domains */
6801static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7656static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7657static struct sched_domain_attr *dattr_cur; /* attributes of custom domains
7658 in 'doms_cur' */
6802 7659
6803/* 7660/*
6804 * Special case: If a kmalloc of a doms_cur partition (array of 7661 * Special case: If a kmalloc of a doms_cur partition (array of
@@ -6826,15 +7683,17 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
6826 if (!doms_cur) 7683 if (!doms_cur)
6827 doms_cur = &fallback_doms; 7684 doms_cur = &fallback_doms;
6828 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 7685 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
7686 dattr_cur = NULL;
6829 err = build_sched_domains(doms_cur); 7687 err = build_sched_domains(doms_cur);
6830 register_sched_domain_sysctl(); 7688 register_sched_domain_sysctl();
6831 7689
6832 return err; 7690 return err;
6833} 7691}
6834 7692
6835static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 7693static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7694 cpumask_t *tmpmask)
6836{ 7695{
6837 free_sched_groups(cpu_map); 7696 free_sched_groups(cpu_map, tmpmask);
6838} 7697}
6839 7698
6840/* 7699/*
@@ -6843,6 +7702,7 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6843 */ 7702 */
6844static void detach_destroy_domains(const cpumask_t *cpu_map) 7703static void detach_destroy_domains(const cpumask_t *cpu_map)
6845{ 7704{
7705 cpumask_t tmpmask;
6846 int i; 7706 int i;
6847 7707
6848 unregister_sched_domain_sysctl(); 7708 unregister_sched_domain_sysctl();
@@ -6850,7 +7710,23 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6850 for_each_cpu_mask(i, *cpu_map) 7710 for_each_cpu_mask(i, *cpu_map)
6851 cpu_attach_domain(NULL, &def_root_domain, i); 7711 cpu_attach_domain(NULL, &def_root_domain, i);
6852 synchronize_sched(); 7712 synchronize_sched();
6853 arch_destroy_sched_domains(cpu_map); 7713 arch_destroy_sched_domains(cpu_map, &tmpmask);
7714}
7715
7716/* handle null as "default" */
7717static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7718 struct sched_domain_attr *new, int idx_new)
7719{
7720 struct sched_domain_attr tmp;
7721
7722 /* fast path */
7723 if (!new && !cur)
7724 return 1;
7725
7726 tmp = SD_ATTR_INIT;
7727 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7728 new ? (new + idx_new) : &tmp,
7729 sizeof(struct sched_domain_attr));
6854} 7730}
6855 7731
6856/* 7732/*
@@ -6874,7 +7750,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6874 * 7750 *
6875 * Call with hotplug lock held 7751 * Call with hotplug lock held
6876 */ 7752 */
6877void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) 7753void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7754 struct sched_domain_attr *dattr_new)
6878{ 7755{
6879 int i, j; 7756 int i, j;
6880 7757
@@ -6887,12 +7764,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6887 ndoms_new = 1; 7764 ndoms_new = 1;
6888 doms_new = &fallback_doms; 7765 doms_new = &fallback_doms;
6889 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7766 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7767 dattr_new = NULL;
6890 } 7768 }
6891 7769
6892 /* Destroy deleted domains */ 7770 /* Destroy deleted domains */
6893 for (i = 0; i < ndoms_cur; i++) { 7771 for (i = 0; i < ndoms_cur; i++) {
6894 for (j = 0; j < ndoms_new; j++) { 7772 for (j = 0; j < ndoms_new; j++) {
6895 if (cpus_equal(doms_cur[i], doms_new[j])) 7773 if (cpus_equal(doms_cur[i], doms_new[j])
7774 && dattrs_equal(dattr_cur, i, dattr_new, j))
6896 goto match1; 7775 goto match1;
6897 } 7776 }
6898 /* no match - a current sched domain not in new doms_new[] */ 7777 /* no match - a current sched domain not in new doms_new[] */
@@ -6904,11 +7783,13 @@ match1:
6904 /* Build new domains */ 7783 /* Build new domains */
6905 for (i = 0; i < ndoms_new; i++) { 7784 for (i = 0; i < ndoms_new; i++) {
6906 for (j = 0; j < ndoms_cur; j++) { 7785 for (j = 0; j < ndoms_cur; j++) {
6907 if (cpus_equal(doms_new[i], doms_cur[j])) 7786 if (cpus_equal(doms_new[i], doms_cur[j])
7787 && dattrs_equal(dattr_new, i, dattr_cur, j))
6908 goto match2; 7788 goto match2;
6909 } 7789 }
6910 /* no match - add a new doms_new */ 7790 /* no match - add a new doms_new */
6911 build_sched_domains(doms_new + i); 7791 __build_sched_domains(doms_new + i,
7792 dattr_new ? dattr_new + i : NULL);
6912match2: 7793match2:
6913 ; 7794 ;
6914 } 7795 }
@@ -6916,7 +7797,9 @@ match2:
6916 /* Remember the new sched domains */ 7797 /* Remember the new sched domains */
6917 if (doms_cur != &fallback_doms) 7798 if (doms_cur != &fallback_doms)
6918 kfree(doms_cur); 7799 kfree(doms_cur);
7800 kfree(dattr_cur); /* kfree(NULL) is safe */
6919 doms_cur = doms_new; 7801 doms_cur = doms_new;
7802 dattr_cur = dattr_new;
6920 ndoms_cur = ndoms_new; 7803 ndoms_cur = ndoms_new;
6921 7804
6922 register_sched_domain_sysctl(); 7805 register_sched_domain_sysctl();
@@ -7043,6 +7926,11 @@ void __init sched_init_smp(void)
7043{ 7926{
7044 cpumask_t non_isolated_cpus; 7927 cpumask_t non_isolated_cpus;
7045 7928
7929#if defined(CONFIG_NUMA)
7930 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7931 GFP_KERNEL);
7932 BUG_ON(sched_group_nodes_bycpu == NULL);
7933#endif
7046 get_online_cpus(); 7934 get_online_cpus();
7047 arch_init_sched_domains(&cpu_online_map); 7935 arch_init_sched_domains(&cpu_online_map);
7048 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7936 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
@@ -7053,7 +7941,7 @@ void __init sched_init_smp(void)
7053 hotcpu_notifier(update_sched_domains, 0); 7941 hotcpu_notifier(update_sched_domains, 0);
7054 7942
7055 /* Move init over to a non-isolated CPU */ 7943 /* Move init over to a non-isolated CPU */
7056 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 7944 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
7057 BUG(); 7945 BUG();
7058 sched_init_granularity(); 7946 sched_init_granularity();
7059} 7947}
@@ -7074,6 +7962,7 @@ int in_sched_functions(unsigned long addr)
7074static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7962static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7075{ 7963{
7076 cfs_rq->tasks_timeline = RB_ROOT; 7964 cfs_rq->tasks_timeline = RB_ROOT;
7965 INIT_LIST_HEAD(&cfs_rq->tasks);
7077#ifdef CONFIG_FAIR_GROUP_SCHED 7966#ifdef CONFIG_FAIR_GROUP_SCHED
7078 cfs_rq->rq = rq; 7967 cfs_rq->rq = rq;
7079#endif 7968#endif
@@ -7103,6 +7992,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7103 7992
7104 rt_rq->rt_time = 0; 7993 rt_rq->rt_time = 0;
7105 rt_rq->rt_throttled = 0; 7994 rt_rq->rt_throttled = 0;
7995 rt_rq->rt_runtime = 0;
7996 spin_lock_init(&rt_rq->rt_runtime_lock);
7106 7997
7107#ifdef CONFIG_RT_GROUP_SCHED 7998#ifdef CONFIG_RT_GROUP_SCHED
7108 rt_rq->rt_nr_boosted = 0; 7999 rt_rq->rt_nr_boosted = 0;
@@ -7111,10 +8002,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7111} 8002}
7112 8003
7113#ifdef CONFIG_FAIR_GROUP_SCHED 8004#ifdef CONFIG_FAIR_GROUP_SCHED
7114static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, 8005static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7115 struct cfs_rq *cfs_rq, struct sched_entity *se, 8006 struct sched_entity *se, int cpu, int add,
7116 int cpu, int add) 8007 struct sched_entity *parent)
7117{ 8008{
8009 struct rq *rq = cpu_rq(cpu);
7118 tg->cfs_rq[cpu] = cfs_rq; 8010 tg->cfs_rq[cpu] = cfs_rq;
7119 init_cfs_rq(cfs_rq, rq); 8011 init_cfs_rq(cfs_rq, rq);
7120 cfs_rq->tg = tg; 8012 cfs_rq->tg = tg;
@@ -7122,45 +8014,132 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7122 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 8014 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7123 8015
7124 tg->se[cpu] = se; 8016 tg->se[cpu] = se;
7125 se->cfs_rq = &rq->cfs; 8017 /* se could be NULL for init_task_group */
8018 if (!se)
8019 return;
8020
8021 if (!parent)
8022 se->cfs_rq = &rq->cfs;
8023 else
8024 se->cfs_rq = parent->my_q;
8025
7126 se->my_q = cfs_rq; 8026 se->my_q = cfs_rq;
7127 se->load.weight = tg->shares; 8027 se->load.weight = tg->shares;
7128 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); 8028 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7129 se->parent = NULL; 8029 se->parent = parent;
7130} 8030}
7131#endif 8031#endif
7132 8032
7133#ifdef CONFIG_RT_GROUP_SCHED 8033#ifdef CONFIG_RT_GROUP_SCHED
7134static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, 8034static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7135 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, 8035 struct sched_rt_entity *rt_se, int cpu, int add,
7136 int cpu, int add) 8036 struct sched_rt_entity *parent)
7137{ 8037{
8038 struct rq *rq = cpu_rq(cpu);
8039
7138 tg->rt_rq[cpu] = rt_rq; 8040 tg->rt_rq[cpu] = rt_rq;
7139 init_rt_rq(rt_rq, rq); 8041 init_rt_rq(rt_rq, rq);
7140 rt_rq->tg = tg; 8042 rt_rq->tg = tg;
7141 rt_rq->rt_se = rt_se; 8043 rt_rq->rt_se = rt_se;
8044 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7142 if (add) 8045 if (add)
7143 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 8046 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7144 8047
7145 tg->rt_se[cpu] = rt_se; 8048 tg->rt_se[cpu] = rt_se;
8049 if (!rt_se)
8050 return;
8051
8052 if (!parent)
8053 rt_se->rt_rq = &rq->rt;
8054 else
8055 rt_se->rt_rq = parent->my_q;
8056
7146 rt_se->rt_rq = &rq->rt; 8057 rt_se->rt_rq = &rq->rt;
7147 rt_se->my_q = rt_rq; 8058 rt_se->my_q = rt_rq;
7148 rt_se->parent = NULL; 8059 rt_se->parent = parent;
7149 INIT_LIST_HEAD(&rt_se->run_list); 8060 INIT_LIST_HEAD(&rt_se->run_list);
7150} 8061}
7151#endif 8062#endif
7152 8063
7153void __init sched_init(void) 8064void __init sched_init(void)
7154{ 8065{
7155 int highest_cpu = 0;
7156 int i, j; 8066 int i, j;
8067 unsigned long alloc_size = 0, ptr;
8068
8069#ifdef CONFIG_FAIR_GROUP_SCHED
8070 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8071#endif
8072#ifdef CONFIG_RT_GROUP_SCHED
8073 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8074#endif
8075#ifdef CONFIG_USER_SCHED
8076 alloc_size *= 2;
8077#endif
8078 /*
8079 * As sched_init() is called before page_alloc is setup,
8080 * we use alloc_bootmem().
8081 */
8082 if (alloc_size) {
8083 ptr = (unsigned long)alloc_bootmem(alloc_size);
8084
8085#ifdef CONFIG_FAIR_GROUP_SCHED
8086 init_task_group.se = (struct sched_entity **)ptr;
8087 ptr += nr_cpu_ids * sizeof(void **);
8088
8089 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
8090 ptr += nr_cpu_ids * sizeof(void **);
8091
8092#ifdef CONFIG_USER_SCHED
8093 root_task_group.se = (struct sched_entity **)ptr;
8094 ptr += nr_cpu_ids * sizeof(void **);
8095
8096 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8097 ptr += nr_cpu_ids * sizeof(void **);
8098#endif
8099#endif
8100#ifdef CONFIG_RT_GROUP_SCHED
8101 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
8102 ptr += nr_cpu_ids * sizeof(void **);
8103
8104 init_task_group.rt_rq = (struct rt_rq **)ptr;
8105 ptr += nr_cpu_ids * sizeof(void **);
8106
8107#ifdef CONFIG_USER_SCHED
8108 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8109 ptr += nr_cpu_ids * sizeof(void **);
8110
8111 root_task_group.rt_rq = (struct rt_rq **)ptr;
8112 ptr += nr_cpu_ids * sizeof(void **);
8113#endif
8114#endif
8115 }
7157 8116
7158#ifdef CONFIG_SMP 8117#ifdef CONFIG_SMP
8118 init_aggregate();
7159 init_defrootdomain(); 8119 init_defrootdomain();
7160#endif 8120#endif
7161 8121
8122 init_rt_bandwidth(&def_rt_bandwidth,
8123 global_rt_period(), global_rt_runtime());
8124
8125#ifdef CONFIG_RT_GROUP_SCHED
8126 init_rt_bandwidth(&init_task_group.rt_bandwidth,
8127 global_rt_period(), global_rt_runtime());
8128#ifdef CONFIG_USER_SCHED
8129 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8130 global_rt_period(), RUNTIME_INF);
8131#endif
8132#endif
8133
7162#ifdef CONFIG_GROUP_SCHED 8134#ifdef CONFIG_GROUP_SCHED
7163 list_add(&init_task_group.list, &task_groups); 8135 list_add(&init_task_group.list, &task_groups);
8136 INIT_LIST_HEAD(&init_task_group.children);
8137
8138#ifdef CONFIG_USER_SCHED
8139 INIT_LIST_HEAD(&root_task_group.children);
8140 init_task_group.parent = &root_task_group;
8141 list_add(&init_task_group.siblings, &root_task_group.children);
8142#endif
7164#endif 8143#endif
7165 8144
7166 for_each_possible_cpu(i) { 8145 for_each_possible_cpu(i) {
@@ -7171,26 +8150,68 @@ void __init sched_init(void)
7171 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 8150 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7172 rq->nr_running = 0; 8151 rq->nr_running = 0;
7173 rq->clock = 1; 8152 rq->clock = 1;
8153 update_last_tick_seen(rq);
7174 init_cfs_rq(&rq->cfs, rq); 8154 init_cfs_rq(&rq->cfs, rq);
7175 init_rt_rq(&rq->rt, rq); 8155 init_rt_rq(&rq->rt, rq);
7176#ifdef CONFIG_FAIR_GROUP_SCHED 8156#ifdef CONFIG_FAIR_GROUP_SCHED
7177 init_task_group.shares = init_task_group_load; 8157 init_task_group.shares = init_task_group_load;
7178 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 8158 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7179 init_tg_cfs_entry(rq, &init_task_group, 8159#ifdef CONFIG_CGROUP_SCHED
8160 /*
8161 * How much cpu bandwidth does init_task_group get?
8162 *
8163 * In case of task-groups formed thr' the cgroup filesystem, it
8164 * gets 100% of the cpu resources in the system. This overall
8165 * system cpu resource is divided among the tasks of
8166 * init_task_group and its child task-groups in a fair manner,
8167 * based on each entity's (task or task-group's) weight
8168 * (se->load.weight).
8169 *
8170 * In other words, if init_task_group has 10 tasks of weight
8171 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8172 * then A0's share of the cpu resource is:
8173 *
8174 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8175 *
8176 * We achieve this by letting init_task_group's tasks sit
8177 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
8178 */
8179 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
8180#elif defined CONFIG_USER_SCHED
8181 root_task_group.shares = NICE_0_LOAD;
8182 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
8183 /*
8184 * In case of task-groups formed thr' the user id of tasks,
8185 * init_task_group represents tasks belonging to root user.
8186 * Hence it forms a sibling of all subsequent groups formed.
8187 * In this case, init_task_group gets only a fraction of overall
8188 * system cpu resource, based on the weight assigned to root
8189 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
8190 * by letting tasks of init_task_group sit in a separate cfs_rq
8191 * (init_cfs_rq) and having one entity represent this group of
8192 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
8193 */
8194 init_tg_cfs_entry(&init_task_group,
7180 &per_cpu(init_cfs_rq, i), 8195 &per_cpu(init_cfs_rq, i),
7181 &per_cpu(init_sched_entity, i), i, 1); 8196 &per_cpu(init_sched_entity, i), i, 1,
8197 root_task_group.se[i]);
7182 8198
7183#endif 8199#endif
8200#endif /* CONFIG_FAIR_GROUP_SCHED */
8201
8202 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7184#ifdef CONFIG_RT_GROUP_SCHED 8203#ifdef CONFIG_RT_GROUP_SCHED
7185 init_task_group.rt_runtime =
7186 sysctl_sched_rt_runtime * NSEC_PER_USEC;
7187 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 8204 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7188 init_tg_rt_entry(rq, &init_task_group, 8205#ifdef CONFIG_CGROUP_SCHED
8206 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8207#elif defined CONFIG_USER_SCHED
8208 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
8209 init_tg_rt_entry(&init_task_group,
7189 &per_cpu(init_rt_rq, i), 8210 &per_cpu(init_rt_rq, i),
7190 &per_cpu(init_sched_rt_entity, i), i, 1); 8211 &per_cpu(init_sched_rt_entity, i), i, 1,
8212 root_task_group.rt_se[i]);
8213#endif
7191#endif 8214#endif
7192 rq->rt_period_expire = 0;
7193 rq->rt_throttled = 0;
7194 8215
7195 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8216 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7196 rq->cpu_load[j] = 0; 8217 rq->cpu_load[j] = 0;
@@ -7207,7 +8228,6 @@ void __init sched_init(void)
7207#endif 8228#endif
7208 init_rq_hrtick(rq); 8229 init_rq_hrtick(rq);
7209 atomic_set(&rq->nr_iowait, 0); 8230 atomic_set(&rq->nr_iowait, 0);
7210 highest_cpu = i;
7211 } 8231 }
7212 8232
7213 set_load_weight(&init_task); 8233 set_load_weight(&init_task);
@@ -7217,7 +8237,6 @@ void __init sched_init(void)
7217#endif 8237#endif
7218 8238
7219#ifdef CONFIG_SMP 8239#ifdef CONFIG_SMP
7220 nr_cpu_ids = highest_cpu + 1;
7221 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 8240 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
7222#endif 8241#endif
7223 8242
@@ -7376,8 +8395,6 @@ void set_curr_task(int cpu, struct task_struct *p)
7376 8395
7377#endif 8396#endif
7378 8397
7379#ifdef CONFIG_GROUP_SCHED
7380
7381#ifdef CONFIG_FAIR_GROUP_SCHED 8398#ifdef CONFIG_FAIR_GROUP_SCHED
7382static void free_fair_sched_group(struct task_group *tg) 8399static void free_fair_sched_group(struct task_group *tg)
7383{ 8400{
@@ -7394,17 +8411,18 @@ static void free_fair_sched_group(struct task_group *tg)
7394 kfree(tg->se); 8411 kfree(tg->se);
7395} 8412}
7396 8413
7397static int alloc_fair_sched_group(struct task_group *tg) 8414static
8415int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7398{ 8416{
7399 struct cfs_rq *cfs_rq; 8417 struct cfs_rq *cfs_rq;
7400 struct sched_entity *se; 8418 struct sched_entity *se, *parent_se;
7401 struct rq *rq; 8419 struct rq *rq;
7402 int i; 8420 int i;
7403 8421
7404 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); 8422 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7405 if (!tg->cfs_rq) 8423 if (!tg->cfs_rq)
7406 goto err; 8424 goto err;
7407 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 8425 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7408 if (!tg->se) 8426 if (!tg->se)
7409 goto err; 8427 goto err;
7410 8428
@@ -7423,7 +8441,8 @@ static int alloc_fair_sched_group(struct task_group *tg)
7423 if (!se) 8441 if (!se)
7424 goto err; 8442 goto err;
7425 8443
7426 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); 8444 parent_se = parent ? parent->se[i] : NULL;
8445 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
7427 } 8446 }
7428 8447
7429 return 1; 8448 return 1;
@@ -7447,7 +8466,8 @@ static inline void free_fair_sched_group(struct task_group *tg)
7447{ 8466{
7448} 8467}
7449 8468
7450static inline int alloc_fair_sched_group(struct task_group *tg) 8469static inline
8470int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7451{ 8471{
7452 return 1; 8472 return 1;
7453} 8473}
@@ -7466,6 +8486,8 @@ static void free_rt_sched_group(struct task_group *tg)
7466{ 8486{
7467 int i; 8487 int i;
7468 8488
8489 destroy_rt_bandwidth(&tg->rt_bandwidth);
8490
7469 for_each_possible_cpu(i) { 8491 for_each_possible_cpu(i) {
7470 if (tg->rt_rq) 8492 if (tg->rt_rq)
7471 kfree(tg->rt_rq[i]); 8493 kfree(tg->rt_rq[i]);
@@ -7477,21 +8499,23 @@ static void free_rt_sched_group(struct task_group *tg)
7477 kfree(tg->rt_se); 8499 kfree(tg->rt_se);
7478} 8500}
7479 8501
7480static int alloc_rt_sched_group(struct task_group *tg) 8502static
8503int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
7481{ 8504{
7482 struct rt_rq *rt_rq; 8505 struct rt_rq *rt_rq;
7483 struct sched_rt_entity *rt_se; 8506 struct sched_rt_entity *rt_se, *parent_se;
7484 struct rq *rq; 8507 struct rq *rq;
7485 int i; 8508 int i;
7486 8509
7487 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); 8510 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
7488 if (!tg->rt_rq) 8511 if (!tg->rt_rq)
7489 goto err; 8512 goto err;
7490 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); 8513 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
7491 if (!tg->rt_se) 8514 if (!tg->rt_se)
7492 goto err; 8515 goto err;
7493 8516
7494 tg->rt_runtime = 0; 8517 init_rt_bandwidth(&tg->rt_bandwidth,
8518 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
7495 8519
7496 for_each_possible_cpu(i) { 8520 for_each_possible_cpu(i) {
7497 rq = cpu_rq(i); 8521 rq = cpu_rq(i);
@@ -7506,7 +8530,8 @@ static int alloc_rt_sched_group(struct task_group *tg)
7506 if (!rt_se) 8530 if (!rt_se)
7507 goto err; 8531 goto err;
7508 8532
7509 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); 8533 parent_se = parent ? parent->rt_se[i] : NULL;
8534 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
7510 } 8535 }
7511 8536
7512 return 1; 8537 return 1;
@@ -7530,7 +8555,8 @@ static inline void free_rt_sched_group(struct task_group *tg)
7530{ 8555{
7531} 8556}
7532 8557
7533static inline int alloc_rt_sched_group(struct task_group *tg) 8558static inline
8559int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
7534{ 8560{
7535 return 1; 8561 return 1;
7536} 8562}
@@ -7544,6 +8570,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
7544} 8570}
7545#endif 8571#endif
7546 8572
8573#ifdef CONFIG_GROUP_SCHED
7547static void free_sched_group(struct task_group *tg) 8574static void free_sched_group(struct task_group *tg)
7548{ 8575{
7549 free_fair_sched_group(tg); 8576 free_fair_sched_group(tg);
@@ -7552,7 +8579,7 @@ static void free_sched_group(struct task_group *tg)
7552} 8579}
7553 8580
7554/* allocate runqueue etc for a new task group */ 8581/* allocate runqueue etc for a new task group */
7555struct task_group *sched_create_group(void) 8582struct task_group *sched_create_group(struct task_group *parent)
7556{ 8583{
7557 struct task_group *tg; 8584 struct task_group *tg;
7558 unsigned long flags; 8585 unsigned long flags;
@@ -7562,10 +8589,10 @@ struct task_group *sched_create_group(void)
7562 if (!tg) 8589 if (!tg)
7563 return ERR_PTR(-ENOMEM); 8590 return ERR_PTR(-ENOMEM);
7564 8591
7565 if (!alloc_fair_sched_group(tg)) 8592 if (!alloc_fair_sched_group(tg, parent))
7566 goto err; 8593 goto err;
7567 8594
7568 if (!alloc_rt_sched_group(tg)) 8595 if (!alloc_rt_sched_group(tg, parent))
7569 goto err; 8596 goto err;
7570 8597
7571 spin_lock_irqsave(&task_group_lock, flags); 8598 spin_lock_irqsave(&task_group_lock, flags);
@@ -7574,6 +8601,12 @@ struct task_group *sched_create_group(void)
7574 register_rt_sched_group(tg, i); 8601 register_rt_sched_group(tg, i);
7575 } 8602 }
7576 list_add_rcu(&tg->list, &task_groups); 8603 list_add_rcu(&tg->list, &task_groups);
8604
8605 WARN_ON(!parent); /* root should already exist */
8606
8607 tg->parent = parent;
8608 list_add_rcu(&tg->siblings, &parent->children);
8609 INIT_LIST_HEAD(&tg->children);
7577 spin_unlock_irqrestore(&task_group_lock, flags); 8610 spin_unlock_irqrestore(&task_group_lock, flags);
7578 8611
7579 return tg; 8612 return tg;
@@ -7602,6 +8635,7 @@ void sched_destroy_group(struct task_group *tg)
7602 unregister_rt_sched_group(tg, i); 8635 unregister_rt_sched_group(tg, i);
7603 } 8636 }
7604 list_del_rcu(&tg->list); 8637 list_del_rcu(&tg->list);
8638 list_del_rcu(&tg->siblings);
7605 spin_unlock_irqrestore(&task_group_lock, flags); 8639 spin_unlock_irqrestore(&task_group_lock, flags);
7606 8640
7607 /* wait for possible concurrent references to cfs_rqs complete */ 8641 /* wait for possible concurrent references to cfs_rqs complete */
@@ -7645,16 +8679,14 @@ void sched_move_task(struct task_struct *tsk)
7645 8679
7646 task_rq_unlock(rq, &flags); 8680 task_rq_unlock(rq, &flags);
7647} 8681}
8682#endif
7648 8683
7649#ifdef CONFIG_FAIR_GROUP_SCHED 8684#ifdef CONFIG_FAIR_GROUP_SCHED
7650static void set_se_shares(struct sched_entity *se, unsigned long shares) 8685static void __set_se_shares(struct sched_entity *se, unsigned long shares)
7651{ 8686{
7652 struct cfs_rq *cfs_rq = se->cfs_rq; 8687 struct cfs_rq *cfs_rq = se->cfs_rq;
7653 struct rq *rq = cfs_rq->rq;
7654 int on_rq; 8688 int on_rq;
7655 8689
7656 spin_lock_irq(&rq->lock);
7657
7658 on_rq = se->on_rq; 8690 on_rq = se->on_rq;
7659 if (on_rq) 8691 if (on_rq)
7660 dequeue_entity(cfs_rq, se, 0); 8692 dequeue_entity(cfs_rq, se, 0);
@@ -7664,8 +8696,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
7664 8696
7665 if (on_rq) 8697 if (on_rq)
7666 enqueue_entity(cfs_rq, se, 0); 8698 enqueue_entity(cfs_rq, se, 0);
8699}
7667 8700
7668 spin_unlock_irq(&rq->lock); 8701static void set_se_shares(struct sched_entity *se, unsigned long shares)
8702{
8703 struct cfs_rq *cfs_rq = se->cfs_rq;
8704 struct rq *rq = cfs_rq->rq;
8705 unsigned long flags;
8706
8707 spin_lock_irqsave(&rq->lock, flags);
8708 __set_se_shares(se, shares);
8709 spin_unlock_irqrestore(&rq->lock, flags);
7669} 8710}
7670 8711
7671static DEFINE_MUTEX(shares_mutex); 8712static DEFINE_MUTEX(shares_mutex);
@@ -7676,12 +8717,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7676 unsigned long flags; 8717 unsigned long flags;
7677 8718
7678 /* 8719 /*
8720 * We can't change the weight of the root cgroup.
8721 */
8722 if (!tg->se[0])
8723 return -EINVAL;
8724
8725 /*
7679 * A weight of 0 or 1 can cause arithmetics problems. 8726 * A weight of 0 or 1 can cause arithmetics problems.
7680 * (The default weight is 1024 - so there's no practical 8727 * (The default weight is 1024 - so there's no practical
7681 * limitation from this.) 8728 * limitation from this.)
7682 */ 8729 */
7683 if (shares < 2) 8730 if (shares < MIN_SHARES)
7684 shares = 2; 8731 shares = MIN_SHARES;
7685 8732
7686 mutex_lock(&shares_mutex); 8733 mutex_lock(&shares_mutex);
7687 if (tg->shares == shares) 8734 if (tg->shares == shares)
@@ -7690,6 +8737,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7690 spin_lock_irqsave(&task_group_lock, flags); 8737 spin_lock_irqsave(&task_group_lock, flags);
7691 for_each_possible_cpu(i) 8738 for_each_possible_cpu(i)
7692 unregister_fair_sched_group(tg, i); 8739 unregister_fair_sched_group(tg, i);
8740 list_del_rcu(&tg->siblings);
7693 spin_unlock_irqrestore(&task_group_lock, flags); 8741 spin_unlock_irqrestore(&task_group_lock, flags);
7694 8742
7695 /* wait for any ongoing reference to this group to finish */ 8743 /* wait for any ongoing reference to this group to finish */
@@ -7700,8 +8748,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7700 * w/o tripping rebalance_share or load_balance_fair. 8748 * w/o tripping rebalance_share or load_balance_fair.
7701 */ 8749 */
7702 tg->shares = shares; 8750 tg->shares = shares;
7703 for_each_possible_cpu(i) 8751 for_each_possible_cpu(i) {
7704 set_se_shares(tg->se[i], shares); 8752 /*
8753 * force a rebalance
8754 */
8755 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8756 set_se_shares(tg->se[i], shares/nr_cpu_ids);
8757 }
7705 8758
7706 /* 8759 /*
7707 * Enable load balance activity on this group, by inserting it back on 8760 * Enable load balance activity on this group, by inserting it back on
@@ -7710,6 +8763,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7710 spin_lock_irqsave(&task_group_lock, flags); 8763 spin_lock_irqsave(&task_group_lock, flags);
7711 for_each_possible_cpu(i) 8764 for_each_possible_cpu(i)
7712 register_fair_sched_group(tg, i); 8765 register_fair_sched_group(tg, i);
8766 list_add_rcu(&tg->siblings, &tg->parent->children);
7713 spin_unlock_irqrestore(&task_group_lock, flags); 8767 spin_unlock_irqrestore(&task_group_lock, flags);
7714done: 8768done:
7715 mutex_unlock(&shares_mutex); 8769 mutex_unlock(&shares_mutex);
@@ -7736,26 +8790,58 @@ static unsigned long to_ratio(u64 period, u64 runtime)
7736 return div64_64(runtime << 16, period); 8790 return div64_64(runtime << 16, period);
7737} 8791}
7738 8792
8793#ifdef CONFIG_CGROUP_SCHED
8794static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8795{
8796 struct task_group *tgi, *parent = tg->parent;
8797 unsigned long total = 0;
8798
8799 if (!parent) {
8800 if (global_rt_period() < period)
8801 return 0;
8802
8803 return to_ratio(period, runtime) <
8804 to_ratio(global_rt_period(), global_rt_runtime());
8805 }
8806
8807 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
8808 return 0;
8809
8810 rcu_read_lock();
8811 list_for_each_entry_rcu(tgi, &parent->children, siblings) {
8812 if (tgi == tg)
8813 continue;
8814
8815 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
8816 tgi->rt_bandwidth.rt_runtime);
8817 }
8818 rcu_read_unlock();
8819
8820 return total + to_ratio(period, runtime) <
8821 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8822 parent->rt_bandwidth.rt_runtime);
8823}
8824#elif defined CONFIG_USER_SCHED
7739static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8825static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7740{ 8826{
7741 struct task_group *tgi; 8827 struct task_group *tgi;
7742 unsigned long total = 0; 8828 unsigned long total = 0;
7743 unsigned long global_ratio = 8829 unsigned long global_ratio =
7744 to_ratio(sysctl_sched_rt_period, 8830 to_ratio(global_rt_period(), global_rt_runtime());
7745 sysctl_sched_rt_runtime < 0 ?
7746 RUNTIME_INF : sysctl_sched_rt_runtime);
7747 8831
7748 rcu_read_lock(); 8832 rcu_read_lock();
7749 list_for_each_entry_rcu(tgi, &task_groups, list) { 8833 list_for_each_entry_rcu(tgi, &task_groups, list) {
7750 if (tgi == tg) 8834 if (tgi == tg)
7751 continue; 8835 continue;
7752 8836
7753 total += to_ratio(period, tgi->rt_runtime); 8837 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
8838 tgi->rt_bandwidth.rt_runtime);
7754 } 8839 }
7755 rcu_read_unlock(); 8840 rcu_read_unlock();
7756 8841
7757 return total + to_ratio(period, runtime) < global_ratio; 8842 return total + to_ratio(period, runtime) < global_ratio;
7758} 8843}
8844#endif
7759 8845
7760/* Must be called with tasklist_lock held */ 8846/* Must be called with tasklist_lock held */
7761static inline int tg_has_rt_tasks(struct task_group *tg) 8847static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -7768,19 +8854,14 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
7768 return 0; 8854 return 0;
7769} 8855}
7770 8856
7771int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 8857static int tg_set_bandwidth(struct task_group *tg,
8858 u64 rt_period, u64 rt_runtime)
7772{ 8859{
7773 u64 rt_runtime, rt_period; 8860 int i, err = 0;
7774 int err = 0;
7775
7776 rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
7777 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7778 if (rt_runtime_us == -1)
7779 rt_runtime = RUNTIME_INF;
7780 8861
7781 mutex_lock(&rt_constraints_mutex); 8862 mutex_lock(&rt_constraints_mutex);
7782 read_lock(&tasklist_lock); 8863 read_lock(&tasklist_lock);
7783 if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { 8864 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
7784 err = -EBUSY; 8865 err = -EBUSY;
7785 goto unlock; 8866 goto unlock;
7786 } 8867 }
@@ -7788,7 +8869,19 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7788 err = -EINVAL; 8869 err = -EINVAL;
7789 goto unlock; 8870 goto unlock;
7790 } 8871 }
7791 tg->rt_runtime = rt_runtime; 8872
8873 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8874 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8875 tg->rt_bandwidth.rt_runtime = rt_runtime;
8876
8877 for_each_possible_cpu(i) {
8878 struct rt_rq *rt_rq = tg->rt_rq[i];
8879
8880 spin_lock(&rt_rq->rt_runtime_lock);
8881 rt_rq->rt_runtime = rt_runtime;
8882 spin_unlock(&rt_rq->rt_runtime_lock);
8883 }
8884 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7792 unlock: 8885 unlock:
7793 read_unlock(&tasklist_lock); 8886 read_unlock(&tasklist_lock);
7794 mutex_unlock(&rt_constraints_mutex); 8887 mutex_unlock(&rt_constraints_mutex);
@@ -7796,19 +8889,109 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7796 return err; 8889 return err;
7797} 8890}
7798 8891
8892int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8893{
8894 u64 rt_runtime, rt_period;
8895
8896 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8897 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8898 if (rt_runtime_us < 0)
8899 rt_runtime = RUNTIME_INF;
8900
8901 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8902}
8903
7799long sched_group_rt_runtime(struct task_group *tg) 8904long sched_group_rt_runtime(struct task_group *tg)
7800{ 8905{
7801 u64 rt_runtime_us; 8906 u64 rt_runtime_us;
7802 8907
7803 if (tg->rt_runtime == RUNTIME_INF) 8908 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7804 return -1; 8909 return -1;
7805 8910
7806 rt_runtime_us = tg->rt_runtime; 8911 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7807 do_div(rt_runtime_us, NSEC_PER_USEC); 8912 do_div(rt_runtime_us, NSEC_PER_USEC);
7808 return rt_runtime_us; 8913 return rt_runtime_us;
7809} 8914}
8915
8916int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8917{
8918 u64 rt_runtime, rt_period;
8919
8920 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8921 rt_runtime = tg->rt_bandwidth.rt_runtime;
8922
8923 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8924}
8925
8926long sched_group_rt_period(struct task_group *tg)
8927{
8928 u64 rt_period_us;
8929
8930 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8931 do_div(rt_period_us, NSEC_PER_USEC);
8932 return rt_period_us;
8933}
8934
8935static int sched_rt_global_constraints(void)
8936{
8937 int ret = 0;
8938
8939 mutex_lock(&rt_constraints_mutex);
8940 if (!__rt_schedulable(NULL, 1, 0))
8941 ret = -EINVAL;
8942 mutex_unlock(&rt_constraints_mutex);
8943
8944 return ret;
8945}
8946#else
8947static int sched_rt_global_constraints(void)
8948{
8949 unsigned long flags;
8950 int i;
8951
8952 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8953 for_each_possible_cpu(i) {
8954 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8955
8956 spin_lock(&rt_rq->rt_runtime_lock);
8957 rt_rq->rt_runtime = global_rt_runtime();
8958 spin_unlock(&rt_rq->rt_runtime_lock);
8959 }
8960 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
8961
8962 return 0;
8963}
7810#endif 8964#endif
7811#endif /* CONFIG_GROUP_SCHED */ 8965
8966int sched_rt_handler(struct ctl_table *table, int write,
8967 struct file *filp, void __user *buffer, size_t *lenp,
8968 loff_t *ppos)
8969{
8970 int ret;
8971 int old_period, old_runtime;
8972 static DEFINE_MUTEX(mutex);
8973
8974 mutex_lock(&mutex);
8975 old_period = sysctl_sched_rt_period;
8976 old_runtime = sysctl_sched_rt_runtime;
8977
8978 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
8979
8980 if (!ret && write) {
8981 ret = sched_rt_global_constraints();
8982 if (ret) {
8983 sysctl_sched_rt_period = old_period;
8984 sysctl_sched_rt_runtime = old_runtime;
8985 } else {
8986 def_rt_bandwidth.rt_runtime = global_rt_runtime();
8987 def_rt_bandwidth.rt_period =
8988 ns_to_ktime(global_rt_period());
8989 }
8990 }
8991 mutex_unlock(&mutex);
8992
8993 return ret;
8994}
7812 8995
7813#ifdef CONFIG_CGROUP_SCHED 8996#ifdef CONFIG_CGROUP_SCHED
7814 8997
@@ -7822,7 +9005,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7822static struct cgroup_subsys_state * 9005static struct cgroup_subsys_state *
7823cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 9006cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7824{ 9007{
7825 struct task_group *tg; 9008 struct task_group *tg, *parent;
7826 9009
7827 if (!cgrp->parent) { 9010 if (!cgrp->parent) {
7828 /* This is early initialization for the top cgroup */ 9011 /* This is early initialization for the top cgroup */
@@ -7830,11 +9013,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7830 return &init_task_group.css; 9013 return &init_task_group.css;
7831 } 9014 }
7832 9015
7833 /* we support only 1-level deep hierarchical scheduler atm */ 9016 parent = cgroup_tg(cgrp->parent);
7834 if (cgrp->parent->parent) 9017 tg = sched_create_group(parent);
7835 return ERR_PTR(-EINVAL);
7836
7837 tg = sched_create_group();
7838 if (IS_ERR(tg)) 9018 if (IS_ERR(tg))
7839 return ERR_PTR(-ENOMEM); 9019 return ERR_PTR(-ENOMEM);
7840 9020
@@ -7858,7 +9038,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7858{ 9038{
7859#ifdef CONFIG_RT_GROUP_SCHED 9039#ifdef CONFIG_RT_GROUP_SCHED
7860 /* Don't accept realtime tasks when there is no way for them to run */ 9040 /* Don't accept realtime tasks when there is no way for them to run */
7861 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) 9041 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
7862 return -EINVAL; 9042 return -EINVAL;
7863#else 9043#else
7864 /* We don't support RT-tasks being in separate groups */ 9044 /* We don't support RT-tasks being in separate groups */
@@ -7892,7 +9072,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7892#endif 9072#endif
7893 9073
7894#ifdef CONFIG_RT_GROUP_SCHED 9074#ifdef CONFIG_RT_GROUP_SCHED
7895static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 9075static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
7896 struct file *file, 9076 struct file *file,
7897 const char __user *userbuf, 9077 const char __user *userbuf,
7898 size_t nbytes, loff_t *unused_ppos) 9078 size_t nbytes, loff_t *unused_ppos)
@@ -7936,6 +9116,17 @@ static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
7936 9116
7937 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 9117 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
7938} 9118}
9119
9120static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
9121 u64 rt_period_us)
9122{
9123 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
9124}
9125
9126static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9127{
9128 return sched_group_rt_period(cgroup_tg(cgrp));
9129}
7939#endif 9130#endif
7940 9131
7941static struct cftype cpu_files[] = { 9132static struct cftype cpu_files[] = {
@@ -7952,6 +9143,11 @@ static struct cftype cpu_files[] = {
7952 .read = cpu_rt_runtime_read, 9143 .read = cpu_rt_runtime_read,
7953 .write = cpu_rt_runtime_write, 9144 .write = cpu_rt_runtime_write,
7954 }, 9145 },
9146 {
9147 .name = "rt_period_us",
9148 .read_uint = cpu_rt_period_read_uint,
9149 .write_uint = cpu_rt_period_write_uint,
9150 },
7955#endif 9151#endif
7956}; 9152};
7957 9153
@@ -7992,9 +9188,9 @@ struct cpuacct {
7992struct cgroup_subsys cpuacct_subsys; 9188struct cgroup_subsys cpuacct_subsys;
7993 9189
7994/* return cpu accounting group corresponding to this container */ 9190/* return cpu accounting group corresponding to this container */
7995static inline struct cpuacct *cgroup_ca(struct cgroup *cont) 9191static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
7996{ 9192{
7997 return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), 9193 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
7998 struct cpuacct, css); 9194 struct cpuacct, css);
7999} 9195}
8000 9196
@@ -8007,7 +9203,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
8007 9203
8008/* create a new cpu accounting group */ 9204/* create a new cpu accounting group */
8009static struct cgroup_subsys_state *cpuacct_create( 9205static struct cgroup_subsys_state *cpuacct_create(
8010 struct cgroup_subsys *ss, struct cgroup *cont) 9206 struct cgroup_subsys *ss, struct cgroup *cgrp)
8011{ 9207{
8012 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 9208 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8013 9209
@@ -8025,18 +9221,18 @@ static struct cgroup_subsys_state *cpuacct_create(
8025 9221
8026/* destroy an existing cpu accounting group */ 9222/* destroy an existing cpu accounting group */
8027static void 9223static void
8028cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 9224cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8029{ 9225{
8030 struct cpuacct *ca = cgroup_ca(cont); 9226 struct cpuacct *ca = cgroup_ca(cgrp);
8031 9227
8032 free_percpu(ca->cpuusage); 9228 free_percpu(ca->cpuusage);
8033 kfree(ca); 9229 kfree(ca);
8034} 9230}
8035 9231
8036/* return total cpu usage (in nanoseconds) of a group */ 9232/* return total cpu usage (in nanoseconds) of a group */
8037static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) 9233static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8038{ 9234{
8039 struct cpuacct *ca = cgroup_ca(cont); 9235 struct cpuacct *ca = cgroup_ca(cgrp);
8040 u64 totalcpuusage = 0; 9236 u64 totalcpuusage = 0;
8041 int i; 9237 int i;
8042 9238
@@ -8055,16 +9251,40 @@ static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
8055 return totalcpuusage; 9251 return totalcpuusage;
8056} 9252}
8057 9253
9254static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9255 u64 reset)
9256{
9257 struct cpuacct *ca = cgroup_ca(cgrp);
9258 int err = 0;
9259 int i;
9260
9261 if (reset) {
9262 err = -EINVAL;
9263 goto out;
9264 }
9265
9266 for_each_possible_cpu(i) {
9267 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9268
9269 spin_lock_irq(&cpu_rq(i)->lock);
9270 *cpuusage = 0;
9271 spin_unlock_irq(&cpu_rq(i)->lock);
9272 }
9273out:
9274 return err;
9275}
9276
8058static struct cftype files[] = { 9277static struct cftype files[] = {
8059 { 9278 {
8060 .name = "usage", 9279 .name = "usage",
8061 .read_uint = cpuusage_read, 9280 .read_uint = cpuusage_read,
9281 .write_uint = cpuusage_write,
8062 }, 9282 },
8063}; 9283};
8064 9284
8065static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) 9285static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8066{ 9286{
8067 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); 9287 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8068} 9288}
8069 9289
8070/* 9290/*
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index ef358ba07683..f3f4af4b8b0f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -67,14 +67,24 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
67 (long long)(p->nvcsw + p->nivcsw), 67 (long long)(p->nvcsw + p->nivcsw),
68 p->prio); 68 p->prio);
69#ifdef CONFIG_SCHEDSTATS 69#ifdef CONFIG_SCHEDSTATS
70 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", 70 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
71 SPLIT_NS(p->se.vruntime), 71 SPLIT_NS(p->se.vruntime),
72 SPLIT_NS(p->se.sum_exec_runtime), 72 SPLIT_NS(p->se.sum_exec_runtime),
73 SPLIT_NS(p->se.sum_sleep_runtime)); 73 SPLIT_NS(p->se.sum_sleep_runtime));
74#else 74#else
75 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", 75 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
76 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 76 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
77#endif 77#endif
78
79#ifdef CONFIG_CGROUP_SCHED
80 {
81 char path[64];
82
83 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
84 SEQ_printf(m, " %s", path);
85 }
86#endif
87 SEQ_printf(m, "\n");
78} 88}
79 89
80static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 90static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
@@ -109,7 +119,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
109 struct sched_entity *last; 119 struct sched_entity *last;
110 unsigned long flags; 120 unsigned long flags;
111 121
112 SEQ_printf(m, "\ncfs_rq\n"); 122#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED)
123 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
124#else
125 char path[128] = "";
126 struct cgroup *cgroup = NULL;
127 struct task_group *tg = cfs_rq->tg;
128
129 if (tg)
130 cgroup = tg->css.cgroup;
131
132 if (cgroup)
133 cgroup_path(cgroup, path, sizeof(path));
134
135 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
136#endif
113 137
114 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
115 SPLIT_NS(cfs_rq->exec_clock)); 139 SPLIT_NS(cfs_rq->exec_clock));
@@ -143,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
143#endif 167#endif
144 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", 168 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
145 cfs_rq->nr_spread_over); 169 cfs_rq->nr_spread_over);
170#ifdef CONFIG_FAIR_GROUP_SCHED
171#ifdef CONFIG_SMP
172 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
173#endif
174#endif
146} 175}
147 176
148static void print_cpu(struct seq_file *m, int cpu) 177static void print_cpu(struct seq_file *m, int cpu)
@@ -214,7 +243,6 @@ static int sched_debug_show(struct seq_file *m, void *v)
214 PN(sysctl_sched_latency); 243 PN(sysctl_sched_latency);
215 PN(sysctl_sched_min_granularity); 244 PN(sysctl_sched_min_granularity);
216 PN(sysctl_sched_wakeup_granularity); 245 PN(sysctl_sched_wakeup_granularity);
217 PN(sysctl_sched_batch_wakeup_granularity);
218 PN(sysctl_sched_child_runs_first); 246 PN(sysctl_sched_child_runs_first);
219 P(sysctl_sched_features); 247 P(sysctl_sched_features);
220#undef PN 248#undef PN
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 86a93376282c..89fa32b4edf2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -62,24 +62,14 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1;
62unsigned int __read_mostly sysctl_sched_compat_yield; 62unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_BATCH wake-up granularity.
66 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 *
68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies.
71 */
72unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
73
74/*
75 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
76 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
77 * 67 *
78 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
79 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
80 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
81 */ 71 */
82unsigned int sysctl_sched_wakeup_granularity = 5000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
83 73
84const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
85 75
@@ -87,6 +77,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
87 * CFS operations on generic schedulable entities: 77 * CFS operations on generic schedulable entities:
88 */ 78 */
89 79
80static inline struct task_struct *task_of(struct sched_entity *se)
81{
82 return container_of(se, struct task_struct, se);
83}
84
90#ifdef CONFIG_FAIR_GROUP_SCHED 85#ifdef CONFIG_FAIR_GROUP_SCHED
91 86
92/* cpu runqueue to which this cfs_rq is attached */ 87/* cpu runqueue to which this cfs_rq is attached */
@@ -98,6 +93,54 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
98/* An entity is a task if it doesn't "own" a runqueue */ 93/* An entity is a task if it doesn't "own" a runqueue */
99#define entity_is_task(se) (!se->my_q) 94#define entity_is_task(se) (!se->my_q)
100 95
96/* Walk up scheduling entities hierarchy */
97#define for_each_sched_entity(se) \
98 for (; se; se = se->parent)
99
100static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
101{
102 return p->se.cfs_rq;
103}
104
105/* runqueue on which this entity is (to be) queued */
106static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
107{
108 return se->cfs_rq;
109}
110
111/* runqueue "owned" by this group */
112static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
113{
114 return grp->my_q;
115}
116
117/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
118 * another cpu ('this_cpu')
119 */
120static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
121{
122 return cfs_rq->tg->cfs_rq[this_cpu];
123}
124
125/* Iterate thr' all leaf cfs_rq's on a runqueue */
126#define for_each_leaf_cfs_rq(rq, cfs_rq) \
127 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
128
129/* Do the two (enqueued) entities belong to the same group ? */
130static inline int
131is_same_group(struct sched_entity *se, struct sched_entity *pse)
132{
133 if (se->cfs_rq == pse->cfs_rq)
134 return 1;
135
136 return 0;
137}
138
139static inline struct sched_entity *parent_entity(struct sched_entity *se)
140{
141 return se->parent;
142}
143
101#else /* CONFIG_FAIR_GROUP_SCHED */ 144#else /* CONFIG_FAIR_GROUP_SCHED */
102 145
103static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 146static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -107,13 +150,49 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
107 150
108#define entity_is_task(se) 1 151#define entity_is_task(se) 1
109 152
110#endif /* CONFIG_FAIR_GROUP_SCHED */ 153#define for_each_sched_entity(se) \
154 for (; se; se = NULL)
111 155
112static inline struct task_struct *task_of(struct sched_entity *se) 156static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
113{ 157{
114 return container_of(se, struct task_struct, se); 158 return &task_rq(p)->cfs;
159}
160
161static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
162{
163 struct task_struct *p = task_of(se);
164 struct rq *rq = task_rq(p);
165
166 return &rq->cfs;
115} 167}
116 168
169/* runqueue "owned" by this group */
170static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
171{
172 return NULL;
173}
174
175static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
176{
177 return &cpu_rq(this_cpu)->cfs;
178}
179
180#define for_each_leaf_cfs_rq(rq, cfs_rq) \
181 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
182
183static inline int
184is_same_group(struct sched_entity *se, struct sched_entity *pse)
185{
186 return 1;
187}
188
189static inline struct sched_entity *parent_entity(struct sched_entity *se)
190{
191 return NULL;
192}
193
194#endif /* CONFIG_FAIR_GROUP_SCHED */
195
117 196
118/************************************************************** 197/**************************************************************
119 * Scheduling class tree data structure manipulation methods: 198 * Scheduling class tree data structure manipulation methods:
@@ -255,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
255#endif 334#endif
256 335
257/* 336/*
337 * delta *= w / rw
338 */
339static inline unsigned long
340calc_delta_weight(unsigned long delta, struct sched_entity *se)
341{
342 for_each_sched_entity(se) {
343 delta = calc_delta_mine(delta,
344 se->load.weight, &cfs_rq_of(se)->load);
345 }
346
347 return delta;
348}
349
350/*
351 * delta *= rw / w
352 */
353static inline unsigned long
354calc_delta_fair(unsigned long delta, struct sched_entity *se)
355{
356 for_each_sched_entity(se) {
357 delta = calc_delta_mine(delta,
358 cfs_rq_of(se)->load.weight, &se->load);
359 }
360
361 return delta;
362}
363
364/*
258 * The idea is to set a period in which each task runs once. 365 * The idea is to set a period in which each task runs once.
259 * 366 *
260 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch 367 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -283,29 +390,54 @@ static u64 __sched_period(unsigned long nr_running)
283 */ 390 */
284static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 391static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
285{ 392{
286 return calc_delta_mine(__sched_period(cfs_rq->nr_running), 393 return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
287 se->load.weight, &cfs_rq->load);
288} 394}
289 395
290/* 396/*
291 * We calculate the vruntime slice. 397 * We calculate the vruntime slice of a to be inserted task
292 * 398 *
293 * vs = s/w = p/rw 399 * vs = s*rw/w = p
294 */ 400 */
295static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) 401static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
296{ 402{
297 u64 vslice = __sched_period(nr_running); 403 unsigned long nr_running = cfs_rq->nr_running;
298 404
299 vslice *= NICE_0_LOAD; 405 if (!se->on_rq)
300 do_div(vslice, rq_weight); 406 nr_running++;
301 407
302 return vslice; 408 return __sched_period(nr_running);
303} 409}
304 410
305static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 411/*
412 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
413 * that it favours >=0 over <0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 *
421 */
422static unsigned long
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
306{ 424{
307 return __sched_vslice(cfs_rq->load.weight + se->load.weight, 425 struct load_weight lw = {
308 cfs_rq->nr_running + 1); 426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
429
430 for_each_sched_entity(se) {
431 struct load_weight *se_lw = &se->load;
432
433 if (se->load.weight < NICE_0_LOAD)
434 se_lw = &lw;
435
436 delta = calc_delta_mine(delta,
437 cfs_rq_of(se)->load.weight, se_lw);
438 }
439
440 return delta;
309} 441}
310 442
311/* 443/*
@@ -322,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
322 454
323 curr->sum_exec_runtime += delta_exec; 455 curr->sum_exec_runtime += delta_exec;
324 schedstat_add(cfs_rq, exec_clock, delta_exec); 456 schedstat_add(cfs_rq, exec_clock, delta_exec);
325 delta_exec_weighted = delta_exec; 457 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
326 if (unlikely(curr->load.weight != NICE_0_LOAD)) {
327 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
328 &curr->load);
329 }
330 curr->vruntime += delta_exec_weighted; 458 curr->vruntime += delta_exec_weighted;
331} 459}
332 460
@@ -413,20 +541,43 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
413 * Scheduling class queueing methods: 541 * Scheduling class queueing methods:
414 */ 542 */
415 543
544#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
545static void
546add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
547{
548 cfs_rq->task_weight += weight;
549}
550#else
551static inline void
552add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
553{
554}
555#endif
556
416static void 557static void
417account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 558account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
418{ 559{
419 update_load_add(&cfs_rq->load, se->load.weight); 560 update_load_add(&cfs_rq->load, se->load.weight);
561 if (!parent_entity(se))
562 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
563 if (entity_is_task(se))
564 add_cfs_task_weight(cfs_rq, se->load.weight);
420 cfs_rq->nr_running++; 565 cfs_rq->nr_running++;
421 se->on_rq = 1; 566 se->on_rq = 1;
567 list_add(&se->group_node, &cfs_rq->tasks);
422} 568}
423 569
424static void 570static void
425account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 571account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
426{ 572{
427 update_load_sub(&cfs_rq->load, se->load.weight); 573 update_load_sub(&cfs_rq->load, se->load.weight);
574 if (!parent_entity(se))
575 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
576 if (entity_is_task(se))
577 add_cfs_task_weight(cfs_rq, -se->load.weight);
428 cfs_rq->nr_running--; 578 cfs_rq->nr_running--;
429 se->on_rq = 0; 579 se->on_rq = 0;
580 list_del_init(&se->group_node);
430} 581}
431 582
432static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 583static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -511,8 +662,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
511 if (!initial) { 662 if (!initial) {
512 /* sleeps upto a single latency don't count. */ 663 /* sleeps upto a single latency don't count. */
513 if (sched_feat(NEW_FAIR_SLEEPERS)) { 664 if (sched_feat(NEW_FAIR_SLEEPERS)) {
514 vruntime -= calc_delta_fair(sysctl_sched_latency, 665 if (sched_feat(NORMALIZED_SLEEPER))
515 &cfs_rq->load); 666 vruntime -= calc_delta_weight(sysctl_sched_latency, se);
667 else
668 vruntime -= sysctl_sched_latency;
516 } 669 }
517 670
518 /* ensure we never gain time by being placed backwards. */ 671 /* ensure we never gain time by being placed backwards. */
@@ -629,20 +782,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
629 se->prev_sum_exec_runtime = se->sum_exec_runtime; 782 se->prev_sum_exec_runtime = se->sum_exec_runtime;
630} 783}
631 784
785static int
786wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
787
632static struct sched_entity * 788static struct sched_entity *
633pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) 789pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
634{ 790{
635 s64 diff, gran;
636
637 if (!cfs_rq->next) 791 if (!cfs_rq->next)
638 return se; 792 return se;
639 793
640 diff = cfs_rq->next->vruntime - se->vruntime; 794 if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
641 if (diff < 0)
642 return se;
643
644 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
645 if (diff > gran)
646 return se; 795 return se;
647 796
648 return cfs_rq->next; 797 return cfs_rq->next;
@@ -710,101 +859,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
710 * CFS operations on tasks: 859 * CFS operations on tasks:
711 */ 860 */
712 861
713#ifdef CONFIG_FAIR_GROUP_SCHED
714
715/* Walk up scheduling entities hierarchy */
716#define for_each_sched_entity(se) \
717 for (; se; se = se->parent)
718
719static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
720{
721 return p->se.cfs_rq;
722}
723
724/* runqueue on which this entity is (to be) queued */
725static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
726{
727 return se->cfs_rq;
728}
729
730/* runqueue "owned" by this group */
731static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
732{
733 return grp->my_q;
734}
735
736/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
737 * another cpu ('this_cpu')
738 */
739static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
740{
741 return cfs_rq->tg->cfs_rq[this_cpu];
742}
743
744/* Iterate thr' all leaf cfs_rq's on a runqueue */
745#define for_each_leaf_cfs_rq(rq, cfs_rq) \
746 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
747
748/* Do the two (enqueued) entities belong to the same group ? */
749static inline int
750is_same_group(struct sched_entity *se, struct sched_entity *pse)
751{
752 if (se->cfs_rq == pse->cfs_rq)
753 return 1;
754
755 return 0;
756}
757
758static inline struct sched_entity *parent_entity(struct sched_entity *se)
759{
760 return se->parent;
761}
762
763#else /* CONFIG_FAIR_GROUP_SCHED */
764
765#define for_each_sched_entity(se) \
766 for (; se; se = NULL)
767
768static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
769{
770 return &task_rq(p)->cfs;
771}
772
773static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
774{
775 struct task_struct *p = task_of(se);
776 struct rq *rq = task_rq(p);
777
778 return &rq->cfs;
779}
780
781/* runqueue "owned" by this group */
782static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
783{
784 return NULL;
785}
786
787static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
788{
789 return &cpu_rq(this_cpu)->cfs;
790}
791
792#define for_each_leaf_cfs_rq(rq, cfs_rq) \
793 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
794
795static inline int
796is_same_group(struct sched_entity *se, struct sched_entity *pse)
797{
798 return 1;
799}
800
801static inline struct sched_entity *parent_entity(struct sched_entity *se)
802{
803 return NULL;
804}
805
806#endif /* CONFIG_FAIR_GROUP_SCHED */
807
808#ifdef CONFIG_SCHED_HRTICK 862#ifdef CONFIG_SCHED_HRTICK
809static void hrtick_start_fair(struct rq *rq, struct task_struct *p) 863static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
810{ 864{
@@ -918,7 +972,7 @@ static void yield_task_fair(struct rq *rq)
918 /* 972 /*
919 * Already in the rightmost position? 973 * Already in the rightmost position?
920 */ 974 */
921 if (unlikely(rightmost->vruntime < se->vruntime)) 975 if (unlikely(!rightmost || rightmost->vruntime < se->vruntime))
922 return; 976 return;
923 977
924 /* 978 /*
@@ -957,7 +1011,9 @@ static int wake_idle(int cpu, struct task_struct *p)
957 return cpu; 1011 return cpu;
958 1012
959 for_each_domain(cpu, sd) { 1013 for_each_domain(cpu, sd) {
960 if (sd->flags & SD_WAKE_IDLE) { 1014 if ((sd->flags & SD_WAKE_IDLE)
1015 || ((sd->flags & SD_WAKE_IDLE_FAR)
1016 && !task_hot(p, task_rq(p)->clock, sd))) {
961 cpus_and(tmp, sd->span, p->cpus_allowed); 1017 cpus_and(tmp, sd->span, p->cpus_allowed);
962 for_each_cpu_mask(i, tmp) { 1018 for_each_cpu_mask(i, tmp) {
963 if (idle_cpu(i)) { 1019 if (idle_cpu(i)) {
@@ -1101,6 +1157,58 @@ out:
1101} 1157}
1102#endif /* CONFIG_SMP */ 1158#endif /* CONFIG_SMP */
1103 1159
1160static unsigned long wakeup_gran(struct sched_entity *se)
1161{
1162 unsigned long gran = sysctl_sched_wakeup_granularity;
1163
1164 /*
1165 * More easily preempt - nice tasks, while not making it harder for
1166 * + nice tasks.
1167 */
1168 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
1169
1170 return gran;
1171}
1172
1173/*
1174 * Should 'se' preempt 'curr'.
1175 *
1176 * |s1
1177 * |s2
1178 * |s3
1179 * g
1180 * |<--->|c
1181 *
1182 * w(c, s1) = -1
1183 * w(c, s2) = 0
1184 * w(c, s3) = 1
1185 *
1186 */
1187static int
1188wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1189{
1190 s64 gran, vdiff = curr->vruntime - se->vruntime;
1191
1192 if (vdiff < 0)
1193 return -1;
1194
1195 gran = wakeup_gran(curr);
1196 if (vdiff > gran)
1197 return 1;
1198
1199 return 0;
1200}
1201
1202/* return depth at which a sched entity is present in the hierarchy */
1203static inline int depth_se(struct sched_entity *se)
1204{
1205 int depth = 0;
1206
1207 for_each_sched_entity(se)
1208 depth++;
1209
1210 return depth;
1211}
1104 1212
1105/* 1213/*
1106 * Preempt the current task with a newly woken task if needed: 1214 * Preempt the current task with a newly woken task if needed:
@@ -1110,7 +1218,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1110 struct task_struct *curr = rq->curr; 1218 struct task_struct *curr = rq->curr;
1111 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1219 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1112 struct sched_entity *se = &curr->se, *pse = &p->se; 1220 struct sched_entity *se = &curr->se, *pse = &p->se;
1113 unsigned long gran; 1221 int se_depth, pse_depth;
1114 1222
1115 if (unlikely(rt_prio(p->prio))) { 1223 if (unlikely(rt_prio(p->prio))) {
1116 update_rq_clock(rq); 1224 update_rq_clock(rq);
@@ -1135,20 +1243,33 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1135 if (!sched_feat(WAKEUP_PREEMPT)) 1243 if (!sched_feat(WAKEUP_PREEMPT))
1136 return; 1244 return;
1137 1245
1138 while (!is_same_group(se, pse)) { 1246 /*
1247 * preemption test can be made between sibling entities who are in the
1248 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
1249 * both tasks until we find their ancestors who are siblings of common
1250 * parent.
1251 */
1252
1253 /* First walk up until both entities are at same depth */
1254 se_depth = depth_se(se);
1255 pse_depth = depth_se(pse);
1256
1257 while (se_depth > pse_depth) {
1258 se_depth--;
1139 se = parent_entity(se); 1259 se = parent_entity(se);
1260 }
1261
1262 while (pse_depth > se_depth) {
1263 pse_depth--;
1140 pse = parent_entity(pse); 1264 pse = parent_entity(pse);
1141 } 1265 }
1142 1266
1143 gran = sysctl_sched_wakeup_granularity; 1267 while (!is_same_group(se, pse)) {
1144 /* 1268 se = parent_entity(se);
1145 * More easily preempt - nice tasks, while not making 1269 pse = parent_entity(pse);
1146 * it harder for + nice tasks. 1270 }
1147 */
1148 if (unlikely(se->load.weight > NICE_0_LOAD))
1149 gran = calc_delta_fair(gran, &se->load);
1150 1271
1151 if (pse->vruntime + gran < se->vruntime) 1272 if (wakeup_preempt_entity(se, pse) == 1)
1152 resched_task(curr); 1273 resched_task(curr);
1153} 1274}
1154 1275
@@ -1199,15 +1320,27 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1199 * the current task: 1320 * the current task:
1200 */ 1321 */
1201static struct task_struct * 1322static struct task_struct *
1202__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) 1323__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1203{ 1324{
1204 struct task_struct *p; 1325 struct task_struct *p = NULL;
1326 struct sched_entity *se;
1327
1328 if (next == &cfs_rq->tasks)
1329 return NULL;
1330
1331 /* Skip over entities that are not tasks */
1332 do {
1333 se = list_entry(next, struct sched_entity, group_node);
1334 next = next->next;
1335 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1205 1336
1206 if (!curr) 1337 if (next == &cfs_rq->tasks)
1207 return NULL; 1338 return NULL;
1208 1339
1209 p = rb_entry(curr, struct task_struct, se.run_node); 1340 cfs_rq->balance_iterator = next;
1210 cfs_rq->rb_load_balance_curr = rb_next(curr); 1341
1342 if (entity_is_task(se))
1343 p = task_of(se);
1211 1344
1212 return p; 1345 return p;
1213} 1346}
@@ -1216,85 +1349,100 @@ static struct task_struct *load_balance_start_fair(void *arg)
1216{ 1349{
1217 struct cfs_rq *cfs_rq = arg; 1350 struct cfs_rq *cfs_rq = arg;
1218 1351
1219 return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); 1352 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
1220} 1353}
1221 1354
1222static struct task_struct *load_balance_next_fair(void *arg) 1355static struct task_struct *load_balance_next_fair(void *arg)
1223{ 1356{
1224 struct cfs_rq *cfs_rq = arg; 1357 struct cfs_rq *cfs_rq = arg;
1225 1358
1226 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); 1359 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
1227} 1360}
1228 1361
1229#ifdef CONFIG_FAIR_GROUP_SCHED 1362static unsigned long
1230static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) 1363__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1364 unsigned long max_load_move, struct sched_domain *sd,
1365 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
1366 struct cfs_rq *cfs_rq)
1231{ 1367{
1232 struct sched_entity *curr; 1368 struct rq_iterator cfs_rq_iterator;
1233 struct task_struct *p;
1234
1235 if (!cfs_rq->nr_running || !first_fair(cfs_rq))
1236 return MAX_PRIO;
1237
1238 curr = cfs_rq->curr;
1239 if (!curr)
1240 curr = __pick_next_entity(cfs_rq);
1241 1369
1242 p = task_of(curr); 1370 cfs_rq_iterator.start = load_balance_start_fair;
1371 cfs_rq_iterator.next = load_balance_next_fair;
1372 cfs_rq_iterator.arg = cfs_rq;
1243 1373
1244 return p->prio; 1374 return balance_tasks(this_rq, this_cpu, busiest,
1375 max_load_move, sd, idle, all_pinned,
1376 this_best_prio, &cfs_rq_iterator);
1245} 1377}
1246#endif
1247 1378
1379#ifdef CONFIG_FAIR_GROUP_SCHED
1248static unsigned long 1380static unsigned long
1249load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1381load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1250 unsigned long max_load_move, 1382 unsigned long max_load_move,
1251 struct sched_domain *sd, enum cpu_idle_type idle, 1383 struct sched_domain *sd, enum cpu_idle_type idle,
1252 int *all_pinned, int *this_best_prio) 1384 int *all_pinned, int *this_best_prio)
1253{ 1385{
1254 struct cfs_rq *busy_cfs_rq;
1255 long rem_load_move = max_load_move; 1386 long rem_load_move = max_load_move;
1256 struct rq_iterator cfs_rq_iterator; 1387 int busiest_cpu = cpu_of(busiest);
1257 1388 struct task_group *tg;
1258 cfs_rq_iterator.start = load_balance_start_fair;
1259 cfs_rq_iterator.next = load_balance_next_fair;
1260 1389
1261 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1390 rcu_read_lock();
1262#ifdef CONFIG_FAIR_GROUP_SCHED 1391 list_for_each_entry(tg, &task_groups, list) {
1263 struct cfs_rq *this_cfs_rq;
1264 long imbalance; 1392 long imbalance;
1265 unsigned long maxload; 1393 unsigned long this_weight, busiest_weight;
1394 long rem_load, max_load, moved_load;
1395
1396 /*
1397 * empty group
1398 */
1399 if (!aggregate(tg, sd)->task_weight)
1400 continue;
1401
1402 rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
1403 rem_load /= aggregate(tg, sd)->load + 1;
1404
1405 this_weight = tg->cfs_rq[this_cpu]->task_weight;
1406 busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
1407
1408 imbalance = (busiest_weight - this_weight) / 2;
1266 1409
1267 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 1410 if (imbalance < 0)
1411 imbalance = busiest_weight;
1268 1412
1269 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; 1413 max_load = max(rem_load, imbalance);
1270 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ 1414 moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
1271 if (imbalance <= 0) 1415 max_load, sd, idle, all_pinned, this_best_prio,
1416 tg->cfs_rq[busiest_cpu]);
1417
1418 if (!moved_load)
1272 continue; 1419 continue;
1273 1420
1274 /* Don't pull more than imbalance/2 */ 1421 move_group_shares(tg, sd, busiest_cpu, this_cpu);
1275 imbalance /= 2;
1276 maxload = min(rem_load_move, imbalance);
1277 1422
1278 *this_best_prio = cfs_rq_best_prio(this_cfs_rq); 1423 moved_load *= aggregate(tg, sd)->load;
1279#else 1424 moved_load /= aggregate(tg, sd)->rq_weight + 1;
1280# define maxload rem_load_move
1281#endif
1282 /*
1283 * pass busy_cfs_rq argument into
1284 * load_balance_[start|next]_fair iterators
1285 */
1286 cfs_rq_iterator.arg = busy_cfs_rq;
1287 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
1288 maxload, sd, idle, all_pinned,
1289 this_best_prio,
1290 &cfs_rq_iterator);
1291 1425
1292 if (rem_load_move <= 0) 1426 rem_load_move -= moved_load;
1427 if (rem_load_move < 0)
1293 break; 1428 break;
1294 } 1429 }
1430 rcu_read_unlock();
1295 1431
1296 return max_load_move - rem_load_move; 1432 return max_load_move - rem_load_move;
1297} 1433}
1434#else
1435static unsigned long
1436load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1437 unsigned long max_load_move,
1438 struct sched_domain *sd, enum cpu_idle_type idle,
1439 int *all_pinned, int *this_best_prio)
1440{
1441 return __load_balance_fair(this_rq, this_cpu, busiest,
1442 max_load_move, sd, idle, all_pinned,
1443 this_best_prio, &busiest->cfs);
1444}
1445#endif
1298 1446
1299static int 1447static int
1300move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1448move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
@@ -1463,16 +1611,40 @@ static const struct sched_class fair_sched_class = {
1463}; 1611};
1464 1612
1465#ifdef CONFIG_SCHED_DEBUG 1613#ifdef CONFIG_SCHED_DEBUG
1614static void
1615print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth)
1616{
1617 struct sched_entity *se;
1618
1619 if (!cfs_rq)
1620 return;
1621
1622 list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) {
1623 int i;
1624
1625 for (i = depth; i; i--)
1626 seq_puts(m, " ");
1627
1628 seq_printf(m, "%lu %s %lu\n",
1629 se->load.weight,
1630 entity_is_task(se) ? "T" : "G",
1631 calc_delta_weight(SCHED_LOAD_SCALE, se)
1632 );
1633 if (!entity_is_task(se))
1634 print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
1635 }
1636}
1637
1466static void print_cfs_stats(struct seq_file *m, int cpu) 1638static void print_cfs_stats(struct seq_file *m, int cpu)
1467{ 1639{
1468 struct cfs_rq *cfs_rq; 1640 struct cfs_rq *cfs_rq;
1469 1641
1470#ifdef CONFIG_FAIR_GROUP_SCHED
1471 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
1472#endif
1473 rcu_read_lock(); 1642 rcu_read_lock();
1474 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1643 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1475 print_cfs_rq(m, cpu, cfs_rq); 1644 print_cfs_rq(m, cpu, cfs_rq);
1645
1646 seq_printf(m, "\nWeight tree:\n");
1647 print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1);
1476 rcu_read_unlock(); 1648 rcu_read_unlock();
1477} 1649}
1478#endif 1650#endif
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
new file mode 100644
index 000000000000..1c7283cb9581
--- /dev/null
+++ b/kernel/sched_features.h
@@ -0,0 +1,10 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
2SCHED_FEAT(WAKEUP_PREEMPT, 1)
3SCHED_FEAT(START_DEBIT, 1)
4SCHED_FEAT(AFFINE_WAKEUPS, 1)
5SCHED_FEAT(CACHE_HOT_BUDDY, 1)
6SCHED_FEAT(SYNC_WAKEUPS, 1)
7SCHED_FEAT(HRTICK, 1)
8SCHED_FEAT(DOUBLE_TICK, 0)
9SCHED_FEAT(NORMALIZED_SLEEPER, 1)
10SCHED_FEAT(DEADLINE, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0a6d2e516420..c2730a5a4f05 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
62 if (!rt_rq->tg) 62 if (!rt_rq->tg)
63 return RUNTIME_INF; 63 return RUNTIME_INF;
64 64
65 return rt_rq->tg->rt_runtime; 65 return rt_rq->rt_runtime;
66}
67
68static inline u64 sched_rt_period(struct rt_rq *rt_rq)
69{
70 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
66} 71}
67 72
68#define for_each_leaf_rt_rq(rt_rq, rq) \ 73#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -127,14 +132,39 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
127 return p->prio != p->normal_prio; 132 return p->prio != p->normal_prio;
128} 133}
129 134
135#ifdef CONFIG_SMP
136static inline cpumask_t sched_rt_period_mask(void)
137{
138 return cpu_rq(smp_processor_id())->rd->span;
139}
140#else
141static inline cpumask_t sched_rt_period_mask(void)
142{
143 return cpu_online_map;
144}
145#endif
146
147static inline
148struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
149{
150 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
151}
152
153static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
154{
155 return &rt_rq->tg->rt_bandwidth;
156}
157
130#else 158#else
131 159
132static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 160static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
133{ 161{
134 if (sysctl_sched_rt_runtime == -1) 162 return rt_rq->rt_runtime;
135 return RUNTIME_INF; 163}
136 164
137 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 165static inline u64 sched_rt_period(struct rt_rq *rt_rq)
166{
167 return ktime_to_ns(def_rt_bandwidth.rt_period);
138} 168}
139 169
140#define for_each_leaf_rt_rq(rt_rq, rq) \ 170#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -173,6 +203,102 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
173{ 203{
174 return rt_rq->rt_throttled; 204 return rt_rq->rt_throttled;
175} 205}
206
207static inline cpumask_t sched_rt_period_mask(void)
208{
209 return cpu_online_map;
210}
211
212static inline
213struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
214{
215 return &cpu_rq(cpu)->rt;
216}
217
218static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
219{
220 return &def_rt_bandwidth;
221}
222
223#endif
224
225static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
226{
227 int i, idle = 1;
228 cpumask_t span;
229
230 if (rt_b->rt_runtime == RUNTIME_INF)
231 return 1;
232
233 span = sched_rt_period_mask();
234 for_each_cpu_mask(i, span) {
235 int enqueue = 0;
236 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
237 struct rq *rq = rq_of_rt_rq(rt_rq);
238
239 spin_lock(&rq->lock);
240 if (rt_rq->rt_time) {
241 u64 runtime;
242
243 spin_lock(&rt_rq->rt_runtime_lock);
244 runtime = rt_rq->rt_runtime;
245 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
246 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
247 rt_rq->rt_throttled = 0;
248 enqueue = 1;
249 }
250 if (rt_rq->rt_time || rt_rq->rt_nr_running)
251 idle = 0;
252 spin_unlock(&rt_rq->rt_runtime_lock);
253 }
254
255 if (enqueue)
256 sched_rt_rq_enqueue(rt_rq);
257 spin_unlock(&rq->lock);
258 }
259
260 return idle;
261}
262
263#ifdef CONFIG_SMP
264static int balance_runtime(struct rt_rq *rt_rq)
265{
266 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
267 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
268 int i, weight, more = 0;
269 u64 rt_period;
270
271 weight = cpus_weight(rd->span);
272
273 spin_lock(&rt_b->rt_runtime_lock);
274 rt_period = ktime_to_ns(rt_b->rt_period);
275 for_each_cpu_mask(i, rd->span) {
276 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
277 s64 diff;
278
279 if (iter == rt_rq)
280 continue;
281
282 spin_lock(&iter->rt_runtime_lock);
283 diff = iter->rt_runtime - iter->rt_time;
284 if (diff > 0) {
285 do_div(diff, weight);
286 if (rt_rq->rt_runtime + diff > rt_period)
287 diff = rt_period - rt_rq->rt_runtime;
288 iter->rt_runtime -= diff;
289 rt_rq->rt_runtime += diff;
290 more = 1;
291 if (rt_rq->rt_runtime == rt_period) {
292 spin_unlock(&iter->rt_runtime_lock);
293 break;
294 }
295 }
296 spin_unlock(&iter->rt_runtime_lock);
297 }
298 spin_unlock(&rt_b->rt_runtime_lock);
299
300 return more;
301}
176#endif 302#endif
177 303
178static inline int rt_se_prio(struct sched_rt_entity *rt_se) 304static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -197,12 +323,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
197 if (rt_rq->rt_throttled) 323 if (rt_rq->rt_throttled)
198 return rt_rq_throttled(rt_rq); 324 return rt_rq_throttled(rt_rq);
199 325
326 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
327 return 0;
328
329#ifdef CONFIG_SMP
200 if (rt_rq->rt_time > runtime) { 330 if (rt_rq->rt_time > runtime) {
201 struct rq *rq = rq_of_rt_rq(rt_rq); 331 int more;
202 332
203 rq->rt_throttled = 1; 333 spin_unlock(&rt_rq->rt_runtime_lock);
204 rt_rq->rt_throttled = 1; 334 more = balance_runtime(rt_rq);
335 spin_lock(&rt_rq->rt_runtime_lock);
205 336
337 if (more)
338 runtime = sched_rt_runtime(rt_rq);
339 }
340#endif
341
342 if (rt_rq->rt_time > runtime) {
343 rt_rq->rt_throttled = 1;
206 if (rt_rq_throttled(rt_rq)) { 344 if (rt_rq_throttled(rt_rq)) {
207 sched_rt_rq_dequeue(rt_rq); 345 sched_rt_rq_dequeue(rt_rq);
208 return 1; 346 return 1;
@@ -212,29 +350,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
212 return 0; 350 return 0;
213} 351}
214 352
215static void update_sched_rt_period(struct rq *rq)
216{
217 struct rt_rq *rt_rq;
218 u64 period;
219
220 while (rq->clock > rq->rt_period_expire) {
221 period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
222 rq->rt_period_expire += period;
223
224 for_each_leaf_rt_rq(rt_rq, rq) {
225 u64 runtime = sched_rt_runtime(rt_rq);
226
227 rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
228 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
229 rt_rq->rt_throttled = 0;
230 sched_rt_rq_enqueue(rt_rq);
231 }
232 }
233
234 rq->rt_throttled = 0;
235 }
236}
237
238/* 353/*
239 * Update the current task's runtime statistics. Skip current tasks that 354 * Update the current task's runtime statistics. Skip current tasks that
240 * are not in our scheduling class. 355 * are not in our scheduling class.
@@ -259,9 +374,15 @@ static void update_curr_rt(struct rq *rq)
259 curr->se.exec_start = rq->clock; 374 curr->se.exec_start = rq->clock;
260 cpuacct_charge(curr, delta_exec); 375 cpuacct_charge(curr, delta_exec);
261 376
262 rt_rq->rt_time += delta_exec; 377 for_each_sched_rt_entity(rt_se) {
263 if (sched_rt_runtime_exceeded(rt_rq)) 378 rt_rq = rt_rq_of_se(rt_se);
264 resched_task(curr); 379
380 spin_lock(&rt_rq->rt_runtime_lock);
381 rt_rq->rt_time += delta_exec;
382 if (sched_rt_runtime_exceeded(rt_rq))
383 resched_task(curr);
384 spin_unlock(&rt_rq->rt_runtime_lock);
385 }
265} 386}
266 387
267static inline 388static inline
@@ -284,6 +405,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
284#ifdef CONFIG_RT_GROUP_SCHED 405#ifdef CONFIG_RT_GROUP_SCHED
285 if (rt_se_boosted(rt_se)) 406 if (rt_se_boosted(rt_se))
286 rt_rq->rt_nr_boosted++; 407 rt_rq->rt_nr_boosted++;
408
409 if (rt_rq->tg)
410 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
411#else
412 start_rt_bandwidth(&def_rt_bandwidth);
287#endif 413#endif
288} 414}
289 415
@@ -353,27 +479,21 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
353/* 479/*
354 * Because the prio of an upper entry depends on the lower 480 * Because the prio of an upper entry depends on the lower
355 * entries, we must remove entries top - down. 481 * entries, we must remove entries top - down.
356 *
357 * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
358 * doesn't matter much for now, as h=2 for GROUP_SCHED.
359 */ 482 */
360static void dequeue_rt_stack(struct task_struct *p) 483static void dequeue_rt_stack(struct task_struct *p)
361{ 484{
362 struct sched_rt_entity *rt_se, *top_se; 485 struct sched_rt_entity *rt_se, *back = NULL;
363 486
364 /* 487 rt_se = &p->rt;
365 * dequeue all, top - down. 488 for_each_sched_rt_entity(rt_se) {
366 */ 489 rt_se->back = back;
367 do { 490 back = rt_se;
368 rt_se = &p->rt; 491 }
369 top_se = NULL; 492
370 for_each_sched_rt_entity(rt_se) { 493 for (rt_se = back; rt_se; rt_se = rt_se->back) {
371 if (on_rt_rq(rt_se)) 494 if (on_rt_rq(rt_se))
372 top_se = rt_se; 495 dequeue_rt_entity(rt_se);
373 } 496 }
374 if (top_se)
375 dequeue_rt_entity(top_se);
376 } while (top_se);
377} 497}
378 498
379/* 499/*
@@ -393,6 +513,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
393 */ 513 */
394 for_each_sched_rt_entity(rt_se) 514 for_each_sched_rt_entity(rt_se)
395 enqueue_rt_entity(rt_se); 515 enqueue_rt_entity(rt_se);
516
517 inc_cpu_load(rq, p->se.load.weight);
396} 518}
397 519
398static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 520static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -412,6 +534,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
412 if (rt_rq && rt_rq->rt_nr_running) 534 if (rt_rq && rt_rq->rt_nr_running)
413 enqueue_rt_entity(rt_se); 535 enqueue_rt_entity(rt_se);
414 } 536 }
537
538 dec_cpu_load(rq, p->se.load.weight);
415} 539}
416 540
417/* 541/*
@@ -1001,7 +1125,8 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1001 return 0; 1125 return 0;
1002} 1126}
1003 1127
1004static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) 1128static void set_cpus_allowed_rt(struct task_struct *p,
1129 const cpumask_t *new_mask)
1005{ 1130{
1006 int weight = cpus_weight(*new_mask); 1131 int weight = cpus_weight(*new_mask);
1007 1132
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 5b32433e7ee5..5bae2e0c3ff2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -9,6 +9,11 @@
9static int show_schedstat(struct seq_file *seq, void *v) 9static int show_schedstat(struct seq_file *seq, void *v)
10{ 10{
11 int cpu; 11 int cpu;
12 int mask_len = NR_CPUS/32 * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
12 17
13 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
14 seq_printf(seq, "timestamp %lu\n", jiffies); 19 seq_printf(seq, "timestamp %lu\n", jiffies);
@@ -36,9 +41,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
36 preempt_disable(); 41 preempt_disable();
37 for_each_domain(cpu, sd) { 42 for_each_domain(cpu, sd) {
38 enum cpu_idle_type itype; 43 enum cpu_idle_type itype;
39 char mask_str[NR_CPUS];
40 44
41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span); 45 cpumask_scnprintf(mask_str, mask_len, sd->span);
42 seq_printf(seq, "domain%d %s", dcount++, mask_str); 46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
44 itype++) { 48 itype++) {
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
new file mode 100644
index 000000000000..5c2942e768cd
--- /dev/null
+++ b/kernel/semaphore.c
@@ -0,0 +1,264 @@
1/*
2 * Copyright (c) 2008 Intel Corporation
3 * Author: Matthew Wilcox <willy@linux.intel.com>
4 *
5 * Distributed under the terms of the GNU GPL, version 2
6 *
7 * This file implements counting semaphores.
8 * A counting semaphore may be acquired 'n' times before sleeping.
9 * See mutex.c for single-acquisition sleeping locks which enforce
10 * rules which allow code to be debugged more easily.
11 */
12
13/*
14 * Some notes on the implementation:
15 *
16 * The spinlock controls access to the other members of the semaphore.
17 * down_trylock() and up() can be called from interrupt context, so we
18 * have to disable interrupts when taking the lock. It turns out various
19 * parts of the kernel expect to be able to use down() on a semaphore in
20 * interrupt context when they know it will succeed, so we have to use
21 * irqsave variants for down(), down_interruptible() and down_killable()
22 * too.
23 *
24 * The ->count variable represents how many more tasks can acquire this
25 * semaphore. If it's zero, there may be tasks waiting on the wait_list.
26 */
27
28#include <linux/compiler.h>
29#include <linux/kernel.h>
30#include <linux/module.h>
31#include <linux/sched.h>
32#include <linux/semaphore.h>
33#include <linux/spinlock.h>
34
35static noinline void __down(struct semaphore *sem);
36static noinline int __down_interruptible(struct semaphore *sem);
37static noinline int __down_killable(struct semaphore *sem);
38static noinline int __down_timeout(struct semaphore *sem, long jiffies);
39static noinline void __up(struct semaphore *sem);
40
41/**
42 * down - acquire the semaphore
43 * @sem: the semaphore to be acquired
44 *
45 * Acquires the semaphore. If no more tasks are allowed to acquire the
46 * semaphore, calling this function will put the task to sleep until the
47 * semaphore is released.
48 *
49 * Use of this function is deprecated, please use down_interruptible() or
50 * down_killable() instead.
51 */
52void down(struct semaphore *sem)
53{
54 unsigned long flags;
55
56 spin_lock_irqsave(&sem->lock, flags);
57 if (likely(sem->count > 0))
58 sem->count--;
59 else
60 __down(sem);
61 spin_unlock_irqrestore(&sem->lock, flags);
62}
63EXPORT_SYMBOL(down);
64
65/**
66 * down_interruptible - acquire the semaphore unless interrupted
67 * @sem: the semaphore to be acquired
68 *
69 * Attempts to acquire the semaphore. If no more tasks are allowed to
70 * acquire the semaphore, calling this function will put the task to sleep.
71 * If the sleep is interrupted by a signal, this function will return -EINTR.
72 * If the semaphore is successfully acquired, this function returns 0.
73 */
74int down_interruptible(struct semaphore *sem)
75{
76 unsigned long flags;
77 int result = 0;
78
79 spin_lock_irqsave(&sem->lock, flags);
80 if (likely(sem->count > 0))
81 sem->count--;
82 else
83 result = __down_interruptible(sem);
84 spin_unlock_irqrestore(&sem->lock, flags);
85
86 return result;
87}
88EXPORT_SYMBOL(down_interruptible);
89
90/**
91 * down_killable - acquire the semaphore unless killed
92 * @sem: the semaphore to be acquired
93 *
94 * Attempts to acquire the semaphore. If no more tasks are allowed to
95 * acquire the semaphore, calling this function will put the task to sleep.
96 * If the sleep is interrupted by a fatal signal, this function will return
97 * -EINTR. If the semaphore is successfully acquired, this function returns
98 * 0.
99 */
100int down_killable(struct semaphore *sem)
101{
102 unsigned long flags;
103 int result = 0;
104
105 spin_lock_irqsave(&sem->lock, flags);
106 if (likely(sem->count > 0))
107 sem->count--;
108 else
109 result = __down_killable(sem);
110 spin_unlock_irqrestore(&sem->lock, flags);
111
112 return result;
113}
114EXPORT_SYMBOL(down_killable);
115
116/**
117 * down_trylock - try to acquire the semaphore, without waiting
118 * @sem: the semaphore to be acquired
119 *
120 * Try to acquire the semaphore atomically. Returns 0 if the mutex has
121 * been acquired successfully or 1 if it it cannot be acquired.
122 *
123 * NOTE: This return value is inverted from both spin_trylock and
124 * mutex_trylock! Be careful about this when converting code.
125 *
126 * Unlike mutex_trylock, this function can be used from interrupt context,
127 * and the semaphore can be released by any task or interrupt.
128 */
129int down_trylock(struct semaphore *sem)
130{
131 unsigned long flags;
132 int count;
133
134 spin_lock_irqsave(&sem->lock, flags);
135 count = sem->count - 1;
136 if (likely(count >= 0))
137 sem->count = count;
138 spin_unlock_irqrestore(&sem->lock, flags);
139
140 return (count < 0);
141}
142EXPORT_SYMBOL(down_trylock);
143
144/**
145 * down_timeout - acquire the semaphore within a specified time
146 * @sem: the semaphore to be acquired
147 * @jiffies: how long to wait before failing
148 *
149 * Attempts to acquire the semaphore. If no more tasks are allowed to
150 * acquire the semaphore, calling this function will put the task to sleep.
151 * If the semaphore is not released within the specified number of jiffies,
152 * this function returns -ETIME. It returns 0 if the semaphore was acquired.
153 */
154int down_timeout(struct semaphore *sem, long jiffies)
155{
156 unsigned long flags;
157 int result = 0;
158
159 spin_lock_irqsave(&sem->lock, flags);
160 if (likely(sem->count > 0))
161 sem->count--;
162 else
163 result = __down_timeout(sem, jiffies);
164 spin_unlock_irqrestore(&sem->lock, flags);
165
166 return result;
167}
168EXPORT_SYMBOL(down_timeout);
169
170/**
171 * up - release the semaphore
172 * @sem: the semaphore to release
173 *
174 * Release the semaphore. Unlike mutexes, up() may be called from any
175 * context and even by tasks which have never called down().
176 */
177void up(struct semaphore *sem)
178{
179 unsigned long flags;
180
181 spin_lock_irqsave(&sem->lock, flags);
182 if (likely(list_empty(&sem->wait_list)))
183 sem->count++;
184 else
185 __up(sem);
186 spin_unlock_irqrestore(&sem->lock, flags);
187}
188EXPORT_SYMBOL(up);
189
190/* Functions for the contended case */
191
192struct semaphore_waiter {
193 struct list_head list;
194 struct task_struct *task;
195 int up;
196};
197
198/*
199 * Because this function is inlined, the 'state' parameter will be
200 * constant, and thus optimised away by the compiler. Likewise the
201 * 'timeout' parameter for the cases without timeouts.
202 */
203static inline int __sched __down_common(struct semaphore *sem, long state,
204 long timeout)
205{
206 struct task_struct *task = current;
207 struct semaphore_waiter waiter;
208
209 list_add_tail(&waiter.list, &sem->wait_list);
210 waiter.task = task;
211 waiter.up = 0;
212
213 for (;;) {
214 if (state == TASK_INTERRUPTIBLE && signal_pending(task))
215 goto interrupted;
216 if (state == TASK_KILLABLE && fatal_signal_pending(task))
217 goto interrupted;
218 if (timeout <= 0)
219 goto timed_out;
220 __set_task_state(task, state);
221 spin_unlock_irq(&sem->lock);
222 timeout = schedule_timeout(timeout);
223 spin_lock_irq(&sem->lock);
224 if (waiter.up)
225 return 0;
226 }
227
228 timed_out:
229 list_del(&waiter.list);
230 return -ETIME;
231
232 interrupted:
233 list_del(&waiter.list);
234 return -EINTR;
235}
236
237static noinline void __sched __down(struct semaphore *sem)
238{
239 __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
240}
241
242static noinline int __sched __down_interruptible(struct semaphore *sem)
243{
244 return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
245}
246
247static noinline int __sched __down_killable(struct semaphore *sem)
248{
249 return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
250}
251
252static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies)
253{
254 return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies);
255}
256
257static noinline void __sched __up(struct semaphore *sem)
258{
259 struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
260 struct semaphore_waiter, list);
261 list_del(&waiter->list);
262 waiter->up = 1;
263 wake_up_process(waiter->task);
264}
diff --git a/kernel/signal.c b/kernel/signal.c
index 6af1210092c3..64ad0ed15992 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -220,7 +220,7 @@ void flush_signals(struct task_struct *t)
220 unsigned long flags; 220 unsigned long flags;
221 221
222 spin_lock_irqsave(&t->sighand->siglock, flags); 222 spin_lock_irqsave(&t->sighand->siglock, flags);
223 clear_tsk_thread_flag(t,TIF_SIGPENDING); 223 clear_tsk_thread_flag(t, TIF_SIGPENDING);
224 flush_sigqueue(&t->pending); 224 flush_sigqueue(&t->pending);
225 flush_sigqueue(&t->signal->shared_pending); 225 flush_sigqueue(&t->signal->shared_pending);
226 spin_unlock_irqrestore(&t->sighand->siglock, flags); 226 spin_unlock_irqrestore(&t->sighand->siglock, flags);
@@ -424,7 +424,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
424 } 424 }
425 if (signr && 425 if (signr &&
426 ((info->si_code & __SI_MASK) == __SI_TIMER) && 426 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
427 info->si_sys_private){ 427 info->si_sys_private) {
428 /* 428 /*
429 * Release the siglock to ensure proper locking order 429 * Release the siglock to ensure proper locking order
430 * of timer locks outside of siglocks. Note, we leave 430 * of timer locks outside of siglocks. Note, we leave
@@ -1757,6 +1757,45 @@ static int do_signal_stop(int signr)
1757 return 1; 1757 return 1;
1758} 1758}
1759 1759
1760static int ptrace_signal(int signr, siginfo_t *info,
1761 struct pt_regs *regs, void *cookie)
1762{
1763 if (!(current->ptrace & PT_PTRACED))
1764 return signr;
1765
1766 ptrace_signal_deliver(regs, cookie);
1767
1768 /* Let the debugger run. */
1769 ptrace_stop(signr, 0, info);
1770
1771 /* We're back. Did the debugger cancel the sig? */
1772 signr = current->exit_code;
1773 if (signr == 0)
1774 return signr;
1775
1776 current->exit_code = 0;
1777
1778 /* Update the siginfo structure if the signal has
1779 changed. If the debugger wanted something
1780 specific in the siginfo structure then it should
1781 have updated *info via PTRACE_SETSIGINFO. */
1782 if (signr != info->si_signo) {
1783 info->si_signo = signr;
1784 info->si_errno = 0;
1785 info->si_code = SI_USER;
1786 info->si_pid = task_pid_vnr(current->parent);
1787 info->si_uid = current->parent->uid;
1788 }
1789
1790 /* If the (new) signal is now blocked, requeue it. */
1791 if (sigismember(&current->blocked, signr)) {
1792 specific_send_sig_info(signr, info, current);
1793 signr = 0;
1794 }
1795
1796 return signr;
1797}
1798
1760int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, 1799int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1761 struct pt_regs *regs, void *cookie) 1800 struct pt_regs *regs, void *cookie)
1762{ 1801{
@@ -1785,36 +1824,10 @@ relock:
1785 if (!signr) 1824 if (!signr)
1786 break; /* will return 0 */ 1825 break; /* will return 0 */
1787 1826
1788 if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { 1827 if (signr != SIGKILL) {
1789 ptrace_signal_deliver(regs, cookie); 1828 signr = ptrace_signal(signr, info, regs, cookie);
1790 1829 if (!signr)
1791 /* Let the debugger run. */
1792 ptrace_stop(signr, 0, info);
1793
1794 /* We're back. Did the debugger cancel the sig? */
1795 signr = current->exit_code;
1796 if (signr == 0)
1797 continue;
1798
1799 current->exit_code = 0;
1800
1801 /* Update the siginfo structure if the signal has
1802 changed. If the debugger wanted something
1803 specific in the siginfo structure then it should
1804 have updated *info via PTRACE_SETSIGINFO. */
1805 if (signr != info->si_signo) {
1806 info->si_signo = signr;
1807 info->si_errno = 0;
1808 info->si_code = SI_USER;
1809 info->si_pid = task_pid_vnr(current->parent);
1810 info->si_uid = current->parent->uid;
1811 }
1812
1813 /* If the (new) signal is now blocked, requeue it. */
1814 if (sigismember(&current->blocked, signr)) {
1815 specific_send_sig_info(signr, info, current);
1816 continue; 1830 continue;
1817 }
1818 } 1831 }
1819 1832
1820 ka = &current->sighand->action[signr-1]; 1833 ka = &current->sighand->action[signr-1];
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 31e9f2a47928..3c44956ee7e2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -356,7 +356,8 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
356/* Tasklets */ 356/* Tasklets */
357struct tasklet_head 357struct tasklet_head
358{ 358{
359 struct tasklet_struct *list; 359 struct tasklet_struct *head;
360 struct tasklet_struct **tail;
360}; 361};
361 362
362/* Some compilers disobey section attribute on statics when not 363/* Some compilers disobey section attribute on statics when not
@@ -369,8 +370,9 @@ void __tasklet_schedule(struct tasklet_struct *t)
369 unsigned long flags; 370 unsigned long flags;
370 371
371 local_irq_save(flags); 372 local_irq_save(flags);
372 t->next = __get_cpu_var(tasklet_vec).list; 373 t->next = NULL;
373 __get_cpu_var(tasklet_vec).list = t; 374 *__get_cpu_var(tasklet_vec).tail = t;
375 __get_cpu_var(tasklet_vec).tail = &(t->next);
374 raise_softirq_irqoff(TASKLET_SOFTIRQ); 376 raise_softirq_irqoff(TASKLET_SOFTIRQ);
375 local_irq_restore(flags); 377 local_irq_restore(flags);
376} 378}
@@ -382,8 +384,9 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
382 unsigned long flags; 384 unsigned long flags;
383 385
384 local_irq_save(flags); 386 local_irq_save(flags);
385 t->next = __get_cpu_var(tasklet_hi_vec).list; 387 t->next = NULL;
386 __get_cpu_var(tasklet_hi_vec).list = t; 388 *__get_cpu_var(tasklet_hi_vec).tail = t;
389 __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
387 raise_softirq_irqoff(HI_SOFTIRQ); 390 raise_softirq_irqoff(HI_SOFTIRQ);
388 local_irq_restore(flags); 391 local_irq_restore(flags);
389} 392}
@@ -395,8 +398,9 @@ static void tasklet_action(struct softirq_action *a)
395 struct tasklet_struct *list; 398 struct tasklet_struct *list;
396 399
397 local_irq_disable(); 400 local_irq_disable();
398 list = __get_cpu_var(tasklet_vec).list; 401 list = __get_cpu_var(tasklet_vec).head;
399 __get_cpu_var(tasklet_vec).list = NULL; 402 __get_cpu_var(tasklet_vec).head = NULL;
403 __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head;
400 local_irq_enable(); 404 local_irq_enable();
401 405
402 while (list) { 406 while (list) {
@@ -416,8 +420,9 @@ static void tasklet_action(struct softirq_action *a)
416 } 420 }
417 421
418 local_irq_disable(); 422 local_irq_disable();
419 t->next = __get_cpu_var(tasklet_vec).list; 423 t->next = NULL;
420 __get_cpu_var(tasklet_vec).list = t; 424 *__get_cpu_var(tasklet_vec).tail = t;
425 __get_cpu_var(tasklet_vec).tail = &(t->next);
421 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 426 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
422 local_irq_enable(); 427 local_irq_enable();
423 } 428 }
@@ -428,8 +433,9 @@ static void tasklet_hi_action(struct softirq_action *a)
428 struct tasklet_struct *list; 433 struct tasklet_struct *list;
429 434
430 local_irq_disable(); 435 local_irq_disable();
431 list = __get_cpu_var(tasklet_hi_vec).list; 436 list = __get_cpu_var(tasklet_hi_vec).head;
432 __get_cpu_var(tasklet_hi_vec).list = NULL; 437 __get_cpu_var(tasklet_hi_vec).head = NULL;
438 __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head;
433 local_irq_enable(); 439 local_irq_enable();
434 440
435 while (list) { 441 while (list) {
@@ -449,8 +455,9 @@ static void tasklet_hi_action(struct softirq_action *a)
449 } 455 }
450 456
451 local_irq_disable(); 457 local_irq_disable();
452 t->next = __get_cpu_var(tasklet_hi_vec).list; 458 t->next = NULL;
453 __get_cpu_var(tasklet_hi_vec).list = t; 459 *__get_cpu_var(tasklet_hi_vec).tail = t;
460 __get_cpu_var(tasklet_hi_vec).tail = &(t->next);
454 __raise_softirq_irqoff(HI_SOFTIRQ); 461 __raise_softirq_irqoff(HI_SOFTIRQ);
455 local_irq_enable(); 462 local_irq_enable();
456 } 463 }
@@ -487,6 +494,15 @@ EXPORT_SYMBOL(tasklet_kill);
487 494
488void __init softirq_init(void) 495void __init softirq_init(void)
489{ 496{
497 int cpu;
498
499 for_each_possible_cpu(cpu) {
500 per_cpu(tasklet_vec, cpu).tail =
501 &per_cpu(tasklet_vec, cpu).head;
502 per_cpu(tasklet_hi_vec, cpu).tail =
503 &per_cpu(tasklet_hi_vec, cpu).head;
504 }
505
490 open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); 506 open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
491 open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); 507 open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
492} 508}
@@ -555,9 +571,12 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
555 return; 571 return;
556 572
557 /* CPU is dead, so no lock needed. */ 573 /* CPU is dead, so no lock needed. */
558 for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) { 574 for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
559 if (*i == t) { 575 if (*i == t) {
560 *i = t->next; 576 *i = t->next;
577 /* If this was the tail element, move the tail ptr */
578 if (*i == NULL)
579 per_cpu(tasklet_vec, cpu).tail = i;
561 return; 580 return;
562 } 581 }
563 } 582 }
@@ -566,20 +585,20 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
566 585
567static void takeover_tasklets(unsigned int cpu) 586static void takeover_tasklets(unsigned int cpu)
568{ 587{
569 struct tasklet_struct **i;
570
571 /* CPU is dead, so no lock needed. */ 588 /* CPU is dead, so no lock needed. */
572 local_irq_disable(); 589 local_irq_disable();
573 590
574 /* Find end, append list for that CPU. */ 591 /* Find end, append list for that CPU. */
575 for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); 592 *__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).head;
576 *i = per_cpu(tasklet_vec, cpu).list; 593 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail;
577 per_cpu(tasklet_vec, cpu).list = NULL; 594 per_cpu(tasklet_vec, cpu).head = NULL;
595 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
578 raise_softirq_irqoff(TASKLET_SOFTIRQ); 596 raise_softirq_irqoff(TASKLET_SOFTIRQ);
579 597
580 for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next); 598 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
581 *i = per_cpu(tasklet_hi_vec, cpu).list; 599 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail;
582 per_cpu(tasklet_hi_vec, cpu).list = NULL; 600 per_cpu(tasklet_hi_vec, cpu).head = NULL;
601 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
583 raise_softirq_irqoff(HI_SOFTIRQ); 602 raise_softirq_irqoff(HI_SOFTIRQ);
584 603
585 local_irq_enable(); 604 local_irq_enable();
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 6f4e0e13f70c..0101aeef7ed7 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -11,7 +11,6 @@
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12 12
13#include <asm/atomic.h> 13#include <asm/atomic.h>
14#include <asm/semaphore.h>
15#include <asm/uaccess.h> 14#include <asm/uaccess.h>
16 15
17/* Since we effect priority and affinity (both of which are visible 16/* Since we effect priority and affinity (both of which are visible
@@ -35,7 +34,7 @@ static int stopmachine(void *cpu)
35 int irqs_disabled = 0; 34 int irqs_disabled = 0;
36 int prepared = 0; 35 int prepared = 0;
37 36
38 set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); 37 set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
39 38
40 /* Ack: we are alive */ 39 /* Ack: we are alive */
41 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ 40 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
@@ -135,8 +134,7 @@ static void restart_machine(void)
135 preempt_enable_no_resched(); 134 preempt_enable_no_resched();
136} 135}
137 136
138struct stop_machine_data 137struct stop_machine_data {
139{
140 int (*fn)(void *); 138 int (*fn)(void *);
141 void *data; 139 void *data;
142 struct completion done; 140 struct completion done;
diff --git a/kernel/sys.c b/kernel/sys.c
index a626116af5db..f2a451366953 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -67,6 +67,12 @@
67#ifndef SET_ENDIAN 67#ifndef SET_ENDIAN
68# define SET_ENDIAN(a,b) (-EINVAL) 68# define SET_ENDIAN(a,b) (-EINVAL)
69#endif 69#endif
70#ifndef GET_TSC_CTL
71# define GET_TSC_CTL(a) (-EINVAL)
72#endif
73#ifndef SET_TSC_CTL
74# define SET_TSC_CTL(a) (-EINVAL)
75#endif
70 76
71/* 77/*
72 * this is where the system-wide overflow UID and GID are defined, for 78 * this is where the system-wide overflow UID and GID are defined, for
@@ -1626,10 +1632,9 @@ asmlinkage long sys_umask(int mask)
1626asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 1632asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1627 unsigned long arg4, unsigned long arg5) 1633 unsigned long arg4, unsigned long arg5)
1628{ 1634{
1629 long error; 1635 long uninitialized_var(error);
1630 1636
1631 error = security_task_prctl(option, arg2, arg3, arg4, arg5); 1637 if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
1632 if (error)
1633 return error; 1638 return error;
1634 1639
1635 switch (option) { 1640 switch (option) {
@@ -1682,17 +1687,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1682 error = -EINVAL; 1687 error = -EINVAL;
1683 break; 1688 break;
1684 1689
1685 case PR_GET_KEEPCAPS:
1686 if (current->keep_capabilities)
1687 error = 1;
1688 break;
1689 case PR_SET_KEEPCAPS:
1690 if (arg2 != 0 && arg2 != 1) {
1691 error = -EINVAL;
1692 break;
1693 }
1694 current->keep_capabilities = arg2;
1695 break;
1696 case PR_SET_NAME: { 1690 case PR_SET_NAME: {
1697 struct task_struct *me = current; 1691 struct task_struct *me = current;
1698 unsigned char ncomm[sizeof(me->comm)]; 1692 unsigned char ncomm[sizeof(me->comm)];
@@ -1726,18 +1720,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1726 case PR_SET_SECCOMP: 1720 case PR_SET_SECCOMP:
1727 error = prctl_set_seccomp(arg2); 1721 error = prctl_set_seccomp(arg2);
1728 break; 1722 break;
1729 1723 case PR_GET_TSC:
1730 case PR_CAPBSET_READ: 1724 error = GET_TSC_CTL(arg2);
1731 if (!cap_valid(arg2)) 1725 break;
1732 return -EINVAL; 1726 case PR_SET_TSC:
1733 return !!cap_raised(current->cap_bset, arg2); 1727 error = SET_TSC_CTL(arg2);
1734 case PR_CAPBSET_DROP: 1728 break;
1735#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
1736 return cap_prctl_drop(arg2);
1737#else
1738 return -EINVAL;
1739#endif
1740
1741 default: 1729 default:
1742 error = -EINVAL; 1730 error = -EINVAL;
1743 break; 1731 break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2a2d6889bab..fd3364827ccf 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -270,17 +270,6 @@ static struct ctl_table kern_table[] = {
270 }, 270 },
271 { 271 {
272 .ctl_name = CTL_UNNUMBERED, 272 .ctl_name = CTL_UNNUMBERED,
273 .procname = "sched_batch_wakeup_granularity_ns",
274 .data = &sysctl_sched_batch_wakeup_granularity,
275 .maxlen = sizeof(unsigned int),
276 .mode = 0644,
277 .proc_handler = &proc_dointvec_minmax,
278 .strategy = &sysctl_intvec,
279 .extra1 = &min_wakeup_granularity_ns,
280 .extra2 = &max_wakeup_granularity_ns,
281 },
282 {
283 .ctl_name = CTL_UNNUMBERED,
284 .procname = "sched_child_runs_first", 273 .procname = "sched_child_runs_first",
285 .data = &sysctl_sched_child_runs_first, 274 .data = &sysctl_sched_child_runs_first,
286 .maxlen = sizeof(unsigned int), 275 .maxlen = sizeof(unsigned int),
@@ -318,7 +307,7 @@ static struct ctl_table kern_table[] = {
318 .data = &sysctl_sched_rt_period, 307 .data = &sysctl_sched_rt_period,
319 .maxlen = sizeof(unsigned int), 308 .maxlen = sizeof(unsigned int),
320 .mode = 0644, 309 .mode = 0644,
321 .proc_handler = &proc_dointvec, 310 .proc_handler = &sched_rt_handler,
322 }, 311 },
323 { 312 {
324 .ctl_name = CTL_UNNUMBERED, 313 .ctl_name = CTL_UNNUMBERED,
@@ -326,7 +315,7 @@ static struct ctl_table kern_table[] = {
326 .data = &sysctl_sched_rt_runtime, 315 .data = &sysctl_sched_rt_runtime,
327 .maxlen = sizeof(int), 316 .maxlen = sizeof(int),
328 .mode = 0644, 317 .mode = 0644,
329 .proc_handler = &proc_dointvec, 318 .proc_handler = &sched_rt_handler,
330 }, 319 },
331 { 320 {
332 .ctl_name = CTL_UNNUMBERED, 321 .ctl_name = CTL_UNNUMBERED,
diff --git a/kernel/time.c b/kernel/time.c
index a5ec013b6c80..35d373a98782 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -379,6 +379,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
379 ts->tv_sec = sec; 379 ts->tv_sec = sec;
380 ts->tv_nsec = nsec; 380 ts->tv_nsec = nsec;
381} 381}
382EXPORT_SYMBOL(set_normalized_timespec);
382 383
383/** 384/**
384 * ns_to_timespec - Convert nanoseconds to timespec 385 * ns_to_timespec - Convert nanoseconds to timespec
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 278534bbca95..73961f35fdc8 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -141,8 +141,16 @@ static void clocksource_watchdog(unsigned long data)
141 } 141 }
142 142
143 if (!list_empty(&watchdog_list)) { 143 if (!list_empty(&watchdog_list)) {
144 __mod_timer(&watchdog_timer, 144 /*
145 watchdog_timer.expires + WATCHDOG_INTERVAL); 145 * Cycle through CPUs to check if the CPUs stay
146 * synchronized to each other.
147 */
148 int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
149
150 if (next_cpu >= NR_CPUS)
151 next_cpu = first_cpu(cpu_online_map);
152 watchdog_timer.expires += WATCHDOG_INTERVAL;
153 add_timer_on(&watchdog_timer, next_cpu);
146 } 154 }
147 spin_unlock(&watchdog_lock); 155 spin_unlock(&watchdog_lock);
148} 156}
@@ -164,7 +172,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
164 if (!started && watchdog) { 172 if (!started && watchdog) {
165 watchdog_last = watchdog->read(); 173 watchdog_last = watchdog->read();
166 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 174 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
167 add_timer(&watchdog_timer); 175 add_timer_on(&watchdog_timer,
176 first_cpu(cpu_online_map));
168 } 177 }
169 } else { 178 } else {
170 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 179 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -174,7 +183,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
174 if (watchdog) 183 if (watchdog)
175 del_timer(&watchdog_timer); 184 del_timer(&watchdog_timer);
176 watchdog = cs; 185 watchdog = cs;
177 init_timer_deferrable(&watchdog_timer); 186 init_timer(&watchdog_timer);
178 watchdog_timer.function = clocksource_watchdog; 187 watchdog_timer.function = clocksource_watchdog;
179 188
180 /* Reset watchdog cycles */ 189 /* Reset watchdog cycles */
@@ -185,7 +194,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
185 watchdog_last = watchdog->read(); 194 watchdog_last = watchdog->read();
186 watchdog_timer.expires = 195 watchdog_timer.expires =
187 jiffies + WATCHDOG_INTERVAL; 196 jiffies + WATCHDOG_INTERVAL;
188 add_timer(&watchdog_timer); 197 add_timer_on(&watchdog_timer,
198 first_cpu(cpu_online_map));
189 } 199 }
190 } 200 }
191 } 201 }
@@ -222,6 +232,18 @@ void clocksource_resume(void)
222} 232}
223 233
224/** 234/**
235 * clocksource_touch_watchdog - Update watchdog
236 *
237 * Update the watchdog after exception contexts such as kgdb so as not
238 * to incorrectly trip the watchdog.
239 *
240 */
241void clocksource_touch_watchdog(void)
242{
243 clocksource_resume_watchdog();
244}
245
246/**
225 * clocksource_get_next - Returns the selected clocksource 247 * clocksource_get_next - Returns the selected clocksource
226 * 248 *
227 */ 249 */
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index e1bd50cbbf5d..57a1f02e5ec0 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -14,7 +14,7 @@
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
17#include <linux/irq.h> 17#include <linux/interrupt.h>
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
@@ -262,7 +262,7 @@ out:
262void tick_broadcast_on_off(unsigned long reason, int *oncpu) 262void tick_broadcast_on_off(unsigned long reason, int *oncpu)
263{ 263{
264 if (!cpu_isset(*oncpu, cpu_online_map)) 264 if (!cpu_isset(*oncpu, cpu_online_map))
265 printk(KERN_ERR "tick-braodcast: ignoring broadcast for " 265 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
266 "offline CPU #%d\n", *oncpu); 266 "offline CPU #%d\n", *oncpu);
267 else 267 else
268 smp_call_function_single(*oncpu, tick_do_broadcast_on_off, 268 smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 1bea399a9ef0..4f3886562b8c 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -14,12 +14,14 @@
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
17#include <linux/irq.h> 17#include <linux/interrupt.h>
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h> 21#include <linux/tick.h>
22 22
23#include <asm/irq_regs.h>
24
23#include "tick-internal.h" 25#include "tick-internal.h"
24 26
25/* 27/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0258d3115d54..450c04935b66 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -14,7 +14,7 @@
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
17#include <linux/irq.h> 17#include <linux/interrupt.h>
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 686da821d376..b854a895591e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -158,9 +158,8 @@ void tick_nohz_stop_idle(int cpu)
158 } 158 }
159} 159}
160 160
161static ktime_t tick_nohz_start_idle(int cpu) 161static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
162{ 162{
163 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
164 ktime_t now, delta; 163 ktime_t now, delta;
165 164
166 now = ktime_get(); 165 now = ktime_get();
@@ -192,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
192void tick_nohz_stop_sched_tick(void) 191void tick_nohz_stop_sched_tick(void)
193{ 192{
194 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 193 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
195 unsigned long rt_jiffies;
196 struct tick_sched *ts; 194 struct tick_sched *ts;
197 ktime_t last_update, expires, now; 195 ktime_t last_update, expires, now;
198 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 196 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -201,8 +199,8 @@ void tick_nohz_stop_sched_tick(void)
201 local_irq_save(flags); 199 local_irq_save(flags);
202 200
203 cpu = smp_processor_id(); 201 cpu = smp_processor_id();
204 now = tick_nohz_start_idle(cpu);
205 ts = &per_cpu(tick_cpu_sched, cpu); 202 ts = &per_cpu(tick_cpu_sched, cpu);
203 now = tick_nohz_start_idle(ts);
206 204
207 /* 205 /*
208 * If this cpu is offline and it is the one which updates 206 * If this cpu is offline and it is the one which updates
@@ -222,7 +220,6 @@ void tick_nohz_stop_sched_tick(void)
222 if (need_resched()) 220 if (need_resched())
223 goto end; 221 goto end;
224 222
225 cpu = smp_processor_id();
226 if (unlikely(local_softirq_pending())) { 223 if (unlikely(local_softirq_pending())) {
227 static int ratelimit; 224 static int ratelimit;
228 225
@@ -245,10 +242,6 @@ void tick_nohz_stop_sched_tick(void)
245 next_jiffies = get_next_timer_interrupt(last_jiffies); 242 next_jiffies = get_next_timer_interrupt(last_jiffies);
246 delta_jiffies = next_jiffies - last_jiffies; 243 delta_jiffies = next_jiffies - last_jiffies;
247 244
248 rt_jiffies = rt_needs_cpu(cpu);
249 if (rt_jiffies && rt_jiffies < delta_jiffies)
250 delta_jiffies = rt_jiffies;
251
252 if (rcu_needs_cpu(cpu)) 245 if (rcu_needs_cpu(cpu))
253 delta_jiffies = 1; 246 delta_jiffies = 1;
254 /* 247 /*
@@ -400,6 +393,7 @@ void tick_nohz_restart_sched_tick(void)
400 sub_preempt_count(HARDIRQ_OFFSET); 393 sub_preempt_count(HARDIRQ_OFFSET);
401 } 394 }
402 395
396 touch_softlockup_watchdog();
403 /* 397 /*
404 * Cancel the scheduled timer and restore the tick 398 * Cancel the scheduled timer and restore the tick
405 */ 399 */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index a3fa587c350c..2d6087c7cf98 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -178,6 +178,7 @@ static void change_clocksource(void)
178 if (clock == new) 178 if (clock == new)
179 return; 179 return;
180 180
181 new->cycle_last = 0;
181 now = clocksource_read(new); 182 now = clocksource_read(new);
182 nsec = __get_nsec_offset(); 183 nsec = __get_nsec_offset();
183 timespec_add_ns(&xtime, nsec); 184 timespec_add_ns(&xtime, nsec);
@@ -295,6 +296,7 @@ static int timekeeping_resume(struct sys_device *dev)
295 timespec_add_ns(&xtime, timekeeping_suspend_nsecs); 296 timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
296 update_xtime_cache(0); 297 update_xtime_cache(0);
297 /* re-base the last cycle value */ 298 /* re-base the last cycle value */
299 clock->cycle_last = 0;
298 clock->cycle_last = clocksource_read(clock); 300 clock->cycle_last = clocksource_read(clock);
299 clock->error = 0; 301 clock->error = 0;
300 timekeeping_suspended = 0; 302 timekeeping_suspended = 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index 99b00a25f88b..f3d35d4ea42e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -451,10 +451,18 @@ void add_timer_on(struct timer_list *timer, int cpu)
451 spin_lock_irqsave(&base->lock, flags); 451 spin_lock_irqsave(&base->lock, flags);
452 timer_set_base(timer, base); 452 timer_set_base(timer, base);
453 internal_add_timer(base, timer); 453 internal_add_timer(base, timer);
454 /*
455 * Check whether the other CPU is idle and needs to be
456 * triggered to reevaluate the timer wheel when nohz is
457 * active. We are protected against the other CPU fiddling
458 * with the timer by holding the timer base lock. This also
459 * makes sure that a CPU on the way to idle can not evaluate
460 * the timer wheel.
461 */
462 wake_up_idle_cpu(cpu);
454 spin_unlock_irqrestore(&base->lock, flags); 463 spin_unlock_irqrestore(&base->lock, flags);
455} 464}
456 465
457
458/** 466/**
459 * mod_timer - modify a timer's timeout 467 * mod_timer - modify a timer's timeout
460 * @timer: the timer to be modified 468 * @timer: the timer to be modified
@@ -1220,13 +1228,6 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1220 return 0; 1228 return 0;
1221} 1229}
1222 1230
1223/*
1224 * lockdep: we want to track each per-CPU base as a separate lock-class,
1225 * but timer-bases are kmalloc()-ed, so we need to attach separate
1226 * keys to them:
1227 */
1228static struct lock_class_key base_lock_keys[NR_CPUS];
1229
1230static int __cpuinit init_timers_cpu(int cpu) 1231static int __cpuinit init_timers_cpu(int cpu)
1231{ 1232{
1232 int j; 1233 int j;
@@ -1269,7 +1270,6 @@ static int __cpuinit init_timers_cpu(int cpu)
1269 } 1270 }
1270 1271
1271 spin_lock_init(&base->lock); 1272 spin_lock_init(&base->lock);
1272 lockdep_set_class(&base->lock, base_lock_keys + cpu);
1273 1273
1274 for (j = 0; j < TVN_SIZE; j++) { 1274 for (j = 0; j < TVN_SIZE; j++) {
1275 INIT_LIST_HEAD(base->tv5.vec + j); 1275 INIT_LIST_HEAD(base->tv5.vec + j);
@@ -1308,8 +1308,8 @@ static void __cpuinit migrate_timers(int cpu)
1308 new_base = get_cpu_var(tvec_bases); 1308 new_base = get_cpu_var(tvec_bases);
1309 1309
1310 local_irq_disable(); 1310 local_irq_disable();
1311 double_spin_lock(&new_base->lock, &old_base->lock, 1311 spin_lock(&new_base->lock);
1312 smp_processor_id() < cpu); 1312 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1313 1313
1314 BUG_ON(old_base->running_timer); 1314 BUG_ON(old_base->running_timer);
1315 1315
@@ -1322,8 +1322,8 @@ static void __cpuinit migrate_timers(int cpu)
1322 migrate_timer_list(new_base, old_base->tv5.vec + i); 1322 migrate_timer_list(new_base, old_base->tv5.vec + i);
1323 } 1323 }
1324 1324
1325 double_spin_unlock(&new_base->lock, &old_base->lock, 1325 spin_unlock(&old_base->lock);
1326 smp_processor_id() < cpu); 1326 spin_unlock(&new_base->lock);
1327 local_irq_enable(); 1327 local_irq_enable();
1328 put_cpu_var(tvec_bases); 1328 put_cpu_var(tvec_bases);
1329} 1329}
diff --git a/kernel/uid16.c b/kernel/uid16.c
index dd308ba4e03b..3e41c1673e2f 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -21,7 +21,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi
21{ 21{
22 long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); 22 long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
23 /* avoid REGPARM breakage on x86: */ 23 /* avoid REGPARM breakage on x86: */
24 prevent_tail_call(ret); 24 asmlinkage_protect(3, ret, filename, user, group);
25 return ret; 25 return ret;
26} 26}
27 27
@@ -29,7 +29,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g
29{ 29{
30 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); 30 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
31 /* avoid REGPARM breakage on x86: */ 31 /* avoid REGPARM breakage on x86: */
32 prevent_tail_call(ret); 32 asmlinkage_protect(3, ret, filename, user, group);
33 return ret; 33 return ret;
34} 34}
35 35
@@ -37,7 +37,7 @@ asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
37{ 37{
38 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); 38 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
39 /* avoid REGPARM breakage on x86: */ 39 /* avoid REGPARM breakage on x86: */
40 prevent_tail_call(ret); 40 asmlinkage_protect(3, ret, fd, user, group);
41 return ret; 41 return ret;
42} 42}
43 43
@@ -45,7 +45,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
45{ 45{
46 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); 46 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
47 /* avoid REGPARM breakage on x86: */ 47 /* avoid REGPARM breakage on x86: */
48 prevent_tail_call(ret); 48 asmlinkage_protect(2, ret, rgid, egid);
49 return ret; 49 return ret;
50} 50}
51 51
@@ -53,7 +53,7 @@ asmlinkage long sys_setgid16(old_gid_t gid)
53{ 53{
54 long ret = sys_setgid(low2highgid(gid)); 54 long ret = sys_setgid(low2highgid(gid));
55 /* avoid REGPARM breakage on x86: */ 55 /* avoid REGPARM breakage on x86: */
56 prevent_tail_call(ret); 56 asmlinkage_protect(1, ret, gid);
57 return ret; 57 return ret;
58} 58}
59 59
@@ -61,7 +61,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
61{ 61{
62 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); 62 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
63 /* avoid REGPARM breakage on x86: */ 63 /* avoid REGPARM breakage on x86: */
64 prevent_tail_call(ret); 64 asmlinkage_protect(2, ret, ruid, euid);
65 return ret; 65 return ret;
66} 66}
67 67
@@ -69,7 +69,7 @@ asmlinkage long sys_setuid16(old_uid_t uid)
69{ 69{
70 long ret = sys_setuid(low2highuid(uid)); 70 long ret = sys_setuid(low2highuid(uid));
71 /* avoid REGPARM breakage on x86: */ 71 /* avoid REGPARM breakage on x86: */
72 prevent_tail_call(ret); 72 asmlinkage_protect(1, ret, uid);
73 return ret; 73 return ret;
74} 74}
75 75
@@ -78,7 +78,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
78 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), 78 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
79 low2highuid(suid)); 79 low2highuid(suid));
80 /* avoid REGPARM breakage on x86: */ 80 /* avoid REGPARM breakage on x86: */
81 prevent_tail_call(ret); 81 asmlinkage_protect(3, ret, ruid, euid, suid);
82 return ret; 82 return ret;
83} 83}
84 84
@@ -98,7 +98,7 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
98 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), 98 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
99 low2highgid(sgid)); 99 low2highgid(sgid));
100 /* avoid REGPARM breakage on x86: */ 100 /* avoid REGPARM breakage on x86: */
101 prevent_tail_call(ret); 101 asmlinkage_protect(3, ret, rgid, egid, sgid);
102 return ret; 102 return ret;
103} 103}
104 104
@@ -117,7 +117,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid)
117{ 117{
118 long ret = sys_setfsuid(low2highuid(uid)); 118 long ret = sys_setfsuid(low2highuid(uid));
119 /* avoid REGPARM breakage on x86: */ 119 /* avoid REGPARM breakage on x86: */
120 prevent_tail_call(ret); 120 asmlinkage_protect(1, ret, uid);
121 return ret; 121 return ret;
122} 122}
123 123
@@ -125,7 +125,7 @@ asmlinkage long sys_setfsgid16(old_gid_t gid)
125{ 125{
126 long ret = sys_setfsgid(low2highgid(gid)); 126 long ret = sys_setfsgid(low2highgid(gid));
127 /* avoid REGPARM breakage on x86: */ 127 /* avoid REGPARM breakage on x86: */
128 prevent_tail_call(ret); 128 asmlinkage_protect(1, ret, gid);
129 return ret; 129 return ret;
130} 130}
131 131
diff --git a/kernel/user.c b/kernel/user.c
index 7132022a040c..debce602bfdd 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -101,7 +101,7 @@ static int sched_create_user(struct user_struct *up)
101{ 101{
102 int rc = 0; 102 int rc = 0;
103 103
104 up->tg = sched_create_group(); 104 up->tg = sched_create_group(&root_task_group);
105 if (IS_ERR(up->tg)) 105 if (IS_ERR(up->tg))
106 rc = -ENOMEM; 106 rc = -ENOMEM;
107 107
@@ -193,6 +193,33 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
193 193
194static struct kobj_attribute cpu_rt_runtime_attr = 194static struct kobj_attribute cpu_rt_runtime_attr =
195 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); 195 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
196
197static ssize_t cpu_rt_period_show(struct kobject *kobj,
198 struct kobj_attribute *attr,
199 char *buf)
200{
201 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
202
203 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
204}
205
206static ssize_t cpu_rt_period_store(struct kobject *kobj,
207 struct kobj_attribute *attr,
208 const char *buf, size_t size)
209{
210 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
211 unsigned long rt_period;
212 int rc;
213
214 sscanf(buf, "%lu", &rt_period);
215
216 rc = sched_group_set_rt_period(up->tg, rt_period);
217
218 return (rc ? rc : size);
219}
220
221static struct kobj_attribute cpu_rt_period_attr =
222 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
196#endif 223#endif
197 224
198/* default attributes per uid directory */ 225/* default attributes per uid directory */
@@ -202,6 +229,7 @@ static struct attribute *uids_attributes[] = {
202#endif 229#endif
203#ifdef CONFIG_RT_GROUP_SCHED 230#ifdef CONFIG_RT_GROUP_SCHED
204 &cpu_rt_runtime_attr.attr, 231 &cpu_rt_runtime_attr.attr,
232 &cpu_rt_period_attr.attr,
205#endif 233#endif
206 NULL 234 NULL
207}; 235};
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ff06611655af..00ff4d08e370 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -219,6 +219,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
219 struct timer_list *timer = &dwork->timer; 219 struct timer_list *timer = &dwork->timer;
220 struct work_struct *work = &dwork->work; 220 struct work_struct *work = &dwork->work;
221 221
222 timer_stats_timer_set_start_info(&dwork->timer);
222 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 223 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
223 BUG_ON(timer_pending(timer)); 224 BUG_ON(timer_pending(timer));
224 BUG_ON(!list_empty(&work->entry)); 225 BUG_ON(!list_empty(&work->entry));
@@ -580,6 +581,7 @@ EXPORT_SYMBOL(schedule_delayed_work);
580int schedule_delayed_work_on(int cpu, 581int schedule_delayed_work_on(int cpu,
581 struct delayed_work *dwork, unsigned long delay) 582 struct delayed_work *dwork, unsigned long delay)
582{ 583{
584 timer_stats_timer_set_start_info(&dwork->timer);
583 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); 585 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
584} 586}
585EXPORT_SYMBOL(schedule_delayed_work_on); 587EXPORT_SYMBOL(schedule_delayed_work_on);