author     Steve French <sfrench@us.ibm.com>    2007-07-18 20:38:57 -0400
committer  Steve French <sfrench@us.ibm.com>    2007-07-18 20:38:57 -0400
commit     1ff8392c32a2645d2665ca779ecb91bb29361c13 (patch)
tree       860b95e9a499ade4060848740fc6ce1fbb4e4e8d /kernel
parent     70b315b0dd3879cb3ab8aadffb14f10b2d19b9c3 (diff)
parent     5bae7ac9feba925fd0099057f6b23d7be80b7b41 (diff)

Merge branch 'master' of /pub/scm/linux/kernel/git/torvalds/linux-2.6

Conflicts:
        fs/cifs/export.c
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/audit.c | 97
-rw-r--r--  kernel/audit.h | 1
-rw-r--r--  kernel/auditfilter.c | 14
-rw-r--r--  kernel/auditsc.c | 5
-rw-r--r--  kernel/cpu.c | 16
-rw-r--r--  kernel/cpuset.c | 11
-rw-r--r--  kernel/delayacct.c | 10
-rw-r--r--  kernel/exit.c | 42
-rw-r--r--  kernel/fork.c | 15
-rw-r--r--  kernel/futex.c | 152
-rw-r--r--  kernel/hrtimer.c | 2
-rw-r--r--  kernel/irq/spurious.c | 12
-rw-r--r--  kernel/kallsyms.c | 27
-rw-r--r--  kernel/kfifo.c | 3
-rw-r--r--  kernel/kmod.c | 216
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/lockdep.c | 4
-rw-r--r--  kernel/module.c | 72
-rw-r--r--  kernel/nsproxy.c | 82
-rw-r--r--  kernel/panic.c | 5
-rw-r--r--  kernel/params.c | 1
-rw-r--r--  kernel/pid.c | 2
-rw-r--r--  kernel/posix-cpu-timers.c | 34
-rw-r--r--  kernel/power/main.c | 6
-rw-r--r--  kernel/printk.c | 55
-rw-r--r--  kernel/ptrace.c | 26
-rw-r--r--  kernel/rcutorture.c | 4
-rw-r--r--  kernel/relay.c | 218
-rw-r--r--  kernel/rtmutex-debug.c | 6
-rw-r--r--  kernel/rtmutex-tester.c | 1
-rw-r--r--  kernel/rtmutex.c | 6
-rw-r--r--  kernel/rtmutex_common.h | 9
-rw-r--r--  kernel/sched.c | 3081
-rw-r--r--  kernel/sched_debug.c | 275
-rw-r--r--  kernel/sched_fair.c | 1131
-rw-r--r--  kernel/sched_idletask.c | 71
-rw-r--r--  kernel/sched_rt.c | 255
-rw-r--r--  kernel/sched_stats.h | 235
-rw-r--r--  kernel/seccomp.c | 29
-rw-r--r--  kernel/signal.c | 33
-rw-r--r--  kernel/softirq.c | 10
-rw-r--r--  kernel/softlockup.c | 2
-rw-r--r--  kernel/stop_machine.c | 8
-rw-r--r--  kernel/sys.c | 71
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 145
-rw-r--r--  kernel/taskstats.c | 4
-rw-r--r--  kernel/time.c | 32
-rw-r--r--  kernel/time/clockevents.c | 41
-rw-r--r--  kernel/time/ntp.c | 4
-rw-r--r--  kernel/time/timekeeping.c | 37
-rw-r--r--  kernel/time/timer_list.c | 2
-rw-r--r--  kernel/time/timer_stats.c | 16
-rw-r--r--  kernel/timer.c | 19
-rw-r--r--  kernel/user.c | 18
-rw-r--r--  kernel/user_namespace.c | 87
-rw-r--r--  kernel/utsname.c | 12
-rw-r--r--  kernel/utsname_sysctl.c | 5
-rw-r--r--  kernel/workqueue.c | 60
60 files changed, 4432 insertions, 2414 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 642d4277c2..2a999836ca 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,11 +4,12 @@
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
6 exit.o itimer.o time.o softirq.o resource.o \ 6 exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o 11 hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
12 utsname.o
12 13
13obj-$(CONFIG_STACKTRACE) += stacktrace.o 14obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-y += time/ 15obj-y += time/
@@ -48,7 +49,6 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
48obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 49obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
49obj-$(CONFIG_RELAY) += relay.o 50obj-$(CONFIG_RELAY) += relay.o
50obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 51obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
51obj-$(CONFIG_UTS_NS) += utsname.o
52obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 52obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
53obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 53obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
54 54
diff --git a/kernel/audit.c b/kernel/audit.c
index d13276d414..eb0f9165b4 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -58,6 +58,7 @@
58#include <linux/selinux.h> 58#include <linux/selinux.h>
59#include <linux/inotify.h> 59#include <linux/inotify.h>
60#include <linux/freezer.h> 60#include <linux/freezer.h>
61#include <linux/tty.h>
61 62
62#include "audit.h" 63#include "audit.h"
63 64
@@ -391,6 +392,7 @@ static int kauditd_thread(void *dummy)
391{ 392{
392 struct sk_buff *skb; 393 struct sk_buff *skb;
393 394
395 set_freezable();
394 while (!kthread_should_stop()) { 396 while (!kthread_should_stop()) {
395 skb = skb_dequeue(&audit_skb_queue); 397 skb = skb_dequeue(&audit_skb_queue);
396 wake_up(&audit_backlog_wait); 398 wake_up(&audit_backlog_wait);
@@ -423,6 +425,31 @@ static int kauditd_thread(void *dummy)
423 return 0; 425 return 0;
424} 426}
425 427
428static int audit_prepare_user_tty(pid_t pid, uid_t loginuid)
429{
430 struct task_struct *tsk;
431 int err;
432
433 read_lock(&tasklist_lock);
434 tsk = find_task_by_pid(pid);
435 err = -ESRCH;
436 if (!tsk)
437 goto out;
438 err = 0;
439
440 spin_lock_irq(&tsk->sighand->siglock);
441 if (!tsk->signal->audit_tty)
442 err = -EPERM;
443 spin_unlock_irq(&tsk->sighand->siglock);
444 if (err)
445 goto out;
446
447 tty_audit_push_task(tsk, loginuid);
448out:
449 read_unlock(&tasklist_lock);
450 return err;
451}
452
426int audit_send_list(void *_dest) 453int audit_send_list(void *_dest)
427{ 454{
428 struct audit_netlink_list *dest = _dest; 455 struct audit_netlink_list *dest = _dest;
@@ -511,6 +538,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
511 case AUDIT_DEL: 538 case AUDIT_DEL:
512 case AUDIT_DEL_RULE: 539 case AUDIT_DEL_RULE:
513 case AUDIT_SIGNAL_INFO: 540 case AUDIT_SIGNAL_INFO:
541 case AUDIT_TTY_GET:
542 case AUDIT_TTY_SET:
514 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) 543 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
515 err = -EPERM; 544 err = -EPERM;
516 break; 545 break;
@@ -622,6 +651,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
622 err = audit_filter_user(&NETLINK_CB(skb), msg_type); 651 err = audit_filter_user(&NETLINK_CB(skb), msg_type);
623 if (err == 1) { 652 if (err == 1) {
624 err = 0; 653 err = 0;
654 if (msg_type == AUDIT_USER_TTY) {
655 err = audit_prepare_user_tty(pid, loginuid);
656 if (err)
657 break;
658 }
625 ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 659 ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
626 if (ab) { 660 if (ab) {
627 audit_log_format(ab, 661 audit_log_format(ab,
@@ -638,8 +672,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
638 " subj=%s", ctx); 672 " subj=%s", ctx);
639 kfree(ctx); 673 kfree(ctx);
640 } 674 }
641 audit_log_format(ab, " msg='%.1024s'", 675 if (msg_type != AUDIT_USER_TTY)
642 (char *)data); 676 audit_log_format(ab, " msg='%.1024s'",
677 (char *)data);
678 else {
679 int size;
680
681 audit_log_format(ab, " msg=");
682 size = nlmsg_len(nlh);
683 audit_log_n_untrustedstring(ab, size,
684 data);
685 }
643 audit_set_pid(ab, pid); 686 audit_set_pid(ab, pid);
644 audit_log_end(ab); 687 audit_log_end(ab);
645 } 688 }
@@ -730,6 +773,45 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
730 0, 0, sig_data, sizeof(*sig_data) + len); 773 0, 0, sig_data, sizeof(*sig_data) + len);
731 kfree(sig_data); 774 kfree(sig_data);
732 break; 775 break;
776 case AUDIT_TTY_GET: {
777 struct audit_tty_status s;
778 struct task_struct *tsk;
779
780 read_lock(&tasklist_lock);
781 tsk = find_task_by_pid(pid);
782 if (!tsk)
783 err = -ESRCH;
784 else {
785 spin_lock_irq(&tsk->sighand->siglock);
786 s.enabled = tsk->signal->audit_tty != 0;
787 spin_unlock_irq(&tsk->sighand->siglock);
788 }
789 read_unlock(&tasklist_lock);
790 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0,
791 &s, sizeof(s));
792 break;
793 }
794 case AUDIT_TTY_SET: {
795 struct audit_tty_status *s;
796 struct task_struct *tsk;
797
798 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
799 return -EINVAL;
800 s = data;
801 if (s->enabled != 0 && s->enabled != 1)
802 return -EINVAL;
803 read_lock(&tasklist_lock);
804 tsk = find_task_by_pid(pid);
805 if (!tsk)
806 err = -ESRCH;
807 else {
808 spin_lock_irq(&tsk->sighand->siglock);
809 tsk->signal->audit_tty = s->enabled != 0;
810 spin_unlock_irq(&tsk->sighand->siglock);
811 }
812 read_unlock(&tasklist_lock);
813 break;
814 }
733 default: 815 default:
734 err = -EINVAL; 816 err = -EINVAL;
735 break; 817 break;
@@ -1185,7 +1267,7 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1185} 1267}
1186 1268
1187/** 1269/**
1188 * audit_log_n_unstrustedstring - log a string that may contain random characters 1270 * audit_log_n_untrustedstring - log a string that may contain random characters
1189 * @ab: audit_buffer 1271 * @ab: audit_buffer
1190 * @len: lenth of string (not including trailing null) 1272 * @len: lenth of string (not including trailing null)
1191 * @string: string to be logged 1273 * @string: string to be logged
@@ -1201,25 +1283,24 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1201const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, 1283const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
1202 const char *string) 1284 const char *string)
1203{ 1285{
1204 const unsigned char *p = string; 1286 const unsigned char *p;
1205 1287
1206 while (*p) { 1288 for (p = string; p < (const unsigned char *)string + len && *p; p++) {
1207 if (*p == '"' || *p < 0x21 || *p > 0x7f) { 1289 if (*p == '"' || *p < 0x21 || *p > 0x7f) {
1208 audit_log_hex(ab, string, len); 1290 audit_log_hex(ab, string, len);
1209 return string + len + 1; 1291 return string + len + 1;
1210 } 1292 }
1211 p++;
1212 } 1293 }
1213 audit_log_n_string(ab, len, string); 1294 audit_log_n_string(ab, len, string);
1214 return p + 1; 1295 return p + 1;
1215} 1296}
1216 1297
1217/** 1298/**
1218 * audit_log_unstrustedstring - log a string that may contain random characters 1299 * audit_log_untrustedstring - log a string that may contain random characters
1219 * @ab: audit_buffer 1300 * @ab: audit_buffer
1220 * @string: string to be logged 1301 * @string: string to be logged
1221 * 1302 *
1222 * Same as audit_log_n_unstrustedstring(), except that strlen is used to 1303 * Same as audit_log_n_untrustedstring(), except that strlen is used to
1223 * determine string length. 1304 * determine string length.
1224 */ 1305 */
1225const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) 1306const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
diff --git a/kernel/audit.h b/kernel/audit.h
index 815d6f5c04..95877435c3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -115,7 +115,6 @@ extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
115extern void audit_send_reply(int pid, int seq, int type, 115extern void audit_send_reply(int pid, int seq, int type,
116 int done, int multi, 116 int done, int multi,
117 void *payload, int size); 117 void *payload, int size);
118extern void audit_log_lost(const char *message);
119extern void audit_panic(const char *message); 118extern void audit_panic(const char *message);
120 119
121struct audit_netlink_list { 120struct audit_netlink_list {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 74cc0fc6bb..1bf093dcff 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -947,7 +947,7 @@ static void audit_update_watch(struct audit_parent *parent,
947 947
948 /* If the update involves invalidating rules, do the inode-based 948 /* If the update involves invalidating rules, do the inode-based
949 * filtering now, so we don't omit records. */ 949 * filtering now, so we don't omit records. */
950 if (invalidating && 950 if (invalidating && current->audit_context &&
951 audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) 951 audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
952 audit_set_auditable(current->audit_context); 952 audit_set_auditable(current->audit_context);
953 953
@@ -1210,8 +1210,8 @@ static inline int audit_add_rule(struct audit_entry *entry,
1210 struct audit_entry *e; 1210 struct audit_entry *e;
1211 struct audit_field *inode_f = entry->rule.inode_f; 1211 struct audit_field *inode_f = entry->rule.inode_f;
1212 struct audit_watch *watch = entry->rule.watch; 1212 struct audit_watch *watch = entry->rule.watch;
1213 struct nameidata *ndp, *ndw; 1213 struct nameidata *ndp = NULL, *ndw = NULL;
1214 int h, err, putnd_needed = 0; 1214 int h, err;
1215#ifdef CONFIG_AUDITSYSCALL 1215#ifdef CONFIG_AUDITSYSCALL
1216 int dont_count = 0; 1216 int dont_count = 0;
1217 1217
@@ -1239,7 +1239,6 @@ static inline int audit_add_rule(struct audit_entry *entry,
1239 err = audit_get_nd(watch->path, &ndp, &ndw); 1239 err = audit_get_nd(watch->path, &ndp, &ndw);
1240 if (err) 1240 if (err)
1241 goto error; 1241 goto error;
1242 putnd_needed = 1;
1243 } 1242 }
1244 1243
1245 mutex_lock(&audit_filter_mutex); 1244 mutex_lock(&audit_filter_mutex);
@@ -1269,14 +1268,11 @@ static inline int audit_add_rule(struct audit_entry *entry,
1269#endif 1268#endif
1270 mutex_unlock(&audit_filter_mutex); 1269 mutex_unlock(&audit_filter_mutex);
1271 1270
1272 if (putnd_needed) 1271 audit_put_nd(ndp, ndw); /* NULL args OK */
1273 audit_put_nd(ndp, ndw);
1274
1275 return 0; 1272 return 0;
1276 1273
1277error: 1274error:
1278 if (putnd_needed) 1275 audit_put_nd(ndp, ndw); /* NULL args OK */
1279 audit_put_nd(ndp, ndw);
1280 if (watch) 1276 if (watch)
1281 audit_put_watch(watch); /* tmp watch, matches initial get */ 1277 audit_put_watch(watch); /* tmp watch, matches initial get */
1282 return err; 1278 return err;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e36481ed61..b7640a5f38 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -71,9 +71,6 @@
71 71
72extern struct list_head audit_filter_list[]; 72extern struct list_head audit_filter_list[];
73 73
74/* No syscall auditing will take place unless audit_enabled != 0. */
75extern int audit_enabled;
76
77/* AUDIT_NAMES is the number of slots we reserve in the audit_context 74/* AUDIT_NAMES is the number of slots we reserve in the audit_context
78 * for saving names from getname(). */ 75 * for saving names from getname(). */
79#define AUDIT_NAMES 20 76#define AUDIT_NAMES 20
@@ -2040,7 +2037,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2040 2037
2041/** 2038/**
2042 * audit_core_dumps - record information about processes that end abnormally 2039 * audit_core_dumps - record information about processes that end abnormally
2043 * @sig: signal value 2040 * @signr: signal value
2044 * 2041 *
2045 * If a process ends with a core dump, something fishy is going on and we 2042 * If a process ends with a core dump, something fishy is going on and we
2046 * should record the event for investigation. 2043 * should record the event for investigation.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 208cf3497c..181ae70860 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -103,11 +103,19 @@ static inline void check_for_tasks(int cpu)
103 write_unlock_irq(&tasklist_lock); 103 write_unlock_irq(&tasklist_lock);
104} 104}
105 105
106struct take_cpu_down_param {
107 unsigned long mod;
108 void *hcpu;
109};
110
106/* Take this CPU down. */ 111/* Take this CPU down. */
107static int take_cpu_down(void *unused) 112static int take_cpu_down(void *_param)
108{ 113{
114 struct take_cpu_down_param *param = _param;
109 int err; 115 int err;
110 116
117 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
118 param->hcpu);
111 /* Ensure this CPU doesn't handle any more interrupts. */ 119 /* Ensure this CPU doesn't handle any more interrupts. */
112 err = __cpu_disable(); 120 err = __cpu_disable();
113 if (err < 0) 121 if (err < 0)
@@ -127,6 +135,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
127 cpumask_t old_allowed, tmp; 135 cpumask_t old_allowed, tmp;
128 void *hcpu = (void *)(long)cpu; 136 void *hcpu = (void *)(long)cpu;
129 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 137 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
138 struct take_cpu_down_param tcd_param = {
139 .mod = mod,
140 .hcpu = hcpu,
141 };
130 142
131 if (num_online_cpus() == 1) 143 if (num_online_cpus() == 1)
132 return -EBUSY; 144 return -EBUSY;
@@ -153,7 +165,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
153 set_cpus_allowed(current, tmp); 165 set_cpus_allowed(current, tmp);
154 166
155 mutex_lock(&cpu_bitmask_lock); 167 mutex_lock(&cpu_bitmask_lock);
156 p = __stop_machine_run(take_cpu_down, NULL, cpu); 168 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
157 mutex_unlock(&cpu_bitmask_lock); 169 mutex_unlock(&cpu_bitmask_lock);
158 170
159 if (IS_ERR(p) || cpu_online(cpu)) { 171 if (IS_ERR(p) || cpu_online(cpu)) {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4c49188cc4..57e6448b17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf)
516 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 516 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
517 envp[i] = NULL; 517 envp[i] = NULL;
518 518
519 call_usermodehelper(argv[0], argv, envp, 0); 519 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
520 kfree(pathbuf); 520 kfree(pathbuf);
521} 521}
522 522
@@ -981,10 +981,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
981 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); 981 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
982 if (!mmarray) 982 if (!mmarray)
983 goto done; 983 goto done;
984 write_lock_irq(&tasklist_lock); /* block fork */ 984 read_lock(&tasklist_lock); /* block fork */
985 if (atomic_read(&cs->count) <= ntasks) 985 if (atomic_read(&cs->count) <= ntasks)
986 break; /* got enough */ 986 break; /* got enough */
987 write_unlock_irq(&tasklist_lock); /* try again */ 987 read_unlock(&tasklist_lock); /* try again */
988 kfree(mmarray); 988 kfree(mmarray);
989 } 989 }
990 990
@@ -1006,7 +1006,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1006 continue; 1006 continue;
1007 mmarray[n++] = mm; 1007 mmarray[n++] = mm;
1008 } while_each_thread(g, p); 1008 } while_each_thread(g, p);
1009 write_unlock_irq(&tasklist_lock); 1009 read_unlock(&tasklist_lock);
1010 1010
1011 /* 1011 /*
1012 * Now that we've dropped the tasklist spinlock, we can 1012 * Now that we've dropped the tasklist spinlock, we can
@@ -2138,6 +2138,9 @@ static void common_cpu_mem_hotplug_unplug(void)
2138static int cpuset_handle_cpuhp(struct notifier_block *nb, 2138static int cpuset_handle_cpuhp(struct notifier_block *nb,
2139 unsigned long phase, void *cpu) 2139 unsigned long phase, void *cpu)
2140{ 2140{
2141 if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
2142 return NOTIFY_DONE;
2143
2141 common_cpu_mem_hotplug_unplug(); 2144 common_cpu_mem_hotplug_unplug();
2142 return 0; 2145 return 0;
2143} 2146}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index c0148ae992..81e6978296 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -99,9 +99,10 @@ void __delayacct_blkio_end(void)
99int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) 99int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
100{ 100{
101 s64 tmp; 101 s64 tmp;
102 struct timespec ts; 102 unsigned long t1;
103 unsigned long t1,t2,t3; 103 unsigned long long t2, t3;
104 unsigned long flags; 104 unsigned long flags;
105 struct timespec ts;
105 106
106 /* Though tsk->delays accessed later, early exit avoids 107 /* Though tsk->delays accessed later, early exit avoids
107 * unnecessary returning of other data 108 * unnecessary returning of other data
@@ -124,11 +125,10 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
124 125
125 d->cpu_count += t1; 126 d->cpu_count += t1;
126 127
127 jiffies_to_timespec(t2, &ts); 128 tmp = (s64)d->cpu_delay_total + t2;
128 tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
129 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; 129 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
130 130
131 tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; 131 tmp = (s64)d->cpu_run_virtual_total + t3;
132 d->cpu_run_virtual_total = 132 d->cpu_run_virtual_total =
133 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; 133 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
134 134
diff --git a/kernel/exit.c b/kernel/exit.c
index 5c8ecbaa19..e8af8d0c24 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,6 +31,7 @@
31#include <linux/mempolicy.h> 31#include <linux/mempolicy.h>
32#include <linux/taskstats_kern.h> 32#include <linux/taskstats_kern.h>
33#include <linux/delayacct.h> 33#include <linux/delayacct.h>
34#include <linux/freezer.h>
34#include <linux/cpuset.h> 35#include <linux/cpuset.h>
35#include <linux/syscalls.h> 36#include <linux/syscalls.h>
36#include <linux/signal.h> 37#include <linux/signal.h>
@@ -122,9 +123,9 @@ static void __exit_signal(struct task_struct *tsk)
122 sig->maj_flt += tsk->maj_flt; 123 sig->maj_flt += tsk->maj_flt;
123 sig->nvcsw += tsk->nvcsw; 124 sig->nvcsw += tsk->nvcsw;
124 sig->nivcsw += tsk->nivcsw; 125 sig->nivcsw += tsk->nivcsw;
125 sig->sched_time += tsk->sched_time;
126 sig->inblock += task_io_get_inblock(tsk); 126 sig->inblock += task_io_get_inblock(tsk);
127 sig->oublock += task_io_get_oublock(tsk); 127 sig->oublock += task_io_get_oublock(tsk);
128 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
128 sig = NULL; /* Marker for below. */ 129 sig = NULL; /* Marker for below. */
129 } 130 }
130 131
@@ -182,7 +183,6 @@ repeat:
182 zap_leader = (leader->exit_signal == -1); 183 zap_leader = (leader->exit_signal == -1);
183 } 184 }
184 185
185 sched_exit(p);
186 write_unlock_irq(&tasklist_lock); 186 write_unlock_irq(&tasklist_lock);
187 proc_flush_task(p); 187 proc_flush_task(p);
188 release_thread(p); 188 release_thread(p);
@@ -291,7 +291,7 @@ static void reparent_to_kthreadd(void)
291 /* Set the exit signal to SIGCHLD so we signal init on exit */ 291 /* Set the exit signal to SIGCHLD so we signal init on exit */
292 current->exit_signal = SIGCHLD; 292 current->exit_signal = SIGCHLD;
293 293
294 if (!has_rt_policy(current) && (task_nice(current) < 0)) 294 if (task_nice(current) < 0)
295 set_user_nice(current, 0); 295 set_user_nice(current, 0);
296 /* cpus_allowed? */ 296 /* cpus_allowed? */
297 /* rt_priority? */ 297 /* rt_priority? */
@@ -388,6 +388,11 @@ void daemonize(const char *name, ...)
388 * they would be locked into memory. 388 * they would be locked into memory.
389 */ 389 */
390 exit_mm(current); 390 exit_mm(current);
391 /*
392 * We don't want to have TIF_FREEZE set if the system-wide hibernation
393 * or suspend transition begins right now.
394 */
395 current->flags |= PF_NOFREEZE;
391 396
392 set_special_pids(1, 1); 397 set_special_pids(1, 1);
393 proc_clear_tty(current); 398 proc_clear_tty(current);
@@ -859,6 +864,34 @@ static void exit_notify(struct task_struct *tsk)
859 release_task(tsk); 864 release_task(tsk);
860} 865}
861 866
867#ifdef CONFIG_DEBUG_STACK_USAGE
868static void check_stack_usage(void)
869{
870 static DEFINE_SPINLOCK(low_water_lock);
871 static int lowest_to_date = THREAD_SIZE;
872 unsigned long *n = end_of_stack(current);
873 unsigned long free;
874
875 while (*n == 0)
876 n++;
877 free = (unsigned long)n - (unsigned long)end_of_stack(current);
878
879 if (free >= lowest_to_date)
880 return;
881
882 spin_lock(&low_water_lock);
883 if (free < lowest_to_date) {
884 printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
885 "left\n",
886 current->comm, free);
887 lowest_to_date = free;
888 }
889 spin_unlock(&low_water_lock);
890}
891#else
892static inline void check_stack_usage(void) {}
893#endif
894
862fastcall NORET_TYPE void do_exit(long code) 895fastcall NORET_TYPE void do_exit(long code)
863{ 896{
864 struct task_struct *tsk = current; 897 struct task_struct *tsk = current;
@@ -938,6 +971,8 @@ fastcall NORET_TYPE void do_exit(long code)
938 if (unlikely(tsk->compat_robust_list)) 971 if (unlikely(tsk->compat_robust_list))
939 compat_exit_robust_list(tsk); 972 compat_exit_robust_list(tsk);
940#endif 973#endif
974 if (group_dead)
975 tty_audit_exit();
941 if (unlikely(tsk->audit_context)) 976 if (unlikely(tsk->audit_context))
942 audit_free(tsk); 977 audit_free(tsk);
943 978
@@ -950,6 +985,7 @@ fastcall NORET_TYPE void do_exit(long code)
950 exit_sem(tsk); 985 exit_sem(tsk);
951 __exit_files(tsk); 986 __exit_files(tsk);
952 __exit_fs(tsk); 987 __exit_fs(tsk);
988 check_stack_usage();
953 exit_thread(); 989 exit_thread();
954 cpuset_exit(tsk); 990 cpuset_exit(tsk);
955 exit_keys(tsk); 991 exit_keys(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 73ad5cda1b..ba39bdb2a7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@
49#include <linux/delayacct.h> 49#include <linux/delayacct.h>
50#include <linux/taskstats_kern.h> 50#include <linux/taskstats_kern.h>
51#include <linux/random.h> 51#include <linux/random.h>
52#include <linux/tty.h>
52 53
53#include <asm/pgtable.h> 54#include <asm/pgtable.h>
54#include <asm/pgalloc.h> 55#include <asm/pgalloc.h>
@@ -877,7 +878,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
877 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 878 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
878 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 879 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
879 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 880 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
880 sig->sched_time = 0; 881 sig->sum_sched_runtime = 0;
881 INIT_LIST_HEAD(&sig->cpu_timers[0]); 882 INIT_LIST_HEAD(&sig->cpu_timers[0]);
882 INIT_LIST_HEAD(&sig->cpu_timers[1]); 883 INIT_LIST_HEAD(&sig->cpu_timers[1]);
883 INIT_LIST_HEAD(&sig->cpu_timers[2]); 884 INIT_LIST_HEAD(&sig->cpu_timers[2]);
@@ -897,6 +898,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
897 } 898 }
898 acct_init_pacct(&sig->pacct); 899 acct_init_pacct(&sig->pacct);
899 900
901 tty_audit_fork(sig);
902
900 return 0; 903 return 0;
901} 904}
902 905
@@ -920,7 +923,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
920{ 923{
921 unsigned long new_flags = p->flags; 924 unsigned long new_flags = p->flags;
922 925
923 new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); 926 new_flags &= ~PF_SUPERPRIV;
924 new_flags |= PF_FORKNOEXEC; 927 new_flags |= PF_FORKNOEXEC;
925 if (!(clone_flags & CLONE_PTRACE)) 928 if (!(clone_flags & CLONE_PTRACE))
926 p->ptrace = 0; 929 p->ptrace = 0;
@@ -999,7 +1002,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
999 if (atomic_read(&p->user->processes) >= 1002 if (atomic_read(&p->user->processes) >=
1000 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1003 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
1001 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1004 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1002 p->user != &root_user) 1005 p->user != current->nsproxy->user_ns->root_user)
1003 goto bad_fork_free; 1006 goto bad_fork_free;
1004 } 1007 }
1005 1008
@@ -1040,7 +1043,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1040 1043
1041 p->utime = cputime_zero; 1044 p->utime = cputime_zero;
1042 p->stime = cputime_zero; 1045 p->stime = cputime_zero;
1043 p->sched_time = 0; 1046
1044#ifdef CONFIG_TASK_XACCT 1047#ifdef CONFIG_TASK_XACCT
1045 p->rchar = 0; /* I/O counter: bytes read */ 1048 p->rchar = 0; /* I/O counter: bytes read */
1046 p->wchar = 0; /* I/O counter: bytes written */ 1049 p->wchar = 0; /* I/O counter: bytes written */
@@ -1059,6 +1062,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1059 1062
1060 p->lock_depth = -1; /* -1 = no lock */ 1063 p->lock_depth = -1; /* -1 = no lock */
1061 do_posix_clock_monotonic_gettime(&p->start_time); 1064 do_posix_clock_monotonic_gettime(&p->start_time);
1065 p->real_start_time = p->start_time;
1066 monotonic_to_bootbased(&p->real_start_time);
1062 p->security = NULL; 1067 p->security = NULL;
1063 p->io_context = NULL; 1068 p->io_context = NULL;
1064 p->io_wait = NULL; 1069 p->io_wait = NULL;
@@ -1601,7 +1606,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1601 err = -EINVAL; 1606 err = -EINVAL;
1602 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1607 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1603 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1608 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1604 CLONE_NEWUTS|CLONE_NEWIPC)) 1609 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER))
1605 goto bad_unshare_out; 1610 goto bad_unshare_out;
1606 1611
1607 if ((err = unshare_thread(unshare_flags))) 1612 if ((err = unshare_thread(unshare_flags)))
diff --git a/kernel/futex.c b/kernel/futex.c
index df248f5e08..5c3f45d07c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -121,6 +121,24 @@ static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
121static struct vfsmount *futex_mnt; 121static struct vfsmount *futex_mnt;
122 122
123/* 123/*
124 * Take mm->mmap_sem, when futex is shared
125 */
126static inline void futex_lock_mm(struct rw_semaphore *fshared)
127{
128 if (fshared)
129 down_read(fshared);
130}
131
132/*
133 * Release mm->mmap_sem, when the futex is shared
134 */
135static inline void futex_unlock_mm(struct rw_semaphore *fshared)
136{
137 if (fshared)
138 up_read(fshared);
139}
140
141/*
124 * We hash on the keys returned from get_futex_key (see below). 142 * We hash on the keys returned from get_futex_key (see below).
125 */ 143 */
126static struct futex_hash_bucket *hash_futex(union futex_key *key) 144static struct futex_hash_bucket *hash_futex(union futex_key *key)
@@ -287,7 +305,18 @@ void drop_futex_key_refs(union futex_key *key)
287} 305}
288EXPORT_SYMBOL_GPL(drop_futex_key_refs); 306EXPORT_SYMBOL_GPL(drop_futex_key_refs);
289 307
290static inline int get_futex_value_locked(u32 *dest, u32 __user *from) 308static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
309{
310 u32 curval;
311
312 pagefault_disable();
313 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
314 pagefault_enable();
315
316 return curval;
317}
318
319static int get_futex_value_locked(u32 *dest, u32 __user *from)
291{ 320{
292 int ret; 321 int ret;
293 322
@@ -409,14 +438,12 @@ static struct task_struct * futex_find_get_task(pid_t pid)
409 438
410 rcu_read_lock(); 439 rcu_read_lock();
411 p = find_task_by_pid(pid); 440 p = find_task_by_pid(pid);
412 if (!p) 441
413 goto out_unlock; 442 if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
414 if ((current->euid != p->euid) && (current->euid != p->uid)) { 443 p = ERR_PTR(-ESRCH);
415 p = NULL; 444 else
416 goto out_unlock; 445 get_task_struct(p);
417 } 446
418 get_task_struct(p);
419out_unlock:
420 rcu_read_unlock(); 447 rcu_read_unlock();
421 448
422 return p; 449 return p;
@@ -622,9 +649,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
622 649
623 newval = FUTEX_WAITERS | new_owner->pid; 650 newval = FUTEX_WAITERS | new_owner->pid;
624 651
625 pagefault_disable(); 652 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
626 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
627 pagefault_enable();
628 653
629 if (curval == -EFAULT) 654 if (curval == -EFAULT)
630 ret = -EFAULT; 655 ret = -EFAULT;
@@ -661,9 +686,7 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
661 * There is no waiter, so we unlock the futex. The owner died 686 * There is no waiter, so we unlock the futex. The owner died
662 * bit has not to be preserved here. We are the owner: 687 * bit has not to be preserved here. We are the owner:
663 */ 688 */
664 pagefault_disable(); 689 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
665 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
666 pagefault_enable();
667 690
668 if (oldval == -EFAULT) 691 if (oldval == -EFAULT)
669 return oldval; 692 return oldval;
@@ -702,8 +725,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
702 union futex_key key; 725 union futex_key key;
703 int ret; 726 int ret;
704 727
705 if (fshared) 728 futex_lock_mm(fshared);
706 down_read(fshared);
707 729
708 ret = get_futex_key(uaddr, fshared, &key); 730 ret = get_futex_key(uaddr, fshared, &key);
709 if (unlikely(ret != 0)) 731 if (unlikely(ret != 0))
@@ -727,8 +749,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
727 749
728 spin_unlock(&hb->lock); 750 spin_unlock(&hb->lock);
729out: 751out:
730 if (fshared) 752 futex_unlock_mm(fshared);
731 up_read(fshared);
732 return ret; 753 return ret;
733} 754}
734 755
@@ -748,8 +769,7 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
748 int ret, op_ret, attempt = 0; 769 int ret, op_ret, attempt = 0;
749 770
750retryfull: 771retryfull:
751 if (fshared) 772 futex_lock_mm(fshared);
752 down_read(fshared);
753 773
754 ret = get_futex_key(uaddr1, fshared, &key1); 774 ret = get_futex_key(uaddr1, fshared, &key1);
755 if (unlikely(ret != 0)) 775 if (unlikely(ret != 0))
@@ -795,7 +815,7 @@ retry:
795 */ 815 */
796 if (attempt++) { 816 if (attempt++) {
797 ret = futex_handle_fault((unsigned long)uaddr2, 817 ret = futex_handle_fault((unsigned long)uaddr2,
798 fshared, attempt); 818 fshared, attempt);
799 if (ret) 819 if (ret)
800 goto out; 820 goto out;
801 goto retry; 821 goto retry;
@@ -805,8 +825,7 @@ retry:
805 * If we would have faulted, release mmap_sem, 825 * If we would have faulted, release mmap_sem,
806 * fault it in and start all over again. 826 * fault it in and start all over again.
807 */ 827 */
808 if (fshared) 828 futex_unlock_mm(fshared);
809 up_read(fshared);
810 829
811 ret = get_user(dummy, uaddr2); 830 ret = get_user(dummy, uaddr2);
812 if (ret) 831 if (ret)
@@ -843,8 +862,8 @@ retry:
843 if (hb1 != hb2) 862 if (hb1 != hb2)
844 spin_unlock(&hb2->lock); 863 spin_unlock(&hb2->lock);
845out: 864out:
846 if (fshared) 865 futex_unlock_mm(fshared);
847 up_read(fshared); 866
848 return ret; 867 return ret;
849} 868}
850 869
@@ -863,8 +882,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
863 int ret, drop_count = 0; 882 int ret, drop_count = 0;
864 883
865 retry: 884 retry:
866 if (fshared) 885 futex_lock_mm(fshared);
867 down_read(fshared);
868 886
869 ret = get_futex_key(uaddr1, fshared, &key1); 887 ret = get_futex_key(uaddr1, fshared, &key1);
870 if (unlikely(ret != 0)) 888 if (unlikely(ret != 0))
@@ -892,8 +910,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
892 * If we would have faulted, release mmap_sem, fault 910 * If we would have faulted, release mmap_sem, fault
893 * it in and start all over again. 911 * it in and start all over again.
894 */ 912 */
895 if (fshared) 913 futex_unlock_mm(fshared);
896 up_read(fshared);
897 914
898 ret = get_user(curval, uaddr1); 915 ret = get_user(curval, uaddr1);
899 916
@@ -946,8 +963,7 @@ out_unlock:
946 drop_futex_key_refs(&key1); 963 drop_futex_key_refs(&key1);
947 964
948out: 965out:
949 if (fshared) 966 futex_unlock_mm(fshared);
950 up_read(fshared);
951 return ret; 967 return ret;
952} 968}
953 969
@@ -1115,10 +1131,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1115 while (!ret) { 1131 while (!ret) {
1116 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1132 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1117 1133
1118 pagefault_disable(); 1134 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1119 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1120 uval, newval);
1121 pagefault_enable();
1122 1135
1123 if (curval == -EFAULT) 1136 if (curval == -EFAULT)
1124 ret = -EFAULT; 1137 ret = -EFAULT;
@@ -1136,6 +1149,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1136#define ARG3_SHARED 1 1149#define ARG3_SHARED 1
1137 1150
1138static long futex_wait_restart(struct restart_block *restart); 1151static long futex_wait_restart(struct restart_block *restart);
1152
1139static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1153static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1140 u32 val, ktime_t *abs_time) 1154 u32 val, ktime_t *abs_time)
1141{ 1155{
@@ -1150,8 +1164,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1150 1164
1151 q.pi_state = NULL; 1165 q.pi_state = NULL;
1152 retry: 1166 retry:
1153 if (fshared) 1167 futex_lock_mm(fshared);
1154 down_read(fshared);
1155 1168
1156 ret = get_futex_key(uaddr, fshared, &q.key); 1169 ret = get_futex_key(uaddr, fshared, &q.key);
1157 if (unlikely(ret != 0)) 1170 if (unlikely(ret != 0))
@@ -1188,8 +1201,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1188 * If we would have faulted, release mmap_sem, fault it in and 1201 * If we would have faulted, release mmap_sem, fault it in and
1189 * start all over again. 1202 * start all over again.
1190 */ 1203 */
1191 if (fshared) 1204 futex_unlock_mm(fshared);
1192 up_read(fshared);
1193 1205
1194 ret = get_user(uval, uaddr); 1206 ret = get_user(uval, uaddr);
1195 1207
@@ -1208,8 +1220,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1208 * Now the futex is queued and we have checked the data, we 1220 * Now the futex is queued and we have checked the data, we
1209 * don't want to hold mmap_sem while we sleep. 1221 * don't want to hold mmap_sem while we sleep.
1210 */ 1222 */
1211 if (fshared) 1223 futex_unlock_mm(fshared);
1212 up_read(fshared);
1213 1224
1214 /* 1225 /*
1215 * There might have been scheduling since the queue_me(), as we 1226 * There might have been scheduling since the queue_me(), as we
@@ -1287,8 +1298,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1287 queue_unlock(&q, hb); 1298 queue_unlock(&q, hb);
1288 1299
1289 out_release_sem: 1300 out_release_sem:
1290 if (fshared) 1301 futex_unlock_mm(fshared);
1291 up_read(fshared);
1292 return ret; 1302 return ret;
1293} 1303}
1294 1304
@@ -1335,8 +1345,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1335 1345
1336 q.pi_state = NULL; 1346 q.pi_state = NULL;
1337 retry: 1347 retry:
1338 if (fshared) 1348 futex_lock_mm(fshared);
1339 down_read(fshared);
1340 1349
1341 ret = get_futex_key(uaddr, fshared, &q.key); 1350 ret = get_futex_key(uaddr, fshared, &q.key);
1342 if (unlikely(ret != 0)) 1351 if (unlikely(ret != 0))
@@ -1355,9 +1364,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1355 */ 1364 */
1356 newval = current->pid; 1365 newval = current->pid;
1357 1366
1358 pagefault_disable(); 1367 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1359 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1360 pagefault_enable();
1361 1368
1362 if (unlikely(curval == -EFAULT)) 1369 if (unlikely(curval == -EFAULT))
1363 goto uaddr_faulted; 1370 goto uaddr_faulted;
@@ -1400,9 +1407,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1400 lock_taken = 1; 1407 lock_taken = 1;
1401 } 1408 }
1402 1409
1403 pagefault_disable(); 1410 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1404 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1405 pagefault_enable();
1406 1411
1407 if (unlikely(curval == -EFAULT)) 1412 if (unlikely(curval == -EFAULT))
1408 goto uaddr_faulted; 1413 goto uaddr_faulted;
@@ -1430,8 +1435,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1430 * exit to complete. 1435 * exit to complete.
1431 */ 1436 */
1432 queue_unlock(&q, hb); 1437 queue_unlock(&q, hb);
1433 if (fshared) 1438 futex_unlock_mm(fshared);
1434 up_read(fshared);
1435 cond_resched(); 1439 cond_resched();
1436 goto retry; 1440 goto retry;
1437 1441
@@ -1467,8 +1471,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1467 * Now the futex is queued and we have checked the data, we 1471 * Now the futex is queued and we have checked the data, we
1468 * don't want to hold mmap_sem while we sleep. 1472 * don't want to hold mmap_sem while we sleep.
1469 */ 1473 */
1470 if (fshared) 1474 futex_unlock_mm(fshared);
1471 up_read(fshared);
1472 1475
1473 WARN_ON(!q.pi_state); 1476 WARN_ON(!q.pi_state);
1474 /* 1477 /*
@@ -1482,8 +1485,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1482 ret = ret ? 0 : -EWOULDBLOCK; 1485 ret = ret ? 0 : -EWOULDBLOCK;
1483 } 1486 }
1484 1487
1485 if (fshared) 1488 futex_lock_mm(fshared);
1486 down_read(fshared);
1487 spin_lock(q.lock_ptr); 1489 spin_lock(q.lock_ptr);
1488 1490
1489 if (!ret) { 1491 if (!ret) {
@@ -1520,8 +1522,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1520 1522
1521 /* Unqueue and drop the lock */ 1523 /* Unqueue and drop the lock */
1522 unqueue_me_pi(&q); 1524 unqueue_me_pi(&q);
1523 if (fshared) 1525 futex_unlock_mm(fshared);
1524 up_read(fshared);
1525 1526
1526 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1527 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1527 1528
@@ -1529,8 +1530,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1529 queue_unlock(&q, hb); 1530 queue_unlock(&q, hb);
1530 1531
1531 out_release_sem: 1532 out_release_sem:
1532 if (fshared) 1533 futex_unlock_mm(fshared);
1533 up_read(fshared);
1534 return ret; 1534 return ret;
1535 1535
1536 uaddr_faulted: 1536 uaddr_faulted:
@@ -1552,8 +1552,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1552 goto retry_unlocked; 1552 goto retry_unlocked;
1553 } 1553 }
1554 1554
1555 if (fshared) 1555 futex_unlock_mm(fshared);
1556 up_read(fshared);
1557 1556
1558 ret = get_user(uval, uaddr); 1557 ret = get_user(uval, uaddr);
1559 if (!ret && (uval != -EFAULT)) 1558 if (!ret && (uval != -EFAULT))
@@ -1587,8 +1586,7 @@ retry:
1587 /* 1586 /*
1588 * First take all the futex related locks: 1587 * First take all the futex related locks:
1589 */ 1588 */
1590 if (fshared) 1589 futex_lock_mm(fshared);
1591 down_read(fshared);
1592 1590
1593 ret = get_futex_key(uaddr, fshared, &key); 1591 ret = get_futex_key(uaddr, fshared, &key);
1594 if (unlikely(ret != 0)) 1592 if (unlikely(ret != 0))
@@ -1603,11 +1601,9 @@ retry_unlocked:
1603 * again. If it succeeds then we can return without waking 1601 * again. If it succeeds then we can return without waking
1604 * anyone else up: 1602 * anyone else up:
1605 */ 1603 */
1606 if (!(uval & FUTEX_OWNER_DIED)) { 1604 if (!(uval & FUTEX_OWNER_DIED))
1607 pagefault_disable(); 1605 uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0);
1608 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); 1606
1609 pagefault_enable();
1610 }
1611 1607
1612 if (unlikely(uval == -EFAULT)) 1608 if (unlikely(uval == -EFAULT))
1613 goto pi_faulted; 1609 goto pi_faulted;
@@ -1649,8 +1645,7 @@ retry_unlocked:
1649out_unlock: 1645out_unlock:
1650 spin_unlock(&hb->lock); 1646 spin_unlock(&hb->lock);
1651out: 1647out:
1652 if (fshared) 1648 futex_unlock_mm(fshared);
1653 up_read(fshared);
1654 1649
1655 return ret; 1650 return ret;
1656 1651
@@ -1673,8 +1668,7 @@ pi_faulted:
1673 goto retry_unlocked; 1668 goto retry_unlocked;
1674 } 1669 }
1675 1670
1676 if (fshared) 1671 futex_unlock_mm(fshared);
1677 up_read(fshared);
1678 1672
1679 ret = get_user(uval, uaddr); 1673 ret = get_user(uval, uaddr);
1680 if (!ret && (uval != -EFAULT)) 1674 if (!ret && (uval != -EFAULT))
@@ -1731,8 +1725,8 @@ static int futex_fd(u32 __user *uaddr, int signal)
1731 1725
1732 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { 1726 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
1733 printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " 1727 printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
1734 "will be removed from the kernel in June 2007\n", 1728 "will be removed from the kernel in June 2007\n",
1735 current->comm); 1729 current->comm);
1736 } 1730 }
1737 1731
1738 ret = -EINVAL; 1732 ret = -EINVAL;
@@ -1910,10 +1904,8 @@ retry:
1910 * Wake robust non-PI futexes here. The wakeup of 1904 * Wake robust non-PI futexes here. The wakeup of
1911 * PI futexes happens in exit_pi_state(): 1905 * PI futexes happens in exit_pi_state():
1912 */ 1906 */
1913 if (!pi) { 1907 if (!pi && (uval & FUTEX_WAITERS))
1914 if (uval & FUTEX_WAITERS)
1915 futex_wake(uaddr, &curr->mm->mmap_sem, 1); 1908 futex_wake(uaddr, &curr->mm->mmap_sem, 1);
1916 }
1917 } 1909 }
1918 return 0; 1910 return 0;
1919} 1911}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 23c03f43e1..72d034258b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1406,7 +1406,7 @@ static void migrate_hrtimers(int cpu)
1406static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, 1406static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1407 unsigned long action, void *hcpu) 1407 unsigned long action, void *hcpu)
1408{ 1408{
1409 long cpu = (long)hcpu; 1409 unsigned int cpu = (long)hcpu;
1410 1410
1411 switch (action) { 1411 switch (action) {
1412 1412
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index bd9e272d55..32b161972f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -172,7 +172,17 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
172 irqreturn_t action_ret) 172 irqreturn_t action_ret)
173{ 173{
174 if (unlikely(action_ret != IRQ_HANDLED)) { 174 if (unlikely(action_ret != IRQ_HANDLED)) {
175 desc->irqs_unhandled++; 175 /*
176 * If we are seeing only the odd spurious IRQ caused by
177 * bus asynchronicity then don't eventually trigger an error,
178 * otherwise the couter becomes a doomsday timer for otherwise
179 * working systems
180 */
181 if (jiffies - desc->last_unhandled > HZ/10)
182 desc->irqs_unhandled = 1;
183 else
184 desc->irqs_unhandled++;
185 desc->last_unhandled = jiffies;
176 if (unlikely(action_ret != IRQ_NONE)) 186 if (unlikely(action_ret != IRQ_NONE))
177 report_bad_irq(irq, desc, action_ret); 187 report_bad_irq(irq, desc, action_ret);
178 } 188 }
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fed5441862..474219a419 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -152,7 +152,7 @@ static unsigned int get_symbol_offset(unsigned long pos)
152/* Lookup the address for this symbol. Returns 0 if not found. */ 152/* Lookup the address for this symbol. Returns 0 if not found. */
153unsigned long kallsyms_lookup_name(const char *name) 153unsigned long kallsyms_lookup_name(const char *name)
154{ 154{
155 char namebuf[KSYM_NAME_LEN+1]; 155 char namebuf[KSYM_NAME_LEN];
156 unsigned long i; 156 unsigned long i;
157 unsigned int off; 157 unsigned int off;
158 158
@@ -248,7 +248,7 @@ const char *kallsyms_lookup(unsigned long addr,
248{ 248{
249 const char *msym; 249 const char *msym;
250 250
251 namebuf[KSYM_NAME_LEN] = 0; 251 namebuf[KSYM_NAME_LEN - 1] = 0;
252 namebuf[0] = 0; 252 namebuf[0] = 0;
253 253
254 if (is_ksym_addr(addr)) { 254 if (is_ksym_addr(addr)) {
@@ -265,7 +265,7 @@ const char *kallsyms_lookup(unsigned long addr,
265 /* see if it's in a module */ 265 /* see if it's in a module */
266 msym = module_address_lookup(addr, symbolsize, offset, modname); 266 msym = module_address_lookup(addr, symbolsize, offset, modname);
267 if (msym) 267 if (msym)
268 return strncpy(namebuf, msym, KSYM_NAME_LEN); 268 return strncpy(namebuf, msym, KSYM_NAME_LEN - 1);
269 269
270 return NULL; 270 return NULL;
271} 271}
@@ -273,7 +273,7 @@ const char *kallsyms_lookup(unsigned long addr,
273int lookup_symbol_name(unsigned long addr, char *symname) 273int lookup_symbol_name(unsigned long addr, char *symname)
274{ 274{
275 symname[0] = '\0'; 275 symname[0] = '\0';
276 symname[KSYM_NAME_LEN] = '\0'; 276 symname[KSYM_NAME_LEN - 1] = '\0';
277 277
278 if (is_ksym_addr(addr)) { 278 if (is_ksym_addr(addr)) {
279 unsigned long pos; 279 unsigned long pos;
@@ -291,7 +291,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
291 unsigned long *offset, char *modname, char *name) 291 unsigned long *offset, char *modname, char *name)
292{ 292{
293 name[0] = '\0'; 293 name[0] = '\0';
294 name[KSYM_NAME_LEN] = '\0'; 294 name[KSYM_NAME_LEN - 1] = '\0';
295 295
296 if (is_ksym_addr(addr)) { 296 if (is_ksym_addr(addr)) {
297 unsigned long pos; 297 unsigned long pos;
@@ -312,18 +312,17 @@ int sprint_symbol(char *buffer, unsigned long address)
312 char *modname; 312 char *modname;
313 const char *name; 313 const char *name;
314 unsigned long offset, size; 314 unsigned long offset, size;
315 char namebuf[KSYM_NAME_LEN+1]; 315 char namebuf[KSYM_NAME_LEN];
316 316
317 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); 317 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
318 if (!name) 318 if (!name)
319 return sprintf(buffer, "0x%lx", address); 319 return sprintf(buffer, "0x%lx", address);
320 else { 320
321 if (modname) 321 if (modname)
322 return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, 322 return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
323 size, modname); 323 size, modname);
324 else 324 else
325 return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); 325 return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
326 }
327} 326}
328 327
329/* Look up a kernel symbol and print it to the kernel messages. */ 328/* Look up a kernel symbol and print it to the kernel messages. */
@@ -343,8 +342,8 @@ struct kallsym_iter
343 unsigned long value; 342 unsigned long value;
344 unsigned int nameoff; /* If iterating in core kernel symbols */ 343 unsigned int nameoff; /* If iterating in core kernel symbols */
345 char type; 344 char type;
346 char name[KSYM_NAME_LEN+1]; 345 char name[KSYM_NAME_LEN];
347 char module_name[MODULE_NAME_LEN + 1]; 346 char module_name[MODULE_NAME_LEN];
348 int exported; 347 int exported;
349}; 348};
350 349
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index cee419143f..bc41ad0f24 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -24,6 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/kfifo.h> 26#include <linux/kfifo.h>
27#include <linux/log2.h>
27 28
28/** 29/**
29 * kfifo_init - allocates a new FIFO using a preallocated buffer 30 * kfifo_init - allocates a new FIFO using a preallocated buffer
@@ -41,7 +42,7 @@ struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
41 struct kfifo *fifo; 42 struct kfifo *fifo;
42 43
43 /* size must be a power of 2 */ 44 /* size must be a power of 2 */
44 BUG_ON(size & (size - 1)); 45 BUG_ON(!is_power_of_2(size));
45 46
46 fifo = kmalloc(sizeof(struct kfifo), gfp_mask); 47 fifo = kmalloc(sizeof(struct kfifo), gfp_mask);
47 if (!fifo) 48 if (!fifo)
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 4d32eb0771..78d365c524 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -119,9 +119,10 @@ struct subprocess_info {
119 char **argv; 119 char **argv;
120 char **envp; 120 char **envp;
121 struct key *ring; 121 struct key *ring;
122 int wait; 122 enum umh_wait wait;
123 int retval; 123 int retval;
124 struct file *stdin; 124 struct file *stdin;
125 void (*cleanup)(char **argv, char **envp);
125}; 126};
126 127
127/* 128/*
@@ -180,6 +181,14 @@ static int ____call_usermodehelper(void *data)
180 do_exit(0); 181 do_exit(0);
181} 182}
182 183
184void call_usermodehelper_freeinfo(struct subprocess_info *info)
185{
186 if (info->cleanup)
187 (*info->cleanup)(info->argv, info->envp);
188 kfree(info);
189}
190EXPORT_SYMBOL(call_usermodehelper_freeinfo);
191
183/* Keventd can't block, but this (a child) can. */ 192/* Keventd can't block, but this (a child) can. */
184static int wait_for_helper(void *data) 193static int wait_for_helper(void *data)
185{ 194{
@@ -216,8 +225,8 @@ static int wait_for_helper(void *data)
216 sub_info->retval = ret; 225 sub_info->retval = ret;
217 } 226 }
218 227
219 if (sub_info->wait < 0) 228 if (sub_info->wait == UMH_NO_WAIT)
220 kfree(sub_info); 229 call_usermodehelper_freeinfo(sub_info);
221 else 230 else
222 complete(sub_info->complete); 231 complete(sub_info->complete);
223 return 0; 232 return 0;
@@ -229,34 +238,122 @@ static void __call_usermodehelper(struct work_struct *work)
229 struct subprocess_info *sub_info = 238 struct subprocess_info *sub_info =
230 container_of(work, struct subprocess_info, work); 239 container_of(work, struct subprocess_info, work);
231 pid_t pid; 240 pid_t pid;
232 int wait = sub_info->wait; 241 enum umh_wait wait = sub_info->wait;
233 242
234 /* CLONE_VFORK: wait until the usermode helper has execve'd 243 /* CLONE_VFORK: wait until the usermode helper has execve'd
235 * successfully We need the data structures to stay around 244 * successfully We need the data structures to stay around
236 * until that is done. */ 245 * until that is done. */
237 if (wait) 246 if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT)
238 pid = kernel_thread(wait_for_helper, sub_info, 247 pid = kernel_thread(wait_for_helper, sub_info,
239 CLONE_FS | CLONE_FILES | SIGCHLD); 248 CLONE_FS | CLONE_FILES | SIGCHLD);
240 else 249 else
241 pid = kernel_thread(____call_usermodehelper, sub_info, 250 pid = kernel_thread(____call_usermodehelper, sub_info,
242 CLONE_VFORK | SIGCHLD); 251 CLONE_VFORK | SIGCHLD);
243 252
244 if (wait < 0) 253 switch (wait) {
245 return; 254 case UMH_NO_WAIT:
255 break;
246 256
247 if (pid < 0) { 257 case UMH_WAIT_PROC:
258 if (pid > 0)
259 break;
248 sub_info->retval = pid; 260 sub_info->retval = pid;
261 /* FALLTHROUGH */
262
263 case UMH_WAIT_EXEC:
249 complete(sub_info->complete); 264 complete(sub_info->complete);
250 } else if (!wait) 265 }
251 complete(sub_info->complete); 266}
267
268/**
269 * call_usermodehelper_setup - prepare to call a usermode helper
270 * @path - path to usermode executable
271 * @argv - arg vector for process
272 * @envp - environment for process
273 *
274 * Returns either NULL on allocation failure, or a subprocess_info
275 * structure. This should be passed to call_usermodehelper_exec to
276 * exec the process and free the structure.
277 */
278struct subprocess_info *call_usermodehelper_setup(char *path,
279 char **argv, char **envp)
280{
281 struct subprocess_info *sub_info;
282 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
283 if (!sub_info)
284 goto out;
285
286 INIT_WORK(&sub_info->work, __call_usermodehelper);
287 sub_info->path = path;
288 sub_info->argv = argv;
289 sub_info->envp = envp;
290
291 out:
292 return sub_info;
252} 293}
294EXPORT_SYMBOL(call_usermodehelper_setup);
253 295
254/** 296/**
255 * call_usermodehelper_keys - start a usermode application 297 * call_usermodehelper_setkeys - set the session keys for usermode helper
256 * @path: pathname for the application 298 * @info: a subprocess_info returned by call_usermodehelper_setup
257 * @argv: null-terminated argument list 299 * @session_keyring: the session keyring for the process
258 * @envp: null-terminated environment list 300 */
259 * @session_keyring: session keyring for process (NULL for an empty keyring) 301void call_usermodehelper_setkeys(struct subprocess_info *info,
302 struct key *session_keyring)
303{
304 info->ring = session_keyring;
305}
306EXPORT_SYMBOL(call_usermodehelper_setkeys);
307
308/**
309 * call_usermodehelper_setcleanup - set a cleanup function
310 * @info: a subprocess_info returned by call_usermodehelper_setup
311 * @cleanup: a cleanup function
312 *
313 * The cleanup function is just befor ethe subprocess_info is about to
314 * be freed. This can be used for freeing the argv and envp. The
315 * Function must be runnable in either a process context or the
316 * context in which call_usermodehelper_exec is called.
317 */
318void call_usermodehelper_setcleanup(struct subprocess_info *info,
319 void (*cleanup)(char **argv, char **envp))
320{
321 info->cleanup = cleanup;
322}
323EXPORT_SYMBOL(call_usermodehelper_setcleanup);
324
325/**
326 * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
327 * @sub_info: a subprocess_info returned by call_usermodehelper_setup
328 * @filp: set to the write-end of a pipe
329 *
330 * This constructs a pipe, and sets the read end to be the stdin of the
331 * subprocess, and returns the write-end in *@filp.
332 */
333int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
334 struct file **filp)
335{
336 struct file *f;
337
338 f = create_write_pipe();
339 if (IS_ERR(f))
340 return PTR_ERR(f);
341 *filp = f;
342
343 f = create_read_pipe(f);
344 if (IS_ERR(f)) {
345 free_write_pipe(*filp);
346 return PTR_ERR(f);
347 }
348 sub_info->stdin = f;
349
350 return 0;
351}
352EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
353
354/**
355 * call_usermodehelper_exec - start a usermode application
356 * @sub_info: information about the subprocess
260 * @wait: wait for the application to finish and return status. 357 * @wait: wait for the application to finish and return status.
261 * when -1 don't wait at all, but you get no useful error back when 358 * when -1 don't wait at all, but you get no useful error back when
262 * the program couldn't be exec'ed. This makes it safe to call 359 * the program couldn't be exec'ed. This makes it safe to call
@@ -265,81 +362,68 @@ static void __call_usermodehelper(struct work_struct *work)
265 * Runs a user-space application. The application is started 362 * Runs a user-space application. The application is started
266 * asynchronously if wait is not set, and runs as a child of keventd. 363 * asynchronously if wait is not set, and runs as a child of keventd.
267 * (ie. it runs with full root capabilities). 364 * (ie. it runs with full root capabilities).
268 *
269 * Must be called from process context. Returns a negative error code
270 * if program was not execed successfully, or 0.
271 */ 365 */
272int call_usermodehelper_keys(char *path, char **argv, char **envp, 366int call_usermodehelper_exec(struct subprocess_info *sub_info,
273 struct key *session_keyring, int wait) 367 enum umh_wait wait)
274{ 368{
275 DECLARE_COMPLETION_ONSTACK(done); 369 DECLARE_COMPLETION_ONSTACK(done);
276 struct subprocess_info *sub_info;
277 int retval; 370 int retval;
278 371
279 if (!khelper_wq) 372 if (sub_info->path[0] == '\0') {
280 return -EBUSY; 373 retval = 0;
281 374 goto out;
282 if (path[0] == '\0') 375 }
283 return 0;
284 376
285 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); 377 if (!khelper_wq) {
286 if (!sub_info) 378 retval = -EBUSY;
287 return -ENOMEM; 379 goto out;
380 }
288 381
289 INIT_WORK(&sub_info->work, __call_usermodehelper);
290 sub_info->complete = &done; 382 sub_info->complete = &done;
291 sub_info->path = path;
292 sub_info->argv = argv;
293 sub_info->envp = envp;
294 sub_info->ring = session_keyring;
295 sub_info->wait = wait; 383 sub_info->wait = wait;
296 384
297 queue_work(khelper_wq, &sub_info->work); 385 queue_work(khelper_wq, &sub_info->work);
298 if (wait < 0) /* task has freed sub_info */ 386 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
299 return 0; 387 return 0;
300 wait_for_completion(&done); 388 wait_for_completion(&done);
301 retval = sub_info->retval; 389 retval = sub_info->retval;
302 kfree(sub_info); 390
391 out:
392 call_usermodehelper_freeinfo(sub_info);
303 return retval; 393 return retval;
304} 394}
305EXPORT_SYMBOL(call_usermodehelper_keys); 395EXPORT_SYMBOL(call_usermodehelper_exec);
306 396
397/**
398 * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
399 * @path: path to usermode executable
400 * @argv: arg vector for process
401 * @envp: environment for process
402 * @filp: set to the write-end of a pipe
403 *
404 * This is a simple wrapper which executes a usermode-helper function
405 * with a pipe as stdin. It is implemented entirely in terms of
406 * lower-level call_usermodehelper_* functions.
407 */
307int call_usermodehelper_pipe(char *path, char **argv, char **envp, 408int call_usermodehelper_pipe(char *path, char **argv, char **envp,
308 struct file **filp) 409 struct file **filp)
309{ 410{
310 DECLARE_COMPLETION(done); 411 struct subprocess_info *sub_info;
311 struct subprocess_info sub_info = { 412 int ret;
312 .work = __WORK_INITIALIZER(sub_info.work,
313 __call_usermodehelper),
314 .complete = &done,
315 .path = path,
316 .argv = argv,
317 .envp = envp,
318 .retval = 0,
319 };
320 struct file *f;
321 413
322 if (!khelper_wq) 414 sub_info = call_usermodehelper_setup(path, argv, envp);
323 return -EBUSY; 415 if (sub_info == NULL)
416 return -ENOMEM;
324 417
325 if (path[0] == '\0') 418 ret = call_usermodehelper_stdinpipe(sub_info, filp);
326 return 0; 419 if (ret < 0)
420 goto out;
327 421
328 f = create_write_pipe(); 422 return call_usermodehelper_exec(sub_info, 1);
329 if (IS_ERR(f))
330 return PTR_ERR(f);
331 *filp = f;
332
333 f = create_read_pipe(f);
334 if (IS_ERR(f)) {
335 free_write_pipe(*filp);
336 return PTR_ERR(f);
337 }
338 sub_info.stdin = f;
339 423
340 queue_work(khelper_wq, &sub_info.work); 424 out:
341 wait_for_completion(&done); 425 call_usermodehelper_freeinfo(sub_info);
342 return sub_info.retval; 426 return ret;
343} 427}
344EXPORT_SYMBOL(call_usermodehelper_pipe); 428EXPORT_SYMBOL(call_usermodehelper_pipe);
345 429
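The hunks above split the old all-in-one helper into a small builder API: call_usermodehelper_setup() allocates the subprocess_info, the optional setkeys/setcleanup/stdinpipe calls decorate it, and call_usermodehelper_exec() queues it on khelper_wq and releases it. A minimal caller could look like the sketch below; the helper path, argv and envp values are illustrative and not part of this patch.

    static int run_example_helper(void)
    {
            struct subprocess_info *info;
            char *argv[] = { "/sbin/example-helper", "add", NULL };  /* hypothetical helper */
            char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

            info = call_usermodehelper_setup(argv[0], argv, envp);
            if (info == NULL)
                    return -ENOMEM;

            /*
             * UMH_WAIT_EXEC: return once the helper has been exec'ed.
             * call_usermodehelper_exec() releases info on every path
             * (via call_usermodehelper_freeinfo()), so no kfree() here.
             */
            return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
    }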
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bbd51b81a3..a404f7ee73 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -215,7 +215,7 @@ int kthread_stop(struct task_struct *k)
215EXPORT_SYMBOL(kthread_stop); 215EXPORT_SYMBOL(kthread_stop);
216 216
217 217
218static __init void kthreadd_setup(void) 218static noinline __init_refok void kthreadd_setup(void)
219{ 219{
220 struct task_struct *tsk = current; 220 struct task_struct *tsk = current;
221 221
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1a5ff2211d..edba2ffb43 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -379,7 +379,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
379 379
380static void print_lock_name(struct lock_class *class) 380static void print_lock_name(struct lock_class *class)
381{ 381{
382 char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4; 382 char str[KSYM_NAME_LEN], c1, c2, c3, c4;
383 const char *name; 383 const char *name;
384 384
385 get_usage_chars(class, &c1, &c2, &c3, &c4); 385 get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -401,7 +401,7 @@ static void print_lock_name(struct lock_class *class)
401static void print_lockdep_cache(struct lockdep_map *lock) 401static void print_lockdep_cache(struct lockdep_map *lock)
402{ 402{
403 const char *name; 403 const char *name;
404 char str[KSYM_NAME_LEN + 1]; 404 char str[KSYM_NAME_LEN];
405 405
406 name = lock->name; 406 name = lock->name;
407 if (!name) 407 if (!name)
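The lockdep and module.c hunks reflect a global redefinition of KSYM_NAME_LEN to include the terminating NUL, so symbol buffers are sized with the bare constant and strlcpy() is passed the same value. A hedged sketch of the new convention; lookup_symbol_name() is the existing kallsyms helper, the wrapper function is illustrative:

    static void example_report_symbol(unsigned long addr)
    {
            char name[KSYM_NAME_LEN];       /* no "+ 1": the NUL is already counted */

            if (lookup_symbol_name(addr, name) == 0)
                    printk(KERN_DEBUG "addr %p resolves to %s\n",
                           (void *)addr, name);
    }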
diff --git a/kernel/module.c b/kernel/module.c
index 9bd93de01f..33c04ad511 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -61,10 +61,8 @@ extern int module_sysfs_initialized;
61/* If this is set, the section belongs in the init part of the module */ 61/* If this is set, the section belongs in the init part of the module */
62#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 62#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
63 63
64/* Protects module list */ 64/* List of modules, protected by module_mutex or preempt_disable
65static DEFINE_SPINLOCK(modlist_lock); 65 * (add/delete uses stop_machine). */
66
67/* List of modules, protected by module_mutex AND modlist_lock */
68static DEFINE_MUTEX(module_mutex); 66static DEFINE_MUTEX(module_mutex);
69static LIST_HEAD(modules); 67static LIST_HEAD(modules);
70 68
@@ -488,8 +486,7 @@ static void free_modinfo_##field(struct module *mod) \
488 mod->field = NULL; \ 486 mod->field = NULL; \
489} \ 487} \
490static struct module_attribute modinfo_##field = { \ 488static struct module_attribute modinfo_##field = { \
491 .attr = { .name = __stringify(field), .mode = 0444, \ 489 .attr = { .name = __stringify(field), .mode = 0444 }, \
492 .owner = THIS_MODULE }, \
493 .show = show_modinfo_##field, \ 490 .show = show_modinfo_##field, \
494 .setup = setup_modinfo_##field, \ 491 .setup = setup_modinfo_##field, \
495 .test = modinfo_##field##_exists, \ 492 .test = modinfo_##field##_exists, \
@@ -761,14 +758,13 @@ static void print_unload_info(struct seq_file *m, struct module *mod)
761void __symbol_put(const char *symbol) 758void __symbol_put(const char *symbol)
762{ 759{
763 struct module *owner; 760 struct module *owner;
764 unsigned long flags;
765 const unsigned long *crc; 761 const unsigned long *crc;
766 762
767 spin_lock_irqsave(&modlist_lock, flags); 763 preempt_disable();
768 if (!__find_symbol(symbol, &owner, &crc, 1)) 764 if (!__find_symbol(symbol, &owner, &crc, 1))
769 BUG(); 765 BUG();
770 module_put(owner); 766 module_put(owner);
771 spin_unlock_irqrestore(&modlist_lock, flags); 767 preempt_enable();
772} 768}
773EXPORT_SYMBOL(__symbol_put); 769EXPORT_SYMBOL(__symbol_put);
774 770
@@ -793,7 +789,7 @@ static ssize_t show_refcnt(struct module_attribute *mattr,
793} 789}
794 790
795static struct module_attribute refcnt = { 791static struct module_attribute refcnt = {
796 .attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE }, 792 .attr = { .name = "refcnt", .mode = 0444 },
797 .show = show_refcnt, 793 .show = show_refcnt,
798}; 794};
799 795
@@ -851,7 +847,7 @@ static ssize_t show_initstate(struct module_attribute *mattr,
851} 847}
852 848
853static struct module_attribute initstate = { 849static struct module_attribute initstate = {
854 .attr = { .name = "initstate", .mode = 0444, .owner = THIS_MODULE }, 850 .attr = { .name = "initstate", .mode = 0444 },
855 .show = show_initstate, 851 .show = show_initstate,
856}; 852};
857 853
@@ -1032,7 +1028,6 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1032 sattr->mattr.show = module_sect_show; 1028 sattr->mattr.show = module_sect_show;
1033 sattr->mattr.store = NULL; 1029 sattr->mattr.store = NULL;
1034 sattr->mattr.attr.name = sattr->name; 1030 sattr->mattr.attr.name = sattr->name;
1035 sattr->mattr.attr.owner = mod;
1036 sattr->mattr.attr.mode = S_IRUGO; 1031 sattr->mattr.attr.mode = S_IRUGO;
1037 *(gattr++) = &(sattr++)->mattr.attr; 1032 *(gattr++) = &(sattr++)->mattr.attr;
1038 } 1033 }
@@ -1090,7 +1085,6 @@ int module_add_modinfo_attrs(struct module *mod)
1090 if (!attr->test || 1085 if (!attr->test ||
1091 (attr->test && attr->test(mod))) { 1086 (attr->test && attr->test(mod))) {
1092 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1087 memcpy(temp_attr, attr, sizeof(*temp_attr));
1093 temp_attr->attr.owner = mod;
1094 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1088 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
1095 ++temp_attr; 1089 ++temp_attr;
1096 } 1090 }
@@ -1231,14 +1225,14 @@ static void free_module(struct module *mod)
1231void *__symbol_get(const char *symbol) 1225void *__symbol_get(const char *symbol)
1232{ 1226{
1233 struct module *owner; 1227 struct module *owner;
1234 unsigned long value, flags; 1228 unsigned long value;
1235 const unsigned long *crc; 1229 const unsigned long *crc;
1236 1230
1237 spin_lock_irqsave(&modlist_lock, flags); 1231 preempt_disable();
1238 value = __find_symbol(symbol, &owner, &crc, 1); 1232 value = __find_symbol(symbol, &owner, &crc, 1);
1239 if (value && !strong_try_module_get(owner)) 1233 if (value && !strong_try_module_get(owner))
1240 value = 0; 1234 value = 0;
1241 spin_unlock_irqrestore(&modlist_lock, flags); 1235 preempt_enable();
1242 1236
1243 return (void *)value; 1237 return (void *)value;
1244} 1238}
@@ -2139,7 +2133,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2139 sym = get_ksymbol(mod, addr, NULL, NULL); 2133 sym = get_ksymbol(mod, addr, NULL, NULL);
2140 if (!sym) 2134 if (!sym)
2141 goto out; 2135 goto out;
2142 strlcpy(symname, sym, KSYM_NAME_LEN + 1); 2136 strlcpy(symname, sym, KSYM_NAME_LEN);
2143 mutex_unlock(&module_mutex); 2137 mutex_unlock(&module_mutex);
2144 return 0; 2138 return 0;
2145 } 2139 }
@@ -2164,9 +2158,9 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2164 if (!sym) 2158 if (!sym)
2165 goto out; 2159 goto out;
2166 if (modname) 2160 if (modname)
2167 strlcpy(modname, mod->name, MODULE_NAME_LEN + 1); 2161 strlcpy(modname, mod->name, MODULE_NAME_LEN);
2168 if (name) 2162 if (name)
2169 strlcpy(name, sym, KSYM_NAME_LEN + 1); 2163 strlcpy(name, sym, KSYM_NAME_LEN);
2170 mutex_unlock(&module_mutex); 2164 mutex_unlock(&module_mutex);
2171 return 0; 2165 return 0;
2172 } 2166 }
@@ -2187,8 +2181,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2187 *value = mod->symtab[symnum].st_value; 2181 *value = mod->symtab[symnum].st_value;
2188 *type = mod->symtab[symnum].st_info; 2182 *type = mod->symtab[symnum].st_info;
2189 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, 2183 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
2190 KSYM_NAME_LEN + 1); 2184 KSYM_NAME_LEN);
2191 strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1); 2185 strlcpy(module_name, mod->name, MODULE_NAME_LEN);
2192 *exported = is_exported(name, mod); 2186 *exported = is_exported(name, mod);
2193 mutex_unlock(&module_mutex); 2187 mutex_unlock(&module_mutex);
2194 return 0; 2188 return 0;
@@ -2235,26 +2229,13 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2235/* Called by the /proc file system to return a list of modules. */ 2229/* Called by the /proc file system to return a list of modules. */
2236static void *m_start(struct seq_file *m, loff_t *pos) 2230static void *m_start(struct seq_file *m, loff_t *pos)
2237{ 2231{
2238 struct list_head *i;
2239 loff_t n = 0;
2240
2241 mutex_lock(&module_mutex); 2232 mutex_lock(&module_mutex);
2242 list_for_each(i, &modules) { 2233 return seq_list_start(&modules, *pos);
2243 if (n++ == *pos)
2244 break;
2245 }
2246 if (i == &modules)
2247 return NULL;
2248 return i;
2249} 2234}
2250 2235
2251static void *m_next(struct seq_file *m, void *p, loff_t *pos) 2236static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2252{ 2237{
2253 struct list_head *i = p; 2238 return seq_list_next(p, &modules, pos);
2254 (*pos)++;
2255 if (i->next == &modules)
2256 return NULL;
2257 return i->next;
2258} 2239}
2259 2240
2260static void m_stop(struct seq_file *m, void *p) 2241static void m_stop(struct seq_file *m, void *p)
@@ -2324,11 +2305,10 @@ const struct seq_operations modules_op = {
2324/* Given an address, look for it in the module exception tables. */ 2305/* Given an address, look for it in the module exception tables. */
2325const struct exception_table_entry *search_module_extables(unsigned long addr) 2306const struct exception_table_entry *search_module_extables(unsigned long addr)
2326{ 2307{
2327 unsigned long flags;
2328 const struct exception_table_entry *e = NULL; 2308 const struct exception_table_entry *e = NULL;
2329 struct module *mod; 2309 struct module *mod;
2330 2310
2331 spin_lock_irqsave(&modlist_lock, flags); 2311 preempt_disable();
2332 list_for_each_entry(mod, &modules, list) { 2312 list_for_each_entry(mod, &modules, list) {
2333 if (mod->num_exentries == 0) 2313 if (mod->num_exentries == 0)
2334 continue; 2314 continue;
@@ -2339,7 +2319,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2339 if (e) 2319 if (e)
2340 break; 2320 break;
2341 } 2321 }
2342 spin_unlock_irqrestore(&modlist_lock, flags); 2322 preempt_enable();
2343 2323
2344 /* Now, if we found one, we are running inside it now, hence 2324 /* Now, if we found one, we are running inside it now, hence
2345 we cannot unload the module, hence no refcnt needed. */ 2325 we cannot unload the module, hence no refcnt needed. */
@@ -2351,25 +2331,24 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2351 */ 2331 */
2352int is_module_address(unsigned long addr) 2332int is_module_address(unsigned long addr)
2353{ 2333{
2354 unsigned long flags;
2355 struct module *mod; 2334 struct module *mod;
2356 2335
2357 spin_lock_irqsave(&modlist_lock, flags); 2336 preempt_disable();
2358 2337
2359 list_for_each_entry(mod, &modules, list) { 2338 list_for_each_entry(mod, &modules, list) {
2360 if (within(addr, mod->module_core, mod->core_size)) { 2339 if (within(addr, mod->module_core, mod->core_size)) {
2361 spin_unlock_irqrestore(&modlist_lock, flags); 2340 preempt_enable();
2362 return 1; 2341 return 1;
2363 } 2342 }
2364 } 2343 }
2365 2344
2366 spin_unlock_irqrestore(&modlist_lock, flags); 2345 preempt_enable();
2367 2346
2368 return 0; 2347 return 0;
2369} 2348}
2370 2349
2371 2350
2372/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ 2351/* Is this a valid kernel address? */
2373struct module *__module_text_address(unsigned long addr) 2352struct module *__module_text_address(unsigned long addr)
2374{ 2353{
2375 struct module *mod; 2354 struct module *mod;
@@ -2384,11 +2363,10 @@ struct module *__module_text_address(unsigned long addr)
2384struct module *module_text_address(unsigned long addr) 2363struct module *module_text_address(unsigned long addr)
2385{ 2364{
2386 struct module *mod; 2365 struct module *mod;
2387 unsigned long flags;
2388 2366
2389 spin_lock_irqsave(&modlist_lock, flags); 2367 preempt_disable();
2390 mod = __module_text_address(addr); 2368 mod = __module_text_address(addr);
2391 spin_unlock_irqrestore(&modlist_lock, flags); 2369 preempt_enable();
2392 2370
2393 return mod; 2371 return mod;
2394} 2372}
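The /proc/modules iterator above becomes a thin wrapper around the new seq_list_start()/seq_list_next() helpers. The same three-callback shape works for any lock-protected list exported through seq_file; a generic sketch follows, with the widget names invented for illustration:

    static LIST_HEAD(widget_list);
    static DEFINE_MUTEX(widget_mutex);

    static void *widget_seq_start(struct seq_file *m, loff_t *pos)
    {
            mutex_lock(&widget_mutex);
            return seq_list_start(&widget_list, *pos); /* NULL once *pos is past the end */
    }

    static void *widget_seq_next(struct seq_file *m, void *p, loff_t *pos)
    {
            return seq_list_next(p, &widget_list, pos);
    }

    static void widget_seq_stop(struct seq_file *m, void *p)
    {
            mutex_unlock(&widget_mutex);
    }

    static int widget_seq_show(struct seq_file *m, void *p)
    {
            /* p is the list_head inside the entry; recover it with list_entry(). */
            return 0;
    }

    static const struct seq_operations widget_seq_ops = {
            .start  = widget_seq_start,
            .next   = widget_seq_next,
            .stop   = widget_seq_stop,
            .show   = widget_seq_show,
    };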
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1bc4b55241..10f0bbba38 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -21,6 +21,8 @@
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23 23
24static struct kmem_cache *nsproxy_cachep;
25
24struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 26struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
25 27
26static inline void get_nsproxy(struct nsproxy *ns) 28static inline void get_nsproxy(struct nsproxy *ns)
@@ -43,9 +45,11 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
43{ 45{
44 struct nsproxy *ns; 46 struct nsproxy *ns;
45 47
46 ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); 48 ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
47 if (ns) 49 if (ns) {
50 memcpy(ns, orig, sizeof(struct nsproxy));
48 atomic_set(&ns->count, 1); 51 atomic_set(&ns->count, 1);
52 }
49 return ns; 53 return ns;
50} 54}
51 55
@@ -54,33 +58,51 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
54 * Return the newly created nsproxy. Do not attach this to the task, 58 * Return the newly created nsproxy. Do not attach this to the task,
55 * leave it to the caller to do proper locking and attach it to task. 59 * leave it to the caller to do proper locking and attach it to task.
56 */ 60 */
57static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, 61static struct nsproxy *create_new_namespaces(unsigned long flags,
58 struct fs_struct *new_fs) 62 struct task_struct *tsk, struct fs_struct *new_fs)
59{ 63{
60 struct nsproxy *new_nsp; 64 struct nsproxy *new_nsp;
65 int err;
61 66
62 new_nsp = clone_nsproxy(tsk->nsproxy); 67 new_nsp = clone_nsproxy(tsk->nsproxy);
63 if (!new_nsp) 68 if (!new_nsp)
64 return ERR_PTR(-ENOMEM); 69 return ERR_PTR(-ENOMEM);
65 70
66 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); 71 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
67 if (IS_ERR(new_nsp->mnt_ns)) 72 if (IS_ERR(new_nsp->mnt_ns)) {
73 err = PTR_ERR(new_nsp->mnt_ns);
68 goto out_ns; 74 goto out_ns;
75 }
69 76
70 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); 77 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
71 if (IS_ERR(new_nsp->uts_ns)) 78 if (IS_ERR(new_nsp->uts_ns)) {
79 err = PTR_ERR(new_nsp->uts_ns);
72 goto out_uts; 80 goto out_uts;
81 }
73 82
74 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); 83 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
75 if (IS_ERR(new_nsp->ipc_ns)) 84 if (IS_ERR(new_nsp->ipc_ns)) {
85 err = PTR_ERR(new_nsp->ipc_ns);
76 goto out_ipc; 86 goto out_ipc;
87 }
77 88
78 new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); 89 new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns);
79 if (IS_ERR(new_nsp->pid_ns)) 90 if (IS_ERR(new_nsp->pid_ns)) {
91 err = PTR_ERR(new_nsp->pid_ns);
80 goto out_pid; 92 goto out_pid;
93 }
94
95 new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
96 if (IS_ERR(new_nsp->user_ns)) {
97 err = PTR_ERR(new_nsp->user_ns);
98 goto out_user;
99 }
81 100
82 return new_nsp; 101 return new_nsp;
83 102
103out_user:
104 if (new_nsp->pid_ns)
105 put_pid_ns(new_nsp->pid_ns);
84out_pid: 106out_pid:
85 if (new_nsp->ipc_ns) 107 if (new_nsp->ipc_ns)
86 put_ipc_ns(new_nsp->ipc_ns); 108 put_ipc_ns(new_nsp->ipc_ns);
@@ -91,15 +113,15 @@ out_uts:
91 if (new_nsp->mnt_ns) 113 if (new_nsp->mnt_ns)
92 put_mnt_ns(new_nsp->mnt_ns); 114 put_mnt_ns(new_nsp->mnt_ns);
93out_ns: 115out_ns:
94 kfree(new_nsp); 116 kmem_cache_free(nsproxy_cachep, new_nsp);
95 return ERR_PTR(-ENOMEM); 117 return ERR_PTR(err);
96} 118}
97 119
98/* 120/*
99 * called from clone. This now handles copy for nsproxy and all 121 * called from clone. This now handles copy for nsproxy and all
100 * namespaces therein. 122 * namespaces therein.
101 */ 123 */
102int copy_namespaces(int flags, struct task_struct *tsk) 124int copy_namespaces(unsigned long flags, struct task_struct *tsk)
103{ 125{
104 struct nsproxy *old_ns = tsk->nsproxy; 126 struct nsproxy *old_ns = tsk->nsproxy;
105 struct nsproxy *new_ns; 127 struct nsproxy *new_ns;
@@ -110,7 +132,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
110 132
111 get_nsproxy(old_ns); 133 get_nsproxy(old_ns);
112 134
113 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) 135 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER)))
114 return 0; 136 return 0;
115 137
116 if (!capable(CAP_SYS_ADMIN)) { 138 if (!capable(CAP_SYS_ADMIN)) {
@@ -140,43 +162,39 @@ void free_nsproxy(struct nsproxy *ns)
140 put_ipc_ns(ns->ipc_ns); 162 put_ipc_ns(ns->ipc_ns);
141 if (ns->pid_ns) 163 if (ns->pid_ns)
142 put_pid_ns(ns->pid_ns); 164 put_pid_ns(ns->pid_ns);
143 kfree(ns); 165 if (ns->user_ns)
166 put_user_ns(ns->user_ns);
167 kmem_cache_free(nsproxy_cachep, ns);
144} 168}
145 169
146/* 170/*
147 * Called from unshare. Unshare all the namespaces part of nsproxy. 171 * Called from unshare. Unshare all the namespaces part of nsproxy.
148 * On sucess, returns the new nsproxy and a reference to old nsproxy 172 * On success, returns the new nsproxy.
149 * to make sure it stays around.
150 */ 173 */
151int unshare_nsproxy_namespaces(unsigned long unshare_flags, 174int unshare_nsproxy_namespaces(unsigned long unshare_flags,
152 struct nsproxy **new_nsp, struct fs_struct *new_fs) 175 struct nsproxy **new_nsp, struct fs_struct *new_fs)
153{ 176{
154 struct nsproxy *old_ns = current->nsproxy;
155 int err = 0; 177 int err = 0;
156 178
157 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) 179 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
180 CLONE_NEWUSER)))
158 return 0; 181 return 0;
159 182
160#ifndef CONFIG_IPC_NS
161 if (unshare_flags & CLONE_NEWIPC)
162 return -EINVAL;
163#endif
164
165#ifndef CONFIG_UTS_NS
166 if (unshare_flags & CLONE_NEWUTS)
167 return -EINVAL;
168#endif
169
170 if (!capable(CAP_SYS_ADMIN)) 183 if (!capable(CAP_SYS_ADMIN))
171 return -EPERM; 184 return -EPERM;
172 185
173 get_nsproxy(old_ns);
174
175 *new_nsp = create_new_namespaces(unshare_flags, current, 186 *new_nsp = create_new_namespaces(unshare_flags, current,
176 new_fs ? new_fs : current->fs); 187 new_fs ? new_fs : current->fs);
177 if (IS_ERR(*new_nsp)) { 188 if (IS_ERR(*new_nsp))
178 err = PTR_ERR(*new_nsp); 189 err = PTR_ERR(*new_nsp);
179 put_nsproxy(old_ns);
180 }
181 return err; 190 return err;
182} 191}
192
193static int __init nsproxy_cache_init(void)
194{
195 nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy),
196 0, SLAB_PANIC, NULL, NULL);
197 return 0;
198}
199
200module_init(nsproxy_cache_init);
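Two things change above: nsproxy objects now come from a dedicated kmem_cache, and create_new_namespaces() propagates the real error from each copy_*() helper through ERR_PTR() instead of flattening everything to -ENOMEM. On the caller side the usual IS_ERR()/PTR_ERR() handling applies; a minimal illustrative sketch (the wrapper name is made up):

    static int example_switch_namespaces(unsigned long flags, struct task_struct *tsk)
    {
            struct nsproxy *new_ns;

            new_ns = create_new_namespaces(flags, tsk, tsk->fs);
            if (IS_ERR(new_ns))
                    return PTR_ERR(new_ns); /* e.g. -ENOMEM, or an error from copy_utsname() */

            /* attach under the proper locking, as copy_namespaces() does */
            tsk->nsproxy = new_ns;
            return 0;
    }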
diff --git a/kernel/panic.c b/kernel/panic.c
index 623d182825..f64f4c1ac1 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -159,14 +159,15 @@ const char *print_tainted(void)
159{ 159{
160 static char buf[20]; 160 static char buf[20];
161 if (tainted) { 161 if (tainted) {
162 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", 162 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c",
163 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 163 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
164 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 164 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
165 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 165 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
166 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', 166 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
167 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 167 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
168 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 168 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
169 tainted & TAINT_USER ? 'U' : ' '); 169 tainted & TAINT_USER ? 'U' : ' ',
170 tainted & TAINT_DIE ? 'D' : ' ');
170 } 171 }
171 else 172 else
172 snprintf(buf, sizeof(buf), "Not tainted"); 173 snprintf(buf, sizeof(buf), "Not tainted");
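print_tainted() grows a 'D' flag for TAINT_DIE, set once the kernel has oopsed. The flag is intended to be raised from the architecture die() paths, which are outside this kernel/-only diffstat; schematically, and purely as an illustrative fragment:

    static void example_oops_epilogue(void)
    {
            add_taint(TAINT_DIE);   /* subsequent print_tainted() output includes 'D' */
    }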
diff --git a/kernel/params.c b/kernel/params.c
index e61c46c97c..effbaaedd7 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -491,7 +491,6 @@ param_sysfs_setup(struct module_kobject *mk,
491 pattr->mattr.show = param_attr_show; 491 pattr->mattr.show = param_attr_show;
492 pattr->mattr.store = param_attr_store; 492 pattr->mattr.store = param_attr_store;
493 pattr->mattr.attr.name = (char *)&kp->name[name_skip]; 493 pattr->mattr.attr.name = (char *)&kp->name[name_skip];
494 pattr->mattr.attr.owner = mk->mod;
495 pattr->mattr.attr.mode = kp->perm; 494 pattr->mattr.attr.mode = kp->perm;
496 *(gattr++) = &(pattr++)->mattr.attr; 495 *(gattr++) = &(pattr++)->mattr.attr;
497 } 496 }
diff --git a/kernel/pid.c b/kernel/pid.c
index eb66bd2953..c6e3f9ffff 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -365,7 +365,7 @@ struct pid *find_ge_pid(int nr)
365} 365}
366EXPORT_SYMBOL_GPL(find_get_pid); 366EXPORT_SYMBOL_GPL(find_get_pid);
367 367
368struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns) 368struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
369{ 369{
370 BUG_ON(!old_ns); 370 BUG_ON(!old_ns);
371 get_pid_ns(old_ns); 371 get_pid_ns(old_ns);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 1de710e183..b53c8fcd9d 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struct task_struct *p)
161} 161}
162static inline unsigned long long sched_ns(struct task_struct *p) 162static inline unsigned long long sched_ns(struct task_struct *p)
163{ 163{
164 return (p == current) ? current_sched_time(p) : p->sched_time; 164 return task_sched_runtime(p);
165} 165}
166 166
167int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 167int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
@@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
246 } while (t != p); 246 } while (t != p);
247 break; 247 break;
248 case CPUCLOCK_SCHED: 248 case CPUCLOCK_SCHED:
249 cpu->sched = p->signal->sched_time; 249 cpu->sched = p->signal->sum_sched_runtime;
250 /* Add in each other live thread. */ 250 /* Add in each other live thread. */
251 while ((t = next_thread(t)) != p) { 251 while ((t = next_thread(t)) != p) {
252 cpu->sched += t->sched_time; 252 cpu->sched += t->se.sum_exec_runtime;
253 } 253 }
254 cpu->sched += sched_ns(p); 254 cpu->sched += sched_ns(p);
255 break; 255 break;
@@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
422 */ 422 */
423static void cleanup_timers(struct list_head *head, 423static void cleanup_timers(struct list_head *head,
424 cputime_t utime, cputime_t stime, 424 cputime_t utime, cputime_t stime,
425 unsigned long long sched_time) 425 unsigned long long sum_exec_runtime)
426{ 426{
427 struct cpu_timer_list *timer, *next; 427 struct cpu_timer_list *timer, *next;
428 cputime_t ptime = cputime_add(utime, stime); 428 cputime_t ptime = cputime_add(utime, stime);
@@ -451,10 +451,10 @@ static void cleanup_timers(struct list_head *head,
451 ++head; 451 ++head;
452 list_for_each_entry_safe(timer, next, head, entry) { 452 list_for_each_entry_safe(timer, next, head, entry) {
453 list_del_init(&timer->entry); 453 list_del_init(&timer->entry);
454 if (timer->expires.sched < sched_time) { 454 if (timer->expires.sched < sum_exec_runtime) {
455 timer->expires.sched = 0; 455 timer->expires.sched = 0;
456 } else { 456 } else {
457 timer->expires.sched -= sched_time; 457 timer->expires.sched -= sum_exec_runtime;
458 } 458 }
459 } 459 }
460} 460}
@@ -467,7 +467,7 @@ static void cleanup_timers(struct list_head *head,
467void posix_cpu_timers_exit(struct task_struct *tsk) 467void posix_cpu_timers_exit(struct task_struct *tsk)
468{ 468{
469 cleanup_timers(tsk->cpu_timers, 469 cleanup_timers(tsk->cpu_timers,
470 tsk->utime, tsk->stime, tsk->sched_time); 470 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
471 471
472} 472}
473void posix_cpu_timers_exit_group(struct task_struct *tsk) 473void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
475 cleanup_timers(tsk->signal->cpu_timers, 475 cleanup_timers(tsk->signal->cpu_timers,
476 cputime_add(tsk->utime, tsk->signal->utime), 476 cputime_add(tsk->utime, tsk->signal->utime),
477 cputime_add(tsk->stime, tsk->signal->stime), 477 cputime_add(tsk->stime, tsk->signal->stime),
478 tsk->sched_time + tsk->signal->sched_time); 478 tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
479} 479}
480 480
481 481
@@ -536,7 +536,7 @@ static void process_timer_rebalance(struct task_struct *p,
536 nsleft = max_t(unsigned long long, nsleft, 1); 536 nsleft = max_t(unsigned long long, nsleft, 1);
537 do { 537 do {
538 if (likely(!(t->flags & PF_EXITING))) { 538 if (likely(!(t->flags & PF_EXITING))) {
539 ns = t->sched_time + nsleft; 539 ns = t->se.sum_exec_runtime + nsleft;
540 if (t->it_sched_expires == 0 || 540 if (t->it_sched_expires == 0 ||
541 t->it_sched_expires > ns) { 541 t->it_sched_expires > ns) {
542 t->it_sched_expires = ns; 542 t->it_sched_expires = ns;
@@ -1004,7 +1004,7 @@ static void check_thread_timers(struct task_struct *tsk,
1004 struct cpu_timer_list *t = list_first_entry(timers, 1004 struct cpu_timer_list *t = list_first_entry(timers,
1005 struct cpu_timer_list, 1005 struct cpu_timer_list,
1006 entry); 1006 entry);
1007 if (!--maxfire || tsk->sched_time < t->expires.sched) { 1007 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
1008 tsk->it_sched_expires = t->expires.sched; 1008 tsk->it_sched_expires = t->expires.sched;
1009 break; 1009 break;
1010 } 1010 }
@@ -1024,7 +1024,7 @@ static void check_process_timers(struct task_struct *tsk,
1024 int maxfire; 1024 int maxfire;
1025 struct signal_struct *const sig = tsk->signal; 1025 struct signal_struct *const sig = tsk->signal;
1026 cputime_t utime, stime, ptime, virt_expires, prof_expires; 1026 cputime_t utime, stime, ptime, virt_expires, prof_expires;
1027 unsigned long long sched_time, sched_expires; 1027 unsigned long long sum_sched_runtime, sched_expires;
1028 struct task_struct *t; 1028 struct task_struct *t;
1029 struct list_head *timers = sig->cpu_timers; 1029 struct list_head *timers = sig->cpu_timers;
1030 1030
@@ -1044,12 +1044,12 @@ static void check_process_timers(struct task_struct *tsk,
1044 */ 1044 */
1045 utime = sig->utime; 1045 utime = sig->utime;
1046 stime = sig->stime; 1046 stime = sig->stime;
1047 sched_time = sig->sched_time; 1047 sum_sched_runtime = sig->sum_sched_runtime;
1048 t = tsk; 1048 t = tsk;
1049 do { 1049 do {
1050 utime = cputime_add(utime, t->utime); 1050 utime = cputime_add(utime, t->utime);
1051 stime = cputime_add(stime, t->stime); 1051 stime = cputime_add(stime, t->stime);
1052 sched_time += t->sched_time; 1052 sum_sched_runtime += t->se.sum_exec_runtime;
1053 t = next_thread(t); 1053 t = next_thread(t);
1054 } while (t != tsk); 1054 } while (t != tsk);
1055 ptime = cputime_add(utime, stime); 1055 ptime = cputime_add(utime, stime);
@@ -1090,7 +1090,7 @@ static void check_process_timers(struct task_struct *tsk,
1090 struct cpu_timer_list *t = list_first_entry(timers, 1090 struct cpu_timer_list *t = list_first_entry(timers,
1091 struct cpu_timer_list, 1091 struct cpu_timer_list,
1092 entry); 1092 entry);
1093 if (!--maxfire || sched_time < t->expires.sched) { 1093 if (!--maxfire || sum_sched_runtime < t->expires.sched) {
1094 sched_expires = t->expires.sched; 1094 sched_expires = t->expires.sched;
1095 break; 1095 break;
1096 } 1096 }
@@ -1182,7 +1182,7 @@ static void check_process_timers(struct task_struct *tsk,
1182 virt_left = cputime_sub(virt_expires, utime); 1182 virt_left = cputime_sub(virt_expires, utime);
1183 virt_left = cputime_div_non_zero(virt_left, nthreads); 1183 virt_left = cputime_div_non_zero(virt_left, nthreads);
1184 if (sched_expires) { 1184 if (sched_expires) {
1185 sched_left = sched_expires - sched_time; 1185 sched_left = sched_expires - sum_sched_runtime;
1186 do_div(sched_left, nthreads); 1186 do_div(sched_left, nthreads);
1187 sched_left = max_t(unsigned long long, sched_left, 1); 1187 sched_left = max_t(unsigned long long, sched_left, 1);
1188 } else { 1188 } else {
@@ -1208,7 +1208,7 @@ static void check_process_timers(struct task_struct *tsk,
1208 t->it_virt_expires = ticks; 1208 t->it_virt_expires = ticks;
1209 } 1209 }
1210 1210
1211 sched = t->sched_time + sched_left; 1211 sched = t->se.sum_exec_runtime + sched_left;
1212 if (sched_expires && (t->it_sched_expires == 0 || 1212 if (sched_expires && (t->it_sched_expires == 0 ||
1213 t->it_sched_expires > sched)) { 1213 t->it_sched_expires > sched)) {
1214 t->it_sched_expires = sched; 1214 t->it_sched_expires = sched;
@@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1300 1300
1301 if (UNEXPIRED(prof) && UNEXPIRED(virt) && 1301 if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
1302 (tsk->it_sched_expires == 0 || 1302 (tsk->it_sched_expires == 0 ||
1303 tsk->sched_time < tsk->it_sched_expires)) 1303 tsk->se.sum_exec_runtime < tsk->it_sched_expires))
1304 return; 1304 return;
1305 1305
1306#undef UNEXPIRED 1306#undef UNEXPIRED
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 8812985f30..fc45ed2262 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -15,7 +15,6 @@
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/pm.h>
19#include <linux/console.h> 18#include <linux/console.h>
20#include <linux/cpu.h> 19#include <linux/cpu.h>
21#include <linux/resume-trace.h> 20#include <linux/resume-trace.h>
@@ -97,6 +96,11 @@ static int suspend_prepare(suspend_state_t state)
97 } 96 }
98 } 97 }
99 98
99 if (pm_ops->set_target) {
100 error = pm_ops->set_target(state);
101 if (error)
102 goto Thaw;
103 }
100 suspend_console(); 104 suspend_console();
101 error = device_suspend(PMSG_SUSPEND); 105 error = device_suspend(PMSG_SUSPEND);
102 if (error) { 106 if (error) {
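suspend_prepare() now tells the platform which state is being entered before any device is suspended, through the new pm_ops->set_target() hook. A hedged sketch of a platform making use of it; all acme_* names are invented:

    static suspend_state_t acme_target_state;

    static int acme_pm_set_target(suspend_state_t state)
    {
            acme_target_state = state;      /* remembered for the ->enter() step */
            return 0;
    }

    static int acme_pm_enter(suspend_state_t state)
    {
            /* program the hardware according to acme_target_state / state */
            return 0;
    }

    static struct pm_ops acme_pm_ops = {
            .set_target     = acme_pm_set_target,
            .enter          = acme_pm_enter,
    };

    /* registered at init time with pm_set_ops(&acme_pm_ops) */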
diff --git a/kernel/printk.c b/kernel/printk.c
index 0bbdeac281..051d27e36a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -449,13 +449,16 @@ static int printk_time = 1;
449#else 449#else
450static int printk_time = 0; 450static int printk_time = 0;
451#endif 451#endif
452module_param(printk_time, int, S_IRUGO | S_IWUSR); 452module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
453 453
454static int __init printk_time_setup(char *str) 454static int __init printk_time_setup(char *str)
455{ 455{
456 if (*str) 456 if (*str)
457 return 0; 457 return 0;
458 printk_time = 1; 458 printk_time = 1;
459 printk(KERN_NOTICE "The 'time' option is deprecated and "
460 "is scheduled for removal in early 2008\n");
461 printk(KERN_NOTICE "Use 'printk.time=<value>' instead\n");
459 return 1; 462 return 1;
460} 463}
461 464
@@ -483,6 +486,9 @@ static int have_callable_console(void)
483 * @fmt: format string 486 * @fmt: format string
484 * 487 *
485 * This is printk(). It can be called from any context. We want it to work. 488 * This is printk(). It can be called from any context. We want it to work.
489 * Be aware of the fact that if oops_in_progress is not set, we might try to
490 * wake klogd up which could deadlock on runqueue lock if printk() is called
491 * from scheduler code.
486 * 492 *
487 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 493 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
488 * call the console drivers. If we fail to get the semaphore we place the output 494 * call the console drivers. If we fail to get the semaphore we place the output
@@ -654,7 +660,7 @@ static void call_console_drivers(unsigned long start, unsigned long end)
654 */ 660 */
655static int __init console_setup(char *str) 661static int __init console_setup(char *str)
656{ 662{
657 char name[sizeof(console_cmdline[0].name)]; 663 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
658 char *s, *options; 664 char *s, *options;
659 int idx; 665 int idx;
660 666
@@ -662,27 +668,27 @@ static int __init console_setup(char *str)
662 * Decode str into name, index, options. 668 * Decode str into name, index, options.
663 */ 669 */
664 if (str[0] >= '0' && str[0] <= '9') { 670 if (str[0] >= '0' && str[0] <= '9') {
665 strcpy(name, "ttyS"); 671 strcpy(buf, "ttyS");
666 strncpy(name + 4, str, sizeof(name) - 5); 672 strncpy(buf + 4, str, sizeof(buf) - 5);
667 } else { 673 } else {
668 strncpy(name, str, sizeof(name) - 1); 674 strncpy(buf, str, sizeof(buf) - 1);
669 } 675 }
670 name[sizeof(name) - 1] = 0; 676 buf[sizeof(buf) - 1] = 0;
671 if ((options = strchr(str, ',')) != NULL) 677 if ((options = strchr(str, ',')) != NULL)
672 *(options++) = 0; 678 *(options++) = 0;
673#ifdef __sparc__ 679#ifdef __sparc__
674 if (!strcmp(str, "ttya")) 680 if (!strcmp(str, "ttya"))
675 strcpy(name, "ttyS0"); 681 strcpy(buf, "ttyS0");
676 if (!strcmp(str, "ttyb")) 682 if (!strcmp(str, "ttyb"))
677 strcpy(name, "ttyS1"); 683 strcpy(buf, "ttyS1");
678#endif 684#endif
679 for (s = name; *s; s++) 685 for (s = buf; *s; s++)
680 if ((*s >= '0' && *s <= '9') || *s == ',') 686 if ((*s >= '0' && *s <= '9') || *s == ',')
681 break; 687 break;
682 idx = simple_strtoul(s, NULL, 10); 688 idx = simple_strtoul(s, NULL, 10);
683 *s = 0; 689 *s = 0;
684 690
685 add_preferred_console(name, idx, options); 691 add_preferred_console(buf, idx, options);
686 return 1; 692 return 1;
687} 693}
688__setup("console=", console_setup); 694__setup("console=", console_setup);
@@ -709,7 +715,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
709 * See if this tty is not yet registered, and 715 * See if this tty is not yet registered, and
710 * if we have a slot free. 716 * if we have a slot free.
711 */ 717 */
712 for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) 718 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
713 if (strcmp(console_cmdline[i].name, name) == 0 && 719 if (strcmp(console_cmdline[i].name, name) == 0 &&
714 console_cmdline[i].index == idx) { 720 console_cmdline[i].index == idx) {
715 selected_console = i; 721 selected_console = i;
@@ -726,6 +732,25 @@ int __init add_preferred_console(char *name, int idx, char *options)
726 return 0; 732 return 0;
727} 733}
728 734
735int __init update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
736{
737 struct console_cmdline *c;
738 int i;
739
740 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
741 if (strcmp(console_cmdline[i].name, name) == 0 &&
742 console_cmdline[i].index == idx) {
743 c = &console_cmdline[i];
744 memcpy(c->name, name_new, sizeof(c->name));
745 c->name[sizeof(c->name) - 1] = 0;
746 c->options = options;
747 c->index = idx_new;
748 return i;
749 }
750 /* not found */
751 return -1;
752}
753
729#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND 754#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
730/** 755/**
731 * suspend_console - suspend the console subsystem 756 * suspend_console - suspend the console subsystem
@@ -942,6 +967,9 @@ void register_console(struct console *console)
942 if (preferred_console < 0 || bootconsole || !console_drivers) 967 if (preferred_console < 0 || bootconsole || !console_drivers)
943 preferred_console = selected_console; 968 preferred_console = selected_console;
944 969
970 if (console->early_setup)
971 console->early_setup();
972
945 /* 973 /*
946 * See if we want to use this console driver. If we 974 * See if we want to use this console driver. If we
947 * didn't select a console we take the first one 975 * didn't select a console we take the first one
@@ -985,12 +1013,15 @@ void register_console(struct console *console)
985 if (!(console->flags & CON_ENABLED)) 1013 if (!(console->flags & CON_ENABLED))
986 return; 1014 return;
987 1015
988 if (bootconsole) { 1016 if (bootconsole && (console->flags & CON_CONSDEV)) {
989 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", 1017 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n",
990 bootconsole->name, bootconsole->index, 1018 bootconsole->name, bootconsole->index,
991 console->name, console->index); 1019 console->name, console->index);
992 unregister_console(bootconsole); 1020 unregister_console(bootconsole);
993 console->flags &= ~CON_PRINTBUFFER; 1021 console->flags &= ~CON_PRINTBUFFER;
1022 } else {
1023 printk(KERN_INFO "console [%s%d] enabled\n",
1024 console->name, console->index);
994 } 1025 }
995 1026
996 /* 1027 /*
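Besides the console handover tweaks, printk gains update_console_cmdline(), which lets a driver rewrite a console=... entry in place once the final device name and index are known. A hedged sketch of a driver handing an early "euart0" console over to ttyS0; the euart name is invented:

    static void euart_fixup_console_name(void)
    {
            /*
             * Rewrite a "console=euart0" command-line entry to ttyS0.  NULL
             * drops the option string; pass the original options through if
             * baud/parity settings should be preserved.
             */
            if (update_console_cmdline("euart", 0, "ttyS", 0, NULL) < 0)
                    printk(KERN_DEBUG "euart: no console=euart0 entry found\n");
    }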
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ad7949a589..4a1745f1da 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -161,6 +161,7 @@ int ptrace_may_attach(struct task_struct *task)
161int ptrace_attach(struct task_struct *task) 161int ptrace_attach(struct task_struct *task)
162{ 162{
163 int retval; 163 int retval;
164 unsigned long flags;
164 165
165 audit_ptrace(task); 166 audit_ptrace(task);
166 167
@@ -181,9 +182,7 @@ repeat:
181 * cpu's that may have task_lock). 182 * cpu's that may have task_lock).
182 */ 183 */
183 task_lock(task); 184 task_lock(task);
184 local_irq_disable(); 185 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
185 if (!write_trylock(&tasklist_lock)) {
186 local_irq_enable();
187 task_unlock(task); 186 task_unlock(task);
188 do { 187 do {
189 cpu_relax(); 188 cpu_relax();
@@ -211,7 +210,7 @@ repeat:
211 force_sig_specific(SIGSTOP, task); 210 force_sig_specific(SIGSTOP, task);
212 211
213bad: 212bad:
214 write_unlock_irq(&tasklist_lock); 213 write_unlock_irqrestore(&tasklist_lock, flags);
215 task_unlock(task); 214 task_unlock(task);
216out: 215out:
217 return retval; 216 return retval;
@@ -491,3 +490,22 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
491 return ret; 490 return ret;
492} 491}
493#endif /* __ARCH_SYS_PTRACE */ 492#endif /* __ARCH_SYS_PTRACE */
493
494int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
495{
496 unsigned long tmp;
497 int copied;
498
499 copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
500 if (copied != sizeof(tmp))
501 return -EIO;
502 return put_user(tmp, (unsigned long __user *)data);
503}
504
505int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
506{
507 int copied;
508
509 copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
510 return (copied == sizeof(data)) ? 0 : -EIO;
511}
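generic_ptrace_peekdata() and generic_ptrace_pokedata() lift the common PEEKDATA/POKEDATA word transfer out of the per-architecture sys_ptrace implementations. An illustrative arch_ptrace() fragment that delegates to them; the function name is made up, ptrace_request() is the existing common fallback:

    long example_arch_ptrace(struct task_struct *child, long request,
                             long addr, long data)
    {
            switch (request) {
            case PTRACE_PEEKTEXT:
            case PTRACE_PEEKDATA:
                    return generic_ptrace_peekdata(child, addr, data);
            case PTRACE_POKETEXT:
            case PTRACE_POKEDATA:
                    return generic_ptrace_pokedata(child, addr, data);
            default:
                    return ptrace_request(child, request, addr, data);
            }
    }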
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 55ba82a85a..ddff332477 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -40,6 +40,7 @@
40#include <linux/moduleparam.h> 40#include <linux/moduleparam.h>
41#include <linux/percpu.h> 41#include <linux/percpu.h>
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/freezer.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/random.h> 45#include <linux/random.h>
45#include <linux/delay.h> 46#include <linux/delay.h>
@@ -518,7 +519,6 @@ rcu_torture_writer(void *arg)
518 519
519 VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); 520 VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
520 set_user_nice(current, 19); 521 set_user_nice(current, 19);
521 current->flags |= PF_NOFREEZE;
522 522
523 do { 523 do {
524 schedule_timeout_uninterruptible(1); 524 schedule_timeout_uninterruptible(1);
@@ -558,7 +558,6 @@ rcu_torture_fakewriter(void *arg)
558 558
559 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); 559 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
560 set_user_nice(current, 19); 560 set_user_nice(current, 19);
561 current->flags |= PF_NOFREEZE;
562 561
563 do { 562 do {
564 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 563 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
@@ -589,7 +588,6 @@ rcu_torture_reader(void *arg)
589 588
590 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 589 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
591 set_user_nice(current, 19); 590 set_user_nice(current, 19);
592 current->flags |= PF_NOFREEZE;
593 591
594 do { 592 do {
595 idx = cur_ops->readlock(); 593 idx = cur_ops->readlock();
diff --git a/kernel/relay.c b/kernel/relay.c
index 4311101b0c..a615a8f513 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -21,6 +21,7 @@
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/splice.h>
24 25
25/* list of open channels, for cpu hotplug */ 26/* list of open channels, for cpu hotplug */
26static DEFINE_MUTEX(relay_channels_mutex); 27static DEFINE_MUTEX(relay_channels_mutex);
@@ -121,6 +122,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
121 buf->page_array[i] = alloc_page(GFP_KERNEL); 122 buf->page_array[i] = alloc_page(GFP_KERNEL);
122 if (unlikely(!buf->page_array[i])) 123 if (unlikely(!buf->page_array[i]))
123 goto depopulate; 124 goto depopulate;
125 set_page_private(buf->page_array[i], (unsigned long)buf);
124 } 126 }
125 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); 127 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
126 if (!mem) 128 if (!mem)
@@ -812,7 +814,10 @@ static void relay_file_read_consume(struct rchan_buf *buf,
812 } 814 }
813 815
814 buf->bytes_consumed += bytes_consumed; 816 buf->bytes_consumed += bytes_consumed;
815 read_subbuf = read_pos / buf->chan->subbuf_size; 817 if (!read_pos)
818 read_subbuf = buf->subbufs_consumed % n_subbufs;
819 else
820 read_subbuf = read_pos / buf->chan->subbuf_size;
816 if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) { 821 if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
817 if ((read_subbuf == buf->subbufs_produced % n_subbufs) && 822 if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
818 (buf->offset == subbuf_size)) 823 (buf->offset == subbuf_size))
@@ -841,8 +846,9 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
841 } 846 }
842 847
843 if (unlikely(produced - consumed >= n_subbufs)) { 848 if (unlikely(produced - consumed >= n_subbufs)) {
844 consumed = (produced / n_subbufs) * n_subbufs; 849 consumed = produced - n_subbufs + 1;
845 buf->subbufs_consumed = consumed; 850 buf->subbufs_consumed = consumed;
851 buf->bytes_consumed = 0;
846 } 852 }
847 853
848 produced = (produced % n_subbufs) * subbuf_size + buf->offset; 854 produced = (produced % n_subbufs) * subbuf_size + buf->offset;
@@ -899,7 +905,10 @@ static size_t relay_file_read_start_pos(size_t read_pos,
899 size_t read_subbuf, padding, padding_start, padding_end; 905 size_t read_subbuf, padding, padding_start, padding_end;
900 size_t subbuf_size = buf->chan->subbuf_size; 906 size_t subbuf_size = buf->chan->subbuf_size;
901 size_t n_subbufs = buf->chan->n_subbufs; 907 size_t n_subbufs = buf->chan->n_subbufs;
908 size_t consumed = buf->subbufs_consumed % n_subbufs;
902 909
910 if (!read_pos)
911 read_pos = consumed * subbuf_size + buf->bytes_consumed;
903 read_subbuf = read_pos / subbuf_size; 912 read_subbuf = read_pos / subbuf_size;
904 padding = buf->padding[read_subbuf]; 913 padding = buf->padding[read_subbuf];
905 padding_start = (read_subbuf + 1) * subbuf_size - padding; 914 padding_start = (read_subbuf + 1) * subbuf_size - padding;
@@ -963,43 +972,6 @@ static int subbuf_read_actor(size_t read_start,
963 return ret; 972 return ret;
964} 973}
965 974
966/*
967 * subbuf_send_actor - send up to one subbuf's worth of data
968 */
969static int subbuf_send_actor(size_t read_start,
970 struct rchan_buf *buf,
971 size_t avail,
972 read_descriptor_t *desc,
973 read_actor_t actor)
974{
975 unsigned long pidx, poff;
976 unsigned int subbuf_pages;
977 int ret = 0;
978
979 subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT;
980 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
981 poff = read_start & ~PAGE_MASK;
982 while (avail) {
983 struct page *p = buf->page_array[pidx];
984 unsigned int len;
985
986 len = PAGE_SIZE - poff;
987 if (len > avail)
988 len = avail;
989
990 len = actor(desc, p, poff, len);
991 if (desc->error)
992 break;
993
994 avail -= len;
995 ret += len;
996 poff = 0;
997 pidx = (pidx + 1) % subbuf_pages;
998 }
999
1000 return ret;
1001}
1002
1003typedef int (*subbuf_actor_t) (size_t read_start, 975typedef int (*subbuf_actor_t) (size_t read_start,
1004 struct rchan_buf *buf, 976 struct rchan_buf *buf,
1005 size_t avail, 977 size_t avail,
@@ -1060,19 +1032,161 @@ static ssize_t relay_file_read(struct file *filp,
1060 NULL, &desc); 1032 NULL, &desc);
1061} 1033}
1062 1034
1063static ssize_t relay_file_sendfile(struct file *filp, 1035static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
1064 loff_t *ppos,
1065 size_t count,
1066 read_actor_t actor,
1067 void *target)
1068{ 1036{
1069 read_descriptor_t desc; 1037 rbuf->bytes_consumed += bytes_consumed;
1070 desc.written = 0; 1038
1071 desc.count = count; 1039 if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
1072 desc.arg.data = target; 1040 relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
1073 desc.error = 0; 1041 rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
1074 return relay_file_read_subbufs(filp, ppos, subbuf_send_actor, 1042 }
1075 actor, &desc); 1043}
1044
1045static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1046 struct pipe_buffer *buf)
1047{
1048 struct rchan_buf *rbuf;
1049
1050 rbuf = (struct rchan_buf *)page_private(buf->page);
1051 relay_consume_bytes(rbuf, buf->private);
1052}
1053
1054static struct pipe_buf_operations relay_pipe_buf_ops = {
1055 .can_merge = 0,
1056 .map = generic_pipe_buf_map,
1057 .unmap = generic_pipe_buf_unmap,
1058 .confirm = generic_pipe_buf_confirm,
1059 .release = relay_pipe_buf_release,
1060 .steal = generic_pipe_buf_steal,
1061 .get = generic_pipe_buf_get,
1062};
1063
1064/*
1065 * subbuf_splice_actor - splice up to one subbuf's worth of data
1066 */
1067static int subbuf_splice_actor(struct file *in,
1068 loff_t *ppos,
1069 struct pipe_inode_info *pipe,
1070 size_t len,
1071 unsigned int flags,
1072 int *nonpad_ret)
1073{
1074 unsigned int pidx, poff, total_len, subbuf_pages, ret;
1075 struct rchan_buf *rbuf = in->private_data;
1076 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1077 uint64_t pos = (uint64_t) *ppos;
1078 uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size;
1079 size_t read_start = (size_t) do_div(pos, alloc_size);
1080 size_t read_subbuf = read_start / subbuf_size;
1081 size_t padding = rbuf->padding[read_subbuf];
1082 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
1083 struct page *pages[PIPE_BUFFERS];
1084 struct partial_page partial[PIPE_BUFFERS];
1085 struct splice_pipe_desc spd = {
1086 .pages = pages,
1087 .nr_pages = 0,
1088 .partial = partial,
1089 .flags = flags,
1090 .ops = &relay_pipe_buf_ops,
1091 };
1092
1093 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1094 return 0;
1095
1096 /*
1097 * Adjust read len, if longer than what is available
1098 */
1099 if (len > (subbuf_size - read_start % subbuf_size))
1100 len = subbuf_size - read_start % subbuf_size;
1101
1102 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1103 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1104 poff = read_start & ~PAGE_MASK;
1105
1106 for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) {
1107 unsigned int this_len, this_end, private;
1108 unsigned int cur_pos = read_start + total_len;
1109
1110 if (!len)
1111 break;
1112
1113 this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
1114 private = this_len;
1115
1116 spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
1117 spd.partial[spd.nr_pages].offset = poff;
1118
1119 this_end = cur_pos + this_len;
1120 if (this_end >= nonpad_end) {
1121 this_len = nonpad_end - cur_pos;
1122 private = this_len + padding;
1123 }
1124 spd.partial[spd.nr_pages].len = this_len;
1125 spd.partial[spd.nr_pages].private = private;
1126
1127 len -= this_len;
1128 total_len += this_len;
1129 poff = 0;
1130 pidx = (pidx + 1) % subbuf_pages;
1131
1132 if (this_end >= nonpad_end) {
1133 spd.nr_pages++;
1134 break;
1135 }
1136 }
1137
1138 if (!spd.nr_pages)
1139 return 0;
1140
1141 ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
1142 if (ret < 0 || ret < total_len)
1143 return ret;
1144
1145 if (read_start + ret == nonpad_end)
1146 ret += padding;
1147
1148 return ret;
1149}
1150
1151static ssize_t relay_file_splice_read(struct file *in,
1152 loff_t *ppos,
1153 struct pipe_inode_info *pipe,
1154 size_t len,
1155 unsigned int flags)
1156{
1157 ssize_t spliced;
1158 int ret;
1159 int nonpad_ret = 0;
1160
1161 ret = 0;
1162 spliced = 0;
1163
1164 while (len) {
1165 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
1166 if (ret < 0)
1167 break;
1168 else if (!ret) {
1169 if (spliced)
1170 break;
1171 if (flags & SPLICE_F_NONBLOCK) {
1172 ret = -EAGAIN;
1173 break;
1174 }
1175 }
1176
1177 *ppos += ret;
1178 if (ret > len)
1179 len = 0;
1180 else
1181 len -= ret;
1182 spliced += nonpad_ret;
1183 nonpad_ret = 0;
1184 }
1185
1186 if (spliced)
1187 return spliced;
1188
1189 return ret;
1076} 1190}
1077 1191
1078const struct file_operations relay_file_operations = { 1192const struct file_operations relay_file_operations = {
@@ -1082,7 +1196,7 @@ const struct file_operations relay_file_operations = {
1082 .read = relay_file_read, 1196 .read = relay_file_read,
1083 .llseek = no_llseek, 1197 .llseek = no_llseek,
1084 .release = relay_file_release, 1198 .release = relay_file_release,
1085 .sendfile = relay_file_sendfile, 1199 .splice_read = relay_file_splice_read,
1086}; 1200};
1087EXPORT_SYMBOL_GPL(relay_file_operations); 1201EXPORT_SYMBOL_GPL(relay_file_operations);
1088 1202
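With ->sendfile gone, relay buffers are now consumed through ->splice_read, so a userspace client drains a per-cpu buffer with splice(2) instead of sendfile(2). A hedged userspace sketch; the debugfs path is an example, and splice() needs a pipe on one side, so run it with stdout piped (e.g. ./drain | cat > trace.log):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int in = open("/sys/kernel/debug/example-chan0", O_RDONLY);
            if (in < 0) {
                    perror("open");
                    return 1;
            }
            for (;;) {
                    /* Move up to 64 KiB per call straight from the relay
                     * sub-buffers into the pipe on stdout, without a copy
                     * through userspace. */
                    ssize_t n = splice(in, NULL, STDOUT_FILENO, NULL,
                                       65536, SPLICE_F_MOVE);
                    if (n <= 0)
                            break;
            }
            close(in);
            return 0;
    }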
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index da8d6bf464..5aedbee014 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -29,12 +29,6 @@
29 29
30#include "rtmutex_common.h" 30#include "rtmutex_common.h"
31 31
32#ifdef CONFIG_DEBUG_RT_MUTEXES
33# include "rtmutex-debug.h"
34#else
35# include "rtmutex.h"
36#endif
37
38# define TRACE_WARN_ON(x) WARN_ON(x) 32# define TRACE_WARN_ON(x) WARN_ON(x)
39# define TRACE_BUG_ON(x) BUG_ON(x) 33# define TRACE_BUG_ON(x) BUG_ON(x)
40 34
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 015fc633c9..e3055ba691 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -260,6 +260,7 @@ static int test_func(void *data)
260 int ret; 260 int ret;
261 261
262 current->flags |= PF_MUTEX_TESTER; 262 current->flags |= PF_MUTEX_TESTER;
263 set_freezable();
263 allow_signal(SIGHUP); 264 allow_signal(SIGHUP);
264 265
265 for(;;) { 266 for(;;) {
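set_freezable() is the flip side of the PF_NOFREEZE removals seen in rcutorture above: kernel threads are no longer freezable by default, so a thread that should be frozen across suspend opts in explicitly and polls try_to_freeze() in its loop. A minimal sketch; the function name is illustrative and linux/freezer.h plus linux/kthread.h are assumed:

    static int example_worker_thread(void *unused)
    {
            set_freezable();                /* opt back in to the freezer */

            while (!kthread_should_stop()) {
                    try_to_freeze();        /* park here during suspend */
                    /* ... one unit of work ... */
                    schedule_timeout_interruptible(HZ);
            }
            return 0;
    }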
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 17d28ce203..8cd9bd2cdb 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -17,12 +17,6 @@
17 17
18#include "rtmutex_common.h" 18#include "rtmutex_common.h"
19 19
20#ifdef CONFIG_DEBUG_RT_MUTEXES
21# include "rtmutex-debug.h"
22#else
23# include "rtmutex.h"
24#endif
25
26/* 20/*
27 * lock->owner state tracking: 21 * lock->owner state tracking:
28 * 22 *
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 9c75856e79..2d3b83593c 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -103,7 +103,7 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
103 103
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) 104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{ 105{
106 return (struct task_struct *) 106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); 107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108} 108}
109 109
@@ -120,4 +120,11 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 122 struct task_struct *proxy_owner);
123
124#ifdef CONFIG_DEBUG_RT_MUTEXES
125# include "rtmutex-debug.h"
126#else
127# include "rtmutex.h"
128#endif
129
123#endif 130#endif
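The rtmutex hunks above stop repeating the CONFIG_DEBUG_RT_MUTEXES conditional include in each .c file and let rtmutex_common.h pick the debug or plain header once for every user. A single-file sketch of that selection pattern follows; CONFIG_DEBUG_SUBSYSTEM and the helper name are invented for illustration and are not the rtmutex API.

#include <stdio.h>

/* what would live in "subsystem-debug.h" vs. "subsystem.h" */
#ifdef CONFIG_DEBUG_SUBSYSTEM
static inline void debug_check_magic(int magic)
{
        if (magic != 0x5a5a)
                fprintf(stderr, "bad magic %#x\n", magic);
}
#else
static inline void debug_check_magic(int magic) { (void)magic; }
#endif

/* callers are oblivious to which variant the shared header selected */
int main(void)
{
        debug_check_magic(0x1234);      /* warns only in the debug build */
        return 0;
}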
diff --git a/kernel/sched.c b/kernel/sched.c
index a7475913b0..cb31fb4a13 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -16,13 +16,19 @@
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
19 */ 25 */
20 26
21#include <linux/mm.h> 27#include <linux/mm.h>
22#include <linux/module.h> 28#include <linux/module.h>
23#include <linux/nmi.h> 29#include <linux/nmi.h>
24#include <linux/init.h> 30#include <linux/init.h>
25#include <asm/uaccess.h> 31#include <linux/uaccess.h>
26#include <linux/highmem.h> 32#include <linux/highmem.h>
27#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
28#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
@@ -53,9 +59,9 @@
53#include <linux/kprobes.h> 59#include <linux/kprobes.h>
54#include <linux/delayacct.h> 60#include <linux/delayacct.h>
55#include <linux/reciprocal_div.h> 61#include <linux/reciprocal_div.h>
62#include <linux/unistd.h>
56 63
57#include <asm/tlb.h> 64#include <asm/tlb.h>
58#include <asm/unistd.h>
59 65
60/* 66/*
61 * Scheduler clock - returns current time in nanosec units. 67 * Scheduler clock - returns current time in nanosec units.
@@ -91,6 +97,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
91#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 97#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 98#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
93 99
100#define NICE_0_LOAD SCHED_LOAD_SCALE
101#define NICE_0_SHIFT SCHED_LOAD_SHIFT
102
94/* 103/*
95 * These are the 'tuning knobs' of the scheduler: 104 * These are the 'tuning knobs' of the scheduler:
96 * 105 *
@@ -100,87 +109,6 @@ unsigned long long __attribute__((weak)) sched_clock(void)
100 */ 109 */
101#define MIN_TIMESLICE max(5 * HZ / 1000, 1) 110#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
102#define DEF_TIMESLICE (100 * HZ / 1000) 111#define DEF_TIMESLICE (100 * HZ / 1000)
103#define ON_RUNQUEUE_WEIGHT 30
104#define CHILD_PENALTY 95
105#define PARENT_PENALTY 100
106#define EXIT_WEIGHT 3
107#define PRIO_BONUS_RATIO 25
108#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
109#define INTERACTIVE_DELTA 2
110#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
111#define STARVATION_LIMIT (MAX_SLEEP_AVG)
112#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
113
114/*
115 * If a task is 'interactive' then we reinsert it in the active
116 * array after it has expired its current timeslice. (it will not
117 * continue to run immediately, it will still roundrobin with
118 * other interactive tasks.)
119 *
120 * This part scales the interactivity limit depending on niceness.
121 *
122 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
123 * Here are a few examples of different nice levels:
124 *
125 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
126 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
127 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
128 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
129 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
130 *
131 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
132 * priority range a task can explore, a value of '1' means the
133 * task is rated interactive.)
134 *
135 * Ie. nice +19 tasks can never get 'interactive' enough to be
136 * reinserted into the active array. And only heavily CPU-hog nice -20
137 * tasks will be expired. Default nice 0 tasks are somewhere between,
138 * it takes some effort for them to get interactive, but it's not
139 * too hard.
140 */
141
142#define CURRENT_BONUS(p) \
143 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
144 MAX_SLEEP_AVG)
145
146#define GRANULARITY (10 * HZ / 1000 ? : 1)
147
148#ifdef CONFIG_SMP
149#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
150 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
151 num_online_cpus())
152#else
153#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
154 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
155#endif
156
157#define SCALE(v1,v1_max,v2_max) \
158 (v1) * (v2_max) / (v1_max)
159
160#define DELTA(p) \
161 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
162 INTERACTIVE_DELTA)
163
164#define TASK_INTERACTIVE(p) \
165 ((p)->prio <= (p)->static_prio - DELTA(p))
166
167#define INTERACTIVE_SLEEP(p) \
168 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
169 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
170
171#define TASK_PREEMPTS_CURR(p, rq) \
172 ((p)->prio < (rq)->curr->prio)
173
174#define SCALE_PRIO(x, prio) \
175 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
176
177static unsigned int static_prio_timeslice(int static_prio)
178{
179 if (static_prio < NICE_TO_PRIO(0))
180 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
181 else
182 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
183}
184 112
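The block deleted above is the old interactivity estimator; the TASK_INTERACTIVE() tables quoted in its comment follow mechanically from CURRENT_BONUS() and DELTA(). The sketch below recomputes them: PRIO_BONUS_RATIO, INTERACTIVE_DELTA and the MAX_BONUS formula are the ones being removed, while MAX_USER_PRIO = 40 and the -5..+5 dynamic-priority range are restated here from the era's headers as assumptions.

#include <stdio.h>

/* constants kept by the deleted block above ... */
#define PRIO_BONUS_RATIO        25
#define INTERACTIVE_DELTA       2
/* ... and MAX_USER_PRIO = 40 restated from the era's headers (assumption) */
#define MAX_USER_PRIO           40
#define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)

/* DELTA(p) with TASK_NICE(p) expanded to a plain nice value */
static int delta(int nice)
{
        return (nice + 20) * MAX_BONUS / 40 - 20 * MAX_BONUS / 40
                + INTERACTIVE_DELTA;
}

int main(void)
{
        int nice, bonus;

        /*
         * A task's dynamic priority is static_prio - bonus with bonus in
         * [-5, +5], so TASK_INTERACTIVE() reduces to "bonus >= DELTA(nice)".
         * Printing that predicate from bonus +5 down to -5 reproduces the
         * tables quoted in the removed comment.
         */
        for (nice = -20; nice <= 19; nice++) {
                printf("TASK_INTERACTIVE(%3d): [", nice);
                for (bonus = MAX_BONUS / 2; bonus >= -(MAX_BONUS / 2); bonus--)
                        printf("%d%s", bonus >= delta(nice),
                               bonus > -(MAX_BONUS / 2) ? "," : "");
                printf("]\n");
        }
        return 0;
}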
185#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
186/* 114/*
@@ -203,28 +131,87 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
203} 131}
204#endif 132#endif
205 133
134#define SCALE_PRIO(x, prio) \
135 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
136
206/* 137/*
207 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 138 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
208 * to time slice values: [800ms ... 100ms ... 5ms] 139 * to time slice values: [800ms ... 100ms ... 5ms]
209 *
210 * The higher a thread's priority, the bigger timeslices
211 * it gets during one round of execution. But even the lowest
212 * priority thread gets MIN_TIMESLICE worth of execution time.
213 */ 140 */
141static unsigned int static_prio_timeslice(int static_prio)
142{
143 if (static_prio == NICE_TO_PRIO(19))
144 return 1;
145
146 if (static_prio < NICE_TO_PRIO(0))
147 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
148 else
149 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
150}
151
152static inline int rt_policy(int policy)
153{
154 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
155 return 1;
156 return 0;
157}
214 158
215static inline unsigned int task_timeslice(struct task_struct *p) 159static inline int task_has_rt_policy(struct task_struct *p)
216{ 160{
217 return static_prio_timeslice(p->static_prio); 161 return rt_policy(p->policy);
218} 162}
219 163
220/* 164/*
221 * These are the runqueue data structures: 165 * This is the priority-queue data structure of the RT scheduling class:
222 */ 166 */
167struct rt_prio_array {
168 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
169 struct list_head queue[MAX_RT_PRIO];
170};
171
172struct load_stat {
173 struct load_weight load;
174 u64 load_update_start, load_update_last;
175 unsigned long delta_fair, delta_exec, delta_stat;
176};
177
178/* CFS-related fields in a runqueue */
179struct cfs_rq {
180 struct load_weight load;
181 unsigned long nr_running;
182
183 s64 fair_clock;
184 u64 exec_clock;
185 s64 wait_runtime;
186 u64 sleeper_bonus;
187 unsigned long wait_runtime_overruns, wait_runtime_underruns;
188
189 struct rb_root tasks_timeline;
190 struct rb_node *rb_leftmost;
191 struct rb_node *rb_load_balance_curr;
192#ifdef CONFIG_FAIR_GROUP_SCHED
193 /* 'curr' points to currently running entity on this cfs_rq.
194 * It is set to NULL otherwise (i.e when none are currently running).
195 */
196 struct sched_entity *curr;
197 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
198
199 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
200 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
201 * (like users, containers etc.)
202 *
203 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
204 * list is used during load balance.
205 */
206 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
207#endif
208};
223 209
224struct prio_array { 210/* Real-Time classes' related field in a runqueue: */
225 unsigned int nr_active; 211struct rt_rq {
226 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ 212 struct rt_prio_array active;
227 struct list_head queue[MAX_PRIO]; 213 int rt_load_balance_idx;
214 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
228}; 215};
229 216
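The new cfs_rq keeps runnable entities in an rbtree (tasks_timeline) ordered by each entity's fairness key and caches rb_leftmost so the next entity to run is found without a tree walk. The toy below shows the same keep-it-sorted, cache-the-leftmost idea with an ordered linked list standing in for the rbtree; the names and key values are invented for the sketch, and the real structure gives O(log n) insertion instead of O(n).

#include <stdio.h>
#include <stdlib.h>

struct entity {
        long key;                       /* the entity's fairness key */
        struct entity *next;
};

static struct entity *timeline;         /* head plays the role of rb_leftmost */

static void timeline_enqueue(struct entity *se)
{
        struct entity **pp = &timeline;

        while (*pp && (*pp)->key <= se->key)
                pp = &(*pp)->next;
        se->next = *pp;
        *pp = se;
}

static struct entity *timeline_pick_next(void)
{
        struct entity *se = timeline;   /* cached leftmost: O(1) lookup */

        if (se)
                timeline = se->next;
        return se;
}

int main(void)
{
        long keys[] = { 30, -10, 20 };
        struct entity *se;
        unsigned int i;

        for (i = 0; i < 3; i++) {
                se = calloc(1, sizeof(*se));
                se->key = keys[i];
                timeline_enqueue(se);
        }
        while ((se = timeline_pick_next())) {
                printf("run entity with key %ld\n", se->key);
                free(se);
        }
        return 0;
}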
230/* 217/*
@@ -235,22 +222,28 @@ struct prio_array {
235 * acquire operations must be ordered by ascending &runqueue. 222 * acquire operations must be ordered by ascending &runqueue.
236 */ 223 */
237struct rq { 224struct rq {
238 spinlock_t lock; 225 spinlock_t lock; /* runqueue lock */
239 226
240 /* 227 /*
241 * nr_running and cpu_load should be in the same cacheline because 228 * nr_running and cpu_load should be in the same cacheline because
242 * remote CPUs use both these fields when doing load calculation. 229 * remote CPUs use both these fields when doing load calculation.
243 */ 230 */
244 unsigned long nr_running; 231 unsigned long nr_running;
245 unsigned long raw_weighted_load; 232 #define CPU_LOAD_IDX_MAX 5
246#ifdef CONFIG_SMP 233 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
247 unsigned long cpu_load[3];
248 unsigned char idle_at_tick; 234 unsigned char idle_at_tick;
249#ifdef CONFIG_NO_HZ 235#ifdef CONFIG_NO_HZ
250 unsigned char in_nohz_recently; 236 unsigned char in_nohz_recently;
251#endif 237#endif
238 struct load_stat ls; /* capture load from *all* tasks on this cpu */
239 unsigned long nr_load_updates;
240 u64 nr_switches;
241
242 struct cfs_rq cfs;
243#ifdef CONFIG_FAIR_GROUP_SCHED
244 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
252#endif 245#endif
253 unsigned long long nr_switches; 246 struct rt_rq rt;
254 247
255 /* 248 /*
256 * This is part of a global counter where only the total sum 249 * This is part of a global counter where only the total sum
@@ -260,14 +253,18 @@ struct rq {
260 */ 253 */
261 unsigned long nr_uninterruptible; 254 unsigned long nr_uninterruptible;
262 255
263 unsigned long expired_timestamp;
264 /* Cached timestamp set by update_cpu_clock() */
265 unsigned long long most_recent_timestamp;
266 struct task_struct *curr, *idle; 256 struct task_struct *curr, *idle;
267 unsigned long next_balance; 257 unsigned long next_balance;
268 struct mm_struct *prev_mm; 258 struct mm_struct *prev_mm;
269 struct prio_array *active, *expired, arrays[2]; 259
270 int best_expired_prio; 260 u64 clock, prev_clock_raw;
261 s64 clock_max_delta;
262
263 unsigned int clock_warps, clock_overflows;
264 unsigned int clock_unstable_events;
265
266 struct sched_class *load_balance_class;
267
271 atomic_t nr_iowait; 268 atomic_t nr_iowait;
272 269
273#ifdef CONFIG_SMP 270#ifdef CONFIG_SMP
@@ -307,6 +304,11 @@ struct rq {
307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
308static DEFINE_MUTEX(sched_hotcpu_mutex); 305static DEFINE_MUTEX(sched_hotcpu_mutex);
309 306
307static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
308{
309 rq->curr->sched_class->check_preempt_curr(rq, p);
310}
311
310static inline int cpu_of(struct rq *rq) 312static inline int cpu_of(struct rq *rq)
311{ 313{
312#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
@@ -317,6 +319,52 @@ static inline int cpu_of(struct rq *rq)
317} 319}
318 320
319/* 321/*
322 * Per-runqueue clock, as finegrained as the platform can give us:
323 */
324static unsigned long long __rq_clock(struct rq *rq)
325{
326 u64 prev_raw = rq->prev_clock_raw;
327 u64 now = sched_clock();
328 s64 delta = now - prev_raw;
329 u64 clock = rq->clock;
330
331 /*
332 * Protect against sched_clock() occasionally going backwards:
333 */
334 if (unlikely(delta < 0)) {
335 clock++;
336 rq->clock_warps++;
337 } else {
338 /*
339 * Catch too large forward jumps too:
340 */
341 if (unlikely(delta > 2*TICK_NSEC)) {
342 clock++;
343 rq->clock_overflows++;
344 } else {
345 if (unlikely(delta > rq->clock_max_delta))
346 rq->clock_max_delta = delta;
347 clock += delta;
348 }
349 }
350
351 rq->prev_clock_raw = now;
352 rq->clock = clock;
353
354 return clock;
355}
356
357static inline unsigned long long rq_clock(struct rq *rq)
358{
359 int this_cpu = smp_processor_id();
360
361 if (this_cpu == cpu_of(rq))
362 return __rq_clock(rq);
363
364 return rq->clock;
365}
366
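__rq_clock() filters a possibly non-monotonic sched_clock(): a backward jump or a jump larger than 2*TICK_NSEC advances the per-runqueue clock by a single nanosecond and bumps a counter, while ordinary deltas are accumulated. The user-space sketch below applies the same filter to clock_gettime(); the 2 ms cap stands in for 2*TICK_NSEC and is an assumption, and the clock_max_delta bookkeeping is dropped.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define MAX_DELTA_NS (2 * 1000 * 1000ULL)       /* stand-in for 2*TICK_NSEC */

struct filtered_clock {
        uint64_t clock;         /* monotonic, filtered value */
        uint64_t prev_raw;      /* last raw reading */
        unsigned warps;         /* raw source went backwards */
        unsigned overflows;     /* raw source jumped too far forward */
};

static uint64_t raw_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static uint64_t filtered_read(struct filtered_clock *c)
{
        uint64_t now = raw_ns();
        int64_t delta = (int64_t)(now - c->prev_raw);

        if (delta < 0) {
                c->clock++;             /* clamp backward jumps */
                c->warps++;
        } else if ((uint64_t)delta > MAX_DELTA_NS) {
                c->clock++;             /* clamp huge forward jumps */
                c->overflows++;
        } else {
                c->clock += delta;
        }
        c->prev_raw = now;
        return c->clock;
}

int main(void)
{
        struct filtered_clock c = { .prev_raw = raw_ns() };
        int i;

        for (i = 0; i < 3; i++)
                printf("filtered clock: %llu ns\n",
                       (unsigned long long)filtered_read(&c));
        return 0;
}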
367/*
320 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 368 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
321 * See detach_destroy_domains: synchronize_sched for details. 369 * See detach_destroy_domains: synchronize_sched for details.
322 * 370 *
@@ -331,6 +379,18 @@ static inline int cpu_of(struct rq *rq)
331#define task_rq(p) cpu_rq(task_cpu(p)) 379#define task_rq(p) cpu_rq(task_cpu(p))
332#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 380#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
333 381
382#ifdef CONFIG_FAIR_GROUP_SCHED
383/* Change a task's ->cfs_rq if it moves across CPUs */
384static inline void set_task_cfs_rq(struct task_struct *p)
385{
386 p->se.cfs_rq = &task_rq(p)->cfs;
387}
388#else
389static inline void set_task_cfs_rq(struct task_struct *p)
390{
391}
392#endif
393
334#ifndef prepare_arch_switch 394#ifndef prepare_arch_switch
335# define prepare_arch_switch(next) do { } while (0) 395# define prepare_arch_switch(next) do { } while (0)
336#endif 396#endif
@@ -460,134 +520,6 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
460 spin_unlock_irqrestore(&rq->lock, *flags); 520 spin_unlock_irqrestore(&rq->lock, *flags);
461} 521}
462 522
463#ifdef CONFIG_SCHEDSTATS
464/*
465 * bump this up when changing the output format or the meaning of an existing
466 * format, so that tools can adapt (or abort)
467 */
468#define SCHEDSTAT_VERSION 14
469
470static int show_schedstat(struct seq_file *seq, void *v)
471{
472 int cpu;
473
474 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
475 seq_printf(seq, "timestamp %lu\n", jiffies);
476 for_each_online_cpu(cpu) {
477 struct rq *rq = cpu_rq(cpu);
478#ifdef CONFIG_SMP
479 struct sched_domain *sd;
480 int dcnt = 0;
481#endif
482
483 /* runqueue-specific stats */
484 seq_printf(seq,
485 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
486 cpu, rq->yld_both_empty,
487 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
488 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
489 rq->ttwu_cnt, rq->ttwu_local,
490 rq->rq_sched_info.cpu_time,
491 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
492
493 seq_printf(seq, "\n");
494
495#ifdef CONFIG_SMP
496 /* domain-specific stats */
497 preempt_disable();
498 for_each_domain(cpu, sd) {
499 enum idle_type itype;
500 char mask_str[NR_CPUS];
501
502 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
503 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
504 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
505 itype++) {
506 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
507 "%lu",
508 sd->lb_cnt[itype],
509 sd->lb_balanced[itype],
510 sd->lb_failed[itype],
511 sd->lb_imbalance[itype],
512 sd->lb_gained[itype],
513 sd->lb_hot_gained[itype],
514 sd->lb_nobusyq[itype],
515 sd->lb_nobusyg[itype]);
516 }
517 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
518 " %lu %lu %lu\n",
519 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
520 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
521 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
522 sd->ttwu_wake_remote, sd->ttwu_move_affine,
523 sd->ttwu_move_balance);
524 }
525 preempt_enable();
526#endif
527 }
528 return 0;
529}
530
531static int schedstat_open(struct inode *inode, struct file *file)
532{
533 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
534 char *buf = kmalloc(size, GFP_KERNEL);
535 struct seq_file *m;
536 int res;
537
538 if (!buf)
539 return -ENOMEM;
540 res = single_open(file, show_schedstat, NULL);
541 if (!res) {
542 m = file->private_data;
543 m->buf = buf;
544 m->size = size;
545 } else
546 kfree(buf);
547 return res;
548}
549
550const struct file_operations proc_schedstat_operations = {
551 .open = schedstat_open,
552 .read = seq_read,
553 .llseek = seq_lseek,
554 .release = single_release,
555};
556
557/*
558 * Expects runqueue lock to be held for atomicity of update
559 */
560static inline void
561rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
562{
563 if (rq) {
564 rq->rq_sched_info.run_delay += delta_jiffies;
565 rq->rq_sched_info.pcnt++;
566 }
567}
568
569/*
570 * Expects runqueue lock to be held for atomicity of update
571 */
572static inline void
573rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
574{
575 if (rq)
576 rq->rq_sched_info.cpu_time += delta_jiffies;
577}
578# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
579# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
580#else /* !CONFIG_SCHEDSTATS */
581static inline void
582rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
583{}
584static inline void
585rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
586{}
587# define schedstat_inc(rq, field) do { } while (0)
588# define schedstat_add(rq, field, amt) do { } while (0)
589#endif
590
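The show_schedstat() body being moved out of sched.c prints a version line, a timestamp line, and then one "cpu%d" line per online CPU carrying twelve runqueue counters (plus per-domain lines under SMP). The reader below parses only the cpu lines of that version-14 layout; as the comment above says, tools should really check the version line first, which this sketch omits for brevity.

#include <stdio.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/schedstat", "r");

        if (!f) {
                perror("/proc/schedstat");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                unsigned long v[12];
                int cpu;

                if (sscanf(line,
                           "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
                           &cpu, &v[0], &v[1], &v[2], &v[3], &v[4], &v[5],
                           &v[6], &v[7], &v[8], &v[9], &v[10], &v[11]) == 13) {
                        /* counters 5, 10, 11: sched_cnt, run_delay, pcnt */
                        printf("cpu%d: sched_cnt=%lu run_delay=%lu pcnt=%lu\n",
                               cpu, v[5], v[10], v[11]);
                }
        }
        fclose(f);
        return 0;
}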
591/* 523/*
592 * this_rq_lock - lock this runqueue and disable interrupts. 524 * this_rq_lock - lock this runqueue and disable interrupts.
593 */ 525 */
@@ -603,177 +535,172 @@ static inline struct rq *this_rq_lock(void)
603 return rq; 535 return rq;
604} 536}
605 537
606#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
607/* 538/*
608 * Called when a process is dequeued from the active array and given 539 * CPU frequency is/was unstable - start new by setting prev_clock_raw:
609 * the cpu. We should note that with the exception of interactive
610 * tasks, the expired queue will become the active queue after the active
611 * queue is empty, without explicitly dequeuing and requeuing tasks in the
612 * expired queue. (Interactive tasks may be requeued directly to the
613 * active queue, thus delaying tasks in the expired queue from running;
614 * see scheduler_tick()).
615 *
616 * This function is only called from sched_info_arrive(), rather than
617 * dequeue_task(). Even though a task may be queued and dequeued multiple
618 * times as it is shuffled about, we're really interested in knowing how
619 * long it was from the *first* time it was queued to the time that it
620 * finally hit a cpu.
621 */ 540 */
622static inline void sched_info_dequeued(struct task_struct *t) 541void sched_clock_unstable_event(void)
623{ 542{
624 t->sched_info.last_queued = 0; 543 unsigned long flags;
544 struct rq *rq;
545
546 rq = task_rq_lock(current, &flags);
547 rq->prev_clock_raw = sched_clock();
548 rq->clock_unstable_events++;
549 task_rq_unlock(rq, &flags);
625} 550}
626 551
627/* 552/*
628 * Called when a task finally hits the cpu. We can now calculate how 553 * resched_task - mark a task 'to be rescheduled now'.
629 * long it was waiting to run. We also note when it began so that we 554 *
630 * can keep stats on how long its timeslice is. 555 * On UP this means the setting of the need_resched flag, on SMP it
556 * might also involve a cross-CPU call to trigger the scheduler on
557 * the target CPU.
631 */ 558 */
632static void sched_info_arrive(struct task_struct *t) 559#ifdef CONFIG_SMP
560
561#ifndef tsk_is_polling
562#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
563#endif
564
565static void resched_task(struct task_struct *p)
633{ 566{
634 unsigned long now = jiffies, delta_jiffies = 0; 567 int cpu;
568
569 assert_spin_locked(&task_rq(p)->lock);
570
571 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
572 return;
573
574 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
635 575
636 if (t->sched_info.last_queued) 576 cpu = task_cpu(p);
637 delta_jiffies = now - t->sched_info.last_queued; 577 if (cpu == smp_processor_id())
638 sched_info_dequeued(t); 578 return;
639 t->sched_info.run_delay += delta_jiffies;
640 t->sched_info.last_arrival = now;
641 t->sched_info.pcnt++;
642 579
643 rq_sched_info_arrive(task_rq(t), delta_jiffies); 580 /* NEED_RESCHED must be visible before we test polling */
581 smp_mb();
582 if (!tsk_is_polling(p))
583 smp_send_reschedule(cpu);
644} 584}
645 585
646/* 586static void resched_cpu(int cpu)
647 * Called when a process is queued into either the active or expired 587{
648 * array. The time is noted and later used to determine how long we 588 struct rq *rq = cpu_rq(cpu);
649 * had to wait for us to reach the cpu. Since the expired queue will 589 unsigned long flags;
650 * become the active queue after active queue is empty, without dequeuing 590
651 * and requeuing any tasks, we are interested in queuing to either. It 591 if (!spin_trylock_irqsave(&rq->lock, flags))
652 * is unusual but not impossible for tasks to be dequeued and immediately 592 return;
653 * requeued in the same or another array: this can happen in sched_yield(), 593 resched_task(cpu_curr(cpu));
654 * set_user_nice(), and even load_balance() as it moves tasks from runqueue 594 spin_unlock_irqrestore(&rq->lock, flags);
655 * to runqueue. 595}
656 * 596#else
657 * This function is only called from enqueue_task(), but also only updates 597static inline void resched_task(struct task_struct *p)
658 * the timestamp if it is already not set. It's assumed that
659 * sched_info_dequeued() will clear that stamp when appropriate.
660 */
661static inline void sched_info_queued(struct task_struct *t)
662{ 598{
663 if (unlikely(sched_info_on())) 599 assert_spin_locked(&task_rq(p)->lock);
664 if (!t->sched_info.last_queued) 600 set_tsk_need_resched(p);
665 t->sched_info.last_queued = jiffies;
666} 601}
602#endif
667 603
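The relocated resched_task() sets TIF_NEED_RESCHED, issues smp_mb(), and only then tests tsk_is_polling() to decide whether the cross-CPU IPI can be skipped: the flag must be globally visible before the polling test, or a polling idle CPU could miss it. The user-space analogy below (pthreads plus C11 atomics, compile with -pthread) mirrors that ordering; the thread roles are illustrative, not kernel code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool need_resched;
static atomic_bool polling;

/* TIF_POLLING_NRFLAG analogue: advertise polling, then spin on the flag */
static void *polling_idle(void *arg)
{
        (void)arg;
        atomic_store(&polling, true);
        while (!atomic_load(&need_resched))
                ;       /* busy-poll instead of sleeping */
        atomic_store(&polling, false);
        printf("polling side saw need_resched, no kick was needed\n");
        return NULL;
}

static void resched_target(void)
{
        atomic_store(&need_resched, true);
        /* NEED_RESCHED must be visible before we test polling (the smp_mb()) */
        atomic_thread_fence(memory_order_seq_cst);
        if (!atomic_load(&polling))
                printf("target not polling: would send a cross-CPU IPI here\n");
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, polling_idle, NULL);
        usleep(1000);           /* give the poller a moment to start */
        resched_target();
        pthread_join(t, NULL);
        return 0;
}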
668/* 604static u64 div64_likely32(u64 divident, unsigned long divisor)
669 * Called when a process ceases being the active-running process, either
670 * voluntarily or involuntarily. Now we can calculate how long we ran.
671 */
672static inline void sched_info_depart(struct task_struct *t)
673{ 605{
674 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; 606#if BITS_PER_LONG == 32
607 if (likely(divident <= 0xffffffffULL))
608 return (u32)divident / divisor;
609 do_div(divident, divisor);
675 610
676 t->sched_info.cpu_time += delta_jiffies; 611 return divident;
677 rq_sched_info_depart(task_rq(t), delta_jiffies); 612#else
613 return divident / divisor;
614#endif
678} 615}
679 616
680/* 617#if BITS_PER_LONG == 32
681 * Called when tasks are switched involuntarily due, typically, to expiring 618# define WMULT_CONST (~0UL)
682 * their time slice. (This may also be called when switching to or from 619#else
683 * the idle task.) We are only called when prev != next. 620# define WMULT_CONST (1UL << 32)
684 */ 621#endif
685static inline void 622
686__sched_info_switch(struct task_struct *prev, struct task_struct *next) 623#define WMULT_SHIFT 32
624
625static inline unsigned long
626calc_delta_mine(unsigned long delta_exec, unsigned long weight,
627 struct load_weight *lw)
687{ 628{
688 struct rq *rq = task_rq(prev); 629 u64 tmp;
689 630
631 if (unlikely(!lw->inv_weight))
632 lw->inv_weight = WMULT_CONST / lw->weight;
633
634 tmp = (u64)delta_exec * weight;
690 /* 635 /*
691 * prev now departs the cpu. It's not interesting to record 636 * Check whether we'd overflow the 64-bit multiplication:
692 * stats about how efficient we were at scheduling the idle
693 * process, however.
694 */ 637 */
695 if (prev != rq->idle) 638 if (unlikely(tmp > WMULT_CONST)) {
696 sched_info_depart(prev); 639 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
640 >> (WMULT_SHIFT/2);
641 } else {
642 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
643 }
697 644
698 if (next != rq->idle) 645 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
699 sched_info_arrive(next);
700}
701static inline void
702sched_info_switch(struct task_struct *prev, struct task_struct *next)
703{
704 if (unlikely(sched_info_on()))
705 __sched_info_switch(prev, next);
706} 646}
707#else
708#define sched_info_queued(t) do { } while (0)
709#define sched_info_switch(t, next) do { } while (0)
710#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
711 647
712/* 648static inline unsigned long
713 * Adding/removing a task to/from a priority array: 649calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
714 */
715static void dequeue_task(struct task_struct *p, struct prio_array *array)
716{ 650{
717 array->nr_active--; 651 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
718 list_del(&p->run_list);
719 if (list_empty(array->queue + p->prio))
720 __clear_bit(p->prio, array->bitmap);
721} 652}
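calc_delta_mine() scales delta_exec by weight/lw->weight without a 64-bit division: it caches inv_weight = 2^32/weight and multiplies, splitting the 32-bit shift in two when the intermediate product would overflow; calc_delta_fair() is simply the NICE_0_LOAD case. The standalone version below keeps that arithmetic but drops the sysctl_sched_runtime_limit clamp of the kernel code; the example weights assume NICE_0_LOAD = 1024.

#include <stdint.h>
#include <stdio.h>

#define WMULT_CONST (1ULL << 32)
#define WMULT_SHIFT 32

struct load_weight {
        unsigned long weight;
        unsigned long inv_weight;       /* WMULT_CONST / weight, cached */
};

static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
                struct load_weight *lw)
{
        uint64_t tmp;

        if (!lw->inv_weight)
                lw->inv_weight = WMULT_CONST / lw->weight;

        tmp = (uint64_t)delta_exec * weight;
        if (tmp > WMULT_CONST) {
                /* split the shift so the multiply cannot overflow 64 bits */
                tmp = ((tmp >> (WMULT_SHIFT / 2)) * lw->inv_weight)
                        >> (WMULT_SHIFT / 2);
        } else {
                tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
        }
        return (unsigned long)tmp;
}

int main(void)
{
        struct load_weight lw = { .weight = 3072 };     /* e.g. three nice-0 tasks */

        /* 1 ms of wall-clock execution scaled by 1024/3072: about 333333 ns */
        printf("scaled delta = %lu ns\n",
               calc_delta_mine(1000000UL, 1024UL, &lw));
        return 0;
}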
722 653
723static void enqueue_task(struct task_struct *p, struct prio_array *array) 654static void update_load_add(struct load_weight *lw, unsigned long inc)
724{ 655{
725 sched_info_queued(p); 656 lw->weight += inc;
726 list_add_tail(&p->run_list, array->queue + p->prio); 657 lw->inv_weight = 0;
727 __set_bit(p->prio, array->bitmap);
728 array->nr_active++;
729 p->array = array;
730} 658}
731 659
732/* 660static void update_load_sub(struct load_weight *lw, unsigned long dec)
733 * Put task to the end of the run list without the overhead of dequeue
734 * followed by enqueue.
735 */
736static void requeue_task(struct task_struct *p, struct prio_array *array)
737{ 661{
738 list_move_tail(&p->run_list, array->queue + p->prio); 662 lw->weight -= dec;
663 lw->inv_weight = 0;
739} 664}
740 665
741static inline void 666static void __update_curr_load(struct rq *rq, struct load_stat *ls)
742enqueue_task_head(struct task_struct *p, struct prio_array *array)
743{ 667{
744 list_add(&p->run_list, array->queue + p->prio); 668 if (rq->curr != rq->idle && ls->load.weight) {
745 __set_bit(p->prio, array->bitmap); 669 ls->delta_exec += ls->delta_stat;
746 array->nr_active++; 670 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
747 p->array = array; 671 ls->delta_stat = 0;
672 }
748} 673}
749 674
750/* 675/*
751 * __normal_prio - return the priority that is based on the static 676 * Update delta_exec, delta_fair fields for rq.
752 * priority but is modified by bonuses/penalties.
753 * 677 *
754 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 678 * delta_fair clock advances at a rate inversely proportional to
755 * into the -5 ... 0 ... +5 bonus/penalty range. 679 * total load (rq->ls.load.weight) on the runqueue, while
680 * delta_exec advances at the same rate as wall-clock (provided
681 * cpu is not idle).
756 * 682 *
757 * We use 25% of the full 0...39 priority range so that: 683 * delta_exec / delta_fair is a measure of the (smoothened) load on this
684 * runqueue over any given interval. This (smoothened) load is used
685 * during load balance.
758 * 686 *
759 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 687 * This function is called /before/ updating rq->ls.load
760 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 688 * and when switching tasks.
761 *
762 * Both properties are important to certain workloads.
763 */ 689 */
764 690static void update_curr_load(struct rq *rq, u64 now)
765static inline int __normal_prio(struct task_struct *p)
766{ 691{
767 int bonus, prio; 692 struct load_stat *ls = &rq->ls;
768 693 u64 start;
769 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
770 694
771 prio = p->static_prio - bonus; 695 start = ls->load_update_start;
772 if (prio < MAX_RT_PRIO) 696 ls->load_update_start = now;
773 prio = MAX_RT_PRIO; 697 ls->delta_stat += now - start;
774 if (prio > MAX_PRIO-1) 698 /*
775 prio = MAX_PRIO-1; 699 * Stagger updates to ls->delta_fair. Very frequent updates
776 return prio; 700 * can be expensive.
701 */
702 if (ls->delta_stat >= sysctl_sched_stat_granularity)
703 __update_curr_load(rq, ls);
777} 704}
778 705
779/* 706/*
@@ -791,53 +718,155 @@ static inline int __normal_prio(struct task_struct *p)
791 * this code will need modification 718 * this code will need modification
792 */ 719 */
793#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE 720#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
794#define LOAD_WEIGHT(lp) \ 721#define load_weight(lp) \
795 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) 722 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
796#define PRIO_TO_LOAD_WEIGHT(prio) \ 723#define PRIO_TO_LOAD_WEIGHT(prio) \
797 LOAD_WEIGHT(static_prio_timeslice(prio)) 724 load_weight(static_prio_timeslice(prio))
798#define RTPRIO_TO_LOAD_WEIGHT(rp) \ 725#define RTPRIO_TO_LOAD_WEIGHT(rp) \
799 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) 726 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
800 727
801static void set_load_weight(struct task_struct *p) 728#define WEIGHT_IDLEPRIO 2
802{ 729#define WMULT_IDLEPRIO (1 << 31)
803 if (has_rt_policy(p)) { 730
804#ifdef CONFIG_SMP 731/*
805 if (p == task_rq(p)->migration_thread) 732 * Nice levels are multiplicative, with a gentle 10% change for every
806 /* 733 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
807 * The migration thread does the actual balancing. 734 * nice 1, it will get ~10% less CPU time than another CPU-bound task
808 * Giving its load any weight will skew balancing 735 * that remained on nice 0.
809 * adversely. 736 *
810 */ 737 * The "10% effect" is relative and cumulative: from _any_ nice level,
811 p->load_weight = 0; 738 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
812 else 739 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
813#endif 740 * If a task goes up by ~10% and another task goes down by ~10% then
814 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); 741 * the relative distance between them is ~25%.)
815 } else 742 */
816 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); 743static const int prio_to_weight[40] = {
817} 744/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
745/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
746/* 0 */ NICE_0_LOAD /* 1024 */,
747/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
748/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
749};
750
751/*
752 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
753 *
754 * In cases where the weight does not change often, we can use the
755 * precalculated inverse to speed up arithmetics by turning divisions
756 * into multiplications:
757 */
758static const u32 prio_to_wmult[40] = {
759/* -20 */ 48356, 60446, 75558, 94446, 118058,
760/* -15 */ 147573, 184467, 230589, 288233, 360285,
761/* -10 */ 450347, 562979, 703746, 879575, 1099582,
762/* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443,
763/* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518,
764/* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126,
765/* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717,
766/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
767};
818 768
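With prio_to_weight[] each nice step changes the weight by roughly 1.25x, and a runnable task's share of a CPU is just its weight divided by the sum of runnable weights, which is where the "10% effect" described above comes from. The sketch below copies the table (with NICE_0_LOAD expanded to 1024, as the inline comment notes) and prints a couple of two-task splits.

#include <stdio.h>

/* prio_to_weight[] from above, NICE_0_LOAD written out as 1024 */
static const int prio_to_weight[40] = {
        /* -20 */ 88818, 71054, 56843, 45475, 36380,
        /* -15 */ 29104, 23283, 18626, 14901, 11921,
        /* -10 */  9537,  7629,  6103,  4883,  3906,
        /*  -5 */  3125,  2500,  2000,  1600,  1280,
        /*   0 */  1024,   819,   655,   524,   419,
        /*   5 */   336,   268,   215,   172,   137,
        /*  10 */   110,    87,    70,    56,    45,
        /*  15 */    36,    29,    23,    18,    15,
};

static double share(int nice_a, int nice_b)
{
        double wa = prio_to_weight[nice_a + 20];
        double wb = prio_to_weight[nice_b + 20];

        return 100.0 * wa / (wa + wb);
}

int main(void)
{
        /* one level apart: roughly a 55%/45% split of the CPU */
        printf("nice 0 vs nice 1: %.1f%% vs %.1f%%\n", share(0, 1), share(1, 0));
        /* five levels apart: roughly 75%/25% */
        printf("nice 0 vs nice 5: %.1f%% vs %.1f%%\n", share(0, 5), share(5, 0));
        return 0;
}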
819static inline void 769static inline void
820inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) 770inc_load(struct rq *rq, const struct task_struct *p, u64 now)
821{ 771{
822 rq->raw_weighted_load += p->load_weight; 772 update_curr_load(rq, now);
773 update_load_add(&rq->ls.load, p->se.load.weight);
823} 774}
824 775
825static inline void 776static inline void
826dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) 777dec_load(struct rq *rq, const struct task_struct *p, u64 now)
827{ 778{
828 rq->raw_weighted_load -= p->load_weight; 779 update_curr_load(rq, now);
780 update_load_sub(&rq->ls.load, p->se.load.weight);
829} 781}
830 782
831static inline void inc_nr_running(struct task_struct *p, struct rq *rq) 783static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
832{ 784{
833 rq->nr_running++; 785 rq->nr_running++;
834 inc_raw_weighted_load(rq, p); 786 inc_load(rq, p, now);
835} 787}
836 788
837static inline void dec_nr_running(struct task_struct *p, struct rq *rq) 789static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
838{ 790{
839 rq->nr_running--; 791 rq->nr_running--;
840 dec_raw_weighted_load(rq, p); 792 dec_load(rq, p, now);
793}
794
795static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
796
797/*
798 * runqueue iterator, to support SMP load-balancing between different
799 * scheduling classes, without having to expose their internal data
800 * structures to the load-balancing proper:
801 */
802struct rq_iterator {
803 void *arg;
804 struct task_struct *(*start)(void *);
805 struct task_struct *(*next)(void *);
806};
807
808static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
809 unsigned long max_nr_move, unsigned long max_load_move,
810 struct sched_domain *sd, enum cpu_idle_type idle,
811 int *all_pinned, unsigned long *load_moved,
812 int this_best_prio, int best_prio, int best_prio_seen,
813 struct rq_iterator *iterator);
814
815#include "sched_stats.h"
816#include "sched_rt.c"
817#include "sched_fair.c"
818#include "sched_idletask.c"
819#ifdef CONFIG_SCHED_DEBUG
820# include "sched_debug.c"
821#endif
822
823#define sched_class_highest (&rt_sched_class)
824
825static void set_load_weight(struct task_struct *p)
826{
827 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
828 p->se.wait_runtime = 0;
829
830 if (task_has_rt_policy(p)) {
831 p->se.load.weight = prio_to_weight[0] * 2;
832 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
833 return;
834 }
835
836 /*
837 * SCHED_IDLE tasks get minimal weight:
838 */
839 if (p->policy == SCHED_IDLE) {
840 p->se.load.weight = WEIGHT_IDLEPRIO;
841 p->se.load.inv_weight = WMULT_IDLEPRIO;
842 return;
843 }
844
845 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
846 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
847}
848
849static void
850enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
851{
852 sched_info_queued(p);
853 p->sched_class->enqueue_task(rq, p, wakeup, now);
854 p->se.on_rq = 1;
855}
856
857static void
858dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
859{
860 p->sched_class->dequeue_task(rq, p, sleep, now);
861 p->se.on_rq = 0;
862}
863
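enqueue_task() and dequeue_task() now only forward to p->sched_class, the classes themselves arrive via the #include of sched_rt.c, sched_fair.c and sched_idletask.c, and sched_class_highest anchors the priority-ordered chain at the RT class. The miniature below shows that vtable-plus-class-chain dispatch pattern with two invented single-slot classes; the struct layout is illustrative, not the kernel's struct sched_class.

#include <stdio.h>

struct task;

struct sched_class {
        const struct sched_class *next;         /* next lower-priority class */
        void (*enqueue_task)(struct task *p);
        struct task *(*pick_next_task)(void);
};

struct task {
        const char *name;
        const struct sched_class *sched_class;
};

/* a toy "rt" class holding at most one task */
static struct task *rt_slot;
static void rt_enqueue(struct task *p) { rt_slot = p; }
static struct task *rt_pick(void)
{
        struct task *p = rt_slot;
        rt_slot = NULL;
        return p;
}

/* a toy "fair" class holding at most one task */
static struct task *fair_slot;
static void fair_enqueue(struct task *p) { fair_slot = p; }
static struct task *fair_pick(void)
{
        struct task *p = fair_slot;
        fair_slot = NULL;
        return p;
}

static const struct sched_class fair_sched_class = {
        .next = NULL, .enqueue_task = fair_enqueue, .pick_next_task = fair_pick,
};
static const struct sched_class rt_sched_class = {
        .next = &fair_sched_class, .enqueue_task = rt_enqueue, .pick_next_task = rt_pick,
};

#define sched_class_highest (&rt_sched_class)

/* the core only ever calls through the class pointer */
static void enqueue_task(struct task *p)
{
        p->sched_class->enqueue_task(p);
}

static struct task *pick_next_task(void)
{
        const struct sched_class *class;

        for (class = sched_class_highest; class; class = class->next) {
                struct task *p = class->pick_next_task();
                if (p)
                        return p;
        }
        return NULL;
}

int main(void)
{
        struct task a = { "fair task", &fair_sched_class };
        struct task b = { "rt task", &rt_sched_class };

        enqueue_task(&a);
        enqueue_task(&b);
        printf("next: %s\n", pick_next_task()->name);   /* the rt class wins */
        printf("next: %s\n", pick_next_task()->name);
        return 0;
}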
864/*
865 * __normal_prio - return the priority that is based on the static prio
866 */
867static inline int __normal_prio(struct task_struct *p)
868{
869 return p->static_prio;
841} 870}
842 871
843/* 872/*
@@ -851,7 +880,7 @@ static inline int normal_prio(struct task_struct *p)
851{ 880{
852 int prio; 881 int prio;
853 882
854 if (has_rt_policy(p)) 883 if (task_has_rt_policy(p))
855 prio = MAX_RT_PRIO-1 - p->rt_priority; 884 prio = MAX_RT_PRIO-1 - p->rt_priority;
856 else 885 else
857 prio = __normal_prio(p); 886 prio = __normal_prio(p);
@@ -879,221 +908,46 @@ static int effective_prio(struct task_struct *p)
879} 908}
880 909
881/* 910/*
882 * __activate_task - move a task to the runqueue. 911 * activate_task - move a task to the runqueue.
883 */
884static void __activate_task(struct task_struct *p, struct rq *rq)
885{
886 struct prio_array *target = rq->active;
887
888 if (batch_task(p))
889 target = rq->expired;
890 enqueue_task(p, target);
891 inc_nr_running(p, rq);
892}
893
894/*
895 * __activate_idle_task - move idle task to the _front_ of runqueue.
896 */
897static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
898{
899 enqueue_task_head(p, rq->active);
900 inc_nr_running(p, rq);
901}
902
903/*
904 * Recalculate p->normal_prio and p->prio after having slept,
905 * updating the sleep-average too:
906 */ 912 */
907static int recalc_task_prio(struct task_struct *p, unsigned long long now) 913static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
908{ 914{
909 /* Caller must always ensure 'now >= p->timestamp' */ 915 u64 now = rq_clock(rq);
910 unsigned long sleep_time = now - p->timestamp;
911 916
912 if (batch_task(p)) 917 if (p->state == TASK_UNINTERRUPTIBLE)
913 sleep_time = 0; 918 rq->nr_uninterruptible--;
914
915 if (likely(sleep_time > 0)) {
916 /*
917 * This ceiling is set to the lowest priority that would allow
918 * a task to be reinserted into the active array on timeslice
919 * completion.
920 */
921 unsigned long ceiling = INTERACTIVE_SLEEP(p);
922
923 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
924 /*
925 * Prevents user tasks from achieving best priority
926 * with one single large enough sleep.
927 */
928 p->sleep_avg = ceiling;
929 /*
930 * Using INTERACTIVE_SLEEP() as a ceiling places a
931 * nice(0) task 1ms sleep away from promotion, and
932 * gives it 700ms to round-robin with no chance of
933 * being demoted. This is more than generous, so
934 * mark this sleep as non-interactive to prevent the
935 * on-runqueue bonus logic from intervening should
936 * this task not receive cpu immediately.
937 */
938 p->sleep_type = SLEEP_NONINTERACTIVE;
939 } else {
940 /*
941 * Tasks waking from uninterruptible sleep are
942 * limited in their sleep_avg rise as they
943 * are likely to be waiting on I/O
944 */
945 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
946 if (p->sleep_avg >= ceiling)
947 sleep_time = 0;
948 else if (p->sleep_avg + sleep_time >=
949 ceiling) {
950 p->sleep_avg = ceiling;
951 sleep_time = 0;
952 }
953 }
954
955 /*
956 * This code gives a bonus to interactive tasks.
957 *
958 * The boost works by updating the 'average sleep time'
959 * value here, based on ->timestamp. The more time a
960 * task spends sleeping, the higher the average gets -
961 * and the higher the priority boost gets as well.
962 */
963 p->sleep_avg += sleep_time;
964
965 }
966 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
967 p->sleep_avg = NS_MAX_SLEEP_AVG;
968 }
969 919
970 return effective_prio(p); 920 enqueue_task(rq, p, wakeup, now);
921 inc_nr_running(p, rq, now);
971} 922}
972 923
973/* 924/*
974 * activate_task - move a task to the runqueue and do priority recalculation 925 * activate_idle_task - move idle task to the _front_ of runqueue.
975 *
976 * Update all the scheduling statistics stuff. (sleep average
977 * calculation, priority modifiers, etc.)
978 */ 926 */
979static void activate_task(struct task_struct *p, struct rq *rq, int local) 927static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
980{ 928{
981 unsigned long long now; 929 u64 now = rq_clock(rq);
982
983 if (rt_task(p))
984 goto out;
985
986 now = sched_clock();
987#ifdef CONFIG_SMP
988 if (!local) {
989 /* Compensate for drifting sched_clock */
990 struct rq *this_rq = this_rq();
991 now = (now - this_rq->most_recent_timestamp)
992 + rq->most_recent_timestamp;
993 }
994#endif
995
996 /*
997 * Sleep time is in units of nanosecs, so shift by 20 to get a
998 * milliseconds-range estimation of the amount of time that the task
999 * spent sleeping:
1000 */
1001 if (unlikely(prof_on == SLEEP_PROFILING)) {
1002 if (p->state == TASK_UNINTERRUPTIBLE)
1003 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
1004 (now - p->timestamp) >> 20);
1005 }
1006 930
1007 p->prio = recalc_task_prio(p, now); 931 if (p->state == TASK_UNINTERRUPTIBLE)
932 rq->nr_uninterruptible--;
1008 933
1009 /* 934 enqueue_task(rq, p, 0, now);
1010 * This checks to make sure it's not an uninterruptible task 935 inc_nr_running(p, rq, now);
1011 * that is now waking up.
1012 */
1013 if (p->sleep_type == SLEEP_NORMAL) {
1014 /*
1015 * Tasks which were woken up by interrupts (ie. hw events)
1016 * are most likely of interactive nature. So we give them
1017 * the credit of extending their sleep time to the period
1018 * of time they spend on the runqueue, waiting for execution
1019 * on a CPU, first time around:
1020 */
1021 if (in_interrupt())
1022 p->sleep_type = SLEEP_INTERRUPTED;
1023 else {
1024 /*
1025 * Normal first-time wakeups get a credit too for
1026 * on-runqueue time, but it will be weighted down:
1027 */
1028 p->sleep_type = SLEEP_INTERACTIVE;
1029 }
1030 }
1031 p->timestamp = now;
1032out:
1033 __activate_task(p, rq);
1034} 936}
1035 937
1036/* 938/*
1037 * deactivate_task - remove a task from the runqueue. 939 * deactivate_task - remove a task from the runqueue.
1038 */ 940 */
1039static void deactivate_task(struct task_struct *p, struct rq *rq) 941static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1040{
1041 dec_nr_running(p, rq);
1042 dequeue_task(p, p->array);
1043 p->array = NULL;
1044}
1045
1046/*
1047 * resched_task - mark a task 'to be rescheduled now'.
1048 *
1049 * On UP this means the setting of the need_resched flag, on SMP it
1050 * might also involve a cross-CPU call to trigger the scheduler on
1051 * the target CPU.
1052 */
1053#ifdef CONFIG_SMP
1054
1055#ifndef tsk_is_polling
1056#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1057#endif
1058
1059static void resched_task(struct task_struct *p)
1060{ 942{
1061 int cpu; 943 u64 now = rq_clock(rq);
1062
1063 assert_spin_locked(&task_rq(p)->lock);
1064
1065 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1066 return;
1067
1068 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1069
1070 cpu = task_cpu(p);
1071 if (cpu == smp_processor_id())
1072 return;
1073
1074 /* NEED_RESCHED must be visible before we test polling */
1075 smp_mb();
1076 if (!tsk_is_polling(p))
1077 smp_send_reschedule(cpu);
1078}
1079 944
1080static void resched_cpu(int cpu) 945 if (p->state == TASK_UNINTERRUPTIBLE)
1081{ 946 rq->nr_uninterruptible++;
1082 struct rq *rq = cpu_rq(cpu);
1083 unsigned long flags;
1084 947
1085 if (!spin_trylock_irqsave(&rq->lock, flags)) 948 dequeue_task(rq, p, sleep, now);
1086 return; 949 dec_nr_running(p, rq, now);
1087 resched_task(cpu_curr(cpu));
1088 spin_unlock_irqrestore(&rq->lock, flags);
1089}
1090#else
1091static inline void resched_task(struct task_struct *p)
1092{
1093 assert_spin_locked(&task_rq(p)->lock);
1094 set_tsk_need_resched(p);
1095} 950}
1096#endif
1097 951
1098/** 952/**
1099 * task_curr - is this task currently executing on a CPU? 953 * task_curr - is this task currently executing on a CPU?
@@ -1107,10 +961,42 @@ inline int task_curr(const struct task_struct *p)
1107/* Used instead of source_load when we know the type == 0 */ 961/* Used instead of source_load when we know the type == 0 */
1108unsigned long weighted_cpuload(const int cpu) 962unsigned long weighted_cpuload(const int cpu)
1109{ 963{
1110 return cpu_rq(cpu)->raw_weighted_load; 964 return cpu_rq(cpu)->ls.load.weight;
965}
966
967static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
968{
969#ifdef CONFIG_SMP
970 task_thread_info(p)->cpu = cpu;
971 set_task_cfs_rq(p);
972#endif
1111} 973}
1112 974
1113#ifdef CONFIG_SMP 975#ifdef CONFIG_SMP
976
977void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
978{
979 int old_cpu = task_cpu(p);
980 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
981 u64 clock_offset, fair_clock_offset;
982
983 clock_offset = old_rq->clock - new_rq->clock;
984 fair_clock_offset = old_rq->cfs.fair_clock -
985 new_rq->cfs.fair_clock;
986 if (p->se.wait_start)
987 p->se.wait_start -= clock_offset;
988 if (p->se.wait_start_fair)
989 p->se.wait_start_fair -= fair_clock_offset;
990 if (p->se.sleep_start)
991 p->se.sleep_start -= clock_offset;
992 if (p->se.block_start)
993 p->se.block_start -= clock_offset;
994 if (p->se.sleep_start_fair)
995 p->se.sleep_start_fair -= fair_clock_offset;
996
997 __set_task_cpu(p, new_cpu);
998}
999
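set_task_cpu() rebases every per-entity timestamp by the offset between the source and destination runqueue clocks (and the cfs fair clocks), so "now - start" measurements stay correct after a migration. The arithmetic is just a domain shift, illustrated below with arbitrary nanosecond values.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t old_rq_clock = 5000000;        /* clock of the source CPU */
        uint64_t new_rq_clock = 2000000;        /* clock of the destination CPU */
        uint64_t wait_start   = 4600000;        /* taken against old_rq_clock */

        uint64_t clock_offset = old_rq_clock - new_rq_clock;

        /* as in set_task_cpu(): shift the timestamp into the new clock domain */
        wait_start -= clock_offset;

        /* the elapsed wait measured on the destination CPU is still 400 us */
        printf("waited %llu ns so far\n",
               (unsigned long long)(new_rq_clock - wait_start));
        return 0;
}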
1114struct migration_req { 1000struct migration_req {
1115 struct list_head list; 1001 struct list_head list;
1116 1002
@@ -1133,7 +1019,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1133 * If the task is not on a runqueue (and not running), then 1019 * If the task is not on a runqueue (and not running), then
1134 * it is sufficient to simply update the task's cpu field. 1020 * it is sufficient to simply update the task's cpu field.
1135 */ 1021 */
1136 if (!p->array && !task_running(rq, p)) { 1022 if (!p->se.on_rq && !task_running(rq, p)) {
1137 set_task_cpu(p, dest_cpu); 1023 set_task_cpu(p, dest_cpu);
1138 return 0; 1024 return 0;
1139 } 1025 }
@@ -1158,9 +1044,8 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1158void wait_task_inactive(struct task_struct *p) 1044void wait_task_inactive(struct task_struct *p)
1159{ 1045{
1160 unsigned long flags; 1046 unsigned long flags;
1047 int running, on_rq;
1161 struct rq *rq; 1048 struct rq *rq;
1162 struct prio_array *array;
1163 int running;
1164 1049
1165repeat: 1050repeat:
1166 /* 1051 /*
@@ -1192,7 +1077,7 @@ repeat:
1192 */ 1077 */
1193 rq = task_rq_lock(p, &flags); 1078 rq = task_rq_lock(p, &flags);
1194 running = task_running(rq, p); 1079 running = task_running(rq, p);
1195 array = p->array; 1080 on_rq = p->se.on_rq;
1196 task_rq_unlock(rq, &flags); 1081 task_rq_unlock(rq, &flags);
1197 1082
1198 /* 1083 /*
@@ -1215,7 +1100,7 @@ repeat:
1215 * running right now), it's preempted, and we should 1100 * running right now), it's preempted, and we should
1216 * yield - it could be a while. 1101 * yield - it could be a while.
1217 */ 1102 */
1218 if (unlikely(array)) { 1103 if (unlikely(on_rq)) {
1219 yield(); 1104 yield();
1220 goto repeat; 1105 goto repeat;
1221 } 1106 }
@@ -1261,11 +1146,12 @@ void kick_process(struct task_struct *p)
1261static inline unsigned long source_load(int cpu, int type) 1146static inline unsigned long source_load(int cpu, int type)
1262{ 1147{
1263 struct rq *rq = cpu_rq(cpu); 1148 struct rq *rq = cpu_rq(cpu);
1149 unsigned long total = weighted_cpuload(cpu);
1264 1150
1265 if (type == 0) 1151 if (type == 0)
1266 return rq->raw_weighted_load; 1152 return total;
1267 1153
1268 return min(rq->cpu_load[type-1], rq->raw_weighted_load); 1154 return min(rq->cpu_load[type-1], total);
1269} 1155}
1270 1156
1271/* 1157/*
@@ -1275,11 +1161,12 @@ static inline unsigned long source_load(int cpu, int type)
1275static inline unsigned long target_load(int cpu, int type) 1161static inline unsigned long target_load(int cpu, int type)
1276{ 1162{
1277 struct rq *rq = cpu_rq(cpu); 1163 struct rq *rq = cpu_rq(cpu);
1164 unsigned long total = weighted_cpuload(cpu);
1278 1165
1279 if (type == 0) 1166 if (type == 0)
1280 return rq->raw_weighted_load; 1167 return total;
1281 1168
1282 return max(rq->cpu_load[type-1], rq->raw_weighted_load); 1169 return max(rq->cpu_load[type-1], total);
1283} 1170}
1284 1171
1285/* 1172/*
@@ -1288,9 +1175,10 @@ static inline unsigned long target_load(int cpu, int type)
1288static inline unsigned long cpu_avg_load_per_task(int cpu) 1175static inline unsigned long cpu_avg_load_per_task(int cpu)
1289{ 1176{
1290 struct rq *rq = cpu_rq(cpu); 1177 struct rq *rq = cpu_rq(cpu);
1178 unsigned long total = weighted_cpuload(cpu);
1291 unsigned long n = rq->nr_running; 1179 unsigned long n = rq->nr_running;
1292 1180
1293 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; 1181 return n ? total / n : SCHED_LOAD_SCALE;
1294} 1182}
1295 1183
1296/* 1184/*
@@ -1392,9 +1280,9 @@ static int sched_balance_self(int cpu, int flag)
1392 struct sched_domain *tmp, *sd = NULL; 1280 struct sched_domain *tmp, *sd = NULL;
1393 1281
1394 for_each_domain(cpu, tmp) { 1282 for_each_domain(cpu, tmp) {
1395 /* 1283 /*
1396 * If power savings logic is enabled for a domain, stop there. 1284 * If power savings logic is enabled for a domain, stop there.
1397 */ 1285 */
1398 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1286 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1399 break; 1287 break;
1400 if (tmp->flags & flag) 1288 if (tmp->flags & flag)
@@ -1477,9 +1365,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1477 if (idle_cpu(i)) 1365 if (idle_cpu(i))
1478 return i; 1366 return i;
1479 } 1367 }
1480 } 1368 } else {
1481 else
1482 break; 1369 break;
1370 }
1483 } 1371 }
1484 return cpu; 1372 return cpu;
1485} 1373}
@@ -1521,7 +1409,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1521 if (!(old_state & state)) 1409 if (!(old_state & state))
1522 goto out; 1410 goto out;
1523 1411
1524 if (p->array) 1412 if (p->se.on_rq)
1525 goto out_running; 1413 goto out_running;
1526 1414
1527 cpu = task_cpu(p); 1415 cpu = task_cpu(p);
@@ -1576,11 +1464,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1576 * of the current CPU: 1464 * of the current CPU:
1577 */ 1465 */
1578 if (sync) 1466 if (sync)
1579 tl -= current->load_weight; 1467 tl -= current->se.load.weight;
1580 1468
1581 if ((tl <= load && 1469 if ((tl <= load &&
1582 tl + target_load(cpu, idx) <= tl_per_task) || 1470 tl + target_load(cpu, idx) <= tl_per_task) ||
1583 100*(tl + p->load_weight) <= imbalance*load) { 1471 100*(tl + p->se.load.weight) <= imbalance*load) {
1584 /* 1472 /*
1585 * This domain has SD_WAKE_AFFINE and 1473 * This domain has SD_WAKE_AFFINE and
1586 * p is cache cold in this domain, and 1474 * p is cache cold in this domain, and
@@ -1614,7 +1502,7 @@ out_set_cpu:
1614 old_state = p->state; 1502 old_state = p->state;
1615 if (!(old_state & state)) 1503 if (!(old_state & state))
1616 goto out; 1504 goto out;
1617 if (p->array) 1505 if (p->se.on_rq)
1618 goto out_running; 1506 goto out_running;
1619 1507
1620 this_cpu = smp_processor_id(); 1508 this_cpu = smp_processor_id();
@@ -1623,25 +1511,7 @@ out_set_cpu:
1623 1511
1624out_activate: 1512out_activate:
1625#endif /* CONFIG_SMP */ 1513#endif /* CONFIG_SMP */
1626 if (old_state == TASK_UNINTERRUPTIBLE) { 1514 activate_task(rq, p, 1);
1627 rq->nr_uninterruptible--;
1628 /*
1629 * Tasks on involuntary sleep don't earn
1630 * sleep_avg beyond just interactive state.
1631 */
1632 p->sleep_type = SLEEP_NONINTERACTIVE;
1633 } else
1634
1635 /*
1636 * Tasks that have marked their sleep as noninteractive get
1637 * woken up with their sleep average not weighted in an
1638 * interactive way.
1639 */
1640 if (old_state & TASK_NONINTERACTIVE)
1641 p->sleep_type = SLEEP_NONINTERACTIVE;
1642
1643
1644 activate_task(p, rq, cpu == this_cpu);
1645 /* 1515 /*
1646 * Sync wakeups (i.e. those types of wakeups where the waker 1516 * Sync wakeups (i.e. those types of wakeups where the waker
1647 * has indicated that it will leave the CPU in short order) 1517 * has indicated that it will leave the CPU in short order)
@@ -1650,10 +1520,8 @@ out_activate:
1650 * the waker guarantees that the freshly woken up task is going 1520 * the waker guarantees that the freshly woken up task is going
1651 * to be considered on this CPU.) 1521 * to be considered on this CPU.)
1652 */ 1522 */
1653 if (!sync || cpu != this_cpu) { 1523 if (!sync || cpu != this_cpu)
1654 if (TASK_PREEMPTS_CURR(p, rq)) 1524 check_preempt_curr(rq, p);
1655 resched_task(rq->curr);
1656 }
1657 success = 1; 1525 success = 1;
1658 1526
1659out_running: 1527out_running:
@@ -1676,19 +1544,36 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1676 return try_to_wake_up(p, state, 0); 1544 return try_to_wake_up(p, state, 0);
1677} 1545}
1678 1546
1679static void task_running_tick(struct rq *rq, struct task_struct *p);
1680/* 1547/*
1681 * Perform scheduler related setup for a newly forked process p. 1548 * Perform scheduler related setup for a newly forked process p.
1682 * p is forked by current. 1549 * p is forked by current.
1683 */ 1550 *
1684void fastcall sched_fork(struct task_struct *p, int clone_flags) 1551 * __sched_fork() is basic setup used by init_idle() too:
1685{ 1552 */
1686 int cpu = get_cpu(); 1553static void __sched_fork(struct task_struct *p)
1554{
1555 p->se.wait_start_fair = 0;
1556 p->se.wait_start = 0;
1557 p->se.exec_start = 0;
1558 p->se.sum_exec_runtime = 0;
1559 p->se.delta_exec = 0;
1560 p->se.delta_fair_run = 0;
1561 p->se.delta_fair_sleep = 0;
1562 p->se.wait_runtime = 0;
1563 p->se.sum_wait_runtime = 0;
1564 p->se.sum_sleep_runtime = 0;
1565 p->se.sleep_start = 0;
1566 p->se.sleep_start_fair = 0;
1567 p->se.block_start = 0;
1568 p->se.sleep_max = 0;
1569 p->se.block_max = 0;
1570 p->se.exec_max = 0;
1571 p->se.wait_max = 0;
1572 p->se.wait_runtime_overruns = 0;
1573 p->se.wait_runtime_underruns = 0;
1687 1574
1688#ifdef CONFIG_SMP 1575 INIT_LIST_HEAD(&p->run_list);
1689 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1576 p->se.on_rq = 0;
1690#endif
1691 set_task_cpu(p, cpu);
1692 1577
1693 /* 1578 /*
1694 * We mark the process as running here, but have not actually 1579 * We mark the process as running here, but have not actually
@@ -1697,16 +1582,29 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1697 * event cannot wake it up and insert it on the runqueue either. 1582 * event cannot wake it up and insert it on the runqueue either.
1698 */ 1583 */
1699 p->state = TASK_RUNNING; 1584 p->state = TASK_RUNNING;
1585}
1586
1587/*
1588 * fork()/clone()-time setup:
1589 */
1590void sched_fork(struct task_struct *p, int clone_flags)
1591{
1592 int cpu = get_cpu();
1593
1594 __sched_fork(p);
1595
1596#ifdef CONFIG_SMP
1597 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1598#endif
1599 __set_task_cpu(p, cpu);
1700 1600
1701 /* 1601 /*
1702 * Make sure we do not leak PI boosting priority to the child: 1602 * Make sure we do not leak PI boosting priority to the child:
1703 */ 1603 */
1704 p->prio = current->normal_prio; 1604 p->prio = current->normal_prio;
1705 1605
1706 INIT_LIST_HEAD(&p->run_list);
1707 p->array = NULL;
1708#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1606#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1709 if (unlikely(sched_info_on())) 1607 if (likely(sched_info_on()))
1710 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1608 memset(&p->sched_info, 0, sizeof(p->sched_info));
1711#endif 1609#endif
1712#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1610#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -1716,34 +1614,16 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1716 /* Want to start with kernel preemption disabled. */ 1614 /* Want to start with kernel preemption disabled. */
1717 task_thread_info(p)->preempt_count = 1; 1615 task_thread_info(p)->preempt_count = 1;
1718#endif 1616#endif
1719 /*
1720 * Share the timeslice between parent and child, thus the
1721 * total amount of pending timeslices in the system doesn't change,
1722 * resulting in more scheduling fairness.
1723 */
1724 local_irq_disable();
1725 p->time_slice = (current->time_slice + 1) >> 1;
1726 /*
1727 * The remainder of the first timeslice might be recovered by
1728 * the parent if the child exits early enough.
1729 */
1730 p->first_time_slice = 1;
1731 current->time_slice >>= 1;
1732 p->timestamp = sched_clock();
1733 if (unlikely(!current->time_slice)) {
1734 /*
1735 * This case is rare, it happens when the parent has only
1736 * a single jiffy left from its timeslice. Taking the
1737 * runqueue lock is not a problem.
1738 */
1739 current->time_slice = 1;
1740 task_running_tick(cpu_rq(cpu), current);
1741 }
1742 local_irq_enable();
1743 put_cpu(); 1617 put_cpu();
1744} 1618}
1745 1619
1746/* 1620/*
1621 * After fork, child runs first. (default) If set to 0 then
1622 * parent will (try to) run first.
1623 */
1624unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1625
1626/*
1747 * wake_up_new_task - wake up a newly created task for the first time. 1627 * wake_up_new_task - wake up a newly created task for the first time.
1748 * 1628 *
1749 * This function will do some initial scheduler statistics housekeeping 1629 * This function will do some initial scheduler statistics housekeeping
@@ -1752,107 +1632,27 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1752 */ 1632 */
1753void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1633void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1754{ 1634{
1755 struct rq *rq, *this_rq;
1756 unsigned long flags; 1635 unsigned long flags;
1757 int this_cpu, cpu; 1636 struct rq *rq;
1637 int this_cpu;
1758 1638
1759 rq = task_rq_lock(p, &flags); 1639 rq = task_rq_lock(p, &flags);
1760 BUG_ON(p->state != TASK_RUNNING); 1640 BUG_ON(p->state != TASK_RUNNING);
1761 this_cpu = smp_processor_id(); 1641 this_cpu = smp_processor_id(); /* parent's CPU */
1762 cpu = task_cpu(p);
1763
1764 /*
1765 * We decrease the sleep average of forking parents
1766 * and children as well, to keep max-interactive tasks
1767 * from forking tasks that are max-interactive. The parent
1768 * (current) is done further down, under its lock.
1769 */
1770 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1771 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1772 1642
1773 p->prio = effective_prio(p); 1643 p->prio = effective_prio(p);
1774 1644
1775 if (likely(cpu == this_cpu)) { 1645 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1776 if (!(clone_flags & CLONE_VM)) { 1646 task_cpu(p) != this_cpu || !current->se.on_rq) {
1777 /* 1647 activate_task(rq, p, 0);
1778 * The VM isn't cloned, so we're in a good position to
1779 * do child-runs-first in anticipation of an exec. This
1780 * usually avoids a lot of COW overhead.
1781 */
1782 if (unlikely(!current->array))
1783 __activate_task(p, rq);
1784 else {
1785 p->prio = current->prio;
1786 p->normal_prio = current->normal_prio;
1787 list_add_tail(&p->run_list, &current->run_list);
1788 p->array = current->array;
1789 p->array->nr_active++;
1790 inc_nr_running(p, rq);
1791 }
1792 set_need_resched();
1793 } else
1794 /* Run child last */
1795 __activate_task(p, rq);
1796 /*
1797 * We skip the following code due to cpu == this_cpu
1798 *
1799 * task_rq_unlock(rq, &flags);
1800 * this_rq = task_rq_lock(current, &flags);
1801 */
1802 this_rq = rq;
1803 } else { 1648 } else {
1804 this_rq = cpu_rq(this_cpu);
1805
1806 /*
1807 * Not the local CPU - must adjust timestamp. This should
1808 * get optimised away in the !CONFIG_SMP case.
1809 */
1810 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
1811 + rq->most_recent_timestamp;
1812 __activate_task(p, rq);
1813 if (TASK_PREEMPTS_CURR(p, rq))
1814 resched_task(rq->curr);
1815
1816 /* 1649 /*
1817 * Parent and child are on different CPUs, now get the 1650 * Let the scheduling class do new task startup
1818 * parent runqueue to update the parent's ->sleep_avg: 1651 * management (if any):
1819 */ 1652 */
1820 task_rq_unlock(rq, &flags); 1653 p->sched_class->task_new(rq, p);
1821 this_rq = task_rq_lock(current, &flags);
1822 }
1823 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1824 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1825 task_rq_unlock(this_rq, &flags);
1826}
1827
1828/*
1829 * Potentially available exiting-child timeslices are
1830 * retrieved here - this way the parent does not get
1831 * penalized for creating too many threads.
1832 *
1833 * (this cannot be used to 'generate' timeslices
1834 * artificially, because any timeslice recovered here
1835 * was given away by the parent in the first place.)
1836 */
1837void fastcall sched_exit(struct task_struct *p)
1838{
1839 unsigned long flags;
1840 struct rq *rq;
1841
1842 /*
1843 * If the child was a (relative-) CPU hog then decrease
1844 * the sleep_avg of the parent as well.
1845 */
1846 rq = task_rq_lock(p->parent, &flags);
1847 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1848 p->parent->time_slice += p->time_slice;
1849 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1850 p->parent->time_slice = task_timeslice(p);
1851 } 1654 }
1852 if (p->sleep_avg < p->parent->sleep_avg) 1655 check_preempt_curr(rq, p);
1853 p->parent->sleep_avg = p->parent->sleep_avg /
1854 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1855 (EXIT_WEIGHT + 1);
1856 task_rq_unlock(rq, &flags); 1656 task_rq_unlock(rq, &flags);
1857} 1657}
1858 1658
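The rewritten wake_up_new_task() boils the old placement cases down to one test: the child is enqueued directly with activate_task() unless child-runs-first can apply, which requires the sysctl to be set, no shared address space (no CLONE_VM), the child to sit on the parent's CPU, and the parent to still be on the runqueue; only then is placement delegated to p->sched_class->task_new(). A hedged sketch of just that predicate (plain C; use_task_new_hook() and its parameters are invented for the illustration):

#include <stdbool.h>
#include <stdio.h>

#define CLONE_VM 0x100          /* same numeric value as the kernel flag, demo only */

/* Mirrors the if () in the new wake_up_new_task(). */
static bool use_task_new_hook(bool child_runs_first, unsigned long clone_flags,
                              int child_cpu, int parent_cpu, bool parent_on_rq)
{
        if (!child_runs_first || (clone_flags & CLONE_VM) ||
            child_cpu != parent_cpu || !parent_on_rq)
                return false;   /* plain activate_task() path */
        return true;            /* let p->sched_class->task_new() place the child */
}

int main(void)
{
        printf("fork, same CPU, no CLONE_VM: %s\n",
               use_task_new_hook(true, 0, 1, 1, true) ? "task_new()" : "activate_task()");
        printf("thread (CLONE_VM), same CPU: %s\n",
               use_task_new_hook(true, CLONE_VM, 1, 1, true) ? "task_new()" : "activate_task()");
        return 0;
}

Either way the function ends with check_preempt_curr(), so a higher-priority child still preempts the parent immediately.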
@@ -1917,7 +1717,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1917 /* 1717 /*
1918 * Remove function-return probe instances associated with this 1718 * Remove function-return probe instances associated with this
1919 * task and put them back on the free list. 1719 * task and put them back on the free list.
1920 */ 1720 */
1921 kprobe_flush_task(prev); 1721 kprobe_flush_task(prev);
1922 put_task_struct(prev); 1722 put_task_struct(prev);
1923 } 1723 }
@@ -1945,13 +1745,15 @@ asmlinkage void schedule_tail(struct task_struct *prev)
1945 * context_switch - switch to the new MM and the new 1745 * context_switch - switch to the new MM and the new
1946 * thread's register state. 1746 * thread's register state.
1947 */ 1747 */
1948static inline struct task_struct * 1748static inline void
1949context_switch(struct rq *rq, struct task_struct *prev, 1749context_switch(struct rq *rq, struct task_struct *prev,
1950 struct task_struct *next) 1750 struct task_struct *next)
1951{ 1751{
1952 struct mm_struct *mm = next->mm; 1752 struct mm_struct *mm, *oldmm;
1953 struct mm_struct *oldmm = prev->active_mm;
1954 1753
1754 prepare_task_switch(rq, next);
1755 mm = next->mm;
1756 oldmm = prev->active_mm;
1955 /* 1757 /*
1956 * For paravirt, this is coupled with an exit in switch_to to 1758 * For paravirt, this is coupled with an exit in switch_to to
1957 * combine the page table reload and the switch backend into 1759 * combine the page table reload and the switch backend into
@@ -1959,16 +1761,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
1959 */ 1761 */
1960 arch_enter_lazy_cpu_mode(); 1762 arch_enter_lazy_cpu_mode();
1961 1763
1962 if (!mm) { 1764 if (unlikely(!mm)) {
1963 next->active_mm = oldmm; 1765 next->active_mm = oldmm;
1964 atomic_inc(&oldmm->mm_count); 1766 atomic_inc(&oldmm->mm_count);
1965 enter_lazy_tlb(oldmm, next); 1767 enter_lazy_tlb(oldmm, next);
1966 } else 1768 } else
1967 switch_mm(oldmm, mm, next); 1769 switch_mm(oldmm, mm, next);
1968 1770
1969 if (!prev->mm) { 1771 if (unlikely(!prev->mm)) {
1970 prev->active_mm = NULL; 1772 prev->active_mm = NULL;
1971 WARN_ON(rq->prev_mm);
1972 rq->prev_mm = oldmm; 1773 rq->prev_mm = oldmm;
1973 } 1774 }
1974 /* 1775 /*
@@ -1984,7 +1785,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1984 /* Here we just switch the register state and the stack. */ 1785 /* Here we just switch the register state and the stack. */
1985 switch_to(prev, next, prev); 1786 switch_to(prev, next, prev);
1986 1787
1987 return prev; 1788 barrier();
1789 /*
1790 * this_rq must be evaluated again because prev may have moved
1791 * CPUs since it called schedule(), thus the 'rq' on its stack
1792 * frame will be invalid.
1793 */
1794 finish_task_switch(this_rq(), prev);
1988} 1795}
1989 1796
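The mm handling kept inside context_switch() is easier to see in isolation: a kernel thread (next->mm == NULL) borrows the outgoing task's active_mm and takes a reference, and when the outgoing task was itself a borrower the mm is parked in rq->prev_mm so that finish_task_switch() can drop it after the stack switch. A rough user-space model with a plain reference count (struct names and switch_mm_model() are made up for the demo; this is not the kernel API):

#include <stdio.h>

struct mm   { int refcount; };
struct task { struct mm *mm; struct mm *active_mm; };

/* Model of the mm part of context_switch(): kernel threads (mm == NULL)
 * borrow the outgoing task's active_mm instead of switching page tables. */
static struct mm *switch_mm_model(struct task *prev, struct task *next)
{
        struct mm *oldmm = prev->active_mm;
        struct mm *to_drop = NULL;

        if (!next->mm) {                /* kernel thread: borrow */
                next->active_mm = oldmm;
                oldmm->refcount++;
        } else {                        /* user task: would call switch_mm() */
                next->active_mm = next->mm;
        }

        if (!prev->mm) {                /* prev was borrowing: queue the drop */
                prev->active_mm = NULL;
                to_drop = oldmm;        /* what goes into rq->prev_mm */
        }
        return to_drop;
}

int main(void)
{
        struct mm user_mm = { .refcount = 1 };
        struct task user = { .mm = &user_mm, .active_mm = &user_mm };
        struct task kthread = { .mm = NULL, .active_mm = NULL };
        struct mm *drop;

        switch_mm_model(&user, &kthread);               /* user task -> kernel thread */
        printf("refcount after borrow: %d\n", user_mm.refcount);        /* 2 */

        drop = switch_mm_model(&kthread, &user);        /* kernel thread -> user task */
        if (drop)
                drop->refcount--;                       /* roughly what mmdrop() does later */
        printf("refcount after drop:   %d\n", user_mm.refcount);        /* 1 */
        return 0;
}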
1990/* 1797/*
@@ -2057,17 +1864,65 @@ unsigned long nr_active(void)
2057 return running + uninterruptible; 1864 return running + uninterruptible;
2058} 1865}
2059 1866
2060#ifdef CONFIG_SMP
2061
2062/* 1867/*
2063 * Is this task likely cache-hot: 1868 * Update rq->cpu_load[] statistics. This function is usually called every
1869 * scheduler tick (TICK_NSEC).
2064 */ 1870 */
2065static inline int 1871static void update_cpu_load(struct rq *this_rq)
2066task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
2067{ 1872{
2068 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; 1873 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1874 unsigned long total_load = this_rq->ls.load.weight;
1875 unsigned long this_load = total_load;
1876 struct load_stat *ls = &this_rq->ls;
1877 u64 now = __rq_clock(this_rq);
1878 int i, scale;
1879
1880 this_rq->nr_load_updates++;
1881 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1882 goto do_avg;
1883
1884 /* Update delta_fair/delta_exec fields first */
1885 update_curr_load(this_rq, now);
1886
1887 fair_delta64 = ls->delta_fair + 1;
1888 ls->delta_fair = 0;
1889
1890 exec_delta64 = ls->delta_exec + 1;
1891 ls->delta_exec = 0;
1892
1893 sample_interval64 = now - ls->load_update_last;
1894 ls->load_update_last = now;
1895
1896 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1897 sample_interval64 = TICK_NSEC;
1898
1899 if (exec_delta64 > sample_interval64)
1900 exec_delta64 = sample_interval64;
1901
1902 idle_delta64 = sample_interval64 - exec_delta64;
1903
1904 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1905 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1906
1907 this_load = (unsigned long)tmp64;
1908
1909do_avg:
1910
1911 /* Update our load: */
1912 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1913 unsigned long old_load, new_load;
1914
1915 /* scale is effectively 1 << i now, and >> i divides by scale */
1916
1917 old_load = this_rq->cpu_load[i];
1918 new_load = this_load;
1919
1920 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1921 }
2069} 1922}
2070 1923
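The loop at the end of update_cpu_load() is a per-index exponential moving average: entry i mixes the previous value and the new sample with weights (2^i - 1)/2^i and 1/2^i, so cpu_load[0] is the instantaneous load and higher indices react progressively more slowly. The same loop extracted into a stand-alone program (CPU_LOAD_IDX_MAX assumed to be 5, matching the array in this kernel; the load values are arbitrary):

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5      /* assumed to match the kernel constant */

/* One tick's worth of the averaging loop at the end of update_cpu_load(). */
static void decay_cpu_load(unsigned long cpu_load[CPU_LOAD_IDX_MAX],
                           unsigned long this_load)
{
        int i, scale;

        for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                /* scale == 1 << i, so >> i divides by scale */
                unsigned long old_load = cpu_load[i];

                cpu_load[i] = (old_load * (scale - 1) + this_load) >> i;
        }
}

int main(void)
{
        unsigned long load[CPU_LOAD_IDX_MAX] = { 0 };
        int tick;

        /* Feed a constant load of 1024 and watch the higher indices
         * converge more slowly than cpu_load[0]. */
        for (tick = 1; tick <= 4; tick++) {
                decay_cpu_load(load, 1024);
                printf("tick %d: %lu %lu %lu %lu %lu\n", tick,
                       load[0], load[1], load[2], load[3], load[4]);
        }
        return 0;
}

After a spike, cpu_load[0] follows immediately while cpu_load[4] decays over many ticks, which is what the load_idx selection in find_busiest_group() exploits.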
1924#ifdef CONFIG_SMP
1925
2071/* 1926/*
2072 * double_rq_lock - safely lock two runqueues 1927 * double_rq_lock - safely lock two runqueues
2073 * 1928 *
@@ -2184,23 +2039,17 @@ void sched_exec(void)
2184 * pull_task - move a task from a remote runqueue to the local runqueue. 2039 * pull_task - move a task from a remote runqueue to the local runqueue.
2185 * Both runqueues must be locked. 2040 * Both runqueues must be locked.
2186 */ 2041 */
2187static void pull_task(struct rq *src_rq, struct prio_array *src_array, 2042static void pull_task(struct rq *src_rq, struct task_struct *p,
2188 struct task_struct *p, struct rq *this_rq, 2043 struct rq *this_rq, int this_cpu)
2189 struct prio_array *this_array, int this_cpu)
2190{ 2044{
2191 dequeue_task(p, src_array); 2045 deactivate_task(src_rq, p, 0);
2192 dec_nr_running(p, src_rq);
2193 set_task_cpu(p, this_cpu); 2046 set_task_cpu(p, this_cpu);
2194 inc_nr_running(p, this_rq); 2047 activate_task(this_rq, p, 0);
2195 enqueue_task(p, this_array);
2196 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2197 + this_rq->most_recent_timestamp;
2198 /* 2048 /*
 2199 * Note that idle threads have a prio of MAX_PRIO, so this test 2049 * Note that idle threads have a prio of MAX_PRIO, so this test
 2200 * is always true for them. 2050 * is always true for them.
2201 */ 2051 */
2202 if (TASK_PREEMPTS_CURR(p, this_rq)) 2052 check_preempt_curr(this_rq, p);
2203 resched_task(this_rq->curr);
2204} 2053}
2205 2054
2206/* 2055/*
@@ -2208,7 +2057,7 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2208 */ 2057 */
2209static 2058static
2210int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 2059int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2211 struct sched_domain *sd, enum idle_type idle, 2060 struct sched_domain *sd, enum cpu_idle_type idle,
2212 int *all_pinned) 2061 int *all_pinned)
2213{ 2062{
2214 /* 2063 /*
@@ -2225,132 +2074,67 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2225 return 0; 2074 return 0;
2226 2075
2227 /* 2076 /*
2228 * Aggressive migration if: 2077 * Aggressive migration if too many balance attempts have failed:
2229 * 1) task is cache cold, or
2230 * 2) too many balance attempts have failed.
2231 */ 2078 */
2232 2079 if (sd->nr_balance_failed > sd->cache_nice_tries)
2233 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2234#ifdef CONFIG_SCHEDSTATS
2235 if (task_hot(p, rq->most_recent_timestamp, sd))
2236 schedstat_inc(sd, lb_hot_gained[idle]);
2237#endif
2238 return 1; 2080 return 1;
2239 }
2240 2081
2241 if (task_hot(p, rq->most_recent_timestamp, sd))
2242 return 0;
2243 return 1; 2082 return 1;
2244} 2083}
2245 2084
2246#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) 2085static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2247
2248/*
2249 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2250 * load from busiest to this_rq, as part of a balancing operation within
2251 * "domain". Returns the number of tasks moved.
2252 *
2253 * Called with both runqueues locked.
2254 */
2255static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2256 unsigned long max_nr_move, unsigned long max_load_move, 2086 unsigned long max_nr_move, unsigned long max_load_move,
2257 struct sched_domain *sd, enum idle_type idle, 2087 struct sched_domain *sd, enum cpu_idle_type idle,
2258 int *all_pinned) 2088 int *all_pinned, unsigned long *load_moved,
2089 int this_best_prio, int best_prio, int best_prio_seen,
2090 struct rq_iterator *iterator)
2259{ 2091{
2260 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, 2092 int pulled = 0, pinned = 0, skip_for_load;
2261 best_prio_seen, skip_for_load; 2093 struct task_struct *p;
2262 struct prio_array *array, *dst_array; 2094 long rem_load_move = max_load_move;
2263 struct list_head *head, *curr;
2264 struct task_struct *tmp;
2265 long rem_load_move;
2266 2095
2267 if (max_nr_move == 0 || max_load_move == 0) 2096 if (max_nr_move == 0 || max_load_move == 0)
2268 goto out; 2097 goto out;
2269 2098
2270 rem_load_move = max_load_move;
2271 pinned = 1; 2099 pinned = 1;
2272 this_best_prio = rq_best_prio(this_rq);
2273 best_prio = rq_best_prio(busiest);
2274 /*
2275 * Enable handling of the case where there is more than one task
2276 * with the best priority. If the current running task is one
2277 * of those with prio==best_prio we know it won't be moved
2278 * and therefore it's safe to override the skip (based on load) of
2279 * any task we find with that prio.
2280 */
2281 best_prio_seen = best_prio == busiest->curr->prio;
2282 2100
2283 /* 2101 /*
2284 * We first consider expired tasks. Those will likely not be 2102 * Start the load-balancing iterator:
2285 * executed in the near future, and they are most likely to
2286 * be cache-cold, thus switching CPUs has the least effect
2287 * on them.
2288 */ 2103 */
2289 if (busiest->expired->nr_active) { 2104 p = iterator->start(iterator->arg);
2290 array = busiest->expired; 2105next:
2291 dst_array = this_rq->expired; 2106 if (!p)
2292 } else {
2293 array = busiest->active;
2294 dst_array = this_rq->active;
2295 }
2296
2297new_array:
2298 /* Start searching at priority 0: */
2299 idx = 0;
2300skip_bitmap:
2301 if (!idx)
2302 idx = sched_find_first_bit(array->bitmap);
2303 else
2304 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2305 if (idx >= MAX_PRIO) {
2306 if (array == busiest->expired && busiest->active->nr_active) {
2307 array = busiest->active;
2308 dst_array = this_rq->active;
2309 goto new_array;
2310 }
2311 goto out; 2107 goto out;
2312 }
2313
2314 head = array->queue + idx;
2315 curr = head->prev;
2316skip_queue:
2317 tmp = list_entry(curr, struct task_struct, run_list);
2318
2319 curr = curr->prev;
2320
2321 /* 2108 /*
 2322 * To help distribute high priority tasks across CPUs we don't 2109 * To help distribute high priority tasks across CPUs we don't
2323 * skip a task if it will be the highest priority task (i.e. smallest 2110 * skip a task if it will be the highest priority task (i.e. smallest
2324 * prio value) on its new queue regardless of its load weight 2111 * prio value) on its new queue regardless of its load weight
2325 */ 2112 */
2326 skip_for_load = tmp->load_weight > rem_load_move; 2113 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2327 if (skip_for_load && idx < this_best_prio) 2114 SCHED_LOAD_SCALE_FUZZ;
2328 skip_for_load = !best_prio_seen && idx == best_prio; 2115 if (skip_for_load && p->prio < this_best_prio)
2116 skip_for_load = !best_prio_seen && p->prio == best_prio;
2329 if (skip_for_load || 2117 if (skip_for_load ||
2330 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2118 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2331 2119
2332 best_prio_seen |= idx == best_prio; 2120 best_prio_seen |= p->prio == best_prio;
2333 if (curr != head) 2121 p = iterator->next(iterator->arg);
2334 goto skip_queue; 2122 goto next;
2335 idx++;
2336 goto skip_bitmap;
2337 } 2123 }
2338 2124
2339 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2125 pull_task(busiest, p, this_rq, this_cpu);
2340 pulled++; 2126 pulled++;
2341 rem_load_move -= tmp->load_weight; 2127 rem_load_move -= p->se.load.weight;
2342 2128
2343 /* 2129 /*
2344 * We only want to steal up to the prescribed number of tasks 2130 * We only want to steal up to the prescribed number of tasks
2345 * and the prescribed amount of weighted load. 2131 * and the prescribed amount of weighted load.
2346 */ 2132 */
2347 if (pulled < max_nr_move && rem_load_move > 0) { 2133 if (pulled < max_nr_move && rem_load_move > 0) {
2348 if (idx < this_best_prio) 2134 if (p->prio < this_best_prio)
2349 this_best_prio = idx; 2135 this_best_prio = p->prio;
2350 if (curr != head) 2136 p = iterator->next(iterator->arg);
2351 goto skip_queue; 2137 goto next;
2352 idx++;
2353 goto skip_bitmap;
2354 } 2138 }
2355out: 2139out:
2356 /* 2140 /*
@@ -2362,18 +2146,48 @@ out:
2362 2146
2363 if (all_pinned) 2147 if (all_pinned)
2364 *all_pinned = pinned; 2148 *all_pinned = pinned;
2149 *load_moved = max_load_move - rem_load_move;
2365 return pulled; 2150 return pulled;
2366} 2151}
2367 2152
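balance_tasks() is now class-agnostic: it sees only an opaque iterator plus a weight and priority per task, and keeps pulling until either the task budget or the weight budget is spent, skipping tasks that would overshoot the remaining weight. A compressed user-space sketch of that walk with an array-backed iterator (the half-weight SCHED_LOAD_SCALE_FUZZ test and the best_prio bookkeeping are left out; every name here is invented for the demo):

#include <stdio.h>

struct demo_task { int prio; long weight; };

struct iter { struct demo_task *tasks; int n, pos; };

static struct demo_task *iter_start(struct iter *it)
{
        it->pos = 0;
        return it->pos < it->n ? &it->tasks[it->pos] : NULL;
}

static struct demo_task *iter_next(struct iter *it)
{
        it->pos++;
        return it->pos < it->n ? &it->tasks[it->pos] : NULL;
}

/* Pull tasks until either the count budget or the weight budget runs out,
 * skipping tasks whose weight alone would blow the remaining budget. */
static int balance_demo(struct iter *it, int max_nr_move, long max_load_move)
{
        long rem = max_load_move;
        int pulled = 0;
        struct demo_task *p;

        for (p = iter_start(it); p; p = iter_next(it)) {
                if (p->weight > rem)
                        continue;               /* too heavy: skip, keep scanning */
                pulled++;                       /* stands in for pull_task(p) */
                rem -= p->weight;
                if (pulled >= max_nr_move || rem <= 0)
                        break;
        }
        return pulled;
}

int main(void)
{
        struct demo_task tasks[] = { {120, 3000}, {120, 1024}, {130, 512}, {139, 128} };
        struct iter it = { tasks, 4, 0 };

        /* Pulls the 1024, 512 and 128 weights; the 3000 task is skipped. */
        printf("pulled %d tasks\n", balance_demo(&it, 8, 2048));
        return 0;
}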
2368/* 2153/*
2154 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2155 * load from busiest to this_rq, as part of a balancing operation within
2156 * "domain". Returns the number of tasks moved.
2157 *
2158 * Called with both runqueues locked.
2159 */
2160static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2161 unsigned long max_nr_move, unsigned long max_load_move,
2162 struct sched_domain *sd, enum cpu_idle_type idle,
2163 int *all_pinned)
2164{
2165 struct sched_class *class = sched_class_highest;
2166 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2167 long rem_load_move = max_load_move;
2168
2169 do {
2170 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2171 max_nr_move, (unsigned long)rem_load_move,
2172 sd, idle, all_pinned, &load_moved);
2173 total_nr_moved += nr_moved;
2174 max_nr_move -= nr_moved;
2175 rem_load_move -= load_moved;
2176 class = class->next;
2177 } while (class && max_nr_move && rem_load_move > 0);
2178
2179 return total_nr_moved;
2180}
2181
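move_tasks() itself reduces to a loop over the scheduling classes, highest class first, letting each class move what it can and charging the result against the shared count and weight budgets. A minimal model of that dispatch, with function pointers standing in for sched_class->load_balance (the demo classes and numbers are invented):

#include <stdio.h>

struct demo_class {
        const char *name;
        const struct demo_class *next;
        /* returns tasks moved, reports weight moved via *load_moved */
        int (*load_balance)(long budget, long *load_moved);
};

static int rt_balance(long budget, long *load_moved)
{
        (void)budget;
        *load_moved = 0;        /* nothing movable in this class today */
        return 0;
}

static int fair_balance(long budget, long *load_moved)
{
        *load_moved = budget > 1536 ? 1536 : budget;
        return 2;
}

static const struct demo_class fair_class = { "fair", NULL,        fair_balance };
static const struct demo_class rt_class   = { "rt",   &fair_class, rt_balance };

int main(void)
{
        const struct demo_class *class = &rt_class;     /* highest class first */
        long rem_load = 2048, moved;
        int max_nr = 4, total = 0, nr;

        do {
                nr = class->load_balance(rem_load, &moved);
                total += nr;
                max_nr -= nr;
                rem_load -= moved;
                printf("%s moved %d task(s), weight %ld\n", class->name, nr, moved);
                class = class->next;
        } while (class && max_nr && rem_load > 0);

        printf("total moved: %d\n", total);
        return 0;
}

In the kernel the callers additionally cap max_nr_move with minus_1_or_zero(busiest->nr_running), so the busiest runqueue is never drained completely.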
2182/*
2369 * find_busiest_group finds and returns the busiest CPU group within the 2183 * find_busiest_group finds and returns the busiest CPU group within the
2370 * domain. It calculates and returns the amount of weighted load which 2184 * domain. It calculates and returns the amount of weighted load which
2371 * should be moved to restore balance via the imbalance parameter. 2185 * should be moved to restore balance via the imbalance parameter.
2372 */ 2186 */
2373static struct sched_group * 2187static struct sched_group *
2374find_busiest_group(struct sched_domain *sd, int this_cpu, 2188find_busiest_group(struct sched_domain *sd, int this_cpu,
2375 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2189 unsigned long *imbalance, enum cpu_idle_type idle,
2376 cpumask_t *cpus, int *balance) 2190 int *sd_idle, cpumask_t *cpus, int *balance)
2377{ 2191{
2378 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2192 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2379 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2193 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2391,9 +2205,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2391 max_load = this_load = total_load = total_pwr = 0; 2205 max_load = this_load = total_load = total_pwr = 0;
2392 busiest_load_per_task = busiest_nr_running = 0; 2206 busiest_load_per_task = busiest_nr_running = 0;
2393 this_load_per_task = this_nr_running = 0; 2207 this_load_per_task = this_nr_running = 0;
2394 if (idle == NOT_IDLE) 2208 if (idle == CPU_NOT_IDLE)
2395 load_idx = sd->busy_idx; 2209 load_idx = sd->busy_idx;
2396 else if (idle == NEWLY_IDLE) 2210 else if (idle == CPU_NEWLY_IDLE)
2397 load_idx = sd->newidle_idx; 2211 load_idx = sd->newidle_idx;
2398 else 2212 else
2399 load_idx = sd->idle_idx; 2213 load_idx = sd->idle_idx;
@@ -2437,7 +2251,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2437 2251
2438 avg_load += load; 2252 avg_load += load;
2439 sum_nr_running += rq->nr_running; 2253 sum_nr_running += rq->nr_running;
2440 sum_weighted_load += rq->raw_weighted_load; 2254 sum_weighted_load += weighted_cpuload(i);
2441 } 2255 }
2442 2256
2443 /* 2257 /*
@@ -2477,8 +2291,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2477 * Busy processors will not participate in power savings 2291 * Busy processors will not participate in power savings
2478 * balance. 2292 * balance.
2479 */ 2293 */
2480 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2294 if (idle == CPU_NOT_IDLE ||
2481 goto group_next; 2295 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2296 goto group_next;
2482 2297
2483 /* 2298 /*
2484 * If the local group is idle or completely loaded 2299 * If the local group is idle or completely loaded
@@ -2488,42 +2303,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2488 !this_nr_running)) 2303 !this_nr_running))
2489 power_savings_balance = 0; 2304 power_savings_balance = 0;
2490 2305
2491 /* 2306 /*
2492 * If a group is already running at full capacity or idle, 2307 * If a group is already running at full capacity or idle,
2493 * don't include that group in power savings calculations 2308 * don't include that group in power savings calculations
2494 */ 2309 */
2495 if (!power_savings_balance || sum_nr_running >= group_capacity 2310 if (!power_savings_balance || sum_nr_running >= group_capacity
2496 || !sum_nr_running) 2311 || !sum_nr_running)
2497 goto group_next; 2312 goto group_next;
2498 2313
2499 /* 2314 /*
2500 * Calculate the group which has the least non-idle load. 2315 * Calculate the group which has the least non-idle load.
2501 * This is the group from where we need to pick up the load 2316 * This is the group from where we need to pick up the load
2502 * for saving power 2317 * for saving power
2503 */ 2318 */
2504 if ((sum_nr_running < min_nr_running) || 2319 if ((sum_nr_running < min_nr_running) ||
2505 (sum_nr_running == min_nr_running && 2320 (sum_nr_running == min_nr_running &&
2506 first_cpu(group->cpumask) < 2321 first_cpu(group->cpumask) <
2507 first_cpu(group_min->cpumask))) { 2322 first_cpu(group_min->cpumask))) {
2508 group_min = group; 2323 group_min = group;
2509 min_nr_running = sum_nr_running; 2324 min_nr_running = sum_nr_running;
2510 min_load_per_task = sum_weighted_load / 2325 min_load_per_task = sum_weighted_load /
2511 sum_nr_running; 2326 sum_nr_running;
2512 } 2327 }
2513 2328
2514 /* 2329 /*
2515 * Calculate the group which is almost near its 2330 * Calculate the group which is almost near its
2516 * capacity but still has some space to pick up some load 2331 * capacity but still has some space to pick up some load
2517 * from other group and save more power 2332 * from other group and save more power
2518 */ 2333 */
2519 if (sum_nr_running <= group_capacity - 1) { 2334 if (sum_nr_running <= group_capacity - 1) {
2520 if (sum_nr_running > leader_nr_running || 2335 if (sum_nr_running > leader_nr_running ||
2521 (sum_nr_running == leader_nr_running && 2336 (sum_nr_running == leader_nr_running &&
2522 first_cpu(group->cpumask) > 2337 first_cpu(group->cpumask) >
2523 first_cpu(group_leader->cpumask))) { 2338 first_cpu(group_leader->cpumask))) {
2524 group_leader = group; 2339 group_leader = group;
2525 leader_nr_running = sum_nr_running; 2340 leader_nr_running = sum_nr_running;
2526 } 2341 }
2527 } 2342 }
2528group_next: 2343group_next:
2529#endif 2344#endif
@@ -2578,7 +2393,7 @@ group_next:
2578 * a think about bumping its value to force at least one task to be 2393 * a think about bumping its value to force at least one task to be
2579 * moved 2394 * moved
2580 */ 2395 */
2581 if (*imbalance < busiest_load_per_task) { 2396 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2582 unsigned long tmp, pwr_now, pwr_move; 2397 unsigned long tmp, pwr_now, pwr_move;
2583 unsigned int imbn; 2398 unsigned int imbn;
2584 2399
@@ -2592,7 +2407,8 @@ small_imbalance:
2592 } else 2407 } else
2593 this_load_per_task = SCHED_LOAD_SCALE; 2408 this_load_per_task = SCHED_LOAD_SCALE;
2594 2409
2595 if (max_load - this_load >= busiest_load_per_task * imbn) { 2410 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2411 busiest_load_per_task * imbn) {
2596 *imbalance = busiest_load_per_task; 2412 *imbalance = busiest_load_per_task;
2597 return busiest; 2413 return busiest;
2598 } 2414 }
@@ -2639,7 +2455,7 @@ small_imbalance:
2639 2455
2640out_balanced: 2456out_balanced:
2641#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2457#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2642 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2458 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2643 goto ret; 2459 goto ret;
2644 2460
2645 if (this == group_leader && group_leader != group_min) { 2461 if (this == group_leader && group_leader != group_min) {
@@ -2656,7 +2472,7 @@ ret:
2656 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2472 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2657 */ 2473 */
2658static struct rq * 2474static struct rq *
2659find_busiest_queue(struct sched_group *group, enum idle_type idle, 2475find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2660 unsigned long imbalance, cpumask_t *cpus) 2476 unsigned long imbalance, cpumask_t *cpus)
2661{ 2477{
2662 struct rq *busiest = NULL, *rq; 2478 struct rq *busiest = NULL, *rq;
@@ -2664,17 +2480,19 @@ find_busiest_queue(struct sched_group *group, enum idle_type idle,
2664 int i; 2480 int i;
2665 2481
2666 for_each_cpu_mask(i, group->cpumask) { 2482 for_each_cpu_mask(i, group->cpumask) {
2483 unsigned long wl;
2667 2484
2668 if (!cpu_isset(i, *cpus)) 2485 if (!cpu_isset(i, *cpus))
2669 continue; 2486 continue;
2670 2487
2671 rq = cpu_rq(i); 2488 rq = cpu_rq(i);
2489 wl = weighted_cpuload(i);
2672 2490
2673 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2491 if (rq->nr_running == 1 && wl > imbalance)
2674 continue; 2492 continue;
2675 2493
2676 if (rq->raw_weighted_load > max_load) { 2494 if (wl > max_load) {
2677 max_load = rq->raw_weighted_load; 2495 max_load = wl;
2678 busiest = rq; 2496 busiest = rq;
2679 } 2497 }
2680 } 2498 }
@@ -2698,7 +2516,7 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2698 * tasks if there is an imbalance. 2516 * tasks if there is an imbalance.
2699 */ 2517 */
2700static int load_balance(int this_cpu, struct rq *this_rq, 2518static int load_balance(int this_cpu, struct rq *this_rq,
2701 struct sched_domain *sd, enum idle_type idle, 2519 struct sched_domain *sd, enum cpu_idle_type idle,
2702 int *balance) 2520 int *balance)
2703{ 2521{
2704 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2522 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
@@ -2711,10 +2529,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2711 /* 2529 /*
2712 * When power savings policy is enabled for the parent domain, idle 2530 * When power savings policy is enabled for the parent domain, idle
2713 * sibling can pick up load irrespective of busy siblings. In this case, 2531 * sibling can pick up load irrespective of busy siblings. In this case,
2714 * let the state of idle sibling percolate up as IDLE, instead of 2532 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2715 * portraying it as NOT_IDLE. 2533 * portraying it as CPU_NOT_IDLE.
2716 */ 2534 */
2717 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2535 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2718 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2536 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2719 sd_idle = 1; 2537 sd_idle = 1;
2720 2538
@@ -2848,7 +2666,7 @@ out_one_pinned:
2848 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2666 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2849 * tasks if there is an imbalance. 2667 * tasks if there is an imbalance.
2850 * 2668 *
2851 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2669 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2852 * this_rq is locked. 2670 * this_rq is locked.
2853 */ 2671 */
2854static int 2672static int
@@ -2865,31 +2683,31 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2865 * When power savings policy is enabled for the parent domain, idle 2683 * When power savings policy is enabled for the parent domain, idle
2866 * sibling can pick up load irrespective of busy siblings. In this case, 2684 * sibling can pick up load irrespective of busy siblings. In this case,
2867 * let the state of idle sibling percolate up as IDLE, instead of 2685 * let the state of idle sibling percolate up as IDLE, instead of
2868 * portraying it as NOT_IDLE. 2686 * portraying it as CPU_NOT_IDLE.
2869 */ 2687 */
2870 if (sd->flags & SD_SHARE_CPUPOWER && 2688 if (sd->flags & SD_SHARE_CPUPOWER &&
2871 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2689 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2872 sd_idle = 1; 2690 sd_idle = 1;
2873 2691
2874 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2692 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2875redo: 2693redo:
2876 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2694 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2877 &sd_idle, &cpus, NULL); 2695 &sd_idle, &cpus, NULL);
2878 if (!group) { 2696 if (!group) {
2879 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2697 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2880 goto out_balanced; 2698 goto out_balanced;
2881 } 2699 }
2882 2700
2883 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, 2701 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2884 &cpus); 2702 &cpus);
2885 if (!busiest) { 2703 if (!busiest) {
2886 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2704 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2887 goto out_balanced; 2705 goto out_balanced;
2888 } 2706 }
2889 2707
2890 BUG_ON(busiest == this_rq); 2708 BUG_ON(busiest == this_rq);
2891 2709
2892 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2710 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2893 2711
2894 nr_moved = 0; 2712 nr_moved = 0;
2895 if (busiest->nr_running > 1) { 2713 if (busiest->nr_running > 1) {
@@ -2897,7 +2715,7 @@ redo:
2897 double_lock_balance(this_rq, busiest); 2715 double_lock_balance(this_rq, busiest);
2898 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2716 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2899 minus_1_or_zero(busiest->nr_running), 2717 minus_1_or_zero(busiest->nr_running),
2900 imbalance, sd, NEWLY_IDLE, NULL); 2718 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2901 spin_unlock(&busiest->lock); 2719 spin_unlock(&busiest->lock);
2902 2720
2903 if (!nr_moved) { 2721 if (!nr_moved) {
@@ -2908,7 +2726,7 @@ redo:
2908 } 2726 }
2909 2727
2910 if (!nr_moved) { 2728 if (!nr_moved) {
2911 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2729 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2912 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2730 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2913 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2731 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2914 return -1; 2732 return -1;
@@ -2918,7 +2736,7 @@ redo:
2918 return nr_moved; 2736 return nr_moved;
2919 2737
2920out_balanced: 2738out_balanced:
2921 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2739 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2922 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2740 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2923 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2741 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2924 return -1; 2742 return -1;
@@ -2934,28 +2752,33 @@ out_balanced:
2934static void idle_balance(int this_cpu, struct rq *this_rq) 2752static void idle_balance(int this_cpu, struct rq *this_rq)
2935{ 2753{
2936 struct sched_domain *sd; 2754 struct sched_domain *sd;
2937 int pulled_task = 0; 2755 int pulled_task = -1;
2938 unsigned long next_balance = jiffies + 60 * HZ; 2756 unsigned long next_balance = jiffies + HZ;
2939 2757
2940 for_each_domain(this_cpu, sd) { 2758 for_each_domain(this_cpu, sd) {
2941 if (sd->flags & SD_BALANCE_NEWIDLE) { 2759 unsigned long interval;
2760
2761 if (!(sd->flags & SD_LOAD_BALANCE))
2762 continue;
2763
2764 if (sd->flags & SD_BALANCE_NEWIDLE)
2942 /* If we've pulled tasks over stop searching: */ 2765 /* If we've pulled tasks over stop searching: */
2943 pulled_task = load_balance_newidle(this_cpu, 2766 pulled_task = load_balance_newidle(this_cpu,
2944 this_rq, sd); 2767 this_rq, sd);
2945 if (time_after(next_balance, 2768
2946 sd->last_balance + sd->balance_interval)) 2769 interval = msecs_to_jiffies(sd->balance_interval);
2947 next_balance = sd->last_balance 2770 if (time_after(next_balance, sd->last_balance + interval))
2948 + sd->balance_interval; 2771 next_balance = sd->last_balance + interval;
2949 if (pulled_task) 2772 if (pulled_task)
2950 break; 2773 break;
2951 }
2952 } 2774 }
2953 if (!pulled_task) 2775 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2954 /* 2776 /*
2955 * We are going idle. next_balance may be set based on 2777 * We are going idle. next_balance may be set based on
2956 * a busy processor. So reset next_balance. 2778 * a busy processor. So reset next_balance.
2957 */ 2779 */
2958 this_rq->next_balance = next_balance; 2780 this_rq->next_balance = next_balance;
2781 }
2959} 2782}
2960 2783
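idle_balance() now walks every domain that has SD_LOAD_BALANCE set, tracks the earliest last_balance + interval it encounters, and uses that to refresh rq->next_balance under the condition in the hunk above. The jiffies comparisons rely on wrap-safe signed subtraction, the same trick the kernel's time_after() uses; a small sketch of that helper and the earliest-deadline tracking (the tick values are arbitrary):

#include <stdio.h>

/* Wrap-safe "a is after b" for free-running tick counters, like time_after(). */
static int time_after_demo(unsigned long a, unsigned long b)
{
        return (long)(b - a) < 0;
}

int main(void)
{
        /* Three domains: each entry is last_balance + interval, in jiffies. */
        unsigned long candidates[] = { 10200, 10050, 10500 };
        unsigned long next_balance = 10000 + 1000;      /* jiffies + HZ default */
        int i;

        for (i = 0; i < 3; i++)
                if (time_after_demo(next_balance, candidates[i]))
                        next_balance = candidates[i];   /* keep the earliest */

        printf("next_balance = %lu\n", next_balance);   /* 10050 */

        /* The comparison still works across a counter wrap: */
        printf("wrap check: %d\n", time_after_demo(5UL, (unsigned long)-5));    /* 1 */
        return 0;
}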
2961/* 2784/*
@@ -2999,7 +2822,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2999 schedstat_inc(sd, alb_cnt); 2822 schedstat_inc(sd, alb_cnt);
3000 2823
3001 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, 2824 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
3002 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, 2825 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
3003 NULL)) 2826 NULL))
3004 schedstat_inc(sd, alb_pushed); 2827 schedstat_inc(sd, alb_pushed);
3005 else 2828 else
@@ -3008,32 +2831,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3008 spin_unlock(&target_rq->lock); 2831 spin_unlock(&target_rq->lock);
3009} 2832}
3010 2833
3011static void update_load(struct rq *this_rq)
3012{
3013 unsigned long this_load;
3014 unsigned int i, scale;
3015
3016 this_load = this_rq->raw_weighted_load;
3017
3018 /* Update our load: */
3019 for (i = 0, scale = 1; i < 3; i++, scale += scale) {
3020 unsigned long old_load, new_load;
3021
3022 /* scale is effectively 1 << i now, and >> i divides by scale */
3023
3024 old_load = this_rq->cpu_load[i];
3025 new_load = this_load;
3026 /*
3027 * Round up the averaging division if load is increasing. This
3028 * prevents us from getting stuck on 9 if the load is 10, for
3029 * example.
3030 */
3031 if (new_load > old_load)
3032 new_load += scale-1;
3033 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3034 }
3035}
3036
3037#ifdef CONFIG_NO_HZ 2834#ifdef CONFIG_NO_HZ
3038static struct { 2835static struct {
3039 atomic_t load_balancer; 2836 atomic_t load_balancer;
@@ -3116,7 +2913,7 @@ static DEFINE_SPINLOCK(balancing);
3116 * 2913 *
3117 * Balancing parameters are set up in arch_init_sched_domains. 2914 * Balancing parameters are set up in arch_init_sched_domains.
3118 */ 2915 */
3119static inline void rebalance_domains(int cpu, enum idle_type idle) 2916static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3120{ 2917{
3121 int balance = 1; 2918 int balance = 1;
3122 struct rq *rq = cpu_rq(cpu); 2919 struct rq *rq = cpu_rq(cpu);
@@ -3130,13 +2927,16 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3130 continue; 2927 continue;
3131 2928
3132 interval = sd->balance_interval; 2929 interval = sd->balance_interval;
3133 if (idle != SCHED_IDLE) 2930 if (idle != CPU_IDLE)
3134 interval *= sd->busy_factor; 2931 interval *= sd->busy_factor;
3135 2932
3136 /* scale ms to jiffies */ 2933 /* scale ms to jiffies */
3137 interval = msecs_to_jiffies(interval); 2934 interval = msecs_to_jiffies(interval);
3138 if (unlikely(!interval)) 2935 if (unlikely(!interval))
3139 interval = 1; 2936 interval = 1;
2937 if (interval > HZ*NR_CPUS/10)
2938 interval = HZ*NR_CPUS/10;
2939
3140 2940
3141 if (sd->flags & SD_SERIALIZE) { 2941 if (sd->flags & SD_SERIALIZE) {
3142 if (!spin_trylock(&balancing)) 2942 if (!spin_trylock(&balancing))
@@ -3150,7 +2950,7 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3150 * longer idle, or one of our SMT siblings is 2950 * longer idle, or one of our SMT siblings is
3151 * not idle. 2951 * not idle.
3152 */ 2952 */
3153 idle = NOT_IDLE; 2953 idle = CPU_NOT_IDLE;
3154 } 2954 }
3155 sd->last_balance = jiffies; 2955 sd->last_balance = jiffies;
3156 } 2956 }
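The interval handling in rebalance_domains() scales each domain's balance_interval by busy_factor when the CPU is not idle, converts milliseconds to jiffies, keeps the result at least one jiffy, and now also clamps it so a large busy_factor cannot postpone balancing indefinitely. A stand-alone version of just that calculation (HZ assumed to be 1000 and NR_CPUS to be 4 for the example; msecs_to_jiffies_demo() is a simplification of the real helper):

#include <stdio.h>

#define HZ      1000    /* assumed tick rate for the demo */
#define NR_CPUS 4       /* assumed CPU count for the demo */

static unsigned long msecs_to_jiffies_demo(unsigned long ms)
{
        return ms * HZ / 1000;  /* exact only because HZ == 1000 here */
}

/* interval_ms is the domain's balance_interval; the result is in jiffies. */
static unsigned long balance_interval_jiffies(unsigned long interval_ms,
                                              unsigned int busy_factor, int cpu_idle)
{
        unsigned long interval = interval_ms;

        if (!cpu_idle)
                interval *= busy_factor;        /* balance less often when busy */

        interval = msecs_to_jiffies_demo(interval);
        if (!interval)
                interval = 1;                   /* never drop to zero */
        if (interval > HZ * NR_CPUS / 10)
                interval = HZ * NR_CPUS / 10;   /* the newly added upper clamp */

        return interval;
}

int main(void)
{
        printf("idle: %lu jiffies\n", balance_interval_jiffies(8, 64, 1));      /* 8 */
        printf("busy: %lu jiffies\n", balance_interval_jiffies(8, 64, 0));      /* clamped to 400 */
        return 0;
}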
@@ -3178,11 +2978,12 @@ out:
3178 */ 2978 */
3179static void run_rebalance_domains(struct softirq_action *h) 2979static void run_rebalance_domains(struct softirq_action *h)
3180{ 2980{
3181 int local_cpu = smp_processor_id(); 2981 int this_cpu = smp_processor_id();
3182 struct rq *local_rq = cpu_rq(local_cpu); 2982 struct rq *this_rq = cpu_rq(this_cpu);
3183 enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; 2983 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2984 CPU_IDLE : CPU_NOT_IDLE;
3184 2985
3185 rebalance_domains(local_cpu, idle); 2986 rebalance_domains(this_cpu, idle);
3186 2987
3187#ifdef CONFIG_NO_HZ 2988#ifdef CONFIG_NO_HZ
3188 /* 2989 /*
@@ -3190,13 +2991,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3190 * balancing on behalf of the other idle cpus whose ticks are 2991 * balancing on behalf of the other idle cpus whose ticks are
3191 * stopped. 2992 * stopped.
3192 */ 2993 */
3193 if (local_rq->idle_at_tick && 2994 if (this_rq->idle_at_tick &&
3194 atomic_read(&nohz.load_balancer) == local_cpu) { 2995 atomic_read(&nohz.load_balancer) == this_cpu) {
3195 cpumask_t cpus = nohz.cpu_mask; 2996 cpumask_t cpus = nohz.cpu_mask;
3196 struct rq *rq; 2997 struct rq *rq;
3197 int balance_cpu; 2998 int balance_cpu;
3198 2999
3199 cpu_clear(local_cpu, cpus); 3000 cpu_clear(this_cpu, cpus);
3200 for_each_cpu_mask(balance_cpu, cpus) { 3001 for_each_cpu_mask(balance_cpu, cpus) {
3201 /* 3002 /*
3202 * If this cpu gets work to do, stop the load balancing 3003 * If this cpu gets work to do, stop the load balancing
@@ -3209,8 +3010,8 @@ static void run_rebalance_domains(struct softirq_action *h)
 3209 rebalance_domains(balance_cpu, SCHED_IDLE); 3010 rebalance_domains(balance_cpu, SCHED_IDLE);
3210 3011
3211 rq = cpu_rq(balance_cpu); 3012 rq = cpu_rq(balance_cpu);
3212 if (time_after(local_rq->next_balance, rq->next_balance)) 3013 if (time_after(this_rq->next_balance, rq->next_balance))
3213 local_rq->next_balance = rq->next_balance; 3014 this_rq->next_balance = rq->next_balance;
3214 } 3015 }
3215 } 3016 }
3216#endif 3017#endif
@@ -3223,9 +3024,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3223 * idle load balancing owner or decide to stop the periodic load balancing, 3024 * idle load balancing owner or decide to stop the periodic load balancing,
3224 * if the whole system is idle. 3025 * if the whole system is idle.
3225 */ 3026 */
3226static inline void trigger_load_balance(int cpu) 3027static inline void trigger_load_balance(struct rq *rq, int cpu)
3227{ 3028{
3228 struct rq *rq = cpu_rq(cpu);
3229#ifdef CONFIG_NO_HZ 3029#ifdef CONFIG_NO_HZ
3230 /* 3030 /*
3231 * If we were in the nohz mode recently and busy at the current 3031 * If we were in the nohz mode recently and busy at the current
@@ -3277,13 +3077,29 @@ static inline void trigger_load_balance(int cpu)
3277 if (time_after_eq(jiffies, rq->next_balance)) 3077 if (time_after_eq(jiffies, rq->next_balance))
3278 raise_softirq(SCHED_SOFTIRQ); 3078 raise_softirq(SCHED_SOFTIRQ);
3279} 3079}
3280#else 3080
3081#else /* CONFIG_SMP */
3082
3281/* 3083/*
3282 * on UP we do not need to balance between CPUs: 3084 * on UP we do not need to balance between CPUs:
3283 */ 3085 */
3284static inline void idle_balance(int cpu, struct rq *rq) 3086static inline void idle_balance(int cpu, struct rq *rq)
3285{ 3087{
3286} 3088}
3089
3090/* Avoid "used but not defined" warning on UP */
3091static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3092 unsigned long max_nr_move, unsigned long max_load_move,
3093 struct sched_domain *sd, enum cpu_idle_type idle,
3094 int *all_pinned, unsigned long *load_moved,
3095 int this_best_prio, int best_prio, int best_prio_seen,
3096 struct rq_iterator *iterator)
3097{
3098 *load_moved = 0;
3099
3100 return 0;
3101}
3102
3287#endif 3103#endif
3288 3104
3289DEFINE_PER_CPU(struct kernel_stat, kstat); 3105DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3291,54 +3107,28 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
3291EXPORT_PER_CPU_SYMBOL(kstat); 3107EXPORT_PER_CPU_SYMBOL(kstat);
3292 3108
3293/* 3109/*
3294 * This is called on clock ticks and on context switches. 3110 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3295 * Bank in p->sched_time the ns elapsed since the last tick or switch. 3111 * that have not yet been banked in case the task is currently running.
3296 */
3297static inline void
3298update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
3299{
3300 p->sched_time += now - p->last_ran;
3301 p->last_ran = rq->most_recent_timestamp = now;
3302}
3303
3304/*
3305 * Return current->sched_time plus any more ns on the sched_clock
3306 * that have not yet been banked.
3307 */ 3112 */
3308unsigned long long current_sched_time(const struct task_struct *p) 3113unsigned long long task_sched_runtime(struct task_struct *p)
3309{ 3114{
3310 unsigned long long ns;
3311 unsigned long flags; 3115 unsigned long flags;
3116 u64 ns, delta_exec;
3117 struct rq *rq;
3312 3118
3313 local_irq_save(flags); 3119 rq = task_rq_lock(p, &flags);
3314 ns = p->sched_time + sched_clock() - p->last_ran; 3120 ns = p->se.sum_exec_runtime;
3315 local_irq_restore(flags); 3121 if (rq->curr == p) {
3122 delta_exec = rq_clock(rq) - p->se.exec_start;
3123 if ((s64)delta_exec > 0)
3124 ns += delta_exec;
3125 }
3126 task_rq_unlock(rq, &flags);
3316 3127
3317 return ns; 3128 return ns;
3318} 3129}
3319 3130
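task_sched_runtime() replaces current_sched_time(): it returns the banked p->se.sum_exec_runtime plus, when the task is on a CPU right now, the not-yet-banked nanoseconds since se.exec_start. The same computation in isolation (u64 written as unsigned long long; the runqueue locking is omitted since the demo has nothing to race with):

#include <stdio.h>

struct demo_se { unsigned long long sum_exec_runtime, exec_start; };

/* now, exec_start and sum_exec_runtime are all in nanoseconds. */
static unsigned long long task_runtime_ns(const struct demo_se *se,
                                          int currently_running,
                                          unsigned long long now)
{
        unsigned long long ns = se->sum_exec_runtime;

        if (currently_running) {
                long long delta = (long long)(now - se->exec_start);

                if (delta > 0)          /* guard against clock skew, as the kernel does */
                        ns += (unsigned long long)delta;
        }
        return ns;
}

int main(void)
{
        struct demo_se se = { .sum_exec_runtime = 5000000, .exec_start = 12000000 };

        /* 5 ms already banked, plus 3 ms of the current (unbanked) run. */
        printf("%llu ns\n", task_runtime_ns(&se, 1, 15000000)); /* 8000000 */
        printf("%llu ns\n", task_runtime_ns(&se, 0, 15000000)); /* 5000000 */
        return 0;
}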
3320/* 3131/*
3321 * We place interactive tasks back into the active array, if possible.
3322 *
3323 * To guarantee that this does not starve expired tasks we ignore the
3324 * interactivity of a task if the first expired task had to wait more
3325 * than a 'reasonable' amount of time. This deadline timeout is
3326 * load-dependent, as the frequency of array switched decreases with
3327 * increasing number of running tasks. We also ignore the interactivity
3328 * if a better static_prio task has expired:
3329 */
3330static inline int expired_starving(struct rq *rq)
3331{
3332 if (rq->curr->static_prio > rq->best_expired_prio)
3333 return 1;
3334 if (!STARVATION_LIMIT || !rq->expired_timestamp)
3335 return 0;
3336 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
3337 return 1;
3338 return 0;
3339}
3340
3341/*
3342 * Account user cpu time to a process. 3132 * Account user cpu time to a process.
3343 * @p: the process that the cpu time gets accounted to 3133 * @p: the process that the cpu time gets accounted to
3344 * @hardirq_offset: the offset to subtract from hardirq_count() 3134 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3411,81 +3201,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3411 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3201 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3412} 3202}
3413 3203
3414static void task_running_tick(struct rq *rq, struct task_struct *p)
3415{
3416 if (p->array != rq->active) {
3417 /* Task has expired but was not scheduled yet */
3418 set_tsk_need_resched(p);
3419 return;
3420 }
3421 spin_lock(&rq->lock);
3422 /*
3423 * The task was running during this tick - update the
3424 * time slice counter. Note: we do not update a thread's
3425 * priority until it either goes to sleep or uses up its
3426 * timeslice. This makes it possible for interactive tasks
3427 * to use up their timeslices at their highest priority levels.
3428 */
3429 if (rt_task(p)) {
3430 /*
3431 * RR tasks need a special form of timeslice management.
3432 * FIFO tasks have no timeslices.
3433 */
3434 if ((p->policy == SCHED_RR) && !--p->time_slice) {
3435 p->time_slice = task_timeslice(p);
3436 p->first_time_slice = 0;
3437 set_tsk_need_resched(p);
3438
3439 /* put it at the end of the queue: */
3440 requeue_task(p, rq->active);
3441 }
3442 goto out_unlock;
3443 }
3444 if (!--p->time_slice) {
3445 dequeue_task(p, rq->active);
3446 set_tsk_need_resched(p);
3447 p->prio = effective_prio(p);
3448 p->time_slice = task_timeslice(p);
3449 p->first_time_slice = 0;
3450
3451 if (!rq->expired_timestamp)
3452 rq->expired_timestamp = jiffies;
3453 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
3454 enqueue_task(p, rq->expired);
3455 if (p->static_prio < rq->best_expired_prio)
3456 rq->best_expired_prio = p->static_prio;
3457 } else
3458 enqueue_task(p, rq->active);
3459 } else {
3460 /*
3461 * Prevent a too long timeslice allowing a task to monopolize
3462 * the CPU. We do this by splitting up the timeslice into
3463 * smaller pieces.
3464 *
3465 * Note: this does not mean the task's timeslices expire or
3466 * get lost in any way, they just might be preempted by
3467 * another task of equal priority. (one with higher
3468 * priority would have preempted this task already.) We
3469 * requeue this task to the end of the list on this priority
3470 * level, which is in essence a round-robin of tasks with
3471 * equal priority.
3472 *
3473 * This only applies to tasks in the interactive
3474 * delta range with at least TIMESLICE_GRANULARITY to requeue.
3475 */
3476 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
3477 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
3478 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
3479 (p->array == rq->active)) {
3480
3481 requeue_task(p, rq->active);
3482 set_tsk_need_resched(p);
3483 }
3484 }
3485out_unlock:
3486 spin_unlock(&rq->lock);
3487}
3488
3489/* 3204/*
3490 * This function gets called by the timer code, with HZ frequency. 3205 * This function gets called by the timer code, with HZ frequency.
3491 * We call it with interrupts disabled. 3206 * We call it with interrupts disabled.
@@ -3495,20 +3210,19 @@ out_unlock:
3495 */ 3210 */
3496void scheduler_tick(void) 3211void scheduler_tick(void)
3497{ 3212{
3498 unsigned long long now = sched_clock();
3499 struct task_struct *p = current;
3500 int cpu = smp_processor_id(); 3213 int cpu = smp_processor_id();
3501 int idle_at_tick = idle_cpu(cpu);
3502 struct rq *rq = cpu_rq(cpu); 3214 struct rq *rq = cpu_rq(cpu);
3215 struct task_struct *curr = rq->curr;
3503 3216
3504 update_cpu_clock(p, rq, now); 3217 spin_lock(&rq->lock);
3218 if (curr != rq->idle) /* FIXME: needed? */
3219 curr->sched_class->task_tick(rq, curr);
3220 update_cpu_load(rq);
3221 spin_unlock(&rq->lock);
3505 3222
3506 if (!idle_at_tick)
3507 task_running_tick(rq, p);
3508#ifdef CONFIG_SMP 3223#ifdef CONFIG_SMP
3509 update_load(rq); 3224 rq->idle_at_tick = idle_cpu(cpu);
3510 rq->idle_at_tick = idle_at_tick; 3225 trigger_load_balance(rq, cpu);
3511 trigger_load_balance(cpu);
3512#endif 3226#endif
3513} 3227}
3514 3228
@@ -3550,170 +3264,129 @@ EXPORT_SYMBOL(sub_preempt_count);
3550 3264
3551#endif 3265#endif
3552 3266
3553static inline int interactive_sleep(enum sleep_type sleep_type) 3267/*
3268 * Print scheduling while atomic bug:
3269 */
3270static noinline void __schedule_bug(struct task_struct *prev)
3554{ 3271{
3555 return (sleep_type == SLEEP_INTERACTIVE || 3272 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3556 sleep_type == SLEEP_INTERRUPTED); 3273 prev->comm, preempt_count(), prev->pid);
3274 debug_show_held_locks(prev);
3275 if (irqs_disabled())
3276 print_irqtrace_events(prev);
3277 dump_stack();
3557} 3278}
3558 3279
3559/* 3280/*
3560 * schedule() is the main scheduler function. 3281 * Various schedule()-time debugging checks and statistics:
3561 */ 3282 */
3562asmlinkage void __sched schedule(void) 3283static inline void schedule_debug(struct task_struct *prev)
3563{ 3284{
3564 struct task_struct *prev, *next;
3565 struct prio_array *array;
3566 struct list_head *queue;
3567 unsigned long long now;
3568 unsigned long run_time;
3569 int cpu, idx, new_prio;
3570 long *switch_count;
3571 struct rq *rq;
3572
3573 /* 3285 /*
3574 * Test if we are atomic. Since do_exit() needs to call into 3286 * Test if we are atomic. Since do_exit() needs to call into
3575 * schedule() atomically, we ignore that path for now. 3287 * schedule() atomically, we ignore that path for now.
3576 * Otherwise, whine if we are scheduling when we should not be. 3288 * Otherwise, whine if we are scheduling when we should not be.
3577 */ 3289 */
3578 if (unlikely(in_atomic() && !current->exit_state)) { 3290 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3579 printk(KERN_ERR "BUG: scheduling while atomic: " 3291 __schedule_bug(prev);
3580 "%s/0x%08x/%d\n",
3581 current->comm, preempt_count(), current->pid);
3582 debug_show_held_locks(current);
3583 if (irqs_disabled())
3584 print_irqtrace_events(current);
3585 dump_stack();
3586 }
3587 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3588 3292
3589need_resched: 3293 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3590 preempt_disable();
3591 prev = current;
3592 release_kernel_lock(prev);
3593need_resched_nonpreemptible:
3594 rq = this_rq();
3595 3294
3596 /* 3295 schedstat_inc(this_rq(), sched_cnt);
3597 * The idle thread is not allowed to schedule! 3296}
3598 * Remove this check after it has been exercised a bit.
3599 */
3600 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
3601 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
3602 dump_stack();
3603 }
3604 3297
3605 schedstat_inc(rq, sched_cnt); 3298/*
3606 now = sched_clock(); 3299 * Pick up the highest-prio task:
3607 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 3300 */
3608 run_time = now - prev->timestamp; 3301static inline struct task_struct *
3609 if (unlikely((long long)(now - prev->timestamp) < 0)) 3302pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3610 run_time = 0; 3303{
3611 } else 3304 struct sched_class *class;
3612 run_time = NS_MAX_SLEEP_AVG; 3305 struct task_struct *p;
3613 3306
3614 /* 3307 /*
3615 * Tasks charged proportionately less run_time at high sleep_avg to 3308 * Optimization: we know that if all tasks are in
3616 * delay them losing their interactive status 3309 * the fair class we can call that function directly:
3617 */ 3310 */
3618 run_time /= (CURRENT_BONUS(prev) ? : 1); 3311 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3619 3312 p = fair_sched_class.pick_next_task(rq, now);
3620 spin_lock_irq(&rq->lock); 3313 if (likely(p))
3621 3314 return p;
3622 switch_count = &prev->nivcsw;
3623 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3624 switch_count = &prev->nvcsw;
3625 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3626 unlikely(signal_pending(prev))))
3627 prev->state = TASK_RUNNING;
3628 else {
3629 if (prev->state == TASK_UNINTERRUPTIBLE)
3630 rq->nr_uninterruptible++;
3631 deactivate_task(prev, rq);
3632 }
3633 }
3634
3635 cpu = smp_processor_id();
3636 if (unlikely(!rq->nr_running)) {
3637 idle_balance(cpu, rq);
3638 if (!rq->nr_running) {
3639 next = rq->idle;
3640 rq->expired_timestamp = 0;
3641 goto switch_tasks;
3642 }
3643 } 3315 }
3644 3316
3645 array = rq->active; 3317 class = sched_class_highest;
3646 if (unlikely(!array->nr_active)) { 3318 for ( ; ; ) {
3319 p = class->pick_next_task(rq, now);
3320 if (p)
3321 return p;
3647 /* 3322 /*
3648 * Switch the active and expired arrays. 3323 * Will never be NULL as the idle class always
3324 * returns a non-NULL p:
3649 */ 3325 */
3650 schedstat_inc(rq, sched_switch); 3326 class = class->next;
3651 rq->active = rq->expired;
3652 rq->expired = array;
3653 array = rq->active;
3654 rq->expired_timestamp = 0;
3655 rq->best_expired_prio = MAX_PRIO;
3656 } 3327 }
3328}
3657 3329
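pick_next_task() replaces the old priority-bitmap scan: when every runnable task belongs to the fair class it asks that class directly, otherwise it walks the class list from the highest class down and takes the first task offered; the idle class at the tail always returns one, so the walk terminates. A compact model of that selection (stub pick functions and invented class names, not the kernel structures):

#include <stdio.h>

struct demo_class {
        const char *name;
        const struct demo_class *next;
        const char *(*pick_next)(void); /* returns a "task" name or NULL */
};

static const char *rt_pick(void)   { return NULL; }            /* no RT task runnable */
static const char *fair_pick(void) { return "cfs-task"; }
static const char *idle_pick(void) { return "swapper"; }       /* never returns NULL */

static const struct demo_class idle_class = { "idle", NULL,        idle_pick };
static const struct demo_class fair_class = { "fair", &idle_class, fair_pick };
static const struct demo_class rt_class   = { "rt",   &fair_class, rt_pick };

static const char *pick_next_task_demo(int nr_running, int cfs_nr_running)
{
        const struct demo_class *class;
        const char *p;

        /* Fast path: everything runnable is in the fair class. */
        if (nr_running == cfs_nr_running) {
                p = fair_class.pick_next();
                if (p)
                        return p;
        }

        for (class = &rt_class; ; class = class->next) {
                p = class->pick_next();
                if (p)
                        return p;       /* the idle class guarantees we get here */
        }
}

int main(void)
{
        printf("picked: %s\n", pick_next_task_demo(3, 3));      /* fast path */
        printf("picked: %s\n", pick_next_task_demo(2, 1));      /* walks rt -> fair */
        return 0;
}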
3658 idx = sched_find_first_bit(array->bitmap); 3330/*
3659 queue = array->queue + idx; 3331 * schedule() is the main scheduler function.
3660 next = list_entry(queue->next, struct task_struct, run_list); 3332 */
3333asmlinkage void __sched schedule(void)
3334{
3335 struct task_struct *prev, *next;
3336 long *switch_count;
3337 struct rq *rq;
3338 u64 now;
3339 int cpu;
3661 3340
3662 if (!rt_task(next) && interactive_sleep(next->sleep_type)) { 3341need_resched:
3663 unsigned long long delta = now - next->timestamp; 3342 preempt_disable();
3664 if (unlikely((long long)(now - next->timestamp) < 0)) 3343 cpu = smp_processor_id();
3665 delta = 0; 3344 rq = cpu_rq(cpu);
3345 rcu_qsctr_inc(cpu);
3346 prev = rq->curr;
3347 switch_count = &prev->nivcsw;
3666 3348
3667 if (next->sleep_type == SLEEP_INTERACTIVE) 3349 release_kernel_lock(prev);
3668 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 3350need_resched_nonpreemptible:
3669 3351
3670 array = next->array; 3352 schedule_debug(prev);
3671 new_prio = recalc_task_prio(next, next->timestamp + delta);
3672 3353
3673 if (unlikely(next->prio != new_prio)) { 3354 spin_lock_irq(&rq->lock);
3674 dequeue_task(next, array); 3355 clear_tsk_need_resched(prev);
3675 next->prio = new_prio; 3356
3676 enqueue_task(next, array); 3357 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3358 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3359 unlikely(signal_pending(prev)))) {
3360 prev->state = TASK_RUNNING;
3361 } else {
3362 deactivate_task(rq, prev, 1);
3677 } 3363 }
3364 switch_count = &prev->nvcsw;
3678 } 3365 }
3679 next->sleep_type = SLEEP_NORMAL;
3680switch_tasks:
3681 if (next == rq->idle)
3682 schedstat_inc(rq, sched_goidle);
3683 prefetch(next);
3684 prefetch_stack(next);
3685 clear_tsk_need_resched(prev);
3686 rcu_qsctr_inc(task_cpu(prev));
3687 3366
3688 update_cpu_clock(prev, rq, now); 3367 if (unlikely(!rq->nr_running))
3368 idle_balance(cpu, rq);
3689 3369
3690 prev->sleep_avg -= run_time; 3370 now = __rq_clock(rq);
3691 if ((long)prev->sleep_avg <= 0) 3371 prev->sched_class->put_prev_task(rq, prev, now);
3692 prev->sleep_avg = 0; 3372 next = pick_next_task(rq, prev, now);
3693 prev->timestamp = prev->last_ran = now;
3694 3373
3695 sched_info_switch(prev, next); 3374 sched_info_switch(prev, next);
3375
3696 if (likely(prev != next)) { 3376 if (likely(prev != next)) {
3697 next->timestamp = next->last_ran = now;
3698 rq->nr_switches++; 3377 rq->nr_switches++;
3699 rq->curr = next; 3378 rq->curr = next;
3700 ++*switch_count; 3379 ++*switch_count;
3701 3380
3702 prepare_task_switch(rq, next); 3381 context_switch(rq, prev, next); /* unlocks the rq */
3703 prev = context_switch(rq, prev, next);
3704 barrier();
3705 /*
3706 * this_rq must be evaluated again because prev may have moved
3707 * CPUs since it called schedule(), thus the 'rq' on its stack
3708 * frame will be invalid.
3709 */
3710 finish_task_switch(this_rq(), prev);
3711 } else 3382 } else
3712 spin_unlock_irq(&rq->lock); 3383 spin_unlock_irq(&rq->lock);
3713 3384
3714 prev = current; 3385 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3715 if (unlikely(reacquire_kernel_lock(prev) < 0)) 3386 cpu = smp_processor_id();
3387 rq = cpu_rq(cpu);
3716 goto need_resched_nonpreemptible; 3388 goto need_resched_nonpreemptible;
3389 }
3717 preempt_enable_no_resched(); 3390 preempt_enable_no_resched();
3718 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3391 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3719 goto need_resched; 3392 goto need_resched;
@@ -4041,74 +3714,85 @@ out:
4041} 3714}
4042EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3715EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4043 3716
4044 3717static inline void
4045#define SLEEP_ON_VAR \ 3718sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4046 unsigned long flags; \ 3719{
4047 wait_queue_t wait; \ 3720 spin_lock_irqsave(&q->lock, *flags);
4048 init_waitqueue_entry(&wait, current); 3721 __add_wait_queue(q, wait);
4049
4050#define SLEEP_ON_HEAD \
4051 spin_lock_irqsave(&q->lock,flags); \
4052 __add_wait_queue(q, &wait); \
4053 spin_unlock(&q->lock); 3722 spin_unlock(&q->lock);
3723}
4054 3724
4055#define SLEEP_ON_TAIL \ 3725static inline void
4056 spin_lock_irq(&q->lock); \ 3726sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4057 __remove_wait_queue(q, &wait); \ 3727{
4058 spin_unlock_irqrestore(&q->lock, flags); 3728 spin_lock_irq(&q->lock);
3729 __remove_wait_queue(q, wait);
3730 spin_unlock_irqrestore(&q->lock, *flags);
3731}
4059 3732
4060void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3733void __sched interruptible_sleep_on(wait_queue_head_t *q)
4061{ 3734{
4062 SLEEP_ON_VAR 3735 unsigned long flags;
3736 wait_queue_t wait;
3737
3738 init_waitqueue_entry(&wait, current);
4063 3739
4064 current->state = TASK_INTERRUPTIBLE; 3740 current->state = TASK_INTERRUPTIBLE;
4065 3741
4066 SLEEP_ON_HEAD 3742 sleep_on_head(q, &wait, &flags);
4067 schedule(); 3743 schedule();
4068 SLEEP_ON_TAIL 3744 sleep_on_tail(q, &wait, &flags);
4069} 3745}
4070EXPORT_SYMBOL(interruptible_sleep_on); 3746EXPORT_SYMBOL(interruptible_sleep_on);
4071 3747
4072long fastcall __sched 3748long __sched
4073interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3749interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4074{ 3750{
4075 SLEEP_ON_VAR 3751 unsigned long flags;
3752 wait_queue_t wait;
3753
3754 init_waitqueue_entry(&wait, current);
4076 3755
4077 current->state = TASK_INTERRUPTIBLE; 3756 current->state = TASK_INTERRUPTIBLE;
4078 3757
4079 SLEEP_ON_HEAD 3758 sleep_on_head(q, &wait, &flags);
4080 timeout = schedule_timeout(timeout); 3759 timeout = schedule_timeout(timeout);
4081 SLEEP_ON_TAIL 3760 sleep_on_tail(q, &wait, &flags);
4082 3761
4083 return timeout; 3762 return timeout;
4084} 3763}
4085EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3764EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4086 3765
4087void fastcall __sched sleep_on(wait_queue_head_t *q) 3766void __sched sleep_on(wait_queue_head_t *q)
4088{ 3767{
4089 SLEEP_ON_VAR 3768 unsigned long flags;
3769 wait_queue_t wait;
3770
3771 init_waitqueue_entry(&wait, current);
4090 3772
4091 current->state = TASK_UNINTERRUPTIBLE; 3773 current->state = TASK_UNINTERRUPTIBLE;
4092 3774
4093 SLEEP_ON_HEAD 3775 sleep_on_head(q, &wait, &flags);
4094 schedule(); 3776 schedule();
4095 SLEEP_ON_TAIL 3777 sleep_on_tail(q, &wait, &flags);
4096} 3778}
4097EXPORT_SYMBOL(sleep_on); 3779EXPORT_SYMBOL(sleep_on);
4098 3780
4099long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3781long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4100{ 3782{
4101 SLEEP_ON_VAR 3783 unsigned long flags;
3784 wait_queue_t wait;
3785
3786 init_waitqueue_entry(&wait, current);
4102 3787
4103 current->state = TASK_UNINTERRUPTIBLE; 3788 current->state = TASK_UNINTERRUPTIBLE;
4104 3789
4105 SLEEP_ON_HEAD 3790 sleep_on_head(q, &wait, &flags);
4106 timeout = schedule_timeout(timeout); 3791 timeout = schedule_timeout(timeout);
4107 SLEEP_ON_TAIL 3792 sleep_on_tail(q, &wait, &flags);
4108 3793
4109 return timeout; 3794 return timeout;
4110} 3795}
4111
4112EXPORT_SYMBOL(sleep_on_timeout); 3796EXPORT_SYMBOL(sleep_on_timeout);
4113 3797
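The hunk above replaces the SLEEP_ON_VAR/SLEEP_ON_HEAD/SLEEP_ON_TAIL statement macros with the sleep_on_head()/sleep_on_tail() static inline helpers. A minimal userspace analogue of that refactor, not part of the patch (toy structures, a pthread mutex standing in for the wait-queue spinlock, every name invented), shows what the inline form buys: typed, checked arguments and no variables declared behind the caller's back.

#include <pthread.h>
#include <stdio.h>

struct waiter { struct waiter *next; int id; };
struct waitq  { pthread_mutex_t lock; struct waiter *head; };

/* old style: a statement macro, expanded textually at each call site */
#define WQ_ADD(q, w)                                    \
        do {                                            \
                pthread_mutex_lock(&(q)->lock);         \
                (w)->next = (q)->head;                  \
                (q)->head = (w);                        \
                pthread_mutex_unlock(&(q)->lock);       \
        } while (0)

/* new style: a static inline helper with typed arguments */
static inline void wq_add(struct waitq *q, struct waiter *w)
{
        pthread_mutex_lock(&q->lock);
        w->next = q->head;
        q->head = w;
        pthread_mutex_unlock(&q->lock);
}

int main(void)
{
        struct waitq q = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct waiter a = { NULL, 1 }, b = { NULL, 2 };

        WQ_ADD(&q, &a);         /* works, but 'q' and 'w' are unchecked text */
        wq_add(&q, &b);         /* same behaviour, with real type checking */
        printf("queue head is waiter %d\n", q.head->id);
        return 0;
}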
4114#ifdef CONFIG_RT_MUTEXES 3798#ifdef CONFIG_RT_MUTEXES
@@ -4125,29 +3809,30 @@ EXPORT_SYMBOL(sleep_on_timeout);
4125 */ 3809 */
4126void rt_mutex_setprio(struct task_struct *p, int prio) 3810void rt_mutex_setprio(struct task_struct *p, int prio)
4127{ 3811{
4128 struct prio_array *array;
4129 unsigned long flags; 3812 unsigned long flags;
3813 int oldprio, on_rq;
4130 struct rq *rq; 3814 struct rq *rq;
4131 int oldprio; 3815 u64 now;
4132 3816
4133 BUG_ON(prio < 0 || prio > MAX_PRIO); 3817 BUG_ON(prio < 0 || prio > MAX_PRIO);
4134 3818
4135 rq = task_rq_lock(p, &flags); 3819 rq = task_rq_lock(p, &flags);
3820 now = rq_clock(rq);
4136 3821
4137 oldprio = p->prio; 3822 oldprio = p->prio;
4138 array = p->array; 3823 on_rq = p->se.on_rq;
4139 if (array) 3824 if (on_rq)
4140 dequeue_task(p, array); 3825 dequeue_task(rq, p, 0, now);
3826
3827 if (rt_prio(prio))
3828 p->sched_class = &rt_sched_class;
3829 else
3830 p->sched_class = &fair_sched_class;
3831
4141 p->prio = prio; 3832 p->prio = prio;
4142 3833
4143 if (array) { 3834 if (on_rq) {
4144 /* 3835 enqueue_task(rq, p, 0, now);
4145 * If changing to an RT priority then queue it
4146 * in the active array!
4147 */
4148 if (rt_task(p))
4149 array = rq->active;
4150 enqueue_task(p, array);
4151 /* 3836 /*
4152 * Reschedule if we are currently running on this runqueue and 3837 * Reschedule if we are currently running on this runqueue and
4153 * our priority decreased, or if we are not currently running on 3838 * our priority decreased, or if we are not currently running on
@@ -4156,8 +3841,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4156 if (task_running(rq, p)) { 3841 if (task_running(rq, p)) {
4157 if (p->prio > oldprio) 3842 if (p->prio > oldprio)
4158 resched_task(rq->curr); 3843 resched_task(rq->curr);
4159 } else if (TASK_PREEMPTS_CURR(p, rq)) 3844 } else {
4160 resched_task(rq->curr); 3845 check_preempt_curr(rq, p);
3846 }
4161 } 3847 }
4162 task_rq_unlock(rq, &flags); 3848 task_rq_unlock(rq, &flags);
4163} 3849}
@@ -4166,10 +3852,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4166 3852
4167void set_user_nice(struct task_struct *p, long nice) 3853void set_user_nice(struct task_struct *p, long nice)
4168{ 3854{
4169 struct prio_array *array; 3855 int old_prio, delta, on_rq;
4170 int old_prio, delta;
4171 unsigned long flags; 3856 unsigned long flags;
4172 struct rq *rq; 3857 struct rq *rq;
3858 u64 now;
4173 3859
4174 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3860 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4175 return; 3861 return;
@@ -4178,20 +3864,21 @@ void set_user_nice(struct task_struct *p, long nice)
4178 * the task might be in the middle of scheduling on another CPU. 3864 * the task might be in the middle of scheduling on another CPU.
4179 */ 3865 */
4180 rq = task_rq_lock(p, &flags); 3866 rq = task_rq_lock(p, &flags);
3867 now = rq_clock(rq);
4181 /* 3868 /*
4182 * The RT priorities are set via sched_setscheduler(), but we still 3869 * The RT priorities are set via sched_setscheduler(), but we still
4183 * allow the 'normal' nice value to be set - but as expected 3870 * allow the 'normal' nice value to be set - but as expected
4184 * it wont have any effect on scheduling until the task is 3871 * it wont have any effect on scheduling until the task is
4185 * not SCHED_NORMAL/SCHED_BATCH: 3872 * SCHED_FIFO/SCHED_RR:
4186 */ 3873 */
4187 if (has_rt_policy(p)) { 3874 if (task_has_rt_policy(p)) {
4188 p->static_prio = NICE_TO_PRIO(nice); 3875 p->static_prio = NICE_TO_PRIO(nice);
4189 goto out_unlock; 3876 goto out_unlock;
4190 } 3877 }
4191 array = p->array; 3878 on_rq = p->se.on_rq;
4192 if (array) { 3879 if (on_rq) {
4193 dequeue_task(p, array); 3880 dequeue_task(rq, p, 0, now);
4194 dec_raw_weighted_load(rq, p); 3881 dec_load(rq, p, now);
4195 } 3882 }
4196 3883
4197 p->static_prio = NICE_TO_PRIO(nice); 3884 p->static_prio = NICE_TO_PRIO(nice);
@@ -4200,9 +3887,9 @@ void set_user_nice(struct task_struct *p, long nice)
4200 p->prio = effective_prio(p); 3887 p->prio = effective_prio(p);
4201 delta = p->prio - old_prio; 3888 delta = p->prio - old_prio;
4202 3889
4203 if (array) { 3890 if (on_rq) {
4204 enqueue_task(p, array); 3891 enqueue_task(rq, p, 0, now);
4205 inc_raw_weighted_load(rq, p); 3892 inc_load(rq, p, now);
4206 /* 3893 /*
4207 * If the task increased its priority or is running and 3894 * If the task increased its priority or is running and
4208 * lowered its priority, then reschedule its CPU: 3895 * lowered its priority, then reschedule its CPU:
@@ -4322,20 +4009,28 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
4322} 4009}
4323 4010
4324/* Actually do priority change: must hold rq lock. */ 4011/* Actually do priority change: must hold rq lock. */
4325static void __setscheduler(struct task_struct *p, int policy, int prio) 4012static void
4013__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4326{ 4014{
4327 BUG_ON(p->array); 4015 BUG_ON(p->se.on_rq);
4328 4016
4329 p->policy = policy; 4017 p->policy = policy;
4018 switch (p->policy) {
4019 case SCHED_NORMAL:
4020 case SCHED_BATCH:
4021 case SCHED_IDLE:
4022 p->sched_class = &fair_sched_class;
4023 break;
4024 case SCHED_FIFO:
4025 case SCHED_RR:
4026 p->sched_class = &rt_sched_class;
4027 break;
4028 }
4029
4330 p->rt_priority = prio; 4030 p->rt_priority = prio;
4331 p->normal_prio = normal_prio(p); 4031 p->normal_prio = normal_prio(p);
4332 /* we are holding p->pi_lock already */ 4032 /* we are holding p->pi_lock already */
4333 p->prio = rt_mutex_getprio(p); 4033 p->prio = rt_mutex_getprio(p);
4334 /*
4335 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
4336 */
4337 if (policy == SCHED_BATCH)
4338 p->sleep_avg = 0;
4339 set_load_weight(p); 4034 set_load_weight(p);
4340} 4035}
4341 4036
@@ -4350,8 +4045,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
4350int sched_setscheduler(struct task_struct *p, int policy, 4045int sched_setscheduler(struct task_struct *p, int policy,
4351 struct sched_param *param) 4046 struct sched_param *param)
4352{ 4047{
4353 int retval, oldprio, oldpolicy = -1; 4048 int retval, oldprio, oldpolicy = -1, on_rq;
4354 struct prio_array *array;
4355 unsigned long flags; 4049 unsigned long flags;
4356 struct rq *rq; 4050 struct rq *rq;
4357 4051
@@ -4362,27 +4056,27 @@ recheck:
4362 if (policy < 0) 4056 if (policy < 0)
4363 policy = oldpolicy = p->policy; 4057 policy = oldpolicy = p->policy;
4364 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4058 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4365 policy != SCHED_NORMAL && policy != SCHED_BATCH) 4059 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4060 policy != SCHED_IDLE)
4366 return -EINVAL; 4061 return -EINVAL;
4367 /* 4062 /*
4368 * Valid priorities for SCHED_FIFO and SCHED_RR are 4063 * Valid priorities for SCHED_FIFO and SCHED_RR are
4369 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 4064 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4370 * SCHED_BATCH is 0. 4065 * SCHED_BATCH and SCHED_IDLE is 0.
4371 */ 4066 */
4372 if (param->sched_priority < 0 || 4067 if (param->sched_priority < 0 ||
4373 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4068 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4374 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4069 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4375 return -EINVAL; 4070 return -EINVAL;
4376 if (is_rt_policy(policy) != (param->sched_priority != 0)) 4071 if (rt_policy(policy) != (param->sched_priority != 0))
4377 return -EINVAL; 4072 return -EINVAL;
4378 4073
4379 /* 4074 /*
4380 * Allow unprivileged RT tasks to decrease priority: 4075 * Allow unprivileged RT tasks to decrease priority:
4381 */ 4076 */
4382 if (!capable(CAP_SYS_NICE)) { 4077 if (!capable(CAP_SYS_NICE)) {
4383 if (is_rt_policy(policy)) { 4078 if (rt_policy(policy)) {
4384 unsigned long rlim_rtprio; 4079 unsigned long rlim_rtprio;
4385 unsigned long flags;
4386 4080
4387 if (!lock_task_sighand(p, &flags)) 4081 if (!lock_task_sighand(p, &flags))
4388 return -ESRCH; 4082 return -ESRCH;
@@ -4398,6 +4092,12 @@ recheck:
4398 param->sched_priority > rlim_rtprio) 4092 param->sched_priority > rlim_rtprio)
4399 return -EPERM; 4093 return -EPERM;
4400 } 4094 }
4095 /*
4096 * Like positive nice levels, dont allow tasks to
4097 * move out of SCHED_IDLE either:
4098 */
4099 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4100 return -EPERM;
4401 4101
4402 /* can't change other user's priorities */ 4102 /* can't change other user's priorities */
4403 if ((current->euid != p->euid) && 4103 if ((current->euid != p->euid) &&
@@ -4425,13 +4125,13 @@ recheck:
4425 spin_unlock_irqrestore(&p->pi_lock, flags); 4125 spin_unlock_irqrestore(&p->pi_lock, flags);
4426 goto recheck; 4126 goto recheck;
4427 } 4127 }
4428 array = p->array; 4128 on_rq = p->se.on_rq;
4429 if (array) 4129 if (on_rq)
4430 deactivate_task(p, rq); 4130 deactivate_task(rq, p, 0);
4431 oldprio = p->prio; 4131 oldprio = p->prio;
4432 __setscheduler(p, policy, param->sched_priority); 4132 __setscheduler(rq, p, policy, param->sched_priority);
4433 if (array) { 4133 if (on_rq) {
4434 __activate_task(p, rq); 4134 activate_task(rq, p, 0);
4435 /* 4135 /*
4436 * Reschedule if we are currently running on this runqueue and 4136 * Reschedule if we are currently running on this runqueue and
4437 * our priority decreased, or if we are not currently running on 4137 * our priority decreased, or if we are not currently running on
@@ -4440,8 +4140,9 @@ recheck:
4440 if (task_running(rq, p)) { 4140 if (task_running(rq, p)) {
4441 if (p->prio > oldprio) 4141 if (p->prio > oldprio)
4442 resched_task(rq->curr); 4142 resched_task(rq->curr);
4443 } else if (TASK_PREEMPTS_CURR(p, rq)) 4143 } else {
4444 resched_task(rq->curr); 4144 check_preempt_curr(rq, p);
4145 }
4445 } 4146 }
4446 __task_rq_unlock(rq); 4147 __task_rq_unlock(rq);
4447 spin_unlock_irqrestore(&p->pi_lock, flags); 4148 spin_unlock_irqrestore(&p->pi_lock, flags);
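Once the per-task priority arrays are gone, sched_setscheduler() above, like rt_mutex_setprio() and set_user_nice() earlier in the patch, follows one uniform shape: record se.on_rq, deactivate the task if it was queued, change the attribute, re-activate it, then either resched_task() the current task or run check_preempt_curr(). A toy single-CPU userspace model of that shape, purely illustrative (a sorted list stands in for the runqueue; every name below is invented):

#include <stdio.h>

struct toy_task { int prio; int on_rq; struct toy_task *next; };

static struct toy_task *runqueue;       /* sorted by prio, lowest value first */

static void enqueue(struct toy_task *p)
{
        struct toy_task **pp = &runqueue;

        while (*pp && (*pp)->prio <= p->prio)
                pp = &(*pp)->next;
        p->next = *pp;
        *pp = p;
        p->on_rq = 1;
}

static void dequeue(struct toy_task *p)
{
        struct toy_task **pp = &runqueue;

        while (*pp && *pp != p)
                pp = &(*pp)->next;
        if (*pp)
                *pp = p->next;
        p->on_rq = 0;
}

/* the sched_setscheduler()-style shape: dequeue, change, re-enqueue, check */
static void set_prio(struct toy_task *p, int prio)
{
        int on_rq = p->on_rq;

        if (on_rq)
                dequeue(p);
        p->prio = prio;
        if (on_rq) {
                enqueue(p);
                if (runqueue == p)      /* stand-in for check_preempt_curr() */
                        printf("prio %d task would preempt now\n", p->prio);
        }
}

int main(void)
{
        struct toy_task a = { 10, 0, NULL }, b = { 20, 0, NULL };

        enqueue(&a);
        enqueue(&b);
        set_prio(&b, 5);                /* b jumps ahead of a */
        printf("front of runqueue: prio %d\n", runqueue->prio);
        return 0;
}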
@@ -4713,41 +4414,18 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4713/** 4414/**
4714 * sys_sched_yield - yield the current processor to other threads. 4415 * sys_sched_yield - yield the current processor to other threads.
4715 * 4416 *
4716 * This function yields the current CPU by moving the calling thread 4417 * This function yields the current CPU to other tasks. If there are no
4717 * to the expired array. If there are no other threads running on this 4418 * other threads running on this CPU then this function will return.
4718 * CPU then this function will return.
4719 */ 4419 */
4720asmlinkage long sys_sched_yield(void) 4420asmlinkage long sys_sched_yield(void)
4721{ 4421{
4722 struct rq *rq = this_rq_lock(); 4422 struct rq *rq = this_rq_lock();
4723 struct prio_array *array = current->array, *target = rq->expired;
4724 4423
4725 schedstat_inc(rq, yld_cnt); 4424 schedstat_inc(rq, yld_cnt);
4726 /* 4425 if (unlikely(rq->nr_running == 1))
4727 * We implement yielding by moving the task into the expired
4728 * queue.
4729 *
4730 * (special rule: RT tasks will just roundrobin in the active
4731 * array.)
4732 */
4733 if (rt_task(current))
4734 target = rq->active;
4735
4736 if (array->nr_active == 1) {
4737 schedstat_inc(rq, yld_act_empty); 4426 schedstat_inc(rq, yld_act_empty);
4738 if (!rq->expired->nr_active) 4427 else
4739 schedstat_inc(rq, yld_both_empty); 4428 current->sched_class->yield_task(rq, current);
4740 } else if (!rq->expired->nr_active)
4741 schedstat_inc(rq, yld_exp_empty);
4742
4743 if (array != target) {
4744 dequeue_task(current, array);
4745 enqueue_task(current, target);
4746 } else
4747 /*
4748 * requeue_task is cheaper so perform that if possible.
4749 */
4750 requeue_task(current, array);
4751 4429
4752 /* 4430 /*
4753 * Since we are going to call schedule() anyway, there's 4431 * Since we are going to call schedule() anyway, there's
@@ -4898,6 +4576,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
4898 break; 4576 break;
4899 case SCHED_NORMAL: 4577 case SCHED_NORMAL:
4900 case SCHED_BATCH: 4578 case SCHED_BATCH:
4579 case SCHED_IDLE:
4901 ret = 0; 4580 ret = 0;
4902 break; 4581 break;
4903 } 4582 }
@@ -4922,6 +4601,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4922 break; 4601 break;
4923 case SCHED_NORMAL: 4602 case SCHED_NORMAL:
4924 case SCHED_BATCH: 4603 case SCHED_BATCH:
4604 case SCHED_IDLE:
4925 ret = 0; 4605 ret = 0;
4926 } 4606 }
4927 return ret; 4607 return ret;
@@ -4956,7 +4636,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4956 goto out_unlock; 4636 goto out_unlock;
4957 4637
4958 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4638 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4959 0 : task_timeslice(p), &t); 4639 0 : static_prio_timeslice(p->static_prio), &t);
4960 read_unlock(&tasklist_lock); 4640 read_unlock(&tasklist_lock);
4961 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4641 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4962out_nounlock: 4642out_nounlock:
@@ -4976,14 +4656,14 @@ static void show_task(struct task_struct *p)
4976 state = p->state ? __ffs(p->state) + 1 : 0; 4656 state = p->state ? __ffs(p->state) + 1 : 0;
4977 printk("%-13.13s %c", p->comm, 4657 printk("%-13.13s %c", p->comm,
4978 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4658 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4979#if (BITS_PER_LONG == 32) 4659#if BITS_PER_LONG == 32
4980 if (state == TASK_RUNNING) 4660 if (state == TASK_RUNNING)
4981 printk(" running "); 4661 printk(" running ");
4982 else 4662 else
4983 printk(" %08lX ", thread_saved_pc(p)); 4663 printk(" %08lx ", thread_saved_pc(p));
4984#else 4664#else
4985 if (state == TASK_RUNNING) 4665 if (state == TASK_RUNNING)
4986 printk(" running task "); 4666 printk(" running task ");
4987 else 4667 else
4988 printk(" %016lx ", thread_saved_pc(p)); 4668 printk(" %016lx ", thread_saved_pc(p));
4989#endif 4669#endif
@@ -4995,11 +4675,7 @@ static void show_task(struct task_struct *p)
4995 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4675 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4996 } 4676 }
4997#endif 4677#endif
4998 printk("%5lu %5d %6d", free, p->pid, p->parent->pid); 4678 printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid);
4999 if (!p->mm)
5000 printk(" (L-TLB)\n");
5001 else
5002 printk(" (NOTLB)\n");
5003 4679
5004 if (state != TASK_RUNNING) 4680 if (state != TASK_RUNNING)
5005 show_stack(p, NULL); 4681 show_stack(p, NULL);
@@ -5009,14 +4685,12 @@ void show_state_filter(unsigned long state_filter)
5009{ 4685{
5010 struct task_struct *g, *p; 4686 struct task_struct *g, *p;
5011 4687
5012#if (BITS_PER_LONG == 32) 4688#if BITS_PER_LONG == 32
5013 printk("\n" 4689 printk(KERN_INFO
5014 " free sibling\n"); 4690 " task PC stack pid father\n");
5015 printk(" task PC stack pid father child younger older\n");
5016#else 4691#else
5017 printk("\n" 4692 printk(KERN_INFO
5018 " free sibling\n"); 4693 " task PC stack pid father\n");
5019 printk(" task PC stack pid father child younger older\n");
5020#endif 4694#endif
5021 read_lock(&tasklist_lock); 4695 read_lock(&tasklist_lock);
5022 do_each_thread(g, p) { 4696 do_each_thread(g, p) {
@@ -5031,6 +4705,9 @@ void show_state_filter(unsigned long state_filter)
5031 4705
5032 touch_all_softlockup_watchdogs(); 4706 touch_all_softlockup_watchdogs();
5033 4707
4708#ifdef CONFIG_SCHED_DEBUG
4709 sysrq_sched_debug_show();
4710#endif
5034 read_unlock(&tasklist_lock); 4711 read_unlock(&tasklist_lock);
5035 /* 4712 /*
5036 * Only show locks if all tasks are dumped: 4713 * Only show locks if all tasks are dumped:
@@ -5039,6 +4716,11 @@ void show_state_filter(unsigned long state_filter)
5039 debug_show_all_locks(); 4716 debug_show_all_locks();
5040} 4717}
5041 4718
4719void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4720{
4721 idle->sched_class = &idle_sched_class;
4722}
4723
5042/** 4724/**
5043 * init_idle - set up an idle thread for a given CPU 4725 * init_idle - set up an idle thread for a given CPU
5044 * @idle: task in question 4726 * @idle: task in question
@@ -5052,13 +4734,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5052 struct rq *rq = cpu_rq(cpu); 4734 struct rq *rq = cpu_rq(cpu);
5053 unsigned long flags; 4735 unsigned long flags;
5054 4736
5055 idle->timestamp = sched_clock(); 4737 __sched_fork(idle);
5056 idle->sleep_avg = 0; 4738 idle->se.exec_start = sched_clock();
5057 idle->array = NULL; 4739
5058 idle->prio = idle->normal_prio = MAX_PRIO; 4740 idle->prio = idle->normal_prio = MAX_PRIO;
5059 idle->state = TASK_RUNNING;
5060 idle->cpus_allowed = cpumask_of_cpu(cpu); 4741 idle->cpus_allowed = cpumask_of_cpu(cpu);
5061 set_task_cpu(idle, cpu); 4742 __set_task_cpu(idle, cpu);
5062 4743
5063 spin_lock_irqsave(&rq->lock, flags); 4744 spin_lock_irqsave(&rq->lock, flags);
5064 rq->curr = rq->idle = idle; 4745 rq->curr = rq->idle = idle;
@@ -5073,6 +4754,10 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5073#else 4754#else
5074 task_thread_info(idle)->preempt_count = 0; 4755 task_thread_info(idle)->preempt_count = 0;
5075#endif 4756#endif
4757 /*
4758 * The idle tasks have their own, simple scheduling class:
4759 */
4760 idle->sched_class = &idle_sched_class;
5076} 4761}
5077 4762
5078/* 4763/*
@@ -5084,6 +4769,28 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5084 */ 4769 */
5085cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4770cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5086 4771
4772/*
4773 * Increase the granularity value when there are more CPUs,
4774 * because with more CPUs the 'effective latency' as visible
4775 * to users decreases. But the relationship is not linear,
4776 * so pick a second-best guess by going with the log2 of the
4777 * number of CPUs.
4778 *
4779 * This idea comes from the SD scheduler of Con Kolivas:
4780 */
4781static inline void sched_init_granularity(void)
4782{
4783 unsigned int factor = 1 + ilog2(num_online_cpus());
4784 const unsigned long gran_limit = 100000000;
4785
4786 sysctl_sched_granularity *= factor;
4787 if (sysctl_sched_granularity > gran_limit)
4788 sysctl_sched_granularity = gran_limit;
4789
4790 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4791 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4792}
4793
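To make the arithmetic in sched_init_granularity() above concrete: the granularity is multiplied by 1 + log2(nr_cpus) and capped at 100 msec, and the runtime limit and wakeup granularity are derived from the result. The standalone sketch below assumes the 2 msec default mentioned in the new sched_fair.c at the end of this patch; ilog2_u32() is a local stand-in for the kernel's ilog2().

#include <stdio.h>

/* floor(log2(x)) for x > 0, standing in for the kernel's ilog2() */
static unsigned int ilog2_u32(unsigned int x)
{
        unsigned int r = 0;

        while (x >>= 1)
                r++;
        return r;
}

int main(void)
{
        const unsigned long gran_limit = 100000000;     /* 100 msec cap, in ns */
        const unsigned long base_gran = 2000000;        /* assumed 2 msec default */
        unsigned int cpus;

        for (cpus = 1; cpus <= 64; cpus *= 2) {
                unsigned int factor = 1 + ilog2_u32(cpus);
                unsigned long gran = base_gran * factor;

                if (gran > gran_limit)
                        gran = gran_limit;
                printf("%2u CPUs: factor %u, granularity %9lu ns, "
                       "runtime_limit %9lu ns, wakeup %8lu ns\n",
                       cpus, factor, gran, gran * 4, gran / 2);
        }
        return 0;
}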
5087#ifdef CONFIG_SMP 4794#ifdef CONFIG_SMP
5088/* 4795/*
5089 * This is how migration works: 4796 * This is how migration works:
@@ -5157,7 +4864,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
5157static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4864static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5158{ 4865{
5159 struct rq *rq_dest, *rq_src; 4866 struct rq *rq_dest, *rq_src;
5160 int ret = 0; 4867 int ret = 0, on_rq;
5161 4868
5162 if (unlikely(cpu_is_offline(dest_cpu))) 4869 if (unlikely(cpu_is_offline(dest_cpu)))
5163 return ret; 4870 return ret;
@@ -5173,20 +4880,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5173 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4880 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5174 goto out; 4881 goto out;
5175 4882
4883 on_rq = p->se.on_rq;
4884 if (on_rq)
4885 deactivate_task(rq_src, p, 0);
5176 set_task_cpu(p, dest_cpu); 4886 set_task_cpu(p, dest_cpu);
5177 if (p->array) { 4887 if (on_rq) {
5178 /* 4888 activate_task(rq_dest, p, 0);
5179 * Sync timestamp with rq_dest's before activating. 4889 check_preempt_curr(rq_dest, p);
5180 * The same thing could be achieved by doing this step
5181 * afterwards, and pretending it was a local activate.
5182 * This way is cleaner and logically correct.
5183 */
5184 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
5185 + rq_dest->most_recent_timestamp;
5186 deactivate_task(p, rq_src);
5187 __activate_task(p, rq_dest);
5188 if (TASK_PREEMPTS_CURR(p, rq_dest))
5189 resched_task(rq_dest->curr);
5190 } 4890 }
5191 ret = 1; 4891 ret = 1;
5192out: 4892out:
@@ -5212,8 +4912,6 @@ static int migration_thread(void *data)
5212 struct migration_req *req; 4912 struct migration_req *req;
5213 struct list_head *head; 4913 struct list_head *head;
5214 4914
5215 try_to_freeze();
5216
5217 spin_lock_irq(&rq->lock); 4915 spin_lock_irq(&rq->lock);
5218 4916
5219 if (cpu_is_offline(cpu)) { 4917 if (cpu_is_offline(cpu)) {
@@ -5338,7 +5036,8 @@ static void migrate_live_tasks(int src_cpu)
5338 write_unlock_irq(&tasklist_lock); 5036 write_unlock_irq(&tasklist_lock);
5339} 5037}
5340 5038
5341/* Schedules idle task to be the next runnable task on current CPU. 5039/*
5040 * Schedules idle task to be the next runnable task on current CPU.
5342 * It does so by boosting its priority to highest possible and adding it to 5041 * It does so by boosting its priority to highest possible and adding it to
5343 * the _front_ of the runqueue. Used by CPU offline code. 5042 * the _front_ of the runqueue. Used by CPU offline code.
5344 */ 5043 */
@@ -5358,10 +5057,10 @@ void sched_idle_next(void)
5358 */ 5057 */
5359 spin_lock_irqsave(&rq->lock, flags); 5058 spin_lock_irqsave(&rq->lock, flags);
5360 5059
5361 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5060 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5362 5061
5363 /* Add idle task to the _front_ of its priority queue: */ 5062 /* Add idle task to the _front_ of its priority queue: */
5364 __activate_idle_task(p, rq); 5063 activate_idle_task(p, rq);
5365 5064
5366 spin_unlock_irqrestore(&rq->lock, flags); 5065 spin_unlock_irqrestore(&rq->lock, flags);
5367} 5066}
@@ -5411,16 +5110,15 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5411static void migrate_dead_tasks(unsigned int dead_cpu) 5110static void migrate_dead_tasks(unsigned int dead_cpu)
5412{ 5111{
5413 struct rq *rq = cpu_rq(dead_cpu); 5112 struct rq *rq = cpu_rq(dead_cpu);
5414 unsigned int arr, i; 5113 struct task_struct *next;
5415
5416 for (arr = 0; arr < 2; arr++) {
5417 for (i = 0; i < MAX_PRIO; i++) {
5418 struct list_head *list = &rq->arrays[arr].queue[i];
5419 5114
5420 while (!list_empty(list)) 5115 for ( ; ; ) {
5421 migrate_dead(dead_cpu, list_entry(list->next, 5116 if (!rq->nr_running)
5422 struct task_struct, run_list)); 5117 break;
5423 } 5118 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5119 if (!next)
5120 break;
5121 migrate_dead(dead_cpu, next);
5424 } 5122 }
5425} 5123}
5426#endif /* CONFIG_HOTPLUG_CPU */ 5124#endif /* CONFIG_HOTPLUG_CPU */
@@ -5444,14 +5142,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5444 5142
5445 case CPU_UP_PREPARE: 5143 case CPU_UP_PREPARE:
5446 case CPU_UP_PREPARE_FROZEN: 5144 case CPU_UP_PREPARE_FROZEN:
5447 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5145 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5448 if (IS_ERR(p)) 5146 if (IS_ERR(p))
5449 return NOTIFY_BAD; 5147 return NOTIFY_BAD;
5450 p->flags |= PF_NOFREEZE;
5451 kthread_bind(p, cpu); 5148 kthread_bind(p, cpu);
5452 /* Must be high prio: stop_machine expects to yield to it. */ 5149 /* Must be high prio: stop_machine expects to yield to it. */
5453 rq = task_rq_lock(p, &flags); 5150 rq = task_rq_lock(p, &flags);
5454 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5151 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5455 task_rq_unlock(rq, &flags); 5152 task_rq_unlock(rq, &flags);
5456 cpu_rq(cpu)->migration_thread = p; 5153 cpu_rq(cpu)->migration_thread = p;
5457 break; 5154 break;
@@ -5482,9 +5179,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5482 rq->migration_thread = NULL; 5179 rq->migration_thread = NULL;
5483 /* Idle task back to normal (off runqueue, low prio) */ 5180 /* Idle task back to normal (off runqueue, low prio) */
5484 rq = task_rq_lock(rq->idle, &flags); 5181 rq = task_rq_lock(rq->idle, &flags);
5485 deactivate_task(rq->idle, rq); 5182 deactivate_task(rq, rq->idle, 0);
5486 rq->idle->static_prio = MAX_PRIO; 5183 rq->idle->static_prio = MAX_PRIO;
5487 __setscheduler(rq->idle, SCHED_NORMAL, 0); 5184 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5185 rq->idle->sched_class = &idle_sched_class;
5488 migrate_dead_tasks(cpu); 5186 migrate_dead_tasks(cpu);
5489 task_rq_unlock(rq, &flags); 5187 task_rq_unlock(rq, &flags);
5490 migrate_nr_uninterruptible(rq); 5188 migrate_nr_uninterruptible(rq);
@@ -5793,483 +5491,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5793 5491
5794#define SD_NODES_PER_DOMAIN 16 5492#define SD_NODES_PER_DOMAIN 16
5795 5493
5796/*
5797 * Self-tuning task migration cost measurement between source and target CPUs.
5798 *
5799 * This is done by measuring the cost of manipulating buffers of varying
5800 * sizes. For a given buffer-size here are the steps that are taken:
5801 *
5802 * 1) the source CPU reads+dirties a shared buffer
5803 * 2) the target CPU reads+dirties the same shared buffer
5804 *
5805 * We measure how long they take, in the following 4 scenarios:
5806 *
5807 * - source: CPU1, target: CPU2 | cost1
5808 * - source: CPU2, target: CPU1 | cost2
5809 * - source: CPU1, target: CPU1 | cost3
5810 * - source: CPU2, target: CPU2 | cost4
5811 *
5812 * We then calculate the cost3+cost4-cost1-cost2 difference - this is
5813 * the cost of migration.
5814 *
5815 * We then start off from a small buffer-size and iterate up to larger
5816 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5817 * doing a maximum search for the cost. (The maximum cost for a migration
5818 * normally occurs when the working set size is around the effective cache
5819 * size.)
5820 */
5821#define SEARCH_SCOPE 2
5822#define MIN_CACHE_SIZE (64*1024U)
5823#define DEFAULT_CACHE_SIZE (5*1024*1024U)
5824#define ITERATIONS 1
5825#define SIZE_THRESH 130
5826#define COST_THRESH 130
5827
5828/*
5829 * The migration cost is a function of 'domain distance'. Domain
5830 * distance is the number of steps a CPU has to iterate down its
5831 * domain tree to share a domain with the other CPU. The farther
5832 * two CPUs are from each other, the larger the distance gets.
5833 *
5834 * Note that we use the distance only to cache measurement results,
5835 * the distance value is not used numerically otherwise. When two
5836 * CPUs have the same distance it is assumed that the migration
5837 * cost is the same. (this is a simplification but quite practical)
5838 */
5839#define MAX_DOMAIN_DISTANCE 32
5840
5841static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5842 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5843/*
5844 * Architectures may override the migration cost and thus avoid
5845 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5846 * virtualized hardware:
5847 */
5848#ifdef CONFIG_DEFAULT_MIGRATION_COST
5849 CONFIG_DEFAULT_MIGRATION_COST
5850#else
5851 -1LL
5852#endif
5853};
5854
5855/*
5856 * Allow override of migration cost - in units of microseconds.
5857 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5858 * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
5859 */
5860static int __init migration_cost_setup(char *str)
5861{
5862 int ints[MAX_DOMAIN_DISTANCE+1], i;
5863
5864 str = get_options(str, ARRAY_SIZE(ints), ints);
5865
5866 printk("#ints: %d\n", ints[0]);
5867 for (i = 1; i <= ints[0]; i++) {
5868 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5869 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5870 }
5871 return 1;
5872}
5873
5874__setup ("migration_cost=", migration_cost_setup);
5875
5876/*
5877 * Global multiplier (divisor) for migration-cutoff values,
5878 * in percentiles. E.g. use a value of 150 to get 1.5 times
5879 * longer cache-hot cutoff times.
5880 *
5881 * (We scale it from 100 to 128 to long long handling easier.)
5882 */
5883
5884#define MIGRATION_FACTOR_SCALE 128
5885
5886static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5887
5888static int __init setup_migration_factor(char *str)
5889{
5890 get_option(&str, &migration_factor);
5891 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5892 return 1;
5893}
5894
5895__setup("migration_factor=", setup_migration_factor);
5896
5897/*
5898 * Estimated distance of two CPUs, measured via the number of domains
5899 * we have to pass for the two CPUs to be in the same span:
5900 */
5901static unsigned long domain_distance(int cpu1, int cpu2)
5902{
5903 unsigned long distance = 0;
5904 struct sched_domain *sd;
5905
5906 for_each_domain(cpu1, sd) {
5907 WARN_ON(!cpu_isset(cpu1, sd->span));
5908 if (cpu_isset(cpu2, sd->span))
5909 return distance;
5910 distance++;
5911 }
5912 if (distance >= MAX_DOMAIN_DISTANCE) {
5913 WARN_ON(1);
5914 distance = MAX_DOMAIN_DISTANCE-1;
5915 }
5916
5917 return distance;
5918}
5919
5920static unsigned int migration_debug;
5921
5922static int __init setup_migration_debug(char *str)
5923{
5924 get_option(&str, &migration_debug);
5925 return 1;
5926}
5927
5928__setup("migration_debug=", setup_migration_debug);
5929
5930/*
5931 * Maximum cache-size that the scheduler should try to measure.
5932 * Architectures with larger caches should tune this up during
5933 * bootup. Gets used in the domain-setup code (i.e. during SMP
5934 * bootup).
5935 */
5936unsigned int max_cache_size;
5937
5938static int __init setup_max_cache_size(char *str)
5939{
5940 get_option(&str, &max_cache_size);
5941 return 1;
5942}
5943
5944__setup("max_cache_size=", setup_max_cache_size);
5945
5946/*
5947 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5948 * is the operation that is timed, so we try to generate unpredictable
5949 * cachemisses that still end up filling the L2 cache:
5950 */
5951static void touch_cache(void *__cache, unsigned long __size)
5952{
5953 unsigned long size = __size / sizeof(long);
5954 unsigned long chunk1 = size / 3;
5955 unsigned long chunk2 = 2 * size / 3;
5956 unsigned long *cache = __cache;
5957 int i;
5958
5959 for (i = 0; i < size/6; i += 8) {
5960 switch (i % 6) {
5961 case 0: cache[i]++;
5962 case 1: cache[size-1-i]++;
5963 case 2: cache[chunk1-i]++;
5964 case 3: cache[chunk1+i]++;
5965 case 4: cache[chunk2-i]++;
5966 case 5: cache[chunk2+i]++;
5967 }
5968 }
5969}
5970
5971/*
5972 * Measure the cache-cost of one task migration. Returns in units of nsec.
5973 */
5974static unsigned long long
5975measure_one(void *cache, unsigned long size, int source, int target)
5976{
5977 cpumask_t mask, saved_mask;
5978 unsigned long long t0, t1, t2, t3, cost;
5979
5980 saved_mask = current->cpus_allowed;
5981
5982 /*
5983 * Flush source caches to RAM and invalidate them:
5984 */
5985 sched_cacheflush();
5986
5987 /*
5988 * Migrate to the source CPU:
5989 */
5990 mask = cpumask_of_cpu(source);
5991 set_cpus_allowed(current, mask);
5992 WARN_ON(smp_processor_id() != source);
5993
5994 /*
5995 * Dirty the working set:
5996 */
5997 t0 = sched_clock();
5998 touch_cache(cache, size);
5999 t1 = sched_clock();
6000
6001 /*
6002 * Migrate to the target CPU, dirty the L2 cache and access
6003 * the shared buffer. (which represents the working set
6004 * of a migrated task.)
6005 */
6006 mask = cpumask_of_cpu(target);
6007 set_cpus_allowed(current, mask);
6008 WARN_ON(smp_processor_id() != target);
6009
6010 t2 = sched_clock();
6011 touch_cache(cache, size);
6012 t3 = sched_clock();
6013
6014 cost = t1-t0 + t3-t2;
6015
6016 if (migration_debug >= 2)
6017 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
6018 source, target, t1-t0, t1-t0, t3-t2, cost);
6019 /*
6020 * Flush target caches to RAM and invalidate them:
6021 */
6022 sched_cacheflush();
6023
6024 set_cpus_allowed(current, saved_mask);
6025
6026 return cost;
6027}
6028
6029/*
6030 * Measure a series of task migrations and return the average
6031 * result. Since this code runs early during bootup the system
6032 * is 'undisturbed' and the average latency makes sense.
6033 *
6034 * The algorithm in essence auto-detects the relevant cache-size,
6035 * so it will properly detect different cachesizes for different
6036 * cache-hierarchies, depending on how the CPUs are connected.
6037 *
6038 * Architectures can prime the upper limit of the search range via
6039 * max_cache_size, otherwise the search range defaults to 20MB...64K.
6040 */
6041static unsigned long long
6042measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
6043{
6044 unsigned long long cost1, cost2;
6045 int i;
6046
6047 /*
6048 * Measure the migration cost of 'size' bytes, over an
6049 * average of 10 runs:
6050 *
6051 * (We perturb the cache size by a small (0..4k)
6052 * value to compensate size/alignment related artifacts.
6053 * We also subtract the cost of the operation done on
6054 * the same CPU.)
6055 */
6056 cost1 = 0;
6057
6058 /*
6059 * dry run, to make sure we start off cache-cold on cpu1,
6060 * and to get any vmalloc pagefaults in advance:
6061 */
6062 measure_one(cache, size, cpu1, cpu2);
6063 for (i = 0; i < ITERATIONS; i++)
6064 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
6065
6066 measure_one(cache, size, cpu2, cpu1);
6067 for (i = 0; i < ITERATIONS; i++)
6068 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
6069
6070 /*
6071 * (We measure the non-migrating [cached] cost on both
6072 * cpu1 and cpu2, to handle CPUs with different speeds)
6073 */
6074 cost2 = 0;
6075
6076 measure_one(cache, size, cpu1, cpu1);
6077 for (i = 0; i < ITERATIONS; i++)
6078 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
6079
6080 measure_one(cache, size, cpu2, cpu2);
6081 for (i = 0; i < ITERATIONS; i++)
6082 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
6083
6084 /*
6085 * Get the per-iteration migration cost:
6086 */
6087 do_div(cost1, 2 * ITERATIONS);
6088 do_div(cost2, 2 * ITERATIONS);
6089
6090 return cost1 - cost2;
6091}
6092
6093static unsigned long long measure_migration_cost(int cpu1, int cpu2)
6094{
6095 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
6096 unsigned int max_size, size, size_found = 0;
6097 long long cost = 0, prev_cost;
6098 void *cache;
6099
6100 /*
6101 * Search from max_cache_size*5 down to 64K - the real relevant
6102 * cachesize has to lie somewhere inbetween.
6103 */
6104 if (max_cache_size) {
6105 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
6106 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
6107 } else {
6108 /*
6109 * Since we have no estimation about the relevant
6110 * search range
6111 */
6112 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
6113 size = MIN_CACHE_SIZE;
6114 }
6115
6116 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
6117 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
6118 return 0;
6119 }
6120
6121 /*
6122 * Allocate the working set:
6123 */
6124 cache = vmalloc(max_size);
6125 if (!cache) {
6126 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
6127 return 1000000; /* return 1 msec on very small boxen */
6128 }
6129
6130 while (size <= max_size) {
6131 prev_cost = cost;
6132 cost = measure_cost(cpu1, cpu2, cache, size);
6133
6134 /*
6135 * Update the max:
6136 */
6137 if (cost > 0) {
6138 if (max_cost < cost) {
6139 max_cost = cost;
6140 size_found = size;
6141 }
6142 }
6143 /*
6144 * Calculate average fluctuation, we use this to prevent
6145 * noise from triggering an early break out of the loop:
6146 */
6147 fluct = abs(cost - prev_cost);
6148 avg_fluct = (avg_fluct + fluct)/2;
6149
6150 if (migration_debug)
6151 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6152 "(%8Ld %8Ld)\n",
6153 cpu1, cpu2, size,
6154 (long)cost / 1000000,
6155 ((long)cost / 100000) % 10,
6156 (long)max_cost / 1000000,
6157 ((long)max_cost / 100000) % 10,
6158 domain_distance(cpu1, cpu2),
6159 cost, avg_fluct);
6160
6161 /*
6162 * If we iterated at least 20% past the previous maximum,
6163 * and the cost has dropped by more than 20% already,
6164 * (taking fluctuations into account) then we assume to
6165 * have found the maximum and break out of the loop early:
6166 */
6167 if (size_found && (size*100 > size_found*SIZE_THRESH))
6168 if (cost+avg_fluct <= 0 ||
6169 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
6170
6171 if (migration_debug)
6172 printk("-> found max.\n");
6173 break;
6174 }
6175 /*
6176 * Increase the cachesize in 10% steps:
6177 */
6178 size = size * 10 / 9;
6179 }
6180
6181 if (migration_debug)
6182 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
6183 cpu1, cpu2, size_found, max_cost);
6184
6185 vfree(cache);
6186
6187 /*
6188 * A task is considered 'cache cold' if at least 2 times
6189 * the worst-case cost of migration has passed.
6190 *
6191 * (this limit is only listened to if the load-balancing
6192 * situation is 'nice' - if there is a large imbalance we
6193 * ignore it for the sake of CPU utilization and
6194 * processing fairness.)
6195 */
6196 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
6197}
6198
6199static void calibrate_migration_costs(const cpumask_t *cpu_map)
6200{
6201 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
6202 unsigned long j0, j1, distance, max_distance = 0;
6203 struct sched_domain *sd;
6204
6205 j0 = jiffies;
6206
6207 /*
6208 * First pass - calculate the cacheflush times:
6209 */
6210 for_each_cpu_mask(cpu1, *cpu_map) {
6211 for_each_cpu_mask(cpu2, *cpu_map) {
6212 if (cpu1 == cpu2)
6213 continue;
6214 distance = domain_distance(cpu1, cpu2);
6215 max_distance = max(max_distance, distance);
6216 /*
6217 * No result cached yet?
6218 */
6219 if (migration_cost[distance] == -1LL)
6220 migration_cost[distance] =
6221 measure_migration_cost(cpu1, cpu2);
6222 }
6223 }
6224 /*
6225 * Second pass - update the sched domain hierarchy with
6226 * the new cache-hot-time estimations:
6227 */
6228 for_each_cpu_mask(cpu, *cpu_map) {
6229 distance = 0;
6230 for_each_domain(cpu, sd) {
6231 sd->cache_hot_time = migration_cost[distance];
6232 distance++;
6233 }
6234 }
6235 /*
6236 * Print the matrix:
6237 */
6238 if (migration_debug)
6239 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
6240 max_cache_size,
6241#ifdef CONFIG_X86
6242 cpu_khz/1000
6243#else
6244 -1
6245#endif
6246 );
6247 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6248 printk("migration_cost=");
6249 for (distance = 0; distance <= max_distance; distance++) {
6250 if (distance)
6251 printk(",");
6252 printk("%ld", (long)migration_cost[distance] / 1000);
6253 }
6254 printk("\n");
6255 }
6256 j1 = jiffies;
6257 if (migration_debug)
6258 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6259
6260 /*
6261 * Move back to the original CPU. NUMA-Q gets confused
6262 * if we migrate to another quad during bootup.
6263 */
6264 if (raw_smp_processor_id() != orig_cpu) {
6265 cpumask_t mask = cpumask_of_cpu(orig_cpu),
6266 saved_mask = current->cpus_allowed;
6267
6268 set_cpus_allowed(current, mask);
6269 set_cpus_allowed(current, saved_mask);
6270 }
6271}
6272
6273#ifdef CONFIG_NUMA 5494#ifdef CONFIG_NUMA
6274 5495
6275/** 5496/**
@@ -6570,7 +5791,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6570static int build_sched_domains(const cpumask_t *cpu_map) 5791static int build_sched_domains(const cpumask_t *cpu_map)
6571{ 5792{
6572 int i; 5793 int i;
6573 struct sched_domain *sd;
6574#ifdef CONFIG_NUMA 5794#ifdef CONFIG_NUMA
6575 struct sched_group **sched_group_nodes = NULL; 5795 struct sched_group **sched_group_nodes = NULL;
6576 int sd_allnodes = 0; 5796 int sd_allnodes = 0;
@@ -6578,7 +5798,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6578 /* 5798 /*
6579 * Allocate the per-node list of sched groups 5799 * Allocate the per-node list of sched groups
6580 */ 5800 */
6581 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 5801 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
6582 GFP_KERNEL); 5802 GFP_KERNEL);
6583 if (!sched_group_nodes) { 5803 if (!sched_group_nodes) {
6584 printk(KERN_WARNING "Can not alloc sched group node list\n"); 5804 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6597,8 +5817,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6597 cpus_and(nodemask, nodemask, *cpu_map); 5817 cpus_and(nodemask, nodemask, *cpu_map);
6598 5818
6599#ifdef CONFIG_NUMA 5819#ifdef CONFIG_NUMA
6600 if (cpus_weight(*cpu_map) 5820 if (cpus_weight(*cpu_map) >
6601 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 5821 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6602 sd = &per_cpu(allnodes_domains, i); 5822 sd = &per_cpu(allnodes_domains, i);
6603 *sd = SD_ALLNODES_INIT; 5823 *sd = SD_ALLNODES_INIT;
6604 sd->span = *cpu_map; 5824 sd->span = *cpu_map;
@@ -6657,7 +5877,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6657 if (i != first_cpu(this_sibling_map)) 5877 if (i != first_cpu(this_sibling_map))
6658 continue; 5878 continue;
6659 5879
6660 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); 5880 init_sched_build_groups(this_sibling_map, cpu_map,
5881 &cpu_to_cpu_group);
6661 } 5882 }
6662#endif 5883#endif
6663 5884
@@ -6668,11 +5889,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6668 cpus_and(this_core_map, this_core_map, *cpu_map); 5889 cpus_and(this_core_map, this_core_map, *cpu_map);
6669 if (i != first_cpu(this_core_map)) 5890 if (i != first_cpu(this_core_map))
6670 continue; 5891 continue;
6671 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); 5892 init_sched_build_groups(this_core_map, cpu_map,
5893 &cpu_to_core_group);
6672 } 5894 }
6673#endif 5895#endif
6674 5896
6675
6676 /* Set up physical groups */ 5897 /* Set up physical groups */
6677 for (i = 0; i < MAX_NUMNODES; i++) { 5898 for (i = 0; i < MAX_NUMNODES; i++) {
6678 cpumask_t nodemask = node_to_cpumask(i); 5899 cpumask_t nodemask = node_to_cpumask(i);
@@ -6687,7 +5908,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6687#ifdef CONFIG_NUMA 5908#ifdef CONFIG_NUMA
6688 /* Set up node groups */ 5909 /* Set up node groups */
6689 if (sd_allnodes) 5910 if (sd_allnodes)
6690 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); 5911 init_sched_build_groups(*cpu_map, cpu_map,
5912 &cpu_to_allnodes_group);
6691 5913
6692 for (i = 0; i < MAX_NUMNODES; i++) { 5914 for (i = 0; i < MAX_NUMNODES; i++) {
6693 /* Set up node groups */ 5915 /* Set up node groups */
@@ -6715,6 +5937,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6715 sched_group_nodes[i] = sg; 5937 sched_group_nodes[i] = sg;
6716 for_each_cpu_mask(j, nodemask) { 5938 for_each_cpu_mask(j, nodemask) {
6717 struct sched_domain *sd; 5939 struct sched_domain *sd;
5940
6718 sd = &per_cpu(node_domains, j); 5941 sd = &per_cpu(node_domains, j);
6719 sd->groups = sg; 5942 sd->groups = sg;
6720 } 5943 }
@@ -6759,19 +5982,22 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6759 /* Calculate CPU power for physical packages and nodes */ 5982 /* Calculate CPU power for physical packages and nodes */
6760#ifdef CONFIG_SCHED_SMT 5983#ifdef CONFIG_SCHED_SMT
6761 for_each_cpu_mask(i, *cpu_map) { 5984 for_each_cpu_mask(i, *cpu_map) {
6762 sd = &per_cpu(cpu_domains, i); 5985 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5986
6763 init_sched_groups_power(i, sd); 5987 init_sched_groups_power(i, sd);
6764 } 5988 }
6765#endif 5989#endif
6766#ifdef CONFIG_SCHED_MC 5990#ifdef CONFIG_SCHED_MC
6767 for_each_cpu_mask(i, *cpu_map) { 5991 for_each_cpu_mask(i, *cpu_map) {
6768 sd = &per_cpu(core_domains, i); 5992 struct sched_domain *sd = &per_cpu(core_domains, i);
5993
6769 init_sched_groups_power(i, sd); 5994 init_sched_groups_power(i, sd);
6770 } 5995 }
6771#endif 5996#endif
6772 5997
6773 for_each_cpu_mask(i, *cpu_map) { 5998 for_each_cpu_mask(i, *cpu_map) {
6774 sd = &per_cpu(phys_domains, i); 5999 struct sched_domain *sd = &per_cpu(phys_domains, i);
6000
6775 init_sched_groups_power(i, sd); 6001 init_sched_groups_power(i, sd);
6776 } 6002 }
6777 6003
@@ -6799,10 +6025,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6799#endif 6025#endif
6800 cpu_attach_domain(sd, i); 6026 cpu_attach_domain(sd, i);
6801 } 6027 }
6802 /*
6803 * Tune cache-hot values:
6804 */
6805 calibrate_migration_costs(cpu_map);
6806 6028
6807 return 0; 6029 return 0;
6808 6030
@@ -7009,10 +6231,12 @@ void __init sched_init_smp(void)
7009 /* Move init over to a non-isolated CPU */ 6231 /* Move init over to a non-isolated CPU */
7010 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6232 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
7011 BUG(); 6233 BUG();
6234 sched_init_granularity();
7012} 6235}
7013#else 6236#else
7014void __init sched_init_smp(void) 6237void __init sched_init_smp(void)
7015{ 6238{
6239 sched_init_granularity();
7016} 6240}
7017#endif /* CONFIG_SMP */ 6241#endif /* CONFIG_SMP */
7018 6242
@@ -7026,28 +6250,51 @@ int in_sched_functions(unsigned long addr)
7026 && addr < (unsigned long)__sched_text_end); 6250 && addr < (unsigned long)__sched_text_end);
7027} 6251}
7028 6252
6253static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6254{
6255 cfs_rq->tasks_timeline = RB_ROOT;
6256 cfs_rq->fair_clock = 1;
6257#ifdef CONFIG_FAIR_GROUP_SCHED
6258 cfs_rq->rq = rq;
6259#endif
6260}
6261
7029void __init sched_init(void) 6262void __init sched_init(void)
7030{ 6263{
7031 int i, j, k; 6264 u64 now = sched_clock();
7032 int highest_cpu = 0; 6265 int highest_cpu = 0;
6266 int i, j;
6267
6268 /*
6269 * Link up the scheduling class hierarchy:
6270 */
6271 rt_sched_class.next = &fair_sched_class;
6272 fair_sched_class.next = &idle_sched_class;
6273 idle_sched_class.next = NULL;
7033 6274
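These three assignments are what make the rest of the patch class-driven: pick_next_task(), used in schedule() and migrate_dead_tasks() above, can walk rt -> fair -> idle and take the first class that has runnable work. The userspace sketch below illustrates that idea only; the structures and callbacks are invented and are not the kernel's actual sched_class interface.

#include <stdio.h>

struct toy_class {
        const char *name;
        const struct toy_class *next;
        int (*has_task)(void);          /* does this class have runnable work? */
};

static int rt_has_task(void)   { return 0; }    /* pretend: no RT tasks queued */
static int fair_has_task(void) { return 1; }    /* pretend: one CFS task queued */
static int idle_has_task(void) { return 1; }    /* the idle task always exists */

static const struct toy_class idle_class = { "idle", NULL,        idle_has_task };
static const struct toy_class fair_class = { "fair", &idle_class, fair_has_task };
static const struct toy_class rt_class   = { "rt",   &fair_class, rt_has_task };

/* walk the hierarchy highest class first, the way pick_next_task() does */
static const struct toy_class *pick_class(void)
{
        const struct toy_class *class;

        for (class = &rt_class; class; class = class->next)
                if (class->has_task())
                        return class;
        return NULL;    /* unreachable: the idle class always reports a task */
}

int main(void)
{
        printf("next task comes from the '%s' class\n", pick_class()->name);
        return 0;
}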
7034 for_each_possible_cpu(i) { 6275 for_each_possible_cpu(i) {
7035 struct prio_array *array; 6276 struct rt_prio_array *array;
7036 struct rq *rq; 6277 struct rq *rq;
7037 6278
7038 rq = cpu_rq(i); 6279 rq = cpu_rq(i);
7039 spin_lock_init(&rq->lock); 6280 spin_lock_init(&rq->lock);
7040 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 6281 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7041 rq->nr_running = 0; 6282 rq->nr_running = 0;
7042 rq->active = rq->arrays; 6283 rq->clock = 1;
7043 rq->expired = rq->arrays + 1; 6284 init_cfs_rq(&rq->cfs, rq);
7044 rq->best_expired_prio = MAX_PRIO; 6285#ifdef CONFIG_FAIR_GROUP_SCHED
6286 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6287 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6288#endif
6289 rq->ls.load_update_last = now;
6290 rq->ls.load_update_start = now;
7045 6291
6292 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6293 rq->cpu_load[j] = 0;
7046#ifdef CONFIG_SMP 6294#ifdef CONFIG_SMP
7047 rq->sd = NULL; 6295 rq->sd = NULL;
7048 for (j = 1; j < 3; j++)
7049 rq->cpu_load[j] = 0;
7050 rq->active_balance = 0; 6296 rq->active_balance = 0;
6297 rq->next_balance = jiffies;
7051 rq->push_cpu = 0; 6298 rq->push_cpu = 0;
7052 rq->cpu = i; 6299 rq->cpu = i;
7053 rq->migration_thread = NULL; 6300 rq->migration_thread = NULL;
@@ -7055,16 +6302,14 @@ void __init sched_init(void)
7055#endif 6302#endif
7056 atomic_set(&rq->nr_iowait, 0); 6303 atomic_set(&rq->nr_iowait, 0);
7057 6304
7058 for (j = 0; j < 2; j++) { 6305 array = &rq->rt.active;
7059 array = rq->arrays + j; 6306 for (j = 0; j < MAX_RT_PRIO; j++) {
7060 for (k = 0; k < MAX_PRIO; k++) { 6307 INIT_LIST_HEAD(array->queue + j);
7061 INIT_LIST_HEAD(array->queue + k); 6308 __clear_bit(j, array->bitmap);
7062 __clear_bit(k, array->bitmap);
7063 }
7064 // delimiter for bitsearch
7065 __set_bit(MAX_PRIO, array->bitmap);
7066 } 6309 }
7067 highest_cpu = i; 6310 highest_cpu = i;
6311 /* delimiter for bitsearch: */
6312 __set_bit(MAX_RT_PRIO, array->bitmap);
7068 } 6313 }
7069 6314
7070 set_load_weight(&init_task); 6315 set_load_weight(&init_task);
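The extra __set_bit(MAX_RT_PRIO, array->bitmap) just above plays the same role the old MAX_PRIO delimiter did: with a sentinel bit that is always set, the scheduler's first-set-bit search over the RT priority bitmap is guaranteed to terminate, and finding the sentinel simply means "no RT task queued". A small standalone demonstration, assuming MAX_RT_PRIO == 100 purely for the example:

#include <stdio.h>
#include <string.h>

#define MAX_RT_PRIO     100     /* assumed value, for the demo only */
#define BITS_PER_LONG   (8 * (int)sizeof(unsigned long))
#define BITMAP_LONGS    ((MAX_RT_PRIO + 1 + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long bitmap[BITMAP_LONGS];

static void set_prio_bit(int i)
{
        bitmap[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);
}

static void clear_prio_bit(int i)
{
        bitmap[i / BITS_PER_LONG] &= ~(1UL << (i % BITS_PER_LONG));
}

static int find_first_set(void)
{
        int i;

        for (i = 0; i <= MAX_RT_PRIO; i++)
                if (bitmap[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
                        return i;
        return -1;      /* never reached once the delimiter bit is set */
}

int main(void)
{
        memset(bitmap, 0, sizeof(bitmap));
        set_prio_bit(MAX_RT_PRIO);              /* the delimiter */

        printf("empty:       first bit %d (== MAX_RT_PRIO, i.e. nothing queued)\n",
               find_first_set());
        set_prio_bit(42);                       /* queue an RT task at prio 42 */
        printf("one task:    first bit %d\n", find_first_set());
        clear_prio_bit(42);
        printf("empty again: first bit %d\n", find_first_set());
        return 0;
}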
@@ -7091,6 +6336,10 @@ void __init sched_init(void)
7091 * when this runqueue becomes "idle". 6336 * when this runqueue becomes "idle".
7092 */ 6337 */
7093 init_idle(current, smp_processor_id()); 6338 init_idle(current, smp_processor_id());
6339 /*
6340 * During early bootup we pretend to be a normal task:
6341 */
6342 current->sched_class = &fair_sched_class;
7094} 6343}
7095 6344
7096#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6345#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7121,29 +6370,55 @@ EXPORT_SYMBOL(__might_sleep);
7121#ifdef CONFIG_MAGIC_SYSRQ 6370#ifdef CONFIG_MAGIC_SYSRQ
7122void normalize_rt_tasks(void) 6371void normalize_rt_tasks(void)
7123{ 6372{
7124 struct prio_array *array;
7125 struct task_struct *g, *p; 6373 struct task_struct *g, *p;
7126 unsigned long flags; 6374 unsigned long flags;
7127 struct rq *rq; 6375 struct rq *rq;
6376 int on_rq;
7128 6377
7129 read_lock_irq(&tasklist_lock); 6378 read_lock_irq(&tasklist_lock);
7130
7131 do_each_thread(g, p) { 6379 do_each_thread(g, p) {
7132 if (!rt_task(p)) 6380 p->se.fair_key = 0;
6381 p->se.wait_runtime = 0;
6382 p->se.wait_start_fair = 0;
6383 p->se.wait_start = 0;
6384 p->se.exec_start = 0;
6385 p->se.sleep_start = 0;
6386 p->se.sleep_start_fair = 0;
6387 p->se.block_start = 0;
6388 task_rq(p)->cfs.fair_clock = 0;
6389 task_rq(p)->clock = 0;
6390
6391 if (!rt_task(p)) {
6392 /*
6393 * Renice negative nice level userspace
6394 * tasks back to 0:
6395 */
6396 if (TASK_NICE(p) < 0 && p->mm)
6397 set_user_nice(p, 0);
7133 continue; 6398 continue;
6399 }
7134 6400
7135 spin_lock_irqsave(&p->pi_lock, flags); 6401 spin_lock_irqsave(&p->pi_lock, flags);
7136 rq = __task_rq_lock(p); 6402 rq = __task_rq_lock(p);
6403#ifdef CONFIG_SMP
6404 /*
6405 * Do not touch the migration thread:
6406 */
6407 if (p == rq->migration_thread)
6408 goto out_unlock;
6409#endif
7137 6410
7138 array = p->array; 6411 on_rq = p->se.on_rq;
7139 if (array) 6412 if (on_rq)
7140 deactivate_task(p, task_rq(p)); 6413 deactivate_task(task_rq(p), p, 0);
7141 __setscheduler(p, SCHED_NORMAL, 0); 6414 __setscheduler(rq, p, SCHED_NORMAL, 0);
7142 if (array) { 6415 if (on_rq) {
7143 __activate_task(p, task_rq(p)); 6416 activate_task(task_rq(p), p, 0);
7144 resched_task(rq->curr); 6417 resched_task(rq->curr);
7145 } 6418 }
7146 6419#ifdef CONFIG_SMP
6420 out_unlock:
6421#endif
7147 __task_rq_unlock(rq); 6422 __task_rq_unlock(rq);
7148 spin_unlock_irqrestore(&p->pi_lock, flags); 6423 spin_unlock_irqrestore(&p->pi_lock, flags);
7149 } while_each_thread(g, p); 6424 } while_each_thread(g, p);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
new file mode 100644
index 0000000000..29f2c21e7d
--- /dev/null
+++ b/kernel/sched_debug.c
@@ -0,0 +1,275 @@
1/*
 2 * kernel/sched_debug.c
3 *
4 * Print the CFS rbtree
5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched.h>
15#include <linux/seq_file.h>
16#include <linux/kallsyms.h>
17#include <linux/utsname.h>
18
19/*
20 * This allows printing both to /proc/sched_debug and
21 * to the console
22 */
23#define SEQ_printf(m, x...) \
24 do { \
25 if (m) \
26 seq_printf(m, x); \
27 else \
28 printk(x); \
29 } while (0)
30
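SEQ_printf() is the whole trick that lets the same print routines feed both /proc/sched_debug and the sysrq console dump: given a seq_file the output goes through seq_printf(), given NULL it falls back to printk(). A userspace analogue of the same pattern, not part of the patch (FILE * in place of struct seq_file *, stdout in place of printk, and a made-up output path):

#include <stdio.h>

/* same shape as SEQ_printf(): one macro, two sinks */
#define DUAL_printf(m, x...)            \
        do {                            \
                if (m)                  \
                        fprintf(m, x);  \
                else                    \
                        printf(x);      \
        } while (0)

int main(void)
{
        FILE *f = fopen("/tmp/sched_debug_demo.txt", "w");  /* hypothetical path */

        DUAL_printf(f, "written to the file: %d\n", 1);     /* seq_file case */
        DUAL_printf(NULL, "written to stdout: %d\n", 2);    /* printk case */
        if (f)
                fclose(f);
        return 0;
}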
31static void
32print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now)
33{
34 if (rq->curr == p)
35 SEQ_printf(m, "R");
36 else
37 SEQ_printf(m, " ");
38
39 SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d "
40 "%15Ld %15Ld %15Ld %15Ld %15Ld\n",
41 p->comm, p->pid,
42 (long long)p->se.fair_key,
43 (long long)(p->se.fair_key - rq->cfs.fair_clock),
44 (long long)p->se.wait_runtime,
45 (long long)(p->nvcsw + p->nivcsw),
46 p->prio,
47 (long long)p->se.sum_exec_runtime,
48 (long long)p->se.sum_wait_runtime,
49 (long long)p->se.sum_sleep_runtime,
50 (long long)p->se.wait_runtime_overruns,
51 (long long)p->se.wait_runtime_underruns);
52}
53
54static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now)
55{
56 struct task_struct *g, *p;
57
58 SEQ_printf(m,
59 "\nrunnable tasks:\n"
60 " task PID tree-key delta waiting"
61 " switches prio"
62 " sum-exec sum-wait sum-sleep"
63 " wait-overrun wait-underrun\n"
64 "------------------------------------------------------------------"
65 "----------------"
66 "------------------------------------------------"
67 "--------------------------------\n");
68
69 read_lock_irq(&tasklist_lock);
70
71 do_each_thread(g, p) {
72 if (!p->se.on_rq || task_cpu(p) != rq_cpu)
73 continue;
74
75 print_task(m, rq, p, now);
76 } while_each_thread(g, p);
77
78 read_unlock_irq(&tasklist_lock);
79}
80
81static void
82print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
83{
84 s64 wait_runtime_rq_sum = 0;
85 struct task_struct *p;
86 struct rb_node *curr;
87 unsigned long flags;
88 struct rq *rq = &per_cpu(runqueues, cpu);
89
90 spin_lock_irqsave(&rq->lock, flags);
91 curr = first_fair(cfs_rq);
92 while (curr) {
93 p = rb_entry(curr, struct task_struct, se.run_node);
94 wait_runtime_rq_sum += p->se.wait_runtime;
95
96 curr = rb_next(curr);
97 }
98 spin_unlock_irqrestore(&rq->lock, flags);
99
100 SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum",
101 (long long)wait_runtime_rq_sum);
102}
103
104void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
105{
106 SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq);
107
108#define P(x) \
109 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x))
110
111 P(fair_clock);
112 P(exec_clock);
113 P(wait_runtime);
114 P(wait_runtime_overruns);
115 P(wait_runtime_underruns);
116 P(sleeper_bonus);
117#undef P
118
119 print_cfs_rq_runtime_sum(m, cpu, cfs_rq);
120}
121
122static void print_cpu(struct seq_file *m, int cpu, u64 now)
123{
124 struct rq *rq = &per_cpu(runqueues, cpu);
125
126#ifdef CONFIG_X86
127 {
128 unsigned int freq = cpu_khz ? : 1;
129
130 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
131 cpu, freq / 1000, (freq % 1000));
132 }
133#else
134 SEQ_printf(m, "\ncpu#%d\n", cpu);
135#endif
136
137#define P(x) \
138 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
139
140 P(nr_running);
141 SEQ_printf(m, " .%-30s: %lu\n", "load",
142 rq->ls.load.weight);
143 P(ls.delta_fair);
144 P(ls.delta_exec);
145 P(nr_switches);
146 P(nr_load_updates);
147 P(nr_uninterruptible);
148 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
149 P(next_balance);
150 P(curr->pid);
151 P(clock);
152 P(prev_clock_raw);
153 P(clock_warps);
154 P(clock_overflows);
155 P(clock_unstable_events);
156 P(clock_max_delta);
157 P(cpu_load[0]);
158 P(cpu_load[1]);
159 P(cpu_load[2]);
160 P(cpu_load[3]);
161 P(cpu_load[4]);
162#undef P
163
164 print_cfs_stats(m, cpu, now);
165
166 print_rq(m, rq, cpu, now);
167}
168
169static int sched_debug_show(struct seq_file *m, void *v)
170{
171 u64 now = ktime_to_ns(ktime_get());
172 int cpu;
173
174 SEQ_printf(m, "Sched Debug Version: v0.05, %s %.*s\n",
175 init_utsname()->release,
176 (int)strcspn(init_utsname()->version, " "),
177 init_utsname()->version);
178
179 SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now);
180
181 for_each_online_cpu(cpu)
182 print_cpu(m, cpu, now);
183
184 SEQ_printf(m, "\n");
185
186 return 0;
187}
188
189void sysrq_sched_debug_show(void)
190{
191 sched_debug_show(NULL, NULL);
192}
193
194static int sched_debug_open(struct inode *inode, struct file *filp)
195{
196 return single_open(filp, sched_debug_show, NULL);
197}
198
199static struct file_operations sched_debug_fops = {
200 .open = sched_debug_open,
201 .read = seq_read,
202 .llseek = seq_lseek,
203 .release = seq_release,
204};
205
206static int __init init_sched_debug_procfs(void)
207{
208 struct proc_dir_entry *pe;
209
210 pe = create_proc_entry("sched_debug", 0644, NULL);
211 if (!pe)
212 return -ENOMEM;
213
214 pe->proc_fops = &sched_debug_fops;
215
216 return 0;
217}
218
219__initcall(init_sched_debug_procfs);
220
221void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
222{
223 unsigned long flags;
224 int num_threads = 1;
225
226 rcu_read_lock();
227 if (lock_task_sighand(p, &flags)) {
228 num_threads = atomic_read(&p->signal->count);
229 unlock_task_sighand(p, &flags);
230 }
231 rcu_read_unlock();
232
233 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
234 SEQ_printf(m, "----------------------------------------------\n");
235#define P(F) \
236 SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
237
238 P(se.wait_start);
239 P(se.wait_start_fair);
240 P(se.exec_start);
241 P(se.sleep_start);
242 P(se.sleep_start_fair);
243 P(se.block_start);
244 P(se.sleep_max);
245 P(se.block_max);
246 P(se.exec_max);
247 P(se.wait_max);
248 P(se.wait_runtime);
249 P(se.wait_runtime_overruns);
250 P(se.wait_runtime_underruns);
251 P(se.sum_wait_runtime);
252 P(se.sum_exec_runtime);
253 SEQ_printf(m, "%-25s:%20Ld\n",
254 "nr_switches", (long long)(p->nvcsw + p->nivcsw));
255 P(se.load.weight);
256 P(policy);
257 P(prio);
258#undef P
259
260 {
261 u64 t0, t1;
262
263 t0 = sched_clock();
264 t1 = sched_clock();
265 SEQ_printf(m, "%-25s:%20Ld\n",
266 "clock-delta", (long long)(t1-t0));
267 }
268}
269
270void proc_sched_set_task(struct task_struct *p)
271{
272 p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0;
273 p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
274 p->se.sum_exec_runtime = 0;
275}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
new file mode 100644
index 0000000000..6971db0a71
--- /dev/null
+++ b/kernel/sched_fair.c
@@ -0,0 +1,1131 @@
1/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 */
19
20/*
21 * Preemption granularity:
22 * (default: 2 msec, units: nanoseconds)
23 *
24 * NOTE: this granularity value is not the same as the concept of
25 * 'timeslice length' - timeslices in CFS will typically be somewhat
26 * larger than this value. (to see the precise effective timeslice
27 * length of your workload, run vmstat and monitor the context-switches
28 * field)
29 *
30 * On SMP systems the value of this is multiplied by the log2 of the
31 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
32 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
33 */
34unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ;
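The SMP scaling described in the comment can be checked with plain arithmetic. A minimal standalone sketch (not part of the patch; the in-kernel scaling is applied elsewhere in sched.c) that reproduces the 2x/3x/4x/5x factors:

#include <stdio.h>

/* Factor from the comment above: 1 + log2(nr_cpus), i.e. 2x on 2-way,
 * 3x on 4-way, 4x on 8-way, 5x on 16-way systems. */
static unsigned int gran_factor(unsigned int cpus)
{
        unsigned int factor = 1;

        while (cpus >>= 1)
                factor++;
        return factor;
}

int main(void)
{
        unsigned int base_ns = 2000000; /* 2 msec default, in nanoseconds */
        unsigned int cpus;

        for (cpus = 1; cpus <= 16; cpus *= 2)
                printf("%2u CPUs: granularity ~%u usec\n",
                       cpus, gran_factor(cpus) * base_ns / 1000);
        return 0;
}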
35
36/*
37 * SCHED_BATCH wake-up granularity.
38 * (default: 10 msec, units: nanoseconds)
39 *
40 * This option delays the preemption effects of decoupled workloads
41 * and reduces their over-scheduling. Synchronous workloads will still
42 * have immediate wakeup/sleep latencies.
43 */
44unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
45 10000000000ULL/HZ;
46
47/*
48 * SCHED_OTHER wake-up granularity.
49 * (default: 1 msec, units: nanoseconds)
50 *
51 * This option delays the preemption effects of decoupled workloads
52 * and reduces their over-scheduling. Synchronous workloads will still
53 * have immediate wakeup/sleep latencies.
54 */
55unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ;
56
57unsigned int sysctl_sched_stat_granularity __read_mostly;
58
59/*
60 * Initialized in sched_init_granularity():
61 */
62unsigned int sysctl_sched_runtime_limit __read_mostly;
63
64/*
65 * Debugging: various feature bits
66 */
67enum {
68 SCHED_FEAT_FAIR_SLEEPERS = 1,
69 SCHED_FEAT_SLEEPER_AVG = 2,
70 SCHED_FEAT_SLEEPER_LOAD_AVG = 4,
71 SCHED_FEAT_PRECISE_CPU_LOAD = 8,
72 SCHED_FEAT_START_DEBIT = 16,
73 SCHED_FEAT_SKIP_INITIAL = 32,
74};
75
76unsigned int sysctl_sched_features __read_mostly =
77 SCHED_FEAT_FAIR_SLEEPERS *1 |
78 SCHED_FEAT_SLEEPER_AVG *1 |
79 SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
80 SCHED_FEAT_PRECISE_CPU_LOAD *1 |
81 SCHED_FEAT_START_DEBIT *1 |
82 SCHED_FEAT_SKIP_INITIAL *0;
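The *1/*0 initializer above is just a readable way to OR selected feature bits together; the resulting default mask can be decoded with ordinary bit tests. An illustrative standalone sketch of the same arithmetic:

#include <stdio.h>

enum {
        SCHED_FEAT_FAIR_SLEEPERS        = 1,
        SCHED_FEAT_SLEEPER_AVG          = 2,
        SCHED_FEAT_SLEEPER_LOAD_AVG     = 4,
        SCHED_FEAT_PRECISE_CPU_LOAD     = 8,
        SCHED_FEAT_START_DEBIT          = 16,
        SCHED_FEAT_SKIP_INITIAL         = 32,
};

int main(void)
{
        /* Same arithmetic as the initializer above: every feature except
         * SKIP_INITIAL is enabled, giving 1+2+4+8+16 = 31. */
        unsigned int features =
                SCHED_FEAT_FAIR_SLEEPERS        * 1 |
                SCHED_FEAT_SLEEPER_AVG          * 1 |
                SCHED_FEAT_SLEEPER_LOAD_AVG     * 1 |
                SCHED_FEAT_PRECISE_CPU_LOAD     * 1 |
                SCHED_FEAT_START_DEBIT          * 1 |
                SCHED_FEAT_SKIP_INITIAL         * 0;

        printf("default sysctl_sched_features = %u\n", features);   /* 31 */
        printf("START_DEBIT enabled: %d\n",
               !!(features & SCHED_FEAT_START_DEBIT));               /* 1  */
        return 0;
}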
83
84extern struct sched_class fair_sched_class;
85
86/**************************************************************
87 * CFS operations on generic schedulable entities:
88 */
89
90#ifdef CONFIG_FAIR_GROUP_SCHED
91
92/* cpu runqueue to which this cfs_rq is attached */
93static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
94{
95 return cfs_rq->rq;
96}
97
98/* currently running entity (if any) on this cfs_rq */
99static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
100{
101 return cfs_rq->curr;
102}
103
104/* An entity is a task if it doesn't "own" a runqueue */
105#define entity_is_task(se) (!se->my_q)
106
107static inline void
108set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
109{
110 cfs_rq->curr = se;
111}
112
113#else /* CONFIG_FAIR_GROUP_SCHED */
114
115static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
116{
117 return container_of(cfs_rq, struct rq, cfs);
118}
119
120static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
121{
122 struct rq *rq = rq_of(cfs_rq);
123
124 if (unlikely(rq->curr->sched_class != &fair_sched_class))
125 return NULL;
126
127 return &rq->curr->se;
128}
129
130#define entity_is_task(se) 1
131
132static inline void
133set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
134
135#endif /* CONFIG_FAIR_GROUP_SCHED */
136
137static inline struct task_struct *task_of(struct sched_entity *se)
138{
139 return container_of(se, struct task_struct, se);
140}
141
142
143/**************************************************************
144 * Scheduling class tree data structure manipulation methods:
145 */
146
147/*
148 * Enqueue an entity into the rb-tree:
149 */
150static inline void
151__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
152{
153 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
154 struct rb_node *parent = NULL;
155 struct sched_entity *entry;
156 s64 key = se->fair_key;
157 int leftmost = 1;
158
159 /*
160 * Find the right place in the rbtree:
161 */
162 while (*link) {
163 parent = *link;
164 entry = rb_entry(parent, struct sched_entity, run_node);
165 /*
166 * We dont care about collisions. Nodes with
167 * the same key stay together.
168 */
169 if (key - entry->fair_key < 0) {
170 link = &parent->rb_left;
171 } else {
172 link = &parent->rb_right;
173 leftmost = 0;
174 }
175 }
176
177 /*
178 * Maintain a cache of leftmost tree entries (it is frequently
179 * used):
180 */
181 if (leftmost)
182 cfs_rq->rb_leftmost = &se->run_node;
183
184 rb_link_node(&se->run_node, parent, link);
185 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
186 update_load_add(&cfs_rq->load, se->load.weight);
187 cfs_rq->nr_running++;
188 se->on_rq = 1;
189}
190
191static inline void
192__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
193{
194 if (cfs_rq->rb_leftmost == &se->run_node)
195 cfs_rq->rb_leftmost = rb_next(&se->run_node);
196 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
197 update_load_sub(&cfs_rq->load, se->load.weight);
198 cfs_rq->nr_running--;
199 se->on_rq = 0;
200}
201
202static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
203{
204 return cfs_rq->rb_leftmost;
205}
206
207static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
208{
209 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
210}
211
212/**************************************************************
213 * Scheduling class statistics methods:
214 */
215
216/*
217 * We rescale the rescheduling granularity of tasks according to their
218 * nice level, but only linearly, not exponentially:
219 */
220static long
221niced_granularity(struct sched_entity *curr, unsigned long granularity)
222{
223 u64 tmp;
224
225 /*
226 * Negative nice levels get the same granularity as nice-0:
227 */
228 if (likely(curr->load.weight >= NICE_0_LOAD))
229 return granularity;
230 /*
231 * Positive nice level tasks get linearly finer
232 * granularity:
233 */
234 tmp = curr->load.weight * (u64)granularity;
235
236 /*
237 * It will always fit into 'long':
238 */
239 return (long) (tmp >> NICE_0_SHIFT);
240}
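A worked example of the linear rescaling done by niced_granularity(), assuming NICE_0_LOAD == 1 << NICE_0_SHIFT == 1024 (the constants themselves live in sched.c, so treat the values as assumptions):

#include <stdio.h>

#define NICE_0_SHIFT    10
#define NICE_0_LOAD     (1UL << NICE_0_SHIFT)   /* assumed: 1024 */

/* Mirrors the logic above: weights >= NICE_0_LOAD keep the full
 * granularity, lighter (positive-nice) weights get it scaled down. */
static long niced_gran(unsigned long weight, unsigned long granularity)
{
        if (weight >= NICE_0_LOAD)
                return granularity;
        return (long)(((unsigned long long)weight * granularity) >> NICE_0_SHIFT);
}

int main(void)
{
        unsigned long gran = 2000000;   /* 2 msec in ns */

        printf("nice 0  (weight 1024): %ld ns\n", niced_gran(1024, gran));
        printf("lighter (weight  512): %ld ns\n", niced_gran(512, gran));  /* half    */
        printf("lighter (weight  256): %ld ns\n", niced_gran(256, gran));  /* quarter */
        return 0;
}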
241
242static inline void
243limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se)
244{
245 long limit = sysctl_sched_runtime_limit;
246
247 /*
248 * Niced tasks have the same history dynamic range as
249 * non-niced tasks:
250 */
251 if (unlikely(se->wait_runtime > limit)) {
252 se->wait_runtime = limit;
253 schedstat_inc(se, wait_runtime_overruns);
254 schedstat_inc(cfs_rq, wait_runtime_overruns);
255 }
256 if (unlikely(se->wait_runtime < -limit)) {
257 se->wait_runtime = -limit;
258 schedstat_inc(se, wait_runtime_underruns);
259 schedstat_inc(cfs_rq, wait_runtime_underruns);
260 }
261}
262
263static inline void
264__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
265{
266 se->wait_runtime += delta;
267 schedstat_add(se, sum_wait_runtime, delta);
268 limit_wait_runtime(cfs_rq, se);
269}
270
271static void
272add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
273{
274 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
275 __add_wait_runtime(cfs_rq, se, delta);
276 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
277}
278
279/*
280 * Update the current task's runtime statistics. Skip current tasks that
281 * are not in our scheduling class.
282 */
283static inline void
284__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
285{
286 unsigned long delta, delta_exec, delta_fair;
287 long delta_mine;
288 struct load_weight *lw = &cfs_rq->load;
289 unsigned long load = lw->weight;
290
291 if (unlikely(!load))
292 return;
293
294 delta_exec = curr->delta_exec;
295#ifdef CONFIG_SCHEDSTATS
296 if (unlikely(delta_exec > curr->exec_max))
297 curr->exec_max = delta_exec;
298#endif
299
300 curr->sum_exec_runtime += delta_exec;
301 cfs_rq->exec_clock += delta_exec;
302
303 delta_fair = calc_delta_fair(delta_exec, lw);
304 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
305
306 if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) {
307 delta = calc_delta_mine(cfs_rq->sleeper_bonus,
308 curr->load.weight, lw);
309 if (unlikely(delta > cfs_rq->sleeper_bonus))
310 delta = cfs_rq->sleeper_bonus;
311
312 cfs_rq->sleeper_bonus -= delta;
313 delta_mine -= delta;
314 }
315
316 cfs_rq->fair_clock += delta_fair;
317 /*
318 * We executed delta_exec amount of time on the CPU,
319 * but we were only entitled to delta_mine amount of
320 * time during that period (if nr_running == 1 then
321 * the two values are equal)
322 * [Note: delta_mine - delta_exec is negative]:
323 */
324 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec);
325}
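To see what add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec) does to the running entity, here is a back-of-the-envelope sketch with two runnable tasks of equal weight (pure illustration):

#include <stdio.h>

int main(void)
{
        /* Two runnable tasks of equal weight: the running one is entitled
         * to half of the wall-clock time it actually consumed. */
        long long delta_exec = 1000000;                 /* ran for 1 ms       */
        long long delta_mine = delta_exec / 2;          /* entitled to 0.5 ms */

        long long curr_wait_runtime = 0;
        curr_wait_runtime += delta_mine - delta_exec;   /* -0.5 ms */

        /* The waiting task later gains roughly the same amount with the
         * opposite sign via update_stats_wait_end(), keeping the sum of
         * wait_runtime across the runqueue near zero. */
        printf("running task wait_runtime: %lld ns\n", curr_wait_runtime);
        return 0;
}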
326
327static void update_curr(struct cfs_rq *cfs_rq, u64 now)
328{
329 struct sched_entity *curr = cfs_rq_curr(cfs_rq);
330 unsigned long delta_exec;
331
332 if (unlikely(!curr))
333 return;
334
335 /*
336 * Get the amount of time the current task was running
337 * since the last time we changed load (this cannot
338 * overflow on 32 bits):
339 */
340 delta_exec = (unsigned long)(now - curr->exec_start);
341
342 curr->delta_exec += delta_exec;
343
344 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
345 __update_curr(cfs_rq, curr, now);
346 curr->delta_exec = 0;
347 }
348 curr->exec_start = now;
349}
350
351static inline void
352update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
353{
354 se->wait_start_fair = cfs_rq->fair_clock;
355 se->wait_start = now;
356}
357
358/*
359 * We calculate fair deltas here, so protect against the random effects
360 * of a multiplication overflow by capping it to the runtime limit:
361 */
362#if BITS_PER_LONG == 32
363static inline unsigned long
364calc_weighted(unsigned long delta, unsigned long weight, int shift)
365{
366 u64 tmp = (u64)delta * weight >> shift;
367
368 if (unlikely(tmp > sysctl_sched_runtime_limit*2))
369 return sysctl_sched_runtime_limit*2;
370 return tmp;
371}
372#else
373static inline unsigned long
374calc_weighted(unsigned long delta, unsigned long weight, int shift)
375{
376 return delta * weight >> shift;
377}
378#endif
379
380/*
381 * Task is being enqueued - update stats:
382 */
383static void
384update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
385{
386 s64 key;
387
388 /*
389 * Are we enqueueing a waiting task? (for current tasks
390 * a dequeue/enqueue event is a NOP)
391 */
392 if (se != cfs_rq_curr(cfs_rq))
393 update_stats_wait_start(cfs_rq, se, now);
394 /*
395 * Update the key:
396 */
397 key = cfs_rq->fair_clock;
398
399 /*
400 * Optimize the common nice 0 case:
401 */
402 if (likely(se->load.weight == NICE_0_LOAD)) {
403 key -= se->wait_runtime;
404 } else {
405 u64 tmp;
406
407 if (se->wait_runtime < 0) {
408 tmp = -se->wait_runtime;
409 key += (tmp * se->load.inv_weight) >>
410 (WMULT_SHIFT - NICE_0_SHIFT);
411 } else {
412 tmp = se->wait_runtime;
413 key -= (tmp * se->load.weight) >> NICE_0_SHIFT;
414 }
415 }
416
417 se->fair_key = key;
418}
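The key computed above is essentially fair_clock minus the weight-scaled wait_runtime, so a task that is owed CPU time sorts further left in the rbtree. A small standalone illustration of that ordering for the nice-0 case:

#include <stdio.h>

int main(void)
{
        long long fair_clock = 10000000;        /* current cfs_rq->fair_clock */

        /* nice-0 case from update_stats_enqueue(): key = fair_clock - wait_runtime */
        long long key_owed  = fair_clock - (+500000);   /* waited, is owed 0.5 ms */
        long long key_ahead = fair_clock - (-500000);   /* ran ahead by 0.5 ms    */

        /* Smaller key == further left == picked earlier by __pick_next_entity() */
        printf("owed  CPU: key %lld\n", key_owed);      /*  9500000 */
        printf("ran ahead: key %lld\n", key_ahead);     /* 10500000 */
        return 0;
}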
419
420/*
421 * Note: must be called with a freshly updated rq->fair_clock.
422 */
423static inline void
424__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
425{
426 unsigned long delta_fair = se->delta_fair_run;
427
428#ifdef CONFIG_SCHEDSTATS
429 {
430 s64 delta_wait = now - se->wait_start;
431 if (unlikely(delta_wait > se->wait_max))
432 se->wait_max = delta_wait;
433 }
434#endif
435
436 if (unlikely(se->load.weight != NICE_0_LOAD))
437 delta_fair = calc_weighted(delta_fair, se->load.weight,
438 NICE_0_SHIFT);
439
440 add_wait_runtime(cfs_rq, se, delta_fair);
441}
442
443static void
444update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
445{
446 unsigned long delta_fair;
447
448 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
449 (u64)(cfs_rq->fair_clock - se->wait_start_fair));
450
451 se->delta_fair_run += delta_fair;
452 if (unlikely(abs(se->delta_fair_run) >=
453 sysctl_sched_stat_granularity)) {
454 __update_stats_wait_end(cfs_rq, se, now);
455 se->delta_fair_run = 0;
456 }
457
458 se->wait_start_fair = 0;
459 se->wait_start = 0;
460}
461
462static inline void
463update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
464{
465 update_curr(cfs_rq, now);
466 /*
467 * Mark the end of the wait period if dequeueing a
468 * waiting task:
469 */
470 if (se != cfs_rq_curr(cfs_rq))
471 update_stats_wait_end(cfs_rq, se, now);
472}
473
474/*
475 * We are picking a new current task - update its stats:
476 */
477static inline void
478update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
479{
480 /*
481 * We are starting a new run period:
482 */
483 se->exec_start = now;
484}
485
486/*
487 * We are descheduling a task - update its stats:
488 */
489static inline void
490update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
491{
492 se->exec_start = 0;
493}
494
495/**************************************************
496 * Scheduling class queueing methods:
497 */
498
499static void
500__enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
501{
502 unsigned long load = cfs_rq->load.weight, delta_fair;
503 long prev_runtime;
504
505 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
506 load = rq_of(cfs_rq)->cpu_load[2];
507
508 delta_fair = se->delta_fair_sleep;
509
510 /*
511 * Fix up delta_fair with the effect of us running
512 * during the whole sleep period:
513 */
514 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
515 delta_fair = div64_likely32((u64)delta_fair * load,
516 load + se->load.weight);
517
518 if (unlikely(se->load.weight != NICE_0_LOAD))
519 delta_fair = calc_weighted(delta_fair, se->load.weight,
520 NICE_0_SHIFT);
521
522 prev_runtime = se->wait_runtime;
523 __add_wait_runtime(cfs_rq, se, delta_fair);
524 delta_fair = se->wait_runtime - prev_runtime;
525
526 /*
527 * Track the amount of bonus we've given to sleepers:
528 */
529 cfs_rq->sleeper_bonus += delta_fair;
530
531 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
532}
533
534static void
535enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
536{
537 struct task_struct *tsk = task_of(se);
538 unsigned long delta_fair;
539
540 if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
541 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS))
542 return;
543
544 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
545 (u64)(cfs_rq->fair_clock - se->sleep_start_fair));
546
547 se->delta_fair_sleep += delta_fair;
548 if (unlikely(abs(se->delta_fair_sleep) >=
549 sysctl_sched_stat_granularity)) {
550 __enqueue_sleeper(cfs_rq, se, now);
551 se->delta_fair_sleep = 0;
552 }
553
554 se->sleep_start_fair = 0;
555
556#ifdef CONFIG_SCHEDSTATS
557 if (se->sleep_start) {
558 u64 delta = now - se->sleep_start;
559
560 if ((s64)delta < 0)
561 delta = 0;
562
563 if (unlikely(delta > se->sleep_max))
564 se->sleep_max = delta;
565
566 se->sleep_start = 0;
567 se->sum_sleep_runtime += delta;
568 }
569 if (se->block_start) {
570 u64 delta = now - se->block_start;
571
572 if ((s64)delta < 0)
573 delta = 0;
574
575 if (unlikely(delta > se->block_max))
576 se->block_max = delta;
577
578 se->block_start = 0;
579 se->sum_sleep_runtime += delta;
580 }
581#endif
582}
583
584static void
585enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
586 int wakeup, u64 now)
587{
588 /*
589 * Update the fair clock.
590 */
591 update_curr(cfs_rq, now);
592
593 if (wakeup)
594 enqueue_sleeper(cfs_rq, se, now);
595
596 update_stats_enqueue(cfs_rq, se, now);
597 __enqueue_entity(cfs_rq, se);
598}
599
600static void
601dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
602 int sleep, u64 now)
603{
604 update_stats_dequeue(cfs_rq, se, now);
605 if (sleep) {
606 se->sleep_start_fair = cfs_rq->fair_clock;
607#ifdef CONFIG_SCHEDSTATS
608 if (entity_is_task(se)) {
609 struct task_struct *tsk = task_of(se);
610
611 if (tsk->state & TASK_INTERRUPTIBLE)
612 se->sleep_start = now;
613 if (tsk->state & TASK_UNINTERRUPTIBLE)
614 se->block_start = now;
615 }
616 cfs_rq->wait_runtime -= se->wait_runtime;
617#endif
618 }
619 __dequeue_entity(cfs_rq, se);
620}
621
622/*
623 * Preempt the current task with a newly woken task if needed:
624 */
625static void
626__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
627 struct sched_entity *curr, unsigned long granularity)
628{
629 s64 __delta = curr->fair_key - se->fair_key;
630
631 /*
632 * Take scheduling granularity into account - do not
633 * preempt the current task unless the best task has
634 * a larger than sched_granularity fairness advantage:
635 */
636 if (__delta > niced_granularity(curr, granularity))
637 resched_task(rq_of(cfs_rq)->curr);
638}
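Putting the key delta and niced_granularity() together: the current task is only preempted once the wakee's fairness advantage exceeds the nice-scaled granularity. A rough sketch of the decision with made-up numbers:

#include <stdio.h>

int main(void)
{
        long long curr_key = 10400000;
        long long se_key   = 10000000;
        long granularity   = 2000000;   /* effective niced granularity, ns */

        /* Same test as __check_preempt_curr_fair(): only a > granularity
         * advantage triggers resched_task(). */
        if (curr_key - se_key > granularity)
                printf("preempt current task\n");
        else
                printf("let current task keep running\n"); /* 0.4 ms < 2 ms */
        return 0;
}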
639
640static inline void
641set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
642{
643 /*
644 * Any task has to be enqueued before it get to execute on
645 * a CPU. So account for the time it spent waiting on the
646 * runqueue. (note, here we rely on pick_next_task() having
647 * done a put_prev_task_fair() shortly before this, which
648 * updated rq->fair_clock - used by update_stats_wait_end())
649 */
650 update_stats_wait_end(cfs_rq, se, now);
651 update_stats_curr_start(cfs_rq, se, now);
652 set_cfs_rq_curr(cfs_rq, se);
653}
654
655static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
656{
657 struct sched_entity *se = __pick_next_entity(cfs_rq);
658
659 set_next_entity(cfs_rq, se, now);
660
661 return se;
662}
663
664static void
665put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now)
666{
667 /*
668 * If still on the runqueue then deactivate_task()
669 * was not called and update_curr() has to be done:
670 */
671 if (prev->on_rq)
672 update_curr(cfs_rq, now);
673
674 update_stats_curr_end(cfs_rq, prev, now);
675
676 if (prev->on_rq)
677 update_stats_wait_start(cfs_rq, prev, now);
678 set_cfs_rq_curr(cfs_rq, NULL);
679}
680
681static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
682{
683 struct rq *rq = rq_of(cfs_rq);
684 struct sched_entity *next;
685 u64 now = __rq_clock(rq);
686
687 /*
688 * Dequeue and enqueue the task to update its
689 * position within the tree:
690 */
691 dequeue_entity(cfs_rq, curr, 0, now);
692 enqueue_entity(cfs_rq, curr, 0, now);
693
694 /*
695 * Reschedule if another task tops the current one.
696 */
697 next = __pick_next_entity(cfs_rq);
698 if (next == curr)
699 return;
700
701 __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity);
702}
703
704/**************************************************
705 * CFS operations on tasks:
706 */
707
708#ifdef CONFIG_FAIR_GROUP_SCHED
709
710/* Walk up scheduling entities hierarchy */
711#define for_each_sched_entity(se) \
712 for (; se; se = se->parent)
713
714static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
715{
716 return p->se.cfs_rq;
717}
718
719/* runqueue on which this entity is (to be) queued */
720static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
721{
722 return se->cfs_rq;
723}
724
725/* runqueue "owned" by this group */
726static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
727{
728 return grp->my_q;
729}
730
731/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
732 * another cpu ('this_cpu')
733 */
734static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
735{
736 /* A later patch will take group into account */
737 return &cpu_rq(this_cpu)->cfs;
738}
739
740/* Iterate thr' all leaf cfs_rq's on a runqueue */
741#define for_each_leaf_cfs_rq(rq, cfs_rq) \
742 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
743
744/* Do the two (enqueued) tasks belong to the same group ? */
745static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
746{
747 if (curr->se.cfs_rq == p->se.cfs_rq)
748 return 1;
749
750 return 0;
751}
752
753#else /* CONFIG_FAIR_GROUP_SCHED */
754
755#define for_each_sched_entity(se) \
756 for (; se; se = NULL)
757
758static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
759{
760 return &task_rq(p)->cfs;
761}
762
763static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
764{
765 struct task_struct *p = task_of(se);
766 struct rq *rq = task_rq(p);
767
768 return &rq->cfs;
769}
770
771/* runqueue "owned" by this group */
772static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
773{
774 return NULL;
775}
776
777static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
778{
779 return &cpu_rq(this_cpu)->cfs;
780}
781
782#define for_each_leaf_cfs_rq(rq, cfs_rq) \
783 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
784
785static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
786{
787 return 1;
788}
789
790#endif /* CONFIG_FAIR_GROUP_SCHED */
791
792/*
793 * The enqueue_task method is called before nr_running is
794 * increased. Here we update the fair scheduling stats and
795 * then put the task into the rbtree:
796 */
797static void
798enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
799{
800 struct cfs_rq *cfs_rq;
801 struct sched_entity *se = &p->se;
802
803 for_each_sched_entity(se) {
804 if (se->on_rq)
805 break;
806 cfs_rq = cfs_rq_of(se);
807 enqueue_entity(cfs_rq, se, wakeup, now);
808 }
809}
810
811/*
812 * The dequeue_task method is called before nr_running is
813 * decreased. We remove the task from the rbtree and
814 * update the fair scheduling stats:
815 */
816static void
817dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
818{
819 struct cfs_rq *cfs_rq;
820 struct sched_entity *se = &p->se;
821
822 for_each_sched_entity(se) {
823 cfs_rq = cfs_rq_of(se);
824 dequeue_entity(cfs_rq, se, sleep, now);
825 /* Don't dequeue parent if it has other entities besides us */
826 if (cfs_rq->load.weight)
827 break;
828 }
829}
830
831/*
832 * sched_yield() support is very simple - we dequeue and enqueue
833 */
834static void yield_task_fair(struct rq *rq, struct task_struct *p)
835{
836 struct cfs_rq *cfs_rq = task_cfs_rq(p);
837 u64 now = __rq_clock(rq);
838
839 /*
840 * Dequeue and enqueue the task to update its
841 * position within the tree:
842 */
843 dequeue_entity(cfs_rq, &p->se, 0, now);
844 enqueue_entity(cfs_rq, &p->se, 0, now);
845}
846
847/*
848 * Preempt the current task with a newly woken task if needed:
849 */
850static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
851{
852 struct task_struct *curr = rq->curr;
853 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
854 unsigned long gran;
855
856 if (unlikely(rt_prio(p->prio))) {
857 update_curr(cfs_rq, rq_clock(rq));
858 resched_task(curr);
859 return;
860 }
861
862 gran = sysctl_sched_wakeup_granularity;
863 /*
864 * Batch tasks prefer throughput over latency:
865 */
866 if (unlikely(p->policy == SCHED_BATCH))
867 gran = sysctl_sched_batch_wakeup_granularity;
868
869 if (is_same_group(curr, p))
870 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
871}
872
873static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now)
874{
875 struct cfs_rq *cfs_rq = &rq->cfs;
876 struct sched_entity *se;
877
878 if (unlikely(!cfs_rq->nr_running))
879 return NULL;
880
881 do {
882 se = pick_next_entity(cfs_rq, now);
883 cfs_rq = group_cfs_rq(se);
884 } while (cfs_rq);
885
886 return task_of(se);
887}
888
889/*
890 * Account for a descheduled task:
891 */
892static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now)
893{
894 struct sched_entity *se = &prev->se;
895 struct cfs_rq *cfs_rq;
896
897 for_each_sched_entity(se) {
898 cfs_rq = cfs_rq_of(se);
899 put_prev_entity(cfs_rq, se, now);
900 }
901}
902
903/**************************************************
904 * Fair scheduling class load-balancing methods:
905 */
906
907/*
908 * Load-balancing iterator. Note: while the runqueue stays locked
909 * during the whole iteration, the current task might be
910 * dequeued so the iterator has to be dequeue-safe. Here we
911 * achieve that by always pre-iterating before returning
912 * the current task:
913 */
914static inline struct task_struct *
915__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
916{
917 struct task_struct *p;
918
919 if (!curr)
920 return NULL;
921
922 p = rb_entry(curr, struct task_struct, se.run_node);
923 cfs_rq->rb_load_balance_curr = rb_next(curr);
924
925 return p;
926}
927
928static struct task_struct *load_balance_start_fair(void *arg)
929{
930 struct cfs_rq *cfs_rq = arg;
931
932 return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
933}
934
935static struct task_struct *load_balance_next_fair(void *arg)
936{
937 struct cfs_rq *cfs_rq = arg;
938
939 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
940}
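The start/next pair above is consumed by the generic balance_tasks() walker in sched.c. The sketch below shows how such a dequeue-safe iterator is typically driven; the walker itself is a stand-in, only the {arg, start, next} shape is taken from the surrounding code:

/* Illustrative consumer of a {arg, start, next} iterator, mirroring how
 * balance_tasks() walks a busy cfs_rq.  Everything except the iterator
 * shape is a sketch. */
struct task_struct;

struct rq_iterator {
        void *arg;
        struct task_struct *(*start)(void *);
        struct task_struct *(*next)(void *);
};

static int walk_runnable_tasks(struct rq_iterator *it,
                               int (*visit)(struct task_struct *))
{
        struct task_struct *p;
        int visited = 0;

        /* Because the iterator pre-advances before returning a task,
         * 'visit' may dequeue p without invalidating the walk. */
        for (p = it->start(it->arg); p; p = it->next(it->arg))
                visited += visit(p);

        return visited;
}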
941
942static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
943{
944 struct sched_entity *curr;
945 struct task_struct *p;
946
947 if (!cfs_rq->nr_running)
948 return MAX_PRIO;
949
950 curr = __pick_next_entity(cfs_rq);
951 p = task_of(curr);
952
953 return p->prio;
954}
955
956static int
957load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
958 unsigned long max_nr_move, unsigned long max_load_move,
959 struct sched_domain *sd, enum cpu_idle_type idle,
960 int *all_pinned, unsigned long *total_load_moved)
961{
962 struct cfs_rq *busy_cfs_rq;
963 unsigned long load_moved, total_nr_moved = 0, nr_moved;
964 long rem_load_move = max_load_move;
965 struct rq_iterator cfs_rq_iterator;
966
967 cfs_rq_iterator.start = load_balance_start_fair;
968 cfs_rq_iterator.next = load_balance_next_fair;
969
970 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
971 struct cfs_rq *this_cfs_rq;
972 long imbalance;
973 unsigned long maxload;
974 int this_best_prio, best_prio, best_prio_seen = 0;
975
976 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
977
978 imbalance = busy_cfs_rq->load.weight -
979 this_cfs_rq->load.weight;
980 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
981 if (imbalance <= 0)
982 continue;
983
984 /* Don't pull more than imbalance/2 */
985 imbalance /= 2;
986 maxload = min(rem_load_move, imbalance);
987
988 this_best_prio = cfs_rq_best_prio(this_cfs_rq);
989 best_prio = cfs_rq_best_prio(busy_cfs_rq);
990
991 /*
992 * Enable handling of the case where there is more than one task
993 * with the best priority. If the current running task is one
994 * of those with prio==best_prio we know it won't be moved
995 * and therefore it's safe to override the skip (based on load)
996 * of any task we find with that prio.
997 */
998 if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
999 best_prio_seen = 1;
1000
1001 /* pass busy_cfs_rq argument into
1002 * load_balance_[start|next]_fair iterators
1003 */
1004 cfs_rq_iterator.arg = busy_cfs_rq;
1005 nr_moved = balance_tasks(this_rq, this_cpu, busiest,
1006 max_nr_move, maxload, sd, idle, all_pinned,
1007 &load_moved, this_best_prio, best_prio,
1008 best_prio_seen, &cfs_rq_iterator);
1009
1010 total_nr_moved += nr_moved;
1011 max_nr_move -= nr_moved;
1012 rem_load_move -= load_moved;
1013
1014 if (max_nr_move <= 0 || rem_load_move <= 0)
1015 break;
1016 }
1017
1018 *total_load_moved = max_load_move - rem_load_move;
1019
1020 return total_nr_moved;
1021}
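A quick arithmetic check of the imbalance/2 clamp used above, with made-up per-cfs_rq weights (1024 per nice-0 task is assumed):

#include <stdio.h>

int main(void)
{
        long busy_weight   = 3072;      /* e.g. three nice-0 tasks on the busy CPU */
        long this_weight   = 1024;      /* one nice-0 task locally */
        long rem_load_move = 2048;      /* what the caller still wants moved */

        long imbalance = busy_weight - this_weight;     /* 2048 */
        imbalance /= 2;                                 /* pull at most 1024 */

        long maxload = rem_load_move < imbalance ? rem_load_move : imbalance;
        printf("maxload passed to balance_tasks(): %ld\n", maxload);    /* 1024 */
        return 0;
}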
1022
1023/*
1024 * scheduler tick hitting a task of our scheduling class:
1025 */
1026static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1027{
1028 struct cfs_rq *cfs_rq;
1029 struct sched_entity *se = &curr->se;
1030
1031 for_each_sched_entity(se) {
1032 cfs_rq = cfs_rq_of(se);
1033 entity_tick(cfs_rq, se);
1034 }
1035}
1036
1037/*
1038 * Share the fairness runtime between parent and child, thus the
1039 * total amount of pressure for CPU stays equal - new tasks
1040 * get a chance to run but frequent forkers are not allowed to
1041 * monopolize the CPU. Note: the parent runqueue is locked,
1042 * the child is not running yet.
1043 */
1044static void task_new_fair(struct rq *rq, struct task_struct *p)
1045{
1046 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1047 struct sched_entity *se = &p->se;
1048 u64 now = rq_clock(rq);
1049
1050 sched_info_queued(p);
1051
1052 update_stats_enqueue(cfs_rq, se, now);
1053 /*
1054 * Child runs first: we let it run before the parent
1055 * until it reschedules once. We set up the key so that
1056 * it will preempt the parent:
1057 */
1058 p->se.fair_key = current->se.fair_key -
1059 niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1;
1060 /*
1061 * The first wait is dominated by the child-runs-first logic,
1062 * so do not credit it with that waiting time yet:
1063 */
1064 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
1065 p->se.wait_start_fair = 0;
1066
1067 /*
1068 * The statistical average of wait_runtime is about
1069 * -granularity/2, so initialize the task with that:
1070 */
1071 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
1072 p->se.wait_runtime = -(sysctl_sched_granularity / 2);
1073
1074 __enqueue_entity(cfs_rq, se);
1075 inc_nr_running(p, rq, now);
1076}
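The fork-time setup above can be summarized numerically. Assuming a 2 msec effective granularity and a nice-0 parent, the child's key and initial wait_runtime come out as follows (illustration only):

#include <stdio.h>

int main(void)
{
        long long parent_key  = 50000000;
        long long granularity = 2000000;        /* niced_granularity() for nice 0 */

        /* Child-runs-first: key just below what would preempt the parent. */
        long long child_key = parent_key - granularity - 1;

        /* SCHED_FEAT_START_DEBIT: start with the statistical average debt. */
        long long child_wait_runtime = -(granularity / 2);

        printf("child fair_key:     %lld\n", child_key);               /* 47999999 */
        printf("child wait_runtime: %lld\n", child_wait_runtime);      /* -1000000 */
        return 0;
}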
1077
1078#ifdef CONFIG_FAIR_GROUP_SCHED
1079/* Account for a task changing its policy or group.
1080 *
1081 * This routine is mostly called to set cfs_rq->curr field when a task
1082 * migrates between groups/classes.
1083 */
1084static void set_curr_task_fair(struct rq *rq)
1085{
1086 struct task_struct *curr = rq->curr;
1087 struct sched_entity *se = &curr->se;
1088 u64 now = rq_clock(rq);
1089 struct cfs_rq *cfs_rq;
1090
1091 for_each_sched_entity(se) {
1092 cfs_rq = cfs_rq_of(se);
1093 set_next_entity(cfs_rq, se, now);
1094 }
1095}
1096#else
1097static void set_curr_task_fair(struct rq *rq)
1098{
1099}
1100#endif
1101
1102/*
1103 * All the scheduling class methods:
1104 */
1105struct sched_class fair_sched_class __read_mostly = {
1106 .enqueue_task = enqueue_task_fair,
1107 .dequeue_task = dequeue_task_fair,
1108 .yield_task = yield_task_fair,
1109
1110 .check_preempt_curr = check_preempt_curr_fair,
1111
1112 .pick_next_task = pick_next_task_fair,
1113 .put_prev_task = put_prev_task_fair,
1114
1115 .load_balance = load_balance_fair,
1116
1117 .set_curr_task = set_curr_task_fair,
1118 .task_tick = task_tick_fair,
1119 .task_new = task_new_fair,
1120};
1121
1122#ifdef CONFIG_SCHED_DEBUG
1123void print_cfs_stats(struct seq_file *m, int cpu, u64 now)
1124{
1125 struct rq *rq = cpu_rq(cpu);
1126 struct cfs_rq *cfs_rq;
1127
1128 for_each_leaf_cfs_rq(rq, cfs_rq)
1129 print_cfs_rq(m, cpu, cfs_rq, now);
1130}
1131#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
new file mode 100644
index 0000000000..41841e741c
--- /dev/null
+++ b/kernel/sched_idletask.c
@@ -0,0 +1,71 @@
1/*
2 * idle-task scheduling class.
3 *
4 * (NOTE: these are not related to SCHED_IDLE tasks which are
5 * handled in sched_fair.c)
6 */
7
8/*
9 * Idle tasks are unconditionally rescheduled:
10 */
11static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
12{
13 resched_task(rq->idle);
14}
15
16static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now)
17{
18 schedstat_inc(rq, sched_goidle);
19
20 return rq->idle;
21}
22
23/*
24 * It is not legal to sleep in the idle task - print a warning
25 * message if some code attempts to do it:
26 */
27static void
28dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now)
29{
30 spin_unlock_irq(&rq->lock);
31 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
32 dump_stack();
33 spin_lock_irq(&rq->lock);
34}
35
36static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now)
37{
38}
39
40static int
41load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
42 unsigned long max_nr_move, unsigned long max_load_move,
43 struct sched_domain *sd, enum cpu_idle_type idle,
44 int *all_pinned, unsigned long *total_load_moved)
45{
46 return 0;
47}
48
49static void task_tick_idle(struct rq *rq, struct task_struct *curr)
50{
51}
52
53/*
54 * Simple, special scheduling class for the per-CPU idle tasks:
55 */
56static struct sched_class idle_sched_class __read_mostly = {
57 /* no enqueue/yield_task for idle tasks */
58
59 /* dequeue is not valid, we print a debug message there: */
60 .dequeue_task = dequeue_task_idle,
61
62 .check_preempt_curr = check_preempt_curr_idle,
63
64 .pick_next_task = pick_next_task_idle,
65 .put_prev_task = put_prev_task_idle,
66
67 .load_balance = load_balance_idle,
68
69 .task_tick = task_tick_idle,
70 /* no .task_new for idle tasks */
71};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
new file mode 100644
index 0000000000..1192a2741b
--- /dev/null
+++ b/kernel/sched_rt.c
@@ -0,0 +1,255 @@
1/*
2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3 * policies)
4 */
5
6/*
7 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class.
9 */
10static inline void update_curr_rt(struct rq *rq, u64 now)
11{
12 struct task_struct *curr = rq->curr;
13 u64 delta_exec;
14
15 if (!task_has_rt_policy(curr))
16 return;
17
18 delta_exec = now - curr->se.exec_start;
19 if (unlikely((s64)delta_exec < 0))
20 delta_exec = 0;
21 if (unlikely(delta_exec > curr->se.exec_max))
22 curr->se.exec_max = delta_exec;
23
24 curr->se.sum_exec_runtime += delta_exec;
25 curr->se.exec_start = now;
26}
27
28static void
29enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
30{
31 struct rt_prio_array *array = &rq->rt.active;
32
33 list_add_tail(&p->run_list, array->queue + p->prio);
34 __set_bit(p->prio, array->bitmap);
35}
36
37/*
38 * Adding/removing a task to/from a priority array:
39 */
40static void
41dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now)
42{
43 struct rt_prio_array *array = &rq->rt.active;
44
45 update_curr_rt(rq, now);
46
47 list_del(&p->run_list);
48 if (list_empty(array->queue + p->prio))
49 __clear_bit(p->prio, array->bitmap);
50}
51
52/*
53 * Put task to the end of the run list without the overhead of dequeue
54 * followed by enqueue.
55 */
56static void requeue_task_rt(struct rq *rq, struct task_struct *p)
57{
58 struct rt_prio_array *array = &rq->rt.active;
59
60 list_move_tail(&p->run_list, array->queue + p->prio);
61}
62
63static void
64yield_task_rt(struct rq *rq, struct task_struct *p)
65{
66 requeue_task_rt(rq, p);
67}
68
69/*
70 * Preempt the current task with a newly woken task if needed:
71 */
72static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
73{
74 if (p->prio < rq->curr->prio)
75 resched_task(rq->curr);
76}
77
78static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now)
79{
80 struct rt_prio_array *array = &rq->rt.active;
81 struct task_struct *next;
82 struct list_head *queue;
83 int idx;
84
85 idx = sched_find_first_bit(array->bitmap);
86 if (idx >= MAX_RT_PRIO)
87 return NULL;
88
89 queue = array->queue + idx;
90 next = list_entry(queue->next, struct task_struct, run_list);
91
92 next->se.exec_start = now;
93
94 return next;
95}
96
97static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now)
98{
99 update_curr_rt(rq, now);
100 p->se.exec_start = 0;
101}
102
103/*
104 * Load-balancing iterator. Note: while the runqueue stays locked
105 * during the whole iteration, the current task might be
106 * dequeued so the iterator has to be dequeue-safe. Here we
107 * achieve that by always pre-iterating before returning
108 * the current task:
109 */
110static struct task_struct *load_balance_start_rt(void *arg)
111{
112 struct rq *rq = arg;
113 struct rt_prio_array *array = &rq->rt.active;
114 struct list_head *head, *curr;
115 struct task_struct *p;
116 int idx;
117
118 idx = sched_find_first_bit(array->bitmap);
119 if (idx >= MAX_RT_PRIO)
120 return NULL;
121
122 head = array->queue + idx;
123 curr = head->prev;
124
125 p = list_entry(curr, struct task_struct, run_list);
126
127 curr = curr->prev;
128
129 rq->rt.rt_load_balance_idx = idx;
130 rq->rt.rt_load_balance_head = head;
131 rq->rt.rt_load_balance_curr = curr;
132
133 return p;
134}
135
136static struct task_struct *load_balance_next_rt(void *arg)
137{
138 struct rq *rq = arg;
139 struct rt_prio_array *array = &rq->rt.active;
140 struct list_head *head, *curr;
141 struct task_struct *p;
142 int idx;
143
144 idx = rq->rt.rt_load_balance_idx;
145 head = rq->rt.rt_load_balance_head;
146 curr = rq->rt.rt_load_balance_curr;
147
148 /*
149 * If we arrived back to the head again then
150 * iterate to the next queue (if any):
151 */
152 if (unlikely(head == curr)) {
153 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
154
155 if (next_idx >= MAX_RT_PRIO)
156 return NULL;
157
158 idx = next_idx;
159 head = array->queue + idx;
160 curr = head->prev;
161
162 rq->rt.rt_load_balance_idx = idx;
163 rq->rt.rt_load_balance_head = head;
164 }
165
166 p = list_entry(curr, struct task_struct, run_list);
167
168 curr = curr->prev;
169
170 rq->rt.rt_load_balance_curr = curr;
171
172 return p;
173}
174
175static int
176load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
177 unsigned long max_nr_move, unsigned long max_load_move,
178 struct sched_domain *sd, enum cpu_idle_type idle,
179 int *all_pinned, unsigned long *load_moved)
180{
181 int this_best_prio, best_prio, best_prio_seen = 0;
182 int nr_moved;
183 struct rq_iterator rt_rq_iterator;
184
185 best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
186 this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
187
188 /*
189 * Enable handling of the case where there is more than one task
190 * with the best priority. If the current running task is one
191 * of those with prio==best_prio we know it won't be moved
192 * and therefore it's safe to override the skip (based on load)
193 * of any task we find with that prio.
194 */
195 if (busiest->curr->prio == best_prio)
196 best_prio_seen = 1;
197
198 rt_rq_iterator.start = load_balance_start_rt;
199 rt_rq_iterator.next = load_balance_next_rt;
200 /* pass 'busiest' rq argument into
201 * load_balance_[start|next]_rt iterators
202 */
203 rt_rq_iterator.arg = busiest;
204
205 nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
206 max_load_move, sd, idle, all_pinned, load_moved,
207 this_best_prio, best_prio, best_prio_seen,
208 &rt_rq_iterator);
209
210 return nr_moved;
211}
212
213static void task_tick_rt(struct rq *rq, struct task_struct *p)
214{
215 /*
216 * RR tasks need a special form of timeslice management.
217 * FIFO tasks have no timeslices.
218 */
219 if (p->policy != SCHED_RR)
220 return;
221
222 if (--p->time_slice)
223 return;
224
225 p->time_slice = static_prio_timeslice(p->static_prio);
226 set_tsk_need_resched(p);
227
228 /* put it at the end of the queue: */
229 requeue_task_rt(rq, p);
230}
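The tick handler above is classic round-robin, applied only to SCHED_RR. A compressed standalone sketch of the per-tick behaviour, assuming a 3-tick slice (the real length comes from static_prio_timeslice()):

#include <stdio.h>

int main(void)
{
        int time_slice = 3;     /* ticks left; assume it was refilled earlier */
        int tick;

        for (tick = 1; tick <= 4; tick++) {
                if (--time_slice) {
                        printf("tick %d: keep running (%d ticks left)\n",
                               tick, time_slice);
                        continue;
                }
                /* Slice exhausted: refill, mark need_resched, requeue at tail. */
                time_slice = 3;
                printf("tick %d: slice expired -> requeue at tail of prio queue\n",
                       tick);
        }
        return 0;
}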
231
232/*
233 * No parent/child timeslice management necessary for RT tasks,
234 * just activate them:
235 */
236static void task_new_rt(struct rq *rq, struct task_struct *p)
237{
238 activate_task(rq, p, 1);
239}
240
241static struct sched_class rt_sched_class __read_mostly = {
242 .enqueue_task = enqueue_task_rt,
243 .dequeue_task = dequeue_task_rt,
244 .yield_task = yield_task_rt,
245
246 .check_preempt_curr = check_preempt_curr_rt,
247
248 .pick_next_task = pick_next_task_rt,
249 .put_prev_task = put_prev_task_rt,
250
251 .load_balance = load_balance_rt,
252
253 .task_tick = task_tick_rt,
254 .task_new = task_new_rt,
255};
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
new file mode 100644
index 0000000000..c63c38f6fa
--- /dev/null
+++ b/kernel/sched_stats.h
@@ -0,0 +1,235 @@
1
2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 14
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12
13 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
14 seq_printf(seq, "timestamp %lu\n", jiffies);
15 for_each_online_cpu(cpu) {
16 struct rq *rq = cpu_rq(cpu);
17#ifdef CONFIG_SMP
18 struct sched_domain *sd;
19 int dcnt = 0;
20#endif
21
22 /* runqueue-specific stats */
23 seq_printf(seq,
24 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu",
25 cpu, rq->yld_both_empty,
26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
27 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
28 rq->ttwu_cnt, rq->ttwu_local,
29 rq->rq_sched_info.cpu_time,
30 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
31
32 seq_printf(seq, "\n");
33
34#ifdef CONFIG_SMP
35 /* domain-specific stats */
36 preempt_disable();
37 for_each_domain(cpu, sd) {
38 enum cpu_idle_type itype;
39 char mask_str[NR_CPUS];
40
41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
42 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
44 itype++) {
45 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
46 "%lu",
47 sd->lb_cnt[itype],
48 sd->lb_balanced[itype],
49 sd->lb_failed[itype],
50 sd->lb_imbalance[itype],
51 sd->lb_gained[itype],
52 sd->lb_hot_gained[itype],
53 sd->lb_nobusyq[itype],
54 sd->lb_nobusyg[itype]);
55 }
56 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
57 " %lu %lu %lu\n",
58 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
59 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
60 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
61 sd->ttwu_wake_remote, sd->ttwu_move_affine,
62 sd->ttwu_move_balance);
63 }
64 preempt_enable();
65#endif
66 }
67 return 0;
68}
69
70static int schedstat_open(struct inode *inode, struct file *file)
71{
72 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
73 char *buf = kmalloc(size, GFP_KERNEL);
74 struct seq_file *m;
75 int res;
76
77 if (!buf)
78 return -ENOMEM;
79 res = single_open(file, show_schedstat, NULL);
80 if (!res) {
81 m = file->private_data;
82 m->buf = buf;
83 m->size = size;
84 } else
85 kfree(buf);
86 return res;
87}
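schedstat_open() sizes the seq_file buffer at one page plus one page per 32 online CPUs. A quick check of that arithmetic, assuming 4 KiB pages:

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096; /* assumed PAGE_SIZE */
        unsigned int cpus;

        for (cpus = 1; cpus <= 128; cpus *= 4) {
                unsigned long size = page_size * (1 + cpus / 32);
                printf("%3u CPUs -> %lu byte buffer\n", cpus, size);
        }
        return 0;
}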
88
89const struct file_operations proc_schedstat_operations = {
90 .open = schedstat_open,
91 .read = seq_read,
92 .llseek = seq_lseek,
93 .release = single_release,
94};
95
96/*
97 * Expects runqueue lock to be held for atomicity of update
98 */
99static inline void
100rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
101{
102 if (rq) {
103 rq->rq_sched_info.run_delay += delta;
104 rq->rq_sched_info.pcnt++;
105 }
106}
107
108/*
109 * Expects runqueue lock to be held for atomicity of update
110 */
111static inline void
112rq_sched_info_depart(struct rq *rq, unsigned long long delta)
113{
114 if (rq)
115 rq->rq_sched_info.cpu_time += delta;
116}
117# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
118# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
119#else /* !CONFIG_SCHEDSTATS */
120static inline void
121rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
122{}
123static inline void
124rq_sched_info_depart(struct rq *rq, unsigned long long delta)
125{}
126# define schedstat_inc(rq, field) do { } while (0)
127# define schedstat_add(rq, field, amt) do { } while (0)
128#endif
129
130#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
131/*
132 * Called when a process is dequeued from the active array and given
133 * the cpu. We should note that with the exception of interactive
134 * tasks, the expired queue will become the active queue after the active
135 * queue is empty, without explicitly dequeuing and requeuing tasks in the
136 * expired queue. (Interactive tasks may be requeued directly to the
137 * active queue, thus delaying tasks in the expired queue from running;
138 * see scheduler_tick()).
139 *
140 * This function is only called from sched_info_arrive(), rather than
141 * dequeue_task(). Even though a task may be queued and dequeued multiple
142 * times as it is shuffled about, we're really interested in knowing how
143 * long it was from the *first* time it was queued to the time that it
144 * finally hit a cpu.
145 */
146static inline void sched_info_dequeued(struct task_struct *t)
147{
148 t->sched_info.last_queued = 0;
149}
150
151/*
152 * Called when a task finally hits the cpu. We can now calculate how
153 * long it was waiting to run. We also note when it began so that we
154 * can keep stats on how long its timeslice is.
155 */
156static void sched_info_arrive(struct task_struct *t)
157{
158 unsigned long long now = sched_clock(), delta = 0;
159
160 if (t->sched_info.last_queued)
161 delta = now - t->sched_info.last_queued;
162 sched_info_dequeued(t);
163 t->sched_info.run_delay += delta;
164 t->sched_info.last_arrival = now;
165 t->sched_info.pcnt++;
166
167 rq_sched_info_arrive(task_rq(t), delta);
168}
169
170/*
171 * Called when a process is queued into either the active or expired
172 * array. The time is noted and later used to determine how long we
173 * had to wait for us to reach the cpu. Since the expired queue will
174 * become the active queue after active queue is empty, without dequeuing
175 * and requeuing any tasks, we are interested in queuing to either. It
176 * is unusual but not impossible for tasks to be dequeued and immediately
177 * requeued in the same or another array: this can happen in sched_yield(),
178 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
179 * to runqueue.
180 *
181 * This function is only called from enqueue_task(), but also only updates
182 * the timestamp if it is already not set. It's assumed that
183 * sched_info_dequeued() will clear that stamp when appropriate.
184 */
185static inline void sched_info_queued(struct task_struct *t)
186{
187 if (unlikely(sched_info_on()))
188 if (!t->sched_info.last_queued)
189 t->sched_info.last_queued = sched_clock();
190}
191
192/*
193 * Called when a process ceases being the active-running process, either
194 * voluntarily or involuntarily. Now we can calculate how long we ran.
195 */
196static inline void sched_info_depart(struct task_struct *t)
197{
198 unsigned long long delta = sched_clock() - t->sched_info.last_arrival;
199
200 t->sched_info.cpu_time += delta;
201 rq_sched_info_depart(task_rq(t), delta);
202}
203
204/*
205 * Called when tasks are switched involuntarily due, typically, to expiring
206 * their time slice. (This may also be called when switching to or from
207 * the idle task.) We are only called when prev != next.
208 */
209static inline void
210__sched_info_switch(struct task_struct *prev, struct task_struct *next)
211{
212 struct rq *rq = task_rq(prev);
213
214 /*
215 * prev now departs the cpu. It's not interesting to record
216 * stats about how efficient we were at scheduling the idle
217 * process, however.
218 */
219 if (prev != rq->idle)
220 sched_info_depart(prev);
221
222 if (next != rq->idle)
223 sched_info_arrive(next);
224}
225static inline void
226sched_info_switch(struct task_struct *prev, struct task_struct *next)
227{
228 if (unlikely(sched_info_on()))
229 __sched_info_switch(prev, next);
230}
231#else
232#define sched_info_queued(t) do { } while (0)
233#define sched_info_switch(t, next) do { } while (0)
234#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
235
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index c3391b6020..ad64fcb731 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -10,6 +10,7 @@
 #include <linux/sched.h>
 
 /* #define SECCOMP_DEBUG 1 */
+#define NR_SECCOMP_MODES 1
 
 /*
  * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -54,3 +55,31 @@ void __secure_computing(int this_syscall)
 #endif
         do_exit(SIGKILL);
 }
+
+long prctl_get_seccomp(void)
+{
+        return current->seccomp.mode;
+}
+
+long prctl_set_seccomp(unsigned long seccomp_mode)
+{
+        long ret;
+
+        /* can set it only once to be even more secure */
+        ret = -EPERM;
+        if (unlikely(current->seccomp.mode))
+                goto out;
+
+        ret = -EINVAL;
+        if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
+                current->seccomp.mode = seccomp_mode;
+                set_thread_flag(TIF_SECCOMP);
+#ifdef TIF_NOTSC
+                disable_TSC();
+#endif
+                ret = 0;
+        }
+
+ out:
+        return ret;
+}
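For context, the prctl pair added here is what userspace calls to enter mode-1 ("strict") seccomp. A minimal userspace sketch of the intended call sequence (error handling trimmed; PR_GET_SECCOMP/PR_SET_SECCOMP are the standard prctl option values):

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>

#ifndef PR_SET_SECCOMP
#define PR_GET_SECCOMP  21
#define PR_SET_SECCOMP  22
#endif

int main(void)
{
        printf("seccomp mode before: %d\n", prctl(PR_GET_SECCOMP));

        /* Mode 1: only read/write/exit/sigreturn are allowed afterwards. */
        if (prctl(PR_SET_SECCOMP, 1) != 0)
                perror("PR_SET_SECCOMP");

        /* From here on, anything beyond read/write/_exit/sigreturn kills
         * the process with SIGKILL; a second PR_SET_SECCOMP returns -EPERM. */
        write(1, "still alive in strict mode\n", 27);
        _exit(0);
}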
diff --git a/kernel/signal.c b/kernel/signal.c
index f940560977..39d122753b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -718,6 +718,37 @@ out_set:
 #define LEGACY_QUEUE(sigptr, sig) \
         (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig)))
 
+int print_fatal_signals;
+
+static void print_fatal_signal(struct pt_regs *regs, int signr)
+{
+        printk("%s/%d: potentially unexpected fatal signal %d.\n",
+                current->comm, current->pid, signr);
+
+#ifdef __i386__
+        printk("code at %08lx: ", regs->eip);
+        {
+                int i;
+                for (i = 0; i < 16; i++) {
+                        unsigned char insn;
+
+                        __get_user(insn, (unsigned char *)(regs->eip + i));
+                        printk("%02x ", insn);
+                }
+        }
+#endif
+        printk("\n");
+        show_regs(regs);
+}
+
+static int __init setup_print_fatal_signals(char *str)
+{
+        get_option (&str, &print_fatal_signals);
+
+        return 1;
+}
+
+__setup("print-fatal-signals=", setup_print_fatal_signals);
 
 static int
 specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
@@ -1855,6 +1886,8 @@ relock:
          * Anything else is fatal, maybe with a core dump.
          */
         current->flags |= PF_SIGNALED;
+        if ((signr != SIGKILL) && print_fatal_signals)
+                print_fatal_signal(regs, signr);
         if (sig_kernel_coredump(signr)) {
                 /*
                  * If it was able to dump core, this kills all
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0b9886a00e..0f546ddea4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -14,6 +14,7 @@
 #include <linux/notifier.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
+#include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/rcupdate.h>
 #include <linux/smp.h>
@@ -488,9 +489,6 @@ void __init softirq_init(void)
 
 static int ksoftirqd(void * __bind_cpu)
 {
-        set_user_nice(current, 19);
-        current->flags |= PF_NOFREEZE;
-
         set_current_state(TASK_INTERRUPTIBLE);
 
         while (!kthread_should_stop()) {
@@ -615,12 +613,16 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
                 kthread_bind(per_cpu(ksoftirqd, hotcpu),
                              any_online_cpu(cpu_online_map));
         case CPU_DEAD:
-        case CPU_DEAD_FROZEN:
+        case CPU_DEAD_FROZEN: {
+                struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+
                 p = per_cpu(ksoftirqd, hotcpu);
                 per_cpu(ksoftirqd, hotcpu) = NULL;
+                sched_setscheduler(p, SCHED_FIFO, &param);
                 kthread_stop(p);
                 takeover_tasklets(hotcpu);
                 break;
+        }
 #endif /* CONFIG_HOTPLUG_CPU */
         }
         return NOTIFY_OK;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 0131e296ff..708d4882c0 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -10,6 +10,7 @@
 #include <linux/cpu.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/notifier.h>
 #include <linux/module.h>
@@ -116,7 +117,6 @@ static int watchdog(void * __bind_cpu)
         struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 
         sched_setscheduler(current, SCHED_FIFO, &param);
-        current->flags |= PF_NOFREEZE;
 
         /* initialize timestamp */
         touch_softlockup_watchdog();
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index fcee2a8e6d..319821ef78 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -93,10 +93,6 @@ static void stopmachine_set_state(enum stopmachine_state state)
 static int stop_machine(void)
 {
         int i, ret = 0;
-        struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
-
-        /* One high-prio thread per cpu. We'll do this one. */
-        sched_setscheduler(current, SCHED_FIFO, &param);
 
         atomic_set(&stopmachine_thread_ack, 0);
         stopmachine_num_threads = 0;
@@ -189,6 +185,10 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
 
         p = kthread_create(do_stop, &smdata, "kstopmachine");
         if (!IS_ERR(p)) {
+                struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+
+                /* One high-prio thread per cpu. We'll do this one. */
+                sched_setscheduler(p, SCHED_FIFO, &param);
                 kthread_bind(p, cpu);
                 wake_up_process(p);
                 wait_for_completion(&smdata.done);
diff --git a/kernel/sys.c b/kernel/sys.c
index 872271ccc3..18987c7f6a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -31,10 +31,12 @@
31#include <linux/cn_proc.h> 31#include <linux/cn_proc.h>
32#include <linux/getcpu.h> 32#include <linux/getcpu.h>
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h>
34 35
35#include <linux/compat.h> 36#include <linux/compat.h>
36#include <linux/syscalls.h> 37#include <linux/syscalls.h>
37#include <linux/kprobes.h> 38#include <linux/kprobes.h>
39#include <linux/user_namespace.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40#include <asm/io.h> 42#include <asm/io.h>
@@ -1078,13 +1080,13 @@ static int set_user(uid_t new_ruid, int dumpclear)
1078{ 1080{
1079 struct user_struct *new_user; 1081 struct user_struct *new_user;
1080 1082
1081 new_user = alloc_uid(new_ruid); 1083 new_user = alloc_uid(current->nsproxy->user_ns, new_ruid);
1082 if (!new_user) 1084 if (!new_user)
1083 return -EAGAIN; 1085 return -EAGAIN;
1084 1086
1085 if (atomic_read(&new_user->processes) >= 1087 if (atomic_read(&new_user->processes) >=
1086 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 1088 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
1087 new_user != &root_user) { 1089 new_user != current->nsproxy->user_ns->root_user) {
1088 free_uid(new_user); 1090 free_uid(new_user);
1089 return -EAGAIN; 1091 return -EAGAIN;
1090 } 1092 }
@@ -2241,6 +2243,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2241 error = SET_ENDIAN(current, arg2); 2243 error = SET_ENDIAN(current, arg2);
2242 break; 2244 break;
2243 2245
2246 case PR_GET_SECCOMP:
2247 error = prctl_get_seccomp();
2248 break;
2249 case PR_SET_SECCOMP:
2250 error = prctl_set_seccomp(arg2);
2251 break;
2252
2244 default: 2253 default:
2245 error = -EINVAL; 2254 error = -EINVAL;
2246 break; 2255 break;
@@ -2277,3 +2286,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
2277 } 2286 }
2278 return err ? -EFAULT : 0; 2287 return err ? -EFAULT : 0;
2279} 2288}
2289
2290char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2291
2292static void argv_cleanup(char **argv, char **envp)
2293{
2294 argv_free(argv);
2295}
2296
2297/**
2298 * orderly_poweroff - Trigger an orderly system poweroff
2299 * @force: force poweroff if command execution fails
2300 *
2301 * This may be called from any context to trigger a system shutdown.
2302 * If the orderly shutdown fails, it will force an immediate shutdown.
2303 */
2304int orderly_poweroff(bool force)
2305{
2306 int argc;
2307 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2308 static char *envp[] = {
2309 "HOME=/",
2310 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2311 NULL
2312 };
2313 int ret = -ENOMEM;
2314 struct subprocess_info *info;
2315
2316 if (argv == NULL) {
2317 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2318 __func__, poweroff_cmd);
2319 goto out;
2320 }
2321
2322 info = call_usermodehelper_setup(argv[0], argv, envp);
2323 if (info == NULL) {
2324 argv_free(argv);
2325 goto out;
2326 }
2327
2328 call_usermodehelper_setcleanup(info, argv_cleanup);
2329
2330 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
2331
2332 out:
2333 if (ret && force) {
2334 printk(KERN_WARNING "Failed to start orderly shutdown: "
2335 "forcing the issue\n");
2336
2337 /* I guess this should try to kick off some daemon to
2338 sync and poweroff asap. Or not even bother syncing
2339 if we're doing an emergency shutdown? */
2340 emergency_sync();
2341 kernel_power_off();
2342 }
2343
2344 return ret;
2345}
2346EXPORT_SYMBOL_GPL(orderly_poweroff);
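The new orderly_poweroff() runs the userspace helper named by poweroff_cmd (default /sbin/poweroff) through the call_usermodehelper_* API and, when force is true, falls back to emergency_sync() plus kernel_power_off() if the helper cannot be started. A hedged sketch of a caller, assuming the declaration is exported through <linux/reboot.h> as in this series; my_thermal_critical() is a hypothetical driver hook:

#include <linux/kernel.h>
#include <linux/reboot.h>

static void my_thermal_critical(void)
{
	printk(KERN_CRIT "temperature critical, requesting orderly poweroff\n");
	/* force=true: if the helper cannot be run, sync and power off immediately */
	orderly_poweroff(true);
}

The command itself is tunable at run time through the new kernel.poweroff_cmd sysctl added in the sysctl.c hunk below.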
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7e11e2c98b..b0ec498a18 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -14,6 +14,7 @@ asmlinkage long sys_ni_syscall(void)
14 14
15cond_syscall(sys_nfsservctl); 15cond_syscall(sys_nfsservctl);
16cond_syscall(sys_quotactl); 16cond_syscall(sys_quotactl);
17cond_syscall(sys32_quotactl);
17cond_syscall(sys_acct); 18cond_syscall(sys_acct);
18cond_syscall(sys_lookup_dcookie); 19cond_syscall(sys_lookup_dcookie);
19cond_syscall(sys_swapon); 20cond_syscall(sys_swapon);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 30ee462ee7..44a1d699aa 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -29,6 +29,7 @@
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
32#include <linux/fs.h>
32#include <linux/init.h> 33#include <linux/init.h>
33#include <linux/kernel.h> 34#include <linux/kernel.h>
34#include <linux/kobject.h> 35#include <linux/kobject.h>
@@ -45,13 +46,11 @@
45#include <linux/syscalls.h> 46#include <linux/syscalls.h>
46#include <linux/nfs_fs.h> 47#include <linux/nfs_fs.h>
47#include <linux/acpi.h> 48#include <linux/acpi.h>
49#include <linux/reboot.h>
48 50
49#include <asm/uaccess.h> 51#include <asm/uaccess.h>
50#include <asm/processor.h> 52#include <asm/processor.h>
51 53
52extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
53 void __user *buffer, size_t *lenp, loff_t *ppos);
54
55#ifdef CONFIG_X86 54#ifdef CONFIG_X86
56#include <asm/nmi.h> 55#include <asm/nmi.h>
57#include <asm/stacktrace.h> 56#include <asm/stacktrace.h>
@@ -61,6 +60,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
61 60
62/* External variables not in a header file. */ 61/* External variables not in a header file. */
63extern int C_A_D; 62extern int C_A_D;
63extern int print_fatal_signals;
64extern int sysctl_overcommit_memory; 64extern int sysctl_overcommit_memory;
65extern int sysctl_overcommit_ratio; 65extern int sysctl_overcommit_ratio;
66extern int sysctl_panic_on_oom; 66extern int sysctl_panic_on_oom;
@@ -202,11 +202,94 @@ static ctl_table root_table[] = {
202 .mode = 0555, 202 .mode = 0555,
203 .child = dev_table, 203 .child = dev_table,
204 }, 204 },
205 205/*
206 * NOTE: do not add new entries to this table unless you have read
207 * Documentation/sysctl/ctl_unnumbered.txt
208 */
206 { .ctl_name = 0 } 209 { .ctl_name = 0 }
207}; 210};
208 211
212#ifdef CONFIG_SCHED_DEBUG
213static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */
214static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */
215static unsigned long min_wakeup_granularity_ns; /* 0 usecs */
216static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */
217#endif
218
209static ctl_table kern_table[] = { 219static ctl_table kern_table[] = {
220#ifdef CONFIG_SCHED_DEBUG
221 {
222 .ctl_name = CTL_UNNUMBERED,
223 .procname = "sched_granularity_ns",
224 .data = &sysctl_sched_granularity,
225 .maxlen = sizeof(unsigned int),
226 .mode = 0644,
227 .proc_handler = &proc_dointvec_minmax,
228 .strategy = &sysctl_intvec,
229 .extra1 = &min_sched_granularity_ns,
230 .extra2 = &max_sched_granularity_ns,
231 },
232 {
233 .ctl_name = CTL_UNNUMBERED,
234 .procname = "sched_wakeup_granularity_ns",
235 .data = &sysctl_sched_wakeup_granularity,
236 .maxlen = sizeof(unsigned int),
237 .mode = 0644,
238 .proc_handler = &proc_dointvec_minmax,
239 .strategy = &sysctl_intvec,
240 .extra1 = &min_wakeup_granularity_ns,
241 .extra2 = &max_wakeup_granularity_ns,
242 },
243 {
244 .ctl_name = CTL_UNNUMBERED,
245 .procname = "sched_batch_wakeup_granularity_ns",
246 .data = &sysctl_sched_batch_wakeup_granularity,
247 .maxlen = sizeof(unsigned int),
248 .mode = 0644,
249 .proc_handler = &proc_dointvec_minmax,
250 .strategy = &sysctl_intvec,
251 .extra1 = &min_wakeup_granularity_ns,
252 .extra2 = &max_wakeup_granularity_ns,
253 },
254 {
255 .ctl_name = CTL_UNNUMBERED,
256 .procname = "sched_stat_granularity_ns",
257 .data = &sysctl_sched_stat_granularity,
258 .maxlen = sizeof(unsigned int),
259 .mode = 0644,
260 .proc_handler = &proc_dointvec_minmax,
261 .strategy = &sysctl_intvec,
262 .extra1 = &min_wakeup_granularity_ns,
263 .extra2 = &max_wakeup_granularity_ns,
264 },
265 {
266 .ctl_name = CTL_UNNUMBERED,
267 .procname = "sched_runtime_limit_ns",
268 .data = &sysctl_sched_runtime_limit,
269 .maxlen = sizeof(unsigned int),
270 .mode = 0644,
271 .proc_handler = &proc_dointvec_minmax,
272 .strategy = &sysctl_intvec,
273 .extra1 = &min_sched_granularity_ns,
274 .extra2 = &max_sched_granularity_ns,
275 },
276 {
277 .ctl_name = CTL_UNNUMBERED,
278 .procname = "sched_child_runs_first",
279 .data = &sysctl_sched_child_runs_first,
280 .maxlen = sizeof(unsigned int),
281 .mode = 0644,
282 .proc_handler = &proc_dointvec,
283 },
284 {
285 .ctl_name = CTL_UNNUMBERED,
286 .procname = "sched_features",
287 .data = &sysctl_sched_features,
288 .maxlen = sizeof(unsigned int),
289 .mode = 0644,
290 .proc_handler = &proc_dointvec,
291 },
292#endif
210 { 293 {
211 .ctl_name = KERN_PANIC, 294 .ctl_name = KERN_PANIC,
212 .procname = "panic", 295 .procname = "panic",
@@ -260,6 +343,14 @@ static ctl_table kern_table[] = {
260 .proc_handler = &proc_dointvec, 343 .proc_handler = &proc_dointvec,
261 }, 344 },
262#endif 345#endif
346 {
347 .ctl_name = CTL_UNNUMBERED,
348 .procname = "print-fatal-signals",
349 .data = &print_fatal_signals,
350 .maxlen = sizeof(int),
351 .mode = 0644,
352 .proc_handler = &proc_dointvec,
353 },
263#ifdef __sparc__ 354#ifdef __sparc__
264 { 355 {
265 .ctl_name = KERN_SPARC_REBOOT, 356 .ctl_name = KERN_SPARC_REBOOT,
@@ -615,6 +706,15 @@ static ctl_table kern_table[] = {
615 .proc_handler = &proc_dointvec, 706 .proc_handler = &proc_dointvec,
616 }, 707 },
617#endif 708#endif
709 {
710 .ctl_name = CTL_UNNUMBERED,
711 .procname = "poweroff_cmd",
712 .data = &poweroff_cmd,
713 .maxlen = POWEROFF_CMD_PATH_LEN,
714 .mode = 0644,
715 .proc_handler = &proc_dostring,
716 .strategy = &sysctl_string,
717 },
618 718
619 { .ctl_name = 0 } 719 { .ctl_name = 0 }
620}; 720};
@@ -734,6 +834,14 @@ static ctl_table vm_table[] = {
734 .mode = 0644, 834 .mode = 0644,
735 .proc_handler = &proc_dointvec, 835 .proc_handler = &proc_dointvec,
736 }, 836 },
837 {
838 .ctl_name = CTL_UNNUMBERED,
839 .procname = "hugepages_treat_as_movable",
840 .data = &hugepages_treat_as_movable,
841 .maxlen = sizeof(int),
842 .mode = 0644,
843 .proc_handler = &hugetlb_treat_movable_handler,
844 },
737#endif 845#endif
738 { 846 {
739 .ctl_name = VM_LOWMEM_RESERVE_RATIO, 847 .ctl_name = VM_LOWMEM_RESERVE_RATIO,
@@ -869,6 +977,27 @@ static ctl_table vm_table[] = {
869 .strategy = &sysctl_jiffies, 977 .strategy = &sysctl_jiffies,
870 }, 978 },
871#endif 979#endif
980#ifdef CONFIG_SECURITY
981 {
982 .ctl_name = CTL_UNNUMBERED,
983 .procname = "mmap_min_addr",
984 .data = &mmap_min_addr,
985 .maxlen = sizeof(unsigned long),
986 .mode = 0644,
987 .proc_handler = &proc_doulongvec_minmax,
988 },
989#ifdef CONFIG_NUMA
990 {
991 .ctl_name = CTL_UNNUMBERED,
992 .procname = "numa_zonelist_order",
993 .data = &numa_zonelist_order,
994 .maxlen = NUMA_ZONELIST_ORDER_LEN,
995 .mode = 0644,
996 .proc_handler = &numa_zonelist_order_handler,
997 .strategy = &sysctl_string,
998 },
999#endif
1000#endif
872#if defined(CONFIG_X86_32) || \ 1001#if defined(CONFIG_X86_32) || \
873 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1002 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
874 { 1003 {
@@ -882,6 +1011,10 @@ static ctl_table vm_table[] = {
882 .extra1 = &zero, 1011 .extra1 = &zero,
883 }, 1012 },
884#endif 1013#endif
1014/*
1015 * NOTE: do not add new entries to this table unless you have read
1016 * Documentation/sysctl/ctl_unnumbered.txt
1017 */
885 { .ctl_name = 0 } 1018 { .ctl_name = 0 }
886}; 1019};
887 1020
@@ -1022,6 +1155,10 @@ static ctl_table fs_table[] = {
1022 .child = binfmt_misc_table, 1155 .child = binfmt_misc_table,
1023 }, 1156 },
1024#endif 1157#endif
1158/*
1159 * NOTE: do not add new entries to this table unless you have read
1160 * Documentation/sysctl/ctl_unnumbered.txt
1161 */
1025 { .ctl_name = 0 } 1162 { .ctl_name = 0 }
1026}; 1163};
1027 1164
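All of the entries added here (the CFS sched_* knobs, print-fatal-signals, poweroff_cmd, hugepages_treat_as_movable, mmap_min_addr, numa_zonelist_order) use .ctl_name = CTL_UNNUMBERED, i.e. they are reachable only through /proc/sys and reserve no binary sysctl number, which is what the new NOTE comments about Documentation/sysctl/ctl_unnumbered.txt point at. A minimal sketch of the same convention for a driver tunable, assuming the current one-argument register_sysctl_table(); my_knob and the table names are hypothetical:

#include <linux/sysctl.h>
#include <linux/errno.h>

static int my_knob;				/* hypothetical tunable */

static ctl_table my_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,	/* /proc/sys only, no binary number */
		.procname	= "my_knob",
		.data		= &my_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table my_root[] = {
	{
		.ctl_name	= CTL_KERN,
		.procname	= "kernel",		/* appears under /proc/sys/kernel/ */
		.mode		= 0555,
		.child		= my_table,
	},
	{ .ctl_name = 0 }
};

static struct ctl_table_header *my_header;

static int __init my_sysctl_init(void)
{
	my_header = register_sysctl_table(my_root);
	return my_header ? 0 : -ENOMEM;
}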
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 906cae7715..059431ed67 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -196,6 +196,8 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
196 196
197 /* fill in basic acct fields */ 197 /* fill in basic acct fields */
198 stats->version = TASKSTATS_VERSION; 198 stats->version = TASKSTATS_VERSION;
199 stats->nvcsw = tsk->nvcsw;
200 stats->nivcsw = tsk->nivcsw;
199 bacct_add_tsk(stats, tsk); 201 bacct_add_tsk(stats, tsk);
200 202
201 /* fill in extended acct fields */ 203 /* fill in extended acct fields */
@@ -242,6 +244,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
242 */ 244 */
243 delayacct_add_tsk(stats, tsk); 245 delayacct_add_tsk(stats, tsk);
244 246
247 stats->nvcsw += tsk->nvcsw;
248 stats->nivcsw += tsk->nivcsw;
245 } while_each_thread(first, tsk); 249 } while_each_thread(first, tsk);
246 250
247 unlock_task_sighand(first, &flags); 251 unlock_task_sighand(first, &flags);
diff --git a/kernel/time.c b/kernel/time.c
index f04791f694..ffe19149d7 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -57,14 +57,17 @@ EXPORT_SYMBOL(sys_tz);
57 */ 57 */
58asmlinkage long sys_time(time_t __user * tloc) 58asmlinkage long sys_time(time_t __user * tloc)
59{ 59{
60 time_t i; 60 /*
61 struct timeval tv; 61 * We read xtime.tv_sec atomically - it's updated
62 * atomically by update_wall_time(), so no need to
63 * even read-lock the xtime seqlock:
64 */
65 time_t i = xtime.tv_sec;
62 66
63 do_gettimeofday(&tv); 67 smp_rmb(); /* sys_time() results are coherent */
64 i = tv.tv_sec;
65 68
66 if (tloc) { 69 if (tloc) {
67 if (put_user(i,tloc)) 70 if (put_user(i, tloc))
68 i = -EFAULT; 71 i = -EFAULT;
69 } 72 }
70 return i; 73 return i;
@@ -373,12 +376,25 @@ void do_gettimeofday (struct timeval *tv)
373 376
374 tv->tv_sec = sec; 377 tv->tv_sec = sec;
375 tv->tv_usec = usec; 378 tv->tv_usec = usec;
376}
377 379
380 /*
381 * Make sure xtime.tv_sec [returned by sys_time()] always
382 * follows the gettimeofday() result precisely. This
383 * condition is extremely unlikely, it can hit at most
384 * once per second:
385 */
386 if (unlikely(xtime.tv_sec != tv->tv_sec)) {
387 unsigned long flags;
388
389 write_seqlock_irqsave(&xtime_lock, flags);
390 update_wall_time();
391 write_sequnlock_irqrestore(&xtime_lock, flags);
392 }
393}
378EXPORT_SYMBOL(do_gettimeofday); 394EXPORT_SYMBOL(do_gettimeofday);
379 395
396#else /* CONFIG_TIME_INTERPOLATION */
380 397
381#else
382#ifndef CONFIG_GENERIC_TIME 398#ifndef CONFIG_GENERIC_TIME
383/* 399/*
384 * Simulate gettimeofday using do_gettimeofday which only allows a timeval 400 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
@@ -394,7 +410,7 @@ void getnstimeofday(struct timespec *tv)
394} 410}
395EXPORT_SYMBOL_GPL(getnstimeofday); 411EXPORT_SYMBOL_GPL(getnstimeofday);
396#endif 412#endif
397#endif 413#endif /* CONFIG_TIME_INTERPOLATION */
398 414
399/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 415/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
400 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 416 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
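sys_time() can read xtime.tv_sec without the seqlock because a single word is updated atomically by update_wall_time(); the code added to do_gettimeofday() only exists to keep that lock-free tv_sec in step with the gettimeofday() result across a second boundary. Anything that needs a consistent sec/nsec pair still has to use the xtime_lock read-side loop. A minimal sketch of that loop, assuming xtime and xtime_lock are visible via <linux/time.h>; my_read_xtime() is a hypothetical helper:

#include <linux/time.h>
#include <linux/seqlock.h>

static void my_read_xtime(struct timespec *ts)
{
	unsigned long seq;

	do {
		seq = read_seqbegin(&xtime_lock);
		*ts = xtime;			/* two words: needs the retry loop */
	} while (read_seqretry(&xtime_lock, seq));
}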
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 76212b2a99..2ad1c37b8d 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -205,47 +205,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
205} 205}
206 206
207/** 207/**
208 * clockevents_request_device
209 */
210struct clock_event_device *clockevents_request_device(unsigned int features,
211 cpumask_t cpumask)
212{
213 struct clock_event_device *cur, *dev = NULL;
214 struct list_head *tmp;
215
216 spin_lock(&clockevents_lock);
217
218 list_for_each(tmp, &clockevent_devices) {
219 cur = list_entry(tmp, struct clock_event_device, list);
220
221 if ((cur->features & features) == features &&
222 cpus_equal(cpumask, cur->cpumask)) {
223 if (!dev || dev->rating < cur->rating)
224 dev = cur;
225 }
226 }
227
228 clockevents_exchange_device(NULL, dev);
229
230 spin_unlock(&clockevents_lock);
231
232 return dev;
233}
234
235/**
236 * clockevents_release_device
237 */
238void clockevents_release_device(struct clock_event_device *dev)
239{
240 spin_lock(&clockevents_lock);
241
242 clockevents_exchange_device(dev, NULL);
243 clockevents_notify_released();
244
245 spin_unlock(&clockevents_lock);
246}
247
248/**
249 * clockevents_notify - notification about relevant events 208 * clockevents_notify - notification about relevant events
250 */ 209 */
251void clockevents_notify(unsigned long reason, void *arg) 210void clockevents_notify(unsigned long reason, void *arg)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 87aa5ff931..438c6b723e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -13,7 +13,7 @@
13#include <linux/timex.h> 13#include <linux/timex.h>
14#include <linux/jiffies.h> 14#include <linux/jiffies.h>
15#include <linux/hrtimer.h> 15#include <linux/hrtimer.h>
16 16#include <linux/capability.h>
17#include <asm/div64.h> 17#include <asm/div64.h>
18#include <asm/timex.h> 18#include <asm/timex.h>
19 19
@@ -122,7 +122,6 @@ void second_overflow(void)
122 */ 122 */
123 time_interpolator_update(-NSEC_PER_SEC); 123 time_interpolator_update(-NSEC_PER_SEC);
124 time_state = TIME_OOP; 124 time_state = TIME_OOP;
125 clock_was_set();
126 printk(KERN_NOTICE "Clock: inserting leap second " 125 printk(KERN_NOTICE "Clock: inserting leap second "
127 "23:59:60 UTC\n"); 126 "23:59:60 UTC\n");
128 } 127 }
@@ -137,7 +136,6 @@ void second_overflow(void)
137 */ 136 */
138 time_interpolator_update(NSEC_PER_SEC); 137 time_interpolator_update(NSEC_PER_SEC);
139 time_state = TIME_WAIT; 138 time_state = TIME_WAIT;
140 clock_was_set();
141 printk(KERN_NOTICE "Clock: deleting leap second " 139 printk(KERN_NOTICE "Clock: deleting leap second "
142 "23:59:59 UTC\n"); 140 "23:59:59 UTC\n");
143 } 141 }
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3d1042f82a..728cedfd3c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -36,9 +36,17 @@ EXPORT_SYMBOL(xtime_lock);
36 * at zero at system boot time, so wall_to_monotonic will be negative, 36 * at zero at system boot time, so wall_to_monotonic will be negative,
37 * however, we will ALWAYS keep the tv_nsec part positive so we can use 37 * however, we will ALWAYS keep the tv_nsec part positive so we can use
38 * the usual normalization. 38 * the usual normalization.
39 *
40 * wall_to_monotonic is moved after resume from suspend for the monotonic
41 * time not to jump. We need to add total_sleep_time to wall_to_monotonic
42 * to get the real boot based time offset.
43 *
44 * - wall_to_monotonic is no longer the boot time, getboottime must be
45 * used instead.
39 */ 46 */
40struct timespec xtime __attribute__ ((aligned (16))); 47struct timespec xtime __attribute__ ((aligned (16)));
41struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 48struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
49static unsigned long total_sleep_time; /* seconds */
42 50
43EXPORT_SYMBOL(xtime); 51EXPORT_SYMBOL(xtime);
44 52
@@ -251,6 +259,7 @@ void __init timekeeping_init(void)
251 xtime.tv_nsec = 0; 259 xtime.tv_nsec = 0;
252 set_normalized_timespec(&wall_to_monotonic, 260 set_normalized_timespec(&wall_to_monotonic,
253 -xtime.tv_sec, -xtime.tv_nsec); 261 -xtime.tv_sec, -xtime.tv_nsec);
262 total_sleep_time = 0;
254 263
255 write_sequnlock_irqrestore(&xtime_lock, flags); 264 write_sequnlock_irqrestore(&xtime_lock, flags);
256} 265}
@@ -282,6 +291,7 @@ static int timekeeping_resume(struct sys_device *dev)
282 291
283 xtime.tv_sec += sleep_length; 292 xtime.tv_sec += sleep_length;
284 wall_to_monotonic.tv_sec -= sleep_length; 293 wall_to_monotonic.tv_sec -= sleep_length;
294 total_sleep_time += sleep_length;
285 } 295 }
286 /* re-base the last cycle value */ 296 /* re-base the last cycle value */
287 clock->cycle_last = clocksource_read(clock); 297 clock->cycle_last = clocksource_read(clock);
@@ -476,3 +486,30 @@ void update_wall_time(void)
476 change_clocksource(); 486 change_clocksource();
477 update_vsyscall(&xtime, clock); 487 update_vsyscall(&xtime, clock);
478} 488}
489
490/**
491 * getboottime - Return the real time of system boot.
492 * @ts: pointer to the timespec to be set
493 *
494 * Returns the time of day in a timespec.
495 *
496 * This is based on the wall_to_monotonic offset and the total suspend
497 * time. Calls to settimeofday will affect the value returned (which
498 * basically means that however wrong your real time clock is at boot time,
499 * you get the right time here).
500 */
501void getboottime(struct timespec *ts)
502{
503 set_normalized_timespec(ts,
504 - (wall_to_monotonic.tv_sec + total_sleep_time),
505 - wall_to_monotonic.tv_nsec);
506}
507
508/**
509 * monotonic_to_bootbased - Convert the monotonic time to boot based.
510 * @ts: pointer to the timespec to be converted
511 */
512void monotonic_to_bootbased(struct timespec *ts)
513{
514 ts->tv_sec += total_sleep_time;
515}
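Because wall_to_monotonic is now shifted at resume, it no longer encodes the boot time by itself; total_sleep_time, getboottime() and monotonic_to_bootbased() exist to recover boot-based time. A hedged sketch of computing uptime including suspend, mirroring the do_sysinfo() hunk in timer.c below and assuming these helpers are declared in <linux/time.h>; my_uptime() is a hypothetical function:

#include <linux/time.h>

static void my_uptime(struct timespec *uptime)
{
	getnstimeofday(uptime);				/* wall-clock time */
	uptime->tv_sec  += wall_to_monotonic.tv_sec;	/* -> monotonic time */
	uptime->tv_nsec += wall_to_monotonic.tv_nsec;
	monotonic_to_bootbased(uptime);			/* add total_sleep_time */
	if (uptime->tv_nsec >= NSEC_PER_SEC) {		/* renormalize nanoseconds */
		uptime->tv_nsec -= NSEC_PER_SEC;
		uptime->tv_sec++;
	}
}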
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 8bbcfb77f7..e5edc3a22a 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -38,7 +38,7 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
38 38
39static void print_name_offset(struct seq_file *m, void *sym) 39static void print_name_offset(struct seq_file *m, void *sym)
40{ 40{
41 char symname[KSYM_NAME_LEN+1]; 41 char symname[KSYM_NAME_LEN];
42 42
43 if (lookup_symbol_name((unsigned long)sym, symname) < 0) 43 if (lookup_symbol_name((unsigned long)sym, symname) < 0)
44 SEQ_printf(m, "<%p>", sym); 44 SEQ_printf(m, "<%p>", sym);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 321693724a..8ed62fda16 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -68,6 +68,7 @@ struct entry {
68 * Number of timeout events: 68 * Number of timeout events:
69 */ 69 */
70 unsigned long count; 70 unsigned long count;
71 unsigned int timer_flag;
71 72
72 /* 73 /*
73 * We save the command-line string to preserve 74 * We save the command-line string to preserve
@@ -231,7 +232,8 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
231 * incremented. Otherwise the timer is registered in a free slot. 232 * incremented. Otherwise the timer is registered in a free slot.
232 */ 233 */
233void timer_stats_update_stats(void *timer, pid_t pid, void *startf, 234void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
234 void *timerf, char * comm) 235 void *timerf, char *comm,
236 unsigned int timer_flag)
235{ 237{
236 /* 238 /*
237 * It doesn't matter which lock we take: 239 * It doesn't matter which lock we take:
@@ -249,6 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
249 input.start_func = startf; 251 input.start_func = startf;
250 input.expire_func = timerf; 252 input.expire_func = timerf;
251 input.pid = pid; 253 input.pid = pid;
254 input.timer_flag = timer_flag;
252 255
253 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
254 if (!active) 257 if (!active)
@@ -266,7 +269,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
266 269
267static void print_name_offset(struct seq_file *m, unsigned long addr) 270static void print_name_offset(struct seq_file *m, unsigned long addr)
268{ 271{
269 char symname[KSYM_NAME_LEN+1]; 272 char symname[KSYM_NAME_LEN];
270 273
271 if (lookup_symbol_name(addr, symname) < 0) 274 if (lookup_symbol_name(addr, symname) < 0)
272 seq_printf(m, "<%p>", (void *)addr); 275 seq_printf(m, "<%p>", (void *)addr);
@@ -295,7 +298,7 @@ static int tstats_show(struct seq_file *m, void *v)
295 period = ktime_to_timespec(time); 298 period = ktime_to_timespec(time);
296 ms = period.tv_nsec / 1000000; 299 ms = period.tv_nsec / 1000000;
297 300
298 seq_puts(m, "Timer Stats Version: v0.1\n"); 301 seq_puts(m, "Timer Stats Version: v0.2\n");
299 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); 302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
300 if (atomic_read(&overflow_count)) 303 if (atomic_read(&overflow_count))
301 seq_printf(m, "Overflow: %d entries\n", 304 seq_printf(m, "Overflow: %d entries\n",
@@ -303,8 +306,13 @@ static int tstats_show(struct seq_file *m, void *v)
303 306
304 for (i = 0; i < nr_entries; i++) { 307 for (i = 0; i < nr_entries; i++) {
305 entry = entries + i; 308 entry = entries + i;
306 seq_printf(m, "%4lu, %5d %-16s ", 309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
310 seq_printf(m, "%4luD, %5d %-16s ",
307 entry->count, entry->pid, entry->comm); 311 entry->count, entry->pid, entry->comm);
312 } else {
313 seq_printf(m, " %4lu, %5d %-16s ",
314 entry->count, entry->pid, entry->comm);
315 }
308 316
309 print_name_offset(m, (unsigned long)entry->start_func); 317 print_name_offset(m, (unsigned long)entry->start_func);
310 seq_puts(m, " ("); 318 seq_puts(m, " (");
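The new timer_flag field lets /proc/timer_stats (now v0.2) mark deferrable timers with a "D" suffix; the flag is filled in by the timer_stats_account_timer() helper added in the timer.c hunk below, based on tbase_get_deferrable(). A minimal sketch of the kind of timer that gets the annotation, assuming the init_timer_deferrable() API already present in this tree; my_timer and my_timer_fn() are hypothetical:

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list my_timer;

static void my_timer_fn(unsigned long data)
{
	/* power-friendly housekeeping; re-arm for roughly ten seconds later */
	mod_timer(&my_timer, jiffies + 10 * HZ);
}

static void my_timer_setup(void)
{
	/* Deferrable: may slip past its expiry on an idle CPU; listed with "D". */
	init_timer_deferrable(&my_timer);
	my_timer.function = my_timer_fn;
	my_timer.data = 0;
	mod_timer(&my_timer, jiffies + 10 * HZ);
}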
diff --git a/kernel/timer.c b/kernel/timer.c
index 1a69705c2f..b7792fb033 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -305,6 +305,20 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
305 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); 305 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
306 timer->start_pid = current->pid; 306 timer->start_pid = current->pid;
307} 307}
308
309static void timer_stats_account_timer(struct timer_list *timer)
310{
311 unsigned int flag = 0;
312
313 if (unlikely(tbase_get_deferrable(timer->base)))
314 flag |= TIMER_STATS_FLAG_DEFERRABLE;
315
316 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
317 timer->function, timer->start_comm, flag);
318}
319
320#else
321static void timer_stats_account_timer(struct timer_list *timer) {}
308#endif 322#endif
309 323
310/** 324/**
@@ -1114,6 +1128,7 @@ int do_sysinfo(struct sysinfo *info)
1114 getnstimeofday(&tp); 1128 getnstimeofday(&tp);
1115 tp.tv_sec += wall_to_monotonic.tv_sec; 1129 tp.tv_sec += wall_to_monotonic.tv_sec;
1116 tp.tv_nsec += wall_to_monotonic.tv_nsec; 1130 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1131 monotonic_to_bootbased(&tp);
1117 if (tp.tv_nsec - NSEC_PER_SEC >= 0) { 1132 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1118 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; 1133 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1119 tp.tv_sec++; 1134 tp.tv_sec++;
@@ -1206,7 +1221,8 @@ static int __devinit init_timers_cpu(int cpu)
1206 /* 1221 /*
1207 * The APs use this path later in boot 1222 * The APs use this path later in boot
1208 */ 1223 */
1209 base = kmalloc_node(sizeof(*base), GFP_KERNEL, 1224 base = kmalloc_node(sizeof(*base),
1225 GFP_KERNEL | __GFP_ZERO,
1210 cpu_to_node(cpu)); 1226 cpu_to_node(cpu));
1211 if (!base) 1227 if (!base)
1212 return -ENOMEM; 1228 return -ENOMEM;
@@ -1217,7 +1233,6 @@ static int __devinit init_timers_cpu(int cpu)
1217 kfree(base); 1233 kfree(base);
1218 return -ENOMEM; 1234 return -ENOMEM;
1219 } 1235 }
1220 memset(base, 0, sizeof(*base));
1221 per_cpu(tvec_bases, cpu) = base; 1236 per_cpu(tvec_bases, cpu) = base;
1222 } else { 1237 } else {
1223 /* 1238 /*
diff --git a/kernel/user.c b/kernel/user.c
index 4869563080..98b8250779 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -14,20 +14,19 @@
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/key.h> 15#include <linux/key.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h>
18#include <linux/user_namespace.h>
17 19
18/* 20/*
19 * UID task count cache, to get fast user lookup in "alloc_uid" 21 * UID task count cache, to get fast user lookup in "alloc_uid"
20 * when changing user ID's (ie setuid() and friends). 22 * when changing user ID's (ie setuid() and friends).
21 */ 23 */
22 24
23#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8)
24#define UIDHASH_SZ (1 << UIDHASH_BITS)
25#define UIDHASH_MASK (UIDHASH_SZ - 1) 25#define UIDHASH_MASK (UIDHASH_SZ - 1)
26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) 27#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid)))
28 28
29static struct kmem_cache *uid_cachep; 29static struct kmem_cache *uid_cachep;
30static struct list_head uidhash_table[UIDHASH_SZ];
31 30
32/* 31/*
33 * The uidhash_lock is mostly taken from process context, but it is 32 * The uidhash_lock is mostly taken from process context, but it is
@@ -94,9 +93,10 @@ struct user_struct *find_user(uid_t uid)
94{ 93{
95 struct user_struct *ret; 94 struct user_struct *ret;
96 unsigned long flags; 95 unsigned long flags;
96 struct user_namespace *ns = current->nsproxy->user_ns;
97 97
98 spin_lock_irqsave(&uidhash_lock, flags); 98 spin_lock_irqsave(&uidhash_lock, flags);
99 ret = uid_hash_find(uid, uidhashentry(uid)); 99 ret = uid_hash_find(uid, uidhashentry(ns, uid));
100 spin_unlock_irqrestore(&uidhash_lock, flags); 100 spin_unlock_irqrestore(&uidhash_lock, flags);
101 return ret; 101 return ret;
102} 102}
@@ -120,9 +120,9 @@ void free_uid(struct user_struct *up)
120 } 120 }
121} 121}
122 122
123struct user_struct * alloc_uid(uid_t uid) 123struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
124{ 124{
125 struct list_head *hashent = uidhashentry(uid); 125 struct list_head *hashent = uidhashentry(ns, uid);
126 struct user_struct *up; 126 struct user_struct *up;
127 127
128 spin_lock_irq(&uidhash_lock); 128 spin_lock_irq(&uidhash_lock);
@@ -211,11 +211,11 @@ static int __init uid_cache_init(void)
211 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 211 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
212 212
213 for(n = 0; n < UIDHASH_SZ; ++n) 213 for(n = 0; n < UIDHASH_SZ; ++n)
214 INIT_LIST_HEAD(uidhash_table + n); 214 INIT_LIST_HEAD(init_user_ns.uidhash_table + n);
215 215
216 /* Insert the root user immediately (init already runs as root) */ 216 /* Insert the root user immediately (init already runs as root) */
217 spin_lock_irq(&uidhash_lock); 217 spin_lock_irq(&uidhash_lock);
218 uid_hash_insert(&root_user, uidhashentry(0)); 218 uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
219 spin_unlock_irq(&uidhash_lock); 219 spin_unlock_irq(&uidhash_lock);
220 220
221 return 0; 221 return 0;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
new file mode 100644
index 0000000000..d055d98785
--- /dev/null
+++ b/kernel/user_namespace.c
@@ -0,0 +1,87 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License as
4 * published by the Free Software Foundation, version 2 of the
5 * License.
6 */
7
8#include <linux/module.h>
9#include <linux/version.h>
10#include <linux/nsproxy.h>
11#include <linux/user_namespace.h>
12
13struct user_namespace init_user_ns = {
14 .kref = {
15 .refcount = ATOMIC_INIT(2),
16 },
17 .root_user = &root_user,
18};
19
20EXPORT_SYMBOL_GPL(init_user_ns);
21
22#ifdef CONFIG_USER_NS
23
24/*
25 * Clone a new ns copying an original user ns, setting refcount to 1
26 * @old_ns: namespace to clone
27 * Return NULL on error (failure to kmalloc), new ns otherwise
28 */
29static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
30{
31 struct user_namespace *ns;
32 struct user_struct *new_user;
33 int n;
34
35 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
36 if (!ns)
37 return ERR_PTR(-ENOMEM);
38
39 kref_init(&ns->kref);
40
41 for (n = 0; n < UIDHASH_SZ; ++n)
42 INIT_LIST_HEAD(ns->uidhash_table + n);
43
44 /* Insert new root user. */
45 ns->root_user = alloc_uid(ns, 0);
46 if (!ns->root_user) {
47 kfree(ns);
48 return ERR_PTR(-ENOMEM);
49 }
50
51 /* Reset current->user with a new one */
52 new_user = alloc_uid(ns, current->uid);
53 if (!new_user) {
54 free_uid(ns->root_user);
55 kfree(ns);
56 return ERR_PTR(-ENOMEM);
57 }
58
59 switch_uid(new_user);
60 return ns;
61}
62
63struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
64{
65 struct user_namespace *new_ns;
66
67 BUG_ON(!old_ns);
68 get_user_ns(old_ns);
69
70 if (!(flags & CLONE_NEWUSER))
71 return old_ns;
72
73 new_ns = clone_user_ns(old_ns);
74
75 put_user_ns(old_ns);
76 return new_ns;
77}
78
79void free_user_ns(struct kref *kref)
80{
81 struct user_namespace *ns;
82
83 ns = container_of(kref, struct user_namespace, kref);
84 kfree(ns);
85}
86
87#endif /* CONFIG_USER_NS */
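clone_user_ns() and copy_user_ns() report failure with ERR_PTR(-ENOMEM) rather than NULL, so callers (create_new_namespaces() in the nsproxy.c part of this merge) must test the result with IS_ERR()/PTR_ERR(). A sketch of that caller-side idiom; my_take_user_ns() is a hypothetical wrapper:

#include <linux/err.h>
#include <linux/user_namespace.h>

static int my_take_user_ns(int flags, struct user_namespace *old_ns,
			   struct user_namespace **out)
{
	/* Returns old_ns (with a reference) unless CLONE_NEWUSER was requested. */
	struct user_namespace *new_ns = copy_user_ns(flags, old_ns);

	if (IS_ERR(new_ns))
		return PTR_ERR(new_ns);		/* -ENOMEM from clone_user_ns() */

	*out = new_ns;
	return 0;
}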
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 160c8c5136..9d8180a0f0 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -13,6 +13,7 @@
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h> 15#include <linux/version.h>
16#include <linux/err.h>
16 17
17/* 18/*
18 * Clone a new ns copying an original utsname, setting refcount to 1 19 * Clone a new ns copying an original utsname, setting refcount to 1
@@ -24,10 +25,11 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
24 struct uts_namespace *ns; 25 struct uts_namespace *ns;
25 26
26 ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); 27 ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
27 if (ns) { 28 if (!ns)
28 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 29 return ERR_PTR(-ENOMEM);
29 kref_init(&ns->kref); 30
30 } 31 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
32 kref_init(&ns->kref);
31 return ns; 33 return ns;
32} 34}
33 35
@@ -37,7 +39,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
37 * utsname of this process won't be seen by parent, and vice 39 * utsname of this process won't be seen by parent, and vice
38 * versa. 40 * versa.
39 */ 41 */
40struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns) 42struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns)
41{ 43{
42 struct uts_namespace *new_ns; 44 struct uts_namespace *new_ns;
43 45
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index f22b9dbd2a..c76c06466b 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -18,10 +18,7 @@
18static void *get_uts(ctl_table *table, int write) 18static void *get_uts(ctl_table *table, int write)
19{ 19{
20 char *which = table->data; 20 char *which = table->data;
21#ifdef CONFIG_UTS_NS 21
22 struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
23 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
24#endif
25 if (!write) 22 if (!write)
26 down_read(&uts_sem); 23 down_read(&uts_sem);
27 else 24 else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3bebf73be9..58e5c152a6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -282,8 +282,8 @@ static int worker_thread(void *__cwq)
282 struct cpu_workqueue_struct *cwq = __cwq; 282 struct cpu_workqueue_struct *cwq = __cwq;
283 DEFINE_WAIT(wait); 283 DEFINE_WAIT(wait);
284 284
285 if (!cwq->wq->freezeable) 285 if (cwq->wq->freezeable)
286 current->flags |= PF_NOFREEZE; 286 set_freezable();
287 287
288 set_user_nice(current, -5); 288 set_user_nice(current, -5);
289 289
@@ -382,16 +382,16 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
382EXPORT_SYMBOL_GPL(flush_workqueue); 382EXPORT_SYMBOL_GPL(flush_workqueue);
383 383
384/* 384/*
385 * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit, 385 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
386 * so this work can't be re-armed in any way. 386 * so this work can't be re-armed in any way.
387 */ 387 */
388static int try_to_grab_pending(struct work_struct *work) 388static int try_to_grab_pending(struct work_struct *work)
389{ 389{
390 struct cpu_workqueue_struct *cwq; 390 struct cpu_workqueue_struct *cwq;
391 int ret = 0; 391 int ret = -1;
392 392
393 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) 393 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
394 return 1; 394 return 0;
395 395
396 /* 396 /*
397 * The queueing is in progress, or it is already queued. Try to 397 * The queueing is in progress, or it is already queued. Try to
@@ -457,10 +457,28 @@ static void wait_on_work(struct work_struct *work)
457 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 457 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
458} 458}
459 459
460static int __cancel_work_timer(struct work_struct *work,
461 struct timer_list* timer)
462{
463 int ret;
464
465 do {
466 ret = (timer && likely(del_timer(timer)));
467 if (!ret)
468 ret = try_to_grab_pending(work);
469 wait_on_work(work);
470 } while (unlikely(ret < 0));
471
472 work_clear_pending(work);
473 return ret;
474}
475
460/** 476/**
461 * cancel_work_sync - block until a work_struct's callback has terminated 477 * cancel_work_sync - block until a work_struct's callback has terminated
462 * @work: the work which is to be flushed 478 * @work: the work which is to be flushed
463 * 479 *
480 * Returns true if @work was pending.
481 *
464 * cancel_work_sync() will cancel the work if it is queued. If the work's 482 * cancel_work_sync() will cancel the work if it is queued. If the work's
465 * callback appears to be running, cancel_work_sync() will block until it 483 * callback appears to be running, cancel_work_sync() will block until it
466 * has completed. 484 * has completed.
@@ -476,31 +494,26 @@ static void wait_on_work(struct work_struct *work)
476 * The caller must ensure that workqueue_struct on which this work was last 494 * The caller must ensure that workqueue_struct on which this work was last
477 * queued can't be destroyed before this function returns. 495 * queued can't be destroyed before this function returns.
478 */ 496 */
479void cancel_work_sync(struct work_struct *work) 497int cancel_work_sync(struct work_struct *work)
480{ 498{
481 while (!try_to_grab_pending(work)) 499 return __cancel_work_timer(work, NULL);
482 cpu_relax();
483 wait_on_work(work);
484 work_clear_pending(work);
485} 500}
486EXPORT_SYMBOL_GPL(cancel_work_sync); 501EXPORT_SYMBOL_GPL(cancel_work_sync);
487 502
488/** 503/**
489 * cancel_rearming_delayed_work - reliably kill off a delayed work. 504 * cancel_delayed_work_sync - reliably kill off a delayed work.
490 * @dwork: the delayed work struct 505 * @dwork: the delayed work struct
491 * 506 *
507 * Returns true if @dwork was pending.
508 *
492 * It is possible to use this function if @dwork rearms itself via queue_work() 509 * It is possible to use this function if @dwork rearms itself via queue_work()
493 * or queue_delayed_work(). See also the comment for cancel_work_sync(). 510 * or queue_delayed_work(). See also the comment for cancel_work_sync().
494 */ 511 */
495void cancel_rearming_delayed_work(struct delayed_work *dwork) 512int cancel_delayed_work_sync(struct delayed_work *dwork)
496{ 513{
497 while (!del_timer(&dwork->timer) && 514 return __cancel_work_timer(&dwork->work, &dwork->timer);
498 !try_to_grab_pending(&dwork->work))
499 cpu_relax();
500 wait_on_work(&dwork->work);
501 work_clear_pending(&dwork->work);
502} 515}
503EXPORT_SYMBOL(cancel_rearming_delayed_work); 516EXPORT_SYMBOL(cancel_delayed_work_sync);
504 517
505static struct workqueue_struct *keventd_wq __read_mostly; 518static struct workqueue_struct *keventd_wq __read_mostly;
506 519
@@ -739,18 +752,17 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
739 if (cwq->thread == NULL) 752 if (cwq->thread == NULL)
740 return; 753 return;
741 754
755 flush_cpu_workqueue(cwq);
742 /* 756 /*
743 * If the caller is CPU_DEAD the single flush_cpu_workqueue() 757 * If the caller is CPU_DEAD and cwq->worklist was not empty,
744 * is not enough, a concurrent flush_workqueue() can insert a 758 * a concurrent flush_workqueue() can insert a barrier after us.
745 * barrier after us. 759 * However, in that case run_workqueue() won't return and check
760 * kthread_should_stop() until it flushes all work_struct's.
746 * When ->worklist becomes empty it is safe to exit because no 761 * When ->worklist becomes empty it is safe to exit because no
747 * more work_structs can be queued on this cwq: flush_workqueue 762 * more work_structs can be queued on this cwq: flush_workqueue
748 * checks list_empty(), and a "normal" queue_work() can't use 763 * checks list_empty(), and a "normal" queue_work() can't use
749 * a dead CPU. 764 * a dead CPU.
750 */ 765 */
751 while (flush_cpu_workqueue(cwq))
752 ;
753
754 kthread_stop(cwq->thread); 766 kthread_stop(cwq->thread);
755 cwq->thread = NULL; 767 cwq->thread = NULL;
756} 768}
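cancel_rearming_delayed_work() is replaced by cancel_delayed_work_sync(), both sync cancel helpers are now built on __cancel_work_timer(), and both report whether the work was still pending. A sketch of the intended use against a self-rearming delayed work, e.g. in driver teardown; my_work, my_work_fn() and my_driver_exit() are hypothetical:

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void my_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_work, my_work_fn);

static void my_work_fn(struct work_struct *work)
{
	/* periodic polling that re-arms itself */
	schedule_delayed_work(&my_work, HZ);
}

static void my_driver_exit(void)
{
	/*
	 * Safe against the re-arming above: kills the timer or grabs the
	 * PENDING bit, then waits for a running callback to finish.
	 */
	if (cancel_delayed_work_sync(&my_work))
		pr_debug("my_work was still pending\n");
}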