Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile              |    6
-rw-r--r--  kernel/audit.c               |   97
-rw-r--r--  kernel/audit.h               |    1
-rw-r--r--  kernel/auditfilter.c         |   12
-rw-r--r--  kernel/auditsc.c             |   89
-rw-r--r--  kernel/cpu.c                 |   16
-rw-r--r--  kernel/cpuset.c              |   11
-rw-r--r--  kernel/delayacct.c           |   10
-rw-r--r--  kernel/exit.c                |   45
-rw-r--r--  kernel/fork.c                |   35
-rw-r--r--  kernel/futex.c               |  159
-rw-r--r--  kernel/hrtimer.c             |   17
-rw-r--r--  kernel/irq/proc.c            |   10
-rw-r--r--  kernel/irq/spurious.c        |   12
-rw-r--r--  kernel/kallsyms.c            |   27
-rw-r--r--  kernel/kfifo.c               |    3
-rw-r--r--  kernel/kmod.c                |  303
-rw-r--r--  kernel/kprobes.c             |    9
-rw-r--r--  kernel/ksysfs.c              |   28
-rw-r--r--  kernel/kthread.c             |    2
-rw-r--r--  kernel/lockdep.c             | 1501
-rw-r--r--  kernel/lockdep_proc.c        |  301
-rw-r--r--  kernel/module.c              |   72
-rw-r--r--  kernel/mutex.c               |    8
-rw-r--r--  kernel/nsproxy.c             |   72
-rw-r--r--  kernel/panic.c               |    5
-rw-r--r--  kernel/params.c              |    1
-rw-r--r--  kernel/pid.c                 |    2
-rw-r--r--  kernel/posix-cpu-timers.c    |   34
-rw-r--r--  kernel/posix-timers.c        |    2
-rw-r--r--  kernel/power/Kconfig         |   29
-rw-r--r--  kernel/power/disk.c          |  251
-rw-r--r--  kernel/power/main.c          |  108
-rw-r--r--  kernel/power/power.h         |   29
-rw-r--r--  kernel/power/process.c       |   90
-rw-r--r--  kernel/power/swap.c          |   20
-rw-r--r--  kernel/power/user.c          |  154
-rw-r--r--  kernel/printk.c              |   55
-rw-r--r--  kernel/ptrace.c              |   28
-rw-r--r--  kernel/rcutorture.c          |    4
-rw-r--r--  kernel/relay.c               |  220
-rw-r--r--  kernel/rtmutex-debug.c       |    6
-rw-r--r--  kernel/rtmutex-tester.c      |    1
-rw-r--r--  kernel/rtmutex.c             |    6
-rw-r--r--  kernel/rtmutex_common.h      |    9
-rw-r--r--  kernel/rwsem.c               |    8
-rw-r--r--  kernel/sched.c               | 3090
-rw-r--r--  kernel/sched_debug.c         |  275
-rw-r--r--  kernel/sched_fair.c          | 1131
-rw-r--r--  kernel/sched_idletask.c      |   71
-rw-r--r--  kernel/sched_rt.c            |  255
-rw-r--r--  kernel/sched_stats.h         |  235
-rw-r--r--  kernel/seccomp.c             |   29
-rw-r--r--  kernel/signal.c              |   33
-rw-r--r--  kernel/softirq.c             |   10
-rw-r--r--  kernel/softlockup.c          |    2
-rw-r--r--  kernel/spinlock.c            |   32
-rw-r--r--  kernel/stop_machine.c        |    8
-rw-r--r--  kernel/sys.c                 |  104
-rw-r--r--  kernel/sys_ni.c              |    1
-rw-r--r--  kernel/sysctl.c              |  191
-rw-r--r--  kernel/taskstats.c           |    4
-rw-r--r--  kernel/time.c                |   79
-rw-r--r--  kernel/time/clockevents.c    |   41
-rw-r--r--  kernel/time/ntp.c            |   73
-rw-r--r--  kernel/time/tick-broadcast.c |   35
-rw-r--r--  kernel/time/tick-common.c    |   16
-rw-r--r--  kernel/time/tick-oneshot.c   |   15
-rw-r--r--  kernel/time/tick-sched.c     |    7
-rw-r--r--  kernel/time/timekeeping.c    |   45
-rw-r--r--  kernel/time/timer_list.c     |    2
-rw-r--r--  kernel/time/timer_stats.c    |   16
-rw-r--r--  kernel/timer.c               |  231
-rw-r--r--  kernel/user.c                |   20
-rw-r--r--  kernel/user_namespace.c      |   87
-rw-r--r--  kernel/utsname.c             |   12
-rw-r--r--  kernel/utsname_sysctl.c      |    5
-rw-r--r--  kernel/workqueue.c           |   60
78 files changed, 6467 insertions(+), 3656 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 642d4277c2..2a999836ca 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,11 +4,12 @@
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
6 exit.o itimer.o time.o softirq.o resource.o \ 6 exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o 11 hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
12 utsname.o
12 13
13obj-$(CONFIG_STACKTRACE) += stacktrace.o 14obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-y += time/ 15obj-y += time/
@@ -48,7 +49,6 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
48obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 49obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
49obj-$(CONFIG_RELAY) += relay.o 50obj-$(CONFIG_RELAY) += relay.o
50obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 51obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
51obj-$(CONFIG_UTS_NS) += utsname.o
52obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 52obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
53obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 53obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
54 54
diff --git a/kernel/audit.c b/kernel/audit.c
index d13276d414..eb0f9165b4 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -58,6 +58,7 @@
58#include <linux/selinux.h> 58#include <linux/selinux.h>
59#include <linux/inotify.h> 59#include <linux/inotify.h>
60#include <linux/freezer.h> 60#include <linux/freezer.h>
61#include <linux/tty.h>
61 62
62#include "audit.h" 63#include "audit.h"
63 64
@@ -391,6 +392,7 @@ static int kauditd_thread(void *dummy)
391{ 392{
392 struct sk_buff *skb; 393 struct sk_buff *skb;
393 394
395 set_freezable();
394 while (!kthread_should_stop()) { 396 while (!kthread_should_stop()) {
395 skb = skb_dequeue(&audit_skb_queue); 397 skb = skb_dequeue(&audit_skb_queue);
396 wake_up(&audit_backlog_wait); 398 wake_up(&audit_backlog_wait);
@@ -423,6 +425,31 @@ static int kauditd_thread(void *dummy)
423 return 0; 425 return 0;
424} 426}
425 427
428static int audit_prepare_user_tty(pid_t pid, uid_t loginuid)
429{
430 struct task_struct *tsk;
431 int err;
432
433 read_lock(&tasklist_lock);
434 tsk = find_task_by_pid(pid);
435 err = -ESRCH;
436 if (!tsk)
437 goto out;
438 err = 0;
439
440 spin_lock_irq(&tsk->sighand->siglock);
441 if (!tsk->signal->audit_tty)
442 err = -EPERM;
443 spin_unlock_irq(&tsk->sighand->siglock);
444 if (err)
445 goto out;
446
447 tty_audit_push_task(tsk, loginuid);
448out:
449 read_unlock(&tasklist_lock);
450 return err;
451}
452
426int audit_send_list(void *_dest) 453int audit_send_list(void *_dest)
427{ 454{
428 struct audit_netlink_list *dest = _dest; 455 struct audit_netlink_list *dest = _dest;
@@ -511,6 +538,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
511 case AUDIT_DEL: 538 case AUDIT_DEL:
512 case AUDIT_DEL_RULE: 539 case AUDIT_DEL_RULE:
513 case AUDIT_SIGNAL_INFO: 540 case AUDIT_SIGNAL_INFO:
541 case AUDIT_TTY_GET:
542 case AUDIT_TTY_SET:
514 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) 543 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
515 err = -EPERM; 544 err = -EPERM;
516 break; 545 break;
@@ -622,6 +651,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
622 err = audit_filter_user(&NETLINK_CB(skb), msg_type); 651 err = audit_filter_user(&NETLINK_CB(skb), msg_type);
623 if (err == 1) { 652 if (err == 1) {
624 err = 0; 653 err = 0;
654 if (msg_type == AUDIT_USER_TTY) {
655 err = audit_prepare_user_tty(pid, loginuid);
656 if (err)
657 break;
658 }
625 ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 659 ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
626 if (ab) { 660 if (ab) {
627 audit_log_format(ab, 661 audit_log_format(ab,
@@ -638,8 +672,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
638 " subj=%s", ctx); 672 " subj=%s", ctx);
639 kfree(ctx); 673 kfree(ctx);
640 } 674 }
641 audit_log_format(ab, " msg='%.1024s'", 675 if (msg_type != AUDIT_USER_TTY)
642 (char *)data); 676 audit_log_format(ab, " msg='%.1024s'",
677 (char *)data);
678 else {
679 int size;
680
681 audit_log_format(ab, " msg=");
682 size = nlmsg_len(nlh);
683 audit_log_n_untrustedstring(ab, size,
684 data);
685 }
643 audit_set_pid(ab, pid); 686 audit_set_pid(ab, pid);
644 audit_log_end(ab); 687 audit_log_end(ab);
645 } 688 }
@@ -730,6 +773,45 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
730 0, 0, sig_data, sizeof(*sig_data) + len); 773 0, 0, sig_data, sizeof(*sig_data) + len);
731 kfree(sig_data); 774 kfree(sig_data);
732 break; 775 break;
776 case AUDIT_TTY_GET: {
777 struct audit_tty_status s;
778 struct task_struct *tsk;
779
780 read_lock(&tasklist_lock);
781 tsk = find_task_by_pid(pid);
782 if (!tsk)
783 err = -ESRCH;
784 else {
785 spin_lock_irq(&tsk->sighand->siglock);
786 s.enabled = tsk->signal->audit_tty != 0;
787 spin_unlock_irq(&tsk->sighand->siglock);
788 }
789 read_unlock(&tasklist_lock);
790 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0,
791 &s, sizeof(s));
792 break;
793 }
794 case AUDIT_TTY_SET: {
795 struct audit_tty_status *s;
796 struct task_struct *tsk;
797
798 if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
799 return -EINVAL;
800 s = data;
801 if (s->enabled != 0 && s->enabled != 1)
802 return -EINVAL;
803 read_lock(&tasklist_lock);
804 tsk = find_task_by_pid(pid);
805 if (!tsk)
806 err = -ESRCH;
807 else {
808 spin_lock_irq(&tsk->sighand->siglock);
809 tsk->signal->audit_tty = s->enabled != 0;
810 spin_unlock_irq(&tsk->sighand->siglock);
811 }
812 read_unlock(&tasklist_lock);
813 break;
814 }
733 default: 815 default:
734 err = -EINVAL; 816 err = -EINVAL;
735 break; 817 break;
@@ -1185,7 +1267,7 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1185} 1267}
1186 1268
1187/** 1269/**
1188 * audit_log_n_unstrustedstring - log a string that may contain random characters 1270 * audit_log_n_untrustedstring - log a string that may contain random characters
1189 * @ab: audit_buffer 1271 * @ab: audit_buffer
1190 * @len: lenth of string (not including trailing null) 1272 * @len: lenth of string (not including trailing null)
1191 * @string: string to be logged 1273 * @string: string to be logged
@@ -1201,25 +1283,24 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1201const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, 1283const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
1202 const char *string) 1284 const char *string)
1203{ 1285{
1204 const unsigned char *p = string; 1286 const unsigned char *p;
1205 1287
1206 while (*p) { 1288 for (p = string; p < (const unsigned char *)string + len && *p; p++) {
1207 if (*p == '"' || *p < 0x21 || *p > 0x7f) { 1289 if (*p == '"' || *p < 0x21 || *p > 0x7f) {
1208 audit_log_hex(ab, string, len); 1290 audit_log_hex(ab, string, len);
1209 return string + len + 1; 1291 return string + len + 1;
1210 } 1292 }
1211 p++;
1212 } 1293 }
1213 audit_log_n_string(ab, len, string); 1294 audit_log_n_string(ab, len, string);
1214 return p + 1; 1295 return p + 1;
1215} 1296}
1216 1297
1217/** 1298/**
1218 * audit_log_unstrustedstring - log a string that may contain random characters 1299 * audit_log_untrustedstring - log a string that may contain random characters
1219 * @ab: audit_buffer 1300 * @ab: audit_buffer
1220 * @string: string to be logged 1301 * @string: string to be logged
1221 * 1302 *
1222 * Same as audit_log_n_unstrustedstring(), except that strlen is used to 1303 * Same as audit_log_n_untrustedstring(), except that strlen is used to
1223 * determine string length. 1304 * determine string length.
1224 */ 1305 */
1225const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) 1306const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
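
The audit.c hunks above rework audit_log_n_untrustedstring() so the scan is bounded by the caller-supplied length instead of trusting a NUL terminator, falling back to hex encoding when any byte looks untrusted. A minimal userspace sketch of that decision logic; log_hex() and log_quoted() are illustrative stand-ins, not the kernel's audit-buffer API:

    /* Sketch: inspect at most `len` bytes and hex-encode the whole string
     * if any byte is a quote or a non-printable character. */
    #include <stdio.h>
    #include <stddef.h>

    static void log_hex(const char *s, size_t len)
    {
        while (len--)
            printf("%02X", (unsigned char)*s++);
        putchar('\n');
    }

    static void log_quoted(const char *s, size_t len)
    {
        printf("\"%.*s\"\n", (int)len, s);
    }

    static void log_untrusted(const char *s, size_t len)
    {
        const unsigned char *p;

        for (p = (const unsigned char *)s;
             p < (const unsigned char *)s + len && *p; p++) {
            if (*p == '"' || *p < 0x21 || *p > 0x7f) {
                log_hex(s, len);        /* untrusted byte: hex-encode everything */
                return;
            }
        }
        log_quoted(s, len);             /* all printable: log verbatim */
    }

    int main(void)
    {
        log_untrusted("ls -l", 5);      /* contains a space (< 0x21) -> hex */
        log_untrusted("comm=bash", 9);  /* printable -> quoted */
        return 0;
    }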
diff --git a/kernel/audit.h b/kernel/audit.h
index 815d6f5c04..95877435c3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -115,7 +115,6 @@ extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
115extern void audit_send_reply(int pid, int seq, int type, 115extern void audit_send_reply(int pid, int seq, int type,
116 int done, int multi, 116 int done, int multi,
117 void *payload, int size); 117 void *payload, int size);
118extern void audit_log_lost(const char *message);
119extern void audit_panic(const char *message); 118extern void audit_panic(const char *message);
120 119
121struct audit_netlink_list { 120struct audit_netlink_list {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ce61f42354..1bf093dcff 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1210,8 +1210,8 @@ static inline int audit_add_rule(struct audit_entry *entry,
1210 struct audit_entry *e; 1210 struct audit_entry *e;
1211 struct audit_field *inode_f = entry->rule.inode_f; 1211 struct audit_field *inode_f = entry->rule.inode_f;
1212 struct audit_watch *watch = entry->rule.watch; 1212 struct audit_watch *watch = entry->rule.watch;
1213 struct nameidata *ndp, *ndw; 1213 struct nameidata *ndp = NULL, *ndw = NULL;
1214 int h, err, putnd_needed = 0; 1214 int h, err;
1215#ifdef CONFIG_AUDITSYSCALL 1215#ifdef CONFIG_AUDITSYSCALL
1216 int dont_count = 0; 1216 int dont_count = 0;
1217 1217
@@ -1239,7 +1239,6 @@ static inline int audit_add_rule(struct audit_entry *entry,
1239 err = audit_get_nd(watch->path, &ndp, &ndw); 1239 err = audit_get_nd(watch->path, &ndp, &ndw);
1240 if (err) 1240 if (err)
1241 goto error; 1241 goto error;
1242 putnd_needed = 1;
1243 } 1242 }
1244 1243
1245 mutex_lock(&audit_filter_mutex); 1244 mutex_lock(&audit_filter_mutex);
@@ -1269,14 +1268,11 @@ static inline int audit_add_rule(struct audit_entry *entry,
1269#endif 1268#endif
1270 mutex_unlock(&audit_filter_mutex); 1269 mutex_unlock(&audit_filter_mutex);
1271 1270
1272 if (putnd_needed) 1271 audit_put_nd(ndp, ndw); /* NULL args OK */
1273 audit_put_nd(ndp, ndw);
1274
1275 return 0; 1272 return 0;
1276 1273
1277error: 1274error:
1278 if (putnd_needed) 1275 audit_put_nd(ndp, ndw); /* NULL args OK */
1279 audit_put_nd(ndp, ndw);
1280 if (watch) 1276 if (watch)
1281 audit_put_watch(watch); /* tmp watch, matches initial get */ 1277 audit_put_watch(watch); /* tmp watch, matches initial get */
1282 return err; 1278 return err;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e36481ed61..145cbb79c4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -71,9 +71,6 @@
71 71
72extern struct list_head audit_filter_list[]; 72extern struct list_head audit_filter_list[];
73 73
74/* No syscall auditing will take place unless audit_enabled != 0. */
75extern int audit_enabled;
76
77/* AUDIT_NAMES is the number of slots we reserve in the audit_context 74/* AUDIT_NAMES is the number of slots we reserve in the audit_context
78 * for saving names from getname(). */ 75 * for saving names from getname(). */
79#define AUDIT_NAMES 20 76#define AUDIT_NAMES 20
@@ -156,7 +153,7 @@ struct audit_aux_data_execve {
156 struct audit_aux_data d; 153 struct audit_aux_data d;
157 int argc; 154 int argc;
158 int envc; 155 int envc;
159 char mem[0]; 156 struct mm_struct *mm;
160}; 157};
161 158
162struct audit_aux_data_socketcall { 159struct audit_aux_data_socketcall {
@@ -834,6 +831,55 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
834 return rc; 831 return rc;
835} 832}
836 833
834static void audit_log_execve_info(struct audit_buffer *ab,
835 struct audit_aux_data_execve *axi)
836{
837 int i;
838 long len, ret;
839 const char __user *p = (const char __user *)axi->mm->arg_start;
840 char *buf;
841
842 if (axi->mm != current->mm)
843 return; /* execve failed, no additional info */
844
845 for (i = 0; i < axi->argc; i++, p += len) {
846 len = strnlen_user(p, MAX_ARG_STRLEN);
847 /*
848 * We just created this mm, if we can't find the strings
849 * we just copied into it something is _very_ wrong. Similar
850 * for strings that are too long, we should not have created
851 * any.
852 */
853 if (!len || len > MAX_ARG_STRLEN) {
854 WARN_ON(1);
855 send_sig(SIGKILL, current, 0);
856 }
857
858 buf = kmalloc(len, GFP_KERNEL);
859 if (!buf) {
860 audit_panic("out of memory for argv string\n");
861 break;
862 }
863
864 ret = copy_from_user(buf, p, len);
865 /*
866 * There is no reason for this copy to be short. We just
867 * copied them here, and the mm hasn't been exposed to user-
868 * space yet.
869 */
870 if (!ret) {
871 WARN_ON(1);
872 send_sig(SIGKILL, current, 0);
873 }
874
875 audit_log_format(ab, "a%d=", i);
876 audit_log_untrustedstring(ab, buf);
877 audit_log_format(ab, "\n");
878
879 kfree(buf);
880 }
881}
882
837static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 883static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
838{ 884{
839 int i, call_panic = 0; 885 int i, call_panic = 0;
@@ -974,13 +1020,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
974 1020
975 case AUDIT_EXECVE: { 1021 case AUDIT_EXECVE: {
976 struct audit_aux_data_execve *axi = (void *)aux; 1022 struct audit_aux_data_execve *axi = (void *)aux;
977 int i; 1023 audit_log_execve_info(ab, axi);
978 const char *p;
979 for (i = 0, p = axi->mem; i < axi->argc; i++) {
980 audit_log_format(ab, "a%d=", i);
981 p = audit_log_untrustedstring(ab, p);
982 audit_log_format(ab, "\n");
983 }
984 break; } 1024 break; }
985 1025
986 case AUDIT_SOCKETCALL: { 1026 case AUDIT_SOCKETCALL: {
@@ -1824,32 +1864,31 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode
1824 return 0; 1864 return 0;
1825} 1865}
1826 1866
1867int audit_argv_kb = 32;
1868
1827int audit_bprm(struct linux_binprm *bprm) 1869int audit_bprm(struct linux_binprm *bprm)
1828{ 1870{
1829 struct audit_aux_data_execve *ax; 1871 struct audit_aux_data_execve *ax;
1830 struct audit_context *context = current->audit_context; 1872 struct audit_context *context = current->audit_context;
1831 unsigned long p, next;
1832 void *to;
1833 1873
1834 if (likely(!audit_enabled || !context || context->dummy)) 1874 if (likely(!audit_enabled || !context || context->dummy))
1835 return 0; 1875 return 0;
1836 1876
1837 ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, 1877 /*
1838 GFP_KERNEL); 1878 * Even though the stack code doesn't limit the arg+env size any more,
1879 * the audit code requires that _all_ arguments be logged in a single
1880 * netlink skb. Hence cap it :-(
1881 */
1882 if (bprm->argv_len > (audit_argv_kb << 10))
1883 return -E2BIG;
1884
1885 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
1839 if (!ax) 1886 if (!ax)
1840 return -ENOMEM; 1887 return -ENOMEM;
1841 1888
1842 ax->argc = bprm->argc; 1889 ax->argc = bprm->argc;
1843 ax->envc = bprm->envc; 1890 ax->envc = bprm->envc;
1844 for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { 1891 ax->mm = bprm->mm;
1845 struct page *page = bprm->page[p / PAGE_SIZE];
1846 void *kaddr = kmap(page);
1847 next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1848 memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p);
1849 to += next - p;
1850 kunmap(page);
1851 }
1852
1853 ax->d.type = AUDIT_EXECVE; 1892 ax->d.type = AUDIT_EXECVE;
1854 ax->d.next = context->aux; 1893 ax->d.next = context->aux;
1855 context->aux = (void *)ax; 1894 context->aux = (void *)ax;
@@ -2040,7 +2079,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2040 2079
2041/** 2080/**
2042 * audit_core_dumps - record information about processes that end abnormally 2081 * audit_core_dumps - record information about processes that end abnormally
2043 * @sig: signal value 2082 * @signr: signal value
2044 * 2083 *
2045 * If a process ends with a core dump, something fishy is going on and we 2084 * If a process ends with a core dump, something fishy is going on and we
2046 * should record the event for investigation. 2085 * should record the event for investigation.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 208cf3497c..181ae70860 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -103,11 +103,19 @@ static inline void check_for_tasks(int cpu)
103 write_unlock_irq(&tasklist_lock); 103 write_unlock_irq(&tasklist_lock);
104} 104}
105 105
106struct take_cpu_down_param {
107 unsigned long mod;
108 void *hcpu;
109};
110
106/* Take this CPU down. */ 111/* Take this CPU down. */
107static int take_cpu_down(void *unused) 112static int take_cpu_down(void *_param)
108{ 113{
114 struct take_cpu_down_param *param = _param;
109 int err; 115 int err;
110 116
117 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
118 param->hcpu);
111 /* Ensure this CPU doesn't handle any more interrupts. */ 119 /* Ensure this CPU doesn't handle any more interrupts. */
112 err = __cpu_disable(); 120 err = __cpu_disable();
113 if (err < 0) 121 if (err < 0)
@@ -127,6 +135,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
127 cpumask_t old_allowed, tmp; 135 cpumask_t old_allowed, tmp;
128 void *hcpu = (void *)(long)cpu; 136 void *hcpu = (void *)(long)cpu;
129 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 137 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
138 struct take_cpu_down_param tcd_param = {
139 .mod = mod,
140 .hcpu = hcpu,
141 };
130 142
131 if (num_online_cpus() == 1) 143 if (num_online_cpus() == 1)
132 return -EBUSY; 144 return -EBUSY;
@@ -153,7 +165,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
153 set_cpus_allowed(current, tmp); 165 set_cpus_allowed(current, tmp);
154 166
155 mutex_lock(&cpu_bitmask_lock); 167 mutex_lock(&cpu_bitmask_lock);
156 p = __stop_machine_run(take_cpu_down, NULL, cpu); 168 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
157 mutex_unlock(&cpu_bitmask_lock); 169 mutex_unlock(&cpu_bitmask_lock);
158 170
159 if (IS_ERR(p) || cpu_online(cpu)) { 171 if (IS_ERR(p) || cpu_online(cpu)) {
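
The cpu.c change threads an extra value into take_cpu_down() by bundling it into a struct passed through the callback's single void * parameter, since __stop_machine_run() only forwards one pointer. A self-contained sketch of that pattern, with illustrative names:

    /* Sketch: pass several values through a callback that accepts one void *. */
    #include <stdio.h>

    struct callback_param {
        unsigned long mod;
        void *data;
    };

    static int callback(void *_param)
    {
        struct callback_param *param = _param;   /* unpack the bundle */

        printf("mod=%lu data=%p\n", param->mod, param->data);
        return 0;
    }

    static int run_with_callback(int (*fn)(void *), void *arg)
    {
        return fn(arg);                          /* stands in for __stop_machine_run() */
    }

    int main(void)
    {
        int value = 42;
        struct callback_param param = {
            .mod = 1UL,
            .data = &value,
        };

        return run_with_callback(callback, &param);
    }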
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4c49188cc4..57e6448b17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf)
516 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 516 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
517 envp[i] = NULL; 517 envp[i] = NULL;
518 518
519 call_usermodehelper(argv[0], argv, envp, 0); 519 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
520 kfree(pathbuf); 520 kfree(pathbuf);
521} 521}
522 522
@@ -981,10 +981,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
981 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); 981 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
982 if (!mmarray) 982 if (!mmarray)
983 goto done; 983 goto done;
984 write_lock_irq(&tasklist_lock); /* block fork */ 984 read_lock(&tasklist_lock); /* block fork */
985 if (atomic_read(&cs->count) <= ntasks) 985 if (atomic_read(&cs->count) <= ntasks)
986 break; /* got enough */ 986 break; /* got enough */
987 write_unlock_irq(&tasklist_lock); /* try again */ 987 read_unlock(&tasklist_lock); /* try again */
988 kfree(mmarray); 988 kfree(mmarray);
989 } 989 }
990 990
@@ -1006,7 +1006,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1006 continue; 1006 continue;
1007 mmarray[n++] = mm; 1007 mmarray[n++] = mm;
1008 } while_each_thread(g, p); 1008 } while_each_thread(g, p);
1009 write_unlock_irq(&tasklist_lock); 1009 read_unlock(&tasklist_lock);
1010 1010
1011 /* 1011 /*
1012 * Now that we've dropped the tasklist spinlock, we can 1012 * Now that we've dropped the tasklist spinlock, we can
@@ -2138,6 +2138,9 @@ static void common_cpu_mem_hotplug_unplug(void)
2138static int cpuset_handle_cpuhp(struct notifier_block *nb, 2138static int cpuset_handle_cpuhp(struct notifier_block *nb,
2139 unsigned long phase, void *cpu) 2139 unsigned long phase, void *cpu)
2140{ 2140{
2141 if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
2142 return NOTIFY_DONE;
2143
2141 common_cpu_mem_hotplug_unplug(); 2144 common_cpu_mem_hotplug_unplug();
2142 return 0; 2145 return 0;
2143} 2146}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index c0148ae992..81e6978296 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -99,9 +99,10 @@ void __delayacct_blkio_end(void)
99int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) 99int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
100{ 100{
101 s64 tmp; 101 s64 tmp;
102 struct timespec ts; 102 unsigned long t1;
103 unsigned long t1,t2,t3; 103 unsigned long long t2, t3;
104 unsigned long flags; 104 unsigned long flags;
105 struct timespec ts;
105 106
106 /* Though tsk->delays accessed later, early exit avoids 107 /* Though tsk->delays accessed later, early exit avoids
107 * unnecessary returning of other data 108 * unnecessary returning of other data
@@ -124,11 +125,10 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
124 125
125 d->cpu_count += t1; 126 d->cpu_count += t1;
126 127
127 jiffies_to_timespec(t2, &ts); 128 tmp = (s64)d->cpu_delay_total + t2;
128 tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
129 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; 129 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
130 130
131 tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; 131 tmp = (s64)d->cpu_run_virtual_total + t3;
132 d->cpu_run_virtual_total = 132 d->cpu_run_virtual_total =
133 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; 133 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
134 134
diff --git a/kernel/exit.c b/kernel/exit.c
index 5c8ecbaa19..464c2b172f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,6 +31,7 @@
31#include <linux/mempolicy.h> 31#include <linux/mempolicy.h>
32#include <linux/taskstats_kern.h> 32#include <linux/taskstats_kern.h>
33#include <linux/delayacct.h> 33#include <linux/delayacct.h>
34#include <linux/freezer.h>
34#include <linux/cpuset.h> 35#include <linux/cpuset.h>
35#include <linux/syscalls.h> 36#include <linux/syscalls.h>
36#include <linux/signal.h> 37#include <linux/signal.h>
@@ -44,6 +45,7 @@
44#include <linux/resource.h> 45#include <linux/resource.h>
45#include <linux/blkdev.h> 46#include <linux/blkdev.h>
46#include <linux/task_io_accounting_ops.h> 47#include <linux/task_io_accounting_ops.h>
48#include <linux/freezer.h>
47 49
48#include <asm/uaccess.h> 50#include <asm/uaccess.h>
49#include <asm/unistd.h> 51#include <asm/unistd.h>
@@ -122,9 +124,9 @@ static void __exit_signal(struct task_struct *tsk)
122 sig->maj_flt += tsk->maj_flt; 124 sig->maj_flt += tsk->maj_flt;
123 sig->nvcsw += tsk->nvcsw; 125 sig->nvcsw += tsk->nvcsw;
124 sig->nivcsw += tsk->nivcsw; 126 sig->nivcsw += tsk->nivcsw;
125 sig->sched_time += tsk->sched_time;
126 sig->inblock += task_io_get_inblock(tsk); 127 sig->inblock += task_io_get_inblock(tsk);
127 sig->oublock += task_io_get_oublock(tsk); 128 sig->oublock += task_io_get_oublock(tsk);
129 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
128 sig = NULL; /* Marker for below. */ 130 sig = NULL; /* Marker for below. */
129 } 131 }
130 132
@@ -182,7 +184,6 @@ repeat:
182 zap_leader = (leader->exit_signal == -1); 184 zap_leader = (leader->exit_signal == -1);
183 } 185 }
184 186
185 sched_exit(p);
186 write_unlock_irq(&tasklist_lock); 187 write_unlock_irq(&tasklist_lock);
187 proc_flush_task(p); 188 proc_flush_task(p);
188 release_thread(p); 189 release_thread(p);
@@ -291,7 +292,7 @@ static void reparent_to_kthreadd(void)
291 /* Set the exit signal to SIGCHLD so we signal init on exit */ 292 /* Set the exit signal to SIGCHLD so we signal init on exit */
292 current->exit_signal = SIGCHLD; 293 current->exit_signal = SIGCHLD;
293 294
294 if (!has_rt_policy(current) && (task_nice(current) < 0)) 295 if (task_nice(current) < 0)
295 set_user_nice(current, 0); 296 set_user_nice(current, 0);
296 /* cpus_allowed? */ 297 /* cpus_allowed? */
297 /* rt_priority? */ 298 /* rt_priority? */
@@ -388,6 +389,11 @@ void daemonize(const char *name, ...)
388 * they would be locked into memory. 389 * they would be locked into memory.
389 */ 390 */
390 exit_mm(current); 391 exit_mm(current);
392 /*
393 * We don't want to have TIF_FREEZE set if the system-wide hibernation
394 * or suspend transition begins right now.
395 */
396 current->flags |= PF_NOFREEZE;
391 397
392 set_special_pids(1, 1); 398 set_special_pids(1, 1);
393 proc_clear_tty(current); 399 proc_clear_tty(current);
@@ -589,6 +595,8 @@ static void exit_mm(struct task_struct * tsk)
589 tsk->mm = NULL; 595 tsk->mm = NULL;
590 up_read(&mm->mmap_sem); 596 up_read(&mm->mmap_sem);
591 enter_lazy_tlb(mm, current); 597 enter_lazy_tlb(mm, current);
598 /* We don't want this task to be frozen prematurely */
599 clear_freeze_flag(tsk);
592 task_unlock(tsk); 600 task_unlock(tsk);
593 mmput(mm); 601 mmput(mm);
594} 602}
@@ -859,6 +867,34 @@ static void exit_notify(struct task_struct *tsk)
859 release_task(tsk); 867 release_task(tsk);
860} 868}
861 869
870#ifdef CONFIG_DEBUG_STACK_USAGE
871static void check_stack_usage(void)
872{
873 static DEFINE_SPINLOCK(low_water_lock);
874 static int lowest_to_date = THREAD_SIZE;
875 unsigned long *n = end_of_stack(current);
876 unsigned long free;
877
878 while (*n == 0)
879 n++;
880 free = (unsigned long)n - (unsigned long)end_of_stack(current);
881
882 if (free >= lowest_to_date)
883 return;
884
885 spin_lock(&low_water_lock);
886 if (free < lowest_to_date) {
887 printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
888 "left\n",
889 current->comm, free);
890 lowest_to_date = free;
891 }
892 spin_unlock(&low_water_lock);
893}
894#else
895static inline void check_stack_usage(void) {}
896#endif
897
862fastcall NORET_TYPE void do_exit(long code) 898fastcall NORET_TYPE void do_exit(long code)
863{ 899{
864 struct task_struct *tsk = current; 900 struct task_struct *tsk = current;
@@ -938,6 +974,8 @@ fastcall NORET_TYPE void do_exit(long code)
938 if (unlikely(tsk->compat_robust_list)) 974 if (unlikely(tsk->compat_robust_list))
939 compat_exit_robust_list(tsk); 975 compat_exit_robust_list(tsk);
940#endif 976#endif
977 if (group_dead)
978 tty_audit_exit();
941 if (unlikely(tsk->audit_context)) 979 if (unlikely(tsk->audit_context))
942 audit_free(tsk); 980 audit_free(tsk);
943 981
@@ -950,6 +988,7 @@ fastcall NORET_TYPE void do_exit(long code)
950 exit_sem(tsk); 988 exit_sem(tsk);
951 __exit_files(tsk); 989 __exit_files(tsk);
952 __exit_fs(tsk); 990 __exit_fs(tsk);
991 check_stack_usage();
953 exit_thread(); 992 exit_thread();
954 cpuset_exit(tsk); 993 cpuset_exit(tsk);
955 exit_keys(tsk); 994 exit_keys(tsk);
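
The check_stack_usage() helper added above estimates worst-case stack depth by walking the zero-initialised stack area from its low end until the first word that was ever written. A userspace sketch of the same watermark idea over a plain buffer (the buffer and the simulated usage are illustrative, and the scan assumes at least one word was dirtied):

    /* Sketch of the stack-watermark scan: find the first non-zero word in a
     * zero-initialised region and report how much of it was never touched. */
    #include <stdio.h>
    #include <string.h>

    #define REGION_WORDS 1024

    static unsigned long region[REGION_WORDS];   /* stands in for the thread stack */

    static unsigned long untouched_bytes(void)
    {
        unsigned long *n = region;

        while (*n == 0)                          /* assumes something was written */
            n++;
        return (unsigned long)n - (unsigned long)region;
    }

    int main(void)
    {
        /* Simulate usage that dirties the upper half of the region. */
        memset(&region[REGION_WORDS / 2], 0xff, sizeof(region) / 2);

        printf("%lu bytes never used\n", untouched_bytes());
        return 0;
    }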
diff --git a/kernel/fork.c b/kernel/fork.c
index 73ad5cda1b..7332e236d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@
49#include <linux/delayacct.h> 49#include <linux/delayacct.h>
50#include <linux/taskstats_kern.h> 50#include <linux/taskstats_kern.h>
51#include <linux/random.h> 51#include <linux/random.h>
52#include <linux/tty.h>
52 53
53#include <asm/pgtable.h> 54#include <asm/pgtable.h>
54#include <asm/pgalloc.h> 55#include <asm/pgalloc.h>
@@ -136,7 +137,7 @@ void __init fork_init(unsigned long mempages)
136 /* create a slab on which task_structs can be allocated */ 137 /* create a slab on which task_structs can be allocated */
137 task_struct_cachep = 138 task_struct_cachep =
138 kmem_cache_create("task_struct", sizeof(struct task_struct), 139 kmem_cache_create("task_struct", sizeof(struct task_struct),
139 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); 140 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
140#endif 141#endif
141 142
142 /* 143 /*
@@ -333,6 +334,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
333 atomic_set(&mm->mm_count, 1); 334 atomic_set(&mm->mm_count, 1);
334 init_rwsem(&mm->mmap_sem); 335 init_rwsem(&mm->mmap_sem);
335 INIT_LIST_HEAD(&mm->mmlist); 336 INIT_LIST_HEAD(&mm->mmlist);
337 mm->flags = (current->mm) ? current->mm->flags
338 : MMF_DUMP_FILTER_DEFAULT;
336 mm->core_waiters = 0; 339 mm->core_waiters = 0;
337 mm->nr_ptes = 0; 340 mm->nr_ptes = 0;
338 set_mm_counter(mm, file_rss, 0); 341 set_mm_counter(mm, file_rss, 0);
@@ -877,7 +880,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
877 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 880 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
878 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 881 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
879 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 882 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
880 sig->sched_time = 0; 883 sig->sum_sched_runtime = 0;
881 INIT_LIST_HEAD(&sig->cpu_timers[0]); 884 INIT_LIST_HEAD(&sig->cpu_timers[0]);
882 INIT_LIST_HEAD(&sig->cpu_timers[1]); 885 INIT_LIST_HEAD(&sig->cpu_timers[1]);
883 INIT_LIST_HEAD(&sig->cpu_timers[2]); 886 INIT_LIST_HEAD(&sig->cpu_timers[2]);
@@ -897,6 +900,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
897 } 900 }
898 acct_init_pacct(&sig->pacct); 901 acct_init_pacct(&sig->pacct);
899 902
903 tty_audit_fork(sig);
904
900 return 0; 905 return 0;
901} 906}
902 907
@@ -920,7 +925,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
920{ 925{
921 unsigned long new_flags = p->flags; 926 unsigned long new_flags = p->flags;
922 927
923 new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); 928 new_flags &= ~PF_SUPERPRIV;
924 new_flags |= PF_FORKNOEXEC; 929 new_flags |= PF_FORKNOEXEC;
925 if (!(clone_flags & CLONE_PTRACE)) 930 if (!(clone_flags & CLONE_PTRACE))
926 p->ptrace = 0; 931 p->ptrace = 0;
@@ -999,7 +1004,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
999 if (atomic_read(&p->user->processes) >= 1004 if (atomic_read(&p->user->processes) >=
1000 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1005 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
1001 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1006 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1002 p->user != &root_user) 1007 p->user != current->nsproxy->user_ns->root_user)
1003 goto bad_fork_free; 1008 goto bad_fork_free;
1004 } 1009 }
1005 1010
@@ -1040,7 +1045,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1040 1045
1041 p->utime = cputime_zero; 1046 p->utime = cputime_zero;
1042 p->stime = cputime_zero; 1047 p->stime = cputime_zero;
1043 p->sched_time = 0; 1048
1044#ifdef CONFIG_TASK_XACCT 1049#ifdef CONFIG_TASK_XACCT
1045 p->rchar = 0; /* I/O counter: bytes read */ 1050 p->rchar = 0; /* I/O counter: bytes read */
1046 p->wchar = 0; /* I/O counter: bytes written */ 1051 p->wchar = 0; /* I/O counter: bytes written */
@@ -1059,6 +1064,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1059 1064
1060 p->lock_depth = -1; /* -1 = no lock */ 1065 p->lock_depth = -1; /* -1 = no lock */
1061 do_posix_clock_monotonic_gettime(&p->start_time); 1066 do_posix_clock_monotonic_gettime(&p->start_time);
1067 p->real_start_time = p->start_time;
1068 monotonic_to_bootbased(&p->real_start_time);
1062 p->security = NULL; 1069 p->security = NULL;
1063 p->io_context = NULL; 1070 p->io_context = NULL;
1064 p->io_wait = NULL; 1071 p->io_wait = NULL;
@@ -1439,22 +1446,22 @@ void __init proc_caches_init(void)
1439 sighand_cachep = kmem_cache_create("sighand_cache", 1446 sighand_cachep = kmem_cache_create("sighand_cache",
1440 sizeof(struct sighand_struct), 0, 1447 sizeof(struct sighand_struct), 0,
1441 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, 1448 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
1442 sighand_ctor, NULL); 1449 sighand_ctor);
1443 signal_cachep = kmem_cache_create("signal_cache", 1450 signal_cachep = kmem_cache_create("signal_cache",
1444 sizeof(struct signal_struct), 0, 1451 sizeof(struct signal_struct), 0,
1445 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1452 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1446 files_cachep = kmem_cache_create("files_cache", 1453 files_cachep = kmem_cache_create("files_cache",
1447 sizeof(struct files_struct), 0, 1454 sizeof(struct files_struct), 0,
1448 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1455 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1449 fs_cachep = kmem_cache_create("fs_cache", 1456 fs_cachep = kmem_cache_create("fs_cache",
1450 sizeof(struct fs_struct), 0, 1457 sizeof(struct fs_struct), 0,
1451 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1458 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1452 vm_area_cachep = kmem_cache_create("vm_area_struct", 1459 vm_area_cachep = kmem_cache_create("vm_area_struct",
1453 sizeof(struct vm_area_struct), 0, 1460 sizeof(struct vm_area_struct), 0,
1454 SLAB_PANIC, NULL, NULL); 1461 SLAB_PANIC, NULL);
1455 mm_cachep = kmem_cache_create("mm_struct", 1462 mm_cachep = kmem_cache_create("mm_struct",
1456 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1463 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1457 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 1464 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1458} 1465}
1459 1466
1460/* 1467/*
@@ -1601,7 +1608,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1601 err = -EINVAL; 1608 err = -EINVAL;
1602 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1609 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1603 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1610 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1604 CLONE_NEWUTS|CLONE_NEWIPC)) 1611 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER))
1605 goto bad_unshare_out; 1612 goto bad_unshare_out;
1606 1613
1607 if ((err = unshare_thread(unshare_flags))) 1614 if ((err = unshare_thread(unshare_flags)))
diff --git a/kernel/futex.c b/kernel/futex.c
index 45490bec58..a12425051e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -121,6 +121,24 @@ static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
121static struct vfsmount *futex_mnt; 121static struct vfsmount *futex_mnt;
122 122
123/* 123/*
124 * Take mm->mmap_sem, when futex is shared
125 */
126static inline void futex_lock_mm(struct rw_semaphore *fshared)
127{
128 if (fshared)
129 down_read(fshared);
130}
131
132/*
133 * Release mm->mmap_sem, when the futex is shared
134 */
135static inline void futex_unlock_mm(struct rw_semaphore *fshared)
136{
137 if (fshared)
138 up_read(fshared);
139}
140
141/*
124 * We hash on the keys returned from get_futex_key (see below). 142 * We hash on the keys returned from get_futex_key (see below).
125 */ 143 */
126static struct futex_hash_bucket *hash_futex(union futex_key *key) 144static struct futex_hash_bucket *hash_futex(union futex_key *key)
@@ -287,7 +305,18 @@ void drop_futex_key_refs(union futex_key *key)
287} 305}
288EXPORT_SYMBOL_GPL(drop_futex_key_refs); 306EXPORT_SYMBOL_GPL(drop_futex_key_refs);
289 307
290static inline int get_futex_value_locked(u32 *dest, u32 __user *from) 308static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
309{
310 u32 curval;
311
312 pagefault_disable();
313 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
314 pagefault_enable();
315
316 return curval;
317}
318
319static int get_futex_value_locked(u32 *dest, u32 __user *from)
291{ 320{
292 int ret; 321 int ret;
293 322
@@ -317,15 +346,20 @@ static int futex_handle_fault(unsigned long address,
317 vma = find_vma(mm, address); 346 vma = find_vma(mm, address);
318 if (vma && address >= vma->vm_start && 347 if (vma && address >= vma->vm_start &&
319 (vma->vm_flags & VM_WRITE)) { 348 (vma->vm_flags & VM_WRITE)) {
320 switch (handle_mm_fault(mm, vma, address, 1)) { 349 int fault;
321 case VM_FAULT_MINOR: 350 fault = handle_mm_fault(mm, vma, address, 1);
322 ret = 0; 351 if (unlikely((fault & VM_FAULT_ERROR))) {
323 current->min_flt++; 352#if 0
324 break; 353 /* XXX: let's do this when we verify it is OK */
325 case VM_FAULT_MAJOR: 354 if (ret & VM_FAULT_OOM)
355 ret = -ENOMEM;
356#endif
357 } else {
326 ret = 0; 358 ret = 0;
327 current->maj_flt++; 359 if (fault & VM_FAULT_MAJOR)
328 break; 360 current->maj_flt++;
361 else
362 current->min_flt++;
329 } 363 }
330 } 364 }
331 if (!fshared) 365 if (!fshared)
@@ -620,9 +654,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
620 654
621 newval = FUTEX_WAITERS | new_owner->pid; 655 newval = FUTEX_WAITERS | new_owner->pid;
622 656
623 pagefault_disable(); 657 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
624 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
625 pagefault_enable();
626 658
627 if (curval == -EFAULT) 659 if (curval == -EFAULT)
628 ret = -EFAULT; 660 ret = -EFAULT;
@@ -659,9 +691,7 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
659 * There is no waiter, so we unlock the futex. The owner died 691 * There is no waiter, so we unlock the futex. The owner died
660 * bit has not to be preserved here. We are the owner: 692 * bit has not to be preserved here. We are the owner:
661 */ 693 */
662 pagefault_disable(); 694 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
663 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
664 pagefault_enable();
665 695
666 if (oldval == -EFAULT) 696 if (oldval == -EFAULT)
667 return oldval; 697 return oldval;
@@ -700,8 +730,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
700 union futex_key key; 730 union futex_key key;
701 int ret; 731 int ret;
702 732
703 if (fshared) 733 futex_lock_mm(fshared);
704 down_read(fshared);
705 734
706 ret = get_futex_key(uaddr, fshared, &key); 735 ret = get_futex_key(uaddr, fshared, &key);
707 if (unlikely(ret != 0)) 736 if (unlikely(ret != 0))
@@ -725,8 +754,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
725 754
726 spin_unlock(&hb->lock); 755 spin_unlock(&hb->lock);
727out: 756out:
728 if (fshared) 757 futex_unlock_mm(fshared);
729 up_read(fshared);
730 return ret; 758 return ret;
731} 759}
732 760
@@ -746,8 +774,7 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
746 int ret, op_ret, attempt = 0; 774 int ret, op_ret, attempt = 0;
747 775
748retryfull: 776retryfull:
749 if (fshared) 777 futex_lock_mm(fshared);
750 down_read(fshared);
751 778
752 ret = get_futex_key(uaddr1, fshared, &key1); 779 ret = get_futex_key(uaddr1, fshared, &key1);
753 if (unlikely(ret != 0)) 780 if (unlikely(ret != 0))
@@ -793,7 +820,7 @@ retry:
793 */ 820 */
794 if (attempt++) { 821 if (attempt++) {
795 ret = futex_handle_fault((unsigned long)uaddr2, 822 ret = futex_handle_fault((unsigned long)uaddr2,
796 fshared, attempt); 823 fshared, attempt);
797 if (ret) 824 if (ret)
798 goto out; 825 goto out;
799 goto retry; 826 goto retry;
@@ -803,8 +830,7 @@ retry:
803 * If we would have faulted, release mmap_sem, 830 * If we would have faulted, release mmap_sem,
804 * fault it in and start all over again. 831 * fault it in and start all over again.
805 */ 832 */
806 if (fshared) 833 futex_unlock_mm(fshared);
807 up_read(fshared);
808 834
809 ret = get_user(dummy, uaddr2); 835 ret = get_user(dummy, uaddr2);
810 if (ret) 836 if (ret)
@@ -841,8 +867,8 @@ retry:
841 if (hb1 != hb2) 867 if (hb1 != hb2)
842 spin_unlock(&hb2->lock); 868 spin_unlock(&hb2->lock);
843out: 869out:
844 if (fshared) 870 futex_unlock_mm(fshared);
845 up_read(fshared); 871
846 return ret; 872 return ret;
847} 873}
848 874
@@ -861,8 +887,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
861 int ret, drop_count = 0; 887 int ret, drop_count = 0;
862 888
863 retry: 889 retry:
864 if (fshared) 890 futex_lock_mm(fshared);
865 down_read(fshared);
866 891
867 ret = get_futex_key(uaddr1, fshared, &key1); 892 ret = get_futex_key(uaddr1, fshared, &key1);
868 if (unlikely(ret != 0)) 893 if (unlikely(ret != 0))
@@ -890,8 +915,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
890 * If we would have faulted, release mmap_sem, fault 915 * If we would have faulted, release mmap_sem, fault
891 * it in and start all over again. 916 * it in and start all over again.
892 */ 917 */
893 if (fshared) 918 futex_unlock_mm(fshared);
894 up_read(fshared);
895 919
896 ret = get_user(curval, uaddr1); 920 ret = get_user(curval, uaddr1);
897 921
@@ -944,8 +968,7 @@ out_unlock:
944 drop_futex_key_refs(&key1); 968 drop_futex_key_refs(&key1);
945 969
946out: 970out:
947 if (fshared) 971 futex_unlock_mm(fshared);
948 up_read(fshared);
949 return ret; 972 return ret;
950} 973}
951 974
@@ -1113,10 +1136,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1113 while (!ret) { 1136 while (!ret) {
1114 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1137 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1115 1138
1116 pagefault_disable(); 1139 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1117 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1118 uval, newval);
1119 pagefault_enable();
1120 1140
1121 if (curval == -EFAULT) 1141 if (curval == -EFAULT)
1122 ret = -EFAULT; 1142 ret = -EFAULT;
@@ -1134,6 +1154,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1134#define ARG3_SHARED 1 1154#define ARG3_SHARED 1
1135 1155
1136static long futex_wait_restart(struct restart_block *restart); 1156static long futex_wait_restart(struct restart_block *restart);
1157
1137static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1158static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1138 u32 val, ktime_t *abs_time) 1159 u32 val, ktime_t *abs_time)
1139{ 1160{
@@ -1148,8 +1169,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1148 1169
1149 q.pi_state = NULL; 1170 q.pi_state = NULL;
1150 retry: 1171 retry:
1151 if (fshared) 1172 futex_lock_mm(fshared);
1152 down_read(fshared);
1153 1173
1154 ret = get_futex_key(uaddr, fshared, &q.key); 1174 ret = get_futex_key(uaddr, fshared, &q.key);
1155 if (unlikely(ret != 0)) 1175 if (unlikely(ret != 0))
@@ -1186,8 +1206,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1186 * If we would have faulted, release mmap_sem, fault it in and 1206 * If we would have faulted, release mmap_sem, fault it in and
1187 * start all over again. 1207 * start all over again.
1188 */ 1208 */
1189 if (fshared) 1209 futex_unlock_mm(fshared);
1190 up_read(fshared);
1191 1210
1192 ret = get_user(uval, uaddr); 1211 ret = get_user(uval, uaddr);
1193 1212
@@ -1206,8 +1225,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1206 * Now the futex is queued and we have checked the data, we 1225 * Now the futex is queued and we have checked the data, we
1207 * don't want to hold mmap_sem while we sleep. 1226 * don't want to hold mmap_sem while we sleep.
1208 */ 1227 */
1209 if (fshared) 1228 futex_unlock_mm(fshared);
1210 up_read(fshared);
1211 1229
1212 /* 1230 /*
1213 * There might have been scheduling since the queue_me(), as we 1231 * There might have been scheduling since the queue_me(), as we
@@ -1285,8 +1303,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1285 queue_unlock(&q, hb); 1303 queue_unlock(&q, hb);
1286 1304
1287 out_release_sem: 1305 out_release_sem:
1288 if (fshared) 1306 futex_unlock_mm(fshared);
1289 up_read(fshared);
1290 return ret; 1307 return ret;
1291} 1308}
1292 1309
@@ -1333,8 +1350,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1333 1350
1334 q.pi_state = NULL; 1351 q.pi_state = NULL;
1335 retry: 1352 retry:
1336 if (fshared) 1353 futex_lock_mm(fshared);
1337 down_read(fshared);
1338 1354
1339 ret = get_futex_key(uaddr, fshared, &q.key); 1355 ret = get_futex_key(uaddr, fshared, &q.key);
1340 if (unlikely(ret != 0)) 1356 if (unlikely(ret != 0))
@@ -1353,9 +1369,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1353 */ 1369 */
1354 newval = current->pid; 1370 newval = current->pid;
1355 1371
1356 pagefault_disable(); 1372 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1357 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1358 pagefault_enable();
1359 1373
1360 if (unlikely(curval == -EFAULT)) 1374 if (unlikely(curval == -EFAULT))
1361 goto uaddr_faulted; 1375 goto uaddr_faulted;
@@ -1398,9 +1412,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1398 lock_taken = 1; 1412 lock_taken = 1;
1399 } 1413 }
1400 1414
1401 pagefault_disable(); 1415 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1402 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1403 pagefault_enable();
1404 1416
1405 if (unlikely(curval == -EFAULT)) 1417 if (unlikely(curval == -EFAULT))
1406 goto uaddr_faulted; 1418 goto uaddr_faulted;
@@ -1428,8 +1440,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1428 * exit to complete. 1440 * exit to complete.
1429 */ 1441 */
1430 queue_unlock(&q, hb); 1442 queue_unlock(&q, hb);
1431 if (fshared) 1443 futex_unlock_mm(fshared);
1432 up_read(fshared);
1433 cond_resched(); 1444 cond_resched();
1434 goto retry; 1445 goto retry;
1435 1446
@@ -1465,8 +1476,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1465 * Now the futex is queued and we have checked the data, we 1476 * Now the futex is queued and we have checked the data, we
1466 * don't want to hold mmap_sem while we sleep. 1477 * don't want to hold mmap_sem while we sleep.
1467 */ 1478 */
1468 if (fshared) 1479 futex_unlock_mm(fshared);
1469 up_read(fshared);
1470 1480
1471 WARN_ON(!q.pi_state); 1481 WARN_ON(!q.pi_state);
1472 /* 1482 /*
@@ -1480,8 +1490,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1480 ret = ret ? 0 : -EWOULDBLOCK; 1490 ret = ret ? 0 : -EWOULDBLOCK;
1481 } 1491 }
1482 1492
1483 if (fshared) 1493 futex_lock_mm(fshared);
1484 down_read(fshared);
1485 spin_lock(q.lock_ptr); 1494 spin_lock(q.lock_ptr);
1486 1495
1487 if (!ret) { 1496 if (!ret) {
@@ -1518,8 +1527,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1518 1527
1519 /* Unqueue and drop the lock */ 1528 /* Unqueue and drop the lock */
1520 unqueue_me_pi(&q); 1529 unqueue_me_pi(&q);
1521 if (fshared) 1530 futex_unlock_mm(fshared);
1522 up_read(fshared);
1523 1531
1524 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1532 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1525 1533
@@ -1527,8 +1535,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1527 queue_unlock(&q, hb); 1535 queue_unlock(&q, hb);
1528 1536
1529 out_release_sem: 1537 out_release_sem:
1530 if (fshared) 1538 futex_unlock_mm(fshared);
1531 up_read(fshared);
1532 return ret; 1539 return ret;
1533 1540
1534 uaddr_faulted: 1541 uaddr_faulted:
@@ -1550,8 +1557,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1550 goto retry_unlocked; 1557 goto retry_unlocked;
1551 } 1558 }
1552 1559
1553 if (fshared) 1560 futex_unlock_mm(fshared);
1554 up_read(fshared);
1555 1561
1556 ret = get_user(uval, uaddr); 1562 ret = get_user(uval, uaddr);
1557 if (!ret && (uval != -EFAULT)) 1563 if (!ret && (uval != -EFAULT))
@@ -1585,8 +1591,7 @@ retry:
1585 /* 1591 /*
1586 * First take all the futex related locks: 1592 * First take all the futex related locks:
1587 */ 1593 */
1588 if (fshared) 1594 futex_lock_mm(fshared);
1589 down_read(fshared);
1590 1595
1591 ret = get_futex_key(uaddr, fshared, &key); 1596 ret = get_futex_key(uaddr, fshared, &key);
1592 if (unlikely(ret != 0)) 1597 if (unlikely(ret != 0))
@@ -1601,11 +1606,9 @@ retry_unlocked:
1601 * again. If it succeeds then we can return without waking 1606 * again. If it succeeds then we can return without waking
1602 * anyone else up: 1607 * anyone else up:
1603 */ 1608 */
1604 if (!(uval & FUTEX_OWNER_DIED)) { 1609 if (!(uval & FUTEX_OWNER_DIED))
1605 pagefault_disable(); 1610 uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0);
1606 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); 1611
1607 pagefault_enable();
1608 }
1609 1612
1610 if (unlikely(uval == -EFAULT)) 1613 if (unlikely(uval == -EFAULT))
1611 goto pi_faulted; 1614 goto pi_faulted;
@@ -1647,8 +1650,7 @@ retry_unlocked:
1647out_unlock: 1650out_unlock:
1648 spin_unlock(&hb->lock); 1651 spin_unlock(&hb->lock);
1649out: 1652out:
1650 if (fshared) 1653 futex_unlock_mm(fshared);
1651 up_read(fshared);
1652 1654
1653 return ret; 1655 return ret;
1654 1656
@@ -1671,8 +1673,7 @@ pi_faulted:
1671 goto retry_unlocked; 1673 goto retry_unlocked;
1672 } 1674 }
1673 1675
1674 if (fshared) 1676 futex_unlock_mm(fshared);
1675 up_read(fshared);
1676 1677
1677 ret = get_user(uval, uaddr); 1678 ret = get_user(uval, uaddr);
1678 if (!ret && (uval != -EFAULT)) 1679 if (!ret && (uval != -EFAULT))
@@ -1729,8 +1730,8 @@ static int futex_fd(u32 __user *uaddr, int signal)
1729 1730
1730 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { 1731 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
1731 printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " 1732 printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
1732 "will be removed from the kernel in June 2007\n", 1733 "will be removed from the kernel in June 2007\n",
1733 current->comm); 1734 current->comm);
1734 } 1735 }
1735 1736
1736 ret = -EINVAL; 1737 ret = -EINVAL;
@@ -1908,10 +1909,8 @@ retry:
1908 * Wake robust non-PI futexes here. The wakeup of 1909 * Wake robust non-PI futexes here. The wakeup of
1909 * PI futexes happens in exit_pi_state(): 1910 * PI futexes happens in exit_pi_state():
1910 */ 1911 */
1911 if (!pi) { 1912 if (!pi && (uval & FUTEX_WAITERS))
1912 if (uval & FUTEX_WAITERS)
1913 futex_wake(uaddr, &curr->mm->mmap_sem, 1); 1913 futex_wake(uaddr, &curr->mm->mmap_sem, 1);
1914 }
1915 } 1914 }
1916 return 0; 1915 return 0;
1917} 1916}
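
Much of the futex.c churn replaces the repeated "if (fshared) down_read(fshared);" pairs with the futex_lock_mm()/futex_unlock_mm() wrappers, which are no-ops for process-private futexes. A small pthread-based sketch of the same "conditionally shared lock" idiom, with illustrative names:

    /* Sketch: a NULL lock pointer means "private, no locking needed". */
    #include <pthread.h>

    static inline void shared_lock(pthread_rwlock_t *shared)
    {
        if (shared)
            pthread_rwlock_rdlock(shared);
    }

    static inline void shared_unlock(pthread_rwlock_t *shared)
    {
        if (shared)
            pthread_rwlock_unlock(shared);
    }

    static int do_op(pthread_rwlock_t *shared)
    {
        int ret;

        shared_lock(shared);        /* no-op for the private case */
        ret = 0;                    /* ... the real work would go here ... */
        shared_unlock(shared);
        return ret;
    }

    int main(void)
    {
        pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

        do_op(NULL);                /* private: no locking */
        do_op(&lock);               /* shared: read-locked around the work */
        return 0;
    }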
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 23c03f43e1..eb1ddebd2c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -558,7 +558,8 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
558 */ 558 */
559static int hrtimer_switch_to_hres(void) 559static int hrtimer_switch_to_hres(void)
560{ 560{
561 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 561 int cpu = smp_processor_id();
562 struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
562 unsigned long flags; 563 unsigned long flags;
563 564
564 if (base->hres_active) 565 if (base->hres_active)
@@ -568,6 +569,8 @@ static int hrtimer_switch_to_hres(void)
568 569
569 if (tick_init_highres()) { 570 if (tick_init_highres()) {
570 local_irq_restore(flags); 571 local_irq_restore(flags);
572 printk(KERN_WARNING "Could not switch to high resolution "
573 "mode on CPU %d\n", cpu);
571 return 0; 574 return 0;
572 } 575 }
573 base->hres_active = 1; 576 base->hres_active = 1;
@@ -683,6 +686,7 @@ static void enqueue_hrtimer(struct hrtimer *timer,
683 struct rb_node **link = &base->active.rb_node; 686 struct rb_node **link = &base->active.rb_node;
684 struct rb_node *parent = NULL; 687 struct rb_node *parent = NULL;
685 struct hrtimer *entry; 688 struct hrtimer *entry;
689 int leftmost = 1;
686 690
687 /* 691 /*
688 * Find the right place in the rbtree: 692 * Find the right place in the rbtree:
@@ -694,18 +698,19 @@ static void enqueue_hrtimer(struct hrtimer *timer,
694 * We dont care about collisions. Nodes with 698 * We dont care about collisions. Nodes with
695 * the same expiry time stay together. 699 * the same expiry time stay together.
696 */ 700 */
697 if (timer->expires.tv64 < entry->expires.tv64) 701 if (timer->expires.tv64 < entry->expires.tv64) {
698 link = &(*link)->rb_left; 702 link = &(*link)->rb_left;
699 else 703 } else {
700 link = &(*link)->rb_right; 704 link = &(*link)->rb_right;
705 leftmost = 0;
706 }
701 } 707 }
702 708
703 /* 709 /*
704 * Insert the timer to the rbtree and check whether it 710 * Insert the timer to the rbtree and check whether it
705 * replaces the first pending timer 711 * replaces the first pending timer
706 */ 712 */
707 if (!base->first || timer->expires.tv64 < 713 if (leftmost) {
708 rb_entry(base->first, struct hrtimer, node)->expires.tv64) {
709 /* 714 /*
710 * Reprogram the clock event device. When the timer is already 715 * Reprogram the clock event device. When the timer is already
711 * expired hrtimer_enqueue_reprogram has either called the 716 * expired hrtimer_enqueue_reprogram has either called the
@@ -1406,7 +1411,7 @@ static void migrate_hrtimers(int cpu)
1406static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, 1411static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1407 unsigned long action, void *hcpu) 1412 unsigned long action, void *hcpu)
1408{ 1413{
1409 long cpu = (long)hcpu; 1414 unsigned int cpu = (long)hcpu;
1410 1415
1411 switch (action) { 1416 switch (action) {
1412 1417
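
The enqueue_hrtimer() hunk replaces the post-insert comparison against base->first with a leftmost flag maintained during the descent: the new node is the leftmost only if the walk never went right. A standalone sketch of that idea on a plain binary search tree (illustrative types, not the kernel rbtree API):

    /* Sketch: track during the descent whether an insertion lands in the
     * leftmost position, instead of re-checking the tree minimum afterwards. */
    #include <stdio.h>
    #include <stdlib.h>

    struct node {
        long key;
        struct node *left, *right;
    };

    /* Returns 1 if the new key became the smallest element. */
    static int insert(struct node **root, long key)
    {
        struct node **link = root;
        struct node *n;
        int leftmost = 1;

        while (*link) {
            if (key < (*link)->key) {
                link = &(*link)->left;
            } else {
                link = &(*link)->right;
                leftmost = 0;           /* went right at least once */
            }
        }

        n = malloc(sizeof(*n));
        if (!n) {
            perror("malloc");
            exit(1);
        }
        n->key = key;
        n->left = n->right = NULL;
        *link = n;
        return leftmost;
    }

    int main(void)
    {
        struct node *root = NULL;

        printf("%d\n", insert(&root, 10));   /* 1: first node is leftmost */
        printf("%d\n", insert(&root, 20));   /* 0 */
        printf("%d\n", insert(&root, 5));    /* 1: new minimum */
        return 0;
    }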
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index b4f1674fca..50b81b9804 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,15 @@ static struct proc_dir_entry *root_irq_dir;
19static int irq_affinity_read_proc(char *page, char **start, off_t off, 19static int irq_affinity_read_proc(char *page, char **start, off_t off,
20 int count, int *eof, void *data) 20 int count, int *eof, void *data)
21{ 21{
22 int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity); 22 struct irq_desc *desc = irq_desc + (long)data;
23 cpumask_t *mask = &desc->affinity;
24 int len;
25
26#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING)
28 mask = &desc->pending_mask;
29#endif
30 len = cpumask_scnprintf(page, count, *mask);
23 31
24 if (count - len < 2) 32 if (count - len < 2)
25 return -EINVAL; 33 return -EINVAL;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index bd9e272d55..32b161972f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -172,7 +172,17 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
172 irqreturn_t action_ret) 172 irqreturn_t action_ret)
173{ 173{
174 if (unlikely(action_ret != IRQ_HANDLED)) { 174 if (unlikely(action_ret != IRQ_HANDLED)) {
175 desc->irqs_unhandled++; 175 /*
176 * If we are seeing only the odd spurious IRQ caused by
177 * bus asynchronicity then don't eventually trigger an error,
 178 * otherwise the counter becomes a doomsday timer for otherwise
179 * working systems
180 */
181 if (jiffies - desc->last_unhandled > HZ/10)
182 desc->irqs_unhandled = 1;
183 else
184 desc->irqs_unhandled++;
185 desc->last_unhandled = jiffies;
176 if (unlikely(action_ret != IRQ_NONE)) 186 if (unlikely(action_ret != IRQ_NONE))
177 report_bad_irq(irq, desc, action_ret); 187 report_bad_irq(irq, desc, action_ret);
178 } 188 }
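
The note_interrupt() change stops isolated spurious interrupts from slowly accumulating toward the disable threshold: only unhandled interrupts that arrive within HZ/10 of the previous one keep incrementing the counter. A standalone sketch of that decay rule, with an assumed HZ value and a hypothetical helper name:

/* Sketch of the decay rule: sparse spurious IRQs reset the count to 1,
 * only a tight burst keeps it growing. */
#include <stdio.h>

#define HZ 1000	/* assumed tick rate for the example */

static unsigned long irqs_unhandled;
static unsigned long last_unhandled;

static void note_unhandled(unsigned long jiffies_now)
{
	if (jiffies_now - last_unhandled > HZ / 10)
		irqs_unhandled = 1;	/* stale: restart the count */
	else
		irqs_unhandled++;	/* part of a burst: keep counting */
	last_unhandled = jiffies_now;
}

int main(void)
{
	unsigned long t = 0;

	/* ten spurious IRQs spaced a full second apart never accumulate */
	for (int i = 0; i < 10; i++) {
		t += HZ;
		note_unhandled(t);
	}
	printf("sparse: %lu\n", irqs_unhandled);	/* prints 1 */

	/* a tight burst does accumulate */
	for (int i = 0; i < 10; i++) {
		t += 1;
		note_unhandled(t);
	}
	printf("burst: %lu\n", irqs_unhandled);		/* prints 11 */
	return 0;
}
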
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fed5441862..474219a419 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -152,7 +152,7 @@ static unsigned int get_symbol_offset(unsigned long pos)
152/* Lookup the address for this symbol. Returns 0 if not found. */ 152/* Lookup the address for this symbol. Returns 0 if not found. */
153unsigned long kallsyms_lookup_name(const char *name) 153unsigned long kallsyms_lookup_name(const char *name)
154{ 154{
155 char namebuf[KSYM_NAME_LEN+1]; 155 char namebuf[KSYM_NAME_LEN];
156 unsigned long i; 156 unsigned long i;
157 unsigned int off; 157 unsigned int off;
158 158
@@ -248,7 +248,7 @@ const char *kallsyms_lookup(unsigned long addr,
248{ 248{
249 const char *msym; 249 const char *msym;
250 250
251 namebuf[KSYM_NAME_LEN] = 0; 251 namebuf[KSYM_NAME_LEN - 1] = 0;
252 namebuf[0] = 0; 252 namebuf[0] = 0;
253 253
254 if (is_ksym_addr(addr)) { 254 if (is_ksym_addr(addr)) {
@@ -265,7 +265,7 @@ const char *kallsyms_lookup(unsigned long addr,
265 /* see if it's in a module */ 265 /* see if it's in a module */
266 msym = module_address_lookup(addr, symbolsize, offset, modname); 266 msym = module_address_lookup(addr, symbolsize, offset, modname);
267 if (msym) 267 if (msym)
268 return strncpy(namebuf, msym, KSYM_NAME_LEN); 268 return strncpy(namebuf, msym, KSYM_NAME_LEN - 1);
269 269
270 return NULL; 270 return NULL;
271} 271}
@@ -273,7 +273,7 @@ const char *kallsyms_lookup(unsigned long addr,
273int lookup_symbol_name(unsigned long addr, char *symname) 273int lookup_symbol_name(unsigned long addr, char *symname)
274{ 274{
275 symname[0] = '\0'; 275 symname[0] = '\0';
276 symname[KSYM_NAME_LEN] = '\0'; 276 symname[KSYM_NAME_LEN - 1] = '\0';
277 277
278 if (is_ksym_addr(addr)) { 278 if (is_ksym_addr(addr)) {
279 unsigned long pos; 279 unsigned long pos;
@@ -291,7 +291,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
291 unsigned long *offset, char *modname, char *name) 291 unsigned long *offset, char *modname, char *name)
292{ 292{
293 name[0] = '\0'; 293 name[0] = '\0';
294 name[KSYM_NAME_LEN] = '\0'; 294 name[KSYM_NAME_LEN - 1] = '\0';
295 295
296 if (is_ksym_addr(addr)) { 296 if (is_ksym_addr(addr)) {
297 unsigned long pos; 297 unsigned long pos;
@@ -312,18 +312,17 @@ int sprint_symbol(char *buffer, unsigned long address)
312 char *modname; 312 char *modname;
313 const char *name; 313 const char *name;
314 unsigned long offset, size; 314 unsigned long offset, size;
315 char namebuf[KSYM_NAME_LEN+1]; 315 char namebuf[KSYM_NAME_LEN];
316 316
317 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); 317 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
318 if (!name) 318 if (!name)
319 return sprintf(buffer, "0x%lx", address); 319 return sprintf(buffer, "0x%lx", address);
320 else { 320
321 if (modname) 321 if (modname)
322 return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, 322 return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
323 size, modname); 323 size, modname);
324 else 324 else
325 return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); 325 return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
326 }
327} 326}
328 327
329/* Look up a kernel symbol and print it to the kernel messages. */ 328/* Look up a kernel symbol and print it to the kernel messages. */
@@ -343,8 +342,8 @@ struct kallsym_iter
343 unsigned long value; 342 unsigned long value;
344 unsigned int nameoff; /* If iterating in core kernel symbols */ 343 unsigned int nameoff; /* If iterating in core kernel symbols */
345 char type; 344 char type;
346 char name[KSYM_NAME_LEN+1]; 345 char name[KSYM_NAME_LEN];
347 char module_name[MODULE_NAME_LEN + 1]; 346 char module_name[MODULE_NAME_LEN];
348 int exported; 347 int exported;
349}; 348};
350 349
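
The kallsyms hunks above switch to the convention that KSYM_NAME_LEN already includes the terminating NUL, so buffers are declared with exactly KSYM_NAME_LEN bytes, at most KSYM_NAME_LEN - 1 characters are copied, and the terminator is written at index KSYM_NAME_LEN - 1. A self-contained sketch of that convention; NAME_LEN and copy_symbol_name() are illustrative stand-ins, not kernel names.

/* Sketch of the "length includes the NUL" buffer convention. */
#include <stdio.h>
#include <string.h>

#define NAME_LEN 16	/* stand-in for KSYM_NAME_LEN */

static void copy_symbol_name(char *dst, const char *src)
{
	strncpy(dst, src, NAME_LEN - 1);
	dst[NAME_LEN - 1] = '\0';	/* always terminated, never overruns */
}

int main(void)
{
	char buf[NAME_LEN];

	copy_symbol_name(buf, "a_very_long_symbol_name_indeed");
	printf("%s (%zu chars)\n", buf, strlen(buf));
	return 0;
}
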
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index cee419143f..bc41ad0f24 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -24,6 +24,7 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/kfifo.h> 26#include <linux/kfifo.h>
27#include <linux/log2.h>
27 28
28/** 29/**
29 * kfifo_init - allocates a new FIFO using a preallocated buffer 30 * kfifo_init - allocates a new FIFO using a preallocated buffer
@@ -41,7 +42,7 @@ struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
41 struct kfifo *fifo; 42 struct kfifo *fifo;
42 43
43 /* size must be a power of 2 */ 44 /* size must be a power of 2 */
44 BUG_ON(size & (size - 1)); 45 BUG_ON(!is_power_of_2(size));
45 46
46 fifo = kmalloc(sizeof(struct kfifo), gfp_mask); 47 fifo = kmalloc(sizeof(struct kfifo), gfp_mask);
47 if (!fifo) 48 if (!fifo)
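
The kfifo_init() check above is equivalent to the old open-coded test except that it also rejects a size of zero, which `size & (size - 1)` silently accepts. A userspace sketch mirroring what is_power_of_2() is assumed to do (definition based on linux/log2.h):

/* Sketch: non-zero and at most one bit set. */
#include <stdbool.h>
#include <stdio.h>

static bool is_power_of_2_like(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	unsigned long sizes[] = { 0, 1, 24, 4096 };

	for (int i = 0; i < 4; i++)
		printf("%lu -> %s\n", sizes[i],
		       is_power_of_2_like(sizes[i]) ? "ok" : "rejected");
	return 0;
}
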
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 4d32eb0771..beedbdc646 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -33,6 +33,8 @@
33#include <linux/kernel.h> 33#include <linux/kernel.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/resource.h> 35#include <linux/resource.h>
36#include <linux/notifier.h>
37#include <linux/suspend.h>
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
37 39
38extern int max_threads; 40extern int max_threads;
@@ -119,9 +121,10 @@ struct subprocess_info {
119 char **argv; 121 char **argv;
120 char **envp; 122 char **envp;
121 struct key *ring; 123 struct key *ring;
122 int wait; 124 enum umh_wait wait;
123 int retval; 125 int retval;
124 struct file *stdin; 126 struct file *stdin;
127 void (*cleanup)(char **argv, char **envp);
125}; 128};
126 129
127/* 130/*
@@ -180,6 +183,14 @@ static int ____call_usermodehelper(void *data)
180 do_exit(0); 183 do_exit(0);
181} 184}
182 185
186void call_usermodehelper_freeinfo(struct subprocess_info *info)
187{
188 if (info->cleanup)
189 (*info->cleanup)(info->argv, info->envp);
190 kfree(info);
191}
192EXPORT_SYMBOL(call_usermodehelper_freeinfo);
193
183/* Keventd can't block, but this (a child) can. */ 194/* Keventd can't block, but this (a child) can. */
184static int wait_for_helper(void *data) 195static int wait_for_helper(void *data)
185{ 196{
@@ -216,8 +227,8 @@ static int wait_for_helper(void *data)
216 sub_info->retval = ret; 227 sub_info->retval = ret;
217 } 228 }
218 229
219 if (sub_info->wait < 0) 230 if (sub_info->wait == UMH_NO_WAIT)
220 kfree(sub_info); 231 call_usermodehelper_freeinfo(sub_info);
221 else 232 else
222 complete(sub_info->complete); 233 complete(sub_info->complete);
223 return 0; 234 return 0;
@@ -229,34 +240,204 @@ static void __call_usermodehelper(struct work_struct *work)
229 struct subprocess_info *sub_info = 240 struct subprocess_info *sub_info =
230 container_of(work, struct subprocess_info, work); 241 container_of(work, struct subprocess_info, work);
231 pid_t pid; 242 pid_t pid;
232 int wait = sub_info->wait; 243 enum umh_wait wait = sub_info->wait;
233 244
234 /* CLONE_VFORK: wait until the usermode helper has execve'd 245 /* CLONE_VFORK: wait until the usermode helper has execve'd
235 * successfully We need the data structures to stay around 246 * successfully We need the data structures to stay around
236 * until that is done. */ 247 * until that is done. */
237 if (wait) 248 if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT)
238 pid = kernel_thread(wait_for_helper, sub_info, 249 pid = kernel_thread(wait_for_helper, sub_info,
239 CLONE_FS | CLONE_FILES | SIGCHLD); 250 CLONE_FS | CLONE_FILES | SIGCHLD);
240 else 251 else
241 pid = kernel_thread(____call_usermodehelper, sub_info, 252 pid = kernel_thread(____call_usermodehelper, sub_info,
242 CLONE_VFORK | SIGCHLD); 253 CLONE_VFORK | SIGCHLD);
243 254
244 if (wait < 0) 255 switch (wait) {
245 return; 256 case UMH_NO_WAIT:
257 break;
246 258
247 if (pid < 0) { 259 case UMH_WAIT_PROC:
260 if (pid > 0)
261 break;
248 sub_info->retval = pid; 262 sub_info->retval = pid;
263 /* FALLTHROUGH */
264
265 case UMH_WAIT_EXEC:
249 complete(sub_info->complete); 266 complete(sub_info->complete);
250 } else if (!wait) 267 }
251 complete(sub_info->complete); 268}
269
270#ifdef CONFIG_PM
271/*
272 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
273 * (used for preventing user land processes from being created after the user
274 * land has been frozen during a system-wide hibernation or suspend operation).
275 */
276static int usermodehelper_disabled;
277
278/* Number of helpers running */
279static atomic_t running_helpers = ATOMIC_INIT(0);
280
281/*
282 * Wait queue head used by usermodehelper_pm_callback() to wait for all running
283 * helpers to finish.
284 */
285static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
286
287/*
288 * Time to wait for running_helpers to become zero before the setting of
289 * usermodehelper_disabled in usermodehelper_pm_callback() fails
290 */
291#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
292
293static int usermodehelper_pm_callback(struct notifier_block *nfb,
294 unsigned long action,
295 void *ignored)
296{
297 long retval;
298
299 switch (action) {
300 case PM_HIBERNATION_PREPARE:
301 case PM_SUSPEND_PREPARE:
302 usermodehelper_disabled = 1;
303 smp_mb();
304 /*
305 * From now on call_usermodehelper_exec() won't start any new
306 * helpers, so it is sufficient if running_helpers turns out to
307 * be zero at one point (it may be increased later, but that
308 * doesn't matter).
309 */
310 retval = wait_event_timeout(running_helpers_waitq,
311 atomic_read(&running_helpers) == 0,
312 RUNNING_HELPERS_TIMEOUT);
313 if (retval) {
314 return NOTIFY_OK;
315 } else {
316 usermodehelper_disabled = 0;
317 return NOTIFY_BAD;
318 }
319 case PM_POST_HIBERNATION:
320 case PM_POST_SUSPEND:
321 usermodehelper_disabled = 0;
322 return NOTIFY_OK;
323 }
324
325 return NOTIFY_DONE;
326}
327
328static void helper_lock(void)
329{
330 atomic_inc(&running_helpers);
331 smp_mb__after_atomic_inc();
332}
333
334static void helper_unlock(void)
335{
336 if (atomic_dec_and_test(&running_helpers))
337 wake_up(&running_helpers_waitq);
338}
339
340static void register_pm_notifier_callback(void)
341{
342 pm_notifier(usermodehelper_pm_callback, 0);
252} 343}
344#else /* CONFIG_PM */
345#define usermodehelper_disabled 0
346
347static inline void helper_lock(void) {}
348static inline void helper_unlock(void) {}
349static inline void register_pm_notifier_callback(void) {}
350#endif /* CONFIG_PM */
253 351
254/** 352/**
255 * call_usermodehelper_keys - start a usermode application 353 * call_usermodehelper_setup - prepare to call a usermode helper
256 * @path: pathname for the application 354 * @path - path to usermode executable
257 * @argv: null-terminated argument list 355 * @argv - arg vector for process
258 * @envp: null-terminated environment list 356 * @envp - environment for process
259 * @session_keyring: session keyring for process (NULL for an empty keyring) 357 *
358 * Returns either NULL on allocation failure, or a subprocess_info
359 * structure. This should be passed to call_usermodehelper_exec to
360 * exec the process and free the structure.
361 */
362struct subprocess_info *call_usermodehelper_setup(char *path,
363 char **argv, char **envp)
364{
365 struct subprocess_info *sub_info;
366 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
367 if (!sub_info)
368 goto out;
369
370 INIT_WORK(&sub_info->work, __call_usermodehelper);
371 sub_info->path = path;
372 sub_info->argv = argv;
373 sub_info->envp = envp;
374
375 out:
376 return sub_info;
377}
378EXPORT_SYMBOL(call_usermodehelper_setup);
379
380/**
381 * call_usermodehelper_setkeys - set the session keys for usermode helper
382 * @info: a subprocess_info returned by call_usermodehelper_setup
383 * @session_keyring: the session keyring for the process
384 */
385void call_usermodehelper_setkeys(struct subprocess_info *info,
386 struct key *session_keyring)
387{
388 info->ring = session_keyring;
389}
390EXPORT_SYMBOL(call_usermodehelper_setkeys);
391
392/**
393 * call_usermodehelper_setcleanup - set a cleanup function
394 * @info: a subprocess_info returned by call_usermodehelper_setup
395 * @cleanup: a cleanup function
396 *
 397 * The cleanup function is called just before the subprocess_info is
 398 * freed. This can be used for freeing the argv and envp. The
 399 * function must be runnable in either a process context or the
400 * context in which call_usermodehelper_exec is called.
401 */
402void call_usermodehelper_setcleanup(struct subprocess_info *info,
403 void (*cleanup)(char **argv, char **envp))
404{
405 info->cleanup = cleanup;
406}
407EXPORT_SYMBOL(call_usermodehelper_setcleanup);
408
409/**
410 * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
411 * @sub_info: a subprocess_info returned by call_usermodehelper_setup
412 * @filp: set to the write-end of a pipe
413 *
414 * This constructs a pipe, and sets the read end to be the stdin of the
415 * subprocess, and returns the write-end in *@filp.
416 */
417int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
418 struct file **filp)
419{
420 struct file *f;
421
422 f = create_write_pipe();
423 if (IS_ERR(f))
424 return PTR_ERR(f);
425 *filp = f;
426
427 f = create_read_pipe(f);
428 if (IS_ERR(f)) {
429 free_write_pipe(*filp);
430 return PTR_ERR(f);
431 }
432 sub_info->stdin = f;
433
434 return 0;
435}
436EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
437
438/**
439 * call_usermodehelper_exec - start a usermode application
 440 * @sub_info: information about the subprocess
260 * @wait: wait for the application to finish and return status. 441 * @wait: wait for the application to finish and return status.
261 * when -1 don't wait at all, but you get no useful error back when 442 * when -1 don't wait at all, but you get no useful error back when
262 * the program couldn't be exec'ed. This makes it safe to call 443 * the program couldn't be exec'ed. This makes it safe to call
@@ -265,81 +446,70 @@ static void __call_usermodehelper(struct work_struct *work)
265 * Runs a user-space application. The application is started 446 * Runs a user-space application. The application is started
266 * asynchronously if wait is not set, and runs as a child of keventd. 447 * asynchronously if wait is not set, and runs as a child of keventd.
267 * (ie. it runs with full root capabilities). 448 * (ie. it runs with full root capabilities).
268 *
269 * Must be called from process context. Returns a negative error code
270 * if program was not execed successfully, or 0.
271 */ 449 */
272int call_usermodehelper_keys(char *path, char **argv, char **envp, 450int call_usermodehelper_exec(struct subprocess_info *sub_info,
273 struct key *session_keyring, int wait) 451 enum umh_wait wait)
274{ 452{
275 DECLARE_COMPLETION_ONSTACK(done); 453 DECLARE_COMPLETION_ONSTACK(done);
276 struct subprocess_info *sub_info;
277 int retval; 454 int retval;
278 455
279 if (!khelper_wq) 456 helper_lock();
280 return -EBUSY; 457 if (sub_info->path[0] == '\0') {
281 458 retval = 0;
282 if (path[0] == '\0') 459 goto out;
283 return 0; 460 }
284 461
285 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); 462 if (!khelper_wq || usermodehelper_disabled) {
286 if (!sub_info) 463 retval = -EBUSY;
287 return -ENOMEM; 464 goto out;
465 }
288 466
289 INIT_WORK(&sub_info->work, __call_usermodehelper);
290 sub_info->complete = &done; 467 sub_info->complete = &done;
291 sub_info->path = path;
292 sub_info->argv = argv;
293 sub_info->envp = envp;
294 sub_info->ring = session_keyring;
295 sub_info->wait = wait; 468 sub_info->wait = wait;
296 469
297 queue_work(khelper_wq, &sub_info->work); 470 queue_work(khelper_wq, &sub_info->work);
298 if (wait < 0) /* task has freed sub_info */ 471 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
299 return 0; 472 return 0;
300 wait_for_completion(&done); 473 wait_for_completion(&done);
301 retval = sub_info->retval; 474 retval = sub_info->retval;
302 kfree(sub_info); 475
476 out:
477 call_usermodehelper_freeinfo(sub_info);
478 helper_unlock();
303 return retval; 479 return retval;
304} 480}
305EXPORT_SYMBOL(call_usermodehelper_keys); 481EXPORT_SYMBOL(call_usermodehelper_exec);
306 482
483/**
484 * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
485 * @path: path to usermode executable
486 * @argv: arg vector for process
487 * @envp: environment for process
488 * @filp: set to the write-end of a pipe
489 *
490 * This is a simple wrapper which executes a usermode-helper function
491 * with a pipe as stdin. It is implemented entirely in terms of
492 * lower-level call_usermodehelper_* functions.
493 */
307int call_usermodehelper_pipe(char *path, char **argv, char **envp, 494int call_usermodehelper_pipe(char *path, char **argv, char **envp,
308 struct file **filp) 495 struct file **filp)
309{ 496{
310 DECLARE_COMPLETION(done); 497 struct subprocess_info *sub_info;
311 struct subprocess_info sub_info = { 498 int ret;
312 .work = __WORK_INITIALIZER(sub_info.work,
313 __call_usermodehelper),
314 .complete = &done,
315 .path = path,
316 .argv = argv,
317 .envp = envp,
318 .retval = 0,
319 };
320 struct file *f;
321
322 if (!khelper_wq)
323 return -EBUSY;
324 499
325 if (path[0] == '\0') 500 sub_info = call_usermodehelper_setup(path, argv, envp);
326 return 0; 501 if (sub_info == NULL)
502 return -ENOMEM;
327 503
328 f = create_write_pipe(); 504 ret = call_usermodehelper_stdinpipe(sub_info, filp);
329 if (IS_ERR(f)) 505 if (ret < 0)
330 return PTR_ERR(f); 506 goto out;
331 *filp = f;
332 507
333 f = create_read_pipe(f); 508 return call_usermodehelper_exec(sub_info, 1);
334 if (IS_ERR(f)) {
335 free_write_pipe(*filp);
336 return PTR_ERR(f);
337 }
338 sub_info.stdin = f;
339 509
340 queue_work(khelper_wq, &sub_info.work); 510 out:
341 wait_for_completion(&done); 511 call_usermodehelper_freeinfo(sub_info);
342 return sub_info.retval; 512 return ret;
343} 513}
344EXPORT_SYMBOL(call_usermodehelper_pipe); 514EXPORT_SYMBOL(call_usermodehelper_pipe);
345 515
@@ -347,4 +517,5 @@ void __init usermodehelper_init(void)
347{ 517{
348 khelper_wq = create_singlethread_workqueue("khelper"); 518 khelper_wq = create_singlethread_workqueue("khelper");
349 BUG_ON(!khelper_wq); 519 BUG_ON(!khelper_wq);
520 register_pm_notifier_callback();
350} 521}
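
Taken together, the kmod changes split the old single-call interface into setup, optional configuration, and exec steps, with call_usermodehelper_exec() freeing the subprocess_info on every path except UMH_NO_WAIT (where the worker thread frees it). Below is a hedged, kernel-context sketch of a caller; the helper path, its arguments and the function name are invented for illustration, error handling is minimal, and the declarations are assumed to come from <linux/kmod.h>.

/* Kernel-context sketch only; not compilable outside the kernel tree. */
static int run_example_helper(void)
{
	struct subprocess_info *info;
	char *argv[] = { "/sbin/example-helper", "--oneshot", NULL };	/* hypothetical */
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

	info = call_usermodehelper_setup(argv[0], argv, envp);
	if (!info)
		return -ENOMEM;

	/* optional: attach a session keyring or cleanup callback here, e.g.
	 * call_usermodehelper_setkeys(info, keyring);
	 * call_usermodehelper_setcleanup(info, my_cleanup); */

	/* UMH_WAIT_PROC: wait for the helper to exit and return its status;
	 * on this path call_usermodehelper_exec() frees info itself. */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}
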
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9e47d8c493..3e9f513a72 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -675,9 +675,18 @@ static struct notifier_block kprobe_exceptions_nb = {
675 .priority = 0x7fffffff /* we need to be notified first */ 675 .priority = 0x7fffffff /* we need to be notified first */
676}; 676};
677 677
678unsigned long __weak arch_deref_entry_point(void *entry)
679{
680 return (unsigned long)entry;
681}
678 682
679int __kprobes register_jprobe(struct jprobe *jp) 683int __kprobes register_jprobe(struct jprobe *jp)
680{ 684{
685 unsigned long addr = arch_deref_entry_point(jp->entry);
686
687 if (!kernel_text_address(addr))
688 return -EINVAL;
689
681 /* Todo: Verify probepoint is a function entry point */ 690 /* Todo: Verify probepoint is a function entry point */
682 jp->kp.pre_handler = setjmp_pre_handler; 691 jp->kp.pre_handler = setjmp_pre_handler;
683 jp->kp.break_handler = longjmp_break_handler; 692 jp->kp.break_handler = longjmp_break_handler;
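
register_jprobe() now validates that the (dereferenced) entry point really is kernel text, with a __weak default of arch_deref_entry_point() that simply returns the address, so architectures using function descriptors can override it. A tiny userspace sketch of the weak-default pattern itself; the names are illustrative and this relies on GCC/Clang attribute syntax, not kernel code.

/* Sketch: a weak default that an arch-specific definition could replace. */
#include <stdio.h>

unsigned long __attribute__((weak)) deref_entry_point(void *entry)
{
	return (unsigned long)entry;	/* default: entry already is the address */
}

static void probed_function(void) { }

int main(void)
{
	printf("entry point: %#lx\n", deref_entry_point((void *)probed_function));
	return 0;
}
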
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 559deca5ed..d0e5c48e18 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -62,6 +62,28 @@ static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page)
62KERNEL_ATTR_RO(kexec_crash_loaded); 62KERNEL_ATTR_RO(kexec_crash_loaded);
63#endif /* CONFIG_KEXEC */ 63#endif /* CONFIG_KEXEC */
64 64
65/*
66 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
67 */
68extern const void __start_notes __attribute__((weak));
69extern const void __stop_notes __attribute__((weak));
70#define notes_size (&__stop_notes - &__start_notes)
71
72static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr,
73 char *buf, loff_t off, size_t count)
74{
75 memcpy(buf, &__start_notes + off, count);
76 return count;
77}
78
79static struct bin_attribute notes_attr = {
80 .attr = {
81 .name = "notes",
82 .mode = S_IRUGO,
83 },
84 .read = &notes_read,
85};
86
65decl_subsys(kernel, NULL, NULL); 87decl_subsys(kernel, NULL, NULL);
66EXPORT_SYMBOL_GPL(kernel_subsys); 88EXPORT_SYMBOL_GPL(kernel_subsys);
67 89
@@ -88,6 +110,12 @@ static int __init ksysfs_init(void)
88 error = sysfs_create_group(&kernel_subsys.kobj, 110 error = sysfs_create_group(&kernel_subsys.kobj,
89 &kernel_attr_group); 111 &kernel_attr_group);
90 112
113 if (!error && notes_size > 0) {
114 notes_attr.size = notes_size;
115 error = sysfs_create_bin_file(&kernel_subsys.kobj,
116 &notes_attr);
117 }
118
91 return error; 119 return error;
92} 120}
93 121
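
The new binary attribute exposes the kernel's raw .notes ELF section at /sys/kernel/notes (the path follows from the attribute name in the hunk above). As a sketch of how the file might be consumed, a small userspace reader; the output format is arbitrary.

/* Sketch: dump the size and first bytes of /sys/kernel/notes. */
#include <stdio.h>

int main(void)
{
	unsigned char buf[64];
	size_t n;
	FILE *f = fopen("/sys/kernel/notes", "rb");

	if (!f) {
		perror("/sys/kernel/notes");
		return 1;
	}
	n = fread(buf, 1, sizeof(buf), f);
	printf("read %zu bytes:", n);
	for (size_t i = 0; i < n && i < 16; i++)
		printf(" %02x", buf[i]);
	printf("\n");
	fclose(f);
	return 0;
}
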
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bbd51b81a3..a404f7ee73 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -215,7 +215,7 @@ int kthread_stop(struct task_struct *k)
215EXPORT_SYMBOL(kthread_stop); 215EXPORT_SYMBOL(kthread_stop);
216 216
217 217
218static __init void kthreadd_setup(void) 218static noinline __init_refok void kthreadd_setup(void)
219{ 219{
220 struct task_struct *tsk = current; 220 struct task_struct *tsk = current;
221 221
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1a5ff2211d..734da579ad 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -5,7 +5,8 @@
5 * 5 *
6 * Started by Ingo Molnar: 6 * Started by Ingo Molnar:
7 * 7 *
8 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 8 * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * 10 *
10 * this code maps all the lock dependencies as they occur in a live kernel 11 * this code maps all the lock dependencies as they occur in a live kernel
11 * and will warn about the following classes of locking bugs: 12 * and will warn about the following classes of locking bugs:
@@ -37,11 +38,26 @@
37#include <linux/debug_locks.h> 38#include <linux/debug_locks.h>
38#include <linux/irqflags.h> 39#include <linux/irqflags.h>
39#include <linux/utsname.h> 40#include <linux/utsname.h>
41#include <linux/hash.h>
40 42
41#include <asm/sections.h> 43#include <asm/sections.h>
42 44
43#include "lockdep_internals.h" 45#include "lockdep_internals.h"
44 46
47#ifdef CONFIG_PROVE_LOCKING
48int prove_locking = 1;
49module_param(prove_locking, int, 0644);
50#else
51#define prove_locking 0
52#endif
53
54#ifdef CONFIG_LOCK_STAT
55int lock_stat = 1;
56module_param(lock_stat, int, 0644);
57#else
58#define lock_stat 0
59#endif
60
45/* 61/*
46 * lockdep_lock: protects the lockdep graph, the hashes and the 62 * lockdep_lock: protects the lockdep graph, the hashes and the
47 * class/list/hash allocators. 63 * class/list/hash allocators.
@@ -96,23 +112,6 @@ unsigned long nr_list_entries;
96static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; 112static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
97 113
98/* 114/*
99 * Allocate a lockdep entry. (assumes the graph_lock held, returns
100 * with NULL on failure)
101 */
102static struct lock_list *alloc_list_entry(void)
103{
104 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
105 if (!debug_locks_off_graph_unlock())
106 return NULL;
107
108 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
109 printk("turning off the locking correctness validator.\n");
110 return NULL;
111 }
112 return list_entries + nr_list_entries++;
113}
114
115/*
116 * All data structures here are protected by the global debug_lock. 115 * All data structures here are protected by the global debug_lock.
117 * 116 *
118 * Mutex key structs only get allocated, once during bootup, and never 117 * Mutex key structs only get allocated, once during bootup, and never
@@ -121,6 +120,117 @@ static struct lock_list *alloc_list_entry(void)
121unsigned long nr_lock_classes; 120unsigned long nr_lock_classes;
122static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; 121static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
123 122
123#ifdef CONFIG_LOCK_STAT
124static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
125
126static int lock_contention_point(struct lock_class *class, unsigned long ip)
127{
128 int i;
129
130 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
131 if (class->contention_point[i] == 0) {
132 class->contention_point[i] = ip;
133 break;
134 }
135 if (class->contention_point[i] == ip)
136 break;
137 }
138
139 return i;
140}
141
142static void lock_time_inc(struct lock_time *lt, s64 time)
143{
144 if (time > lt->max)
145 lt->max = time;
146
147 if (time < lt->min || !lt->min)
148 lt->min = time;
149
150 lt->total += time;
151 lt->nr++;
152}
153
154static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
155{
156 dst->min += src->min;
157 dst->max += src->max;
158 dst->total += src->total;
159 dst->nr += src->nr;
160}
161
162struct lock_class_stats lock_stats(struct lock_class *class)
163{
164 struct lock_class_stats stats;
165 int cpu, i;
166
167 memset(&stats, 0, sizeof(struct lock_class_stats));
168 for_each_possible_cpu(cpu) {
169 struct lock_class_stats *pcs =
170 &per_cpu(lock_stats, cpu)[class - lock_classes];
171
172 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
173 stats.contention_point[i] += pcs->contention_point[i];
174
175 lock_time_add(&pcs->read_waittime, &stats.read_waittime);
176 lock_time_add(&pcs->write_waittime, &stats.write_waittime);
177
178 lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
179 lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
180
181 for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
182 stats.bounces[i] += pcs->bounces[i];
183 }
184
185 return stats;
186}
187
188void clear_lock_stats(struct lock_class *class)
189{
190 int cpu;
191
192 for_each_possible_cpu(cpu) {
193 struct lock_class_stats *cpu_stats =
194 &per_cpu(lock_stats, cpu)[class - lock_classes];
195
196 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
197 }
198 memset(class->contention_point, 0, sizeof(class->contention_point));
199}
200
201static struct lock_class_stats *get_lock_stats(struct lock_class *class)
202{
203 return &get_cpu_var(lock_stats)[class - lock_classes];
204}
205
206static void put_lock_stats(struct lock_class_stats *stats)
207{
208 put_cpu_var(lock_stats);
209}
210
211static void lock_release_holdtime(struct held_lock *hlock)
212{
213 struct lock_class_stats *stats;
214 s64 holdtime;
215
216 if (!lock_stat)
217 return;
218
219 holdtime = sched_clock() - hlock->holdtime_stamp;
220
221 stats = get_lock_stats(hlock->class);
222 if (hlock->read)
223 lock_time_inc(&stats->read_holdtime, holdtime);
224 else
225 lock_time_inc(&stats->write_holdtime, holdtime);
226 put_lock_stats(stats);
227}
228#else
229static inline void lock_release_holdtime(struct held_lock *hlock)
230{
231}
232#endif
233
124/* 234/*
125 * We keep a global list of all lock classes. The list only grows, 235 * We keep a global list of all lock classes. The list only grows,
126 * never shrinks. The list is only accessed with the lockdep 236 * never shrinks. The list is only accessed with the lockdep
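
The CONFIG_LOCK_STAT block above keeps per-cpu lock_class_stats and folds them together on demand; the core bookkeeping is the min/max/total/count update in lock_time_inc(). A standalone sketch of that accumulator with made-up sample values:

/* Sketch of the lock_time accumulator: min treats 0 as "unset". */
#include <stdio.h>

struct lock_time { long long min, max, total, nr; };

static void lock_time_inc(struct lock_time *lt, long long time)
{
	if (time > lt->max)
		lt->max = time;
	if (time < lt->min || !lt->min)
		lt->min = time;
	lt->total += time;
	lt->nr++;
}

int main(void)
{
	struct lock_time wait = { 0, 0, 0, 0 };
	long long samples[] = { 120, 40, 900, 75 };

	for (int i = 0; i < 4; i++)
		lock_time_inc(&wait, samples[i]);

	printf("nr=%lld min=%lld max=%lld avg=%lld\n",
	       wait.nr, wait.min, wait.max, wait.total / wait.nr);
	return 0;
}
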
@@ -133,24 +243,18 @@ LIST_HEAD(all_lock_classes);
133 */ 243 */
134#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) 244#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
135#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) 245#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS)
136#define CLASSHASH_MASK (CLASSHASH_SIZE - 1) 246#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS)
137#define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK)
138#define classhashentry(key) (classhash_table + __classhashfn((key))) 247#define classhashentry(key) (classhash_table + __classhashfn((key)))
139 248
140static struct list_head classhash_table[CLASSHASH_SIZE]; 249static struct list_head classhash_table[CLASSHASH_SIZE];
141 250
142unsigned long nr_lock_chains;
143static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
144
145/* 251/*
146 * We put the lock dependency chains into a hash-table as well, to cache 252 * We put the lock dependency chains into a hash-table as well, to cache
147 * their existence: 253 * their existence:
148 */ 254 */
149#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) 255#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1)
150#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) 256#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS)
151#define CHAINHASH_MASK (CHAINHASH_SIZE - 1) 257#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS)
152#define __chainhashfn(chain) \
153 (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK)
154#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) 258#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
155 259
156static struct list_head chainhash_table[CHAINHASH_SIZE]; 260static struct list_head chainhash_table[CHAINHASH_SIZE];
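
Both hash macros above now defer to hash_long() from linux/hash.h instead of the hand-rolled shift-and-add fold. A userspace sketch in the spirit of that change; the multiplier is the classic 32-bit golden-ratio constant, and CLASSHASH_BITS here is an illustrative value rather than lockdep's.

/* Sketch: multiplicative hashing down to a bucket index. */
#include <stdio.h>

#define CLASSHASH_BITS 12	/* illustrative, not the lockdep value */

static unsigned int hash32_like(unsigned int val, unsigned int bits)
{
	return (val * 0x9e370001U) >> (32 - bits);
}

int main(void)
{
	unsigned long keys[] = { 0x1000UL, 0x1040UL, 0x2fe0UL };

	for (int i = 0; i < 3; i++)
		printf("%#lx -> bucket %u\n", keys[i],
		       hash32_like((unsigned int)keys[i], CLASSHASH_BITS));
	return 0;
}
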
@@ -223,26 +327,6 @@ static int verbose(struct lock_class *class)
223 return 0; 327 return 0;
224} 328}
225 329
226#ifdef CONFIG_TRACE_IRQFLAGS
227
228static int hardirq_verbose(struct lock_class *class)
229{
230#if HARDIRQ_VERBOSE
231 return class_filter(class);
232#endif
233 return 0;
234}
235
236static int softirq_verbose(struct lock_class *class)
237{
238#if SOFTIRQ_VERBOSE
239 return class_filter(class);
240#endif
241 return 0;
242}
243
244#endif
245
246/* 330/*
247 * Stack-trace: tightly packed array of stack backtrace 331 * Stack-trace: tightly packed array of stack backtrace
248 * addresses. Protected by the graph_lock. 332 * addresses. Protected by the graph_lock.
@@ -291,6 +375,11 @@ unsigned int max_recursion_depth;
291 * about it later on, in lockdep_info(). 375 * about it later on, in lockdep_info().
292 */ 376 */
293static int lockdep_init_error; 377static int lockdep_init_error;
378static unsigned long lockdep_init_trace_data[20];
379static struct stack_trace lockdep_init_trace = {
380 .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
381 .entries = lockdep_init_trace_data,
382};
294 383
295/* 384/*
296 * Various lockdep statistics: 385 * Various lockdep statistics:
@@ -379,7 +468,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
379 468
380static void print_lock_name(struct lock_class *class) 469static void print_lock_name(struct lock_class *class)
381{ 470{
382 char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4; 471 char str[KSYM_NAME_LEN], c1, c2, c3, c4;
383 const char *name; 472 const char *name;
384 473
385 get_usage_chars(class, &c1, &c2, &c3, &c4); 474 get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -401,7 +490,7 @@ static void print_lock_name(struct lock_class *class)
401static void print_lockdep_cache(struct lockdep_map *lock) 490static void print_lockdep_cache(struct lockdep_map *lock)
402{ 491{
403 const char *name; 492 const char *name;
404 char str[KSYM_NAME_LEN + 1]; 493 char str[KSYM_NAME_LEN];
405 494
406 name = lock->name; 495 name = lock->name;
407 if (!name) 496 if (!name)
@@ -482,6 +571,262 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
482 } 571 }
483} 572}
484 573
574static void print_kernel_version(void)
575{
576 printk("%s %.*s\n", init_utsname()->release,
577 (int)strcspn(init_utsname()->version, " "),
578 init_utsname()->version);
579}
580
581static int very_verbose(struct lock_class *class)
582{
583#if VERY_VERBOSE
584 return class_filter(class);
585#endif
586 return 0;
587}
588
589/*
590 * Is this the address of a static object:
591 */
592static int static_obj(void *obj)
593{
594 unsigned long start = (unsigned long) &_stext,
595 end = (unsigned long) &_end,
596 addr = (unsigned long) obj;
597#ifdef CONFIG_SMP
598 int i;
599#endif
600
601 /*
602 * static variable?
603 */
604 if ((addr >= start) && (addr < end))
605 return 1;
606
607#ifdef CONFIG_SMP
608 /*
609 * percpu var?
610 */
611 for_each_possible_cpu(i) {
612 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
613 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
614 + per_cpu_offset(i);
615
616 if ((addr >= start) && (addr < end))
617 return 1;
618 }
619#endif
620
621 /*
622 * module var?
623 */
624 return is_module_address(addr);
625}
626
627/*
628 * To make lock name printouts unique, we calculate a unique
629 * class->name_version generation counter:
630 */
631static int count_matching_names(struct lock_class *new_class)
632{
633 struct lock_class *class;
634 int count = 0;
635
636 if (!new_class->name)
637 return 0;
638
639 list_for_each_entry(class, &all_lock_classes, lock_entry) {
640 if (new_class->key - new_class->subclass == class->key)
641 return class->name_version;
642 if (class->name && !strcmp(class->name, new_class->name))
643 count = max(count, class->name_version);
644 }
645
646 return count + 1;
647}
648
649/*
650 * Register a lock's class in the hash-table, if the class is not present
651 * yet. Otherwise we look it up. We cache the result in the lock object
652 * itself, so actual lookup of the hash should be once per lock object.
653 */
654static inline struct lock_class *
655look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
656{
657 struct lockdep_subclass_key *key;
658 struct list_head *hash_head;
659 struct lock_class *class;
660
661#ifdef CONFIG_DEBUG_LOCKDEP
662 /*
663 * If the architecture calls into lockdep before initializing
664 * the hashes then we'll warn about it later. (we cannot printk
665 * right now)
666 */
667 if (unlikely(!lockdep_initialized)) {
668 lockdep_init();
669 lockdep_init_error = 1;
670 save_stack_trace(&lockdep_init_trace);
671 }
672#endif
673
674 /*
675 * Static locks do not have their class-keys yet - for them the key
676 * is the lock object itself:
677 */
678 if (unlikely(!lock->key))
679 lock->key = (void *)lock;
680
681 /*
682 * NOTE: the class-key must be unique. For dynamic locks, a static
683 * lock_class_key variable is passed in through the mutex_init()
684 * (or spin_lock_init()) call - which acts as the key. For static
685 * locks we use the lock object itself as the key.
686 */
687 BUILD_BUG_ON(sizeof(struct lock_class_key) >
688 sizeof(struct lockdep_map));
689
690 key = lock->key->subkeys + subclass;
691
692 hash_head = classhashentry(key);
693
694 /*
695 * We can walk the hash lockfree, because the hash only
696 * grows, and we are careful when adding entries to the end:
697 */
698 list_for_each_entry(class, hash_head, hash_entry) {
699 if (class->key == key) {
700 WARN_ON_ONCE(class->name != lock->name);
701 return class;
702 }
703 }
704
705 return NULL;
706}
707
708/*
709 * Register a lock's class in the hash-table, if the class is not present
710 * yet. Otherwise we look it up. We cache the result in the lock object
711 * itself, so actual lookup of the hash should be once per lock object.
712 */
713static inline struct lock_class *
714register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
715{
716 struct lockdep_subclass_key *key;
717 struct list_head *hash_head;
718 struct lock_class *class;
719 unsigned long flags;
720
721 class = look_up_lock_class(lock, subclass);
722 if (likely(class))
723 return class;
724
725 /*
726 * Debug-check: all keys must be persistent!
727 */
728 if (!static_obj(lock->key)) {
729 debug_locks_off();
730 printk("INFO: trying to register non-static key.\n");
731 printk("the code is fine but needs lockdep annotation.\n");
732 printk("turning off the locking correctness validator.\n");
733 dump_stack();
734
735 return NULL;
736 }
737
738 key = lock->key->subkeys + subclass;
739 hash_head = classhashentry(key);
740
741 raw_local_irq_save(flags);
742 if (!graph_lock()) {
743 raw_local_irq_restore(flags);
744 return NULL;
745 }
746 /*
747 * We have to do the hash-walk again, to avoid races
748 * with another CPU:
749 */
750 list_for_each_entry(class, hash_head, hash_entry)
751 if (class->key == key)
752 goto out_unlock_set;
753 /*
754 * Allocate a new key from the static array, and add it to
755 * the hash:
756 */
757 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
758 if (!debug_locks_off_graph_unlock()) {
759 raw_local_irq_restore(flags);
760 return NULL;
761 }
762 raw_local_irq_restore(flags);
763
764 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
765 printk("turning off the locking correctness validator.\n");
766 return NULL;
767 }
768 class = lock_classes + nr_lock_classes++;
769 debug_atomic_inc(&nr_unused_locks);
770 class->key = key;
771 class->name = lock->name;
772 class->subclass = subclass;
773 INIT_LIST_HEAD(&class->lock_entry);
774 INIT_LIST_HEAD(&class->locks_before);
775 INIT_LIST_HEAD(&class->locks_after);
776 class->name_version = count_matching_names(class);
777 /*
778 * We use RCU's safe list-add method to make
779 * parallel walking of the hash-list safe:
780 */
781 list_add_tail_rcu(&class->hash_entry, hash_head);
782
783 if (verbose(class)) {
784 graph_unlock();
785 raw_local_irq_restore(flags);
786
787 printk("\nnew class %p: %s", class->key, class->name);
788 if (class->name_version > 1)
789 printk("#%d", class->name_version);
790 printk("\n");
791 dump_stack();
792
793 raw_local_irq_save(flags);
794 if (!graph_lock()) {
795 raw_local_irq_restore(flags);
796 return NULL;
797 }
798 }
799out_unlock_set:
800 graph_unlock();
801 raw_local_irq_restore(flags);
802
803 if (!subclass || force)
804 lock->class_cache = class;
805
806 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
807 return NULL;
808
809 return class;
810}
811
812#ifdef CONFIG_PROVE_LOCKING
813/*
814 * Allocate a lockdep entry. (assumes the graph_lock held, returns
815 * with NULL on failure)
816 */
817static struct lock_list *alloc_list_entry(void)
818{
819 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
820 if (!debug_locks_off_graph_unlock())
821 return NULL;
822
823 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
824 printk("turning off the locking correctness validator.\n");
825 return NULL;
826 }
827 return list_entries + nr_list_entries++;
828}
829
485/* 830/*
486 * Add a new dependency to the head of the list: 831 * Add a new dependency to the head of the list:
487 */ 832 */
@@ -542,13 +887,6 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
542 return 0; 887 return 0;
543} 888}
544 889
545static void print_kernel_version(void)
546{
547 printk("%s %.*s\n", init_utsname()->release,
548 (int)strcspn(init_utsname()->version, " "),
549 init_utsname()->version);
550}
551
552/* 890/*
553 * When a circular dependency is detected, print the 891 * When a circular dependency is detected, print the
554 * header first: 892 * header first:
@@ -640,15 +978,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
640 return 1; 978 return 1;
641} 979}
642 980
643static int very_verbose(struct lock_class *class)
644{
645#if VERY_VERBOSE
646 return class_filter(class);
647#endif
648 return 0;
649}
650#ifdef CONFIG_TRACE_IRQFLAGS 981#ifdef CONFIG_TRACE_IRQFLAGS
651
652/* 982/*
653 * Forwards and backwards subgraph searching, for the purposes of 983 * Forwards and backwards subgraph searching, for the purposes of
654 * proving that two subgraphs can be connected by a new dependency 984 * proving that two subgraphs can be connected by a new dependency
@@ -821,6 +1151,78 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
821 bit_backwards, bit_forwards, irqclass); 1151 bit_backwards, bit_forwards, irqclass);
822} 1152}
823 1153
1154static int
1155check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
1156 struct held_lock *next)
1157{
1158 /*
1159 * Prove that the new dependency does not connect a hardirq-safe
1160 * lock with a hardirq-unsafe lock - to achieve this we search
1161 * the backwards-subgraph starting at <prev>, and the
1162 * forwards-subgraph starting at <next>:
1163 */
1164 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
1165 LOCK_ENABLED_HARDIRQS, "hard"))
1166 return 0;
1167
1168 /*
1169 * Prove that the new dependency does not connect a hardirq-safe-read
1170 * lock with a hardirq-unsafe lock - to achieve this we search
1171 * the backwards-subgraph starting at <prev>, and the
1172 * forwards-subgraph starting at <next>:
1173 */
1174 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
1175 LOCK_ENABLED_HARDIRQS, "hard-read"))
1176 return 0;
1177
1178 /*
1179 * Prove that the new dependency does not connect a softirq-safe
1180 * lock with a softirq-unsafe lock - to achieve this we search
1181 * the backwards-subgraph starting at <prev>, and the
1182 * forwards-subgraph starting at <next>:
1183 */
1184 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
1185 LOCK_ENABLED_SOFTIRQS, "soft"))
1186 return 0;
1187 /*
1188 * Prove that the new dependency does not connect a softirq-safe-read
1189 * lock with a softirq-unsafe lock - to achieve this we search
1190 * the backwards-subgraph starting at <prev>, and the
1191 * forwards-subgraph starting at <next>:
1192 */
1193 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
1194 LOCK_ENABLED_SOFTIRQS, "soft"))
1195 return 0;
1196
1197 return 1;
1198}
1199
1200static void inc_chains(void)
1201{
1202 if (current->hardirq_context)
1203 nr_hardirq_chains++;
1204 else {
1205 if (current->softirq_context)
1206 nr_softirq_chains++;
1207 else
1208 nr_process_chains++;
1209 }
1210}
1211
1212#else
1213
1214static inline int
1215check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
1216 struct held_lock *next)
1217{
1218 return 1;
1219}
1220
1221static inline void inc_chains(void)
1222{
1223 nr_process_chains++;
1224}
1225
824#endif 1226#endif
825 1227
826static int 1228static int
@@ -922,47 +1324,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
922 if (!(check_noncircular(next->class, 0))) 1324 if (!(check_noncircular(next->class, 0)))
923 return print_circular_bug_tail(); 1325 return print_circular_bug_tail();
924 1326
925#ifdef CONFIG_TRACE_IRQFLAGS 1327 if (!check_prev_add_irq(curr, prev, next))
926 /*
927 * Prove that the new dependency does not connect a hardirq-safe
928 * lock with a hardirq-unsafe lock - to achieve this we search
929 * the backwards-subgraph starting at <prev>, and the
930 * forwards-subgraph starting at <next>:
931 */
932 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
933 LOCK_ENABLED_HARDIRQS, "hard"))
934 return 0; 1328 return 0;
935 1329
936 /* 1330 /*
937 * Prove that the new dependency does not connect a hardirq-safe-read
938 * lock with a hardirq-unsafe lock - to achieve this we search
939 * the backwards-subgraph starting at <prev>, and the
940 * forwards-subgraph starting at <next>:
941 */
942 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
943 LOCK_ENABLED_HARDIRQS, "hard-read"))
944 return 0;
945
946 /*
947 * Prove that the new dependency does not connect a softirq-safe
948 * lock with a softirq-unsafe lock - to achieve this we search
949 * the backwards-subgraph starting at <prev>, and the
950 * forwards-subgraph starting at <next>:
951 */
952 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
953 LOCK_ENABLED_SOFTIRQS, "soft"))
954 return 0;
955 /*
956 * Prove that the new dependency does not connect a softirq-safe-read
957 * lock with a softirq-unsafe lock - to achieve this we search
958 * the backwards-subgraph starting at <prev>, and the
959 * forwards-subgraph starting at <next>:
960 */
961 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
962 LOCK_ENABLED_SOFTIRQS, "soft"))
963 return 0;
964#endif
965 /*
966 * For recursive read-locks we do all the dependency checks, 1331 * For recursive read-locks we do all the dependency checks,
967 * but we dont store read-triggered dependencies (only 1332 * but we dont store read-triggered dependencies (only
968 * write-triggered dependencies). This ensures that only the 1333 * write-triggered dependencies). This ensures that only the
@@ -1088,224 +1453,8 @@ out_bug:
1088 return 0; 1453 return 0;
1089} 1454}
1090 1455
1091 1456unsigned long nr_lock_chains;
1092/* 1457static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
1093 * Is this the address of a static object:
1094 */
1095static int static_obj(void *obj)
1096{
1097 unsigned long start = (unsigned long) &_stext,
1098 end = (unsigned long) &_end,
1099 addr = (unsigned long) obj;
1100#ifdef CONFIG_SMP
1101 int i;
1102#endif
1103
1104 /*
1105 * static variable?
1106 */
1107 if ((addr >= start) && (addr < end))
1108 return 1;
1109
1110#ifdef CONFIG_SMP
1111 /*
1112 * percpu var?
1113 */
1114 for_each_possible_cpu(i) {
1115 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
1116 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
1117 + per_cpu_offset(i);
1118
1119 if ((addr >= start) && (addr < end))
1120 return 1;
1121 }
1122#endif
1123
1124 /*
1125 * module var?
1126 */
1127 return is_module_address(addr);
1128}
1129
1130/*
1131 * To make lock name printouts unique, we calculate a unique
1132 * class->name_version generation counter:
1133 */
1134static int count_matching_names(struct lock_class *new_class)
1135{
1136 struct lock_class *class;
1137 int count = 0;
1138
1139 if (!new_class->name)
1140 return 0;
1141
1142 list_for_each_entry(class, &all_lock_classes, lock_entry) {
1143 if (new_class->key - new_class->subclass == class->key)
1144 return class->name_version;
1145 if (class->name && !strcmp(class->name, new_class->name))
1146 count = max(count, class->name_version);
1147 }
1148
1149 return count + 1;
1150}
1151
1152/*
1153 * Register a lock's class in the hash-table, if the class is not present
1154 * yet. Otherwise we look it up. We cache the result in the lock object
1155 * itself, so actual lookup of the hash should be once per lock object.
1156 */
1157static inline struct lock_class *
1158look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
1159{
1160 struct lockdep_subclass_key *key;
1161 struct list_head *hash_head;
1162 struct lock_class *class;
1163
1164#ifdef CONFIG_DEBUG_LOCKDEP
1165 /*
1166 * If the architecture calls into lockdep before initializing
1167 * the hashes then we'll warn about it later. (we cannot printk
1168 * right now)
1169 */
1170 if (unlikely(!lockdep_initialized)) {
1171 lockdep_init();
1172 lockdep_init_error = 1;
1173 }
1174#endif
1175
1176 /*
1177 * Static locks do not have their class-keys yet - for them the key
1178 * is the lock object itself:
1179 */
1180 if (unlikely(!lock->key))
1181 lock->key = (void *)lock;
1182
1183 /*
1184 * NOTE: the class-key must be unique. For dynamic locks, a static
1185 * lock_class_key variable is passed in through the mutex_init()
1186 * (or spin_lock_init()) call - which acts as the key. For static
1187 * locks we use the lock object itself as the key.
1188 */
1189 BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class));
1190
1191 key = lock->key->subkeys + subclass;
1192
1193 hash_head = classhashentry(key);
1194
1195 /*
1196 * We can walk the hash lockfree, because the hash only
1197 * grows, and we are careful when adding entries to the end:
1198 */
1199 list_for_each_entry(class, hash_head, hash_entry)
1200 if (class->key == key)
1201 return class;
1202
1203 return NULL;
1204}
1205
1206/*
1207 * Register a lock's class in the hash-table, if the class is not present
1208 * yet. Otherwise we look it up. We cache the result in the lock object
1209 * itself, so actual lookup of the hash should be once per lock object.
1210 */
1211static inline struct lock_class *
1212register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1213{
1214 struct lockdep_subclass_key *key;
1215 struct list_head *hash_head;
1216 struct lock_class *class;
1217 unsigned long flags;
1218
1219 class = look_up_lock_class(lock, subclass);
1220 if (likely(class))
1221 return class;
1222
1223 /*
1224 * Debug-check: all keys must be persistent!
1225 */
1226 if (!static_obj(lock->key)) {
1227 debug_locks_off();
1228 printk("INFO: trying to register non-static key.\n");
1229 printk("the code is fine but needs lockdep annotation.\n");
1230 printk("turning off the locking correctness validator.\n");
1231 dump_stack();
1232
1233 return NULL;
1234 }
1235
1236 key = lock->key->subkeys + subclass;
1237 hash_head = classhashentry(key);
1238
1239 raw_local_irq_save(flags);
1240 if (!graph_lock()) {
1241 raw_local_irq_restore(flags);
1242 return NULL;
1243 }
1244 /*
1245 * We have to do the hash-walk again, to avoid races
1246 * with another CPU:
1247 */
1248 list_for_each_entry(class, hash_head, hash_entry)
1249 if (class->key == key)
1250 goto out_unlock_set;
1251 /*
1252 * Allocate a new key from the static array, and add it to
1253 * the hash:
1254 */
1255 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
1256 if (!debug_locks_off_graph_unlock()) {
1257 raw_local_irq_restore(flags);
1258 return NULL;
1259 }
1260 raw_local_irq_restore(flags);
1261
1262 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
1263 printk("turning off the locking correctness validator.\n");
1264 return NULL;
1265 }
1266 class = lock_classes + nr_lock_classes++;
1267 debug_atomic_inc(&nr_unused_locks);
1268 class->key = key;
1269 class->name = lock->name;
1270 class->subclass = subclass;
1271 INIT_LIST_HEAD(&class->lock_entry);
1272 INIT_LIST_HEAD(&class->locks_before);
1273 INIT_LIST_HEAD(&class->locks_after);
1274 class->name_version = count_matching_names(class);
1275 /*
1276 * We use RCU's safe list-add method to make
1277 * parallel walking of the hash-list safe:
1278 */
1279 list_add_tail_rcu(&class->hash_entry, hash_head);
1280
1281 if (verbose(class)) {
1282 graph_unlock();
1283 raw_local_irq_restore(flags);
1284
1285 printk("\nnew class %p: %s", class->key, class->name);
1286 if (class->name_version > 1)
1287 printk("#%d", class->name_version);
1288 printk("\n");
1289 dump_stack();
1290
1291 raw_local_irq_save(flags);
1292 if (!graph_lock()) {
1293 raw_local_irq_restore(flags);
1294 return NULL;
1295 }
1296 }
1297out_unlock_set:
1298 graph_unlock();
1299 raw_local_irq_restore(flags);
1300
1301 if (!subclass || force)
1302 lock->class_cache = class;
1303
1304 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
1305 return NULL;
1306
1307 return class;
1308}
1309 1458
1310/* 1459/*
1311 * Look up a dependency chain. If the key is not present yet then 1460 * Look up a dependency chain. If the key is not present yet then
@@ -1366,21 +1515,72 @@ cache_hit:
1366 chain->chain_key = chain_key; 1515 chain->chain_key = chain_key;
1367 list_add_tail_rcu(&chain->entry, hash_head); 1516 list_add_tail_rcu(&chain->entry, hash_head);
1368 debug_atomic_inc(&chain_lookup_misses); 1517 debug_atomic_inc(&chain_lookup_misses);
1369#ifdef CONFIG_TRACE_IRQFLAGS 1518 inc_chains();
1370 if (current->hardirq_context) 1519
1371 nr_hardirq_chains++; 1520 return 1;
1372 else { 1521}
1373 if (current->softirq_context) 1522
1374 nr_softirq_chains++; 1523static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1375 else 1524 struct held_lock *hlock, int chain_head)
1376 nr_process_chains++; 1525{
1377 } 1526 /*
1378#else 1527 * Trylock needs to maintain the stack of held locks, but it
1379 nr_process_chains++; 1528 * does not add new dependencies, because trylock can be done
1380#endif 1529 * in any order.
1530 *
1531 * We look up the chain_key and do the O(N^2) check and update of
1532 * the dependencies only if this is a new dependency chain.
1533 * (If lookup_chain_cache() returns with 1 it acquires
1534 * graph_lock for us)
1535 */
1536 if (!hlock->trylock && (hlock->check == 2) &&
1537 lookup_chain_cache(curr->curr_chain_key, hlock->class)) {
1538 /*
1539 * Check whether last held lock:
1540 *
1541 * - is irq-safe, if this lock is irq-unsafe
1542 * - is softirq-safe, if this lock is hardirq-unsafe
1543 *
1544 * And check whether the new lock's dependency graph
1545 * could lead back to the previous lock.
1546 *
 1547 * any of these scenarios could lead to a deadlock. If
 1548 * all validations pass we add the new dependency.
1549 */
1550 int ret = check_deadlock(curr, hlock, lock, hlock->read);
1551
1552 if (!ret)
1553 return 0;
1554 /*
1555 * Mark recursive read, as we jump over it when
1556 * building dependencies (just like we jump over
1557 * trylock entries):
1558 */
1559 if (ret == 2)
1560 hlock->read = 2;
1561 /*
1562 * Add dependency only if this lock is not the head
1563 * of the chain, and if it's not a secondary read-lock:
1564 */
1565 if (!chain_head && ret != 2)
1566 if (!check_prevs_add(curr, hlock))
1567 return 0;
1568 graph_unlock();
1569 } else
1570 /* after lookup_chain_cache(): */
1571 if (unlikely(!debug_locks))
1572 return 0;
1381 1573
1382 return 1; 1574 return 1;
1383} 1575}
1576#else
1577static inline int validate_chain(struct task_struct *curr,
1578 struct lockdep_map *lock, struct held_lock *hlock,
1579 int chain_head)
1580{
1581 return 1;
1582}
1583#endif
1384 1584
1385/* 1585/*
1386 * We are building curr_chain_key incrementally, so double-check 1586 * We are building curr_chain_key incrementally, so double-check
@@ -1425,6 +1625,57 @@ static void check_chain_key(struct task_struct *curr)
1425#endif 1625#endif
1426} 1626}
1427 1627
1628static int
1629print_usage_bug(struct task_struct *curr, struct held_lock *this,
1630 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
1631{
1632 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1633 return 0;
1634
1635 printk("\n=================================\n");
1636 printk( "[ INFO: inconsistent lock state ]\n");
1637 print_kernel_version();
1638 printk( "---------------------------------\n");
1639
1640 printk("inconsistent {%s} -> {%s} usage.\n",
1641 usage_str[prev_bit], usage_str[new_bit]);
1642
1643 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
1644 curr->comm, curr->pid,
1645 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
1646 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
1647 trace_hardirqs_enabled(curr),
1648 trace_softirqs_enabled(curr));
1649 print_lock(this);
1650
1651 printk("{%s} state was registered at:\n", usage_str[prev_bit]);
1652 print_stack_trace(this->class->usage_traces + prev_bit, 1);
1653
1654 print_irqtrace_events(curr);
1655 printk("\nother info that might help us debug this:\n");
1656 lockdep_print_held_locks(curr);
1657
1658 printk("\nstack backtrace:\n");
1659 dump_stack();
1660
1661 return 0;
1662}
1663
1664/*
1665 * Print out an error if an invalid bit is set:
1666 */
1667static inline int
1668valid_state(struct task_struct *curr, struct held_lock *this,
1669 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
1670{
1671 if (unlikely(this->class->usage_mask & (1 << bad_bit)))
1672 return print_usage_bug(curr, this, bad_bit, new_bit);
1673 return 1;
1674}
1675
1676static int mark_lock(struct task_struct *curr, struct held_lock *this,
1677 enum lock_usage_bit new_bit);
1678
1428#ifdef CONFIG_TRACE_IRQFLAGS 1679#ifdef CONFIG_TRACE_IRQFLAGS
1429 1680
1430/* 1681/*
@@ -1518,90 +1769,30 @@ void print_irqtrace_events(struct task_struct *curr)
1518 print_ip_sym(curr->softirq_disable_ip); 1769 print_ip_sym(curr->softirq_disable_ip);
1519} 1770}
1520 1771
1521#endif 1772static int hardirq_verbose(struct lock_class *class)
1522
1523static int
1524print_usage_bug(struct task_struct *curr, struct held_lock *this,
1525 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
1526{ 1773{
1527 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1774#if HARDIRQ_VERBOSE
1528 return 0; 1775 return class_filter(class);
1529 1776#endif
1530 printk("\n=================================\n");
1531 printk( "[ INFO: inconsistent lock state ]\n");
1532 print_kernel_version();
1533 printk( "---------------------------------\n");
1534
1535 printk("inconsistent {%s} -> {%s} usage.\n",
1536 usage_str[prev_bit], usage_str[new_bit]);
1537
1538 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
1539 curr->comm, curr->pid,
1540 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
1541 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
1542 trace_hardirqs_enabled(curr),
1543 trace_softirqs_enabled(curr));
1544 print_lock(this);
1545
1546 printk("{%s} state was registered at:\n", usage_str[prev_bit]);
1547 print_stack_trace(this->class->usage_traces + prev_bit, 1);
1548
1549 print_irqtrace_events(curr);
1550 printk("\nother info that might help us debug this:\n");
1551 lockdep_print_held_locks(curr);
1552
1553 printk("\nstack backtrace:\n");
1554 dump_stack();
1555
1556 return 0; 1777 return 0;
1557} 1778}
1558 1779
1559/* 1780static int softirq_verbose(struct lock_class *class)
1560 * Print out an error if an invalid bit is set:
1561 */
1562static inline int
1563valid_state(struct task_struct *curr, struct held_lock *this,
1564 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
1565{ 1781{
1566 if (unlikely(this->class->usage_mask & (1 << bad_bit))) 1782#if SOFTIRQ_VERBOSE
1567 return print_usage_bug(curr, this, bad_bit, new_bit); 1783 return class_filter(class);
1568 return 1; 1784#endif
1785 return 0;
1569} 1786}
1570 1787
1571#define STRICT_READ_CHECKS 1 1788#define STRICT_READ_CHECKS 1
1572 1789
1573/* 1790static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1574 * Mark a lock with a usage bit, and validate the state transition: 1791 enum lock_usage_bit new_bit)
1575 */
1576static int mark_lock(struct task_struct *curr, struct held_lock *this,
1577 enum lock_usage_bit new_bit)
1578{ 1792{
1579 unsigned int new_mask = 1 << new_bit, ret = 1; 1793 int ret = 1;
1580
1581 /*
1582 * If already set then do not dirty the cacheline,
1583 * nor do any checks:
1584 */
1585 if (likely(this->class->usage_mask & new_mask))
1586 return 1;
1587
1588 if (!graph_lock())
1589 return 0;
1590 /*
1591 * Make sure we didnt race:
1592 */
1593 if (unlikely(this->class->usage_mask & new_mask)) {
1594 graph_unlock();
1595 return 1;
1596 }
1597
1598 this->class->usage_mask |= new_mask;
1599 1794
1600 if (!save_trace(this->class->usage_traces + new_bit)) 1795 switch(new_bit) {
1601 return 0;
1602
1603 switch (new_bit) {
1604#ifdef CONFIG_TRACE_IRQFLAGS
1604#ifdef CONFIG_TRACE_IRQFLAGS
1605 case LOCK_USED_IN_HARDIRQ: 1796 case LOCK_USED_IN_HARDIRQ:
1606 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) 1797 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
1607 return 0; 1798 return 0;
@@ -1760,37 +1951,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1760 if (softirq_verbose(this->class)) 1951 if (softirq_verbose(this->class))
1761 ret = 2; 1952 ret = 2;
1762 break; 1953 break;
1763#endif
1764 case LOCK_USED:
1765 /*
1766 * Add it to the global list of classes:
1767 */
1768 list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
1769 debug_atomic_dec(&nr_unused_locks);
1770 break;
1771 default: 1954 default:
1772 if (!debug_locks_off_graph_unlock())
1773 return 0;
1774 WARN_ON(1); 1955 WARN_ON(1);
1775 return 0; 1956 break;
1776 }
1777
1778 graph_unlock();
1779
1780 /*
1781 * We must printk outside of the graph_lock:
1782 */
1783 if (ret == 2) {
1784 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
1785 print_lock(this);
1786 print_irqtrace_events(curr);
1787 dump_stack();
1788 } 1957 }
1789 1958
1790 return ret; 1959 return ret;
1791} 1960}
1792 1961
1793#ifdef CONFIG_TRACE_IRQFLAGS
1794/* 1962/*
1795 * Mark all held locks with a usage bit: 1963 * Mark all held locks with a usage bit:
1796 */ 1964 */
@@ -1973,9 +2141,176 @@ void trace_softirqs_off(unsigned long ip)
1973 debug_atomic_inc(&redundant_softirqs_off); 2141 debug_atomic_inc(&redundant_softirqs_off);
1974} 2142}
1975 2143
2144static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2145{
2146 /*
2147 * If non-trylock use in a hardirq or softirq context, then
2148 * mark the lock as used in these contexts:
2149 */
2150 if (!hlock->trylock) {
2151 if (hlock->read) {
2152 if (curr->hardirq_context)
2153 if (!mark_lock(curr, hlock,
2154 LOCK_USED_IN_HARDIRQ_READ))
2155 return 0;
2156 if (curr->softirq_context)
2157 if (!mark_lock(curr, hlock,
2158 LOCK_USED_IN_SOFTIRQ_READ))
2159 return 0;
2160 } else {
2161 if (curr->hardirq_context)
2162 if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
2163 return 0;
2164 if (curr->softirq_context)
2165 if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
2166 return 0;
2167 }
2168 }
2169 if (!hlock->hardirqs_off) {
2170 if (hlock->read) {
2171 if (!mark_lock(curr, hlock,
2172 LOCK_ENABLED_HARDIRQS_READ))
2173 return 0;
2174 if (curr->softirqs_enabled)
2175 if (!mark_lock(curr, hlock,
2176 LOCK_ENABLED_SOFTIRQS_READ))
2177 return 0;
2178 } else {
2179 if (!mark_lock(curr, hlock,
2180 LOCK_ENABLED_HARDIRQS))
2181 return 0;
2182 if (curr->softirqs_enabled)
2183 if (!mark_lock(curr, hlock,
2184 LOCK_ENABLED_SOFTIRQS))
2185 return 0;
2186 }
2187 }
2188
2189 return 1;
2190}
2191
2192static int separate_irq_context(struct task_struct *curr,
2193 struct held_lock *hlock)
2194{
2195 unsigned int depth = curr->lockdep_depth;
2196
2197 /*
2198 * Keep track of points where we cross into an interrupt context:
2199 */
2200 hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
2201 curr->softirq_context;
2202 if (depth) {
2203 struct held_lock *prev_hlock;
2204
2205 prev_hlock = curr->held_locks + depth-1;
2206 /*
2207 * If we cross into another context, reset the
2208 * hash key (this also prevents the checking and the
2209 * adding of the dependency to 'prev'):
2210 */
2211 if (prev_hlock->irq_context != hlock->irq_context)
2212 return 1;
2213 }
2214 return 0;
2215}
2216
2217#else
2218
2219static inline
2220int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
2221 enum lock_usage_bit new_bit)
2222{
2223 WARN_ON(1);
2224 return 1;
2225}
2226
2227static inline int mark_irqflags(struct task_struct *curr,
2228 struct held_lock *hlock)
2229{
2230 return 1;
2231}
2232
2233static inline int separate_irq_context(struct task_struct *curr,
2234 struct held_lock *hlock)
2235{
2236 return 0;
2237}
2238
1976#endif 2239#endif
1977 2240
1978/* 2241/*
2242 * Mark a lock with a usage bit, and validate the state transition:
2243 */
2244static int mark_lock(struct task_struct *curr, struct held_lock *this,
2245 enum lock_usage_bit new_bit)
2246{
2247 unsigned int new_mask = 1 << new_bit, ret = 1;
2248
2249 /*
2250 * If already set then do not dirty the cacheline,
2251 * nor do any checks:
2252 */
2253 if (likely(this->class->usage_mask & new_mask))
2254 return 1;
2255
2256 if (!graph_lock())
2257 return 0;
2258 /*
2259 * Make sure we didnt race:
2260 */
2261 if (unlikely(this->class->usage_mask & new_mask)) {
2262 graph_unlock();
2263 return 1;
2264 }
2265
2266 this->class->usage_mask |= new_mask;
2267
2268 if (!save_trace(this->class->usage_traces + new_bit))
2269 return 0;
2270
2271 switch (new_bit) {
2272 case LOCK_USED_IN_HARDIRQ:
2273 case LOCK_USED_IN_SOFTIRQ:
2274 case LOCK_USED_IN_HARDIRQ_READ:
2275 case LOCK_USED_IN_SOFTIRQ_READ:
2276 case LOCK_ENABLED_HARDIRQS:
2277 case LOCK_ENABLED_SOFTIRQS:
2278 case LOCK_ENABLED_HARDIRQS_READ:
2279 case LOCK_ENABLED_SOFTIRQS_READ:
2280 ret = mark_lock_irq(curr, this, new_bit);
2281 if (!ret)
2282 return 0;
2283 break;
2284 case LOCK_USED:
2285 /*
2286 * Add it to the global list of classes:
2287 */
2288 list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
2289 debug_atomic_dec(&nr_unused_locks);
2290 break;
2291 default:
2292 if (!debug_locks_off_graph_unlock())
2293 return 0;
2294 WARN_ON(1);
2295 return 0;
2296 }
2297
2298 graph_unlock();
2299
2300 /*
2301 * We must printk outside of the graph_lock:
2302 */
2303 if (ret == 2) {
2304 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
2305 print_lock(this);
2306 print_irqtrace_events(curr);
2307 dump_stack();
2308 }
2309
2310 return ret;
2311}
2312
2313/*
1979 * Initialize a lock instance's lock-class mapping info: 2314 * Initialize a lock instance's lock-class mapping info:
1980 */ 2315 */
1981void lockdep_init_map(struct lockdep_map *lock, const char *name, 2316void lockdep_init_map(struct lockdep_map *lock, const char *name,
@@ -1999,6 +2334,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
1999 lock->name = name; 2334 lock->name = name;
2000 lock->key = key; 2335 lock->key = key;
2001 lock->class_cache = NULL; 2336 lock->class_cache = NULL;
2337#ifdef CONFIG_LOCK_STAT
2338 lock->cpu = raw_smp_processor_id();
2339#endif
2002 if (subclass) 2340 if (subclass)
2003 register_lock_class(lock, subclass, 1); 2341 register_lock_class(lock, subclass, 1);
2004} 2342}
@@ -2020,6 +2358,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2020 int chain_head = 0; 2358 int chain_head = 0;
2021 u64 chain_key; 2359 u64 chain_key;
2022 2360
2361 if (!prove_locking)
2362 check = 1;
2363
2023 if (unlikely(!debug_locks)) 2364 if (unlikely(!debug_locks))
2024 return 0; 2365 return 0;
2025 2366
@@ -2070,57 +2411,18 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2070 hlock->read = read; 2411 hlock->read = read;
2071 hlock->check = check; 2412 hlock->check = check;
2072 hlock->hardirqs_off = hardirqs_off; 2413 hlock->hardirqs_off = hardirqs_off;
2073 2414#ifdef CONFIG_LOCK_STAT
2074 if (check != 2) 2415 hlock->waittime_stamp = 0;
2075 goto out_calc_hash; 2416 hlock->holdtime_stamp = sched_clock();
2076#ifdef CONFIG_TRACE_IRQFLAGS
2077 /*
2078 * If non-trylock use in a hardirq or softirq context, then
2079 * mark the lock as used in these contexts:
2080 */
2081 if (!trylock) {
2082 if (read) {
2083 if (curr->hardirq_context)
2084 if (!mark_lock(curr, hlock,
2085 LOCK_USED_IN_HARDIRQ_READ))
2086 return 0;
2087 if (curr->softirq_context)
2088 if (!mark_lock(curr, hlock,
2089 LOCK_USED_IN_SOFTIRQ_READ))
2090 return 0;
2091 } else {
2092 if (curr->hardirq_context)
2093 if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
2094 return 0;
2095 if (curr->softirq_context)
2096 if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
2097 return 0;
2098 }
2099 }
2100 if (!hardirqs_off) {
2101 if (read) {
2102 if (!mark_lock(curr, hlock,
2103 LOCK_ENABLED_HARDIRQS_READ))
2104 return 0;
2105 if (curr->softirqs_enabled)
2106 if (!mark_lock(curr, hlock,
2107 LOCK_ENABLED_SOFTIRQS_READ))
2108 return 0;
2109 } else {
2110 if (!mark_lock(curr, hlock,
2111 LOCK_ENABLED_HARDIRQS))
2112 return 0;
2113 if (curr->softirqs_enabled)
2114 if (!mark_lock(curr, hlock,
2115 LOCK_ENABLED_SOFTIRQS))
2116 return 0;
2117 }
2118 }
2119#endif 2417#endif
2418
2419 if (check == 2 && !mark_irqflags(curr, hlock))
2420 return 0;
2421
2120 /* mark it as used: */ 2422 /* mark it as used: */
2121 if (!mark_lock(curr, hlock, LOCK_USED)) 2423 if (!mark_lock(curr, hlock, LOCK_USED))
2122 return 0; 2424 return 0;
2123out_calc_hash: 2425
2124 /* 2426 /*
2125 * Calculate the chain hash: it's the combined hash of all the 2427
2126 * lock keys along the dependency chain. We save the hash value 2428 * lock keys along the dependency chain. We save the hash value
@@ -2143,77 +2445,15 @@ out_calc_hash:
2143 } 2445 }
2144 2446
2145 hlock->prev_chain_key = chain_key; 2447 hlock->prev_chain_key = chain_key;
2146 2448 if (separate_irq_context(curr, hlock)) {
2147#ifdef CONFIG_TRACE_IRQFLAGS 2449 chain_key = 0;
2148 /* 2450 chain_head = 1;
2149 * Keep track of points where we cross into an interrupt context:
2150 */
2151 hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
2152 curr->softirq_context;
2153 if (depth) {
2154 struct held_lock *prev_hlock;
2155
2156 prev_hlock = curr->held_locks + depth-1;
2157 /*
2158 * If we cross into another context, reset the
2159 * hash key (this also prevents the checking and the
2160 * adding of the dependency to 'prev'):
2161 */
2162 if (prev_hlock->irq_context != hlock->irq_context) {
2163 chain_key = 0;
2164 chain_head = 1;
2165 }
2166 } 2451 }
2167#endif
2168 chain_key = iterate_chain_key(chain_key, id); 2452 chain_key = iterate_chain_key(chain_key, id);
2169 curr->curr_chain_key = chain_key; 2453 curr->curr_chain_key = chain_key;
2170 2454
2171 /* 2455 if (!validate_chain(curr, lock, hlock, chain_head))
2172 * Trylock needs to maintain the stack of held locks, but it 2456 return 0;
2173 * does not add new dependencies, because trylock can be done
2174 * in any order.
2175 *
2176 * We look up the chain_key and do the O(N^2) check and update of
2177 * the dependencies only if this is a new dependency chain.
2178 * (If lookup_chain_cache() returns with 1 it acquires
2179 * graph_lock for us)
2180 */
2181 if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) {
2182 /*
2183 * Check whether last held lock:
2184 *
2185 * - is irq-safe, if this lock is irq-unsafe
2186 * - is softirq-safe, if this lock is hardirq-unsafe
2187 *
2188 * And check whether the new lock's dependency graph
2189 * could lead back to the previous lock.
2190 *
2191 * any of these scenarios could lead to a deadlock. If
2192 * All validations
2193 */
2194 int ret = check_deadlock(curr, hlock, lock, read);
2195
2196 if (!ret)
2197 return 0;
2198 /*
2199 * Mark recursive read, as we jump over it when
2200 * building dependencies (just like we jump over
2201 * trylock entries):
2202 */
2203 if (ret == 2)
2204 hlock->read = 2;
2205 /*
2206 * Add dependency only if this lock is not the head
2207 * of the chain, and if it's not a secondary read-lock:
2208 */
2209 if (!chain_head && ret != 2)
2210 if (!check_prevs_add(curr, hlock))
2211 return 0;
2212 graph_unlock();
2213 } else
2214 /* after lookup_chain_cache(): */
2215 if (unlikely(!debug_locks))
2216 return 0;
2217 2457
2218 curr->lockdep_depth++; 2458 curr->lockdep_depth++;
2219 check_chain_key(curr); 2459 check_chain_key(curr);
@@ -2315,6 +2555,8 @@ lock_release_non_nested(struct task_struct *curr,
2315 return print_unlock_inbalance_bug(curr, lock, ip); 2555 return print_unlock_inbalance_bug(curr, lock, ip);
2316 2556
2317found_it: 2557found_it:
2558 lock_release_holdtime(hlock);
2559
2318 /* 2560 /*
2319 * We have the right lock to unlock, 'hlock' points to it. 2561 * We have the right lock to unlock, 'hlock' points to it.
2320 * Now we remove it from the stack, and add back the other 2562 * Now we remove it from the stack, and add back the other
@@ -2367,6 +2609,8 @@ static int lock_release_nested(struct task_struct *curr,
2367 2609
2368 curr->curr_chain_key = hlock->prev_chain_key; 2610 curr->curr_chain_key = hlock->prev_chain_key;
2369 2611
2612 lock_release_holdtime(hlock);
2613
2370#ifdef CONFIG_DEBUG_LOCKDEP 2614#ifdef CONFIG_DEBUG_LOCKDEP
2371 hlock->prev_chain_key = 0; 2615 hlock->prev_chain_key = 0;
2372 hlock->class = NULL; 2616 hlock->class = NULL;
@@ -2441,6 +2685,9 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2441{ 2685{
2442 unsigned long flags; 2686 unsigned long flags;
2443 2687
2688 if (unlikely(!lock_stat && !prove_locking))
2689 return;
2690
2444 if (unlikely(current->lockdep_recursion)) 2691 if (unlikely(current->lockdep_recursion))
2445 return; 2692 return;
2446 2693
@@ -2460,6 +2707,9 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2460{ 2707{
2461 unsigned long flags; 2708 unsigned long flags;
2462 2709
2710 if (unlikely(!lock_stat && !prove_locking))
2711 return;
2712
2463 if (unlikely(current->lockdep_recursion)) 2713 if (unlikely(current->lockdep_recursion))
2464 return; 2714 return;
2465 2715
@@ -2473,6 +2723,166 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2473 2723
2474EXPORT_SYMBOL_GPL(lock_release); 2724EXPORT_SYMBOL_GPL(lock_release);
2475 2725
2726#ifdef CONFIG_LOCK_STAT
2727static int
2728print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
2729 unsigned long ip)
2730{
2731 if (!debug_locks_off())
2732 return 0;
2733 if (debug_locks_silent)
2734 return 0;
2735
2736 printk("\n=================================\n");
2737 printk( "[ BUG: bad contention detected! ]\n");
2738 printk( "---------------------------------\n");
2739 printk("%s/%d is trying to contend lock (",
2740 curr->comm, curr->pid);
2741 print_lockdep_cache(lock);
2742 printk(") at:\n");
2743 print_ip_sym(ip);
2744 printk("but there are no locks held!\n");
2745 printk("\nother info that might help us debug this:\n");
2746 lockdep_print_held_locks(curr);
2747
2748 printk("\nstack backtrace:\n");
2749 dump_stack();
2750
2751 return 0;
2752}
2753
2754static void
2755__lock_contended(struct lockdep_map *lock, unsigned long ip)
2756{
2757 struct task_struct *curr = current;
2758 struct held_lock *hlock, *prev_hlock;
2759 struct lock_class_stats *stats;
2760 unsigned int depth;
2761 int i, point;
2762
2763 depth = curr->lockdep_depth;
2764 if (DEBUG_LOCKS_WARN_ON(!depth))
2765 return;
2766
2767 prev_hlock = NULL;
2768 for (i = depth-1; i >= 0; i--) {
2769 hlock = curr->held_locks + i;
2770 /*
2771 * We must not cross into another context:
2772 */
2773 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2774 break;
2775 if (hlock->instance == lock)
2776 goto found_it;
2777 prev_hlock = hlock;
2778 }
2779 print_lock_contention_bug(curr, lock, ip);
2780 return;
2781
2782found_it:
2783 hlock->waittime_stamp = sched_clock();
2784
2785 point = lock_contention_point(hlock->class, ip);
2786
2787 stats = get_lock_stats(hlock->class);
2788 if (point < ARRAY_SIZE(stats->contention_point))
2789 stats->contention_point[point]++;
2790 if (lock->cpu != smp_processor_id())
2791 stats->bounces[bounce_contended + !!hlock->read]++;
2792 put_lock_stats(stats);
2793}
2794
2795static void
2796__lock_acquired(struct lockdep_map *lock)
2797{
2798 struct task_struct *curr = current;
2799 struct held_lock *hlock, *prev_hlock;
2800 struct lock_class_stats *stats;
2801 unsigned int depth;
2802 u64 now;
2803 s64 waittime = 0;
2804 int i, cpu;
2805
2806 depth = curr->lockdep_depth;
2807 if (DEBUG_LOCKS_WARN_ON(!depth))
2808 return;
2809
2810 prev_hlock = NULL;
2811 for (i = depth-1; i >= 0; i--) {
2812 hlock = curr->held_locks + i;
2813 /*
2814 * We must not cross into another context:
2815 */
2816 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2817 break;
2818 if (hlock->instance == lock)
2819 goto found_it;
2820 prev_hlock = hlock;
2821 }
2822 print_lock_contention_bug(curr, lock, _RET_IP_);
2823 return;
2824
2825found_it:
2826 cpu = smp_processor_id();
2827 if (hlock->waittime_stamp) {
2828 now = sched_clock();
2829 waittime = now - hlock->waittime_stamp;
2830 hlock->holdtime_stamp = now;
2831 }
2832
2833 stats = get_lock_stats(hlock->class);
2834 if (waittime) {
2835 if (hlock->read)
2836 lock_time_inc(&stats->read_waittime, waittime);
2837 else
2838 lock_time_inc(&stats->write_waittime, waittime);
2839 }
2840 if (lock->cpu != cpu)
2841 stats->bounces[bounce_acquired + !!hlock->read]++;
2842 put_lock_stats(stats);
2843
2844 lock->cpu = cpu;
2845}
2846
2847void lock_contended(struct lockdep_map *lock, unsigned long ip)
2848{
2849 unsigned long flags;
2850
2851 if (unlikely(!lock_stat))
2852 return;
2853
2854 if (unlikely(current->lockdep_recursion))
2855 return;
2856
2857 raw_local_irq_save(flags);
2858 check_flags(flags);
2859 current->lockdep_recursion = 1;
2860 __lock_contended(lock, ip);
2861 current->lockdep_recursion = 0;
2862 raw_local_irq_restore(flags);
2863}
2864EXPORT_SYMBOL_GPL(lock_contended);
2865
2866void lock_acquired(struct lockdep_map *lock)
2867{
2868 unsigned long flags;
2869
2870 if (unlikely(!lock_stat))
2871 return;
2872
2873 if (unlikely(current->lockdep_recursion))
2874 return;
2875
2876 raw_local_irq_save(flags);
2877 check_flags(flags);
2878 current->lockdep_recursion = 1;
2879 __lock_acquired(lock);
2880 current->lockdep_recursion = 0;
2881 raw_local_irq_restore(flags);
2882}
2883EXPORT_SYMBOL_GPL(lock_acquired);
2884#endif
2885
2476/* 2886/*
2477 * Used by the testsuite, sanitize the validator state 2887 * Used by the testsuite, sanitize the validator state
2478 * after a simulated failure: 2888 * after a simulated failure:
@@ -2636,8 +3046,11 @@ void __init lockdep_info(void)
2636 sizeof(struct held_lock) * MAX_LOCK_DEPTH); 3046 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
2637 3047
2638#ifdef CONFIG_DEBUG_LOCKDEP 3048#ifdef CONFIG_DEBUG_LOCKDEP
2639 if (lockdep_init_error) 3049 if (lockdep_init_error) {
2640 printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n"); 3050 printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n");
3051 printk("Call stack leading to lockdep invocation was:\n");
3052 print_stack_trace(&lockdep_init_trace, 0);
3053 }
2641#endif 3054#endif
2642} 3055}
2643 3056
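
The CONFIG_LOCK_STAT plumbing added above stamps each held_lock twice: waittime_stamp when __lock_contended() finds the lock busy, and holdtime_stamp when the lock is finally taken, with the wait time folded into the class statistics by __lock_acquired() and the hold time accounted by lock_release_holdtime() on release. A minimal userspace model of that arithmetic, for illustration only (in the kernel the timestamps come from sched_clock() and the results feed lock_time_inc()):

/* Sketch of the per-lock timing bookkeeping under CONFIG_LOCK_STAT.
 * Names and the clock source are illustrative stand-ins. */
#include <stdint.h>
#include <time.h>

struct held_lock_times {
	uint64_t waittime_stamp;	/* set when contention starts   */
	uint64_t holdtime_stamp;	/* set when the lock is taken   */
};

static uint64_t now_ns(void)		/* stand-in for sched_clock()   */
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void on_contended(struct held_lock_times *h)
{
	h->waittime_stamp = now_ns();		/* wait begins            */
}

static uint64_t on_acquired(struct held_lock_times *h)
{
	uint64_t t = now_ns();
	uint64_t wait = h->waittime_stamp ? t - h->waittime_stamp : 0;

	h->holdtime_stamp = t;			/* hold begins            */
	return wait;				/* -> read/write_waittime */
}

static uint64_t on_release(const struct held_lock_times *h)
{
	return now_ns() - h->holdtime_stamp;	/* -> read/write_holdtime */
}
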
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 58f35e586e..9f17af4a24 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -5,7 +5,8 @@
5 * 5 *
6 * Started by Ingo Molnar: 6 * Started by Ingo Molnar:
7 * 7 *
8 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 8 * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * 10 *
10 * Code for /proc/lockdep and /proc/lockdep_stats: 11 * Code for /proc/lockdep and /proc/lockdep_stats:
11 * 12 *
@@ -15,6 +16,10 @@
15#include <linux/seq_file.h> 16#include <linux/seq_file.h>
16#include <linux/kallsyms.h> 17#include <linux/kallsyms.h>
17#include <linux/debug_locks.h> 18#include <linux/debug_locks.h>
19#include <linux/vmalloc.h>
20#include <linux/sort.h>
21#include <asm/uaccess.h>
22#include <asm/div64.h>
18 23
19#include "lockdep_internals.h" 24#include "lockdep_internals.h"
20 25
@@ -271,8 +276,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
271 if (nr_list_entries) 276 if (nr_list_entries)
272 factor = sum_forward_deps / nr_list_entries; 277 factor = sum_forward_deps / nr_list_entries;
273 278
279#ifdef CONFIG_PROVE_LOCKING
274 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 280 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
275 nr_lock_chains, MAX_LOCKDEP_CHAINS); 281 nr_lock_chains, MAX_LOCKDEP_CHAINS);
282#endif
276 283
277#ifdef CONFIG_TRACE_IRQFLAGS 284#ifdef CONFIG_TRACE_IRQFLAGS
278 seq_printf(m, " in-hardirq chains: %11u\n", 285 seq_printf(m, " in-hardirq chains: %11u\n",
@@ -342,6 +349,292 @@ static const struct file_operations proc_lockdep_stats_operations = {
342 .release = seq_release, 349 .release = seq_release,
343}; 350};
344 351
352#ifdef CONFIG_LOCK_STAT
353
354struct lock_stat_data {
355 struct lock_class *class;
356 struct lock_class_stats stats;
357};
358
359struct lock_stat_seq {
360 struct lock_stat_data *iter;
361 struct lock_stat_data *iter_end;
362 struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
363};
364
365/*
366 * sort on absolute number of contentions
367 */
368static int lock_stat_cmp(const void *l, const void *r)
369{
370 const struct lock_stat_data *dl = l, *dr = r;
371 unsigned long nl, nr;
372
373 nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr;
374 nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr;
375
376 return nr - nl;
377}
378
379static void seq_line(struct seq_file *m, char c, int offset, int length)
380{
381 int i;
382
383 for (i = 0; i < offset; i++)
384 seq_puts(m, " ");
385 for (i = 0; i < length; i++)
386 seq_printf(m, "%c", c);
387 seq_puts(m, "\n");
388}
389
390static void snprint_time(char *buf, size_t bufsiz, s64 nr)
391{
392 unsigned long rem;
393
394 rem = do_div(nr, 1000); /* XXX: do_div_signed */
395 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10);
396}
397
398static void seq_time(struct seq_file *m, s64 time)
399{
400 char num[15];
401
402 snprint_time(num, sizeof(num), time);
403 seq_printf(m, " %14s", num);
404}
405
406static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
407{
408 seq_printf(m, "%14lu", lt->nr);
409 seq_time(m, lt->min);
410 seq_time(m, lt->max);
411 seq_time(m, lt->total);
412}
413
414static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
415{
416 char name[39];
417 struct lock_class *class;
418 struct lock_class_stats *stats;
419 int i, namelen;
420
421 class = data->class;
422 stats = &data->stats;
423
424 namelen = 38;
425 if (class->name_version > 1)
426 namelen -= 2; /* XXX truncates versions > 9 */
427 if (class->subclass)
428 namelen -= 2;
429
430 if (!class->name) {
431 char str[KSYM_NAME_LEN];
432 const char *key_name;
433
434 key_name = __get_key_name(class->key, str);
435 snprintf(name, namelen, "%s", key_name);
436 } else {
437 snprintf(name, namelen, "%s", class->name);
438 }
439 namelen = strlen(name);
440 if (class->name_version > 1) {
441 snprintf(name+namelen, 3, "#%d", class->name_version);
442 namelen += 2;
443 }
444 if (class->subclass) {
445 snprintf(name+namelen, 3, "/%d", class->subclass);
446 namelen += 2;
447 }
448
449 if (stats->write_holdtime.nr) {
450 if (stats->read_holdtime.nr)
451 seq_printf(m, "%38s-W:", name);
452 else
453 seq_printf(m, "%40s:", name);
454
455 seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]);
456 seq_lock_time(m, &stats->write_waittime);
457 seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]);
458 seq_lock_time(m, &stats->write_holdtime);
459 seq_puts(m, "\n");
460 }
461
462 if (stats->read_holdtime.nr) {
463 seq_printf(m, "%38s-R:", name);
464 seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]);
465 seq_lock_time(m, &stats->read_waittime);
466 seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]);
467 seq_lock_time(m, &stats->read_holdtime);
468 seq_puts(m, "\n");
469 }
470
471 if (stats->read_waittime.nr + stats->write_waittime.nr == 0)
472 return;
473
474 if (stats->read_holdtime.nr)
475 namelen += 2;
476
477 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
478 char sym[KSYM_SYMBOL_LEN];
479 char ip[32];
480
481 if (class->contention_point[i] == 0)
482 break;
483
484 if (!i)
485 seq_line(m, '-', 40-namelen, namelen);
486
487 sprint_symbol(sym, class->contention_point[i]);
488 snprintf(ip, sizeof(ip), "[<%p>]",
489 (void *)class->contention_point[i]);
490 seq_printf(m, "%40s %14lu %29s %s\n", name,
491 stats->contention_point[i],
492 ip, sym);
493 }
494 if (i) {
495 seq_puts(m, "\n");
496 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
497 seq_puts(m, "\n");
498 }
499}
500
501static void seq_header(struct seq_file *m)
502{
503 seq_printf(m, "lock_stat version 0.2\n");
504 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
505 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
506 "%14s %14s\n",
507 "class name",
508 "con-bounces",
509 "contentions",
510 "waittime-min",
511 "waittime-max",
512 "waittime-total",
513 "acq-bounces",
514 "acquisitions",
515 "holdtime-min",
516 "holdtime-max",
517 "holdtime-total");
518 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
519 seq_printf(m, "\n");
520}
521
522static void *ls_start(struct seq_file *m, loff_t *pos)
523{
524 struct lock_stat_seq *data = m->private;
525
526 if (data->iter == data->stats)
527 seq_header(m);
528
529 if (data->iter == data->iter_end)
530 data->iter = NULL;
531
532 return data->iter;
533}
534
535static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
536{
537 struct lock_stat_seq *data = m->private;
538
539 (*pos)++;
540
541 data->iter = v;
542 data->iter++;
543 if (data->iter == data->iter_end)
544 data->iter = NULL;
545
546 return data->iter;
547}
548
549static void ls_stop(struct seq_file *m, void *v)
550{
551}
552
553static int ls_show(struct seq_file *m, void *v)
554{
555 struct lock_stat_seq *data = m->private;
556
557 seq_stats(m, data->iter);
558 return 0;
559}
560
561static struct seq_operations lockstat_ops = {
562 .start = ls_start,
563 .next = ls_next,
564 .stop = ls_stop,
565 .show = ls_show,
566};
567
568static int lock_stat_open(struct inode *inode, struct file *file)
569{
570 int res;
571 struct lock_class *class;
572 struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq));
573
574 if (!data)
575 return -ENOMEM;
576
577 res = seq_open(file, &lockstat_ops);
578 if (!res) {
579 struct lock_stat_data *iter = data->stats;
580 struct seq_file *m = file->private_data;
581
582 data->iter = iter;
583 list_for_each_entry(class, &all_lock_classes, lock_entry) {
584 iter->class = class;
585 iter->stats = lock_stats(class);
586 iter++;
587 }
588 data->iter_end = iter;
589
590 sort(data->stats, data->iter_end - data->iter,
591 sizeof(struct lock_stat_data),
592 lock_stat_cmp, NULL);
593
594 m->private = data;
595 } else
596 vfree(data);
597
598 return res;
599}
600
601static ssize_t lock_stat_write(struct file *file, const char __user *buf,
602 size_t count, loff_t *ppos)
603{
604 struct lock_class *class;
605 char c;
606
607 if (count) {
608 if (get_user(c, buf))
609 return -EFAULT;
610
611 if (c != '0')
612 return count;
613
614 list_for_each_entry(class, &all_lock_classes, lock_entry)
615 clear_lock_stats(class);
616 }
617 return count;
618}
619
620static int lock_stat_release(struct inode *inode, struct file *file)
621{
622 struct seq_file *seq = file->private_data;
623
624 vfree(seq->private);
625 seq->private = NULL;
626 return seq_release(inode, file);
627}
628
629static const struct file_operations proc_lock_stat_operations = {
630 .open = lock_stat_open,
631 .write = lock_stat_write,
632 .read = seq_read,
633 .llseek = seq_lseek,
634 .release = lock_stat_release,
635};
636#endif /* CONFIG_LOCK_STAT */
637
345static int __init lockdep_proc_init(void) 638static int __init lockdep_proc_init(void)
346{ 639{
347 struct proc_dir_entry *entry; 640 struct proc_dir_entry *entry;
@@ -354,6 +647,12 @@ static int __init lockdep_proc_init(void)
354 if (entry) 647 if (entry)
355 entry->proc_fops = &proc_lockdep_stats_operations; 648 entry->proc_fops = &proc_lockdep_stats_operations;
356 649
650#ifdef CONFIG_LOCK_STAT
651 entry = create_proc_entry("lock_stat", S_IRUSR, NULL);
652 if (entry)
653 entry->proc_fops = &proc_lock_stat_operations;
654#endif
655
357 return 0; 656 return 0;
358} 657}
359 658
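
The /proc/lock_stat file created above is read through the seq_file iterator and reset through lock_stat_write(): any write whose first character is '0' clears every class's statistics, while other writes are ignored. A small hypothetical userspace helper that dumps the file and then resets the counters:

/* Hypothetical helper: dump /proc/lock_stat, then reset the counters by
 * writing "0", as accepted by lock_stat_write() above. */
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/lock_stat", "r");

	if (!f) {
		perror("/proc/lock_stat");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);

	f = fopen("/proc/lock_stat", "w");
	if (f) {
		fputs("0", f);	/* first char '0' => clear_lock_stats() per class */
		fclose(f);
	}
	return 0;
}
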
diff --git a/kernel/module.c b/kernel/module.c
index 9bd93de01f..33c04ad511 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -61,10 +61,8 @@ extern int module_sysfs_initialized;
61/* If this is set, the section belongs in the init part of the module */ 61/* If this is set, the section belongs in the init part of the module */
62#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 62#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
63 63
64/* Protects module list */ 64/* List of modules, protected by module_mutex or preempt_disable
65static DEFINE_SPINLOCK(modlist_lock); 65 * (add/delete uses stop_machine). */
66
67/* List of modules, protected by module_mutex AND modlist_lock */
68static DEFINE_MUTEX(module_mutex); 66static DEFINE_MUTEX(module_mutex);
69static LIST_HEAD(modules); 67static LIST_HEAD(modules);
70 68
@@ -488,8 +486,7 @@ static void free_modinfo_##field(struct module *mod) \
488 mod->field = NULL; \ 486 mod->field = NULL; \
489} \ 487} \
490static struct module_attribute modinfo_##field = { \ 488static struct module_attribute modinfo_##field = { \
491 .attr = { .name = __stringify(field), .mode = 0444, \ 489 .attr = { .name = __stringify(field), .mode = 0444 }, \
492 .owner = THIS_MODULE }, \
493 .show = show_modinfo_##field, \ 490 .show = show_modinfo_##field, \
494 .setup = setup_modinfo_##field, \ 491 .setup = setup_modinfo_##field, \
495 .test = modinfo_##field##_exists, \ 492 .test = modinfo_##field##_exists, \
@@ -761,14 +758,13 @@ static void print_unload_info(struct seq_file *m, struct module *mod)
761void __symbol_put(const char *symbol) 758void __symbol_put(const char *symbol)
762{ 759{
763 struct module *owner; 760 struct module *owner;
764 unsigned long flags;
765 const unsigned long *crc; 761 const unsigned long *crc;
766 762
767 spin_lock_irqsave(&modlist_lock, flags); 763 preempt_disable();
768 if (!__find_symbol(symbol, &owner, &crc, 1)) 764 if (!__find_symbol(symbol, &owner, &crc, 1))
769 BUG(); 765 BUG();
770 module_put(owner); 766 module_put(owner);
771 spin_unlock_irqrestore(&modlist_lock, flags); 767 preempt_enable();
772} 768}
773EXPORT_SYMBOL(__symbol_put); 769EXPORT_SYMBOL(__symbol_put);
774 770
@@ -793,7 +789,7 @@ static ssize_t show_refcnt(struct module_attribute *mattr,
793} 789}
794 790
795static struct module_attribute refcnt = { 791static struct module_attribute refcnt = {
796 .attr = { .name = "refcnt", .mode = 0444, .owner = THIS_MODULE }, 792 .attr = { .name = "refcnt", .mode = 0444 },
797 .show = show_refcnt, 793 .show = show_refcnt,
798}; 794};
799 795
@@ -851,7 +847,7 @@ static ssize_t show_initstate(struct module_attribute *mattr,
851} 847}
852 848
853static struct module_attribute initstate = { 849static struct module_attribute initstate = {
854 .attr = { .name = "initstate", .mode = 0444, .owner = THIS_MODULE }, 850 .attr = { .name = "initstate", .mode = 0444 },
855 .show = show_initstate, 851 .show = show_initstate,
856}; 852};
857 853
@@ -1032,7 +1028,6 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1032 sattr->mattr.show = module_sect_show; 1028 sattr->mattr.show = module_sect_show;
1033 sattr->mattr.store = NULL; 1029 sattr->mattr.store = NULL;
1034 sattr->mattr.attr.name = sattr->name; 1030 sattr->mattr.attr.name = sattr->name;
1035 sattr->mattr.attr.owner = mod;
1036 sattr->mattr.attr.mode = S_IRUGO; 1031 sattr->mattr.attr.mode = S_IRUGO;
1037 *(gattr++) = &(sattr++)->mattr.attr; 1032 *(gattr++) = &(sattr++)->mattr.attr;
1038 } 1033 }
@@ -1090,7 +1085,6 @@ int module_add_modinfo_attrs(struct module *mod)
1090 if (!attr->test || 1085 if (!attr->test ||
1091 (attr->test && attr->test(mod))) { 1086 (attr->test && attr->test(mod))) {
1092 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1087 memcpy(temp_attr, attr, sizeof(*temp_attr));
1093 temp_attr->attr.owner = mod;
1094 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1088 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
1095 ++temp_attr; 1089 ++temp_attr;
1096 } 1090 }
@@ -1231,14 +1225,14 @@ static void free_module(struct module *mod)
1231void *__symbol_get(const char *symbol) 1225void *__symbol_get(const char *symbol)
1232{ 1226{
1233 struct module *owner; 1227 struct module *owner;
1234 unsigned long value, flags; 1228 unsigned long value;
1235 const unsigned long *crc; 1229 const unsigned long *crc;
1236 1230
1237 spin_lock_irqsave(&modlist_lock, flags); 1231 preempt_disable();
1238 value = __find_symbol(symbol, &owner, &crc, 1); 1232 value = __find_symbol(symbol, &owner, &crc, 1);
1239 if (value && !strong_try_module_get(owner)) 1233 if (value && !strong_try_module_get(owner))
1240 value = 0; 1234 value = 0;
1241 spin_unlock_irqrestore(&modlist_lock, flags); 1235 preempt_enable();
1242 1236
1243 return (void *)value; 1237 return (void *)value;
1244} 1238}
@@ -2139,7 +2133,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2139 sym = get_ksymbol(mod, addr, NULL, NULL); 2133 sym = get_ksymbol(mod, addr, NULL, NULL);
2140 if (!sym) 2134 if (!sym)
2141 goto out; 2135 goto out;
2142 strlcpy(symname, sym, KSYM_NAME_LEN + 1); 2136 strlcpy(symname, sym, KSYM_NAME_LEN);
2143 mutex_unlock(&module_mutex); 2137 mutex_unlock(&module_mutex);
2144 return 0; 2138 return 0;
2145 } 2139 }
@@ -2164,9 +2158,9 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2164 if (!sym) 2158 if (!sym)
2165 goto out; 2159 goto out;
2166 if (modname) 2160 if (modname)
2167 strlcpy(modname, mod->name, MODULE_NAME_LEN + 1); 2161 strlcpy(modname, mod->name, MODULE_NAME_LEN);
2168 if (name) 2162 if (name)
2169 strlcpy(name, sym, KSYM_NAME_LEN + 1); 2163 strlcpy(name, sym, KSYM_NAME_LEN);
2170 mutex_unlock(&module_mutex); 2164 mutex_unlock(&module_mutex);
2171 return 0; 2165 return 0;
2172 } 2166 }
@@ -2187,8 +2181,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2187 *value = mod->symtab[symnum].st_value; 2181 *value = mod->symtab[symnum].st_value;
2188 *type = mod->symtab[symnum].st_info; 2182 *type = mod->symtab[symnum].st_info;
2189 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, 2183 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
2190 KSYM_NAME_LEN + 1); 2184 KSYM_NAME_LEN);
2191 strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1); 2185 strlcpy(module_name, mod->name, MODULE_NAME_LEN);
2192 *exported = is_exported(name, mod); 2186 *exported = is_exported(name, mod);
2193 mutex_unlock(&module_mutex); 2187 mutex_unlock(&module_mutex);
2194 return 0; 2188 return 0;
@@ -2235,26 +2229,13 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2235/* Called by the /proc file system to return a list of modules. */ 2229/* Called by the /proc file system to return a list of modules. */
2236static void *m_start(struct seq_file *m, loff_t *pos) 2230static void *m_start(struct seq_file *m, loff_t *pos)
2237{ 2231{
2238 struct list_head *i;
2239 loff_t n = 0;
2240
2241 mutex_lock(&module_mutex); 2232 mutex_lock(&module_mutex);
2242 list_for_each(i, &modules) { 2233 return seq_list_start(&modules, *pos);
2243 if (n++ == *pos)
2244 break;
2245 }
2246 if (i == &modules)
2247 return NULL;
2248 return i;
2249} 2234}
2250 2235
2251static void *m_next(struct seq_file *m, void *p, loff_t *pos) 2236static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2252{ 2237{
2253 struct list_head *i = p; 2238 return seq_list_next(p, &modules, pos);
2254 (*pos)++;
2255 if (i->next == &modules)
2256 return NULL;
2257 return i->next;
2258} 2239}
2259 2240
2260static void m_stop(struct seq_file *m, void *p) 2241static void m_stop(struct seq_file *m, void *p)
@@ -2324,11 +2305,10 @@ const struct seq_operations modules_op = {
2324/* Given an address, look for it in the module exception tables. */ 2305/* Given an address, look for it in the module exception tables. */
2325const struct exception_table_entry *search_module_extables(unsigned long addr) 2306const struct exception_table_entry *search_module_extables(unsigned long addr)
2326{ 2307{
2327 unsigned long flags;
2328 const struct exception_table_entry *e = NULL; 2308 const struct exception_table_entry *e = NULL;
2329 struct module *mod; 2309 struct module *mod;
2330 2310
2331 spin_lock_irqsave(&modlist_lock, flags); 2311 preempt_disable();
2332 list_for_each_entry(mod, &modules, list) { 2312 list_for_each_entry(mod, &modules, list) {
2333 if (mod->num_exentries == 0) 2313 if (mod->num_exentries == 0)
2334 continue; 2314 continue;
@@ -2339,7 +2319,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2339 if (e) 2319 if (e)
2340 break; 2320 break;
2341 } 2321 }
2342 spin_unlock_irqrestore(&modlist_lock, flags); 2322 preempt_enable();
2343 2323
2344 /* Now, if we found one, we are running inside it now, hence 2324 /* Now, if we found one, we are running inside it now, hence
2345 we cannot unload the module, hence no refcnt needed. */ 2325 we cannot unload the module, hence no refcnt needed. */
@@ -2351,25 +2331,24 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2351 */ 2331 */
2352int is_module_address(unsigned long addr) 2332int is_module_address(unsigned long addr)
2353{ 2333{
2354 unsigned long flags;
2355 struct module *mod; 2334 struct module *mod;
2356 2335
2357 spin_lock_irqsave(&modlist_lock, flags); 2336 preempt_disable();
2358 2337
2359 list_for_each_entry(mod, &modules, list) { 2338 list_for_each_entry(mod, &modules, list) {
2360 if (within(addr, mod->module_core, mod->core_size)) { 2339 if (within(addr, mod->module_core, mod->core_size)) {
2361 spin_unlock_irqrestore(&modlist_lock, flags); 2340 preempt_enable();
2362 return 1; 2341 return 1;
2363 } 2342 }
2364 } 2343 }
2365 2344
2366 spin_unlock_irqrestore(&modlist_lock, flags); 2345 preempt_enable();
2367 2346
2368 return 0; 2347 return 0;
2369} 2348}
2370 2349
2371 2350
2372/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ 2351/* Is this a valid kernel address? */
2373struct module *__module_text_address(unsigned long addr) 2352struct module *__module_text_address(unsigned long addr)
2374{ 2353{
2375 struct module *mod; 2354 struct module *mod;
@@ -2384,11 +2363,10 @@ struct module *__module_text_address(unsigned long addr)
2384struct module *module_text_address(unsigned long addr) 2363struct module *module_text_address(unsigned long addr)
2385{ 2364{
2386 struct module *mod; 2365 struct module *mod;
2387 unsigned long flags;
2388 2366
2389 spin_lock_irqsave(&modlist_lock, flags); 2367 preempt_disable();
2390 mod = __module_text_address(addr); 2368 mod = __module_text_address(addr);
2391 spin_unlock_irqrestore(&modlist_lock, flags); 2369 preempt_enable();
2392 2370
2393 return mod; 2371 return mod;
2394} 2372}
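
The module.c conversion drops modlist_lock in favour of the rule spelled out in the new comment: readers of the module list only need preempt_disable(), because additions and removals go through stop_machine(). Every converted helper follows the same shape; distilled as a sketch (only meaningful inside kernel/module.c, where 'modules' and within() live):

/* Reader-side pattern (sketch; mirrors is_module_address() above). */
static int addr_in_any_module(unsigned long addr)
{
	struct module *mod;
	int ret = 0;

	preempt_disable();	/* list is stable: add/del use stop_machine() */
	list_for_each_entry(mod, &modules, list) {
		if (within(addr, mod->module_core, mod->core_size)) {
			ret = 1;
			break;
		}
	}
	preempt_enable();
	return ret;
}
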
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 303eab1848..691b86564d 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -139,6 +139,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
139 list_add_tail(&waiter.list, &lock->wait_list); 139 list_add_tail(&waiter.list, &lock->wait_list);
140 waiter.task = task; 140 waiter.task = task;
141 141
142 old_val = atomic_xchg(&lock->count, -1);
143 if (old_val == 1)
144 goto done;
145
146 lock_contended(&lock->dep_map, _RET_IP_);
147
142 for (;;) { 148 for (;;) {
143 /* 149 /*
144 * Lets try to take the lock again - this is needed even if 150 * Lets try to take the lock again - this is needed even if
@@ -174,6 +180,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
174 spin_lock_mutex(&lock->wait_lock, flags); 180 spin_lock_mutex(&lock->wait_lock, flags);
175 } 181 }
176 182
183done:
184 lock_acquired(&lock->dep_map);
177 /* got the lock - rejoice! */ 185 /* got the lock - rejoice! */
178 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 186 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
179 debug_mutex_set_owner(lock, task_thread_info(task)); 187 debug_mutex_set_owner(lock, task_thread_info(task));
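
The mutex hunk shows how a lock implementation is expected to drive the new statistics hooks: after queueing, atomic_xchg(&lock->count, -1) returning 1 means the mutex was in fact free, so the waiter jumps to 'done' without ever reporting contention; otherwise lock_contended() marks the start of the wait and lock_acquired(), reached on both paths, ends it. The general protocol, sketched with placeholder helpers (my_trylock() and my_wait_for_lock() are not kernel APIs):

/* Sketch of the lock-stat annotation protocol for a lock implementation. */
void my_lock(struct my_lock *lock)
{
	if (my_trylock(lock))				/* uncontended fast path */
		goto done;

	lock_contended(&lock->dep_map, _RET_IP_);	/* wait starts here      */
	my_wait_for_lock(lock);				/* block until we own it */
done:
	lock_acquired(&lock->dep_map);			/* wait (if any) ends    */
}
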
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 9e83b589f7..a4fb7d4697 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -21,6 +21,8 @@
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23 23
24static struct kmem_cache *nsproxy_cachep;
25
24struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 26struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
25 27
26static inline void get_nsproxy(struct nsproxy *ns) 28static inline void get_nsproxy(struct nsproxy *ns)
@@ -43,9 +45,11 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
43{ 45{
44 struct nsproxy *ns; 46 struct nsproxy *ns;
45 47
46 ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL); 48 ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
47 if (ns) 49 if (ns) {
50 memcpy(ns, orig, sizeof(struct nsproxy));
48 atomic_set(&ns->count, 1); 51 atomic_set(&ns->count, 1);
52 }
49 return ns; 53 return ns;
50} 54}
51 55
@@ -54,33 +58,51 @@ static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
54 * Return the newly created nsproxy. Do not attach this to the task, 58 * Return the newly created nsproxy. Do not attach this to the task,
55 * leave it to the caller to do proper locking and attach it to task. 59 * leave it to the caller to do proper locking and attach it to task.
56 */ 60 */
57static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, 61static struct nsproxy *create_new_namespaces(unsigned long flags,
58 struct fs_struct *new_fs) 62 struct task_struct *tsk, struct fs_struct *new_fs)
59{ 63{
60 struct nsproxy *new_nsp; 64 struct nsproxy *new_nsp;
65 int err;
61 66
62 new_nsp = clone_nsproxy(tsk->nsproxy); 67 new_nsp = clone_nsproxy(tsk->nsproxy);
63 if (!new_nsp) 68 if (!new_nsp)
64 return ERR_PTR(-ENOMEM); 69 return ERR_PTR(-ENOMEM);
65 70
66 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); 71 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
67 if (IS_ERR(new_nsp->mnt_ns)) 72 if (IS_ERR(new_nsp->mnt_ns)) {
73 err = PTR_ERR(new_nsp->mnt_ns);
68 goto out_ns; 74 goto out_ns;
75 }
69 76
70 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); 77 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
71 if (IS_ERR(new_nsp->uts_ns)) 78 if (IS_ERR(new_nsp->uts_ns)) {
79 err = PTR_ERR(new_nsp->uts_ns);
72 goto out_uts; 80 goto out_uts;
81 }
73 82
74 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); 83 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
75 if (IS_ERR(new_nsp->ipc_ns)) 84 if (IS_ERR(new_nsp->ipc_ns)) {
85 err = PTR_ERR(new_nsp->ipc_ns);
76 goto out_ipc; 86 goto out_ipc;
87 }
77 88
78 new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); 89 new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns);
79 if (IS_ERR(new_nsp->pid_ns)) 90 if (IS_ERR(new_nsp->pid_ns)) {
91 err = PTR_ERR(new_nsp->pid_ns);
80 goto out_pid; 92 goto out_pid;
93 }
94
95 new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
96 if (IS_ERR(new_nsp->user_ns)) {
97 err = PTR_ERR(new_nsp->user_ns);
98 goto out_user;
99 }
81 100
82 return new_nsp; 101 return new_nsp;
83 102
103out_user:
104 if (new_nsp->pid_ns)
105 put_pid_ns(new_nsp->pid_ns);
84out_pid: 106out_pid:
85 if (new_nsp->ipc_ns) 107 if (new_nsp->ipc_ns)
86 put_ipc_ns(new_nsp->ipc_ns); 108 put_ipc_ns(new_nsp->ipc_ns);
@@ -91,15 +113,15 @@ out_uts:
91 if (new_nsp->mnt_ns) 113 if (new_nsp->mnt_ns)
92 put_mnt_ns(new_nsp->mnt_ns); 114 put_mnt_ns(new_nsp->mnt_ns);
93out_ns: 115out_ns:
94 kfree(new_nsp); 116 kmem_cache_free(nsproxy_cachep, new_nsp);
95 return ERR_PTR(-ENOMEM); 117 return ERR_PTR(err);
96} 118}
97 119
98/* 120/*
99 * called from clone. This now handles copy for nsproxy and all 121 * called from clone. This now handles copy for nsproxy and all
100 * namespaces therein. 122 * namespaces therein.
101 */ 123 */
102int copy_namespaces(int flags, struct task_struct *tsk) 124int copy_namespaces(unsigned long flags, struct task_struct *tsk)
103{ 125{
104 struct nsproxy *old_ns = tsk->nsproxy; 126 struct nsproxy *old_ns = tsk->nsproxy;
105 struct nsproxy *new_ns; 127 struct nsproxy *new_ns;
@@ -110,7 +132,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
110 132
111 get_nsproxy(old_ns); 133 get_nsproxy(old_ns);
112 134
113 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) 135 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER)))
114 return 0; 136 return 0;
115 137
116 if (!capable(CAP_SYS_ADMIN)) { 138 if (!capable(CAP_SYS_ADMIN)) {
@@ -140,7 +162,9 @@ void free_nsproxy(struct nsproxy *ns)
140 put_ipc_ns(ns->ipc_ns); 162 put_ipc_ns(ns->ipc_ns);
141 if (ns->pid_ns) 163 if (ns->pid_ns)
142 put_pid_ns(ns->pid_ns); 164 put_pid_ns(ns->pid_ns);
143 kfree(ns); 165 if (ns->user_ns)
166 put_user_ns(ns->user_ns);
167 kmem_cache_free(nsproxy_cachep, ns);
144} 168}
145 169
146/* 170/*
@@ -152,19 +176,10 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
152{ 176{
153 int err = 0; 177 int err = 0;
154 178
155 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) 179 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
180 CLONE_NEWUSER)))
156 return 0; 181 return 0;
157 182
158#ifndef CONFIG_IPC_NS
159 if (unshare_flags & CLONE_NEWIPC)
160 return -EINVAL;
161#endif
162
163#ifndef CONFIG_UTS_NS
164 if (unshare_flags & CLONE_NEWUTS)
165 return -EINVAL;
166#endif
167
168 if (!capable(CAP_SYS_ADMIN)) 183 if (!capable(CAP_SYS_ADMIN))
169 return -EPERM; 184 return -EPERM;
170 185
@@ -174,3 +189,12 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
174 err = PTR_ERR(*new_nsp); 189 err = PTR_ERR(*new_nsp);
175 return err; 190 return err;
176} 191}
192
193static int __init nsproxy_cache_init(void)
194{
195 nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy),
196 0, SLAB_PANIC, NULL);
197 return 0;
198}
199
200module_init(nsproxy_cache_init);
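
After this rework create_new_namespaces() propagates ERR_PTR(err) from whichever copy_*_ns() helper failed instead of flattening every failure to -ENOMEM, so callers can pass the encoded error straight through. An illustrative caller shape (not a function from this patch):

/* Illustrative only: how a caller consumes the propagated error. */
static int make_namespaces(unsigned long flags, struct task_struct *tsk)
{
	struct nsproxy *new_ns;

	new_ns = create_new_namespaces(flags, tsk, tsk->fs);
	if (IS_ERR(new_ns))
		return PTR_ERR(new_ns);	/* e.g. -EINVAL from a disabled ns type */

	tsk->nsproxy = new_ns;
	return 0;
}
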
diff --git a/kernel/panic.c b/kernel/panic.c
index 623d182825..f64f4c1ac1 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -159,14 +159,15 @@ const char *print_tainted(void)
159{ 159{
160 static char buf[20]; 160 static char buf[20];
161 if (tainted) { 161 if (tainted) {
162 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", 162 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c",
163 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 163 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
164 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 164 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
165 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 165 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
166 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', 166 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
167 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 167 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
168 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 168 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
169 tainted & TAINT_USER ? 'U' : ' '); 169 tainted & TAINT_USER ? 'U' : ' ',
170 tainted & TAINT_DIE ? 'D' : ' ');
170 } 171 }
171 else 172 else
172 snprintf(buf, sizeof(buf), "Not tainted"); 173 snprintf(buf, sizeof(buf), "Not tainted");
diff --git a/kernel/params.c b/kernel/params.c
index e61c46c97c..effbaaedd7 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -491,7 +491,6 @@ param_sysfs_setup(struct module_kobject *mk,
491 pattr->mattr.show = param_attr_show; 491 pattr->mattr.show = param_attr_show;
492 pattr->mattr.store = param_attr_store; 492 pattr->mattr.store = param_attr_store;
493 pattr->mattr.attr.name = (char *)&kp->name[name_skip]; 493 pattr->mattr.attr.name = (char *)&kp->name[name_skip];
494 pattr->mattr.attr.owner = mk->mod;
495 pattr->mattr.attr.mode = kp->perm; 494 pattr->mattr.attr.mode = kp->perm;
496 *(gattr++) = &(pattr++)->mattr.attr; 495 *(gattr++) = &(pattr++)->mattr.attr;
497 } 496 }
diff --git a/kernel/pid.c b/kernel/pid.c
index eb66bd2953..c6e3f9ffff 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -365,7 +365,7 @@ struct pid *find_ge_pid(int nr)
365} 365}
366EXPORT_SYMBOL_GPL(find_get_pid); 366EXPORT_SYMBOL_GPL(find_get_pid);
367 367
368struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns) 368struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
369{ 369{
370 BUG_ON(!old_ns); 370 BUG_ON(!old_ns);
371 get_pid_ns(old_ns); 371 get_pid_ns(old_ns);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 1de710e183..b53c8fcd9d 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struct task_struct *p)
161} 161}
162static inline unsigned long long sched_ns(struct task_struct *p) 162static inline unsigned long long sched_ns(struct task_struct *p)
163{ 163{
164 return (p == current) ? current_sched_time(p) : p->sched_time; 164 return task_sched_runtime(p);
165} 165}
166 166
167int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 167int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
@@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
246 } while (t != p); 246 } while (t != p);
247 break; 247 break;
248 case CPUCLOCK_SCHED: 248 case CPUCLOCK_SCHED:
249 cpu->sched = p->signal->sched_time; 249 cpu->sched = p->signal->sum_sched_runtime;
250 /* Add in each other live thread. */ 250 /* Add in each other live thread. */
251 while ((t = next_thread(t)) != p) { 251 while ((t = next_thread(t)) != p) {
252 cpu->sched += t->sched_time; 252 cpu->sched += t->se.sum_exec_runtime;
253 } 253 }
254 cpu->sched += sched_ns(p); 254 cpu->sched += sched_ns(p);
255 break; 255 break;
@@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
422 */ 422 */
423static void cleanup_timers(struct list_head *head, 423static void cleanup_timers(struct list_head *head,
424 cputime_t utime, cputime_t stime, 424 cputime_t utime, cputime_t stime,
425 unsigned long long sched_time) 425 unsigned long long sum_exec_runtime)
426{ 426{
427 struct cpu_timer_list *timer, *next; 427 struct cpu_timer_list *timer, *next;
428 cputime_t ptime = cputime_add(utime, stime); 428 cputime_t ptime = cputime_add(utime, stime);
@@ -451,10 +451,10 @@ static void cleanup_timers(struct list_head *head,
451 ++head; 451 ++head;
452 list_for_each_entry_safe(timer, next, head, entry) { 452 list_for_each_entry_safe(timer, next, head, entry) {
453 list_del_init(&timer->entry); 453 list_del_init(&timer->entry);
454 if (timer->expires.sched < sched_time) { 454 if (timer->expires.sched < sum_exec_runtime) {
455 timer->expires.sched = 0; 455 timer->expires.sched = 0;
456 } else { 456 } else {
457 timer->expires.sched -= sched_time; 457 timer->expires.sched -= sum_exec_runtime;
458 } 458 }
459 } 459 }
460} 460}
@@ -467,7 +467,7 @@ static void cleanup_timers(struct list_head *head,
467void posix_cpu_timers_exit(struct task_struct *tsk) 467void posix_cpu_timers_exit(struct task_struct *tsk)
468{ 468{
469 cleanup_timers(tsk->cpu_timers, 469 cleanup_timers(tsk->cpu_timers,
470 tsk->utime, tsk->stime, tsk->sched_time); 470 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
471 471
472} 472}
473void posix_cpu_timers_exit_group(struct task_struct *tsk) 473void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
475 cleanup_timers(tsk->signal->cpu_timers, 475 cleanup_timers(tsk->signal->cpu_timers,
476 cputime_add(tsk->utime, tsk->signal->utime), 476 cputime_add(tsk->utime, tsk->signal->utime),
477 cputime_add(tsk->stime, tsk->signal->stime), 477 cputime_add(tsk->stime, tsk->signal->stime),
478 tsk->sched_time + tsk->signal->sched_time); 478 tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
479} 479}
480 480
481 481
@@ -536,7 +536,7 @@ static void process_timer_rebalance(struct task_struct *p,
536 nsleft = max_t(unsigned long long, nsleft, 1); 536 nsleft = max_t(unsigned long long, nsleft, 1);
537 do { 537 do {
538 if (likely(!(t->flags & PF_EXITING))) { 538 if (likely(!(t->flags & PF_EXITING))) {
539 ns = t->sched_time + nsleft; 539 ns = t->se.sum_exec_runtime + nsleft;
540 if (t->it_sched_expires == 0 || 540 if (t->it_sched_expires == 0 ||
541 t->it_sched_expires > ns) { 541 t->it_sched_expires > ns) {
542 t->it_sched_expires = ns; 542 t->it_sched_expires = ns;
@@ -1004,7 +1004,7 @@ static void check_thread_timers(struct task_struct *tsk,
1004 struct cpu_timer_list *t = list_first_entry(timers, 1004 struct cpu_timer_list *t = list_first_entry(timers,
1005 struct cpu_timer_list, 1005 struct cpu_timer_list,
1006 entry); 1006 entry);
1007 if (!--maxfire || tsk->sched_time < t->expires.sched) { 1007 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
1008 tsk->it_sched_expires = t->expires.sched; 1008 tsk->it_sched_expires = t->expires.sched;
1009 break; 1009 break;
1010 } 1010 }
@@ -1024,7 +1024,7 @@ static void check_process_timers(struct task_struct *tsk,
1024 int maxfire; 1024 int maxfire;
1025 struct signal_struct *const sig = tsk->signal; 1025 struct signal_struct *const sig = tsk->signal;
1026 cputime_t utime, stime, ptime, virt_expires, prof_expires; 1026 cputime_t utime, stime, ptime, virt_expires, prof_expires;
1027 unsigned long long sched_time, sched_expires; 1027 unsigned long long sum_sched_runtime, sched_expires;
1028 struct task_struct *t; 1028 struct task_struct *t;
1029 struct list_head *timers = sig->cpu_timers; 1029 struct list_head *timers = sig->cpu_timers;
1030 1030
@@ -1044,12 +1044,12 @@ static void check_process_timers(struct task_struct *tsk,
1044 */ 1044 */
1045 utime = sig->utime; 1045 utime = sig->utime;
1046 stime = sig->stime; 1046 stime = sig->stime;
1047 sched_time = sig->sched_time; 1047 sum_sched_runtime = sig->sum_sched_runtime;
1048 t = tsk; 1048 t = tsk;
1049 do { 1049 do {
1050 utime = cputime_add(utime, t->utime); 1050 utime = cputime_add(utime, t->utime);
1051 stime = cputime_add(stime, t->stime); 1051 stime = cputime_add(stime, t->stime);
1052 sched_time += t->sched_time; 1052 sum_sched_runtime += t->se.sum_exec_runtime;
1053 t = next_thread(t); 1053 t = next_thread(t);
1054 } while (t != tsk); 1054 } while (t != tsk);
1055 ptime = cputime_add(utime, stime); 1055 ptime = cputime_add(utime, stime);
@@ -1090,7 +1090,7 @@ static void check_process_timers(struct task_struct *tsk,
1090 struct cpu_timer_list *t = list_first_entry(timers, 1090 struct cpu_timer_list *t = list_first_entry(timers,
1091 struct cpu_timer_list, 1091 struct cpu_timer_list,
1092 entry); 1092 entry);
1093 if (!--maxfire || sched_time < t->expires.sched) { 1093 if (!--maxfire || sum_sched_runtime < t->expires.sched) {
1094 sched_expires = t->expires.sched; 1094 sched_expires = t->expires.sched;
1095 break; 1095 break;
1096 } 1096 }
@@ -1182,7 +1182,7 @@ static void check_process_timers(struct task_struct *tsk,
1182 virt_left = cputime_sub(virt_expires, utime); 1182 virt_left = cputime_sub(virt_expires, utime);
1183 virt_left = cputime_div_non_zero(virt_left, nthreads); 1183 virt_left = cputime_div_non_zero(virt_left, nthreads);
1184 if (sched_expires) { 1184 if (sched_expires) {
1185 sched_left = sched_expires - sched_time; 1185 sched_left = sched_expires - sum_sched_runtime;
1186 do_div(sched_left, nthreads); 1186 do_div(sched_left, nthreads);
1187 sched_left = max_t(unsigned long long, sched_left, 1); 1187 sched_left = max_t(unsigned long long, sched_left, 1);
1188 } else { 1188 } else {
@@ -1208,7 +1208,7 @@ static void check_process_timers(struct task_struct *tsk,
1208 t->it_virt_expires = ticks; 1208 t->it_virt_expires = ticks;
1209 } 1209 }
1210 1210
1211 sched = t->sched_time + sched_left; 1211 sched = t->se.sum_exec_runtime + sched_left;
1212 if (sched_expires && (t->it_sched_expires == 0 || 1212 if (sched_expires && (t->it_sched_expires == 0 ||
1213 t->it_sched_expires > sched)) { 1213 t->it_sched_expires > sched)) {
1214 t->it_sched_expires = sched; 1214 t->it_sched_expires = sched;
@@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1300 1300
1301 if (UNEXPIRED(prof) && UNEXPIRED(virt) && 1301 if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
1302 (tsk->it_sched_expires == 0 || 1302 (tsk->it_sched_expires == 0 ||
1303 tsk->sched_time < tsk->it_sched_expires)) 1303 tsk->se.sum_exec_runtime < tsk->it_sched_expires))
1304 return; 1304 return;
1305 1305
1306#undef UNEXPIRED 1306#undef UNEXPIRED
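
The hunks above replace the old tick-based t->sched_time accumulator with the scheduler entity's t->se.sum_exec_runtime, so check_process_timers() now sums nanosecond runtimes over the whole thread group. A minimal user-space model of that accumulation follows; the struct names mirror the diff, but the circular list and the sample values are invented for illustration and this is not kernel code.

/*
 * User-space model of the group runtime accumulation used by
 * check_process_timers() after this change: start from the signal
 * struct's saved total for already-exited threads, add each live
 * thread's se.sum_exec_runtime, and compare against the earliest
 * SCHED expiry.
 */
#include <stdio.h>

struct sched_entity { unsigned long long sum_exec_runtime; };

struct task {
	struct sched_entity se;
	struct task *next;                   /* circular thread-group list */
};

static unsigned long long group_runtime(struct task *tsk,
					 unsigned long long sig_sum)
{
	unsigned long long sum = sig_sum;    /* runtime of reaped threads */
	struct task *t = tsk;

	do {
		sum += t->se.sum_exec_runtime;
		t = t->next;                 /* stands in for next_thread(t) */
	} while (t != tsk);

	return sum;
}

int main(void)
{
	struct task a, b, c;

	a.se.sum_exec_runtime = 1200000000ULL;   /* 1.2 s of CPU time */
	b.se.sum_exec_runtime =  300000000ULL;
	c.se.sum_exec_runtime =  500000000ULL;
	a.next = &b; b.next = &c; c.next = &a;

	unsigned long long expires = 1500000000ULL;  /* earliest SCHED timer */
	unsigned long long sum = group_runtime(&a, 0);

	printf("group runtime %llu ns: timer %s\n", sum,
	       sum >= expires ? "fires" : "still pending");
	return 0;
}
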
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 329ce01720..55b3761eda 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -241,7 +241,7 @@ static __init int init_posix_timers(void)
241 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 241 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
242 242
243 posix_timers_cache = kmem_cache_create("posix_timers_cache", 243 posix_timers_cache = kmem_cache_create("posix_timers_cache",
244 sizeof (struct k_itimer), 0, 0, NULL, NULL); 244 sizeof (struct k_itimer), 0, 0, NULL);
245 idr_init(&posix_timers_id); 245 idr_init(&posix_timers_id);
246 return 0; 246 return 0;
247} 247}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 495b7d4dd3..c1a106d87d 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -33,13 +33,20 @@ config PM_DEBUG
33 bool "Power Management Debug Support" 33 bool "Power Management Debug Support"
34 depends on PM 34 depends on PM
35 ---help--- 35 ---help---
36 This option enables verbose debugging support in the Power Management 36 This option enables various debugging support in the Power Management
37 code. This is helpful when debugging and reporting various PM bugs, 37 code. This is helpful when debugging and reporting PM bugs, like
38 like suspend support. 38 suspend support.
39
40config PM_VERBOSE
41 bool "Verbose Power Management debugging"
42 depends on PM_DEBUG
43 default n
44 ---help---
45 This option enables verbose messages from the Power Management code.
39 46
40config DISABLE_CONSOLE_SUSPEND 47config DISABLE_CONSOLE_SUSPEND
41 bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" 48 bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
42 depends on PM && PM_DEBUG 49 depends on PM_DEBUG
43 default n 50 default n
44 ---help--- 51 ---help---
45 This option turns off the console suspend mechanism that prevents 52 This option turns off the console suspend mechanism that prevents
@@ -50,7 +57,7 @@ config DISABLE_CONSOLE_SUSPEND
50 57
51config PM_TRACE 58config PM_TRACE
52 bool "Suspend/resume event tracing" 59 bool "Suspend/resume event tracing"
53 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL 60 depends on PM_DEBUG && X86 && EXPERIMENTAL
54 default n 61 default n
55 ---help--- 62 ---help---
56 This enables some cheesy code to save the last PM event point in the 63 This enables some cheesy code to save the last PM event point in the
@@ -65,18 +72,6 @@ config PM_TRACE
65 CAUTION: this option will cause your machine's real-time clock to be 72 CAUTION: this option will cause your machine's real-time clock to be
66 set to an invalid time after a resume. 73 set to an invalid time after a resume.
67 74
68config PM_SYSFS_DEPRECATED
69 bool "Driver model /sys/devices/.../power/state files (DEPRECATED)"
70 depends on PM && SYSFS
71 default n
72 help
73 The driver model started out with a sysfs file intended to provide
74 a userspace hook for device power management. This feature has never
75 worked very well, except for limited testing purposes, and so it will
76 be removed. It's not clear that a generic mechanism could really
77 handle the wide variability of device power states; any replacements
78 are likely to be bus or driver specific.
79
80config SOFTWARE_SUSPEND 75config SOFTWARE_SUSPEND
81 bool "Software Suspend (Hibernation)" 76 bool "Software Suspend (Hibernation)"
82 depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) 77 depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f445b9cd60..324ac0188c 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -45,7 +45,7 @@ enum {
45 45
46static int hibernation_mode = HIBERNATION_SHUTDOWN; 46static int hibernation_mode = HIBERNATION_SHUTDOWN;
47 47
48struct hibernation_ops *hibernation_ops; 48static struct hibernation_ops *hibernation_ops;
49 49
50/** 50/**
51 * hibernation_set_ops - set the global hibernate operations 51 * hibernation_set_ops - set the global hibernate operations
@@ -54,7 +54,8 @@ struct hibernation_ops *hibernation_ops;
54 54
55void hibernation_set_ops(struct hibernation_ops *ops) 55void hibernation_set_ops(struct hibernation_ops *ops)
56{ 56{
57 if (ops && !(ops->prepare && ops->enter && ops->finish)) { 57 if (ops && !(ops->prepare && ops->enter && ops->finish
58 && ops->pre_restore && ops->restore_cleanup)) {
58 WARN_ON(1); 59 WARN_ON(1);
59 return; 60 return;
60 } 61 }
@@ -74,9 +75,9 @@ void hibernation_set_ops(struct hibernation_ops *ops)
74 * platform driver if so configured and return an error code if it fails 75 * platform driver if so configured and return an error code if it fails
75 */ 76 */
76 77
77static int platform_prepare(void) 78static int platform_prepare(int platform_mode)
78{ 79{
79 return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ? 80 return (platform_mode && hibernation_ops) ?
80 hibernation_ops->prepare() : 0; 81 hibernation_ops->prepare() : 0;
81} 82}
82 83
@@ -85,13 +86,145 @@ static int platform_prepare(void)
85 * using the platform driver (must be called after platform_prepare()) 86 * using the platform driver (must be called after platform_prepare())
86 */ 87 */
87 88
88static void platform_finish(void) 89static void platform_finish(int platform_mode)
89{ 90{
90 if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) 91 if (platform_mode && hibernation_ops)
91 hibernation_ops->finish(); 92 hibernation_ops->finish();
92} 93}
93 94
94/** 95/**
96 * platform_pre_restore - prepare the platform for the restoration from a
97 * hibernation image. If the restore fails after this function has been
98 * called, platform_restore_cleanup() must be called.
99 */
100
101static int platform_pre_restore(int platform_mode)
102{
103 return (platform_mode && hibernation_ops) ?
104 hibernation_ops->pre_restore() : 0;
105}
106
107/**
108 * platform_restore_cleanup - switch the platform to the normal mode of
109 * operation after a failing restore. If platform_pre_restore() has been
110 * called before the failing restore, this function must be called too,
111 * regardless of the result of platform_pre_restore().
112 */
113
114static void platform_restore_cleanup(int platform_mode)
115{
116 if (platform_mode && hibernation_ops)
117 hibernation_ops->restore_cleanup();
118}
119
120/**
121 * hibernation_snapshot - quiesce devices and create the hibernation
122 * snapshot image.
123 * @platform_mode - if set, use the platform driver, if available, to
 124 * prepare the platform firmware for the power transition.
125 *
126 * Must be called with pm_mutex held
127 */
128
129int hibernation_snapshot(int platform_mode)
130{
131 int error;
132
133 /* Free memory before shutting down devices. */
134 error = swsusp_shrink_memory();
135 if (error)
136 return error;
137
138 suspend_console();
139 error = device_suspend(PMSG_FREEZE);
140 if (error)
141 goto Resume_console;
142
143 error = platform_prepare(platform_mode);
144 if (error)
145 goto Resume_devices;
146
147 error = disable_nonboot_cpus();
148 if (!error) {
149 if (hibernation_mode != HIBERNATION_TEST) {
150 in_suspend = 1;
151 error = swsusp_suspend();
152 /* Control returns here after successful restore */
153 } else {
154 printk("swsusp debug: Waiting for 5 seconds.\n");
155 mdelay(5000);
156 }
157 }
158 enable_nonboot_cpus();
159 Resume_devices:
160 platform_finish(platform_mode);
161 device_resume();
162 Resume_console:
163 resume_console();
164 return error;
165}
166
167/**
168 * hibernation_restore - quiesce devices and restore the hibernation
 169 * snapshot image. If successful, control returns in hibernation_snapshot()
170 * @platform_mode - if set, use the platform driver, if available, to
 171 * prepare the platform firmware for the transition.
172 *
173 * Must be called with pm_mutex held
174 */
175
176int hibernation_restore(int platform_mode)
177{
178 int error;
179
180 pm_prepare_console();
181 suspend_console();
182 error = device_suspend(PMSG_PRETHAW);
183 if (error)
184 goto Finish;
185
186 error = platform_pre_restore(platform_mode);
187 if (!error) {
188 error = disable_nonboot_cpus();
189 if (!error)
190 error = swsusp_resume();
191 enable_nonboot_cpus();
192 }
193 platform_restore_cleanup(platform_mode);
194 device_resume();
195 Finish:
196 resume_console();
197 pm_restore_console();
198 return error;
199}
200
201/**
202 * hibernation_platform_enter - enter the hibernation state using the
203 * platform driver (if available)
204 */
205
206int hibernation_platform_enter(void)
207{
208 int error;
209
210 if (hibernation_ops) {
211 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
212 /*
213 * We have cancelled the power transition by running
214 * hibernation_ops->finish() before saving the image, so we
215 * should let the firmware know that we're going to enter the
216 * sleep state after all
217 */
218 error = hibernation_ops->prepare();
219 if (!error)
220 error = hibernation_ops->enter();
221 } else {
222 error = -ENOSYS;
223 }
224 return error;
225}
226
227/**
95 * power_down - Shut the machine down for hibernation. 228 * power_down - Shut the machine down for hibernation.
96 * 229 *
97 * Use the platform driver, if configured so; otherwise try 230 * Use the platform driver, if configured so; otherwise try
@@ -111,11 +244,7 @@ static void power_down(void)
111 kernel_restart(NULL); 244 kernel_restart(NULL);
112 break; 245 break;
113 case HIBERNATION_PLATFORM: 246 case HIBERNATION_PLATFORM:
114 if (hibernation_ops) { 247 hibernation_platform_enter();
115 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
116 hibernation_ops->enter();
117 break;
118 }
119 } 248 }
120 kernel_halt(); 249 kernel_halt();
121 /* 250 /*
@@ -152,9 +281,16 @@ int hibernate(void)
152{ 281{
153 int error; 282 int error;
154 283
284 mutex_lock(&pm_mutex);
155 /* The snapshot device should not be opened while we're running */ 285 /* The snapshot device should not be opened while we're running */
156 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) 286 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
157 return -EBUSY; 287 error = -EBUSY;
288 goto Unlock;
289 }
290
291 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
292 if (error)
293 goto Exit;
158 294
159 /* Allocate memory management structures */ 295 /* Allocate memory management structures */
160 error = create_basic_memory_bitmaps(); 296 error = create_basic_memory_bitmaps();
@@ -165,75 +301,35 @@ int hibernate(void)
165 if (error) 301 if (error)
166 goto Finish; 302 goto Finish;
167 303
168 mutex_lock(&pm_mutex);
169 if (hibernation_mode == HIBERNATION_TESTPROC) { 304 if (hibernation_mode == HIBERNATION_TESTPROC) {
170 printk("swsusp debug: Waiting for 5 seconds.\n"); 305 printk("swsusp debug: Waiting for 5 seconds.\n");
171 mdelay(5000); 306 mdelay(5000);
172 goto Thaw; 307 goto Thaw;
173 } 308 }
309 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
310 if (in_suspend && !error) {
311 unsigned int flags = 0;
174 312
175 /* Free memory before shutting down devices. */ 313 if (hibernation_mode == HIBERNATION_PLATFORM)
176 error = swsusp_shrink_memory(); 314 flags |= SF_PLATFORM_MODE;
177 if (error)
178 goto Thaw;
179
180 error = platform_prepare();
181 if (error)
182 goto Thaw;
183
184 suspend_console();
185 error = device_suspend(PMSG_FREEZE);
186 if (error) {
187 printk(KERN_ERR "PM: Some devices failed to suspend\n");
188 goto Resume_devices;
189 }
190 error = disable_nonboot_cpus();
191 if (error)
192 goto Enable_cpus;
193
194 if (hibernation_mode == HIBERNATION_TEST) {
195 printk("swsusp debug: Waiting for 5 seconds.\n");
196 mdelay(5000);
197 goto Enable_cpus;
198 }
199
200 pr_debug("PM: snapshotting memory.\n");
201 in_suspend = 1;
202 error = swsusp_suspend();
203 if (error)
204 goto Enable_cpus;
205
206 if (in_suspend) {
207 enable_nonboot_cpus();
208 platform_finish();
209 device_resume();
210 resume_console();
211 pr_debug("PM: writing image.\n"); 315 pr_debug("PM: writing image.\n");
212 error = swsusp_write(); 316 error = swsusp_write(flags);
317 swsusp_free();
213 if (!error) 318 if (!error)
214 power_down(); 319 power_down();
215 else {
216 swsusp_free();
217 goto Thaw;
218 }
219 } else { 320 } else {
220 pr_debug("PM: Image restored successfully.\n"); 321 pr_debug("PM: Image restored successfully.\n");
322 swsusp_free();
221 } 323 }
222
223 swsusp_free();
224 Enable_cpus:
225 enable_nonboot_cpus();
226 Resume_devices:
227 platform_finish();
228 device_resume();
229 resume_console();
230 Thaw: 324 Thaw:
231 mutex_unlock(&pm_mutex);
232 unprepare_processes(); 325 unprepare_processes();
233 Finish: 326 Finish:
234 free_basic_memory_bitmaps(); 327 free_basic_memory_bitmaps();
235 Exit: 328 Exit:
329 pm_notifier_call_chain(PM_POST_HIBERNATION);
236 atomic_inc(&snapshot_device_available); 330 atomic_inc(&snapshot_device_available);
331 Unlock:
332 mutex_unlock(&pm_mutex);
237 return error; 333 return error;
238} 334}
239 335
@@ -253,6 +349,7 @@ int hibernate(void)
253static int software_resume(void) 349static int software_resume(void)
254{ 350{
255 int error; 351 int error;
352 unsigned int flags;
256 353
257 mutex_lock(&pm_mutex); 354 mutex_lock(&pm_mutex);
258 if (!swsusp_resume_device) { 355 if (!swsusp_resume_device) {
@@ -300,30 +397,12 @@ static int software_resume(void)
300 397
301 pr_debug("PM: Reading swsusp image.\n"); 398 pr_debug("PM: Reading swsusp image.\n");
302 399
303 error = swsusp_read(); 400 error = swsusp_read(&flags);
304 if (error) {
305 swsusp_free();
306 goto Thaw;
307 }
308
309 pr_debug("PM: Preparing devices for restore.\n");
310
311 suspend_console();
312 error = device_suspend(PMSG_PRETHAW);
313 if (error)
314 goto Free;
315
316 error = disable_nonboot_cpus();
317 if (!error) 401 if (!error)
318 swsusp_resume(); 402 hibernation_restore(flags & SF_PLATFORM_MODE);
319 403
320 enable_nonboot_cpus();
321 Free:
322 swsusp_free();
323 device_resume();
324 resume_console();
325 Thaw:
326 printk(KERN_ERR "PM: Restore failed, recovering.\n"); 404 printk(KERN_ERR "PM: Restore failed, recovering.\n");
405 swsusp_free();
327 unprepare_processes(); 406 unprepare_processes();
328 Done: 407 Done:
329 free_basic_memory_bitmaps(); 408 free_basic_memory_bitmaps();
@@ -333,7 +412,7 @@ static int software_resume(void)
333 Unlock: 412 Unlock:
334 mutex_unlock(&pm_mutex); 413 mutex_unlock(&pm_mutex);
335 pr_debug("PM: Resume from disk failed.\n"); 414 pr_debug("PM: Resume from disk failed.\n");
336 return 0; 415 return error;
337} 416}
338 417
339late_initcall(software_resume); 418late_initcall(software_resume);
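
The disk.c rework above splits the old monolithic hibernate() path into hibernation_snapshot(), hibernation_restore() and hibernation_platform_enter(), and records whether platform mode was in use by writing SF_PLATFORM_MODE into the image header so that software_resume() can hand it back to hibernation_restore(). The sketch below models only that flag handshake; the header layout and helpers are stand-ins, while the real code stores the flag in struct swsusp_header via mark_swapfiles() and reads it back through swsusp_read().

#include <stdio.h>

#define SF_PLATFORM_MODE 1      /* same bit the patch defines in power.h */

struct fake_header { unsigned int flags; };   /* stand-in for swsusp_header */

static void write_image(struct fake_header *hdr, int platform_mode)
{
	/* what hibernate() now passes down to swsusp_write(flags) */
	hdr->flags = platform_mode ? SF_PLATFORM_MODE : 0;
}

static void resume_image(const struct fake_header *hdr)
{
	/* software_resume(): swsusp_read(&flags), then hibernation_restore() */
	int platform_mode = hdr->flags & SF_PLATFORM_MODE;

	printf("restoring with platform hooks %s\n",
	       platform_mode ? "enabled" : "disabled");
}

int main(void)
{
	struct fake_header hdr;

	write_image(&hdr, 1);
	resume_image(&hdr);
	return 0;
}
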
diff --git a/kernel/power/main.c b/kernel/power/main.c
index fc45ed2262..32147b57c3 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -23,6 +23,8 @@
23 23
24#include "power.h" 24#include "power.h"
25 25
26BLOCKING_NOTIFIER_HEAD(pm_chain_head);
27
26/*This is just an arbitrary number */ 28/*This is just an arbitrary number */
27#define FREE_PAGE_NUMBER (100) 29#define FREE_PAGE_NUMBER (100)
28 30
@@ -63,14 +65,11 @@ static inline void pm_finish(suspend_state_t state)
63 65
64/** 66/**
65 * suspend_prepare - Do prep work before entering low-power state. 67 * suspend_prepare - Do prep work before entering low-power state.
66 * @state: State we're entering.
67 * 68 *
68 * This is common code that is called for each state that we're 69 * This is common code that is called for each state that we're entering.
69 * entering. Allocate a console, stop all processes, then make sure 70 * Run suspend notifiers, allocate a console and stop all processes.
70 * the platform can enter the requested state.
71 */ 71 */
72 72static int suspend_prepare(void)
73static int suspend_prepare(suspend_state_t state)
74{ 73{
75 int error; 74 int error;
76 unsigned int free_pages; 75 unsigned int free_pages;
@@ -78,6 +77,10 @@ static int suspend_prepare(suspend_state_t state)
78 if (!pm_ops || !pm_ops->enter) 77 if (!pm_ops || !pm_ops->enter)
79 return -EPERM; 78 return -EPERM;
80 79
80 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
81 if (error)
82 goto Finish;
83
81 pm_prepare_console(); 84 pm_prepare_console();
82 85
83 if (freeze_processes()) { 86 if (freeze_processes()) {
@@ -85,46 +88,23 @@ static int suspend_prepare(suspend_state_t state)
85 goto Thaw; 88 goto Thaw;
86 } 89 }
87 90
88 if ((free_pages = global_page_state(NR_FREE_PAGES)) 91 free_pages = global_page_state(NR_FREE_PAGES);
89 < FREE_PAGE_NUMBER) { 92 if (free_pages < FREE_PAGE_NUMBER) {
90 pr_debug("PM: free some memory\n"); 93 pr_debug("PM: free some memory\n");
91 shrink_all_memory(FREE_PAGE_NUMBER - free_pages); 94 shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
92 if (nr_free_pages() < FREE_PAGE_NUMBER) { 95 if (nr_free_pages() < FREE_PAGE_NUMBER) {
93 error = -ENOMEM; 96 error = -ENOMEM;
94 printk(KERN_ERR "PM: No enough memory\n"); 97 printk(KERN_ERR "PM: No enough memory\n");
95 goto Thaw;
96 } 98 }
97 } 99 }
98
99 if (pm_ops->set_target) {
100 error = pm_ops->set_target(state);
101 if (error)
102 goto Thaw;
103 }
104 suspend_console();
105 error = device_suspend(PMSG_SUSPEND);
106 if (error) {
107 printk(KERN_ERR "Some devices failed to suspend\n");
108 goto Resume_console;
109 }
110 if (pm_ops->prepare) {
111 if ((error = pm_ops->prepare(state)))
112 goto Resume_devices;
113 }
114
115 error = disable_nonboot_cpus();
116 if (!error) 100 if (!error)
117 return 0; 101 return 0;
118 102
119 enable_nonboot_cpus();
120 pm_finish(state);
121 Resume_devices:
122 device_resume();
123 Resume_console:
124 resume_console();
125 Thaw: 103 Thaw:
126 thaw_processes(); 104 thaw_processes();
127 pm_restore_console(); 105 pm_restore_console();
106 Finish:
107 pm_notifier_call_chain(PM_POST_SUSPEND);
128 return error; 108 return error;
129} 109}
130 110
@@ -140,6 +120,12 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
140 local_irq_enable(); 120 local_irq_enable();
141} 121}
142 122
123/**
124 * suspend_enter - enter the desired system sleep state.
125 * @state: state to enter
126 *
127 * This function should be called after devices have been suspended.
128 */
143int suspend_enter(suspend_state_t state) 129int suspend_enter(suspend_state_t state)
144{ 130{
145 int error = 0; 131 int error = 0;
@@ -159,23 +145,58 @@ int suspend_enter(suspend_state_t state)
159 return error; 145 return error;
160} 146}
161 147
148/**
149 * suspend_devices_and_enter - suspend devices and enter the desired system sleep
150 * state.
151 * @state: state to enter
152 */
153int suspend_devices_and_enter(suspend_state_t state)
154{
155 int error;
156
157 if (!pm_ops)
158 return -ENOSYS;
159
160 if (pm_ops->set_target) {
161 error = pm_ops->set_target(state);
162 if (error)
163 return error;
164 }
165 suspend_console();
166 error = device_suspend(PMSG_SUSPEND);
167 if (error) {
168 printk(KERN_ERR "Some devices failed to suspend\n");
169 goto Resume_console;
170 }
171 if (pm_ops->prepare) {
172 error = pm_ops->prepare(state);
173 if (error)
174 goto Resume_devices;
175 }
176 error = disable_nonboot_cpus();
177 if (!error)
178 suspend_enter(state);
179
180 enable_nonboot_cpus();
181 pm_finish(state);
182 Resume_devices:
183 device_resume();
184 Resume_console:
185 resume_console();
186 return error;
187}
162 188
163/** 189/**
164 * suspend_finish - Do final work before exiting suspend sequence. 190 * suspend_finish - Do final work before exiting suspend sequence.
165 * @state: State we're coming out of.
166 * 191 *
167 * Call platform code to clean up, restart processes, and free the 192 * Call platform code to clean up, restart processes, and free the
168 * console that we've allocated. This is not called for suspend-to-disk. 193 * console that we've allocated. This is not called for suspend-to-disk.
169 */ 194 */
170 195static void suspend_finish(void)
171static void suspend_finish(suspend_state_t state)
172{ 196{
173 enable_nonboot_cpus();
174 pm_finish(state);
175 device_resume();
176 resume_console();
177 thaw_processes(); 197 thaw_processes();
178 pm_restore_console(); 198 pm_restore_console();
199 pm_notifier_call_chain(PM_POST_SUSPEND);
179} 200}
180 201
181 202
@@ -207,7 +228,6 @@ static inline int valid_state(suspend_state_t state)
207 * Then, do the setup for suspend, enter the state, and cleaup (after 228 * Then, do the setup for suspend, enter the state, and cleaup (after
208 * we've woken up). 229 * we've woken up).
209 */ 230 */
210
211static int enter_state(suspend_state_t state) 231static int enter_state(suspend_state_t state)
212{ 232{
213 int error; 233 int error;
@@ -218,14 +238,14 @@ static int enter_state(suspend_state_t state)
218 return -EBUSY; 238 return -EBUSY;
219 239
220 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 240 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
221 if ((error = suspend_prepare(state))) 241 if ((error = suspend_prepare()))
222 goto Unlock; 242 goto Unlock;
223 243
224 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 244 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
225 error = suspend_enter(state); 245 error = suspend_devices_and_enter(state);
226 246
227 pr_debug("PM: Finishing wakeup.\n"); 247 pr_debug("PM: Finishing wakeup.\n");
228 suspend_finish(state); 248 suspend_finish();
229 Unlock: 249 Unlock:
230 mutex_unlock(&pm_mutex); 250 mutex_unlock(&pm_mutex);
231 return error; 251 return error;
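
After this change enter_state() runs three stages in order: suspend_prepare() (notifiers, console, freezing), suspend_devices_and_enter() (device suspend, non-boot CPU offlining, the actual sleep) and suspend_finish() (thaw, console restore, POST_SUSPEND notification). The stand-alone sketch below imitates only the goto-based unwinding inside suspend_devices_and_enter(); every helper is a stub, and the pm_ops->set_target()/prepare()/finish() calls are left out for brevity.

#include <stdio.h>

/* stubs; the real functions live in the driver core and CPU hotplug code */
static void suspend_console(void)      { puts("suspend console"); }
static void resume_console(void)       { puts("resume console"); }
static int  device_suspend(void)       { puts("suspend devices"); return 0; }
static void device_resume(void)        { puts("resume devices"); }
static int  disable_nonboot_cpus(void) { puts("offline non-boot CPUs"); return 0; }
static void enable_nonboot_cpus(void)  { puts("online non-boot CPUs"); }
static void enter_sleep(void)          { puts("enter sleep state"); }

static int suspend_devices_and_enter(void)
{
	int error;

	suspend_console();
	error = device_suspend();
	if (error)
		goto Resume_console;     /* devices never suspended, skip resume */

	error = disable_nonboot_cpus();
	if (!error)
		enter_sleep();
	enable_nonboot_cpus();

	device_resume();
 Resume_console:
	resume_console();
	return error;
}

int main(void)
{
	return suspend_devices_and_enter();
}
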
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 5138148710..5f24c786f8 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -25,7 +25,10 @@ struct swsusp_info {
25 */ 25 */
26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
27 27
28extern struct hibernation_ops *hibernation_ops; 28/* kernel/power/disk.c */
29extern int hibernation_snapshot(int platform_mode);
30extern int hibernation_restore(int platform_mode);
31extern int hibernation_platform_enter(void);
29#endif 32#endif
30 33
31extern int pfn_is_nosave(unsigned long); 34extern int pfn_is_nosave(unsigned long);
@@ -152,16 +155,34 @@ extern sector_t alloc_swapdev_block(int swap);
152extern void free_all_swap_pages(int swap); 155extern void free_all_swap_pages(int swap);
153extern int swsusp_swap_in_use(void); 156extern int swsusp_swap_in_use(void);
154 157
158/*
 159 * Flags that can be passed from the hibernating kernel to the "boot" kernel in
160 * the image header.
161 */
162#define SF_PLATFORM_MODE 1
163
164/* kernel/power/disk.c */
155extern int swsusp_check(void); 165extern int swsusp_check(void);
156extern int swsusp_shrink_memory(void); 166extern int swsusp_shrink_memory(void);
157extern void swsusp_free(void); 167extern void swsusp_free(void);
158extern int swsusp_suspend(void); 168extern int swsusp_suspend(void);
159extern int swsusp_resume(void); 169extern int swsusp_resume(void);
160extern int swsusp_read(void); 170extern int swsusp_read(unsigned int *flags_p);
161extern int swsusp_write(void); 171extern int swsusp_write(unsigned int flags);
162extern void swsusp_close(void); 172extern void swsusp_close(void);
163extern int suspend_enter(suspend_state_t state);
164 173
165struct timeval; 174struct timeval;
175/* kernel/power/swsusp.c */
166extern void swsusp_show_speed(struct timeval *, struct timeval *, 176extern void swsusp_show_speed(struct timeval *, struct timeval *,
167 unsigned int, char *); 177 unsigned int, char *);
178
179/* kernel/power/main.c */
180extern int suspend_enter(suspend_state_t state);
181extern int suspend_devices_and_enter(suspend_state_t state);
182extern struct blocking_notifier_head pm_chain_head;
183
184static inline int pm_notifier_call_chain(unsigned long val)
185{
186 return (blocking_notifier_call_chain(&pm_chain_head, val, NULL)
187 == NOTIFY_BAD) ? -EINVAL : 0;
188}
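
The pm_notifier_call_chain() helper added to power.h collapses a NOTIFY_BAD return from any registered callback into -EINVAL, which is how the suspend and hibernation paths above abort when a notifier objects. Below is a self-contained user-space model of that veto semantics, with the chain reduced to a plain array of callbacks instead of the kernel's blocking_notifier_head.

#include <errno.h>
#include <stdio.h>
#include <stddef.h>

#define NOTIFY_OK   0
#define NOTIFY_BAD  1

#define PM_HIBERNATION_PREPARE 1
#define PM_POST_HIBERNATION    2

typedef int (*pm_notifier_fn)(unsigned long event);

static int driver_a(unsigned long event) { (void)event; return NOTIFY_OK; }
static int driver_b(unsigned long event)
{
	/* a driver that cannot tolerate hibernation right now */
	return event == PM_HIBERNATION_PREPARE ? NOTIFY_BAD : NOTIFY_OK;
}

static pm_notifier_fn chain[] = { driver_a, driver_b };

static int pm_notifier_call_chain(unsigned long val)
{
	/* mirror the new inline: NOTIFY_BAD from any callback becomes -EINVAL */
	for (size_t i = 0; i < sizeof(chain) / sizeof(chain[0]); i++)
		if (chain[i](val) == NOTIFY_BAD)
			return -EINVAL;
	return 0;
}

int main(void)
{
	int error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);

	if (error)
		printf("transition vetoed by a notifier (%d)\n", error);
	else
		printf("all notifiers agreed, proceeding\n");
	return 0;
}
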
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e0233d8422..3434940a3d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -40,7 +40,7 @@ static inline void frozen_process(void)
40 current->flags |= PF_FROZEN; 40 current->flags |= PF_FROZEN;
41 wmb(); 41 wmb();
42 } 42 }
43 clear_tsk_thread_flag(current, TIF_FREEZE); 43 clear_freeze_flag(current);
44} 44}
45 45
46/* Refrigerator is place where frozen processes are stored :-). */ 46/* Refrigerator is place where frozen processes are stored :-). */
@@ -72,20 +72,19 @@ void refrigerator(void)
72 schedule(); 72 schedule();
73 } 73 }
74 pr_debug("%s left refrigerator\n", current->comm); 74 pr_debug("%s left refrigerator\n", current->comm);
75 current->state = save; 75 __set_current_state(save);
76} 76}
77 77
78static inline void freeze_process(struct task_struct *p) 78static void freeze_task(struct task_struct *p)
79{ 79{
80 unsigned long flags; 80 unsigned long flags;
81 81
82 if (!freezing(p)) { 82 if (!freezing(p)) {
83 rmb(); 83 rmb();
84 if (!frozen(p)) { 84 if (!frozen(p)) {
85 set_freeze_flag(p);
85 if (p->state == TASK_STOPPED) 86 if (p->state == TASK_STOPPED)
86 force_sig_specific(SIGSTOP, p); 87 force_sig_specific(SIGSTOP, p);
87
88 freeze(p);
89 spin_lock_irqsave(&p->sighand->siglock, flags); 88 spin_lock_irqsave(&p->sighand->siglock, flags);
90 signal_wake_up(p, p->state == TASK_STOPPED); 89 signal_wake_up(p, p->state == TASK_STOPPED);
91 spin_unlock_irqrestore(&p->sighand->siglock, flags); 90 spin_unlock_irqrestore(&p->sighand->siglock, flags);
@@ -99,19 +98,14 @@ static void cancel_freezing(struct task_struct *p)
99 98
100 if (freezing(p)) { 99 if (freezing(p)) {
101 pr_debug(" clean up: %s\n", p->comm); 100 pr_debug(" clean up: %s\n", p->comm);
102 do_not_freeze(p); 101 clear_freeze_flag(p);
103 spin_lock_irqsave(&p->sighand->siglock, flags); 102 spin_lock_irqsave(&p->sighand->siglock, flags);
104 recalc_sigpending_and_wake(p); 103 recalc_sigpending_and_wake(p);
105 spin_unlock_irqrestore(&p->sighand->siglock, flags); 104 spin_unlock_irqrestore(&p->sighand->siglock, flags);
106 } 105 }
107} 106}
108 107
109static inline int is_user_space(struct task_struct *p) 108static int try_to_freeze_tasks(int freeze_user_space)
110{
111 return p->mm && !(p->flags & PF_BORROWED_MM);
112}
113
114static unsigned int try_to_freeze_tasks(int freeze_user_space)
115{ 109{
116 struct task_struct *g, *p; 110 struct task_struct *g, *p;
117 unsigned long end_time; 111 unsigned long end_time;
@@ -122,26 +116,40 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space)
122 todo = 0; 116 todo = 0;
123 read_lock(&tasklist_lock); 117 read_lock(&tasklist_lock);
124 do_each_thread(g, p) { 118 do_each_thread(g, p) {
125 if (!freezeable(p)) 119 if (frozen(p) || !freezeable(p))
126 continue; 120 continue;
127 121
128 if (frozen(p)) 122 if (freeze_user_space) {
129 continue; 123 if (p->state == TASK_TRACED &&
130 124 frozen(p->parent)) {
131 if (p->state == TASK_TRACED && frozen(p->parent)) { 125 cancel_freezing(p);
132 cancel_freezing(p); 126 continue;
133 continue; 127 }
128 /*
129 * Kernel threads should not have TIF_FREEZE set
130 * at this point, so we must ensure that either
131 * p->mm is not NULL *and* PF_BORROWED_MM is
 132 * unset, or TIF_FREEZE is left unset.
133 * The task_lock() is necessary to prevent races
134 * with exit_mm() or use_mm()/unuse_mm() from
 135 * occurring.
136 */
137 task_lock(p);
138 if (!p->mm || (p->flags & PF_BORROWED_MM)) {
139 task_unlock(p);
140 continue;
141 }
142 freeze_task(p);
143 task_unlock(p);
144 } else {
145 freeze_task(p);
134 } 146 }
135 if (freeze_user_space && !is_user_space(p))
136 continue;
137
138 freeze_process(p);
139 if (!freezer_should_skip(p)) 147 if (!freezer_should_skip(p))
140 todo++; 148 todo++;
141 } while_each_thread(g, p); 149 } while_each_thread(g, p);
142 read_unlock(&tasklist_lock); 150 read_unlock(&tasklist_lock);
143 yield(); /* Yield is okay here */ 151 yield(); /* Yield is okay here */
144 if (todo && time_after(jiffies, end_time)) 152 if (time_after(jiffies, end_time))
145 break; 153 break;
146 } while (todo); 154 } while (todo);
147 155
@@ -152,49 +160,41 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space)
152 * but it cleans up leftover PF_FREEZE requests. 160 * but it cleans up leftover PF_FREEZE requests.
153 */ 161 */
154 printk("\n"); 162 printk("\n");
155 printk(KERN_ERR "Stopping %s timed out after %d seconds " 163 printk(KERN_ERR "Freezing of %s timed out after %d seconds "
156 "(%d tasks refusing to freeze):\n", 164 "(%d tasks refusing to freeze):\n",
157 freeze_user_space ? "user space processes" : 165 freeze_user_space ? "user space " : "tasks ",
158 "kernel threads",
159 TIMEOUT / HZ, todo); 166 TIMEOUT / HZ, todo);
167 show_state();
160 read_lock(&tasklist_lock); 168 read_lock(&tasklist_lock);
161 do_each_thread(g, p) { 169 do_each_thread(g, p) {
162 if (freeze_user_space && !is_user_space(p))
163 continue;
164
165 task_lock(p); 170 task_lock(p);
166 if (freezeable(p) && !frozen(p) && 171 if (freezing(p) && !freezer_should_skip(p))
167 !freezer_should_skip(p))
168 printk(KERN_ERR " %s\n", p->comm); 172 printk(KERN_ERR " %s\n", p->comm);
169
170 cancel_freezing(p); 173 cancel_freezing(p);
171 task_unlock(p); 174 task_unlock(p);
172 } while_each_thread(g, p); 175 } while_each_thread(g, p);
173 read_unlock(&tasklist_lock); 176 read_unlock(&tasklist_lock);
174 } 177 }
175 178
176 return todo; 179 return todo ? -EBUSY : 0;
177} 180}
178 181
179/** 182/**
180 * freeze_processes - tell processes to enter the refrigerator 183 * freeze_processes - tell processes to enter the refrigerator
181 *
182 * Returns 0 on success, or the number of processes that didn't freeze,
183 * although they were told to.
184 */ 184 */
185int freeze_processes(void) 185int freeze_processes(void)
186{ 186{
187 unsigned int nr_unfrozen; 187 int error;
188 188
189 printk("Stopping tasks ... "); 189 printk("Stopping tasks ... ");
190 nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE); 190 error = try_to_freeze_tasks(FREEZER_USER_SPACE);
191 if (nr_unfrozen) 191 if (error)
192 return nr_unfrozen; 192 return error;
193 193
194 sys_sync(); 194 sys_sync();
195 nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); 195 error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
196 if (nr_unfrozen) 196 if (error)
197 return nr_unfrozen; 197 return error;
198 198
199 printk("done.\n"); 199 printk("done.\n");
200 BUG_ON(in_atomic()); 200 BUG_ON(in_atomic());
@@ -210,7 +210,7 @@ static void thaw_tasks(int thaw_user_space)
210 if (!freezeable(p)) 210 if (!freezeable(p))
211 continue; 211 continue;
212 212
213 if (is_user_space(p) == !thaw_user_space) 213 if (!p->mm == thaw_user_space)
214 continue; 214 continue;
215 215
216 thaw_process(p); 216 thaw_process(p);
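
try_to_freeze_tasks() now loops until every remaining task has frozen or a timeout elapses, and freeze_processes() propagates -EBUSY instead of a raw count of stragglers. The sketch below reproduces only the loop shape; the real code iterates the tasklist under read_lock, uses jiffies for the deadline, and distinguishes user-space tasks from kernel threads under task_lock.

#include <errno.h>
#include <stdio.h>
#include <time.h>

static int tasks_left = 5;

static int freeze_one_pass(void)
{
	/* pretend one more task enters the refrigerator on every pass */
	if (tasks_left > 0)
		tasks_left--;
	return tasks_left;               /* how many still refuse to freeze */
}

static int try_to_freeze_tasks(int timeout_sec)
{
	time_t end = time(NULL) + timeout_sec;
	int todo;

	do {
		todo = freeze_one_pass();
		if (time(NULL) > end)    /* mirrors time_after(jiffies, end_time) */
			break;
	} while (todo);

	return todo ? -EBUSY : 0;        /* error code instead of a task count */
}

int main(void)
{
	printf("freeze_processes() -> %d\n", try_to_freeze_tasks(2));
	return 0;
}
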
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8b1a1b8371..917aba1005 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -33,8 +33,9 @@ extern char resume_file[];
33#define SWSUSP_SIG "S1SUSPEND" 33#define SWSUSP_SIG "S1SUSPEND"
34 34
35struct swsusp_header { 35struct swsusp_header {
36 char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; 36 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
37 sector_t image; 37 sector_t image;
38 unsigned int flags; /* Flags to pass to the "boot" kernel */
38 char orig_sig[10]; 39 char orig_sig[10];
39 char sig[10]; 40 char sig[10];
40} __attribute__((packed)); 41} __attribute__((packed));
@@ -138,7 +139,7 @@ static int wait_on_bio_chain(struct bio **bio_chain)
138 * Saving part 139 * Saving part
139 */ 140 */
140 141
141static int mark_swapfiles(sector_t start) 142static int mark_swapfiles(sector_t start, unsigned int flags)
142{ 143{
143 int error; 144 int error;
144 145
@@ -148,6 +149,7 @@ static int mark_swapfiles(sector_t start)
148 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 149 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
149 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 150 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
150 swsusp_header->image = start; 151 swsusp_header->image = start;
152 swsusp_header->flags = flags;
151 error = bio_write_page(swsusp_resume_block, 153 error = bio_write_page(swsusp_resume_block,
152 swsusp_header, NULL); 154 swsusp_header, NULL);
153 } else { 155 } else {
@@ -369,6 +371,7 @@ static int enough_swap(unsigned int nr_pages)
369 371
370/** 372/**
371 * swsusp_write - Write entire image and metadata. 373 * swsusp_write - Write entire image and metadata.
374 * @flags: flags to pass to the "boot" kernel in the image header
372 * 375 *
373 * It is important _NOT_ to umount filesystems at this point. We want 376 * It is important _NOT_ to umount filesystems at this point. We want
374 * them synced (in case something goes wrong) but we DO not want to mark 377 * them synced (in case something goes wrong) but we DO not want to mark
@@ -376,7 +379,7 @@ static int enough_swap(unsigned int nr_pages)
376 * correctly, we'll mark system clean, anyway.) 379 * correctly, we'll mark system clean, anyway.)
377 */ 380 */
378 381
379int swsusp_write(void) 382int swsusp_write(unsigned int flags)
380{ 383{
381 struct swap_map_handle handle; 384 struct swap_map_handle handle;
382 struct snapshot_handle snapshot; 385 struct snapshot_handle snapshot;
@@ -415,7 +418,7 @@ int swsusp_write(void)
415 if (!error) { 418 if (!error) {
416 flush_swap_writer(&handle); 419 flush_swap_writer(&handle);
417 printk("S"); 420 printk("S");
418 error = mark_swapfiles(start); 421 error = mark_swapfiles(start, flags);
419 printk("|\n"); 422 printk("|\n");
420 } 423 }
421 } 424 }
@@ -540,13 +543,20 @@ static int load_image(struct swap_map_handle *handle,
540 return error; 543 return error;
541} 544}
542 545
543int swsusp_read(void) 546/**
547 * swsusp_read - read the hibernation image.
548 * @flags_p: flags passed by the "frozen" kernel in the image header should
 549 * be written into this memory location
550 */
551
552int swsusp_read(unsigned int *flags_p)
544{ 553{
545 int error; 554 int error;
546 struct swap_map_handle handle; 555 struct swap_map_handle handle;
547 struct snapshot_handle snapshot; 556 struct snapshot_handle snapshot;
548 struct swsusp_info *header; 557 struct swsusp_info *header;
549 558
559 *flags_p = swsusp_header->flags;
550 if (IS_ERR(resume_bdev)) { 560 if (IS_ERR(resume_bdev)) {
551 pr_debug("swsusp: block device not initialised\n"); 561 pr_debug("swsusp: block device not initialised\n");
552 return PTR_ERR(resume_bdev); 562 return PTR_ERR(resume_bdev);
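
The swap.c hunk grows struct swsusp_header by an unsigned int of flags while shrinking the reserved padding by the same amount, so the on-disk header still occupies exactly one page. The compile-time check below verifies that invariant under the assumption of a 4096-byte page and a 64-bit sector_t; the real sizes depend on the architecture and on PAGE_SIZE.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u                 /* assumption for this demo */
typedef uint64_t sector_t;              /* assumption: 64-bit sectors */

struct swsusp_header {
	char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
	sector_t image;
	unsigned int flags;             /* flags to pass to the "boot" kernel */
	char orig_sig[10];
	char sig[10];
} __attribute__((packed));

_Static_assert(sizeof(struct swsusp_header) == PAGE_SIZE,
	       "header must stay exactly one page");

int main(void)
{
	printf("swsusp_header is %zu bytes\n", sizeof(struct swsusp_header));
	return 0;
}
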
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d65305b515..bd0723a7df 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -128,92 +128,6 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
128 return res; 128 return res;
129} 129}
130 130
131static inline int platform_prepare(void)
132{
133 int error = 0;
134
135 if (hibernation_ops)
136 error = hibernation_ops->prepare();
137
138 return error;
139}
140
141static inline void platform_finish(void)
142{
143 if (hibernation_ops)
144 hibernation_ops->finish();
145}
146
147static inline int snapshot_suspend(int platform_suspend)
148{
149 int error;
150
151 mutex_lock(&pm_mutex);
152 /* Free memory before shutting down devices. */
153 error = swsusp_shrink_memory();
154 if (error)
155 goto Finish;
156
157 if (platform_suspend) {
158 error = platform_prepare();
159 if (error)
160 goto Finish;
161 }
162 suspend_console();
163 error = device_suspend(PMSG_FREEZE);
164 if (error)
165 goto Resume_devices;
166
167 error = disable_nonboot_cpus();
168 if (!error) {
169 in_suspend = 1;
170 error = swsusp_suspend();
171 }
172 enable_nonboot_cpus();
173 Resume_devices:
174 if (platform_suspend)
175 platform_finish();
176
177 device_resume();
178 resume_console();
179 Finish:
180 mutex_unlock(&pm_mutex);
181 return error;
182}
183
184static inline int snapshot_restore(int platform_suspend)
185{
186 int error;
187
188 mutex_lock(&pm_mutex);
189 pm_prepare_console();
190 if (platform_suspend) {
191 error = platform_prepare();
192 if (error)
193 goto Finish;
194 }
195 suspend_console();
196 error = device_suspend(PMSG_PRETHAW);
197 if (error)
198 goto Resume_devices;
199
200 error = disable_nonboot_cpus();
201 if (!error)
202 error = swsusp_resume();
203
204 enable_nonboot_cpus();
205 Resume_devices:
206 if (platform_suspend)
207 platform_finish();
208
209 device_resume();
210 resume_console();
211 Finish:
212 pm_restore_console();
213 mutex_unlock(&pm_mutex);
214 return error;
215}
216
217static int snapshot_ioctl(struct inode *inode, struct file *filp, 131static int snapshot_ioctl(struct inode *inode, struct file *filp,
218 unsigned int cmd, unsigned long arg) 132 unsigned int cmd, unsigned long arg)
219{ 133{
@@ -237,10 +151,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
237 if (data->frozen) 151 if (data->frozen)
238 break; 152 break;
239 mutex_lock(&pm_mutex); 153 mutex_lock(&pm_mutex);
240 if (freeze_processes()) { 154 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
241 thaw_processes(); 155 if (!error) {
242 error = -EBUSY; 156 error = freeze_processes();
157 if (error)
158 thaw_processes();
243 } 159 }
160 if (error)
161 pm_notifier_call_chain(PM_POST_HIBERNATION);
244 mutex_unlock(&pm_mutex); 162 mutex_unlock(&pm_mutex);
245 if (!error) 163 if (!error)
246 data->frozen = 1; 164 data->frozen = 1;
@@ -251,6 +169,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
251 break; 169 break;
252 mutex_lock(&pm_mutex); 170 mutex_lock(&pm_mutex);
253 thaw_processes(); 171 thaw_processes();
172 pm_notifier_call_chain(PM_POST_HIBERNATION);
254 mutex_unlock(&pm_mutex); 173 mutex_unlock(&pm_mutex);
255 data->frozen = 0; 174 data->frozen = 0;
256 break; 175 break;
@@ -260,7 +179,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
260 error = -EPERM; 179 error = -EPERM;
261 break; 180 break;
262 } 181 }
263 error = snapshot_suspend(data->platform_suspend); 182 error = hibernation_snapshot(data->platform_suspend);
264 if (!error) 183 if (!error)
265 error = put_user(in_suspend, (unsigned int __user *)arg); 184 error = put_user(in_suspend, (unsigned int __user *)arg);
266 if (!error) 185 if (!error)
@@ -274,7 +193,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
274 error = -EPERM; 193 error = -EPERM;
275 break; 194 break;
276 } 195 }
277 error = snapshot_restore(data->platform_suspend); 196 error = hibernation_restore(data->platform_suspend);
278 break; 197 break;
279 198
280 case SNAPSHOT_FREE: 199 case SNAPSHOT_FREE:
@@ -336,47 +255,19 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
336 break; 255 break;
337 256
338 case SNAPSHOT_S2RAM: 257 case SNAPSHOT_S2RAM:
339 if (!pm_ops) {
340 error = -ENOSYS;
341 break;
342 }
343
344 if (!data->frozen) { 258 if (!data->frozen) {
345 error = -EPERM; 259 error = -EPERM;
346 break; 260 break;
347 } 261 }
348
349 if (!mutex_trylock(&pm_mutex)) { 262 if (!mutex_trylock(&pm_mutex)) {
350 error = -EBUSY; 263 error = -EBUSY;
351 break; 264 break;
352 } 265 }
353 266 /*
354 if (pm_ops->prepare) { 267 * Tasks are frozen and the notifiers have been called with
355 error = pm_ops->prepare(PM_SUSPEND_MEM); 268 * PM_HIBERNATION_PREPARE
356 if (error) 269 */
357 goto OutS3; 270 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
358 }
359
360 /* Put devices to sleep */
361 suspend_console();
362 error = device_suspend(PMSG_SUSPEND);
363 if (error) {
364 printk(KERN_ERR "Failed to suspend some devices.\n");
365 } else {
366 error = disable_nonboot_cpus();
367 if (!error) {
368 /* Enter S3, system is already frozen */
369 suspend_enter(PM_SUSPEND_MEM);
370 enable_nonboot_cpus();
371 }
372 /* Wake up devices */
373 device_resume();
374 }
375 resume_console();
376 if (pm_ops->finish)
377 pm_ops->finish(PM_SUSPEND_MEM);
378
379 OutS3:
380 mutex_unlock(&pm_mutex); 271 mutex_unlock(&pm_mutex);
381 break; 272 break;
382 273
@@ -386,19 +277,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
386 switch (arg) { 277 switch (arg) {
387 278
388 case PMOPS_PREPARE: 279 case PMOPS_PREPARE:
389 if (hibernation_ops) { 280 data->platform_suspend = 1;
390 data->platform_suspend = 1; 281 error = 0;
391 error = 0;
392 } else {
393 error = -ENOSYS;
394 }
395 break; 282 break;
396 283
397 case PMOPS_ENTER: 284 case PMOPS_ENTER:
398 if (data->platform_suspend) { 285 if (data->platform_suspend)
399 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 286 error = hibernation_platform_enter();
400 error = hibernation_ops->enter(); 287
401 }
402 break; 288 break;
403 289
404 case PMOPS_FINISH: 290 case PMOPS_FINISH:
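
In the user.c rework the SNAPSHOT_FREEZE ioctl wraps freeze_processes() in the PM notifier bracket: PM_HIBERNATION_PREPARE is sent first, a failed freeze is undone with thaw_processes(), and PM_POST_HIBERNATION is sent whenever anything went wrong. A stubbed-out model of that error bracket follows; every helper here is a fake that always succeeds.

#include <stdio.h>

#define PM_HIBERNATION_PREPARE 1
#define PM_POST_HIBERNATION    2

/* stubs standing in for the real kernel helpers */
static int  pm_notifier_call_chain(int event) { printf("notify %d\n", event); return 0; }
static int  freeze_processes(void)            { puts("freezing tasks"); return 0; }
static void thaw_processes(void)              { puts("thawing tasks"); }

static int snapshot_freeze(void)
{
	int error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);

	if (!error) {
		error = freeze_processes();
		if (error)
			thaw_processes();        /* undo a partial freeze */
	}
	if (error)
		pm_notifier_call_chain(PM_POST_HIBERNATION);
	return error;
}

int main(void)
{
	return snapshot_freeze();
}
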
diff --git a/kernel/printk.c b/kernel/printk.c
index 0bbdeac281..051d27e36a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -449,13 +449,16 @@ static int printk_time = 1;
449#else 449#else
450static int printk_time = 0; 450static int printk_time = 0;
451#endif 451#endif
452module_param(printk_time, int, S_IRUGO | S_IWUSR); 452module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
453 453
454static int __init printk_time_setup(char *str) 454static int __init printk_time_setup(char *str)
455{ 455{
456 if (*str) 456 if (*str)
457 return 0; 457 return 0;
458 printk_time = 1; 458 printk_time = 1;
459 printk(KERN_NOTICE "The 'time' option is deprecated and "
460 "is scheduled for removal in early 2008\n");
461 printk(KERN_NOTICE "Use 'printk.time=<value>' instead\n");
459 return 1; 462 return 1;
460} 463}
461 464
@@ -483,6 +486,9 @@ static int have_callable_console(void)
483 * @fmt: format string 486 * @fmt: format string
484 * 487 *
485 * This is printk(). It can be called from any context. We want it to work. 488 * This is printk(). It can be called from any context. We want it to work.
489 * Be aware of the fact that if oops_in_progress is not set, we might try to
490 * wake klogd up which could deadlock on runqueue lock if printk() is called
491 * from scheduler code.
486 * 492 *
487 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 493 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
488 * call the console drivers. If we fail to get the semaphore we place the output 494 * call the console drivers. If we fail to get the semaphore we place the output
@@ -654,7 +660,7 @@ static void call_console_drivers(unsigned long start, unsigned long end)
654 */ 660 */
655static int __init console_setup(char *str) 661static int __init console_setup(char *str)
656{ 662{
657 char name[sizeof(console_cmdline[0].name)]; 663 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
658 char *s, *options; 664 char *s, *options;
659 int idx; 665 int idx;
660 666
@@ -662,27 +668,27 @@ static int __init console_setup(char *str)
662 * Decode str into name, index, options. 668 * Decode str into name, index, options.
663 */ 669 */
664 if (str[0] >= '0' && str[0] <= '9') { 670 if (str[0] >= '0' && str[0] <= '9') {
665 strcpy(name, "ttyS"); 671 strcpy(buf, "ttyS");
666 strncpy(name + 4, str, sizeof(name) - 5); 672 strncpy(buf + 4, str, sizeof(buf) - 5);
667 } else { 673 } else {
668 strncpy(name, str, sizeof(name) - 1); 674 strncpy(buf, str, sizeof(buf) - 1);
669 } 675 }
670 name[sizeof(name) - 1] = 0; 676 buf[sizeof(buf) - 1] = 0;
671 if ((options = strchr(str, ',')) != NULL) 677 if ((options = strchr(str, ',')) != NULL)
672 *(options++) = 0; 678 *(options++) = 0;
673#ifdef __sparc__ 679#ifdef __sparc__
674 if (!strcmp(str, "ttya")) 680 if (!strcmp(str, "ttya"))
675 strcpy(name, "ttyS0"); 681 strcpy(buf, "ttyS0");
676 if (!strcmp(str, "ttyb")) 682 if (!strcmp(str, "ttyb"))
677 strcpy(name, "ttyS1"); 683 strcpy(buf, "ttyS1");
678#endif 684#endif
679 for (s = name; *s; s++) 685 for (s = buf; *s; s++)
680 if ((*s >= '0' && *s <= '9') || *s == ',') 686 if ((*s >= '0' && *s <= '9') || *s == ',')
681 break; 687 break;
682 idx = simple_strtoul(s, NULL, 10); 688 idx = simple_strtoul(s, NULL, 10);
683 *s = 0; 689 *s = 0;
684 690
685 add_preferred_console(name, idx, options); 691 add_preferred_console(buf, idx, options);
686 return 1; 692 return 1;
687} 693}
688__setup("console=", console_setup); 694__setup("console=", console_setup);
@@ -709,7 +715,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
709 * See if this tty is not yet registered, and 715 * See if this tty is not yet registered, and
710 * if we have a slot free. 716 * if we have a slot free.
711 */ 717 */
712 for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) 718 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
713 if (strcmp(console_cmdline[i].name, name) == 0 && 719 if (strcmp(console_cmdline[i].name, name) == 0 &&
714 console_cmdline[i].index == idx) { 720 console_cmdline[i].index == idx) {
715 selected_console = i; 721 selected_console = i;
@@ -726,6 +732,25 @@ int __init add_preferred_console(char *name, int idx, char *options)
726 return 0; 732 return 0;
727} 733}
728 734
735int __init update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
736{
737 struct console_cmdline *c;
738 int i;
739
740 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
741 if (strcmp(console_cmdline[i].name, name) == 0 &&
742 console_cmdline[i].index == idx) {
743 c = &console_cmdline[i];
744 memcpy(c->name, name_new, sizeof(c->name));
745 c->name[sizeof(c->name) - 1] = 0;
746 c->options = options;
747 c->index = idx_new;
748 return i;
749 }
750 /* not found */
751 return -1;
752}
753
729#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND 754#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
730/** 755/**
731 * suspend_console - suspend the console subsystem 756 * suspend_console - suspend the console subsystem
@@ -942,6 +967,9 @@ void register_console(struct console *console)
942 if (preferred_console < 0 || bootconsole || !console_drivers) 967 if (preferred_console < 0 || bootconsole || !console_drivers)
943 preferred_console = selected_console; 968 preferred_console = selected_console;
944 969
970 if (console->early_setup)
971 console->early_setup();
972
945 /* 973 /*
946 * See if we want to use this console driver. If we 974 * See if we want to use this console driver. If we
947 * didn't select a console we take the first one 975 * didn't select a console we take the first one
@@ -985,12 +1013,15 @@ void register_console(struct console *console)
985 if (!(console->flags & CON_ENABLED)) 1013 if (!(console->flags & CON_ENABLED))
986 return; 1014 return;
987 1015
988 if (bootconsole) { 1016 if (bootconsole && (console->flags & CON_CONSDEV)) {
989 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", 1017 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n",
990 bootconsole->name, bootconsole->index, 1018 bootconsole->name, bootconsole->index,
991 console->name, console->index); 1019 console->name, console->index);
992 unregister_console(bootconsole); 1020 unregister_console(bootconsole);
993 console->flags &= ~CON_PRINTBUFFER; 1021 console->flags &= ~CON_PRINTBUFFER;
1022 } else {
1023 printk(KERN_INFO "console [%s%d] enabled\n",
1024 console->name, console->index);
994 } 1025 }
995 1026
996 /* 1027 /*
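
console_setup() now parses into a scratch buffer four bytes longer than the stored console name, so the trailing index digits of something like "ttyS0" cannot overflow it; the index is then split off and handed to add_preferred_console() separately. The program below re-implements that parsing in user space with a hard-coded name size (the kernel uses sizeof(console_cmdline[0].name)), purely to show how the name, index and options are separated.

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define NAME_LEN 8                       /* assumed stored-name size */

static void parse_console(const char *str)
{
	char buf[NAME_LEN + 4];          /* +4 for the index digits */
	char copy[64], *options, *s;
	int idx;

	strncpy(copy, str, sizeof(copy) - 1);
	copy[sizeof(copy) - 1] = 0;

	if (copy[0] >= '0' && copy[0] <= '9') {
		strcpy(buf, "ttyS");     /* bare number means ttyS<N> */
		strncpy(buf + 4, copy, sizeof(buf) - 5);
	} else {
		strncpy(buf, copy, sizeof(buf) - 1);
	}
	buf[sizeof(buf) - 1] = 0;

	options = strchr(copy, ',');
	if (options)
		*options++ = 0;

	for (s = buf; *s; s++)           /* find where the index digits start */
		if ((*s >= '0' && *s <= '9') || *s == ',')
			break;
	idx = (int)strtoul(s, NULL, 10);
	*s = 0;                          /* buf now holds just the name */

	printf("name=%s idx=%d options=%s\n", buf, idx,
	       options ? options : "(none)");
}

int main(void)
{
	parse_console("ttyS0,115200n8");
	parse_console("0,9600");
	return 0;
}
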
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ad7949a589..82a558b655 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -142,7 +142,7 @@ static int may_attach(struct task_struct *task)
142 return -EPERM; 142 return -EPERM;
143 smp_rmb(); 143 smp_rmb();
144 if (task->mm) 144 if (task->mm)
145 dumpable = task->mm->dumpable; 145 dumpable = get_dumpable(task->mm);
146 if (!dumpable && !capable(CAP_SYS_PTRACE)) 146 if (!dumpable && !capable(CAP_SYS_PTRACE))
147 return -EPERM; 147 return -EPERM;
148 148
@@ -161,6 +161,7 @@ int ptrace_may_attach(struct task_struct *task)
161int ptrace_attach(struct task_struct *task) 161int ptrace_attach(struct task_struct *task)
162{ 162{
163 int retval; 163 int retval;
164 unsigned long flags;
164 165
165 audit_ptrace(task); 166 audit_ptrace(task);
166 167
@@ -181,9 +182,7 @@ repeat:
181 * cpu's that may have task_lock). 182 * cpu's that may have task_lock).
182 */ 183 */
183 task_lock(task); 184 task_lock(task);
184 local_irq_disable(); 185 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
185 if (!write_trylock(&tasklist_lock)) {
186 local_irq_enable();
187 task_unlock(task); 186 task_unlock(task);
188 do { 187 do {
189 cpu_relax(); 188 cpu_relax();
@@ -211,7 +210,7 @@ repeat:
211 force_sig_specific(SIGSTOP, task); 210 force_sig_specific(SIGSTOP, task);
212 211
213bad: 212bad:
214 write_unlock_irq(&tasklist_lock); 213 write_unlock_irqrestore(&tasklist_lock, flags);
215 task_unlock(task); 214 task_unlock(task);
216out: 215out:
217 return retval; 216 return retval;
@@ -491,3 +490,22 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
491 return ret; 490 return ret;
492} 491}
493#endif /* __ARCH_SYS_PTRACE */ 492#endif /* __ARCH_SYS_PTRACE */
493
494int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
495{
496 unsigned long tmp;
497 int copied;
498
499 copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
500 if (copied != sizeof(tmp))
501 return -EIO;
502 return put_user(tmp, (unsigned long __user *)data);
503}
504
505int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
506{
507 int copied;
508
509 copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
510 return (copied == sizeof(data)) ? 0 : -EIO;
511}
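
The new generic_ptrace_peekdata()/generic_ptrace_pokedata() helpers copy exactly one word to or from the tracee with access_process_vm() and return -EIO on a short copy, so architectures can share them for PTRACE_PEEKDATA and PTRACE_POKEDATA. The demo below drives those same two requests from the tracer side as an ordinary user-space program; it relies on fork() duplicating the address space so that &value is meaningful in both processes.

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

static long value = 42;                  /* same address in parent and child */

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);          /* wait for the parent's requests */
		printf("child sees value = %ld\n", value);
		return 0;
	}

	waitpid(pid, NULL, 0);           /* child is now stopped */

	long word = ptrace(PTRACE_PEEKDATA, pid, &value, NULL);
	printf("parent peeked %ld\n", word);

	ptrace(PTRACE_POKEDATA, pid, &value, (void *)1234L);
	ptrace(PTRACE_CONT, pid, NULL, NULL);
	waitpid(pid, NULL, 0);
	return 0;
}
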
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 55ba82a85a..ddff332477 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -40,6 +40,7 @@
40#include <linux/moduleparam.h> 40#include <linux/moduleparam.h>
41#include <linux/percpu.h> 41#include <linux/percpu.h>
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/freezer.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/random.h> 45#include <linux/random.h>
45#include <linux/delay.h> 46#include <linux/delay.h>
@@ -518,7 +519,6 @@ rcu_torture_writer(void *arg)
518 519
519 VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); 520 VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
520 set_user_nice(current, 19); 521 set_user_nice(current, 19);
521 current->flags |= PF_NOFREEZE;
522 522
523 do { 523 do {
524 schedule_timeout_uninterruptible(1); 524 schedule_timeout_uninterruptible(1);
@@ -558,7 +558,6 @@ rcu_torture_fakewriter(void *arg)
558 558
559 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); 559 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
560 set_user_nice(current, 19); 560 set_user_nice(current, 19);
561 current->flags |= PF_NOFREEZE;
562 561
563 do { 562 do {
564 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 563 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
@@ -589,7 +588,6 @@ rcu_torture_reader(void *arg)
589 588
590 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 589 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
591 set_user_nice(current, 19); 590 set_user_nice(current, 19);
592 current->flags |= PF_NOFREEZE;
593 591
594 do { 592 do {
595 idx = cur_ops->readlock(); 593 idx = cur_ops->readlock();
diff --git a/kernel/relay.c b/kernel/relay.c
index 95db8c79fe..510fbbd7b5 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -21,6 +21,7 @@
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/splice.h>
24 25
25/* list of open channels, for cpu hotplug */ 26/* list of open channels, for cpu hotplug */
26static DEFINE_MUTEX(relay_channels_mutex); 27static DEFINE_MUTEX(relay_channels_mutex);
@@ -79,7 +80,7 @@ static struct vm_operations_struct relay_file_mmap_ops = {
79 * 80 *
80 * Caller should already have grabbed mmap_sem. 81 * Caller should already have grabbed mmap_sem.
81 */ 82 */
82int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) 83static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
83{ 84{
84 unsigned long length = vma->vm_end - vma->vm_start; 85 unsigned long length = vma->vm_end - vma->vm_start;
85 struct file *filp = vma->vm_file; 86 struct file *filp = vma->vm_file;
@@ -121,6 +122,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
121 buf->page_array[i] = alloc_page(GFP_KERNEL); 122 buf->page_array[i] = alloc_page(GFP_KERNEL);
122 if (unlikely(!buf->page_array[i])) 123 if (unlikely(!buf->page_array[i]))
123 goto depopulate; 124 goto depopulate;
125 set_page_private(buf->page_array[i], (unsigned long)buf);
124 } 126 }
125 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); 127 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
126 if (!mem) 128 if (!mem)
@@ -143,7 +145,7 @@ depopulate:
143 * 145 *
144 * Returns channel buffer if successful, %NULL otherwise. 146 * Returns channel buffer if successful, %NULL otherwise.
145 */ 147 */
146struct rchan_buf *relay_create_buf(struct rchan *chan) 148static struct rchan_buf *relay_create_buf(struct rchan *chan)
147{ 149{
148 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); 150 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
149 if (!buf) 151 if (!buf)
@@ -173,7 +175,7 @@ free_buf:
173 * 175 *
174 * Should only be called from kref_put(). 176 * Should only be called from kref_put().
175 */ 177 */
176void relay_destroy_channel(struct kref *kref) 178static void relay_destroy_channel(struct kref *kref)
177{ 179{
178 struct rchan *chan = container_of(kref, struct rchan, kref); 180 struct rchan *chan = container_of(kref, struct rchan, kref);
179 kfree(chan); 181 kfree(chan);
@@ -183,7 +185,7 @@ void relay_destroy_channel(struct kref *kref)
183 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer 185 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
184 * @buf: the buffer struct 186 * @buf: the buffer struct
185 */ 187 */
186void relay_destroy_buf(struct rchan_buf *buf) 188static void relay_destroy_buf(struct rchan_buf *buf)
187{ 189{
188 struct rchan *chan = buf->chan; 190 struct rchan *chan = buf->chan;
189 unsigned int i; 191 unsigned int i;
@@ -208,7 +210,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
208 * rchan_buf_struct and the channel buffer. Should only be called from 210 * rchan_buf_struct and the channel buffer. Should only be called from
209 * kref_put(). 211 * kref_put().
210 */ 212 */
211void relay_remove_buf(struct kref *kref) 213static void relay_remove_buf(struct kref *kref)
212{ 214{
213 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); 215 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
214 buf->chan->cb->remove_buf_file(buf->dentry); 216 buf->chan->cb->remove_buf_file(buf->dentry);
@@ -221,11 +223,10 @@ void relay_remove_buf(struct kref *kref)
221 * 223 *
222 * Returns 1 if the buffer is empty, 0 otherwise. 224 * Returns 1 if the buffer is empty, 0 otherwise.
223 */ 225 */
224int relay_buf_empty(struct rchan_buf *buf) 226static int relay_buf_empty(struct rchan_buf *buf)
225{ 227{
226 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; 228 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
227} 229}
228EXPORT_SYMBOL_GPL(relay_buf_empty);
229 230
230/** 231/**
231 * relay_buf_full - boolean, is the channel buffer full? 232 * relay_buf_full - boolean, is the channel buffer full?
@@ -970,43 +971,6 @@ static int subbuf_read_actor(size_t read_start,
970 return ret; 971 return ret;
971} 972}
972 973
973/*
974 * subbuf_send_actor - send up to one subbuf's worth of data
975 */
976static int subbuf_send_actor(size_t read_start,
977 struct rchan_buf *buf,
978 size_t avail,
979 read_descriptor_t *desc,
980 read_actor_t actor)
981{
982 unsigned long pidx, poff;
983 unsigned int subbuf_pages;
984 int ret = 0;
985
986 subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT;
987 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
988 poff = read_start & ~PAGE_MASK;
989 while (avail) {
990 struct page *p = buf->page_array[pidx];
991 unsigned int len;
992
993 len = PAGE_SIZE - poff;
994 if (len > avail)
995 len = avail;
996
997 len = actor(desc, p, poff, len);
998 if (desc->error)
999 break;
1000
1001 avail -= len;
1002 ret += len;
1003 poff = 0;
1004 pidx = (pidx + 1) % subbuf_pages;
1005 }
1006
1007 return ret;
1008}
1009
1010typedef int (*subbuf_actor_t) (size_t read_start, 974typedef int (*subbuf_actor_t) (size_t read_start,
1011 struct rchan_buf *buf, 975 struct rchan_buf *buf,
1012 size_t avail, 976 size_t avail,
@@ -1067,19 +1031,161 @@ static ssize_t relay_file_read(struct file *filp,
1067 NULL, &desc); 1031 NULL, &desc);
1068} 1032}
1069 1033
1070static ssize_t relay_file_sendfile(struct file *filp, 1034static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
1071 loff_t *ppos,
1072 size_t count,
1073 read_actor_t actor,
1074 void *target)
1075{ 1035{
1076 read_descriptor_t desc; 1036 rbuf->bytes_consumed += bytes_consumed;
1077 desc.written = 0; 1037
1078 desc.count = count; 1038 if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
1079 desc.arg.data = target; 1039 relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
1080 desc.error = 0; 1040 rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
1081 return relay_file_read_subbufs(filp, ppos, subbuf_send_actor, 1041 }
1082 actor, &desc); 1042}
1043
1044static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1045 struct pipe_buffer *buf)
1046{
1047 struct rchan_buf *rbuf;
1048
1049 rbuf = (struct rchan_buf *)page_private(buf->page);
1050 relay_consume_bytes(rbuf, buf->private);
1051}
1052
1053static struct pipe_buf_operations relay_pipe_buf_ops = {
1054 .can_merge = 0,
1055 .map = generic_pipe_buf_map,
1056 .unmap = generic_pipe_buf_unmap,
1057 .confirm = generic_pipe_buf_confirm,
1058 .release = relay_pipe_buf_release,
1059 .steal = generic_pipe_buf_steal,
1060 .get = generic_pipe_buf_get,
1061};
1062
1063/*
1064 * subbuf_splice_actor - splice up to one subbuf's worth of data
1065 */
1066static int subbuf_splice_actor(struct file *in,
1067 loff_t *ppos,
1068 struct pipe_inode_info *pipe,
1069 size_t len,
1070 unsigned int flags,
1071 int *nonpad_ret)
1072{
1073 unsigned int pidx, poff, total_len, subbuf_pages, ret;
1074 struct rchan_buf *rbuf = in->private_data;
1075 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1076 uint64_t pos = (uint64_t) *ppos;
1077 uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size;
1078 size_t read_start = (size_t) do_div(pos, alloc_size);
1079 size_t read_subbuf = read_start / subbuf_size;
1080 size_t padding = rbuf->padding[read_subbuf];
1081 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
1082 struct page *pages[PIPE_BUFFERS];
1083 struct partial_page partial[PIPE_BUFFERS];
1084 struct splice_pipe_desc spd = {
1085 .pages = pages,
1086 .nr_pages = 0,
1087 .partial = partial,
1088 .flags = flags,
1089 .ops = &relay_pipe_buf_ops,
1090 };
1091
1092 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1093 return 0;
1094
1095 /*
1096 * Adjust read len, if longer than what is available
1097 */
1098 if (len > (subbuf_size - read_start % subbuf_size))
1099 len = subbuf_size - read_start % subbuf_size;
1100
1101 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1102 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1103 poff = read_start & ~PAGE_MASK;
1104
1105 for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) {
1106 unsigned int this_len, this_end, private;
1107 unsigned int cur_pos = read_start + total_len;
1108
1109 if (!len)
1110 break;
1111
1112 this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
1113 private = this_len;
1114
1115 spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
1116 spd.partial[spd.nr_pages].offset = poff;
1117
1118 this_end = cur_pos + this_len;
1119 if (this_end >= nonpad_end) {
1120 this_len = nonpad_end - cur_pos;
1121 private = this_len + padding;
1122 }
1123 spd.partial[spd.nr_pages].len = this_len;
1124 spd.partial[spd.nr_pages].private = private;
1125
1126 len -= this_len;
1127 total_len += this_len;
1128 poff = 0;
1129 pidx = (pidx + 1) % subbuf_pages;
1130
1131 if (this_end >= nonpad_end) {
1132 spd.nr_pages++;
1133 break;
1134 }
1135 }
1136
1137 if (!spd.nr_pages)
1138 return 0;
1139
1140 ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
1141 if (ret < 0 || ret < total_len)
1142 return ret;
1143
1144 if (read_start + ret == nonpad_end)
1145 ret += padding;
1146
1147 return ret;
1148}
1149
1150static ssize_t relay_file_splice_read(struct file *in,
1151 loff_t *ppos,
1152 struct pipe_inode_info *pipe,
1153 size_t len,
1154 unsigned int flags)
1155{
1156 ssize_t spliced;
1157 int ret;
1158 int nonpad_ret = 0;
1159
1160 ret = 0;
1161 spliced = 0;
1162
1163 while (len) {
1164 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
1165 if (ret < 0)
1166 break;
1167 else if (!ret) {
1168 if (spliced)
1169 break;
1170 if (flags & SPLICE_F_NONBLOCK) {
1171 ret = -EAGAIN;
1172 break;
1173 }
1174 }
1175
1176 *ppos += ret;
1177 if (ret > len)
1178 len = 0;
1179 else
1180 len -= ret;
1181 spliced += nonpad_ret;
1182 nonpad_ret = 0;
1183 }
1184
1185 if (spliced)
1186 return spliced;
1187
1188 return ret;
1083} 1189}
1084 1190
1085const struct file_operations relay_file_operations = { 1191const struct file_operations relay_file_operations = {
@@ -1089,7 +1195,7 @@ const struct file_operations relay_file_operations = {
1089 .read = relay_file_read, 1195 .read = relay_file_read,
1090 .llseek = no_llseek, 1196 .llseek = no_llseek,
1091 .release = relay_file_release, 1197 .release = relay_file_release,
1092 .sendfile = relay_file_sendfile, 1198 .splice_read = relay_file_splice_read,
1093}; 1199};
1094EXPORT_SYMBOL_GPL(relay_file_operations); 1200EXPORT_SYMBOL_GPL(relay_file_operations);
1095 1201
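With relay's sendfile hook gone, a relay buffer file is consumed through splice_read(): subbuf_splice_actor() hands whole sub-buffer pages to the pipe, and relay_pipe_buf_release() credits the bytes back via relay_consume_bytes() as the pipe drains. A minimal userspace sketch of reading a channel this way follows; the debugfs path and the 64k chunk size are assumptions for illustration, not part of the patch.

#define _GNU_SOURCE		/* for splice() and SPLICE_F_NONBLOCK */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int pfd[2];
	/* path is an assumption -- use wherever the channel's buf files
	 * were created by the client's create_buf_file() callback */
	int in = open("/sys/kernel/debug/mychan0", O_RDONLY);

	if (in < 0 || pipe(pfd) < 0)
		return 1;

	for (;;) {
		/* move up to 64k straight into the pipe, no copy to user */
		ssize_t n = splice(in, NULL, pfd[1], NULL, 65536,
				   SPLICE_F_NONBLOCK);
		if (n <= 0)	/* 0: no sub-buffer ready; -1/EAGAIN: retry */
			break;
		/* a real consumer would now splice pfd[0] onward to a file
		 * or socket instead of letting the data sit in the pipe */
	}
	close(in);
	return 0;
}

SPLICE_F_NONBLOCK here matches the -EAGAIN handling in relay_file_splice_read() above.
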
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index da8d6bf464..5aedbee014 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -29,12 +29,6 @@
29 29
30#include "rtmutex_common.h" 30#include "rtmutex_common.h"
31 31
32#ifdef CONFIG_DEBUG_RT_MUTEXES
33# include "rtmutex-debug.h"
34#else
35# include "rtmutex.h"
36#endif
37
38# define TRACE_WARN_ON(x) WARN_ON(x) 32# define TRACE_WARN_ON(x) WARN_ON(x)
39# define TRACE_BUG_ON(x) BUG_ON(x) 33# define TRACE_BUG_ON(x) BUG_ON(x)
40 34
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 015fc633c9..e3055ba691 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -260,6 +260,7 @@ static int test_func(void *data)
260 int ret; 260 int ret;
261 261
262 current->flags |= PF_MUTEX_TESTER; 262 current->flags |= PF_MUTEX_TESTER;
263 set_freezable();
263 allow_signal(SIGHUP); 264 allow_signal(SIGHUP);
264 265
265 for(;;) { 266 for(;;) {
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 17d28ce203..8cd9bd2cdb 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -17,12 +17,6 @@
17 17
18#include "rtmutex_common.h" 18#include "rtmutex_common.h"
19 19
20#ifdef CONFIG_DEBUG_RT_MUTEXES
21# include "rtmutex-debug.h"
22#else
23# include "rtmutex.h"
24#endif
25
26/* 20/*
27 * lock->owner state tracking: 21 * lock->owner state tracking:
28 * 22 *
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 9c75856e79..2d3b83593c 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -103,7 +103,7 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
103 103
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) 104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{ 105{
106 return (struct task_struct *) 106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); 107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108} 108}
109 109
@@ -120,4 +120,11 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 122 struct task_struct *proxy_owner);
123
124#ifdef CONFIG_DEBUG_RT_MUTEXES
125# include "rtmutex-debug.h"
126#else
127# include "rtmutex.h"
128#endif
129
123#endif 130#endif
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 9a87886b02..1ec620c030 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -20,7 +20,7 @@ void down_read(struct rw_semaphore *sem)
20 might_sleep(); 20 might_sleep();
21 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 21 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
22 22
23 __down_read(sem); 23 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
24} 24}
25 25
26EXPORT_SYMBOL(down_read); 26EXPORT_SYMBOL(down_read);
@@ -47,7 +47,7 @@ void down_write(struct rw_semaphore *sem)
47 might_sleep(); 47 might_sleep();
48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
49 49
50 __down_write(sem); 50 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
51} 51}
52 52
53EXPORT_SYMBOL(down_write); 53EXPORT_SYMBOL(down_write);
@@ -111,7 +111,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
111 might_sleep(); 111 might_sleep();
112 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); 112 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
113 113
114 __down_read(sem); 114 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
115} 115}
116 116
117EXPORT_SYMBOL(down_read_nested); 117EXPORT_SYMBOL(down_read_nested);
@@ -130,7 +130,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
130 might_sleep(); 130 might_sleep();
131 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 131 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
132 132
133 __down_write_nested(sem, subclass); 133 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
134} 134}
135 135
136EXPORT_SYMBOL(down_write_nested); 136EXPORT_SYMBOL(down_write_nested);
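
The rwsem entry points now go through LOCK_CONTENDED so the lock-statistics code can see whether the trylock fast path succeeded before blocking. Roughly, the macro behaves like the sketch below (an approximation of the CONFIG_LOCK_STAT variant in lockdep.h; hook signatures simplified); without lock statistics configured it collapses to the plain blocking call, so the change costs nothing on non-instrumented kernels.

/* Approximate expansion: try the uncontended fast path first, record a
 * contention event only if that fails, note the acquisition either way. */
#define LOCK_CONTENDED(_lock, try, lock)			\
do {								\
	if (!try(_lock)) {					\
		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
		lock(_lock);					\
	}							\
	lock_acquired(&(_lock)->dep_map);			\
} while (0)
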
diff --git a/kernel/sched.c b/kernel/sched.c
index 50e1a31226..93cf241cfb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -16,13 +16,19 @@
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
19 */ 25 */
20 26
21#include <linux/mm.h> 27#include <linux/mm.h>
22#include <linux/module.h> 28#include <linux/module.h>
23#include <linux/nmi.h> 29#include <linux/nmi.h>
24#include <linux/init.h> 30#include <linux/init.h>
25#include <asm/uaccess.h> 31#include <linux/uaccess.h>
26#include <linux/highmem.h> 32#include <linux/highmem.h>
27#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
28#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
@@ -53,9 +59,9 @@
53#include <linux/kprobes.h> 59#include <linux/kprobes.h>
54#include <linux/delayacct.h> 60#include <linux/delayacct.h>
55#include <linux/reciprocal_div.h> 61#include <linux/reciprocal_div.h>
62#include <linux/unistd.h>
56 63
57#include <asm/tlb.h> 64#include <asm/tlb.h>
58#include <asm/unistd.h>
59 65
60/* 66/*
61 * Scheduler clock - returns current time in nanosec units. 67 * Scheduler clock - returns current time in nanosec units.
@@ -91,6 +97,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
91#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 97#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 98#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
93 99
100#define NICE_0_LOAD SCHED_LOAD_SCALE
101#define NICE_0_SHIFT SCHED_LOAD_SHIFT
102
94/* 103/*
95 * These are the 'tuning knobs' of the scheduler: 104 * These are the 'tuning knobs' of the scheduler:
96 * 105 *
@@ -100,87 +109,6 @@ unsigned long long __attribute__((weak)) sched_clock(void)
100 */ 109 */
101#define MIN_TIMESLICE max(5 * HZ / 1000, 1) 110#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
102#define DEF_TIMESLICE (100 * HZ / 1000) 111#define DEF_TIMESLICE (100 * HZ / 1000)
103#define ON_RUNQUEUE_WEIGHT 30
104#define CHILD_PENALTY 95
105#define PARENT_PENALTY 100
106#define EXIT_WEIGHT 3
107#define PRIO_BONUS_RATIO 25
108#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
109#define INTERACTIVE_DELTA 2
110#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
111#define STARVATION_LIMIT (MAX_SLEEP_AVG)
112#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
113
114/*
115 * If a task is 'interactive' then we reinsert it in the active
116 * array after it has expired its current timeslice. (it will not
117 * continue to run immediately, it will still roundrobin with
118 * other interactive tasks.)
119 *
120 * This part scales the interactivity limit depending on niceness.
121 *
122 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
123 * Here are a few examples of different nice levels:
124 *
125 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
126 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
127 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
128 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
129 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
130 *
131 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
132 * priority range a task can explore, a value of '1' means the
133 * task is rated interactive.)
134 *
135 * Ie. nice +19 tasks can never get 'interactive' enough to be
136 * reinserted into the active array. And only heavily CPU-hog nice -20
137 * tasks will be expired. Default nice 0 tasks are somewhere between,
138 * it takes some effort for them to get interactive, but it's not
139 * too hard.
140 */
141
142#define CURRENT_BONUS(p) \
143 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
144 MAX_SLEEP_AVG)
145
146#define GRANULARITY (10 * HZ / 1000 ? : 1)
147
148#ifdef CONFIG_SMP
149#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
150 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
151 num_online_cpus())
152#else
153#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
154 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
155#endif
156
157#define SCALE(v1,v1_max,v2_max) \
158 (v1) * (v2_max) / (v1_max)
159
160#define DELTA(p) \
161 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
162 INTERACTIVE_DELTA)
163
164#define TASK_INTERACTIVE(p) \
165 ((p)->prio <= (p)->static_prio - DELTA(p))
166
167#define INTERACTIVE_SLEEP(p) \
168 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
169 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
170
171#define TASK_PREEMPTS_CURR(p, rq) \
172 ((p)->prio < (rq)->curr->prio)
173
174#define SCALE_PRIO(x, prio) \
175 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
176
177static unsigned int static_prio_timeslice(int static_prio)
178{
179 if (static_prio < NICE_TO_PRIO(0))
180 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
181 else
182 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
183}
184 112
185#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
186/* 114/*
@@ -203,28 +131,87 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
203} 131}
204#endif 132#endif
205 133
134#define SCALE_PRIO(x, prio) \
135 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
136
206/* 137/*
207 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 138 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
208 * to time slice values: [800ms ... 100ms ... 5ms] 139 * to time slice values: [800ms ... 100ms ... 5ms]
209 *
210 * The higher a thread's priority, the bigger timeslices
211 * it gets during one round of execution. But even the lowest
212 * priority thread gets MIN_TIMESLICE worth of execution time.
213 */ 140 */
141static unsigned int static_prio_timeslice(int static_prio)
142{
143 if (static_prio == NICE_TO_PRIO(19))
144 return 1;
145
146 if (static_prio < NICE_TO_PRIO(0))
147 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
148 else
149 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
150}
151
152static inline int rt_policy(int policy)
153{
154 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
155 return 1;
156 return 0;
157}
214 158
215static inline unsigned int task_timeslice(struct task_struct *p) 159static inline int task_has_rt_policy(struct task_struct *p)
216{ 160{
217 return static_prio_timeslice(p->static_prio); 161 return rt_policy(p->policy);
218} 162}
219 163
220/* 164/*
221 * These are the runqueue data structures: 165 * This is the priority-queue data structure of the RT scheduling class:
222 */ 166 */
167struct rt_prio_array {
168 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
169 struct list_head queue[MAX_RT_PRIO];
170};
171
172struct load_stat {
173 struct load_weight load;
174 u64 load_update_start, load_update_last;
175 unsigned long delta_fair, delta_exec, delta_stat;
176};
177
178/* CFS-related fields in a runqueue */
179struct cfs_rq {
180 struct load_weight load;
181 unsigned long nr_running;
182
183 s64 fair_clock;
184 u64 exec_clock;
185 s64 wait_runtime;
186 u64 sleeper_bonus;
187 unsigned long wait_runtime_overruns, wait_runtime_underruns;
188
189 struct rb_root tasks_timeline;
190 struct rb_node *rb_leftmost;
191 struct rb_node *rb_load_balance_curr;
192#ifdef CONFIG_FAIR_GROUP_SCHED
193 /* 'curr' points to currently running entity on this cfs_rq.
194 * It is set to NULL otherwise (i.e when none are currently running).
195 */
196 struct sched_entity *curr;
197 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
198
199 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
200 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
201 * (like users, containers etc.)
202 *
203 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
204 * list is used during load balance.
205 */
206 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
207#endif
208};
223 209
224struct prio_array { 210/* Real-Time classes' related field in a runqueue: */
225 unsigned int nr_active; 211struct rt_rq {
226 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ 212 struct rt_prio_array active;
227 struct list_head queue[MAX_PRIO]; 213 int rt_load_balance_idx;
214 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
228}; 215};
229 216
230/* 217/*
@@ -235,22 +222,28 @@ struct prio_array {
235 * acquire operations must be ordered by ascending &runqueue. 222 * acquire operations must be ordered by ascending &runqueue.
236 */ 223 */
237struct rq { 224struct rq {
238 spinlock_t lock; 225 spinlock_t lock; /* runqueue lock */
239 226
240 /* 227 /*
241 * nr_running and cpu_load should be in the same cacheline because 228 * nr_running and cpu_load should be in the same cacheline because
242 * remote CPUs use both these fields when doing load calculation. 229 * remote CPUs use both these fields when doing load calculation.
243 */ 230 */
244 unsigned long nr_running; 231 unsigned long nr_running;
245 unsigned long raw_weighted_load; 232 #define CPU_LOAD_IDX_MAX 5
246#ifdef CONFIG_SMP 233 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
247 unsigned long cpu_load[3];
248 unsigned char idle_at_tick; 234 unsigned char idle_at_tick;
249#ifdef CONFIG_NO_HZ 235#ifdef CONFIG_NO_HZ
250 unsigned char in_nohz_recently; 236 unsigned char in_nohz_recently;
251#endif 237#endif
238 struct load_stat ls; /* capture load from *all* tasks on this cpu */
239 unsigned long nr_load_updates;
240 u64 nr_switches;
241
242 struct cfs_rq cfs;
243#ifdef CONFIG_FAIR_GROUP_SCHED
244 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
252#endif 245#endif
253 unsigned long long nr_switches; 246 struct rt_rq rt;
254 247
255 /* 248 /*
256 * This is part of a global counter where only the total sum 249 * This is part of a global counter where only the total sum
@@ -260,14 +253,18 @@ struct rq {
260 */ 253 */
261 unsigned long nr_uninterruptible; 254 unsigned long nr_uninterruptible;
262 255
263 unsigned long expired_timestamp;
264 /* Cached timestamp set by update_cpu_clock() */
265 unsigned long long most_recent_timestamp;
266 struct task_struct *curr, *idle; 256 struct task_struct *curr, *idle;
267 unsigned long next_balance; 257 unsigned long next_balance;
268 struct mm_struct *prev_mm; 258 struct mm_struct *prev_mm;
269 struct prio_array *active, *expired, arrays[2]; 259
270 int best_expired_prio; 260 u64 clock, prev_clock_raw;
261 s64 clock_max_delta;
262
263 unsigned int clock_warps, clock_overflows;
264 unsigned int clock_unstable_events;
265
266 struct sched_class *load_balance_class;
267
271 atomic_t nr_iowait; 268 atomic_t nr_iowait;
272 269
273#ifdef CONFIG_SMP 270#ifdef CONFIG_SMP
@@ -304,9 +301,14 @@ struct rq {
304 struct lock_class_key rq_lock_key; 301 struct lock_class_key rq_lock_key;
305}; 302};
306 303
307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
308static DEFINE_MUTEX(sched_hotcpu_mutex); 305static DEFINE_MUTEX(sched_hotcpu_mutex);
309 306
307static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
308{
309 rq->curr->sched_class->check_preempt_curr(rq, p);
310}
311
310static inline int cpu_of(struct rq *rq) 312static inline int cpu_of(struct rq *rq)
311{ 313{
312#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
@@ -317,6 +319,52 @@ static inline int cpu_of(struct rq *rq)
317} 319}
318 320
319/* 321/*
322 * Per-runqueue clock, as finegrained as the platform can give us:
323 */
324static unsigned long long __rq_clock(struct rq *rq)
325{
326 u64 prev_raw = rq->prev_clock_raw;
327 u64 now = sched_clock();
328 s64 delta = now - prev_raw;
329 u64 clock = rq->clock;
330
331 /*
332 * Protect against sched_clock() occasionally going backwards:
333 */
334 if (unlikely(delta < 0)) {
335 clock++;
336 rq->clock_warps++;
337 } else {
338 /*
339 * Catch too large forward jumps too:
340 */
341 if (unlikely(delta > 2*TICK_NSEC)) {
342 clock++;
343 rq->clock_overflows++;
344 } else {
345 if (unlikely(delta > rq->clock_max_delta))
346 rq->clock_max_delta = delta;
347 clock += delta;
348 }
349 }
350
351 rq->prev_clock_raw = now;
352 rq->clock = clock;
353
354 return clock;
355}
356
357static inline unsigned long long rq_clock(struct rq *rq)
358{
359 int this_cpu = smp_processor_id();
360
361 if (this_cpu == cpu_of(rq))
362 return __rq_clock(rq);
363
364 return rq->clock;
365}
366
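The per-runqueue clock only trusts small forward movements of sched_clock(): a backwards step counts as a warp and a jump of more than two ticks as an overflow, and in both cases rq->clock advances by just one nanosecond so it stays monotonic. A standalone sketch of the same filtering rule (simplified -- the max-delta bookkeeping is dropped, and TICK_NSEC is approximated for HZ=1000):

#include <stdint.h>
#include <stdio.h>

#define MAX_DELTA 2000000ULL	/* stand-in for 2*TICK_NSEC at HZ=1000 */

/* Accept small forward deltas; on a backwards or oversized jump advance
 * the filtered clock by a single nanosecond, as __rq_clock() does. */
static uint64_t rq_clock_update(uint64_t *clock, uint64_t *prev_raw,
				uint64_t raw_now)
{
	int64_t delta = (int64_t)(raw_now - *prev_raw);

	if (delta < 0 || delta > (int64_t)MAX_DELTA)
		*clock += 1;		/* warp / overflow */
	else
		*clock += delta;	/* normal forward progress */

	*prev_raw = raw_now;
	return *clock;
}

int main(void)
{
	uint64_t clock = 0, prev = 0;

	/* normal step, then a backwards step, then an oversized jump */
	printf("%llu\n", (unsigned long long)rq_clock_update(&clock, &prev, 1000));    /* 1000 */
	printf("%llu\n", (unsigned long long)rq_clock_update(&clock, &prev, 500));     /* 1001 */
	printf("%llu\n", (unsigned long long)rq_clock_update(&clock, &prev, 9000000)); /* 1002 */
	return 0;
}
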
367/*
320 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 368 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
321 * See detach_destroy_domains: synchronize_sched for details. 369 * See detach_destroy_domains: synchronize_sched for details.
322 * 370 *
@@ -331,6 +379,35 @@ static inline int cpu_of(struct rq *rq)
331#define task_rq(p) cpu_rq(task_cpu(p)) 379#define task_rq(p) cpu_rq(task_cpu(p))
332#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 380#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
333 381
382/*
383 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
384 * clock constructed from sched_clock():
385 */
386unsigned long long cpu_clock(int cpu)
387{
388 struct rq *rq = cpu_rq(cpu);
389 unsigned long long now;
390 unsigned long flags;
391
392 spin_lock_irqsave(&rq->lock, flags);
393 now = rq_clock(rq);
394 spin_unlock_irqrestore(&rq->lock, flags);
395
396 return now;
397}
398
399#ifdef CONFIG_FAIR_GROUP_SCHED
400/* Change a task's ->cfs_rq if it moves across CPUs */
401static inline void set_task_cfs_rq(struct task_struct *p)
402{
403 p->se.cfs_rq = &task_rq(p)->cfs;
404}
405#else
406static inline void set_task_cfs_rq(struct task_struct *p)
407{
408}
409#endif
410
334#ifndef prepare_arch_switch 411#ifndef prepare_arch_switch
335# define prepare_arch_switch(next) do { } while (0) 412# define prepare_arch_switch(next) do { } while (0)
336#endif 413#endif
@@ -460,134 +537,6 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
460 spin_unlock_irqrestore(&rq->lock, *flags); 537 spin_unlock_irqrestore(&rq->lock, *flags);
461} 538}
462 539
463#ifdef CONFIG_SCHEDSTATS
464/*
465 * bump this up when changing the output format or the meaning of an existing
466 * format, so that tools can adapt (or abort)
467 */
468#define SCHEDSTAT_VERSION 14
469
470static int show_schedstat(struct seq_file *seq, void *v)
471{
472 int cpu;
473
474 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
475 seq_printf(seq, "timestamp %lu\n", jiffies);
476 for_each_online_cpu(cpu) {
477 struct rq *rq = cpu_rq(cpu);
478#ifdef CONFIG_SMP
479 struct sched_domain *sd;
480 int dcnt = 0;
481#endif
482
483 /* runqueue-specific stats */
484 seq_printf(seq,
485 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
486 cpu, rq->yld_both_empty,
487 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
488 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
489 rq->ttwu_cnt, rq->ttwu_local,
490 rq->rq_sched_info.cpu_time,
491 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
492
493 seq_printf(seq, "\n");
494
495#ifdef CONFIG_SMP
496 /* domain-specific stats */
497 preempt_disable();
498 for_each_domain(cpu, sd) {
499 enum idle_type itype;
500 char mask_str[NR_CPUS];
501
502 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
503 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
504 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
505 itype++) {
506 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
507 "%lu",
508 sd->lb_cnt[itype],
509 sd->lb_balanced[itype],
510 sd->lb_failed[itype],
511 sd->lb_imbalance[itype],
512 sd->lb_gained[itype],
513 sd->lb_hot_gained[itype],
514 sd->lb_nobusyq[itype],
515 sd->lb_nobusyg[itype]);
516 }
517 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
518 " %lu %lu %lu\n",
519 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
520 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
521 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
522 sd->ttwu_wake_remote, sd->ttwu_move_affine,
523 sd->ttwu_move_balance);
524 }
525 preempt_enable();
526#endif
527 }
528 return 0;
529}
530
531static int schedstat_open(struct inode *inode, struct file *file)
532{
533 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
534 char *buf = kmalloc(size, GFP_KERNEL);
535 struct seq_file *m;
536 int res;
537
538 if (!buf)
539 return -ENOMEM;
540 res = single_open(file, show_schedstat, NULL);
541 if (!res) {
542 m = file->private_data;
543 m->buf = buf;
544 m->size = size;
545 } else
546 kfree(buf);
547 return res;
548}
549
550const struct file_operations proc_schedstat_operations = {
551 .open = schedstat_open,
552 .read = seq_read,
553 .llseek = seq_lseek,
554 .release = single_release,
555};
556
557/*
558 * Expects runqueue lock to be held for atomicity of update
559 */
560static inline void
561rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
562{
563 if (rq) {
564 rq->rq_sched_info.run_delay += delta_jiffies;
565 rq->rq_sched_info.pcnt++;
566 }
567}
568
569/*
570 * Expects runqueue lock to be held for atomicity of update
571 */
572static inline void
573rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
574{
575 if (rq)
576 rq->rq_sched_info.cpu_time += delta_jiffies;
577}
578# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
579# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
580#else /* !CONFIG_SCHEDSTATS */
581static inline void
582rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
583{}
584static inline void
585rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
586{}
587# define schedstat_inc(rq, field) do { } while (0)
588# define schedstat_add(rq, field, amt) do { } while (0)
589#endif
590
591/* 540/*
592 * this_rq_lock - lock this runqueue and disable interrupts. 541 * this_rq_lock - lock this runqueue and disable interrupts.
593 */ 542 */
@@ -603,177 +552,172 @@ static inline struct rq *this_rq_lock(void)
603 return rq; 552 return rq;
604} 553}
605 554
606#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
607/* 555/*
608 * Called when a process is dequeued from the active array and given 556 * CPU frequency is/was unstable - start new by setting prev_clock_raw:
609 * the cpu. We should note that with the exception of interactive
610 * tasks, the expired queue will become the active queue after the active
611 * queue is empty, without explicitly dequeuing and requeuing tasks in the
612 * expired queue. (Interactive tasks may be requeued directly to the
613 * active queue, thus delaying tasks in the expired queue from running;
614 * see scheduler_tick()).
615 *
616 * This function is only called from sched_info_arrive(), rather than
617 * dequeue_task(). Even though a task may be queued and dequeued multiple
618 * times as it is shuffled about, we're really interested in knowing how
619 * long it was from the *first* time it was queued to the time that it
620 * finally hit a cpu.
621 */ 557 */
622static inline void sched_info_dequeued(struct task_struct *t) 558void sched_clock_unstable_event(void)
623{ 559{
624 t->sched_info.last_queued = 0; 560 unsigned long flags;
561 struct rq *rq;
562
563 rq = task_rq_lock(current, &flags);
564 rq->prev_clock_raw = sched_clock();
565 rq->clock_unstable_events++;
566 task_rq_unlock(rq, &flags);
625} 567}
626 568
627/* 569/*
628 * Called when a task finally hits the cpu. We can now calculate how 570 * resched_task - mark a task 'to be rescheduled now'.
629 * long it was waiting to run. We also note when it began so that we 571 *
630 * can keep stats on how long its timeslice is. 572 * On UP this means the setting of the need_resched flag, on SMP it
573 * might also involve a cross-CPU call to trigger the scheduler on
574 * the target CPU.
631 */ 575 */
632static void sched_info_arrive(struct task_struct *t) 576#ifdef CONFIG_SMP
577
578#ifndef tsk_is_polling
579#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
580#endif
581
582static void resched_task(struct task_struct *p)
633{ 583{
634 unsigned long now = jiffies, delta_jiffies = 0; 584 int cpu;
585
586 assert_spin_locked(&task_rq(p)->lock);
587
588 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
589 return;
590
591 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
635 592
636 if (t->sched_info.last_queued) 593 cpu = task_cpu(p);
637 delta_jiffies = now - t->sched_info.last_queued; 594 if (cpu == smp_processor_id())
638 sched_info_dequeued(t); 595 return;
639 t->sched_info.run_delay += delta_jiffies;
640 t->sched_info.last_arrival = now;
641 t->sched_info.pcnt++;
642 596
643 rq_sched_info_arrive(task_rq(t), delta_jiffies); 597 /* NEED_RESCHED must be visible before we test polling */
598 smp_mb();
599 if (!tsk_is_polling(p))
600 smp_send_reschedule(cpu);
644} 601}
645 602
646/* 603static void resched_cpu(int cpu)
647 * Called when a process is queued into either the active or expired 604{
648 * array. The time is noted and later used to determine how long we 605 struct rq *rq = cpu_rq(cpu);
649 * had to wait for us to reach the cpu. Since the expired queue will 606 unsigned long flags;
650 * become the active queue after active queue is empty, without dequeuing 607
651 * and requeuing any tasks, we are interested in queuing to either. It 608 if (!spin_trylock_irqsave(&rq->lock, flags))
652 * is unusual but not impossible for tasks to be dequeued and immediately 609 return;
653 * requeued in the same or another array: this can happen in sched_yield(), 610 resched_task(cpu_curr(cpu));
654 * set_user_nice(), and even load_balance() as it moves tasks from runqueue 611 spin_unlock_irqrestore(&rq->lock, flags);
655 * to runqueue. 612}
656 * 613#else
657 * This function is only called from enqueue_task(), but also only updates 614static inline void resched_task(struct task_struct *p)
658 * the timestamp if it is already not set. It's assumed that
659 * sched_info_dequeued() will clear that stamp when appropriate.
660 */
661static inline void sched_info_queued(struct task_struct *t)
662{ 615{
663 if (unlikely(sched_info_on())) 616 assert_spin_locked(&task_rq(p)->lock);
664 if (!t->sched_info.last_queued) 617 set_tsk_need_resched(p);
665 t->sched_info.last_queued = jiffies;
666} 618}
619#endif
667 620
668/* 621static u64 div64_likely32(u64 divident, unsigned long divisor)
669 * Called when a process ceases being the active-running process, either
670 * voluntarily or involuntarily. Now we can calculate how long we ran.
671 */
672static inline void sched_info_depart(struct task_struct *t)
673{ 622{
674 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; 623#if BITS_PER_LONG == 32
624 if (likely(divident <= 0xffffffffULL))
625 return (u32)divident / divisor;
626 do_div(divident, divisor);
675 627
676 t->sched_info.cpu_time += delta_jiffies; 628 return divident;
677 rq_sched_info_depart(task_rq(t), delta_jiffies); 629#else
630 return divident / divisor;
631#endif
678} 632}
679 633
680/* 634#if BITS_PER_LONG == 32
681 * Called when tasks are switched involuntarily due, typically, to expiring 635# define WMULT_CONST (~0UL)
682 * their time slice. (This may also be called when switching to or from 636#else
683 * the idle task.) We are only called when prev != next. 637# define WMULT_CONST (1UL << 32)
684 */ 638#endif
685static inline void 639
686__sched_info_switch(struct task_struct *prev, struct task_struct *next) 640#define WMULT_SHIFT 32
641
642static inline unsigned long
643calc_delta_mine(unsigned long delta_exec, unsigned long weight,
644 struct load_weight *lw)
687{ 645{
688 struct rq *rq = task_rq(prev); 646 u64 tmp;
689 647
648 if (unlikely(!lw->inv_weight))
649 lw->inv_weight = WMULT_CONST / lw->weight;
650
651 tmp = (u64)delta_exec * weight;
690 /* 652 /*
691 * prev now departs the cpu. It's not interesting to record 653 * Check whether we'd overflow the 64-bit multiplication:
692 * stats about how efficient we were at scheduling the idle
693 * process, however.
694 */ 654 */
695 if (prev != rq->idle) 655 if (unlikely(tmp > WMULT_CONST)) {
696 sched_info_depart(prev); 656 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
657 >> (WMULT_SHIFT/2);
658 } else {
659 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
660 }
697 661
698 if (next != rq->idle) 662 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
699 sched_info_arrive(next);
700}
701static inline void
702sched_info_switch(struct task_struct *prev, struct task_struct *next)
703{
704 if (unlikely(sched_info_on()))
705 __sched_info_switch(prev, next);
706} 663}
707#else
708#define sched_info_queued(t) do { } while (0)
709#define sched_info_switch(t, next) do { } while (0)
710#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
711 664
712/* 665static inline unsigned long
713 * Adding/removing a task to/from a priority array: 666calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
714 */
715static void dequeue_task(struct task_struct *p, struct prio_array *array)
716{ 667{
717 array->nr_active--; 668 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
718 list_del(&p->run_list);
719 if (list_empty(array->queue + p->prio))
720 __clear_bit(p->prio, array->bitmap);
721} 669}
722 670
723static void enqueue_task(struct task_struct *p, struct prio_array *array) 671static void update_load_add(struct load_weight *lw, unsigned long inc)
724{ 672{
725 sched_info_queued(p); 673 lw->weight += inc;
726 list_add_tail(&p->run_list, array->queue + p->prio); 674 lw->inv_weight = 0;
727 __set_bit(p->prio, array->bitmap);
728 array->nr_active++;
729 p->array = array;
730} 675}
731 676
732/* 677static void update_load_sub(struct load_weight *lw, unsigned long dec)
733 * Put task to the end of the run list without the overhead of dequeue
734 * followed by enqueue.
735 */
736static void requeue_task(struct task_struct *p, struct prio_array *array)
737{ 678{
738 list_move_tail(&p->run_list, array->queue + p->prio); 679 lw->weight -= dec;
680 lw->inv_weight = 0;
739} 681}
740 682
741static inline void 683static void __update_curr_load(struct rq *rq, struct load_stat *ls)
742enqueue_task_head(struct task_struct *p, struct prio_array *array)
743{ 684{
744 list_add(&p->run_list, array->queue + p->prio); 685 if (rq->curr != rq->idle && ls->load.weight) {
745 __set_bit(p->prio, array->bitmap); 686 ls->delta_exec += ls->delta_stat;
746 array->nr_active++; 687 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
747 p->array = array; 688 ls->delta_stat = 0;
689 }
748} 690}
749 691
750/* 692/*
751 * __normal_prio - return the priority that is based on the static 693 * Update delta_exec, delta_fair fields for rq.
752 * priority but is modified by bonuses/penalties.
753 * 694 *
754 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 695 * delta_fair clock advances at a rate inversely proportional to
755 * into the -5 ... 0 ... +5 bonus/penalty range. 696 * total load (rq->ls.load.weight) on the runqueue, while
697 * delta_exec advances at the same rate as wall-clock (provided
698 * cpu is not idle).
756 * 699 *
757 * We use 25% of the full 0...39 priority range so that: 700 * delta_exec / delta_fair is a measure of the (smoothened) load on this
701 * runqueue over any given interval. This (smoothened) load is used
702 * during load balance.
758 * 703 *
759 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 704 * This function is called /before/ updating rq->ls.load
760 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 705 * and when switching tasks.
761 *
762 * Both properties are important to certain workloads.
763 */ 706 */
764 707static void update_curr_load(struct rq *rq, u64 now)
765static inline int __normal_prio(struct task_struct *p)
766{ 708{
767 int bonus, prio; 709 struct load_stat *ls = &rq->ls;
768 710 u64 start;
769 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
770 711
771 prio = p->static_prio - bonus; 712 start = ls->load_update_start;
772 if (prio < MAX_RT_PRIO) 713 ls->load_update_start = now;
773 prio = MAX_RT_PRIO; 714 ls->delta_stat += now - start;
774 if (prio > MAX_PRIO-1) 715 /*
775 prio = MAX_PRIO-1; 716 * Stagger updates to ls->delta_fair. Very frequent updates
776 return prio; 717 * can be expensive.
718 */
719 if (ls->delta_stat >= sysctl_sched_stat_granularity)
720 __update_curr_load(rq, ls);
777} 721}
778 722
779/* 723/*
@@ -791,53 +735,155 @@ static inline int __normal_prio(struct task_struct *p)
791 * this code will need modification 735 * this code will need modification
792 */ 736 */
793#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE 737#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
794#define LOAD_WEIGHT(lp) \ 738#define load_weight(lp) \
795 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) 739 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
796#define PRIO_TO_LOAD_WEIGHT(prio) \ 740#define PRIO_TO_LOAD_WEIGHT(prio) \
797 LOAD_WEIGHT(static_prio_timeslice(prio)) 741 load_weight(static_prio_timeslice(prio))
798#define RTPRIO_TO_LOAD_WEIGHT(rp) \ 742#define RTPRIO_TO_LOAD_WEIGHT(rp) \
799 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) 743 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
800 744
801static void set_load_weight(struct task_struct *p) 745#define WEIGHT_IDLEPRIO 2
802{ 746#define WMULT_IDLEPRIO (1 << 31)
803 if (has_rt_policy(p)) { 747
804#ifdef CONFIG_SMP 748/*
805 if (p == task_rq(p)->migration_thread) 749 * Nice levels are multiplicative, with a gentle 10% change for every
806 /* 750 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
807 * The migration thread does the actual balancing. 751 * nice 1, it will get ~10% less CPU time than another CPU-bound task
808 * Giving its load any weight will skew balancing 752 * that remained on nice 0.
809 * adversely. 753 *
810 */ 754 * The "10% effect" is relative and cumulative: from _any_ nice level,
811 p->load_weight = 0; 755 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
812 else 756 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
813#endif 757 * If a task goes up by ~10% and another task goes down by ~10% then
814 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); 758 * the relative distance between them is ~25%.)
815 } else 759 */
816 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); 760static const int prio_to_weight[40] = {
817} 761/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
762/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
763/* 0 */ NICE_0_LOAD /* 1024 */,
764/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
765/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
766};
767
768/*
769 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
770 *
771 * In cases where the weight does not change often, we can use the
772 * precalculated inverse to speed up arithmetics by turning divisions
773 * into multiplications:
774 */
775static const u32 prio_to_wmult[40] = {
776/* -20 */ 48356, 60446, 75558, 94446, 118058,
777/* -15 */ 147573, 184467, 230589, 288233, 360285,
778/* -10 */ 450347, 562979, 703746, 879575, 1099582,
779/* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443,
780/* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518,
781/* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126,
782/* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717,
783/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
784};
818 785
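The two tables work together in calc_delta_mine(): prio_to_weight[] gives a task's share of the runqueue load (nice 0 is 1024, and each nice level scales the weight by roughly 1.25, hence the ~10% effect described above), while prio_to_wmult[] caches 2^32/weight so the division can be replaced by a multiply and a shift -- the nice-0 entry 4194304 is exactly 2^32/1024. A small userspace sketch of the arithmetic, with an invented 1 ms delta and a two-task runqueue:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t delta_exec = 1000000;		/* 1 ms of wall time, in ns */
	const uint64_t nice0 = 1024, nice1 = 819;	/* from prio_to_weight[] */
	uint64_t lw_weight = nice0 + nice1;		/* total runqueue load */
	uint32_t inv_weight = (uint32_t)((1ULL << 32) / lw_weight);

	/* delta * weight / total, once as a divide and once with the
	 * cached inverse, mirroring calc_delta_mine() */
	uint64_t by_div  = delta_exec * nice0 / lw_weight;
	uint64_t by_mult = (delta_exec * nice0 * inv_weight) >> 32;

	printf("%llu %llu\n", (unsigned long long)by_div,
	       (unsigned long long)by_mult);	/* both 555615 ns */
	return 0;
}

With a nice-0 and a nice-1 task runnable (weights 1024 and 819), the nice-0 task's fair share of a 1 ms interval comes out at roughly 0.56 ms, i.e. about 56% of the CPU versus 44% for its neighbour.
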
819static inline void 786static inline void
820inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) 787inc_load(struct rq *rq, const struct task_struct *p, u64 now)
821{ 788{
822 rq->raw_weighted_load += p->load_weight; 789 update_curr_load(rq, now);
790 update_load_add(&rq->ls.load, p->se.load.weight);
823} 791}
824 792
825static inline void 793static inline void
826dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) 794dec_load(struct rq *rq, const struct task_struct *p, u64 now)
827{ 795{
828 rq->raw_weighted_load -= p->load_weight; 796 update_curr_load(rq, now);
797 update_load_sub(&rq->ls.load, p->se.load.weight);
829} 798}
830 799
831static inline void inc_nr_running(struct task_struct *p, struct rq *rq) 800static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
832{ 801{
833 rq->nr_running++; 802 rq->nr_running++;
834 inc_raw_weighted_load(rq, p); 803 inc_load(rq, p, now);
835} 804}
836 805
837static inline void dec_nr_running(struct task_struct *p, struct rq *rq) 806static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
838{ 807{
839 rq->nr_running--; 808 rq->nr_running--;
840 dec_raw_weighted_load(rq, p); 809 dec_load(rq, p, now);
810}
811
812static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
813
814/*
815 * runqueue iterator, to support SMP load-balancing between different
816 * scheduling classes, without having to expose their internal data
817 * structures to the load-balancing proper:
818 */
819struct rq_iterator {
820 void *arg;
821 struct task_struct *(*start)(void *);
822 struct task_struct *(*next)(void *);
823};
824
825static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
826 unsigned long max_nr_move, unsigned long max_load_move,
827 struct sched_domain *sd, enum cpu_idle_type idle,
828 int *all_pinned, unsigned long *load_moved,
829 int this_best_prio, int best_prio, int best_prio_seen,
830 struct rq_iterator *iterator);
831
832#include "sched_stats.h"
833#include "sched_rt.c"
834#include "sched_fair.c"
835#include "sched_idletask.c"
836#ifdef CONFIG_SCHED_DEBUG
837# include "sched_debug.c"
838#endif
839
840#define sched_class_highest (&rt_sched_class)
841
842static void set_load_weight(struct task_struct *p)
843{
844 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
845 p->se.wait_runtime = 0;
846
847 if (task_has_rt_policy(p)) {
848 p->se.load.weight = prio_to_weight[0] * 2;
849 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
850 return;
851 }
852
853 /*
854 * SCHED_IDLE tasks get minimal weight:
855 */
856 if (p->policy == SCHED_IDLE) {
857 p->se.load.weight = WEIGHT_IDLEPRIO;
858 p->se.load.inv_weight = WMULT_IDLEPRIO;
859 return;
860 }
861
862 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
863 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
864}
865
866static void
867enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
868{
869 sched_info_queued(p);
870 p->sched_class->enqueue_task(rq, p, wakeup, now);
871 p->se.on_rq = 1;
872}
873
874static void
875dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
876{
877 p->sched_class->dequeue_task(rq, p, sleep, now);
878 p->se.on_rq = 0;
879}
880
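enqueue_task() and dequeue_task() (like check_preempt_curr() earlier in the patch) no longer touch a priority array directly; they dispatch through the task's scheduling class. As a rough sketch of the shape of that interface -- listing only the hooks visible in this patch, with forward declarations so the fragment stands alone; the real structure has more members:

struct rq;
struct task_struct;
typedef unsigned long long u64;

/* Per-class hook table the core scheduler dispatches through. Member
 * set and signatures are inferred from the calls above, so treat this
 * as an approximation rather than the final API. */
struct sched_class {
	void (*enqueue_task)(struct rq *rq, struct task_struct *p,
			     int wakeup, u64 now);
	void (*dequeue_task)(struct rq *rq, struct task_struct *p,
			     int sleep, u64 now);
	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p);

	/* ...plus pick-next/put-prev and the load-balancing hooks used
	 * via balance_tasks() and rq->load_balance_class elsewhere */
};
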
881/*
882 * __normal_prio - return the priority that is based on the static prio
883 */
884static inline int __normal_prio(struct task_struct *p)
885{
886 return p->static_prio;
841} 887}
842 888
843/* 889/*
@@ -851,7 +897,7 @@ static inline int normal_prio(struct task_struct *p)
851{ 897{
852 int prio; 898 int prio;
853 899
854 if (has_rt_policy(p)) 900 if (task_has_rt_policy(p))
855 prio = MAX_RT_PRIO-1 - p->rt_priority; 901 prio = MAX_RT_PRIO-1 - p->rt_priority;
856 else 902 else
857 prio = __normal_prio(p); 903 prio = __normal_prio(p);
@@ -879,222 +925,47 @@ static int effective_prio(struct task_struct *p)
879} 925}
880 926
881/* 927/*
882 * __activate_task - move a task to the runqueue. 928 * activate_task - move a task to the runqueue.
883 */ 929 */
884static void __activate_task(struct task_struct *p, struct rq *rq) 930static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
885{ 931{
886 struct prio_array *target = rq->active; 932 u64 now = rq_clock(rq);
887 933
888 if (batch_task(p)) 934 if (p->state == TASK_UNINTERRUPTIBLE)
889 target = rq->expired; 935 rq->nr_uninterruptible--;
890 enqueue_task(p, target);
891 inc_nr_running(p, rq);
892}
893
894/*
895 * __activate_idle_task - move idle task to the _front_ of runqueue.
896 */
897static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
898{
899 enqueue_task_head(p, rq->active);
900 inc_nr_running(p, rq);
901}
902
903/*
904 * Recalculate p->normal_prio and p->prio after having slept,
905 * updating the sleep-average too:
906 */
907static int recalc_task_prio(struct task_struct *p, unsigned long long now)
908{
909 /* Caller must always ensure 'now >= p->timestamp' */
910 unsigned long sleep_time = now - p->timestamp;
911
912 if (batch_task(p))
913 sleep_time = 0;
914
915 if (likely(sleep_time > 0)) {
916 /*
917 * This ceiling is set to the lowest priority that would allow
918 * a task to be reinserted into the active array on timeslice
919 * completion.
920 */
921 unsigned long ceiling = INTERACTIVE_SLEEP(p);
922
923 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
924 /*
925 * Prevents user tasks from achieving best priority
926 * with one single large enough sleep.
927 */
928 p->sleep_avg = ceiling;
929 /*
930 * Using INTERACTIVE_SLEEP() as a ceiling places a
931 * nice(0) task 1ms sleep away from promotion, and
932 * gives it 700ms to round-robin with no chance of
933 * being demoted. This is more than generous, so
934 * mark this sleep as non-interactive to prevent the
935 * on-runqueue bonus logic from intervening should
936 * this task not receive cpu immediately.
937 */
938 p->sleep_type = SLEEP_NONINTERACTIVE;
939 } else {
940 /*
941 * Tasks waking from uninterruptible sleep are
942 * limited in their sleep_avg rise as they
943 * are likely to be waiting on I/O
944 */
945 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
946 if (p->sleep_avg >= ceiling)
947 sleep_time = 0;
948 else if (p->sleep_avg + sleep_time >=
949 ceiling) {
950 p->sleep_avg = ceiling;
951 sleep_time = 0;
952 }
953 }
954
955 /*
956 * This code gives a bonus to interactive tasks.
957 *
958 * The boost works by updating the 'average sleep time'
959 * value here, based on ->timestamp. The more time a
960 * task spends sleeping, the higher the average gets -
961 * and the higher the priority boost gets as well.
962 */
963 p->sleep_avg += sleep_time;
964
965 }
966 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
967 p->sleep_avg = NS_MAX_SLEEP_AVG;
968 }
969 936
970 return effective_prio(p); 937 enqueue_task(rq, p, wakeup, now);
938 inc_nr_running(p, rq, now);
971} 939}
972 940
973/* 941/*
974 * activate_task - move a task to the runqueue and do priority recalculation 942 * activate_idle_task - move idle task to the _front_ of runqueue.
975 *
976 * Update all the scheduling statistics stuff. (sleep average
977 * calculation, priority modifiers, etc.)
978 */ 943 */
979static void activate_task(struct task_struct *p, struct rq *rq, int local) 944static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
980{ 945{
981 unsigned long long now; 946 u64 now = rq_clock(rq);
982
983 if (rt_task(p))
984 goto out;
985 947
986 now = sched_clock(); 948 if (p->state == TASK_UNINTERRUPTIBLE)
987#ifdef CONFIG_SMP 949 rq->nr_uninterruptible--;
988 if (!local) {
989 /* Compensate for drifting sched_clock */
990 struct rq *this_rq = this_rq();
991 now = (now - this_rq->most_recent_timestamp)
992 + rq->most_recent_timestamp;
993 }
994#endif
995
996 /*
997 * Sleep time is in units of nanosecs, so shift by 20 to get a
998 * milliseconds-range estimation of the amount of time that the task
999 * spent sleeping:
1000 */
1001 if (unlikely(prof_on == SLEEP_PROFILING)) {
1002 if (p->state == TASK_UNINTERRUPTIBLE)
1003 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
1004 (now - p->timestamp) >> 20);
1005 }
1006
1007 p->prio = recalc_task_prio(p, now);
1008 950
1009 /* 951 enqueue_task(rq, p, 0, now);
1010 * This checks to make sure it's not an uninterruptible task 952 inc_nr_running(p, rq, now);
1011 * that is now waking up.
1012 */
1013 if (p->sleep_type == SLEEP_NORMAL) {
1014 /*
1015 * Tasks which were woken up by interrupts (ie. hw events)
1016 * are most likely of interactive nature. So we give them
1017 * the credit of extending their sleep time to the period
1018 * of time they spend on the runqueue, waiting for execution
1019 * on a CPU, first time around:
1020 */
1021 if (in_interrupt())
1022 p->sleep_type = SLEEP_INTERRUPTED;
1023 else {
1024 /*
1025 * Normal first-time wakeups get a credit too for
1026 * on-runqueue time, but it will be weighted down:
1027 */
1028 p->sleep_type = SLEEP_INTERACTIVE;
1029 }
1030 }
1031 p->timestamp = now;
1032out:
1033 __activate_task(p, rq);
1034} 953}
1035 954
1036/* 955/*
1037 * deactivate_task - remove a task from the runqueue. 956 * deactivate_task - remove a task from the runqueue.
1038 */ 957 */
1039static void deactivate_task(struct task_struct *p, struct rq *rq) 958static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1040{ 959{
1041 dec_nr_running(p, rq); 960 u64 now = rq_clock(rq);
1042 dequeue_task(p, p->array);
1043 p->array = NULL;
1044}
1045 961
1046/* 962 if (p->state == TASK_UNINTERRUPTIBLE)
1047 * resched_task - mark a task 'to be rescheduled now'. 963 rq->nr_uninterruptible++;
1048 *
1049 * On UP this means the setting of the need_resched flag, on SMP it
1050 * might also involve a cross-CPU call to trigger the scheduler on
1051 * the target CPU.
1052 */
1053#ifdef CONFIG_SMP
1054 964
1055#ifndef tsk_is_polling 965 dequeue_task(rq, p, sleep, now);
1056#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 966 dec_nr_running(p, rq, now);
1057#endif
1058
1059static void resched_task(struct task_struct *p)
1060{
1061 int cpu;
1062
1063 assert_spin_locked(&task_rq(p)->lock);
1064
1065 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1066 return;
1067
1068 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1069
1070 cpu = task_cpu(p);
1071 if (cpu == smp_processor_id())
1072 return;
1073
1074 /* NEED_RESCHED must be visible before we test polling */
1075 smp_mb();
1076 if (!tsk_is_polling(p))
1077 smp_send_reschedule(cpu);
1078} 967}
1079 968
1080static void resched_cpu(int cpu)
1081{
1082 struct rq *rq = cpu_rq(cpu);
1083 unsigned long flags;
1084
1085 if (!spin_trylock_irqsave(&rq->lock, flags))
1086 return;
1087 resched_task(cpu_curr(cpu));
1088 spin_unlock_irqrestore(&rq->lock, flags);
1089}
1090#else
1091static inline void resched_task(struct task_struct *p)
1092{
1093 assert_spin_locked(&task_rq(p)->lock);
1094 set_tsk_need_resched(p);
1095}
1096#endif
1097
1098/** 969/**
1099 * task_curr - is this task currently executing on a CPU? 970 * task_curr - is this task currently executing on a CPU?
1100 * @p: the task in question. 971 * @p: the task in question.
@@ -1107,10 +978,42 @@ inline int task_curr(const struct task_struct *p)
1107/* Used instead of source_load when we know the type == 0 */ 978/* Used instead of source_load when we know the type == 0 */
1108unsigned long weighted_cpuload(const int cpu) 979unsigned long weighted_cpuload(const int cpu)
1109{ 980{
1110 return cpu_rq(cpu)->raw_weighted_load; 981 return cpu_rq(cpu)->ls.load.weight;
1111} 982}
1112 983
984static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
985{
1113#ifdef CONFIG_SMP 986#ifdef CONFIG_SMP
987 task_thread_info(p)->cpu = cpu;
988 set_task_cfs_rq(p);
989#endif
990}
991
992#ifdef CONFIG_SMP
993
994void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
995{
996 int old_cpu = task_cpu(p);
997 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
998 u64 clock_offset, fair_clock_offset;
999
1000 clock_offset = old_rq->clock - new_rq->clock;
1001 fair_clock_offset = old_rq->cfs.fair_clock -
1002 new_rq->cfs.fair_clock;
1003 if (p->se.wait_start)
1004 p->se.wait_start -= clock_offset;
1005 if (p->se.wait_start_fair)
1006 p->se.wait_start_fair -= fair_clock_offset;
1007 if (p->se.sleep_start)
1008 p->se.sleep_start -= clock_offset;
1009 if (p->se.block_start)
1010 p->se.block_start -= clock_offset;
1011 if (p->se.sleep_start_fair)
1012 p->se.sleep_start_fair -= fair_clock_offset;
1013
1014 __set_task_cpu(p, new_cpu);
1015}
1016
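Because every runqueue now carries its own clock, set_task_cpu() rebases the task's saved timestamps by the difference between the two clocks, so "how long ago" stays meaningful after a migration. A quick illustration with invented numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* illustrative values only: old CPU's clock is ahead of the new one */
	uint64_t old_clock = 1000000, new_clock = 400000;	/* ns */
	uint64_t wait_start = 900000;		/* stamped on the old clock */
	uint64_t offset = old_clock - new_clock;

	wait_start -= offset;	/* 300000: still "100000 ns ago" on either clock */
	printf("%llu\n", (unsigned long long)wait_start);
	return 0;
}
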
1114struct migration_req { 1017struct migration_req {
1115 struct list_head list; 1018 struct list_head list;
1116 1019
@@ -1133,7 +1036,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1133 * If the task is not on a runqueue (and not running), then 1036 * If the task is not on a runqueue (and not running), then
1134 * it is sufficient to simply update the task's cpu field. 1037 * it is sufficient to simply update the task's cpu field.
1135 */ 1038 */
1136 if (!p->array && !task_running(rq, p)) { 1039 if (!p->se.on_rq && !task_running(rq, p)) {
1137 set_task_cpu(p, dest_cpu); 1040 set_task_cpu(p, dest_cpu);
1138 return 0; 1041 return 0;
1139 } 1042 }
@@ -1158,9 +1061,8 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1158void wait_task_inactive(struct task_struct *p) 1061void wait_task_inactive(struct task_struct *p)
1159{ 1062{
1160 unsigned long flags; 1063 unsigned long flags;
1064 int running, on_rq;
1161 struct rq *rq; 1065 struct rq *rq;
1162 struct prio_array *array;
1163 int running;
1164 1066
1165repeat: 1067repeat:
1166 /* 1068 /*
@@ -1192,7 +1094,7 @@ repeat:
1192 */ 1094 */
1193 rq = task_rq_lock(p, &flags); 1095 rq = task_rq_lock(p, &flags);
1194 running = task_running(rq, p); 1096 running = task_running(rq, p);
1195 array = p->array; 1097 on_rq = p->se.on_rq;
1196 task_rq_unlock(rq, &flags); 1098 task_rq_unlock(rq, &flags);
1197 1099
1198 /* 1100 /*
@@ -1215,7 +1117,7 @@ repeat:
1215 * running right now), it's preempted, and we should 1117 * running right now), it's preempted, and we should
1216 * yield - it could be a while. 1118 * yield - it could be a while.
1217 */ 1119 */
1218 if (unlikely(array)) { 1120 if (unlikely(on_rq)) {
1219 yield(); 1121 yield();
1220 goto repeat; 1122 goto repeat;
1221 } 1123 }
@@ -1261,11 +1163,12 @@ void kick_process(struct task_struct *p)
1261static inline unsigned long source_load(int cpu, int type) 1163static inline unsigned long source_load(int cpu, int type)
1262{ 1164{
1263 struct rq *rq = cpu_rq(cpu); 1165 struct rq *rq = cpu_rq(cpu);
1166 unsigned long total = weighted_cpuload(cpu);
1264 1167
1265 if (type == 0) 1168 if (type == 0)
1266 return rq->raw_weighted_load; 1169 return total;
1267 1170
1268 return min(rq->cpu_load[type-1], rq->raw_weighted_load); 1171 return min(rq->cpu_load[type-1], total);
1269} 1172}
1270 1173
1271/* 1174/*
@@ -1275,11 +1178,12 @@ static inline unsigned long source_load(int cpu, int type)
1275static inline unsigned long target_load(int cpu, int type) 1178static inline unsigned long target_load(int cpu, int type)
1276{ 1179{
1277 struct rq *rq = cpu_rq(cpu); 1180 struct rq *rq = cpu_rq(cpu);
1181 unsigned long total = weighted_cpuload(cpu);
1278 1182
1279 if (type == 0) 1183 if (type == 0)
1280 return rq->raw_weighted_load; 1184 return total;
1281 1185
1282 return max(rq->cpu_load[type-1], rq->raw_weighted_load); 1186 return max(rq->cpu_load[type-1], total);
1283} 1187}
1284 1188
1285/* 1189/*
@@ -1288,9 +1192,10 @@ static inline unsigned long target_load(int cpu, int type)
1288static inline unsigned long cpu_avg_load_per_task(int cpu) 1192static inline unsigned long cpu_avg_load_per_task(int cpu)
1289{ 1193{
1290 struct rq *rq = cpu_rq(cpu); 1194 struct rq *rq = cpu_rq(cpu);
1195 unsigned long total = weighted_cpuload(cpu);
1291 unsigned long n = rq->nr_running; 1196 unsigned long n = rq->nr_running;
1292 1197
1293 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; 1198 return n ? total / n : SCHED_LOAD_SCALE;
1294} 1199}
1295 1200
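Editor's note: source_load()/target_load() now combine the decayed cpu_load[] history with the instantaneous weighted_cpuload(). The asymmetry (min for the source, max for the target) deliberately under- and over-estimates so marginal imbalances do not cause task ping-pong. A trivial sketch of just that asymmetry, with illustrative parameter names:

    /* Conservative load estimates: underestimate the CPU we might pull from,
     * overestimate the CPU we might push to. */
    static unsigned long source_load_sketch(unsigned long cpu_load_hist,
                                            unsigned long weighted_now)
    {
            return cpu_load_hist < weighted_now ? cpu_load_hist : weighted_now;
    }

    static unsigned long target_load_sketch(unsigned long cpu_load_hist,
                                            unsigned long weighted_now)
    {
            return cpu_load_hist > weighted_now ? cpu_load_hist : weighted_now;
    }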
1296/* 1201/*
@@ -1392,9 +1297,9 @@ static int sched_balance_self(int cpu, int flag)
1392 struct sched_domain *tmp, *sd = NULL; 1297 struct sched_domain *tmp, *sd = NULL;
1393 1298
1394 for_each_domain(cpu, tmp) { 1299 for_each_domain(cpu, tmp) {
1395 /* 1300 /*
1396 * If power savings logic is enabled for a domain, stop there. 1301 * If power savings logic is enabled for a domain, stop there.
1397 */ 1302 */
1398 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1303 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1399 break; 1304 break;
1400 if (tmp->flags & flag) 1305 if (tmp->flags & flag)
@@ -1477,9 +1382,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1477 if (idle_cpu(i)) 1382 if (idle_cpu(i))
1478 return i; 1383 return i;
1479 } 1384 }
1480 } 1385 } else {
1481 else
1482 break; 1386 break;
1387 }
1483 } 1388 }
1484 return cpu; 1389 return cpu;
1485} 1390}
@@ -1521,7 +1426,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1521 if (!(old_state & state)) 1426 if (!(old_state & state))
1522 goto out; 1427 goto out;
1523 1428
1524 if (p->array) 1429 if (p->se.on_rq)
1525 goto out_running; 1430 goto out_running;
1526 1431
1527 cpu = task_cpu(p); 1432 cpu = task_cpu(p);
@@ -1576,11 +1481,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1576 * of the current CPU: 1481 * of the current CPU:
1577 */ 1482 */
1578 if (sync) 1483 if (sync)
1579 tl -= current->load_weight; 1484 tl -= current->se.load.weight;
1580 1485
1581 if ((tl <= load && 1486 if ((tl <= load &&
1582 tl + target_load(cpu, idx) <= tl_per_task) || 1487 tl + target_load(cpu, idx) <= tl_per_task) ||
1583 100*(tl + p->load_weight) <= imbalance*load) { 1488 100*(tl + p->se.load.weight) <= imbalance*load) {
1584 /* 1489 /*
1585 * This domain has SD_WAKE_AFFINE and 1490 * This domain has SD_WAKE_AFFINE and
1586 * p is cache cold in this domain, and 1491 * p is cache cold in this domain, and
@@ -1614,7 +1519,7 @@ out_set_cpu:
1614 old_state = p->state; 1519 old_state = p->state;
1615 if (!(old_state & state)) 1520 if (!(old_state & state))
1616 goto out; 1521 goto out;
1617 if (p->array) 1522 if (p->se.on_rq)
1618 goto out_running; 1523 goto out_running;
1619 1524
1620 this_cpu = smp_processor_id(); 1525 this_cpu = smp_processor_id();
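Editor's note: the wake-affine test above now reads the weights from p->se.load.weight. The sketch below is a deliberately simplified, userspace restatement of one part of it: a sync waker's own weight is discounted because it is about to sleep. The condition is reduced to the percentage check and every name is illustrative, not the kernel's.

    static int affine_wakeup_ok(unsigned long this_load,    /* load of the waking CPU    */
                                unsigned long prev_load,    /* load of the task's old CPU */
                                unsigned long waker_weight,
                                unsigned long wakee_weight,
                                unsigned int imbalance_pct, /* e.g. 125 == 25% slack     */
                                int sync)
    {
            unsigned long tl = this_load;

            /* A sync waker is about to sleep, so discount its own weight. */
            if (sync && tl >= waker_weight)
                    tl -= waker_weight;

            /* Pull the wakee here unless this CPU would be clearly busier. */
            return 100 * (tl + wakee_weight) <= imbalance_pct * prev_load;
    }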
@@ -1623,25 +1528,7 @@ out_set_cpu:
1623 1528
1624out_activate: 1529out_activate:
1625#endif /* CONFIG_SMP */ 1530#endif /* CONFIG_SMP */
1626 if (old_state == TASK_UNINTERRUPTIBLE) { 1531 activate_task(rq, p, 1);
1627 rq->nr_uninterruptible--;
1628 /*
1629 * Tasks on involuntary sleep don't earn
1630 * sleep_avg beyond just interactive state.
1631 */
1632 p->sleep_type = SLEEP_NONINTERACTIVE;
1633 } else
1634
1635 /*
1636 * Tasks that have marked their sleep as noninteractive get
1637 * woken up with their sleep average not weighted in an
1638 * interactive way.
1639 */
1640 if (old_state & TASK_NONINTERACTIVE)
1641 p->sleep_type = SLEEP_NONINTERACTIVE;
1642
1643
1644 activate_task(p, rq, cpu == this_cpu);
1645 /* 1532 /*
1646 * Sync wakeups (i.e. those types of wakeups where the waker 1533 * Sync wakeups (i.e. those types of wakeups where the waker
1647 * has indicated that it will leave the CPU in short order) 1534 * has indicated that it will leave the CPU in short order)
@@ -1650,10 +1537,8 @@ out_activate:
1650 * the waker guarantees that the freshly woken up task is going 1537 * the waker guarantees that the freshly woken up task is going
1651 * to be considered on this CPU.) 1538 * to be considered on this CPU.)
1652 */ 1539 */
1653 if (!sync || cpu != this_cpu) { 1540 if (!sync || cpu != this_cpu)
1654 if (TASK_PREEMPTS_CURR(p, rq)) 1541 check_preempt_curr(rq, p);
1655 resched_task(rq->curr);
1656 }
1657 success = 1; 1542 success = 1;
1658 1543
1659out_running: 1544out_running:
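Editor's note: the wakeup path above no longer open-codes TASK_PREEMPTS_CURR()/resched_task(); it activates the task and lets check_preempt_curr() decide. A minimal ops-table sketch of that delegation, with invented type names:

    struct rq_sk;
    struct task_sk;

    struct sched_class_sk {
            void (*check_preempt_curr)(struct rq_sk *rq, struct task_sk *wakee);
    };

    struct task_sk { const struct sched_class_sk *sched_class; };
    struct rq_sk   { struct task_sk *curr; };

    /* The class of the currently running task decides whether the freshly
     * woken task should preempt it. */
    static void check_preempt_curr_sketch(struct rq_sk *rq, struct task_sk *wakee)
    {
            rq->curr->sched_class->check_preempt_curr(rq, wakee);
    }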
@@ -1676,19 +1561,36 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1676 return try_to_wake_up(p, state, 0); 1561 return try_to_wake_up(p, state, 0);
1677} 1562}
1678 1563
1679static void task_running_tick(struct rq *rq, struct task_struct *p);
1680/* 1564/*
1681 * Perform scheduler related setup for a newly forked process p. 1565 * Perform scheduler related setup for a newly forked process p.
1682 * p is forked by current. 1566 * p is forked by current.
1683 */ 1567 *
1684void fastcall sched_fork(struct task_struct *p, int clone_flags) 1568 * __sched_fork() is basic setup used by init_idle() too:
1685{ 1569 */
1686 int cpu = get_cpu(); 1570static void __sched_fork(struct task_struct *p)
1571{
1572 p->se.wait_start_fair = 0;
1573 p->se.wait_start = 0;
1574 p->se.exec_start = 0;
1575 p->se.sum_exec_runtime = 0;
1576 p->se.delta_exec = 0;
1577 p->se.delta_fair_run = 0;
1578 p->se.delta_fair_sleep = 0;
1579 p->se.wait_runtime = 0;
1580 p->se.sum_wait_runtime = 0;
1581 p->se.sum_sleep_runtime = 0;
1582 p->se.sleep_start = 0;
1583 p->se.sleep_start_fair = 0;
1584 p->se.block_start = 0;
1585 p->se.sleep_max = 0;
1586 p->se.block_max = 0;
1587 p->se.exec_max = 0;
1588 p->se.wait_max = 0;
1589 p->se.wait_runtime_overruns = 0;
1590 p->se.wait_runtime_underruns = 0;
1687 1591
1688#ifdef CONFIG_SMP 1592 INIT_LIST_HEAD(&p->run_list);
1689 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1593 p->se.on_rq = 0;
1690#endif
1691 set_task_cpu(p, cpu);
1692 1594
1693 /* 1595 /*
1694 * We mark the process as running here, but have not actually 1596 * We mark the process as running here, but have not actually
@@ -1697,16 +1599,29 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1697 * event cannot wake it up and insert it on the runqueue either. 1599 * event cannot wake it up and insert it on the runqueue either.
1698 */ 1600 */
1699 p->state = TASK_RUNNING; 1601 p->state = TASK_RUNNING;
1602}
1603
1604/*
1605 * fork()/clone()-time setup:
1606 */
1607void sched_fork(struct task_struct *p, int clone_flags)
1608{
1609 int cpu = get_cpu();
1610
1611 __sched_fork(p);
1612
1613#ifdef CONFIG_SMP
1614 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1615#endif
1616 __set_task_cpu(p, cpu);
1700 1617
1701 /* 1618 /*
1702 * Make sure we do not leak PI boosting priority to the child: 1619 * Make sure we do not leak PI boosting priority to the child:
1703 */ 1620 */
1704 p->prio = current->normal_prio; 1621 p->prio = current->normal_prio;
1705 1622
1706 INIT_LIST_HEAD(&p->run_list);
1707 p->array = NULL;
1708#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1623#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1709 if (unlikely(sched_info_on())) 1624 if (likely(sched_info_on()))
1710 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1625 memset(&p->sched_info, 0, sizeof(p->sched_info));
1711#endif 1626#endif
1712#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1627#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -1716,34 +1631,16 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1716 /* Want to start with kernel preemption disabled. */ 1631 /* Want to start with kernel preemption disabled. */
1717 task_thread_info(p)->preempt_count = 1; 1632 task_thread_info(p)->preempt_count = 1;
1718#endif 1633#endif
1719 /*
1720 * Share the timeslice between parent and child, thus the
1721 * total amount of pending timeslices in the system doesn't change,
1722 * resulting in more scheduling fairness.
1723 */
1724 local_irq_disable();
1725 p->time_slice = (current->time_slice + 1) >> 1;
1726 /*
1727 * The remainder of the first timeslice might be recovered by
1728 * the parent if the child exits early enough.
1729 */
1730 p->first_time_slice = 1;
1731 current->time_slice >>= 1;
1732 p->timestamp = sched_clock();
1733 if (unlikely(!current->time_slice)) {
1734 /*
1735 * This case is rare, it happens when the parent has only
1736 * a single jiffy left from its timeslice. Taking the
1737 * runqueue lock is not a problem.
1738 */
1739 current->time_slice = 1;
1740 task_running_tick(cpu_rq(cpu), current);
1741 }
1742 local_irq_enable();
1743 put_cpu(); 1634 put_cpu();
1744} 1635}
1745 1636
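Editor's note: fork-time setup is now split in two: __sched_fork() zeroes the per-entity accounting (and is reused by init_idle()), while sched_fork() keeps the fork-only work such as CPU placement and not inheriting a PI-boosted priority. A compressed sketch of the split, with made-up field names:

    #include <string.h>

    struct se_stats { unsigned long long wait_start, exec_start, sum_exec_runtime; };
    struct task_sk2 { struct se_stats se; int on_rq; int prio, normal_prio; };

    static void __sched_fork_sketch(struct task_sk2 *p)
    {
            memset(&p->se, 0, sizeof(p->se));   /* all accounting starts from zero */
            p->on_rq = 0;                       /* not on any runqueue yet */
    }

    static void sched_fork_sketch(struct task_sk2 *p, const struct task_sk2 *parent)
    {
            __sched_fork_sketch(p);
            p->prio = parent->normal_prio;      /* don't leak PI boosting to the child */
    }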
1746/* 1637/*
1638 * After fork, child runs first. (default) If set to 0 then
1639 * parent will (try to) run first.
1640 */
1641unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1642
1643/*
1747 * wake_up_new_task - wake up a newly created task for the first time. 1644 * wake_up_new_task - wake up a newly created task for the first time.
1748 * 1645 *
1749 * This function will do some initial scheduler statistics housekeeping 1646 * This function will do some initial scheduler statistics housekeeping
@@ -1752,107 +1649,27 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1752 */ 1649 */
1753void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1650void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1754{ 1651{
1755 struct rq *rq, *this_rq;
1756 unsigned long flags; 1652 unsigned long flags;
1757 int this_cpu, cpu; 1653 struct rq *rq;
1654 int this_cpu;
1758 1655
1759 rq = task_rq_lock(p, &flags); 1656 rq = task_rq_lock(p, &flags);
1760 BUG_ON(p->state != TASK_RUNNING); 1657 BUG_ON(p->state != TASK_RUNNING);
1761 this_cpu = smp_processor_id(); 1658 this_cpu = smp_processor_id(); /* parent's CPU */
1762 cpu = task_cpu(p);
1763
1764 /*
1765 * We decrease the sleep average of forking parents
1766 * and children as well, to keep max-interactive tasks
1767 * from forking tasks that are max-interactive. The parent
1768 * (current) is done further down, under its lock.
1769 */
1770 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1771 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1772 1659
1773 p->prio = effective_prio(p); 1660 p->prio = effective_prio(p);
1774 1661
1775 if (likely(cpu == this_cpu)) { 1662 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1776 if (!(clone_flags & CLONE_VM)) { 1663 task_cpu(p) != this_cpu || !current->se.on_rq) {
1777 /* 1664 activate_task(rq, p, 0);
1778 * The VM isn't cloned, so we're in a good position to
1779 * do child-runs-first in anticipation of an exec. This
1780 * usually avoids a lot of COW overhead.
1781 */
1782 if (unlikely(!current->array))
1783 __activate_task(p, rq);
1784 else {
1785 p->prio = current->prio;
1786 p->normal_prio = current->normal_prio;
1787 list_add_tail(&p->run_list, &current->run_list);
1788 p->array = current->array;
1789 p->array->nr_active++;
1790 inc_nr_running(p, rq);
1791 }
1792 set_need_resched();
1793 } else
1794 /* Run child last */
1795 __activate_task(p, rq);
1796 /*
1797 * We skip the following code due to cpu == this_cpu
1798 *
1799 * task_rq_unlock(rq, &flags);
1800 * this_rq = task_rq_lock(current, &flags);
1801 */
1802 this_rq = rq;
1803 } else { 1665 } else {
1804 this_rq = cpu_rq(this_cpu);
1805
1806 /*
1807 * Not the local CPU - must adjust timestamp. This should
1808 * get optimised away in the !CONFIG_SMP case.
1809 */
1810 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
1811 + rq->most_recent_timestamp;
1812 __activate_task(p, rq);
1813 if (TASK_PREEMPTS_CURR(p, rq))
1814 resched_task(rq->curr);
1815
1816 /* 1666 /*
1817 * Parent and child are on different CPUs, now get the 1667 * Let the scheduling class do new task startup
1818 * parent runqueue to update the parent's ->sleep_avg: 1668 * management (if any):
1819 */ 1669 */
1820 task_rq_unlock(rq, &flags); 1670 p->sched_class->task_new(rq, p);
1821 this_rq = task_rq_lock(current, &flags);
1822 }
1823 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1824 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1825 task_rq_unlock(this_rq, &flags);
1826}
1827
1828/*
1829 * Potentially available exiting-child timeslices are
1830 * retrieved here - this way the parent does not get
1831 * penalized for creating too many threads.
1832 *
1833 * (this cannot be used to 'generate' timeslices
1834 * artificially, because any timeslice recovered here
1835 * was given away by the parent in the first place.)
1836 */
1837void fastcall sched_exit(struct task_struct *p)
1838{
1839 unsigned long flags;
1840 struct rq *rq;
1841
1842 /*
1843 * If the child was a (relative-) CPU hog then decrease
1844 * the sleep_avg of the parent as well.
1845 */
1846 rq = task_rq_lock(p->parent, &flags);
1847 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1848 p->parent->time_slice += p->time_slice;
1849 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1850 p->parent->time_slice = task_timeslice(p);
1851 } 1671 }
1852 if (p->sleep_avg < p->parent->sleep_avg) 1672 check_preempt_curr(rq, p);
1853 p->parent->sleep_avg = p->parent->sleep_avg /
1854 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1855 (EXIT_WEIGHT + 1);
1856 task_rq_unlock(rq, &flags); 1673 task_rq_unlock(rq, &flags);
1857} 1674}
1858 1675
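Editor's note: wake_up_new_task() now collapses the old child-runs-first machinery into a single test. The class hook (task_new) is used only when child-runs-first is both requested and safe; otherwise the child is activated normally. A sketch of just that decision, with illustrative parameter names:

    /* Returns 1 when the scheduling class should place the child itself
     * (typically so the child runs before the parent), 0 for a plain enqueue. */
    static int use_task_new_hook(int child_runs_first,   /* sysctl_sched_child_runs_first */
                                 int clone_vm,           /* clone_flags & CLONE_VM        */
                                 int same_cpu,           /* child on the parent's CPU     */
                                 int parent_on_rq)       /* current->se.on_rq             */
    {
            if (!child_runs_first || clone_vm || !same_cpu || !parent_on_rq)
                    return 0;            /* activate_task(): child queued normally */

            return 1;                    /* p->sched_class->task_new() takes over  */
    }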
@@ -1917,7 +1734,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1917 /* 1734 /*
1918 * Remove function-return probe instances associated with this 1735 * Remove function-return probe instances associated with this
1919 * task and put them back on the free list. 1736 * task and put them back on the free list.
1920 */ 1737 */
1921 kprobe_flush_task(prev); 1738 kprobe_flush_task(prev);
1922 put_task_struct(prev); 1739 put_task_struct(prev);
1923 } 1740 }
@@ -1945,13 +1762,15 @@ asmlinkage void schedule_tail(struct task_struct *prev)
1945 * context_switch - switch to the new MM and the new 1762 * context_switch - switch to the new MM and the new
1946 * thread's register state. 1763 * thread's register state.
1947 */ 1764 */
1948static inline struct task_struct * 1765static inline void
1949context_switch(struct rq *rq, struct task_struct *prev, 1766context_switch(struct rq *rq, struct task_struct *prev,
1950 struct task_struct *next) 1767 struct task_struct *next)
1951{ 1768{
1952 struct mm_struct *mm = next->mm; 1769 struct mm_struct *mm, *oldmm;
1953 struct mm_struct *oldmm = prev->active_mm;
1954 1770
1771 prepare_task_switch(rq, next);
1772 mm = next->mm;
1773 oldmm = prev->active_mm;
1955 /* 1774 /*
1956 * For paravirt, this is coupled with an exit in switch_to to 1775 * For paravirt, this is coupled with an exit in switch_to to
1957 * combine the page table reload and the switch backend into 1776 * combine the page table reload and the switch backend into
@@ -1959,16 +1778,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
1959 */ 1778 */
1960 arch_enter_lazy_cpu_mode(); 1779 arch_enter_lazy_cpu_mode();
1961 1780
1962 if (!mm) { 1781 if (unlikely(!mm)) {
1963 next->active_mm = oldmm; 1782 next->active_mm = oldmm;
1964 atomic_inc(&oldmm->mm_count); 1783 atomic_inc(&oldmm->mm_count);
1965 enter_lazy_tlb(oldmm, next); 1784 enter_lazy_tlb(oldmm, next);
1966 } else 1785 } else
1967 switch_mm(oldmm, mm, next); 1786 switch_mm(oldmm, mm, next);
1968 1787
1969 if (!prev->mm) { 1788 if (unlikely(!prev->mm)) {
1970 prev->active_mm = NULL; 1789 prev->active_mm = NULL;
1971 WARN_ON(rq->prev_mm);
1972 rq->prev_mm = oldmm; 1790 rq->prev_mm = oldmm;
1973 } 1791 }
1974 /* 1792 /*
@@ -1984,7 +1802,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1984 /* Here we just switch the register state and the stack. */ 1802 /* Here we just switch the register state and the stack. */
1985 switch_to(prev, next, prev); 1803 switch_to(prev, next, prev);
1986 1804
1987 return prev; 1805 barrier();
1806 /*
1807 * this_rq must be evaluated again because prev may have moved
1808 * CPUs since it called schedule(), thus the 'rq' on its stack
1809 * frame will be invalid.
1810 */
1811 finish_task_switch(this_rq(), prev);
1988} 1812}
1989 1813
1990/* 1814/*
@@ -2057,17 +1881,65 @@ unsigned long nr_active(void)
2057 return running + uninterruptible; 1881 return running + uninterruptible;
2058} 1882}
2059 1883
2060#ifdef CONFIG_SMP
2061
2062/* 1884/*
2063 * Is this task likely cache-hot: 1885 * Update rq->cpu_load[] statistics. This function is usually called every
1886 * scheduler tick (TICK_NSEC).
2064 */ 1887 */
2065static inline int 1888static void update_cpu_load(struct rq *this_rq)
2066task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
2067{ 1889{
2068 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; 1890 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1891 unsigned long total_load = this_rq->ls.load.weight;
1892 unsigned long this_load = total_load;
1893 struct load_stat *ls = &this_rq->ls;
1894 u64 now = __rq_clock(this_rq);
1895 int i, scale;
1896
1897 this_rq->nr_load_updates++;
1898 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1899 goto do_avg;
1900
1901 /* Update delta_fair/delta_exec fields first */
1902 update_curr_load(this_rq, now);
1903
1904 fair_delta64 = ls->delta_fair + 1;
1905 ls->delta_fair = 0;
1906
1907 exec_delta64 = ls->delta_exec + 1;
1908 ls->delta_exec = 0;
1909
1910 sample_interval64 = now - ls->load_update_last;
1911 ls->load_update_last = now;
1912
1913 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1914 sample_interval64 = TICK_NSEC;
1915
1916 if (exec_delta64 > sample_interval64)
1917 exec_delta64 = sample_interval64;
1918
1919 idle_delta64 = sample_interval64 - exec_delta64;
1920
1921 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1922 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1923
1924 this_load = (unsigned long)tmp64;
1925
1926do_avg:
1927
1928 /* Update our load: */
1929 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1930 unsigned long old_load, new_load;
1931
1932 /* scale is effectively 1 << i now, and >> i divides by scale */
1933
1934 old_load = this_rq->cpu_load[i];
1935 new_load = this_load;
1936
1937 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1938 }
2069} 1939}
2070 1940
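Editor's note: the do_avg part of update_cpu_load() is exponential averaging, where index i weights the old value against the new sample as (2^i - 1) : 1. The standalone demo below (made-up load samples, not kernel code) shows how higher indices react more slowly to both the ramp-up and the idle period that follows.

    #include <stdio.h>

    #define CPU_LOAD_IDX_MAX 5

    static void update_cpu_load_sketch(unsigned long cpu_load[CPU_LOAD_IDX_MAX],
                                       unsigned long this_load)
    {
            for (int i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                    unsigned long old_load = cpu_load[i];

                    /* scale == 1 << i, so ">> i" divides by the total weight */
                    cpu_load[i] = (old_load * (scale - 1) + this_load) >> i;
            }
    }

    int main(void)
    {
            unsigned long load[CPU_LOAD_IDX_MAX] = { 0 };
            unsigned long samples[] = { 1024, 1024, 0, 0 };  /* busy, busy, idle, idle */

            for (unsigned int t = 0; t < sizeof(samples) / sizeof(samples[0]); t++) {
                    update_cpu_load_sketch(load, samples[t]);
                    printf("tick %u: { %lu %lu %lu %lu %lu }\n", t,
                           load[0], load[1], load[2], load[3], load[4]);
            }
            return 0;
    }

Running it, cpu_load[0] tracks the sample exactly while cpu_load[4] moves only a small fraction of the way each tick, which is exactly the smoothing the balancer's load_idx selection relies on.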
1941#ifdef CONFIG_SMP
1942
2071/* 1943/*
2072 * double_rq_lock - safely lock two runqueues 1944 * double_rq_lock - safely lock two runqueues
2073 * 1945 *
@@ -2184,23 +2056,17 @@ void sched_exec(void)
2184 * pull_task - move a task from a remote runqueue to the local runqueue. 2056 * pull_task - move a task from a remote runqueue to the local runqueue.
2185 * Both runqueues must be locked. 2057 * Both runqueues must be locked.
2186 */ 2058 */
2187static void pull_task(struct rq *src_rq, struct prio_array *src_array, 2059static void pull_task(struct rq *src_rq, struct task_struct *p,
2188 struct task_struct *p, struct rq *this_rq, 2060 struct rq *this_rq, int this_cpu)
2189 struct prio_array *this_array, int this_cpu)
2190{ 2061{
2191 dequeue_task(p, src_array); 2062 deactivate_task(src_rq, p, 0);
2192 dec_nr_running(p, src_rq);
2193 set_task_cpu(p, this_cpu); 2063 set_task_cpu(p, this_cpu);
2194 inc_nr_running(p, this_rq); 2064 activate_task(this_rq, p, 0);
2195 enqueue_task(p, this_array);
2196 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2197 + this_rq->most_recent_timestamp;
2198 /* 2065 /*
2199 * Note that idle threads have a prio of MAX_PRIO, for this test 2066 * Note that idle threads have a prio of MAX_PRIO, for this test
2200 * to be always true for them. 2067 * to be always true for them.
2201 */ 2068 */
2202 if (TASK_PREEMPTS_CURR(p, this_rq)) 2069 check_preempt_curr(this_rq, p);
2203 resched_task(this_rq->curr);
2204} 2070}
2205 2071
2206/* 2072/*
@@ -2208,7 +2074,7 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2208 */ 2074 */
2209static 2075static
2210int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 2076int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2211 struct sched_domain *sd, enum idle_type idle, 2077 struct sched_domain *sd, enum cpu_idle_type idle,
2212 int *all_pinned) 2078 int *all_pinned)
2213{ 2079{
2214 /* 2080 /*
@@ -2225,132 +2091,67 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2225 return 0; 2091 return 0;
2226 2092
2227 /* 2093 /*
2228 * Aggressive migration if: 2094 * Aggressive migration if too many balance attempts have failed:
2229 * 1) task is cache cold, or
2230 * 2) too many balance attempts have failed.
2231 */ 2095 */
2232 2096 if (sd->nr_balance_failed > sd->cache_nice_tries)
2233 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2234#ifdef CONFIG_SCHEDSTATS
2235 if (task_hot(p, rq->most_recent_timestamp, sd))
2236 schedstat_inc(sd, lb_hot_gained[idle]);
2237#endif
2238 return 1; 2097 return 1;
2239 }
2240 2098
2241 if (task_hot(p, rq->most_recent_timestamp, sd))
2242 return 0;
2243 return 1; 2099 return 1;
2244} 2100}
2245 2101
2246#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) 2102static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2247
2248/*
2249 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2250 * load from busiest to this_rq, as part of a balancing operation within
2251 * "domain". Returns the number of tasks moved.
2252 *
2253 * Called with both runqueues locked.
2254 */
2255static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2256 unsigned long max_nr_move, unsigned long max_load_move, 2103 unsigned long max_nr_move, unsigned long max_load_move,
2257 struct sched_domain *sd, enum idle_type idle, 2104 struct sched_domain *sd, enum cpu_idle_type idle,
2258 int *all_pinned) 2105 int *all_pinned, unsigned long *load_moved,
2106 int this_best_prio, int best_prio, int best_prio_seen,
2107 struct rq_iterator *iterator)
2259{ 2108{
2260 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, 2109 int pulled = 0, pinned = 0, skip_for_load;
2261 best_prio_seen, skip_for_load; 2110 struct task_struct *p;
2262 struct prio_array *array, *dst_array; 2111 long rem_load_move = max_load_move;
2263 struct list_head *head, *curr;
2264 struct task_struct *tmp;
2265 long rem_load_move;
2266 2112
2267 if (max_nr_move == 0 || max_load_move == 0) 2113 if (max_nr_move == 0 || max_load_move == 0)
2268 goto out; 2114 goto out;
2269 2115
2270 rem_load_move = max_load_move;
2271 pinned = 1; 2116 pinned = 1;
2272 this_best_prio = rq_best_prio(this_rq);
2273 best_prio = rq_best_prio(busiest);
2274 /*
2275 * Enable handling of the case where there is more than one task
2276 * with the best priority. If the current running task is one
2277 * of those with prio==best_prio we know it won't be moved
2278 * and therefore it's safe to override the skip (based on load) of
2279 * any task we find with that prio.
2280 */
2281 best_prio_seen = best_prio == busiest->curr->prio;
2282 2117
2283 /* 2118 /*
2284 * We first consider expired tasks. Those will likely not be 2119 * Start the load-balancing iterator:
2285 * executed in the near future, and they are most likely to
2286 * be cache-cold, thus switching CPUs has the least effect
2287 * on them.
2288 */ 2120 */
2289 if (busiest->expired->nr_active) { 2121 p = iterator->start(iterator->arg);
2290 array = busiest->expired; 2122next:
2291 dst_array = this_rq->expired; 2123 if (!p)
2292 } else {
2293 array = busiest->active;
2294 dst_array = this_rq->active;
2295 }
2296
2297new_array:
2298 /* Start searching at priority 0: */
2299 idx = 0;
2300skip_bitmap:
2301 if (!idx)
2302 idx = sched_find_first_bit(array->bitmap);
2303 else
2304 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2305 if (idx >= MAX_PRIO) {
2306 if (array == busiest->expired && busiest->active->nr_active) {
2307 array = busiest->active;
2308 dst_array = this_rq->active;
2309 goto new_array;
2310 }
2311 goto out; 2124 goto out;
2312 }
2313
2314 head = array->queue + idx;
2315 curr = head->prev;
2316skip_queue:
2317 tmp = list_entry(curr, struct task_struct, run_list);
2318
2319 curr = curr->prev;
2320
2321 /* 2125 /*
2322 * To help distribute high priority tasks across CPUs we don't 2126 * To help distribute high priority tasks across CPUs we don't
2323 * skip a task if it will be the highest priority task (i.e. smallest 2127 * skip a task if it will be the highest priority task (i.e. smallest
2324 * prio value) on its new queue regardless of its load weight 2128 * prio value) on its new queue regardless of its load weight
2325 */ 2129 */
2326 skip_for_load = tmp->load_weight > rem_load_move; 2130 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2327 if (skip_for_load && idx < this_best_prio) 2131 SCHED_LOAD_SCALE_FUZZ;
2328 skip_for_load = !best_prio_seen && idx == best_prio; 2132 if (skip_for_load && p->prio < this_best_prio)
2133 skip_for_load = !best_prio_seen && p->prio == best_prio;
2329 if (skip_for_load || 2134 if (skip_for_load ||
2330 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2135 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2331 2136
2332 best_prio_seen |= idx == best_prio; 2137 best_prio_seen |= p->prio == best_prio;
2333 if (curr != head) 2138 p = iterator->next(iterator->arg);
2334 goto skip_queue; 2139 goto next;
2335 idx++;
2336 goto skip_bitmap;
2337 } 2140 }
2338 2141
2339 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2142 pull_task(busiest, p, this_rq, this_cpu);
2340 pulled++; 2143 pulled++;
2341 rem_load_move -= tmp->load_weight; 2144 rem_load_move -= p->se.load.weight;
2342 2145
2343 /* 2146 /*
2344 * We only want to steal up to the prescribed number of tasks 2147 * We only want to steal up to the prescribed number of tasks
2345 * and the prescribed amount of weighted load. 2148 * and the prescribed amount of weighted load.
2346 */ 2149 */
2347 if (pulled < max_nr_move && rem_load_move > 0) { 2150 if (pulled < max_nr_move && rem_load_move > 0) {
2348 if (idx < this_best_prio) 2151 if (p->prio < this_best_prio)
2349 this_best_prio = idx; 2152 this_best_prio = p->prio;
2350 if (curr != head) 2153 p = iterator->next(iterator->arg);
2351 goto skip_queue; 2154 goto next;
2352 idx++;
2353 goto skip_bitmap;
2354 } 2155 }
2355out: 2156out:
2356 /* 2157 /*
@@ -2362,18 +2163,48 @@ out:
2362 2163
2363 if (all_pinned) 2164 if (all_pinned)
2364 *all_pinned = pinned; 2165 *all_pinned = pinned;
2166 *load_moved = max_load_move - rem_load_move;
2365 return pulled; 2167 return pulled;
2366} 2168}
2367 2169
2368/* 2170/*
2171 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2172 * load from busiest to this_rq, as part of a balancing operation within
2173 * "domain". Returns the number of tasks moved.
2174 *
2175 * Called with both runqueues locked.
2176 */
2177static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2178 unsigned long max_nr_move, unsigned long max_load_move,
2179 struct sched_domain *sd, enum cpu_idle_type idle,
2180 int *all_pinned)
2181{
2182 struct sched_class *class = sched_class_highest;
2183 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2184 long rem_load_move = max_load_move;
2185
2186 do {
2187 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2188 max_nr_move, (unsigned long)rem_load_move,
2189 sd, idle, all_pinned, &load_moved);
2190 total_nr_moved += nr_moved;
2191 max_nr_move -= nr_moved;
2192 rem_load_move -= load_moved;
2193 class = class->next;
2194 } while (class && max_nr_move && rem_load_move > 0);
2195
2196 return total_nr_moved;
2197}
2198
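Editor's note: move_tasks() is now a thin loop over the scheduling classes, each contributing through its load_balance hook until the task and weight budgets run out. The sketch below mirrors only that loop, with invented types; the real hook also takes the domain, idle state, pinned mask and iterator arguments.

    struct rqb;

    struct sched_class_b {
            const struct sched_class_b *next;    /* next lower-priority class */
            unsigned long (*load_balance)(struct rqb *this_rq, struct rqb *busiest,
                                          unsigned long max_nr_move,
                                          unsigned long max_load_move,
                                          unsigned long *load_moved);
    };

    static unsigned long move_tasks_sketch(const struct sched_class_b *class,
                                           struct rqb *this_rq, struct rqb *busiest,
                                           unsigned long max_nr_move,
                                           long rem_load_move)
    {
            unsigned long total_moved = 0;

            /* Highest-priority class first; each class is trusted to respect
             * the remaining caps, as in the patch. */
            while (class && max_nr_move && rem_load_move > 0) {
                    unsigned long load_moved = 0;
                    unsigned long nr = class->load_balance(this_rq, busiest,
                                                           max_nr_move,
                                                           (unsigned long)rem_load_move,
                                                           &load_moved);

                    total_moved += nr;
                    max_nr_move -= nr;
                    rem_load_move -= (long)load_moved;
                    class = class->next;
            }

            return total_moved;
    }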
2199/*
2369 * find_busiest_group finds and returns the busiest CPU group within the 2200 * find_busiest_group finds and returns the busiest CPU group within the
2370 * domain. It calculates and returns the amount of weighted load which 2201 * domain. It calculates and returns the amount of weighted load which
2371 * should be moved to restore balance via the imbalance parameter. 2202 * should be moved to restore balance via the imbalance parameter.
2372 */ 2203 */
2373static struct sched_group * 2204static struct sched_group *
2374find_busiest_group(struct sched_domain *sd, int this_cpu, 2205find_busiest_group(struct sched_domain *sd, int this_cpu,
2375 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2206 unsigned long *imbalance, enum cpu_idle_type idle,
2376 cpumask_t *cpus, int *balance) 2207 int *sd_idle, cpumask_t *cpus, int *balance)
2377{ 2208{
2378 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2209 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2379 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2210 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2391,9 +2222,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2391 max_load = this_load = total_load = total_pwr = 0; 2222 max_load = this_load = total_load = total_pwr = 0;
2392 busiest_load_per_task = busiest_nr_running = 0; 2223 busiest_load_per_task = busiest_nr_running = 0;
2393 this_load_per_task = this_nr_running = 0; 2224 this_load_per_task = this_nr_running = 0;
2394 if (idle == NOT_IDLE) 2225 if (idle == CPU_NOT_IDLE)
2395 load_idx = sd->busy_idx; 2226 load_idx = sd->busy_idx;
2396 else if (idle == NEWLY_IDLE) 2227 else if (idle == CPU_NEWLY_IDLE)
2397 load_idx = sd->newidle_idx; 2228 load_idx = sd->newidle_idx;
2398 else 2229 else
2399 load_idx = sd->idle_idx; 2230 load_idx = sd->idle_idx;
@@ -2421,7 +2252,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2421 2252
2422 rq = cpu_rq(i); 2253 rq = cpu_rq(i);
2423 2254
2424 if (*sd_idle && !idle_cpu(i)) 2255 if (*sd_idle && rq->nr_running)
2425 *sd_idle = 0; 2256 *sd_idle = 0;
2426 2257
2427 /* Bias balancing toward cpus of our domain */ 2258 /* Bias balancing toward cpus of our domain */
@@ -2437,15 +2268,17 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2437 2268
2438 avg_load += load; 2269 avg_load += load;
2439 sum_nr_running += rq->nr_running; 2270 sum_nr_running += rq->nr_running;
2440 sum_weighted_load += rq->raw_weighted_load; 2271 sum_weighted_load += weighted_cpuload(i);
2441 } 2272 }
2442 2273
2443 /* 2274 /*
2444 * First idle cpu or the first cpu(busiest) in this sched group 2275 * First idle cpu or the first cpu(busiest) in this sched group
2445 * is eligible for doing load balancing at this and above 2276 * is eligible for doing load balancing at this and above
2446 * domains. 2277 * domains. In the newly idle case, we will allow all the cpu's
2278 * to do the newly idle load balance.
2447 */ 2279 */
2448 if (local_group && balance_cpu != this_cpu && balance) { 2280 if (idle != CPU_NEWLY_IDLE && local_group &&
2281 balance_cpu != this_cpu && balance) {
2449 *balance = 0; 2282 *balance = 0;
2450 goto ret; 2283 goto ret;
2451 } 2284 }
@@ -2477,8 +2310,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2477 * Busy processors will not participate in power savings 2310 * Busy processors will not participate in power savings
2478 * balance. 2311 * balance.
2479 */ 2312 */
2480 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2313 if (idle == CPU_NOT_IDLE ||
2481 goto group_next; 2314 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2315 goto group_next;
2482 2316
2483 /* 2317 /*
2484 * If the local group is idle or completely loaded 2318 * If the local group is idle or completely loaded
@@ -2488,42 +2322,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2488 !this_nr_running)) 2322 !this_nr_running))
2489 power_savings_balance = 0; 2323 power_savings_balance = 0;
2490 2324
2491 /* 2325 /*
2492 * If a group is already running at full capacity or idle, 2326 * If a group is already running at full capacity or idle,
2493 * don't include that group in power savings calculations 2327 * don't include that group in power savings calculations
2494 */ 2328 */
2495 if (!power_savings_balance || sum_nr_running >= group_capacity 2329 if (!power_savings_balance || sum_nr_running >= group_capacity
2496 || !sum_nr_running) 2330 || !sum_nr_running)
2497 goto group_next; 2331 goto group_next;
2498 2332
2499 /* 2333 /*
2500 * Calculate the group which has the least non-idle load. 2334 * Calculate the group which has the least non-idle load.
2501 * This is the group from where we need to pick up the load 2335 * This is the group from where we need to pick up the load
2502 * for saving power 2336 * for saving power
2503 */ 2337 */
2504 if ((sum_nr_running < min_nr_running) || 2338 if ((sum_nr_running < min_nr_running) ||
2505 (sum_nr_running == min_nr_running && 2339 (sum_nr_running == min_nr_running &&
2506 first_cpu(group->cpumask) < 2340 first_cpu(group->cpumask) <
2507 first_cpu(group_min->cpumask))) { 2341 first_cpu(group_min->cpumask))) {
2508 group_min = group; 2342 group_min = group;
2509 min_nr_running = sum_nr_running; 2343 min_nr_running = sum_nr_running;
2510 min_load_per_task = sum_weighted_load / 2344 min_load_per_task = sum_weighted_load /
2511 sum_nr_running; 2345 sum_nr_running;
2512 } 2346 }
2513 2347
2514 /* 2348 /*
2515 * Calculate the group which is almost near its 2349 * Calculate the group which is almost near its
2516 * capacity but still has some space to pick up some load 2350 * capacity but still has some space to pick up some load
2517 * from other group and save more power 2351 * from other group and save more power
2518 */ 2352 */
2519 if (sum_nr_running <= group_capacity - 1) { 2353 if (sum_nr_running <= group_capacity - 1) {
2520 if (sum_nr_running > leader_nr_running || 2354 if (sum_nr_running > leader_nr_running ||
2521 (sum_nr_running == leader_nr_running && 2355 (sum_nr_running == leader_nr_running &&
2522 first_cpu(group->cpumask) > 2356 first_cpu(group->cpumask) >
2523 first_cpu(group_leader->cpumask))) { 2357 first_cpu(group_leader->cpumask))) {
2524 group_leader = group; 2358 group_leader = group;
2525 leader_nr_running = sum_nr_running; 2359 leader_nr_running = sum_nr_running;
2526 } 2360 }
2527 } 2361 }
2528group_next: 2362group_next:
2529#endif 2363#endif
@@ -2578,7 +2412,7 @@ group_next:
2578 * a think about bumping its value to force at least one task to be 2412 * a think about bumping its value to force at least one task to be
2579 * moved 2413 * moved
2580 */ 2414 */
2581 if (*imbalance < busiest_load_per_task) { 2415 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2582 unsigned long tmp, pwr_now, pwr_move; 2416 unsigned long tmp, pwr_now, pwr_move;
2583 unsigned int imbn; 2417 unsigned int imbn;
2584 2418
@@ -2592,7 +2426,8 @@ small_imbalance:
2592 } else 2426 } else
2593 this_load_per_task = SCHED_LOAD_SCALE; 2427 this_load_per_task = SCHED_LOAD_SCALE;
2594 2428
2595 if (max_load - this_load >= busiest_load_per_task * imbn) { 2429 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2430 busiest_load_per_task * imbn) {
2596 *imbalance = busiest_load_per_task; 2431 *imbalance = busiest_load_per_task;
2597 return busiest; 2432 return busiest;
2598 } 2433 }
@@ -2639,7 +2474,7 @@ small_imbalance:
2639 2474
2640out_balanced: 2475out_balanced:
2641#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2476#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2642 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2477 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2643 goto ret; 2478 goto ret;
2644 2479
2645 if (this == group_leader && group_leader != group_min) { 2480 if (this == group_leader && group_leader != group_min) {
@@ -2656,7 +2491,7 @@ ret:
2656 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2491 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2657 */ 2492 */
2658static struct rq * 2493static struct rq *
2659find_busiest_queue(struct sched_group *group, enum idle_type idle, 2494find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2660 unsigned long imbalance, cpumask_t *cpus) 2495 unsigned long imbalance, cpumask_t *cpus)
2661{ 2496{
2662 struct rq *busiest = NULL, *rq; 2497 struct rq *busiest = NULL, *rq;
@@ -2664,17 +2499,19 @@ find_busiest_queue(struct sched_group *group, enum idle_type idle,
2664 int i; 2499 int i;
2665 2500
2666 for_each_cpu_mask(i, group->cpumask) { 2501 for_each_cpu_mask(i, group->cpumask) {
2502 unsigned long wl;
2667 2503
2668 if (!cpu_isset(i, *cpus)) 2504 if (!cpu_isset(i, *cpus))
2669 continue; 2505 continue;
2670 2506
2671 rq = cpu_rq(i); 2507 rq = cpu_rq(i);
2508 wl = weighted_cpuload(i);
2672 2509
2673 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2510 if (rq->nr_running == 1 && wl > imbalance)
2674 continue; 2511 continue;
2675 2512
2676 if (rq->raw_weighted_load > max_load) { 2513 if (wl > max_load) {
2677 max_load = rq->raw_weighted_load; 2514 max_load = wl;
2678 busiest = rq; 2515 busiest = rq;
2679 } 2516 }
2680 } 2517 }
@@ -2698,7 +2535,7 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2698 * tasks if there is an imbalance. 2535 * tasks if there is an imbalance.
2699 */ 2536 */
2700static int load_balance(int this_cpu, struct rq *this_rq, 2537static int load_balance(int this_cpu, struct rq *this_rq,
2701 struct sched_domain *sd, enum idle_type idle, 2538 struct sched_domain *sd, enum cpu_idle_type idle,
2702 int *balance) 2539 int *balance)
2703{ 2540{
2704 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2541 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
@@ -2711,10 +2548,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2711 /* 2548 /*
2712 * When power savings policy is enabled for the parent domain, idle 2549 * When power savings policy is enabled for the parent domain, idle
2713 * sibling can pick up load irrespective of busy siblings. In this case, 2550 * sibling can pick up load irrespective of busy siblings. In this case,
2714 * let the state of idle sibling percolate up as IDLE, instead of 2551 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2715 * portraying it as NOT_IDLE. 2552 * portraying it as CPU_NOT_IDLE.
2716 */ 2553 */
2717 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2554 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2718 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2555 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2719 sd_idle = 1; 2556 sd_idle = 1;
2720 2557
@@ -2848,7 +2685,7 @@ out_one_pinned:
2848 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2685 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2849 * tasks if there is an imbalance. 2686 * tasks if there is an imbalance.
2850 * 2687 *
2851 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2688 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2852 * this_rq is locked. 2689 * this_rq is locked.
2853 */ 2690 */
2854static int 2691static int
@@ -2859,37 +2696,38 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2859 unsigned long imbalance; 2696 unsigned long imbalance;
2860 int nr_moved = 0; 2697 int nr_moved = 0;
2861 int sd_idle = 0; 2698 int sd_idle = 0;
2699 int all_pinned = 0;
2862 cpumask_t cpus = CPU_MASK_ALL; 2700 cpumask_t cpus = CPU_MASK_ALL;
2863 2701
2864 /* 2702 /*
2865 * When power savings policy is enabled for the parent domain, idle 2703 * When power savings policy is enabled for the parent domain, idle
2866 * sibling can pick up load irrespective of busy siblings. In this case, 2704 * sibling can pick up load irrespective of busy siblings. In this case,
2867 * let the state of idle sibling percolate up as IDLE, instead of 2705 * let the state of idle sibling percolate up as IDLE, instead of
2868 * portraying it as NOT_IDLE. 2706 * portraying it as CPU_NOT_IDLE.
2869 */ 2707 */
2870 if (sd->flags & SD_SHARE_CPUPOWER && 2708 if (sd->flags & SD_SHARE_CPUPOWER &&
2871 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2709 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2872 sd_idle = 1; 2710 sd_idle = 1;
2873 2711
2874 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2712 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2875redo: 2713redo:
2876 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2714 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2877 &sd_idle, &cpus, NULL); 2715 &sd_idle, &cpus, NULL);
2878 if (!group) { 2716 if (!group) {
2879 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2717 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2880 goto out_balanced; 2718 goto out_balanced;
2881 } 2719 }
2882 2720
2883 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, 2721 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2884 &cpus); 2722 &cpus);
2885 if (!busiest) { 2723 if (!busiest) {
2886 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2724 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2887 goto out_balanced; 2725 goto out_balanced;
2888 } 2726 }
2889 2727
2890 BUG_ON(busiest == this_rq); 2728 BUG_ON(busiest == this_rq);
2891 2729
2892 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2730 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2893 2731
2894 nr_moved = 0; 2732 nr_moved = 0;
2895 if (busiest->nr_running > 1) { 2733 if (busiest->nr_running > 1) {
@@ -2897,10 +2735,11 @@ redo:
2897 double_lock_balance(this_rq, busiest); 2735 double_lock_balance(this_rq, busiest);
2898 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2736 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2899 minus_1_or_zero(busiest->nr_running), 2737 minus_1_or_zero(busiest->nr_running),
2900 imbalance, sd, NEWLY_IDLE, NULL); 2738 imbalance, sd, CPU_NEWLY_IDLE,
2739 &all_pinned);
2901 spin_unlock(&busiest->lock); 2740 spin_unlock(&busiest->lock);
2902 2741
2903 if (!nr_moved) { 2742 if (unlikely(all_pinned)) {
2904 cpu_clear(cpu_of(busiest), cpus); 2743 cpu_clear(cpu_of(busiest), cpus);
2905 if (!cpus_empty(cpus)) 2744 if (!cpus_empty(cpus))
2906 goto redo; 2745 goto redo;
@@ -2908,7 +2747,7 @@ redo:
2908 } 2747 }
2909 2748
2910 if (!nr_moved) { 2749 if (!nr_moved) {
2911 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2750 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2912 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2751 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2913 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2752 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2914 return -1; 2753 return -1;
@@ -2918,7 +2757,7 @@ redo:
2918 return nr_moved; 2757 return nr_moved;
2919 2758
2920out_balanced: 2759out_balanced:
2921 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2760 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2922 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2761 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2923 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2762 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2924 return -1; 2763 return -1;
@@ -2934,8 +2773,8 @@ out_balanced:
2934static void idle_balance(int this_cpu, struct rq *this_rq) 2773static void idle_balance(int this_cpu, struct rq *this_rq)
2935{ 2774{
2936 struct sched_domain *sd; 2775 struct sched_domain *sd;
2937 int pulled_task = 0; 2776 int pulled_task = -1;
2938 unsigned long next_balance = jiffies + 60 * HZ; 2777 unsigned long next_balance = jiffies + HZ;
2939 2778
2940 for_each_domain(this_cpu, sd) { 2779 for_each_domain(this_cpu, sd) {
2941 unsigned long interval; 2780 unsigned long interval;
@@ -2954,12 +2793,13 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
2954 if (pulled_task) 2793 if (pulled_task)
2955 break; 2794 break;
2956 } 2795 }
2957 if (!pulled_task) 2796 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2958 /* 2797 /*
2959 * We are going idle. next_balance may be set based on 2798 * We are going idle. next_balance may be set based on
2960 * a busy processor. So reset next_balance. 2799 * a busy processor. So reset next_balance.
2961 */ 2800 */
2962 this_rq->next_balance = next_balance; 2801 this_rq->next_balance = next_balance;
2802 }
2963} 2803}
2964 2804
2965/* 2805/*
@@ -3003,7 +2843,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3003 schedstat_inc(sd, alb_cnt); 2843 schedstat_inc(sd, alb_cnt);
3004 2844
3005 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, 2845 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
3006 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, 2846 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
3007 NULL)) 2847 NULL))
3008 schedstat_inc(sd, alb_pushed); 2848 schedstat_inc(sd, alb_pushed);
3009 else 2849 else
@@ -3012,32 +2852,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3012 spin_unlock(&target_rq->lock); 2852 spin_unlock(&target_rq->lock);
3013} 2853}
3014 2854
3015static void update_load(struct rq *this_rq)
3016{
3017 unsigned long this_load;
3018 unsigned int i, scale;
3019
3020 this_load = this_rq->raw_weighted_load;
3021
3022 /* Update our load: */
3023 for (i = 0, scale = 1; i < 3; i++, scale += scale) {
3024 unsigned long old_load, new_load;
3025
3026 /* scale is effectively 1 << i now, and >> i divides by scale */
3027
3028 old_load = this_rq->cpu_load[i];
3029 new_load = this_load;
3030 /*
3031 * Round up the averaging division if load is increasing. This
3032 * prevents us from getting stuck on 9 if the load is 10, for
3033 * example.
3034 */
3035 if (new_load > old_load)
3036 new_load += scale-1;
3037 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3038 }
3039}
3040
3041#ifdef CONFIG_NO_HZ 2855#ifdef CONFIG_NO_HZ
3042static struct { 2856static struct {
3043 atomic_t load_balancer; 2857 atomic_t load_balancer;
@@ -3120,7 +2934,7 @@ static DEFINE_SPINLOCK(balancing);
3120 * 2934 *
3121 * Balancing parameters are set up in arch_init_sched_domains. 2935 * Balancing parameters are set up in arch_init_sched_domains.
3122 */ 2936 */
3123static inline void rebalance_domains(int cpu, enum idle_type idle) 2937static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3124{ 2938{
3125 int balance = 1; 2939 int balance = 1;
3126 struct rq *rq = cpu_rq(cpu); 2940 struct rq *rq = cpu_rq(cpu);
@@ -3134,13 +2948,16 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3134 continue; 2948 continue;
3135 2949
3136 interval = sd->balance_interval; 2950 interval = sd->balance_interval;
3137 if (idle != SCHED_IDLE) 2951 if (idle != CPU_IDLE)
3138 interval *= sd->busy_factor; 2952 interval *= sd->busy_factor;
3139 2953
3140 /* scale ms to jiffies */ 2954 /* scale ms to jiffies */
3141 interval = msecs_to_jiffies(interval); 2955 interval = msecs_to_jiffies(interval);
3142 if (unlikely(!interval)) 2956 if (unlikely(!interval))
3143 interval = 1; 2957 interval = 1;
2958 if (interval > HZ*NR_CPUS/10)
2959 interval = HZ*NR_CPUS/10;
2960
3144 2961
3145 if (sd->flags & SD_SERIALIZE) { 2962 if (sd->flags & SD_SERIALIZE) {
3146 if (!spin_trylock(&balancing)) 2963 if (!spin_trylock(&balancing))
@@ -3154,7 +2971,7 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3154 * longer idle, or one of our SMT siblings is 2971 * longer idle, or one of our SMT siblings is
3155 * not idle. 2972 * not idle.
3156 */ 2973 */
3157 idle = NOT_IDLE; 2974 idle = CPU_NOT_IDLE;
3158 } 2975 }
3159 sd->last_balance = jiffies; 2976 sd->last_balance = jiffies;
3160 } 2977 }
@@ -3182,11 +2999,12 @@ out:
3182 */ 2999 */
3183static void run_rebalance_domains(struct softirq_action *h) 3000static void run_rebalance_domains(struct softirq_action *h)
3184{ 3001{
3185 int local_cpu = smp_processor_id(); 3002 int this_cpu = smp_processor_id();
3186 struct rq *local_rq = cpu_rq(local_cpu); 3003 struct rq *this_rq = cpu_rq(this_cpu);
3187 enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; 3004 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3005 CPU_IDLE : CPU_NOT_IDLE;
3188 3006
3189 rebalance_domains(local_cpu, idle); 3007 rebalance_domains(this_cpu, idle);
3190 3008
3191#ifdef CONFIG_NO_HZ 3009#ifdef CONFIG_NO_HZ
3192 /* 3010 /*
@@ -3194,13 +3012,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3194 * balancing on behalf of the other idle cpus whose ticks are 3012 * balancing on behalf of the other idle cpus whose ticks are
3195 * stopped. 3013 * stopped.
3196 */ 3014 */
3197 if (local_rq->idle_at_tick && 3015 if (this_rq->idle_at_tick &&
3198 atomic_read(&nohz.load_balancer) == local_cpu) { 3016 atomic_read(&nohz.load_balancer) == this_cpu) {
3199 cpumask_t cpus = nohz.cpu_mask; 3017 cpumask_t cpus = nohz.cpu_mask;
3200 struct rq *rq; 3018 struct rq *rq;
3201 int balance_cpu; 3019 int balance_cpu;
3202 3020
3203 cpu_clear(local_cpu, cpus); 3021 cpu_clear(this_cpu, cpus);
3204 for_each_cpu_mask(balance_cpu, cpus) { 3022 for_each_cpu_mask(balance_cpu, cpus) {
3205 /* 3023 /*
3206 * If this cpu gets work to do, stop the load balancing 3024 * If this cpu gets work to do, stop the load balancing
@@ -3213,8 +3031,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3213 rebalance_domains(balance_cpu, SCHED_IDLE); 3031 rebalance_domains(balance_cpu, SCHED_IDLE);
3214 3032
3215 rq = cpu_rq(balance_cpu); 3033 rq = cpu_rq(balance_cpu);
3216 if (time_after(local_rq->next_balance, rq->next_balance)) 3034 if (time_after(this_rq->next_balance, rq->next_balance))
3217 local_rq->next_balance = rq->next_balance; 3035 this_rq->next_balance = rq->next_balance;
3218 } 3036 }
3219 } 3037 }
3220#endif 3038#endif
@@ -3227,9 +3045,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3227 * idle load balancing owner or decide to stop the periodic load balancing, 3045 * idle load balancing owner or decide to stop the periodic load balancing,
3228 * if the whole system is idle. 3046 * if the whole system is idle.
3229 */ 3047 */
3230static inline void trigger_load_balance(int cpu) 3048static inline void trigger_load_balance(struct rq *rq, int cpu)
3231{ 3049{
3232 struct rq *rq = cpu_rq(cpu);
3233#ifdef CONFIG_NO_HZ 3050#ifdef CONFIG_NO_HZ
3234 /* 3051 /*
3235 * If we were in the nohz mode recently and busy at the current 3052 * If we were in the nohz mode recently and busy at the current
@@ -3281,13 +3098,29 @@ static inline void trigger_load_balance(int cpu)
3281 if (time_after_eq(jiffies, rq->next_balance)) 3098 if (time_after_eq(jiffies, rq->next_balance))
3282 raise_softirq(SCHED_SOFTIRQ); 3099 raise_softirq(SCHED_SOFTIRQ);
3283} 3100}
3284#else 3101
3102#else /* CONFIG_SMP */
3103
3285/* 3104/*
3286 * on UP we do not need to balance between CPUs: 3105 * on UP we do not need to balance between CPUs:
3287 */ 3106 */
3288static inline void idle_balance(int cpu, struct rq *rq) 3107static inline void idle_balance(int cpu, struct rq *rq)
3289{ 3108{
3290} 3109}
3110
3111/* Avoid "used but not defined" warning on UP */
3112static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3113 unsigned long max_nr_move, unsigned long max_load_move,
3114 struct sched_domain *sd, enum cpu_idle_type idle,
3115 int *all_pinned, unsigned long *load_moved,
3116 int this_best_prio, int best_prio, int best_prio_seen,
3117 struct rq_iterator *iterator)
3118{
3119 *load_moved = 0;
3120
3121 return 0;
3122}
3123
3291#endif 3124#endif
3292 3125
3293DEFINE_PER_CPU(struct kernel_stat, kstat); 3126DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3295,54 +3128,28 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
3295EXPORT_PER_CPU_SYMBOL(kstat); 3128EXPORT_PER_CPU_SYMBOL(kstat);
3296 3129
3297/* 3130/*
3298 * This is called on clock ticks and on context switches. 3131 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3299 * Bank in p->sched_time the ns elapsed since the last tick or switch. 3132 * that have not yet been banked in case the task is currently running.
3300 */
3301static inline void
3302update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
3303{
3304 p->sched_time += now - p->last_ran;
3305 p->last_ran = rq->most_recent_timestamp = now;
3306}
3307
3308/*
3309 * Return current->sched_time plus any more ns on the sched_clock
3310 * that have not yet been banked.
3311 */ 3133 */
3312unsigned long long current_sched_time(const struct task_struct *p) 3134unsigned long long task_sched_runtime(struct task_struct *p)
3313{ 3135{
3314 unsigned long long ns;
3315 unsigned long flags; 3136 unsigned long flags;
3137 u64 ns, delta_exec;
3138 struct rq *rq;
3316 3139
3317 local_irq_save(flags); 3140 rq = task_rq_lock(p, &flags);
3318 ns = p->sched_time + sched_clock() - p->last_ran; 3141 ns = p->se.sum_exec_runtime;
3319 local_irq_restore(flags); 3142 if (rq->curr == p) {
3143 delta_exec = rq_clock(rq) - p->se.exec_start;
3144 if ((s64)delta_exec > 0)
3145 ns += delta_exec;
3146 }
3147 task_rq_unlock(rq, &flags);
3320 3148
3321 return ns; 3149 return ns;
3322} 3150}
3323 3151
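Editor's note: task_sched_runtime() replaces current_sched_time(): the answer is the banked p->se.sum_exec_runtime plus, for a currently running task, the not-yet-banked slice since exec_start. A standalone restatement with invented type names:

    #include <stdint.h>

    struct se_rt { uint64_t sum_exec_runtime, exec_start; };

    static uint64_t task_sched_runtime_sketch(const struct se_rt *se,
                                              int currently_running,
                                              uint64_t rq_clock_now)
    {
            uint64_t ns = se->sum_exec_runtime;

            if (currently_running) {
                    int64_t delta = (int64_t)(rq_clock_now - se->exec_start);

                    if (delta > 0)       /* same signed guard as the patch */
                            ns += (uint64_t)delta;
            }

            return ns;
    }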
3324/* 3152/*
3325 * We place interactive tasks back into the active array, if possible.
3326 *
3327 * To guarantee that this does not starve expired tasks we ignore the
3328 * interactivity of a task if the first expired task had to wait more
3329 * than a 'reasonable' amount of time. This deadline timeout is
3330 * load-dependent, as the frequency of array switched decreases with
3331 * increasing number of running tasks. We also ignore the interactivity
3332 * if a better static_prio task has expired:
3333 */
3334static inline int expired_starving(struct rq *rq)
3335{
3336 if (rq->curr->static_prio > rq->best_expired_prio)
3337 return 1;
3338 if (!STARVATION_LIMIT || !rq->expired_timestamp)
3339 return 0;
3340 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
3341 return 1;
3342 return 0;
3343}
3344
3345/*
3346 * Account user cpu time to a process. 3153 * Account user cpu time to a process.
3347 * @p: the process that the cpu time gets accounted to 3154 * @p: the process that the cpu time gets accounted to
3348 * @hardirq_offset: the offset to subtract from hardirq_count() 3155 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3415,81 +3222,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3415 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3222 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3416} 3223}
3417 3224
3418static void task_running_tick(struct rq *rq, struct task_struct *p)
3419{
3420 if (p->array != rq->active) {
3421 /* Task has expired but was not scheduled yet */
3422 set_tsk_need_resched(p);
3423 return;
3424 }
3425 spin_lock(&rq->lock);
3426 /*
3427 * The task was running during this tick - update the
3428 * time slice counter. Note: we do not update a thread's
3429 * priority until it either goes to sleep or uses up its
3430 * timeslice. This makes it possible for interactive tasks
3431 * to use up their timeslices at their highest priority levels.
3432 */
3433 if (rt_task(p)) {
3434 /*
3435 * RR tasks need a special form of timeslice management.
3436 * FIFO tasks have no timeslices.
3437 */
3438 if ((p->policy == SCHED_RR) && !--p->time_slice) {
3439 p->time_slice = task_timeslice(p);
3440 p->first_time_slice = 0;
3441 set_tsk_need_resched(p);
3442
3443 /* put it at the end of the queue: */
3444 requeue_task(p, rq->active);
3445 }
3446 goto out_unlock;
3447 }
3448 if (!--p->time_slice) {
3449 dequeue_task(p, rq->active);
3450 set_tsk_need_resched(p);
3451 p->prio = effective_prio(p);
3452 p->time_slice = task_timeslice(p);
3453 p->first_time_slice = 0;
3454
3455 if (!rq->expired_timestamp)
3456 rq->expired_timestamp = jiffies;
3457 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
3458 enqueue_task(p, rq->expired);
3459 if (p->static_prio < rq->best_expired_prio)
3460 rq->best_expired_prio = p->static_prio;
3461 } else
3462 enqueue_task(p, rq->active);
3463 } else {
3464 /*
3465 * Prevent a too long timeslice allowing a task to monopolize
3466 * the CPU. We do this by splitting up the timeslice into
3467 * smaller pieces.
3468 *
3469 * Note: this does not mean the task's timeslices expire or
3470 * get lost in any way, they just might be preempted by
3471 * another task of equal priority. (one with higher
3472 * priority would have preempted this task already.) We
3473 * requeue this task to the end of the list on this priority
3474 * level, which is in essence a round-robin of tasks with
3475 * equal priority.
3476 *
3477 * This only applies to tasks in the interactive
3478 * delta range with at least TIMESLICE_GRANULARITY to requeue.
3479 */
3480 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
3481 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
3482 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
3483 (p->array == rq->active)) {
3484
3485 requeue_task(p, rq->active);
3486 set_tsk_need_resched(p);
3487 }
3488 }
3489out_unlock:
3490 spin_unlock(&rq->lock);
3491}
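
To make the modulo condition above concrete, a small user-space sketch with assumed values (a 100 ms timeslice and a 25 ms granularity; the real TIMESLICE_GRANULARITY(p) depends on the task's bonus and the number of CPUs, so these numbers are illustrative only):

#include <stdio.h>

int main(void)
{
	int timeslice = 100, granularity = 25;	/* assumed, in ms */
	int left;

	for (left = timeslice - 1; left > 0; left--) {
		int used = timeslice - left;

		/* mirrors: !(used % granularity) && time_slice >= granularity */
		if (used % granularity == 0 && left >= granularity)
			printf("after %3d ms used: requeue behind equal-prio tasks\n",
			       used);
	}
	return 0;	/* prints at 25, 50 and 75 ms */
}
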
3492
3493/* 3225/*
3494 * This function gets called by the timer code, with HZ frequency. 3226 * This function gets called by the timer code, with HZ frequency.
3495 * We call it with interrupts disabled. 3227 * We call it with interrupts disabled.
@@ -3499,20 +3231,19 @@ out_unlock:
3499 */ 3231 */
3500void scheduler_tick(void) 3232void scheduler_tick(void)
3501{ 3233{
3502 unsigned long long now = sched_clock();
3503 struct task_struct *p = current;
3504 int cpu = smp_processor_id(); 3234 int cpu = smp_processor_id();
3505 int idle_at_tick = idle_cpu(cpu);
3506 struct rq *rq = cpu_rq(cpu); 3235 struct rq *rq = cpu_rq(cpu);
3236 struct task_struct *curr = rq->curr;
3507 3237
3508 update_cpu_clock(p, rq, now); 3238 spin_lock(&rq->lock);
3239 if (curr != rq->idle) /* FIXME: needed? */
3240 curr->sched_class->task_tick(rq, curr);
3241 update_cpu_load(rq);
3242 spin_unlock(&rq->lock);
3509 3243
3510 if (!idle_at_tick)
3511 task_running_tick(rq, p);
3512#ifdef CONFIG_SMP 3244#ifdef CONFIG_SMP
3513 update_load(rq); 3245 rq->idle_at_tick = idle_cpu(cpu);
3514 rq->idle_at_tick = idle_at_tick; 3246 trigger_load_balance(rq, cpu);
3515 trigger_load_balance(cpu);
3516#endif 3247#endif
3517} 3248}
3518 3249
@@ -3554,170 +3285,129 @@ EXPORT_SYMBOL(sub_preempt_count);
3554 3285
3555#endif 3286#endif
3556 3287
3557static inline int interactive_sleep(enum sleep_type sleep_type) 3288/*
3289 * Print scheduling while atomic bug:
3290 */
3291static noinline void __schedule_bug(struct task_struct *prev)
3558{ 3292{
3559 return (sleep_type == SLEEP_INTERACTIVE || 3293 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3560 sleep_type == SLEEP_INTERRUPTED); 3294 prev->comm, preempt_count(), prev->pid);
3295 debug_show_held_locks(prev);
3296 if (irqs_disabled())
3297 print_irqtrace_events(prev);
3298 dump_stack();
3561} 3299}
3562 3300
3563/* 3301/*
3564 * schedule() is the main scheduler function. 3302 * Various schedule()-time debugging checks and statistics:
3565 */ 3303 */
3566asmlinkage void __sched schedule(void) 3304static inline void schedule_debug(struct task_struct *prev)
3567{ 3305{
3568 struct task_struct *prev, *next;
3569 struct prio_array *array;
3570 struct list_head *queue;
3571 unsigned long long now;
3572 unsigned long run_time;
3573 int cpu, idx, new_prio;
3574 long *switch_count;
3575 struct rq *rq;
3576
3577 /* 3306 /*
3578 * Test if we are atomic. Since do_exit() needs to call into 3307 * Test if we are atomic. Since do_exit() needs to call into
3579 * schedule() atomically, we ignore that path for now. 3308 * schedule() atomically, we ignore that path for now.
3580 * Otherwise, whine if we are scheduling when we should not be. 3309 * Otherwise, whine if we are scheduling when we should not be.
3581 */ 3310 */
3582 if (unlikely(in_atomic() && !current->exit_state)) { 3311 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3583 printk(KERN_ERR "BUG: scheduling while atomic: " 3312 __schedule_bug(prev);
3584 "%s/0x%08x/%d\n",
3585 current->comm, preempt_count(), current->pid);
3586 debug_show_held_locks(current);
3587 if (irqs_disabled())
3588 print_irqtrace_events(current);
3589 dump_stack();
3590 }
3591 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3592 3313
3593need_resched: 3314 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3594 preempt_disable();
3595 prev = current;
3596 release_kernel_lock(prev);
3597need_resched_nonpreemptible:
3598 rq = this_rq();
3599 3315
3600 /* 3316 schedstat_inc(this_rq(), sched_cnt);
3601 * The idle thread is not allowed to schedule! 3317}
3602 * Remove this check after it has been exercised a bit.
3603 */
3604 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
3605 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
3606 dump_stack();
3607 }
3608 3318
3609 schedstat_inc(rq, sched_cnt); 3319/*
3610 now = sched_clock(); 3320 * Pick up the highest-prio task:
3611 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 3321 */
3612 run_time = now - prev->timestamp; 3322static inline struct task_struct *
3613 if (unlikely((long long)(now - prev->timestamp) < 0)) 3323pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3614 run_time = 0; 3324{
3615 } else 3325 struct sched_class *class;
3616 run_time = NS_MAX_SLEEP_AVG; 3326 struct task_struct *p;
3617 3327
3618 /* 3328 /*
3619 * Tasks charged proportionately less run_time at high sleep_avg to 3329 * Optimization: we know that if all tasks are in
3620 * delay them losing their interactive status 3330 * the fair class we can call that function directly:
3621 */ 3331 */
3622 run_time /= (CURRENT_BONUS(prev) ? : 1); 3332 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3623 3333 p = fair_sched_class.pick_next_task(rq, now);
3624 spin_lock_irq(&rq->lock); 3334 if (likely(p))
3625 3335 return p;
3626 switch_count = &prev->nivcsw;
3627 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3628 switch_count = &prev->nvcsw;
3629 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3630 unlikely(signal_pending(prev))))
3631 prev->state = TASK_RUNNING;
3632 else {
3633 if (prev->state == TASK_UNINTERRUPTIBLE)
3634 rq->nr_uninterruptible++;
3635 deactivate_task(prev, rq);
3636 }
3637 } 3336 }
3638 3337
3639 cpu = smp_processor_id(); 3338 class = sched_class_highest;
3640 if (unlikely(!rq->nr_running)) { 3339 for ( ; ; ) {
3641 idle_balance(cpu, rq); 3340 p = class->pick_next_task(rq, now);
3642 if (!rq->nr_running) { 3341 if (p)
3643 next = rq->idle; 3342 return p;
3644 rq->expired_timestamp = 0;
3645 goto switch_tasks;
3646 }
3647 }
3648
3649 array = rq->active;
3650 if (unlikely(!array->nr_active)) {
3651 /* 3343 /*
3652 * Switch the active and expired arrays. 3344 * Will never be NULL as the idle class always
3345 * returns a non-NULL p:
3653 */ 3346 */
3654 schedstat_inc(rq, sched_switch); 3347 class = class->next;
3655 rq->active = rq->expired;
3656 rq->expired = array;
3657 array = rq->active;
3658 rq->expired_timestamp = 0;
3659 rq->best_expired_prio = MAX_PRIO;
3660 } 3348 }
3349}
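
A minimal stand-alone sketch of why the loop above terminates (the names here are illustrative, not the kernel API): the classes are linked rt -> fair -> idle later in this patch, and the idle class' pick_next_task never returns NULL, so the walk always ends:

#include <stdio.h>
#include <stddef.h>

struct fake_class {
	const char *name;
	struct fake_class *next;
	const char *(*pick)(void);
};

static const char *pick_none(void) { return NULL; }
static const char *pick_idle(void) { return "the idle task"; }

static struct fake_class idle_class = { "idle", NULL,        pick_idle };
static struct fake_class fair_class = { "fair", &idle_class, pick_none };
static struct fake_class rt_class   = { "rt",   &fair_class, pick_none };

int main(void)
{
	struct fake_class *class = &rt_class;	/* sched_class_highest */
	const char *p;

	for ( ; ; ) {
		p = class->pick();
		if (p) {
			printf("picked %s from the %s class\n", p, class->name);
			return 0;
		}
		class = class->next;	/* the idle class ends the walk */
	}
}
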
3661 3350
3662 idx = sched_find_first_bit(array->bitmap); 3351/*
3663 queue = array->queue + idx; 3352 * schedule() is the main scheduler function.
3664 next = list_entry(queue->next, struct task_struct, run_list); 3353 */
3354asmlinkage void __sched schedule(void)
3355{
3356 struct task_struct *prev, *next;
3357 long *switch_count;
3358 struct rq *rq;
3359 u64 now;
3360 int cpu;
3665 3361
3666 if (!rt_task(next) && interactive_sleep(next->sleep_type)) { 3362need_resched:
3667 unsigned long long delta = now - next->timestamp; 3363 preempt_disable();
3668 if (unlikely((long long)(now - next->timestamp) < 0)) 3364 cpu = smp_processor_id();
3669 delta = 0; 3365 rq = cpu_rq(cpu);
3366 rcu_qsctr_inc(cpu);
3367 prev = rq->curr;
3368 switch_count = &prev->nivcsw;
3670 3369
3671 if (next->sleep_type == SLEEP_INTERACTIVE) 3370 release_kernel_lock(prev);
3672 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 3371need_resched_nonpreemptible:
3673 3372
3674 array = next->array; 3373 schedule_debug(prev);
3675 new_prio = recalc_task_prio(next, next->timestamp + delta);
3676 3374
3677 if (unlikely(next->prio != new_prio)) { 3375 spin_lock_irq(&rq->lock);
3678 dequeue_task(next, array); 3376 clear_tsk_need_resched(prev);
3679 next->prio = new_prio; 3377
3680 enqueue_task(next, array); 3378 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3379 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3380 unlikely(signal_pending(prev)))) {
3381 prev->state = TASK_RUNNING;
3382 } else {
3383 deactivate_task(rq, prev, 1);
3681 } 3384 }
3385 switch_count = &prev->nvcsw;
3682 } 3386 }
3683 next->sleep_type = SLEEP_NORMAL;
3684switch_tasks:
3685 if (next == rq->idle)
3686 schedstat_inc(rq, sched_goidle);
3687 prefetch(next);
3688 prefetch_stack(next);
3689 clear_tsk_need_resched(prev);
3690 rcu_qsctr_inc(task_cpu(prev));
3691 3387
3692 update_cpu_clock(prev, rq, now); 3388 if (unlikely(!rq->nr_running))
3389 idle_balance(cpu, rq);
3693 3390
3694 prev->sleep_avg -= run_time; 3391 now = __rq_clock(rq);
3695 if ((long)prev->sleep_avg <= 0) 3392 prev->sched_class->put_prev_task(rq, prev, now);
3696 prev->sleep_avg = 0; 3393 next = pick_next_task(rq, prev, now);
3697 prev->timestamp = prev->last_ran = now;
3698 3394
3699 sched_info_switch(prev, next); 3395 sched_info_switch(prev, next);
3396
3700 if (likely(prev != next)) { 3397 if (likely(prev != next)) {
3701 next->timestamp = next->last_ran = now;
3702 rq->nr_switches++; 3398 rq->nr_switches++;
3703 rq->curr = next; 3399 rq->curr = next;
3704 ++*switch_count; 3400 ++*switch_count;
3705 3401
3706 prepare_task_switch(rq, next); 3402 context_switch(rq, prev, next); /* unlocks the rq */
3707 prev = context_switch(rq, prev, next);
3708 barrier();
3709 /*
3710 * this_rq must be evaluated again because prev may have moved
3711 * CPUs since it called schedule(), thus the 'rq' on its stack
3712 * frame will be invalid.
3713 */
3714 finish_task_switch(this_rq(), prev);
3715 } else 3403 } else
3716 spin_unlock_irq(&rq->lock); 3404 spin_unlock_irq(&rq->lock);
3717 3405
3718 prev = current; 3406 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3719 if (unlikely(reacquire_kernel_lock(prev) < 0)) 3407 cpu = smp_processor_id();
3408 rq = cpu_rq(cpu);
3720 goto need_resched_nonpreemptible; 3409 goto need_resched_nonpreemptible;
3410 }
3721 preempt_enable_no_resched(); 3411 preempt_enable_no_resched();
3722 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3412 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3723 goto need_resched; 3413 goto need_resched;
@@ -4045,74 +3735,85 @@ out:
4045} 3735}
4046EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3736EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4047 3737
4048 3738static inline void
4049#define SLEEP_ON_VAR \ 3739sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4050 unsigned long flags; \ 3740{
4051 wait_queue_t wait; \ 3741 spin_lock_irqsave(&q->lock, *flags);
4052 init_waitqueue_entry(&wait, current); 3742 __add_wait_queue(q, wait);
4053
4054#define SLEEP_ON_HEAD \
4055 spin_lock_irqsave(&q->lock,flags); \
4056 __add_wait_queue(q, &wait); \
4057 spin_unlock(&q->lock); 3743 spin_unlock(&q->lock);
3744}
4058 3745
4059#define SLEEP_ON_TAIL \ 3746static inline void
4060 spin_lock_irq(&q->lock); \ 3747sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4061 __remove_wait_queue(q, &wait); \ 3748{
4062 spin_unlock_irqrestore(&q->lock, flags); 3749 spin_lock_irq(&q->lock);
3750 __remove_wait_queue(q, wait);
3751 spin_unlock_irqrestore(&q->lock, *flags);
3752}
4063 3753
4064void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3754void __sched interruptible_sleep_on(wait_queue_head_t *q)
4065{ 3755{
4066 SLEEP_ON_VAR 3756 unsigned long flags;
3757 wait_queue_t wait;
3758
3759 init_waitqueue_entry(&wait, current);
4067 3760
4068 current->state = TASK_INTERRUPTIBLE; 3761 current->state = TASK_INTERRUPTIBLE;
4069 3762
4070 SLEEP_ON_HEAD 3763 sleep_on_head(q, &wait, &flags);
4071 schedule(); 3764 schedule();
4072 SLEEP_ON_TAIL 3765 sleep_on_tail(q, &wait, &flags);
4073} 3766}
4074EXPORT_SYMBOL(interruptible_sleep_on); 3767EXPORT_SYMBOL(interruptible_sleep_on);
4075 3768
4076long fastcall __sched 3769long __sched
4077interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3770interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4078{ 3771{
4079 SLEEP_ON_VAR 3772 unsigned long flags;
3773 wait_queue_t wait;
3774
3775 init_waitqueue_entry(&wait, current);
4080 3776
4081 current->state = TASK_INTERRUPTIBLE; 3777 current->state = TASK_INTERRUPTIBLE;
4082 3778
4083 SLEEP_ON_HEAD 3779 sleep_on_head(q, &wait, &flags);
4084 timeout = schedule_timeout(timeout); 3780 timeout = schedule_timeout(timeout);
4085 SLEEP_ON_TAIL 3781 sleep_on_tail(q, &wait, &flags);
4086 3782
4087 return timeout; 3783 return timeout;
4088} 3784}
4089EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3785EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4090 3786
4091void fastcall __sched sleep_on(wait_queue_head_t *q) 3787void __sched sleep_on(wait_queue_head_t *q)
4092{ 3788{
4093 SLEEP_ON_VAR 3789 unsigned long flags;
3790 wait_queue_t wait;
3791
3792 init_waitqueue_entry(&wait, current);
4094 3793
4095 current->state = TASK_UNINTERRUPTIBLE; 3794 current->state = TASK_UNINTERRUPTIBLE;
4096 3795
4097 SLEEP_ON_HEAD 3796 sleep_on_head(q, &wait, &flags);
4098 schedule(); 3797 schedule();
4099 SLEEP_ON_TAIL 3798 sleep_on_tail(q, &wait, &flags);
4100} 3799}
4101EXPORT_SYMBOL(sleep_on); 3800EXPORT_SYMBOL(sleep_on);
4102 3801
4103long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3802long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4104{ 3803{
4105 SLEEP_ON_VAR 3804 unsigned long flags;
3805 wait_queue_t wait;
3806
3807 init_waitqueue_entry(&wait, current);
4106 3808
4107 current->state = TASK_UNINTERRUPTIBLE; 3809 current->state = TASK_UNINTERRUPTIBLE;
4108 3810
4109 SLEEP_ON_HEAD 3811 sleep_on_head(q, &wait, &flags);
4110 timeout = schedule_timeout(timeout); 3812 timeout = schedule_timeout(timeout);
4111 SLEEP_ON_TAIL 3813 sleep_on_tail(q, &wait, &flags);
4112 3814
4113 return timeout; 3815 return timeout;
4114} 3816}
4115
4116EXPORT_SYMBOL(sleep_on_timeout); 3817EXPORT_SYMBOL(sleep_on_timeout);
4117 3818
4118#ifdef CONFIG_RT_MUTEXES 3819#ifdef CONFIG_RT_MUTEXES
@@ -4129,29 +3830,30 @@ EXPORT_SYMBOL(sleep_on_timeout);
4129 */ 3830 */
4130void rt_mutex_setprio(struct task_struct *p, int prio) 3831void rt_mutex_setprio(struct task_struct *p, int prio)
4131{ 3832{
4132 struct prio_array *array;
4133 unsigned long flags; 3833 unsigned long flags;
3834 int oldprio, on_rq;
4134 struct rq *rq; 3835 struct rq *rq;
4135 int oldprio; 3836 u64 now;
4136 3837
4137 BUG_ON(prio < 0 || prio > MAX_PRIO); 3838 BUG_ON(prio < 0 || prio > MAX_PRIO);
4138 3839
4139 rq = task_rq_lock(p, &flags); 3840 rq = task_rq_lock(p, &flags);
3841 now = rq_clock(rq);
4140 3842
4141 oldprio = p->prio; 3843 oldprio = p->prio;
4142 array = p->array; 3844 on_rq = p->se.on_rq;
4143 if (array) 3845 if (on_rq)
4144 dequeue_task(p, array); 3846 dequeue_task(rq, p, 0, now);
3847
3848 if (rt_prio(prio))
3849 p->sched_class = &rt_sched_class;
3850 else
3851 p->sched_class = &fair_sched_class;
3852
4145 p->prio = prio; 3853 p->prio = prio;
4146 3854
4147 if (array) { 3855 if (on_rq) {
4148 /* 3856 enqueue_task(rq, p, 0, now);
4149 * If changing to an RT priority then queue it
4150 * in the active array!
4151 */
4152 if (rt_task(p))
4153 array = rq->active;
4154 enqueue_task(p, array);
4155 /* 3857 /*
4156 * Reschedule if we are currently running on this runqueue and 3858 * Reschedule if we are currently running on this runqueue and
4157 * our priority decreased, or if we are not currently running on 3859 * our priority decreased, or if we are not currently running on
@@ -4160,8 +3862,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4160 if (task_running(rq, p)) { 3862 if (task_running(rq, p)) {
4161 if (p->prio > oldprio) 3863 if (p->prio > oldprio)
4162 resched_task(rq->curr); 3864 resched_task(rq->curr);
4163 } else if (TASK_PREEMPTS_CURR(p, rq)) 3865 } else {
4164 resched_task(rq->curr); 3866 check_preempt_curr(rq, p);
3867 }
4165 } 3868 }
4166 task_rq_unlock(rq, &flags); 3869 task_rq_unlock(rq, &flags);
4167} 3870}
@@ -4170,10 +3873,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4170 3873
4171void set_user_nice(struct task_struct *p, long nice) 3874void set_user_nice(struct task_struct *p, long nice)
4172{ 3875{
4173 struct prio_array *array; 3876 int old_prio, delta, on_rq;
4174 int old_prio, delta;
4175 unsigned long flags; 3877 unsigned long flags;
4176 struct rq *rq; 3878 struct rq *rq;
3879 u64 now;
4177 3880
4178 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3881 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4179 return; 3882 return;
@@ -4182,20 +3885,21 @@ void set_user_nice(struct task_struct *p, long nice)
4182 * the task might be in the middle of scheduling on another CPU. 3885 * the task might be in the middle of scheduling on another CPU.
4183 */ 3886 */
4184 rq = task_rq_lock(p, &flags); 3887 rq = task_rq_lock(p, &flags);
3888 now = rq_clock(rq);
4185 /* 3889 /*
4186 * The RT priorities are set via sched_setscheduler(), but we still 3890 * The RT priorities are set via sched_setscheduler(), but we still
4187 * allow the 'normal' nice value to be set - but as expected 3891 * allow the 'normal' nice value to be set - but as expected
4188 * it won't have any effect on scheduling until the task is 3892 * it won't have any effect on scheduling until the task is
4189 * not SCHED_NORMAL/SCHED_BATCH: 3893 * SCHED_FIFO/SCHED_RR:
4190 */ 3894 */
4191 if (has_rt_policy(p)) { 3895 if (task_has_rt_policy(p)) {
4192 p->static_prio = NICE_TO_PRIO(nice); 3896 p->static_prio = NICE_TO_PRIO(nice);
4193 goto out_unlock; 3897 goto out_unlock;
4194 } 3898 }
4195 array = p->array; 3899 on_rq = p->se.on_rq;
4196 if (array) { 3900 if (on_rq) {
4197 dequeue_task(p, array); 3901 dequeue_task(rq, p, 0, now);
4198 dec_raw_weighted_load(rq, p); 3902 dec_load(rq, p, now);
4199 } 3903 }
4200 3904
4201 p->static_prio = NICE_TO_PRIO(nice); 3905 p->static_prio = NICE_TO_PRIO(nice);
@@ -4204,9 +3908,9 @@ void set_user_nice(struct task_struct *p, long nice)
4204 p->prio = effective_prio(p); 3908 p->prio = effective_prio(p);
4205 delta = p->prio - old_prio; 3909 delta = p->prio - old_prio;
4206 3910
4207 if (array) { 3911 if (on_rq) {
4208 enqueue_task(p, array); 3912 enqueue_task(rq, p, 0, now);
4209 inc_raw_weighted_load(rq, p); 3913 inc_load(rq, p, now);
4210 /* 3914 /*
4211 * If the task increased its priority or is running and 3915 * If the task increased its priority or is running and
4212 * lowered its priority, then reschedule its CPU: 3916 * lowered its priority, then reschedule its CPU:
@@ -4326,20 +4030,28 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
4326} 4030}
4327 4031
4328/* Actually do priority change: must hold rq lock. */ 4032/* Actually do priority change: must hold rq lock. */
4329static void __setscheduler(struct task_struct *p, int policy, int prio) 4033static void
4034__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4330{ 4035{
4331 BUG_ON(p->array); 4036 BUG_ON(p->se.on_rq);
4332 4037
4333 p->policy = policy; 4038 p->policy = policy;
4039 switch (p->policy) {
4040 case SCHED_NORMAL:
4041 case SCHED_BATCH:
4042 case SCHED_IDLE:
4043 p->sched_class = &fair_sched_class;
4044 break;
4045 case SCHED_FIFO:
4046 case SCHED_RR:
4047 p->sched_class = &rt_sched_class;
4048 break;
4049 }
4050
4334 p->rt_priority = prio; 4051 p->rt_priority = prio;
4335 p->normal_prio = normal_prio(p); 4052 p->normal_prio = normal_prio(p);
4336 /* we are holding p->pi_lock already */ 4053 /* we are holding p->pi_lock already */
4337 p->prio = rt_mutex_getprio(p); 4054 p->prio = rt_mutex_getprio(p);
4338 /*
4339 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
4340 */
4341 if (policy == SCHED_BATCH)
4342 p->sleep_avg = 0;
4343 set_load_weight(p); 4055 set_load_weight(p);
4344} 4056}
4345 4057
@@ -4354,8 +4066,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
4354int sched_setscheduler(struct task_struct *p, int policy, 4066int sched_setscheduler(struct task_struct *p, int policy,
4355 struct sched_param *param) 4067 struct sched_param *param)
4356{ 4068{
4357 int retval, oldprio, oldpolicy = -1; 4069 int retval, oldprio, oldpolicy = -1, on_rq;
4358 struct prio_array *array;
4359 unsigned long flags; 4070 unsigned long flags;
4360 struct rq *rq; 4071 struct rq *rq;
4361 4072
@@ -4366,27 +4077,27 @@ recheck:
4366 if (policy < 0) 4077 if (policy < 0)
4367 policy = oldpolicy = p->policy; 4078 policy = oldpolicy = p->policy;
4368 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4079 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4369 policy != SCHED_NORMAL && policy != SCHED_BATCH) 4080 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4081 policy != SCHED_IDLE)
4370 return -EINVAL; 4082 return -EINVAL;
4371 /* 4083 /*
4372 * Valid priorities for SCHED_FIFO and SCHED_RR are 4084 * Valid priorities for SCHED_FIFO and SCHED_RR are
4373 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 4085 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4374 * SCHED_BATCH is 0. 4086 * SCHED_BATCH and SCHED_IDLE is 0.
4375 */ 4087 */
4376 if (param->sched_priority < 0 || 4088 if (param->sched_priority < 0 ||
4377 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4089 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4378 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4090 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4379 return -EINVAL; 4091 return -EINVAL;
4380 if (is_rt_policy(policy) != (param->sched_priority != 0)) 4092 if (rt_policy(policy) != (param->sched_priority != 0))
4381 return -EINVAL; 4093 return -EINVAL;
4382 4094
4383 /* 4095 /*
4384 * Allow unprivileged RT tasks to decrease priority: 4096 * Allow unprivileged RT tasks to decrease priority:
4385 */ 4097 */
4386 if (!capable(CAP_SYS_NICE)) { 4098 if (!capable(CAP_SYS_NICE)) {
4387 if (is_rt_policy(policy)) { 4099 if (rt_policy(policy)) {
4388 unsigned long rlim_rtprio; 4100 unsigned long rlim_rtprio;
4389 unsigned long flags;
4390 4101
4391 if (!lock_task_sighand(p, &flags)) 4102 if (!lock_task_sighand(p, &flags))
4392 return -ESRCH; 4103 return -ESRCH;
@@ -4402,6 +4113,12 @@ recheck:
4402 param->sched_priority > rlim_rtprio) 4113 param->sched_priority > rlim_rtprio)
4403 return -EPERM; 4114 return -EPERM;
4404 } 4115 }
4116 /*
4117 * Like positive nice levels, don't allow tasks to
4118 * move out of SCHED_IDLE either:
4119 */
4120 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4121 return -EPERM;
4405 4122
4406 /* can't change other user's priorities */ 4123 /* can't change other user's priorities */
4407 if ((current->euid != p->euid) && 4124 if ((current->euid != p->euid) &&
@@ -4429,13 +4146,13 @@ recheck:
4429 spin_unlock_irqrestore(&p->pi_lock, flags); 4146 spin_unlock_irqrestore(&p->pi_lock, flags);
4430 goto recheck; 4147 goto recheck;
4431 } 4148 }
4432 array = p->array; 4149 on_rq = p->se.on_rq;
4433 if (array) 4150 if (on_rq)
4434 deactivate_task(p, rq); 4151 deactivate_task(rq, p, 0);
4435 oldprio = p->prio; 4152 oldprio = p->prio;
4436 __setscheduler(p, policy, param->sched_priority); 4153 __setscheduler(rq, p, policy, param->sched_priority);
4437 if (array) { 4154 if (on_rq) {
4438 __activate_task(p, rq); 4155 activate_task(rq, p, 0);
4439 /* 4156 /*
4440 * Reschedule if we are currently running on this runqueue and 4157 * Reschedule if we are currently running on this runqueue and
4441 * our priority decreased, or if we are not currently running on 4158 * our priority decreased, or if we are not currently running on
@@ -4444,8 +4161,9 @@ recheck:
4444 if (task_running(rq, p)) { 4161 if (task_running(rq, p)) {
4445 if (p->prio > oldprio) 4162 if (p->prio > oldprio)
4446 resched_task(rq->curr); 4163 resched_task(rq->curr);
4447 } else if (TASK_PREEMPTS_CURR(p, rq)) 4164 } else {
4448 resched_task(rq->curr); 4165 check_preempt_curr(rq, p);
4166 }
4449 } 4167 }
4450 __task_rq_unlock(rq); 4168 __task_rq_unlock(rq);
4451 spin_unlock_irqrestore(&p->pi_lock, flags); 4169 spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4717,41 +4435,18 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4717/** 4435/**
4718 * sys_sched_yield - yield the current processor to other threads. 4436 * sys_sched_yield - yield the current processor to other threads.
4719 * 4437 *
4720 * This function yields the current CPU by moving the calling thread 4438 * This function yields the current CPU to other tasks. If there are no
4721 * to the expired array. If there are no other threads running on this 4439 * other threads running on this CPU then this function will return.
4722 * CPU then this function will return.
4723 */ 4440 */
4724asmlinkage long sys_sched_yield(void) 4441asmlinkage long sys_sched_yield(void)
4725{ 4442{
4726 struct rq *rq = this_rq_lock(); 4443 struct rq *rq = this_rq_lock();
4727 struct prio_array *array = current->array, *target = rq->expired;
4728 4444
4729 schedstat_inc(rq, yld_cnt); 4445 schedstat_inc(rq, yld_cnt);
4730 /* 4446 if (unlikely(rq->nr_running == 1))
4731 * We implement yielding by moving the task into the expired
4732 * queue.
4733 *
4734 * (special rule: RT tasks will just roundrobin in the active
4735 * array.)
4736 */
4737 if (rt_task(current))
4738 target = rq->active;
4739
4740 if (array->nr_active == 1) {
4741 schedstat_inc(rq, yld_act_empty); 4447 schedstat_inc(rq, yld_act_empty);
4742 if (!rq->expired->nr_active) 4448 else
4743 schedstat_inc(rq, yld_both_empty); 4449 current->sched_class->yield_task(rq, current);
4744 } else if (!rq->expired->nr_active)
4745 schedstat_inc(rq, yld_exp_empty);
4746
4747 if (array != target) {
4748 dequeue_task(current, array);
4749 enqueue_task(current, target);
4750 } else
4751 /*
4752 * requeue_task is cheaper so perform that if possible.
4753 */
4754 requeue_task(current, array);
4755 4450
4756 /* 4451 /*
4757 * Since we are going to call schedule() anyway, there's 4452 * Since we are going to call schedule() anyway, there's
@@ -4902,6 +4597,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
4902 break; 4597 break;
4903 case SCHED_NORMAL: 4598 case SCHED_NORMAL:
4904 case SCHED_BATCH: 4599 case SCHED_BATCH:
4600 case SCHED_IDLE:
4905 ret = 0; 4601 ret = 0;
4906 break; 4602 break;
4907 } 4603 }
@@ -4926,6 +4622,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4926 break; 4622 break;
4927 case SCHED_NORMAL: 4623 case SCHED_NORMAL:
4928 case SCHED_BATCH: 4624 case SCHED_BATCH:
4625 case SCHED_IDLE:
4929 ret = 0; 4626 ret = 0;
4930 } 4627 }
4931 return ret; 4628 return ret;
@@ -4960,7 +4657,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4960 goto out_unlock; 4657 goto out_unlock;
4961 4658
4962 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4659 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4963 0 : task_timeslice(p), &t); 4660 0 : static_prio_timeslice(p->static_prio), &t);
4964 read_unlock(&tasklist_lock); 4661 read_unlock(&tasklist_lock);
4965 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4662 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4966out_nounlock: 4663out_nounlock:
@@ -4980,14 +4677,14 @@ static void show_task(struct task_struct *p)
4980 state = p->state ? __ffs(p->state) + 1 : 0; 4677 state = p->state ? __ffs(p->state) + 1 : 0;
4981 printk("%-13.13s %c", p->comm, 4678 printk("%-13.13s %c", p->comm,
4982 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4679 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4983#if (BITS_PER_LONG == 32) 4680#if BITS_PER_LONG == 32
4984 if (state == TASK_RUNNING) 4681 if (state == TASK_RUNNING)
4985 printk(" running "); 4682 printk(" running ");
4986 else 4683 else
4987 printk(" %08lX ", thread_saved_pc(p)); 4684 printk(" %08lx ", thread_saved_pc(p));
4988#else 4685#else
4989 if (state == TASK_RUNNING) 4686 if (state == TASK_RUNNING)
4990 printk(" running task "); 4687 printk(" running task ");
4991 else 4688 else
4992 printk(" %016lx ", thread_saved_pc(p)); 4689 printk(" %016lx ", thread_saved_pc(p));
4993#endif 4690#endif
@@ -4999,11 +4696,7 @@ static void show_task(struct task_struct *p)
4999 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4696 free = (unsigned long)n - (unsigned long)end_of_stack(p);
5000 } 4697 }
5001#endif 4698#endif
5002 printk("%5lu %5d %6d", free, p->pid, p->parent->pid); 4699 printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid);
5003 if (!p->mm)
5004 printk(" (L-TLB)\n");
5005 else
5006 printk(" (NOTLB)\n");
5007 4700
5008 if (state != TASK_RUNNING) 4701 if (state != TASK_RUNNING)
5009 show_stack(p, NULL); 4702 show_stack(p, NULL);
@@ -5013,14 +4706,12 @@ void show_state_filter(unsigned long state_filter)
5013{ 4706{
5014 struct task_struct *g, *p; 4707 struct task_struct *g, *p;
5015 4708
5016#if (BITS_PER_LONG == 32) 4709#if BITS_PER_LONG == 32
5017 printk("\n" 4710 printk(KERN_INFO
5018 " free sibling\n"); 4711 " task PC stack pid father\n");
5019 printk(" task PC stack pid father child younger older\n");
5020#else 4712#else
5021 printk("\n" 4713 printk(KERN_INFO
5022 " free sibling\n"); 4714 " task PC stack pid father\n");
5023 printk(" task PC stack pid father child younger older\n");
5024#endif 4715#endif
5025 read_lock(&tasklist_lock); 4716 read_lock(&tasklist_lock);
5026 do_each_thread(g, p) { 4717 do_each_thread(g, p) {
@@ -5035,6 +4726,9 @@ void show_state_filter(unsigned long state_filter)
5035 4726
5036 touch_all_softlockup_watchdogs(); 4727 touch_all_softlockup_watchdogs();
5037 4728
4729#ifdef CONFIG_SCHED_DEBUG
4730 sysrq_sched_debug_show();
4731#endif
5038 read_unlock(&tasklist_lock); 4732 read_unlock(&tasklist_lock);
5039 /* 4733 /*
5040 * Only show locks if all tasks are dumped: 4734 * Only show locks if all tasks are dumped:
@@ -5043,6 +4737,11 @@ void show_state_filter(unsigned long state_filter)
5043 debug_show_all_locks(); 4737 debug_show_all_locks();
5044} 4738}
5045 4739
4740void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4741{
4742 idle->sched_class = &idle_sched_class;
4743}
4744
5046/** 4745/**
5047 * init_idle - set up an idle thread for a given CPU 4746 * init_idle - set up an idle thread for a given CPU
5048 * @idle: task in question 4747 * @idle: task in question
@@ -5056,13 +4755,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5056 struct rq *rq = cpu_rq(cpu); 4755 struct rq *rq = cpu_rq(cpu);
5057 unsigned long flags; 4756 unsigned long flags;
5058 4757
5059 idle->timestamp = sched_clock(); 4758 __sched_fork(idle);
5060 idle->sleep_avg = 0; 4759 idle->se.exec_start = sched_clock();
5061 idle->array = NULL; 4760
5062 idle->prio = idle->normal_prio = MAX_PRIO; 4761 idle->prio = idle->normal_prio = MAX_PRIO;
5063 idle->state = TASK_RUNNING;
5064 idle->cpus_allowed = cpumask_of_cpu(cpu); 4762 idle->cpus_allowed = cpumask_of_cpu(cpu);
5065 set_task_cpu(idle, cpu); 4763 __set_task_cpu(idle, cpu);
5066 4764
5067 spin_lock_irqsave(&rq->lock, flags); 4765 spin_lock_irqsave(&rq->lock, flags);
5068 rq->curr = rq->idle = idle; 4766 rq->curr = rq->idle = idle;
@@ -5077,6 +4775,10 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5077#else 4775#else
5078 task_thread_info(idle)->preempt_count = 0; 4776 task_thread_info(idle)->preempt_count = 0;
5079#endif 4777#endif
4778 /*
4779 * The idle tasks have their own, simple scheduling class:
4780 */
4781 idle->sched_class = &idle_sched_class;
5080} 4782}
5081 4783
5082/* 4784/*
@@ -5088,6 +4790,28 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5088 */ 4790 */
5089cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4791cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5090 4792
4793/*
4794 * Increase the granularity value when there are more CPUs,
4795 * because with more CPUs the 'effective latency' as visible
4796 * to users decreases. But the relationship is not linear,
4797 * so pick a second-best guess by going with the log2 of the
4798 * number of CPUs.
4799 *
4800 * This idea comes from the SD scheduler of Con Kolivas:
4801 */
4802static inline void sched_init_granularity(void)
4803{
4804 unsigned int factor = 1 + ilog2(num_online_cpus());
4805 const unsigned long gran_limit = 100000000;
4806
4807 sysctl_sched_granularity *= factor;
4808 if (sysctl_sched_granularity > gran_limit)
4809 sysctl_sched_granularity = gran_limit;
4810
4811 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4812 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4813}
4814
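For a feel of the scaling, a user-space sketch follows; the 2 ms base granularity is an assumed value (the real default is defined elsewhere in this patch set), only the factor computation and the 100 ms cap mirror the function above:

#include <stdio.h>

static unsigned int ilog2_u(unsigned int n)
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned long gran_limit = 100000000;	/* 100 ms, in ns */
	unsigned long base_gran = 2000000;		/* assumed 2 ms base */
	unsigned int cpus;

	for (cpus = 1; cpus <= 64; cpus <<= 2) {
		unsigned int factor = 1 + ilog2_u(cpus);
		unsigned long gran = base_gran * factor;

		if (gran > gran_limit)
			gran = gran_limit;
		printf("%2u CPUs: factor %u -> granularity %lu ns, runtime limit %lu ns\n",
		       cpus, factor, gran, gran * 4);
	}
	return 0;
}
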
5091#ifdef CONFIG_SMP 4815#ifdef CONFIG_SMP
5092/* 4816/*
5093 * This is how migration works: 4817 * This is how migration works:
@@ -5161,7 +4885,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
5161static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4885static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5162{ 4886{
5163 struct rq *rq_dest, *rq_src; 4887 struct rq *rq_dest, *rq_src;
5164 int ret = 0; 4888 int ret = 0, on_rq;
5165 4889
5166 if (unlikely(cpu_is_offline(dest_cpu))) 4890 if (unlikely(cpu_is_offline(dest_cpu)))
5167 return ret; 4891 return ret;
@@ -5177,20 +4901,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5177 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4901 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5178 goto out; 4902 goto out;
5179 4903
4904 on_rq = p->se.on_rq;
4905 if (on_rq)
4906 deactivate_task(rq_src, p, 0);
5180 set_task_cpu(p, dest_cpu); 4907 set_task_cpu(p, dest_cpu);
5181 if (p->array) { 4908 if (on_rq) {
5182 /* 4909 activate_task(rq_dest, p, 0);
5183 * Sync timestamp with rq_dest's before activating. 4910 check_preempt_curr(rq_dest, p);
5184 * The same thing could be achieved by doing this step
5185 * afterwards, and pretending it was a local activate.
5186 * This way is cleaner and logically correct.
5187 */
5188 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
5189 + rq_dest->most_recent_timestamp;
5190 deactivate_task(p, rq_src);
5191 __activate_task(p, rq_dest);
5192 if (TASK_PREEMPTS_CURR(p, rq_dest))
5193 resched_task(rq_dest->curr);
5194 } 4911 }
5195 ret = 1; 4912 ret = 1;
5196out: 4913out:
@@ -5216,8 +4933,6 @@ static int migration_thread(void *data)
5216 struct migration_req *req; 4933 struct migration_req *req;
5217 struct list_head *head; 4934 struct list_head *head;
5218 4935
5219 try_to_freeze();
5220
5221 spin_lock_irq(&rq->lock); 4936 spin_lock_irq(&rq->lock);
5222 4937
5223 if (cpu_is_offline(cpu)) { 4938 if (cpu_is_offline(cpu)) {
@@ -5342,7 +5057,8 @@ static void migrate_live_tasks(int src_cpu)
5342 write_unlock_irq(&tasklist_lock); 5057 write_unlock_irq(&tasklist_lock);
5343} 5058}
5344 5059
5345/* Schedules idle task to be the next runnable task on current CPU. 5060/*
5061 * Schedules idle task to be the next runnable task on current CPU.
5346 * It does so by boosting its priority to highest possible and adding it to 5062 * It does so by boosting its priority to highest possible and adding it to
5347 * the _front_ of the runqueue. Used by CPU offline code. 5063 * the _front_ of the runqueue. Used by CPU offline code.
5348 */ 5064 */
@@ -5362,10 +5078,10 @@ void sched_idle_next(void)
5362 */ 5078 */
5363 spin_lock_irqsave(&rq->lock, flags); 5079 spin_lock_irqsave(&rq->lock, flags);
5364 5080
5365 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5081 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5366 5082
5367 /* Add idle task to the _front_ of its priority queue: */ 5083 /* Add idle task to the _front_ of its priority queue: */
5368 __activate_idle_task(p, rq); 5084 activate_idle_task(p, rq);
5369 5085
5370 spin_unlock_irqrestore(&rq->lock, flags); 5086 spin_unlock_irqrestore(&rq->lock, flags);
5371} 5087}
@@ -5415,16 +5131,15 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5415static void migrate_dead_tasks(unsigned int dead_cpu) 5131static void migrate_dead_tasks(unsigned int dead_cpu)
5416{ 5132{
5417 struct rq *rq = cpu_rq(dead_cpu); 5133 struct rq *rq = cpu_rq(dead_cpu);
5418 unsigned int arr, i; 5134 struct task_struct *next;
5419 5135
5420 for (arr = 0; arr < 2; arr++) { 5136 for ( ; ; ) {
5421 for (i = 0; i < MAX_PRIO; i++) { 5137 if (!rq->nr_running)
5422 struct list_head *list = &rq->arrays[arr].queue[i]; 5138 break;
5423 5139 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5424 while (!list_empty(list)) 5140 if (!next)
5425 migrate_dead(dead_cpu, list_entry(list->next, 5141 break;
5426 struct task_struct, run_list)); 5142 migrate_dead(dead_cpu, next);
5427 }
5428 } 5143 }
5429} 5144}
5430#endif /* CONFIG_HOTPLUG_CPU */ 5145#endif /* CONFIG_HOTPLUG_CPU */
@@ -5448,14 +5163,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5448 5163
5449 case CPU_UP_PREPARE: 5164 case CPU_UP_PREPARE:
5450 case CPU_UP_PREPARE_FROZEN: 5165 case CPU_UP_PREPARE_FROZEN:
5451 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5166 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5452 if (IS_ERR(p)) 5167 if (IS_ERR(p))
5453 return NOTIFY_BAD; 5168 return NOTIFY_BAD;
5454 p->flags |= PF_NOFREEZE;
5455 kthread_bind(p, cpu); 5169 kthread_bind(p, cpu);
5456 /* Must be high prio: stop_machine expects to yield to it. */ 5170 /* Must be high prio: stop_machine expects to yield to it. */
5457 rq = task_rq_lock(p, &flags); 5171 rq = task_rq_lock(p, &flags);
5458 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5172 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5459 task_rq_unlock(rq, &flags); 5173 task_rq_unlock(rq, &flags);
5460 cpu_rq(cpu)->migration_thread = p; 5174 cpu_rq(cpu)->migration_thread = p;
5461 break; 5175 break;
@@ -5486,9 +5200,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5486 rq->migration_thread = NULL; 5200 rq->migration_thread = NULL;
5487 /* Idle task back to normal (off runqueue, low prio) */ 5201 /* Idle task back to normal (off runqueue, low prio) */
5488 rq = task_rq_lock(rq->idle, &flags); 5202 rq = task_rq_lock(rq->idle, &flags);
5489 deactivate_task(rq->idle, rq); 5203 deactivate_task(rq, rq->idle, 0);
5490 rq->idle->static_prio = MAX_PRIO; 5204 rq->idle->static_prio = MAX_PRIO;
5491 __setscheduler(rq->idle, SCHED_NORMAL, 0); 5205 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5206 rq->idle->sched_class = &idle_sched_class;
5492 migrate_dead_tasks(cpu); 5207 migrate_dead_tasks(cpu);
5493 task_rq_unlock(rq, &flags); 5208 task_rq_unlock(rq, &flags);
5494 migrate_nr_uninterruptible(rq); 5209 migrate_nr_uninterruptible(rq);
@@ -5797,483 +5512,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5797 5512
5798#define SD_NODES_PER_DOMAIN 16 5513#define SD_NODES_PER_DOMAIN 16
5799 5514
5800/*
5801 * Self-tuning task migration cost measurement between source and target CPUs.
5802 *
5803 * This is done by measuring the cost of manipulating buffers of varying
5804 * sizes. For a given buffer-size here are the steps that are taken:
5805 *
5806 * 1) the source CPU reads+dirties a shared buffer
5807 * 2) the target CPU reads+dirties the same shared buffer
5808 *
5809 * We measure how long they take, in the following 4 scenarios:
5810 *
5811 * - source: CPU1, target: CPU2 | cost1
5812 * - source: CPU2, target: CPU1 | cost2
5813 * - source: CPU1, target: CPU1 | cost3
5814 * - source: CPU2, target: CPU2 | cost4
5815 *
5816 * We then calculate the cost1+cost2-cost3-cost4 difference - this is
5817 * the cost of migration.
5818 *
5819 * We then start off from a small buffer-size and iterate up to larger
5820 * buffer sizes, in roughly 10% steps - measuring each buffer-size separately, and
5821 * doing a maximum search for the cost. (The maximum cost for a migration
5822 * normally occurs when the working set size is around the effective cache
5823 * size.)
5824 */
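
With made-up timings, the arithmetic works out as follows; the numbers are purely illustrative, and the real code additionally averages several runs and perturbs the buffer size:

#include <stdio.h>

int main(void)
{
	/* hypothetical per-scenario times, in microseconds */
	long cost1 = 950;	/* source CPU1, target CPU2 */
	long cost2 = 940;	/* source CPU2, target CPU1 */
	long cost3 = 410;	/* source CPU1, target CPU1 (cache-hot) */
	long cost4 = 405;	/* source CPU2, target CPU2 (cache-hot) */

	/* cross-CPU cost minus same-CPU cost: 1075 us */
	printf("estimated migration cost: %ld us\n",
	       (cost1 + cost2) - (cost3 + cost4));
	return 0;
}
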
5825#define SEARCH_SCOPE 2
5826#define MIN_CACHE_SIZE (64*1024U)
5827#define DEFAULT_CACHE_SIZE (5*1024*1024U)
5828#define ITERATIONS 1
5829#define SIZE_THRESH 130
5830#define COST_THRESH 130
5831
5832/*
5833 * The migration cost is a function of 'domain distance'. Domain
5834 * distance is the number of steps a CPU has to iterate down its
5835 * domain tree to share a domain with the other CPU. The farther
5836 * two CPUs are from each other, the larger the distance gets.
5837 *
5838 * Note that we use the distance only to cache measurement results,
5839 * the distance value is not used numerically otherwise. When two
5840 * CPUs have the same distance it is assumed that the migration
5841 * cost is the same. (this is a simplification but quite practical)
5842 */
5843#define MAX_DOMAIN_DISTANCE 32
5844
5845static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5846 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5847/*
5848 * Architectures may override the migration cost and thus avoid
5849 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5850 * virtualized hardware:
5851 */
5852#ifdef CONFIG_DEFAULT_MIGRATION_COST
5853 CONFIG_DEFAULT_MIGRATION_COST
5854#else
5855 -1LL
5856#endif
5857};
5858
5859/*
5860 * Allow override of migration cost - in units of microseconds.
5861 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5862 * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
5863 */
5864static int __init migration_cost_setup(char *str)
5865{
5866 int ints[MAX_DOMAIN_DISTANCE+1], i;
5867
5868 str = get_options(str, ARRAY_SIZE(ints), ints);
5869
5870 printk("#ints: %d\n", ints[0]);
5871 for (i = 1; i <= ints[0]; i++) {
5872 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5873 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5874 }
5875 return 1;
5876}
5877
5878__setup ("migration_cost=", migration_cost_setup);
5879
5880/*
5881 * Global multiplier (divisor) for migration-cutoff values,
5882 * in percentiles. E.g. use a value of 150 to get 1.5 times
5883 * longer cache-hot cutoff times.
5884 *
5885 * (We scale it from 100 to 128 to make the long long arithmetic easier.)
5886 */
5887
5888#define MIGRATION_FACTOR_SCALE 128
5889
5890static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5891
5892static int __init setup_migration_factor(char *str)
5893{
5894 get_option(&str, &migration_factor);
5895 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5896 return 1;
5897}
5898
5899__setup("migration_factor=", setup_migration_factor);
5900
5901/*
5902 * Estimated distance of two CPUs, measured via the number of domains
5903 * we have to pass for the two CPUs to be in the same span:
5904 */
5905static unsigned long domain_distance(int cpu1, int cpu2)
5906{
5907 unsigned long distance = 0;
5908 struct sched_domain *sd;
5909
5910 for_each_domain(cpu1, sd) {
5911 WARN_ON(!cpu_isset(cpu1, sd->span));
5912 if (cpu_isset(cpu2, sd->span))
5913 return distance;
5914 distance++;
5915 }
5916 if (distance >= MAX_DOMAIN_DISTANCE) {
5917 WARN_ON(1);
5918 distance = MAX_DOMAIN_DISTANCE-1;
5919 }
5920
5921 return distance;
5922}
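
For example, on a hypothetical two-socket machine with two SMT siblings per core: a CPU reaches its hyperthread sibling in its lowest domain (distance 0), another core in the same package one level up (distance 1), and any CPU on the other socket only at the top level (distance 2) - so at most three distinct migration costs are measured and cached for the whole box.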
5923
5924static unsigned int migration_debug;
5925
5926static int __init setup_migration_debug(char *str)
5927{
5928 get_option(&str, &migration_debug);
5929 return 1;
5930}
5931
5932__setup("migration_debug=", setup_migration_debug);
5933
5934/*
5935 * Maximum cache-size that the scheduler should try to measure.
5936 * Architectures with larger caches should tune this up during
5937 * bootup. Gets used in the domain-setup code (i.e. during SMP
5938 * bootup).
5939 */
5940unsigned int max_cache_size;
5941
5942static int __init setup_max_cache_size(char *str)
5943{
5944 get_option(&str, &max_cache_size);
5945 return 1;
5946}
5947
5948__setup("max_cache_size=", setup_max_cache_size);
5949
5950/*
5951 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5952 * is the operation that is timed, so we try to generate unpredictable
5953 * cachemisses that still end up filling the L2 cache:
5954 */
5955static void touch_cache(void *__cache, unsigned long __size)
5956{
5957 unsigned long size = __size / sizeof(long);
5958 unsigned long chunk1 = size / 3;
5959 unsigned long chunk2 = 2 * size / 3;
5960 unsigned long *cache = __cache;
5961 int i;
5962
5963 for (i = 0; i < size/6; i += 8) {
5964 switch (i % 6) {
5965 case 0: cache[i]++;
5966 case 1: cache[size-1-i]++;
5967 case 2: cache[chunk1-i]++;
5968 case 3: cache[chunk1+i]++;
5969 case 4: cache[chunk2-i]++;
5970 case 5: cache[chunk2+i]++;
5971 }
5972 }
5973}
5974
5975/*
5976 * Measure the cache-cost of one task migration. Returns in units of nsec.
5977 */
5978static unsigned long long
5979measure_one(void *cache, unsigned long size, int source, int target)
5980{
5981 cpumask_t mask, saved_mask;
5982 unsigned long long t0, t1, t2, t3, cost;
5983
5984 saved_mask = current->cpus_allowed;
5985
5986 /*
5987 * Flush source caches to RAM and invalidate them:
5988 */
5989 sched_cacheflush();
5990
5991 /*
5992 * Migrate to the source CPU:
5993 */
5994 mask = cpumask_of_cpu(source);
5995 set_cpus_allowed(current, mask);
5996 WARN_ON(smp_processor_id() != source);
5997
5998 /*
5999 * Dirty the working set:
6000 */
6001 t0 = sched_clock();
6002 touch_cache(cache, size);
6003 t1 = sched_clock();
6004
6005 /*
6006 * Migrate to the target CPU, dirty the L2 cache and access
6007 * the shared buffer. (which represents the working set
6008 * of a migrated task.)
6009 */
6010 mask = cpumask_of_cpu(target);
6011 set_cpus_allowed(current, mask);
6012 WARN_ON(smp_processor_id() != target);
6013
6014 t2 = sched_clock();
6015 touch_cache(cache, size);
6016 t3 = sched_clock();
6017
6018 cost = t1-t0 + t3-t2;
6019
6020 if (migration_debug >= 2)
6021 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
6022 source, target, t1-t0, t1-t0, t3-t2, cost);
6023 /*
6024 * Flush target caches to RAM and invalidate them:
6025 */
6026 sched_cacheflush();
6027
6028 set_cpus_allowed(current, saved_mask);
6029
6030 return cost;
6031}
6032
6033/*
6034 * Measure a series of task migrations and return the average
6035 * result. Since this code runs early during bootup the system
6036 * is 'undisturbed' and the average latency makes sense.
6037 *
6038 * The algorithm in essence auto-detects the relevant cache-size,
6039 * so it will properly detect different cachesizes for different
6040 * cache-hierarchies, depending on how the CPUs are connected.
6041 *
6042 * Architectures can prime the upper limit of the search range via
6043 * max_cache_size, otherwise the search range defaults to 10MB...64K.
6044 */
6045static unsigned long long
6046measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
6047{
6048 unsigned long long cost1, cost2;
6049 int i;
6050
6051 /*
6052 * Measure the migration cost of 'size' bytes, over an
6053 * average of 10 runs:
6054 *
6055 * (We perturb the cache size by a small (0..4k)
6056 * value to compensate for size/alignment-related artifacts.
6057 * We also subtract the cost of the operation done on
6058 * the same CPU.)
6059 */
6060 cost1 = 0;
6061
6062 /*
6063 * dry run, to make sure we start off cache-cold on cpu1,
6064 * and to get any vmalloc pagefaults in advance:
6065 */
6066 measure_one(cache, size, cpu1, cpu2);
6067 for (i = 0; i < ITERATIONS; i++)
6068 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
6069
6070 measure_one(cache, size, cpu2, cpu1);
6071 for (i = 0; i < ITERATIONS; i++)
6072 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
6073
6074 /*
6075 * (We measure the non-migrating [cached] cost on both
6076 * cpu1 and cpu2, to handle CPUs with different speeds)
6077 */
6078 cost2 = 0;
6079
6080 measure_one(cache, size, cpu1, cpu1);
6081 for (i = 0; i < ITERATIONS; i++)
6082 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
6083
6084 measure_one(cache, size, cpu2, cpu2);
6085 for (i = 0; i < ITERATIONS; i++)
6086 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
6087
6088 /*
6089 * Get the per-iteration migration cost:
6090 */
6091 do_div(cost1, 2 * ITERATIONS);
6092 do_div(cost2, 2 * ITERATIONS);
6093
6094 return cost1 - cost2;
6095}
6096
6097static unsigned long long measure_migration_cost(int cpu1, int cpu2)
6098{
6099 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
6100 unsigned int max_size, size, size_found = 0;
6101 long long cost = 0, prev_cost;
6102 void *cache;
6103
6104 /*
6105 * Search from max_cache_size*SEARCH_SCOPE down to 64K - the real
6106 * relevant cache size has to lie somewhere in between.
6107 */
6108 if (max_cache_size) {
6109 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
6110 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
6111 } else {
6112 /*
6113 * Since we have no estimate of the relevant search
6114 * range, fall back to the defaults:
6115 */
6116 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
6117 size = MIN_CACHE_SIZE;
6118 }
6119
6120 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
6121 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
6122 return 0;
6123 }
6124
6125 /*
6126 * Allocate the working set:
6127 */
6128 cache = vmalloc(max_size);
6129 if (!cache) {
6130 printk("could not vmalloc %u bytes for cache!\n", max_size);
6131 return 1000000; /* return 1 msec on very small boxen */
6132 }
6133
6134 while (size <= max_size) {
6135 prev_cost = cost;
6136 cost = measure_cost(cpu1, cpu2, cache, size);
6137
6138 /*
6139 * Update the max:
6140 */
6141 if (cost > 0) {
6142 if (max_cost < cost) {
6143 max_cost = cost;
6144 size_found = size;
6145 }
6146 }
6147 /*
6148 * Calculate average fluctuation, we use this to prevent
6149 * noise from triggering an early break out of the loop:
6150 */
6151 fluct = abs(cost - prev_cost);
6152 avg_fluct = (avg_fluct + fluct)/2;
6153
6154 if (migration_debug)
6155 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6156 "(%8Ld %8Ld)\n",
6157 cpu1, cpu2, size,
6158 (long)cost / 1000000,
6159 ((long)cost / 100000) % 10,
6160 (long)max_cost / 1000000,
6161 ((long)max_cost / 100000) % 10,
6162 domain_distance(cpu1, cpu2),
6163 cost, avg_fluct);
6164
6165 /*
6166 * If we iterated at least 20% past the previous maximum,
6167 * and the cost has dropped by more than 20% already,
6168 * (taking fluctuations into account) then we assume to
6169 * have found the maximum and break out of the loop early:
6170 */
6171 if (size_found && (size*100 > size_found*SIZE_THRESH))
6172 if (cost+avg_fluct <= 0 ||
6173 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
6174
6175 if (migration_debug)
6176 printk("-> found max.\n");
6177 break;
6178 }
6179 /*
6180 * Increase the cachesize in 10% steps:
6181 */
6182 size = size * 10 / 9;
6183 }
6184
6185 if (migration_debug)
6186 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
6187 cpu1, cpu2, size_found, max_cost);
6188
6189 vfree(cache);
6190
6191 /*
6192 * A task is considered 'cache cold' if at least 2 times
6193 * the worst-case cost of migration has passed.
6194 *
6195 * (this limit is only listened to if the load-balancing
6196 * situation is 'nice' - if there is a large imbalance we
6197 * ignore it for the sake of CPU utilization and
6198 * processing fairness.)
6199 */
6200 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
6201}
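
A small worked example of the cutoff above, including the percent-to-128 scaling applied by the migration_factor= boot parameter; the max_cost and factor values are illustrative:

#include <stdio.h>

#define MIGRATION_FACTOR_SCALE 128

int main(void)
{
	unsigned long long max_cost = 1200000;	/* assumed 1.2 ms, in ns */
	unsigned int factor_percent = 150;	/* migration_factor=150 */
	unsigned int migration_factor =
		factor_percent * MIGRATION_FACTOR_SCALE / 100;	/* 192 */

	/* 2 * 1.2 ms * 1.5 = 3.6 ms cache-hot cutoff */
	printf("cache-hot cutoff: %llu ns\n",
	       2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE);
	return 0;
}
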
6202
6203static void calibrate_migration_costs(const cpumask_t *cpu_map)
6204{
6205 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
6206 unsigned long j0, j1, distance, max_distance = 0;
6207 struct sched_domain *sd;
6208
6209 j0 = jiffies;
6210
6211 /*
6212 * First pass - calculate the cacheflush times:
6213 */
6214 for_each_cpu_mask(cpu1, *cpu_map) {
6215 for_each_cpu_mask(cpu2, *cpu_map) {
6216 if (cpu1 == cpu2)
6217 continue;
6218 distance = domain_distance(cpu1, cpu2);
6219 max_distance = max(max_distance, distance);
6220 /*
6221 * No result cached yet?
6222 */
6223 if (migration_cost[distance] == -1LL)
6224 migration_cost[distance] =
6225 measure_migration_cost(cpu1, cpu2);
6226 }
6227 }
6228 /*
6229 * Second pass - update the sched domain hierarchy with
6230 * the new cache-hot-time estimations:
6231 */
6232 for_each_cpu_mask(cpu, *cpu_map) {
6233 distance = 0;
6234 for_each_domain(cpu, sd) {
6235 sd->cache_hot_time = migration_cost[distance];
6236 distance++;
6237 }
6238 }
6239 /*
6240 * Print the matrix:
6241 */
6242 if (migration_debug)
6243 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
6244 max_cache_size,
6245#ifdef CONFIG_X86
6246 cpu_khz/1000
6247#else
6248 -1
6249#endif
6250 );
6251 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6252 printk("migration_cost=");
6253 for (distance = 0; distance <= max_distance; distance++) {
6254 if (distance)
6255 printk(",");
6256 printk("%ld", (long)migration_cost[distance] / 1000);
6257 }
6258 printk("\n");
6259 }
6260 j1 = jiffies;
6261 if (migration_debug)
6262 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6263
6264 /*
6265 * Move back to the original CPU. NUMA-Q gets confused
6266 * if we migrate to another quad during bootup.
6267 */
6268 if (raw_smp_processor_id() != orig_cpu) {
6269 cpumask_t mask = cpumask_of_cpu(orig_cpu),
6270 saved_mask = current->cpus_allowed;
6271
6272 set_cpus_allowed(current, mask);
6273 set_cpus_allowed(current, saved_mask);
6274 }
6275}
6276
6277#ifdef CONFIG_NUMA 5515#ifdef CONFIG_NUMA
6278 5516
6279/** 5517/**
@@ -6574,7 +5812,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6574static int build_sched_domains(const cpumask_t *cpu_map) 5812static int build_sched_domains(const cpumask_t *cpu_map)
6575{ 5813{
6576 int i; 5814 int i;
6577 struct sched_domain *sd;
6578#ifdef CONFIG_NUMA 5815#ifdef CONFIG_NUMA
6579 struct sched_group **sched_group_nodes = NULL; 5816 struct sched_group **sched_group_nodes = NULL;
6580 int sd_allnodes = 0; 5817 int sd_allnodes = 0;
@@ -6582,7 +5819,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6582 /* 5819 /*
6583 * Allocate the per-node list of sched groups 5820 * Allocate the per-node list of sched groups
6584 */ 5821 */
6585 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 5822 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
6586 GFP_KERNEL); 5823 GFP_KERNEL);
6587 if (!sched_group_nodes) { 5824 if (!sched_group_nodes) {
6588 printk(KERN_WARNING "Can not alloc sched group node list\n"); 5825 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6601,8 +5838,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6601 cpus_and(nodemask, nodemask, *cpu_map); 5838 cpus_and(nodemask, nodemask, *cpu_map);
6602 5839
6603#ifdef CONFIG_NUMA 5840#ifdef CONFIG_NUMA
6604 if (cpus_weight(*cpu_map) 5841 if (cpus_weight(*cpu_map) >
6605 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 5842 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6606 sd = &per_cpu(allnodes_domains, i); 5843 sd = &per_cpu(allnodes_domains, i);
6607 *sd = SD_ALLNODES_INIT; 5844 *sd = SD_ALLNODES_INIT;
6608 sd->span = *cpu_map; 5845 sd->span = *cpu_map;
@@ -6661,7 +5898,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6661 if (i != first_cpu(this_sibling_map)) 5898 if (i != first_cpu(this_sibling_map))
6662 continue; 5899 continue;
6663 5900
6664 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); 5901 init_sched_build_groups(this_sibling_map, cpu_map,
5902 &cpu_to_cpu_group);
6665 } 5903 }
6666#endif 5904#endif
6667 5905
@@ -6672,11 +5910,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6672 cpus_and(this_core_map, this_core_map, *cpu_map); 5910 cpus_and(this_core_map, this_core_map, *cpu_map);
6673 if (i != first_cpu(this_core_map)) 5911 if (i != first_cpu(this_core_map))
6674 continue; 5912 continue;
6675 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); 5913 init_sched_build_groups(this_core_map, cpu_map,
5914 &cpu_to_core_group);
6676 } 5915 }
6677#endif 5916#endif
6678 5917
6679
6680 /* Set up physical groups */ 5918 /* Set up physical groups */
6681 for (i = 0; i < MAX_NUMNODES; i++) { 5919 for (i = 0; i < MAX_NUMNODES; i++) {
6682 cpumask_t nodemask = node_to_cpumask(i); 5920 cpumask_t nodemask = node_to_cpumask(i);
@@ -6691,7 +5929,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6691#ifdef CONFIG_NUMA 5929#ifdef CONFIG_NUMA
6692 /* Set up node groups */ 5930 /* Set up node groups */
6693 if (sd_allnodes) 5931 if (sd_allnodes)
6694 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); 5932 init_sched_build_groups(*cpu_map, cpu_map,
5933 &cpu_to_allnodes_group);
6695 5934
6696 for (i = 0; i < MAX_NUMNODES; i++) { 5935 for (i = 0; i < MAX_NUMNODES; i++) {
6697 /* Set up node groups */ 5936 /* Set up node groups */
@@ -6719,6 +5958,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6719 sched_group_nodes[i] = sg; 5958 sched_group_nodes[i] = sg;
6720 for_each_cpu_mask(j, nodemask) { 5959 for_each_cpu_mask(j, nodemask) {
6721 struct sched_domain *sd; 5960 struct sched_domain *sd;
5961
6722 sd = &per_cpu(node_domains, j); 5962 sd = &per_cpu(node_domains, j);
6723 sd->groups = sg; 5963 sd->groups = sg;
6724 } 5964 }
@@ -6763,19 +6003,22 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6763 /* Calculate CPU power for physical packages and nodes */ 6003 /* Calculate CPU power for physical packages and nodes */
6764#ifdef CONFIG_SCHED_SMT 6004#ifdef CONFIG_SCHED_SMT
6765 for_each_cpu_mask(i, *cpu_map) { 6005 for_each_cpu_mask(i, *cpu_map) {
6766 sd = &per_cpu(cpu_domains, i); 6006 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6007
6767 init_sched_groups_power(i, sd); 6008 init_sched_groups_power(i, sd);
6768 } 6009 }
6769#endif 6010#endif
6770#ifdef CONFIG_SCHED_MC 6011#ifdef CONFIG_SCHED_MC
6771 for_each_cpu_mask(i, *cpu_map) { 6012 for_each_cpu_mask(i, *cpu_map) {
6772 sd = &per_cpu(core_domains, i); 6013 struct sched_domain *sd = &per_cpu(core_domains, i);
6014
6773 init_sched_groups_power(i, sd); 6015 init_sched_groups_power(i, sd);
6774 } 6016 }
6775#endif 6017#endif
6776 6018
6777 for_each_cpu_mask(i, *cpu_map) { 6019 for_each_cpu_mask(i, *cpu_map) {
6778 sd = &per_cpu(phys_domains, i); 6020 struct sched_domain *sd = &per_cpu(phys_domains, i);
6021
6779 init_sched_groups_power(i, sd); 6022 init_sched_groups_power(i, sd);
6780 } 6023 }
6781 6024
@@ -6803,10 +6046,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6803#endif 6046#endif
6804 cpu_attach_domain(sd, i); 6047 cpu_attach_domain(sd, i);
6805 } 6048 }
6806 /*
6807 * Tune cache-hot values:
6808 */
6809 calibrate_migration_costs(cpu_map);
6810 6049
6811 return 0; 6050 return 0;
6812 6051
@@ -7013,10 +6252,12 @@ void __init sched_init_smp(void)
7013 /* Move init over to a non-isolated CPU */ 6252 /* Move init over to a non-isolated CPU */
7014 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6253 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
7015 BUG(); 6254 BUG();
6255 sched_init_granularity();
7016} 6256}
7017#else 6257#else
7018void __init sched_init_smp(void) 6258void __init sched_init_smp(void)
7019{ 6259{
6260 sched_init_granularity();
7020} 6261}
7021#endif /* CONFIG_SMP */ 6262#endif /* CONFIG_SMP */
7022 6263
@@ -7030,28 +6271,51 @@ int in_sched_functions(unsigned long addr)
7030 && addr < (unsigned long)__sched_text_end); 6271 && addr < (unsigned long)__sched_text_end);
7031} 6272}
7032 6273
6274static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6275{
6276 cfs_rq->tasks_timeline = RB_ROOT;
6277 cfs_rq->fair_clock = 1;
6278#ifdef CONFIG_FAIR_GROUP_SCHED
6279 cfs_rq->rq = rq;
6280#endif
6281}
6282
7033void __init sched_init(void) 6283void __init sched_init(void)
7034{ 6284{
7035 int i, j, k; 6285 u64 now = sched_clock();
7036 int highest_cpu = 0; 6286 int highest_cpu = 0;
6287 int i, j;
6288
6289 /*
6290 * Link up the scheduling class hierarchy:
6291 */
6292 rt_sched_class.next = &fair_sched_class;
6293 fair_sched_class.next = &idle_sched_class;
6294 idle_sched_class.next = NULL;
7037 6295
7038 for_each_possible_cpu(i) { 6296 for_each_possible_cpu(i) {
7039 struct prio_array *array; 6297 struct rt_prio_array *array;
7040 struct rq *rq; 6298 struct rq *rq;
7041 6299
7042 rq = cpu_rq(i); 6300 rq = cpu_rq(i);
7043 spin_lock_init(&rq->lock); 6301 spin_lock_init(&rq->lock);
7044 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 6302 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7045 rq->nr_running = 0; 6303 rq->nr_running = 0;
7046 rq->active = rq->arrays; 6304 rq->clock = 1;
7047 rq->expired = rq->arrays + 1; 6305 init_cfs_rq(&rq->cfs, rq);
7048 rq->best_expired_prio = MAX_PRIO; 6306#ifdef CONFIG_FAIR_GROUP_SCHED
6307 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6308 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6309#endif
6310 rq->ls.load_update_last = now;
6311 rq->ls.load_update_start = now;
7049 6312
6313 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6314 rq->cpu_load[j] = 0;
7050#ifdef CONFIG_SMP 6315#ifdef CONFIG_SMP
7051 rq->sd = NULL; 6316 rq->sd = NULL;
7052 for (j = 1; j < 3; j++)
7053 rq->cpu_load[j] = 0;
7054 rq->active_balance = 0; 6317 rq->active_balance = 0;
6318 rq->next_balance = jiffies;
7055 rq->push_cpu = 0; 6319 rq->push_cpu = 0;
7056 rq->cpu = i; 6320 rq->cpu = i;
7057 rq->migration_thread = NULL; 6321 rq->migration_thread = NULL;
@@ -7059,16 +6323,14 @@ void __init sched_init(void)
7059#endif 6323#endif
7060 atomic_set(&rq->nr_iowait, 0); 6324 atomic_set(&rq->nr_iowait, 0);
7061 6325
7062 for (j = 0; j < 2; j++) { 6326 array = &rq->rt.active;
7063 array = rq->arrays + j; 6327 for (j = 0; j < MAX_RT_PRIO; j++) {
7064 for (k = 0; k < MAX_PRIO; k++) { 6328 INIT_LIST_HEAD(array->queue + j);
7065 INIT_LIST_HEAD(array->queue + k); 6329 __clear_bit(j, array->bitmap);
7066 __clear_bit(k, array->bitmap);
7067 }
7068 // delimiter for bitsearch
7069 __set_bit(MAX_PRIO, array->bitmap);
7070 } 6330 }
7071 highest_cpu = i; 6331 highest_cpu = i;
6332 /* delimiter for bitsearch: */
6333 __set_bit(MAX_RT_PRIO, array->bitmap);
7072 } 6334 }
7073 6335
7074 set_load_weight(&init_task); 6336 set_load_weight(&init_task);
@@ -7095,6 +6357,10 @@ void __init sched_init(void)
7095 * when this runqueue becomes "idle". 6357 * when this runqueue becomes "idle".
7096 */ 6358 */
7097 init_idle(current, smp_processor_id()); 6359 init_idle(current, smp_processor_id());
6360 /*
6361 * During early bootup we pretend to be a normal task:
6362 */
6363 current->sched_class = &fair_sched_class;
7098} 6364}
7099 6365
7100#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6366#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7125,29 +6391,55 @@ EXPORT_SYMBOL(__might_sleep);
7125#ifdef CONFIG_MAGIC_SYSRQ 6391#ifdef CONFIG_MAGIC_SYSRQ
7126void normalize_rt_tasks(void) 6392void normalize_rt_tasks(void)
7127{ 6393{
7128 struct prio_array *array;
7129 struct task_struct *g, *p; 6394 struct task_struct *g, *p;
7130 unsigned long flags; 6395 unsigned long flags;
7131 struct rq *rq; 6396 struct rq *rq;
6397 int on_rq;
7132 6398
7133 read_lock_irq(&tasklist_lock); 6399 read_lock_irq(&tasklist_lock);
7134
7135 do_each_thread(g, p) { 6400 do_each_thread(g, p) {
7136 if (!rt_task(p)) 6401 p->se.fair_key = 0;
6402 p->se.wait_runtime = 0;
6403 p->se.wait_start_fair = 0;
6404 p->se.wait_start = 0;
6405 p->se.exec_start = 0;
6406 p->se.sleep_start = 0;
6407 p->se.sleep_start_fair = 0;
6408 p->se.block_start = 0;
6409 task_rq(p)->cfs.fair_clock = 0;
6410 task_rq(p)->clock = 0;
6411
6412 if (!rt_task(p)) {
6413 /*
6414 * Renice negative nice level userspace
6415 * tasks back to 0:
6416 */
6417 if (TASK_NICE(p) < 0 && p->mm)
6418 set_user_nice(p, 0);
7137 continue; 6419 continue;
6420 }
7138 6421
7139 spin_lock_irqsave(&p->pi_lock, flags); 6422 spin_lock_irqsave(&p->pi_lock, flags);
7140 rq = __task_rq_lock(p); 6423 rq = __task_rq_lock(p);
6424#ifdef CONFIG_SMP
6425 /*
6426 * Do not touch the migration thread:
6427 */
6428 if (p == rq->migration_thread)
6429 goto out_unlock;
6430#endif
7141 6431
7142 array = p->array; 6432 on_rq = p->se.on_rq;
7143 if (array) 6433 if (on_rq)
7144 deactivate_task(p, task_rq(p)); 6434 deactivate_task(task_rq(p), p, 0);
7145 __setscheduler(p, SCHED_NORMAL, 0); 6435 __setscheduler(rq, p, SCHED_NORMAL, 0);
7146 if (array) { 6436 if (on_rq) {
7147 __activate_task(p, task_rq(p)); 6437 activate_task(task_rq(p), p, 0);
7148 resched_task(rq->curr); 6438 resched_task(rq->curr);
7149 } 6439 }
7150 6440#ifdef CONFIG_SMP
6441 out_unlock:
6442#endif
7151 __task_rq_unlock(rq); 6443 __task_rq_unlock(rq);
7152 spin_unlock_irqrestore(&p->pi_lock, flags); 6444 spin_unlock_irqrestore(&p->pi_lock, flags);
7153 } while_each_thread(g, p); 6445 } while_each_thread(g, p);
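
sched_init() above links the scheduling classes into a simple chain (rt, then fair, then idle). The loop that consumes that chain lives elsewhere in sched.c and is not part of this hunk; the standalone sketch below, with invented toy names, only illustrates the pick-from-highest-class-first pattern that the linking implies:

#include <stdio.h>

/* Toy stand-ins for the kernel's sched_class chain; all names here are invented. */
struct toy_rq { int rt_ready; int fair_ready; };

struct toy_class {
        const char *name;
        const struct toy_class *next;
        /* Return a non-NULL "task" if this class has something runnable. */
        const char *(*pick_next)(struct toy_rq *rq);
};

static const char *pick_rt(struct toy_rq *rq)   { return rq->rt_ready ? "an rt task" : NULL; }
static const char *pick_fair(struct toy_rq *rq) { return rq->fair_ready ? "a fair task" : NULL; }
static const char *pick_idle(struct toy_rq *rq) { (void)rq; return "the idle task"; }

static const struct toy_class idle_class = { "idle", NULL,        pick_idle };
static const struct toy_class fair_class = { "fair", &idle_class, pick_fair };
static const struct toy_class rt_class   = { "rt",   &fair_class, pick_rt };

int main(void)
{
        struct toy_rq rq = { 0, 1 };            /* no RT tasks, one fair task */
        const struct toy_class *class;

        /* Walk the chain highest-priority class first, as the linking order suggests. */
        for (class = &rt_class; class; class = class->next) {
                const char *p = class->pick_next(&rq);
                if (p) {
                        printf("%s class picked %s\n", class->name, p);
                        break;
                }
        }
        return 0;
}
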
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
new file mode 100644
index 0000000000..29f2c21e7d
--- /dev/null
+++ b/kernel/sched_debug.c
@@ -0,0 +1,275 @@
1/*
2 * kernel/sched_debug.c
3 *
4 * Print the CFS rbtree
5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched.h>
15#include <linux/seq_file.h>
16#include <linux/kallsyms.h>
17#include <linux/utsname.h>
18
19/*
20 * This allows printing both to /proc/sched_debug and
21 * to the console
22 */
23#define SEQ_printf(m, x...) \
24 do { \
25 if (m) \
26 seq_printf(m, x); \
27 else \
28 printk(x); \
29 } while (0)
30
31static void
32print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now)
33{
34 if (rq->curr == p)
35 SEQ_printf(m, "R");
36 else
37 SEQ_printf(m, " ");
38
39 SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d "
40 "%15Ld %15Ld %15Ld %15Ld %15Ld\n",
41 p->comm, p->pid,
42 (long long)p->se.fair_key,
43 (long long)(p->se.fair_key - rq->cfs.fair_clock),
44 (long long)p->se.wait_runtime,
45 (long long)(p->nvcsw + p->nivcsw),
46 p->prio,
47 (long long)p->se.sum_exec_runtime,
48 (long long)p->se.sum_wait_runtime,
49 (long long)p->se.sum_sleep_runtime,
50 (long long)p->se.wait_runtime_overruns,
51 (long long)p->se.wait_runtime_underruns);
52}
53
54static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now)
55{
56 struct task_struct *g, *p;
57
58 SEQ_printf(m,
59 "\nrunnable tasks:\n"
60 " task PID tree-key delta waiting"
61 " switches prio"
62 " sum-exec sum-wait sum-sleep"
63 " wait-overrun wait-underrun\n"
64 "------------------------------------------------------------------"
65 "----------------"
66 "------------------------------------------------"
67 "--------------------------------\n");
68
69 read_lock_irq(&tasklist_lock);
70
71 do_each_thread(g, p) {
72 if (!p->se.on_rq || task_cpu(p) != rq_cpu)
73 continue;
74
75 print_task(m, rq, p, now);
76 } while_each_thread(g, p);
77
78 read_unlock_irq(&tasklist_lock);
79}
80
81static void
82print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
83{
84 s64 wait_runtime_rq_sum = 0;
85 struct task_struct *p;
86 struct rb_node *curr;
87 unsigned long flags;
88 struct rq *rq = &per_cpu(runqueues, cpu);
89
90 spin_lock_irqsave(&rq->lock, flags);
91 curr = first_fair(cfs_rq);
92 while (curr) {
93 p = rb_entry(curr, struct task_struct, se.run_node);
94 wait_runtime_rq_sum += p->se.wait_runtime;
95
96 curr = rb_next(curr);
97 }
98 spin_unlock_irqrestore(&rq->lock, flags);
99
100 SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum",
101 (long long)wait_runtime_rq_sum);
102}
103
104void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
105{
106 SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq);
107
108#define P(x) \
109 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x))
110
111 P(fair_clock);
112 P(exec_clock);
113 P(wait_runtime);
114 P(wait_runtime_overruns);
115 P(wait_runtime_underruns);
116 P(sleeper_bonus);
117#undef P
118
119 print_cfs_rq_runtime_sum(m, cpu, cfs_rq);
120}
121
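
The P() helper above leans on the preprocessor's stringize operator, so P(fair_clock) prints the literal field name next to its value. A minimal standalone sketch of the same pattern, using an invented toy struct and plain printf in place of SEQ_printf:

#include <stdio.h>

struct toy_cfs_rq { long long fair_clock; long long exec_clock; };

int main(void)
{
        struct toy_cfs_rq rq = { 123456789LL, 42LL };

/* #x stringizes the member name so it becomes its own label, as in print_cfs_rq(). */
#define P(x) printf("  .%-30s: %lld\n", #x, (long long)(rq.x))
        P(fair_clock);
        P(exec_clock);
#undef P
        return 0;
}
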
122static void print_cpu(struct seq_file *m, int cpu, u64 now)
123{
124 struct rq *rq = &per_cpu(runqueues, cpu);
125
126#ifdef CONFIG_X86
127 {
128 unsigned int freq = cpu_khz ? : 1;
129
130 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
131 cpu, freq / 1000, (freq % 1000));
132 }
133#else
134 SEQ_printf(m, "\ncpu#%d\n", cpu);
135#endif
136
137#define P(x) \
138 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
139
140 P(nr_running);
141 SEQ_printf(m, " .%-30s: %lu\n", "load",
142 rq->ls.load.weight);
143 P(ls.delta_fair);
144 P(ls.delta_exec);
145 P(nr_switches);
146 P(nr_load_updates);
147 P(nr_uninterruptible);
148 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
149 P(next_balance);
150 P(curr->pid);
151 P(clock);
152 P(prev_clock_raw);
153 P(clock_warps);
154 P(clock_overflows);
155 P(clock_unstable_events);
156 P(clock_max_delta);
157 P(cpu_load[0]);
158 P(cpu_load[1]);
159 P(cpu_load[2]);
160 P(cpu_load[3]);
161 P(cpu_load[4]);
162#undef P
163
164 print_cfs_stats(m, cpu, now);
165
166 print_rq(m, rq, cpu, now);
167}
168
169static int sched_debug_show(struct seq_file *m, void *v)
170{
171 u64 now = ktime_to_ns(ktime_get());
172 int cpu;
173
174 SEQ_printf(m, "Sched Debug Version: v0.05, %s %.*s\n",
175 init_utsname()->release,
176 (int)strcspn(init_utsname()->version, " "),
177 init_utsname()->version);
178
179 SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now);
180
181 for_each_online_cpu(cpu)
182 print_cpu(m, cpu, now);
183
184 SEQ_printf(m, "\n");
185
186 return 0;
187}
188
189void sysrq_sched_debug_show(void)
190{
191 sched_debug_show(NULL, NULL);
192}
193
194static int sched_debug_open(struct inode *inode, struct file *filp)
195{
196 return single_open(filp, sched_debug_show, NULL);
197}
198
199static struct file_operations sched_debug_fops = {
200 .open = sched_debug_open,
201 .read = seq_read,
202 .llseek = seq_lseek,
203 .release = seq_release,
204};
205
206static int __init init_sched_debug_procfs(void)
207{
208 struct proc_dir_entry *pe;
209
210 pe = create_proc_entry("sched_debug", 0644, NULL);
211 if (!pe)
212 return -ENOMEM;
213
214 pe->proc_fops = &sched_debug_fops;
215
216 return 0;
217}
218
219__initcall(init_sched_debug_procfs);
220
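
Because create_proc_entry() above registers the entry as "sched_debug" in the top-level procfs directory, the dump should be readable at /proc/sched_debug on a kernel that carries this file. A minimal userspace reader, for illustration only:

#include <stdio.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/sched_debug", "r");

        if (!f) {
                perror("fopen /proc/sched_debug");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}
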
221void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
222{
223 unsigned long flags;
224 int num_threads = 1;
225
226 rcu_read_lock();
227 if (lock_task_sighand(p, &flags)) {
228 num_threads = atomic_read(&p->signal->count);
229 unlock_task_sighand(p, &flags);
230 }
231 rcu_read_unlock();
232
233 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
234 SEQ_printf(m, "----------------------------------------------\n");
235#define P(F) \
236 SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
237
238 P(se.wait_start);
239 P(se.wait_start_fair);
240 P(se.exec_start);
241 P(se.sleep_start);
242 P(se.sleep_start_fair);
243 P(se.block_start);
244 P(se.sleep_max);
245 P(se.block_max);
246 P(se.exec_max);
247 P(se.wait_max);
248 P(se.wait_runtime);
249 P(se.wait_runtime_overruns);
250 P(se.wait_runtime_underruns);
251 P(se.sum_wait_runtime);
252 P(se.sum_exec_runtime);
253 SEQ_printf(m, "%-25s:%20Ld\n",
254 "nr_switches", (long long)(p->nvcsw + p->nivcsw));
255 P(se.load.weight);
256 P(policy);
257 P(prio);
258#undef P
259
260 {
261 u64 t0, t1;
262
263 t0 = sched_clock();
264 t1 = sched_clock();
265 SEQ_printf(m, "%-25s:%20Ld\n",
266 "clock-delta", (long long)(t1-t0));
267 }
268}
269
270void proc_sched_set_task(struct task_struct *p)
271{
272 p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0;
273 p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
274 p->se.sum_exec_runtime = 0;
275}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
new file mode 100644
index 0000000000..6971db0a71
--- /dev/null
+++ b/kernel/sched_fair.c
@@ -0,0 +1,1131 @@
1/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 */
19
20/*
21 * Preemption granularity:
22 * (default: 2 msec, units: nanoseconds)
23 *
24 * NOTE: this granularity value is not the same as the concept of
25 * 'timeslice length' - timeslices in CFS will typically be somewhat
26 * larger than this value. (to see the precise effective timeslice
27 * length of your workload, run vmstat and monitor the context-switches
28 * field)
29 *
30 * On SMP systems the value of this is multiplied by the log2 of the
31 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
32 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
33 */
34unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ;
35
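
sched_init_granularity(), which applies the SMP scaling described in the comment above, is not shown in this excerpt. Going only by the multipliers the comment lists (2x on 2-way, 3x on 4-way, and so on, i.e. 1 plus log2 of the CPU count), and assuming HZ=1000 so the base default works out to 2,000,000 ns, a rough sketch of what that scaling amounts to:

#include <stdio.h>

/* Multiplier described in the comment above: 2x on 2-way, 3x on 4-way, ... */
static unsigned int smp_factor(unsigned int ncpus)
{
        unsigned int factor = 1;

        while (ncpus > 1) {                     /* integer log2 */
                ncpus >>= 1;
                factor++;
        }
        return factor;
}

int main(void)
{
        /* Default from above, assuming HZ=1000: 2000000000ULL/HZ = 2,000,000 ns. */
        unsigned long long base_gran = 2000000000ULL / 1000;
        unsigned int cpus;

        for (cpus = 1; cpus <= 16; cpus *= 2)
                printf("%2u CPUs -> granularity ~%llu ns\n",
                       cpus, base_gran * smp_factor(cpus));
        return 0;
}
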
36/*
37 * SCHED_BATCH wake-up granularity.
38 * (default: 10 msec, units: nanoseconds)
39 *
40 * This option delays the preemption effects of decoupled workloads
41 * and reduces their over-scheduling. Synchronous workloads will still
42 * have immediate wakeup/sleep latencies.
43 */
44unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
45 10000000000ULL/HZ;
46
47/*
48 * SCHED_OTHER wake-up granularity.
49 * (default: 1 msec, units: nanoseconds)
50 *
51 * This option delays the preemption effects of decoupled workloads
52 * and reduces their over-scheduling. Synchronous workloads will still
53 * have immediate wakeup/sleep latencies.
54 */
55unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ;
56
57unsigned int sysctl_sched_stat_granularity __read_mostly;
58
59/*
60 * Initialized in sched_init_granularity():
61 */
62unsigned int sysctl_sched_runtime_limit __read_mostly;
63
64/*
65 * Debugging: various feature bits
66 */
67enum {
68 SCHED_FEAT_FAIR_SLEEPERS = 1,
69 SCHED_FEAT_SLEEPER_AVG = 2,
70 SCHED_FEAT_SLEEPER_LOAD_AVG = 4,
71 SCHED_FEAT_PRECISE_CPU_LOAD = 8,
72 SCHED_FEAT_START_DEBIT = 16,
73 SCHED_FEAT_SKIP_INITIAL = 32,
74};
75
76unsigned int sysctl_sched_features __read_mostly =
77 SCHED_FEAT_FAIR_SLEEPERS *1 |
78 SCHED_FEAT_SLEEPER_AVG *1 |
79 SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
80 SCHED_FEAT_PRECISE_CPU_LOAD *1 |
81 SCHED_FEAT_START_DEBIT *1 |
82 SCHED_FEAT_SKIP_INITIAL *0;
83
84extern struct sched_class fair_sched_class;
85
86/**************************************************************
87 * CFS operations on generic schedulable entities:
88 */
89
90#ifdef CONFIG_FAIR_GROUP_SCHED
91
92/* cpu runqueue to which this cfs_rq is attached */
93static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
94{
95 return cfs_rq->rq;
96}
97
98/* currently running entity (if any) on this cfs_rq */
99static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
100{
101 return cfs_rq->curr;
102}
103
104/* An entity is a task if it doesn't "own" a runqueue */
105#define entity_is_task(se) (!se->my_q)
106
107static inline void
108set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
109{
110 cfs_rq->curr = se;
111}
112
113#else /* CONFIG_FAIR_GROUP_SCHED */
114
115static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
116{
117 return container_of(cfs_rq, struct rq, cfs);
118}
119
120static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
121{
122 struct rq *rq = rq_of(cfs_rq);
123
124 if (unlikely(rq->curr->sched_class != &fair_sched_class))
125 return NULL;
126
127 return &rq->curr->se;
128}
129
130#define entity_is_task(se) 1
131
132static inline void
133set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
134
135#endif /* CONFIG_FAIR_GROUP_SCHED */
136
137static inline struct task_struct *task_of(struct sched_entity *se)
138{
139 return container_of(se, struct task_struct, se);
140}
141
142
143/**************************************************************
144 * Scheduling class tree data structure manipulation methods:
145 */
146
147/*
148 * Enqueue an entity into the rb-tree:
149 */
150static inline void
151__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
152{
153 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
154 struct rb_node *parent = NULL;
155 struct sched_entity *entry;
156 s64 key = se->fair_key;
157 int leftmost = 1;
158
159 /*
160 * Find the right place in the rbtree:
161 */
162 while (*link) {
163 parent = *link;
164 entry = rb_entry(parent, struct sched_entity, run_node);
165 /*
166 * We dont care about collisions. Nodes with
167 * the same key stay together.
168 */
169 if (key - entry->fair_key < 0) {
170 link = &parent->rb_left;
171 } else {
172 link = &parent->rb_right;
173 leftmost = 0;
174 }
175 }
176
177 /*
178 * Maintain a cache of leftmost tree entries (it is frequently
179 * used):
180 */
181 if (leftmost)
182 cfs_rq->rb_leftmost = &se->run_node;
183
184 rb_link_node(&se->run_node, parent, link);
185 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
186 update_load_add(&cfs_rq->load, se->load.weight);
187 cfs_rq->nr_running++;
188 se->on_rq = 1;
189}
190
191static inline void
192__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
193{
194 if (cfs_rq->rb_leftmost == &se->run_node)
195 cfs_rq->rb_leftmost = rb_next(&se->run_node);
196 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
197 update_load_sub(&cfs_rq->load, se->load.weight);
198 cfs_rq->nr_running--;
199 se->on_rq = 0;
200}
201
202static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
203{
204 return cfs_rq->rb_leftmost;
205}
206
207static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
208{
209 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
210}
211
212/**************************************************************
213 * Scheduling class statistics methods:
214 */
215
216/*
217 * We rescale the rescheduling granularity of tasks according to their
218 * nice level, but only linearly, not exponentially:
219 */
220static long
221niced_granularity(struct sched_entity *curr, unsigned long granularity)
222{
223 u64 tmp;
224
225 /*
226 * Negative nice levels get the same granularity as nice-0:
227 */
228 if (likely(curr->load.weight >= NICE_0_LOAD))
229 return granularity;
230 /*
231 * Positive nice level tasks get linearly finer
232 * granularity:
233 */
234 tmp = curr->load.weight * (u64)granularity;
235
236 /*
237 * It will always fit into 'long':
238 */
239 return (long) (tmp >> NICE_0_SHIFT);
240}
241
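
A quick numeric check of niced_granularity() above, assuming NICE_0_SHIFT = 10 and NICE_0_LOAD = 1024 (those constants are defined outside this diff) and a hypothetical positive-nice weight of 512:

#include <stdio.h>

/* Assumed values of the constants used by niced_granularity() (not in this hunk). */
#define NICE_0_SHIFT 10
#define NICE_0_LOAD  (1UL << NICE_0_SHIFT)

static long toy_niced_granularity(unsigned long weight, unsigned long granularity)
{
        unsigned long long tmp;

        if (weight >= NICE_0_LOAD)              /* negative/zero nice: unchanged     */
                return granularity;
        tmp = (unsigned long long)weight * granularity;
        return (long)(tmp >> NICE_0_SHIFT);     /* positive nice: linearly finer     */
}

int main(void)
{
        unsigned long gran = 2000000;           /* nice-0 default, assuming HZ=1000  */

        printf("%ld\n", toy_niced_granularity(1024, gran));  /* nice 0  -> 2000000  */
        printf("%ld\n", toy_niced_granularity(512, gran));   /* lighter -> 1000000  */
        return 0;
}
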
242static inline void
243limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se)
244{
245 long limit = sysctl_sched_runtime_limit;
246
247 /*
248 * Niced tasks have the same history dynamic range as
249 * non-niced tasks:
250 */
251 if (unlikely(se->wait_runtime > limit)) {
252 se->wait_runtime = limit;
253 schedstat_inc(se, wait_runtime_overruns);
254 schedstat_inc(cfs_rq, wait_runtime_overruns);
255 }
256 if (unlikely(se->wait_runtime < -limit)) {
257 se->wait_runtime = -limit;
258 schedstat_inc(se, wait_runtime_underruns);
259 schedstat_inc(cfs_rq, wait_runtime_underruns);
260 }
261}
262
263static inline void
264__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
265{
266 se->wait_runtime += delta;
267 schedstat_add(se, sum_wait_runtime, delta);
268 limit_wait_runtime(cfs_rq, se);
269}
270
271static void
272add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
273{
274 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
275 __add_wait_runtime(cfs_rq, se, delta);
276 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
277}
278
279/*
280 * Update the current task's runtime statistics. Skip current tasks that
281 * are not in our scheduling class.
282 */
283static inline void
284__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
285{
286 unsigned long delta, delta_exec, delta_fair;
287 long delta_mine;
288 struct load_weight *lw = &cfs_rq->load;
289 unsigned long load = lw->weight;
290
291 if (unlikely(!load))
292 return;
293
294 delta_exec = curr->delta_exec;
295#ifdef CONFIG_SCHEDSTATS
296 if (unlikely(delta_exec > curr->exec_max))
297 curr->exec_max = delta_exec;
298#endif
299
300 curr->sum_exec_runtime += delta_exec;
301 cfs_rq->exec_clock += delta_exec;
302
303 delta_fair = calc_delta_fair(delta_exec, lw);
304 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
305
306 if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) {
307 delta = calc_delta_mine(cfs_rq->sleeper_bonus,
308 curr->load.weight, lw);
309 if (unlikely(delta > cfs_rq->sleeper_bonus))
310 delta = cfs_rq->sleeper_bonus;
311
312 cfs_rq->sleeper_bonus -= delta;
313 delta_mine -= delta;
314 }
315
316 cfs_rq->fair_clock += delta_fair;
317 /*
318 * We executed delta_exec amount of time on the CPU,
319 * but we were only entitled to delta_mine amount of
320 * time during that period (if nr_running == 1 then
321 * the two values are equal)
322 * [Note: delta_mine - delta_exec is negative]:
323 */
324 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec);
325}
326
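
The wait_runtime adjustment above is easiest to follow with numbers. calc_delta_mine() is defined elsewhere in sched.c; for this sketch it is assumed to reduce to delta_exec * weight / total_load, its rough proportional meaning, and the sleeper bonus is ignored. With two runnable nice-0 tasks (assumed weight 1024 each), a task that actually ran for 4 ms was only entitled to half of that, so its wait_runtime drops by 2 ms:

#include <stdio.h>

/* Assumed proportional form of calc_delta_mine(): exec time scaled by weight share. */
static long long entitled(long long delta_exec, long long weight, long long total_load)
{
        return delta_exec * weight / total_load;
}

int main(void)
{
        long long delta_exec = 4000000;          /* ran 4 ms (in ns)                */
        long long weight     = 1024;             /* assumed nice-0 weight           */
        long long total      = 2 * 1024;         /* two nice-0 tasks on the cfs_rq  */
        long long delta_mine = entitled(delta_exec, weight, total);

        /* wait_runtime += delta_mine - delta_exec, as in __update_curr() */
        printf("delta_mine = %lld ns, wait_runtime change = %lld ns\n",
               delta_mine, delta_mine - delta_exec);    /* 2000000, -2000000        */
        return 0;
}
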
327static void update_curr(struct cfs_rq *cfs_rq, u64 now)
328{
329 struct sched_entity *curr = cfs_rq_curr(cfs_rq);
330 unsigned long delta_exec;
331
332 if (unlikely(!curr))
333 return;
334
335 /*
336 * Get the amount of time the current task was running
337 * since the last time we changed load (this cannot
338 * overflow on 32 bits):
339 */
340 delta_exec = (unsigned long)(now - curr->exec_start);
341
342 curr->delta_exec += delta_exec;
343
344 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
345 __update_curr(cfs_rq, curr, now);
346 curr->delta_exec = 0;
347 }
348 curr->exec_start = now;
349}
350
351static inline void
352update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
353{
354 se->wait_start_fair = cfs_rq->fair_clock;
355 se->wait_start = now;
356}
357
358/*
359 * We calculate fair deltas here, so protect against the random effects
360 * of a multiplication overflow by capping it to the runtime limit:
361 */
362#if BITS_PER_LONG == 32
363static inline unsigned long
364calc_weighted(unsigned long delta, unsigned long weight, int shift)
365{
366 u64 tmp = (u64)delta * weight >> shift;
367
368 if (unlikely(tmp > sysctl_sched_runtime_limit*2))
369 return sysctl_sched_runtime_limit*2;
370 return tmp;
371}
372#else
373static inline unsigned long
374calc_weighted(unsigned long delta, unsigned long weight, int shift)
375{
376 return delta * weight >> shift;
377}
378#endif
379
380/*
381 * Task is being enqueued - update stats:
382 */
383static void
384update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
385{
386 s64 key;
387
388 /*
389 * Are we enqueueing a waiting task? (for current tasks
390 * a dequeue/enqueue event is a NOP)
391 */
392 if (se != cfs_rq_curr(cfs_rq))
393 update_stats_wait_start(cfs_rq, se, now);
394 /*
395 * Update the key:
396 */
397 key = cfs_rq->fair_clock;
398
399 /*
400 * Optimize the common nice 0 case:
401 */
402 if (likely(se->load.weight == NICE_0_LOAD)) {
403 key -= se->wait_runtime;
404 } else {
405 u64 tmp;
406
407 if (se->wait_runtime < 0) {
408 tmp = -se->wait_runtime;
409 key += (tmp * se->load.inv_weight) >>
410 (WMULT_SHIFT - NICE_0_SHIFT);
411 } else {
412 tmp = se->wait_runtime;
413 key -= (tmp * se->load.weight) >> NICE_0_SHIFT;
414 }
415 }
416
417 se->fair_key = key;
418}
419
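
In the common nice-0 case above the key is simply fair_clock minus wait_runtime, so a task that is owed CPU time sorts further to the left of the timeline and is picked sooner. For example, with fair_clock at 10,000,000 ns, a task owed 2 ms of wait_runtime is keyed at 8,000,000 ns, while one that overdrew by 1 ms is keyed at 11,000,000 ns and has to wait its turn.
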
420/*
421 * Note: must be called with a freshly updated rq->fair_clock.
422 */
423static inline void
424__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
425{
426 unsigned long delta_fair = se->delta_fair_run;
427
428#ifdef CONFIG_SCHEDSTATS
429 {
430 s64 delta_wait = now - se->wait_start;
431 if (unlikely(delta_wait > se->wait_max))
432 se->wait_max = delta_wait;
433 }
434#endif
435
436 if (unlikely(se->load.weight != NICE_0_LOAD))
437 delta_fair = calc_weighted(delta_fair, se->load.weight,
438 NICE_0_SHIFT);
439
440 add_wait_runtime(cfs_rq, se, delta_fair);
441}
442
443static void
444update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
445{
446 unsigned long delta_fair;
447
448 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
449 (u64)(cfs_rq->fair_clock - se->wait_start_fair));
450
451 se->delta_fair_run += delta_fair;
452 if (unlikely(abs(se->delta_fair_run) >=
453 sysctl_sched_stat_granularity)) {
454 __update_stats_wait_end(cfs_rq, se, now);
455 se->delta_fair_run = 0;
456 }
457
458 se->wait_start_fair = 0;
459 se->wait_start = 0;
460}
461
462static inline void
463update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
464{
465 update_curr(cfs_rq, now);
466 /*
467 * Mark the end of the wait period if dequeueing a
468 * waiting task:
469 */
470 if (se != cfs_rq_curr(cfs_rq))
471 update_stats_wait_end(cfs_rq, se, now);
472}
473
474/*
475 * We are picking a new current task - update its stats:
476 */
477static inline void
478update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
479{
480 /*
481 * We are starting a new run period:
482 */
483 se->exec_start = now;
484}
485
486/*
487 * We are descheduling a task - update its stats:
488 */
489static inline void
490update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
491{
492 se->exec_start = 0;
493}
494
495/**************************************************
496 * Scheduling class queueing methods:
497 */
498
499static void
500__enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
501{
502 unsigned long load = cfs_rq->load.weight, delta_fair;
503 long prev_runtime;
504
505 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
506 load = rq_of(cfs_rq)->cpu_load[2];
507
508 delta_fair = se->delta_fair_sleep;
509
510 /*
511 * Fix up delta_fair with the effect of us running
512 * during the whole sleep period:
513 */
514 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
515 delta_fair = div64_likely32((u64)delta_fair * load,
516 load + se->load.weight);
517
518 if (unlikely(se->load.weight != NICE_0_LOAD))
519 delta_fair = calc_weighted(delta_fair, se->load.weight,
520 NICE_0_SHIFT);
521
522 prev_runtime = se->wait_runtime;
523 __add_wait_runtime(cfs_rq, se, delta_fair);
524 delta_fair = se->wait_runtime - prev_runtime;
525
526 /*
527 * Track the amount of bonus we've given to sleepers:
528 */
529 cfs_rq->sleeper_bonus += delta_fair;
530
531 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
532}
533
534static void
535enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
536{
537 struct task_struct *tsk = task_of(se);
538 unsigned long delta_fair;
539
540 if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
541 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS))
542 return;
543
544 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
545 (u64)(cfs_rq->fair_clock - se->sleep_start_fair));
546
547 se->delta_fair_sleep += delta_fair;
548 if (unlikely(abs(se->delta_fair_sleep) >=
549 sysctl_sched_stat_granularity)) {
550 __enqueue_sleeper(cfs_rq, se, now);
551 se->delta_fair_sleep = 0;
552 }
553
554 se->sleep_start_fair = 0;
555
556#ifdef CONFIG_SCHEDSTATS
557 if (se->sleep_start) {
558 u64 delta = now - se->sleep_start;
559
560 if ((s64)delta < 0)
561 delta = 0;
562
563 if (unlikely(delta > se->sleep_max))
564 se->sleep_max = delta;
565
566 se->sleep_start = 0;
567 se->sum_sleep_runtime += delta;
568 }
569 if (se->block_start) {
570 u64 delta = now - se->block_start;
571
572 if ((s64)delta < 0)
573 delta = 0;
574
575 if (unlikely(delta > se->block_max))
576 se->block_max = delta;
577
578 se->block_start = 0;
579 se->sum_sleep_runtime += delta;
580 }
581#endif
582}
583
584static void
585enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
586 int wakeup, u64 now)
587{
588 /*
589 * Update the fair clock.
590 */
591 update_curr(cfs_rq, now);
592
593 if (wakeup)
594 enqueue_sleeper(cfs_rq, se, now);
595
596 update_stats_enqueue(cfs_rq, se, now);
597 __enqueue_entity(cfs_rq, se);
598}
599
600static void
601dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
602 int sleep, u64 now)
603{
604 update_stats_dequeue(cfs_rq, se, now);
605 if (sleep) {
606 se->sleep_start_fair = cfs_rq->fair_clock;
607#ifdef CONFIG_SCHEDSTATS
608 if (entity_is_task(se)) {
609 struct task_struct *tsk = task_of(se);
610
611 if (tsk->state & TASK_INTERRUPTIBLE)
612 se->sleep_start = now;
613 if (tsk->state & TASK_UNINTERRUPTIBLE)
614 se->block_start = now;
615 }
616 cfs_rq->wait_runtime -= se->wait_runtime;
617#endif
618 }
619 __dequeue_entity(cfs_rq, se);
620}
621
622/*
623 * Preempt the current task with a newly woken task if needed:
624 */
625static void
626__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
627 struct sched_entity *curr, unsigned long granularity)
628{
629 s64 __delta = curr->fair_key - se->fair_key;
630
631 /*
632 * Take scheduling granularity into account - do not
633 * preempt the current task unless the best task has
634 * a larger than sched_granularity fairness advantage:
635 */
636 if (__delta > niced_granularity(curr, granularity))
637 resched_task(rq_of(cfs_rq)->curr);
638}
639
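
Continuing that example: if the current entity's key is 11,000,000 ns and the candidate's key is 8,000,000 ns, the candidate leads by 3,000,000 ns; taking the 2 ms nice-0 tick granularity as the threshold, that exceeds niced_granularity() and resched_task() fires. A lead of only 1 ms would have been absorbed and the current task left running.
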
640static inline void
641set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
642{
643 /*
644 * Any task has to be enqueued before it get to execute on
645 * a CPU. So account for the time it spent waiting on the
646 * runqueue. (note, here we rely on pick_next_task() having
647 * done a put_prev_task_fair() shortly before this, which
648 * updated rq->fair_clock - used by update_stats_wait_end())
649 */
650 update_stats_wait_end(cfs_rq, se, now);
651 update_stats_curr_start(cfs_rq, se, now);
652 set_cfs_rq_curr(cfs_rq, se);
653}
654
655static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
656{
657 struct sched_entity *se = __pick_next_entity(cfs_rq);
658
659 set_next_entity(cfs_rq, se, now);
660
661 return se;
662}
663
664static void
665put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now)
666{
667 /*
668 * If still on the runqueue then deactivate_task()
669 * was not called and update_curr() has to be done:
670 */
671 if (prev->on_rq)
672 update_curr(cfs_rq, now);
673
674 update_stats_curr_end(cfs_rq, prev, now);
675
676 if (prev->on_rq)
677 update_stats_wait_start(cfs_rq, prev, now);
678 set_cfs_rq_curr(cfs_rq, NULL);
679}
680
681static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
682{
683 struct rq *rq = rq_of(cfs_rq);
684 struct sched_entity *next;
685 u64 now = __rq_clock(rq);
686
687 /*
688 * Dequeue and enqueue the task to update its
689 * position within the tree:
690 */
691 dequeue_entity(cfs_rq, curr, 0, now);
692 enqueue_entity(cfs_rq, curr, 0, now);
693
694 /*
695 * Reschedule if another task tops the current one.
696 */
697 next = __pick_next_entity(cfs_rq);
698 if (next == curr)
699 return;
700
701 __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity);
702}
703
704/**************************************************
705 * CFS operations on tasks:
706 */
707
708#ifdef CONFIG_FAIR_GROUP_SCHED
709
710/* Walk up scheduling entities hierarchy */
711#define for_each_sched_entity(se) \
712 for (; se; se = se->parent)
713
714static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
715{
716 return p->se.cfs_rq;
717}
718
719/* runqueue on which this entity is (to be) queued */
720static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
721{
722 return se->cfs_rq;
723}
724
725/* runqueue "owned" by this group */
726static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
727{
728 return grp->my_q;
729}
730
731/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
732 * another cpu ('this_cpu')
733 */
734static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
735{
736 /* A later patch will take group into account */
737 return &cpu_rq(this_cpu)->cfs;
738}
739
740/* Iterate through all leaf cfs_rq's on a runqueue */
741#define for_each_leaf_cfs_rq(rq, cfs_rq) \
742 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
743
744/* Do the two (enqueued) tasks belong to the same group ? */
745static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
746{
747 if (curr->se.cfs_rq == p->se.cfs_rq)
748 return 1;
749
750 return 0;
751}
752
753#else /* CONFIG_FAIR_GROUP_SCHED */
754
755#define for_each_sched_entity(se) \
756 for (; se; se = NULL)
757
758static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
759{
760 return &task_rq(p)->cfs;
761}
762
763static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
764{
765 struct task_struct *p = task_of(se);
766 struct rq *rq = task_rq(p);
767
768 return &rq->cfs;
769}
770
771/* runqueue "owned" by this group */
772static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
773{
774 return NULL;
775}
776
777static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
778{
779 return &cpu_rq(this_cpu)->cfs;
780}
781
782#define for_each_leaf_cfs_rq(rq, cfs_rq) \
783 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
784
785static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
786{
787 return 1;
788}
789
790#endif /* CONFIG_FAIR_GROUP_SCHED */
791
792/*
793 * The enqueue_task method is called before nr_running is
794 * increased. Here we update the fair scheduling stats and
795 * then put the task into the rbtree:
796 */
797static void
798enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
799{
800 struct cfs_rq *cfs_rq;
801 struct sched_entity *se = &p->se;
802
803 for_each_sched_entity(se) {
804 if (se->on_rq)
805 break;
806 cfs_rq = cfs_rq_of(se);
807 enqueue_entity(cfs_rq, se, wakeup, now);
808 }
809}
810
811/*
812 * The dequeue_task method is called before nr_running is
813 * decreased. We remove the task from the rbtree and
814 * update the fair scheduling stats:
815 */
816static void
817dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
818{
819 struct cfs_rq *cfs_rq;
820 struct sched_entity *se = &p->se;
821
822 for_each_sched_entity(se) {
823 cfs_rq = cfs_rq_of(se);
824 dequeue_entity(cfs_rq, se, sleep, now);
825 /* Don't dequeue parent if it has other entities besides us */
826 if (cfs_rq->load.weight)
827 break;
828 }
829}
830
831/*
832 * sched_yield() support is very simple - we dequeue and enqueue
833 */
834static void yield_task_fair(struct rq *rq, struct task_struct *p)
835{
836 struct cfs_rq *cfs_rq = task_cfs_rq(p);
837 u64 now = __rq_clock(rq);
838
839 /*
840 * Dequeue and enqueue the task to update its
841 * position within the tree:
842 */
843 dequeue_entity(cfs_rq, &p->se, 0, now);
844 enqueue_entity(cfs_rq, &p->se, 0, now);
845}
846
847/*
848 * Preempt the current task with a newly woken task if needed:
849 */
850static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
851{
852 struct task_struct *curr = rq->curr;
853 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
854 unsigned long gran;
855
856 if (unlikely(rt_prio(p->prio))) {
857 update_curr(cfs_rq, rq_clock(rq));
858 resched_task(curr);
859 return;
860 }
861
862 gran = sysctl_sched_wakeup_granularity;
863 /*
864 * Batch tasks prefer throughput over latency:
865 */
866 if (unlikely(p->policy == SCHED_BATCH))
867 gran = sysctl_sched_batch_wakeup_granularity;
868
869 if (is_same_group(curr, p))
870 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
871}
872
873static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now)
874{
875 struct cfs_rq *cfs_rq = &rq->cfs;
876 struct sched_entity *se;
877
878 if (unlikely(!cfs_rq->nr_running))
879 return NULL;
880
881 do {
882 se = pick_next_entity(cfs_rq, now);
883 cfs_rq = group_cfs_rq(se);
884 } while (cfs_rq);
885
886 return task_of(se);
887}
888
889/*
890 * Account for a descheduled task:
891 */
892static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now)
893{
894 struct sched_entity *se = &prev->se;
895 struct cfs_rq *cfs_rq;
896
897 for_each_sched_entity(se) {
898 cfs_rq = cfs_rq_of(se);
899 put_prev_entity(cfs_rq, se, now);
900 }
901}
902
903/**************************************************
904 * Fair scheduling class load-balancing methods:
905 */
906
907/*
908 * Load-balancing iterator. Note: while the runqueue stays locked
909 * during the whole iteration, the current task might be
910 * dequeued so the iterator has to be dequeue-safe. Here we
911 * achieve that by always pre-iterating before returning
912 * the current task:
913 */
914static inline struct task_struct *
915__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
916{
917 struct task_struct *p;
918
919 if (!curr)
920 return NULL;
921
922 p = rb_entry(curr, struct task_struct, se.run_node);
923 cfs_rq->rb_load_balance_curr = rb_next(curr);
924
925 return p;
926}
927
928static struct task_struct *load_balance_start_fair(void *arg)
929{
930 struct cfs_rq *cfs_rq = arg;
931
932 return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
933}
934
935static struct task_struct *load_balance_next_fair(void *arg)
936{
937 struct cfs_rq *cfs_rq = arg;
938
939 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
940}
941
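
The dequeue-safe trick described above (advance to the successor before handing back the current element) is generic. A standalone sketch on a plain singly linked list, with invented names, showing why the caller can safely unlink what the iterator just returned:

#include <stdio.h>

struct node { int val; struct node *next; };

static struct node *it_curr;                    /* pre-fetched iterator cursor       */

static struct node *it_start(struct node *head)
{
        struct node *p = head;

        if (p)
                it_curr = p->next;              /* pre-iterate before returning p    */
        return p;
}

static struct node *it_next(void)
{
        struct node *p = it_curr;

        if (p)
                it_curr = p->next;              /* again: advance first, return after */
        return p;
}

int main(void)
{
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        struct node *p;

        for (p = it_start(&a); p; p = it_next()) {
                printf("visiting %d\n", p->val);
                /* The caller may unlink p (e.g. "migrate" it) without breaking the
                 * walk, because the iterator already holds p->next. */
                if (p == &b)
                        a.next = b.next;        /* unlink b mid-iteration            */
        }
        return 0;
}
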
942static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
943{
944 struct sched_entity *curr;
945 struct task_struct *p;
946
947 if (!cfs_rq->nr_running)
948 return MAX_PRIO;
949
950 curr = __pick_next_entity(cfs_rq);
951 p = task_of(curr);
952
953 return p->prio;
954}
955
956static int
957load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
958 unsigned long max_nr_move, unsigned long max_load_move,
959 struct sched_domain *sd, enum cpu_idle_type idle,
960 int *all_pinned, unsigned long *total_load_moved)
961{
962 struct cfs_rq *busy_cfs_rq;
963 unsigned long load_moved, total_nr_moved = 0, nr_moved;
964 long rem_load_move = max_load_move;
965 struct rq_iterator cfs_rq_iterator;
966
967 cfs_rq_iterator.start = load_balance_start_fair;
968 cfs_rq_iterator.next = load_balance_next_fair;
969
970 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
971 struct cfs_rq *this_cfs_rq;
972 long imbalance;
973 unsigned long maxload;
974 int this_best_prio, best_prio, best_prio_seen = 0;
975
976 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
977
978 imbalance = busy_cfs_rq->load.weight -
979 this_cfs_rq->load.weight;
980 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
981 if (imbalance <= 0)
982 continue;
983
984 /* Don't pull more than imbalance/2 */
985 imbalance /= 2;
986 maxload = min(rem_load_move, imbalance);
987
988 this_best_prio = cfs_rq_best_prio(this_cfs_rq);
989 best_prio = cfs_rq_best_prio(busy_cfs_rq);
990
991 /*
992 * Enable handling of the case where there is more than one task
993 * with the best priority. If the current running task is one
994 * of those with prio==best_prio we know it won't be moved
995 * and therefore it's safe to override the skip (based on load)
996 * of any task we find with that prio.
997 */
998 if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
999 best_prio_seen = 1;
1000
1001 /* pass busy_cfs_rq argument into
1002 * load_balance_[start|next]_fair iterators
1003 */
1004 cfs_rq_iterator.arg = busy_cfs_rq;
1005 nr_moved = balance_tasks(this_rq, this_cpu, busiest,
1006 max_nr_move, maxload, sd, idle, all_pinned,
1007 &load_moved, this_best_prio, best_prio,
1008 best_prio_seen, &cfs_rq_iterator);
1009
1010 total_nr_moved += nr_moved;
1011 max_nr_move -= nr_moved;
1012 rem_load_move -= load_moved;
1013
1014 if (max_nr_move <= 0 || rem_load_move <= 0)
1015 break;
1016 }
1017
1018 *total_load_moved = max_load_move - rem_load_move;
1019
1020 return total_nr_moved;
1021}
1022
1023/*
1024 * scheduler tick hitting a task of our scheduling class:
1025 */
1026static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1027{
1028 struct cfs_rq *cfs_rq;
1029 struct sched_entity *se = &curr->se;
1030
1031 for_each_sched_entity(se) {
1032 cfs_rq = cfs_rq_of(se);
1033 entity_tick(cfs_rq, se);
1034 }
1035}
1036
1037/*
1038 * Share the fairness runtime between parent and child, thus the
1039 * total amount of pressure for CPU stays equal - new tasks
1040 * get a chance to run but frequent forkers are not allowed to
1041 * monopolize the CPU. Note: the parent runqueue is locked,
1042 * the child is not running yet.
1043 */
1044static void task_new_fair(struct rq *rq, struct task_struct *p)
1045{
1046 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1047 struct sched_entity *se = &p->se;
1048 u64 now = rq_clock(rq);
1049
1050 sched_info_queued(p);
1051
1052 update_stats_enqueue(cfs_rq, se, now);
1053 /*
1054 * Child runs first: we let it run before the parent
1055 * until it reschedules once. We set up the key so that
1056 * it will preempt the parent:
1057 */
1058 p->se.fair_key = current->se.fair_key -
1059 niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1;
1060 /*
1061 * The first wait is dominated by the child-runs-first logic,
1062 * so do not credit it with that waiting time yet:
1063 */
1064 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
1065 p->se.wait_start_fair = 0;
1066
1067 /*
1068 * The statistical average of wait_runtime is about
1069 * -granularity/2, so initialize the task with that:
1070 */
1071 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
1072 p->se.wait_runtime = -(sysctl_sched_granularity / 2);
1073
1074 __enqueue_entity(cfs_rq, se);
1075 inc_nr_running(p, rq, now);
1076}
1077
1078#ifdef CONFIG_FAIR_GROUP_SCHED
1079/* Account for a task changing its policy or group.
1080 *
1081 * This routine is mostly called to set cfs_rq->curr field when a task
1082 * migrates between groups/classes.
1083 */
1084static void set_curr_task_fair(struct rq *rq)
1085{
1086 struct task_struct *curr = rq->curr;
1087 struct sched_entity *se = &curr->se;
1088 u64 now = rq_clock(rq);
1089 struct cfs_rq *cfs_rq;
1090
1091 for_each_sched_entity(se) {
1092 cfs_rq = cfs_rq_of(se);
1093 set_next_entity(cfs_rq, se, now);
1094 }
1095}
1096#else
1097static void set_curr_task_fair(struct rq *rq)
1098{
1099}
1100#endif
1101
1102/*
1103 * All the scheduling class methods:
1104 */
1105struct sched_class fair_sched_class __read_mostly = {
1106 .enqueue_task = enqueue_task_fair,
1107 .dequeue_task = dequeue_task_fair,
1108 .yield_task = yield_task_fair,
1109
1110 .check_preempt_curr = check_preempt_curr_fair,
1111
1112 .pick_next_task = pick_next_task_fair,
1113 .put_prev_task = put_prev_task_fair,
1114
1115 .load_balance = load_balance_fair,
1116
1117 .set_curr_task = set_curr_task_fair,
1118 .task_tick = task_tick_fair,
1119 .task_new = task_new_fair,
1120};
1121
1122#ifdef CONFIG_SCHED_DEBUG
1123void print_cfs_stats(struct seq_file *m, int cpu, u64 now)
1124{
1125 struct rq *rq = cpu_rq(cpu);
1126 struct cfs_rq *cfs_rq;
1127
1128 for_each_leaf_cfs_rq(rq, cfs_rq)
1129 print_cfs_rq(m, cpu, cfs_rq, now);
1130}
1131#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
new file mode 100644
index 0000000000..41841e741c
--- /dev/null
+++ b/kernel/sched_idletask.c
@@ -0,0 +1,71 @@
1/*
2 * idle-task scheduling class.
3 *
4 * (NOTE: these are not related to SCHED_IDLE tasks which are
5 * handled in sched_fair.c)
6 */
7
8/*
9 * Idle tasks are unconditionally rescheduled:
10 */
11static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
12{
13 resched_task(rq->idle);
14}
15
16static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now)
17{
18 schedstat_inc(rq, sched_goidle);
19
20 return rq->idle;
21}
22
23/*
24 * It is not legal to sleep in the idle task - print a warning
25 * message if some code attempts to do it:
26 */
27static void
28dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now)
29{
30 spin_unlock_irq(&rq->lock);
31 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
32 dump_stack();
33 spin_lock_irq(&rq->lock);
34}
35
36static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now)
37{
38}
39
40static int
41load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
42 unsigned long max_nr_move, unsigned long max_load_move,
43 struct sched_domain *sd, enum cpu_idle_type idle,
44 int *all_pinned, unsigned long *total_load_moved)
45{
46 return 0;
47}
48
49static void task_tick_idle(struct rq *rq, struct task_struct *curr)
50{
51}
52
53/*
54 * Simple, special scheduling class for the per-CPU idle tasks:
55 */
56static struct sched_class idle_sched_class __read_mostly = {
57 /* no enqueue/yield_task for idle tasks */
58
59 /* dequeue is not valid, we print a debug message there: */
60 .dequeue_task = dequeue_task_idle,
61
62 .check_preempt_curr = check_preempt_curr_idle,
63
64 .pick_next_task = pick_next_task_idle,
65 .put_prev_task = put_prev_task_idle,
66
67 .load_balance = load_balance_idle,
68
69 .task_tick = task_tick_idle,
70 /* no .task_new for idle tasks */
71};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
new file mode 100644
index 0000000000..1192a2741b
--- /dev/null
+++ b/kernel/sched_rt.c
@@ -0,0 +1,255 @@
1/*
2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3 * policies)
4 */
5
6/*
7 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class.
9 */
10static inline void update_curr_rt(struct rq *rq, u64 now)
11{
12 struct task_struct *curr = rq->curr;
13 u64 delta_exec;
14
15 if (!task_has_rt_policy(curr))
16 return;
17
18 delta_exec = now - curr->se.exec_start;
19 if (unlikely((s64)delta_exec < 0))
20 delta_exec = 0;
21 if (unlikely(delta_exec > curr->se.exec_max))
22 curr->se.exec_max = delta_exec;
23
24 curr->se.sum_exec_runtime += delta_exec;
25 curr->se.exec_start = now;
26}
27
28static void
29enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
30{
31 struct rt_prio_array *array = &rq->rt.active;
32
33 list_add_tail(&p->run_list, array->queue + p->prio);
34 __set_bit(p->prio, array->bitmap);
35}
36
37/*
38 * Adding/removing a task to/from a priority array:
39 */
40static void
41dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now)
42{
43 struct rt_prio_array *array = &rq->rt.active;
44
45 update_curr_rt(rq, now);
46
47 list_del(&p->run_list);
48 if (list_empty(array->queue + p->prio))
49 __clear_bit(p->prio, array->bitmap);
50}
51
52/*
53 * Put task to the end of the run list without the overhead of dequeue
54 * followed by enqueue.
55 */
56static void requeue_task_rt(struct rq *rq, struct task_struct *p)
57{
58 struct rt_prio_array *array = &rq->rt.active;
59
60 list_move_tail(&p->run_list, array->queue + p->prio);
61}
62
63static void
64yield_task_rt(struct rq *rq, struct task_struct *p)
65{
66 requeue_task_rt(rq, p);
67}
68
69/*
70 * Preempt the current task with a newly woken task if needed:
71 */
72static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
73{
74 if (p->prio < rq->curr->prio)
75 resched_task(rq->curr);
76}
77
78static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now)
79{
80 struct rt_prio_array *array = &rq->rt.active;
81 struct task_struct *next;
82 struct list_head *queue;
83 int idx;
84
85 idx = sched_find_first_bit(array->bitmap);
86 if (idx >= MAX_RT_PRIO)
87 return NULL;
88
89 queue = array->queue + idx;
90 next = list_entry(queue->next, struct task_struct, run_list);
91
92 next->se.exec_start = now;
93
94 return next;
95}
96
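
pick_next_task_rt() above is constant-time: one bitmap search for the lowest set priority index, then the head of that list. sched_find_first_bit() is a kernel helper not shown here; the toy sketch below substitutes a plain scan, and also mimics the always-set MAX_RT_PRIO delimiter bit that sched_init() installs so an empty class simply reports idx >= MAX_RT_PRIO:

#include <stdio.h>
#include <string.h>

#define TOY_MAX_RT_PRIO 8                       /* scaled-down stand-in for MAX_RT_PRIO */

struct toy_rt_array {
        unsigned char bitmap[TOY_MAX_RT_PRIO + 1];  /* one flag per prio + delimiter */
        const char *queue_head[TOY_MAX_RT_PRIO];    /* head task of each prio queue  */
};

/* Simple stand-in for sched_find_first_bit(): lowest set index wins. */
static int toy_find_first_bit(const unsigned char *bitmap, int len)
{
        int i;

        for (i = 0; i < len; i++)
                if (bitmap[i])
                        return i;
        return len;
}

static const char *toy_pick_next_rt(struct toy_rt_array *a)
{
        int idx = toy_find_first_bit(a->bitmap, TOY_MAX_RT_PRIO + 1);

        if (idx >= TOY_MAX_RT_PRIO)             /* only the delimiter bit was set    */
                return NULL;
        return a->queue_head[idx];
}

int main(void)
{
        struct toy_rt_array a;

        memset(&a, 0, sizeof(a));
        a.bitmap[TOY_MAX_RT_PRIO] = 1;          /* the "delimiter for bitsearch"     */

        printf("empty class: %s\n", toy_pick_next_rt(&a) ? "task" : "none");

        a.bitmap[3] = 1;
        a.queue_head[3] = "prio-3 task";
        a.bitmap[6] = 1;
        a.queue_head[6] = "prio-6 task";
        printf("picked: %s\n", toy_pick_next_rt(&a));   /* prio 3 wins               */
        return 0;
}
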
97static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now)
98{
99 update_curr_rt(rq, now);
100 p->se.exec_start = 0;
101}
102
103/*
104 * Load-balancing iterator. Note: while the runqueue stays locked
105 * during the whole iteration, the current task might be
106 * dequeued so the iterator has to be dequeue-safe. Here we
107 * achieve that by always pre-iterating before returning
108 * the current task:
109 */
110static struct task_struct *load_balance_start_rt(void *arg)
111{
112 struct rq *rq = arg;
113 struct rt_prio_array *array = &rq->rt.active;
114 struct list_head *head, *curr;
115 struct task_struct *p;
116 int idx;
117
118 idx = sched_find_first_bit(array->bitmap);
119 if (idx >= MAX_RT_PRIO)
120 return NULL;
121
122 head = array->queue + idx;
123 curr = head->prev;
124
125 p = list_entry(curr, struct task_struct, run_list);
126
127 curr = curr->prev;
128
129 rq->rt.rt_load_balance_idx = idx;
130 rq->rt.rt_load_balance_head = head;
131 rq->rt.rt_load_balance_curr = curr;
132
133 return p;
134}
135
136static struct task_struct *load_balance_next_rt(void *arg)
137{
138 struct rq *rq = arg;
139 struct rt_prio_array *array = &rq->rt.active;
140 struct list_head *head, *curr;
141 struct task_struct *p;
142 int idx;
143
144 idx = rq->rt.rt_load_balance_idx;
145 head = rq->rt.rt_load_balance_head;
146 curr = rq->rt.rt_load_balance_curr;
147
148 /*
149 * If we arrived back to the head again then
150 * iterate to the next queue (if any):
151 */
152 if (unlikely(head == curr)) {
153 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
154
155 if (next_idx >= MAX_RT_PRIO)
156 return NULL;
157
158 idx = next_idx;
159 head = array->queue + idx;
160 curr = head->prev;
161
162 rq->rt.rt_load_balance_idx = idx;
163 rq->rt.rt_load_balance_head = head;
164 }
165
166 p = list_entry(curr, struct task_struct, run_list);
167
168 curr = curr->prev;
169
170 rq->rt.rt_load_balance_curr = curr;
171
172 return p;
173}
174
175static int
176load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
177 unsigned long max_nr_move, unsigned long max_load_move,
178 struct sched_domain *sd, enum cpu_idle_type idle,
179 int *all_pinned, unsigned long *load_moved)
180{
181 int this_best_prio, best_prio, best_prio_seen = 0;
182 int nr_moved;
183 struct rq_iterator rt_rq_iterator;
184
185 best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
186 this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
187
188 /*
189 * Enable handling of the case where there is more than one task
190 * with the best priority. If the current running task is one
191 * of those with prio==best_prio we know it won't be moved
192 * and therefore it's safe to override the skip (based on load)
193 * of any task we find with that prio.
194 */
195 if (busiest->curr->prio == best_prio)
196 best_prio_seen = 1;
197
198 rt_rq_iterator.start = load_balance_start_rt;
199 rt_rq_iterator.next = load_balance_next_rt;
200 /* pass 'busiest' rq argument into
201 * load_balance_[start|next]_rt iterators
202 */
203 rt_rq_iterator.arg = busiest;
204
205 nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
206 max_load_move, sd, idle, all_pinned, load_moved,
207 this_best_prio, best_prio, best_prio_seen,
208 &rt_rq_iterator);
209
210 return nr_moved;
211}
212
213static void task_tick_rt(struct rq *rq, struct task_struct *p)
214{
215 /*
216 * RR tasks need a special form of timeslice management.
217 * FIFO tasks have no timeslices.
218 */
219 if (p->policy != SCHED_RR)
220 return;
221
222 if (--p->time_slice)
223 return;
224
225 p->time_slice = static_prio_timeslice(p->static_prio);
226 set_tsk_need_resched(p);
227
228 /* put it at the end of the queue: */
229 requeue_task_rt(rq, p);
230}
231
232/*
233 * No parent/child timeslice management necessary for RT tasks,
234 * just activate them:
235 */
236static void task_new_rt(struct rq *rq, struct task_struct *p)
237{
238 activate_task(rq, p, 1);
239}
240
241static struct sched_class rt_sched_class __read_mostly = {
242 .enqueue_task = enqueue_task_rt,
243 .dequeue_task = dequeue_task_rt,
244 .yield_task = yield_task_rt,
245
246 .check_preempt_curr = check_preempt_curr_rt,
247
248 .pick_next_task = pick_next_task_rt,
249 .put_prev_task = put_prev_task_rt,
250
251 .load_balance = load_balance_rt,
252
253 .task_tick = task_tick_rt,
254 .task_new = task_new_rt,
255};
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
new file mode 100644
index 0000000000..c63c38f6fa
--- /dev/null
+++ b/kernel/sched_stats.h
@@ -0,0 +1,235 @@
1
2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 14
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12
13 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
14 seq_printf(seq, "timestamp %lu\n", jiffies);
15 for_each_online_cpu(cpu) {
16 struct rq *rq = cpu_rq(cpu);
17#ifdef CONFIG_SMP
18 struct sched_domain *sd;
19 int dcnt = 0;
20#endif
21
22 /* runqueue-specific stats */
23 seq_printf(seq,
24 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu",
25 cpu, rq->yld_both_empty,
26 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
27 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
28 rq->ttwu_cnt, rq->ttwu_local,
29 rq->rq_sched_info.cpu_time,
30 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
31
32 seq_printf(seq, "\n");
33
34#ifdef CONFIG_SMP
35 /* domain-specific stats */
36 preempt_disable();
37 for_each_domain(cpu, sd) {
38 enum cpu_idle_type itype;
39 char mask_str[NR_CPUS];
40
41 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
42 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
43 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
44 itype++) {
45 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
46 "%lu",
47 sd->lb_cnt[itype],
48 sd->lb_balanced[itype],
49 sd->lb_failed[itype],
50 sd->lb_imbalance[itype],
51 sd->lb_gained[itype],
52 sd->lb_hot_gained[itype],
53 sd->lb_nobusyq[itype],
54 sd->lb_nobusyg[itype]);
55 }
56 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
57 " %lu %lu %lu\n",
58 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
59 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
60 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
61 sd->ttwu_wake_remote, sd->ttwu_move_affine,
62 sd->ttwu_move_balance);
63 }
64 preempt_enable();
65#endif
66 }
67 return 0;
68}
69
70static int schedstat_open(struct inode *inode, struct file *file)
71{
72 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
73 char *buf = kmalloc(size, GFP_KERNEL);
74 struct seq_file *m;
75 int res;
76
77 if (!buf)
78 return -ENOMEM;
79 res = single_open(file, show_schedstat, NULL);
80 if (!res) {
81 m = file->private_data;
82 m->buf = buf;
83 m->size = size;
84 } else
85 kfree(buf);
86 return res;
87}
88
89const struct file_operations proc_schedstat_operations = {
90 .open = schedstat_open,
91 .read = seq_read,
92 .llseek = seq_lseek,
93 .release = single_release,
94};
95
96/*
97 * Expects runqueue lock to be held for atomicity of update
98 */
99static inline void
100rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
101{
102 if (rq) {
103 rq->rq_sched_info.run_delay += delta;
104 rq->rq_sched_info.pcnt++;
105 }
106}
107
108/*
109 * Expects runqueue lock to be held for atomicity of update
110 */
111static inline void
112rq_sched_info_depart(struct rq *rq, unsigned long long delta)
113{
114 if (rq)
115 rq->rq_sched_info.cpu_time += delta;
116}
117# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
118# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
119#else /* !CONFIG_SCHEDSTATS */
120static inline void
121rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
122{}
123static inline void
124rq_sched_info_depart(struct rq *rq, unsigned long long delta)
125{}
126# define schedstat_inc(rq, field) do { } while (0)
127# define schedstat_add(rq, field, amt) do { } while (0)
128#endif
129
130#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
131/*
132 * Called when a process is dequeued from the active array and given
133 * the cpu. We should note that with the exception of interactive
134 * tasks, the expired queue will become the active queue after the active
135 * queue is empty, without explicitly dequeuing and requeuing tasks in the
136 * expired queue. (Interactive tasks may be requeued directly to the
137 * active queue, thus delaying tasks in the expired queue from running;
138 * see scheduler_tick()).
139 *
140 * This function is only called from sched_info_arrive(), rather than
141 * dequeue_task(). Even though a task may be queued and dequeued multiple
142 * times as it is shuffled about, we're really interested in knowing how
143 * long it was from the *first* time it was queued to the time that it
144 * finally hit a cpu.
145 */
146static inline void sched_info_dequeued(struct task_struct *t)
147{
148 t->sched_info.last_queued = 0;
149}
150
151/*
152 * Called when a task finally hits the cpu. We can now calculate how
153 * long it was waiting to run. We also note when it began so that we
154 * can keep stats on how long its timeslice is.
155 */
156static void sched_info_arrive(struct task_struct *t)
157{
158 unsigned long long now = sched_clock(), delta = 0;
159
160 if (t->sched_info.last_queued)
161 delta = now - t->sched_info.last_queued;
162 sched_info_dequeued(t);
163 t->sched_info.run_delay += delta;
164 t->sched_info.last_arrival = now;
165 t->sched_info.pcnt++;
166
167 rq_sched_info_arrive(task_rq(t), delta);
168}
169
170/*
171 * Called when a process is queued into either the active or expired
172 * array. The time is noted and later used to determine how long we
173 * had to wait for us to reach the cpu. Since the expired queue will
174 * become the active queue after active queue is empty, without dequeuing
175 * and requeuing any tasks, we are interested in queuing to either. It
176 * is unusual but not impossible for tasks to be dequeued and immediately
177 * requeued in the same or another array: this can happen in sched_yield(),
178 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
179 * to runqueue.
180 *
181 * This function is only called from enqueue_task(), but also only updates
182 * the timestamp if it is already not set. It's assumed that
183 * sched_info_dequeued() will clear that stamp when appropriate.
184 */
185static inline void sched_info_queued(struct task_struct *t)
186{
187 if (unlikely(sched_info_on()))
188 if (!t->sched_info.last_queued)
189 t->sched_info.last_queued = sched_clock();
190}
191
192/*
193 * Called when a process ceases being the active-running process, either
194 * voluntarily or involuntarily. Now we can calculate how long we ran.
195 */
196static inline void sched_info_depart(struct task_struct *t)
197{
198 unsigned long long delta = sched_clock() - t->sched_info.last_arrival;
199
200 t->sched_info.cpu_time += delta;
201 rq_sched_info_depart(task_rq(t), delta);
202}
203
204/*
205 * Called when tasks are switched involuntarily due, typically, to expiring
206 * their time slice. (This may also be called when switching to or from
207 * the idle task.) We are only called when prev != next.
208 */
209static inline void
210__sched_info_switch(struct task_struct *prev, struct task_struct *next)
211{
212 struct rq *rq = task_rq(prev);
213
214 /*
215 * prev now departs the cpu. It's not interesting to record
216 * stats about how efficient we were at scheduling the idle
217 * process, however.
218 */
219 if (prev != rq->idle)
220 sched_info_depart(prev);
221
222 if (next != rq->idle)
223 sched_info_arrive(next);
224}
225static inline void
226sched_info_switch(struct task_struct *prev, struct task_struct *next)
227{
228 if (unlikely(sched_info_on()))
229 __sched_info_switch(prev, next);
230}
231#else
232#define sched_info_queued(t) do { } while (0)
233#define sched_info_switch(t, next) do { } while (0)
234#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
235
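For reference, a minimal user-space reader for the schedstat output above might look like the sketch below. It is illustrative only, not part of the patch: it checks for the version-14 layout emitted by show_schedstat() before parsing, as the SCHEDSTAT_VERSION comment recommends, and simply echoes the per-cpu lines.

/* Sketch: read /proc/schedstat, verify the version line, print cpu lines. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/schedstat", "r");
	char line[4096];
	int version = 0;

	if (!f) {
		perror("/proc/schedstat");
		return 1;
	}
	if (!fgets(line, sizeof(line), f) ||
	    sscanf(line, "version %d", &version) != 1 || version != 14) {
		fprintf(stderr, "unexpected schedstat version (%d)\n", version);
		fclose(f);
		return 1;
	}
	/* Per-cpu lines start with "cpu"; domain lines start with "domain". */
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "cpu", 3) == 0)
			fputs(line, stdout);
	fclose(f);
	return 0;
}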
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index c3391b6020..ad64fcb731 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -10,6 +10,7 @@
10#include <linux/sched.h> 10#include <linux/sched.h>
11 11
12/* #define SECCOMP_DEBUG 1 */ 12/* #define SECCOMP_DEBUG 1 */
13#define NR_SECCOMP_MODES 1
13 14
14/* 15/*
15 * Secure computing mode 1 allows only read/write/exit/sigreturn. 16 * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -54,3 +55,31 @@ void __secure_computing(int this_syscall)
54#endif 55#endif
55 do_exit(SIGKILL); 56 do_exit(SIGKILL);
56} 57}
58
59long prctl_get_seccomp(void)
60{
61 return current->seccomp.mode;
62}
63
64long prctl_set_seccomp(unsigned long seccomp_mode)
65{
66 long ret;
67
68 /* can set it only once to be even more secure */
69 ret = -EPERM;
70 if (unlikely(current->seccomp.mode))
71 goto out;
72
73 ret = -EINVAL;
74 if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
75 current->seccomp.mode = seccomp_mode;
76 set_thread_flag(TIF_SECCOMP);
77#ifdef TIF_NOTSC
78 disable_TSC();
79#endif
80 ret = 0;
81 }
82
83 out:
84 return ret;
85}
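A user-space sketch of the new prctl() interface follows; it is not part of the patch. It assumes the PR_GET_SECCOMP/PR_SET_SECCOMP values (21/22) introduced with this series, and it exits through the raw exit syscall because exit_group() is not on mode 1's read/write/exit/sigreturn whitelist.

/* Sketch: query and enable strict seccomp (mode 1) via prctl(). */
#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>

#ifndef PR_GET_SECCOMP			/* values assumed from this series */
#define PR_GET_SECCOMP	21
#define PR_SET_SECCOMP	22
#endif

int main(void)
{
	static const char msg[] = "now in seccomp mode 1\n";

	printf("seccomp mode before: %d\n",
	       (int)prctl(PR_GET_SECCOMP, 0, 0, 0, 0));

	if (prctl(PR_SET_SECCOMP, 1, 0, 0, 0) != 0) {
		perror("PR_SET_SECCOMP");
		return 1;
	}
	/* Only read/write/exit/sigreturn are allowed from here on. */
	write(1, msg, sizeof(msg) - 1);
	syscall(SYS_exit, 0);		/* plain exit, not exit_group */
	return 0;			/* not reached */
}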
diff --git a/kernel/signal.c b/kernel/signal.c
index f940560977..39d122753b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -718,6 +718,37 @@ out_set:
718#define LEGACY_QUEUE(sigptr, sig) \ 718#define LEGACY_QUEUE(sigptr, sig) \
719 (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig))) 719 (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig)))
720 720
721int print_fatal_signals;
722
723static void print_fatal_signal(struct pt_regs *regs, int signr)
724{
725 printk("%s/%d: potentially unexpected fatal signal %d.\n",
726 current->comm, current->pid, signr);
727
728#ifdef __i386__
729 printk("code at %08lx: ", regs->eip);
730 {
731 int i;
732 for (i = 0; i < 16; i++) {
733 unsigned char insn;
734
735 __get_user(insn, (unsigned char *)(regs->eip + i));
736 printk("%02x ", insn);
737 }
738 }
739#endif
740 printk("\n");
741 show_regs(regs);
742}
743
744static int __init setup_print_fatal_signals(char *str)
745{
746 get_option (&str, &print_fatal_signals);
747
748 return 1;
749}
750
751__setup("print-fatal-signals=", setup_print_fatal_signals);
721 752
722static int 753static int
723specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) 754specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
@@ -1855,6 +1886,8 @@ relock:
1855 * Anything else is fatal, maybe with a core dump. 1886 * Anything else is fatal, maybe with a core dump.
1856 */ 1887 */
1857 current->flags |= PF_SIGNALED; 1888 current->flags |= PF_SIGNALED;
1889 if ((signr != SIGKILL) && print_fatal_signals)
1890 print_fatal_signal(regs, signr);
1858 if (sig_kernel_coredump(signr)) { 1891 if (sig_kernel_coredump(signr)) {
1859 /* 1892 /*
1860 * If it was able to dump core, this kills all 1893 * If it was able to dump core, this kills all
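The new output is easy to exercise once the knob is on. The sketch below (not part of the patch, run as root) flips the sysctl added later in this series and then raises a fatal SIGSEGV, so the code-byte dump from print_fatal_signal() appears in the kernel log.

/* Sketch: enable print_fatal_signals at run time, then trigger it. */
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/kernel/print-fatal-signals", O_WRONLY);

	if (fd < 0) {
		perror("print-fatal-signals");
		return 1;
	}
	if (write(fd, "1\n", 2) != 2)
		perror("write");
	close(fd);

	raise(SIGSEGV);		/* fatal: no handler installed */
	return 0;		/* not reached */
}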
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0b9886a00e..0f546ddea4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -14,6 +14,7 @@
14#include <linux/notifier.h> 14#include <linux/notifier.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/freezer.h>
17#include <linux/kthread.h> 18#include <linux/kthread.h>
18#include <linux/rcupdate.h> 19#include <linux/rcupdate.h>
19#include <linux/smp.h> 20#include <linux/smp.h>
@@ -488,9 +489,6 @@ void __init softirq_init(void)
488 489
489static int ksoftirqd(void * __bind_cpu) 490static int ksoftirqd(void * __bind_cpu)
490{ 491{
491 set_user_nice(current, 19);
492 current->flags |= PF_NOFREEZE;
493
494 set_current_state(TASK_INTERRUPTIBLE); 492 set_current_state(TASK_INTERRUPTIBLE);
495 493
496 while (!kthread_should_stop()) { 494 while (!kthread_should_stop()) {
@@ -615,12 +613,16 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
615 kthread_bind(per_cpu(ksoftirqd, hotcpu), 613 kthread_bind(per_cpu(ksoftirqd, hotcpu),
616 any_online_cpu(cpu_online_map)); 614 any_online_cpu(cpu_online_map));
617 case CPU_DEAD: 615 case CPU_DEAD:
618 case CPU_DEAD_FROZEN: 616 case CPU_DEAD_FROZEN: {
617 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
618
619 p = per_cpu(ksoftirqd, hotcpu); 619 p = per_cpu(ksoftirqd, hotcpu);
620 per_cpu(ksoftirqd, hotcpu) = NULL; 620 per_cpu(ksoftirqd, hotcpu) = NULL;
621 sched_setscheduler(p, SCHED_FIFO, &param);
621 kthread_stop(p); 622 kthread_stop(p);
622 takeover_tasklets(hotcpu); 623 takeover_tasklets(hotcpu);
623 break; 624 break;
625 }
624#endif /* CONFIG_HOTPLUG_CPU */ 626#endif /* CONFIG_HOTPLUG_CPU */
625 } 627 }
626 return NOTIFY_OK; 628 return NOTIFY_OK;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 0131e296ff..708d4882c0 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -10,6 +10,7 @@
10#include <linux/cpu.h> 10#include <linux/cpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/freezer.h>
13#include <linux/kthread.h> 14#include <linux/kthread.h>
14#include <linux/notifier.h> 15#include <linux/notifier.h>
15#include <linux/module.h> 16#include <linux/module.h>
@@ -116,7 +117,6 @@ static int watchdog(void * __bind_cpu)
116 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 117 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
117 118
118 sched_setscheduler(current, SCHED_FIFO, &param); 119 sched_setscheduler(current, SCHED_FIFO, &param);
119 current->flags |= PF_NOFREEZE;
120 120
121 /* initialize timestamp */ 121 /* initialize timestamp */
122 touch_softlockup_watchdog(); 122 touch_softlockup_watchdog();
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 2c6c2bf855..cd72424c26 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -72,7 +72,7 @@ void __lockfunc _read_lock(rwlock_t *lock)
72{ 72{
73 preempt_disable(); 73 preempt_disable();
74 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 74 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
75 _raw_read_lock(lock); 75 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
76} 76}
77EXPORT_SYMBOL(_read_lock); 77EXPORT_SYMBOL(_read_lock);
78 78
@@ -88,8 +88,8 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
88 * _raw_spin_lock_flags() code, because lockdep assumes 88 * _raw_spin_lock_flags() code, because lockdep assumes
89 * that interrupts are not re-enabled during lock-acquire: 89 * that interrupts are not re-enabled during lock-acquire:
90 */ 90 */
91#ifdef CONFIG_PROVE_LOCKING 91#ifdef CONFIG_LOCKDEP
92 _raw_spin_lock(lock); 92 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
93#else 93#else
94 _raw_spin_lock_flags(lock, &flags); 94 _raw_spin_lock_flags(lock, &flags);
95#endif 95#endif
@@ -102,7 +102,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock)
102 local_irq_disable(); 102 local_irq_disable();
103 preempt_disable(); 103 preempt_disable();
104 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 104 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
105 _raw_spin_lock(lock); 105 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
106} 106}
107EXPORT_SYMBOL(_spin_lock_irq); 107EXPORT_SYMBOL(_spin_lock_irq);
108 108
@@ -111,7 +111,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock)
111 local_bh_disable(); 111 local_bh_disable();
112 preempt_disable(); 112 preempt_disable();
113 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 113 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
114 _raw_spin_lock(lock); 114 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
115} 115}
116EXPORT_SYMBOL(_spin_lock_bh); 116EXPORT_SYMBOL(_spin_lock_bh);
117 117
@@ -122,7 +122,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
122 local_irq_save(flags); 122 local_irq_save(flags);
123 preempt_disable(); 123 preempt_disable();
124 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 124 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
125 _raw_read_lock(lock); 125 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
126 return flags; 126 return flags;
127} 127}
128EXPORT_SYMBOL(_read_lock_irqsave); 128EXPORT_SYMBOL(_read_lock_irqsave);
@@ -132,7 +132,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock)
132 local_irq_disable(); 132 local_irq_disable();
133 preempt_disable(); 133 preempt_disable();
134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
135 _raw_read_lock(lock); 135 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
136} 136}
137EXPORT_SYMBOL(_read_lock_irq); 137EXPORT_SYMBOL(_read_lock_irq);
138 138
@@ -141,7 +141,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock)
141 local_bh_disable(); 141 local_bh_disable();
142 preempt_disable(); 142 preempt_disable();
143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
144 _raw_read_lock(lock); 144 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
145} 145}
146EXPORT_SYMBOL(_read_lock_bh); 146EXPORT_SYMBOL(_read_lock_bh);
147 147
@@ -152,7 +152,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
152 local_irq_save(flags); 152 local_irq_save(flags);
153 preempt_disable(); 153 preempt_disable();
154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
155 _raw_write_lock(lock); 155 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
156 return flags; 156 return flags;
157} 157}
158EXPORT_SYMBOL(_write_lock_irqsave); 158EXPORT_SYMBOL(_write_lock_irqsave);
@@ -162,7 +162,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock)
162 local_irq_disable(); 162 local_irq_disable();
163 preempt_disable(); 163 preempt_disable();
164 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 164 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
165 _raw_write_lock(lock); 165 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
166} 166}
167EXPORT_SYMBOL(_write_lock_irq); 167EXPORT_SYMBOL(_write_lock_irq);
168 168
@@ -171,7 +171,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock)
171 local_bh_disable(); 171 local_bh_disable();
172 preempt_disable(); 172 preempt_disable();
173 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 173 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
174 _raw_write_lock(lock); 174 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
175} 175}
176EXPORT_SYMBOL(_write_lock_bh); 176EXPORT_SYMBOL(_write_lock_bh);
177 177
@@ -179,7 +179,7 @@ void __lockfunc _spin_lock(spinlock_t *lock)
179{ 179{
180 preempt_disable(); 180 preempt_disable();
181 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 181 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
182 _raw_spin_lock(lock); 182 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
183} 183}
184 184
185EXPORT_SYMBOL(_spin_lock); 185EXPORT_SYMBOL(_spin_lock);
@@ -188,7 +188,7 @@ void __lockfunc _write_lock(rwlock_t *lock)
188{ 188{
189 preempt_disable(); 189 preempt_disable();
190 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 190 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
191 _raw_write_lock(lock); 191 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
192} 192}
193 193
194EXPORT_SYMBOL(_write_lock); 194EXPORT_SYMBOL(_write_lock);
@@ -289,7 +289,7 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
289{ 289{
290 preempt_disable(); 290 preempt_disable();
291 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 291 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
292 _raw_spin_lock(lock); 292 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
293} 293}
294 294
295EXPORT_SYMBOL(_spin_lock_nested); 295EXPORT_SYMBOL(_spin_lock_nested);
@@ -305,8 +305,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
305 * _raw_spin_lock_flags() code, because lockdep assumes 305 * _raw_spin_lock_flags() code, because lockdep assumes
306 * that interrupts are not re-enabled during lock-acquire: 306 * that interrupts are not re-enabled during lock-acquire:
307 */ 307 */
308#ifdef CONFIG_PROVE_SPIN_LOCKING 308#ifdef CONFIG_LOCKDEP
309 _raw_spin_lock(lock); 309 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
310#else 310#else
311 _raw_spin_lock_flags(lock, &flags); 311 _raw_spin_lock_flags(lock, &flags);
312#endif 312#endif
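All of the substitutions above go through the LOCK_CONTENDED() helper from the lock-statistics series: take the trylock fast path first, and record the contention point before falling back to the blocking acquire. A rough sketch of the idea follows; it approximates, but is not, the exact macro in include/linux/lockdep.h.

/* Approximation of what LOCK_CONTENDED() does when lock statistics are
 * enabled: fast path via the trylock, otherwise note the contention and
 * block on the real lock, then record the acquisition. */
#define LOCK_CONTENDED_SKETCH(_lock, try, lock)			\
do {								\
	if (!try(_lock)) {					\
		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
		lock(_lock);					\
	}							\
	lock_acquired(&(_lock)->dep_map);			\
} while (0)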
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index fcee2a8e6d..319821ef78 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -93,10 +93,6 @@ static void stopmachine_set_state(enum stopmachine_state state)
93static int stop_machine(void) 93static int stop_machine(void)
94{ 94{
95 int i, ret = 0; 95 int i, ret = 0;
96 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
97
98 /* One high-prio thread per cpu. We'll do this one. */
99 sched_setscheduler(current, SCHED_FIFO, &param);
100 96
101 atomic_set(&stopmachine_thread_ack, 0); 97 atomic_set(&stopmachine_thread_ack, 0);
102 stopmachine_num_threads = 0; 98 stopmachine_num_threads = 0;
@@ -189,6 +185,10 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
189 185
190 p = kthread_create(do_stop, &smdata, "kstopmachine"); 186 p = kthread_create(do_stop, &smdata, "kstopmachine");
191 if (!IS_ERR(p)) { 187 if (!IS_ERR(p)) {
188 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
189
190 /* One high-prio thread per cpu. We'll do this one. */
191 sched_setscheduler(p, SCHED_FIFO, &param);
192 kthread_bind(p, cpu); 192 kthread_bind(p, cpu);
193 wake_up_process(p); 193 wake_up_process(p);
194 wait_for_completion(&smdata.done); 194 wait_for_completion(&smdata.done);
diff --git a/kernel/sys.c b/kernel/sys.c
index 872271ccc3..08562f4197 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -31,10 +31,12 @@
31#include <linux/cn_proc.h> 31#include <linux/cn_proc.h>
32#include <linux/getcpu.h> 32#include <linux/getcpu.h>
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h>
34 35
35#include <linux/compat.h> 36#include <linux/compat.h>
36#include <linux/syscalls.h> 37#include <linux/syscalls.h>
37#include <linux/kprobes.h> 38#include <linux/kprobes.h>
39#include <linux/user_namespace.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40#include <asm/io.h> 42#include <asm/io.h>
@@ -98,6 +100,13 @@ struct pid *cad_pid;
98EXPORT_SYMBOL(cad_pid); 100EXPORT_SYMBOL(cad_pid);
99 101
100/* 102/*
103 * If set, this is used for preparing the system to power off.
104 */
105
106void (*pm_power_off_prepare)(void);
107EXPORT_SYMBOL(pm_power_off_prepare);
108
109/*
101 * Notifier list for kernel code which wants to be called 110 * Notifier list for kernel code which wants to be called
102 * at shutdown. This is used to stop any idling DMA operations 111 * at shutdown. This is used to stop any idling DMA operations
103 * and the like. 112 * and the like.
@@ -865,6 +874,8 @@ EXPORT_SYMBOL_GPL(kernel_halt);
865void kernel_power_off(void) 874void kernel_power_off(void)
866{ 875{
867 kernel_shutdown_prepare(SYSTEM_POWER_OFF); 876 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
877 if (pm_power_off_prepare)
878 pm_power_off_prepare();
868 printk(KERN_EMERG "Power down.\n"); 879 printk(KERN_EMERG "Power down.\n");
869 machine_power_off(); 880 machine_power_off();
870} 881}
@@ -1025,7 +1036,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
1025 return -EPERM; 1036 return -EPERM;
1026 } 1037 }
1027 if (new_egid != old_egid) { 1038 if (new_egid != old_egid) {
1028 current->mm->dumpable = suid_dumpable; 1039 set_dumpable(current->mm, suid_dumpable);
1029 smp_wmb(); 1040 smp_wmb();
1030 } 1041 }
1031 if (rgid != (gid_t) -1 || 1042 if (rgid != (gid_t) -1 ||
@@ -1055,13 +1066,13 @@ asmlinkage long sys_setgid(gid_t gid)
1055 1066
1056 if (capable(CAP_SETGID)) { 1067 if (capable(CAP_SETGID)) {
1057 if (old_egid != gid) { 1068 if (old_egid != gid) {
1058 current->mm->dumpable = suid_dumpable; 1069 set_dumpable(current->mm, suid_dumpable);
1059 smp_wmb(); 1070 smp_wmb();
1060 } 1071 }
1061 current->gid = current->egid = current->sgid = current->fsgid = gid; 1072 current->gid = current->egid = current->sgid = current->fsgid = gid;
1062 } else if ((gid == current->gid) || (gid == current->sgid)) { 1073 } else if ((gid == current->gid) || (gid == current->sgid)) {
1063 if (old_egid != gid) { 1074 if (old_egid != gid) {
1064 current->mm->dumpable = suid_dumpable; 1075 set_dumpable(current->mm, suid_dumpable);
1065 smp_wmb(); 1076 smp_wmb();
1066 } 1077 }
1067 current->egid = current->fsgid = gid; 1078 current->egid = current->fsgid = gid;
@@ -1078,13 +1089,13 @@ static int set_user(uid_t new_ruid, int dumpclear)
1078{ 1089{
1079 struct user_struct *new_user; 1090 struct user_struct *new_user;
1080 1091
1081 new_user = alloc_uid(new_ruid); 1092 new_user = alloc_uid(current->nsproxy->user_ns, new_ruid);
1082 if (!new_user) 1093 if (!new_user)
1083 return -EAGAIN; 1094 return -EAGAIN;
1084 1095
1085 if (atomic_read(&new_user->processes) >= 1096 if (atomic_read(&new_user->processes) >=
1086 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 1097 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
1087 new_user != &root_user) { 1098 new_user != current->nsproxy->user_ns->root_user) {
1088 free_uid(new_user); 1099 free_uid(new_user);
1089 return -EAGAIN; 1100 return -EAGAIN;
1090 } 1101 }
@@ -1092,7 +1103,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
1092 switch_uid(new_user); 1103 switch_uid(new_user);
1093 1104
1094 if (dumpclear) { 1105 if (dumpclear) {
1095 current->mm->dumpable = suid_dumpable; 1106 set_dumpable(current->mm, suid_dumpable);
1096 smp_wmb(); 1107 smp_wmb();
1097 } 1108 }
1098 current->uid = new_ruid; 1109 current->uid = new_ruid;
@@ -1148,7 +1159,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
1148 return -EAGAIN; 1159 return -EAGAIN;
1149 1160
1150 if (new_euid != old_euid) { 1161 if (new_euid != old_euid) {
1151 current->mm->dumpable = suid_dumpable; 1162 set_dumpable(current->mm, suid_dumpable);
1152 smp_wmb(); 1163 smp_wmb();
1153 } 1164 }
1154 current->fsuid = current->euid = new_euid; 1165 current->fsuid = current->euid = new_euid;
@@ -1198,7 +1209,7 @@ asmlinkage long sys_setuid(uid_t uid)
1198 return -EPERM; 1209 return -EPERM;
1199 1210
1200 if (old_euid != uid) { 1211 if (old_euid != uid) {
1201 current->mm->dumpable = suid_dumpable; 1212 set_dumpable(current->mm, suid_dumpable);
1202 smp_wmb(); 1213 smp_wmb();
1203 } 1214 }
1204 current->fsuid = current->euid = uid; 1215 current->fsuid = current->euid = uid;
@@ -1243,7 +1254,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
1243 } 1254 }
1244 if (euid != (uid_t) -1) { 1255 if (euid != (uid_t) -1) {
1245 if (euid != current->euid) { 1256 if (euid != current->euid) {
1246 current->mm->dumpable = suid_dumpable; 1257 set_dumpable(current->mm, suid_dumpable);
1247 smp_wmb(); 1258 smp_wmb();
1248 } 1259 }
1249 current->euid = euid; 1260 current->euid = euid;
@@ -1293,7 +1304,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
1293 } 1304 }
1294 if (egid != (gid_t) -1) { 1305 if (egid != (gid_t) -1) {
1295 if (egid != current->egid) { 1306 if (egid != current->egid) {
1296 current->mm->dumpable = suid_dumpable; 1307 set_dumpable(current->mm, suid_dumpable);
1297 smp_wmb(); 1308 smp_wmb();
1298 } 1309 }
1299 current->egid = egid; 1310 current->egid = egid;
@@ -1339,7 +1350,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
1339 uid == current->suid || uid == current->fsuid || 1350 uid == current->suid || uid == current->fsuid ||
1340 capable(CAP_SETUID)) { 1351 capable(CAP_SETUID)) {
1341 if (uid != old_fsuid) { 1352 if (uid != old_fsuid) {
1342 current->mm->dumpable = suid_dumpable; 1353 set_dumpable(current->mm, suid_dumpable);
1343 smp_wmb(); 1354 smp_wmb();
1344 } 1355 }
1345 current->fsuid = uid; 1356 current->fsuid = uid;
@@ -1368,7 +1379,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
1368 gid == current->sgid || gid == current->fsgid || 1379 gid == current->sgid || gid == current->fsgid ||
1369 capable(CAP_SETGID)) { 1380 capable(CAP_SETGID)) {
1370 if (gid != old_fsgid) { 1381 if (gid != old_fsgid) {
1371 current->mm->dumpable = suid_dumpable; 1382 set_dumpable(current->mm, suid_dumpable);
1372 smp_wmb(); 1383 smp_wmb();
1373 } 1384 }
1374 current->fsgid = gid; 1385 current->fsgid = gid;
@@ -2165,14 +2176,14 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2165 error = put_user(current->pdeath_signal, (int __user *)arg2); 2176 error = put_user(current->pdeath_signal, (int __user *)arg2);
2166 break; 2177 break;
2167 case PR_GET_DUMPABLE: 2178 case PR_GET_DUMPABLE:
2168 error = current->mm->dumpable; 2179 error = get_dumpable(current->mm);
2169 break; 2180 break;
2170 case PR_SET_DUMPABLE: 2181 case PR_SET_DUMPABLE:
2171 if (arg2 < 0 || arg2 > 1) { 2182 if (arg2 < 0 || arg2 > 1) {
2172 error = -EINVAL; 2183 error = -EINVAL;
2173 break; 2184 break;
2174 } 2185 }
2175 current->mm->dumpable = arg2; 2186 set_dumpable(current->mm, arg2);
2176 break; 2187 break;
2177 2188
2178 case PR_SET_UNALIGN: 2189 case PR_SET_UNALIGN:
@@ -2241,6 +2252,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2241 error = SET_ENDIAN(current, arg2); 2252 error = SET_ENDIAN(current, arg2);
2242 break; 2253 break;
2243 2254
2255 case PR_GET_SECCOMP:
2256 error = prctl_get_seccomp();
2257 break;
2258 case PR_SET_SECCOMP:
2259 error = prctl_set_seccomp(arg2);
2260 break;
2261
2244 default: 2262 default:
2245 error = -EINVAL; 2263 error = -EINVAL;
2246 break; 2264 break;
@@ -2277,3 +2295,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
2277 } 2295 }
2278 return err ? -EFAULT : 0; 2296 return err ? -EFAULT : 0;
2279} 2297}
2298
2299char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2300
2301static void argv_cleanup(char **argv, char **envp)
2302{
2303 argv_free(argv);
2304}
2305
2306/**
2307 * orderly_poweroff - Trigger an orderly system poweroff
2308 * @force: force poweroff if command execution fails
2309 *
2310 * This may be called from any context to trigger a system shutdown.
2311 * If the orderly shutdown fails, it will force an immediate shutdown.
2312 */
2313int orderly_poweroff(bool force)
2314{
2315 int argc;
2316 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2317 static char *envp[] = {
2318 "HOME=/",
2319 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2320 NULL
2321 };
2322 int ret = -ENOMEM;
2323 struct subprocess_info *info;
2324
2325 if (argv == NULL) {
2326 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2327 __func__, poweroff_cmd);
2328 goto out;
2329 }
2330
2331 info = call_usermodehelper_setup(argv[0], argv, envp);
2332 if (info == NULL) {
2333 argv_free(argv);
2334 goto out;
2335 }
2336
2337 call_usermodehelper_setcleanup(info, argv_cleanup);
2338
2339 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
2340
2341 out:
2342 if (ret && force) {
2343 printk(KERN_WARNING "Failed to start orderly shutdown: "
2344 "forcing the issue\n");
2345
2346 /* I guess this should try to kick off some daemon to
2347 sync and poweroff asap. Or not even bother syncing
2348 if we're doing an emergency shutdown? */
2349 emergency_sync();
2350 kernel_power_off();
2351 }
2352
2353 return ret;
2354}
2355EXPORT_SYMBOL_GPL(orderly_poweroff);
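In-kernel callers can now request a clean shutdown without duplicating the usermode-helper plumbing. A hedged sketch of a driver-side use follows; the threshold, the function name and the header carrying the prototype are illustrative assumptions, not taken from this patch.

#include <linux/kernel.h>
#include <linux/reboot.h>	/* assumed home of the orderly_poweroff() prototype */

/* Sketch: request a clean shutdown once a critical temperature is crossed.
 * The threshold and the caller supplying 'millicelsius' are made up. */
#define EXAMPLE_CRIT_TEMP	110000	/* 110.000 C */

static void example_thermal_check(int millicelsius)
{
	if (millicelsius < EXAMPLE_CRIT_TEMP)
		return;

	printk(KERN_EMERG "critical temperature (%d mC), shutting down\n",
	       millicelsius);
	orderly_poweroff(true);		/* force power-off if the helper fails */
}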
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7e11e2c98b..b0ec498a18 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -14,6 +14,7 @@ asmlinkage long sys_ni_syscall(void)
14 14
15cond_syscall(sys_nfsservctl); 15cond_syscall(sys_nfsservctl);
16cond_syscall(sys_quotactl); 16cond_syscall(sys_quotactl);
17cond_syscall(sys32_quotactl);
17cond_syscall(sys_acct); 18cond_syscall(sys_acct);
18cond_syscall(sys_lookup_dcookie); 19cond_syscall(sys_lookup_dcookie);
19cond_syscall(sys_swapon); 20cond_syscall(sys_swapon);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 30ee462ee7..222299844a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -29,6 +29,7 @@
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
32#include <linux/fs.h>
32#include <linux/init.h> 33#include <linux/init.h>
33#include <linux/kernel.h> 34#include <linux/kernel.h>
34#include <linux/kobject.h> 35#include <linux/kobject.h>
@@ -45,13 +46,11 @@
45#include <linux/syscalls.h> 46#include <linux/syscalls.h>
46#include <linux/nfs_fs.h> 47#include <linux/nfs_fs.h>
47#include <linux/acpi.h> 48#include <linux/acpi.h>
49#include <linux/reboot.h>
48 50
49#include <asm/uaccess.h> 51#include <asm/uaccess.h>
50#include <asm/processor.h> 52#include <asm/processor.h>
51 53
52extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
53 void __user *buffer, size_t *lenp, loff_t *ppos);
54
55#ifdef CONFIG_X86 54#ifdef CONFIG_X86
56#include <asm/nmi.h> 55#include <asm/nmi.h>
57#include <asm/stacktrace.h> 56#include <asm/stacktrace.h>
@@ -61,6 +60,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
61 60
62/* External variables not in a header file. */ 61/* External variables not in a header file. */
63extern int C_A_D; 62extern int C_A_D;
63extern int print_fatal_signals;
64extern int sysctl_overcommit_memory; 64extern int sysctl_overcommit_memory;
65extern int sysctl_overcommit_ratio; 65extern int sysctl_overcommit_ratio;
66extern int sysctl_panic_on_oom; 66extern int sysctl_panic_on_oom;
@@ -78,6 +78,7 @@ extern int percpu_pagelist_fraction;
78extern int compat_log; 78extern int compat_log;
79extern int maps_protect; 79extern int maps_protect;
80extern int sysctl_stat_interval; 80extern int sysctl_stat_interval;
81extern int audit_argv_kb;
81 82
82/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 83/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
83static int maxolduid = 65535; 84static int maxolduid = 65535;
@@ -160,6 +161,8 @@ extern ctl_table inotify_table[];
160int sysctl_legacy_va_layout; 161int sysctl_legacy_va_layout;
161#endif 162#endif
162 163
164extern int prove_locking;
165extern int lock_stat;
163 166
164/* The default sysctl tables: */ 167/* The default sysctl tables: */
165 168
@@ -202,11 +205,114 @@ static ctl_table root_table[] = {
202 .mode = 0555, 205 .mode = 0555,
203 .child = dev_table, 206 .child = dev_table,
204 }, 207 },
205 208/*
209 * NOTE: do not add new entries to this table unless you have read
210 * Documentation/sysctl/ctl_unnumbered.txt
211 */
206 { .ctl_name = 0 } 212 { .ctl_name = 0 }
207}; 213};
208 214
215#ifdef CONFIG_SCHED_DEBUG
216static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */
217static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */
218static unsigned long min_wakeup_granularity_ns; /* 0 usecs */
219static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */
220#endif
221
209static ctl_table kern_table[] = { 222static ctl_table kern_table[] = {
223#ifdef CONFIG_SCHED_DEBUG
224 {
225 .ctl_name = CTL_UNNUMBERED,
226 .procname = "sched_granularity_ns",
227 .data = &sysctl_sched_granularity,
228 .maxlen = sizeof(unsigned int),
229 .mode = 0644,
230 .proc_handler = &proc_dointvec_minmax,
231 .strategy = &sysctl_intvec,
232 .extra1 = &min_sched_granularity_ns,
233 .extra2 = &max_sched_granularity_ns,
234 },
235 {
236 .ctl_name = CTL_UNNUMBERED,
237 .procname = "sched_wakeup_granularity_ns",
238 .data = &sysctl_sched_wakeup_granularity,
239 .maxlen = sizeof(unsigned int),
240 .mode = 0644,
241 .proc_handler = &proc_dointvec_minmax,
242 .strategy = &sysctl_intvec,
243 .extra1 = &min_wakeup_granularity_ns,
244 .extra2 = &max_wakeup_granularity_ns,
245 },
246 {
247 .ctl_name = CTL_UNNUMBERED,
248 .procname = "sched_batch_wakeup_granularity_ns",
249 .data = &sysctl_sched_batch_wakeup_granularity,
250 .maxlen = sizeof(unsigned int),
251 .mode = 0644,
252 .proc_handler = &proc_dointvec_minmax,
253 .strategy = &sysctl_intvec,
254 .extra1 = &min_wakeup_granularity_ns,
255 .extra2 = &max_wakeup_granularity_ns,
256 },
257 {
258 .ctl_name = CTL_UNNUMBERED,
259 .procname = "sched_stat_granularity_ns",
260 .data = &sysctl_sched_stat_granularity,
261 .maxlen = sizeof(unsigned int),
262 .mode = 0644,
263 .proc_handler = &proc_dointvec_minmax,
264 .strategy = &sysctl_intvec,
265 .extra1 = &min_wakeup_granularity_ns,
266 .extra2 = &max_wakeup_granularity_ns,
267 },
268 {
269 .ctl_name = CTL_UNNUMBERED,
270 .procname = "sched_runtime_limit_ns",
271 .data = &sysctl_sched_runtime_limit,
272 .maxlen = sizeof(unsigned int),
273 .mode = 0644,
274 .proc_handler = &proc_dointvec_minmax,
275 .strategy = &sysctl_intvec,
276 .extra1 = &min_sched_granularity_ns,
277 .extra2 = &max_sched_granularity_ns,
278 },
279 {
280 .ctl_name = CTL_UNNUMBERED,
281 .procname = "sched_child_runs_first",
282 .data = &sysctl_sched_child_runs_first,
283 .maxlen = sizeof(unsigned int),
284 .mode = 0644,
285 .proc_handler = &proc_dointvec,
286 },
287#ifdef CONFIG_PROVE_LOCKING
288 {
289 .ctl_name = CTL_UNNUMBERED,
290 .procname = "prove_locking",
291 .data = &prove_locking,
292 .maxlen = sizeof(int),
293 .mode = 0644,
294 .proc_handler = &proc_dointvec,
295 },
296#endif
297#ifdef CONFIG_LOCK_STAT
298 {
299 .ctl_name = CTL_UNNUMBERED,
300 .procname = "lock_stat",
301 .data = &lock_stat,
302 .maxlen = sizeof(int),
303 .mode = 0644,
304 .proc_handler = &proc_dointvec,
305 },
306#endif
307 {
308 .ctl_name = CTL_UNNUMBERED,
309 .procname = "sched_features",
310 .data = &sysctl_sched_features,
311 .maxlen = sizeof(unsigned int),
312 .mode = 0644,
313 .proc_handler = &proc_dointvec,
314 },
315#endif
210 { 316 {
211 .ctl_name = KERN_PANIC, 317 .ctl_name = KERN_PANIC,
212 .procname = "panic", 318 .procname = "panic",
@@ -223,6 +329,16 @@ static ctl_table kern_table[] = {
223 .mode = 0644, 329 .mode = 0644,
224 .proc_handler = &proc_dointvec, 330 .proc_handler = &proc_dointvec,
225 }, 331 },
332#ifdef CONFIG_AUDITSYSCALL
333 {
334 .ctl_name = CTL_UNNUMBERED,
335 .procname = "audit_argv_kb",
336 .data = &audit_argv_kb,
337 .maxlen = sizeof(int),
338 .mode = 0644,
339 .proc_handler = &proc_dointvec,
340 },
341#endif
226 { 342 {
227 .ctl_name = KERN_CORE_PATTERN, 343 .ctl_name = KERN_CORE_PATTERN,
228 .procname = "core_pattern", 344 .procname = "core_pattern",
@@ -260,6 +376,14 @@ static ctl_table kern_table[] = {
260 .proc_handler = &proc_dointvec, 376 .proc_handler = &proc_dointvec,
261 }, 377 },
262#endif 378#endif
379 {
380 .ctl_name = CTL_UNNUMBERED,
381 .procname = "print-fatal-signals",
382 .data = &print_fatal_signals,
383 .maxlen = sizeof(int),
384 .mode = 0644,
385 .proc_handler = &proc_dointvec,
386 },
263#ifdef __sparc__ 387#ifdef __sparc__
264 { 388 {
265 .ctl_name = KERN_SPARC_REBOOT, 389 .ctl_name = KERN_SPARC_REBOOT,
@@ -569,7 +693,7 @@ static ctl_table kern_table[] = {
569 { 693 {
570 .ctl_name = KERN_ACPI_VIDEO_FLAGS, 694 .ctl_name = KERN_ACPI_VIDEO_FLAGS,
571 .procname = "acpi_video_flags", 695 .procname = "acpi_video_flags",
572 .data = &acpi_video_flags, 696 .data = &acpi_realmode_flags,
573 .maxlen = sizeof (unsigned long), 697 .maxlen = sizeof (unsigned long),
574 .mode = 0644, 698 .mode = 0644,
575 .proc_handler = &proc_doulongvec_minmax, 699 .proc_handler = &proc_doulongvec_minmax,
@@ -615,13 +739,26 @@ static ctl_table kern_table[] = {
615 .proc_handler = &proc_dointvec, 739 .proc_handler = &proc_dointvec,
616 }, 740 },
617#endif 741#endif
618 742 {
743 .ctl_name = CTL_UNNUMBERED,
744 .procname = "poweroff_cmd",
745 .data = &poweroff_cmd,
746 .maxlen = POWEROFF_CMD_PATH_LEN,
747 .mode = 0644,
748 .proc_handler = &proc_dostring,
749 .strategy = &sysctl_string,
750 },
751/*
752 * NOTE: do not add new entries to this table unless you have read
753 * Documentation/sysctl/ctl_unnumbered.txt
754 */
619 { .ctl_name = 0 } 755 { .ctl_name = 0 }
620}; 756};
621 757
622/* Constants for minimum and maximum testing in vm_table. 758/* Constants for minimum and maximum testing in vm_table.
623 We use these as one-element integer vectors. */ 759 We use these as one-element integer vectors. */
624static int zero; 760static int zero;
761static int two = 2;
625static int one_hundred = 100; 762static int one_hundred = 100;
626 763
627 764
@@ -734,6 +871,14 @@ static ctl_table vm_table[] = {
734 .mode = 0644, 871 .mode = 0644,
735 .proc_handler = &proc_dointvec, 872 .proc_handler = &proc_dointvec,
736 }, 873 },
874 {
875 .ctl_name = CTL_UNNUMBERED,
876 .procname = "hugepages_treat_as_movable",
877 .data = &hugepages_treat_as_movable,
878 .maxlen = sizeof(int),
879 .mode = 0644,
880 .proc_handler = &hugetlb_treat_movable_handler,
881 },
737#endif 882#endif
738 { 883 {
739 .ctl_name = VM_LOWMEM_RESERVE_RATIO, 884 .ctl_name = VM_LOWMEM_RESERVE_RATIO,
@@ -869,6 +1014,27 @@ static ctl_table vm_table[] = {
869 .strategy = &sysctl_jiffies, 1014 .strategy = &sysctl_jiffies,
870 }, 1015 },
871#endif 1016#endif
1017#ifdef CONFIG_SECURITY
1018 {
1019 .ctl_name = CTL_UNNUMBERED,
1020 .procname = "mmap_min_addr",
1021 .data = &mmap_min_addr,
1022 .maxlen = sizeof(unsigned long),
1023 .mode = 0644,
1024 .proc_handler = &proc_doulongvec_minmax,
1025 },
1026#ifdef CONFIG_NUMA
1027 {
1028 .ctl_name = CTL_UNNUMBERED,
1029 .procname = "numa_zonelist_order",
1030 .data = &numa_zonelist_order,
1031 .maxlen = NUMA_ZONELIST_ORDER_LEN,
1032 .mode = 0644,
1033 .proc_handler = &numa_zonelist_order_handler,
1034 .strategy = &sysctl_string,
1035 },
1036#endif
1037#endif
872#if defined(CONFIG_X86_32) || \ 1038#if defined(CONFIG_X86_32) || \
873 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1039 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
874 { 1040 {
@@ -882,6 +1048,10 @@ static ctl_table vm_table[] = {
882 .extra1 = &zero, 1048 .extra1 = &zero,
883 }, 1049 },
884#endif 1050#endif
1051/*
1052 * NOTE: do not add new entries to this table unless you have read
1053 * Documentation/sysctl/ctl_unnumbered.txt
1054 */
885 { .ctl_name = 0 } 1055 { .ctl_name = 0 }
886}; 1056};
887 1057
@@ -979,7 +1149,10 @@ static ctl_table fs_table[] = {
979 .data = &lease_break_time, 1149 .data = &lease_break_time,
980 .maxlen = sizeof(int), 1150 .maxlen = sizeof(int),
981 .mode = 0644, 1151 .mode = 0644,
982 .proc_handler = &proc_dointvec, 1152 .proc_handler = &proc_dointvec_minmax,
1153 .strategy = &sysctl_intvec,
1154 .extra1 = &zero,
1155 .extra2 = &two,
983 }, 1156 },
984 { 1157 {
985 .ctl_name = FS_AIO_NR, 1158 .ctl_name = FS_AIO_NR,
@@ -1022,6 +1195,10 @@ static ctl_table fs_table[] = {
1022 .child = binfmt_misc_table, 1195 .child = binfmt_misc_table,
1023 }, 1196 },
1024#endif 1197#endif
1198/*
1199 * NOTE: do not add new entries to this table unless you have read
1200 * Documentation/sysctl/ctl_unnumbered.txt
1201 */
1025 { .ctl_name = 0 } 1202 { .ctl_name = 0 }
1026}; 1203};
1027 1204
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 906cae7715..059431ed67 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -196,6 +196,8 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
196 196
197 /* fill in basic acct fields */ 197 /* fill in basic acct fields */
198 stats->version = TASKSTATS_VERSION; 198 stats->version = TASKSTATS_VERSION;
199 stats->nvcsw = tsk->nvcsw;
200 stats->nivcsw = tsk->nivcsw;
199 bacct_add_tsk(stats, tsk); 201 bacct_add_tsk(stats, tsk);
200 202
201 /* fill in extended acct fields */ 203 /* fill in extended acct fields */
@@ -242,6 +244,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
242 */ 244 */
243 delayacct_add_tsk(stats, tsk); 245 delayacct_add_tsk(stats, tsk);
244 246
247 stats->nvcsw += tsk->nvcsw;
248 stats->nivcsw += tsk->nivcsw;
245 } while_each_thread(first, tsk); 249 } while_each_thread(first, tsk);
246 250
247 unlock_task_sighand(first, &flags); 251 unlock_task_sighand(first, &flags);
diff --git a/kernel/time.c b/kernel/time.c
index f04791f694..5b81da08bb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -58,9 +58,9 @@ EXPORT_SYMBOL(sys_tz);
58asmlinkage long sys_time(time_t __user * tloc) 58asmlinkage long sys_time(time_t __user * tloc)
59{ 59{
60 time_t i; 60 time_t i;
61 struct timeval tv; 61 struct timespec tv;
62 62
63 do_gettimeofday(&tv); 63 getnstimeofday(&tv);
64 i = tv.tv_sec; 64 i = tv.tv_sec;
65 65
66 if (tloc) { 66 if (tloc) {
@@ -133,7 +133,6 @@ static inline void warp_clock(void)
133 write_seqlock_irq(&xtime_lock); 133 write_seqlock_irq(&xtime_lock);
134 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 134 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
135 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 135 xtime.tv_sec += sys_tz.tz_minuteswest * 60;
136 time_interpolator_reset();
137 write_sequnlock_irq(&xtime_lock); 136 write_sequnlock_irq(&xtime_lock);
138 clock_was_set(); 137 clock_was_set();
139} 138}
@@ -306,79 +305,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
306} 305}
307EXPORT_SYMBOL(timespec_trunc); 306EXPORT_SYMBOL(timespec_trunc);
308 307
309#ifdef CONFIG_TIME_INTERPOLATION
310void getnstimeofday (struct timespec *tv)
311{
312 unsigned long seq,sec,nsec;
313
314 do {
315 seq = read_seqbegin(&xtime_lock);
316 sec = xtime.tv_sec;
317 nsec = xtime.tv_nsec+time_interpolator_get_offset();
318 } while (unlikely(read_seqretry(&xtime_lock, seq)));
319
320 while (unlikely(nsec >= NSEC_PER_SEC)) {
321 nsec -= NSEC_PER_SEC;
322 ++sec;
323 }
324 tv->tv_sec = sec;
325 tv->tv_nsec = nsec;
326}
327EXPORT_SYMBOL_GPL(getnstimeofday);
328
329int do_settimeofday (struct timespec *tv)
330{
331 time_t wtm_sec, sec = tv->tv_sec;
332 long wtm_nsec, nsec = tv->tv_nsec;
333
334 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
335 return -EINVAL;
336
337 write_seqlock_irq(&xtime_lock);
338 {
339 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
340 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
341
342 set_normalized_timespec(&xtime, sec, nsec);
343 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
344
345 time_adjust = 0; /* stop active adjtime() */
346 time_status |= STA_UNSYNC;
347 time_maxerror = NTP_PHASE_LIMIT;
348 time_esterror = NTP_PHASE_LIMIT;
349 time_interpolator_reset();
350 }
351 write_sequnlock_irq(&xtime_lock);
352 clock_was_set();
353 return 0;
354}
355EXPORT_SYMBOL(do_settimeofday);
356
357void do_gettimeofday (struct timeval *tv)
358{
359 unsigned long seq, nsec, usec, sec, offset;
360 do {
361 seq = read_seqbegin(&xtime_lock);
362 offset = time_interpolator_get_offset();
363 sec = xtime.tv_sec;
364 nsec = xtime.tv_nsec;
365 } while (unlikely(read_seqretry(&xtime_lock, seq)));
366
367 usec = (nsec + offset) / 1000;
368
369 while (unlikely(usec >= USEC_PER_SEC)) {
370 usec -= USEC_PER_SEC;
371 ++sec;
372 }
373
374 tv->tv_sec = sec;
375 tv->tv_usec = usec;
376}
377
378EXPORT_SYMBOL(do_gettimeofday);
379
380
381#else
382#ifndef CONFIG_GENERIC_TIME 308#ifndef CONFIG_GENERIC_TIME
383/* 309/*
384 * Simulate gettimeofday using do_gettimeofday which only allows a timeval 310 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
@@ -394,7 +320,6 @@ void getnstimeofday(struct timespec *tv)
394} 320}
395EXPORT_SYMBOL_GPL(getnstimeofday); 321EXPORT_SYMBOL_GPL(getnstimeofday);
396#endif 322#endif
397#endif
398 323
399/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 324/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
400 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 325 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 76212b2a99..2ad1c37b8d 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -205,47 +205,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
205} 205}
206 206
207/** 207/**
208 * clockevents_request_device
209 */
210struct clock_event_device *clockevents_request_device(unsigned int features,
211 cpumask_t cpumask)
212{
213 struct clock_event_device *cur, *dev = NULL;
214 struct list_head *tmp;
215
216 spin_lock(&clockevents_lock);
217
218 list_for_each(tmp, &clockevent_devices) {
219 cur = list_entry(tmp, struct clock_event_device, list);
220
221 if ((cur->features & features) == features &&
222 cpus_equal(cpumask, cur->cpumask)) {
223 if (!dev || dev->rating < cur->rating)
224 dev = cur;
225 }
226 }
227
228 clockevents_exchange_device(NULL, dev);
229
230 spin_unlock(&clockevents_lock);
231
232 return dev;
233}
234
235/**
236 * clockevents_release_device
237 */
238void clockevents_release_device(struct clock_event_device *dev)
239{
240 spin_lock(&clockevents_lock);
241
242 clockevents_exchange_device(dev, NULL);
243 clockevents_notify_released();
244
245 spin_unlock(&clockevents_lock);
246}
247
248/**
249 * clockevents_notify - notification about relevant events 208 * clockevents_notify - notification about relevant events
250 */ 209 */
251void clockevents_notify(unsigned long reason, void *arg) 210void clockevents_notify(unsigned long reason, void *arg)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 87aa5ff931..cd91237dbf 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -10,10 +10,11 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/timer.h>
13#include <linux/timex.h> 14#include <linux/timex.h>
14#include <linux/jiffies.h> 15#include <linux/jiffies.h>
15#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
16 17#include <linux/capability.h>
17#include <asm/div64.h> 18#include <asm/div64.h>
18#include <asm/timex.h> 19#include <asm/timex.h>
19 20
@@ -116,13 +117,7 @@ void second_overflow(void)
116 if (xtime.tv_sec % 86400 == 0) { 117 if (xtime.tv_sec % 86400 == 0) {
117 xtime.tv_sec--; 118 xtime.tv_sec--;
118 wall_to_monotonic.tv_sec++; 119 wall_to_monotonic.tv_sec++;
119 /*
120 * The timer interpolator will make time change
121 * gradually instead of an immediate jump by one second
122 */
123 time_interpolator_update(-NSEC_PER_SEC);
124 time_state = TIME_OOP; 120 time_state = TIME_OOP;
125 clock_was_set();
126 printk(KERN_NOTICE "Clock: inserting leap second " 121 printk(KERN_NOTICE "Clock: inserting leap second "
127 "23:59:60 UTC\n"); 122 "23:59:60 UTC\n");
128 } 123 }
@@ -131,13 +126,7 @@ void second_overflow(void)
131 if ((xtime.tv_sec + 1) % 86400 == 0) { 126 if ((xtime.tv_sec + 1) % 86400 == 0) {
132 xtime.tv_sec++; 127 xtime.tv_sec++;
133 wall_to_monotonic.tv_sec--; 128 wall_to_monotonic.tv_sec--;
134 /*
135 * Use of time interpolator for a gradual change of
136 * time
137 */
138 time_interpolator_update(NSEC_PER_SEC);
139 time_state = TIME_WAIT; 129 time_state = TIME_WAIT;
140 clock_was_set();
141 printk(KERN_NOTICE "Clock: deleting leap second " 130 printk(KERN_NOTICE "Clock: deleting leap second "
142 "23:59:59 UTC\n"); 131 "23:59:59 UTC\n");
143 } 132 }
@@ -187,12 +176,64 @@ u64 current_tick_length(void)
187 return tick_length; 176 return tick_length;
188} 177}
189 178
179#ifdef CONFIG_GENERIC_CMOS_UPDATE
180
181/* Disable the cmos update - used by virtualization and embedded */
182int no_sync_cmos_clock __read_mostly;
183
184static void sync_cmos_clock(unsigned long dummy);
185
186static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
190 187
191void __attribute__ ((weak)) notify_arch_cmos_timer(void) 188static void sync_cmos_clock(unsigned long dummy)
192{ 189{
193 return; 190 struct timespec now, next;
191 int fail = 1;
192
193 /*
194 * If we have an externally synchronized Linux clock, then update
195 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
196 * called as close as possible to 500 ms before the new second starts.
197 * This code is run on a timer. If the clock is set, that timer
198 * may not expire at the correct time. Thus, we adjust...
199 */
200 if (!ntp_synced())
201 /*
202 * Not synced, exit, do not restart a timer (if one is
203 * running, let it run out).
204 */
205 return;
206
207 getnstimeofday(&now);
208 if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
209 fail = update_persistent_clock(now);
210
211 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
212 if (next.tv_nsec <= 0)
213 next.tv_nsec += NSEC_PER_SEC;
214
215 if (!fail)
216 next.tv_sec = 659;
217 else
218 next.tv_sec = 0;
219
220 if (next.tv_nsec >= NSEC_PER_SEC) {
221 next.tv_sec++;
222 next.tv_nsec -= NSEC_PER_SEC;
223 }
224 mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next));
194} 225}
195 226
227static void notify_cmos_timer(void)
228{
229 if (!no_sync_cmos_clock)
230 mod_timer(&sync_cmos_timer, jiffies + 1);
231}
232
233#else
234static inline void notify_cmos_timer(void) { }
235#endif
236
196/* adjtimex mainly allows reading (and writing, if superuser) of 237/* adjtimex mainly allows reading (and writing, if superuser) of
197 * kernel time-keeping variables. used by xntpd. 238 * kernel time-keeping variables. used by xntpd.
198 */ 239 */
@@ -357,6 +398,6 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
357 txc->stbcnt = 0; 398 txc->stbcnt = 0;
358 write_sequnlock_irq(&xtime_lock); 399 write_sequnlock_irq(&xtime_lock);
359 do_gettimeofday(&txc->time); 400 do_gettimeofday(&txc->time);
360 notify_arch_cmos_timer(); 401 notify_cmos_timer();
361 return(result); 402 return(result);
362} 403}
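The deadline arithmetic in sync_cmos_clock() is easy to sanity-check in isolation. The sketch below (plain user-space C, not part of the patch) reproduces it and shows that a run 0.8 s past a second boundary is rescheduled 659.7 s later, landing at the next half-second point as the comment requires.

/* Sketch of the rescheduling arithmetic used by sync_cmos_clock(): aim
 * roughly half a second before a second boundary, 659 s later on a
 * successful update, at the next boundary otherwise. */
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

static struct timespec next_cmos_deadline(struct timespec now, int updated)
{
	struct timespec next;

	next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
	if (next.tv_nsec <= 0)
		next.tv_nsec += NSEC_PER_SEC;

	next.tv_sec = updated ? 659 : 0;
	if (next.tv_nsec >= NSEC_PER_SEC) {
		next.tv_sec++;
		next.tv_nsec -= NSEC_PER_SEC;
	}
	return next;
}

int main(void)
{
	struct timespec now = { 0, 800000000L };	/* 0.8 s into a second */
	struct timespec d = next_cmos_deadline(now, 1);

	/* relative delay 659.7 s -> lands at the half-second mark */
	printf("delay: %ld.%09ld s\n", (long)d.tv_sec, d.tv_nsec);
	return 0;
}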
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 8001d37071..db8e0f3d40 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -31,6 +31,12 @@ struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask; 31static cpumask_t tick_broadcast_mask;
32static DEFINE_SPINLOCK(tick_broadcast_lock); 32static DEFINE_SPINLOCK(tick_broadcast_lock);
33 33
34#ifdef CONFIG_TICK_ONESHOT
35static void tick_broadcast_clear_oneshot(int cpu);
36#else
37static inline void tick_broadcast_clear_oneshot(int cpu) { }
38#endif
39
34/* 40/*
35 * Debugging: see timer_list.c 41 * Debugging: see timer_list.c
36 */ 42 */
@@ -49,7 +55,7 @@ cpumask_t *tick_get_broadcast_mask(void)
49 */ 55 */
50static void tick_broadcast_start_periodic(struct clock_event_device *bc) 56static void tick_broadcast_start_periodic(struct clock_event_device *bc)
51{ 57{
52 if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) 58 if (bc)
53 tick_setup_periodic(bc, 1); 59 tick_setup_periodic(bc, 1);
54} 60}
55 61
@@ -99,8 +105,19 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
99 cpu_set(cpu, tick_broadcast_mask); 105 cpu_set(cpu, tick_broadcast_mask);
100 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 106 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
101 ret = 1; 107 ret = 1;
102 } 108 } else {
109 /*
110 * When the new device is not affected by the stop
111 * feature and the cpu is marked in the broadcast mask
112 * then clear the broadcast bit.
113 */
114 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
115 int cpu = smp_processor_id();
103 116
117 cpu_clear(cpu, tick_broadcast_mask);
118 tick_broadcast_clear_oneshot(cpu);
119 }
120 }
104 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 121 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
105 return ret; 122 return ret;
106} 123}
@@ -299,7 +316,7 @@ void tick_suspend_broadcast(void)
299 spin_lock_irqsave(&tick_broadcast_lock, flags); 316 spin_lock_irqsave(&tick_broadcast_lock, flags);
300 317
301 bc = tick_broadcast_device.evtdev; 318 bc = tick_broadcast_device.evtdev;
302 if (bc && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 319 if (bc)
303 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 320 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
304 321
305 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 322 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -316,6 +333,8 @@ int tick_resume_broadcast(void)
316 bc = tick_broadcast_device.evtdev; 333 bc = tick_broadcast_device.evtdev;
317 334
318 if (bc) { 335 if (bc) {
336 clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);
337
319 switch (tick_broadcast_device.mode) { 338 switch (tick_broadcast_device.mode) {
320 case TICKDEV_MODE_PERIODIC: 339 case TICKDEV_MODE_PERIODIC:
321 if(!cpus_empty(tick_broadcast_mask)) 340 if(!cpus_empty(tick_broadcast_mask))
@@ -485,6 +504,16 @@ out:
485 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 504 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
486} 505}
487 506
507/*
508 * Reset the one shot broadcast for a cpu
509 *
510 * Called with tick_broadcast_lock held
511 */
512static void tick_broadcast_clear_oneshot(int cpu)
513{
514 cpu_clear(cpu, tick_broadcast_oneshot_mask);
515}
516
488/** 517/**
489 * tick_broadcast_setup_highres - setup the broadcast device for highres 518 * tick_broadcast_setup_highres - setup the broadcast device for highres
490 */ 519 */
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index a96ec9ab34..77a21abc87 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -318,12 +318,17 @@ static void tick_resume(void)
318{ 318{
319 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 319 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
320 unsigned long flags; 320 unsigned long flags;
321 int broadcast = tick_resume_broadcast();
321 322
322 spin_lock_irqsave(&tick_device_lock, flags); 323 spin_lock_irqsave(&tick_device_lock, flags);
323 if (td->mode == TICKDEV_MODE_PERIODIC) 324 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
324 tick_setup_periodic(td->evtdev, 0); 325
325 else 326 if (!broadcast) {
326 tick_resume_oneshot(); 327 if (td->mode == TICKDEV_MODE_PERIODIC)
328 tick_setup_periodic(td->evtdev, 0);
329 else
330 tick_resume_oneshot();
331 }
327 spin_unlock_irqrestore(&tick_device_lock, flags); 332 spin_unlock_irqrestore(&tick_device_lock, flags);
328} 333}
329 334
@@ -360,8 +365,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
360 break; 365 break;
361 366
362 case CLOCK_EVT_NOTIFY_RESUME: 367 case CLOCK_EVT_NOTIFY_RESUME:
363 if (!tick_resume_broadcast()) 368 tick_resume();
364 tick_resume();
365 break; 369 break;
366 370
367 default: 371 default:
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index f6997ab0c3..0258d3115d 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -73,8 +73,21 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
73 struct clock_event_device *dev = td->evtdev; 73 struct clock_event_device *dev = td->evtdev;
74 74
75 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || 75 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
76 !tick_device_is_functional(dev)) 76 !tick_device_is_functional(dev)) {
77
78 printk(KERN_INFO "Clockevents: "
79 "could not switch to one-shot mode:");
80 if (!dev) {
81 printk(" no tick device\n");
82 } else {
83 if (!tick_device_is_functional(dev))
84 printk(" %s is not functional.\n", dev->name);
85 else
86 printk(" %s does not support one-shot mode.\n",
87 dev->name);
88 }
77 return -EINVAL; 89 return -EINVAL;
90 }
78 91
79 td->mode = TICKDEV_MODE_ONESHOT; 92 td->mode = TICKDEV_MODE_ONESHOT;
80 dev->event_handler = handler; 93 dev->event_handler = handler;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 52db9e3c52..b416995b97 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -546,6 +546,7 @@ void tick_setup_sched_timer(void)
546{ 546{
547 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 547 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
548 ktime_t now = ktime_get(); 548 ktime_t now = ktime_get();
549 u64 offset;
549 550
550 /* 551 /*
551 * Emulate tick processing via per-CPU hrtimers: 552 * Emulate tick processing via per-CPU hrtimers:
@@ -554,8 +555,12 @@ void tick_setup_sched_timer(void)
554 ts->sched_timer.function = tick_sched_timer; 555 ts->sched_timer.function = tick_sched_timer;
555 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 556 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
556 557
557 /* Get the next period */ 558 /* Get the next period (per cpu) */
558 ts->sched_timer.expires = tick_init_jiffy_update(); 559 ts->sched_timer.expires = tick_init_jiffy_update();
560 offset = ktime_to_ns(tick_period) >> 1;
561 do_div(offset, NR_CPUS);
562 offset *= smp_processor_id();
563 ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset);
559 564
560 for (;;) { 565 for (;;) {
561 hrtimer_forward(&ts->sched_timer, now, tick_period); 566 hrtimer_forward(&ts->sched_timer, now, tick_period);
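
Note: the stagger added to tick_setup_sched_timer() above spreads each CPU's first sched-timer expiry across half a tick period, i.e. offset = (tick_period / 2) / NR_CPUS * smp_processor_id(). A minimal userspace sketch of that arithmetic; the values HZ=250 and NR_CPUS=64 are illustrative and not taken from this patch:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t tick_period_ns = 4000000;	/* illustrative: HZ=250 -> 4 ms tick */
	unsigned int nr_cpus = 64;		/* illustrative NR_CPUS */

	for (unsigned int cpu = 0; cpu < 4; cpu++) {
		/* same computation as the hunk: half a period, divided by
		 * NR_CPUS, scaled by the CPU number */
		uint64_t offset = (tick_period_ns >> 1) / nr_cpus * cpu;
		printf("cpu%u: first expiry staggered by %llu ns\n",
		       cpu, (unsigned long long)offset);
	}
	return 0;
}
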
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3d1042f82a..88c81026e0 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -36,9 +36,17 @@ EXPORT_SYMBOL(xtime_lock);
36 * at zero at system boot time, so wall_to_monotonic will be negative, 36 * at zero at system boot time, so wall_to_monotonic will be negative,
37 * however, we will ALWAYS keep the tv_nsec part positive so we can use 37 * however, we will ALWAYS keep the tv_nsec part positive so we can use
38 * the usual normalization. 38 * the usual normalization.
39 *
40 * wall_to_monotonic is moved after resume from suspend for the monotonic
41 * time not to jump. We need to add total_sleep_time to wall_to_monotonic
42 * to get the real boot based time offset.
43 *
44 * - wall_to_monotonic is no longer the boot time, getboottime must be
45 * used instead.
39 */ 46 */
40struct timespec xtime __attribute__ ((aligned (16))); 47struct timespec xtime __attribute__ ((aligned (16)));
41struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 48struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
49static unsigned long total_sleep_time; /* seconds */
42 50
43EXPORT_SYMBOL(xtime); 51EXPORT_SYMBOL(xtime);
44 52
@@ -251,6 +259,7 @@ void __init timekeeping_init(void)
251 xtime.tv_nsec = 0; 259 xtime.tv_nsec = 0;
252 set_normalized_timespec(&wall_to_monotonic, 260 set_normalized_timespec(&wall_to_monotonic,
253 -xtime.tv_sec, -xtime.tv_nsec); 261 -xtime.tv_sec, -xtime.tv_nsec);
262 total_sleep_time = 0;
254 263
255 write_sequnlock_irqrestore(&xtime_lock, flags); 264 write_sequnlock_irqrestore(&xtime_lock, flags);
256} 265}
@@ -282,6 +291,7 @@ static int timekeeping_resume(struct sys_device *dev)
282 291
283 xtime.tv_sec += sleep_length; 292 xtime.tv_sec += sleep_length;
284 wall_to_monotonic.tv_sec -= sleep_length; 293 wall_to_monotonic.tv_sec -= sleep_length;
294 total_sleep_time += sleep_length;
285 } 295 }
286 /* re-base the last cycle value */ 296 /* re-base the last cycle value */
287 clock->cycle_last = clocksource_read(clock); 297 clock->cycle_last = clocksource_read(clock);
@@ -391,7 +401,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
391 * this is optimized for the most common adjustments of -1,0,1, 401 * this is optimized for the most common adjustments of -1,0,1,
392 * for other values we can do a bit more work. 402 * for other values we can do a bit more work.
393 */ 403 */
394static void clocksource_adjust(struct clocksource *clock, s64 offset) 404static void clocksource_adjust(s64 offset)
395{ 405{
396 s64 error, interval = clock->cycle_interval; 406 s64 error, interval = clock->cycle_interval;
397 int adj; 407 int adj;
@@ -456,17 +466,13 @@ void update_wall_time(void)
456 second_overflow(); 466 second_overflow();
457 } 467 }
458 468
459 /* interpolator bits */
460 time_interpolator_update(clock->xtime_interval
461 >> clock->shift);
462
463 /* accumulate error between NTP and clock interval */ 469 /* accumulate error between NTP and clock interval */
464 clock->error += current_tick_length(); 470 clock->error += current_tick_length();
465 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); 471 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
466 } 472 }
467 473
468 /* correct the clock when NTP error is too big */ 474 /* correct the clock when NTP error is too big */
469 clocksource_adjust(clock, offset); 475 clocksource_adjust(offset);
470 476
471 /* store full nanoseconds into xtime */ 477 /* store full nanoseconds into xtime */
472 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; 478 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
@@ -476,3 +482,30 @@ void update_wall_time(void)
476 change_clocksource(); 482 change_clocksource();
477 update_vsyscall(&xtime, clock); 483 update_vsyscall(&xtime, clock);
478} 484}
485
486/**
487 * getboottime - Return the real time of system boot.
488 * @ts: pointer to the timespec to be set
489 *
490 * Returns the time of day in a timespec.
491 *
492 * This is based on the wall_to_monotonic offset and the total suspend
493 * time. Calls to settimeofday will affect the value returned (which
494 * basically means that however wrong your real time clock is at boot time,
495 * you get the right time here).
496 */
497void getboottime(struct timespec *ts)
498{
499 set_normalized_timespec(ts,
500 - (wall_to_monotonic.tv_sec + total_sleep_time),
501 - wall_to_monotonic.tv_nsec);
502}
503
504/**
505 * monotonic_to_bootbased - Convert the monotonic time to boot based.
506 * @ts: pointer to the timespec to be converted
507 */
508void monotonic_to_bootbased(struct timespec *ts)
509{
510 ts->tv_sec += total_sleep_time;
511}
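
Note: a self-contained sketch of the relationship the two new helpers encode. Boot time is the negated wall_to_monotonic offset minus total_sleep_time, and a boot-based reading is a monotonic reading with the suspended time added back. All numeric values are made up, and the nanosecond normalization done by set_normalized_timespec() is omitted for brevity:

#include <stdio.h>
#include <time.h>

int main(void)
{
	/* illustrative values, not taken from a real system */
	struct timespec wall_to_monotonic = { .tv_sec = -1184000000, .tv_nsec = 0 };
	long total_sleep_time = 120;		/* seconds spent in suspend */

	/* getboottime(): real time of system boot */
	struct timespec boot = {
		.tv_sec  = -(wall_to_monotonic.tv_sec + total_sleep_time),
		.tv_nsec = -wall_to_monotonic.tv_nsec,
	};
	printf("boot time: %ld s since the epoch\n", (long)boot.tv_sec);

	/* monotonic_to_bootbased(): add suspend time back into a monotonic reading */
	struct timespec uptime = { .tv_sec = 3600, .tv_nsec = 0 };
	uptime.tv_sec += total_sleep_time;
	printf("boot-based uptime: %ld s\n", (long)uptime.tv_sec);
	return 0;
}
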
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 8bbcfb77f7..e5edc3a22a 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -38,7 +38,7 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
38 38
39static void print_name_offset(struct seq_file *m, void *sym) 39static void print_name_offset(struct seq_file *m, void *sym)
40{ 40{
41 char symname[KSYM_NAME_LEN+1]; 41 char symname[KSYM_NAME_LEN];
42 42
43 if (lookup_symbol_name((unsigned long)sym, symname) < 0) 43 if (lookup_symbol_name((unsigned long)sym, symname) < 0)
44 SEQ_printf(m, "<%p>", sym); 44 SEQ_printf(m, "<%p>", sym);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 321693724a..8ed62fda16 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -68,6 +68,7 @@ struct entry {
68 * Number of timeout events: 68 * Number of timeout events:
69 */ 69 */
70 unsigned long count; 70 unsigned long count;
71 unsigned int timer_flag;
71 72
72 /* 73 /*
73 * We save the command-line string to preserve 74 * We save the command-line string to preserve
@@ -231,7 +232,8 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
231 * incremented. Otherwise the timer is registered in a free slot. 232 * incremented. Otherwise the timer is registered in a free slot.
232 */ 233 */
233void timer_stats_update_stats(void *timer, pid_t pid, void *startf, 234void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
234 void *timerf, char * comm) 235 void *timerf, char *comm,
236 unsigned int timer_flag)
235{ 237{
236 /* 238 /*
237 * It doesnt matter which lock we take: 239 * It doesnt matter which lock we take:
@@ -249,6 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
249 input.start_func = startf; 251 input.start_func = startf;
250 input.expire_func = timerf; 252 input.expire_func = timerf;
251 input.pid = pid; 253 input.pid = pid;
254 input.timer_flag = timer_flag;
252 255
253 spin_lock_irqsave(lock, flags); 256 spin_lock_irqsave(lock, flags);
254 if (!active) 257 if (!active)
@@ -266,7 +269,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
266 269
267static void print_name_offset(struct seq_file *m, unsigned long addr) 270static void print_name_offset(struct seq_file *m, unsigned long addr)
268{ 271{
269 char symname[KSYM_NAME_LEN+1]; 272 char symname[KSYM_NAME_LEN];
270 273
271 if (lookup_symbol_name(addr, symname) < 0) 274 if (lookup_symbol_name(addr, symname) < 0)
272 seq_printf(m, "<%p>", (void *)addr); 275 seq_printf(m, "<%p>", (void *)addr);
@@ -295,7 +298,7 @@ static int tstats_show(struct seq_file *m, void *v)
295 period = ktime_to_timespec(time); 298 period = ktime_to_timespec(time);
296 ms = period.tv_nsec / 1000000; 299 ms = period.tv_nsec / 1000000;
297 300
298 seq_puts(m, "Timer Stats Version: v0.1\n"); 301 seq_puts(m, "Timer Stats Version: v0.2\n");
299 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); 302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
300 if (atomic_read(&overflow_count)) 303 if (atomic_read(&overflow_count))
301 seq_printf(m, "Overflow: %d entries\n", 304 seq_printf(m, "Overflow: %d entries\n",
@@ -303,8 +306,13 @@ static int tstats_show(struct seq_file *m, void *v)
303 306
304 for (i = 0; i < nr_entries; i++) { 307 for (i = 0; i < nr_entries; i++) {
305 entry = entries + i; 308 entry = entries + i;
306 seq_printf(m, "%4lu, %5d %-16s ", 309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
310 seq_printf(m, "%4luD, %5d %-16s ",
307 entry->count, entry->pid, entry->comm); 311 entry->count, entry->pid, entry->comm);
312 } else {
313 seq_printf(m, " %4lu, %5d %-16s ",
314 entry->count, entry->pid, entry->comm);
315 }
308 316
309 print_name_offset(m, (unsigned long)entry->start_func); 317 print_name_offset(m, (unsigned long)entry->start_func);
310 seq_puts(m, " ("); 318 seq_puts(m, " (");
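
Note: a tiny runnable sketch of the v0.2 output change above, where deferrable timers get a "D" suffix on the event count. Only the two format strings come from the hunk; the flag value is a local stand-in, the counts, PIDs and comm names are invented, and the function/symbol columns are elided as "...":

#include <stdio.h>

#define TIMER_STATS_FLAG_DEFERRABLE 0x1	/* local stand-in for the kernel flag */

static void show(unsigned long count, int pid, const char *comm, unsigned int flag)
{
	if (flag & TIMER_STATS_FLAG_DEFERRABLE)
		printf("%4luD, %5d %-16s ...\n", count, pid, comm);
	else
		printf(" %4lu, %5d %-16s ...\n", count, pid, comm);
}

int main(void)
{
	show(15, 1, "init", 0);
	show(4, 0, "swapper", TIMER_STATS_FLAG_DEFERRABLE);
	return 0;
}
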
diff --git a/kernel/timer.c b/kernel/timer.c
index 1a69705c2f..6ce1952eea 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -103,14 +103,14 @@ static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
103static inline void timer_set_deferrable(struct timer_list *timer) 103static inline void timer_set_deferrable(struct timer_list *timer)
104{ 104{
105 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | 105 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
106 TBASE_DEFERRABLE_FLAG)); 106 TBASE_DEFERRABLE_FLAG));
107} 107}
108 108
109static inline void 109static inline void
110timer_set_base(struct timer_list *timer, tvec_base_t *new_base) 110timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
111{ 111{
112 timer->base = (tvec_base_t *)((unsigned long)(new_base) | 112 timer->base = (tvec_base_t *)((unsigned long)(new_base) |
113 tbase_get_deferrable(timer->base)); 113 tbase_get_deferrable(timer->base));
114} 114}
115 115
116/** 116/**
@@ -305,6 +305,20 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
305 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); 305 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
306 timer->start_pid = current->pid; 306 timer->start_pid = current->pid;
307} 307}
308
309static void timer_stats_account_timer(struct timer_list *timer)
310{
311 unsigned int flag = 0;
312
313 if (unlikely(tbase_get_deferrable(timer->base)))
314 flag |= TIMER_STATS_FLAG_DEFERRABLE;
315
316 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
317 timer->function, timer->start_comm, flag);
318}
319
320#else
321static void timer_stats_account_timer(struct timer_list *timer) {}
308#endif 322#endif
309 323
310/** 324/**
@@ -431,10 +445,10 @@ EXPORT_SYMBOL(__mod_timer);
431void add_timer_on(struct timer_list *timer, int cpu) 445void add_timer_on(struct timer_list *timer, int cpu)
432{ 446{
433 tvec_base_t *base = per_cpu(tvec_bases, cpu); 447 tvec_base_t *base = per_cpu(tvec_bases, cpu);
434 unsigned long flags; 448 unsigned long flags;
435 449
436 timer_stats_timer_set_start_info(timer); 450 timer_stats_timer_set_start_info(timer);
437 BUG_ON(timer_pending(timer) || !timer->function); 451 BUG_ON(timer_pending(timer) || !timer->function);
438 spin_lock_irqsave(&base->lock, flags); 452 spin_lock_irqsave(&base->lock, flags);
439 timer_set_base(timer, base); 453 timer_set_base(timer, base);
440 internal_add_timer(base, timer); 454 internal_add_timer(base, timer);
@@ -613,7 +627,7 @@ static inline void __run_timers(tvec_base_t *base)
613 while (time_after_eq(jiffies, base->timer_jiffies)) { 627 while (time_after_eq(jiffies, base->timer_jiffies)) {
614 struct list_head work_list; 628 struct list_head work_list;
615 struct list_head *head = &work_list; 629 struct list_head *head = &work_list;
616 int index = base->timer_jiffies & TVR_MASK; 630 int index = base->timer_jiffies & TVR_MASK;
617 631
618 /* 632 /*
619 * Cascade timers: 633 * Cascade timers:
@@ -630,8 +644,8 @@ static inline void __run_timers(tvec_base_t *base)
630 unsigned long data; 644 unsigned long data;
631 645
632 timer = list_first_entry(head, struct timer_list,entry); 646 timer = list_first_entry(head, struct timer_list,entry);
633 fn = timer->function; 647 fn = timer->function;
634 data = timer->data; 648 data = timer->data;
635 649
636 timer_stats_account_timer(timer); 650 timer_stats_account_timer(timer);
637 651
@@ -675,8 +689,8 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base)
675 index = slot = timer_jiffies & TVR_MASK; 689 index = slot = timer_jiffies & TVR_MASK;
676 do { 690 do {
677 list_for_each_entry(nte, base->tv1.vec + slot, entry) { 691 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
678 if (tbase_get_deferrable(nte->base)) 692 if (tbase_get_deferrable(nte->base))
679 continue; 693 continue;
680 694
681 found = 1; 695 found = 1;
682 expires = nte->expires; 696 expires = nte->expires;
@@ -820,7 +834,7 @@ void update_process_times(int user_tick)
820 if (rcu_pending(cpu)) 834 if (rcu_pending(cpu))
821 rcu_check_callbacks(cpu, user_tick); 835 rcu_check_callbacks(cpu, user_tick);
822 scheduler_tick(); 836 scheduler_tick();
823 run_posix_cpu_timers(p); 837 run_posix_cpu_timers(p);
824} 838}
825 839
826/* 840/*
@@ -895,7 +909,7 @@ static inline void update_times(unsigned long ticks)
895 update_wall_time(); 909 update_wall_time();
896 calc_load(ticks); 910 calc_load(ticks);
897} 911}
898 912
899/* 913/*
900 * The 64-bit jiffies value is not atomic - you MUST NOT read it 914 * The 64-bit jiffies value is not atomic - you MUST NOT read it
901 * without sampling the sequence number in xtime_lock. 915 * without sampling the sequence number in xtime_lock.
@@ -1091,7 +1105,7 @@ asmlinkage long sys_gettid(void)
1091/** 1105/**
1092 * do_sysinfo - fill in sysinfo struct 1106 * do_sysinfo - fill in sysinfo struct
1093 * @info: pointer to buffer to fill 1107 * @info: pointer to buffer to fill
1094 */ 1108 */
1095int do_sysinfo(struct sysinfo *info) 1109int do_sysinfo(struct sysinfo *info)
1096{ 1110{
1097 unsigned long mem_total, sav_total; 1111 unsigned long mem_total, sav_total;
@@ -1114,6 +1128,7 @@ int do_sysinfo(struct sysinfo *info)
1114 getnstimeofday(&tp); 1128 getnstimeofday(&tp);
1115 tp.tv_sec += wall_to_monotonic.tv_sec; 1129 tp.tv_sec += wall_to_monotonic.tv_sec;
1116 tp.tv_nsec += wall_to_monotonic.tv_nsec; 1130 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1131 monotonic_to_bootbased(&tp);
1117 if (tp.tv_nsec - NSEC_PER_SEC >= 0) { 1132 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1118 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; 1133 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1119 tp.tv_sec++; 1134 tp.tv_sec++;
@@ -1206,7 +1221,8 @@ static int __devinit init_timers_cpu(int cpu)
1206 /* 1221 /*
1207 * The APs use this path later in boot 1222 * The APs use this path later in boot
1208 */ 1223 */
1209 base = kmalloc_node(sizeof(*base), GFP_KERNEL, 1224 base = kmalloc_node(sizeof(*base),
1225 GFP_KERNEL | __GFP_ZERO,
1210 cpu_to_node(cpu)); 1226 cpu_to_node(cpu));
1211 if (!base) 1227 if (!base)
1212 return -ENOMEM; 1228 return -ENOMEM;
@@ -1217,7 +1233,6 @@ static int __devinit init_timers_cpu(int cpu)
1217 kfree(base); 1233 kfree(base);
1218 return -ENOMEM; 1234 return -ENOMEM;
1219 } 1235 }
1220 memset(base, 0, sizeof(*base));
1221 per_cpu(tvec_bases, cpu) = base; 1236 per_cpu(tvec_bases, cpu) = base;
1222 } else { 1237 } else {
1223 /* 1238 /*
@@ -1334,194 +1349,6 @@ void __init init_timers(void)
1334 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1349 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
1335} 1350}
1336 1351
1337#ifdef CONFIG_TIME_INTERPOLATION
1338
1339struct time_interpolator *time_interpolator __read_mostly;
1340static struct time_interpolator *time_interpolator_list __read_mostly;
1341static DEFINE_SPINLOCK(time_interpolator_lock);
1342
1343static inline cycles_t time_interpolator_get_cycles(unsigned int src)
1344{
1345 unsigned long (*x)(void);
1346
1347 switch (src)
1348 {
1349 case TIME_SOURCE_FUNCTION:
1350 x = time_interpolator->addr;
1351 return x();
1352
1353 case TIME_SOURCE_MMIO64 :
1354 return readq_relaxed((void __iomem *)time_interpolator->addr);
1355
1356 case TIME_SOURCE_MMIO32 :
1357 return readl_relaxed((void __iomem *)time_interpolator->addr);
1358
1359 default: return get_cycles();
1360 }
1361}
1362
1363static inline u64 time_interpolator_get_counter(int writelock)
1364{
1365 unsigned int src = time_interpolator->source;
1366
1367 if (time_interpolator->jitter)
1368 {
1369 cycles_t lcycle;
1370 cycles_t now;
1371
1372 do {
1373 lcycle = time_interpolator->last_cycle;
1374 now = time_interpolator_get_cycles(src);
1375 if (lcycle && time_after(lcycle, now))
1376 return lcycle;
1377
1378 /* When holding the xtime write lock, there's no need
1379 * to add the overhead of the cmpxchg. Readers are
1380 * force to retry until the write lock is released.
1381 */
1382 if (writelock) {
1383 time_interpolator->last_cycle = now;
1384 return now;
1385 }
1386 /* Keep track of the last timer value returned. The use of cmpxchg here
1387 * will cause contention in an SMP environment.
1388 */
1389 } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
1390 return now;
1391 }
1392 else
1393 return time_interpolator_get_cycles(src);
1394}
1395
1396void time_interpolator_reset(void)
1397{
1398 time_interpolator->offset = 0;
1399 time_interpolator->last_counter = time_interpolator_get_counter(1);
1400}
1401
1402#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
1403
1404unsigned long time_interpolator_get_offset(void)
1405{
1406 /* If we do not have a time interpolator set up then just return zero */
1407 if (!time_interpolator)
1408 return 0;
1409
1410 return time_interpolator->offset +
1411 GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
1412}
1413
1414#define INTERPOLATOR_ADJUST 65536
1415#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
1416
1417void time_interpolator_update(long delta_nsec)
1418{
1419 u64 counter;
1420 unsigned long offset;
1421
1422 /* If there is no time interpolator set up then do nothing */
1423 if (!time_interpolator)
1424 return;
1425
1426 /*
1427 * The interpolator compensates for late ticks by accumulating the late
1428 * time in time_interpolator->offset. A tick earlier than expected will
1429 * lead to a reset of the offset and a corresponding jump of the clock
1430 * forward. Again this only works if the interpolator clock is running
1431 * slightly slower than the regular clock and the tuning logic insures
1432 * that.
1433 */
1434
1435 counter = time_interpolator_get_counter(1);
1436 offset = time_interpolator->offset +
1437 GET_TI_NSECS(counter, time_interpolator);
1438
1439 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
1440 time_interpolator->offset = offset - delta_nsec;
1441 else {
1442 time_interpolator->skips++;
1443 time_interpolator->ns_skipped += delta_nsec - offset;
1444 time_interpolator->offset = 0;
1445 }
1446 time_interpolator->last_counter = counter;
1447
1448 /* Tuning logic for time interpolator invoked every minute or so.
1449 * Decrease interpolator clock speed if no skips occurred and an offset is carried.
1450 * Increase interpolator clock speed if we skip too much time.
1451 */
1452 if (jiffies % INTERPOLATOR_ADJUST == 0)
1453 {
1454 if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec)
1455 time_interpolator->nsec_per_cyc--;
1456 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1457 time_interpolator->nsec_per_cyc++;
1458 time_interpolator->skips = 0;
1459 time_interpolator->ns_skipped = 0;
1460 }
1461}
1462
1463static inline int
1464is_better_time_interpolator(struct time_interpolator *new)
1465{
1466 if (!time_interpolator)
1467 return 1;
1468 return new->frequency > 2*time_interpolator->frequency ||
1469 (unsigned long)new->drift < (unsigned long)time_interpolator->drift;
1470}
1471
1472void
1473register_time_interpolator(struct time_interpolator *ti)
1474{
1475 unsigned long flags;
1476
1477 /* Sanity check */
1478 BUG_ON(ti->frequency == 0 || ti->mask == 0);
1479
1480 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1481 spin_lock(&time_interpolator_lock);
1482 write_seqlock_irqsave(&xtime_lock, flags);
1483 if (is_better_time_interpolator(ti)) {
1484 time_interpolator = ti;
1485 time_interpolator_reset();
1486 }
1487 write_sequnlock_irqrestore(&xtime_lock, flags);
1488
1489 ti->next = time_interpolator_list;
1490 time_interpolator_list = ti;
1491 spin_unlock(&time_interpolator_lock);
1492}
1493
1494void
1495unregister_time_interpolator(struct time_interpolator *ti)
1496{
1497 struct time_interpolator *curr, **prev;
1498 unsigned long flags;
1499
1500 spin_lock(&time_interpolator_lock);
1501 prev = &time_interpolator_list;
1502 for (curr = *prev; curr; curr = curr->next) {
1503 if (curr == ti) {
1504 *prev = curr->next;
1505 break;
1506 }
1507 prev = &curr->next;
1508 }
1509
1510 write_seqlock_irqsave(&xtime_lock, flags);
1511 if (ti == time_interpolator) {
1512 /* we lost the best time-interpolator: */
1513 time_interpolator = NULL;
1514 /* find the next-best interpolator */
1515 for (curr = time_interpolator_list; curr; curr = curr->next)
1516 if (is_better_time_interpolator(curr))
1517 time_interpolator = curr;
1518 time_interpolator_reset();
1519 }
1520 write_sequnlock_irqrestore(&xtime_lock, flags);
1521 spin_unlock(&time_interpolator_lock);
1522}
1523#endif /* CONFIG_TIME_INTERPOLATION */
1524
1525/** 1352/**
1526 * msleep - sleep safely even with waitqueue interruptions 1353 * msleep - sleep safely even with waitqueue interruptions
1527 * @msecs: Time in milliseconds to sleep for 1354 * @msecs: Time in milliseconds to sleep for
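
Note: the deferrable accounting added to timer.c above relies on the flag living in bit 0 of the (at least 4-byte aligned) base pointer, which tbase_get_deferrable() reads back before timer_stats_update_stats() is called. A userspace model of that pointer-tagging trick, with illustrative names rather than the kernel's:

#include <stdio.h>
#include <stdint.h>

#define DEFERRABLE_FLAG 0x1UL	/* stored in bit 0 of the aligned base pointer */

static void *set_deferrable(void *base)
{
	return (void *)((uintptr_t)base | DEFERRABLE_FLAG);
}

static unsigned long get_deferrable(void *base)
{
	return (uintptr_t)base & DEFERRABLE_FLAG;
}

static void *get_base(void *base)
{
	return (void *)((uintptr_t)base & ~DEFERRABLE_FLAG);
}

int main(void)
{
	long dummy_base;			/* stands in for a tvec_base_t */
	void *tagged = set_deferrable(&dummy_base);

	printf("deferrable=%lu, base recovered=%d\n",
	       get_deferrable(tagged), get_base(tagged) == (void *)&dummy_base);
	return 0;
}
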
diff --git a/kernel/user.c b/kernel/user.c
index 4869563080..e7d11cef69 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -14,20 +14,19 @@
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/key.h> 15#include <linux/key.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h>
18#include <linux/user_namespace.h>
17 19
18/* 20/*
19 * UID task count cache, to get fast user lookup in "alloc_uid" 21 * UID task count cache, to get fast user lookup in "alloc_uid"
20 * when changing user ID's (ie setuid() and friends). 22 * when changing user ID's (ie setuid() and friends).
21 */ 23 */
22 24
23#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8)
24#define UIDHASH_SZ (1 << UIDHASH_BITS)
25#define UIDHASH_MASK (UIDHASH_SZ - 1) 25#define UIDHASH_MASK (UIDHASH_SZ - 1)
26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) 27#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid)))
28 28
29static struct kmem_cache *uid_cachep; 29static struct kmem_cache *uid_cachep;
30static struct list_head uidhash_table[UIDHASH_SZ];
31 30
32/* 31/*
33 * The uidhash_lock is mostly taken from process context, but it is 32 * The uidhash_lock is mostly taken from process context, but it is
@@ -94,9 +93,10 @@ struct user_struct *find_user(uid_t uid)
94{ 93{
95 struct user_struct *ret; 94 struct user_struct *ret;
96 unsigned long flags; 95 unsigned long flags;
96 struct user_namespace *ns = current->nsproxy->user_ns;
97 97
98 spin_lock_irqsave(&uidhash_lock, flags); 98 spin_lock_irqsave(&uidhash_lock, flags);
99 ret = uid_hash_find(uid, uidhashentry(uid)); 99 ret = uid_hash_find(uid, uidhashentry(ns, uid));
100 spin_unlock_irqrestore(&uidhash_lock, flags); 100 spin_unlock_irqrestore(&uidhash_lock, flags);
101 return ret; 101 return ret;
102} 102}
@@ -120,9 +120,9 @@ void free_uid(struct user_struct *up)
120 } 120 }
121} 121}
122 122
123struct user_struct * alloc_uid(uid_t uid) 123struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
124{ 124{
125 struct list_head *hashent = uidhashentry(uid); 125 struct list_head *hashent = uidhashentry(ns, uid);
126 struct user_struct *up; 126 struct user_struct *up;
127 127
128 spin_lock_irq(&uidhash_lock); 128 spin_lock_irq(&uidhash_lock);
@@ -208,14 +208,14 @@ static int __init uid_cache_init(void)
208 int n; 208 int n;
209 209
210 uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), 210 uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
211 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 211 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
212 212
213 for(n = 0; n < UIDHASH_SZ; ++n) 213 for(n = 0; n < UIDHASH_SZ; ++n)
214 INIT_LIST_HEAD(uidhash_table + n); 214 INIT_LIST_HEAD(init_user_ns.uidhash_table + n);
215 215
216 /* Insert the root user immediately (init already runs as root) */ 216 /* Insert the root user immediately (init already runs as root) */
217 spin_lock_irq(&uidhash_lock); 217 spin_lock_irq(&uidhash_lock);
218 uid_hash_insert(&root_user, uidhashentry(0)); 218 uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0));
219 spin_unlock_irq(&uidhash_lock); 219 spin_unlock_irq(&uidhash_lock);
220 220
221 return 0; 221 return 0;
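
Note: with this change the hash table lives inside the user_namespace, but the bucket function itself is unchanged. A userspace model of the lookup index, assuming UIDHASH_BITS = 8, the !CONFIG_BASE_SMALL case of the #define removed above (presumably relocated to the user_namespace header):

#include <stdio.h>

#define UIDHASH_BITS	8			/* !CONFIG_BASE_SMALL value */
#define UIDHASH_SZ	(1 << UIDHASH_BITS)
#define UIDHASH_MASK	(UIDHASH_SZ - 1)
#define uidhashfn(uid)	((((uid) >> UIDHASH_BITS) + (uid)) & UIDHASH_MASK)

int main(void)
{
	unsigned int uids[] = { 0, 1000, 1000 + UIDHASH_SZ };

	/* per-namespace table: bucket = &ns->uidhash_table[uidhashfn(uid)] */
	for (int i = 0; i < 3; i++)
		printf("uid %u -> bucket %u\n", uids[i], uidhashfn(uids[i]));
	return 0;
}
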
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
new file mode 100644
index 0000000000..d055d98785
--- /dev/null
+++ b/kernel/user_namespace.c
@@ -0,0 +1,87 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License as
4 * published by the Free Software Foundation, version 2 of the
5 * License.
6 */
7
8#include <linux/module.h>
9#include <linux/version.h>
10#include <linux/nsproxy.h>
11#include <linux/user_namespace.h>
12
13struct user_namespace init_user_ns = {
14 .kref = {
15 .refcount = ATOMIC_INIT(2),
16 },
17 .root_user = &root_user,
18};
19
20EXPORT_SYMBOL_GPL(init_user_ns);
21
22#ifdef CONFIG_USER_NS
23
24/*
25 * Clone a new ns copying an original user ns, setting refcount to 1
26 * @old_ns: namespace to clone
27 * Return NULL on error (failure to kmalloc), new ns otherwise
28 */
29static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
30{
31 struct user_namespace *ns;
32 struct user_struct *new_user;
33 int n;
34
35 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
36 if (!ns)
37 return ERR_PTR(-ENOMEM);
38
39 kref_init(&ns->kref);
40
41 for (n = 0; n < UIDHASH_SZ; ++n)
42 INIT_LIST_HEAD(ns->uidhash_table + n);
43
44 /* Insert new root user. */
45 ns->root_user = alloc_uid(ns, 0);
46 if (!ns->root_user) {
47 kfree(ns);
48 return ERR_PTR(-ENOMEM);
49 }
50
51 /* Reset current->user with a new one */
52 new_user = alloc_uid(ns, current->uid);
53 if (!new_user) {
54 free_uid(ns->root_user);
55 kfree(ns);
56 return ERR_PTR(-ENOMEM);
57 }
58
59 switch_uid(new_user);
60 return ns;
61}
62
63struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
64{
65 struct user_namespace *new_ns;
66
67 BUG_ON(!old_ns);
68 get_user_ns(old_ns);
69
70 if (!(flags & CLONE_NEWUSER))
71 return old_ns;
72
73 new_ns = clone_user_ns(old_ns);
74
75 put_user_ns(old_ns);
76 return new_ns;
77}
78
79void free_user_ns(struct kref *kref)
80{
81 struct user_namespace *ns;
82
83 ns = container_of(kref, struct user_namespace, kref);
84 kfree(ns);
85}
86
87#endif /* CONFIG_USER_NS */
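
Note: both new namespace paths return ERR_PTR(-ENOMEM) on allocation failure rather than NULL, so callers must test with IS_ERR(). A self-contained userspace model of the ERR_PTR()/IS_ERR()/PTR_ERR() convention, re-implemented here only for illustration (MAX_ERRNO matches the kernel's 4095):

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long error)       { return (void *)error; }
static long  PTR_ERR(const void *ptr)  { return (long)ptr; }
static int   IS_ERR(const void *ptr)
{
	/* error pointers occupy the top MAX_ERRNO values of the address space */
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

int main(void)
{
	void *ns = ERR_PTR(-ENOMEM);	/* what clone_user_ns() now returns on failure */

	if (IS_ERR(ns))
		printf("namespace clone failed: %ld\n", PTR_ERR(ns));
	return 0;
}
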
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 160c8c5136..9d8180a0f0 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -13,6 +13,7 @@
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h> 15#include <linux/version.h>
16#include <linux/err.h>
16 17
17/* 18/*
18 * Clone a new ns copying an original utsname, setting refcount to 1 19 * Clone a new ns copying an original utsname, setting refcount to 1
@@ -24,10 +25,11 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
24 struct uts_namespace *ns; 25 struct uts_namespace *ns;
25 26
26 ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); 27 ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
27 if (ns) { 28 if (!ns)
28 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 29 return ERR_PTR(-ENOMEM);
29 kref_init(&ns->kref); 30
30 } 31 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
32 kref_init(&ns->kref);
31 return ns; 33 return ns;
32} 34}
33 35
@@ -37,7 +39,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
37 * utsname of this process won't be seen by parent, and vice 39 * utsname of this process won't be seen by parent, and vice
38 * versa. 40 * versa.
39 */ 41 */
40struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns) 42struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns)
41{ 43{
42 struct uts_namespace *new_ns; 44 struct uts_namespace *new_ns;
43 45
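
Note: since clone_uts_ns() now reports failure via ERR_PTR() rather than NULL, callers of copy_utsname() check IS_ERR() instead. A paraphrased caller fragment, not a quote of the nsproxy code; tsk and flags stand for the surrounding copy-namespaces context that is not shown in this hunk:

	struct uts_namespace *uts;

	uts = copy_utsname(flags, tsk->nsproxy->uts_ns);
	if (IS_ERR(uts))
		return PTR_ERR(uts);	/* -ENOMEM from the kmalloc failure above */
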
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index f22b9dbd2a..c76c06466b 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -18,10 +18,7 @@
18static void *get_uts(ctl_table *table, int write) 18static void *get_uts(ctl_table *table, int write)
19{ 19{
20 char *which = table->data; 20 char *which = table->data;
21#ifdef CONFIG_UTS_NS 21
22 struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
23 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
24#endif
25 if (!write) 22 if (!write)
26 down_read(&uts_sem); 23 down_read(&uts_sem);
27 else 24 else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3bebf73be9..58e5c152a6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -282,8 +282,8 @@ static int worker_thread(void *__cwq)
282 struct cpu_workqueue_struct *cwq = __cwq; 282 struct cpu_workqueue_struct *cwq = __cwq;
283 DEFINE_WAIT(wait); 283 DEFINE_WAIT(wait);
284 284
285 if (!cwq->wq->freezeable) 285 if (cwq->wq->freezeable)
286 current->flags |= PF_NOFREEZE; 286 set_freezable();
287 287
288 set_user_nice(current, -5); 288 set_user_nice(current, -5);
289 289
@@ -382,16 +382,16 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
382EXPORT_SYMBOL_GPL(flush_workqueue); 382EXPORT_SYMBOL_GPL(flush_workqueue);
383 383
384/* 384/*
385 * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit, 385 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
386 * so this work can't be re-armed in any way. 386 * so this work can't be re-armed in any way.
387 */ 387 */
388static int try_to_grab_pending(struct work_struct *work) 388static int try_to_grab_pending(struct work_struct *work)
389{ 389{
390 struct cpu_workqueue_struct *cwq; 390 struct cpu_workqueue_struct *cwq;
391 int ret = 0; 391 int ret = -1;
392 392
393 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) 393 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
394 return 1; 394 return 0;
395 395
396 /* 396 /*
397 * The queueing is in progress, or it is already queued. Try to 397 * The queueing is in progress, or it is already queued. Try to
@@ -457,10 +457,28 @@ static void wait_on_work(struct work_struct *work)
457 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 457 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
458} 458}
459 459
460static int __cancel_work_timer(struct work_struct *work,
461 struct timer_list* timer)
462{
463 int ret;
464
465 do {
466 ret = (timer && likely(del_timer(timer)));
467 if (!ret)
468 ret = try_to_grab_pending(work);
469 wait_on_work(work);
470 } while (unlikely(ret < 0));
471
472 work_clear_pending(work);
473 return ret;
474}
475
460/** 476/**
461 * cancel_work_sync - block until a work_struct's callback has terminated 477 * cancel_work_sync - block until a work_struct's callback has terminated
462 * @work: the work which is to be flushed 478 * @work: the work which is to be flushed
463 * 479 *
480 * Returns true if @work was pending.
481 *
464 * cancel_work_sync() will cancel the work if it is queued. If the work's 482 * cancel_work_sync() will cancel the work if it is queued. If the work's
465 * callback appears to be running, cancel_work_sync() will block until it 483 * callback appears to be running, cancel_work_sync() will block until it
466 * has completed. 484 * has completed.
@@ -476,31 +494,26 @@ static void wait_on_work(struct work_struct *work)
476 * The caller must ensure that workqueue_struct on which this work was last 494 * The caller must ensure that workqueue_struct on which this work was last
477 * queued can't be destroyed before this function returns. 495 * queued can't be destroyed before this function returns.
478 */ 496 */
479void cancel_work_sync(struct work_struct *work) 497int cancel_work_sync(struct work_struct *work)
480{ 498{
481 while (!try_to_grab_pending(work)) 499 return __cancel_work_timer(work, NULL);
482 cpu_relax();
483 wait_on_work(work);
484 work_clear_pending(work);
485} 500}
486EXPORT_SYMBOL_GPL(cancel_work_sync); 501EXPORT_SYMBOL_GPL(cancel_work_sync);
487 502
488/** 503/**
489 * cancel_rearming_delayed_work - reliably kill off a delayed work. 504 * cancel_delayed_work_sync - reliably kill off a delayed work.
490 * @dwork: the delayed work struct 505 * @dwork: the delayed work struct
491 * 506 *
507 * Returns true if @dwork was pending.
508 *
492 * It is possible to use this function if @dwork rearms itself via queue_work() 509 * It is possible to use this function if @dwork rearms itself via queue_work()
493 * or queue_delayed_work(). See also the comment for cancel_work_sync(). 510 * or queue_delayed_work(). See also the comment for cancel_work_sync().
494 */ 511 */
495void cancel_rearming_delayed_work(struct delayed_work *dwork) 512int cancel_delayed_work_sync(struct delayed_work *dwork)
496{ 513{
497 while (!del_timer(&dwork->timer) && 514 return __cancel_work_timer(&dwork->work, &dwork->timer);
498 !try_to_grab_pending(&dwork->work))
499 cpu_relax();
500 wait_on_work(&dwork->work);
501 work_clear_pending(&dwork->work);
502} 515}
503EXPORT_SYMBOL(cancel_rearming_delayed_work); 516EXPORT_SYMBOL(cancel_delayed_work_sync);
504 517
505static struct workqueue_struct *keventd_wq __read_mostly; 518static struct workqueue_struct *keventd_wq __read_mostly;
506 519
@@ -739,18 +752,17 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
739 if (cwq->thread == NULL) 752 if (cwq->thread == NULL)
740 return; 753 return;
741 754
755 flush_cpu_workqueue(cwq);
742 /* 756 /*
743 * If the caller is CPU_DEAD the single flush_cpu_workqueue() 757 * If the caller is CPU_DEAD and cwq->worklist was not empty,
744 * is not enough, a concurrent flush_workqueue() can insert a 758 * a concurrent flush_workqueue() can insert a barrier after us.
745 * barrier after us. 759 * However, in that case run_workqueue() won't return and check
760 * kthread_should_stop() until it flushes all work_struct's.
746 * When ->worklist becomes empty it is safe to exit because no 761 * When ->worklist becomes empty it is safe to exit because no
747 * more work_structs can be queued on this cwq: flush_workqueue 762 * more work_structs can be queued on this cwq: flush_workqueue
748 * checks list_empty(), and a "normal" queue_work() can't use 763 * checks list_empty(), and a "normal" queue_work() can't use
749 * a dead CPU. 764 * a dead CPU.
750 */ 765 */
751 while (flush_cpu_workqueue(cwq))
752 ;
753
754 kthread_stop(cwq->thread); 766 kthread_stop(cwq->thread);
755 cwq->thread = NULL; 767 cwq->thread = NULL;
756} 768}
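
Note: with the workqueue changes above, cancel_work_sync() and the renamed cancel_delayed_work_sync() return whether the work was still pending when cancelled. A hedged usage sketch for a self-rearming delayed work; my_poll_work, my_poll_fn and my_teardown are hypothetical driver names, and the fragment assumes the usual module boilerplate around it:

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void my_poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_poll_work, my_poll_fn);

static void my_poll_fn(struct work_struct *work)
{
	/* ... do the periodic work ..., then rearm ourselves */
	schedule_delayed_work(&my_poll_work, HZ);
}

static void my_teardown(void)
{
	/* the sync cancel now reports whether a final run was still queued */
	if (cancel_delayed_work_sync(&my_poll_work))
		pr_debug("my_poll_work: cancelled while still pending\n");
}
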