path: root/kernel
author     Dmitry Torokhov <dmitry.torokhov@gmail.com>   2007-10-12 21:27:47 -0400
committer  Dmitry Torokhov <dmitry.torokhov@gmail.com>   2007-10-12 21:27:47 -0400
commit     b981d8b3f5e008ff10d993be633ad00564fc22cd (patch)
tree       e292dc07b22308912cf6a58354a608b9e5e8e1fd /kernel
parent     b11d2127c4893a7315d1e16273bc8560049fa3ca (diff)
parent     2b9e0aae1d50e880c58d46788e5e3ebd89d75d62 (diff)
Merge master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts: drivers/macintosh/adbhid.c
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt        |    3
-rw-r--r--  kernel/acct.c                 |    2
-rw-r--r--  kernel/audit.c                |   16
-rw-r--r--  kernel/auditfilter.c          |   13
-rw-r--r--  kernel/auditsc.c              |  161
-rw-r--r--  kernel/cpu.c                  |    4
-rw-r--r--  kernel/cpuset.c               |    2
-rw-r--r--  kernel/exit.c                 |   22
-rw-r--r--  kernel/fork.c                 |   25
-rw-r--r--  kernel/futex.c                |   52
-rw-r--r--  kernel/futex_compat.c         |   30
-rw-r--r--  kernel/hrtimer.c              |   45
-rw-r--r--  kernel/irq/chip.c             |    5
-rw-r--r--  kernel/irq/devres.c           |    1
-rw-r--r--  kernel/irq/manage.c           |   13
-rw-r--r--  kernel/irq/proc.c             |   10
-rw-r--r--  kernel/irq/resend.c           |    7
-rw-r--r--  kernel/kmod.c                 |  303
-rw-r--r--  kernel/kprobes.c              |   14
-rw-r--r--  kernel/ksysfs.c               |   28
-rw-r--r--  kernel/kthread.c              |   12
-rw-r--r--  kernel/lockdep.c              | 1497
-rw-r--r--  kernel/lockdep_proc.c         |  303
-rw-r--r--  kernel/module.c               |    3
-rw-r--r--  kernel/mutex.c                |    8
-rw-r--r--  kernel/nsproxy.c              |   17
-rw-r--r--  kernel/params.c               |    7
-rw-r--r--  kernel/posix-timers.c         |   11
-rw-r--r--  kernel/power/Kconfig          |   91
-rw-r--r--  kernel/power/Makefile         |    5
-rw-r--r--  kernel/power/disk.c           |  252
-rw-r--r--  kernel/power/main.c           |  134
-rw-r--r--  kernel/power/power.h          |   39
-rw-r--r--  kernel/power/process.c        |   90
-rw-r--r--  kernel/power/snapshot.c       |    3
-rw-r--r--  kernel/power/swap.c           |   20
-rw-r--r--  kernel/power/user.c           |  154
-rw-r--r--  kernel/printk.c               |   15
-rw-r--r--  kernel/profile.c              |    4
-rw-r--r--  kernel/ptrace.c               |    3
-rw-r--r--  kernel/relay.c                |   16
-rw-r--r--  kernel/rwsem.c                |    8
-rw-r--r--  kernel/sched.c                |  847
-rw-r--r--  kernel/sched_debug.c          |   52
-rw-r--r--  kernel/sched_fair.c           |  432
-rw-r--r--  kernel/sched_idletask.c       |   10
-rw-r--r--  kernel/sched_rt.c             |   73
-rw-r--r--  kernel/sched_stats.h          |    2
-rw-r--r--  kernel/signal.c               |   63
-rw-r--r--  kernel/softirq.c              |    4
-rw-r--r--  kernel/spinlock.c             |   32
-rw-r--r--  kernel/sys.c                  |  101
-rw-r--r--  kernel/sysctl.c               |   94
-rw-r--r--  kernel/time.c                 |  117
-rw-r--r--  kernel/time/Kconfig           |    5
-rw-r--r--  kernel/time/Makefile          |    2
-rw-r--r--  kernel/time/clockevents.c     |   13
-rw-r--r--  kernel/time/ntp.c             |   69
-rw-r--r--  kernel/time/tick-broadcast.c  |   67
-rw-r--r--  kernel/time/tick-common.c     |   20
-rw-r--r--  kernel/time/tick-oneshot.c    |   15
-rw-r--r--  kernel/time/tick-sched.c      |   19
-rw-r--r--  kernel/time/timekeeping.c     |   56
-rw-r--r--  kernel/time/timer_list.c      |    2
-rw-r--r--  kernel/time/timer_stats.c     |    7
-rw-r--r--  kernel/timer.c                |  212
-rw-r--r--  kernel/tsacct.c               |    2
-rw-r--r--  kernel/user.c                 |   47
-rw-r--r--  kernel/user_namespace.c       |    3
-rw-r--r--  kernel/utsname.c              |    2
-rw-r--r--  kernel/workqueue.c            |    2
71 files changed, 3643 insertions(+), 2175 deletions(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c64ce9c14207..6b066632e40c 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -63,3 +63,6 @@ config PREEMPT_BKL
 	  Say Y here if you are building a kernel for a desktop system.
 	  Say N if you are unsure.
 
+config PREEMPT_NOTIFIERS
+	bool
+
diff --git a/kernel/acct.c b/kernel/acct.c
index 70d0d88e5554..24f0f8b2ba72 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -468,7 +468,7 @@ static void do_acct_process(struct file *file)
 	}
 #endif
 	do_div(elapsed, AHZ);
-	ac.ac_btime = xtime.tv_sec - elapsed;
+	ac.ac_btime = get_seconds() - elapsed;
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
diff --git a/kernel/audit.c b/kernel/audit.c
index eb0f9165b401..2924251a6547 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -847,18 +847,10 @@ static void audit_receive_skb(struct sk_buff *skb)
 }
 
 /* Receive messages from netlink socket. */
-static void audit_receive(struct sock *sk, int length)
+static void audit_receive(struct sk_buff *skb)
 {
-	struct sk_buff *skb;
-	unsigned int qlen;
-
 	mutex_lock(&audit_cmd_mutex);
-
-	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
-		skb = skb_dequeue(&sk->sk_receive_queue);
-		audit_receive_skb(skb);
-		kfree_skb(skb);
-	}
+	audit_receive_skb(skb);
 	mutex_unlock(&audit_cmd_mutex);
 }
 
@@ -876,8 +868,8 @@ static int __init audit_init(void)
 
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
-	audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
-					   NULL, THIS_MODULE);
+	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
+					   audit_receive, NULL, THIS_MODULE);
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 	else
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 1bf093dcffe0..359645cff5b2 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -304,7 +304,7 @@ int __init audit_register_class(int class, unsigned *list)
 
 int audit_match_class(int class, unsigned syscall)
 {
-	if (unlikely(syscall >= AUDIT_BITMASK_SIZE * sizeof(__u32)))
+	if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32))
 		return 0;
 	if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class]))
 		return 0;
@@ -456,6 +456,13 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 	case AUDIT_DEVMINOR:
 	case AUDIT_EXIT:
 	case AUDIT_SUCCESS:
+		/* bit ops are only useful on syscall args */
+		if (f->op == AUDIT_BIT_MASK ||
+		    f->op == AUDIT_BIT_TEST) {
+			err = -EINVAL;
+			goto exit_free;
+		}
+		break;
 	case AUDIT_ARG0:
 	case AUDIT_ARG1:
 	case AUDIT_ARG2:
@@ -1566,6 +1573,10 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
 		return (left > right);
 	case AUDIT_GREATER_THAN_OR_EQUAL:
 		return (left >= right);
+	case AUDIT_BIT_MASK:
+		return (left & right);
+	case AUDIT_BIT_TEST:
+		return ((left & right) == right);
 	}
 	BUG();
 	return 0;
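
A note on the two comparators added above: AUDIT_BIT_MASK matches when the field value has any of the mask bits set, while AUDIT_BIT_TEST matches only when all of them are set. A minimal stand-alone illustration of the same semantics, with values chosen arbitrarily for the example:

#include <stdio.h>

int main(void)
{
	unsigned int left = 0644;	/* field value, e.g. a mode syscall argument */
	unsigned int right = 0606;	/* mask supplied by the filter rule */

	/* AUDIT_BIT_MASK semantics: true when any mask bit is set in the value. */
	printf("bit_mask: %d\n", (left & right) != 0);		/* prints 1 */
	/* AUDIT_BIT_TEST semantics: true only when all mask bits are set. */
	printf("bit_test: %d\n", (left & right) == right);	/* prints 0 */
	return 0;
}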
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b7640a5f382a..04f3ffb8d9d4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -153,7 +153,7 @@ struct audit_aux_data_execve {
 	struct audit_aux_data d;
 	int argc;
 	int envc;
-	char mem[0];
+	struct mm_struct *mm;
 };
 
 struct audit_aux_data_socketcall {
@@ -173,12 +173,6 @@ struct audit_aux_data_fd_pair {
 	int fd[2];
 };
 
-struct audit_aux_data_path {
-	struct audit_aux_data d;
-	struct dentry *dentry;
-	struct vfsmount *mnt;
-};
-
 struct audit_aux_data_pids {
 	struct audit_aux_data d;
 	pid_t target_pid[AUDIT_AUX_PIDS];
@@ -654,12 +648,6 @@ static inline void audit_free_aux(struct audit_context *context)
 	struct audit_aux_data *aux;
 
 	while ((aux = context->aux)) {
-		if (aux->type == AUDIT_AVC_PATH) {
-			struct audit_aux_data_path *axi = (void *)aux;
-			dput(axi->dentry);
-			mntput(axi->mnt);
-		}
-
 		context->aux = aux->next;
 		kfree(aux);
 	}
@@ -831,6 +819,57 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
 	return rc;
 }
 
+static void audit_log_execve_info(struct audit_buffer *ab,
+		struct audit_aux_data_execve *axi)
+{
+	int i;
+	long len, ret;
+	const char __user *p;
+	char *buf;
+
+	if (axi->mm != current->mm)
+		return; /* execve failed, no additional info */
+
+	p = (const char __user *)axi->mm->arg_start;
+
+	for (i = 0; i < axi->argc; i++, p += len) {
+		len = strnlen_user(p, MAX_ARG_STRLEN);
+		/*
+		 * We just created this mm, if we can't find the strings
+		 * we just copied into it something is _very_ wrong. Similar
+		 * for strings that are too long, we should not have created
+		 * any.
+		 */
+		if (!len || len > MAX_ARG_STRLEN) {
+			WARN_ON(1);
+			send_sig(SIGKILL, current, 0);
+		}
+
+		buf = kmalloc(len, GFP_KERNEL);
+		if (!buf) {
+			audit_panic("out of memory for argv string\n");
+			break;
+		}
+
+		ret = copy_from_user(buf, p, len);
+		/*
+		 * There is no reason for this copy to be short. We just
+		 * copied them here, and the mm hasn't been exposed to user-
+		 * space yet.
+		 */
+		if (ret) {
+			WARN_ON(1);
+			send_sig(SIGKILL, current, 0);
+		}
+
+		audit_log_format(ab, "a%d=", i);
+		audit_log_untrustedstring(ab, buf);
+		audit_log_format(ab, "\n");
+
+		kfree(buf);
+	}
+}
+
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
 	int i, call_panic = 0;
@@ -946,7 +985,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	case AUDIT_IPC: {
 		struct audit_aux_data_ipcctl *axi = (void *)aux;
 		audit_log_format(ab,
-			 "ouid=%u ogid=%u mode=%x",
+			 "ouid=%u ogid=%u mode=%#o",
 			 axi->uid, axi->gid, axi->mode);
 		if (axi->osid != 0) {
 			char *ctx = NULL;
@@ -965,19 +1004,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	case AUDIT_IPC_SET_PERM: {
 		struct audit_aux_data_ipcctl *axi = (void *)aux;
 		audit_log_format(ab,
-			"qbytes=%lx ouid=%u ogid=%u mode=%x",
+			"qbytes=%lx ouid=%u ogid=%u mode=%#o",
 			axi->qbytes, axi->uid, axi->gid, axi->mode);
 		break; }
 
 	case AUDIT_EXECVE: {
 		struct audit_aux_data_execve *axi = (void *)aux;
-		int i;
-		const char *p;
-		for (i = 0, p = axi->mem; i < axi->argc; i++) {
-			audit_log_format(ab, "a%d=", i);
-			p = audit_log_untrustedstring(ab, p);
-			audit_log_format(ab, "\n");
-		}
+		audit_log_execve_info(ab, axi);
 		break; }
 
 	case AUDIT_SOCKETCALL: {
@@ -995,11 +1028,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		audit_log_hex(ab, axs->a, axs->len);
 		break; }
 
-	case AUDIT_AVC_PATH: {
-		struct audit_aux_data_path *axi = (void *)aux;
-		audit_log_d_path(ab, "path=", axi->dentry, axi->mnt);
-		break; }
-
 	case AUDIT_FD_PAIR: {
 		struct audit_aux_data_fd_pair *axs = (void *)aux;
 		audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
@@ -1821,32 +1849,31 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode
 	return 0;
 }
 
+int audit_argv_kb = 32;
+
 int audit_bprm(struct linux_binprm *bprm)
 {
 	struct audit_aux_data_execve *ax;
 	struct audit_context *context = current->audit_context;
-	unsigned long p, next;
-	void *to;
 
 	if (likely(!audit_enabled || !context || context->dummy))
 		return 0;
 
-	ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
-		     GFP_KERNEL);
+	/*
+	 * Even though the stack code doesn't limit the arg+env size any more,
+	 * the audit code requires that _all_ arguments be logged in a single
+	 * netlink skb. Hence cap it :-(
+	 */
+	if (bprm->argv_len > (audit_argv_kb << 10))
+		return -E2BIG;
+
+	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
 	if (!ax)
 		return -ENOMEM;
 
 	ax->argc = bprm->argc;
 	ax->envc = bprm->envc;
-	for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) {
-		struct page *page = bprm->page[p / PAGE_SIZE];
-		void *kaddr = kmap(page);
-		next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1);
-		memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p);
-		to += next - p;
-		kunmap(page);
-	}
-
+	ax->mm = bprm->mm;
 	ax->d.type = AUDIT_EXECVE;
 	ax->d.next = context->aux;
 	context->aux = (void *)ax;
@@ -1949,36 +1976,6 @@ void __audit_ptrace(struct task_struct *t)
 }
 
 /**
- * audit_avc_path - record the granting or denial of permissions
- * @dentry: dentry to record
- * @mnt: mnt to record
- *
- * Returns 0 for success or NULL context or < 0 on error.
- *
- * Called from security/selinux/avc.c::avc_audit()
- */
-int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
-{
-	struct audit_aux_data_path *ax;
-	struct audit_context *context = current->audit_context;
-
-	if (likely(!context))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->dentry = dget(dentry);
-	ax->mnt = mntget(mnt);
-
-	ax->d.type = AUDIT_AVC_PATH;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
-}
-
-/**
  * audit_signal_info - record signal info for shutting down audit subsystem
  * @sig: signal value
  * @t: task being signaled
@@ -1995,19 +1992,19 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	extern uid_t audit_sig_uid;
 	extern u32 audit_sig_sid;
 
-	if (audit_pid && t->tgid == audit_pid &&
-	    (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1)) {
-		audit_sig_pid = tsk->pid;
-		if (ctx)
-			audit_sig_uid = ctx->loginuid;
-		else
-			audit_sig_uid = tsk->uid;
-		selinux_get_task_sid(tsk, &audit_sig_sid);
+	if (audit_pid && t->tgid == audit_pid) {
+		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
+			audit_sig_pid = tsk->pid;
+			if (ctx)
+				audit_sig_uid = ctx->loginuid;
+			else
+				audit_sig_uid = tsk->uid;
+			selinux_get_task_sid(tsk, &audit_sig_sid);
+		}
+		if (!audit_signals || audit_dummy_context())
+			return 0;
 	}
 
-	if (!audit_signals) /* audit_context checked in wrapper */
-		return 0;
-
 	/* optimize the common case by putting first signal recipient directly
 	 * in audit_context */
 	if (!ctx->target_pid) {
@@ -2026,7 +2023,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 		axp->d.next = ctx->aux_pids;
 		ctx->aux_pids = (void *)axp;
 	}
-	BUG_ON(axp->pid_count > AUDIT_AUX_PIDS);
+	BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
 
 	axp->target_pid[axp->pid_count] = t->tgid;
 	selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 181ae7086029..38033db8d8ec 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -273,7 +273,7 @@ int __cpuinit cpu_up(unsigned int cpu)
 	return err;
 }
 
-#ifdef CONFIG_SUSPEND_SMP
+#ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_t frozen_cpus;
 
 int disable_nonboot_cpus(void)
@@ -334,4 +334,4 @@ void enable_nonboot_cpus(void)
 out:
 	mutex_unlock(&cpu_add_remove_lock);
 }
-#endif
+#endif /* CONFIG_PM_SLEEP_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b4796d850140..57e6448b171e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf)
 	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 	envp[i] = NULL;
 
-	call_usermodehelper(argv[0], argv, envp, 0);
+	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
 	kfree(pathbuf);
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index e8af8d0c2483..993369ee94d1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -24,7 +24,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/profile.h>
-#include <linux/signalfd.h>
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/kthread.h>
@@ -45,6 +44,7 @@
 #include <linux/resource.h>
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/freezer.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -85,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
 	sighand = rcu_dereference(tsk->sighand);
 	spin_lock(&sighand->siglock);
 
-	/*
-	 * Notify that this sighand has been detached. This must
-	 * be called with the tsk->sighand lock held. Also, this
-	 * access tsk->sighand internally, so it must be called
-	 * before tsk->sighand is reset.
-	 */
-	signalfd_detach_locked(tsk);
-
 	posix_cpu_timers_exit(tsk);
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
@@ -594,6 +586,8 @@ static void exit_mm(struct task_struct * tsk)
 	tsk->mm = NULL;
 	up_read(&mm->mmap_sem);
 	enter_lazy_tlb(mm, current);
+	/* We don't want this task to be frozen prematurely */
+	clear_freeze_flag(tsk);
 	task_unlock(tsk);
 	mmput(mm);
 }
@@ -810,7 +804,7 @@ static void exit_notify(struct task_struct *tsk)
 		__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 	}
 
-	/* Let father know we died 
+	/* Let father know we died
 	 *
 	 * Thread signals are configurable, but you aren't going to use
 	 * that to send signals to arbitary processes.
@@ -823,9 +817,7 @@ static void exit_notify(struct task_struct *tsk)
 	 * If our self_exec id doesn't match our parent_exec_id then
 	 * we have changed execution domain as these two values started
 	 * the same after a fork.
-	 *
 	 */
-
 	if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
 	    ( tsk->parent_exec_id != t->self_exec_id ||
 	      tsk->self_exec_id != tsk->parent_exec_id)
@@ -845,9 +837,7 @@ static void exit_notify(struct task_struct *tsk)
 	}
 
 	state = EXIT_ZOMBIE;
-	if (tsk->exit_signal == -1 &&
-	    (likely(tsk->ptrace == 0) ||
-	     unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT)))
+	if (tsk->exit_signal == -1 && likely(!tsk->ptrace))
 		state = EXIT_DEAD;
 	tsk->exit_state = state;
 
@@ -976,6 +966,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
 
+	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
 
 	exit_mm(tsk);
@@ -997,7 +988,6 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (tsk->binfmt)
 		module_put(tsk->binfmt->module);
 
-	tsk->exit_code = code;
 	proc_exit_connector(tsk);
 	exit_task_namespaces(tsk);
 	exit_notify(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index ba39bdb2a7b8..5e67f90a1694 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -137,7 +137,7 @@ void __init fork_init(unsigned long mempages)
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
 		kmem_cache_create("task_struct", sizeof(struct task_struct),
-			ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
+			ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL);
 #endif
 
 	/*
@@ -334,6 +334,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
+	mm->flags = (current->mm) ? current->mm->flags
+				  : MMF_DUMP_FILTER_DEFAULT;
 	mm->core_waiters = 0;
 	mm->nr_ptes = 0;
 	set_mm_counter(mm, file_rss, 0);
@@ -1436,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep,
 	struct sighand_struct *sighand = data;
 
 	spin_lock_init(&sighand->siglock);
-	INIT_LIST_HEAD(&sighand->signalfd_list);
+	init_waitqueue_head(&sighand->signalfd_wqh);
 }
 
 void __init proc_caches_init(void)
@@ -1444,22 +1446,22 @@ void __init proc_caches_init(void)
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
-			sighand_ctor, NULL);
+			sighand_ctor);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 	files_cachep = kmem_cache_create("files_cache",
 			sizeof(struct files_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 	vm_area_cachep = kmem_cache_create("vm_area_struct",
 			sizeof(struct vm_area_struct), 0,
-			SLAB_PANIC, NULL, NULL);
+			SLAB_PANIC, NULL);
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 }
 
 /*
@@ -1606,7 +1608,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	err = -EINVAL;
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
 				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER))
+				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER|
+				CLONE_NEWNET))
 		goto bad_unshare_out;
 
 	if ((err = unshare_thread(unshare_flags)))
diff --git a/kernel/futex.c b/kernel/futex.c
index 5c3f45d07c53..fcc94e7b4086 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -346,15 +346,20 @@ static int futex_handle_fault(unsigned long address,
 	vma = find_vma(mm, address);
 	if (vma && address >= vma->vm_start &&
 	    (vma->vm_flags & VM_WRITE)) {
-		switch (handle_mm_fault(mm, vma, address, 1)) {
-		case VM_FAULT_MINOR:
-			ret = 0;
-			current->min_flt++;
-			break;
-		case VM_FAULT_MAJOR:
+		int fault;
+		fault = handle_mm_fault(mm, vma, address, 1);
+		if (unlikely((fault & VM_FAULT_ERROR))) {
+#if 0
+			/* XXX: let's do this when we verify it is OK */
+			if (ret & VM_FAULT_OOM)
+				ret = -ENOMEM;
+#endif
+		} else {
 			ret = 0;
-			current->maj_flt++;
-			break;
+			if (fault & VM_FAULT_MAJOR)
+				current->maj_flt++;
+			else
+				current->min_flt++;
 		}
 	}
 	if (!fshared)
@@ -1665,6 +1670,7 @@ pi_faulted:
 					attempt);
 		if (ret)
 			goto out;
+		uval = 0;
 		goto retry_unlocked;
 	}
 
@@ -1937,9 +1943,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
 void exit_robust_list(struct task_struct *curr)
 {
 	struct robust_list_head __user *head = curr->robust_list;
-	struct robust_list __user *entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+	struct robust_list __user *entry, *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
 	unsigned long futex_offset;
+	int rc;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
@@ -1959,12 +1966,14 @@ void exit_robust_list(struct task_struct *curr)
 	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
 		return;
 
-	if (pending)
-		handle_futex_death((void __user *)pending + futex_offset,
-				   curr, pip);
-
+	next_entry = NULL;	/* avoid warning with gcc */
 	while (entry != &head->list) {
 		/*
+		 * Fetch the next entry in the list before calling
+		 * handle_futex_death:
+		 */
+		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+		/*
 		 * A pending lock might already be on the list, so
 		 * don't process it twice:
 		 */
@@ -1972,11 +1981,10 @@ void exit_robust_list(struct task_struct *curr)
 		if (handle_futex_death((void __user *)entry + futex_offset,
 					curr, pi))
 			return;
-		/*
-		 * Fetch the next entry in the list:
-		 */
-		if (fetch_robust_entry(&entry, &entry->next, &pi))
+		if (rc)
 			return;
+		entry = next_entry;
+		pi = next_pi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
@@ -1985,6 +1993,10 @@ void exit_robust_list(struct task_struct *curr)
 
 		cond_resched();
 	}
+
+	if (pending)
+		handle_futex_death((void __user *)pending + futex_offset,
+				   curr, pip);
 }
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
@@ -2055,8 +2067,10 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 	}
 	/*
 	 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
+	 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
 	 */
-	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE)
+	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+	    cmd == FUTEX_WAKE_OP)
 		val2 = (u32) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index f7921360efad..2c2e2954b713 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -38,10 +38,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
 void compat_exit_robust_list(struct task_struct *curr)
 {
 	struct compat_robust_list_head __user *head = curr->compat_robust_list;
-	struct robust_list __user *entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	compat_uptr_t uentry, upending;
+	struct robust_list __user *entry, *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
+	compat_uptr_t uentry, next_uentry, upending;
 	compat_long_t futex_offset;
+	int rc;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
@@ -61,10 +62,15 @@ void compat_exit_robust_list(struct task_struct *curr)
 	if (fetch_robust_entry(&upending, &pending,
 			       &head->list_op_pending, &pip))
 		return;
-	if (upending)
-		handle_futex_death((void __user *)pending + futex_offset, curr, pip);
 
-	while (compat_ptr(uentry) != &head->list) {
+	next_entry = NULL;	/* avoid warning with gcc */
+	while (entry != (struct robust_list __user *) &head->list) {
+		/*
+		 * Fetch the next entry in the list before calling
+		 * handle_futex_death:
+		 */
+		rc = fetch_robust_entry(&next_uentry, &next_entry,
+			(compat_uptr_t __user *)&entry->next, &next_pi);
 		/*
 		 * A pending lock might already be on the list, so
 		 * dont process it twice:
@@ -74,12 +80,11 @@ void compat_exit_robust_list(struct task_struct *curr)
 				curr, pi))
 			return;
 
-		/*
-		 * Fetch the next entry in the list:
-		 */
-		if (fetch_robust_entry(&uentry, &entry,
-			(compat_uptr_t __user *)&entry->next, &pi))
+		if (rc)
 			return;
+		uentry = next_uentry;
+		entry = next_entry;
+		pi = next_pi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
@@ -88,6 +93,9 @@ void compat_exit_robust_list(struct task_struct *curr)
 
 		cond_resched();
 	}
+	if (pending)
+		handle_futex_death((void __user *)pending + futex_offset,
+				   curr, pip);
 }
 
 asmlinkage long
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 72d034258ba1..dc8a4451d79b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -141,11 +141,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
 
 	do {
 		seq = read_seqbegin(&xtime_lock);
-#ifdef CONFIG_NO_HZ
-		getnstimeofday(&xts);
-#else
-		xts = xtime;
-#endif
+		xts = current_kernel_time();
 		tom = wall_to_monotonic;
 	} while (read_seqretry(&xtime_lock, seq));
 
@@ -281,6 +277,30 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
 }
 
 EXPORT_SYMBOL_GPL(ktime_add_ns);
+
+/**
+ * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
+ * @kt:		minuend
+ * @nsec:	the scalar nsec value to subtract
+ *
+ * Returns the subtraction of @nsec from @kt in ktime_t format
+ */
+ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
+{
+	ktime_t tmp;
+
+	if (likely(nsec < NSEC_PER_SEC)) {
+		tmp.tv64 = nsec;
+	} else {
+		unsigned long rem = do_div(nsec, NSEC_PER_SEC);
+
+		tmp = ktime_set((long)nsec, rem);
+	}
+
+	return ktime_sub(kt, tmp);
+}
+
+EXPORT_SYMBOL_GPL(ktime_sub_ns);
 # endif /* !CONFIG_KTIME_SCALAR */
 
 /*
@@ -558,7 +578,8 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
  */
 static int hrtimer_switch_to_hres(void)
 {
-	struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+	int cpu = smp_processor_id();
+	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
 	unsigned long flags;
 
 	if (base->hres_active)
@@ -568,6 +589,8 @@ static int hrtimer_switch_to_hres(void)
 
 	if (tick_init_highres()) {
 		local_irq_restore(flags);
+		printk(KERN_WARNING "Could not switch to high resolution "
+		       "mode on CPU %d\n", cpu);
 		return 0;
 	}
 	base->hres_active = 1;
@@ -683,6 +706,7 @@ static void enqueue_hrtimer(struct hrtimer *timer,
 	struct rb_node **link = &base->active.rb_node;
 	struct rb_node *parent = NULL;
 	struct hrtimer *entry;
+	int leftmost = 1;
 
 	/*
 	 * Find the right place in the rbtree:
@@ -694,18 +718,19 @@ static void enqueue_hrtimer(struct hrtimer *timer,
 		 * We dont care about collisions. Nodes with
 		 * the same expiry time stay together.
 		 */
-		if (timer->expires.tv64 < entry->expires.tv64)
+		if (timer->expires.tv64 < entry->expires.tv64) {
 			link = &(*link)->rb_left;
-		else
+		} else {
 			link = &(*link)->rb_right;
+			leftmost = 0;
+		}
 	}
 
 	/*
 	 * Insert the timer to the rbtree and check whether it
 	 * replaces the first pending timer
 	 */
-	if (!base->first || timer->expires.tv64 <
-	    rb_entry(base->first, struct hrtimer, node)->expires.tv64) {
+	if (leftmost) {
 		/*
 		 * Reprogram the clock event device. When the timer is already
 		 * expired hrtimer_enqueue_reprogram has either called the
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 615ce97c6cfd..f1a73f0b54e7 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -352,13 +352,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 	 * keep it masked and get out of here
 	 */
 	action = desc->action;
-	if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
-		desc->status |= IRQ_PENDING;
+	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
 		goto out_unlock;
-	}
 
 	desc->status |= IRQ_INPROGRESS;
-	desc->status &= ~IRQ_PENDING;
 	spin_unlock(&desc->lock);
 
 	action_ret = handle_IRQ_event(irq, action);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d8ee241115f5..6d9204f3a370 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,5 +1,6 @@
 #include <linux/module.h>
 #include <linux/interrupt.h>
+#include <linux/device.h>
 
 /*
  * Device resource management aware IRQ request/free implementation.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 203a518b6f14..7230d914eaa2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -462,7 +462,9 @@ void free_irq(unsigned int irq, void *dev_id)
 		 * We do this after actually deregistering it, to make sure that
 		 * a 'real' IRQ doesn't run in parallel with our fake
 		 */
+		local_irq_save(flags);
 		handler(irq, dev_id);
+		local_irq_restore(flags);
 	}
 #endif
 }
@@ -545,14 +547,11 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 		 * We do this before actually registering it, to make sure that
 		 * a 'real' IRQ doesn't run in parallel with our fake
 		 */
-		if (irqflags & IRQF_DISABLED) {
-			unsigned long flags;
+		unsigned long flags;
 
-			local_irq_save(flags);
-			handler(irq, dev_id);
-			local_irq_restore(flags);
-		} else
-			handler(irq, dev_id);
+		local_irq_save(flags);
+		handler(irq, dev_id);
+		local_irq_restore(flags);
 	}
 #endif
 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index b4f1674fca79..50b81b98046a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,15 @@ static struct proc_dir_entry *root_irq_dir;
 static int irq_affinity_read_proc(char *page, char **start, off_t off,
 				  int count, int *eof, void *data)
 {
-	int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity);
+	struct irq_desc *desc = irq_desc + (long)data;
+	cpumask_t *mask = &desc->affinity;
+	int len;
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	if (desc->status & IRQ_MOVE_PENDING)
+		mask = &desc->pending_mask;
+#endif
+	len = cpumask_scnprintf(page, count, *mask);
 
 	if (count - len < 2)
 		return -EINVAL;
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 5bfeaed7e487..a8046791ba2d 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -62,7 +62,12 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
 	 */
 	desc->chip->enable(irq);
 
-	if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
+	/*
+	 * We do not resend level type interrupts. Level type
+	 * interrupts are resent by hardware when they are still
+	 * active.
+	 */
+	if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
 		desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
 
 		if (!desc->chip || !desc->chip->retrigger ||
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 4d32eb077179..c6a4f8aebeba 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -33,6 +33,8 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
 #include <asm/uaccess.h>
 
 extern int max_threads;
@@ -119,9 +121,10 @@ struct subprocess_info {
 	char **argv;
 	char **envp;
 	struct key *ring;
-	int wait;
+	enum umh_wait wait;
 	int retval;
 	struct file *stdin;
+	void (*cleanup)(char **argv, char **envp);
 };
 
 /*
@@ -180,6 +183,14 @@ static int ____call_usermodehelper(void *data)
 	do_exit(0);
 }
 
+void call_usermodehelper_freeinfo(struct subprocess_info *info)
+{
+	if (info->cleanup)
+		(*info->cleanup)(info->argv, info->envp);
+	kfree(info);
+}
+EXPORT_SYMBOL(call_usermodehelper_freeinfo);
+
 /* Keventd can't block, but this (a child) can. */
 static int wait_for_helper(void *data)
 {
@@ -216,8 +227,8 @@ static int wait_for_helper(void *data)
 		sub_info->retval = ret;
 	}
 
-	if (sub_info->wait < 0)
-		kfree(sub_info);
+	if (sub_info->wait == UMH_NO_WAIT)
+		call_usermodehelper_freeinfo(sub_info);
 	else
 		complete(sub_info->complete);
 	return 0;
@@ -229,34 +240,204 @@ static void __call_usermodehelper(struct work_struct *work)
 	struct subprocess_info *sub_info =
 		container_of(work, struct subprocess_info, work);
 	pid_t pid;
-	int wait = sub_info->wait;
+	enum umh_wait wait = sub_info->wait;
 
 	/* CLONE_VFORK: wait until the usermode helper has execve'd
 	 * successfully We need the data structures to stay around
 	 * until that is done. */
-	if (wait)
+	if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT)
 		pid = kernel_thread(wait_for_helper, sub_info,
 				    CLONE_FS | CLONE_FILES | SIGCHLD);
 	else
 		pid = kernel_thread(____call_usermodehelper, sub_info,
 				    CLONE_VFORK | SIGCHLD);
 
-	if (wait < 0)
-		return;
+	switch (wait) {
+	case UMH_NO_WAIT:
+		break;
 
-	if (pid < 0) {
+	case UMH_WAIT_PROC:
+		if (pid > 0)
+			break;
 		sub_info->retval = pid;
+		/* FALLTHROUGH */
+
+	case UMH_WAIT_EXEC:
 		complete(sub_info->complete);
-	} else if (!wait)
-		complete(sub_info->complete);
+	}
+}
+
+#ifdef CONFIG_PM
+/*
+ * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
+ * (used for preventing user land processes from being created after the user
+ * land has been frozen during a system-wide hibernation or suspend operation).
+ */
+static int usermodehelper_disabled;
+
+/* Number of helpers running */
+static atomic_t running_helpers = ATOMIC_INIT(0);
+
+/*
+ * Wait queue head used by usermodehelper_pm_callback() to wait for all running
+ * helpers to finish.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
+
+/*
+ * Time to wait for running_helpers to become zero before the setting of
+ * usermodehelper_disabled in usermodehelper_pm_callback() fails
+ */
+#define RUNNING_HELPERS_TIMEOUT	(5 * HZ)
+
+static int usermodehelper_pm_callback(struct notifier_block *nfb,
+					unsigned long action,
+					void *ignored)
+{
+	long retval;
+
+	switch (action) {
+	case PM_HIBERNATION_PREPARE:
+	case PM_SUSPEND_PREPARE:
+		usermodehelper_disabled = 1;
+		smp_mb();
+		/*
+		 * From now on call_usermodehelper_exec() won't start any new
+		 * helpers, so it is sufficient if running_helpers turns out to
+		 * be zero at one point (it may be increased later, but that
+		 * doesn't matter).
+		 */
+		retval = wait_event_timeout(running_helpers_waitq,
+					atomic_read(&running_helpers) == 0,
+					RUNNING_HELPERS_TIMEOUT);
+		if (retval) {
+			return NOTIFY_OK;
+		} else {
+			usermodehelper_disabled = 0;
+			return NOTIFY_BAD;
+		}
+	case PM_POST_HIBERNATION:
+	case PM_POST_SUSPEND:
+		usermodehelper_disabled = 0;
+		return NOTIFY_OK;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static void helper_lock(void)
+{
+	atomic_inc(&running_helpers);
+	smp_mb__after_atomic_inc();
+}
+
+static void helper_unlock(void)
+{
+	if (atomic_dec_and_test(&running_helpers))
+		wake_up(&running_helpers_waitq);
+}
+
+static void register_pm_notifier_callback(void)
+{
+	pm_notifier(usermodehelper_pm_callback, 0);
 }
+#else /* CONFIG_PM */
+#define usermodehelper_disabled	0
+
+static inline void helper_lock(void) {}
+static inline void helper_unlock(void) {}
+static inline void register_pm_notifier_callback(void) {}
+#endif /* CONFIG_PM */
 
 /**
- * call_usermodehelper_keys - start a usermode application
- * @path: pathname for the application
- * @argv: null-terminated argument list
- * @envp: null-terminated environment list
- * @session_keyring: session keyring for process (NULL for an empty keyring)
+ * call_usermodehelper_setup - prepare to call a usermode helper
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ *
+ * Returns either %NULL on allocation failure, or a subprocess_info
+ * structure.  This should be passed to call_usermodehelper_exec to
+ * exec the process and free the structure.
+ */
+struct subprocess_info *call_usermodehelper_setup(char *path,
+						  char **argv, char **envp)
+{
+	struct subprocess_info *sub_info;
+	sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
+	if (!sub_info)
+		goto out;
+
+	INIT_WORK(&sub_info->work, __call_usermodehelper);
+	sub_info->path = path;
+	sub_info->argv = argv;
+	sub_info->envp = envp;
+
+  out:
+	return sub_info;
+}
+EXPORT_SYMBOL(call_usermodehelper_setup);
+
+/**
+ * call_usermodehelper_setkeys - set the session keys for usermode helper
+ * @info: a subprocess_info returned by call_usermodehelper_setup
+ * @session_keyring: the session keyring for the process
+ */
+void call_usermodehelper_setkeys(struct subprocess_info *info,
+				 struct key *session_keyring)
+{
+	info->ring = session_keyring;
+}
+EXPORT_SYMBOL(call_usermodehelper_setkeys);
+
+/**
+ * call_usermodehelper_setcleanup - set a cleanup function
+ * @info: a subprocess_info returned by call_usermodehelper_setup
+ * @cleanup: a cleanup function
+ *
+ * The cleanup function is just befor ethe subprocess_info is about to
+ * be freed.  This can be used for freeing the argv and envp.  The
+ * Function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
+ */
+void call_usermodehelper_setcleanup(struct subprocess_info *info,
+				    void (*cleanup)(char **argv, char **envp))
+{
+	info->cleanup = cleanup;
+}
+EXPORT_SYMBOL(call_usermodehelper_setcleanup);
+
+/**
+ * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
+ * @sub_info: a subprocess_info returned by call_usermodehelper_setup
+ * @filp: set to the write-end of a pipe
+ *
+ * This constructs a pipe, and sets the read end to be the stdin of the
+ * subprocess, and returns the write-end in *@filp.
+ */
+int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
+				  struct file **filp)
+{
+	struct file *f;
+
+	f = create_write_pipe();
+	if (IS_ERR(f))
+		return PTR_ERR(f);
+	*filp = f;
+
+	f = create_read_pipe(f);
+	if (IS_ERR(f)) {
+		free_write_pipe(*filp);
+		return PTR_ERR(f);
+	}
+	sub_info->stdin = f;
+
+	return 0;
+}
+EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
+
+/**
+ * call_usermodehelper_exec - start a usermode application
+ * @sub_info: information about the subprocessa
  * @wait: wait for the application to finish and return status.
  *        when -1 don't wait at all, but you get no useful error back when
  *        the program couldn't be exec'ed. This makes it safe to call
@@ -265,81 +446,70 @@ static void __call_usermodehelper(struct work_struct *work)
 * Runs a user-space application. The application is started
 * asynchronously if wait is not set, and runs as a child of keventd.
 * (ie. it runs with full root capabilities).
- *
- * Must be called from process context. Returns a negative error code
- * if program was not execed successfully, or 0.
 */
-int call_usermodehelper_keys(char *path, char **argv, char **envp,
-			     struct key *session_keyring, int wait)
+int call_usermodehelper_exec(struct subprocess_info *sub_info,
+			     enum umh_wait wait)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
-	struct subprocess_info *sub_info;
 	int retval;
 
-	if (!khelper_wq)
-		return -EBUSY;
-
-	if (path[0] == '\0')
-		return 0;
+	helper_lock();
+	if (sub_info->path[0] == '\0') {
+		retval = 0;
+		goto out;
+	}
 
-	sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
-	if (!sub_info)
-		return -ENOMEM;
+	if (!khelper_wq || usermodehelper_disabled) {
+		retval = -EBUSY;
+		goto out;
+	}
 
-	INIT_WORK(&sub_info->work, __call_usermodehelper);
 	sub_info->complete = &done;
-	sub_info->path = path;
-	sub_info->argv = argv;
-	sub_info->envp = envp;
-	sub_info->ring = session_keyring;
 	sub_info->wait = wait;
 
 	queue_work(khelper_wq, &sub_info->work);
-	if (wait < 0) /* task has freed sub_info */
+	if (wait == UMH_NO_WAIT)	/* task has freed sub_info */
 		return 0;
 	wait_for_completion(&done);
 	retval = sub_info->retval;
-	kfree(sub_info);
+
+ out:
+	call_usermodehelper_freeinfo(sub_info);
+	helper_unlock();
 	return retval;
 }
-EXPORT_SYMBOL(call_usermodehelper_keys);
+EXPORT_SYMBOL(call_usermodehelper_exec);
 
+/**
+ * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @filp: set to the write-end of a pipe
+ *
+ * This is a simple wrapper which executes a usermode-helper function
+ * with a pipe as stdin.  It is implemented entirely in terms of
+ * lower-level call_usermodehelper_* functions.
+ */
 int call_usermodehelper_pipe(char *path, char **argv, char **envp,
			     struct file **filp)
 {
-	DECLARE_COMPLETION(done);
-	struct subprocess_info sub_info = {
-		.work = __WORK_INITIALIZER(sub_info.work,
-					    __call_usermodehelper),
-		.complete = &done,
-		.path = path,
-		.argv = argv,
-		.envp = envp,
-		.retval = 0,
-	};
-	struct file *f;
-
-	if (!khelper_wq)
-		return -EBUSY;
+	struct subprocess_info *sub_info;
+	int ret;
 
-	if (path[0] == '\0')
-		return 0;
+	sub_info = call_usermodehelper_setup(path, argv, envp);
+	if (sub_info == NULL)
+		return -ENOMEM;
 
-	f = create_write_pipe();
-	if (IS_ERR(f))
-		return PTR_ERR(f);
-	*filp = f;
+	ret = call_usermodehelper_stdinpipe(sub_info, filp);
	if (ret < 0)
		goto out;
 
-	f = create_read_pipe(f);
-	if (IS_ERR(f)) {
-		free_write_pipe(*filp);
-		return PTR_ERR(f);
-	}
-	sub_info.stdin = f;
+	return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 
-	queue_work(khelper_wq, &sub_info.work);
-	wait_for_completion(&done);
-	return sub_info.retval;
+ out:
+	call_usermodehelper_freeinfo(sub_info);
+	return ret;
 }
 EXPORT_SYMBOL(call_usermodehelper_pipe);
 
@@ -347,4 +517,5 @@ void __init usermodehelper_init(void)
 {
 	khelper_wq = create_singlethread_workqueue("khelper");
 	BUG_ON(!khelper_wq);
+	register_pm_notifier_callback();
 }
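
Taken together, the kmod.c changes split the old call_usermodehelper_keys() entry point into a setup/exec pair. A hedged sketch of how a caller might use the new interface after this commit; the helper path and arguments below are made up for illustration:

static int run_example_helper(void)
{
	struct subprocess_info *info;
	char *argv[] = { "/sbin/example-helper", "--flag", NULL };	/* hypothetical helper */
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

	info = call_usermodehelper_setup(argv[0], argv, envp);
	if (info == NULL)
		return -ENOMEM;

	/* UMH_WAIT_PROC waits for the helper to exit and returns its status;
	 * call_usermodehelper_exec() frees the subprocess_info itself here. */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}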
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9e47d8c493f3..4b8a4493c541 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -675,9 +675,18 @@ static struct notifier_block kprobe_exceptions_nb = {
675 .priority = 0x7fffffff /* we need to be notified first */ 675 .priority = 0x7fffffff /* we need to be notified first */
676}; 676};
677 677
678unsigned long __weak arch_deref_entry_point(void *entry)
679{
680 return (unsigned long)entry;
681}
678 682
679int __kprobes register_jprobe(struct jprobe *jp) 683int __kprobes register_jprobe(struct jprobe *jp)
680{ 684{
685 unsigned long addr = arch_deref_entry_point(jp->entry);
686
687 if (!kernel_text_address(addr))
688 return -EINVAL;
689
681 /* Todo: Verify probepoint is a function entry point */ 690 /* Todo: Verify probepoint is a function entry point */
682 jp->kp.pre_handler = setjmp_pre_handler; 691 jp->kp.pre_handler = setjmp_pre_handler;
683 jp->kp.break_handler = longjmp_break_handler; 692 jp->kp.break_handler = longjmp_break_handler;
@@ -1054,6 +1063,11 @@ EXPORT_SYMBOL_GPL(register_kprobe);
1054EXPORT_SYMBOL_GPL(unregister_kprobe); 1063EXPORT_SYMBOL_GPL(unregister_kprobe);
1055EXPORT_SYMBOL_GPL(register_jprobe); 1064EXPORT_SYMBOL_GPL(register_jprobe);
1056EXPORT_SYMBOL_GPL(unregister_jprobe); 1065EXPORT_SYMBOL_GPL(unregister_jprobe);
1066#ifdef CONFIG_KPROBES
1057EXPORT_SYMBOL_GPL(jprobe_return); 1067EXPORT_SYMBOL_GPL(jprobe_return);
1068#endif
1069
1070#ifdef CONFIG_KPROBES
1058EXPORT_SYMBOL_GPL(register_kretprobe); 1071EXPORT_SYMBOL_GPL(register_kretprobe);
1059EXPORT_SYMBOL_GPL(unregister_kretprobe); 1072EXPORT_SYMBOL_GPL(unregister_kretprobe);
1073#endif
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 559deca5ed15..d0e5c48e18c7 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -62,6 +62,28 @@ static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page)
62KERNEL_ATTR_RO(kexec_crash_loaded); 62KERNEL_ATTR_RO(kexec_crash_loaded);
63#endif /* CONFIG_KEXEC */ 63#endif /* CONFIG_KEXEC */
64 64
65/*
66 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
67 */
68extern const void __start_notes __attribute__((weak));
69extern const void __stop_notes __attribute__((weak));
70#define notes_size (&__stop_notes - &__start_notes)
71
72static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr,
73 char *buf, loff_t off, size_t count)
74{
75 memcpy(buf, &__start_notes + off, count);
76 return count;
77}
78
79static struct bin_attribute notes_attr = {
80 .attr = {
81 .name = "notes",
82 .mode = S_IRUGO,
83 },
84 .read = &notes_read,
85};
86
65decl_subsys(kernel, NULL, NULL); 87decl_subsys(kernel, NULL, NULL);
66EXPORT_SYMBOL_GPL(kernel_subsys); 88EXPORT_SYMBOL_GPL(kernel_subsys);
67 89
@@ -88,6 +110,12 @@ static int __init ksysfs_init(void)
88 error = sysfs_create_group(&kernel_subsys.kobj, 110 error = sysfs_create_group(&kernel_subsys.kobj,
89 &kernel_attr_group); 111 &kernel_attr_group);
90 112
113 if (!error && notes_size > 0) {
114 notes_attr.size = notes_size;
115 error = sysfs_create_bin_file(&kernel_subsys.kobj,
116 &notes_attr);
117 }
118
91 return error; 119 return error;
92} 120}
93 121
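
The ksysfs.c hunk exposes the kernel's .notes ELF section through a read-only sysfs binary attribute, bounding it with the linker-provided __start_notes/__stop_notes symbols. A userspace analogue of that trick, assuming an ELF target with GNU ld (which emits __start_<section>/__stop_<section> for section names that are valid C identifiers), is sketched below; the section name "mynotes" is made up for the example.

/* Userspace sketch of reading a blob bounded by linker symbols, the same
 * trick the ksysfs hunk uses for the kernel .notes section. The dump may
 * include alignment padding between the two objects. */
#include <stdio.h>
#include <stddef.h>

/* Put some bytes into a custom section; "used" keeps them alive. */
static const char note_a[] __attribute__((section("mynotes"), used)) = "first";
static const char note_b[] __attribute__((section("mynotes"), used)) = "second";

extern const char __start_mynotes[];
extern const char __stop_mynotes[];

int main(void)
{
	size_t size = (size_t)(__stop_mynotes - __start_mynotes);

	printf("section spans %zu bytes\n", size);
	fwrite(__start_mynotes, 1, size, stdout);	/* raw dump, like notes_read() */
	putchar('\n');
	return 0;
}
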
diff --git a/kernel/kthread.c b/kernel/kthread.c
index a404f7ee7395..dcfe724300eb 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -214,23 +214,15 @@ int kthread_stop(struct task_struct *k)
214} 214}
215EXPORT_SYMBOL(kthread_stop); 215EXPORT_SYMBOL(kthread_stop);
216 216
217 217int kthreadd(void *unused)
218static noinline __init_refok void kthreadd_setup(void)
219{ 218{
220 struct task_struct *tsk = current; 219 struct task_struct *tsk = current;
221 220
221 /* Setup a clean context for our children to inherit. */
222 set_task_comm(tsk, "kthreadd"); 222 set_task_comm(tsk, "kthreadd");
223
224 ignore_signals(tsk); 223 ignore_signals(tsk);
225
226 set_user_nice(tsk, -5); 224 set_user_nice(tsk, -5);
227 set_cpus_allowed(tsk, CPU_MASK_ALL); 225 set_cpus_allowed(tsk, CPU_MASK_ALL);
228}
229
230int kthreadd(void *unused)
231{
232 /* Setup a clean context for our children to inherit. */
233 kthreadd_setup();
234 226
235 current->flags |= PF_NOFREEZE; 227 current->flags |= PF_NOFREEZE;
236 228
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index edba2ffb43de..734da579ad13 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -5,7 +5,8 @@
5 * 5 *
6 * Started by Ingo Molnar: 6 * Started by Ingo Molnar:
7 * 7 *
8 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 8 * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * 10 *
10 * this code maps all the lock dependencies as they occur in a live kernel 11 * this code maps all the lock dependencies as they occur in a live kernel
11 * and will warn about the following classes of locking bugs: 12 * and will warn about the following classes of locking bugs:
@@ -37,11 +38,26 @@
37#include <linux/debug_locks.h> 38#include <linux/debug_locks.h>
38#include <linux/irqflags.h> 39#include <linux/irqflags.h>
39#include <linux/utsname.h> 40#include <linux/utsname.h>
41#include <linux/hash.h>
40 42
41#include <asm/sections.h> 43#include <asm/sections.h>
42 44
43#include "lockdep_internals.h" 45#include "lockdep_internals.h"
44 46
47#ifdef CONFIG_PROVE_LOCKING
48int prove_locking = 1;
49module_param(prove_locking, int, 0644);
50#else
51#define prove_locking 0
52#endif
53
54#ifdef CONFIG_LOCK_STAT
55int lock_stat = 1;
56module_param(lock_stat, int, 0644);
57#else
58#define lock_stat 0
59#endif
60
45/* 61/*
46 * lockdep_lock: protects the lockdep graph, the hashes and the 62 * lockdep_lock: protects the lockdep graph, the hashes and the
47 * class/list/hash allocators. 63 * class/list/hash allocators.
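
The new prove_locking and lock_stat knobs follow a common kernel idiom: when the feature is configured in they are ordinary integers registered as module parameters, otherwise the same name is a literal 0 so that guards such as "if (!prove_locking)" vanish at compile time. A minimal standalone sketch of that idiom, with FEATURE_X standing in for CONFIG_PROVE_LOCKING, is:

/* Sketch of the "runtime toggle or compile-time zero" pattern used for
 * prove_locking/lock_stat. FEATURE_X is an illustrative stand-in. */
#include <stdio.h>

#ifdef FEATURE_X
int feature_x = 1;		/* runtime-tunable, e.g. via a module parameter */
#else
#define feature_x 0		/* constant: the guarded code is compiled away */
#endif

static void do_expensive_checks(void)
{
	puts("running expensive checks");
}

int main(void)
{
	if (feature_x)		/* dead code when the feature is configured out */
		do_expensive_checks();
	else
		puts("checks disabled");
	return 0;
}

Building with -DFEATURE_X takes the runtime-toggle branch; without it the compiler can drop the guarded call entirely.
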
@@ -96,23 +112,6 @@ unsigned long nr_list_entries;
96static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; 112static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
97 113
98/* 114/*
99 * Allocate a lockdep entry. (assumes the graph_lock held, returns
100 * with NULL on failure)
101 */
102static struct lock_list *alloc_list_entry(void)
103{
104 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
105 if (!debug_locks_off_graph_unlock())
106 return NULL;
107
108 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
109 printk("turning off the locking correctness validator.\n");
110 return NULL;
111 }
112 return list_entries + nr_list_entries++;
113}
114
115/*
116 * All data structures here are protected by the global debug_lock. 115 * All data structures here are protected by the global debug_lock.
117 * 116 *
118 * Mutex key structs only get allocated, once during bootup, and never 117 * Mutex key structs only get allocated, once during bootup, and never
@@ -121,6 +120,117 @@ static struct lock_list *alloc_list_entry(void)
121unsigned long nr_lock_classes; 120unsigned long nr_lock_classes;
122static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; 121static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
123 122
123#ifdef CONFIG_LOCK_STAT
124static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
125
126static int lock_contention_point(struct lock_class *class, unsigned long ip)
127{
128 int i;
129
130 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
131 if (class->contention_point[i] == 0) {
132 class->contention_point[i] = ip;
133 break;
134 }
135 if (class->contention_point[i] == ip)
136 break;
137 }
138
139 return i;
140}
141
142static void lock_time_inc(struct lock_time *lt, s64 time)
143{
144 if (time > lt->max)
145 lt->max = time;
146
147 if (time < lt->min || !lt->min)
148 lt->min = time;
149
150 lt->total += time;
151 lt->nr++;
152}
153
154static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
155{
156 dst->min += src->min;
157 dst->max += src->max;
158 dst->total += src->total;
159 dst->nr += src->nr;
160}
161
162struct lock_class_stats lock_stats(struct lock_class *class)
163{
164 struct lock_class_stats stats;
165 int cpu, i;
166
167 memset(&stats, 0, sizeof(struct lock_class_stats));
168 for_each_possible_cpu(cpu) {
169 struct lock_class_stats *pcs =
170 &per_cpu(lock_stats, cpu)[class - lock_classes];
171
172 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
173 stats.contention_point[i] += pcs->contention_point[i];
174
175 lock_time_add(&pcs->read_waittime, &stats.read_waittime);
176 lock_time_add(&pcs->write_waittime, &stats.write_waittime);
177
178 lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
179 lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
180
181 for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
182 stats.bounces[i] += pcs->bounces[i];
183 }
184
185 return stats;
186}
187
188void clear_lock_stats(struct lock_class *class)
189{
190 int cpu;
191
192 for_each_possible_cpu(cpu) {
193 struct lock_class_stats *cpu_stats =
194 &per_cpu(lock_stats, cpu)[class - lock_classes];
195
196 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
197 }
198 memset(class->contention_point, 0, sizeof(class->contention_point));
199}
200
201static struct lock_class_stats *get_lock_stats(struct lock_class *class)
202{
203 return &get_cpu_var(lock_stats)[class - lock_classes];
204}
205
206static void put_lock_stats(struct lock_class_stats *stats)
207{
208 put_cpu_var(lock_stats);
209}
210
211static void lock_release_holdtime(struct held_lock *hlock)
212{
213 struct lock_class_stats *stats;
214 s64 holdtime;
215
216 if (!lock_stat)
217 return;
218
219 holdtime = sched_clock() - hlock->holdtime_stamp;
220
221 stats = get_lock_stats(hlock->class);
222 if (hlock->read)
223 lock_time_inc(&stats->read_holdtime, holdtime);
224 else
225 lock_time_inc(&stats->write_holdtime, holdtime);
226 put_lock_stats(stats);
227}
228#else
229static inline void lock_release_holdtime(struct held_lock *hlock)
230{
231}
232#endif
233
124/* 234/*
125 * We keep a global list of all lock classes. The list only grows, 235 * We keep a global list of all lock classes. The list only grows,
126 * never shrinks. The list is only accessed with the lockdep 236 * never shrinks. The list is only accessed with the lockdep
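
The CONFIG_LOCK_STAT block above keeps one lock_class_stats per CPU and folds the per-CPU copies together in lock_stats(), while lock_time_inc() records count, min, max and total for each sample. The standalone sketch below models that accumulator and the fold with a plain array in place of per-CPU data; NR_CPUS and the sample values are illustrative.

/* Standalone sketch of the lock_time accumulator and the per-CPU fold
 * performed by lock_stats(). The kernel uses per_cpu() data rather than
 * an array, and min/max are summed in the fold exactly as in the hunk,
 * which only makes sense because each bucket is folded once. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define NR_CPUS 4

struct lock_time {
	int64_t min, max, total;
	unsigned long nr;
};

static void lock_time_inc(struct lock_time *lt, int64_t t)
{
	if (t > lt->max)
		lt->max = t;
	if (t < lt->min || !lt->min)	/* zero means "not yet set", as in the kernel */
		lt->min = t;
	lt->total += t;
	lt->nr++;
}

static void lock_time_add(const struct lock_time *src, struct lock_time *dst)
{
	dst->min += src->min;
	dst->max += src->max;
	dst->total += src->total;
	dst->nr += src->nr;
}

int main(void)
{
	struct lock_time percpu[NR_CPUS], total;
	int cpu;

	memset(percpu, 0, sizeof(percpu));
	memset(&total, 0, sizeof(total));

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		lock_time_inc(&percpu[cpu], 10 + cpu);	/* fake wait times */
		lock_time_inc(&percpu[cpu], 30 - cpu);
	}
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		lock_time_add(&percpu[cpu], &total);

	printf("samples=%lu total=%lld\n", total.nr, (long long)total.total);
	return 0;
}
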
@@ -133,24 +243,18 @@ LIST_HEAD(all_lock_classes);
133 */ 243 */
134#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) 244#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
135#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) 245#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS)
136#define CLASSHASH_MASK (CLASSHASH_SIZE - 1) 246#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS)
137#define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK)
138#define classhashentry(key) (classhash_table + __classhashfn((key))) 247#define classhashentry(key) (classhash_table + __classhashfn((key)))
139 248
140static struct list_head classhash_table[CLASSHASH_SIZE]; 249static struct list_head classhash_table[CLASSHASH_SIZE];
141 250
142unsigned long nr_lock_chains;
143static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
144
145/* 251/*
146 * We put the lock dependency chains into a hash-table as well, to cache 252 * We put the lock dependency chains into a hash-table as well, to cache
147 * their existence: 253 * their existence:
148 */ 254 */
149#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) 255#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1)
150#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) 256#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS)
151#define CHAINHASH_MASK (CHAINHASH_SIZE - 1) 257#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS)
152#define __chainhashfn(chain) \
153 (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK)
154#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) 258#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
155 259
156static struct list_head chainhash_table[CHAINHASH_SIZE]; 260static struct list_head chainhash_table[CHAINHASH_SIZE];
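
Both hash tables now use hash_long() instead of the old fold-and-mask expressions, i.e. multiplicative (golden-ratio) hashing that keeps the top bits of the product. The sketch below shows that scheme for 32-bit values; the constant 0x9E3779B9 is the usual 32-bit golden-ratio value, used here for illustration rather than as the exact constant hash_long() picks for a given word size.

/* Sketch of hash_long()-style multiplicative hashing into 2^bits
 * buckets, the scheme the hunk above switches the class/chain hashes to. */
#include <stdio.h>
#include <stdint.h>

#define HASH_BITS 6
#define HASH_SIZE (1u << HASH_BITS)

static unsigned int hash_u32(uint32_t val, unsigned int bits)
{
	/* Multiply, then keep the high `bits` bits: they are best mixed. */
	return (uint32_t)(val * 0x9E3779B9u) >> (32 - bits);
}

int main(void)
{
	uintptr_t keys[] = { 0x1000, 0x1040, 0x1080, 0x10c0 };	/* fake lock keys */
	unsigned int i;

	for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
		printf("key %#lx -> bucket %u of %u\n",
		       (unsigned long)keys[i],
		       hash_u32((uint32_t)keys[i], HASH_BITS), HASH_SIZE);
	return 0;
}
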
@@ -223,26 +327,6 @@ static int verbose(struct lock_class *class)
223 return 0; 327 return 0;
224} 328}
225 329
226#ifdef CONFIG_TRACE_IRQFLAGS
227
228static int hardirq_verbose(struct lock_class *class)
229{
230#if HARDIRQ_VERBOSE
231 return class_filter(class);
232#endif
233 return 0;
234}
235
236static int softirq_verbose(struct lock_class *class)
237{
238#if SOFTIRQ_VERBOSE
239 return class_filter(class);
240#endif
241 return 0;
242}
243
244#endif
245
246/* 330/*
247 * Stack-trace: tightly packed array of stack backtrace 331 * Stack-trace: tightly packed array of stack backtrace
248 * addresses. Protected by the graph_lock. 332 * addresses. Protected by the graph_lock.
@@ -291,6 +375,11 @@ unsigned int max_recursion_depth;
291 * about it later on, in lockdep_info(). 375 * about it later on, in lockdep_info().
292 */ 376 */
293static int lockdep_init_error; 377static int lockdep_init_error;
378static unsigned long lockdep_init_trace_data[20];
379static struct stack_trace lockdep_init_trace = {
380 .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
381 .entries = lockdep_init_trace_data,
382};
294 383
295/* 384/*
296 * Various lockdep statistics: 385 * Various lockdep statistics:
@@ -482,6 +571,262 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
482 } 571 }
483} 572}
484 573
574static void print_kernel_version(void)
575{
576 printk("%s %.*s\n", init_utsname()->release,
577 (int)strcspn(init_utsname()->version, " "),
578 init_utsname()->version);
579}
580
581static int very_verbose(struct lock_class *class)
582{
583#if VERY_VERBOSE
584 return class_filter(class);
585#endif
586 return 0;
587}
588
589/*
590 * Is this the address of a static object:
591 */
592static int static_obj(void *obj)
593{
594 unsigned long start = (unsigned long) &_stext,
595 end = (unsigned long) &_end,
596 addr = (unsigned long) obj;
597#ifdef CONFIG_SMP
598 int i;
599#endif
600
601 /*
602 * static variable?
603 */
604 if ((addr >= start) && (addr < end))
605 return 1;
606
607#ifdef CONFIG_SMP
608 /*
609 * percpu var?
610 */
611 for_each_possible_cpu(i) {
612 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
613 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
614 + per_cpu_offset(i);
615
616 if ((addr >= start) && (addr < end))
617 return 1;
618 }
619#endif
620
621 /*
622 * module var?
623 */
624 return is_module_address(addr);
625}
626
627/*
628 * To make lock name printouts unique, we calculate a unique
629 * class->name_version generation counter:
630 */
631static int count_matching_names(struct lock_class *new_class)
632{
633 struct lock_class *class;
634 int count = 0;
635
636 if (!new_class->name)
637 return 0;
638
639 list_for_each_entry(class, &all_lock_classes, lock_entry) {
640 if (new_class->key - new_class->subclass == class->key)
641 return class->name_version;
642 if (class->name && !strcmp(class->name, new_class->name))
643 count = max(count, class->name_version);
644 }
645
646 return count + 1;
647}
648
649/*
650 * Register a lock's class in the hash-table, if the class is not present
651 * yet. Otherwise we look it up. We cache the result in the lock object
652 * itself, so actual lookup of the hash should be once per lock object.
653 */
654static inline struct lock_class *
655look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
656{
657 struct lockdep_subclass_key *key;
658 struct list_head *hash_head;
659 struct lock_class *class;
660
661#ifdef CONFIG_DEBUG_LOCKDEP
662 /*
663 * If the architecture calls into lockdep before initializing
664 * the hashes then we'll warn about it later. (we cannot printk
665 * right now)
666 */
667 if (unlikely(!lockdep_initialized)) {
668 lockdep_init();
669 lockdep_init_error = 1;
670 save_stack_trace(&lockdep_init_trace);
671 }
672#endif
673
674 /*
675 * Static locks do not have their class-keys yet - for them the key
676 * is the lock object itself:
677 */
678 if (unlikely(!lock->key))
679 lock->key = (void *)lock;
680
681 /*
682 * NOTE: the class-key must be unique. For dynamic locks, a static
683 * lock_class_key variable is passed in through the mutex_init()
684 * (or spin_lock_init()) call - which acts as the key. For static
685 * locks we use the lock object itself as the key.
686 */
687 BUILD_BUG_ON(sizeof(struct lock_class_key) >
688 sizeof(struct lockdep_map));
689
690 key = lock->key->subkeys + subclass;
691
692 hash_head = classhashentry(key);
693
694 /*
695 * We can walk the hash lockfree, because the hash only
696 * grows, and we are careful when adding entries to the end:
697 */
698 list_for_each_entry(class, hash_head, hash_entry) {
699 if (class->key == key) {
700 WARN_ON_ONCE(class->name != lock->name);
701 return class;
702 }
703 }
704
705 return NULL;
706}
707
708/*
709 * Register a lock's class in the hash-table, if the class is not present
710 * yet. Otherwise we look it up. We cache the result in the lock object
711 * itself, so actual lookup of the hash should be once per lock object.
712 */
713static inline struct lock_class *
714register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
715{
716 struct lockdep_subclass_key *key;
717 struct list_head *hash_head;
718 struct lock_class *class;
719 unsigned long flags;
720
721 class = look_up_lock_class(lock, subclass);
722 if (likely(class))
723 return class;
724
725 /*
726 * Debug-check: all keys must be persistent!
727 */
728 if (!static_obj(lock->key)) {
729 debug_locks_off();
730 printk("INFO: trying to register non-static key.\n");
731 printk("the code is fine but needs lockdep annotation.\n");
732 printk("turning off the locking correctness validator.\n");
733 dump_stack();
734
735 return NULL;
736 }
737
738 key = lock->key->subkeys + subclass;
739 hash_head = classhashentry(key);
740
741 raw_local_irq_save(flags);
742 if (!graph_lock()) {
743 raw_local_irq_restore(flags);
744 return NULL;
745 }
746 /*
747 * We have to do the hash-walk again, to avoid races
748 * with another CPU:
749 */
750 list_for_each_entry(class, hash_head, hash_entry)
751 if (class->key == key)
752 goto out_unlock_set;
753 /*
754 * Allocate a new key from the static array, and add it to
755 * the hash:
756 */
757 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
758 if (!debug_locks_off_graph_unlock()) {
759 raw_local_irq_restore(flags);
760 return NULL;
761 }
762 raw_local_irq_restore(flags);
763
764 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
765 printk("turning off the locking correctness validator.\n");
766 return NULL;
767 }
768 class = lock_classes + nr_lock_classes++;
769 debug_atomic_inc(&nr_unused_locks);
770 class->key = key;
771 class->name = lock->name;
772 class->subclass = subclass;
773 INIT_LIST_HEAD(&class->lock_entry);
774 INIT_LIST_HEAD(&class->locks_before);
775 INIT_LIST_HEAD(&class->locks_after);
776 class->name_version = count_matching_names(class);
777 /*
778 * We use RCU's safe list-add method to make
779 * parallel walking of the hash-list safe:
780 */
781 list_add_tail_rcu(&class->hash_entry, hash_head);
782
783 if (verbose(class)) {
784 graph_unlock();
785 raw_local_irq_restore(flags);
786
787 printk("\nnew class %p: %s", class->key, class->name);
788 if (class->name_version > 1)
789 printk("#%d", class->name_version);
790 printk("\n");
791 dump_stack();
792
793 raw_local_irq_save(flags);
794 if (!graph_lock()) {
795 raw_local_irq_restore(flags);
796 return NULL;
797 }
798 }
799out_unlock_set:
800 graph_unlock();
801 raw_local_irq_restore(flags);
802
803 if (!subclass || force)
804 lock->class_cache = class;
805
806 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
807 return NULL;
808
809 return class;
810}
811
812#ifdef CONFIG_PROVE_LOCKING
813/*
814 * Allocate a lockdep entry. (assumes the graph_lock held, returns
815 * with NULL on failure)
816 */
817static struct lock_list *alloc_list_entry(void)
818{
819 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
820 if (!debug_locks_off_graph_unlock())
821 return NULL;
822
823 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
824 printk("turning off the locking correctness validator.\n");
825 return NULL;
826 }
827 return list_entries + nr_list_entries++;
828}
829
485/* 830/*
486 * Add a new dependency to the head of the list: 831 * Add a new dependency to the head of the list:
487 */ 832 */
@@ -542,13 +887,6 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
542 return 0; 887 return 0;
543} 888}
544 889
545static void print_kernel_version(void)
546{
547 printk("%s %.*s\n", init_utsname()->release,
548 (int)strcspn(init_utsname()->version, " "),
549 init_utsname()->version);
550}
551
552/* 890/*
553 * When a circular dependency is detected, print the 891 * When a circular dependency is detected, print the
554 * header first: 892 * header first:
@@ -640,15 +978,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
640 return 1; 978 return 1;
641} 979}
642 980
643static int very_verbose(struct lock_class *class)
644{
645#if VERY_VERBOSE
646 return class_filter(class);
647#endif
648 return 0;
649}
650#ifdef CONFIG_TRACE_IRQFLAGS 981#ifdef CONFIG_TRACE_IRQFLAGS
651
652/* 982/*
653 * Forwards and backwards subgraph searching, for the purposes of 983 * Forwards and backwards subgraph searching, for the purposes of
654 * proving that two subgraphs can be connected by a new dependency 984 * proving that two subgraphs can be connected by a new dependency
@@ -821,6 +1151,78 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
821 bit_backwards, bit_forwards, irqclass); 1151 bit_backwards, bit_forwards, irqclass);
822} 1152}
823 1153
1154static int
1155check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
1156 struct held_lock *next)
1157{
1158 /*
1159 * Prove that the new dependency does not connect a hardirq-safe
1160 * lock with a hardirq-unsafe lock - to achieve this we search
1161 * the backwards-subgraph starting at <prev>, and the
1162 * forwards-subgraph starting at <next>:
1163 */
1164 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
1165 LOCK_ENABLED_HARDIRQS, "hard"))
1166 return 0;
1167
1168 /*
1169 * Prove that the new dependency does not connect a hardirq-safe-read
1170 * lock with a hardirq-unsafe lock - to achieve this we search
1171 * the backwards-subgraph starting at <prev>, and the
1172 * forwards-subgraph starting at <next>:
1173 */
1174 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
1175 LOCK_ENABLED_HARDIRQS, "hard-read"))
1176 return 0;
1177
1178 /*
1179 * Prove that the new dependency does not connect a softirq-safe
1180 * lock with a softirq-unsafe lock - to achieve this we search
1181 * the backwards-subgraph starting at <prev>, and the
1182 * forwards-subgraph starting at <next>:
1183 */
1184 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
1185 LOCK_ENABLED_SOFTIRQS, "soft"))
1186 return 0;
1187 /*
1188 * Prove that the new dependency does not connect a softirq-safe-read
1189 * lock with a softirq-unsafe lock - to achieve this we search
1190 * the backwards-subgraph starting at <prev>, and the
1191 * forwards-subgraph starting at <next>:
1192 */
1193 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
1194 LOCK_ENABLED_SOFTIRQS, "soft"))
1195 return 0;
1196
1197 return 1;
1198}
1199
1200static void inc_chains(void)
1201{
1202 if (current->hardirq_context)
1203 nr_hardirq_chains++;
1204 else {
1205 if (current->softirq_context)
1206 nr_softirq_chains++;
1207 else
1208 nr_process_chains++;
1209 }
1210}
1211
1212#else
1213
1214static inline int
1215check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
1216 struct held_lock *next)
1217{
1218 return 1;
1219}
1220
1221static inline void inc_chains(void)
1222{
1223 nr_process_chains++;
1224}
1225
824#endif 1226#endif
825 1227
826static int 1228static int
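
check_prev_add_irq() bundles the four irq-safety checks that used to sit inline in check_prev_add(): a new prev -> next dependency is rejected if it would let a hardirq- or softirq-safe lock depend on an irq-unsafe one. The toy model below checks only the two endpoint classes and only the hardirq pair, whereas the real code walks the full backwards subgraph of <prev> and forwards subgraph of <next>; the names and bit values are illustrative.

/* Toy model of the rule check_prev_add_irq() enforces: a lock ever used
 * in hardirq context must not come to depend on a lock that is ever
 * taken with hardirqs enabled. */
#include <stdio.h>

#define USED_IN_HARDIRQ		0x1	/* lock acquired from hardirq context */
#define ENABLED_HARDIRQS	0x2	/* lock acquired with hardirqs enabled */

struct lock_class {
	const char *name;
	unsigned int usage_mask;
};

/* Adding prev -> next means prev can be held while next is taken. */
static int edge_is_irq_safe(const struct lock_class *prev,
			    const struct lock_class *next)
{
	if ((prev->usage_mask & USED_IN_HARDIRQ) &&
	    (next->usage_mask & ENABLED_HARDIRQS))
		return 0;	/* hardirq-safe would depend on hardirq-unsafe */
	return 1;
}

int main(void)
{
	struct lock_class irq_lock  = { "irq_lock",  USED_IN_HARDIRQ };
	struct lock_class soft_lock = { "soft_lock", ENABLED_HARDIRQS };

	printf("irq_lock -> soft_lock allowed? %s\n",
	       edge_is_irq_safe(&irq_lock, &soft_lock) ? "yes" : "no");
	printf("soft_lock -> irq_lock allowed? %s\n",
	       edge_is_irq_safe(&soft_lock, &irq_lock) ? "yes" : "no");
	return 0;
}
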
@@ -922,47 +1324,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
922 if (!(check_noncircular(next->class, 0))) 1324 if (!(check_noncircular(next->class, 0)))
923 return print_circular_bug_tail(); 1325 return print_circular_bug_tail();
924 1326
925#ifdef CONFIG_TRACE_IRQFLAGS 1327 if (!check_prev_add_irq(curr, prev, next))
926 /*
927 * Prove that the new dependency does not connect a hardirq-safe
928 * lock with a hardirq-unsafe lock - to achieve this we search
929 * the backwards-subgraph starting at <prev>, and the
930 * forwards-subgraph starting at <next>:
931 */
932 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
933 LOCK_ENABLED_HARDIRQS, "hard"))
934 return 0; 1328 return 0;
935 1329
936 /* 1330 /*
937 * Prove that the new dependency does not connect a hardirq-safe-read
938 * lock with a hardirq-unsafe lock - to achieve this we search
939 * the backwards-subgraph starting at <prev>, and the
940 * forwards-subgraph starting at <next>:
941 */
942 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
943 LOCK_ENABLED_HARDIRQS, "hard-read"))
944 return 0;
945
946 /*
947 * Prove that the new dependency does not connect a softirq-safe
948 * lock with a softirq-unsafe lock - to achieve this we search
949 * the backwards-subgraph starting at <prev>, and the
950 * forwards-subgraph starting at <next>:
951 */
952 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
953 LOCK_ENABLED_SOFTIRQS, "soft"))
954 return 0;
955 /*
956 * Prove that the new dependency does not connect a softirq-safe-read
957 * lock with a softirq-unsafe lock - to achieve this we search
958 * the backwards-subgraph starting at <prev>, and the
959 * forwards-subgraph starting at <next>:
960 */
961 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
962 LOCK_ENABLED_SOFTIRQS, "soft"))
963 return 0;
964#endif
965 /*
966 * For recursive read-locks we do all the dependency checks, 1331 * For recursive read-locks we do all the dependency checks,
967 * but we dont store read-triggered dependencies (only 1332 * but we dont store read-triggered dependencies (only
968 * write-triggered dependencies). This ensures that only the 1333 * write-triggered dependencies). This ensures that only the
@@ -1088,224 +1453,8 @@ out_bug:
1088 return 0; 1453 return 0;
1089} 1454}
1090 1455
1091 1456unsigned long nr_lock_chains;
1092/* 1457static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
1093 * Is this the address of a static object:
1094 */
1095static int static_obj(void *obj)
1096{
1097 unsigned long start = (unsigned long) &_stext,
1098 end = (unsigned long) &_end,
1099 addr = (unsigned long) obj;
1100#ifdef CONFIG_SMP
1101 int i;
1102#endif
1103
1104 /*
1105 * static variable?
1106 */
1107 if ((addr >= start) && (addr < end))
1108 return 1;
1109
1110#ifdef CONFIG_SMP
1111 /*
1112 * percpu var?
1113 */
1114 for_each_possible_cpu(i) {
1115 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
1116 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
1117 + per_cpu_offset(i);
1118
1119 if ((addr >= start) && (addr < end))
1120 return 1;
1121 }
1122#endif
1123
1124 /*
1125 * module var?
1126 */
1127 return is_module_address(addr);
1128}
1129
1130/*
1131 * To make lock name printouts unique, we calculate a unique
1132 * class->name_version generation counter:
1133 */
1134static int count_matching_names(struct lock_class *new_class)
1135{
1136 struct lock_class *class;
1137 int count = 0;
1138
1139 if (!new_class->name)
1140 return 0;
1141
1142 list_for_each_entry(class, &all_lock_classes, lock_entry) {
1143 if (new_class->key - new_class->subclass == class->key)
1144 return class->name_version;
1145 if (class->name && !strcmp(class->name, new_class->name))
1146 count = max(count, class->name_version);
1147 }
1148
1149 return count + 1;
1150}
1151
1152/*
1153 * Register a lock's class in the hash-table, if the class is not present
1154 * yet. Otherwise we look it up. We cache the result in the lock object
1155 * itself, so actual lookup of the hash should be once per lock object.
1156 */
1157static inline struct lock_class *
1158look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
1159{
1160 struct lockdep_subclass_key *key;
1161 struct list_head *hash_head;
1162 struct lock_class *class;
1163
1164#ifdef CONFIG_DEBUG_LOCKDEP
1165 /*
1166 * If the architecture calls into lockdep before initializing
1167 * the hashes then we'll warn about it later. (we cannot printk
1168 * right now)
1169 */
1170 if (unlikely(!lockdep_initialized)) {
1171 lockdep_init();
1172 lockdep_init_error = 1;
1173 }
1174#endif
1175
1176 /*
1177 * Static locks do not have their class-keys yet - for them the key
1178 * is the lock object itself:
1179 */
1180 if (unlikely(!lock->key))
1181 lock->key = (void *)lock;
1182
1183 /*
1184 * NOTE: the class-key must be unique. For dynamic locks, a static
1185 * lock_class_key variable is passed in through the mutex_init()
1186 * (or spin_lock_init()) call - which acts as the key. For static
1187 * locks we use the lock object itself as the key.
1188 */
1189 BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class));
1190
1191 key = lock->key->subkeys + subclass;
1192
1193 hash_head = classhashentry(key);
1194
1195 /*
1196 * We can walk the hash lockfree, because the hash only
1197 * grows, and we are careful when adding entries to the end:
1198 */
1199 list_for_each_entry(class, hash_head, hash_entry)
1200 if (class->key == key)
1201 return class;
1202
1203 return NULL;
1204}
1205
1206/*
1207 * Register a lock's class in the hash-table, if the class is not present
1208 * yet. Otherwise we look it up. We cache the result in the lock object
1209 * itself, so actual lookup of the hash should be once per lock object.
1210 */
1211static inline struct lock_class *
1212register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1213{
1214 struct lockdep_subclass_key *key;
1215 struct list_head *hash_head;
1216 struct lock_class *class;
1217 unsigned long flags;
1218
1219 class = look_up_lock_class(lock, subclass);
1220 if (likely(class))
1221 return class;
1222
1223 /*
1224 * Debug-check: all keys must be persistent!
1225 */
1226 if (!static_obj(lock->key)) {
1227 debug_locks_off();
1228 printk("INFO: trying to register non-static key.\n");
1229 printk("the code is fine but needs lockdep annotation.\n");
1230 printk("turning off the locking correctness validator.\n");
1231 dump_stack();
1232
1233 return NULL;
1234 }
1235
1236 key = lock->key->subkeys + subclass;
1237 hash_head = classhashentry(key);
1238
1239 raw_local_irq_save(flags);
1240 if (!graph_lock()) {
1241 raw_local_irq_restore(flags);
1242 return NULL;
1243 }
1244 /*
1245 * We have to do the hash-walk again, to avoid races
1246 * with another CPU:
1247 */
1248 list_for_each_entry(class, hash_head, hash_entry)
1249 if (class->key == key)
1250 goto out_unlock_set;
1251 /*
1252 * Allocate a new key from the static array, and add it to
1253 * the hash:
1254 */
1255 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
1256 if (!debug_locks_off_graph_unlock()) {
1257 raw_local_irq_restore(flags);
1258 return NULL;
1259 }
1260 raw_local_irq_restore(flags);
1261
1262 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
1263 printk("turning off the locking correctness validator.\n");
1264 return NULL;
1265 }
1266 class = lock_classes + nr_lock_classes++;
1267 debug_atomic_inc(&nr_unused_locks);
1268 class->key = key;
1269 class->name = lock->name;
1270 class->subclass = subclass;
1271 INIT_LIST_HEAD(&class->lock_entry);
1272 INIT_LIST_HEAD(&class->locks_before);
1273 INIT_LIST_HEAD(&class->locks_after);
1274 class->name_version = count_matching_names(class);
1275 /*
1276 * We use RCU's safe list-add method to make
1277 * parallel walking of the hash-list safe:
1278 */
1279 list_add_tail_rcu(&class->hash_entry, hash_head);
1280
1281 if (verbose(class)) {
1282 graph_unlock();
1283 raw_local_irq_restore(flags);
1284
1285 printk("\nnew class %p: %s", class->key, class->name);
1286 if (class->name_version > 1)
1287 printk("#%d", class->name_version);
1288 printk("\n");
1289 dump_stack();
1290
1291 raw_local_irq_save(flags);
1292 if (!graph_lock()) {
1293 raw_local_irq_restore(flags);
1294 return NULL;
1295 }
1296 }
1297out_unlock_set:
1298 graph_unlock();
1299 raw_local_irq_restore(flags);
1300
1301 if (!subclass || force)
1302 lock->class_cache = class;
1303
1304 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
1305 return NULL;
1306
1307 return class;
1308}
1309 1458
1310/* 1459/*
1311 * Look up a dependency chain. If the key is not present yet then 1460 * Look up a dependency chain. If the key is not present yet then
@@ -1366,21 +1515,72 @@ cache_hit:
1366 chain->chain_key = chain_key; 1515 chain->chain_key = chain_key;
1367 list_add_tail_rcu(&chain->entry, hash_head); 1516 list_add_tail_rcu(&chain->entry, hash_head);
1368 debug_atomic_inc(&chain_lookup_misses); 1517 debug_atomic_inc(&chain_lookup_misses);
1369#ifdef CONFIG_TRACE_IRQFLAGS 1518 inc_chains();
1370 if (current->hardirq_context) 1519
1371 nr_hardirq_chains++; 1520 return 1;
1372 else { 1521}
1373 if (current->softirq_context) 1522
1374 nr_softirq_chains++; 1523static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1375 else 1524 struct held_lock *hlock, int chain_head)
1376 nr_process_chains++; 1525{
1377 } 1526 /*
1378#else 1527 * Trylock needs to maintain the stack of held locks, but it
1379 nr_process_chains++; 1528 * does not add new dependencies, because trylock can be done
1380#endif 1529 * in any order.
1530 *
1531 * We look up the chain_key and do the O(N^2) check and update of
1532 * the dependencies only if this is a new dependency chain.
1533 * (If lookup_chain_cache() returns with 1 it acquires
1534 * graph_lock for us)
1535 */
1536 if (!hlock->trylock && (hlock->check == 2) &&
1537 lookup_chain_cache(curr->curr_chain_key, hlock->class)) {
1538 /*
1539 * Check whether last held lock:
1540 *
1541 * - is irq-safe, if this lock is irq-unsafe
1542 * - is softirq-safe, if this lock is hardirq-unsafe
1543 *
1544 * And check whether the new lock's dependency graph
1545 * could lead back to the previous lock.
1546 *
1547 * any of these scenarios could lead to a deadlock. If
1548 * All validations
1549 */
1550 int ret = check_deadlock(curr, hlock, lock, hlock->read);
1551
1552 if (!ret)
1553 return 0;
1554 /*
1555 * Mark recursive read, as we jump over it when
1556 * building dependencies (just like we jump over
1557 * trylock entries):
1558 */
1559 if (ret == 2)
1560 hlock->read = 2;
1561 /*
1562 * Add dependency only if this lock is not the head
1563 * of the chain, and if it's not a secondary read-lock:
1564 */
1565 if (!chain_head && ret != 2)
1566 if (!check_prevs_add(curr, hlock))
1567 return 0;
1568 graph_unlock();
1569 } else
1570 /* after lookup_chain_cache(): */
1571 if (unlikely(!debug_locks))
1572 return 0;
1381 1573
1382 return 1; 1574 return 1;
1383} 1575}
1576#else
1577static inline int validate_chain(struct task_struct *curr,
1578 struct lockdep_map *lock, struct held_lock *hlock,
1579 int chain_head)
1580{
1581 return 1;
1582}
1583#endif
1384 1584
1385/* 1585/*
1386 * We are building curr_chain_key incrementally, so double-check 1586 * We are building curr_chain_key incrementally, so double-check
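
validate_chain() now carries the logic that skips the O(N^2) dependency checks whenever lookup_chain_cache() has already seen the current chain key, which is built incrementally from the ids of the held locks. The sketch below captures just that caching idea with a made-up mixing function and a flat table; it is not the kernel's iterate_chain_key() or chain hash.

/* Sketch of the chain-key caching idea behind validate_chain(): build a
 * key incrementally from the stack of held lock ids and only run the
 * expensive validation when the key has not been seen before. */
#include <stdio.h>
#include <stdint.h>

#define CACHE_SIZE 64

static uint64_t mix_chain_key(uint64_t key, uint64_t class_id)
{
	key ^= class_id + 0x9E3779B97F4A7C15ULL;	/* illustrative mixer */
	key *= 0xBF58476D1CE4E5B9ULL;
	return key;
}

static uint64_t chain_cache[CACHE_SIZE];

/* Returns 1 if the chain is new (validation needed), 0 if cached. */
static int lookup_chain_cache(uint64_t key)
{
	unsigned int slot = (unsigned int)(key % CACHE_SIZE);

	if (chain_cache[slot] == key)
		return 0;
	chain_cache[slot] = key;
	return 1;
}

int main(void)
{
	uint64_t held[] = { 3, 7, 42 };		/* fake lock class ids */
	uint64_t key = 0;
	unsigned int i;

	for (i = 0; i < sizeof(held) / sizeof(held[0]); i++)
		key = mix_chain_key(key, held[i]);

	printf("first lookup:  %s\n",
	       lookup_chain_cache(key) ? "new chain, validate" : "cached");
	printf("second lookup: %s\n",
	       lookup_chain_cache(key) ? "new chain, validate" : "cached");
	return 0;
}
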
@@ -1425,6 +1625,57 @@ static void check_chain_key(struct task_struct *curr)
1425#endif 1625#endif
1426} 1626}
1427 1627
1628static int
1629print_usage_bug(struct task_struct *curr, struct held_lock *this,
1630 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
1631{
1632 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1633 return 0;
1634
1635 printk("\n=================================\n");
1636 printk( "[ INFO: inconsistent lock state ]\n");
1637 print_kernel_version();
1638 printk( "---------------------------------\n");
1639
1640 printk("inconsistent {%s} -> {%s} usage.\n",
1641 usage_str[prev_bit], usage_str[new_bit]);
1642
1643 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
1644 curr->comm, curr->pid,
1645 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
1646 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
1647 trace_hardirqs_enabled(curr),
1648 trace_softirqs_enabled(curr));
1649 print_lock(this);
1650
1651 printk("{%s} state was registered at:\n", usage_str[prev_bit]);
1652 print_stack_trace(this->class->usage_traces + prev_bit, 1);
1653
1654 print_irqtrace_events(curr);
1655 printk("\nother info that might help us debug this:\n");
1656 lockdep_print_held_locks(curr);
1657
1658 printk("\nstack backtrace:\n");
1659 dump_stack();
1660
1661 return 0;
1662}
1663
1664/*
1665 * Print out an error if an invalid bit is set:
1666 */
1667static inline int
1668valid_state(struct task_struct *curr, struct held_lock *this,
1669 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
1670{
1671 if (unlikely(this->class->usage_mask & (1 << bad_bit)))
1672 return print_usage_bug(curr, this, bad_bit, new_bit);
1673 return 1;
1674}
1675
1676static int mark_lock(struct task_struct *curr, struct held_lock *this,
1677 enum lock_usage_bit new_bit);
1678
1428#ifdef CONFIG_TRACE_IRQFLAGS 1679#ifdef CONFIG_TRACE_IRQFLAGS
1429 1680
1430/* 1681/*
@@ -1518,90 +1769,30 @@ void print_irqtrace_events(struct task_struct *curr)
1518 print_ip_sym(curr->softirq_disable_ip); 1769 print_ip_sym(curr->softirq_disable_ip);
1519} 1770}
1520 1771
1521#endif 1772static int hardirq_verbose(struct lock_class *class)
1522
1523static int
1524print_usage_bug(struct task_struct *curr, struct held_lock *this,
1525 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
1526{ 1773{
1527 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1774#if HARDIRQ_VERBOSE
1528 return 0; 1775 return class_filter(class);
1529 1776#endif
1530 printk("\n=================================\n");
1531 printk( "[ INFO: inconsistent lock state ]\n");
1532 print_kernel_version();
1533 printk( "---------------------------------\n");
1534
1535 printk("inconsistent {%s} -> {%s} usage.\n",
1536 usage_str[prev_bit], usage_str[new_bit]);
1537
1538 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
1539 curr->comm, curr->pid,
1540 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
1541 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
1542 trace_hardirqs_enabled(curr),
1543 trace_softirqs_enabled(curr));
1544 print_lock(this);
1545
1546 printk("{%s} state was registered at:\n", usage_str[prev_bit]);
1547 print_stack_trace(this->class->usage_traces + prev_bit, 1);
1548
1549 print_irqtrace_events(curr);
1550 printk("\nother info that might help us debug this:\n");
1551 lockdep_print_held_locks(curr);
1552
1553 printk("\nstack backtrace:\n");
1554 dump_stack();
1555
1556 return 0; 1777 return 0;
1557} 1778}
1558 1779
1559/* 1780static int softirq_verbose(struct lock_class *class)
1560 * Print out an error if an invalid bit is set:
1561 */
1562static inline int
1563valid_state(struct task_struct *curr, struct held_lock *this,
1564 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
1565{ 1781{
1566 if (unlikely(this->class->usage_mask & (1 << bad_bit))) 1782#if SOFTIRQ_VERBOSE
1567 return print_usage_bug(curr, this, bad_bit, new_bit); 1783 return class_filter(class);
1568 return 1; 1784#endif
1785 return 0;
1569} 1786}
1570 1787
1571#define STRICT_READ_CHECKS 1 1788#define STRICT_READ_CHECKS 1
1572 1789
1573/* 1790static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1574 * Mark a lock with a usage bit, and validate the state transition: 1791 enum lock_usage_bit new_bit)
1575 */
1576static int mark_lock(struct task_struct *curr, struct held_lock *this,
1577 enum lock_usage_bit new_bit)
1578{ 1792{
1579 unsigned int new_mask = 1 << new_bit, ret = 1; 1793 int ret = 1;
1580
1581 /*
1582 * If already set then do not dirty the cacheline,
1583 * nor do any checks:
1584 */
1585 if (likely(this->class->usage_mask & new_mask))
1586 return 1;
1587
1588 if (!graph_lock())
1589 return 0;
1590 /*
1591 * Make sure we didnt race:
1592 */
1593 if (unlikely(this->class->usage_mask & new_mask)) {
1594 graph_unlock();
1595 return 1;
1596 }
1597
1598 this->class->usage_mask |= new_mask;
1599 1794
1600 if (!save_trace(this->class->usage_traces + new_bit)) 1795 switch(new_bit) {
1601 return 0;
1602
1603 switch (new_bit) {
1604#ifdef CONFIG_TRACE_IRQFLAGS
1605 case LOCK_USED_IN_HARDIRQ: 1796 case LOCK_USED_IN_HARDIRQ:
1606 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) 1797 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
1607 return 0; 1798 return 0;
@@ -1760,37 +1951,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1760 if (softirq_verbose(this->class)) 1951 if (softirq_verbose(this->class))
1761 ret = 2; 1952 ret = 2;
1762 break; 1953 break;
1763#endif
1764 case LOCK_USED:
1765 /*
1766 * Add it to the global list of classes:
1767 */
1768 list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
1769 debug_atomic_dec(&nr_unused_locks);
1770 break;
1771 default: 1954 default:
1772 if (!debug_locks_off_graph_unlock())
1773 return 0;
1774 WARN_ON(1); 1955 WARN_ON(1);
1775 return 0; 1956 break;
1776 }
1777
1778 graph_unlock();
1779
1780 /*
1781 * We must printk outside of the graph_lock:
1782 */
1783 if (ret == 2) {
1784 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
1785 print_lock(this);
1786 print_irqtrace_events(curr);
1787 dump_stack();
1788 } 1957 }
1789 1958
1790 return ret; 1959 return ret;
1791} 1960}
1792 1961
1793#ifdef CONFIG_TRACE_IRQFLAGS
1794/* 1962/*
1795 * Mark all held locks with a usage bit: 1963 * Mark all held locks with a usage bit:
1796 */ 1964 */
@@ -1973,9 +2141,176 @@ void trace_softirqs_off(unsigned long ip)
1973 debug_atomic_inc(&redundant_softirqs_off); 2141 debug_atomic_inc(&redundant_softirqs_off);
1974} 2142}
1975 2143
2144static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2145{
2146 /*
2147 * If non-trylock use in a hardirq or softirq context, then
2148 * mark the lock as used in these contexts:
2149 */
2150 if (!hlock->trylock) {
2151 if (hlock->read) {
2152 if (curr->hardirq_context)
2153 if (!mark_lock(curr, hlock,
2154 LOCK_USED_IN_HARDIRQ_READ))
2155 return 0;
2156 if (curr->softirq_context)
2157 if (!mark_lock(curr, hlock,
2158 LOCK_USED_IN_SOFTIRQ_READ))
2159 return 0;
2160 } else {
2161 if (curr->hardirq_context)
2162 if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
2163 return 0;
2164 if (curr->softirq_context)
2165 if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
2166 return 0;
2167 }
2168 }
2169 if (!hlock->hardirqs_off) {
2170 if (hlock->read) {
2171 if (!mark_lock(curr, hlock,
2172 LOCK_ENABLED_HARDIRQS_READ))
2173 return 0;
2174 if (curr->softirqs_enabled)
2175 if (!mark_lock(curr, hlock,
2176 LOCK_ENABLED_SOFTIRQS_READ))
2177 return 0;
2178 } else {
2179 if (!mark_lock(curr, hlock,
2180 LOCK_ENABLED_HARDIRQS))
2181 return 0;
2182 if (curr->softirqs_enabled)
2183 if (!mark_lock(curr, hlock,
2184 LOCK_ENABLED_SOFTIRQS))
2185 return 0;
2186 }
2187 }
2188
2189 return 1;
2190}
2191
2192static int separate_irq_context(struct task_struct *curr,
2193 struct held_lock *hlock)
2194{
2195 unsigned int depth = curr->lockdep_depth;
2196
2197 /*
2198 * Keep track of points where we cross into an interrupt context:
2199 */
2200 hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
2201 curr->softirq_context;
2202 if (depth) {
2203 struct held_lock *prev_hlock;
2204
2205 prev_hlock = curr->held_locks + depth-1;
2206 /*
2207 * If we cross into another context, reset the
2208 * hash key (this also prevents the checking and the
2209 * adding of the dependency to 'prev'):
2210 */
2211 if (prev_hlock->irq_context != hlock->irq_context)
2212 return 1;
2213 }
2214 return 0;
2215}
2216
2217#else
2218
2219static inline
2220int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
2221 enum lock_usage_bit new_bit)
2222{
2223 WARN_ON(1);
2224 return 1;
2225}
2226
2227static inline int mark_irqflags(struct task_struct *curr,
2228 struct held_lock *hlock)
2229{
2230 return 1;
2231}
2232
2233static inline int separate_irq_context(struct task_struct *curr,
2234 struct held_lock *hlock)
2235{
2236 return 0;
2237}
2238
1976#endif 2239#endif
1977 2240
1978/* 2241/*
2242 * Mark a lock with a usage bit, and validate the state transition:
2243 */
2244static int mark_lock(struct task_struct *curr, struct held_lock *this,
2245 enum lock_usage_bit new_bit)
2246{
2247 unsigned int new_mask = 1 << new_bit, ret = 1;
2248
2249 /*
2250 * If already set then do not dirty the cacheline,
2251 * nor do any checks:
2252 */
2253 if (likely(this->class->usage_mask & new_mask))
2254 return 1;
2255
2256 if (!graph_lock())
2257 return 0;
2258 /*
2259 * Make sure we didnt race:
2260 */
2261 if (unlikely(this->class->usage_mask & new_mask)) {
2262 graph_unlock();
2263 return 1;
2264 }
2265
2266 this->class->usage_mask |= new_mask;
2267
2268 if (!save_trace(this->class->usage_traces + new_bit))
2269 return 0;
2270
2271 switch (new_bit) {
2272 case LOCK_USED_IN_HARDIRQ:
2273 case LOCK_USED_IN_SOFTIRQ:
2274 case LOCK_USED_IN_HARDIRQ_READ:
2275 case LOCK_USED_IN_SOFTIRQ_READ:
2276 case LOCK_ENABLED_HARDIRQS:
2277 case LOCK_ENABLED_SOFTIRQS:
2278 case LOCK_ENABLED_HARDIRQS_READ:
2279 case LOCK_ENABLED_SOFTIRQS_READ:
2280 ret = mark_lock_irq(curr, this, new_bit);
2281 if (!ret)
2282 return 0;
2283 break;
2284 case LOCK_USED:
2285 /*
2286 * Add it to the global list of classes:
2287 */
2288 list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
2289 debug_atomic_dec(&nr_unused_locks);
2290 break;
2291 default:
2292 if (!debug_locks_off_graph_unlock())
2293 return 0;
2294 WARN_ON(1);
2295 return 0;
2296 }
2297
2298 graph_unlock();
2299
2300 /*
2301 * We must printk outside of the graph_lock:
2302 */
2303 if (ret == 2) {
2304 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
2305 print_lock(this);
2306 print_irqtrace_events(curr);
2307 dump_stack();
2308 }
2309
2310 return ret;
2311}
2312
2313/*
1979 * Initialize a lock instance's lock-class mapping info: 2314 * Initialize a lock instance's lock-class mapping info:
1980 */ 2315 */
1981void lockdep_init_map(struct lockdep_map *lock, const char *name, 2316void lockdep_init_map(struct lockdep_map *lock, const char *name,
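
mark_lock() keeps its shape after the split: return immediately if the usage bit is already set, otherwise record it and validate the transition (now delegated to mark_lock_irq() and valid_state()). The sketch below reduces that to a single conflicting-bit check so the control flow is visible; the real validator consults a whole table of incompatible states.

/* Sketch of mark_lock()'s shape: skip if the usage bit is already set,
 * otherwise record it and validate it against one conflicting bit. */
#include <stdio.h>

enum usage_bit {
	LOCK_USED_IN_HARDIRQ,
	LOCK_ENABLED_HARDIRQS,
};

struct lock_class {
	const char *name;
	unsigned int usage_mask;
};

static int mark_lock(struct lock_class *class, enum usage_bit bit,
		     enum usage_bit conflicting)
{
	unsigned int mask = 1u << bit;

	if (class->usage_mask & mask)		/* already recorded: fast path */
		return 1;
	class->usage_mask |= mask;

	if (class->usage_mask & (1u << conflicting)) {
		printf("inconsistent usage on %s\n", class->name);
		return 0;			/* would be print_usage_bug() */
	}
	return 1;
}

int main(void)
{
	struct lock_class c = { "demo_lock", 0 };

	mark_lock(&c, LOCK_ENABLED_HARDIRQS, LOCK_USED_IN_HARDIRQ);
	/* Claiming hardirq use now conflicts with the earlier state: */
	mark_lock(&c, LOCK_USED_IN_HARDIRQ, LOCK_ENABLED_HARDIRQS);
	return 0;
}
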
@@ -1999,6 +2334,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
1999 lock->name = name; 2334 lock->name = name;
2000 lock->key = key; 2335 lock->key = key;
2001 lock->class_cache = NULL; 2336 lock->class_cache = NULL;
2337#ifdef CONFIG_LOCK_STAT
2338 lock->cpu = raw_smp_processor_id();
2339#endif
2002 if (subclass) 2340 if (subclass)
2003 register_lock_class(lock, subclass, 1); 2341 register_lock_class(lock, subclass, 1);
2004} 2342}
@@ -2020,6 +2358,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2020 int chain_head = 0; 2358 int chain_head = 0;
2021 u64 chain_key; 2359 u64 chain_key;
2022 2360
2361 if (!prove_locking)
2362 check = 1;
2363
2023 if (unlikely(!debug_locks)) 2364 if (unlikely(!debug_locks))
2024 return 0; 2365 return 0;
2025 2366
@@ -2070,57 +2411,18 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2070 hlock->read = read; 2411 hlock->read = read;
2071 hlock->check = check; 2412 hlock->check = check;
2072 hlock->hardirqs_off = hardirqs_off; 2413 hlock->hardirqs_off = hardirqs_off;
2073 2414#ifdef CONFIG_LOCK_STAT
2074 if (check != 2) 2415 hlock->waittime_stamp = 0;
2075 goto out_calc_hash; 2416 hlock->holdtime_stamp = sched_clock();
2076#ifdef CONFIG_TRACE_IRQFLAGS
2077 /*
2078 * If non-trylock use in a hardirq or softirq context, then
2079 * mark the lock as used in these contexts:
2080 */
2081 if (!trylock) {
2082 if (read) {
2083 if (curr->hardirq_context)
2084 if (!mark_lock(curr, hlock,
2085 LOCK_USED_IN_HARDIRQ_READ))
2086 return 0;
2087 if (curr->softirq_context)
2088 if (!mark_lock(curr, hlock,
2089 LOCK_USED_IN_SOFTIRQ_READ))
2090 return 0;
2091 } else {
2092 if (curr->hardirq_context)
2093 if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
2094 return 0;
2095 if (curr->softirq_context)
2096 if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
2097 return 0;
2098 }
2099 }
2100 if (!hardirqs_off) {
2101 if (read) {
2102 if (!mark_lock(curr, hlock,
2103 LOCK_ENABLED_HARDIRQS_READ))
2104 return 0;
2105 if (curr->softirqs_enabled)
2106 if (!mark_lock(curr, hlock,
2107 LOCK_ENABLED_SOFTIRQS_READ))
2108 return 0;
2109 } else {
2110 if (!mark_lock(curr, hlock,
2111 LOCK_ENABLED_HARDIRQS))
2112 return 0;
2113 if (curr->softirqs_enabled)
2114 if (!mark_lock(curr, hlock,
2115 LOCK_ENABLED_SOFTIRQS))
2116 return 0;
2117 }
2118 }
2119#endif 2417#endif
2418
2419 if (check == 2 && !mark_irqflags(curr, hlock))
2420 return 0;
2421
2120 /* mark it as used: */ 2422 /* mark it as used: */
2121 if (!mark_lock(curr, hlock, LOCK_USED)) 2423 if (!mark_lock(curr, hlock, LOCK_USED))
2122 return 0; 2424 return 0;
2123out_calc_hash: 2425
2124 /* 2426 /*
2125 * Calculate the chain hash: it's the combined has of all the 2427 * Calculate the chain hash: it's the combined has of all the
2126 * lock keys along the dependency chain. We save the hash value 2428 * lock keys along the dependency chain. We save the hash value
@@ -2143,77 +2445,15 @@ out_calc_hash:
2143 } 2445 }
2144 2446
2145 hlock->prev_chain_key = chain_key; 2447 hlock->prev_chain_key = chain_key;
2146 2448 if (separate_irq_context(curr, hlock)) {
2147#ifdef CONFIG_TRACE_IRQFLAGS 2449 chain_key = 0;
2148 /* 2450 chain_head = 1;
2149 * Keep track of points where we cross into an interrupt context:
2150 */
2151 hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
2152 curr->softirq_context;
2153 if (depth) {
2154 struct held_lock *prev_hlock;
2155
2156 prev_hlock = curr->held_locks + depth-1;
2157 /*
2158 * If we cross into another context, reset the
2159 * hash key (this also prevents the checking and the
2160 * adding of the dependency to 'prev'):
2161 */
2162 if (prev_hlock->irq_context != hlock->irq_context) {
2163 chain_key = 0;
2164 chain_head = 1;
2165 }
2166 } 2451 }
2167#endif
2168 chain_key = iterate_chain_key(chain_key, id); 2452 chain_key = iterate_chain_key(chain_key, id);
2169 curr->curr_chain_key = chain_key; 2453 curr->curr_chain_key = chain_key;
2170 2454
2171 /* 2455 if (!validate_chain(curr, lock, hlock, chain_head))
2172 * Trylock needs to maintain the stack of held locks, but it 2456 return 0;
2173 * does not add new dependencies, because trylock can be done
2174 * in any order.
2175 *
2176 * We look up the chain_key and do the O(N^2) check and update of
2177 * the dependencies only if this is a new dependency chain.
2178 * (If lookup_chain_cache() returns with 1 it acquires
2179 * graph_lock for us)
2180 */
2181 if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) {
2182 /*
2183 * Check whether last held lock:
2184 *
2185 * - is irq-safe, if this lock is irq-unsafe
2186 * - is softirq-safe, if this lock is hardirq-unsafe
2187 *
2188 * And check whether the new lock's dependency graph
2189 * could lead back to the previous lock.
2190 *
2191 * any of these scenarios could lead to a deadlock. If
2192 * All validations
2193 */
2194 int ret = check_deadlock(curr, hlock, lock, read);
2195
2196 if (!ret)
2197 return 0;
2198 /*
2199 * Mark recursive read, as we jump over it when
2200 * building dependencies (just like we jump over
2201 * trylock entries):
2202 */
2203 if (ret == 2)
2204 hlock->read = 2;
2205 /*
2206 * Add dependency only if this lock is not the head
2207 * of the chain, and if it's not a secondary read-lock:
2208 */
2209 if (!chain_head && ret != 2)
2210 if (!check_prevs_add(curr, hlock))
2211 return 0;
2212 graph_unlock();
2213 } else
2214 /* after lookup_chain_cache(): */
2215 if (unlikely(!debug_locks))
2216 return 0;
2217 2457
2218 curr->lockdep_depth++; 2458 curr->lockdep_depth++;
2219 check_chain_key(curr); 2459 check_chain_key(curr);
@@ -2315,6 +2555,8 @@ lock_release_non_nested(struct task_struct *curr,
2315 return print_unlock_inbalance_bug(curr, lock, ip); 2555 return print_unlock_inbalance_bug(curr, lock, ip);
2316 2556
2317found_it: 2557found_it:
2558 lock_release_holdtime(hlock);
2559
2318 /* 2560 /*
2319 * We have the right lock to unlock, 'hlock' points to it. 2561 * We have the right lock to unlock, 'hlock' points to it.
2320 * Now we remove it from the stack, and add back the other 2562 * Now we remove it from the stack, and add back the other
@@ -2367,6 +2609,8 @@ static int lock_release_nested(struct task_struct *curr,
2367 2609
2368 curr->curr_chain_key = hlock->prev_chain_key; 2610 curr->curr_chain_key = hlock->prev_chain_key;
2369 2611
2612 lock_release_holdtime(hlock);
2613
2370#ifdef CONFIG_DEBUG_LOCKDEP 2614#ifdef CONFIG_DEBUG_LOCKDEP
2371 hlock->prev_chain_key = 0; 2615 hlock->prev_chain_key = 0;
2372 hlock->class = NULL; 2616 hlock->class = NULL;
@@ -2441,6 +2685,9 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2441{ 2685{
2442 unsigned long flags; 2686 unsigned long flags;
2443 2687
2688 if (unlikely(!lock_stat && !prove_locking))
2689 return;
2690
2444 if (unlikely(current->lockdep_recursion)) 2691 if (unlikely(current->lockdep_recursion))
2445 return; 2692 return;
2446 2693
@@ -2460,6 +2707,9 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2460{ 2707{
2461 unsigned long flags; 2708 unsigned long flags;
2462 2709
2710 if (unlikely(!lock_stat && !prove_locking))
2711 return;
2712
2463 if (unlikely(current->lockdep_recursion)) 2713 if (unlikely(current->lockdep_recursion))
2464 return; 2714 return;
2465 2715
@@ -2473,6 +2723,166 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2473 2723
2474EXPORT_SYMBOL_GPL(lock_release); 2724EXPORT_SYMBOL_GPL(lock_release);
2475 2725
2726#ifdef CONFIG_LOCK_STAT
2727static int
2728print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
2729 unsigned long ip)
2730{
2731 if (!debug_locks_off())
2732 return 0;
2733 if (debug_locks_silent)
2734 return 0;
2735
2736 printk("\n=================================\n");
2737 printk( "[ BUG: bad contention detected! ]\n");
2738 printk( "---------------------------------\n");
2739 printk("%s/%d is trying to contend lock (",
2740 curr->comm, curr->pid);
2741 print_lockdep_cache(lock);
2742 printk(") at:\n");
2743 print_ip_sym(ip);
2744 printk("but there are no locks held!\n");
2745 printk("\nother info that might help us debug this:\n");
2746 lockdep_print_held_locks(curr);
2747
2748 printk("\nstack backtrace:\n");
2749 dump_stack();
2750
2751 return 0;
2752}
2753
2754static void
2755__lock_contended(struct lockdep_map *lock, unsigned long ip)
2756{
2757 struct task_struct *curr = current;
2758 struct held_lock *hlock, *prev_hlock;
2759 struct lock_class_stats *stats;
2760 unsigned int depth;
2761 int i, point;
2762
2763 depth = curr->lockdep_depth;
2764 if (DEBUG_LOCKS_WARN_ON(!depth))
2765 return;
2766
2767 prev_hlock = NULL;
2768 for (i = depth-1; i >= 0; i--) {
2769 hlock = curr->held_locks + i;
2770 /*
2771 * We must not cross into another context:
2772 */
2773 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2774 break;
2775 if (hlock->instance == lock)
2776 goto found_it;
2777 prev_hlock = hlock;
2778 }
2779 print_lock_contention_bug(curr, lock, ip);
2780 return;
2781
2782found_it:
2783 hlock->waittime_stamp = sched_clock();
2784
2785 point = lock_contention_point(hlock->class, ip);
2786
2787 stats = get_lock_stats(hlock->class);
2788 if (point < ARRAY_SIZE(stats->contention_point))
2789 stats->contention_point[i]++;
2790 if (lock->cpu != smp_processor_id())
2791 stats->bounces[bounce_contended + !!hlock->read]++;
2792 put_lock_stats(stats);
2793}
2794
2795static void
2796__lock_acquired(struct lockdep_map *lock)
2797{
2798 struct task_struct *curr = current;
2799 struct held_lock *hlock, *prev_hlock;
2800 struct lock_class_stats *stats;
2801 unsigned int depth;
2802 u64 now;
2803 s64 waittime = 0;
2804 int i, cpu;
2805
2806 depth = curr->lockdep_depth;
2807 if (DEBUG_LOCKS_WARN_ON(!depth))
2808 return;
2809
2810 prev_hlock = NULL;
2811 for (i = depth-1; i >= 0; i--) {
2812 hlock = curr->held_locks + i;
2813 /*
2814 * We must not cross into another context:
2815 */
2816 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2817 break;
2818 if (hlock->instance == lock)
2819 goto found_it;
2820 prev_hlock = hlock;
2821 }
2822 print_lock_contention_bug(curr, lock, _RET_IP_);
2823 return;
2824
2825found_it:
2826 cpu = smp_processor_id();
2827 if (hlock->waittime_stamp) {
2828 now = sched_clock();
2829 waittime = now - hlock->waittime_stamp;
2830 hlock->holdtime_stamp = now;
2831 }
2832
2833 stats = get_lock_stats(hlock->class);
2834 if (waittime) {
2835 if (hlock->read)
2836 lock_time_inc(&stats->read_waittime, waittime);
2837 else
2838 lock_time_inc(&stats->write_waittime, waittime);
2839 }
2840 if (lock->cpu != cpu)
2841 stats->bounces[bounce_acquired + !!hlock->read]++;
2842 put_lock_stats(stats);
2843
2844 lock->cpu = cpu;
2845}
2846
2847void lock_contended(struct lockdep_map *lock, unsigned long ip)
2848{
2849 unsigned long flags;
2850
2851 if (unlikely(!lock_stat))
2852 return;
2853
2854 if (unlikely(current->lockdep_recursion))
2855 return;
2856
2857 raw_local_irq_save(flags);
2858 check_flags(flags);
2859 current->lockdep_recursion = 1;
2860 __lock_contended(lock, ip);
2861 current->lockdep_recursion = 0;
2862 raw_local_irq_restore(flags);
2863}
2864EXPORT_SYMBOL_GPL(lock_contended);
2865
2866void lock_acquired(struct lockdep_map *lock)
2867{
2868 unsigned long flags;
2869
2870 if (unlikely(!lock_stat))
2871 return;
2872
2873 if (unlikely(current->lockdep_recursion))
2874 return;
2875
2876 raw_local_irq_save(flags);
2877 check_flags(flags);
2878 current->lockdep_recursion = 1;
2879 __lock_acquired(lock);
2880 current->lockdep_recursion = 0;
2881 raw_local_irq_restore(flags);
2882}
2883EXPORT_SYMBOL_GPL(lock_acquired);
2884#endif
2885
2476/* 2886/*
2477 * Used by the testsuite, sanitize the validator state 2887 * Used by the testsuite, sanitize the validator state
2478 * after a simulated failure: 2888 * after a simulated failure:
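
The new lock_contended()/lock_acquired() hooks stamp the moment a task starts waiting and convert it into a wait time once the lock is obtained, at which point the hold-time clock restarts; lock_release_holdtime() closes that interval on release. The pthread sketch below does the same bookkeeping in userspace, timing every acquisition rather than only the contended slow path; the wrapper type and CLOCK_MONOTONIC are illustrative, the kernel uses sched_clock() and per-class per-CPU counters.

/* Userspace sketch of the wait-time / hold-time bookkeeping that
 * lock_contended()/lock_acquired()/lock_release_holdtime() add. */
#include <stdio.h>
#include <pthread.h>
#include <time.h>
#include <stdint.h>

struct stat_mutex {
	pthread_mutex_t lock;
	int64_t wait_ns, hold_ns;	/* accumulated statistics */
	int64_t hold_start;
};

static int64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

static void stat_mutex_lock(struct stat_mutex *m)
{
	int64_t t0 = now_ns();		/* like lock_contended(): start waiting */

	pthread_mutex_lock(&m->lock);
	m->hold_start = now_ns();	/* like lock_acquired(): hold begins */
	m->wait_ns += m->hold_start - t0;
}

static void stat_mutex_unlock(struct stat_mutex *m)
{
	m->hold_ns += now_ns() - m->hold_start;	/* like lock_release_holdtime() */
	pthread_mutex_unlock(&m->lock);
}

int main(void)
{
	struct stat_mutex m = { .lock = PTHREAD_MUTEX_INITIALIZER };

	stat_mutex_lock(&m);
	stat_mutex_unlock(&m);
	printf("waited %lld ns, held %lld ns\n",
	       (long long)m.wait_ns, (long long)m.hold_ns);
	return 0;
}

Build with -pthread when trying it out.
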
@@ -2636,8 +3046,11 @@ void __init lockdep_info(void)
2636 sizeof(struct held_lock) * MAX_LOCK_DEPTH); 3046 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
2637 3047
2638#ifdef CONFIG_DEBUG_LOCKDEP 3048#ifdef CONFIG_DEBUG_LOCKDEP
2639 if (lockdep_init_error) 3049 if (lockdep_init_error) {
2640 printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n"); 3050 printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n");
3051 printk("Call stack leading to lockdep invocation was:\n");
3052 print_stack_trace(&lockdep_init_trace, 0);
3053 }
2641#endif 3054#endif
2642} 3055}
2643 3056
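The lock_stat hooks added above follow a simple stamp-and-delta protocol: __lock_contended() stores sched_clock() in hlock->waittime_stamp when a task starts waiting, and __lock_acquired() later turns that stamp into a wait-time sample and counts a bounce whenever the acquiring CPU differs from the CPU recorded in lock->cpu. The following is a hedged userspace sketch of that bookkeeping, not kernel code; clock_gettime() stands in for sched_clock() and a plain struct stands in for lock_class_stats.

/*
 * Userspace sketch of the lock_stat stamp/delta protocol used by
 * __lock_contended()/__lock_acquired() above.  Illustrative only.
 */
#include <stdio.h>
#include <time.h>

struct lock_time {			/* loosely mirrors struct lock_time */
	unsigned long nr;
	long long min, max, total;
};

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void lock_time_inc(struct lock_time *lt, long long t)
{
	if (!lt->nr || t < lt->min)
		lt->min = t;
	if (t > lt->max)
		lt->max = t;
	lt->total += t;
	lt->nr++;
}

int main(void)
{
	struct lock_time waittime = { 0 };
	long long waittime_stamp;

	/* "contended": remember when we started waiting */
	waittime_stamp = now_ns();

	/* ... the current lock owner runs for a while ... */
	struct timespec d = { 0, 2000000 };	/* 2 ms */
	nanosleep(&d, NULL);

	/* "acquired": fold the wait into the per-class statistics */
	lock_time_inc(&waittime, now_ns() - waittime_stamp);

	printf("samples=%lu min=%lldns max=%lldns total=%lldns\n",
	       waittime.nr, waittime.min, waittime.max, waittime.total);
	return 0;
}

In the kernel the same accounting is additionally guarded against recursion and interrupts, which is what the lock_contended()/lock_acquired() wrappers above take care of.
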
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 58f35e586ee3..c851b2dcc685 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -5,7 +5,8 @@
5 * 5 *
6 * Started by Ingo Molnar: 6 * Started by Ingo Molnar:
7 * 7 *
8 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 8 * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * 10 *
10 * Code for /proc/lockdep and /proc/lockdep_stats: 11 * Code for /proc/lockdep and /proc/lockdep_stats:
11 * 12 *
@@ -15,6 +16,10 @@
15#include <linux/seq_file.h> 16#include <linux/seq_file.h>
16#include <linux/kallsyms.h> 17#include <linux/kallsyms.h>
17#include <linux/debug_locks.h> 18#include <linux/debug_locks.h>
19#include <linux/vmalloc.h>
20#include <linux/sort.h>
21#include <asm/uaccess.h>
22#include <asm/div64.h>
18 23
19#include "lockdep_internals.h" 24#include "lockdep_internals.h"
20 25
@@ -271,8 +276,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
271 if (nr_list_entries) 276 if (nr_list_entries)
272 factor = sum_forward_deps / nr_list_entries; 277 factor = sum_forward_deps / nr_list_entries;
273 278
279#ifdef CONFIG_PROVE_LOCKING
274 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 280 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
275 nr_lock_chains, MAX_LOCKDEP_CHAINS); 281 nr_lock_chains, MAX_LOCKDEP_CHAINS);
282#endif
276 283
277#ifdef CONFIG_TRACE_IRQFLAGS 284#ifdef CONFIG_TRACE_IRQFLAGS
278 seq_printf(m, " in-hardirq chains: %11u\n", 285 seq_printf(m, " in-hardirq chains: %11u\n",
@@ -339,9 +346,295 @@ static const struct file_operations proc_lockdep_stats_operations = {
339 .open = lockdep_stats_open, 346 .open = lockdep_stats_open,
340 .read = seq_read, 347 .read = seq_read,
341 .llseek = seq_lseek, 348 .llseek = seq_lseek,
342 .release = seq_release, 349 .release = single_release,
350};
351
352#ifdef CONFIG_LOCK_STAT
353
354struct lock_stat_data {
355 struct lock_class *class;
356 struct lock_class_stats stats;
357};
358
359struct lock_stat_seq {
360 struct lock_stat_data *iter;
361 struct lock_stat_data *iter_end;
362 struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
343}; 363};
344 364
365/*
366 * sort on absolute number of contentions
367 */
368static int lock_stat_cmp(const void *l, const void *r)
369{
370 const struct lock_stat_data *dl = l, *dr = r;
371 unsigned long nl, nr;
372
373 nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr;
374 nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr;
375
376 return nr - nl;
377}
378
379static void seq_line(struct seq_file *m, char c, int offset, int length)
380{
381 int i;
382
383 for (i = 0; i < offset; i++)
384 seq_puts(m, " ");
385 for (i = 0; i < length; i++)
386 seq_printf(m, "%c", c);
387 seq_puts(m, "\n");
388}
389
390static void snprint_time(char *buf, size_t bufsiz, s64 nr)
391{
392 unsigned long rem;
393
394 rem = do_div(nr, 1000); /* XXX: do_div_signed */
395 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10);
396}
397
398static void seq_time(struct seq_file *m, s64 time)
399{
400 char num[15];
401
402 snprint_time(num, sizeof(num), time);
403 seq_printf(m, " %14s", num);
404}
405
406static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
407{
408 seq_printf(m, "%14lu", lt->nr);
409 seq_time(m, lt->min);
410 seq_time(m, lt->max);
411 seq_time(m, lt->total);
412}
413
414static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
415{
416 char name[39];
417 struct lock_class *class;
418 struct lock_class_stats *stats;
419 int i, namelen;
420
421 class = data->class;
422 stats = &data->stats;
423
424 namelen = 38;
425 if (class->name_version > 1)
426 namelen -= 2; /* XXX truncates versions > 9 */
427 if (class->subclass)
428 namelen -= 2;
429
430 if (!class->name) {
431 char str[KSYM_NAME_LEN];
432 const char *key_name;
433
434 key_name = __get_key_name(class->key, str);
435 snprintf(name, namelen, "%s", key_name);
436 } else {
437 snprintf(name, namelen, "%s", class->name);
438 }
439 namelen = strlen(name);
440 if (class->name_version > 1) {
441 snprintf(name+namelen, 3, "#%d", class->name_version);
442 namelen += 2;
443 }
444 if (class->subclass) {
445 snprintf(name+namelen, 3, "/%d", class->subclass);
446 namelen += 2;
447 }
448
449 if (stats->write_holdtime.nr) {
450 if (stats->read_holdtime.nr)
451 seq_printf(m, "%38s-W:", name);
452 else
453 seq_printf(m, "%40s:", name);
454
455 seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]);
456 seq_lock_time(m, &stats->write_waittime);
457 seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]);
458 seq_lock_time(m, &stats->write_holdtime);
459 seq_puts(m, "\n");
460 }
461
462 if (stats->read_holdtime.nr) {
463 seq_printf(m, "%38s-R:", name);
464 seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]);
465 seq_lock_time(m, &stats->read_waittime);
466 seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]);
467 seq_lock_time(m, &stats->read_holdtime);
468 seq_puts(m, "\n");
469 }
470
471 if (stats->read_waittime.nr + stats->write_waittime.nr == 0)
472 return;
473
474 if (stats->read_holdtime.nr)
475 namelen += 2;
476
477 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
478 char sym[KSYM_SYMBOL_LEN];
479 char ip[32];
480
481 if (class->contention_point[i] == 0)
482 break;
483
484 if (!i)
485 seq_line(m, '-', 40-namelen, namelen);
486
487 sprint_symbol(sym, class->contention_point[i]);
488 snprintf(ip, sizeof(ip), "[<%p>]",
489 (void *)class->contention_point[i]);
490 seq_printf(m, "%40s %14lu %29s %s\n", name,
491 stats->contention_point[i],
492 ip, sym);
493 }
494 if (i) {
495 seq_puts(m, "\n");
496 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
497 seq_puts(m, "\n");
498 }
499}
500
501static void seq_header(struct seq_file *m)
502{
503 seq_printf(m, "lock_stat version 0.2\n");
504 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
505 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
506 "%14s %14s\n",
507 "class name",
508 "con-bounces",
509 "contentions",
510 "waittime-min",
511 "waittime-max",
512 "waittime-total",
513 "acq-bounces",
514 "acquisitions",
515 "holdtime-min",
516 "holdtime-max",
517 "holdtime-total");
518 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
519 seq_printf(m, "\n");
520}
521
522static void *ls_start(struct seq_file *m, loff_t *pos)
523{
524 struct lock_stat_seq *data = m->private;
525
526 if (data->iter == data->stats)
527 seq_header(m);
528
529 if (data->iter == data->iter_end)
530 data->iter = NULL;
531
532 return data->iter;
533}
534
535static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
536{
537 struct lock_stat_seq *data = m->private;
538
539 (*pos)++;
540
541 data->iter = v;
542 data->iter++;
543 if (data->iter == data->iter_end)
544 data->iter = NULL;
545
546 return data->iter;
547}
548
549static void ls_stop(struct seq_file *m, void *v)
550{
551}
552
553static int ls_show(struct seq_file *m, void *v)
554{
555 struct lock_stat_seq *data = m->private;
556
557 seq_stats(m, data->iter);
558 return 0;
559}
560
561static struct seq_operations lockstat_ops = {
562 .start = ls_start,
563 .next = ls_next,
564 .stop = ls_stop,
565 .show = ls_show,
566};
567
568static int lock_stat_open(struct inode *inode, struct file *file)
569{
570 int res;
571 struct lock_class *class;
572 struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq));
573
574 if (!data)
575 return -ENOMEM;
576
577 res = seq_open(file, &lockstat_ops);
578 if (!res) {
579 struct lock_stat_data *iter = data->stats;
580 struct seq_file *m = file->private_data;
581
582 data->iter = iter;
583 list_for_each_entry(class, &all_lock_classes, lock_entry) {
584 iter->class = class;
585 iter->stats = lock_stats(class);
586 iter++;
587 }
588 data->iter_end = iter;
589
590 sort(data->stats, data->iter_end - data->iter,
591 sizeof(struct lock_stat_data),
592 lock_stat_cmp, NULL);
593
594 m->private = data;
595 } else
596 vfree(data);
597
598 return res;
599}
600
601static ssize_t lock_stat_write(struct file *file, const char __user *buf,
602 size_t count, loff_t *ppos)
603{
604 struct lock_class *class;
605 char c;
606
607 if (count) {
608 if (get_user(c, buf))
609 return -EFAULT;
610
611 if (c != '0')
612 return count;
613
614 list_for_each_entry(class, &all_lock_classes, lock_entry)
615 clear_lock_stats(class);
616 }
617 return count;
618}
619
620static int lock_stat_release(struct inode *inode, struct file *file)
621{
622 struct seq_file *seq = file->private_data;
623
624 vfree(seq->private);
625 seq->private = NULL;
626 return seq_release(inode, file);
627}
628
629static const struct file_operations proc_lock_stat_operations = {
630 .open = lock_stat_open,
631 .write = lock_stat_write,
632 .read = seq_read,
633 .llseek = seq_lseek,
634 .release = lock_stat_release,
635};
636#endif /* CONFIG_LOCK_STAT */
637
345static int __init lockdep_proc_init(void) 638static int __init lockdep_proc_init(void)
346{ 639{
347 struct proc_dir_entry *entry; 640 struct proc_dir_entry *entry;
@@ -354,6 +647,12 @@ static int __init lockdep_proc_init(void)
354 if (entry) 647 if (entry)
355 entry->proc_fops = &proc_lockdep_stats_operations; 648 entry->proc_fops = &proc_lockdep_stats_operations;
356 649
650#ifdef CONFIG_LOCK_STAT
651 entry = create_proc_entry("lock_stat", S_IRUSR, NULL);
652 if (entry)
653 entry->proc_fops = &proc_lock_stat_operations;
654#endif
655
357 return 0; 656 return 0;
358} 657}
359 658
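One detail of the /proc/lock_stat formatting above is worth spelling out: snprint_time() prints a nanosecond count as microseconds with two decimal places, rounding the sub-microsecond remainder. The arithmetic is easy to sanity-check with the userspace restatement below; this is an illustration only, with do_div() replaced by plain / and % since no 64-bit division helper is needed in userspace.

#include <stdio.h>

/* Userspace restatement of snprint_time(): ns -> "usec.xx", rounded. */
static void snprint_time(char *buf, size_t bufsiz, long long nr)
{
	int rem = (int)(nr % 1000);	/* leftover nanoseconds */

	nr /= 1000;			/* whole microseconds */
	snprintf(buf, bufsiz, "%lld.%02d", nr, (rem + 5) / 10);
}

int main(void)
{
	char buf[32];
	long long samples[] = { 0, 250, 1000, 123456, 2499994 };
	int n = sizeof(samples) / sizeof(samples[0]);

	for (int i = 0; i < n; i++) {
		snprint_time(buf, sizeof(buf), samples[i]);
		printf("%8lld ns -> %s us\n", samples[i], buf);
	}
	return 0;
}

A remainder of 995 ns or more rounds up into a three-digit fraction (999 ns prints as 0.100); the original above shares that corner case, and its XXX comment already flags the missing signed-division helper.
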
diff --git a/kernel/module.c b/kernel/module.c
index 33c04ad51175..db0ead0363e2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -784,8 +784,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
784static ssize_t show_refcnt(struct module_attribute *mattr, 784static ssize_t show_refcnt(struct module_attribute *mattr,
785 struct module *mod, char *buffer) 785 struct module *mod, char *buffer)
786{ 786{
787 /* sysfs holds a reference */ 787 return sprintf(buffer, "%u\n", module_refcount(mod));
788 return sprintf(buffer, "%u\n", module_refcount(mod)-1);
789} 788}
790 789
791static struct module_attribute refcnt = { 790static struct module_attribute refcnt = {
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 303eab18484b..691b86564dd9 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -139,6 +139,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
139 list_add_tail(&waiter.list, &lock->wait_list); 139 list_add_tail(&waiter.list, &lock->wait_list);
140 waiter.task = task; 140 waiter.task = task;
141 141
142 old_val = atomic_xchg(&lock->count, -1);
143 if (old_val == 1)
144 goto done;
145
146 lock_contended(&lock->dep_map, _RET_IP_);
147
142 for (;;) { 148 for (;;) {
143 /* 149 /*
144 * Lets try to take the lock again - this is needed even if 150 * Lets try to take the lock again - this is needed even if
@@ -174,6 +180,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
174 spin_lock_mutex(&lock->wait_lock, flags); 180 spin_lock_mutex(&lock->wait_lock, flags);
175 } 181 }
176 182
183done:
184 lock_acquired(&lock->dep_map);
177 /* got the lock - rejoice! */ 185 /* got the lock - rejoice! */
178 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 186 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
179 debug_mutex_set_owner(lock, task_thread_info(task)); 187 debug_mutex_set_owner(lock, task_thread_info(task));
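The mutex.c hunk wires the two hooks into the slowpath only: lock_contended() fires after the atomic_xchg() fast path fails, and lock_acquired() fires once the lock is finally taken (the done: label lets an early xchg success skip the wait loop yet still report the acquisition). A hedged userspace analogue with POSIX mutexes, where pthread_mutex_trylock() plays the role of the xchg fast path and the sleeps only exist to make contention likely in the demo, looks like this:

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* Acquire m, recording a wait time only when the fast path fails. */
static void lock_with_stats(pthread_mutex_t *m, long long *wait_ns)
{
	*wait_ns = 0;
	if (pthread_mutex_trylock(m) == 0)
		return;			/* fast path: no contention event */

	long long stamp = now_ns();	/* lock_contended() would fire here */
	pthread_mutex_lock(m);		/* slow path: block until available */
	*wait_ns = now_ns() - stamp;	/* lock_acquired() would fire here */
}

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

static void *holder(void *arg)
{
	struct timespec d = { 0, 5000000 };	/* hold the lock for 5 ms */

	(void)arg;
	pthread_mutex_lock(&m);
	nanosleep(&d, NULL);
	pthread_mutex_unlock(&m);
	return NULL;
}

int main(void)
{
	pthread_t t;
	long long wait_ns;
	struct timespec d = { 0, 1000000 };

	pthread_create(&t, NULL, holder, NULL);
	nanosleep(&d, NULL);		/* give the holder time to lock */

	lock_with_stats(&m, &wait_ns);
	printf("waited %lld ns for the mutex\n", wait_ns);
	pthread_mutex_unlock(&m);

	pthread_join(t, NULL);
	return 0;
}
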
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 10f0bbba382b..f1decd21a534 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -20,6 +20,7 @@
20#include <linux/mnt_namespace.h> 20#include <linux/mnt_namespace.h>
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h>
23 24
24static struct kmem_cache *nsproxy_cachep; 25static struct kmem_cache *nsproxy_cachep;
25 26
@@ -98,8 +99,17 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
98 goto out_user; 99 goto out_user;
99 } 100 }
100 101
102 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
103 if (IS_ERR(new_nsp->net_ns)) {
104 err = PTR_ERR(new_nsp->net_ns);
105 goto out_net;
106 }
107
101 return new_nsp; 108 return new_nsp;
102 109
110out_net:
111 if (new_nsp->user_ns)
112 put_user_ns(new_nsp->user_ns);
103out_user: 113out_user:
104 if (new_nsp->pid_ns) 114 if (new_nsp->pid_ns)
105 put_pid_ns(new_nsp->pid_ns); 115 put_pid_ns(new_nsp->pid_ns);
@@ -132,7 +142,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
132 142
133 get_nsproxy(old_ns); 143 get_nsproxy(old_ns);
134 144
135 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER))) 145 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET)))
136 return 0; 146 return 0;
137 147
138 if (!capable(CAP_SYS_ADMIN)) { 148 if (!capable(CAP_SYS_ADMIN)) {
@@ -164,6 +174,7 @@ void free_nsproxy(struct nsproxy *ns)
164 put_pid_ns(ns->pid_ns); 174 put_pid_ns(ns->pid_ns);
165 if (ns->user_ns) 175 if (ns->user_ns)
166 put_user_ns(ns->user_ns); 176 put_user_ns(ns->user_ns);
177 put_net(ns->net_ns);
167 kmem_cache_free(nsproxy_cachep, ns); 178 kmem_cache_free(nsproxy_cachep, ns);
168} 179}
169 180
@@ -177,7 +188,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
177 int err = 0; 188 int err = 0;
178 189
179 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 190 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
180 CLONE_NEWUSER))) 191 CLONE_NEWUSER | CLONE_NEWNET)))
181 return 0; 192 return 0;
182 193
183 if (!capable(CAP_SYS_ADMIN)) 194 if (!capable(CAP_SYS_ADMIN))
@@ -193,7 +204,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
193static int __init nsproxy_cache_init(void) 204static int __init nsproxy_cache_init(void)
194{ 205{
195 nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), 206 nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy),
196 0, SLAB_PANIC, NULL, NULL); 207 0, SLAB_PANIC, NULL);
197 return 0; 208 return 0;
198} 209}
199 210
diff --git a/kernel/params.c b/kernel/params.c
index effbaaedd7f3..4e57732fcfb4 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -567,7 +567,12 @@ static void __init kernel_param_sysfs_setup(const char *name,
567 kobject_set_name(&mk->kobj, name); 567 kobject_set_name(&mk->kobj, name);
568 kobject_init(&mk->kobj); 568 kobject_init(&mk->kobj);
569 ret = kobject_add(&mk->kobj); 569 ret = kobject_add(&mk->kobj);
570 BUG_ON(ret < 0); 570 if (ret) {
571 printk(KERN_ERR "Module '%s' failed to be added to sysfs, "
572 "error number %d\n", name, ret);
573 printk(KERN_ERR "The system will be unstable now.\n");
574 return;
575 }
571 param_sysfs_setup(mk, kparam, num_params, name_skip); 576 param_sysfs_setup(mk, kparam, num_params, name_skip);
572 kobject_uevent(&mk->kobj, KOBJ_ADD); 577 kobject_uevent(&mk->kobj, KOBJ_ADD);
573} 578}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 329ce0172074..7a15afb73ed0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -241,7 +241,7 @@ static __init int init_posix_timers(void)
241 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 241 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
242 242
243 posix_timers_cache = kmem_cache_create("posix_timers_cache", 243 posix_timers_cache = kmem_cache_create("posix_timers_cache",
244 sizeof (struct k_itimer), 0, 0, NULL, NULL); 244 sizeof (struct k_itimer), 0, 0, NULL);
245 idr_init(&posix_timers_id); 245 idr_init(&posix_timers_id);
246 return 0; 246 return 0;
247} 247}
@@ -547,9 +547,9 @@ sys_timer_create(const clockid_t which_clock,
547 new_timer->it_process = process; 547 new_timer->it_process = process;
548 list_add(&new_timer->list, 548 list_add(&new_timer->list,
549 &process->signal->posix_timers); 549 &process->signal->posix_timers);
550 spin_unlock_irqrestore(&process->sighand->siglock, flags);
551 if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 550 if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
552 get_task_struct(process); 551 get_task_struct(process);
552 spin_unlock_irqrestore(&process->sighand->siglock, flags);
553 } else { 553 } else {
554 spin_unlock_irqrestore(&process->sighand->siglock, flags); 554 spin_unlock_irqrestore(&process->sighand->siglock, flags);
555 process = NULL; 555 process = NULL;
@@ -605,13 +605,14 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
605 timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); 605 timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
606 if (timr) { 606 if (timr) {
607 spin_lock(&timr->it_lock); 607 spin_lock(&timr->it_lock);
608 spin_unlock(&idr_lock);
609 608
610 if ((timr->it_id != timer_id) || !(timr->it_process) || 609 if ((timr->it_id != timer_id) || !(timr->it_process) ||
611 timr->it_process->tgid != current->tgid) { 610 timr->it_process->tgid != current->tgid) {
612 unlock_timer(timr, *flags); 611 spin_unlock(&timr->it_lock);
612 spin_unlock_irqrestore(&idr_lock, *flags);
613 timr = NULL; 613 timr = NULL;
614 } 614 } else
615 spin_unlock(&idr_lock);
615 } else 616 } else
616 spin_unlock_irqrestore(&idr_lock, *flags); 617 spin_unlock_irqrestore(&idr_lock, *flags);
617 618
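The lock_timer() change above keeps idr_lock held until the candidate timer has been validated under its own it_lock, so a concurrent deletion cannot slip in between the idr lookup and the validity check, and the irq flags saved with idr_lock are restored on the same path that drops it. The general shape, lock coupling from a lookup lock to a per-object lock and revalidating under the object lock, can be sketched in userspace with POSIX mutexes; struct timer and the table below are invented for the illustration.

#include <pthread.h>
#include <stdio.h>

/* Illustrative object table protected by a global lookup lock. */
struct timer {
	int id;
	int live;			/* cleared when the timer is deleted */
	pthread_mutex_t lock;		/* per-object lock (it_lock analogue) */
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct timer *table[16];		/* idr analogue, index == id */

/*
 * Hand-over-hand lookup: take the object lock while the table lock is
 * still held, revalidate, and only then drop the table lock, so the
 * object cannot be deleted between lookup and use.  Returns the timer
 * locked, or NULL.
 */
static struct timer *lock_timer(int id)
{
	struct timer *t = NULL;

	pthread_mutex_lock(&table_lock);
	if (id >= 0 && id < 16)
		t = table[id];
	if (t) {
		pthread_mutex_lock(&t->lock);	/* couple the locks */
		if (!t->live || t->id != id) {	/* revalidate under t->lock */
			pthread_mutex_unlock(&t->lock);
			t = NULL;
		}
	}
	pthread_mutex_unlock(&table_lock);	/* drop the lookup lock last */
	return t;
}

int main(void)
{
	static struct timer t0 = { .id = 0, .live = 1,
				   .lock = PTHREAD_MUTEX_INITIALIZER };
	struct timer *t;

	table[0] = &t0;
	t = lock_timer(0);
	printf("lookup %s\n", t ? "succeeded, timer locked" : "failed");
	if (t)
		pthread_mutex_unlock(&t->lock);
	return 0;
}
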
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 495b7d4dd330..14b0e10dc95c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -33,13 +33,20 @@ config PM_DEBUG
33 bool "Power Management Debug Support" 33 bool "Power Management Debug Support"
34 depends on PM 34 depends on PM
35 ---help--- 35 ---help---
36 This option enables verbose debugging support in the Power Management 36 This option enables various debugging support in the Power Management
37 code. This is helpful when debugging and reporting various PM bugs, 37 code. This is helpful when debugging and reporting PM bugs, like
38 like suspend support. 38 suspend support.
39
40config PM_VERBOSE
41 bool "Verbose Power Management debugging"
42 depends on PM_DEBUG
43 default n
44 ---help---
45 This option enables verbose messages from the Power Management code.
39 46
40config DISABLE_CONSOLE_SUSPEND 47config DISABLE_CONSOLE_SUSPEND
41 bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" 48 bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
42 depends on PM && PM_DEBUG 49 depends on PM_DEBUG && PM_SLEEP
43 default n 50 default n
44 ---help--- 51 ---help---
45 This option turns off the console suspend mechanism that prevents 52 This option turns off the console suspend mechanism that prevents
@@ -50,7 +57,7 @@ config DISABLE_CONSOLE_SUSPEND
50 57
51config PM_TRACE 58config PM_TRACE
52 bool "Suspend/resume event tracing" 59 bool "Suspend/resume event tracing"
53 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL 60 depends on PM_DEBUG && X86 && PM_SLEEP && EXPERIMENTAL
54 default n 61 default n
55 ---help--- 62 ---help---
56 This enables some cheesy code to save the last PM event point in the 63 This enables some cheesy code to save the last PM event point in the
@@ -65,21 +72,58 @@ config PM_TRACE
65 CAUTION: this option will cause your machine's real-time clock to be 72 CAUTION: this option will cause your machine's real-time clock to be
66 set to an invalid time after a resume. 73 set to an invalid time after a resume.
67 74
68config PM_SYSFS_DEPRECATED 75config PM_SLEEP_SMP
69 bool "Driver model /sys/devices/.../power/state files (DEPRECATED)" 76 bool
70 depends on PM && SYSFS 77 depends on SUSPEND_SMP_POSSIBLE || HIBERNATION_SMP_POSSIBLE
71 default n 78 depends on PM_SLEEP
72 help 79 select HOTPLUG_CPU
73 The driver model started out with a sysfs file intended to provide 80 default y
74 a userspace hook for device power management. This feature has never 81
75 worked very well, except for limited testing purposes, and so it will 82config PM_SLEEP
76 be removed. It's not clear that a generic mechanism could really 83 bool
77 handle the wide variability of device power states; any replacements 84 depends on SUSPEND || HIBERNATION
78 are likely to be bus or driver specific. 85 default y
79 86
80config SOFTWARE_SUSPEND 87config SUSPEND_UP_POSSIBLE
81 bool "Software Suspend (Hibernation)" 88 bool
82 depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) 89 depends on (X86 && !X86_VOYAGER) || PPC || ARM || BLACKFIN || MIPS \
90 || SUPERH || FRV
91 depends on !SMP
92 default y
93
94config SUSPEND_SMP_POSSIBLE
95 bool
96 depends on (X86 && !X86_VOYAGER) \
97 || (PPC && (PPC_PSERIES || PPC_PMAC)) || ARM
98 depends on SMP
99 default y
100
101config SUSPEND
102 bool "Suspend to RAM and standby"
103 depends on PM
104 depends on SUSPEND_UP_POSSIBLE || SUSPEND_SMP_POSSIBLE
105 default y
106 ---help---
107 Allow the system to enter sleep states in which main memory is
108 powered and thus its contents are preserved, such as the
109 suspend-to-RAM state (i.e. the ACPI S3 state).
110
111config HIBERNATION_UP_POSSIBLE
112 bool
113 depends on X86 || PPC64_SWSUSP || PPC32
114 depends on !SMP
115 default y
116
117config HIBERNATION_SMP_POSSIBLE
118 bool
119 depends on (X86 && !X86_VOYAGER) || PPC64_SWSUSP
120 depends on SMP
121 default y
122
123config HIBERNATION
124 bool "Hibernation (aka 'suspend to disk')"
125 depends on PM && SWAP
126 depends on HIBERNATION_UP_POSSIBLE || HIBERNATION_SMP_POSSIBLE
83 ---help--- 127 ---help---
84 Enable the suspend to disk (STD) functionality, which is usually 128 Enable the suspend to disk (STD) functionality, which is usually
85 called "hibernation" in user interfaces. STD checkpoints the 129 called "hibernation" in user interfaces. STD checkpoints the
@@ -117,7 +161,7 @@ config SOFTWARE_SUSPEND
117 161
118config PM_STD_PARTITION 162config PM_STD_PARTITION
119 string "Default resume partition" 163 string "Default resume partition"
120 depends on SOFTWARE_SUSPEND 164 depends on HIBERNATION
121 default "" 165 default ""
122 ---help--- 166 ---help---
123 The default resume partition is the partition that the suspend- 167 The default resume partition is the partition that the suspend-
@@ -137,11 +181,6 @@ config PM_STD_PARTITION
137 suspended image to. It will simply pick the first available swap 181 suspended image to. It will simply pick the first available swap
138 device. 182 device.
139 183
140config SUSPEND_SMP
141 bool
142 depends on HOTPLUG_CPU && (X86 || PPC64) && PM
143 default y
144
145config APM_EMULATION 184config APM_EMULATION
146 tristate "Advanced Power Management Emulation" 185 tristate "Advanced Power Management Emulation"
147 depends on PM && SYS_SUPPORTS_APM_EMULATION 186 depends on PM && SYS_SUPPORTS_APM_EMULATION
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 38725f526afc..f7dfff28ecdb 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,8 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG 3EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6obj-y := main.o process.o console.o 6obj-y := main.o
7obj-$(CONFIG_PM_LEGACY) += pm.o 7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o 8obj-$(CONFIG_PM_SLEEP) += process.o console.o
9obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o
9 10
10obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
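The Kconfig and Makefile changes split the old SOFTWARE_SUSPEND and SUSPEND_SMP knobs into SUSPEND, HIBERNATION and the derived helpers PM_SLEEP and PM_SLEEP_SMP, and compile process.o and console.o only when some sleep state is enabled. In C the same symbols surface as preprocessor guards, typically a real function when the option is on and an inline stub otherwise, as the power.h hunk further down does for suspend_devices_and_enter(). A rough, self-contained sketch of that pattern (CONFIG_SUSPEND here is just a -D flag for the demo, not the kernel build system):

#include <errno.h>
#include <stdio.h>

#ifdef CONFIG_SUSPEND
static int suspend_devices_and_enter(int state)
{
	printf("suspending to state %d\n", state);
	return 0;
}
#else
static inline int suspend_devices_and_enter(int state)
{
	(void)state;
	return -ENOSYS;		/* option compiled out: report "not implemented" */
}
#endif

int main(void)
{
	int ret = suspend_devices_and_enter(3);

	printf("suspend_devices_and_enter() returned %d\n", ret);
	return 0;
}

Building once with -DCONFIG_SUSPEND and once without shows the two behaviours callers of the real function see.
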
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f445b9cd60fb..eb72255b5c86 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -45,7 +45,7 @@ enum {
45 45
46static int hibernation_mode = HIBERNATION_SHUTDOWN; 46static int hibernation_mode = HIBERNATION_SHUTDOWN;
47 47
48struct hibernation_ops *hibernation_ops; 48static struct hibernation_ops *hibernation_ops;
49 49
50/** 50/**
51 * hibernation_set_ops - set the global hibernate operations 51 * hibernation_set_ops - set the global hibernate operations
@@ -54,7 +54,8 @@ struct hibernation_ops *hibernation_ops;
54 54
55void hibernation_set_ops(struct hibernation_ops *ops) 55void hibernation_set_ops(struct hibernation_ops *ops)
56{ 56{
57 if (ops && !(ops->prepare && ops->enter && ops->finish)) { 57 if (ops && !(ops->prepare && ops->enter && ops->finish
58 && ops->pre_restore && ops->restore_cleanup)) {
58 WARN_ON(1); 59 WARN_ON(1);
59 return; 60 return;
60 } 61 }
@@ -74,9 +75,9 @@ void hibernation_set_ops(struct hibernation_ops *ops)
74 * platform driver if so configured and return an error code if it fails 75 * platform driver if so configured and return an error code if it fails
75 */ 76 */
76 77
77static int platform_prepare(void) 78static int platform_prepare(int platform_mode)
78{ 79{
79 return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ? 80 return (platform_mode && hibernation_ops) ?
80 hibernation_ops->prepare() : 0; 81 hibernation_ops->prepare() : 0;
81} 82}
82 83
@@ -85,13 +86,146 @@ static int platform_prepare(void)
85 * using the platform driver (must be called after platform_prepare()) 86 * using the platform driver (must be called after platform_prepare())
86 */ 87 */
87 88
88static void platform_finish(void) 89static void platform_finish(int platform_mode)
89{ 90{
90 if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) 91 if (platform_mode && hibernation_ops)
91 hibernation_ops->finish(); 92 hibernation_ops->finish();
92} 93}
93 94
94/** 95/**
96 * platform_pre_restore - prepare the platform for the restoration from a
97 * hibernation image. If the restore fails after this function has been
98 * called, platform_restore_cleanup() must be called.
99 */
100
101static int platform_pre_restore(int platform_mode)
102{
103 return (platform_mode && hibernation_ops) ?
104 hibernation_ops->pre_restore() : 0;
105}
106
107/**
108 * platform_restore_cleanup - switch the platform to the normal mode of
109 * operation after a failing restore. If platform_pre_restore() has been
110 * called before the failing restore, this function must be called too,
111 * regardless of the result of platform_pre_restore().
112 */
113
114static void platform_restore_cleanup(int platform_mode)
115{
116 if (platform_mode && hibernation_ops)
117 hibernation_ops->restore_cleanup();
118}
119
120/**
121 * hibernation_snapshot - quiesce devices and create the hibernation
122 * snapshot image.
123 * @platform_mode - if set, use the platform driver, if available, to
124 * prepare the platform frimware for the power transition.
125 *
126 * Must be called with pm_mutex held
127 */
128
129int hibernation_snapshot(int platform_mode)
130{
131 int error;
132
133 /* Free memory before shutting down devices. */
134 error = swsusp_shrink_memory();
135 if (error)
136 return error;
137
138 suspend_console();
139 error = device_suspend(PMSG_FREEZE);
140 if (error)
141 goto Resume_console;
142
143 error = platform_prepare(platform_mode);
144 if (error)
145 goto Resume_devices;
146
147 error = disable_nonboot_cpus();
148 if (!error) {
149 if (hibernation_mode != HIBERNATION_TEST) {
150 in_suspend = 1;
151 error = swsusp_suspend();
152 /* Control returns here after successful restore */
153 } else {
154 printk("swsusp debug: Waiting for 5 seconds.\n");
155 mdelay(5000);
156 }
157 }
158 enable_nonboot_cpus();
159 Resume_devices:
160 platform_finish(platform_mode);
161 device_resume();
162 Resume_console:
163 resume_console();
164 return error;
165}
166
167/**
168 * hibernation_restore - quiesce devices and restore the hibernation
169 * snapshot image. If successful, control returns in hibernation_snaphot()
170 * @platform_mode - if set, use the platform driver, if available, to
171 * prepare the platform frimware for the transition.
172 *
173 * Must be called with pm_mutex held
174 */
175
176int hibernation_restore(int platform_mode)
177{
178 int error;
179
180 pm_prepare_console();
181 suspend_console();
182 error = device_suspend(PMSG_PRETHAW);
183 if (error)
184 goto Finish;
185
186 error = platform_pre_restore(platform_mode);
187 if (!error) {
188 error = disable_nonboot_cpus();
189 if (!error)
190 error = swsusp_resume();
191 enable_nonboot_cpus();
192 }
193 platform_restore_cleanup(platform_mode);
194 device_resume();
195 Finish:
196 resume_console();
197 pm_restore_console();
198 return error;
199}
200
201/**
202 * hibernation_platform_enter - enter the hibernation state using the
203 * platform driver (if available)
204 */
205
206int hibernation_platform_enter(void)
207{
208 int error;
209
210 if (hibernation_ops) {
211 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
212 /*
213 * We have cancelled the power transition by running
214 * hibernation_ops->finish() before saving the image, so we
215 * should let the firmware know that we're going to enter the
216 * sleep state after all
217 */
218 error = hibernation_ops->prepare();
219 sysdev_shutdown();
220 if (!error)
221 error = hibernation_ops->enter();
222 } else {
223 error = -ENOSYS;
224 }
225 return error;
226}
227
228/**
95 * power_down - Shut the machine down for hibernation. 229 * power_down - Shut the machine down for hibernation.
96 * 230 *
97 * Use the platform driver, if configured so; otherwise try 231 * Use the platform driver, if configured so; otherwise try
@@ -111,11 +245,7 @@ static void power_down(void)
111 kernel_restart(NULL); 245 kernel_restart(NULL);
112 break; 246 break;
113 case HIBERNATION_PLATFORM: 247 case HIBERNATION_PLATFORM:
114 if (hibernation_ops) { 248 hibernation_platform_enter();
115 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
116 hibernation_ops->enter();
117 break;
118 }
119 } 249 }
120 kernel_halt(); 250 kernel_halt();
121 /* 251 /*
@@ -152,9 +282,16 @@ int hibernate(void)
152{ 282{
153 int error; 283 int error;
154 284
285 mutex_lock(&pm_mutex);
155 /* The snapshot device should not be opened while we're running */ 286 /* The snapshot device should not be opened while we're running */
156 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) 287 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
157 return -EBUSY; 288 error = -EBUSY;
289 goto Unlock;
290 }
291
292 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
293 if (error)
294 goto Exit;
158 295
159 /* Allocate memory management structures */ 296 /* Allocate memory management structures */
160 error = create_basic_memory_bitmaps(); 297 error = create_basic_memory_bitmaps();
@@ -165,75 +302,35 @@ int hibernate(void)
165 if (error) 302 if (error)
166 goto Finish; 303 goto Finish;
167 304
168 mutex_lock(&pm_mutex);
169 if (hibernation_mode == HIBERNATION_TESTPROC) { 305 if (hibernation_mode == HIBERNATION_TESTPROC) {
170 printk("swsusp debug: Waiting for 5 seconds.\n"); 306 printk("swsusp debug: Waiting for 5 seconds.\n");
171 mdelay(5000); 307 mdelay(5000);
172 goto Thaw; 308 goto Thaw;
173 } 309 }
310 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
311 if (in_suspend && !error) {
312 unsigned int flags = 0;
174 313
175 /* Free memory before shutting down devices. */ 314 if (hibernation_mode == HIBERNATION_PLATFORM)
176 error = swsusp_shrink_memory(); 315 flags |= SF_PLATFORM_MODE;
177 if (error)
178 goto Thaw;
179
180 error = platform_prepare();
181 if (error)
182 goto Thaw;
183
184 suspend_console();
185 error = device_suspend(PMSG_FREEZE);
186 if (error) {
187 printk(KERN_ERR "PM: Some devices failed to suspend\n");
188 goto Resume_devices;
189 }
190 error = disable_nonboot_cpus();
191 if (error)
192 goto Enable_cpus;
193
194 if (hibernation_mode == HIBERNATION_TEST) {
195 printk("swsusp debug: Waiting for 5 seconds.\n");
196 mdelay(5000);
197 goto Enable_cpus;
198 }
199
200 pr_debug("PM: snapshotting memory.\n");
201 in_suspend = 1;
202 error = swsusp_suspend();
203 if (error)
204 goto Enable_cpus;
205
206 if (in_suspend) {
207 enable_nonboot_cpus();
208 platform_finish();
209 device_resume();
210 resume_console();
211 pr_debug("PM: writing image.\n"); 316 pr_debug("PM: writing image.\n");
212 error = swsusp_write(); 317 error = swsusp_write(flags);
318 swsusp_free();
213 if (!error) 319 if (!error)
214 power_down(); 320 power_down();
215 else {
216 swsusp_free();
217 goto Thaw;
218 }
219 } else { 321 } else {
220 pr_debug("PM: Image restored successfully.\n"); 322 pr_debug("PM: Image restored successfully.\n");
323 swsusp_free();
221 } 324 }
222
223 swsusp_free();
224 Enable_cpus:
225 enable_nonboot_cpus();
226 Resume_devices:
227 platform_finish();
228 device_resume();
229 resume_console();
230 Thaw: 325 Thaw:
231 mutex_unlock(&pm_mutex);
232 unprepare_processes(); 326 unprepare_processes();
233 Finish: 327 Finish:
234 free_basic_memory_bitmaps(); 328 free_basic_memory_bitmaps();
235 Exit: 329 Exit:
330 pm_notifier_call_chain(PM_POST_HIBERNATION);
236 atomic_inc(&snapshot_device_available); 331 atomic_inc(&snapshot_device_available);
332 Unlock:
333 mutex_unlock(&pm_mutex);
237 return error; 334 return error;
238} 335}
239 336
@@ -253,6 +350,7 @@ int hibernate(void)
253static int software_resume(void) 350static int software_resume(void)
254{ 351{
255 int error; 352 int error;
353 unsigned int flags;
256 354
257 mutex_lock(&pm_mutex); 355 mutex_lock(&pm_mutex);
258 if (!swsusp_resume_device) { 356 if (!swsusp_resume_device) {
@@ -300,30 +398,12 @@ static int software_resume(void)
300 398
301 pr_debug("PM: Reading swsusp image.\n"); 399 pr_debug("PM: Reading swsusp image.\n");
302 400
303 error = swsusp_read(); 401 error = swsusp_read(&flags);
304 if (error) {
305 swsusp_free();
306 goto Thaw;
307 }
308
309 pr_debug("PM: Preparing devices for restore.\n");
310
311 suspend_console();
312 error = device_suspend(PMSG_PRETHAW);
313 if (error)
314 goto Free;
315
316 error = disable_nonboot_cpus();
317 if (!error) 402 if (!error)
318 swsusp_resume(); 403 hibernation_restore(flags & SF_PLATFORM_MODE);
319 404
320 enable_nonboot_cpus();
321 Free:
322 swsusp_free();
323 device_resume();
324 resume_console();
325 Thaw:
326 printk(KERN_ERR "PM: Restore failed, recovering.\n"); 405 printk(KERN_ERR "PM: Restore failed, recovering.\n");
406 swsusp_free();
327 unprepare_processes(); 407 unprepare_processes();
328 Done: 408 Done:
329 free_basic_memory_bitmaps(); 409 free_basic_memory_bitmaps();
@@ -333,7 +413,7 @@ static int software_resume(void)
333 Unlock: 413 Unlock:
334 mutex_unlock(&pm_mutex); 414 mutex_unlock(&pm_mutex);
335 pr_debug("PM: Resume from disk failed.\n"); 415 pr_debug("PM: Resume from disk failed.\n");
336 return 0; 416 return error;
337} 417}
338 418
339late_initcall(software_resume); 419late_initcall(software_resume);
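hibernation_snapshot() above is a textbook example of goto-based unwinding: each preparation step that succeeds must be undone in reverse order when a later step fails, and the error labels fall through one another so every exit path resumes the devices and the console exactly once. A reduced userspace sketch of the same control flow, with step names invented for the illustration:

#include <stdio.h>

/* Stand-ins for suspend_console()/device_suspend()/platform_prepare(). */
static int step(const char *name, int fail)
{
	printf("doing   %s\n", name);
	return fail ? -1 : 0;
}

static void undo(const char *name)
{
	printf("undoing %s\n", name);
}

static int snapshot(int fail_at)
{
	int error;

	error = step("suspend console", fail_at == 1);
	if (error)
		return error;

	error = step("suspend devices", fail_at == 2);
	if (error)
		goto Resume_console;

	error = step("platform prepare", fail_at == 3);
	if (error)
		goto Resume_devices;

	printf("taking snapshot\n");

	/* success and failure share the same unwind path, in reverse order */
 Resume_devices:
	undo("platform prepare");
	undo("suspend devices");
 Resume_console:
	undo("suspend console");
	return error;
}

int main(void)
{
	for (int f = 0; f <= 3; f++) {
		printf("--- fail_at=%d ---\n", f);
		snapshot(f);
	}
	return 0;
}

Running it shows that a failure at any step skips exactly the undo calls for steps that never ran, which is the property the Resume_devices/Resume_console labels preserve in the kernel code.
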
diff --git a/kernel/power/main.c b/kernel/power/main.c
index fc45ed22620f..350b485b3b60 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -23,11 +23,15 @@
23 23
24#include "power.h" 24#include "power.h"
25 25
26/*This is just an arbitrary number */ 26BLOCKING_NOTIFIER_HEAD(pm_chain_head);
27#define FREE_PAGE_NUMBER (100)
28 27
29DEFINE_MUTEX(pm_mutex); 28DEFINE_MUTEX(pm_mutex);
30 29
30#ifdef CONFIG_SUSPEND
31
32/* This is just an arbitrary number */
33#define FREE_PAGE_NUMBER (100)
34
31struct pm_ops *pm_ops; 35struct pm_ops *pm_ops;
32 36
33/** 37/**
@@ -63,14 +67,11 @@ static inline void pm_finish(suspend_state_t state)
63 67
64/** 68/**
65 * suspend_prepare - Do prep work before entering low-power state. 69 * suspend_prepare - Do prep work before entering low-power state.
66 * @state: State we're entering.
67 * 70 *
68 * This is common code that is called for each state that we're 71 * This is common code that is called for each state that we're entering.
69 * entering. Allocate a console, stop all processes, then make sure 72 * Run suspend notifiers, allocate a console and stop all processes.
70 * the platform can enter the requested state.
71 */ 73 */
72 74static int suspend_prepare(void)
73static int suspend_prepare(suspend_state_t state)
74{ 75{
75 int error; 76 int error;
76 unsigned int free_pages; 77 unsigned int free_pages;
@@ -78,6 +79,10 @@ static int suspend_prepare(suspend_state_t state)
78 if (!pm_ops || !pm_ops->enter) 79 if (!pm_ops || !pm_ops->enter)
79 return -EPERM; 80 return -EPERM;
80 81
82 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
83 if (error)
84 goto Finish;
85
81 pm_prepare_console(); 86 pm_prepare_console();
82 87
83 if (freeze_processes()) { 88 if (freeze_processes()) {
@@ -85,46 +90,23 @@ static int suspend_prepare(suspend_state_t state)
85 goto Thaw; 90 goto Thaw;
86 } 91 }
87 92
88 if ((free_pages = global_page_state(NR_FREE_PAGES)) 93 free_pages = global_page_state(NR_FREE_PAGES);
89 < FREE_PAGE_NUMBER) { 94 if (free_pages < FREE_PAGE_NUMBER) {
90 pr_debug("PM: free some memory\n"); 95 pr_debug("PM: free some memory\n");
91 shrink_all_memory(FREE_PAGE_NUMBER - free_pages); 96 shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
92 if (nr_free_pages() < FREE_PAGE_NUMBER) { 97 if (nr_free_pages() < FREE_PAGE_NUMBER) {
93 error = -ENOMEM; 98 error = -ENOMEM;
94 printk(KERN_ERR "PM: No enough memory\n"); 99 printk(KERN_ERR "PM: No enough memory\n");
95 goto Thaw;
96 } 100 }
97 } 101 }
98
99 if (pm_ops->set_target) {
100 error = pm_ops->set_target(state);
101 if (error)
102 goto Thaw;
103 }
104 suspend_console();
105 error = device_suspend(PMSG_SUSPEND);
106 if (error) {
107 printk(KERN_ERR "Some devices failed to suspend\n");
108 goto Resume_console;
109 }
110 if (pm_ops->prepare) {
111 if ((error = pm_ops->prepare(state)))
112 goto Resume_devices;
113 }
114
115 error = disable_nonboot_cpus();
116 if (!error) 102 if (!error)
117 return 0; 103 return 0;
118 104
119 enable_nonboot_cpus();
120 pm_finish(state);
121 Resume_devices:
122 device_resume();
123 Resume_console:
124 resume_console();
125 Thaw: 105 Thaw:
126 thaw_processes(); 106 thaw_processes();
127 pm_restore_console(); 107 pm_restore_console();
108 Finish:
109 pm_notifier_call_chain(PM_POST_SUSPEND);
128 return error; 110 return error;
129} 111}
130 112
@@ -140,6 +122,12 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
140 local_irq_enable(); 122 local_irq_enable();
141} 123}
142 124
125/**
126 * suspend_enter - enter the desired system sleep state.
127 * @state: state to enter
128 *
129 * This function should be called after devices have been suspended.
130 */
143int suspend_enter(suspend_state_t state) 131int suspend_enter(suspend_state_t state)
144{ 132{
145 int error = 0; 133 int error = 0;
@@ -159,23 +147,58 @@ int suspend_enter(suspend_state_t state)
159 return error; 147 return error;
160} 148}
161 149
150/**
151 * suspend_devices_and_enter - suspend devices and enter the desired system sleep
152 * state.
153 * @state: state to enter
154 */
155int suspend_devices_and_enter(suspend_state_t state)
156{
157 int error;
158
159 if (!pm_ops)
160 return -ENOSYS;
161
162 if (pm_ops->set_target) {
163 error = pm_ops->set_target(state);
164 if (error)
165 return error;
166 }
167 suspend_console();
168 error = device_suspend(PMSG_SUSPEND);
169 if (error) {
170 printk(KERN_ERR "Some devices failed to suspend\n");
171 goto Resume_console;
172 }
173 if (pm_ops->prepare) {
174 error = pm_ops->prepare(state);
175 if (error)
176 goto Resume_devices;
177 }
178 error = disable_nonboot_cpus();
179 if (!error)
180 suspend_enter(state);
181
182 enable_nonboot_cpus();
183 pm_finish(state);
184 Resume_devices:
185 device_resume();
186 Resume_console:
187 resume_console();
188 return error;
189}
162 190
163/** 191/**
164 * suspend_finish - Do final work before exiting suspend sequence. 192 * suspend_finish - Do final work before exiting suspend sequence.
165 * @state: State we're coming out of.
166 * 193 *
167 * Call platform code to clean up, restart processes, and free the 194 * Call platform code to clean up, restart processes, and free the
168 * console that we've allocated. This is not called for suspend-to-disk. 195 * console that we've allocated. This is not called for suspend-to-disk.
169 */ 196 */
170 197static void suspend_finish(void)
171static void suspend_finish(suspend_state_t state)
172{ 198{
173 enable_nonboot_cpus();
174 pm_finish(state);
175 device_resume();
176 resume_console();
177 thaw_processes(); 199 thaw_processes();
178 pm_restore_console(); 200 pm_restore_console();
201 pm_notifier_call_chain(PM_POST_SUSPEND);
179} 202}
180 203
181 204
@@ -207,7 +230,6 @@ static inline int valid_state(suspend_state_t state)
207 * Then, do the setup for suspend, enter the state, and cleaup (after 230 * Then, do the setup for suspend, enter the state, and cleaup (after
208 * we've woken up). 231 * we've woken up).
209 */ 232 */
210
211static int enter_state(suspend_state_t state) 233static int enter_state(suspend_state_t state)
212{ 234{
213 int error; 235 int error;
@@ -218,14 +240,14 @@ static int enter_state(suspend_state_t state)
218 return -EBUSY; 240 return -EBUSY;
219 241
220 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 242 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
221 if ((error = suspend_prepare(state))) 243 if ((error = suspend_prepare()))
222 goto Unlock; 244 goto Unlock;
223 245
224 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 246 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
225 error = suspend_enter(state); 247 error = suspend_devices_and_enter(state);
226 248
227 pr_debug("PM: Finishing wakeup.\n"); 249 pr_debug("PM: Finishing wakeup.\n");
228 suspend_finish(state); 250 suspend_finish();
229 Unlock: 251 Unlock:
230 mutex_unlock(&pm_mutex); 252 mutex_unlock(&pm_mutex);
231 return error; 253 return error;
@@ -249,6 +271,8 @@ int pm_suspend(suspend_state_t state)
249 271
250EXPORT_SYMBOL(pm_suspend); 272EXPORT_SYMBOL(pm_suspend);
251 273
274#endif /* CONFIG_SUSPEND */
275
252decl_subsys(power,NULL,NULL); 276decl_subsys(power,NULL,NULL);
253 277
254 278
@@ -265,14 +289,16 @@ decl_subsys(power,NULL,NULL);
265 289
266static ssize_t state_show(struct kset *kset, char *buf) 290static ssize_t state_show(struct kset *kset, char *buf)
267{ 291{
292 char *s = buf;
293#ifdef CONFIG_SUSPEND
268 int i; 294 int i;
269 char * s = buf;
270 295
271 for (i = 0; i < PM_SUSPEND_MAX; i++) { 296 for (i = 0; i < PM_SUSPEND_MAX; i++) {
272 if (pm_states[i] && valid_state(i)) 297 if (pm_states[i] && valid_state(i))
273 s += sprintf(s,"%s ", pm_states[i]); 298 s += sprintf(s,"%s ", pm_states[i]);
274 } 299 }
275#ifdef CONFIG_SOFTWARE_SUSPEND 300#endif
301#ifdef CONFIG_HIBERNATION
276 s += sprintf(s, "%s\n", "disk"); 302 s += sprintf(s, "%s\n", "disk");
277#else 303#else
278 if (s != buf) 304 if (s != buf)
@@ -284,11 +310,13 @@ static ssize_t state_show(struct kset *kset, char *buf)
284 310
285static ssize_t state_store(struct kset *kset, const char *buf, size_t n) 311static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
286{ 312{
313#ifdef CONFIG_SUSPEND
287 suspend_state_t state = PM_SUSPEND_STANDBY; 314 suspend_state_t state = PM_SUSPEND_STANDBY;
288 const char * const *s; 315 const char * const *s;
316#endif
289 char *p; 317 char *p;
290 int error;
291 int len; 318 int len;
319 int error = -EINVAL;
292 320
293 p = memchr(buf, '\n', n); 321 p = memchr(buf, '\n', n);
294 len = p ? p - buf : n; 322 len = p ? p - buf : n;
@@ -296,17 +324,19 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
296 /* First, check if we are requested to hibernate */ 324 /* First, check if we are requested to hibernate */
297 if (len == 4 && !strncmp(buf, "disk", len)) { 325 if (len == 4 && !strncmp(buf, "disk", len)) {
298 error = hibernate(); 326 error = hibernate();
299 return error ? error : n; 327 goto Exit;
300 } 328 }
301 329
330#ifdef CONFIG_SUSPEND
302 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 331 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
303 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 332 if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
304 break; 333 break;
305 } 334 }
306 if (state < PM_SUSPEND_MAX && *s) 335 if (state < PM_SUSPEND_MAX && *s)
307 error = enter_state(state); 336 error = enter_state(state);
308 else 337#endif
309 error = -EINVAL; 338
339 Exit:
310 return error ? error : n; 340 return error ? error : n;
311} 341}
312 342
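pm_notifier_call_chain(), defined in the power.h hunk below, wraps blocking_notifier_call_chain(): registered callbacks are invoked in order with an event such as PM_SUSPEND_PREPARE, and a NOTIFY_BAD return aborts the transition, which suspend_prepare() and hibernate() translate into an error. A minimal userspace model of such a chain, with invented helper names and a fixed-size array standing in for the real notifier list:

#include <errno.h>
#include <stdio.h>

#define NOTIFY_OK   0
#define NOTIFY_BAD  1

#define PM_SUSPEND_PREPARE 1
#define PM_POST_SUSPEND    2

typedef int (*notifier_fn_t)(unsigned long event);

static notifier_fn_t chain[8];		/* stand-in for the notifier list */
static int chain_len;

static void register_pm_notifier(notifier_fn_t fn)
{
	if (chain_len < 8)
		chain[chain_len++] = fn;
}

/* Call every callback in order; stop and report failure on NOTIFY_BAD. */
static int pm_notifier_call_chain(unsigned long event)
{
	for (int i = 0; i < chain_len; i++)
		if (chain[i](event) == NOTIFY_BAD)
			return -EINVAL;	/* same mapping as the power.h helper */
	return 0;
}

static int veto_driver(unsigned long event)
{
	if (event == PM_SUSPEND_PREPARE) {
		printf("veto_driver: not ready, refusing suspend\n");
		return NOTIFY_BAD;
	}
	return NOTIFY_OK;
}

int main(void)
{
	register_pm_notifier(veto_driver);

	if (pm_notifier_call_chain(PM_SUSPEND_PREPARE))
		printf("suspend aborted by a notifier\n");
	pm_notifier_call_chain(PM_POST_SUSPEND);
	return 0;
}
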
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 51381487103f..95fbf2dd3fe3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -13,7 +13,7 @@ struct swsusp_info {
13 13
14 14
15 15
16#ifdef CONFIG_SOFTWARE_SUSPEND 16#ifdef CONFIG_HIBERNATION
17/* 17/*
18 * Keep some memory free so that I/O operations can succeed without paging 18 * Keep some memory free so that I/O operations can succeed without paging
19 * [Might this be more than 4 MB?] 19 * [Might this be more than 4 MB?]
@@ -25,7 +25,10 @@ struct swsusp_info {
25 */ 25 */
26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
27 27
28extern struct hibernation_ops *hibernation_ops; 28/* kernel/power/disk.c */
29extern int hibernation_snapshot(int platform_mode);
30extern int hibernation_restore(int platform_mode);
31extern int hibernation_platform_enter(void);
29#endif 32#endif
30 33
31extern int pfn_is_nosave(unsigned long); 34extern int pfn_is_nosave(unsigned long);
@@ -152,16 +155,42 @@ extern sector_t alloc_swapdev_block(int swap);
152extern void free_all_swap_pages(int swap); 155extern void free_all_swap_pages(int swap);
153extern int swsusp_swap_in_use(void); 156extern int swsusp_swap_in_use(void);
154 157
158/*
159 * Flags that can be passed from the hibernating kernel to the "boot" kernel in
160 * the image header.
161 */
162#define SF_PLATFORM_MODE 1
163
164/* kernel/power/disk.c */
155extern int swsusp_check(void); 165extern int swsusp_check(void);
156extern int swsusp_shrink_memory(void); 166extern int swsusp_shrink_memory(void);
157extern void swsusp_free(void); 167extern void swsusp_free(void);
158extern int swsusp_suspend(void); 168extern int swsusp_suspend(void);
159extern int swsusp_resume(void); 169extern int swsusp_resume(void);
160extern int swsusp_read(void); 170extern int swsusp_read(unsigned int *flags_p);
161extern int swsusp_write(void); 171extern int swsusp_write(unsigned int flags);
162extern void swsusp_close(void); 172extern void swsusp_close(void);
163extern int suspend_enter(suspend_state_t state);
164 173
165struct timeval; 174struct timeval;
175/* kernel/power/swsusp.c */
166extern void swsusp_show_speed(struct timeval *, struct timeval *, 176extern void swsusp_show_speed(struct timeval *, struct timeval *,
167 unsigned int, char *); 177 unsigned int, char *);
178
179#ifdef CONFIG_SUSPEND
180/* kernel/power/main.c */
181extern int suspend_devices_and_enter(suspend_state_t state);
182#else /* !CONFIG_SUSPEND */
183static inline int suspend_devices_and_enter(suspend_state_t state)
184{
185 return -ENOSYS;
186}
187#endif /* !CONFIG_SUSPEND */
188
189/* kernel/power/common.c */
190extern struct blocking_notifier_head pm_chain_head;
191
192static inline int pm_notifier_call_chain(unsigned long val)
193{
194 return (blocking_notifier_call_chain(&pm_chain_head, val, NULL)
195 == NOTIFY_BAD) ? -EINVAL : 0;
196}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e0233d8422b9..3434940a3df1 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -40,7 +40,7 @@ static inline void frozen_process(void)
40 current->flags |= PF_FROZEN; 40 current->flags |= PF_FROZEN;
41 wmb(); 41 wmb();
42 } 42 }
43 clear_tsk_thread_flag(current, TIF_FREEZE); 43 clear_freeze_flag(current);
44} 44}
45 45
46/* Refrigerator is place where frozen processes are stored :-). */ 46/* Refrigerator is place where frozen processes are stored :-). */
@@ -72,20 +72,19 @@ void refrigerator(void)
72 schedule(); 72 schedule();
73 } 73 }
74 pr_debug("%s left refrigerator\n", current->comm); 74 pr_debug("%s left refrigerator\n", current->comm);
75 current->state = save; 75 __set_current_state(save);
76} 76}
77 77
78static inline void freeze_process(struct task_struct *p) 78static void freeze_task(struct task_struct *p)
79{ 79{
80 unsigned long flags; 80 unsigned long flags;
81 81
82 if (!freezing(p)) { 82 if (!freezing(p)) {
83 rmb(); 83 rmb();
84 if (!frozen(p)) { 84 if (!frozen(p)) {
85 set_freeze_flag(p);
85 if (p->state == TASK_STOPPED) 86 if (p->state == TASK_STOPPED)
86 force_sig_specific(SIGSTOP, p); 87 force_sig_specific(SIGSTOP, p);
87
88 freeze(p);
89 spin_lock_irqsave(&p->sighand->siglock, flags); 88 spin_lock_irqsave(&p->sighand->siglock, flags);
90 signal_wake_up(p, p->state == TASK_STOPPED); 89 signal_wake_up(p, p->state == TASK_STOPPED);
91 spin_unlock_irqrestore(&p->sighand->siglock, flags); 90 spin_unlock_irqrestore(&p->sighand->siglock, flags);
@@ -99,19 +98,14 @@ static void cancel_freezing(struct task_struct *p)
99 98
100 if (freezing(p)) { 99 if (freezing(p)) {
101 pr_debug(" clean up: %s\n", p->comm); 100 pr_debug(" clean up: %s\n", p->comm);
102 do_not_freeze(p); 101 clear_freeze_flag(p);
103 spin_lock_irqsave(&p->sighand->siglock, flags); 102 spin_lock_irqsave(&p->sighand->siglock, flags);
104 recalc_sigpending_and_wake(p); 103 recalc_sigpending_and_wake(p);
105 spin_unlock_irqrestore(&p->sighand->siglock, flags); 104 spin_unlock_irqrestore(&p->sighand->siglock, flags);
106 } 105 }
107} 106}
108 107
109static inline int is_user_space(struct task_struct *p) 108static int try_to_freeze_tasks(int freeze_user_space)
110{
111 return p->mm && !(p->flags & PF_BORROWED_MM);
112}
113
114static unsigned int try_to_freeze_tasks(int freeze_user_space)
115{ 109{
116 struct task_struct *g, *p; 110 struct task_struct *g, *p;
117 unsigned long end_time; 111 unsigned long end_time;
@@ -122,26 +116,40 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space)
122 todo = 0; 116 todo = 0;
123 read_lock(&tasklist_lock); 117 read_lock(&tasklist_lock);
124 do_each_thread(g, p) { 118 do_each_thread(g, p) {
125 if (!freezeable(p)) 119 if (frozen(p) || !freezeable(p))
126 continue; 120 continue;
127 121
128 if (frozen(p)) 122 if (freeze_user_space) {
129 continue; 123 if (p->state == TASK_TRACED &&
130 124 frozen(p->parent)) {
131 if (p->state == TASK_TRACED && frozen(p->parent)) { 125 cancel_freezing(p);
132 cancel_freezing(p); 126 continue;
133 continue; 127 }
128 /*
129 * Kernel threads should not have TIF_FREEZE set
130 * at this point, so we must ensure that either
131 * p->mm is not NULL *and* PF_BORROWED_MM is
132 * unset, or TIF_FRREZE is left unset.
133 * The task_lock() is necessary to prevent races
134 * with exit_mm() or use_mm()/unuse_mm() from
135 * occuring.
136 */
137 task_lock(p);
138 if (!p->mm || (p->flags & PF_BORROWED_MM)) {
139 task_unlock(p);
140 continue;
141 }
142 freeze_task(p);
143 task_unlock(p);
144 } else {
145 freeze_task(p);
134 } 146 }
135 if (freeze_user_space && !is_user_space(p))
136 continue;
137
138 freeze_process(p);
139 if (!freezer_should_skip(p)) 147 if (!freezer_should_skip(p))
140 todo++; 148 todo++;
141 } while_each_thread(g, p); 149 } while_each_thread(g, p);
142 read_unlock(&tasklist_lock); 150 read_unlock(&tasklist_lock);
143 yield(); /* Yield is okay here */ 151 yield(); /* Yield is okay here */
144 if (todo && time_after(jiffies, end_time)) 152 if (time_after(jiffies, end_time))
145 break; 153 break;
146 } while (todo); 154 } while (todo);
147 155
@@ -152,49 +160,41 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space)
152 * but it cleans up leftover PF_FREEZE requests. 160 * but it cleans up leftover PF_FREEZE requests.
153 */ 161 */
154 printk("\n"); 162 printk("\n");
155 printk(KERN_ERR "Stopping %s timed out after %d seconds " 163 printk(KERN_ERR "Freezing of %s timed out after %d seconds "
156 "(%d tasks refusing to freeze):\n", 164 "(%d tasks refusing to freeze):\n",
157 freeze_user_space ? "user space processes" : 165 freeze_user_space ? "user space " : "tasks ",
158 "kernel threads",
159 TIMEOUT / HZ, todo); 166 TIMEOUT / HZ, todo);
167 show_state();
160 read_lock(&tasklist_lock); 168 read_lock(&tasklist_lock);
161 do_each_thread(g, p) { 169 do_each_thread(g, p) {
162 if (freeze_user_space && !is_user_space(p))
163 continue;
164
165 task_lock(p); 170 task_lock(p);
166 if (freezeable(p) && !frozen(p) && 171 if (freezing(p) && !freezer_should_skip(p))
167 !freezer_should_skip(p))
168 printk(KERN_ERR " %s\n", p->comm); 172 printk(KERN_ERR " %s\n", p->comm);
169
170 cancel_freezing(p); 173 cancel_freezing(p);
171 task_unlock(p); 174 task_unlock(p);
172 } while_each_thread(g, p); 175 } while_each_thread(g, p);
173 read_unlock(&tasklist_lock); 176 read_unlock(&tasklist_lock);
174 } 177 }
175 178
176 return todo; 179 return todo ? -EBUSY : 0;
177} 180}
178 181
179/** 182/**
180 * freeze_processes - tell processes to enter the refrigerator 183 * freeze_processes - tell processes to enter the refrigerator
181 *
182 * Returns 0 on success, or the number of processes that didn't freeze,
183 * although they were told to.
184 */ 184 */
185int freeze_processes(void) 185int freeze_processes(void)
186{ 186{
187 unsigned int nr_unfrozen; 187 int error;
188 188
189 printk("Stopping tasks ... "); 189 printk("Stopping tasks ... ");
190 nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE); 190 error = try_to_freeze_tasks(FREEZER_USER_SPACE);
191 if (nr_unfrozen) 191 if (error)
192 return nr_unfrozen; 192 return error;
193 193
194 sys_sync(); 194 sys_sync();
195 nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); 195 error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
196 if (nr_unfrozen) 196 if (error)
197 return nr_unfrozen; 197 return error;
198 198
199 printk("done.\n"); 199 printk("done.\n");
200 BUG_ON(in_atomic()); 200 BUG_ON(in_atomic());
@@ -210,7 +210,7 @@ static void thaw_tasks(int thaw_user_space)
210 if (!freezeable(p)) 210 if (!freezeable(p))
211 continue; 211 continue;
212 212
213 if (is_user_space(p) == !thaw_user_space) 213 if (!p->mm == thaw_user_space)
214 continue; 214 continue;
215 215
216 thaw_process(p); 216 thaw_process(p);
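try_to_freeze_tasks() above is structured as a bounded polling loop: sweep all tasks, count the ones that still have not entered the refrigerator, and give up with -EBUSY once the timeout expires while stragglers remain. Stripped of the tasklist walking and freezer details, the control flow is the familiar retry-until-done-or-deadline idiom, sketched here in userspace with a fake task counter:

#include <errno.h>
#include <stdio.h>
#include <time.h>

#define TIMEOUT_SECS 20

/* Pretend task state: how many tasks have not frozen yet. */
static int not_frozen = 5;

static int count_unfrozen(void)
{
	if (not_frozen)
		not_frozen--;		/* one more task freezes per sweep */
	return not_frozen;
}

static int try_to_freeze_tasks(void)
{
	time_t end_time = time(NULL) + TIMEOUT_SECS;
	int todo;

	do {
		todo = count_unfrozen();	/* one sweep over the tasks */
		if (time(NULL) > end_time)	/* deadline check, as above */
			break;
	} while (todo);

	return todo ? -EBUSY : 0;	/* mirrors the patch's return value */
}

int main(void)
{
	int error = try_to_freeze_tasks();

	printf("freeze %s (%d)\n", error ? "timed out" : "succeeded", error);
	return 0;
}
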
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index a3b7854b8f7c..a686590d88c1 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -709,7 +709,8 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
709 region->end_pfn << PAGE_SHIFT); 709 region->end_pfn << PAGE_SHIFT);
710 710
711 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) 711 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
712 memory_bm_set_bit(bm, pfn); 712 if (pfn_valid(pfn))
713 memory_bm_set_bit(bm, pfn);
713 } 714 }
714} 715}
715 716
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8b1a1b837145..917aba100575 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -33,8 +33,9 @@ extern char resume_file[];
33#define SWSUSP_SIG "S1SUSPEND" 33#define SWSUSP_SIG "S1SUSPEND"
34 34
35struct swsusp_header { 35struct swsusp_header {
36 char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; 36 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
37 sector_t image; 37 sector_t image;
38 unsigned int flags; /* Flags to pass to the "boot" kernel */
38 char orig_sig[10]; 39 char orig_sig[10];
39 char sig[10]; 40 char sig[10];
40} __attribute__((packed)); 41} __attribute__((packed));
@@ -138,7 +139,7 @@ static int wait_on_bio_chain(struct bio **bio_chain)
138 * Saving part 139 * Saving part
139 */ 140 */
140 141
141static int mark_swapfiles(sector_t start) 142static int mark_swapfiles(sector_t start, unsigned int flags)
142{ 143{
143 int error; 144 int error;
144 145
@@ -148,6 +149,7 @@ static int mark_swapfiles(sector_t start)
148 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 149 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
149 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 150 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
150 swsusp_header->image = start; 151 swsusp_header->image = start;
152 swsusp_header->flags = flags;
151 error = bio_write_page(swsusp_resume_block, 153 error = bio_write_page(swsusp_resume_block,
152 swsusp_header, NULL); 154 swsusp_header, NULL);
153 } else { 155 } else {
@@ -369,6 +371,7 @@ static int enough_swap(unsigned int nr_pages)
369 371
370/** 372/**
371 * swsusp_write - Write entire image and metadata. 373 * swsusp_write - Write entire image and metadata.
374 * @flags: flags to pass to the "boot" kernel in the image header
372 * 375 *
373 * It is important _NOT_ to umount filesystems at this point. We want 376 * It is important _NOT_ to umount filesystems at this point. We want
374 * them synced (in case something goes wrong) but we DO not want to mark 377 * them synced (in case something goes wrong) but we DO not want to mark
@@ -376,7 +379,7 @@ static int enough_swap(unsigned int nr_pages)
376 * correctly, we'll mark system clean, anyway.) 379 * correctly, we'll mark system clean, anyway.)
377 */ 380 */
378 381
379int swsusp_write(void) 382int swsusp_write(unsigned int flags)
380{ 383{
381 struct swap_map_handle handle; 384 struct swap_map_handle handle;
382 struct snapshot_handle snapshot; 385 struct snapshot_handle snapshot;
@@ -415,7 +418,7 @@ int swsusp_write(void)
415 if (!error) { 418 if (!error) {
416 flush_swap_writer(&handle); 419 flush_swap_writer(&handle);
417 printk("S"); 420 printk("S");
418 error = mark_swapfiles(start); 421 error = mark_swapfiles(start, flags);
419 printk("|\n"); 422 printk("|\n");
420 } 423 }
421 } 424 }
@@ -540,13 +543,20 @@ static int load_image(struct swap_map_handle *handle,
540 return error; 543 return error;
541} 544}
542 545
543int swsusp_read(void) 546/**
547 * swsusp_read - read the hibernation image.
548 * @flags_p: flags passed by the "frozen" kernel in the image header should

549 * be written into this memory location
550 */
551
552int swsusp_read(unsigned int *flags_p)
544{ 553{
545 int error; 554 int error;
546 struct swap_map_handle handle; 555 struct swap_map_handle handle;
547 struct snapshot_handle snapshot; 556 struct snapshot_handle snapshot;
548 struct swsusp_info *header; 557 struct swsusp_info *header;
549 558
559 *flags_p = swsusp_header->flags;
550 if (IS_ERR(resume_bdev)) { 560 if (IS_ERR(resume_bdev)) {
551 pr_debug("swsusp: block device not initialised\n"); 561 pr_debug("swsusp: block device not initialised\n");
552 return PTR_ERR(resume_bdev); 562 return PTR_ERR(resume_bdev);
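
Editor's note: taken together, the swap.c hunks thread a flags word from the suspending kernel to the resuming one via the on-disk header: mark_swapfiles() stores it next to the image sector, and swsusp_read() hands it back through *flags_p. A compact userspace sketch of that round trip over an in-memory "swap page" (struct layout and function names are illustrative, not the kernel API):

        #include <stdio.h>
        #include <string.h>

        struct header_demo {
                unsigned long long image;   /* first sector of the image */
                unsigned int flags;         /* e.g. "platform hibernation used" */
                char sig[10];
        };

        static unsigned char swap_page[4096];   /* pretend header page on disk */

        /* Suspend side: record where the image starts and which flags applied. */
        static void mark_swapfiles_demo(unsigned long long start, unsigned int flags)
        {
                struct header_demo h = { .image = start, .flags = flags };

                memcpy(h.sig, "S1SUSPEND", 10);
                memcpy(swap_page, &h, sizeof(h));
        }

        /* Resume side: report the stored flags back through an out parameter. */
        static int swsusp_read_demo(unsigned long long *start_p, unsigned int *flags_p)
        {
                struct header_demo h;

                memcpy(&h, swap_page, sizeof(h));
                if (memcmp(h.sig, "S1SUSPEND", 10))
                        return -1;              /* no image present */
                *start_p = h.image;
                *flags_p = h.flags;
                return 0;
        }

        int main(void)
        {
                unsigned long long start;
                unsigned int flags;

                mark_swapfiles_demo(123, 0x1);
                if (!swsusp_read_demo(&start, &flags))
                        printf("image at sector %llu, flags %#x\n", start, flags);
                return 0;
        }
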
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d65305b515b1..bd0723a7df3f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -128,92 +128,6 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
128 return res; 128 return res;
129} 129}
130 130
131static inline int platform_prepare(void)
132{
133 int error = 0;
134
135 if (hibernation_ops)
136 error = hibernation_ops->prepare();
137
138 return error;
139}
140
141static inline void platform_finish(void)
142{
143 if (hibernation_ops)
144 hibernation_ops->finish();
145}
146
147static inline int snapshot_suspend(int platform_suspend)
148{
149 int error;
150
151 mutex_lock(&pm_mutex);
152 /* Free memory before shutting down devices. */
153 error = swsusp_shrink_memory();
154 if (error)
155 goto Finish;
156
157 if (platform_suspend) {
158 error = platform_prepare();
159 if (error)
160 goto Finish;
161 }
162 suspend_console();
163 error = device_suspend(PMSG_FREEZE);
164 if (error)
165 goto Resume_devices;
166
167 error = disable_nonboot_cpus();
168 if (!error) {
169 in_suspend = 1;
170 error = swsusp_suspend();
171 }
172 enable_nonboot_cpus();
173 Resume_devices:
174 if (platform_suspend)
175 platform_finish();
176
177 device_resume();
178 resume_console();
179 Finish:
180 mutex_unlock(&pm_mutex);
181 return error;
182}
183
184static inline int snapshot_restore(int platform_suspend)
185{
186 int error;
187
188 mutex_lock(&pm_mutex);
189 pm_prepare_console();
190 if (platform_suspend) {
191 error = platform_prepare();
192 if (error)
193 goto Finish;
194 }
195 suspend_console();
196 error = device_suspend(PMSG_PRETHAW);
197 if (error)
198 goto Resume_devices;
199
200 error = disable_nonboot_cpus();
201 if (!error)
202 error = swsusp_resume();
203
204 enable_nonboot_cpus();
205 Resume_devices:
206 if (platform_suspend)
207 platform_finish();
208
209 device_resume();
210 resume_console();
211 Finish:
212 pm_restore_console();
213 mutex_unlock(&pm_mutex);
214 return error;
215}
216
217static int snapshot_ioctl(struct inode *inode, struct file *filp, 131static int snapshot_ioctl(struct inode *inode, struct file *filp,
218 unsigned int cmd, unsigned long arg) 132 unsigned int cmd, unsigned long arg)
219{ 133{
@@ -237,10 +151,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
237 if (data->frozen) 151 if (data->frozen)
238 break; 152 break;
239 mutex_lock(&pm_mutex); 153 mutex_lock(&pm_mutex);
240 if (freeze_processes()) { 154 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
241 thaw_processes(); 155 if (!error) {
242 error = -EBUSY; 156 error = freeze_processes();
157 if (error)
158 thaw_processes();
243 } 159 }
160 if (error)
161 pm_notifier_call_chain(PM_POST_HIBERNATION);
244 mutex_unlock(&pm_mutex); 162 mutex_unlock(&pm_mutex);
245 if (!error) 163 if (!error)
246 data->frozen = 1; 164 data->frozen = 1;
@@ -251,6 +169,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
251 break; 169 break;
252 mutex_lock(&pm_mutex); 170 mutex_lock(&pm_mutex);
253 thaw_processes(); 171 thaw_processes();
172 pm_notifier_call_chain(PM_POST_HIBERNATION);
254 mutex_unlock(&pm_mutex); 173 mutex_unlock(&pm_mutex);
255 data->frozen = 0; 174 data->frozen = 0;
256 break; 175 break;
@@ -260,7 +179,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
260 error = -EPERM; 179 error = -EPERM;
261 break; 180 break;
262 } 181 }
263 error = snapshot_suspend(data->platform_suspend); 182 error = hibernation_snapshot(data->platform_suspend);
264 if (!error) 183 if (!error)
265 error = put_user(in_suspend, (unsigned int __user *)arg); 184 error = put_user(in_suspend, (unsigned int __user *)arg);
266 if (!error) 185 if (!error)
@@ -274,7 +193,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
274 error = -EPERM; 193 error = -EPERM;
275 break; 194 break;
276 } 195 }
277 error = snapshot_restore(data->platform_suspend); 196 error = hibernation_restore(data->platform_suspend);
278 break; 197 break;
279 198
280 case SNAPSHOT_FREE: 199 case SNAPSHOT_FREE:
@@ -336,47 +255,19 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
336 break; 255 break;
337 256
338 case SNAPSHOT_S2RAM: 257 case SNAPSHOT_S2RAM:
339 if (!pm_ops) {
340 error = -ENOSYS;
341 break;
342 }
343
344 if (!data->frozen) { 258 if (!data->frozen) {
345 error = -EPERM; 259 error = -EPERM;
346 break; 260 break;
347 } 261 }
348
349 if (!mutex_trylock(&pm_mutex)) { 262 if (!mutex_trylock(&pm_mutex)) {
350 error = -EBUSY; 263 error = -EBUSY;
351 break; 264 break;
352 } 265 }
353 266 /*
354 if (pm_ops->prepare) { 267 * Tasks are frozen and the notifiers have been called with
355 error = pm_ops->prepare(PM_SUSPEND_MEM); 268 * PM_HIBERNATION_PREPARE
356 if (error) 269 */
357 goto OutS3; 270 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
358 }
359
360 /* Put devices to sleep */
361 suspend_console();
362 error = device_suspend(PMSG_SUSPEND);
363 if (error) {
364 printk(KERN_ERR "Failed to suspend some devices.\n");
365 } else {
366 error = disable_nonboot_cpus();
367 if (!error) {
368 /* Enter S3, system is already frozen */
369 suspend_enter(PM_SUSPEND_MEM);
370 enable_nonboot_cpus();
371 }
372 /* Wake up devices */
373 device_resume();
374 }
375 resume_console();
376 if (pm_ops->finish)
377 pm_ops->finish(PM_SUSPEND_MEM);
378
379 OutS3:
380 mutex_unlock(&pm_mutex); 271 mutex_unlock(&pm_mutex);
381 break; 272 break;
382 273
@@ -386,19 +277,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
386 switch (arg) { 277 switch (arg) {
387 278
388 case PMOPS_PREPARE: 279 case PMOPS_PREPARE:
389 if (hibernation_ops) { 280 data->platform_suspend = 1;
390 data->platform_suspend = 1; 281 error = 0;
391 error = 0;
392 } else {
393 error = -ENOSYS;
394 }
395 break; 282 break;
396 283
397 case PMOPS_ENTER: 284 case PMOPS_ENTER:
398 if (data->platform_suspend) { 285 if (data->platform_suspend)
399 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 286 error = hibernation_platform_enter();
400 error = hibernation_ops->enter(); 287
401 }
402 break; 288 break;
403 289
404 case PMOPS_FINISH: 290 case PMOPS_FINISH:
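
Editor's note: the reworked SNAPSHOT_FREEZE path pairs every PM_HIBERNATION_PREPARE notification with a PM_POST_HIBERNATION one as soon as anything later fails, so listeners never see an unbalanced "prepare". A minimal sketch of that unwind pattern (the notifier and freeze functions are hypothetical stand-ins, and the failure is simulated):

        #include <stdio.h>

        enum { PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION };

        static int notifier_call_chain_demo(int event)
        {
                printf("notify: %s\n", event == PM_HIBERNATION_PREPARE ?
                       "PM_HIBERNATION_PREPARE" : "PM_POST_HIBERNATION");
                return 0;               /* pretend all listeners are happy */
        }

        static int freeze_processes_demo(void) { return -1; /* simulate failure */ }
        static void thaw_processes_demo(void)  { printf("thawing processes\n"); }

        /* Every successful PREPARE is balanced by a POST notification if
         * freezing (or anything after it) fails. */
        static int snapshot_freeze_demo(void)
        {
                int error;

                error = notifier_call_chain_demo(PM_HIBERNATION_PREPARE);
                if (!error) {
                        error = freeze_processes_demo();
                        if (error)
                                thaw_processes_demo();
                }
                if (error)
                        notifier_call_chain_demo(PM_POST_HIBERNATION);
                return error;
        }

        int main(void)
        {
                printf("freeze result: %d\n", snapshot_freeze_demo());
                return 0;
        }
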
diff --git a/kernel/printk.c b/kernel/printk.c
index 051d27e36a6c..8451dfc31d25 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -732,7 +732,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
732 return 0; 732 return 0;
733} 733}
734 734
735int __init update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) 735int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
736{ 736{
737 struct console_cmdline *c; 737 struct console_cmdline *c;
738 int i; 738 int i;
@@ -1083,6 +1083,19 @@ int unregister_console(struct console *console)
1083} 1083}
1084EXPORT_SYMBOL(unregister_console); 1084EXPORT_SYMBOL(unregister_console);
1085 1085
1086static int __init disable_boot_consoles(void)
1087{
1088 if (console_drivers != NULL) {
1089 if (console_drivers->flags & CON_BOOT) {
1090 printk(KERN_INFO "turn off boot console %s%d\n",
1091 console_drivers->name, console_drivers->index);
1092 return unregister_console(console_drivers);
1093 }
1094 }
1095 return 0;
1096}
1097late_initcall(disable_boot_consoles);
1098
1086/** 1099/**
1087 * tty_write_message - write a message to a certain tty, not just the console. 1100 * tty_write_message - write a message to a certain tty, not just the console.
1088 * @tty: the destination tty_struct 1101 * @tty: the destination tty_struct
diff --git a/kernel/profile.c b/kernel/profile.c
index 5b20fe977bed..cb1e37d2dac3 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -199,11 +199,11 @@ EXPORT_SYMBOL_GPL(register_timer_hook);
199EXPORT_SYMBOL_GPL(unregister_timer_hook); 199EXPORT_SYMBOL_GPL(unregister_timer_hook);
200EXPORT_SYMBOL_GPL(task_handoff_register); 200EXPORT_SYMBOL_GPL(task_handoff_register);
201EXPORT_SYMBOL_GPL(task_handoff_unregister); 201EXPORT_SYMBOL_GPL(task_handoff_unregister);
202EXPORT_SYMBOL_GPL(profile_event_register);
203EXPORT_SYMBOL_GPL(profile_event_unregister);
202 204
203#endif /* CONFIG_PROFILING */ 205#endif /* CONFIG_PROFILING */
204 206
205EXPORT_SYMBOL_GPL(profile_event_register);
206EXPORT_SYMBOL_GPL(profile_event_unregister);
207 207
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
209/* 209/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4a1745f1dadf..3eca7a55f2ee 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -142,7 +142,7 @@ static int may_attach(struct task_struct *task)
142 return -EPERM; 142 return -EPERM;
143 smp_rmb(); 143 smp_rmb();
144 if (task->mm) 144 if (task->mm)
145 dumpable = task->mm->dumpable; 145 dumpable = get_dumpable(task->mm);
146 if (!dumpable && !capable(CAP_SYS_PTRACE)) 146 if (!dumpable && !capable(CAP_SYS_PTRACE))
147 return -EPERM; 147 return -EPERM;
148 148
@@ -233,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
233 233
234 /* Architecture-specific hardware disable .. */ 234 /* Architecture-specific hardware disable .. */
235 ptrace_disable(child); 235 ptrace_disable(child);
236 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
236 237
237 write_lock_irq(&tasklist_lock); 238 write_lock_irq(&tasklist_lock);
238 /* protect against de_thread()->release_task() */ 239 /* protect against de_thread()->release_task() */
diff --git a/kernel/relay.c b/kernel/relay.c
index a615a8f513fc..ad855017bc59 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Public API and common code for kernel->userspace relay file support. 2 * Public API and common code for kernel->userspace relay file support.
3 * 3 *
4 * See Documentation/filesystems/relayfs.txt for an overview of relayfs. 4 * See Documentation/filesystems/relay.txt for an overview.
5 * 5 *
6 * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp 6 * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
7 * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) 7 * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
@@ -80,7 +80,7 @@ static struct vm_operations_struct relay_file_mmap_ops = {
80 * 80 *
81 * Caller should already have grabbed mmap_sem. 81 * Caller should already have grabbed mmap_sem.
82 */ 82 */
83int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) 83static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
84{ 84{
85 unsigned long length = vma->vm_end - vma->vm_start; 85 unsigned long length = vma->vm_end - vma->vm_start;
86 struct file *filp = vma->vm_file; 86 struct file *filp = vma->vm_file;
@@ -145,7 +145,7 @@ depopulate:
145 * 145 *
146 * Returns channel buffer if successful, %NULL otherwise. 146 * Returns channel buffer if successful, %NULL otherwise.
147 */ 147 */
148struct rchan_buf *relay_create_buf(struct rchan *chan) 148static struct rchan_buf *relay_create_buf(struct rchan *chan)
149{ 149{
150 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); 150 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
151 if (!buf) 151 if (!buf)
@@ -175,7 +175,7 @@ free_buf:
175 * 175 *
176 * Should only be called from kref_put(). 176 * Should only be called from kref_put().
177 */ 177 */
178void relay_destroy_channel(struct kref *kref) 178static void relay_destroy_channel(struct kref *kref)
179{ 179{
180 struct rchan *chan = container_of(kref, struct rchan, kref); 180 struct rchan *chan = container_of(kref, struct rchan, kref);
181 kfree(chan); 181 kfree(chan);
@@ -185,7 +185,7 @@ void relay_destroy_channel(struct kref *kref)
185 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer 185 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
186 * @buf: the buffer struct 186 * @buf: the buffer struct
187 */ 187 */
188void relay_destroy_buf(struct rchan_buf *buf) 188static void relay_destroy_buf(struct rchan_buf *buf)
189{ 189{
190 struct rchan *chan = buf->chan; 190 struct rchan *chan = buf->chan;
191 unsigned int i; 191 unsigned int i;
@@ -210,7 +210,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
210 * rchan_buf_struct and the channel buffer. Should only be called from 210 * rchan_buf_struct and the channel buffer. Should only be called from
211 * kref_put(). 211 * kref_put().
212 */ 212 */
213void relay_remove_buf(struct kref *kref) 213static void relay_remove_buf(struct kref *kref)
214{ 214{
215 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); 215 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
216 buf->chan->cb->remove_buf_file(buf->dentry); 216 buf->chan->cb->remove_buf_file(buf->dentry);
@@ -223,11 +223,10 @@ void relay_remove_buf(struct kref *kref)
223 * 223 *
224 * Returns 1 if the buffer is empty, 0 otherwise. 224 * Returns 1 if the buffer is empty, 0 otherwise.
225 */ 225 */
226int relay_buf_empty(struct rchan_buf *buf) 226static int relay_buf_empty(struct rchan_buf *buf)
227{ 227{
228 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; 228 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
229} 229}
230EXPORT_SYMBOL_GPL(relay_buf_empty);
231 230
232/** 231/**
233 * relay_buf_full - boolean, is the channel buffer full? 232 * relay_buf_full - boolean, is the channel buffer full?
@@ -427,6 +426,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
427 426
428free_buf: 427free_buf:
429 relay_destroy_buf(buf); 428 relay_destroy_buf(buf);
429 buf = NULL;
430free_name: 430free_name:
431 kfree(tmpname); 431 kfree(tmpname);
432end: 432end:
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 9a87886b022e..1ec620c03064 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -20,7 +20,7 @@ void down_read(struct rw_semaphore *sem)
20 might_sleep(); 20 might_sleep();
21 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 21 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
22 22
23 __down_read(sem); 23 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
24} 24}
25 25
26EXPORT_SYMBOL(down_read); 26EXPORT_SYMBOL(down_read);
@@ -47,7 +47,7 @@ void down_write(struct rw_semaphore *sem)
47 might_sleep(); 47 might_sleep();
48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
49 49
50 __down_write(sem); 50 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
51} 51}
52 52
53EXPORT_SYMBOL(down_write); 53EXPORT_SYMBOL(down_write);
@@ -111,7 +111,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
111 might_sleep(); 111 might_sleep();
112 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); 112 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
113 113
114 __down_read(sem); 114 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
115} 115}
116 116
117EXPORT_SYMBOL(down_read_nested); 117EXPORT_SYMBOL(down_read_nested);
@@ -130,7 +130,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
130 might_sleep(); 130 might_sleep();
131 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 131 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
132 132
133 __down_write_nested(sem, subclass); 133 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
134} 134}
135 135
136EXPORT_SYMBOL(down_write_nested); 136EXPORT_SYMBOL(down_write_nested);
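
Editor's note: LOCK_CONTENDED() used above is the lock-statistics hook: it attempts the trylock fast path first and only falls back to the blocking acquire (recording a contention event) when that fails. A rough pthreads analog of the same shape (the contention counter is only for illustration; this is not the kernel macro):

        #include <pthread.h>
        #include <stdio.h>

        static pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER;
        static unsigned long contended;      /* stands in for lock_stat accounting */

        /* Same shape as LOCK_CONTENDED(sem, __down_write_trylock, __down_write). */
        static void down_write_demo(pthread_rwlock_t *s)
        {
                if (pthread_rwlock_trywrlock(s) == 0)
                        return;                 /* fast path: uncontended */
                contended++;                    /* slow path: note the contention */
                pthread_rwlock_wrlock(s);       /* then block for the lock */
        }

        int main(void)
        {
                down_write_demo(&sem);          /* uncontended: trylock succeeds */
                pthread_rwlock_unlock(&sem);

                printf("contended acquisitions: %lu\n", contended);
                return 0;
        }

Build with -lpthread; with a second thread holding the lock, the slow path and the counter come into play.
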
diff --git a/kernel/sched.c b/kernel/sched.c
index cb31fb4a1379..6c10fa796ca0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -53,6 +53,7 @@
53#include <linux/percpu.h> 53#include <linux/percpu.h>
54#include <linux/kthread.h> 54#include <linux/kthread.h>
55#include <linux/seq_file.h> 55#include <linux/seq_file.h>
56#include <linux/sysctl.h>
56#include <linux/syscalls.h> 57#include <linux/syscalls.h>
57#include <linux/times.h> 58#include <linux/times.h>
58#include <linux/tsacct_kern.h> 59#include <linux/tsacct_kern.h>
@@ -60,6 +61,7 @@
60#include <linux/delayacct.h> 61#include <linux/delayacct.h>
61#include <linux/reciprocal_div.h> 62#include <linux/reciprocal_div.h>
62#include <linux/unistd.h> 63#include <linux/unistd.h>
64#include <linux/pagemap.h>
63 65
64#include <asm/tlb.h> 66#include <asm/tlb.h>
65 67
@@ -261,9 +263,9 @@ struct rq {
261 s64 clock_max_delta; 263 s64 clock_max_delta;
262 264
263 unsigned int clock_warps, clock_overflows; 265 unsigned int clock_warps, clock_overflows;
264 unsigned int clock_unstable_events; 266 u64 idle_clock;
265 267 unsigned int clock_deep_idle_events;
266 struct sched_class *load_balance_class; 268 u64 tick_timestamp;
267 269
268 atomic_t nr_iowait; 270 atomic_t nr_iowait;
269 271
@@ -301,7 +303,7 @@ struct rq {
301 struct lock_class_key rq_lock_key; 303 struct lock_class_key rq_lock_key;
302}; 304};
303 305
304static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 306static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
305static DEFINE_MUTEX(sched_hotcpu_mutex); 307static DEFINE_MUTEX(sched_hotcpu_mutex);
306 308
307static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 309static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
@@ -319,15 +321,19 @@ static inline int cpu_of(struct rq *rq)
319} 321}
320 322
321/* 323/*
322 * Per-runqueue clock, as finegrained as the platform can give us: 324 * Update the per-runqueue clock, as finegrained as the platform can give
325 * us, but without assuming monotonicity, etc.:
323 */ 326 */
324static unsigned long long __rq_clock(struct rq *rq) 327static void __update_rq_clock(struct rq *rq)
325{ 328{
326 u64 prev_raw = rq->prev_clock_raw; 329 u64 prev_raw = rq->prev_clock_raw;
327 u64 now = sched_clock(); 330 u64 now = sched_clock();
328 s64 delta = now - prev_raw; 331 s64 delta = now - prev_raw;
329 u64 clock = rq->clock; 332 u64 clock = rq->clock;
330 333
334#ifdef CONFIG_SCHED_DEBUG
335 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
336#endif
331 /* 337 /*
332 * Protect against sched_clock() occasionally going backwards: 338 * Protect against sched_clock() occasionally going backwards:
333 */ 339 */
@@ -338,8 +344,11 @@ static unsigned long long __rq_clock(struct rq *rq)
338 /* 344 /*
339 * Catch too large forward jumps too: 345 * Catch too large forward jumps too:
340 */ 346 */
341 if (unlikely(delta > 2*TICK_NSEC)) { 347 if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) {
342 clock++; 348 if (clock < rq->tick_timestamp + TICK_NSEC)
349 clock = rq->tick_timestamp + TICK_NSEC;
350 else
351 clock++;
343 rq->clock_overflows++; 352 rq->clock_overflows++;
344 } else { 353 } else {
345 if (unlikely(delta > rq->clock_max_delta)) 354 if (unlikely(delta > rq->clock_max_delta))
@@ -350,18 +359,12 @@ static unsigned long long __rq_clock(struct rq *rq)
350 359
351 rq->prev_clock_raw = now; 360 rq->prev_clock_raw = now;
352 rq->clock = clock; 361 rq->clock = clock;
353
354 return clock;
355} 362}
356 363
357static inline unsigned long long rq_clock(struct rq *rq) 364static void update_rq_clock(struct rq *rq)
358{ 365{
359 int this_cpu = smp_processor_id(); 366 if (likely(smp_processor_id() == cpu_of(rq)))
360 367 __update_rq_clock(rq);
361 if (this_cpu == cpu_of(rq))
362 return __rq_clock(rq);
363
364 return rq->clock;
365} 368}
366 369
367/* 370/*
@@ -379,6 +382,25 @@ static inline unsigned long long rq_clock(struct rq *rq)
379#define task_rq(p) cpu_rq(task_cpu(p)) 382#define task_rq(p) cpu_rq(task_cpu(p))
380#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 383#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
381 384
385/*
386 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
387 * clock constructed from sched_clock():
388 */
389unsigned long long cpu_clock(int cpu)
390{
391 unsigned long long now;
392 unsigned long flags;
393 struct rq *rq;
394
395 local_irq_save(flags);
396 rq = cpu_rq(cpu);
397 update_rq_clock(rq);
398 now = rq->clock;
399 local_irq_restore(flags);
400
401 return now;
402}
403
382#ifdef CONFIG_FAIR_GROUP_SCHED 404#ifdef CONFIG_FAIR_GROUP_SCHED
383/* Change a task's ->cfs_rq if it moves across CPUs */ 405/* Change a task's ->cfs_rq if it moves across CPUs */
384static inline void set_task_cfs_rq(struct task_struct *p) 406static inline void set_task_cfs_rq(struct task_struct *p)
@@ -536,18 +558,40 @@ static inline struct rq *this_rq_lock(void)
536} 558}
537 559
538/* 560/*
539 * CPU frequency is/was unstable - start new by setting prev_clock_raw: 561 * We are going deep-idle (irqs are disabled):
540 */ 562 */
541void sched_clock_unstable_event(void) 563void sched_clock_idle_sleep_event(void)
542{ 564{
543 unsigned long flags; 565 struct rq *rq = cpu_rq(smp_processor_id());
544 struct rq *rq;
545 566
546 rq = task_rq_lock(current, &flags); 567 spin_lock(&rq->lock);
547 rq->prev_clock_raw = sched_clock(); 568 __update_rq_clock(rq);
548 rq->clock_unstable_events++; 569 spin_unlock(&rq->lock);
549 task_rq_unlock(rq, &flags); 570 rq->clock_deep_idle_events++;
550} 571}
572EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
573
574/*
575 * We just idled delta nanoseconds (called with irqs disabled):
576 */
577void sched_clock_idle_wakeup_event(u64 delta_ns)
578{
579 struct rq *rq = cpu_rq(smp_processor_id());
580 u64 now = sched_clock();
581
582 rq->idle_clock += delta_ns;
583 /*
584 * Override the previous timestamp and ignore all
585 * sched_clock() deltas that occurred while we idled,

586 * and use the PM-provided delta_ns to advance the
587 * rq clock:
588 */
589 spin_lock(&rq->lock);
590 rq->prev_clock_raw = now;
591 rq->clock += delta_ns;
592 spin_unlock(&rq->lock);
593}
594EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
551 595
552/* 596/*
553 * resched_task - mark a task 'to be rescheduled now'. 597 * resched_task - mark a task 'to be rescheduled now'.
@@ -622,27 +666,31 @@ static u64 div64_likely32(u64 divident, unsigned long divisor)
622 666
623#define WMULT_SHIFT 32 667#define WMULT_SHIFT 32
624 668
625static inline unsigned long 669/*
670 * Shift right and round:
671 */
672#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
673
674static unsigned long
626calc_delta_mine(unsigned long delta_exec, unsigned long weight, 675calc_delta_mine(unsigned long delta_exec, unsigned long weight,
627 struct load_weight *lw) 676 struct load_weight *lw)
628{ 677{
629 u64 tmp; 678 u64 tmp;
630 679
631 if (unlikely(!lw->inv_weight)) 680 if (unlikely(!lw->inv_weight))
632 lw->inv_weight = WMULT_CONST / lw->weight; 681 lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
633 682
634 tmp = (u64)delta_exec * weight; 683 tmp = (u64)delta_exec * weight;
635 /* 684 /*
636 * Check whether we'd overflow the 64-bit multiplication: 685 * Check whether we'd overflow the 64-bit multiplication:
637 */ 686 */
638 if (unlikely(tmp > WMULT_CONST)) { 687 if (unlikely(tmp > WMULT_CONST))
639 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) 688 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
640 >> (WMULT_SHIFT/2); 689 WMULT_SHIFT/2);
641 } else { 690 else
642 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; 691 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
643 }
644 692
645 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit); 693 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
646} 694}
647 695
648static inline unsigned long 696static inline unsigned long
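
Editor's note: SRR() introduced above is a shift-right-with-rounding: it adds half of the divisor (1 << (y-1)) before shifting, so the result rounds to nearest instead of always truncating, which keeps the weight arithmetic in calc_delta_mine() from drifting low. A quick demonstration of the difference:

        #include <stdio.h>

        /* Shift right and round (same definition as in the patch). */
        #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

        int main(void)
        {
                unsigned long samples[] = { 7, 8, 9, 1023, 1024, 1536 };

                for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                        unsigned long x = samples[i];

                        /* dividing by 2^10 = 1024: truncation vs round-to-nearest */
                        printf("%5lu >> 10 = %lu,  SRR(%5lu, 10) = %lu\n",
                               x, x >> 10, x, SRR(x, 10));
                }
                return 0;
        }
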
@@ -663,46 +711,6 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec)
663 lw->inv_weight = 0; 711 lw->inv_weight = 0;
664} 712}
665 713
666static void __update_curr_load(struct rq *rq, struct load_stat *ls)
667{
668 if (rq->curr != rq->idle && ls->load.weight) {
669 ls->delta_exec += ls->delta_stat;
670 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
671 ls->delta_stat = 0;
672 }
673}
674
675/*
676 * Update delta_exec, delta_fair fields for rq.
677 *
678 * delta_fair clock advances at a rate inversely proportional to
679 * total load (rq->ls.load.weight) on the runqueue, while
680 * delta_exec advances at the same rate as wall-clock (provided
681 * cpu is not idle).
682 *
683 * delta_exec / delta_fair is a measure of the (smoothened) load on this
684 * runqueue over any given interval. This (smoothened) load is used
685 * during load balance.
686 *
687 * This function is called /before/ updating rq->ls.load
688 * and when switching tasks.
689 */
690static void update_curr_load(struct rq *rq, u64 now)
691{
692 struct load_stat *ls = &rq->ls;
693 u64 start;
694
695 start = ls->load_update_start;
696 ls->load_update_start = now;
697 ls->delta_stat += now - start;
698 /*
699 * Stagger updates to ls->delta_fair. Very frequent updates
700 * can be expensive.
701 */
702 if (ls->delta_stat >= sysctl_sched_stat_granularity)
703 __update_curr_load(rq, ls);
704}
705
706/* 714/*
707 * To aid in avoiding the subversion of "niceness" due to uneven distribution 715 * To aid in avoiding the subversion of "niceness" due to uneven distribution
708 * of tasks with abnormal "nice" values across CPUs the contribution that 716 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -712,19 +720,6 @@ static void update_curr_load(struct rq *rq, u64 now)
712 * slice expiry etc. 720 * slice expiry etc.
713 */ 721 */
714 722
715/*
716 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
717 * If static_prio_timeslice() is ever changed to break this assumption then
718 * this code will need modification
719 */
720#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
721#define load_weight(lp) \
722 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
723#define PRIO_TO_LOAD_WEIGHT(prio) \
724 load_weight(static_prio_timeslice(prio))
725#define RTPRIO_TO_LOAD_WEIGHT(rp) \
726 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
727
728#define WEIGHT_IDLEPRIO 2 723#define WEIGHT_IDLEPRIO 2
729#define WMULT_IDLEPRIO (1 << 31) 724#define WMULT_IDLEPRIO (1 << 31)
730 725
@@ -741,11 +736,14 @@ static void update_curr_load(struct rq *rq, u64 now)
741 * the relative distance between them is ~25%.) 736 * the relative distance between them is ~25%.)
742 */ 737 */
743static const int prio_to_weight[40] = { 738static const int prio_to_weight[40] = {
744/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, 739 /* -20 */ 88761, 71755, 56483, 46273, 36291,
745/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, 740 /* -15 */ 29154, 23254, 18705, 14949, 11916,
746/* 0 */ NICE_0_LOAD /* 1024 */, 741 /* -10 */ 9548, 7620, 6100, 4904, 3906,
747/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, 742 /* -5 */ 3121, 2501, 1991, 1586, 1277,
748/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, 743 /* 0 */ 1024, 820, 655, 526, 423,
744 /* 5 */ 335, 272, 215, 172, 137,
745 /* 10 */ 110, 87, 70, 56, 45,
746 /* 15 */ 36, 29, 23, 18, 15,
749}; 747};
750 748
751/* 749/*
@@ -756,42 +754,16 @@ static const int prio_to_weight[40] = {
756 * into multiplications: 754 * into multiplications:
757 */ 755 */
758static const u32 prio_to_wmult[40] = { 756static const u32 prio_to_wmult[40] = {
759/* -20 */ 48356, 60446, 75558, 94446, 118058, 757 /* -20 */ 48388, 59856, 76040, 92818, 118348,
760/* -15 */ 147573, 184467, 230589, 288233, 360285, 758 /* -15 */ 147320, 184698, 229616, 287308, 360437,
761/* -10 */ 450347, 562979, 703746, 879575, 1099582, 759 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
762/* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443, 760 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
763/* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518, 761 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
764/* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126, 762 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
765/* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717, 763 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
766/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 764 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
767}; 765};
768 766
769static inline void
770inc_load(struct rq *rq, const struct task_struct *p, u64 now)
771{
772 update_curr_load(rq, now);
773 update_load_add(&rq->ls.load, p->se.load.weight);
774}
775
776static inline void
777dec_load(struct rq *rq, const struct task_struct *p, u64 now)
778{
779 update_curr_load(rq, now);
780 update_load_sub(&rq->ls.load, p->se.load.weight);
781}
782
783static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
784{
785 rq->nr_running++;
786 inc_load(rq, p, now);
787}
788
789static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
790{
791 rq->nr_running--;
792 dec_load(rq, p, now);
793}
794
795static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); 767static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
796 768
797/* 769/*
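
Editor's note: the regenerated tables keep two CFS invariants: each nice level is roughly 25% "heavier" than the next, and prio_to_wmult[i] is approximately 2^32 / prio_to_weight[i], so a division by a weight can be replaced by a multiply and a shift. A short check of both properties over a few values quoted in the hunks above (nice -2 through +2):

        #include <stdio.h>

        /* Entries copied from the new tables for nice levels -2 .. +2. */
        static const int weight[] = { 1586, 1277, 1024, 820, 655 };
        static const unsigned int wmult[] =
                { 2708050, 3363326, 4194304, 5237765, 6557202 };

        int main(void)
        {
                for (int i = 0; i < 5; i++) {
                        /* rounded 2^32 / weight, as the table was generated */
                        unsigned long long approx =
                                ((1ULL << 32) + weight[i] / 2) / weight[i];

                        printf("weight %4d: 2^32/weight = %7llu (table: %7u)",
                               weight[i], approx, wmult[i]);
                        if (i)  /* ratio to the previous (heavier) nice level */
                                printf("  step %.3f",
                                       (double)weight[i - 1] / weight[i]);
                        printf("\n");
                }
                return 0;
        }
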
@@ -809,8 +781,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
809 unsigned long max_nr_move, unsigned long max_load_move, 781 unsigned long max_nr_move, unsigned long max_load_move,
810 struct sched_domain *sd, enum cpu_idle_type idle, 782 struct sched_domain *sd, enum cpu_idle_type idle,
811 int *all_pinned, unsigned long *load_moved, 783 int *all_pinned, unsigned long *load_moved,
812 int this_best_prio, int best_prio, int best_prio_seen, 784 int *this_best_prio, struct rq_iterator *iterator);
813 struct rq_iterator *iterator);
814 785
815#include "sched_stats.h" 786#include "sched_stats.h"
816#include "sched_rt.c" 787#include "sched_rt.c"
@@ -822,9 +793,72 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
822 793
823#define sched_class_highest (&rt_sched_class) 794#define sched_class_highest (&rt_sched_class)
824 795
796static void __update_curr_load(struct rq *rq, struct load_stat *ls)
797{
798 if (rq->curr != rq->idle && ls->load.weight) {
799 ls->delta_exec += ls->delta_stat;
800 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
801 ls->delta_stat = 0;
802 }
803}
804
805/*
806 * Update delta_exec, delta_fair fields for rq.
807 *
808 * delta_fair clock advances at a rate inversely proportional to
809 * total load (rq->ls.load.weight) on the runqueue, while
810 * delta_exec advances at the same rate as wall-clock (provided
811 * cpu is not idle).
812 *
813 * delta_exec / delta_fair is a measure of the (smoothened) load on this
814 * runqueue over any given interval. This (smoothened) load is used
815 * during load balance.
816 *
817 * This function is called /before/ updating rq->ls.load
818 * and when switching tasks.
819 */
820static void update_curr_load(struct rq *rq)
821{
822 struct load_stat *ls = &rq->ls;
823 u64 start;
824
825 start = ls->load_update_start;
826 ls->load_update_start = rq->clock;
827 ls->delta_stat += rq->clock - start;
828 /*
829 * Stagger updates to ls->delta_fair. Very frequent updates
830 * can be expensive.
831 */
832 if (ls->delta_stat >= sysctl_sched_stat_granularity)
833 __update_curr_load(rq, ls);
834}
835
836static inline void inc_load(struct rq *rq, const struct task_struct *p)
837{
838 update_curr_load(rq);
839 update_load_add(&rq->ls.load, p->se.load.weight);
840}
841
842static inline void dec_load(struct rq *rq, const struct task_struct *p)
843{
844 update_curr_load(rq);
845 update_load_sub(&rq->ls.load, p->se.load.weight);
846}
847
848static void inc_nr_running(struct task_struct *p, struct rq *rq)
849{
850 rq->nr_running++;
851 inc_load(rq, p);
852}
853
854static void dec_nr_running(struct task_struct *p, struct rq *rq)
855{
856 rq->nr_running--;
857 dec_load(rq, p);
858}
859
825static void set_load_weight(struct task_struct *p) 860static void set_load_weight(struct task_struct *p)
826{ 861{
827 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
828 p->se.wait_runtime = 0; 862 p->se.wait_runtime = 0;
829 863
830 if (task_has_rt_policy(p)) { 864 if (task_has_rt_policy(p)) {
@@ -846,18 +880,16 @@ static void set_load_weight(struct task_struct *p)
846 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 880 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
847} 881}
848 882
849static void 883static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
850enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
851{ 884{
852 sched_info_queued(p); 885 sched_info_queued(p);
853 p->sched_class->enqueue_task(rq, p, wakeup, now); 886 p->sched_class->enqueue_task(rq, p, wakeup);
854 p->se.on_rq = 1; 887 p->se.on_rq = 1;
855} 888}
856 889
857static void 890static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
858dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
859{ 891{
860 p->sched_class->dequeue_task(rq, p, sleep, now); 892 p->sched_class->dequeue_task(rq, p, sleep);
861 p->se.on_rq = 0; 893 p->se.on_rq = 0;
862} 894}
863 895
@@ -912,13 +944,11 @@ static int effective_prio(struct task_struct *p)
912 */ 944 */
913static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 945static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
914{ 946{
915 u64 now = rq_clock(rq);
916
917 if (p->state == TASK_UNINTERRUPTIBLE) 947 if (p->state == TASK_UNINTERRUPTIBLE)
918 rq->nr_uninterruptible--; 948 rq->nr_uninterruptible--;
919 949
920 enqueue_task(rq, p, wakeup, now); 950 enqueue_task(rq, p, wakeup);
921 inc_nr_running(p, rq, now); 951 inc_nr_running(p, rq);
922} 952}
923 953
924/* 954/*
@@ -926,13 +956,13 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
926 */ 956 */
927static inline void activate_idle_task(struct task_struct *p, struct rq *rq) 957static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
928{ 958{
929 u64 now = rq_clock(rq); 959 update_rq_clock(rq);
930 960
931 if (p->state == TASK_UNINTERRUPTIBLE) 961 if (p->state == TASK_UNINTERRUPTIBLE)
932 rq->nr_uninterruptible--; 962 rq->nr_uninterruptible--;
933 963
934 enqueue_task(rq, p, 0, now); 964 enqueue_task(rq, p, 0);
935 inc_nr_running(p, rq, now); 965 inc_nr_running(p, rq);
936} 966}
937 967
938/* 968/*
@@ -940,13 +970,11 @@ static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
940 */ 970 */
941static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 971static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
942{ 972{
943 u64 now = rq_clock(rq);
944
945 if (p->state == TASK_UNINTERRUPTIBLE) 973 if (p->state == TASK_UNINTERRUPTIBLE)
946 rq->nr_uninterruptible++; 974 rq->nr_uninterruptible++;
947 975
948 dequeue_task(rq, p, sleep, now); 976 dequeue_task(rq, p, sleep);
949 dec_nr_running(p, rq, now); 977 dec_nr_running(p, rq);
950} 978}
951 979
952/** 980/**
@@ -981,18 +1009,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
981 u64 clock_offset, fair_clock_offset; 1009 u64 clock_offset, fair_clock_offset;
982 1010
983 clock_offset = old_rq->clock - new_rq->clock; 1011 clock_offset = old_rq->clock - new_rq->clock;
984 fair_clock_offset = old_rq->cfs.fair_clock - 1012 fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
985 new_rq->cfs.fair_clock; 1013
986 if (p->se.wait_start)
987 p->se.wait_start -= clock_offset;
988 if (p->se.wait_start_fair) 1014 if (p->se.wait_start_fair)
989 p->se.wait_start_fair -= fair_clock_offset; 1015 p->se.wait_start_fair -= fair_clock_offset;
1016 if (p->se.sleep_start_fair)
1017 p->se.sleep_start_fair -= fair_clock_offset;
1018
1019#ifdef CONFIG_SCHEDSTATS
1020 if (p->se.wait_start)
1021 p->se.wait_start -= clock_offset;
990 if (p->se.sleep_start) 1022 if (p->se.sleep_start)
991 p->se.sleep_start -= clock_offset; 1023 p->se.sleep_start -= clock_offset;
992 if (p->se.block_start) 1024 if (p->se.block_start)
993 p->se.block_start -= clock_offset; 1025 p->se.block_start -= clock_offset;
994 if (p->se.sleep_start_fair) 1026#endif
995 p->se.sleep_start_fair -= fair_clock_offset;
996 1027
997 __set_task_cpu(p, new_cpu); 1028 __set_task_cpu(p, new_cpu);
998} 1029}
@@ -1511,6 +1542,7 @@ out_set_cpu:
1511 1542
1512out_activate: 1543out_activate:
1513#endif /* CONFIG_SMP */ 1544#endif /* CONFIG_SMP */
1545 update_rq_clock(rq);
1514 activate_task(rq, p, 1); 1546 activate_task(rq, p, 1);
1515 /* 1547 /*
1516 * Sync wakeups (i.e. those types of wakeups where the waker 1548 * Sync wakeups (i.e. those types of wakeups where the waker
@@ -1553,17 +1585,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1553static void __sched_fork(struct task_struct *p) 1585static void __sched_fork(struct task_struct *p)
1554{ 1586{
1555 p->se.wait_start_fair = 0; 1587 p->se.wait_start_fair = 0;
1556 p->se.wait_start = 0;
1557 p->se.exec_start = 0; 1588 p->se.exec_start = 0;
1558 p->se.sum_exec_runtime = 0; 1589 p->se.sum_exec_runtime = 0;
1590 p->se.prev_sum_exec_runtime = 0;
1559 p->se.delta_exec = 0; 1591 p->se.delta_exec = 0;
1560 p->se.delta_fair_run = 0; 1592 p->se.delta_fair_run = 0;
1561 p->se.delta_fair_sleep = 0; 1593 p->se.delta_fair_sleep = 0;
1562 p->se.wait_runtime = 0; 1594 p->se.wait_runtime = 0;
1595 p->se.sleep_start_fair = 0;
1596
1597#ifdef CONFIG_SCHEDSTATS
1598 p->se.wait_start = 0;
1563 p->se.sum_wait_runtime = 0; 1599 p->se.sum_wait_runtime = 0;
1564 p->se.sum_sleep_runtime = 0; 1600 p->se.sum_sleep_runtime = 0;
1565 p->se.sleep_start = 0; 1601 p->se.sleep_start = 0;
1566 p->se.sleep_start_fair = 0;
1567 p->se.block_start = 0; 1602 p->se.block_start = 0;
1568 p->se.sleep_max = 0; 1603 p->se.sleep_max = 0;
1569 p->se.block_max = 0; 1604 p->se.block_max = 0;
@@ -1571,10 +1606,15 @@ static void __sched_fork(struct task_struct *p)
1571 p->se.wait_max = 0; 1606 p->se.wait_max = 0;
1572 p->se.wait_runtime_overruns = 0; 1607 p->se.wait_runtime_overruns = 0;
1573 p->se.wait_runtime_underruns = 0; 1608 p->se.wait_runtime_underruns = 0;
1609#endif
1574 1610
1575 INIT_LIST_HEAD(&p->run_list); 1611 INIT_LIST_HEAD(&p->run_list);
1576 p->se.on_rq = 0; 1612 p->se.on_rq = 0;
1577 1613
1614#ifdef CONFIG_PREEMPT_NOTIFIERS
1615 INIT_HLIST_HEAD(&p->preempt_notifiers);
1616#endif
1617
1578 /* 1618 /*
1579 * We mark the process as running here, but have not actually 1619 * We mark the process as running here, but have not actually
1580 * inserted it onto the runqueue yet. This guarantees that 1620 * inserted it onto the runqueue yet. This guarantees that
@@ -1639,11 +1679,19 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1639 rq = task_rq_lock(p, &flags); 1679 rq = task_rq_lock(p, &flags);
1640 BUG_ON(p->state != TASK_RUNNING); 1680 BUG_ON(p->state != TASK_RUNNING);
1641 this_cpu = smp_processor_id(); /* parent's CPU */ 1681 this_cpu = smp_processor_id(); /* parent's CPU */
1682 update_rq_clock(rq);
1642 1683
1643 p->prio = effective_prio(p); 1684 p->prio = effective_prio(p);
1644 1685
1645 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || 1686 if (rt_prio(p->prio))
1646 task_cpu(p) != this_cpu || !current->se.on_rq) { 1687 p->sched_class = &rt_sched_class;
1688 else
1689 p->sched_class = &fair_sched_class;
1690
1691 if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
1692 (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
1693 !current->se.on_rq) {
1694
1647 activate_task(rq, p, 0); 1695 activate_task(rq, p, 0);
1648 } else { 1696 } else {
1649 /* 1697 /*
@@ -1651,14 +1699,74 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1651 * management (if any): 1699 * management (if any):
1652 */ 1700 */
1653 p->sched_class->task_new(rq, p); 1701 p->sched_class->task_new(rq, p);
1702 inc_nr_running(p, rq);
1654 } 1703 }
1655 check_preempt_curr(rq, p); 1704 check_preempt_curr(rq, p);
1656 task_rq_unlock(rq, &flags); 1705 task_rq_unlock(rq, &flags);
1657} 1706}
1658 1707
1708#ifdef CONFIG_PREEMPT_NOTIFIERS
1709
1710/**
1711 * preempt_notifier_register - tell me when current is being preempted & rescheduled

1712 * @notifier: notifier struct to register
1713 */
1714void preempt_notifier_register(struct preempt_notifier *notifier)
1715{
1716 hlist_add_head(&notifier->link, &current->preempt_notifiers);
1717}
1718EXPORT_SYMBOL_GPL(preempt_notifier_register);
1719
1720/**
1721 * preempt_notifier_unregister - no longer interested in preemption notifications
1722 * @notifier: notifier struct to unregister
1723 *
1724 * This is safe to call from within a preemption notifier.
1725 */
1726void preempt_notifier_unregister(struct preempt_notifier *notifier)
1727{
1728 hlist_del(&notifier->link);
1729}
1730EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1731
1732static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1733{
1734 struct preempt_notifier *notifier;
1735 struct hlist_node *node;
1736
1737 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1738 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1739}
1740
1741static void
1742fire_sched_out_preempt_notifiers(struct task_struct *curr,
1743 struct task_struct *next)
1744{
1745 struct preempt_notifier *notifier;
1746 struct hlist_node *node;
1747
1748 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1749 notifier->ops->sched_out(notifier, next);
1750}
1751
1752#else
1753
1754static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1755{
1756}
1757
1758static void
1759fire_sched_out_preempt_notifiers(struct task_struct *curr,
1760 struct task_struct *next)
1761{
1762}
1763
1764#endif
1765
1659/** 1766/**
1660 * prepare_task_switch - prepare to switch tasks 1767 * prepare_task_switch - prepare to switch tasks
1661 * @rq: the runqueue preparing to switch 1768 * @rq: the runqueue preparing to switch
1769 * @prev: the current task that is being switched out
1662 * @next: the task we are going to switch to. 1770 * @next: the task we are going to switch to.
1663 * 1771 *
1664 * This is called with the rq lock held and interrupts off. It must 1772 * This is called with the rq lock held and interrupts off. It must
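
Editor's note: the preempt notifier hooks added above amount to a per-task hlist of callback structures: registering links a notifier into the current task's list, and the scheduler walks the list at switch-out and switch-in. A stripped-down single-list analog in plain C (struct and function names are illustrative, not the kernel API):

        #include <stdio.h>

        struct notifier_demo {
                void (*sched_out)(struct notifier_demo *n);
                void (*sched_in)(struct notifier_demo *n);
                struct notifier_demo *next;
        };

        static struct notifier_demo *notifiers;   /* per-task hlist in the kernel */

        static void notifier_register_demo(struct notifier_demo *n)
        {
                n->next = notifiers;              /* hlist_add_head() analog */
                notifiers = n;
        }

        static void fire_sched_out_demo(void)
        {
                for (struct notifier_demo *n = notifiers; n; n = n->next)
                        n->sched_out(n);
        }

        static void fire_sched_in_demo(void)
        {
                for (struct notifier_demo *n = notifiers; n; n = n->next)
                        n->sched_in(n);
        }

        static void my_out(struct notifier_demo *n) { (void)n; printf("scheduled out\n"); }
        static void my_in(struct notifier_demo *n)  { (void)n; printf("running again\n"); }

        int main(void)
        {
                struct notifier_demo n = { .sched_out = my_out, .sched_in = my_in };

                notifier_register_demo(&n);
                fire_sched_out_demo();   /* what prepare_task_switch() triggers */
                fire_sched_in_demo();    /* what finish_task_switch() triggers */
                return 0;
        }
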
@@ -1668,8 +1776,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1668 * prepare_task_switch sets up locking and calls architecture specific 1776 * prepare_task_switch sets up locking and calls architecture specific
1669 * hooks. 1777 * hooks.
1670 */ 1778 */
1671static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) 1779static inline void
1780prepare_task_switch(struct rq *rq, struct task_struct *prev,
1781 struct task_struct *next)
1672{ 1782{
1783 fire_sched_out_preempt_notifiers(prev, next);
1673 prepare_lock_switch(rq, next); 1784 prepare_lock_switch(rq, next);
1674 prepare_arch_switch(next); 1785 prepare_arch_switch(next);
1675} 1786}
@@ -1711,6 +1822,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1711 prev_state = prev->state; 1822 prev_state = prev->state;
1712 finish_arch_switch(prev); 1823 finish_arch_switch(prev);
1713 finish_lock_switch(rq, prev); 1824 finish_lock_switch(rq, prev);
1825 fire_sched_in_preempt_notifiers(current);
1714 if (mm) 1826 if (mm)
1715 mmdrop(mm); 1827 mmdrop(mm);
1716 if (unlikely(prev_state == TASK_DEAD)) { 1828 if (unlikely(prev_state == TASK_DEAD)) {
@@ -1751,7 +1863,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
1751{ 1863{
1752 struct mm_struct *mm, *oldmm; 1864 struct mm_struct *mm, *oldmm;
1753 1865
1754 prepare_task_switch(rq, next); 1866 prepare_task_switch(rq, prev, next);
1755 mm = next->mm; 1867 mm = next->mm;
1756 oldmm = prev->active_mm; 1868 oldmm = prev->active_mm;
1757 /* 1869 /*
@@ -1874,7 +1986,6 @@ static void update_cpu_load(struct rq *this_rq)
1874 unsigned long total_load = this_rq->ls.load.weight; 1986 unsigned long total_load = this_rq->ls.load.weight;
1875 unsigned long this_load = total_load; 1987 unsigned long this_load = total_load;
1876 struct load_stat *ls = &this_rq->ls; 1988 struct load_stat *ls = &this_rq->ls;
1877 u64 now = __rq_clock(this_rq);
1878 int i, scale; 1989 int i, scale;
1879 1990
1880 this_rq->nr_load_updates++; 1991 this_rq->nr_load_updates++;
@@ -1882,7 +1993,7 @@ static void update_cpu_load(struct rq *this_rq)
1882 goto do_avg; 1993 goto do_avg;
1883 1994
1884 /* Update delta_fair/delta_exec fields first */ 1995 /* Update delta_fair/delta_exec fields first */
1885 update_curr_load(this_rq, now); 1996 update_curr_load(this_rq);
1886 1997
1887 fair_delta64 = ls->delta_fair + 1; 1998 fair_delta64 = ls->delta_fair + 1;
1888 ls->delta_fair = 0; 1999 ls->delta_fair = 0;
@@ -1890,8 +2001,8 @@ static void update_cpu_load(struct rq *this_rq)
1890 exec_delta64 = ls->delta_exec + 1; 2001 exec_delta64 = ls->delta_exec + 1;
1891 ls->delta_exec = 0; 2002 ls->delta_exec = 0;
1892 2003
1893 sample_interval64 = now - ls->load_update_last; 2004 sample_interval64 = this_rq->clock - ls->load_update_last;
1894 ls->load_update_last = now; 2005 ls->load_update_last = this_rq->clock;
1895 2006
1896 if ((s64)sample_interval64 < (s64)TICK_NSEC) 2007 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1897 sample_interval64 = TICK_NSEC; 2008 sample_interval64 = TICK_NSEC;
@@ -1946,6 +2057,8 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1946 spin_lock(&rq1->lock); 2057 spin_lock(&rq1->lock);
1947 } 2058 }
1948 } 2059 }
2060 update_rq_clock(rq1);
2061 update_rq_clock(rq2);
1949} 2062}
1950 2063
1951/* 2064/*
@@ -2073,12 +2186,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2073 if (task_running(rq, p)) 2186 if (task_running(rq, p))
2074 return 0; 2187 return 0;
2075 2188
2076 /*
2077 * Aggressive migration if too many balance attempts have failed:
2078 */
2079 if (sd->nr_balance_failed > sd->cache_nice_tries)
2080 return 1;
2081
2082 return 1; 2189 return 1;
2083} 2190}
2084 2191
@@ -2086,8 +2193,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2086 unsigned long max_nr_move, unsigned long max_load_move, 2193 unsigned long max_nr_move, unsigned long max_load_move,
2087 struct sched_domain *sd, enum cpu_idle_type idle, 2194 struct sched_domain *sd, enum cpu_idle_type idle,
2088 int *all_pinned, unsigned long *load_moved, 2195 int *all_pinned, unsigned long *load_moved,
2089 int this_best_prio, int best_prio, int best_prio_seen, 2196 int *this_best_prio, struct rq_iterator *iterator)
2090 struct rq_iterator *iterator)
2091{ 2197{
2092 int pulled = 0, pinned = 0, skip_for_load; 2198 int pulled = 0, pinned = 0, skip_for_load;
2093 struct task_struct *p; 2199 struct task_struct *p;
@@ -2112,12 +2218,8 @@ next:
2112 */ 2218 */
2113 skip_for_load = (p->se.load.weight >> 1) > rem_load_move + 2219 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2114 SCHED_LOAD_SCALE_FUZZ; 2220 SCHED_LOAD_SCALE_FUZZ;
2115 if (skip_for_load && p->prio < this_best_prio) 2221 if ((skip_for_load && p->prio >= *this_best_prio) ||
2116 skip_for_load = !best_prio_seen && p->prio == best_prio;
2117 if (skip_for_load ||
2118 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 2222 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2119
2120 best_prio_seen |= p->prio == best_prio;
2121 p = iterator->next(iterator->arg); 2223 p = iterator->next(iterator->arg);
2122 goto next; 2224 goto next;
2123 } 2225 }
@@ -2131,8 +2233,8 @@ next:
2131 * and the prescribed amount of weighted load. 2233 * and the prescribed amount of weighted load.
2132 */ 2234 */
2133 if (pulled < max_nr_move && rem_load_move > 0) { 2235 if (pulled < max_nr_move && rem_load_move > 0) {
2134 if (p->prio < this_best_prio) 2236 if (p->prio < *this_best_prio)
2135 this_best_prio = p->prio; 2237 *this_best_prio = p->prio;
2136 p = iterator->next(iterator->arg); 2238 p = iterator->next(iterator->arg);
2137 goto next; 2239 goto next;
2138 } 2240 }
@@ -2151,32 +2253,52 @@ out:
2151} 2253}
2152 2254
2153/* 2255/*
2154 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted 2256 * move_tasks tries to move up to max_load_move weighted load from busiest to
2155 * load from busiest to this_rq, as part of a balancing operation within 2257 * this_rq, as part of a balancing operation within domain "sd".
2156 * "domain". Returns the number of tasks moved. 2258 * Returns 1 if successful and 0 otherwise.
2157 * 2259 *
2158 * Called with both runqueues locked. 2260 * Called with both runqueues locked.
2159 */ 2261 */
2160static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2262static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2161 unsigned long max_nr_move, unsigned long max_load_move, 2263 unsigned long max_load_move,
2162 struct sched_domain *sd, enum cpu_idle_type idle, 2264 struct sched_domain *sd, enum cpu_idle_type idle,
2163 int *all_pinned) 2265 int *all_pinned)
2164{ 2266{
2165 struct sched_class *class = sched_class_highest; 2267 struct sched_class *class = sched_class_highest;
2166 unsigned long load_moved, total_nr_moved = 0, nr_moved; 2268 unsigned long total_load_moved = 0;
2167 long rem_load_move = max_load_move; 2269 int this_best_prio = this_rq->curr->prio;
2168 2270
2169 do { 2271 do {
2170 nr_moved = class->load_balance(this_rq, this_cpu, busiest, 2272 total_load_moved +=
2171 max_nr_move, (unsigned long)rem_load_move, 2273 class->load_balance(this_rq, this_cpu, busiest,
2172 sd, idle, all_pinned, &load_moved); 2274 ULONG_MAX, max_load_move - total_load_moved,
2173 total_nr_moved += nr_moved; 2275 sd, idle, all_pinned, &this_best_prio);
2174 max_nr_move -= nr_moved;
2175 rem_load_move -= load_moved;
2176 class = class->next; 2276 class = class->next;
2177 } while (class && max_nr_move && rem_load_move > 0); 2277 } while (class && max_load_move > total_load_moved);
2278
2279 return total_load_moved > 0;
2280}
2281
2282/*
2283 * move_one_task tries to move exactly one task from busiest to this_rq, as
2284 * part of active balancing operations within "domain".
2285 * Returns 1 if successful and 0 otherwise.
2286 *
2287 * Called with both runqueues locked.
2288 */
2289static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2290 struct sched_domain *sd, enum cpu_idle_type idle)
2291{
2292 struct sched_class *class;
2293 int this_best_prio = MAX_PRIO;
2294
2295 for (class = sched_class_highest; class; class = class->next)
2296 if (class->load_balance(this_rq, this_cpu, busiest,
2297 1, ULONG_MAX, sd, idle, NULL,
2298 &this_best_prio))
2299 return 1;
2178 2300
2179 return total_nr_moved; 2301 return 0;
2180} 2302}
2181 2303
2182/* 2304/*
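
Editor's note: move_tasks() now walks the scheduling classes from highest priority downwards and keeps pulling weighted load from each until the requested amount is covered, returning a simple moved/not-moved flag instead of a task count. A toy version of that accumulation loop (the classes and their available load are made up):

        #include <stdio.h>

        struct sched_class_demo {
                const char *name;
                unsigned long available;       /* load this class could migrate */
                const struct sched_class_demo *next;
        };

        /* Pull as much as possible from one class, capped by what is still needed. */
        static unsigned long class_load_balance(const struct sched_class_demo *class,
                                                unsigned long still_needed)
        {
                unsigned long moved = class->available < still_needed ?
                                      class->available : still_needed;

                printf("%s: moved %lu\n", class->name, moved);
                return moved;
        }

        int main(void)
        {
                struct sched_class_demo fair = { "fair_sched_class", 900, NULL };
                struct sched_class_demo rt   = { "rt_sched_class",   300, &fair };
                const struct sched_class_demo *class = &rt;    /* highest first */
                unsigned long max_load_move = 1000, total_load_moved = 0;

                do {
                        total_load_moved += class_load_balance(class,
                                        max_load_move - total_load_moved);
                        class = class->next;
                } while (class && max_load_move > total_load_moved);

                printf("balanced: %d\n", total_load_moved > 0);
                return 0;
        }
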
@@ -2235,7 +2357,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2235 2357
2236 rq = cpu_rq(i); 2358 rq = cpu_rq(i);
2237 2359
2238 if (*sd_idle && !idle_cpu(i)) 2360 if (*sd_idle && rq->nr_running)
2239 *sd_idle = 0; 2361 *sd_idle = 0;
2240 2362
2241 /* Bias balancing toward cpus of our domain */ 2363 /* Bias balancing toward cpus of our domain */
@@ -2257,9 +2379,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2257 /* 2379 /*
2258 * First idle cpu or the first cpu(busiest) in this sched group 2380 * First idle cpu or the first cpu(busiest) in this sched group
2259 * is eligible for doing load balancing at this and above 2381 * is eligible for doing load balancing at this and above
2260 * domains. 2382 * domains. In the newly idle case, we will allow all the cpu's
2383 * to do the newly idle load balance.
2261 */ 2384 */
2262 if (local_group && balance_cpu != this_cpu && balance) { 2385 if (idle != CPU_NEWLY_IDLE && local_group &&
2386 balance_cpu != this_cpu && balance) {
2263 *balance = 0; 2387 *balance = 0;
2264 goto ret; 2388 goto ret;
2265 } 2389 }
@@ -2393,7 +2517,7 @@ group_next:
2393 * a think about bumping its value to force at least one task to be 2517 * a think about bumping its value to force at least one task to be
2394 * moved 2518 * moved
2395 */ 2519 */
2396 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { 2520 if (*imbalance < busiest_load_per_task) {
2397 unsigned long tmp, pwr_now, pwr_move; 2521 unsigned long tmp, pwr_now, pwr_move;
2398 unsigned int imbn; 2522 unsigned int imbn;
2399 2523
@@ -2445,10 +2569,8 @@ small_imbalance:
2445 pwr_move /= SCHED_LOAD_SCALE; 2569 pwr_move /= SCHED_LOAD_SCALE;
2446 2570
2447 /* Move if we gain throughput */ 2571 /* Move if we gain throughput */
2448 if (pwr_move <= pwr_now) 2572 if (pwr_move > pwr_now)
2449 goto out_balanced; 2573 *imbalance = busiest_load_per_task;
2450
2451 *imbalance = busiest_load_per_task;
2452 } 2574 }
2453 2575
2454 return busiest; 2576 return busiest;
@@ -2506,11 +2628,6 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2506 */ 2628 */
2507#define MAX_PINNED_INTERVAL 512 2629#define MAX_PINNED_INTERVAL 512
2508 2630
2509static inline unsigned long minus_1_or_zero(unsigned long n)
2510{
2511 return n > 0 ? n - 1 : 0;
2512}
2513
2514/* 2631/*
2515 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2632 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2516 * tasks if there is an imbalance. 2633 * tasks if there is an imbalance.
@@ -2519,7 +2636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2519 struct sched_domain *sd, enum cpu_idle_type idle, 2636 struct sched_domain *sd, enum cpu_idle_type idle,
2520 int *balance) 2637 int *balance)
2521{ 2638{
2522 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2639 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2523 struct sched_group *group; 2640 struct sched_group *group;
2524 unsigned long imbalance; 2641 unsigned long imbalance;
2525 struct rq *busiest; 2642 struct rq *busiest;
@@ -2560,18 +2677,17 @@ redo:
2560 2677
2561 schedstat_add(sd, lb_imbalance[idle], imbalance); 2678 schedstat_add(sd, lb_imbalance[idle], imbalance);
2562 2679
2563 nr_moved = 0; 2680 ld_moved = 0;
2564 if (busiest->nr_running > 1) { 2681 if (busiest->nr_running > 1) {
2565 /* 2682 /*
2566 * Attempt to move tasks. If find_busiest_group has found 2683 * Attempt to move tasks. If find_busiest_group has found
2567 * an imbalance but busiest->nr_running <= 1, the group is 2684 * an imbalance but busiest->nr_running <= 1, the group is
2568 * still unbalanced. nr_moved simply stays zero, so it is 2685 * still unbalanced. ld_moved simply stays zero, so it is
2569 * correctly treated as an imbalance. 2686 * correctly treated as an imbalance.
2570 */ 2687 */
2571 local_irq_save(flags); 2688 local_irq_save(flags);
2572 double_rq_lock(this_rq, busiest); 2689 double_rq_lock(this_rq, busiest);
2573 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2690 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2574 minus_1_or_zero(busiest->nr_running),
2575 imbalance, sd, idle, &all_pinned); 2691 imbalance, sd, idle, &all_pinned);
2576 double_rq_unlock(this_rq, busiest); 2692 double_rq_unlock(this_rq, busiest);
2577 local_irq_restore(flags); 2693 local_irq_restore(flags);
@@ -2579,7 +2695,7 @@ redo:
2579 /* 2695 /*
2580 * some other cpu did the load balance for us. 2696 * some other cpu did the load balance for us.
2581 */ 2697 */
2582 if (nr_moved && this_cpu != smp_processor_id()) 2698 if (ld_moved && this_cpu != smp_processor_id())
2583 resched_cpu(this_cpu); 2699 resched_cpu(this_cpu);
2584 2700
2585 /* All tasks on this runqueue were pinned by CPU affinity */ 2701 /* All tasks on this runqueue were pinned by CPU affinity */
@@ -2591,7 +2707,7 @@ redo:
2591 } 2707 }
2592 } 2708 }
2593 2709
2594 if (!nr_moved) { 2710 if (!ld_moved) {
2595 schedstat_inc(sd, lb_failed[idle]); 2711 schedstat_inc(sd, lb_failed[idle]);
2596 sd->nr_balance_failed++; 2712 sd->nr_balance_failed++;
2597 2713
@@ -2640,10 +2756,10 @@ redo:
2640 sd->balance_interval *= 2; 2756 sd->balance_interval *= 2;
2641 } 2757 }
2642 2758
2643 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2759 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2644 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2760 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2645 return -1; 2761 return -1;
2646 return nr_moved; 2762 return ld_moved;
2647 2763
2648out_balanced: 2764out_balanced:
2649 schedstat_inc(sd, lb_balanced[idle]); 2765 schedstat_inc(sd, lb_balanced[idle]);
@@ -2675,8 +2791,9 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2675 struct sched_group *group; 2791 struct sched_group *group;
2676 struct rq *busiest = NULL; 2792 struct rq *busiest = NULL;
2677 unsigned long imbalance; 2793 unsigned long imbalance;
2678 int nr_moved = 0; 2794 int ld_moved = 0;
2679 int sd_idle = 0; 2795 int sd_idle = 0;
2796 int all_pinned = 0;
2680 cpumask_t cpus = CPU_MASK_ALL; 2797 cpumask_t cpus = CPU_MASK_ALL;
2681 2798
2682 /* 2799 /*
@@ -2709,23 +2826,25 @@ redo:
2709 2826
2710 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); 2827 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2711 2828
2712 nr_moved = 0; 2829 ld_moved = 0;
2713 if (busiest->nr_running > 1) { 2830 if (busiest->nr_running > 1) {
2714 /* Attempt to move tasks */ 2831 /* Attempt to move tasks */
2715 double_lock_balance(this_rq, busiest); 2832 double_lock_balance(this_rq, busiest);
2716 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2833 /* this_rq->clock is already updated */
2717 minus_1_or_zero(busiest->nr_running), 2834 update_rq_clock(busiest);
2718 imbalance, sd, CPU_NEWLY_IDLE, NULL); 2835 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2836 imbalance, sd, CPU_NEWLY_IDLE,
2837 &all_pinned);
2719 spin_unlock(&busiest->lock); 2838 spin_unlock(&busiest->lock);
2720 2839
2721 if (!nr_moved) { 2840 if (unlikely(all_pinned)) {
2722 cpu_clear(cpu_of(busiest), cpus); 2841 cpu_clear(cpu_of(busiest), cpus);
2723 if (!cpus_empty(cpus)) 2842 if (!cpus_empty(cpus))
2724 goto redo; 2843 goto redo;
2725 } 2844 }
2726 } 2845 }
2727 2846
2728 if (!nr_moved) { 2847 if (!ld_moved) {
2729 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 2848 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2730 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2849 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2731 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2850 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
@@ -2733,7 +2852,7 @@ redo:
2733 } else 2852 } else
2734 sd->nr_balance_failed = 0; 2853 sd->nr_balance_failed = 0;
2735 2854
2736 return nr_moved; 2855 return ld_moved;
2737 2856
2738out_balanced: 2857out_balanced:
2739 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); 2858 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
@@ -2810,6 +2929,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2810 2929
2811 /* move a task from busiest_rq to target_rq */ 2930 /* move a task from busiest_rq to target_rq */
2812 double_lock_balance(busiest_rq, target_rq); 2931 double_lock_balance(busiest_rq, target_rq);
2932 update_rq_clock(busiest_rq);
2933 update_rq_clock(target_rq);
2813 2934
2814 /* Search for an sd spanning us and the target CPU. */ 2935 /* Search for an sd spanning us and the target CPU. */
2815 for_each_domain(target_cpu, sd) { 2936 for_each_domain(target_cpu, sd) {
@@ -2821,9 +2942,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2821 if (likely(sd)) { 2942 if (likely(sd)) {
2822 schedstat_inc(sd, alb_cnt); 2943 schedstat_inc(sd, alb_cnt);
2823 2944
2824 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, 2945 if (move_one_task(target_rq, target_cpu, busiest_rq,
2825 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE, 2946 sd, CPU_IDLE))
2826 NULL))
2827 schedstat_inc(sd, alb_pushed); 2947 schedstat_inc(sd, alb_pushed);
2828 else 2948 else
2829 schedstat_inc(sd, alb_failed); 2949 schedstat_inc(sd, alb_failed);
@@ -2921,6 +3041,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2921 struct sched_domain *sd; 3041 struct sched_domain *sd;
2922 /* Earliest time when we have to do rebalance again */ 3042 /* Earliest time when we have to do rebalance again */
2923 unsigned long next_balance = jiffies + 60*HZ; 3043 unsigned long next_balance = jiffies + 60*HZ;
3044 int update_next_balance = 0;
2924 3045
2925 for_each_domain(cpu, sd) { 3046 for_each_domain(cpu, sd) {
2926 if (!(sd->flags & SD_LOAD_BALANCE)) 3047 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -2957,8 +3078,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2957 if (sd->flags & SD_SERIALIZE) 3078 if (sd->flags & SD_SERIALIZE)
2958 spin_unlock(&balancing); 3079 spin_unlock(&balancing);
2959out: 3080out:
2960 if (time_after(next_balance, sd->last_balance + interval)) 3081 if (time_after(next_balance, sd->last_balance + interval)) {
2961 next_balance = sd->last_balance + interval; 3082 next_balance = sd->last_balance + interval;
3083 update_next_balance = 1;
3084 }
2962 3085
2963 /* 3086 /*
2964 * Stop the load balance at this level. There is another 3087 * Stop the load balance at this level. There is another
@@ -2968,7 +3091,14 @@ out:
2968 if (!balance) 3091 if (!balance)
2969 break; 3092 break;
2970 } 3093 }
2971 rq->next_balance = next_balance; 3094
3095 /*
3096 * next_balance will be updated only when there is a need.
3097 * When the cpu is attached to null domain for ex, it will not be
3098 * updated.
3099 */
3100 if (likely(update_next_balance))
3101 rq->next_balance = next_balance;
2972} 3102}
2973 3103
2974/* 3104/*
@@ -3007,7 +3137,7 @@ static void run_rebalance_domains(struct softirq_action *h)
3007 if (need_resched()) 3137 if (need_resched())
3008 break; 3138 break;
3009 3139
3010 rebalance_domains(balance_cpu, SCHED_IDLE); 3140 rebalance_domains(balance_cpu, CPU_IDLE);
3011 3141
3012 rq = cpu_rq(balance_cpu); 3142 rq = cpu_rq(balance_cpu);
3013 if (time_after(this_rq->next_balance, rq->next_balance)) 3143 if (time_after(this_rq->next_balance, rq->next_balance))
@@ -3092,8 +3222,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3092 unsigned long max_nr_move, unsigned long max_load_move, 3222 unsigned long max_nr_move, unsigned long max_load_move,
3093 struct sched_domain *sd, enum cpu_idle_type idle, 3223 struct sched_domain *sd, enum cpu_idle_type idle,
3094 int *all_pinned, unsigned long *load_moved, 3224 int *all_pinned, unsigned long *load_moved,
3095 int this_best_prio, int best_prio, int best_prio_seen, 3225 int *this_best_prio, struct rq_iterator *iterator)
3096 struct rq_iterator *iterator)
3097{ 3226{
3098 *load_moved = 0; 3227 *load_moved = 0;
3099 3228
@@ -3119,7 +3248,8 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3119 rq = task_rq_lock(p, &flags); 3248 rq = task_rq_lock(p, &flags);
3120 ns = p->se.sum_exec_runtime; 3249 ns = p->se.sum_exec_runtime;
3121 if (rq->curr == p) { 3250 if (rq->curr == p) {
3122 delta_exec = rq_clock(rq) - p->se.exec_start; 3251 update_rq_clock(rq);
3252 delta_exec = rq->clock - p->se.exec_start;
3123 if ((s64)delta_exec > 0) 3253 if ((s64)delta_exec > 0)
3124 ns += delta_exec; 3254 ns += delta_exec;
3125 } 3255 }
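The hunk above drops the rq_clock() helper from task_sched_runtime(): the runqueue clock is refreshed explicitly via update_rq_clock() and only then is the still-unaccounted slice of the currently running task added to sum_exec_runtime. A minimal userspace sketch of that read path, with shortened field names and made-up values (not kernel code):

#include <stdint.h>
#include <stdio.h>

struct fake_task {
        uint64_t sum_exec_runtime;      /* runtime already accounted, in ns */
        uint64_t exec_start;            /* rq->clock when it went on-CPU */
        int running;                    /* is this task rq->curr right now? */
};

/* Accounted runtime plus the in-flight delta if the task is on the CPU. */
static uint64_t task_runtime(const struct fake_task *p, uint64_t rq_clock_now)
{
        uint64_t ns = p->sum_exec_runtime;

        if (p->running) {
                int64_t delta = (int64_t)(rq_clock_now - p->exec_start);

                if (delta > 0)          /* mirrors the (s64)delta_exec > 0 check */
                        ns += (uint64_t)delta;
        }
        return ns;
}

int main(void)
{
        struct fake_task p = {
                .sum_exec_runtime = 7000000,
                .exec_start = 100000000,
                .running = 1,
        };

        /* 7 ms accounted plus 3 ms currently running = 10 ms total. */
        printf("runtime = %llu ns\n",
               (unsigned long long)task_runtime(&p, 103000000));
        return 0;
}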
@@ -3213,11 +3343,19 @@ void scheduler_tick(void)
3213 int cpu = smp_processor_id(); 3343 int cpu = smp_processor_id();
3214 struct rq *rq = cpu_rq(cpu); 3344 struct rq *rq = cpu_rq(cpu);
3215 struct task_struct *curr = rq->curr; 3345 struct task_struct *curr = rq->curr;
3346 u64 next_tick = rq->tick_timestamp + TICK_NSEC;
3216 3347
3217 spin_lock(&rq->lock); 3348 spin_lock(&rq->lock);
3349 __update_rq_clock(rq);
3350 /*
3351 * Let rq->clock advance by at least TICK_NSEC:
3352 */
3353 if (unlikely(rq->clock < next_tick))
3354 rq->clock = next_tick;
3355 rq->tick_timestamp = rq->clock;
3356 update_cpu_load(rq);
3218 if (curr != rq->idle) /* FIXME: needed? */ 3357 if (curr != rq->idle) /* FIXME: needed? */
3219 curr->sched_class->task_tick(rq, curr); 3358 curr->sched_class->task_tick(rq, curr);
3220 update_cpu_load(rq);
3221 spin_unlock(&rq->lock); 3359 spin_unlock(&rq->lock);
3222 3360
3223#ifdef CONFIG_SMP 3361#ifdef CONFIG_SMP
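scheduler_tick() now refreshes rq->clock under rq->lock and then clamps it so the clock advances by at least TICK_NSEC per tick before update_cpu_load() and the per-class task_tick() hook run. A minimal sketch of that clamping rule in plain C (the 1 ms TICK_NSEC and the struct are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL            /* assumed 1 ms tick, for illustration */

struct fake_rq {
        uint64_t clock;                 /* monotonically advancing ns clock */
        uint64_t tick_timestamp;        /* clock value recorded at the last tick */
};

/* Mirror of the tick-side clamp: let clock advance by at least TICK_NSEC. */
static void tick_clock(struct fake_rq *rq, uint64_t fresh_clock)
{
        uint64_t next_tick = rq->tick_timestamp + TICK_NSEC;

        rq->clock = fresh_clock;
        if (rq->clock < next_tick)
                rq->clock = next_tick;
        rq->tick_timestamp = rq->clock;
}

int main(void)
{
        struct fake_rq rq = { .clock = 0, .tick_timestamp = 0 };

        tick_clock(&rq, 100);           /* raw clock barely moved */
        printf("after slow tick: %llu ns\n", (unsigned long long)rq.clock);

        tick_clock(&rq, 5 * TICK_NSEC); /* raw clock jumped well ahead */
        printf("after fast tick: %llu ns\n", (unsigned long long)rq.clock);
        return 0;
}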
@@ -3299,7 +3437,7 @@ static inline void schedule_debug(struct task_struct *prev)
3299 * Pick up the highest-prio task: 3437 * Pick up the highest-prio task:
3300 */ 3438 */
3301static inline struct task_struct * 3439static inline struct task_struct *
3302pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) 3440pick_next_task(struct rq *rq, struct task_struct *prev)
3303{ 3441{
3304 struct sched_class *class; 3442 struct sched_class *class;
3305 struct task_struct *p; 3443 struct task_struct *p;
@@ -3309,14 +3447,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3309 * the fair class we can call that function directly: 3447 * the fair class we can call that function directly:
3310 */ 3448 */
3311 if (likely(rq->nr_running == rq->cfs.nr_running)) { 3449 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3312 p = fair_sched_class.pick_next_task(rq, now); 3450 p = fair_sched_class.pick_next_task(rq);
3313 if (likely(p)) 3451 if (likely(p))
3314 return p; 3452 return p;
3315 } 3453 }
3316 3454
3317 class = sched_class_highest; 3455 class = sched_class_highest;
3318 for ( ; ; ) { 3456 for ( ; ; ) {
3319 p = class->pick_next_task(rq, now); 3457 p = class->pick_next_task(rq);
3320 if (p) 3458 if (p)
3321 return p; 3459 return p;
3322 /* 3460 /*
@@ -3335,7 +3473,6 @@ asmlinkage void __sched schedule(void)
3335 struct task_struct *prev, *next; 3473 struct task_struct *prev, *next;
3336 long *switch_count; 3474 long *switch_count;
3337 struct rq *rq; 3475 struct rq *rq;
3338 u64 now;
3339 int cpu; 3476 int cpu;
3340 3477
3341need_resched: 3478need_resched:
@@ -3353,6 +3490,7 @@ need_resched_nonpreemptible:
3353 3490
3354 spin_lock_irq(&rq->lock); 3491 spin_lock_irq(&rq->lock);
3355 clear_tsk_need_resched(prev); 3492 clear_tsk_need_resched(prev);
3493 __update_rq_clock(rq);
3356 3494
3357 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3495 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3358 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 3496 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
@@ -3367,9 +3505,8 @@ need_resched_nonpreemptible:
3367 if (unlikely(!rq->nr_running)) 3505 if (unlikely(!rq->nr_running))
3368 idle_balance(cpu, rq); 3506 idle_balance(cpu, rq);
3369 3507
3370 now = __rq_clock(rq); 3508 prev->sched_class->put_prev_task(rq, prev);
3371 prev->sched_class->put_prev_task(rq, prev, now); 3509 next = pick_next_task(rq, prev);
3372 next = pick_next_task(rq, prev, now);
3373 3510
3374 sched_info_switch(prev, next); 3511 sched_info_switch(prev, next);
3375 3512
@@ -3812,17 +3949,16 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3812 unsigned long flags; 3949 unsigned long flags;
3813 int oldprio, on_rq; 3950 int oldprio, on_rq;
3814 struct rq *rq; 3951 struct rq *rq;
3815 u64 now;
3816 3952
3817 BUG_ON(prio < 0 || prio > MAX_PRIO); 3953 BUG_ON(prio < 0 || prio > MAX_PRIO);
3818 3954
3819 rq = task_rq_lock(p, &flags); 3955 rq = task_rq_lock(p, &flags);
3820 now = rq_clock(rq); 3956 update_rq_clock(rq);
3821 3957
3822 oldprio = p->prio; 3958 oldprio = p->prio;
3823 on_rq = p->se.on_rq; 3959 on_rq = p->se.on_rq;
3824 if (on_rq) 3960 if (on_rq)
3825 dequeue_task(rq, p, 0, now); 3961 dequeue_task(rq, p, 0);
3826 3962
3827 if (rt_prio(prio)) 3963 if (rt_prio(prio))
3828 p->sched_class = &rt_sched_class; 3964 p->sched_class = &rt_sched_class;
@@ -3832,7 +3968,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3832 p->prio = prio; 3968 p->prio = prio;
3833 3969
3834 if (on_rq) { 3970 if (on_rq) {
3835 enqueue_task(rq, p, 0, now); 3971 enqueue_task(rq, p, 0);
3836 /* 3972 /*
3837 * Reschedule if we are currently running on this runqueue and 3973 * Reschedule if we are currently running on this runqueue and
3838 * our priority decreased, or if we are not currently running on 3974 * our priority decreased, or if we are not currently running on
@@ -3855,7 +3991,6 @@ void set_user_nice(struct task_struct *p, long nice)
3855 int old_prio, delta, on_rq; 3991 int old_prio, delta, on_rq;
3856 unsigned long flags; 3992 unsigned long flags;
3857 struct rq *rq; 3993 struct rq *rq;
3858 u64 now;
3859 3994
3860 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3995 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3861 return; 3996 return;
@@ -3864,7 +3999,7 @@ void set_user_nice(struct task_struct *p, long nice)
3864 * the task might be in the middle of scheduling on another CPU. 3999 * the task might be in the middle of scheduling on another CPU.
3865 */ 4000 */
3866 rq = task_rq_lock(p, &flags); 4001 rq = task_rq_lock(p, &flags);
3867 now = rq_clock(rq); 4002 update_rq_clock(rq);
3868 /* 4003 /*
3869 * The RT priorities are set via sched_setscheduler(), but we still 4004 * The RT priorities are set via sched_setscheduler(), but we still
3870 * allow the 'normal' nice value to be set - but as expected 4005 * allow the 'normal' nice value to be set - but as expected
@@ -3877,8 +4012,8 @@ void set_user_nice(struct task_struct *p, long nice)
3877 } 4012 }
3878 on_rq = p->se.on_rq; 4013 on_rq = p->se.on_rq;
3879 if (on_rq) { 4014 if (on_rq) {
3880 dequeue_task(rq, p, 0, now); 4015 dequeue_task(rq, p, 0);
3881 dec_load(rq, p, now); 4016 dec_load(rq, p);
3882 } 4017 }
3883 4018
3884 p->static_prio = NICE_TO_PRIO(nice); 4019 p->static_prio = NICE_TO_PRIO(nice);
@@ -3888,8 +4023,8 @@ void set_user_nice(struct task_struct *p, long nice)
3888 delta = p->prio - old_prio; 4023 delta = p->prio - old_prio;
3889 4024
3890 if (on_rq) { 4025 if (on_rq) {
3891 enqueue_task(rq, p, 0, now); 4026 enqueue_task(rq, p, 0);
3892 inc_load(rq, p, now); 4027 inc_load(rq, p);
3893 /* 4028 /*
3894 * If the task increased its priority or is running and 4029 * If the task increased its priority or is running and
3895 * lowered its priority, then reschedule its CPU: 4030 * lowered its priority, then reschedule its CPU:
@@ -4125,6 +4260,7 @@ recheck:
4125 spin_unlock_irqrestore(&p->pi_lock, flags); 4260 spin_unlock_irqrestore(&p->pi_lock, flags);
4126 goto recheck; 4261 goto recheck;
4127 } 4262 }
4263 update_rq_clock(rq);
4128 on_rq = p->se.on_rq; 4264 on_rq = p->se.on_rq;
4129 if (on_rq) 4265 if (on_rq)
4130 deactivate_task(rq, p, 0); 4266 deactivate_task(rq, p, 0);
@@ -4380,10 +4516,8 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4380out_unlock: 4516out_unlock:
4381 read_unlock(&tasklist_lock); 4517 read_unlock(&tasklist_lock);
4382 mutex_unlock(&sched_hotcpu_mutex); 4518 mutex_unlock(&sched_hotcpu_mutex);
4383 if (retval)
4384 return retval;
4385 4519
4386 return 0; 4520 return retval;
4387} 4521}
4388 4522
4389/** 4523/**
@@ -4422,10 +4556,7 @@ asmlinkage long sys_sched_yield(void)
4422 struct rq *rq = this_rq_lock(); 4556 struct rq *rq = this_rq_lock();
4423 4557
4424 schedstat_inc(rq, yld_cnt); 4558 schedstat_inc(rq, yld_cnt);
4425 if (unlikely(rq->nr_running == 1)) 4559 current->sched_class->yield_task(rq, current);
4426 schedstat_inc(rq, yld_act_empty);
4427 else
4428 current->sched_class->yield_task(rq, current);
4429 4560
4430 /* 4561 /*
4431 * Since we are going to call schedule() anyway, there's 4562 * Since we are going to call schedule() anyway, there's
@@ -4781,14 +4912,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4781static inline void sched_init_granularity(void) 4912static inline void sched_init_granularity(void)
4782{ 4913{
4783 unsigned int factor = 1 + ilog2(num_online_cpus()); 4914 unsigned int factor = 1 + ilog2(num_online_cpus());
4784 const unsigned long gran_limit = 100000000; 4915 const unsigned long limit = 100000000;
4916
4917 sysctl_sched_min_granularity *= factor;
4918 if (sysctl_sched_min_granularity > limit)
4919 sysctl_sched_min_granularity = limit;
4785 4920
4786 sysctl_sched_granularity *= factor; 4921 sysctl_sched_latency *= factor;
4787 if (sysctl_sched_granularity > gran_limit) 4922 if (sysctl_sched_latency > limit)
4788 sysctl_sched_granularity = gran_limit; 4923 sysctl_sched_latency = limit;
4789 4924
4790 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; 4925 sysctl_sched_runtime_limit = sysctl_sched_latency;
4791 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; 4926 sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2;
4792} 4927}
4793 4928
4794#ifdef CONFIG_SMP 4929#ifdef CONFIG_SMP
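sched_init_granularity() now scales sysctl_sched_min_granularity and sysctl_sched_latency by 1 + ilog2(num_online_cpus()), caps both at 100000000 ns, and derives the runtime limit and wakeup granularity from them. A small standalone sketch of that arithmetic, using the 2 ms / 20 ms defaults from the sched_fair.c hunks later in this patch (illustrative C, not kernel code):

#include <stdio.h>

#define DEF_MIN_GRANULARITY 2000000UL   /* default from sched_fair.c, ns */
#define DEF_LATENCY        20000000UL   /* default from sched_fair.c, ns */
#define LIMIT             100000000UL   /* cap applied to both values, ns */

static unsigned int ilog2_u(unsigned int n)
{
        unsigned int r = 0;

        while (n >>= 1)
                r++;
        return r;
}

static void show(unsigned int cpus)
{
        unsigned long factor = 1 + ilog2_u(cpus);
        unsigned long min_gran = DEF_MIN_GRANULARITY * factor;
        unsigned long latency = DEF_LATENCY * factor;

        if (min_gran > LIMIT)
                min_gran = LIMIT;
        if (latency > LIMIT)
                latency = LIMIT;

        /* runtime_limit = latency, wakeup_gran = min_gran / 2, as in the hunk */
        printf("%2u CPUs: factor=%lu min_gran=%lu latency=%lu "
               "runtime_limit=%lu wakeup_gran=%lu (ns)\n",
               cpus, factor, min_gran, latency, latency, min_gran / 2);
}

int main(void)
{
        unsigned int cpus[] = { 1, 2, 4, 8, 16 };
        unsigned int i;

        for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++)
                show(cpus[i]);
        return 0;
}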
@@ -4883,6 +5018,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4883 on_rq = p->se.on_rq; 5018 on_rq = p->se.on_rq;
4884 if (on_rq) 5019 if (on_rq)
4885 deactivate_task(rq_src, p, 0); 5020 deactivate_task(rq_src, p, 0);
5021
4886 set_task_cpu(p, dest_cpu); 5022 set_task_cpu(p, dest_cpu);
4887 if (on_rq) { 5023 if (on_rq) {
4888 activate_task(rq_dest, p, 0); 5024 activate_task(rq_dest, p, 0);
@@ -5115,14 +5251,137 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5115 for ( ; ; ) { 5251 for ( ; ; ) {
5116 if (!rq->nr_running) 5252 if (!rq->nr_running)
5117 break; 5253 break;
5118 next = pick_next_task(rq, rq->curr, rq_clock(rq)); 5254 update_rq_clock(rq);
5255 next = pick_next_task(rq, rq->curr);
5119 if (!next) 5256 if (!next)
5120 break; 5257 break;
5121 migrate_dead(dead_cpu, next); 5258 migrate_dead(dead_cpu, next);
5259
5122 } 5260 }
5123} 5261}
5124#endif /* CONFIG_HOTPLUG_CPU */ 5262#endif /* CONFIG_HOTPLUG_CPU */
5125 5263
5264#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5265
5266static struct ctl_table sd_ctl_dir[] = {
5267 {
5268 .procname = "sched_domain",
5269 .mode = 0555,
5270 },
5271 {0,},
5272};
5273
5274static struct ctl_table sd_ctl_root[] = {
5275 {
5276 .ctl_name = CTL_KERN,
5277 .procname = "kernel",
5278 .mode = 0555,
5279 .child = sd_ctl_dir,
5280 },
5281 {0,},
5282};
5283
5284static struct ctl_table *sd_alloc_ctl_entry(int n)
5285{
5286 struct ctl_table *entry =
5287 kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL);
5288
5289 BUG_ON(!entry);
5290 memset(entry, 0, n * sizeof(struct ctl_table));
5291
5292 return entry;
5293}
5294
5295static void
5296set_table_entry(struct ctl_table *entry,
5297 const char *procname, void *data, int maxlen,
5298 mode_t mode, proc_handler *proc_handler)
5299{
5300 entry->procname = procname;
5301 entry->data = data;
5302 entry->maxlen = maxlen;
5303 entry->mode = mode;
5304 entry->proc_handler = proc_handler;
5305}
5306
5307static struct ctl_table *
5308sd_alloc_ctl_domain_table(struct sched_domain *sd)
5309{
5310 struct ctl_table *table = sd_alloc_ctl_entry(14);
5311
5312 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5313 sizeof(long), 0644, proc_doulongvec_minmax);
5314 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5315 sizeof(long), 0644, proc_doulongvec_minmax);
5316 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5317 sizeof(int), 0644, proc_dointvec_minmax);
5318 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5319 sizeof(int), 0644, proc_dointvec_minmax);
5320 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5321 sizeof(int), 0644, proc_dointvec_minmax);
5322 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5323 sizeof(int), 0644, proc_dointvec_minmax);
5324 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5325 sizeof(int), 0644, proc_dointvec_minmax);
5326 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5327 sizeof(int), 0644, proc_dointvec_minmax);
5328 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5329 sizeof(int), 0644, proc_dointvec_minmax);
5330 set_table_entry(&table[10], "cache_nice_tries",
5331 &sd->cache_nice_tries,
5332 sizeof(int), 0644, proc_dointvec_minmax);
5333 set_table_entry(&table[12], "flags", &sd->flags,
5334 sizeof(int), 0644, proc_dointvec_minmax);
5335
5336 return table;
5337}
5338
5339static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5340{
5341 struct ctl_table *entry, *table;
5342 struct sched_domain *sd;
5343 int domain_num = 0, i;
5344 char buf[32];
5345
5346 for_each_domain(cpu, sd)
5347 domain_num++;
5348 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5349
5350 i = 0;
5351 for_each_domain(cpu, sd) {
5352 snprintf(buf, 32, "domain%d", i);
5353 entry->procname = kstrdup(buf, GFP_KERNEL);
5354 entry->mode = 0555;
5355 entry->child = sd_alloc_ctl_domain_table(sd);
5356 entry++;
5357 i++;
5358 }
5359 return table;
5360}
5361
5362static struct ctl_table_header *sd_sysctl_header;
5363static void init_sched_domain_sysctl(void)
5364{
5365 int i, cpu_num = num_online_cpus();
5366 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5367 char buf[32];
5368
5369 sd_ctl_dir[0].child = entry;
5370
5371 for (i = 0; i < cpu_num; i++, entry++) {
5372 snprintf(buf, 32, "cpu%d", i);
5373 entry->procname = kstrdup(buf, GFP_KERNEL);
5374 entry->mode = 0555;
5375 entry->child = sd_alloc_ctl_cpu_table(i);
5376 }
5377 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5378}
5379#else
5380static void init_sched_domain_sysctl(void)
5381{
5382}
5383#endif
5384
5126/* 5385/*
5127 * migration_call - callback that gets triggered when a CPU is added. 5386 * migration_call - callback that gets triggered when a CPU is added.
5128 * Here we can start up the necessary migration thread for the new CPU. 5387 * Here we can start up the necessary migration thread for the new CPU.
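When CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are both set, the block above allocates a ctl_table hierarchy (kernel -> sched_domain -> cpuN -> domainM -> per-field entries) and registers it with register_sysctl_table() from sched_init_smp(). A rough userspace sketch of the naming scheme this produces; the /proc/sys paths and the two-CPU, two-domain layout are assumptions made for illustration:

#include <stdio.h>

/* A few of the per-domain fields set up by sd_alloc_ctl_domain_table(). */
static const char *fields[] = {
        "min_interval", "max_interval", "busy_idx", "idle_idx",
        "busy_factor", "imbalance_pct", "cache_nice_tries", "flags",
};

int main(void)
{
        int cpu, dom;
        unsigned int f;

        for (cpu = 0; cpu < 2; cpu++)
                for (dom = 0; dom < 2; dom++)
                        for (f = 0; f < sizeof(fields) / sizeof(fields[0]); f++)
                                printf("/proc/sys/kernel/sched_domain/"
                                       "cpu%d/domain%d/%s\n",
                                       cpu, dom, fields[f]);
        return 0;
}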
@@ -5179,6 +5438,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5179 rq->migration_thread = NULL; 5438 rq->migration_thread = NULL;
5180 /* Idle task back to normal (off runqueue, low prio) */ 5439 /* Idle task back to normal (off runqueue, low prio) */
5181 rq = task_rq_lock(rq->idle, &flags); 5440 rq = task_rq_lock(rq->idle, &flags);
5441 update_rq_clock(rq);
5182 deactivate_task(rq, rq->idle, 0); 5442 deactivate_task(rq, rq->idle, 0);
5183 rq->idle->static_prio = MAX_PRIO; 5443 rq->idle->static_prio = MAX_PRIO;
5184 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5444 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
@@ -6101,7 +6361,7 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6101} 6361}
6102 6362
6103#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6363#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6104int arch_reinit_sched_domains(void) 6364static int arch_reinit_sched_domains(void)
6105{ 6365{
6106 int err; 6366 int err;
6107 6367
@@ -6130,24 +6390,6 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6130 return ret ? ret : count; 6390 return ret ? ret : count;
6131} 6391}
6132 6392
6133int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6134{
6135 int err = 0;
6136
6137#ifdef CONFIG_SCHED_SMT
6138 if (smt_capable())
6139 err = sysfs_create_file(&cls->kset.kobj,
6140 &attr_sched_smt_power_savings.attr);
6141#endif
6142#ifdef CONFIG_SCHED_MC
6143 if (!err && mc_capable())
6144 err = sysfs_create_file(&cls->kset.kobj,
6145 &attr_sched_mc_power_savings.attr);
6146#endif
6147 return err;
6148}
6149#endif
6150
6151#ifdef CONFIG_SCHED_MC 6393#ifdef CONFIG_SCHED_MC
6152static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) 6394static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6153{ 6395{
@@ -6158,8 +6400,8 @@ static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6158{ 6400{
6159 return sched_power_savings_store(buf, count, 0); 6401 return sched_power_savings_store(buf, count, 0);
6160} 6402}
6161SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, 6403static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6162 sched_mc_power_savings_store); 6404 sched_mc_power_savings_store);
6163#endif 6405#endif
6164 6406
6165#ifdef CONFIG_SCHED_SMT 6407#ifdef CONFIG_SCHED_SMT
@@ -6172,8 +6414,26 @@ static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6172{ 6414{
6173 return sched_power_savings_store(buf, count, 1); 6415 return sched_power_savings_store(buf, count, 1);
6174} 6416}
6175SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, 6417static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6176 sched_smt_power_savings_store); 6418 sched_smt_power_savings_store);
6419#endif
6420
6421int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6422{
6423 int err = 0;
6424
6425#ifdef CONFIG_SCHED_SMT
6426 if (smt_capable())
6427 err = sysfs_create_file(&cls->kset.kobj,
6428 &attr_sched_smt_power_savings.attr);
6429#endif
6430#ifdef CONFIG_SCHED_MC
6431 if (!err && mc_capable())
6432 err = sysfs_create_file(&cls->kset.kobj,
6433 &attr_sched_mc_power_savings.attr);
6434#endif
6435 return err;
6436}
6177#endif 6437#endif
6178 6438
6179/* 6439/*
@@ -6228,6 +6488,8 @@ void __init sched_init_smp(void)
6228 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6488 /* XXX: Theoretical race here - CPU may be hotplugged now */
6229 hotcpu_notifier(update_sched_domains, 0); 6489 hotcpu_notifier(update_sched_domains, 0);
6230 6490
6491 init_sched_domain_sysctl();
6492
6231 /* Move init over to a non-isolated CPU */ 6493 /* Move init over to a non-isolated CPU */
6232 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6494 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6233 BUG(); 6495 BUG();
@@ -6314,6 +6576,10 @@ void __init sched_init(void)
6314 6576
6315 set_load_weight(&init_task); 6577 set_load_weight(&init_task);
6316 6578
6579#ifdef CONFIG_PREEMPT_NOTIFIERS
6580 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6581#endif
6582
6317#ifdef CONFIG_SMP 6583#ifdef CONFIG_SMP
6318 nr_cpu_ids = highest_cpu + 1; 6584 nr_cpu_ids = highest_cpu + 1;
6319 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 6585 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
@@ -6379,12 +6645,14 @@ void normalize_rt_tasks(void)
6379 do_each_thread(g, p) { 6645 do_each_thread(g, p) {
6380 p->se.fair_key = 0; 6646 p->se.fair_key = 0;
6381 p->se.wait_runtime = 0; 6647 p->se.wait_runtime = 0;
6648 p->se.exec_start = 0;
6382 p->se.wait_start_fair = 0; 6649 p->se.wait_start_fair = 0;
6650 p->se.sleep_start_fair = 0;
6651#ifdef CONFIG_SCHEDSTATS
6383 p->se.wait_start = 0; 6652 p->se.wait_start = 0;
6384 p->se.exec_start = 0;
6385 p->se.sleep_start = 0; 6653 p->se.sleep_start = 0;
6386 p->se.sleep_start_fair = 0;
6387 p->se.block_start = 0; 6654 p->se.block_start = 0;
6655#endif
6388 task_rq(p)->cfs.fair_clock = 0; 6656 task_rq(p)->cfs.fair_clock = 0;
6389 task_rq(p)->clock = 0; 6657 task_rq(p)->clock = 0;
6390 6658
@@ -6408,12 +6676,13 @@ void normalize_rt_tasks(void)
6408 goto out_unlock; 6676 goto out_unlock;
6409#endif 6677#endif
6410 6678
6679 update_rq_clock(rq);
6411 on_rq = p->se.on_rq; 6680 on_rq = p->se.on_rq;
6412 if (on_rq) 6681 if (on_rq)
6413 deactivate_task(task_rq(p), p, 0); 6682 deactivate_task(rq, p, 0);
6414 __setscheduler(rq, p, SCHED_NORMAL, 0); 6683 __setscheduler(rq, p, SCHED_NORMAL, 0);
6415 if (on_rq) { 6684 if (on_rq) {
6416 activate_task(task_rq(p), p, 0); 6685 activate_task(rq, p, 0);
6417 resched_task(rq->curr); 6686 resched_task(rq->curr);
6418 } 6687 }
6419#ifdef CONFIG_SMP 6688#ifdef CONFIG_SMP
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 29f2c21e7da2..c3ee38bd3426 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -29,29 +29,34 @@
29 } while (0) 29 } while (0)
30 30
31static void 31static void
32print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now) 32print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
33{ 33{
34 if (rq->curr == p) 34 if (rq->curr == p)
35 SEQ_printf(m, "R"); 35 SEQ_printf(m, "R");
36 else 36 else
37 SEQ_printf(m, " "); 37 SEQ_printf(m, " ");
38 38
39 SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d " 39 SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ",
40 "%15Ld %15Ld %15Ld %15Ld %15Ld\n",
41 p->comm, p->pid, 40 p->comm, p->pid,
42 (long long)p->se.fair_key, 41 (long long)p->se.fair_key,
43 (long long)(p->se.fair_key - rq->cfs.fair_clock), 42 (long long)(p->se.fair_key - rq->cfs.fair_clock),
44 (long long)p->se.wait_runtime, 43 (long long)p->se.wait_runtime,
45 (long long)(p->nvcsw + p->nivcsw), 44 (long long)(p->nvcsw + p->nivcsw),
46 p->prio, 45 p->prio);
46#ifdef CONFIG_SCHEDSTATS
47 SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n",
47 (long long)p->se.sum_exec_runtime, 48 (long long)p->se.sum_exec_runtime,
48 (long long)p->se.sum_wait_runtime, 49 (long long)p->se.sum_wait_runtime,
49 (long long)p->se.sum_sleep_runtime, 50 (long long)p->se.sum_sleep_runtime,
50 (long long)p->se.wait_runtime_overruns, 51 (long long)p->se.wait_runtime_overruns,
51 (long long)p->se.wait_runtime_underruns); 52 (long long)p->se.wait_runtime_underruns);
53#else
54 SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n",
55 0LL, 0LL, 0LL, 0LL, 0LL);
56#endif
52} 57}
53 58
54static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) 59static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
55{ 60{
56 struct task_struct *g, *p; 61 struct task_struct *g, *p;
57 62
@@ -72,7 +77,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now)
72 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 77 if (!p->se.on_rq || task_cpu(p) != rq_cpu)
73 continue; 78 continue;
74 79
75 print_task(m, rq, p, now); 80 print_task(m, rq, p);
76 } while_each_thread(g, p); 81 } while_each_thread(g, p);
77 82
78 read_unlock_irq(&tasklist_lock); 83 read_unlock_irq(&tasklist_lock);
@@ -101,9 +106,9 @@ print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
101 (long long)wait_runtime_rq_sum); 106 (long long)wait_runtime_rq_sum);
102} 107}
103 108
104void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) 109void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
105{ 110{
106 SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq); 111 SEQ_printf(m, "\ncfs_rq\n");
107 112
108#define P(x) \ 113#define P(x) \
109 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) 114 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x))
@@ -119,7 +124,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
119 print_cfs_rq_runtime_sum(m, cpu, cfs_rq); 124 print_cfs_rq_runtime_sum(m, cpu, cfs_rq);
120} 125}
121 126
122static void print_cpu(struct seq_file *m, int cpu, u64 now) 127static void print_cpu(struct seq_file *m, int cpu)
123{ 128{
124 struct rq *rq = &per_cpu(runqueues, cpu); 129 struct rq *rq = &per_cpu(runqueues, cpu);
125 130
@@ -149,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
149 P(next_balance); 154 P(next_balance);
150 P(curr->pid); 155 P(curr->pid);
151 P(clock); 156 P(clock);
157 P(idle_clock);
152 P(prev_clock_raw); 158 P(prev_clock_raw);
153 P(clock_warps); 159 P(clock_warps);
154 P(clock_overflows); 160 P(clock_overflows);
155 P(clock_unstable_events); 161 P(clock_deep_idle_events);
156 P(clock_max_delta); 162 P(clock_max_delta);
157 P(cpu_load[0]); 163 P(cpu_load[0]);
158 P(cpu_load[1]); 164 P(cpu_load[1]);
@@ -161,9 +167,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
161 P(cpu_load[4]); 167 P(cpu_load[4]);
162#undef P 168#undef P
163 169
164 print_cfs_stats(m, cpu, now); 170 print_cfs_stats(m, cpu);
165 171
166 print_rq(m, rq, cpu, now); 172 print_rq(m, rq, cpu);
167} 173}
168 174
169static int sched_debug_show(struct seq_file *m, void *v) 175static int sched_debug_show(struct seq_file *m, void *v)
@@ -171,7 +177,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
171 u64 now = ktime_to_ns(ktime_get()); 177 u64 now = ktime_to_ns(ktime_get());
172 int cpu; 178 int cpu;
173 179
174 SEQ_printf(m, "Sched Debug Version: v0.05, %s %.*s\n", 180 SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n",
175 init_utsname()->release, 181 init_utsname()->release,
176 (int)strcspn(init_utsname()->version, " "), 182 (int)strcspn(init_utsname()->version, " "),
177 init_utsname()->version); 183 init_utsname()->version);
@@ -179,14 +185,14 @@ static int sched_debug_show(struct seq_file *m, void *v)
179 SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); 185 SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now);
180 186
181 for_each_online_cpu(cpu) 187 for_each_online_cpu(cpu)
182 print_cpu(m, cpu, now); 188 print_cpu(m, cpu);
183 189
184 SEQ_printf(m, "\n"); 190 SEQ_printf(m, "\n");
185 191
186 return 0; 192 return 0;
187} 193}
188 194
189void sysrq_sched_debug_show(void) 195static void sysrq_sched_debug_show(void)
190{ 196{
191 sched_debug_show(NULL, NULL); 197 sched_debug_show(NULL, NULL);
192} 198}
@@ -200,7 +206,7 @@ static struct file_operations sched_debug_fops = {
200 .open = sched_debug_open, 206 .open = sched_debug_open,
201 .read = seq_read, 207 .read = seq_read,
202 .llseek = seq_lseek, 208 .llseek = seq_lseek,
203 .release = seq_release, 209 .release = single_release,
204}; 210};
205 211
206static int __init init_sched_debug_procfs(void) 212static int __init init_sched_debug_procfs(void)
@@ -235,21 +241,24 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
235#define P(F) \ 241#define P(F) \
236 SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) 242 SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
237 243
238 P(se.wait_start); 244 P(se.wait_runtime);
239 P(se.wait_start_fair); 245 P(se.wait_start_fair);
240 P(se.exec_start); 246 P(se.exec_start);
241 P(se.sleep_start);
242 P(se.sleep_start_fair); 247 P(se.sleep_start_fair);
248 P(se.sum_exec_runtime);
249
250#ifdef CONFIG_SCHEDSTATS
251 P(se.wait_start);
252 P(se.sleep_start);
243 P(se.block_start); 253 P(se.block_start);
244 P(se.sleep_max); 254 P(se.sleep_max);
245 P(se.block_max); 255 P(se.block_max);
246 P(se.exec_max); 256 P(se.exec_max);
247 P(se.wait_max); 257 P(se.wait_max);
248 P(se.wait_runtime);
249 P(se.wait_runtime_overruns); 258 P(se.wait_runtime_overruns);
250 P(se.wait_runtime_underruns); 259 P(se.wait_runtime_underruns);
251 P(se.sum_wait_runtime); 260 P(se.sum_wait_runtime);
252 P(se.sum_exec_runtime); 261#endif
253 SEQ_printf(m, "%-25s:%20Ld\n", 262 SEQ_printf(m, "%-25s:%20Ld\n",
254 "nr_switches", (long long)(p->nvcsw + p->nivcsw)); 263 "nr_switches", (long long)(p->nvcsw + p->nivcsw));
255 P(se.load.weight); 264 P(se.load.weight);
@@ -269,7 +278,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
269 278
270void proc_sched_set_task(struct task_struct *p) 279void proc_sched_set_task(struct task_struct *p)
271{ 280{
281#ifdef CONFIG_SCHEDSTATS
272 p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; 282 p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0;
273 p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; 283 p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
284#endif
274 p->se.sum_exec_runtime = 0; 285 p->se.sum_exec_runtime = 0;
286 p->se.prev_sum_exec_runtime = 0;
275} 287}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6971db0a7160..67c67a87146e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -15,34 +15,50 @@
15 * 15 *
16 * Scaled math optimizations by Thomas Gleixner 16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> 17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 *
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
18 */ 21 */
19 22
20/* 23/*
21 * Preemption granularity: 24 * Targeted preemption latency for CPU-bound tasks:
22 * (default: 2 msec, units: nanoseconds) 25 * (default: 20ms, units: nanoseconds)
23 * 26 *
24 * NOTE: this granularity value is not the same as the concept of 27 * NOTE: this latency value is not the same as the concept of
25 * 'timeslice length' - timeslices in CFS will typically be somewhat 28 * 'timeslice length' - timeslices in CFS are of variable length.
26 * larger than this value. (to see the precise effective timeslice 29 * (to see the precise effective timeslice length of your workload,
27 * length of your workload, run vmstat and monitor the context-switches 30 * run vmstat and monitor the context-switches field)
28 * field)
29 * 31 *
30 * On SMP systems the value of this is multiplied by the log2 of the 32 * On SMP systems the value of this is multiplied by the log2 of the
31 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way 33 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
32 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) 34 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
35 * Targeted preemption latency for CPU-bound tasks:
36 */
37unsigned int sysctl_sched_latency __read_mostly = 20000000ULL;
38
39/*
40 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 2 msec, units: nanoseconds)
33 */ 42 */
34unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; 43unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL;
44
45/*
46 * sys_sched_yield() compat mode
47 *
 48 * This option switches the aggressive yield implementation of the
49 * old scheduler back on.
50 */
51unsigned int __read_mostly sysctl_sched_compat_yield;
35 52
36/* 53/*
37 * SCHED_BATCH wake-up granularity. 54 * SCHED_BATCH wake-up granularity.
38 * (default: 10 msec, units: nanoseconds) 55 * (default: 25 msec, units: nanoseconds)
39 * 56 *
40 * This option delays the preemption effects of decoupled workloads 57 * This option delays the preemption effects of decoupled workloads
41 * and reduces their over-scheduling. Synchronous workloads will still 58 * and reduces their over-scheduling. Synchronous workloads will still
42 * have immediate wakeup/sleep latencies. 59 * have immediate wakeup/sleep latencies.
43 */ 60 */
44unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 61unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL;
45 10000000000ULL/HZ;
46 62
47/* 63/*
48 * SCHED_OTHER wake-up granularity. 64 * SCHED_OTHER wake-up granularity.
@@ -52,12 +68,12 @@ unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
52 * and reduces their over-scheduling. Synchronous workloads will still 68 * and reduces their over-scheduling. Synchronous workloads will still
53 * have immediate wakeup/sleep latencies. 69 * have immediate wakeup/sleep latencies.
54 */ 70 */
55unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; 71unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL;
56 72
57unsigned int sysctl_sched_stat_granularity __read_mostly; 73unsigned int sysctl_sched_stat_granularity __read_mostly;
58 74
59/* 75/*
60 * Initialized in sched_init_granularity(): 76 * Initialized in sched_init_granularity() [to 5 times the base granularity]:
61 */ 77 */
62unsigned int sysctl_sched_runtime_limit __read_mostly; 78unsigned int sysctl_sched_runtime_limit __read_mostly;
63 79
@@ -75,7 +91,7 @@ enum {
75 91
76unsigned int sysctl_sched_features __read_mostly = 92unsigned int sysctl_sched_features __read_mostly =
77 SCHED_FEAT_FAIR_SLEEPERS *1 | 93 SCHED_FEAT_FAIR_SLEEPERS *1 |
78 SCHED_FEAT_SLEEPER_AVG *1 | 94 SCHED_FEAT_SLEEPER_AVG *0 |
79 SCHED_FEAT_SLEEPER_LOAD_AVG *1 | 95 SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
80 SCHED_FEAT_PRECISE_CPU_LOAD *1 | 96 SCHED_FEAT_PRECISE_CPU_LOAD *1 |
81 SCHED_FEAT_START_DEBIT *1 | 97 SCHED_FEAT_START_DEBIT *1 |
@@ -186,6 +202,8 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
186 update_load_add(&cfs_rq->load, se->load.weight); 202 update_load_add(&cfs_rq->load, se->load.weight);
187 cfs_rq->nr_running++; 203 cfs_rq->nr_running++;
188 se->on_rq = 1; 204 se->on_rq = 1;
205
206 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
189} 207}
190 208
191static inline void 209static inline void
@@ -197,6 +215,8 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
197 update_load_sub(&cfs_rq->load, se->load.weight); 215 update_load_sub(&cfs_rq->load, se->load.weight);
198 cfs_rq->nr_running--; 216 cfs_rq->nr_running--;
199 se->on_rq = 0; 217 se->on_rq = 0;
218
219 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
200} 220}
201 221
202static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) 222static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
@@ -214,6 +234,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
214 */ 234 */
215 235
216/* 236/*
237 * Calculate the preemption granularity needed to schedule every
238 * runnable task once per sysctl_sched_latency amount of time.
239 * (down to a sensible low limit on granularity)
240 *
241 * For example, if there are 2 tasks running and latency is 10 msecs,
242 * we switch tasks every 5 msecs. If we have 3 tasks running, we have
243 * to switch tasks every 3.33 msecs to get a 10 msecs observed latency
 244 * for each task. We do finer and finer scheduling until we
245 * reach the minimum granularity value.
246 *
247 * To achieve this we use the following dynamic-granularity rule:
248 *
249 * gran = lat/nr - lat/nr/nr
250 *
251 * This comes out of the following equations:
252 *
253 * kA1 + gran = kB1
254 * kB2 + gran = kA2
255 * kA2 = kA1
256 * kB2 = kB1 - d + d/nr
257 * lat = d * nr
258 *
259 * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running),
260 * '1' is start of time, '2' is end of time, 'd' is delay between
261 * 1 and 2 (during which task B was running), 'nr' is number of tasks
 262 * running, 'lat' is the period of each task. ('lat' is the
263 * sched_latency that we aim for.)
264 */
265static long
266sched_granularity(struct cfs_rq *cfs_rq)
267{
268 unsigned int gran = sysctl_sched_latency;
269 unsigned int nr = cfs_rq->nr_running;
270
271 if (nr > 1) {
272 gran = gran/nr - gran/nr/nr;
273 gran = max(gran, sysctl_sched_min_granularity);
274 }
275
276 return gran;
277}
278
279/*
217 * We rescale the rescheduling granularity of tasks according to their 280 * We rescale the rescheduling granularity of tasks according to their
218 * nice level, but only linearly, not exponentially: 281 * nice level, but only linearly, not exponentially:
219 */ 282 */
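The comment above derives the dynamic-granularity rule gran = lat/nr - lat/nr/nr, clamped from below by the minimum granularity, so that every runnable task gets on the CPU once per sysctl_sched_latency. A standalone sketch of sched_granularity()'s arithmetic, using the 20 ms latency and 2 ms floor defaults from this patch (illustrative userspace C):

#include <stdio.h>

#define LATENCY_NS  20000000U   /* sysctl_sched_latency default in this patch */
#define MIN_GRAN_NS  2000000U   /* sysctl_sched_min_granularity default */

/* gran = lat/nr - lat/nr/nr, never below the minimum granularity. */
static unsigned int granularity(unsigned int nr_running)
{
        unsigned int gran = LATENCY_NS;

        if (nr_running > 1) {
                gran = gran / nr_running - gran / nr_running / nr_running;
                if (gran < MIN_GRAN_NS)
                        gran = MIN_GRAN_NS;
        }
        return gran;
}

int main(void)
{
        unsigned int nr;

        for (nr = 1; nr <= 8; nr++)
                printf("nr_running=%u -> gran=%u ns\n", nr, granularity(nr));
        return 0;
}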
@@ -222,21 +285,25 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity)
222{ 285{
223 u64 tmp; 286 u64 tmp;
224 287
288 if (likely(curr->load.weight == NICE_0_LOAD))
289 return granularity;
225 /* 290 /*
226 * Negative nice levels get the same granularity as nice-0: 291 * Positive nice levels get the same granularity as nice-0:
227 */ 292 */
228 if (likely(curr->load.weight >= NICE_0_LOAD)) 293 if (likely(curr->load.weight < NICE_0_LOAD)) {
229 return granularity; 294 tmp = curr->load.weight * (u64)granularity;
295 return (long) (tmp >> NICE_0_SHIFT);
296 }
230 /* 297 /*
231 * Positive nice level tasks get linearly finer 298 * Negative nice level tasks get linearly finer
232 * granularity: 299 * granularity:
233 */ 300 */
234 tmp = curr->load.weight * (u64)granularity; 301 tmp = curr->load.inv_weight * (u64)granularity;
235 302
236 /* 303 /*
237 * It will always fit into 'long': 304 * It will always fit into 'long':
238 */ 305 */
239 return (long) (tmp >> NICE_0_SHIFT); 306 return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT));
240} 307}
241 308
242static inline void 309static inline void
@@ -281,34 +348,28 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
281 * are not in our scheduling class. 348 * are not in our scheduling class.
282 */ 349 */
283static inline void 350static inline void
284__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) 351__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr)
285{ 352{
286 unsigned long delta, delta_exec, delta_fair; 353 unsigned long delta, delta_exec, delta_fair, delta_mine;
287 long delta_mine;
288 struct load_weight *lw = &cfs_rq->load; 354 struct load_weight *lw = &cfs_rq->load;
289 unsigned long load = lw->weight; 355 unsigned long load = lw->weight;
290 356
291 if (unlikely(!load))
292 return;
293
294 delta_exec = curr->delta_exec; 357 delta_exec = curr->delta_exec;
295#ifdef CONFIG_SCHEDSTATS 358 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
296 if (unlikely(delta_exec > curr->exec_max))
297 curr->exec_max = delta_exec;
298#endif
299 359
300 curr->sum_exec_runtime += delta_exec; 360 curr->sum_exec_runtime += delta_exec;
301 cfs_rq->exec_clock += delta_exec; 361 cfs_rq->exec_clock += delta_exec;
302 362
363 if (unlikely(!load))
364 return;
365
303 delta_fair = calc_delta_fair(delta_exec, lw); 366 delta_fair = calc_delta_fair(delta_exec, lw);
304 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); 367 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
305 368
306 if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) { 369 if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) {
307 delta = calc_delta_mine(cfs_rq->sleeper_bonus, 370 delta = min((u64)delta_mine, cfs_rq->sleeper_bonus);
308 curr->load.weight, lw); 371 delta = min(delta, (unsigned long)(
309 if (unlikely(delta > cfs_rq->sleeper_bonus)) 372 (long)sysctl_sched_runtime_limit - curr->wait_runtime));
310 delta = cfs_rq->sleeper_bonus;
311
312 cfs_rq->sleeper_bonus -= delta; 373 cfs_rq->sleeper_bonus -= delta;
313 delta_mine -= delta; 374 delta_mine -= delta;
314 } 375 }
@@ -324,7 +385,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
324 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); 385 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec);
325} 386}
326 387
327static void update_curr(struct cfs_rq *cfs_rq, u64 now) 388static void update_curr(struct cfs_rq *cfs_rq)
328{ 389{
329 struct sched_entity *curr = cfs_rq_curr(cfs_rq); 390 struct sched_entity *curr = cfs_rq_curr(cfs_rq);
330 unsigned long delta_exec; 391 unsigned long delta_exec;
@@ -337,22 +398,22 @@ static void update_curr(struct cfs_rq *cfs_rq, u64 now)
337 * since the last time we changed load (this cannot 398 * since the last time we changed load (this cannot
338 * overflow on 32 bits): 399 * overflow on 32 bits):
339 */ 400 */
340 delta_exec = (unsigned long)(now - curr->exec_start); 401 delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start);
341 402
342 curr->delta_exec += delta_exec; 403 curr->delta_exec += delta_exec;
343 404
344 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { 405 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
345 __update_curr(cfs_rq, curr, now); 406 __update_curr(cfs_rq, curr);
346 curr->delta_exec = 0; 407 curr->delta_exec = 0;
347 } 408 }
348 curr->exec_start = now; 409 curr->exec_start = rq_of(cfs_rq)->clock;
349} 410}
350 411
351static inline void 412static inline void
352update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 413update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
353{ 414{
354 se->wait_start_fair = cfs_rq->fair_clock; 415 se->wait_start_fair = cfs_rq->fair_clock;
355 se->wait_start = now; 416 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
356} 417}
357 418
358/* 419/*
@@ -380,8 +441,7 @@ calc_weighted(unsigned long delta, unsigned long weight, int shift)
380/* 441/*
381 * Task is being enqueued - update stats: 442 * Task is being enqueued - update stats:
382 */ 443 */
383static void 444static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
384update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
385{ 445{
386 s64 key; 446 s64 key;
387 447
@@ -390,7 +450,7 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
390 * a dequeue/enqueue event is a NOP) 450 * a dequeue/enqueue event is a NOP)
391 */ 451 */
392 if (se != cfs_rq_curr(cfs_rq)) 452 if (se != cfs_rq_curr(cfs_rq))
393 update_stats_wait_start(cfs_rq, se, now); 453 update_stats_wait_start(cfs_rq, se);
394 /* 454 /*
395 * Update the key: 455 * Update the key:
396 */ 456 */
@@ -410,7 +470,8 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
410 (WMULT_SHIFT - NICE_0_SHIFT); 470 (WMULT_SHIFT - NICE_0_SHIFT);
411 } else { 471 } else {
412 tmp = se->wait_runtime; 472 tmp = se->wait_runtime;
413 key -= (tmp * se->load.weight) >> NICE_0_SHIFT; 473 key -= (tmp * se->load.inv_weight) >>
474 (WMULT_SHIFT - NICE_0_SHIFT);
414 } 475 }
415 } 476 }
416 477
@@ -421,17 +482,12 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
421 * Note: must be called with a freshly updated rq->fair_clock. 482 * Note: must be called with a freshly updated rq->fair_clock.
422 */ 483 */
423static inline void 484static inline void
424__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 485__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
425{ 486{
426 unsigned long delta_fair = se->delta_fair_run; 487 unsigned long delta_fair = se->delta_fair_run;
427 488
428#ifdef CONFIG_SCHEDSTATS 489 schedstat_set(se->wait_max, max(se->wait_max,
429 { 490 rq_of(cfs_rq)->clock - se->wait_start));
430 s64 delta_wait = now - se->wait_start;
431 if (unlikely(delta_wait > se->wait_max))
432 se->wait_max = delta_wait;
433 }
434#endif
435 491
436 if (unlikely(se->load.weight != NICE_0_LOAD)) 492 if (unlikely(se->load.weight != NICE_0_LOAD))
437 delta_fair = calc_weighted(delta_fair, se->load.weight, 493 delta_fair = calc_weighted(delta_fair, se->load.weight,
@@ -441,53 +497,56 @@ __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
441} 497}
442 498
443static void 499static void
444update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 500update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
445{ 501{
446 unsigned long delta_fair; 502 unsigned long delta_fair;
447 503
504 if (unlikely(!se->wait_start_fair))
505 return;
506
448 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), 507 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
449 (u64)(cfs_rq->fair_clock - se->wait_start_fair)); 508 (u64)(cfs_rq->fair_clock - se->wait_start_fair));
450 509
451 se->delta_fair_run += delta_fair; 510 se->delta_fair_run += delta_fair;
452 if (unlikely(abs(se->delta_fair_run) >= 511 if (unlikely(abs(se->delta_fair_run) >=
453 sysctl_sched_stat_granularity)) { 512 sysctl_sched_stat_granularity)) {
454 __update_stats_wait_end(cfs_rq, se, now); 513 __update_stats_wait_end(cfs_rq, se);
455 se->delta_fair_run = 0; 514 se->delta_fair_run = 0;
456 } 515 }
457 516
458 se->wait_start_fair = 0; 517 se->wait_start_fair = 0;
459 se->wait_start = 0; 518 schedstat_set(se->wait_start, 0);
460} 519}
461 520
462static inline void 521static inline void
463update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 522update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
464{ 523{
465 update_curr(cfs_rq, now); 524 update_curr(cfs_rq);
466 /* 525 /*
467 * Mark the end of the wait period if dequeueing a 526 * Mark the end of the wait period if dequeueing a
468 * waiting task: 527 * waiting task:
469 */ 528 */
470 if (se != cfs_rq_curr(cfs_rq)) 529 if (se != cfs_rq_curr(cfs_rq))
471 update_stats_wait_end(cfs_rq, se, now); 530 update_stats_wait_end(cfs_rq, se);
472} 531}
473 532
474/* 533/*
475 * We are picking a new current task - update its stats: 534 * We are picking a new current task - update its stats:
476 */ 535 */
477static inline void 536static inline void
478update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 537update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
479{ 538{
480 /* 539 /*
481 * We are starting a new run period: 540 * We are starting a new run period:
482 */ 541 */
483 se->exec_start = now; 542 se->exec_start = rq_of(cfs_rq)->clock;
484} 543}
485 544
486/* 545/*
487 * We are descheduling a task - update its stats: 546 * We are descheduling a task - update its stats:
488 */ 547 */
489static inline void 548static inline void
490update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 549update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
491{ 550{
492 se->exec_start = 0; 551 se->exec_start = 0;
493} 552}
@@ -496,12 +555,18 @@ update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
496 * Scheduling class queueing methods: 555 * Scheduling class queueing methods:
497 */ 556 */
498 557
499static void 558static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
500__enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
501{ 559{
502 unsigned long load = cfs_rq->load.weight, delta_fair; 560 unsigned long load = cfs_rq->load.weight, delta_fair;
503 long prev_runtime; 561 long prev_runtime;
504 562
563 /*
564 * Do not boost sleepers if there's too much bonus 'in flight'
565 * already:
566 */
567 if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
568 return;
569
505 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) 570 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
506 load = rq_of(cfs_rq)->cpu_load[2]; 571 load = rq_of(cfs_rq)->cpu_load[2];
507 572
@@ -527,12 +592,9 @@ __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
527 * Track the amount of bonus we've given to sleepers: 592 * Track the amount of bonus we've given to sleepers:
528 */ 593 */
529 cfs_rq->sleeper_bonus += delta_fair; 594 cfs_rq->sleeper_bonus += delta_fair;
530
531 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
532} 595}
533 596
534static void 597static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
535enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
536{ 598{
537 struct task_struct *tsk = task_of(se); 599 struct task_struct *tsk = task_of(se);
538 unsigned long delta_fair; 600 unsigned long delta_fair;
@@ -547,7 +609,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
547 se->delta_fair_sleep += delta_fair; 609 se->delta_fair_sleep += delta_fair;
548 if (unlikely(abs(se->delta_fair_sleep) >= 610 if (unlikely(abs(se->delta_fair_sleep) >=
549 sysctl_sched_stat_granularity)) { 611 sysctl_sched_stat_granularity)) {
550 __enqueue_sleeper(cfs_rq, se, now); 612 __enqueue_sleeper(cfs_rq, se);
551 se->delta_fair_sleep = 0; 613 se->delta_fair_sleep = 0;
552 } 614 }
553 615
@@ -555,7 +617,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
555 617
556#ifdef CONFIG_SCHEDSTATS 618#ifdef CONFIG_SCHEDSTATS
557 if (se->sleep_start) { 619 if (se->sleep_start) {
558 u64 delta = now - se->sleep_start; 620 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
559 621
560 if ((s64)delta < 0) 622 if ((s64)delta < 0)
561 delta = 0; 623 delta = 0;
@@ -567,7 +629,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
567 se->sum_sleep_runtime += delta; 629 se->sum_sleep_runtime += delta;
568 } 630 }
569 if (se->block_start) { 631 if (se->block_start) {
570 u64 delta = now - se->block_start; 632 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
571 633
572 if ((s64)delta < 0) 634 if ((s64)delta < 0)
573 delta = 0; 635 delta = 0;
@@ -577,31 +639,39 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
577 639
578 se->block_start = 0; 640 se->block_start = 0;
579 se->sum_sleep_runtime += delta; 641 se->sum_sleep_runtime += delta;
642
643 /*
644 * Blocking time is in units of nanosecs, so shift by 20 to
645 * get a milliseconds-range estimation of the amount of
646 * time that the task spent sleeping:
647 */
648 if (unlikely(prof_on == SLEEP_PROFILING)) {
649 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
650 delta >> 20);
651 }
580 } 652 }
581#endif 653#endif
582} 654}
583 655
584static void 656static void
585enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 657enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
586 int wakeup, u64 now)
587{ 658{
588 /* 659 /*
589 * Update the fair clock. 660 * Update the fair clock.
590 */ 661 */
591 update_curr(cfs_rq, now); 662 update_curr(cfs_rq);
592 663
593 if (wakeup) 664 if (wakeup)
594 enqueue_sleeper(cfs_rq, se, now); 665 enqueue_sleeper(cfs_rq, se);
595 666
596 update_stats_enqueue(cfs_rq, se, now); 667 update_stats_enqueue(cfs_rq, se);
597 __enqueue_entity(cfs_rq, se); 668 __enqueue_entity(cfs_rq, se);
598} 669}
599 670
600static void 671static void
601dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 672dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
602 int sleep, u64 now)
603{ 673{
604 update_stats_dequeue(cfs_rq, se, now); 674 update_stats_dequeue(cfs_rq, se);
605 if (sleep) { 675 if (sleep) {
606 se->sleep_start_fair = cfs_rq->fair_clock; 676 se->sleep_start_fair = cfs_rq->fair_clock;
607#ifdef CONFIG_SCHEDSTATS 677#ifdef CONFIG_SCHEDSTATS
@@ -609,11 +679,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
609 struct task_struct *tsk = task_of(se); 679 struct task_struct *tsk = task_of(se);
610 680
611 if (tsk->state & TASK_INTERRUPTIBLE) 681 if (tsk->state & TASK_INTERRUPTIBLE)
612 se->sleep_start = now; 682 se->sleep_start = rq_of(cfs_rq)->clock;
613 if (tsk->state & TASK_UNINTERRUPTIBLE) 683 if (tsk->state & TASK_UNINTERRUPTIBLE)
614 se->block_start = now; 684 se->block_start = rq_of(cfs_rq)->clock;
615 } 685 }
616 cfs_rq->wait_runtime -= se->wait_runtime;
617#endif 686#endif
618 } 687 }
619 __dequeue_entity(cfs_rq, se); 688 __dequeue_entity(cfs_rq, se);
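
The hunk above also adds SLEEP_PROFILING accounting, which buckets blocking time by shifting nanoseconds right by 20 bits rather than dividing by one million, trading a little accuracy for a cheap approximation of milliseconds. A standalone userspace sketch of that arithmetic (the sample durations are arbitrary and only illustrate the roughly 5% underestimate; nothing here is kernel code):

#include <stdio.h>

int main(void)
{
        /* assorted blocking times in nanoseconds: 1 ms, 250 ms, 3 s */
        unsigned long long sleeps_ns[] = { 1000000ULL, 250000000ULL, 3000000000ULL };
        unsigned int i;

        for (i = 0; i < sizeof(sleeps_ns) / sizeof(sleeps_ns[0]); i++) {
                /* >> 20 divides by 1048576, a cheap stand-in for / 1000000 */
                unsigned long long approx_ms = sleeps_ns[i] >> 20;

                printf("%10llu ns -> ~%4llu ms (exact %4llu ms)\n",
                       sleeps_ns[i], approx_ms, sleeps_ns[i] / 1000000ULL);
        }
        return 0;
}
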
@@ -627,18 +696,38 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
627 struct sched_entity *curr, unsigned long granularity) 696 struct sched_entity *curr, unsigned long granularity)
628{ 697{
629 s64 __delta = curr->fair_key - se->fair_key; 698 s64 __delta = curr->fair_key - se->fair_key;
699 unsigned long ideal_runtime, delta_exec;
700
701 /*
702 * ideal_runtime is compared against sum_exec_runtime, which is
703 * walltime, hence do not scale.
704 */
705 ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running,
706 (unsigned long)sysctl_sched_min_granularity);
707
708 /*
709 * If we executed more than what the latency constraint suggests,
710 * reduce the rescheduling granularity. This way the total latency
711 * of how much a task is not scheduled converges to
712 * sysctl_sched_latency:
713 */
714 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
715 if (delta_exec > ideal_runtime)
716 granularity = 0;
630 717
631 /* 718 /*
632 * Take scheduling granularity into account - do not 719 * Take scheduling granularity into account - do not
633 * preempt the current task unless the best task has 720 * preempt the current task unless the best task has
634 * a larger than sched_granularity fairness advantage: 721 * a larger than sched_granularity fairness advantage:
722 *
723 * scale granularity as key space is in fair_clock.
635 */ 724 */
636 if (__delta > niced_granularity(curr, granularity)) 725 if (__delta > niced_granularity(curr, granularity))
637 resched_task(rq_of(cfs_rq)->curr); 726 resched_task(rq_of(cfs_rq)->curr);
638} 727}
639 728
640static inline void 729static inline void
641set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 730set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
642{ 731{
643 /* 732 /*
644 * Any task has to be enqueued before it get to execute on 733 * Any task has to be enqueued before it get to execute on
@@ -647,49 +736,47 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
647 * done a put_prev_task_fair() shortly before this, which 736 * done a put_prev_task_fair() shortly before this, which
648 * updated rq->fair_clock - used by update_stats_wait_end()) 737 * updated rq->fair_clock - used by update_stats_wait_end())
649 */ 738 */
650 update_stats_wait_end(cfs_rq, se, now); 739 update_stats_wait_end(cfs_rq, se);
651 update_stats_curr_start(cfs_rq, se, now); 740 update_stats_curr_start(cfs_rq, se);
652 set_cfs_rq_curr(cfs_rq, se); 741 set_cfs_rq_curr(cfs_rq, se);
742 se->prev_sum_exec_runtime = se->sum_exec_runtime;
653} 743}
654 744
655static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) 745static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
656{ 746{
657 struct sched_entity *se = __pick_next_entity(cfs_rq); 747 struct sched_entity *se = __pick_next_entity(cfs_rq);
658 748
659 set_next_entity(cfs_rq, se, now); 749 set_next_entity(cfs_rq, se);
660 750
661 return se; 751 return se;
662} 752}
663 753
664static void 754static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
665put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now)
666{ 755{
667 /* 756 /*
668 * If still on the runqueue then deactivate_task() 757 * If still on the runqueue then deactivate_task()
669 * was not called and update_curr() has to be done: 758 * was not called and update_curr() has to be done:
670 */ 759 */
671 if (prev->on_rq) 760 if (prev->on_rq)
672 update_curr(cfs_rq, now); 761 update_curr(cfs_rq);
673 762
674 update_stats_curr_end(cfs_rq, prev, now); 763 update_stats_curr_end(cfs_rq, prev);
675 764
676 if (prev->on_rq) 765 if (prev->on_rq)
677 update_stats_wait_start(cfs_rq, prev, now); 766 update_stats_wait_start(cfs_rq, prev);
678 set_cfs_rq_curr(cfs_rq, NULL); 767 set_cfs_rq_curr(cfs_rq, NULL);
679} 768}
680 769
681static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 770static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
682{ 771{
683 struct rq *rq = rq_of(cfs_rq);
684 struct sched_entity *next; 772 struct sched_entity *next;
685 u64 now = __rq_clock(rq);
686 773
687 /* 774 /*
688 * Dequeue and enqueue the task to update its 775 * Dequeue and enqueue the task to update its
689 * position within the tree: 776 * position within the tree:
690 */ 777 */
691 dequeue_entity(cfs_rq, curr, 0, now); 778 dequeue_entity(cfs_rq, curr, 0);
692 enqueue_entity(cfs_rq, curr, 0, now); 779 enqueue_entity(cfs_rq, curr, 0);
693 780
694 /* 781 /*
695 * Reschedule if another task tops the current one. 782 * Reschedule if another task tops the current one.
@@ -698,7 +785,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
698 if (next == curr) 785 if (next == curr)
699 return; 786 return;
700 787
701 __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); 788 __check_preempt_curr_fair(cfs_rq, next, curr,
789 sched_granularity(cfs_rq));
702} 790}
703 791
704/************************************************** 792/**************************************************
@@ -794,8 +882,7 @@ static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
794 * increased. Here we update the fair scheduling stats and 882 * increased. Here we update the fair scheduling stats and
795 * then put the task into the rbtree: 883 * then put the task into the rbtree:
796 */ 884 */
797static void 885static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
798enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
799{ 886{
800 struct cfs_rq *cfs_rq; 887 struct cfs_rq *cfs_rq;
801 struct sched_entity *se = &p->se; 888 struct sched_entity *se = &p->se;
@@ -804,7 +891,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
804 if (se->on_rq) 891 if (se->on_rq)
805 break; 892 break;
806 cfs_rq = cfs_rq_of(se); 893 cfs_rq = cfs_rq_of(se);
807 enqueue_entity(cfs_rq, se, wakeup, now); 894 enqueue_entity(cfs_rq, se, wakeup);
808 } 895 }
809} 896}
810 897
@@ -813,15 +900,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
813 * decreased. We remove the task from the rbtree and 900 * decreased. We remove the task from the rbtree and
814 * update the fair scheduling stats: 901 * update the fair scheduling stats:
815 */ 902 */
816static void 903static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
817dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
818{ 904{
819 struct cfs_rq *cfs_rq; 905 struct cfs_rq *cfs_rq;
820 struct sched_entity *se = &p->se; 906 struct sched_entity *se = &p->se;
821 907
822 for_each_sched_entity(se) { 908 for_each_sched_entity(se) {
823 cfs_rq = cfs_rq_of(se); 909 cfs_rq = cfs_rq_of(se);
824 dequeue_entity(cfs_rq, se, sleep, now); 910 dequeue_entity(cfs_rq, se, sleep);
825 /* Don't dequeue parent if it has other entities besides us */ 911 /* Don't dequeue parent if it has other entities besides us */
826 if (cfs_rq->load.weight) 912 if (cfs_rq->load.weight)
827 break; 913 break;
@@ -829,19 +915,62 @@ dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
829} 915}
830 916
831/* 917/*
832 * sched_yield() support is very simple - we dequeue and enqueue 918 * sched_yield() support is very simple - we dequeue and enqueue.
919 *
920 * If compat_yield is turned on then we requeue to the end of the tree.
833 */ 921 */
834static void yield_task_fair(struct rq *rq, struct task_struct *p) 922static void yield_task_fair(struct rq *rq, struct task_struct *p)
835{ 923{
836 struct cfs_rq *cfs_rq = task_cfs_rq(p); 924 struct cfs_rq *cfs_rq = task_cfs_rq(p);
837 u64 now = __rq_clock(rq); 925 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
926 struct sched_entity *rightmost, *se = &p->se;
927 struct rb_node *parent;
838 928
839 /* 929 /*
840 * Dequeue and enqueue the task to update its 930 * Are we the only task in the tree?
841 * position within the tree: 931 */
932 if (unlikely(cfs_rq->nr_running == 1))
933 return;
934
935 if (likely(!sysctl_sched_compat_yield)) {
936 __update_rq_clock(rq);
937 /*
938 * Dequeue and enqueue the task to update its
939 * position within the tree:
940 */
941 dequeue_entity(cfs_rq, &p->se, 0);
942 enqueue_entity(cfs_rq, &p->se, 0);
943
944 return;
945 }
946 /*
947 * Find the rightmost entry in the rbtree:
948 */
949 do {
950 parent = *link;
951 link = &parent->rb_right;
952 } while (*link);
953
954 rightmost = rb_entry(parent, struct sched_entity, run_node);
955 /*
956 * Already in the rightmost position?
842 */ 957 */
843 dequeue_entity(cfs_rq, &p->se, 0, now); 958 if (unlikely(rightmost == se))
844 enqueue_entity(cfs_rq, &p->se, 0, now); 959 return;
960
961 /*
962 * Minimally necessary key value to be last in the tree:
963 */
964 se->fair_key = rightmost->fair_key + 1;
965
966 if (cfs_rq->rb_leftmost == &se->run_node)
967 cfs_rq->rb_leftmost = rb_next(&se->run_node);
968 /*
969 * Relink the task to the rightmost position:
970 */
971 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
972 rb_link_node(&se->run_node, parent, link);
973 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
845} 974}
846 975
847/* 976/*
@@ -854,7 +983,8 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
854 unsigned long gran; 983 unsigned long gran;
855 984
856 if (unlikely(rt_prio(p->prio))) { 985 if (unlikely(rt_prio(p->prio))) {
857 update_curr(cfs_rq, rq_clock(rq)); 986 update_rq_clock(rq);
987 update_curr(cfs_rq);
858 resched_task(curr); 988 resched_task(curr);
859 return; 989 return;
860 } 990 }
@@ -870,7 +1000,7 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
870 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); 1000 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
871} 1001}
872 1002
873static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) 1003static struct task_struct *pick_next_task_fair(struct rq *rq)
874{ 1004{
875 struct cfs_rq *cfs_rq = &rq->cfs; 1005 struct cfs_rq *cfs_rq = &rq->cfs;
876 struct sched_entity *se; 1006 struct sched_entity *se;
@@ -879,7 +1009,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now)
879 return NULL; 1009 return NULL;
880 1010
881 do { 1011 do {
882 se = pick_next_entity(cfs_rq, now); 1012 se = pick_next_entity(cfs_rq);
883 cfs_rq = group_cfs_rq(se); 1013 cfs_rq = group_cfs_rq(se);
884 } while (cfs_rq); 1014 } while (cfs_rq);
885 1015
@@ -889,14 +1019,14 @@ static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now)
889/* 1019/*
890 * Account for a descheduled task: 1020 * Account for a descheduled task:
891 */ 1021 */
892static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) 1022static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
893{ 1023{
894 struct sched_entity *se = &prev->se; 1024 struct sched_entity *se = &prev->se;
895 struct cfs_rq *cfs_rq; 1025 struct cfs_rq *cfs_rq;
896 1026
897 for_each_sched_entity(se) { 1027 for_each_sched_entity(se) {
898 cfs_rq = cfs_rq_of(se); 1028 cfs_rq = cfs_rq_of(se);
899 put_prev_entity(cfs_rq, se, now); 1029 put_prev_entity(cfs_rq, se);
900 } 1030 }
901} 1031}
902 1032
@@ -939,6 +1069,7 @@ static struct task_struct *load_balance_next_fair(void *arg)
939 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); 1069 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
940} 1070}
941 1071
1072#ifdef CONFIG_FAIR_GROUP_SCHED
942static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) 1073static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
943{ 1074{
944 struct sched_entity *curr; 1075 struct sched_entity *curr;
@@ -952,12 +1083,13 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
952 1083
953 return p->prio; 1084 return p->prio;
954} 1085}
1086#endif
955 1087
956static int 1088static unsigned long
957load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1089load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
958 unsigned long max_nr_move, unsigned long max_load_move, 1090 unsigned long max_nr_move, unsigned long max_load_move,
959 struct sched_domain *sd, enum cpu_idle_type idle, 1091 struct sched_domain *sd, enum cpu_idle_type idle,
960 int *all_pinned, unsigned long *total_load_moved) 1092 int *all_pinned, int *this_best_prio)
961{ 1093{
962 struct cfs_rq *busy_cfs_rq; 1094 struct cfs_rq *busy_cfs_rq;
963 unsigned long load_moved, total_nr_moved = 0, nr_moved; 1095 unsigned long load_moved, total_nr_moved = 0, nr_moved;
@@ -968,15 +1100,14 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
968 cfs_rq_iterator.next = load_balance_next_fair; 1100 cfs_rq_iterator.next = load_balance_next_fair;
969 1101
970 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1102 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
1103#ifdef CONFIG_FAIR_GROUP_SCHED
971 struct cfs_rq *this_cfs_rq; 1104 struct cfs_rq *this_cfs_rq;
972 long imbalance; 1105 long imbalance;
973 unsigned long maxload; 1106 unsigned long maxload;
974 int this_best_prio, best_prio, best_prio_seen = 0;
975 1107
976 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 1108 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
977 1109
978 imbalance = busy_cfs_rq->load.weight - 1110 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
979 this_cfs_rq->load.weight;
980 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ 1111 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
981 if (imbalance <= 0) 1112 if (imbalance <= 0)
982 continue; 1113 continue;
@@ -985,27 +1116,17 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
985 imbalance /= 2; 1116 imbalance /= 2;
986 maxload = min(rem_load_move, imbalance); 1117 maxload = min(rem_load_move, imbalance);
987 1118
988 this_best_prio = cfs_rq_best_prio(this_cfs_rq); 1119 *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
989 best_prio = cfs_rq_best_prio(busy_cfs_rq); 1120#else
990 1121# define maxload rem_load_move
991 /* 1122#endif
992 * Enable handling of the case where there is more than one task
993 * with the best priority. If the current running task is one
994 * of those with prio==best_prio we know it won't be moved
995 * and therefore it's safe to override the skip (based on load)
996 * of any task we find with that prio.
997 */
998 if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
999 best_prio_seen = 1;
1000
1001 /* pass busy_cfs_rq argument into 1123 /* pass busy_cfs_rq argument into
1002 * load_balance_[start|next]_fair iterators 1124 * load_balance_[start|next]_fair iterators
1003 */ 1125 */
1004 cfs_rq_iterator.arg = busy_cfs_rq; 1126 cfs_rq_iterator.arg = busy_cfs_rq;
1005 nr_moved = balance_tasks(this_rq, this_cpu, busiest, 1127 nr_moved = balance_tasks(this_rq, this_cpu, busiest,
1006 max_nr_move, maxload, sd, idle, all_pinned, 1128 max_nr_move, maxload, sd, idle, all_pinned,
1007 &load_moved, this_best_prio, best_prio, 1129 &load_moved, this_best_prio, &cfs_rq_iterator);
1008 best_prio_seen, &cfs_rq_iterator);
1009 1130
1010 total_nr_moved += nr_moved; 1131 total_nr_moved += nr_moved;
1011 max_nr_move -= nr_moved; 1132 max_nr_move -= nr_moved;
@@ -1015,9 +1136,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1015 break; 1136 break;
1016 } 1137 }
1017 1138
1018 *total_load_moved = max_load_move - rem_load_move; 1139 return max_load_move - rem_load_move;
1019
1020 return total_nr_moved;
1021} 1140}
1022 1141
1023/* 1142/*
@@ -1044,35 +1163,34 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1044static void task_new_fair(struct rq *rq, struct task_struct *p) 1163static void task_new_fair(struct rq *rq, struct task_struct *p)
1045{ 1164{
1046 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1165 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1047 struct sched_entity *se = &p->se; 1166 struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq);
1048 u64 now = rq_clock(rq);
1049 1167
1050 sched_info_queued(p); 1168 sched_info_queued(p);
1051 1169
1052 update_stats_enqueue(cfs_rq, se, now); 1170 update_curr(cfs_rq);
1171 update_stats_enqueue(cfs_rq, se);
1053 /* 1172 /*
1054 * Child runs first: we let it run before the parent 1173 * Child runs first: we let it run before the parent
1055 * until it reschedules once. We set up the key so that 1174 * until it reschedules once. We set up the key so that
1056 * it will preempt the parent: 1175 * it will preempt the parent:
1057 */ 1176 */
1058 p->se.fair_key = current->se.fair_key - 1177 se->fair_key = curr->fair_key -
1059 niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; 1178 niced_granularity(curr, sched_granularity(cfs_rq)) - 1;
1060 /* 1179 /*
1061 * The first wait is dominated by the child-runs-first logic, 1180 * The first wait is dominated by the child-runs-first logic,
1062 * so do not credit it with that waiting time yet: 1181 * so do not credit it with that waiting time yet:
1063 */ 1182 */
1064 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) 1183 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
1065 p->se.wait_start_fair = 0; 1184 se->wait_start_fair = 0;
1066 1185
1067 /* 1186 /*
1068 * The statistical average of wait_runtime is about 1187 * The statistical average of wait_runtime is about
1069 * -granularity/2, so initialize the task with that: 1188 * -granularity/2, so initialize the task with that:
1070 */ 1189 */
1071 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) 1190 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
1072 p->se.wait_runtime = -(sysctl_sched_granularity / 2); 1191 se->wait_runtime = -(sched_granularity(cfs_rq) / 2);
1073 1192
1074 __enqueue_entity(cfs_rq, se); 1193 __enqueue_entity(cfs_rq, se);
1075 inc_nr_running(p, rq, now);
1076} 1194}
1077 1195
1078#ifdef CONFIG_FAIR_GROUP_SCHED 1196#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1083,15 +1201,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1083 */ 1201 */
1084static void set_curr_task_fair(struct rq *rq) 1202static void set_curr_task_fair(struct rq *rq)
1085{ 1203{
1086 struct task_struct *curr = rq->curr; 1204 struct sched_entity *se = &rq->curr->se;
1087 struct sched_entity *se = &curr->se;
1088 u64 now = rq_clock(rq);
1089 struct cfs_rq *cfs_rq;
1090 1205
1091 for_each_sched_entity(se) { 1206 for_each_sched_entity(se)
1092 cfs_rq = cfs_rq_of(se); 1207 set_next_entity(cfs_rq_of(se), se);
1093 set_next_entity(cfs_rq, se, now);
1094 }
1095} 1208}
1096#else 1209#else
1097static void set_curr_task_fair(struct rq *rq) 1210static void set_curr_task_fair(struct rq *rq)
@@ -1120,12 +1233,11 @@ struct sched_class fair_sched_class __read_mostly = {
1120}; 1233};
1121 1234
1122#ifdef CONFIG_SCHED_DEBUG 1235#ifdef CONFIG_SCHED_DEBUG
1123void print_cfs_stats(struct seq_file *m, int cpu, u64 now) 1236static void print_cfs_stats(struct seq_file *m, int cpu)
1124{ 1237{
1125 struct rq *rq = cpu_rq(cpu);
1126 struct cfs_rq *cfs_rq; 1238 struct cfs_rq *cfs_rq;
1127 1239
1128 for_each_leaf_cfs_rq(rq, cfs_rq) 1240 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1129 print_cfs_rq(m, cpu, cfs_rq, now); 1241 print_cfs_rq(m, cpu, cfs_rq);
1130} 1242}
1131#endif 1243#endif
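
The enlarged __check_preempt_curr_fair() in the sched_fair.c changes above derives each task's ideal slice from the latency target divided by nr_running, clamps it below by the minimum granularity, and drops the preemption granularity to zero once the running task has used up that slice. A minimal userspace sketch of the slice arithmetic, assuming placeholder values of 20 ms for sched_latency and 2 ms for sched_min_granularity (illustrative only, not necessarily the kernel defaults):

#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
        unsigned long long sched_latency   = 20 * NSEC_PER_MSEC; /* assumed latency target */
        unsigned long long min_granularity =  2 * NSEC_PER_MSEC; /* assumed lower bound */
        unsigned int nr_running;

        for (nr_running = 1; nr_running <= 32; nr_running *= 2) {
                unsigned long long ideal_runtime = sched_latency / nr_running;

                if (ideal_runtime < min_granularity)
                        ideal_runtime = min_granularity; /* the max() clamp in the hunk */

                printf("nr_running=%2u  ideal_runtime=%llu ns\n",
                       nr_running, ideal_runtime);
        }
        return 0;
}
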
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 41841e741c4a..3503fb2d9f96 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -13,7 +13,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
13 resched_task(rq->idle); 13 resched_task(rq->idle);
14} 14}
15 15
16static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) 16static struct task_struct *pick_next_task_idle(struct rq *rq)
17{ 17{
18 schedstat_inc(rq, sched_goidle); 18 schedstat_inc(rq, sched_goidle);
19 19
@@ -25,7 +25,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now)
25 * message if some code attempts to do it: 25 * message if some code attempts to do it:
26 */ 26 */
27static void 27static void
28dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) 28dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
29{ 29{
30 spin_unlock_irq(&rq->lock); 30 spin_unlock_irq(&rq->lock);
31 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 31 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
@@ -33,15 +33,15 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now)
33 spin_lock_irq(&rq->lock); 33 spin_lock_irq(&rq->lock);
34} 34}
35 35
36static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) 36static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
37{ 37{
38} 38}
39 39
40static int 40static unsigned long
41load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, 41load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
42 unsigned long max_nr_move, unsigned long max_load_move, 42 unsigned long max_nr_move, unsigned long max_load_move,
43 struct sched_domain *sd, enum cpu_idle_type idle, 43 struct sched_domain *sd, enum cpu_idle_type idle,
44 int *all_pinned, unsigned long *total_load_moved) 44 int *all_pinned, int *this_best_prio)
45{ 45{
46 return 0; 46 return 0;
47} 47}
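
The idle, fair and rt classes all switch load_balance() from returning the number of tasks moved to returning the amount of load moved, with this_best_prio now handed in by pointer so the core balancer can carry it across classes. A rough userspace mock of that function-pointer interface, with the sched_domain and cpu_idle_type arguments left out for brevity (struct rq is opaque here and nothing below is kernel code):

#include <stdio.h>

struct rq;      /* opaque in this sketch */

struct sched_class_mock {
        /* new shape: return load moved, take this_best_prio by pointer */
        unsigned long (*load_balance)(struct rq *this_rq, int this_cpu,
                                      struct rq *busiest,
                                      unsigned long max_nr_move,
                                      unsigned long max_load_move,
                                      int *all_pinned, int *this_best_prio);
};

static unsigned long load_balance_idle_mock(struct rq *this_rq, int this_cpu,
                                            struct rq *busiest,
                                            unsigned long max_nr_move,
                                            unsigned long max_load_move,
                                            int *all_pinned, int *this_best_prio)
{
        (void)this_rq; (void)this_cpu; (void)busiest;
        (void)max_nr_move; (void)max_load_move;
        (void)all_pinned; (void)this_best_prio;
        return 0;       /* the idle class never contributes pullable load */
}

int main(void)
{
        struct sched_class_mock idle = { .load_balance = load_balance_idle_mock };
        int pinned = 0, best_prio = 140;

        printf("moved %lu load\n",
               idle.load_balance(NULL, 0, NULL, 8, 1024, &pinned, &best_prio));
        return 0;
}
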
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1192a2741b99..4b87476a02d0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -7,7 +7,7 @@
7 * Update the current task's runtime statistics. Skip current tasks that 7 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class. 8 * are not in our scheduling class.
9 */ 9 */
10static inline void update_curr_rt(struct rq *rq, u64 now) 10static inline void update_curr_rt(struct rq *rq)
11{ 11{
12 struct task_struct *curr = rq->curr; 12 struct task_struct *curr = rq->curr;
13 u64 delta_exec; 13 u64 delta_exec;
@@ -15,18 +15,17 @@ static inline void update_curr_rt(struct rq *rq, u64 now)
15 if (!task_has_rt_policy(curr)) 15 if (!task_has_rt_policy(curr))
16 return; 16 return;
17 17
18 delta_exec = now - curr->se.exec_start; 18 delta_exec = rq->clock - curr->se.exec_start;
19 if (unlikely((s64)delta_exec < 0)) 19 if (unlikely((s64)delta_exec < 0))
20 delta_exec = 0; 20 delta_exec = 0;
21 if (unlikely(delta_exec > curr->se.exec_max)) 21
22 curr->se.exec_max = delta_exec; 22 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
23 23
24 curr->se.sum_exec_runtime += delta_exec; 24 curr->se.sum_exec_runtime += delta_exec;
25 curr->se.exec_start = now; 25 curr->se.exec_start = rq->clock;
26} 26}
27 27
28static void 28static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
29enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
30{ 29{
31 struct rt_prio_array *array = &rq->rt.active; 30 struct rt_prio_array *array = &rq->rt.active;
32 31
@@ -37,12 +36,11 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
37/* 36/*
38 * Adding/removing a task to/from a priority array: 37 * Adding/removing a task to/from a priority array:
39 */ 38 */
40static void 39static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
41dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now)
42{ 40{
43 struct rt_prio_array *array = &rq->rt.active; 41 struct rt_prio_array *array = &rq->rt.active;
44 42
45 update_curr_rt(rq, now); 43 update_curr_rt(rq);
46 44
47 list_del(&p->run_list); 45 list_del(&p->run_list);
48 if (list_empty(array->queue + p->prio)) 46 if (list_empty(array->queue + p->prio))
@@ -75,7 +73,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
75 resched_task(rq->curr); 73 resched_task(rq->curr);
76} 74}
77 75
78static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) 76static struct task_struct *pick_next_task_rt(struct rq *rq)
79{ 77{
80 struct rt_prio_array *array = &rq->rt.active; 78 struct rt_prio_array *array = &rq->rt.active;
81 struct task_struct *next; 79 struct task_struct *next;
@@ -89,14 +87,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now)
89 queue = array->queue + idx; 87 queue = array->queue + idx;
90 next = list_entry(queue->next, struct task_struct, run_list); 88 next = list_entry(queue->next, struct task_struct, run_list);
91 89
92 next->se.exec_start = now; 90 next->se.exec_start = rq->clock;
93 91
94 return next; 92 return next;
95} 93}
96 94
97static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) 95static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
98{ 96{
99 update_curr_rt(rq, now); 97 update_curr_rt(rq);
100 p->se.exec_start = 0; 98 p->se.exec_start = 0;
101} 99}
102 100
@@ -172,28 +170,15 @@ static struct task_struct *load_balance_next_rt(void *arg)
172 return p; 170 return p;
173} 171}
174 172
175static int 173static unsigned long
176load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, 174load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
177 unsigned long max_nr_move, unsigned long max_load_move, 175 unsigned long max_nr_move, unsigned long max_load_move,
178 struct sched_domain *sd, enum cpu_idle_type idle, 176 struct sched_domain *sd, enum cpu_idle_type idle,
179 int *all_pinned, unsigned long *load_moved) 177 int *all_pinned, int *this_best_prio)
180{ 178{
181 int this_best_prio, best_prio, best_prio_seen = 0;
182 int nr_moved; 179 int nr_moved;
183 struct rq_iterator rt_rq_iterator; 180 struct rq_iterator rt_rq_iterator;
184 181 unsigned long load_moved;
185 best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
186 this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
187
188 /*
189 * Enable handling of the case where there is more than one task
190 * with the best priority. If the current running task is one
191 * of those with prio==best_prio we know it won't be moved
192 * and therefore it's safe to override the skip (based on load)
193 * of any task we find with that prio.
194 */
195 if (busiest->curr->prio == best_prio)
196 best_prio_seen = 1;
197 182
198 rt_rq_iterator.start = load_balance_start_rt; 183 rt_rq_iterator.start = load_balance_start_rt;
199 rt_rq_iterator.next = load_balance_next_rt; 184 rt_rq_iterator.next = load_balance_next_rt;
@@ -203,11 +188,10 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
203 rt_rq_iterator.arg = busiest; 188 rt_rq_iterator.arg = busiest;
204 189
205 nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, 190 nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
206 max_load_move, sd, idle, all_pinned, load_moved, 191 max_load_move, sd, idle, all_pinned, &load_moved,
207 this_best_prio, best_prio, best_prio_seen, 192 this_best_prio, &rt_rq_iterator);
208 &rt_rq_iterator);
209 193
210 return nr_moved; 194 return load_moved;
211} 195}
212 196
213static void task_tick_rt(struct rq *rq, struct task_struct *p) 197static void task_tick_rt(struct rq *rq, struct task_struct *p)
@@ -223,19 +207,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
223 return; 207 return;
224 208
225 p->time_slice = static_prio_timeslice(p->static_prio); 209 p->time_slice = static_prio_timeslice(p->static_prio);
226 set_tsk_need_resched(p);
227
228 /* put it at the end of the queue: */
229 requeue_task_rt(rq, p);
230}
231 210
232/* 211 /*
233 * No parent/child timeslice management necessary for RT tasks, 212 * Requeue to the end of queue if we are not the only element
234 * just activate them: 213 * on the queue:
235 */ 214 */
236static void task_new_rt(struct rq *rq, struct task_struct *p) 215 if (p->run_list.prev != p->run_list.next) {
237{ 216 requeue_task_rt(rq, p);
238 activate_task(rq, p, 1); 217 set_tsk_need_resched(p);
218 }
239} 219}
240 220
241static struct sched_class rt_sched_class __read_mostly = { 221static struct sched_class rt_sched_class __read_mostly = {
@@ -251,5 +231,4 @@ static struct sched_class rt_sched_class __read_mostly = {
251 .load_balance = load_balance_rt, 231 .load_balance = load_balance_rt,
252 232
253 .task_tick = task_tick_rt, 233 .task_tick = task_tick_rt,
254 .task_new = task_new_rt,
255}; 234};
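
task_tick_rt() now requeues a round-robin task only when it is not alone on its priority queue, which it detects by comparing p->run_list.prev with p->run_list.next: on a kernel-style circular list a sole entry has both pointers aimed at the list head. A tiny standalone model of that test (modelled loosely on <linux/list.h>, not the real header):

#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
        new->prev = head->prev;
        new->next = head;
        head->prev->next = new;
        head->prev = new;
}

int main(void)
{
        struct list_head queue, a, b;

        list_init(&queue);
        list_add_tail(&a, &queue);
        /* sole element: both neighbours are the queue head itself */
        printf("alone: %d\n", a.prev == a.next);        /* prints 1 */

        list_add_tail(&b, &queue);
        /* two elements: prev (the head) and next (b) now differ */
        printf("alone: %d\n", a.prev == a.next);        /* prints 0 */
        return 0;
}
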
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index c63c38f6fa6e..c20a94dda61e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -116,6 +116,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
116} 116}
117# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 117# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
118# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 118# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
119# define schedstat_set(var, val) do { var = (val); } while (0)
119#else /* !CONFIG_SCHEDSTATS */ 120#else /* !CONFIG_SCHEDSTATS */
120static inline void 121static inline void
121rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 122rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -125,6 +126,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
125{} 126{}
126# define schedstat_inc(rq, field) do { } while (0) 127# define schedstat_inc(rq, field) do { } while (0)
127# define schedstat_add(rq, field, amt) do { } while (0) 128# define schedstat_add(rq, field, amt) do { } while (0)
129# define schedstat_set(var, val) do { } while (0)
128#endif 130#endif
129 131
130#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 132#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
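
The new schedstat_set() follows the same pattern as the existing schedstat_inc()/schedstat_add() helpers: a real assignment when CONFIG_SCHEDSTATS is enabled, an empty do { } while (0) when it is not, so callers such as update_curr_rt() need no #ifdefs. A compile-time sketch of that pattern with a mock config switch (MOCK_SCHEDSTATS and the max() macro below are local stand-ins, not kernel definitions):

#include <stdio.h>

#define MOCK_SCHEDSTATS 1       /* flip to 0 to compile the update away */

#if MOCK_SCHEDSTATS
# define schedstat_set(var, val)        do { (var) = (val); } while (0)
#else
# define schedstat_set(var, val)        do { } while (0)
#endif

#define max(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
        unsigned long long exec_max = 0, delta_exec = 1234;

        /* mirrors update_curr_rt(): keep the largest observed slice */
        schedstat_set(exec_max, max(exec_max, delta_exec));
        printf("exec_max=%llu\n", exec_max);
        return 0;
}
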
diff --git a/kernel/signal.c b/kernel/signal.c
index 39d122753bac..792952381092 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -255,6 +255,16 @@ flush_signal_handlers(struct task_struct *t, int force_default)
255 } 255 }
256} 256}
257 257
258int unhandled_signal(struct task_struct *tsk, int sig)
259{
260 if (is_init(tsk))
261 return 1;
262 if (tsk->ptrace & PT_PTRACED)
263 return 0;
264 return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
265 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
266}
267
258 268
259/* Notify the system that a driver wants to block all signals for this 269/* Notify the system that a driver wants to block all signals for this
260 * process, and wants to be notified if any signals at all were to be 270 * process, and wants to be notified if any signals at all were to be
@@ -368,8 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
368 /* We only dequeue private signals from ourselves, we don't let 378 /* We only dequeue private signals from ourselves, we don't let
369 * signalfd steal them 379 * signalfd steal them
370 */ 380 */
371 if (tsk == current) 381 signr = __dequeue_signal(&tsk->pending, mask, info);
372 signr = __dequeue_signal(&tsk->pending, mask, info);
373 if (!signr) { 382 if (!signr) {
374 signr = __dequeue_signal(&tsk->signal->shared_pending, 383 signr = __dequeue_signal(&tsk->signal->shared_pending,
375 mask, info); 384 mask, info);
@@ -397,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
397 } 406 }
398 } 407 }
399 } 408 }
400 if (likely(tsk == current)) 409 recalc_sigpending();
401 recalc_sigpending();
402 if (signr && unlikely(sig_kernel_stop(signr))) { 410 if (signr && unlikely(sig_kernel_stop(signr))) {
403 /* 411 /*
404 * Set a marker that we have dequeued a stop signal. Our 412 * Set a marker that we have dequeued a stop signal. Our
@@ -415,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
415 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 423 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
416 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 424 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
417 } 425 }
418 if ( signr && 426 if (signr &&
419 ((info->si_code & __SI_MASK) == __SI_TIMER) && 427 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
420 info->si_sys_private){ 428 info->si_sys_private){
421 /* 429 /*
@@ -523,18 +531,18 @@ static int check_kill_permission(int sig, struct siginfo *info,
523 if (!valid_signal(sig)) 531 if (!valid_signal(sig))
524 return error; 532 return error;
525 533
526 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 534 if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) {
527 if (error) 535 error = audit_signal_info(sig, t); /* Let audit system see the signal */
528 return error; 536 if (error)
529 537 return error;
530 error = -EPERM; 538 error = -EPERM;
531 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) 539 if (((sig != SIGCONT) ||
532 && ((sig != SIGCONT) || 540 (process_session(current) != process_session(t)))
533 (process_session(current) != process_session(t))) 541 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
534 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 542 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
535 && (current->uid ^ t->suid) && (current->uid ^ t->uid) 543 && !capable(CAP_KILL))
536 && !capable(CAP_KILL))
537 return error; 544 return error;
545 }
538 546
539 return security_task_kill(t, info, sig, 0); 547 return security_task_kill(t, info, sig, 0);
540} 548}
@@ -1290,20 +1298,19 @@ struct sigqueue *sigqueue_alloc(void)
1290void sigqueue_free(struct sigqueue *q) 1298void sigqueue_free(struct sigqueue *q)
1291{ 1299{
1292 unsigned long flags; 1300 unsigned long flags;
1301 spinlock_t *lock = &current->sighand->siglock;
1302
1293 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1303 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1294 /* 1304 /*
1295 * If the signal is still pending remove it from the 1305 * If the signal is still pending remove it from the
1296 * pending queue. 1306 * pending queue. We must hold ->siglock while testing
1307 * q->list to serialize with collect_signal().
1297 */ 1308 */
1298 if (unlikely(!list_empty(&q->list))) { 1309 spin_lock_irqsave(lock, flags);
1299 spinlock_t *lock = &current->sighand->siglock; 1310 if (!list_empty(&q->list))
1300 read_lock(&tasklist_lock); 1311 list_del_init(&q->list);
1301 spin_lock_irqsave(lock, flags); 1312 spin_unlock_irqrestore(lock, flags);
1302 if (!list_empty(&q->list)) 1313
1303 list_del_init(&q->list);
1304 spin_unlock_irqrestore(lock, flags);
1305 read_unlock(&tasklist_lock);
1306 }
1307 q->flags &= ~SIGQUEUE_PREALLOC; 1314 q->flags &= ~SIGQUEUE_PREALLOC;
1308 __sigqueue_free(q); 1315 __sigqueue_free(q);
1309} 1316}
@@ -1551,10 +1558,6 @@ static inline int may_ptrace_stop(void)
1551 (current->ptrace & PT_ATTACHED))) 1558 (current->ptrace & PT_ATTACHED)))
1552 return 0; 1559 return 0;
1553 1560
1554 if (unlikely(current->signal == current->parent->signal) &&
1555 unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))
1556 return 0;
1557
1558 /* 1561 /*
1559 * Are we in the middle of do_coredump? 1562 * Are we in the middle of do_coredump?
1560 * If so and our tracer is also part of the coredump stopping 1563 * If so and our tracer is also part of the coredump stopping
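
The new unhandled_signal() helper (used together with the exception-trace sysctl added further down in sysctl.c) treats a signal as unhandled when its disposition is SIG_DFL or SIG_IGN, with special cases for init and ptraced tasks. A userspace approximation of the core test using sigaction(); the init/ptrace special cases are omitted and signal_is_unhandled() is a name invented for this sketch, not a kernel or libc interface:

#define _POSIX_C_SOURCE 200809L
#include <signal.h>
#include <stdio.h>

/* Returns 1 when the calling process has no handler installed for sig. */
static int signal_is_unhandled(int sig)
{
        struct sigaction sa;

        if (sigaction(sig, NULL, &sa) != 0)
                return 1;       /* treat lookup failure as "unhandled" */

        return sa.sa_handler == SIG_DFL || sa.sa_handler == SIG_IGN;
}

static void handler(int sig) { (void)sig; }

int main(void)
{
        printf("SIGUSR1 unhandled before: %d\n", signal_is_unhandled(SIGUSR1));
        signal(SIGUSR1, handler);
        printf("SIGUSR1 unhandled after:  %d\n", signal_is_unhandled(SIGUSR1));
        return 0;
}
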
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0f546ddea43d..bd89bc4eb0b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -271,8 +271,6 @@ asmlinkage void do_softirq(void)
271 local_irq_restore(flags); 271 local_irq_restore(flags);
272} 272}
273 273
274EXPORT_SYMBOL(do_softirq);
275
276#endif 274#endif
277 275
278/* 276/*
@@ -332,8 +330,6 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr)
332 wakeup_softirqd(); 330 wakeup_softirqd();
333} 331}
334 332
335EXPORT_SYMBOL(raise_softirq_irqoff);
336
337void fastcall raise_softirq(unsigned int nr) 333void fastcall raise_softirq(unsigned int nr)
338{ 334{
339 unsigned long flags; 335 unsigned long flags;
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 2c6c2bf85514..cd72424c2662 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -72,7 +72,7 @@ void __lockfunc _read_lock(rwlock_t *lock)
72{ 72{
73 preempt_disable(); 73 preempt_disable();
74 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 74 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
75 _raw_read_lock(lock); 75 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
76} 76}
77EXPORT_SYMBOL(_read_lock); 77EXPORT_SYMBOL(_read_lock);
78 78
@@ -88,8 +88,8 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
88 * _raw_spin_lock_flags() code, because lockdep assumes 88 * _raw_spin_lock_flags() code, because lockdep assumes
89 * that interrupts are not re-enabled during lock-acquire: 89 * that interrupts are not re-enabled during lock-acquire:
90 */ 90 */
91#ifdef CONFIG_PROVE_LOCKING 91#ifdef CONFIG_LOCKDEP
92 _raw_spin_lock(lock); 92 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
93#else 93#else
94 _raw_spin_lock_flags(lock, &flags); 94 _raw_spin_lock_flags(lock, &flags);
95#endif 95#endif
@@ -102,7 +102,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock)
102 local_irq_disable(); 102 local_irq_disable();
103 preempt_disable(); 103 preempt_disable();
104 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 104 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
105 _raw_spin_lock(lock); 105 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
106} 106}
107EXPORT_SYMBOL(_spin_lock_irq); 107EXPORT_SYMBOL(_spin_lock_irq);
108 108
@@ -111,7 +111,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock)
111 local_bh_disable(); 111 local_bh_disable();
112 preempt_disable(); 112 preempt_disable();
113 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 113 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
114 _raw_spin_lock(lock); 114 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
115} 115}
116EXPORT_SYMBOL(_spin_lock_bh); 116EXPORT_SYMBOL(_spin_lock_bh);
117 117
@@ -122,7 +122,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
122 local_irq_save(flags); 122 local_irq_save(flags);
123 preempt_disable(); 123 preempt_disable();
124 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 124 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
125 _raw_read_lock(lock); 125 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
126 return flags; 126 return flags;
127} 127}
128EXPORT_SYMBOL(_read_lock_irqsave); 128EXPORT_SYMBOL(_read_lock_irqsave);
@@ -132,7 +132,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock)
132 local_irq_disable(); 132 local_irq_disable();
133 preempt_disable(); 133 preempt_disable();
134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
135 _raw_read_lock(lock); 135 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
136} 136}
137EXPORT_SYMBOL(_read_lock_irq); 137EXPORT_SYMBOL(_read_lock_irq);
138 138
@@ -141,7 +141,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock)
141 local_bh_disable(); 141 local_bh_disable();
142 preempt_disable(); 142 preempt_disable();
143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
144 _raw_read_lock(lock); 144 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
145} 145}
146EXPORT_SYMBOL(_read_lock_bh); 146EXPORT_SYMBOL(_read_lock_bh);
147 147
@@ -152,7 +152,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
152 local_irq_save(flags); 152 local_irq_save(flags);
153 preempt_disable(); 153 preempt_disable();
154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
155 _raw_write_lock(lock); 155 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
156 return flags; 156 return flags;
157} 157}
158EXPORT_SYMBOL(_write_lock_irqsave); 158EXPORT_SYMBOL(_write_lock_irqsave);
@@ -162,7 +162,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock)
162 local_irq_disable(); 162 local_irq_disable();
163 preempt_disable(); 163 preempt_disable();
164 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 164 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
165 _raw_write_lock(lock); 165 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
166} 166}
167EXPORT_SYMBOL(_write_lock_irq); 167EXPORT_SYMBOL(_write_lock_irq);
168 168
@@ -171,7 +171,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock)
171 local_bh_disable(); 171 local_bh_disable();
172 preempt_disable(); 172 preempt_disable();
173 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 173 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
174 _raw_write_lock(lock); 174 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
175} 175}
176EXPORT_SYMBOL(_write_lock_bh); 176EXPORT_SYMBOL(_write_lock_bh);
177 177
@@ -179,7 +179,7 @@ void __lockfunc _spin_lock(spinlock_t *lock)
179{ 179{
180 preempt_disable(); 180 preempt_disable();
181 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); 181 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
182 _raw_spin_lock(lock); 182 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
183} 183}
184 184
185EXPORT_SYMBOL(_spin_lock); 185EXPORT_SYMBOL(_spin_lock);
@@ -188,7 +188,7 @@ void __lockfunc _write_lock(rwlock_t *lock)
188{ 188{
189 preempt_disable(); 189 preempt_disable();
190 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 190 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
191 _raw_write_lock(lock); 191 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
192} 192}
193 193
194EXPORT_SYMBOL(_write_lock); 194EXPORT_SYMBOL(_write_lock);
@@ -289,7 +289,7 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
289{ 289{
290 preempt_disable(); 290 preempt_disable();
291 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 291 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
292 _raw_spin_lock(lock); 292 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
293} 293}
294 294
295EXPORT_SYMBOL(_spin_lock_nested); 295EXPORT_SYMBOL(_spin_lock_nested);
@@ -305,8 +305,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
305 * _raw_spin_lock_flags() code, because lockdep assumes 305 * _raw_spin_lock_flags() code, because lockdep assumes
306 * that interrupts are not re-enabled during lock-acquire: 306 * that interrupts are not re-enabled during lock-acquire:
307 */ 307 */
308#ifdef CONFIG_PROVE_SPIN_LOCKING 308#ifdef CONFIG_LOCKDEP
309 _raw_spin_lock(lock); 309 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
310#else 310#else
311 _raw_spin_lock_flags(lock, &flags); 311 _raw_spin_lock_flags(lock, &flags);
312#endif 312#endif
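
The spinlock wrappers above now funnel through LOCK_CONTENDED(), which tries the lock's trylock fast path first and only records a contention event (for CONFIG_LOCK_STAT) before falling back to the blocking acquire. A pthread-based sketch of that try-then-wait shape; note that the return convention is inverted relative to the kernel (pthread_mutex_trylock() returns 0 on success), and contended_hits merely stands in for the lockstat bookkeeping:

#include <pthread.h>
#include <stdio.h>

static unsigned long contended_hits;    /* stand-in for the lockstat counter */

#define LOCK_CONTENDED_MOCK(l, try, lock)       \
        do {                                    \
                if (try(l) != 0) {              \
                        contended_hits++;       \
                        lock(l);                \
                }                               \
        } while (0)

int main(void)
{
        pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

        /* uncontended: the trylock succeeds, no slow path, no stat */
        LOCK_CONTENDED_MOCK(&m, pthread_mutex_trylock, pthread_mutex_lock);
        pthread_mutex_unlock(&m);

        printf("contended %lu time(s)\n", contended_hits);
        return 0;
}
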
diff --git a/kernel/sys.c b/kernel/sys.c
index 4d141ae3e802..8ae2e636eb1b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -32,6 +32,7 @@
32#include <linux/getcpu.h> 32#include <linux/getcpu.h>
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h>
35 36
36#include <linux/compat.h> 37#include <linux/compat.h>
37#include <linux/syscalls.h> 38#include <linux/syscalls.h>
@@ -100,6 +101,13 @@ struct pid *cad_pid;
100EXPORT_SYMBOL(cad_pid); 101EXPORT_SYMBOL(cad_pid);
101 102
102/* 103/*
104 * If set, this is used for preparing the system to power off.
105 */
106
107void (*pm_power_off_prepare)(void);
108EXPORT_SYMBOL(pm_power_off_prepare);
109
110/*
103 * Notifier list for kernel code which wants to be called 111 * Notifier list for kernel code which wants to be called
104 * at shutdown. This is used to stop any idling DMA operations 112 * at shutdown. This is used to stop any idling DMA operations
105 * and the like. 113 * and the like.
@@ -797,6 +805,7 @@ static void kernel_restart_prepare(char *cmd)
797 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 805 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
798 system_state = SYSTEM_RESTART; 806 system_state = SYSTEM_RESTART;
799 device_shutdown(); 807 device_shutdown();
808 sysdev_shutdown();
800} 809}
801 810
802/** 811/**
@@ -853,6 +862,7 @@ void kernel_shutdown_prepare(enum system_states state)
853void kernel_halt(void) 862void kernel_halt(void)
854{ 863{
855 kernel_shutdown_prepare(SYSTEM_HALT); 864 kernel_shutdown_prepare(SYSTEM_HALT);
865 sysdev_shutdown();
856 printk(KERN_EMERG "System halted.\n"); 866 printk(KERN_EMERG "System halted.\n");
857 machine_halt(); 867 machine_halt();
858} 868}
@@ -867,6 +877,10 @@ EXPORT_SYMBOL_GPL(kernel_halt);
867void kernel_power_off(void) 877void kernel_power_off(void)
868{ 878{
869 kernel_shutdown_prepare(SYSTEM_POWER_OFF); 879 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
880 if (pm_power_off_prepare)
881 pm_power_off_prepare();
882 disable_nonboot_cpus();
883 sysdev_shutdown();
870 printk(KERN_EMERG "Power down.\n"); 884 printk(KERN_EMERG "Power down.\n");
871 machine_power_off(); 885 machine_power_off();
872} 886}
@@ -942,7 +956,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
942 unlock_kernel(); 956 unlock_kernel();
943 return -EINVAL; 957 return -EINVAL;
944 958
945#ifdef CONFIG_SOFTWARE_SUSPEND 959#ifdef CONFIG_HIBERNATION
946 case LINUX_REBOOT_CMD_SW_SUSPEND: 960 case LINUX_REBOOT_CMD_SW_SUSPEND:
947 { 961 {
948 int ret = hibernate(); 962 int ret = hibernate();
@@ -1027,7 +1041,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
1027 return -EPERM; 1041 return -EPERM;
1028 } 1042 }
1029 if (new_egid != old_egid) { 1043 if (new_egid != old_egid) {
1030 current->mm->dumpable = suid_dumpable; 1044 set_dumpable(current->mm, suid_dumpable);
1031 smp_wmb(); 1045 smp_wmb();
1032 } 1046 }
1033 if (rgid != (gid_t) -1 || 1047 if (rgid != (gid_t) -1 ||
@@ -1057,13 +1071,13 @@ asmlinkage long sys_setgid(gid_t gid)
1057 1071
1058 if (capable(CAP_SETGID)) { 1072 if (capable(CAP_SETGID)) {
1059 if (old_egid != gid) { 1073 if (old_egid != gid) {
1060 current->mm->dumpable = suid_dumpable; 1074 set_dumpable(current->mm, suid_dumpable);
1061 smp_wmb(); 1075 smp_wmb();
1062 } 1076 }
1063 current->gid = current->egid = current->sgid = current->fsgid = gid; 1077 current->gid = current->egid = current->sgid = current->fsgid = gid;
1064 } else if ((gid == current->gid) || (gid == current->sgid)) { 1078 } else if ((gid == current->gid) || (gid == current->sgid)) {
1065 if (old_egid != gid) { 1079 if (old_egid != gid) {
1066 current->mm->dumpable = suid_dumpable; 1080 set_dumpable(current->mm, suid_dumpable);
1067 smp_wmb(); 1081 smp_wmb();
1068 } 1082 }
1069 current->egid = current->fsgid = gid; 1083 current->egid = current->fsgid = gid;
@@ -1094,7 +1108,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
1094 switch_uid(new_user); 1108 switch_uid(new_user);
1095 1109
1096 if (dumpclear) { 1110 if (dumpclear) {
1097 current->mm->dumpable = suid_dumpable; 1111 set_dumpable(current->mm, suid_dumpable);
1098 smp_wmb(); 1112 smp_wmb();
1099 } 1113 }
1100 current->uid = new_ruid; 1114 current->uid = new_ruid;
@@ -1150,7 +1164,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
1150 return -EAGAIN; 1164 return -EAGAIN;
1151 1165
1152 if (new_euid != old_euid) { 1166 if (new_euid != old_euid) {
1153 current->mm->dumpable = suid_dumpable; 1167 set_dumpable(current->mm, suid_dumpable);
1154 smp_wmb(); 1168 smp_wmb();
1155 } 1169 }
1156 current->fsuid = current->euid = new_euid; 1170 current->fsuid = current->euid = new_euid;
@@ -1200,7 +1214,7 @@ asmlinkage long sys_setuid(uid_t uid)
1200 return -EPERM; 1214 return -EPERM;
1201 1215
1202 if (old_euid != uid) { 1216 if (old_euid != uid) {
1203 current->mm->dumpable = suid_dumpable; 1217 set_dumpable(current->mm, suid_dumpable);
1204 smp_wmb(); 1218 smp_wmb();
1205 } 1219 }
1206 current->fsuid = current->euid = uid; 1220 current->fsuid = current->euid = uid;
@@ -1245,7 +1259,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
1245 } 1259 }
1246 if (euid != (uid_t) -1) { 1260 if (euid != (uid_t) -1) {
1247 if (euid != current->euid) { 1261 if (euid != current->euid) {
1248 current->mm->dumpable = suid_dumpable; 1262 set_dumpable(current->mm, suid_dumpable);
1249 smp_wmb(); 1263 smp_wmb();
1250 } 1264 }
1251 current->euid = euid; 1265 current->euid = euid;
@@ -1295,7 +1309,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
1295 } 1309 }
1296 if (egid != (gid_t) -1) { 1310 if (egid != (gid_t) -1) {
1297 if (egid != current->egid) { 1311 if (egid != current->egid) {
1298 current->mm->dumpable = suid_dumpable; 1312 set_dumpable(current->mm, suid_dumpable);
1299 smp_wmb(); 1313 smp_wmb();
1300 } 1314 }
1301 current->egid = egid; 1315 current->egid = egid;
@@ -1341,7 +1355,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
1341 uid == current->suid || uid == current->fsuid || 1355 uid == current->suid || uid == current->fsuid ||
1342 capable(CAP_SETUID)) { 1356 capable(CAP_SETUID)) {
1343 if (uid != old_fsuid) { 1357 if (uid != old_fsuid) {
1344 current->mm->dumpable = suid_dumpable; 1358 set_dumpable(current->mm, suid_dumpable);
1345 smp_wmb(); 1359 smp_wmb();
1346 } 1360 }
1347 current->fsuid = uid; 1361 current->fsuid = uid;
@@ -1370,7 +1384,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
1370 gid == current->sgid || gid == current->fsgid || 1384 gid == current->sgid || gid == current->fsgid ||
1371 capable(CAP_SETGID)) { 1385 capable(CAP_SETGID)) {
1372 if (gid != old_fsgid) { 1386 if (gid != old_fsgid) {
1373 current->mm->dumpable = suid_dumpable; 1387 set_dumpable(current->mm, suid_dumpable);
1374 smp_wmb(); 1388 smp_wmb();
1375 } 1389 }
1376 current->fsgid = gid; 1390 current->fsgid = gid;
@@ -1430,7 +1444,6 @@ asmlinkage long sys_times(struct tms __user * tbuf)
1430 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. 1444 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
1431 * LBT 04.03.94 1445 * LBT 04.03.94
1432 */ 1446 */
1433
1434asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) 1447asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1435{ 1448{
1436 struct task_struct *p; 1449 struct task_struct *p;
@@ -1458,7 +1471,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1458 if (!thread_group_leader(p)) 1471 if (!thread_group_leader(p))
1459 goto out; 1472 goto out;
1460 1473
1461 if (p->real_parent == group_leader) { 1474 if (p->real_parent->tgid == group_leader->tgid) {
1462 err = -EPERM; 1475 err = -EPERM;
1463 if (task_session(p) != task_session(group_leader)) 1476 if (task_session(p) != task_session(group_leader))
1464 goto out; 1477 goto out;
@@ -2167,14 +2180,14 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2167 error = put_user(current->pdeath_signal, (int __user *)arg2); 2180 error = put_user(current->pdeath_signal, (int __user *)arg2);
2168 break; 2181 break;
2169 case PR_GET_DUMPABLE: 2182 case PR_GET_DUMPABLE:
2170 error = current->mm->dumpable; 2183 error = get_dumpable(current->mm);
2171 break; 2184 break;
2172 case PR_SET_DUMPABLE: 2185 case PR_SET_DUMPABLE:
2173 if (arg2 < 0 || arg2 > 1) { 2186 if (arg2 < 0 || arg2 > 1) {
2174 error = -EINVAL; 2187 error = -EINVAL;
2175 break; 2188 break;
2176 } 2189 }
2177 current->mm->dumpable = arg2; 2190 set_dumpable(current->mm, arg2);
2178 break; 2191 break;
2179 2192
2180 case PR_SET_UNALIGN: 2193 case PR_SET_UNALIGN:
@@ -2286,3 +2299,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
2286 } 2299 }
2287 return err ? -EFAULT : 0; 2300 return err ? -EFAULT : 0;
2288} 2301}
2302
2303char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2304
2305static void argv_cleanup(char **argv, char **envp)
2306{
2307 argv_free(argv);
2308}
2309
2310/**
2311 * orderly_poweroff - Trigger an orderly system poweroff
2312 * @force: force poweroff if command execution fails
2313 *
2314 * This may be called from any context to trigger a system shutdown.
2315 * If the orderly shutdown fails, it will force an immediate shutdown.
2316 */
2317int orderly_poweroff(bool force)
2318{
2319 int argc;
2320 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2321 static char *envp[] = {
2322 "HOME=/",
2323 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2324 NULL
2325 };
2326 int ret = -ENOMEM;
2327 struct subprocess_info *info;
2328
2329 if (argv == NULL) {
2330 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2331 __func__, poweroff_cmd);
2332 goto out;
2333 }
2334
2335 info = call_usermodehelper_setup(argv[0], argv, envp);
2336 if (info == NULL) {
2337 argv_free(argv);
2338 goto out;
2339 }
2340
2341 call_usermodehelper_setcleanup(info, argv_cleanup);
2342
2343 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
2344
2345 out:
2346 if (ret && force) {
2347 printk(KERN_WARNING "Failed to start orderly shutdown: "
2348 "forcing the issue\n");
2349
2350 /* I guess this should try to kick off some daemon to
2351 sync and poweroff asap. Or not even bother syncing
2352 if we're doing an emergency shutdown? */
2353 emergency_sync();
2354 kernel_power_off();
2355 }
2356
2357 return ret;
2358}
2359EXPORT_SYMBOL_GPL(orderly_poweroff);
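
orderly_poweroff() splits the sysctl-tunable poweroff_cmd string into an argv, hands it to the usermode-helper machinery with a minimal environment, and only falls back to an emergency sync plus forced power-off when that fails and the caller asked for force. A userspace sketch of the same split, exec and fallback flow; it waits for the child (the kernel path uses UMH_NO_WAIT), and run_helper() and its whitespace tokeniser are simplifications invented for this sketch, not kernel interfaces:

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/wait.h>

static int run_helper(const char *cmd)
{
        char *argv[8] = { NULL };
        char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
        char buf[128];
        int argc = 0;
        pid_t pid;

        /* crude stand-in for argv_split(): whitespace tokenising */
        snprintf(buf, sizeof(buf), "%s", cmd);
        for (char *tok = strtok(buf, " "); tok && argc < 7; tok = strtok(NULL, " "))
                argv[argc++] = tok;

        pid = fork();
        if (pid == 0) {
                execve(argv[0], argv, envp);
                _exit(127);             /* exec failed */
        }
        if (pid < 0)
                return -1;

        int status;
        waitpid(pid, &status, 0);
        return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}

int main(void)
{
        if (run_helper("/bin/true") != 0)
                fprintf(stderr, "orderly path failed: would force the shutdown here\n");
        else
                printf("orderly helper ran\n");
        return 0;
}
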
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7063ebc6db05..c7314f952647 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,7 +27,6 @@
27#include <linux/capability.h> 27#include <linux/capability.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/capability.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/fs.h> 31#include <linux/fs.h>
33#include <linux/init.h> 32#include <linux/init.h>
@@ -46,6 +45,7 @@
46#include <linux/syscalls.h> 45#include <linux/syscalls.h>
47#include <linux/nfs_fs.h> 46#include <linux/nfs_fs.h>
48#include <linux/acpi.h> 47#include <linux/acpi.h>
48#include <linux/reboot.h>
49 49
50#include <asm/uaccess.h> 50#include <asm/uaccess.h>
51#include <asm/processor.h> 51#include <asm/processor.h>
@@ -77,6 +77,7 @@ extern int percpu_pagelist_fraction;
77extern int compat_log; 77extern int compat_log;
78extern int maps_protect; 78extern int maps_protect;
79extern int sysctl_stat_interval; 79extern int sysctl_stat_interval;
80extern int audit_argv_kb;
80 81
81/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 82/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
82static int maxolduid = 65535; 83static int maxolduid = 65535;
@@ -159,6 +160,8 @@ extern ctl_table inotify_table[];
159int sysctl_legacy_va_layout; 160int sysctl_legacy_va_layout;
160#endif 161#endif
161 162
163extern int prove_locking;
164extern int lock_stat;
162 165
163/* The default sysctl tables: */ 166/* The default sysctl tables: */
164 167
@@ -219,8 +222,19 @@ static ctl_table kern_table[] = {
219#ifdef CONFIG_SCHED_DEBUG 222#ifdef CONFIG_SCHED_DEBUG
220 { 223 {
221 .ctl_name = CTL_UNNUMBERED, 224 .ctl_name = CTL_UNNUMBERED,
222 .procname = "sched_granularity_ns", 225 .procname = "sched_min_granularity_ns",
223 .data = &sysctl_sched_granularity, 226 .data = &sysctl_sched_min_granularity,
227 .maxlen = sizeof(unsigned int),
228 .mode = 0644,
229 .proc_handler = &proc_dointvec_minmax,
230 .strategy = &sysctl_intvec,
231 .extra1 = &min_sched_granularity_ns,
232 .extra2 = &max_sched_granularity_ns,
233 },
234 {
235 .ctl_name = CTL_UNNUMBERED,
236 .procname = "sched_latency_ns",
237 .data = &sysctl_sched_latency,
224 .maxlen = sizeof(unsigned int), 238 .maxlen = sizeof(unsigned int),
225 .mode = 0644, 239 .mode = 0644,
226 .proc_handler = &proc_dointvec_minmax, 240 .proc_handler = &proc_dointvec_minmax,
@@ -290,6 +304,34 @@ static ctl_table kern_table[] = {
290 }, 304 },
291#endif 305#endif
292 { 306 {
307 .ctl_name = CTL_UNNUMBERED,
308 .procname = "sched_compat_yield",
309 .data = &sysctl_sched_compat_yield,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = &proc_dointvec,
313 },
314#ifdef CONFIG_PROVE_LOCKING
315 {
316 .ctl_name = CTL_UNNUMBERED,
317 .procname = "prove_locking",
318 .data = &prove_locking,
319 .maxlen = sizeof(int),
320 .mode = 0644,
321 .proc_handler = &proc_dointvec,
322 },
323#endif
324#ifdef CONFIG_LOCK_STAT
325 {
326 .ctl_name = CTL_UNNUMBERED,
327 .procname = "lock_stat",
328 .data = &lock_stat,
329 .maxlen = sizeof(int),
330 .mode = 0644,
331 .proc_handler = &proc_dointvec,
332 },
333#endif
334 {
293 .ctl_name = KERN_PANIC, 335 .ctl_name = KERN_PANIC,
294 .procname = "panic", 336 .procname = "panic",
295 .data = &panic_timeout, 337 .data = &panic_timeout,
@@ -305,6 +347,16 @@ static ctl_table kern_table[] = {
305 .mode = 0644, 347 .mode = 0644,
306 .proc_handler = &proc_dointvec, 348 .proc_handler = &proc_dointvec,
307 }, 349 },
350#ifdef CONFIG_AUDITSYSCALL
351 {
352 .ctl_name = CTL_UNNUMBERED,
353 .procname = "audit_argv_kb",
354 .data = &audit_argv_kb,
355 .maxlen = sizeof(int),
356 .mode = 0644,
357 .proc_handler = &proc_dointvec,
358 },
359#endif
308 { 360 {
309 .ctl_name = KERN_CORE_PATTERN, 361 .ctl_name = KERN_CORE_PATTERN,
310 .procname = "core_pattern", 362 .procname = "core_pattern",
@@ -655,11 +707,11 @@ static ctl_table kern_table[] = {
655 .proc_handler = &proc_dointvec, 707 .proc_handler = &proc_dointvec,
656 }, 708 },
657#endif 709#endif
658#ifdef CONFIG_ACPI_SLEEP 710#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
659 { 711 {
660 .ctl_name = KERN_ACPI_VIDEO_FLAGS, 712 .ctl_name = KERN_ACPI_VIDEO_FLAGS,
661 .procname = "acpi_video_flags", 713 .procname = "acpi_video_flags",
662 .data = &acpi_video_flags, 714 .data = &acpi_realmode_flags,
663 .maxlen = sizeof (unsigned long), 715 .maxlen = sizeof (unsigned long),
664 .mode = 0644, 716 .mode = 0644,
665 .proc_handler = &proc_doulongvec_minmax, 717 .proc_handler = &proc_doulongvec_minmax,
@@ -705,13 +757,26 @@ static ctl_table kern_table[] = {
705 .proc_handler = &proc_dointvec, 757 .proc_handler = &proc_dointvec,
706 }, 758 },
707#endif 759#endif
708 760 {
761 .ctl_name = CTL_UNNUMBERED,
762 .procname = "poweroff_cmd",
763 .data = &poweroff_cmd,
764 .maxlen = POWEROFF_CMD_PATH_LEN,
765 .mode = 0644,
766 .proc_handler = &proc_dostring,
767 .strategy = &sysctl_string,
768 },
769/*
770 * NOTE: do not add new entries to this table unless you have read
771 * Documentation/sysctl/ctl_unnumbered.txt
772 */
709 { .ctl_name = 0 } 773 { .ctl_name = 0 }
710}; 774};
711 775
712/* Constants for minimum and maximum testing in vm_table. 776/* Constants for minimum and maximum testing in vm_table.
713 We use these as one-element integer vectors. */ 777 We use these as one-element integer vectors. */
714static int zero; 778static int zero;
779static int two = 2;
715static int one_hundred = 100; 780static int one_hundred = 100;
716 781
717 782
@@ -976,6 +1041,7 @@ static ctl_table vm_table[] = {
976 .mode = 0644, 1041 .mode = 0644,
977 .proc_handler = &proc_doulongvec_minmax, 1042 .proc_handler = &proc_doulongvec_minmax,
978 }, 1043 },
1044#endif
979#ifdef CONFIG_NUMA 1045#ifdef CONFIG_NUMA
980 { 1046 {
981 .ctl_name = CTL_UNNUMBERED, 1047 .ctl_name = CTL_UNNUMBERED,
@@ -987,7 +1053,6 @@ static ctl_table vm_table[] = {
987 .strategy = &sysctl_string, 1053 .strategy = &sysctl_string,
988 }, 1054 },
989#endif 1055#endif
990#endif
991#if defined(CONFIG_X86_32) || \ 1056#if defined(CONFIG_X86_32) || \
992 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1057 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
993 { 1058 {
@@ -1102,7 +1167,10 @@ static ctl_table fs_table[] = {
1102 .data = &lease_break_time, 1167 .data = &lease_break_time,
1103 .maxlen = sizeof(int), 1168 .maxlen = sizeof(int),
1104 .mode = 0644, 1169 .mode = 0644,
1105 .proc_handler = &proc_dointvec, 1170 .proc_handler = &proc_dointvec_minmax,
1171 .strategy = &sysctl_intvec,
1172 .extra1 = &zero,
1173 .extra2 = &two,
1106 }, 1174 },
1107 { 1175 {
1108 .ctl_name = FS_AIO_NR, 1176 .ctl_name = FS_AIO_NR,
@@ -1153,6 +1221,16 @@ static ctl_table fs_table[] = {
1153}; 1221};
1154 1222
1155static ctl_table debug_table[] = { 1223static ctl_table debug_table[] = {
1224#if defined(CONFIG_X86) || defined(CONFIG_PPC)
1225 {
1226 .ctl_name = CTL_UNNUMBERED,
1227 .procname = "exception-trace",
1228 .data = &show_unhandled_signals,
1229 .maxlen = sizeof(int),
1230 .mode = 0644,
1231 .proc_handler = proc_dointvec
1232 },
1233#endif
1156 { .ctl_name = 0 } 1234 { .ctl_name = 0 }
1157}; 1235};
1158 1236
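Illustration (not part of the patch): the sysctl.c hunks above only register new CTL_UNNUMBERED entries; each one becomes a file under /proc/sys named after its .procname field. A minimal userspace sketch that reads one of them, assuming a kernel built with CONFIG_SCHED_DEBUG so that /proc/sys/kernel/sched_latency_ns exists:

#include <stdio.h>

int main(void)
{
	/* Path assumed from the .procname above; only present with CONFIG_SCHED_DEBUG. */
	FILE *f = fopen("/proc/sys/kernel/sched_latency_ns", "r");
	unsigned int latency_ns;

	if (!f || fscanf(f, "%u", &latency_ns) != 1) {
		perror("sched_latency_ns");
		return 1;
	}
	printf("sched_latency_ns = %u ns\n", latency_ns);
	fclose(f);
	return 0;
}

The same pattern applies to the other new entries in the table, for example sched_compat_yield or poweroff_cmd.
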
diff --git a/kernel/time.c b/kernel/time.c
index ffe19149d770..2289a8d68314 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -57,17 +57,14 @@ EXPORT_SYMBOL(sys_tz);
57 */ 57 */
58asmlinkage long sys_time(time_t __user * tloc) 58asmlinkage long sys_time(time_t __user * tloc)
59{ 59{
60 /* 60 time_t i;
61 * We read xtime.tv_sec atomically - it's updated 61 struct timespec tv;
62 * atomically by update_wall_time(), so no need to
63 * even read-lock the xtime seqlock:
64 */
65 time_t i = xtime.tv_sec;
66 62
67 smp_rmb(); /* sys_time() results are coherent */ 63 getnstimeofday(&tv);
64 i = tv.tv_sec;
68 65
69 if (tloc) { 66 if (tloc) {
70 if (put_user(i, tloc)) 67 if (put_user(i,tloc))
71 i = -EFAULT; 68 i = -EFAULT;
72 } 69 }
73 return i; 70 return i;
@@ -136,7 +133,6 @@ static inline void warp_clock(void)
136 write_seqlock_irq(&xtime_lock); 133 write_seqlock_irq(&xtime_lock);
137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 134 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
138 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 135 xtime.tv_sec += sys_tz.tz_minuteswest * 60;
139 time_interpolator_reset();
140 write_sequnlock_irq(&xtime_lock); 136 write_sequnlock_irq(&xtime_lock);
141 clock_was_set(); 137 clock_was_set();
142} 138}
@@ -219,22 +215,6 @@ asmlinkage long sys_adjtimex(struct timex __user *txc_p)
219 return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; 215 return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
220} 216}
221 217
222inline struct timespec current_kernel_time(void)
223{
224 struct timespec now;
225 unsigned long seq;
226
227 do {
228 seq = read_seqbegin(&xtime_lock);
229
230 now = xtime;
231 } while (read_seqretry(&xtime_lock, seq));
232
233 return now;
234}
235
236EXPORT_SYMBOL(current_kernel_time);
237
238/** 218/**
239 * current_fs_time - Return FS time 219 * current_fs_time - Return FS time
240 * @sb: Superblock. 220 * @sb: Superblock.
@@ -309,92 +289,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
309} 289}
310EXPORT_SYMBOL(timespec_trunc); 290EXPORT_SYMBOL(timespec_trunc);
311 291
312#ifdef CONFIG_TIME_INTERPOLATION
313void getnstimeofday (struct timespec *tv)
314{
315 unsigned long seq,sec,nsec;
316
317 do {
318 seq = read_seqbegin(&xtime_lock);
319 sec = xtime.tv_sec;
320 nsec = xtime.tv_nsec+time_interpolator_get_offset();
321 } while (unlikely(read_seqretry(&xtime_lock, seq)));
322
323 while (unlikely(nsec >= NSEC_PER_SEC)) {
324 nsec -= NSEC_PER_SEC;
325 ++sec;
326 }
327 tv->tv_sec = sec;
328 tv->tv_nsec = nsec;
329}
330EXPORT_SYMBOL_GPL(getnstimeofday);
331
332int do_settimeofday (struct timespec *tv)
333{
334 time_t wtm_sec, sec = tv->tv_sec;
335 long wtm_nsec, nsec = tv->tv_nsec;
336
337 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
338 return -EINVAL;
339
340 write_seqlock_irq(&xtime_lock);
341 {
342 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
343 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
344
345 set_normalized_timespec(&xtime, sec, nsec);
346 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
347
348 time_adjust = 0; /* stop active adjtime() */
349 time_status |= STA_UNSYNC;
350 time_maxerror = NTP_PHASE_LIMIT;
351 time_esterror = NTP_PHASE_LIMIT;
352 time_interpolator_reset();
353 }
354 write_sequnlock_irq(&xtime_lock);
355 clock_was_set();
356 return 0;
357}
358EXPORT_SYMBOL(do_settimeofday);
359
360void do_gettimeofday (struct timeval *tv)
361{
362 unsigned long seq, nsec, usec, sec, offset;
363 do {
364 seq = read_seqbegin(&xtime_lock);
365 offset = time_interpolator_get_offset();
366 sec = xtime.tv_sec;
367 nsec = xtime.tv_nsec;
368 } while (unlikely(read_seqretry(&xtime_lock, seq)));
369
370 usec = (nsec + offset) / 1000;
371
372 while (unlikely(usec >= USEC_PER_SEC)) {
373 usec -= USEC_PER_SEC;
374 ++sec;
375 }
376
377 tv->tv_sec = sec;
378 tv->tv_usec = usec;
379
380 /*
381 * Make sure xtime.tv_sec [returned by sys_time()] always
382 * follows the gettimeofday() result precisely. This
383 * condition is extremely unlikely, it can hit at most
384 * once per second:
385 */
386 if (unlikely(xtime.tv_sec != tv->tv_sec)) {
387 unsigned long flags;
388
389 write_seqlock_irqsave(&xtime_lock, flags);
390 update_wall_time();
391 write_sequnlock_irqrestore(&xtime_lock, flags);
392 }
393}
394EXPORT_SYMBOL(do_gettimeofday);
395
396#else /* CONFIG_TIME_INTERPOLATION */
397
398#ifndef CONFIG_GENERIC_TIME 292#ifndef CONFIG_GENERIC_TIME
399/* 293/*
400 * Simulate gettimeofday using do_gettimeofday which only allows a timeval 294 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
@@ -410,7 +304,6 @@ void getnstimeofday(struct timespec *tv)
410} 304}
411EXPORT_SYMBOL_GPL(getnstimeofday); 305EXPORT_SYMBOL_GPL(getnstimeofday);
412#endif 306#endif
413#endif /* CONFIG_TIME_INTERPOLATION */
414 307
415/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 308/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
416 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 309 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
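Illustration (not part of the patch): with sys_time() now built on getnstimeofday(), the seconds value returned by time() is derived from the same realtime clock that backs gettimeofday()/clock_gettime(). A rough userspace check, with the caveat that glibc may answer time() from the vDSO rather than via the syscall shown above:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;
	time_t t = time(NULL);            /* seconds, per the patched sys_time() path */

	if (clock_gettime(CLOCK_REALTIME, &ts))
		return 1;
	printf("time()         = %ld\n", (long)t);
	printf("CLOCK_REALTIME = %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
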
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f66351126544..8d53106a0a92 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -23,3 +23,8 @@ config HIGH_RES_TIMERS
23 hardware is not capable then this option only increases 23 hardware is not capable then this option only increases
24 the size of the kernel image. 24 the size of the kernel image.
25 25
26config GENERIC_CLOCKEVENTS_BUILD
27 bool
28 default y
29 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
30
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 99b6034fc86b..905b0b50792d 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,6 +1,6 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o 6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 2ad1c37b8dfe..822beebe664a 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -113,16 +113,6 @@ int clockevents_register_notifier(struct notifier_block *nb)
113 return ret; 113 return ret;
114} 114}
115 115
116/**
117 * clockevents_unregister_notifier - unregister a clock events change listener
118 */
119void clockevents_unregister_notifier(struct notifier_block *nb)
120{
121 spin_lock(&clockevents_lock);
122 raw_notifier_chain_unregister(&clockevents_chain, nb);
123 spin_unlock(&clockevents_lock);
124}
125
126/* 116/*
127 * Notify about a clock event change. Called with clockevents_lock 117 * Notify about a clock event change. Called with clockevents_lock
128 * held. 118 * held.
@@ -204,6 +194,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
204 local_irq_restore(flags); 194 local_irq_restore(flags);
205} 195}
206 196
197#ifdef CONFIG_GENERIC_CLOCKEVENTS
207/** 198/**
208 * clockevents_notify - notification about relevant events 199 * clockevents_notify - notification about relevant events
209 */ 200 */
@@ -232,4 +223,4 @@ void clockevents_notify(unsigned long reason, void *arg)
232 spin_unlock(&clockevents_lock); 223 spin_unlock(&clockevents_lock);
233} 224}
234EXPORT_SYMBOL_GPL(clockevents_notify); 225EXPORT_SYMBOL_GPL(clockevents_notify);
235 226#endif
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 438c6b723ee2..de6a2d6b3ebb 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/timer.h>
13#include <linux/timex.h> 14#include <linux/timex.h>
14#include <linux/jiffies.h> 15#include <linux/jiffies.h>
15#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
@@ -116,11 +117,6 @@ void second_overflow(void)
116 if (xtime.tv_sec % 86400 == 0) { 117 if (xtime.tv_sec % 86400 == 0) {
117 xtime.tv_sec--; 118 xtime.tv_sec--;
118 wall_to_monotonic.tv_sec++; 119 wall_to_monotonic.tv_sec++;
119 /*
120 * The timer interpolator will make time change
121 * gradually instead of an immediate jump by one second
122 */
123 time_interpolator_update(-NSEC_PER_SEC);
124 time_state = TIME_OOP; 120 time_state = TIME_OOP;
125 printk(KERN_NOTICE "Clock: inserting leap second " 121 printk(KERN_NOTICE "Clock: inserting leap second "
126 "23:59:60 UTC\n"); 122 "23:59:60 UTC\n");
@@ -130,11 +126,6 @@ void second_overflow(void)
130 if ((xtime.tv_sec + 1) % 86400 == 0) { 126 if ((xtime.tv_sec + 1) % 86400 == 0) {
131 xtime.tv_sec++; 127 xtime.tv_sec++;
132 wall_to_monotonic.tv_sec--; 128 wall_to_monotonic.tv_sec--;
133 /*
134 * Use of time interpolator for a gradual change of
135 * time
136 */
137 time_interpolator_update(NSEC_PER_SEC);
138 time_state = TIME_WAIT; 129 time_state = TIME_WAIT;
139 printk(KERN_NOTICE "Clock: deleting leap second " 130 printk(KERN_NOTICE "Clock: deleting leap second "
140 "23:59:59 UTC\n"); 131 "23:59:59 UTC\n");
@@ -185,12 +176,64 @@ u64 current_tick_length(void)
185 return tick_length; 176 return tick_length;
186} 177}
187 178
179#ifdef CONFIG_GENERIC_CMOS_UPDATE
188 180
189void __attribute__ ((weak)) notify_arch_cmos_timer(void) 181/* Disable the cmos update - used by virtualization and embedded */
182int no_sync_cmos_clock __read_mostly;
183
184static void sync_cmos_clock(unsigned long dummy);
185
186static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
187
188static void sync_cmos_clock(unsigned long dummy)
189{
190 struct timespec now, next;
191 int fail = 1;
192
193 /*
194 * If we have an externally synchronized Linux clock, then update
195 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
196 * called as close as possible to 500 ms before the new second starts.
197 * This code is run on a timer. If the clock is set, that timer
198 * may not expire at the correct time. Thus, we adjust...
199 */
200 if (!ntp_synced())
201 /*
202 * Not synced, exit, do not restart a timer (if one is
203 * running, let it run out).
204 */
205 return;
206
207 getnstimeofday(&now);
208 if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
209 fail = update_persistent_clock(now);
210
211 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
212 if (next.tv_nsec <= 0)
213 next.tv_nsec += NSEC_PER_SEC;
214
215 if (!fail)
216 next.tv_sec = 659;
217 else
218 next.tv_sec = 0;
219
220 if (next.tv_nsec >= NSEC_PER_SEC) {
221 next.tv_sec++;
222 next.tv_nsec -= NSEC_PER_SEC;
223 }
224 mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next));
225}
226
227static void notify_cmos_timer(void)
190{ 228{
191 return; 229 if (!no_sync_cmos_clock)
230 mod_timer(&sync_cmos_timer, jiffies + 1);
192} 231}
193 232
233#else
234static inline void notify_cmos_timer(void) { }
235#endif
236
194/* adjtimex mainly allows reading (and writing, if superuser) of 237/* adjtimex mainly allows reading (and writing, if superuser) of
195 * kernel time-keeping variables. used by xntpd. 238 * kernel time-keeping variables. used by xntpd.
196 */ 239 */
@@ -355,6 +398,6 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
355 txc->stbcnt = 0; 398 txc->stbcnt = 0;
356 write_sequnlock_irq(&xtime_lock); 399 write_sequnlock_irq(&xtime_lock);
357 do_gettimeofday(&txc->time); 400 do_gettimeofday(&txc->time);
358 notify_arch_cmos_timer(); 401 notify_cmos_timer();
359 return(result); 402 return(result);
360} 403}
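Illustration (not part of the patch): the retry arithmetic in sync_cmos_clock() above aims the next attempt at roughly half a second past the second boundary, and backs off for 659 seconds (about 11 minutes) once an RTC update succeeds. A standalone sketch of that calculation using plain C types:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

/* Given the current nanoseconds within the second and whether the RTC
 * write failed, compute the delay until the next attempt, mirroring the
 * expressions in sync_cmos_clock() above. */
static void next_cmos_sync(long now_nsec, int fail,
			   long *delay_sec, long *delay_nsec)
{
	long nsec = (NSEC_PER_SEC / 2) - now_nsec;
	long sec;

	if (nsec <= 0)
		nsec += NSEC_PER_SEC;
	sec = fail ? 0 : 659;
	if (nsec >= NSEC_PER_SEC) {
		sec++;
		nsec -= NSEC_PER_SEC;
	}
	*delay_sec = sec;
	*delay_nsec = nsec;
}

int main(void)
{
	long s, ns;

	next_cmos_sync(100000000L, 1, &s, &ns);	/* 0.1s into the second, write failed */
	printf("retry in     %ld.%09ld s\n", s, ns);
	next_cmos_sync(499000000L, 0, &s, &ns);	/* just before 0.5s, write succeeded */
	printf("next sync in %ld.%09ld s\n", s, ns);
	return 0;
}
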
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 8001d37071f5..298bc7c6f09f 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -31,6 +31,12 @@ struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask; 31static cpumask_t tick_broadcast_mask;
32static DEFINE_SPINLOCK(tick_broadcast_lock); 32static DEFINE_SPINLOCK(tick_broadcast_lock);
33 33
34#ifdef CONFIG_TICK_ONESHOT
35static void tick_broadcast_clear_oneshot(int cpu);
36#else
37static inline void tick_broadcast_clear_oneshot(int cpu) { }
38#endif
39
34/* 40/*
35 * Debugging: see timer_list.c 41 * Debugging: see timer_list.c
36 */ 42 */
@@ -49,7 +55,7 @@ cpumask_t *tick_get_broadcast_mask(void)
49 */ 55 */
50static void tick_broadcast_start_periodic(struct clock_event_device *bc) 56static void tick_broadcast_start_periodic(struct clock_event_device *bc)
51{ 57{
52 if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) 58 if (bc)
53 tick_setup_periodic(bc, 1); 59 tick_setup_periodic(bc, 1);
54} 60}
55 61
@@ -58,8 +64,9 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
58 */ 64 */
59int tick_check_broadcast_device(struct clock_event_device *dev) 65int tick_check_broadcast_device(struct clock_event_device *dev)
60{ 66{
61 if (tick_broadcast_device.evtdev || 67 if ((tick_broadcast_device.evtdev &&
62 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 68 tick_broadcast_device.evtdev->rating >= dev->rating) ||
69 (dev->features & CLOCK_EVT_FEAT_C3STOP))
63 return 0; 70 return 0;
64 71
65 clockevents_exchange_device(NULL, dev); 72 clockevents_exchange_device(NULL, dev);
@@ -99,8 +106,19 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
99 cpu_set(cpu, tick_broadcast_mask); 106 cpu_set(cpu, tick_broadcast_mask);
100 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 107 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
101 ret = 1; 108 ret = 1;
102 } 109 } else {
110 /*
111 * When the new device is not affected by the stop
112 * feature and the cpu is marked in the broadcast mask
113 * then clear the broadcast bit.
114 */
115 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
116 int cpu = smp_processor_id();
103 117
118 cpu_clear(cpu, tick_broadcast_mask);
119 tick_broadcast_clear_oneshot(cpu);
120 }
121 }
104 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 122 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
105 return ret; 123 return ret;
106} 124}
@@ -159,8 +177,6 @@ static void tick_do_periodic_broadcast(void)
159 */ 177 */
160static void tick_handle_periodic_broadcast(struct clock_event_device *dev) 178static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
161{ 179{
162 dev->next_event.tv64 = KTIME_MAX;
163
164 tick_do_periodic_broadcast(); 180 tick_do_periodic_broadcast();
165 181
166 /* 182 /*
@@ -299,7 +315,7 @@ void tick_suspend_broadcast(void)
299 spin_lock_irqsave(&tick_broadcast_lock, flags); 315 spin_lock_irqsave(&tick_broadcast_lock, flags);
300 316
301 bc = tick_broadcast_device.evtdev; 317 bc = tick_broadcast_device.evtdev;
302 if (bc && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 318 if (bc)
303 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 319 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
304 320
305 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 321 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -316,6 +332,8 @@ int tick_resume_broadcast(void)
316 bc = tick_broadcast_device.evtdev; 332 bc = tick_broadcast_device.evtdev;
317 333
318 if (bc) { 334 if (bc) {
335 clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);
336
319 switch (tick_broadcast_device.mode) { 337 switch (tick_broadcast_device.mode) {
320 case TICKDEV_MODE_PERIODIC: 338 case TICKDEV_MODE_PERIODIC:
321 if(!cpus_empty(tick_broadcast_mask)) 339 if(!cpus_empty(tick_broadcast_mask))
@@ -364,11 +382,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
364int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 382int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
365{ 383{
366 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 384 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
367 385 return 0;
368 if(!cpus_empty(tick_broadcast_oneshot_mask))
369 tick_broadcast_set_event(ktime_get(), 1);
370
371 return cpu_isset(smp_processor_id(), tick_broadcast_oneshot_mask);
372} 386}
373 387
374/* 388/*
@@ -485,16 +499,24 @@ out:
485 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 499 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
486} 500}
487 501
502/*
503 * Reset the one shot broadcast for a cpu
504 *
505 * Called with tick_broadcast_lock held
506 */
507static void tick_broadcast_clear_oneshot(int cpu)
508{
509 cpu_clear(cpu, tick_broadcast_oneshot_mask);
510}
511
488/** 512/**
489 * tick_broadcast_setup_highres - setup the broadcast device for highres 513 * tick_broadcast_setup_highres - setup the broadcast device for highres
490 */ 514 */
491void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 515void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
492{ 516{
493 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { 517 bc->event_handler = tick_handle_oneshot_broadcast;
494 bc->event_handler = tick_handle_oneshot_broadcast; 518 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
495 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 519 bc->next_event.tv64 = KTIME_MAX;
496 bc->next_event.tv64 = KTIME_MAX;
497 }
498} 520}
499 521
500/* 522/*
@@ -520,20 +542,17 @@ void tick_broadcast_switch_to_oneshot(void)
520 */ 542 */
521void tick_shutdown_broadcast_oneshot(unsigned int *cpup) 543void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
522{ 544{
523 struct clock_event_device *bc;
524 unsigned long flags; 545 unsigned long flags;
525 unsigned int cpu = *cpup; 546 unsigned int cpu = *cpup;
526 547
527 spin_lock_irqsave(&tick_broadcast_lock, flags); 548 spin_lock_irqsave(&tick_broadcast_lock, flags);
528 549
529 bc = tick_broadcast_device.evtdev; 550 /*
551 * Clear the broadcast mask flag for the dead cpu, but do not
552 * stop the broadcast device!
553 */
530 cpu_clear(cpu, tick_broadcast_oneshot_mask); 554 cpu_clear(cpu, tick_broadcast_oneshot_mask);
531 555
532 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) {
533 if (bc && cpus_empty(tick_broadcast_oneshot_mask))
534 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
535 }
536
537 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 556 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
538} 557}
539 558
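Illustration (not part of the patch): the new tick_check_broadcast_device() condition lets a later-registered device take over broadcast duty only if it beats the current device's rating (or there is no current device) and is not itself stopped in deep C-states. A toy model of that decision; the struct, feature flag, and example ratings are invented for the sketch:

#include <stdio.h>

#define FEAT_C3STOP 0x1

struct dev { const char *name; int rating; unsigned int feat; };

/* Mirrors the patched condition: reject if the current device rates at
 * least as high, or if the candidate stops in C3. */
static int can_become_broadcast(const struct dev *cur, const struct dev *cand)
{
	if ((cur && cur->rating >= cand->rating) || (cand->feat & FEAT_C3STOP))
		return 0;
	return 1;
}

int main(void)
{
	struct dev pit   = { "pit",   100, 0 };
	struct dev hpet  = { "hpet",  250, 0 };
	struct dev lapic = { "lapic", 400, FEAT_C3STOP };

	printf("hpet replaces pit:   %d\n", can_become_broadcast(&pit, &hpet));
	printf("lapic replaces hpet: %d\n", can_become_broadcast(&hpet, &lapic));
	return 0;
}
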
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index a96ec9ab3454..3f3ae3907830 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -200,7 +200,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
200 200
201 cpu = smp_processor_id(); 201 cpu = smp_processor_id();
202 if (!cpu_isset(cpu, newdev->cpumask)) 202 if (!cpu_isset(cpu, newdev->cpumask))
203 goto out; 203 goto out_bc;
204 204
205 td = &per_cpu(tick_cpu_device, cpu); 205 td = &per_cpu(tick_cpu_device, cpu);
206 curdev = td->evtdev; 206 curdev = td->evtdev;
@@ -265,7 +265,7 @@ out_bc:
265 */ 265 */
266 if (tick_check_broadcast_device(newdev)) 266 if (tick_check_broadcast_device(newdev))
267 ret = NOTIFY_STOP; 267 ret = NOTIFY_STOP;
268out: 268
269 spin_unlock_irqrestore(&tick_device_lock, flags); 269 spin_unlock_irqrestore(&tick_device_lock, flags);
270 270
271 return ret; 271 return ret;
@@ -318,12 +318,17 @@ static void tick_resume(void)
318{ 318{
319 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 319 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
320 unsigned long flags; 320 unsigned long flags;
321 int broadcast = tick_resume_broadcast();
321 322
322 spin_lock_irqsave(&tick_device_lock, flags); 323 spin_lock_irqsave(&tick_device_lock, flags);
323 if (td->mode == TICKDEV_MODE_PERIODIC) 324 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
324 tick_setup_periodic(td->evtdev, 0); 325
325 else 326 if (!broadcast) {
326 tick_resume_oneshot(); 327 if (td->mode == TICKDEV_MODE_PERIODIC)
328 tick_setup_periodic(td->evtdev, 0);
329 else
330 tick_resume_oneshot();
331 }
327 spin_unlock_irqrestore(&tick_device_lock, flags); 332 spin_unlock_irqrestore(&tick_device_lock, flags);
328} 333}
329 334
@@ -360,8 +365,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
360 break; 365 break;
361 366
362 case CLOCK_EVT_NOTIFY_RESUME: 367 case CLOCK_EVT_NOTIFY_RESUME:
363 if (!tick_resume_broadcast()) 368 tick_resume();
364 tick_resume();
365 break; 369 break;
366 370
367 default: 371 default:
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index f6997ab0c3c9..0258d3115d54 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -73,8 +73,21 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
73 struct clock_event_device *dev = td->evtdev; 73 struct clock_event_device *dev = td->evtdev;
74 74
75 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || 75 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
76 !tick_device_is_functional(dev)) 76 !tick_device_is_functional(dev)) {
77
78 printk(KERN_INFO "Clockevents: "
79 "could not switch to one-shot mode:");
80 if (!dev) {
81 printk(" no tick device\n");
82 } else {
83 if (!tick_device_is_functional(dev))
84 printk(" %s is not functional.\n", dev->name);
85 else
86 printk(" %s does not support one-shot mode.\n",
87 dev->name);
88 }
77 return -EINVAL; 89 return -EINVAL;
90 }
78 91
79 td->mode = TICKDEV_MODE_ONESHOT; 92 td->mode = TICKDEV_MODE_ONESHOT;
80 dev->event_handler = handler; 93 dev->event_handler = handler;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 52db9e3c526e..8c3fef1db09c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -160,6 +160,18 @@ void tick_nohz_stop_sched_tick(void)
160 cpu = smp_processor_id(); 160 cpu = smp_processor_id();
161 ts = &per_cpu(tick_cpu_sched, cpu); 161 ts = &per_cpu(tick_cpu_sched, cpu);
162 162
163 /*
164 * If this cpu is offline and it is the one which updates
165 * jiffies, then give up the assignment and let it be taken by
166 * the cpu which runs the tick timer next. If we don't drop
167 * this here the jiffies might be stale and do_timer() never
168 * invoked.
169 */
170 if (unlikely(!cpu_online(cpu))) {
171 if (cpu == tick_do_timer_cpu)
172 tick_do_timer_cpu = -1;
173 }
174
163 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 175 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
164 goto end; 176 goto end;
165 177
@@ -546,6 +558,7 @@ void tick_setup_sched_timer(void)
546{ 558{
547 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 559 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
548 ktime_t now = ktime_get(); 560 ktime_t now = ktime_get();
561 u64 offset;
549 562
550 /* 563 /*
551 * Emulate tick processing via per-CPU hrtimers: 564 * Emulate tick processing via per-CPU hrtimers:
@@ -554,8 +567,12 @@ void tick_setup_sched_timer(void)
554 ts->sched_timer.function = tick_sched_timer; 567 ts->sched_timer.function = tick_sched_timer;
555 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 568 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
556 569
557 /* Get the next period */ 570 /* Get the next period (per cpu) */
558 ts->sched_timer.expires = tick_init_jiffy_update(); 571 ts->sched_timer.expires = tick_init_jiffy_update();
572 offset = ktime_to_ns(tick_period) >> 1;
573 do_div(offset, NR_CPUS);
574 offset *= smp_processor_id();
575 ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset);
559 576
560 for (;;) { 577 for (;;) {
561 hrtimer_forward(&ts->sched_timer, now, tick_period); 578 hrtimer_forward(&ts->sched_timer, now, tick_period);
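Illustration (not part of the patch): the tick-sched.c hunk staggers each CPU's sched hrtimer by a per-CPU share of half a tick period so the emulated ticks do not all expire in lockstep. A quick numeric sketch of that offset calculation; HZ and NR_CPUS are example values, not taken from the patch:

#include <stdio.h>

int main(void)
{
	const unsigned long long NSEC_PER_SEC = 1000000000ULL;
	const unsigned int HZ = 250, NR_CPUS = 8;
	unsigned long long tick_period = NSEC_PER_SEC / HZ;	/* 4 ms at HZ=250 */
	unsigned int cpu;

	for (cpu = 0; cpu < 4; cpu++) {
		/* offset = (tick_period / 2) / NR_CPUS * cpu, as in the hunk above */
		unsigned long long offset = (tick_period >> 1) / NR_CPUS * cpu;
		printf("cpu%u: sched timer shifted by +%llu ns\n", cpu, offset);
	}
	return 0;
}
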
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 728cedfd3cbd..4ad79f6bdec6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -47,10 +47,22 @@ EXPORT_SYMBOL(xtime_lock);
47struct timespec xtime __attribute__ ((aligned (16))); 47struct timespec xtime __attribute__ ((aligned (16)));
48struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 48struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
49static unsigned long total_sleep_time; /* seconds */ 49static unsigned long total_sleep_time; /* seconds */
50
51EXPORT_SYMBOL(xtime); 50EXPORT_SYMBOL(xtime);
52 51
53 52
53#ifdef CONFIG_NO_HZ
54static struct timespec xtime_cache __attribute__ ((aligned (16)));
55static inline void update_xtime_cache(u64 nsec)
56{
57 xtime_cache = xtime;
58 timespec_add_ns(&xtime_cache, nsec);
59}
60#else
61#define xtime_cache xtime
62/* We do *not* want to evaluate the argument for this case */
63#define update_xtime_cache(n) do { } while (0)
64#endif
65
54static struct clocksource *clock; /* pointer to current clocksource */ 66static struct clocksource *clock; /* pointer to current clocksource */
55 67
56 68
@@ -205,6 +217,7 @@ static void change_clocksource(void)
205} 217}
206#else 218#else
207static inline void change_clocksource(void) { } 219static inline void change_clocksource(void) { }
220static inline s64 __get_nsec_offset(void) { return 0; }
208#endif 221#endif
209 222
210/** 223/**
@@ -268,6 +281,8 @@ void __init timekeeping_init(void)
268static int timekeeping_suspended; 281static int timekeeping_suspended;
269/* time in seconds when suspend began */ 282/* time in seconds when suspend began */
270static unsigned long timekeeping_suspend_time; 283static unsigned long timekeeping_suspend_time;
284/* xtime offset when we went into suspend */
285static s64 timekeeping_suspend_nsecs;
271 286
272/** 287/**
273 * timekeeping_resume - Resumes the generic timekeeping subsystem. 288 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -293,6 +308,8 @@ static int timekeeping_resume(struct sys_device *dev)
293 wall_to_monotonic.tv_sec -= sleep_length; 308 wall_to_monotonic.tv_sec -= sleep_length;
294 total_sleep_time += sleep_length; 309 total_sleep_time += sleep_length;
295 } 310 }
311 /* Make sure that we have the correct xtime reference */
312 timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
296 /* re-base the last cycle value */ 313 /* re-base the last cycle value */
297 clock->cycle_last = clocksource_read(clock); 314 clock->cycle_last = clocksource_read(clock);
298 clock->error = 0; 315 clock->error = 0;
@@ -313,9 +330,12 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
313{ 330{
314 unsigned long flags; 331 unsigned long flags;
315 332
333 timekeeping_suspend_time = read_persistent_clock();
334
316 write_seqlock_irqsave(&xtime_lock, flags); 335 write_seqlock_irqsave(&xtime_lock, flags);
336 /* Get the current xtime offset */
337 timekeeping_suspend_nsecs = __get_nsec_offset();
317 timekeeping_suspended = 1; 338 timekeeping_suspended = 1;
318 timekeeping_suspend_time = read_persistent_clock();
319 write_sequnlock_irqrestore(&xtime_lock, flags); 339 write_sequnlock_irqrestore(&xtime_lock, flags);
320 340
321 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 341 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
@@ -401,7 +421,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
401 * this is optimized for the most common adjustments of -1,0,1, 421 * this is optimized for the most common adjustments of -1,0,1,
402 * for other values we can do a bit more work. 422 * for other values we can do a bit more work.
403 */ 423 */
404static void clocksource_adjust(struct clocksource *clock, s64 offset) 424static void clocksource_adjust(s64 offset)
405{ 425{
406 s64 error, interval = clock->cycle_interval; 426 s64 error, interval = clock->cycle_interval;
407 int adj; 427 int adj;
@@ -466,22 +486,20 @@ void update_wall_time(void)
466 second_overflow(); 486 second_overflow();
467 } 487 }
468 488
469 /* interpolator bits */
470 time_interpolator_update(clock->xtime_interval
471 >> clock->shift);
472
473 /* accumulate error between NTP and clock interval */ 489 /* accumulate error between NTP and clock interval */
474 clock->error += current_tick_length(); 490 clock->error += current_tick_length();
475 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); 491 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
476 } 492 }
477 493
478 /* correct the clock when NTP error is too big */ 494 /* correct the clock when NTP error is too big */
479 clocksource_adjust(clock, offset); 495 clocksource_adjust(offset);
480 496
481 /* store full nanoseconds into xtime */ 497 /* store full nanoseconds into xtime */
482 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; 498 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
483 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 499 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
484 500
501 update_xtime_cache(cyc2ns(clock, offset));
502
485 /* check to see if there is a new clocksource to use */ 503 /* check to see if there is a new clocksource to use */
486 change_clocksource(); 504 change_clocksource();
487 update_vsyscall(&xtime, clock); 505 update_vsyscall(&xtime, clock);
@@ -513,3 +531,25 @@ void monotonic_to_bootbased(struct timespec *ts)
513{ 531{
514 ts->tv_sec += total_sleep_time; 532 ts->tv_sec += total_sleep_time;
515} 533}
534
535unsigned long get_seconds(void)
536{
537 return xtime_cache.tv_sec;
538}
539EXPORT_SYMBOL(get_seconds);
540
541
542struct timespec current_kernel_time(void)
543{
544 struct timespec now;
545 unsigned long seq;
546
547 do {
548 seq = read_seqbegin(&xtime_lock);
549
550 now = xtime_cache;
551 } while (read_seqretry(&xtime_lock, seq));
552
553 return now;
554}
555EXPORT_SYMBOL(current_kernel_time);
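Illustration (not part of the patch): xtime_cache, introduced above, is roughly xtime plus the clocksource offset that has not yet been folded in, renormalized so tv_nsec stays below one second; get_seconds() and current_kernel_time() then read the cached value. A plain-C sketch of that update; the struct and helper are local stand-ins, and the kernel's timespec_add_ns() normalizes with a loop rather than division:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts { long sec; long nsec; };

/* Add nanoseconds to a timespec-like value and renormalize. */
static void ts_add_ns(struct ts *t, long long ns)
{
	long long total = t->nsec + ns;

	t->sec += total / NSEC_PER_SEC;
	t->nsec = total % NSEC_PER_SEC;
}

int main(void)
{
	struct ts xtime = { 1192000000, 900000000 };	/* arbitrary example values */
	struct ts cache = xtime;

	ts_add_ns(&cache, 250000000);	/* pending offset not yet accumulated into xtime */
	printf("xtime       = %ld.%09ld\n", xtime.sec, xtime.nsec);
	printf("xtime_cache = %ld.%09ld\n", cache.sec, cache.nsec);
	return 0;
}
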
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index e5edc3a22a08..fdb2e03d4fe0 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -267,7 +267,7 @@ static struct file_operations timer_list_fops = {
267 .open = timer_list_open, 267 .open = timer_list_open,
268 .read = seq_read, 268 .read = seq_read,
269 .llseek = seq_lseek, 269 .llseek = seq_lseek,
270 .release = seq_release, 270 .release = single_release,
271}; 271};
272 272
273static int __init init_timer_list_procfs(void) 273static int __init init_timer_list_procfs(void)
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 8ed62fda16c6..c36bb7ed0301 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -327,8 +327,9 @@ static int tstats_show(struct seq_file *m, void *v)
327 ms = 1; 327 ms = 1;
328 328
329 if (events && period.tv_sec) 329 if (events && period.tv_sec)
330 seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, 330 seq_printf(m, "%ld total events, %ld.%03ld events/sec\n",
331 events / period.tv_sec, events * 1000 / ms); 331 events, events * 1000 / ms,
332 (events * 1000000 / ms) % 1000);
332 else 333 else
333 seq_printf(m, "%ld total events\n", events); 334 seq_printf(m, "%ld total events\n", events);
334 335
@@ -399,7 +400,7 @@ static struct file_operations tstats_fops = {
399 .read = seq_read, 400 .read = seq_read,
400 .write = tstats_write, 401 .write = tstats_write,
401 .llseek = seq_lseek, 402 .llseek = seq_lseek,
402 .release = seq_release, 403 .release = single_release,
403}; 404};
404 405
405void __init init_timer_stats(void) 406void __init init_timer_stats(void)
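Illustration (not part of the patch): the timer_stats hunk changes the events/sec line to print three decimal places using integer arithmetic only. A standalone check of that exact format expression with made-up numbers, kept small so the intermediate product fits in a 32-bit long:

#include <stdio.h>

int main(void)
{
	long events = 1234;
	long ms = 6789;		/* measurement period in milliseconds */

	/* Same expression as the patched seq_printf() above:
	 * integer part and three fractional digits of events per second. */
	printf("%ld total events, %ld.%03ld events/sec\n",
	       events, events * 1000 / ms,
	       (events * 1000000 / ms) % 1000);
	return 0;
}
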
diff --git a/kernel/timer.c b/kernel/timer.c
index b7792fb03387..6ce1952eea7d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -103,14 +103,14 @@ static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
103static inline void timer_set_deferrable(struct timer_list *timer) 103static inline void timer_set_deferrable(struct timer_list *timer)
104{ 104{
105 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | 105 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
106 TBASE_DEFERRABLE_FLAG)); 106 TBASE_DEFERRABLE_FLAG));
107} 107}
108 108
109static inline void 109static inline void
110timer_set_base(struct timer_list *timer, tvec_base_t *new_base) 110timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
111{ 111{
112 timer->base = (tvec_base_t *)((unsigned long)(new_base) | 112 timer->base = (tvec_base_t *)((unsigned long)(new_base) |
113 tbase_get_deferrable(timer->base)); 113 tbase_get_deferrable(timer->base));
114} 114}
115 115
116/** 116/**
@@ -445,10 +445,10 @@ EXPORT_SYMBOL(__mod_timer);
445void add_timer_on(struct timer_list *timer, int cpu) 445void add_timer_on(struct timer_list *timer, int cpu)
446{ 446{
447 tvec_base_t *base = per_cpu(tvec_bases, cpu); 447 tvec_base_t *base = per_cpu(tvec_bases, cpu);
448 unsigned long flags; 448 unsigned long flags;
449 449
450 timer_stats_timer_set_start_info(timer); 450 timer_stats_timer_set_start_info(timer);
451 BUG_ON(timer_pending(timer) || !timer->function); 451 BUG_ON(timer_pending(timer) || !timer->function);
452 spin_lock_irqsave(&base->lock, flags); 452 spin_lock_irqsave(&base->lock, flags);
453 timer_set_base(timer, base); 453 timer_set_base(timer, base);
454 internal_add_timer(base, timer); 454 internal_add_timer(base, timer);
@@ -627,7 +627,7 @@ static inline void __run_timers(tvec_base_t *base)
627 while (time_after_eq(jiffies, base->timer_jiffies)) { 627 while (time_after_eq(jiffies, base->timer_jiffies)) {
628 struct list_head work_list; 628 struct list_head work_list;
629 struct list_head *head = &work_list; 629 struct list_head *head = &work_list;
630 int index = base->timer_jiffies & TVR_MASK; 630 int index = base->timer_jiffies & TVR_MASK;
631 631
632 /* 632 /*
633 * Cascade timers: 633 * Cascade timers:
@@ -644,8 +644,8 @@ static inline void __run_timers(tvec_base_t *base)
644 unsigned long data; 644 unsigned long data;
645 645
646 timer = list_first_entry(head, struct timer_list,entry); 646 timer = list_first_entry(head, struct timer_list,entry);
647 fn = timer->function; 647 fn = timer->function;
648 data = timer->data; 648 data = timer->data;
649 649
650 timer_stats_account_timer(timer); 650 timer_stats_account_timer(timer);
651 651
@@ -689,8 +689,8 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base)
689 index = slot = timer_jiffies & TVR_MASK; 689 index = slot = timer_jiffies & TVR_MASK;
690 do { 690 do {
691 list_for_each_entry(nte, base->tv1.vec + slot, entry) { 691 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
692 if (tbase_get_deferrable(nte->base)) 692 if (tbase_get_deferrable(nte->base))
693 continue; 693 continue;
694 694
695 found = 1; 695 found = 1;
696 expires = nte->expires; 696 expires = nte->expires;
@@ -834,7 +834,7 @@ void update_process_times(int user_tick)
834 if (rcu_pending(cpu)) 834 if (rcu_pending(cpu))
835 rcu_check_callbacks(cpu, user_tick); 835 rcu_check_callbacks(cpu, user_tick);
836 scheduler_tick(); 836 scheduler_tick();
837 run_posix_cpu_timers(p); 837 run_posix_cpu_timers(p);
838} 838}
839 839
840/* 840/*
@@ -909,7 +909,7 @@ static inline void update_times(unsigned long ticks)
909 update_wall_time(); 909 update_wall_time();
910 calc_load(ticks); 910 calc_load(ticks);
911} 911}
912 912
913/* 913/*
914 * The 64-bit jiffies value is not atomic - you MUST NOT read it 914 * The 64-bit jiffies value is not atomic - you MUST NOT read it
915 * without sampling the sequence number in xtime_lock. 915 * without sampling the sequence number in xtime_lock.
@@ -1105,7 +1105,7 @@ asmlinkage long sys_gettid(void)
1105/** 1105/**
1106 * do_sysinfo - fill in sysinfo struct 1106 * do_sysinfo - fill in sysinfo struct
1107 * @info: pointer to buffer to fill 1107 * @info: pointer to buffer to fill
1108 */ 1108 */
1109int do_sysinfo(struct sysinfo *info) 1109int do_sysinfo(struct sysinfo *info)
1110{ 1110{
1111 unsigned long mem_total, sav_total; 1111 unsigned long mem_total, sav_total;
@@ -1349,194 +1349,6 @@ void __init init_timers(void)
1349 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1349 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
1350} 1350}
1351 1351
1352#ifdef CONFIG_TIME_INTERPOLATION
1353
1354struct time_interpolator *time_interpolator __read_mostly;
1355static struct time_interpolator *time_interpolator_list __read_mostly;
1356static DEFINE_SPINLOCK(time_interpolator_lock);
1357
1358static inline cycles_t time_interpolator_get_cycles(unsigned int src)
1359{
1360 unsigned long (*x)(void);
1361
1362 switch (src)
1363 {
1364 case TIME_SOURCE_FUNCTION:
1365 x = time_interpolator->addr;
1366 return x();
1367
1368 case TIME_SOURCE_MMIO64 :
1369 return readq_relaxed((void __iomem *)time_interpolator->addr);
1370
1371 case TIME_SOURCE_MMIO32 :
1372 return readl_relaxed((void __iomem *)time_interpolator->addr);
1373
1374 default: return get_cycles();
1375 }
1376}
1377
1378static inline u64 time_interpolator_get_counter(int writelock)
1379{
1380 unsigned int src = time_interpolator->source;
1381
1382 if (time_interpolator->jitter)
1383 {
1384 cycles_t lcycle;
1385 cycles_t now;
1386
1387 do {
1388 lcycle = time_interpolator->last_cycle;
1389 now = time_interpolator_get_cycles(src);
1390 if (lcycle && time_after(lcycle, now))
1391 return lcycle;
1392
1393 /* When holding the xtime write lock, there's no need
1394 * to add the overhead of the cmpxchg. Readers are
1395 * force to retry until the write lock is released.
1396 */
1397 if (writelock) {
1398 time_interpolator->last_cycle = now;
1399 return now;
1400 }
1401 /* Keep track of the last timer value returned. The use of cmpxchg here
1402 * will cause contention in an SMP environment.
1403 */
1404 } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
1405 return now;
1406 }
1407 else
1408 return time_interpolator_get_cycles(src);
1409}
1410
1411void time_interpolator_reset(void)
1412{
1413 time_interpolator->offset = 0;
1414 time_interpolator->last_counter = time_interpolator_get_counter(1);
1415}
1416
1417#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
1418
1419unsigned long time_interpolator_get_offset(void)
1420{
1421 /* If we do not have a time interpolator set up then just return zero */
1422 if (!time_interpolator)
1423 return 0;
1424
1425 return time_interpolator->offset +
1426 GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
1427}
1428
1429#define INTERPOLATOR_ADJUST 65536
1430#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
1431
1432void time_interpolator_update(long delta_nsec)
1433{
1434 u64 counter;
1435 unsigned long offset;
1436
1437 /* If there is no time interpolator set up then do nothing */
1438 if (!time_interpolator)
1439 return;
1440
1441 /*
1442 * The interpolator compensates for late ticks by accumulating the late
1443 * time in time_interpolator->offset. A tick earlier than expected will
1444 * lead to a reset of the offset and a corresponding jump of the clock
1445 * forward. Again this only works if the interpolator clock is running
1446 * slightly slower than the regular clock and the tuning logic insures
1447 * that.
1448 */
1449
1450 counter = time_interpolator_get_counter(1);
1451 offset = time_interpolator->offset +
1452 GET_TI_NSECS(counter, time_interpolator);
1453
1454 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
1455 time_interpolator->offset = offset - delta_nsec;
1456 else {
1457 time_interpolator->skips++;
1458 time_interpolator->ns_skipped += delta_nsec - offset;
1459 time_interpolator->offset = 0;
1460 }
1461 time_interpolator->last_counter = counter;
1462
1463 /* Tuning logic for time interpolator invoked every minute or so.
1464 * Decrease interpolator clock speed if no skips occurred and an offset is carried.
1465 * Increase interpolator clock speed if we skip too much time.
1466 */
1467 if (jiffies % INTERPOLATOR_ADJUST == 0)
1468 {
1469 if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec)
1470 time_interpolator->nsec_per_cyc--;
1471 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1472 time_interpolator->nsec_per_cyc++;
1473 time_interpolator->skips = 0;
1474 time_interpolator->ns_skipped = 0;
1475 }
1476}
1477
1478static inline int
1479is_better_time_interpolator(struct time_interpolator *new)
1480{
1481 if (!time_interpolator)
1482 return 1;
1483 return new->frequency > 2*time_interpolator->frequency ||
1484 (unsigned long)new->drift < (unsigned long)time_interpolator->drift;
1485}
1486
1487void
1488register_time_interpolator(struct time_interpolator *ti)
1489{
1490 unsigned long flags;
1491
1492 /* Sanity check */
1493 BUG_ON(ti->frequency == 0 || ti->mask == 0);
1494
1495 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1496 spin_lock(&time_interpolator_lock);
1497 write_seqlock_irqsave(&xtime_lock, flags);
1498 if (is_better_time_interpolator(ti)) {
1499 time_interpolator = ti;
1500 time_interpolator_reset();
1501 }
1502 write_sequnlock_irqrestore(&xtime_lock, flags);
1503
1504 ti->next = time_interpolator_list;
1505 time_interpolator_list = ti;
1506 spin_unlock(&time_interpolator_lock);
1507}
1508
1509void
1510unregister_time_interpolator(struct time_interpolator *ti)
1511{
1512 struct time_interpolator *curr, **prev;
1513 unsigned long flags;
1514
1515 spin_lock(&time_interpolator_lock);
1516 prev = &time_interpolator_list;
1517 for (curr = *prev; curr; curr = curr->next) {
1518 if (curr == ti) {
1519 *prev = curr->next;
1520 break;
1521 }
1522 prev = &curr->next;
1523 }
1524
1525 write_seqlock_irqsave(&xtime_lock, flags);
1526 if (ti == time_interpolator) {
1527 /* we lost the best time-interpolator: */
1528 time_interpolator = NULL;
1529 /* find the next-best interpolator */
1530 for (curr = time_interpolator_list; curr; curr = curr->next)
1531 if (is_better_time_interpolator(curr))
1532 time_interpolator = curr;
1533 time_interpolator_reset();
1534 }
1535 write_sequnlock_irqrestore(&xtime_lock, flags);
1536 spin_unlock(&time_interpolator_lock);
1537}
1538#endif /* CONFIG_TIME_INTERPOLATION */
1539
1540/** 1352/**
1541 * msleep - sleep safely even with waitqueue interruptions 1353 * msleep - sleep safely even with waitqueue interruptions
1542 * @msecs: Time in milliseconds to sleep for 1354 * @msecs: Time in milliseconds to sleep for
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 658f638c402c..c122131a122f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -39,7 +39,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
39 ac_etime = timespec_to_ns(&ts); 39 ac_etime = timespec_to_ns(&ts);
40 do_div(ac_etime, NSEC_PER_USEC); 40 do_div(ac_etime, NSEC_PER_USEC);
41 stats->ac_etime = ac_etime; 41 stats->ac_etime = ac_etime;
42 stats->ac_btime = xtime.tv_sec - ts.tv_sec; 42 stats->ac_btime = get_seconds() - ts.tv_sec;
43 if (thread_group_leader(tsk)) { 43 if (thread_group_leader(tsk)) {
44 stats->ac_exitcode = tsk->exit_code; 44 stats->ac_exitcode = tsk->exit_code;
45 if (tsk->flags & PF_FORKNOEXEC) 45 if (tsk->flags & PF_FORKNOEXEC)
diff --git a/kernel/user.c b/kernel/user.c
index 98b82507797a..9ca2848fc356 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -55,25 +55,22 @@ struct user_struct root_user = {
55/* 55/*
56 * These routines must be called with the uidhash spinlock held! 56 * These routines must be called with the uidhash spinlock held!
57 */ 57 */
58static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) 58static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
59{ 59{
60 list_add(&up->uidhash_list, hashent); 60 hlist_add_head(&up->uidhash_node, hashent);
61} 61}
62 62
63static inline void uid_hash_remove(struct user_struct *up) 63static inline void uid_hash_remove(struct user_struct *up)
64{ 64{
65 list_del(&up->uidhash_list); 65 hlist_del_init(&up->uidhash_node);
66} 66}
67 67
68static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) 68static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
69{ 69{
70 struct list_head *up; 70 struct user_struct *user;
71 71 struct hlist_node *h;
72 list_for_each(up, hashent) {
73 struct user_struct *user;
74
75 user = list_entry(up, struct user_struct, uidhash_list);
76 72
73 hlist_for_each_entry(user, h, hashent, uidhash_node) {
77 if(user->uid == uid) { 74 if(user->uid == uid) {
78 atomic_inc(&user->__count); 75 atomic_inc(&user->__count);
79 return user; 76 return user;
@@ -122,7 +119,7 @@ void free_uid(struct user_struct *up)
122 119
123struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 120struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
124{ 121{
125 struct list_head *hashent = uidhashentry(ns, uid); 122 struct hlist_head *hashent = uidhashentry(ns, uid);
126 struct user_struct *up; 123 struct user_struct *up;
127 124
128 spin_lock_irq(&uidhash_lock); 125 spin_lock_irq(&uidhash_lock);
@@ -202,16 +199,40 @@ void switch_uid(struct user_struct *new_user)
202 suid_keys(current); 199 suid_keys(current);
203} 200}
204 201
202void release_uids(struct user_namespace *ns)
203{
204 int i;
205 unsigned long flags;
206 struct hlist_head *head;
207 struct hlist_node *nd;
208
209 spin_lock_irqsave(&uidhash_lock, flags);
210 /*
211 * collapse the chains so that the user_struct-s will
212 * be still alive, but not in hashes. subsequent free_uid()
213 * will free them.
214 */
215 for (i = 0; i < UIDHASH_SZ; i++) {
216 head = ns->uidhash_table + i;
217 while (!hlist_empty(head)) {
218 nd = head->first;
219 hlist_del_init(nd);
220 }
221 }
222 spin_unlock_irqrestore(&uidhash_lock, flags);
223
224 free_uid(ns->root_user);
225}
205 226
206static int __init uid_cache_init(void) 227static int __init uid_cache_init(void)
207{ 228{
208 int n; 229 int n;
209 230
210 uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), 231 uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
211 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); 232 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
212 233
213 for(n = 0; n < UIDHASH_SZ; ++n) 234 for(n = 0; n < UIDHASH_SZ; ++n)
214 INIT_LIST_HEAD(init_user_ns.uidhash_table + n); 235 INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
215 236
216 /* Insert the root user immediately (init already runs as root) */ 237 /* Insert the root user immediately (init already runs as root) */
217 spin_lock_irq(&uidhash_lock); 238 spin_lock_irq(&uidhash_lock);
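Illustration (not part of the patch): the user.c conversion replaces the doubly linked uid hash chains with hlist-style singly headed chains; a lookup walks one bucket and bumps the reference count on a hit. A toy userspace model of that structure; the names, table size, and plain int count are local to the sketch, standing in for hlist_head/hlist_node and atomic_inc():

#include <stdio.h>

#define HASH_SZ 16

struct user {
	unsigned int uid;
	int count;
	struct user *next;	/* stand-in for the hlist linkage */
};

static struct user *table[HASH_SZ];

/* Insert at the head of the bucket, like uid_hash_insert() above. */
static void hash_insert(struct user *up)
{
	unsigned int b = up->uid % HASH_SZ;

	up->next = table[b];
	table[b] = up;
}

/* Walk one bucket; on a hit, take a reference and return the entry. */
static struct user *hash_find(unsigned int uid)
{
	struct user *up;

	for (up = table[uid % HASH_SZ]; up; up = up->next) {
		if (up->uid == uid) {
			up->count++;
			return up;
		}
	}
	return NULL;
}

int main(void)
{
	struct user root = { 0, 1, NULL }, joe = { 1000, 1, NULL };

	hash_insert(&root);
	hash_insert(&joe);
	printf("uid 1000 found: %s\n", hash_find(1000) ? "yes" : "no");
	printf("uid 4242 found: %s\n", hash_find(4242) ? "yes" : "no");
	return 0;
}
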
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index d055d987850c..7af90fc4f0fd 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,7 +39,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
39 kref_init(&ns->kref); 39 kref_init(&ns->kref);
40 40
41 for (n = 0; n < UIDHASH_SZ; ++n) 41 for (n = 0; n < UIDHASH_SZ; ++n)
42 INIT_LIST_HEAD(ns->uidhash_table + n); 42 INIT_HLIST_HEAD(ns->uidhash_table + n);
43 43
44 /* Insert new root user. */ 44 /* Insert new root user. */
45 ns->root_user = alloc_uid(ns, 0); 45 ns->root_user = alloc_uid(ns, 0);
@@ -81,6 +81,7 @@ void free_user_ns(struct kref *kref)
81 struct user_namespace *ns; 81 struct user_namespace *ns;
82 82
83 ns = container_of(kref, struct user_namespace, kref); 83 ns = container_of(kref, struct user_namespace, kref);
84 release_uids(ns);
84 kfree(ns); 85 kfree(ns);
85} 86}
86 87
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 9d8180a0f0d8..816d7b24fa03 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -28,7 +28,9 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
28 if (!ns) 28 if (!ns)
29 return ERR_PTR(-ENOMEM); 29 return ERR_PTR(-ENOMEM);
30 30
31 down_read(&uts_sem);
31 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 32 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
33 up_read(&uts_sem);
32 kref_init(&ns->kref); 34 kref_init(&ns->kref);
33 return ns; 35 return ns;
34} 36}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 58e5c152a6bb..e080d1d744cc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -635,7 +635,7 @@ int keventd_up(void)
635int current_is_keventd(void) 635int current_is_keventd(void)
636{ 636{
637 struct cpu_workqueue_struct *cwq; 637 struct cpu_workqueue_struct *cwq;
638 int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */ 638 int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */
639 int ret = 0; 639 int ret = 0;
640 640
641 BUG_ON(!keventd_wq); 641 BUG_ON(!keventd_wq);