path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c                |  16
-rw-r--r--  kernel/auditsc.c              |   2
-rw-r--r--  kernel/cpu.c                  |   4
-rw-r--r--  kernel/exit.c                 |  11
-rw-r--r--  kernel/fork.c                 |   5
-rw-r--r--  kernel/futex.c                |  27
-rw-r--r--  kernel/futex_compat.c         |  30
-rw-r--r--  kernel/hrtimer.c              |  24
-rw-r--r--  kernel/irq/manage.c           |  13
-rw-r--r--  kernel/kmod.c                 |   2
-rw-r--r--  kernel/module.c               |   3
-rw-r--r--  kernel/nsproxy.c              |  15
-rw-r--r--  kernel/posix-timers.c         |  15
-rw-r--r--  kernel/power/Kconfig          |  41
-rw-r--r--  kernel/printk.c               |  13
-rw-r--r--  kernel/ptrace.c               |   1
-rw-r--r--  kernel/sched.c                | 113
-rw-r--r--  kernel/sched_debug.c          |   4
-rw-r--r--  kernel/sched_fair.c           | 216
-rw-r--r--  kernel/sched_rt.c             |  11
-rw-r--r--  kernel/signal.c               |  49
-rw-r--r--  kernel/softirq.c              |   4
-rw-r--r--  kernel/sys.c                  |   5
-rw-r--r--  kernel/sysctl.c               |  46
-rw-r--r--  kernel/time/Kconfig           |   5
-rw-r--r--  kernel/time/Makefile          |   2
-rw-r--r--  kernel/time/clockevents.c     |   3
-rw-r--r--  kernel/time/ntp.c             |   2
-rw-r--r--  kernel/time/tick-broadcast.c  |  61
-rw-r--r--  kernel/time/tick-common.c     |   5
-rw-r--r--  kernel/time/tick-sched.c      |  12
-rw-r--r--  kernel/time/timekeeping.c     |  10
-rw-r--r--  kernel/time/timer_stats.c     |   5
-rw-r--r--  kernel/user.c                 |  45
-rw-r--r--  kernel/user_namespace.c       |   3
-rw-r--r--  kernel/utsname.c              |   2
-rw-r--r--  kernel/workqueue.c            |   2
37 files changed, 575 insertions(+), 252 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index eb0f9165b401..2924251a6547 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -847,18 +847,10 @@ static void audit_receive_skb(struct sk_buff *skb)
 }
 
 /* Receive messages from netlink socket. */
-static void audit_receive(struct sock *sk, int length)
+static void audit_receive(struct sk_buff *skb)
 {
-        struct sk_buff *skb;
-        unsigned int qlen;
-
         mutex_lock(&audit_cmd_mutex);
-
-        for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
-                skb = skb_dequeue(&sk->sk_receive_queue);
-                audit_receive_skb(skb);
-                kfree_skb(skb);
-        }
+        audit_receive_skb(skb);
         mutex_unlock(&audit_cmd_mutex);
 }
 
@@ -876,8 +868,8 @@ static int __init audit_init(void)
 
         printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
                audit_default ? "enabled" : "disabled");
-        audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
-                                           NULL, THIS_MODULE);
+        audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
+                                           audit_receive, NULL, THIS_MODULE);
         if (!audit_sock)
                 audit_panic("cannot initialize netlink socket");
         else
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3401293359e8..04f3ffb8d9d4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2023,7 +2023,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
                 axp->d.next = ctx->aux_pids;
                 ctx->aux_pids = (void *)axp;
         }
-        BUG_ON(axp->pid_count > AUDIT_AUX_PIDS);
+        BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
 
         axp->target_pid[axp->pid_count] = t->tgid;
         selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 181ae7086029..38033db8d8ec 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -273,7 +273,7 @@ int __cpuinit cpu_up(unsigned int cpu)
         return err;
 }
 
-#ifdef CONFIG_SUSPEND_SMP
+#ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_t frozen_cpus;
 
 int disable_nonboot_cpus(void)
@@ -334,4 +334,4 @@ void enable_nonboot_cpus(void)
 out:
         mutex_unlock(&cpu_add_remove_lock);
 }
-#endif
+#endif /* CONFIG_PM_SLEEP_SMP */
diff --git a/kernel/exit.c b/kernel/exit.c
index 9578c1ae19ca..993369ee94d1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -24,7 +24,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/profile.h>
-#include <linux/signalfd.h>
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/kthread.h>
@@ -86,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
         sighand = rcu_dereference(tsk->sighand);
         spin_lock(&sighand->siglock);
 
-        /*
-         * Notify that this sighand has been detached. This must
-         * be called with the tsk->sighand lock held. Also, this
-         * access tsk->sighand internally, so it must be called
-         * before tsk->sighand is reset.
-         */
-        signalfd_detach_locked(tsk);
-
         posix_cpu_timers_exit(tsk);
         if (atomic_dec_and_test(&sig->count))
                 posix_cpu_timers_exit_group(tsk);
@@ -975,6 +966,7 @@ fastcall NORET_TYPE void do_exit(long code)
         if (unlikely(tsk->audit_context))
                 audit_free(tsk);
 
+        tsk->exit_code = code;
         taskstats_exit(tsk, group_dead);
 
         exit_mm(tsk);
@@ -996,7 +988,6 @@ fastcall NORET_TYPE void do_exit(long code)
         if (tsk->binfmt)
                 module_put(tsk->binfmt->module);
 
-        tsk->exit_code = code;
         proc_exit_connector(tsk);
         exit_task_namespaces(tsk);
         exit_notify(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7332e236d367..5e67f90a1694 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1438,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep,
         struct sighand_struct *sighand = data;
 
         spin_lock_init(&sighand->siglock);
-        INIT_LIST_HEAD(&sighand->signalfd_list);
+        init_waitqueue_head(&sighand->signalfd_wqh);
 }
 
 void __init proc_caches_init(void)
@@ -1608,7 +1608,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
         err = -EINVAL;
         if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
                                 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-                                CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER))
+                                CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER|
+                                CLONE_NEWNET))
                 goto bad_unshare_out;
 
         if ((err = unshare_thread(unshare_flags)))
diff --git a/kernel/futex.c b/kernel/futex.c
index 3415e9ad1391..fcc94e7b4086 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1670,6 +1670,7 @@ pi_faulted:
                                 attempt);
                 if (ret)
                         goto out;
+                uval = 0;
                 goto retry_unlocked;
         }
 
@@ -1942,9 +1943,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
 void exit_robust_list(struct task_struct *curr)
 {
         struct robust_list_head __user *head = curr->robust_list;
-        struct robust_list __user *entry, *pending;
-        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+        struct robust_list __user *entry, *next_entry, *pending;
+        unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
         unsigned long futex_offset;
+        int rc;
 
         /*
          * Fetch the list head (which was registered earlier, via
@@ -1964,12 +1966,14 @@ void exit_robust_list(struct task_struct *curr)
         if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
                 return;
 
-        if (pending)
-                handle_futex_death((void __user *)pending + futex_offset,
-                                   curr, pip);
-
+        next_entry = NULL;      /* avoid warning with gcc */
         while (entry != &head->list) {
                 /*
+                 * Fetch the next entry in the list before calling
+                 * handle_futex_death:
+                 */
+                rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+                /*
                  * A pending lock might already be on the list, so
                  * don't process it twice:
                  */
@@ -1977,11 +1981,10 @@ void exit_robust_list(struct task_struct *curr)
                 if (handle_futex_death((void __user *)entry + futex_offset,
                                         curr, pi))
                         return;
-                /*
-                 * Fetch the next entry in the list:
-                 */
-                if (fetch_robust_entry(&entry, &entry->next, &pi))
+                if (rc)
                         return;
+                entry = next_entry;
+                pi = next_pi;
                 /*
                  * Avoid excessively long or circular lists:
                  */
@@ -1990,6 +1993,10 @@ void exit_robust_list(struct task_struct *curr)
 
                 cond_resched();
         }
+
+        if (pending)
+                handle_futex_death((void __user *)pending + futex_offset,
+                                   curr, pip);
 }
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index f7921360efad..2c2e2954b713 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -38,10 +38,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
 void compat_exit_robust_list(struct task_struct *curr)
 {
         struct compat_robust_list_head __user *head = curr->compat_robust_list;
-        struct robust_list __user *entry, *pending;
-        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-        compat_uptr_t uentry, upending;
+        struct robust_list __user *entry, *next_entry, *pending;
+        unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
+        compat_uptr_t uentry, next_uentry, upending;
         compat_long_t futex_offset;
+        int rc;
 
         /*
          * Fetch the list head (which was registered earlier, via
@@ -61,10 +62,15 @@ void compat_exit_robust_list(struct task_struct *curr)
         if (fetch_robust_entry(&upending, &pending,
                                &head->list_op_pending, &pip))
                 return;
-        if (upending)
-                handle_futex_death((void __user *)pending + futex_offset, curr, pip);
 
-        while (compat_ptr(uentry) != &head->list) {
+        next_entry = NULL;      /* avoid warning with gcc */
+        while (entry != (struct robust_list __user *) &head->list) {
+                /*
+                 * Fetch the next entry in the list before calling
+                 * handle_futex_death:
+                 */
+                rc = fetch_robust_entry(&next_uentry, &next_entry,
+                        (compat_uptr_t __user *)&entry->next, &next_pi);
                 /*
                  * A pending lock might already be on the list, so
                  * dont process it twice:
@@ -74,12 +80,11 @@ void compat_exit_robust_list(struct task_struct *curr)
                                         curr, pi))
                         return;
 
-                /*
-                 * Fetch the next entry in the list:
-                 */
-                if (fetch_robust_entry(&uentry, &entry,
-                        (compat_uptr_t __user *)&entry->next, &pi))
+                if (rc)
                         return;
+                uentry = next_uentry;
+                entry = next_entry;
+                pi = next_pi;
                 /*
                  * Avoid excessively long or circular lists:
                  */
@@ -88,6 +93,9 @@ void compat_exit_robust_list(struct task_struct *curr)
 
                 cond_resched();
         }
+        if (pending)
+                handle_futex_death((void __user *)pending + futex_offset,
+                                   curr, pip);
 }
 
 asmlinkage long
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c21ca6bfaa66..dc8a4451d79b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -277,6 +277,30 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
 }
 
 EXPORT_SYMBOL_GPL(ktime_add_ns);
+
+/**
+ * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable
+ * @kt:   minuend
+ * @nsec: the scalar nsec value to subtract
+ *
+ * Returns the subtraction of @nsec from @kt in ktime_t format
+ */
+ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
+{
+        ktime_t tmp;
+
+        if (likely(nsec < NSEC_PER_SEC)) {
+                tmp.tv64 = nsec;
+        } else {
+                unsigned long rem = do_div(nsec, NSEC_PER_SEC);
+
+                tmp = ktime_set((long)nsec, rem);
+        }
+
+        return ktime_sub(kt, tmp);
+}
+
+EXPORT_SYMBOL_GPL(ktime_sub_ns);
 # endif /* !CONFIG_KTIME_SCALAR */
 
 /*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 203a518b6f14..7230d914eaa2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -462,7 +462,9 @@ void free_irq(unsigned int irq, void *dev_id)
                  * We do this after actually deregistering it, to make sure that
                  * a 'real' IRQ doesn't run in parallel with our fake
                  */
+                local_irq_save(flags);
                 handler(irq, dev_id);
+                local_irq_restore(flags);
         }
 #endif
 }
@@ -545,14 +547,11 @@ int request_irq(unsigned int irq, irq_handler_t handler,
                  * We do this before actually registering it, to make sure that
                  * a 'real' IRQ doesn't run in parallel with our fake
                  */
-                if (irqflags & IRQF_DISABLED) {
-                        unsigned long flags;
+                unsigned long flags;
 
                 local_irq_save(flags);
                 handler(irq, dev_id);
                 local_irq_restore(flags);
-                } else
-                        handler(irq, dev_id);
         }
 #endif
 
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9809cc1f33d6..c6a4f8aebeba 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -505,7 +505,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
         if (ret < 0)
                 goto out;
 
-        return call_usermodehelper_exec(sub_info, 1);
+        return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 
  out:
         call_usermodehelper_freeinfo(sub_info);
diff --git a/kernel/module.c b/kernel/module.c
index 33c04ad51175..db0ead0363e2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -784,8 +784,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
 static ssize_t show_refcnt(struct module_attribute *mattr,
                            struct module *mod, char *buffer)
 {
-        /* sysfs holds a reference */
-        return sprintf(buffer, "%u\n", module_refcount(mod)-1);
+        return sprintf(buffer, "%u\n", module_refcount(mod));
 }
 
 static struct module_attribute refcnt = {
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a4fb7d46971f..f1decd21a534 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -20,6 +20,7 @@
 #include <linux/mnt_namespace.h>
 #include <linux/utsname.h>
 #include <linux/pid_namespace.h>
+#include <net/net_namespace.h>
 
 static struct kmem_cache *nsproxy_cachep;
 
@@ -98,8 +99,17 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
                 goto out_user;
         }
 
+        new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
+        if (IS_ERR(new_nsp->net_ns)) {
+                err = PTR_ERR(new_nsp->net_ns);
+                goto out_net;
+        }
+
         return new_nsp;
 
+out_net:
+        if (new_nsp->user_ns)
+                put_user_ns(new_nsp->user_ns);
 out_user:
         if (new_nsp->pid_ns)
                 put_pid_ns(new_nsp->pid_ns);
@@ -132,7 +142,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 
         get_nsproxy(old_ns);
 
-        if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER)))
+        if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET)))
                 return 0;
 
         if (!capable(CAP_SYS_ADMIN)) {
@@ -164,6 +174,7 @@ void free_nsproxy(struct nsproxy *ns)
                 put_pid_ns(ns->pid_ns);
         if (ns->user_ns)
                 put_user_ns(ns->user_ns);
+        put_net(ns->net_ns);
         kmem_cache_free(nsproxy_cachep, ns);
 }
 
@@ -177,7 +188,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
         int err = 0;
 
         if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
-                               CLONE_NEWUSER)))
+                               CLONE_NEWUSER | CLONE_NEWNET)))
                 return 0;
 
         if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 55b3761edaa9..57efe0400bc2 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -547,9 +547,9 @@ sys_timer_create(const clockid_t which_clock,
                         new_timer->it_process = process;
                         list_add(&new_timer->list,
                                  &process->signal->posix_timers);
-                        spin_unlock_irqrestore(&process->sighand->siglock, flags);
                         if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
                                 get_task_struct(process);
+                        spin_unlock_irqrestore(&process->sighand->siglock, flags);
                 } else {
                         spin_unlock_irqrestore(&process->sighand->siglock, flags);
                         process = NULL;
@@ -605,13 +605,14 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
         timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
         if (timr) {
                 spin_lock(&timr->it_lock);
-                spin_unlock(&idr_lock);
 
                 if ((timr->it_id != timer_id) || !(timr->it_process) ||
                                 timr->it_process->tgid != current->tgid) {
-                        unlock_timer(timr, *flags);
+                        spin_unlock(&timr->it_lock);
+                        spin_unlock_irqrestore(&idr_lock, *flags);
                         timr = NULL;
-                }
+                } else
+                        spin_unlock(&idr_lock);
         } else
                 spin_unlock_irqrestore(&idr_lock, *flags);
 
@@ -711,7 +712,7 @@ sys_timer_getoverrun(timer_t timer_id)
 {
         struct k_itimer *timr;
         int overrun;
-        long flags;
+        unsigned long flags;
 
         timr = lock_timer(timer_id, &flags);
         if (!timr)
@@ -783,7 +784,7 @@ sys_timer_settime(timer_t timer_id, int flags,
         struct k_itimer *timr;
         struct itimerspec new_spec, old_spec;
         int error = 0;
-        long flag;
+        unsigned long flag;
         struct itimerspec *rtn = old_setting ? &old_spec : NULL;
 
         if (!new_setting)
@@ -835,7 +836,7 @@ asmlinkage long
 sys_timer_delete(timer_t timer_id)
 {
         struct k_itimer *timer;
-        long flags;
+        unsigned long flags;
 
 retry_delete:
         timer = lock_timer(timer_id, &flags);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 412859f8d94a..14b0e10dc95c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -72,15 +72,10 @@ config PM_TRACE
           CAUTION: this option will cause your machine's real-time clock to be
           set to an invalid time after a resume.
 
-config SUSPEND_SMP_POSSIBLE
-        bool
-        depends on (X86 && !X86_VOYAGER) || (PPC64 && (PPC_PSERIES || PPC_PMAC))
-        depends on SMP
-        default y
-
-config SUSPEND_SMP
+config PM_SLEEP_SMP
         bool
-        depends on SUSPEND_SMP_POSSIBLE && PM_SLEEP
+        depends on SUSPEND_SMP_POSSIBLE || HIBERNATION_SMP_POSSIBLE
+        depends on PM_SLEEP
         select HOTPLUG_CPU
         default y
 
@@ -89,20 +84,46 @@ config PM_SLEEP
         depends on SUSPEND || HIBERNATION
         default y
 
+config SUSPEND_UP_POSSIBLE
+        bool
+        depends on (X86 && !X86_VOYAGER) || PPC || ARM || BLACKFIN || MIPS \
+                   || SUPERH || FRV
+        depends on !SMP
+        default y
+
+config SUSPEND_SMP_POSSIBLE
+        bool
+        depends on (X86 && !X86_VOYAGER) \
+                   || (PPC && (PPC_PSERIES || PPC_PMAC)) || ARM
+        depends on SMP
+        default y
+
 config SUSPEND
         bool "Suspend to RAM and standby"
         depends on PM
-        depends on !SMP || SUSPEND_SMP_POSSIBLE
+        depends on SUSPEND_UP_POSSIBLE || SUSPEND_SMP_POSSIBLE
         default y
         ---help---
           Allow the system to enter sleep states in which main memory is
           powered and thus its contents are preserved, such as the
           suspend-to-RAM state (i.e. the ACPI S3 state).
 
+config HIBERNATION_UP_POSSIBLE
+        bool
+        depends on X86 || PPC64_SWSUSP || PPC32
+        depends on !SMP
+        default y
+
+config HIBERNATION_SMP_POSSIBLE
+        bool
+        depends on (X86 && !X86_VOYAGER) || PPC64_SWSUSP
+        depends on SMP
+        default y
+
 config HIBERNATION
         bool "Hibernation (aka 'suspend to disk')"
         depends on PM && SWAP
-        depends on ((X86 || PPC64_SWSUSP || FRV || PPC32) && !SMP) || SUSPEND_SMP_POSSIBLE
+        depends on HIBERNATION_UP_POSSIBLE || HIBERNATION_SMP_POSSIBLE
         ---help---
           Enable the suspend to disk (STD) functionality, which is usually
           called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/printk.c b/kernel/printk.c
index bd2cd062878d..8451dfc31d25 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1083,6 +1083,19 @@ int unregister_console(struct console *console)
 }
 EXPORT_SYMBOL(unregister_console);
 
+static int __init disable_boot_consoles(void)
+{
+        if (console_drivers != NULL) {
+                if (console_drivers->flags & CON_BOOT) {
+                        printk(KERN_INFO "turn off boot console %s%d\n",
+                                console_drivers->name, console_drivers->index);
+                        return unregister_console(console_drivers);
+                }
+        }
+        return 0;
+}
+late_initcall(disable_boot_consoles);
+
 /**
  * tty_write_message - write a message to a certain tty, not just the console.
  * @tty: the destination tty_struct
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 82a558b655da..3eca7a55f2ee 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -233,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 
         /* Architecture-specific hardware disable .. */
         ptrace_disable(child);
+        clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 
         write_lock_irq(&tasklist_lock);
         /* protect against de_thread()->release_task() */
diff --git a/kernel/sched.c b/kernel/sched.c
index 45e17b83b7f1..6c10fa796ca0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -61,6 +61,7 @@
 #include <linux/delayacct.h>
 #include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
+#include <linux/pagemap.h>
 
 #include <asm/tlb.h>
 
@@ -262,7 +263,8 @@ struct rq {
         s64 clock_max_delta;
 
         unsigned int clock_warps, clock_overflows;
-        unsigned int clock_unstable_events;
+        u64 idle_clock;
+        unsigned int clock_deep_idle_events;
         u64 tick_timestamp;
 
         atomic_t nr_iowait;
@@ -556,18 +558,40 @@ static inline struct rq *this_rq_lock(void)
 }
 
 /*
- * CPU frequency is/was unstable - start new by setting prev_clock_raw:
+ * We are going deep-idle (irqs are disabled):
  */
-void sched_clock_unstable_event(void)
+void sched_clock_idle_sleep_event(void)
 {
-        unsigned long flags;
-        struct rq *rq;
+        struct rq *rq = cpu_rq(smp_processor_id());
 
-        rq = task_rq_lock(current, &flags);
-        rq->prev_clock_raw = sched_clock();
-        rq->clock_unstable_events++;
-        task_rq_unlock(rq, &flags);
+        spin_lock(&rq->lock);
+        __update_rq_clock(rq);
+        spin_unlock(&rq->lock);
+        rq->clock_deep_idle_events++;
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+        struct rq *rq = cpu_rq(smp_processor_id());
+        u64 now = sched_clock();
+
+        rq->idle_clock += delta_ns;
+        /*
+         * Override the previous timestamp and ignore all
+         * sched_clock() deltas that occured while we idled,
+         * and use the PM-provided delta_ns to advance the
+         * rq clock:
+         */
+        spin_lock(&rq->lock);
+        rq->prev_clock_raw = now;
+        rq->clock += delta_ns;
+        spin_unlock(&rq->lock);
 }
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -645,7 +669,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor)
 /*
  * Shift right and round:
  */
-#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
@@ -661,10 +685,10 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
          * Check whether we'd overflow the 64-bit multiplication:
          */
         if (unlikely(tmp > WMULT_CONST))
-                tmp = RSR(RSR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
+                tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
                         WMULT_SHIFT/2);
         else
-                tmp = RSR(tmp * lw->inv_weight, WMULT_SHIFT);
+                tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
 
         return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
@@ -835,7 +859,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq)
 
 static void set_load_weight(struct task_struct *p)
 {
-        task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
         p->se.wait_runtime = 0;
 
         if (task_has_rt_policy(p)) {
@@ -1564,6 +1587,7 @@ static void __sched_fork(struct task_struct *p)
         p->se.wait_start_fair = 0;
         p->se.exec_start = 0;
         p->se.sum_exec_runtime = 0;
+        p->se.prev_sum_exec_runtime = 0;
         p->se.delta_exec = 0;
         p->se.delta_fair_run = 0;
         p->se.delta_fair_sleep = 0;
@@ -1659,6 +1683,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 
         p->prio = effective_prio(p);
 
+        if (rt_prio(p->prio))
+                p->sched_class = &rt_sched_class;
+        else
+                p->sched_class = &fair_sched_class;
+
         if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
                         (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
                         !current->se.on_rq) {
@@ -2157,12 +2186,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
         if (task_running(rq, p))
                 return 0;
 
-        /*
-         * Aggressive migration if too many balance attempts have failed:
-         */
-        if (sd->nr_balance_failed > sd->cache_nice_tries)
-                return 1;
-
         return 1;
 }
 
@@ -2494,7 +2517,7 @@ group_next:
          * a think about bumping its value to force at least one task to be
          * moved
          */
-        if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
+        if (*imbalance < busiest_load_per_task) {
                 unsigned long tmp, pwr_now, pwr_move;
                 unsigned int imbn;
 
@@ -2546,10 +2569,8 @@ small_imbalance:
                 pwr_move /= SCHED_LOAD_SCALE;
 
                 /* Move if we gain throughput */
-                if (pwr_move <= pwr_now)
-                        goto out_balanced;
-
-                *imbalance = busiest_load_per_task;
+                if (pwr_move > pwr_now)
+                        *imbalance = busiest_load_per_task;
         }
 
         return busiest;
@@ -3020,6 +3041,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
         struct sched_domain *sd;
         /* Earliest time when we have to do rebalance again */
         unsigned long next_balance = jiffies + 60*HZ;
+        int update_next_balance = 0;
 
         for_each_domain(cpu, sd) {
                 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3056,8 +3078,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
                 if (sd->flags & SD_SERIALIZE)
                         spin_unlock(&balancing);
 out:
-                if (time_after(next_balance, sd->last_balance + interval))
+                if (time_after(next_balance, sd->last_balance + interval)) {
                         next_balance = sd->last_balance + interval;
+                        update_next_balance = 1;
+                }
 
                 /*
                  * Stop the load balance at this level. There is another
@@ -3067,7 +3091,14 @@ out:
                 if (!balance)
                         break;
         }
-        rq->next_balance = next_balance;
+
+        /*
+         * next_balance will be updated only when there is a need.
+         * When the cpu is attached to null domain for ex, it will not be
+         * updated.
+         */
+        if (likely(update_next_balance))
+                rq->next_balance = next_balance;
 }
 
 /*
@@ -4525,10 +4556,7 @@ asmlinkage long sys_sched_yield(void)
         struct rq *rq = this_rq_lock();
 
         schedstat_inc(rq, yld_cnt);
-        if (unlikely(rq->nr_running == 1))
-                schedstat_inc(rq, yld_act_empty);
-        else
-                current->sched_class->yield_task(rq, current);
+        current->sched_class->yield_task(rq, current);
 
         /*
          * Since we are going to call schedule() anyway, there's
@@ -4884,14 +4912,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
 static inline void sched_init_granularity(void)
 {
         unsigned int factor = 1 + ilog2(num_online_cpus());
-        const unsigned long gran_limit = 100000000;
+        const unsigned long limit = 100000000;
+
+        sysctl_sched_min_granularity *= factor;
+        if (sysctl_sched_min_granularity > limit)
+                sysctl_sched_min_granularity = limit;
 
-        sysctl_sched_granularity *= factor;
-        if (sysctl_sched_granularity > gran_limit)
-                sysctl_sched_granularity = gran_limit;
+        sysctl_sched_latency *= factor;
+        if (sysctl_sched_latency > limit)
+                sysctl_sched_latency = limit;
 
-        sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
-        sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
+        sysctl_sched_runtime_limit = sysctl_sched_latency;
+        sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2;
 }
 
 #ifdef CONFIG_SMP
@@ -5234,15 +5266,16 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 static struct ctl_table sd_ctl_dir[] = {
         {
                 .procname = "sched_domain",
-                .mode = 0755,
+                .mode = 0555,
         },
         {0,},
 };
 
 static struct ctl_table sd_ctl_root[] = {
         {
+                .ctl_name = CTL_KERN,
                 .procname = "kernel",
-                .mode = 0755,
+                .mode = 0555,
                 .child = sd_ctl_dir,
         },
         {0,},
@@ -5318,7 +5351,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
         for_each_domain(cpu, sd) {
                 snprintf(buf, 32, "domain%d", i);
                 entry->procname = kstrdup(buf, GFP_KERNEL);
-                entry->mode = 0755;
+                entry->mode = 0555;
                 entry->child = sd_alloc_ctl_domain_table(sd);
                 entry++;
                 i++;
@@ -5338,7 +5371,7 @@ static void init_sched_domain_sysctl(void)
         for (i = 0; i < cpu_num; i++, entry++) {
                 snprintf(buf, 32, "cpu%d", i);
                 entry->procname = kstrdup(buf, GFP_KERNEL);
-                entry->mode = 0755;
+                entry->mode = 0555;
                 entry->child = sd_alloc_ctl_cpu_table(i);
         }
         sd_sysctl_header = register_sysctl_table(sd_ctl_root);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 87e524762b85..c3ee38bd3426 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu)
         P(next_balance);
         P(curr->pid);
         P(clock);
+        P(idle_clock);
         P(prev_clock_raw);
         P(clock_warps);
         P(clock_overflows);
-        P(clock_unstable_events);
+        P(clock_deep_idle_events);
         P(clock_max_delta);
         P(cpu_load[0]);
         P(cpu_load[1]);
@@ -282,4 +283,5 @@ void proc_sched_set_task(struct task_struct *p)
         p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
 #endif
         p->se.sum_exec_runtime = 0;
+        p->se.prev_sum_exec_runtime = 0;
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fedbb51bba96..67c67a87146e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -15,34 +15,50 @@
  *
  *  Scaled math optimizations by Thomas Gleixner
  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
+ *
+ *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  */
 
 /*
- * Preemption granularity:
- * (default: 2 msec, units: nanoseconds)
+ * Targeted preemption latency for CPU-bound tasks:
+ * (default: 20ms, units: nanoseconds)
  *
- * NOTE: this granularity value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS will typically be somewhat
- * larger than this value. (to see the precise effective timeslice
- * length of your workload, run vmstat and monitor the context-switches
- * field)
+ * NOTE: this latency value is not the same as the concept of
+ * 'timeslice length' - timeslices in CFS are of variable length.
+ * (to see the precise effective timeslice length of your workload,
+ * run vmstat and monitor the context-switches field)
  *
 * On SMP systems the value of this is multiplied by the log2 of the
 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
+ * Targeted preemption latency for CPU-bound tasks:
+ */
+unsigned int sysctl_sched_latency __read_mostly = 20000000ULL;
+
+/*
+ * Minimal preemption granularity for CPU-bound tasks:
+ * (default: 2 msec, units: nanoseconds)
  */
-unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ;
+unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL;
+
+/*
+ * sys_sched_yield() compat mode
+ *
+ * This option switches the agressive yield implementation of the
+ * old scheduler back on.
+ */
+unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_BATCH wake-up granularity.
- * (default: 10 msec, units: nanoseconds)
+ * (default: 25 msec, units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
 */
-unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
-        10000000000ULL/HZ;
+unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL;
 
 /*
  * SCHED_OTHER wake-up granularity.
@@ -52,12 +68,12 @@ unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ;
+unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL;
 
 unsigned int sysctl_sched_stat_granularity __read_mostly;
 
 /*
- * Initialized in sched_init_granularity():
+ * Initialized in sched_init_granularity() [to 5 times the base granularity]:
  */
 unsigned int sysctl_sched_runtime_limit __read_mostly;
 
@@ -186,6 +202,8 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         update_load_add(&cfs_rq->load, se->load.weight);
         cfs_rq->nr_running++;
         se->on_rq = 1;
+
+        schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
 }
 
 static inline void
@@ -197,6 +215,8 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         update_load_sub(&cfs_rq->load, se->load.weight);
         cfs_rq->nr_running--;
         se->on_rq = 0;
+
+        schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
 }
 
 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
@@ -214,6 +234,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
  */
 
 /*
+ * Calculate the preemption granularity needed to schedule every
+ * runnable task once per sysctl_sched_latency amount of time.
+ * (down to a sensible low limit on granularity)
+ *
+ * For example, if there are 2 tasks running and latency is 10 msecs,
+ * we switch tasks every 5 msecs. If we have 3 tasks running, we have
+ * to switch tasks every 3.33 msecs to get a 10 msecs observed latency
+ * for each task. We do finer and finer scheduling up to until we
+ * reach the minimum granularity value.
+ *
+ * To achieve this we use the following dynamic-granularity rule:
+ *
+ *  gran = lat/nr - lat/nr/nr
+ *
+ * This comes out of the following equations:
+ *
+ *  kA1 + gran = kB1
+ *  kB2 + gran = kA2
+ *  kA2 = kA1
+ *  kB2 = kB1 - d + d/nr
+ *  lat = d * nr
+ *
+ * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running),
+ * '1' is start of time, '2' is end of time, 'd' is delay between
+ * 1 and 2 (during which task B was running), 'nr' is number of tasks
+ * running, 'lat' is the the period of each task. ('lat' is the
+ * sched_latency that we aim for.)
+ */
+static long
+sched_granularity(struct cfs_rq *cfs_rq)
+{
+        unsigned int gran = sysctl_sched_latency;
+        unsigned int nr = cfs_rq->nr_running;
+
+        if (nr > 1) {
+                gran = gran/nr - gran/nr/nr;
+                gran = max(gran, sysctl_sched_min_granularity);
+        }
+
+        return gran;
+}
+
+/*
  * We rescale the rescheduling granularity of tasks according to their
  * nice level, but only linearly, not exponentially:
  */
@@ -240,7 +303,7 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity)
         /*
          * It will always fit into 'long':
          */
-        return (long) (tmp >> WMULT_SHIFT);
+        return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT));
 }
 
 static inline void
@@ -303,10 +366,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr)
         delta_fair = calc_delta_fair(delta_exec, lw);
         delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
 
-        if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) {
-                delta = min(cfs_rq->sleeper_bonus, (u64)delta_exec);
-                delta = calc_delta_mine(delta, curr->load.weight, lw);
-                delta = min((u64)delta, cfs_rq->sleeper_bonus);
+        if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) {
+                delta = min((u64)delta_mine, cfs_rq->sleeper_bonus);
+                delta = min(delta, (unsigned long)(
+                        (long)sysctl_sched_runtime_limit - curr->wait_runtime));
                 cfs_rq->sleeper_bonus -= delta;
                 delta_mine -= delta;
         }
@@ -438,6 +501,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         unsigned long delta_fair;
 
+        if (unlikely(!se->wait_start_fair))
+                return;
+
         delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
                         (u64)(cfs_rq->fair_clock - se->wait_start_fair));
 
@@ -494,6 +560,13 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
         unsigned long load = cfs_rq->load.weight, delta_fair;
         long prev_runtime;
 
+        /*
+         * Do not boost sleepers if there's too much bonus 'in flight'
+         * already:
+         */
+        if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
+                return;
+
         if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
                 load = rq_of(cfs_rq)->cpu_load[2];
 
@@ -519,10 +592,6 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
          * Track the amount of bonus we've given to sleepers:
          */
         cfs_rq->sleeper_bonus += delta_fair;
-        if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
-                cfs_rq->sleeper_bonus = sysctl_sched_runtime_limit;
-
-        schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
 }
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -570,6 +639,16 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
                 se->block_start = 0;
                 se->sum_sleep_runtime += delta;
+
+                /*
+                 * Blocking time is in units of nanosecs, so shift by 20 to
+                 * get a milliseconds-range estimation of the amount of
+                 * time that the task spent sleeping:
+                 */
+                if (unlikely(prof_on == SLEEP_PROFILING)) {
+                        profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
+                                     delta >> 20);
+                }
         }
 #endif
 }
@@ -604,7 +683,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
                         if (tsk->state & TASK_UNINTERRUPTIBLE)
                                 se->block_start = rq_of(cfs_rq)->clock;
                 }
-                cfs_rq->wait_runtime -= se->wait_runtime;
 #endif
         }
         __dequeue_entity(cfs_rq, se);
@@ -618,11 +696,31 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
                           struct sched_entity *curr, unsigned long granularity)
 {
         s64 __delta = curr->fair_key - se->fair_key;
+        unsigned long ideal_runtime, delta_exec;
+
+        /*
+         * ideal_runtime is compared against sum_exec_runtime, which is
+         * walltime, hence do not scale.
+         */
+        ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running,
+                        (unsigned long)sysctl_sched_min_granularity);
+
+        /*
+         * If we executed more than what the latency constraint suggests,
+         * reduce the rescheduling granularity. This way the total latency
+         * of how much a task is not scheduled converges to
+         * sysctl_sched_latency:
+         */
+        delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+        if (delta_exec > ideal_runtime)
+                granularity = 0;
 
         /*
          * Take scheduling granularity into account - do not
          * preempt the current task unless the best task has
          * a larger than sched_granularity fairness advantage:
+         *
+         * scale granularity as key space is in fair_clock.
          */
         if (__delta > niced_granularity(curr, granularity))
                 resched_task(rq_of(cfs_rq)->curr);
@@ -641,6 +739,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         update_stats_wait_end(cfs_rq, se);
         update_stats_curr_start(cfs_rq, se);
         set_cfs_rq_curr(cfs_rq, se);
+        se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
@@ -686,7 +785,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
         if (next == curr)
                 return;
 
-        __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity);
+        __check_preempt_curr_fair(cfs_rq, next, curr,
+                                  sched_granularity(cfs_rq));
 }
 
 /**************************************************
@@ -815,19 +915,62 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 }
 
 /*
- * sched_yield() support is very simple - we dequeue and enqueue
+ * sched_yield() support is very simple - we dequeue and enqueue.
+ *
+ * If compat_yield is turned on then we requeue to the end of the tree.
  */
 static void yield_task_fair(struct rq *rq, struct task_struct *p)
 {
         struct cfs_rq *cfs_rq = task_cfs_rq(p);
+        struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+        struct sched_entity *rightmost, *se = &p->se;
+        struct rb_node *parent;
 
-        __update_rq_clock(rq);
         /*
-         * Dequeue and enqueue the task to update its
-         * position within the tree:
+         * Are we the only task in the tree?
+         */
+        if (unlikely(cfs_rq->nr_running == 1))
+                return;
+
+        if (likely(!sysctl_sched_compat_yield)) {
+                __update_rq_clock(rq);
+                /*
+                 * Dequeue and enqueue the task to update its
+                 * position within the tree:
+                 */
+                dequeue_entity(cfs_rq, &p->se, 0);
+                enqueue_entity(cfs_rq, &p->se, 0);
+
+                return;
+        }
+        /*
+         * Find the rightmost entry in the rbtree:
+         */
+        do {
+                parent = *link;
+                link = &parent->rb_right;
+        } while (*link);
+
+        rightmost = rb_entry(parent, struct sched_entity, run_node);
+        /*
+         * Already in the rightmost position?
+         */
+        if (unlikely(rightmost == se))
+                return;
+
+        /*
+         * Minimally necessary key value to be last in the tree:
         */
-        dequeue_entity(cfs_rq, &p->se, 0);
-        enqueue_entity(cfs_rq, &p->se, 0);
+        se->fair_key = rightmost->fair_key + 1;
+
+        if (cfs_rq->rb_leftmost == &se->run_node)
+                cfs_rq->rb_leftmost = rb_next(&se->run_node);
+        /*
+         * Relink the task to the rightmost position:
+         */
+        rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+        rb_link_node(&se->run_node, parent, link);
+        rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 }
 
 /*
@@ -1020,31 +1163,32 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
 static void task_new_fair(struct rq *rq, struct task_struct *p)
 {
         struct cfs_rq *cfs_rq = task_cfs_rq(p);
-        struct sched_entity *se = &p->se;
+        struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq);
 
         sched_info_queued(p);
 
+        update_curr(cfs_rq);
         update_stats_enqueue(cfs_rq, se);
         /*
          * Child runs first: we let it run before the parent
         * until it reschedules once. We set up the key so that
         * it will preempt the parent:
          */
-        p->se.fair_key = current->se.fair_key -
-                niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1;
+        se->fair_key = curr->fair_key -
+                niced_granularity(curr, sched_granularity(cfs_rq)) - 1;
         /*
          * The first wait is dominated by the child-runs-first logic,
         * so do not credit it with that waiting time yet:
          */
         if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
-                p->se.wait_start_fair = 0;
+                se->wait_start_fair = 0;
 
         /*
          * The statistical average of wait_runtime is about
         * -granularity/2, so initialize the task with that:
         */
         if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
-                p->se.wait_runtime = -(sysctl_sched_granularity / 2);
+                se->wait_runtime = -(sched_granularity(cfs_rq) / 2);
 
         __enqueue_entity(cfs_rq, se);
 }
@@ -1057,7 +1201,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
  */
 static void set_curr_task_fair(struct rq *rq)
 {
-        struct sched_entity *se = &rq->curr.se;
+        struct sched_entity *se = &rq->curr->se;
 
         for_each_sched_entity(se)
                 set_next_entity(cfs_rq_of(se), se);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index dcdcad632fd9..4b87476a02d0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -207,10 +207,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
207 return; 207 return;
208 208
209 p->time_slice = static_prio_timeslice(p->static_prio); 209 p->time_slice = static_prio_timeslice(p->static_prio);
210 set_tsk_need_resched(p);
211 210
212 /* put it at the end of the queue: */ 211 /*
213 requeue_task_rt(rq, p); 212 * Requeue to the end of queue if we are not the only element
213 * on the queue:
214 */
215 if (p->run_list.prev != p->run_list.next) {
216 requeue_task_rt(rq, p);
217 set_tsk_need_resched(p);
218 }
214} 219}
215 220
216static struct sched_class rt_sched_class __read_mostly = { 221static struct sched_class rt_sched_class __read_mostly = {
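
The requeue test added above, "p->run_list.prev != p->run_list.next", relies on run_list being a circular list_head node: when a task is the only entry on its priority queue, both of its neighbours are the list head, so prev and next are equal and the requeue (and the resched) can be skipped. A minimal userspace sketch of that invariant, re-implementing just the two list helpers it needs rather than using the kernel's:

#include <assert.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *head)
{
        head->next = head->prev = head;
}

static void list_add_tail(struct list_head *new, struct list_head *head)
{
        new->prev = head->prev;
        new->next = head;
        head->prev->next = new;
        head->prev = new;
}

int main(void)
{
        struct list_head queue, a, b;

        list_init(&queue);
        list_add_tail(&a, &queue);
        /* sole entry: both neighbours are the list head itself */
        assert(a.prev == a.next);

        list_add_tail(&b, &queue);
        /* a second entry exists, so requeueing to the tail is meaningful */
        assert(a.prev != a.next);
        return 0;
}
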
diff --git a/kernel/signal.c b/kernel/signal.c
index b27c01a66448..792952381092 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -378,8 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
378 /* We only dequeue private signals from ourselves, we don't let 378 /* We only dequeue private signals from ourselves, we don't let
379 * signalfd steal them 379 * signalfd steal them
380 */ 380 */
381 if (tsk == current) 381 signr = __dequeue_signal(&tsk->pending, mask, info);
382 signr = __dequeue_signal(&tsk->pending, mask, info);
383 if (!signr) { 382 if (!signr) {
384 signr = __dequeue_signal(&tsk->signal->shared_pending, 383 signr = __dequeue_signal(&tsk->signal->shared_pending,
385 mask, info); 384 mask, info);
@@ -407,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
407 } 406 }
408 } 407 }
409 } 408 }
410 if (likely(tsk == current)) 409 recalc_sigpending();
411 recalc_sigpending();
412 if (signr && unlikely(sig_kernel_stop(signr))) { 410 if (signr && unlikely(sig_kernel_stop(signr))) {
413 /* 411 /*
414 * Set a marker that we have dequeued a stop signal. Our 412 * Set a marker that we have dequeued a stop signal. Our
@@ -425,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
425 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 423 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
426 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 424 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
427 } 425 }
428 if ( signr && 426 if (signr &&
429 ((info->si_code & __SI_MASK) == __SI_TIMER) && 427 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
430 info->si_sys_private){ 428 info->si_sys_private){
431 /* 429 /*
@@ -533,18 +531,18 @@ static int check_kill_permission(int sig, struct siginfo *info,
533 if (!valid_signal(sig)) 531 if (!valid_signal(sig))
534 return error; 532 return error;
535 533
536 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 534 if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) {
537 if (error) 535 error = audit_signal_info(sig, t); /* Let audit system see the signal */
538 return error; 536 if (error)
539 537 return error;
540 error = -EPERM; 538 error = -EPERM;
541 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) 539 if (((sig != SIGCONT) ||
542 && ((sig != SIGCONT) || 540 (process_session(current) != process_session(t)))
543 (process_session(current) != process_session(t))) 541 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
544 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 542 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
545 && (current->uid ^ t->suid) && (current->uid ^ t->uid) 543 && !capable(CAP_KILL))
546 && !capable(CAP_KILL))
547 return error; 544 return error;
545 }
548 546
549 return security_task_kill(t, info, sig, 0); 547 return security_task_kill(t, info, sig, 0);
550} 548}
@@ -1300,20 +1298,19 @@ struct sigqueue *sigqueue_alloc(void)
1300void sigqueue_free(struct sigqueue *q) 1298void sigqueue_free(struct sigqueue *q)
1301{ 1299{
1302 unsigned long flags; 1300 unsigned long flags;
1301 spinlock_t *lock = &current->sighand->siglock;
1302
1303 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1303 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1304 /* 1304 /*
1305 * If the signal is still pending remove it from the 1305 * If the signal is still pending remove it from the
1306 * pending queue. 1306 * pending queue. We must hold ->siglock while testing
1307 * q->list to serialize with collect_signal().
1307 */ 1308 */
1308 if (unlikely(!list_empty(&q->list))) { 1309 spin_lock_irqsave(lock, flags);
1309 spinlock_t *lock = &current->sighand->siglock; 1310 if (!list_empty(&q->list))
1310 read_lock(&tasklist_lock); 1311 list_del_init(&q->list);
1311 spin_lock_irqsave(lock, flags); 1312 spin_unlock_irqrestore(lock, flags);
1312 if (!list_empty(&q->list)) 1313
1313 list_del_init(&q->list);
1314 spin_unlock_irqrestore(lock, flags);
1315 read_unlock(&tasklist_lock);
1316 }
1317 q->flags &= ~SIGQUEUE_PREALLOC; 1314 q->flags &= ~SIGQUEUE_PREALLOC;
1318 __sigqueue_free(q); 1315 __sigqueue_free(q);
1319} 1316}
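
The sigqueue_free() rework above replaces the "test outside the lock, then lock" sequence with an unconditional siglock section, so the list_empty() test and list_del_init() happen atomically with respect to collect_signal(). A rough userspace analogue of that pattern using a pthread mutex; the helper names below are invented for the sketch and are not kernel APIs:

#include <pthread.h>
#include <stdbool.h>

struct node { struct node *next, *prev; };

static pthread_mutex_t siglock = PTHREAD_MUTEX_INITIALIZER;

static bool still_queued(const struct node *n)   /* !list_empty() analogue */
{
        return n->next != n;
}

static void unlink_init(struct node *n)          /* list_del_init() analogue */
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        n->next = n->prev = n;
}

static void queue_free(struct node *n)
{
        pthread_mutex_lock(&siglock);    /* always take the lock first ... */
        if (still_queued(n))             /* ... test membership under it ... */
                unlink_init(n);          /* ... and unlink before dropping it */
        pthread_mutex_unlock(&siglock);
}

int main(void)
{
        struct node q = { &q, &q };      /* starts unqueued */

        queue_free(&q);                  /* safe whether or not it was queued */
        return 0;
}
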
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0f546ddea43d..bd89bc4eb0b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -271,8 +271,6 @@ asmlinkage void do_softirq(void)
271 local_irq_restore(flags); 271 local_irq_restore(flags);
272} 272}
273 273
274EXPORT_SYMBOL(do_softirq);
275
276#endif 274#endif
277 275
278/* 276/*
@@ -332,8 +330,6 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr)
332 wakeup_softirqd(); 330 wakeup_softirqd();
333} 331}
334 332
335EXPORT_SYMBOL(raise_softirq_irqoff);
336
337void fastcall raise_softirq(unsigned int nr) 333void fastcall raise_softirq(unsigned int nr)
338{ 334{
339 unsigned long flags; 335 unsigned long flags;
diff --git a/kernel/sys.c b/kernel/sys.c
index 449b81b98b3d..8ae2e636eb1b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -32,6 +32,7 @@
32#include <linux/getcpu.h> 32#include <linux/getcpu.h>
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h>
35 36
36#include <linux/compat.h> 37#include <linux/compat.h>
37#include <linux/syscalls.h> 38#include <linux/syscalls.h>
@@ -878,6 +879,7 @@ void kernel_power_off(void)
878 kernel_shutdown_prepare(SYSTEM_POWER_OFF); 879 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
879 if (pm_power_off_prepare) 880 if (pm_power_off_prepare)
880 pm_power_off_prepare(); 881 pm_power_off_prepare();
882 disable_nonboot_cpus();
881 sysdev_shutdown(); 883 sysdev_shutdown();
882 printk(KERN_EMERG "Power down.\n"); 884 printk(KERN_EMERG "Power down.\n");
883 machine_power_off(); 885 machine_power_off();
@@ -1442,7 +1444,6 @@ asmlinkage long sys_times(struct tms __user * tbuf)
1442 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. 1444 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
1443 * LBT 04.03.94 1445 * LBT 04.03.94
1444 */ 1446 */
1445
1446asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) 1447asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1447{ 1448{
1448 struct task_struct *p; 1449 struct task_struct *p;
@@ -1470,7 +1471,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1470 if (!thread_group_leader(p)) 1471 if (!thread_group_leader(p))
1471 goto out; 1472 goto out;
1472 1473
1473 if (p->real_parent == group_leader) { 1474 if (p->real_parent->tgid == group_leader->tgid) {
1474 err = -EPERM; 1475 err = -EPERM;
1475 if (task_session(p) != task_session(group_leader)) 1476 if (task_session(p) != task_session(group_leader))
1476 goto out; 1477 goto out;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8bdb8c07e04f..6c97259e863e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,7 +27,6 @@
27#include <linux/capability.h> 27#include <linux/capability.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/capability.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/fs.h> 31#include <linux/fs.h>
33#include <linux/init.h> 32#include <linux/init.h>
@@ -223,8 +222,19 @@ static ctl_table kern_table[] = {
223#ifdef CONFIG_SCHED_DEBUG 222#ifdef CONFIG_SCHED_DEBUG
224 { 223 {
225 .ctl_name = CTL_UNNUMBERED, 224 .ctl_name = CTL_UNNUMBERED,
226 .procname = "sched_granularity_ns", 225 .procname = "sched_min_granularity_ns",
227 .data = &sysctl_sched_granularity, 226 .data = &sysctl_sched_min_granularity,
227 .maxlen = sizeof(unsigned int),
228 .mode = 0644,
229 .proc_handler = &proc_dointvec_minmax,
230 .strategy = &sysctl_intvec,
231 .extra1 = &min_sched_granularity_ns,
232 .extra2 = &max_sched_granularity_ns,
233 },
234 {
235 .ctl_name = CTL_UNNUMBERED,
236 .procname = "sched_latency_ns",
237 .data = &sysctl_sched_latency,
228 .maxlen = sizeof(unsigned int), 238 .maxlen = sizeof(unsigned int),
229 .mode = 0644, 239 .mode = 0644,
230 .proc_handler = &proc_dointvec_minmax, 240 .proc_handler = &proc_dointvec_minmax,
@@ -284,6 +294,23 @@ static ctl_table kern_table[] = {
284 .mode = 0644, 294 .mode = 0644,
285 .proc_handler = &proc_dointvec, 295 .proc_handler = &proc_dointvec,
286 }, 296 },
297 {
298 .ctl_name = CTL_UNNUMBERED,
299 .procname = "sched_features",
300 .data = &sysctl_sched_features,
301 .maxlen = sizeof(unsigned int),
302 .mode = 0644,
303 .proc_handler = &proc_dointvec,
304 },
305#endif
306 {
307 .ctl_name = CTL_UNNUMBERED,
308 .procname = "sched_compat_yield",
309 .data = &sysctl_sched_compat_yield,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = &proc_dointvec,
313 },
287#ifdef CONFIG_PROVE_LOCKING 314#ifdef CONFIG_PROVE_LOCKING
288 { 315 {
289 .ctl_name = CTL_UNNUMBERED, 316 .ctl_name = CTL_UNNUMBERED,
@@ -305,15 +332,6 @@ static ctl_table kern_table[] = {
305 }, 332 },
306#endif 333#endif
307 { 334 {
308 .ctl_name = CTL_UNNUMBERED,
309 .procname = "sched_features",
310 .data = &sysctl_sched_features,
311 .maxlen = sizeof(unsigned int),
312 .mode = 0644,
313 .proc_handler = &proc_dointvec,
314 },
315#endif
316 {
317 .ctl_name = KERN_PANIC, 335 .ctl_name = KERN_PANIC,
318 .procname = "panic", 336 .procname = "panic",
319 .data = &panic_timeout, 337 .data = &panic_timeout,
@@ -1035,7 +1053,7 @@ static ctl_table vm_table[] = {
1035 .strategy = &sysctl_string, 1053 .strategy = &sysctl_string,
1036 }, 1054 },
1037#endif 1055#endif
1038#if defined(CONFIG_X86_32) || \ 1056#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
1039 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1057 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
1040 { 1058 {
1041 .ctl_name = VM_VDSO_ENABLED, 1059 .ctl_name = VM_VDSO_ENABLED,
@@ -1203,7 +1221,7 @@ static ctl_table fs_table[] = {
1203}; 1221};
1204 1222
1205static ctl_table debug_table[] = { 1223static ctl_table debug_table[] = {
1206#ifdef CONFIG_X86 1224#if defined(CONFIG_X86) || defined(CONFIG_PPC)
1207 { 1225 {
1208 .ctl_name = CTL_UNNUMBERED, 1226 .ctl_name = CTL_UNNUMBERED,
1209 .procname = "exception-trace", 1227 .procname = "exception-trace",
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f66351126544..8d53106a0a92 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -23,3 +23,8 @@ config HIGH_RES_TIMERS
23 hardware is not capable then this option only increases 23 hardware is not capable then this option only increases
24 the size of the kernel image. 24 the size of the kernel image.
25 25
26config GENERIC_CLOCKEVENTS_BUILD
27 bool
28 default y
29 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
30
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 99b6034fc86b..905b0b50792d 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,6 +1,6 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o 6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 41dd3105ce7f..822beebe664a 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -194,6 +194,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
194 local_irq_restore(flags); 194 local_irq_restore(flags);
195} 195}
196 196
197#ifdef CONFIG_GENERIC_CLOCKEVENTS
197/** 198/**
198 * clockevents_notify - notification about relevant events 199 * clockevents_notify - notification about relevant events
199 */ 200 */
@@ -222,4 +223,4 @@ void clockevents_notify(unsigned long reason, void *arg)
222 spin_unlock(&clockevents_lock); 223 spin_unlock(&clockevents_lock);
223} 224}
224EXPORT_SYMBOL_GPL(clockevents_notify); 225EXPORT_SYMBOL_GPL(clockevents_notify);
225 226#endif
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index cd91237dbfe3..de6a2d6b3ebb 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -226,7 +226,7 @@ static void sync_cmos_clock(unsigned long dummy)
226 226
227static void notify_cmos_timer(void) 227static void notify_cmos_timer(void)
228{ 228{
229 if (no_sync_cmos_clock) 229 if (!no_sync_cmos_clock)
230 mod_timer(&sync_cmos_timer, jiffies + 1); 230 mod_timer(&sync_cmos_timer, jiffies + 1);
231} 231}
232 232
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index db8e0f3d409b..fc3fc79b3d59 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -64,8 +64,9 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
64 */ 64 */
65int tick_check_broadcast_device(struct clock_event_device *dev) 65int tick_check_broadcast_device(struct clock_event_device *dev)
66{ 66{
67 if (tick_broadcast_device.evtdev || 67 if ((tick_broadcast_device.evtdev &&
68 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 68 tick_broadcast_device.evtdev->rating >= dev->rating) ||
69 (dev->features & CLOCK_EVT_FEAT_C3STOP))
69 return 0; 70 return 0;
70 71
71 clockevents_exchange_device(NULL, dev); 72 clockevents_exchange_device(NULL, dev);
@@ -176,8 +177,6 @@ static void tick_do_periodic_broadcast(void)
176 */ 177 */
177static void tick_handle_periodic_broadcast(struct clock_event_device *dev) 178static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
178{ 179{
179 dev->next_event.tv64 = KTIME_MAX;
180
181 tick_do_periodic_broadcast(); 180 tick_do_periodic_broadcast();
182 181
183 /* 182 /*
@@ -218,26 +217,43 @@ static void tick_do_broadcast_on_off(void *why)
218 bc = tick_broadcast_device.evtdev; 217 bc = tick_broadcast_device.evtdev;
219 218
220 /* 219 /*
221 * Is the device in broadcast mode forever or is it not 220 * Is the device not affected by the powerstate ?
222 * affected by the powerstate ?
223 */ 221 */
224 if (!dev || !tick_device_is_functional(dev) || 222 if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
225 !(dev->features & CLOCK_EVT_FEAT_C3STOP))
226 goto out; 223 goto out;
227 224
228 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) { 225 /*
226 * Defect device ?
227 */
228 if (!tick_device_is_functional(dev)) {
229 /*
230 * AMD C1E wreckage fixup:
231 *
232 * Device was registered functional in the first
233 * place. Now the secondary CPU detected the C1E
234 * misfeature and notifies us to fix it up
235 */
236 if (*reason != CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
237 goto out;
238 }
239
240 switch (*reason) {
241 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
242 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
229 if (!cpu_isset(cpu, tick_broadcast_mask)) { 243 if (!cpu_isset(cpu, tick_broadcast_mask)) {
230 cpu_set(cpu, tick_broadcast_mask); 244 cpu_set(cpu, tick_broadcast_mask);
231 if (td->mode == TICKDEV_MODE_PERIODIC) 245 if (td->mode == TICKDEV_MODE_PERIODIC)
232 clockevents_set_mode(dev, 246 clockevents_set_mode(dev,
233 CLOCK_EVT_MODE_SHUTDOWN); 247 CLOCK_EVT_MODE_SHUTDOWN);
234 } 248 }
235 } else { 249 break;
250 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
236 if (cpu_isset(cpu, tick_broadcast_mask)) { 251 if (cpu_isset(cpu, tick_broadcast_mask)) {
237 cpu_clear(cpu, tick_broadcast_mask); 252 cpu_clear(cpu, tick_broadcast_mask);
238 if (td->mode == TICKDEV_MODE_PERIODIC) 253 if (td->mode == TICKDEV_MODE_PERIODIC)
239 tick_setup_periodic(dev, 0); 254 tick_setup_periodic(dev, 0);
240 } 255 }
256 break;
241 } 257 }
242 258
243 if (cpus_empty(tick_broadcast_mask)) 259 if (cpus_empty(tick_broadcast_mask))
@@ -383,11 +399,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
383int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 399int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
384{ 400{
385 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 401 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
386 402 return 0;
387 if(!cpus_empty(tick_broadcast_oneshot_mask))
388 tick_broadcast_set_event(ktime_get(), 1);
389
390 return cpu_isset(smp_processor_id(), tick_broadcast_oneshot_mask);
391} 403}
392 404
393/* 405/*
@@ -519,11 +531,9 @@ static void tick_broadcast_clear_oneshot(int cpu)
519 */ 531 */
520void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 532void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
521{ 533{
522 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { 534 bc->event_handler = tick_handle_oneshot_broadcast;
523 bc->event_handler = tick_handle_oneshot_broadcast; 535 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
524 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 536 bc->next_event.tv64 = KTIME_MAX;
525 bc->next_event.tv64 = KTIME_MAX;
526 }
527} 537}
528 538
529/* 539/*
@@ -549,20 +559,17 @@ void tick_broadcast_switch_to_oneshot(void)
549 */ 559 */
550void tick_shutdown_broadcast_oneshot(unsigned int *cpup) 560void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
551{ 561{
552 struct clock_event_device *bc;
553 unsigned long flags; 562 unsigned long flags;
554 unsigned int cpu = *cpup; 563 unsigned int cpu = *cpup;
555 564
556 spin_lock_irqsave(&tick_broadcast_lock, flags); 565 spin_lock_irqsave(&tick_broadcast_lock, flags);
557 566
558 bc = tick_broadcast_device.evtdev; 567 /*
568 * Clear the broadcast mask flag for the dead cpu, but do not
569 * stop the broadcast device!
570 */
559 cpu_clear(cpu, tick_broadcast_oneshot_mask); 571 cpu_clear(cpu, tick_broadcast_oneshot_mask);
560 572
561 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) {
562 if (bc && cpus_empty(tick_broadcast_oneshot_mask))
563 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
564 }
565
566 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 573 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
567} 574}
568 575
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 77a21abc8716..1bea399a9ef0 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -200,7 +200,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
200 200
201 cpu = smp_processor_id(); 201 cpu = smp_processor_id();
202 if (!cpu_isset(cpu, newdev->cpumask)) 202 if (!cpu_isset(cpu, newdev->cpumask))
203 goto out; 203 goto out_bc;
204 204
205 td = &per_cpu(tick_cpu_device, cpu); 205 td = &per_cpu(tick_cpu_device, cpu);
206 curdev = td->evtdev; 206 curdev = td->evtdev;
@@ -265,7 +265,7 @@ out_bc:
265 */ 265 */
266 if (tick_check_broadcast_device(newdev)) 266 if (tick_check_broadcast_device(newdev))
267 ret = NOTIFY_STOP; 267 ret = NOTIFY_STOP;
268out: 268
269 spin_unlock_irqrestore(&tick_device_lock, flags); 269 spin_unlock_irqrestore(&tick_device_lock, flags);
270 270
271 return ret; 271 return ret;
@@ -345,6 +345,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
345 345
346 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 346 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
347 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 347 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
348 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
348 tick_broadcast_on_off(reason, dev); 349 tick_broadcast_on_off(reason, dev);
349 break; 350 break;
350 351
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b416995b9757..8c3fef1db09c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -160,6 +160,18 @@ void tick_nohz_stop_sched_tick(void)
160 cpu = smp_processor_id(); 160 cpu = smp_processor_id();
161 ts = &per_cpu(tick_cpu_sched, cpu); 161 ts = &per_cpu(tick_cpu_sched, cpu);
162 162
163 /*
164 * If this cpu is offline and it is the one which updates
165 * jiffies, then give up the assignment and let it be taken by
166 * the cpu which runs the tick timer next. If we don't drop
167 * this here the jiffies might be stale and do_timer() never
168 * invoked.
169 */
170 if (unlikely(!cpu_online(cpu))) {
171 if (cpu == tick_do_timer_cpu)
172 tick_do_timer_cpu = -1;
173 }
174
163 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 175 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
164 goto end; 176 goto end;
165 177
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index acc417b5a9b7..4ad79f6bdec6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -217,6 +217,7 @@ static void change_clocksource(void)
217} 217}
218#else 218#else
219static inline void change_clocksource(void) { } 219static inline void change_clocksource(void) { }
220static inline s64 __get_nsec_offset(void) { return 0; }
220#endif 221#endif
221 222
222/** 223/**
@@ -280,6 +281,8 @@ void __init timekeeping_init(void)
280static int timekeeping_suspended; 281static int timekeeping_suspended;
281/* time in seconds when suspend began */ 282/* time in seconds when suspend began */
282static unsigned long timekeeping_suspend_time; 283static unsigned long timekeeping_suspend_time;
284/* xtime offset when we went into suspend */
285static s64 timekeeping_suspend_nsecs;
283 286
284/** 287/**
285 * timekeeping_resume - Resumes the generic timekeeping subsystem. 288 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -305,6 +308,8 @@ static int timekeeping_resume(struct sys_device *dev)
305 wall_to_monotonic.tv_sec -= sleep_length; 308 wall_to_monotonic.tv_sec -= sleep_length;
306 total_sleep_time += sleep_length; 309 total_sleep_time += sleep_length;
307 } 310 }
311 /* Make sure that we have the correct xtime reference */
312 timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
308 /* re-base the last cycle value */ 313 /* re-base the last cycle value */
309 clock->cycle_last = clocksource_read(clock); 314 clock->cycle_last = clocksource_read(clock);
310 clock->error = 0; 315 clock->error = 0;
@@ -325,9 +330,12 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
325{ 330{
326 unsigned long flags; 331 unsigned long flags;
327 332
333 timekeeping_suspend_time = read_persistent_clock();
334
328 write_seqlock_irqsave(&xtime_lock, flags); 335 write_seqlock_irqsave(&xtime_lock, flags);
336 /* Get the current xtime offset */
337 timekeeping_suspend_nsecs = __get_nsec_offset();
329 timekeeping_suspended = 1; 338 timekeeping_suspended = 1;
330 timekeeping_suspend_time = read_persistent_clock();
331 write_sequnlock_irqrestore(&xtime_lock, flags); 339 write_sequnlock_irqrestore(&xtime_lock, flags);
332 340
333 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 341 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
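
The timekeeping hunks above save the not-yet-accumulated nanosecond offset at suspend time and fold it back into xtime on resume via timespec_add_ns(), so time that had elapsed since the last update is not lost across suspend. A simplified userspace sketch of what that helper does (a re-implementation for illustration, not the kernel's inline):

#include <stdio.h>
#include <time.h>

static void add_ns(struct timespec *ts, long long ns)
{
        ns += ts->tv_nsec;
        while (ns >= 1000000000LL) {     /* carry whole seconds */
                ns -= 1000000000LL;
                ts->tv_sec++;
        }
        ts->tv_nsec = ns;
}

int main(void)
{
        /* say 0.25s had accumulated since the last xtime update at suspend */
        struct timespec xtime = { .tv_sec = 1000, .tv_nsec = 900000000 };

        add_ns(&xtime, 250000000);
        printf("%lld.%09ld\n", (long long)xtime.tv_sec, xtime.tv_nsec);
        return 0;
}
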
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 3c38fb5eae1b..c36bb7ed0301 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -327,8 +327,9 @@ static int tstats_show(struct seq_file *m, void *v)
327 ms = 1; 327 ms = 1;
328 328
329 if (events && period.tv_sec) 329 if (events && period.tv_sec)
330 seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, 330 seq_printf(m, "%ld total events, %ld.%03ld events/sec\n",
331 events / period.tv_sec, events * 1000 / ms); 331 events, events * 1000 / ms,
332 (events * 1000000 / ms) % 1000);
332 else 333 else
333 seq_printf(m, "%ld total events\n", events); 334 seq_printf(m, "%ld total events\n", events);
334 335
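
The seq_printf() fix above prints events-per-second as a fixed-point value: with the period expressed in milliseconds, events * 1000 / ms gives the integer part and (events * 1000000 / ms) % 1000 supplies the three digits for the %03ld field, whereas the old code printed the whole per-second rate in the fractional position. A small standalone example with made-up numbers:

#include <stdio.h>

int main(void)
{
        long events = 1234;
        long ms = 7321;          /* a 7.321s measurement period */

        /* integer part, then three decimal places for the %03ld field */
        printf("%ld total events, %ld.%03ld events/sec\n",
               events, events * 1000 / ms, (events * 1000000 / ms) % 1000);
        return 0;                /* prints "1234 total events, 168.556 events/sec" */
}
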
diff --git a/kernel/user.c b/kernel/user.c
index e7d11cef6998..9ca2848fc356 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -55,25 +55,22 @@ struct user_struct root_user = {
55/* 55/*
56 * These routines must be called with the uidhash spinlock held! 56 * These routines must be called with the uidhash spinlock held!
57 */ 57 */
58static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) 58static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
59{ 59{
60 list_add(&up->uidhash_list, hashent); 60 hlist_add_head(&up->uidhash_node, hashent);
61} 61}
62 62
63static inline void uid_hash_remove(struct user_struct *up) 63static inline void uid_hash_remove(struct user_struct *up)
64{ 64{
65 list_del(&up->uidhash_list); 65 hlist_del_init(&up->uidhash_node);
66} 66}
67 67
68static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) 68static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
69{ 69{
70 struct list_head *up; 70 struct user_struct *user;
71 71 struct hlist_node *h;
72 list_for_each(up, hashent) {
73 struct user_struct *user;
74
75 user = list_entry(up, struct user_struct, uidhash_list);
76 72
73 hlist_for_each_entry(user, h, hashent, uidhash_node) {
77 if(user->uid == uid) { 74 if(user->uid == uid) {
78 atomic_inc(&user->__count); 75 atomic_inc(&user->__count);
79 return user; 76 return user;
@@ -122,7 +119,7 @@ void free_uid(struct user_struct *up)
122 119
123struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 120struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
124{ 121{
125 struct list_head *hashent = uidhashentry(ns, uid); 122 struct hlist_head *hashent = uidhashentry(ns, uid);
126 struct user_struct *up; 123 struct user_struct *up;
127 124
128 spin_lock_irq(&uidhash_lock); 125 spin_lock_irq(&uidhash_lock);
@@ -202,6 +199,30 @@ void switch_uid(struct user_struct *new_user)
202 suid_keys(current); 199 suid_keys(current);
203} 200}
204 201
202void release_uids(struct user_namespace *ns)
203{
204 int i;
205 unsigned long flags;
206 struct hlist_head *head;
207 struct hlist_node *nd;
208
209 spin_lock_irqsave(&uidhash_lock, flags);
210 /*
211 * collapse the chains so that the user_struct-s will
212 * be still alive, but not in hashes. subsequent free_uid()
213 * will free them.
214 */
215 for (i = 0; i < UIDHASH_SZ; i++) {
216 head = ns->uidhash_table + i;
217 while (!hlist_empty(head)) {
218 nd = head->first;
219 hlist_del_init(nd);
220 }
221 }
222 spin_unlock_irqrestore(&uidhash_lock, flags);
223
224 free_uid(ns->root_user);
225}
205 226
206static int __init uid_cache_init(void) 227static int __init uid_cache_init(void)
207{ 228{
@@ -211,7 +232,7 @@ static int __init uid_cache_init(void)
211 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 232 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
212 233
213 for(n = 0; n < UIDHASH_SZ; ++n) 234 for(n = 0; n < UIDHASH_SZ; ++n)
214 INIT_LIST_HEAD(init_user_ns.uidhash_table + n); 235 INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
215 236
216 /* Insert the root user immediately (init already runs as root) */ 237 /* Insert the root user immediately (init already runs as root) */
217 spin_lock_irq(&uidhash_lock); 238 spin_lock_irq(&uidhash_lock);
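
The kernel/user.c conversion above moves the uid hash from list_head to hlist_head/hlist_node buckets, which shrinks each bucket head to a single pointer and lets a user_struct be unhashed through its pprev back-pointer without knowing its bucket. A simplified userspace sketch of that data structure and the insert/find/remove pattern (re-implemented here for illustration; not the kernel's <linux/list.h>):

#include <stdio.h>
#include <stddef.h>

struct hlist_head { struct hlist_node *first; };
struct hlist_node { struct hlist_node *next, **pprev; };

static void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
        n->next = h->first;
        if (h->first)
                h->first->pprev = &n->next;
        h->first = n;
        n->pprev = &h->first;
}

static void hlist_del_init(struct hlist_node *n)
{
        if (n->pprev) {                  /* currently hashed? */
                *n->pprev = n->next;
                if (n->next)
                        n->next->pprev = n->pprev;
                n->next = NULL;
                n->pprev = NULL;
        }
}

struct user { unsigned int uid; struct hlist_node uidhash_node; };

int main(void)
{
        struct hlist_head bucket = { NULL };
        struct user root = { 0, { NULL, NULL } }, joe = { 1000, { NULL, NULL } };

        hlist_add_head(&root.uidhash_node, &bucket);    /* uid_hash_insert() */
        hlist_add_head(&joe.uidhash_node, &bucket);

        /* uid_hash_find()-style walk of one bucket */
        for (struct hlist_node *h = bucket.first; h; h = h->next) {
                struct user *u = (struct user *)((char *)h -
                                 offsetof(struct user, uidhash_node));
                printf("uid %u\n", u->uid);
        }

        hlist_del_init(&joe.uidhash_node);              /* uid_hash_remove() */
        printf("bucket %sempty\n", bucket.first ? "not " : "");
        return 0;
}
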
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index d055d987850c..7af90fc4f0fd 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,7 +39,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
39 kref_init(&ns->kref); 39 kref_init(&ns->kref);
40 40
41 for (n = 0; n < UIDHASH_SZ; ++n) 41 for (n = 0; n < UIDHASH_SZ; ++n)
42 INIT_LIST_HEAD(ns->uidhash_table + n); 42 INIT_HLIST_HEAD(ns->uidhash_table + n);
43 43
44 /* Insert new root user. */ 44 /* Insert new root user. */
45 ns->root_user = alloc_uid(ns, 0); 45 ns->root_user = alloc_uid(ns, 0);
@@ -81,6 +81,7 @@ void free_user_ns(struct kref *kref)
81 struct user_namespace *ns; 81 struct user_namespace *ns;
82 82
83 ns = container_of(kref, struct user_namespace, kref); 83 ns = container_of(kref, struct user_namespace, kref);
84 release_uids(ns);
84 kfree(ns); 85 kfree(ns);
85} 86}
86 87
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 9d8180a0f0d8..816d7b24fa03 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -28,7 +28,9 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
28 if (!ns) 28 if (!ns)
29 return ERR_PTR(-ENOMEM); 29 return ERR_PTR(-ENOMEM);
30 30
31 down_read(&uts_sem);
31 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 32 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
33 up_read(&uts_sem);
32 kref_init(&ns->kref); 34 kref_init(&ns->kref);
33 return ns; 35 return ns;
34} 36}
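
The utsname.c hunk takes uts_sem for reading around the memcpy() so the clone cannot observe a half-updated name while a writer (a sethostname-style update) modifies it. A rough userspace analogue with a pthread rwlock; the struct and variable names below are invented for the sketch:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct name { char host[65]; };

static pthread_rwlock_t uts_sem = PTHREAD_RWLOCK_INITIALIZER;
static struct name shared_name = { "old-host" };

static void clone_name(struct name *copy)
{
        pthread_rwlock_rdlock(&uts_sem);        /* down_read() analogue */
        memcpy(copy, &shared_name, sizeof(*copy));
        pthread_rwlock_unlock(&uts_sem);        /* up_read() analogue */
}

int main(void)
{
        struct name snapshot;

        clone_name(&snapshot);
        printf("%s\n", snapshot.host);
        return 0;
}
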
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 58e5c152a6bb..e080d1d744cc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -635,7 +635,7 @@ int keventd_up(void)
635int current_is_keventd(void) 635int current_is_keventd(void)
636{ 636{
637 struct cpu_workqueue_struct *cwq; 637 struct cpu_workqueue_struct *cwq;
638 int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */ 638 int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */
639 int ret = 0; 639 int ret = 0;
640 640
641 BUG_ON(!keventd_wq); 641 BUG_ON(!keventd_wq);