Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpu.c                  |   4
-rw-r--r--  kernel/exit.c                 |  11
-rw-r--r--  kernel/fork.c                 |   2
-rw-r--r--  kernel/futex.c                |  26
-rw-r--r--  kernel/futex_compat.c         |  30
-rw-r--r--  kernel/irq/manage.c           |  11
-rw-r--r--  kernel/kmod.c                 |   2
-rw-r--r--  kernel/module.c               |   3
-rw-r--r--  kernel/power/Kconfig          |  41
-rw-r--r--  kernel/ptrace.c               |   1
-rw-r--r--  kernel/sched.c                | 112
-rw-r--r--  kernel/sched_debug.c          |   4
-rw-r--r--  kernel/sched_fair.c           | 216
-rw-r--r--  kernel/sched_rt.c             |  11
-rw-r--r--  kernel/signal.c               |  49
-rw-r--r--  kernel/sys.c                  |   5
-rw-r--r--  kernel/sysctl.c               |  41
-rw-r--r--  kernel/time/ntp.c             |   2
-rw-r--r--  kernel/time/tick-broadcast.c  |  17
-rw-r--r--  kernel/time/tick-sched.c      |  12
-rw-r--r--  kernel/time/timekeeping.c     |  10
-rw-r--r--  kernel/time/timer_stats.c     |   5
-rw-r--r--  kernel/user.c                 |  45
-rw-r--r--  kernel/user_namespace.c       |   3
-rw-r--r--  kernel/utsname.c              |   2
-rw-r--r--  kernel/workqueue.c            |   2
26 files changed, 464 insertions(+), 203 deletions(-)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 181ae7086029..38033db8d8ec 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -273,7 +273,7 @@ int __cpuinit cpu_up(unsigned int cpu)
273 return err; 273 return err;
274} 274}
275 275
276#ifdef CONFIG_SUSPEND_SMP 276#ifdef CONFIG_PM_SLEEP_SMP
277static cpumask_t frozen_cpus; 277static cpumask_t frozen_cpus;
278 278
279int disable_nonboot_cpus(void) 279int disable_nonboot_cpus(void)
@@ -334,4 +334,4 @@ void enable_nonboot_cpus(void)
334out: 334out:
335 mutex_unlock(&cpu_add_remove_lock); 335 mutex_unlock(&cpu_add_remove_lock);
336} 336}
337#endif 337#endif /* CONFIG_PM_SLEEP_SMP */
diff --git a/kernel/exit.c b/kernel/exit.c
index 9578c1ae19ca..993369ee94d1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -24,7 +24,6 @@
24#include <linux/pid_namespace.h> 24#include <linux/pid_namespace.h>
25#include <linux/ptrace.h> 25#include <linux/ptrace.h>
26#include <linux/profile.h> 26#include <linux/profile.h>
27#include <linux/signalfd.h>
28#include <linux/mount.h> 27#include <linux/mount.h>
29#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
30#include <linux/kthread.h> 29#include <linux/kthread.h>
@@ -86,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
86 sighand = rcu_dereference(tsk->sighand); 85 sighand = rcu_dereference(tsk->sighand);
87 spin_lock(&sighand->siglock); 86 spin_lock(&sighand->siglock);
88 87
89 /*
90 * Notify that this sighand has been detached. This must
91 * be called with the tsk->sighand lock held. Also, this
92 * access tsk->sighand internally, so it must be called
93 * before tsk->sighand is reset.
94 */
95 signalfd_detach_locked(tsk);
96
97 posix_cpu_timers_exit(tsk); 88 posix_cpu_timers_exit(tsk);
98 if (atomic_dec_and_test(&sig->count)) 89 if (atomic_dec_and_test(&sig->count))
99 posix_cpu_timers_exit_group(tsk); 90 posix_cpu_timers_exit_group(tsk);
@@ -975,6 +966,7 @@ fastcall NORET_TYPE void do_exit(long code)
975 if (unlikely(tsk->audit_context)) 966 if (unlikely(tsk->audit_context))
976 audit_free(tsk); 967 audit_free(tsk);
977 968
969 tsk->exit_code = code;
978 taskstats_exit(tsk, group_dead); 970 taskstats_exit(tsk, group_dead);
979 971
980 exit_mm(tsk); 972 exit_mm(tsk);
@@ -996,7 +988,6 @@ fastcall NORET_TYPE void do_exit(long code)
996 if (tsk->binfmt) 988 if (tsk->binfmt)
997 module_put(tsk->binfmt->module); 989 module_put(tsk->binfmt->module);
998 990
999 tsk->exit_code = code;
1000 proc_exit_connector(tsk); 991 proc_exit_connector(tsk);
1001 exit_task_namespaces(tsk); 992 exit_task_namespaces(tsk);
1002 exit_notify(tsk); 993 exit_notify(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7332e236d367..33f12f48684a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1438,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep,
1438 struct sighand_struct *sighand = data; 1438 struct sighand_struct *sighand = data;
1439 1439
1440 spin_lock_init(&sighand->siglock); 1440 spin_lock_init(&sighand->siglock);
1441 INIT_LIST_HEAD(&sighand->signalfd_list); 1441 init_waitqueue_head(&sighand->signalfd_wqh);
1442} 1442}
1443 1443
1444void __init proc_caches_init(void) 1444void __init proc_caches_init(void)
diff --git a/kernel/futex.c b/kernel/futex.c
index e8935b195e88..fcc94e7b4086 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1943,9 +1943,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
1943void exit_robust_list(struct task_struct *curr) 1943void exit_robust_list(struct task_struct *curr)
1944{ 1944{
1945 struct robust_list_head __user *head = curr->robust_list; 1945 struct robust_list_head __user *head = curr->robust_list;
1946 struct robust_list __user *entry, *pending; 1946 struct robust_list __user *entry, *next_entry, *pending;
1947 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; 1947 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
1948 unsigned long futex_offset; 1948 unsigned long futex_offset;
1949 int rc;
1949 1950
1950 /* 1951 /*
1951 * Fetch the list head (which was registered earlier, via 1952 * Fetch the list head (which was registered earlier, via
@@ -1965,12 +1966,14 @@ void exit_robust_list(struct task_struct *curr)
1965 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) 1966 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
1966 return; 1967 return;
1967 1968
1968 if (pending) 1969 next_entry = NULL; /* avoid warning with gcc */
1969 handle_futex_death((void __user *)pending + futex_offset,
1970 curr, pip);
1971
1972 while (entry != &head->list) { 1970 while (entry != &head->list) {
1973 /* 1971 /*
1972 * Fetch the next entry in the list before calling
1973 * handle_futex_death:
1974 */
1975 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
1976 /*
1974 * A pending lock might already be on the list, so 1977 * A pending lock might already be on the list, so
1975 * don't process it twice: 1978 * don't process it twice:
1976 */ 1979 */
@@ -1978,11 +1981,10 @@ void exit_robust_list(struct task_struct *curr)
1978 if (handle_futex_death((void __user *)entry + futex_offset, 1981 if (handle_futex_death((void __user *)entry + futex_offset,
1979 curr, pi)) 1982 curr, pi))
1980 return; 1983 return;
1981 /* 1984 if (rc)
1982 * Fetch the next entry in the list:
1983 */
1984 if (fetch_robust_entry(&entry, &entry->next, &pi))
1985 return; 1985 return;
1986 entry = next_entry;
1987 pi = next_pi;
1986 /* 1988 /*
1987 * Avoid excessively long or circular lists: 1989 * Avoid excessively long or circular lists:
1988 */ 1990 */
@@ -1991,6 +1993,10 @@ void exit_robust_list(struct task_struct *curr)
1991 1993
1992 cond_resched(); 1994 cond_resched();
1993 } 1995 }
1996
1997 if (pending)
1998 handle_futex_death((void __user *)pending + futex_offset,
1999 curr, pip);
1994} 2000}
1995 2001
1996long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2002long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
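The ordering change in exit_robust_list() above is the important part: handle_futex_death() may wake the futex's next owner, after which the dying task's view of that list entry can be freed or reused, so the next pointer must be fetched before the current entry is handled, and the list_op_pending entry is now processed after the walk instead of before it. Below is a minimal user-space sketch of that walk order; the types and names are illustrative only, not the kernel's robust-list code.

/*
 * Minimal user-space model of the new walk order in exit_robust_list():
 * the next pointer is loaded *before* the current entry is handled,
 * because handling it may wake the waiter that owns (and may free or
 * reuse) that list entry.
 */
#include <stdio.h>
#include <stdlib.h>

struct robust_entry {
	struct robust_entry *next;
	int futex_val;
};

/* Stand-in for handle_futex_death(): after this, *e must not be touched. */
static void handle_entry(struct robust_entry *e)
{
	printf("handling futex %d\n", e->futex_val);
	free(e);				/* entry may vanish under us */
}

static void walk_robust_list(struct robust_entry *head,
			     struct robust_entry *pending)
{
	struct robust_entry *entry = head, *next_entry;
	int limit = 2048;			/* mirrors ROBUST_LIST_LIMIT */

	while (entry) {
		next_entry = entry->next;	/* fetch next *first* */
		if (entry != pending)		/* don't process it twice */
			handle_entry(entry);
		entry = next_entry;
		if (!--limit)			/* avoid circular lists */
			break;
	}
	if (pending)				/* op_pending handled last */
		handle_entry(pending);
}

int main(void)
{
	struct robust_entry *a = malloc(sizeof(*a));
	struct robust_entry *b = malloc(sizeof(*b));

	b->next = NULL;  b->futex_val = 2;
	a->next = b;     a->futex_val = 1;
	walk_robust_list(a, NULL);
	return 0;
}
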
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index f7921360efad..2c2e2954b713 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -38,10 +38,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
38void compat_exit_robust_list(struct task_struct *curr) 38void compat_exit_robust_list(struct task_struct *curr)
39{ 39{
40 struct compat_robust_list_head __user *head = curr->compat_robust_list; 40 struct compat_robust_list_head __user *head = curr->compat_robust_list;
41 struct robust_list __user *entry, *pending; 41 struct robust_list __user *entry, *next_entry, *pending;
42 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; 42 unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
43 compat_uptr_t uentry, upending; 43 compat_uptr_t uentry, next_uentry, upending;
44 compat_long_t futex_offset; 44 compat_long_t futex_offset;
45 int rc;
45 46
46 /* 47 /*
47 * Fetch the list head (which was registered earlier, via 48 * Fetch the list head (which was registered earlier, via
@@ -61,10 +62,15 @@ void compat_exit_robust_list(struct task_struct *curr)
61 if (fetch_robust_entry(&upending, &pending, 62 if (fetch_robust_entry(&upending, &pending,
62 &head->list_op_pending, &pip)) 63 &head->list_op_pending, &pip))
63 return; 64 return;
64 if (upending)
65 handle_futex_death((void __user *)pending + futex_offset, curr, pip);
66 65
67 while (compat_ptr(uentry) != &head->list) { 66 next_entry = NULL; /* avoid warning with gcc */
67 while (entry != (struct robust_list __user *) &head->list) {
68 /*
69 * Fetch the next entry in the list before calling
70 * handle_futex_death:
71 */
72 rc = fetch_robust_entry(&next_uentry, &next_entry,
73 (compat_uptr_t __user *)&entry->next, &next_pi);
68 /* 74 /*
69 * A pending lock might already be on the list, so 75 * A pending lock might already be on the list, so
70 * dont process it twice: 76 * dont process it twice:
@@ -74,12 +80,11 @@ void compat_exit_robust_list(struct task_struct *curr)
74 curr, pi)) 80 curr, pi))
75 return; 81 return;
76 82
77 /* 83 if (rc)
78 * Fetch the next entry in the list:
79 */
80 if (fetch_robust_entry(&uentry, &entry,
81 (compat_uptr_t __user *)&entry->next, &pi))
82 return; 84 return;
85 uentry = next_uentry;
86 entry = next_entry;
87 pi = next_pi;
83 /* 88 /*
84 * Avoid excessively long or circular lists: 89 * Avoid excessively long or circular lists:
85 */ 90 */
@@ -88,6 +93,9 @@ void compat_exit_robust_list(struct task_struct *curr)
88 93
89 cond_resched(); 94 cond_resched();
90 } 95 }
96 if (pending)
97 handle_futex_death((void __user *)pending + futex_offset,
98 curr, pip);
91} 99}
92 100
93asmlinkage long 101asmlinkage long
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 853aefbd184b..7230d914eaa2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -547,14 +547,11 @@ int request_irq(unsigned int irq, irq_handler_t handler,
547 * We do this before actually registering it, to make sure that 547 * We do this before actually registering it, to make sure that
548 * a 'real' IRQ doesn't run in parallel with our fake 548 * a 'real' IRQ doesn't run in parallel with our fake
549 */ 549 */
550 if (irqflags & IRQF_DISABLED) { 550 unsigned long flags;
551 unsigned long flags;
552 551
553 local_irq_save(flags); 552 local_irq_save(flags);
554 handler(irq, dev_id); 553 handler(irq, dev_id);
555 local_irq_restore(flags); 554 local_irq_restore(flags);
556 } else
557 handler(irq, dev_id);
558 } 555 }
559#endif 556#endif
560 557
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9809cc1f33d6..c6a4f8aebeba 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -505,7 +505,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
505 if (ret < 0) 505 if (ret < 0)
506 goto out; 506 goto out;
507 507
508 return call_usermodehelper_exec(sub_info, 1); 508 return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
509 509
510 out: 510 out:
511 call_usermodehelper_freeinfo(sub_info); 511 call_usermodehelper_freeinfo(sub_info);
diff --git a/kernel/module.c b/kernel/module.c
index 33c04ad51175..db0ead0363e2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -784,8 +784,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
784static ssize_t show_refcnt(struct module_attribute *mattr, 784static ssize_t show_refcnt(struct module_attribute *mattr,
785 struct module *mod, char *buffer) 785 struct module *mod, char *buffer)
786{ 786{
787 /* sysfs holds a reference */ 787 return sprintf(buffer, "%u\n", module_refcount(mod));
788 return sprintf(buffer, "%u\n", module_refcount(mod)-1);
789} 788}
790 789
791static struct module_attribute refcnt = { 790static struct module_attribute refcnt = {
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 412859f8d94a..14b0e10dc95c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -72,15 +72,10 @@ config PM_TRACE
72 CAUTION: this option will cause your machine's real-time clock to be 72 CAUTION: this option will cause your machine's real-time clock to be
73 set to an invalid time after a resume. 73 set to an invalid time after a resume.
74 74
75config SUSPEND_SMP_POSSIBLE 75config PM_SLEEP_SMP
76 bool
77 depends on (X86 && !X86_VOYAGER) || (PPC64 && (PPC_PSERIES || PPC_PMAC))
78 depends on SMP
79 default y
80
81config SUSPEND_SMP
82 bool 76 bool
83 depends on SUSPEND_SMP_POSSIBLE && PM_SLEEP 77 depends on SUSPEND_SMP_POSSIBLE || HIBERNATION_SMP_POSSIBLE
78 depends on PM_SLEEP
84 select HOTPLUG_CPU 79 select HOTPLUG_CPU
85 default y 80 default y
86 81
@@ -89,20 +84,46 @@ config PM_SLEEP
89 depends on SUSPEND || HIBERNATION 84 depends on SUSPEND || HIBERNATION
90 default y 85 default y
91 86
87config SUSPEND_UP_POSSIBLE
88 bool
89 depends on (X86 && !X86_VOYAGER) || PPC || ARM || BLACKFIN || MIPS \
90 || SUPERH || FRV
91 depends on !SMP
92 default y
93
94config SUSPEND_SMP_POSSIBLE
95 bool
96 depends on (X86 && !X86_VOYAGER) \
97 || (PPC && (PPC_PSERIES || PPC_PMAC)) || ARM
98 depends on SMP
99 default y
100
92config SUSPEND 101config SUSPEND
93 bool "Suspend to RAM and standby" 102 bool "Suspend to RAM and standby"
94 depends on PM 103 depends on PM
95 depends on !SMP || SUSPEND_SMP_POSSIBLE 104 depends on SUSPEND_UP_POSSIBLE || SUSPEND_SMP_POSSIBLE
96 default y 105 default y
97 ---help--- 106 ---help---
98 Allow the system to enter sleep states in which main memory is 107 Allow the system to enter sleep states in which main memory is
99 powered and thus its contents are preserved, such as the 108 powered and thus its contents are preserved, such as the
100 suspend-to-RAM state (i.e. the ACPI S3 state). 109 suspend-to-RAM state (i.e. the ACPI S3 state).
101 110
111config HIBERNATION_UP_POSSIBLE
112 bool
113 depends on X86 || PPC64_SWSUSP || PPC32
114 depends on !SMP
115 default y
116
117config HIBERNATION_SMP_POSSIBLE
118 bool
119 depends on (X86 && !X86_VOYAGER) || PPC64_SWSUSP
120 depends on SMP
121 default y
122
102config HIBERNATION 123config HIBERNATION
103 bool "Hibernation (aka 'suspend to disk')" 124 bool "Hibernation (aka 'suspend to disk')"
104 depends on PM && SWAP 125 depends on PM && SWAP
105 depends on ((X86 || PPC64_SWSUSP || FRV || PPC32) && !SMP) || SUSPEND_SMP_POSSIBLE 126 depends on HIBERNATION_UP_POSSIBLE || HIBERNATION_SMP_POSSIBLE
106 ---help--- 127 ---help---
107 Enable the suspend to disk (STD) functionality, which is usually 128 Enable the suspend to disk (STD) functionality, which is usually
108 called "hibernation" in user interfaces. STD checkpoints the 129 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 82a558b655da..3eca7a55f2ee 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -233,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
233 233
234 /* Architecture-specific hardware disable .. */ 234 /* Architecture-specific hardware disable .. */
235 ptrace_disable(child); 235 ptrace_disable(child);
236 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
236 237
237 write_lock_irq(&tasklist_lock); 238 write_lock_irq(&tasklist_lock);
238 /* protect against de_thread()->release_task() */ 239 /* protect against de_thread()->release_task() */
diff --git a/kernel/sched.c b/kernel/sched.c
index 45e17b83b7f1..6107a0cd6325 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,7 +262,8 @@ struct rq {
262 s64 clock_max_delta; 262 s64 clock_max_delta;
263 263
264 unsigned int clock_warps, clock_overflows; 264 unsigned int clock_warps, clock_overflows;
265 unsigned int clock_unstable_events; 265 u64 idle_clock;
266 unsigned int clock_deep_idle_events;
266 u64 tick_timestamp; 267 u64 tick_timestamp;
267 268
268 atomic_t nr_iowait; 269 atomic_t nr_iowait;
@@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void)
556} 557}
557 558
558/* 559/*
559 * CPU frequency is/was unstable - start new by setting prev_clock_raw: 560 * We are going deep-idle (irqs are disabled):
560 */ 561 */
561void sched_clock_unstable_event(void) 562void sched_clock_idle_sleep_event(void)
562{ 563{
563 unsigned long flags; 564 struct rq *rq = cpu_rq(smp_processor_id());
564 struct rq *rq;
565 565
566 rq = task_rq_lock(current, &flags); 566 spin_lock(&rq->lock);
567 rq->prev_clock_raw = sched_clock(); 567 __update_rq_clock(rq);
568 rq->clock_unstable_events++; 568 spin_unlock(&rq->lock);
569 task_rq_unlock(rq, &flags); 569 rq->clock_deep_idle_events++;
570}
571EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
572
573/*
574 * We just idled delta nanoseconds (called with irqs disabled):
575 */
576void sched_clock_idle_wakeup_event(u64 delta_ns)
577{
578 struct rq *rq = cpu_rq(smp_processor_id());
579 u64 now = sched_clock();
580
581 rq->idle_clock += delta_ns;
582 /*
583 * Override the previous timestamp and ignore all
584 * sched_clock() deltas that occured while we idled,
585 * and use the PM-provided delta_ns to advance the
586 * rq clock:
587 */
588 spin_lock(&rq->lock);
589 rq->prev_clock_raw = now;
590 rq->clock += delta_ns;
591 spin_unlock(&rq->lock);
570} 592}
593EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
571 594
572/* 595/*
573 * resched_task - mark a task 'to be rescheduled now'. 596 * resched_task - mark a task 'to be rescheduled now'.
@@ -645,7 +668,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor)
645/* 668/*
646 * Shift right and round: 669 * Shift right and round:
647 */ 670 */
648#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 671#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
649 672
650static unsigned long 673static unsigned long
651calc_delta_mine(unsigned long delta_exec, unsigned long weight, 674calc_delta_mine(unsigned long delta_exec, unsigned long weight,
@@ -661,10 +684,10 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
661 * Check whether we'd overflow the 64-bit multiplication: 684 * Check whether we'd overflow the 64-bit multiplication:
662 */ 685 */
663 if (unlikely(tmp > WMULT_CONST)) 686 if (unlikely(tmp > WMULT_CONST))
664 tmp = RSR(RSR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 687 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
665 WMULT_SHIFT/2); 688 WMULT_SHIFT/2);
666 else 689 else
667 tmp = RSR(tmp * lw->inv_weight, WMULT_SHIFT); 690 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
668 691
669 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 692 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
670} 693}
@@ -835,7 +858,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq)
835 858
836static void set_load_weight(struct task_struct *p) 859static void set_load_weight(struct task_struct *p)
837{ 860{
838 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
839 p->se.wait_runtime = 0; 861 p->se.wait_runtime = 0;
840 862
841 if (task_has_rt_policy(p)) { 863 if (task_has_rt_policy(p)) {
@@ -1564,6 +1586,7 @@ static void __sched_fork(struct task_struct *p)
1564 p->se.wait_start_fair = 0; 1586 p->se.wait_start_fair = 0;
1565 p->se.exec_start = 0; 1587 p->se.exec_start = 0;
1566 p->se.sum_exec_runtime = 0; 1588 p->se.sum_exec_runtime = 0;
1589 p->se.prev_sum_exec_runtime = 0;
1567 p->se.delta_exec = 0; 1590 p->se.delta_exec = 0;
1568 p->se.delta_fair_run = 0; 1591 p->se.delta_fair_run = 0;
1569 p->se.delta_fair_sleep = 0; 1592 p->se.delta_fair_sleep = 0;
@@ -1659,6 +1682,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1659 1682
1660 p->prio = effective_prio(p); 1683 p->prio = effective_prio(p);
1661 1684
1685 if (rt_prio(p->prio))
1686 p->sched_class = &rt_sched_class;
1687 else
1688 p->sched_class = &fair_sched_class;
1689
1662 if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || 1690 if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
1663 (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || 1691 (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
1664 !current->se.on_rq) { 1692 !current->se.on_rq) {
@@ -2157,12 +2185,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2157 if (task_running(rq, p)) 2185 if (task_running(rq, p))
2158 return 0; 2186 return 0;
2159 2187
2160 /*
2161 * Aggressive migration if too many balance attempts have failed:
2162 */
2163 if (sd->nr_balance_failed > sd->cache_nice_tries)
2164 return 1;
2165
2166 return 1; 2188 return 1;
2167} 2189}
2168 2190
@@ -2494,7 +2516,7 @@ group_next:
2494 * a think about bumping its value to force at least one task to be 2516 * a think about bumping its value to force at least one task to be
2495 * moved 2517 * moved
2496 */ 2518 */
2497 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { 2519 if (*imbalance < busiest_load_per_task) {
2498 unsigned long tmp, pwr_now, pwr_move; 2520 unsigned long tmp, pwr_now, pwr_move;
2499 unsigned int imbn; 2521 unsigned int imbn;
2500 2522
@@ -2546,10 +2568,8 @@ small_imbalance:
2546 pwr_move /= SCHED_LOAD_SCALE; 2568 pwr_move /= SCHED_LOAD_SCALE;
2547 2569
2548 /* Move if we gain throughput */ 2570 /* Move if we gain throughput */
2549 if (pwr_move <= pwr_now) 2571 if (pwr_move > pwr_now)
2550 goto out_balanced; 2572 *imbalance = busiest_load_per_task;
2551
2552 *imbalance = busiest_load_per_task;
2553 } 2573 }
2554 2574
2555 return busiest; 2575 return busiest;
@@ -3020,6 +3040,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3020 struct sched_domain *sd; 3040 struct sched_domain *sd;
3021 /* Earliest time when we have to do rebalance again */ 3041 /* Earliest time when we have to do rebalance again */
3022 unsigned long next_balance = jiffies + 60*HZ; 3042 unsigned long next_balance = jiffies + 60*HZ;
3043 int update_next_balance = 0;
3023 3044
3024 for_each_domain(cpu, sd) { 3045 for_each_domain(cpu, sd) {
3025 if (!(sd->flags & SD_LOAD_BALANCE)) 3046 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3056,8 +3077,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3056 if (sd->flags & SD_SERIALIZE) 3077 if (sd->flags & SD_SERIALIZE)
3057 spin_unlock(&balancing); 3078 spin_unlock(&balancing);
3058out: 3079out:
3059 if (time_after(next_balance, sd->last_balance + interval)) 3080 if (time_after(next_balance, sd->last_balance + interval)) {
3060 next_balance = sd->last_balance + interval; 3081 next_balance = sd->last_balance + interval;
3082 update_next_balance = 1;
3083 }
3061 3084
3062 /* 3085 /*
3063 * Stop the load balance at this level. There is another 3086 * Stop the load balance at this level. There is another
@@ -3067,7 +3090,14 @@ out:
3067 if (!balance) 3090 if (!balance)
3068 break; 3091 break;
3069 } 3092 }
3070 rq->next_balance = next_balance; 3093
3094 /*
3095 * next_balance will be updated only when there is a need.
3096 * When the cpu is attached to null domain for ex, it will not be
3097 * updated.
3098 */
3099 if (likely(update_next_balance))
3100 rq->next_balance = next_balance;
3071} 3101}
3072 3102
3073/* 3103/*
@@ -4525,10 +4555,7 @@ asmlinkage long sys_sched_yield(void)
4525 struct rq *rq = this_rq_lock(); 4555 struct rq *rq = this_rq_lock();
4526 4556
4527 schedstat_inc(rq, yld_cnt); 4557 schedstat_inc(rq, yld_cnt);
4528 if (unlikely(rq->nr_running == 1)) 4558 current->sched_class->yield_task(rq, current);
4529 schedstat_inc(rq, yld_act_empty);
4530 else
4531 current->sched_class->yield_task(rq, current);
4532 4559
4533 /* 4560 /*
4534 * Since we are going to call schedule() anyway, there's 4561 * Since we are going to call schedule() anyway, there's
@@ -4884,14 +4911,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4884static inline void sched_init_granularity(void) 4911static inline void sched_init_granularity(void)
4885{ 4912{
4886 unsigned int factor = 1 + ilog2(num_online_cpus()); 4913 unsigned int factor = 1 + ilog2(num_online_cpus());
4887 const unsigned long gran_limit = 100000000; 4914 const unsigned long limit = 100000000;
4915
4916 sysctl_sched_min_granularity *= factor;
4917 if (sysctl_sched_min_granularity > limit)
4918 sysctl_sched_min_granularity = limit;
4888 4919
4889 sysctl_sched_granularity *= factor; 4920 sysctl_sched_latency *= factor;
4890 if (sysctl_sched_granularity > gran_limit) 4921 if (sysctl_sched_latency > limit)
4891 sysctl_sched_granularity = gran_limit; 4922 sysctl_sched_latency = limit;
4892 4923
4893 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; 4924 sysctl_sched_runtime_limit = sysctl_sched_latency;
4894 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; 4925 sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2;
4895} 4926}
4896 4927
4897#ifdef CONFIG_SMP 4928#ifdef CONFIG_SMP
@@ -5234,15 +5265,16 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5234static struct ctl_table sd_ctl_dir[] = { 5265static struct ctl_table sd_ctl_dir[] = {
5235 { 5266 {
5236 .procname = "sched_domain", 5267 .procname = "sched_domain",
5237 .mode = 0755, 5268 .mode = 0555,
5238 }, 5269 },
5239 {0,}, 5270 {0,},
5240}; 5271};
5241 5272
5242static struct ctl_table sd_ctl_root[] = { 5273static struct ctl_table sd_ctl_root[] = {
5243 { 5274 {
5275 .ctl_name = CTL_KERN,
5244 .procname = "kernel", 5276 .procname = "kernel",
5245 .mode = 0755, 5277 .mode = 0555,
5246 .child = sd_ctl_dir, 5278 .child = sd_ctl_dir,
5247 }, 5279 },
5248 {0,}, 5280 {0,},
@@ -5318,7 +5350,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5318 for_each_domain(cpu, sd) { 5350 for_each_domain(cpu, sd) {
5319 snprintf(buf, 32, "domain%d", i); 5351 snprintf(buf, 32, "domain%d", i);
5320 entry->procname = kstrdup(buf, GFP_KERNEL); 5352 entry->procname = kstrdup(buf, GFP_KERNEL);
5321 entry->mode = 0755; 5353 entry->mode = 0555;
5322 entry->child = sd_alloc_ctl_domain_table(sd); 5354 entry->child = sd_alloc_ctl_domain_table(sd);
5323 entry++; 5355 entry++;
5324 i++; 5356 i++;
@@ -5338,7 +5370,7 @@ static void init_sched_domain_sysctl(void)
5338 for (i = 0; i < cpu_num; i++, entry++) { 5370 for (i = 0; i < cpu_num; i++, entry++) {
5339 snprintf(buf, 32, "cpu%d", i); 5371 snprintf(buf, 32, "cpu%d", i);
5340 entry->procname = kstrdup(buf, GFP_KERNEL); 5372 entry->procname = kstrdup(buf, GFP_KERNEL);
5341 entry->mode = 0755; 5373 entry->mode = 0555;
5342 entry->child = sd_alloc_ctl_cpu_table(i); 5374 entry->child = sd_alloc_ctl_cpu_table(i);
5343 } 5375 }
5344 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5376 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
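The sched.c hunks above replace the old sched_clock_unstable_event() with a pair of idle hooks: sched_clock_idle_sleep_event() is called when a CPU enters deep idle (irqs disabled), and sched_clock_idle_wakeup_event(delta_ns) when it comes back, advancing the runqueue clock by the PM-reported idle time while discarding whatever the raw sched_clock() did in between. The following user-space model of that bookkeeping is a sketch only, with a fake sched_clock() and a cut-down rq structure rather than the kernel's.

/*
 * User-space model of the rq-clock bookkeeping added in this patch.
 * sched_clock() here is a stand-in returning fake nanoseconds; the two
 * event functions mirror sched_clock_idle_sleep_event() and
 * sched_clock_idle_wakeup_event() from kernel/sched.c.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t fake_ns;			/* pretend TSC-based clock */
static uint64_t sched_clock(void) { return fake_ns; }

struct rq_model {
	uint64_t prev_clock_raw;
	uint64_t clock;				/* what the scheduler sees */
	uint64_t idle_clock;
	unsigned int clock_deep_idle_events;
};

static void idle_sleep_event(struct rq_model *rq)
{
	/* in the kernel: lock the rq, __update_rq_clock(), unlock */
	rq->clock += sched_clock() - rq->prev_clock_raw;
	rq->prev_clock_raw = sched_clock();
	rq->clock_deep_idle_events++;
}

static void idle_wakeup_event(struct rq_model *rq, uint64_t delta_ns)
{
	rq->idle_clock += delta_ns;
	/*
	 * Override the previous timestamp: sched_clock() deltas that
	 * accumulated while idle are ignored, only the PM-provided
	 * delta_ns advances the rq clock.
	 */
	rq->prev_clock_raw = sched_clock();
	rq->clock += delta_ns;
}

int main(void)
{
	struct rq_model rq = { 0 };

	fake_ns = 1000;
	idle_sleep_event(&rq);			/* going deep-idle */
	fake_ns = 9000;				/* raw clock drifted while idle */
	idle_wakeup_event(&rq, 5000);		/* PM says we idled 5 us */
	printf("rq.clock=%llu idle=%llu\n",
	       (unsigned long long)rq.clock,
	       (unsigned long long)rq.idle_clock);
	return 0;
}
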
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 87e524762b85..c3ee38bd3426 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu)
154 P(next_balance); 154 P(next_balance);
155 P(curr->pid); 155 P(curr->pid);
156 P(clock); 156 P(clock);
157 P(idle_clock);
157 P(prev_clock_raw); 158 P(prev_clock_raw);
158 P(clock_warps); 159 P(clock_warps);
159 P(clock_overflows); 160 P(clock_overflows);
160 P(clock_unstable_events); 161 P(clock_deep_idle_events);
161 P(clock_max_delta); 162 P(clock_max_delta);
162 P(cpu_load[0]); 163 P(cpu_load[0]);
163 P(cpu_load[1]); 164 P(cpu_load[1]);
@@ -282,4 +283,5 @@ void proc_sched_set_task(struct task_struct *p)
282 p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; 283 p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
283#endif 284#endif
284 p->se.sum_exec_runtime = 0; 285 p->se.sum_exec_runtime = 0;
286 p->se.prev_sum_exec_runtime = 0;
285} 287}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fedbb51bba96..67c67a87146e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -15,34 +15,50 @@
15 * 15 *
16 * Scaled math optimizations by Thomas Gleixner 16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> 17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 *
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
18 */ 21 */
19 22
20/* 23/*
21 * Preemption granularity: 24 * Targeted preemption latency for CPU-bound tasks:
22 * (default: 2 msec, units: nanoseconds) 25 * (default: 20ms, units: nanoseconds)
23 * 26 *
24 * NOTE: this granularity value is not the same as the concept of 27 * NOTE: this latency value is not the same as the concept of
25 * 'timeslice length' - timeslices in CFS will typically be somewhat 28 * 'timeslice length' - timeslices in CFS are of variable length.
26 * larger than this value. (to see the precise effective timeslice 29 * (to see the precise effective timeslice length of your workload,
27 * length of your workload, run vmstat and monitor the context-switches 30 * run vmstat and monitor the context-switches field)
28 * field)
29 * 31 *
30 * On SMP systems the value of this is multiplied by the log2 of the 32 * On SMP systems the value of this is multiplied by the log2 of the
31 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way 33 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
32 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) 34 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
35 * Targeted preemption latency for CPU-bound tasks:
36 */
37unsigned int sysctl_sched_latency __read_mostly = 20000000ULL;
38
39/*
40 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 2 msec, units: nanoseconds)
33 */ 42 */
34unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; 43unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL;
44
45/*
46 * sys_sched_yield() compat mode
47 *
48 * This option switches the agressive yield implementation of the
49 * old scheduler back on.
50 */
51unsigned int __read_mostly sysctl_sched_compat_yield;
35 52
36/* 53/*
37 * SCHED_BATCH wake-up granularity. 54 * SCHED_BATCH wake-up granularity.
38 * (default: 10 msec, units: nanoseconds) 55 * (default: 25 msec, units: nanoseconds)
39 * 56 *
40 * This option delays the preemption effects of decoupled workloads 57 * This option delays the preemption effects of decoupled workloads
41 * and reduces their over-scheduling. Synchronous workloads will still 58 * and reduces their over-scheduling. Synchronous workloads will still
42 * have immediate wakeup/sleep latencies. 59 * have immediate wakeup/sleep latencies.
43 */ 60 */
44unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 61unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL;
45 10000000000ULL/HZ;
46 62
47/* 63/*
48 * SCHED_OTHER wake-up granularity. 64 * SCHED_OTHER wake-up granularity.
@@ -52,12 +68,12 @@ unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
52 * and reduces their over-scheduling. Synchronous workloads will still 68 * and reduces their over-scheduling. Synchronous workloads will still
53 * have immediate wakeup/sleep latencies. 69 * have immediate wakeup/sleep latencies.
54 */ 70 */
55unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; 71unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL;
56 72
57unsigned int sysctl_sched_stat_granularity __read_mostly; 73unsigned int sysctl_sched_stat_granularity __read_mostly;
58 74
59/* 75/*
60 * Initialized in sched_init_granularity(): 76 * Initialized in sched_init_granularity() [to 5 times the base granularity]:
61 */ 77 */
62unsigned int sysctl_sched_runtime_limit __read_mostly; 78unsigned int sysctl_sched_runtime_limit __read_mostly;
63 79
@@ -186,6 +202,8 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
186 update_load_add(&cfs_rq->load, se->load.weight); 202 update_load_add(&cfs_rq->load, se->load.weight);
187 cfs_rq->nr_running++; 203 cfs_rq->nr_running++;
188 se->on_rq = 1; 204 se->on_rq = 1;
205
206 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
189} 207}
190 208
191static inline void 209static inline void
@@ -197,6 +215,8 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
197 update_load_sub(&cfs_rq->load, se->load.weight); 215 update_load_sub(&cfs_rq->load, se->load.weight);
198 cfs_rq->nr_running--; 216 cfs_rq->nr_running--;
199 se->on_rq = 0; 217 se->on_rq = 0;
218
219 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
200} 220}
201 221
202static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) 222static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
@@ -214,6 +234,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
214 */ 234 */
215 235
216/* 236/*
237 * Calculate the preemption granularity needed to schedule every
238 * runnable task once per sysctl_sched_latency amount of time.
239 * (down to a sensible low limit on granularity)
240 *
241 * For example, if there are 2 tasks running and latency is 10 msecs,
242 * we switch tasks every 5 msecs. If we have 3 tasks running, we have
243 * to switch tasks every 3.33 msecs to get a 10 msecs observed latency
244 * for each task. We do finer and finer scheduling up to until we
245 * reach the minimum granularity value.
246 *
247 * To achieve this we use the following dynamic-granularity rule:
248 *
249 * gran = lat/nr - lat/nr/nr
250 *
251 * This comes out of the following equations:
252 *
253 * kA1 + gran = kB1
254 * kB2 + gran = kA2
255 * kA2 = kA1
256 * kB2 = kB1 - d + d/nr
257 * lat = d * nr
258 *
259 * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running),
260 * '1' is start of time, '2' is end of time, 'd' is delay between
261 * 1 and 2 (during which task B was running), 'nr' is number of tasks
262 * running, 'lat' is the the period of each task. ('lat' is the
263 * sched_latency that we aim for.)
264 */
265static long
266sched_granularity(struct cfs_rq *cfs_rq)
267{
268 unsigned int gran = sysctl_sched_latency;
269 unsigned int nr = cfs_rq->nr_running;
270
271 if (nr > 1) {
272 gran = gran/nr - gran/nr/nr;
273 gran = max(gran, sysctl_sched_min_granularity);
274 }
275
276 return gran;
277}
278
279/*
217 * We rescale the rescheduling granularity of tasks according to their 280 * We rescale the rescheduling granularity of tasks according to their
218 * nice level, but only linearly, not exponentially: 281 * nice level, but only linearly, not exponentially:
219 */ 282 */
@@ -240,7 +303,7 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity)
240 /* 303 /*
241 * It will always fit into 'long': 304 * It will always fit into 'long':
242 */ 305 */
243 return (long) (tmp >> WMULT_SHIFT); 306 return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT));
244} 307}
245 308
246static inline void 309static inline void
@@ -303,10 +366,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr)
303 delta_fair = calc_delta_fair(delta_exec, lw); 366 delta_fair = calc_delta_fair(delta_exec, lw);
304 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); 367 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
305 368
306 if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { 369 if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) {
307 delta = min(cfs_rq->sleeper_bonus, (u64)delta_exec); 370 delta = min((u64)delta_mine, cfs_rq->sleeper_bonus);
308 delta = calc_delta_mine(delta, curr->load.weight, lw); 371 delta = min(delta, (unsigned long)(
309 delta = min((u64)delta, cfs_rq->sleeper_bonus); 372 (long)sysctl_sched_runtime_limit - curr->wait_runtime));
310 cfs_rq->sleeper_bonus -= delta; 373 cfs_rq->sleeper_bonus -= delta;
311 delta_mine -= delta; 374 delta_mine -= delta;
312 } 375 }
@@ -438,6 +501,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
438{ 501{
439 unsigned long delta_fair; 502 unsigned long delta_fair;
440 503
504 if (unlikely(!se->wait_start_fair))
505 return;
506
441 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), 507 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
442 (u64)(cfs_rq->fair_clock - se->wait_start_fair)); 508 (u64)(cfs_rq->fair_clock - se->wait_start_fair));
443 509
@@ -494,6 +560,13 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
494 unsigned long load = cfs_rq->load.weight, delta_fair; 560 unsigned long load = cfs_rq->load.weight, delta_fair;
495 long prev_runtime; 561 long prev_runtime;
496 562
563 /*
564 * Do not boost sleepers if there's too much bonus 'in flight'
565 * already:
566 */
567 if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
568 return;
569
497 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) 570 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
498 load = rq_of(cfs_rq)->cpu_load[2]; 571 load = rq_of(cfs_rq)->cpu_load[2];
499 572
@@ -519,10 +592,6 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
519 * Track the amount of bonus we've given to sleepers: 592 * Track the amount of bonus we've given to sleepers:
520 */ 593 */
521 cfs_rq->sleeper_bonus += delta_fair; 594 cfs_rq->sleeper_bonus += delta_fair;
522 if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
523 cfs_rq->sleeper_bonus = sysctl_sched_runtime_limit;
524
525 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
526} 595}
527 596
528static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 597static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -570,6 +639,16 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
570 639
571 se->block_start = 0; 640 se->block_start = 0;
572 se->sum_sleep_runtime += delta; 641 se->sum_sleep_runtime += delta;
642
643 /*
644 * Blocking time is in units of nanosecs, so shift by 20 to
645 * get a milliseconds-range estimation of the amount of
646 * time that the task spent sleeping:
647 */
648 if (unlikely(prof_on == SLEEP_PROFILING)) {
649 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
650 delta >> 20);
651 }
573 } 652 }
574#endif 653#endif
575} 654}
@@ -604,7 +683,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
604 if (tsk->state & TASK_UNINTERRUPTIBLE) 683 if (tsk->state & TASK_UNINTERRUPTIBLE)
605 se->block_start = rq_of(cfs_rq)->clock; 684 se->block_start = rq_of(cfs_rq)->clock;
606 } 685 }
607 cfs_rq->wait_runtime -= se->wait_runtime;
608#endif 686#endif
609 } 687 }
610 __dequeue_entity(cfs_rq, se); 688 __dequeue_entity(cfs_rq, se);
@@ -618,11 +696,31 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
618 struct sched_entity *curr, unsigned long granularity) 696 struct sched_entity *curr, unsigned long granularity)
619{ 697{
620 s64 __delta = curr->fair_key - se->fair_key; 698 s64 __delta = curr->fair_key - se->fair_key;
699 unsigned long ideal_runtime, delta_exec;
700
701 /*
702 * ideal_runtime is compared against sum_exec_runtime, which is
703 * walltime, hence do not scale.
704 */
705 ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running,
706 (unsigned long)sysctl_sched_min_granularity);
707
708 /*
709 * If we executed more than what the latency constraint suggests,
710 * reduce the rescheduling granularity. This way the total latency
711 * of how much a task is not scheduled converges to
712 * sysctl_sched_latency:
713 */
714 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
715 if (delta_exec > ideal_runtime)
716 granularity = 0;
621 717
622 /* 718 /*
623 * Take scheduling granularity into account - do not 719 * Take scheduling granularity into account - do not
624 * preempt the current task unless the best task has 720 * preempt the current task unless the best task has
625 * a larger than sched_granularity fairness advantage: 721 * a larger than sched_granularity fairness advantage:
722 *
723 * scale granularity as key space is in fair_clock.
626 */ 724 */
627 if (__delta > niced_granularity(curr, granularity)) 725 if (__delta > niced_granularity(curr, granularity))
628 resched_task(rq_of(cfs_rq)->curr); 726 resched_task(rq_of(cfs_rq)->curr);
@@ -641,6 +739,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
641 update_stats_wait_end(cfs_rq, se); 739 update_stats_wait_end(cfs_rq, se);
642 update_stats_curr_start(cfs_rq, se); 740 update_stats_curr_start(cfs_rq, se);
643 set_cfs_rq_curr(cfs_rq, se); 741 set_cfs_rq_curr(cfs_rq, se);
742 se->prev_sum_exec_runtime = se->sum_exec_runtime;
644} 743}
645 744
646static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 745static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
@@ -686,7 +785,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
686 if (next == curr) 785 if (next == curr)
687 return; 786 return;
688 787
689 __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); 788 __check_preempt_curr_fair(cfs_rq, next, curr,
789 sched_granularity(cfs_rq));
690} 790}
691 791
692/************************************************** 792/**************************************************
@@ -815,19 +915,62 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
815} 915}
816 916
817/* 917/*
818 * sched_yield() support is very simple - we dequeue and enqueue 918 * sched_yield() support is very simple - we dequeue and enqueue.
919 *
920 * If compat_yield is turned on then we requeue to the end of the tree.
819 */ 921 */
820static void yield_task_fair(struct rq *rq, struct task_struct *p) 922static void yield_task_fair(struct rq *rq, struct task_struct *p)
821{ 923{
822 struct cfs_rq *cfs_rq = task_cfs_rq(p); 924 struct cfs_rq *cfs_rq = task_cfs_rq(p);
925 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
926 struct sched_entity *rightmost, *se = &p->se;
927 struct rb_node *parent;
823 928
824 __update_rq_clock(rq);
825 /* 929 /*
826 * Dequeue and enqueue the task to update its 930 * Are we the only task in the tree?
827 * position within the tree: 931 */
932 if (unlikely(cfs_rq->nr_running == 1))
933 return;
934
935 if (likely(!sysctl_sched_compat_yield)) {
936 __update_rq_clock(rq);
937 /*
938 * Dequeue and enqueue the task to update its
939 * position within the tree:
940 */
941 dequeue_entity(cfs_rq, &p->se, 0);
942 enqueue_entity(cfs_rq, &p->se, 0);
943
944 return;
945 }
946 /*
947 * Find the rightmost entry in the rbtree:
948 */
949 do {
950 parent = *link;
951 link = &parent->rb_right;
952 } while (*link);
953
954 rightmost = rb_entry(parent, struct sched_entity, run_node);
955 /*
956 * Already in the rightmost position?
957 */
958 if (unlikely(rightmost == se))
959 return;
960
961 /*
962 * Minimally necessary key value to be last in the tree:
828 */ 963 */
829 dequeue_entity(cfs_rq, &p->se, 0); 964 se->fair_key = rightmost->fair_key + 1;
830 enqueue_entity(cfs_rq, &p->se, 0); 965
966 if (cfs_rq->rb_leftmost == &se->run_node)
967 cfs_rq->rb_leftmost = rb_next(&se->run_node);
968 /*
969 * Relink the task to the rightmost position:
970 */
971 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
972 rb_link_node(&se->run_node, parent, link);
973 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
831} 974}
832 975
833/* 976/*
@@ -1020,31 +1163,32 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1020static void task_new_fair(struct rq *rq, struct task_struct *p) 1163static void task_new_fair(struct rq *rq, struct task_struct *p)
1021{ 1164{
1022 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1165 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1023 struct sched_entity *se = &p->se; 1166 struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq);
1024 1167
1025 sched_info_queued(p); 1168 sched_info_queued(p);
1026 1169
1170 update_curr(cfs_rq);
1027 update_stats_enqueue(cfs_rq, se); 1171 update_stats_enqueue(cfs_rq, se);
1028 /* 1172 /*
1029 * Child runs first: we let it run before the parent 1173 * Child runs first: we let it run before the parent
1030 * until it reschedules once. We set up the key so that 1174 * until it reschedules once. We set up the key so that
1031 * it will preempt the parent: 1175 * it will preempt the parent:
1032 */ 1176 */
1033 p->se.fair_key = current->se.fair_key - 1177 se->fair_key = curr->fair_key -
1034 niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; 1178 niced_granularity(curr, sched_granularity(cfs_rq)) - 1;
1035 /* 1179 /*
1036 * The first wait is dominated by the child-runs-first logic, 1180 * The first wait is dominated by the child-runs-first logic,
1037 * so do not credit it with that waiting time yet: 1181 * so do not credit it with that waiting time yet:
1038 */ 1182 */
1039 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) 1183 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
1040 p->se.wait_start_fair = 0; 1184 se->wait_start_fair = 0;
1041 1185
1042 /* 1186 /*
1043 * The statistical average of wait_runtime is about 1187 * The statistical average of wait_runtime is about
1044 * -granularity/2, so initialize the task with that: 1188 * -granularity/2, so initialize the task with that:
1045 */ 1189 */
1046 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) 1190 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
1047 p->se.wait_runtime = -(sysctl_sched_granularity / 2); 1191 se->wait_runtime = -(sched_granularity(cfs_rq) / 2);
1048 1192
1049 __enqueue_entity(cfs_rq, se); 1193 __enqueue_entity(cfs_rq, se);
1050} 1194}
@@ -1057,7 +1201,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1057 */ 1201 */
1058static void set_curr_task_fair(struct rq *rq) 1202static void set_curr_task_fair(struct rq *rq)
1059{ 1203{
1060 struct sched_entity *se = &rq->curr.se; 1204 struct sched_entity *se = &rq->curr->se;
1061 1205
1062 for_each_sched_entity(se) 1206 for_each_sched_entity(se)
1063 set_next_entity(cfs_rq_of(se), se); 1207 set_next_entity(cfs_rq_of(se), se);
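The dynamic-granularity rule documented above, gran = lat/nr - lat/nr/nr, is easy to sanity-check with the new defaults (sysctl_sched_latency = 20 ms, sysctl_sched_min_granularity = 2 ms). The sketch below mirrors sched_granularity() in user space and prints the effective granularity for a few runnable-task counts; once the computed value drops below the 2 ms floor (from about nine runnable tasks up, with these defaults), the minimum granularity takes over and the observed latency stretches beyond sysctl_sched_latency.

/*
 * Worked example of the dynamic-granularity rule:
 *
 *	gran = lat/nr - lat/nr/nr
 *
 * using the defaults introduced by this patch. The integer math mirrors
 * sched_granularity() in kernel/sched_fair.c.
 */
#include <stdio.h>

static unsigned int sysctl_sched_latency         = 20000000;	/* 20 ms */
static unsigned int sysctl_sched_min_granularity =  2000000;	/*  2 ms */

static unsigned int sched_granularity(unsigned int nr_running)
{
	unsigned int gran = sysctl_sched_latency;
	unsigned int nr = nr_running;

	if (nr > 1) {
		gran = gran/nr - gran/nr/nr;
		if (gran < sysctl_sched_min_granularity)
			gran = sysctl_sched_min_granularity;
	}
	return gran;
}

int main(void)
{
	unsigned int nr;

	for (nr = 1; nr <= 16; nr *= 2)
		printf("nr_running=%2u -> gran=%8u ns\n",
		       nr, sched_granularity(nr));
	return 0;
}
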
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index dcdcad632fd9..4b87476a02d0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -207,10 +207,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
207 return; 207 return;
208 208
209 p->time_slice = static_prio_timeslice(p->static_prio); 209 p->time_slice = static_prio_timeslice(p->static_prio);
210 set_tsk_need_resched(p);
211 210
212 /* put it at the end of the queue: */ 211 /*
213 requeue_task_rt(rq, p); 212 * Requeue to the end of queue if we are not the only element
213 * on the queue:
214 */
215 if (p->run_list.prev != p->run_list.next) {
216 requeue_task_rt(rq, p);
217 set_tsk_need_resched(p);
218 }
214} 219}
215 220
216static struct sched_class rt_sched_class __read_mostly = { 221static struct sched_class rt_sched_class __read_mostly = {
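The new test in task_tick_rt(), p->run_list.prev != p->run_list.next, is a cheap way of asking "is this task the only one queued at its priority?": on the kernel's circular doubly-linked lists, the sole element of a list has both of its pointers aimed at the list head, so prev == next exactly when requeueing would be a no-op. A self-contained illustration with a simplified list_head (not the kernel's <linux/list.h>):

/*
 * Why "p->run_list.prev != p->run_list.next" means "not alone on the
 * queue". Simplified re-implementation of list_head semantics, for
 * illustration only.
 */
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

int main(void)
{
	struct list_head queue, a, b;

	INIT_LIST_HEAD(&queue);

	list_add_tail(&a, &queue);
	printf("one task : prev %s next\n",
	       a.prev != a.next ? "!=" : "==");	/* == : don't requeue */

	list_add_tail(&b, &queue);
	printf("two tasks: prev %s next\n",
	       a.prev != a.next ? "!=" : "==");	/* != : requeue + resched */
	return 0;
}
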
diff --git a/kernel/signal.c b/kernel/signal.c
index ad63109e413c..792952381092 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -378,8 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
378 /* We only dequeue private signals from ourselves, we don't let 378 /* We only dequeue private signals from ourselves, we don't let
379 * signalfd steal them 379 * signalfd steal them
380 */ 380 */
381 if (likely(tsk == current)) 381 signr = __dequeue_signal(&tsk->pending, mask, info);
382 signr = __dequeue_signal(&tsk->pending, mask, info);
383 if (!signr) { 382 if (!signr) {
384 signr = __dequeue_signal(&tsk->signal->shared_pending, 383 signr = __dequeue_signal(&tsk->signal->shared_pending,
385 mask, info); 384 mask, info);
@@ -407,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
407 } 406 }
408 } 407 }
409 } 408 }
410 if (likely(tsk == current)) 409 recalc_sigpending();
411 recalc_sigpending();
412 if (signr && unlikely(sig_kernel_stop(signr))) { 410 if (signr && unlikely(sig_kernel_stop(signr))) {
413 /* 411 /*
414 * Set a marker that we have dequeued a stop signal. Our 412 * Set a marker that we have dequeued a stop signal. Our
@@ -425,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
425 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 423 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
426 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 424 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
427 } 425 }
428 if (signr && likely(tsk == current) && 426 if (signr &&
429 ((info->si_code & __SI_MASK) == __SI_TIMER) && 427 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
430 info->si_sys_private){ 428 info->si_sys_private){
431 /* 429 /*
@@ -533,18 +531,18 @@ static int check_kill_permission(int sig, struct siginfo *info,
533 if (!valid_signal(sig)) 531 if (!valid_signal(sig))
534 return error; 532 return error;
535 533
536 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 534 if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) {
537 if (error) 535 error = audit_signal_info(sig, t); /* Let audit system see the signal */
538 return error; 536 if (error)
539 537 return error;
540 error = -EPERM; 538 error = -EPERM;
541 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) 539 if (((sig != SIGCONT) ||
542 && ((sig != SIGCONT) || 540 (process_session(current) != process_session(t)))
543 (process_session(current) != process_session(t))) 541 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
544 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 542 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
545 && (current->uid ^ t->suid) && (current->uid ^ t->uid) 543 && !capable(CAP_KILL))
546 && !capable(CAP_KILL))
547 return error; 544 return error;
545 }
548 546
549 return security_task_kill(t, info, sig, 0); 547 return security_task_kill(t, info, sig, 0);
550} 548}
@@ -1300,20 +1298,19 @@ struct sigqueue *sigqueue_alloc(void)
1300void sigqueue_free(struct sigqueue *q) 1298void sigqueue_free(struct sigqueue *q)
1301{ 1299{
1302 unsigned long flags; 1300 unsigned long flags;
1301 spinlock_t *lock = &current->sighand->siglock;
1302
1303 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1303 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1304 /* 1304 /*
1305 * If the signal is still pending remove it from the 1305 * If the signal is still pending remove it from the
1306 * pending queue. 1306 * pending queue. We must hold ->siglock while testing
1307 * q->list to serialize with collect_signal().
1307 */ 1308 */
1308 if (unlikely(!list_empty(&q->list))) { 1309 spin_lock_irqsave(lock, flags);
1309 spinlock_t *lock = &current->sighand->siglock; 1310 if (!list_empty(&q->list))
1310 read_lock(&tasklist_lock); 1311 list_del_init(&q->list);
1311 spin_lock_irqsave(lock, flags); 1312 spin_unlock_irqrestore(lock, flags);
1312 if (!list_empty(&q->list)) 1313
1313 list_del_init(&q->list);
1314 spin_unlock_irqrestore(lock, flags);
1315 read_unlock(&tasklist_lock);
1316 }
1317 q->flags &= ~SIGQUEUE_PREALLOC; 1314 q->flags &= ~SIGQUEUE_PREALLOC;
1318 __sigqueue_free(q); 1315 __sigqueue_free(q);
1319} 1316}
diff --git a/kernel/sys.c b/kernel/sys.c
index 449b81b98b3d..8ae2e636eb1b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -32,6 +32,7 @@
32#include <linux/getcpu.h> 32#include <linux/getcpu.h>
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h>
35 36
36#include <linux/compat.h> 37#include <linux/compat.h>
37#include <linux/syscalls.h> 38#include <linux/syscalls.h>
@@ -878,6 +879,7 @@ void kernel_power_off(void)
878 kernel_shutdown_prepare(SYSTEM_POWER_OFF); 879 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
879 if (pm_power_off_prepare) 880 if (pm_power_off_prepare)
880 pm_power_off_prepare(); 881 pm_power_off_prepare();
882 disable_nonboot_cpus();
881 sysdev_shutdown(); 883 sysdev_shutdown();
882 printk(KERN_EMERG "Power down.\n"); 884 printk(KERN_EMERG "Power down.\n");
883 machine_power_off(); 885 machine_power_off();
@@ -1442,7 +1444,6 @@ asmlinkage long sys_times(struct tms __user * tbuf)
1442 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. 1444 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
1443 * LBT 04.03.94 1445 * LBT 04.03.94
1444 */ 1446 */
1445
1446asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) 1447asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1447{ 1448{
1448 struct task_struct *p; 1449 struct task_struct *p;
@@ -1470,7 +1471,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1470 if (!thread_group_leader(p)) 1471 if (!thread_group_leader(p))
1471 goto out; 1472 goto out;
1472 1473
1473 if (p->real_parent == group_leader) { 1474 if (p->real_parent->tgid == group_leader->tgid) {
1474 err = -EPERM; 1475 err = -EPERM;
1475 if (task_session(p) != task_session(group_leader)) 1476 if (task_session(p) != task_session(group_leader))
1476 goto out; 1477 goto out;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9029690f4fae..53a456ebf6d5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -222,8 +222,19 @@ static ctl_table kern_table[] = {
222#ifdef CONFIG_SCHED_DEBUG 222#ifdef CONFIG_SCHED_DEBUG
223 { 223 {
224 .ctl_name = CTL_UNNUMBERED, 224 .ctl_name = CTL_UNNUMBERED,
225 .procname = "sched_granularity_ns", 225 .procname = "sched_min_granularity_ns",
226 .data = &sysctl_sched_granularity, 226 .data = &sysctl_sched_min_granularity,
227 .maxlen = sizeof(unsigned int),
228 .mode = 0644,
229 .proc_handler = &proc_dointvec_minmax,
230 .strategy = &sysctl_intvec,
231 .extra1 = &min_sched_granularity_ns,
232 .extra2 = &max_sched_granularity_ns,
233 },
234 {
235 .ctl_name = CTL_UNNUMBERED,
236 .procname = "sched_latency_ns",
237 .data = &sysctl_sched_latency,
227 .maxlen = sizeof(unsigned int), 238 .maxlen = sizeof(unsigned int),
228 .mode = 0644, 239 .mode = 0644,
229 .proc_handler = &proc_dointvec_minmax, 240 .proc_handler = &proc_dointvec_minmax,
@@ -283,6 +294,23 @@ static ctl_table kern_table[] = {
283 .mode = 0644, 294 .mode = 0644,
284 .proc_handler = &proc_dointvec, 295 .proc_handler = &proc_dointvec,
285 }, 296 },
297 {
298 .ctl_name = CTL_UNNUMBERED,
299 .procname = "sched_features",
300 .data = &sysctl_sched_features,
301 .maxlen = sizeof(unsigned int),
302 .mode = 0644,
303 .proc_handler = &proc_dointvec,
304 },
305#endif
306 {
307 .ctl_name = CTL_UNNUMBERED,
308 .procname = "sched_compat_yield",
309 .data = &sysctl_sched_compat_yield,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = &proc_dointvec,
313 },
286#ifdef CONFIG_PROVE_LOCKING 314#ifdef CONFIG_PROVE_LOCKING
287 { 315 {
288 .ctl_name = CTL_UNNUMBERED, 316 .ctl_name = CTL_UNNUMBERED,
@@ -304,15 +332,6 @@ static ctl_table kern_table[] = {
304 }, 332 },
305#endif 333#endif
306 { 334 {
307 .ctl_name = CTL_UNNUMBERED,
308 .procname = "sched_features",
309 .data = &sysctl_sched_features,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = &proc_dointvec,
313 },
314#endif
315 {
316 .ctl_name = KERN_PANIC, 335 .ctl_name = KERN_PANIC,
317 .procname = "panic", 336 .procname = "panic",
318 .data = &panic_timeout, 337 .data = &panic_timeout,
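After this sysctl.c change the renamed CFS tunables live under /proc/sys/kernel: sched_min_granularity_ns, sched_latency_ns and sched_features require CONFIG_SCHED_DEBUG, while the new sched_compat_yield entry sits outside that #ifdef and should always be present on kernels carrying this patch. A small reader for those files follows; whether each file exists, and its value, depends on the running kernel.

/*
 * Print the CFS tunables exposed by this hunk. Writing to them (as
 * root) works the same way, e.g. "1" into sched_compat_yield to get
 * the old aggressive sys_sched_yield() behaviour back.
 */
#include <stdio.h>

static void show(const char *name)
{
	char path[128], buf[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	f = fopen(path, "r");
	if (!f) {
		printf("%-28s <not available on this kernel>\n", name);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%-28s %s", name, buf);
	fclose(f);
}

int main(void)
{
	show("sched_latency_ns");
	show("sched_min_granularity_ns");
	show("sched_features");
	show("sched_compat_yield");
	return 0;
}
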
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index cd91237dbfe3..de6a2d6b3ebb 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -226,7 +226,7 @@ static void sync_cmos_clock(unsigned long dummy)
226 226
227static void notify_cmos_timer(void) 227static void notify_cmos_timer(void)
228{ 228{
229 if (no_sync_cmos_clock) 229 if (!no_sync_cmos_clock)
230 mod_timer(&sync_cmos_timer, jiffies + 1); 230 mod_timer(&sync_cmos_timer, jiffies + 1);
231} 231}
232 232
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index db8e0f3d409b..0962e0577660 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -383,11 +383,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
383int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 383int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
384{ 384{
385 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 385 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
386 386 return 0;
387 if(!cpus_empty(tick_broadcast_oneshot_mask))
388 tick_broadcast_set_event(ktime_get(), 1);
389
390 return cpu_isset(smp_processor_id(), tick_broadcast_oneshot_mask);
391} 387}
392 388
393/* 389/*
@@ -549,20 +545,17 @@ void tick_broadcast_switch_to_oneshot(void)
549 */ 545 */
550void tick_shutdown_broadcast_oneshot(unsigned int *cpup) 546void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
551{ 547{
552 struct clock_event_device *bc;
553 unsigned long flags; 548 unsigned long flags;
554 unsigned int cpu = *cpup; 549 unsigned int cpu = *cpup;
555 550
556 spin_lock_irqsave(&tick_broadcast_lock, flags); 551 spin_lock_irqsave(&tick_broadcast_lock, flags);
557 552
558 bc = tick_broadcast_device.evtdev; 553 /*
554 * Clear the broadcast mask flag for the dead cpu, but do not
555 * stop the broadcast device!
556 */
559 cpu_clear(cpu, tick_broadcast_oneshot_mask); 557 cpu_clear(cpu, tick_broadcast_oneshot_mask);
560 558
561 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) {
562 if (bc && cpus_empty(tick_broadcast_oneshot_mask))
563 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
564 }
565
566 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 559 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
567} 560}
568 561
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b416995b9757..8c3fef1db09c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -160,6 +160,18 @@ void tick_nohz_stop_sched_tick(void)
160 cpu = smp_processor_id(); 160 cpu = smp_processor_id();
161 ts = &per_cpu(tick_cpu_sched, cpu); 161 ts = &per_cpu(tick_cpu_sched, cpu);
162 162
163 /*
164 * If this cpu is offline and it is the one which updates
165 * jiffies, then give up the assignment and let it be taken by
166 * the cpu which runs the tick timer next. If we don't drop
167 * this here the jiffies might be stale and do_timer() never
168 * invoked.
169 */
170 if (unlikely(!cpu_online(cpu))) {
171 if (cpu == tick_do_timer_cpu)
172 tick_do_timer_cpu = -1;
173 }
174
163 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 175 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
164 goto end; 176 goto end;
165 177
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index acc417b5a9b7..4ad79f6bdec6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -217,6 +217,7 @@ static void change_clocksource(void)
217} 217}
218#else 218#else
219static inline void change_clocksource(void) { } 219static inline void change_clocksource(void) { }
220static inline s64 __get_nsec_offset(void) { return 0; }
220#endif 221#endif
221 222
222/** 223/**
@@ -280,6 +281,8 @@ void __init timekeeping_init(void)
280static int timekeeping_suspended; 281static int timekeeping_suspended;
281/* time in seconds when suspend began */ 282/* time in seconds when suspend began */
282static unsigned long timekeeping_suspend_time; 283static unsigned long timekeeping_suspend_time;
284/* xtime offset when we went into suspend */
285static s64 timekeeping_suspend_nsecs;
283 286
284/** 287/**
285 * timekeeping_resume - Resumes the generic timekeeping subsystem. 288 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -305,6 +308,8 @@ static int timekeeping_resume(struct sys_device *dev)
305 wall_to_monotonic.tv_sec -= sleep_length; 308 wall_to_monotonic.tv_sec -= sleep_length;
306 total_sleep_time += sleep_length; 309 total_sleep_time += sleep_length;
307 } 310 }
311 /* Make sure that we have the correct xtime reference */
312 timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
308 /* re-base the last cycle value */ 313 /* re-base the last cycle value */
309 clock->cycle_last = clocksource_read(clock); 314 clock->cycle_last = clocksource_read(clock);
310 clock->error = 0; 315 clock->error = 0;
@@ -325,9 +330,12 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
325{ 330{
326 unsigned long flags; 331 unsigned long flags;
327 332
333 timekeeping_suspend_time = read_persistent_clock();
334
328 write_seqlock_irqsave(&xtime_lock, flags); 335 write_seqlock_irqsave(&xtime_lock, flags);
336 /* Get the current xtime offset */
337 timekeeping_suspend_nsecs = __get_nsec_offset();
329 timekeeping_suspended = 1; 338 timekeeping_suspended = 1;
330 timekeeping_suspend_time = read_persistent_clock();
331 write_sequnlock_irqrestore(&xtime_lock, flags); 339 write_sequnlock_irqrestore(&xtime_lock, flags);
332 340
333 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 341 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
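The timekeeping change records the not-yet-accumulated nanosecond offset at suspend (timekeeping_suspend_nsecs = __get_nsec_offset() under xtime_lock) and folds it back into xtime on resume via timespec_add_ns(), so the fraction of a second read just before suspend is not lost. A minimal userspace sketch of that fold-and-carry step, assuming the usual 10^9 ns per second normalization (this is not the kernel's timespec_add_ns(); the numbers are made up):

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000LL

/* Fold a nanosecond delta into a timespec, carrying any overflow of the
 * nanosecond field into the seconds field. */
static void ts_add_ns(struct timespec *ts, long long ns)
{
	ns += ts->tv_nsec;
	ts->tv_sec += ns / NSEC_PER_SEC;
	ts->tv_nsec = ns % NSEC_PER_SEC;
}

int main(void)
{
	/* Pretend xtime read 100.900000000 s at suspend and another 250 ms
	 * of clocksource delta had not yet been folded in. */
	struct timespec xtime = { .tv_sec = 100, .tv_nsec = 900000000 };
	long long suspend_nsecs = 250000000;

	ts_add_ns(&xtime, suspend_nsecs);
	printf("resumed xtime: %lld.%09ld\n",
	       (long long)xtime.tv_sec, xtime.tv_nsec);
	return 0;
}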
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 3c38fb5eae1b..c36bb7ed0301 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -327,8 +327,9 @@ static int tstats_show(struct seq_file *m, void *v)
327 ms = 1; 327 ms = 1;
328 328
329 if (events && period.tv_sec) 329 if (events && period.tv_sec)
330 seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, 330 seq_printf(m, "%ld total events, %ld.%03ld events/sec\n",
331 events / period.tv_sec, events * 1000 / ms); 331 events, events * 1000 / ms,
332 (events * 1000000 / ms) % 1000);
332 else 333 else
333 seq_printf(m, "%ld total events\n", events); 334 seq_printf(m, "%ld total events\n", events);
334 335
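The tstats_show() fix prints the event rate with three fractional digits using integer arithmetic only: events * 1000 / ms yields the integer part of events per second, and (events * 1000000 / ms) % 1000 the thousandths. A standalone sketch of the same formatting with made-up sample values:

#include <stdio.h>

int main(void)
{
	/* made-up sample: 12345 timer events over a 7-second window */
	long long events = 12345;
	long long ms = 7000;		/* measurement period in milliseconds */

	/* integer part and three fractional digits of events/sec,
	 * computed the same way as the patched seq_printf() call */
	printf("%lld total events, %lld.%03lld events/sec\n",
	       events, events * 1000 / ms, (events * 1000000 / ms) % 1000);
	return 0;
}

For these values the output is "12345 total events, 1763.571 events/sec", matching 12345 / 7.0 to millesimal precision without any floating point.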
diff --git a/kernel/user.c b/kernel/user.c
index e7d11cef6998..9ca2848fc356 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -55,25 +55,22 @@ struct user_struct root_user = {
55/* 55/*
56 * These routines must be called with the uidhash spinlock held! 56 * These routines must be called with the uidhash spinlock held!
57 */ 57 */
58static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) 58static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
59{ 59{
60 list_add(&up->uidhash_list, hashent); 60 hlist_add_head(&up->uidhash_node, hashent);
61} 61}
62 62
63static inline void uid_hash_remove(struct user_struct *up) 63static inline void uid_hash_remove(struct user_struct *up)
64{ 64{
65 list_del(&up->uidhash_list); 65 hlist_del_init(&up->uidhash_node);
66} 66}
67 67
68static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) 68static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
69{ 69{
70 struct list_head *up; 70 struct user_struct *user;
71 71 struct hlist_node *h;
72 list_for_each(up, hashent) {
73 struct user_struct *user;
74
75 user = list_entry(up, struct user_struct, uidhash_list);
76 72
73 hlist_for_each_entry(user, h, hashent, uidhash_node) {
77 if(user->uid == uid) { 74 if(user->uid == uid) {
78 atomic_inc(&user->__count); 75 atomic_inc(&user->__count);
79 return user; 76 return user;
@@ -122,7 +119,7 @@ void free_uid(struct user_struct *up)
122 119
123struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 120struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
124{ 121{
125 struct list_head *hashent = uidhashentry(ns, uid); 122 struct hlist_head *hashent = uidhashentry(ns, uid);
126 struct user_struct *up; 123 struct user_struct *up;
127 124
128 spin_lock_irq(&uidhash_lock); 125 spin_lock_irq(&uidhash_lock);
@@ -202,6 +199,30 @@ void switch_uid(struct user_struct *new_user)
202 suid_keys(current); 199 suid_keys(current);
203} 200}
204 201
202void release_uids(struct user_namespace *ns)
203{
204 int i;
205 unsigned long flags;
206 struct hlist_head *head;
207 struct hlist_node *nd;
208
209 spin_lock_irqsave(&uidhash_lock, flags);
210 /*
211 * collapse the chains so that the user_struct-s will
212 * be still alive, but not in hashes. subsequent free_uid()
213 * will free them.
214 */
215 for (i = 0; i < UIDHASH_SZ; i++) {
216 head = ns->uidhash_table + i;
217 while (!hlist_empty(head)) {
218 nd = head->first;
219 hlist_del_init(nd);
220 }
221 }
222 spin_unlock_irqrestore(&uidhash_lock, flags);
223
224 free_uid(ns->root_user);
225}
205 226
206static int __init uid_cache_init(void) 227static int __init uid_cache_init(void)
207{ 228{
@@ -211,7 +232,7 @@ static int __init uid_cache_init(void)
211 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 232 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
212 233
213 for(n = 0; n < UIDHASH_SZ; ++n) 234 for(n = 0; n < UIDHASH_SZ; ++n)
214 INIT_LIST_HEAD(init_user_ns.uidhash_table + n); 235 INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
215 236
216 /* Insert the root user immediately (init already runs as root) */ 237 /* Insert the root user immediately (init already runs as root) */
217 spin_lock_irq(&uidhash_lock); 238 spin_lock_irq(&uidhash_lock);
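The uid cache conversion above replaces list_head buckets with hlist_head: each bucket stores a single first pointer instead of two, and each node keeps next plus a pprev back-pointer to whatever points at it, so unlinking remains O(1) without a doubly linked ring. A stripped-down userspace illustration of that structure (a simplified stand-in, not the kernel's <linux/list.h> implementation):

#include <stdio.h>

struct hnode {
	struct hnode *next;
	struct hnode **pprev;	/* address of the pointer that points at us */
};

struct hhead {
	struct hnode *first;	/* one pointer per bucket, half of a list_head */
};

static void hash_add_head(struct hnode *n, struct hhead *h)
{
	n->next = h->first;
	if (h->first)
		h->first->pprev = &n->next;
	h->first = n;
	n->pprev = &h->first;
}

static void hash_del(struct hnode *n)
{
	*n->pprev = n->next;	/* whoever pointed at n now points past it */
	if (n->next)
		n->next->pprev = n->pprev;
	n->next = NULL;
	n->pprev = NULL;
}

int main(void)
{
	struct hhead bucket = { 0 };
	struct hnode a = { 0 }, b = { 0 };

	hash_add_head(&a, &bucket);
	hash_add_head(&b, &bucket);	/* chain is now b -> a */
	hash_del(&a);
	printf("first == &b: %d, b.next == NULL: %d\n",
	       bucket.first == &b, b.next == NULL);
	return 0;
}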
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index d055d987850c..7af90fc4f0fd 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,7 +39,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
39 kref_init(&ns->kref); 39 kref_init(&ns->kref);
40 40
41 for (n = 0; n < UIDHASH_SZ; ++n) 41 for (n = 0; n < UIDHASH_SZ; ++n)
42 INIT_LIST_HEAD(ns->uidhash_table + n); 42 INIT_HLIST_HEAD(ns->uidhash_table + n);
43 43
44 /* Insert new root user. */ 44 /* Insert new root user. */
45 ns->root_user = alloc_uid(ns, 0); 45 ns->root_user = alloc_uid(ns, 0);
@@ -81,6 +81,7 @@ void free_user_ns(struct kref *kref)
81 struct user_namespace *ns; 81 struct user_namespace *ns;
82 82
83 ns = container_of(kref, struct user_namespace, kref); 83 ns = container_of(kref, struct user_namespace, kref);
84 release_uids(ns);
84 kfree(ns); 85 kfree(ns);
85} 86}
86 87
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 9d8180a0f0d8..816d7b24fa03 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -28,7 +28,9 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
28 if (!ns) 28 if (!ns)
29 return ERR_PTR(-ENOMEM); 29 return ERR_PTR(-ENOMEM);
30 30
31 down_read(&uts_sem);
31 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 32 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
33 up_read(&uts_sem);
32 kref_init(&ns->kref); 34 kref_init(&ns->kref);
33 return ns; 35 return ns;
34} 36}
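The utsname fix takes uts_sem for reading around the memcpy() so a concurrent sethostname()/setdomainname() writer cannot modify old_ns->name while it is being copied into the new namespace. The same copy-under-read-lock pattern in userspace terms, with a pthread rwlock standing in for uts_sem (illustration only, made-up names):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

/* A rwlock standing in for uts_sem and a plain buffer standing in for
 * struct new_utsname. */
static pthread_rwlock_t name_lock = PTHREAD_RWLOCK_INITIALIZER;
static char current_name[65] = "original-host";

static void clone_name(char *dst, size_t len)
{
	/* Hold the lock for reading so a writer cannot change the buffer
	 * mid-copy and leave a torn name in the clone. */
	pthread_rwlock_rdlock(&name_lock);
	memcpy(dst, current_name, len);
	pthread_rwlock_unlock(&name_lock);
}

int main(void)
{
	char copy[65];

	clone_name(copy, sizeof(copy));
	printf("cloned name: %s\n", copy);
	return 0;
}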
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 58e5c152a6bb..e080d1d744cc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -635,7 +635,7 @@ int keventd_up(void)
635int current_is_keventd(void) 635int current_is_keventd(void)
636{ 636{
637 struct cpu_workqueue_struct *cwq; 637 struct cpu_workqueue_struct *cwq;
638 int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */ 638 int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */
639 int ret = 0; 639 int ret = 0;
640 640
641 BUG_ON(!keventd_wq); 641 BUG_ON(!keventd_wq);