Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpu.c                  |   4
-rw-r--r--  kernel/exit.c                 |  11
-rw-r--r--  kernel/fork.c                 |   2
-rw-r--r--  kernel/futex.c                |  26
-rw-r--r--  kernel/futex_compat.c         |  30
-rw-r--r--  kernel/irq/manage.c           |  11
-rw-r--r--  kernel/kmod.c                 |   2
-rw-r--r--  kernel/module.c               |   3
-rw-r--r--  kernel/power/Kconfig          |  41
-rw-r--r--  kernel/ptrace.c               |   1
-rw-r--r--  kernel/sched.c                | 112
-rw-r--r--  kernel/sched_debug.c          |   4
-rw-r--r--  kernel/sched_fair.c           | 216
-rw-r--r--  kernel/sched_rt.c             |  11
-rw-r--r--  kernel/signal.c               |  49
-rw-r--r--  kernel/sys.c                  |   5
-rw-r--r--  kernel/sysctl.c               |  41
-rw-r--r--  kernel/time/ntp.c             |   2
-rw-r--r--  kernel/time/tick-broadcast.c  |  17
-rw-r--r--  kernel/time/tick-sched.c      |  12
-rw-r--r--  kernel/time/timekeeping.c     |  10
-rw-r--r--  kernel/time/timer_stats.c     |   5
-rw-r--r--  kernel/user.c                 |  45
-rw-r--r--  kernel/user_namespace.c       |   3
-rw-r--r--  kernel/utsname.c              |   2
-rw-r--r--  kernel/workqueue.c            |   2
26 files changed, 464 insertions, 203 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 181ae7086029..38033db8d8ec 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -273,7 +273,7 @@ int __cpuinit cpu_up(unsigned int cpu)
 	return err;
 }
 
-#ifdef CONFIG_SUSPEND_SMP
+#ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_t frozen_cpus;
 
 int disable_nonboot_cpus(void)
@@ -334,4 +334,4 @@ void enable_nonboot_cpus(void)
 out:
 	mutex_unlock(&cpu_add_remove_lock);
 }
-#endif
+#endif /* CONFIG_PM_SLEEP_SMP */
diff --git a/kernel/exit.c b/kernel/exit.c
index 9578c1ae19ca..993369ee94d1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -24,7 +24,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/profile.h>
-#include <linux/signalfd.h>
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/kthread.h>
@@ -86,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
 	sighand = rcu_dereference(tsk->sighand);
 	spin_lock(&sighand->siglock);
 
-	/*
-	 * Notify that this sighand has been detached. This must
-	 * be called with the tsk->sighand lock held. Also, this
-	 * access tsk->sighand internally, so it must be called
-	 * before tsk->sighand is reset.
-	 */
-	signalfd_detach_locked(tsk);
-
 	posix_cpu_timers_exit(tsk);
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
@@ -975,6 +966,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
 
+	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
 
 	exit_mm(tsk);
@@ -996,7 +988,6 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (tsk->binfmt)
 		module_put(tsk->binfmt->module);
 
-	tsk->exit_code = code;
 	proc_exit_connector(tsk);
 	exit_task_namespaces(tsk);
 	exit_notify(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7332e236d367..33f12f48684a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1438,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep,
 	struct sighand_struct *sighand = data;
 
 	spin_lock_init(&sighand->siglock);
-	INIT_LIST_HEAD(&sighand->signalfd_list);
+	init_waitqueue_head(&sighand->signalfd_wqh);
 }
 
 void __init proc_caches_init(void)
diff --git a/kernel/futex.c b/kernel/futex.c
index e8935b195e88..fcc94e7b4086 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1943,9 +1943,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
 void exit_robust_list(struct task_struct *curr)
 {
 	struct robust_list_head __user *head = curr->robust_list;
-	struct robust_list __user *entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+	struct robust_list __user *entry, *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
 	unsigned long futex_offset;
+	int rc;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
@@ -1965,12 +1966,14 @@ void exit_robust_list(struct task_struct *curr)
 	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
 		return;
 
-	if (pending)
-		handle_futex_death((void __user *)pending + futex_offset,
-				   curr, pip);
-
+	next_entry = NULL;	/* avoid warning with gcc */
 	while (entry != &head->list) {
 		/*
+		 * Fetch the next entry in the list before calling
+		 * handle_futex_death:
+		 */
+		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+		/*
 		 * A pending lock might already be on the list, so
 		 * don't process it twice:
 		 */
@@ -1978,11 +1981,10 @@ void exit_robust_list(struct task_struct *curr)
 		if (handle_futex_death((void __user *)entry + futex_offset,
 					curr, pi))
 			return;
-		/*
-		 * Fetch the next entry in the list:
-		 */
-		if (fetch_robust_entry(&entry, &entry->next, &pi))
+		if (rc)
 			return;
+		entry = next_entry;
+		pi = next_pi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
@@ -1991,6 +1993,10 @@ void exit_robust_list(struct task_struct *curr)
 
 		cond_resched();
 	}
+
+	if (pending)
+		handle_futex_death((void __user *)pending + futex_offset,
+				   curr, pip);
 }
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
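For illustration only (not from the kernel sources): the futex change above walks the robust list by fetching the next entry before the current one is handled, so the walk no longer depends on the current entry staying intact, and the list_op_pending entry is processed only after the loop. A minimal user-space sketch of the same walk-with-prefetch pattern on an ordinary singly linked list; the names (node, visit, walk_list) are made up for this example.

#include <stddef.h>

struct node {
	struct node *next;
	int value;
};

/*
 * Walk a singly linked list when visit() may unlink, free or otherwise
 * invalidate the node it is given: read ->next before visiting, in the
 * same spirit as exit_robust_list() fetching next_entry before it calls
 * handle_futex_death() in the hunk above.
 */
static void walk_list(struct node *head, void (*visit)(struct node *))
{
	struct node *cur = head;

	while (cur) {
		struct node *next = cur->next;	/* prefetch before visiting */

		visit(cur);			/* may invalidate cur */
		cur = next;
	}
}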
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index f7921360efad..2c2e2954b713 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -38,10 +38,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
 void compat_exit_robust_list(struct task_struct *curr)
 {
 	struct compat_robust_list_head __user *head = curr->compat_robust_list;
-	struct robust_list __user *entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	compat_uptr_t uentry, upending;
+	struct robust_list __user *entry, *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip;
+	compat_uptr_t uentry, next_uentry, upending;
 	compat_long_t futex_offset;
+	int rc;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
@@ -61,10 +62,15 @@ void compat_exit_robust_list(struct task_struct *curr)
 	if (fetch_robust_entry(&upending, &pending,
 			       &head->list_op_pending, &pip))
 		return;
-	if (upending)
-		handle_futex_death((void __user *)pending + futex_offset, curr, pip);
 
-	while (compat_ptr(uentry) != &head->list) {
+	next_entry = NULL;	/* avoid warning with gcc */
+	while (entry != (struct robust_list __user *) &head->list) {
+		/*
+		 * Fetch the next entry in the list before calling
+		 * handle_futex_death:
+		 */
+		rc = fetch_robust_entry(&next_uentry, &next_entry,
+			(compat_uptr_t __user *)&entry->next, &next_pi);
 		/*
 		 * A pending lock might already be on the list, so
 		 * dont process it twice:
@@ -74,12 +80,11 @@ void compat_exit_robust_list(struct task_struct *curr)
 				   curr, pi))
 			return;
 
-		/*
-		 * Fetch the next entry in the list:
-		 */
-		if (fetch_robust_entry(&uentry, &entry,
-			(compat_uptr_t __user *)&entry->next, &pi))
+		if (rc)
 			return;
+		uentry = next_uentry;
+		entry = next_entry;
+		pi = next_pi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
@@ -88,6 +93,9 @@ void compat_exit_robust_list(struct task_struct *curr)
 
 		cond_resched();
 	}
+	if (pending)
+		handle_futex_death((void __user *)pending + futex_offset,
+				   curr, pip);
 }
 
 asmlinkage long
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 853aefbd184b..7230d914eaa2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -547,14 +547,11 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 		 * We do this before actually registering it, to make sure that
 		 * a 'real' IRQ doesn't run in parallel with our fake
 		 */
-		if (irqflags & IRQF_DISABLED) {
-			unsigned long flags;
+		unsigned long flags;
 
 		local_irq_save(flags);
 		handler(irq, dev_id);
 		local_irq_restore(flags);
-		} else
-			handler(irq, dev_id);
 	}
 #endif
 
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9809cc1f33d6..c6a4f8aebeba 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -505,7 +505,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 	if (ret < 0)
 		goto out;
 
-	return call_usermodehelper_exec(sub_info, 1);
+	return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
 
 out:
 	call_usermodehelper_freeinfo(sub_info);
diff --git a/kernel/module.c b/kernel/module.c
index 33c04ad51175..db0ead0363e2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -784,8 +784,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
 static ssize_t show_refcnt(struct module_attribute *mattr,
 			   struct module *mod, char *buffer)
 {
-	/* sysfs holds a reference */
-	return sprintf(buffer, "%u\n", module_refcount(mod)-1);
+	return sprintf(buffer, "%u\n", module_refcount(mod));
 }
 
 static struct module_attribute refcnt = {
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 412859f8d94a..14b0e10dc95c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -72,15 +72,10 @@ config PM_TRACE
 	CAUTION: this option will cause your machine's real-time clock to be
 	set to an invalid time after a resume.
 
-config SUSPEND_SMP_POSSIBLE
-	bool
-	depends on (X86 && !X86_VOYAGER) || (PPC64 && (PPC_PSERIES || PPC_PMAC))
-	depends on SMP
-	default y
-
-config SUSPEND_SMP
+config PM_SLEEP_SMP
 	bool
-	depends on SUSPEND_SMP_POSSIBLE && PM_SLEEP
+	depends on SUSPEND_SMP_POSSIBLE || HIBERNATION_SMP_POSSIBLE
+	depends on PM_SLEEP
 	select HOTPLUG_CPU
 	default y
 
@@ -89,20 +84,46 @@ config PM_SLEEP
 	depends on SUSPEND || HIBERNATION
 	default y
 
+config SUSPEND_UP_POSSIBLE
+	bool
+	depends on (X86 && !X86_VOYAGER) || PPC || ARM || BLACKFIN || MIPS \
+		   || SUPERH || FRV
+	depends on !SMP
+	default y
+
+config SUSPEND_SMP_POSSIBLE
+	bool
+	depends on (X86 && !X86_VOYAGER) \
+		   || (PPC && (PPC_PSERIES || PPC_PMAC)) || ARM
+	depends on SMP
+	default y
+
 config SUSPEND
 	bool "Suspend to RAM and standby"
 	depends on PM
-	depends on !SMP || SUSPEND_SMP_POSSIBLE
+	depends on SUSPEND_UP_POSSIBLE || SUSPEND_SMP_POSSIBLE
 	default y
 	---help---
 	  Allow the system to enter sleep states in which main memory is
 	  powered and thus its contents are preserved, such as the
 	  suspend-to-RAM state (i.e. the ACPI S3 state).
 
+config HIBERNATION_UP_POSSIBLE
+	bool
+	depends on X86 || PPC64_SWSUSP || PPC32
+	depends on !SMP
+	default y
+
+config HIBERNATION_SMP_POSSIBLE
+	bool
+	depends on (X86 && !X86_VOYAGER) || PPC64_SWSUSP
+	depends on SMP
+	default y
+
 config HIBERNATION
 	bool "Hibernation (aka 'suspend to disk')"
 	depends on PM && SWAP
-	depends on ((X86 || PPC64_SWSUSP || FRV || PPC32) && !SMP) || SUSPEND_SMP_POSSIBLE
+	depends on HIBERNATION_UP_POSSIBLE || HIBERNATION_SMP_POSSIBLE
 	---help---
 	  Enable the suspend to disk (STD) functionality, which is usually
 	  called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 82a558b655da..3eca7a55f2ee 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -233,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 
 	/* Architecture-specific hardware disable .. */
 	ptrace_disable(child);
+	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 
 	write_lock_irq(&tasklist_lock);
 	/* protect against de_thread()->release_task() */
diff --git a/kernel/sched.c b/kernel/sched.c
index 45e17b83b7f1..6107a0cd6325 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,7 +262,8 @@ struct rq {
 	s64 clock_max_delta;
 
 	unsigned int clock_warps, clock_overflows;
-	unsigned int clock_unstable_events;
+	u64 idle_clock;
+	unsigned int clock_deep_idle_events;
 	u64 tick_timestamp;
 
 	atomic_t nr_iowait;
@@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void)
 }
 
 /*
- * CPU frequency is/was unstable - start new by setting prev_clock_raw:
+ * We are going deep-idle (irqs are disabled):
  */
-void sched_clock_unstable_event(void)
+void sched_clock_idle_sleep_event(void)
 {
-	unsigned long flags;
-	struct rq *rq;
+	struct rq *rq = cpu_rq(smp_processor_id());
 
-	rq = task_rq_lock(current, &flags);
-	rq->prev_clock_raw = sched_clock();
-	rq->clock_unstable_events++;
-	task_rq_unlock(rq, &flags);
+	spin_lock(&rq->lock);
+	__update_rq_clock(rq);
+	spin_unlock(&rq->lock);
+	rq->clock_deep_idle_events++;
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+	struct rq *rq = cpu_rq(smp_processor_id());
+	u64 now = sched_clock();
+
+	rq->idle_clock += delta_ns;
+	/*
+	 * Override the previous timestamp and ignore all
+	 * sched_clock() deltas that occured while we idled,
+	 * and use the PM-provided delta_ns to advance the
+	 * rq clock:
+	 */
+	spin_lock(&rq->lock);
+	rq->prev_clock_raw = now;
+	rq->clock += delta_ns;
+	spin_unlock(&rq->lock);
 }
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -645,7 +668,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor)
 /*
  * Shift right and round:
  */
-#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
@@ -661,10 +684,10 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	 * Check whether we'd overflow the 64-bit multiplication:
 	 */
 	if (unlikely(tmp > WMULT_CONST))
-		tmp = RSR(RSR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
+		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
 			WMULT_SHIFT/2);
 	else
-		tmp = RSR(tmp * lw->inv_weight, WMULT_SHIFT);
+		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
 
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
@@ -835,7 +858,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq)
 
 static void set_load_weight(struct task_struct *p)
 {
-	task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
 	p->se.wait_runtime = 0;
 
 	if (task_has_rt_policy(p)) {
@@ -1564,6 +1586,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.wait_start_fair = 0;
 	p->se.exec_start = 0;
 	p->se.sum_exec_runtime = 0;
+	p->se.prev_sum_exec_runtime = 0;
 	p->se.delta_exec = 0;
 	p->se.delta_fair_run = 0;
 	p->se.delta_fair_sleep = 0;
@@ -1659,6 +1682,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 
 	p->prio = effective_prio(p);
 
+	if (rt_prio(p->prio))
+		p->sched_class = &rt_sched_class;
+	else
+		p->sched_class = &fair_sched_class;
+
 	if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
 			(clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
 			!current->se.on_rq) {
@@ -2157,12 +2185,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	if (task_running(rq, p))
 		return 0;
 
-	/*
-	 * Aggressive migration if too many balance attempts have failed:
-	 */
-	if (sd->nr_balance_failed > sd->cache_nice_tries)
-		return 1;
-
 	return 1;
 }
 
@@ -2494,7 +2516,7 @@ group_next:
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
+	if (*imbalance < busiest_load_per_task) {
 		unsigned long tmp, pwr_now, pwr_move;
 		unsigned int imbn;
 
@@ -2546,10 +2568,8 @@ small_imbalance:
 	pwr_move /= SCHED_LOAD_SCALE;
 
 	/* Move if we gain throughput */
-	if (pwr_move <= pwr_now)
-		goto out_balanced;
-
-	*imbalance = busiest_load_per_task;
+	if (pwr_move > pwr_now)
+		*imbalance = busiest_load_per_task;
 	}
 
 	return busiest;
@@ -3020,6 +3040,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	struct sched_domain *sd;
 	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
+	int update_next_balance = 0;
 
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3056,8 +3077,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		if (sd->flags & SD_SERIALIZE)
 			spin_unlock(&balancing);
 out:
-		if (time_after(next_balance, sd->last_balance + interval))
+		if (time_after(next_balance, sd->last_balance + interval)) {
 			next_balance = sd->last_balance + interval;
+			update_next_balance = 1;
+		}
 
 		/*
 		 * Stop the load balance at this level. There is another
@@ -3067,7 +3090,14 @@ out:
 		if (!balance)
 			break;
 	}
-	rq->next_balance = next_balance;
+
+	/*
+	 * next_balance will be updated only when there is a need.
+	 * When the cpu is attached to null domain for ex, it will not be
+	 * updated.
+	 */
+	if (likely(update_next_balance))
+		rq->next_balance = next_balance;
 }
 
 /*
@@ -4525,10 +4555,7 @@ asmlinkage long sys_sched_yield(void)
 	struct rq *rq = this_rq_lock();
 
 	schedstat_inc(rq, yld_cnt);
-	if (unlikely(rq->nr_running == 1))
-		schedstat_inc(rq, yld_act_empty);
-	else
-		current->sched_class->yield_task(rq, current);
+	current->sched_class->yield_task(rq, current);
 
 	/*
 	 * Since we are going to call schedule() anyway, there's
@@ -4884,14 +4911,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
 static inline void sched_init_granularity(void)
 {
 	unsigned int factor = 1 + ilog2(num_online_cpus());
-	const unsigned long gran_limit = 100000000;
+	const unsigned long limit = 100000000;
+
+	sysctl_sched_min_granularity *= factor;
+	if (sysctl_sched_min_granularity > limit)
+		sysctl_sched_min_granularity = limit;
 
-	sysctl_sched_granularity *= factor;
-	if (sysctl_sched_granularity > gran_limit)
-		sysctl_sched_granularity = gran_limit;
+	sysctl_sched_latency *= factor;
+	if (sysctl_sched_latency > limit)
+		sysctl_sched_latency = limit;
 
-	sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
-	sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
+	sysctl_sched_runtime_limit = sysctl_sched_latency;
+	sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2;
 }
 
 #ifdef CONFIG_SMP
@@ -5234,15 +5265,16 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 static struct ctl_table sd_ctl_dir[] = {
 	{
 		.procname = "sched_domain",
-		.mode = 0755,
+		.mode = 0555,
 	},
 	{0,},
 };
 
 static struct ctl_table sd_ctl_root[] = {
 	{
+		.ctl_name = CTL_KERN,
 		.procname = "kernel",
-		.mode = 0755,
+		.mode = 0555,
 		.child = sd_ctl_dir,
 	},
 	{0,},
@@ -5318,7 +5350,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 	for_each_domain(cpu, sd) {
 		snprintf(buf, 32, "domain%d", i);
 		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0755;
+		entry->mode = 0555;
 		entry->child = sd_alloc_ctl_domain_table(sd);
 		entry++;
 		i++;
@@ -5338,7 +5370,7 @@ static void init_sched_domain_sysctl(void)
 	for (i = 0; i < cpu_num; i++, entry++) {
 		snprintf(buf, 32, "cpu%d", i);
 		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0755;
+		entry->mode = 0555;
 		entry->child = sd_alloc_ctl_cpu_table(i);
 	}
 	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 87e524762b85..c3ee38bd3426 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu)
 	P(next_balance);
 	P(curr->pid);
 	P(clock);
+	P(idle_clock);
 	P(prev_clock_raw);
 	P(clock_warps);
 	P(clock_overflows);
-	P(clock_unstable_events);
+	P(clock_deep_idle_events);
 	P(clock_max_delta);
 	P(cpu_load[0]);
 	P(cpu_load[1]);
@@ -282,4 +283,5 @@ void proc_sched_set_task(struct task_struct *p)
 	p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
 #endif
 	p->se.sum_exec_runtime = 0;
+	p->se.prev_sum_exec_runtime = 0;
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fedbb51bba96..67c67a87146e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -15,34 +15,50 @@
  *
  * Scaled math optimizations by Thomas Gleixner
  * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  */
 
 /*
- * Preemption granularity:
- * (default: 2 msec, units: nanoseconds)
+ * Targeted preemption latency for CPU-bound tasks:
+ * (default: 20ms, units: nanoseconds)
  *
- * NOTE: this granularity value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS will typically be somewhat
- * larger than this value. (to see the precise effective timeslice
- * length of your workload, run vmstat and monitor the context-switches
- * field)
+ * NOTE: this latency value is not the same as the concept of
+ * 'timeslice length' - timeslices in CFS are of variable length.
+ * (to see the precise effective timeslice length of your workload,
+ * run vmstat and monitor the context-switches field)
  *
  * On SMP systems the value of this is multiplied by the log2 of the
  * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
  * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
+ * Targeted preemption latency for CPU-bound tasks:
+ */
+unsigned int sysctl_sched_latency __read_mostly = 20000000ULL;
+
+/*
+ * Minimal preemption granularity for CPU-bound tasks:
+ * (default: 2 msec, units: nanoseconds)
  */
-unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ;
+unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL;
+
+/*
+ * sys_sched_yield() compat mode
+ *
+ * This option switches the agressive yield implementation of the
+ * old scheduler back on.
+ */
+unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_BATCH wake-up granularity.
- * (default: 10 msec, units: nanoseconds)
+ * (default: 25 msec, units: nanoseconds)
  *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
-unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
-	10000000000ULL/HZ;
+unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL;
 
 /*
  * SCHED_OTHER wake-up granularity.
@@ -52,12 +68,12 @@ unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
  * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
-unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ;
+unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL;
 
 unsigned int sysctl_sched_stat_granularity __read_mostly;
 
 /*
- * Initialized in sched_init_granularity():
+ * Initialized in sched_init_granularity() [to 5 times the base granularity]:
 */
 unsigned int sysctl_sched_runtime_limit __read_mostly;
 
@@ -186,6 +202,8 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_add(&cfs_rq->load, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
+
+	schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
 }
 
 static inline void
@@ -197,6 +215,8 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
+
+	schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
 }
 
 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
@@ -214,6 +234,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 */
 
 /*
+ * Calculate the preemption granularity needed to schedule every
+ * runnable task once per sysctl_sched_latency amount of time.
+ * (down to a sensible low limit on granularity)
+ *
+ * For example, if there are 2 tasks running and latency is 10 msecs,
+ * we switch tasks every 5 msecs. If we have 3 tasks running, we have
+ * to switch tasks every 3.33 msecs to get a 10 msecs observed latency
+ * for each task. We do finer and finer scheduling up to until we
+ * reach the minimum granularity value.
+ *
+ * To achieve this we use the following dynamic-granularity rule:
+ *
+ * gran = lat/nr - lat/nr/nr
+ *
+ * This comes out of the following equations:
+ *
+ * kA1 + gran = kB1
+ * kB2 + gran = kA2
+ * kA2 = kA1
+ * kB2 = kB1 - d + d/nr
+ * lat = d * nr
+ *
+ * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running),
+ * '1' is start of time, '2' is end of time, 'd' is delay between
+ * 1 and 2 (during which task B was running), 'nr' is number of tasks
+ * running, 'lat' is the the period of each task. ('lat' is the
+ * sched_latency that we aim for.)
+ */
+static long
+sched_granularity(struct cfs_rq *cfs_rq)
+{
+	unsigned int gran = sysctl_sched_latency;
+	unsigned int nr = cfs_rq->nr_running;
+
+	if (nr > 1) {
+		gran = gran/nr - gran/nr/nr;
+		gran = max(gran, sysctl_sched_min_granularity);
+	}
+
+	return gran;
+}
+
+/*
  * We rescale the rescheduling granularity of tasks according to their
 * nice level, but only linearly, not exponentially:
 */
@@ -240,7 +303,7 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity)
 	/*
 	 * It will always fit into 'long':
 	 */
-	return (long) (tmp >> WMULT_SHIFT);
+	return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT));
 }
 
 static inline void
@@ -303,10 +366,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	delta_fair = calc_delta_fair(delta_exec, lw);
 	delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
 
-	if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) {
-		delta = min(cfs_rq->sleeper_bonus, (u64)delta_exec);
-		delta = calc_delta_mine(delta, curr->load.weight, lw);
-		delta = min((u64)delta, cfs_rq->sleeper_bonus);
+	if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) {
+		delta = min((u64)delta_mine, cfs_rq->sleeper_bonus);
+		delta = min(delta, (unsigned long)(
+			(long)sysctl_sched_runtime_limit - curr->wait_runtime));
 		cfs_rq->sleeper_bonus -= delta;
 		delta_mine -= delta;
 	}
@@ -438,6 +501,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long delta_fair;
 
+	if (unlikely(!se->wait_start_fair))
+		return;
+
 	delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
 			(u64)(cfs_rq->fair_clock - se->wait_start_fair));
 
@@ -494,6 +560,13 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	unsigned long load = cfs_rq->load.weight, delta_fair;
 	long prev_runtime;
 
+	/*
+	 * Do not boost sleepers if there's too much bonus 'in flight'
+	 * already:
+	 */
+	if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
+		return;
+
 	if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
 		load = rq_of(cfs_rq)->cpu_load[2];
 
@@ -519,10 +592,6 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * Track the amount of bonus we've given to sleepers:
 	 */
 	cfs_rq->sleeper_bonus += delta_fair;
-	if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
-		cfs_rq->sleeper_bonus = sysctl_sched_runtime_limit;
-
-	schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
 }
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -570,6 +639,16 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 		se->block_start = 0;
 		se->sum_sleep_runtime += delta;
+
+		/*
+		 * Blocking time is in units of nanosecs, so shift by 20 to
+		 * get a milliseconds-range estimation of the amount of
+		 * time that the task spent sleeping:
+		 */
+		if (unlikely(prof_on == SLEEP_PROFILING)) {
+			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
+				     delta >> 20);
+		}
 	}
 #endif
 }
@@ -604,7 +683,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 			if (tsk->state & TASK_UNINTERRUPTIBLE)
 				se->block_start = rq_of(cfs_rq)->clock;
 		}
-		cfs_rq->wait_runtime -= se->wait_runtime;
 #endif
 	}
 	__dequeue_entity(cfs_rq, se);
@@ -618,11 +696,31 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			  struct sched_entity *curr, unsigned long granularity)
 {
 	s64 __delta = curr->fair_key - se->fair_key;
+	unsigned long ideal_runtime, delta_exec;
+
+	/*
+	 * ideal_runtime is compared against sum_exec_runtime, which is
+	 * walltime, hence do not scale.
+	 */
+	ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running,
+			(unsigned long)sysctl_sched_min_granularity);
+
+	/*
+	 * If we executed more than what the latency constraint suggests,
+	 * reduce the rescheduling granularity. This way the total latency
+	 * of how much a task is not scheduled converges to
+	 * sysctl_sched_latency:
+	 */
+	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+	if (delta_exec > ideal_runtime)
+		granularity = 0;
 
 	/*
 	 * Take scheduling granularity into account - do not
 	 * preempt the current task unless the best task has
 	 * a larger than sched_granularity fairness advantage:
+	 *
+	 * scale granularity as key space is in fair_clock.
 	 */
 	if (__delta > niced_granularity(curr, granularity))
 		resched_task(rq_of(cfs_rq)->curr);
@@ -641,6 +739,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_stats_wait_end(cfs_rq, se);
 	update_stats_curr_start(cfs_rq, se);
 	set_cfs_rq_curr(cfs_rq, se);
+	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
@@ -686,7 +785,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	if (next == curr)
 		return;
 
-	__check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity);
+	__check_preempt_curr_fair(cfs_rq, next, curr,
+				  sched_granularity(cfs_rq));
 }
 
 /**************************************************
@@ -815,19 +915,62 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 }
 
 /*
- * sched_yield() support is very simple - we dequeue and enqueue
+ * sched_yield() support is very simple - we dequeue and enqueue.
+ *
+ * If compat_yield is turned on then we requeue to the end of the tree.
 */
 static void yield_task_fair(struct rq *rq, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+	struct sched_entity *rightmost, *se = &p->se;
+	struct rb_node *parent;
 
-	__update_rq_clock(rq);
-	/*
-	 * Dequeue and enqueue the task to update its
-	 * position within the tree:
-	 */
-	dequeue_entity(cfs_rq, &p->se, 0);
-	enqueue_entity(cfs_rq, &p->se, 0);
+	/*
+	 * Are we the only task in the tree?
+	 */
+	if (unlikely(cfs_rq->nr_running == 1))
+		return;
+
+	if (likely(!sysctl_sched_compat_yield)) {
+		__update_rq_clock(rq);
+		/*
+		 * Dequeue and enqueue the task to update its
+		 * position within the tree:
+		 */
+		dequeue_entity(cfs_rq, &p->se, 0);
+		enqueue_entity(cfs_rq, &p->se, 0);
+
+		return;
+	}
+	/*
+	 * Find the rightmost entry in the rbtree:
+	 */
+	do {
+		parent = *link;
+		link = &parent->rb_right;
+	} while (*link);
+
+	rightmost = rb_entry(parent, struct sched_entity, run_node);
+	/*
+	 * Already in the rightmost position?
+	 */
+	if (unlikely(rightmost == se))
+		return;
+
+	/*
+	 * Minimally necessary key value to be last in the tree:
+	 */
+	se->fair_key = rightmost->fair_key + 1;
+
+
+	if (cfs_rq->rb_leftmost == &se->run_node)
+		cfs_rq->rb_leftmost = rb_next(&se->run_node);
+	/*
+	 * Relink the task to the rightmost position:
+	 */
+	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+	rb_link_node(&se->run_node, parent, link);
+	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 }
 
 /*
@@ -1020,31 +1163,32 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
 static void task_new_fair(struct rq *rq, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
-	struct sched_entity *se = &p->se;
+	struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq);
 
 	sched_info_queued(p);
 
+	update_curr(cfs_rq);
 	update_stats_enqueue(cfs_rq, se);
 	/*
 	 * Child runs first: we let it run before the parent
 	 * until it reschedules once. We set up the key so that
 	 * it will preempt the parent:
 	 */
-	p->se.fair_key = current->se.fair_key -
-		niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1;
+	se->fair_key = curr->fair_key -
+		niced_granularity(curr, sched_granularity(cfs_rq)) - 1;
 	/*
 	 * The first wait is dominated by the child-runs-first logic,
 	 * so do not credit it with that waiting time yet:
 	 */
 	if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
-		p->se.wait_start_fair = 0;
+		se->wait_start_fair = 0;
 
 	/*
 	 * The statistical average of wait_runtime is about
 	 * -granularity/2, so initialize the task with that:
 	 */
 	if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
-		p->se.wait_runtime = -(sysctl_sched_granularity / 2);
+		se->wait_runtime = -(sched_granularity(cfs_rq) / 2);
 
 	__enqueue_entity(cfs_rq, se);
 }
@@ -1057,7 +1201,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 */
 static void set_curr_task_fair(struct rq *rq)
 {
-	struct sched_entity *se = &rq->curr.se;
+	struct sched_entity *se = &rq->curr->se;
 
 	for_each_sched_entity(se)
 		set_next_entity(cfs_rq_of(se), se);
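For illustration only (not part of the patch): the dynamic-granularity rule gran = lat/nr - lat/nr/nr that sched_granularity() introduces above can be evaluated on its own. A small standalone C program using the defaults this patch sets (sysctl_sched_latency = 20 ms, sysctl_sched_min_granularity = 2 ms); for nr = 2 it prints 5 ms, for nr = 4 it prints 3.75 ms, and the clamp mirrors the max() in sched_granularity() so the result never drops below 2 ms.

#include <stdio.h>

int main(void)
{
	const unsigned int lat = 20000000U;	/* 20 ms in ns (patch default) */
	const unsigned int min_gran = 2000000U;	/* 2 ms in ns (patch default) */
	unsigned int nr;

	for (nr = 1; nr <= 16; nr *= 2) {
		unsigned int gran = lat;

		if (nr > 1) {
			/* gran = lat/nr - lat/nr/nr, clamped at min_gran */
			gran = gran / nr - gran / nr / nr;
			if (gran < min_gran)
				gran = min_gran;
		}
		printf("nr=%2u  gran=%u ns\n", nr, gran);
	}
	return 0;
}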
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index dcdcad632fd9..4b87476a02d0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -207,10 +207,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
 		return;
 
 	p->time_slice = static_prio_timeslice(p->static_prio);
-	set_tsk_need_resched(p);
 
-	/* put it at the end of the queue: */
-	requeue_task_rt(rq, p);
+	/*
+	 * Requeue to the end of queue if we are not the only element
+	 * on the queue:
+	 */
+	if (p->run_list.prev != p->run_list.next) {
+		requeue_task_rt(rq, p);
+		set_tsk_need_resched(p);
+	}
 }
 
 static struct sched_class rt_sched_class __read_mostly = {
diff --git a/kernel/signal.c b/kernel/signal.c index ad63109e413c..792952381092 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -378,8 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
378 | /* We only dequeue private signals from ourselves, we don't let | 378 | /* We only dequeue private signals from ourselves, we don't let |
379 | * signalfd steal them | 379 | * signalfd steal them |
380 | */ | 380 | */ |
381 | if (likely(tsk == current)) | 381 | signr = __dequeue_signal(&tsk->pending, mask, info); |
382 | signr = __dequeue_signal(&tsk->pending, mask, info); | ||
383 | if (!signr) { | 382 | if (!signr) { |
384 | signr = __dequeue_signal(&tsk->signal->shared_pending, | 383 | signr = __dequeue_signal(&tsk->signal->shared_pending, |
385 | mask, info); | 384 | mask, info); |
@@ -407,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
407 | } | 406 | } |
408 | } | 407 | } |
409 | } | 408 | } |
410 | if (likely(tsk == current)) | 409 | recalc_sigpending(); |
411 | recalc_sigpending(); | ||
412 | if (signr && unlikely(sig_kernel_stop(signr))) { | 410 | if (signr && unlikely(sig_kernel_stop(signr))) { |
413 | /* | 411 | /* |
414 | * Set a marker that we have dequeued a stop signal. Our | 412 | * Set a marker that we have dequeued a stop signal. Our |
@@ -425,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
425 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) | 423 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) |
426 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | 424 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; |
427 | } | 425 | } |
428 | if (signr && likely(tsk == current) && | 426 | if (signr && |
429 | ((info->si_code & __SI_MASK) == __SI_TIMER) && | 427 | ((info->si_code & __SI_MASK) == __SI_TIMER) && |
430 | info->si_sys_private){ | 428 | info->si_sys_private){ |
431 | /* | 429 | /* |
@@ -533,18 +531,18 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
533 | if (!valid_signal(sig)) | 531 | if (!valid_signal(sig)) |
534 | return error; | 532 | return error; |
535 | 533 | ||
536 | error = audit_signal_info(sig, t); /* Let audit system see the signal */ | 534 | if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { |
537 | if (error) | 535 | error = audit_signal_info(sig, t); /* Let audit system see the signal */ |
538 | return error; | 536 | if (error) |
539 | 537 | return error; | |
540 | error = -EPERM; | 538 | error = -EPERM; |
541 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) | 539 | if (((sig != SIGCONT) || |
542 | && ((sig != SIGCONT) || | 540 | (process_session(current) != process_session(t))) |
543 | (process_session(current) != process_session(t))) | 541 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) |
544 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) | 542 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) |
545 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) | 543 | && !capable(CAP_KILL)) |
546 | && !capable(CAP_KILL)) | ||
547 | return error; | 544 | return error; |
545 | } | ||
548 | 546 | ||
549 | return security_task_kill(t, info, sig, 0); | 547 | return security_task_kill(t, info, sig, 0); |
550 | } | 548 | } |
@@ -1300,20 +1298,19 @@ struct sigqueue *sigqueue_alloc(void) | |||
1300 | void sigqueue_free(struct sigqueue *q) | 1298 | void sigqueue_free(struct sigqueue *q) |
1301 | { | 1299 | { |
1302 | unsigned long flags; | 1300 | unsigned long flags; |
1301 | spinlock_t *lock = ¤t->sighand->siglock; | ||
1302 | |||
1303 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1303 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
1304 | /* | 1304 | /* |
1305 | * If the signal is still pending remove it from the | 1305 | * If the signal is still pending remove it from the |
1306 | * pending queue. | 1306 | * pending queue. We must hold ->siglock while testing |
1307 | * q->list to serialize with collect_signal(). | ||
1307 | */ | 1308 | */ |
1308 | if (unlikely(!list_empty(&q->list))) { | 1309 | spin_lock_irqsave(lock, flags); |
1309 | spinlock_t *lock = ¤t->sighand->siglock; | 1310 | if (!list_empty(&q->list)) |
1310 | read_lock(&tasklist_lock); | 1311 | list_del_init(&q->list); |
1311 | spin_lock_irqsave(lock, flags); | 1312 | spin_unlock_irqrestore(lock, flags); |
1312 | if (!list_empty(&q->list)) | 1313 | |
1313 | list_del_init(&q->list); | ||
1314 | spin_unlock_irqrestore(lock, flags); | ||
1315 | read_unlock(&tasklist_lock); | ||
1316 | } | ||
1317 | q->flags &= ~SIGQUEUE_PREALLOC; | 1314 | q->flags &= ~SIGQUEUE_PREALLOC; |
1318 | __sigqueue_free(q); | 1315 | __sigqueue_free(q); |
1319 | } | 1316 | } |
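
The sigqueue_free() hunk replaces the unlocked list_empty() fast path (and the tasklist_lock dance around it) with an unconditional ->siglock section: as the new comment says, the "is it still queued?" test is only meaningful while holding the same lock the consumer (collect_signal) takes. A userspace sketch of that test-and-unlink-under-the-consumer's-lock pattern with a pthread mutex; the structure and names are invented for the illustration:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdlib.h>

    struct qnode { struct qnode *next, *prev; bool queued; };

    /* One lock shared by the consumer that unlinks delivered entries and
     * by the producer that wants to free a preallocated one. */
    static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

    static void qnode_free(struct qnode *n)
    {
        pthread_mutex_lock(&queue_lock);   /* same lock as the consumer */
        if (n->queued) {                   /* test only while holding it */
            n->prev->next = n->next;
            n->next->prev = n->prev;
            n->queued = false;
        }
        pthread_mutex_unlock(&queue_lock);
        free(n);                           /* nobody can still see it queued */
    }

    int main(void)
    {
        struct qnode *n = calloc(1, sizeof(*n));
        qnode_free(n);                     /* never queued here: just freed */
        return 0;
    }
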
diff --git a/kernel/sys.c b/kernel/sys.c index 449b81b98b3d..8ae2e636eb1b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/getcpu.h> | 32 | #include <linux/getcpu.h> |
33 | #include <linux/task_io_accounting_ops.h> | 33 | #include <linux/task_io_accounting_ops.h> |
34 | #include <linux/seccomp.h> | 34 | #include <linux/seccomp.h> |
35 | #include <linux/cpu.h> | ||
35 | 36 | ||
36 | #include <linux/compat.h> | 37 | #include <linux/compat.h> |
37 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
@@ -878,6 +879,7 @@ void kernel_power_off(void) | |||
878 | kernel_shutdown_prepare(SYSTEM_POWER_OFF); | 879 | kernel_shutdown_prepare(SYSTEM_POWER_OFF); |
879 | if (pm_power_off_prepare) | 880 | if (pm_power_off_prepare) |
880 | pm_power_off_prepare(); | 881 | pm_power_off_prepare(); |
882 | disable_nonboot_cpus(); | ||
881 | sysdev_shutdown(); | 883 | sysdev_shutdown(); |
882 | printk(KERN_EMERG "Power down.\n"); | 884 | printk(KERN_EMERG "Power down.\n"); |
883 | machine_power_off(); | 885 | machine_power_off(); |
@@ -1442,7 +1444,6 @@ asmlinkage long sys_times(struct tms __user * tbuf) | |||
1442 | * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. | 1444 | * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. |
1443 | * LBT 04.03.94 | 1445 | * LBT 04.03.94 |
1444 | */ | 1446 | */ |
1445 | |||
1446 | asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | 1447 | asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) |
1447 | { | 1448 | { |
1448 | struct task_struct *p; | 1449 | struct task_struct *p; |
@@ -1470,7 +1471,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
1470 | if (!thread_group_leader(p)) | 1471 | if (!thread_group_leader(p)) |
1471 | goto out; | 1472 | goto out; |
1472 | 1473 | ||
1473 | if (p->real_parent == group_leader) { | 1474 | if (p->real_parent->tgid == group_leader->tgid) { |
1474 | err = -EPERM; | 1475 | err = -EPERM; |
1475 | if (task_session(p) != task_session(group_leader)) | 1476 | if (task_session(p) != task_session(group_leader)) |
1476 | goto out; | 1477 | goto out; |
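
The sys_setpgid() hunk compares thread-group ids instead of task pointers: when the parent forked the child from a thread other than its group leader, p->real_parent points at that thread rather than at group_leader, so the pointer test wrongly concludes the child is not a child of the calling process. A plain C model of the two tests; the struct here is invented, not the kernel's task_struct:

    #include <stdio.h>

    struct task { int pid; int tgid; const struct task *real_parent; };

    int main(void)
    {
        struct task leader = { .pid = 100, .tgid = 100, .real_parent = NULL };
        struct task thread = { .pid = 101, .tgid = 100, .real_parent = NULL };
        struct task child  = { .pid = 200, .tgid = 200, .real_parent = &thread };

        /* child was forked by a sub-thread of the leader's process */
        printf("pointer test matches: %d\n", child.real_parent == &leader);        /* 0 */
        printf("tgid test matches:    %d\n", child.real_parent->tgid == leader.tgid); /* 1 */
        return 0;
    }
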
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9029690f4fae..53a456ebf6d5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -222,8 +222,19 @@ static ctl_table kern_table[] = { | |||
222 | #ifdef CONFIG_SCHED_DEBUG | 222 | #ifdef CONFIG_SCHED_DEBUG |
223 | { | 223 | { |
224 | .ctl_name = CTL_UNNUMBERED, | 224 | .ctl_name = CTL_UNNUMBERED, |
225 | .procname = "sched_granularity_ns", | 225 | .procname = "sched_min_granularity_ns", |
226 | .data = &sysctl_sched_granularity, | 226 | .data = &sysctl_sched_min_granularity, |
227 | .maxlen = sizeof(unsigned int), | ||
228 | .mode = 0644, | ||
229 | .proc_handler = &proc_dointvec_minmax, | ||
230 | .strategy = &sysctl_intvec, | ||
231 | .extra1 = &min_sched_granularity_ns, | ||
232 | .extra2 = &max_sched_granularity_ns, | ||
233 | }, | ||
234 | { | ||
235 | .ctl_name = CTL_UNNUMBERED, | ||
236 | .procname = "sched_latency_ns", | ||
237 | .data = &sysctl_sched_latency, | ||
227 | .maxlen = sizeof(unsigned int), | 238 | .maxlen = sizeof(unsigned int), |
228 | .mode = 0644, | 239 | .mode = 0644, |
229 | .proc_handler = &proc_dointvec_minmax, | 240 | .proc_handler = &proc_dointvec_minmax, |
@@ -283,6 +294,23 @@ static ctl_table kern_table[] = { | |||
283 | .mode = 0644, | 294 | .mode = 0644, |
284 | .proc_handler = &proc_dointvec, | 295 | .proc_handler = &proc_dointvec, |
285 | }, | 296 | }, |
297 | { | ||
298 | .ctl_name = CTL_UNNUMBERED, | ||
299 | .procname = "sched_features", | ||
300 | .data = &sysctl_sched_features, | ||
301 | .maxlen = sizeof(unsigned int), | ||
302 | .mode = 0644, | ||
303 | .proc_handler = &proc_dointvec, | ||
304 | }, | ||
305 | #endif | ||
306 | { | ||
307 | .ctl_name = CTL_UNNUMBERED, | ||
308 | .procname = "sched_compat_yield", | ||
309 | .data = &sysctl_sched_compat_yield, | ||
310 | .maxlen = sizeof(unsigned int), | ||
311 | .mode = 0644, | ||
312 | .proc_handler = &proc_dointvec, | ||
313 | }, | ||
286 | #ifdef CONFIG_PROVE_LOCKING | 314 | #ifdef CONFIG_PROVE_LOCKING |
287 | { | 315 | { |
288 | .ctl_name = CTL_UNNUMBERED, | 316 | .ctl_name = CTL_UNNUMBERED, |
@@ -304,15 +332,6 @@ static ctl_table kern_table[] = { | |||
304 | }, | 332 | }, |
305 | #endif | 333 | #endif |
306 | { | 334 | { |
307 | .ctl_name = CTL_UNNUMBERED, | ||
308 | .procname = "sched_features", | ||
309 | .data = &sysctl_sched_features, | ||
310 | .maxlen = sizeof(unsigned int), | ||
311 | .mode = 0644, | ||
312 | .proc_handler = &proc_dointvec, | ||
313 | }, | ||
314 | #endif | ||
315 | { | ||
316 | .ctl_name = KERN_PANIC, | 335 | .ctl_name = KERN_PANIC, |
317 | .procname = "panic", | 336 | .procname = "panic", |
318 | .data = &panic_timeout, | 337 | .data = &panic_timeout, |
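
The sysctl hunks replace the old sched_granularity_ns knob with sched_min_granularity_ns and sched_latency_ns (both range-clamped through proc_dointvec_minmax), add sched_compat_yield, and relocate the sched_features entry. All of them are plain integer files with mode 0644 under /proc/sys/kernel, so they can be read like any procfs file; a small userspace reader, assuming a kernel that carries this patch:

    #include <stdio.h>

    int main(void)
    {
        unsigned int latency_ns;
        FILE *f = fopen("/proc/sys/kernel/sched_latency_ns", "r");

        if (!f) {
            perror("sched_latency_ns");
            return 1;
        }
        if (fscanf(f, "%u", &latency_ns) != 1) {
            fclose(f);
            return 1;
        }
        fclose(f);
        printf("CFS latency target: %u ns\n", latency_ns);  /* writable by root */
        return 0;
    }
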
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index cd91237dbfe3..de6a2d6b3ebb 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -226,7 +226,7 @@ static void sync_cmos_clock(unsigned long dummy) | |||
226 | 226 | ||
227 | static void notify_cmos_timer(void) | 227 | static void notify_cmos_timer(void) |
228 | { | 228 | { |
229 | if (no_sync_cmos_clock) | 229 | if (!no_sync_cmos_clock) |
230 | mod_timer(&sync_cmos_timer, jiffies + 1); | 230 | mod_timer(&sync_cmos_timer, jiffies + 1); |
231 | } | 231 | } |
232 | 232 | ||
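
The one-character ntp.c fix is easy to misread because the flag is a negative: no_sync_cmos_clock set means "do not touch the CMOS clock", so the sync timer must be armed when the flag is clear; the old test armed it only when syncing had been disabled. A tiny model of the corrected sense:

    #include <stdbool.h>
    #include <stdio.h>

    static bool no_sync_cmos_clock;     /* false by default: syncing is wanted */

    static void notify_cmos_timer_model(void)
    {
        if (!no_sync_cmos_clock)        /* fixed sense: arm when syncing is enabled */
            printf("mod_timer(&sync_cmos_timer, jiffies + 1)\n");
    }

    int main(void)
    {
        notify_cmos_timer_model();
        return 0;
    }
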
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index db8e0f3d409b..0962e0577660 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -383,11 +383,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force) | |||
383 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 383 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) |
384 | { | 384 | { |
385 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 385 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); |
386 | 386 | return 0; | |
387 | if(!cpus_empty(tick_broadcast_oneshot_mask)) | ||
388 | tick_broadcast_set_event(ktime_get(), 1); | ||
389 | |||
390 | return cpu_isset(smp_processor_id(), tick_broadcast_oneshot_mask); | ||
391 | } | 387 | } |
392 | 388 | ||
393 | /* | 389 | /* |
@@ -549,20 +545,17 @@ void tick_broadcast_switch_to_oneshot(void) | |||
549 | */ | 545 | */ |
550 | void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | 546 | void tick_shutdown_broadcast_oneshot(unsigned int *cpup) |
551 | { | 547 | { |
552 | struct clock_event_device *bc; | ||
553 | unsigned long flags; | 548 | unsigned long flags; |
554 | unsigned int cpu = *cpup; | 549 | unsigned int cpu = *cpup; |
555 | 550 | ||
556 | spin_lock_irqsave(&tick_broadcast_lock, flags); | 551 | spin_lock_irqsave(&tick_broadcast_lock, flags); |
557 | 552 | ||
558 | bc = tick_broadcast_device.evtdev; | 553 | /* |
554 | * Clear the broadcast mask flag for the dead cpu, but do not | ||
555 | * stop the broadcast device! | ||
556 | */ | ||
559 | cpu_clear(cpu, tick_broadcast_oneshot_mask); | 557 | cpu_clear(cpu, tick_broadcast_oneshot_mask); |
560 | 558 | ||
561 | if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) { | ||
562 | if (bc && cpus_empty(tick_broadcast_oneshot_mask)) | ||
563 | clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); | ||
564 | } | ||
565 | |||
566 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 559 | spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
567 | } | 560 | } |
568 | 561 | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b416995b9757..8c3fef1db09c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -160,6 +160,18 @@ void tick_nohz_stop_sched_tick(void) | |||
160 | cpu = smp_processor_id(); | 160 | cpu = smp_processor_id(); |
161 | ts = &per_cpu(tick_cpu_sched, cpu); | 161 | ts = &per_cpu(tick_cpu_sched, cpu); |
162 | 162 | ||
163 | /* | ||
164 | * If this cpu is offline and it is the one which updates | ||
165 | * jiffies, then give up the assignment and let it be taken by | ||
166 | * the cpu which runs the tick timer next. If we don't drop | ||
167 | * this here the jiffies might be stale and do_timer() never | ||
168 | * invoked. | ||
169 | */ | ||
170 | if (unlikely(!cpu_online(cpu))) { | ||
171 | if (cpu == tick_do_timer_cpu) | ||
172 | tick_do_timer_cpu = -1; | ||
173 | } | ||
174 | |||
163 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | 175 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) |
164 | goto end; | 176 | goto end; |
165 | 177 | ||
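
The tick-sched.c hunk implements what its comment describes: tick_do_timer_cpu names the CPU responsible for calling do_timer(), and an offline CPU that still holds that duty sets it to -1 so another CPU can take it over, otherwise jiffies would go stale. A plausible userspace model of the handoff; that the next CPU to service a tick claims the -1 slot is an assumption about code outside this hunk:

    #include <stdio.h>

    static int tick_do_timer_cpu = 3;       /* CPU currently charged with do_timer() */

    static void model_stop_sched_tick(int cpu, int online)
    {
        if (!online && cpu == tick_do_timer_cpu)
            tick_do_timer_cpu = -1;         /* give the duty up, as in the hunk */
    }

    static void model_sched_tick(int cpu)
    {
        if (tick_do_timer_cpu == -1)
            tick_do_timer_cpu = cpu;        /* assumed: next ticking CPU claims it */
        if (cpu == tick_do_timer_cpu)
            printf("cpu%d: do_timer()\n", cpu);
    }

    int main(void)
    {
        model_stop_sched_tick(3, 0);        /* cpu3 goes offline holding the duty */
        model_sched_tick(1);                /* cpu1 takes over jiffies updates */
        return 0;
    }
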
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index acc417b5a9b7..4ad79f6bdec6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -217,6 +217,7 @@ static void change_clocksource(void) | |||
217 | } | 217 | } |
218 | #else | 218 | #else |
219 | static inline void change_clocksource(void) { } | 219 | static inline void change_clocksource(void) { } |
220 | static inline s64 __get_nsec_offset(void) { return 0; } | ||
220 | #endif | 221 | #endif |
221 | 222 | ||
222 | /** | 223 | /** |
@@ -280,6 +281,8 @@ void __init timekeeping_init(void) | |||
280 | static int timekeeping_suspended; | 281 | static int timekeeping_suspended; |
281 | /* time in seconds when suspend began */ | 282 | /* time in seconds when suspend began */ |
282 | static unsigned long timekeeping_suspend_time; | 283 | static unsigned long timekeeping_suspend_time; |
284 | /* xtime offset when we went into suspend */ | ||
285 | static s64 timekeeping_suspend_nsecs; | ||
283 | 286 | ||
284 | /** | 287 | /** |
285 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 288 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
@@ -305,6 +308,8 @@ static int timekeeping_resume(struct sys_device *dev) | |||
305 | wall_to_monotonic.tv_sec -= sleep_length; | 308 | wall_to_monotonic.tv_sec -= sleep_length; |
306 | total_sleep_time += sleep_length; | 309 | total_sleep_time += sleep_length; |
307 | } | 310 | } |
311 | /* Make sure that we have the correct xtime reference */ | ||
312 | timespec_add_ns(&xtime, timekeeping_suspend_nsecs); | ||
308 | /* re-base the last cycle value */ | 313 | /* re-base the last cycle value */ |
309 | clock->cycle_last = clocksource_read(clock); | 314 | clock->cycle_last = clocksource_read(clock); |
310 | clock->error = 0; | 315 | clock->error = 0; |
@@ -325,9 +330,12 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | |||
325 | { | 330 | { |
326 | unsigned long flags; | 331 | unsigned long flags; |
327 | 332 | ||
333 | timekeeping_suspend_time = read_persistent_clock(); | ||
334 | |||
328 | write_seqlock_irqsave(&xtime_lock, flags); | 335 | write_seqlock_irqsave(&xtime_lock, flags); |
336 | /* Get the current xtime offset */ | ||
337 | timekeeping_suspend_nsecs = __get_nsec_offset(); | ||
329 | timekeeping_suspended = 1; | 338 | timekeeping_suspended = 1; |
330 | timekeeping_suspend_time = read_persistent_clock(); | ||
331 | write_sequnlock_irqrestore(&xtime_lock, flags); | 339 | write_sequnlock_irqrestore(&xtime_lock, flags); |
332 | 340 | ||
333 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 341 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
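
The timekeeping hunks sample the sub-tick clocksource offset (__get_nsec_offset()) under xtime_lock at suspend and fold it back into xtime at resume, so the nanoseconds that xtime had not yet absorbed are not dropped across a suspend cycle; the persistent-clock read also moves ahead of the seqlock. The bookkeeping, as a userspace sketch with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        long long xtime_ns  = 1000000000LL;        /* last committed wall time */
        long long offset_ns = 371204;              /* __get_nsec_offset() sample at suspend */
        long long slept_ns  = 5LL * 1000000000LL;  /* sleep length from the persistent clock */

        /* resume: xtime absorbs the saved offset plus the time spent suspended */
        long long resumed_ns = xtime_ns + offset_ns + slept_ns;
        printf("wall time after resume: %lld ns\n", resumed_ns);
        return 0;
    }
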
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 3c38fb5eae1b..c36bb7ed0301 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -327,8 +327,9 @@ static int tstats_show(struct seq_file *m, void *v) | |||
327 | ms = 1; | 327 | ms = 1; |
328 | 328 | ||
329 | if (events && period.tv_sec) | 329 | if (events && period.tv_sec) |
330 | seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, | 330 | seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", |
331 | events / period.tv_sec, events * 1000 / ms); | 331 | events, events * 1000 / ms, |
332 | (events * 1000000 / ms) % 1000); | ||
332 | else | 333 | else |
333 | seq_printf(m, "%ld total events\n", events); | 334 | seq_printf(m, "%ld total events\n", events); |
334 | 335 | ||
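
The old tstats_show() line printed events/period.tv_sec as the integer part but events*1000/ms as the fraction, which is a different quantity altogether; the replacement derives both parts from the millisecond-scaled period, giving three correct decimal digits. Worked through with concrete numbers:

    #include <stdio.h>

    int main(void)
    {
        long events = 1500, ms = 3200;   /* 1500 events over a 3.2 s period */

        printf("%ld total events, %ld.%03ld events/sec\n",
               events, events * 1000 / ms, (events * 1000000 / ms) % 1000);
        /* prints: 1500 total events, 468.750 events/sec (1500 / 3.2 = 468.75) */
        return 0;
    }
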
diff --git a/kernel/user.c b/kernel/user.c index e7d11cef6998..9ca2848fc356 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -55,25 +55,22 @@ struct user_struct root_user = { | |||
55 | /* | 55 | /* |
56 | * These routines must be called with the uidhash spinlock held! | 56 | * These routines must be called with the uidhash spinlock held! |
57 | */ | 57 | */ |
58 | static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) | 58 | static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) |
59 | { | 59 | { |
60 | list_add(&up->uidhash_list, hashent); | 60 | hlist_add_head(&up->uidhash_node, hashent); |
61 | } | 61 | } |
62 | 62 | ||
63 | static inline void uid_hash_remove(struct user_struct *up) | 63 | static inline void uid_hash_remove(struct user_struct *up) |
64 | { | 64 | { |
65 | list_del(&up->uidhash_list); | 65 | hlist_del_init(&up->uidhash_node); |
66 | } | 66 | } |
67 | 67 | ||
68 | static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) | 68 | static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) |
69 | { | 69 | { |
70 | struct list_head *up; | 70 | struct user_struct *user; |
71 | 71 | struct hlist_node *h; | |
72 | list_for_each(up, hashent) { | ||
73 | struct user_struct *user; | ||
74 | |||
75 | user = list_entry(up, struct user_struct, uidhash_list); | ||
76 | 72 | ||
73 | hlist_for_each_entry(user, h, hashent, uidhash_node) { | ||
77 | if(user->uid == uid) { | 74 | if(user->uid == uid) { |
78 | atomic_inc(&user->__count); | 75 | atomic_inc(&user->__count); |
79 | return user; | 76 | return user; |
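
kernel/user.c converts the uid hash from list_head buckets to hlist_head buckets. An hlist head is a single pointer where a list_head is two, so the per-namespace bucket array shrinks by half, which is the usual motivation for hlist-based hash tables; lookups only change in the iteration idiom (hlist_for_each_entry). A userspace illustration of the size difference, with mirror definitions of the two bucket types; UIDHASH_SZ here is illustrative, the real value is configuration-dependent:

    #include <stdio.h>

    struct list_head  { struct list_head *next, *prev; };
    struct hlist_node { struct hlist_node *next, **pprev; };
    struct hlist_head { struct hlist_node *first; };

    #define UIDHASH_SZ 128   /* illustrative only */

    int main(void)
    {
        printf("list_head buckets:  %zu bytes\n", sizeof(struct list_head[UIDHASH_SZ]));
        printf("hlist_head buckets: %zu bytes\n", sizeof(struct hlist_head[UIDHASH_SZ]));
        return 0;
    }
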
@@ -122,7 +119,7 @@ void free_uid(struct user_struct *up) | |||
122 | 119 | ||
123 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | 120 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) |
124 | { | 121 | { |
125 | struct list_head *hashent = uidhashentry(ns, uid); | 122 | struct hlist_head *hashent = uidhashentry(ns, uid); |
126 | struct user_struct *up; | 123 | struct user_struct *up; |
127 | 124 | ||
128 | spin_lock_irq(&uidhash_lock); | 125 | spin_lock_irq(&uidhash_lock); |
@@ -202,6 +199,30 @@ void switch_uid(struct user_struct *new_user) | |||
202 | suid_keys(current); | 199 | suid_keys(current); |
203 | } | 200 | } |
204 | 201 | ||
202 | void release_uids(struct user_namespace *ns) | ||
203 | { | ||
204 | int i; | ||
205 | unsigned long flags; | ||
206 | struct hlist_head *head; | ||
207 | struct hlist_node *nd; | ||
208 | |||
209 | spin_lock_irqsave(&uidhash_lock, flags); | ||
210 | /* | ||
211 | * collapse the chains so that the user_struct-s will | ||
212 | * be still alive, but not in hashes. subsequent free_uid() | ||
213 | * will free them. | ||
214 | */ | ||
215 | for (i = 0; i < UIDHASH_SZ; i++) { | ||
216 | head = ns->uidhash_table + i; | ||
217 | while (!hlist_empty(head)) { | ||
218 | nd = head->first; | ||
219 | hlist_del_init(nd); | ||
220 | } | ||
221 | } | ||
222 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
223 | |||
224 | free_uid(ns->root_user); | ||
225 | } | ||
205 | 226 | ||
206 | static int __init uid_cache_init(void) | 227 | static int __init uid_cache_init(void) |
207 | { | 228 | { |
@@ -211,7 +232,7 @@ static int __init uid_cache_init(void) | |||
211 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | 232 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); |
212 | 233 | ||
213 | for(n = 0; n < UIDHASH_SZ; ++n) | 234 | for(n = 0; n < UIDHASH_SZ; ++n) |
214 | INIT_LIST_HEAD(init_user_ns.uidhash_table + n); | 235 | INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); |
215 | 236 | ||
216 | /* Insert the root user immediately (init already runs as root) */ | 237 | /* Insert the root user immediately (init already runs as root) */ |
217 | spin_lock_irq(&uidhash_lock); | 238 | spin_lock_irq(&uidhash_lock); |
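
The release_uids() helper added above (and called from free_user_ns() in the next file) only unhashes whatever user_structs are still in the namespace's table and then drops the root user's reference; as its comment says, the structs stay alive until their remaining holders call free_uid(). A refcount-style userspace model of that contract, with invented names rather than kernel API:

    #include <stdio.h>
    #include <stdlib.h>

    struct user_model { int refcount; int hashed; };

    static void put_user_model(struct user_model *u)
    {
        if (--u->refcount == 0) {
            printf("refcount hit 0, freeing\n");
            free(u);
        }
    }

    int main(void)
    {
        struct user_model *u = malloc(sizeof(*u));
        u->refcount = 1;                 /* some task still holds this uid */
        u->hashed = 1;

        u->hashed = 0;                   /* release_uids(): unhash only, never free */
        printf("unhashed, still alive (refcount=%d)\n", u->refcount);

        put_user_model(u);               /* the holder's later free_uid() frees it */
        return 0;
    }
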
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index d055d987850c..7af90fc4f0fd 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -39,7 +39,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) | |||
39 | kref_init(&ns->kref); | 39 | kref_init(&ns->kref); |
40 | 40 | ||
41 | for (n = 0; n < UIDHASH_SZ; ++n) | 41 | for (n = 0; n < UIDHASH_SZ; ++n) |
42 | INIT_LIST_HEAD(ns->uidhash_table + n); | 42 | INIT_HLIST_HEAD(ns->uidhash_table + n); |
43 | 43 | ||
44 | /* Insert new root user. */ | 44 | /* Insert new root user. */ |
45 | ns->root_user = alloc_uid(ns, 0); | 45 | ns->root_user = alloc_uid(ns, 0); |
@@ -81,6 +81,7 @@ void free_user_ns(struct kref *kref) | |||
81 | struct user_namespace *ns; | 81 | struct user_namespace *ns; |
82 | 82 | ||
83 | ns = container_of(kref, struct user_namespace, kref); | 83 | ns = container_of(kref, struct user_namespace, kref); |
84 | release_uids(ns); | ||
84 | kfree(ns); | 85 | kfree(ns); |
85 | } | 86 | } |
86 | 87 | ||
diff --git a/kernel/utsname.c b/kernel/utsname.c index 9d8180a0f0d8..816d7b24fa03 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -28,7 +28,9 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) | |||
28 | if (!ns) | 28 | if (!ns) |
29 | return ERR_PTR(-ENOMEM); | 29 | return ERR_PTR(-ENOMEM); |
30 | 30 | ||
31 | down_read(&uts_sem); | ||
31 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); | 32 | memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); |
33 | up_read(&uts_sem); | ||
32 | kref_init(&ns->kref); | 34 | kref_init(&ns->kref); |
33 | return ns; | 35 | return ns; |
34 | } | 36 | } |
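
The utsname.c hunk takes uts_sem for read around the memcpy so the cloned namespace never copies a half-updated name while a writer such as sethostname() is updating it; that the writer side holds the semaphore for write is an assumption here, not shown in this hunk. The same reader/writer pattern in a userspace sketch:

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    static pthread_rwlock_t uts_lock = PTHREAD_RWLOCK_INITIALIZER;
    static char nodename[65] = "old-host";

    static void set_hostname(const char *name)      /* models the writer side */
    {
        pthread_rwlock_wrlock(&uts_lock);
        strncpy(nodename, name, sizeof(nodename) - 1);
        pthread_rwlock_unlock(&uts_lock);
    }

    static void clone_name(char *dst, size_t len)   /* models clone_uts_ns() */
    {
        pthread_rwlock_rdlock(&uts_lock);           /* the added down_read() */
        memcpy(dst, nodename, len);
        pthread_rwlock_unlock(&uts_lock);
    }

    int main(void)
    {
        char copy[sizeof(nodename)];

        set_hostname("new-host");
        clone_name(copy, sizeof(copy));
        printf("cloned nodename: %s\n", copy);
        return 0;
    }
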
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 58e5c152a6bb..e080d1d744cc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -635,7 +635,7 @@ int keventd_up(void) | |||
635 | int current_is_keventd(void) | 635 | int current_is_keventd(void) |
636 | { | 636 | { |
637 | struct cpu_workqueue_struct *cwq; | 637 | struct cpu_workqueue_struct *cwq; |
638 | int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */ | 638 | int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ |
639 | int ret = 0; | 639 | int ret = 0; |
640 | 640 | ||
641 | BUG_ON(!keventd_wq); | 641 | BUG_ON(!keventd_wq); |
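
The workqueue.c change switches to raw_smp_processor_id(), which skips the check smp_processor_id() performs when called from preemptible context with preemption debugging enabled. The existing comment carries the argument: keventd threads are per-cpu bound, so the sampled id is only used in a way that tolerates the caller migrating; that this silences a debug warning rather than changing behaviour is an assumption here. A userspace analogue of "read the cpu id, but treat it as advisory":

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        /* Like raw_smp_processor_id() in a preemptible caller: the value may
         * be stale by the time it is used, which is acceptable only when the
         * surrounding logic does not require still running on that CPU. */
        int cpu = sched_getcpu();
        printf("sampled cpu %d (advisory, this thread is not pinned)\n", cpu);
        return 0;
    }
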