aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/async.c94
-rw-r--r--kernel/cgroup.c33
-rw-r--r--kernel/cpuset.c13
-rw-r--r--kernel/dma-coherent.c47
-rw-r--r--kernel/exit.c3
-rw-r--r--kernel/fork.c33
-rw-r--r--kernel/futex.c53
-rw-r--r--kernel/hrtimer.c45
-rw-r--r--kernel/irq/chip.c9
-rw-r--r--kernel/irq/handle.c55
-rw-r--r--kernel/irq/internals.h1
-rw-r--r--kernel/irq/manage.c202
-rw-r--r--kernel/irq/numa_migrate.c18
-rw-r--r--kernel/irq/spurious.c14
-rw-r--r--kernel/itimer.c4
-rw-r--r--kernel/kallsyms.c16
-rw-r--r--kernel/kexec.c7
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/latencytop.c83
-rw-r--r--kernel/module.c60
-rw-r--r--kernel/posix-cpu-timers.c196
-rw-r--r--kernel/power/Makefile5
-rw-r--r--kernel/power/console.c6
-rw-r--r--kernel/power/disk.c32
-rw-r--r--kernel/power/main.c34
-rw-r--r--kernel/power/swap.c5
-rw-r--r--kernel/power/user.c8
-rw-r--r--kernel/printk.c15
-rw-r--r--kernel/profile.c3
-rw-r--r--kernel/rcuclassic.c6
-rw-r--r--kernel/rcupdate.c12
-rw-r--r--kernel/rcupreempt.c3
-rw-r--r--kernel/rcutree.c6
-rw-r--r--kernel/relay.c6
-rw-r--r--kernel/sched.c1033
-rw-r--r--kernel/sched_clock.c30
-rw-r--r--kernel/sched_debug.c8
-rw-r--r--kernel/sched_fair.c80
-rw-r--r--kernel/sched_features.h3
-rw-r--r--kernel/sched_rt.c541
-rw-r--r--kernel/sched_stats.h55
-rw-r--r--kernel/seccomp.c7
-rw-r--r--kernel/signal.c18
-rw-r--r--kernel/smp.c36
-rw-r--r--kernel/softirq.c1
-rw-r--r--kernel/softlockup.c9
-rw-r--r--kernel/sys.c47
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/sysctl_check.c1
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/clockevents.c20
-rw-r--r--kernel/time/clocksource.c76
-rw-r--r--kernel/time/ntp.c444
-rw-r--r--kernel/time/tick-common.c26
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/timecompare.c191
-rw-r--r--kernel/timer.c110
-rw-r--r--kernel/trace/Kconfig25
-rw-r--r--kernel/trace/ftrace.c38
-rw-r--r--kernel/trace/ring_buffer.c15
-rw-r--r--kernel/trace/trace.c5
-rw-r--r--kernel/trace/trace_irqsoff.c1
-rw-r--r--kernel/trace/trace_mmiotrace.c14
-rw-r--r--kernel/trace/trace_sched_wakeup.c1
-rw-r--r--kernel/trace/trace_selftest.c19
-rw-r--r--kernel/tsacct.c6
-rw-r--r--kernel/user.c37
-rw-r--r--kernel/user_namespace.c21
-rw-r--r--kernel/wait.c59
-rw-r--r--kernel/workqueue.c20
71 files changed, 2936 insertions, 1202 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 170a9213c1b6..e4791b3ba55d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_UID16) += uid16.o
51obj-$(CONFIG_MODULES) += module.o 51obj-$(CONFIG_MODULES) += module.o
52obj-$(CONFIG_KALLSYMS) += kallsyms.o 52obj-$(CONFIG_KALLSYMS) += kallsyms.o
53obj-$(CONFIG_PM) += power/ 53obj-$(CONFIG_PM) += power/
54obj-$(CONFIG_FREEZER) += power/
54obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 55obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
55obj-$(CONFIG_KEXEC) += kexec.o 56obj-$(CONFIG_KEXEC) += kexec.o
56obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 57obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
diff --git a/kernel/async.c b/kernel/async.c
index 608b32b42812..f565891f2c9b 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -54,6 +54,7 @@ asynchronous and synchronous parts of the kernel.
54#include <linux/sched.h> 54#include <linux/sched.h>
55#include <linux/init.h> 55#include <linux/init.h>
56#include <linux/kthread.h> 56#include <linux/kthread.h>
57#include <linux/delay.h>
57#include <asm/atomic.h> 58#include <asm/atomic.h>
58 59
59static async_cookie_t next_cookie = 1; 60static async_cookie_t next_cookie = 1;
@@ -132,21 +133,23 @@ static void run_one_entry(void)
132 entry = list_first_entry(&async_pending, struct async_entry, list); 133 entry = list_first_entry(&async_pending, struct async_entry, list);
133 134
134 /* 2) move it to the running queue */ 135 /* 2) move it to the running queue */
135 list_del(&entry->list); 136 list_move_tail(&entry->list, entry->running);
136 list_add_tail(&entry->list, &async_running);
137 spin_unlock_irqrestore(&async_lock, flags); 137 spin_unlock_irqrestore(&async_lock, flags);
138 138
139 /* 3) run it (and print duration)*/ 139 /* 3) run it (and print duration)*/
140 if (initcall_debug && system_state == SYSTEM_BOOTING) { 140 if (initcall_debug && system_state == SYSTEM_BOOTING) {
141 printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current)); 141 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
142 entry->func, task_pid_nr(current));
142 calltime = ktime_get(); 143 calltime = ktime_get();
143 } 144 }
144 entry->func(entry->data, entry->cookie); 145 entry->func(entry->data, entry->cookie);
145 if (initcall_debug && system_state == SYSTEM_BOOTING) { 146 if (initcall_debug && system_state == SYSTEM_BOOTING) {
146 rettime = ktime_get(); 147 rettime = ktime_get();
147 delta = ktime_sub(rettime, calltime); 148 delta = ktime_sub(rettime, calltime);
148 printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie, 149 printk("initcall %lli_%pF returned 0 after %lld usecs\n",
149 entry->func, ktime_to_ns(delta) >> 10); 150 (long long)entry->cookie,
151 entry->func,
152 (long long)ktime_to_ns(delta) >> 10);
150 } 153 }
151 154
152 /* 4) remove it from the running queue */ 155 /* 4) remove it from the running queue */
@@ -205,18 +208,44 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
205 return newcookie; 208 return newcookie;
206} 209}
207 210
211/**
212 * async_schedule - schedule a function for asynchronous execution
213 * @ptr: function to execute asynchronously
214 * @data: data pointer to pass to the function
215 *
216 * Returns an async_cookie_t that may be used for checkpointing later.
217 * Note: This function may be called from atomic or non-atomic contexts.
218 */
208async_cookie_t async_schedule(async_func_ptr *ptr, void *data) 219async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
209{ 220{
210 return __async_schedule(ptr, data, &async_pending); 221 return __async_schedule(ptr, data, &async_running);
211} 222}
212EXPORT_SYMBOL_GPL(async_schedule); 223EXPORT_SYMBOL_GPL(async_schedule);
213 224
214async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running) 225/**
226 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
227 * @ptr: function to execute asynchronously
228 * @data: data pointer to pass to the function
229 * @running: running list for the domain
230 *
231 * Returns an async_cookie_t that may be used for checkpointing later.
232 * @running may be used in the async_synchronize_*_domain() functions
233 * to wait within a certain synchronization domain rather than globally.
234 * A synchronization domain is specified via the running queue @running to use.
235 * Note: This function may be called from atomic or non-atomic contexts.
236 */
237async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
238 struct list_head *running)
215{ 239{
216 return __async_schedule(ptr, data, running); 240 return __async_schedule(ptr, data, running);
217} 241}
218EXPORT_SYMBOL_GPL(async_schedule_special); 242EXPORT_SYMBOL_GPL(async_schedule_domain);
219 243
244/**
245 * async_synchronize_full - synchronize all asynchronous function calls
246 *
247 * This function waits until all asynchronous function calls have been done.
248 */
220void async_synchronize_full(void) 249void async_synchronize_full(void)
221{ 250{
222 do { 251 do {
@@ -225,13 +254,30 @@ void async_synchronize_full(void)
225} 254}
226EXPORT_SYMBOL_GPL(async_synchronize_full); 255EXPORT_SYMBOL_GPL(async_synchronize_full);
227 256
228void async_synchronize_full_special(struct list_head *list) 257/**
258 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
259 * @list: running list to synchronize on
260 *
261 * This function waits until all asynchronous function calls for the
262 * synchronization domain specified by the running list @list have been done.
263 */
264void async_synchronize_full_domain(struct list_head *list)
229{ 265{
230 async_synchronize_cookie_special(next_cookie, list); 266 async_synchronize_cookie_domain(next_cookie, list);
231} 267}
232EXPORT_SYMBOL_GPL(async_synchronize_full_special); 268EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
233 269
234void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running) 270/**
271 * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
272 * @cookie: async_cookie_t to use as checkpoint
273 * @running: running list to synchronize on
274 *
275 * This function waits until all asynchronous function calls for the
276 * synchronization domain specified by the running list @list submitted
277 * prior to @cookie have been done.
278 */
279void async_synchronize_cookie_domain(async_cookie_t cookie,
280 struct list_head *running)
235{ 281{
236 ktime_t starttime, delta, endtime; 282 ktime_t starttime, delta, endtime;
237 283
@@ -247,14 +293,22 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r
247 delta = ktime_sub(endtime, starttime); 293 delta = ktime_sub(endtime, starttime);
248 294
249 printk("async_continuing @ %i after %lli usec\n", 295 printk("async_continuing @ %i after %lli usec\n",
250 task_pid_nr(current), ktime_to_ns(delta) >> 10); 296 task_pid_nr(current),
297 (long long)ktime_to_ns(delta) >> 10);
251 } 298 }
252} 299}
253EXPORT_SYMBOL_GPL(async_synchronize_cookie_special); 300EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
254 301
302/**
303 * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
304 * @cookie: async_cookie_t to use as checkpoint
305 *
306 * This function waits until all asynchronous function calls prior to @cookie
307 * have been done.
308 */
255void async_synchronize_cookie(async_cookie_t cookie) 309void async_synchronize_cookie(async_cookie_t cookie)
256{ 310{
257 async_synchronize_cookie_special(cookie, &async_running); 311 async_synchronize_cookie_domain(cookie, &async_running);
258} 312}
259EXPORT_SYMBOL_GPL(async_synchronize_cookie); 313EXPORT_SYMBOL_GPL(async_synchronize_cookie);
260 314
@@ -315,7 +369,11 @@ static int async_manager_thread(void *unused)
315 ec = atomic_read(&entry_count); 369 ec = atomic_read(&entry_count);
316 370
317 while (tc < ec && tc < MAX_THREADS) { 371 while (tc < ec && tc < MAX_THREADS) {
318 kthread_run(async_thread, NULL, "async/%i", tc); 372 if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
373 tc))) {
374 msleep(100);
375 continue;
376 }
319 atomic_inc(&thread_count); 377 atomic_inc(&thread_count);
320 tc++; 378 tc++;
321 } 379 }
@@ -330,7 +388,9 @@ static int async_manager_thread(void *unused)
330static int __init async_init(void) 388static int __init async_init(void)
331{ 389{
332 if (async_enabled) 390 if (async_enabled)
333 kthread_run(async_manager_thread, NULL, "async/mgr"); 391 if (IS_ERR(kthread_run(async_manager_thread, NULL,
392 "async/mgr")))
393 async_enabled = 0;
334 return 0; 394 return 0;
335} 395}
336 396
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c29831076e7a..9edb5c4b79b4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1115,13 +1115,15 @@ static void cgroup_kill_sb(struct super_block *sb) {
1115 } 1115 }
1116 write_unlock(&css_set_lock); 1116 write_unlock(&css_set_lock);
1117 1117
1118 list_del(&root->root_list); 1118 if (!list_empty(&root->root_list)) {
1119 root_count--; 1119 list_del(&root->root_list);
1120 root_count--;
1121 }
1120 1122
1121 mutex_unlock(&cgroup_mutex); 1123 mutex_unlock(&cgroup_mutex);
1122 1124
1123 kfree(root);
1124 kill_litter_super(sb); 1125 kill_litter_super(sb);
1126 kfree(root);
1125} 1127}
1126 1128
1127static struct file_system_type cgroup_fs_type = { 1129static struct file_system_type cgroup_fs_type = {
@@ -2349,7 +2351,7 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2349 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2351 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2350 struct cgroup_subsys *ss = subsys[i]; 2352 struct cgroup_subsys *ss = subsys[i];
2351 if (ss->root == root) 2353 if (ss->root == root)
2352 mutex_lock_nested(&ss->hierarchy_mutex, i); 2354 mutex_lock(&ss->hierarchy_mutex);
2353 } 2355 }
2354} 2356}
2355 2357
@@ -2434,7 +2436,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2434 2436
2435 err_remove: 2437 err_remove:
2436 2438
2439 cgroup_lock_hierarchy(root);
2437 list_del(&cgrp->sibling); 2440 list_del(&cgrp->sibling);
2441 cgroup_unlock_hierarchy(root);
2438 root->number_of_cgroups--; 2442 root->number_of_cgroups--;
2439 2443
2440 err_destroy: 2444 err_destroy:
@@ -2507,7 +2511,7 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
2507 for_each_subsys(cgrp->root, ss) { 2511 for_each_subsys(cgrp->root, ss) {
2508 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 2512 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2509 int refcnt; 2513 int refcnt;
2510 do { 2514 while (1) {
2511 /* We can only remove a CSS with a refcnt==1 */ 2515 /* We can only remove a CSS with a refcnt==1 */
2512 refcnt = atomic_read(&css->refcnt); 2516 refcnt = atomic_read(&css->refcnt);
2513 if (refcnt > 1) { 2517 if (refcnt > 1) {
@@ -2521,7 +2525,10 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
2521 * css_tryget() to spin until we set the 2525 * css_tryget() to spin until we set the
2522 * CSS_REMOVED bits or abort 2526 * CSS_REMOVED bits or abort
2523 */ 2527 */
2524 } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); 2528 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
2529 break;
2530 cpu_relax();
2531 }
2525 } 2532 }
2526 done: 2533 done:
2527 for_each_subsys(cgrp->root, ss) { 2534 for_each_subsys(cgrp->root, ss) {
@@ -2630,6 +2637,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2630 BUG_ON(!list_empty(&init_task.tasks)); 2637 BUG_ON(!list_empty(&init_task.tasks));
2631 2638
2632 mutex_init(&ss->hierarchy_mutex); 2639 mutex_init(&ss->hierarchy_mutex);
2640 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
2633 ss->active = 1; 2641 ss->active = 1;
2634} 2642}
2635 2643
@@ -2991,20 +2999,21 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2991 mutex_unlock(&cgroup_mutex); 2999 mutex_unlock(&cgroup_mutex);
2992 return 0; 3000 return 0;
2993 } 3001 }
2994 task_lock(tsk);
2995 cg = tsk->cgroups;
2996 parent = task_cgroup(tsk, subsys->subsys_id);
2997 3002
2998 /* Pin the hierarchy */ 3003 /* Pin the hierarchy */
2999 if (!atomic_inc_not_zero(&parent->root->sb->s_active)) { 3004 if (!atomic_inc_not_zero(&root->sb->s_active)) {
3000 /* We race with the final deactivate_super() */ 3005 /* We race with the final deactivate_super() */
3001 mutex_unlock(&cgroup_mutex); 3006 mutex_unlock(&cgroup_mutex);
3002 return 0; 3007 return 0;
3003 } 3008 }
3004 3009
3005 /* Keep the cgroup alive */ 3010 /* Keep the cgroup alive */
3011 task_lock(tsk);
3012 parent = task_cgroup(tsk, subsys->subsys_id);
3013 cg = tsk->cgroups;
3006 get_css_set(cg); 3014 get_css_set(cg);
3007 task_unlock(tsk); 3015 task_unlock(tsk);
3016
3008 mutex_unlock(&cgroup_mutex); 3017 mutex_unlock(&cgroup_mutex);
3009 3018
3010 /* Now do the VFS work to create a cgroup */ 3019 /* Now do the VFS work to create a cgroup */
@@ -3043,7 +3052,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
3043 mutex_unlock(&inode->i_mutex); 3052 mutex_unlock(&inode->i_mutex);
3044 put_css_set(cg); 3053 put_css_set(cg);
3045 3054
3046 deactivate_super(parent->root->sb); 3055 deactivate_super(root->sb);
3047 /* The cgroup is still accessible in the VFS, but 3056 /* The cgroup is still accessible in the VFS, but
3048 * we're not going to try to rmdir() it at this 3057 * we're not going to try to rmdir() it at this
3049 * point. */ 3058 * point. */
@@ -3069,7 +3078,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
3069 mutex_lock(&cgroup_mutex); 3078 mutex_lock(&cgroup_mutex);
3070 put_css_set(cg); 3079 put_css_set(cg);
3071 mutex_unlock(&cgroup_mutex); 3080 mutex_unlock(&cgroup_mutex);
3072 deactivate_super(parent->root->sb); 3081 deactivate_super(root->sb);
3073 return ret; 3082 return ret;
3074} 3083}
3075 3084
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a85678865c5e..f76db9dcaa05 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,6 +61,14 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62 62
63/* 63/*
64 * Workqueue for cpuset related tasks.
65 *
66 * Using kevent workqueue may cause deadlock when memory_migrate
67 * is set. So we create a separate workqueue thread for cpuset.
68 */
69static struct workqueue_struct *cpuset_wq;
70
71/*
64 * Tracks how many cpusets are currently defined in system. 72 * Tracks how many cpusets are currently defined in system.
65 * When there is only one cpuset (the root cpuset) we can 73 * When there is only one cpuset (the root cpuset) we can
66 * short circuit some hooks. 74 * short circuit some hooks.
@@ -831,7 +839,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
831 */ 839 */
832static void async_rebuild_sched_domains(void) 840static void async_rebuild_sched_domains(void)
833{ 841{
834 schedule_work(&rebuild_sched_domains_work); 842 queue_work(cpuset_wq, &rebuild_sched_domains_work);
835} 843}
836 844
837/* 845/*
@@ -2111,6 +2119,9 @@ void __init cpuset_init_smp(void)
2111 2119
2112 hotcpu_notifier(cpuset_track_online_cpus, 0); 2120 hotcpu_notifier(cpuset_track_online_cpus, 0);
2113 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2121 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2122
2123 cpuset_wq = create_singlethread_workqueue("cpuset");
2124 BUG_ON(!cpuset_wq);
2114} 2125}
2115 2126
2116/** 2127/**
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 038707404b76..962a3b574f21 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
98 * @size: size of requested memory area 98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle 99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address 100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area. 101 * to allocated area.
102 * 102 *
103 * This function should be only called from per-arch dma_alloc_coherent() 103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools. 104 * to support allocation from per-device coherent memory pools.
@@ -118,31 +118,32 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
118 mem = dev->dma_mem; 118 mem = dev->dma_mem;
119 if (!mem) 119 if (!mem)
120 return 0; 120 return 0;
121 if (unlikely(size > mem->size)) 121
122 return 0; 122 *ret = NULL;
123
124 if (unlikely(size > (mem->size << PAGE_SHIFT)))
125 goto err;
123 126
124 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); 127 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
125 if (pageno >= 0) { 128 if (unlikely(pageno < 0))
126 /* 129 goto err;
127 * Memory was found in the per-device arena. 130
128 */ 131 /*
129 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); 132 * Memory was found in the per-device area.
130 *ret = mem->virt_base + (pageno << PAGE_SHIFT); 133 */
131 memset(*ret, 0, size); 134 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
132 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { 135 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
133 /* 136 memset(*ret, 0, size);
134 * The per-device arena is exhausted and we are not 137
135 * permitted to fall back to generic memory.
136 */
137 *ret = NULL;
138 } else {
139 /*
140 * The per-device arena is exhausted and we are
141 * permitted to fall back to generic memory.
142 */
143 return 0;
144 }
145 return 1; 138 return 1;
139
140err:
141 /*
142 * In the case where the allocation can not be satisfied from the
143 * per-device area, try to fall back to generic memory if the
144 * constraints allow it.
145 */
146 return mem->flags & DMA_MEMORY_EXCLUSIVE;
146} 147}
147EXPORT_SYMBOL(dma_alloc_from_coherent); 148EXPORT_SYMBOL(dma_alloc_from_coherent);
148 149
diff --git a/kernel/exit.c b/kernel/exit.c
index f80dec3f1875..efd30ccf3858 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -118,6 +118,8 @@ static void __exit_signal(struct task_struct *tsk)
118 * We won't ever get here for the group leader, since it 118 * We won't ever get here for the group leader, since it
119 * will have been the last reference on the signal_struct. 119 * will have been the last reference on the signal_struct.
120 */ 120 */
121 sig->utime = cputime_add(sig->utime, task_utime(tsk));
122 sig->stime = cputime_add(sig->stime, task_stime(tsk));
121 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); 123 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
122 sig->min_flt += tsk->min_flt; 124 sig->min_flt += tsk->min_flt;
123 sig->maj_flt += tsk->maj_flt; 125 sig->maj_flt += tsk->maj_flt;
@@ -126,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk)
126 sig->inblock += task_io_get_inblock(tsk); 128 sig->inblock += task_io_get_inblock(tsk);
127 sig->oublock += task_io_get_oublock(tsk); 129 sig->oublock += task_io_get_oublock(tsk);
128 task_io_accounting_add(&sig->ioac, &tsk->ioac); 130 task_io_accounting_add(&sig->ioac, &tsk->ioac);
131 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
129 sig = NULL; /* Marker for below. */ 132 sig = NULL; /* Marker for below. */
130 } 133 }
131 134
diff --git a/kernel/fork.c b/kernel/fork.c
index bf0cef8bbdf2..4854c2c4a82e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,17 +817,17 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
817static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) 817static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
818{ 818{
819 struct signal_struct *sig; 819 struct signal_struct *sig;
820 int ret;
821 820
822 if (clone_flags & CLONE_THREAD) { 821 if (clone_flags & CLONE_THREAD) {
823 ret = thread_group_cputime_clone_thread(current); 822 atomic_inc(&current->signal->count);
824 if (likely(!ret)) { 823 atomic_inc(&current->signal->live);
825 atomic_inc(&current->signal->count); 824 return 0;
826 atomic_inc(&current->signal->live);
827 }
828 return ret;
829 } 825 }
830 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 826 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
827
828 if (sig)
829 posix_cpu_timers_init_group(sig);
830
831 tsk->signal = sig; 831 tsk->signal = sig;
832 if (!sig) 832 if (!sig)
833 return -ENOMEM; 833 return -ENOMEM;
@@ -851,21 +851,20 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
851 sig->tty_old_pgrp = NULL; 851 sig->tty_old_pgrp = NULL;
852 sig->tty = NULL; 852 sig->tty = NULL;
853 853
854 sig->cutime = sig->cstime = cputime_zero; 854 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
855 sig->gtime = cputime_zero; 855 sig->gtime = cputime_zero;
856 sig->cgtime = cputime_zero; 856 sig->cgtime = cputime_zero;
857 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 857 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
858 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 858 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
859 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 859 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
860 task_io_accounting_init(&sig->ioac); 860 task_io_accounting_init(&sig->ioac);
861 sig->sum_sched_runtime = 0;
861 taskstats_tgid_init(sig); 862 taskstats_tgid_init(sig);
862 863
863 task_lock(current->group_leader); 864 task_lock(current->group_leader);
864 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 865 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
865 task_unlock(current->group_leader); 866 task_unlock(current->group_leader);
866 867
867 posix_cpu_timers_init_group(sig);
868
869 acct_init_pacct(&sig->pacct); 868 acct_init_pacct(&sig->pacct);
870 869
871 tty_audit_fork(sig); 870 tty_audit_fork(sig);
@@ -1007,6 +1006,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1007 * triggers too late. This doesn't hurt, the check is only there 1006 * triggers too late. This doesn't hurt, the check is only there
1008 * to stop root fork bombs. 1007 * to stop root fork bombs.
1009 */ 1008 */
1009 retval = -EAGAIN;
1010 if (nr_threads >= max_threads) 1010 if (nr_threads >= max_threads)
1011 goto bad_fork_cleanup_count; 1011 goto bad_fork_cleanup_count;
1012 1012
@@ -1095,7 +1095,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1095#ifdef CONFIG_DEBUG_MUTEXES 1095#ifdef CONFIG_DEBUG_MUTEXES
1096 p->blocked_on = NULL; /* not blocked yet */ 1096 p->blocked_on = NULL; /* not blocked yet */
1097#endif 1097#endif
1098 if (unlikely(ptrace_reparented(current))) 1098 if (unlikely(current->ptrace))
1099 ptrace_fork(p, clone_flags); 1099 ptrace_fork(p, clone_flags);
1100 1100
1101 /* Perform scheduler related setup. Assign this task to a CPU. */ 1101 /* Perform scheduler related setup. Assign this task to a CPU. */
@@ -1179,10 +1179,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1179#endif 1179#endif
1180 clear_all_latency_tracing(p); 1180 clear_all_latency_tracing(p);
1181 1181
1182 /* Our parent execution domain becomes current domain
1183 These must match for thread signalling to apply */
1184 p->parent_exec_id = p->self_exec_id;
1185
1186 /* ok, now we should be set up.. */ 1182 /* ok, now we should be set up.. */
1187 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); 1183 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
1188 p->pdeath_signal = 0; 1184 p->pdeath_signal = 0;
@@ -1220,10 +1216,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1220 set_task_cpu(p, smp_processor_id()); 1216 set_task_cpu(p, smp_processor_id());
1221 1217
1222 /* CLONE_PARENT re-uses the old parent */ 1218 /* CLONE_PARENT re-uses the old parent */
1223 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) 1219 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1224 p->real_parent = current->real_parent; 1220 p->real_parent = current->real_parent;
1225 else 1221 p->parent_exec_id = current->parent_exec_id;
1222 } else {
1226 p->real_parent = current; 1223 p->real_parent = current;
1224 p->parent_exec_id = current->self_exec_id;
1225 }
1227 1226
1228 spin_lock(&current->sighand->siglock); 1227 spin_lock(&current->sighand->siglock);
1229 1228
diff --git a/kernel/futex.c b/kernel/futex.c
index f89d373a9c6d..438701adce23 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1165,6 +1165,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1165 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1165 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1166{ 1166{
1167 struct task_struct *curr = current; 1167 struct task_struct *curr = current;
1168 struct restart_block *restart;
1168 DECLARE_WAITQUEUE(wait, curr); 1169 DECLARE_WAITQUEUE(wait, curr);
1169 struct futex_hash_bucket *hb; 1170 struct futex_hash_bucket *hb;
1170 struct futex_q q; 1171 struct futex_q q;
@@ -1216,11 +1217,13 @@ retry:
1216 1217
1217 if (!ret) 1218 if (!ret)
1218 goto retry; 1219 goto retry;
1219 return ret; 1220 goto out;
1220 } 1221 }
1221 ret = -EWOULDBLOCK; 1222 ret = -EWOULDBLOCK;
1222 if (uval != val) 1223 if (unlikely(uval != val)) {
1223 goto out_unlock_put_key; 1224 queue_unlock(&q, hb);
1225 goto out_put_key;
1226 }
1224 1227
1225 /* Only actually queue if *uaddr contained val. */ 1228 /* Only actually queue if *uaddr contained val. */
1226 queue_me(&q, hb); 1229 queue_me(&q, hb);
@@ -1284,38 +1287,38 @@ retry:
1284 */ 1287 */
1285 1288
1286 /* If we were woken (and unqueued), we succeeded, whatever. */ 1289 /* If we were woken (and unqueued), we succeeded, whatever. */
1290 ret = 0;
1287 if (!unqueue_me(&q)) 1291 if (!unqueue_me(&q))
1288 return 0; 1292 goto out_put_key;
1293 ret = -ETIMEDOUT;
1289 if (rem) 1294 if (rem)
1290 return -ETIMEDOUT; 1295 goto out_put_key;
1291 1296
1292 /* 1297 /*
1293 * We expect signal_pending(current), but another thread may 1298 * We expect signal_pending(current), but another thread may
1294 * have handled it for us already. 1299 * have handled it for us already.
1295 */ 1300 */
1301 ret = -ERESTARTSYS;
1296 if (!abs_time) 1302 if (!abs_time)
1297 return -ERESTARTSYS; 1303 goto out_put_key;
1298 else {
1299 struct restart_block *restart;
1300 restart = &current_thread_info()->restart_block;
1301 restart->fn = futex_wait_restart;
1302 restart->futex.uaddr = (u32 *)uaddr;
1303 restart->futex.val = val;
1304 restart->futex.time = abs_time->tv64;
1305 restart->futex.bitset = bitset;
1306 restart->futex.flags = 0;
1307
1308 if (fshared)
1309 restart->futex.flags |= FLAGS_SHARED;
1310 if (clockrt)
1311 restart->futex.flags |= FLAGS_CLOCKRT;
1312 return -ERESTART_RESTARTBLOCK;
1313 }
1314 1304
1315out_unlock_put_key: 1305 restart = &current_thread_info()->restart_block;
1316 queue_unlock(&q, hb); 1306 restart->fn = futex_wait_restart;
1317 put_futex_key(fshared, &q.key); 1307 restart->futex.uaddr = (u32 *)uaddr;
1308 restart->futex.val = val;
1309 restart->futex.time = abs_time->tv64;
1310 restart->futex.bitset = bitset;
1311 restart->futex.flags = 0;
1312
1313 if (fshared)
1314 restart->futex.flags |= FLAGS_SHARED;
1315 if (clockrt)
1316 restart->futex.flags |= FLAGS_CLOCKRT;
1318 1317
1318 ret = -ERESTART_RESTARTBLOCK;
1319
1320out_put_key:
1321 put_futex_key(fshared, &q.key);
1319out: 1322out:
1320 return ret; 1323 return ret;
1321} 1324}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2dc30c59c5fd..f394d2a42ca3 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -501,6 +501,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
501 continue; 501 continue;
502 timer = rb_entry(base->first, struct hrtimer, node); 502 timer = rb_entry(base->first, struct hrtimer, node);
503 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 503 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
504 /*
505 * clock_was_set() has changed base->offset so the
506 * result might be negative. Fix it up to prevent a
507 * false positive in clockevents_program_event()
508 */
509 if (expires.tv64 < 0)
510 expires.tv64 = 0;
504 if (expires.tv64 < cpu_base->expires_next.tv64) 511 if (expires.tv64 < cpu_base->expires_next.tv64)
505 cpu_base->expires_next = expires; 512 cpu_base->expires_next = expires;
506 } 513 }
@@ -614,7 +621,9 @@ void clock_was_set(void)
614 */ 621 */
615void hres_timers_resume(void) 622void hres_timers_resume(void)
616{ 623{
617 /* Retrigger the CPU local events: */ 624 WARN_ONCE(!irqs_disabled(),
625 KERN_INFO "hres_timers_resume() called with IRQs enabled!");
626
618 retrigger_next_event(NULL); 627 retrigger_next_event(NULL);
619} 628}
620 629
@@ -1156,6 +1165,29 @@ static void __run_hrtimer(struct hrtimer *timer)
1156 1165
1157#ifdef CONFIG_HIGH_RES_TIMERS 1166#ifdef CONFIG_HIGH_RES_TIMERS
1158 1167
1168static int force_clock_reprogram;
1169
1170/*
1171 * After 5 iteration's attempts, we consider that hrtimer_interrupt()
1172 * is hanging, which could happen with something that slows the interrupt
1173 * such as the tracing. Then we force the clock reprogramming for each future
1174 * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
1175 * threshold that we will overwrite.
1176 * The next tick event will be scheduled to 3 times we currently spend on
1177 * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
1178 * 1/4 of their time to process the hrtimer interrupts. This is enough to
1179 * let it running without serious starvation.
1180 */
1181
1182static inline void
1183hrtimer_interrupt_hanging(struct clock_event_device *dev,
1184 ktime_t try_time)
1185{
1186 force_clock_reprogram = 1;
1187 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1188 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1189 "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
1190}
1159/* 1191/*
1160 * High resolution timer interrupt 1192 * High resolution timer interrupt
1161 * Called with interrupts disabled 1193 * Called with interrupts disabled
@@ -1165,6 +1197,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1165 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1197 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1166 struct hrtimer_clock_base *base; 1198 struct hrtimer_clock_base *base;
1167 ktime_t expires_next, now; 1199 ktime_t expires_next, now;
1200 int nr_retries = 0;
1168 int i; 1201 int i;
1169 1202
1170 BUG_ON(!cpu_base->hres_active); 1203 BUG_ON(!cpu_base->hres_active);
@@ -1172,6 +1205,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1172 dev->next_event.tv64 = KTIME_MAX; 1205 dev->next_event.tv64 = KTIME_MAX;
1173 1206
1174 retry: 1207 retry:
1208 /* 5 retries is enough to notice a hang */
1209 if (!(++nr_retries % 5))
1210 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1211
1175 now = ktime_get(); 1212 now = ktime_get();
1176 1213
1177 expires_next.tv64 = KTIME_MAX; 1214 expires_next.tv64 = KTIME_MAX;
@@ -1224,7 +1261,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1224 1261
1225 /* Reprogramming necessary ? */ 1262 /* Reprogramming necessary ? */
1226 if (expires_next.tv64 != KTIME_MAX) { 1263 if (expires_next.tv64 != KTIME_MAX) {
1227 if (tick_program_event(expires_next, 0)) 1264 if (tick_program_event(expires_next, force_clock_reprogram))
1228 goto retry; 1265 goto retry;
1229 } 1266 }
1230} 1267}
@@ -1578,6 +1615,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1578 break; 1615 break;
1579 1616
1580#ifdef CONFIG_HOTPLUG_CPU 1617#ifdef CONFIG_HOTPLUG_CPU
1618 case CPU_DYING:
1619 case CPU_DYING_FROZEN:
1620 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
1621 break;
1581 case CPU_DEAD: 1622 case CPU_DEAD:
1582 case CPU_DEAD_FROZEN: 1623 case CPU_DEAD_FROZEN:
1583 { 1624 {
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f63c706d25e1..03d0bed2b8d9 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -78,6 +78,7 @@ void dynamic_irq_cleanup(unsigned int irq)
78 desc->handle_irq = handle_bad_irq; 78 desc->handle_irq = handle_bad_irq;
79 desc->chip = &no_irq_chip; 79 desc->chip = &no_irq_chip;
80 desc->name = NULL; 80 desc->name = NULL;
81 clear_kstat_irqs(desc);
81 spin_unlock_irqrestore(&desc->lock, flags); 82 spin_unlock_irqrestore(&desc->lock, flags);
82} 83}
83 84
@@ -290,7 +291,8 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
290 desc->chip->mask_ack(irq); 291 desc->chip->mask_ack(irq);
291 else { 292 else {
292 desc->chip->mask(irq); 293 desc->chip->mask(irq);
293 desc->chip->ack(irq); 294 if (desc->chip->ack)
295 desc->chip->ack(irq);
294 } 296 }
295} 297}
296 298
@@ -383,6 +385,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
383out_unlock: 385out_unlock:
384 spin_unlock(&desc->lock); 386 spin_unlock(&desc->lock);
385} 387}
388EXPORT_SYMBOL_GPL(handle_level_irq);
386 389
387/** 390/**
388 * handle_fasteoi_irq - irq handler for transparent controllers 391 * handle_fasteoi_irq - irq handler for transparent controllers
@@ -475,7 +478,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
475 kstat_incr_irqs_this_cpu(irq, desc); 478 kstat_incr_irqs_this_cpu(irq, desc);
476 479
477 /* Start handling the irq */ 480 /* Start handling the irq */
478 desc->chip->ack(irq); 481 if (desc->chip->ack)
482 desc->chip->ack(irq);
479 desc = irq_remap_to_desc(irq, desc); 483 desc = irq_remap_to_desc(irq, desc);
480 484
481 /* Mark the IRQ currently in progress.*/ 485 /* Mark the IRQ currently in progress.*/
@@ -593,6 +597,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
593 } 597 }
594 spin_unlock_irqrestore(&desc->lock, flags); 598 spin_unlock_irqrestore(&desc->lock, flags);
595} 599}
600EXPORT_SYMBOL_GPL(__set_irq_handler);
596 601
597void 602void
598set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, 603set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c20db0be9173..f6cdda68e5c6 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -39,6 +39,18 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
39 ack_bad_irq(irq); 39 ack_bad_irq(irq);
40} 40}
41 41
42#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
43static void __init init_irq_default_affinity(void)
44{
45 alloc_bootmem_cpumask_var(&irq_default_affinity);
46 cpumask_setall(irq_default_affinity);
47}
48#else
49static void __init init_irq_default_affinity(void)
50{
51}
52#endif
53
42/* 54/*
43 * Linux has a controller-independent interrupt architecture. 55 * Linux has a controller-independent interrupt architecture.
44 * Every controller has a 'controller-template', that is used 56 * Every controller has a 'controller-template', that is used
@@ -71,19 +83,21 @@ static struct irq_desc irq_desc_init = {
71 83
72void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
73{ 85{
74 unsigned long bytes;
75 char *ptr;
76 int node; 86 int node;
77 87 void *ptr;
78 /* Compute how many bytes we need per irq and allocate them */
79 bytes = nr * sizeof(unsigned int);
80 88
81 node = cpu_to_node(cpu); 89 node = cpu_to_node(cpu);
82 ptr = kzalloc_node(bytes, GFP_ATOMIC, node); 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
83 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node);
84 91
85 if (ptr) 92 /*
86 desc->kstat_irqs = (unsigned int *)ptr; 93 * don't overwite if can not get new one
94 * init_copy_kstat_irqs() could still use old one
95 */
96 if (ptr) {
97 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n",
98 cpu, node);
99 desc->kstat_irqs = ptr;
100 }
87} 101}
88 102
89static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) 103static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
@@ -134,6 +148,8 @@ int __init early_irq_init(void)
134 int legacy_count; 148 int legacy_count;
135 int i; 149 int i;
136 150
151 init_irq_default_affinity();
152
137 desc = irq_desc_legacy; 153 desc = irq_desc_legacy;
138 legacy_count = ARRAY_SIZE(irq_desc_legacy); 154 legacy_count = ARRAY_SIZE(irq_desc_legacy);
139 155
@@ -213,17 +229,22 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
213 } 229 }
214}; 230};
215 231
232static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
216int __init early_irq_init(void) 233int __init early_irq_init(void)
217{ 234{
218 struct irq_desc *desc; 235 struct irq_desc *desc;
219 int count; 236 int count;
220 int i; 237 int i;
221 238
239 init_irq_default_affinity();
240
222 desc = irq_desc; 241 desc = irq_desc;
223 count = ARRAY_SIZE(irq_desc); 242 count = ARRAY_SIZE(irq_desc);
224 243
225 for (i = 0; i < count; i++) 244 for (i = 0; i < count; i++) {
226 desc[i].irq = i; 245 desc[i].irq = i;
246 desc[i].kstat_irqs = kstat_irqs_all[i];
247 }
227 248
228 return arch_early_irq_init(); 249 return arch_early_irq_init();
229} 250}
@@ -239,6 +260,11 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
239} 260}
240#endif /* !CONFIG_SPARSE_IRQ */ 261#endif /* !CONFIG_SPARSE_IRQ */
241 262
263void clear_kstat_irqs(struct irq_desc *desc)
264{
265 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
266}
267
242/* 268/*
243 * What should we do if we get a hw irq event on an illegal vector? 269 * What should we do if we get a hw irq event on an illegal vector?
244 * Each architecture has to answer this themself. 270 * Each architecture has to answer this themself.
@@ -312,6 +338,8 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
312 irqreturn_t ret, retval = IRQ_NONE; 338 irqreturn_t ret, retval = IRQ_NONE;
313 unsigned int status = 0; 339 unsigned int status = 0;
314 340
341 WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!");
342
315 if (!(action->flags & IRQF_DISABLED)) 343 if (!(action->flags & IRQF_DISABLED))
316 local_irq_enable_in_hardirq(); 344 local_irq_enable_in_hardirq();
317 345
@@ -331,6 +359,11 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
331} 359}
332 360
333#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ 361#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
362
363#ifdef CONFIG_ENABLE_WARN_DEPRECATED
364# warning __do_IRQ is deprecated. Please convert to proper flow handlers
365#endif
366
334/** 367/**
335 * __do_IRQ - original all in one highlevel IRQ handler 368 * __do_IRQ - original all in one highlevel IRQ handler
336 * @irq: the interrupt number 369 * @irq: the interrupt number
@@ -451,12 +484,10 @@ void early_init_irq_lock_class(void)
451 } 484 }
452} 485}
453 486
454#ifdef CONFIG_SPARSE_IRQ
455unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 487unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
456{ 488{
457 struct irq_desc *desc = irq_to_desc(irq); 489 struct irq_desc *desc = irq_to_desc(irq);
458 return desc ? desc->kstat_irqs[cpu] : 0; 490 return desc ? desc->kstat_irqs[cpu] : 0;
459} 491}
460#endif
461EXPORT_SYMBOL(kstat_irqs_cpu); 492EXPORT_SYMBOL(kstat_irqs_cpu);
462 493
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e6d0a43cc125..b60950bf5a16 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,6 +15,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
15 15
16extern struct lock_class_key irq_desc_lock_class; 16extern struct lock_class_key irq_desc_lock_class;
17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
18extern void clear_kstat_irqs(struct irq_desc *desc);
18extern spinlock_t sparse_irq_lock; 19extern spinlock_t sparse_irq_lock;
19extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; 20extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
20 21
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cd0cd8dcb345..ea119effe096 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -15,17 +15,9 @@
15 15
16#include "internals.h" 16#include "internals.h"
17 17
18#ifdef CONFIG_SMP 18#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
19cpumask_var_t irq_default_affinity; 19cpumask_var_t irq_default_affinity;
20 20
21static int init_irq_default_affinity(void)
22{
23 alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
24 cpumask_setall(irq_default_affinity);
25 return 0;
26}
27core_initcall(init_irq_default_affinity);
28
29/** 21/**
30 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 22 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
31 * @irq: interrupt number to wait for 23 * @irq: interrupt number to wait for
@@ -117,7 +109,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
117/* 109/*
118 * Generic version of the affinity autoselector. 110 * Generic version of the affinity autoselector.
119 */ 111 */
120int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) 112static int setup_affinity(unsigned int irq, struct irq_desc *desc)
121{ 113{
122 if (!irq_can_set_affinity(irq)) 114 if (!irq_can_set_affinity(irq))
123 return 0; 115 return 0;
@@ -141,7 +133,7 @@ set_affinity:
141 return 0; 133 return 0;
142} 134}
143#else 135#else
144static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d) 136static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
145{ 137{
146 return irq_select_affinity(irq); 138 return irq_select_affinity(irq);
147} 139}
@@ -157,14 +149,14 @@ int irq_select_affinity_usr(unsigned int irq)
157 int ret; 149 int ret;
158 150
159 spin_lock_irqsave(&desc->lock, flags); 151 spin_lock_irqsave(&desc->lock, flags);
160 ret = do_irq_select_affinity(irq, desc); 152 ret = setup_affinity(irq, desc);
161 spin_unlock_irqrestore(&desc->lock, flags); 153 spin_unlock_irqrestore(&desc->lock, flags);
162 154
163 return ret; 155 return ret;
164} 156}
165 157
166#else 158#else
167static inline int do_irq_select_affinity(int irq, struct irq_desc *desc) 159static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
168{ 160{
169 return 0; 161 return 0;
170} 162}
@@ -397,9 +389,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
397 * allocate special interrupts that are part of the architecture. 389 * allocate special interrupts that are part of the architecture.
398 */ 390 */
399static int 391static int
400__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) 392__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
401{ 393{
402 struct irqaction *old, **p; 394 struct irqaction *old, **old_ptr;
403 const char *old_name = NULL; 395 const char *old_name = NULL;
404 unsigned long flags; 396 unsigned long flags;
405 int shared = 0; 397 int shared = 0;
@@ -431,8 +423,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
431 * The following block of code has to be executed atomically 423 * The following block of code has to be executed atomically
432 */ 424 */
433 spin_lock_irqsave(&desc->lock, flags); 425 spin_lock_irqsave(&desc->lock, flags);
434 p = &desc->action; 426 old_ptr = &desc->action;
435 old = *p; 427 old = *old_ptr;
436 if (old) { 428 if (old) {
437 /* 429 /*
438 * Can't share interrupts unless both agree to and are 430 * Can't share interrupts unless both agree to and are
@@ -455,8 +447,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
455 447
456 /* add new interrupt at end of irq queue */ 448 /* add new interrupt at end of irq queue */
457 do { 449 do {
458 p = &old->next; 450 old_ptr = &old->next;
459 old = *p; 451 old = *old_ptr;
460 } while (old); 452 } while (old);
461 shared = 1; 453 shared = 1;
462 } 454 }
@@ -496,7 +488,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
496 desc->status |= IRQ_NO_BALANCING; 488 desc->status |= IRQ_NO_BALANCING;
497 489
498 /* Set default affinity mask once everything is setup */ 490 /* Set default affinity mask once everything is setup */
499 do_irq_select_affinity(irq, desc); 491 setup_affinity(irq, desc);
500 492
501 } else if ((new->flags & IRQF_TRIGGER_MASK) 493 } else if ((new->flags & IRQF_TRIGGER_MASK)
502 && (new->flags & IRQF_TRIGGER_MASK) 494 && (new->flags & IRQF_TRIGGER_MASK)
@@ -507,7 +499,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
507 (int)(new->flags & IRQF_TRIGGER_MASK)); 499 (int)(new->flags & IRQF_TRIGGER_MASK));
508 } 500 }
509 501
510 *p = new; 502 *old_ptr = new;
511 503
512 /* Reset broken irq detection when installing new handler */ 504 /* Reset broken irq detection when installing new handler */
513 desc->irq_count = 0; 505 desc->irq_count = 0;
@@ -557,90 +549,117 @@ int setup_irq(unsigned int irq, struct irqaction *act)
557 549
558 return __setup_irq(irq, desc, act); 550 return __setup_irq(irq, desc, act);
559} 551}
552EXPORT_SYMBOL_GPL(setup_irq);
560 553
561/** 554 /*
562 * free_irq - free an interrupt 555 * Internal function to unregister an irqaction - used to free
563 * @irq: Interrupt line to free 556 * regular and special interrupts that are part of the architecture.
564 * @dev_id: Device identity to free
565 *
566 * Remove an interrupt handler. The handler is removed and if the
567 * interrupt line is no longer in use by any driver it is disabled.
568 * On a shared IRQ the caller must ensure the interrupt is disabled
569 * on the card it drives before calling this function. The function
570 * does not return until any executing interrupts for this IRQ
571 * have completed.
572 *
573 * This function must not be called from interrupt context.
574 */ 557 */
575void free_irq(unsigned int irq, void *dev_id) 558static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
576{ 559{
577 struct irq_desc *desc = irq_to_desc(irq); 560 struct irq_desc *desc = irq_to_desc(irq);
578 struct irqaction **p; 561 struct irqaction *action, **action_ptr;
579 unsigned long flags; 562 unsigned long flags;
580 563
581 WARN_ON(in_interrupt()); 564 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
582 565
583 if (!desc) 566 if (!desc)
584 return; 567 return NULL;
585 568
586 spin_lock_irqsave(&desc->lock, flags); 569 spin_lock_irqsave(&desc->lock, flags);
587 p = &desc->action; 570
571 /*
572 * There can be multiple actions per IRQ descriptor, find the right
573 * one based on the dev_id:
574 */
575 action_ptr = &desc->action;
588 for (;;) { 576 for (;;) {
589 struct irqaction *action = *p; 577 action = *action_ptr;
590 578
591 if (action) { 579 if (!action) {
592 struct irqaction **pp = p; 580 WARN(1, "Trying to free already-free IRQ %d\n", irq);
581 spin_unlock_irqrestore(&desc->lock, flags);
593 582
594 p = &action->next; 583 return NULL;
595 if (action->dev_id != dev_id) 584 }
596 continue;
597 585
598 /* Found it - now remove it from the list of entries */ 586 if (action->dev_id == dev_id)
599 *pp = action->next; 587 break;
588 action_ptr = &action->next;
589 }
590
591 /* Found it - now remove it from the list of entries: */
592 *action_ptr = action->next;
600 593
601 /* Currently used only by UML, might disappear one day.*/ 594 /* Currently used only by UML, might disappear one day: */
602#ifdef CONFIG_IRQ_RELEASE_METHOD 595#ifdef CONFIG_IRQ_RELEASE_METHOD
603 if (desc->chip->release) 596 if (desc->chip->release)
604 desc->chip->release(irq, dev_id); 597 desc->chip->release(irq, dev_id);
605#endif 598#endif
606 599
607 if (!desc->action) { 600 /* If this was the last handler, shut down the IRQ line: */
608 desc->status |= IRQ_DISABLED; 601 if (!desc->action) {
609 if (desc->chip->shutdown) 602 desc->status |= IRQ_DISABLED;
610 desc->chip->shutdown(irq); 603 if (desc->chip->shutdown)
611 else 604 desc->chip->shutdown(irq);
612 desc->chip->disable(irq); 605 else
613 } 606 desc->chip->disable(irq);
614 spin_unlock_irqrestore(&desc->lock, flags); 607 }
615 unregister_handler_proc(irq, action); 608 spin_unlock_irqrestore(&desc->lock, flags);
609
610 unregister_handler_proc(irq, action);
611
612 /* Make sure it's not being used on another CPU: */
613 synchronize_irq(irq);
616 614
617 /* Make sure it's not being used on another CPU */
618 synchronize_irq(irq);
619#ifdef CONFIG_DEBUG_SHIRQ
620 /*
621 * It's a shared IRQ -- the driver ought to be
622 * prepared for it to happen even now it's
623 * being freed, so let's make sure.... We do
624 * this after actually deregistering it, to
625 * make sure that a 'real' IRQ doesn't run in
626 * parallel with our fake
627 */
628 if (action->flags & IRQF_SHARED) {
629 local_irq_save(flags);
630 action->handler(irq, dev_id);
631 local_irq_restore(flags);
632 }
633#endif
634 kfree(action);
635 return;
636 }
637 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
638#ifdef CONFIG_DEBUG_SHIRQ 615#ifdef CONFIG_DEBUG_SHIRQ
639 dump_stack(); 616 /*
640#endif 617 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
641 spin_unlock_irqrestore(&desc->lock, flags); 618 * event to happen even now it's being freed, so let's make sure that
642 return; 619 * is so by doing an extra call to the handler ....
620 *
621 * ( We do this after actually deregistering it, to make sure that a
622 * 'real' IRQ doesn't run in * parallel with our fake. )
623 */
624 if (action->flags & IRQF_SHARED) {
625 local_irq_save(flags);
626 action->handler(irq, dev_id);
627 local_irq_restore(flags);
643 } 628 }
629#endif
630 return action;
631}
632
633/**
634 * remove_irq - free an interrupt
635 * @irq: Interrupt line to free
636 * @act: irqaction for the interrupt
637 *
638 * Used to remove interrupts statically setup by the early boot process.
639 */
640void remove_irq(unsigned int irq, struct irqaction *act)
641{
642 __free_irq(irq, act->dev_id);
643}
644EXPORT_SYMBOL_GPL(remove_irq);
645
646/**
647 * free_irq - free an interrupt allocated with request_irq
648 * @irq: Interrupt line to free
649 * @dev_id: Device identity to free
650 *
651 * Remove an interrupt handler. The handler is removed and if the
652 * interrupt line is no longer in use by any driver it is disabled.
653 * On a shared IRQ the caller must ensure the interrupt is disabled
654 * on the card it drives before calling this function. The function
655 * does not return until any executing interrupts for this IRQ
656 * have completed.
657 *
658 * This function must not be called from interrupt context.
659 */
660void free_irq(unsigned int irq, void *dev_id)
661{
662 kfree(__free_irq(irq, dev_id));
644} 663}
645EXPORT_SYMBOL(free_irq); 664EXPORT_SYMBOL(free_irq);
646 665
@@ -687,11 +706,12 @@ int request_irq(unsigned int irq, irq_handler_t handler,
687 * the behavior is classified as "will not fix" so we need to 706 * the behavior is classified as "will not fix" so we need to
688 * start nudging drivers away from using that idiom. 707 * start nudging drivers away from using that idiom.
689 */ 708 */
690 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) 709 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
691 == (IRQF_SHARED|IRQF_DISABLED)) 710 (IRQF_SHARED|IRQF_DISABLED)) {
692 pr_warning("IRQ %d/%s: IRQF_DISABLED is not " 711 pr_warning(
693 "guaranteed on shared IRQs\n", 712 "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
694 irq, devname); 713 irq, devname);
714 }
695 715
696#ifdef CONFIG_LOCKDEP 716#ifdef CONFIG_LOCKDEP
697 /* 717 /*
@@ -717,15 +737,13 @@ int request_irq(unsigned int irq, irq_handler_t handler,
717 if (!handler) 737 if (!handler)
718 return -EINVAL; 738 return -EINVAL;
719 739
720 action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); 740 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
721 if (!action) 741 if (!action)
722 return -ENOMEM; 742 return -ENOMEM;
723 743
724 action->handler = handler; 744 action->handler = handler;
725 action->flags = irqflags; 745 action->flags = irqflags;
726 cpus_clear(action->mask);
727 action->name = devname; 746 action->name = devname;
728 action->next = NULL;
729 action->dev_id = dev_id; 747 action->dev_id = dev_id;
730 748
731 retval = __setup_irq(irq, desc, action); 749 retval = __setup_irq(irq, desc, action);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index ecf765c6a77a..aef18ab6b75b 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -17,16 +17,11 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc, 17 struct irq_desc *desc,
18 int cpu, int nr) 18 int cpu, int nr)
19{ 19{
20 unsigned long bytes;
21
22 init_kstat_irqs(desc, cpu, nr); 20 init_kstat_irqs(desc, cpu, nr);
23 21
24 if (desc->kstat_irqs != old_desc->kstat_irqs) { 22 if (desc->kstat_irqs != old_desc->kstat_irqs)
25 /* Compute how many bytes we need per irq and allocate them */ 23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
26 bytes = nr * sizeof(unsigned int); 24 nr * sizeof(*desc->kstat_irqs));
27
28 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
29 }
30} 25}
31 26
32static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) 27static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -71,7 +66,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
71 desc = irq_desc_ptrs[irq]; 66 desc = irq_desc_ptrs[irq];
72 67
73 if (desc && old_desc != desc) 68 if (desc && old_desc != desc)
74 goto out_unlock; 69 goto out_unlock;
75 70
76 node = cpu_to_node(cpu); 71 node = cpu_to_node(cpu);
77 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 72 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
@@ -84,10 +79,15 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
84 init_copy_one_irq_desc(irq, old_desc, desc, cpu); 79 init_copy_one_irq_desc(irq, old_desc, desc, cpu);
85 80
86 irq_desc_ptrs[irq] = desc; 81 irq_desc_ptrs[irq] = desc;
82 spin_unlock_irqrestore(&sparse_irq_lock, flags);
87 83
88 /* free the old one */ 84 /* free the old one */
89 free_one_irq_desc(old_desc, desc); 85 free_one_irq_desc(old_desc, desc);
86 spin_unlock(&old_desc->lock);
90 kfree(old_desc); 87 kfree(old_desc);
88 spin_lock(&desc->lock);
89
90 return desc;
91 91
92out_unlock: 92out_unlock:
93 spin_unlock_irqrestore(&sparse_irq_lock, flags); 93 spin_unlock_irqrestore(&sparse_irq_lock, flags);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dd364c11e56e..4d568294de3e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
104 return ok; 104 return ok;
105} 105}
106 106
107static void poll_spurious_irqs(unsigned long dummy) 107static void poll_all_shared_irqs(void)
108{ 108{
109 struct irq_desc *desc; 109 struct irq_desc *desc;
110 int i; 110 int i;
@@ -123,11 +123,23 @@ static void poll_spurious_irqs(unsigned long dummy)
123 123
124 try_one_irq(i, desc); 124 try_one_irq(i, desc);
125 } 125 }
126}
127
128static void poll_spurious_irqs(unsigned long dummy)
129{
130 poll_all_shared_irqs();
126 131
127 mod_timer(&poll_spurious_irq_timer, 132 mod_timer(&poll_spurious_irq_timer,
128 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 133 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
129} 134}
130 135
136#ifdef CONFIG_DEBUG_SHIRQ
137void debug_poll_all_shared_irqs(void)
138{
139 poll_all_shared_irqs();
140}
141#endif
142
131/* 143/*
132 * If 99,900 of the previous 100,000 interrupts have not been handled 144 * If 99,900 of the previous 100,000 interrupts have not been handled
133 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 145 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 6a5fe93dd8bd..58762f7077ec 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value)
62 struct task_cputime cputime; 62 struct task_cputime cputime;
63 cputime_t utime; 63 cputime_t utime;
64 64
65 thread_group_cputime(tsk, &cputime); 65 thread_group_cputimer(tsk, &cputime);
66 utime = cputime.utime; 66 utime = cputime.utime;
67 if (cputime_le(cval, utime)) { /* about to fire */ 67 if (cputime_le(cval, utime)) { /* about to fire */
68 cval = jiffies_to_cputime(1); 68 cval = jiffies_to_cputime(1);
@@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value)
82 struct task_cputime times; 82 struct task_cputime times;
83 cputime_t ptime; 83 cputime_t ptime;
84 84
85 thread_group_cputime(tsk, &times); 85 thread_group_cputimer(tsk, &times);
86 ptime = cputime_add(times.utime, times.stime); 86 ptime = cputime_add(times.utime, times.stime);
87 if (cputime_le(cval, ptime)) { /* about to fire */ 87 if (cputime_le(cval, ptime)) { /* about to fire */
88 cval = jiffies_to_cputime(1); 88 cval = jiffies_to_cputime(1);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index e694afa0eb8c..7b8b0f21a5b1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,19 +30,20 @@
30#define all_var 0 30#define all_var 0
31#endif 31#endif
32 32
33extern const unsigned long kallsyms_addresses[]; 33/* These will be re-linked against their real values during the second link stage */
34extern const u8 kallsyms_names[]; 34extern const unsigned long kallsyms_addresses[] __attribute__((weak));
35extern const u8 kallsyms_names[] __attribute__((weak));
35 36
36/* tell the compiler that the count isn't in the small data section if the arch 37/* tell the compiler that the count isn't in the small data section if the arch
37 * has one (eg: FRV) 38 * has one (eg: FRV)
38 */ 39 */
39extern const unsigned long kallsyms_num_syms 40extern const unsigned long kallsyms_num_syms
40 __attribute__((__section__(".rodata"))); 41__attribute__((weak, section(".rodata")));
41 42
42extern const u8 kallsyms_token_table[]; 43extern const u8 kallsyms_token_table[] __attribute__((weak));
43extern const u16 kallsyms_token_index[]; 44extern const u16 kallsyms_token_index[] __attribute__((weak));
44 45
45extern const unsigned long kallsyms_markers[]; 46extern const unsigned long kallsyms_markers[] __attribute__((weak));
46 47
47static inline int is_kernel_inittext(unsigned long addr) 48static inline int is_kernel_inittext(unsigned long addr)
48{ 49{
@@ -167,6 +168,9 @@ static unsigned long get_symbol_pos(unsigned long addr,
167 unsigned long symbol_start = 0, symbol_end = 0; 168 unsigned long symbol_start = 0, symbol_end = 0;
168 unsigned long i, low, high, mid; 169 unsigned long i, low, high, mid;
169 170
171 /* This kernel should never had been booted. */
172 BUG_ON(!kallsyms_addresses);
173
170 /* do a binary search on the sorted kallsyms_addresses array */ 174 /* do a binary search on the sorted kallsyms_addresses array */
171 low = 0; 175 low = 0;
172 high = kallsyms_num_syms; 176 high = kallsyms_num_syms;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8a6d7b08864e..483899578259 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1465,6 +1465,11 @@ int kernel_kexec(void)
1465 error = device_power_down(PMSG_FREEZE); 1465 error = device_power_down(PMSG_FREEZE);
1466 if (error) 1466 if (error)
1467 goto Enable_irqs; 1467 goto Enable_irqs;
1468
1469 /* Suspend system devices */
1470 error = sysdev_suspend(PMSG_FREEZE);
1471 if (error)
1472 goto Power_up_devices;
1468 } else 1473 } else
1469#endif 1474#endif
1470 { 1475 {
@@ -1477,6 +1482,8 @@ int kernel_kexec(void)
1477 1482
1478#ifdef CONFIG_KEXEC_JUMP 1483#ifdef CONFIG_KEXEC_JUMP
1479 if (kexec_image->preserve_context) { 1484 if (kexec_image->preserve_context) {
1485 sysdev_resume();
1486 Power_up_devices:
1480 device_power_up(PMSG_RESTORE); 1487 device_power_up(PMSG_RESTORE);
1481 Enable_irqs: 1488 Enable_irqs:
1482 local_irq_enable(); 1489 local_irq_enable();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1b9cbdc0127a..7ba8cd9845cb 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -123,7 +123,7 @@ static int collect_garbage_slots(void);
123static int __kprobes check_safety(void) 123static int __kprobes check_safety(void)
124{ 124{
125 int ret = 0; 125 int ret = 0;
126#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM) 126#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
127 ret = freeze_processes(); 127 ret = freeze_processes();
128 if (ret == 0) { 128 if (ret == 0) {
129 struct task_struct *p, *q; 129 struct task_struct *p, *q;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 449db466bdbc..ca07c5c0c914 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -9,6 +9,44 @@
9 * as published by the Free Software Foundation; version 2 9 * as published by the Free Software Foundation; version 2
10 * of the License. 10 * of the License.
11 */ 11 */
12
13/*
14 * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is
15 * used by the "latencytop" userspace tool. The latency that is tracked is not
16 * the 'traditional' interrupt latency (which is primarily caused by something
17 * else consuming CPU), but instead, it is the latency an application encounters
18 * because the kernel sleeps on its behalf for various reasons.
19 *
20 * This code tracks 2 levels of statistics:
21 * 1) System level latency
22 * 2) Per process latency
23 *
24 * The latency is stored in fixed sized data structures in an accumulated form;
25 * if the "same" latency cause is hit twice, this will be tracked as one entry
26 * in the data structure. Both the count, total accumulated latency and maximum
27 * latency are tracked in this data structure. When the fixed size structure is
28 * full, no new causes are tracked until the buffer is flushed by writing to
29 * the /proc file; the userspace tool does this on a regular basis.
30 *
31 * A latency cause is identified by a stringified backtrace at the point that
32 * the scheduler gets invoked. The userland tool will use this string to
33 * identify the cause of the latency in human readable form.
34 *
35 * The information is exported via /proc/latency_stats and /proc/<pid>/latency.
36 * These files look like this:
37 *
38 * Latency Top version : v0.1
39 * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl
40 * | | | |
41 * | | | +----> the stringified backtrace
42 * | | +---------> The maximum latency for this entry in microseconds
43 * | +--------------> The accumulated latency for this entry (microseconds)
44 * +-------------------> The number of times this entry is hit
45 *
46 * (note: the average latency is the accumulated latency divided by the number
47 * of times)
48 */
49
12#include <linux/latencytop.h> 50#include <linux/latencytop.h>
13#include <linux/kallsyms.h> 51#include <linux/kallsyms.h>
14#include <linux/seq_file.h> 52#include <linux/seq_file.h>
@@ -72,7 +110,7 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
72 firstnonnull = i; 110 firstnonnull = i;
73 continue; 111 continue;
74 } 112 }
75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { 113 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
76 unsigned long record = lat->backtrace[q]; 114 unsigned long record = lat->backtrace[q];
77 115
78 if (latency_record[i].backtrace[q] != record) { 116 if (latency_record[i].backtrace[q] != record) {
@@ -101,31 +139,52 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
101 memcpy(&latency_record[i], lat, sizeof(struct latency_record)); 139 memcpy(&latency_record[i], lat, sizeof(struct latency_record));
102} 140}
103 141
104static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) 142/*
143 * Iterator to store a backtrace into a latency record entry
144 */
145static inline void store_stacktrace(struct task_struct *tsk,
146 struct latency_record *lat)
105{ 147{
106 struct stack_trace trace; 148 struct stack_trace trace;
107 149
108 memset(&trace, 0, sizeof(trace)); 150 memset(&trace, 0, sizeof(trace));
109 trace.max_entries = LT_BACKTRACEDEPTH; 151 trace.max_entries = LT_BACKTRACEDEPTH;
110 trace.entries = &lat->backtrace[0]; 152 trace.entries = &lat->backtrace[0];
111 trace.skip = 0;
112 save_stack_trace_tsk(tsk, &trace); 153 save_stack_trace_tsk(tsk, &trace);
113} 154}
114 155
156/**
157 * __account_scheduler_latency - record an occured latency
158 * @tsk - the task struct of the task hitting the latency
159 * @usecs - the duration of the latency in microseconds
160 * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
161 *
162 * This function is the main entry point for recording latency entries
163 * as called by the scheduler.
164 *
165 * This function has a few special cases to deal with normal 'non-latency'
166 * sleeps: specifically, interruptible sleep longer than 5 msec is skipped
167 * since this usually is caused by waiting for events via select() and co.
168 *
169 * Negative latencies (caused by time going backwards) are also explicitly
170 * skipped.
171 */
115void __sched 172void __sched
116account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) 173__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
117{ 174{
118 unsigned long flags; 175 unsigned long flags;
119 int i, q; 176 int i, q;
120 struct latency_record lat; 177 struct latency_record lat;
121 178
122 if (!latencytop_enabled)
123 return;
124
125 /* Long interruptible waits are generally user requested... */ 179 /* Long interruptible waits are generally user requested... */
126 if (inter && usecs > 5000) 180 if (inter && usecs > 5000)
127 return; 181 return;
128 182
183 /* Negative sleeps are time going backwards */
184 /* Zero-time sleeps are non-interesting */
185 if (usecs <= 0)
186 return;
187
129 memset(&lat, 0, sizeof(lat)); 188 memset(&lat, 0, sizeof(lat));
130 lat.count = 1; 189 lat.count = 1;
131 lat.time = usecs; 190 lat.time = usecs;
@@ -143,12 +202,12 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
143 if (tsk->latency_record_count >= LT_SAVECOUNT) 202 if (tsk->latency_record_count >= LT_SAVECOUNT)
144 goto out_unlock; 203 goto out_unlock;
145 204
146 for (i = 0; i < LT_SAVECOUNT ; i++) { 205 for (i = 0; i < LT_SAVECOUNT; i++) {
147 struct latency_record *mylat; 206 struct latency_record *mylat;
148 int same = 1; 207 int same = 1;
149 208
150 mylat = &tsk->latency_record[i]; 209 mylat = &tsk->latency_record[i];
151 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { 210 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
152 unsigned long record = lat.backtrace[q]; 211 unsigned long record = lat.backtrace[q];
153 212
154 if (mylat->backtrace[q] != record) { 213 if (mylat->backtrace[q] != record) {
@@ -186,7 +245,7 @@ static int lstats_show(struct seq_file *m, void *v)
186 for (i = 0; i < MAXLR; i++) { 245 for (i = 0; i < MAXLR; i++) {
187 if (latency_record[i].backtrace[0]) { 246 if (latency_record[i].backtrace[0]) {
188 int q; 247 int q;
189 seq_printf(m, "%i %li %li ", 248 seq_printf(m, "%i %lu %lu ",
190 latency_record[i].count, 249 latency_record[i].count,
191 latency_record[i].time, 250 latency_record[i].time,
192 latency_record[i].max); 251 latency_record[i].max);
@@ -223,7 +282,7 @@ static int lstats_open(struct inode *inode, struct file *filp)
223 return single_open(filp, lstats_show, NULL); 282 return single_open(filp, lstats_show, NULL);
224} 283}
225 284
226static struct file_operations lstats_fops = { 285static const struct file_operations lstats_fops = {
227 .open = lstats_open, 286 .open = lstats_open,
228 .read = seq_read, 287 .read = seq_read,
229 .write = lstats_write, 288 .write = lstats_write,
@@ -236,4 +295,4 @@ static int __init init_lstats_procfs(void)
236 proc_create("latency_stats", 0644, NULL, &lstats_fops); 295 proc_create("latency_stats", 0644, NULL, &lstats_fops);
237 return 0; 296 return 0;
238} 297}
239__initcall(init_lstats_procfs); 298device_initcall(init_lstats_procfs);
diff --git a/kernel/module.c b/kernel/module.c
index e8b51d41dd72..77672233387f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
573/* Init the unload section of the module. */ 573/* Init the unload section of the module. */
574static void module_unload_init(struct module *mod) 574static void module_unload_init(struct module *mod)
575{ 575{
576 unsigned int i; 576 int cpu;
577 577
578 INIT_LIST_HEAD(&mod->modules_which_use_me); 578 INIT_LIST_HEAD(&mod->modules_which_use_me);
579 for (i = 0; i < NR_CPUS; i++) 579 for_each_possible_cpu(cpu)
580 local_set(&mod->ref[i].count, 0); 580 local_set(__module_ref_addr(mod, cpu), 0);
581 /* Hold reference count during initialization. */ 581 /* Hold reference count during initialization. */
582 local_set(&mod->ref[raw_smp_processor_id()].count, 1); 582 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
583 /* Backwards compatibility macros put refcount during init. */ 583 /* Backwards compatibility macros put refcount during init. */
584 mod->waiter = current; 584 mod->waiter = current;
585} 585}
@@ -717,10 +717,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
717 717
718unsigned int module_refcount(struct module *mod) 718unsigned int module_refcount(struct module *mod)
719{ 719{
720 unsigned int i, total = 0; 720 unsigned int total = 0;
721 int cpu;
721 722
722 for (i = 0; i < NR_CPUS; i++) 723 for_each_possible_cpu(cpu)
723 total += local_read(&mod->ref[i].count); 724 total += local_read(__module_ref_addr(mod, cpu));
724 return total; 725 return total;
725} 726}
726EXPORT_SYMBOL(module_refcount); 727EXPORT_SYMBOL(module_refcount);
@@ -821,7 +822,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
821 mutex_lock(&module_mutex); 822 mutex_lock(&module_mutex);
822 /* Store the name of the last unloaded module for diagnostic purposes */ 823 /* Store the name of the last unloaded module for diagnostic purposes */
823 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 824 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
824 unregister_dynamic_debug_module(mod->name); 825 ddebug_remove_module(mod->name);
825 free_module(mod); 826 free_module(mod);
826 827
827 out: 828 out:
@@ -894,7 +895,7 @@ void module_put(struct module *module)
894{ 895{
895 if (module) { 896 if (module) {
896 unsigned int cpu = get_cpu(); 897 unsigned int cpu = get_cpu();
897 local_dec(&module->ref[cpu].count); 898 local_dec(__module_ref_addr(module, cpu));
898 /* Maybe they're waiting for us to drop reference? */ 899 /* Maybe they're waiting for us to drop reference? */
899 if (unlikely(!module_is_live(module))) 900 if (unlikely(!module_is_live(module)))
900 wake_up_process(module->waiter); 901 wake_up_process(module->waiter);
@@ -1464,7 +1465,10 @@ static void free_module(struct module *mod)
1464 kfree(mod->args); 1465 kfree(mod->args);
1465 if (mod->percpu) 1466 if (mod->percpu)
1466 percpu_modfree(mod->percpu); 1467 percpu_modfree(mod->percpu);
1467 1468#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
1469 if (mod->refptr)
1470 percpu_modfree(mod->refptr);
1471#endif
1468 /* Free lock-classes: */ 1472 /* Free lock-classes: */
1469 lockdep_free_key_range(mod->module_core, mod->core_size); 1473 lockdep_free_key_range(mod->module_core, mod->core_size);
1470 1474
@@ -1823,19 +1827,13 @@ static inline void add_kallsyms(struct module *mod,
1823} 1827}
1824#endif /* CONFIG_KALLSYMS */ 1828#endif /* CONFIG_KALLSYMS */
1825 1829
1826static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num) 1830static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
1827{ 1831{
1828#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG 1832#ifdef CONFIG_DYNAMIC_DEBUG
1829 unsigned int i; 1833 if (ddebug_add_module(debug, num, debug->modname))
1830 1834 printk(KERN_ERR "dynamic debug error adding module: %s\n",
1831 for (i = 0; i < num; i++) { 1835 debug->modname);
1832 register_dynamic_debug_module(debug[i].modname, 1836#endif
1833 debug[i].type,
1834 debug[i].logical_modname,
1835 debug[i].flag_names,
1836 debug[i].hash, debug[i].hash2);
1837 }
1838#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
1839} 1837}
1840 1838
1841static void *module_alloc_update_bounds(unsigned long size) 1839static void *module_alloc_update_bounds(unsigned long size)
@@ -2070,6 +2068,14 @@ static noinline struct module *load_module(void __user *umod,
2070 /* Module has been moved. */ 2068 /* Module has been moved. */
2071 mod = (void *)sechdrs[modindex].sh_addr; 2069 mod = (void *)sechdrs[modindex].sh_addr;
2072 2070
2071#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2072 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
2073 mod->name);
2074 if (!mod->refptr) {
2075 err = -ENOMEM;
2076 goto free_init;
2077 }
2078#endif
2073 /* Now we've moved module, initialize linked lists, etc. */ 2079 /* Now we've moved module, initialize linked lists, etc. */
2074 module_unload_init(mod); 2080 module_unload_init(mod);
2075 2081
@@ -2201,12 +2207,13 @@ static noinline struct module *load_module(void __user *umod,
2201 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 2207 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
2202 2208
2203 if (!mod->taints) { 2209 if (!mod->taints) {
2204 struct mod_debug *debug; 2210 struct _ddebug *debug;
2205 unsigned int num_debug; 2211 unsigned int num_debug;
2206 2212
2207 debug = section_objs(hdr, sechdrs, secstrings, "__verbose", 2213 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2208 sizeof(*debug), &num_debug); 2214 sizeof(*debug), &num_debug);
2209 dynamic_printk_setup(debug, num_debug); 2215 if (debug)
2216 dynamic_debug_setup(debug, num_debug);
2210 } 2217 }
2211 2218
2212 /* sechdrs[0].sh_size is always zero */ 2219 /* sechdrs[0].sh_size is always zero */
@@ -2276,9 +2283,14 @@ static noinline struct module *load_module(void __user *umod,
2276 ftrace_release(mod->module_core, mod->core_size); 2283 ftrace_release(mod->module_core, mod->core_size);
2277 free_unload: 2284 free_unload:
2278 module_unload_free(mod); 2285 module_unload_free(mod);
2286 free_init:
2287#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2288 percpu_modfree(mod->refptr);
2289#endif
2279 module_free(mod, mod->module_init); 2290 module_free(mod, mod->module_init);
2280 free_core: 2291 free_core:
2281 module_free(mod, mod->module_core); 2292 module_free(mod, mod->module_core);
2293 /* mod will be freed with core. Don't access it beyond this line! */
2282 free_percpu: 2294 free_percpu:
2283 if (percpu) 2295 if (percpu)
2284 percpu_modfree(percpu); 2296 percpu_modfree(percpu);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 157de3a47832..8e5d9a68b022 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,76 +10,6 @@
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11 11
12/* 12/*
13 * Allocate the thread_group_cputime structure appropriately and fill in the
14 * current values of the fields. Called from copy_signal() via
15 * thread_group_cputime_clone_thread() when adding a second or subsequent
16 * thread to a thread group. Assumes interrupts are enabled when called.
17 */
18int thread_group_cputime_alloc(struct task_struct *tsk)
19{
20 struct signal_struct *sig = tsk->signal;
21 struct task_cputime *cputime;
22
23 /*
24 * If we have multiple threads and we don't already have a
25 * per-CPU task_cputime struct (checked in the caller), allocate
26 * one and fill it in with the times accumulated so far. We may
27 * race with another thread so recheck after we pick up the sighand
28 * lock.
29 */
30 cputime = alloc_percpu(struct task_cputime);
31 if (cputime == NULL)
32 return -ENOMEM;
33 spin_lock_irq(&tsk->sighand->siglock);
34 if (sig->cputime.totals) {
35 spin_unlock_irq(&tsk->sighand->siglock);
36 free_percpu(cputime);
37 return 0;
38 }
39 sig->cputime.totals = cputime;
40 cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
41 cputime->utime = tsk->utime;
42 cputime->stime = tsk->stime;
43 cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
44 spin_unlock_irq(&tsk->sighand->siglock);
45 return 0;
46}
47
48/**
49 * thread_group_cputime - Sum the thread group time fields across all CPUs.
50 *
51 * @tsk: The task we use to identify the thread group.
52 * @times: task_cputime structure in which we return the summed fields.
53 *
54 * Walk the list of CPUs to sum the per-CPU time fields in the thread group
55 * time structure.
56 */
57void thread_group_cputime(
58 struct task_struct *tsk,
59 struct task_cputime *times)
60{
61 struct task_cputime *totals, *tot;
62 int i;
63
64 totals = tsk->signal->cputime.totals;
65 if (!totals) {
66 times->utime = tsk->utime;
67 times->stime = tsk->stime;
68 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
69 return;
70 }
71
72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(totals, i);
76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime;
79 }
80}
81
82/*
83 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 13 * Called after updating RLIMIT_CPU to set timer expiration if necessary.
84 */ 14 */
85void update_rlimit_cpu(unsigned long rlim_new) 15void update_rlimit_cpu(unsigned long rlim_new)
@@ -300,6 +230,71 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
300 return 0; 230 return 0;
301} 231}
302 232
233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
234{
235 struct sighand_struct *sighand;
236 struct signal_struct *sig;
237 struct task_struct *t;
238
239 *times = INIT_CPUTIME;
240
241 rcu_read_lock();
242 sighand = rcu_dereference(tsk->sighand);
243 if (!sighand)
244 goto out;
245
246 sig = tsk->signal;
247
248 t = tsk;
249 do {
250 times->utime = cputime_add(times->utime, t->utime);
251 times->stime = cputime_add(times->stime, t->stime);
252 times->sum_exec_runtime += t->se.sum_exec_runtime;
253
254 t = next_thread(t);
255 } while (t != tsk);
256
257 times->utime = cputime_add(times->utime, sig->utime);
258 times->stime = cputime_add(times->stime, sig->stime);
259 times->sum_exec_runtime += sig->sum_sched_runtime;
260out:
261 rcu_read_unlock();
262}
263
264static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
265{
266 if (cputime_gt(b->utime, a->utime))
267 a->utime = b->utime;
268
269 if (cputime_gt(b->stime, a->stime))
270 a->stime = b->stime;
271
272 if (b->sum_exec_runtime > a->sum_exec_runtime)
273 a->sum_exec_runtime = b->sum_exec_runtime;
274}
275
276void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
277{
278 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
279 struct task_cputime sum;
280 unsigned long flags;
281
282 spin_lock_irqsave(&cputimer->lock, flags);
283 if (!cputimer->running) {
284 cputimer->running = 1;
285 /*
286 * The POSIX timer interface allows for absolute time expiry
287 * values through the TIMER_ABSTIME flag, therefore we have
288 * to synchronize the timer to the clock every time we start
289 * it.
290 */
291 thread_group_cputime(tsk, &sum);
292 update_gt_cputime(&cputimer->cputime, &sum);
293 }
294 *times = cputimer->cputime;
295 spin_unlock_irqrestore(&cputimer->lock, flags);
296}
297
303/* 298/*
304 * Sample a process (thread group) clock for the given group_leader task. 299 * Sample a process (thread group) clock for the given group_leader task.
305 * Must be called with tasklist_lock held for reading. 300 * Must be called with tasklist_lock held for reading.
@@ -527,7 +522,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
527{ 522{
528 struct task_cputime cputime; 523 struct task_cputime cputime;
529 524
530 thread_group_cputime(tsk, &cputime); 525 thread_group_cputimer(tsk, &cputime);
531 cleanup_timers(tsk->signal->cpu_timers, 526 cleanup_timers(tsk->signal->cpu_timers,
532 cputime.utime, cputime.stime, cputime.sum_exec_runtime); 527 cputime.utime, cputime.stime, cputime.sum_exec_runtime);
533} 528}
@@ -686,6 +681,33 @@ static void cpu_timer_fire(struct k_itimer *timer)
686} 681}
687 682
688/* 683/*
684 * Sample a process (thread group) timer for the given group_leader task.
685 * Must be called with tasklist_lock held for reading.
686 */
687static int cpu_timer_sample_group(const clockid_t which_clock,
688 struct task_struct *p,
689 union cpu_time_count *cpu)
690{
691 struct task_cputime cputime;
692
693 thread_group_cputimer(p, &cputime);
694 switch (CPUCLOCK_WHICH(which_clock)) {
695 default:
696 return -EINVAL;
697 case CPUCLOCK_PROF:
698 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
699 break;
700 case CPUCLOCK_VIRT:
701 cpu->cpu = cputime.utime;
702 break;
703 case CPUCLOCK_SCHED:
704 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
705 break;
706 }
707 return 0;
708}
709
710/*
689 * Guts of sys_timer_settime for CPU timers. 711 * Guts of sys_timer_settime for CPU timers.
690 * This is called with the timer locked and interrupts disabled. 712 * This is called with the timer locked and interrupts disabled.
691 * If we return TIMER_RETRY, it's necessary to release the timer's lock 713 * If we return TIMER_RETRY, it's necessary to release the timer's lock
@@ -746,7 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
746 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 768 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
747 cpu_clock_sample(timer->it_clock, p, &val); 769 cpu_clock_sample(timer->it_clock, p, &val);
748 } else { 770 } else {
749 cpu_clock_sample_group(timer->it_clock, p, &val); 771 cpu_timer_sample_group(timer->it_clock, p, &val);
750 } 772 }
751 773
752 if (old) { 774 if (old) {
@@ -894,7 +916,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
894 read_unlock(&tasklist_lock); 916 read_unlock(&tasklist_lock);
895 goto dead; 917 goto dead;
896 } else { 918 } else {
897 cpu_clock_sample_group(timer->it_clock, p, &now); 919 cpu_timer_sample_group(timer->it_clock, p, &now);
898 clear_dead = (unlikely(p->exit_state) && 920 clear_dead = (unlikely(p->exit_state) &&
899 thread_group_empty(p)); 921 thread_group_empty(p));
900 } 922 }
@@ -1034,6 +1056,19 @@ static void check_thread_timers(struct task_struct *tsk,
1034 } 1056 }
1035} 1057}
1036 1058
1059static void stop_process_timers(struct task_struct *tsk)
1060{
1061 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
1062 unsigned long flags;
1063
1064 if (!cputimer->running)
1065 return;
1066
1067 spin_lock_irqsave(&cputimer->lock, flags);
1068 cputimer->running = 0;
1069 spin_unlock_irqrestore(&cputimer->lock, flags);
1070}
1071
1037/* 1072/*
1038 * Check for any per-thread CPU timers that have fired and move them 1073 * Check for any per-thread CPU timers that have fired and move them
1039 * off the tsk->*_timers list onto the firing list. Per-thread timers 1074 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1057,13 +1092,15 @@ static void check_process_timers(struct task_struct *tsk,
1057 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && 1092 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1058 list_empty(&timers[CPUCLOCK_VIRT]) && 1093 list_empty(&timers[CPUCLOCK_VIRT]) &&
1059 cputime_eq(sig->it_virt_expires, cputime_zero) && 1094 cputime_eq(sig->it_virt_expires, cputime_zero) &&
1060 list_empty(&timers[CPUCLOCK_SCHED])) 1095 list_empty(&timers[CPUCLOCK_SCHED])) {
1096 stop_process_timers(tsk);
1061 return; 1097 return;
1098 }
1062 1099
1063 /* 1100 /*
1064 * Collect the current process totals. 1101 * Collect the current process totals.
1065 */ 1102 */
1066 thread_group_cputime(tsk, &cputime); 1103 thread_group_cputimer(tsk, &cputime);
1067 utime = cputime.utime; 1104 utime = cputime.utime;
1068 ptime = cputime_add(utime, cputime.stime); 1105 ptime = cputime_add(utime, cputime.stime);
1069 sum_sched_runtime = cputime.sum_exec_runtime; 1106 sum_sched_runtime = cputime.sum_exec_runtime;
@@ -1234,7 +1271,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1234 clear_dead_task(timer, now); 1271 clear_dead_task(timer, now);
1235 goto out_unlock; 1272 goto out_unlock;
1236 } 1273 }
1237 cpu_clock_sample_group(timer->it_clock, p, &now); 1274 cpu_timer_sample_group(timer->it_clock, p, &now);
1238 bump_cpu_timer(timer, now); 1275 bump_cpu_timer(timer, now);
1239 /* Leave the tasklist_lock locked for the call below. */ 1276 /* Leave the tasklist_lock locked for the call below. */
1240 } 1277 }
@@ -1329,11 +1366,12 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1329 if (!task_cputime_zero(&sig->cputime_expires)) { 1366 if (!task_cputime_zero(&sig->cputime_expires)) {
1330 struct task_cputime group_sample; 1367 struct task_cputime group_sample;
1331 1368
1332 thread_group_cputime(tsk, &group_sample); 1369 thread_group_cputimer(tsk, &group_sample);
1333 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1370 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1334 return 1; 1371 return 1;
1335 } 1372 }
1336 return 0; 1373
1374 return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY;
1337} 1375}
1338 1376
1339/* 1377/*
@@ -1411,7 +1449,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1411 struct list_head *head; 1449 struct list_head *head;
1412 1450
1413 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1451 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1414 cpu_clock_sample_group(clock_idx, tsk, &now); 1452 cpu_timer_sample_group(clock_idx, tsk, &now);
1415 1453
1416 if (oldval) { 1454 if (oldval) {
1417 if (!cputime_eq(*oldval, cputime_zero)) { 1455 if (!cputime_eq(*oldval, cputime_zero)) {
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 597823b5b700..720ea4f781bd 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,8 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG 3EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6obj-y := main.o 6obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += process.o console.o 7obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o
8obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o 9obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o
9 10
10obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index b8628be2a465..a3961b205de7 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -78,6 +78,12 @@ void pm_restore_console(void)
78 } 78 }
79 set_console(orig_fgconsole); 79 set_console(orig_fgconsole);
80 release_console_sem(); 80 release_console_sem();
81
82 if (vt_waitactive(orig_fgconsole)) {
83 pr_debug("Resume: Can't switch VCs.");
84 return;
85 }
86
81 kmsg_redirect = orig_kmsg; 87 kmsg_redirect = orig_kmsg;
82} 88}
83#endif 89#endif
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 45e8541ab7e3..4a4a206b1979 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -71,6 +71,14 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops)
71 mutex_unlock(&pm_mutex); 71 mutex_unlock(&pm_mutex);
72} 72}
73 73
74static bool entering_platform_hibernation;
75
76bool system_entering_hibernation(void)
77{
78 return entering_platform_hibernation;
79}
80EXPORT_SYMBOL(system_entering_hibernation);
81
74#ifdef CONFIG_PM_DEBUG 82#ifdef CONFIG_PM_DEBUG
75static void hibernation_debug_sleep(void) 83static void hibernation_debug_sleep(void)
76{ 84{
@@ -219,6 +227,12 @@ static int create_image(int platform_mode)
219 "aborting hibernation\n"); 227 "aborting hibernation\n");
220 goto Enable_irqs; 228 goto Enable_irqs;
221 } 229 }
230 sysdev_suspend(PMSG_FREEZE);
231 if (error) {
232 printk(KERN_ERR "PM: Some devices failed to power down, "
233 "aborting hibernation\n");
234 goto Power_up_devices;
235 }
222 236
223 if (hibernation_test(TEST_CORE)) 237 if (hibernation_test(TEST_CORE))
224 goto Power_up; 238 goto Power_up;
@@ -234,9 +248,11 @@ static int create_image(int platform_mode)
234 if (!in_suspend) 248 if (!in_suspend)
235 platform_leave(platform_mode); 249 platform_leave(platform_mode);
236 Power_up: 250 Power_up:
251 sysdev_resume();
237 /* NOTE: device_power_up() is just a resume() for devices 252 /* NOTE: device_power_up() is just a resume() for devices
238 * that suspended with irqs off ... no overall powerup. 253 * that suspended with irqs off ... no overall powerup.
239 */ 254 */
255 Power_up_devices:
240 device_power_up(in_suspend ? 256 device_power_up(in_suspend ?
241 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 257 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
242 Enable_irqs: 258 Enable_irqs:
@@ -327,6 +343,7 @@ static int resume_target_kernel(void)
327 "aborting resume\n"); 343 "aborting resume\n");
328 goto Enable_irqs; 344 goto Enable_irqs;
329 } 345 }
346 sysdev_suspend(PMSG_QUIESCE);
330 /* We'll ignore saved state, but this gets preempt count (etc) right */ 347 /* We'll ignore saved state, but this gets preempt count (etc) right */
331 save_processor_state(); 348 save_processor_state();
332 error = restore_highmem(); 349 error = restore_highmem();
@@ -349,6 +366,7 @@ static int resume_target_kernel(void)
349 swsusp_free(); 366 swsusp_free();
350 restore_processor_state(); 367 restore_processor_state();
351 touch_softlockup_watchdog(); 368 touch_softlockup_watchdog();
369 sysdev_resume();
352 device_power_up(PMSG_RECOVER); 370 device_power_up(PMSG_RECOVER);
353 Enable_irqs: 371 Enable_irqs:
354 local_irq_enable(); 372 local_irq_enable();
@@ -411,6 +429,7 @@ int hibernation_platform_enter(void)
411 if (error) 429 if (error)
412 goto Close; 430 goto Close;
413 431
432 entering_platform_hibernation = true;
414 suspend_console(); 433 suspend_console();
415 error = device_suspend(PMSG_HIBERNATE); 434 error = device_suspend(PMSG_HIBERNATE);
416 if (error) { 435 if (error) {
@@ -431,6 +450,7 @@ int hibernation_platform_enter(void)
431 local_irq_disable(); 450 local_irq_disable();
432 error = device_power_down(PMSG_HIBERNATE); 451 error = device_power_down(PMSG_HIBERNATE);
433 if (!error) { 452 if (!error) {
453 sysdev_suspend(PMSG_HIBERNATE);
434 hibernation_ops->enter(); 454 hibernation_ops->enter();
435 /* We should never get here */ 455 /* We should never get here */
436 while (1); 456 while (1);
@@ -445,6 +465,7 @@ int hibernation_platform_enter(void)
445 Finish: 465 Finish:
446 hibernation_ops->finish(); 466 hibernation_ops->finish();
447 Resume_devices: 467 Resume_devices:
468 entering_platform_hibernation = false;
448 device_resume(PMSG_RESTORE); 469 device_resume(PMSG_RESTORE);
449 resume_console(); 470 resume_console();
450 Close: 471 Close:
@@ -585,6 +606,12 @@ static int software_resume(void)
585 unsigned int flags; 606 unsigned int flags;
586 607
587 /* 608 /*
609 * If the user said "noresume".. bail out early.
610 */
611 if (noresume)
612 return 0;
613
614 /*
588 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs 615 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
589 * is configured into the kernel. Since the regular hibernate 616 * is configured into the kernel. Since the regular hibernate
590 * trigger path is via sysfs which takes a buffer mutex before 617 * trigger path is via sysfs which takes a buffer mutex before
@@ -600,6 +627,11 @@ static int software_resume(void)
600 mutex_unlock(&pm_mutex); 627 mutex_unlock(&pm_mutex);
601 return -ENOENT; 628 return -ENOENT;
602 } 629 }
630 /*
631 * Some device discovery might still be in progress; we need
632 * to wait for this to finish.
633 */
634 wait_for_device_probe();
603 swsusp_resume_device = name_to_dev_t(resume_file); 635 swsusp_resume_device = name_to_dev_t(resume_file);
604 pr_debug("PM: Resume from partition %s\n", resume_file); 636 pr_debug("PM: Resume from partition %s\n", resume_file);
605 } else { 637 } else {
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 239988873971..c9632f841f64 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -57,16 +57,6 @@ int pm_notifier_call_chain(unsigned long val)
57#ifdef CONFIG_PM_DEBUG 57#ifdef CONFIG_PM_DEBUG
58int pm_test_level = TEST_NONE; 58int pm_test_level = TEST_NONE;
59 59
60static int suspend_test(int level)
61{
62 if (pm_test_level == level) {
63 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
64 mdelay(5000);
65 return 1;
66 }
67 return 0;
68}
69
70static const char * const pm_tests[__TEST_AFTER_LAST] = { 60static const char * const pm_tests[__TEST_AFTER_LAST] = {
71 [TEST_NONE] = "none", 61 [TEST_NONE] = "none",
72 [TEST_CORE] = "core", 62 [TEST_CORE] = "core",
@@ -125,14 +115,24 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
125} 115}
126 116
127power_attr(pm_test); 117power_attr(pm_test);
128#else /* !CONFIG_PM_DEBUG */ 118#endif /* CONFIG_PM_DEBUG */
129static inline int suspend_test(int level) { return 0; }
130#endif /* !CONFIG_PM_DEBUG */
131 119
132#endif /* CONFIG_PM_SLEEP */ 120#endif /* CONFIG_PM_SLEEP */
133 121
134#ifdef CONFIG_SUSPEND 122#ifdef CONFIG_SUSPEND
135 123
124static int suspend_test(int level)
125{
126#ifdef CONFIG_PM_DEBUG
127 if (pm_test_level == level) {
128 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
129 mdelay(5000);
130 return 1;
131 }
132#endif /* !CONFIG_PM_DEBUG */
133 return 0;
134}
135
136#ifdef CONFIG_PM_TEST_SUSPEND 136#ifdef CONFIG_PM_TEST_SUSPEND
137 137
138/* 138/*
@@ -298,8 +298,12 @@ static int suspend_enter(suspend_state_t state)
298 goto Done; 298 goto Done;
299 } 299 }
300 300
301 if (!suspend_test(TEST_CORE)) 301 error = sysdev_suspend(PMSG_SUSPEND);
302 error = suspend_ops->enter(state); 302 if (!error) {
303 if (!suspend_test(TEST_CORE))
304 error = suspend_ops->enter(state);
305 sysdev_resume();
306 }
303 307
304 device_power_up(PMSG_RESUME); 308 device_power_up(PMSG_RESUME);
305 Done: 309 Done:
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 6da14358537c..505f319e489c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -60,6 +60,7 @@ static struct block_device *resume_bdev;
60static int submit(int rw, pgoff_t page_off, struct page *page, 60static int submit(int rw, pgoff_t page_off, struct page *page,
61 struct bio **bio_chain) 61 struct bio **bio_chain)
62{ 62{
63 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
63 struct bio *bio; 64 struct bio *bio;
64 65
65 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
@@ -80,7 +81,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
80 bio_get(bio); 81 bio_get(bio);
81 82
82 if (bio_chain == NULL) { 83 if (bio_chain == NULL) {
83 submit_bio(rw | (1 << BIO_RW_SYNC), bio); 84 submit_bio(bio_rw, bio);
84 wait_on_page_locked(page); 85 wait_on_page_locked(page);
85 if (rw == READ) 86 if (rw == READ)
86 bio_set_pages_dirty(bio); 87 bio_set_pages_dirty(bio);
@@ -90,7 +91,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
90 get_page(page); /* These pages are freed later */ 91 get_page(page); /* These pages are freed later */
91 bio->bi_private = *bio_chain; 92 bio->bi_private = *bio_chain;
92 *bio_chain = bio; 93 *bio_chain = bio;
93 submit_bio(rw | (1 << BIO_RW_SYNC), bio); 94 submit_bio(bio_rw, bio);
94 } 95 }
95 return 0; 96 return 0;
96} 97}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 005b93d839ba..6c85359364f2 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -95,15 +95,15 @@ static int snapshot_open(struct inode *inode, struct file *filp)
95 data->swap = swsusp_resume_device ? 95 data->swap = swsusp_resume_device ?
96 swap_type_of(swsusp_resume_device, 0, NULL) : -1; 96 swap_type_of(swsusp_resume_device, 0, NULL) : -1;
97 data->mode = O_RDONLY; 97 data->mode = O_RDONLY;
98 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 98 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
99 if (error) 99 if (error)
100 pm_notifier_call_chain(PM_POST_RESTORE); 100 pm_notifier_call_chain(PM_POST_HIBERNATION);
101 } else { 101 } else {
102 data->swap = -1; 102 data->swap = -1;
103 data->mode = O_WRONLY; 103 data->mode = O_WRONLY;
104 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 104 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
105 if (error) 105 if (error)
106 pm_notifier_call_chain(PM_POST_HIBERNATION); 106 pm_notifier_call_chain(PM_POST_RESTORE);
107 } 107 }
108 if (error) 108 if (error)
109 atomic_inc(&snapshot_device_available); 109 atomic_inc(&snapshot_device_available);
diff --git a/kernel/printk.c b/kernel/printk.c
index 69188f226a93..e3602d0755b0 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -73,7 +73,6 @@ EXPORT_SYMBOL(oops_in_progress);
73 * driver system. 73 * driver system.
74 */ 74 */
75static DECLARE_MUTEX(console_sem); 75static DECLARE_MUTEX(console_sem);
76static DECLARE_MUTEX(secondary_console_sem);
77struct console *console_drivers; 76struct console *console_drivers;
78EXPORT_SYMBOL_GPL(console_drivers); 77EXPORT_SYMBOL_GPL(console_drivers);
79 78
@@ -891,12 +890,14 @@ void suspend_console(void)
891 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 890 printk("Suspending console(s) (use no_console_suspend to debug)\n");
892 acquire_console_sem(); 891 acquire_console_sem();
893 console_suspended = 1; 892 console_suspended = 1;
893 up(&console_sem);
894} 894}
895 895
896void resume_console(void) 896void resume_console(void)
897{ 897{
898 if (!console_suspend_enabled) 898 if (!console_suspend_enabled)
899 return; 899 return;
900 down(&console_sem);
900 console_suspended = 0; 901 console_suspended = 0;
901 release_console_sem(); 902 release_console_sem();
902} 903}
@@ -912,11 +913,9 @@ void resume_console(void)
912void acquire_console_sem(void) 913void acquire_console_sem(void)
913{ 914{
914 BUG_ON(in_interrupt()); 915 BUG_ON(in_interrupt());
915 if (console_suspended) {
916 down(&secondary_console_sem);
917 return;
918 }
919 down(&console_sem); 916 down(&console_sem);
917 if (console_suspended)
918 return;
920 console_locked = 1; 919 console_locked = 1;
921 console_may_schedule = 1; 920 console_may_schedule = 1;
922} 921}
@@ -926,6 +925,10 @@ int try_acquire_console_sem(void)
926{ 925{
927 if (down_trylock(&console_sem)) 926 if (down_trylock(&console_sem))
928 return -1; 927 return -1;
928 if (console_suspended) {
929 up(&console_sem);
930 return -1;
931 }
929 console_locked = 1; 932 console_locked = 1;
930 console_may_schedule = 0; 933 console_may_schedule = 0;
931 return 0; 934 return 0;
@@ -979,7 +982,7 @@ void release_console_sem(void)
979 unsigned wake_klogd = 0; 982 unsigned wake_klogd = 0;
980 983
981 if (console_suspended) { 984 if (console_suspended) {
982 up(&secondary_console_sem); 985 up(&console_sem);
983 return; 986 return;
984 } 987 }
985 988
diff --git a/kernel/profile.c b/kernel/profile.c
index 784933acf5b8..7724e0409bae 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -114,12 +114,15 @@ int __ref profile_init(void)
114 if (!slab_is_available()) { 114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes); 115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask); 116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
117 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
117 return 0; 118 return 0;
118 } 119 }
119 120
120 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) 121 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
121 return -ENOMEM; 122 return -ENOMEM;
122 123
124 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
125
123 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 126 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
124 if (prof_buffer) 127 if (prof_buffer)
125 return 0; 128 return 0;
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 490934fc7ac3..654c640a6b9c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu)
679void rcu_check_callbacks(int cpu, int user) 679void rcu_check_callbacks(int cpu, int user)
680{ 680{
681 if (user || 681 if (user ||
682 (idle_cpu(cpu) && !in_softirq() && 682 (idle_cpu(cpu) && rcu_scheduler_active &&
683 hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 683 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
684 684
685 /* 685 /*
686 * Get here if this CPU took its interrupt from user 686 * Get here if this CPU took its interrupt from user
@@ -716,7 +716,7 @@ void rcu_check_callbacks(int cpu, int user)
716 raise_rcu_softirq(); 716 raise_rcu_softirq();
717} 717}
718 718
719static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, 719static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
720 struct rcu_data *rdp) 720 struct rcu_data *rdp)
721{ 721{
722 unsigned long flags; 722 unsigned long flags;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index d92a76a881aa..cae8a059cf47 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,6 +44,7 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
47 48
48enum rcu_barrier { 49enum rcu_barrier {
49 RCU_BARRIER_STD, 50 RCU_BARRIER_STD,
@@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
55static atomic_t rcu_barrier_cpu_count; 56static atomic_t rcu_barrier_cpu_count;
56static DEFINE_MUTEX(rcu_barrier_mutex); 57static DEFINE_MUTEX(rcu_barrier_mutex);
57static struct completion rcu_barrier_completion; 58static struct completion rcu_barrier_completion;
59int rcu_scheduler_active __read_mostly;
58 60
59/* 61/*
60 * Awaken the corresponding synchronize_rcu() instance now that a 62 * Awaken the corresponding synchronize_rcu() instance now that a
@@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head *head)
80void synchronize_rcu(void) 82void synchronize_rcu(void)
81{ 83{
82 struct rcu_synchronize rcu; 84 struct rcu_synchronize rcu;
85
86 if (rcu_blocking_is_gp())
87 return;
88
83 init_completion(&rcu.completion); 89 init_completion(&rcu.completion);
84 /* Will wake me after RCU finished. */ 90 /* Will wake me after RCU finished. */
85 call_rcu(&rcu.head, wakeme_after_rcu); 91 call_rcu(&rcu.head, wakeme_after_rcu);
@@ -175,3 +181,9 @@ void __init rcu_init(void)
175 __rcu_init(); 181 __rcu_init();
176} 182}
177 183
184void rcu_scheduler_starting(void)
185{
186 WARN_ON(num_online_cpus() != 1);
187 WARN_ON(nr_context_switches() > 0);
188 rcu_scheduler_active = 1;
189}
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 33cfc50781f9..5d59e850fb71 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1181,6 +1181,9 @@ void __synchronize_sched(void)
1181{ 1181{
1182 struct rcu_synchronize rcu; 1182 struct rcu_synchronize rcu;
1183 1183
1184 if (num_online_cpus() == 1)
1185 return; /* blocking is gp if only one CPU! */
1186
1184 init_completion(&rcu.completion); 1187 init_completion(&rcu.completion);
1185 /* Will wake me after RCU finished. */ 1188 /* Will wake me after RCU finished. */
1186 call_rcu_sched(&rcu.head, wakeme_after_rcu); 1189 call_rcu_sched(&rcu.head, wakeme_after_rcu);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f2d8638e6c60..97ce31579ec0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
948void rcu_check_callbacks(int cpu, int user) 948void rcu_check_callbacks(int cpu, int user)
949{ 949{
950 if (user || 950 if (user ||
951 (idle_cpu(cpu) && !in_softirq() && 951 (idle_cpu(cpu) && rcu_scheduler_active &&
952 hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 952 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
953 953
954 /* 954 /*
955 * Get here if this CPU took its interrupt from user 955 * Get here if this CPU took its interrupt from user
@@ -1314,7 +1314,7 @@ int rcu_needs_cpu(int cpu)
1314 * access due to the fact that this CPU cannot possibly have any RCU 1314 * access due to the fact that this CPU cannot possibly have any RCU
1315 * callbacks in flight yet. 1315 * callbacks in flight yet.
1316 */ 1316 */
1317static void 1317static void __cpuinit
1318rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 1318rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1319{ 1319{
1320 unsigned long flags; 1320 unsigned long flags;
diff --git a/kernel/relay.c b/kernel/relay.c
index 09ac2008f77b..8f2179c8056f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -663,8 +663,10 @@ int relay_late_setup_files(struct rchan *chan,
663 663
664 mutex_lock(&relay_channels_mutex); 664 mutex_lock(&relay_channels_mutex);
665 /* Is chan already set up? */ 665 /* Is chan already set up? */
666 if (unlikely(chan->has_base_filename)) 666 if (unlikely(chan->has_base_filename)) {
667 mutex_unlock(&relay_channels_mutex);
667 return -EEXIST; 668 return -EEXIST;
669 }
668 chan->has_base_filename = 1; 670 chan->has_base_filename = 1;
669 chan->parent = parent; 671 chan->parent = parent;
670 curr_cpu = get_cpu(); 672 curr_cpu = get_cpu();
@@ -748,7 +750,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
748 * from the scheduler (trying to re-grab 750 * from the scheduler (trying to re-grab
749 * rq->lock), so defer it. 751 * rq->lock), so defer it.
750 */ 752 */
751 __mod_timer(&buf->timer, jiffies + 1); 753 mod_timer(&buf->timer, jiffies + 1);
752 } 754 }
753 755
754 old = buf->data; 756 old = buf->data;
diff --git a/kernel/sched.c b/kernel/sched.c
index 52bbf1c842a8..9f8506d68fdc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
223{ 223{
224 ktime_t now; 224 ktime_t now;
225 225
226 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) 226 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
227 return; 227 return;
228 228
229 if (hrtimer_active(&rt_b->rt_period_timer)) 229 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
331 */ 331 */
332static DEFINE_SPINLOCK(task_group_lock); 332static DEFINE_SPINLOCK(task_group_lock);
333 333
334#ifdef CONFIG_SMP
335static int root_task_group_empty(void)
336{
337 return list_empty(&root_task_group.children);
338}
339#endif
340
334#ifdef CONFIG_FAIR_GROUP_SCHED 341#ifdef CONFIG_FAIR_GROUP_SCHED
335#ifdef CONFIG_USER_SCHED 342#ifdef CONFIG_USER_SCHED
336# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 343# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
391 398
392#else 399#else
393 400
401#ifdef CONFIG_SMP
402static int root_task_group_empty(void)
403{
404 return 1;
405}
406#endif
407
394static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 408static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
395static inline struct task_group *task_group(struct task_struct *p) 409static inline struct task_group *task_group(struct task_struct *p)
396{ 410{
@@ -467,11 +481,17 @@ struct rt_rq {
467 struct rt_prio_array active; 481 struct rt_prio_array active;
468 unsigned long rt_nr_running; 482 unsigned long rt_nr_running;
469#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 483#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
470 int highest_prio; /* highest queued rt task prio */ 484 struct {
485 int curr; /* highest queued rt task prio */
486#ifdef CONFIG_SMP
487 int next; /* next highest */
488#endif
489 } highest_prio;
471#endif 490#endif
472#ifdef CONFIG_SMP 491#ifdef CONFIG_SMP
473 unsigned long rt_nr_migratory; 492 unsigned long rt_nr_migratory;
474 int overloaded; 493 int overloaded;
494 struct plist_head pushable_tasks;
475#endif 495#endif
476 int rt_throttled; 496 int rt_throttled;
477 u64 rt_time; 497 u64 rt_time;
@@ -549,7 +569,6 @@ struct rq {
549 unsigned long nr_running; 569 unsigned long nr_running;
550 #define CPU_LOAD_IDX_MAX 5 570 #define CPU_LOAD_IDX_MAX 5
551 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 571 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
552 unsigned char idle_at_tick;
553#ifdef CONFIG_NO_HZ 572#ifdef CONFIG_NO_HZ
554 unsigned long last_tick_seen; 573 unsigned long last_tick_seen;
555 unsigned char in_nohz_recently; 574 unsigned char in_nohz_recently;
@@ -590,6 +609,7 @@ struct rq {
590 struct root_domain *rd; 609 struct root_domain *rd;
591 struct sched_domain *sd; 610 struct sched_domain *sd;
592 611
612 unsigned char idle_at_tick;
593 /* For active balancing */ 613 /* For active balancing */
594 int active_balance; 614 int active_balance;
595 int push_cpu; 615 int push_cpu;
@@ -618,9 +638,6 @@ struct rq {
618 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 638 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
619 639
620 /* sys_sched_yield() stats */ 640 /* sys_sched_yield() stats */
621 unsigned int yld_exp_empty;
622 unsigned int yld_act_empty;
623 unsigned int yld_both_empty;
624 unsigned int yld_count; 641 unsigned int yld_count;
625 642
626 /* schedule() stats */ 643 /* schedule() stats */
@@ -1183,10 +1200,10 @@ static void resched_task(struct task_struct *p)
1183 1200
1184 assert_spin_locked(&task_rq(p)->lock); 1201 assert_spin_locked(&task_rq(p)->lock);
1185 1202
1186 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1203 if (test_tsk_need_resched(p))
1187 return; 1204 return;
1188 1205
1189 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1206 set_tsk_need_resched(p);
1190 1207
1191 cpu = task_cpu(p); 1208 cpu = task_cpu(p);
1192 if (cpu == smp_processor_id()) 1209 if (cpu == smp_processor_id())
@@ -1242,7 +1259,7 @@ void wake_up_idle_cpu(int cpu)
1242 * lockless. The worst case is that the other CPU runs the 1259 * lockless. The worst case is that the other CPU runs the
1243 * idle task through an additional NOOP schedule() 1260 * idle task through an additional NOOP schedule()
1244 */ 1261 */
1245 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); 1262 set_tsk_need_resched(rq->idle);
1246 1263
1247 /* NEED_RESCHED must be visible before we test polling */ 1264 /* NEED_RESCHED must be visible before we test polling */
1248 smp_mb(); 1265 smp_mb();
@@ -1610,21 +1627,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1610 1627
1611#endif 1628#endif
1612 1629
1630#ifdef CONFIG_PREEMPT
1631
1613/* 1632/*
1614 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1633 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1634 * way at the expense of forcing extra atomic operations in all
1635 * invocations. This assures that the double_lock is acquired using the
1636 * same underlying policy as the spinlock_t on this architecture, which
1637 * reduces latency compared to the unfair variant below. However, it
1638 * also adds more overhead and therefore may reduce throughput.
1615 */ 1639 */
1616static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1640static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1641 __releases(this_rq->lock)
1642 __acquires(busiest->lock)
1643 __acquires(this_rq->lock)
1644{
1645 spin_unlock(&this_rq->lock);
1646 double_rq_lock(this_rq, busiest);
1647
1648 return 1;
1649}
1650
1651#else
1652/*
1653 * Unfair double_lock_balance: Optimizes throughput at the expense of
1654 * latency by eliminating extra atomic operations when the locks are
1655 * already in proper order on entry. This favors lower cpu-ids and will
1656 * grant the double lock to lower cpus over higher ids under contention,
1657 * regardless of entry order into the function.
1658 */
1659static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1617 __releases(this_rq->lock) 1660 __releases(this_rq->lock)
1618 __acquires(busiest->lock) 1661 __acquires(busiest->lock)
1619 __acquires(this_rq->lock) 1662 __acquires(this_rq->lock)
1620{ 1663{
1621 int ret = 0; 1664 int ret = 0;
1622 1665
1623 if (unlikely(!irqs_disabled())) {
1624 /* printk() doesn't work good under rq->lock */
1625 spin_unlock(&this_rq->lock);
1626 BUG_ON(1);
1627 }
1628 if (unlikely(!spin_trylock(&busiest->lock))) { 1666 if (unlikely(!spin_trylock(&busiest->lock))) {
1629 if (busiest < this_rq) { 1667 if (busiest < this_rq) {
1630 spin_unlock(&this_rq->lock); 1668 spin_unlock(&this_rq->lock);
@@ -1637,6 +1675,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1637 return ret; 1675 return ret;
1638} 1676}
1639 1677
1678#endif /* CONFIG_PREEMPT */
1679
1680/*
1681 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1682 */
1683static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1684{
1685 if (unlikely(!irqs_disabled())) {
1686 /* printk() doesn't work good under rq->lock */
1687 spin_unlock(&this_rq->lock);
1688 BUG_ON(1);
1689 }
1690
1691 return _double_lock_balance(this_rq, busiest);
1692}
1693
1640static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1694static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1641 __releases(busiest->lock) 1695 __releases(busiest->lock)
1642{ 1696{
@@ -1705,6 +1759,9 @@ static void update_avg(u64 *avg, u64 sample)
1705 1759
1706static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1760static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1707{ 1761{
1762 if (wakeup)
1763 p->se.start_runtime = p->se.sum_exec_runtime;
1764
1708 sched_info_queued(p); 1765 sched_info_queued(p);
1709 p->sched_class->enqueue_task(rq, p, wakeup); 1766 p->sched_class->enqueue_task(rq, p, wakeup);
1710 p->se.on_rq = 1; 1767 p->se.on_rq = 1;
@@ -1712,10 +1769,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1712 1769
1713static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1770static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1714{ 1771{
1715 if (sleep && p->se.last_wakeup) { 1772 if (sleep) {
1716 update_avg(&p->se.avg_overlap, 1773 if (p->se.last_wakeup) {
1717 p->se.sum_exec_runtime - p->se.last_wakeup); 1774 update_avg(&p->se.avg_overlap,
1718 p->se.last_wakeup = 0; 1775 p->se.sum_exec_runtime - p->se.last_wakeup);
1776 p->se.last_wakeup = 0;
1777 } else {
1778 update_avg(&p->se.avg_wakeup,
1779 sysctl_sched_wakeup_granularity);
1780 }
1719 } 1781 }
1720 1782
1721 sched_info_dequeued(p); 1783 sched_info_dequeued(p);
@@ -2017,7 +2079,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2017 * it must be off the runqueue _entirely_, and not 2079 * it must be off the runqueue _entirely_, and not
2018 * preempted! 2080 * preempted!
2019 * 2081 *
2020 * So if it wa still runnable (but just not actively 2082 * So if it was still runnable (but just not actively
2021 * running right now), it's preempted, and we should 2083 * running right now), it's preempted, and we should
2022 * yield - it could be a while. 2084 * yield - it could be a while.
2023 */ 2085 */
@@ -2267,7 +2329,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2267 sync = 0; 2329 sync = 0;
2268 2330
2269#ifdef CONFIG_SMP 2331#ifdef CONFIG_SMP
2270 if (sched_feat(LB_WAKEUP_UPDATE)) { 2332 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2271 struct sched_domain *sd; 2333 struct sched_domain *sd;
2272 2334
2273 this_cpu = raw_smp_processor_id(); 2335 this_cpu = raw_smp_processor_id();
@@ -2345,6 +2407,22 @@ out_activate:
2345 activate_task(rq, p, 1); 2407 activate_task(rq, p, 1);
2346 success = 1; 2408 success = 1;
2347 2409
2410 /*
2411 * Only attribute actual wakeups done by this task.
2412 */
2413 if (!in_interrupt()) {
2414 struct sched_entity *se = &current->se;
2415 u64 sample = se->sum_exec_runtime;
2416
2417 if (se->last_wakeup)
2418 sample -= se->last_wakeup;
2419 else
2420 sample -= se->start_runtime;
2421 update_avg(&se->avg_wakeup, sample);
2422
2423 se->last_wakeup = se->sum_exec_runtime;
2424 }
2425
2348out_running: 2426out_running:
2349 trace_sched_wakeup(rq, p, success); 2427 trace_sched_wakeup(rq, p, success);
2350 check_preempt_curr(rq, p, sync); 2428 check_preempt_curr(rq, p, sync);
@@ -2355,8 +2433,6 @@ out_running:
2355 p->sched_class->task_wake_up(rq, p); 2433 p->sched_class->task_wake_up(rq, p);
2356#endif 2434#endif
2357out: 2435out:
2358 current->se.last_wakeup = current->se.sum_exec_runtime;
2359
2360 task_rq_unlock(rq, &flags); 2436 task_rq_unlock(rq, &flags);
2361 2437
2362 return success; 2438 return success;
@@ -2386,6 +2462,8 @@ static void __sched_fork(struct task_struct *p)
2386 p->se.prev_sum_exec_runtime = 0; 2462 p->se.prev_sum_exec_runtime = 0;
2387 p->se.last_wakeup = 0; 2463 p->se.last_wakeup = 0;
2388 p->se.avg_overlap = 0; 2464 p->se.avg_overlap = 0;
2465 p->se.start_runtime = 0;
2466 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2389 2467
2390#ifdef CONFIG_SCHEDSTATS 2468#ifdef CONFIG_SCHEDSTATS
2391 p->se.wait_start = 0; 2469 p->se.wait_start = 0;
@@ -2448,6 +2526,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
2448 /* Want to start with kernel preemption disabled. */ 2526 /* Want to start with kernel preemption disabled. */
2449 task_thread_info(p)->preempt_count = 1; 2527 task_thread_info(p)->preempt_count = 1;
2450#endif 2528#endif
2529 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2530
2451 put_cpu(); 2531 put_cpu();
2452} 2532}
2453 2533
@@ -2491,7 +2571,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2491#ifdef CONFIG_PREEMPT_NOTIFIERS 2571#ifdef CONFIG_PREEMPT_NOTIFIERS
2492 2572
2493/** 2573/**
2494 * preempt_notifier_register - tell me when current is being being preempted & rescheduled 2574 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2495 * @notifier: notifier struct to register 2575 * @notifier: notifier struct to register
2496 */ 2576 */
2497void preempt_notifier_register(struct preempt_notifier *notifier) 2577void preempt_notifier_register(struct preempt_notifier *notifier)
@@ -2588,6 +2668,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2588{ 2668{
2589 struct mm_struct *mm = rq->prev_mm; 2669 struct mm_struct *mm = rq->prev_mm;
2590 long prev_state; 2670 long prev_state;
2671#ifdef CONFIG_SMP
2672 int post_schedule = 0;
2673
2674 if (current->sched_class->needs_post_schedule)
2675 post_schedule = current->sched_class->needs_post_schedule(rq);
2676#endif
2591 2677
2592 rq->prev_mm = NULL; 2678 rq->prev_mm = NULL;
2593 2679
@@ -2606,7 +2692,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2606 finish_arch_switch(prev); 2692 finish_arch_switch(prev);
2607 finish_lock_switch(rq, prev); 2693 finish_lock_switch(rq, prev);
2608#ifdef CONFIG_SMP 2694#ifdef CONFIG_SMP
2609 if (current->sched_class->post_schedule) 2695 if (post_schedule)
2610 current->sched_class->post_schedule(rq); 2696 current->sched_class->post_schedule(rq);
2611#endif 2697#endif
2612 2698
@@ -2913,6 +2999,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2913 struct sched_domain *sd, enum cpu_idle_type idle, 2999 struct sched_domain *sd, enum cpu_idle_type idle,
2914 int *all_pinned) 3000 int *all_pinned)
2915{ 3001{
3002 int tsk_cache_hot = 0;
2916 /* 3003 /*
2917 * We do not migrate tasks that are: 3004 * We do not migrate tasks that are:
2918 * 1) running (obviously), or 3005 * 1) running (obviously), or
@@ -2936,10 +3023,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2936 * 2) too many balance attempts have failed. 3023 * 2) too many balance attempts have failed.
2937 */ 3024 */
2938 3025
2939 if (!task_hot(p, rq->clock, sd) || 3026 tsk_cache_hot = task_hot(p, rq->clock, sd);
2940 sd->nr_balance_failed > sd->cache_nice_tries) { 3027 if (!tsk_cache_hot ||
3028 sd->nr_balance_failed > sd->cache_nice_tries) {
2941#ifdef CONFIG_SCHEDSTATS 3029#ifdef CONFIG_SCHEDSTATS
2942 if (task_hot(p, rq->clock, sd)) { 3030 if (tsk_cache_hot) {
2943 schedstat_inc(sd, lb_hot_gained[idle]); 3031 schedstat_inc(sd, lb_hot_gained[idle]);
2944 schedstat_inc(p, se.nr_forced_migrations); 3032 schedstat_inc(p, se.nr_forced_migrations);
2945 } 3033 }
@@ -2947,7 +3035,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2947 return 1; 3035 return 1;
2948 } 3036 }
2949 3037
2950 if (task_hot(p, rq->clock, sd)) { 3038 if (tsk_cache_hot) {
2951 schedstat_inc(p, se.nr_failed_migrations_hot); 3039 schedstat_inc(p, se.nr_failed_migrations_hot);
2952 return 0; 3040 return 0;
2953 } 3041 }
@@ -2987,6 +3075,16 @@ next:
2987 pulled++; 3075 pulled++;
2988 rem_load_move -= p->se.load.weight; 3076 rem_load_move -= p->se.load.weight;
2989 3077
3078#ifdef CONFIG_PREEMPT
3079 /*
3080 * NEWIDLE balancing is a source of latency, so preemptible kernels
3081 * will stop after the first task is pulled to minimize the critical
3082 * section.
3083 */
3084 if (idle == CPU_NEWLY_IDLE)
3085 goto out;
3086#endif
3087
2990 /* 3088 /*
2991 * We only want to steal up to the prescribed amount of weighted load. 3089 * We only want to steal up to the prescribed amount of weighted load.
2992 */ 3090 */
@@ -3033,9 +3131,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3033 sd, idle, all_pinned, &this_best_prio); 3131 sd, idle, all_pinned, &this_best_prio);
3034 class = class->next; 3132 class = class->next;
3035 3133
3134#ifdef CONFIG_PREEMPT
3135 /*
3136 * NEWIDLE balancing is a source of latency, so preemptible
3137 * kernels will stop after the first task is pulled to minimize
3138 * the critical section.
3139 */
3036 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3140 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3037 break; 3141 break;
3038 3142#endif
3039 } while (class && max_load_move > total_load_moved); 3143 } while (class && max_load_move > total_load_moved);
3040 3144
3041 return total_load_moved > 0; 3145 return total_load_moved > 0;
@@ -3085,246 +3189,479 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3085 3189
3086 return 0; 3190 return 0;
3087} 3191}
3192/********** Helpers for find_busiest_group ************************/
3193/**
3194 * sd_lb_stats - Structure to store the statistics of a sched_domain
3195 * during load balancing.
3196 */
3197struct sd_lb_stats {
3198 struct sched_group *busiest; /* Busiest group in this sd */
3199 struct sched_group *this; /* Local group in this sd */
3200 unsigned long total_load; /* Total load of all groups in sd */
3201 unsigned long total_pwr; /* Total power of all groups in sd */
3202 unsigned long avg_load; /* Average load across all groups in sd */
3203
3204 /** Statistics of this group */
3205 unsigned long this_load;
3206 unsigned long this_load_per_task;
3207 unsigned long this_nr_running;
3208
3209 /* Statistics of the busiest group */
3210 unsigned long max_load;
3211 unsigned long busiest_load_per_task;
3212 unsigned long busiest_nr_running;
3213
3214 int group_imb; /* Is there imbalance in this sd */
3215#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3216 int power_savings_balance; /* Is powersave balance needed for this sd */
3217 struct sched_group *group_min; /* Least loaded group in sd */
3218 struct sched_group *group_leader; /* Group which relieves group_min */
3219 unsigned long min_load_per_task; /* load_per_task in group_min */
3220 unsigned long leader_nr_running; /* Nr running of group_leader */
3221 unsigned long min_nr_running; /* Nr running of group_min */
3222#endif
3223};
3088 3224
3089/* 3225/**
3090 * find_busiest_group finds and returns the busiest CPU group within the 3226 * sg_lb_stats - stats of a sched_group required for load_balancing
3091 * domain. It calculates and returns the amount of weighted load which 3227 */
3092 * should be moved to restore balance via the imbalance parameter. 3228struct sg_lb_stats {
3229 unsigned long avg_load; /*Avg load across the CPUs of the group */
3230 unsigned long group_load; /* Total load over the CPUs of the group */
3231 unsigned long sum_nr_running; /* Nr tasks running in the group */
3232 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3233 unsigned long group_capacity;
3234 int group_imb; /* Is there an imbalance in the group ? */
3235};
3236
3237/**
3238 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3239 * @group: The group whose first cpu is to be returned.
3093 */ 3240 */
3094static struct sched_group * 3241static inline unsigned int group_first_cpu(struct sched_group *group)
3095find_busiest_group(struct sched_domain *sd, int this_cpu,
3096 unsigned long *imbalance, enum cpu_idle_type idle,
3097 int *sd_idle, const struct cpumask *cpus, int *balance)
3098{ 3242{
3099 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3243 return cpumask_first(sched_group_cpus(group));
3100 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3244}
3101 unsigned long max_pull;
3102 unsigned long busiest_load_per_task, busiest_nr_running;
3103 unsigned long this_load_per_task, this_nr_running;
3104 int load_idx, group_imb = 0;
3105#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3106 int power_savings_balance = 1;
3107 unsigned long leader_nr_running = 0, min_load_per_task = 0;
3108 unsigned long min_nr_running = ULONG_MAX;
3109 struct sched_group *group_min = NULL, *group_leader = NULL;
3110#endif
3111 3245
3112 max_load = this_load = total_load = total_pwr = 0; 3246/**
3113 busiest_load_per_task = busiest_nr_running = 0; 3247 * get_sd_load_idx - Obtain the load index for a given sched domain.
3114 this_load_per_task = this_nr_running = 0; 3248 * @sd: The sched_domain whose load_idx is to be obtained.
3249 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3250 */
3251static inline int get_sd_load_idx(struct sched_domain *sd,
3252 enum cpu_idle_type idle)
3253{
3254 int load_idx;
3115 3255
3116 if (idle == CPU_NOT_IDLE) 3256 switch (idle) {
3257 case CPU_NOT_IDLE:
3117 load_idx = sd->busy_idx; 3258 load_idx = sd->busy_idx;
3118 else if (idle == CPU_NEWLY_IDLE) 3259 break;
3260
3261 case CPU_NEWLY_IDLE:
3119 load_idx = sd->newidle_idx; 3262 load_idx = sd->newidle_idx;
3120 else 3263 break;
3264 default:
3121 load_idx = sd->idle_idx; 3265 load_idx = sd->idle_idx;
3266 break;
3267 }
3122 3268
3123 do { 3269 return load_idx;
3124 unsigned long load, group_capacity, max_cpu_load, min_cpu_load; 3270}
3125 int local_group;
3126 int i;
3127 int __group_imb = 0;
3128 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3129 unsigned long sum_nr_running, sum_weighted_load;
3130 unsigned long sum_avg_load_per_task;
3131 unsigned long avg_load_per_task;
3132 3271
3133 local_group = cpumask_test_cpu(this_cpu,
3134 sched_group_cpus(group));
3135 3272
3136 if (local_group) 3273#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3137 balance_cpu = cpumask_first(sched_group_cpus(group)); 3274/**
3275 * init_sd_power_savings_stats - Initialize power savings statistics for
3276 * the given sched_domain, during load balancing.
3277 *
3278 * @sd: Sched domain whose power-savings statistics are to be initialized.
3279 * @sds: Variable containing the statistics for sd.
3280 * @idle: Idle status of the CPU at which we're performing load-balancing.
3281 */
3282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3284{
3285 /*
3286 * Busy processors will not participate in power savings
3287 * balance.
3288 */
3289 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3290 sds->power_savings_balance = 0;
3291 else {
3292 sds->power_savings_balance = 1;
3293 sds->min_nr_running = ULONG_MAX;
3294 sds->leader_nr_running = 0;
3295 }
3296}
3138 3297
3139 /* Tally up the load of all CPUs in the group */ 3298/**
3140 sum_weighted_load = sum_nr_running = avg_load = 0; 3299 * update_sd_power_savings_stats - Update the power saving stats for a
3141 sum_avg_load_per_task = avg_load_per_task = 0; 3300 * sched_domain while performing load balancing.
3301 *
3302 * @group: sched_group belonging to the sched_domain under consideration.
3303 * @sds: Variable containing the statistics of the sched_domain
3304 * @local_group: Does group contain the CPU for which we're performing
3305 * load balancing ?
3306 * @sgs: Variable containing the statistics of the group.
3307 */
3308static inline void update_sd_power_savings_stats(struct sched_group *group,
3309 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3310{
3142 3311
3143 max_cpu_load = 0; 3312 if (!sds->power_savings_balance)
3144 min_cpu_load = ~0UL; 3313 return;
3145 3314
3146 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3315 /*
3147 struct rq *rq = cpu_rq(i); 3316 * If the local group is idle or completely loaded
3317 * no need to do power savings balance at this domain
3318 */
3319 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3320 !sds->this_nr_running))
3321 sds->power_savings_balance = 0;
3322
3323 /*
3324 * If a group is already running at full capacity or idle,
3325 * don't include that group in power savings calculations
3326 */
3327 if (!sds->power_savings_balance ||
3328 sgs->sum_nr_running >= sgs->group_capacity ||
3329 !sgs->sum_nr_running)
3330 return;
3148 3331
3149 if (*sd_idle && rq->nr_running) 3332 /*
3150 *sd_idle = 0; 3333 * Calculate the group which has the least non-idle load.
3334 * This is the group from where we need to pick up the load
3335 * for saving power
3336 */
3337 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3338 (sgs->sum_nr_running == sds->min_nr_running &&
3339 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3340 sds->group_min = group;
3341 sds->min_nr_running = sgs->sum_nr_running;
3342 sds->min_load_per_task = sgs->sum_weighted_load /
3343 sgs->sum_nr_running;
3344 }
3151 3345
3152 /* Bias balancing toward cpus of our domain */ 3346 /*
3153 if (local_group) { 3347 * Calculate the group which is almost near its
3154 if (idle_cpu(i) && !first_idle_cpu) { 3348 * capacity but still has some space to pick up some load
3155 first_idle_cpu = 1; 3349 * from other group and save more power
3156 balance_cpu = i; 3350 */
3157 } 3351 if (sgs->sum_nr_running > sgs->group_capacity - 1)
3352 return;
3158 3353
3159 load = target_load(i, load_idx); 3354 if (sgs->sum_nr_running > sds->leader_nr_running ||
3160 } else { 3355 (sgs->sum_nr_running == sds->leader_nr_running &&
3161 load = source_load(i, load_idx); 3356 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3162 if (load > max_cpu_load) 3357 sds->group_leader = group;
3163 max_cpu_load = load; 3358 sds->leader_nr_running = sgs->sum_nr_running;
3164 if (min_cpu_load > load) 3359 }
3165 min_cpu_load = load; 3360}
3166 }
3167 3361
3168 avg_load += load; 3362/**
3169 sum_nr_running += rq->nr_running; 3363 * check_power_save_busiest_group - Check if we have potential to perform
3170 sum_weighted_load += weighted_cpuload(i); 3364 * some power-savings balance. If yes, set the busiest group to be
3365 * the least loaded group in the sched_domain, so that it's CPUs can
3366 * be put to idle.
3367 *
3368 * @sds: Variable containing the statistics of the sched_domain
3369 * under consideration.
3370 * @this_cpu: Cpu at which we're currently performing load-balancing.
3371 * @imbalance: Variable to store the imbalance.
3372 *
3373 * Returns 1 if there is potential to perform power-savings balance.
3374 * Else returns 0.
3375 */
3376static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3377 int this_cpu, unsigned long *imbalance)
3378{
3379 if (!sds->power_savings_balance)
3380 return 0;
3171 3381
3172 sum_avg_load_per_task += cpu_avg_load_per_task(i); 3382 if (sds->this != sds->group_leader ||
3173 } 3383 sds->group_leader == sds->group_min)
3384 return 0;
3174 3385
3175 /* 3386 *imbalance = sds->min_load_per_task;
3176 * First idle cpu or the first cpu(busiest) in this sched group 3387 sds->busiest = sds->group_min;
3177 * is eligible for doing load balancing at this and above
3178 * domains. In the newly idle case, we will allow all the cpu's
3179 * to do the newly idle load balance.
3180 */
3181 if (idle != CPU_NEWLY_IDLE && local_group &&
3182 balance_cpu != this_cpu && balance) {
3183 *balance = 0;
3184 goto ret;
3185 }
3186 3388
3187 total_load += avg_load; 3389 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3188 total_pwr += group->__cpu_power; 3390 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3391 group_first_cpu(sds->group_leader);
3392 }
3189 3393
3190 /* Adjust by relative CPU power of the group */ 3394 return 1;
3191 avg_load = sg_div_cpu_power(group,
3192 avg_load * SCHED_LOAD_SCALE);
3193 3395
3396}
3397#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3398static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3399 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3400{
3401 return;
3402}
3194 3403
3195 /* 3404static inline void update_sd_power_savings_stats(struct sched_group *group,
3196 * Consider the group unbalanced when the imbalance is larger 3405 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3197 * than the average weight of two tasks. 3406{
3198 * 3407 return;
3199 * APZ: with cgroup the avg task weight can vary wildly and 3408}
3200 * might not be a suitable number - should we keep a
3201 * normalized nr_running number somewhere that negates
3202 * the hierarchy?
3203 */
3204 avg_load_per_task = sg_div_cpu_power(group,
3205 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3206 3409
3207 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3410static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3208 __group_imb = 1; 3411 int this_cpu, unsigned long *imbalance)
3412{
3413 return 0;
3414}
3415#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3209 3416
3210 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3211 3417
3418/**
3419 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3420 * @group: sched_group whose statistics are to be updated.
3421 * @this_cpu: Cpu for which load balance is currently performed.
3422 * @idle: Idle status of this_cpu
3423 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3424 * @sd_idle: Idle status of the sched_domain containing group.
3425 * @local_group: Does group contain this_cpu.
3426 * @cpus: Set of cpus considered for load balancing.
3427 * @balance: Should we balance.
3428 * @sgs: variable to hold the statistics for this group.
3429 */
3430static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3431 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3432 int local_group, const struct cpumask *cpus,
3433 int *balance, struct sg_lb_stats *sgs)
3434{
3435 unsigned long load, max_cpu_load, min_cpu_load;
3436 int i;
3437 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3438 unsigned long sum_avg_load_per_task;
3439 unsigned long avg_load_per_task;
3440
3441 if (local_group)
3442 balance_cpu = group_first_cpu(group);
3443
3444 /* Tally up the load of all CPUs in the group */
3445 sum_avg_load_per_task = avg_load_per_task = 0;
3446 max_cpu_load = 0;
3447 min_cpu_load = ~0UL;
3448
3449 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3450 struct rq *rq = cpu_rq(i);
3451
3452 if (*sd_idle && rq->nr_running)
3453 *sd_idle = 0;
3454
3455 /* Bias balancing toward cpus of our domain */
3212 if (local_group) { 3456 if (local_group) {
3213 this_load = avg_load; 3457 if (idle_cpu(i) && !first_idle_cpu) {
3214 this = group; 3458 first_idle_cpu = 1;
3215 this_nr_running = sum_nr_running; 3459 balance_cpu = i;
3216 this_load_per_task = sum_weighted_load; 3460 }
3217 } else if (avg_load > max_load && 3461
3218 (sum_nr_running > group_capacity || __group_imb)) { 3462 load = target_load(i, load_idx);
3219 max_load = avg_load; 3463 } else {
3220 busiest = group; 3464 load = source_load(i, load_idx);
3221 busiest_nr_running = sum_nr_running; 3465 if (load > max_cpu_load)
3222 busiest_load_per_task = sum_weighted_load; 3466 max_cpu_load = load;
3223 group_imb = __group_imb; 3467 if (min_cpu_load > load)
3468 min_cpu_load = load;
3224 } 3469 }
3225 3470
3226#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3471 sgs->group_load += load;
3227 /* 3472 sgs->sum_nr_running += rq->nr_running;
3228 * Busy processors will not participate in power savings 3473 sgs->sum_weighted_load += weighted_cpuload(i);
3229 * balance.
3230 */
3231 if (idle == CPU_NOT_IDLE ||
3232 !(sd->flags & SD_POWERSAVINGS_BALANCE))
3233 goto group_next;
3234 3474
3235 /* 3475 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3236 * If the local group is idle or completely loaded 3476 }
3237 * no need to do power savings balance at this domain
3238 */
3239 if (local_group && (this_nr_running >= group_capacity ||
3240 !this_nr_running))
3241 power_savings_balance = 0;
3242 3477
3243 /* 3478 /*
3244 * If a group is already running at full capacity or idle, 3479 * First idle cpu or the first cpu(busiest) in this sched group
3245 * don't include that group in power savings calculations 3480 * is eligible for doing load balancing at this and above
3246 */ 3481 * domains. In the newly idle case, we will allow all the cpu's
3247 if (!power_savings_balance || sum_nr_running >= group_capacity 3482 * to do the newly idle load balance.
3248 || !sum_nr_running) 3483 */
3249 goto group_next; 3484 if (idle != CPU_NEWLY_IDLE && local_group &&
3485 balance_cpu != this_cpu && balance) {
3486 *balance = 0;
3487 return;
3488 }
3250 3489
3251 /* 3490 /* Adjust by relative CPU power of the group */
3252 * Calculate the group which has the least non-idle load. 3491 sgs->avg_load = sg_div_cpu_power(group,
3253 * This is the group from where we need to pick up the load 3492 sgs->group_load * SCHED_LOAD_SCALE);
3254 * for saving power
3255 */
3256 if ((sum_nr_running < min_nr_running) ||
3257 (sum_nr_running == min_nr_running &&
3258 cpumask_first(sched_group_cpus(group)) >
3259 cpumask_first(sched_group_cpus(group_min)))) {
3260 group_min = group;
3261 min_nr_running = sum_nr_running;
3262 min_load_per_task = sum_weighted_load /
3263 sum_nr_running;
3264 }
3265 3493
3266 /* 3494
3267 * Calculate the group which is almost near its 3495 /*
3268 * capacity but still has some space to pick up some load 3496 * Consider the group unbalanced when the imbalance is larger
3269 * from other group and save more power 3497 * than the average weight of two tasks.
3270 */ 3498 *
3271 if (sum_nr_running <= group_capacity - 1) { 3499 * APZ: with cgroup the avg task weight can vary wildly and
3272 if (sum_nr_running > leader_nr_running || 3500 * might not be a suitable number - should we keep a
3273 (sum_nr_running == leader_nr_running && 3501 * normalized nr_running number somewhere that negates
3274 cpumask_first(sched_group_cpus(group)) < 3502 * the hierarchy?
3275 cpumask_first(sched_group_cpus(group_leader)))) { 3503 */
3276 group_leader = group; 3504 avg_load_per_task = sg_div_cpu_power(group,
3277 leader_nr_running = sum_nr_running; 3505 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3278 } 3506
3507 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3508 sgs->group_imb = 1;
3509
3510 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3511
3512}
3513
3514/**
3515 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3516 * @sd: sched_domain whose statistics are to be updated.
3517 * @this_cpu: Cpu for which load balance is currently performed.
3518 * @idle: Idle status of this_cpu
3519 * @sd_idle: Idle status of the sched_domain containing group.
3520 * @cpus: Set of cpus considered for load balancing.
3521 * @balance: Should we balance.
3522 * @sds: variable to hold the statistics for this sched_domain.
3523 */
3524static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3525 enum cpu_idle_type idle, int *sd_idle,
3526 const struct cpumask *cpus, int *balance,
3527 struct sd_lb_stats *sds)
3528{
3529 struct sched_group *group = sd->groups;
3530 struct sg_lb_stats sgs;
3531 int load_idx;
3532
3533 init_sd_power_savings_stats(sd, sds, idle);
3534 load_idx = get_sd_load_idx(sd, idle);
3535
3536 do {
3537 int local_group;
3538
3539 local_group = cpumask_test_cpu(this_cpu,
3540 sched_group_cpus(group));
3541 memset(&sgs, 0, sizeof(sgs));
3542 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
3543 local_group, cpus, balance, &sgs);
3544
3545 if (local_group && balance && !(*balance))
3546 return;
3547
3548 sds->total_load += sgs.group_load;
3549 sds->total_pwr += group->__cpu_power;
3550
3551 if (local_group) {
3552 sds->this_load = sgs.avg_load;
3553 sds->this = group;
3554 sds->this_nr_running = sgs.sum_nr_running;
3555 sds->this_load_per_task = sgs.sum_weighted_load;
3556 } else if (sgs.avg_load > sds->max_load &&
3557 (sgs.sum_nr_running > sgs.group_capacity ||
3558 sgs.group_imb)) {
3559 sds->max_load = sgs.avg_load;
3560 sds->busiest = group;
3561 sds->busiest_nr_running = sgs.sum_nr_running;
3562 sds->busiest_load_per_task = sgs.sum_weighted_load;
3563 sds->group_imb = sgs.group_imb;
3279 } 3564 }
3280group_next: 3565
3281#endif 3566 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3282 group = group->next; 3567 group = group->next;
3283 } while (group != sd->groups); 3568 } while (group != sd->groups);
3284 3569
3285 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 3570}
3286 goto out_balanced;
3287
3288 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3289 3571
3290 if (this_load >= avg_load || 3572/**
3291 100*max_load <= sd->imbalance_pct*this_load) 3573 * fix_small_imbalance - Calculate the minor imbalance that exists
3292 goto out_balanced; 3574 * amongst the groups of a sched_domain, during
3575 * load balancing.
3576 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3577 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3578 * @imbalance: Variable to store the imbalance.
3579 */
3580static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3581 int this_cpu, unsigned long *imbalance)
3582{
3583 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3584 unsigned int imbn = 2;
3585
3586 if (sds->this_nr_running) {
3587 sds->this_load_per_task /= sds->this_nr_running;
3588 if (sds->busiest_load_per_task >
3589 sds->this_load_per_task)
3590 imbn = 1;
3591 } else
3592 sds->this_load_per_task =
3593 cpu_avg_load_per_task(this_cpu);
3293 3594
3294 busiest_load_per_task /= busiest_nr_running; 3595 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3295 if (group_imb) 3596 sds->busiest_load_per_task * imbn) {
3296 busiest_load_per_task = min(busiest_load_per_task, avg_load); 3597 *imbalance = sds->busiest_load_per_task;
3598 return;
3599 }
3297 3600
3298 /* 3601 /*
3299 * We're trying to get all the cpus to the average_load, so we don't 3602 * OK, we don't have enough imbalance to justify moving tasks,
3300 * want to push ourselves above the average load, nor do we wish to 3603 * however we may be able to increase total CPU power used by
3301 * reduce the max loaded cpu below the average load, as either of these 3604 * moving them.
3302 * actions would just result in more rebalancing later, and ping-pong
3303 * tasks around. Thus we look for the minimum possible imbalance.
3304 * Negative imbalances (*we* are more loaded than anyone else) will
3305 * be counted as no imbalance for these purposes -- we can't fix that
3306 * by pulling tasks to us. Be careful of negative numbers as they'll
3307 * appear as very large values with unsigned longs.
3308 */ 3605 */
3309 if (max_load <= busiest_load_per_task)
3310 goto out_balanced;
3311 3606
3607 pwr_now += sds->busiest->__cpu_power *
3608 min(sds->busiest_load_per_task, sds->max_load);
3609 pwr_now += sds->this->__cpu_power *
3610 min(sds->this_load_per_task, sds->this_load);
3611 pwr_now /= SCHED_LOAD_SCALE;
3612
3613 /* Amount of load we'd subtract */
3614 tmp = sg_div_cpu_power(sds->busiest,
3615 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3616 if (sds->max_load > tmp)
3617 pwr_move += sds->busiest->__cpu_power *
3618 min(sds->busiest_load_per_task, sds->max_load - tmp);
3619
3620 /* Amount of load we'd add */
3621 if (sds->max_load * sds->busiest->__cpu_power <
3622 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3623 tmp = sg_div_cpu_power(sds->this,
3624 sds->max_load * sds->busiest->__cpu_power);
3625 else
3626 tmp = sg_div_cpu_power(sds->this,
3627 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3628 pwr_move += sds->this->__cpu_power *
3629 min(sds->this_load_per_task, sds->this_load + tmp);
3630 pwr_move /= SCHED_LOAD_SCALE;
3631
3632 /* Move if we gain throughput */
3633 if (pwr_move > pwr_now)
3634 *imbalance = sds->busiest_load_per_task;
3635}
3636
3637/**
3638 * calculate_imbalance - Calculate the amount of imbalance present within the
3639 * groups of a given sched_domain during load balance.
3640 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3641 * @this_cpu: Cpu for which currently load balance is being performed.
3642 * @imbalance: The variable to store the imbalance.
3643 */
3644static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3645 unsigned long *imbalance)
3646{
3647 unsigned long max_pull;
3312 /* 3648 /*
3313 * In the presence of smp nice balancing, certain scenarios can have 3649 * In the presence of smp nice balancing, certain scenarios can have
3314 * max load less than avg load(as we skip the groups at or below 3650 * max load less than avg load(as we skip the groups at or below
3315 * its cpu_power, while calculating max_load..) 3651 * its cpu_power, while calculating max_load..)
3316 */ 3652 */
3317 if (max_load < avg_load) { 3653 if (sds->max_load < sds->avg_load) {
3318 *imbalance = 0; 3654 *imbalance = 0;
3319 goto small_imbalance; 3655 return fix_small_imbalance(sds, this_cpu, imbalance);
3320 } 3656 }
3321 3657
3322 /* Don't want to pull so many tasks that a group would go idle */ 3658 /* Don't want to pull so many tasks that a group would go idle */
3323 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 3659 max_pull = min(sds->max_load - sds->avg_load,
3660 sds->max_load - sds->busiest_load_per_task);
3324 3661
3325 /* How much load to actually move to equalise the imbalance */ 3662 /* How much load to actually move to equalise the imbalance */
3326 *imbalance = min(max_pull * busiest->__cpu_power, 3663 *imbalance = min(max_pull * sds->busiest->__cpu_power,
3327 (avg_load - this_load) * this->__cpu_power) 3664 (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
3328 / SCHED_LOAD_SCALE; 3665 / SCHED_LOAD_SCALE;
3329 3666
3330 /* 3667 /*
@@ -3333,78 +3670,110 @@ group_next:
3333 * a think about bumping its value to force at least one task to be 3670 * a think about bumping its value to force at least one task to be
3334 * moved 3671 * moved
3335 */ 3672 */
3336 if (*imbalance < busiest_load_per_task) { 3673 if (*imbalance < sds->busiest_load_per_task)
3337 unsigned long tmp, pwr_now, pwr_move; 3674 return fix_small_imbalance(sds, this_cpu, imbalance);
3338 unsigned int imbn;
3339
3340small_imbalance:
3341 pwr_move = pwr_now = 0;
3342 imbn = 2;
3343 if (this_nr_running) {
3344 this_load_per_task /= this_nr_running;
3345 if (busiest_load_per_task > this_load_per_task)
3346 imbn = 1;
3347 } else
3348 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3349 3675
3350 if (max_load - this_load + busiest_load_per_task >= 3676}
3351 busiest_load_per_task * imbn) { 3677/******* find_busiest_group() helpers end here *********************/
3352 *imbalance = busiest_load_per_task;
3353 return busiest;
3354 }
3355 3678
3356 /* 3679/**
3357 * OK, we don't have enough imbalance to justify moving tasks, 3680 * find_busiest_group - Returns the busiest group within the sched_domain
3358 * however we may be able to increase total CPU power used by 3681 * if there is an imbalance. If there isn't an imbalance, and
3359 * moving them. 3682 * the user has opted for power-savings, it returns a group whose
3360 */ 3683 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3684 * such a group exists.
3685 *
3686 * Also calculates the amount of weighted load which should be moved
3687 * to restore balance.
3688 *
3689 * @sd: The sched_domain whose busiest group is to be returned.
3690 * @this_cpu: The cpu for which load balancing is currently being performed.
3691 * @imbalance: Variable which stores amount of weighted load which should
3692 * be moved to restore balance/put a group to idle.
3693 * @idle: The idle status of this_cpu.
3694 * @sd_idle: The idleness of sd
3695 * @cpus: The set of CPUs under consideration for load-balancing.
3696 * @balance: Pointer to a variable indicating if this_cpu
3697 * is the appropriate cpu to perform load balancing at this_level.
3698 *
3699 * Returns: - the busiest group if imbalance exists.
3700 * - If no imbalance and user has opted for power-savings balance,
3701 * return the least loaded group whose CPUs can be
3702 * put to idle by rebalancing its tasks onto our group.
3703 */
3704static struct sched_group *
3705find_busiest_group(struct sched_domain *sd, int this_cpu,
3706 unsigned long *imbalance, enum cpu_idle_type idle,
3707 int *sd_idle, const struct cpumask *cpus, int *balance)
3708{
3709 struct sd_lb_stats sds;
3361 3710
3362 pwr_now += busiest->__cpu_power * 3711 memset(&sds, 0, sizeof(sds));
3363 min(busiest_load_per_task, max_load);
3364 pwr_now += this->__cpu_power *
3365 min(this_load_per_task, this_load);
3366 pwr_now /= SCHED_LOAD_SCALE;
3367
3368 /* Amount of load we'd subtract */
3369 tmp = sg_div_cpu_power(busiest,
3370 busiest_load_per_task * SCHED_LOAD_SCALE);
3371 if (max_load > tmp)
3372 pwr_move += busiest->__cpu_power *
3373 min(busiest_load_per_task, max_load - tmp);
3374
3375 /* Amount of load we'd add */
3376 if (max_load * busiest->__cpu_power <
3377 busiest_load_per_task * SCHED_LOAD_SCALE)
3378 tmp = sg_div_cpu_power(this,
3379 max_load * busiest->__cpu_power);
3380 else
3381 tmp = sg_div_cpu_power(this,
3382 busiest_load_per_task * SCHED_LOAD_SCALE);
3383 pwr_move += this->__cpu_power *
3384 min(this_load_per_task, this_load + tmp);
3385 pwr_move /= SCHED_LOAD_SCALE;
3386 3712
3387 /* Move if we gain throughput */ 3713 /*
3388 if (pwr_move > pwr_now) 3714 * Compute the various statistics relavent for load balancing at
3389 *imbalance = busiest_load_per_task; 3715 * this level.
3390 } 3716 */
3717 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3718 balance, &sds);
3719
3720 /* Cases where imbalance does not exist from POV of this_cpu */
3721 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3722 * at this level.
3723 * 2) There is no busy sibling group to pull from.
3724 * 3) This group is the busiest group.
3725 * 4) This group is more busy than the avg busieness at this
3726 * sched_domain.
3727 * 5) The imbalance is within the specified limit.
3728 * 6) Any rebalance would lead to ping-pong
3729 */
3730 if (balance && !(*balance))
3731 goto ret;
3391 3732
3392 return busiest; 3733 if (!sds.busiest || sds.busiest_nr_running == 0)
3734 goto out_balanced;
3393 3735
3394out_balanced: 3736 if (sds.this_load >= sds.max_load)
3395#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3737 goto out_balanced;
3396 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3397 goto ret;
3398 3738
3399 if (this == group_leader && group_leader != group_min) { 3739 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3400 *imbalance = min_load_per_task; 3740
3401 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { 3741 if (sds.this_load >= sds.avg_load)
3402 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = 3742 goto out_balanced;
3403 cpumask_first(sched_group_cpus(group_leader)); 3743
3404 } 3744 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3405 return group_min; 3745 goto out_balanced;
3406 } 3746
3407#endif 3747 sds.busiest_load_per_task /= sds.busiest_nr_running;
3748 if (sds.group_imb)
3749 sds.busiest_load_per_task =
3750 min(sds.busiest_load_per_task, sds.avg_load);
3751
3752 /*
3753 * We're trying to get all the cpus to the average_load, so we don't
3754 * want to push ourselves above the average load, nor do we wish to
3755 * reduce the max loaded cpu below the average load, as either of these
3756 * actions would just result in more rebalancing later, and ping-pong
3757 * tasks around. Thus we look for the minimum possible imbalance.
3758 * Negative imbalances (*we* are more loaded than anyone else) will
3759 * be counted as no imbalance for these purposes -- we can't fix that
3760 * by pulling tasks to us. Be careful of negative numbers as they'll
3761 * appear as very large values with unsigned longs.
3762 */
3763 if (sds.max_load <= sds.busiest_load_per_task)
3764 goto out_balanced;
3765
3766 /* Looks like there is an imbalance. Compute it */
3767 calculate_imbalance(&sds, this_cpu, imbalance);
3768 return sds.busiest;
3769
3770out_balanced:
3771 /*
3772 * There is no obvious imbalance. But check if we can do some balancing
3773 * to save power.
3774 */
3775 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3776 return sds.busiest;
3408ret: 3777ret:
3409 *imbalance = 0; 3778 *imbalance = 0;
3410 return NULL; 3779 return NULL;
@@ -3880,19 +4249,24 @@ int select_nohz_load_balancer(int stop_tick)
3880 int cpu = smp_processor_id(); 4249 int cpu = smp_processor_id();
3881 4250
3882 if (stop_tick) { 4251 if (stop_tick) {
3883 cpumask_set_cpu(cpu, nohz.cpu_mask);
3884 cpu_rq(cpu)->in_nohz_recently = 1; 4252 cpu_rq(cpu)->in_nohz_recently = 1;
3885 4253
3886 /* 4254 if (!cpu_active(cpu)) {
3887 * If we are going offline and still the leader, give up! 4255 if (atomic_read(&nohz.load_balancer) != cpu)
3888 */ 4256 return 0;
3889 if (!cpu_active(cpu) && 4257
3890 atomic_read(&nohz.load_balancer) == cpu) { 4258 /*
4259 * If we are going offline and still the leader,
4260 * give up!
4261 */
3891 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 4262 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3892 BUG(); 4263 BUG();
4264
3893 return 0; 4265 return 0;
3894 } 4266 }
3895 4267
4268 cpumask_set_cpu(cpu, nohz.cpu_mask);
4269
3896 /* time for ilb owner also to sleep */ 4270 /* time for ilb owner also to sleep */
3897 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4271 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3898 if (atomic_read(&nohz.load_balancer) == cpu) 4272 if (atomic_read(&nohz.load_balancer) == cpu)
@@ -4052,6 +4426,11 @@ static void run_rebalance_domains(struct softirq_action *h)
4052#endif 4426#endif
4053} 4427}
4054 4428
4429static inline int on_null_domain(int cpu)
4430{
4431 return !rcu_dereference(cpu_rq(cpu)->sd);
4432}
4433
4055/* 4434/*
4056 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 4435 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4057 * 4436 *
@@ -4109,7 +4488,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4109 cpumask_test_cpu(cpu, nohz.cpu_mask)) 4488 cpumask_test_cpu(cpu, nohz.cpu_mask))
4110 return; 4489 return;
4111#endif 4490#endif
4112 if (time_after_eq(jiffies, rq->next_balance)) 4491 /* Don't need to rebalance while attached to NULL domain */
4492 if (time_after_eq(jiffies, rq->next_balance) &&
4493 likely(!on_null_domain(cpu)))
4113 raise_softirq(SCHED_SOFTIRQ); 4494 raise_softirq(SCHED_SOFTIRQ);
4114} 4495}
4115 4496
@@ -4503,11 +4884,33 @@ static inline void schedule_debug(struct task_struct *prev)
4503#endif 4884#endif
4504} 4885}
4505 4886
4887static void put_prev_task(struct rq *rq, struct task_struct *prev)
4888{
4889 if (prev->state == TASK_RUNNING) {
4890 u64 runtime = prev->se.sum_exec_runtime;
4891
4892 runtime -= prev->se.prev_sum_exec_runtime;
4893 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
4894
4895 /*
4896 * In order to avoid avg_overlap growing stale when we are
4897 * indeed overlapping and hence not getting put to sleep, grow
4898 * the avg_overlap on preemption.
4899 *
4900 * We use the average preemption runtime because that
4901 * correlates to the amount of cache footprint a task can
4902 * build up.
4903 */
4904 update_avg(&prev->se.avg_overlap, runtime);
4905 }
4906 prev->sched_class->put_prev_task(rq, prev);
4907}
4908
4506/* 4909/*
4507 * Pick up the highest-prio task: 4910 * Pick up the highest-prio task:
4508 */ 4911 */
4509static inline struct task_struct * 4912static inline struct task_struct *
4510pick_next_task(struct rq *rq, struct task_struct *prev) 4913pick_next_task(struct rq *rq)
4511{ 4914{
4512 const struct sched_class *class; 4915 const struct sched_class *class;
4513 struct task_struct *p; 4916 struct task_struct *p;
@@ -4581,8 +4984,8 @@ need_resched_nonpreemptible:
4581 if (unlikely(!rq->nr_running)) 4984 if (unlikely(!rq->nr_running))
4582 idle_balance(cpu, rq); 4985 idle_balance(cpu, rq);
4583 4986
4584 prev->sched_class->put_prev_task(rq, prev); 4987 put_prev_task(rq, prev);
4585 next = pick_next_task(rq, prev); 4988 next = pick_next_task(rq);
4586 4989
4587 if (likely(prev != next)) { 4990 if (likely(prev != next)) {
4588 sched_info_switch(prev, next); 4991 sched_info_switch(prev, next);
@@ -4637,7 +5040,7 @@ asmlinkage void __sched preempt_schedule(void)
4637 * between schedule and now. 5040 * between schedule and now.
4638 */ 5041 */
4639 barrier(); 5042 barrier();
4640 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5043 } while (need_resched());
4641} 5044}
4642EXPORT_SYMBOL(preempt_schedule); 5045EXPORT_SYMBOL(preempt_schedule);
4643 5046
@@ -4666,7 +5069,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
4666 * between schedule and now. 5069 * between schedule and now.
4667 */ 5070 */
4668 barrier(); 5071 barrier();
4669 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5072 } while (need_resched());
4670} 5073}
4671 5074
4672#endif /* CONFIG_PREEMPT */ 5075#endif /* CONFIG_PREEMPT */
@@ -4687,8 +5090,8 @@ EXPORT_SYMBOL(default_wake_function);
4687 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 5090 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4688 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5091 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4689 */ 5092 */
4690static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5093void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4691 int nr_exclusive, int sync, void *key) 5094 int nr_exclusive, int sync, void *key)
4692{ 5095{
4693 wait_queue_t *curr, *next; 5096 wait_queue_t *curr, *next;
4694 5097
@@ -5140,7 +5543,7 @@ SYSCALL_DEFINE1(nice, int, increment)
5140 if (increment > 40) 5543 if (increment > 40)
5141 increment = 40; 5544 increment = 40;
5142 5545
5143 nice = PRIO_TO_NICE(current->static_prio) + increment; 5546 nice = TASK_NICE(current) + increment;
5144 if (nice < -20) 5547 if (nice < -20)
5145 nice = -20; 5548 nice = -20;
5146 if (nice > 19) 5549 if (nice > 19)
@@ -6418,7 +6821,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6418 if (!rq->nr_running) 6821 if (!rq->nr_running)
6419 break; 6822 break;
6420 update_rq_clock(rq); 6823 update_rq_clock(rq);
6421 next = pick_next_task(rq, rq->curr); 6824 next = pick_next_task(rq);
6422 if (!next) 6825 if (!next)
6423 break; 6826 break;
6424 next->sched_class->put_prev_task(rq, next); 6827 next->sched_class->put_prev_task(rq, next);
@@ -6939,20 +7342,26 @@ static void free_rootdomain(struct root_domain *rd)
6939 7342
6940static void rq_attach_root(struct rq *rq, struct root_domain *rd) 7343static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6941{ 7344{
7345 struct root_domain *old_rd = NULL;
6942 unsigned long flags; 7346 unsigned long flags;
6943 7347
6944 spin_lock_irqsave(&rq->lock, flags); 7348 spin_lock_irqsave(&rq->lock, flags);
6945 7349
6946 if (rq->rd) { 7350 if (rq->rd) {
6947 struct root_domain *old_rd = rq->rd; 7351 old_rd = rq->rd;
6948 7352
6949 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 7353 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6950 set_rq_offline(rq); 7354 set_rq_offline(rq);
6951 7355
6952 cpumask_clear_cpu(rq->cpu, old_rd->span); 7356 cpumask_clear_cpu(rq->cpu, old_rd->span);
6953 7357
6954 if (atomic_dec_and_test(&old_rd->refcount)) 7358 /*
6955 free_rootdomain(old_rd); 7359 * If we dont want to free the old_rt yet then
7360 * set old_rd to NULL to skip the freeing later
7361 * in this function:
7362 */
7363 if (!atomic_dec_and_test(&old_rd->refcount))
7364 old_rd = NULL;
6956 } 7365 }
6957 7366
6958 atomic_inc(&rd->refcount); 7367 atomic_inc(&rd->refcount);
@@ -6963,6 +7372,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6963 set_rq_online(rq); 7372 set_rq_online(rq);
6964 7373
6965 spin_unlock_irqrestore(&rq->lock, flags); 7374 spin_unlock_irqrestore(&rq->lock, flags);
7375
7376 if (old_rd)
7377 free_rootdomain(old_rd);
6966} 7378}
6967 7379
6968static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7380static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
@@ -8204,11 +8616,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8204 __set_bit(MAX_RT_PRIO, array->bitmap); 8616 __set_bit(MAX_RT_PRIO, array->bitmap);
8205 8617
8206#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 8618#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8207 rt_rq->highest_prio = MAX_RT_PRIO; 8619 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8620#ifdef CONFIG_SMP
8621 rt_rq->highest_prio.next = MAX_RT_PRIO;
8622#endif
8208#endif 8623#endif
8209#ifdef CONFIG_SMP 8624#ifdef CONFIG_SMP
8210 rt_rq->rt_nr_migratory = 0; 8625 rt_rq->rt_nr_migratory = 0;
8211 rt_rq->overloaded = 0; 8626 rt_rq->overloaded = 0;
8627 plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
8212#endif 8628#endif
8213 8629
8214 rt_rq->rt_time = 0; 8630 rt_rq->rt_time = 0;
@@ -9210,6 +9626,16 @@ static int sched_rt_global_constraints(void)
9210 9626
9211 return ret; 9627 return ret;
9212} 9628}
9629
9630int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
9631{
9632 /* Don't accept realtime tasks when there is no way for them to run */
9633 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
9634 return 0;
9635
9636 return 1;
9637}
9638
9213#else /* !CONFIG_RT_GROUP_SCHED */ 9639#else /* !CONFIG_RT_GROUP_SCHED */
9214static int sched_rt_global_constraints(void) 9640static int sched_rt_global_constraints(void)
9215{ 9641{
@@ -9303,8 +9729,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9303 struct task_struct *tsk) 9729 struct task_struct *tsk)
9304{ 9730{
9305#ifdef CONFIG_RT_GROUP_SCHED 9731#ifdef CONFIG_RT_GROUP_SCHED
9306 /* Don't accept realtime tasks when there is no way for them to run */ 9732 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
9307 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
9308 return -EINVAL; 9733 return -EINVAL;
9309#else 9734#else
9310 /* We don't support RT-tasks being in separate groups */ 9735 /* We don't support RT-tasks being in separate groups */
@@ -9575,7 +10000,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9575 struct cpuacct *ca; 10000 struct cpuacct *ca;
9576 int cpu; 10001 int cpu;
9577 10002
9578 if (!cpuacct_subsys.active) 10003 if (unlikely(!cpuacct_subsys.active))
9579 return; 10004 return;
9580 10005
9581 cpu = task_cpu(tsk); 10006 cpu = task_cpu(tsk);
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index a0b0852414cc..390f33234bd0 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -24,11 +24,11 @@
24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
25 * consistent between cpus (never more than 2 jiffies difference). 25 * consistent between cpus (never more than 2 jiffies difference).
26 */ 26 */
27#include <linux/sched.h>
28#include <linux/percpu.h>
29#include <linux/spinlock.h> 27#include <linux/spinlock.h>
30#include <linux/ktime.h>
31#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/percpu.h>
30#include <linux/ktime.h>
31#include <linux/sched.h>
32 32
33/* 33/*
34 * Scheduler clock - returns current time in nanosec units. 34 * Scheduler clock - returns current time in nanosec units.
@@ -43,6 +43,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
43static __read_mostly int sched_clock_running; 43static __read_mostly int sched_clock_running;
44 44
45#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 45#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
46__read_mostly int sched_clock_stable;
46 47
47struct sched_clock_data { 48struct sched_clock_data {
48 /* 49 /*
@@ -87,7 +88,7 @@ void sched_clock_init(void)
87} 88}
88 89
89/* 90/*
90 * min,max except they take wrapping into account 91 * min, max except they take wrapping into account
91 */ 92 */
92 93
93static inline u64 wrap_min(u64 x, u64 y) 94static inline u64 wrap_min(u64 x, u64 y)
@@ -111,15 +112,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
111 s64 delta = now - scd->tick_raw; 112 s64 delta = now - scd->tick_raw;
112 u64 clock, min_clock, max_clock; 113 u64 clock, min_clock, max_clock;
113 114
114 WARN_ON_ONCE(!irqs_disabled());
115
116 if (unlikely(delta < 0)) 115 if (unlikely(delta < 0))
117 delta = 0; 116 delta = 0;
118 117
119 /* 118 /*
120 * scd->clock = clamp(scd->tick_gtod + delta, 119 * scd->clock = clamp(scd->tick_gtod + delta,
121 * max(scd->tick_gtod, scd->clock), 120 * max(scd->tick_gtod, scd->clock),
122 * scd->tick_gtod + TICK_NSEC); 121 * scd->tick_gtod + TICK_NSEC);
123 */ 122 */
124 123
125 clock = scd->tick_gtod + delta; 124 clock = scd->tick_gtod + delta;
@@ -148,12 +147,13 @@ static void lock_double_clock(struct sched_clock_data *data1,
148 147
149u64 sched_clock_cpu(int cpu) 148u64 sched_clock_cpu(int cpu)
150{ 149{
151 struct sched_clock_data *scd = cpu_sdc(cpu);
152 u64 now, clock, this_clock, remote_clock; 150 u64 now, clock, this_clock, remote_clock;
151 struct sched_clock_data *scd;
153 152
154 if (unlikely(!sched_clock_running)) 153 if (sched_clock_stable)
155 return 0ull; 154 return sched_clock();
156 155
156 scd = cpu_sdc(cpu);
157 WARN_ON_ONCE(!irqs_disabled()); 157 WARN_ON_ONCE(!irqs_disabled());
158 now = sched_clock(); 158 now = sched_clock();
159 159
@@ -195,14 +195,18 @@ u64 sched_clock_cpu(int cpu)
195 195
196void sched_clock_tick(void) 196void sched_clock_tick(void)
197{ 197{
198 struct sched_clock_data *scd = this_scd(); 198 struct sched_clock_data *scd;
199 u64 now, now_gtod; 199 u64 now, now_gtod;
200 200
201 if (sched_clock_stable)
202 return;
203
201 if (unlikely(!sched_clock_running)) 204 if (unlikely(!sched_clock_running))
202 return; 205 return;
203 206
204 WARN_ON_ONCE(!irqs_disabled()); 207 WARN_ON_ONCE(!irqs_disabled());
205 208
209 scd = this_scd();
206 now_gtod = ktime_to_ns(ktime_get()); 210 now_gtod = ktime_to_ns(ktime_get());
207 now = sched_clock(); 211 now = sched_clock();
208 212
@@ -250,7 +254,7 @@ u64 sched_clock_cpu(int cpu)
250 return sched_clock(); 254 return sched_clock();
251} 255}
252 256
253#endif 257#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
254 258
255unsigned long long cpu_clock(int cpu) 259unsigned long long cpu_clock(int cpu)
256{ 260{
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 16eeba4e4169..467ca72f1657 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -272,7 +272,6 @@ static void print_cpu(struct seq_file *m, int cpu)
272 P(nr_switches); 272 P(nr_switches);
273 P(nr_load_updates); 273 P(nr_load_updates);
274 P(nr_uninterruptible); 274 P(nr_uninterruptible);
275 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
276 PN(next_balance); 275 PN(next_balance);
277 P(curr->pid); 276 P(curr->pid);
278 PN(clock); 277 PN(clock);
@@ -287,9 +286,6 @@ static void print_cpu(struct seq_file *m, int cpu)
287#ifdef CONFIG_SCHEDSTATS 286#ifdef CONFIG_SCHEDSTATS
288#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
289 288
290 P(yld_exp_empty);
291 P(yld_act_empty);
292 P(yld_both_empty);
293 P(yld_count); 289 P(yld_count);
294 290
295 P(sched_switch); 291 P(sched_switch);
@@ -314,7 +310,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
314 u64 now = ktime_to_ns(ktime_get()); 310 u64 now = ktime_to_ns(ktime_get());
315 int cpu; 311 int cpu;
316 312
317 SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n", 313 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
318 init_utsname()->release, 314 init_utsname()->release,
319 (int)strcspn(init_utsname()->version, " "), 315 (int)strcspn(init_utsname()->version, " "),
320 init_utsname()->version); 316 init_utsname()->version);
@@ -325,6 +321,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
325 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 321 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
326#define PN(x) \ 322#define PN(x) \
327 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 323 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
324 P(jiffies);
328 PN(sysctl_sched_latency); 325 PN(sysctl_sched_latency);
329 PN(sysctl_sched_min_granularity); 326 PN(sysctl_sched_min_granularity);
330 PN(sysctl_sched_wakeup_granularity); 327 PN(sysctl_sched_wakeup_granularity);
@@ -397,6 +394,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
397 PN(se.vruntime); 394 PN(se.vruntime);
398 PN(se.sum_exec_runtime); 395 PN(se.sum_exec_runtime);
399 PN(se.avg_overlap); 396 PN(se.avg_overlap);
397 PN(se.avg_wakeup);
400 398
401 nr_switches = p->nvcsw + p->nivcsw; 399 nr_switches = p->nvcsw + p->nivcsw;
402 400
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5cc1c162044f..3816f217f119 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -719,7 +719,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
719 __enqueue_entity(cfs_rq, se); 719 __enqueue_entity(cfs_rq, se);
720} 720}
721 721
722static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 722static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
723{ 723{
724 if (cfs_rq->last == se) 724 if (cfs_rq->last == se)
725 cfs_rq->last = NULL; 725 cfs_rq->last = NULL;
@@ -728,6 +728,12 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
728 cfs_rq->next = NULL; 728 cfs_rq->next = NULL;
729} 729}
730 730
731static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
732{
733 for_each_sched_entity(se)
734 __clear_buddies(cfs_rq_of(se), se);
735}
736
731static void 737static void
732dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 738dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
733{ 739{
@@ -768,8 +774,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
768 774
769 ideal_runtime = sched_slice(cfs_rq, curr); 775 ideal_runtime = sched_slice(cfs_rq, curr);
770 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 776 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
771 if (delta_exec > ideal_runtime) 777 if (delta_exec > ideal_runtime) {
772 resched_task(rq_of(cfs_rq)->curr); 778 resched_task(rq_of(cfs_rq)->curr);
779 /*
780 * The current task ran long enough, ensure it doesn't get
781 * re-elected due to buddy favours.
782 */
783 clear_buddies(cfs_rq, curr);
784 }
773} 785}
774 786
775static void 787static void
@@ -1302,16 +1314,63 @@ out:
1302} 1314}
1303#endif /* CONFIG_SMP */ 1315#endif /* CONFIG_SMP */
1304 1316
1305static unsigned long wakeup_gran(struct sched_entity *se) 1317/*
1318 * Adaptive granularity
1319 *
1320 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1321 * with the limit of wakeup_gran -- when it never does a wakeup.
1322 *
1323 * So the smaller avg_wakeup is the faster we want this task to preempt,
1324 * but we don't want to treat the preemptee unfairly and therefore allow it
1325 * to run for at least the amount of time we'd like to run.
1326 *
1327 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1328 *
1329 * NOTE: we use *nr_running to scale with load, this nicely matches the
1330 * degrading latency on load.
1331 */
1332static unsigned long
1333adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1334{
1335 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1336 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1337 u64 gran = 0;
1338
1339 if (this_run < expected_wakeup)
1340 gran = expected_wakeup - this_run;
1341
1342 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1343}
1344
1345static unsigned long
1346wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1306{ 1347{
1307 unsigned long gran = sysctl_sched_wakeup_granularity; 1348 unsigned long gran = sysctl_sched_wakeup_granularity;
1308 1349
1350 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1351 gran = adaptive_gran(curr, se);
1352
1309 /* 1353 /*
1310 * More easily preempt - nice tasks, while not making it harder for 1354 * Since its curr running now, convert the gran from real-time
1311 * + nice tasks. 1355 * to virtual-time in his units.
1312 */ 1356 */
1313 if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD) 1357 if (sched_feat(ASYM_GRAN)) {
1314 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); 1358 /*
1359 * By using 'se' instead of 'curr' we penalize light tasks, so
1360 * they get preempted easier. That is, if 'se' < 'curr' then
1361 * the resulting gran will be larger, therefore penalizing the
1362 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1363 * be smaller, again penalizing the lighter task.
1364 *
1365 * This is especially important for buddies when the leftmost
1366 * task is higher priority than the buddy.
1367 */
1368 if (unlikely(se->load.weight != NICE_0_LOAD))
1369 gran = calc_delta_fair(gran, se);
1370 } else {
1371 if (unlikely(curr->load.weight != NICE_0_LOAD))
1372 gran = calc_delta_fair(gran, curr);
1373 }
1315 1374
1316 return gran; 1375 return gran;
1317} 1376}
@@ -1338,7 +1397,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1338 if (vdiff <= 0) 1397 if (vdiff <= 0)
1339 return -1; 1398 return -1;
1340 1399
1341 gran = wakeup_gran(curr); 1400 gran = wakeup_gran(curr, se);
1342 if (vdiff > gran) 1401 if (vdiff > gran)
1343 return 1; 1402 return 1;
1344 1403
@@ -1452,6 +1511,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1452 1511
1453 do { 1512 do {
1454 se = pick_next_entity(cfs_rq); 1513 se = pick_next_entity(cfs_rq);
1514 /*
1515 * If se was a buddy, clear it so that it will have to earn
1516 * the favour again.
1517 */
1518 __clear_buddies(cfs_rq, se);
1455 set_next_entity(cfs_rq, se); 1519 set_next_entity(cfs_rq, se);
1456 cfs_rq = group_cfs_rq(se); 1520 cfs_rq = group_cfs_rq(se);
1457 } while (cfs_rq); 1521 } while (cfs_rq);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index da5d93b5d2c6..76f61756e677 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,5 +1,6 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
2SCHED_FEAT(NORMALIZED_SLEEPER, 1) 2SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1)
3SCHED_FEAT(WAKEUP_PREEMPT, 1) 4SCHED_FEAT(WAKEUP_PREEMPT, 1)
4SCHED_FEAT(START_DEBIT, 1) 5SCHED_FEAT(START_DEBIT, 1)
5SCHED_FEAT(AFFINE_WAKEUPS, 1) 6SCHED_FEAT(AFFINE_WAKEUPS, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 954e1a81b796..c79dc7844012 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,40 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7{
8 return container_of(rt_se, struct task_struct, rt);
9}
10
11#ifdef CONFIG_RT_GROUP_SCHED
12
13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14{
15 return rt_rq->rq;
16}
17
18static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
19{
20 return rt_se->rt_rq;
21}
22
23#else /* CONFIG_RT_GROUP_SCHED */
24
25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26{
27 return container_of(rt_rq, struct rq, rt);
28}
29
30static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
31{
32 struct task_struct *p = rt_task_of(rt_se);
33 struct rq *rq = task_rq(p);
34
35 return &rq->rt;
36}
37
38#endif /* CONFIG_RT_GROUP_SCHED */
39
6#ifdef CONFIG_SMP 40#ifdef CONFIG_SMP
7 41
8static inline int rt_overloaded(struct rq *rq) 42static inline int rt_overloaded(struct rq *rq)
@@ -37,25 +71,69 @@ static inline void rt_clear_overload(struct rq *rq)
37 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); 71 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
38} 72}
39 73
40static void update_rt_migration(struct rq *rq) 74static void update_rt_migration(struct rt_rq *rt_rq)
41{ 75{
42 if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { 76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
43 if (!rq->rt.overloaded) { 77 if (!rt_rq->overloaded) {
44 rt_set_overload(rq); 78 rt_set_overload(rq_of_rt_rq(rt_rq));
45 rq->rt.overloaded = 1; 79 rt_rq->overloaded = 1;
46 } 80 }
47 } else if (rq->rt.overloaded) { 81 } else if (rt_rq->overloaded) {
48 rt_clear_overload(rq); 82 rt_clear_overload(rq_of_rt_rq(rt_rq));
49 rq->rt.overloaded = 0; 83 rt_rq->overloaded = 0;
50 } 84 }
51} 85}
52#endif /* CONFIG_SMP */
53 86
54static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88{
89 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++;
91
92 update_rt_migration(rt_rq);
93}
94
95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96{
97 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--;
99
100 update_rt_migration(rt_rq);
101}
102
103static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
104{
105 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
106 plist_node_init(&p->pushable_tasks, p->prio);
107 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
108}
109
110static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
111{
112 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
113}
114
115#else
116
117static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
55{ 118{
56 return container_of(rt_se, struct task_struct, rt);
57} 119}
58 120
121static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
122{
123}
124
125static inline
126void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
127{
128}
129
130static inline
131void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
132{
133}
134
135#endif /* CONFIG_SMP */
136
59static inline int on_rt_rq(struct sched_rt_entity *rt_se) 137static inline int on_rt_rq(struct sched_rt_entity *rt_se)
60{ 138{
61 return !list_empty(&rt_se->run_list); 139 return !list_empty(&rt_se->run_list);
@@ -79,16 +157,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
79#define for_each_leaf_rt_rq(rt_rq, rq) \ 157#define for_each_leaf_rt_rq(rt_rq, rq) \
80 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 158 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
81 159
82static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
83{
84 return rt_rq->rq;
85}
86
87static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
88{
89 return rt_se->rt_rq;
90}
91
92#define for_each_sched_rt_entity(rt_se) \ 160#define for_each_sched_rt_entity(rt_se) \
93 for (; rt_se; rt_se = rt_se->parent) 161 for (; rt_se; rt_se = rt_se->parent)
94 162
@@ -108,7 +176,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
108 if (rt_rq->rt_nr_running) { 176 if (rt_rq->rt_nr_running) {
109 if (rt_se && !on_rt_rq(rt_se)) 177 if (rt_se && !on_rt_rq(rt_se))
110 enqueue_rt_entity(rt_se); 178 enqueue_rt_entity(rt_se);
111 if (rt_rq->highest_prio < curr->prio) 179 if (rt_rq->highest_prio.curr < curr->prio)
112 resched_task(curr); 180 resched_task(curr);
113 } 181 }
114} 182}
@@ -176,19 +244,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
176#define for_each_leaf_rt_rq(rt_rq, rq) \ 244#define for_each_leaf_rt_rq(rt_rq, rq) \
177 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 245 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
178 246
179static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
180{
181 return container_of(rt_rq, struct rq, rt);
182}
183
184static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
185{
186 struct task_struct *p = rt_task_of(rt_se);
187 struct rq *rq = task_rq(p);
188
189 return &rq->rt;
190}
191
192#define for_each_sched_rt_entity(rt_se) \ 247#define for_each_sched_rt_entity(rt_se) \
193 for (; rt_se; rt_se = NULL) 248 for (; rt_se; rt_se = NULL)
194 249
@@ -473,7 +528,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
473 struct rt_rq *rt_rq = group_rt_rq(rt_se); 528 struct rt_rq *rt_rq = group_rt_rq(rt_se);
474 529
475 if (rt_rq) 530 if (rt_rq)
476 return rt_rq->highest_prio; 531 return rt_rq->highest_prio.curr;
477#endif 532#endif
478 533
479 return rt_task_of(rt_se)->prio; 534 return rt_task_of(rt_se)->prio;
@@ -547,91 +602,174 @@ static void update_curr_rt(struct rq *rq)
547 } 602 }
548} 603}
549 604
550static inline 605#if defined CONFIG_SMP
551void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 606
607static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
608
609static inline int next_prio(struct rq *rq)
552{ 610{
553 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 611 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
554 rt_rq->rt_nr_running++; 612
555#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 613 if (next && rt_prio(next->prio))
556 if (rt_se_prio(rt_se) < rt_rq->highest_prio) { 614 return next->prio;
557#ifdef CONFIG_SMP 615 else
558 struct rq *rq = rq_of_rt_rq(rt_rq); 616 return MAX_RT_PRIO;
559#endif 617}
618
619static void
620inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
621{
622 struct rq *rq = rq_of_rt_rq(rt_rq);
623
624 if (prio < prev_prio) {
625
626 /*
627 * If the new task is higher in priority than anything on the
628 * run-queue, we know that the previous high becomes our
629 * next-highest.
630 */
631 rt_rq->highest_prio.next = prev_prio;
560 632
561 rt_rq->highest_prio = rt_se_prio(rt_se);
562#ifdef CONFIG_SMP
563 if (rq->online) 633 if (rq->online)
564 cpupri_set(&rq->rd->cpupri, rq->cpu, 634 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
565 rt_se_prio(rt_se));
566#endif
567 }
568#endif
569#ifdef CONFIG_SMP
570 if (rt_se->nr_cpus_allowed > 1) {
571 struct rq *rq = rq_of_rt_rq(rt_rq);
572 635
573 rq->rt.rt_nr_migratory++; 636 } else if (prio == rt_rq->highest_prio.curr)
574 } 637 /*
638 * If the next task is equal in priority to the highest on
639 * the run-queue, then we implicitly know that the next highest
640 * task cannot be any lower than current
641 */
642 rt_rq->highest_prio.next = prio;
643 else if (prio < rt_rq->highest_prio.next)
644 /*
645 * Otherwise, we need to recompute next-highest
646 */
647 rt_rq->highest_prio.next = next_prio(rq);
648}
575 649
576 update_rt_migration(rq_of_rt_rq(rt_rq)); 650static void
577#endif 651dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
578#ifdef CONFIG_RT_GROUP_SCHED 652{
579 if (rt_se_boosted(rt_se)) 653 struct rq *rq = rq_of_rt_rq(rt_rq);
580 rt_rq->rt_nr_boosted++;
581 654
582 if (rt_rq->tg) 655 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
583 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); 656 rt_rq->highest_prio.next = next_prio(rq);
584#else 657
585 start_rt_bandwidth(&def_rt_bandwidth); 658 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
586#endif 659 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
587} 660}
588 661
662#else /* CONFIG_SMP */
663
589static inline 664static inline
590void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 665void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
591{ 666static inline
592#ifdef CONFIG_SMP 667void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
593 int highest_prio = rt_rq->highest_prio; 668
594#endif 669#endif /* CONFIG_SMP */
595 670
596 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
597 WARN_ON(!rt_rq->rt_nr_running);
598 rt_rq->rt_nr_running--;
599#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 671#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
672static void
673inc_rt_prio(struct rt_rq *rt_rq, int prio)
674{
675 int prev_prio = rt_rq->highest_prio.curr;
676
677 if (prio < prev_prio)
678 rt_rq->highest_prio.curr = prio;
679
680 inc_rt_prio_smp(rt_rq, prio, prev_prio);
681}
682
683static void
684dec_rt_prio(struct rt_rq *rt_rq, int prio)
685{
686 int prev_prio = rt_rq->highest_prio.curr;
687
600 if (rt_rq->rt_nr_running) { 688 if (rt_rq->rt_nr_running) {
601 struct rt_prio_array *array;
602 689
603 WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); 690 WARN_ON(prio < prev_prio);
604 if (rt_se_prio(rt_se) == rt_rq->highest_prio) { 691
605 /* recalculate */ 692 /*
606 array = &rt_rq->active; 693 * This may have been our highest task, and therefore
607 rt_rq->highest_prio = 694 * we may have some recomputation to do
695 */
696 if (prio == prev_prio) {
697 struct rt_prio_array *array = &rt_rq->active;
698
699 rt_rq->highest_prio.curr =
608 sched_find_first_bit(array->bitmap); 700 sched_find_first_bit(array->bitmap);
609 } /* otherwise leave rq->highest prio alone */ 701 }
702
610 } else 703 } else
611 rt_rq->highest_prio = MAX_RT_PRIO; 704 rt_rq->highest_prio.curr = MAX_RT_PRIO;
612#endif
613#ifdef CONFIG_SMP
614 if (rt_se->nr_cpus_allowed > 1) {
615 struct rq *rq = rq_of_rt_rq(rt_rq);
616 rq->rt.rt_nr_migratory--;
617 }
618 705
619 if (rt_rq->highest_prio != highest_prio) { 706 dec_rt_prio_smp(rt_rq, prio, prev_prio);
620 struct rq *rq = rq_of_rt_rq(rt_rq); 707}
621 708
622 if (rq->online) 709#else
623 cpupri_set(&rq->rd->cpupri, rq->cpu, 710
624 rt_rq->highest_prio); 711static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
625 } 712static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
713
714#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
626 715
627 update_rt_migration(rq_of_rt_rq(rt_rq));
628#endif /* CONFIG_SMP */
629#ifdef CONFIG_RT_GROUP_SCHED 716#ifdef CONFIG_RT_GROUP_SCHED
717
718static void
719inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
720{
721 if (rt_se_boosted(rt_se))
722 rt_rq->rt_nr_boosted++;
723
724 if (rt_rq->tg)
725 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
726}
727
728static void
729dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
730{
630 if (rt_se_boosted(rt_se)) 731 if (rt_se_boosted(rt_se))
631 rt_rq->rt_nr_boosted--; 732 rt_rq->rt_nr_boosted--;
632 733
633 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); 734 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
634#endif 735}
736
737#else /* CONFIG_RT_GROUP_SCHED */
738
739static void
740inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
741{
742 start_rt_bandwidth(&def_rt_bandwidth);
743}
744
745static inline
746void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
747
748#endif /* CONFIG_RT_GROUP_SCHED */
749
750static inline
751void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
752{
753 int prio = rt_se_prio(rt_se);
754
755 WARN_ON(!rt_prio(prio));
756 rt_rq->rt_nr_running++;
757
758 inc_rt_prio(rt_rq, prio);
759 inc_rt_migration(rt_se, rt_rq);
760 inc_rt_group(rt_se, rt_rq);
761}
762
763static inline
764void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
765{
766 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
767 WARN_ON(!rt_rq->rt_nr_running);
768 rt_rq->rt_nr_running--;
769
770 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
771 dec_rt_migration(rt_se, rt_rq);
772 dec_rt_group(rt_se, rt_rq);
635} 773}
636 774
637static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 775static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -718,6 +856,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
718 856
719 enqueue_rt_entity(rt_se); 857 enqueue_rt_entity(rt_se);
720 858
859 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
860 enqueue_pushable_task(rq, p);
861
721 inc_cpu_load(rq, p->se.load.weight); 862 inc_cpu_load(rq, p->se.load.weight);
722} 863}
723 864
@@ -728,6 +869,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
728 update_curr_rt(rq); 869 update_curr_rt(rq);
729 dequeue_rt_entity(rt_se); 870 dequeue_rt_entity(rt_se);
730 871
872 dequeue_pushable_task(rq, p);
873
731 dec_cpu_load(rq, p->se.load.weight); 874 dec_cpu_load(rq, p->se.load.weight);
732} 875}
733 876
@@ -878,7 +1021,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
878 return next; 1021 return next;
879} 1022}
880 1023
881static struct task_struct *pick_next_task_rt(struct rq *rq) 1024static struct task_struct *_pick_next_task_rt(struct rq *rq)
882{ 1025{
883 struct sched_rt_entity *rt_se; 1026 struct sched_rt_entity *rt_se;
884 struct task_struct *p; 1027 struct task_struct *p;
@@ -900,6 +1043,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
900 1043
901 p = rt_task_of(rt_se); 1044 p = rt_task_of(rt_se);
902 p->se.exec_start = rq->clock; 1045 p->se.exec_start = rq->clock;
1046
1047 return p;
1048}
1049
1050static struct task_struct *pick_next_task_rt(struct rq *rq)
1051{
1052 struct task_struct *p = _pick_next_task_rt(rq);
1053
1054 /* The running task is never eligible for pushing */
1055 if (p)
1056 dequeue_pushable_task(rq, p);
1057
903 return p; 1058 return p;
904} 1059}
905 1060
@@ -907,6 +1062,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
907{ 1062{
908 update_curr_rt(rq); 1063 update_curr_rt(rq);
909 p->se.exec_start = 0; 1064 p->se.exec_start = 0;
1065
1066 /*
1067 * The previous task needs to be made eligible for pushing
1068 * if it is still active
1069 */
1070 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
1071 enqueue_pushable_task(rq, p);
910} 1072}
911 1073
912#ifdef CONFIG_SMP 1074#ifdef CONFIG_SMP
@@ -968,8 +1130,8 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
968 if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) 1130 if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
969 return this_cpu; 1131 return this_cpu;
970 1132
971 first = first_cpu(*mask); 1133 first = cpumask_first(mask);
972 if (first != NR_CPUS) 1134 if (first < nr_cpu_ids)
973 return first; 1135 return first;
974 1136
975 return -1; 1137 return -1;
@@ -1072,7 +1234,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1072 } 1234 }
1073 1235
1074 /* If this rq is still suitable use it. */ 1236 /* If this rq is still suitable use it. */
1075 if (lowest_rq->rt.highest_prio > task->prio) 1237 if (lowest_rq->rt.highest_prio.curr > task->prio)
1076 break; 1238 break;
1077 1239
1078 /* try again */ 1240 /* try again */
@@ -1083,6 +1245,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1083 return lowest_rq; 1245 return lowest_rq;
1084} 1246}
1085 1247
1248static inline int has_pushable_tasks(struct rq *rq)
1249{
1250 return !plist_head_empty(&rq->rt.pushable_tasks);
1251}
1252
1253static struct task_struct *pick_next_pushable_task(struct rq *rq)
1254{
1255 struct task_struct *p;
1256
1257 if (!has_pushable_tasks(rq))
1258 return NULL;
1259
1260 p = plist_first_entry(&rq->rt.pushable_tasks,
1261 struct task_struct, pushable_tasks);
1262
1263 BUG_ON(rq->cpu != task_cpu(p));
1264 BUG_ON(task_current(rq, p));
1265 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1266
1267 BUG_ON(!p->se.on_rq);
1268 BUG_ON(!rt_task(p));
1269
1270 return p;
1271}
1272
1086/* 1273/*
1087 * If the current CPU has more than one RT task, see if the non 1274 * If the current CPU has more than one RT task, see if the non
1088 * running task can migrate over to a CPU that is running a task 1275 * running task can migrate over to a CPU that is running a task
@@ -1092,13 +1279,11 @@ static int push_rt_task(struct rq *rq)
1092{ 1279{
1093 struct task_struct *next_task; 1280 struct task_struct *next_task;
1094 struct rq *lowest_rq; 1281 struct rq *lowest_rq;
1095 int ret = 0;
1096 int paranoid = RT_MAX_TRIES;
1097 1282
1098 if (!rq->rt.overloaded) 1283 if (!rq->rt.overloaded)
1099 return 0; 1284 return 0;
1100 1285
1101 next_task = pick_next_highest_task_rt(rq, -1); 1286 next_task = pick_next_pushable_task(rq);
1102 if (!next_task) 1287 if (!next_task)
1103 return 0; 1288 return 0;
1104 1289
@@ -1127,16 +1312,34 @@ static int push_rt_task(struct rq *rq)
1127 struct task_struct *task; 1312 struct task_struct *task;
1128 /* 1313 /*
1129 * find lock_lowest_rq releases rq->lock 1314 * find lock_lowest_rq releases rq->lock
1130 * so it is possible that next_task has changed. 1315 * so it is possible that next_task has migrated.
1131 * If it has, then try again. 1316 *
1317 * We need to make sure that the task is still on the same
1318 * run-queue and is also still the next task eligible for
1319 * pushing.
1132 */ 1320 */
1133 task = pick_next_highest_task_rt(rq, -1); 1321 task = pick_next_pushable_task(rq);
1134 if (unlikely(task != next_task) && task && paranoid--) { 1322 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1135 put_task_struct(next_task); 1323 /*
1136 next_task = task; 1324 * If we get here, the task hasnt moved at all, but
1137 goto retry; 1325 * it has failed to push. We will not try again,
1326 * since the other cpus will pull from us when they
1327 * are ready.
1328 */
1329 dequeue_pushable_task(rq, next_task);
1330 goto out;
1138 } 1331 }
1139 goto out; 1332
1333 if (!task)
1334 /* No more tasks, just exit */
1335 goto out;
1336
1337 /*
1338 * Something has shifted, try again.
1339 */
1340 put_task_struct(next_task);
1341 next_task = task;
1342 goto retry;
1140 } 1343 }
1141 1344
1142 deactivate_task(rq, next_task, 0); 1345 deactivate_task(rq, next_task, 0);
@@ -1147,23 +1350,12 @@ static int push_rt_task(struct rq *rq)
1147 1350
1148 double_unlock_balance(rq, lowest_rq); 1351 double_unlock_balance(rq, lowest_rq);
1149 1352
1150 ret = 1;
1151out: 1353out:
1152 put_task_struct(next_task); 1354 put_task_struct(next_task);
1153 1355
1154 return ret; 1356 return 1;
1155} 1357}
1156 1358
1157/*
1158 * TODO: Currently we just use the second highest prio task on
1159 * the queue, and stop when it can't migrate (or there's
1160 * no more RT tasks). There may be a case where a lower
1161 * priority RT task has a different affinity than the
1162 * higher RT task. In this case the lower RT task could
1163 * possibly be able to migrate where as the higher priority
1164 * RT task could not. We currently ignore this issue.
1165 * Enhancements are welcome!
1166 */
1167static void push_rt_tasks(struct rq *rq) 1359static void push_rt_tasks(struct rq *rq)
1168{ 1360{
1169 /* push_rt_task will return true if it moved an RT */ 1361 /* push_rt_task will return true if it moved an RT */
@@ -1174,33 +1366,35 @@ static void push_rt_tasks(struct rq *rq)
1174static int pull_rt_task(struct rq *this_rq) 1366static int pull_rt_task(struct rq *this_rq)
1175{ 1367{
1176 int this_cpu = this_rq->cpu, ret = 0, cpu; 1368 int this_cpu = this_rq->cpu, ret = 0, cpu;
1177 struct task_struct *p, *next; 1369 struct task_struct *p;
1178 struct rq *src_rq; 1370 struct rq *src_rq;
1179 1371
1180 if (likely(!rt_overloaded(this_rq))) 1372 if (likely(!rt_overloaded(this_rq)))
1181 return 0; 1373 return 0;
1182 1374
1183 next = pick_next_task_rt(this_rq);
1184
1185 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1375 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1186 if (this_cpu == cpu) 1376 if (this_cpu == cpu)
1187 continue; 1377 continue;
1188 1378
1189 src_rq = cpu_rq(cpu); 1379 src_rq = cpu_rq(cpu);
1380
1381 /*
1382 * Don't bother taking the src_rq->lock if the next highest
1383 * task is known to be lower-priority than our current task.
1384 * This may look racy, but if this value is about to go
1385 * logically higher, the src_rq will push this task away.
1386 * And if its going logically lower, we do not care
1387 */
1388 if (src_rq->rt.highest_prio.next >=
1389 this_rq->rt.highest_prio.curr)
1390 continue;
1391
1190 /* 1392 /*
1191 * We can potentially drop this_rq's lock in 1393 * We can potentially drop this_rq's lock in
1192 * double_lock_balance, and another CPU could 1394 * double_lock_balance, and another CPU could
1193 * steal our next task - hence we must cause 1395 * alter this_rq
1194 * the caller to recalculate the next task
1195 * in that case:
1196 */ 1396 */
1197 if (double_lock_balance(this_rq, src_rq)) { 1397 double_lock_balance(this_rq, src_rq);
1198 struct task_struct *old_next = next;
1199
1200 next = pick_next_task_rt(this_rq);
1201 if (next != old_next)
1202 ret = 1;
1203 }
1204 1398
1205 /* 1399 /*
1206 * Are there still pullable RT tasks? 1400 * Are there still pullable RT tasks?
@@ -1214,7 +1408,7 @@ static int pull_rt_task(struct rq *this_rq)
1214 * Do we have an RT task that preempts 1408 * Do we have an RT task that preempts
1215 * the to-be-scheduled task? 1409 * the to-be-scheduled task?
1216 */ 1410 */
1217 if (p && (!next || (p->prio < next->prio))) { 1411 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1218 WARN_ON(p == src_rq->curr); 1412 WARN_ON(p == src_rq->curr);
1219 WARN_ON(!p->se.on_rq); 1413 WARN_ON(!p->se.on_rq);
1220 1414
@@ -1224,12 +1418,9 @@ static int pull_rt_task(struct rq *this_rq)
1224 * This is just that p is wakeing up and hasn't 1418 * This is just that p is wakeing up and hasn't
1225 * had a chance to schedule. We only pull 1419 * had a chance to schedule. We only pull
1226 * p if it is lower in priority than the 1420 * p if it is lower in priority than the
1227 * current task on the run queue or 1421 * current task on the run queue
1228 * this_rq next task is lower in prio than
1229 * the current task on that rq.
1230 */ 1422 */
1231 if (p->prio < src_rq->curr->prio || 1423 if (p->prio < src_rq->curr->prio)
1232 (next && next->prio < src_rq->curr->prio))
1233 goto skip; 1424 goto skip;
1234 1425
1235 ret = 1; 1426 ret = 1;
@@ -1242,13 +1433,7 @@ static int pull_rt_task(struct rq *this_rq)
1242 * case there's an even higher prio task 1433 * case there's an even higher prio task
1243 * in another runqueue. (low likelyhood 1434 * in another runqueue. (low likelyhood
1244 * but possible) 1435 * but possible)
1245 *
1246 * Update next so that we won't pick a task
1247 * on another cpu with a priority lower (or equal)
1248 * than the one we just picked.
1249 */ 1436 */
1250 next = p;
1251
1252 } 1437 }
1253 skip: 1438 skip:
1254 double_unlock_balance(this_rq, src_rq); 1439 double_unlock_balance(this_rq, src_rq);
@@ -1260,24 +1445,27 @@ static int pull_rt_task(struct rq *this_rq)
1260static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) 1445static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1261{ 1446{
1262 /* Try to pull RT tasks here if we lower this rq's prio */ 1447 /* Try to pull RT tasks here if we lower this rq's prio */
1263 if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) 1448 if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
1264 pull_rt_task(rq); 1449 pull_rt_task(rq);
1265} 1450}
1266 1451
1452/*
1453 * assumes rq->lock is held
1454 */
1455static int needs_post_schedule_rt(struct rq *rq)
1456{
1457 return has_pushable_tasks(rq);
1458}
1459
1267static void post_schedule_rt(struct rq *rq) 1460static void post_schedule_rt(struct rq *rq)
1268{ 1461{
1269 /* 1462 /*
1270 * If we have more than one rt_task queued, then 1463 * This is only called if needs_post_schedule_rt() indicates that
1271 * see if we can push the other rt_tasks off to other CPUS. 1464 * we need to push tasks away
1272 * Note we may release the rq lock, and since
1273 * the lock was owned by prev, we need to release it
1274 * first via finish_lock_switch and then reaquire it here.
1275 */ 1465 */
1276 if (unlikely(rq->rt.overloaded)) { 1466 spin_lock_irq(&rq->lock);
1277 spin_lock_irq(&rq->lock); 1467 push_rt_tasks(rq);
1278 push_rt_tasks(rq); 1468 spin_unlock_irq(&rq->lock);
1279 spin_unlock_irq(&rq->lock);
1280 }
1281} 1469}
1282 1470
1283/* 1471/*
@@ -1288,7 +1476,8 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1288{ 1476{
1289 if (!task_running(rq, p) && 1477 if (!task_running(rq, p) &&
1290 !test_tsk_need_resched(rq->curr) && 1478 !test_tsk_need_resched(rq->curr) &&
1291 rq->rt.overloaded) 1479 has_pushable_tasks(rq) &&
1480 p->rt.nr_cpus_allowed > 1)
1292 push_rt_tasks(rq); 1481 push_rt_tasks(rq);
1293} 1482}
1294 1483
@@ -1324,6 +1513,24 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1324 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1513 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
1325 struct rq *rq = task_rq(p); 1514 struct rq *rq = task_rq(p);
1326 1515
1516 if (!task_current(rq, p)) {
1517 /*
1518 * Make sure we dequeue this task from the pushable list
1519 * before going further. It will either remain off of
1520 * the list because we are no longer pushable, or it
1521 * will be requeued.
1522 */
1523 if (p->rt.nr_cpus_allowed > 1)
1524 dequeue_pushable_task(rq, p);
1525
1526 /*
1527 * Requeue if our weight is changing and still > 1
1528 */
1529 if (weight > 1)
1530 enqueue_pushable_task(rq, p);
1531
1532 }
1533
1327 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1534 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1328 rq->rt.rt_nr_migratory++; 1535 rq->rt.rt_nr_migratory++;
1329 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { 1536 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@@ -1331,7 +1538,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1331 rq->rt.rt_nr_migratory--; 1538 rq->rt.rt_nr_migratory--;
1332 } 1539 }
1333 1540
1334 update_rt_migration(rq); 1541 update_rt_migration(&rq->rt);
1335 } 1542 }
1336 1543
1337 cpumask_copy(&p->cpus_allowed, new_mask); 1544 cpumask_copy(&p->cpus_allowed, new_mask);
@@ -1346,7 +1553,7 @@ static void rq_online_rt(struct rq *rq)
1346 1553
1347 __enable_runtime(rq); 1554 __enable_runtime(rq);
1348 1555
1349 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); 1556 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1350} 1557}
1351 1558
1352/* Assumes rq->lock is held */ 1559/* Assumes rq->lock is held */
@@ -1438,7 +1645,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1438 * can release the rq lock and p could migrate. 1645 * can release the rq lock and p could migrate.
1439 * Only reschedule if p is still on the same runqueue. 1646 * Only reschedule if p is still on the same runqueue.
1440 */ 1647 */
1441 if (p->prio > rq->rt.highest_prio && rq->curr == p) 1648 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1442 resched_task(p); 1649 resched_task(p);
1443#else 1650#else
1444 /* For UP simply resched on drop of prio */ 1651 /* For UP simply resched on drop of prio */
@@ -1509,6 +1716,9 @@ static void set_curr_task_rt(struct rq *rq)
1509 struct task_struct *p = rq->curr; 1716 struct task_struct *p = rq->curr;
1510 1717
1511 p->se.exec_start = rq->clock; 1718 p->se.exec_start = rq->clock;
1719
1720 /* The running task is never eligible for pushing */
1721 dequeue_pushable_task(rq, p);
1512} 1722}
1513 1723
1514static const struct sched_class rt_sched_class = { 1724static const struct sched_class rt_sched_class = {
@@ -1531,6 +1741,7 @@ static const struct sched_class rt_sched_class = {
1531 .rq_online = rq_online_rt, 1741 .rq_online = rq_online_rt,
1532 .rq_offline = rq_offline_rt, 1742 .rq_offline = rq_offline_rt,
1533 .pre_schedule = pre_schedule_rt, 1743 .pre_schedule = pre_schedule_rt,
1744 .needs_post_schedule = needs_post_schedule_rt,
1534 .post_schedule = post_schedule_rt, 1745 .post_schedule = post_schedule_rt,
1535 .task_wake_up = task_wake_up_rt, 1746 .task_wake_up = task_wake_up_rt,
1536 .switched_from = switched_from_rt, 1747 .switched_from = switched_from_rt,
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index f2773b5d1226..32d2bd4061b0 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -4,7 +4,7 @@
4 * bump this up when changing the output format or the meaning of an existing 4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort) 5 * format, so that tools can adapt (or abort)
6 */ 6 */
7#define SCHEDSTAT_VERSION 14 7#define SCHEDSTAT_VERSION 15
8 8
9static int show_schedstat(struct seq_file *seq, void *v) 9static int show_schedstat(struct seq_file *seq, void *v)
10{ 10{
@@ -26,9 +26,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
26 26
27 /* runqueue-specific stats */ 27 /* runqueue-specific stats */
28 seq_printf(seq, 28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu", 29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_both_empty, 30 cpu, rq->yld_count,
31 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
32 rq->sched_switch, rq->sched_count, rq->sched_goidle, 31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
33 rq->ttwu_count, rq->ttwu_local, 32 rq->ttwu_count, rq->ttwu_local,
34 rq->rq_cpu_time, 33 rq->rq_cpu_time,
@@ -296,20 +295,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
296static inline void account_group_user_time(struct task_struct *tsk, 295static inline void account_group_user_time(struct task_struct *tsk,
297 cputime_t cputime) 296 cputime_t cputime)
298{ 297{
299 struct signal_struct *sig; 298 struct thread_group_cputimer *cputimer;
300 299
301 /* tsk == current, ensure it is safe to use ->signal */ 300 /* tsk == current, ensure it is safe to use ->signal */
302 if (unlikely(tsk->exit_state)) 301 if (unlikely(tsk->exit_state))
303 return; 302 return;
304 303
305 sig = tsk->signal; 304 cputimer = &tsk->signal->cputimer;
306 if (sig->cputime.totals) {
307 struct task_cputime *times;
308 305
309 times = per_cpu_ptr(sig->cputime.totals, get_cpu()); 306 if (!cputimer->running)
310 times->utime = cputime_add(times->utime, cputime); 307 return;
311 put_cpu_no_resched(); 308
312 } 309 spin_lock(&cputimer->lock);
310 cputimer->cputime.utime =
311 cputime_add(cputimer->cputime.utime, cputime);
312 spin_unlock(&cputimer->lock);
313} 313}
314 314
315/** 315/**
@@ -325,20 +325,21 @@ static inline void account_group_user_time(struct task_struct *tsk,
325static inline void account_group_system_time(struct task_struct *tsk, 325static inline void account_group_system_time(struct task_struct *tsk,
326 cputime_t cputime) 326 cputime_t cputime)
327{ 327{
328 struct signal_struct *sig; 328 struct thread_group_cputimer *cputimer;
329 329
330 /* tsk == current, ensure it is safe to use ->signal */ 330 /* tsk == current, ensure it is safe to use ->signal */
331 if (unlikely(tsk->exit_state)) 331 if (unlikely(tsk->exit_state))
332 return; 332 return;
333 333
334 sig = tsk->signal; 334 cputimer = &tsk->signal->cputimer;
335 if (sig->cputime.totals) {
336 struct task_cputime *times;
337 335
338 times = per_cpu_ptr(sig->cputime.totals, get_cpu()); 336 if (!cputimer->running)
339 times->stime = cputime_add(times->stime, cputime); 337 return;
340 put_cpu_no_resched(); 338
341 } 339 spin_lock(&cputimer->lock);
340 cputimer->cputime.stime =
341 cputime_add(cputimer->cputime.stime, cputime);
342 spin_unlock(&cputimer->lock);
342} 343}
343 344
344/** 345/**
@@ -354,6 +355,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
354static inline void account_group_exec_runtime(struct task_struct *tsk, 355static inline void account_group_exec_runtime(struct task_struct *tsk,
355 unsigned long long ns) 356 unsigned long long ns)
356{ 357{
358 struct thread_group_cputimer *cputimer;
357 struct signal_struct *sig; 359 struct signal_struct *sig;
358 360
359 sig = tsk->signal; 361 sig = tsk->signal;
@@ -362,11 +364,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
362 if (unlikely(!sig)) 364 if (unlikely(!sig))
363 return; 365 return;
364 366
365 if (sig->cputime.totals) { 367 cputimer = &sig->cputimer;
366 struct task_cputime *times;
367 368
368 times = per_cpu_ptr(sig->cputime.totals, get_cpu()); 369 if (!cputimer->running)
369 times->sum_exec_runtime += ns; 370 return;
370 put_cpu_no_resched(); 371
371 } 372 spin_lock(&cputimer->lock);
373 cputimer->cputime.sum_exec_runtime += ns;
374 spin_unlock(&cputimer->lock);
372} 375}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ad64fcb731f2..57d4b13b631d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/seccomp.h> 9#include <linux/seccomp.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/compat.h>
11 12
12/* #define SECCOMP_DEBUG 1 */ 13/* #define SECCOMP_DEBUG 1 */
13#define NR_SECCOMP_MODES 1 14#define NR_SECCOMP_MODES 1
@@ -22,7 +23,7 @@ static int mode1_syscalls[] = {
22 0, /* null terminated */ 23 0, /* null terminated */
23}; 24};
24 25
25#ifdef TIF_32BIT 26#ifdef CONFIG_COMPAT
26static int mode1_syscalls_32[] = { 27static int mode1_syscalls_32[] = {
27 __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, 28 __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
28 0, /* null terminated */ 29 0, /* null terminated */
@@ -37,8 +38,8 @@ void __secure_computing(int this_syscall)
37 switch (mode) { 38 switch (mode) {
38 case 1: 39 case 1:
39 syscall = mode1_syscalls; 40 syscall = mode1_syscalls;
40#ifdef TIF_32BIT 41#ifdef CONFIG_COMPAT
41 if (test_thread_flag(TIF_32BIT)) 42 if (is_compat_task())
42 syscall = mode1_syscalls_32; 43 syscall = mode1_syscalls_32;
43#endif 44#endif
44 do { 45 do {
diff --git a/kernel/signal.c b/kernel/signal.c
index e73759783dc8..1c8814481a11 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -909,7 +909,9 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
909 } 909 }
910#endif 910#endif
911 printk("\n"); 911 printk("\n");
912 preempt_disable();
912 show_regs(regs); 913 show_regs(regs);
914 preempt_enable();
913} 915}
914 916
915static int __init setup_print_fatal_signals(char *str) 917static int __init setup_print_fatal_signals(char *str)
@@ -1365,7 +1367,6 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1365 struct siginfo info; 1367 struct siginfo info;
1366 unsigned long flags; 1368 unsigned long flags;
1367 struct sighand_struct *psig; 1369 struct sighand_struct *psig;
1368 struct task_cputime cputime;
1369 int ret = sig; 1370 int ret = sig;
1370 1371
1371 BUG_ON(sig == -1); 1372 BUG_ON(sig == -1);
@@ -1395,9 +1396,10 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1395 info.si_uid = __task_cred(tsk)->uid; 1396 info.si_uid = __task_cred(tsk)->uid;
1396 rcu_read_unlock(); 1397 rcu_read_unlock();
1397 1398
1398 thread_group_cputime(tsk, &cputime); 1399 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
1399 info.si_utime = cputime_to_jiffies(cputime.utime); 1400 tsk->signal->utime));
1400 info.si_stime = cputime_to_jiffies(cputime.stime); 1401 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1402 tsk->signal->stime));
1401 1403
1402 info.si_status = tsk->exit_code & 0x7f; 1404 info.si_status = tsk->exit_code & 0x7f;
1403 if (tsk->exit_code & 0x80) 1405 if (tsk->exit_code & 0x80)
@@ -1573,7 +1575,15 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1573 read_lock(&tasklist_lock); 1575 read_lock(&tasklist_lock);
1574 if (may_ptrace_stop()) { 1576 if (may_ptrace_stop()) {
1575 do_notify_parent_cldstop(current, CLD_TRAPPED); 1577 do_notify_parent_cldstop(current, CLD_TRAPPED);
1578 /*
1579 * Don't want to allow preemption here, because
1580 * sys_ptrace() needs this task to be inactive.
1581 *
1582 * XXX: implement read_unlock_no_resched().
1583 */
1584 preempt_disable();
1576 read_unlock(&tasklist_lock); 1585 read_unlock(&tasklist_lock);
1586 preempt_enable_no_resched();
1577 schedule(); 1587 schedule();
1578 } else { 1588 } else {
1579 /* 1589 /*
diff --git a/kernel/smp.c b/kernel/smp.c
index 5cfa0e5e3e88..bbedbb7efe32 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -18,6 +18,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
18enum { 18enum {
19 CSD_FLAG_WAIT = 0x01, 19 CSD_FLAG_WAIT = 0x01,
20 CSD_FLAG_ALLOC = 0x02, 20 CSD_FLAG_ALLOC = 0x02,
21 CSD_FLAG_LOCK = 0x04,
21}; 22};
22 23
23struct call_function_data { 24struct call_function_data {
@@ -186,6 +187,9 @@ void generic_smp_call_function_single_interrupt(void)
186 if (data_flags & CSD_FLAG_WAIT) { 187 if (data_flags & CSD_FLAG_WAIT) {
187 smp_wmb(); 188 smp_wmb();
188 data->flags &= ~CSD_FLAG_WAIT; 189 data->flags &= ~CSD_FLAG_WAIT;
190 } else if (data_flags & CSD_FLAG_LOCK) {
191 smp_wmb();
192 data->flags &= ~CSD_FLAG_LOCK;
189 } else if (data_flags & CSD_FLAG_ALLOC) 193 } else if (data_flags & CSD_FLAG_ALLOC)
190 kfree(data); 194 kfree(data);
191 } 195 }
@@ -196,6 +200,8 @@ void generic_smp_call_function_single_interrupt(void)
196 } 200 }
197} 201}
198 202
203static DEFINE_PER_CPU(struct call_single_data, csd_data);
204
199/* 205/*
200 * smp_call_function_single - Run a function on a specific CPU 206 * smp_call_function_single - Run a function on a specific CPU
201 * @func: The function to run. This must be fast and non-blocking. 207 * @func: The function to run. This must be fast and non-blocking.
@@ -224,14 +230,38 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
224 func(info); 230 func(info);
225 local_irq_restore(flags); 231 local_irq_restore(flags);
226 } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { 232 } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
227 struct call_single_data *data = NULL; 233 struct call_single_data *data;
228 234
229 if (!wait) { 235 if (!wait) {
236 /*
237 * We are calling a function on a single CPU
238 * and we are not going to wait for it to finish.
239 * We first try to allocate the data, but if we
240 * fail, we fall back to use a per cpu data to pass
241 * the information to that CPU. Since all callers
242 * of this code will use the same data, we must
243 * synchronize the callers to prevent a new caller
244 * from corrupting the data before the callee
245 * can access it.
246 *
247 * The CSD_FLAG_LOCK is used to let us know when
248 * the IPI handler is done with the data.
249 * The first caller will set it, and the callee
250 * will clear it. The next caller must wait for
251 * it to clear before we set it again. This
252 * will make sure the callee is done with the
253 * data before a new caller will use it.
254 */
230 data = kmalloc(sizeof(*data), GFP_ATOMIC); 255 data = kmalloc(sizeof(*data), GFP_ATOMIC);
231 if (data) 256 if (data)
232 data->flags = CSD_FLAG_ALLOC; 257 data->flags = CSD_FLAG_ALLOC;
233 } 258 else {
234 if (!data) { 259 data = &per_cpu(csd_data, me);
260 while (data->flags & CSD_FLAG_LOCK)
261 cpu_relax();
262 data->flags = CSD_FLAG_LOCK;
263 }
264 } else {
235 data = &d; 265 data = &d;
236 data->flags = CSD_FLAG_WAIT; 266 data->flags = CSD_FLAG_WAIT;
237 } 267 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8d..9041ea7948fe 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -626,6 +626,7 @@ static int ksoftirqd(void * __bind_cpu)
626 preempt_enable_no_resched(); 626 preempt_enable_no_resched();
627 cond_resched(); 627 cond_resched();
628 preempt_disable(); 628 preempt_disable();
629 rcu_qsctr_inc((long)__bind_cpu);
629 } 630 }
630 preempt_enable(); 631 preempt_enable();
631 set_current_state(TASK_INTERRUPTIBLE); 632 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d9188c66278a..85d5a2455103 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -16,6 +16,7 @@
16#include <linux/lockdep.h> 16#include <linux/lockdep.h>
17#include <linux/notifier.h> 17#include <linux/notifier.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/sysctl.h>
19 20
20#include <asm/irq_regs.h> 21#include <asm/irq_regs.h>
21 22
@@ -88,6 +89,14 @@ void touch_all_softlockup_watchdogs(void)
88} 89}
89EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 90EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
90 91
92int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
93 struct file *filp, void __user *buffer,
94 size_t *lenp, loff_t *ppos)
95{
96 touch_all_softlockup_watchdogs();
97 return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
98}
99
91/* 100/*
92 * This callback runs from the timer interrupt, and checks 101 * This callback runs from the timer interrupt, and checks
93 * whether the watchdog thread has hung or not: 102 * whether the watchdog thread has hung or not:
diff --git a/kernel/sys.c b/kernel/sys.c
index e7dc0e10a485..37f458e6882a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -559,7 +559,7 @@ error:
559 abort_creds(new); 559 abort_creds(new);
560 return retval; 560 return retval;
561} 561}
562 562
563/* 563/*
564 * change the user struct in a credentials set to match the new UID 564 * change the user struct in a credentials set to match the new UID
565 */ 565 */
@@ -571,6 +571,11 @@ static int set_user(struct cred *new)
571 if (!new_user) 571 if (!new_user)
572 return -EAGAIN; 572 return -EAGAIN;
573 573
574 if (!task_can_switch_user(new_user, current)) {
575 free_uid(new_user);
576 return -EINVAL;
577 }
578
574 if (atomic_read(&new_user->processes) >= 579 if (atomic_read(&new_user->processes) >=
575 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 580 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
576 new_user != INIT_USER) { 581 new_user != INIT_USER) {
@@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
631 goto error; 636 goto error;
632 } 637 }
633 638
634 retval = -EAGAIN; 639 if (new->uid != old->uid) {
635 if (new->uid != old->uid && set_user(new) < 0) 640 retval = set_user(new);
636 goto error; 641 if (retval < 0)
637 642 goto error;
643 }
638 if (ruid != (uid_t) -1 || 644 if (ruid != (uid_t) -1 ||
639 (euid != (uid_t) -1 && euid != old->uid)) 645 (euid != (uid_t) -1 && euid != old->uid))
640 new->suid = new->euid; 646 new->suid = new->euid;
@@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
680 retval = -EPERM; 686 retval = -EPERM;
681 if (capable(CAP_SETUID)) { 687 if (capable(CAP_SETUID)) {
682 new->suid = new->uid = uid; 688 new->suid = new->uid = uid;
683 if (uid != old->uid && set_user(new) < 0) { 689 if (uid != old->uid) {
684 retval = -EAGAIN; 690 retval = set_user(new);
685 goto error; 691 if (retval < 0)
692 goto error;
686 } 693 }
687 } else if (uid != old->uid && uid != new->suid) { 694 } else if (uid != old->uid && uid != new->suid) {
688 goto error; 695 goto error;
@@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
734 goto error; 741 goto error;
735 } 742 }
736 743
737 retval = -EAGAIN;
738 if (ruid != (uid_t) -1) { 744 if (ruid != (uid_t) -1) {
739 new->uid = ruid; 745 new->uid = ruid;
740 if (ruid != old->uid && set_user(new) < 0) 746 if (ruid != old->uid) {
741 goto error; 747 retval = set_user(new);
748 if (retval < 0)
749 goto error;
750 }
742 } 751 }
743 if (euid != (uid_t) -1) 752 if (euid != (uid_t) -1)
744 new->euid = euid; 753 new->euid = euid;
@@ -1525,22 +1534,14 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1525 return -EINVAL; 1534 return -EINVAL;
1526 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1535 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1527 return -EFAULT; 1536 return -EFAULT;
1537 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1538 return -EINVAL;
1528 old_rlim = current->signal->rlim + resource; 1539 old_rlim = current->signal->rlim + resource;
1529 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1540 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1530 !capable(CAP_SYS_RESOURCE)) 1541 !capable(CAP_SYS_RESOURCE))
1531 return -EPERM; 1542 return -EPERM;
1532 1543 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
1533 if (resource == RLIMIT_NOFILE) { 1544 return -EPERM;
1534 if (new_rlim.rlim_max == RLIM_INFINITY)
1535 new_rlim.rlim_max = sysctl_nr_open;
1536 if (new_rlim.rlim_cur == RLIM_INFINITY)
1537 new_rlim.rlim_cur = sysctl_nr_open;
1538 if (new_rlim.rlim_max > sysctl_nr_open)
1539 return -EPERM;
1540 }
1541
1542 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1543 return -EINVAL;
1544 1545
1545 retval = security_task_setrlimit(resource, &new_rlim); 1546 retval = security_task_setrlimit(resource, &new_rlim);
1546 if (retval) 1547 if (retval)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 368d1638ee78..c5ef44ff850f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -101,6 +101,7 @@ static int two = 2;
101 101
102static int zero; 102static int zero;
103static int one = 1; 103static int one = 1;
104static unsigned long one_ul = 1;
104static int one_hundred = 100; 105static int one_hundred = 100;
105 106
106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 107/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -809,7 +810,7 @@ static struct ctl_table kern_table[] = {
809 .data = &softlockup_thresh, 810 .data = &softlockup_thresh,
810 .maxlen = sizeof(int), 811 .maxlen = sizeof(int),
811 .mode = 0644, 812 .mode = 0644,
812 .proc_handler = &proc_dointvec_minmax, 813 .proc_handler = &proc_dosoftlockup_thresh,
813 .strategy = &sysctl_intvec, 814 .strategy = &sysctl_intvec,
814 .extra1 = &neg_one, 815 .extra1 = &neg_one,
815 .extra2 = &sixty, 816 .extra2 = &sixty,
@@ -974,7 +975,7 @@ static struct ctl_table vm_table[] = {
974 .mode = 0644, 975 .mode = 0644,
975 .proc_handler = &dirty_background_bytes_handler, 976 .proc_handler = &dirty_background_bytes_handler,
976 .strategy = &sysctl_intvec, 977 .strategy = &sysctl_intvec,
977 .extra1 = &one, 978 .extra1 = &one_ul,
978 }, 979 },
979 { 980 {
980 .ctl_name = VM_DIRTY_RATIO, 981 .ctl_name = VM_DIRTY_RATIO,
@@ -995,7 +996,7 @@ static struct ctl_table vm_table[] = {
995 .mode = 0644, 996 .mode = 0644,
996 .proc_handler = &dirty_bytes_handler, 997 .proc_handler = &dirty_bytes_handler,
997 .strategy = &sysctl_intvec, 998 .strategy = &sysctl_intvec,
998 .extra1 = &one, 999 .extra1 = &one_ul,
999 }, 1000 },
1000 { 1001 {
1001 .procname = "dirty_writeback_centisecs", 1002 .procname = "dirty_writeback_centisecs",
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index fafeb48f27c0..b38423ca711a 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -219,6 +219,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
219 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, 219 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
220 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, 220 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
221 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, 221 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
222 { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
222 {} 223 {}
223}; 224};
224 225
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 905b0b50792d..0b0a6366c9d4 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index ea2f48af83cf..d13be216a790 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -68,6 +68,17 @@ void clockevents_set_mode(struct clock_event_device *dev,
68 if (dev->mode != mode) { 68 if (dev->mode != mode) {
69 dev->set_mode(mode, dev); 69 dev->set_mode(mode, dev);
70 dev->mode = mode; 70 dev->mode = mode;
71
72 /*
73 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
74 * on it, so fix it up and emit a warning:
75 */
76 if (mode == CLOCK_EVT_MODE_ONESHOT) {
77 if (unlikely(!dev->mult)) {
78 dev->mult = 1;
79 WARN_ON(1);
80 }
81 }
71 } 82 }
72} 83}
73 84
@@ -168,15 +179,6 @@ void clockevents_register_device(struct clock_event_device *dev)
168 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 179 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
169 BUG_ON(!dev->cpumask); 180 BUG_ON(!dev->cpumask);
170 181
171 /*
172 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
173 * on it, so fix it up and emit a warning:
174 */
175 if (unlikely(!dev->mult)) {
176 dev->mult = 1;
177 WARN_ON(1);
178 }
179
180 spin_lock(&clockevents_lock); 182 spin_lock(&clockevents_lock);
181 183
182 list_add(&dev->list, &clockevent_devices); 184 list_add(&dev->list, &clockevent_devices);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ca89e1593f08..c46c931a7fe7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -31,6 +31,82 @@
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h> 32#include <linux/tick.h>
33 33
34void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc,
36 u64 start_tstamp)
37{
38 tc->cc = cc;
39 tc->cycle_last = cc->read(cc);
40 tc->nsec = start_tstamp;
41}
42EXPORT_SYMBOL(timecounter_init);
43
44/**
45 * timecounter_read_delta - get nanoseconds since last call of this function
46 * @tc: Pointer to time counter
47 *
48 * When the underlying cycle counter runs over, this will be handled
49 * correctly as long as it does not run over more than once between
50 * calls.
51 *
52 * The first call to this function for a new time counter initializes
53 * the time tracking and returns an undefined result.
54 */
55static u64 timecounter_read_delta(struct timecounter *tc)
56{
57 cycle_t cycle_now, cycle_delta;
58 u64 ns_offset;
59
60 /* read cycle counter: */
61 cycle_now = tc->cc->read(tc->cc);
62
63 /* calculate the delta since the last timecounter_read_delta(): */
64 cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
65
66 /* convert to nanoseconds: */
67 ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
68
69 /* update time stamp of timecounter_read_delta() call: */
70 tc->cycle_last = cycle_now;
71
72 return ns_offset;
73}
74
75u64 timecounter_read(struct timecounter *tc)
76{
77 u64 nsec;
78
79 /* increment time by nanoseconds since last call */
80 nsec = timecounter_read_delta(tc);
81 nsec += tc->nsec;
82 tc->nsec = nsec;
83
84 return nsec;
85}
86EXPORT_SYMBOL(timecounter_read);
87
88u64 timecounter_cyc2time(struct timecounter *tc,
89 cycle_t cycle_tstamp)
90{
91 u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
92 u64 nsec;
93
94 /*
95 * Instead of always treating cycle_tstamp as more recent
96 * than tc->cycle_last, detect when it is too far in the
97 * future and treat it as old time stamp instead.
98 */
99 if (cycle_delta > tc->cc->mask / 2) {
100 cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
101 nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
102 } else {
103 nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
104 }
105
106 return nsec;
107}
108EXPORT_SYMBOL(timecounter_cyc2time);
109
34/* XXX - Would like a better way for initializing curr_clocksource */ 110/* XXX - Would like a better way for initializing curr_clocksource */
35extern struct clocksource clocksource_jiffies; 111extern struct clocksource clocksource_jiffies;
36 112
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f5f793d92415..7fc64375ff43 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -1,71 +1,129 @@
1/* 1/*
2 * linux/kernel/time/ntp.c
3 *
4 * NTP state machine interfaces and logic. 2 * NTP state machine interfaces and logic.
5 * 3 *
6 * This code was mainly moved from kernel/timer.c and kernel/time.c 4 * This code was mainly moved from kernel/timer.c and kernel/time.c
7 * Please see those files for relevant copyright info and historical 5 * Please see those files for relevant copyright info and historical
8 * changelogs. 6 * changelogs.
9 */ 7 */
10
11#include <linux/mm.h>
12#include <linux/time.h>
13#include <linux/timex.h>
14#include <linux/jiffies.h>
15#include <linux/hrtimer.h>
16#include <linux/capability.h> 8#include <linux/capability.h>
17#include <linux/math64.h>
18#include <linux/clocksource.h> 9#include <linux/clocksource.h>
19#include <linux/workqueue.h> 10#include <linux/workqueue.h>
20#include <asm/timex.h> 11#include <linux/hrtimer.h>
12#include <linux/jiffies.h>
13#include <linux/math64.h>
14#include <linux/timex.h>
15#include <linux/time.h>
16#include <linux/mm.h>
21 17
22/* 18/*
23 * Timekeeping variables 19 * NTP timekeeping variables:
24 */ 20 */
25unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
26unsigned long tick_nsec; /* ACTHZ period (nsec) */
27u64 tick_length;
28static u64 tick_length_base;
29 21
30static struct hrtimer leap_timer; 22/* USER_HZ period (usecs): */
23unsigned long tick_usec = TICK_USEC;
31 24
32#define MAX_TICKADJ 500 /* microsecs */ 25/* ACTHZ period (nsecs): */
33#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ 26unsigned long tick_nsec;
34 NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) 27
28u64 tick_length;
29static u64 tick_length_base;
30
31static struct hrtimer leap_timer;
32
33#define MAX_TICKADJ 500LL /* usecs */
34#define MAX_TICKADJ_SCALED \
35 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
35 36
36/* 37/*
37 * phase-lock loop variables 38 * phase-lock loop variables
38 */ 39 */
39/* TIME_ERROR prevents overwriting the CMOS clock */
40static int time_state = TIME_OK; /* clock synchronization status */
41int time_status = STA_UNSYNC; /* clock status bits */
42static long time_tai; /* TAI offset (s) */
43static s64 time_offset; /* time adjustment (ns) */
44static long time_constant = 2; /* pll time constant */
45long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
46long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
47static s64 time_freq; /* frequency offset (scaled ns/s)*/
48static long time_reftime; /* time at last adjustment (s) */
49long time_adjust;
50static long ntp_tick_adj;
51 40
41/*
42 * clock synchronization status
43 *
44 * (TIME_ERROR prevents overwriting the CMOS clock)
45 */
46static int time_state = TIME_OK;
47
48/* clock status bits: */
49int time_status = STA_UNSYNC;
50
51/* TAI offset (secs): */
52static long time_tai;
53
54/* time adjustment (nsecs): */
55static s64 time_offset;
56
57/* pll time constant: */
58static long time_constant = 2;
59
60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT;
62
63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT;
65
66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq;
68
69/* time at last adjustment (secs): */
70static long time_reftime;
71
72long time_adjust;
73
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj;
76
77/*
78 * NTP methods:
79 */
80
81/*
82 * Update (tick_length, tick_length_base, tick_nsec), based
83 * on (tick_usec, ntp_tick_adj, time_freq):
84 */
52static void ntp_update_frequency(void) 85static void ntp_update_frequency(void)
53{ 86{
54 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) 87 u64 second_length;
55 << NTP_SCALE_SHIFT; 88 u64 new_base;
56 second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; 89
57 second_length += time_freq; 90 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
91 << NTP_SCALE_SHIFT;
92
93 second_length += ntp_tick_adj;
94 second_length += time_freq;
58 95
59 tick_length_base = second_length; 96 tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
97 new_base = div_u64(second_length, NTP_INTERVAL_FREQ);
60 98
61 tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; 99 /*
62 tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); 100 * Don't wait for the next second_overflow, apply
101 * the change to the tick length immediately:
102 */
103 tick_length += new_base - tick_length_base;
104 tick_length_base = new_base;
105}
106
107static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
108{
109 time_status &= ~STA_MODE;
110
111 if (secs < MINSEC)
112 return 0;
113
114 if (!(time_status & STA_FLL) && (secs <= MAXSEC))
115 return 0;
116
117 time_status |= STA_MODE;
118
119 return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
63} 120}
64 121
65static void ntp_update_offset(long offset) 122static void ntp_update_offset(long offset)
66{ 123{
67 long mtemp;
68 s64 freq_adj; 124 s64 freq_adj;
125 s64 offset64;
126 long secs;
69 127
70 if (!(time_status & STA_PLL)) 128 if (!(time_status & STA_PLL))
71 return; 129 return;
@@ -84,24 +142,23 @@ static void ntp_update_offset(long offset)
84 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
85 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
86 */ 144 */
87 if (time_status & STA_FREQHOLD || time_reftime == 0) 145 secs = xtime.tv_sec - time_reftime;
88 time_reftime = xtime.tv_sec; 146 if (unlikely(time_status & STA_FREQHOLD))
89 mtemp = xtime.tv_sec - time_reftime; 147 secs = 0;
148
90 time_reftime = xtime.tv_sec; 149 time_reftime = xtime.tv_sec;
91 150
92 freq_adj = (s64)offset * mtemp; 151 offset64 = offset;
93 freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant); 152 freq_adj = (offset64 * secs) <<
94 time_status &= ~STA_MODE; 153 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
95 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
96 freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL),
97 mtemp);
98 time_status |= STA_MODE;
99 }
100 freq_adj += time_freq;
101 freq_adj = min(freq_adj, MAXFREQ_SCALED);
102 time_freq = max(freq_adj, -MAXFREQ_SCALED);
103 154
104 time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); 155 freq_adj += ntp_update_offset_fll(offset64, secs);
156
157 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
158
159 time_freq = max(freq_adj, -MAXFREQ_SCALED);
160
161 time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
105} 162}
106 163
107/** 164/**
@@ -111,15 +168,15 @@ static void ntp_update_offset(long offset)
111 */ 168 */
112void ntp_clear(void) 169void ntp_clear(void)
113{ 170{
114 time_adjust = 0; /* stop active adjtime() */ 171 time_adjust = 0; /* stop active adjtime() */
115 time_status |= STA_UNSYNC; 172 time_status |= STA_UNSYNC;
116 time_maxerror = NTP_PHASE_LIMIT; 173 time_maxerror = NTP_PHASE_LIMIT;
117 time_esterror = NTP_PHASE_LIMIT; 174 time_esterror = NTP_PHASE_LIMIT;
118 175
119 ntp_update_frequency(); 176 ntp_update_frequency();
120 177
121 tick_length = tick_length_base; 178 tick_length = tick_length_base;
122 time_offset = 0; 179 time_offset = 0;
123} 180}
124 181
125/* 182/*
@@ -140,8 +197,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
140 xtime.tv_sec--; 197 xtime.tv_sec--;
141 wall_to_monotonic.tv_sec++; 198 wall_to_monotonic.tv_sec++;
142 time_state = TIME_OOP; 199 time_state = TIME_OOP;
143 printk(KERN_NOTICE "Clock: " 200 printk(KERN_NOTICE
144 "inserting leap second 23:59:60 UTC\n"); 201 "Clock: inserting leap second 23:59:60 UTC\n");
145 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); 202 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
146 res = HRTIMER_RESTART; 203 res = HRTIMER_RESTART;
147 break; 204 break;
@@ -150,8 +207,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
150 time_tai--; 207 time_tai--;
151 wall_to_monotonic.tv_sec--; 208 wall_to_monotonic.tv_sec--;
152 time_state = TIME_WAIT; 209 time_state = TIME_WAIT;
153 printk(KERN_NOTICE "Clock: " 210 printk(KERN_NOTICE
154 "deleting leap second 23:59:59 UTC\n"); 211 "Clock: deleting leap second 23:59:59 UTC\n");
155 break; 212 break;
156 case TIME_OOP: 213 case TIME_OOP:
157 time_tai++; 214 time_tai++;
@@ -179,7 +236,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
179 */ 236 */
180void second_overflow(void) 237void second_overflow(void)
181{ 238{
182 s64 time_adj; 239 s64 delta;
183 240
184 /* Bump the maxerror field */ 241 /* Bump the maxerror field */
185 time_maxerror += MAXFREQ / NSEC_PER_USEC; 242 time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -192,24 +249,30 @@ void second_overflow(void)
192 * Compute the phase adjustment for the next second. The offset is 249 * Compute the phase adjustment for the next second. The offset is
193 * reduced by a fixed factor times the time constant. 250 * reduced by a fixed factor times the time constant.
194 */ 251 */
195 tick_length = tick_length_base; 252 tick_length = tick_length_base;
196 time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); 253
197 time_offset -= time_adj; 254 delta = shift_right(time_offset, SHIFT_PLL + time_constant);
198 tick_length += time_adj; 255 time_offset -= delta;
199 256 tick_length += delta;
200 if (unlikely(time_adjust)) { 257
201 if (time_adjust > MAX_TICKADJ) { 258 if (!time_adjust)
202 time_adjust -= MAX_TICKADJ; 259 return;
203 tick_length += MAX_TICKADJ_SCALED; 260
204 } else if (time_adjust < -MAX_TICKADJ) { 261 if (time_adjust > MAX_TICKADJ) {
205 time_adjust += MAX_TICKADJ; 262 time_adjust -= MAX_TICKADJ;
206 tick_length -= MAX_TICKADJ_SCALED; 263 tick_length += MAX_TICKADJ_SCALED;
207 } else { 264 return;
208 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
209 NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT;
210 time_adjust = 0;
211 }
212 } 265 }
266
267 if (time_adjust < -MAX_TICKADJ) {
268 time_adjust += MAX_TICKADJ;
269 tick_length -= MAX_TICKADJ_SCALED;
270 return;
271 }
272
273 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
274 << NTP_SCALE_SHIFT;
275 time_adjust = 0;
213} 276}
214 277
215#ifdef CONFIG_GENERIC_CMOS_UPDATE 278#ifdef CONFIG_GENERIC_CMOS_UPDATE
@@ -233,12 +296,13 @@ static void sync_cmos_clock(struct work_struct *work)
233 * This code is run on a timer. If the clock is set, that timer 296 * This code is run on a timer. If the clock is set, that timer
234 * may not expire at the correct time. Thus, we adjust... 297 * may not expire at the correct time. Thus, we adjust...
235 */ 298 */
236 if (!ntp_synced()) 299 if (!ntp_synced()) {
237 /* 300 /*
238 * Not synced, exit, do not restart a timer (if one is 301 * Not synced, exit, do not restart a timer (if one is
239 * running, let it run out). 302 * running, let it run out).
240 */ 303 */
241 return; 304 return;
305 }
242 306
243 getnstimeofday(&now); 307 getnstimeofday(&now);
244 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 308 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
@@ -270,7 +334,116 @@ static void notify_cmos_timer(void)
270static inline void notify_cmos_timer(void) { } 334static inline void notify_cmos_timer(void) { }
271#endif 335#endif
272 336
273/* adjtimex mainly allows reading (and writing, if superuser) of 337/*
338 * Start the leap seconds timer:
339 */
340static inline void ntp_start_leap_timer(struct timespec *ts)
341{
342 long now = ts->tv_sec;
343
344 if (time_status & STA_INS) {
345 time_state = TIME_INS;
346 now += 86400 - now % 86400;
347 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
348
349 return;
350 }
351
352 if (time_status & STA_DEL) {
353 time_state = TIME_DEL;
354 now += 86400 - (now + 1) % 86400;
355 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
356 }
357}
358
359/*
360 * Propagate a new txc->status value into the NTP state:
361 */
362static inline void process_adj_status(struct timex *txc, struct timespec *ts)
363{
364 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
365 time_state = TIME_OK;
366 time_status = STA_UNSYNC;
367 }
368
369 /*
370 * If we turn on PLL adjustments then reset the
371 * reference time to current time.
372 */
373 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
374 time_reftime = xtime.tv_sec;
375
376 /* only set allowed bits */
377 time_status &= STA_RONLY;
378 time_status |= txc->status & ~STA_RONLY;
379
380 switch (time_state) {
381 case TIME_OK:
382 ntp_start_leap_timer(ts);
383 break;
384 case TIME_INS:
385 case TIME_DEL:
386 time_state = TIME_OK;
387 ntp_start_leap_timer(ts);
388 case TIME_WAIT:
389 if (!(time_status & (STA_INS | STA_DEL)))
390 time_state = TIME_OK;
391 break;
392 case TIME_OOP:
393 hrtimer_restart(&leap_timer);
394 break;
395 }
396}
397/*
398 * Called with the xtime lock held, so we can access and modify
399 * all the global NTP state:
400 */
401static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
402{
403 if (txc->modes & ADJ_STATUS)
404 process_adj_status(txc, ts);
405
406 if (txc->modes & ADJ_NANO)
407 time_status |= STA_NANO;
408
409 if (txc->modes & ADJ_MICRO)
410 time_status &= ~STA_NANO;
411
412 if (txc->modes & ADJ_FREQUENCY) {
413 time_freq = txc->freq * PPM_SCALE;
414 time_freq = min(time_freq, MAXFREQ_SCALED);
415 time_freq = max(time_freq, -MAXFREQ_SCALED);
416 }
417
418 if (txc->modes & ADJ_MAXERROR)
419 time_maxerror = txc->maxerror;
420
421 if (txc->modes & ADJ_ESTERROR)
422 time_esterror = txc->esterror;
423
424 if (txc->modes & ADJ_TIMECONST) {
425 time_constant = txc->constant;
426 if (!(time_status & STA_NANO))
427 time_constant += 4;
428 time_constant = min(time_constant, (long)MAXTC);
429 time_constant = max(time_constant, 0l);
430 }
431
432 if (txc->modes & ADJ_TAI && txc->constant > 0)
433 time_tai = txc->constant;
434
435 if (txc->modes & ADJ_OFFSET)
436 ntp_update_offset(txc->offset);
437
438 if (txc->modes & ADJ_TICK)
439 tick_usec = txc->tick;
440
441 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
442 ntp_update_frequency();
443}
444
445/*
446 * adjtimex mainly allows reading (and writing, if superuser) of
274 * kernel time-keeping variables. used by xntpd. 447 * kernel time-keeping variables. used by xntpd.
275 */ 448 */
276int do_adjtimex(struct timex *txc) 449int do_adjtimex(struct timex *txc)
@@ -291,11 +464,14 @@ int do_adjtimex(struct timex *txc)
291 if (txc->modes && !capable(CAP_SYS_TIME)) 464 if (txc->modes && !capable(CAP_SYS_TIME))
292 return -EPERM; 465 return -EPERM;
293 466
294 /* if the quartz is off by more than 10% something is VERY wrong! */ 467 /*
468 * if the quartz is off by more than 10% then
469 * something is VERY wrong!
470 */
295 if (txc->modes & ADJ_TICK && 471 if (txc->modes & ADJ_TICK &&
296 (txc->tick < 900000/USER_HZ || 472 (txc->tick < 900000/USER_HZ ||
297 txc->tick > 1100000/USER_HZ)) 473 txc->tick > 1100000/USER_HZ))
298 return -EINVAL; 474 return -EINVAL;
299 475
300 if (txc->modes & ADJ_STATUS && time_state != TIME_OK) 476 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
301 hrtimer_cancel(&leap_timer); 477 hrtimer_cancel(&leap_timer);
@@ -305,7 +481,6 @@ int do_adjtimex(struct timex *txc)
305 481
306 write_seqlock_irq(&xtime_lock); 482 write_seqlock_irq(&xtime_lock);
307 483
308 /* If there are input parameters, then process them */
309 if (txc->modes & ADJ_ADJTIME) { 484 if (txc->modes & ADJ_ADJTIME) {
310 long save_adjust = time_adjust; 485 long save_adjust = time_adjust;
311 486
@@ -315,98 +490,24 @@ int do_adjtimex(struct timex *txc)
315 ntp_update_frequency(); 490 ntp_update_frequency();
316 } 491 }
317 txc->offset = save_adjust; 492 txc->offset = save_adjust;
318 goto adj_done; 493 } else {
319 }
320 if (txc->modes) {
321 long sec;
322
323 if (txc->modes & ADJ_STATUS) {
324 if ((time_status & STA_PLL) &&
325 !(txc->status & STA_PLL)) {
326 time_state = TIME_OK;
327 time_status = STA_UNSYNC;
328 }
329 /* only set allowed bits */
330 time_status &= STA_RONLY;
331 time_status |= txc->status & ~STA_RONLY;
332
333 switch (time_state) {
334 case TIME_OK:
335 start_timer:
336 sec = ts.tv_sec;
337 if (time_status & STA_INS) {
338 time_state = TIME_INS;
339 sec += 86400 - sec % 86400;
340 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
341 } else if (time_status & STA_DEL) {
342 time_state = TIME_DEL;
343 sec += 86400 - (sec + 1) % 86400;
344 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
345 }
346 break;
347 case TIME_INS:
348 case TIME_DEL:
349 time_state = TIME_OK;
350 goto start_timer;
351 break;
352 case TIME_WAIT:
353 if (!(time_status & (STA_INS | STA_DEL)))
354 time_state = TIME_OK;
355 break;
356 case TIME_OOP:
357 hrtimer_restart(&leap_timer);
358 break;
359 }
360 }
361
362 if (txc->modes & ADJ_NANO)
363 time_status |= STA_NANO;
364 if (txc->modes & ADJ_MICRO)
365 time_status &= ~STA_NANO;
366
367 if (txc->modes & ADJ_FREQUENCY) {
368 time_freq = (s64)txc->freq * PPM_SCALE;
369 time_freq = min(time_freq, MAXFREQ_SCALED);
370 time_freq = max(time_freq, -MAXFREQ_SCALED);
371 }
372
373 if (txc->modes & ADJ_MAXERROR)
374 time_maxerror = txc->maxerror;
375 if (txc->modes & ADJ_ESTERROR)
376 time_esterror = txc->esterror;
377
378 if (txc->modes & ADJ_TIMECONST) {
379 time_constant = txc->constant;
380 if (!(time_status & STA_NANO))
381 time_constant += 4;
382 time_constant = min(time_constant, (long)MAXTC);
383 time_constant = max(time_constant, 0l);
384 }
385
386 if (txc->modes & ADJ_TAI && txc->constant > 0)
387 time_tai = txc->constant;
388
389 if (txc->modes & ADJ_OFFSET)
390 ntp_update_offset(txc->offset);
391 if (txc->modes & ADJ_TICK)
392 tick_usec = txc->tick;
393 494
394 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) 495 /* If there are input parameters, then process them: */
395 ntp_update_frequency(); 496 if (txc->modes)
396 } 497 process_adjtimex_modes(txc, &ts);
397 498
398 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, 499 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
399 NTP_SCALE_SHIFT); 500 NTP_SCALE_SHIFT);
400 if (!(time_status & STA_NANO)) 501 if (!(time_status & STA_NANO))
401 txc->offset /= NSEC_PER_USEC; 502 txc->offset /= NSEC_PER_USEC;
503 }
402 504
403adj_done:
404 result = time_state; /* mostly `TIME_OK' */ 505 result = time_state; /* mostly `TIME_OK' */
405 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 506 if (time_status & (STA_UNSYNC|STA_CLOCKERR))
406 result = TIME_ERROR; 507 result = TIME_ERROR;
407 508
408 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * 509 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
409 (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); 510 PPM_SCALE_INV, NTP_SCALE_SHIFT);
410 txc->maxerror = time_maxerror; 511 txc->maxerror = time_maxerror;
411 txc->esterror = time_esterror; 512 txc->esterror = time_esterror;
412 txc->status = time_status; 513 txc->status = time_status;
@@ -425,6 +526,7 @@ adj_done:
425 txc->calcnt = 0; 526 txc->calcnt = 0;
426 txc->errcnt = 0; 527 txc->errcnt = 0;
427 txc->stbcnt = 0; 528 txc->stbcnt = 0;
529
428 write_sequnlock_irq(&xtime_lock); 530 write_sequnlock_irq(&xtime_lock);
429 531
430 txc->time.tv_sec = ts.tv_sec; 532 txc->time.tv_sec = ts.tv_sec;
@@ -440,6 +542,8 @@ adj_done:
440static int __init ntp_tick_adj_setup(char *str) 542static int __init ntp_tick_adj_setup(char *str)
441{ 543{
442 ntp_tick_adj = simple_strtol(str, NULL, 0); 544 ntp_tick_adj = simple_strtol(str, NULL, 0);
545 ntp_tick_adj <<= NTP_SCALE_SHIFT;
546
443 return 1; 547 return 1;
444} 548}
445 549
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 63e05d423a09..21a5ca849514 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -274,6 +274,21 @@ out_bc:
274} 274}
275 275
276/* 276/*
277 * Transfer the do_timer job away from a dying cpu.
278 *
279 * Called with interrupts disabled.
280 */
281static void tick_handover_do_timer(int *cpup)
282{
283 if (*cpup == tick_do_timer_cpu) {
284 int cpu = cpumask_first(cpu_online_mask);
285
286 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
287 TICK_DO_TIMER_NONE;
288 }
289}
290
291/*
277 * Shutdown an event device on a given cpu: 292 * Shutdown an event device on a given cpu:
278 * 293 *
279 * This is called on a life CPU, when a CPU is dead. So we cannot 294 * This is called on a life CPU, when a CPU is dead. So we cannot
@@ -297,13 +312,6 @@ static void tick_shutdown(unsigned int *cpup)
297 clockevents_exchange_device(dev, NULL); 312 clockevents_exchange_device(dev, NULL);
298 td->evtdev = NULL; 313 td->evtdev = NULL;
299 } 314 }
300 /* Transfer the do_timer job away from this cpu */
301 if (*cpup == tick_do_timer_cpu) {
302 int cpu = cpumask_first(cpu_online_mask);
303
304 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
305 TICK_DO_TIMER_NONE;
306 }
307 spin_unlock_irqrestore(&tick_device_lock, flags); 315 spin_unlock_irqrestore(&tick_device_lock, flags);
308} 316}
309 317
@@ -357,6 +365,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
357 tick_broadcast_oneshot_control(reason); 365 tick_broadcast_oneshot_control(reason);
358 break; 366 break;
359 367
368 case CLOCK_EVT_NOTIFY_CPU_DYING:
369 tick_handover_do_timer(dev);
370 break;
371
360 case CLOCK_EVT_NOTIFY_CPU_DEAD: 372 case CLOCK_EVT_NOTIFY_CPU_DEAD:
361 tick_shutdown_broadcast_oneshot(dev); 373 tick_shutdown_broadcast_oneshot(dev);
362 tick_shutdown_broadcast(dev); 374 tick_shutdown_broadcast(dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1b6c05bd0d0a..d3f1ef4d5cbe 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,7 +134,7 @@ __setup("nohz=", setup_tick_nohz);
134 * value. We do this unconditionally on any cpu, as we don't know whether the 134 * value. We do this unconditionally on any cpu, as we don't know whether the
135 * cpu, which has the update task assigned is in a long sleep. 135 * cpu, which has the update task assigned is in a long sleep.
136 */ 136 */
137void tick_nohz_update_jiffies(void) 137static void tick_nohz_update_jiffies(void)
138{ 138{
139 int cpu = smp_processor_id(); 139 int cpu = smp_processor_id();
140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
new file mode 100644
index 000000000000..71e7f1a19156
--- /dev/null
+++ b/kernel/time/timecompare.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) 2009 Intel Corporation.
3 * Author: Patrick Ohly <patrick.ohly@intel.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20#include <linux/timecompare.h>
21#include <linux/module.h>
22#include <linux/math64.h>
23
24/*
25 * fixed point arithmetic scale factor for skew
26 *
27 * Usually one would measure skew in ppb (parts per billion, 1e9), but
28 * using a factor of 2 simplifies the math.
29 */
30#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
31
32ktime_t timecompare_transform(struct timecompare *sync,
33 u64 source_tstamp)
34{
35 u64 nsec;
36
37 nsec = source_tstamp + sync->offset;
38 nsec += (s64)(source_tstamp - sync->last_update) * sync->skew /
39 TIMECOMPARE_SKEW_RESOLUTION;
40
41 return ns_to_ktime(nsec);
42}
43EXPORT_SYMBOL(timecompare_transform);
44
45int timecompare_offset(struct timecompare *sync,
46 s64 *offset,
47 u64 *source_tstamp)
48{
49 u64 start_source = 0, end_source = 0;
50 struct {
51 s64 offset;
52 s64 duration_target;
53 } buffer[10], sample, *samples;
54 int counter = 0, i;
55 int used;
56 int index;
57 int num_samples = sync->num_samples;
58
59 if (num_samples > sizeof(buffer)/sizeof(buffer[0])) {
60 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
61 if (!samples) {
62 samples = buffer;
63 num_samples = sizeof(buffer)/sizeof(buffer[0]);
64 }
65 } else {
66 samples = buffer;
67 }
68
69 /* run until we have enough valid samples, but do not try forever */
70 i = 0;
71 counter = 0;
72 while (1) {
73 u64 ts;
74 ktime_t start, end;
75
76 start = sync->target();
77 ts = timecounter_read(sync->source);
78 end = sync->target();
79
80 if (!i)
81 start_source = ts;
82
83 /* ignore negative durations */
84 sample.duration_target = ktime_to_ns(ktime_sub(end, start));
85 if (sample.duration_target >= 0) {
86 /*
87 * assume symetric delay to and from source:
88 * average target time corresponds to measured
89 * source time
90 */
91 sample.offset =
92 ktime_to_ns(ktime_add(end, start)) / 2 -
93 ts;
94
95 /* simple insertion sort based on duration */
96 index = counter - 1;
97 while (index >= 0) {
98 if (samples[index].duration_target <
99 sample.duration_target)
100 break;
101 samples[index + 1] = samples[index];
102 index--;
103 }
104 samples[index + 1] = sample;
105 counter++;
106 }
107
108 i++;
109 if (counter >= num_samples || i >= 100000) {
110 end_source = ts;
111 break;
112 }
113 }
114
115 *source_tstamp = (end_source + start_source) / 2;
116
117 /* remove outliers by only using 75% of the samples */
118 used = counter * 3 / 4;
119 if (!used)
120 used = counter;
121 if (used) {
122 /* calculate average */
123 s64 off = 0;
124 for (index = 0; index < used; index++)
125 off += samples[index].offset;
126 *offset = div_s64(off, used);
127 }
128
129 if (samples && samples != buffer)
130 kfree(samples);
131
132 return used;
133}
134EXPORT_SYMBOL(timecompare_offset);
135
136void __timecompare_update(struct timecompare *sync,
137 u64 source_tstamp)
138{
139 s64 offset;
140 u64 average_time;
141
142 if (!timecompare_offset(sync, &offset, &average_time))
143 return;
144
145 if (!sync->last_update) {
146 sync->last_update = average_time;
147 sync->offset = offset;
148 sync->skew = 0;
149 } else {
150 s64 delta_nsec = average_time - sync->last_update;
151
152 /* avoid division by negative or small deltas */
153 if (delta_nsec >= 10000) {
154 s64 delta_offset_nsec = offset - sync->offset;
155 s64 skew; /* delta_offset_nsec *
156 TIMECOMPARE_SKEW_RESOLUTION /
157 delta_nsec */
158 u64 divisor;
159
160 /* div_s64() is limited to 32 bit divisor */
161 skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
162 divisor = delta_nsec;
163 while (unlikely(divisor >= ((s64)1) << 32)) {
164 /* divide both by 2; beware, right shift
165 of negative value has undefined
166 behavior and can only be used for
167 the positive divisor */
168 skew = div_s64(skew, 2);
169 divisor >>= 1;
170 }
171 skew = div_s64(skew, divisor);
172
173 /*
174 * Calculate new overall skew as 4/16 the
175 * old value and 12/16 the new one. This is
176 * a rather arbitrary tradeoff between
177 * only using the latest measurement (0/16 and
178 * 16/16) and even more weight on past measurements.
179 */
180#define TIMECOMPARE_NEW_SKEW_PER_16 12
181 sync->skew =
182 div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
183 sync->skew +
184 TIMECOMPARE_NEW_SKEW_PER_16 * skew,
185 16);
186 sync->last_update = average_time;
187 sync->offset = offset;
188 }
189 }
190}
191EXPORT_SYMBOL(__timecompare_update);
diff --git a/kernel/timer.c b/kernel/timer.c
index 13dd64fe143d..9b77fc9a9ac8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -589,11 +589,14 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
589 } 589 }
590} 590}
591 591
592int __mod_timer(struct timer_list *timer, unsigned long expires) 592static inline int
593__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
593{ 594{
594 struct tvec_base *base, *new_base; 595 struct tvec_base *base, *new_base;
595 unsigned long flags; 596 unsigned long flags;
596 int ret = 0; 597 int ret;
598
599 ret = 0;
597 600
598 timer_stats_timer_set_start_info(timer); 601 timer_stats_timer_set_start_info(timer);
599 BUG_ON(!timer->function); 602 BUG_ON(!timer->function);
@@ -603,6 +606,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
603 if (timer_pending(timer)) { 606 if (timer_pending(timer)) {
604 detach_timer(timer, 0); 607 detach_timer(timer, 0);
605 ret = 1; 608 ret = 1;
609 } else {
610 if (pending_only)
611 goto out_unlock;
606 } 612 }
607 613
608 debug_timer_activate(timer); 614 debug_timer_activate(timer);
@@ -629,42 +635,28 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
629 635
630 timer->expires = expires; 636 timer->expires = expires;
631 internal_add_timer(base, timer); 637 internal_add_timer(base, timer);
638
639out_unlock:
632 spin_unlock_irqrestore(&base->lock, flags); 640 spin_unlock_irqrestore(&base->lock, flags);
633 641
634 return ret; 642 return ret;
635} 643}
636 644
637EXPORT_SYMBOL(__mod_timer);
638
639/** 645/**
640 * add_timer_on - start a timer on a particular CPU 646 * mod_timer_pending - modify a pending timer's timeout
641 * @timer: the timer to be added 647 * @timer: the pending timer to be modified
642 * @cpu: the CPU to start it on 648 * @expires: new timeout in jiffies
643 * 649 *
644 * This is not very scalable on SMP. Double adds are not possible. 650 * mod_timer_pending() is the same for pending timers as mod_timer(),
651 * but will not re-activate and modify already deleted timers.
652 *
653 * It is useful for unserialized use of timers.
645 */ 654 */
646void add_timer_on(struct timer_list *timer, int cpu) 655int mod_timer_pending(struct timer_list *timer, unsigned long expires)
647{ 656{
648 struct tvec_base *base = per_cpu(tvec_bases, cpu); 657 return __mod_timer(timer, expires, true);
649 unsigned long flags;
650
651 timer_stats_timer_set_start_info(timer);
652 BUG_ON(timer_pending(timer) || !timer->function);
653 spin_lock_irqsave(&base->lock, flags);
654 timer_set_base(timer, base);
655 debug_timer_activate(timer);
656 internal_add_timer(base, timer);
657 /*
658 * Check whether the other CPU is idle and needs to be
659 * triggered to reevaluate the timer wheel when nohz is
660 * active. We are protected against the other CPU fiddling
661 * with the timer by holding the timer base lock. This also
662 * makes sure that a CPU on the way to idle can not evaluate
663 * the timer wheel.
664 */
665 wake_up_idle_cpu(cpu);
666 spin_unlock_irqrestore(&base->lock, flags);
667} 658}
659EXPORT_SYMBOL(mod_timer_pending);
668 660
669/** 661/**
670 * mod_timer - modify a timer's timeout 662 * mod_timer - modify a timer's timeout
@@ -688,9 +680,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
688 */ 680 */
689int mod_timer(struct timer_list *timer, unsigned long expires) 681int mod_timer(struct timer_list *timer, unsigned long expires)
690{ 682{
691 BUG_ON(!timer->function);
692
693 timer_stats_timer_set_start_info(timer);
694 /* 683 /*
695 * This is a common optimization triggered by the 684 * This is a common optimization triggered by the
696 * networking code - if the timer is re-modified 685 * networking code - if the timer is re-modified
@@ -699,12 +688,62 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
699 if (timer->expires == expires && timer_pending(timer)) 688 if (timer->expires == expires && timer_pending(timer))
700 return 1; 689 return 1;
701 690
702 return __mod_timer(timer, expires); 691 return __mod_timer(timer, expires, false);
703} 692}
704
705EXPORT_SYMBOL(mod_timer); 693EXPORT_SYMBOL(mod_timer);
706 694
707/** 695/**
696 * add_timer - start a timer
697 * @timer: the timer to be added
698 *
699 * The kernel will do a ->function(->data) callback from the
700 * timer interrupt at the ->expires point in the future. The
701 * current time is 'jiffies'.
702 *
703 * The timer's ->expires, ->function (and if the handler uses it, ->data)
704 * fields must be set prior calling this function.
705 *
706 * Timers with an ->expires field in the past will be executed in the next
707 * timer tick.
708 */
709void add_timer(struct timer_list *timer)
710{
711 BUG_ON(timer_pending(timer));
712 mod_timer(timer, timer->expires);
713}
714EXPORT_SYMBOL(add_timer);
715
716/**
717 * add_timer_on - start a timer on a particular CPU
718 * @timer: the timer to be added
719 * @cpu: the CPU to start it on
720 *
721 * This is not very scalable on SMP. Double adds are not possible.
722 */
723void add_timer_on(struct timer_list *timer, int cpu)
724{
725 struct tvec_base *base = per_cpu(tvec_bases, cpu);
726 unsigned long flags;
727
728 timer_stats_timer_set_start_info(timer);
729 BUG_ON(timer_pending(timer) || !timer->function);
730 spin_lock_irqsave(&base->lock, flags);
731 timer_set_base(timer, base);
732 debug_timer_activate(timer);
733 internal_add_timer(base, timer);
734 /*
735 * Check whether the other CPU is idle and needs to be
736 * triggered to reevaluate the timer wheel when nohz is
737 * active. We are protected against the other CPU fiddling
738 * with the timer by holding the timer base lock. This also
739 * makes sure that a CPU on the way to idle can not evaluate
740 * the timer wheel.
741 */
742 wake_up_idle_cpu(cpu);
743 spin_unlock_irqrestore(&base->lock, flags);
744}
745
746/**
708 * del_timer - deactive a timer. 747 * del_timer - deactive a timer.
709 * @timer: the timer to be deactivated 748 * @timer: the timer to be deactivated
710 * 749 *
@@ -733,7 +772,6 @@ int del_timer(struct timer_list *timer)
733 772
734 return ret; 773 return ret;
735} 774}
736
737EXPORT_SYMBOL(del_timer); 775EXPORT_SYMBOL(del_timer);
738 776
739#ifdef CONFIG_SMP 777#ifdef CONFIG_SMP
@@ -767,7 +805,6 @@ out:
767 805
768 return ret; 806 return ret;
769} 807}
770
771EXPORT_SYMBOL(try_to_del_timer_sync); 808EXPORT_SYMBOL(try_to_del_timer_sync);
772 809
773/** 810/**
@@ -796,7 +833,6 @@ int del_timer_sync(struct timer_list *timer)
796 cpu_relax(); 833 cpu_relax();
797 } 834 }
798} 835}
799
800EXPORT_SYMBOL(del_timer_sync); 836EXPORT_SYMBOL(del_timer_sync);
801#endif 837#endif
802 838
@@ -1268,7 +1304,7 @@ signed long __sched schedule_timeout(signed long timeout)
1268 expire = timeout + jiffies; 1304 expire = timeout + jiffies;
1269 1305
1270 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); 1306 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1271 __mod_timer(&timer, expire); 1307 __mod_timer(&timer, expire, false);
1272 schedule(); 1308 schedule();
1273 del_singleshot_timer_sync(&timer); 1309 del_singleshot_timer_sync(&timer);
1274 1310
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e2a4ff6fc3a6..34e707e5ab87 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -52,6 +52,7 @@ config FUNCTION_TRACER
52 depends on HAVE_FUNCTION_TRACER 52 depends on HAVE_FUNCTION_TRACER
53 depends on DEBUG_KERNEL 53 depends on DEBUG_KERNEL
54 select FRAME_POINTER 54 select FRAME_POINTER
55 select KALLSYMS
55 select TRACING 56 select TRACING
56 select CONTEXT_SWITCH_TRACER 57 select CONTEXT_SWITCH_TRACER
57 help 58 help
@@ -238,6 +239,7 @@ config STACK_TRACER
238 depends on DEBUG_KERNEL 239 depends on DEBUG_KERNEL
239 select FUNCTION_TRACER 240 select FUNCTION_TRACER
240 select STACKTRACE 241 select STACKTRACE
242 select KALLSYMS
241 help 243 help
242 This special tracer records the maximum stack footprint of the 244 This special tracer records the maximum stack footprint of the
243 kernel and displays it in debugfs/tracing/stack_trace. 245 kernel and displays it in debugfs/tracing/stack_trace.
@@ -302,4 +304,27 @@ config FTRACE_STARTUP_TEST
302 functioning properly. It will do tests on all the configured 304 functioning properly. It will do tests on all the configured
303 tracers of ftrace. 305 tracers of ftrace.
304 306
307config MMIOTRACE
308 bool "Memory mapped IO tracing"
309 depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI
310 select TRACING
311 help
312 Mmiotrace traces Memory Mapped I/O access and is meant for
313 debugging and reverse engineering. It is called from the ioremap
314 implementation and works via page faults. Tracing is disabled by
315 default and can be enabled at run-time.
316
317 See Documentation/tracers/mmiotrace.txt.
318 If you are not helping to develop drivers, say N.
319
320config MMIOTRACE_TEST
321 tristate "Test module for mmiotrace"
322 depends on MMIOTRACE && m
323 help
324 This is a dumb module for testing mmiotrace. It is very dangerous
325 as it will write garbage to IO memory starting at a given address.
326 However, it should be safe to use on e.g. unused portion of VRAM.
327
328 Say N, unless you absolutely know what you are doing.
329
305endmenu 330endmenu
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2f32969c09df..fdf913dfc7e8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -17,6 +17,7 @@
17#include <linux/clocksource.h> 17#include <linux/clocksource.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/suspend.h>
20#include <linux/debugfs.h> 21#include <linux/debugfs.h>
21#include <linux/hardirq.h> 22#include <linux/hardirq.h>
22#include <linux/kthread.h> 23#include <linux/kthread.h>
@@ -1736,9 +1737,12 @@ static void clear_ftrace_pid(struct pid *pid)
1736{ 1737{
1737 struct task_struct *p; 1738 struct task_struct *p;
1738 1739
1740 rcu_read_lock();
1739 do_each_pid_task(pid, PIDTYPE_PID, p) { 1741 do_each_pid_task(pid, PIDTYPE_PID, p) {
1740 clear_tsk_trace_trace(p); 1742 clear_tsk_trace_trace(p);
1741 } while_each_pid_task(pid, PIDTYPE_PID, p); 1743 } while_each_pid_task(pid, PIDTYPE_PID, p);
1744 rcu_read_unlock();
1745
1742 put_pid(pid); 1746 put_pid(pid);
1743} 1747}
1744 1748
@@ -1746,9 +1750,11 @@ static void set_ftrace_pid(struct pid *pid)
1746{ 1750{
1747 struct task_struct *p; 1751 struct task_struct *p;
1748 1752
1753 rcu_read_lock();
1749 do_each_pid_task(pid, PIDTYPE_PID, p) { 1754 do_each_pid_task(pid, PIDTYPE_PID, p) {
1750 set_tsk_trace_trace(p); 1755 set_tsk_trace_trace(p);
1751 } while_each_pid_task(pid, PIDTYPE_PID, p); 1756 } while_each_pid_task(pid, PIDTYPE_PID, p);
1757 rcu_read_unlock();
1752} 1758}
1753 1759
1754static void clear_ftrace_pid_task(struct pid **pid) 1760static void clear_ftrace_pid_task(struct pid **pid)
@@ -1965,6 +1971,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1965#ifdef CONFIG_FUNCTION_GRAPH_TRACER 1971#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1966 1972
1967static atomic_t ftrace_graph_active; 1973static atomic_t ftrace_graph_active;
1974static struct notifier_block ftrace_suspend_notifier;
1968 1975
1969int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 1976int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
1970{ 1977{
@@ -2026,7 +2033,7 @@ free:
2026static int start_graph_tracing(void) 2033static int start_graph_tracing(void)
2027{ 2034{
2028 struct ftrace_ret_stack **ret_stack_list; 2035 struct ftrace_ret_stack **ret_stack_list;
2029 int ret; 2036 int ret, cpu;
2030 2037
2031 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * 2038 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
2032 sizeof(struct ftrace_ret_stack *), 2039 sizeof(struct ftrace_ret_stack *),
@@ -2035,6 +2042,10 @@ static int start_graph_tracing(void)
2035 if (!ret_stack_list) 2042 if (!ret_stack_list)
2036 return -ENOMEM; 2043 return -ENOMEM;
2037 2044
2045 /* The cpu_boot init_task->ret_stack will never be freed */
2046 for_each_online_cpu(cpu)
2047 ftrace_graph_init_task(idle_task(cpu));
2048
2038 do { 2049 do {
2039 ret = alloc_retstack_tasklist(ret_stack_list); 2050 ret = alloc_retstack_tasklist(ret_stack_list);
2040 } while (ret == -EAGAIN); 2051 } while (ret == -EAGAIN);
@@ -2043,6 +2054,27 @@ static int start_graph_tracing(void)
2043 return ret; 2054 return ret;
2044} 2055}
2045 2056
2057/*
2058 * Hibernation protection.
2059 * The state of the current task is too much unstable during
2060 * suspend/restore to disk. We want to protect against that.
2061 */
2062static int
2063ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
2064 void *unused)
2065{
2066 switch (state) {
2067 case PM_HIBERNATION_PREPARE:
2068 pause_graph_tracing();
2069 break;
2070
2071 case PM_POST_HIBERNATION:
2072 unpause_graph_tracing();
2073 break;
2074 }
2075 return NOTIFY_DONE;
2076}
2077
2046int register_ftrace_graph(trace_func_graph_ret_t retfunc, 2078int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2047 trace_func_graph_ent_t entryfunc) 2079 trace_func_graph_ent_t entryfunc)
2048{ 2080{
@@ -2050,6 +2082,9 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2050 2082
2051 mutex_lock(&ftrace_sysctl_lock); 2083 mutex_lock(&ftrace_sysctl_lock);
2052 2084
2085 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
2086 register_pm_notifier(&ftrace_suspend_notifier);
2087
2053 atomic_inc(&ftrace_graph_active); 2088 atomic_inc(&ftrace_graph_active);
2054 ret = start_graph_tracing(); 2089 ret = start_graph_tracing();
2055 if (ret) { 2090 if (ret) {
@@ -2075,6 +2110,7 @@ void unregister_ftrace_graph(void)
2075 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 2110 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2076 ftrace_graph_entry = ftrace_graph_entry_stub; 2111 ftrace_graph_entry = ftrace_graph_entry_stub;
2077 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 2112 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
2113 unregister_pm_notifier(&ftrace_suspend_notifier);
2078 2114
2079 mutex_unlock(&ftrace_sysctl_lock); 2115 mutex_unlock(&ftrace_sysctl_lock);
2080} 2116}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8b0daf0662ef..bd38c5cfd8ad 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -246,7 +246,7 @@ static inline int test_time_stamp(u64 delta)
246 return 0; 246 return 0;
247} 247}
248 248
249#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page)) 249#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data))
250 250
251/* 251/*
252 * head_page == tail_page && head == tail then buffer is empty. 252 * head_page == tail_page && head == tail then buffer is empty.
@@ -1025,12 +1025,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1025 } 1025 }
1026 1026
1027 if (next_page == head_page) { 1027 if (next_page == head_page) {
1028 if (!(buffer->flags & RB_FL_OVERWRITE)) { 1028 if (!(buffer->flags & RB_FL_OVERWRITE))
1029 /* reset write */
1030 if (tail <= BUF_PAGE_SIZE)
1031 local_set(&tail_page->write, tail);
1032 goto out_unlock; 1029 goto out_unlock;
1033 }
1034 1030
1035 /* tail_page has not moved yet? */ 1031 /* tail_page has not moved yet? */
1036 if (tail_page == cpu_buffer->tail_page) { 1032 if (tail_page == cpu_buffer->tail_page) {
@@ -1105,6 +1101,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1105 return event; 1101 return event;
1106 1102
1107 out_unlock: 1103 out_unlock:
1104 /* reset write */
1105 if (tail <= BUF_PAGE_SIZE)
1106 local_set(&tail_page->write, tail);
1107
1108 __raw_spin_unlock(&cpu_buffer->lock); 1108 __raw_spin_unlock(&cpu_buffer->lock);
1109 local_irq_restore(flags); 1109 local_irq_restore(flags);
1110 return NULL; 1110 return NULL;
@@ -2174,6 +2174,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2174 2174
2175 cpu_buffer->overrun = 0; 2175 cpu_buffer->overrun = 0;
2176 cpu_buffer->entries = 0; 2176 cpu_buffer->entries = 0;
2177
2178 cpu_buffer->write_stamp = 0;
2179 cpu_buffer->read_stamp = 0;
2177} 2180}
2178 2181
2179/** 2182/**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c580233add95..17bb88d86ac2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -40,7 +40,7 @@
40 40
41#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) 41#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
42 42
43unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; 43unsigned long __read_mostly tracing_max_latency;
44unsigned long __read_mostly tracing_thresh; 44unsigned long __read_mostly tracing_thresh;
45 45
46/* 46/*
@@ -3736,7 +3736,7 @@ static struct notifier_block trace_die_notifier = {
3736 * it if we decide to change what log level the ftrace dump 3736 * it if we decide to change what log level the ftrace dump
3737 * should be at. 3737 * should be at.
3738 */ 3738 */
3739#define KERN_TRACE KERN_INFO 3739#define KERN_TRACE KERN_EMERG
3740 3740
3741static void 3741static void
3742trace_printk_seq(struct trace_seq *s) 3742trace_printk_seq(struct trace_seq *s)
@@ -3770,6 +3770,7 @@ void ftrace_dump(void)
3770 dump_ran = 1; 3770 dump_ran = 1;
3771 3771
3772 /* No turning back! */ 3772 /* No turning back! */
3773 tracing_off();
3773 ftrace_kill(); 3774 ftrace_kill();
3774 3775
3775 for_each_tracing_cpu(cpu) { 3776 for_each_tracing_cpu(cpu) {
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7c2e326bbc8b..62a78d943534 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -380,6 +380,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
380 380
381static void __irqsoff_tracer_init(struct trace_array *tr) 381static void __irqsoff_tracer_init(struct trace_array *tr)
382{ 382{
383 tracing_max_latency = 0;
383 irqsoff_trace = tr; 384 irqsoff_trace = tr;
384 /* make sure that the tracer is visible */ 385 /* make sure that the tracer is visible */
385 smp_wmb(); 386 smp_wmb();
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fffcb069f1dc..80e503ef6136 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <asm/atomic.h>
12 13
13#include "trace.h" 14#include "trace.h"
14 15
@@ -19,6 +20,7 @@ struct header_iter {
19static struct trace_array *mmio_trace_array; 20static struct trace_array *mmio_trace_array;
20static bool overrun_detected; 21static bool overrun_detected;
21static unsigned long prev_overruns; 22static unsigned long prev_overruns;
23static atomic_t dropped_count;
22 24
23static void mmio_reset_data(struct trace_array *tr) 25static void mmio_reset_data(struct trace_array *tr)
24{ 26{
@@ -121,11 +123,11 @@ static void mmio_close(struct trace_iterator *iter)
121 123
122static unsigned long count_overruns(struct trace_iterator *iter) 124static unsigned long count_overruns(struct trace_iterator *iter)
123{ 125{
124 unsigned long cnt = 0; 126 unsigned long cnt = atomic_xchg(&dropped_count, 0);
125 unsigned long over = ring_buffer_overruns(iter->tr->buffer); 127 unsigned long over = ring_buffer_overruns(iter->tr->buffer);
126 128
127 if (over > prev_overruns) 129 if (over > prev_overruns)
128 cnt = over - prev_overruns; 130 cnt += over - prev_overruns;
129 prev_overruns = over; 131 prev_overruns = over;
130 return cnt; 132 return cnt;
131} 133}
@@ -310,8 +312,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
310 312
311 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 313 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
312 &irq_flags); 314 &irq_flags);
313 if (!event) 315 if (!event) {
316 atomic_inc(&dropped_count);
314 return; 317 return;
318 }
315 entry = ring_buffer_event_data(event); 319 entry = ring_buffer_event_data(event);
316 tracing_generic_entry_update(&entry->ent, 0, preempt_count()); 320 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
317 entry->ent.type = TRACE_MMIO_RW; 321 entry->ent.type = TRACE_MMIO_RW;
@@ -338,8 +342,10 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
338 342
339 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 343 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
340 &irq_flags); 344 &irq_flags);
341 if (!event) 345 if (!event) {
346 atomic_inc(&dropped_count);
342 return; 347 return;
348 }
343 entry = ring_buffer_event_data(event); 349 entry = ring_buffer_event_data(event);
344 tracing_generic_entry_update(&entry->ent, 0, preempt_count()); 350 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
345 entry->ent.type = TRACE_MMIO_MAP; 351 entry->ent.type = TRACE_MMIO_MAP;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 43586b689e31..42ae1e77b6b3 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -333,6 +333,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
333 333
334static int wakeup_tracer_init(struct trace_array *tr) 334static int wakeup_tracer_init(struct trace_array *tr)
335{ 335{
336 tracing_max_latency = 0;
336 wakeup_trace = tr; 337 wakeup_trace = tr;
337 start_wakeup_tracer(tr); 338 start_wakeup_tracer(tr);
338 return 0; 339 return 0;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 88c8eb70f54a..bc8e80a86bca 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -23,10 +23,20 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
23{ 23{
24 struct ring_buffer_event *event; 24 struct ring_buffer_event *event;
25 struct trace_entry *entry; 25 struct trace_entry *entry;
26 unsigned int loops = 0;
26 27
27 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { 28 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
28 entry = ring_buffer_event_data(event); 29 entry = ring_buffer_event_data(event);
29 30
31 /*
32 * The ring buffer is a size of trace_buf_size, if
33 * we loop more than the size, there's something wrong
34 * with the ring buffer.
35 */
36 if (loops++ > trace_buf_size) {
37 printk(KERN_CONT ".. bad ring buffer ");
38 goto failed;
39 }
30 if (!trace_valid_entry(entry)) { 40 if (!trace_valid_entry(entry)) {
31 printk(KERN_CONT ".. invalid entry %d ", 41 printk(KERN_CONT ".. invalid entry %d ",
32 entry->type); 42 entry->type);
@@ -57,11 +67,20 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
57 67
58 cnt = ring_buffer_entries(tr->buffer); 68 cnt = ring_buffer_entries(tr->buffer);
59 69
70 /*
71 * The trace_test_buffer_cpu runs a while loop to consume all data.
72 * If the calling tracer is broken, and is constantly filling
73 * the buffer, this will run forever, and hard lock the box.
74 * We disable the ring buffer while we do this test to prevent
75 * a hard lock up.
76 */
77 tracing_off();
60 for_each_possible_cpu(cpu) { 78 for_each_possible_cpu(cpu) {
61 ret = trace_test_buffer_cpu(tr, cpu); 79 ret = trace_test_buffer_cpu(tr, cpu);
62 if (ret) 80 if (ret)
63 break; 81 break;
64 } 82 }
83 tracing_on();
65 __raw_spin_unlock(&ftrace_max_lock); 84 __raw_spin_unlock(&ftrace_max_lock);
66 local_irq_restore(flags); 85 local_irq_restore(flags);
67 86
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 43f891b05a4b..00d59d048edf 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -122,8 +122,10 @@ void acct_update_integrals(struct task_struct *tsk)
122 if (likely(tsk->mm)) { 122 if (likely(tsk->mm)) {
123 cputime_t time, dtime; 123 cputime_t time, dtime;
124 struct timeval value; 124 struct timeval value;
125 unsigned long flags;
125 u64 delta; 126 u64 delta;
126 127
128 local_irq_save(flags);
127 time = tsk->stime + tsk->utime; 129 time = tsk->stime + tsk->utime;
128 dtime = cputime_sub(time, tsk->acct_timexpd); 130 dtime = cputime_sub(time, tsk->acct_timexpd);
129 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
@@ -131,10 +133,12 @@ void acct_update_integrals(struct task_struct *tsk)
131 delta = delta * USEC_PER_SEC + value.tv_usec; 133 delta = delta * USEC_PER_SEC + value.tv_usec;
132 134
133 if (delta == 0) 135 if (delta == 0)
134 return; 136 goto out;
135 tsk->acct_timexpd = time; 137 tsk->acct_timexpd = time;
136 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); 138 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
137 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; 139 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
140 out:
141 local_irq_restore(flags);
138 } 142 }
139} 143}
140 144
diff --git a/kernel/user.c b/kernel/user.c
index 477b6660f447..850e0ba41c1e 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -20,7 +20,7 @@
20 20
21struct user_namespace init_user_ns = { 21struct user_namespace init_user_ns = {
22 .kref = { 22 .kref = {
23 .refcount = ATOMIC_INIT(1), 23 .refcount = ATOMIC_INIT(2),
24 }, 24 },
25 .creator = &root_user, 25 .creator = &root_user,
26}; 26};
@@ -72,6 +72,7 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
72static void uid_hash_remove(struct user_struct *up) 72static void uid_hash_remove(struct user_struct *up)
73{ 73{
74 hlist_del_init(&up->uidhash_node); 74 hlist_del_init(&up->uidhash_node);
75 put_user_ns(up->user_ns);
75} 76}
76 77
77static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 78static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
@@ -285,14 +286,12 @@ int __init uids_sysfs_init(void)
285/* work function to remove sysfs directory for a user and free up 286/* work function to remove sysfs directory for a user and free up
286 * corresponding structures. 287 * corresponding structures.
287 */ 288 */
288static void remove_user_sysfs_dir(struct work_struct *w) 289static void cleanup_user_struct(struct work_struct *w)
289{ 290{
290 struct user_struct *up = container_of(w, struct user_struct, work); 291 struct user_struct *up = container_of(w, struct user_struct, work);
291 unsigned long flags; 292 unsigned long flags;
292 int remove_user = 0; 293 int remove_user = 0;
293 294
294 if (up->user_ns != &init_user_ns)
295 return;
296 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() 295 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
297 * atomic. 296 * atomic.
298 */ 297 */
@@ -311,9 +310,11 @@ static void remove_user_sysfs_dir(struct work_struct *w)
311 if (!remove_user) 310 if (!remove_user)
312 goto done; 311 goto done;
313 312
314 kobject_uevent(&up->kobj, KOBJ_REMOVE); 313 if (up->user_ns == &init_user_ns) {
315 kobject_del(&up->kobj); 314 kobject_uevent(&up->kobj, KOBJ_REMOVE);
316 kobject_put(&up->kobj); 315 kobject_del(&up->kobj);
316 kobject_put(&up->kobj);
317 }
317 318
318 sched_destroy_user(up); 319 sched_destroy_user(up);
319 key_put(up->uid_keyring); 320 key_put(up->uid_keyring);
@@ -334,8 +335,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
334 atomic_inc(&up->__count); 335 atomic_inc(&up->__count);
335 spin_unlock_irqrestore(&uidhash_lock, flags); 336 spin_unlock_irqrestore(&uidhash_lock, flags);
336 337
337 put_user_ns(up->user_ns); 338 INIT_WORK(&up->work, cleanup_user_struct);
338 INIT_WORK(&up->work, remove_user_sysfs_dir);
339 schedule_work(&up->work); 339 schedule_work(&up->work);
340} 340}
341 341
@@ -357,12 +357,29 @@ static void free_user(struct user_struct *up, unsigned long flags)
357 sched_destroy_user(up); 357 sched_destroy_user(up);
358 key_put(up->uid_keyring); 358 key_put(up->uid_keyring);
359 key_put(up->session_keyring); 359 key_put(up->session_keyring);
360 put_user_ns(up->user_ns);
361 kmem_cache_free(uid_cachep, up); 360 kmem_cache_free(uid_cachep, up);
362} 361}
363 362
364#endif 363#endif
365 364
365#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
366/*
367 * We need to check if a setuid can take place. This function should be called
368 * before successfully completing the setuid.
369 */
370int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
371{
372
373 return sched_rt_can_attach(up->tg, tsk);
374
375}
376#else
377int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
378{
379 return 1;
380}
381#endif
382
366/* 383/*
367 * Locate the user_struct for the passed UID. If found, take a ref on it. The 384 * Locate the user_struct for the passed UID. If found, take a ref on it. The
368 * caller must undo that ref with free_uid(). 385 * caller must undo that ref with free_uid().
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 79084311ee57..076c7c8215b0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -60,12 +60,25 @@ int create_user_ns(struct cred *new)
60 return 0; 60 return 0;
61} 61}
62 62
63void free_user_ns(struct kref *kref) 63/*
64 * Deferred destructor for a user namespace. This is required because
65 * free_user_ns() may be called with uidhash_lock held, but we need to call
66 * back to free_uid() which will want to take the lock again.
67 */
68static void free_user_ns_work(struct work_struct *work)
64{ 69{
65 struct user_namespace *ns; 70 struct user_namespace *ns =
66 71 container_of(work, struct user_namespace, destroyer);
67 ns = container_of(kref, struct user_namespace, kref);
68 free_uid(ns->creator); 72 free_uid(ns->creator);
69 kfree(ns); 73 kfree(ns);
70} 74}
75
76void free_user_ns(struct kref *kref)
77{
78 struct user_namespace *ns =
79 container_of(kref, struct user_namespace, kref);
80
81 INIT_WORK(&ns->destroyer, free_user_ns_work);
82 schedule_work(&ns->destroyer);
83}
71EXPORT_SYMBOL(free_user_ns); 84EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/wait.c b/kernel/wait.c
index cd87131f2fc2..42a2dbc181c8 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -91,6 +91,15 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
91} 91}
92EXPORT_SYMBOL(prepare_to_wait_exclusive); 92EXPORT_SYMBOL(prepare_to_wait_exclusive);
93 93
94/*
95 * finish_wait - clean up after waiting in a queue
96 * @q: waitqueue waited on
97 * @wait: wait descriptor
98 *
99 * Sets current thread back to running state and removes
100 * the wait descriptor from the given waitqueue if still
101 * queued.
102 */
94void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) 103void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
95{ 104{
96 unsigned long flags; 105 unsigned long flags;
@@ -117,6 +126,39 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
117} 126}
118EXPORT_SYMBOL(finish_wait); 127EXPORT_SYMBOL(finish_wait);
119 128
129/*
130 * abort_exclusive_wait - abort exclusive waiting in a queue
131 * @q: waitqueue waited on
132 * @wait: wait descriptor
133 * @state: runstate of the waiter to be woken
134 * @key: key to identify a wait bit queue or %NULL
135 *
136 * Sets current thread back to running state and removes
137 * the wait descriptor from the given waitqueue if still
138 * queued.
139 *
140 * Wakes up the next waiter if the caller is concurrently
141 * woken up through the queue.
142 *
143 * This prevents waiter starvation where an exclusive waiter
144 * aborts and is woken up concurrently and noone wakes up
145 * the next waiter.
146 */
147void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
148 unsigned int mode, void *key)
149{
150 unsigned long flags;
151
152 __set_current_state(TASK_RUNNING);
153 spin_lock_irqsave(&q->lock, flags);
154 if (!list_empty(&wait->task_list))
155 list_del_init(&wait->task_list);
156 else if (waitqueue_active(q))
157 __wake_up_common(q, mode, 1, 0, key);
158 spin_unlock_irqrestore(&q->lock, flags);
159}
160EXPORT_SYMBOL(abort_exclusive_wait);
161
120int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) 162int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
121{ 163{
122 int ret = default_wake_function(wait, mode, sync, key); 164 int ret = default_wake_function(wait, mode, sync, key);
@@ -177,17 +219,20 @@ int __sched
177__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 219__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
178 int (*action)(void *), unsigned mode) 220 int (*action)(void *), unsigned mode)
179{ 221{
180 int ret = 0;
181
182 do { 222 do {
223 int ret;
224
183 prepare_to_wait_exclusive(wq, &q->wait, mode); 225 prepare_to_wait_exclusive(wq, &q->wait, mode);
184 if (test_bit(q->key.bit_nr, q->key.flags)) { 226 if (!test_bit(q->key.bit_nr, q->key.flags))
185 if ((ret = (*action)(q->key.flags))) 227 continue;
186 break; 228 ret = action(q->key.flags);
187 } 229 if (!ret)
230 continue;
231 abort_exclusive_wait(wq, &q->wait, mode, &q->key);
232 return ret;
188 } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); 233 } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
189 finish_wait(wq, &q->wait); 234 finish_wait(wq, &q->wait);
190 return ret; 235 return 0;
191} 236}
192EXPORT_SYMBOL(__wait_on_bit_lock); 237EXPORT_SYMBOL(__wait_on_bit_lock);
193 238
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2f445833ae37..1f0c509b40d3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -971,6 +971,8 @@ undo:
971} 971}
972 972
973#ifdef CONFIG_SMP 973#ifdef CONFIG_SMP
974static struct workqueue_struct *work_on_cpu_wq __read_mostly;
975
974struct work_for_cpu { 976struct work_for_cpu {
975 struct work_struct work; 977 struct work_struct work;
976 long (*fn)(void *); 978 long (*fn)(void *);
@@ -991,8 +993,8 @@ static void do_work_for_cpu(struct work_struct *w)
991 * @fn: the function to run 993 * @fn: the function to run
992 * @arg: the function arg 994 * @arg: the function arg
993 * 995 *
994 * This will return -EINVAL in the cpu is not online, or the return value 996 * This will return the value @fn returns.
995 * of @fn otherwise. 997 * It is up to the caller to ensure that the cpu doesn't go offline.
996 */ 998 */
997long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) 999long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
998{ 1000{
@@ -1001,14 +1003,8 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
1001 INIT_WORK(&wfc.work, do_work_for_cpu); 1003 INIT_WORK(&wfc.work, do_work_for_cpu);
1002 wfc.fn = fn; 1004 wfc.fn = fn;
1003 wfc.arg = arg; 1005 wfc.arg = arg;
1004 get_online_cpus(); 1006 queue_work_on(cpu, work_on_cpu_wq, &wfc.work);
1005 if (unlikely(!cpu_online(cpu))) 1007 flush_work(&wfc.work);
1006 wfc.ret = -EINVAL;
1007 else {
1008 schedule_work_on(cpu, &wfc.work);
1009 flush_work(&wfc.work);
1010 }
1011 put_online_cpus();
1012 1008
1013 return wfc.ret; 1009 return wfc.ret;
1014} 1010}
@@ -1025,4 +1021,8 @@ void __init init_workqueues(void)
1025 hotcpu_notifier(workqueue_cpu_callback, 0); 1021 hotcpu_notifier(workqueue_cpu_callback, 0);
1026 keventd_wq = create_workqueue("events"); 1022 keventd_wq = create_workqueue("events");
1027 BUG_ON(!keventd_wq); 1023 BUG_ON(!keventd_wq);
1024#ifdef CONFIG_SMP
1025 work_on_cpu_wq = create_workqueue("work_on_cpu");
1026 BUG_ON(!work_on_cpu_wq);
1027#endif
1028} 1028}