Diffstat (limited to 'kernel')
123 files changed, 6628 insertions, 2206 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b35..24e7cb0ba26 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY | |||
35 | 35 | ||
36 | config PREEMPT | 36 | config PREEMPT |
37 | bool "Preemptible Kernel (Low-Latency Desktop)" | 37 | bool "Preemptible Kernel (Low-Latency Desktop)" |
38 | select PREEMPT_COUNT | ||
38 | help | 39 | help |
39 | This option reduces the latency of the kernel by making | 40 | This option reduces the latency of the kernel by making |
40 | all kernel code (that is not executing in a critical section) | 41 | all kernel code (that is not executing in a critical section) |
@@ -52,3 +53,5 @@ config PREEMPT | |||
52 | 53 | ||
53 | endchoice | 54 | endchoice |
54 | 55 | ||
56 | config PREEMPT_COUNT | ||
57 | bool \ No newline at end of file | ||
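
The new hidden PREEMPT_COUNT symbol lets code that needs a real preemption counter select it, with CONFIG_PREEMPT now merely being one such selector. Roughly (a paraphrase from memory of the <linux/preempt.h> of this era, not copied verbatim), the symbol decides whether preempt_disable()/preempt_enable() maintain an actual count or compile away:

    #ifdef CONFIG_PREEMPT_COUNT
    # define preempt_disable() \
            do { \
                    inc_preempt_count(); \
                    barrier(); \
            } while (0)
    #else
    # define preempt_disable()      do { } while (0)    /* counter compiled out */
    #endif
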
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d64cfcc8b4..eca595e2fd5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ | |||
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ | 12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ |
13 | async.o range.o jump_label.o | 13 | async.o range.o |
14 | obj-y += groups.o | 14 | obj-y += groups.o |
15 | 15 | ||
16 | ifdef CONFIG_FUNCTION_TRACER | 16 | ifdef CONFIG_FUNCTION_TRACER |
@@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += events/ | |||
107 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | 107 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o |
108 | obj-$(CONFIG_PADATA) += padata.o | 108 | obj-$(CONFIG_PADATA) += padata.o |
109 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 109 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
110 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o | ||
110 | 111 | ||
111 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | 112 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) |
112 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 113 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
@@ -125,11 +126,10 @@ targets += config_data.gz | |||
125 | $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE | 126 | $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE |
126 | $(call if_changed,gzip) | 127 | $(call if_changed,gzip) |
127 | 128 | ||
128 | quiet_cmd_ikconfiggz = IKCFG $@ | 129 | filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") |
129 | cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ | ||
130 | targets += config_data.h | 130 | targets += config_data.h |
131 | $(obj)/config_data.h: $(obj)/config_data.gz FORCE | 131 | $(obj)/config_data.h: $(obj)/config_data.gz FORCE |
132 | $(call if_changed,ikconfiggz) | 132 | $(call filechk,ikconfiggz) |
133 | 133 | ||
134 | $(obj)/time.o: $(obj)/timeconst.h | 134 | $(obj)/time.o: $(obj)/timeconst.h |
135 | 135 | ||
diff --git a/kernel/async.c b/kernel/async.c
index cd9dbb913c7..d5fe7af0de2 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,12 +49,13 @@ asynchronous and synchronous parts of the kernel. | |||
49 | */ | 49 | */ |
50 | 50 | ||
51 | #include <linux/async.h> | 51 | #include <linux/async.h> |
52 | #include <linux/atomic.h> | ||
53 | #include <linux/ktime.h> | ||
52 | #include <linux/module.h> | 54 | #include <linux/module.h> |
53 | #include <linux/wait.h> | 55 | #include <linux/wait.h> |
54 | #include <linux/sched.h> | 56 | #include <linux/sched.h> |
55 | #include <linux/slab.h> | 57 | #include <linux/slab.h> |
56 | #include <linux/workqueue.h> | 58 | #include <linux/workqueue.h> |
57 | #include <asm/atomic.h> | ||
58 | 59 | ||
59 | static async_cookie_t next_cookie = 1; | 60 | static async_cookie_t next_cookie = 1; |
60 | 61 | ||
@@ -128,7 +129,8 @@ static void async_run_entry_fn(struct work_struct *work) | |||
128 | 129 | ||
129 | /* 2) run (and print duration) */ | 130 | /* 2) run (and print duration) */ |
130 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 131 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
131 | printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, | 132 | printk(KERN_DEBUG "calling %lli_%pF @ %i\n", |
133 | (long long)entry->cookie, | ||
132 | entry->func, task_pid_nr(current)); | 134 | entry->func, task_pid_nr(current)); |
133 | calltime = ktime_get(); | 135 | calltime = ktime_get(); |
134 | } | 136 | } |
@@ -136,7 +138,7 @@ static void async_run_entry_fn(struct work_struct *work) | |||
136 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 138 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
137 | rettime = ktime_get(); | 139 | rettime = ktime_get(); |
138 | delta = ktime_sub(rettime, calltime); | 140 | delta = ktime_sub(rettime, calltime); |
139 | printk("initcall %lli_%pF returned 0 after %lld usecs\n", | 141 | printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", |
140 | (long long)entry->cookie, | 142 | (long long)entry->cookie, |
141 | entry->func, | 143 | entry->func, |
142 | (long long)ktime_to_ns(delta) >> 10); | 144 | (long long)ktime_to_ns(delta) >> 10); |
@@ -270,7 +272,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, | |||
270 | ktime_t starttime, delta, endtime; | 272 | ktime_t starttime, delta, endtime; |
271 | 273 | ||
272 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 274 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
273 | printk("async_waiting @ %i\n", task_pid_nr(current)); | 275 | printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); |
274 | starttime = ktime_get(); | 276 | starttime = ktime_get(); |
275 | } | 277 | } |
276 | 278 | ||
@@ -280,7 +282,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, | |||
280 | endtime = ktime_get(); | 282 | endtime = ktime_get(); |
281 | delta = ktime_sub(endtime, starttime); | 283 | delta = ktime_sub(endtime, starttime); |
282 | 284 | ||
283 | printk("async_continuing @ %i after %lli usec\n", | 285 | printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", |
284 | task_pid_nr(current), | 286 | task_pid_nr(current), |
285 | (long long)ktime_to_ns(delta) >> 10); | 287 | (long long)ktime_to_ns(delta) >> 10); |
286 | } | 288 | } |
diff --git a/kernel/audit.c b/kernel/audit.c
index 93950031706..0a1355ca3d7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -43,7 +43,7 @@ | |||
43 | 43 | ||
44 | #include <linux/init.h> | 44 | #include <linux/init.h> |
45 | #include <asm/types.h> | 45 | #include <asm/types.h> |
46 | #include <asm/atomic.h> | 46 | #include <linux/atomic.h> |
47 | #include <linux/mm.h> | 47 | #include <linux/mm.h> |
48 | #include <linux/module.h> | 48 | #include <linux/module.h> |
49 | #include <linux/slab.h> | 49 | #include <linux/slab.h> |
@@ -55,6 +55,9 @@ | |||
55 | #include <net/sock.h> | 55 | #include <net/sock.h> |
56 | #include <net/netlink.h> | 56 | #include <net/netlink.h> |
57 | #include <linux/skbuff.h> | 57 | #include <linux/skbuff.h> |
58 | #ifdef CONFIG_SECURITY | ||
59 | #include <linux/security.h> | ||
60 | #endif | ||
58 | #include <linux/netlink.h> | 61 | #include <linux/netlink.h> |
59 | #include <linux/freezer.h> | 62 | #include <linux/freezer.h> |
60 | #include <linux/tty.h> | 63 | #include <linux/tty.h> |
@@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, | |||
1502 | } | 1505 | } |
1503 | } | 1506 | } |
1504 | 1507 | ||
1508 | #ifdef CONFIG_SECURITY | ||
1509 | /** | ||
1510 | * audit_log_secctx - Converts and logs SELinux context | ||
1511 | * @ab: audit_buffer | ||
1512 | * @secid: security number | ||
1513 | * | ||
1514 | * This is a helper function that calls security_secid_to_secctx to convert | ||
1515 | * secid to secctx and then adds the (converted) SELinux context to the audit | ||
1516 | * log by calling audit_log_format, thus also preventing leak of internal secid | ||
1517 | * to userspace. If secid cannot be converted audit_panic is called. | ||
1518 | */ | ||
1519 | void audit_log_secctx(struct audit_buffer *ab, u32 secid) | ||
1520 | { | ||
1521 | u32 len; | ||
1522 | char *secctx; | ||
1523 | |||
1524 | if (security_secid_to_secctx(secid, &secctx, &len)) { | ||
1525 | audit_panic("Cannot convert secid to context"); | ||
1526 | } else { | ||
1527 | audit_log_format(ab, " obj=%s", secctx); | ||
1528 | security_release_secctx(secctx, len); | ||
1529 | } | ||
1530 | } | ||
1531 | EXPORT_SYMBOL(audit_log_secctx); | ||
1532 | #endif | ||
1533 | |||
1505 | EXPORT_SYMBOL(audit_log_start); | 1534 | EXPORT_SYMBOL(audit_log_start); |
1506 | EXPORT_SYMBOL(audit_log_end); | 1535 | EXPORT_SYMBOL(audit_log_end); |
1507 | EXPORT_SYMBOL(audit_log_format); | 1536 | EXPORT_SYMBOL(audit_log_format); |
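
A minimal sketch of how a caller might use the new audit_log_secctx() helper when emitting a record; the surrounding calls are the usual audit_buffer API, but the record type, format string, and function name are illustrative only:

    static void example_log_with_secctx(struct audit_context *ctx, u32 secid)
    {
            struct audit_buffer *ab;

            ab = audit_log_start(ctx, GFP_KERNEL, AUDIT_AVC);   /* type chosen for illustration */
            if (!ab)
                    return;
            audit_log_format(ab, "op=example pid=%d", task_pid_nr(current));
            audit_log_secctx(ab, secid);    /* appends " obj=<secctx>" or calls audit_panic() */
            audit_log_end(ab);
    }
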
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e99dda04b12..5bf0790497e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -93,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree) | |||
93 | atomic_inc(&tree->count); | 93 | atomic_inc(&tree->count); |
94 | } | 94 | } |
95 | 95 | ||
96 | static void __put_tree(struct rcu_head *rcu) | ||
97 | { | ||
98 | struct audit_tree *tree = container_of(rcu, struct audit_tree, head); | ||
99 | kfree(tree); | ||
100 | } | ||
101 | |||
102 | static inline void put_tree(struct audit_tree *tree) | 96 | static inline void put_tree(struct audit_tree *tree) |
103 | { | 97 | { |
104 | if (atomic_dec_and_test(&tree->count)) | 98 | if (atomic_dec_and_test(&tree->count)) |
105 | call_rcu(&tree->head, __put_tree); | 99 | kfree_rcu(tree, head); |
106 | } | 100 | } |
107 | 101 | ||
108 | /* to avoid bringing the entire thing in audit.h */ | 102 | /* to avoid bringing the entire thing in audit.h */ |
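
The conversion above is the generic kfree_rcu() pattern: whenever an RCU callback would do nothing but kfree() the enclosing structure, the callback can be dropped entirely. A sketch with a made-up structure:

    struct foo {
            struct rcu_head rcu;
            atomic_t count;
            /* ... payload ... */
    };

    static void put_foo(struct foo *f)
    {
            if (atomic_dec_and_test(&f->count))
                    kfree_rcu(f, rcu);      /* was: call_rcu(&f->rcu, callback_that_just_kfrees) */
    }
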
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 00d79df03e7..ce4b054acee 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -44,7 +44,7 @@ | |||
44 | 44 | ||
45 | #include <linux/init.h> | 45 | #include <linux/init.h> |
46 | #include <asm/types.h> | 46 | #include <asm/types.h> |
47 | #include <asm/atomic.h> | 47 | #include <linux/atomic.h> |
48 | #include <linux/fs.h> | 48 | #include <linux/fs.h> |
49 | #include <linux/namei.h> | 49 | #include <linux/namei.h> |
50 | #include <linux/mm.h> | 50 | #include <linux/mm.h> |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2731d115d72..54a36fe288f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -27,9 +27,11 @@ | |||
27 | */ | 27 | */ |
28 | 28 | ||
29 | #include <linux/cgroup.h> | 29 | #include <linux/cgroup.h> |
30 | #include <linux/cred.h> | ||
30 | #include <linux/ctype.h> | 31 | #include <linux/ctype.h> |
31 | #include <linux/errno.h> | 32 | #include <linux/errno.h> |
32 | #include <linux/fs.h> | 33 | #include <linux/fs.h> |
34 | #include <linux/init_task.h> | ||
33 | #include <linux/kernel.h> | 35 | #include <linux/kernel.h> |
34 | #include <linux/list.h> | 36 | #include <linux/list.h> |
35 | #include <linux/mm.h> | 37 | #include <linux/mm.h> |
@@ -59,7 +61,7 @@ | |||
59 | #include <linux/poll.h> | 61 | #include <linux/poll.h> |
60 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ | 62 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ |
61 | 63 | ||
62 | #include <asm/atomic.h> | 64 | #include <linux/atomic.h> |
63 | 65 | ||
64 | static DEFINE_MUTEX(cgroup_mutex); | 66 | static DEFINE_MUTEX(cgroup_mutex); |
65 | 67 | ||
@@ -268,6 +270,33 @@ static void cgroup_release_agent(struct work_struct *work); | |||
268 | static DECLARE_WORK(release_agent_work, cgroup_release_agent); | 270 | static DECLARE_WORK(release_agent_work, cgroup_release_agent); |
269 | static void check_for_release(struct cgroup *cgrp); | 271 | static void check_for_release(struct cgroup *cgrp); |
270 | 272 | ||
273 | /* | ||
274 | * A queue for waiters to do rmdir() cgroup. A tasks will sleep when | ||
275 | * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some | ||
276 | * reference to css->refcnt. In general, this refcnt is expected to goes down | ||
277 | * to zero, soon. | ||
278 | * | ||
279 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; | ||
280 | */ | ||
281 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | ||
282 | |||
283 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) | ||
284 | { | ||
285 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | ||
286 | wake_up_all(&cgroup_rmdir_waitq); | ||
287 | } | ||
288 | |||
289 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | ||
290 | { | ||
291 | css_get(css); | ||
292 | } | ||
293 | |||
294 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | ||
295 | { | ||
296 | cgroup_wakeup_rmdir_waiter(css->cgroup); | ||
297 | css_put(css); | ||
298 | } | ||
299 | |||
271 | /* Link structure for associating css_set objects with cgroups */ | 300 | /* Link structure for associating css_set objects with cgroups */ |
272 | struct cg_cgroup_link { | 301 | struct cg_cgroup_link { |
273 | /* | 302 | /* |
@@ -327,52 +356,43 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) | |||
327 | return &css_set_table[index]; | 356 | return &css_set_table[index]; |
328 | } | 357 | } |
329 | 358 | ||
330 | /* We don't maintain the lists running through each css_set to its | 359 | static void free_css_set_work(struct work_struct *work) |
331 | * task until after the first call to cgroup_iter_start(). This | ||
332 | * reduces the fork()/exit() overhead for people who have cgroups | ||
333 | * compiled into their kernel but not actually in use */ | ||
334 | static int use_task_css_set_links __read_mostly; | ||
335 | |||
336 | static void __put_css_set(struct css_set *cg, int taskexit) | ||
337 | { | 360 | { |
361 | struct css_set *cg = container_of(work, struct css_set, work); | ||
338 | struct cg_cgroup_link *link; | 362 | struct cg_cgroup_link *link; |
339 | struct cg_cgroup_link *saved_link; | 363 | struct cg_cgroup_link *saved_link; |
340 | /* | ||
341 | * Ensure that the refcount doesn't hit zero while any readers | ||
342 | * can see it. Similar to atomic_dec_and_lock(), but for an | ||
343 | * rwlock | ||
344 | */ | ||
345 | if (atomic_add_unless(&cg->refcount, -1, 1)) | ||
346 | return; | ||
347 | write_lock(&css_set_lock); | ||
348 | if (!atomic_dec_and_test(&cg->refcount)) { | ||
349 | write_unlock(&css_set_lock); | ||
350 | return; | ||
351 | } | ||
352 | |||
353 | /* This css_set is dead. unlink it and release cgroup refcounts */ | ||
354 | hlist_del(&cg->hlist); | ||
355 | css_set_count--; | ||
356 | 364 | ||
365 | write_lock(&css_set_lock); | ||
357 | list_for_each_entry_safe(link, saved_link, &cg->cg_links, | 366 | list_for_each_entry_safe(link, saved_link, &cg->cg_links, |
358 | cg_link_list) { | 367 | cg_link_list) { |
359 | struct cgroup *cgrp = link->cgrp; | 368 | struct cgroup *cgrp = link->cgrp; |
360 | list_del(&link->cg_link_list); | 369 | list_del(&link->cg_link_list); |
361 | list_del(&link->cgrp_link_list); | 370 | list_del(&link->cgrp_link_list); |
362 | if (atomic_dec_and_test(&cgrp->count) && | 371 | if (atomic_dec_and_test(&cgrp->count)) { |
363 | notify_on_release(cgrp)) { | ||
364 | if (taskexit) | ||
365 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | ||
366 | check_for_release(cgrp); | 372 | check_for_release(cgrp); |
373 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
367 | } | 374 | } |
368 | |||
369 | kfree(link); | 375 | kfree(link); |
370 | } | 376 | } |
371 | |||
372 | write_unlock(&css_set_lock); | 377 | write_unlock(&css_set_lock); |
373 | kfree_rcu(cg, rcu_head); | 378 | |
379 | kfree(cg); | ||
380 | } | ||
381 | |||
382 | static void free_css_set_rcu(struct rcu_head *obj) | ||
383 | { | ||
384 | struct css_set *cg = container_of(obj, struct css_set, rcu_head); | ||
385 | |||
386 | INIT_WORK(&cg->work, free_css_set_work); | ||
387 | schedule_work(&cg->work); | ||
374 | } | 388 | } |
375 | 389 | ||
390 | /* We don't maintain the lists running through each css_set to its | ||
391 | * task until after the first call to cgroup_iter_start(). This | ||
392 | * reduces the fork()/exit() overhead for people who have cgroups | ||
393 | * compiled into their kernel but not actually in use */ | ||
394 | static int use_task_css_set_links __read_mostly; | ||
395 | |||
376 | /* | 396 | /* |
377 | * refcounted get/put for css_set objects | 397 | * refcounted get/put for css_set objects |
378 | */ | 398 | */ |
@@ -381,14 +401,26 @@ static inline void get_css_set(struct css_set *cg) | |||
381 | atomic_inc(&cg->refcount); | 401 | atomic_inc(&cg->refcount); |
382 | } | 402 | } |
383 | 403 | ||
384 | static inline void put_css_set(struct css_set *cg) | 404 | static void put_css_set(struct css_set *cg) |
385 | { | 405 | { |
386 | __put_css_set(cg, 0); | 406 | /* |
387 | } | 407 | * Ensure that the refcount doesn't hit zero while any readers |
408 | * can see it. Similar to atomic_dec_and_lock(), but for an | ||
409 | * rwlock | ||
410 | */ | ||
411 | if (atomic_add_unless(&cg->refcount, -1, 1)) | ||
412 | return; | ||
413 | write_lock(&css_set_lock); | ||
414 | if (!atomic_dec_and_test(&cg->refcount)) { | ||
415 | write_unlock(&css_set_lock); | ||
416 | return; | ||
417 | } | ||
388 | 418 | ||
389 | static inline void put_css_set_taskexit(struct css_set *cg) | 419 | hlist_del(&cg->hlist); |
390 | { | 420 | css_set_count--; |
391 | __put_css_set(cg, 1); | 421 | |
422 | write_unlock(&css_set_lock); | ||
423 | call_rcu(&cg->rcu_head, free_css_set_rcu); | ||
392 | } | 424 | } |
393 | 425 | ||
394 | /* | 426 | /* |
@@ -720,9 +752,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
720 | * cgroup_attach_task(), which overwrites one tasks cgroup pointer with | 752 | * cgroup_attach_task(), which overwrites one tasks cgroup pointer with |
721 | * another. It does so using cgroup_mutex, however there are | 753 | * another. It does so using cgroup_mutex, however there are |
722 | * several performance critical places that need to reference | 754 | * several performance critical places that need to reference |
723 | * task->cgroup without the expense of grabbing a system global | 755 | * task->cgroups without the expense of grabbing a system global |
724 | * mutex. Therefore except as noted below, when dereferencing or, as | 756 | * mutex. Therefore except as noted below, when dereferencing or, as |
725 | * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use | 757 | * in cgroup_attach_task(), modifying a task's cgroups pointer we use |
726 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in | 758 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in |
727 | * the task_struct routinely used for such matters. | 759 | * the task_struct routinely used for such matters. |
728 | * | 760 | * |
@@ -912,33 +944,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
912 | } | 944 | } |
913 | 945 | ||
914 | /* | 946 | /* |
915 | * A queue for waiters to do rmdir() cgroup. A tasks will sleep when | ||
916 | * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some | ||
917 | * reference to css->refcnt. In general, this refcnt is expected to goes down | ||
918 | * to zero, soon. | ||
919 | * | ||
920 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; | ||
921 | */ | ||
922 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | ||
923 | |||
924 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) | ||
925 | { | ||
926 | if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) | ||
927 | wake_up_all(&cgroup_rmdir_waitq); | ||
928 | } | ||
929 | |||
930 | void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) | ||
931 | { | ||
932 | css_get(css); | ||
933 | } | ||
934 | |||
935 | void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | ||
936 | { | ||
937 | cgroup_wakeup_rmdir_waiter(css->cgroup); | ||
938 | css_put(css); | ||
939 | } | ||
940 | |||
941 | /* | ||
942 | * Call with cgroup_mutex held. Drops reference counts on modules, including | 947 | * Call with cgroup_mutex held. Drops reference counts on modules, including |
943 | * any duplicate ones that parse_cgroupfs_options took. If this function | 948 | * any duplicate ones that parse_cgroupfs_options took. If this function |
944 | * returns an error, no reference counts are touched. | 949 | * returns an error, no reference counts are touched. |
@@ -1173,10 +1178,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1173 | 1178 | ||
1174 | /* | 1179 | /* |
1175 | * If the 'all' option was specified select all the subsystems, | 1180 | * If the 'all' option was specified select all the subsystems, |
1176 | * otherwise 'all, 'none' and a subsystem name options were not | 1181 | * otherwise if 'none', 'name=' and a subsystem name options |
1177 | * specified, let's default to 'all' | 1182 | * were not specified, let's default to 'all' |
1178 | */ | 1183 | */ |
1179 | if (all_ss || (!all_ss && !one_ss && !opts->none)) { | 1184 | if (all_ss || (!one_ss && !opts->none && !opts->name)) { |
1180 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1185 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1181 | struct cgroup_subsys *ss = subsys[i]; | 1186 | struct cgroup_subsys *ss = subsys[i]; |
1182 | if (ss == NULL) | 1187 | if (ss == NULL) |
@@ -1514,6 +1519,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1514 | struct cgroup *root_cgrp = &root->top_cgroup; | 1519 | struct cgroup *root_cgrp = &root->top_cgroup; |
1515 | struct inode *inode; | 1520 | struct inode *inode; |
1516 | struct cgroupfs_root *existing_root; | 1521 | struct cgroupfs_root *existing_root; |
1522 | const struct cred *cred; | ||
1517 | int i; | 1523 | int i; |
1518 | 1524 | ||
1519 | BUG_ON(sb->s_root != NULL); | 1525 | BUG_ON(sb->s_root != NULL); |
@@ -1593,7 +1599,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1593 | BUG_ON(!list_empty(&root_cgrp->children)); | 1599 | BUG_ON(!list_empty(&root_cgrp->children)); |
1594 | BUG_ON(root->number_of_cgroups != 1); | 1600 | BUG_ON(root->number_of_cgroups != 1); |
1595 | 1601 | ||
1602 | cred = override_creds(&init_cred); | ||
1596 | cgroup_populate_dir(root_cgrp); | 1603 | cgroup_populate_dir(root_cgrp); |
1604 | revert_creds(cred); | ||
1597 | mutex_unlock(&cgroup_mutex); | 1605 | mutex_unlock(&cgroup_mutex); |
1598 | mutex_unlock(&inode->i_mutex); | 1606 | mutex_unlock(&inode->i_mutex); |
1599 | } else { | 1607 | } else { |
@@ -1697,7 +1705,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1697 | { | 1705 | { |
1698 | char *start; | 1706 | char *start; |
1699 | struct dentry *dentry = rcu_dereference_check(cgrp->dentry, | 1707 | struct dentry *dentry = rcu_dereference_check(cgrp->dentry, |
1700 | rcu_read_lock_held() || | ||
1701 | cgroup_lock_is_held()); | 1708 | cgroup_lock_is_held()); |
1702 | 1709 | ||
1703 | if (!dentry || cgrp == dummytop) { | 1710 | if (!dentry || cgrp == dummytop) { |
@@ -1723,7 +1730,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1723 | break; | 1730 | break; |
1724 | 1731 | ||
1725 | dentry = rcu_dereference_check(cgrp->dentry, | 1732 | dentry = rcu_dereference_check(cgrp->dentry, |
1726 | rcu_read_lock_held() || | ||
1727 | cgroup_lock_is_held()); | 1733 | cgroup_lock_is_held()); |
1728 | if (!cgrp->parent) | 1734 | if (!cgrp->parent) |
1729 | continue; | 1735 | continue; |
@@ -1820,6 +1826,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1820 | struct cgroup_subsys *ss, *failed_ss = NULL; | 1826 | struct cgroup_subsys *ss, *failed_ss = NULL; |
1821 | struct cgroup *oldcgrp; | 1827 | struct cgroup *oldcgrp; |
1822 | struct cgroupfs_root *root = cgrp->root; | 1828 | struct cgroupfs_root *root = cgrp->root; |
1829 | struct css_set *cg; | ||
1823 | 1830 | ||
1824 | /* Nothing to do if the task is already in that cgroup */ | 1831 | /* Nothing to do if the task is already in that cgroup */ |
1825 | oldcgrp = task_cgroup_from_root(tsk, root); | 1832 | oldcgrp = task_cgroup_from_root(tsk, root); |
@@ -1849,6 +1856,11 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1849 | } | 1856 | } |
1850 | } | 1857 | } |
1851 | 1858 | ||
1859 | task_lock(tsk); | ||
1860 | cg = tsk->cgroups; | ||
1861 | get_css_set(cg); | ||
1862 | task_unlock(tsk); | ||
1863 | |||
1852 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); | 1864 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); |
1853 | if (retval) | 1865 | if (retval) |
1854 | goto out; | 1866 | goto out; |
@@ -1861,8 +1873,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1861 | if (ss->attach) | 1873 | if (ss->attach) |
1862 | ss->attach(ss, cgrp, oldcgrp, tsk); | 1874 | ss->attach(ss, cgrp, oldcgrp, tsk); |
1863 | } | 1875 | } |
1864 | 1876 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | |
1865 | synchronize_rcu(); | 1877 | /* put_css_set will not destroy cg until after an RCU grace period */ |
1878 | put_css_set(cg); | ||
1866 | 1879 | ||
1867 | /* | 1880 | /* |
1868 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | 1881 | * wake up rmdir() waiter. the rmdir should fail since the cgroup |
@@ -2095,11 +2108,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2095 | continue; | 2108 | continue; |
2096 | /* get old css_set pointer */ | 2109 | /* get old css_set pointer */ |
2097 | task_lock(tsk); | 2110 | task_lock(tsk); |
2098 | if (tsk->flags & PF_EXITING) { | ||
2099 | /* ignore this task if it's going away */ | ||
2100 | task_unlock(tsk); | ||
2101 | continue; | ||
2102 | } | ||
2103 | oldcg = tsk->cgroups; | 2111 | oldcg = tsk->cgroups; |
2104 | get_css_set(oldcg); | 2112 | get_css_set(oldcg); |
2105 | task_unlock(tsk); | 2113 | task_unlock(tsk); |
@@ -2189,6 +2197,24 @@ out_free_group_list: | |||
2189 | return retval; | 2197 | return retval; |
2190 | } | 2198 | } |
2191 | 2199 | ||
2200 | static int cgroup_allow_attach(struct cgroup *cgrp, struct task_struct *tsk) | ||
2201 | { | ||
2202 | struct cgroup_subsys *ss; | ||
2203 | int ret; | ||
2204 | |||
2205 | for_each_subsys(cgrp->root, ss) { | ||
2206 | if (ss->allow_attach) { | ||
2207 | ret = ss->allow_attach(cgrp, tsk); | ||
2208 | if (ret) | ||
2209 | return ret; | ||
2210 | } else { | ||
2211 | return -EACCES; | ||
2212 | } | ||
2213 | } | ||
2214 | |||
2215 | return 0; | ||
2216 | } | ||
2217 | |||
2192 | /* | 2218 | /* |
2193 | * Find the task_struct of the task to attach by vpid and pass it along to the | 2219 | * Find the task_struct of the task to attach by vpid and pass it along to the |
2194 | * function to attach either it or all tasks in its threadgroup. Will take | 2220 | * function to attach either it or all tasks in its threadgroup. Will take |
@@ -2234,9 +2260,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | |||
2234 | if (cred->euid && | 2260 | if (cred->euid && |
2235 | cred->euid != tcred->uid && | 2261 | cred->euid != tcred->uid && |
2236 | cred->euid != tcred->suid) { | 2262 | cred->euid != tcred->suid) { |
2237 | rcu_read_unlock(); | 2263 | /* |
2238 | cgroup_unlock(); | 2264 | * if the default permission check fails, give each |
2239 | return -EACCES; | 2265 | * cgroup a chance to extend the permission check |
2266 | */ | ||
2267 | ret = cgroup_allow_attach(cgrp, tsk); | ||
2268 | if (ret) { | ||
2269 | rcu_read_unlock(); | ||
2270 | cgroup_unlock(); | ||
2271 | return ret; | ||
2272 | } | ||
2240 | } | 2273 | } |
2241 | get_task_struct(tsk); | 2274 | get_task_struct(tsk); |
2242 | rcu_read_unlock(); | 2275 | rcu_read_unlock(); |
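
The new ->allow_attach() hook gives each subsystem a chance to extend the default euid check in attach_task_by_pid(); any subsystem that does not implement it vetoes the move with -EACCES, as the loop above shows. A hypothetical subsystem implementation (name and policy are illustrative, not from this patch):

    static int example_allow_attach(struct cgroup *cgrp, struct task_struct *tsk)
    {
            /* e.g. let suitably privileged helpers migrate any task */
            if (capable(CAP_SYS_NICE))
                    return 0;
            return -EACCES;
    }

    /* wired up via the subsystem descriptor: .allow_attach = example_allow_attach */
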
@@ -3542,7 +3575,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3542 | } | 3575 | } |
3543 | 3576 | ||
3544 | /* the process need read permission on control file */ | 3577 | /* the process need read permission on control file */ |
3545 | ret = file_permission(cfile, MAY_READ); | 3578 | /* AV: shouldn't we check that it's been opened for read instead? */ |
3579 | ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); | ||
3546 | if (ret < 0) | 3580 | if (ret < 0) |
3547 | goto fail; | 3581 | goto fail; |
3548 | 3582 | ||
@@ -3810,6 +3844,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3810 | if (err < 0) | 3844 | if (err < 0) |
3811 | goto err_remove; | 3845 | goto err_remove; |
3812 | 3846 | ||
3847 | set_bit(CGRP_RELEASABLE, &parent->flags); | ||
3848 | |||
3813 | /* The cgroup directory was pre-locked for us */ | 3849 | /* The cgroup directory was pre-locked for us */ |
3814 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); | 3850 | BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); |
3815 | 3851 | ||
@@ -3941,6 +3977,21 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp) | |||
3941 | return !failed; | 3977 | return !failed; |
3942 | } | 3978 | } |
3943 | 3979 | ||
3980 | /* checks if all of the css_sets attached to a cgroup have a refcount of 0. | ||
3981 | * Must be called with css_set_lock held */ | ||
3982 | static int cgroup_css_sets_empty(struct cgroup *cgrp) | ||
3983 | { | ||
3984 | struct cg_cgroup_link *link; | ||
3985 | |||
3986 | list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { | ||
3987 | struct css_set *cg = link->cg; | ||
3988 | if (atomic_read(&cg->refcount) > 0) | ||
3989 | return 0; | ||
3990 | } | ||
3991 | |||
3992 | return 1; | ||
3993 | } | ||
3994 | |||
3944 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | 3995 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) |
3945 | { | 3996 | { |
3946 | struct cgroup *cgrp = dentry->d_fsdata; | 3997 | struct cgroup *cgrp = dentry->d_fsdata; |
@@ -3953,7 +4004,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
3953 | /* the vfs holds both inode->i_mutex already */ | 4004 | /* the vfs holds both inode->i_mutex already */ |
3954 | again: | 4005 | again: |
3955 | mutex_lock(&cgroup_mutex); | 4006 | mutex_lock(&cgroup_mutex); |
3956 | if (atomic_read(&cgrp->count) != 0) { | 4007 | if (!cgroup_css_sets_empty(cgrp)) { |
3957 | mutex_unlock(&cgroup_mutex); | 4008 | mutex_unlock(&cgroup_mutex); |
3958 | return -EBUSY; | 4009 | return -EBUSY; |
3959 | } | 4010 | } |
@@ -3986,7 +4037,7 @@ again: | |||
3986 | 4037 | ||
3987 | mutex_lock(&cgroup_mutex); | 4038 | mutex_lock(&cgroup_mutex); |
3988 | parent = cgrp->parent; | 4039 | parent = cgrp->parent; |
3989 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { | 4040 | if (!cgroup_css_sets_empty(cgrp) || !list_empty(&cgrp->children)) { |
3990 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 4041 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); |
3991 | mutex_unlock(&cgroup_mutex); | 4042 | mutex_unlock(&cgroup_mutex); |
3992 | return -EBUSY; | 4043 | return -EBUSY; |
@@ -4026,7 +4077,6 @@ again: | |||
4026 | cgroup_d_remove_dir(d); | 4077 | cgroup_d_remove_dir(d); |
4027 | dput(d); | 4078 | dput(d); |
4028 | 4079 | ||
4029 | set_bit(CGRP_RELEASABLE, &parent->flags); | ||
4030 | check_for_release(parent); | 4080 | check_for_release(parent); |
4031 | 4081 | ||
4032 | /* | 4082 | /* |
@@ -4626,7 +4676,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
4626 | task_unlock(tsk); | 4676 | task_unlock(tsk); |
4627 | 4677 | ||
4628 | if (cg) | 4678 | if (cg) |
4629 | put_css_set_taskexit(cg); | 4679 | put_css_set(cg); |
4630 | } | 4680 | } |
4631 | 4681 | ||
4632 | /** | 4682 | /** |
@@ -4680,6 +4730,14 @@ static void check_for_release(struct cgroup *cgrp) | |||
4680 | } | 4730 | } |
4681 | 4731 | ||
4682 | /* Caller must verify that the css is not for root cgroup */ | 4732 | /* Caller must verify that the css is not for root cgroup */ |
4733 | void __css_get(struct cgroup_subsys_state *css, int count) | ||
4734 | { | ||
4735 | atomic_add(count, &css->refcnt); | ||
4736 | set_bit(CGRP_RELEASABLE, &css->cgroup->flags); | ||
4737 | } | ||
4738 | EXPORT_SYMBOL_GPL(__css_get); | ||
4739 | |||
4740 | /* Caller must verify that the css is not for root cgroup */ | ||
4683 | void __css_put(struct cgroup_subsys_state *css, int count) | 4741 | void __css_put(struct cgroup_subsys_state *css, int count) |
4684 | { | 4742 | { |
4685 | struct cgroup *cgrp = css->cgroup; | 4743 | struct cgroup *cgrp = css->cgroup; |
@@ -4687,10 +4745,7 @@ void __css_put(struct cgroup_subsys_state *css, int count) | |||
4687 | rcu_read_lock(); | 4745 | rcu_read_lock(); |
4688 | val = atomic_sub_return(count, &css->refcnt); | 4746 | val = atomic_sub_return(count, &css->refcnt); |
4689 | if (val == 1) { | 4747 | if (val == 1) { |
4690 | if (notify_on_release(cgrp)) { | 4748 | check_for_release(cgrp); |
4691 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | ||
4692 | check_for_release(cgrp); | ||
4693 | } | ||
4694 | cgroup_wakeup_rmdir_waiter(cgrp); | 4749 | cgroup_wakeup_rmdir_waiter(cgrp); |
4695 | } | 4750 | } |
4696 | rcu_read_unlock(); | 4751 | rcu_read_unlock(); |
@@ -4813,8 +4868,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
4813 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | 4868 | * on this or this is under rcu_read_lock(). Once css->id is allocated, |
4814 | * it's unchanged until freed. | 4869 | * it's unchanged until freed. |
4815 | */ | 4870 | */ |
4816 | cssid = rcu_dereference_check(css->id, | 4871 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); |
4817 | rcu_read_lock_held() || atomic_read(&css->refcnt)); | ||
4818 | 4872 | ||
4819 | if (cssid) | 4873 | if (cssid) |
4820 | return cssid->id; | 4874 | return cssid->id; |
@@ -4826,8 +4880,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) | |||
4826 | { | 4880 | { |
4827 | struct css_id *cssid; | 4881 | struct css_id *cssid; |
4828 | 4882 | ||
4829 | cssid = rcu_dereference_check(css->id, | 4883 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); |
4830 | rcu_read_lock_held() || atomic_read(&css->refcnt)); | ||
4831 | 4884 | ||
4832 | if (cssid) | 4885 | if (cssid) |
4833 | return cssid->depth; | 4886 | return cssid->depth; |
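
For reference, cgroup_exclude_rmdir()/cgroup_release_and_wakeup_rmdir() (moved earlier in the file by this patch) are meant to bracket long-running subsystem work so that a concurrent rmdir() waits rather than racing; a hypothetical caller might look like this:

    static void example_async_css_user(struct cgroup_subsys_state *css)
    {
            cgroup_exclude_rmdir(css);              /* css_get(): rmdir() will wait or return -EBUSY */
            /* ... sleeping work that relies on the cgroup staying around ... */
            cgroup_release_and_wakeup_rmdir(css);   /* css_put() and wake any rmdir() waiter */
    }
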
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e691818d7e4..a3f638ac3de 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -153,6 +153,13 @@ static void freezer_destroy(struct cgroup_subsys *ss, | |||
153 | kfree(cgroup_freezer(cgroup)); | 153 | kfree(cgroup_freezer(cgroup)); |
154 | } | 154 | } |
155 | 155 | ||
156 | /* task is frozen or will freeze immediately when next it gets woken */ | ||
157 | static bool is_task_frozen_enough(struct task_struct *task) | ||
158 | { | ||
159 | return frozen(task) || | ||
160 | (task_is_stopped_or_traced(task) && freezing(task)); | ||
161 | } | ||
162 | |||
156 | /* | 163 | /* |
157 | * The call to cgroup_lock() in the freezer.state write method prevents | 164 | * The call to cgroup_lock() in the freezer.state write method prevents |
158 | * a write to that file racing against an attach, and hence the | 165 | * a write to that file racing against an attach, and hence the |
@@ -231,7 +238,7 @@ static void update_if_frozen(struct cgroup *cgroup, | |||
231 | cgroup_iter_start(cgroup, &it); | 238 | cgroup_iter_start(cgroup, &it); |
232 | while ((task = cgroup_iter_next(cgroup, &it))) { | 239 | while ((task = cgroup_iter_next(cgroup, &it))) { |
233 | ntotal++; | 240 | ntotal++; |
234 | if (frozen(task)) | 241 | if (is_task_frozen_enough(task)) |
235 | nfrozen++; | 242 | nfrozen++; |
236 | } | 243 | } |
237 | 244 | ||
@@ -284,7 +291,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |||
284 | while ((task = cgroup_iter_next(cgroup, &it))) { | 291 | while ((task = cgroup_iter_next(cgroup, &it))) { |
285 | if (!freeze_task(task, true)) | 292 | if (!freeze_task(task, true)) |
286 | continue; | 293 | continue; |
287 | if (frozen(task)) | 294 | if (is_task_frozen_enough(task)) |
288 | continue; | 295 | continue; |
289 | if (!freezing(task) && !freezer_should_skip(task)) | 296 | if (!freezing(task) && !freezer_should_skip(task)) |
290 | num_cant_freeze_now++; | 297 | num_cant_freeze_now++; |
diff --git a/kernel/compat.c b/kernel/compat.c
index fc9eb093acd..e2435ee9993 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -158,6 +158,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user | |||
158 | __put_user(ts->tv_sec, &cts->tv_sec) || | 158 | __put_user(ts->tv_sec, &cts->tv_sec) || |
159 | __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; | 159 | __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; |
160 | } | 160 | } |
161 | EXPORT_SYMBOL_GPL(put_compat_timespec); | ||
161 | 162 | ||
162 | static long compat_nanosleep_restart(struct restart_block *restart) | 163 | static long compat_nanosleep_restart(struct restart_block *restart) |
163 | { | 164 | { |
@@ -890,6 +891,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat) | |||
890 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); | 891 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); |
891 | } | 892 | } |
892 | } | 893 | } |
894 | EXPORT_SYMBOL_GPL(sigset_from_compat); | ||
893 | 895 | ||
894 | asmlinkage long | 896 | asmlinkage long |
895 | compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | 897 | compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, |
@@ -991,11 +993,8 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat | |||
991 | sigset_from_compat(&newset, &newset32); | 993 | sigset_from_compat(&newset, &newset32); |
992 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 994 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); |
993 | 995 | ||
994 | spin_lock_irq(¤t->sighand->siglock); | ||
995 | current->saved_sigmask = current->blocked; | 996 | current->saved_sigmask = current->blocked; |
996 | current->blocked = newset; | 997 | set_current_blocked(&newset); |
997 | recalc_sigpending(); | ||
998 | spin_unlock_irq(¤t->sighand->siglock); | ||
999 | 998 | ||
1000 | current->state = TASK_INTERRUPTIBLE; | 999 | current->state = TASK_INTERRUPTIBLE; |
1001 | schedule(); | 1000 | schedule(); |
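
The sigsuspend change above replaces the open-coded siglock/recalc_sigpending() sequence with the set_current_blocked() helper; the same idiom applies anywhere the blocked mask is replaced wholesale. A sketch with an arbitrary mask (the chosen signal is illustrative):

    static void example_block_usr1(void)
    {
            sigset_t newset;

            siginitset(&newset, sigmask(SIGUSR1));                  /* example mask */
            sigdelsetmask(&newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
            current->saved_sigmask = current->blocked;
            set_current_blocked(&newset);   /* takes siglock and recalculates pending signals */
    }
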
diff --git a/kernel/configs.c b/kernel/configs.c
index b4066b44a99..42e8fa075ee 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -92,8 +92,8 @@ static void __exit ikconfig_cleanup(void) | |||
92 | module_init(ikconfig_init); | 92 | module_init(ikconfig_init); |
93 | module_exit(ikconfig_cleanup); | 93 | module_exit(ikconfig_cleanup); |
94 | 94 | ||
95 | #endif /* CONFIG_IKCONFIG_PROC */ | ||
96 | |||
95 | MODULE_LICENSE("GPL"); | 97 | MODULE_LICENSE("GPL"); |
96 | MODULE_AUTHOR("Randy Dunlap"); | 98 | MODULE_AUTHOR("Randy Dunlap"); |
97 | MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); | 99 | MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); |
98 | |||
99 | #endif /* CONFIG_IKCONFIG_PROC */ | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 12b7458f23b..eae3d9b3957 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | #include <linux/gfp.h> | 17 | #include <linux/gfp.h> |
18 | #include <linux/suspend.h> | ||
18 | 19 | ||
19 | #ifdef CONFIG_SMP | 20 | #ifdef CONFIG_SMP |
20 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ | 21 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ |
@@ -476,6 +477,79 @@ static int alloc_frozen_cpus(void) | |||
476 | return 0; | 477 | return 0; |
477 | } | 478 | } |
478 | core_initcall(alloc_frozen_cpus); | 479 | core_initcall(alloc_frozen_cpus); |
480 | |||
481 | /* | ||
482 | * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU | ||
483 | * hotplug when tasks are about to be frozen. Also, don't allow the freezer | ||
484 | * to continue until any currently running CPU hotplug operation gets | ||
485 | * completed. | ||
486 | * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the | ||
487 | * 'cpu_add_remove_lock'. And this same lock is also taken by the regular | ||
488 | * CPU hotplug path and released only after it is complete. Thus, we | ||
489 | * (and hence the freezer) will block here until any currently running CPU | ||
490 | * hotplug operation gets completed. | ||
491 | */ | ||
492 | void cpu_hotplug_disable_before_freeze(void) | ||
493 | { | ||
494 | cpu_maps_update_begin(); | ||
495 | cpu_hotplug_disabled = 1; | ||
496 | cpu_maps_update_done(); | ||
497 | } | ||
498 | |||
499 | |||
500 | /* | ||
501 | * When tasks have been thawed, re-enable regular CPU hotplug (which had been | ||
502 | * disabled while beginning to freeze tasks). | ||
503 | */ | ||
504 | void cpu_hotplug_enable_after_thaw(void) | ||
505 | { | ||
506 | cpu_maps_update_begin(); | ||
507 | cpu_hotplug_disabled = 0; | ||
508 | cpu_maps_update_done(); | ||
509 | } | ||
510 | |||
511 | /* | ||
512 | * When callbacks for CPU hotplug notifications are being executed, we must | ||
513 | * ensure that the state of the system with respect to the tasks being frozen | ||
514 | * or not, as reported by the notification, remains unchanged *throughout the | ||
515 | * duration* of the execution of the callbacks. | ||
516 | * Hence we need to prevent the freezer from racing with regular CPU hotplug. | ||
517 | * | ||
518 | * This synchronization is implemented by mutually excluding regular CPU | ||
519 | * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/ | ||
520 | * Hibernate notifications. | ||
521 | */ | ||
522 | static int | ||
523 | cpu_hotplug_pm_callback(struct notifier_block *nb, | ||
524 | unsigned long action, void *ptr) | ||
525 | { | ||
526 | switch (action) { | ||
527 | |||
528 | case PM_SUSPEND_PREPARE: | ||
529 | case PM_HIBERNATION_PREPARE: | ||
530 | cpu_hotplug_disable_before_freeze(); | ||
531 | break; | ||
532 | |||
533 | case PM_POST_SUSPEND: | ||
534 | case PM_POST_HIBERNATION: | ||
535 | cpu_hotplug_enable_after_thaw(); | ||
536 | break; | ||
537 | |||
538 | default: | ||
539 | return NOTIFY_DONE; | ||
540 | } | ||
541 | |||
542 | return NOTIFY_OK; | ||
543 | } | ||
544 | |||
545 | |||
546 | int cpu_hotplug_pm_sync_init(void) | ||
547 | { | ||
548 | pm_notifier(cpu_hotplug_pm_callback, 0); | ||
549 | return 0; | ||
550 | } | ||
551 | core_initcall(cpu_hotplug_pm_sync_init); | ||
552 | |||
479 | #endif /* CONFIG_PM_SLEEP_SMP */ | 553 | #endif /* CONFIG_PM_SLEEP_SMP */ |
480 | 554 | ||
481 | /** | 555 | /** |
@@ -594,3 +668,23 @@ void init_cpu_online(const struct cpumask *src) | |||
594 | { | 668 | { |
595 | cpumask_copy(to_cpumask(cpu_online_bits), src); | 669 | cpumask_copy(to_cpumask(cpu_online_bits), src); |
596 | } | 670 | } |
671 | |||
672 | static ATOMIC_NOTIFIER_HEAD(idle_notifier); | ||
673 | |||
674 | void idle_notifier_register(struct notifier_block *n) | ||
675 | { | ||
676 | atomic_notifier_chain_register(&idle_notifier, n); | ||
677 | } | ||
678 | EXPORT_SYMBOL_GPL(idle_notifier_register); | ||
679 | |||
680 | void idle_notifier_unregister(struct notifier_block *n) | ||
681 | { | ||
682 | atomic_notifier_chain_unregister(&idle_notifier, n); | ||
683 | } | ||
684 | EXPORT_SYMBOL_GPL(idle_notifier_unregister); | ||
685 | |||
686 | void idle_notifier_call_chain(unsigned long val) | ||
687 | { | ||
688 | atomic_notifier_call_chain(&idle_notifier, val, NULL); | ||
689 | } | ||
690 | EXPORT_SYMBOL_GPL(idle_notifier_call_chain); | ||
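
The new idle notifier chain lets drivers hook CPU idle entry/exit. A minimal sketch of a client; the val constants are whatever the architecture's idle loop passes to idle_notifier_call_chain() (e.g. IDLE_START/IDLE_END on ports that define them), and all names here are illustrative:

    static int example_idle_notify(struct notifier_block *nb,
                                   unsigned long val, void *data)
    {
            /* react to idle entry/exit here */
            return NOTIFY_OK;
    }

    static struct notifier_block example_idle_nb = {
            .notifier_call = example_idle_notify,
    };

    /* somewhere in driver init: */
    idle_notifier_register(&example_idle_nb);
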
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9c9b7545c81..10131fdaff7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -55,7 +55,7 @@ | |||
55 | #include <linux/sort.h> | 55 | #include <linux/sort.h> |
56 | 56 | ||
57 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
58 | #include <asm/atomic.h> | 58 | #include <linux/atomic.h> |
59 | #include <linux/mutex.h> | 59 | #include <linux/mutex.h> |
60 | #include <linux/workqueue.h> | 60 | #include <linux/workqueue.h> |
61 | #include <linux/cgroup.h> | 61 | #include <linux/cgroup.h> |
@@ -2460,11 +2460,19 @@ static int cpuset_spread_node(int *rotor) | |||
2460 | 2460 | ||
2461 | int cpuset_mem_spread_node(void) | 2461 | int cpuset_mem_spread_node(void) |
2462 | { | 2462 | { |
2463 | if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) | ||
2464 | current->cpuset_mem_spread_rotor = | ||
2465 | node_random(¤t->mems_allowed); | ||
2466 | |||
2463 | return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); | 2467 | return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); |
2464 | } | 2468 | } |
2465 | 2469 | ||
2466 | int cpuset_slab_spread_node(void) | 2470 | int cpuset_slab_spread_node(void) |
2467 | { | 2471 | { |
2472 | if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) | ||
2473 | current->cpuset_slab_spread_rotor = | ||
2474 | node_random(¤t->mems_allowed); | ||
2475 | |||
2468 | return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); | 2476 | return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); |
2469 | } | 2477 | } |
2470 | 2478 | ||
diff --git a/kernel/cred.c b/kernel/cred.c
index 174fa84eca3..8ef31f53c44 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -508,10 +508,8 @@ int commit_creds(struct cred *new) | |||
508 | key_fsgid_changed(task); | 508 | key_fsgid_changed(task); |
509 | 509 | ||
510 | /* do it | 510 | /* do it |
511 | * - What if a process setreuid()'s and this brings the | 511 | * RLIMIT_NPROC limits on user->processes have already been checked |
512 | * new uid over his NPROC rlimit? We can check this now | 512 | * in set_user(). |
513 | * cheaply with the new uid cache, so if it matters | ||
514 | * we should be checking for it. -DaveM | ||
515 | */ | 513 | */ |
516 | alter_cred_subscribers(new, 2); | 514 | alter_cred_subscribers(new, 2); |
517 | if (new->user != old->user) | 515 | if (new->user != old->user) |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index bad6786dee8..0d7c08784ef 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -51,7 +51,7 @@ | |||
51 | 51 | ||
52 | #include <asm/cacheflush.h> | 52 | #include <asm/cacheflush.h> |
53 | #include <asm/byteorder.h> | 53 | #include <asm/byteorder.h> |
54 | #include <asm/atomic.h> | 54 | #include <linux/atomic.h> |
55 | #include <asm/system.h> | 55 | #include <asm/system.h> |
56 | 56 | ||
57 | #include "debug_core.h" | 57 | #include "debug_core.h" |
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index a11db956dd6..34872482315 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -42,6 +42,8 @@ | |||
42 | /* Our I/O buffers. */ | 42 | /* Our I/O buffers. */ |
43 | static char remcom_in_buffer[BUFMAX]; | 43 | static char remcom_in_buffer[BUFMAX]; |
44 | static char remcom_out_buffer[BUFMAX]; | 44 | static char remcom_out_buffer[BUFMAX]; |
45 | static int gdbstub_use_prev_in_buf; | ||
46 | static int gdbstub_prev_in_buf_pos; | ||
45 | 47 | ||
46 | /* Storage for the registers, in GDB format. */ | 48 | /* Storage for the registers, in GDB format. */ |
47 | static unsigned long gdb_regs[(NUMREGBYTES + | 49 | static unsigned long gdb_regs[(NUMREGBYTES + |
@@ -58,6 +60,13 @@ static int gdbstub_read_wait(void) | |||
58 | int ret = -1; | 60 | int ret = -1; |
59 | int i; | 61 | int i; |
60 | 62 | ||
63 | if (unlikely(gdbstub_use_prev_in_buf)) { | ||
64 | if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf) | ||
65 | return remcom_in_buffer[gdbstub_prev_in_buf_pos++]; | ||
66 | else | ||
67 | gdbstub_use_prev_in_buf = 0; | ||
68 | } | ||
69 | |||
61 | /* poll any additional I/O interfaces that are defined */ | 70 | /* poll any additional I/O interfaces that are defined */ |
62 | while (ret < 0) | 71 | while (ret < 0) |
63 | for (i = 0; kdb_poll_funcs[i] != NULL; i++) { | 72 | for (i = 0; kdb_poll_funcs[i] != NULL; i++) { |
@@ -109,7 +118,6 @@ static void get_packet(char *buffer) | |||
109 | buffer[count] = ch; | 118 | buffer[count] = ch; |
110 | count = count + 1; | 119 | count = count + 1; |
111 | } | 120 | } |
112 | buffer[count] = 0; | ||
113 | 121 | ||
114 | if (ch == '#') { | 122 | if (ch == '#') { |
115 | xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; | 123 | xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; |
@@ -124,6 +132,7 @@ static void get_packet(char *buffer) | |||
124 | if (dbg_io_ops->flush) | 132 | if (dbg_io_ops->flush) |
125 | dbg_io_ops->flush(); | 133 | dbg_io_ops->flush(); |
126 | } | 134 | } |
135 | buffer[count] = 0; | ||
127 | } while (checksum != xmitcsum); | 136 | } while (checksum != xmitcsum); |
128 | } | 137 | } |
129 | 138 | ||
@@ -1082,12 +1091,11 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) | |||
1082 | case 'c': | 1091 | case 'c': |
1083 | strcpy(remcom_in_buffer, cmd); | 1092 | strcpy(remcom_in_buffer, cmd); |
1084 | return 0; | 1093 | return 0; |
1085 | case '?': | 1094 | case '$': |
1086 | gdb_cmd_status(ks); | 1095 | strcpy(remcom_in_buffer, cmd); |
1087 | break; | 1096 | gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); |
1088 | case '\0': | 1097 | gdbstub_prev_in_buf_pos = 0; |
1089 | strcpy(remcom_out_buffer, ""); | 1098 | return 0; |
1090 | break; | ||
1091 | } | 1099 | } |
1092 | dbg_io_ops->write_char('+'); | 1100 | dbg_io_ops->write_char('+'); |
1093 | put_packet(remcom_out_buffer); | 1101 | put_packet(remcom_out_buffer); |
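
For context on the packet handling touched above: a GDB remote-serial packet is framed as $<payload>#<two hex digits>, where the trailing digits are the payload bytes summed modulo 256 — this is what get_packet() verifies and put_packet() emits. A standalone sketch of that checksum (function name is illustrative):

    static unsigned char gdb_packet_checksum(const char *payload)
    {
            unsigned char sum = 0;

            while (*payload)
                    sum += *payload++;      /* wraps naturally, i.e. modulo 256 */
            return sum;
    }
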
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 2f62fe85f16..7179eac7b41 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -112,9 +112,8 @@ kdb_bt(int argc, const char **argv) | |||
112 | unsigned long addr; | 112 | unsigned long addr; |
113 | long offset; | 113 | long offset; |
114 | 114 | ||
115 | kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ | 115 | /* Prompt after each proc in bta */ |
116 | kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each | 116 | kdbgetintenv("BTAPROMPT", &btaprompt); |
117 | * proc in bta */ | ||
118 | 117 | ||
119 | if (strcmp(argv[0], "bta") == 0) { | 118 | if (strcmp(argv[0], "bta") == 0) { |
120 | struct task_struct *g, *p; | 119 | struct task_struct *g, *p; |
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
index 56c88e4db30..9834ad303ab 100644
--- a/kernel/debug/kdb/kdb_cmds
+++ b/kernel/debug/kdb/kdb_cmds
@@ -18,16 +18,12 @@ defcmd dumpcommon "" "Common kdb debugging" | |||
18 | endefcmd | 18 | endefcmd |
19 | 19 | ||
20 | defcmd dumpall "" "First line debugging" | 20 | defcmd dumpall "" "First line debugging" |
21 | set BTSYMARG 1 | ||
22 | set BTARGS 9 | ||
23 | pid R | 21 | pid R |
24 | -dumpcommon | 22 | -dumpcommon |
25 | -bta | 23 | -bta |
26 | endefcmd | 24 | endefcmd |
27 | 25 | ||
28 | defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" | 26 | defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" |
29 | set BTSYMARG 1 | ||
30 | set BTARGS 9 | ||
31 | pid R | 27 | pid R |
32 | -dumpcommon | 28 | -dumpcommon |
33 | -btc | 29 | -btc |
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index dd0b1b7dd02..d9ca9aa481e 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -30,6 +30,8 @@ EXPORT_SYMBOL_GPL(kdb_poll_funcs); | |||
30 | int kdb_poll_idx = 1; | 30 | int kdb_poll_idx = 1; |
31 | EXPORT_SYMBOL_GPL(kdb_poll_idx); | 31 | EXPORT_SYMBOL_GPL(kdb_poll_idx); |
32 | 32 | ||
33 | static struct kgdb_state *kdb_ks; | ||
34 | |||
33 | int kdb_stub(struct kgdb_state *ks) | 35 | int kdb_stub(struct kgdb_state *ks) |
34 | { | 36 | { |
35 | int error = 0; | 37 | int error = 0; |
@@ -39,6 +41,7 @@ int kdb_stub(struct kgdb_state *ks) | |||
39 | kdb_dbtrap_t db_result = KDB_DB_NOBPT; | 41 | kdb_dbtrap_t db_result = KDB_DB_NOBPT; |
40 | int i; | 42 | int i; |
41 | 43 | ||
44 | kdb_ks = ks; | ||
42 | if (KDB_STATE(REENTRY)) { | 45 | if (KDB_STATE(REENTRY)) { |
43 | reason = KDB_REASON_SWITCH; | 46 | reason = KDB_REASON_SWITCH; |
44 | KDB_STATE_CLEAR(REENTRY); | 47 | KDB_STATE_CLEAR(REENTRY); |
@@ -123,20 +126,8 @@ int kdb_stub(struct kgdb_state *ks) | |||
123 | KDB_STATE_CLEAR(PAGER); | 126 | KDB_STATE_CLEAR(PAGER); |
124 | kdbnearsym_cleanup(); | 127 | kdbnearsym_cleanup(); |
125 | if (error == KDB_CMD_KGDB) { | 128 | if (error == KDB_CMD_KGDB) { |
126 | if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { | 129 | if (KDB_STATE(DOING_KGDB)) |
127 | /* | ||
128 | * This inteface glue which allows kdb to transition in into | ||
129 | * the gdb stub. In order to do this the '?' or '' gdb serial | ||
130 | * packet response is processed here. And then control is | ||
131 | * passed to the gdbstub. | ||
132 | */ | ||
133 | if (KDB_STATE(DOING_KGDB)) | ||
134 | gdbstub_state(ks, "?"); | ||
135 | else | ||
136 | gdbstub_state(ks, ""); | ||
137 | KDB_STATE_CLEAR(DOING_KGDB); | 130 | KDB_STATE_CLEAR(DOING_KGDB); |
138 | KDB_STATE_CLEAR(DOING_KGDB2); | ||
139 | } | ||
140 | return DBG_PASS_EVENT; | 131 | return DBG_PASS_EVENT; |
141 | } | 132 | } |
142 | kdb_bp_install(ks->linux_regs); | 133 | kdb_bp_install(ks->linux_regs); |
@@ -166,3 +157,7 @@ int kdb_stub(struct kgdb_state *ks) | |||
166 | return kgdb_info[ks->cpu].ret_state; | 157 | return kgdb_info[ks->cpu].ret_state; |
167 | } | 158 | } |
168 | 159 | ||
160 | void kdb_gdb_state_pass(char *buf) | ||
161 | { | ||
162 | gdbstub_state(kdb_ks, buf); | ||
163 | } | ||
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 96fdaac46a8..4802eb5840e 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -31,15 +31,21 @@ char kdb_prompt_str[CMD_BUFLEN]; | |||
31 | 31 | ||
32 | int kdb_trap_printk; | 32 | int kdb_trap_printk; |
33 | 33 | ||
34 | static void kgdb_transition_check(char *buffer) | 34 | static int kgdb_transition_check(char *buffer) |
35 | { | 35 | { |
36 | int slen = strlen(buffer); | 36 | if (buffer[0] != '+' && buffer[0] != '$') { |
37 | if (strncmp(buffer, "$?#3f", slen) != 0 && | ||
38 | strncmp(buffer, "$qSupported#37", slen) != 0 && | ||
39 | strncmp(buffer, "+$qSupported#37", slen) != 0) { | ||
40 | KDB_STATE_SET(KGDB_TRANS); | 37 | KDB_STATE_SET(KGDB_TRANS); |
41 | kdb_printf("%s", buffer); | 38 | kdb_printf("%s", buffer); |
39 | } else { | ||
40 | int slen = strlen(buffer); | ||
41 | if (slen > 3 && buffer[slen - 3] == '#') { | ||
42 | kdb_gdb_state_pass(buffer); | ||
43 | strcpy(buffer, "kgdb"); | ||
44 | KDB_STATE_SET(DOING_KGDB); | ||
45 | return 1; | ||
46 | } | ||
42 | } | 47 | } |
48 | return 0; | ||
43 | } | 49 | } |
44 | 50 | ||
45 | static int kdb_read_get_key(char *buffer, size_t bufsize) | 51 | static int kdb_read_get_key(char *buffer, size_t bufsize) |
@@ -251,6 +257,10 @@ poll_again: | |||
251 | case 13: /* enter */ | 257 | case 13: /* enter */ |
252 | *lastchar++ = '\n'; | 258 | *lastchar++ = '\n'; |
253 | *lastchar++ = '\0'; | 259 | *lastchar++ = '\0'; |
260 | if (!KDB_STATE(KGDB_TRANS)) { | ||
261 | KDB_STATE_SET(KGDB_TRANS); | ||
262 | kdb_printf("%s", buffer); | ||
263 | } | ||
254 | kdb_printf("\n"); | 264 | kdb_printf("\n"); |
255 | return buffer; | 265 | return buffer; |
256 | case 4: /* Del */ | 266 | case 4: /* Del */ |
@@ -382,22 +392,26 @@ poll_again: | |||
382 | * printed characters if we think that | 392 | * printed characters if we think that |
383 | * kgdb is connecting, until the check | 393 | * kgdb is connecting, until the check |
384 | * fails */ | 394 | * fails */ |
385 | if (!KDB_STATE(KGDB_TRANS)) | 395 | if (!KDB_STATE(KGDB_TRANS)) { |
386 | kgdb_transition_check(buffer); | 396 | if (kgdb_transition_check(buffer)) |
387 | else | 397 | return buffer; |
398 | } else { | ||
388 | kdb_printf("%c", key); | 399 | kdb_printf("%c", key); |
400 | } | ||
389 | } | 401 | } |
390 | /* Special escape to kgdb */ | 402 | /* Special escape to kgdb */ |
391 | if (lastchar - buffer >= 5 && | 403 | if (lastchar - buffer >= 5 && |
392 | strcmp(lastchar - 5, "$?#3f") == 0) { | 404 | strcmp(lastchar - 5, "$?#3f") == 0) { |
405 | kdb_gdb_state_pass(lastchar - 5); | ||
393 | strcpy(buffer, "kgdb"); | 406 | strcpy(buffer, "kgdb"); |
394 | KDB_STATE_SET(DOING_KGDB); | 407 | KDB_STATE_SET(DOING_KGDB); |
395 | return buffer; | 408 | return buffer; |
396 | } | 409 | } |
397 | if (lastchar - buffer >= 14 && | 410 | if (lastchar - buffer >= 11 && |
398 | strcmp(lastchar - 14, "$qSupported#37") == 0) { | 411 | strcmp(lastchar - 11, "$qSupported") == 0) { |
412 | kdb_gdb_state_pass(lastchar - 11); | ||
399 | strcpy(buffer, "kgdb"); | 413 | strcpy(buffer, "kgdb"); |
400 | KDB_STATE_SET(DOING_KGDB2); | 414 | KDB_STATE_SET(DOING_KGDB); |
401 | return buffer; | 415 | return buffer; |
402 | } | 416 | } |
403 | } | 417 | } |
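The magic strings in these hunks are gdb remote-serial-protocol framing: a packet travels as "$<payload>#<checksum>", where the checksum is the payload bytes summed modulo 256 and written as two hex digits, and a leading '+' merely acknowledges the previous packet. That is why "?" appears as "$?#3f" ('?' is 0x3f), why the old full-packet match on "$qSupported#37" can be relaxed to matching "$qSupported" alone, and it also accounts for the "$D#44" and "$3#33" literals printed by kdb_main.c below. A stand-alone checker (plain C, not kernel code):

#include <stdio.h>
#include <string.h>

/* GDB remote serial protocol: checksum = sum of payload bytes mod 256 */
static unsigned int gdb_csum(const char *payload)
{
        unsigned int sum = 0;

        while (*payload)
                sum += (unsigned char)*payload++;
        return sum & 0xff;
}

int main(void)
{
        const char *pkts[] = { "?", "qSupported", "D", "3" };

        for (int i = 0; i < 4; i++)
                printf("$%s#%02x\n", pkts[i], gdb_csum(pkts[i]));
        /* prints: $?#3f  $qSupported#37  $D#44  $3#33 */
        return 0;
}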
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index be14779bcef..63786e71a3c 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -145,7 +145,6 @@ static char *__env[] = { | |||
145 | #endif | 145 | #endif |
146 | "RADIX=16", | 146 | "RADIX=16", |
147 | "MDCOUNT=8", /* lines of md output */ | 147 | "MDCOUNT=8", /* lines of md output */ |
148 | "BTARGS=9", /* 9 possible args in bt */ | ||
149 | KDB_PLATFORM_ENV, | 148 | KDB_PLATFORM_ENV, |
150 | "DTABCOUNT=30", | 149 | "DTABCOUNT=30", |
151 | "NOSECT=1", | 150 | "NOSECT=1", |
@@ -172,6 +171,7 @@ static char *__env[] = { | |||
172 | (char *)0, | 171 | (char *)0, |
173 | (char *)0, | 172 | (char *)0, |
174 | (char *)0, | 173 | (char *)0, |
174 | (char *)0, | ||
175 | }; | 175 | }; |
176 | 176 | ||
177 | static const int __nenv = (sizeof(__env) / sizeof(char *)); | 177 | static const int __nenv = (sizeof(__env) / sizeof(char *)); |
@@ -1386,7 +1386,7 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, | |||
1386 | } | 1386 | } |
1387 | 1387 | ||
1388 | if (result == KDB_CMD_KGDB) { | 1388 | if (result == KDB_CMD_KGDB) { |
1389 | if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) | 1389 | if (!KDB_STATE(DOING_KGDB)) |
1390 | kdb_printf("Entering please attach debugger " | 1390 | kdb_printf("Entering please attach debugger " |
1391 | "or use $D#44+ or $3#33\n"); | 1391 | "or use $D#44+ or $3#33\n"); |
1392 | break; | 1392 | break; |
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 35d69ed1dfb..e381d105b40 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
@@ -21,7 +21,6 @@ | |||
21 | #define KDB_CMD_SS (-1003) | 21 | #define KDB_CMD_SS (-1003) |
22 | #define KDB_CMD_SSB (-1004) | 22 | #define KDB_CMD_SSB (-1004) |
23 | #define KDB_CMD_KGDB (-1005) | 23 | #define KDB_CMD_KGDB (-1005) |
24 | #define KDB_CMD_KGDB2 (-1006) | ||
25 | 24 | ||
26 | /* Internal debug flags */ | 25 | /* Internal debug flags */ |
27 | #define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ | 26 | #define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ |
@@ -146,7 +145,6 @@ extern int kdb_state; | |||
146 | * keyboard on this cpu */ | 145 | * keyboard on this cpu */ |
147 | #define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ | 146 | #define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ |
148 | #define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ | 147 | #define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ |
149 | #define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */ | ||
150 | #define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ | 148 | #define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ |
151 | #define KDB_STATE_ARCH 0xff000000 /* Reserved for arch | 149 | #define KDB_STATE_ARCH 0xff000000 /* Reserved for arch |
152 | * specific use */ | 150 | * specific use */ |
@@ -218,6 +216,7 @@ extern void kdb_print_nameval(const char *name, unsigned long val); | |||
218 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); | 216 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); |
219 | extern void kdb_meminfo_proc_show(void); | 217 | extern void kdb_meminfo_proc_show(void); |
220 | extern char *kdb_getstr(char *, size_t, char *); | 218 | extern char *kdb_getstr(char *, size_t, char *); |
219 | extern void kdb_gdb_state_pass(char *buf); | ||
221 | 220 | ||
222 | /* Defines for kdb_symbol_print */ | 221 | /* Defines for kdb_symbol_print */ |
223 | #define KDB_SP_SPACEB 0x0001 /* Space before string */ | 222 | #define KDB_SP_SPACEB 0x0001 /* Space before string */ |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ead9b610aa7..418b3f7053a 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c | |||
@@ -19,8 +19,10 @@ | |||
19 | #include <linux/time.h> | 19 | #include <linux/time.h> |
20 | #include <linux/sysctl.h> | 20 | #include <linux/sysctl.h> |
21 | #include <linux/delayacct.h> | 21 | #include <linux/delayacct.h> |
22 | #include <linux/module.h> | ||
22 | 23 | ||
23 | int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ | 24 | int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ |
25 | EXPORT_SYMBOL_GPL(delayacct_on); | ||
24 | struct kmem_cache *delayacct_cache; | 26 | struct kmem_cache *delayacct_cache; |
25 | 27 | ||
26 | static int __init delayacct_setup_disable(char *str) | 28 | static int __init delayacct_setup_disable(char *str) |
diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 1ce23d3d839..89e5e8aa4c3 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile | |||
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER | |||
2 | CFLAGS_REMOVE_core.o = -pg | 2 | CFLAGS_REMOVE_core.o = -pg |
3 | endif | 3 | endif |
4 | 4 | ||
5 | obj-y := core.o | 5 | obj-y := core.o ring_buffer.o |
6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
diff --git a/kernel/events/core.c b/kernel/events/core.c index 9efe7108cca..0f857782d06 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -36,6 +36,8 @@ | |||
36 | #include <linux/ftrace_event.h> | 36 | #include <linux/ftrace_event.h> |
37 | #include <linux/hw_breakpoint.h> | 37 | #include <linux/hw_breakpoint.h> |
38 | 38 | ||
39 | #include "internal.h" | ||
40 | |||
39 | #include <asm/irq_regs.h> | 41 | #include <asm/irq_regs.h> |
40 | 42 | ||
41 | struct remote_function_call { | 43 | struct remote_function_call { |
@@ -200,6 +202,22 @@ __get_cpu_context(struct perf_event_context *ctx) | |||
200 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | 202 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); |
201 | } | 203 | } |
202 | 204 | ||
205 | static void perf_ctx_lock(struct perf_cpu_context *cpuctx, | ||
206 | struct perf_event_context *ctx) | ||
207 | { | ||
208 | raw_spin_lock(&cpuctx->ctx.lock); | ||
209 | if (ctx) | ||
210 | raw_spin_lock(&ctx->lock); | ||
211 | } | ||
212 | |||
213 | static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, | ||
214 | struct perf_event_context *ctx) | ||
215 | { | ||
216 | if (ctx) | ||
217 | raw_spin_unlock(&ctx->lock); | ||
218 | raw_spin_unlock(&cpuctx->ctx.lock); | ||
219 | } | ||
220 | |||
203 | #ifdef CONFIG_CGROUP_PERF | 221 | #ifdef CONFIG_CGROUP_PERF |
204 | 222 | ||
205 | /* | 223 | /* |
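perf_ctx_lock()/perf_ctx_unlock() define the locking rule the rest of this patch leans on: the CPU context's ctx.lock is always taken first, the (optional) task context lock nests inside it, and whole reschedule sequences now run under both with the PMU disabled. The cgroup-switch, install and rotation hunks below all reduce to the same shape; roughly (a sketch stitched together from those hunks, using the perf_event_sched_in() helper introduced further down, not a new API):

        perf_ctx_lock(cpuctx, cpuctx->task_ctx);   /* cpuctx->ctx.lock, then ctx->lock */
        perf_pmu_disable(cpuctx->ctx.pmu);

        /* ... ctx_sched_out() / rotate_ctx() / add_event_to_ctx() ... */
        perf_event_sched_in(cpuctx, cpuctx->task_ctx, current);

        perf_pmu_enable(cpuctx->ctx.pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx); /* released in reverse order */

This is also why ctx_sched_out() and ctx_sched_in() lose their own raw_spin_lock(&ctx->lock) calls later in this file: the caller now holds the locks for the whole sequence.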
@@ -340,11 +358,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) | |||
340 | rcu_read_lock(); | 358 | rcu_read_lock(); |
341 | 359 | ||
342 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 360 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
343 | |||
344 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | 361 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
345 | 362 | ||
346 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
347 | |||
348 | /* | 363 | /* |
349 | * perf_cgroup_events says at least one | 364 | * perf_cgroup_events says at least one |
350 | * context on this CPU has cgroup events. | 365 | * context on this CPU has cgroup events. |
@@ -353,6 +368,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) | |||
353 | * events for a context. | 368 | * events for a context. |
354 | */ | 369 | */ |
355 | if (cpuctx->ctx.nr_cgroups > 0) { | 370 | if (cpuctx->ctx.nr_cgroups > 0) { |
371 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
372 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
356 | 373 | ||
357 | if (mode & PERF_CGROUP_SWOUT) { | 374 | if (mode & PERF_CGROUP_SWOUT) { |
358 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); | 375 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); |
@@ -372,9 +389,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode) | |||
372 | cpuctx->cgrp = perf_cgroup_from_task(task); | 389 | cpuctx->cgrp = perf_cgroup_from_task(task); |
373 | cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); | 390 | cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); |
374 | } | 391 | } |
392 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
393 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
375 | } | 394 | } |
376 | |||
377 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
378 | } | 395 | } |
379 | 396 | ||
380 | rcu_read_unlock(); | 397 | rcu_read_unlock(); |
@@ -382,14 +399,54 @@ void perf_cgroup_switch(struct task_struct *task, int mode) | |||
382 | local_irq_restore(flags); | 399 | local_irq_restore(flags); |
383 | } | 400 | } |
384 | 401 | ||
385 | static inline void perf_cgroup_sched_out(struct task_struct *task) | 402 | static inline void perf_cgroup_sched_out(struct task_struct *task, |
403 | struct task_struct *next) | ||
386 | { | 404 | { |
387 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT); | 405 | struct perf_cgroup *cgrp1; |
406 | struct perf_cgroup *cgrp2 = NULL; | ||
407 | |||
408 | /* | ||
409 | * we come here when we know perf_cgroup_events > 0 | ||
410 | */ | ||
411 | cgrp1 = perf_cgroup_from_task(task); | ||
412 | |||
413 | /* | ||
414 | * next is NULL when called from perf_event_enable_on_exec() | ||
415 | * that will systematically cause a cgroup_switch() | ||
416 | */ | ||
417 | if (next) | ||
418 | cgrp2 = perf_cgroup_from_task(next); | ||
419 | |||
420 | /* | ||
421 | * only schedule out current cgroup events if we know | ||
422 | * that we are switching to a different cgroup. Otherwise, | ||
423 | * do no touch the cgroup events. | ||
424 | */ | ||
425 | if (cgrp1 != cgrp2) | ||
426 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT); | ||
388 | } | 427 | } |
389 | 428 | ||
390 | static inline void perf_cgroup_sched_in(struct task_struct *task) | 429 | static inline void perf_cgroup_sched_in(struct task_struct *prev, |
430 | struct task_struct *task) | ||
391 | { | 431 | { |
392 | perf_cgroup_switch(task, PERF_CGROUP_SWIN); | 432 | struct perf_cgroup *cgrp1; |
433 | struct perf_cgroup *cgrp2 = NULL; | ||
434 | |||
435 | /* | ||
436 | * we come here when we know perf_cgroup_events > 0 | ||
437 | */ | ||
438 | cgrp1 = perf_cgroup_from_task(task); | ||
439 | |||
440 | /* prev can never be NULL */ | ||
441 | cgrp2 = perf_cgroup_from_task(prev); | ||
442 | |||
443 | /* | ||
444 | * only need to schedule in cgroup events if we are changing | ||
445 | * cgroup during ctxsw. Cgroup events were not scheduled | ||
446 | * out of ctxsw out if that was not the case. | ||
447 | */ | ||
448 | if (cgrp1 != cgrp2) | ||
449 | perf_cgroup_switch(task, PERF_CGROUP_SWIN); | ||
393 | } | 450 | } |
394 | 451 | ||
395 | static inline int perf_cgroup_connect(int fd, struct perf_event *event, | 452 | static inline int perf_cgroup_connect(int fd, struct perf_event *event, |
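The net effect of the new prev/next plumbing is an optimization: a context switch between two tasks in the same perf cgroup no longer pays for a perf_cgroup_switch(), that is, no PMU disable plus sched-out and sched-in of cgroup events that would come back unchanged. Reduced to a stand-alone predicate (plain C with stand-in types; the real code resolves the cgroups through perf_cgroup_from_task()):

#include <stdbool.h>
#include <stddef.h>

struct perf_cgroup { int id; };
struct task { struct perf_cgroup *cgrp; };

/*
 * Model of the new check: switching cgroup events is only worth doing
 * when prev and next resolve to different perf cgroups; a NULL next
 * (the enable-on-exec path) always forces the switch.
 */
static bool need_cgroup_switch(struct task *prev, struct task *next)
{
        return prev->cgrp != (next ? next->cgrp : NULL);
}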
@@ -501,11 +558,13 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) | |||
501 | { | 558 | { |
502 | } | 559 | } |
503 | 560 | ||
504 | static inline void perf_cgroup_sched_out(struct task_struct *task) | 561 | static inline void perf_cgroup_sched_out(struct task_struct *task, |
562 | struct task_struct *next) | ||
505 | { | 563 | { |
506 | } | 564 | } |
507 | 565 | ||
508 | static inline void perf_cgroup_sched_in(struct task_struct *task) | 566 | static inline void perf_cgroup_sched_in(struct task_struct *prev, |
567 | struct task_struct *task) | ||
509 | { | 568 | { |
510 | } | 569 | } |
511 | 570 | ||
@@ -731,6 +790,7 @@ static u64 perf_event_time(struct perf_event *event) | |||
731 | 790 | ||
732 | /* | 791 | /* |
733 | * Update the total_time_enabled and total_time_running fields for a event. | 792 | * Update the total_time_enabled and total_time_running fields for a event. |
793 | * The caller of this function needs to hold the ctx->lock. | ||
734 | */ | 794 | */ |
735 | static void update_event_times(struct perf_event *event) | 795 | static void update_event_times(struct perf_event *event) |
736 | { | 796 | { |
@@ -1105,6 +1165,10 @@ static int __perf_remove_from_context(void *info) | |||
1105 | raw_spin_lock(&ctx->lock); | 1165 | raw_spin_lock(&ctx->lock); |
1106 | event_sched_out(event, cpuctx, ctx); | 1166 | event_sched_out(event, cpuctx, ctx); |
1107 | list_del_event(event, ctx); | 1167 | list_del_event(event, ctx); |
1168 | if (!ctx->nr_events && cpuctx->task_ctx == ctx) { | ||
1169 | ctx->is_active = 0; | ||
1170 | cpuctx->task_ctx = NULL; | ||
1171 | } | ||
1108 | raw_spin_unlock(&ctx->lock); | 1172 | raw_spin_unlock(&ctx->lock); |
1109 | 1173 | ||
1110 | return 0; | 1174 | return 0; |
@@ -1454,8 +1518,24 @@ static void add_event_to_ctx(struct perf_event *event, | |||
1454 | event->tstamp_stopped = tstamp; | 1518 | event->tstamp_stopped = tstamp; |
1455 | } | 1519 | } |
1456 | 1520 | ||
1457 | static void perf_event_context_sched_in(struct perf_event_context *ctx, | 1521 | static void task_ctx_sched_out(struct perf_event_context *ctx); |
1458 | struct task_struct *tsk); | 1522 | static void |
1523 | ctx_sched_in(struct perf_event_context *ctx, | ||
1524 | struct perf_cpu_context *cpuctx, | ||
1525 | enum event_type_t event_type, | ||
1526 | struct task_struct *task); | ||
1527 | |||
1528 | static void perf_event_sched_in(struct perf_cpu_context *cpuctx, | ||
1529 | struct perf_event_context *ctx, | ||
1530 | struct task_struct *task) | ||
1531 | { | ||
1532 | cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); | ||
1533 | if (ctx) | ||
1534 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); | ||
1535 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); | ||
1536 | if (ctx) | ||
1537 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); | ||
1538 | } | ||
1459 | 1539 | ||
1460 | /* | 1540 | /* |
1461 | * Cross CPU call to install and enable a performance event | 1541 | * Cross CPU call to install and enable a performance event |
@@ -1466,20 +1546,37 @@ static int __perf_install_in_context(void *info) | |||
1466 | { | 1546 | { |
1467 | struct perf_event *event = info; | 1547 | struct perf_event *event = info; |
1468 | struct perf_event_context *ctx = event->ctx; | 1548 | struct perf_event_context *ctx = event->ctx; |
1469 | struct perf_event *leader = event->group_leader; | ||
1470 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1549 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
1471 | int err; | 1550 | struct perf_event_context *task_ctx = cpuctx->task_ctx; |
1551 | struct task_struct *task = current; | ||
1552 | |||
1553 | perf_ctx_lock(cpuctx, task_ctx); | ||
1554 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
1472 | 1555 | ||
1473 | /* | 1556 | /* |
1474 | * In case we're installing a new context to an already running task, | 1557 | * If there was an active task_ctx schedule it out. |
1475 | * could also happen before perf_event_task_sched_in() on architectures | ||
1476 | * which do context switches with IRQs enabled. | ||
1477 | */ | 1558 | */ |
1478 | if (ctx->task && !cpuctx->task_ctx) | 1559 | if (task_ctx) |
1479 | perf_event_context_sched_in(ctx, ctx->task); | 1560 | task_ctx_sched_out(task_ctx); |
1561 | |||
1562 | /* | ||
1563 | * If the context we're installing events in is not the | ||
1564 | * active task_ctx, flip them. | ||
1565 | */ | ||
1566 | if (ctx->task && task_ctx != ctx) { | ||
1567 | if (task_ctx) | ||
1568 | raw_spin_unlock(&task_ctx->lock); | ||
1569 | raw_spin_lock(&ctx->lock); | ||
1570 | task_ctx = ctx; | ||
1571 | } | ||
1572 | |||
1573 | if (task_ctx) { | ||
1574 | cpuctx->task_ctx = task_ctx; | ||
1575 | task = task_ctx->task; | ||
1576 | } | ||
1577 | |||
1578 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); | ||
1480 | 1579 | ||
1481 | raw_spin_lock(&ctx->lock); | ||
1482 | ctx->is_active = 1; | ||
1483 | update_context_time(ctx); | 1580 | update_context_time(ctx); |
1484 | /* | 1581 | /* |
1485 | * update cgrp time only if current cgrp | 1582 | * update cgrp time only if current cgrp |
@@ -1490,43 +1587,13 @@ static int __perf_install_in_context(void *info) | |||
1490 | 1587 | ||
1491 | add_event_to_ctx(event, ctx); | 1588 | add_event_to_ctx(event, ctx); |
1492 | 1589 | ||
1493 | if (!event_filter_match(event)) | ||
1494 | goto unlock; | ||
1495 | |||
1496 | /* | 1590 | /* |
1497 | * Don't put the event on if it is disabled or if | 1591 | * Schedule everything back in |
1498 | * it is in a group and the group isn't on. | ||
1499 | */ | 1592 | */ |
1500 | if (event->state != PERF_EVENT_STATE_INACTIVE || | 1593 | perf_event_sched_in(cpuctx, task_ctx, task); |
1501 | (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) | ||
1502 | goto unlock; | ||
1503 | 1594 | ||
1504 | /* | 1595 | perf_pmu_enable(cpuctx->ctx.pmu); |
1505 | * An exclusive event can't go on if there are already active | 1596 | perf_ctx_unlock(cpuctx, task_ctx); |
1506 | * hardware events, and no hardware event can go on if there | ||
1507 | * is already an exclusive event on. | ||
1508 | */ | ||
1509 | if (!group_can_go_on(event, cpuctx, 1)) | ||
1510 | err = -EEXIST; | ||
1511 | else | ||
1512 | err = event_sched_in(event, cpuctx, ctx); | ||
1513 | |||
1514 | if (err) { | ||
1515 | /* | ||
1516 | * This event couldn't go on. If it is in a group | ||
1517 | * then we have to pull the whole group off. | ||
1518 | * If the event group is pinned then put it in error state. | ||
1519 | */ | ||
1520 | if (leader != event) | ||
1521 | group_sched_out(leader, cpuctx, ctx); | ||
1522 | if (leader->attr.pinned) { | ||
1523 | update_group_times(leader); | ||
1524 | leader->state = PERF_EVENT_STATE_ERROR; | ||
1525 | } | ||
1526 | } | ||
1527 | |||
1528 | unlock: | ||
1529 | raw_spin_unlock(&ctx->lock); | ||
1530 | 1597 | ||
1531 | return 0; | 1598 | return 0; |
1532 | } | 1599 | } |
@@ -1739,7 +1806,7 @@ out: | |||
1739 | raw_spin_unlock_irq(&ctx->lock); | 1806 | raw_spin_unlock_irq(&ctx->lock); |
1740 | } | 1807 | } |
1741 | 1808 | ||
1742 | static int perf_event_refresh(struct perf_event *event, int refresh) | 1809 | int perf_event_refresh(struct perf_event *event, int refresh) |
1743 | { | 1810 | { |
1744 | /* | 1811 | /* |
1745 | * not supported on inherited events | 1812 | * not supported on inherited events |
@@ -1752,36 +1819,35 @@ static int perf_event_refresh(struct perf_event *event, int refresh) | |||
1752 | 1819 | ||
1753 | return 0; | 1820 | return 0; |
1754 | } | 1821 | } |
1822 | EXPORT_SYMBOL_GPL(perf_event_refresh); | ||
1755 | 1823 | ||
1756 | static void ctx_sched_out(struct perf_event_context *ctx, | 1824 | static void ctx_sched_out(struct perf_event_context *ctx, |
1757 | struct perf_cpu_context *cpuctx, | 1825 | struct perf_cpu_context *cpuctx, |
1758 | enum event_type_t event_type) | 1826 | enum event_type_t event_type) |
1759 | { | 1827 | { |
1760 | struct perf_event *event; | 1828 | struct perf_event *event; |
1829 | int is_active = ctx->is_active; | ||
1761 | 1830 | ||
1762 | raw_spin_lock(&ctx->lock); | 1831 | ctx->is_active &= ~event_type; |
1763 | perf_pmu_disable(ctx->pmu); | ||
1764 | ctx->is_active = 0; | ||
1765 | if (likely(!ctx->nr_events)) | 1832 | if (likely(!ctx->nr_events)) |
1766 | goto out; | 1833 | return; |
1834 | |||
1767 | update_context_time(ctx); | 1835 | update_context_time(ctx); |
1768 | update_cgrp_time_from_cpuctx(cpuctx); | 1836 | update_cgrp_time_from_cpuctx(cpuctx); |
1769 | |||
1770 | if (!ctx->nr_active) | 1837 | if (!ctx->nr_active) |
1771 | goto out; | 1838 | return; |
1772 | 1839 | ||
1773 | if (event_type & EVENT_PINNED) { | 1840 | perf_pmu_disable(ctx->pmu); |
1841 | if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { | ||
1774 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) | 1842 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) |
1775 | group_sched_out(event, cpuctx, ctx); | 1843 | group_sched_out(event, cpuctx, ctx); |
1776 | } | 1844 | } |
1777 | 1845 | ||
1778 | if (event_type & EVENT_FLEXIBLE) { | 1846 | if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { |
1779 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) | 1847 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) |
1780 | group_sched_out(event, cpuctx, ctx); | 1848 | group_sched_out(event, cpuctx, ctx); |
1781 | } | 1849 | } |
1782 | out: | ||
1783 | perf_pmu_enable(ctx->pmu); | 1850 | perf_pmu_enable(ctx->pmu); |
1784 | raw_spin_unlock(&ctx->lock); | ||
1785 | } | 1851 | } |
1786 | 1852 | ||
1787 | /* | 1853 | /* |
@@ -1929,8 +1995,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
1929 | rcu_read_unlock(); | 1995 | rcu_read_unlock(); |
1930 | 1996 | ||
1931 | if (do_switch) { | 1997 | if (do_switch) { |
1998 | raw_spin_lock(&ctx->lock); | ||
1932 | ctx_sched_out(ctx, cpuctx, EVENT_ALL); | 1999 | ctx_sched_out(ctx, cpuctx, EVENT_ALL); |
1933 | cpuctx->task_ctx = NULL; | 2000 | cpuctx->task_ctx = NULL; |
2001 | raw_spin_unlock(&ctx->lock); | ||
1934 | } | 2002 | } |
1935 | } | 2003 | } |
1936 | 2004 | ||
@@ -1962,11 +2030,10 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
1962 | * cgroup event are system-wide mode only | 2030 | * cgroup event are system-wide mode only |
1963 | */ | 2031 | */ |
1964 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | 2032 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) |
1965 | perf_cgroup_sched_out(task); | 2033 | perf_cgroup_sched_out(task, next); |
1966 | } | 2034 | } |
1967 | 2035 | ||
1968 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 2036 | static void task_ctx_sched_out(struct perf_event_context *ctx) |
1969 | enum event_type_t event_type) | ||
1970 | { | 2037 | { |
1971 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 2038 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
1972 | 2039 | ||
@@ -1976,7 +2043,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, | |||
1976 | if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) | 2043 | if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) |
1977 | return; | 2044 | return; |
1978 | 2045 | ||
1979 | ctx_sched_out(ctx, cpuctx, event_type); | 2046 | ctx_sched_out(ctx, cpuctx, EVENT_ALL); |
1980 | cpuctx->task_ctx = NULL; | 2047 | cpuctx->task_ctx = NULL; |
1981 | } | 2048 | } |
1982 | 2049 | ||
@@ -2055,11 +2122,11 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
2055 | struct task_struct *task) | 2122 | struct task_struct *task) |
2056 | { | 2123 | { |
2057 | u64 now; | 2124 | u64 now; |
2125 | int is_active = ctx->is_active; | ||
2058 | 2126 | ||
2059 | raw_spin_lock(&ctx->lock); | 2127 | ctx->is_active |= event_type; |
2060 | ctx->is_active = 1; | ||
2061 | if (likely(!ctx->nr_events)) | 2128 | if (likely(!ctx->nr_events)) |
2062 | goto out; | 2129 | return; |
2063 | 2130 | ||
2064 | now = perf_clock(); | 2131 | now = perf_clock(); |
2065 | ctx->timestamp = now; | 2132 | ctx->timestamp = now; |
@@ -2068,15 +2135,12 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
2068 | * First go through the list and put on any pinned groups | 2135 | * First go through the list and put on any pinned groups |
2069 | * in order to give them the best chance of going on. | 2136 | * in order to give them the best chance of going on. |
2070 | */ | 2137 | */ |
2071 | if (event_type & EVENT_PINNED) | 2138 | if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) |
2072 | ctx_pinned_sched_in(ctx, cpuctx); | 2139 | ctx_pinned_sched_in(ctx, cpuctx); |
2073 | 2140 | ||
2074 | /* Then walk through the lower prio flexible groups */ | 2141 | /* Then walk through the lower prio flexible groups */ |
2075 | if (event_type & EVENT_FLEXIBLE) | 2142 | if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) |
2076 | ctx_flexible_sched_in(ctx, cpuctx); | 2143 | ctx_flexible_sched_in(ctx, cpuctx); |
2077 | |||
2078 | out: | ||
2079 | raw_spin_unlock(&ctx->lock); | ||
2080 | } | 2144 | } |
2081 | 2145 | ||
2082 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | 2146 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
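ctx->is_active has quietly become a bitmask here: ctx_sched_out() clears the bits it was asked to remove and only tears down group types that were actually active, while ctx_sched_in() sets the bits and skips types that never went away. The payoff is in the rotation path, which scheds out only the flexible groups and can then do a full sched-in without re-installing pinned groups that never left the PMU. A self-contained model of that bookkeeping (plain C; the EVENT_* values are illustrative, the real enum is private to core.c):

#include <stdio.h>

enum { EVENT_PINNED = 0x1, EVENT_FLEXIBLE = 0x2, EVENT_ALL = 0x3 };

static int is_active;   /* models ctx->is_active */

static void sched_out(int type)
{
        int was = is_active;

        is_active &= ~type;
        if (was & type & EVENT_PINNED)
                printf("  tear down pinned groups\n");
        if (was & type & EVENT_FLEXIBLE)
                printf("  tear down flexible groups\n");
}

static void sched_in(int type)
{
        int was = is_active;

        is_active |= type;
        if (~was & type & EVENT_PINNED)
                printf("  put pinned groups back on\n");
        if (~was & type & EVENT_FLEXIBLE)
                printf("  put flexible groups back on\n");
}

int main(void)
{
        sched_in(EVENT_ALL);            /* both types go on */
        printf("rotate:\n");
        sched_out(EVENT_FLEXIBLE);      /* only flexible come off */
        sched_in(EVENT_ALL);            /* only flexible go back; pinned untouched */
        return 0;
}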
@@ -2088,19 +2152,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
2088 | ctx_sched_in(ctx, cpuctx, event_type, task); | 2152 | ctx_sched_in(ctx, cpuctx, event_type, task); |
2089 | } | 2153 | } |
2090 | 2154 | ||
2091 | static void task_ctx_sched_in(struct perf_event_context *ctx, | ||
2092 | enum event_type_t event_type) | ||
2093 | { | ||
2094 | struct perf_cpu_context *cpuctx; | ||
2095 | |||
2096 | cpuctx = __get_cpu_context(ctx); | ||
2097 | if (cpuctx->task_ctx == ctx) | ||
2098 | return; | ||
2099 | |||
2100 | ctx_sched_in(ctx, cpuctx, event_type, NULL); | ||
2101 | cpuctx->task_ctx = ctx; | ||
2102 | } | ||
2103 | |||
2104 | static void perf_event_context_sched_in(struct perf_event_context *ctx, | 2155 | static void perf_event_context_sched_in(struct perf_event_context *ctx, |
2105 | struct task_struct *task) | 2156 | struct task_struct *task) |
2106 | { | 2157 | { |
@@ -2110,6 +2161,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
2110 | if (cpuctx->task_ctx == ctx) | 2161 | if (cpuctx->task_ctx == ctx) |
2111 | return; | 2162 | return; |
2112 | 2163 | ||
2164 | perf_ctx_lock(cpuctx, ctx); | ||
2113 | perf_pmu_disable(ctx->pmu); | 2165 | perf_pmu_disable(ctx->pmu); |
2114 | /* | 2166 | /* |
2115 | * We want to keep the following priority order: | 2167 | * We want to keep the following priority order: |
@@ -2118,18 +2170,18 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
2118 | */ | 2170 | */ |
2119 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2171 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
2120 | 2172 | ||
2121 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); | 2173 | perf_event_sched_in(cpuctx, ctx, task); |
2122 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); | ||
2123 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); | ||
2124 | 2174 | ||
2125 | cpuctx->task_ctx = ctx; | 2175 | cpuctx->task_ctx = ctx; |
2126 | 2176 | ||
2177 | perf_pmu_enable(ctx->pmu); | ||
2178 | perf_ctx_unlock(cpuctx, ctx); | ||
2179 | |||
2127 | /* | 2180 | /* |
2128 | * Since these rotations are per-cpu, we need to ensure the | 2181 | * Since these rotations are per-cpu, we need to ensure the |
2129 | * cpu-context we got scheduled on is actually rotating. | 2182 | * cpu-context we got scheduled on is actually rotating. |
2130 | */ | 2183 | */ |
2131 | perf_pmu_rotate_start(ctx->pmu); | 2184 | perf_pmu_rotate_start(ctx->pmu); |
2132 | perf_pmu_enable(ctx->pmu); | ||
2133 | } | 2185 | } |
2134 | 2186 | ||
2135 | /* | 2187 | /* |
@@ -2143,7 +2195,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
2143 | * accessing the event control register. If a NMI hits, then it will | 2195 | * accessing the event control register. If a NMI hits, then it will |
2144 | * keep the event running. | 2196 | * keep the event running. |
2145 | */ | 2197 | */ |
2146 | void __perf_event_task_sched_in(struct task_struct *task) | 2198 | void __perf_event_task_sched_in(struct task_struct *prev, |
2199 | struct task_struct *task) | ||
2147 | { | 2200 | { |
2148 | struct perf_event_context *ctx; | 2201 | struct perf_event_context *ctx; |
2149 | int ctxn; | 2202 | int ctxn; |
@@ -2161,7 +2214,7 @@ void __perf_event_task_sched_in(struct task_struct *task) | |||
2161 | * cgroup event are system-wide mode only | 2214 | * cgroup event are system-wide mode only |
2162 | */ | 2215 | */ |
2163 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | 2216 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) |
2164 | perf_cgroup_sched_in(task); | 2217 | perf_cgroup_sched_in(prev, task); |
2165 | } | 2218 | } |
2166 | 2219 | ||
2167 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2220 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
@@ -2269,7 +2322,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
2269 | u64 interrupts, now; | 2322 | u64 interrupts, now; |
2270 | s64 delta; | 2323 | s64 delta; |
2271 | 2324 | ||
2272 | raw_spin_lock(&ctx->lock); | ||
2273 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | 2325 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
2274 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 2326 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
2275 | continue; | 2327 | continue; |
@@ -2301,7 +2353,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
2301 | if (delta > 0) | 2353 | if (delta > 0) |
2302 | perf_adjust_period(event, period, delta); | 2354 | perf_adjust_period(event, period, delta); |
2303 | } | 2355 | } |
2304 | raw_spin_unlock(&ctx->lock); | ||
2305 | } | 2356 | } |
2306 | 2357 | ||
2307 | /* | 2358 | /* |
@@ -2309,16 +2360,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
2309 | */ | 2360 | */ |
2310 | static void rotate_ctx(struct perf_event_context *ctx) | 2361 | static void rotate_ctx(struct perf_event_context *ctx) |
2311 | { | 2362 | { |
2312 | raw_spin_lock(&ctx->lock); | ||
2313 | |||
2314 | /* | 2363 | /* |
2315 | * Rotate the first entry last of non-pinned groups. Rotation might be | 2364 | * Rotate the first entry last of non-pinned groups. Rotation might be |
2316 | * disabled by the inheritance code. | 2365 | * disabled by the inheritance code. |
2317 | */ | 2366 | */ |
2318 | if (!ctx->rotate_disable) | 2367 | if (!ctx->rotate_disable) |
2319 | list_rotate_left(&ctx->flexible_groups); | 2368 | list_rotate_left(&ctx->flexible_groups); |
2320 | |||
2321 | raw_spin_unlock(&ctx->lock); | ||
2322 | } | 2369 | } |
2323 | 2370 | ||
2324 | /* | 2371 | /* |
@@ -2345,6 +2392,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
2345 | rotate = 1; | 2392 | rotate = 1; |
2346 | } | 2393 | } |
2347 | 2394 | ||
2395 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
2348 | perf_pmu_disable(cpuctx->ctx.pmu); | 2396 | perf_pmu_disable(cpuctx->ctx.pmu); |
2349 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); | 2397 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); |
2350 | if (ctx) | 2398 | if (ctx) |
@@ -2355,21 +2403,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
2355 | 2403 | ||
2356 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2404 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
2357 | if (ctx) | 2405 | if (ctx) |
2358 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); | 2406 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); |
2359 | 2407 | ||
2360 | rotate_ctx(&cpuctx->ctx); | 2408 | rotate_ctx(&cpuctx->ctx); |
2361 | if (ctx) | 2409 | if (ctx) |
2362 | rotate_ctx(ctx); | 2410 | rotate_ctx(ctx); |
2363 | 2411 | ||
2364 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); | 2412 | perf_event_sched_in(cpuctx, ctx, current); |
2365 | if (ctx) | ||
2366 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); | ||
2367 | 2413 | ||
2368 | done: | 2414 | done: |
2369 | if (remove) | 2415 | if (remove) |
2370 | list_del_init(&cpuctx->rotation_list); | 2416 | list_del_init(&cpuctx->rotation_list); |
2371 | 2417 | ||
2372 | perf_pmu_enable(cpuctx->ctx.pmu); | 2418 | perf_pmu_enable(cpuctx->ctx.pmu); |
2419 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
2373 | } | 2420 | } |
2374 | 2421 | ||
2375 | void perf_event_task_tick(void) | 2422 | void perf_event_task_tick(void) |
@@ -2423,10 +2470,10 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
2423 | * ctxswin cgroup events which are already scheduled | 2470 | * ctxswin cgroup events which are already scheduled |
2424 | * in. | 2471 | * in. |
2425 | */ | 2472 | */ |
2426 | perf_cgroup_sched_out(current); | 2473 | perf_cgroup_sched_out(current, NULL); |
2427 | task_ctx_sched_out(ctx, EVENT_ALL); | ||
2428 | 2474 | ||
2429 | raw_spin_lock(&ctx->lock); | 2475 | raw_spin_lock(&ctx->lock); |
2476 | task_ctx_sched_out(ctx); | ||
2430 | 2477 | ||
2431 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { | 2478 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { |
2432 | ret = event_enable_on_exec(event, ctx); | 2479 | ret = event_enable_on_exec(event, ctx); |
@@ -2835,16 +2882,12 @@ retry: | |||
2835 | unclone_ctx(ctx); | 2882 | unclone_ctx(ctx); |
2836 | ++ctx->pin_count; | 2883 | ++ctx->pin_count; |
2837 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2884 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
2838 | } | 2885 | } else { |
2839 | |||
2840 | if (!ctx) { | ||
2841 | ctx = alloc_perf_context(pmu, task); | 2886 | ctx = alloc_perf_context(pmu, task); |
2842 | err = -ENOMEM; | 2887 | err = -ENOMEM; |
2843 | if (!ctx) | 2888 | if (!ctx) |
2844 | goto errout; | 2889 | goto errout; |
2845 | 2890 | ||
2846 | get_ctx(ctx); | ||
2847 | |||
2848 | err = 0; | 2891 | err = 0; |
2849 | mutex_lock(&task->perf_event_mutex); | 2892 | mutex_lock(&task->perf_event_mutex); |
2850 | /* | 2893 | /* |
@@ -2856,14 +2899,14 @@ retry: | |||
2856 | else if (task->perf_event_ctxp[ctxn]) | 2899 | else if (task->perf_event_ctxp[ctxn]) |
2857 | err = -EAGAIN; | 2900 | err = -EAGAIN; |
2858 | else { | 2901 | else { |
2902 | get_ctx(ctx); | ||
2859 | ++ctx->pin_count; | 2903 | ++ctx->pin_count; |
2860 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); | 2904 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); |
2861 | } | 2905 | } |
2862 | mutex_unlock(&task->perf_event_mutex); | 2906 | mutex_unlock(&task->perf_event_mutex); |
2863 | 2907 | ||
2864 | if (unlikely(err)) { | 2908 | if (unlikely(err)) { |
2865 | put_task_struct(task); | 2909 | put_ctx(ctx); |
2866 | kfree(ctx); | ||
2867 | 2910 | ||
2868 | if (err == -EAGAIN) | 2911 | if (err == -EAGAIN) |
2869 | goto retry; | 2912 | goto retry; |
@@ -2890,7 +2933,7 @@ static void free_event_rcu(struct rcu_head *head) | |||
2890 | kfree(event); | 2933 | kfree(event); |
2891 | } | 2934 | } |
2892 | 2935 | ||
2893 | static void perf_buffer_put(struct perf_buffer *buffer); | 2936 | static void ring_buffer_put(struct ring_buffer *rb); |
2894 | 2937 | ||
2895 | static void free_event(struct perf_event *event) | 2938 | static void free_event(struct perf_event *event) |
2896 | { | 2939 | { |
@@ -2913,9 +2956,9 @@ static void free_event(struct perf_event *event) | |||
2913 | } | 2956 | } |
2914 | } | 2957 | } |
2915 | 2958 | ||
2916 | if (event->buffer) { | 2959 | if (event->rb) { |
2917 | perf_buffer_put(event->buffer); | 2960 | ring_buffer_put(event->rb); |
2918 | event->buffer = NULL; | 2961 | event->rb = NULL; |
2919 | } | 2962 | } |
2920 | 2963 | ||
2921 | if (is_cgroup_event(event)) | 2964 | if (is_cgroup_event(event)) |
@@ -2934,12 +2977,6 @@ int perf_event_release_kernel(struct perf_event *event) | |||
2934 | { | 2977 | { |
2935 | struct perf_event_context *ctx = event->ctx; | 2978 | struct perf_event_context *ctx = event->ctx; |
2936 | 2979 | ||
2937 | /* | ||
2938 | * Remove from the PMU, can't get re-enabled since we got | ||
2939 | * here because the last ref went. | ||
2940 | */ | ||
2941 | perf_event_disable(event); | ||
2942 | |||
2943 | WARN_ON_ONCE(ctx->parent_ctx); | 2980 | WARN_ON_ONCE(ctx->parent_ctx); |
2944 | /* | 2981 | /* |
2945 | * There are two ways this annotation is useful: | 2982 | * There are two ways this annotation is useful: |
@@ -2956,8 +2993,8 @@ int perf_event_release_kernel(struct perf_event *event) | |||
2956 | mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); | 2993 | mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); |
2957 | raw_spin_lock_irq(&ctx->lock); | 2994 | raw_spin_lock_irq(&ctx->lock); |
2958 | perf_group_detach(event); | 2995 | perf_group_detach(event); |
2959 | list_del_event(event, ctx); | ||
2960 | raw_spin_unlock_irq(&ctx->lock); | 2996 | raw_spin_unlock_irq(&ctx->lock); |
2997 | perf_remove_from_context(event); | ||
2961 | mutex_unlock(&ctx->mutex); | 2998 | mutex_unlock(&ctx->mutex); |
2962 | 2999 | ||
2963 | free_event(event); | 3000 | free_event(event); |
@@ -3149,13 +3186,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
3149 | static unsigned int perf_poll(struct file *file, poll_table *wait) | 3186 | static unsigned int perf_poll(struct file *file, poll_table *wait) |
3150 | { | 3187 | { |
3151 | struct perf_event *event = file->private_data; | 3188 | struct perf_event *event = file->private_data; |
3152 | struct perf_buffer *buffer; | 3189 | struct ring_buffer *rb; |
3153 | unsigned int events = POLL_HUP; | 3190 | unsigned int events = POLL_HUP; |
3154 | 3191 | ||
3155 | rcu_read_lock(); | 3192 | rcu_read_lock(); |
3156 | buffer = rcu_dereference(event->buffer); | 3193 | rb = rcu_dereference(event->rb); |
3157 | if (buffer) | 3194 | if (rb) |
3158 | events = atomic_xchg(&buffer->poll, 0); | 3195 | events = atomic_xchg(&rb->poll, 0); |
3159 | rcu_read_unlock(); | 3196 | rcu_read_unlock(); |
3160 | 3197 | ||
3161 | poll_wait(file, &event->waitq, wait); | 3198 | poll_wait(file, &event->waitq, wait); |
@@ -3358,6 +3395,18 @@ static int perf_event_index(struct perf_event *event) | |||
3358 | return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; | 3395 | return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; |
3359 | } | 3396 | } |
3360 | 3397 | ||
3398 | static void calc_timer_values(struct perf_event *event, | ||
3399 | u64 *enabled, | ||
3400 | u64 *running) | ||
3401 | { | ||
3402 | u64 now, ctx_time; | ||
3403 | |||
3404 | now = perf_clock(); | ||
3405 | ctx_time = event->shadow_ctx_time + now; | ||
3406 | *enabled = ctx_time - event->tstamp_enabled; | ||
3407 | *running = ctx_time - event->tstamp_running; | ||
3408 | } | ||
3409 | |||
3361 | /* | 3410 | /* |
3362 | * Callers need to ensure there can be no nesting of this function, otherwise | 3411 | * Callers need to ensure there can be no nesting of this function, otherwise |
3363 | * the seqlock logic goes bad. We can not serialize this because the arch | 3412 | * the seqlock logic goes bad. We can not serialize this because the arch |
@@ -3366,14 +3415,25 @@ static int perf_event_index(struct perf_event *event) | |||
3366 | void perf_event_update_userpage(struct perf_event *event) | 3415 | void perf_event_update_userpage(struct perf_event *event) |
3367 | { | 3416 | { |
3368 | struct perf_event_mmap_page *userpg; | 3417 | struct perf_event_mmap_page *userpg; |
3369 | struct perf_buffer *buffer; | 3418 | struct ring_buffer *rb; |
3419 | u64 enabled, running; | ||
3370 | 3420 | ||
3371 | rcu_read_lock(); | 3421 | rcu_read_lock(); |
3372 | buffer = rcu_dereference(event->buffer); | 3422 | /* |
3373 | if (!buffer) | 3423 | * compute total_time_enabled, total_time_running |
3424 | * based on snapshot values taken when the event | ||
3425 | * was last scheduled in. | ||
3426 | * | ||
3427 | * we cannot simply called update_context_time() | ||
3428 | * because of locking issue as we can be called in | ||
3429 | * NMI context | ||
3430 | */ | ||
3431 | calc_timer_values(event, &enabled, &running); | ||
3432 | rb = rcu_dereference(event->rb); | ||
3433 | if (!rb) | ||
3374 | goto unlock; | 3434 | goto unlock; |
3375 | 3435 | ||
3376 | userpg = buffer->user_page; | 3436 | userpg = rb->user_page; |
3377 | 3437 | ||
3378 | /* | 3438 | /* |
3379 | * Disable preemption so as to not let the corresponding user-space | 3439 | * Disable preemption so as to not let the corresponding user-space |
@@ -3387,10 +3447,10 @@ void perf_event_update_userpage(struct perf_event *event) | |||
3387 | if (event->state == PERF_EVENT_STATE_ACTIVE) | 3447 | if (event->state == PERF_EVENT_STATE_ACTIVE) |
3388 | userpg->offset -= local64_read(&event->hw.prev_count); | 3448 | userpg->offset -= local64_read(&event->hw.prev_count); |
3389 | 3449 | ||
3390 | userpg->time_enabled = event->total_time_enabled + | 3450 | userpg->time_enabled = enabled + |
3391 | atomic64_read(&event->child_total_time_enabled); | 3451 | atomic64_read(&event->child_total_time_enabled); |
3392 | 3452 | ||
3393 | userpg->time_running = event->total_time_running + | 3453 | userpg->time_running = running + |
3394 | atomic64_read(&event->child_total_time_running); | 3454 | atomic64_read(&event->child_total_time_running); |
3395 | 3455 | ||
3396 | barrier(); | 3456 | barrier(); |
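calc_timer_values() exists because perf_event_update_userpage() can be reached from NMI context, where taking ctx->lock for update_context_time() is off the table (compare the new locking comment added to update_event_times() earlier in this patch). It therefore rebuilds the enabled/running times purely from per-event snapshots: shadow_ctx_time holds, roughly, "context time minus perf_clock()" as of the last time the event was scheduled in, so adding the current perf_clock() recovers the context time now, and the tstamp_* values subtract out of that. A toy run of the arithmetic (made-up numbers, plain C, not kernel code):

#include <stdio.h>

typedef unsigned long long u64;

/*
 * Toy numbers, all in ns.  Pretend the event was enabled at context
 * time 100, started running at context time 250, and shadow_ctx_time
 * was snapshotted as (context time - perf_clock()) = -400 when it was
 * last scheduled in.
 */
int main(void)
{
        u64 shadow_ctx_time = (u64)-400;  /* wraps; the arithmetic below still works */
        u64 tstamp_enabled  = 100;
        u64 tstamp_running  = 250;
        u64 now             = 900;        /* perf_clock() at NMI time */

        u64 ctx_time = shadow_ctx_time + now;      /* 500: context time "now" */
        u64 enabled  = ctx_time - tstamp_enabled;  /* 400 */
        u64 running  = ctx_time - tstamp_running;  /* 250 */

        printf("enabled=%llu running=%llu\n", enabled, running);
        return 0;
}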
@@ -3400,220 +3460,10 @@ unlock: | |||
3400 | rcu_read_unlock(); | 3460 | rcu_read_unlock(); |
3401 | } | 3461 | } |
3402 | 3462 | ||
3403 | static unsigned long perf_data_size(struct perf_buffer *buffer); | ||
3404 | |||
3405 | static void | ||
3406 | perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) | ||
3407 | { | ||
3408 | long max_size = perf_data_size(buffer); | ||
3409 | |||
3410 | if (watermark) | ||
3411 | buffer->watermark = min(max_size, watermark); | ||
3412 | |||
3413 | if (!buffer->watermark) | ||
3414 | buffer->watermark = max_size / 2; | ||
3415 | |||
3416 | if (flags & PERF_BUFFER_WRITABLE) | ||
3417 | buffer->writable = 1; | ||
3418 | |||
3419 | atomic_set(&buffer->refcount, 1); | ||
3420 | } | ||
3421 | |||
3422 | #ifndef CONFIG_PERF_USE_VMALLOC | ||
3423 | |||
3424 | /* | ||
3425 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. | ||
3426 | */ | ||
3427 | |||
3428 | static struct page * | ||
3429 | perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) | ||
3430 | { | ||
3431 | if (pgoff > buffer->nr_pages) | ||
3432 | return NULL; | ||
3433 | |||
3434 | if (pgoff == 0) | ||
3435 | return virt_to_page(buffer->user_page); | ||
3436 | |||
3437 | return virt_to_page(buffer->data_pages[pgoff - 1]); | ||
3438 | } | ||
3439 | |||
3440 | static void *perf_mmap_alloc_page(int cpu) | ||
3441 | { | ||
3442 | struct page *page; | ||
3443 | int node; | ||
3444 | |||
3445 | node = (cpu == -1) ? cpu : cpu_to_node(cpu); | ||
3446 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | ||
3447 | if (!page) | ||
3448 | return NULL; | ||
3449 | |||
3450 | return page_address(page); | ||
3451 | } | ||
3452 | |||
3453 | static struct perf_buffer * | ||
3454 | perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) | ||
3455 | { | ||
3456 | struct perf_buffer *buffer; | ||
3457 | unsigned long size; | ||
3458 | int i; | ||
3459 | |||
3460 | size = sizeof(struct perf_buffer); | ||
3461 | size += nr_pages * sizeof(void *); | ||
3462 | |||
3463 | buffer = kzalloc(size, GFP_KERNEL); | ||
3464 | if (!buffer) | ||
3465 | goto fail; | ||
3466 | |||
3467 | buffer->user_page = perf_mmap_alloc_page(cpu); | ||
3468 | if (!buffer->user_page) | ||
3469 | goto fail_user_page; | ||
3470 | |||
3471 | for (i = 0; i < nr_pages; i++) { | ||
3472 | buffer->data_pages[i] = perf_mmap_alloc_page(cpu); | ||
3473 | if (!buffer->data_pages[i]) | ||
3474 | goto fail_data_pages; | ||
3475 | } | ||
3476 | |||
3477 | buffer->nr_pages = nr_pages; | ||
3478 | |||
3479 | perf_buffer_init(buffer, watermark, flags); | ||
3480 | |||
3481 | return buffer; | ||
3482 | |||
3483 | fail_data_pages: | ||
3484 | for (i--; i >= 0; i--) | ||
3485 | free_page((unsigned long)buffer->data_pages[i]); | ||
3486 | |||
3487 | free_page((unsigned long)buffer->user_page); | ||
3488 | |||
3489 | fail_user_page: | ||
3490 | kfree(buffer); | ||
3491 | |||
3492 | fail: | ||
3493 | return NULL; | ||
3494 | } | ||
3495 | |||
3496 | static void perf_mmap_free_page(unsigned long addr) | ||
3497 | { | ||
3498 | struct page *page = virt_to_page((void *)addr); | ||
3499 | |||
3500 | page->mapping = NULL; | ||
3501 | __free_page(page); | ||
3502 | } | ||
3503 | |||
3504 | static void perf_buffer_free(struct perf_buffer *buffer) | ||
3505 | { | ||
3506 | int i; | ||
3507 | |||
3508 | perf_mmap_free_page((unsigned long)buffer->user_page); | ||
3509 | for (i = 0; i < buffer->nr_pages; i++) | ||
3510 | perf_mmap_free_page((unsigned long)buffer->data_pages[i]); | ||
3511 | kfree(buffer); | ||
3512 | } | ||
3513 | |||
3514 | static inline int page_order(struct perf_buffer *buffer) | ||
3515 | { | ||
3516 | return 0; | ||
3517 | } | ||
3518 | |||
3519 | #else | ||
3520 | |||
3521 | /* | ||
3522 | * Back perf_mmap() with vmalloc memory. | ||
3523 | * | ||
3524 | * Required for architectures that have d-cache aliasing issues. | ||
3525 | */ | ||
3526 | |||
3527 | static inline int page_order(struct perf_buffer *buffer) | ||
3528 | { | ||
3529 | return buffer->page_order; | ||
3530 | } | ||
3531 | |||
3532 | static struct page * | ||
3533 | perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) | ||
3534 | { | ||
3535 | if (pgoff > (1UL << page_order(buffer))) | ||
3536 | return NULL; | ||
3537 | |||
3538 | return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); | ||
3539 | } | ||
3540 | |||
3541 | static void perf_mmap_unmark_page(void *addr) | ||
3542 | { | ||
3543 | struct page *page = vmalloc_to_page(addr); | ||
3544 | |||
3545 | page->mapping = NULL; | ||
3546 | } | ||
3547 | |||
3548 | static void perf_buffer_free_work(struct work_struct *work) | ||
3549 | { | ||
3550 | struct perf_buffer *buffer; | ||
3551 | void *base; | ||
3552 | int i, nr; | ||
3553 | |||
3554 | buffer = container_of(work, struct perf_buffer, work); | ||
3555 | nr = 1 << page_order(buffer); | ||
3556 | |||
3557 | base = buffer->user_page; | ||
3558 | for (i = 0; i < nr + 1; i++) | ||
3559 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); | ||
3560 | |||
3561 | vfree(base); | ||
3562 | kfree(buffer); | ||
3563 | } | ||
3564 | |||
3565 | static void perf_buffer_free(struct perf_buffer *buffer) | ||
3566 | { | ||
3567 | schedule_work(&buffer->work); | ||
3568 | } | ||
3569 | |||
3570 | static struct perf_buffer * | ||
3571 | perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) | ||
3572 | { | ||
3573 | struct perf_buffer *buffer; | ||
3574 | unsigned long size; | ||
3575 | void *all_buf; | ||
3576 | |||
3577 | size = sizeof(struct perf_buffer); | ||
3578 | size += sizeof(void *); | ||
3579 | |||
3580 | buffer = kzalloc(size, GFP_KERNEL); | ||
3581 | if (!buffer) | ||
3582 | goto fail; | ||
3583 | |||
3584 | INIT_WORK(&buffer->work, perf_buffer_free_work); | ||
3585 | |||
3586 | all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); | ||
3587 | if (!all_buf) | ||
3588 | goto fail_all_buf; | ||
3589 | |||
3590 | buffer->user_page = all_buf; | ||
3591 | buffer->data_pages[0] = all_buf + PAGE_SIZE; | ||
3592 | buffer->page_order = ilog2(nr_pages); | ||
3593 | buffer->nr_pages = 1; | ||
3594 | |||
3595 | perf_buffer_init(buffer, watermark, flags); | ||
3596 | |||
3597 | return buffer; | ||
3598 | |||
3599 | fail_all_buf: | ||
3600 | kfree(buffer); | ||
3601 | |||
3602 | fail: | ||
3603 | return NULL; | ||
3604 | } | ||
3605 | |||
3606 | #endif | ||
3607 | |||
3608 | static unsigned long perf_data_size(struct perf_buffer *buffer) | ||
3609 | { | ||
3610 | return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); | ||
3611 | } | ||
3612 | |||
3613 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 3463 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
3614 | { | 3464 | { |
3615 | struct perf_event *event = vma->vm_file->private_data; | 3465 | struct perf_event *event = vma->vm_file->private_data; |
3616 | struct perf_buffer *buffer; | 3466 | struct ring_buffer *rb; |
3617 | int ret = VM_FAULT_SIGBUS; | 3467 | int ret = VM_FAULT_SIGBUS; |
3618 | 3468 | ||
3619 | if (vmf->flags & FAULT_FLAG_MKWRITE) { | 3469 | if (vmf->flags & FAULT_FLAG_MKWRITE) { |
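None of the removed buffer code is lost: it moves, with the structure renamed, into the new kernel/events/ring_buffer.c that the Makefile hunk earlier in this diff starts building, with shared declarations in the "internal.h" now included at the top of core.c. Going by the callers that remain in this file, the old names map approximately as:

        struct perf_buffer              ->  struct ring_buffer
        event->buffer                   ->  event->rb
        perf_buffer_alloc() / _free()   ->  rb_alloc() / rb_free()
        perf_buffer_free_rcu()          ->  rb_free_rcu()
        perf_buffer_get() / _put()      ->  ring_buffer_get() / ring_buffer_put()
        PERF_BUFFER_WRITABLE            ->  RING_BUFFER_WRITABLE

perf_data_size(), perf_mmap_to_page() and the perf_output_*() helpers removed further down are presumably carried along to the new file as well.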
@@ -3623,14 +3473,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
3623 | } | 3473 | } |
3624 | 3474 | ||
3625 | rcu_read_lock(); | 3475 | rcu_read_lock(); |
3626 | buffer = rcu_dereference(event->buffer); | 3476 | rb = rcu_dereference(event->rb); |
3627 | if (!buffer) | 3477 | if (!rb) |
3628 | goto unlock; | 3478 | goto unlock; |
3629 | 3479 | ||
3630 | if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) | 3480 | if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) |
3631 | goto unlock; | 3481 | goto unlock; |
3632 | 3482 | ||
3633 | vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); | 3483 | vmf->page = perf_mmap_to_page(rb, vmf->pgoff); |
3634 | if (!vmf->page) | 3484 | if (!vmf->page) |
3635 | goto unlock; | 3485 | goto unlock; |
3636 | 3486 | ||
@@ -3645,35 +3495,35 @@ unlock: | |||
3645 | return ret; | 3495 | return ret; |
3646 | } | 3496 | } |
3647 | 3497 | ||
3648 | static void perf_buffer_free_rcu(struct rcu_head *rcu_head) | 3498 | static void rb_free_rcu(struct rcu_head *rcu_head) |
3649 | { | 3499 | { |
3650 | struct perf_buffer *buffer; | 3500 | struct ring_buffer *rb; |
3651 | 3501 | ||
3652 | buffer = container_of(rcu_head, struct perf_buffer, rcu_head); | 3502 | rb = container_of(rcu_head, struct ring_buffer, rcu_head); |
3653 | perf_buffer_free(buffer); | 3503 | rb_free(rb); |
3654 | } | 3504 | } |
3655 | 3505 | ||
3656 | static struct perf_buffer *perf_buffer_get(struct perf_event *event) | 3506 | static struct ring_buffer *ring_buffer_get(struct perf_event *event) |
3657 | { | 3507 | { |
3658 | struct perf_buffer *buffer; | 3508 | struct ring_buffer *rb; |
3659 | 3509 | ||
3660 | rcu_read_lock(); | 3510 | rcu_read_lock(); |
3661 | buffer = rcu_dereference(event->buffer); | 3511 | rb = rcu_dereference(event->rb); |
3662 | if (buffer) { | 3512 | if (rb) { |
3663 | if (!atomic_inc_not_zero(&buffer->refcount)) | 3513 | if (!atomic_inc_not_zero(&rb->refcount)) |
3664 | buffer = NULL; | 3514 | rb = NULL; |
3665 | } | 3515 | } |
3666 | rcu_read_unlock(); | 3516 | rcu_read_unlock(); |
3667 | 3517 | ||
3668 | return buffer; | 3518 | return rb; |
3669 | } | 3519 | } |
3670 | 3520 | ||
3671 | static void perf_buffer_put(struct perf_buffer *buffer) | 3521 | static void ring_buffer_put(struct ring_buffer *rb) |
3672 | { | 3522 | { |
3673 | if (!atomic_dec_and_test(&buffer->refcount)) | 3523 | if (!atomic_dec_and_test(&rb->refcount)) |
3674 | return; | 3524 | return; |
3675 | 3525 | ||
3676 | call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); | 3526 | call_rcu(&rb->rcu_head, rb_free_rcu); |
3677 | } | 3527 | } |
3678 | 3528 | ||
3679 | static void perf_mmap_open(struct vm_area_struct *vma) | 3529 | static void perf_mmap_open(struct vm_area_struct *vma) |
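ring_buffer_get()/ring_buffer_put() keep the buffer's existing lifetime rules under the new names: the event's rb pointer is RCU-published, a reader may only take a reference while the refcount is provably non-zero, and the final put frees through an RCU grace period so a reader that has already dereferenced the pointer is never pulled out from under. The same pattern in generic form (a sketch around a hypothetical struct thing, not code from this patch):

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct thing {
        atomic_t        refcount;
        struct rcu_head rcu_head;
        /* ... payload ... */
};

/* publisher attaches/detaches this with rcu_assign_pointer() */
static struct thing __rcu *published;

static struct thing *thing_get(void)
{
        struct thing *t;

        rcu_read_lock();
        t = rcu_dereference(published);
        if (t && !atomic_inc_not_zero(&t->refcount))
                t = NULL;               /* lost the race with the final put */
        rcu_read_unlock();

        return t;
}

static void thing_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct thing, rcu_head));
}

static void thing_put(struct thing *t)
{
        if (atomic_dec_and_test(&t->refcount))
                call_rcu(&t->rcu_head, thing_free_rcu);
}

perf_mmap() and perf_mmap_close() below play the publisher role, attaching and detaching event->rb with rcu_assign_pointer().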
@@ -3688,16 +3538,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
3688 | struct perf_event *event = vma->vm_file->private_data; | 3538 | struct perf_event *event = vma->vm_file->private_data; |
3689 | 3539 | ||
3690 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { | 3540 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { |
3691 | unsigned long size = perf_data_size(event->buffer); | 3541 | unsigned long size = perf_data_size(event->rb); |
3692 | struct user_struct *user = event->mmap_user; | 3542 | struct user_struct *user = event->mmap_user; |
3693 | struct perf_buffer *buffer = event->buffer; | 3543 | struct ring_buffer *rb = event->rb; |
3694 | 3544 | ||
3695 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); | 3545 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); |
3696 | vma->vm_mm->locked_vm -= event->mmap_locked; | 3546 | vma->vm_mm->locked_vm -= event->mmap_locked; |
3697 | rcu_assign_pointer(event->buffer, NULL); | 3547 | rcu_assign_pointer(event->rb, NULL); |
3698 | mutex_unlock(&event->mmap_mutex); | 3548 | mutex_unlock(&event->mmap_mutex); |
3699 | 3549 | ||
3700 | perf_buffer_put(buffer); | 3550 | ring_buffer_put(rb); |
3701 | free_uid(user); | 3551 | free_uid(user); |
3702 | } | 3552 | } |
3703 | } | 3553 | } |
@@ -3715,7 +3565,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3715 | unsigned long user_locked, user_lock_limit; | 3565 | unsigned long user_locked, user_lock_limit; |
3716 | struct user_struct *user = current_user(); | 3566 | struct user_struct *user = current_user(); |
3717 | unsigned long locked, lock_limit; | 3567 | unsigned long locked, lock_limit; |
3718 | struct perf_buffer *buffer; | 3568 | struct ring_buffer *rb; |
3719 | unsigned long vma_size; | 3569 | unsigned long vma_size; |
3720 | unsigned long nr_pages; | 3570 | unsigned long nr_pages; |
3721 | long user_extra, extra; | 3571 | long user_extra, extra; |
@@ -3724,7 +3574,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3724 | /* | 3574 | /* |
3725 | * Don't allow mmap() of inherited per-task counters. This would | 3575 | * Don't allow mmap() of inherited per-task counters. This would |
3726 | * create a performance issue due to all children writing to the | 3576 | * create a performance issue due to all children writing to the |
3727 | * same buffer. | 3577 | * same rb. |
3728 | */ | 3578 | */ |
3729 | if (event->cpu == -1 && event->attr.inherit) | 3579 | if (event->cpu == -1 && event->attr.inherit) |
3730 | return -EINVAL; | 3580 | return -EINVAL; |
@@ -3736,7 +3586,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3736 | nr_pages = (vma_size / PAGE_SIZE) - 1; | 3586 | nr_pages = (vma_size / PAGE_SIZE) - 1; |
3737 | 3587 | ||
3738 | /* | 3588 | /* |
3739 | * If we have buffer pages ensure they're a power-of-two number, so we | 3589 | * If we have rb pages ensure they're a power-of-two number, so we |
3740 | * can do bitmasks instead of modulo. | 3590 | * can do bitmasks instead of modulo. |
3741 | */ | 3591 | */ |
3742 | if (nr_pages != 0 && !is_power_of_2(nr_pages)) | 3592 | if (nr_pages != 0 && !is_power_of_2(nr_pages)) |
@@ -3750,9 +3600,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3750 | 3600 | ||
3751 | WARN_ON_ONCE(event->ctx->parent_ctx); | 3601 | WARN_ON_ONCE(event->ctx->parent_ctx); |
3752 | mutex_lock(&event->mmap_mutex); | 3602 | mutex_lock(&event->mmap_mutex); |
3753 | if (event->buffer) { | 3603 | if (event->rb) { |
3754 | if (event->buffer->nr_pages == nr_pages) | 3604 | if (event->rb->nr_pages == nr_pages) |
3755 | atomic_inc(&event->buffer->refcount); | 3605 | atomic_inc(&event->rb->refcount); |
3756 | else | 3606 | else |
3757 | ret = -EINVAL; | 3607 | ret = -EINVAL; |
3758 | goto unlock; | 3608 | goto unlock; |
@@ -3782,18 +3632,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3782 | goto unlock; | 3632 | goto unlock; |
3783 | } | 3633 | } |
3784 | 3634 | ||
3785 | WARN_ON(event->buffer); | 3635 | WARN_ON(event->rb); |
3786 | 3636 | ||
3787 | if (vma->vm_flags & VM_WRITE) | 3637 | if (vma->vm_flags & VM_WRITE) |
3788 | flags |= PERF_BUFFER_WRITABLE; | 3638 | flags |= RING_BUFFER_WRITABLE; |
3639 | |||
3640 | rb = rb_alloc(nr_pages, | ||
3641 | event->attr.watermark ? event->attr.wakeup_watermark : 0, | ||
3642 | event->cpu, flags); | ||
3789 | 3643 | ||
3790 | buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, | 3644 | if (!rb) { |
3791 | event->cpu, flags); | ||
3792 | if (!buffer) { | ||
3793 | ret = -ENOMEM; | 3645 | ret = -ENOMEM; |
3794 | goto unlock; | 3646 | goto unlock; |
3795 | } | 3647 | } |
3796 | rcu_assign_pointer(event->buffer, buffer); | 3648 | rcu_assign_pointer(event->rb, rb); |
3797 | 3649 | ||
3798 | atomic_long_add(user_extra, &user->locked_vm); | 3650 | atomic_long_add(user_extra, &user->locked_vm); |
3799 | event->mmap_locked = extra; | 3651 | event->mmap_locked = extra; |
@@ -3892,117 +3744,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) | |||
3892 | } | 3744 | } |
3893 | EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); | 3745 | EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); |
3894 | 3746 | ||
3895 | /* | ||
3896 | * Output | ||
3897 | */ | ||
3898 | static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, | ||
3899 | unsigned long offset, unsigned long head) | ||
3900 | { | ||
3901 | unsigned long mask; | ||
3902 | |||
3903 | if (!buffer->writable) | ||
3904 | return true; | ||
3905 | |||
3906 | mask = perf_data_size(buffer) - 1; | ||
3907 | |||
3908 | offset = (offset - tail) & mask; | ||
3909 | head = (head - tail) & mask; | ||
3910 | |||
3911 | if ((int)(head - offset) < 0) | ||
3912 | return false; | ||
3913 | |||
3914 | return true; | ||
3915 | } | ||
3916 | |||
3917 | static void perf_output_wakeup(struct perf_output_handle *handle) | ||
3918 | { | ||
3919 | atomic_set(&handle->buffer->poll, POLL_IN); | ||
3920 | |||
3921 | if (handle->nmi) { | ||
3922 | handle->event->pending_wakeup = 1; | ||
3923 | irq_work_queue(&handle->event->pending); | ||
3924 | } else | ||
3925 | perf_event_wakeup(handle->event); | ||
3926 | } | ||
3927 | |||
3928 | /* | ||
3929 | * We need to ensure a later event_id doesn't publish a head when a former | ||
3930 | * event isn't done writing. However since we need to deal with NMIs we | ||
3931 | * cannot fully serialize things. | ||
3932 | * | ||
3933 | * We only publish the head (and generate a wakeup) when the outer-most | ||
3934 | * event completes. | ||
3935 | */ | ||
3936 | static void perf_output_get_handle(struct perf_output_handle *handle) | ||
3937 | { | ||
3938 | struct perf_buffer *buffer = handle->buffer; | ||
3939 | |||
3940 | preempt_disable(); | ||
3941 | local_inc(&buffer->nest); | ||
3942 | handle->wakeup = local_read(&buffer->wakeup); | ||
3943 | } | ||
3944 | |||
3945 | static void perf_output_put_handle(struct perf_output_handle *handle) | ||
3946 | { | ||
3947 | struct perf_buffer *buffer = handle->buffer; | ||
3948 | unsigned long head; | ||
3949 | |||
3950 | again: | ||
3951 | head = local_read(&buffer->head); | ||
3952 | |||
3953 | /* | ||
3954 | * IRQ/NMI can happen here, which means we can miss a head update. | ||
3955 | */ | ||
3956 | |||
3957 | if (!local_dec_and_test(&buffer->nest)) | ||
3958 | goto out; | ||
3959 | |||
3960 | /* | ||
3961 | * Publish the known good head. Rely on the full barrier implied | ||
3962 | * by atomic_dec_and_test() order the buffer->head read and this | ||
3963 | * write. | ||
3964 | */ | ||
3965 | buffer->user_page->data_head = head; | ||
3966 | |||
3967 | /* | ||
3968 | * Now check if we missed an update, rely on the (compiler) | ||
3969 | * barrier in atomic_dec_and_test() to re-read buffer->head. | ||
3970 | */ | ||
3971 | if (unlikely(head != local_read(&buffer->head))) { | ||
3972 | local_inc(&buffer->nest); | ||
3973 | goto again; | ||
3974 | } | ||
3975 | |||
3976 | if (handle->wakeup != local_read(&buffer->wakeup)) | ||
3977 | perf_output_wakeup(handle); | ||
3978 | |||
3979 | out: | ||
3980 | preempt_enable(); | ||
3981 | } | ||
3982 | |||
3983 | __always_inline void perf_output_copy(struct perf_output_handle *handle, | ||
3984 | const void *buf, unsigned int len) | ||
3985 | { | ||
3986 | do { | ||
3987 | unsigned long size = min_t(unsigned long, handle->size, len); | ||
3988 | |||
3989 | memcpy(handle->addr, buf, size); | ||
3990 | |||
3991 | len -= size; | ||
3992 | handle->addr += size; | ||
3993 | buf += size; | ||
3994 | handle->size -= size; | ||
3995 | if (!handle->size) { | ||
3996 | struct perf_buffer *buffer = handle->buffer; | ||
3997 | |||
3998 | handle->page++; | ||
3999 | handle->page &= buffer->nr_pages - 1; | ||
4000 | handle->addr = buffer->data_pages[handle->page]; | ||
4001 | handle->size = PAGE_SIZE << page_order(buffer); | ||
4002 | } | ||
4003 | } while (len); | ||
4004 | } | ||
4005 | |||
4006 | static void __perf_event_header__init_id(struct perf_event_header *header, | 3747 | static void __perf_event_header__init_id(struct perf_event_header *header, |
4007 | struct perf_sample_data *data, | 3748 | struct perf_sample_data *data, |
4008 | struct perf_event *event) | 3749 | struct perf_event *event) |
@@ -4033,9 +3774,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header, | |||
4033 | } | 3774 | } |
4034 | } | 3775 | } |
4035 | 3776 | ||
4036 | static void perf_event_header__init_id(struct perf_event_header *header, | 3777 | void perf_event_header__init_id(struct perf_event_header *header, |
4037 | struct perf_sample_data *data, | 3778 | struct perf_sample_data *data, |
4038 | struct perf_event *event) | 3779 | struct perf_event *event) |
4039 | { | 3780 | { |
4040 | if (event->attr.sample_id_all) | 3781 | if (event->attr.sample_id_all) |
4041 | __perf_event_header__init_id(header, data, event); | 3782 | __perf_event_header__init_id(header, data, event); |
@@ -4062,121 +3803,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle, | |||
4062 | perf_output_put(handle, data->cpu_entry); | 3803 | perf_output_put(handle, data->cpu_entry); |
4063 | } | 3804 | } |
4064 | 3805 | ||
4065 | static void perf_event__output_id_sample(struct perf_event *event, | 3806 | void perf_event__output_id_sample(struct perf_event *event, |
4066 | struct perf_output_handle *handle, | 3807 | struct perf_output_handle *handle, |
4067 | struct perf_sample_data *sample) | 3808 | struct perf_sample_data *sample) |
4068 | { | 3809 | { |
4069 | if (event->attr.sample_id_all) | 3810 | if (event->attr.sample_id_all) |
4070 | __perf_event__output_id_sample(handle, sample); | 3811 | __perf_event__output_id_sample(handle, sample); |
4071 | } | 3812 | } |
4072 | 3813 | ||
4073 | int perf_output_begin(struct perf_output_handle *handle, | ||
4074 | struct perf_event *event, unsigned int size, | ||
4075 | int nmi, int sample) | ||
4076 | { | ||
4077 | struct perf_buffer *buffer; | ||
4078 | unsigned long tail, offset, head; | ||
4079 | int have_lost; | ||
4080 | struct perf_sample_data sample_data; | ||
4081 | struct { | ||
4082 | struct perf_event_header header; | ||
4083 | u64 id; | ||
4084 | u64 lost; | ||
4085 | } lost_event; | ||
4086 | |||
4087 | rcu_read_lock(); | ||
4088 | /* | ||
4089 | * For inherited events we send all the output towards the parent. | ||
4090 | */ | ||
4091 | if (event->parent) | ||
4092 | event = event->parent; | ||
4093 | |||
4094 | buffer = rcu_dereference(event->buffer); | ||
4095 | if (!buffer) | ||
4096 | goto out; | ||
4097 | |||
4098 | handle->buffer = buffer; | ||
4099 | handle->event = event; | ||
4100 | handle->nmi = nmi; | ||
4101 | handle->sample = sample; | ||
4102 | |||
4103 | if (!buffer->nr_pages) | ||
4104 | goto out; | ||
4105 | |||
4106 | have_lost = local_read(&buffer->lost); | ||
4107 | if (have_lost) { | ||
4108 | lost_event.header.size = sizeof(lost_event); | ||
4109 | perf_event_header__init_id(&lost_event.header, &sample_data, | ||
4110 | event); | ||
4111 | size += lost_event.header.size; | ||
4112 | } | ||
4113 | |||
4114 | perf_output_get_handle(handle); | ||
4115 | |||
4116 | do { | ||
4117 | /* | ||
4118 | * Userspace could choose to issue a mb() before updating the | ||
4119 | * tail pointer. So that all reads will be completed before the | ||
4120 | * write is issued. | ||
4121 | */ | ||
4122 | tail = ACCESS_ONCE(buffer->user_page->data_tail); | ||
4123 | smp_rmb(); | ||
4124 | offset = head = local_read(&buffer->head); | ||
4125 | head += size; | ||
4126 | if (unlikely(!perf_output_space(buffer, tail, offset, head))) | ||
4127 | goto fail; | ||
4128 | } while (local_cmpxchg(&buffer->head, offset, head) != offset); | ||
4129 | |||
4130 | if (head - local_read(&buffer->wakeup) > buffer->watermark) | ||
4131 | local_add(buffer->watermark, &buffer->wakeup); | ||
4132 | |||
4133 | handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); | ||
4134 | handle->page &= buffer->nr_pages - 1; | ||
4135 | handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); | ||
4136 | handle->addr = buffer->data_pages[handle->page]; | ||
4137 | handle->addr += handle->size; | ||
4138 | handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; | ||
4139 | |||
4140 | if (have_lost) { | ||
4141 | lost_event.header.type = PERF_RECORD_LOST; | ||
4142 | lost_event.header.misc = 0; | ||
4143 | lost_event.id = event->id; | ||
4144 | lost_event.lost = local_xchg(&buffer->lost, 0); | ||
4145 | |||
4146 | perf_output_put(handle, lost_event); | ||
4147 | perf_event__output_id_sample(event, handle, &sample_data); | ||
4148 | } | ||
4149 | |||
4150 | return 0; | ||
4151 | |||
4152 | fail: | ||
4153 | local_inc(&buffer->lost); | ||
4154 | perf_output_put_handle(handle); | ||
4155 | out: | ||
4156 | rcu_read_unlock(); | ||
4157 | |||
4158 | return -ENOSPC; | ||
4159 | } | ||
4160 | |||
4161 | void perf_output_end(struct perf_output_handle *handle) | ||
4162 | { | ||
4163 | struct perf_event *event = handle->event; | ||
4164 | struct perf_buffer *buffer = handle->buffer; | ||
4165 | |||
4166 | int wakeup_events = event->attr.wakeup_events; | ||
4167 | |||
4168 | if (handle->sample && wakeup_events) { | ||
4169 | int events = local_inc_return(&buffer->events); | ||
4170 | if (events >= wakeup_events) { | ||
4171 | local_sub(wakeup_events, &buffer->events); | ||
4172 | local_inc(&buffer->wakeup); | ||
4173 | } | ||
4174 | } | ||
4175 | |||
4176 | perf_output_put_handle(handle); | ||
4177 | rcu_read_unlock(); | ||
4178 | } | ||
4179 | |||
4180 | static void perf_output_read_one(struct perf_output_handle *handle, | 3814 | static void perf_output_read_one(struct perf_output_handle *handle, |
4181 | struct perf_event *event, | 3815 | struct perf_event *event, |
4182 | u64 enabled, u64 running) | 3816 | u64 enabled, u64 running) |
@@ -4197,7 +3831,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, | |||
4197 | if (read_format & PERF_FORMAT_ID) | 3831 | if (read_format & PERF_FORMAT_ID) |
4198 | values[n++] = primary_event_id(event); | 3832 | values[n++] = primary_event_id(event); |
4199 | 3833 | ||
4200 | perf_output_copy(handle, values, n * sizeof(u64)); | 3834 | __output_copy(handle, values, n * sizeof(u64)); |
4201 | } | 3835 | } |
4202 | 3836 | ||
4203 | /* | 3837 | /* |
@@ -4227,7 +3861,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
4227 | if (read_format & PERF_FORMAT_ID) | 3861 | if (read_format & PERF_FORMAT_ID) |
4228 | values[n++] = primary_event_id(leader); | 3862 | values[n++] = primary_event_id(leader); |
4229 | 3863 | ||
4230 | perf_output_copy(handle, values, n * sizeof(u64)); | 3864 | __output_copy(handle, values, n * sizeof(u64)); |
4231 | 3865 | ||
4232 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 3866 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { |
4233 | n = 0; | 3867 | n = 0; |
@@ -4239,7 +3873,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
4239 | if (read_format & PERF_FORMAT_ID) | 3873 | if (read_format & PERF_FORMAT_ID) |
4240 | values[n++] = primary_event_id(sub); | 3874 | values[n++] = primary_event_id(sub); |
4241 | 3875 | ||
4242 | perf_output_copy(handle, values, n * sizeof(u64)); | 3876 | __output_copy(handle, values, n * sizeof(u64)); |
4243 | } | 3877 | } |
4244 | } | 3878 | } |
4245 | 3879 | ||
@@ -4249,7 +3883,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
4249 | static void perf_output_read(struct perf_output_handle *handle, | 3883 | static void perf_output_read(struct perf_output_handle *handle, |
4250 | struct perf_event *event) | 3884 | struct perf_event *event) |
4251 | { | 3885 | { |
4252 | u64 enabled = 0, running = 0, now, ctx_time; | 3886 | u64 enabled = 0, running = 0; |
4253 | u64 read_format = event->attr.read_format; | 3887 | u64 read_format = event->attr.read_format; |
4254 | 3888 | ||
4255 | /* | 3889 | /* |
@@ -4261,12 +3895,8 @@ static void perf_output_read(struct perf_output_handle *handle, | |||
4261 | * because of locking issue as we are called in | 3895 | * because of locking issue as we are called in |
4262 | * NMI context | 3896 | * NMI context |
4263 | */ | 3897 | */ |
4264 | if (read_format & PERF_FORMAT_TOTAL_TIMES) { | 3898 | if (read_format & PERF_FORMAT_TOTAL_TIMES) |
4265 | now = perf_clock(); | 3899 | calc_timer_values(event, &enabled, &running); |
4266 | ctx_time = event->shadow_ctx_time + now; | ||
4267 | enabled = ctx_time - event->tstamp_enabled; | ||
4268 | running = ctx_time - event->tstamp_running; | ||
4269 | } | ||
4270 | 3900 | ||
4271 | if (event->attr.read_format & PERF_FORMAT_GROUP) | 3901 | if (event->attr.read_format & PERF_FORMAT_GROUP) |
4272 | perf_output_read_group(handle, event, enabled, running); | 3902 | perf_output_read_group(handle, event, enabled, running); |
@@ -4319,7 +3949,7 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4319 | 3949 | ||
4320 | size *= sizeof(u64); | 3950 | size *= sizeof(u64); |
4321 | 3951 | ||
4322 | perf_output_copy(handle, data->callchain, size); | 3952 | __output_copy(handle, data->callchain, size); |
4323 | } else { | 3953 | } else { |
4324 | u64 nr = 0; | 3954 | u64 nr = 0; |
4325 | perf_output_put(handle, nr); | 3955 | perf_output_put(handle, nr); |
@@ -4329,8 +3959,8 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4329 | if (sample_type & PERF_SAMPLE_RAW) { | 3959 | if (sample_type & PERF_SAMPLE_RAW) { |
4330 | if (data->raw) { | 3960 | if (data->raw) { |
4331 | perf_output_put(handle, data->raw->size); | 3961 | perf_output_put(handle, data->raw->size); |
4332 | perf_output_copy(handle, data->raw->data, | 3962 | __output_copy(handle, data->raw->data, |
4333 | data->raw->size); | 3963 | data->raw->size); |
4334 | } else { | 3964 | } else { |
4335 | struct { | 3965 | struct { |
4336 | u32 size; | 3966 | u32 size; |
@@ -4342,6 +3972,20 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4342 | perf_output_put(handle, raw); | 3972 | perf_output_put(handle, raw); |
4343 | } | 3973 | } |
4344 | } | 3974 | } |
3975 | |||
3976 | if (!event->attr.watermark) { | ||
3977 | int wakeup_events = event->attr.wakeup_events; | ||
3978 | |||
3979 | if (wakeup_events) { | ||
3980 | struct ring_buffer *rb = handle->rb; | ||
3981 | int events = local_inc_return(&rb->events); | ||
3982 | |||
3983 | if (events >= wakeup_events) { | ||
3984 | local_sub(wakeup_events, &rb->events); | ||
3985 | local_inc(&rb->wakeup); | ||
3986 | } | ||
3987 | } | ||
3988 | } | ||
4345 | } | 3989 | } |
4346 | 3990 | ||
4347 | void perf_prepare_sample(struct perf_event_header *header, | 3991 | void perf_prepare_sample(struct perf_event_header *header, |
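
The block added above moves the count-based wakeup bookkeeping out of perf_output_end() and into perf_output_sample(), guarded by !event->attr.watermark. A toy standalone model (not kernel code) of that arithmetic: every wakeup_events-th record advances the wakeup stamp that perf_output_put_handle() later compares against handle->wakeup.

#include <stdio.h>

int main(void)
{
    int wakeup_events = 4;   /* attr.wakeup_events         */
    int events = 0;          /* models local_t rb->events  */
    int wakeup = 0;          /* models local_t rb->wakeup  */

    for (int record = 1; record <= 10; record++) {
        if (++events >= wakeup_events) {
            events -= wakeup_events;
            wakeup++;
            printf("record %d: wake up pollers (wakeup=%d)\n", record, wakeup);
        }
    }
    return 0;
}
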
@@ -4386,7 +4030,7 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
4386 | } | 4030 | } |
4387 | } | 4031 | } |
4388 | 4032 | ||
4389 | static void perf_event_output(struct perf_event *event, int nmi, | 4033 | static void perf_event_output(struct perf_event *event, |
4390 | struct perf_sample_data *data, | 4034 | struct perf_sample_data *data, |
4391 | struct pt_regs *regs) | 4035 | struct pt_regs *regs) |
4392 | { | 4036 | { |
@@ -4398,7 +4042,7 @@ static void perf_event_output(struct perf_event *event, int nmi, | |||
4398 | 4042 | ||
4399 | perf_prepare_sample(&header, data, event, regs); | 4043 | perf_prepare_sample(&header, data, event, regs); |
4400 | 4044 | ||
4401 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) | 4045 | if (perf_output_begin(&handle, event, header.size)) |
4402 | goto exit; | 4046 | goto exit; |
4403 | 4047 | ||
4404 | perf_output_sample(&handle, &header, data, event); | 4048 | perf_output_sample(&handle, &header, data, event); |
@@ -4438,7 +4082,7 @@ perf_event_read_event(struct perf_event *event, | |||
4438 | int ret; | 4082 | int ret; |
4439 | 4083 | ||
4440 | perf_event_header__init_id(&read_event.header, &sample, event); | 4084 | perf_event_header__init_id(&read_event.header, &sample, event); |
4441 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); | 4085 | ret = perf_output_begin(&handle, event, read_event.header.size); |
4442 | if (ret) | 4086 | if (ret) |
4443 | return; | 4087 | return; |
4444 | 4088 | ||
@@ -4481,7 +4125,7 @@ static void perf_event_task_output(struct perf_event *event, | |||
4481 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); | 4125 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); |
4482 | 4126 | ||
4483 | ret = perf_output_begin(&handle, event, | 4127 | ret = perf_output_begin(&handle, event, |
4484 | task_event->event_id.header.size, 0, 0); | 4128 | task_event->event_id.header.size); |
4485 | if (ret) | 4129 | if (ret) |
4486 | goto out; | 4130 | goto out; |
4487 | 4131 | ||
@@ -4618,7 +4262,7 @@ static void perf_event_comm_output(struct perf_event *event, | |||
4618 | 4262 | ||
4619 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); | 4263 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); |
4620 | ret = perf_output_begin(&handle, event, | 4264 | ret = perf_output_begin(&handle, event, |
4621 | comm_event->event_id.header.size, 0, 0); | 4265 | comm_event->event_id.header.size); |
4622 | 4266 | ||
4623 | if (ret) | 4267 | if (ret) |
4624 | goto out; | 4268 | goto out; |
@@ -4627,7 +4271,7 @@ static void perf_event_comm_output(struct perf_event *event, | |||
4627 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); | 4271 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); |
4628 | 4272 | ||
4629 | perf_output_put(&handle, comm_event->event_id); | 4273 | perf_output_put(&handle, comm_event->event_id); |
4630 | perf_output_copy(&handle, comm_event->comm, | 4274 | __output_copy(&handle, comm_event->comm, |
4631 | comm_event->comm_size); | 4275 | comm_event->comm_size); |
4632 | 4276 | ||
4633 | perf_event__output_id_sample(event, &handle, &sample); | 4277 | perf_event__output_id_sample(event, &handle, &sample); |
@@ -4765,7 +4409,7 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4765 | 4409 | ||
4766 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | 4410 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); |
4767 | ret = perf_output_begin(&handle, event, | 4411 | ret = perf_output_begin(&handle, event, |
4768 | mmap_event->event_id.header.size, 0, 0); | 4412 | mmap_event->event_id.header.size); |
4769 | if (ret) | 4413 | if (ret) |
4770 | goto out; | 4414 | goto out; |
4771 | 4415 | ||
@@ -4773,7 +4417,7 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4773 | mmap_event->event_id.tid = perf_event_tid(event, current); | 4417 | mmap_event->event_id.tid = perf_event_tid(event, current); |
4774 | 4418 | ||
4775 | perf_output_put(&handle, mmap_event->event_id); | 4419 | perf_output_put(&handle, mmap_event->event_id); |
4776 | perf_output_copy(&handle, mmap_event->file_name, | 4420 | __output_copy(&handle, mmap_event->file_name, |
4777 | mmap_event->file_size); | 4421 | mmap_event->file_size); |
4778 | 4422 | ||
4779 | perf_event__output_id_sample(event, &handle, &sample); | 4423 | perf_event__output_id_sample(event, &handle, &sample); |
@@ -4829,7 +4473,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
4829 | 4473 | ||
4830 | if (file) { | 4474 | if (file) { |
4831 | /* | 4475 | /* |
4832 | * d_path works from the end of the buffer backwards, so we | 4476 | * d_path works from the end of the buffer backwards, so we |
4833 | * need to add enough zero bytes after the string to handle | 4477 | * need to add enough zero bytes after the string to handle |
4834 | * the 64bit alignment we do later. | 4478 | * the 64bit alignment we do later. |
4835 | */ | 4479 | */ |
@@ -4960,7 +4604,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
4960 | perf_event_header__init_id(&throttle_event.header, &sample, event); | 4604 | perf_event_header__init_id(&throttle_event.header, &sample, event); |
4961 | 4605 | ||
4962 | ret = perf_output_begin(&handle, event, | 4606 | ret = perf_output_begin(&handle, event, |
4963 | throttle_event.header.size, 1, 0); | 4607 | throttle_event.header.size); |
4964 | if (ret) | 4608 | if (ret) |
4965 | return; | 4609 | return; |
4966 | 4610 | ||
@@ -4973,7 +4617,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
4973 | * Generic event overflow handling, sampling. | 4617 | * Generic event overflow handling, sampling. |
4974 | */ | 4618 | */ |
4975 | 4619 | ||
4976 | static int __perf_event_overflow(struct perf_event *event, int nmi, | 4620 | static int __perf_event_overflow(struct perf_event *event, |
4977 | int throttle, struct perf_sample_data *data, | 4621 | int throttle, struct perf_sample_data *data, |
4978 | struct pt_regs *regs) | 4622 | struct pt_regs *regs) |
4979 | { | 4623 | { |
@@ -5016,34 +4660,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
5016 | if (events && atomic_dec_and_test(&event->event_limit)) { | 4660 | if (events && atomic_dec_and_test(&event->event_limit)) { |
5017 | ret = 1; | 4661 | ret = 1; |
5018 | event->pending_kill = POLL_HUP; | 4662 | event->pending_kill = POLL_HUP; |
5019 | if (nmi) { | 4663 | event->pending_disable = 1; |
5020 | event->pending_disable = 1; | 4664 | irq_work_queue(&event->pending); |
5021 | irq_work_queue(&event->pending); | ||
5022 | } else | ||
5023 | perf_event_disable(event); | ||
5024 | } | 4665 | } |
5025 | 4666 | ||
5026 | if (event->overflow_handler) | 4667 | if (event->overflow_handler) |
5027 | event->overflow_handler(event, nmi, data, regs); | 4668 | event->overflow_handler(event, data, regs); |
5028 | else | 4669 | else |
5029 | perf_event_output(event, nmi, data, regs); | 4670 | perf_event_output(event, data, regs); |
5030 | 4671 | ||
5031 | if (event->fasync && event->pending_kill) { | 4672 | if (event->fasync && event->pending_kill) { |
5032 | if (nmi) { | 4673 | event->pending_wakeup = 1; |
5033 | event->pending_wakeup = 1; | 4674 | irq_work_queue(&event->pending); |
5034 | irq_work_queue(&event->pending); | ||
5035 | } else | ||
5036 | perf_event_wakeup(event); | ||
5037 | } | 4675 | } |
5038 | 4676 | ||
5039 | return ret; | 4677 | return ret; |
5040 | } | 4678 | } |
5041 | 4679 | ||
5042 | int perf_event_overflow(struct perf_event *event, int nmi, | 4680 | int perf_event_overflow(struct perf_event *event, |
5043 | struct perf_sample_data *data, | 4681 | struct perf_sample_data *data, |
5044 | struct pt_regs *regs) | 4682 | struct pt_regs *regs) |
5045 | { | 4683 | { |
5046 | return __perf_event_overflow(event, nmi, 1, data, regs); | 4684 | return __perf_event_overflow(event, 1, data, regs); |
5047 | } | 4685 | } |
5048 | 4686 | ||
5049 | /* | 4687 | /* |
@@ -5092,7 +4730,7 @@ again: | |||
5092 | } | 4730 | } |
5093 | 4731 | ||
5094 | static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | 4732 | static void perf_swevent_overflow(struct perf_event *event, u64 overflow, |
5095 | int nmi, struct perf_sample_data *data, | 4733 | struct perf_sample_data *data, |
5096 | struct pt_regs *regs) | 4734 | struct pt_regs *regs) |
5097 | { | 4735 | { |
5098 | struct hw_perf_event *hwc = &event->hw; | 4736 | struct hw_perf_event *hwc = &event->hw; |
@@ -5106,7 +4744,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
5106 | return; | 4744 | return; |
5107 | 4745 | ||
5108 | for (; overflow; overflow--) { | 4746 | for (; overflow; overflow--) { |
5109 | if (__perf_event_overflow(event, nmi, throttle, | 4747 | if (__perf_event_overflow(event, throttle, |
5110 | data, regs)) { | 4748 | data, regs)) { |
5111 | /* | 4749 | /* |
5112 | * We inhibit the overflow from happening when | 4750 | * We inhibit the overflow from happening when |
@@ -5119,7 +4757,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
5119 | } | 4757 | } |
5120 | 4758 | ||
5121 | static void perf_swevent_event(struct perf_event *event, u64 nr, | 4759 | static void perf_swevent_event(struct perf_event *event, u64 nr, |
5122 | int nmi, struct perf_sample_data *data, | 4760 | struct perf_sample_data *data, |
5123 | struct pt_regs *regs) | 4761 | struct pt_regs *regs) |
5124 | { | 4762 | { |
5125 | struct hw_perf_event *hwc = &event->hw; | 4763 | struct hw_perf_event *hwc = &event->hw; |
@@ -5133,12 +4771,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, | |||
5133 | return; | 4771 | return; |
5134 | 4772 | ||
5135 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 4773 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
5136 | return perf_swevent_overflow(event, 1, nmi, data, regs); | 4774 | return perf_swevent_overflow(event, 1, data, regs); |
5137 | 4775 | ||
5138 | if (local64_add_negative(nr, &hwc->period_left)) | 4776 | if (local64_add_negative(nr, &hwc->period_left)) |
5139 | return; | 4777 | return; |
5140 | 4778 | ||
5141 | perf_swevent_overflow(event, 0, nmi, data, regs); | 4779 | perf_swevent_overflow(event, 0, data, regs); |
5142 | } | 4780 | } |
5143 | 4781 | ||
5144 | static int perf_exclude_event(struct perf_event *event, | 4782 | static int perf_exclude_event(struct perf_event *event, |
@@ -5226,7 +4864,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) | |||
5226 | } | 4864 | } |
5227 | 4865 | ||
5228 | static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | 4866 | static void do_perf_sw_event(enum perf_type_id type, u32 event_id, |
5229 | u64 nr, int nmi, | 4867 | u64 nr, |
5230 | struct perf_sample_data *data, | 4868 | struct perf_sample_data *data, |
5231 | struct pt_regs *regs) | 4869 | struct pt_regs *regs) |
5232 | { | 4870 | { |
@@ -5242,7 +4880,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
5242 | 4880 | ||
5243 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4881 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
5244 | if (perf_swevent_match(event, type, event_id, data, regs)) | 4882 | if (perf_swevent_match(event, type, event_id, data, regs)) |
5245 | perf_swevent_event(event, nr, nmi, data, regs); | 4883 | perf_swevent_event(event, nr, data, regs); |
5246 | } | 4884 | } |
5247 | end: | 4885 | end: |
5248 | rcu_read_unlock(); | 4886 | rcu_read_unlock(); |
@@ -5263,8 +4901,7 @@ inline void perf_swevent_put_recursion_context(int rctx) | |||
5263 | put_recursion_context(swhash->recursion, rctx); | 4901 | put_recursion_context(swhash->recursion, rctx); |
5264 | } | 4902 | } |
5265 | 4903 | ||
5266 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, | 4904 | void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) |
5267 | struct pt_regs *regs, u64 addr) | ||
5268 | { | 4905 | { |
5269 | struct perf_sample_data data; | 4906 | struct perf_sample_data data; |
5270 | int rctx; | 4907 | int rctx; |
@@ -5276,7 +4913,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, | |||
5276 | 4913 | ||
5277 | perf_sample_data_init(&data, addr); | 4914 | perf_sample_data_init(&data, addr); |
5278 | 4915 | ||
5279 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); | 4916 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); |
5280 | 4917 | ||
5281 | perf_swevent_put_recursion_context(rctx); | 4918 | perf_swevent_put_recursion_context(rctx); |
5282 | preempt_enable_notrace(); | 4919 | preempt_enable_notrace(); |
@@ -5524,7 +5161,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
5524 | 5161 | ||
5525 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5162 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
5526 | if (perf_tp_event_match(event, &data, regs)) | 5163 | if (perf_tp_event_match(event, &data, regs)) |
5527 | perf_swevent_event(event, count, 1, &data, regs); | 5164 | perf_swevent_event(event, count, &data, regs); |
5528 | } | 5165 | } |
5529 | 5166 | ||
5530 | perf_swevent_put_recursion_context(rctx); | 5167 | perf_swevent_put_recursion_context(rctx); |
@@ -5617,7 +5254,7 @@ void perf_bp_event(struct perf_event *bp, void *data) | |||
5617 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 5254 | perf_sample_data_init(&sample, bp->attr.bp_addr); |
5618 | 5255 | ||
5619 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | 5256 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) |
5620 | perf_swevent_event(bp, 1, 1, &sample, regs); | 5257 | perf_swevent_event(bp, 1, &sample, regs); |
5621 | } | 5258 | } |
5622 | #endif | 5259 | #endif |
5623 | 5260 | ||
@@ -5646,7 +5283,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
5646 | 5283 | ||
5647 | if (regs && !perf_exclude_event(event, regs)) { | 5284 | if (regs && !perf_exclude_event(event, regs)) { |
5648 | if (!(event->attr.exclude_idle && current->pid == 0)) | 5285 | if (!(event->attr.exclude_idle && current->pid == 0)) |
5649 | if (perf_event_overflow(event, 0, &data, regs)) | 5286 | if (perf_event_overflow(event, &data, regs)) |
5650 | ret = HRTIMER_NORESTART; | 5287 | ret = HRTIMER_NORESTART; |
5651 | } | 5288 | } |
5652 | 5289 | ||
@@ -5986,6 +5623,7 @@ free_dev: | |||
5986 | } | 5623 | } |
5987 | 5624 | ||
5988 | static struct lock_class_key cpuctx_mutex; | 5625 | static struct lock_class_key cpuctx_mutex; |
5626 | static struct lock_class_key cpuctx_lock; | ||
5989 | 5627 | ||
5990 | int perf_pmu_register(struct pmu *pmu, char *name, int type) | 5628 | int perf_pmu_register(struct pmu *pmu, char *name, int type) |
5991 | { | 5629 | { |
@@ -6036,6 +5674,7 @@ skip_type: | |||
6036 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | 5674 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
6037 | __perf_event_init_context(&cpuctx->ctx); | 5675 | __perf_event_init_context(&cpuctx->ctx); |
6038 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); | 5676 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); |
5677 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); | ||
6039 | cpuctx->ctx.type = cpu_context; | 5678 | cpuctx->ctx.type = cpu_context; |
6040 | cpuctx->ctx.pmu = pmu; | 5679 | cpuctx->ctx.pmu = pmu; |
6041 | cpuctx->jiffies_interval = 1; | 5680 | cpuctx->jiffies_interval = 1; |
@@ -6150,7 +5789,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
6150 | struct task_struct *task, | 5789 | struct task_struct *task, |
6151 | struct perf_event *group_leader, | 5790 | struct perf_event *group_leader, |
6152 | struct perf_event *parent_event, | 5791 | struct perf_event *parent_event, |
6153 | perf_overflow_handler_t overflow_handler) | 5792 | perf_overflow_handler_t overflow_handler, |
5793 | void *context) | ||
6154 | { | 5794 | { |
6155 | struct pmu *pmu; | 5795 | struct pmu *pmu; |
6156 | struct perf_event *event; | 5796 | struct perf_event *event; |
@@ -6208,10 +5848,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
6208 | #endif | 5848 | #endif |
6209 | } | 5849 | } |
6210 | 5850 | ||
6211 | if (!overflow_handler && parent_event) | 5851 | if (!overflow_handler && parent_event) { |
6212 | overflow_handler = parent_event->overflow_handler; | 5852 | overflow_handler = parent_event->overflow_handler; |
5853 | context = parent_event->overflow_handler_context; | ||
5854 | } | ||
6213 | 5855 | ||
6214 | event->overflow_handler = overflow_handler; | 5856 | event->overflow_handler = overflow_handler; |
5857 | event->overflow_handler_context = context; | ||
6215 | 5858 | ||
6216 | if (attr->disabled) | 5859 | if (attr->disabled) |
6217 | event->state = PERF_EVENT_STATE_OFF; | 5860 | event->state = PERF_EVENT_STATE_OFF; |
@@ -6326,13 +5969,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
6326 | if (ret) | 5969 | if (ret) |
6327 | return -EFAULT; | 5970 | return -EFAULT; |
6328 | 5971 | ||
6329 | /* | ||
6330 | * If the type exists, the corresponding creation will verify | ||
6331 | * the attr->config. | ||
6332 | */ | ||
6333 | if (attr->type >= PERF_TYPE_MAX) | ||
6334 | return -EINVAL; | ||
6335 | |||
6336 | if (attr->__reserved_1) | 5972 | if (attr->__reserved_1) |
6337 | return -EINVAL; | 5973 | return -EINVAL; |
6338 | 5974 | ||
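
Dropping the attr->type range check means perf_copy_attr() no longer rejects type numbers above PERF_TYPE_MAX up front; as the removed comment notes, the per-PMU creation path still validates the attribute, so only types some PMU actually claims get through. A userspace sketch (not part of the patch) of why that matters: dynamically registered PMUs publish their type id through sysfs. The /sys/bus/event_source/devices/<pmu>/type path is the conventional location, and the zero config used here is a placeholder.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>

/* Read the numeric type id a PMU exports through sysfs. */
static int read_pmu_type(const char *pmu)
{
    char path[128];
    int type = -1;
    FILE *f;

    snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/type", pmu);
    f = fopen(path, "r");
    if (!f)
        return -1;
    if (fscanf(f, "%d", &type) != 1)
        type = -1;
    fclose(f);
    return type;
}

int main(int argc, char **argv)
{
    struct perf_event_attr attr;
    int type, fd;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <pmu-name>\n", argv[0]);
        return 1;
    }

    type = read_pmu_type(argv[1]);
    if (type < 0)
        return 1;

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    attr.type = type;      /* may legitimately be >= PERF_TYPE_MAX now */
    attr.config = 0;       /* placeholder; real configs are PMU specific */

    fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    printf("pmu '%s' (type %d) -> fd %d\n", argv[1], type, fd);
    if (fd >= 0)
        close(fd);
    return 0;
}
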
@@ -6354,7 +5990,7 @@ err_size: | |||
6354 | static int | 5990 | static int |
6355 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | 5991 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) |
6356 | { | 5992 | { |
6357 | struct perf_buffer *buffer = NULL, *old_buffer = NULL; | 5993 | struct ring_buffer *rb = NULL, *old_rb = NULL; |
6358 | int ret = -EINVAL; | 5994 | int ret = -EINVAL; |
6359 | 5995 | ||
6360 | if (!output_event) | 5996 | if (!output_event) |
@@ -6371,7 +6007,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | |||
6371 | goto out; | 6007 | goto out; |
6372 | 6008 | ||
6373 | /* | 6009 | /* |
6374 | * If its not a per-cpu buffer, it must be the same task. | 6010 | * If it's not a per-cpu rb, it must be the same task. |
6375 | */ | 6011 | */ |
6376 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) | 6012 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) |
6377 | goto out; | 6013 | goto out; |
@@ -6383,20 +6019,20 @@ set: | |||
6383 | goto unlock; | 6019 | goto unlock; |
6384 | 6020 | ||
6385 | if (output_event) { | 6021 | if (output_event) { |
6386 | /* get the buffer we want to redirect to */ | 6022 | /* get the rb we want to redirect to */ |
6387 | buffer = perf_buffer_get(output_event); | 6023 | rb = ring_buffer_get(output_event); |
6388 | if (!buffer) | 6024 | if (!rb) |
6389 | goto unlock; | 6025 | goto unlock; |
6390 | } | 6026 | } |
6391 | 6027 | ||
6392 | old_buffer = event->buffer; | 6028 | old_rb = event->rb; |
6393 | rcu_assign_pointer(event->buffer, buffer); | 6029 | rcu_assign_pointer(event->rb, rb); |
6394 | ret = 0; | 6030 | ret = 0; |
6395 | unlock: | 6031 | unlock: |
6396 | mutex_unlock(&event->mmap_mutex); | 6032 | mutex_unlock(&event->mmap_mutex); |
6397 | 6033 | ||
6398 | if (old_buffer) | 6034 | if (old_rb) |
6399 | perf_buffer_put(old_buffer); | 6035 | ring_buffer_put(old_rb); |
6400 | out: | 6036 | out: |
6401 | return ret; | 6037 | return ret; |
6402 | } | 6038 | } |
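
This is the kernel half of PERF_EVENT_IOC_SET_OUTPUT: the target's ring buffer is taken with ring_buffer_get() and swapped into event->rb under mmap_mutex. A minimal userspace sketch (not part of the patch) exercises it by opening two software events in the same task, mapping only the first, and redirecting the second's records into that buffer; the event types and page count are arbitrary.

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static int open_sw_event(unsigned long long config)
{
    struct perf_event_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    attr.type = PERF_TYPE_SOFTWARE;
    attr.config = config;
    attr.sample_period = 100000;
    attr.sample_type = PERF_SAMPLE_IP;

    return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}

int main(void)
{
    long psz = sysconf(_SC_PAGESIZE);
    int fd1 = open_sw_event(PERF_COUNT_SW_CPU_CLOCK);
    int fd2 = open_sw_event(PERF_COUNT_SW_TASK_CLOCK);
    void *base;

    if (fd1 < 0 || fd2 < 0)
        return 1;

    /* Only the first event gets its own buffer: 1 control + 8 data pages. */
    base = mmap(NULL, 9 * psz, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
    if (base == MAP_FAILED)
        return 1;

    /* Redirect fd2's records into fd1's ring buffer (perf_event_set_output). */
    if (ioctl(fd2, PERF_EVENT_IOC_SET_OUTPUT, fd1) < 0)
        perror("PERF_EVENT_IOC_SET_OUTPUT");
    else
        printf("fd2 now writes into fd1's buffer\n");

    munmap(base, 9 * psz);
    close(fd1);
    close(fd2);
    return 0;
}
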
@@ -6478,7 +6114,8 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6478 | } | 6114 | } |
6479 | } | 6115 | } |
6480 | 6116 | ||
6481 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); | 6117 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, |
6118 | NULL, NULL); | ||
6482 | if (IS_ERR(event)) { | 6119 | if (IS_ERR(event)) { |
6483 | err = PTR_ERR(event); | 6120 | err = PTR_ERR(event); |
6484 | goto err_task; | 6121 | goto err_task; |
@@ -6663,7 +6300,8 @@ err_fd: | |||
6663 | struct perf_event * | 6300 | struct perf_event * |
6664 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | 6301 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, |
6665 | struct task_struct *task, | 6302 | struct task_struct *task, |
6666 | perf_overflow_handler_t overflow_handler) | 6303 | perf_overflow_handler_t overflow_handler, |
6304 | void *context) | ||
6667 | { | 6305 | { |
6668 | struct perf_event_context *ctx; | 6306 | struct perf_event_context *ctx; |
6669 | struct perf_event *event; | 6307 | struct perf_event *event; |
@@ -6673,7 +6311,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
6673 | * Get the target context (task or percpu): | 6311 | * Get the target context (task or percpu): |
6674 | */ | 6312 | */ |
6675 | 6313 | ||
6676 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); | 6314 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, |
6315 | overflow_handler, context); | ||
6677 | if (IS_ERR(event)) { | 6316 | if (IS_ERR(event)) { |
6678 | err = PTR_ERR(event); | 6317 | err = PTR_ERR(event); |
6679 | goto err; | 6318 | goto err; |
@@ -6780,7 +6419,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
6780 | * our context. | 6419 | * our context. |
6781 | */ | 6420 | */ |
6782 | child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); | 6421 | child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); |
6783 | task_ctx_sched_out(child_ctx, EVENT_ALL); | ||
6784 | 6422 | ||
6785 | /* | 6423 | /* |
6786 | * Take the context lock here so that if find_get_context is | 6424 | * Take the context lock here so that if find_get_context is |
@@ -6788,6 +6426,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
6788 | * incremented the context's refcount before we do put_ctx below. | 6426 | * incremented the context's refcount before we do put_ctx below. |
6789 | */ | 6427 | */ |
6790 | raw_spin_lock(&child_ctx->lock); | 6428 | raw_spin_lock(&child_ctx->lock); |
6429 | task_ctx_sched_out(child_ctx); | ||
6791 | child->perf_event_ctxp[ctxn] = NULL; | 6430 | child->perf_event_ctxp[ctxn] = NULL; |
6792 | /* | 6431 | /* |
6793 | * If this context is a clone; unclone it so it can't get | 6432 | * If this context is a clone; unclone it so it can't get |
@@ -6957,7 +6596,7 @@ inherit_event(struct perf_event *parent_event, | |||
6957 | parent_event->cpu, | 6596 | parent_event->cpu, |
6958 | child, | 6597 | child, |
6959 | group_leader, parent_event, | 6598 | group_leader, parent_event, |
6960 | NULL); | 6599 | NULL, NULL); |
6961 | if (IS_ERR(child_event)) | 6600 | if (IS_ERR(child_event)) |
6962 | return child_event; | 6601 | return child_event; |
6963 | get_ctx(child_ctx); | 6602 | get_ctx(child_ctx); |
@@ -6984,6 +6623,8 @@ inherit_event(struct perf_event *parent_event, | |||
6984 | 6623 | ||
6985 | child_event->ctx = child_ctx; | 6624 | child_event->ctx = child_ctx; |
6986 | child_event->overflow_handler = parent_event->overflow_handler; | 6625 | child_event->overflow_handler = parent_event->overflow_handler; |
6626 | child_event->overflow_handler_context | ||
6627 | = parent_event->overflow_handler_context; | ||
6987 | 6628 | ||
6988 | /* | 6629 | /* |
6989 | * Precalculate sample_data sizes | 6630 | * Precalculate sample_data sizes |
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 086adf25a55..b7971d6f38b 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
@@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp) | |||
431 | struct perf_event * | 431 | struct perf_event * |
432 | register_user_hw_breakpoint(struct perf_event_attr *attr, | 432 | register_user_hw_breakpoint(struct perf_event_attr *attr, |
433 | perf_overflow_handler_t triggered, | 433 | perf_overflow_handler_t triggered, |
434 | void *context, | ||
434 | struct task_struct *tsk) | 435 | struct task_struct *tsk) |
435 | { | 436 | { |
436 | return perf_event_create_kernel_counter(attr, -1, tsk, triggered); | 437 | return perf_event_create_kernel_counter(attr, -1, tsk, triggered, |
438 | context); | ||
437 | } | 439 | } |
438 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); | 440 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); |
439 | 441 | ||
@@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); | |||
502 | */ | 504 | */ |
503 | struct perf_event * __percpu * | 505 | struct perf_event * __percpu * |
504 | register_wide_hw_breakpoint(struct perf_event_attr *attr, | 506 | register_wide_hw_breakpoint(struct perf_event_attr *attr, |
505 | perf_overflow_handler_t triggered) | 507 | perf_overflow_handler_t triggered, |
508 | void *context) | ||
506 | { | 509 | { |
507 | struct perf_event * __percpu *cpu_events, **pevent, *bp; | 510 | struct perf_event * __percpu *cpu_events, **pevent, *bp; |
508 | long err; | 511 | long err; |
@@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, | |||
515 | get_online_cpus(); | 518 | get_online_cpus(); |
516 | for_each_online_cpu(cpu) { | 519 | for_each_online_cpu(cpu) { |
517 | pevent = per_cpu_ptr(cpu_events, cpu); | 520 | pevent = per_cpu_ptr(cpu_events, cpu); |
518 | bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); | 521 | bp = perf_event_create_kernel_counter(attr, cpu, NULL, |
522 | triggered, context); | ||
519 | 523 | ||
520 | *pevent = bp; | 524 | *pevent = bp; |
521 | 525 | ||
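
Both registration helpers now forward a caller-supplied context cookie into perf_event_create_kernel_counter(), matching the core.c changes above where perf_event_alloc() stores it and overflow handlers lose the nmi argument. The in-kernel sketch below is written under those assumptions: the cookie struct, handler body, and watched address are purely illustrative, and overflow_handler_context is the struct perf_event field this series introduces.

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/kernel.h>
#include <linux/sched.h>

struct bp_cookie {
    const char *tag;    /* illustrative private data */
};

static struct bp_cookie my_cookie = { .tag = "example-watchpoint" };

/* New-style perf_overflow_handler_t: the 'int nmi' argument is gone. */
static void my_bp_handler(struct perf_event *bp,
                          struct perf_sample_data *data,
                          struct pt_regs *regs)
{
    struct bp_cookie *c = bp->overflow_handler_context;

    pr_info("breakpoint '%s' hit at %lx\n", c->tag, instruction_pointer(regs));
}

static struct perf_event *watch_word(struct task_struct *tsk, unsigned long addr)
{
    struct perf_event_attr attr;

    hw_breakpoint_init(&attr);
    attr.bp_addr = addr;
    attr.bp_len  = HW_BREAKPOINT_LEN_4;
    attr.bp_type = HW_BREAKPOINT_W;

    /* The third argument is the new 'context' cookie. */
    return register_user_hw_breakpoint(&attr, my_bp_handler, &my_cookie, tsk);
}
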
diff --git a/kernel/events/internal.h b/kernel/events/internal.h new file mode 100644 index 00000000000..09097dd8116 --- /dev/null +++ b/kernel/events/internal.h | |||
@@ -0,0 +1,96 @@ | |||
1 | #ifndef _KERNEL_EVENTS_INTERNAL_H | ||
2 | #define _KERNEL_EVENTS_INTERNAL_H | ||
3 | |||
4 | #define RING_BUFFER_WRITABLE 0x01 | ||
5 | |||
6 | struct ring_buffer { | ||
7 | atomic_t refcount; | ||
8 | struct rcu_head rcu_head; | ||
9 | #ifdef CONFIG_PERF_USE_VMALLOC | ||
10 | struct work_struct work; | ||
11 | int page_order; /* allocation order */ | ||
12 | #endif | ||
13 | int nr_pages; /* nr of data pages */ | ||
14 | int writable; /* are we writable */ | ||
15 | |||
16 | atomic_t poll; /* POLL_ for wakeups */ | ||
17 | |||
18 | local_t head; /* write position */ | ||
19 | local_t nest; /* nested writers */ | ||
20 | local_t events; /* event limit */ | ||
21 | local_t wakeup; /* wakeup stamp */ | ||
22 | local_t lost; /* nr records lost */ | ||
23 | |||
24 | long watermark; /* wakeup watermark */ | ||
25 | |||
26 | struct perf_event_mmap_page *user_page; | ||
27 | void *data_pages[0]; | ||
28 | }; | ||
29 | |||
30 | extern void rb_free(struct ring_buffer *rb); | ||
31 | extern struct ring_buffer * | ||
32 | rb_alloc(int nr_pages, long watermark, int cpu, int flags); | ||
33 | extern void perf_event_wakeup(struct perf_event *event); | ||
34 | |||
35 | extern void | ||
36 | perf_event_header__init_id(struct perf_event_header *header, | ||
37 | struct perf_sample_data *data, | ||
38 | struct perf_event *event); | ||
39 | extern void | ||
40 | perf_event__output_id_sample(struct perf_event *event, | ||
41 | struct perf_output_handle *handle, | ||
42 | struct perf_sample_data *sample); | ||
43 | |||
44 | extern struct page * | ||
45 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff); | ||
46 | |||
47 | #ifdef CONFIG_PERF_USE_VMALLOC | ||
48 | /* | ||
49 | * Back perf_mmap() with vmalloc memory. | ||
50 | * | ||
51 | * Required for architectures that have d-cache aliasing issues. | ||
52 | */ | ||
53 | |||
54 | static inline int page_order(struct ring_buffer *rb) | ||
55 | { | ||
56 | return rb->page_order; | ||
57 | } | ||
58 | |||
59 | #else | ||
60 | |||
61 | static inline int page_order(struct ring_buffer *rb) | ||
62 | { | ||
63 | return 0; | ||
64 | } | ||
65 | #endif | ||
66 | |||
67 | static unsigned long perf_data_size(struct ring_buffer *rb) | ||
68 | { | ||
69 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); | ||
70 | } | ||
71 | |||
72 | static inline void | ||
73 | __output_copy(struct perf_output_handle *handle, | ||
74 | const void *buf, unsigned int len) | ||
75 | { | ||
76 | do { | ||
77 | unsigned long size = min_t(unsigned long, handle->size, len); | ||
78 | |||
79 | memcpy(handle->addr, buf, size); | ||
80 | |||
81 | len -= size; | ||
82 | handle->addr += size; | ||
83 | buf += size; | ||
84 | handle->size -= size; | ||
85 | if (!handle->size) { | ||
86 | struct ring_buffer *rb = handle->rb; | ||
87 | |||
88 | handle->page++; | ||
89 | handle->page &= rb->nr_pages - 1; | ||
90 | handle->addr = rb->data_pages[handle->page]; | ||
91 | handle->size = PAGE_SIZE << page_order(rb); | ||
92 | } | ||
93 | } while (len); | ||
94 | } | ||
95 | |||
96 | #endif /* _KERNEL_EVENTS_INTERNAL_H */ | ||
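
internal.h keeps __output_copy() as an inline so both core.c and the new ring_buffer.c can share it. The standalone model below (not kernel code) reproduces just the walk over a power-of-two array of fixed-size pages, showing how a record that does not fit in the current page spills into the next one using a mask rather than a modulo; the toy page size and page count are arbitrary.

#include <string.h>
#include <stdio.h>

#define PAGE_SZ   64u        /* toy page size          */
#define NR_PAGES  4u         /* must be a power of two */

struct handle {
    unsigned int page;       /* index into data_pages          */
    unsigned int size;       /* bytes left in the current page */
    char *addr;              /* write cursor                   */
};

static char data_pages[NR_PAGES][PAGE_SZ];

static void output_copy(struct handle *h, const void *buf, unsigned int len)
{
    while (len) {
        unsigned int chunk = len < h->size ? len : h->size;

        memcpy(h->addr, buf, chunk);
        len     -= chunk;
        buf      = (const char *)buf + chunk;
        h->addr += chunk;
        h->size -= chunk;

        if (!h->size) {                        /* crossed a page boundary */
            h->page = (h->page + 1) & (NR_PAGES - 1);
            h->addr = data_pages[h->page];
            h->size = PAGE_SZ;
        }
    }
}

int main(void)
{
    struct handle h = { 0, PAGE_SZ, data_pages[0] };
    char record[100];

    memset(record, 'A', sizeof(record));
    output_copy(&h, record, sizeof(record));   /* spans pages 0 and 1 */
    printf("cursor now in page %u, %u bytes left\n", h.page, h.size);
    return 0;
}
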
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c new file mode 100644 index 00000000000..a2a29205cc0 --- /dev/null +++ b/kernel/events/ring_buffer.c | |||
@@ -0,0 +1,380 @@ | |||
1 | /* | ||
2 | * Performance events ring-buffer code: | ||
3 | * | ||
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar | ||
6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||
8 | * | ||
9 | * For licensing details see kernel-base/COPYING | ||
10 | */ | ||
11 | |||
12 | #include <linux/perf_event.h> | ||
13 | #include <linux/vmalloc.h> | ||
14 | #include <linux/slab.h> | ||
15 | |||
16 | #include "internal.h" | ||
17 | |||
18 | static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, | ||
19 | unsigned long offset, unsigned long head) | ||
20 | { | ||
21 | unsigned long mask; | ||
22 | |||
23 | if (!rb->writable) | ||
24 | return true; | ||
25 | |||
26 | mask = perf_data_size(rb) - 1; | ||
27 | |||
28 | offset = (offset - tail) & mask; | ||
29 | head = (head - tail) & mask; | ||
30 | |||
31 | if ((int)(head - offset) < 0) | ||
32 | return false; | ||
33 | |||
34 | return true; | ||
35 | } | ||
36 | |||
37 | static void perf_output_wakeup(struct perf_output_handle *handle) | ||
38 | { | ||
39 | atomic_set(&handle->rb->poll, POLL_IN); | ||
40 | |||
41 | handle->event->pending_wakeup = 1; | ||
42 | irq_work_queue(&handle->event->pending); | ||
43 | } | ||
44 | |||
45 | /* | ||
46 | * We need to ensure a later event_id doesn't publish a head when a former | ||
47 | * event isn't done writing. However since we need to deal with NMIs we | ||
48 | * cannot fully serialize things. | ||
49 | * | ||
50 | * We only publish the head (and generate a wakeup) when the outer-most | ||
51 | * event completes. | ||
52 | */ | ||
53 | static void perf_output_get_handle(struct perf_output_handle *handle) | ||
54 | { | ||
55 | struct ring_buffer *rb = handle->rb; | ||
56 | |||
57 | preempt_disable(); | ||
58 | local_inc(&rb->nest); | ||
59 | handle->wakeup = local_read(&rb->wakeup); | ||
60 | } | ||
61 | |||
62 | static void perf_output_put_handle(struct perf_output_handle *handle) | ||
63 | { | ||
64 | struct ring_buffer *rb = handle->rb; | ||
65 | unsigned long head; | ||
66 | |||
67 | again: | ||
68 | head = local_read(&rb->head); | ||
69 | |||
70 | /* | ||
71 | * IRQ/NMI can happen here, which means we can miss a head update. | ||
72 | */ | ||
73 | |||
74 | if (!local_dec_and_test(&rb->nest)) | ||
75 | goto out; | ||
76 | |||
77 | /* | ||
78 | * Publish the known good head. Rely on the full barrier implied | ||
79 | * by atomic_dec_and_test() to order the rb->head read and this | ||
80 | * write. | ||
81 | */ | ||
82 | rb->user_page->data_head = head; | ||
83 | |||
84 | /* | ||
85 | * Now check if we missed an update, rely on the (compiler) | ||
86 | * barrier in atomic_dec_and_test() to re-read rb->head. | ||
87 | */ | ||
88 | if (unlikely(head != local_read(&rb->head))) { | ||
89 | local_inc(&rb->nest); | ||
90 | goto again; | ||
91 | } | ||
92 | |||
93 | if (handle->wakeup != local_read(&rb->wakeup)) | ||
94 | perf_output_wakeup(handle); | ||
95 | |||
96 | out: | ||
97 | preempt_enable(); | ||
98 | } | ||
99 | |||
100 | int perf_output_begin(struct perf_output_handle *handle, | ||
101 | struct perf_event *event, unsigned int size) | ||
102 | { | ||
103 | struct ring_buffer *rb; | ||
104 | unsigned long tail, offset, head; | ||
105 | int have_lost; | ||
106 | struct perf_sample_data sample_data; | ||
107 | struct { | ||
108 | struct perf_event_header header; | ||
109 | u64 id; | ||
110 | u64 lost; | ||
111 | } lost_event; | ||
112 | |||
113 | rcu_read_lock(); | ||
114 | /* | ||
115 | * For inherited events we send all the output towards the parent. | ||
116 | */ | ||
117 | if (event->parent) | ||
118 | event = event->parent; | ||
119 | |||
120 | rb = rcu_dereference(event->rb); | ||
121 | if (!rb) | ||
122 | goto out; | ||
123 | |||
124 | handle->rb = rb; | ||
125 | handle->event = event; | ||
126 | |||
127 | if (!rb->nr_pages) | ||
128 | goto out; | ||
129 | |||
130 | have_lost = local_read(&rb->lost); | ||
131 | if (have_lost) { | ||
132 | lost_event.header.size = sizeof(lost_event); | ||
133 | perf_event_header__init_id(&lost_event.header, &sample_data, | ||
134 | event); | ||
135 | size += lost_event.header.size; | ||
136 | } | ||
137 | |||
138 | perf_output_get_handle(handle); | ||
139 | |||
140 | do { | ||
141 | /* | ||
142 | * Userspace could choose to issue a mb() before updating the | ||
143 | * tail pointer, so that all reads will be completed before the | ||
144 | * write is issued. | ||
145 | */ | ||
146 | tail = ACCESS_ONCE(rb->user_page->data_tail); | ||
147 | smp_rmb(); | ||
148 | offset = head = local_read(&rb->head); | ||
149 | head += size; | ||
150 | if (unlikely(!perf_output_space(rb, tail, offset, head))) | ||
151 | goto fail; | ||
152 | } while (local_cmpxchg(&rb->head, offset, head) != offset); | ||
153 | |||
154 | if (head - local_read(&rb->wakeup) > rb->watermark) | ||
155 | local_add(rb->watermark, &rb->wakeup); | ||
156 | |||
157 | handle->page = offset >> (PAGE_SHIFT + page_order(rb)); | ||
158 | handle->page &= rb->nr_pages - 1; | ||
159 | handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); | ||
160 | handle->addr = rb->data_pages[handle->page]; | ||
161 | handle->addr += handle->size; | ||
162 | handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; | ||
163 | |||
164 | if (have_lost) { | ||
165 | lost_event.header.type = PERF_RECORD_LOST; | ||
166 | lost_event.header.misc = 0; | ||
167 | lost_event.id = event->id; | ||
168 | lost_event.lost = local_xchg(&rb->lost, 0); | ||
169 | |||
170 | perf_output_put(handle, lost_event); | ||
171 | perf_event__output_id_sample(event, handle, &sample_data); | ||
172 | } | ||
173 | |||
174 | return 0; | ||
175 | |||
176 | fail: | ||
177 | local_inc(&rb->lost); | ||
178 | perf_output_put_handle(handle); | ||
179 | out: | ||
180 | rcu_read_unlock(); | ||
181 | |||
182 | return -ENOSPC; | ||
183 | } | ||
184 | |||
185 | void perf_output_copy(struct perf_output_handle *handle, | ||
186 | const void *buf, unsigned int len) | ||
187 | { | ||
188 | __output_copy(handle, buf, len); | ||
189 | } | ||
190 | |||
191 | void perf_output_end(struct perf_output_handle *handle) | ||
192 | { | ||
193 | perf_output_put_handle(handle); | ||
194 | rcu_read_unlock(); | ||
195 | } | ||
196 | |||
197 | static void | ||
198 | ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) | ||
199 | { | ||
200 | long max_size = perf_data_size(rb); | ||
201 | |||
202 | if (watermark) | ||
203 | rb->watermark = min(max_size, watermark); | ||
204 | |||
205 | if (!rb->watermark) | ||
206 | rb->watermark = max_size / 2; | ||
207 | |||
208 | if (flags & RING_BUFFER_WRITABLE) | ||
209 | rb->writable = 1; | ||
210 | |||
211 | atomic_set(&rb->refcount, 1); | ||
212 | } | ||
213 | |||
214 | #ifndef CONFIG_PERF_USE_VMALLOC | ||
215 | |||
216 | /* | ||
217 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. | ||
218 | */ | ||
219 | |||
220 | struct page * | ||
221 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | ||
222 | { | ||
223 | if (pgoff > rb->nr_pages) | ||
224 | return NULL; | ||
225 | |||
226 | if (pgoff == 0) | ||
227 | return virt_to_page(rb->user_page); | ||
228 | |||
229 | return virt_to_page(rb->data_pages[pgoff - 1]); | ||
230 | } | ||
231 | |||
232 | static void *perf_mmap_alloc_page(int cpu) | ||
233 | { | ||
234 | struct page *page; | ||
235 | int node; | ||
236 | |||
237 | node = (cpu == -1) ? cpu : cpu_to_node(cpu); | ||
238 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | ||
239 | if (!page) | ||
240 | return NULL; | ||
241 | |||
242 | return page_address(page); | ||
243 | } | ||
244 | |||
245 | struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) | ||
246 | { | ||
247 | struct ring_buffer *rb; | ||
248 | unsigned long size; | ||
249 | int i; | ||
250 | |||
251 | size = sizeof(struct ring_buffer); | ||
252 | size += nr_pages * sizeof(void *); | ||
253 | |||
254 | rb = kzalloc(size, GFP_KERNEL); | ||
255 | if (!rb) | ||
256 | goto fail; | ||
257 | |||
258 | rb->user_page = perf_mmap_alloc_page(cpu); | ||
259 | if (!rb->user_page) | ||
260 | goto fail_user_page; | ||
261 | |||
262 | for (i = 0; i < nr_pages; i++) { | ||
263 | rb->data_pages[i] = perf_mmap_alloc_page(cpu); | ||
264 | if (!rb->data_pages[i]) | ||
265 | goto fail_data_pages; | ||
266 | } | ||
267 | |||
268 | rb->nr_pages = nr_pages; | ||
269 | |||
270 | ring_buffer_init(rb, watermark, flags); | ||
271 | |||
272 | return rb; | ||
273 | |||
274 | fail_data_pages: | ||
275 | for (i--; i >= 0; i--) | ||
276 | free_page((unsigned long)rb->data_pages[i]); | ||
277 | |||
278 | free_page((unsigned long)rb->user_page); | ||
279 | |||
280 | fail_user_page: | ||
281 | kfree(rb); | ||
282 | |||
283 | fail: | ||
284 | return NULL; | ||
285 | } | ||
286 | |||
287 | static void perf_mmap_free_page(unsigned long addr) | ||
288 | { | ||
289 | struct page *page = virt_to_page((void *)addr); | ||
290 | |||
291 | page->mapping = NULL; | ||
292 | __free_page(page); | ||
293 | } | ||
294 | |||
295 | void rb_free(struct ring_buffer *rb) | ||
296 | { | ||
297 | int i; | ||
298 | |||
299 | perf_mmap_free_page((unsigned long)rb->user_page); | ||
300 | for (i = 0; i < rb->nr_pages; i++) | ||
301 | perf_mmap_free_page((unsigned long)rb->data_pages[i]); | ||
302 | kfree(rb); | ||
303 | } | ||
304 | |||
305 | #else | ||
306 | |||
307 | struct page * | ||
308 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | ||
309 | { | ||
310 | if (pgoff > (1UL << page_order(rb))) | ||
311 | return NULL; | ||
312 | |||
313 | return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); | ||
314 | } | ||
315 | |||
316 | static void perf_mmap_unmark_page(void *addr) | ||
317 | { | ||
318 | struct page *page = vmalloc_to_page(addr); | ||
319 | |||
320 | page->mapping = NULL; | ||
321 | } | ||
322 | |||
323 | static void rb_free_work(struct work_struct *work) | ||
324 | { | ||
325 | struct ring_buffer *rb; | ||
326 | void *base; | ||
327 | int i, nr; | ||
328 | |||
329 | rb = container_of(work, struct ring_buffer, work); | ||
330 | nr = 1 << page_order(rb); | ||
331 | |||
332 | base = rb->user_page; | ||
333 | for (i = 0; i < nr + 1; i++) | ||
334 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); | ||
335 | |||
336 | vfree(base); | ||
337 | kfree(rb); | ||
338 | } | ||
339 | |||
340 | void rb_free(struct ring_buffer *rb) | ||
341 | { | ||
342 | schedule_work(&rb->work); | ||
343 | } | ||
344 | |||
345 | struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) | ||
346 | { | ||
347 | struct ring_buffer *rb; | ||
348 | unsigned long size; | ||
349 | void *all_buf; | ||
350 | |||
351 | size = sizeof(struct ring_buffer); | ||
352 | size += sizeof(void *); | ||
353 | |||
354 | rb = kzalloc(size, GFP_KERNEL); | ||
355 | if (!rb) | ||
356 | goto fail; | ||
357 | |||
358 | INIT_WORK(&rb->work, rb_free_work); | ||
359 | |||
360 | all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); | ||
361 | if (!all_buf) | ||
362 | goto fail_all_buf; | ||
363 | |||
364 | rb->user_page = all_buf; | ||
365 | rb->data_pages[0] = all_buf + PAGE_SIZE; | ||
366 | rb->page_order = ilog2(nr_pages); | ||
367 | rb->nr_pages = 1; | ||
368 | |||
369 | ring_buffer_init(rb, watermark, flags); | ||
370 | |||
371 | return rb; | ||
372 | |||
373 | fail_all_buf: | ||
374 | kfree(rb); | ||
375 | |||
376 | fail: | ||
377 | return NULL; | ||
378 | } | ||
379 | |||
380 | #endif | ||
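
perf_output_begin() claims buffer space with a cmpxchg loop on rb->head and, for writable buffers, refuses to advance past the reader's data_tail. The standalone C11 model below (not kernel code) keeps only that index arithmetic, substituting a plain atomic for local_t and a fixed 4 KiB data area for illustration; with the toy tail pinned at zero, the fourth 1 KiB record no longer fits and reserve() fails, which is where the kernel path falls through to its PERF_RECORD_LOST accounting.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BUF_SIZE 4096UL                 /* must be a power of two */

static _Atomic unsigned long head;      /* models rb->head        */
static unsigned long tail;              /* models data_tail       */

static bool output_space(unsigned long tail, unsigned long offset,
                         unsigned long new_head)
{
    unsigned long mask = BUF_SIZE - 1;

    offset   = (offset - tail) & mask;
    new_head = (new_head - tail) & mask;

    return (long)(new_head - offset) >= 0;
}

/* Returns the reserved in-buffer offset, or -1 if the record does not fit. */
static long reserve(unsigned long size)
{
    unsigned long offset, new_head;

    do {
        offset = atomic_load(&head);
        new_head = offset + size;
        if (!output_space(tail, offset, new_head))
            return -1;
    } while (!atomic_compare_exchange_weak(&head, &offset, new_head));

    return (long)(offset & (BUF_SIZE - 1));
}

int main(void)
{
    for (int i = 0; i < 6; i++)
        printf("record %d at offset %ld\n", i, reserve(1024));
    return 0;
}
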
diff --git a/kernel/exit.c b/kernel/exit.c index 64879bdff92..9d13da8a8c2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -87,7 +87,6 @@ static void __exit_signal(struct task_struct *tsk) | |||
87 | struct tty_struct *uninitialized_var(tty); | 87 | struct tty_struct *uninitialized_var(tty); |
88 | 88 | ||
89 | sighand = rcu_dereference_check(tsk->sighand, | 89 | sighand = rcu_dereference_check(tsk->sighand, |
90 | rcu_read_lock_held() || | ||
91 | lockdep_tasklist_lock_is_held()); | 90 | lockdep_tasklist_lock_is_held()); |
92 | spin_lock(&sighand->siglock); | 91 | spin_lock(&sighand->siglock); |
93 | 92 | ||
@@ -171,7 +170,6 @@ void release_task(struct task_struct * p) | |||
171 | struct task_struct *leader; | 170 | struct task_struct *leader; |
172 | int zap_leader; | 171 | int zap_leader; |
173 | repeat: | 172 | repeat: |
174 | tracehook_prepare_release_task(p); | ||
175 | /* don't need to get the RCU readlock here - the process is dead and | 173 | /* don't need to get the RCU readlock here - the process is dead and |
176 | * can't be modifying its own credentials. But shut RCU-lockdep up */ | 174 | * can't be modifying its own credentials. But shut RCU-lockdep up */ |
177 | rcu_read_lock(); | 175 | rcu_read_lock(); |
@@ -181,7 +179,7 @@ repeat: | |||
181 | proc_flush_task(p); | 179 | proc_flush_task(p); |
182 | 180 | ||
183 | write_lock_irq(&tasklist_lock); | 181 | write_lock_irq(&tasklist_lock); |
184 | tracehook_finish_release_task(p); | 182 | ptrace_release_task(p); |
185 | __exit_signal(p); | 183 | __exit_signal(p); |
186 | 184 | ||
187 | /* | 185 | /* |
@@ -192,22 +190,12 @@ repeat: | |||
192 | zap_leader = 0; | 190 | zap_leader = 0; |
193 | leader = p->group_leader; | 191 | leader = p->group_leader; |
194 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { | 192 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { |
195 | BUG_ON(task_detached(leader)); | ||
196 | do_notify_parent(leader, leader->exit_signal); | ||
197 | /* | 193 | /* |
198 | * If we were the last child thread and the leader has | 194 | * If we were the last child thread and the leader has |
199 | * exited already, and the leader's parent ignores SIGCHLD, | 195 | * exited already, and the leader's parent ignores SIGCHLD, |
200 | * then we are the one who should release the leader. | 196 | * then we are the one who should release the leader. |
201 | * | ||
202 | * do_notify_parent() will have marked it self-reaping in | ||
203 | * that case. | ||
204 | */ | ||
205 | zap_leader = task_detached(leader); | ||
206 | |||
207 | /* | ||
208 | * This maintains the invariant that release_task() | ||
209 | * only runs on a task in EXIT_DEAD, just for sanity. | ||
210 | */ | 197 | */ |
198 | zap_leader = do_notify_parent(leader, leader->exit_signal); | ||
211 | if (zap_leader) | 199 | if (zap_leader) |
212 | leader->exit_state = EXIT_DEAD; | 200 | leader->exit_state = EXIT_DEAD; |
213 | } | 201 | } |
@@ -279,18 +267,16 @@ int is_current_pgrp_orphaned(void) | |||
279 | return retval; | 267 | return retval; |
280 | } | 268 | } |
281 | 269 | ||
282 | static int has_stopped_jobs(struct pid *pgrp) | 270 | static bool has_stopped_jobs(struct pid *pgrp) |
283 | { | 271 | { |
284 | int retval = 0; | ||
285 | struct task_struct *p; | 272 | struct task_struct *p; |
286 | 273 | ||
287 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { | 274 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { |
288 | if (!task_is_stopped(p)) | 275 | if (p->signal->flags & SIGNAL_STOP_STOPPED) |
289 | continue; | 276 | return true; |
290 | retval = 1; | ||
291 | break; | ||
292 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); | 277 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); |
293 | return retval; | 278 | |
279 | return false; | ||
294 | } | 280 | } |
295 | 281 | ||
296 | /* | 282 | /* |
@@ -753,7 +739,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, | |||
753 | { | 739 | { |
754 | list_move_tail(&p->sibling, &p->real_parent->children); | 740 | list_move_tail(&p->sibling, &p->real_parent->children); |
755 | 741 | ||
756 | if (task_detached(p)) | 742 | if (p->exit_state == EXIT_DEAD) |
757 | return; | 743 | return; |
758 | /* | 744 | /* |
759 | * If this is a threaded reparent there is no need to | 745 | * If this is a threaded reparent there is no need to |
@@ -766,10 +752,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, | |||
766 | p->exit_signal = SIGCHLD; | 752 | p->exit_signal = SIGCHLD; |
767 | 753 | ||
768 | /* If it has exited notify the new parent about this child's death. */ | 754 | /* If it has exited notify the new parent about this child's death. */ |
769 | if (!task_ptrace(p) && | 755 | if (!p->ptrace && |
770 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { | 756 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { |
771 | do_notify_parent(p, p->exit_signal); | 757 | if (do_notify_parent(p, p->exit_signal)) { |
772 | if (task_detached(p)) { | ||
773 | p->exit_state = EXIT_DEAD; | 758 | p->exit_state = EXIT_DEAD; |
774 | list_move_tail(&p->sibling, dead); | 759 | list_move_tail(&p->sibling, dead); |
775 | } | 760 | } |
@@ -796,7 +781,7 @@ static void forget_original_parent(struct task_struct *father) | |||
796 | do { | 781 | do { |
797 | t->real_parent = reaper; | 782 | t->real_parent = reaper; |
798 | if (t->parent == father) { | 783 | if (t->parent == father) { |
799 | BUG_ON(task_ptrace(t)); | 784 | BUG_ON(t->ptrace); |
800 | t->parent = t->real_parent; | 785 | t->parent = t->real_parent; |
801 | } | 786 | } |
802 | if (t->pdeath_signal) | 787 | if (t->pdeath_signal) |
@@ -821,8 +806,7 @@ static void forget_original_parent(struct task_struct *father) | |||
821 | */ | 806 | */ |
822 | static void exit_notify(struct task_struct *tsk, int group_dead) | 807 | static void exit_notify(struct task_struct *tsk, int group_dead) |
823 | { | 808 | { |
824 | int signal; | 809 | bool autoreap; |
825 | void *cookie; | ||
826 | 810 | ||
827 | /* | 811 | /* |
828 | * This does two things: | 812 | * This does two things: |
@@ -853,26 +837,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
853 | * we have changed execution domain as these two values started | 837 | * we have changed execution domain as these two values started |
854 | * the same after a fork. | 838 | * the same after a fork. |
855 | */ | 839 | */ |
856 | if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && | 840 | if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && |
857 | (tsk->parent_exec_id != tsk->real_parent->self_exec_id || | 841 | (tsk->parent_exec_id != tsk->real_parent->self_exec_id || |
858 | tsk->self_exec_id != tsk->parent_exec_id)) | 842 | tsk->self_exec_id != tsk->parent_exec_id)) |
859 | tsk->exit_signal = SIGCHLD; | 843 | tsk->exit_signal = SIGCHLD; |
860 | 844 | ||
861 | signal = tracehook_notify_death(tsk, &cookie, group_dead); | 845 | if (unlikely(tsk->ptrace)) { |
862 | if (signal >= 0) | 846 | int sig = thread_group_leader(tsk) && |
863 | signal = do_notify_parent(tsk, signal); | 847 | thread_group_empty(tsk) && |
848 | !ptrace_reparented(tsk) ? | ||
849 | tsk->exit_signal : SIGCHLD; | ||
850 | autoreap = do_notify_parent(tsk, sig); | ||
851 | } else if (thread_group_leader(tsk)) { | ||
852 | autoreap = thread_group_empty(tsk) && | ||
853 | do_notify_parent(tsk, tsk->exit_signal); | ||
854 | } else { | ||
855 | autoreap = true; | ||
856 | } | ||
864 | 857 | ||
865 | tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; | 858 | tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; |
866 | 859 | ||
867 | /* mt-exec, de_thread() is waiting for group leader */ | 860 | /* mt-exec, de_thread() is waiting for group leader */ |
868 | if (unlikely(tsk->signal->notify_count < 0)) | 861 | if (unlikely(tsk->signal->notify_count < 0)) |
869 | wake_up_process(tsk->signal->group_exit_task); | 862 | wake_up_process(tsk->signal->group_exit_task); |
870 | write_unlock_irq(&tasklist_lock); | 863 | write_unlock_irq(&tasklist_lock); |
871 | 864 | ||
872 | tracehook_report_death(tsk, signal, cookie, group_dead); | ||
873 | |||
874 | /* If the process is dead, release it - nobody will wait for it */ | 865 | /* If the process is dead, release it - nobody will wait for it */ |
875 | if (signal == DEATH_REAP) | 866 | if (autoreap) |
876 | release_task(tsk); | 867 | release_task(tsk); |
877 | } | 868 | } |
878 | 869 | ||
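[Editorial illustration, not part of the patch] The reworked exit_notify() folds the old tracehook_notify_death()/DEATH_REAP dance into a single boolean: do_notify_parent() now reports whether the task should be auto-reaped, which is the case for non-leader threads and for leaders whose parent ignores SIGCHLD. The user-visible semantics are plain POSIX and can be seen with a minimal sketch like the following (nothing here is specific to this patch):

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	signal(SIGCHLD, SIG_IGN);	/* parent ignores SIGCHLD: children are auto-reaped */

	pid_t pid = fork();
	if (pid == 0)
		_exit(0);		/* child exits; kernel takes the autoreap path */

	sleep(1);			/* crude way to let the child exit and be released */
	if (waitpid(pid, NULL, 0) < 0 && errno == ECHILD)
		printf("child was auto-reaped, no zombie left behind\n");
	return 0;
}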
@@ -908,7 +899,6 @@ NORET_TYPE void do_exit(long code) | |||
908 | 899 | ||
909 | profile_task_exit(tsk); | 900 | profile_task_exit(tsk); |
910 | 901 | ||
911 | WARN_ON(atomic_read(&tsk->fs_excl)); | ||
912 | WARN_ON(blk_needs_flush_plug(tsk)); | 902 | WARN_ON(blk_needs_flush_plug(tsk)); |
913 | 903 | ||
914 | if (unlikely(in_interrupt())) | 904 | if (unlikely(in_interrupt())) |
@@ -925,7 +915,7 @@ NORET_TYPE void do_exit(long code) | |||
925 | */ | 915 | */ |
926 | set_fs(USER_DS); | 916 | set_fs(USER_DS); |
927 | 917 | ||
928 | tracehook_report_exit(&code); | 918 | ptrace_event(PTRACE_EVENT_EXIT, code); |
929 | 919 | ||
930 | validate_creds_for_do_exit(tsk); | 920 | validate_creds_for_do_exit(tsk); |
931 | 921 | ||
@@ -994,6 +984,7 @@ NORET_TYPE void do_exit(long code) | |||
994 | trace_sched_process_exit(tsk); | 984 | trace_sched_process_exit(tsk); |
995 | 985 | ||
996 | exit_sem(tsk); | 986 | exit_sem(tsk); |
987 | exit_shm(tsk); | ||
997 | exit_files(tsk); | 988 | exit_files(tsk); |
998 | exit_fs(tsk); | 989 | exit_fs(tsk); |
999 | check_stack_usage(); | 990 | check_stack_usage(); |
@@ -1239,9 +1230,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1239 | traced = ptrace_reparented(p); | 1230 | traced = ptrace_reparented(p); |
1240 | /* | 1231 | /* |
1241 | * It can be ptraced but not reparented, check | 1232 | * It can be ptraced but not reparented, check |
1242 | * !task_detached() to filter out sub-threads. | 1233 | * thread_group_leader() to filter out sub-threads. |
1243 | */ | 1234 | */ |
1244 | if (likely(!traced) && likely(!task_detached(p))) { | 1235 | if (likely(!traced) && thread_group_leader(p)) { |
1245 | struct signal_struct *psig; | 1236 | struct signal_struct *psig; |
1246 | struct signal_struct *sig; | 1237 | struct signal_struct *sig; |
1247 | unsigned long maxrss; | 1238 | unsigned long maxrss; |
@@ -1349,16 +1340,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1349 | /* We dropped tasklist, ptracer could die and untrace */ | 1340 | /* We dropped tasklist, ptracer could die and untrace */ |
1350 | ptrace_unlink(p); | 1341 | ptrace_unlink(p); |
1351 | /* | 1342 | /* |
1352 | * If this is not a detached task, notify the parent. | 1343 | * If this is not a sub-thread, notify the parent. |
1353 | * If it's still not detached after that, don't release | 1344 | * If parent wants a zombie, don't release it now. |
1354 | * it now. | ||
1355 | */ | 1345 | */ |
1356 | if (!task_detached(p)) { | 1346 | if (thread_group_leader(p) && |
1357 | do_notify_parent(p, p->exit_signal); | 1347 | !do_notify_parent(p, p->exit_signal)) { |
1358 | if (!task_detached(p)) { | 1348 | p->exit_state = EXIT_ZOMBIE; |
1359 | p->exit_state = EXIT_ZOMBIE; | 1349 | p = NULL; |
1360 | p = NULL; | ||
1361 | } | ||
1362 | } | 1350 | } |
1363 | write_unlock_irq(&tasklist_lock); | 1351 | write_unlock_irq(&tasklist_lock); |
1364 | } | 1352 | } |
@@ -1371,7 +1359,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1371 | static int *task_stopped_code(struct task_struct *p, bool ptrace) | 1359 | static int *task_stopped_code(struct task_struct *p, bool ptrace) |
1372 | { | 1360 | { |
1373 | if (ptrace) { | 1361 | if (ptrace) { |
1374 | if (task_is_stopped_or_traced(p)) | 1362 | if (task_is_stopped_or_traced(p) && |
1363 | !(p->jobctl & JOBCTL_LISTENING)) | ||
1375 | return &p->exit_code; | 1364 | return &p->exit_code; |
1376 | } else { | 1365 | } else { |
1377 | if (p->signal->flags & SIGNAL_STOP_STOPPED) | 1366 | if (p->signal->flags & SIGNAL_STOP_STOPPED) |
@@ -1557,8 +1546,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
1557 | } | 1546 | } |
1558 | 1547 | ||
1559 | /* dead body doesn't have much to contribute */ | 1548 | /* dead body doesn't have much to contribute */ |
1560 | if (p->exit_state == EXIT_DEAD) | 1549 | if (unlikely(p->exit_state == EXIT_DEAD)) { |
1550 | /* | ||
1551 | * But do not ignore this task until the tracer does | ||
1552 | * wait_task_zombie()->do_notify_parent(). | ||
1553 | */ | ||
1554 | if (likely(!ptrace) && unlikely(ptrace_reparented(p))) | ||
1555 | wo->notask_error = 0; | ||
1561 | return 0; | 1556 | return 0; |
1557 | } | ||
1562 | 1558 | ||
1563 | /* slay zombie? */ | 1559 | /* slay zombie? */ |
1564 | if (p->exit_state == EXIT_ZOMBIE) { | 1560 | if (p->exit_state == EXIT_ZOMBIE) { |
@@ -1567,7 +1563,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
1567 | * Notification and reaping will be cascaded to the real | 1563 | * Notification and reaping will be cascaded to the real |
1568 | * parent when the ptracer detaches. | 1564 | * parent when the ptracer detaches. |
1569 | */ | 1565 | */ |
1570 | if (likely(!ptrace) && unlikely(task_ptrace(p))) { | 1566 | if (likely(!ptrace) && unlikely(p->ptrace)) { |
1571 | /* it will become visible, clear notask_error */ | 1567 | /* it will become visible, clear notask_error */ |
1572 | wo->notask_error = 0; | 1568 | wo->notask_error = 0; |
1573 | return 0; | 1569 | return 0; |
@@ -1610,8 +1606,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
1610 | * own children, it should create a separate process which | 1606 | * own children, it should create a separate process which |
1611 | * takes the role of real parent. | 1607 | * takes the role of real parent. |
1612 | */ | 1608 | */ |
1613 | if (likely(!ptrace) && task_ptrace(p) && | 1609 | if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) |
1614 | same_thread_group(p->parent, p->real_parent)) | ||
1615 | return 0; | 1610 | return 0; |
1616 | 1611 | ||
1617 | /* | 1612 | /* |
diff --git a/kernel/fork.c b/kernel/fork.c index 25c6111fe3a..067992d4838 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -37,7 +37,6 @@ | |||
37 | #include <linux/swap.h> | 37 | #include <linux/swap.h> |
38 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
39 | #include <linux/jiffies.h> | 39 | #include <linux/jiffies.h> |
40 | #include <linux/tracehook.h> | ||
41 | #include <linux/futex.h> | 40 | #include <linux/futex.h> |
42 | #include <linux/compat.h> | 41 | #include <linux/compat.h> |
43 | #include <linux/kthread.h> | 42 | #include <linux/kthread.h> |
@@ -84,7 +83,7 @@ | |||
84 | * Protected counters by write_lock_irq(&tasklist_lock) | 83 | * Protected counters by write_lock_irq(&tasklist_lock) |
85 | */ | 84 | */ |
86 | unsigned long total_forks; /* Handle normal Linux uptimes. */ | 85 | unsigned long total_forks; /* Handle normal Linux uptimes. */ |
87 | int nr_threads; /* The idle threads do not count.. */ | 86 | int nr_threads; /* The idle threads do not count.. */ |
88 | 87 | ||
89 | int max_threads; /* tunable limit on nr_threads */ | 88 | int max_threads; /* tunable limit on nr_threads */ |
90 | 89 | ||
@@ -157,6 +156,9 @@ struct kmem_cache *vm_area_cachep; | |||
157 | /* SLAB cache for mm_struct structures (tsk->mm) */ | 156 | /* SLAB cache for mm_struct structures (tsk->mm) */ |
158 | static struct kmem_cache *mm_cachep; | 157 | static struct kmem_cache *mm_cachep; |
159 | 158 | ||
159 | /* Notifier list called when a task struct is freed */ | ||
160 | static ATOMIC_NOTIFIER_HEAD(task_free_notifier); | ||
161 | |||
160 | static void account_kernel_stack(struct thread_info *ti, int account) | 162 | static void account_kernel_stack(struct thread_info *ti, int account) |
161 | { | 163 | { |
162 | struct zone *zone = page_zone(virt_to_page(ti)); | 164 | struct zone *zone = page_zone(virt_to_page(ti)); |
@@ -188,6 +190,18 @@ static inline void put_signal_struct(struct signal_struct *sig) | |||
188 | free_signal_struct(sig); | 190 | free_signal_struct(sig); |
189 | } | 191 | } |
190 | 192 | ||
193 | int task_free_register(struct notifier_block *n) | ||
194 | { | ||
195 | return atomic_notifier_chain_register(&task_free_notifier, n); | ||
196 | } | ||
197 | EXPORT_SYMBOL(task_free_register); | ||
198 | |||
199 | int task_free_unregister(struct notifier_block *n) | ||
200 | { | ||
201 | return atomic_notifier_chain_unregister(&task_free_notifier, n); | ||
202 | } | ||
203 | EXPORT_SYMBOL(task_free_unregister); | ||
204 | |||
191 | void __put_task_struct(struct task_struct *tsk) | 205 | void __put_task_struct(struct task_struct *tsk) |
192 | { | 206 | { |
193 | WARN_ON(!tsk->exit_state); | 207 | WARN_ON(!tsk->exit_state); |
@@ -199,6 +213,7 @@ void __put_task_struct(struct task_struct *tsk) | |||
199 | delayacct_tsk_free(tsk); | 213 | delayacct_tsk_free(tsk); |
200 | put_signal_struct(tsk->signal); | 214 | put_signal_struct(tsk->signal); |
201 | 215 | ||
216 | atomic_notifier_call_chain(&task_free_notifier, 0, tsk); | ||
202 | if (!profile_handoff_task(tsk)) | 217 | if (!profile_handoff_task(tsk)) |
203 | free_task(tsk); | 218 | free_task(tsk); |
204 | } | 219 | } |
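[Editorial illustration, not part of the patch] The new task_free_notifier chain lets other kernel code hook __put_task_struct(); the chain passes the task being freed as the data pointer and a zero action. A minimal sketch of a consumer module, with invented names, assuming the usual atomic-notifier rules (the callback may run in atomic context, so it must not sleep):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/sched.h>

/* Hypothetical consumer of the task-free notifier chain added above. */
static int example_task_free(struct notifier_block *nb, unsigned long action,
			     void *data)
{
	struct task_struct *tsk = data;		/* task being freed */

	pr_debug("freeing task pid=%d comm=%s\n", tsk->pid, tsk->comm);
	return NOTIFY_OK;
}

static struct notifier_block example_task_free_nb = {
	.notifier_call = example_task_free,
};

static int __init example_init(void)
{
	return task_free_register(&example_task_free_nb);
}

static void __exit example_exit(void)
{
	task_free_unregister(&example_task_free_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");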
@@ -237,7 +252,7 @@ void __init fork_init(unsigned long mempages) | |||
237 | /* | 252 | /* |
238 | * we need to allow at least 20 threads to boot a system | 253 | * we need to allow at least 20 threads to boot a system |
239 | */ | 254 | */ |
240 | if(max_threads < 20) | 255 | if (max_threads < 20) |
241 | max_threads = 20; | 256 | max_threads = 20; |
242 | 257 | ||
243 | init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; | 258 | init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; |
@@ -273,7 +288,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
273 | return NULL; | 288 | return NULL; |
274 | } | 289 | } |
275 | 290 | ||
276 | err = arch_dup_task_struct(tsk, orig); | 291 | err = arch_dup_task_struct(tsk, orig); |
277 | if (err) | 292 | if (err) |
278 | goto out; | 293 | goto out; |
279 | 294 | ||
@@ -296,9 +311,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
296 | tsk->stack_canary = get_random_int(); | 311 | tsk->stack_canary = get_random_int(); |
297 | #endif | 312 | #endif |
298 | 313 | ||
299 | /* One for us, one for whoever does the "release_task()" (usually parent) */ | 314 | /* |
300 | atomic_set(&tsk->usage,2); | 315 | * One for us, one for whoever does the "release_task()" (usually |
301 | atomic_set(&tsk->fs_excl, 0); | 316 | * parent) |
317 | */ | ||
318 | atomic_set(&tsk->usage, 2); | ||
302 | #ifdef CONFIG_BLK_DEV_IO_TRACE | 319 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
303 | tsk->btrace_seq = 0; | 320 | tsk->btrace_seq = 0; |
304 | #endif | 321 | #endif |
@@ -446,7 +463,7 @@ fail_nomem: | |||
446 | goto out; | 463 | goto out; |
447 | } | 464 | } |
448 | 465 | ||
449 | static inline int mm_alloc_pgd(struct mm_struct * mm) | 466 | static inline int mm_alloc_pgd(struct mm_struct *mm) |
450 | { | 467 | { |
451 | mm->pgd = pgd_alloc(mm); | 468 | mm->pgd = pgd_alloc(mm); |
452 | if (unlikely(!mm->pgd)) | 469 | if (unlikely(!mm->pgd)) |
@@ -454,7 +471,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm) | |||
454 | return 0; | 471 | return 0; |
455 | } | 472 | } |
456 | 473 | ||
457 | static inline void mm_free_pgd(struct mm_struct * mm) | 474 | static inline void mm_free_pgd(struct mm_struct *mm) |
458 | { | 475 | { |
459 | pgd_free(mm, mm->pgd); | 476 | pgd_free(mm, mm->pgd); |
460 | } | 477 | } |
@@ -491,7 +508,7 @@ static void mm_init_aio(struct mm_struct *mm) | |||
491 | #endif | 508 | #endif |
492 | } | 509 | } |
493 | 510 | ||
494 | static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | 511 | static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) |
495 | { | 512 | { |
496 | atomic_set(&mm->mm_users, 1); | 513 | atomic_set(&mm->mm_users, 1); |
497 | atomic_set(&mm->mm_count, 1); | 514 | atomic_set(&mm->mm_count, 1); |
@@ -522,9 +539,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
522 | /* | 539 | /* |
523 | * Allocate and initialize an mm_struct. | 540 | * Allocate and initialize an mm_struct. |
524 | */ | 541 | */ |
525 | struct mm_struct * mm_alloc(void) | 542 | struct mm_struct *mm_alloc(void) |
526 | { | 543 | { |
527 | struct mm_struct * mm; | 544 | struct mm_struct *mm; |
528 | 545 | ||
529 | mm = allocate_mm(); | 546 | mm = allocate_mm(); |
530 | if (!mm) | 547 | if (!mm) |
@@ -592,7 +609,7 @@ void added_exe_file_vma(struct mm_struct *mm) | |||
592 | void removed_exe_file_vma(struct mm_struct *mm) | 609 | void removed_exe_file_vma(struct mm_struct *mm) |
593 | { | 610 | { |
594 | mm->num_exe_file_vmas--; | 611 | mm->num_exe_file_vmas--; |
595 | if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ | 612 | if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { |
596 | fput(mm->exe_file); | 613 | fput(mm->exe_file); |
597 | mm->exe_file = NULL; | 614 | mm->exe_file = NULL; |
598 | } | 615 | } |
@@ -784,9 +801,9 @@ fail_nocontext: | |||
784 | return NULL; | 801 | return NULL; |
785 | } | 802 | } |
786 | 803 | ||
787 | static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | 804 | static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) |
788 | { | 805 | { |
789 | struct mm_struct * mm, *oldmm; | 806 | struct mm_struct *mm, *oldmm; |
790 | int retval; | 807 | int retval; |
791 | 808 | ||
792 | tsk->min_flt = tsk->maj_flt = 0; | 809 | tsk->min_flt = tsk->maj_flt = 0; |
@@ -853,7 +870,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) | |||
853 | return 0; | 870 | return 0; |
854 | } | 871 | } |
855 | 872 | ||
856 | static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | 873 | static int copy_files(unsigned long clone_flags, struct task_struct *tsk) |
857 | { | 874 | { |
858 | struct files_struct *oldf, *newf; | 875 | struct files_struct *oldf, *newf; |
859 | int error = 0; | 876 | int error = 0; |
@@ -1020,7 +1037,7 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
1020 | { | 1037 | { |
1021 | raw_spin_lock_init(&p->pi_lock); | 1038 | raw_spin_lock_init(&p->pi_lock); |
1022 | #ifdef CONFIG_RT_MUTEXES | 1039 | #ifdef CONFIG_RT_MUTEXES |
1023 | plist_head_init_raw(&p->pi_waiters, &p->pi_lock); | 1040 | plist_head_init(&p->pi_waiters); |
1024 | p->pi_blocked_on = NULL; | 1041 | p->pi_blocked_on = NULL; |
1025 | #endif | 1042 | #endif |
1026 | } | 1043 | } |
@@ -1117,6 +1134,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1117 | p->real_cred->user != INIT_USER) | 1134 | p->real_cred->user != INIT_USER) |
1118 | goto bad_fork_free; | 1135 | goto bad_fork_free; |
1119 | } | 1136 | } |
1137 | current->flags &= ~PF_NPROC_EXCEEDED; | ||
1120 | 1138 | ||
1121 | retval = copy_creds(p, clone_flags); | 1139 | retval = copy_creds(p, clone_flags); |
1122 | if (retval < 0) | 1140 | if (retval < 0) |
@@ -1175,13 +1193,17 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1175 | cgroup_fork(p); | 1193 | cgroup_fork(p); |
1176 | #ifdef CONFIG_NUMA | 1194 | #ifdef CONFIG_NUMA |
1177 | p->mempolicy = mpol_dup(p->mempolicy); | 1195 | p->mempolicy = mpol_dup(p->mempolicy); |
1178 | if (IS_ERR(p->mempolicy)) { | 1196 | if (IS_ERR(p->mempolicy)) { |
1179 | retval = PTR_ERR(p->mempolicy); | 1197 | retval = PTR_ERR(p->mempolicy); |
1180 | p->mempolicy = NULL; | 1198 | p->mempolicy = NULL; |
1181 | goto bad_fork_cleanup_cgroup; | 1199 | goto bad_fork_cleanup_cgroup; |
1182 | } | 1200 | } |
1183 | mpol_fix_fork_child_flag(p); | 1201 | mpol_fix_fork_child_flag(p); |
1184 | #endif | 1202 | #endif |
1203 | #ifdef CONFIG_CPUSETS | ||
1204 | p->cpuset_mem_spread_rotor = NUMA_NO_NODE; | ||
1205 | p->cpuset_slab_spread_rotor = NUMA_NO_NODE; | ||
1206 | #endif | ||
1185 | #ifdef CONFIG_TRACE_IRQFLAGS | 1207 | #ifdef CONFIG_TRACE_IRQFLAGS |
1186 | p->irq_events = 0; | 1208 | p->irq_events = 0; |
1187 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 1209 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
@@ -1221,25 +1243,33 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1221 | retval = perf_event_init_task(p); | 1243 | retval = perf_event_init_task(p); |
1222 | if (retval) | 1244 | if (retval) |
1223 | goto bad_fork_cleanup_policy; | 1245 | goto bad_fork_cleanup_policy; |
1224 | 1246 | retval = audit_alloc(p); | |
1225 | if ((retval = audit_alloc(p))) | 1247 | if (retval) |
1226 | goto bad_fork_cleanup_policy; | 1248 | goto bad_fork_cleanup_policy; |
1227 | /* copy all the process information */ | 1249 | /* copy all the process information */ |
1228 | if ((retval = copy_semundo(clone_flags, p))) | 1250 | retval = copy_semundo(clone_flags, p); |
1251 | if (retval) | ||
1229 | goto bad_fork_cleanup_audit; | 1252 | goto bad_fork_cleanup_audit; |
1230 | if ((retval = copy_files(clone_flags, p))) | 1253 | retval = copy_files(clone_flags, p); |
1254 | if (retval) | ||
1231 | goto bad_fork_cleanup_semundo; | 1255 | goto bad_fork_cleanup_semundo; |
1232 | if ((retval = copy_fs(clone_flags, p))) | 1256 | retval = copy_fs(clone_flags, p); |
1257 | if (retval) | ||
1233 | goto bad_fork_cleanup_files; | 1258 | goto bad_fork_cleanup_files; |
1234 | if ((retval = copy_sighand(clone_flags, p))) | 1259 | retval = copy_sighand(clone_flags, p); |
1260 | if (retval) | ||
1235 | goto bad_fork_cleanup_fs; | 1261 | goto bad_fork_cleanup_fs; |
1236 | if ((retval = copy_signal(clone_flags, p))) | 1262 | retval = copy_signal(clone_flags, p); |
1263 | if (retval) | ||
1237 | goto bad_fork_cleanup_sighand; | 1264 | goto bad_fork_cleanup_sighand; |
1238 | if ((retval = copy_mm(clone_flags, p))) | 1265 | retval = copy_mm(clone_flags, p); |
1266 | if (retval) | ||
1239 | goto bad_fork_cleanup_signal; | 1267 | goto bad_fork_cleanup_signal; |
1240 | if ((retval = copy_namespaces(clone_flags, p))) | 1268 | retval = copy_namespaces(clone_flags, p); |
1269 | if (retval) | ||
1241 | goto bad_fork_cleanup_mm; | 1270 | goto bad_fork_cleanup_mm; |
1242 | if ((retval = copy_io(clone_flags, p))) | 1271 | retval = copy_io(clone_flags, p); |
1272 | if (retval) | ||
1243 | goto bad_fork_cleanup_namespaces; | 1273 | goto bad_fork_cleanup_namespaces; |
1244 | retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); | 1274 | retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); |
1245 | if (retval) | 1275 | if (retval) |
@@ -1261,7 +1291,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1261 | /* | 1291 | /* |
1262 | * Clear TID on mm_release()? | 1292 | * Clear TID on mm_release()? |
1263 | */ | 1293 | */ |
1264 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; | 1294 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; |
1265 | #ifdef CONFIG_BLOCK | 1295 | #ifdef CONFIG_BLOCK |
1266 | p->plug = NULL; | 1296 | p->plug = NULL; |
1267 | #endif | 1297 | #endif |
@@ -1329,7 +1359,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1329 | * its process group. | 1359 | * its process group. |
1330 | * A fatal signal pending means that current will exit, so the new | 1360 | * A fatal signal pending means that current will exit, so the new |
1331 | * thread can't slip out of an OOM kill (or normal SIGKILL). | 1361 | * thread can't slip out of an OOM kill (or normal SIGKILL). |
1332 | */ | 1362 | */ |
1333 | recalc_sigpending(); | 1363 | recalc_sigpending(); |
1334 | if (signal_pending(current)) { | 1364 | if (signal_pending(current)) { |
1335 | spin_unlock(¤t->sighand->siglock); | 1365 | spin_unlock(¤t->sighand->siglock); |
@@ -1347,7 +1377,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1347 | } | 1377 | } |
1348 | 1378 | ||
1349 | if (likely(p->pid)) { | 1379 | if (likely(p->pid)) { |
1350 | tracehook_finish_clone(p, clone_flags, trace); | 1380 | ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); |
1351 | 1381 | ||
1352 | if (thread_group_leader(p)) { | 1382 | if (thread_group_leader(p)) { |
1353 | if (is_child_reaper(pid)) | 1383 | if (is_child_reaper(pid)) |
@@ -1488,10 +1518,22 @@ long do_fork(unsigned long clone_flags, | |||
1488 | } | 1518 | } |
1489 | 1519 | ||
1490 | /* | 1520 | /* |
1491 | * When called from kernel_thread, don't do user tracing stuff. | 1521 | * Determine whether and which event to report to ptracer. When |
1522 | * called from kernel_thread or CLONE_UNTRACED is explicitly | ||
1523 | * requested, no event is reported; otherwise, report if the event | ||
1524 | * for the type of forking is enabled. | ||
1492 | */ | 1525 | */ |
1493 | if (likely(user_mode(regs))) | 1526 | if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { |
1494 | trace = tracehook_prepare_clone(clone_flags); | 1527 | if (clone_flags & CLONE_VFORK) |
1528 | trace = PTRACE_EVENT_VFORK; | ||
1529 | else if ((clone_flags & CSIGNAL) != SIGCHLD) | ||
1530 | trace = PTRACE_EVENT_CLONE; | ||
1531 | else | ||
1532 | trace = PTRACE_EVENT_FORK; | ||
1533 | |||
1534 | if (likely(!ptrace_event_enabled(current, trace))) | ||
1535 | trace = 0; | ||
1536 | } | ||
1495 | 1537 | ||
1496 | p = copy_process(clone_flags, stack_start, regs, stack_size, | 1538 | p = copy_process(clone_flags, stack_start, regs, stack_size, |
1497 | child_tidptr, NULL, trace); | 1539 | child_tidptr, NULL, trace); |
@@ -1515,26 +1557,26 @@ long do_fork(unsigned long clone_flags, | |||
1515 | } | 1557 | } |
1516 | 1558 | ||
1517 | audit_finish_fork(p); | 1559 | audit_finish_fork(p); |
1518 | tracehook_report_clone(regs, clone_flags, nr, p); | ||
1519 | 1560 | ||
1520 | /* | 1561 | /* |
1521 | * We set PF_STARTING at creation in case tracing wants to | 1562 | * We set PF_STARTING at creation in case tracing wants to |
1522 | * use this to distinguish a fully live task from one that | 1563 | * use this to distinguish a fully live task from one that |
1523 | * hasn't gotten to tracehook_report_clone() yet. Now we | 1564 | * hasn't finished SIGSTOP raising yet. Now we clear it |
1524 | * clear it and set the child going. | 1565 | * and set the child going. |
1525 | */ | 1566 | */ |
1526 | p->flags &= ~PF_STARTING; | 1567 | p->flags &= ~PF_STARTING; |
1527 | 1568 | ||
1528 | wake_up_new_task(p); | 1569 | wake_up_new_task(p); |
1529 | 1570 | ||
1530 | tracehook_report_clone_complete(trace, regs, | 1571 | /* forking complete and child started to run, tell ptracer */ |
1531 | clone_flags, nr, p); | 1572 | if (unlikely(trace)) |
1573 | ptrace_event(trace, nr); | ||
1532 | 1574 | ||
1533 | if (clone_flags & CLONE_VFORK) { | 1575 | if (clone_flags & CLONE_VFORK) { |
1534 | freezer_do_not_count(); | 1576 | freezer_do_not_count(); |
1535 | wait_for_completion(&vfork); | 1577 | wait_for_completion(&vfork); |
1536 | freezer_count(); | 1578 | freezer_count(); |
1537 | tracehook_report_vfork_done(p, nr); | 1579 | ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); |
1538 | } | 1580 | } |
1539 | } else { | 1581 | } else { |
1540 | nr = PTR_ERR(p); | 1582 | nr = PTR_ERR(p); |
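[Editorial illustration, not part of the patch] do_fork() now decides the ptrace event itself (PTRACE_EVENT_FORK, PTRACE_EVENT_VFORK or PTRACE_EVENT_CLONE) and reports it through ptrace_event() once the child is running. A small tracer that requests these events and decodes the event stop looks roughly as follows; on older libcs the PTRACE_O_*/PTRACE_EVENT_* constants may come from <linux/ptrace.h> rather than <sys/ptrace.h>:

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);			/* wait for the tracer to set options */
		fork();				/* reported as PTRACE_EVENT_FORK */
		_exit(0);
	}

	int status;
	waitpid(child, &status, 0);		/* initial SIGSTOP */
	ptrace(PTRACE_SETOPTIONS, child, NULL, (void *)PTRACE_O_TRACEFORK);
	ptrace(PTRACE_CONT, child, NULL, NULL);

	waitpid(child, &status, 0);		/* event stop raised by the child's fork() */
	if (WIFSTOPPED(status) &&
	    status >> 8 == (SIGTRAP | (PTRACE_EVENT_FORK << 8)))
		printf("tracee %d hit PTRACE_EVENT_FORK\n", child);

	ptrace(PTRACE_CONT, child, NULL, NULL);	/* let the child run to exit */
	waitpid(child, &status, 0);
	return 0;
}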
@@ -1581,6 +1623,7 @@ void __init proc_caches_init(void) | |||
1581 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | 1623 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); |
1582 | vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); | 1624 | vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); |
1583 | mmap_init(); | 1625 | mmap_init(); |
1626 | nsproxy_cache_init(); | ||
1584 | } | 1627 | } |
1585 | 1628 | ||
1586 | /* | 1629 | /* |
@@ -1677,12 +1720,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1677 | */ | 1720 | */ |
1678 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) | 1721 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) |
1679 | do_sysvsem = 1; | 1722 | do_sysvsem = 1; |
1680 | if ((err = unshare_fs(unshare_flags, &new_fs))) | 1723 | err = unshare_fs(unshare_flags, &new_fs); |
1724 | if (err) | ||
1681 | goto bad_unshare_out; | 1725 | goto bad_unshare_out; |
1682 | if ((err = unshare_fd(unshare_flags, &new_fd))) | 1726 | err = unshare_fd(unshare_flags, &new_fd); |
1727 | if (err) | ||
1683 | goto bad_unshare_cleanup_fs; | 1728 | goto bad_unshare_cleanup_fs; |
1684 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, | 1729 | err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); |
1685 | new_fs))) | 1730 | if (err) |
1686 | goto bad_unshare_cleanup_fd; | 1731 | goto bad_unshare_cleanup_fd; |
1687 | 1732 | ||
1688 | if (new_fs || new_fd || do_sysvsem || new_nsproxy) { | 1733 | if (new_fs || new_fd || do_sysvsem || new_nsproxy) { |
diff --git a/kernel/futex.c b/kernel/futex.c index fe28dc282ea..e6160fa842e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -218,6 +218,8 @@ static void drop_futex_key_refs(union futex_key *key) | |||
218 | * @uaddr: virtual address of the futex | 218 | * @uaddr: virtual address of the futex |
219 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED | 219 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED |
220 | * @key: address where result is stored. | 220 | * @key: address where result is stored. |
221 | * @rw: mapping needs to be read/write (values: VERIFY_READ, | ||
222 | * VERIFY_WRITE) | ||
221 | * | 223 | * |
222 | * Returns a negative error code or 0 | 224 | * Returns a negative error code or 0 |
223 | * The key words are stored in *key on success. | 225 | * The key words are stored in *key on success. |
@@ -229,12 +231,12 @@ static void drop_futex_key_refs(union futex_key *key) | |||
229 | * lock_page() might sleep, the caller should not hold a spinlock. | 231 | * lock_page() might sleep, the caller should not hold a spinlock. |
230 | */ | 232 | */ |
231 | static int | 233 | static int |
232 | get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) | 234 | get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) |
233 | { | 235 | { |
234 | unsigned long address = (unsigned long)uaddr; | 236 | unsigned long address = (unsigned long)uaddr; |
235 | struct mm_struct *mm = current->mm; | 237 | struct mm_struct *mm = current->mm; |
236 | struct page *page, *page_head; | 238 | struct page *page, *page_head; |
237 | int err; | 239 | int err, ro = 0; |
238 | 240 | ||
239 | /* | 241 | /* |
240 | * The futex address must be "naturally" aligned. | 242 | * The futex address must be "naturally" aligned. |
@@ -262,8 +264,18 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) | |||
262 | 264 | ||
263 | again: | 265 | again: |
264 | err = get_user_pages_fast(address, 1, 1, &page); | 266 | err = get_user_pages_fast(address, 1, 1, &page); |
267 | /* | ||
268 | * If write access is not required (eg. FUTEX_WAIT), try | ||
269 | * and get read-only access. | ||
270 | */ | ||
271 | if (err == -EFAULT && rw == VERIFY_READ) { | ||
272 | err = get_user_pages_fast(address, 1, 0, &page); | ||
273 | ro = 1; | ||
274 | } | ||
265 | if (err < 0) | 275 | if (err < 0) |
266 | return err; | 276 | return err; |
277 | else | ||
278 | err = 0; | ||
267 | 279 | ||
268 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 280 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
269 | page_head = page; | 281 | page_head = page; |
@@ -302,10 +314,29 @@ again: | |||
302 | #endif | 314 | #endif |
303 | 315 | ||
304 | lock_page(page_head); | 316 | lock_page(page_head); |
317 | |||
318 | /* | ||
319 | * If page_head->mapping is NULL, then it cannot be a PageAnon | ||
320 | * page; but it might be the ZERO_PAGE or in the gate area or | ||
321 | * in a special mapping (all cases which we are happy to fail); | ||
322 | * or it may have been a good file page when get_user_pages_fast | ||
323 | * found it, but truncated or holepunched or subjected to | ||
324 | * invalidate_complete_page2 before we got the page lock (also | ||
325 | * cases which we are happy to fail). And we hold a reference, | ||
326 | * so refcount care in invalidate_complete_page's remove_mapping | ||
327 | * prevents drop_caches from setting mapping to NULL beneath us. | ||
328 | * | ||
329 | * The case we do have to guard against is when memory pressure made | ||
330 | * shmem_writepage move it from filecache to swapcache beneath us: | ||
331 | * an unlikely race, but we do need to retry for page_head->mapping. | ||
332 | */ | ||
305 | if (!page_head->mapping) { | 333 | if (!page_head->mapping) { |
334 | int shmem_swizzled = PageSwapCache(page_head); | ||
306 | unlock_page(page_head); | 335 | unlock_page(page_head); |
307 | put_page(page_head); | 336 | put_page(page_head); |
308 | goto again; | 337 | if (shmem_swizzled) |
338 | goto again; | ||
339 | return -EFAULT; | ||
309 | } | 340 | } |
310 | 341 | ||
311 | /* | 342 | /* |
@@ -316,6 +347,15 @@ again: | |||
316 | * the object not the particular process. | 347 | * the object not the particular process. |
317 | */ | 348 | */ |
318 | if (PageAnon(page_head)) { | 349 | if (PageAnon(page_head)) { |
350 | /* | ||
351 | * A RO anonymous page will never change and thus doesn't make | ||
352 | * sense for futex operations. | ||
353 | */ | ||
354 | if (ro) { | ||
355 | err = -EFAULT; | ||
356 | goto out; | ||
357 | } | ||
358 | |||
319 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ | 359 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ |
320 | key->private.mm = mm; | 360 | key->private.mm = mm; |
321 | key->private.address = address; | 361 | key->private.address = address; |
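[Editorial illustration, not part of the patch] With the new rw argument, FUTEX_WAIT first attempts a writable fast GUP and falls back to read-only access on failure; the read-only path is then rejected for anonymous pages, since, as the comment above puts it, a read-only anonymous page will never change and makes no sense for futex operations. A rough userspace check of the resulting behaviour (the scratch file name is made up): waiting on a read-only file-backed mapping times out normally, waiting on a read-only anonymous page fails with EFAULT.

#include <errno.h>
#include <fcntl.h>
#include <linux/futex.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static long futex_wait(void *uaddr, unsigned int val, struct timespec *to)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAIT, val, to, NULL, 0);
}

int main(void)
{
	struct timespec to = { .tv_sec = 0, .tv_nsec = 10 * 1000 * 1000 };
	unsigned int zero = 0;

	int fd = open("/tmp/futex-demo", O_CREAT | O_RDWR, 0644);	/* scratch file */
	write(fd, &zero, sizeof(zero));
	unsigned int *ro_file = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);

	unsigned int *ro_anon = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
				     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	*ro_anon = 0;				/* fault in a real anonymous page */
	mprotect(ro_anon, 4096, PROT_READ);	/* then drop write permission */

	if (futex_wait(ro_file, 0, &to) < 0)
		printf("RO file mapping:  %s (expect ETIMEDOUT)\n", strerror(errno));
	if (futex_wait(ro_anon, 0, &to) < 0)
		printf("RO anonymous map: %s (expect EFAULT)\n", strerror(errno));
	return 0;
}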
@@ -327,9 +367,10 @@ again: | |||
327 | 367 | ||
328 | get_futex_key_refs(key); | 368 | get_futex_key_refs(key); |
329 | 369 | ||
370 | out: | ||
330 | unlock_page(page_head); | 371 | unlock_page(page_head); |
331 | put_page(page_head); | 372 | put_page(page_head); |
332 | return 0; | 373 | return err; |
333 | } | 374 | } |
334 | 375 | ||
335 | static inline void put_futex_key(union futex_key *key) | 376 | static inline void put_futex_key(union futex_key *key) |
@@ -355,8 +396,8 @@ static int fault_in_user_writeable(u32 __user *uaddr) | |||
355 | int ret; | 396 | int ret; |
356 | 397 | ||
357 | down_read(&mm->mmap_sem); | 398 | down_read(&mm->mmap_sem); |
358 | ret = get_user_pages(current, mm, (unsigned long)uaddr, | 399 | ret = fixup_user_fault(current, mm, (unsigned long)uaddr, |
359 | 1, 1, 0, NULL, NULL); | 400 | FAULT_FLAG_WRITE); |
360 | up_read(&mm->mmap_sem); | 401 | up_read(&mm->mmap_sem); |
361 | 402 | ||
362 | return ret < 0 ? ret : 0; | 403 | return ret < 0 ? ret : 0; |
@@ -940,7 +981,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | |||
940 | if (!bitset) | 981 | if (!bitset) |
941 | return -EINVAL; | 982 | return -EINVAL; |
942 | 983 | ||
943 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); | 984 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); |
944 | if (unlikely(ret != 0)) | 985 | if (unlikely(ret != 0)) |
945 | goto out; | 986 | goto out; |
946 | 987 | ||
@@ -986,10 +1027,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, | |||
986 | int ret, op_ret; | 1027 | int ret, op_ret; |
987 | 1028 | ||
988 | retry: | 1029 | retry: |
989 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); | 1030 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); |
990 | if (unlikely(ret != 0)) | 1031 | if (unlikely(ret != 0)) |
991 | goto out; | 1032 | goto out; |
992 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); | 1033 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); |
993 | if (unlikely(ret != 0)) | 1034 | if (unlikely(ret != 0)) |
994 | goto out_put_key1; | 1035 | goto out_put_key1; |
995 | 1036 | ||
@@ -1243,10 +1284,11 @@ retry: | |||
1243 | pi_state = NULL; | 1284 | pi_state = NULL; |
1244 | } | 1285 | } |
1245 | 1286 | ||
1246 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); | 1287 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); |
1247 | if (unlikely(ret != 0)) | 1288 | if (unlikely(ret != 0)) |
1248 | goto out; | 1289 | goto out; |
1249 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); | 1290 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, |
1291 | requeue_pi ? VERIFY_WRITE : VERIFY_READ); | ||
1250 | if (unlikely(ret != 0)) | 1292 | if (unlikely(ret != 0)) |
1251 | goto out_put_key1; | 1293 | goto out_put_key1; |
1252 | 1294 | ||
@@ -1790,7 +1832,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, | |||
1790 | * while the syscall executes. | 1832 | * while the syscall executes. |
1791 | */ | 1833 | */ |
1792 | retry: | 1834 | retry: |
1793 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); | 1835 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); |
1794 | if (unlikely(ret != 0)) | 1836 | if (unlikely(ret != 0)) |
1795 | return ret; | 1837 | return ret; |
1796 | 1838 | ||
@@ -1941,7 +1983,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, | |||
1941 | } | 1983 | } |
1942 | 1984 | ||
1943 | retry: | 1985 | retry: |
1944 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); | 1986 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); |
1945 | if (unlikely(ret != 0)) | 1987 | if (unlikely(ret != 0)) |
1946 | goto out; | 1988 | goto out; |
1947 | 1989 | ||
@@ -2060,7 +2102,7 @@ retry: | |||
2060 | if ((uval & FUTEX_TID_MASK) != vpid) | 2102 | if ((uval & FUTEX_TID_MASK) != vpid) |
2061 | return -EPERM; | 2103 | return -EPERM; |
2062 | 2104 | ||
2063 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); | 2105 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); |
2064 | if (unlikely(ret != 0)) | 2106 | if (unlikely(ret != 0)) |
2065 | goto out; | 2107 | goto out; |
2066 | 2108 | ||
@@ -2249,7 +2291,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2249 | debug_rt_mutex_init_waiter(&rt_waiter); | 2291 | debug_rt_mutex_init_waiter(&rt_waiter); |
2250 | rt_waiter.task = NULL; | 2292 | rt_waiter.task = NULL; |
2251 | 2293 | ||
2252 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); | 2294 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); |
2253 | if (unlikely(ret != 0)) | 2295 | if (unlikely(ret != 0)) |
2254 | goto out; | 2296 | goto out; |
2255 | 2297 | ||
@@ -2697,7 +2739,7 @@ static int __init futex_init(void) | |||
2697 | futex_cmpxchg_enabled = 1; | 2739 | futex_cmpxchg_enabled = 1; |
2698 | 2740 | ||
2699 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { | 2741 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { |
2700 | plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); | 2742 | plist_head_init(&futex_queues[i].chain); |
2701 | spin_lock_init(&futex_queues[i].lock); | 2743 | spin_lock_init(&futex_queues[i].lock); |
2702 | } | 2744 | } |
2703 | 2745 | ||
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 5bf924d80b5..824b741925b 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
@@ -3,7 +3,7 @@ menu "GCOV-based kernel profiling" | |||
3 | config GCOV_KERNEL | 3 | config GCOV_KERNEL |
4 | bool "Enable gcov-based kernel profiling" | 4 | bool "Enable gcov-based kernel profiling" |
5 | depends on DEBUG_FS | 5 | depends on DEBUG_FS |
6 | select CONSTRUCTORS | 6 | select CONSTRUCTORS if !UML |
7 | default n | 7 | default n |
8 | ---help--- | 8 | ---help--- |
9 | This option enables gcov-based code profiling (e.g. for code coverage | 9 | This option enables gcov-based code profiling (e.g. for code coverage |
@@ -35,7 +35,7 @@ config GCOV_KERNEL | |||
35 | config GCOV_PROFILE_ALL | 35 | config GCOV_PROFILE_ALL |
36 | bool "Profile entire Kernel" | 36 | bool "Profile entire Kernel" |
37 | depends on GCOV_KERNEL | 37 | depends on GCOV_KERNEL |
38 | depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE | 38 | depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE || ARM |
39 | default n | 39 | default n |
40 | ---help--- | 40 | ---help--- |
41 | This options activates profiling for the entire kernel. | 41 | This options activates profiling for the entire kernel. |
@@ -46,4 +46,10 @@ config GCOV_PROFILE_ALL | |||
46 | larger and run slower. Also be sure to exclude files from profiling | 46 | larger and run slower. Also be sure to exclude files from profiling |
47 | which are not linked to the kernel image to prevent linker errors. | 47 | which are not linked to the kernel image to prevent linker errors. |
48 | 48 | ||
49 | config GCOV_CTORS | ||
50 | string | ||
51 | depends on CONSTRUCTORS | ||
52 | default ".init_array" if ARM && AEABI | ||
53 | default ".ctors" | ||
54 | |||
49 | endmenu | 55 | endmenu |
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index ae5bb426003..d753d1152b7 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c | |||
@@ -297,16 +297,30 @@ void gcov_iter_start(struct gcov_iterator *iter) | |||
297 | } | 297 | } |
298 | 298 | ||
299 | /* Mapping of logical record number to actual file content. */ | 299 | /* Mapping of logical record number to actual file content. */ |
300 | #define RECORD_FILE_MAGIC 0 | 300 | #define RECORD_FILE_MAGIC 0 |
301 | #define RECORD_GCOV_VERSION 1 | 301 | #define RECORD_GCOV_VERSION 1 |
302 | #define RECORD_TIME_STAMP 2 | 302 | #define RECORD_TIME_STAMP 2 |
303 | #define RECORD_FUNCTION_TAG 3 | 303 | #define RECORD_FUNCTION_TAG 3 |
304 | #define RECORD_FUNCTON_TAG_LEN 4 | 304 | #define RECORD_FUNCTON_TAG_LEN 4 |
305 | #define RECORD_FUNCTION_IDENT 5 | 305 | #define RECORD_FUNCTION_IDENT 5 |
306 | #define RECORD_FUNCTION_CHECK 6 | 306 | #define RECORD_FUNCTION_CHECK_LINE 6 |
307 | #define RECORD_COUNT_TAG 7 | 307 | #define RECORD_FUNCTION_CHECK_CFG 7 |
308 | #define RECORD_COUNT_LEN 8 | 308 | #define RECORD_FUNCTION_NAME_LEN 8 |
309 | #define RECORD_COUNT 9 | 309 | #define RECORD_FUNCTION_NAME 9 |
310 | #define RECORD_COUNT_TAG 10 | ||
311 | #define RECORD_COUNT_LEN 11 | ||
312 | #define RECORD_COUNT 12 | ||
313 | |||
314 | /* Return length of string encoded in GCOV format. */ | ||
315 | static size_t | ||
316 | sizeof_str(const char *str) | ||
317 | { | ||
318 | size_t len; | ||
319 | len = (str) ? strlen(str) : 0; | ||
320 | if (len == 0) | ||
321 | return 1; | ||
322 | return 1 + ((len + 4) >> 2); | ||
323 | } | ||
310 | 324 | ||
311 | /** | 325 | /** |
312 | * gcov_iter_next - advance file iterator to next logical record | 326 | * gcov_iter_next - advance file iterator to next logical record |
@@ -323,6 +337,9 @@ int gcov_iter_next(struct gcov_iterator *iter) | |||
323 | case RECORD_FUNCTON_TAG_LEN: | 337 | case RECORD_FUNCTON_TAG_LEN: |
324 | case RECORD_FUNCTION_IDENT: | 338 | case RECORD_FUNCTION_IDENT: |
325 | case RECORD_COUNT_TAG: | 339 | case RECORD_COUNT_TAG: |
340 | case RECORD_FUNCTION_CHECK_LINE: | ||
341 | case RECORD_FUNCTION_CHECK_CFG: | ||
342 | case RECORD_FUNCTION_NAME_LEN: | ||
326 | /* Advance to next record */ | 343 | /* Advance to next record */ |
327 | iter->record++; | 344 | iter->record++; |
328 | break; | 345 | break; |
@@ -332,7 +349,7 @@ int gcov_iter_next(struct gcov_iterator *iter) | |||
332 | /* fall through */ | 349 | /* fall through */ |
333 | case RECORD_COUNT_LEN: | 350 | case RECORD_COUNT_LEN: |
334 | if (iter->count < get_func(iter)->n_ctrs[iter->type]) { | 351 | if (iter->count < get_func(iter)->n_ctrs[iter->type]) { |
335 | iter->record = 9; | 352 | iter->record = 12; |
336 | break; | 353 | break; |
337 | } | 354 | } |
338 | /* Advance to next counter type */ | 355 | /* Advance to next counter type */ |
@@ -340,9 +357,9 @@ int gcov_iter_next(struct gcov_iterator *iter) | |||
340 | iter->count = 0; | 357 | iter->count = 0; |
341 | iter->type++; | 358 | iter->type++; |
342 | /* fall through */ | 359 | /* fall through */ |
343 | case RECORD_FUNCTION_CHECK: | 360 | case RECORD_FUNCTION_NAME: |
344 | if (iter->type < iter->num_types) { | 361 | if (iter->type < iter->num_types) { |
345 | iter->record = 7; | 362 | iter->record = 10; |
346 | break; | 363 | break; |
347 | } | 364 | } |
348 | /* Advance to next function */ | 365 | /* Advance to next function */ |
@@ -395,6 +412,34 @@ static int seq_write_gcov_u64(struct seq_file *seq, u64 v) | |||
395 | data[1] = (v >> 32); | 412 | data[1] = (v >> 32); |
396 | return seq_write(seq, data, sizeof(data)); | 413 | return seq_write(seq, data, sizeof(data)); |
397 | } | 414 | } |
415 | /** | ||
416 | * seq_write_gcov_str - write string in gcov format to seq_file | ||
417 | * @seq: seq_file handle | ||
418 | * @str: string to be stored | ||
419 | * | ||
420 | * String format defined by gcc: the string data is written padded with | ||
421 | * NUL bytes up to a multiple of 4 bytes (at least one NUL is always | ||
422 | * appended); the corresponding length, in 32 bit words, is emitted | ||
423 | * separately by the RECORD_FUNCTION_NAME_LEN record. | ||
424 | */ | ||
425 | static int seq_write_gcov_str(struct seq_file *seq, const char *str) | ||
426 | { | ||
427 | if (str) { | ||
428 | size_t len; | ||
429 | int str_off; | ||
430 | u32 data; | ||
431 | len = strlen(str); | ||
432 | for (str_off = 0; str_off < (sizeof_str(str) - 2) ; str_off++) { | ||
433 | memcpy(&data, (str + str_off * 4), 4); | ||
434 | seq_write(seq, &data, sizeof(data)); | ||
435 | } | ||
436 | data = 0; | ||
437 | memcpy(&data, (str + str_off * 4), (len - str_off * 4)); | ||
438 | return seq_write(seq, &data, sizeof(data)); | ||
439 | } else { | ||
440 | return 0; | ||
441 | } | ||
442 | } | ||
398 | 443 | ||
399 | /** | 444 | /** |
400 | * gcov_iter_write - write data for current pos to seq_file | 445 | * gcov_iter_write - write data for current pos to seq_file |
@@ -421,13 +466,24 @@ int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) | |||
421 | rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); | 466 | rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); |
422 | break; | 467 | break; |
423 | case RECORD_FUNCTON_TAG_LEN: | 468 | case RECORD_FUNCTON_TAG_LEN: |
424 | rc = seq_write_gcov_u32(seq, 2); | 469 | rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION_LENGTH + |
470 | (sizeof_str(get_func(iter)->name))); | ||
425 | break; | 471 | break; |
426 | case RECORD_FUNCTION_IDENT: | 472 | case RECORD_FUNCTION_IDENT: |
427 | rc = seq_write_gcov_u32(seq, get_func(iter)->ident); | 473 | rc = seq_write_gcov_u32(seq, get_func(iter)->ident); |
428 | break; | 474 | break; |
429 | case RECORD_FUNCTION_CHECK: | 475 | case RECORD_FUNCTION_CHECK_LINE: |
430 | rc = seq_write_gcov_u32(seq, get_func(iter)->checksum); | 476 | rc = seq_write_gcov_u32(seq, get_func(iter)->lineno_checksum); |
477 | break; | ||
478 | case RECORD_FUNCTION_CHECK_CFG: | ||
479 | rc = seq_write_gcov_u32(seq, get_func(iter)->cfg_checksum); | ||
480 | break; | ||
481 | case RECORD_FUNCTION_NAME_LEN: | ||
482 | rc = seq_write_gcov_u32(seq, | ||
483 | (sizeof_str(get_func(iter)->name) - 1)); | ||
484 | break; | ||
485 | case RECORD_FUNCTION_NAME: | ||
486 | rc = seq_write_gcov_str(seq, get_func(iter)->name); | ||
431 | break; | 487 | break; |
432 | case RECORD_COUNT_TAG: | 488 | case RECORD_COUNT_TAG: |
433 | rc = seq_write_gcov_u32(seq, | 489 | rc = seq_write_gcov_u32(seq, |
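[Editorial illustration, not part of the patch] sizeof_str() above returns the size of a gcov string record in 32 bit words: one word for the length plus the string padded with at least one NUL byte up to a word boundary. That is why RECORD_FUNCTION_NAME_LEN emits sizeof_str(name) - 1 and seq_write_gcov_str() writes exactly that many words. A tiny standalone sketch of the arithmetic, mirroring the helper:

#include <stdio.h>
#include <string.h>

/* Same formula as the kernel helper: length word + NUL-padded payload words. */
static size_t sizeof_str(const char *str)
{
	size_t len = str ? strlen(str) : 0;

	if (len == 0)
		return 1;
	return 1 + ((len + 4) >> 2);
}

int main(void)
{
	const char *name = "main";

	/* "main" -> 3 words total: 1 length word + 2 words holding "main\0\0\0\0" */
	printf("%s: %zu words (%zu payload words)\n",
	       name, sizeof_str(name), sizeof_str(name) - 1);
	return 0;
}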
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 060073ebf7a..040c6980df0 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h | |||
@@ -21,9 +21,10 @@ | |||
21 | * gcc and need to be kept as close to the original definition as possible to | 21 | * gcc and need to be kept as close to the original definition as possible to |
22 | * remain compatible. | 22 | * remain compatible. |
23 | */ | 23 | */ |
24 | #define GCOV_COUNTERS 5 | 24 | #define GCOV_COUNTERS 10 |
25 | #define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) | 25 | #define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) |
26 | #define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) | 26 | #define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) |
27 | #define GCOV_TAG_FUNCTION_LENGTH 3 | ||
27 | #define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) | 28 | #define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) |
28 | #define GCOV_TAG_FOR_COUNTER(count) \ | 29 | #define GCOV_TAG_FOR_COUNTER(count) \ |
29 | (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17)) | 30 | (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17)) |
@@ -34,10 +35,38 @@ typedef long gcov_type; | |||
34 | typedef long long gcov_type; | 35 | typedef long long gcov_type; |
35 | #endif | 36 | #endif |
36 | 37 | ||
38 | /* | ||
39 | * Source module info. The data structure is used in both runtime and | ||
40 | * profile-use phase. | ||
41 | */ | ||
42 | struct gcov_module_info { | ||
43 | unsigned int ident; | ||
44 | /* | ||
45 | * This is overloaded to mean two things: | ||
46 | * (1) means FDO/LIPO in instrumented binary. | ||
47 | * (2) means IS_PRIMARY in persistent file or memory copy used in profile-use. | ||
48 | */ | ||
49 | unsigned int is_primary; | ||
50 | unsigned int is_exported; | ||
51 | unsigned int lang; | ||
52 | char *da_filename; | ||
53 | char *source_filename; | ||
54 | unsigned int num_quote_paths; | ||
55 | unsigned int num_bracket_paths; | ||
56 | unsigned int num_cpp_defines; | ||
57 | unsigned int num_cpp_includes; | ||
58 | unsigned int num_cl_args; | ||
59 | char *string_array[1]; | ||
60 | }; | ||
61 | |||
62 | |||
37 | /** | 63 | /** |
38 | * struct gcov_fn_info - profiling meta data per function | 64 | * struct gcov_fn_info - profiling meta data per function |
39 | * @ident: object file-unique function identifier | 65 | * @ident: object file-unique function identifier |
40 | * @checksum: function checksum | 66 | * @lineno_checksum: function lineno checksum |
67 | * @cfg_checksum: function cfg checksum | ||
68 | * @dc_offset: direct call offset | ||
69 | * @name: function name | ||
41 | * @n_ctrs: number of values per counter type belonging to this function | 70 | * @n_ctrs: number of values per counter type belonging to this function |
42 | * | 71 | * |
43 | * This data is generated by gcc during compilation and doesn't change | 72 | * This data is generated by gcc during compilation and doesn't change |
@@ -45,7 +74,10 @@ typedef long long gcov_type; | |||
45 | */ | 74 | */ |
46 | struct gcov_fn_info { | 75 | struct gcov_fn_info { |
47 | unsigned int ident; | 76 | unsigned int ident; |
48 | unsigned int checksum; | 77 | unsigned int lineno_checksum; |
78 | unsigned int cfg_checksum; | ||
79 | unsigned int dc_offset; | ||
80 | const char *name; | ||
49 | unsigned int n_ctrs[0]; | 81 | unsigned int n_ctrs[0]; |
50 | }; | 82 | }; |
51 | 83 | ||
@@ -67,9 +99,11 @@ struct gcov_ctr_info { | |||
67 | /** | 99 | /** |
68 | * struct gcov_info - profiling data per object file | 100 | * struct gcov_info - profiling data per object file |
69 | * @version: gcov version magic indicating the gcc version used for compilation | 101 | * @version: gcov version magic indicating the gcc version used for compilation |
102 | * @modinfo: additional module information | ||
70 | * @next: list head for a singly-linked list | 103 | * @next: list head for a singly-linked list |
71 | * @stamp: time stamp | 104 | * @stamp: time stamp |
72 | * @filename: name of the associated gcov data file | 105 | * @filename: name of the associated gcov data file |
106 | * @eof_pos: end position of profile data | ||
73 | * @n_functions: number of instrumented functions | 107 | * @n_functions: number of instrumented functions |
74 | * @functions: function data | 108 | * @functions: function data |
75 | * @ctr_mask: mask specifying which counter types are active | 109 | * @ctr_mask: mask specifying which counter types are active |
@@ -80,9 +114,11 @@ struct gcov_ctr_info { | |||
80 | */ | 114 | */ |
81 | struct gcov_info { | 115 | struct gcov_info { |
82 | unsigned int version; | 116 | unsigned int version; |
117 | struct gcov_module_info *mod_info; | ||
83 | struct gcov_info *next; | 118 | struct gcov_info *next; |
84 | unsigned int stamp; | 119 | unsigned int stamp; |
85 | const char *filename; | 120 | const char *filename; |
121 | unsigned int eof_pos; | ||
86 | unsigned int n_functions; | 122 | unsigned int n_functions; |
87 | const struct gcov_fn_info *functions; | 123 | const struct gcov_fn_info *functions; |
88 | unsigned int ctr_mask; | 124 | unsigned int ctr_mask; |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 11e89690382..2391745f656 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -887,10 +887,13 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
887 | struct hrtimer_clock_base *base, | 887 | struct hrtimer_clock_base *base, |
888 | unsigned long newstate, int reprogram) | 888 | unsigned long newstate, int reprogram) |
889 | { | 889 | { |
890 | struct timerqueue_node *next_timer; | ||
890 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) | 891 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) |
891 | goto out; | 892 | goto out; |
892 | 893 | ||
893 | if (&timer->node == timerqueue_getnext(&base->active)) { | 894 | next_timer = timerqueue_getnext(&base->active); |
895 | timerqueue_del(&base->active, &timer->node); | ||
896 | if (&timer->node == next_timer) { | ||
894 | #ifdef CONFIG_HIGH_RES_TIMERS | 897 | #ifdef CONFIG_HIGH_RES_TIMERS |
895 | /* Reprogram the clock event device. if enabled */ | 898 | /* Reprogram the clock event device. if enabled */ |
896 | if (reprogram && hrtimer_hres_active()) { | 899 | if (reprogram && hrtimer_hres_active()) { |
@@ -903,7 +906,6 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
903 | } | 906 | } |
904 | #endif | 907 | #endif |
905 | } | 908 | } |
906 | timerqueue_del(&base->active, &timer->node); | ||
907 | if (!timerqueue_getnext(&base->active)) | 909 | if (!timerqueue_getnext(&base->active)) |
908 | base->cpu_base->active_bases &= ~(1 << base->index); | 910 | base->cpu_base->active_bases &= ~(1 << base->index); |
909 | out: | 911 | out: |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ea640120ab8..e972276f12f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -74,11 +74,17 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
74 | 74 | ||
75 | /* | 75 | /* |
76 | * Ensure the task is not frozen. | 76 | * Ensure the task is not frozen. |
77 | * Also, when a freshly created task is scheduled once, changes | 77 | * Also, skip vfork and any other user process that freezer should skip. |
78 | * its state to TASK_UNINTERRUPTIBLE without having ever been | ||
79 | * switched out once, it musn't be checked. | ||
80 | */ | 78 | */ |
81 | if (unlikely(t->flags & PF_FROZEN || !switch_count)) | 79 | if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) |
80 | return; | ||
81 | |||
82 | /* | ||
83 | * When a freshly created task is scheduled once, changes its state to | ||
84 | * TASK_UNINTERRUPTIBLE without having ever been switched out once, it | ||
85 | * mustn't be checked. | ||
86 | */ | ||
87 | if (unlikely(!switch_count)) | ||
82 | return; | 88 | return; |
83 | 89 | ||
84 | if (switch_count != t->last_switch_count) { | 90 | if (switch_count != t->last_switch_count) { |
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d1d051b38e0..5a38bf4de64 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
@@ -52,6 +52,10 @@ config IRQ_EDGE_EOI_HANDLER | |||
52 | config GENERIC_IRQ_CHIP | 52 | config GENERIC_IRQ_CHIP |
53 | bool | 53 | bool |
54 | 54 | ||
55 | # Generic irq_domain hw <--> linux irq number translation | ||
56 | config IRQ_DOMAIN | ||
57 | bool | ||
58 | |||
55 | # Support forced irq threading | 59 | # Support forced irq threading |
56 | config IRQ_FORCED_THREADING | 60 | config IRQ_FORCED_THREADING |
57 | bool | 61 | bool |
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 73290056cfb..fff17381f0a 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
@@ -2,6 +2,7 @@ | |||
2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o | 2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o |
3 | obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o | 3 | obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o |
4 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 4 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
5 | obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o | ||
5 | obj-$(CONFIG_PROC_FS) += proc.o | 6 | obj-$(CONFIG_PROC_FS) += proc.o |
6 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 7 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
7 | obj-$(CONFIG_PM_SLEEP) += pm.o | 8 | obj-$(CONFIG_PM_SLEEP) += pm.o |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d5a3009da71..dc5114b4c16 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -178,7 +178,7 @@ void irq_shutdown(struct irq_desc *desc) | |||
178 | desc->depth = 1; | 178 | desc->depth = 1; |
179 | if (desc->irq_data.chip->irq_shutdown) | 179 | if (desc->irq_data.chip->irq_shutdown) |
180 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | 180 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); |
181 | if (desc->irq_data.chip->irq_disable) | 181 | else if (desc->irq_data.chip->irq_disable) |
182 | desc->irq_data.chip->irq_disable(&desc->irq_data); | 182 | desc->irq_data.chip->irq_disable(&desc->irq_data); |
183 | else | 183 | else |
184 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 184 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
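The one-word chip.c fix turns three independent calls into a proper fallback chain: prefer ->irq_shutdown, else ->irq_disable, else ->irq_mask. A hedged userspace sketch of that optional-callback pattern (struct and function names are invented for the example):

#include <stdio.h>

struct chip_ops {
        void (*shutdown)(void);
        void (*disable)(void);
        void (*mask)(void);
};

static void do_mask(void)    { puts("mask"); }
static void do_disable(void) { puts("disable"); }

static void shutdown_line(const struct chip_ops *ops)
{
        if (ops->shutdown)
                ops->shutdown();
        else if (ops->disable)          /* exactly one of the three runs */
                ops->disable();
        else
                ops->mask();
}

int main(void)
{
        struct chip_ops only_mask   = { NULL, NULL, do_mask };
        struct chip_ops has_disable = { NULL, do_disable, do_mask };

        shutdown_line(&only_mask);      /* prints "mask" */
        shutdown_line(&has_disable);    /* prints "disable" */
        return 0;
}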
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1ef4ffcdfa5..bd8e788d71e 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c | |||
@@ -87,8 +87,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) | |||
87 | { | 87 | { |
88 | struct irq_devres match_data = { irq, dev_id }; | 88 | struct irq_devres match_data = { irq, dev_id }; |
89 | 89 | ||
90 | free_irq(irq, dev_id); | ||
91 | WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, | 90 | WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, |
92 | &match_data)); | 91 | &match_data)); |
92 | free_irq(irq, dev_id); | ||
93 | } | 93 | } |
94 | EXPORT_SYMBOL(devm_free_irq); | 94 | EXPORT_SYMBOL(devm_free_irq); |
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 3a2cab407b9..e38544dddb1 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
@@ -246,7 +246,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, | |||
246 | gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); | 246 | gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); |
247 | 247 | ||
248 | for (i = gc->irq_base; msk; msk >>= 1, i++) { | 248 | for (i = gc->irq_base; msk; msk >>= 1, i++) { |
249 | if (!msk & 0x01) | 249 | if (!(msk & 0x01)) |
250 | continue; | 250 | continue; |
251 | 251 | ||
252 | if (flags & IRQ_GC_INIT_NESTED_LOCK) | 252 | if (flags & IRQ_GC_INIT_NESTED_LOCK) |
@@ -301,7 +301,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, | |||
301 | raw_spin_unlock(&gc_lock); | 301 | raw_spin_unlock(&gc_lock); |
302 | 302 | ||
303 | for (; msk; msk >>= 1, i++) { | 303 | for (; msk; msk >>= 1, i++) { |
304 | if (!msk & 0x01) | 304 | if (!(msk & 0x01)) |
305 | continue; | 305 | continue; |
306 | 306 | ||
307 | /* Remove handler first. That will mask the irq line */ | 307 | /* Remove handler first. That will mask the irq line */ |
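Both generic-chip hunks fix the same operator-precedence bug: `!msk & 0x01` parses as `(!msk) & 0x01`, which tests whether the whole mask is zero rather than its lowest bit. A two-line demonstration:

#include <stdio.h>

int main(void)
{
        unsigned int msk = 0x02;        /* lowest bit clear, mask non-zero */

        printf("%d\n", !msk & 0x01);    /* 0: (!msk) is 0, then & 0x01 */
        printf("%d\n", !(msk & 0x01));  /* 1: the lowest bit really is clear */
        return 0;
}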
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 4c60a50e66b..039b889ea05 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -70,7 +70,8 @@ static inline void desc_smp_init(struct irq_desc *desc, int node) { } | |||
70 | static inline int desc_node(struct irq_desc *desc) { return 0; } | 70 | static inline int desc_node(struct irq_desc *desc) { return 0; } |
71 | #endif | 71 | #endif |
72 | 72 | ||
73 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) | 73 | static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, |
74 | struct module *owner) | ||
74 | { | 75 | { |
75 | int cpu; | 76 | int cpu; |
76 | 77 | ||
@@ -86,6 +87,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) | |||
86 | desc->irq_count = 0; | 87 | desc->irq_count = 0; |
87 | desc->irqs_unhandled = 0; | 88 | desc->irqs_unhandled = 0; |
88 | desc->name = NULL; | 89 | desc->name = NULL; |
90 | desc->owner = owner; | ||
89 | for_each_possible_cpu(cpu) | 91 | for_each_possible_cpu(cpu) |
90 | *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; | 92 | *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; |
91 | desc_smp_init(desc, node); | 93 | desc_smp_init(desc, node); |
@@ -128,7 +130,7 @@ static void free_masks(struct irq_desc *desc) | |||
128 | static inline void free_masks(struct irq_desc *desc) { } | 130 | static inline void free_masks(struct irq_desc *desc) { } |
129 | #endif | 131 | #endif |
130 | 132 | ||
131 | static struct irq_desc *alloc_desc(int irq, int node) | 133 | static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) |
132 | { | 134 | { |
133 | struct irq_desc *desc; | 135 | struct irq_desc *desc; |
134 | gfp_t gfp = GFP_KERNEL; | 136 | gfp_t gfp = GFP_KERNEL; |
@@ -147,7 +149,7 @@ static struct irq_desc *alloc_desc(int irq, int node) | |||
147 | raw_spin_lock_init(&desc->lock); | 149 | raw_spin_lock_init(&desc->lock); |
148 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | 150 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); |
149 | 151 | ||
150 | desc_set_defaults(irq, desc, node); | 152 | desc_set_defaults(irq, desc, node, owner); |
151 | 153 | ||
152 | return desc; | 154 | return desc; |
153 | 155 | ||
@@ -173,13 +175,14 @@ static void free_desc(unsigned int irq) | |||
173 | kfree(desc); | 175 | kfree(desc); |
174 | } | 176 | } |
175 | 177 | ||
176 | static int alloc_descs(unsigned int start, unsigned int cnt, int node) | 178 | static int alloc_descs(unsigned int start, unsigned int cnt, int node, |
179 | struct module *owner) | ||
177 | { | 180 | { |
178 | struct irq_desc *desc; | 181 | struct irq_desc *desc; |
179 | int i; | 182 | int i; |
180 | 183 | ||
181 | for (i = 0; i < cnt; i++) { | 184 | for (i = 0; i < cnt; i++) { |
182 | desc = alloc_desc(start + i, node); | 185 | desc = alloc_desc(start + i, node, owner); |
183 | if (!desc) | 186 | if (!desc) |
184 | goto err; | 187 | goto err; |
185 | mutex_lock(&sparse_irq_lock); | 188 | mutex_lock(&sparse_irq_lock); |
@@ -227,7 +230,7 @@ int __init early_irq_init(void) | |||
227 | nr_irqs = initcnt; | 230 | nr_irqs = initcnt; |
228 | 231 | ||
229 | for (i = 0; i < initcnt; i++) { | 232 | for (i = 0; i < initcnt; i++) { |
230 | desc = alloc_desc(i, node); | 233 | desc = alloc_desc(i, node, NULL); |
231 | set_bit(i, allocated_irqs); | 234 | set_bit(i, allocated_irqs); |
232 | irq_insert_desc(i, desc); | 235 | irq_insert_desc(i, desc); |
233 | } | 236 | } |
@@ -261,7 +264,7 @@ int __init early_irq_init(void) | |||
261 | alloc_masks(&desc[i], GFP_KERNEL, node); | 264 | alloc_masks(&desc[i], GFP_KERNEL, node); |
262 | raw_spin_lock_init(&desc[i].lock); | 265 | raw_spin_lock_init(&desc[i].lock); |
263 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 266 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
264 | desc_set_defaults(i, &desc[i], node); | 267 | desc_set_defaults(i, &desc[i], node, NULL); |
265 | } | 268 | } |
266 | return arch_early_irq_init(); | 269 | return arch_early_irq_init(); |
267 | } | 270 | } |
@@ -276,8 +279,16 @@ static void free_desc(unsigned int irq) | |||
276 | dynamic_irq_cleanup(irq); | 279 | dynamic_irq_cleanup(irq); |
277 | } | 280 | } |
278 | 281 | ||
279 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) | 282 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, |
283 | struct module *owner) | ||
280 | { | 284 | { |
285 | u32 i; | ||
286 | |||
287 | for (i = 0; i < cnt; i++) { | ||
288 | struct irq_desc *desc = irq_to_desc(start + i); | ||
289 | |||
290 | desc->owner = owner; | ||
291 | } | ||
281 | return start; | 292 | return start; |
282 | } | 293 | } |
283 | 294 | ||
@@ -333,11 +344,13 @@ EXPORT_SYMBOL_GPL(irq_free_descs); | |||
333 | * @from: Start the search from this irq number | 344 | * @from: Start the search from this irq number |
334 | * @cnt: Number of consecutive irqs to allocate. | 345 | * @cnt: Number of consecutive irqs to allocate. |
335 | * @node: Preferred node on which the irq descriptor should be allocated | 346 | * @node: Preferred node on which the irq descriptor should be allocated |
347 | * @owner: Owning module (can be NULL) | ||
336 | * | 348 | * |
337 | * Returns the first irq number or error code | 349 | * Returns the first irq number or error code |
338 | */ | 350 | */ |
339 | int __ref | 351 | int __ref |
340 | irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) | 352 | __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, |
353 | struct module *owner) | ||
341 | { | 354 | { |
342 | int start, ret; | 355 | int start, ret; |
343 | 356 | ||
@@ -366,13 +379,13 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) | |||
366 | 379 | ||
367 | bitmap_set(allocated_irqs, start, cnt); | 380 | bitmap_set(allocated_irqs, start, cnt); |
368 | mutex_unlock(&sparse_irq_lock); | 381 | mutex_unlock(&sparse_irq_lock); |
369 | return alloc_descs(start, cnt, node); | 382 | return alloc_descs(start, cnt, node, owner); |
370 | 383 | ||
371 | err: | 384 | err: |
372 | mutex_unlock(&sparse_irq_lock); | 385 | mutex_unlock(&sparse_irq_lock); |
373 | return ret; | 386 | return ret; |
374 | } | 387 | } |
375 | EXPORT_SYMBOL_GPL(irq_alloc_descs); | 388 | EXPORT_SYMBOL_GPL(__irq_alloc_descs); |
376 | 389 | ||
377 | /** | 390 | /** |
378 | * irq_reserve_irqs - mark irqs allocated | 391 | * irq_reserve_irqs - mark irqs allocated |
@@ -440,7 +453,7 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
440 | unsigned long flags; | 453 | unsigned long flags; |
441 | 454 | ||
442 | raw_spin_lock_irqsave(&desc->lock, flags); | 455 | raw_spin_lock_irqsave(&desc->lock, flags); |
443 | desc_set_defaults(irq, desc, desc_node(desc)); | 456 | desc_set_defaults(irq, desc, desc_node(desc), NULL); |
444 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 457 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
445 | } | 458 | } |
446 | 459 | ||
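The irqdesc changes thread a `struct module *owner` from the public allocator down to every descriptor, so that request_irq() can later pin the owning module. A minimal sketch of the same plumbing pattern, with invented userspace types standing in for irq_desc and the allocation helpers:

#include <stdio.h>
#include <stdlib.h>

struct module { const char *name; };
struct desc   { int irq; struct module *owner; };

/* Internal helper: the owner is recorded at allocation time. */
static struct desc *alloc_desc(int irq, struct module *owner)
{
        struct desc *d = calloc(1, sizeof(*d));

        if (d) {
                d->irq = irq;
                d->owner = owner;       /* stored once, read at request time */
        }
        return d;
}

/* Public entry point mirrors __irq_alloc_descs(..., owner). */
static struct desc *alloc_desc_for(struct module *owner, int irq)
{
        return alloc_desc(irq, owner);
}

int main(void)
{
        struct module drv = { "example_driver" };
        struct desc *d = alloc_desc_for(&drv, 42);

        if (!d)
                return 1;
        printf("irq %d owned by %s\n", d->irq, d->owner->name);
        free(d);
        return 0;
}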
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c new file mode 100644 index 00000000000..b57a3776de4 --- /dev/null +++ b/kernel/irq/irqdomain.c | |||
@@ -0,0 +1,184 @@ | |||
1 | #include <linux/irq.h> | ||
2 | #include <linux/irqdomain.h> | ||
3 | #include <linux/module.h> | ||
4 | #include <linux/mutex.h> | ||
5 | #include <linux/of.h> | ||
6 | #include <linux/of_address.h> | ||
7 | #include <linux/slab.h> | ||
8 | |||
9 | static LIST_HEAD(irq_domain_list); | ||
10 | static DEFINE_MUTEX(irq_domain_mutex); | ||
11 | |||
12 | /** | ||
13 | * irq_domain_add() - Register an irq_domain | ||
14 | * @domain: ptr to initialized irq_domain structure | ||
15 | * | ||
16 | * Registers an irq_domain structure. The irq_domain must at a minimum be | ||
17 | * initialized with an ops structure pointer, and either a ->to_irq hook or | ||
18 | * a valid irq_base value. Everything else is optional. | ||
19 | */ | ||
20 | void irq_domain_add(struct irq_domain *domain) | ||
21 | { | ||
22 | struct irq_data *d; | ||
23 | int hwirq; | ||
24 | |||
25 | /* | ||
26 | * This assumes that the irq_domain owner has already allocated | ||
27 | * the irq_descs. This block will be removed when support for dynamic | ||
28 | * allocation of irq_descs is added to irq_domain. | ||
29 | */ | ||
30 | for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { | ||
31 | d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); | ||
32 | if (!d) { | ||
33 | WARN(1, "error: assigning domain to non existant irq_desc"); | ||
34 | return; | ||
35 | } | ||
36 | if (d->domain) { | ||
37 | /* things are broken; just report, don't clean up */ | ||
38 | WARN(1, "error: irq_desc already assigned to a domain"); | ||
39 | return; | ||
40 | } | ||
41 | d->domain = domain; | ||
42 | d->hwirq = hwirq; | ||
43 | } | ||
44 | |||
45 | mutex_lock(&irq_domain_mutex); | ||
46 | list_add(&domain->list, &irq_domain_list); | ||
47 | mutex_unlock(&irq_domain_mutex); | ||
48 | } | ||
49 | |||
50 | /** | ||
51 | * irq_domain_del() - Unregister an irq_domain | ||
52 | * @domain: ptr to registered irq_domain. | ||
53 | */ | ||
54 | void irq_domain_del(struct irq_domain *domain) | ||
55 | { | ||
56 | struct irq_data *d; | ||
57 | int hwirq; | ||
58 | |||
59 | mutex_lock(&irq_domain_mutex); | ||
60 | list_del(&domain->list); | ||
61 | mutex_unlock(&irq_domain_mutex); | ||
62 | |||
63 | /* Clear the irq_domain assignments */ | ||
64 | for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { | ||
65 | d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); | ||
66 | d->domain = NULL; | ||
67 | } | ||
68 | } | ||
69 | |||
70 | #if defined(CONFIG_OF_IRQ) | ||
71 | /** | ||
72 | * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec | ||
73 | * | ||
74 | * Used by the device tree interrupt mapping code to translate a device tree | ||
75 | * interrupt specifier to a valid linux irq number. Returns either a valid | ||
76 | * linux IRQ number or 0. | ||
77 | * | ||
78 | * When the caller no longer needs the irq number returned by this function it | ||
79 | * should arrange to call irq_dispose_mapping(). | ||
80 | */ | ||
81 | unsigned int irq_create_of_mapping(struct device_node *controller, | ||
82 | const u32 *intspec, unsigned int intsize) | ||
83 | { | ||
84 | struct irq_domain *domain; | ||
85 | unsigned long hwirq; | ||
86 | unsigned int irq, type; | ||
87 | int rc = -EINVAL; | ||
88 | |||
89 | /* Find a domain which can translate the irq spec */ | ||
90 | mutex_lock(&irq_domain_mutex); | ||
91 | list_for_each_entry(domain, &irq_domain_list, list) { | ||
92 | if (!domain->ops->dt_translate) | ||
93 | continue; | ||
94 | rc = domain->ops->dt_translate(domain, controller, | ||
95 | intspec, intsize, &hwirq, &type); | ||
96 | if (rc == 0) | ||
97 | break; | ||
98 | } | ||
99 | mutex_unlock(&irq_domain_mutex); | ||
100 | |||
101 | if (rc != 0) | ||
102 | return 0; | ||
103 | |||
104 | irq = irq_domain_to_irq(domain, hwirq); | ||
105 | if (type != IRQ_TYPE_NONE) | ||
106 | irq_set_irq_type(irq, type); | ||
107 | pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", | ||
108 | controller->full_name, (int)hwirq, irq, type); | ||
109 | return irq; | ||
110 | } | ||
111 | EXPORT_SYMBOL_GPL(irq_create_of_mapping); | ||
112 | |||
113 | /** | ||
114 | * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() | ||
115 | * @irq: linux irq number to be discarded | ||
116 | * | ||
117 | * Calling this function indicates the caller no longer needs a reference to | ||
118 | * the linux irq number returned by a prior call to irq_create_of_mapping(). | ||
119 | */ | ||
120 | void irq_dispose_mapping(unsigned int irq) | ||
121 | { | ||
122 | /* | ||
123 | * nothing yet; will be filled when support for dynamic allocation of | ||
124 | * irq_descs is added to irq_domain | ||
125 | */ | ||
126 | } | ||
127 | EXPORT_SYMBOL_GPL(irq_dispose_mapping); | ||
128 | |||
129 | int irq_domain_simple_dt_translate(struct irq_domain *d, | ||
130 | struct device_node *controller, | ||
131 | const u32 *intspec, unsigned int intsize, | ||
132 | unsigned long *out_hwirq, unsigned int *out_type) | ||
133 | { | ||
134 | if (d->of_node != controller) | ||
135 | return -EINVAL; | ||
136 | if (intsize < 1) | ||
137 | return -EINVAL; | ||
138 | |||
139 | *out_hwirq = intspec[0]; | ||
140 | *out_type = IRQ_TYPE_NONE; | ||
141 | if (intsize > 1) | ||
142 | *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; | ||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | struct irq_domain_ops irq_domain_simple_ops = { | ||
147 | .dt_translate = irq_domain_simple_dt_translate, | ||
148 | }; | ||
149 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | ||
150 | |||
151 | /** | ||
152 | * irq_domain_add_simple() - Set up a 'simple' translation range | ||
153 | */ | ||
154 | void irq_domain_add_simple(struct device_node *controller, int irq_base) | ||
155 | { | ||
156 | struct irq_domain *domain; | ||
157 | |||
158 | domain = kzalloc(sizeof(*domain), GFP_KERNEL); | ||
159 | if (!domain) { | ||
160 | WARN_ON(1); | ||
161 | return; | ||
162 | } | ||
163 | |||
164 | domain->irq_base = irq_base; | ||
165 | domain->of_node = of_node_get(controller); | ||
166 | domain->ops = &irq_domain_simple_ops; | ||
167 | irq_domain_add(domain); | ||
168 | } | ||
169 | EXPORT_SYMBOL_GPL(irq_domain_add_simple); | ||
170 | |||
171 | void irq_domain_generate_simple(const struct of_device_id *match, | ||
172 | u64 phys_base, unsigned int irq_start) | ||
173 | { | ||
174 | struct device_node *node; | ||
175 | pr_info("looking for phys_base=%llx, irq_start=%i\n", | ||
176 | (unsigned long long) phys_base, (int) irq_start); | ||
177 | node = of_find_matching_node_by_address(NULL, match, phys_base); | ||
178 | if (node) | ||
179 | irq_domain_add_simple(node, irq_start); | ||
180 | else | ||
181 | pr_info("no node found\n"); | ||
182 | } | ||
183 | EXPORT_SYMBOL_GPL(irq_domain_generate_simple); | ||
184 | #endif /* CONFIG_OF_IRQ */ | ||
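The new irqdomain code keeps a list of domains, each mapping a hardware irq number onto a linux irq number; for the "simple" domain the translation is just `irq_base + hwirq`. A hedged userspace sketch of that linear mapping (this mirrors the idea behind irq_domain_to_irq() for a simple domain, not the kernel implementation itself):

#include <stdio.h>

struct irq_domain {
        unsigned int irq_base;  /* first linux irq of the linear range */
        unsigned int nr_irq;    /* number of hwirqs covered */
};

/* Simple linear translation: hwirq N maps to irq_base + N. */
static int domain_to_irq(const struct irq_domain *d, unsigned int hwirq)
{
        if (hwirq >= d->nr_irq)
                return -1;      /* out of range for this domain */
        return d->irq_base + hwirq;
}

int main(void)
{
        struct irq_domain d = { .irq_base = 64, .nr_irq = 32 };

        printf("hwirq 0  -> irq %d\n", domain_to_irq(&d, 0));   /* 64 */
        printf("hwirq 5  -> irq %d\n", domain_to_irq(&d, 5));   /* 69 */
        printf("hwirq 40 -> irq %d\n", domain_to_irq(&d, 40));  /* -1 */
        return 0;
}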
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a7840aeb0f..d6c4adc2804 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -620,8 +620,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) | |||
620 | 620 | ||
621 | static int irq_wait_for_interrupt(struct irqaction *action) | 621 | static int irq_wait_for_interrupt(struct irqaction *action) |
622 | { | 622 | { |
623 | set_current_state(TASK_INTERRUPTIBLE); | ||
624 | |||
623 | while (!kthread_should_stop()) { | 625 | while (!kthread_should_stop()) { |
624 | set_current_state(TASK_INTERRUPTIBLE); | ||
625 | 626 | ||
626 | if (test_and_clear_bit(IRQTF_RUNTHREAD, | 627 | if (test_and_clear_bit(IRQTF_RUNTHREAD, |
627 | &action->thread_flags)) { | 628 | &action->thread_flags)) { |
@@ -629,7 +630,9 @@ static int irq_wait_for_interrupt(struct irqaction *action) | |||
629 | return 0; | 630 | return 0; |
630 | } | 631 | } |
631 | schedule(); | 632 | schedule(); |
633 | set_current_state(TASK_INTERRUPTIBLE); | ||
632 | } | 634 | } |
635 | __set_current_state(TASK_RUNNING); | ||
633 | return -1; | 636 | return -1; |
634 | } | 637 | } |
635 | 638 | ||
@@ -883,6 +886,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
883 | 886 | ||
884 | if (desc->irq_data.chip == &no_irq_chip) | 887 | if (desc->irq_data.chip == &no_irq_chip) |
885 | return -ENOSYS; | 888 | return -ENOSYS; |
889 | if (!try_module_get(desc->owner)) | ||
890 | return -ENODEV; | ||
886 | /* | 891 | /* |
887 | * Some drivers like serial.c use request_irq() heavily, | 892 | * Some drivers like serial.c use request_irq() heavily, |
888 | * so we have to be careful not to interfere with a | 893 | * so we have to be careful not to interfere with a |
@@ -906,8 +911,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
906 | */ | 911 | */ |
907 | nested = irq_settings_is_nested_thread(desc); | 912 | nested = irq_settings_is_nested_thread(desc); |
908 | if (nested) { | 913 | if (nested) { |
909 | if (!new->thread_fn) | 914 | if (!new->thread_fn) { |
910 | return -EINVAL; | 915 | ret = -EINVAL; |
916 | goto out_mput; | ||
917 | } | ||
911 | /* | 918 | /* |
912 | * Replace the primary handler which was provided from | 919 | * Replace the primary handler which was provided from |
913 | * the driver for non nested interrupt handling by the | 920 | * the driver for non nested interrupt handling by the |
@@ -929,8 +936,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
929 | 936 | ||
930 | t = kthread_create(irq_thread, new, "irq/%d-%s", irq, | 937 | t = kthread_create(irq_thread, new, "irq/%d-%s", irq, |
931 | new->name); | 938 | new->name); |
932 | if (IS_ERR(t)) | 939 | if (IS_ERR(t)) { |
933 | return PTR_ERR(t); | 940 | ret = PTR_ERR(t); |
941 | goto out_mput; | ||
942 | } | ||
934 | /* | 943 | /* |
935 | * We keep the reference to the task struct even if | 944 | * We keep the reference to the task struct even if |
936 | * the thread dies to avoid that the interrupt code | 945 | * the thread dies to avoid that the interrupt code |
@@ -1095,6 +1104,8 @@ out_thread: | |||
1095 | kthread_stop(t); | 1104 | kthread_stop(t); |
1096 | put_task_struct(t); | 1105 | put_task_struct(t); |
1097 | } | 1106 | } |
1107 | out_mput: | ||
1108 | module_put(desc->owner); | ||
1098 | return ret; | 1109 | return ret; |
1099 | } | 1110 | } |
1100 | 1111 | ||
@@ -1203,6 +1214,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
1203 | put_task_struct(action->thread); | 1214 | put_task_struct(action->thread); |
1204 | } | 1215 | } |
1205 | 1216 | ||
1217 | module_put(desc->owner); | ||
1206 | return action; | 1218 | return action; |
1207 | } | 1219 | } |
1208 | 1220 | ||
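__setup_irq() now takes a reference on the owning module up front and drops it on every failure path through the new out_mput label, while __free_irq() drops it on teardown. A small sketch of that acquire-early, release-on-error pattern (the refcount and names are illustrative, not the kernel's module refcounting):

#include <stdio.h>

static int refcount;

static int try_get(void)  { refcount++; return 1; }
static void put(void)     { refcount--; }

static int setup_thing(int fail_step)
{
        int ret;

        if (!try_get())
                return -1;              /* nothing to undo yet */

        if (fail_step == 1) { ret = -22; goto out_put; }
        if (fail_step == 2) { ret = -12; goto out_put; }

        return 0;                       /* success: keep the reference */

out_put:
        put();                          /* one place drops the reference */
        return ret;
}

int main(void)
{
        setup_thing(1);
        setup_thing(2);
        setup_thing(0);
        printf("refcount after one success, two failures: %d\n", refcount); /* 1 */
        return 0;
}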
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f76fc00c987..fe4b09cf829 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/irq.h> | 9 | #include <linux/irq.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | #include <linux/syscore_ops.h> | ||
12 | 13 | ||
13 | #include "internals.h" | 14 | #include "internals.h" |
14 | 15 | ||
@@ -39,25 +40,58 @@ void suspend_device_irqs(void) | |||
39 | } | 40 | } |
40 | EXPORT_SYMBOL_GPL(suspend_device_irqs); | 41 | EXPORT_SYMBOL_GPL(suspend_device_irqs); |
41 | 42 | ||
42 | /** | 43 | static void resume_irqs(bool want_early) |
43 | * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() | ||
44 | * | ||
45 | * Enable all interrupt lines previously disabled by suspend_device_irqs() that | ||
46 | * have the IRQS_SUSPENDED flag set. | ||
47 | */ | ||
48 | void resume_device_irqs(void) | ||
49 | { | 44 | { |
50 | struct irq_desc *desc; | 45 | struct irq_desc *desc; |
51 | int irq; | 46 | int irq; |
52 | 47 | ||
53 | for_each_irq_desc(irq, desc) { | 48 | for_each_irq_desc(irq, desc) { |
54 | unsigned long flags; | 49 | unsigned long flags; |
50 | bool is_early = desc->action && | ||
51 | desc->action->flags & IRQF_EARLY_RESUME; | ||
52 | |||
53 | if (is_early != want_early) | ||
54 | continue; | ||
55 | 55 | ||
56 | raw_spin_lock_irqsave(&desc->lock, flags); | 56 | raw_spin_lock_irqsave(&desc->lock, flags); |
57 | __enable_irq(desc, irq, true); | 57 | __enable_irq(desc, irq, true); |
58 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 58 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
59 | } | 59 | } |
60 | } | 60 | } |
61 | |||
62 | /** | ||
63 | * irq_pm_syscore_resume - enable interrupt lines early | ||
64 | * | ||
65 | * Enable all interrupt lines with %IRQF_EARLY_RESUME set. | ||
66 | */ | ||
67 | static void irq_pm_syscore_resume(void) | ||
68 | { | ||
69 | resume_irqs(true); | ||
70 | } | ||
71 | |||
72 | static struct syscore_ops irq_pm_syscore_ops = { | ||
73 | .resume = irq_pm_syscore_resume, | ||
74 | }; | ||
75 | |||
76 | static int __init irq_pm_init_ops(void) | ||
77 | { | ||
78 | register_syscore_ops(&irq_pm_syscore_ops); | ||
79 | return 0; | ||
80 | } | ||
81 | |||
82 | device_initcall(irq_pm_init_ops); | ||
83 | |||
84 | /** | ||
85 | * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() | ||
86 | * | ||
87 | * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously | ||
88 | * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag | ||
89 | * set as well as those with %IRQF_FORCE_RESUME. | ||
90 | */ | ||
91 | void resume_device_irqs(void) | ||
92 | { | ||
93 | resume_irqs(false); | ||
94 | } | ||
61 | EXPORT_SYMBOL_GPL(resume_device_irqs); | 95 | EXPORT_SYMBOL_GPL(resume_device_irqs); |
62 | 96 | ||
63 | /** | 97 | /** |
@@ -70,8 +104,13 @@ int check_wakeup_irqs(void) | |||
70 | 104 | ||
71 | for_each_irq_desc(irq, desc) { | 105 | for_each_irq_desc(irq, desc) { |
72 | if (irqd_is_wakeup_set(&desc->irq_data)) { | 106 | if (irqd_is_wakeup_set(&desc->irq_data)) { |
73 | if (desc->istate & IRQS_PENDING) | 107 | if (desc->istate & IRQS_PENDING) { |
108 | pr_info("Wakeup IRQ %d %s pending, suspend aborted\n", | ||
109 | irq, | ||
110 | desc->action && desc->action->name ? | ||
111 | desc->action->name : ""); | ||
74 | return -EBUSY; | 112 | return -EBUSY; |
113 | } | ||
75 | continue; | 114 | continue; |
76 | } | 115 | } |
77 | /* | 116 | /* |
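The new resume_irqs(bool want_early) helper is walked twice in effect: the syscore hook enables only the IRQF_EARLY_RESUME lines, and resume_device_irqs() later enables the rest. A sketch of that predicate-filtered two-pass walk (the data layout and names are invented for the example):

#include <stdbool.h>
#include <stdio.h>

struct fake_desc { const char *name; bool early; };

static struct fake_desc descs[] = {
        { "rtc",  true  },
        { "eth0", false },
        { "i2c",  true  },
};

static void resume_irqs(bool want_early)
{
        for (unsigned int i = 0; i < sizeof(descs) / sizeof(descs[0]); i++) {
                if (descs[i].early != want_early)
                        continue;       /* handled by the other pass */
                printf("enable %s\n", descs[i].name);
        }
}

int main(void)
{
        resume_irqs(true);      /* syscore resume: rtc, i2c */
        resume_irqs(false);     /* resume_device_irqs(): eth0 */
        return 0;
}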
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 14dd5761e8c..ef60772d2fe 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -55,17 +55,18 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); | |||
55 | */ | 55 | */ |
56 | void check_irq_resend(struct irq_desc *desc, unsigned int irq) | 56 | void check_irq_resend(struct irq_desc *desc, unsigned int irq) |
57 | { | 57 | { |
58 | /* | ||
59 | * We do not resend level type interrupts. Level type | ||
60 | * interrupts are resent by hardware when they are still | ||
61 | * active. | ||
62 | */ | ||
63 | if (irq_settings_is_level(desc)) | ||
64 | return; | ||
65 | if (desc->istate & IRQS_REPLAY) | ||
66 | return; | ||
67 | if (desc->istate & IRQS_PENDING) { | 58 | if (desc->istate & IRQS_PENDING) { |
68 | desc->istate &= ~IRQS_PENDING; | 59 | desc->istate &= ~IRQS_PENDING; |
60 | /* | ||
61 | * We do not resend level type interrupts. Level type | ||
62 | * interrupts are resent by hardware when they are still | ||
63 | * active. | ||
64 | */ | ||
65 | if (irq_settings_is_level(desc)) | ||
66 | return; | ||
67 | if (desc->istate & IRQS_REPLAY) | ||
68 | return; | ||
69 | |||
69 | desc->istate |= IRQS_REPLAY; | 70 | desc->istate |= IRQS_REPLAY; |
70 | 71 | ||
71 | if (!desc->irq_data.chip->irq_retrigger || | 72 | if (!desc->irq_data.chip->irq_retrigger || |
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index aa57d5da18c..dc813a948be 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) | |||
84 | */ | 84 | */ |
85 | action = desc->action; | 85 | action = desc->action; |
86 | if (!action || !(action->flags & IRQF_SHARED) || | 86 | if (!action || !(action->flags & IRQF_SHARED) || |
87 | (action->flags & __IRQF_TIMER) || !action->next) | 87 | (action->flags & __IRQF_TIMER) || |
88 | (action->handler(irq, action->dev_id) == IRQ_HANDLED) || | ||
89 | !action->next) | ||
88 | goto out; | 90 | goto out; |
89 | 91 | ||
90 | /* Already running on another processor */ | 92 | /* Already running on another processor */ |
@@ -115,7 +117,7 @@ static int misrouted_irq(int irq) | |||
115 | struct irq_desc *desc; | 117 | struct irq_desc *desc; |
116 | int i, ok = 0; | 118 | int i, ok = 0; |
117 | 119 | ||
118 | if (atomic_inc_return(&irq_poll_active) == 1) | 120 | if (atomic_inc_return(&irq_poll_active) != 1) |
119 | goto out; | 121 | goto out; |
120 | 122 | ||
121 | irq_poll_cpu = smp_processor_id(); | 123 | irq_poll_cpu = smp_processor_id(); |
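The spurious.c fix inverts the guard: only the caller whose increment takes the counter from 0 to 1 may poll, everyone else backs out. A hedged sketch with C11 atomics standing in for the kernel's atomic_inc_return():

#include <stdatomic.h>
#include <stdio.h>

static atomic_int poll_active;

static int try_poll(const char *who)
{
        /* atomic_fetch_add returns the old value; +1 mirrors inc_return */
        if (atomic_fetch_add(&poll_active, 1) + 1 != 1) {
                atomic_fetch_sub(&poll_active, 1);
                printf("%s: someone else is polling, back off\n", who);
                return 0;
        }
        printf("%s: polling\n", who);
        atomic_fetch_sub(&poll_active, 1);
        return 1;
}

int main(void)
{
        try_poll("cpu0");                       /* gets to poll */
        atomic_fetch_add(&poll_active, 1);      /* simulate a concurrent poller */
        try_poll("cpu1");                       /* backs off */
        return 0;
}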
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index a8ce45097f3..e6f1f24ad57 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -66,8 +66,9 @@ void jump_label_inc(struct jump_label_key *key) | |||
66 | return; | 66 | return; |
67 | 67 | ||
68 | jump_label_lock(); | 68 | jump_label_lock(); |
69 | if (atomic_add_return(1, &key->enabled) == 1) | 69 | if (atomic_read(&key->enabled) == 0) |
70 | jump_label_update(key, JUMP_LABEL_ENABLE); | 70 | jump_label_update(key, JUMP_LABEL_ENABLE); |
71 | atomic_inc(&key->enabled); | ||
71 | jump_label_unlock(); | 72 | jump_label_unlock(); |
72 | } | 73 | } |
73 | 74 | ||
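The jump_label change makes the 0-to-1 transition observable before the counter moves: read the count under jump_label_lock(), patch the code if it was zero, then increment. A sketch of the same enable-on-first-user idea with a pthread mutex in place of jump_label_lock() (illustrative only):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int enabled;

static void key_inc(void)
{
        pthread_mutex_lock(&lock);
        if (enabled == 0)
                puts("patching code: branch enabled");  /* only the first user */
        enabled++;
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        key_inc();      /* patches */
        key_inc();      /* just counts */
        printf("users: %d\n", enabled);
        return 0;
}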
diff --git a/kernel/kexec.c b/kernel/kexec.c index 8d814cbc810..296fbc84d65 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -1095,7 +1095,7 @@ size_t crash_get_memory_size(void) | |||
1095 | size_t size = 0; | 1095 | size_t size = 0; |
1096 | mutex_lock(&kexec_mutex); | 1096 | mutex_lock(&kexec_mutex); |
1097 | if (crashk_res.end != crashk_res.start) | 1097 | if (crashk_res.end != crashk_res.start) |
1098 | size = crashk_res.end - crashk_res.start + 1; | 1098 | size = resource_size(&crashk_res); |
1099 | mutex_unlock(&kexec_mutex); | 1099 | mutex_unlock(&kexec_mutex); |
1100 | return size; | 1100 | return size; |
1101 | } | 1101 | } |
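resource_size() simply encapsulates the off-by-one-prone `end - start + 1` arithmetic for inclusive resource ranges, which is what the kexec hunk switches to. A one-function sketch:

#include <stdio.h>

struct resource { unsigned long start, end; };  /* end is inclusive */

static unsigned long resource_size(const struct resource *res)
{
        return res->end - res->start + 1;
}

int main(void)
{
        struct resource crashk = { 0x1000000, 0x1ffffff };

        printf("size = %lu bytes\n", resource_size(&crashk));   /* 16 MiB */
        return 0;
}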
diff --git a/kernel/kmod.c b/kernel/kmod.c index 47613dfb7b2..a4bea97c75b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -114,10 +114,12 @@ int __request_module(bool wait, const char *fmt, ...) | |||
114 | atomic_inc(&kmod_concurrent); | 114 | atomic_inc(&kmod_concurrent); |
115 | if (atomic_read(&kmod_concurrent) > max_modprobes) { | 115 | if (atomic_read(&kmod_concurrent) > max_modprobes) { |
116 | /* We may be blaming an innocent here, but unlikely */ | 116 | /* We may be blaming an innocent here, but unlikely */ |
117 | if (kmod_loop_msg++ < 5) | 117 | if (kmod_loop_msg < 5) { |
118 | printk(KERN_ERR | 118 | printk(KERN_ERR |
119 | "request_module: runaway loop modprobe %s\n", | 119 | "request_module: runaway loop modprobe %s\n", |
120 | module_name); | 120 | module_name); |
121 | kmod_loop_msg++; | ||
122 | } | ||
121 | atomic_dec(&kmod_concurrent); | 123 | atomic_dec(&kmod_concurrent); |
122 | return -ENOMEM; | 124 | return -ENOMEM; |
123 | } | 125 | } |
@@ -274,7 +276,7 @@ static void __call_usermodehelper(struct work_struct *work) | |||
274 | * (used for preventing user land processes from being created after the user | 276 | * (used for preventing user land processes from being created after the user |
275 | * land has been frozen during a system-wide hibernation or suspend operation). | 277 | * land has been frozen during a system-wide hibernation or suspend operation). |
276 | */ | 278 | */ |
277 | static int usermodehelper_disabled; | 279 | static int usermodehelper_disabled = 1; |
278 | 280 | ||
279 | /* Number of helpers running */ | 281 | /* Number of helpers running */ |
280 | static atomic_t running_helpers = ATOMIC_INIT(0); | 282 | static atomic_t running_helpers = ATOMIC_INIT(0); |
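The kmod hunk stops bumping the counter once the message budget is spent, so kmod_loop_msg can never wrap around and re-enable the warning. A sketch of the print-at-most-N-times pattern:

#include <stdio.h>

static int loop_msg;

static void warn_runaway(const char *name)
{
        if (loop_msg < 5) {             /* counter only moves when we print */
                fprintf(stderr, "runaway loop modprobe %s\n", name);
                loop_msg++;
        }
}

int main(void)
{
        for (int i = 0; i < 10; i++)
                warn_runaway("net-pf-10");
        printf("printed %d warnings\n", loop_msg);      /* 5 */
        return 0;
}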
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 77981813a1e..b30fd54eb98 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -1255,19 +1255,29 @@ static int __kprobes in_kprobes_functions(unsigned long addr) | |||
1255 | /* | 1255 | /* |
1256 | * If we have a symbol_name argument, look it up and add the offset field | 1256 | * If we have a symbol_name argument, look it up and add the offset field |
1257 | * to it. This way, we can specify a relative address to a symbol. | 1257 | * to it. This way, we can specify a relative address to a symbol. |
1258 | * This returns encoded errors if it fails to look up symbol or invalid | ||
1259 | * combination of parameters. | ||
1258 | */ | 1260 | */ |
1259 | static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) | 1261 | static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) |
1260 | { | 1262 | { |
1261 | kprobe_opcode_t *addr = p->addr; | 1263 | kprobe_opcode_t *addr = p->addr; |
1264 | |||
1265 | if ((p->symbol_name && p->addr) || | ||
1266 | (!p->symbol_name && !p->addr)) | ||
1267 | goto invalid; | ||
1268 | |||
1262 | if (p->symbol_name) { | 1269 | if (p->symbol_name) { |
1263 | if (addr) | ||
1264 | return NULL; | ||
1265 | kprobe_lookup_name(p->symbol_name, addr); | 1270 | kprobe_lookup_name(p->symbol_name, addr); |
1271 | if (!addr) | ||
1272 | return ERR_PTR(-ENOENT); | ||
1266 | } | 1273 | } |
1267 | 1274 | ||
1268 | if (!addr) | 1275 | addr = (kprobe_opcode_t *)(((char *)addr) + p->offset); |
1269 | return NULL; | 1276 | if (addr) |
1270 | return (kprobe_opcode_t *)(((char *)addr) + p->offset); | 1277 | return addr; |
1278 | |||
1279 | invalid: | ||
1280 | return ERR_PTR(-EINVAL); | ||
1271 | } | 1281 | } |
1272 | 1282 | ||
1273 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ | 1283 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ |
@@ -1311,8 +1321,8 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1311 | kprobe_opcode_t *addr; | 1321 | kprobe_opcode_t *addr; |
1312 | 1322 | ||
1313 | addr = kprobe_addr(p); | 1323 | addr = kprobe_addr(p); |
1314 | if (!addr) | 1324 | if (IS_ERR(addr)) |
1315 | return -EINVAL; | 1325 | return PTR_ERR(addr); |
1316 | p->addr = addr; | 1326 | p->addr = addr; |
1317 | 1327 | ||
1318 | ret = check_kprobe_rereg(p); | 1328 | ret = check_kprobe_rereg(p); |
@@ -1335,6 +1345,8 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1335 | */ | 1345 | */ |
1336 | probed_mod = __module_text_address((unsigned long) p->addr); | 1346 | probed_mod = __module_text_address((unsigned long) p->addr); |
1337 | if (probed_mod) { | 1347 | if (probed_mod) { |
1348 | /* Return -ENOENT if fail. */ | ||
1349 | ret = -ENOENT; | ||
1338 | /* | 1350 | /* |
1339 | * We must hold a refcount of the probed module while updating | 1351 | * We must hold a refcount of the probed module while updating |
1340 | * its code to prohibit unexpected unloading. | 1352 | * its code to prohibit unexpected unloading. |
@@ -1351,6 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
1351 | module_put(probed_mod); | 1363 | module_put(probed_mod); |
1352 | goto fail_with_jump_label; | 1364 | goto fail_with_jump_label; |
1353 | } | 1365 | } |
1366 | /* ret will be updated by the following code */ | ||
1354 | } | 1367 | } |
1355 | preempt_enable(); | 1368 | preempt_enable(); |
1356 | jump_label_unlock(); | 1369 | jump_label_unlock(); |
@@ -1399,7 +1412,7 @@ out: | |||
1399 | fail_with_jump_label: | 1412 | fail_with_jump_label: |
1400 | preempt_enable(); | 1413 | preempt_enable(); |
1401 | jump_label_unlock(); | 1414 | jump_label_unlock(); |
1402 | return -EINVAL; | 1415 | return ret; |
1403 | } | 1416 | } |
1404 | EXPORT_SYMBOL_GPL(register_kprobe); | 1417 | EXPORT_SYMBOL_GPL(register_kprobe); |
1405 | 1418 | ||
@@ -1686,8 +1699,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
1686 | 1699 | ||
1687 | if (kretprobe_blacklist_size) { | 1700 | if (kretprobe_blacklist_size) { |
1688 | addr = kprobe_addr(&rp->kp); | 1701 | addr = kprobe_addr(&rp->kp); |
1689 | if (!addr) | 1702 | if (IS_ERR(addr)) |
1690 | return -EINVAL; | 1703 | return PTR_ERR(addr); |
1691 | 1704 | ||
1692 | for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { | 1705 | for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { |
1693 | if (kretprobe_blacklist[i].addr == addr) | 1706 | if (kretprobe_blacklist[i].addr == addr) |
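kprobe_addr() now distinguishes "symbol not found" (-ENOENT) from "invalid parameter combination" (-EINVAL) by returning an encoded error pointer instead of NULL. A userspace re-creation of the ERR_PTR/IS_ERR/PTR_ERR encoding used above (simplified; the kernel's versions live in <linux/err.h>):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)      { return (void *)error; }
static inline long PTR_ERR(const void *ptr)  { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Toy lookup: NULL name is a bad argument, names not starting with 'k'
 * are "not found", anything else gets a pretend address. */
static void *lookup(const char *sym)
{
        if (!sym)
                return ERR_PTR(-EINVAL);
        if (sym[0] != 'k')
                return ERR_PTR(-ENOENT);
        return (void *)0x1000;
}

int main(void)
{
        const char *names[] = { NULL, "foo", "kfree" };

        for (int i = 0; i < 3; i++) {
                void *addr = lookup(names[i]);

                if (IS_ERR(addr))
                        printf("error %ld\n", PTR_ERR(addr));
                else
                        printf("addr %p\n", addr);
        }
        return 0;
}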
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 298c9276dfd..447960603fb 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include <linux/stringify.h> | 44 | #include <linux/stringify.h> |
45 | #include <linux/bitops.h> | 45 | #include <linux/bitops.h> |
46 | #include <linux/gfp.h> | 46 | #include <linux/gfp.h> |
47 | #include <linux/kmemcheck.h> | ||
47 | 48 | ||
48 | #include <asm/sections.h> | 49 | #include <asm/sections.h> |
49 | 50 | ||
@@ -2468,6 +2469,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) | |||
2468 | 2469 | ||
2469 | BUG_ON(usage_bit >= LOCK_USAGE_STATES); | 2470 | BUG_ON(usage_bit >= LOCK_USAGE_STATES); |
2470 | 2471 | ||
2472 | if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) | ||
2473 | continue; | ||
2474 | |||
2471 | if (!mark_lock(curr, hlock, usage_bit)) | 2475 | if (!mark_lock(curr, hlock, usage_bit)) |
2472 | return 0; | 2476 | return 0; |
2473 | } | 2477 | } |
@@ -2478,34 +2482,13 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) | |||
2478 | /* | 2482 | /* |
2479 | * Hardirqs will be enabled: | 2483 | * Hardirqs will be enabled: |
2480 | */ | 2484 | */ |
2481 | void trace_hardirqs_on_caller(unsigned long ip) | 2485 | static void __trace_hardirqs_on_caller(unsigned long ip) |
2482 | { | 2486 | { |
2483 | struct task_struct *curr = current; | 2487 | struct task_struct *curr = current; |
2484 | 2488 | ||
2485 | time_hardirqs_on(CALLER_ADDR0, ip); | ||
2486 | |||
2487 | if (unlikely(!debug_locks || current->lockdep_recursion)) | ||
2488 | return; | ||
2489 | |||
2490 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) | ||
2491 | return; | ||
2492 | |||
2493 | if (unlikely(curr->hardirqs_enabled)) { | ||
2494 | /* | ||
2495 | * Neither irq nor preemption are disabled here | ||
2496 | * so this is racy by nature but losing one hit | ||
2497 | * in a stat is not a big deal. | ||
2498 | */ | ||
2499 | __debug_atomic_inc(redundant_hardirqs_on); | ||
2500 | return; | ||
2501 | } | ||
2502 | /* we'll do an OFF -> ON transition: */ | 2489 | /* we'll do an OFF -> ON transition: */ |
2503 | curr->hardirqs_enabled = 1; | 2490 | curr->hardirqs_enabled = 1; |
2504 | 2491 | ||
2505 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
2506 | return; | ||
2507 | if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) | ||
2508 | return; | ||
2509 | /* | 2492 | /* |
2510 | * We are going to turn hardirqs on, so set the | 2493 | * We are going to turn hardirqs on, so set the |
2511 | * usage bit for all held locks: | 2494 | * usage bit for all held locks: |
@@ -2525,6 +2508,37 @@ void trace_hardirqs_on_caller(unsigned long ip) | |||
2525 | curr->hardirq_enable_event = ++curr->irq_events; | 2508 | curr->hardirq_enable_event = ++curr->irq_events; |
2526 | debug_atomic_inc(hardirqs_on_events); | 2509 | debug_atomic_inc(hardirqs_on_events); |
2527 | } | 2510 | } |
2511 | |||
2512 | void trace_hardirqs_on_caller(unsigned long ip) | ||
2513 | { | ||
2514 | time_hardirqs_on(CALLER_ADDR0, ip); | ||
2515 | |||
2516 | if (unlikely(!debug_locks || current->lockdep_recursion)) | ||
2517 | return; | ||
2518 | |||
2519 | if (unlikely(current->hardirqs_enabled)) { | ||
2520 | /* | ||
2521 | * Neither irq nor preemption are disabled here | ||
2522 | * so this is racy by nature but losing one hit | ||
2523 | * in a stat is not a big deal. | ||
2524 | */ | ||
2525 | __debug_atomic_inc(redundant_hardirqs_on); | ||
2526 | return; | ||
2527 | } | ||
2528 | |||
2529 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
2530 | return; | ||
2531 | |||
2532 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) | ||
2533 | return; | ||
2534 | |||
2535 | if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) | ||
2536 | return; | ||
2537 | |||
2538 | current->lockdep_recursion = 1; | ||
2539 | __trace_hardirqs_on_caller(ip); | ||
2540 | current->lockdep_recursion = 0; | ||
2541 | } | ||
2528 | EXPORT_SYMBOL(trace_hardirqs_on_caller); | 2542 | EXPORT_SYMBOL(trace_hardirqs_on_caller); |
2529 | 2543 | ||
2530 | void trace_hardirqs_on(void) | 2544 | void trace_hardirqs_on(void) |
@@ -2574,7 +2588,7 @@ void trace_softirqs_on(unsigned long ip) | |||
2574 | { | 2588 | { |
2575 | struct task_struct *curr = current; | 2589 | struct task_struct *curr = current; |
2576 | 2590 | ||
2577 | if (unlikely(!debug_locks)) | 2591 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
2578 | return; | 2592 | return; |
2579 | 2593 | ||
2580 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2594 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
@@ -2585,6 +2599,7 @@ void trace_softirqs_on(unsigned long ip) | |||
2585 | return; | 2599 | return; |
2586 | } | 2600 | } |
2587 | 2601 | ||
2602 | current->lockdep_recursion = 1; | ||
2588 | /* | 2603 | /* |
2589 | * We'll do an OFF -> ON transition: | 2604 | * We'll do an OFF -> ON transition: |
2590 | */ | 2605 | */ |
@@ -2599,6 +2614,7 @@ void trace_softirqs_on(unsigned long ip) | |||
2599 | */ | 2614 | */ |
2600 | if (curr->hardirqs_enabled) | 2615 | if (curr->hardirqs_enabled) |
2601 | mark_held_locks(curr, SOFTIRQ); | 2616 | mark_held_locks(curr, SOFTIRQ); |
2617 | current->lockdep_recursion = 0; | ||
2602 | } | 2618 | } |
2603 | 2619 | ||
2604 | /* | 2620 | /* |
@@ -2608,7 +2624,7 @@ void trace_softirqs_off(unsigned long ip) | |||
2608 | { | 2624 | { |
2609 | struct task_struct *curr = current; | 2625 | struct task_struct *curr = current; |
2610 | 2626 | ||
2611 | if (unlikely(!debug_locks)) | 2627 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
2612 | return; | 2628 | return; |
2613 | 2629 | ||
2614 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2630 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
@@ -2861,6 +2877,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
2861 | { | 2877 | { |
2862 | int i; | 2878 | int i; |
2863 | 2879 | ||
2880 | kmemcheck_mark_initialized(lock, sizeof(*lock)); | ||
2881 | |||
2864 | for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) | 2882 | for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) |
2865 | lock->class_cache[i] = NULL; | 2883 | lock->class_cache[i] = NULL; |
2866 | 2884 | ||
@@ -3099,7 +3117,13 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) | |||
3099 | if (!class) | 3117 | if (!class) |
3100 | class = look_up_lock_class(lock, 0); | 3118 | class = look_up_lock_class(lock, 0); |
3101 | 3119 | ||
3102 | if (DEBUG_LOCKS_WARN_ON(!class)) | 3120 | /* |
3121 | * If look_up_lock_class() failed to find a class, we're trying | ||
3122 | * to test if we hold a lock that has never yet been acquired. | ||
3123 | * Clearly if the lock hasn't been acquired _ever_, we're not | ||
3124 | * holding it either, so report failure. | ||
3125 | */ | ||
3126 | if (!class) | ||
3103 | return 0; | 3127 | return 0; |
3104 | 3128 | ||
3105 | if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) | 3129 | if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) |
diff --git a/kernel/module.c b/kernel/module.c index 795bdc7f5c3..e0ddcece2be 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -545,9 +545,9 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \ | |||
545 | mod->field = kstrdup(s, GFP_KERNEL); \ | 545 | mod->field = kstrdup(s, GFP_KERNEL); \ |
546 | } \ | 546 | } \ |
547 | static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ | 547 | static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ |
548 | struct module *mod, char *buffer) \ | 548 | struct module_kobject *mk, char *buffer) \ |
549 | { \ | 549 | { \ |
550 | return sprintf(buffer, "%s\n", mod->field); \ | 550 | return sprintf(buffer, "%s\n", mk->mod->field); \ |
551 | } \ | 551 | } \ |
552 | static int modinfo_##field##_exists(struct module *mod) \ | 552 | static int modinfo_##field##_exists(struct module *mod) \ |
553 | { \ | 553 | { \ |
@@ -902,9 +902,9 @@ void symbol_put_addr(void *addr) | |||
902 | EXPORT_SYMBOL_GPL(symbol_put_addr); | 902 | EXPORT_SYMBOL_GPL(symbol_put_addr); |
903 | 903 | ||
904 | static ssize_t show_refcnt(struct module_attribute *mattr, | 904 | static ssize_t show_refcnt(struct module_attribute *mattr, |
905 | struct module *mod, char *buffer) | 905 | struct module_kobject *mk, char *buffer) |
906 | { | 906 | { |
907 | return sprintf(buffer, "%u\n", module_refcount(mod)); | 907 | return sprintf(buffer, "%u\n", module_refcount(mk->mod)); |
908 | } | 908 | } |
909 | 909 | ||
910 | static struct module_attribute refcnt = { | 910 | static struct module_attribute refcnt = { |
@@ -952,11 +952,11 @@ static inline int module_unload_init(struct module *mod) | |||
952 | #endif /* CONFIG_MODULE_UNLOAD */ | 952 | #endif /* CONFIG_MODULE_UNLOAD */ |
953 | 953 | ||
954 | static ssize_t show_initstate(struct module_attribute *mattr, | 954 | static ssize_t show_initstate(struct module_attribute *mattr, |
955 | struct module *mod, char *buffer) | 955 | struct module_kobject *mk, char *buffer) |
956 | { | 956 | { |
957 | const char *state = "unknown"; | 957 | const char *state = "unknown"; |
958 | 958 | ||
959 | switch (mod->state) { | 959 | switch (mk->mod->state) { |
960 | case MODULE_STATE_LIVE: | 960 | case MODULE_STATE_LIVE: |
961 | state = "live"; | 961 | state = "live"; |
962 | break; | 962 | break; |
@@ -975,10 +975,27 @@ static struct module_attribute initstate = { | |||
975 | .show = show_initstate, | 975 | .show = show_initstate, |
976 | }; | 976 | }; |
977 | 977 | ||
978 | static ssize_t store_uevent(struct module_attribute *mattr, | ||
979 | struct module_kobject *mk, | ||
980 | const char *buffer, size_t count) | ||
981 | { | ||
982 | enum kobject_action action; | ||
983 | |||
984 | if (kobject_action_type(buffer, count, &action) == 0) | ||
985 | kobject_uevent(&mk->kobj, action); | ||
986 | return count; | ||
987 | } | ||
988 | |||
989 | struct module_attribute module_uevent = { | ||
990 | .attr = { .name = "uevent", .mode = 0200 }, | ||
991 | .store = store_uevent, | ||
992 | }; | ||
993 | |||
978 | static struct module_attribute *modinfo_attrs[] = { | 994 | static struct module_attribute *modinfo_attrs[] = { |
979 | &modinfo_version, | 995 | &modinfo_version, |
980 | &modinfo_srcversion, | 996 | &modinfo_srcversion, |
981 | &initstate, | 997 | &initstate, |
998 | &module_uevent, | ||
982 | #ifdef CONFIG_MODULE_UNLOAD | 999 | #ifdef CONFIG_MODULE_UNLOAD |
983 | &refcnt, | 1000 | &refcnt, |
984 | #endif | 1001 | #endif |
@@ -1187,7 +1204,7 @@ struct module_sect_attrs | |||
1187 | }; | 1204 | }; |
1188 | 1205 | ||
1189 | static ssize_t module_sect_show(struct module_attribute *mattr, | 1206 | static ssize_t module_sect_show(struct module_attribute *mattr, |
1190 | struct module *mod, char *buf) | 1207 | struct module_kobject *mk, char *buf) |
1191 | { | 1208 | { |
1192 | struct module_sect_attr *sattr = | 1209 | struct module_sect_attr *sattr = |
1193 | container_of(mattr, struct module_sect_attr, mattr); | 1210 | container_of(mattr, struct module_sect_attr, mattr); |
@@ -1697,6 +1714,15 @@ static void unset_module_core_ro_nx(struct module *mod) { } | |||
1697 | static void unset_module_init_ro_nx(struct module *mod) { } | 1714 | static void unset_module_init_ro_nx(struct module *mod) { } |
1698 | #endif | 1715 | #endif |
1699 | 1716 | ||
1717 | void __weak module_free(struct module *mod, void *module_region) | ||
1718 | { | ||
1719 | vfree(module_region); | ||
1720 | } | ||
1721 | |||
1722 | void __weak module_arch_cleanup(struct module *mod) | ||
1723 | { | ||
1724 | } | ||
1725 | |||
1700 | /* Free a module, remove from lists, etc. */ | 1726 | /* Free a module, remove from lists, etc. */ |
1701 | static void free_module(struct module *mod) | 1727 | static void free_module(struct module *mod) |
1702 | { | 1728 | { |
@@ -1851,6 +1877,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) | |||
1851 | return ret; | 1877 | return ret; |
1852 | } | 1878 | } |
1853 | 1879 | ||
1880 | int __weak apply_relocate(Elf_Shdr *sechdrs, | ||
1881 | const char *strtab, | ||
1882 | unsigned int symindex, | ||
1883 | unsigned int relsec, | ||
1884 | struct module *me) | ||
1885 | { | ||
1886 | pr_err("module %s: REL relocation unsupported\n", me->name); | ||
1887 | return -ENOEXEC; | ||
1888 | } | ||
1889 | |||
1890 | int __weak apply_relocate_add(Elf_Shdr *sechdrs, | ||
1891 | const char *strtab, | ||
1892 | unsigned int symindex, | ||
1893 | unsigned int relsec, | ||
1894 | struct module *me) | ||
1895 | { | ||
1896 | pr_err("module %s: RELA relocation unsupported\n", me->name); | ||
1897 | return -ENOEXEC; | ||
1898 | } | ||
1899 | |||
1854 | static int apply_relocations(struct module *mod, const struct load_info *info) | 1900 | static int apply_relocations(struct module *mod, const struct load_info *info) |
1855 | { | 1901 | { |
1856 | unsigned int i; | 1902 | unsigned int i; |
@@ -2235,6 +2281,11 @@ static void dynamic_debug_remove(struct _ddebug *debug) | |||
2235 | ddebug_remove_module(debug->modname); | 2281 | ddebug_remove_module(debug->modname); |
2236 | } | 2282 | } |
2237 | 2283 | ||
2284 | void * __weak module_alloc(unsigned long size) | ||
2285 | { | ||
2286 | return size == 0 ? NULL : vmalloc_exec(size); | ||
2287 | } | ||
2288 | |||
2238 | static void *module_alloc_update_bounds(unsigned long size) | 2289 | static void *module_alloc_update_bounds(unsigned long size) |
2239 | { | 2290 | { |
2240 | void *ret = module_alloc(size); | 2291 | void *ret = module_alloc(size); |
@@ -2477,7 +2528,7 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
2477 | mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); | 2528 | mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); |
2478 | #endif | 2529 | #endif |
2479 | #ifdef CONFIG_CONSTRUCTORS | 2530 | #ifdef CONFIG_CONSTRUCTORS |
2480 | mod->ctors = section_objs(info, ".ctors", | 2531 | mod->ctors = section_objs(info, CONFIG_GCOV_CTORS, |
2481 | sizeof(*mod->ctors), &mod->num_ctors); | 2532 | sizeof(*mod->ctors), &mod->num_ctors); |
2482 | #endif | 2533 | #endif |
2483 | 2534 | ||
@@ -2645,6 +2696,14 @@ static void flush_module_icache(const struct module *mod) | |||
2645 | set_fs(old_fs); | 2696 | set_fs(old_fs); |
2646 | } | 2697 | } |
2647 | 2698 | ||
2699 | int __weak module_frob_arch_sections(Elf_Ehdr *hdr, | ||
2700 | Elf_Shdr *sechdrs, | ||
2701 | char *secstrings, | ||
2702 | struct module *mod) | ||
2703 | { | ||
2704 | return 0; | ||
2705 | } | ||
2706 | |||
2648 | static struct module *layout_and_allocate(struct load_info *info) | 2707 | static struct module *layout_and_allocate(struct load_info *info) |
2649 | { | 2708 | { |
2650 | /* Module within temporary copy. */ | 2709 | /* Module within temporary copy. */ |
@@ -2716,6 +2775,13 @@ static void module_deallocate(struct module *mod, struct load_info *info) | |||
2716 | module_free(mod, mod->module_core); | 2775 | module_free(mod, mod->module_core); |
2717 | } | 2776 | } |
2718 | 2777 | ||
2778 | int __weak module_finalize(const Elf_Ehdr *hdr, | ||
2779 | const Elf_Shdr *sechdrs, | ||
2780 | struct module *me) | ||
2781 | { | ||
2782 | return 0; | ||
2783 | } | ||
2784 | |||
2719 | static int post_relocation(struct module *mod, const struct load_info *info) | 2785 | static int post_relocation(struct module *mod, const struct load_info *info) |
2720 | { | 2786 | { |
2721 | /* Sort exception table now relocations are done. */ | 2787 | /* Sort exception table now relocations are done. */ |
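The module loader now ships generic fallbacks (module_alloc, module_free, apply_relocate, module_finalize, ...) marked __weak, so an architecture only overrides what it needs. GCC and Clang weak symbols behave the same way in an ordinary userspace build; a minimal sketch:

#include <stdio.h>

/* Default implementation; if another translation unit defines a strong
 * (non-weak) arch_setup(), the linker picks that one instead. */
__attribute__((weak)) int arch_setup(void)
{
        puts("generic arch_setup (weak default)");
        return 0;
}

int main(void)
{
        return arch_setup();
}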
diff --git a/kernel/notifier.c b/kernel/notifier.c index 2488ba7eb56..8d7b435806c 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c | |||
@@ -525,37 +525,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) | |||
525 | } | 525 | } |
526 | EXPORT_SYMBOL_GPL(srcu_init_notifier_head); | 526 | EXPORT_SYMBOL_GPL(srcu_init_notifier_head); |
527 | 527 | ||
528 | /** | ||
529 | * register_reboot_notifier - Register function to be called at reboot time | ||
530 | * @nb: Info about notifier function to be called | ||
531 | * | ||
532 | * Registers a function with the list of functions | ||
533 | * to be called at reboot time. | ||
534 | * | ||
535 | * Currently always returns zero, as blocking_notifier_chain_register() | ||
536 | * always returns zero. | ||
537 | */ | ||
538 | int register_reboot_notifier(struct notifier_block *nb) | ||
539 | { | ||
540 | return blocking_notifier_chain_register(&reboot_notifier_list, nb); | ||
541 | } | ||
542 | EXPORT_SYMBOL(register_reboot_notifier); | ||
543 | |||
544 | /** | ||
545 | * unregister_reboot_notifier - Unregister previously registered reboot notifier | ||
546 | * @nb: Hook to be unregistered | ||
547 | * | ||
548 | * Unregisters a previously registered reboot | ||
549 | * notifier function. | ||
550 | * | ||
551 | * Returns zero on success, or %-ENOENT on failure. | ||
552 | */ | ||
553 | int unregister_reboot_notifier(struct notifier_block *nb) | ||
554 | { | ||
555 | return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); | ||
556 | } | ||
557 | EXPORT_SYMBOL(unregister_reboot_notifier); | ||
558 | |||
559 | static ATOMIC_NOTIFIER_HEAD(die_chain); | 528 | static ATOMIC_NOTIFIER_HEAD(die_chain); |
560 | 529 | ||
561 | int notrace __kprobes notify_die(enum die_val val, const char *str, | 530 | int notrace __kprobes notify_die(enum die_val val, const char *str, |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index d6a00f3de15..9aeab4b98c6 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -271,10 +271,8 @@ out: | |||
271 | return err; | 271 | return err; |
272 | } | 272 | } |
273 | 273 | ||
274 | static int __init nsproxy_cache_init(void) | 274 | int __init nsproxy_cache_init(void) |
275 | { | 275 | { |
276 | nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); | 276 | nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); |
277 | return 0; | 277 | return 0; |
278 | } | 278 | } |
279 | |||
280 | module_init(nsproxy_cache_init); | ||
diff --git a/kernel/panic.c b/kernel/panic.c index 69231670eb9..41fc78ea3db 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -27,13 +27,19 @@ | |||
27 | #define PANIC_TIMER_STEP 100 | 27 | #define PANIC_TIMER_STEP 100 |
28 | #define PANIC_BLINK_SPD 18 | 28 | #define PANIC_BLINK_SPD 18 |
29 | 29 | ||
30 | /* Machine specific panic information string */ | ||
31 | char *mach_panic_string; | ||
32 | |||
30 | int panic_on_oops; | 33 | int panic_on_oops; |
31 | static unsigned long tainted_mask; | 34 | static unsigned long tainted_mask; |
32 | static int pause_on_oops; | 35 | static int pause_on_oops; |
33 | static int pause_on_oops_flag; | 36 | static int pause_on_oops_flag; |
34 | static DEFINE_SPINLOCK(pause_on_oops_lock); | 37 | static DEFINE_SPINLOCK(pause_on_oops_lock); |
35 | 38 | ||
36 | int panic_timeout; | 39 | #ifndef CONFIG_PANIC_TIMEOUT |
40 | #define CONFIG_PANIC_TIMEOUT 0 | ||
41 | #endif | ||
42 | int panic_timeout = CONFIG_PANIC_TIMEOUT; | ||
37 | EXPORT_SYMBOL_GPL(panic_timeout); | 43 | EXPORT_SYMBOL_GPL(panic_timeout); |
38 | 44 | ||
39 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); | 45 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); |
@@ -119,6 +125,8 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
119 | } | 125 | } |
120 | mdelay(PANIC_TIMER_STEP); | 126 | mdelay(PANIC_TIMER_STEP); |
121 | } | 127 | } |
128 | } | ||
129 | if (panic_timeout != 0) { | ||
122 | /* | 130 | /* |
123 | * This will not be a clean reboot, with everything | 131 | * This will not be a clean reboot, with everything |
124 | * shutting down. But if there is a chance of | 132 | * shutting down. But if there is a chance of |
@@ -342,6 +350,11 @@ late_initcall(init_oops_id); | |||
342 | void print_oops_end_marker(void) | 350 | void print_oops_end_marker(void) |
343 | { | 351 | { |
344 | init_oops_id(); | 352 | init_oops_id(); |
353 | |||
354 | if (mach_panic_string) | ||
355 | printk(KERN_WARNING "Board Information: %s\n", | ||
356 | mach_panic_string); | ||
357 | |||
345 | printk(KERN_WARNING "---[ end trace %016llx ]---\n", | 358 | printk(KERN_WARNING "---[ end trace %016llx ]---\n", |
346 | (unsigned long long)oops_id); | 359 | (unsigned long long)oops_id); |
347 | } | 360 | } |
diff --git a/kernel/params.c b/kernel/params.c index ed72e133086..22df3e0d142 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -225,8 +225,8 @@ int parse_args(const char *name, | |||
225 | int ret; \ | 225 | int ret; \ |
226 | \ | 226 | \ |
227 | ret = strtolfn(val, 0, &l); \ | 227 | ret = strtolfn(val, 0, &l); \ |
228 | if (ret == -EINVAL || ((type)l != l)) \ | 228 | if (ret < 0 || ((type)l != l)) \ |
229 | return -EINVAL; \ | 229 | return ret < 0 ? ret : -EINVAL; \ |
230 | *((type *)kp->arg) = l; \ | 230 | *((type *)kp->arg) = l; \ |
231 | return 0; \ | 231 | return 0; \ |
232 | } \ | 232 | } \ |
@@ -511,7 +511,7 @@ struct module_param_attrs | |||
511 | #define to_param_attr(n) container_of(n, struct param_attribute, mattr) | 511 | #define to_param_attr(n) container_of(n, struct param_attribute, mattr) |
512 | 512 | ||
513 | static ssize_t param_attr_show(struct module_attribute *mattr, | 513 | static ssize_t param_attr_show(struct module_attribute *mattr, |
514 | struct module *mod, char *buf) | 514 | struct module_kobject *mk, char *buf) |
515 | { | 515 | { |
516 | int count; | 516 | int count; |
517 | struct param_attribute *attribute = to_param_attr(mattr); | 517 | struct param_attribute *attribute = to_param_attr(mattr); |
@@ -531,7 +531,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr, | |||
531 | 531 | ||
532 | /* sysfs always hands a nul-terminated string in buf. We rely on that. */ | 532 | /* sysfs always hands a nul-terminated string in buf. We rely on that. */ |
533 | static ssize_t param_attr_store(struct module_attribute *mattr, | 533 | static ssize_t param_attr_store(struct module_attribute *mattr, |
534 | struct module *owner, | 534 | struct module_kobject *km, |
535 | const char *buf, size_t len) | 535 | const char *buf, size_t len) |
536 | { | 536 | { |
537 | int err; | 537 | int err; |
@@ -730,6 +730,10 @@ static struct module_kobject * __init locate_module_kobject(const char *name) | |||
730 | mk->kobj.kset = module_kset; | 730 | mk->kobj.kset = module_kset; |
731 | err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, | 731 | err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, |
732 | "%s", name); | 732 | "%s", name); |
733 | #ifdef CONFIG_MODULES | ||
734 | if (!err) | ||
735 | err = sysfs_create_file(&mk->kobj, &module_uevent.attr); | ||
736 | #endif | ||
733 | if (err) { | 737 | if (err) { |
734 | kobject_put(&mk->kobj); | 738 | kobject_put(&mk->kobj); |
735 | printk(KERN_ERR | 739 | printk(KERN_ERR |
@@ -807,7 +811,7 @@ static void __init param_sysfs_builtin(void) | |||
807 | } | 811 | } |
808 | 812 | ||
809 | ssize_t __modver_version_show(struct module_attribute *mattr, | 813 | ssize_t __modver_version_show(struct module_attribute *mattr, |
810 | struct module *mod, char *buf) | 814 | struct module_kobject *mk, char *buf) |
811 | { | 815 | { |
812 | struct module_version_attribute *vattr = | 816 | struct module_version_attribute *vattr = |
813 | container_of(mattr, struct module_version_attribute, mattr); | 817 | container_of(mattr, struct module_version_attribute, mattr); |
@@ -852,7 +856,7 @@ static ssize_t module_attr_show(struct kobject *kobj, | |||
852 | if (!attribute->show) | 856 | if (!attribute->show) |
853 | return -EIO; | 857 | return -EIO; |
854 | 858 | ||
855 | ret = attribute->show(attribute, mk->mod, buf); | 859 | ret = attribute->show(attribute, mk, buf); |
856 | 860 | ||
857 | return ret; | 861 | return ret; |
858 | } | 862 | } |
@@ -871,7 +875,7 @@ static ssize_t module_attr_store(struct kobject *kobj, | |||
871 | if (!attribute->store) | 875 | if (!attribute->store) |
872 | return -EIO; | 876 | return -EIO; |
873 | 877 | ||
874 | ret = attribute->store(attribute, mk->mod, buf, len); | 878 | ret = attribute->store(attribute, mk, buf, len); |
875 | 879 | ||
876 | return ret; | 880 | return ret; |
877 | } | 881 | } |
diff --git a/kernel/pid.c b/kernel/pid.c index 57a8346a270..e432057f3b2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -405,7 +405,6 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) | |||
405 | if (pid) { | 405 | if (pid) { |
406 | struct hlist_node *first; | 406 | struct hlist_node *first; |
407 | first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), | 407 | first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), |
408 | rcu_read_lock_held() || | ||
409 | lockdep_tasklist_lock_is_held()); | 408 | lockdep_tasklist_lock_is_held()); |
410 | if (first) | 409 | if (first) |
411 | result = hlist_entry(first, struct task_struct, pids[(type)].node); | 410 | result = hlist_entry(first, struct task_struct, pids[(type)].node); |
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 6824ca7d4d0..82da7ac3b1f 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c | |||
@@ -74,7 +74,7 @@ static DEFINE_SPINLOCK(pm_qos_lock); | |||
74 | static struct pm_qos_object null_pm_qos; | 74 | static struct pm_qos_object null_pm_qos; |
75 | static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); | 75 | static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); |
76 | static struct pm_qos_object cpu_dma_pm_qos = { | 76 | static struct pm_qos_object cpu_dma_pm_qos = { |
77 | .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), | 77 | .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), |
78 | .notifiers = &cpu_dma_lat_notifier, | 78 | .notifiers = &cpu_dma_lat_notifier, |
79 | .name = "cpu_dma_latency", | 79 | .name = "cpu_dma_latency", |
80 | .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, | 80 | .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, |
@@ -84,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = { | |||
84 | 84 | ||
85 | static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); | 85 | static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); |
86 | static struct pm_qos_object network_lat_pm_qos = { | 86 | static struct pm_qos_object network_lat_pm_qos = { |
87 | .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), | 87 | .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), |
88 | .notifiers = &network_lat_notifier, | 88 | .notifiers = &network_lat_notifier, |
89 | .name = "network_latency", | 89 | .name = "network_latency", |
90 | .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, | 90 | .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, |
@@ -95,7 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = { | |||
95 | 95 | ||
96 | static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); | 96 | static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); |
97 | static struct pm_qos_object network_throughput_pm_qos = { | 97 | static struct pm_qos_object network_throughput_pm_qos = { |
98 | .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), | 98 | .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), |
99 | .notifiers = &network_throughput_notifier, | 99 | .notifiers = &network_throughput_notifier, |
100 | .name = "network_throughput", | 100 | .name = "network_throughput", |
101 | .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, | 101 | .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, |
@@ -104,11 +104,59 @@ static struct pm_qos_object network_throughput_pm_qos = { | |||
104 | }; | 104 | }; |
105 | 105 | ||
106 | 106 | ||
107 | static BLOCKING_NOTIFIER_HEAD(min_online_cpus_notifier); | ||
108 | static struct pm_qos_object min_online_cpus_pm_qos = { | ||
109 | .requests = PLIST_HEAD_INIT(min_online_cpus_pm_qos.requests), | ||
110 | .notifiers = &min_online_cpus_notifier, | ||
111 | .name = "min_online_cpus", | ||
112 | .target_value = PM_QOS_MIN_ONLINE_CPUS_DEFAULT_VALUE, | ||
113 | .default_value = PM_QOS_MIN_ONLINE_CPUS_DEFAULT_VALUE, | ||
114 | .type = PM_QOS_MAX, | ||
115 | }; | ||
116 | |||
117 | |||
118 | static BLOCKING_NOTIFIER_HEAD(max_online_cpus_notifier); | ||
119 | static struct pm_qos_object max_online_cpus_pm_qos = { | ||
120 | .requests = PLIST_HEAD_INIT(max_online_cpus_pm_qos.requests), | ||
121 | .notifiers = &max_online_cpus_notifier, | ||
122 | .name = "max_online_cpus", | ||
123 | .target_value = PM_QOS_MAX_ONLINE_CPUS_DEFAULT_VALUE, | ||
124 | .default_value = PM_QOS_MAX_ONLINE_CPUS_DEFAULT_VALUE, | ||
125 | .type = PM_QOS_MIN, | ||
126 | }; | ||
127 | |||
128 | |||
129 | static BLOCKING_NOTIFIER_HEAD(cpu_freq_min_notifier); | ||
130 | static struct pm_qos_object cpu_freq_min_pm_qos = { | ||
131 | .requests = PLIST_HEAD_INIT(cpu_freq_min_pm_qos.requests), | ||
132 | .notifiers = &cpu_freq_min_notifier, | ||
133 | .name = "cpu_freq_min", | ||
134 | .target_value = PM_QOS_CPU_FREQ_MIN_DEFAULT_VALUE, | ||
135 | .default_value = PM_QOS_CPU_FREQ_MIN_DEFAULT_VALUE, | ||
136 | .type = PM_QOS_MAX, | ||
137 | }; | ||
138 | |||
139 | |||
140 | static BLOCKING_NOTIFIER_HEAD(cpu_freq_max_notifier); | ||
141 | static struct pm_qos_object cpu_freq_max_pm_qos = { | ||
142 | .requests = PLIST_HEAD_INIT(cpu_freq_max_pm_qos.requests), | ||
143 | .notifiers = &cpu_freq_max_notifier, | ||
144 | .name = "cpu_freq_max", | ||
145 | .target_value = PM_QOS_CPU_FREQ_MAX_DEFAULT_VALUE, | ||
146 | .default_value = PM_QOS_CPU_FREQ_MAX_DEFAULT_VALUE, | ||
147 | .type = PM_QOS_MIN, | ||
148 | }; | ||
149 | |||
150 | |||
107 | static struct pm_qos_object *pm_qos_array[] = { | 151 | static struct pm_qos_object *pm_qos_array[] = { |
108 | &null_pm_qos, | 152 | &null_pm_qos, |
109 | &cpu_dma_pm_qos, | 153 | &cpu_dma_pm_qos, |
110 | &network_lat_pm_qos, | 154 | &network_lat_pm_qos, |
111 | &network_throughput_pm_qos | 155 | &network_throughput_pm_qos, |
156 | &min_online_cpus_pm_qos, | ||
157 | &max_online_cpus_pm_qos, | ||
158 | &cpu_freq_min_pm_qos, | ||
159 | &cpu_freq_max_pm_qos | ||
112 | }; | 160 | }; |
113 | 161 | ||
114 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | 162 | static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, |
@@ -459,21 +507,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
459 | static int __init pm_qos_power_init(void) | 507 | static int __init pm_qos_power_init(void) |
460 | { | 508 | { |
461 | int ret = 0; | 509 | int ret = 0; |
510 | int i; | ||
462 | 511 | ||
463 | ret = register_pm_qos_misc(&cpu_dma_pm_qos); | 512 | BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); |
464 | if (ret < 0) { | 513 | |
465 | printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); | 514 | for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { |
466 | return ret; | 515 | ret = register_pm_qos_misc(pm_qos_array[i]); |
467 | } | 516 | if (ret < 0) { |
468 | ret = register_pm_qos_misc(&network_lat_pm_qos); | 517 | printk(KERN_ERR "pm_qos_param: %s setup failed\n", |
469 | if (ret < 0) { | 518 | pm_qos_array[i]->name); |
470 | printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); | 519 | return ret; |
471 | return ret; | 520 | } |
472 | } | 521 | } |
473 | ret = register_pm_qos_misc(&network_throughput_pm_qos); | ||
474 | if (ret < 0) | ||
475 | printk(KERN_ERR | ||
476 | "pm_qos_param: network_throughput setup failed\n"); | ||
477 | 522 | ||
478 | return ret; | 523 | return ret; |
479 | } | 524 | } |
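The registration loop above creates one misc device per PM QoS class, named after each object's .name field, so the new classes should appear alongside the existing /dev/cpu_dma_latency node. The following user-space sketch is illustrative only: it assumes the conventional pm_qos misc-device protocol (hold the file descriptor open to keep the request active, write a 32-bit value to update it), and the /dev/min_online_cpus path is inferred from the .name string above rather than taken from the patch.

        /* Hypothetical user-space client of the pm_qos misc-device interface.
         * The device path is inferred from the .name fields registered above. */
        #include <fcntl.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int32_t value = 2;      /* e.g. ask for at least two online CPUs */
                int fd = open("/dev/min_online_cpus", O_RDWR); /* assumed node name */

                if (fd < 0) {
                        perror("open");
                        return 1;
                }
                if (write(fd, &value, sizeof(value)) != sizeof(value))
                        perror("write");

                /* The request stays active for as long as the fd is held open. */
                pause();
                close(fd);
                return 0;
        }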
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 58f405b581e..640ded8f5c4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
250 | do { | 250 | do { |
251 | times->utime = cputime_add(times->utime, t->utime); | 251 | times->utime = cputime_add(times->utime, t->utime); |
252 | times->stime = cputime_add(times->stime, t->stime); | 252 | times->stime = cputime_add(times->stime, t->stime); |
253 | times->sum_exec_runtime += t->se.sum_exec_runtime; | 253 | times->sum_exec_runtime += task_sched_runtime(t); |
254 | } while_each_thread(tsk, t); | 254 | } while_each_thread(tsk, t); |
255 | out: | 255 | out: |
256 | rcu_read_unlock(); | 256 | rcu_read_unlock(); |
@@ -274,9 +274,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | |||
274 | struct task_cputime sum; | 274 | struct task_cputime sum; |
275 | unsigned long flags; | 275 | unsigned long flags; |
276 | 276 | ||
277 | spin_lock_irqsave(&cputimer->lock, flags); | ||
278 | if (!cputimer->running) { | 277 | if (!cputimer->running) { |
279 | cputimer->running = 1; | ||
280 | /* | 278 | /* |
281 | * The POSIX timer interface allows for absolute time expiry | 279 | * The POSIX timer interface allows for absolute time expiry |
282 | * values through the TIMER_ABSTIME flag, therefore we have | 280 | * values through the TIMER_ABSTIME flag, therefore we have |
@@ -284,8 +282,11 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | |||
284 | * it. | 282 | * it. |
285 | */ | 283 | */ |
286 | thread_group_cputime(tsk, &sum); | 284 | thread_group_cputime(tsk, &sum); |
285 | spin_lock_irqsave(&cputimer->lock, flags); | ||
286 | cputimer->running = 1; | ||
287 | update_gt_cputime(&cputimer->cputime, &sum); | 287 | update_gt_cputime(&cputimer->cputime, &sum); |
288 | } | 288 | } else |
289 | spin_lock_irqsave(&cputimer->lock, flags); | ||
289 | *times = cputimer->cputime; | 290 | *times = cputimer->cputime; |
290 | spin_unlock_irqrestore(&cputimer->lock, flags); | 291 | spin_unlock_irqrestore(&cputimer->lock, flags); |
291 | } | 292 | } |
@@ -312,7 +313,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock, | |||
312 | cpu->cpu = cputime.utime; | 313 | cpu->cpu = cputime.utime; |
313 | break; | 314 | break; |
314 | case CPUCLOCK_SCHED: | 315 | case CPUCLOCK_SCHED: |
315 | cpu->sched = thread_group_sched_runtime(p); | 316 | thread_group_cputime(p, &cputime); |
317 | cpu->sched = cputime.sum_exec_runtime; | ||
316 | break; | 318 | break; |
317 | } | 319 | } |
318 | return 0; | 320 | return 0; |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 87f4d24b55b..fcf5a834c4e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -18,6 +18,73 @@ config SUSPEND_FREEZER | |||
18 | 18 | ||
19 | Turning OFF this setting is NOT recommended! If in doubt, say Y. | 19 | Turning OFF this setting is NOT recommended! If in doubt, say Y. |
20 | 20 | ||
21 | config HAS_WAKELOCK | ||
22 | bool | ||
23 | |||
24 | config HAS_EARLYSUSPEND | ||
25 | bool | ||
26 | |||
27 | config WAKELOCK | ||
28 | bool "Wake lock" | ||
29 | depends on PM && RTC_CLASS | ||
30 | default n | ||
31 | select HAS_WAKELOCK | ||
32 | ---help--- | ||
33 | Enable wakelocks. When user space requests a sleep state, the | ||
34 | sleep request will be delayed until no wake locks are held. | ||
35 | |||
36 | config WAKELOCK_STAT | ||
37 | bool "Wake lock stats" | ||
38 | depends on WAKELOCK | ||
39 | default y | ||
40 | ---help--- | ||
41 | Report wake lock stats in /proc/wakelocks | ||
42 | |||
43 | config USER_WAKELOCK | ||
44 | bool "Userspace wake locks" | ||
45 | depends on WAKELOCK | ||
46 | default y | ||
47 | ---help--- | ||
48 | User-space wake lock API. Write "lockname" or "lockname timeout" | ||
49 | to /sys/power/wake_lock to acquire (and, if needed, create) a | ||
50 | wake lock. Write "lockname" to /sys/power/wake_unlock to release | ||
51 | a user wake lock. | ||
52 | |||
53 | config EARLYSUSPEND | ||
54 | bool "Early suspend" | ||
55 | depends on WAKELOCK | ||
56 | default y | ||
57 | select HAS_EARLYSUSPEND | ||
58 | ---help--- | ||
59 | Call early suspend handlers when the user-requested sleep state | ||
60 | changes. | ||
61 | |||
62 | choice | ||
63 | prompt "User-space screen access" | ||
64 | default FB_EARLYSUSPEND if !FRAMEBUFFER_CONSOLE | ||
65 | default CONSOLE_EARLYSUSPEND | ||
66 | depends on HAS_EARLYSUSPEND | ||
67 | |||
68 | config NO_USER_SPACE_SCREEN_ACCESS_CONTROL | ||
69 | bool "None" | ||
70 | |||
71 | config CONSOLE_EARLYSUSPEND | ||
72 | bool "Console switch on early-suspend" | ||
73 | depends on HAS_EARLYSUSPEND && VT | ||
74 | ---help--- | ||
75 | Register an early suspend handler that performs a console switch | ||
76 | when user-space should stop drawing to the screen and switches | ||
77 | back when it should resume. | ||
78 | |||
79 | config FB_EARLYSUSPEND | ||
80 | bool "Sysfs interface" | ||
81 | depends on HAS_EARLYSUSPEND | ||
82 | ---help--- | ||
83 | Register an early suspend handler that notifies user-space | ||
84 | through sysfs when it should stop drawing to the screen, waits | ||
85 | for it to do so, and notifies it again when it should resume. | ||
86 | endchoice | ||
87 | |||
21 | config HIBERNATE_CALLBACKS | 88 | config HIBERNATE_CALLBACKS |
22 | bool | 89 | bool |
23 | 90 | ||
@@ -193,8 +260,8 @@ config APM_EMULATION | |||
193 | notification of APM "events" (e.g. battery status change). | 260 | notification of APM "events" (e.g. battery status change). |
194 | 261 | ||
195 | In order to use APM, you will need supporting software. For location | 262 | In order to use APM, you will need supporting software. For location |
196 | and more information, read <file:Documentation/power/pm.txt> and the | 263 | and more information, read <file:Documentation/power/apm-acpi.txt> |
197 | Battery Powered Linux mini-HOWTO, available from | 264 | and the Battery Powered Linux mini-HOWTO, available from |
198 | <http://www.tldp.org/docs.html#howto>. | 265 | <http://www.tldp.org/docs.html#howto>. |
199 | 266 | ||
200 | This driver does not spin down disk drives (see the hdparm(8) | 267 | This driver does not spin down disk drives (see the hdparm(8) |
@@ -224,6 +291,21 @@ config PM_OPP | |||
224 | implementations a ready to use framework to manage OPPs. | 291 | implementations a ready to use framework to manage OPPs. |
225 | For more information, read <file:Documentation/power/opp.txt> | 292 | For more information, read <file:Documentation/power/opp.txt> |
226 | 293 | ||
227 | config PM_RUNTIME_CLK | 294 | config PM_CLK |
295 | def_bool y | ||
296 | depends on PM && HAVE_CLK | ||
297 | |||
298 | config PM_GENERIC_DOMAINS | ||
299 | bool | ||
300 | depends on PM | ||
301 | |||
302 | config PM_GENERIC_DOMAINS_RUNTIME | ||
228 | def_bool y | 303 | def_bool y |
229 | depends on PM_RUNTIME && HAVE_CLK | 304 | depends on PM_RUNTIME && PM_GENERIC_DOMAINS |
305 | |||
306 | config SUSPEND_TIME | ||
307 | bool "Log time spent in suspend" | ||
308 | ---help--- | ||
309 | Prints the time spent in suspend to the kernel log, and keeps | ||
310 | statistics on suspend durations in | ||
311 | /sys/kernel/debug/suspend_time. | ||
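The USER_WAKELOCK help text above describes a plain sysfs string protocol. A minimal user-space sketch follows; the lock name "example_lock" is invented, and the timeout unit is taken to be nanoseconds based on the conversion in kernel/power/userwakelock.c later in this series.

        /* Sketch of the /sys/power/wake_lock interface described above. */
        #include <stdio.h>

        static void write_str(const char *path, const char *val)
        {
                FILE *f = fopen(path, "w");

                if (!f) {
                        perror(path);
                        return;
                }
                fputs(val, f);
                fclose(f);
        }

        int main(void)
        {
                /* Acquire (creating if needed) a wake lock for one second. */
                write_str("/sys/power/wake_lock", "example_lock 1000000000");
                /* ... do work that must not be interrupted by suspend ... */
                write_str("/sys/power/wake_unlock", "example_lock");
                return 0;
        }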
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c5ebc6a9064..9b224e16b19 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -8,5 +8,11 @@ obj-$(CONFIG_SUSPEND) += suspend.o | |||
8 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o | 8 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o |
9 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ | 9 | obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ |
10 | block_io.o | 10 | block_io.o |
11 | obj-$(CONFIG_WAKELOCK) += wakelock.o | ||
12 | obj-$(CONFIG_USER_WAKELOCK) += userwakelock.o | ||
13 | obj-$(CONFIG_EARLYSUSPEND) += earlysuspend.o | ||
14 | obj-$(CONFIG_CONSOLE_EARLYSUSPEND) += consoleearlysuspend.o | ||
15 | obj-$(CONFIG_FB_EARLYSUSPEND) += fbearlysuspend.o | ||
16 | obj-$(CONFIG_SUSPEND_TIME) += suspend_time.o | ||
11 | 17 | ||
12 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 18 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/consoleearlysuspend.c b/kernel/power/consoleearlysuspend.c new file mode 100644 index 00000000000..a3edcb26738 --- /dev/null +++ b/kernel/power/consoleearlysuspend.c | |||
@@ -0,0 +1,78 @@ | |||
1 | /* kernel/power/consoleearlysuspend.c | ||
2 | * | ||
3 | * Copyright (C) 2005-2008 Google, Inc. | ||
4 | * | ||
5 | * This software is licensed under the terms of the GNU General Public | ||
6 | * License version 2, as published by the Free Software Foundation, and | ||
7 | * may be copied, distributed, and modified under those terms. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/console.h> | ||
17 | #include <linux/earlysuspend.h> | ||
18 | #include <linux/kbd_kern.h> | ||
19 | #include <linux/module.h> | ||
20 | #include <linux/vt_kern.h> | ||
21 | #include <linux/wait.h> | ||
22 | |||
23 | #define EARLY_SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) | ||
24 | |||
25 | static int orig_fgconsole; | ||
26 | static void console_early_suspend(struct early_suspend *h) | ||
27 | { | ||
28 | acquire_console_sem(); | ||
29 | orig_fgconsole = fg_console; | ||
30 | if (vc_allocate(EARLY_SUSPEND_CONSOLE)) | ||
31 | goto err; | ||
32 | if (set_console(EARLY_SUSPEND_CONSOLE)) | ||
33 | goto err; | ||
34 | release_console_sem(); | ||
35 | |||
36 | if (vt_waitactive(EARLY_SUSPEND_CONSOLE + 1)) | ||
37 | pr_warning("console_early_suspend: Can't switch VCs.\n"); | ||
38 | return; | ||
39 | err: | ||
40 | pr_warning("console_early_suspend: Can't set console\n"); | ||
41 | release_console_sem(); | ||
42 | } | ||
43 | |||
44 | static void console_late_resume(struct early_suspend *h) | ||
45 | { | ||
46 | int ret; | ||
47 | acquire_console_sem(); | ||
48 | ret = set_console(orig_fgconsole); | ||
49 | release_console_sem(); | ||
50 | if (ret) { | ||
51 | pr_warning("console_late_resume: Can't set console.\n"); | ||
52 | return; | ||
53 | } | ||
54 | |||
55 | if (vt_waitactive(orig_fgconsole + 1)) | ||
56 | pr_warning("console_late_resume: Can't switch VCs.\n"); | ||
57 | } | ||
58 | |||
59 | static struct early_suspend console_early_suspend_desc = { | ||
60 | .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, | ||
61 | .suspend = console_early_suspend, | ||
62 | .resume = console_late_resume, | ||
63 | }; | ||
64 | |||
65 | static int __init console_early_suspend_init(void) | ||
66 | { | ||
67 | register_early_suspend(&console_early_suspend_desc); | ||
68 | return 0; | ||
69 | } | ||
70 | |||
71 | static void __exit console_early_suspend_exit(void) | ||
72 | { | ||
73 | unregister_early_suspend(&console_early_suspend_desc); | ||
74 | } | ||
75 | |||
76 | module_init(console_early_suspend_init); | ||
77 | module_exit(console_early_suspend_exit); | ||
78 | |||
diff --git a/kernel/power/earlysuspend.c b/kernel/power/earlysuspend.c new file mode 100644 index 00000000000..b15f02eba45 --- /dev/null +++ b/kernel/power/earlysuspend.c | |||
@@ -0,0 +1,187 @@ | |||
1 | /* kernel/power/earlysuspend.c | ||
2 | * | ||
3 | * Copyright (C) 2005-2008 Google, Inc. | ||
4 | * | ||
5 | * This software is licensed under the terms of the GNU General Public | ||
6 | * License version 2, as published by the Free Software Foundation, and | ||
7 | * may be copied, distributed, and modified under those terms. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/earlysuspend.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/mutex.h> | ||
19 | #include <linux/rtc.h> | ||
20 | #include <linux/syscalls.h> /* sys_sync */ | ||
21 | #include <linux/wakelock.h> | ||
22 | #include <linux/workqueue.h> | ||
23 | |||
24 | #include "power.h" | ||
25 | |||
26 | enum { | ||
27 | DEBUG_USER_STATE = 1U << 0, | ||
28 | DEBUG_SUSPEND = 1U << 2, | ||
29 | DEBUG_VERBOSE = 1U << 3, | ||
30 | }; | ||
31 | static int debug_mask = DEBUG_USER_STATE; | ||
32 | module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP); | ||
33 | |||
34 | static DEFINE_MUTEX(early_suspend_lock); | ||
35 | static LIST_HEAD(early_suspend_handlers); | ||
36 | static void early_suspend(struct work_struct *work); | ||
37 | static void late_resume(struct work_struct *work); | ||
38 | static DECLARE_WORK(early_suspend_work, early_suspend); | ||
39 | static DECLARE_WORK(late_resume_work, late_resume); | ||
40 | static DEFINE_SPINLOCK(state_lock); | ||
41 | enum { | ||
42 | SUSPEND_REQUESTED = 0x1, | ||
43 | SUSPENDED = 0x2, | ||
44 | SUSPEND_REQUESTED_AND_SUSPENDED = SUSPEND_REQUESTED | SUSPENDED, | ||
45 | }; | ||
46 | static int state; | ||
47 | |||
48 | void register_early_suspend(struct early_suspend *handler) | ||
49 | { | ||
50 | struct list_head *pos; | ||
51 | |||
52 | mutex_lock(&early_suspend_lock); | ||
53 | list_for_each(pos, &early_suspend_handlers) { | ||
54 | struct early_suspend *e; | ||
55 | e = list_entry(pos, struct early_suspend, link); | ||
56 | if (e->level > handler->level) | ||
57 | break; | ||
58 | } | ||
59 | list_add_tail(&handler->link, pos); | ||
60 | if ((state & SUSPENDED) && handler->suspend) | ||
61 | handler->suspend(handler); | ||
62 | mutex_unlock(&early_suspend_lock); | ||
63 | } | ||
64 | EXPORT_SYMBOL(register_early_suspend); | ||
65 | |||
66 | void unregister_early_suspend(struct early_suspend *handler) | ||
67 | { | ||
68 | mutex_lock(&early_suspend_lock); | ||
69 | list_del(&handler->link); | ||
70 | mutex_unlock(&early_suspend_lock); | ||
71 | } | ||
72 | EXPORT_SYMBOL(unregister_early_suspend); | ||
73 | |||
74 | static void early_suspend(struct work_struct *work) | ||
75 | { | ||
76 | struct early_suspend *pos; | ||
77 | unsigned long irqflags; | ||
78 | int abort = 0; | ||
79 | |||
80 | mutex_lock(&early_suspend_lock); | ||
81 | spin_lock_irqsave(&state_lock, irqflags); | ||
82 | if (state == SUSPEND_REQUESTED) | ||
83 | state |= SUSPENDED; | ||
84 | else | ||
85 | abort = 1; | ||
86 | spin_unlock_irqrestore(&state_lock, irqflags); | ||
87 | |||
88 | if (abort) { | ||
89 | if (debug_mask & DEBUG_SUSPEND) | ||
90 | pr_info("early_suspend: abort, state %d\n", state); | ||
91 | mutex_unlock(&early_suspend_lock); | ||
92 | goto abort; | ||
93 | } | ||
94 | |||
95 | if (debug_mask & DEBUG_SUSPEND) | ||
96 | pr_info("early_suspend: call handlers\n"); | ||
97 | list_for_each_entry(pos, &early_suspend_handlers, link) { | ||
98 | if (pos->suspend != NULL) { | ||
99 | if (debug_mask & DEBUG_VERBOSE) | ||
100 | pr_info("early_suspend: calling %pf\n", pos->suspend); | ||
101 | pos->suspend(pos); | ||
102 | } | ||
103 | } | ||
104 | mutex_unlock(&early_suspend_lock); | ||
105 | |||
106 | if (debug_mask & DEBUG_SUSPEND) | ||
107 | pr_info("early_suspend: sync\n"); | ||
108 | |||
109 | sys_sync(); | ||
110 | abort: | ||
111 | spin_lock_irqsave(&state_lock, irqflags); | ||
112 | if (state == SUSPEND_REQUESTED_AND_SUSPENDED) | ||
113 | wake_unlock(&main_wake_lock); | ||
114 | spin_unlock_irqrestore(&state_lock, irqflags); | ||
115 | } | ||
116 | |||
117 | static void late_resume(struct work_struct *work) | ||
118 | { | ||
119 | struct early_suspend *pos; | ||
120 | unsigned long irqflags; | ||
121 | int abort = 0; | ||
122 | |||
123 | mutex_lock(&early_suspend_lock); | ||
124 | spin_lock_irqsave(&state_lock, irqflags); | ||
125 | if (state == SUSPENDED) | ||
126 | state &= ~SUSPENDED; | ||
127 | else | ||
128 | abort = 1; | ||
129 | spin_unlock_irqrestore(&state_lock, irqflags); | ||
130 | |||
131 | if (abort) { | ||
132 | if (debug_mask & DEBUG_SUSPEND) | ||
133 | pr_info("late_resume: abort, state %d\n", state); | ||
134 | goto abort; | ||
135 | } | ||
136 | if (debug_mask & DEBUG_SUSPEND) | ||
137 | pr_info("late_resume: call handlers\n"); | ||
138 | list_for_each_entry_reverse(pos, &early_suspend_handlers, link) { | ||
139 | if (pos->resume != NULL) { | ||
140 | if (debug_mask & DEBUG_VERBOSE) | ||
141 | pr_info("late_resume: calling %pf\n", pos->resume); | ||
142 | |||
143 | pos->resume(pos); | ||
144 | } | ||
145 | } | ||
146 | if (debug_mask & DEBUG_SUSPEND) | ||
147 | pr_info("late_resume: done\n"); | ||
148 | abort: | ||
149 | mutex_unlock(&early_suspend_lock); | ||
150 | } | ||
151 | |||
152 | void request_suspend_state(suspend_state_t new_state) | ||
153 | { | ||
154 | unsigned long irqflags; | ||
155 | int old_sleep; | ||
156 | |||
157 | spin_lock_irqsave(&state_lock, irqflags); | ||
158 | old_sleep = state & SUSPEND_REQUESTED; | ||
159 | if (debug_mask & DEBUG_USER_STATE) { | ||
160 | struct timespec ts; | ||
161 | struct rtc_time tm; | ||
162 | getnstimeofday(&ts); | ||
163 | rtc_time_to_tm(ts.tv_sec, &tm); | ||
164 | pr_info("request_suspend_state: %s (%d->%d) at %lld " | ||
165 | "(%d-%02d-%02d %02d:%02d:%02d.%09lu UTC)\n", | ||
166 | new_state != PM_SUSPEND_ON ? "sleep" : "wakeup", | ||
167 | requested_suspend_state, new_state, | ||
168 | ktime_to_ns(ktime_get()), | ||
169 | tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, | ||
170 | tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec); | ||
171 | } | ||
172 | if (!old_sleep && new_state != PM_SUSPEND_ON) { | ||
173 | state |= SUSPEND_REQUESTED; | ||
174 | queue_work(suspend_work_queue, &early_suspend_work); | ||
175 | } else if (old_sleep && new_state == PM_SUSPEND_ON) { | ||
176 | state &= ~SUSPEND_REQUESTED; | ||
177 | wake_lock(&main_wake_lock); | ||
178 | queue_work(suspend_work_queue, &late_resume_work); | ||
179 | } | ||
180 | requested_suspend_state = new_state; | ||
181 | spin_unlock_irqrestore(&state_lock, irqflags); | ||
182 | } | ||
183 | |||
184 | suspend_state_t get_suspend_state(void) | ||
185 | { | ||
186 | return requested_suspend_state; | ||
187 | } | ||
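A driver consumes the API defined above by embedding a struct early_suspend and registering it, as the console and framebuffer helpers in the following files do. This is only a sketch of that pattern; all example_* names are invented and error handling is omitted.

        /* Hypothetical driver-side use of the earlysuspend API above. */
        #include <linux/earlysuspend.h>
        #include <linux/init.h>
        #include <linux/module.h>

        static void example_early_suspend(struct early_suspend *h)
        {
                /* stop periodic work, power down the display path, etc. */
        }

        static void example_late_resume(struct early_suspend *h)
        {
                /* undo whatever example_early_suspend() did */
        }

        static struct early_suspend example_es_desc = {
                .level   = EARLY_SUSPEND_LEVEL_STOP_DRAWING,
                .suspend = example_early_suspend,
                .resume  = example_late_resume,
        };

        static int __init example_es_init(void)
        {
                register_early_suspend(&example_es_desc);
                return 0;
        }

        static void __exit example_es_exit(void)
        {
                unregister_early_suspend(&example_es_desc);
        }

        module_init(example_es_init);
        module_exit(example_es_exit);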
diff --git a/kernel/power/fbearlysuspend.c b/kernel/power/fbearlysuspend.c new file mode 100644 index 00000000000..15137650149 --- /dev/null +++ b/kernel/power/fbearlysuspend.c | |||
@@ -0,0 +1,153 @@ | |||
1 | /* kernel/power/fbearlysuspend.c | ||
2 | * | ||
3 | * Copyright (C) 2005-2008 Google, Inc. | ||
4 | * | ||
5 | * This software is licensed under the terms of the GNU General Public | ||
6 | * License version 2, as published by the Free Software Foundation, and | ||
7 | * may be copied, distributed, and modified under those terms. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/earlysuspend.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/wait.h> | ||
19 | |||
20 | #include "power.h" | ||
21 | |||
22 | static wait_queue_head_t fb_state_wq; | ||
23 | static DEFINE_SPINLOCK(fb_state_lock); | ||
24 | static enum { | ||
25 | FB_STATE_STOPPED_DRAWING, | ||
26 | FB_STATE_REQUEST_STOP_DRAWING, | ||
27 | FB_STATE_DRAWING_OK, | ||
28 | } fb_state; | ||
29 | |||
30 | /* tell userspace to stop drawing, wait for it to stop */ | ||
31 | static void stop_drawing_early_suspend(struct early_suspend *h) | ||
32 | { | ||
33 | int ret; | ||
34 | unsigned long irq_flags; | ||
35 | |||
36 | spin_lock_irqsave(&fb_state_lock, irq_flags); | ||
37 | fb_state = FB_STATE_REQUEST_STOP_DRAWING; | ||
38 | spin_unlock_irqrestore(&fb_state_lock, irq_flags); | ||
39 | |||
40 | wake_up_all(&fb_state_wq); | ||
41 | ret = wait_event_timeout(fb_state_wq, | ||
42 | fb_state == FB_STATE_STOPPED_DRAWING, | ||
43 | HZ); | ||
44 | if (unlikely(fb_state != FB_STATE_STOPPED_DRAWING)) | ||
45 | pr_warning("stop_drawing_early_suspend: timeout waiting for " | ||
46 | "userspace to stop drawing\n"); | ||
47 | } | ||
48 | |||
49 | /* tell userspace to start drawing */ | ||
50 | static void start_drawing_late_resume(struct early_suspend *h) | ||
51 | { | ||
52 | unsigned long irq_flags; | ||
53 | |||
54 | spin_lock_irqsave(&fb_state_lock, irq_flags); | ||
55 | fb_state = FB_STATE_DRAWING_OK; | ||
56 | spin_unlock_irqrestore(&fb_state_lock, irq_flags); | ||
57 | wake_up(&fb_state_wq); | ||
58 | } | ||
59 | |||
60 | static struct early_suspend stop_drawing_early_suspend_desc = { | ||
61 | .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING, | ||
62 | .suspend = stop_drawing_early_suspend, | ||
63 | .resume = start_drawing_late_resume, | ||
64 | }; | ||
65 | |||
66 | static ssize_t wait_for_fb_sleep_show(struct kobject *kobj, | ||
67 | struct kobj_attribute *attr, char *buf) | ||
68 | { | ||
69 | char *s = buf; | ||
70 | int ret; | ||
71 | |||
72 | ret = wait_event_interruptible(fb_state_wq, | ||
73 | fb_state != FB_STATE_DRAWING_OK); | ||
74 | if (ret && fb_state == FB_STATE_DRAWING_OK) | ||
75 | return ret; | ||
76 | else | ||
77 | s += sprintf(buf, "sleeping"); | ||
78 | return s - buf; | ||
79 | } | ||
80 | |||
81 | static ssize_t wait_for_fb_wake_show(struct kobject *kobj, | ||
82 | struct kobj_attribute *attr, char *buf) | ||
83 | { | ||
84 | char *s = buf; | ||
85 | int ret; | ||
86 | unsigned long irq_flags; | ||
87 | |||
88 | spin_lock_irqsave(&fb_state_lock, irq_flags); | ||
89 | if (fb_state == FB_STATE_REQUEST_STOP_DRAWING) { | ||
90 | fb_state = FB_STATE_STOPPED_DRAWING; | ||
91 | wake_up(&fb_state_wq); | ||
92 | } | ||
93 | spin_unlock_irqrestore(&fb_state_lock, irq_flags); | ||
94 | |||
95 | ret = wait_event_interruptible(fb_state_wq, | ||
96 | fb_state == FB_STATE_DRAWING_OK); | ||
97 | if (ret && fb_state != FB_STATE_DRAWING_OK) | ||
98 | return ret; | ||
99 | else | ||
100 | s += sprintf(buf, "awake"); | ||
101 | |||
102 | return s - buf; | ||
103 | } | ||
104 | |||
105 | #define power_ro_attr(_name) \ | ||
106 | static struct kobj_attribute _name##_attr = { \ | ||
107 | .attr = { \ | ||
108 | .name = __stringify(_name), \ | ||
109 | .mode = 0444, \ | ||
110 | }, \ | ||
111 | .show = _name##_show, \ | ||
112 | .store = NULL, \ | ||
113 | } | ||
114 | |||
115 | power_ro_attr(wait_for_fb_sleep); | ||
116 | power_ro_attr(wait_for_fb_wake); | ||
117 | |||
118 | static struct attribute *g[] = { | ||
119 | &wait_for_fb_sleep_attr.attr, | ||
120 | &wait_for_fb_wake_attr.attr, | ||
121 | NULL, | ||
122 | }; | ||
123 | |||
124 | static struct attribute_group attr_group = { | ||
125 | .attrs = g, | ||
126 | }; | ||
127 | |||
128 | static int __init android_power_init(void) | ||
129 | { | ||
130 | int ret; | ||
131 | |||
132 | init_waitqueue_head(&fb_state_wq); | ||
133 | fb_state = FB_STATE_DRAWING_OK; | ||
134 | |||
135 | ret = sysfs_create_group(power_kobj, &attr_group); | ||
136 | if (ret) { | ||
137 | pr_err("android_power_init: sysfs_create_group failed\n"); | ||
138 | return ret; | ||
139 | } | ||
140 | |||
141 | register_early_suspend(&stop_drawing_early_suspend_desc); | ||
142 | return 0; | ||
143 | } | ||
144 | |||
145 | static void __exit android_power_exit(void) | ||
146 | { | ||
147 | unregister_early_suspend(&stop_drawing_early_suspend_desc); | ||
148 | sysfs_remove_group(power_kobj, &attr_group); | ||
149 | } | ||
150 | |||
151 | module_init(android_power_init); | ||
152 | module_exit(android_power_exit); | ||
153 | |||
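Because the two attributes above are created on power_kobj, they should surface as /sys/power/wait_for_fb_sleep and /sys/power/wait_for_fb_wake. A user-space compositor would typically alternate blocking reads on them, roughly as in this sketch (the paths are inferred from the attribute names, not spelled out in the patch).

        /* Sketch of the user-space side of the fbearlysuspend protocol above. */
        #include <stdio.h>

        static void wait_on(const char *path)
        {
                char buf[32];
                FILE *f = fopen(path, "r");

                if (!f)
                        return;
                /* The read blocks until the kernel changes fb_state. */
                fgets(buf, sizeof(buf), f);
                fclose(f);
        }

        int main(void)
        {
                for (;;) {
                        wait_on("/sys/power/wait_for_fb_sleep");
                        /* stop drawing, release the framebuffer */
                        wait_on("/sys/power/wait_for_fb_wake");
                        /* resume drawing */
                }
                return 0;
        }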
diff --git a/kernel/power/main.c b/kernel/power/main.c index 2981af4ce7c..3304594553c 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -37,8 +37,9 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier); | |||
37 | 37 | ||
38 | int pm_notifier_call_chain(unsigned long val) | 38 | int pm_notifier_call_chain(unsigned long val) |
39 | { | 39 | { |
40 | return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) | 40 | int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); |
41 | == NOTIFY_BAD) ? -EINVAL : 0; | 41 | |
42 | return notifier_to_errno(ret); | ||
42 | } | 43 | } |
43 | 44 | ||
44 | /* If set, devices may be suspended and resumed asynchronously. */ | 45 | /* If set, devices may be suspended and resumed asynchronously. */ |
@@ -170,7 +171,11 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
170 | const char *buf, size_t n) | 171 | const char *buf, size_t n) |
171 | { | 172 | { |
172 | #ifdef CONFIG_SUSPEND | 173 | #ifdef CONFIG_SUSPEND |
174 | #ifdef CONFIG_EARLYSUSPEND | ||
175 | suspend_state_t state = PM_SUSPEND_ON; | ||
176 | #else | ||
173 | suspend_state_t state = PM_SUSPEND_STANDBY; | 177 | suspend_state_t state = PM_SUSPEND_STANDBY; |
178 | #endif | ||
174 | const char * const *s; | 179 | const char * const *s; |
175 | #endif | 180 | #endif |
176 | char *p; | 181 | char *p; |
@@ -192,8 +197,15 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
192 | break; | 197 | break; |
193 | } | 198 | } |
194 | if (state < PM_SUSPEND_MAX && *s) | 199 | if (state < PM_SUSPEND_MAX && *s) |
200 | #ifdef CONFIG_EARLYSUSPEND | ||
201 | if (state == PM_SUSPEND_ON || valid_state(state)) { | ||
202 | error = 0; | ||
203 | request_suspend_state(state); | ||
204 | } | ||
205 | #else | ||
195 | error = enter_state(state); | 206 | error = enter_state(state); |
196 | #endif | 207 | #endif |
208 | #endif | ||
197 | 209 | ||
198 | Exit: | 210 | Exit: |
199 | return error ? error : n; | 211 | return error ? error : n; |
@@ -297,6 +309,11 @@ power_attr(pm_trace_dev_match); | |||
297 | 309 | ||
298 | #endif /* CONFIG_PM_TRACE */ | 310 | #endif /* CONFIG_PM_TRACE */ |
299 | 311 | ||
312 | #ifdef CONFIG_USER_WAKELOCK | ||
313 | power_attr(wake_lock); | ||
314 | power_attr(wake_unlock); | ||
315 | #endif | ||
316 | |||
300 | static struct attribute * g[] = { | 317 | static struct attribute * g[] = { |
301 | &state_attr.attr, | 318 | &state_attr.attr, |
302 | #ifdef CONFIG_PM_TRACE | 319 | #ifdef CONFIG_PM_TRACE |
@@ -309,6 +326,10 @@ static struct attribute * g[] = { | |||
309 | #ifdef CONFIG_PM_DEBUG | 326 | #ifdef CONFIG_PM_DEBUG |
310 | &pm_test_attr.attr, | 327 | &pm_test_attr.attr, |
311 | #endif | 328 | #endif |
329 | #ifdef CONFIG_USER_WAKELOCK | ||
330 | &wake_lock_attr.attr, | ||
331 | &wake_unlock_attr.attr, | ||
332 | #endif | ||
312 | #endif | 333 | #endif |
313 | NULL, | 334 | NULL, |
314 | }; | 335 | }; |
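With pm_notifier_call_chain() above now returning notifier_to_errno(), a PM notifier can veto suspend with a specific error code instead of the old blanket -EINVAL. A hedged sketch of such a notifier follows; the callback name, the example_device_busy condition and the -EBUSY choice are all illustrative.

        /* Hypothetical PM notifier that rejects suspend with a real errno,
         * which pm_notifier_call_chain() now propagates to its caller. */
        #include <linux/errno.h>
        #include <linux/init.h>
        #include <linux/notifier.h>
        #include <linux/suspend.h>

        static bool example_device_busy;        /* illustrative condition */

        static int example_pm_notify(struct notifier_block *nb,
                                     unsigned long event, void *unused)
        {
                if (event == PM_SUSPEND_PREPARE && example_device_busy)
                        return notifier_from_errno(-EBUSY);
                return NOTIFY_DONE;
        }

        static struct notifier_block example_pm_nb = {
                .notifier_call = example_pm_notify,
        };

        static int __init example_pm_notifier_init(void)
        {
                return register_pm_notifier(&example_pm_nb);
        }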
diff --git a/kernel/power/power.h b/kernel/power/power.h index 9a00a0a2628..b6b9006480f 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -245,3 +245,27 @@ static inline void suspend_thaw_processes(void) | |||
245 | { | 245 | { |
246 | } | 246 | } |
247 | #endif | 247 | #endif |
248 | |||
249 | #ifdef CONFIG_WAKELOCK | ||
250 | /* kernel/power/wakelock.c */ | ||
251 | extern struct workqueue_struct *suspend_work_queue; | ||
252 | extern struct wake_lock main_wake_lock; | ||
253 | extern suspend_state_t requested_suspend_state; | ||
254 | #endif | ||
255 | |||
256 | #ifdef CONFIG_USER_WAKELOCK | ||
257 | ssize_t wake_lock_show(struct kobject *kobj, struct kobj_attribute *attr, | ||
258 | char *buf); | ||
259 | ssize_t wake_lock_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
260 | const char *buf, size_t n); | ||
261 | ssize_t wake_unlock_show(struct kobject *kobj, struct kobj_attribute *attr, | ||
262 | char *buf); | ||
263 | ssize_t wake_unlock_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
264 | const char *buf, size_t n); | ||
265 | #endif | ||
266 | |||
267 | #ifdef CONFIG_EARLYSUSPEND | ||
268 | /* kernel/power/earlysuspend.c */ | ||
269 | void request_suspend_state(suspend_state_t state); | ||
270 | suspend_state_t get_suspend_state(void); | ||
271 | #endif | ||
diff --git a/kernel/power/process.c b/kernel/power/process.c index 0cf3a27a6c9..31338cdeafc 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/freezer.h> | 16 | #include <linux/freezer.h> |
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/workqueue.h> | 18 | #include <linux/workqueue.h> |
19 | #include <linux/wakelock.h> | ||
19 | 20 | ||
20 | /* | 21 | /* |
21 | * Timeout for stopping processes | 22 | * Timeout for stopping processes |
@@ -82,6 +83,10 @@ static int try_to_freeze_tasks(bool sig_only) | |||
82 | todo += wq_busy; | 83 | todo += wq_busy; |
83 | } | 84 | } |
84 | 85 | ||
86 | if (todo && has_wake_lock(WAKE_LOCK_SUSPEND)) { | ||
87 | wakeup = 1; | ||
88 | break; | ||
89 | } | ||
85 | if (!todo || time_after(jiffies, end_time)) | 90 | if (!todo || time_after(jiffies, end_time)) |
86 | break; | 91 | break; |
87 | 92 | ||
@@ -108,19 +113,25 @@ static int try_to_freeze_tasks(bool sig_only) | |||
108 | * and caller must call thaw_processes() if something fails), | 113 | * and caller must call thaw_processes() if something fails), |
109 | * but it cleans up leftover PF_FREEZE requests. | 114 | * but it cleans up leftover PF_FREEZE requests. |
110 | */ | 115 | */ |
111 | printk("\n"); | 116 | if(wakeup) { |
112 | printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " | 117 | printk("\n"); |
113 | "(%d tasks refusing to freeze, wq_busy=%d):\n", | 118 | printk(KERN_ERR "Freezing of %s aborted\n", |
114 | wakeup ? "aborted" : "failed", | 119 | sig_only ? "user space " : "tasks "); |
115 | elapsed_csecs / 100, elapsed_csecs % 100, | 120 | } |
116 | todo - wq_busy, wq_busy); | 121 | else { |
117 | 122 | printk("\n"); | |
123 | printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " | ||
124 | "(%d tasks refusing to freeze, wq_busy=%d):\n", | ||
125 | elapsed_csecs / 100, elapsed_csecs % 100, | ||
126 | todo - wq_busy, wq_busy); | ||
127 | } | ||
118 | thaw_workqueues(); | 128 | thaw_workqueues(); |
119 | 129 | ||
120 | read_lock(&tasklist_lock); | 130 | read_lock(&tasklist_lock); |
121 | do_each_thread(g, p) { | 131 | do_each_thread(g, p) { |
122 | task_lock(p); | 132 | task_lock(p); |
123 | if (!wakeup && freezing(p) && !freezer_should_skip(p)) | 133 | if (freezing(p) && !freezer_should_skip(p) && |
134 | elapsed_csecs > 100) | ||
124 | sched_show_task(p); | 135 | sched_show_task(p); |
125 | cancel_freezing(p); | 136 | cancel_freezing(p); |
126 | task_unlock(p); | 137 | task_unlock(p); |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1c41ba21541..a6f6e3114a2 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -28,6 +28,9 @@ | |||
28 | #include "power.h" | 28 | #include "power.h" |
29 | 29 | ||
30 | const char *const pm_states[PM_SUSPEND_MAX] = { | 30 | const char *const pm_states[PM_SUSPEND_MAX] = { |
31 | #ifdef CONFIG_EARLYSUSPEND | ||
32 | [PM_SUSPEND_ON] = "on", | ||
33 | #endif | ||
31 | [PM_SUSPEND_STANDBY] = "standby", | 34 | [PM_SUSPEND_STANDBY] = "standby", |
32 | [PM_SUSPEND_MEM] = "mem", | 35 | [PM_SUSPEND_MEM] = "mem", |
33 | }; | 36 | }; |
@@ -44,6 +47,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) | |||
44 | suspend_ops = ops; | 47 | suspend_ops = ops; |
45 | mutex_unlock(&pm_mutex); | 48 | mutex_unlock(&pm_mutex); |
46 | } | 49 | } |
50 | EXPORT_SYMBOL_GPL(suspend_set_ops); | ||
47 | 51 | ||
48 | bool valid_state(suspend_state_t state) | 52 | bool valid_state(suspend_state_t state) |
49 | { | 53 | { |
@@ -65,6 +69,7 @@ int suspend_valid_only_mem(suspend_state_t state) | |||
65 | { | 69 | { |
66 | return state == PM_SUSPEND_MEM; | 70 | return state == PM_SUSPEND_MEM; |
67 | } | 71 | } |
72 | EXPORT_SYMBOL_GPL(suspend_valid_only_mem); | ||
68 | 73 | ||
69 | static int suspend_test(int level) | 74 | static int suspend_test(int level) |
70 | { | 75 | { |
@@ -126,12 +131,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) | |||
126 | } | 131 | } |
127 | 132 | ||
128 | /** | 133 | /** |
129 | * suspend_enter - enter the desired system sleep state. | 134 | * suspend_enter - enter the desired system sleep state. |
130 | * @state: state to enter | 135 | * @state: State to enter |
136 | * @wakeup: Returns information that suspend should not be entered again. | ||
131 | * | 137 | * |
132 | * This function should be called after devices have been suspended. | 138 | * This function should be called after devices have been suspended. |
133 | */ | 139 | */ |
134 | static int suspend_enter(suspend_state_t state) | 140 | static int suspend_enter(suspend_state_t state, bool *wakeup) |
135 | { | 141 | { |
136 | int error; | 142 | int error; |
137 | 143 | ||
@@ -165,7 +171,8 @@ static int suspend_enter(suspend_state_t state) | |||
165 | 171 | ||
166 | error = syscore_suspend(); | 172 | error = syscore_suspend(); |
167 | if (!error) { | 173 | if (!error) { |
168 | if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { | 174 | *wakeup = pm_wakeup_pending(); |
175 | if (!(suspend_test(TEST_CORE) || *wakeup)) { | ||
169 | error = suspend_ops->enter(state); | 176 | error = suspend_ops->enter(state); |
170 | events_check_enabled = false; | 177 | events_check_enabled = false; |
171 | } | 178 | } |
@@ -199,6 +206,7 @@ static int suspend_enter(suspend_state_t state) | |||
199 | int suspend_devices_and_enter(suspend_state_t state) | 206 | int suspend_devices_and_enter(suspend_state_t state) |
200 | { | 207 | { |
201 | int error; | 208 | int error; |
209 | bool wakeup = false; | ||
202 | 210 | ||
203 | if (!suspend_ops) | 211 | if (!suspend_ops) |
204 | return -ENOSYS; | 212 | return -ENOSYS; |
@@ -220,7 +228,10 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
220 | if (suspend_test(TEST_DEVICES)) | 228 | if (suspend_test(TEST_DEVICES)) |
221 | goto Recover_platform; | 229 | goto Recover_platform; |
222 | 230 | ||
223 | error = suspend_enter(state); | 231 | do { |
232 | error = suspend_enter(state, &wakeup); | ||
233 | } while (!error && !wakeup | ||
234 | && suspend_ops->suspend_again && suspend_ops->suspend_again()); | ||
224 | 235 | ||
225 | Resume_devices: | 236 | Resume_devices: |
226 | suspend_test_start(); | 237 | suspend_test_start(); |
@@ -307,7 +318,7 @@ int enter_state(suspend_state_t state) | |||
307 | */ | 318 | */ |
308 | int pm_suspend(suspend_state_t state) | 319 | int pm_suspend(suspend_state_t state) |
309 | { | 320 | { |
310 | if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) | 321 | if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) |
311 | return enter_state(state); | 322 | return enter_state(state); |
312 | return -EINVAL; | 323 | return -EINVAL; |
313 | } | 324 | } |
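The new do/while loop in suspend_devices_and_enter() re-enters suspend whenever the platform's optional ->suspend_again() callback returns true and no wakeup is pending. A platform might wire that up roughly as below; everything named example_* is invented, while suspend_set_ops(), suspend_valid_only_mem() and the ->suspend_again hook come from this patch.

        /* Sketch of a platform using the ->suspend_again() hook added above. */
        #include <linux/init.h>
        #include <linux/suspend.h>

        static bool example_should_suspend_again(void)
        {
                /* e.g. true while only a periodic housekeeping event woke us
                 * and no real wakeup source has fired */
                return false;
        }

        static int example_suspend_enter(suspend_state_t state)
        {
                /* drop into the platform low-power state here */
                return 0;
        }

        static const struct platform_suspend_ops example_suspend_ops = {
                .valid         = suspend_valid_only_mem,
                .enter         = example_suspend_enter,
                .suspend_again = example_should_suspend_again,
        };

        static int __init example_pm_init(void)
        {
                suspend_set_ops(&example_suspend_ops);
                return 0;
        }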
diff --git a/kernel/power/suspend_time.c b/kernel/power/suspend_time.c new file mode 100644 index 00000000000..d2a65da9f22 --- /dev/null +++ b/kernel/power/suspend_time.c | |||
@@ -0,0 +1,111 @@ | |||
1 | /* | ||
2 | * debugfs file to track time spent in suspend | ||
3 | * | ||
4 | * Copyright (c) 2011, Google, Inc. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
14 | * more details. | ||
15 | */ | ||
16 | |||
17 | #include <linux/debugfs.h> | ||
18 | #include <linux/err.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/seq_file.h> | ||
22 | #include <linux/syscore_ops.h> | ||
23 | #include <linux/time.h> | ||
24 | |||
25 | static struct timespec suspend_time_before; | ||
26 | static unsigned int time_in_suspend_bins[32]; | ||
27 | |||
28 | #ifdef CONFIG_DEBUG_FS | ||
29 | static int suspend_time_debug_show(struct seq_file *s, void *data) | ||
30 | { | ||
31 | int bin; | ||
32 | seq_printf(s, "time (secs) count\n"); | ||
33 | seq_printf(s, "------------------\n"); | ||
34 | for (bin = 0; bin < 32; bin++) { | ||
35 | if (time_in_suspend_bins[bin] == 0) | ||
36 | continue; | ||
37 | seq_printf(s, "%4d - %4d %4u\n", | ||
38 | bin ? 1 << (bin - 1) : 0, 1 << bin, | ||
39 | time_in_suspend_bins[bin]); | ||
40 | } | ||
41 | return 0; | ||
42 | } | ||
43 | |||
44 | static int suspend_time_debug_open(struct inode *inode, struct file *file) | ||
45 | { | ||
46 | return single_open(file, suspend_time_debug_show, NULL); | ||
47 | } | ||
48 | |||
49 | static const struct file_operations suspend_time_debug_fops = { | ||
50 | .open = suspend_time_debug_open, | ||
51 | .read = seq_read, | ||
52 | .llseek = seq_lseek, | ||
53 | .release = single_release, | ||
54 | }; | ||
55 | |||
56 | static int __init suspend_time_debug_init(void) | ||
57 | { | ||
58 | struct dentry *d; | ||
59 | |||
60 | d = debugfs_create_file("suspend_time", 0755, NULL, NULL, | ||
61 | &suspend_time_debug_fops); | ||
62 | if (!d) { | ||
63 | pr_err("Failed to create suspend_time debug file\n"); | ||
64 | return -ENOMEM; | ||
65 | } | ||
66 | |||
67 | return 0; | ||
68 | } | ||
69 | |||
70 | late_initcall(suspend_time_debug_init); | ||
71 | #endif | ||
72 | |||
73 | static int suspend_time_syscore_suspend(void) | ||
74 | { | ||
75 | read_persistent_clock(&suspend_time_before); | ||
76 | |||
77 | return 0; | ||
78 | } | ||
79 | |||
80 | static void suspend_time_syscore_resume(void) | ||
81 | { | ||
82 | struct timespec after; | ||
83 | |||
84 | read_persistent_clock(&after); | ||
85 | |||
86 | after = timespec_sub(after, suspend_time_before); | ||
87 | |||
88 | time_in_suspend_bins[fls(after.tv_sec)]++; | ||
89 | |||
90 | pr_info("Suspended for %lu.%03lu seconds\n", after.tv_sec, | ||
91 | after.tv_nsec / NSEC_PER_MSEC); | ||
92 | } | ||
93 | |||
94 | static struct syscore_ops suspend_time_syscore_ops = { | ||
95 | .suspend = suspend_time_syscore_suspend, | ||
96 | .resume = suspend_time_syscore_resume, | ||
97 | }; | ||
98 | |||
99 | static int suspend_time_syscore_init(void) | ||
100 | { | ||
101 | register_syscore_ops(&suspend_time_syscore_ops); | ||
102 | |||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | static void suspend_time_syscore_exit(void) | ||
107 | { | ||
108 | unregister_syscore_ops(&suspend_time_syscore_ops); | ||
109 | } | ||
110 | module_init(suspend_time_syscore_init); | ||
111 | module_exit(suspend_time_syscore_exit); | ||
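The histogram above buckets each suspend interval by fls() of its length in seconds, i.e. into power-of-two ranges matching the "%4d - %4d" lines printed by suspend_time_debug_show(). A quick user-space check of that bucketing, using a portable stand-in for the kernel's fls():

        /* Demonstrates the power-of-two bucketing used by suspend_time.c above. */
        #include <stdio.h>

        static int fls_portable(unsigned long x)  /* stand-in for the kernel fls() */
        {
                int bit = 0;

                while (x) {
                        x >>= 1;
                        bit++;
                }
                return bit;
        }

        int main(void)
        {
                unsigned long secs[] = { 0, 1, 5, 75, 4000 };
                int i;

                for (i = 0; i < 5; i++) {
                        int bin = fls_portable(secs[i]);

                        printf("%5lu s -> bin %2d (%d - %d s)\n", secs[i], bin,
                               bin ? 1 << (bin - 1) : 0, 1 << bin);
                }
                return 0;
        }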
diff --git a/kernel/power/userwakelock.c b/kernel/power/userwakelock.c new file mode 100644 index 00000000000..a28a8db4146 --- /dev/null +++ b/kernel/power/userwakelock.c | |||
@@ -0,0 +1,219 @@ | |||
1 | /* kernel/power/userwakelock.c | ||
2 | * | ||
3 | * Copyright (C) 2005-2008 Google, Inc. | ||
4 | * | ||
5 | * This software is licensed under the terms of the GNU General Public | ||
6 | * License version 2, as published by the Free Software Foundation, and | ||
7 | * may be copied, distributed, and modified under those terms. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/ctype.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/wakelock.h> | ||
19 | #include <linux/slab.h> | ||
20 | |||
21 | #include "power.h" | ||
22 | |||
23 | enum { | ||
24 | DEBUG_FAILURE = BIT(0), | ||
25 | DEBUG_ERROR = BIT(1), | ||
26 | DEBUG_NEW = BIT(2), | ||
27 | DEBUG_ACCESS = BIT(3), | ||
28 | DEBUG_LOOKUP = BIT(4), | ||
29 | }; | ||
30 | static int debug_mask = DEBUG_FAILURE; | ||
31 | module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP); | ||
32 | |||
33 | static DEFINE_MUTEX(tree_lock); | ||
34 | |||
35 | struct user_wake_lock { | ||
36 | struct rb_node node; | ||
37 | struct wake_lock wake_lock; | ||
38 | char name[0]; | ||
39 | }; | ||
40 | struct rb_root user_wake_locks; | ||
41 | |||
42 | static struct user_wake_lock *lookup_wake_lock_name( | ||
43 | const char *buf, int allocate, long *timeoutptr) | ||
44 | { | ||
45 | struct rb_node **p = &user_wake_locks.rb_node; | ||
46 | struct rb_node *parent = NULL; | ||
47 | struct user_wake_lock *l; | ||
48 | int diff; | ||
49 | u64 timeout; | ||
50 | int name_len; | ||
51 | const char *arg; | ||
52 | |||
53 | /* Find length of lock name and start of optional timeout string */ | ||
54 | arg = buf; | ||
55 | while (*arg && !isspace(*arg)) | ||
56 | arg++; | ||
57 | name_len = arg - buf; | ||
58 | if (!name_len) | ||
59 | goto bad_arg; | ||
60 | while (isspace(*arg)) | ||
61 | arg++; | ||
62 | |||
63 | /* Process timeout string */ | ||
64 | if (timeoutptr && *arg) { | ||
65 | timeout = simple_strtoull(arg, (char **)&arg, 0); | ||
66 | while (isspace(*arg)) | ||
67 | arg++; | ||
68 | if (*arg) | ||
69 | goto bad_arg; | ||
70 | /* convert timeout from nanoseconds to jiffies > 0 */ | ||
71 | timeout += (NSEC_PER_SEC / HZ) - 1; | ||
72 | do_div(timeout, (NSEC_PER_SEC / HZ)); | ||
73 | if (timeout <= 0) | ||
74 | timeout = 1; | ||
75 | *timeoutptr = timeout; | ||
76 | } else if (*arg) | ||
77 | goto bad_arg; | ||
78 | else if (timeoutptr) | ||
79 | *timeoutptr = 0; | ||
80 | |||
81 | /* Lookup wake lock in rbtree */ | ||
82 | while (*p) { | ||
83 | parent = *p; | ||
84 | l = rb_entry(parent, struct user_wake_lock, node); | ||
85 | diff = strncmp(buf, l->name, name_len); | ||
86 | if (!diff && l->name[name_len]) | ||
87 | diff = -1; | ||
88 | if (debug_mask & DEBUG_ERROR) | ||
89 | pr_info("lookup_wake_lock_name: compare %.*s %s %d\n", | ||
90 | name_len, buf, l->name, diff); | ||
91 | |||
92 | if (diff < 0) | ||
93 | p = &(*p)->rb_left; | ||
94 | else if (diff > 0) | ||
95 | p = &(*p)->rb_right; | ||
96 | else | ||
97 | return l; | ||
98 | } | ||
99 | |||
100 | /* Allocate and add new wakelock to rbtree */ | ||
101 | if (!allocate) { | ||
102 | if (debug_mask & DEBUG_ERROR) | ||
103 | pr_info("lookup_wake_lock_name: %.*s not found\n", | ||
104 | name_len, buf); | ||
105 | return ERR_PTR(-EINVAL); | ||
106 | } | ||
107 | l = kzalloc(sizeof(*l) + name_len + 1, GFP_KERNEL); | ||
108 | if (l == NULL) { | ||
109 | if (debug_mask & DEBUG_FAILURE) | ||
110 | pr_err("lookup_wake_lock_name: failed to allocate " | ||
111 | "memory for %.*s\n", name_len, buf); | ||
112 | return ERR_PTR(-ENOMEM); | ||
113 | } | ||
114 | memcpy(l->name, buf, name_len); | ||
115 | if (debug_mask & DEBUG_NEW) | ||
116 | pr_info("lookup_wake_lock_name: new wake lock %s\n", l->name); | ||
117 | wake_lock_init(&l->wake_lock, WAKE_LOCK_SUSPEND, l->name); | ||
118 | rb_link_node(&l->node, parent, p); | ||
119 | rb_insert_color(&l->node, &user_wake_locks); | ||
120 | return l; | ||
121 | |||
122 | bad_arg: | ||
123 | if (debug_mask & DEBUG_ERROR) | ||
124 | pr_info("lookup_wake_lock_name: wake lock, %.*s, bad arg, %s\n", | ||
125 | name_len, buf, arg); | ||
126 | return ERR_PTR(-EINVAL); | ||
127 | } | ||
128 | |||
129 | ssize_t wake_lock_show( | ||
130 | struct kobject *kobj, struct kobj_attribute *attr, char *buf) | ||
131 | { | ||
132 | char *s = buf; | ||
133 | char *end = buf + PAGE_SIZE; | ||
134 | struct rb_node *n; | ||
135 | struct user_wake_lock *l; | ||
136 | |||
137 | mutex_lock(&tree_lock); | ||
138 | |||
139 | for (n = rb_first(&user_wake_locks); n != NULL; n = rb_next(n)) { | ||
140 | l = rb_entry(n, struct user_wake_lock, node); | ||
141 | if (wake_lock_active(&l->wake_lock)) | ||
142 | s += scnprintf(s, end - s, "%s ", l->name); | ||
143 | } | ||
144 | s += scnprintf(s, end - s, "\n"); | ||
145 | |||
146 | mutex_unlock(&tree_lock); | ||
147 | return (s - buf); | ||
148 | } | ||
149 | |||
150 | ssize_t wake_lock_store( | ||
151 | struct kobject *kobj, struct kobj_attribute *attr, | ||
152 | const char *buf, size_t n) | ||
153 | { | ||
154 | long timeout; | ||
155 | struct user_wake_lock *l; | ||
156 | |||
157 | mutex_lock(&tree_lock); | ||
158 | l = lookup_wake_lock_name(buf, 1, &timeout); | ||
159 | if (IS_ERR(l)) { | ||
160 | n = PTR_ERR(l); | ||
161 | goto bad_name; | ||
162 | } | ||
163 | |||
164 | if (debug_mask & DEBUG_ACCESS) | ||
165 | pr_info("wake_lock_store: %s, timeout %ld\n", l->name, timeout); | ||
166 | |||
167 | if (timeout) | ||
168 | wake_lock_timeout(&l->wake_lock, timeout); | ||
169 | else | ||
170 | wake_lock(&l->wake_lock); | ||
171 | bad_name: | ||
172 | mutex_unlock(&tree_lock); | ||
173 | return n; | ||
174 | } | ||
175 | |||
176 | |||
177 | ssize_t wake_unlock_show( | ||
178 | struct kobject *kobj, struct kobj_attribute *attr, char *buf) | ||
179 | { | ||
180 | char *s = buf; | ||
181 | char *end = buf + PAGE_SIZE; | ||
182 | struct rb_node *n; | ||
183 | struct user_wake_lock *l; | ||
184 | |||
185 | mutex_lock(&tree_lock); | ||
186 | |||
187 | for (n = rb_first(&user_wake_locks); n != NULL; n = rb_next(n)) { | ||
188 | l = rb_entry(n, struct user_wake_lock, node); | ||
189 | if (!wake_lock_active(&l->wake_lock)) | ||
190 | s += scnprintf(s, end - s, "%s ", l->name); | ||
191 | } | ||
192 | s += scnprintf(s, end - s, "\n"); | ||
193 | |||
194 | mutex_unlock(&tree_lock); | ||
195 | return (s - buf); | ||
196 | } | ||
197 | |||
198 | ssize_t wake_unlock_store( | ||
199 | struct kobject *kobj, struct kobj_attribute *attr, | ||
200 | const char *buf, size_t n) | ||
201 | { | ||
202 | struct user_wake_lock *l; | ||
203 | |||
204 | mutex_lock(&tree_lock); | ||
205 | l = lookup_wake_lock_name(buf, 0, NULL); | ||
206 | if (IS_ERR(l)) { | ||
207 | n = PTR_ERR(l); | ||
208 | goto not_found; | ||
209 | } | ||
210 | |||
211 | if (debug_mask & DEBUG_ACCESS) | ||
212 | pr_info("wake_unlock_store: %s\n", l->name); | ||
213 | |||
214 | wake_unlock(&l->wake_lock); | ||
215 | not_found: | ||
216 | mutex_unlock(&tree_lock); | ||
217 | return n; | ||
218 | } | ||
219 | |||
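The in-kernel half of this API (wake_lock_init(), wake_lock(), wake_lock_timeout(), wake_unlock()) is what drivers call directly, as the file above does for its user locks. A minimal, hypothetical driver-side sketch follows; the example_* names are invented, and wake_lock_destroy() is assumed to exist in the same header even though it is not shown in this hunk.

        /* Hypothetical driver holding a suspend wakelock around an operation,
         * using the same calls userwakelock.c above relies on. */
        #include <linux/jiffies.h>
        #include <linux/wakelock.h>

        static struct wake_lock example_wake_lock;

        static void example_setup(void)
        {
                wake_lock_init(&example_wake_lock, WAKE_LOCK_SUSPEND, "example");
        }

        static void example_start_io(void)
        {
                /* keep the system awake until example_finish_io() runs,
                 * or for at most five seconds */
                wake_lock_timeout(&example_wake_lock, 5 * HZ);
        }

        static void example_finish_io(void)
        {
                wake_unlock(&example_wake_lock);
        }

        static void example_teardown(void)
        {
                /* assumed counterpart to wake_lock_init(); not shown above */
                wake_lock_destroy(&example_wake_lock);
        }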
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c new file mode 100644 index 00000000000..81e1b7c65ca --- /dev/null +++ b/kernel/power/wakelock.c | |||
@@ -0,0 +1,634 @@ | |||
1 | /* kernel/power/wakelock.c | ||
2 | * | ||
3 | * Copyright (C) 2005-2008 Google, Inc. | ||
4 | * | ||
5 | * This software is licensed under the terms of the GNU General Public | ||
6 | * License version 2, as published by the Free Software Foundation, and | ||
7 | * may be copied, distributed, and modified under those terms. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/module.h> | ||
17 | #include <linux/platform_device.h> | ||
18 | #include <linux/rtc.h> | ||
19 | #include <linux/suspend.h> | ||
20 | #include <linux/syscalls.h> /* sys_sync */ | ||
21 | #include <linux/wakelock.h> | ||
22 | #ifdef CONFIG_WAKELOCK_STAT | ||
23 | #include <linux/proc_fs.h> | ||
24 | #endif | ||
25 | #include "power.h" | ||
26 | |||
27 | enum { | ||
28 | DEBUG_EXIT_SUSPEND = 1U << 0, | ||
29 | DEBUG_WAKEUP = 1U << 1, | ||
30 | DEBUG_SUSPEND = 1U << 2, | ||
31 | DEBUG_EXPIRE = 1U << 3, | ||
32 | DEBUG_WAKE_LOCK = 1U << 4, | ||
33 | }; | ||
34 | static int debug_mask = DEBUG_EXIT_SUSPEND | DEBUG_WAKEUP; | ||
35 | module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP); | ||
36 | |||
37 | #define WAKE_LOCK_TYPE_MASK (0x0f) | ||
38 | #define WAKE_LOCK_INITIALIZED (1U << 8) | ||
39 | #define WAKE_LOCK_ACTIVE (1U << 9) | ||
40 | #define WAKE_LOCK_AUTO_EXPIRE (1U << 10) | ||
41 | #define WAKE_LOCK_PREVENTING_SUSPEND (1U << 11) | ||
42 | |||
43 | static DEFINE_SPINLOCK(list_lock); | ||
44 | static LIST_HEAD(inactive_locks); | ||
45 | static struct list_head active_wake_locks[WAKE_LOCK_TYPE_COUNT]; | ||
46 | static int current_event_num; | ||
47 | struct workqueue_struct *suspend_work_queue; | ||
48 | struct wake_lock main_wake_lock; | ||
49 | suspend_state_t requested_suspend_state = PM_SUSPEND_MEM; | ||
50 | static struct wake_lock unknown_wakeup; | ||
51 | static struct wake_lock suspend_backoff_lock; | ||
52 | |||
53 | #define SUSPEND_BACKOFF_THRESHOLD 10 | ||
54 | #define SUSPEND_BACKOFF_INTERVAL 10000 | ||
55 | |||
56 | static unsigned suspend_short_count; | ||
57 | |||
58 | #ifdef CONFIG_WAKELOCK_STAT | ||
59 | static struct wake_lock deleted_wake_locks; | ||
60 | static ktime_t last_sleep_time_update; | ||
61 | static int wait_for_wakeup; | ||
62 | |||
63 | int get_expired_time(struct wake_lock *lock, ktime_t *expire_time) | ||
64 | { | ||
65 | struct timespec ts; | ||
66 | struct timespec kt; | ||
67 | struct timespec tomono; | ||
68 | struct timespec delta; | ||
69 | struct timespec sleep; | ||
70 | long timeout; | ||
71 | |||
72 | if (!(lock->flags & WAKE_LOCK_AUTO_EXPIRE)) | ||
73 | return 0; | ||
74 | get_xtime_and_monotonic_and_sleep_offset(&kt, &tomono, &sleep); | ||
75 | timeout = lock->expires - jiffies; | ||
76 | if (timeout > 0) | ||
77 | return 0; | ||
78 | jiffies_to_timespec(-timeout, &delta); | ||
79 | set_normalized_timespec(&ts, kt.tv_sec + tomono.tv_sec - delta.tv_sec, | ||
80 | kt.tv_nsec + tomono.tv_nsec - delta.tv_nsec); | ||
81 | *expire_time = timespec_to_ktime(ts); | ||
82 | return 1; | ||
83 | } | ||
84 | |||
85 | |||
86 | static int print_lock_stat(struct seq_file *m, struct wake_lock *lock) | ||
87 | { | ||
88 | int lock_count = lock->stat.count; | ||
89 | int expire_count = lock->stat.expire_count; | ||
90 | ktime_t active_time = ktime_set(0, 0); | ||
91 | ktime_t total_time = lock->stat.total_time; | ||
92 | ktime_t max_time = lock->stat.max_time; | ||
93 | |||
94 | ktime_t prevent_suspend_time = lock->stat.prevent_suspend_time; | ||
95 | if (lock->flags & WAKE_LOCK_ACTIVE) { | ||
96 | ktime_t now, add_time; | ||
97 | int expired = get_expired_time(lock, &now); | ||
98 | if (!expired) | ||
99 | now = ktime_get(); | ||
100 | add_time = ktime_sub(now, lock->stat.last_time); | ||
101 | lock_count++; | ||
102 | if (!expired) | ||
103 | active_time = add_time; | ||
104 | else | ||
105 | expire_count++; | ||
106 | total_time = ktime_add(total_time, add_time); | ||
107 | if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) | ||
108 | prevent_suspend_time = ktime_add(prevent_suspend_time, | ||
109 | ktime_sub(now, last_sleep_time_update)); | ||
110 | if (add_time.tv64 > max_time.tv64) | ||
111 | max_time = add_time; | ||
112 | } | ||
113 | |||
114 | return seq_printf(m, | ||
115 | "\"%s\"\t%d\t%d\t%d\t%lld\t%lld\t%lld\t%lld\t%lld\n", | ||
116 | lock->name, lock_count, expire_count, | ||
117 | lock->stat.wakeup_count, ktime_to_ns(active_time), | ||
118 | ktime_to_ns(total_time), | ||
119 | ktime_to_ns(prevent_suspend_time), ktime_to_ns(max_time), | ||
120 | ktime_to_ns(lock->stat.last_time)); | ||
121 | } | ||
122 | |||
123 | static int wakelock_stats_show(struct seq_file *m, void *unused) | ||
124 | { | ||
125 | unsigned long irqflags; | ||
126 | struct wake_lock *lock; | ||
127 | int ret; | ||
128 | int type; | ||
129 | |||
130 | spin_lock_irqsave(&list_lock, irqflags); | ||
131 | |||
132 | ret = seq_puts(m, "name\tcount\texpire_count\twake_count\tactive_since" | ||
133 | "\ttotal_time\tsleep_time\tmax_time\tlast_change\n"); | ||
134 | list_for_each_entry(lock, &inactive_locks, link) | ||
135 | ret = print_lock_stat(m, lock); | ||
136 | for (type = 0; type < WAKE_LOCK_TYPE_COUNT; type++) { | ||
137 | list_for_each_entry(lock, &active_wake_locks[type], link) | ||
138 | ret = print_lock_stat(m, lock); | ||
139 | } | ||
140 | spin_unlock_irqrestore(&list_lock, irqflags); | ||
141 | return 0; | ||
142 | } | ||
143 | |||
144 | static void wake_unlock_stat_locked(struct wake_lock *lock, int expired) | ||
145 | { | ||
146 | ktime_t duration; | ||
147 | ktime_t now; | ||
148 | if (!(lock->flags & WAKE_LOCK_ACTIVE)) | ||
149 | return; | ||
150 | if (get_expired_time(lock, &now)) | ||
151 | expired = 1; | ||
152 | else | ||
153 | now = ktime_get(); | ||
154 | lock->stat.count++; | ||
155 | if (expired) | ||
156 | lock->stat.expire_count++; | ||
157 | duration = ktime_sub(now, lock->stat.last_time); | ||
158 | lock->stat.total_time = ktime_add(lock->stat.total_time, duration); | ||
159 | if (ktime_to_ns(duration) > ktime_to_ns(lock->stat.max_time)) | ||
160 | lock->stat.max_time = duration; | ||
161 | lock->stat.last_time = ktime_get(); | ||
162 | if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) { | ||
163 | duration = ktime_sub(now, last_sleep_time_update); | ||
164 | lock->stat.prevent_suspend_time = ktime_add( | ||
165 | lock->stat.prevent_suspend_time, duration); | ||
166 | lock->flags &= ~WAKE_LOCK_PREVENTING_SUSPEND; | ||
167 | } | ||
168 | } | ||
169 | |||
170 | static void update_sleep_wait_stats_locked(int done) | ||
171 | { | ||
172 | struct wake_lock *lock; | ||
173 | ktime_t now, etime, elapsed, add; | ||
174 | int expired; | ||
175 | |||
176 | now = ktime_get(); | ||
177 | elapsed = ktime_sub(now, last_sleep_time_update); | ||
178 | list_for_each_entry(lock, &active_wake_locks[WAKE_LOCK_SUSPEND], link) { | ||
179 | expired = get_expired_time(lock, &etime); | ||
180 | if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) { | ||
181 | if (expired) | ||
182 | add = ktime_sub(etime, last_sleep_time_update); | ||
183 | else | ||
184 | add = elapsed; | ||
185 | lock->stat.prevent_suspend_time = ktime_add( | ||
186 | lock->stat.prevent_suspend_time, add); | ||
187 | } | ||
188 | if (done || expired) | ||
189 | lock->flags &= ~WAKE_LOCK_PREVENTING_SUSPEND; | ||
190 | else | ||
191 | lock->flags |= WAKE_LOCK_PREVENTING_SUSPEND; | ||
192 | } | ||
193 | last_sleep_time_update = now; | ||
194 | } | ||
195 | #endif | ||
196 | |||
197 | |||
198 | static void expire_wake_lock(struct wake_lock *lock) | ||
199 | { | ||
200 | #ifdef CONFIG_WAKELOCK_STAT | ||
201 | wake_unlock_stat_locked(lock, 1); | ||
202 | #endif | ||
203 | lock->flags &= ~(WAKE_LOCK_ACTIVE | WAKE_LOCK_AUTO_EXPIRE); | ||
204 | list_del(&lock->link); | ||
205 | list_add(&lock->link, &inactive_locks); | ||
206 | if (debug_mask & (DEBUG_WAKE_LOCK | DEBUG_EXPIRE)) | ||
207 | pr_info("expired wake lock %s\n", lock->name); | ||
208 | } | ||
209 | |||
210 | /* Caller must acquire the list_lock spinlock */ | ||
211 | static void print_active_locks(int type) | ||
212 | { | ||
213 | struct wake_lock *lock; | ||
214 | bool print_expired = true; | ||
215 | |||
216 | BUG_ON(type >= WAKE_LOCK_TYPE_COUNT); | ||
217 | list_for_each_entry(lock, &active_wake_locks[type], link) { | ||
218 | if (lock->flags & WAKE_LOCK_AUTO_EXPIRE) { | ||
219 | long timeout = lock->expires - jiffies; | ||
220 | if (timeout > 0) | ||
221 | pr_info("active wake lock %s, time left %ld\n", | ||
222 | lock->name, timeout); | ||
223 | else if (print_expired) | ||
224 | pr_info("wake lock %s, expired\n", lock->name); | ||
225 | } else { | ||
226 | pr_info("active wake lock %s\n", lock->name); | ||
227 | if (!(debug_mask & DEBUG_EXPIRE)) | ||
228 | print_expired = false; | ||
229 | } | ||
230 | } | ||
231 | } | ||
232 | |||
233 | static long has_wake_lock_locked(int type) | ||
234 | { | ||
235 | struct wake_lock *lock, *n; | ||
236 | long max_timeout = 0; | ||
237 | |||
238 | BUG_ON(type >= WAKE_LOCK_TYPE_COUNT); | ||
239 | list_for_each_entry_safe(lock, n, &active_wake_locks[type], link) { | ||
240 | if (lock->flags & WAKE_LOCK_AUTO_EXPIRE) { | ||
241 | long timeout = lock->expires - jiffies; | ||
242 | if (timeout <= 0) | ||
243 | expire_wake_lock(lock); | ||
244 | else if (timeout > max_timeout) | ||
245 | max_timeout = timeout; | ||
246 | } else | ||
247 | return -1; | ||
248 | } | ||
249 | return max_timeout; | ||
250 | } | ||
251 | |||
252 | long has_wake_lock(int type) | ||
253 | { | ||
254 | long ret; | ||
255 | unsigned long irqflags; | ||
256 | spin_lock_irqsave(&list_lock, irqflags); | ||
257 | ret = has_wake_lock_locked(type); | ||
258 | if (ret && (debug_mask & DEBUG_WAKEUP) && type == WAKE_LOCK_SUSPEND) | ||
259 | print_active_locks(type); | ||
260 | spin_unlock_irqrestore(&list_lock, irqflags); | ||
261 | return ret; | ||
262 | } | ||
263 | |||
264 | static void suspend_backoff(void) | ||
265 | { | ||
266 | pr_info("suspend: too many immediate wakeups, back off\n"); | ||
267 | wake_lock_timeout(&suspend_backoff_lock, | ||
268 | msecs_to_jiffies(SUSPEND_BACKOFF_INTERVAL)); | ||
269 | } | ||
270 | |||
271 | static void suspend(struct work_struct *work) | ||
272 | { | ||
273 | int ret; | ||
274 | int entry_event_num; | ||
275 | struct timespec ts_entry, ts_exit; | ||
276 | |||
277 | if (has_wake_lock(WAKE_LOCK_SUSPEND)) { | ||
278 | if (debug_mask & DEBUG_SUSPEND) | ||
279 | pr_info("suspend: abort suspend\n"); | ||
280 | return; | ||
281 | } | ||
282 | |||
283 | entry_event_num = current_event_num; | ||
284 | sys_sync(); | ||
285 | if (debug_mask & DEBUG_SUSPEND) | ||
286 | pr_info("suspend: enter suspend\n"); | ||
287 | getnstimeofday(&ts_entry); | ||
288 | ret = pm_suspend(requested_suspend_state); | ||
289 | getnstimeofday(&ts_exit); | ||
290 | |||
291 | if (debug_mask & DEBUG_EXIT_SUSPEND) { | ||
292 | struct rtc_time tm; | ||
293 | rtc_time_to_tm(ts_exit.tv_sec, &tm); | ||
294 | pr_info("suspend: exit suspend, ret = %d " | ||
295 | "(%d-%02d-%02d %02d:%02d:%02d.%09lu UTC)\n", ret, | ||
296 | tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, | ||
297 | tm.tm_hour, tm.tm_min, tm.tm_sec, ts_exit.tv_nsec); | ||
298 | } | ||
299 | |||
300 | if (ts_exit.tv_sec - ts_entry.tv_sec <= 1) { | ||
301 | ++suspend_short_count; | ||
302 | |||
303 | if (suspend_short_count == SUSPEND_BACKOFF_THRESHOLD) { | ||
304 | suspend_backoff(); | ||
305 | suspend_short_count = 0; | ||
306 | } | ||
307 | } else { | ||
308 | suspend_short_count = 0; | ||
309 | } | ||
310 | |||
311 | if (current_event_num == entry_event_num) { | ||
312 | if (debug_mask & DEBUG_SUSPEND) | ||
313 | pr_info("suspend: pm_suspend returned with no event\n"); | ||
314 | wake_lock_timeout(&unknown_wakeup, HZ / 2); | ||
315 | } | ||
316 | } | ||
317 | static DECLARE_WORK(suspend_work, suspend); | ||
318 | |||
319 | static void expire_wake_locks(unsigned long data) | ||
320 | { | ||
321 | long has_lock; | ||
322 | unsigned long irqflags; | ||
323 | if (debug_mask & DEBUG_EXPIRE) | ||
324 | pr_info("expire_wake_locks: start\n"); | ||
325 | spin_lock_irqsave(&list_lock, irqflags); | ||
326 | if (debug_mask & DEBUG_SUSPEND) | ||
327 | print_active_locks(WAKE_LOCK_SUSPEND); | ||
328 | has_lock = has_wake_lock_locked(WAKE_LOCK_SUSPEND); | ||
329 | if (debug_mask & DEBUG_EXPIRE) | ||
330 | pr_info("expire_wake_locks: done, has_lock %ld\n", has_lock); | ||
331 | if (has_lock == 0) | ||
332 | queue_work(suspend_work_queue, &suspend_work); | ||
333 | spin_unlock_irqrestore(&list_lock, irqflags); | ||
334 | } | ||
335 | static DEFINE_TIMER(expire_timer, expire_wake_locks, 0, 0); | ||
336 | |||
337 | static int power_suspend_late(struct device *dev) | ||
338 | { | ||
339 | int ret = has_wake_lock(WAKE_LOCK_SUSPEND) ? -EAGAIN : 0; | ||
340 | #ifdef CONFIG_WAKELOCK_STAT | ||
341 | wait_for_wakeup = !ret; | ||
342 | #endif | ||
343 | if (debug_mask & DEBUG_SUSPEND) | ||
344 | pr_info("power_suspend_late return %d\n", ret); | ||
345 | return ret; | ||
346 | } | ||
347 | |||
348 | static struct dev_pm_ops power_driver_pm_ops = { | ||
349 | .suspend_noirq = power_suspend_late, | ||
350 | }; | ||
351 | |||
352 | static struct platform_driver power_driver = { | ||
353 | .driver.name = "power", | ||
354 | .driver.pm = &power_driver_pm_ops, | ||
355 | }; | ||
356 | static struct platform_device power_device = { | ||
357 | .name = "power", | ||
358 | }; | ||
359 | |||
360 | void wake_lock_init(struct wake_lock *lock, int type, const char *name) | ||
361 | { | ||
362 | unsigned long irqflags = 0; | ||
363 | |||
364 | if (name) | ||
365 | lock->name = name; | ||
366 | BUG_ON(!lock->name); | ||
367 | |||
368 | if (debug_mask & DEBUG_WAKE_LOCK) | ||
369 | pr_info("wake_lock_init name=%s\n", lock->name); | ||
370 | #ifdef CONFIG_WAKELOCK_STAT | ||
371 | lock->stat.count = 0; | ||
372 | lock->stat.expire_count = 0; | ||
373 | lock->stat.wakeup_count = 0; | ||
374 | lock->stat.total_time = ktime_set(0, 0); | ||
375 | lock->stat.prevent_suspend_time = ktime_set(0, 0); | ||
376 | lock->stat.max_time = ktime_set(0, 0); | ||
377 | lock->stat.last_time = ktime_set(0, 0); | ||
378 | #endif | ||
379 | lock->flags = (type & WAKE_LOCK_TYPE_MASK) | WAKE_LOCK_INITIALIZED; | ||
380 | |||
381 | INIT_LIST_HEAD(&lock->link); | ||
382 | spin_lock_irqsave(&list_lock, irqflags); | ||
383 | list_add(&lock->link, &inactive_locks); | ||
384 | spin_unlock_irqrestore(&list_lock, irqflags); | ||
385 | } | ||
386 | EXPORT_SYMBOL(wake_lock_init); | ||
387 | |||
388 | void wake_lock_destroy(struct wake_lock *lock) | ||
389 | { | ||
390 | unsigned long irqflags; | ||
391 | if (debug_mask & DEBUG_WAKE_LOCK) | ||
392 | pr_info("wake_lock_destroy name=%s\n", lock->name); | ||
393 | spin_lock_irqsave(&list_lock, irqflags); | ||
394 | lock->flags &= ~WAKE_LOCK_INITIALIZED; | ||
395 | #ifdef CONFIG_WAKELOCK_STAT | ||
396 | if (lock->stat.count) { | ||
397 | deleted_wake_locks.stat.count += lock->stat.count; | ||
398 | deleted_wake_locks.stat.expire_count += lock->stat.expire_count; | ||
399 | deleted_wake_locks.stat.total_time = | ||
400 | ktime_add(deleted_wake_locks.stat.total_time, | ||
401 | lock->stat.total_time); | ||
402 | deleted_wake_locks.stat.prevent_suspend_time = | ||
403 | ktime_add(deleted_wake_locks.stat.prevent_suspend_time, | ||
404 | lock->stat.prevent_suspend_time); | ||
405 | deleted_wake_locks.stat.max_time = | ||
406 | ktime_add(deleted_wake_locks.stat.max_time, | ||
407 | lock->stat.max_time); | ||
408 | } | ||
409 | #endif | ||
410 | list_del(&lock->link); | ||
411 | spin_unlock_irqrestore(&list_lock, irqflags); | ||
412 | } | ||
413 | EXPORT_SYMBOL(wake_lock_destroy); | ||
414 | |||
415 | static void wake_lock_internal( | ||
416 | struct wake_lock *lock, long timeout, int has_timeout) | ||
417 | { | ||
418 | int type; | ||
419 | unsigned long irqflags; | ||
420 | long expire_in; | ||
421 | |||
422 | spin_lock_irqsave(&list_lock, irqflags); | ||
423 | type = lock->flags & WAKE_LOCK_TYPE_MASK; | ||
424 | BUG_ON(type >= WAKE_LOCK_TYPE_COUNT); | ||
425 | BUG_ON(!(lock->flags & WAKE_LOCK_INITIALIZED)); | ||
426 | #ifdef CONFIG_WAKELOCK_STAT | ||
427 | if (type == WAKE_LOCK_SUSPEND && wait_for_wakeup) { | ||
428 | if (debug_mask & DEBUG_WAKEUP) | ||
429 | pr_info("wakeup wake lock: %s\n", lock->name); | ||
430 | wait_for_wakeup = 0; | ||
431 | lock->stat.wakeup_count++; | ||
432 | } | ||
433 | if ((lock->flags & WAKE_LOCK_AUTO_EXPIRE) && | ||
434 | (long)(lock->expires - jiffies) <= 0) { | ||
435 | wake_unlock_stat_locked(lock, 0); | ||
436 | lock->stat.last_time = ktime_get(); | ||
437 | } | ||
438 | #endif | ||
439 | if (!(lock->flags & WAKE_LOCK_ACTIVE)) { | ||
440 | lock->flags |= WAKE_LOCK_ACTIVE; | ||
441 | #ifdef CONFIG_WAKELOCK_STAT | ||
442 | lock->stat.last_time = ktime_get(); | ||
443 | #endif | ||
444 | } | ||
445 | list_del(&lock->link); | ||
446 | if (has_timeout) { | ||
447 | if (debug_mask & DEBUG_WAKE_LOCK) | ||
448 | pr_info("wake_lock: %s, type %d, timeout %ld.%03lu\n", | ||
449 | lock->name, type, timeout / HZ, | ||
450 | (timeout % HZ) * MSEC_PER_SEC / HZ); | ||
451 | lock->expires = jiffies + timeout; | ||
452 | lock->flags |= WAKE_LOCK_AUTO_EXPIRE; | ||
453 | list_add_tail(&lock->link, &active_wake_locks[type]); | ||
454 | } else { | ||
455 | if (debug_mask & DEBUG_WAKE_LOCK) | ||
456 | pr_info("wake_lock: %s, type %d\n", lock->name, type); | ||
457 | lock->expires = LONG_MAX; | ||
458 | lock->flags &= ~WAKE_LOCK_AUTO_EXPIRE; | ||
459 | list_add(&lock->link, &active_wake_locks[type]); | ||
460 | } | ||
461 | if (type == WAKE_LOCK_SUSPEND) { | ||
462 | current_event_num++; | ||
463 | #ifdef CONFIG_WAKELOCK_STAT | ||
464 | if (lock == &main_wake_lock) | ||
465 | update_sleep_wait_stats_locked(1); | ||
466 | else if (!wake_lock_active(&main_wake_lock)) | ||
467 | update_sleep_wait_stats_locked(0); | ||
468 | #endif | ||
469 | if (has_timeout) | ||
470 | expire_in = has_wake_lock_locked(type); | ||
471 | else | ||
472 | expire_in = -1; | ||
473 | if (expire_in > 0) { | ||
474 | if (debug_mask & DEBUG_EXPIRE) | ||
475 | pr_info("wake_lock: %s, start expire timer, " | ||
476 | "%ld\n", lock->name, expire_in); | ||
477 | mod_timer(&expire_timer, jiffies + expire_in); | ||
478 | } else { | ||
479 | if (del_timer(&expire_timer)) | ||
480 | if (debug_mask & DEBUG_EXPIRE) | ||
481 | pr_info("wake_lock: %s, stop expire timer\n", | ||
482 | lock->name); | ||
483 | if (expire_in == 0) | ||
484 | queue_work(suspend_work_queue, &suspend_work); | ||
485 | } | ||
486 | } | ||
487 | spin_unlock_irqrestore(&list_lock, irqflags); | ||
488 | } | ||
489 | |||
490 | void wake_lock(struct wake_lock *lock) | ||
491 | { | ||
492 | wake_lock_internal(lock, 0, 0); | ||
493 | } | ||
494 | EXPORT_SYMBOL(wake_lock); | ||
495 | |||
496 | void wake_lock_timeout(struct wake_lock *lock, long timeout) | ||
497 | { | ||
498 | wake_lock_internal(lock, timeout, 1); | ||
499 | } | ||
500 | EXPORT_SYMBOL(wake_lock_timeout); | ||
501 | |||
502 | void wake_unlock(struct wake_lock *lock) | ||
503 | { | ||
504 | int type; | ||
505 | unsigned long irqflags; | ||
506 | spin_lock_irqsave(&list_lock, irqflags); | ||
507 | type = lock->flags & WAKE_LOCK_TYPE_MASK; | ||
508 | #ifdef CONFIG_WAKELOCK_STAT | ||
509 | wake_unlock_stat_locked(lock, 0); | ||
510 | #endif | ||
511 | if (debug_mask & DEBUG_WAKE_LOCK) | ||
512 | pr_info("wake_unlock: %s\n", lock->name); | ||
513 | lock->flags &= ~(WAKE_LOCK_ACTIVE | WAKE_LOCK_AUTO_EXPIRE); | ||
514 | list_del(&lock->link); | ||
515 | list_add(&lock->link, &inactive_locks); | ||
516 | if (type == WAKE_LOCK_SUSPEND) { | ||
517 | long has_lock = has_wake_lock_locked(type); | ||
518 | if (has_lock > 0) { | ||
519 | if (debug_mask & DEBUG_EXPIRE) | ||
520 | pr_info("wake_unlock: %s, start expire timer, " | ||
521 | "%ld\n", lock->name, has_lock); | ||
522 | mod_timer(&expire_timer, jiffies + has_lock); | ||
523 | } else { | ||
524 | if (del_timer(&expire_timer)) | ||
525 | if (debug_mask & DEBUG_EXPIRE) | ||
526 | pr_info("wake_unlock: %s, stop expire " | ||
527 | "timer\n", lock->name); | ||
528 | if (has_lock == 0) | ||
529 | queue_work(suspend_work_queue, &suspend_work); | ||
530 | } | ||
531 | if (lock == &main_wake_lock) { | ||
532 | if (debug_mask & DEBUG_SUSPEND) | ||
533 | print_active_locks(WAKE_LOCK_SUSPEND); | ||
534 | #ifdef CONFIG_WAKELOCK_STAT | ||
535 | update_sleep_wait_stats_locked(0); | ||
536 | #endif | ||
537 | } | ||
538 | } | ||
539 | spin_unlock_irqrestore(&list_lock, irqflags); | ||
540 | } | ||
541 | EXPORT_SYMBOL(wake_unlock); | ||
542 | |||
543 | int wake_lock_active(struct wake_lock *lock) | ||
544 | { | ||
545 | return !!(lock->flags & WAKE_LOCK_ACTIVE); | ||
546 | } | ||
547 | EXPORT_SYMBOL(wake_lock_active); | ||
548 | |||
549 | static int wakelock_stats_open(struct inode *inode, struct file *file) | ||
550 | { | ||
551 | return single_open(file, wakelock_stats_show, NULL); | ||
552 | } | ||
553 | |||
554 | static const struct file_operations wakelock_stats_fops = { | ||
555 | .owner = THIS_MODULE, | ||
556 | .open = wakelock_stats_open, | ||
557 | .read = seq_read, | ||
558 | .llseek = seq_lseek, | ||
559 | .release = single_release, | ||
560 | }; | ||
561 | |||
562 | static int __init wakelocks_init(void) | ||
563 | { | ||
564 | int ret; | ||
565 | int i; | ||
566 | |||
567 | for (i = 0; i < ARRAY_SIZE(active_wake_locks); i++) | ||
568 | INIT_LIST_HEAD(&active_wake_locks[i]); | ||
569 | |||
570 | #ifdef CONFIG_WAKELOCK_STAT | ||
571 | wake_lock_init(&deleted_wake_locks, WAKE_LOCK_SUSPEND, | ||
572 | "deleted_wake_locks"); | ||
573 | #endif | ||
574 | wake_lock_init(&main_wake_lock, WAKE_LOCK_SUSPEND, "main"); | ||
575 | wake_lock(&main_wake_lock); | ||
576 | wake_lock_init(&unknown_wakeup, WAKE_LOCK_SUSPEND, "unknown_wakeups"); | ||
577 | wake_lock_init(&suspend_backoff_lock, WAKE_LOCK_SUSPEND, | ||
578 | "suspend_backoff"); | ||
579 | |||
580 | ret = platform_device_register(&power_device); | ||
581 | if (ret) { | ||
582 | pr_err("wakelocks_init: platform_device_register failed\n"); | ||
583 | goto err_platform_device_register; | ||
584 | } | ||
585 | ret = platform_driver_register(&power_driver); | ||
586 | if (ret) { | ||
587 | pr_err("wakelocks_init: platform_driver_register failed\n"); | ||
588 | goto err_platform_driver_register; | ||
589 | } | ||
590 | |||
591 | suspend_work_queue = create_singlethread_workqueue("suspend"); | ||
592 | if (suspend_work_queue == NULL) { | ||
593 | ret = -ENOMEM; | ||
594 | goto err_suspend_work_queue; | ||
595 | } | ||
596 | |||
597 | #ifdef CONFIG_WAKELOCK_STAT | ||
598 | proc_create("wakelocks", S_IRUGO, NULL, &wakelock_stats_fops); | ||
599 | #endif | ||
600 | |||
601 | return 0; | ||
602 | |||
603 | err_suspend_work_queue: | ||
604 | platform_driver_unregister(&power_driver); | ||
605 | err_platform_driver_register: | ||
606 | platform_device_unregister(&power_device); | ||
607 | err_platform_device_register: | ||
608 | wake_lock_destroy(&suspend_backoff_lock); | ||
609 | wake_lock_destroy(&unknown_wakeup); | ||
610 | wake_lock_destroy(&main_wake_lock); | ||
611 | #ifdef CONFIG_WAKELOCK_STAT | ||
612 | wake_lock_destroy(&deleted_wake_locks); | ||
613 | #endif | ||
614 | return ret; | ||
615 | } | ||
616 | |||
617 | static void __exit wakelocks_exit(void) | ||
618 | { | ||
619 | #ifdef CONFIG_WAKELOCK_STAT | ||
620 | remove_proc_entry("wakelocks", NULL); | ||
621 | #endif | ||
622 | destroy_workqueue(suspend_work_queue); | ||
623 | platform_driver_unregister(&power_driver); | ||
624 | platform_device_unregister(&power_device); | ||
625 | wake_lock_destroy(&suspend_backoff_lock); | ||
626 | wake_lock_destroy(&unknown_wakeup); | ||
627 | wake_lock_destroy(&main_wake_lock); | ||
628 | #ifdef CONFIG_WAKELOCK_STAT | ||
629 | wake_lock_destroy(&deleted_wake_locks); | ||
630 | #endif | ||
631 | } | ||
632 | |||
633 | core_initcall(wakelocks_init); | ||
634 | module_exit(wakelocks_exit); | ||
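The file above exports wake_lock_init(), wake_lock(), wake_lock_timeout(), wake_unlock() and wake_lock_destroy() for in-kernel users. A minimal sketch of how a driver might hold off suspend around event handling with that API; the lock name, the two-second timeout and the example_* wrappers are illustrative, not taken from this patch.

#include <linux/jiffies.h>
#include <linux/wakelock.h>

static struct wake_lock example_lock;

static void example_probe(void)
{
        wake_lock_init(&example_lock, WAKE_LOCK_SUSPEND, "example_events");
}

static void example_handle_event(void)
{
        /*
         * Keep the system awake while the event is processed; the lock
         * auto-expires after two seconds if wake_unlock() is never reached.
         */
        wake_lock_timeout(&example_lock, 2 * HZ);
        /* ... process the wakeup event ... */
        wake_unlock(&example_lock);
}

static void example_remove(void)
{
        wake_lock_destroy(&example_lock);
}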
diff --git a/kernel/printk.c b/kernel/printk.c index b799a2ee96e..cbebc142be1 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -53,6 +53,10 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) | |||
53 | 53 | ||
54 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | 54 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) |
55 | 55 | ||
56 | #ifdef CONFIG_DEBUG_LL | ||
57 | extern void printascii(char *); | ||
58 | #endif | ||
59 | |||
56 | /* printk's without a loglevel use this.. */ | 60 | /* printk's without a loglevel use this.. */ |
57 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | 61 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL |
58 | 62 | ||
@@ -297,6 +301,53 @@ static inline void boot_delay_msec(void) | |||
297 | } | 301 | } |
298 | #endif | 302 | #endif |
299 | 303 | ||
304 | /* | ||
305 | * Return the number of unread characters in the log buffer. | ||
306 | */ | ||
307 | static int log_buf_get_len(void) | ||
308 | { | ||
309 | return logged_chars; | ||
310 | } | ||
311 | |||
312 | /* | ||
313 | * Clears the ring-buffer | ||
314 | */ | ||
315 | void log_buf_clear(void) | ||
316 | { | ||
317 | logged_chars = 0; | ||
318 | } | ||
319 | |||
320 | /* | ||
321 | * Copy a range of characters from the log buffer. | ||
322 | */ | ||
323 | int log_buf_copy(char *dest, int idx, int len) | ||
324 | { | ||
325 | int ret, max; | ||
326 | bool took_lock = false; | ||
327 | |||
328 | if (!oops_in_progress) { | ||
329 | spin_lock_irq(&logbuf_lock); | ||
330 | took_lock = true; | ||
331 | } | ||
332 | |||
333 | max = log_buf_get_len(); | ||
334 | if (idx < 0 || idx >= max) { | ||
335 | ret = -1; | ||
336 | } else { | ||
337 | if (len > max - idx) | ||
338 | len = max - idx; | ||
339 | ret = len; | ||
340 | idx += (log_end - max); | ||
341 | while (len-- > 0) | ||
342 | dest[len] = LOG_BUF(idx + len); | ||
343 | } | ||
344 | |||
345 | if (took_lock) | ||
346 | spin_unlock_irq(&logbuf_lock); | ||
347 | |||
348 | return ret; | ||
349 | } | ||
350 | |||
300 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT | 351 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT |
301 | int dmesg_restrict = 1; | 352 | int dmesg_restrict = 1; |
302 | #else | 353 | #else |
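log_buf_copy() above copies up to len characters starting at offset idx and returns the number copied, or -1 once idx is past the end of the logged data. A short in-kernel sketch of draining the buffer with it; the 256-byte chunk size and the consume_chunk() callback are illustrative.

static void example_drain_log(void (*consume_chunk)(const char *, int))
{
        char chunk[256];
        int idx = 0;
        int n;

        while ((n = log_buf_copy(chunk, idx, sizeof(chunk))) > 0) {
                consume_chunk(chunk, n); /* n bytes valid, not NUL-terminated */
                idx += n;
        }
}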
@@ -325,8 +376,10 @@ static int check_syslog_permissions(int type, bool from_file) | |||
325 | return 0; | 376 | return 0; |
326 | /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ | 377 | /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ |
327 | if (capable(CAP_SYS_ADMIN)) { | 378 | if (capable(CAP_SYS_ADMIN)) { |
328 | WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " | 379 | printk_once(KERN_WARNING "%s (%d): " |
329 | "but no CAP_SYSLOG (deprecated).\n"); | 380 | "Attempt to access syslog with CAP_SYS_ADMIN " |
381 | "but no CAP_SYSLOG (deprecated).\n", | ||
382 | current->comm, task_pid_nr(current)); | ||
330 | return 0; | 383 | return 0; |
331 | } | 384 | } |
332 | return -EPERM; | 385 | return -EPERM; |
@@ -789,7 +842,7 @@ static inline int can_use_console(unsigned int cpu) | |||
789 | static int console_trylock_for_printk(unsigned int cpu) | 842 | static int console_trylock_for_printk(unsigned int cpu) |
790 | __releases(&logbuf_lock) | 843 | __releases(&logbuf_lock) |
791 | { | 844 | { |
792 | int retval = 0; | 845 | int retval = 0, wake = 0; |
793 | 846 | ||
794 | if (console_trylock()) { | 847 | if (console_trylock()) { |
795 | retval = 1; | 848 | retval = 1; |
@@ -802,12 +855,14 @@ static int console_trylock_for_printk(unsigned int cpu) | |||
802 | */ | 855 | */ |
803 | if (!can_use_console(cpu)) { | 856 | if (!can_use_console(cpu)) { |
804 | console_locked = 0; | 857 | console_locked = 0; |
805 | up(&console_sem); | 858 | wake = 1; |
806 | retval = 0; | 859 | retval = 0; |
807 | } | 860 | } |
808 | } | 861 | } |
809 | printk_cpu = UINT_MAX; | 862 | printk_cpu = UINT_MAX; |
810 | spin_unlock(&logbuf_lock); | 863 | spin_unlock(&logbuf_lock); |
864 | if (wake) | ||
865 | up(&console_sem); | ||
811 | return retval; | 866 | return retval; |
812 | } | 867 | } |
813 | static const char recursion_bug_msg [] = | 868 | static const char recursion_bug_msg [] = |
@@ -882,6 +937,10 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
882 | if (trace_override && !trace_recurse) | 937 | if (trace_override && !trace_recurse) |
883 | TRACE("%s", printk_buf); | 938 | TRACE("%s", printk_buf); |
884 | 939 | ||
940 | #ifdef CONFIG_DEBUG_LL | ||
941 | printascii(printk_buf); | ||
942 | #endif | ||
943 | |||
885 | p = printk_buf; | 944 | p = printk_buf; |
886 | 945 | ||
887 | /* Read log level and handle special printk prefix */ | 946 | /* Read log level and handle special printk prefix */ |
@@ -1156,7 +1215,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, | |||
1156 | switch (action) { | 1215 | switch (action) { |
1157 | case CPU_ONLINE: | 1216 | case CPU_ONLINE: |
1158 | case CPU_DEAD: | 1217 | case CPU_DEAD: |
1159 | case CPU_DYING: | ||
1160 | case CPU_DOWN_FAILED: | 1218 | case CPU_DOWN_FAILED: |
1161 | case CPU_UP_CANCELED: | 1219 | case CPU_UP_CANCELED: |
1162 | console_lock(); | 1220 | console_lock(); |
@@ -1252,7 +1310,7 @@ void console_unlock(void) | |||
1252 | { | 1310 | { |
1253 | unsigned long flags; | 1311 | unsigned long flags; |
1254 | unsigned _con_start, _log_end; | 1312 | unsigned _con_start, _log_end; |
1255 | unsigned wake_klogd = 0; | 1313 | unsigned wake_klogd = 0, retry = 0; |
1256 | 1314 | ||
1257 | if (console_suspended) { | 1315 | if (console_suspended) { |
1258 | up(&console_sem); | 1316 | up(&console_sem); |
@@ -1261,6 +1319,7 @@ void console_unlock(void) | |||
1261 | 1319 | ||
1262 | console_may_schedule = 0; | 1320 | console_may_schedule = 0; |
1263 | 1321 | ||
1322 | again: | ||
1264 | for ( ; ; ) { | 1323 | for ( ; ; ) { |
1265 | spin_lock_irqsave(&logbuf_lock, flags); | 1324 | spin_lock_irqsave(&logbuf_lock, flags); |
1266 | wake_klogd |= log_start - log_end; | 1325 | wake_klogd |= log_start - log_end; |
@@ -1281,8 +1340,23 @@ void console_unlock(void) | |||
1281 | if (unlikely(exclusive_console)) | 1340 | if (unlikely(exclusive_console)) |
1282 | exclusive_console = NULL; | 1341 | exclusive_console = NULL; |
1283 | 1342 | ||
1343 | spin_unlock(&logbuf_lock); | ||
1344 | |||
1284 | up(&console_sem); | 1345 | up(&console_sem); |
1346 | |||
1347 | /* | ||
1348 | * Someone could have filled up the buffer again, so re-check if there's | ||
1349 | * something to flush. In case we cannot trylock the console_sem again, | ||
1350 | * there's a new owner and the console_unlock() from them will do the | ||
1351 | * flush, no worries. | ||
1352 | */ | ||
1353 | spin_lock(&logbuf_lock); | ||
1354 | if (con_start != log_end) | ||
1355 | retry = 1; | ||
1285 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1356 | spin_unlock_irqrestore(&logbuf_lock, flags); |
1357 | if (retry && console_trylock()) | ||
1358 | goto again; | ||
1359 | |||
1286 | if (wake_klogd) | 1360 | if (wake_klogd) |
1287 | wake_up_klogd(); | 1361 | wake_up_klogd(); |
1288 | } | 1362 | } |
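The retry added to console_unlock() closes the window where a printer fills the buffer between the final flush and the release of console_sem. A simplified user-space analogue of the same pattern, using one mutex for console ownership and an atomic counter in place of the log buffer; this is a sketch of the idea, not the kernel's actual locking.

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t console_owner = PTHREAD_MUTEX_INITIALIZER;
static atomic_int records;      /* records queued by concurrent printers */

/* Only the console owner calls this, so it is the sole decrementer. */
static void flush_records(void)
{
        while (atomic_load(&records) > 0)
                atomic_fetch_sub(&records, 1);  /* "print" one record */
}

/* Caller owns console_owner on entry; ownership is dropped on return. */
void console_release(void)
{
again:
        flush_records();
        pthread_mutex_unlock(&console_owner);

        /*
         * A printer may have queued records after the final flush but
         * before the unlock.  Retry only if ownership can be re-taken;
         * otherwise the new owner flushes them from its own release path.
         */
        if (atomic_load(&records) > 0 &&
            pthread_mutex_trylock(&console_owner) == 0)
                goto again;
}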
@@ -1594,7 +1668,7 @@ static int __init printk_late_init(void) | |||
1594 | struct console *con; | 1668 | struct console *con; |
1595 | 1669 | ||
1596 | for_each_console(con) { | 1670 | for_each_console(con) { |
1597 | if (con->flags & CON_BOOT) { | 1671 | if (!keep_bootcon && con->flags & CON_BOOT) { |
1598 | printk(KERN_INFO "turn off boot console %s%d\n", | 1672 | printk(KERN_INFO "turn off boot console %s%d\n", |
1599 | con->name, con->index); | 1673 | con->name, con->index); |
1600 | unregister_console(con); | 1674 | unregister_console(con); |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2df115790cd..67d1fdd3c55 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -23,8 +23,15 @@ | |||
23 | #include <linux/uaccess.h> | 23 | #include <linux/uaccess.h> |
24 | #include <linux/regset.h> | 24 | #include <linux/regset.h> |
25 | #include <linux/hw_breakpoint.h> | 25 | #include <linux/hw_breakpoint.h> |
26 | #include <linux/cn_proc.h> | ||
26 | 27 | ||
27 | 28 | ||
29 | static int ptrace_trapping_sleep_fn(void *flags) | ||
30 | { | ||
31 | schedule(); | ||
32 | return 0; | ||
33 | } | ||
34 | |||
28 | /* | 35 | /* |
29 | * ptrace a task: make the debugger its new parent and | 36 | * ptrace a task: make the debugger its new parent and |
30 | * move it to the ptrace list. | 37 | * move it to the ptrace list. |
@@ -77,13 +84,31 @@ void __ptrace_unlink(struct task_struct *child) | |||
77 | spin_lock(&child->sighand->siglock); | 84 | spin_lock(&child->sighand->siglock); |
78 | 85 | ||
79 | /* | 86 | /* |
80 | * Reinstate GROUP_STOP_PENDING if group stop is in effect and | 87 | * Clear all pending traps and TRAPPING. TRAPPING should be |
88 | * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. | ||
89 | */ | ||
90 | task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK); | ||
91 | task_clear_jobctl_trapping(child); | ||
92 | |||
93 | /* | ||
94 | * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and | ||
81 | * @child isn't dead. | 95 | * @child isn't dead. |
82 | */ | 96 | */ |
83 | if (!(child->flags & PF_EXITING) && | 97 | if (!(child->flags & PF_EXITING) && |
84 | (child->signal->flags & SIGNAL_STOP_STOPPED || | 98 | (child->signal->flags & SIGNAL_STOP_STOPPED || |
85 | child->signal->group_stop_count)) | 99 | child->signal->group_stop_count)) { |
86 | child->group_stop |= GROUP_STOP_PENDING; | 100 | child->jobctl |= JOBCTL_STOP_PENDING; |
101 | |||
102 | /* | ||
103 | * This is only possible if this thread was cloned by the | ||
104 | * traced task running in the stopped group, set the signal | ||
105 | * for the future reports. | ||
106 | * FIXME: we should change ptrace_init_task() to handle this | ||
107 | * case. | ||
108 | */ | ||
109 | if (!(child->jobctl & JOBCTL_STOP_SIGMASK)) | ||
110 | child->jobctl |= SIGSTOP; | ||
111 | } | ||
87 | 112 | ||
88 | /* | 113 | /* |
89 | * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick | 114 | * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick |
@@ -91,16 +116,30 @@ void __ptrace_unlink(struct task_struct *child) | |||
91 | * is in TASK_TRACED; otherwise, we might unduly disrupt | 116 | * is in TASK_TRACED; otherwise, we might unduly disrupt |
92 | * TASK_KILLABLE sleeps. | 117 | * TASK_KILLABLE sleeps. |
93 | */ | 118 | */ |
94 | if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) | 119 | if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) |
95 | signal_wake_up(child, task_is_traced(child)); | 120 | signal_wake_up(child, task_is_traced(child)); |
96 | 121 | ||
97 | spin_unlock(&child->sighand->siglock); | 122 | spin_unlock(&child->sighand->siglock); |
98 | } | 123 | } |
99 | 124 | ||
100 | /* | 125 | /** |
101 | * Check that we have indeed attached to the thing.. | 126 | * ptrace_check_attach - check whether ptracee is ready for ptrace operation |
127 | * @child: ptracee to check for | ||
128 | * @ignore_state: don't check whether @child is currently %TASK_TRACED | ||
129 | * | ||
130 | * Check whether @child is being ptraced by %current and ready for further | ||
131 | * ptrace operations. If @ignore_state is %false, @child also should be in | ||
132 | * %TASK_TRACED state and on return the child is guaranteed to be traced | ||
133 | * and not executing. If @ignore_state is %true, @child can be in any | ||
134 | * state. | ||
135 | * | ||
136 | * CONTEXT: | ||
137 | * Grabs and releases tasklist_lock and @child->sighand->siglock. | ||
138 | * | ||
139 | * RETURNS: | ||
140 | * 0 on success, -ESRCH if %child is not ready. | ||
102 | */ | 141 | */ |
103 | int ptrace_check_attach(struct task_struct *child, int kill) | 142 | int ptrace_check_attach(struct task_struct *child, bool ignore_state) |
104 | { | 143 | { |
105 | int ret = -ESRCH; | 144 | int ret = -ESRCH; |
106 | 145 | ||
@@ -119,13 +158,14 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
119 | */ | 158 | */ |
120 | spin_lock_irq(&child->sighand->siglock); | 159 | spin_lock_irq(&child->sighand->siglock); |
121 | WARN_ON_ONCE(task_is_stopped(child)); | 160 | WARN_ON_ONCE(task_is_stopped(child)); |
122 | if (task_is_traced(child) || kill) | 161 | if (ignore_state || (task_is_traced(child) && |
162 | !(child->jobctl & JOBCTL_LISTENING))) | ||
123 | ret = 0; | 163 | ret = 0; |
124 | spin_unlock_irq(&child->sighand->siglock); | 164 | spin_unlock_irq(&child->sighand->siglock); |
125 | } | 165 | } |
126 | read_unlock(&tasklist_lock); | 166 | read_unlock(&tasklist_lock); |
127 | 167 | ||
128 | if (!ret && !kill) | 168 | if (!ret && !ignore_state) |
129 | ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; | 169 | ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; |
130 | 170 | ||
131 | /* All systems go.. */ | 171 | /* All systems go.. */ |
@@ -182,11 +222,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
182 | return !err; | 222 | return !err; |
183 | } | 223 | } |
184 | 224 | ||
185 | static int ptrace_attach(struct task_struct *task) | 225 | static int ptrace_attach(struct task_struct *task, long request, |
226 | unsigned long flags) | ||
186 | { | 227 | { |
187 | bool wait_trap = false; | 228 | bool seize = (request == PTRACE_SEIZE); |
188 | int retval; | 229 | int retval; |
189 | 230 | ||
231 | /* | ||
232 | * SEIZE will enable new ptrace behaviors which will be implemented | ||
233 | * gradually. SEIZE_DEVEL is used to prevent applications | ||
234 | * expecting full SEIZE behaviors trapping on kernel commits which | ||
235 | * are still in the process of implementing them. | ||
236 | * | ||
237 | * Only test programs for new ptrace behaviors being implemented | ||
238 | * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. | ||
239 | * | ||
240 | * Once SEIZE behaviors are completely implemented, this flag and | ||
241 | * the following test will be removed. | ||
242 | */ | ||
243 | retval = -EIO; | ||
244 | if (seize && !(flags & PTRACE_SEIZE_DEVEL)) | ||
245 | goto out; | ||
246 | |||
190 | audit_ptrace(task); | 247 | audit_ptrace(task); |
191 | 248 | ||
192 | retval = -EPERM; | 249 | retval = -EPERM; |
@@ -218,16 +275,21 @@ static int ptrace_attach(struct task_struct *task) | |||
218 | goto unlock_tasklist; | 275 | goto unlock_tasklist; |
219 | 276 | ||
220 | task->ptrace = PT_PTRACED; | 277 | task->ptrace = PT_PTRACED; |
278 | if (seize) | ||
279 | task->ptrace |= PT_SEIZED; | ||
221 | if (task_ns_capable(task, CAP_SYS_PTRACE)) | 280 | if (task_ns_capable(task, CAP_SYS_PTRACE)) |
222 | task->ptrace |= PT_PTRACE_CAP; | 281 | task->ptrace |= PT_PTRACE_CAP; |
223 | 282 | ||
224 | __ptrace_link(task, current); | 283 | __ptrace_link(task, current); |
225 | send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); | 284 | |
285 | /* SEIZE doesn't trap tracee on attach */ | ||
286 | if (!seize) | ||
287 | send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); | ||
226 | 288 | ||
227 | spin_lock(&task->sighand->siglock); | 289 | spin_lock(&task->sighand->siglock); |
228 | 290 | ||
229 | /* | 291 | /* |
230 | * If the task is already STOPPED, set GROUP_STOP_PENDING and | 292 | * If the task is already STOPPED, set JOBCTL_TRAP_STOP and |
231 | * TRAPPING, and kick it so that it transits to TRACED. TRAPPING | 293 | * TRAPPING, and kick it so that it transits to TRACED. TRAPPING |
232 | * will be cleared if the child completes the transition or any | 294 | * will be cleared if the child completes the transition or any |
233 | * event which clears the group stop states happens. We'll wait | 295 | * event which clears the group stop states happens. We'll wait |
@@ -243,11 +305,9 @@ static int ptrace_attach(struct task_struct *task) | |||
243 | * The following task_is_stopped() test is safe as both transitions | 305 | * The following task_is_stopped() test is safe as both transitions |
244 | * in and out of STOPPED are protected by siglock. | 306 | * in and out of STOPPED are protected by siglock. |
245 | */ | 307 | */ |
246 | if (task_is_stopped(task)) { | 308 | if (task_is_stopped(task) && |
247 | task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; | 309 | task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) |
248 | signal_wake_up(task, 1); | 310 | signal_wake_up(task, 1); |
249 | wait_trap = true; | ||
250 | } | ||
251 | 311 | ||
252 | spin_unlock(&task->sighand->siglock); | 312 | spin_unlock(&task->sighand->siglock); |
253 | 313 | ||
@@ -257,9 +317,12 @@ unlock_tasklist: | |||
257 | unlock_creds: | 317 | unlock_creds: |
258 | mutex_unlock(&task->signal->cred_guard_mutex); | 318 | mutex_unlock(&task->signal->cred_guard_mutex); |
259 | out: | 319 | out: |
260 | if (wait_trap) | 320 | if (!retval) { |
261 | wait_event(current->signal->wait_chldexit, | 321 | wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, |
262 | !(task->group_stop & GROUP_STOP_TRAPPING)); | 322 | ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); |
323 | proc_ptrace_connector(task, PTRACE_ATTACH); | ||
324 | } | ||
325 | |||
263 | return retval; | 326 | return retval; |
264 | } | 327 | } |
265 | 328 | ||
@@ -322,25 +385,27 @@ static int ignoring_children(struct sighand_struct *sigh) | |||
322 | */ | 385 | */ |
323 | static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) | 386 | static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) |
324 | { | 387 | { |
388 | bool dead; | ||
389 | |||
325 | __ptrace_unlink(p); | 390 | __ptrace_unlink(p); |
326 | 391 | ||
327 | if (p->exit_state == EXIT_ZOMBIE) { | 392 | if (p->exit_state != EXIT_ZOMBIE) |
328 | if (!task_detached(p) && thread_group_empty(p)) { | 393 | return false; |
329 | if (!same_thread_group(p->real_parent, tracer)) | 394 | |
330 | do_notify_parent(p, p->exit_signal); | 395 | dead = !thread_group_leader(p); |
331 | else if (ignoring_children(tracer->sighand)) { | 396 | |
332 | __wake_up_parent(p, tracer); | 397 | if (!dead && thread_group_empty(p)) { |
333 | p->exit_signal = -1; | 398 | if (!same_thread_group(p->real_parent, tracer)) |
334 | } | 399 | dead = do_notify_parent(p, p->exit_signal); |
335 | } | 400 | else if (ignoring_children(tracer->sighand)) { |
336 | if (task_detached(p)) { | 401 | __wake_up_parent(p, tracer); |
337 | /* Mark it as in the process of being reaped. */ | 402 | dead = true; |
338 | p->exit_state = EXIT_DEAD; | ||
339 | return true; | ||
340 | } | 403 | } |
341 | } | 404 | } |
342 | 405 | /* Mark it as in the process of being reaped. */ | |
343 | return false; | 406 | if (dead) |
407 | p->exit_state = EXIT_DEAD; | ||
408 | return dead; | ||
344 | } | 409 | } |
345 | 410 | ||
346 | static int ptrace_detach(struct task_struct *child, unsigned int data) | 411 | static int ptrace_detach(struct task_struct *child, unsigned int data) |
@@ -365,6 +430,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) | |||
365 | } | 430 | } |
366 | write_unlock_irq(&tasklist_lock); | 431 | write_unlock_irq(&tasklist_lock); |
367 | 432 | ||
433 | proc_ptrace_connector(child, PTRACE_DETACH); | ||
368 | if (unlikely(dead)) | 434 | if (unlikely(dead)) |
369 | release_task(child); | 435 | release_task(child); |
370 | 436 | ||
@@ -611,10 +677,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, | |||
611 | int ptrace_request(struct task_struct *child, long request, | 677 | int ptrace_request(struct task_struct *child, long request, |
612 | unsigned long addr, unsigned long data) | 678 | unsigned long addr, unsigned long data) |
613 | { | 679 | { |
680 | bool seized = child->ptrace & PT_SEIZED; | ||
614 | int ret = -EIO; | 681 | int ret = -EIO; |
615 | siginfo_t siginfo; | 682 | siginfo_t siginfo, *si; |
616 | void __user *datavp = (void __user *) data; | 683 | void __user *datavp = (void __user *) data; |
617 | unsigned long __user *datalp = datavp; | 684 | unsigned long __user *datalp = datavp; |
685 | unsigned long flags; | ||
618 | 686 | ||
619 | switch (request) { | 687 | switch (request) { |
620 | case PTRACE_PEEKTEXT: | 688 | case PTRACE_PEEKTEXT: |
@@ -647,6 +715,59 @@ int ptrace_request(struct task_struct *child, long request, | |||
647 | ret = ptrace_setsiginfo(child, &siginfo); | 715 | ret = ptrace_setsiginfo(child, &siginfo); |
648 | break; | 716 | break; |
649 | 717 | ||
718 | case PTRACE_INTERRUPT: | ||
719 | /* | ||
720 | * Stop tracee without any side-effect on signal or job | ||
721 | * control. At least one trap is guaranteed to happen | ||
722 | * after this request. If @child is already trapped, the | ||
723 | * current trap is not disturbed and another trap will | ||
724 | * happen after the current trap is ended with PTRACE_CONT. | ||
725 | * | ||
726 | * The actual trap might not be PTRACE_EVENT_STOP trap but | ||
727 | * the pending condition is cleared regardless. | ||
728 | */ | ||
729 | if (unlikely(!seized || !lock_task_sighand(child, &flags))) | ||
730 | break; | ||
731 | |||
732 | /* | ||
733 | * INTERRUPT doesn't disturb existing trap sans one | ||
734 | * exception. If ptracer issued LISTEN for the current | ||
735 | * STOP, this INTERRUPT should clear LISTEN and re-trap | ||
736 | * tracee into STOP. | ||
737 | */ | ||
738 | if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) | ||
739 | signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); | ||
740 | |||
741 | unlock_task_sighand(child, &flags); | ||
742 | ret = 0; | ||
743 | break; | ||
744 | |||
745 | case PTRACE_LISTEN: | ||
746 | /* | ||
747 | * Listen for events. Tracee must be in STOP. It's not | ||
748 | * resumed per-se but is not considered to be in TRACED by | ||
749 | * wait(2) or ptrace(2). If an async event (e.g. group | ||
750 | * stop state change) happens, tracee will enter STOP trap | ||
751 | * again. Alternatively, ptracer can issue INTERRUPT to | ||
752 | * finish listening and re-trap tracee into STOP. | ||
753 | */ | ||
754 | if (unlikely(!seized || !lock_task_sighand(child, &flags))) | ||
755 | break; | ||
756 | |||
757 | si = child->last_siginfo; | ||
758 | if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) { | ||
759 | child->jobctl |= JOBCTL_LISTENING; | ||
760 | /* | ||
761 | * If NOTIFY is set, it means event happened between | ||
762 | * start of this trap and now. Trigger re-trap. | ||
763 | */ | ||
764 | if (child->jobctl & JOBCTL_TRAP_NOTIFY) | ||
765 | signal_wake_up(child, true); | ||
766 | ret = 0; | ||
767 | } | ||
768 | unlock_task_sighand(child, &flags); | ||
769 | break; | ||
770 | |||
650 | case PTRACE_DETACH: /* detach a process that was attached. */ | 771 | case PTRACE_DETACH: /* detach a process that was attached. */ |
651 | ret = ptrace_detach(child, data); | 772 | ret = ptrace_detach(child, data); |
652 | break; | 773 | break; |
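From user space, PTRACE_SEIZE attaches without stopping the tracee, and PTRACE_INTERRUPT then requests a trap on demand. A hedged tracer sketch follows; the request numbers and the PTRACE_SEIZE_DEVEL value are believed to match the interface added here but may be absent from older <sys/ptrace.h> headers (hence the fallback defines), and error handling is minimal.

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef PTRACE_SEIZE
#define PTRACE_SEIZE            0x4206
#define PTRACE_INTERRUPT        0x4207
#define PTRACE_LISTEN           0x4208
#define PTRACE_SEIZE_DEVEL      0x80000000
#endif

int interrupt_once(pid_t pid)
{
        int status;

        /* Attach without sending SIGSTOP (unlike PTRACE_ATTACH). */
        if (ptrace(PTRACE_SEIZE, pid, NULL,
                   (void *)(unsigned long)PTRACE_SEIZE_DEVEL) < 0)
                return -1;

        /* Ask for a trap; the tracee stops at the next opportunity. */
        ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
        waitpid(pid, &status, 0);
        printf("tracee %d stopped, status 0x%x\n", pid, status);

        /* Resume and detach. */
        ptrace(PTRACE_CONT, pid, NULL, NULL);
        return (int)ptrace(PTRACE_DETACH, pid, NULL, NULL);
}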
@@ -761,8 +882,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, | |||
761 | goto out; | 882 | goto out; |
762 | } | 883 | } |
763 | 884 | ||
764 | if (request == PTRACE_ATTACH) { | 885 | if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { |
765 | ret = ptrace_attach(child); | 886 | ret = ptrace_attach(child, request, data); |
766 | /* | 887 | /* |
767 | * Some architectures need to do book-keeping after | 888 | * Some architectures need to do book-keeping after |
768 | * a ptrace attach. | 889 | * a ptrace attach. |
@@ -772,7 +893,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, | |||
772 | goto out_put_task_struct; | 893 | goto out_put_task_struct; |
773 | } | 894 | } |
774 | 895 | ||
775 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | 896 | ret = ptrace_check_attach(child, request == PTRACE_KILL || |
897 | request == PTRACE_INTERRUPT); | ||
776 | if (ret < 0) | 898 | if (ret < 0) |
777 | goto out_put_task_struct; | 899 | goto out_put_task_struct; |
778 | 900 | ||
@@ -903,8 +1025,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | |||
903 | goto out; | 1025 | goto out; |
904 | } | 1026 | } |
905 | 1027 | ||
906 | if (request == PTRACE_ATTACH) { | 1028 | if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { |
907 | ret = ptrace_attach(child); | 1029 | ret = ptrace_attach(child, request, data); |
908 | /* | 1030 | /* |
909 | * Some architectures need to do book-keeping after | 1031 | * Some architectures need to do book-keeping after |
910 | * a ptrace attach. | 1032 | * a ptrace attach. |
@@ -914,7 +1036,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | |||
914 | goto out_put_task_struct; | 1036 | goto out_put_task_struct; |
915 | } | 1037 | } |
916 | 1038 | ||
917 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | 1039 | ret = ptrace_check_attach(child, request == PTRACE_KILL || |
1040 | request == PTRACE_INTERRUPT); | ||
918 | if (!ret) | 1041 | if (!ret) |
919 | ret = compat_arch_ptrace(child, request, addr, data); | 1042 | ret = compat_arch_ptrace(child, request, addr, data); |
920 | 1043 | ||
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 7784bd216b6..ddddb320be6 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -37,7 +37,7 @@ | |||
37 | #include <linux/smp.h> | 37 | #include <linux/smp.h> |
38 | #include <linux/interrupt.h> | 38 | #include <linux/interrupt.h> |
39 | #include <linux/sched.h> | 39 | #include <linux/sched.h> |
40 | #include <asm/atomic.h> | 40 | #include <linux/atomic.h> |
41 | #include <linux/bitops.h> | 41 | #include <linux/bitops.h> |
42 | #include <linux/percpu.h> | 42 | #include <linux/percpu.h> |
43 | #include <linux/notifier.h> | 43 | #include <linux/notifier.h> |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2e138db0338..98f51b13bb7 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -33,7 +33,7 @@ | |||
33 | #include <linux/rcupdate.h> | 33 | #include <linux/rcupdate.h> |
34 | #include <linux/interrupt.h> | 34 | #include <linux/interrupt.h> |
35 | #include <linux/sched.h> | 35 | #include <linux/sched.h> |
36 | #include <asm/atomic.h> | 36 | #include <linux/atomic.h> |
37 | #include <linux/bitops.h> | 37 | #include <linux/bitops.h> |
38 | #include <linux/completion.h> | 38 | #include <linux/completion.h> |
39 | #include <linux/moduleparam.h> | 39 | #include <linux/moduleparam.h> |
@@ -941,7 +941,6 @@ static void rcu_torture_timer(unsigned long unused) | |||
941 | idx = cur_ops->readlock(); | 941 | idx = cur_ops->readlock(); |
942 | completed = cur_ops->completed(); | 942 | completed = cur_ops->completed(); |
943 | p = rcu_dereference_check(rcu_torture_current, | 943 | p = rcu_dereference_check(rcu_torture_current, |
944 | rcu_read_lock_held() || | ||
945 | rcu_read_lock_bh_held() || | 944 | rcu_read_lock_bh_held() || |
946 | rcu_read_lock_sched_held() || | 945 | rcu_read_lock_sched_held() || |
947 | srcu_read_lock_held(&srcu_ctl)); | 946 | srcu_read_lock_held(&srcu_ctl)); |
@@ -1002,7 +1001,6 @@ rcu_torture_reader(void *arg) | |||
1002 | idx = cur_ops->readlock(); | 1001 | idx = cur_ops->readlock(); |
1003 | completed = cur_ops->completed(); | 1002 | completed = cur_ops->completed(); |
1004 | p = rcu_dereference_check(rcu_torture_current, | 1003 | p = rcu_dereference_check(rcu_torture_current, |
1005 | rcu_read_lock_held() || | ||
1006 | rcu_read_lock_bh_held() || | 1004 | rcu_read_lock_bh_held() || |
1007 | rcu_read_lock_sched_held() || | 1005 | rcu_read_lock_sched_held() || |
1008 | srcu_read_lock_held(&srcu_ctl)); | 1006 | srcu_read_lock_held(&srcu_ctl)); |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4e144876dc6..3b0c0986afc 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -31,7 +31,7 @@ | |||
31 | #include <linux/rcupdate.h> | 31 | #include <linux/rcupdate.h> |
32 | #include <linux/interrupt.h> | 32 | #include <linux/interrupt.h> |
33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
34 | #include <asm/atomic.h> | 34 | #include <linux/atomic.h> |
35 | #include <linux/bitops.h> | 35 | #include <linux/bitops.h> |
36 | #include <linux/module.h> | 36 | #include <linux/module.h> |
37 | #include <linux/completion.h> | 37 | #include <linux/completion.h> |
diff --git a/kernel/resource.c b/kernel/resource.c index 3ff40178dce..c8dc249da5c 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -419,6 +419,9 @@ static int __find_resource(struct resource *root, struct resource *old, | |||
419 | else | 419 | else |
420 | tmp.end = root->end; | 420 | tmp.end = root->end; |
421 | 421 | ||
422 | if (tmp.end < tmp.start) | ||
423 | goto next; | ||
424 | |||
422 | resource_clip(&tmp, constraint->min, constraint->max); | 425 | resource_clip(&tmp, constraint->min, constraint->max); |
423 | arch_remove_reservations(&tmp); | 426 | arch_remove_reservations(&tmp); |
424 | 427 | ||
@@ -436,8 +439,10 @@ static int __find_resource(struct resource *root, struct resource *old, | |||
436 | return 0; | 439 | return 0; |
437 | } | 440 | } |
438 | } | 441 | } |
439 | if (!this) | 442 | |
443 | next: if (!this || this->end == root->end) | ||
440 | break; | 444 | break; |
445 | |||
441 | if (this != old) | 446 | if (this != old) |
442 | tmp.start = this->end + 1; | 447 | tmp.start = this->end + 1; |
443 | this = this->sibling; | 448 | this = this->sibling; |
@@ -553,6 +558,27 @@ int allocate_resource(struct resource *root, struct resource *new, | |||
553 | 558 | ||
554 | EXPORT_SYMBOL(allocate_resource); | 559 | EXPORT_SYMBOL(allocate_resource); |
555 | 560 | ||
561 | /** | ||
562 | * lookup_resource - find an existing resource by a resource start address | ||
563 | * @root: root resource descriptor | ||
564 | * @start: resource start address | ||
565 | * | ||
566 | * Returns a pointer to the resource if found, NULL otherwise | ||
567 | */ | ||
568 | struct resource *lookup_resource(struct resource *root, resource_size_t start) | ||
569 | { | ||
570 | struct resource *res; | ||
571 | |||
572 | read_lock(&resource_lock); | ||
573 | for (res = root->child; res; res = res->sibling) { | ||
574 | if (res->start == start) | ||
575 | break; | ||
576 | } | ||
577 | read_unlock(&resource_lock); | ||
578 | |||
579 | return res; | ||
580 | } | ||
581 | |||
556 | /* | 582 | /* |
557 | * Insert a resource into the resource tree. If successful, return NULL, | 583 | * Insert a resource into the resource tree. If successful, return NULL, |
558 | * otherwise return the conflicting resource (compare to __request_resource()) | 584 | * otherwise return the conflicting resource (compare to __request_resource()) |
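lookup_resource() gives callers a way to find an already-inserted resource by its start address, for example to release a region registered earlier by another path. A small in-kernel sketch; the use of iomem_resource and release_resource() is illustrative, not part of this hunk.

#include <linux/errno.h>
#include <linux/ioport.h>

static int example_release_region(resource_size_t start)
{
        struct resource *res = lookup_resource(&iomem_resource, start);

        if (!res)
                return -ENOENT;
        return release_resource(res);
}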
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index ab449117aaf..255e1662acd 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) | |||
890 | { | 890 | { |
891 | lock->owner = NULL; | 891 | lock->owner = NULL; |
892 | raw_spin_lock_init(&lock->wait_lock); | 892 | raw_spin_lock_init(&lock->wait_lock); |
893 | plist_head_init_raw(&lock->wait_list, &lock->wait_lock); | 893 | plist_head_init(&lock->wait_list); |
894 | 894 | ||
895 | debug_rt_mutex_init(lock, name); | 895 | debug_rt_mutex_init(lock, name); |
896 | } | 896 | } |
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index cae050b05f5..9f48f3d82e9 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c | |||
@@ -11,7 +11,7 @@ | |||
11 | #include <linux/rwsem.h> | 11 | #include <linux/rwsem.h> |
12 | 12 | ||
13 | #include <asm/system.h> | 13 | #include <asm/system.h> |
14 | #include <asm/atomic.h> | 14 | #include <linux/atomic.h> |
15 | 15 | ||
16 | /* | 16 | /* |
17 | * lock for reading | 17 | * lock for reading |
@@ -117,15 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) | |||
117 | 117 | ||
118 | EXPORT_SYMBOL(down_read_nested); | 118 | EXPORT_SYMBOL(down_read_nested); |
119 | 119 | ||
120 | void down_read_non_owner(struct rw_semaphore *sem) | ||
121 | { | ||
122 | might_sleep(); | ||
123 | |||
124 | __down_read(sem); | ||
125 | } | ||
126 | |||
127 | EXPORT_SYMBOL(down_read_non_owner); | ||
128 | |||
129 | void down_write_nested(struct rw_semaphore *sem, int subclass) | 120 | void down_write_nested(struct rw_semaphore *sem, int subclass) |
130 | { | 121 | { |
131 | might_sleep(); | 122 | might_sleep(); |
@@ -136,13 +127,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) | |||
136 | 127 | ||
137 | EXPORT_SYMBOL(down_write_nested); | 128 | EXPORT_SYMBOL(down_write_nested); |
138 | 129 | ||
139 | void up_read_non_owner(struct rw_semaphore *sem) | ||
140 | { | ||
141 | __up_read(sem); | ||
142 | } | ||
143 | |||
144 | EXPORT_SYMBOL(up_read_non_owner); | ||
145 | |||
146 | #endif | 130 | #endif |
147 | 131 | ||
148 | 132 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index c4b6bd5151f..a1bf2646d12 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -71,10 +71,14 @@ | |||
71 | #include <linux/ctype.h> | 71 | #include <linux/ctype.h> |
72 | #include <linux/ftrace.h> | 72 | #include <linux/ftrace.h> |
73 | #include <linux/slab.h> | 73 | #include <linux/slab.h> |
74 | #include <linux/cpuacct.h> | ||
74 | 75 | ||
75 | #include <asm/tlb.h> | 76 | #include <asm/tlb.h> |
76 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
77 | #include <asm/mutex.h> | 78 | #include <asm/mutex.h> |
79 | #ifdef CONFIG_PARAVIRT | ||
80 | #include <asm/paravirt.h> | ||
81 | #endif | ||
78 | 82 | ||
79 | #include "sched_cpupri.h" | 83 | #include "sched_cpupri.h" |
80 | #include "workqueue_sched.h" | 84 | #include "workqueue_sched.h" |
@@ -129,7 +133,7 @@ static void litmus_tick(struct rq*, struct task_struct*); | |||
129 | 133 | ||
130 | static inline int rt_policy(int policy) | 134 | static inline int rt_policy(int policy) |
131 | { | 135 | { |
132 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) | 136 | if (policy == SCHED_FIFO || policy == SCHED_RR) |
133 | return 1; | 137 | return 1; |
134 | return 0; | 138 | return 0; |
135 | } | 139 | } |
@@ -433,6 +437,7 @@ struct litmus_rq { | |||
433 | */ | 437 | */ |
434 | struct root_domain { | 438 | struct root_domain { |
435 | atomic_t refcount; | 439 | atomic_t refcount; |
440 | atomic_t rto_count; | ||
436 | struct rcu_head rcu; | 441 | struct rcu_head rcu; |
437 | cpumask_var_t span; | 442 | cpumask_var_t span; |
438 | cpumask_var_t online; | 443 | cpumask_var_t online; |
@@ -442,7 +447,6 @@ struct root_domain { | |||
442 | * one runnable RT task. | 447 | * one runnable RT task. |
443 | */ | 448 | */ |
444 | cpumask_var_t rto_mask; | 449 | cpumask_var_t rto_mask; |
445 | atomic_t rto_count; | ||
446 | struct cpupri cpupri; | 450 | struct cpupri cpupri; |
447 | }; | 451 | }; |
448 | 452 | ||
@@ -540,6 +544,12 @@ struct rq { | |||
540 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 544 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
541 | u64 prev_irq_time; | 545 | u64 prev_irq_time; |
542 | #endif | 546 | #endif |
547 | #ifdef CONFIG_PARAVIRT | ||
548 | u64 prev_steal_time; | ||
549 | #endif | ||
550 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
551 | u64 prev_steal_time_rq; | ||
552 | #endif | ||
543 | 553 | ||
544 | /* calc_load related fields */ | 554 | /* calc_load related fields */ |
545 | unsigned long calc_load_update; | 555 | unsigned long calc_load_update; |
@@ -593,7 +603,6 @@ static inline int cpu_of(struct rq *rq) | |||
593 | 603 | ||
594 | #define rcu_dereference_check_sched_domain(p) \ | 604 | #define rcu_dereference_check_sched_domain(p) \ |
595 | rcu_dereference_check((p), \ | 605 | rcu_dereference_check((p), \ |
596 | rcu_read_lock_held() || \ | ||
597 | lockdep_is_held(&sched_domains_mutex)) | 606 | lockdep_is_held(&sched_domains_mutex)) |
598 | 607 | ||
599 | /* | 608 | /* |
@@ -1581,38 +1590,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1581 | return rq->avg_load_per_task; | 1590 | return rq->avg_load_per_task; |
1582 | } | 1591 | } |
1583 | 1592 | ||
1584 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1585 | |||
1586 | /* | ||
1587 | * Compute the cpu's hierarchical load factor for each task group. | ||
1588 | * This needs to be done in a top-down fashion because the load of a child | ||
1589 | * group is a fraction of its parents load. | ||
1590 | */ | ||
1591 | static int tg_load_down(struct task_group *tg, void *data) | ||
1592 | { | ||
1593 | unsigned long load; | ||
1594 | long cpu = (long)data; | ||
1595 | |||
1596 | if (!tg->parent) { | ||
1597 | load = cpu_rq(cpu)->load.weight; | ||
1598 | } else { | ||
1599 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
1600 | load *= tg->se[cpu]->load.weight; | ||
1601 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | ||
1602 | } | ||
1603 | |||
1604 | tg->cfs_rq[cpu]->h_load = load; | ||
1605 | |||
1606 | return 0; | ||
1607 | } | ||
1608 | |||
1609 | static void update_h_load(long cpu) | ||
1610 | { | ||
1611 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | ||
1612 | } | ||
1613 | |||
1614 | #endif | ||
1615 | |||
1616 | #ifdef CONFIG_PREEMPT | 1593 | #ifdef CONFIG_PREEMPT |
1617 | 1594 | ||
1618 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | 1595 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); |
@@ -1966,10 +1943,28 @@ void account_system_vtime(struct task_struct *curr) | |||
1966 | } | 1943 | } |
1967 | EXPORT_SYMBOL_GPL(account_system_vtime); | 1944 | EXPORT_SYMBOL_GPL(account_system_vtime); |
1968 | 1945 | ||
1969 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 1946 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1947 | |||
1948 | #ifdef CONFIG_PARAVIRT | ||
1949 | static inline u64 steal_ticks(u64 steal) | ||
1970 | { | 1950 | { |
1971 | s64 irq_delta; | 1951 | if (unlikely(steal > NSEC_PER_SEC)) |
1952 | return div_u64(steal, TICK_NSEC); | ||
1953 | |||
1954 | return __iter_div_u64_rem(steal, TICK_NSEC, &steal); | ||
1955 | } | ||
1956 | #endif | ||
1972 | 1957 | ||
1958 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
1959 | { | ||
1960 | /* | ||
1961 | * In theory, the compile should just see 0 here, and optimize out the call | ||
1962 | * to sched_rt_avg_update. But I don't trust it... | ||
1963 | */ | ||
1964 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
1965 | s64 steal = 0, irq_delta = 0; | ||
1966 | #endif | ||
1967 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1973 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | 1968 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; |
1974 | 1969 | ||
1975 | /* | 1970 | /* |
@@ -1992,12 +1987,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
1992 | 1987 | ||
1993 | rq->prev_irq_time += irq_delta; | 1988 | rq->prev_irq_time += irq_delta; |
1994 | delta -= irq_delta; | 1989 | delta -= irq_delta; |
1990 | #endif | ||
1991 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
1992 | if (static_branch((¶virt_steal_rq_enabled))) { | ||
1993 | u64 st; | ||
1994 | |||
1995 | steal = paravirt_steal_clock(cpu_of(rq)); | ||
1996 | steal -= rq->prev_steal_time_rq; | ||
1997 | |||
1998 | if (unlikely(steal > delta)) | ||
1999 | steal = delta; | ||
2000 | |||
2001 | st = steal_ticks(steal); | ||
2002 | steal = st * TICK_NSEC; | ||
2003 | |||
2004 | rq->prev_steal_time_rq += steal; | ||
2005 | |||
2006 | delta -= steal; | ||
2007 | } | ||
2008 | #endif | ||
2009 | |||
1995 | rq->clock_task += delta; | 2010 | rq->clock_task += delta; |
1996 | 2011 | ||
1997 | if (irq_delta && sched_feat(NONIRQ_POWER)) | 2012 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) |
1998 | sched_rt_avg_update(rq, irq_delta); | 2013 | if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) |
2014 | sched_rt_avg_update(rq, irq_delta + steal); | ||
2015 | #endif | ||
1999 | } | 2016 | } |
2000 | 2017 | ||
2018 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
2001 | static int irqtime_account_hi_update(void) | 2019 | static int irqtime_account_hi_update(void) |
2002 | { | 2020 | { |
2003 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2021 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
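The steal_ticks() helper added in the hunk above converts a nanosecond steal-time delta into whole scheduler ticks, taking a full 64-bit divide only when more than a second has accumulated. A minimal user-space sketch of that conversion; the NSEC_PER_SEC/TICK_NSEC values assume HZ=1000 and the while loop stands in for __iter_div_u64_rem(), so this is illustrative rather than the kernel's exact code:

/* Hypothetical user-space sketch of the steal_ticks() conversion above. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define TICK_NSEC       1000000ULL   /* assumes HZ = 1000 */

static uint64_t steal_ticks(uint64_t steal)
{
	if (steal > NSEC_PER_SEC)
		return steal / TICK_NSEC;        /* slow path: full divide */

	/* fast path: iterative division, cheap for small deltas */
	uint64_t ticks = 0;
	while (steal >= TICK_NSEC) {
		steal -= TICK_NSEC;
		ticks++;
	}
	return ticks;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)steal_ticks(2500000));           /* 2 */
	printf("%llu\n", (unsigned long long)steal_ticks(3 * NSEC_PER_SEC));  /* 3000 */
	return 0;
}

At tick granularity the pending delta is usually well under a second, so the iterative path avoids a 64-bit divide in the common case.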
@@ -2032,12 +2050,7 @@ static int irqtime_account_si_update(void) | |||
2032 | 2050 | ||
2033 | #define sched_clock_irqtime (0) | 2051 | #define sched_clock_irqtime (0) |
2034 | 2052 | ||
2035 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 2053 | #endif |
2036 | { | ||
2037 | rq->clock_task += delta; | ||
2038 | } | ||
2039 | |||
2040 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
2041 | 2054 | ||
2042 | #include "sched_idletask.c" | 2055 | #include "sched_idletask.c" |
2043 | #include "sched_fair.c" | 2056 | #include "sched_fair.c" |
@@ -2238,7 +2251,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
2238 | 2251 | ||
2239 | if (task_cpu(p) != new_cpu) { | 2252 | if (task_cpu(p) != new_cpu) { |
2240 | p->se.nr_migrations++; | 2253 | p->se.nr_migrations++; |
2241 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); | 2254 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); |
2242 | } | 2255 | } |
2243 | 2256 | ||
2244 | __set_task_cpu(p, new_cpu); | 2257 | __set_task_cpu(p, new_cpu); |
@@ -2515,7 +2528,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | |||
2515 | if (p->sched_class->task_woken) | 2528 | if (p->sched_class->task_woken) |
2516 | p->sched_class->task_woken(rq, p); | 2529 | p->sched_class->task_woken(rq, p); |
2517 | 2530 | ||
2518 | if (unlikely(rq->idle_stamp)) { | 2531 | if (rq->idle_stamp) { |
2519 | u64 delta = rq->clock - rq->idle_stamp; | 2532 | u64 delta = rq->clock - rq->idle_stamp; |
2520 | u64 max = 2*sysctl_sched_migration_cost; | 2533 | u64 max = 2*sysctl_sched_migration_cost; |
2521 | 2534 | ||
@@ -2927,7 +2940,7 @@ void sched_fork(struct task_struct *p) | |||
2927 | #if defined(CONFIG_SMP) | 2940 | #if defined(CONFIG_SMP) |
2928 | p->on_cpu = 0; | 2941 | p->on_cpu = 0; |
2929 | #endif | 2942 | #endif |
2930 | #ifdef CONFIG_PREEMPT | 2943 | #ifdef CONFIG_PREEMPT_COUNT |
2931 | /* Want to start with kernel preemption disabled. */ | 2944 | /* Want to start with kernel preemption disabled. */ |
2932 | task_thread_info(p)->preempt_count = 1; | 2945 | task_thread_info(p)->preempt_count = 1; |
2933 | #endif | 2946 | #endif |
@@ -3096,7 +3109,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
3096 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 3109 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
3097 | local_irq_disable(); | 3110 | local_irq_disable(); |
3098 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 3111 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
3099 | perf_event_task_sched_in(current); | 3112 | perf_event_task_sched_in(prev, current); |
3100 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 3113 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
3101 | local_irq_enable(); | 3114 | local_irq_enable(); |
3102 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 3115 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
@@ -3775,30 +3788,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3775 | } | 3788 | } |
3776 | 3789 | ||
3777 | /* | 3790 | /* |
3778 | * Return sum_exec_runtime for the thread group. | ||
3779 | * In case the task is currently running, return the sum plus current's | ||
3780 | * pending runtime that have not been accounted yet. | ||
3781 | * | ||
3782 | * Note that the thread group might have other running tasks as well, | ||
3783 | * so the return value not includes other pending runtime that other | ||
3784 | * running tasks might have. | ||
3785 | */ | ||
3786 | unsigned long long thread_group_sched_runtime(struct task_struct *p) | ||
3787 | { | ||
3788 | struct task_cputime totals; | ||
3789 | unsigned long flags; | ||
3790 | struct rq *rq; | ||
3791 | u64 ns; | ||
3792 | |||
3793 | rq = task_rq_lock(p, &flags); | ||
3794 | thread_group_cputime(p, &totals); | ||
3795 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); | ||
3796 | task_rq_unlock(rq, p, &flags); | ||
3797 | |||
3798 | return ns; | ||
3799 | } | ||
3800 | |||
3801 | /* | ||
3802 | * Account user cpu time to a process. | 3791 | * Account user cpu time to a process. |
3803 | * @p: the process that the cpu time gets accounted to | 3792 | * @p: the process that the cpu time gets accounted to |
3804 | * @cputime: the cpu time spent in user space since the last update | 3793 | * @cputime: the cpu time spent in user space since the last update |
@@ -3939,6 +3928,25 @@ void account_idle_time(cputime_t cputime) | |||
3939 | cpustat->idle = cputime64_add(cpustat->idle, cputime64); | 3928 | cpustat->idle = cputime64_add(cpustat->idle, cputime64); |
3940 | } | 3929 | } |
3941 | 3930 | ||
3931 | static __always_inline bool steal_account_process_tick(void) | ||
3932 | { | ||
3933 | #ifdef CONFIG_PARAVIRT | ||
3934 | if (static_branch(¶virt_steal_enabled)) { | ||
3935 | u64 steal, st = 0; | ||
3936 | |||
3937 | steal = paravirt_steal_clock(smp_processor_id()); | ||
3938 | steal -= this_rq()->prev_steal_time; | ||
3939 | |||
3940 | st = steal_ticks(steal); | ||
3941 | this_rq()->prev_steal_time += st * TICK_NSEC; | ||
3942 | |||
3943 | account_steal_time(st); | ||
3944 | return st; | ||
3945 | } | ||
3946 | #endif | ||
3947 | return false; | ||
3948 | } | ||
3949 | |||
3942 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 3950 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
3943 | 3951 | ||
3944 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 3952 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -3970,6 +3978,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
3970 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | 3978 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); |
3971 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3979 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3972 | 3980 | ||
3981 | if (steal_account_process_tick()) | ||
3982 | return; | ||
3983 | |||
3973 | if (irqtime_account_hi_update()) { | 3984 | if (irqtime_account_hi_update()) { |
3974 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3985 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
3975 | } else if (irqtime_account_si_update()) { | 3986 | } else if (irqtime_account_si_update()) { |
@@ -4023,6 +4034,9 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
4023 | return; | 4034 | return; |
4024 | } | 4035 | } |
4025 | 4036 | ||
4037 | if (steal_account_process_tick()) | ||
4038 | return; | ||
4039 | |||
4026 | if (user_tick) | 4040 | if (user_tick) |
4027 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 4041 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
4028 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 4042 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
@@ -4320,9 +4334,9 @@ pick_next_task(struct rq *rq) | |||
4320 | } | 4334 | } |
4321 | 4335 | ||
4322 | /* | 4336 | /* |
4323 | * schedule() is the main scheduler function. | 4337 | * __schedule() is the main scheduler function. |
4324 | */ | 4338 | */ |
4325 | asmlinkage void __sched schedule(void) | 4339 | static void __sched __schedule(void) |
4326 | { | 4340 | { |
4327 | struct task_struct *prev, *next; | 4341 | struct task_struct *prev, *next; |
4328 | unsigned long *switch_count; | 4342 | unsigned long *switch_count; |
@@ -4371,16 +4385,6 @@ litmus_need_resched_nonpreemptible: | |||
4371 | if (to_wakeup) | 4385 | if (to_wakeup) |
4372 | try_to_wake_up_local(to_wakeup); | 4386 | try_to_wake_up_local(to_wakeup); |
4373 | } | 4387 | } |
4374 | |||
4375 | /* | ||
4376 | * If we are going to sleep and we have plugged IO | ||
4377 | * queued, make sure to submit it to avoid deadlocks. | ||
4378 | */ | ||
4379 | if (blk_needs_flush_plug(prev)) { | ||
4380 | raw_spin_unlock(&rq->lock); | ||
4381 | blk_schedule_flush_plug(prev); | ||
4382 | raw_spin_lock(&rq->lock); | ||
4383 | } | ||
4384 | } | 4388 | } |
4385 | switch_count = &prev->nvcsw; | 4389 | switch_count = &prev->nvcsw; |
4386 | } | 4390 | } |
@@ -4436,17 +4440,34 @@ litmus_need_resched_nonpreemptible: | |||
4436 | 4440 | ||
4437 | srp_ceiling_block(); | 4441 | srp_ceiling_block(); |
4438 | } | 4442 | } |
4443 | |||
4444 | static inline void sched_submit_work(struct task_struct *tsk) | ||
4445 | { | ||
4446 | if (!tsk->state) | ||
4447 | return; | ||
4448 | /* | ||
4449 | * If we are going to sleep and we have plugged IO queued, | ||
4450 | * make sure to submit it to avoid deadlocks. | ||
4451 | */ | ||
4452 | if (blk_needs_flush_plug(tsk)) | ||
4453 | blk_schedule_flush_plug(tsk); | ||
4454 | } | ||
4455 | |||
4456 | asmlinkage void __sched schedule(void) | ||
4457 | { | ||
4458 | struct task_struct *tsk = current; | ||
4459 | |||
4460 | sched_submit_work(tsk); | ||
4461 | __schedule(); | ||
4462 | } | ||
4439 | EXPORT_SYMBOL(schedule); | 4463 | EXPORT_SYMBOL(schedule); |
4440 | 4464 | ||
4441 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 4465 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
4442 | 4466 | ||
4443 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | 4467 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) |
4444 | { | 4468 | { |
4445 | bool ret = false; | ||
4446 | |||
4447 | rcu_read_lock(); | ||
4448 | if (lock->owner != owner) | 4469 | if (lock->owner != owner) |
4449 | goto fail; | 4470 | return false; |
4450 | 4471 | ||
4451 | /* | 4472 | /* |
4452 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | 4473 | * Ensure we emit the owner->on_cpu, dereference _after_ checking |
@@ -4456,11 +4477,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | |||
4456 | */ | 4477 | */ |
4457 | barrier(); | 4478 | barrier(); |
4458 | 4479 | ||
4459 | ret = owner->on_cpu; | 4480 | return owner->on_cpu; |
4460 | fail: | ||
4461 | rcu_read_unlock(); | ||
4462 | |||
4463 | return ret; | ||
4464 | } | 4481 | } |
4465 | 4482 | ||
4466 | /* | 4483 | /* |
@@ -4472,21 +4489,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | |||
4472 | if (!sched_feat(OWNER_SPIN)) | 4489 | if (!sched_feat(OWNER_SPIN)) |
4473 | return 0; | 4490 | return 0; |
4474 | 4491 | ||
4492 | rcu_read_lock(); | ||
4475 | while (owner_running(lock, owner)) { | 4493 | while (owner_running(lock, owner)) { |
4476 | if (need_resched()) | 4494 | if (need_resched()) |
4477 | return 0; | 4495 | break; |
4478 | 4496 | ||
4479 | arch_mutex_cpu_relax(); | 4497 | arch_mutex_cpu_relax(); |
4480 | } | 4498 | } |
4499 | rcu_read_unlock(); | ||
4481 | 4500 | ||
4482 | /* | 4501 | /* |
4483 | * If the owner changed to another task there is likely | 4502 | * We break out the loop above on need_resched() and when the |
4484 | * heavy contention, stop spinning. | 4503 | * owner changed, which is a sign for heavy contention. Return |
4504 | * success only when lock->owner is NULL. | ||
4485 | */ | 4505 | */ |
4486 | if (lock->owner) | 4506 | return lock->owner == NULL; |
4487 | return 0; | ||
4488 | |||
4489 | return 1; | ||
4490 | } | 4507 | } |
4491 | #endif | 4508 | #endif |
4492 | 4509 | ||
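The reworked mutex_spin_on_owner() above holds rcu_read_lock() across the whole spin loop and reports success only when the lock ends up unowned; breaking out on need_resched() no longer hard-fails the spin. A hedged user-space sketch of that control flow, with C11 atomics, need_resched_stub() and the types standing in for kernel primitives:

/* Sketch only: keep spinning while the recorded owner is still on a CPU,
 * stop on need_resched(), and report success only if the lock ended up
 * unowned. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct owner     { _Atomic bool on_cpu; };
struct fakemutex { _Atomic(struct owner *) owner; };

static bool need_resched_stub(void) { return false; }   /* placeholder */

static bool owner_running(struct fakemutex *lock, struct owner *owner)
{
	if (atomic_load(&lock->owner) != owner)
		return false;                    /* owner changed: heavy contention */
	return atomic_load(&owner->on_cpu);      /* still running on a CPU? */
}

static int mutex_spin_on_owner(struct fakemutex *lock, struct owner *owner)
{
	while (owner_running(lock, owner)) {
		if (need_resched_stub())
			break;                   /* stop spinning, no hard failure */
		/* cpu_relax() equivalent would sit here */
	}
	/* success only when the lock is now unowned */
	return atomic_load(&lock->owner) == NULL;
}

int main(void)
{
	struct owner o = { true };
	struct fakemutex m;

	atomic_init(&m.owner, &o);
	atomic_store(&o.on_cpu, false);          /* owner gets scheduled out */
	printf("%d\n", mutex_spin_on_owner(&m, &o));   /* 0: still owned */
	return 0;
}

Returning lock->owner == NULL folds the old "owner changed means contention" check into a single exit test, which is why the explicit fail label disappears in the patch.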
@@ -4509,7 +4526,7 @@ asmlinkage void __sched notrace preempt_schedule(void) | |||
4509 | 4526 | ||
4510 | do { | 4527 | do { |
4511 | add_preempt_count_notrace(PREEMPT_ACTIVE); | 4528 | add_preempt_count_notrace(PREEMPT_ACTIVE); |
4512 | schedule(); | 4529 | __schedule(); |
4513 | sub_preempt_count_notrace(PREEMPT_ACTIVE); | 4530 | sub_preempt_count_notrace(PREEMPT_ACTIVE); |
4514 | 4531 | ||
4515 | /* | 4532 | /* |
@@ -4537,7 +4554,7 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
4537 | do { | 4554 | do { |
4538 | add_preempt_count(PREEMPT_ACTIVE); | 4555 | add_preempt_count(PREEMPT_ACTIVE); |
4539 | local_irq_enable(); | 4556 | local_irq_enable(); |
4540 | schedule(); | 4557 | __schedule(); |
4541 | local_irq_disable(); | 4558 | local_irq_disable(); |
4542 | sub_preempt_count(PREEMPT_ACTIVE); | 4559 | sub_preempt_count(PREEMPT_ACTIVE); |
4543 | 4560 | ||
@@ -5682,7 +5699,7 @@ static inline int should_resched(void) | |||
5682 | static void __cond_resched(void) | 5699 | static void __cond_resched(void) |
5683 | { | 5700 | { |
5684 | add_preempt_count(PREEMPT_ACTIVE); | 5701 | add_preempt_count(PREEMPT_ACTIVE); |
5685 | schedule(); | 5702 | __schedule(); |
5686 | sub_preempt_count(PREEMPT_ACTIVE); | 5703 | sub_preempt_count(PREEMPT_ACTIVE); |
5687 | } | 5704 | } |
5688 | 5705 | ||
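The hunks above split the scheduler entry point: schedule() now runs sched_submit_work() to flush plugged block I/O for a task about to sleep and then calls __schedule(), while the preemption paths (preempt_schedule(), preempt_schedule_irq(), __cond_resched()) call __schedule() directly and skip the submit step. A small illustrative sketch of that wrapper pattern; the names and struct task fields are stand-ins, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

struct task { int state; bool has_plugged_io; };

static void flush_plugged_io(struct task *t) { t->has_plugged_io = false; }

static void core_schedule(void)                  /* plays the role of __schedule() */
{
	puts("pick next task and context switch");
}

static void sched_submit_work(struct task *t)
{
	if (t->state == 0)                       /* still runnable: nothing to do */
		return;
	if (t->has_plugged_io)                   /* about to sleep: flush to avoid deadlock */
		flush_plugged_io(t);
}

static void schedule_entry(struct task *current_task)   /* public entry point */
{
	sched_submit_work(current_task);
	core_schedule();
}

int main(void)
{
	struct task t = { .state = 1, .has_plugged_io = true };
	schedule_entry(&t);   /* preemption paths would call core_schedule() directly */
	return 0;
}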
@@ -6618,7 +6635,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, | |||
6618 | unsigned long action, void *hcpu) | 6635 | unsigned long action, void *hcpu) |
6619 | { | 6636 | { |
6620 | switch (action & ~CPU_TASKS_FROZEN) { | 6637 | switch (action & ~CPU_TASKS_FROZEN) { |
6621 | case CPU_ONLINE: | 6638 | case CPU_STARTING: |
6622 | case CPU_DOWN_FAILED: | 6639 | case CPU_DOWN_FAILED: |
6623 | set_cpu_active((long)hcpu, true); | 6640 | set_cpu_active((long)hcpu, true); |
6624 | return NOTIFY_OK; | 6641 | return NOTIFY_OK; |
@@ -7537,6 +7554,7 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
7537 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); | 7554 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); |
7538 | if (sd && (sd->flags & SD_OVERLAP)) | 7555 | if (sd && (sd->flags & SD_OVERLAP)) |
7539 | free_sched_groups(sd->groups, 0); | 7556 | free_sched_groups(sd->groups, 0); |
7557 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
7540 | kfree(*per_cpu_ptr(sdd->sg, j)); | 7558 | kfree(*per_cpu_ptr(sdd->sg, j)); |
7541 | kfree(*per_cpu_ptr(sdd->sgp, j)); | 7559 | kfree(*per_cpu_ptr(sdd->sgp, j)); |
7542 | } | 7560 | } |
@@ -8022,17 +8040,10 @@ int in_sched_functions(unsigned long addr) | |||
8022 | && addr < (unsigned long)__sched_text_end); | 8040 | && addr < (unsigned long)__sched_text_end); |
8023 | } | 8041 | } |
8024 | 8042 | ||
8025 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 8043 | static void init_cfs_rq(struct cfs_rq *cfs_rq) |
8026 | { | 8044 | { |
8027 | cfs_rq->tasks_timeline = RB_ROOT; | 8045 | cfs_rq->tasks_timeline = RB_ROOT; |
8028 | INIT_LIST_HEAD(&cfs_rq->tasks); | 8046 | INIT_LIST_HEAD(&cfs_rq->tasks); |
8029 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8030 | cfs_rq->rq = rq; | ||
8031 | /* allow initial update_cfs_load() to truncate */ | ||
8032 | #ifdef CONFIG_SMP | ||
8033 | cfs_rq->load_stamp = 1; | ||
8034 | #endif | ||
8035 | #endif | ||
8036 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 8047 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
8037 | #ifndef CONFIG_64BIT | 8048 | #ifndef CONFIG_64BIT |
8038 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 8049 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; |
@@ -8052,27 +8063,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
8052 | /* delimiter for bitsearch: */ | 8063 | /* delimiter for bitsearch: */ |
8053 | __set_bit(MAX_RT_PRIO, array->bitmap); | 8064 | __set_bit(MAX_RT_PRIO, array->bitmap); |
8054 | 8065 | ||
8055 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 8066 | #if defined CONFIG_SMP |
8056 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | 8067 | rt_rq->highest_prio.curr = MAX_RT_PRIO; |
8057 | #ifdef CONFIG_SMP | ||
8058 | rt_rq->highest_prio.next = MAX_RT_PRIO; | 8068 | rt_rq->highest_prio.next = MAX_RT_PRIO; |
8059 | #endif | ||
8060 | #endif | ||
8061 | #ifdef CONFIG_SMP | ||
8062 | rt_rq->rt_nr_migratory = 0; | 8069 | rt_rq->rt_nr_migratory = 0; |
8063 | rt_rq->overloaded = 0; | 8070 | rt_rq->overloaded = 0; |
8064 | plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); | 8071 | plist_head_init(&rt_rq->pushable_tasks); |
8065 | #endif | 8072 | #endif |
8066 | 8073 | ||
8067 | rt_rq->rt_time = 0; | 8074 | rt_rq->rt_time = 0; |
8068 | rt_rq->rt_throttled = 0; | 8075 | rt_rq->rt_throttled = 0; |
8069 | rt_rq->rt_runtime = 0; | 8076 | rt_rq->rt_runtime = 0; |
8070 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); | 8077 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); |
8071 | |||
8072 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8073 | rt_rq->rt_nr_boosted = 0; | ||
8074 | rt_rq->rq = rq; | ||
8075 | #endif | ||
8076 | } | 8078 | } |
8077 | 8079 | ||
8078 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8080 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -8081,11 +8083,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
8081 | struct sched_entity *parent) | 8083 | struct sched_entity *parent) |
8082 | { | 8084 | { |
8083 | struct rq *rq = cpu_rq(cpu); | 8085 | struct rq *rq = cpu_rq(cpu); |
8084 | tg->cfs_rq[cpu] = cfs_rq; | 8086 | |
8085 | init_cfs_rq(cfs_rq, rq); | ||
8086 | cfs_rq->tg = tg; | 8087 | cfs_rq->tg = tg; |
8088 | cfs_rq->rq = rq; | ||
8089 | #ifdef CONFIG_SMP | ||
8090 | /* allow initial update_cfs_load() to truncate */ | ||
8091 | cfs_rq->load_stamp = 1; | ||
8092 | #endif | ||
8087 | 8093 | ||
8094 | tg->cfs_rq[cpu] = cfs_rq; | ||
8088 | tg->se[cpu] = se; | 8095 | tg->se[cpu] = se; |
8096 | |||
8089 | /* se could be NULL for root_task_group */ | 8097 | /* se could be NULL for root_task_group */ |
8090 | if (!se) | 8098 | if (!se) |
8091 | return; | 8099 | return; |
@@ -8108,12 +8116,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
8108 | { | 8116 | { |
8109 | struct rq *rq = cpu_rq(cpu); | 8117 | struct rq *rq = cpu_rq(cpu); |
8110 | 8118 | ||
8111 | tg->rt_rq[cpu] = rt_rq; | 8119 | rt_rq->highest_prio.curr = MAX_RT_PRIO; |
8112 | init_rt_rq(rt_rq, rq); | 8120 | rt_rq->rt_nr_boosted = 0; |
8121 | rt_rq->rq = rq; | ||
8113 | rt_rq->tg = tg; | 8122 | rt_rq->tg = tg; |
8114 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
8115 | 8123 | ||
8124 | tg->rt_rq[cpu] = rt_rq; | ||
8116 | tg->rt_se[cpu] = rt_se; | 8125 | tg->rt_se[cpu] = rt_se; |
8126 | |||
8117 | if (!rt_se) | 8127 | if (!rt_se) |
8118 | return; | 8128 | return; |
8119 | 8129 | ||
@@ -8195,7 +8205,7 @@ void __init sched_init(void) | |||
8195 | rq->nr_running = 0; | 8205 | rq->nr_running = 0; |
8196 | rq->calc_load_active = 0; | 8206 | rq->calc_load_active = 0; |
8197 | rq->calc_load_update = jiffies + LOAD_FREQ; | 8207 | rq->calc_load_update = jiffies + LOAD_FREQ; |
8198 | init_cfs_rq(&rq->cfs, rq); | 8208 | init_cfs_rq(&rq->cfs); |
8199 | init_rt_rq(&rq->rt, rq); | 8209 | init_rt_rq(&rq->rt, rq); |
8200 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8210 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8201 | root_task_group.shares = root_task_group_load; | 8211 | root_task_group.shares = root_task_group_load; |
@@ -8266,7 +8276,7 @@ void __init sched_init(void) | |||
8266 | #endif | 8276 | #endif |
8267 | 8277 | ||
8268 | #ifdef CONFIG_RT_MUTEXES | 8278 | #ifdef CONFIG_RT_MUTEXES |
8269 | plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); | 8279 | plist_head_init(&init_task.pi_waiters); |
8270 | #endif | 8280 | #endif |
8271 | 8281 | ||
8272 | /* | 8282 | /* |
@@ -8300,6 +8310,7 @@ void __init sched_init(void) | |||
8300 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | 8310 | atomic_set(&nohz.load_balancer, nr_cpu_ids); |
8301 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); | 8311 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); |
8302 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); | 8312 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); |
8313 | nohz.next_balance = jiffies; | ||
8303 | #endif | 8314 | #endif |
8304 | /* May be allocated at isolcpus cmdline parse time */ | 8315 | /* May be allocated at isolcpus cmdline parse time */ |
8305 | if (cpu_isolated_map == NULL) | 8316 | if (cpu_isolated_map == NULL) |
@@ -8309,7 +8320,7 @@ void __init sched_init(void) | |||
8309 | scheduler_running = 1; | 8320 | scheduler_running = 1; |
8310 | } | 8321 | } |
8311 | 8322 | ||
8312 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 8323 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP |
8313 | static inline int preempt_count_equals(int preempt_offset) | 8324 | static inline int preempt_count_equals(int preempt_offset) |
8314 | { | 8325 | { |
8315 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); | 8326 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); |
@@ -8317,13 +8328,23 @@ static inline int preempt_count_equals(int preempt_offset) | |||
8317 | return (nested == preempt_offset); | 8328 | return (nested == preempt_offset); |
8318 | } | 8329 | } |
8319 | 8330 | ||
8331 | static int __might_sleep_init_called; | ||
8332 | int __init __might_sleep_init(void) | ||
8333 | { | ||
8334 | __might_sleep_init_called = 1; | ||
8335 | return 0; | ||
8336 | } | ||
8337 | early_initcall(__might_sleep_init); | ||
8338 | |||
8320 | void __might_sleep(const char *file, int line, int preempt_offset) | 8339 | void __might_sleep(const char *file, int line, int preempt_offset) |
8321 | { | 8340 | { |
8322 | #ifdef in_atomic | ||
8323 | static unsigned long prev_jiffy; /* ratelimiting */ | 8341 | static unsigned long prev_jiffy; /* ratelimiting */ |
8324 | 8342 | ||
8325 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || | 8343 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
8326 | system_state != SYSTEM_RUNNING || oops_in_progress) | 8344 | oops_in_progress) |
8345 | return; | ||
8346 | if (system_state != SYSTEM_RUNNING && | ||
8347 | (!__might_sleep_init_called || system_state != SYSTEM_BOOTING)) | ||
8327 | return; | 8348 | return; |
8328 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 8349 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
8329 | return; | 8350 | return; |
@@ -8341,7 +8362,6 @@ void __might_sleep(const char *file, int line, int preempt_offset) | |||
8341 | if (irqs_disabled()) | 8362 | if (irqs_disabled()) |
8342 | print_irqtrace_events(current); | 8363 | print_irqtrace_events(current); |
8343 | dump_stack(); | 8364 | dump_stack(); |
8344 | #endif | ||
8345 | } | 8365 | } |
8346 | EXPORT_SYMBOL(__might_sleep); | 8366 | EXPORT_SYMBOL(__might_sleep); |
8347 | #endif | 8367 | #endif |
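The __might_sleep() change above gates the warning on an early initcall so atomic-sleep debugging (now CONFIG_DEBUG_ATOMIC_SLEEP) also fires late in boot, while staying quiet before the scheduler is usable. A rough sketch of the suppression predicate, with the system states and flag reduced to illustrative stand-ins:

#include <stdbool.h>
#include <stdio.h>

enum sysstate { STATE_BOOTING, STATE_RUNNING };  /* illustrative subset */

static bool might_sleep_init_called;             /* set by the early initcall */

/* return true when the might_sleep warning should be suppressed */
static bool suppress_check(enum sysstate state)
{
	if (state == STATE_RUNNING)
		return false;                    /* always check once running */
	return !might_sleep_init_called || state != STATE_BOOTING;
}

int main(void)
{
	printf("%d\n", suppress_check(STATE_BOOTING));   /* 1: too early */
	might_sleep_init_called = true;
	printf("%d\n", suppress_check(STATE_BOOTING));   /* 0: now checked */
	return 0;
}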
@@ -8500,6 +8520,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8500 | if (!se) | 8520 | if (!se) |
8501 | goto err_free_rq; | 8521 | goto err_free_rq; |
8502 | 8522 | ||
8523 | init_cfs_rq(cfs_rq); | ||
8503 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | 8524 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8504 | } | 8525 | } |
8505 | 8526 | ||
@@ -8527,7 +8548,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | |||
8527 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | 8548 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); |
8528 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 8549 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
8529 | } | 8550 | } |
8530 | #else /* !CONFG_FAIR_GROUP_SCHED */ | 8551 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
8531 | static inline void free_fair_sched_group(struct task_group *tg) | 8552 | static inline void free_fair_sched_group(struct task_group *tg) |
8532 | { | 8553 | { |
8533 | } | 8554 | } |
@@ -8548,7 +8569,8 @@ static void free_rt_sched_group(struct task_group *tg) | |||
8548 | { | 8569 | { |
8549 | int i; | 8570 | int i; |
8550 | 8571 | ||
8551 | destroy_rt_bandwidth(&tg->rt_bandwidth); | 8572 | if (tg->rt_se) |
8573 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
8552 | 8574 | ||
8553 | for_each_possible_cpu(i) { | 8575 | for_each_possible_cpu(i) { |
8554 | if (tg->rt_rq) | 8576 | if (tg->rt_rq) |
@@ -8589,6 +8611,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8589 | if (!rt_se) | 8611 | if (!rt_se) |
8590 | goto err_free_rq; | 8612 | goto err_free_rq; |
8591 | 8613 | ||
8614 | init_rt_rq(rt_rq, cpu_rq(i)); | ||
8615 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
8592 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 8616 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
8593 | } | 8617 | } |
8594 | 8618 | ||
@@ -9067,6 +9091,20 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
9067 | } | 9091 | } |
9068 | 9092 | ||
9069 | static int | 9093 | static int |
9094 | cpu_cgroup_allow_attach(struct cgroup *cgrp, struct task_struct *tsk) | ||
9095 | { | ||
9096 | const struct cred *cred = current_cred(), *tcred; | ||
9097 | |||
9098 | tcred = __task_cred(tsk); | ||
9099 | |||
9100 | if ((current != tsk) && !capable(CAP_SYS_NICE) && | ||
9101 | cred->euid != tcred->uid && cred->euid != tcred->suid) | ||
9102 | return -EACCES; | ||
9103 | |||
9104 | return 0; | ||
9105 | } | ||
9106 | |||
9107 | static int | ||
9070 | cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 9108 | cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
9071 | { | 9109 | { |
9072 | #ifdef CONFIG_RT_GROUP_SCHED | 9110 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -9171,6 +9209,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
9171 | .name = "cpu", | 9209 | .name = "cpu", |
9172 | .create = cpu_cgroup_create, | 9210 | .create = cpu_cgroup_create, |
9173 | .destroy = cpu_cgroup_destroy, | 9211 | .destroy = cpu_cgroup_destroy, |
9212 | .allow_attach = cpu_cgroup_allow_attach, | ||
9174 | .can_attach_task = cpu_cgroup_can_attach_task, | 9213 | .can_attach_task = cpu_cgroup_can_attach_task, |
9175 | .attach_task = cpu_cgroup_attach_task, | 9214 | .attach_task = cpu_cgroup_attach_task, |
9176 | .exit = cpu_cgroup_exit, | 9215 | .exit = cpu_cgroup_exit, |
@@ -9197,8 +9236,30 @@ struct cpuacct { | |||
9197 | u64 __percpu *cpuusage; | 9236 | u64 __percpu *cpuusage; |
9198 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; | 9237 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; |
9199 | struct cpuacct *parent; | 9238 | struct cpuacct *parent; |
9239 | struct cpuacct_charge_calls *cpufreq_fn; | ||
9240 | void *cpuacct_data; | ||
9200 | }; | 9241 | }; |
9201 | 9242 | ||
9243 | static struct cpuacct *cpuacct_root; | ||
9244 | |||
9245 | /* Default calls for cpufreq accounting */ | ||
9246 | static struct cpuacct_charge_calls *cpuacct_cpufreq; | ||
9247 | int cpuacct_register_cpufreq(struct cpuacct_charge_calls *fn) | ||
9248 | { | ||
9249 | cpuacct_cpufreq = fn; | ||
9250 | |||
9251 | /* | ||
9252 | * Root node is created before platform can register callbacks, | ||
9253 | * initialize here. | ||
9254 | */ | ||
9255 | if (cpuacct_root && fn) { | ||
9256 | cpuacct_root->cpufreq_fn = fn; | ||
9257 | if (fn->init) | ||
9258 | fn->init(&cpuacct_root->cpuacct_data); | ||
9259 | } | ||
9260 | return 0; | ||
9261 | } | ||
9262 | |||
9202 | struct cgroup_subsys cpuacct_subsys; | 9263 | struct cgroup_subsys cpuacct_subsys; |
9203 | 9264 | ||
9204 | /* return cpu accounting group corresponding to this container */ | 9265 | /* return cpu accounting group corresponding to this container */ |
@@ -9233,8 +9294,16 @@ static struct cgroup_subsys_state *cpuacct_create( | |||
9233 | if (percpu_counter_init(&ca->cpustat[i], 0)) | 9294 | if (percpu_counter_init(&ca->cpustat[i], 0)) |
9234 | goto out_free_counters; | 9295 | goto out_free_counters; |
9235 | 9296 | ||
9297 | ca->cpufreq_fn = cpuacct_cpufreq; | ||
9298 | |||
9299 | /* If available, have platform code initialize cpu frequency table */ | ||
9300 | if (ca->cpufreq_fn && ca->cpufreq_fn->init) | ||
9301 | ca->cpufreq_fn->init(&ca->cpuacct_data); | ||
9302 | |||
9236 | if (cgrp->parent) | 9303 | if (cgrp->parent) |
9237 | ca->parent = cgroup_ca(cgrp->parent); | 9304 | ca->parent = cgroup_ca(cgrp->parent); |
9305 | else | ||
9306 | cpuacct_root = ca; | ||
9238 | 9307 | ||
9239 | return &ca->css; | 9308 | return &ca->css; |
9240 | 9309 | ||
@@ -9362,6 +9431,32 @@ static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | |||
9362 | return 0; | 9431 | return 0; |
9363 | } | 9432 | } |
9364 | 9433 | ||
9434 | static int cpuacct_cpufreq_show(struct cgroup *cgrp, struct cftype *cft, | ||
9435 | struct cgroup_map_cb *cb) | ||
9436 | { | ||
9437 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
9438 | if (ca->cpufreq_fn && ca->cpufreq_fn->cpufreq_show) | ||
9439 | ca->cpufreq_fn->cpufreq_show(ca->cpuacct_data, cb); | ||
9440 | |||
9441 | return 0; | ||
9442 | } | ||
9443 | |||
9444 | /* return total cpu power usage (milliWatt second) of a group */ | ||
9445 | static u64 cpuacct_powerusage_read(struct cgroup *cgrp, struct cftype *cft) | ||
9446 | { | ||
9447 | int i; | ||
9448 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
9449 | u64 totalpower = 0; | ||
9450 | |||
9451 | if (ca->cpufreq_fn && ca->cpufreq_fn->power_usage) | ||
9452 | for_each_present_cpu(i) { | ||
9453 | totalpower += ca->cpufreq_fn->power_usage( | ||
9454 | ca->cpuacct_data); | ||
9455 | } | ||
9456 | |||
9457 | return totalpower; | ||
9458 | } | ||
9459 | |||
9365 | static struct cftype files[] = { | 9460 | static struct cftype files[] = { |
9366 | { | 9461 | { |
9367 | .name = "usage", | 9462 | .name = "usage", |
@@ -9376,6 +9471,14 @@ static struct cftype files[] = { | |||
9376 | .name = "stat", | 9471 | .name = "stat", |
9377 | .read_map = cpuacct_stats_show, | 9472 | .read_map = cpuacct_stats_show, |
9378 | }, | 9473 | }, |
9474 | { | ||
9475 | .name = "cpufreq", | ||
9476 | .read_map = cpuacct_cpufreq_show, | ||
9477 | }, | ||
9478 | { | ||
9479 | .name = "power", | ||
9480 | .read_u64 = cpuacct_powerusage_read | ||
9481 | }, | ||
9379 | }; | 9482 | }; |
9380 | 9483 | ||
9381 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | 9484 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) |
@@ -9405,6 +9508,10 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
9405 | for (; ca; ca = ca->parent) { | 9508 | for (; ca; ca = ca->parent) { |
9406 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | 9509 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); |
9407 | *cpuusage += cputime; | 9510 | *cpuusage += cputime; |
9511 | |||
9512 | /* Call back into platform code to account for CPU speeds */ | ||
9513 | if (ca->cpufreq_fn && ca->cpufreq_fn->charge) | ||
9514 | ca->cpufreq_fn->charge(ca->cpuacct_data, cputime, cpu); | ||
9408 | } | 9515 | } |
9409 | 9516 | ||
9410 | rcu_read_unlock(); | 9517 | rcu_read_unlock(); |
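The cpuacct additions above let platform code register a table of cpufreq/power accounting callbacks that the per-cgroup charge path invokes when present. A minimal sketch of that registration-plus-callback pattern; the struct layout, hook names and the toy platform implementation are assumptions for illustration only:

#include <stddef.h>
#include <stdio.h>

struct charge_calls {
	int  (*init)(void **data);
	void (*charge)(void *data, unsigned long long cputime, int cpu);
};

static struct charge_calls *registered_calls;    /* registered hooks */
static void *hook_data;

static int register_cpufreq_hooks(struct charge_calls *fn)
{
	registered_calls = fn;
	if (fn && fn->init)
		return fn->init(&hook_data);
	return 0;
}

static void charge(unsigned long long cputime, int cpu)
{
	/* only call back into platform code if hooks were registered */
	if (registered_calls && registered_calls->charge)
		registered_calls->charge(hook_data, cputime, cpu);
}

/* toy platform implementation */
static int  toy_init(void **data) { *data = NULL; return 0; }
static void toy_charge(void *data, unsigned long long t, int cpu)
{
	(void)data;
	printf("cpu %d charged %llu ns\n", cpu, t);
}
static struct charge_calls toy_calls = { toy_init, toy_charge };

int main(void)
{
	register_cpufreq_hooks(&toy_calls);
	charge(123456, 0);
	return 0;
}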
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 05577055cfc..c2f0e7248dc 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h | |||
@@ -13,6 +13,7 @@ struct autogroup { | |||
13 | int nice; | 13 | int nice; |
14 | }; | 14 | }; |
15 | 15 | ||
16 | static inline bool task_group_is_autogroup(struct task_group *tg); | ||
16 | static inline struct task_group * | 17 | static inline struct task_group * |
17 | autogroup_task_group(struct task_struct *p, struct task_group *tg); | 18 | autogroup_task_group(struct task_struct *p, struct task_group *tg); |
18 | 19 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 334eb474af9..22999b257ad 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
135 | return grp->my_q; | 135 | return grp->my_q; |
136 | } | 136 | } |
137 | 137 | ||
138 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on | ||
139 | * another cpu ('this_cpu') | ||
140 | */ | ||
141 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
142 | { | ||
143 | return cfs_rq->tg->cfs_rq[this_cpu]; | ||
144 | } | ||
145 | |||
146 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 138 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
147 | { | 139 | { |
148 | if (!cfs_rq->on_list) { | 140 | if (!cfs_rq->on_list) { |
@@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
271 | return NULL; | 263 | return NULL; |
272 | } | 264 | } |
273 | 265 | ||
274 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
275 | { | ||
276 | return &cpu_rq(this_cpu)->cfs; | ||
277 | } | ||
278 | |||
279 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 266 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
280 | { | 267 | { |
281 | } | 268 | } |
@@ -334,11 +321,6 @@ static inline int entity_before(struct sched_entity *a, | |||
334 | return (s64)(a->vruntime - b->vruntime) < 0; | 321 | return (s64)(a->vruntime - b->vruntime) < 0; |
335 | } | 322 | } |
336 | 323 | ||
337 | static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
338 | { | ||
339 | return se->vruntime - cfs_rq->min_vruntime; | ||
340 | } | ||
341 | |||
342 | static void update_min_vruntime(struct cfs_rq *cfs_rq) | 324 | static void update_min_vruntime(struct cfs_rq *cfs_rq) |
343 | { | 325 | { |
344 | u64 vruntime = cfs_rq->min_vruntime; | 326 | u64 vruntime = cfs_rq->min_vruntime; |
@@ -372,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
372 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | 354 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; |
373 | struct rb_node *parent = NULL; | 355 | struct rb_node *parent = NULL; |
374 | struct sched_entity *entry; | 356 | struct sched_entity *entry; |
375 | s64 key = entity_key(cfs_rq, se); | ||
376 | int leftmost = 1; | 357 | int leftmost = 1; |
377 | 358 | ||
378 | /* | 359 | /* |
@@ -385,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
385 | * We dont care about collisions. Nodes with | 366 | * We dont care about collisions. Nodes with |
386 | * the same key stay together. | 367 | * the same key stay together. |
387 | */ | 368 | */ |
388 | if (key < entity_key(cfs_rq, entry)) { | 369 | if (entity_before(se, entry)) { |
389 | link = &parent->rb_left; | 370 | link = &parent->rb_left; |
390 | } else { | 371 | } else { |
391 | link = &parent->rb_right; | 372 | link = &parent->rb_right; |
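The __enqueue_entity() change in the hunk above drops the per-queue entity_key() offset and orders the timeline rbtree with entity_before(), a signed 64-bit vruntime difference that stays correct across vruntime wraparound. A small sketch of that predicate, with made-up values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sched_entity { uint64_t vruntime; };

static bool entity_before(const struct sched_entity *a,
                          const struct sched_entity *b)
{
	return (int64_t)(a->vruntime - b->vruntime) < 0;
}

int main(void)
{
	struct sched_entity a = { .vruntime = UINT64_MAX - 5 };  /* about to wrap */
	struct sched_entity b = { .vruntime = 10 };              /* already wrapped */

	/* the signed difference handles the wrap: a still sorts before b */
	printf("%d\n", entity_before(&a, &b));   /* prints 1 */
	return 0;
}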
@@ -1336,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1336 | } | 1317 | } |
1337 | 1318 | ||
1338 | for_each_sched_entity(se) { | 1319 | for_each_sched_entity(se) { |
1339 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1320 | cfs_rq = cfs_rq_of(se); |
1340 | 1321 | ||
1341 | update_cfs_load(cfs_rq, 0); | 1322 | update_cfs_load(cfs_rq, 0); |
1342 | update_cfs_shares(cfs_rq); | 1323 | update_cfs_shares(cfs_rq); |
@@ -1370,13 +1351,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1370 | */ | 1351 | */ |
1371 | if (task_sleep && parent_entity(se)) | 1352 | if (task_sleep && parent_entity(se)) |
1372 | set_next_buddy(parent_entity(se)); | 1353 | set_next_buddy(parent_entity(se)); |
1354 | |||
1355 | /* avoid re-evaluating load for this entity */ | ||
1356 | se = parent_entity(se); | ||
1373 | break; | 1357 | break; |
1374 | } | 1358 | } |
1375 | flags |= DEQUEUE_SLEEP; | 1359 | flags |= DEQUEUE_SLEEP; |
1376 | } | 1360 | } |
1377 | 1361 | ||
1378 | for_each_sched_entity(se) { | 1362 | for_each_sched_entity(se) { |
1379 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1363 | cfs_rq = cfs_rq_of(se); |
1380 | 1364 | ||
1381 | update_cfs_load(cfs_rq, 0); | 1365 | update_cfs_load(cfs_rq, 0); |
1382 | update_cfs_shares(cfs_rq); | 1366 | update_cfs_shares(cfs_rq); |
@@ -1481,7 +1465,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
1481 | * effect of the currently running task from the load | 1465 | * effect of the currently running task from the load |
1482 | * of the current CPU: | 1466 | * of the current CPU: |
1483 | */ | 1467 | */ |
1484 | rcu_read_lock(); | ||
1485 | if (sync) { | 1468 | if (sync) { |
1486 | tg = task_group(current); | 1469 | tg = task_group(current); |
1487 | weight = current->se.load.weight; | 1470 | weight = current->se.load.weight; |
@@ -1517,7 +1500,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
1517 | balanced = this_eff_load <= prev_eff_load; | 1500 | balanced = this_eff_load <= prev_eff_load; |
1518 | } else | 1501 | } else |
1519 | balanced = true; | 1502 | balanced = true; |
1520 | rcu_read_unlock(); | ||
1521 | 1503 | ||
1522 | /* | 1504 | /* |
1523 | * If the currently running task will sleep within | 1505 | * If the currently running task will sleep within |
@@ -1924,8 +1906,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1924 | if (!sched_feat(WAKEUP_PREEMPT)) | 1906 | if (!sched_feat(WAKEUP_PREEMPT)) |
1925 | return; | 1907 | return; |
1926 | 1908 | ||
1927 | update_curr(cfs_rq); | ||
1928 | find_matching_se(&se, &pse); | 1909 | find_matching_se(&se, &pse); |
1910 | update_curr(cfs_rq_of(se)); | ||
1929 | BUG_ON(!pse); | 1911 | BUG_ON(!pse); |
1930 | if (wakeup_preempt_entity(se, pse) == 1) { | 1912 | if (wakeup_preempt_entity(se, pse) == 1) { |
1931 | /* | 1913 | /* |
@@ -2234,11 +2216,43 @@ static void update_shares(int cpu) | |||
2234 | struct rq *rq = cpu_rq(cpu); | 2216 | struct rq *rq = cpu_rq(cpu); |
2235 | 2217 | ||
2236 | rcu_read_lock(); | 2218 | rcu_read_lock(); |
2219 | /* | ||
2220 | * Iterates the task_group tree in a bottom up fashion, see | ||
2221 | * list_add_leaf_cfs_rq() for details. | ||
2222 | */ | ||
2237 | for_each_leaf_cfs_rq(rq, cfs_rq) | 2223 | for_each_leaf_cfs_rq(rq, cfs_rq) |
2238 | update_shares_cpu(cfs_rq->tg, cpu); | 2224 | update_shares_cpu(cfs_rq->tg, cpu); |
2239 | rcu_read_unlock(); | 2225 | rcu_read_unlock(); |
2240 | } | 2226 | } |
2241 | 2227 | ||
2228 | /* | ||
2229 | * Compute the cpu's hierarchical load factor for each task group. | ||
2230 | * This needs to be done in a top-down fashion because the load of a child | ||
2231 | * group is a fraction of its parent's load. | ||
2232 | */ | ||
2233 | static int tg_load_down(struct task_group *tg, void *data) | ||
2234 | { | ||
2235 | unsigned long load; | ||
2236 | long cpu = (long)data; | ||
2237 | |||
2238 | if (!tg->parent) { | ||
2239 | load = cpu_rq(cpu)->load.weight; | ||
2240 | } else { | ||
2241 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
2242 | load *= tg->se[cpu]->load.weight; | ||
2243 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | ||
2244 | } | ||
2245 | |||
2246 | tg->cfs_rq[cpu]->h_load = load; | ||
2247 | |||
2248 | return 0; | ||
2249 | } | ||
2250 | |||
2251 | static void update_h_load(long cpu) | ||
2252 | { | ||
2253 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | ||
2254 | } | ||
2255 | |||
2242 | static unsigned long | 2256 | static unsigned long |
2243 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2257 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2244 | unsigned long max_load_move, | 2258 | unsigned long max_load_move, |
@@ -2246,14 +2260,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2246 | int *all_pinned) | 2260 | int *all_pinned) |
2247 | { | 2261 | { |
2248 | long rem_load_move = max_load_move; | 2262 | long rem_load_move = max_load_move; |
2249 | int busiest_cpu = cpu_of(busiest); | 2263 | struct cfs_rq *busiest_cfs_rq; |
2250 | struct task_group *tg; | ||
2251 | 2264 | ||
2252 | rcu_read_lock(); | 2265 | rcu_read_lock(); |
2253 | update_h_load(busiest_cpu); | 2266 | update_h_load(cpu_of(busiest)); |
2254 | 2267 | ||
2255 | list_for_each_entry_rcu(tg, &task_groups, list) { | 2268 | for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { |
2256 | struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; | ||
2257 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; | 2269 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; |
2258 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; | 2270 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; |
2259 | u64 rem_load, moved_load; | 2271 | u64 rem_load, moved_load; |
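tg_load_down()/update_h_load(), moved into sched_fair.c in the hunks above, compute each group's per-cpu hierarchical load top-down: a child's h_load is its parent's h_load scaled by the child entity's weight over the parent runqueue weight, with +1 guarding against a zero divisor. A worked sketch with made-up numbers:

#include <stdio.h>

static unsigned long child_h_load(unsigned long parent_h_load,
                                  unsigned long se_weight,
                                  unsigned long parent_rq_weight)
{
	unsigned long load = parent_h_load;

	load *= se_weight;
	load /= parent_rq_weight + 1;
	return load;
}

int main(void)
{
	/* rq weight 3072, group entity weight 1024 -> roughly a third */
	printf("%lu\n", child_h_load(3072, 1024, 3072));   /* prints 1023 */
	return 0;
}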
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 1e7066d76c2..2e74677cb04 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -61,9 +61,9 @@ SCHED_FEAT(LB_BIAS, 1) | |||
61 | SCHED_FEAT(OWNER_SPIN, 1) | 61 | SCHED_FEAT(OWNER_SPIN, 1) |
62 | 62 | ||
63 | /* | 63 | /* |
64 | * Decrement CPU power based on irq activity | 64 | * Decrement CPU power based on time not spent running tasks |
65 | */ | 65 | */ |
66 | SCHED_FEAT(NONIRQ_POWER, 1) | 66 | SCHED_FEAT(NONTASK_POWER, 1) |
67 | 67 | ||
68 | /* | 68 | /* |
69 | * Queue remote wakeups on the target CPU and process them | 69 | * Queue remote wakeups on the target CPU and process them |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index db04161fe37..b827550a0d0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -187,11 +187,23 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
187 | 187 | ||
188 | typedef struct task_group *rt_rq_iter_t; | 188 | typedef struct task_group *rt_rq_iter_t; |
189 | 189 | ||
190 | #define for_each_rt_rq(rt_rq, iter, rq) \ | 190 | static inline struct task_group *next_task_group(struct task_group *tg) |
191 | for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ | 191 | { |
192 | (&iter->list != &task_groups) && \ | 192 | do { |
193 | (rt_rq = iter->rt_rq[cpu_of(rq)]); \ | 193 | tg = list_entry_rcu(tg->list.next, |
194 | iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) | 194 | typeof(struct task_group), list); |
195 | } while (&tg->list != &task_groups && task_group_is_autogroup(tg)); | ||
196 | |||
197 | if (&tg->list == &task_groups) | ||
198 | tg = NULL; | ||
199 | |||
200 | return tg; | ||
201 | } | ||
202 | |||
203 | #define for_each_rt_rq(rt_rq, iter, rq) \ | ||
204 | for (iter = container_of(&task_groups, typeof(*iter), list); \ | ||
205 | (iter = next_task_group(iter)) && \ | ||
206 | (rt_rq = iter->rt_rq[cpu_of(rq)]);) | ||
195 | 207 | ||
196 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | 208 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) |
197 | { | 209 | { |
@@ -1045,7 +1057,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1045 | */ | 1057 | */ |
1046 | if (curr && unlikely(rt_task(curr)) && | 1058 | if (curr && unlikely(rt_task(curr)) && |
1047 | (curr->rt.nr_cpus_allowed < 2 || | 1059 | (curr->rt.nr_cpus_allowed < 2 || |
1048 | curr->prio < p->prio) && | 1060 | curr->prio <= p->prio) && |
1049 | (p->rt.nr_cpus_allowed > 1)) { | 1061 | (p->rt.nr_cpus_allowed > 1)) { |
1050 | int target = find_lowest_rq(p); | 1062 | int target = find_lowest_rq(p); |
1051 | 1063 | ||
@@ -1133,7 +1145,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
1133 | 1145 | ||
1134 | rt_rq = &rq->rt; | 1146 | rt_rq = &rq->rt; |
1135 | 1147 | ||
1136 | if (unlikely(!rt_rq->rt_nr_running)) | 1148 | if (!rt_rq->rt_nr_running) |
1137 | return NULL; | 1149 | return NULL; |
1138 | 1150 | ||
1139 | if (rt_rq_throttled(rt_rq)) | 1151 | if (rt_rq_throttled(rt_rq)) |
@@ -1555,7 +1567,7 @@ skip: | |||
1555 | static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) | 1567 | static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) |
1556 | { | 1568 | { |
1557 | /* Try to pull RT tasks here if we lower this rq's prio */ | 1569 | /* Try to pull RT tasks here if we lower this rq's prio */ |
1558 | if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) | 1570 | if (rq->rt.highest_prio.curr > prev->prio) |
1559 | pull_rt_task(rq); | 1571 | pull_rt_task(rq); |
1560 | } | 1572 | } |
1561 | 1573 | ||
@@ -1576,7 +1588,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
1576 | p->rt.nr_cpus_allowed > 1 && | 1588 | p->rt.nr_cpus_allowed > 1 && |
1577 | rt_task(rq->curr) && | 1589 | rt_task(rq->curr) && |
1578 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1590 | (rq->curr->rt.nr_cpus_allowed < 2 || |
1579 | rq->curr->prio < p->prio)) | 1591 | rq->curr->prio <= p->prio)) |
1580 | push_rt_tasks(rq); | 1592 | push_rt_tasks(rq); |
1581 | } | 1593 | } |
1582 | 1594 | ||
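The two sched_rt.c comparisons relaxed above from '<' to '<=' mean a waking RT task is pushed to another CPU even when the running task has merely equal priority (a lower prio value means higher priority), so equal-priority RT tasks no longer stack on one runqueue. A hedged sketch of the push decision with stand-in types:

#include <stdbool.h>
#include <stdio.h>

struct rt_task { int prio; int nr_cpus_allowed; };

static bool push_waking_task_away(const struct rt_task *curr,
                                  const struct rt_task *waking)
{
	return (curr->nr_cpus_allowed < 2 || curr->prio <= waking->prio) &&
	       waking->nr_cpus_allowed > 1;
}

int main(void)
{
	struct rt_task curr   = { .prio = 50, .nr_cpus_allowed = 4 };
	struct rt_task waking = { .prio = 50, .nr_cpus_allowed = 4 };

	/* equal priority: previously kept local, now offloaded */
	printf("%d\n", push_waking_task_away(&curr, &waking));   /* prints 1 */
	return 0;
}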
diff --git a/kernel/signal.c b/kernel/signal.c index 415d85d6f6c..195331c56ad 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) | |||
87 | /* | 87 | /* |
88 | * Tracers may want to know about even ignored signals. | 88 | * Tracers may want to know about even ignored signals. |
89 | */ | 89 | */ |
90 | return !tracehook_consider_ignored_signal(t, sig); | 90 | return !t->ptrace; |
91 | } | 91 | } |
92 | 92 | ||
93 | /* | 93 | /* |
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) | |||
124 | 124 | ||
125 | static int recalc_sigpending_tsk(struct task_struct *t) | 125 | static int recalc_sigpending_tsk(struct task_struct *t) |
126 | { | 126 | { |
127 | if ((t->group_stop & GROUP_STOP_PENDING) || | 127 | if ((t->jobctl & JOBCTL_PENDING_MASK) || |
128 | PENDING(&t->pending, &t->blocked) || | 128 | PENDING(&t->pending, &t->blocked) || |
129 | PENDING(&t->signal->shared_pending, &t->blocked)) { | 129 | PENDING(&t->signal->shared_pending, &t->blocked)) { |
130 | set_tsk_thread_flag(t, TIF_SIGPENDING); | 130 | set_tsk_thread_flag(t, TIF_SIGPENDING); |
@@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t) | |||
150 | 150 | ||
151 | void recalc_sigpending(void) | 151 | void recalc_sigpending(void) |
152 | { | 152 | { |
153 | if (unlikely(tracehook_force_sigpending())) | 153 | if (!recalc_sigpending_tsk(current) && !freezing(current)) |
154 | set_thread_flag(TIF_SIGPENDING); | ||
155 | else if (!recalc_sigpending_tsk(current) && !freezing(current)) | ||
156 | clear_thread_flag(TIF_SIGPENDING); | 154 | clear_thread_flag(TIF_SIGPENDING); |
157 | 155 | ||
158 | } | 156 | } |
@@ -224,47 +222,93 @@ static inline void print_dropped_signal(int sig) | |||
224 | } | 222 | } |
225 | 223 | ||
226 | /** | 224 | /** |
227 | * task_clear_group_stop_trapping - clear group stop trapping bit | 225 | * task_set_jobctl_pending - set jobctl pending bits |
228 | * @task: target task | 226 | * @task: target task |
227 | * @mask: pending bits to set | ||
229 | * | 228 | * |
230 | * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it | 229 | * Set @mask in @task->jobctl. @mask must be subset of |
231 | * and wake up the ptracer. Note that we don't need any further locking. | 230 | * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK | |
232 | * @task->siglock guarantees that @task->parent points to the ptracer. | 231 | * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is |
232 | * cleared. If @task is already being killed or exiting, this function | ||
233 | * becomes noop. | ||
234 | * | ||
235 | * CONTEXT: | ||
236 | * Must be called with @task->sighand->siglock held. | ||
237 | * | ||
238 | * RETURNS: | ||
239 | * %true if @mask is set, %false if made noop because @task was dying. | ||
240 | */ | ||
241 | bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) | ||
242 | { | ||
243 | BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | | ||
244 | JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); | ||
245 | BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK)); | ||
246 | |||
247 | if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING))) | ||
248 | return false; | ||
249 | |||
250 | if (mask & JOBCTL_STOP_SIGMASK) | ||
251 | task->jobctl &= ~JOBCTL_STOP_SIGMASK; | ||
252 | |||
253 | task->jobctl |= mask; | ||
254 | return true; | ||
255 | } | ||
256 | |||
257 | /** | ||
258 | * task_clear_jobctl_trapping - clear jobctl trapping bit | ||
259 | * @task: target task | ||
260 | * | ||
261 | * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED. | ||
262 | * Clear it and wake up the ptracer. Note that we don't need any further | ||
263 | * locking. @task->siglock guarantees that @task->parent points to the | ||
264 | * ptracer. | ||
233 | * | 265 | * |
234 | * CONTEXT: | 266 | * CONTEXT: |
235 | * Must be called with @task->sighand->siglock held. | 267 | * Must be called with @task->sighand->siglock held. |
236 | */ | 268 | */ |
237 | static void task_clear_group_stop_trapping(struct task_struct *task) | 269 | void task_clear_jobctl_trapping(struct task_struct *task) |
238 | { | 270 | { |
239 | if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { | 271 | if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { |
240 | task->group_stop &= ~GROUP_STOP_TRAPPING; | 272 | task->jobctl &= ~JOBCTL_TRAPPING; |
241 | __wake_up_sync_key(&task->parent->signal->wait_chldexit, | 273 | wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); |
242 | TASK_UNINTERRUPTIBLE, 1, task); | ||
243 | } | 274 | } |
244 | } | 275 | } |
245 | 276 | ||
246 | /** | 277 | /** |
247 | * task_clear_group_stop_pending - clear pending group stop | 278 | * task_clear_jobctl_pending - clear jobctl pending bits |
248 | * @task: target task | 279 | * @task: target task |
280 | * @mask: pending bits to clear | ||
249 | * | 281 | * |
250 | * Clear group stop states for @task. | 282 | * Clear @mask from @task->jobctl. @mask must be subset of |
283 | * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other | ||
284 | * STOP bits are cleared together. | ||
285 | * | ||
286 | * If clearing of @mask leaves no stop or trap pending, this function calls | ||
287 | * task_clear_jobctl_trapping(). | ||
251 | * | 288 | * |
252 | * CONTEXT: | 289 | * CONTEXT: |
253 | * Must be called with @task->sighand->siglock held. | 290 | * Must be called with @task->sighand->siglock held. |
254 | */ | 291 | */ |
255 | void task_clear_group_stop_pending(struct task_struct *task) | 292 | void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) |
256 | { | 293 | { |
257 | task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | | 294 | BUG_ON(mask & ~JOBCTL_PENDING_MASK); |
258 | GROUP_STOP_DEQUEUED); | 295 | |
296 | if (mask & JOBCTL_STOP_PENDING) | ||
297 | mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED; | ||
298 | |||
299 | task->jobctl &= ~mask; | ||
300 | |||
301 | if (!(task->jobctl & JOBCTL_PENDING_MASK)) | ||
302 | task_clear_jobctl_trapping(task); | ||
259 | } | 303 | } |
260 | 304 | ||
261 | /** | 305 | /** |
262 | * task_participate_group_stop - participate in a group stop | 306 | * task_participate_group_stop - participate in a group stop |
263 | * @task: task participating in a group stop | 307 | * @task: task participating in a group stop |
264 | * | 308 | * |
265 | * @task has GROUP_STOP_PENDING set and is participating in a group stop. | 309 | * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. |
266 | * Group stop states are cleared and the group stop count is consumed if | 310 | * Group stop states are cleared and the group stop count is consumed if |
267 | * %GROUP_STOP_CONSUME was set. If the consumption completes the group | 311 | * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group |
268 | * stop, the appropriate %SIGNAL_* flags are set. | 312 | * stop, the appropriate %SIGNAL_* flags are set. |
269 | * | 313 | * |
270 | * CONTEXT: | 314 | * CONTEXT: |
@@ -277,11 +321,11 @@ void task_clear_group_stop_pending(struct task_struct *task) | |||
277 | static bool task_participate_group_stop(struct task_struct *task) | 321 | static bool task_participate_group_stop(struct task_struct *task) |
278 | { | 322 | { |
279 | struct signal_struct *sig = task->signal; | 323 | struct signal_struct *sig = task->signal; |
280 | bool consume = task->group_stop & GROUP_STOP_CONSUME; | 324 | bool consume = task->jobctl & JOBCTL_STOP_CONSUME; |
281 | 325 | ||
282 | WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); | 326 | WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)); |
283 | 327 | ||
284 | task_clear_group_stop_pending(task); | 328 | task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING); |
285 | 329 | ||
286 | if (!consume) | 330 | if (!consume) |
287 | return false; | 331 | return false; |
@@ -449,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig) | |||
449 | return 1; | 493 | return 1; |
450 | if (handler != SIG_IGN && handler != SIG_DFL) | 494 | if (handler != SIG_IGN && handler != SIG_DFL) |
451 | return 0; | 495 | return 0; |
452 | return !tracehook_consider_fatal_signal(tsk, sig); | 496 | /* if ptraced, let the tracer determine */ |
497 | return !tsk->ptrace; | ||
453 | } | 498 | } |
454 | 499 | ||
455 | /* | 500 | /* |
@@ -604,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
604 | * is to alert stop-signal processing code when another | 649 | * is to alert stop-signal processing code when another |
605 | * processor has come along and cleared the flag. | 650 | * processor has come along and cleared the flag. |
606 | */ | 651 | */ |
607 | current->group_stop |= GROUP_STOP_DEQUEUED; | 652 | current->jobctl |= JOBCTL_STOP_DEQUEUED; |
608 | } | 653 | } |
609 | if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { | 654 | if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { |
610 | /* | 655 | /* |
@@ -773,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
773 | return security_task_kill(t, info, sig, 0); | 818 | return security_task_kill(t, info, sig, 0); |
774 | } | 819 | } |
775 | 820 | ||
821 | /** | ||
822 | * ptrace_trap_notify - schedule trap to notify ptracer | ||
823 | * @t: tracee wanting to notify tracer | ||
824 | * | ||
825 | * This function schedules sticky ptrace trap which is cleared on the next | ||
826 | * TRAP_STOP to notify ptracer of an event. @t must have been seized by | ||
827 | * ptracer. | ||
828 | * | ||
829 | * If @t is running, STOP trap will be taken. If trapped for STOP and | ||
830 | * ptracer is listening for events, tracee is woken up so that it can | ||
831 | * re-trap for the new event. If trapped otherwise, STOP trap will be | ||
832 | * eventually taken without returning to userland after the existing traps | ||
833 | * are finished by PTRACE_CONT. | ||
834 | * | ||
835 | * CONTEXT: | ||
836 | * Must be called with @task->sighand->siglock held. | ||
837 | */ | ||
838 | static void ptrace_trap_notify(struct task_struct *t) | ||
839 | { | ||
840 | WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); | ||
841 | assert_spin_locked(&t->sighand->siglock); | ||
842 | |||
843 | task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); | ||
844 | signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); | ||
845 | } | ||
846 | |||
776 | /* | 847 | /* |
777 | * Handle magic process-wide effects of stop/continue signals. Unlike | 848 | * Handle magic process-wide effects of stop/continue signals. Unlike |
778 | * the signal actions, these happen immediately at signal-generation | 849 | * the signal actions, these happen immediately at signal-generation |
@@ -809,9 +880,12 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) | |||
809 | rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); | 880 | rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); |
810 | t = p; | 881 | t = p; |
811 | do { | 882 | do { |
812 | task_clear_group_stop_pending(t); | 883 | task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); |
813 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | 884 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); |
814 | wake_up_state(t, __TASK_STOPPED); | 885 | if (likely(!(t->ptrace & PT_SEIZED))) |
886 | wake_up_state(t, __TASK_STOPPED); | ||
887 | else | ||
888 | ptrace_trap_notify(t); | ||
815 | } while_each_thread(p, t); | 889 | } while_each_thread(p, t); |
816 | 890 | ||
817 | /* | 891 | /* |
@@ -908,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) | |||
908 | if (sig_fatal(p, sig) && | 982 | if (sig_fatal(p, sig) && |
909 | !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && | 983 | !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && |
910 | !sigismember(&t->real_blocked, sig) && | 984 | !sigismember(&t->real_blocked, sig) && |
911 | (sig == SIGKILL || | 985 | (sig == SIGKILL || !t->ptrace)) { |
912 | !tracehook_consider_fatal_signal(t, sig))) { | ||
913 | /* | 986 | /* |
914 | * This signal will be fatal to the whole group. | 987 | * This signal will be fatal to the whole group. |
915 | */ | 988 | */ |
@@ -925,7 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) | |||
925 | signal->group_stop_count = 0; | 998 | signal->group_stop_count = 0; |
926 | t = p; | 999 | t = p; |
927 | do { | 1000 | do { |
928 | task_clear_group_stop_pending(t); | 1001 | task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); |
929 | sigaddset(&t->pending.signal, SIGKILL); | 1002 | sigaddset(&t->pending.signal, SIGKILL); |
930 | signal_wake_up(t, 1); | 1003 | signal_wake_up(t, 1); |
931 | } while_each_thread(p, t); | 1004 | } while_each_thread(p, t); |
@@ -1160,7 +1233,7 @@ int zap_other_threads(struct task_struct *p) | |||
1160 | p->signal->group_stop_count = 0; | 1233 | p->signal->group_stop_count = 0; |
1161 | 1234 | ||
1162 | while_each_thread(p, t) { | 1235 | while_each_thread(p, t) { |
1163 | task_clear_group_stop_pending(t); | 1236 | task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); |
1164 | count++; | 1237 | count++; |
1165 | 1238 | ||
1166 | /* Don't bother with already dead threads */ | 1239 | /* Don't bother with already dead threads */ |
@@ -1511,22 +1584,22 @@ ret: | |||
1511 | * Let a parent know about the death of a child. | 1584 | * Let a parent know about the death of a child. |
1512 | * For a stopped/continued status change, use do_notify_parent_cldstop instead. | 1585 | * For a stopped/continued status change, use do_notify_parent_cldstop instead. |
1513 | * | 1586 | * |
1514 | * Returns -1 if our parent ignored us and so we've switched to | 1587 | * Returns true if our parent ignored us and so we've switched to |
1515 | * self-reaping, or else @sig. | 1588 | * self-reaping. |
1516 | */ | 1589 | */ |
1517 | int do_notify_parent(struct task_struct *tsk, int sig) | 1590 | bool do_notify_parent(struct task_struct *tsk, int sig) |
1518 | { | 1591 | { |
1519 | struct siginfo info; | 1592 | struct siginfo info; |
1520 | unsigned long flags; | 1593 | unsigned long flags; |
1521 | struct sighand_struct *psig; | 1594 | struct sighand_struct *psig; |
1522 | int ret = sig; | 1595 | bool autoreap = false; |
1523 | 1596 | ||
1524 | BUG_ON(sig == -1); | 1597 | BUG_ON(sig == -1); |
1525 | 1598 | ||
1526 | /* do_notify_parent_cldstop should have been called instead. */ | 1599 | /* do_notify_parent_cldstop should have been called instead. */ |
1527 | BUG_ON(task_is_stopped_or_traced(tsk)); | 1600 | BUG_ON(task_is_stopped_or_traced(tsk)); |
1528 | 1601 | ||
1529 | BUG_ON(!task_ptrace(tsk) && | 1602 | BUG_ON(!tsk->ptrace && |
1530 | (tsk->group_leader != tsk || !thread_group_empty(tsk))); | 1603 | (tsk->group_leader != tsk || !thread_group_empty(tsk))); |
1531 | 1604 | ||
1532 | info.si_signo = sig; | 1605 | info.si_signo = sig; |
@@ -1565,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) | |||
1565 | 1638 | ||
1566 | psig = tsk->parent->sighand; | 1639 | psig = tsk->parent->sighand; |
1567 | spin_lock_irqsave(&psig->siglock, flags); | 1640 | spin_lock_irqsave(&psig->siglock, flags); |
1568 | if (!task_ptrace(tsk) && sig == SIGCHLD && | 1641 | if (!tsk->ptrace && sig == SIGCHLD && |
1569 | (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || | 1642 | (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || |
1570 | (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { | 1643 | (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { |
1571 | /* | 1644 | /* |
@@ -1583,16 +1656,16 @@ int do_notify_parent(struct task_struct *tsk, int sig) | |||
1583 | * is implementation-defined: we do (if you don't want | 1656 | * is implementation-defined: we do (if you don't want |
1584 | * it, just use SIG_IGN instead). | 1657 | * it, just use SIG_IGN instead). |
1585 | */ | 1658 | */ |
1586 | ret = tsk->exit_signal = -1; | 1659 | autoreap = true; |
1587 | if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) | 1660 | if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) |
1588 | sig = -1; | 1661 | sig = 0; |
1589 | } | 1662 | } |
1590 | if (valid_signal(sig) && sig > 0) | 1663 | if (valid_signal(sig) && sig) |
1591 | __group_send_sig_info(sig, &info, tsk->parent); | 1664 | __group_send_sig_info(sig, &info, tsk->parent); |
1592 | __wake_up_parent(tsk, tsk->parent); | 1665 | __wake_up_parent(tsk, tsk->parent); |
1593 | spin_unlock_irqrestore(&psig->siglock, flags); | 1666 | spin_unlock_irqrestore(&psig->siglock, flags); |
1594 | 1667 | ||
1595 | return ret; | 1668 | return autoreap; |
1596 | } | 1669 | } |
1597 | 1670 | ||
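With do_notify_parent() returning a bool instead of an overloaded signal number, a caller can branch on the result directly. A hypothetical caller sketch (notify_and_maybe_reap() is illustrative, not the actual exit-path code):

/* Hypothetical caller: decide whether the exiting task self-reaps. */
static void notify_and_maybe_reap(struct task_struct *tsk, int sig)
{
    bool autoreap = do_notify_parent(tsk, sig);

    if (autoreap) {
        /* Parent ignores SIGCHLD or uses SA_NOCLDWAIT: no zombie needed. */
        tsk->exit_state = EXIT_DEAD;
    } else {
        tsk->exit_state = EXIT_ZOMBIE;
    }
}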
1598 | /** | 1671 | /** |
@@ -1665,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
1665 | 1738 | ||
1666 | static inline int may_ptrace_stop(void) | 1739 | static inline int may_ptrace_stop(void) |
1667 | { | 1740 | { |
1668 | if (!likely(task_ptrace(current))) | 1741 | if (!likely(current->ptrace)) |
1669 | return 0; | 1742 | return 0; |
1670 | /* | 1743 | /* |
1671 | * Are we in the middle of do_coredump? | 1744 | * Are we in the middle of do_coredump? |
@@ -1694,15 +1767,6 @@ static int sigkill_pending(struct task_struct *tsk) | |||
1694 | } | 1767 | } |
1695 | 1768 | ||
1696 | /* | 1769 | /* |
1697 | * Test whether the target task of the usual cldstop notification - the | ||
1698 | * real_parent of @child - is in the same group as the ptracer. | ||
1699 | */ | ||
1700 | static bool real_parent_is_ptracer(struct task_struct *child) | ||
1701 | { | ||
1702 | return same_thread_group(child->parent, child->real_parent); | ||
1703 | } | ||
1704 | |||
1705 | /* | ||
1706 | * This must be called with current->sighand->siglock held. | 1770 | * This must be called with current->sighand->siglock held. |
1707 | * | 1771 | * |
1708 | * This should be the path for all ptrace stops. | 1772 | * This should be the path for all ptrace stops. |
@@ -1739,31 +1803,34 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1739 | } | 1803 | } |
1740 | 1804 | ||
1741 | /* | 1805 | /* |
1742 | * If @why is CLD_STOPPED, we're trapping to participate in a group | 1806 | * We're committing to trapping. TRACED should be visible before |
1743 | * stop. Do the bookkeeping. Note that if SIGCONT was delievered | 1807 | * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). |
1744 | * while siglock was released for the arch hook, PENDING could be | 1808 | * Also, transition to TRACED and updates to ->jobctl should be |
1745 | * clear now. We act as if SIGCONT is received after TASK_TRACED | 1809 | * atomic with respect to siglock and should be done after the arch |
1746 | * is entered - ignore it. | 1810 | * hook as siglock is released and regrabbed across it. |
1747 | */ | 1811 | */ |
1748 | if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) | 1812 | set_current_state(TASK_TRACED); |
1749 | gstop_done = task_participate_group_stop(current); | ||
1750 | 1813 | ||
1751 | current->last_siginfo = info; | 1814 | current->last_siginfo = info; |
1752 | current->exit_code = exit_code; | 1815 | current->exit_code = exit_code; |
1753 | 1816 | ||
1754 | /* | 1817 | /* |
1755 | * TRACED should be visible before TRAPPING is cleared; otherwise, | 1818 | * If @why is CLD_STOPPED, we're trapping to participate in a group |
1756 | * the tracer might fail do_wait(). | 1819 | * stop. Do the bookkeeping. Note that if SIGCONT was delievered |
1820 | * across siglock relocks since INTERRUPT was scheduled, PENDING | ||
1821 | * could be clear now. We act as if SIGCONT is received after | ||
1822 | * TASK_TRACED is entered - ignore it. | ||
1757 | */ | 1823 | */ |
1758 | set_current_state(TASK_TRACED); | 1824 | if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) |
1825 | gstop_done = task_participate_group_stop(current); | ||
1759 | 1826 | ||
1760 | /* | 1827 | /* any trap clears pending STOP trap, STOP trap clears NOTIFY */ |
1761 | * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and | 1828 | task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP); |
1762 | * transition to TASK_TRACED should be atomic with respect to | 1829 | if (info && info->si_code >> 8 == PTRACE_EVENT_STOP) |
1763 | * siglock. This hsould be done after the arch hook as siglock is | 1830 | task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY); |
1764 | * released and regrabbed across it. | 1831 | |
1765 | */ | 1832 | /* entering a trap, clear TRAPPING */ |
1766 | task_clear_group_stop_trapping(current); | 1833 | task_clear_jobctl_trapping(current); |
1767 | 1834 | ||
1768 | spin_unlock_irq(¤t->sighand->siglock); | 1835 | spin_unlock_irq(¤t->sighand->siglock); |
1769 | read_lock(&tasklist_lock); | 1836 | read_lock(&tasklist_lock); |
@@ -1779,7 +1846,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1779 | * separately unless they're gonna be duplicates. | 1846 | * separately unless they're gonna be duplicates. |
1780 | */ | 1847 | */ |
1781 | do_notify_parent_cldstop(current, true, why); | 1848 | do_notify_parent_cldstop(current, true, why); |
1782 | if (gstop_done && !real_parent_is_ptracer(current)) | 1849 | if (gstop_done && ptrace_reparented(current)) |
1783 | do_notify_parent_cldstop(current, false, why); | 1850 | do_notify_parent_cldstop(current, false, why); |
1784 | 1851 | ||
1785 | /* | 1852 | /* |
@@ -1799,9 +1866,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1799 | * | 1866 | * |
1800 | * If @gstop_done, the ptracer went away between group stop | 1867 | * If @gstop_done, the ptracer went away between group stop |
1801 | * completion and here. During detach, it would have set | 1868 | * completion and here. During detach, it would have set |
1802 | * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED | 1869 | * JOBCTL_STOP_PENDING on us and we'll re-enter |
1803 | * in do_signal_stop() on return, so notifying the real | 1870 | * TASK_STOPPED in do_signal_stop() on return, so notifying |
1804 | * parent of the group stop completion is enough. | 1871 | * the real parent of the group stop completion is enough. |
1805 | */ | 1872 | */ |
1806 | if (gstop_done) | 1873 | if (gstop_done) |
1807 | do_notify_parent_cldstop(current, false, why); | 1874 | do_notify_parent_cldstop(current, false, why); |
@@ -1827,6 +1894,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1827 | spin_lock_irq(¤t->sighand->siglock); | 1894 | spin_lock_irq(¤t->sighand->siglock); |
1828 | current->last_siginfo = NULL; | 1895 | current->last_siginfo = NULL; |
1829 | 1896 | ||
1897 | /* LISTENING can be set only during STOP traps, clear it */ | ||
1898 | current->jobctl &= ~JOBCTL_LISTENING; | ||
1899 | |||
1830 | /* | 1900 | /* |
1831 | * Queued signals ignored us while we were stopped for tracing. | 1901 | * Queued signals ignored us while we were stopped for tracing. |
1832 | * So check for any that we should take before resuming user mode. | 1902 | * So check for any that we should take before resuming user mode. |
@@ -1835,44 +1905,66 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) | |||
1835 | recalc_sigpending_tsk(current); | 1905 | recalc_sigpending_tsk(current); |
1836 | } | 1906 | } |
1837 | 1907 | ||
1838 | void ptrace_notify(int exit_code) | 1908 | static void ptrace_do_notify(int signr, int exit_code, int why) |
1839 | { | 1909 | { |
1840 | siginfo_t info; | 1910 | siginfo_t info; |
1841 | 1911 | ||
1842 | BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); | ||
1843 | |||
1844 | memset(&info, 0, sizeof info); | 1912 | memset(&info, 0, sizeof info); |
1845 | info.si_signo = SIGTRAP; | 1913 | info.si_signo = signr; |
1846 | info.si_code = exit_code; | 1914 | info.si_code = exit_code; |
1847 | info.si_pid = task_pid_vnr(current); | 1915 | info.si_pid = task_pid_vnr(current); |
1848 | info.si_uid = current_uid(); | 1916 | info.si_uid = current_uid(); |
1849 | 1917 | ||
1850 | /* Let the debugger run. */ | 1918 | /* Let the debugger run. */ |
1919 | ptrace_stop(exit_code, why, 1, &info); | ||
1920 | } | ||
1921 | |||
1922 | void ptrace_notify(int exit_code) | ||
1923 | { | ||
1924 | BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); | ||
1925 | |||
1851 | spin_lock_irq(¤t->sighand->siglock); | 1926 | spin_lock_irq(¤t->sighand->siglock); |
1852 | ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); | 1927 | ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); |
1853 | spin_unlock_irq(¤t->sighand->siglock); | 1928 | spin_unlock_irq(¤t->sighand->siglock); |
1854 | } | 1929 | } |
1855 | 1930 | ||
1856 | /* | 1931 | /** |
1857 | * This performs the stopping for SIGSTOP and other stop signals. | 1932 | * do_signal_stop - handle group stop for SIGSTOP and other stop signals |
1858 | * We have to stop all threads in the thread group. | 1933 | * @signr: signr causing group stop if initiating |
1859 | * Returns non-zero if we've actually stopped and released the siglock. | 1934 | * |
1860 | * Returns zero if we didn't stop and still hold the siglock. | 1935 | * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr |
1936 | * and participate in it. If already set, participate in the existing | ||
1937 | * group stop. If participated in a group stop (and thus slept), %true is | ||
1938 | * returned with siglock released. | ||
1939 | * | ||
1940 | * If ptraced, this function doesn't handle stop itself. Instead, | ||
1941 | * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock | ||
1942 | * untouched. The caller must ensure that INTERRUPT trap handling takes | ||
1943 | place afterwards. | ||
1944 | * | ||
1945 | * CONTEXT: | ||
1946 | * Must be called with @current->sighand->siglock held, which is released | ||
1947 | * on %true return. | ||
1948 | * | ||
1949 | * RETURNS: | ||
1950 | * %false if group stop is already cancelled or ptrace trap is scheduled. | ||
1951 | * %true if participated in group stop. | ||
1861 | */ | 1952 | */ |
1862 | static int do_signal_stop(int signr) | 1953 | static bool do_signal_stop(int signr) |
1954 | __releases(¤t->sighand->siglock) | ||
1863 | { | 1955 | { |
1864 | struct signal_struct *sig = current->signal; | 1956 | struct signal_struct *sig = current->signal; |
1865 | 1957 | ||
1866 | if (!(current->group_stop & GROUP_STOP_PENDING)) { | 1958 | if (!(current->jobctl & JOBCTL_STOP_PENDING)) { |
1867 | unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; | 1959 | unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; |
1868 | struct task_struct *t; | 1960 | struct task_struct *t; |
1869 | 1961 | ||
1870 | /* signr will be recorded in task->group_stop for retries */ | 1962 | /* signr will be recorded in task->jobctl for retries */ |
1871 | WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); | 1963 | WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); |
1872 | 1964 | ||
1873 | if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || | 1965 | if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || |
1874 | unlikely(signal_group_exit(sig))) | 1966 | unlikely(signal_group_exit(sig))) |
1875 | return 0; | 1967 | return false; |
1876 | /* | 1968 | /* |
1877 | * There is no group stop already in progress. We must | 1969 | * There is no group stop already in progress. We must |
1878 | * initiate one now. | 1970 | * initiate one now. |
@@ -1894,29 +1986,31 @@ static int do_signal_stop(int signr) | |||
1894 | */ | 1986 | */ |
1895 | if (!(sig->flags & SIGNAL_STOP_STOPPED)) | 1987 | if (!(sig->flags & SIGNAL_STOP_STOPPED)) |
1896 | sig->group_exit_code = signr; | 1988 | sig->group_exit_code = signr; |
1897 | else | ||
1898 | WARN_ON_ONCE(!task_ptrace(current)); | ||
1899 | 1989 | ||
1900 | current->group_stop &= ~GROUP_STOP_SIGMASK; | 1990 | sig->group_stop_count = 0; |
1901 | current->group_stop |= signr | gstop; | 1991 | |
1902 | sig->group_stop_count = 1; | 1992 | if (task_set_jobctl_pending(current, signr | gstop)) |
1993 | sig->group_stop_count++; | ||
1994 | |||
1903 | for (t = next_thread(current); t != current; | 1995 | for (t = next_thread(current); t != current; |
1904 | t = next_thread(t)) { | 1996 | t = next_thread(t)) { |
1905 | t->group_stop &= ~GROUP_STOP_SIGMASK; | ||
1906 | /* | 1997 | /* |
1907 | * Setting state to TASK_STOPPED for a group | 1998 | * Setting state to TASK_STOPPED for a group |
1908 | * stop is always done with the siglock held, | 1999 | * stop is always done with the siglock held, |
1909 | * so this check has no races. | 2000 | * so this check has no races. |
1910 | */ | 2001 | */ |
1911 | if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { | 2002 | if (!task_is_stopped(t) && |
1912 | t->group_stop |= signr | gstop; | 2003 | task_set_jobctl_pending(t, signr | gstop)) { |
1913 | sig->group_stop_count++; | 2004 | sig->group_stop_count++; |
1914 | signal_wake_up(t, 0); | 2005 | if (likely(!(t->ptrace & PT_SEIZED))) |
2006 | signal_wake_up(t, 0); | ||
2007 | else | ||
2008 | ptrace_trap_notify(t); | ||
1915 | } | 2009 | } |
1916 | } | 2010 | } |
1917 | } | 2011 | } |
1918 | retry: | 2012 | |
1919 | if (likely(!task_ptrace(current))) { | 2013 | if (likely(!current->ptrace)) { |
1920 | int notify = 0; | 2014 | int notify = 0; |
1921 | 2015 | ||
1922 | /* | 2016 | /* |
@@ -1947,43 +2041,65 @@ retry: | |||
1947 | 2041 | ||
1948 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ | 2042 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ |
1949 | schedule(); | 2043 | schedule(); |
1950 | 2044 | return true; | |
1951 | spin_lock_irq(¤t->sighand->siglock); | ||
1952 | } else { | 2045 | } else { |
1953 | ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, | 2046 | /* |
1954 | CLD_STOPPED, 0, NULL); | 2047 | * While ptraced, group stop is handled by STOP trap. |
1955 | current->exit_code = 0; | 2048 | * Schedule it and let the caller deal with it. |
2049 | */ | ||
2050 | task_set_jobctl_pending(current, JOBCTL_TRAP_STOP); | ||
2051 | return false; | ||
1956 | } | 2052 | } |
2053 | } | ||
1957 | 2054 | ||
1958 | /* | 2055 | /** |
1959 | * GROUP_STOP_PENDING could be set if another group stop has | 2056 | * do_jobctl_trap - take care of ptrace jobctl traps |
1960 | * started since being woken up or ptrace wants us to transit | 2057 | * |
1961 | * between TASK_STOPPED and TRACED. Retry group stop. | 2058 | * When PT_SEIZED, it's used for both group stop and explicit |
1962 | */ | 2059 | * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with |
1963 | if (current->group_stop & GROUP_STOP_PENDING) { | 2060 | * accompanying siginfo. If stopped, lower eight bits of exit_code contain |
1964 | WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); | 2061 | * the stop signal; otherwise, %SIGTRAP. |
1965 | goto retry; | 2062 | * |
2063 | * When !PT_SEIZED, it's used only for group stop trap with stop signal | ||
2064 | * number as exit_code and no siginfo. | ||
2065 | * | ||
2066 | * CONTEXT: | ||
2067 | * Must be called with @current->sighand->siglock held, which may be | ||
2068 | * released and re-acquired before returning with intervening sleep. | ||
2069 | */ | ||
2070 | static void do_jobctl_trap(void) | ||
2071 | { | ||
2072 | struct signal_struct *signal = current->signal; | ||
2073 | int signr = current->jobctl & JOBCTL_STOP_SIGMASK; | ||
2074 | |||
2075 | if (current->ptrace & PT_SEIZED) { | ||
2076 | if (!signal->group_stop_count && | ||
2077 | !(signal->flags & SIGNAL_STOP_STOPPED)) | ||
2078 | signr = SIGTRAP; | ||
2079 | WARN_ON_ONCE(!signr); | ||
2080 | ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), | ||
2081 | CLD_STOPPED); | ||
2082 | } else { | ||
2083 | WARN_ON_ONCE(!signr); | ||
2084 | ptrace_stop(signr, CLD_STOPPED, 0, NULL); | ||
2085 | current->exit_code = 0; | ||
1966 | } | 2086 | } |
1967 | |||
1968 | /* PTRACE_ATTACH might have raced with task killing, clear trapping */ | ||
1969 | task_clear_group_stop_trapping(current); | ||
1970 | |||
1971 | spin_unlock_irq(¤t->sighand->siglock); | ||
1972 | |||
1973 | tracehook_finish_jctl(); | ||
1974 | |||
1975 | return 1; | ||
1976 | } | 2087 | } |
1977 | 2088 | ||
1978 | static int ptrace_signal(int signr, siginfo_t *info, | 2089 | static int ptrace_signal(int signr, siginfo_t *info, |
1979 | struct pt_regs *regs, void *cookie) | 2090 | struct pt_regs *regs, void *cookie) |
1980 | { | 2091 | { |
1981 | if (!task_ptrace(current)) | ||
1982 | return signr; | ||
1983 | |||
1984 | ptrace_signal_deliver(regs, cookie); | 2092 | ptrace_signal_deliver(regs, cookie); |
1985 | 2093 | /* | |
1986 | /* Let the debugger run. */ | 2094 | * We do not check sig_kernel_stop(signr) but set this marker |
2095 | * unconditionally because we do not know whether debugger will | ||
2096 | * change signr. This flag has no meaning unless we are going | ||
2097 | * to stop after return from ptrace_stop(). In this case it will | ||
2098 | * be checked in do_signal_stop(), we should only stop if it was | ||
2099 | * not cleared by SIGCONT while we were sleeping. See also the | ||
2100 | * comment in dequeue_signal(). | ||
2101 | */ | ||
2102 | current->jobctl |= JOBCTL_STOP_DEQUEUED; | ||
1987 | ptrace_stop(signr, CLD_TRAPPED, 0, info); | 2103 | ptrace_stop(signr, CLD_TRAPPED, 0, info); |
1988 | 2104 | ||
1989 | /* We're back. Did the debugger cancel the sig? */ | 2105 | /* We're back. Did the debugger cancel the sig? */ |
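The split introduced here, group stop taken as TASK_STOPPED when untraced versus a STOP trap when ptraced, can be summarised in a small runnable model; handle_stop_request() below is a simplification for illustration, not a drop-in for do_signal_stop()/do_jobctl_trap():

#include <stdbool.h>
#include <stdio.h>

/* Simplified model: how a stop request is handled depending on ptrace
 * state (plain booleans stand in for the jobctl/ptrace bits). */
static void handle_stop_request(bool ptraced, bool seized, bool group_stop_in_effect)
{
    if (!ptraced) {
        puts("participate in group stop: TASK_STOPPED, notify real parent");
        return;
    }

    /* Ptraced: group stop is taken as a trap instead of TASK_STOPPED. */
    if (seized && !group_stop_in_effect)
        puts("PT_SEIZED trap: report SIGTRAP | (PTRACE_EVENT_STOP << 8)");
    else if (seized)
        puts("PT_SEIZED trap: report stop signal | (PTRACE_EVENT_STOP << 8)");
    else
        puts("legacy trap: ptrace_stop(stop signal, CLD_STOPPED)");
}

int main(void)
{
    handle_stop_request(false, false, true);
    handle_stop_request(true, true, false);
    handle_stop_request(true, false, true);
    return 0;
}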
@@ -2039,7 +2155,6 @@ relock: | |||
2039 | * the CLD_ si_code into SIGNAL_CLD_MASK bits. | 2155 | * the CLD_ si_code into SIGNAL_CLD_MASK bits. |
2040 | */ | 2156 | */ |
2041 | if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { | 2157 | if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { |
2042 | struct task_struct *leader; | ||
2043 | int why; | 2158 | int why; |
2044 | 2159 | ||
2045 | if (signal->flags & SIGNAL_CLD_CONTINUED) | 2160 | if (signal->flags & SIGNAL_CLD_CONTINUED) |
@@ -2060,13 +2175,11 @@ relock: | |||
2060 | * a duplicate. | 2175 | * a duplicate. |
2061 | */ | 2176 | */ |
2062 | read_lock(&tasklist_lock); | 2177 | read_lock(&tasklist_lock); |
2063 | |||
2064 | do_notify_parent_cldstop(current, false, why); | 2178 | do_notify_parent_cldstop(current, false, why); |
2065 | 2179 | ||
2066 | leader = current->group_leader; | 2180 | if (ptrace_reparented(current->group_leader)) |
2067 | if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) | 2181 | do_notify_parent_cldstop(current->group_leader, |
2068 | do_notify_parent_cldstop(leader, true, why); | 2182 | true, why); |
2069 | |||
2070 | read_unlock(&tasklist_lock); | 2183 | read_unlock(&tasklist_lock); |
2071 | 2184 | ||
2072 | goto relock; | 2185 | goto relock; |
@@ -2074,37 +2187,31 @@ relock: | |||
2074 | 2187 | ||
2075 | for (;;) { | 2188 | for (;;) { |
2076 | struct k_sigaction *ka; | 2189 | struct k_sigaction *ka; |
2077 | /* | 2190 | |
2078 | * Tracing can induce an artificial signal and choose sigaction. | 2191 | if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && |
2079 | * The return value in @signr determines the default action, | 2192 | do_signal_stop(0)) |
2080 | * but @info->si_signo is the signal number we will report. | ||
2081 | */ | ||
2082 | signr = tracehook_get_signal(current, regs, info, return_ka); | ||
2083 | if (unlikely(signr < 0)) | ||
2084 | goto relock; | 2193 | goto relock; |
2085 | if (unlikely(signr != 0)) | ||
2086 | ka = return_ka; | ||
2087 | else { | ||
2088 | if (unlikely(current->group_stop & | ||
2089 | GROUP_STOP_PENDING) && do_signal_stop(0)) | ||
2090 | goto relock; | ||
2091 | 2194 | ||
2092 | signr = dequeue_signal(current, ¤t->blocked, | 2195 | if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { |
2093 | info); | 2196 | do_jobctl_trap(); |
2197 | spin_unlock_irq(&sighand->siglock); | ||
2198 | goto relock; | ||
2199 | } | ||
2094 | 2200 | ||
2095 | if (!signr) | 2201 | signr = dequeue_signal(current, ¤t->blocked, info); |
2096 | break; /* will return 0 */ | ||
2097 | 2202 | ||
2098 | if (signr != SIGKILL) { | 2203 | if (!signr) |
2099 | signr = ptrace_signal(signr, info, | 2204 | break; /* will return 0 */ |
2100 | regs, cookie); | ||
2101 | if (!signr) | ||
2102 | continue; | ||
2103 | } | ||
2104 | 2205 | ||
2105 | ka = &sighand->action[signr-1]; | 2206 | if (unlikely(current->ptrace) && signr != SIGKILL) { |
2207 | signr = ptrace_signal(signr, info, | ||
2208 | regs, cookie); | ||
2209 | if (!signr) | ||
2210 | continue; | ||
2106 | } | 2211 | } |
2107 | 2212 | ||
2213 | ka = &sighand->action[signr-1]; | ||
2214 | |||
2108 | /* Trace actually delivered signals. */ | 2215 | /* Trace actually delivered signals. */ |
2109 | trace_signal_deliver(signr, info, ka); | 2216 | trace_signal_deliver(signr, info, ka); |
2110 | 2217 | ||
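The rewritten loop checks job-control state explicitly before dequeueing now that tracehook_get_signal() is gone. A condensed, runnable model of that ordering (next_action() and its arguments are local to this sketch):

#include <stdbool.h>
#include <stdio.h>

/* Model of the per-iteration ordering: stop bookkeeping first, then
 * ptrace traps, then the ordinary dequeue-and-deliver path. */
static const char *next_action(bool stop_pending, bool trap_pending,
                               bool signal_queued, bool ptraced)
{
    if (stop_pending)
        return "do_signal_stop()";
    if (trap_pending)
        return "do_jobctl_trap()";
    if (!signal_queued)
        return "return 0 to the caller";
    return ptraced ? "ptrace_signal(), then deliver" : "deliver signal";
}

int main(void)
{
    printf("%s\n", next_action(true,  false, true,  false));
    printf("%s\n", next_action(false, true,  true,  true));
    printf("%s\n", next_action(false, false, true,  true));
    printf("%s\n", next_action(false, false, false, false));
    return 0;
}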
@@ -2260,7 +2367,7 @@ void exit_signals(struct task_struct *tsk) | |||
2260 | signotset(&unblocked); | 2367 | signotset(&unblocked); |
2261 | retarget_shared_pending(tsk, &unblocked); | 2368 | retarget_shared_pending(tsk, &unblocked); |
2262 | 2369 | ||
2263 | if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && | 2370 | if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) && |
2264 | task_participate_group_stop(tsk)) | 2371 | task_participate_group_stop(tsk)) |
2265 | group_stop = CLD_STOPPED; | 2372 | group_stop = CLD_STOPPED; |
2266 | out: | 2373 | out: |
@@ -2993,15 +3100,11 @@ SYSCALL_DEFINE0(sgetmask) | |||
2993 | 3100 | ||
2994 | SYSCALL_DEFINE1(ssetmask, int, newmask) | 3101 | SYSCALL_DEFINE1(ssetmask, int, newmask) |
2995 | { | 3102 | { |
2996 | int old; | 3103 | int old = current->blocked.sig[0]; |
2997 | 3104 | sigset_t newset; | |
2998 | spin_lock_irq(¤t->sighand->siglock); | ||
2999 | old = current->blocked.sig[0]; | ||
3000 | 3105 | ||
3001 | siginitset(¤t->blocked, newmask & ~(sigmask(SIGKILL)| | 3106 | siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP))); |
3002 | sigmask(SIGSTOP))); | 3107 | set_current_blocked(&newset); |
3003 | recalc_sigpending(); | ||
3004 | spin_unlock_irq(¤t->sighand->siglock); | ||
3005 | 3108 | ||
3006 | return old; | 3109 | return old; |
3007 | } | 3110 | } |
@@ -3058,11 +3161,8 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) | |||
3058 | return -EFAULT; | 3161 | return -EFAULT; |
3059 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 3162 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); |
3060 | 3163 | ||
3061 | spin_lock_irq(¤t->sighand->siglock); | ||
3062 | current->saved_sigmask = current->blocked; | 3164 | current->saved_sigmask = current->blocked; |
3063 | current->blocked = newset; | 3165 | set_current_blocked(&newset); |
3064 | recalc_sigpending(); | ||
3065 | spin_unlock_irq(¤t->sighand->siglock); | ||
3066 | 3166 | ||
3067 | current->state = TASK_INTERRUPTIBLE; | 3167 | current->state = TASK_INTERRUPTIBLE; |
3068 | schedule(); | 3168 | schedule(); |
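The ssetmask and rt_sigsuspend conversions route all blocked-mask updates through set_current_blocked(). The same semantics are visible from userspace through sigprocmask()/sigsuspend(); a small self-contained example using only standard POSIX calls:

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_usr1;

static void on_usr1(int sig) { (void)sig; got_usr1 = 1; }

int main(void)
{
    struct sigaction sa;
    sigset_t block, old;

    sa.sa_handler = on_usr1;
    sa.sa_flags = 0;
    sigemptyset(&sa.sa_mask);
    sigaction(SIGUSR1, &sa, NULL);

    /* Block SIGUSR1 while "working"; SIGKILL/SIGSTOP can never be blocked. */
    sigemptyset(&block);
    sigaddset(&block, SIGUSR1);
    sigprocmask(SIG_BLOCK, &block, &old);

    printf("send SIGUSR1 to pid %d\n", (int)getpid());

    /* sigsuspend() atomically installs the old mask and sleeps until a
     * signal is delivered, then puts the blocked mask back on return. */
    while (!got_usr1)
        sigsuspend(&old);

    sigprocmask(SIG_SETMASK, &old, NULL);   /* restore the original mask */
    puts("got SIGUSR1");
    return 0;
}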
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index eb212f8f8bc..d20c6983aad 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c | |||
@@ -26,12 +26,18 @@ void print_stack_trace(struct stack_trace *trace, int spaces) | |||
26 | EXPORT_SYMBOL_GPL(print_stack_trace); | 26 | EXPORT_SYMBOL_GPL(print_stack_trace); |
27 | 27 | ||
28 | /* | 28 | /* |
29 | * Architectures that do not implement save_stack_trace_tsk get this | 29 | * Architectures that do not implement save_stack_trace_tsk or |
30 | * weak alias and a once-per-bootup warning (whenever this facility | 30 | * save_stack_trace_regs get this weak alias and a once-per-bootup warning |
31 | * is utilized - for example by procfs): | 31 | * (whenever this facility is utilized - for example by procfs): |
32 | */ | 32 | */ |
33 | __weak void | 33 | __weak void |
34 | save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) | 34 | save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) |
35 | { | 35 | { |
36 | WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); | 36 | WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); |
37 | } | 37 | } |
38 | |||
39 | __weak void | ||
40 | save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) | ||
41 | { | ||
42 | WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); | ||
43 | } | ||
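The new save_stack_trace_regs() stub uses the same weak-symbol fallback as save_stack_trace_tsk(): an architecture that provides a strong definition overrides the weak one at link time. A minimal userspace illustration of that mechanism (save_trace() is a made-up name):

#include <stdio.h>

/* Weak default: used only if no strong definition exists elsewhere. */
__attribute__((weak)) void save_trace(void)
{
    puts("save_trace() not implemented yet");
}

/* Define this strong version in another object file (or uncomment it here)
 * and it silently replaces the weak default at link time:
 *
 * void save_trace(void) { puts("arch-specific save_trace()"); }
 */

int main(void)
{
    save_trace();
    return 0;
}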
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e3516b29076..ba5070ce576 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -19,7 +19,7 @@ | |||
19 | #include <linux/interrupt.h> | 19 | #include <linux/interrupt.h> |
20 | #include <linux/kallsyms.h> | 20 | #include <linux/kallsyms.h> |
21 | 21 | ||
22 | #include <asm/atomic.h> | 22 | #include <linux/atomic.h> |
23 | 23 | ||
24 | /* | 24 | /* |
25 | * Structure to determine completion condition and record errors. May | 25 | * Structure to determine completion condition and record errors. May |
@@ -136,10 +136,11 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | |||
136 | static DEFINE_MUTEX(stop_cpus_mutex); | 136 | static DEFINE_MUTEX(stop_cpus_mutex); |
137 | static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); | 137 | static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); |
138 | 138 | ||
139 | int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) | 139 | static void queue_stop_cpus_work(const struct cpumask *cpumask, |
140 | cpu_stop_fn_t fn, void *arg, | ||
141 | struct cpu_stop_done *done) | ||
140 | { | 142 | { |
141 | struct cpu_stop_work *work; | 143 | struct cpu_stop_work *work; |
142 | struct cpu_stop_done done; | ||
143 | unsigned int cpu; | 144 | unsigned int cpu; |
144 | 145 | ||
145 | /* initialize works and done */ | 146 | /* initialize works and done */ |
@@ -147,9 +148,8 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) | |||
147 | work = &per_cpu(stop_cpus_work, cpu); | 148 | work = &per_cpu(stop_cpus_work, cpu); |
148 | work->fn = fn; | 149 | work->fn = fn; |
149 | work->arg = arg; | 150 | work->arg = arg; |
150 | work->done = &done; | 151 | work->done = done; |
151 | } | 152 | } |
152 | cpu_stop_init_done(&done, cpumask_weight(cpumask)); | ||
153 | 153 | ||
154 | /* | 154 | /* |
155 | * Disable preemption while queueing to avoid getting | 155 | * Disable preemption while queueing to avoid getting |
@@ -161,7 +161,15 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) | |||
161 | cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), | 161 | cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), |
162 | &per_cpu(stop_cpus_work, cpu)); | 162 | &per_cpu(stop_cpus_work, cpu)); |
163 | preempt_enable(); | 163 | preempt_enable(); |
164 | } | ||
164 | 165 | ||
166 | static int __stop_cpus(const struct cpumask *cpumask, | ||
167 | cpu_stop_fn_t fn, void *arg) | ||
168 | { | ||
169 | struct cpu_stop_done done; | ||
170 | |||
171 | cpu_stop_init_done(&done, cpumask_weight(cpumask)); | ||
172 | queue_stop_cpus_work(cpumask, fn, arg, &done); | ||
165 | wait_for_completion(&done.completion); | 173 | wait_for_completion(&done.completion); |
166 | return done.executed ? done.ret : -ENOENT; | 174 | return done.executed ? done.ret : -ENOENT; |
167 | } | 175 | } |
@@ -431,8 +439,15 @@ static int stop_machine_cpu_stop(void *data) | |||
431 | struct stop_machine_data *smdata = data; | 439 | struct stop_machine_data *smdata = data; |
432 | enum stopmachine_state curstate = STOPMACHINE_NONE; | 440 | enum stopmachine_state curstate = STOPMACHINE_NONE; |
433 | int cpu = smp_processor_id(), err = 0; | 441 | int cpu = smp_processor_id(), err = 0; |
442 | unsigned long flags; | ||
434 | bool is_active; | 443 | bool is_active; |
435 | 444 | ||
445 | /* | ||
446 | * When called from stop_machine_from_inactive_cpu(), irq might | ||
447 | * already be disabled. Save the state and restore it on exit. | ||
448 | */ | ||
449 | local_save_flags(flags); | ||
450 | |||
436 | if (!smdata->active_cpus) | 451 | if (!smdata->active_cpus) |
437 | is_active = cpu == cpumask_first(cpu_online_mask); | 452 | is_active = cpu == cpumask_first(cpu_online_mask); |
438 | else | 453 | else |
@@ -460,7 +475,7 @@ static int stop_machine_cpu_stop(void *data) | |||
460 | } | 475 | } |
461 | } while (curstate != STOPMACHINE_EXIT); | 476 | } while (curstate != STOPMACHINE_EXIT); |
462 | 477 | ||
463 | local_irq_enable(); | 478 | local_irq_restore(flags); |
464 | return err; | 479 | return err; |
465 | } | 480 | } |
466 | 481 | ||
@@ -487,4 +502,57 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
487 | } | 502 | } |
488 | EXPORT_SYMBOL_GPL(stop_machine); | 503 | EXPORT_SYMBOL_GPL(stop_machine); |
489 | 504 | ||
505 | /** | ||
506 | * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU | ||
507 | * @fn: the function to run | ||
508 | * @data: the data ptr for the @fn() | ||
509 | * @cpus: the cpus to run the @fn() on (NULL = any online cpu) | ||
510 | * | ||
511 | * This is identical to stop_machine() but can be called from a CPU which | ||
512 | * is not active. The local CPU is in the process of hotplug (so no other | ||
513 | * CPU hotplug can start) and not marked active and doesn't have enough | ||
514 | * context to sleep. | ||
515 | * | ||
516 | * This function provides stop_machine() functionality for such state by | ||
517 | * using busy-wait for synchronization and executing @fn directly for local | ||
518 | * CPU. | ||
519 | * | ||
520 | * CONTEXT: | ||
521 | * Local CPU is inactive. Temporarily stops all active CPUs. | ||
522 | * | ||
523 | * RETURNS: | ||
524 | * 0 if all executions of @fn returned 0, any non zero return value if any | ||
525 | * returned non zero. | ||
526 | */ | ||
527 | int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, | ||
528 | const struct cpumask *cpus) | ||
529 | { | ||
530 | struct stop_machine_data smdata = { .fn = fn, .data = data, | ||
531 | .active_cpus = cpus }; | ||
532 | struct cpu_stop_done done; | ||
533 | int ret; | ||
534 | |||
535 | /* Local CPU must be inactive and CPU hotplug in progress. */ | ||
536 | BUG_ON(cpu_active(raw_smp_processor_id())); | ||
537 | smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ | ||
538 | |||
539 | /* No proper task established and can't sleep - busy wait for lock. */ | ||
540 | while (!mutex_trylock(&stop_cpus_mutex)) | ||
541 | cpu_relax(); | ||
542 | |||
543 | /* Schedule work on other CPUs and execute directly for local CPU */ | ||
544 | set_state(&smdata, STOPMACHINE_PREPARE); | ||
545 | cpu_stop_init_done(&done, num_active_cpus()); | ||
546 | queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, | ||
547 | &done); | ||
548 | ret = stop_machine_cpu_stop(&smdata); | ||
549 | |||
550 | /* Busy wait for completion. */ | ||
551 | while (!completion_done(&done.completion)) | ||
552 | cpu_relax(); | ||
553 | |||
554 | mutex_unlock(&stop_cpus_mutex); | ||
555 | return ret ?: done.ret; | ||
556 | } | ||
557 | |||
490 | #endif /* CONFIG_STOP_MACHINE */ | 558 | #endif /* CONFIG_STOP_MACHINE */ |
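Because stop_machine_from_inactive_cpu() cannot sleep, it swaps mutex_lock() and wait_for_completion() for trylock and completion_done() polling. A rough userspace analogue of that busy-wait pattern, using C11 atomics and pthreads (all names below are local to the sketch):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool done;

static void *worker(void *arg)
{
    (void)arg;
    /* ... do the queued work ... */
    atomic_store(&done, true);        /* analogue of complete() */
    return NULL;
}

int main(void)
{
    pthread_t tid;

    /* Can't sleep: spin until the lock becomes available. */
    while (pthread_mutex_trylock(&lock) != 0)
        ;                             /* analogue of the cpu_relax() loop */

    pthread_create(&tid, NULL, worker, NULL);

    /* Busy-wait for completion instead of blocking. */
    while (!atomic_load(&done))
        ;                             /* analogue of polling completion_done() */

    pthread_mutex_unlock(&lock);
    pthread_join(tid, NULL);
    puts("all work finished");
    return 0;
}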
diff --git a/kernel/sys.c b/kernel/sys.c index e4128b278f2..1dbbe695a5e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -8,7 +8,6 @@ | |||
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/utsname.h> | 9 | #include <linux/utsname.h> |
10 | #include <linux/mman.h> | 10 | #include <linux/mman.h> |
11 | #include <linux/notifier.h> | ||
12 | #include <linux/reboot.h> | 11 | #include <linux/reboot.h> |
13 | #include <linux/prctl.h> | 12 | #include <linux/prctl.h> |
14 | #include <linux/highuid.h> | 13 | #include <linux/highuid.h> |
@@ -38,6 +37,8 @@ | |||
38 | #include <linux/fs_struct.h> | 37 | #include <linux/fs_struct.h> |
39 | #include <linux/gfp.h> | 38 | #include <linux/gfp.h> |
40 | #include <linux/syscore_ops.h> | 39 | #include <linux/syscore_ops.h> |
40 | #include <linux/version.h> | ||
41 | #include <linux/ctype.h> | ||
41 | 42 | ||
42 | #include <linux/compat.h> | 43 | #include <linux/compat.h> |
43 | #include <linux/syscalls.h> | 44 | #include <linux/syscalls.h> |
@@ -45,6 +46,8 @@ | |||
45 | #include <linux/user_namespace.h> | 46 | #include <linux/user_namespace.h> |
46 | 47 | ||
47 | #include <linux/kmsg_dump.h> | 48 | #include <linux/kmsg_dump.h> |
49 | /* Move somewhere else to avoid recompiling? */ | ||
50 | #include <generated/utsrelease.h> | ||
48 | 51 | ||
49 | #include <asm/uaccess.h> | 52 | #include <asm/uaccess.h> |
50 | #include <asm/io.h> | 53 | #include <asm/io.h> |
@@ -320,6 +323,37 @@ void kernel_restart_prepare(char *cmd) | |||
320 | } | 323 | } |
321 | 324 | ||
322 | /** | 325 | /** |
326 | * register_reboot_notifier - Register function to be called at reboot time | ||
327 | * @nb: Info about notifier function to be called | ||
328 | * | ||
329 | * Registers a function with the list of functions | ||
330 | * to be called at reboot time. | ||
331 | * | ||
332 | * Currently always returns zero, as blocking_notifier_chain_register() | ||
333 | * always returns zero. | ||
334 | */ | ||
335 | int register_reboot_notifier(struct notifier_block *nb) | ||
336 | { | ||
337 | return blocking_notifier_chain_register(&reboot_notifier_list, nb); | ||
338 | } | ||
339 | EXPORT_SYMBOL(register_reboot_notifier); | ||
340 | |||
341 | /** | ||
342 | * unregister_reboot_notifier - Unregister previously registered reboot notifier | ||
343 | * @nb: Hook to be unregistered | ||
344 | * | ||
345 | * Unregisters a previously registered reboot | ||
346 | * notifier function. | ||
347 | * | ||
348 | * Returns zero on success, or %-ENOENT on failure. | ||
349 | */ | ||
350 | int unregister_reboot_notifier(struct notifier_block *nb) | ||
351 | { | ||
352 | return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); | ||
353 | } | ||
354 | EXPORT_SYMBOL(unregister_reboot_notifier); | ||
355 | |||
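Moving register_reboot_notifier()/unregister_reboot_notifier() here does not change how they are used. A typical hookup looks roughly like the module sketch below (the example_* names are hypothetical and the module is untested):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int example_reboot_handler(struct notifier_block *nb,
                                  unsigned long action, void *data)
{
    pr_info("example: reboot/shutdown in progress (action=%lu)\n", action);
    return NOTIFY_DONE;
}

static struct notifier_block example_reboot_nb = {
    .notifier_call = example_reboot_handler,
};

static int __init example_init(void)
{
    return register_reboot_notifier(&example_reboot_nb);
}

static void __exit example_exit(void)
{
    unregister_reboot_notifier(&example_reboot_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");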
356 | /** | ||
323 | * kernel_restart - reboot the system | 357 | * kernel_restart - reboot the system |
324 | * @cmd: pointer to buffer containing command to execute for restart | 358 | * @cmd: pointer to buffer containing command to execute for restart |
325 | * or %NULL | 359 | * or %NULL |
@@ -591,11 +625,18 @@ static int set_user(struct cred *new) | |||
591 | if (!new_user) | 625 | if (!new_user) |
592 | return -EAGAIN; | 626 | return -EAGAIN; |
593 | 627 | ||
628 | /* | ||
629 | * We don't fail in case of NPROC limit excess here because too many | ||
630 | * poorly written programs don't check set*uid() return code, assuming | ||
631 | * it never fails if called by root. We may still enforce NPROC limit | ||
632 | * for programs doing set*uid()+execve() by harmlessly deferring the | ||
633 | * failure to the execve() stage. | ||
634 | */ | ||
594 | if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && | 635 | if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && |
595 | new_user != INIT_USER) { | 636 | new_user != INIT_USER) |
596 | free_uid(new_user); | 637 | current->flags |= PF_NPROC_EXCEEDED; |
597 | return -EAGAIN; | 638 | else |
598 | } | 639 | current->flags &= ~PF_NPROC_EXCEEDED; |
599 | 640 | ||
600 | free_uid(new->user); | 641 | free_uid(new->user); |
601 | new->user = new_user; | 642 | new->user = new_user; |
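The PF_NPROC_EXCEEDED deferral exists because many programs call set*uid() without checking the result. On the userspace side the fix is simply to check it; a minimal example in plain POSIX C (the uid value is arbitrary):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    uid_t target = 1000;   /* example unprivileged uid */

    /* Never assume set*uid() succeeds, even when running as root: it can
     * fail (e.g. RLIMIT_NPROC), and carrying on with the old privileges
     * is exactly the bug the kernel comment above describes. */
    if (setuid(target) != 0) {
        perror("setuid");
        exit(EXIT_FAILURE);
    }

    printf("now running as uid %d\n", (int)getuid());
    return 0;
}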
@@ -1124,6 +1165,34 @@ DECLARE_RWSEM(uts_sem); | |||
1124 | #define override_architecture(name) 0 | 1165 | #define override_architecture(name) 0 |
1125 | #endif | 1166 | #endif |
1126 | 1167 | ||
1168 | /* | ||
1169 | * Work around broken programs that cannot handle "Linux 3.0". | ||
1170 | * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 | ||
1171 | */ | ||
1172 | static int override_release(char __user *release, int len) | ||
1173 | { | ||
1174 | int ret = 0; | ||
1175 | char buf[65]; | ||
1176 | |||
1177 | if (current->personality & UNAME26) { | ||
1178 | char *rest = UTS_RELEASE; | ||
1179 | int ndots = 0; | ||
1180 | unsigned v; | ||
1181 | |||
1182 | while (*rest) { | ||
1183 | if (*rest == '.' && ++ndots >= 3) | ||
1184 | break; | ||
1185 | if (!isdigit(*rest) && *rest != '.') | ||
1186 | break; | ||
1187 | rest++; | ||
1188 | } | ||
1189 | v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; | ||
1190 | snprintf(buf, len, "2.6.%u%s", v, rest); | ||
1191 | ret = copy_to_user(release, buf, len); | ||
1192 | } | ||
1193 | return ret; | ||
1194 | } | ||
1195 | |||
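override_release() makes a 3.x kernel report itself as 2.6.(40+x) for UNAME26 personalities. A userspace sketch of the same string transformation; unlike the kernel, it parses the minor number from the string rather than from LINUX_VERSION_CODE:

#include <ctype.h>
#include <stdio.h>

/* Sketch of the UNAME26 mapping: "3.x..." becomes "2.6.(40+x)...". */
static void map_release(const char *release, char *buf, size_t len)
{
    const char *rest = release;
    unsigned minor = 0;
    int ndots = 0;

    sscanf(release, "%*u.%u", &minor);      /* e.g. 0 for "3.0.4-rc1" */

    /* Skip the leading digits-and-dots version prefix, as the kernel does. */
    while (*rest) {
        if (*rest == '.' && ++ndots >= 3)
            break;
        if (!isdigit((unsigned char)*rest) && *rest != '.')
            break;
        rest++;
    }
    snprintf(buf, len, "2.6.%u%s", minor + 40, rest);
}

int main(void)
{
    char buf[65];

    map_release("3.0.4-rc1", buf, sizeof(buf));
    printf("%s\n", buf);    /* prints "2.6.40-rc1" */
    return 0;
}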
1127 | SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) | 1196 | SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) |
1128 | { | 1197 | { |
1129 | int errno = 0; | 1198 | int errno = 0; |
@@ -1133,6 +1202,8 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) | |||
1133 | errno = -EFAULT; | 1202 | errno = -EFAULT; |
1134 | up_read(&uts_sem); | 1203 | up_read(&uts_sem); |
1135 | 1204 | ||
1205 | if (!errno && override_release(name->release, sizeof(name->release))) | ||
1206 | errno = -EFAULT; | ||
1136 | if (!errno && override_architecture(name)) | 1207 | if (!errno && override_architecture(name)) |
1137 | errno = -EFAULT; | 1208 | errno = -EFAULT; |
1138 | return errno; | 1209 | return errno; |
@@ -1154,6 +1225,8 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) | |||
1154 | error = -EFAULT; | 1225 | error = -EFAULT; |
1155 | up_read(&uts_sem); | 1226 | up_read(&uts_sem); |
1156 | 1227 | ||
1228 | if (!error && override_release(name->release, sizeof(name->release))) | ||
1229 | error = -EFAULT; | ||
1157 | if (!error && override_architecture(name)) | 1230 | if (!error && override_architecture(name)) |
1158 | error = -EFAULT; | 1231 | error = -EFAULT; |
1159 | return error; | 1232 | return error; |
@@ -1188,6 +1261,8 @@ SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) | |||
1188 | 1261 | ||
1189 | if (!error && override_architecture(name)) | 1262 | if (!error && override_architecture(name)) |
1190 | error = -EFAULT; | 1263 | error = -EFAULT; |
1264 | if (!error && override_release(name->release, sizeof(name->release))) | ||
1265 | error = -EFAULT; | ||
1191 | return error ? -EFAULT : 0; | 1266 | return error ? -EFAULT : 0; |
1192 | } | 1267 | } |
1193 | #endif | 1268 | #endif |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 62cbc8877fe..a9a5de07c4f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -16,7 +16,6 @@ asmlinkage long sys_ni_syscall(void) | |||
16 | return -ENOSYS; | 16 | return -ENOSYS; |
17 | } | 17 | } |
18 | 18 | ||
19 | cond_syscall(sys_nfsservctl); | ||
20 | cond_syscall(sys_quotactl); | 19 | cond_syscall(sys_quotactl); |
21 | cond_syscall(sys32_quotactl); | 20 | cond_syscall(sys32_quotactl); |
22 | cond_syscall(sys_acct); | 21 | cond_syscall(sys_acct); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f175d98bd35..fd15163f360 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -96,6 +96,7 @@ extern char core_pattern[]; | |||
96 | extern unsigned int core_pipe_limit; | 96 | extern unsigned int core_pipe_limit; |
97 | extern int pid_max; | 97 | extern int pid_max; |
98 | extern int min_free_kbytes; | 98 | extern int min_free_kbytes; |
99 | extern int min_free_order_shift; | ||
99 | extern int pid_max_min, pid_max_max; | 100 | extern int pid_max_min, pid_max_max; |
100 | extern int sysctl_drop_caches; | 101 | extern int sysctl_drop_caches; |
101 | extern int percpu_pagelist_fraction; | 102 | extern int percpu_pagelist_fraction; |
@@ -1189,6 +1190,13 @@ static struct ctl_table vm_table[] = { | |||
1189 | .extra1 = &zero, | 1190 | .extra1 = &zero, |
1190 | }, | 1191 | }, |
1191 | { | 1192 | { |
1193 | .procname = "min_free_order_shift", | ||
1194 | .data = &min_free_order_shift, | ||
1195 | .maxlen = sizeof(min_free_order_shift), | ||
1196 | .mode = 0644, | ||
1197 | .proc_handler = &proc_dointvec | ||
1198 | }, | ||
1199 | { | ||
1192 | .procname = "percpu_pagelist_fraction", | 1200 | .procname = "percpu_pagelist_fraction", |
1193 | .data = &percpu_pagelist_fraction, | 1201 | .data = &percpu_pagelist_fraction, |
1194 | .maxlen = sizeof(percpu_pagelist_fraction), | 1202 | .maxlen = sizeof(percpu_pagelist_fraction), |
@@ -1590,16 +1598,11 @@ void sysctl_head_get(struct ctl_table_header *head) | |||
1590 | spin_unlock(&sysctl_lock); | 1598 | spin_unlock(&sysctl_lock); |
1591 | } | 1599 | } |
1592 | 1600 | ||
1593 | static void free_head(struct rcu_head *rcu) | ||
1594 | { | ||
1595 | kfree(container_of(rcu, struct ctl_table_header, rcu)); | ||
1596 | } | ||
1597 | |||
1598 | void sysctl_head_put(struct ctl_table_header *head) | 1601 | void sysctl_head_put(struct ctl_table_header *head) |
1599 | { | 1602 | { |
1600 | spin_lock(&sysctl_lock); | 1603 | spin_lock(&sysctl_lock); |
1601 | if (!--head->count) | 1604 | if (!--head->count) |
1602 | call_rcu(&head->rcu, free_head); | 1605 | kfree_rcu(head, rcu); |
1603 | spin_unlock(&sysctl_lock); | 1606 | spin_unlock(&sysctl_lock); |
1604 | } | 1607 | } |
1605 | 1608 | ||
@@ -1971,10 +1974,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) | |||
1971 | start_unregistering(header); | 1974 | start_unregistering(header); |
1972 | if (!--header->parent->count) { | 1975 | if (!--header->parent->count) { |
1973 | WARN_ON(1); | 1976 | WARN_ON(1); |
1974 | call_rcu(&header->parent->rcu, free_head); | 1977 | kfree_rcu(header->parent, rcu); |
1975 | } | 1978 | } |
1976 | if (!--header->count) | 1979 | if (!--header->count) |
1977 | call_rcu(&header->rcu, free_head); | 1980 | kfree_rcu(header, rcu); |
1978 | spin_unlock(&sysctl_lock); | 1981 | spin_unlock(&sysctl_lock); |
1979 | } | 1982 | } |
1980 | 1983 | ||
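The call_rcu() plus free_head() pair collapses into kfree_rcu(), which only requires the struct to embed a struct rcu_head and to be told that member's name. A schematic of the pattern with a made-up struct (not code from this file):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_entry {
    int value;
    struct rcu_head rcu;    /* required: kfree_rcu() stores its state here */
};

static void drop_entry(struct example_entry *e)
{
    /* Frees e after a grace period; replaces call_rcu() plus a callback
     * whose only job was to kfree() the containing struct. */
    kfree_rcu(e, rcu);
}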
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 3b8e028b960..2ce1b308672 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -1,6 +1,6 @@ | |||
1 | #include <linux/stat.h> | 1 | #include <linux/stat.h> |
2 | #include <linux/sysctl.h> | 2 | #include <linux/sysctl.h> |
3 | #include "../fs/xfs/linux-2.6/xfs_sysctl.h" | 3 | #include "../fs/xfs/xfs_sysctl.h" |
4 | #include <linux/sunrpc/debug.h> | 4 | #include <linux/sunrpc/debug.h> |
5 | #include <linux/string.h> | 5 | #include <linux/string.h> |
6 | #include <net/ip_vs.h> | 6 | #include <net/ip_vs.h> |
@@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1354 | 1354 | ||
1355 | fput(file); | 1355 | fput(file); |
1356 | out_putname: | 1356 | out_putname: |
1357 | putname(pathname); | 1357 | __putname(pathname); |
1358 | out: | 1358 | out: |
1359 | return result; | 1359 | return result; |
1360 | } | 1360 | } |
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 4e4932a7b36..362da653813 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c | |||
@@ -1,6 +1,6 @@ | |||
1 | #include <linux/stat.h> | 1 | #include <linux/stat.h> |
2 | #include <linux/sysctl.h> | 2 | #include <linux/sysctl.h> |
3 | #include "../fs/xfs/linux-2.6/xfs_sysctl.h" | 3 | #include "../fs/xfs/xfs_sysctl.h" |
4 | #include <linux/sunrpc/debug.h> | 4 | #include <linux/sunrpc/debug.h> |
5 | #include <linux/string.h> | 5 | #include <linux/string.h> |
6 | #include <net/ip_vs.h> | 6 | #include <net/ip_vs.h> |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index fc0f2200541..e66046456f4 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -28,7 +28,7 @@ | |||
28 | #include <linux/fs.h> | 28 | #include <linux/fs.h> |
29 | #include <linux/file.h> | 29 | #include <linux/file.h> |
30 | #include <net/genetlink.h> | 30 | #include <net/genetlink.h> |
31 | #include <asm/atomic.h> | 31 | #include <linux/atomic.h> |
32 | 32 | ||
33 | /* | 33 | /* |
34 | * Maximum length of a cpumask that can be specified in | 34 | * Maximum length of a cpumask that can be specified in |
@@ -291,30 +291,28 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) | |||
291 | if (!cpumask_subset(mask, cpu_possible_mask)) | 291 | if (!cpumask_subset(mask, cpu_possible_mask)) |
292 | return -EINVAL; | 292 | return -EINVAL; |
293 | 293 | ||
294 | s = NULL; | ||
295 | if (isadd == REGISTER) { | 294 | if (isadd == REGISTER) { |
296 | for_each_cpu(cpu, mask) { | 295 | for_each_cpu(cpu, mask) { |
297 | if (!s) | 296 | s = kmalloc_node(sizeof(struct listener), |
298 | s = kmalloc_node(sizeof(struct listener), | 297 | GFP_KERNEL, cpu_to_node(cpu)); |
299 | GFP_KERNEL, cpu_to_node(cpu)); | ||
300 | if (!s) | 298 | if (!s) |
301 | goto cleanup; | 299 | goto cleanup; |
300 | |||
302 | s->pid = pid; | 301 | s->pid = pid; |
303 | INIT_LIST_HEAD(&s->list); | ||
304 | s->valid = 1; | 302 | s->valid = 1; |
305 | 303 | ||
306 | listeners = &per_cpu(listener_array, cpu); | 304 | listeners = &per_cpu(listener_array, cpu); |
307 | down_write(&listeners->sem); | 305 | down_write(&listeners->sem); |
308 | list_for_each_entry_safe(s2, tmp, &listeners->list, list) { | 306 | list_for_each_entry(s2, &listeners->list, list) { |
309 | if (s2->pid == pid) | 307 | if (s2->pid == pid && s2->valid) |
310 | goto next_cpu; | 308 | goto exists; |
311 | } | 309 | } |
312 | list_add(&s->list, &listeners->list); | 310 | list_add(&s->list, &listeners->list); |
313 | s = NULL; | 311 | s = NULL; |
314 | next_cpu: | 312 | exists: |
315 | up_write(&listeners->sem); | 313 | up_write(&listeners->sem); |
314 | kfree(s); /* nop if NULL */ | ||
316 | } | 315 | } |
317 | kfree(s); | ||
318 | return 0; | 316 | return 0; |
319 | } | 317 | } |
320 | 318 | ||
@@ -657,6 +655,7 @@ static struct genl_ops taskstats_ops = { | |||
657 | .cmd = TASKSTATS_CMD_GET, | 655 | .cmd = TASKSTATS_CMD_GET, |
658 | .doit = taskstats_user_cmd, | 656 | .doit = taskstats_user_cmd, |
659 | .policy = taskstats_cmd_get_policy, | 657 | .policy = taskstats_cmd_get_policy, |
658 | .flags = GENL_ADMIN_PERM, | ||
660 | }; | 659 | }; |
661 | 660 | ||
662 | static struct genl_ops cgroupstats_ops = { | 661 | static struct genl_ops cgroupstats_ops = { |
diff --git a/kernel/time.c b/kernel/time.c index 8e8dc6d705c..d7760621452 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -575,7 +575,7 @@ EXPORT_SYMBOL(jiffies_to_timeval); | |||
575 | /* | 575 | /* |
576 | * Convert jiffies/jiffies_64 to clock_t and back. | 576 | * Convert jiffies/jiffies_64 to clock_t and back. |
577 | */ | 577 | */ |
578 | clock_t jiffies_to_clock_t(long x) | 578 | clock_t jiffies_to_clock_t(unsigned long x) |
579 | { | 579 | { |
580 | #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 | 580 | #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 |
581 | # if HZ < USER_HZ | 581 | # if HZ < USER_HZ |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index e2fd74b8e8c..cae2ad7491b 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o |
2 | obj-y += timeconv.o posix-clock.o alarmtimer.o | 2 | obj-y += timeconv.o posix-clock.o #alarmtimer.o |
3 | 3 | ||
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 59f369f98a0..8b70c76910a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -181,7 +181,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | |||
181 | struct alarm *alarm; | 181 | struct alarm *alarm; |
182 | ktime_t expired = next->expires; | 182 | ktime_t expired = next->expires; |
183 | 183 | ||
184 | if (expired.tv64 >= now.tv64) | 184 | if (expired.tv64 > now.tv64) |
185 | break; | 185 | break; |
186 | 186 | ||
187 | alarm = container_of(next, struct alarm, node); | 187 | alarm = container_of(next, struct alarm, node); |
@@ -441,6 +441,8 @@ static int alarm_timer_create(struct k_itimer *new_timer) | |||
441 | static void alarm_timer_get(struct k_itimer *timr, | 441 | static void alarm_timer_get(struct k_itimer *timr, |
442 | struct itimerspec *cur_setting) | 442 | struct itimerspec *cur_setting) |
443 | { | 443 | { |
444 | memset(cur_setting, 0, sizeof(struct itimerspec)); | ||
445 | |||
444 | cur_setting->it_interval = | 446 | cur_setting->it_interval = |
445 | ktime_to_timespec(timr->it.alarmtimer.period); | 447 | ktime_to_timespec(timr->it.alarmtimer.period); |
446 | cur_setting->it_value = | 448 | cur_setting->it_value = |
@@ -479,11 +481,17 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, | |||
479 | if (!rtcdev) | 481 | if (!rtcdev) |
480 | return -ENOTSUPP; | 482 | return -ENOTSUPP; |
481 | 483 | ||
482 | /* Save old values */ | 484 | /* |
483 | old_setting->it_interval = | 485 | * XXX HACK! Currently we can DOS a system if the interval |
484 | ktime_to_timespec(timr->it.alarmtimer.period); | 486 | * period on alarmtimers is too small. Cap the interval here |
485 | old_setting->it_value = | 487 | * to 100us and solve this properly in a future patch! -jstultz |
486 | ktime_to_timespec(timr->it.alarmtimer.node.expires); | 488 | */ |
489 | if ((new_setting->it_interval.tv_sec == 0) && | ||
490 | (new_setting->it_interval.tv_nsec < 100000)) | ||
491 | new_setting->it_interval.tv_nsec = 100000; | ||
492 | |||
493 | if (old_setting) | ||
494 | alarm_timer_get(timr, old_setting); | ||
487 | 495 | ||
488 | /* If the timer was already set, cancel it */ | 496 | /* If the timer was already set, cancel it */ |
489 | alarm_cancel(&timr->it.alarmtimer); | 497 | alarm_cancel(&timr->it.alarmtimer); |
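The interval cap added above is plain timespec arithmetic. A standalone sketch of the same clamp (the 100us floor is the hard-coded value from the hack above, not a tunable):

#include <stdio.h>
#include <time.h>

/* Raise sub-100us periodic intervals to 100us, as in the hunk above. */
static void clamp_alarm_interval(struct timespec *it_interval)
{
        if (it_interval->tv_sec == 0 && it_interval->tv_nsec < 100000)
                it_interval->tv_nsec = 100000;
}

int main(void)
{
        struct timespec iv = { .tv_sec = 0, .tv_nsec = 5000 };  /* 5us */

        clamp_alarm_interval(&iv);
        printf("interval after clamp: %ld ns\n", (long)iv.tv_nsec);
        return 0;
}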
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0980f0d9a0..8f77da18fef 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -494,6 +494,22 @@ void clocksource_touch_watchdog(void) | |||
494 | } | 494 | } |
495 | 495 | ||
496 | /** | 496 | /** |
497 | * clocksource_max_adjustment - Returns max adjustment amount | ||
498 | * @cs: Pointer to clocksource | ||
499 | * | ||
500 | */ | ||
501 | static u32 clocksource_max_adjustment(struct clocksource *cs) | ||
502 | { | ||
503 | u64 ret; | ||
504 | /* | ||
505 | * We won't try to correct for more than 11% adjustments (110,000 ppm). | ||
506 | */ | ||
507 | ret = (u64)cs->mult * 11; | ||
508 | do_div(ret, 100); | ||
509 | return (u32)ret; | ||
510 | } | ||
511 | |||
512 | /** | ||
497 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 513 | * clocksource_max_deferment - Returns max time the clocksource can be deferred |
498 | * @cs: Pointer to clocksource | 514 | * @cs: Pointer to clocksource |
499 | * | 515 | * |
@@ -505,25 +521,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
505 | /* | 521 | /* |
506 | * Calculate the maximum number of cycles that we can pass to the | 522 | * Calculate the maximum number of cycles that we can pass to the |
507 | * cyc2ns function without overflowing a 64-bit signed result. The | 523 | * cyc2ns function without overflowing a 64-bit signed result. The |
508 | * maximum number of cycles is equal to ULLONG_MAX/cs->mult which | 524 | * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) |
509 | * is equivalent to the below. | 525 | * which is equivalent to the below. |
510 | * max_cycles < (2^63)/cs->mult | 526 | * max_cycles < (2^63)/(cs->mult + cs->maxadj) |
511 | * max_cycles < 2^(log2((2^63)/cs->mult)) | 527 | * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) |
512 | * max_cycles < 2^(log2(2^63) - log2(cs->mult)) | 528 | * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) |
513 | * max_cycles < 2^(63 - log2(cs->mult)) | 529 | * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) |
514 | * max_cycles < 1 << (63 - log2(cs->mult)) | 530 | * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) |
515 | * Please note that we add 1 to the result of the log2 to account for | 531 | * Please note that we add 1 to the result of the log2 to account for |
516 | * any rounding errors, ensure the above inequality is satisfied and | 532 | * any rounding errors, ensure the above inequality is satisfied and |
517 | * no overflow will occur. | 533 | * no overflow will occur. |
518 | */ | 534 | */ |
519 | max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); | 535 | max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); |
520 | 536 | ||
521 | /* | 537 | /* |
522 | * The actual maximum number of cycles we can defer the clocksource is | 538 | * The actual maximum number of cycles we can defer the clocksource is |
523 | * determined by the minimum of max_cycles and cs->mask. | 539 | * determined by the minimum of max_cycles and cs->mask. |
540 | * Note: Here we subtract the maxadj to make sure we don't sleep for | ||
541 | * too long if there's a large negative adjustment. | ||
524 | */ | 542 | */ |
525 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); | 543 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); |
526 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); | 544 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, |
545 | cs->shift); | ||
527 | 546 | ||
528 | /* | 547 | /* |
529 | * To ensure that the clocksource does not wrap whilst we are idle, | 548 | * To ensure that the clocksource does not wrap whilst we are idle, |
@@ -531,7 +550,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
531 | * note a margin of 12.5% is used because this can be computed with | 550 | * note a margin of 12.5% is used because this can be computed with |
532 | * a shift, versus say 10% which would require division. | 551 | * a shift, versus say 10% which would require division. |
533 | */ | 552 | */ |
534 | return max_nsecs - (max_nsecs >> 5); | 553 | return max_nsecs - (max_nsecs >> 3); |
535 | } | 554 | } |
536 | 555 | ||
537 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET | 556 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET |
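The maxadj and max_idle_ns calculations above are easier to follow with numbers. The sketch below redoes the arithmetic in userspace for an invented mult/shift/mask; real clocksources register their own values, so the output is illustrative only:

#include <stdio.h>
#include <stdint.h>

static int ilog2_u32(uint32_t v)                /* floor(log2(v)) */
{
        int l = -1;

        while (v) {
                v >>= 1;
                l++;
        }
        return l;
}

int main(void)
{
        /* Hypothetical clocksource; mult, shift and mask are invented. */
        uint32_t mult  = 4194304;
        uint32_t shift = 22;
        uint64_t mask  = 0xffffffffffffffffULL;

        /* 11% headroom for NTP, as in clocksource_max_adjustment() */
        uint32_t maxadj = (uint64_t)mult * 11 / 100;

        /* max_cycles < 2^(63 - log2(mult + maxadj)), minus one bit of slack */
        uint64_t max_cycles = 1ULL << (63 - (ilog2_u32(mult + maxadj) + 1));

        if (max_cycles > mask)
                max_cycles = mask;

        /* cyc2ns with the worst-case (smallest) multiplier */
        uint64_t max_nsecs = (max_cycles * (mult - maxadj)) >> shift;

        /* keep the 12.5% safety margin (>> 3) used by the new code */
        printf("maxadj=%u max_idle_ns=%llu\n", (unsigned)maxadj,
               (unsigned long long)(max_nsecs - (max_nsecs >> 3)));
        return 0;
}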
@@ -642,7 +661,6 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
642 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 661 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
643 | { | 662 | { |
644 | u64 sec; | 663 | u64 sec; |
645 | |||
646 | /* | 664 | /* |
647 | * Calc the maximum number of seconds which we can run before | 665 | * Calc the maximum number of seconds which we can run before |
648 | * wrapping around. For clocksources which have a mask > 32bit | 666 | * wrapping around. For clocksources which have a mask > 32bit |
@@ -653,7 +671,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
653 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | 671 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% |
654 | * margin as we do in clocksource_max_deferment() | 672 | * margin as we do in clocksource_max_deferment() |
655 | */ | 673 | */ |
656 | sec = (cs->mask - (cs->mask >> 5)); | 674 | sec = (cs->mask - (cs->mask >> 3)); |
657 | do_div(sec, freq); | 675 | do_div(sec, freq); |
658 | do_div(sec, scale); | 676 | do_div(sec, scale); |
659 | if (!sec) | 677 | if (!sec) |
@@ -663,6 +681,20 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
663 | 681 | ||
664 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 682 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, |
665 | NSEC_PER_SEC / scale, sec * scale); | 683 | NSEC_PER_SEC / scale, sec * scale); |
684 | |||
685 | /* | ||
686 | * For clocksources with large mults, reduce mult to avoid overflow. | ||
687 | * Since mult may be adjusted by ntp, add an extra safety margin. | ||
688 | * | ||
689 | */ | ||
690 | cs->maxadj = clocksource_max_adjustment(cs); | ||
691 | while ((cs->mult + cs->maxadj < cs->mult) | ||
692 | || (cs->mult - cs->maxadj > cs->mult)) { | ||
693 | cs->mult >>= 1; | ||
694 | cs->shift--; | ||
695 | cs->maxadj = clocksource_max_adjustment(cs); | ||
696 | } | ||
697 | |||
666 | cs->max_idle_ns = clocksource_max_deferment(cs); | 698 | cs->max_idle_ns = clocksource_max_deferment(cs); |
667 | } | 699 | } |
668 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 700 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); |
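The while loop above relies on u32 wrap-around: if mult + maxadj overflows, mult and shift are halved together until an 11% adjustment fits. A standalone model of that loop, with a deliberately oversized starting mult (the values are invented to force one iteration):

#include <stdio.h>
#include <stdint.h>

static uint32_t max_adjustment(uint32_t mult)
{
        return (uint64_t)mult * 11 / 100;       /* 11%, as in the patch */
}

int main(void)
{
        /* Deliberately huge mult so that mult + maxadj wraps a u32. */
        uint32_t mult = 4000000000U, shift = 31;
        uint32_t maxadj = max_adjustment(mult);

        while ((uint32_t)(mult + maxadj) < mult ||
               (uint32_t)(mult - maxadj) > mult) {
                mult >>= 1;
                shift--;
                maxadj = max_adjustment(mult);
        }
        printf("reduced to mult=%u shift=%u maxadj=%u\n",
               (unsigned)mult, (unsigned)shift, (unsigned)maxadj);
        return 0;
}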
@@ -703,6 +735,12 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale); | |||
703 | */ | 735 | */ |
704 | int clocksource_register(struct clocksource *cs) | 736 | int clocksource_register(struct clocksource *cs) |
705 | { | 737 | { |
738 | /* calculate max adjustment for given mult/shift */ | ||
739 | cs->maxadj = clocksource_max_adjustment(cs); | ||
740 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
741 | "Clocksource %s might overflow on 11%% adjustment\n", | ||
742 | cs->name); | ||
743 | |||
706 | /* calculate max idle time permitted for this clocksource */ | 744 | /* calculate max idle time permitted for this clocksource */ |
707 | cs->max_idle_ns = clocksource_max_deferment(cs); | 745 | cs->max_idle_ns = clocksource_max_deferment(cs); |
708 | 746 | ||
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index c7218d13273..7a90d021b79 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) | |||
71 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) | 71 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) |
72 | return 0; | 72 | return 0; |
73 | 73 | ||
74 | clockevents_exchange_device(NULL, dev); | 74 | clockevents_exchange_device(tick_broadcast_device.evtdev, dev); |
75 | tick_broadcast_device.evtdev = dev; | 75 | tick_broadcast_device.evtdev = dev; |
76 | if (!cpumask_empty(tick_get_broadcast_mask())) | 76 | if (!cpumask_empty(tick_get_broadcast_mask())) |
77 | tick_broadcast_start_periodic(dev); | 77 | tick_broadcast_start_periodic(dev); |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 342408cf68d..6f9798bf240 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -249,6 +249,8 @@ ktime_t ktime_get(void) | |||
249 | secs = xtime.tv_sec + wall_to_monotonic.tv_sec; | 249 | secs = xtime.tv_sec + wall_to_monotonic.tv_sec; |
250 | nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; | 250 | nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; |
251 | nsecs += timekeeping_get_ns(); | 251 | nsecs += timekeeping_get_ns(); |
252 | /* If arch requires, add in gettimeoffset() */ | ||
253 | nsecs += arch_gettimeoffset(); | ||
252 | 254 | ||
253 | } while (read_seqretry(&xtime_lock, seq)); | 255 | } while (read_seqretry(&xtime_lock, seq)); |
254 | /* | 256 | /* |
@@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts) | |||
280 | *ts = xtime; | 282 | *ts = xtime; |
281 | tomono = wall_to_monotonic; | 283 | tomono = wall_to_monotonic; |
282 | nsecs = timekeeping_get_ns(); | 284 | nsecs = timekeeping_get_ns(); |
285 | /* If arch requires, add in gettimeoffset() */ | ||
286 | nsecs += arch_gettimeoffset(); | ||
283 | 287 | ||
284 | } while (read_seqretry(&xtime_lock, seq)); | 288 | } while (read_seqretry(&xtime_lock, seq)); |
285 | 289 | ||
@@ -604,6 +608,12 @@ static struct timespec timekeeping_suspend_time; | |||
604 | */ | 608 | */ |
605 | static void __timekeeping_inject_sleeptime(struct timespec *delta) | 609 | static void __timekeeping_inject_sleeptime(struct timespec *delta) |
606 | { | 610 | { |
611 | if (!timespec_valid(delta)) { | ||
612 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " | ||
613 | "sleep delta value!\n"); | ||
614 | return; | ||
615 | } | ||
616 | |||
607 | xtime = timespec_add(xtime, *delta); | 617 | xtime = timespec_add(xtime, *delta); |
608 | wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); | 618 | wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); |
609 | total_sleep_time = timespec_add(total_sleep_time, *delta); | 619 | total_sleep_time = timespec_add(total_sleep_time, *delta); |
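timespec_valid() rejects a delta whose tv_sec is negative or whose tv_nsec falls outside [0, NSEC_PER_SEC). A userspace replica of that check, written here from the usual definition rather than copied from the header:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

/* Same shape as the kernel helper: non-negative seconds and a
 * nanosecond field inside [0, NSEC_PER_SEC). */
static bool timespec_valid_copy(const struct timespec *ts)
{
        if (ts->tv_sec < 0)
                return false;
        if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
                return false;
        return true;
}

int main(void)
{
        struct timespec bad  = { .tv_sec = -5, .tv_nsec = 0 };
        struct timespec good = { .tv_sec = 3, .tv_nsec = 250000000 };

        printf("bad=%d good=%d\n", timespec_valid_copy(&bad),
               timespec_valid_copy(&good));
        return 0;
}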
@@ -686,12 +696,34 @@ static void timekeeping_resume(void) | |||
686 | static int timekeeping_suspend(void) | 696 | static int timekeeping_suspend(void) |
687 | { | 697 | { |
688 | unsigned long flags; | 698 | unsigned long flags; |
699 | struct timespec delta, delta_delta; | ||
700 | static struct timespec old_delta; | ||
689 | 701 | ||
690 | read_persistent_clock(&timekeeping_suspend_time); | 702 | read_persistent_clock(&timekeeping_suspend_time); |
691 | 703 | ||
692 | write_seqlock_irqsave(&xtime_lock, flags); | 704 | write_seqlock_irqsave(&xtime_lock, flags); |
693 | timekeeping_forward_now(); | 705 | timekeeping_forward_now(); |
694 | timekeeping_suspended = 1; | 706 | timekeeping_suspended = 1; |
707 | |||
708 | /* | ||
709 | * To avoid drift caused by repeated suspend/resumes, | ||
710 | * each of which can add ~1 second of drift error, | ||
711 | * try to compensate so the difference in system time | ||
712 | * and persistent_clock time stays close to constant. | ||
713 | */ | ||
714 | delta = timespec_sub(xtime, timekeeping_suspend_time); | ||
715 | delta_delta = timespec_sub(delta, old_delta); | ||
716 | if (abs(delta_delta.tv_sec) >= 2) { | ||
717 | /* | ||
718 | * if delta_delta is too large, assume time correction | ||
719 | * has occurred and set old_delta to the current delta. | ||
720 | */ | ||
721 | old_delta = delta; | ||
722 | } else { | ||
723 | /* Otherwise adjust timekeeping_suspend_time to compensate */ | ||
724 | timekeeping_suspend_time = | ||
725 | timespec_add(timekeeping_suspend_time, delta_delta); | ||
726 | } | ||
695 | write_sequnlock_irqrestore(&xtime_lock, flags); | 727 | write_sequnlock_irqrestore(&xtime_lock, flags); |
696 | 728 | ||
697 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 729 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
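The delta/delta_delta bookkeeping above is a small state machine: sub-2-second wobble between suspend cycles is folded back into timekeeping_suspend_time, while a jump of 2 seconds or more is treated as a deliberate clock correction. A toy, seconds-only model of that branch logic (offsets invented for illustration; locking and the actual timespec math are omitted):

#include <math.h>
#include <stdio.h>

int main(void)
{
        double old_delta = 0.0;
        /* Invented per-cycle offsets (system time minus persistent clock
         * at suspend): two small wobbles, a 5s jump from a time
         * correction, then a small wobble again. */
        double delta[] = { 100.3, 100.7, 105.6, 105.2 };

        for (int i = 0; i < 4; i++) {
                double delta_delta = delta[i] - old_delta;

                if (fabs(delta_delta) >= 2.0) {
                        /* Looks like a real time correction: track it. */
                        old_delta = delta[i];
                        printf("cycle %d: reset old_delta to %.1f\n",
                               i, delta[i]);
                } else {
                        /* Fold the wobble into the suspend timestamp so it
                         * is not injected as sleep time on every resume. */
                        printf("cycle %d: compensate by %.1f s\n",
                               i, delta_delta);
                }
        }
        return 0;
}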
@@ -792,6 +824,13 @@ static void timekeeping_adjust(s64 offset) | |||
792 | } else | 824 | } else |
793 | return; | 825 | return; |
794 | 826 | ||
827 | WARN_ONCE(timekeeper.clock->maxadj && | ||
828 | (timekeeper.mult + adj > timekeeper.clock->mult + | ||
829 | timekeeper.clock->maxadj), | ||
830 | "Adjusting %s more then 11%% (%ld vs %ld)\n", | ||
831 | timekeeper.clock->name, (long)timekeeper.mult + adj, | ||
832 | (long)timekeeper.clock->mult + | ||
833 | timekeeper.clock->maxadj); | ||
795 | timekeeper.mult += adj; | 834 | timekeeper.mult += adj; |
796 | timekeeper.xtime_interval += interval; | 835 | timekeeper.xtime_interval += interval; |
797 | timekeeper.xtime_nsec -= offset; | 836 | timekeeper.xtime_nsec -= offset; |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2ad39e556cb..93168c0f991 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -82,7 +82,7 @@ config EVENT_POWER_TRACING_DEPRECATED | |||
82 | power:power_frequency | 82 | power:power_frequency |
83 | This is for userspace compatibility | 83 | This is for userspace compatibility |
84 | and will vanish after 5 kernel iterations, | 84 | and will vanish after 5 kernel iterations, |
85 | namely 2.6.41. | 85 | namely 3.1. |
86 | 86 | ||
87 | config CONTEXT_SWITCH_TRACER | 87 | config CONTEXT_SWITCH_TRACER |
88 | bool | 88 | bool |
@@ -487,6 +487,39 @@ config RING_BUFFER_BENCHMARK | |||
487 | 487 | ||
488 | If unsure, say N. | 488 | If unsure, say N. |
489 | 489 | ||
490 | config TRACELEVEL | ||
491 | bool "Add capability to prioritize traces" | ||
492 | depends on EVENT_TRACING | ||
493 | help | ||
494 | This option allows subsystem programmers to add priorities to trace | ||
495 | events by calling tracelevel_register. Traces of high priority | ||
496 | will automatically be enabled on kernel boot, and users can change | ||
497 | the trace level via a kernel parameter. | ||
498 | |||
499 | config TRACEDUMP | ||
500 | bool "Dumping functionality for ftrace" | ||
501 | depends on FUNCTION_TRACER | ||
502 | help | ||
503 | This option adds functionality to dump tracing data in several forms. | ||
504 | Data can be dumped in ascii form or as raw pages from the tracing | ||
505 | ring buffers, along with the saved cmdlines. The format is selected | ||
506 | by the module parameter tracedump_ascii. Data will be compressed | ||
507 | using zlib. | ||
508 | |||
509 | config TRACEDUMP_PANIC | ||
510 | bool "Tracedump to console on panic" | ||
511 | depends on TRACEDUMP | ||
512 | help | ||
513 | With this option, tracedump will automatically dump to the console | ||
514 | on a kernel panic. | ||
515 | |||
516 | config TRACEDUMP_PROCFS | ||
517 | bool "Tracedump via proc file" | ||
518 | depends on TRACEDUMP | ||
519 | help | ||
520 | With this option, tracedump can be dumped from user space by reading | ||
521 | from /proc/tracedump. | ||
522 | |||
490 | endif # FTRACE | 523 | endif # FTRACE |
491 | 524 | ||
492 | endif # TRACING_SUPPORT | 525 | endif # TRACING_SUPPORT |
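With TRACEDUMP_PROCFS enabled, the help text above says the dump is read from /proc/tracedump. A minimal userspace reader, assuming only that path from the help text (whether the stream is ASCII or raw compressed pages depends on the tracedump_ascii module parameter):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        FILE *in = fopen("/proc/tracedump", "rb");
        char buf[4096];
        size_t n;

        if (!in) {
                perror("/proc/tracedump");
                return EXIT_FAILURE;
        }
        /* Copy the dump to stdout; pipe it into zlib tools as needed. */
        while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
                fwrite(buf, 1, n, stdout);
        fclose(in);
        return 0;
}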
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 761c510a06c..1360a1a90d5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -56,5 +56,7 @@ obj-$(CONFIG_TRACEPOINTS) += power-traces.o | |||
56 | ifeq ($(CONFIG_TRACING),y) | 56 | ifeq ($(CONFIG_TRACING),y) |
57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | 57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o |
58 | endif | 58 | endif |
59 | obj-$(CONFIG_TRACELEVEL) += tracelevel.o | ||
60 | obj-$(CONFIG_TRACEDUMP) += tracedump.o | ||
59 | 61 | ||
60 | libftrace-y := ftrace.o | 62 | libftrace-y := ftrace.o |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 6957aa298df..7c910a5593a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -206,6 +206,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |||
206 | what |= MASK_TC_BIT(rw, RAHEAD); | 206 | what |= MASK_TC_BIT(rw, RAHEAD); |
207 | what |= MASK_TC_BIT(rw, META); | 207 | what |= MASK_TC_BIT(rw, META); |
208 | what |= MASK_TC_BIT(rw, DISCARD); | 208 | what |= MASK_TC_BIT(rw, DISCARD); |
209 | what |= MASK_TC_BIT(rw, FLUSH); | ||
210 | what |= MASK_TC_BIT(rw, FUA); | ||
209 | 211 | ||
210 | pid = tsk->pid; | 212 | pid = tsk->pid; |
211 | if (act_log_check(bt, what, sector, pid)) | 213 | if (act_log_check(bt, what, sector, pid)) |
@@ -1054,6 +1056,9 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) | |||
1054 | goto out; | 1056 | goto out; |
1055 | } | 1057 | } |
1056 | 1058 | ||
1059 | if (tc & BLK_TC_FLUSH) | ||
1060 | rwbs[i++] = 'F'; | ||
1061 | |||
1057 | if (tc & BLK_TC_DISCARD) | 1062 | if (tc & BLK_TC_DISCARD) |
1058 | rwbs[i++] = 'D'; | 1063 | rwbs[i++] = 'D'; |
1059 | else if (tc & BLK_TC_WRITE) | 1064 | else if (tc & BLK_TC_WRITE) |
@@ -1063,10 +1068,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) | |||
1063 | else | 1068 | else |
1064 | rwbs[i++] = 'N'; | 1069 | rwbs[i++] = 'N'; |
1065 | 1070 | ||
1071 | if (tc & BLK_TC_FUA) | ||
1072 | rwbs[i++] = 'F'; | ||
1066 | if (tc & BLK_TC_AHEAD) | 1073 | if (tc & BLK_TC_AHEAD) |
1067 | rwbs[i++] = 'A'; | 1074 | rwbs[i++] = 'A'; |
1068 | if (tc & BLK_TC_BARRIER) | ||
1069 | rwbs[i++] = 'B'; | ||
1070 | if (tc & BLK_TC_SYNC) | 1075 | if (tc & BLK_TC_SYNC) |
1071 | rwbs[i++] = 'S'; | 1076 | rwbs[i++] = 'S'; |
1072 | if (tc & BLK_TC_META) | 1077 | if (tc & BLK_TC_META) |
@@ -1132,7 +1137,7 @@ typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); | |||
1132 | 1137 | ||
1133 | static int blk_log_action_classic(struct trace_iterator *iter, const char *act) | 1138 | static int blk_log_action_classic(struct trace_iterator *iter, const char *act) |
1134 | { | 1139 | { |
1135 | char rwbs[6]; | 1140 | char rwbs[RWBS_LEN]; |
1136 | unsigned long long ts = iter->ts; | 1141 | unsigned long long ts = iter->ts; |
1137 | unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); | 1142 | unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); |
1138 | unsigned secs = (unsigned long)ts; | 1143 | unsigned secs = (unsigned long)ts; |
@@ -1148,7 +1153,7 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act) | |||
1148 | 1153 | ||
1149 | static int blk_log_action(struct trace_iterator *iter, const char *act) | 1154 | static int blk_log_action(struct trace_iterator *iter, const char *act) |
1150 | { | 1155 | { |
1151 | char rwbs[6]; | 1156 | char rwbs[RWBS_LEN]; |
1152 | const struct blk_io_trace *t = te_blk_io_trace(iter->ent); | 1157 | const struct blk_io_trace *t = te_blk_io_trace(iter->ent); |
1153 | 1158 | ||
1154 | fill_rwbs(rwbs, t); | 1159 | fill_rwbs(rwbs, t); |
@@ -1561,7 +1566,7 @@ static const struct { | |||
1561 | } mask_maps[] = { | 1566 | } mask_maps[] = { |
1562 | { BLK_TC_READ, "read" }, | 1567 | { BLK_TC_READ, "read" }, |
1563 | { BLK_TC_WRITE, "write" }, | 1568 | { BLK_TC_WRITE, "write" }, |
1564 | { BLK_TC_BARRIER, "barrier" }, | 1569 | { BLK_TC_FLUSH, "flush" }, |
1565 | { BLK_TC_SYNC, "sync" }, | 1570 | { BLK_TC_SYNC, "sync" }, |
1566 | { BLK_TC_QUEUE, "queue" }, | 1571 | { BLK_TC_QUEUE, "queue" }, |
1567 | { BLK_TC_REQUEUE, "requeue" }, | 1572 | { BLK_TC_REQUEUE, "requeue" }, |
@@ -1573,6 +1578,7 @@ static const struct { | |||
1573 | { BLK_TC_META, "meta" }, | 1578 | { BLK_TC_META, "meta" }, |
1574 | { BLK_TC_DISCARD, "discard" }, | 1579 | { BLK_TC_DISCARD, "discard" }, |
1575 | { BLK_TC_DRV_DATA, "drv_data" }, | 1580 | { BLK_TC_DRV_DATA, "drv_data" }, |
1581 | { BLK_TC_FUA, "fua" }, | ||
1576 | }; | 1582 | }; |
1577 | 1583 | ||
1578 | static int blk_trace_str2mask(const char *str) | 1584 | static int blk_trace_str2mask(const char *str) |
@@ -1788,6 +1794,9 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
1788 | { | 1794 | { |
1789 | int i = 0; | 1795 | int i = 0; |
1790 | 1796 | ||
1797 | if (rw & REQ_FLUSH) | ||
1798 | rwbs[i++] = 'F'; | ||
1799 | |||
1791 | if (rw & WRITE) | 1800 | if (rw & WRITE) |
1792 | rwbs[i++] = 'W'; | 1801 | rwbs[i++] = 'W'; |
1793 | else if (rw & REQ_DISCARD) | 1802 | else if (rw & REQ_DISCARD) |
@@ -1797,6 +1806,8 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
1797 | else | 1806 | else |
1798 | rwbs[i++] = 'N'; | 1807 | rwbs[i++] = 'N'; |
1799 | 1808 | ||
1809 | if (rw & REQ_FUA) | ||
1810 | rwbs[i++] = 'F'; | ||
1800 | if (rw & REQ_RAHEAD) | 1811 | if (rw & REQ_RAHEAD) |
1801 | rwbs[i++] = 'A'; | 1812 | rwbs[i++] = 'A'; |
1802 | if (rw & REQ_SYNC) | 1813 | if (rw & REQ_SYNC) |
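With the FLUSH and FUA bits added above, a flush-preceded synchronous FUA write renders as "FWFS". A userspace sketch of the same string building; the RQ_* masks below are invented stand-ins for the kernel's REQ_* flags, and the META/SECURE suffixes are omitted:

#include <stdio.h>

/* Invented flag values -- stand-ins for the kernel's REQ_* masks. */
#define RQ_WRITE   (1 << 0)
#define RQ_DISCARD (1 << 1)
#define RQ_FLUSH   (1 << 2)
#define RQ_FUA     (1 << 3)
#define RQ_SYNC    (1 << 4)
#define RQ_RAHEAD  (1 << 5)

/* Same ordering as blk_fill_rwbs() after the patch:
 * leading F = preceding flush, trailing F = FUA. */
static void fill_rwbs(char *rwbs, unsigned int rw, int bytes)
{
        int i = 0;

        if (rw & RQ_FLUSH)
                rwbs[i++] = 'F';

        if (rw & RQ_WRITE)
                rwbs[i++] = 'W';
        else if (rw & RQ_DISCARD)
                rwbs[i++] = 'D';
        else if (bytes)
                rwbs[i++] = 'R';
        else
                rwbs[i++] = 'N';

        if (rw & RQ_FUA)
                rwbs[i++] = 'F';
        if (rw & RQ_RAHEAD)
                rwbs[i++] = 'A';
        if (rw & RQ_SYNC)
                rwbs[i++] = 'S';
        rwbs[i] = '\0';
}

int main(void)
{
        char rwbs[8];

        fill_rwbs(rwbs, RQ_FLUSH | RQ_WRITE | RQ_FUA | RQ_SYNC, 4096);
        printf("%s\n", rwbs);   /* prints FWFS */
        return 0;
}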
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 908038f5744..798b16cd40f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -32,7 +32,6 @@ | |||
32 | 32 | ||
33 | #include <trace/events/sched.h> | 33 | #include <trace/events/sched.h> |
34 | 34 | ||
35 | #include <asm/ftrace.h> | ||
36 | #include <asm/setup.h> | 35 | #include <asm/setup.h> |
37 | 36 | ||
38 | #include "trace_output.h" | 37 | #include "trace_output.h" |
@@ -82,14 +81,14 @@ static int ftrace_disabled __read_mostly; | |||
82 | 81 | ||
83 | static DEFINE_MUTEX(ftrace_lock); | 82 | static DEFINE_MUTEX(ftrace_lock); |
84 | 83 | ||
85 | static struct ftrace_ops ftrace_list_end __read_mostly = | 84 | static struct ftrace_ops ftrace_list_end __read_mostly = { |
86 | { | ||
87 | .func = ftrace_stub, | 85 | .func = ftrace_stub, |
88 | }; | 86 | }; |
89 | 87 | ||
90 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; | 88 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; |
91 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | 89 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; |
92 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 90 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
91 | static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; | ||
93 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; | 92 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; |
94 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; | 93 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; |
95 | static struct ftrace_ops global_ops; | 94 | static struct ftrace_ops global_ops; |
@@ -148,6 +147,7 @@ void clear_ftrace_function(void) | |||
148 | { | 147 | { |
149 | ftrace_trace_function = ftrace_stub; | 148 | ftrace_trace_function = ftrace_stub; |
150 | __ftrace_trace_function = ftrace_stub; | 149 | __ftrace_trace_function = ftrace_stub; |
150 | __ftrace_trace_function_delay = ftrace_stub; | ||
151 | ftrace_pid_function = ftrace_stub; | 151 | ftrace_pid_function = ftrace_stub; |
152 | } | 152 | } |
153 | 153 | ||
@@ -210,7 +210,12 @@ static void update_ftrace_function(void) | |||
210 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 210 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST |
211 | ftrace_trace_function = func; | 211 | ftrace_trace_function = func; |
212 | #else | 212 | #else |
213 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
214 | /* do not update till all functions have been modified */ | ||
215 | __ftrace_trace_function_delay = func; | ||
216 | #else | ||
213 | __ftrace_trace_function = func; | 217 | __ftrace_trace_function = func; |
218 | #endif | ||
214 | ftrace_trace_function = ftrace_test_stop_func; | 219 | ftrace_trace_function = ftrace_test_stop_func; |
215 | #endif | 220 | #endif |
216 | } | 221 | } |
@@ -785,8 +790,7 @@ static void unregister_ftrace_profiler(void) | |||
785 | unregister_ftrace_graph(); | 790 | unregister_ftrace_graph(); |
786 | } | 791 | } |
787 | #else | 792 | #else |
788 | static struct ftrace_ops ftrace_profile_ops __read_mostly = | 793 | static struct ftrace_ops ftrace_profile_ops __read_mostly = { |
789 | { | ||
790 | .func = function_profile_call, | 794 | .func = function_profile_call, |
791 | }; | 795 | }; |
792 | 796 | ||
@@ -806,19 +810,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, | |||
806 | size_t cnt, loff_t *ppos) | 810 | size_t cnt, loff_t *ppos) |
807 | { | 811 | { |
808 | unsigned long val; | 812 | unsigned long val; |
809 | char buf[64]; /* big enough to hold a number */ | ||
810 | int ret; | 813 | int ret; |
811 | 814 | ||
812 | if (cnt >= sizeof(buf)) | 815 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
813 | return -EINVAL; | 816 | if (ret) |
814 | |||
815 | if (copy_from_user(&buf, ubuf, cnt)) | ||
816 | return -EFAULT; | ||
817 | |||
818 | buf[cnt] = 0; | ||
819 | |||
820 | ret = strict_strtoul(buf, 10, &val); | ||
821 | if (ret < 0) | ||
822 | return ret; | 817 | return ret; |
823 | 818 | ||
824 | val = !!val; | 819 | val = !!val; |
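kstrtoul_from_user() replaces the bounded-copy, NUL-terminate and strict_strtoul steps removed above with a single call. A userspace model of that open-coded pattern, with memcpy standing in for copy_from_user and without the -EFAULT path:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Bound the copy, NUL-terminate, then parse base-10 into unsigned long;
 * this is what the removed code did by hand. */
static int parse_ulong(const char *ubuf, size_t cnt, unsigned long *val)
{
        char buf[64];
        char *end;

        if (cnt >= sizeof(buf))
                return -EINVAL;
        memcpy(buf, ubuf, cnt);         /* copy_from_user() in the kernel */
        buf[cnt] = '\0';

        errno = 0;
        *val = strtoul(buf, &end, 10);
        if (errno || end == buf)
                return -EINVAL;
        return 0;
}

int main(void)
{
        unsigned long val;

        if (!parse_ulong("1\n", 2, &val))
                printf("parsed %lu\n", val);
        return 0;
}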
@@ -952,7 +947,7 @@ struct ftrace_func_probe { | |||
952 | }; | 947 | }; |
953 | 948 | ||
954 | enum { | 949 | enum { |
955 | FTRACE_ENABLE_CALLS = (1 << 0), | 950 | FTRACE_UPDATE_CALLS = (1 << 0), |
956 | FTRACE_DISABLE_CALLS = (1 << 1), | 951 | FTRACE_DISABLE_CALLS = (1 << 1), |
957 | FTRACE_UPDATE_TRACE_FUNC = (1 << 2), | 952 | FTRACE_UPDATE_TRACE_FUNC = (1 << 2), |
958 | FTRACE_START_FUNC_RET = (1 << 3), | 953 | FTRACE_START_FUNC_RET = (1 << 3), |
@@ -1182,8 +1177,14 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | |||
1182 | return NULL; | 1177 | return NULL; |
1183 | } | 1178 | } |
1184 | 1179 | ||
1180 | static void | ||
1181 | ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); | ||
1182 | static void | ||
1183 | ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); | ||
1184 | |||
1185 | static int | 1185 | static int |
1186 | ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) | 1186 | ftrace_hash_move(struct ftrace_ops *ops, int enable, |
1187 | struct ftrace_hash **dst, struct ftrace_hash *src) | ||
1187 | { | 1188 | { |
1188 | struct ftrace_func_entry *entry; | 1189 | struct ftrace_func_entry *entry; |
1189 | struct hlist_node *tp, *tn; | 1190 | struct hlist_node *tp, *tn; |
@@ -1193,9 +1194,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) | |||
1193 | unsigned long key; | 1194 | unsigned long key; |
1194 | int size = src->count; | 1195 | int size = src->count; |
1195 | int bits = 0; | 1196 | int bits = 0; |
1197 | int ret; | ||
1196 | int i; | 1198 | int i; |
1197 | 1199 | ||
1198 | /* | 1200 | /* |
1201 | * Remove the current set, update the hash and add | ||
1202 | * them back. | ||
1203 | */ | ||
1204 | ftrace_hash_rec_disable(ops, enable); | ||
1205 | |||
1206 | /* | ||
1199 | * If the new source is empty, just free dst and assign it | 1207 | * If the new source is empty, just free dst and assign it |
1200 | * the empty_hash. | 1208 | * the empty_hash. |
1201 | */ | 1209 | */ |
@@ -1215,9 +1223,10 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) | |||
1215 | if (bits > FTRACE_HASH_MAX_BITS) | 1223 | if (bits > FTRACE_HASH_MAX_BITS) |
1216 | bits = FTRACE_HASH_MAX_BITS; | 1224 | bits = FTRACE_HASH_MAX_BITS; |
1217 | 1225 | ||
1226 | ret = -ENOMEM; | ||
1218 | new_hash = alloc_ftrace_hash(bits); | 1227 | new_hash = alloc_ftrace_hash(bits); |
1219 | if (!new_hash) | 1228 | if (!new_hash) |
1220 | return -ENOMEM; | 1229 | goto out; |
1221 | 1230 | ||
1222 | size = 1 << src->size_bits; | 1231 | size = 1 << src->size_bits; |
1223 | for (i = 0; i < size; i++) { | 1232 | for (i = 0; i < size; i++) { |
@@ -1236,7 +1245,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) | |||
1236 | rcu_assign_pointer(*dst, new_hash); | 1245 | rcu_assign_pointer(*dst, new_hash); |
1237 | free_ftrace_hash_rcu(old_hash); | 1246 | free_ftrace_hash_rcu(old_hash); |
1238 | 1247 | ||
1239 | return 0; | 1248 | ret = 0; |
1249 | out: | ||
1250 | /* | ||
1251 | * Enable regardless of ret: | ||
1252 | * On success, we enable the new hash. | ||
1253 | * On failure, we re-enable the original hash. | ||
1254 | */ | ||
1255 | ftrace_hash_rec_enable(ops, enable); | ||
1256 | |||
1257 | return ret; | ||
1240 | } | 1258 | } |
1241 | 1259 | ||
1242 | /* | 1260 | /* |
@@ -1498,7 +1516,7 @@ int ftrace_text_reserved(void *start, void *end) | |||
1498 | 1516 | ||
1499 | 1517 | ||
1500 | static int | 1518 | static int |
1501 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | 1519 | __ftrace_replace_code(struct dyn_ftrace *rec, int update) |
1502 | { | 1520 | { |
1503 | unsigned long ftrace_addr; | 1521 | unsigned long ftrace_addr; |
1504 | unsigned long flag = 0UL; | 1522 | unsigned long flag = 0UL; |
@@ -1506,17 +1524,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1506 | ftrace_addr = (unsigned long)FTRACE_ADDR; | 1524 | ftrace_addr = (unsigned long)FTRACE_ADDR; |
1507 | 1525 | ||
1508 | /* | 1526 | /* |
1509 | * If we are enabling tracing: | 1527 | * If we are updating calls: |
1510 | * | 1528 | * |
1511 | * If the record has a ref count, then we need to enable it | 1529 | * If the record has a ref count, then we need to enable it |
1512 | * because someone is using it. | 1530 | * because someone is using it. |
1513 | * | 1531 | * |
1514 | * Otherwise we make sure it's disabled. | 1532 | * Otherwise we make sure it's disabled. |
1515 | * | 1533 | * |
1516 | * If we are disabling tracing, then disable all records that | 1534 | * If we are disabling calls, then disable all records that |
1517 | * are enabled. | 1535 | * are enabled. |
1518 | */ | 1536 | */ |
1519 | if (enable && (rec->flags & ~FTRACE_FL_MASK)) | 1537 | if (update && (rec->flags & ~FTRACE_FL_MASK)) |
1520 | flag = FTRACE_FL_ENABLED; | 1538 | flag = FTRACE_FL_ENABLED; |
1521 | 1539 | ||
1522 | /* If the state of this record hasn't changed, then do nothing */ | 1540 | /* If the state of this record hasn't changed, then do nothing */ |
@@ -1532,7 +1550,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1532 | return ftrace_make_nop(NULL, rec, ftrace_addr); | 1550 | return ftrace_make_nop(NULL, rec, ftrace_addr); |
1533 | } | 1551 | } |
1534 | 1552 | ||
1535 | static void ftrace_replace_code(int enable) | 1553 | static void ftrace_replace_code(int update) |
1536 | { | 1554 | { |
1537 | struct dyn_ftrace *rec; | 1555 | struct dyn_ftrace *rec; |
1538 | struct ftrace_page *pg; | 1556 | struct ftrace_page *pg; |
@@ -1546,7 +1564,7 @@ static void ftrace_replace_code(int enable) | |||
1546 | if (rec->flags & FTRACE_FL_FREE) | 1564 | if (rec->flags & FTRACE_FL_FREE) |
1547 | continue; | 1565 | continue; |
1548 | 1566 | ||
1549 | failed = __ftrace_replace_code(rec, enable); | 1567 | failed = __ftrace_replace_code(rec, update); |
1550 | if (failed) { | 1568 | if (failed) { |
1551 | ftrace_bug(failed, rec->ip); | 1569 | ftrace_bug(failed, rec->ip); |
1552 | /* Stop processing */ | 1570 | /* Stop processing */ |
@@ -1596,7 +1614,13 @@ static int __ftrace_modify_code(void *data) | |||
1596 | { | 1614 | { |
1597 | int *command = data; | 1615 | int *command = data; |
1598 | 1616 | ||
1599 | if (*command & FTRACE_ENABLE_CALLS) | 1617 | /* |
1618 | * Do not call function tracer while we update the code. | ||
1619 | * We are in stop machine, so there is no need to worry about races. | ||
1620 | */ | ||
1621 | function_trace_stop++; | ||
1622 | |||
1623 | if (*command & FTRACE_UPDATE_CALLS) | ||
1600 | ftrace_replace_code(1); | 1624 | ftrace_replace_code(1); |
1601 | else if (*command & FTRACE_DISABLE_CALLS) | 1625 | else if (*command & FTRACE_DISABLE_CALLS) |
1602 | ftrace_replace_code(0); | 1626 | ftrace_replace_code(0); |
@@ -1609,6 +1633,18 @@ static int __ftrace_modify_code(void *data) | |||
1609 | else if (*command & FTRACE_STOP_FUNC_RET) | 1633 | else if (*command & FTRACE_STOP_FUNC_RET) |
1610 | ftrace_disable_ftrace_graph_caller(); | 1634 | ftrace_disable_ftrace_graph_caller(); |
1611 | 1635 | ||
1636 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
1637 | /* | ||
1638 | * For archs that call ftrace_test_stop_func(), we must | ||
1639 | * wait till after we update all the function callers | ||
1640 | * before we update the callback. This keeps different | ||
1641 | * ops that record different functions from corrupting | ||
1642 | * each other. | ||
1643 | */ | ||
1644 | __ftrace_trace_function = __ftrace_trace_function_delay; | ||
1645 | #endif | ||
1646 | function_trace_stop--; | ||
1647 | |||
1612 | return 0; | 1648 | return 0; |
1613 | } | 1649 | } |
1614 | 1650 | ||
@@ -1652,7 +1688,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) | |||
1652 | return -ENODEV; | 1688 | return -ENODEV; |
1653 | 1689 | ||
1654 | ftrace_start_up++; | 1690 | ftrace_start_up++; |
1655 | command |= FTRACE_ENABLE_CALLS; | 1691 | command |= FTRACE_UPDATE_CALLS; |
1656 | 1692 | ||
1657 | /* ops marked global share the filter hashes */ | 1693 | /* ops marked global share the filter hashes */ |
1658 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | 1694 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { |
@@ -1704,8 +1740,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
1704 | if (ops != &global_ops || !global_start_up) | 1740 | if (ops != &global_ops || !global_start_up) |
1705 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | 1741 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; |
1706 | 1742 | ||
1707 | if (!ftrace_start_up) | 1743 | command |= FTRACE_UPDATE_CALLS; |
1708 | command |= FTRACE_DISABLE_CALLS; | ||
1709 | 1744 | ||
1710 | if (saved_ftrace_func != ftrace_trace_function) { | 1745 | if (saved_ftrace_func != ftrace_trace_function) { |
1711 | saved_ftrace_func = ftrace_trace_function; | 1746 | saved_ftrace_func = ftrace_trace_function; |
@@ -1727,7 +1762,7 @@ static void ftrace_startup_sysctl(void) | |||
1727 | saved_ftrace_func = NULL; | 1762 | saved_ftrace_func = NULL; |
1728 | /* ftrace_start_up is true if we want ftrace running */ | 1763 | /* ftrace_start_up is true if we want ftrace running */ |
1729 | if (ftrace_start_up) | 1764 | if (ftrace_start_up) |
1730 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | 1765 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); |
1731 | } | 1766 | } |
1732 | 1767 | ||
1733 | static void ftrace_shutdown_sysctl(void) | 1768 | static void ftrace_shutdown_sysctl(void) |
@@ -1744,10 +1779,36 @@ static cycle_t ftrace_update_time; | |||
1744 | static unsigned long ftrace_update_cnt; | 1779 | static unsigned long ftrace_update_cnt; |
1745 | unsigned long ftrace_update_tot_cnt; | 1780 | unsigned long ftrace_update_tot_cnt; |
1746 | 1781 | ||
1782 | static int ops_traces_mod(struct ftrace_ops *ops) | ||
1783 | { | ||
1784 | struct ftrace_hash *hash; | ||
1785 | |||
1786 | hash = ops->filter_hash; | ||
1787 | return !!(!hash || !hash->count); | ||
1788 | } | ||
1789 | |||
1747 | static int ftrace_update_code(struct module *mod) | 1790 | static int ftrace_update_code(struct module *mod) |
1748 | { | 1791 | { |
1749 | struct dyn_ftrace *p; | 1792 | struct dyn_ftrace *p; |
1750 | cycle_t start, stop; | 1793 | cycle_t start, stop; |
1794 | unsigned long ref = 0; | ||
1795 | |||
1796 | /* | ||
1797 | * When adding a module, we need to check if tracers are | ||
1798 | * currently enabled and if they are set to trace all functions. | ||
1799 | * If they are, we need to enable the module functions as well | ||
1800 | * as update the reference counts for those function records. | ||
1801 | */ | ||
1802 | if (mod) { | ||
1803 | struct ftrace_ops *ops; | ||
1804 | |||
1805 | for (ops = ftrace_ops_list; | ||
1806 | ops != &ftrace_list_end; ops = ops->next) { | ||
1807 | if (ops->flags & FTRACE_OPS_FL_ENABLED && | ||
1808 | ops_traces_mod(ops)) | ||
1809 | ref++; | ||
1810 | } | ||
1811 | } | ||
1751 | 1812 | ||
1752 | start = ftrace_now(raw_smp_processor_id()); | 1813 | start = ftrace_now(raw_smp_processor_id()); |
1753 | ftrace_update_cnt = 0; | 1814 | ftrace_update_cnt = 0; |
@@ -1760,7 +1821,7 @@ static int ftrace_update_code(struct module *mod) | |||
1760 | 1821 | ||
1761 | p = ftrace_new_addrs; | 1822 | p = ftrace_new_addrs; |
1762 | ftrace_new_addrs = p->newlist; | 1823 | ftrace_new_addrs = p->newlist; |
1763 | p->flags = 0L; | 1824 | p->flags = ref; |
1764 | 1825 | ||
1765 | /* | 1826 | /* |
1766 | * Do the initial record conversion from mcount jump | 1827 | * Do the initial record conversion from mcount jump |
@@ -1783,7 +1844,7 @@ static int ftrace_update_code(struct module *mod) | |||
1783 | * conversion puts the module to the correct state, thus | 1844 | * conversion puts the module to the correct state, thus |
1784 | * passing the ftrace_make_call check. | 1845 | * passing the ftrace_make_call check. |
1785 | */ | 1846 | */ |
1786 | if (ftrace_start_up) { | 1847 | if (ftrace_start_up && ref) { |
1787 | int failed = __ftrace_replace_code(p, 1); | 1848 | int failed = __ftrace_replace_code(p, 1); |
1788 | if (failed) { | 1849 | if (failed) { |
1789 | ftrace_bug(failed, p->ip); | 1850 | ftrace_bug(failed, p->ip); |
@@ -2407,10 +2468,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) | |||
2407 | */ | 2468 | */ |
2408 | 2469 | ||
2409 | static int | 2470 | static int |
2410 | ftrace_mod_callback(char *func, char *cmd, char *param, int enable) | 2471 | ftrace_mod_callback(struct ftrace_hash *hash, |
2472 | char *func, char *cmd, char *param, int enable) | ||
2411 | { | 2473 | { |
2412 | struct ftrace_ops *ops = &global_ops; | ||
2413 | struct ftrace_hash *hash; | ||
2414 | char *mod; | 2474 | char *mod; |
2415 | int ret = -EINVAL; | 2475 | int ret = -EINVAL; |
2416 | 2476 | ||
@@ -2430,11 +2490,6 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) | |||
2430 | if (!strlen(mod)) | 2490 | if (!strlen(mod)) |
2431 | return ret; | 2491 | return ret; |
2432 | 2492 | ||
2433 | if (enable) | ||
2434 | hash = ops->filter_hash; | ||
2435 | else | ||
2436 | hash = ops->notrace_hash; | ||
2437 | |||
2438 | ret = ftrace_match_module_records(hash, func, mod); | 2493 | ret = ftrace_match_module_records(hash, func, mod); |
2439 | if (!ret) | 2494 | if (!ret) |
2440 | ret = -EINVAL; | 2495 | ret = -EINVAL; |
@@ -2760,7 +2815,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash, | |||
2760 | mutex_lock(&ftrace_cmd_mutex); | 2815 | mutex_lock(&ftrace_cmd_mutex); |
2761 | list_for_each_entry(p, &ftrace_commands, list) { | 2816 | list_for_each_entry(p, &ftrace_commands, list) { |
2762 | if (strcmp(p->name, command) == 0) { | 2817 | if (strcmp(p->name, command) == 0) { |
2763 | ret = p->func(func, command, next, enable); | 2818 | ret = p->func(hash, func, command, next, enable); |
2764 | goto out_unlock; | 2819 | goto out_unlock; |
2765 | } | 2820 | } |
2766 | } | 2821 | } |
@@ -2857,7 +2912,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
2857 | ftrace_match_records(hash, buf, len); | 2912 | ftrace_match_records(hash, buf, len); |
2858 | 2913 | ||
2859 | mutex_lock(&ftrace_lock); | 2914 | mutex_lock(&ftrace_lock); |
2860 | ret = ftrace_hash_move(orig_hash, hash); | 2915 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); |
2916 | if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED | ||
2917 | && ftrace_enabled) | ||
2918 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); | ||
2919 | |||
2861 | mutex_unlock(&ftrace_lock); | 2920 | mutex_unlock(&ftrace_lock); |
2862 | 2921 | ||
2863 | mutex_unlock(&ftrace_regex_lock); | 2922 | mutex_unlock(&ftrace_regex_lock); |
@@ -3040,18 +3099,12 @@ ftrace_regex_release(struct inode *inode, struct file *file) | |||
3040 | orig_hash = &iter->ops->notrace_hash; | 3099 | orig_hash = &iter->ops->notrace_hash; |
3041 | 3100 | ||
3042 | mutex_lock(&ftrace_lock); | 3101 | mutex_lock(&ftrace_lock); |
3043 | /* | 3102 | ret = ftrace_hash_move(iter->ops, filter_hash, |
3044 | * Remove the current set, update the hash and add | 3103 | orig_hash, iter->hash); |
3045 | * them back. | 3104 | if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) |
3046 | */ | 3105 | && ftrace_enabled) |
3047 | ftrace_hash_rec_disable(iter->ops, filter_hash); | 3106 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); |
3048 | ret = ftrace_hash_move(orig_hash, iter->hash); | 3107 | |
3049 | if (!ret) { | ||
3050 | ftrace_hash_rec_enable(iter->ops, filter_hash); | ||
3051 | if (iter->ops->flags & FTRACE_OPS_FL_ENABLED | ||
3052 | && ftrace_enabled) | ||
3053 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | ||
3054 | } | ||
3055 | mutex_unlock(&ftrace_lock); | 3108 | mutex_unlock(&ftrace_lock); |
3056 | } | 3109 | } |
3057 | free_ftrace_hash(iter->hash); | 3110 | free_ftrace_hash(iter->hash); |
@@ -3330,7 +3383,7 @@ static int ftrace_process_locs(struct module *mod, | |||
3330 | { | 3383 | { |
3331 | unsigned long *p; | 3384 | unsigned long *p; |
3332 | unsigned long addr; | 3385 | unsigned long addr; |
3333 | unsigned long flags; | 3386 | unsigned long flags = 0; /* Shut up gcc */ |
3334 | 3387 | ||
3335 | mutex_lock(&ftrace_lock); | 3388 | mutex_lock(&ftrace_lock); |
3336 | p = start; | 3389 | p = start; |
@@ -3348,12 +3401,18 @@ static int ftrace_process_locs(struct module *mod, | |||
3348 | } | 3401 | } |
3349 | 3402 | ||
3350 | /* | 3403 | /* |
3351 | * Disable interrupts to prevent interrupts from executing | 3404 | * We only need to disable interrupts on start up |
3352 | * code that is being modified. | 3405 | * because we are modifying code that an interrupt |
3406 | * may execute, and the modification is not atomic. | ||
3407 | * But for modules, nothing runs the code we modify | ||
3408 | * until we are finished with it, and there's no | ||
3409 | * reason to cause large interrupt latencies while we do it. | ||
3353 | */ | 3410 | */ |
3354 | local_irq_save(flags); | 3411 | if (!mod) |
3412 | local_irq_save(flags); | ||
3355 | ftrace_update_code(mod); | 3413 | ftrace_update_code(mod); |
3356 | local_irq_restore(flags); | 3414 | if (!mod) |
3415 | local_irq_restore(flags); | ||
3357 | mutex_unlock(&ftrace_lock); | 3416 | mutex_unlock(&ftrace_lock); |
3358 | 3417 | ||
3359 | return 0; | 3418 | return 0; |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b0c7aa40794..731201bf4ac 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -997,15 +997,21 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
997 | unsigned nr_pages) | 997 | unsigned nr_pages) |
998 | { | 998 | { |
999 | struct buffer_page *bpage, *tmp; | 999 | struct buffer_page *bpage, *tmp; |
1000 | unsigned long addr; | ||
1001 | LIST_HEAD(pages); | 1000 | LIST_HEAD(pages); |
1002 | unsigned i; | 1001 | unsigned i; |
1003 | 1002 | ||
1004 | WARN_ON(!nr_pages); | 1003 | WARN_ON(!nr_pages); |
1005 | 1004 | ||
1006 | for (i = 0; i < nr_pages; i++) { | 1005 | for (i = 0; i < nr_pages; i++) { |
1006 | struct page *page; | ||
1007 | /* | ||
1008 | * The __GFP_NORETRY flag makes sure that the allocation fails | ||
1009 | * gracefully without invoking the OOM killer, so the system is | ||
1010 | * not destabilized. | ||
1011 | */ | ||
1007 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 1012 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
1008 | GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); | 1013 | GFP_KERNEL | __GFP_NORETRY, |
1014 | cpu_to_node(cpu_buffer->cpu)); | ||
1009 | if (!bpage) | 1015 | if (!bpage) |
1010 | goto free_pages; | 1016 | goto free_pages; |
1011 | 1017 | ||
@@ -1013,10 +1019,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
1013 | 1019 | ||
1014 | list_add(&bpage->list, &pages); | 1020 | list_add(&bpage->list, &pages); |
1015 | 1021 | ||
1016 | addr = __get_free_page(GFP_KERNEL); | 1022 | page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), |
1017 | if (!addr) | 1023 | GFP_KERNEL | __GFP_NORETRY, 0); |
1024 | if (!page) | ||
1018 | goto free_pages; | 1025 | goto free_pages; |
1019 | bpage->page = (void *)addr; | 1026 | bpage->page = page_address(page); |
1020 | rb_init_page(bpage->page); | 1027 | rb_init_page(bpage->page); |
1021 | } | 1028 | } |
1022 | 1029 | ||
@@ -1045,7 +1052,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1045 | { | 1052 | { |
1046 | struct ring_buffer_per_cpu *cpu_buffer; | 1053 | struct ring_buffer_per_cpu *cpu_buffer; |
1047 | struct buffer_page *bpage; | 1054 | struct buffer_page *bpage; |
1048 | unsigned long addr; | 1055 | struct page *page; |
1049 | int ret; | 1056 | int ret; |
1050 | 1057 | ||
1051 | cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), | 1058 | cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), |
@@ -1067,10 +1074,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1067 | rb_check_bpage(cpu_buffer, bpage); | 1074 | rb_check_bpage(cpu_buffer, bpage); |
1068 | 1075 | ||
1069 | cpu_buffer->reader_page = bpage; | 1076 | cpu_buffer->reader_page = bpage; |
1070 | addr = __get_free_page(GFP_KERNEL); | 1077 | page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); |
1071 | if (!addr) | 1078 | if (!page) |
1072 | goto fail_free_reader; | 1079 | goto fail_free_reader; |
1073 | bpage->page = (void *)addr; | 1080 | bpage->page = page_address(page); |
1074 | rb_init_page(bpage->page); | 1081 | rb_init_page(bpage->page); |
1075 | 1082 | ||
1076 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 1083 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
@@ -1314,7 +1321,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
1314 | unsigned nr_pages, rm_pages, new_pages; | 1321 | unsigned nr_pages, rm_pages, new_pages; |
1315 | struct buffer_page *bpage, *tmp; | 1322 | struct buffer_page *bpage, *tmp; |
1316 | unsigned long buffer_size; | 1323 | unsigned long buffer_size; |
1317 | unsigned long addr; | ||
1318 | LIST_HEAD(pages); | 1324 | LIST_HEAD(pages); |
1319 | int i, cpu; | 1325 | int i, cpu; |
1320 | 1326 | ||
@@ -1375,16 +1381,24 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
1375 | 1381 | ||
1376 | for_each_buffer_cpu(buffer, cpu) { | 1382 | for_each_buffer_cpu(buffer, cpu) { |
1377 | for (i = 0; i < new_pages; i++) { | 1383 | for (i = 0; i < new_pages; i++) { |
1384 | struct page *page; | ||
1385 | /* | ||
1386 | * The __GFP_NORETRY flag makes sure that the allocation | ||
1387 | * fails gracefully without invoking the OOM killer, so | ||
1388 | * the system is not destabilized. | ||
1389 | */ | ||
1378 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), | 1390 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), |
1379 | cache_line_size()), | 1391 | cache_line_size()), |
1380 | GFP_KERNEL, cpu_to_node(cpu)); | 1392 | GFP_KERNEL | __GFP_NORETRY, |
1393 | cpu_to_node(cpu)); | ||
1381 | if (!bpage) | 1394 | if (!bpage) |
1382 | goto free_pages; | 1395 | goto free_pages; |
1383 | list_add(&bpage->list, &pages); | 1396 | list_add(&bpage->list, &pages); |
1384 | addr = __get_free_page(GFP_KERNEL); | 1397 | page = alloc_pages_node(cpu_to_node(cpu), |
1385 | if (!addr) | 1398 | GFP_KERNEL | __GFP_NORETRY, 0); |
1399 | if (!page) | ||
1386 | goto free_pages; | 1400 | goto free_pages; |
1387 | bpage->page = (void *)addr; | 1401 | bpage->page = page_address(page); |
1388 | rb_init_page(bpage->page); | 1402 | rb_init_page(bpage->page); |
1389 | } | 1403 | } |
1390 | } | 1404 | } |
@@ -3730,16 +3744,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); | |||
3730 | * Returns: | 3744 | * Returns: |
3731 | * The page allocated, or NULL on error. | 3745 | * The page allocated, or NULL on error. |
3732 | */ | 3746 | */ |
3733 | void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) | 3747 | void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) |
3734 | { | 3748 | { |
3735 | struct buffer_data_page *bpage; | 3749 | struct buffer_data_page *bpage; |
3736 | unsigned long addr; | 3750 | struct page *page; |
3737 | 3751 | ||
3738 | addr = __get_free_page(GFP_KERNEL); | 3752 | page = alloc_pages_node(cpu_to_node(cpu), |
3739 | if (!addr) | 3753 | GFP_KERNEL | __GFP_NORETRY, 0); |
3754 | if (!page) | ||
3740 | return NULL; | 3755 | return NULL; |
3741 | 3756 | ||
3742 | bpage = (void *)addr; | 3757 | bpage = page_address(page); |
3743 | 3758 | ||
3744 | rb_init_page(bpage); | 3759 | rb_init_page(bpage); |
3745 | 3760 | ||
@@ -3978,20 +3993,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf, | |||
3978 | size_t cnt, loff_t *ppos) | 3993 | size_t cnt, loff_t *ppos) |
3979 | { | 3994 | { |
3980 | unsigned long *p = filp->private_data; | 3995 | unsigned long *p = filp->private_data; |
3981 | char buf[64]; | ||
3982 | unsigned long val; | 3996 | unsigned long val; |
3983 | int ret; | 3997 | int ret; |
3984 | 3998 | ||
3985 | if (cnt >= sizeof(buf)) | 3999 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
3986 | return -EINVAL; | 4000 | if (ret) |
3987 | |||
3988 | if (copy_from_user(&buf, ubuf, cnt)) | ||
3989 | return -EFAULT; | ||
3990 | |||
3991 | buf[cnt] = 0; | ||
3992 | |||
3993 | ret = strict_strtoul(buf, 10, &val); | ||
3994 | if (ret < 0) | ||
3995 | return ret; | 4001 | return ret; |
3996 | 4002 | ||
3997 | if (val) | 4003 | if (val) |
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 302f8a61463..a5457d577b9 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
@@ -106,7 +106,7 @@ static enum event_status read_page(int cpu) | |||
106 | int inc; | 106 | int inc; |
107 | int i; | 107 | int i; |
108 | 108 | ||
109 | bpage = ring_buffer_alloc_read_page(buffer); | 109 | bpage = ring_buffer_alloc_read_page(buffer, cpu); |
110 | if (!bpage) | 110 | if (!bpage) |
111 | return EVENT_DROPPED; | 111 | return EVENT_DROPPED; |
112 | 112 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ee9c921d7f2..17a2d44e1af 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | |||
343 | static int trace_stop_count; | 343 | static int trace_stop_count; |
344 | static DEFINE_SPINLOCK(tracing_start_lock); | 344 | static DEFINE_SPINLOCK(tracing_start_lock); |
345 | 345 | ||
346 | static void wakeup_work_handler(struct work_struct *work) | ||
347 | { | ||
348 | wake_up(&trace_wait); | ||
349 | } | ||
350 | |||
351 | static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); | ||
352 | |||
346 | /** | 353 | /** |
347 | * trace_wake_up - wake up tasks waiting for trace input | 354 | * trace_wake_up - wake up tasks waiting for trace input |
348 | * | 355 | * |
349 | * Simply wakes up any task that is blocked on the trace_wait | 356 | * Schedules a delayed work to wake up any task that is blocked on the |
350 | * queue. These is used with trace_poll for tasks polling the trace. | 357 | * trace_wait queue. These is used with trace_poll for tasks polling the |
358 | * trace. | ||
351 | */ | 359 | */ |
352 | void trace_wake_up(void) | 360 | void trace_wake_up(void) |
353 | { | 361 | { |
354 | int cpu; | 362 | const unsigned long delay = msecs_to_jiffies(2); |
355 | 363 | ||
356 | if (trace_flags & TRACE_ITER_BLOCK) | 364 | if (trace_flags & TRACE_ITER_BLOCK) |
357 | return; | 365 | return; |
358 | /* | 366 | schedule_delayed_work(&wakeup_work, delay); |
359 | * The runqueue_is_locked() can fail, but this is the best we | ||
360 | * have for now: | ||
361 | */ | ||
362 | cpu = get_cpu(); | ||
363 | if (!runqueue_is_locked(cpu)) | ||
364 | wake_up(&trace_wait); | ||
365 | put_cpu(); | ||
366 | } | 367 | } |
367 | 368 | ||
368 | static int __init set_buf_size(char *str) | 369 | static int __init set_buf_size(char *str) |
@@ -424,6 +425,7 @@ static const char *trace_options[] = { | |||
424 | "graph-time", | 425 | "graph-time", |
425 | "record-cmd", | 426 | "record-cmd", |
426 | "overwrite", | 427 | "overwrite", |
428 | "disable_on_free", | ||
427 | NULL | 429 | NULL |
428 | }; | 430 | }; |
429 | 431 | ||
@@ -1191,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, | |||
1191 | } | 1193 | } |
1192 | EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); | 1194 | EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); |
1193 | 1195 | ||
1196 | void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, | ||
1197 | struct ring_buffer_event *event, | ||
1198 | unsigned long flags, int pc, | ||
1199 | struct pt_regs *regs) | ||
1200 | { | ||
1201 | ring_buffer_unlock_commit(buffer, event); | ||
1202 | |||
1203 | ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); | ||
1204 | ftrace_trace_userstack(buffer, flags, pc); | ||
1205 | } | ||
1206 | EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); | ||
1207 | |||
1194 | void trace_current_buffer_discard_commit(struct ring_buffer *buffer, | 1208 | void trace_current_buffer_discard_commit(struct ring_buffer *buffer, |
1195 | struct ring_buffer_event *event) | 1209 | struct ring_buffer_event *event) |
1196 | { | 1210 | { |
@@ -1234,30 +1248,103 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, | |||
1234 | } | 1248 | } |
1235 | 1249 | ||
1236 | #ifdef CONFIG_STACKTRACE | 1250 | #ifdef CONFIG_STACKTRACE |
1251 | |||
1252 | #define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) | ||
1253 | struct ftrace_stack { | ||
1254 | unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; | ||
1255 | }; | ||
1256 | |||
1257 | static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); | ||
1258 | static DEFINE_PER_CPU(int, ftrace_stack_reserve); | ||
1259 | |||
1237 | static void __ftrace_trace_stack(struct ring_buffer *buffer, | 1260 | static void __ftrace_trace_stack(struct ring_buffer *buffer, |
1238 | unsigned long flags, | 1261 | unsigned long flags, |
1239 | int skip, int pc) | 1262 | int skip, int pc, struct pt_regs *regs) |
1240 | { | 1263 | { |
1241 | struct ftrace_event_call *call = &event_kernel_stack; | 1264 | struct ftrace_event_call *call = &event_kernel_stack; |
1242 | struct ring_buffer_event *event; | 1265 | struct ring_buffer_event *event; |
1243 | struct stack_entry *entry; | 1266 | struct stack_entry *entry; |
1244 | struct stack_trace trace; | 1267 | struct stack_trace trace; |
1268 | int use_stack; | ||
1269 | int size = FTRACE_STACK_ENTRIES; | ||
1270 | |||
1271 | trace.nr_entries = 0; | ||
1272 | trace.skip = skip; | ||
1273 | |||
1274 | /* | ||
1275 | * Since events can happen in NMIs there's no safe way to | ||
1276 | * use the per cpu ftrace_stacks. We reserve it and if an interrupt | ||
1277 | * or NMI comes in, it will just have to use the default | ||
1278 | * FTRACE_STACK_SIZE. | ||
1279 | */ | ||
1280 | preempt_disable_notrace(); | ||
1281 | |||
1282 | use_stack = ++__get_cpu_var(ftrace_stack_reserve); | ||
1283 | /* | ||
1284 | * We don't need any atomic variables, just a barrier. | ||
1285 | * If an interrupt comes in, we don't care, because it would | ||
1286 | * have exited and put the counter back to what we want. | ||
1287 | * We just need a barrier to keep gcc from moving things | ||
1288 | * around. | ||
1289 | */ | ||
1290 | barrier(); | ||
1291 | if (use_stack == 1) { | ||
1292 | trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; | ||
1293 | trace.max_entries = FTRACE_STACK_MAX_ENTRIES; | ||
1294 | |||
1295 | if (regs) | ||
1296 | save_stack_trace_regs(regs, &trace); | ||
1297 | else | ||
1298 | save_stack_trace(&trace); | ||
1299 | |||
1300 | if (trace.nr_entries > size) | ||
1301 | size = trace.nr_entries; | ||
1302 | } else | ||
1303 | /* From now on, use_stack is a boolean */ | ||
1304 | use_stack = 0; | ||
1305 | |||
1306 | size *= sizeof(unsigned long); | ||
1245 | 1307 | ||
1246 | event = trace_buffer_lock_reserve(buffer, TRACE_STACK, | 1308 | event = trace_buffer_lock_reserve(buffer, TRACE_STACK, |
1247 | sizeof(*entry), flags, pc); | 1309 | sizeof(*entry) + size, flags, pc); |
1248 | if (!event) | 1310 | if (!event) |
1249 | return; | 1311 | goto out; |
1250 | entry = ring_buffer_event_data(event); | 1312 | entry = ring_buffer_event_data(event); |
1251 | memset(&entry->caller, 0, sizeof(entry->caller)); | ||
1252 | 1313 | ||
1253 | trace.nr_entries = 0; | 1314 | memset(&entry->caller, 0, size); |
1254 | trace.max_entries = FTRACE_STACK_ENTRIES; | 1315 | |
1255 | trace.skip = skip; | 1316 | if (use_stack) |
1256 | trace.entries = entry->caller; | 1317 | memcpy(&entry->caller, trace.entries, |
1318 | trace.nr_entries * sizeof(unsigned long)); | ||
1319 | else { | ||
1320 | trace.max_entries = FTRACE_STACK_ENTRIES; | ||
1321 | trace.entries = entry->caller; | ||
1322 | if (regs) | ||
1323 | save_stack_trace_regs(regs, &trace); | ||
1324 | else | ||
1325 | save_stack_trace(&trace); | ||
1326 | } | ||
1327 | |||
1328 | entry->size = trace.nr_entries; | ||
1257 | 1329 | ||
1258 | save_stack_trace(&trace); | ||
1259 | if (!filter_check_discard(call, entry, buffer, event)) | 1330 | if (!filter_check_discard(call, entry, buffer, event)) |
1260 | ring_buffer_unlock_commit(buffer, event); | 1331 | ring_buffer_unlock_commit(buffer, event); |
1332 | |||
1333 | out: | ||
1334 | /* Again, don't let gcc optimize things here */ | ||
1335 | barrier(); | ||
1336 | __get_cpu_var(ftrace_stack_reserve)--; | ||
1337 | preempt_enable_notrace(); | ||
1338 | |||
1339 | } | ||
1340 | |||
1341 | void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, | ||
1342 | int skip, int pc, struct pt_regs *regs) | ||
1343 | { | ||
1344 | if (!(trace_flags & TRACE_ITER_STACKTRACE)) | ||
1345 | return; | ||
1346 | |||
1347 | __ftrace_trace_stack(buffer, flags, skip, pc, regs); | ||
1261 | } | 1348 | } |
1262 | 1349 | ||
1263 | void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, | 1350 | void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, |
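The hunk above reserves a per-CPU scratch stack with a plain counter, a compiler barrier and preemption disabled, so a nested interrupt or NMI simply falls back to the small default buffer instead of clobbering the scratch area. Below is a minimal user-space sketch of that reserve/fallback pattern only; the globals stand in for the per-CPU variables, and nothing here models the kernel's real per-CPU or NMI machinery.

#include <stdio.h>

#define SCRATCH_ENTRIES 64

static unsigned long scratch[SCRATCH_ENTRIES]; /* stands in for the per-CPU ftrace_stack */
static int scratch_reserve;                    /* stands in for ftrace_stack_reserve */

#define barrier() __asm__ __volatile__("" ::: "memory")

/* Capture a "stack trace" into whichever buffer we managed to reserve. */
static void capture(unsigned long *small_buf, int small_max)
{
    int use_scratch = ++scratch_reserve;

    /* Keep the compiler from reordering the reservation with its use. */
    barrier();

    if (use_scratch == 1) {
        /* Outermost user: the big scratch area is ours. */
        for (int i = 0; i < SCRATCH_ENTRIES; i++)
            scratch[i] = i;
        printf("used the %d-entry scratch buffer\n", SCRATCH_ENTRIES);
    } else {
        /* Nested caller (interrupt/NMI in the kernel): fall back. */
        for (int i = 0; i < small_max; i++)
            small_buf[i] = i;
        printf("nested: fell back to the %d-entry buffer\n", small_max);
    }

    barrier();
    scratch_reserve--;
}

int main(void)
{
    unsigned long small[8];

    capture(small, 8);      /* outermost call wins the scratch buffer */
    scratch_reserve = 1;    /* pretend we are nested inside another capture */
    capture(small, 8);      /* falls back to the small buffer */
    scratch_reserve = 0;
    return 0;
}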
@@ -1266,13 +1353,13 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, | |||
1266 | if (!(trace_flags & TRACE_ITER_STACKTRACE)) | 1353 | if (!(trace_flags & TRACE_ITER_STACKTRACE)) |
1267 | return; | 1354 | return; |
1268 | 1355 | ||
1269 | __ftrace_trace_stack(buffer, flags, skip, pc); | 1356 | __ftrace_trace_stack(buffer, flags, skip, pc, NULL); |
1270 | } | 1357 | } |
1271 | 1358 | ||
1272 | void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, | 1359 | void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, |
1273 | int pc) | 1360 | int pc) |
1274 | { | 1361 | { |
1275 | __ftrace_trace_stack(tr->buffer, flags, skip, pc); | 1362 | __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); |
1276 | } | 1363 | } |
1277 | 1364 | ||
1278 | /** | 1365 | /** |
@@ -1288,7 +1375,7 @@ void trace_dump_stack(void) | |||
1288 | local_save_flags(flags); | 1375 | local_save_flags(flags); |
1289 | 1376 | ||
1290 | /* skipping 3 traces, seems to get us at the caller of this function */ | 1377 | /* skipping 3 traces, seems to get us at the caller of this function */ |
1291 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); | 1378 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); |
1292 | } | 1379 | } |
1293 | 1380 | ||
1294 | static DEFINE_PER_CPU(int, user_stack_count); | 1381 | static DEFINE_PER_CPU(int, user_stack_count); |
@@ -1536,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, | |||
1536 | 1623 | ||
1537 | ftrace_enable_cpu(); | 1624 | ftrace_enable_cpu(); |
1538 | 1625 | ||
1539 | return event ? ring_buffer_event_data(event) : NULL; | 1626 | if (event) { |
1627 | iter->ent_size = ring_buffer_event_length(event); | ||
1628 | return ring_buffer_event_data(event); | ||
1629 | } | ||
1630 | iter->ent_size = 0; | ||
1631 | return NULL; | ||
1540 | } | 1632 | } |
1541 | 1633 | ||
1542 | static struct trace_entry * | 1634 | static struct trace_entry * |
@@ -2051,6 +2143,9 @@ void trace_default_header(struct seq_file *m) | |||
2051 | { | 2143 | { |
2052 | struct trace_iterator *iter = m->private; | 2144 | struct trace_iterator *iter = m->private; |
2053 | 2145 | ||
2146 | if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) | ||
2147 | return; | ||
2148 | |||
2054 | if (iter->iter_flags & TRACE_FILE_LAT_FMT) { | 2149 | if (iter->iter_flags & TRACE_FILE_LAT_FMT) { |
2055 | /* print nothing if the buffers are empty */ | 2150 | /* print nothing if the buffers are empty */ |
2056 | if (trace_empty(iter)) | 2151 | if (trace_empty(iter)) |
@@ -2701,20 +2796,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, | |||
2701 | size_t cnt, loff_t *ppos) | 2796 | size_t cnt, loff_t *ppos) |
2702 | { | 2797 | { |
2703 | struct trace_array *tr = filp->private_data; | 2798 | struct trace_array *tr = filp->private_data; |
2704 | char buf[64]; | ||
2705 | unsigned long val; | 2799 | unsigned long val; |
2706 | int ret; | 2800 | int ret; |
2707 | 2801 | ||
2708 | if (cnt >= sizeof(buf)) | 2802 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
2709 | return -EINVAL; | 2803 | if (ret) |
2710 | |||
2711 | if (copy_from_user(&buf, ubuf, cnt)) | ||
2712 | return -EFAULT; | ||
2713 | |||
2714 | buf[cnt] = 0; | ||
2715 | |||
2716 | ret = strict_strtoul(buf, 10, &val); | ||
2717 | if (ret < 0) | ||
2718 | return ret; | 2804 | return ret; |
2719 | 2805 | ||
2720 | val = !!val; | 2806 | val = !!val; |
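This and several later hunks replace the open-coded copy_from_user()/strict_strtoul() sequence with a single kstrtoul_from_user() call, which bounds the copy, NUL-terminates and parses in one step. A rough user-space analogue of that helper, written only to show the shape of the conversion; the function name and the 64-byte bound are illustrative, not the kernel implementation.

#include <errno.h>
#include <stdlib.h>
#include <string.h>

/*
 * Rough analogue of kstrtoul_from_user(): copy at most count bytes of the
 * "user" buffer into a bounded local buffer, NUL-terminate it, and parse
 * an unsigned long in the given base.  Returns 0 or a negative errno.
 */
static int strtoul_from_buf(const char *ubuf, size_t count, int base,
                            unsigned long *val)
{
    char buf[64];
    char *end;

    if (count >= sizeof(buf))
        return -EINVAL;

    memcpy(buf, ubuf, count);   /* copy_from_user() in the kernel */
    buf[count] = '\0';

    errno = 0;
    *val = strtoul(buf, &end, base);
    if (errno || end == buf)
        return -EINVAL;
    return 0;
}

int main(void)
{
    unsigned long val;

    if (strtoul_from_buf("1024\n", 5, 10, &val) == 0)
        return (int)(val != 1024);  /* exit 0 when parsing succeeded */
    return 1;
}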
@@ -2767,7 +2853,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr) | |||
2767 | return t->init(tr); | 2853 | return t->init(tr); |
2768 | } | 2854 | } |
2769 | 2855 | ||
2770 | static int tracing_resize_ring_buffer(unsigned long size) | 2856 | static int __tracing_resize_ring_buffer(unsigned long size) |
2771 | { | 2857 | { |
2772 | int ret; | 2858 | int ret; |
2773 | 2859 | ||
@@ -2819,6 +2905,41 @@ static int tracing_resize_ring_buffer(unsigned long size) | |||
2819 | return ret; | 2905 | return ret; |
2820 | } | 2906 | } |
2821 | 2907 | ||
2908 | static ssize_t tracing_resize_ring_buffer(unsigned long size) | ||
2909 | { | ||
2910 | int cpu, ret = size; | ||
2911 | |||
2912 | mutex_lock(&trace_types_lock); | ||
2913 | |||
2914 | tracing_stop(); | ||
2915 | |||
2916 | /* disable all cpu buffers */ | ||
2917 | for_each_tracing_cpu(cpu) { | ||
2918 | if (global_trace.data[cpu]) | ||
2919 | atomic_inc(&global_trace.data[cpu]->disabled); | ||
2920 | if (max_tr.data[cpu]) | ||
2921 | atomic_inc(&max_tr.data[cpu]->disabled); | ||
2922 | } | ||
2923 | |||
2924 | if (size != global_trace.entries) | ||
2925 | ret = __tracing_resize_ring_buffer(size); | ||
2926 | |||
2927 | if (ret < 0) | ||
2928 | ret = -ENOMEM; | ||
2929 | |||
2930 | for_each_tracing_cpu(cpu) { | ||
2931 | if (global_trace.data[cpu]) | ||
2932 | atomic_dec(&global_trace.data[cpu]->disabled); | ||
2933 | if (max_tr.data[cpu]) | ||
2934 | atomic_dec(&max_tr.data[cpu]->disabled); | ||
2935 | } | ||
2936 | |||
2937 | tracing_start(); | ||
2938 | mutex_unlock(&trace_types_lock); | ||
2939 | |||
2940 | return ret; | ||
2941 | } | ||
2942 | |||
2822 | 2943 | ||
2823 | /** | 2944 | /** |
2824 | * tracing_update_buffers - used by tracing facility to expand ring buffers | 2945 | * tracing_update_buffers - used by tracing facility to expand ring buffers |
@@ -2836,7 +2957,7 @@ int tracing_update_buffers(void) | |||
2836 | 2957 | ||
2837 | mutex_lock(&trace_types_lock); | 2958 | mutex_lock(&trace_types_lock); |
2838 | if (!ring_buffer_expanded) | 2959 | if (!ring_buffer_expanded) |
2839 | ret = tracing_resize_ring_buffer(trace_buf_size); | 2960 | ret = __tracing_resize_ring_buffer(trace_buf_size); |
2840 | mutex_unlock(&trace_types_lock); | 2961 | mutex_unlock(&trace_types_lock); |
2841 | 2962 | ||
2842 | return ret; | 2963 | return ret; |
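The resize path is split into a bare __tracing_resize_ring_buffer(), callable with trace_types_lock already held (as tracing_update_buffers() does above), and a tracing_resize_ring_buffer() wrapper that takes the lock, stops tracing and bumps every per-CPU ->disabled counter before resizing, then undoes it all. The sketch below mirrors only that bracketing pattern, with a pthread mutex and plain counters; the names and the resize body are placeholders.

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

static pthread_mutex_t types_lock = PTHREAD_MUTEX_INITIALIZER;
static int disabled[NR_CPUS];        /* stands in for per-CPU ->disabled */
static unsigned long entries = 1024;

/* Inner helper: does only the resize; the caller holds the lock. */
static int __resize(unsigned long size)
{
    entries = size;
    return 0;
}

/* Outer wrapper: quiesce writers, resize, then re-enable everything. */
static long resize(unsigned long size)
{
    long ret = (long)size;

    pthread_mutex_lock(&types_lock);

    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        disabled[cpu]++;             /* keep writers out of each buffer */

    if (size != entries)
        ret = __resize(size) < 0 ? -1 : ret;

    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        disabled[cpu]--;             /* let writers back in */

    pthread_mutex_unlock(&types_lock);
    return ret;
}

int main(void)
{
    long ret = resize(2048);

    printf("resize returned %ld, entries now %lu\n", ret, entries);
    return 0;
}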
@@ -2860,7 +2981,7 @@ static int tracing_set_tracer(const char *buf) | |||
2860 | mutex_lock(&trace_types_lock); | 2981 | mutex_lock(&trace_types_lock); |
2861 | 2982 | ||
2862 | if (!ring_buffer_expanded) { | 2983 | if (!ring_buffer_expanded) { |
2863 | ret = tracing_resize_ring_buffer(trace_buf_size); | 2984 | ret = __tracing_resize_ring_buffer(trace_buf_size); |
2864 | if (ret < 0) | 2985 | if (ret < 0) |
2865 | goto out; | 2986 | goto out; |
2866 | ret = 0; | 2987 | ret = 0; |
@@ -2966,20 +3087,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, | |||
2966 | size_t cnt, loff_t *ppos) | 3087 | size_t cnt, loff_t *ppos) |
2967 | { | 3088 | { |
2968 | unsigned long *ptr = filp->private_data; | 3089 | unsigned long *ptr = filp->private_data; |
2969 | char buf[64]; | ||
2970 | unsigned long val; | 3090 | unsigned long val; |
2971 | int ret; | 3091 | int ret; |
2972 | 3092 | ||
2973 | if (cnt >= sizeof(buf)) | 3093 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
2974 | return -EINVAL; | 3094 | if (ret) |
2975 | |||
2976 | if (copy_from_user(&buf, ubuf, cnt)) | ||
2977 | return -EFAULT; | ||
2978 | |||
2979 | buf[cnt] = 0; | ||
2980 | |||
2981 | ret = strict_strtoul(buf, 10, &val); | ||
2982 | if (ret < 0) | ||
2983 | return ret; | 3095 | return ret; |
2984 | 3096 | ||
2985 | *ptr = val * 1000; | 3097 | *ptr = val * 1000; |
@@ -3434,67 +3546,54 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3434 | size_t cnt, loff_t *ppos) | 3546 | size_t cnt, loff_t *ppos) |
3435 | { | 3547 | { |
3436 | unsigned long val; | 3548 | unsigned long val; |
3437 | char buf[64]; | 3549 | int ret; |
3438 | int ret, cpu; | ||
3439 | |||
3440 | if (cnt >= sizeof(buf)) | ||
3441 | return -EINVAL; | ||
3442 | |||
3443 | if (copy_from_user(&buf, ubuf, cnt)) | ||
3444 | return -EFAULT; | ||
3445 | |||
3446 | buf[cnt] = 0; | ||
3447 | 3550 | ||
3448 | ret = strict_strtoul(buf, 10, &val); | 3551 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
3449 | if (ret < 0) | 3552 | if (ret) |
3450 | return ret; | 3553 | return ret; |
3451 | 3554 | ||
3452 | /* must have at least 1 entry */ | 3555 | /* must have at least 1 entry */ |
3453 | if (!val) | 3556 | if (!val) |
3454 | return -EINVAL; | 3557 | return -EINVAL; |
3455 | 3558 | ||
3456 | mutex_lock(&trace_types_lock); | ||
3457 | |||
3458 | tracing_stop(); | ||
3459 | |||
3460 | /* disable all cpu buffers */ | ||
3461 | for_each_tracing_cpu(cpu) { | ||
3462 | if (global_trace.data[cpu]) | ||
3463 | atomic_inc(&global_trace.data[cpu]->disabled); | ||
3464 | if (max_tr.data[cpu]) | ||
3465 | atomic_inc(&max_tr.data[cpu]->disabled); | ||
3466 | } | ||
3467 | |||
3468 | /* value is in KB */ | 3559 | /* value is in KB */ |
3469 | val <<= 10; | 3560 | val <<= 10; |
3470 | 3561 | ||
3471 | if (val != global_trace.entries) { | 3562 | ret = tracing_resize_ring_buffer(val); |
3472 | ret = tracing_resize_ring_buffer(val); | 3563 | if (ret < 0) |
3473 | if (ret < 0) { | 3564 | return ret; |
3474 | cnt = ret; | ||
3475 | goto out; | ||
3476 | } | ||
3477 | } | ||
3478 | 3565 | ||
3479 | *ppos += cnt; | 3566 | *ppos += cnt; |
3480 | 3567 | ||
3481 | /* If check pages failed, return ENOMEM */ | 3568 | return cnt; |
3482 | if (tracing_disabled) | 3569 | } |
3483 | cnt = -ENOMEM; | ||
3484 | out: | ||
3485 | for_each_tracing_cpu(cpu) { | ||
3486 | if (global_trace.data[cpu]) | ||
3487 | atomic_dec(&global_trace.data[cpu]->disabled); | ||
3488 | if (max_tr.data[cpu]) | ||
3489 | atomic_dec(&max_tr.data[cpu]->disabled); | ||
3490 | } | ||
3491 | 3570 | ||
3492 | tracing_start(); | 3571 | static ssize_t |
3493 | mutex_unlock(&trace_types_lock); | 3572 | tracing_free_buffer_write(struct file *filp, const char __user *ubuf, |
3573 | size_t cnt, loff_t *ppos) | ||
3574 | { | ||
3575 | /* | ||
3576 | * There is no need to read what the user has written; this function | ||
3577 | * is just to make sure that there is no error when "echo" is used | ||
3578 | */ | ||
3579 | |||
3580 | *ppos += cnt; | ||
3494 | 3581 | ||
3495 | return cnt; | 3582 | return cnt; |
3496 | } | 3583 | } |
3497 | 3584 | ||
3585 | static int | ||
3586 | tracing_free_buffer_release(struct inode *inode, struct file *filp) | ||
3587 | { | ||
3588 | /* disable tracing ? */ | ||
3589 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) | ||
3590 | tracing_off(); | ||
3591 | /* resize the ring buffer to 0 */ | ||
3592 | tracing_resize_ring_buffer(0); | ||
3593 | |||
3594 | return 0; | ||
3595 | } | ||
3596 | |||
3498 | static int mark_printk(const char *fmt, ...) | 3597 | static int mark_printk(const char *fmt, ...) |
3499 | { | 3598 | { |
3500 | int ret; | 3599 | int ret; |
@@ -3640,6 +3739,11 @@ static const struct file_operations tracing_entries_fops = { | |||
3640 | .llseek = generic_file_llseek, | 3739 | .llseek = generic_file_llseek, |
3641 | }; | 3740 | }; |
3642 | 3741 | ||
3742 | static const struct file_operations tracing_free_buffer_fops = { | ||
3743 | .write = tracing_free_buffer_write, | ||
3744 | .release = tracing_free_buffer_release, | ||
3745 | }; | ||
3746 | |||
3643 | static const struct file_operations tracing_mark_fops = { | 3747 | static const struct file_operations tracing_mark_fops = { |
3644 | .open = tracing_open_generic, | 3748 | .open = tracing_open_generic, |
3645 | .write = tracing_mark_write, | 3749 | .write = tracing_mark_write, |
@@ -3696,7 +3800,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
3696 | return 0; | 3800 | return 0; |
3697 | 3801 | ||
3698 | if (!info->spare) | 3802 | if (!info->spare) |
3699 | info->spare = ring_buffer_alloc_read_page(info->tr->buffer); | 3803 | info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); |
3700 | if (!info->spare) | 3804 | if (!info->spare) |
3701 | return -ENOMEM; | 3805 | return -ENOMEM; |
3702 | 3806 | ||
@@ -3704,8 +3808,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
3704 | if (info->read < PAGE_SIZE) | 3808 | if (info->read < PAGE_SIZE) |
3705 | goto read; | 3809 | goto read; |
3706 | 3810 | ||
3707 | info->read = 0; | ||
3708 | |||
3709 | trace_access_lock(info->cpu); | 3811 | trace_access_lock(info->cpu); |
3710 | ret = ring_buffer_read_page(info->tr->buffer, | 3812 | ret = ring_buffer_read_page(info->tr->buffer, |
3711 | &info->spare, | 3813 | &info->spare, |
@@ -3715,6 +3817,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
3715 | if (ret < 0) | 3817 | if (ret < 0) |
3716 | return 0; | 3818 | return 0; |
3717 | 3819 | ||
3820 | info->read = 0; | ||
3821 | |||
3718 | read: | 3822 | read: |
3719 | size = PAGE_SIZE - info->read; | 3823 | size = PAGE_SIZE - info->read; |
3720 | if (size > count) | 3824 | if (size > count) |
@@ -3853,7 +3957,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
3853 | 3957 | ||
3854 | ref->ref = 1; | 3958 | ref->ref = 1; |
3855 | ref->buffer = info->tr->buffer; | 3959 | ref->buffer = info->tr->buffer; |
3856 | ref->page = ring_buffer_alloc_read_page(ref->buffer); | 3960 | ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); |
3857 | if (!ref->page) { | 3961 | if (!ref->page) { |
3858 | kfree(ref); | 3962 | kfree(ref); |
3859 | break; | 3963 | break; |
@@ -3862,8 +3966,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
3862 | r = ring_buffer_read_page(ref->buffer, &ref->page, | 3966 | r = ring_buffer_read_page(ref->buffer, &ref->page, |
3863 | len, info->cpu, 1); | 3967 | len, info->cpu, 1); |
3864 | if (r < 0) { | 3968 | if (r < 0) { |
3865 | ring_buffer_free_read_page(ref->buffer, | 3969 | ring_buffer_free_read_page(ref->buffer, ref->page); |
3866 | ref->page); | ||
3867 | kfree(ref); | 3970 | kfree(ref); |
3868 | break; | 3971 | break; |
3869 | } | 3972 | } |
@@ -4099,19 +4202,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
4099 | { | 4202 | { |
4100 | struct trace_option_dentry *topt = filp->private_data; | 4203 | struct trace_option_dentry *topt = filp->private_data; |
4101 | unsigned long val; | 4204 | unsigned long val; |
4102 | char buf[64]; | ||
4103 | int ret; | 4205 | int ret; |
4104 | 4206 | ||
4105 | if (cnt >= sizeof(buf)) | 4207 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
4106 | return -EINVAL; | 4208 | if (ret) |
4107 | |||
4108 | if (copy_from_user(&buf, ubuf, cnt)) | ||
4109 | return -EFAULT; | ||
4110 | |||
4111 | buf[cnt] = 0; | ||
4112 | |||
4113 | ret = strict_strtoul(buf, 10, &val); | ||
4114 | if (ret < 0) | ||
4115 | return ret; | 4209 | return ret; |
4116 | 4210 | ||
4117 | if (val != 0 && val != 1) | 4211 | if (val != 0 && val != 1) |
@@ -4159,20 +4253,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
4159 | loff_t *ppos) | 4253 | loff_t *ppos) |
4160 | { | 4254 | { |
4161 | long index = (long)filp->private_data; | 4255 | long index = (long)filp->private_data; |
4162 | char buf[64]; | ||
4163 | unsigned long val; | 4256 | unsigned long val; |
4164 | int ret; | 4257 | int ret; |
4165 | 4258 | ||
4166 | if (cnt >= sizeof(buf)) | 4259 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
4167 | return -EINVAL; | 4260 | if (ret) |
4168 | |||
4169 | if (copy_from_user(&buf, ubuf, cnt)) | ||
4170 | return -EFAULT; | ||
4171 | |||
4172 | buf[cnt] = 0; | ||
4173 | |||
4174 | ret = strict_strtoul(buf, 10, &val); | ||
4175 | if (ret < 0) | ||
4176 | return ret; | 4261 | return ret; |
4177 | 4262 | ||
4178 | if (val != 0 && val != 1) | 4263 | if (val != 0 && val != 1) |
@@ -4365,6 +4450,9 @@ static __init int tracer_init_debugfs(void) | |||
4365 | trace_create_file("buffer_size_kb", 0644, d_tracer, | 4450 | trace_create_file("buffer_size_kb", 0644, d_tracer, |
4366 | &global_trace, &tracing_entries_fops); | 4451 | &global_trace, &tracing_entries_fops); |
4367 | 4452 | ||
4453 | trace_create_file("free_buffer", 0644, d_tracer, | ||
4454 | &global_trace, &tracing_free_buffer_fops); | ||
4455 | |||
4368 | trace_create_file("trace_marker", 0220, d_tracer, | 4456 | trace_create_file("trace_marker", 0220, d_tracer, |
4369 | NULL, &tracing_mark_fops); | 4457 | NULL, &tracing_mark_fops); |
4370 | 4458 | ||
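The new free_buffer file resizes the ring buffer to zero when its file reference is released, and turns tracing off first when the TRACE_ITER_STOP_ON_FREE option is set. Assuming the conventional debugfs mount point (adjust the path for your system), a monitoring process could hold the file open for the duration of a session so the buffers are reclaimed automatically when it exits, cleanly or not.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* Conventional debugfs location; may differ on your system. */
    int fd = open("/sys/kernel/debug/tracing/free_buffer", O_WRONLY);

    if (fd < 0) {
        perror("open free_buffer");
        return 1;
    }

    /*
     * ... run the tracing session while the file is held open ...
     *
     * Closing the file (or dying unexpectedly) releases it, which
     * resizes the ring buffer to zero and, if the STOP_ON_FREE trace
     * option is set, also turns tracing off.
     */
    close(fd);
    return 0;
}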
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 229f8591f61..616846bcfee 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -2,7 +2,7 @@ | |||
2 | #define _LINUX_KERNEL_TRACE_H | 2 | #define _LINUX_KERNEL_TRACE_H |
3 | 3 | ||
4 | #include <linux/fs.h> | 4 | #include <linux/fs.h> |
5 | #include <asm/atomic.h> | 5 | #include <linux/atomic.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/clocksource.h> | 7 | #include <linux/clocksource.h> |
8 | #include <linux/ring_buffer.h> | 8 | #include <linux/ring_buffer.h> |
@@ -278,6 +278,29 @@ struct tracer { | |||
278 | }; | 278 | }; |
279 | 279 | ||
280 | 280 | ||
281 | /* Only current can touch trace_recursion */ | ||
282 | #define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) | ||
283 | #define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) | ||
284 | |||
285 | /* Ring buffer has the 10 LSB bits to count */ | ||
286 | #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) | ||
287 | |||
288 | /* for function tracing recursion */ | ||
289 | #define TRACE_INTERNAL_BIT (1<<11) | ||
290 | #define TRACE_GLOBAL_BIT (1<<12) | ||
291 | /* | ||
292 | * Abuse of the trace_recursion. | ||
293 | * We need a way to maintain state if we are tracing the function | ||
294 | * graph in irq context, because we may want to trace a particular | ||
295 | * function called from irq even though irq tracing is off. Since this | ||
296 | * can only be modified by current, we can reuse trace_recursion. | ||
297 | */ | ||
298 | #define TRACE_IRQ_BIT (1<<13) | ||
299 | |||
300 | #define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) | ||
301 | #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) | ||
302 | #define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) | ||
303 | |||
281 | #define TRACE_PIPE_ALL_CPU -1 | 304 | #define TRACE_PIPE_ALL_CPU -1 |
282 | 305 | ||
283 | int tracer_init(struct tracer *t, struct trace_array *tr); | 306 | int tracer_init(struct tracer *t, struct trace_array *tr); |
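The recursion bookkeeping moved into trace.h packs two things into current->trace_recursion: a counter in the 10 least-significant bits and individual flag bits (TRACE_INTERNAL_BIT, TRACE_GLOBAL_BIT and the new TRACE_IRQ_BIT) above them. The toy program below exercises the same masking arithmetic on a plain unsigned long, just to make the bit layout concrete; it is not kernel code.

#include <assert.h>
#include <stdio.h>

/* Mirrors the layout used for current->trace_recursion. */
#define RECURSION_COUNT_MASK 0x3ff        /* 10 LSBs hold a counter */
#define INTERNAL_BIT        (1 << 11)
#define GLOBAL_BIT          (1 << 12)
#define IRQ_BIT             (1 << 13)

int main(void)
{
    unsigned long rec = 0;

    rec++; rec++; rec++;                  /* trace_recursion_inc() x3 */
    rec |= IRQ_BIT;                       /* trace_recursion_set(TRACE_IRQ_BIT) */

    assert((rec & RECURSION_COUNT_MASK) == 3);  /* trace_recursion_buffer() */
    assert(rec & IRQ_BIT);                      /* trace_recursion_test() */
    assert(!(rec & GLOBAL_BIT));

    rec &= ~IRQ_BIT;                      /* trace_recursion_clear() */
    rec--;                                /* trace_recursion_dec() */

    printf("count=%lu irq=%d\n", rec & RECURSION_COUNT_MASK, !!(rec & IRQ_BIT));
    return 0;
}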
@@ -389,6 +412,9 @@ void update_max_tr_single(struct trace_array *tr, | |||
389 | void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, | 412 | void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, |
390 | int skip, int pc); | 413 | int skip, int pc); |
391 | 414 | ||
415 | void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, | ||
416 | int skip, int pc, struct pt_regs *regs); | ||
417 | |||
392 | void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, | 418 | void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, |
393 | int pc); | 419 | int pc); |
394 | 420 | ||
@@ -400,6 +426,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer, | |||
400 | { | 426 | { |
401 | } | 427 | } |
402 | 428 | ||
429 | static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer, | ||
430 | unsigned long flags, int skip, | ||
431 | int pc, struct pt_regs *regs) | ||
432 | { | ||
433 | } | ||
434 | |||
403 | static inline void ftrace_trace_userstack(struct ring_buffer *buffer, | 435 | static inline void ftrace_trace_userstack(struct ring_buffer *buffer, |
404 | unsigned long flags, int pc) | 436 | unsigned long flags, int pc) |
405 | { | 437 | { |
@@ -507,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr) | |||
507 | return 1; | 539 | return 1; |
508 | 540 | ||
509 | for (i = 0; i < ftrace_graph_count; i++) { | 541 | for (i = 0; i < ftrace_graph_count; i++) { |
510 | if (addr == ftrace_graph_funcs[i]) | 542 | if (addr == ftrace_graph_funcs[i]) { |
543 | /* | ||
544 | * If no irqs are to be traced, but a set_graph_function | ||
545 | * is set, and called by an interrupt handler, we still | ||
546 | * want to trace it. | ||
547 | */ | ||
548 | if (in_irq()) | ||
549 | trace_recursion_set(TRACE_IRQ_BIT); | ||
550 | else | ||
551 | trace_recursion_clear(TRACE_IRQ_BIT); | ||
511 | return 1; | 552 | return 1; |
553 | } | ||
512 | } | 554 | } |
513 | 555 | ||
514 | return 0; | 556 | return 0; |
@@ -609,6 +651,7 @@ enum trace_iterator_flags { | |||
609 | TRACE_ITER_GRAPH_TIME = 0x80000, | 651 | TRACE_ITER_GRAPH_TIME = 0x80000, |
610 | TRACE_ITER_RECORD_CMD = 0x100000, | 652 | TRACE_ITER_RECORD_CMD = 0x100000, |
611 | TRACE_ITER_OVERWRITE = 0x200000, | 653 | TRACE_ITER_OVERWRITE = 0x200000, |
654 | TRACE_ITER_STOP_ON_FREE = 0x400000, | ||
612 | }; | 655 | }; |
613 | 656 | ||
614 | /* | 657 | /* |
@@ -677,6 +720,7 @@ struct event_subsystem { | |||
677 | struct dentry *entry; | 720 | struct dentry *entry; |
678 | struct event_filter *filter; | 721 | struct event_filter *filter; |
679 | int nr_events; | 722 | int nr_events; |
723 | int ref_count; | ||
680 | }; | 724 | }; |
681 | 725 | ||
682 | #define FILTER_PRED_INVALID ((unsigned short)-1) | 726 | #define FILTER_PRED_INVALID ((unsigned short)-1) |
@@ -784,19 +828,4 @@ extern const char *__stop___trace_bprintk_fmt[]; | |||
784 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) | 828 | FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) |
785 | #include "trace_entries.h" | 829 | #include "trace_entries.h" |
786 | 830 | ||
787 | /* Only current can touch trace_recursion */ | ||
788 | #define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) | ||
789 | #define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) | ||
790 | |||
791 | /* Ring buffer has the 10 LSB bits to count */ | ||
792 | #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) | ||
793 | |||
794 | /* for function tracing recursion */ | ||
795 | #define TRACE_INTERNAL_BIT (1<<11) | ||
796 | #define TRACE_GLOBAL_BIT (1<<12) | ||
797 | |||
798 | #define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) | ||
799 | #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) | ||
800 | #define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) | ||
801 | |||
802 | #endif /* _LINUX_KERNEL_TRACE_H */ | 831 | #endif /* _LINUX_KERNEL_TRACE_H */ |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e32744c84d9..93365907f21 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry, | |||
161 | TRACE_STACK, | 161 | TRACE_STACK, |
162 | 162 | ||
163 | F_STRUCT( | 163 | F_STRUCT( |
164 | __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) | 164 | __field( int, size ) |
165 | __dynamic_array(unsigned long, caller ) | ||
165 | ), | 166 | ), |
166 | 167 | ||
167 | F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" | 168 | F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" |
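Switching the kernel_stack entry from a fixed __array to a size field plus a __dynamic_array means each event carries exactly as many caller slots as were captured, and readers consult entry->size (or the recorded event length) instead of assuming FTRACE_STACK_ENTRIES. A plain-C sketch of that variable-length record shape, using a hypothetical struct name and a flexible array member:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical variable-length record: a count plus a flexible array. */
struct stack_record {
    int size;                 /* number of entries actually captured */
    unsigned long caller[];   /* flexible array member, sized per event */
};

static struct stack_record *record_stack(const unsigned long *calls, int n)
{
    struct stack_record *rec;

    /* Reserve exactly sizeof(*rec) + n * sizeof(unsigned long) bytes,
     * the same arithmetic the patch uses for the ring-buffer event. */
    rec = malloc(sizeof(*rec) + n * sizeof(unsigned long));
    if (!rec)
        return NULL;

    rec->size = n;
    memcpy(rec->caller, calls, n * sizeof(unsigned long));
    return rec;
}

int main(void)
{
    unsigned long calls[] = { 0xc0100000, 0xc0200000, 0xc0300000 };
    struct stack_record *rec = record_stack(calls, 3);

    for (int i = 0; rec && i < rec->size; i++)
        printf("=> (%08lx)\n", rec->caller[i]);
    free(rec);
    return 0;
}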
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 686ec399f2a..c212a7f934e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -244,6 +244,35 @@ static void ftrace_clear_events(void) | |||
244 | mutex_unlock(&event_mutex); | 244 | mutex_unlock(&event_mutex); |
245 | } | 245 | } |
246 | 246 | ||
247 | static void __put_system(struct event_subsystem *system) | ||
248 | { | ||
249 | struct event_filter *filter = system->filter; | ||
250 | |||
251 | WARN_ON_ONCE(system->ref_count == 0); | ||
252 | if (--system->ref_count) | ||
253 | return; | ||
254 | |||
255 | if (filter) { | ||
256 | kfree(filter->filter_string); | ||
257 | kfree(filter); | ||
258 | } | ||
259 | kfree(system->name); | ||
260 | kfree(system); | ||
261 | } | ||
262 | |||
263 | static void __get_system(struct event_subsystem *system) | ||
264 | { | ||
265 | WARN_ON_ONCE(system->ref_count == 0); | ||
266 | system->ref_count++; | ||
267 | } | ||
268 | |||
269 | static void put_system(struct event_subsystem *system) | ||
270 | { | ||
271 | mutex_lock(&event_mutex); | ||
272 | __put_system(system); | ||
273 | mutex_unlock(&event_mutex); | ||
274 | } | ||
275 | |||
247 | /* | 276 | /* |
248 | * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. | 277 | * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. |
249 | */ | 278 | */ |
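__get_system()/__put_system() give each event_subsystem a plain integer reference count protected by event_mutex: the subsystem directory holds one reference, every open of the subsystem's enable or filter file takes another, and the final put frees the filter, the name and the structure. A condensed user-space version of that lifetime pattern, with a pthread mutex standing in for event_mutex and a trimmed-down payload:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;

struct subsystem {
    char *name;
    int ref_count;
};

static void __put_subsystem(struct subsystem *sys)
{
    if (--sys->ref_count)
        return;                 /* still referenced, e.g. by an open file */
    free(sys->name);
    free(sys);
}

static void get_subsystem(struct subsystem *sys)   /* e.g. on open() */
{
    pthread_mutex_lock(&event_mutex);
    sys->ref_count++;
    pthread_mutex_unlock(&event_mutex);
}

static void put_subsystem(struct subsystem *sys)   /* e.g. on release() */
{
    pthread_mutex_lock(&event_mutex);
    __put_subsystem(sys);
    pthread_mutex_unlock(&event_mutex);
}

int main(void)
{
    struct subsystem *sys = calloc(1, sizeof(*sys));

    sys->name = strdup("sched");
    sys->ref_count = 1;         /* reference held by the directory */

    get_subsystem(sys);         /* file opened */
    put_subsystem(sys);         /* file closed: count back to 1 */
    put_subsystem(sys);         /* directory removed: freed here */

    printf("done\n");
    return 0;
}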
@@ -486,20 +515,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
486 | loff_t *ppos) | 515 | loff_t *ppos) |
487 | { | 516 | { |
488 | struct ftrace_event_call *call = filp->private_data; | 517 | struct ftrace_event_call *call = filp->private_data; |
489 | char buf[64]; | ||
490 | unsigned long val; | 518 | unsigned long val; |
491 | int ret; | 519 | int ret; |
492 | 520 | ||
493 | if (cnt >= sizeof(buf)) | 521 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
494 | return -EINVAL; | 522 | if (ret) |
495 | |||
496 | if (copy_from_user(&buf, ubuf, cnt)) | ||
497 | return -EFAULT; | ||
498 | |||
499 | buf[cnt] = 0; | ||
500 | |||
501 | ret = strict_strtoul(buf, 10, &val); | ||
502 | if (ret < 0) | ||
503 | return ret; | 523 | return ret; |
504 | 524 | ||
505 | ret = tracing_update_buffers(); | 525 | ret = tracing_update_buffers(); |
@@ -528,7 +548,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
528 | loff_t *ppos) | 548 | loff_t *ppos) |
529 | { | 549 | { |
530 | const char set_to_char[4] = { '?', '0', '1', 'X' }; | 550 | const char set_to_char[4] = { '?', '0', '1', 'X' }; |
531 | const char *system = filp->private_data; | 551 | struct event_subsystem *system = filp->private_data; |
532 | struct ftrace_event_call *call; | 552 | struct ftrace_event_call *call; |
533 | char buf[2]; | 553 | char buf[2]; |
534 | int set = 0; | 554 | int set = 0; |
@@ -539,7 +559,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
539 | if (!call->name || !call->class || !call->class->reg) | 559 | if (!call->name || !call->class || !call->class->reg) |
540 | continue; | 560 | continue; |
541 | 561 | ||
542 | if (system && strcmp(call->class->system, system) != 0) | 562 | if (system && strcmp(call->class->system, system->name) != 0) |
543 | continue; | 563 | continue; |
544 | 564 | ||
545 | /* | 565 | /* |
@@ -569,21 +589,13 @@ static ssize_t | |||
569 | system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, | 589 | system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, |
570 | loff_t *ppos) | 590 | loff_t *ppos) |
571 | { | 591 | { |
572 | const char *system = filp->private_data; | 592 | struct event_subsystem *system = filp->private_data; |
593 | const char *name = NULL; | ||
573 | unsigned long val; | 594 | unsigned long val; |
574 | char buf[64]; | ||
575 | ssize_t ret; | 595 | ssize_t ret; |
576 | 596 | ||
577 | if (cnt >= sizeof(buf)) | 597 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
578 | return -EINVAL; | 598 | if (ret) |
579 | |||
580 | if (copy_from_user(&buf, ubuf, cnt)) | ||
581 | return -EFAULT; | ||
582 | |||
583 | buf[cnt] = 0; | ||
584 | |||
585 | ret = strict_strtoul(buf, 10, &val); | ||
586 | if (ret < 0) | ||
587 | return ret; | 599 | return ret; |
588 | 600 | ||
589 | ret = tracing_update_buffers(); | 601 | ret = tracing_update_buffers(); |
@@ -593,7 +605,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
593 | if (val != 0 && val != 1) | 605 | if (val != 0 && val != 1) |
594 | return -EINVAL; | 606 | return -EINVAL; |
595 | 607 | ||
596 | ret = __ftrace_set_clr_event(NULL, system, NULL, val); | 608 | /* |
609 | * Opening of "enable" adds a ref count to system, | ||
610 | * so the name is safe to use. | ||
611 | */ | ||
612 | if (system) | ||
613 | name = system->name; | ||
614 | |||
615 | ret = __ftrace_set_clr_event(NULL, name, NULL, val); | ||
597 | if (ret) | 616 | if (ret) |
598 | goto out; | 617 | goto out; |
599 | 618 | ||
@@ -826,6 +845,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
826 | return cnt; | 845 | return cnt; |
827 | } | 846 | } |
828 | 847 | ||
848 | static LIST_HEAD(event_subsystems); | ||
849 | |||
850 | static int subsystem_open(struct inode *inode, struct file *filp) | ||
851 | { | ||
852 | struct event_subsystem *system = NULL; | ||
853 | int ret; | ||
854 | |||
855 | if (!inode->i_private) | ||
856 | goto skip_search; | ||
857 | |||
858 | /* Make sure the system still exists */ | ||
859 | mutex_lock(&event_mutex); | ||
860 | list_for_each_entry(system, &event_subsystems, list) { | ||
861 | if (system == inode->i_private) { | ||
862 | /* Don't open systems with no events */ | ||
863 | if (!system->nr_events) { | ||
864 | system = NULL; | ||
865 | break; | ||
866 | } | ||
867 | __get_system(system); | ||
868 | break; | ||
869 | } | ||
870 | } | ||
871 | mutex_unlock(&event_mutex); | ||
872 | |||
873 | if (system != inode->i_private) | ||
874 | return -ENODEV; | ||
875 | |||
876 | skip_search: | ||
877 | ret = tracing_open_generic(inode, filp); | ||
878 | if (ret < 0 && system) | ||
879 | put_system(system); | ||
880 | |||
881 | return ret; | ||
882 | } | ||
883 | |||
884 | static int subsystem_release(struct inode *inode, struct file *file) | ||
885 | { | ||
886 | struct event_subsystem *system = inode->i_private; | ||
887 | |||
888 | if (system) | ||
889 | put_system(system); | ||
890 | |||
891 | return 0; | ||
892 | } | ||
893 | |||
829 | static ssize_t | 894 | static ssize_t |
830 | subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, | 895 | subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, |
831 | loff_t *ppos) | 896 | loff_t *ppos) |
@@ -963,17 +1028,19 @@ static const struct file_operations ftrace_event_filter_fops = { | |||
963 | }; | 1028 | }; |
964 | 1029 | ||
965 | static const struct file_operations ftrace_subsystem_filter_fops = { | 1030 | static const struct file_operations ftrace_subsystem_filter_fops = { |
966 | .open = tracing_open_generic, | 1031 | .open = subsystem_open, |
967 | .read = subsystem_filter_read, | 1032 | .read = subsystem_filter_read, |
968 | .write = subsystem_filter_write, | 1033 | .write = subsystem_filter_write, |
969 | .llseek = default_llseek, | 1034 | .llseek = default_llseek, |
1035 | .release = subsystem_release, | ||
970 | }; | 1036 | }; |
971 | 1037 | ||
972 | static const struct file_operations ftrace_system_enable_fops = { | 1038 | static const struct file_operations ftrace_system_enable_fops = { |
973 | .open = tracing_open_generic, | 1039 | .open = subsystem_open, |
974 | .read = system_enable_read, | 1040 | .read = system_enable_read, |
975 | .write = system_enable_write, | 1041 | .write = system_enable_write, |
976 | .llseek = default_llseek, | 1042 | .llseek = default_llseek, |
1043 | .release = subsystem_release, | ||
977 | }; | 1044 | }; |
978 | 1045 | ||
979 | static const struct file_operations ftrace_show_header_fops = { | 1046 | static const struct file_operations ftrace_show_header_fops = { |
@@ -1002,8 +1069,6 @@ static struct dentry *event_trace_events_dir(void) | |||
1002 | return d_events; | 1069 | return d_events; |
1003 | } | 1070 | } |
1004 | 1071 | ||
1005 | static LIST_HEAD(event_subsystems); | ||
1006 | |||
1007 | static struct dentry * | 1072 | static struct dentry * |
1008 | event_subsystem_dir(const char *name, struct dentry *d_events) | 1073 | event_subsystem_dir(const char *name, struct dentry *d_events) |
1009 | { | 1074 | { |
@@ -1035,6 +1100,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) | |||
1035 | } | 1100 | } |
1036 | 1101 | ||
1037 | system->nr_events = 1; | 1102 | system->nr_events = 1; |
1103 | system->ref_count = 1; | ||
1038 | system->name = kstrdup(name, GFP_KERNEL); | 1104 | system->name = kstrdup(name, GFP_KERNEL); |
1039 | if (!system->name) { | 1105 | if (!system->name) { |
1040 | debugfs_remove(system->entry); | 1106 | debugfs_remove(system->entry); |
@@ -1062,8 +1128,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) | |||
1062 | "'%s/filter' entry\n", name); | 1128 | "'%s/filter' entry\n", name); |
1063 | } | 1129 | } |
1064 | 1130 | ||
1065 | trace_create_file("enable", 0644, system->entry, | 1131 | trace_create_file("enable", 0644, system->entry, system, |
1066 | (void *)system->name, | ||
1067 | &ftrace_system_enable_fops); | 1132 | &ftrace_system_enable_fops); |
1068 | 1133 | ||
1069 | return system->entry; | 1134 | return system->entry; |
@@ -1184,16 +1249,9 @@ static void remove_subsystem_dir(const char *name) | |||
1184 | list_for_each_entry(system, &event_subsystems, list) { | 1249 | list_for_each_entry(system, &event_subsystems, list) { |
1185 | if (strcmp(system->name, name) == 0) { | 1250 | if (strcmp(system->name, name) == 0) { |
1186 | if (!--system->nr_events) { | 1251 | if (!--system->nr_events) { |
1187 | struct event_filter *filter = system->filter; | ||
1188 | |||
1189 | debugfs_remove_recursive(system->entry); | 1252 | debugfs_remove_recursive(system->entry); |
1190 | list_del(&system->list); | 1253 | list_del(&system->list); |
1191 | if (filter) { | 1254 | __put_system(system); |
1192 | kfree(filter->filter_string); | ||
1193 | kfree(filter); | ||
1194 | } | ||
1195 | kfree(system->name); | ||
1196 | kfree(system); | ||
1197 | } | 1255 | } |
1198 | break; | 1256 | break; |
1199 | } | 1257 | } |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8008ddcfbf2..bd3c6369f80 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -1766,7 +1766,7 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1766 | * replace the filter for the call. | 1766 | * replace the filter for the call. |
1767 | */ | 1767 | */ |
1768 | filter = call->filter; | 1768 | filter = call->filter; |
1769 | call->filter = filter_item->filter; | 1769 | rcu_assign_pointer(call->filter, filter_item->filter); |
1770 | filter_item->filter = filter; | 1770 | filter_item->filter = filter; |
1771 | 1771 | ||
1772 | fail = false; | 1772 | fail = false; |
@@ -1821,7 +1821,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | |||
1821 | filter = call->filter; | 1821 | filter = call->filter; |
1822 | if (!filter) | 1822 | if (!filter) |
1823 | goto out_unlock; | 1823 | goto out_unlock; |
1824 | call->filter = NULL; | 1824 | RCU_INIT_POINTER(call->filter, NULL); |
1825 | /* Make sure the filter is not being used */ | 1825 | /* Make sure the filter is not being used */ |
1826 | synchronize_sched(); | 1826 | synchronize_sched(); |
1827 | __free_filter(filter); | 1827 | __free_filter(filter); |
@@ -1862,7 +1862,7 @@ out: | |||
1862 | * string | 1862 | * string |
1863 | */ | 1863 | */ |
1864 | tmp = call->filter; | 1864 | tmp = call->filter; |
1865 | call->filter = filter; | 1865 | rcu_assign_pointer(call->filter, filter); |
1866 | if (tmp) { | 1866 | if (tmp) { |
1867 | /* Make sure the call is done with the filter */ | 1867 | /* Make sure the call is done with the filter */ |
1868 | synchronize_sched(); | 1868 | synchronize_sched(); |
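The filter updates now publish the replacement with rcu_assign_pointer() (or clear it with RCU_INIT_POINTER()) and free the old filter only after synchronize_sched() guarantees no tracer is still dereferencing it. Real RCU cannot be reproduced in a few lines of user space; the fragment below is only a crude reader-count stand-in that illustrates the ordering the patch relies on: publish the new pointer first, wait for readers to drain, then free the old object.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Crude stand-ins for call->filter and for "readers currently inside". */
static _Atomic(char *) filter = NULL;
static atomic_int active_readers = 0;

static void reader(void)
{
    atomic_fetch_add(&active_readers, 1);
    char *f = atomic_load(&filter);            /* rcu_dereference() analogue */
    if (f)
        printf("reader sees: %s\n", f);
    atomic_fetch_sub(&active_readers, 1);
}

/* Publish a new filter, then retire the old one once readers have drained. */
static void replace_filter(char *new_filter)
{
    char *old = atomic_exchange(&filter, new_filter); /* rcu_assign_pointer() */

    while (atomic_load(&active_readers))       /* poor man's synchronize_sched() */
        ;                                      /* spin until no reader is inside */

    free(old);                                 /* now safe: nobody can still see it */
}

int main(void)
{
    replace_filter(strdup("pid == 1"));
    reader();
    replace_filter(strdup("pid != 1"));
    reader();
    replace_filter(NULL);                      /* like RCU_INIT_POINTER(..., NULL) */
    return 0;
}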
@@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, | |||
1886 | 1886 | ||
1887 | mutex_lock(&event_mutex); | 1887 | mutex_lock(&event_mutex); |
1888 | 1888 | ||
1889 | /* Make sure the system still has events */ | ||
1890 | if (!system->nr_events) { | ||
1891 | err = -ENODEV; | ||
1892 | goto out_unlock; | ||
1893 | } | ||
1894 | |||
1889 | if (!strcmp(strstrip(filter_string), "0")) { | 1895 | if (!strcmp(strstrip(filter_string), "0")) { |
1890 | filter_free_subsystem_preds(system); | 1896 | filter_free_subsystem_preds(system); |
1891 | remove_filter_string(system->filter); | 1897 | remove_filter_string(system->filter); |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8d0e1cc4e97..c7b0c6a7db0 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -324,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) | |||
324 | } | 324 | } |
325 | 325 | ||
326 | static int | 326 | static int |
327 | ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) | 327 | ftrace_trace_onoff_callback(struct ftrace_hash *hash, |
328 | char *glob, char *cmd, char *param, int enable) | ||
328 | { | 329 | { |
329 | struct ftrace_probe_ops *ops; | 330 | struct ftrace_probe_ops *ops; |
330 | void *count = (void *)-1; | 331 | void *count = (void *)-1; |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 962cdb24ed8..a7d2a4c653d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = { | |||
74 | 74 | ||
75 | static struct trace_array *graph_array; | 75 | static struct trace_array *graph_array; |
76 | 76 | ||
77 | /* | ||
78 | * The DURATION column is also used to display IRQ signs; the | ||
79 | * following values are used by print_graph_irq and others | ||
80 | * to fill in space in the DURATION column. | ||
81 | */ | ||
82 | enum { | ||
83 | DURATION_FILL_FULL = -1, | ||
84 | DURATION_FILL_START = -2, | ||
85 | DURATION_FILL_END = -3, | ||
86 | }; | ||
87 | |||
88 | static enum print_line_t | ||
89 | print_graph_duration(unsigned long long duration, struct trace_seq *s, | ||
90 | u32 flags); | ||
77 | 91 | ||
78 | /* Add a function return address to the trace stack on thread info.*/ | 92 | /* Add a function return address to the trace stack on thread info.*/ |
79 | int | 93 | int |
@@ -213,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr, | |||
213 | 227 | ||
214 | static inline int ftrace_graph_ignore_irqs(void) | 228 | static inline int ftrace_graph_ignore_irqs(void) |
215 | { | 229 | { |
216 | if (!ftrace_graph_skip_irqs) | 230 | if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) |
217 | return 0; | 231 | return 0; |
218 | 232 | ||
219 | return in_irq(); | 233 | return in_irq(); |
@@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter, | |||
577 | return next; | 591 | return next; |
578 | } | 592 | } |
579 | 593 | ||
580 | /* Signal a overhead of time execution to the output */ | ||
581 | static int | ||
582 | print_graph_overhead(unsigned long long duration, struct trace_seq *s, | ||
583 | u32 flags) | ||
584 | { | ||
585 | /* If duration disappear, we don't need anything */ | ||
586 | if (!(flags & TRACE_GRAPH_PRINT_DURATION)) | ||
587 | return 1; | ||
588 | |||
589 | /* Non nested entry or return */ | ||
590 | if (duration == -1) | ||
591 | return trace_seq_printf(s, " "); | ||
592 | |||
593 | if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { | ||
594 | /* Duration exceeded 100 msecs */ | ||
595 | if (duration > 100000ULL) | ||
596 | return trace_seq_printf(s, "! "); | ||
597 | |||
598 | /* Duration exceeded 10 msecs */ | ||
599 | if (duration > 10000ULL) | ||
600 | return trace_seq_printf(s, "+ "); | ||
601 | } | ||
602 | |||
603 | return trace_seq_printf(s, " "); | ||
604 | } | ||
605 | |||
606 | static int print_graph_abs_time(u64 t, struct trace_seq *s) | 594 | static int print_graph_abs_time(u64 t, struct trace_seq *s) |
607 | { | 595 | { |
608 | unsigned long usecs_rem; | 596 | unsigned long usecs_rem; |
@@ -625,34 +613,36 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, | |||
625 | addr >= (unsigned long)__irqentry_text_end) | 613 | addr >= (unsigned long)__irqentry_text_end) |
626 | return TRACE_TYPE_UNHANDLED; | 614 | return TRACE_TYPE_UNHANDLED; |
627 | 615 | ||
628 | /* Absolute time */ | 616 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { |
629 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { | 617 | /* Absolute time */ |
630 | ret = print_graph_abs_time(iter->ts, s); | 618 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { |
631 | if (!ret) | 619 | ret = print_graph_abs_time(iter->ts, s); |
632 | return TRACE_TYPE_PARTIAL_LINE; | 620 | if (!ret) |
633 | } | 621 | return TRACE_TYPE_PARTIAL_LINE; |
622 | } | ||
634 | 623 | ||
635 | /* Cpu */ | 624 | /* Cpu */ |
636 | if (flags & TRACE_GRAPH_PRINT_CPU) { | 625 | if (flags & TRACE_GRAPH_PRINT_CPU) { |
637 | ret = print_graph_cpu(s, cpu); | 626 | ret = print_graph_cpu(s, cpu); |
638 | if (ret == TRACE_TYPE_PARTIAL_LINE) | 627 | if (ret == TRACE_TYPE_PARTIAL_LINE) |
639 | return TRACE_TYPE_PARTIAL_LINE; | 628 | return TRACE_TYPE_PARTIAL_LINE; |
640 | } | 629 | } |
641 | 630 | ||
642 | /* Proc */ | 631 | /* Proc */ |
643 | if (flags & TRACE_GRAPH_PRINT_PROC) { | 632 | if (flags & TRACE_GRAPH_PRINT_PROC) { |
644 | ret = print_graph_proc(s, pid); | 633 | ret = print_graph_proc(s, pid); |
645 | if (ret == TRACE_TYPE_PARTIAL_LINE) | 634 | if (ret == TRACE_TYPE_PARTIAL_LINE) |
646 | return TRACE_TYPE_PARTIAL_LINE; | 635 | return TRACE_TYPE_PARTIAL_LINE; |
647 | ret = trace_seq_printf(s, " | "); | 636 | ret = trace_seq_printf(s, " | "); |
648 | if (!ret) | 637 | if (!ret) |
649 | return TRACE_TYPE_PARTIAL_LINE; | 638 | return TRACE_TYPE_PARTIAL_LINE; |
639 | } | ||
650 | } | 640 | } |
651 | 641 | ||
652 | /* No overhead */ | 642 | /* No overhead */ |
653 | ret = print_graph_overhead(-1, s, flags); | 643 | ret = print_graph_duration(DURATION_FILL_START, s, flags); |
654 | if (!ret) | 644 | if (ret != TRACE_TYPE_HANDLED) |
655 | return TRACE_TYPE_PARTIAL_LINE; | 645 | return ret; |
656 | 646 | ||
657 | if (type == TRACE_GRAPH_ENT) | 647 | if (type == TRACE_GRAPH_ENT) |
658 | ret = trace_seq_printf(s, "==========>"); | 648 | ret = trace_seq_printf(s, "==========>"); |
@@ -662,9 +652,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, | |||
662 | if (!ret) | 652 | if (!ret) |
663 | return TRACE_TYPE_PARTIAL_LINE; | 653 | return TRACE_TYPE_PARTIAL_LINE; |
664 | 654 | ||
665 | /* Don't close the duration column if haven't one */ | 655 | ret = print_graph_duration(DURATION_FILL_END, s, flags); |
666 | if (flags & TRACE_GRAPH_PRINT_DURATION) | 656 | if (ret != TRACE_TYPE_HANDLED) |
667 | trace_seq_printf(s, " |"); | 657 | return ret; |
658 | |||
668 | ret = trace_seq_printf(s, "\n"); | 659 | ret = trace_seq_printf(s, "\n"); |
669 | 660 | ||
670 | if (!ret) | 661 | if (!ret) |
@@ -716,9 +707,49 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) | |||
716 | } | 707 | } |
717 | 708 | ||
718 | static enum print_line_t | 709 | static enum print_line_t |
719 | print_graph_duration(unsigned long long duration, struct trace_seq *s) | 710 | print_graph_duration(unsigned long long duration, struct trace_seq *s, |
711 | u32 flags) | ||
720 | { | 712 | { |
721 | int ret; | 713 | int ret = -1; |
714 | |||
715 | if (!(flags & TRACE_GRAPH_PRINT_DURATION) || | ||
716 | !(trace_flags & TRACE_ITER_CONTEXT_INFO)) | ||
717 | return TRACE_TYPE_HANDLED; | ||
718 | |||
719 | /* No real data, just filling the column with spaces */ | ||
720 | switch (duration) { | ||
721 | case DURATION_FILL_FULL: | ||
722 | ret = trace_seq_printf(s, " | "); | ||
723 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | ||
724 | case DURATION_FILL_START: | ||
725 | ret = trace_seq_printf(s, " "); | ||
726 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | ||
727 | case DURATION_FILL_END: | ||
728 | ret = trace_seq_printf(s, " |"); | ||
729 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | ||
730 | } | ||
731 | |||
732 | /* Signal an overhead of execution time to the output */ | ||
733 | if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { | ||
734 | /* Duration exceeded 100 msecs */ | ||
735 | if (duration > 100000ULL) | ||
736 | ret = trace_seq_printf(s, "! "); | ||
737 | /* Duration exceeded 10 msecs */ | ||
738 | else if (duration > 10000ULL) | ||
739 | ret = trace_seq_printf(s, "+ "); | ||
740 | } | ||
741 | |||
742 | /* | ||
743 | * The -1 means we either did not exceed the duration thresholds | ||
744 | * or we don't want to print out the overhead. Either way we need | ||
745 | * to fill out the space. | ||
746 | */ | ||
747 | if (ret == -1) | ||
748 | ret = trace_seq_printf(s, " "); | ||
749 | |||
750 | /* Catch here any failure that happened above */ | ||
751 | if (!ret) | ||
752 | return TRACE_TYPE_PARTIAL_LINE; | ||
722 | 753 | ||
723 | ret = trace_print_graph_duration(duration, s); | 754 | ret = trace_print_graph_duration(duration, s); |
724 | if (ret != TRACE_TYPE_HANDLED) | 755 | if (ret != TRACE_TYPE_HANDLED) |
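print_graph_duration() now covers both the real duration and the three DURATION_FILL_* sentinels, so callers no longer pair a separate print_graph_overhead() call with the duration print. The helper below mimics that interface in user space: sentinel values only pad the column, while real values get the '!'/'+' overhead marker at the thresholds named in the patch's comments. Column widths and output format are illustrative.

#include <stdio.h>

/* Same sentinel idea as DURATION_FILL_FULL/START/END in the patch. */
enum { FILL_FULL = -1, FILL_START = -2, FILL_END = -3 };

static void print_duration(long long duration)
{
    /* Sentinels only fill the column; no duration is printed. */
    switch (duration) {
    case FILL_FULL:  printf("            |  "); return;
    case FILL_START: printf("  ");              return;
    case FILL_END:   printf(" |");              return;
    }

    /* Overhead markers, same numeric thresholds as the patch. */
    if (duration > 100000LL)
        printf("! ");
    else if (duration > 10000LL)
        printf("+ ");
    else
        printf("  ");

    printf("%9lld |", duration);
}

int main(void)
{
    print_duration(FILL_FULL);  printf("  nested_entry() {\n");
    print_duration(123);        printf("  fast_leaf();\n");
    print_duration(250000);     printf("  slow_leaf();\n");
    return 0;
}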
@@ -767,18 +798,11 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
767 | cpu_data->enter_funcs[call->depth] = 0; | 798 | cpu_data->enter_funcs[call->depth] = 0; |
768 | } | 799 | } |
769 | 800 | ||
770 | /* Overhead */ | 801 | /* Overhead and duration */ |
771 | ret = print_graph_overhead(duration, s, flags); | 802 | ret = print_graph_duration(duration, s, flags); |
772 | if (!ret) | 803 | if (ret == TRACE_TYPE_PARTIAL_LINE) |
773 | return TRACE_TYPE_PARTIAL_LINE; | 804 | return TRACE_TYPE_PARTIAL_LINE; |
774 | 805 | ||
775 | /* Duration */ | ||
776 | if (flags & TRACE_GRAPH_PRINT_DURATION) { | ||
777 | ret = print_graph_duration(duration, s); | ||
778 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
779 | return TRACE_TYPE_PARTIAL_LINE; | ||
780 | } | ||
781 | |||
782 | /* Function */ | 806 | /* Function */ |
783 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { | 807 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { |
784 | ret = trace_seq_printf(s, " "); | 808 | ret = trace_seq_printf(s, " "); |
@@ -815,17 +839,10 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
815 | cpu_data->enter_funcs[call->depth] = call->func; | 839 | cpu_data->enter_funcs[call->depth] = call->func; |
816 | } | 840 | } |
817 | 841 | ||
818 | /* No overhead */ | ||
819 | ret = print_graph_overhead(-1, s, flags); | ||
820 | if (!ret) | ||
821 | return TRACE_TYPE_PARTIAL_LINE; | ||
822 | |||
823 | /* No time */ | 842 | /* No time */ |
824 | if (flags & TRACE_GRAPH_PRINT_DURATION) { | 843 | ret = print_graph_duration(DURATION_FILL_FULL, s, flags); |
825 | ret = trace_seq_printf(s, " | "); | 844 | if (ret != TRACE_TYPE_HANDLED) |
826 | if (!ret) | 845 | return ret; |
827 | return TRACE_TYPE_PARTIAL_LINE; | ||
828 | } | ||
829 | 846 | ||
830 | /* Function */ | 847 | /* Function */ |
831 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { | 848 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { |
@@ -865,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, | |||
865 | return TRACE_TYPE_PARTIAL_LINE; | 882 | return TRACE_TYPE_PARTIAL_LINE; |
866 | } | 883 | } |
867 | 884 | ||
885 | if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) | ||
886 | return 0; | ||
887 | |||
868 | /* Absolute time */ | 888 | /* Absolute time */ |
869 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { | 889 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { |
870 | ret = print_graph_abs_time(iter->ts, s); | 890 | ret = print_graph_abs_time(iter->ts, s); |
@@ -1078,18 +1098,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
1078 | if (print_graph_prologue(iter, s, 0, 0, flags)) | 1098 | if (print_graph_prologue(iter, s, 0, 0, flags)) |
1079 | return TRACE_TYPE_PARTIAL_LINE; | 1099 | return TRACE_TYPE_PARTIAL_LINE; |
1080 | 1100 | ||
1081 | /* Overhead */ | 1101 | /* Overhead and duration */ |
1082 | ret = print_graph_overhead(duration, s, flags); | 1102 | ret = print_graph_duration(duration, s, flags); |
1083 | if (!ret) | 1103 | if (ret == TRACE_TYPE_PARTIAL_LINE) |
1084 | return TRACE_TYPE_PARTIAL_LINE; | 1104 | return TRACE_TYPE_PARTIAL_LINE; |
1085 | 1105 | ||
1086 | /* Duration */ | ||
1087 | if (flags & TRACE_GRAPH_PRINT_DURATION) { | ||
1088 | ret = print_graph_duration(duration, s); | ||
1089 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
1090 | return TRACE_TYPE_PARTIAL_LINE; | ||
1091 | } | ||
1092 | |||
1093 | /* Closing brace */ | 1106 | /* Closing brace */ |
1094 | for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { | 1107 | for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { |
1095 | ret = trace_seq_printf(s, " "); | 1108 | ret = trace_seq_printf(s, " "); |
@@ -1146,17 +1159,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
1146 | if (print_graph_prologue(iter, s, 0, 0, flags)) | 1159 | if (print_graph_prologue(iter, s, 0, 0, flags)) |
1147 | return TRACE_TYPE_PARTIAL_LINE; | 1160 | return TRACE_TYPE_PARTIAL_LINE; |
1148 | 1161 | ||
1149 | /* No overhead */ | ||
1150 | ret = print_graph_overhead(-1, s, flags); | ||
1151 | if (!ret) | ||
1152 | return TRACE_TYPE_PARTIAL_LINE; | ||
1153 | |||
1154 | /* No time */ | 1162 | /* No time */ |
1155 | if (flags & TRACE_GRAPH_PRINT_DURATION) { | 1163 | ret = print_graph_duration(DURATION_FILL_FULL, s, flags); |
1156 | ret = trace_seq_printf(s, " | "); | 1164 | if (ret != TRACE_TYPE_HANDLED) |
1157 | if (!ret) | 1165 | return ret; |
1158 | return TRACE_TYPE_PARTIAL_LINE; | ||
1159 | } | ||
1160 | 1166 | ||
1161 | /* Indentation */ | 1167 | /* Indentation */ |
1162 | if (depth > 0) | 1168 | if (depth > 0) |
@@ -1207,7 +1213,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
1207 | 1213 | ||
1208 | 1214 | ||
1209 | enum print_line_t | 1215 | enum print_line_t |
1210 | __print_graph_function_flags(struct trace_iterator *iter, u32 flags) | 1216 | print_graph_function_flags(struct trace_iterator *iter, u32 flags) |
1211 | { | 1217 | { |
1212 | struct ftrace_graph_ent_entry *field; | 1218 | struct ftrace_graph_ent_entry *field; |
1213 | struct fgraph_data *data = iter->private; | 1219 | struct fgraph_data *data = iter->private; |
@@ -1270,18 +1276,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags) | |||
1270 | static enum print_line_t | 1276 | static enum print_line_t |
1271 | print_graph_function(struct trace_iterator *iter) | 1277 | print_graph_function(struct trace_iterator *iter) |
1272 | { | 1278 | { |
1273 | return __print_graph_function_flags(iter, tracer_flags.val); | 1279 | return print_graph_function_flags(iter, tracer_flags.val); |
1274 | } | ||
1275 | |||
1276 | enum print_line_t print_graph_function_flags(struct trace_iterator *iter, | ||
1277 | u32 flags) | ||
1278 | { | ||
1279 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | ||
1280 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
1281 | else | ||
1282 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
1283 | |||
1284 | return __print_graph_function_flags(iter, flags); | ||
1285 | } | 1280 | } |
1286 | 1281 | ||
1287 | static enum print_line_t | 1282 | static enum print_line_t |
@@ -1309,8 +1304,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) | |||
1309 | seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); | 1304 | seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); |
1310 | seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); | 1305 | seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); |
1311 | seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); | 1306 | seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); |
1312 | seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); | 1307 | seq_printf(s, "#%.*s||| / \n", size, spaces); |
1313 | seq_printf(s, "#%.*s|||| / \n", size, spaces); | ||
1314 | } | 1308 | } |
1315 | 1309 | ||
1316 | static void __print_graph_headers_flags(struct seq_file *s, u32 flags) | 1310 | static void __print_graph_headers_flags(struct seq_file *s, u32 flags) |
@@ -1329,7 +1323,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) | |||
1329 | if (flags & TRACE_GRAPH_PRINT_PROC) | 1323 | if (flags & TRACE_GRAPH_PRINT_PROC) |
1330 | seq_printf(s, " TASK/PID "); | 1324 | seq_printf(s, " TASK/PID "); |
1331 | if (lat) | 1325 | if (lat) |
1332 | seq_printf(s, "|||||"); | 1326 | seq_printf(s, "||||"); |
1333 | if (flags & TRACE_GRAPH_PRINT_DURATION) | 1327 | if (flags & TRACE_GRAPH_PRINT_DURATION) |
1334 | seq_printf(s, " DURATION "); | 1328 | seq_printf(s, " DURATION "); |
1335 | seq_printf(s, " FUNCTION CALLS\n"); | 1329 | seq_printf(s, " FUNCTION CALLS\n"); |
@@ -1343,7 +1337,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) | |||
1343 | if (flags & TRACE_GRAPH_PRINT_PROC) | 1337 | if (flags & TRACE_GRAPH_PRINT_PROC) |
1344 | seq_printf(s, " | | "); | 1338 | seq_printf(s, " | | "); |
1345 | if (lat) | 1339 | if (lat) |
1346 | seq_printf(s, "|||||"); | 1340 | seq_printf(s, "||||"); |
1347 | if (flags & TRACE_GRAPH_PRINT_DURATION) | 1341 | if (flags & TRACE_GRAPH_PRINT_DURATION) |
1348 | seq_printf(s, " | | "); | 1342 | seq_printf(s, " | | "); |
1349 | seq_printf(s, " | | | |\n"); | 1343 | seq_printf(s, " | | | |\n"); |
@@ -1358,15 +1352,16 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) | |||
1358 | { | 1352 | { |
1359 | struct trace_iterator *iter = s->private; | 1353 | struct trace_iterator *iter = s->private; |
1360 | 1354 | ||
1355 | if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) | ||
1356 | return; | ||
1357 | |||
1361 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { | 1358 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { |
1362 | /* print nothing if the buffers are empty */ | 1359 | /* print nothing if the buffers are empty */ |
1363 | if (trace_empty(iter)) | 1360 | if (trace_empty(iter)) |
1364 | return; | 1361 | return; |
1365 | 1362 | ||
1366 | print_trace_header(s, iter); | 1363 | print_trace_header(s, iter); |
1367 | flags |= TRACE_GRAPH_PRINT_DURATION; | 1364 | } |
1368 | } else | ||
1369 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
1370 | 1365 | ||
1371 | __print_graph_headers_flags(s, flags); | 1366 | __print_graph_headers_flags(s, flags); |
1372 | } | 1367 | } |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index c77424be284..667aa8cc0cf 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -226,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter) | |||
226 | } | 226 | } |
227 | 227 | ||
228 | #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ | 228 | #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ |
229 | TRACE_GRAPH_PRINT_PROC) | 229 | TRACE_GRAPH_PRINT_PROC | \ |
230 | TRACE_GRAPH_PRINT_ABS_TIME | \ | ||
231 | TRACE_GRAPH_PRINT_DURATION) | ||
230 | 232 | ||
231 | static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) | 233 | static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) |
232 | { | 234 | { |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 27d13b36b8b..00d527c945a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -343,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref) | |||
343 | DEFINE_FETCH_deref(string) | 343 | DEFINE_FETCH_deref(string) |
344 | DEFINE_FETCH_deref(string_size) | 344 | DEFINE_FETCH_deref(string_size) |
345 | 345 | ||
346 | static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) | ||
347 | { | ||
348 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
349 | update_deref_fetch_param(data->orig.data); | ||
350 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
351 | update_symbol_cache(data->orig.data); | ||
352 | } | ||
353 | |||
346 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | 354 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) |
347 | { | 355 | { |
348 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | 356 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) |
@@ -377,6 +385,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield) | |||
377 | #define fetch_bitfield_string_size NULL | 385 | #define fetch_bitfield_string_size NULL |
378 | 386 | ||
379 | static __kprobes void | 387 | static __kprobes void |
388 | update_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
389 | { | ||
390 | /* | ||
391 | * Don't check the bitfield itself, because this must be the | ||
392 | * last fetch function. | ||
393 | */ | ||
394 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
395 | update_deref_fetch_param(data->orig.data); | ||
396 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
397 | update_symbol_cache(data->orig.data); | ||
398 | } | ||
399 | |||
400 | static __kprobes void | ||
380 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | 401 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) |
381 | { | 402 | { |
382 | /* | 403 | /* |
@@ -389,6 +410,7 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) | |||
389 | free_symbol_cache(data->orig.data); | 410 | free_symbol_cache(data->orig.data); |
390 | kfree(data); | 411 | kfree(data); |
391 | } | 412 | } |
413 | |||
392 | /* Default (unsigned long) fetch type */ | 414 | /* Default (unsigned long) fetch type */ |
393 | #define __DEFAULT_FETCH_TYPE(t) u##t | 415 | #define __DEFAULT_FETCH_TYPE(t) u##t |
394 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | 416 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) |
@@ -536,6 +558,7 @@ struct probe_arg { | |||
536 | /* Flags for trace_probe */ | 558 | /* Flags for trace_probe */ |
537 | #define TP_FLAG_TRACE 1 | 559 | #define TP_FLAG_TRACE 1 |
538 | #define TP_FLAG_PROFILE 2 | 560 | #define TP_FLAG_PROFILE 2 |
561 | #define TP_FLAG_REGISTERED 4 | ||
539 | 562 | ||
540 | struct trace_probe { | 563 | struct trace_probe { |
541 | struct list_head list; | 564 | struct list_head list; |
@@ -555,16 +578,49 @@ struct trace_probe { | |||
555 | (sizeof(struct probe_arg) * (n))) | 578 | (sizeof(struct probe_arg) * (n))) |
556 | 579 | ||
557 | 580 | ||
558 | static __kprobes int probe_is_return(struct trace_probe *tp) | 581 | static __kprobes int trace_probe_is_return(struct trace_probe *tp) |
559 | { | 582 | { |
560 | return tp->rp.handler != NULL; | 583 | return tp->rp.handler != NULL; |
561 | } | 584 | } |
562 | 585 | ||
563 | static __kprobes const char *probe_symbol(struct trace_probe *tp) | 586 | static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) |
564 | { | 587 | { |
565 | return tp->symbol ? tp->symbol : "unknown"; | 588 | return tp->symbol ? tp->symbol : "unknown"; |
566 | } | 589 | } |
567 | 590 | ||
591 | static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) | ||
592 | { | ||
593 | return tp->rp.kp.offset; | ||
594 | } | ||
595 | |||
596 | static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) | ||
597 | { | ||
598 | return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); | ||
599 | } | ||
600 | |||
601 | static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) | ||
602 | { | ||
603 | return !!(tp->flags & TP_FLAG_REGISTERED); | ||
604 | } | ||
605 | |||
606 | static __kprobes bool trace_probe_has_gone(struct trace_probe *tp) | ||
607 | { | ||
608 | return !!(kprobe_gone(&tp->rp.kp)); | ||
609 | } | ||
610 | |||
611 | static __kprobes bool trace_probe_within_module(struct trace_probe *tp, | ||
612 | struct module *mod) | ||
613 | { | ||
614 | int len = strlen(mod->name); | ||
615 | const char *name = trace_probe_symbol(tp); | ||
616 | return strncmp(mod->name, name, len) == 0 && name[len] == ':'; | ||
617 | } | ||
618 | |||
619 | static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) | ||
620 | { | ||
621 | return !!strchr(trace_probe_symbol(tp), ':'); | ||
622 | } | ||
623 | |||
568 | static int register_probe_event(struct trace_probe *tp); | 624 | static int register_probe_event(struct trace_probe *tp); |
569 | static void unregister_probe_event(struct trace_probe *tp); | 625 | static void unregister_probe_event(struct trace_probe *tp); |
570 | 626 | ||
@@ -646,6 +702,16 @@ error: | |||
646 | return ERR_PTR(ret); | 702 | return ERR_PTR(ret); |
647 | } | 703 | } |
648 | 704 | ||
705 | static void update_probe_arg(struct probe_arg *arg) | ||
706 | { | ||
707 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
708 | update_bitfield_fetch_param(arg->fetch.data); | ||
709 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
710 | update_deref_fetch_param(arg->fetch.data); | ||
711 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
712 | update_symbol_cache(arg->fetch.data); | ||
713 | } | ||
714 | |||
649 | static void free_probe_arg(struct probe_arg *arg) | 715 | static void free_probe_arg(struct probe_arg *arg) |
650 | { | 716 | { |
651 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | 717 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) |
@@ -671,7 +737,7 @@ static void free_trace_probe(struct trace_probe *tp) | |||
671 | kfree(tp); | 737 | kfree(tp); |
672 | } | 738 | } |
673 | 739 | ||
674 | static struct trace_probe *find_probe_event(const char *event, | 740 | static struct trace_probe *find_trace_probe(const char *event, |
675 | const char *group) | 741 | const char *group) |
676 | { | 742 | { |
677 | struct trace_probe *tp; | 743 | struct trace_probe *tp; |
@@ -683,15 +749,104 @@ static struct trace_probe *find_probe_event(const char *event, | |||
683 | return NULL; | 749 | return NULL; |
684 | } | 750 | } |
685 | 751 | ||
686 | /* Unregister a trace_probe and probe_event: call with locking probe_lock */ | 752 | /* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ |
687 | static void unregister_trace_probe(struct trace_probe *tp) | 753 | static int enable_trace_probe(struct trace_probe *tp, int flag) |
688 | { | 754 | { |
689 | if (probe_is_return(tp)) | 755 | int ret = 0; |
690 | unregister_kretprobe(&tp->rp); | 756 | |
757 | tp->flags |= flag; | ||
758 | if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && | ||
759 | !trace_probe_has_gone(tp)) { | ||
760 | if (trace_probe_is_return(tp)) | ||
761 | ret = enable_kretprobe(&tp->rp); | ||
762 | else | ||
763 | ret = enable_kprobe(&tp->rp.kp); | ||
764 | } | ||
765 | |||
766 | return ret; | ||
767 | } | ||
768 | |||
769 | /* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ | ||
770 | static void disable_trace_probe(struct trace_probe *tp, int flag) | ||
771 | { | ||
772 | tp->flags &= ~flag; | ||
773 | if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { | ||
774 | if (trace_probe_is_return(tp)) | ||
775 | disable_kretprobe(&tp->rp); | ||
776 | else | ||
777 | disable_kprobe(&tp->rp.kp); | ||
778 | } | ||
779 | } | ||
780 | |||
781 | /* Internal register function - just handle k*probes and flags */ | ||
782 | static int __register_trace_probe(struct trace_probe *tp) | ||
783 | { | ||
784 | int i, ret; | ||
785 | |||
786 | if (trace_probe_is_registered(tp)) | ||
787 | return -EINVAL; | ||
788 | |||
789 | for (i = 0; i < tp->nr_args; i++) | ||
790 | update_probe_arg(&tp->args[i]); | ||
791 | |||
792 | /* Set/clear disabled flag according to tp->flag */ | ||
793 | if (trace_probe_is_enabled(tp)) | ||
794 | tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; | ||
795 | else | ||
796 | tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; | ||
797 | |||
798 | if (trace_probe_is_return(tp)) | ||
799 | ret = register_kretprobe(&tp->rp); | ||
691 | else | 800 | else |
692 | unregister_kprobe(&tp->rp.kp); | 801 | ret = register_kprobe(&tp->rp.kp); |
802 | |||
803 | if (ret == 0) | ||
804 | tp->flags |= TP_FLAG_REGISTERED; | ||
805 | else { | ||
806 | pr_warning("Could not insert probe at %s+%lu: %d\n", | ||
807 | trace_probe_symbol(tp), trace_probe_offset(tp), ret); | ||
808 | if (ret == -ENOENT && trace_probe_is_on_module(tp)) { | ||
809 | pr_warning("This probe might be able to register after" | ||
810 | "target module is loaded. Continue.\n"); | ||
811 | ret = 0; | ||
812 | } else if (ret == -EILSEQ) { | ||
813 | pr_warning("Probing address(0x%p) is not an " | ||
814 | "instruction boundary.\n", | ||
815 | tp->rp.kp.addr); | ||
816 | ret = -EINVAL; | ||
817 | } | ||
818 | } | ||
819 | |||
820 | return ret; | ||
821 | } | ||
822 | |||
823 | /* Internal unregister function - just handle k*probes and flags */ | ||
824 | static void __unregister_trace_probe(struct trace_probe *tp) | ||
825 | { | ||
826 | if (trace_probe_is_registered(tp)) { | ||
827 | if (trace_probe_is_return(tp)) | ||
828 | unregister_kretprobe(&tp->rp); | ||
829 | else | ||
830 | unregister_kprobe(&tp->rp.kp); | ||
831 | tp->flags &= ~TP_FLAG_REGISTERED; | ||
832 | /* Cleanup kprobe for reuse */ | ||
833 | if (tp->rp.kp.symbol_name) | ||
834 | tp->rp.kp.addr = NULL; | ||
835 | } | ||
836 | } | ||
837 | |||
838 | /* Unregister a trace_probe and probe_event: call with locking probe_lock */ | ||
839 | static int unregister_trace_probe(struct trace_probe *tp) | ||
840 | { | ||
841 | /* An enabled event cannot be unregistered */ | ||
842 | if (trace_probe_is_enabled(tp)) | ||
843 | return -EBUSY; | ||
844 | |||
845 | __unregister_trace_probe(tp); | ||
693 | list_del(&tp->list); | 846 | list_del(&tp->list); |
694 | unregister_probe_event(tp); | 847 | unregister_probe_event(tp); |
848 | |||
849 | return 0; | ||
695 | } | 850 | } |
696 | 851 | ||
697 | /* Register a trace_probe and probe_event */ | 852 | /* Register a trace_probe and probe_event */ |
@@ -702,41 +857,68 @@ static int register_trace_probe(struct trace_probe *tp) | |||
702 | 857 | ||
703 | mutex_lock(&probe_lock); | 858 | mutex_lock(&probe_lock); |
704 | 859 | ||
705 | /* register as an event */ | 860 | /* Delete old (same name) event if it exists */ |
706 | old_tp = find_probe_event(tp->call.name, tp->call.class->system); | 861 | old_tp = find_trace_probe(tp->call.name, tp->call.class->system); |
707 | if (old_tp) { | 862 | if (old_tp) { |
708 | /* delete old event */ | 863 | ret = unregister_trace_probe(old_tp); |
709 | unregister_trace_probe(old_tp); | 864 | if (ret < 0) |
865 | goto end; | ||
710 | free_trace_probe(old_tp); | 866 | free_trace_probe(old_tp); |
711 | } | 867 | } |
868 | |||
869 | /* Register new event */ | ||
712 | ret = register_probe_event(tp); | 870 | ret = register_probe_event(tp); |
713 | if (ret) { | 871 | if (ret) { |
714 | pr_warning("Failed to register probe event(%d)\n", ret); | 872 | pr_warning("Failed to register probe event(%d)\n", ret); |
715 | goto end; | 873 | goto end; |
716 | } | 874 | } |
717 | 875 | ||
718 | tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; | 876 | /* Register k*probe */ |
719 | if (probe_is_return(tp)) | 877 | ret = __register_trace_probe(tp); |
720 | ret = register_kretprobe(&tp->rp); | 878 | if (ret < 0) |
721 | else | ||
722 | ret = register_kprobe(&tp->rp.kp); | ||
723 | |||
724 | if (ret) { | ||
725 | pr_warning("Could not insert probe(%d)\n", ret); | ||
726 | if (ret == -EILSEQ) { | ||
727 | pr_warning("Probing address(0x%p) is not an " | ||
728 | "instruction boundary.\n", | ||
729 | tp->rp.kp.addr); | ||
730 | ret = -EINVAL; | ||
731 | } | ||
732 | unregister_probe_event(tp); | 879 | unregister_probe_event(tp); |
733 | } else | 880 | else |
734 | list_add_tail(&tp->list, &probe_list); | 881 | list_add_tail(&tp->list, &probe_list); |
882 | |||
735 | end: | 883 | end: |
736 | mutex_unlock(&probe_lock); | 884 | mutex_unlock(&probe_lock); |
737 | return ret; | 885 | return ret; |
738 | } | 886 | } |
739 | 887 | ||
888 | /* Module notifier callback, checking events on the module */ | ||
889 | static int trace_probe_module_callback(struct notifier_block *nb, | ||
890 | unsigned long val, void *data) | ||
891 | { | ||
892 | struct module *mod = data; | ||
893 | struct trace_probe *tp; | ||
894 | int ret; | ||
895 | |||
896 | if (val != MODULE_STATE_COMING) | ||
897 | return NOTIFY_DONE; | ||
898 | |||
899 | /* Update probes on coming module */ | ||
900 | mutex_lock(&probe_lock); | ||
901 | list_for_each_entry(tp, &probe_list, list) { | ||
902 | if (trace_probe_within_module(tp, mod)) { | ||
903 | /* Don't need to check busy - this probe should already be gone. */ | ||
904 | __unregister_trace_probe(tp); | ||
905 | ret = __register_trace_probe(tp); | ||
906 | if (ret) | ||
907 | pr_warning("Failed to re-register probe %s on" | ||
908 | "%s: %d\n", | ||
909 | tp->call.name, mod->name, ret); | ||
910 | } | ||
911 | } | ||
912 | mutex_unlock(&probe_lock); | ||
913 | |||
914 | return NOTIFY_DONE; | ||
915 | } | ||
916 | |||
917 | static struct notifier_block trace_probe_module_nb = { | ||
918 | .notifier_call = trace_probe_module_callback, | ||
919 | .priority = 1 /* Invoked after kprobe module callback */ | ||
920 | }; | ||
921 | |||
740 | /* Split symbol and offset. */ | 922 | /* Split symbol and offset. */ |
741 | static int split_symbol_offset(char *symbol, unsigned long *offset) | 923 | static int split_symbol_offset(char *symbol, unsigned long *offset) |
742 | { | 924 | { |
@@ -962,8 +1144,8 @@ static int create_trace_probe(int argc, char **argv) | |||
962 | { | 1144 | { |
963 | /* | 1145 | /* |
964 | * Argument syntax: | 1146 | * Argument syntax: |
965 | * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] | 1147 | * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] |
966 | * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] | 1148 | * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] |
967 | * Fetch args: | 1149 | * Fetch args: |
968 | * $retval : fetch return value | 1150 | * $retval : fetch return value |
969 | * $stack : fetch stack address | 1151 | * $stack : fetch stack address |
@@ -1025,17 +1207,18 @@ static int create_trace_probe(int argc, char **argv) | |||
1025 | return -EINVAL; | 1207 | return -EINVAL; |
1026 | } | 1208 | } |
1027 | mutex_lock(&probe_lock); | 1209 | mutex_lock(&probe_lock); |
1028 | tp = find_probe_event(event, group); | 1210 | tp = find_trace_probe(event, group); |
1029 | if (!tp) { | 1211 | if (!tp) { |
1030 | mutex_unlock(&probe_lock); | 1212 | mutex_unlock(&probe_lock); |
1031 | pr_info("Event %s/%s doesn't exist.\n", group, event); | 1213 | pr_info("Event %s/%s doesn't exist.\n", group, event); |
1032 | return -ENOENT; | 1214 | return -ENOENT; |
1033 | } | 1215 | } |
1034 | /* delete an event */ | 1216 | /* delete an event */ |
1035 | unregister_trace_probe(tp); | 1217 | ret = unregister_trace_probe(tp); |
1036 | free_trace_probe(tp); | 1218 | if (ret == 0) |
1219 | free_trace_probe(tp); | ||
1037 | mutex_unlock(&probe_lock); | 1220 | mutex_unlock(&probe_lock); |
1038 | return 0; | 1221 | return ret; |
1039 | } | 1222 | } |
1040 | 1223 | ||
1041 | if (argc < 2) { | 1224 | if (argc < 2) { |
@@ -1144,20 +1327,30 @@ error: | |||
1144 | return ret; | 1327 | return ret; |
1145 | } | 1328 | } |
1146 | 1329 | ||
1147 | static void cleanup_all_probes(void) | 1330 | static int release_all_trace_probes(void) |
1148 | { | 1331 | { |
1149 | struct trace_probe *tp; | 1332 | struct trace_probe *tp; |
1333 | int ret = 0; | ||
1150 | 1334 | ||
1151 | mutex_lock(&probe_lock); | 1335 | mutex_lock(&probe_lock); |
1336 | /* Ensure no probe is in use. */ | ||
1337 | list_for_each_entry(tp, &probe_list, list) | ||
1338 | if (trace_probe_is_enabled(tp)) { | ||
1339 | ret = -EBUSY; | ||
1340 | goto end; | ||
1341 | } | ||
1152 | /* TODO: Use batch unregistration */ | 1342 | /* TODO: Use batch unregistration */ |
1153 | while (!list_empty(&probe_list)) { | 1343 | while (!list_empty(&probe_list)) { |
1154 | tp = list_entry(probe_list.next, struct trace_probe, list); | 1344 | tp = list_entry(probe_list.next, struct trace_probe, list); |
1155 | unregister_trace_probe(tp); | 1345 | unregister_trace_probe(tp); |
1156 | free_trace_probe(tp); | 1346 | free_trace_probe(tp); |
1157 | } | 1347 | } |
1348 | |||
1349 | end: | ||
1158 | mutex_unlock(&probe_lock); | 1350 | mutex_unlock(&probe_lock); |
1159 | } | ||
1160 | 1351 | ||
1352 | return ret; | ||
1353 | } | ||
1161 | 1354 | ||
1162 | /* Probes listing interfaces */ | 1355 | /* Probes listing interfaces */ |
1163 | static void *probes_seq_start(struct seq_file *m, loff_t *pos) | 1356 | static void *probes_seq_start(struct seq_file *m, loff_t *pos) |
@@ -1181,15 +1374,16 @@ static int probes_seq_show(struct seq_file *m, void *v) | |||
1181 | struct trace_probe *tp = v; | 1374 | struct trace_probe *tp = v; |
1182 | int i; | 1375 | int i; |
1183 | 1376 | ||
1184 | seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); | 1377 | seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p'); |
1185 | seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); | 1378 | seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); |
1186 | 1379 | ||
1187 | if (!tp->symbol) | 1380 | if (!tp->symbol) |
1188 | seq_printf(m, " 0x%p", tp->rp.kp.addr); | 1381 | seq_printf(m, " 0x%p", tp->rp.kp.addr); |
1189 | else if (tp->rp.kp.offset) | 1382 | else if (tp->rp.kp.offset) |
1190 | seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); | 1383 | seq_printf(m, " %s+%u", trace_probe_symbol(tp), |
1384 | tp->rp.kp.offset); | ||
1191 | else | 1385 | else |
1192 | seq_printf(m, " %s", probe_symbol(tp)); | 1386 | seq_printf(m, " %s", trace_probe_symbol(tp)); |
1193 | 1387 | ||
1194 | for (i = 0; i < tp->nr_args; i++) | 1388 | for (i = 0; i < tp->nr_args; i++) |
1195 | seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); | 1389 | seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); |
@@ -1207,9 +1401,13 @@ static const struct seq_operations probes_seq_op = { | |||
1207 | 1401 | ||
1208 | static int probes_open(struct inode *inode, struct file *file) | 1402 | static int probes_open(struct inode *inode, struct file *file) |
1209 | { | 1403 | { |
1210 | if ((file->f_mode & FMODE_WRITE) && | 1404 | int ret; |
1211 | (file->f_flags & O_TRUNC)) | 1405 | |
1212 | cleanup_all_probes(); | 1406 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { |
1407 | ret = release_all_trace_probes(); | ||
1408 | if (ret < 0) | ||
1409 | return ret; | ||
1410 | } | ||
1213 | 1411 | ||
1214 | return seq_open(file, &probes_seq_op); | 1412 | return seq_open(file, &probes_seq_op); |
1215 | } | 1413 | } |
@@ -1397,7 +1595,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
1397 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1595 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1398 | 1596 | ||
1399 | if (!filter_current_check_discard(buffer, call, entry, event)) | 1597 | if (!filter_current_check_discard(buffer, call, entry, event)) |
1400 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | 1598 | trace_nowake_buffer_unlock_commit_regs(buffer, event, |
1599 | irq_flags, pc, regs); | ||
1401 | } | 1600 | } |
1402 | 1601 | ||
1403 | /* Kretprobe handler */ | 1602 | /* Kretprobe handler */ |
@@ -1429,7 +1628,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
1429 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1628 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1430 | 1629 | ||
1431 | if (!filter_current_check_discard(buffer, call, entry, event)) | 1630 | if (!filter_current_check_discard(buffer, call, entry, event)) |
1432 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | 1631 | trace_nowake_buffer_unlock_commit_regs(buffer, event, |
1632 | irq_flags, pc, regs); | ||
1433 | } | 1633 | } |
1434 | 1634 | ||
1435 | /* Event entry printers */ | 1635 | /* Event entry printers */ |
@@ -1511,30 +1711,6 @@ partial: | |||
1511 | return TRACE_TYPE_PARTIAL_LINE; | 1711 | return TRACE_TYPE_PARTIAL_LINE; |
1512 | } | 1712 | } |
1513 | 1713 | ||
1514 | static int probe_event_enable(struct ftrace_event_call *call) | ||
1515 | { | ||
1516 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1517 | |||
1518 | tp->flags |= TP_FLAG_TRACE; | ||
1519 | if (probe_is_return(tp)) | ||
1520 | return enable_kretprobe(&tp->rp); | ||
1521 | else | ||
1522 | return enable_kprobe(&tp->rp.kp); | ||
1523 | } | ||
1524 | |||
1525 | static void probe_event_disable(struct ftrace_event_call *call) | ||
1526 | { | ||
1527 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1528 | |||
1529 | tp->flags &= ~TP_FLAG_TRACE; | ||
1530 | if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { | ||
1531 | if (probe_is_return(tp)) | ||
1532 | disable_kretprobe(&tp->rp); | ||
1533 | else | ||
1534 | disable_kprobe(&tp->rp.kp); | ||
1535 | } | ||
1536 | } | ||
1537 | |||
1538 | #undef DEFINE_FIELD | 1714 | #undef DEFINE_FIELD |
1539 | #define DEFINE_FIELD(type, item, name, is_signed) \ | 1715 | #define DEFINE_FIELD(type, item, name, is_signed) \ |
1540 | do { \ | 1716 | do { \ |
@@ -1596,7 +1772,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) | |||
1596 | 1772 | ||
1597 | const char *fmt, *arg; | 1773 | const char *fmt, *arg; |
1598 | 1774 | ||
1599 | if (!probe_is_return(tp)) { | 1775 | if (!trace_probe_is_return(tp)) { |
1600 | fmt = "(%lx)"; | 1776 | fmt = "(%lx)"; |
1601 | arg = "REC->" FIELD_STRING_IP; | 1777 | arg = "REC->" FIELD_STRING_IP; |
1602 | } else { | 1778 | } else { |
@@ -1713,49 +1889,25 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, | |||
1713 | head = this_cpu_ptr(call->perf_events); | 1889 | head = this_cpu_ptr(call->perf_events); |
1714 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); | 1890 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); |
1715 | } | 1891 | } |
1716 | |||
1717 | static int probe_perf_enable(struct ftrace_event_call *call) | ||
1718 | { | ||
1719 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1720 | |||
1721 | tp->flags |= TP_FLAG_PROFILE; | ||
1722 | |||
1723 | if (probe_is_return(tp)) | ||
1724 | return enable_kretprobe(&tp->rp); | ||
1725 | else | ||
1726 | return enable_kprobe(&tp->rp.kp); | ||
1727 | } | ||
1728 | |||
1729 | static void probe_perf_disable(struct ftrace_event_call *call) | ||
1730 | { | ||
1731 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1732 | |||
1733 | tp->flags &= ~TP_FLAG_PROFILE; | ||
1734 | |||
1735 | if (!(tp->flags & TP_FLAG_TRACE)) { | ||
1736 | if (probe_is_return(tp)) | ||
1737 | disable_kretprobe(&tp->rp); | ||
1738 | else | ||
1739 | disable_kprobe(&tp->rp.kp); | ||
1740 | } | ||
1741 | } | ||
1742 | #endif /* CONFIG_PERF_EVENTS */ | 1892 | #endif /* CONFIG_PERF_EVENTS */ |
1743 | 1893 | ||
1744 | static __kprobes | 1894 | static __kprobes |
1745 | int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) | 1895 | int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) |
1746 | { | 1896 | { |
1897 | struct trace_probe *tp = (struct trace_probe *)event->data; | ||
1898 | |||
1747 | switch (type) { | 1899 | switch (type) { |
1748 | case TRACE_REG_REGISTER: | 1900 | case TRACE_REG_REGISTER: |
1749 | return probe_event_enable(event); | 1901 | return enable_trace_probe(tp, TP_FLAG_TRACE); |
1750 | case TRACE_REG_UNREGISTER: | 1902 | case TRACE_REG_UNREGISTER: |
1751 | probe_event_disable(event); | 1903 | disable_trace_probe(tp, TP_FLAG_TRACE); |
1752 | return 0; | 1904 | return 0; |
1753 | 1905 | ||
1754 | #ifdef CONFIG_PERF_EVENTS | 1906 | #ifdef CONFIG_PERF_EVENTS |
1755 | case TRACE_REG_PERF_REGISTER: | 1907 | case TRACE_REG_PERF_REGISTER: |
1756 | return probe_perf_enable(event); | 1908 | return enable_trace_probe(tp, TP_FLAG_PROFILE); |
1757 | case TRACE_REG_PERF_UNREGISTER: | 1909 | case TRACE_REG_PERF_UNREGISTER: |
1758 | probe_perf_disable(event); | 1910 | disable_trace_probe(tp, TP_FLAG_PROFILE); |
1759 | return 0; | 1911 | return 0; |
1760 | #endif | 1912 | #endif |
1761 | } | 1913 | } |
@@ -1805,7 +1957,7 @@ static int register_probe_event(struct trace_probe *tp) | |||
1805 | 1957 | ||
1806 | /* Initialize ftrace_event_call */ | 1958 | /* Initialize ftrace_event_call */ |
1807 | INIT_LIST_HEAD(&call->class->fields); | 1959 | INIT_LIST_HEAD(&call->class->fields); |
1808 | if (probe_is_return(tp)) { | 1960 | if (trace_probe_is_return(tp)) { |
1809 | call->event.funcs = &kretprobe_funcs; | 1961 | call->event.funcs = &kretprobe_funcs; |
1810 | call->class->define_fields = kretprobe_event_define_fields; | 1962 | call->class->define_fields = kretprobe_event_define_fields; |
1811 | } else { | 1963 | } else { |
@@ -1844,6 +1996,9 @@ static __init int init_kprobe_trace(void) | |||
1844 | struct dentry *d_tracer; | 1996 | struct dentry *d_tracer; |
1845 | struct dentry *entry; | 1997 | struct dentry *entry; |
1846 | 1998 | ||
1999 | if (register_module_notifier(&trace_probe_module_nb)) | ||
2000 | return -EINVAL; | ||
2001 | |||
1847 | d_tracer = tracing_init_dentry(); | 2002 | d_tracer = tracing_init_dentry(); |
1848 | if (!d_tracer) | 2003 | if (!d_tracer) |
1849 | return 0; | 2004 | return 0; |
@@ -1897,12 +2052,12 @@ static __init int kprobe_trace_self_tests_init(void) | |||
1897 | warn++; | 2052 | warn++; |
1898 | } else { | 2053 | } else { |
1899 | /* Enable trace point */ | 2054 | /* Enable trace point */ |
1900 | tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); | 2055 | tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); |
1901 | if (WARN_ON_ONCE(tp == NULL)) { | 2056 | if (WARN_ON_ONCE(tp == NULL)) { |
1902 | pr_warning("error on getting new probe.\n"); | 2057 | pr_warning("error on getting new probe.\n"); |
1903 | warn++; | 2058 | warn++; |
1904 | } else | 2059 | } else |
1905 | probe_event_enable(&tp->call); | 2060 | enable_trace_probe(tp, TP_FLAG_TRACE); |
1906 | } | 2061 | } |
1907 | 2062 | ||
1908 | ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " | 2063 | ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " |
@@ -1912,12 +2067,12 @@ static __init int kprobe_trace_self_tests_init(void) | |||
1912 | warn++; | 2067 | warn++; |
1913 | } else { | 2068 | } else { |
1914 | /* Enable trace point */ | 2069 | /* Enable trace point */ |
1915 | tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); | 2070 | tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); |
1916 | if (WARN_ON_ONCE(tp == NULL)) { | 2071 | if (WARN_ON_ONCE(tp == NULL)) { |
1917 | pr_warning("error on getting new probe.\n"); | 2072 | pr_warning("error on getting new probe.\n"); |
1918 | warn++; | 2073 | warn++; |
1919 | } else | 2074 | } else |
1920 | probe_event_enable(&tp->call); | 2075 | enable_trace_probe(tp, TP_FLAG_TRACE); |
1921 | } | 2076 | } |
1922 | 2077 | ||
1923 | if (warn) | 2078 | if (warn) |
@@ -1925,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void) | |||
1925 | 2080 | ||
1926 | ret = target(1, 2, 3, 4, 5, 6); | 2081 | ret = target(1, 2, 3, 4, 5, 6); |
1927 | 2082 | ||
2083 | /* Disable trace points before removing them */ | ||
2084 | tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); | ||
2085 | if (WARN_ON_ONCE(tp == NULL)) { | ||
2086 | pr_warning("error on getting test probe.\n"); | ||
2087 | warn++; | ||
2088 | } else | ||
2089 | disable_trace_probe(tp, TP_FLAG_TRACE); | ||
2090 | |||
2091 | tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); | ||
2092 | if (WARN_ON_ONCE(tp == NULL)) { | ||
2093 | pr_warning("error on getting 2nd test probe.\n"); | ||
2094 | warn++; | ||
2095 | } else | ||
2096 | disable_trace_probe(tp, TP_FLAG_TRACE); | ||
2097 | |||
1928 | ret = command_trace_probe("-:testprobe"); | 2098 | ret = command_trace_probe("-:testprobe"); |
1929 | if (WARN_ON_ONCE(ret)) { | 2099 | if (WARN_ON_ONCE(ret)) { |
1930 | pr_warning("error on deleting a probe.\n"); | 2100 | pr_warning("error on deleting a probe.\n"); |
@@ -1938,7 +2108,7 @@ static __init int kprobe_trace_self_tests_init(void) | |||
1938 | } | 2108 | } |
1939 | 2109 | ||
1940 | end: | 2110 | end: |
1941 | cleanup_all_probes(); | 2111 | release_all_trace_probes(); |
1942 | if (warn) | 2112 | if (warn) |
1943 | pr_cont("NG: Some tests are failed. Please check them.\n"); | 2113 | pr_cont("NG: Some tests are failed. Please check them.\n"); |
1944 | else | 2114 | else |
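The enable_trace_probe()/disable_trace_probe() pair added above keeps the underlying k*probe armed only while at least one user holds it: ftrace sets TP_FLAG_TRACE, perf sets TP_FLAG_PROFILE, and TP_FLAG_REGISTERED records whether the k*probe is currently inserted at all. A minimal standalone C sketch of that flag bookkeeping (hypothetical names, illustration only, not part of the patch):

#include <stdbool.h>
#include <stdio.h>

#define TP_FLAG_TRACE      1        /* enabled through the ftrace interface */
#define TP_FLAG_PROFILE    2        /* enabled through the perf interface   */
#define TP_FLAG_REGISTERED 4        /* k*probe currently inserted           */

static unsigned int flags = TP_FLAG_REGISTERED;

static bool is_enabled(void)
{
        return flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
}

static void enable_user(unsigned int flag)
{
        bool was_enabled = is_enabled();

        flags |= flag;
        /* Arm only on the first-user transition; a registered probe stays armed. */
        if (!was_enabled && is_enabled() && (flags & TP_FLAG_REGISTERED))
                printf("arm probe\n");
}

static void disable_user(unsigned int flag)
{
        flags &= ~flag;
        /* Disarm only once the last user is gone. */
        if (!is_enabled() && (flags & TP_FLAG_REGISTERED))
                printf("disarm probe\n");
}

int main(void)
{
        enable_user(TP_FLAG_TRACE);     /* arm probe            */
        enable_user(TP_FLAG_PROFILE);   /* already armed        */
        disable_user(TP_FLAG_TRACE);    /* still held by perf   */
        disable_user(TP_FLAG_PROFILE);  /* disarm probe         */
        return 0;
}

This is also why unregister_trace_probe() can refuse with -EBUSY: as long as either flag is set, the probe is considered in use.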
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 017fa376505..fd3c8aae55e 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
13 | #include <linux/time.h> | 13 | #include <linux/time.h> |
14 | 14 | ||
15 | #include <asm/atomic.h> | 15 | #include <linux/atomic.h> |
16 | 16 | ||
17 | #include "trace.h" | 17 | #include "trace.h" |
18 | #include "trace_output.h" | 18 | #include "trace_output.h" |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e37de492a9e..51999309a6c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -1107,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, | |||
1107 | { | 1107 | { |
1108 | struct stack_entry *field; | 1108 | struct stack_entry *field; |
1109 | struct trace_seq *s = &iter->seq; | 1109 | struct trace_seq *s = &iter->seq; |
1110 | int i; | 1110 | unsigned long *p; |
1111 | unsigned long *end; | ||
1111 | 1112 | ||
1112 | trace_assign_type(field, iter->ent); | 1113 | trace_assign_type(field, iter->ent); |
1114 | end = (unsigned long *)((long)iter->ent + iter->ent_size); | ||
1113 | 1115 | ||
1114 | if (!trace_seq_puts(s, "<stack trace>\n")) | 1116 | if (!trace_seq_puts(s, "<stack trace>\n")) |
1115 | goto partial; | 1117 | goto partial; |
1116 | for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { | 1118 | |
1117 | if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) | 1119 | for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { |
1118 | break; | ||
1119 | if (!trace_seq_puts(s, " => ")) | 1120 | if (!trace_seq_puts(s, " => ")) |
1120 | goto partial; | 1121 | goto partial; |
1121 | 1122 | ||
1122 | if (!seq_print_ip_sym(s, field->caller[i], flags)) | 1123 | if (!seq_print_ip_sym(s, *p, flags)) |
1123 | goto partial; | 1124 | goto partial; |
1124 | if (!trace_seq_puts(s, "\n")) | 1125 | if (!trace_seq_puts(s, "\n")) |
1125 | goto partial; | 1126 | goto partial; |
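The trace_stack_print() change above stops iterating a fixed FTRACE_STACK_ENTRIES array and instead treats the callers as a variable-length tail bounded by iter->ent_size, stopping early at a ULONG_MAX sentinel. A self-contained sketch of that pointer-and-end idiom (made-up addresses, illustration only):

#include <limits.h>
#include <stdio.h>

/* Print caller addresses until the sentinel or the end of the record,
 * whichever comes first. */
static void print_stack(const unsigned long *caller, size_t rec_size)
{
        const unsigned long *end =
                (const unsigned long *)((const char *)caller + rec_size);
        const unsigned long *p;

        for (p = caller; p < end && *p != ULONG_MAX; p++)
                printf(" => %#lx\n", *p);
}

int main(void)
{
        unsigned long rec[4] = { 0x1000, 0x2000, ULONG_MAX, 0 };

        print_stack(rec, sizeof(rec));
        return 0;
}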
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f029dd4fd2c..e4a70c0c71b 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -227,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter) | |||
227 | graph_trace_close(iter); | 227 | graph_trace_close(iter); |
228 | } | 228 | } |
229 | 229 | ||
230 | #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) | 230 | #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ |
231 | TRACE_GRAPH_PRINT_ABS_TIME | \ | ||
232 | TRACE_GRAPH_PRINT_DURATION) | ||
231 | 233 | ||
232 | static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | 234 | static enum print_line_t wakeup_print_line(struct trace_iterator *iter) |
233 | { | 235 | { |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b0b53b8e4c2..77575b386d9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -156,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, | |||
156 | { | 156 | { |
157 | long *ptr = filp->private_data; | 157 | long *ptr = filp->private_data; |
158 | unsigned long val, flags; | 158 | unsigned long val, flags; |
159 | char buf[64]; | ||
160 | int ret; | 159 | int ret; |
161 | int cpu; | 160 | int cpu; |
162 | 161 | ||
163 | if (count >= sizeof(buf)) | 162 | ret = kstrtoul_from_user(ubuf, count, 10, &val); |
164 | return -EINVAL; | 163 | if (ret) |
165 | |||
166 | if (copy_from_user(&buf, ubuf, count)) | ||
167 | return -EFAULT; | ||
168 | |||
169 | buf[count] = 0; | ||
170 | |||
171 | ret = strict_strtoul(buf, 10, &val); | ||
172 | if (ret < 0) | ||
173 | return ret; | 164 | return ret; |
174 | 165 | ||
175 | local_irq_save(flags); | 166 | local_irq_save(flags); |
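The stack_max_size_write() hunk above replaces the open-coded bounds check, copy_from_user() and strict_strtoul() with a single kstrtoul_from_user() call, which copies and parses the user buffer in one step. A hedged sketch of the resulting write-handler shape (the handler name is hypothetical; this is a kernel-side fragment, not buildable on its own):

#include <linux/fs.h>
#include <linux/kernel.h>

static ssize_t example_write(struct file *filp, const char __user *ubuf,
                             size_t count, loff_t *ppos)
{
        unsigned long val;
        int ret;

        /* Copies at most 'count' bytes from userspace and parses base-10. */
        ret = kstrtoul_from_user(ubuf, count, 10, &val);
        if (ret)
                return ret;

        /* ... apply val ... */
        return count;
}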
diff --git a/kernel/trace/tracedump.c b/kernel/trace/tracedump.c new file mode 100644 index 00000000000..a83532bc36d --- /dev/null +++ b/kernel/trace/tracedump.c | |||
@@ -0,0 +1,682 @@ | |||
1 | /* | ||
2 | * kernel/trace/tracedump.c | ||
3 | * | ||
4 | * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms and conditions of the GNU General Public License, | ||
8 | * version 2, as published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
13 | * more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License along with | ||
16 | * this program; if not, write to the Free Software Foundation, Inc., | ||
17 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. | ||
18 | * | ||
19 | */ | ||
20 | |||
21 | #include <linux/console.h> | ||
22 | #include <linux/cpumask.h> | ||
23 | #include <linux/init.h> | ||
24 | #include <linux/irqflags.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/moduleparam.h> | ||
27 | #include <linux/mutex.h> | ||
28 | #include <linux/notifier.h> | ||
29 | #include <linux/proc_fs.h> | ||
30 | #include <linux/ring_buffer.h> | ||
31 | #include <linux/sched.h> | ||
32 | #include <linux/smp.h> | ||
33 | #include <linux/string.h> | ||
34 | #include <linux/threads.h> | ||
35 | #include <linux/tracedump.h> | ||
36 | #include <linux/uaccess.h> | ||
37 | #include <linux/vmalloc.h> | ||
38 | #include <linux/zlib.h> | ||
39 | |||
40 | #include "trace.h" | ||
41 | #include "trace_output.h" | ||
42 | |||
43 | #define CPU_MAX (NR_CPUS-1) | ||
44 | |||
45 | #define TRYM(fn, ...) do { \ | ||
46 | int try_error = (fn); \ | ||
47 | if (try_error < 0) { \ | ||
48 | printk(__VA_ARGS__); \ | ||
49 | return try_error; \ | ||
50 | } \ | ||
51 | } while (0) | ||
52 | |||
53 | #define TRY(fn) TRYM(fn, TAG "Caught error from %s in %s\n", #fn, __func__) | ||
54 | |||
55 | /* Stolen from printk.c */ | ||
56 | #define for_each_console(con) \ | ||
57 | for (con = console_drivers; con != NULL; con = con->next) | ||
58 | |||
59 | #define TAG KERN_ERR "tracedump: " | ||
60 | |||
61 | #define TD_MIN_CONSUME 2000 | ||
62 | #define TD_COMPRESS_CHUNK 0x8000 | ||
63 | |||
64 | static DEFINE_MUTEX(tracedump_proc_lock); | ||
65 | |||
66 | static const char MAGIC_NUMBER[9] = "TRACEDUMP"; | ||
67 | static const char CPU_DELIM[7] = "CPU_END"; | ||
68 | #define CMDLINE_DELIM "|" | ||
69 | |||
70 | /* Type of output */ | ||
71 | static bool current_format; | ||
72 | static bool format_ascii; | ||
73 | module_param(format_ascii, bool, S_IRUGO | S_IWUSR); | ||
74 | MODULE_PARM_DESC(format_ascii, "Dump ascii or raw data"); | ||
75 | |||
76 | /* Max size of output */ | ||
77 | static uint panic_size = 0x80000; | ||
78 | module_param(panic_size, uint, S_IRUGO | S_IWUSR); | ||
79 | MODULE_PARM_DESC(panic_size, "Max dump size during kernel panic (bytes)"); | ||
80 | |||
81 | static uint compress_level = 9; | ||
82 | module_param(compress_level, uint, S_IRUGO | S_IWUSR); | ||
83 | MODULE_PARM_DESC(compress_level, "Level of compression to use. [0-9]"); | ||
84 | |||
85 | static char out_buf[TD_COMPRESS_CHUNK]; | ||
86 | static z_stream stream; | ||
87 | static int compress_done; | ||
88 | static int flush; | ||
89 | |||
90 | static int old_trace_flags; | ||
91 | |||
92 | static struct trace_iterator iter; | ||
93 | static struct pager_s { | ||
94 | struct trace_array *tr; | ||
95 | void *spare; | ||
96 | int cpu; | ||
97 | int len; | ||
98 | char __user *ubuf; | ||
99 | } pager; | ||
100 | |||
101 | static char cmdline_buf[16+TASK_COMM_LEN]; | ||
102 | |||
103 | static int print_to_console(const char *buf, size_t len) | ||
104 | { | ||
105 | struct console *con; | ||
106 | |||
107 | /* Stolen from printk.c */ | ||
108 | for_each_console(con) { | ||
109 | if ((con->flags & CON_ENABLED) && con->write && | ||
110 | (cpu_online(smp_processor_id()) || | ||
111 | (con->flags & CON_ANYTIME))) | ||
112 | con->write(con, buf, len); | ||
113 | } | ||
114 | return 0; | ||
115 | } | ||
116 | |||
117 | static int print_to_user(const char *buf, size_t len) | ||
118 | { | ||
119 | int size; | ||
120 | size = copy_to_user(pager.ubuf, buf, len); | ||
121 | if (size > 0) { | ||
122 | printk(TAG "Failed to copy to user %d bytes\n", size); | ||
123 | return -EINVAL; | ||
124 | } | ||
125 | return 0; | ||
126 | } | ||
127 | |||
128 | static int print(const char *buf, size_t len, int print_to) | ||
129 | { | ||
130 | if (print_to == TD_PRINT_CONSOLE) | ||
131 | TRY(print_to_console(buf, len)); | ||
132 | else if (print_to == TD_PRINT_USER) | ||
133 | TRY(print_to_user(buf, len)); | ||
134 | return 0; | ||
135 | } | ||
136 | |||
137 | /* print_magic will print MAGIC_NUMBER using the | ||
138 | * print function selected by print_to. | ||
139 | */ | ||
140 | static inline ssize_t print_magic(int print_to) | ||
141 | { | ||
142 | print(MAGIC_NUMBER, sizeof(MAGIC_NUMBER), print_to); | ||
143 | return sizeof(MAGIC_NUMBER); | ||
144 | } | ||
145 | |||
146 | static int iter_init(void) | ||
147 | { | ||
148 | int cpu; | ||
149 | |||
150 | /* Make iter point to global ring buffer used in trace. */ | ||
151 | trace_init_global_iter(&iter); | ||
152 | |||
153 | /* Disable tracing */ | ||
154 | for_each_tracing_cpu(cpu) { | ||
155 | atomic_inc(&iter.tr->data[cpu]->disabled); | ||
156 | } | ||
157 | |||
158 | /* Save flags */ | ||
159 | old_trace_flags = trace_flags; | ||
160 | |||
161 | /* Don't look at memory in panic mode. */ | ||
162 | trace_flags &= ~TRACE_ITER_SYM_USEROBJ; | ||
163 | |||
164 | /* Prepare ring buffer iter */ | ||
165 | for_each_tracing_cpu(cpu) { | ||
166 | iter.buffer_iter[cpu] = | ||
167 | ring_buffer_read_prepare(iter.tr->buffer, cpu); | ||
168 | } | ||
169 | ring_buffer_read_prepare_sync(); | ||
170 | for_each_tracing_cpu(cpu) { | ||
171 | ring_buffer_read_start(iter.buffer_iter[cpu]); | ||
172 | tracing_iter_reset(&iter, cpu); | ||
173 | } | ||
174 | return 0; | ||
175 | } | ||
176 | |||
177 | /* iter_next gets the next entry in the ring buffer, ordered by time. | ||
178 | * If there are no more entries, returns 0. | ||
179 | */ | ||
180 | static ssize_t iter_next(void) | ||
181 | { | ||
182 | /* Zero out the iterator's seq */ | ||
183 | memset(&iter.seq, 0, | ||
184 | sizeof(struct trace_iterator) - | ||
185 | offsetof(struct trace_iterator, seq)); | ||
186 | |||
187 | while (!trace_empty(&iter)) { | ||
188 | if (trace_find_next_entry_inc(&iter) == NULL) { | ||
189 | printk(TAG "trace_find_next_entry failed!\n"); | ||
190 | return -EINVAL; | ||
191 | } | ||
192 | |||
193 | /* Copy the ring buffer data to iterator's seq */ | ||
194 | print_trace_line(&iter); | ||
195 | if (iter.seq.len != 0) | ||
196 | return iter.seq.len; | ||
197 | } | ||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static int iter_deinit(void) | ||
202 | { | ||
203 | int cpu; | ||
204 | /* Enable tracing */ | ||
205 | for_each_tracing_cpu(cpu) { | ||
206 | ring_buffer_read_finish(iter.buffer_iter[cpu]); | ||
207 | } | ||
208 | for_each_tracing_cpu(cpu) { | ||
209 | atomic_dec(&iter.tr->data[cpu]->disabled); | ||
210 | } | ||
211 | |||
212 | /* Restore flags */ | ||
213 | trace_flags = old_trace_flags; | ||
214 | return 0; | ||
215 | } | ||
216 | |||
217 | static int pager_init(void) | ||
218 | { | ||
219 | int cpu; | ||
220 | |||
221 | /* Need to do this to get a pointer to global_trace (iter.tr). | ||
222 | Lame, I know. */ | ||
223 | trace_init_global_iter(&iter); | ||
224 | |||
225 | /* Turn off tracing */ | ||
226 | for_each_tracing_cpu(cpu) { | ||
227 | atomic_inc(&iter.tr->data[cpu]->disabled); | ||
228 | } | ||
229 | |||
230 | memset(&pager, 0, sizeof(pager)); | ||
231 | pager.tr = iter.tr; | ||
232 | pager.len = TD_COMPRESS_CHUNK; | ||
233 | |||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | /* pager_next_cpu moves the pager to the next cpu. | ||
238 | * Returns 0 if pager is done, else 1. | ||
239 | */ | ||
240 | static ssize_t pager_next_cpu(void) | ||
241 | { | ||
242 | if (pager.cpu <= CPU_MAX) { | ||
243 | pager.cpu += 1; | ||
244 | return 1; | ||
245 | } | ||
246 | |||
247 | return 0; | ||
248 | } | ||
249 | |||
250 | /* pager_next gets the next page of data from the ring buffer | ||
251 | * of the current cpu. Returns page size or 0 if no more data. | ||
252 | */ | ||
253 | static ssize_t pager_next(void) | ||
254 | { | ||
255 | int ret; | ||
256 | |||
257 | if (pager.cpu > CPU_MAX) | ||
258 | return 0; | ||
259 | |||
260 | if (!pager.spare) | ||
261 | pager.spare = ring_buffer_alloc_read_page(pager.tr->buffer, pager.cpu); | ||
262 | if (!pager.spare) { | ||
263 | printk(TAG "ring_buffer_alloc_read_page failed!"); | ||
264 | return -ENOMEM; | ||
265 | } | ||
266 | |||
267 | ret = ring_buffer_read_page(pager.tr->buffer, | ||
268 | &pager.spare, | ||
269 | pager.len, | ||
270 | pager.cpu, 0); | ||
271 | if (ret < 0) | ||
272 | return 0; | ||
273 | |||
274 | return PAGE_SIZE; | ||
275 | } | ||
276 | |||
277 | static int pager_deinit(void) | ||
278 | { | ||
279 | int cpu; | ||
280 | if (pager.spare != NULL) | ||
281 | ring_buffer_free_read_page(pager.tr->buffer, pager.spare); | ||
282 | |||
283 | for_each_tracing_cpu(cpu) { | ||
284 | atomic_dec(&iter.tr->data[cpu]->disabled); | ||
285 | } | ||
286 | return 0; | ||
287 | } | ||
288 | |||
289 | /* cmdline_next gets the next saved cmdline from the trace and | ||
290 | * puts it in cmdline_buf. Returns the size of the cmdline, or 0 if empty, | ||
291 | * but will reset itself on a subsequent call. | ||
292 | */ | ||
293 | static ssize_t cmdline_next(void) | ||
294 | { | ||
295 | static int pid; | ||
296 | ssize_t size = 0; | ||
297 | |||
298 | if (pid >= PID_MAX_DEFAULT) | ||
299 | pid = -1; | ||
300 | |||
301 | while (size == 0 && pid < PID_MAX_DEFAULT) { | ||
302 | pid++; | ||
303 | trace_find_cmdline(pid, cmdline_buf); | ||
304 | if (!strncmp(cmdline_buf, "<...>", 5)) | ||
305 | continue; | ||
306 | |||
307 | sprintf(&cmdline_buf[strlen(cmdline_buf)], " %d" | ||
308 | CMDLINE_DELIM, pid); | ||
309 | size = strlen(cmdline_buf); | ||
310 | } | ||
311 | return size; | ||
312 | } | ||
313 | |||
314 | /* consume_events removes the first 'num' entries from the ring buffer. */ | ||
315 | static int consume_events(size_t num) | ||
316 | { | ||
317 | TRY(iter_init()); | ||
318 | for (; num > 0 && !trace_empty(&iter); num--) { | ||
319 | trace_find_next_entry_inc(&iter); | ||
320 | ring_buffer_consume(iter.tr->buffer, iter.cpu, &iter.ts, | ||
321 | &iter.lost_events); | ||
322 | } | ||
323 | TRY(iter_deinit()); | ||
324 | return 0; | ||
325 | } | ||
326 | |||
327 | static int data_init(void) | ||
328 | { | ||
329 | if (current_format) | ||
330 | TRY(iter_init()); | ||
331 | else | ||
332 | TRY(pager_init()); | ||
333 | return 0; | ||
334 | } | ||
335 | |||
336 | /* data_next will figure out the right 'next' function to | ||
337 | * call and will select the right buffer to pass back | ||
338 | * to compress_next. | ||
339 | * | ||
340 | * iter_next should be used to get data entry-by-entry, ordered | ||
341 | * by time, which is what we need in order to convert it to ascii. | ||
342 | * | ||
343 | * pager_next will return a full page of raw data at a time, one | ||
344 | * CPU at a time. pager_next_cpu must be called to get the next CPU. | ||
345 | * cmdline_next will get the next saved cmdline | ||
346 | */ | ||
347 | static ssize_t data_next(const char **buf) | ||
348 | { | ||
349 | ssize_t size; | ||
350 | |||
351 | if (current_format) { | ||
352 | TRY(size = iter_next()); | ||
353 | *buf = iter.seq.buffer; | ||
354 | } else { | ||
355 | TRY(size = pager_next()); | ||
356 | *buf = pager.spare; | ||
357 | if (size == 0) { | ||
358 | if (pager_next_cpu()) { | ||
359 | size = sizeof(CPU_DELIM); | ||
360 | *buf = CPU_DELIM; | ||
361 | } else { | ||
362 | TRY(size = cmdline_next()); | ||
363 | *buf = cmdline_buf; | ||
364 | } | ||
365 | } | ||
366 | } | ||
367 | return size; | ||
368 | } | ||
369 | |||
370 | static int data_deinit(void) | ||
371 | { | ||
372 | if (current_format) | ||
373 | TRY(iter_deinit()); | ||
374 | else | ||
375 | TRY(pager_deinit()); | ||
376 | return 0; | ||
377 | } | ||
378 | |||
379 | static int compress_init(void) | ||
380 | { | ||
381 | int workspacesize, ret; | ||
382 | |||
383 | compress_done = 0; | ||
384 | flush = Z_NO_FLUSH; | ||
385 | stream.data_type = current_format ? Z_ASCII : Z_BINARY; | ||
386 | workspacesize = zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL); | ||
387 | stream.workspace = vmalloc(workspacesize); | ||
388 | if (!stream.workspace) { | ||
389 | printk(TAG "Could not allocate " | ||
390 | "enough memory for zlib!\n"); | ||
391 | return -ENOMEM; | ||
392 | } | ||
393 | memset(stream.workspace, 0, workspacesize); | ||
394 | |||
395 | ret = zlib_deflateInit(&stream, compress_level); | ||
396 | if (ret != Z_OK) { | ||
397 | printk(TAG "%s\n", stream.msg); | ||
398 | return ret; | ||
399 | } | ||
400 | stream.avail_in = 0; | ||
401 | stream.avail_out = 0; | ||
402 | TRY(data_init()); | ||
403 | return 0; | ||
404 | } | ||
405 | |||
406 | /* compress_next will compress up to min(max_out, TD_COMPRESS_CHUNK) bytes | ||
407 | * of data into the output buffer. It gets the data by calling data_next. | ||
408 | * It will return the most data it possibly can. If it returns 0, then | ||
409 | * there is no more data. | ||
410 | * | ||
411 | * By the way that zlib works, each call to zlib_deflate will possibly | ||
412 | * consume up to avail_in bytes from next_in, and will fill up to | ||
413 | * avail_out bytes in next_out. Once flush == Z_FINISH, it can not take | ||
414 | * any more input. It will output until it is finished, and will return | ||
415 | * Z_STREAM_END. | ||
416 | */ | ||
417 | static ssize_t compress_next(size_t max_out) | ||
418 | { | ||
419 | ssize_t ret; | ||
420 | max_out = min(max_out, (size_t)TD_COMPRESS_CHUNK); | ||
421 | stream.next_out = out_buf; | ||
422 | stream.avail_out = max_out; | ||
423 | while (stream.avail_out > 0 && !compress_done) { | ||
424 | if (stream.avail_in == 0 && flush != Z_FINISH) { | ||
425 | TRY(stream.avail_in = | ||
426 | data_next((const char **)&stream.next_in)); | ||
427 | flush = (stream.avail_in == 0) ? Z_FINISH : Z_NO_FLUSH; | ||
428 | } | ||
429 | if (stream.next_in != NULL) { | ||
430 | TRYM((ret = zlib_deflate(&stream, flush)), | ||
431 | "zlib: %s\n", stream.msg); | ||
432 | compress_done = (ret == Z_STREAM_END); | ||
433 | } | ||
434 | } | ||
435 | ret = max_out - stream.avail_out; | ||
436 | return ret; | ||
437 | } | ||
438 | |||
439 | static int compress_deinit(void) | ||
440 | { | ||
441 | TRY(data_deinit()); | ||
442 | |||
443 | zlib_deflateEnd(&stream); | ||
444 | vfree(stream.workspace); | ||
445 | |||
446 | /* TODO: remove */ | ||
447 | printk(TAG "Total in: %ld\n", stream.total_in); | ||
448 | printk(TAG "Total out: %ld\n", stream.total_out); | ||
449 | return stream.total_out; | ||
450 | } | ||
451 | |||
452 | static int compress_reset(void) | ||
453 | { | ||
454 | TRY(compress_deinit()); | ||
455 | TRY(compress_init()); | ||
456 | return 0; | ||
457 | } | ||
458 | |||
459 | /* tracedump_init initializes all tracedump components. | ||
460 | * Call this before tracedump_next | ||
461 | */ | ||
462 | int tracedump_init(void) | ||
463 | { | ||
464 | TRY(compress_init()); | ||
465 | return 0; | ||
466 | } | ||
467 | |||
468 | /* tracedump_next will print up to max_out data from the tracing ring | ||
469 | * buffers using the print function selected by print_to. The data is | ||
470 | * compressed using zlib. | ||
471 | * | ||
472 | * The output type of the data is specified by the format_ascii module | ||
473 | * parameter. If format_ascii == 1, human-readable data will be output. | ||
474 | * Otherwise, it will output raw data from the ring buffer in cpu order, | ||
475 | * followed by the saved_cmdlines data. | ||
476 | */ | ||
477 | ssize_t tracedump_next(size_t max_out, int print_to) | ||
478 | { | ||
479 | ssize_t size; | ||
480 | TRY(size = compress_next(max_out)); | ||
481 | print(out_buf, size, print_to); | ||
482 | return size; | ||
483 | } | ||
484 | |||
485 | /* tracedump_all will print all data in the tracing ring buffers using | ||
486 | * the print function selected by print_to. The data is compressed using | ||
487 | * zlib, and is surrounded by MAGIC_NUMBER. | ||
488 | * | ||
489 | * The output type of the data is specified by the format_ascii module | ||
490 | * parameter. If format_ascii == 1, human-readable data will be output. | ||
491 | * Otherwise, it will output raw data from the ring buffer in cpu order, | ||
492 | * followed by the saved_cmdlines data. | ||
493 | */ | ||
494 | ssize_t tracedump_all(int print_to) | ||
495 | { | ||
496 | ssize_t ret, size = 0; | ||
497 | TRY(size += print_magic(print_to)); | ||
498 | |||
499 | do { | ||
500 | /* Here the size used doesn't really matter, | ||
501 | * since we're dumping everything. */ | ||
502 | TRY(ret = tracedump_next(0xFFFFFFFF, print_to)); | ||
503 | size += ret; | ||
504 | } while (ret > 0); | ||
505 | |||
506 | TRY(size += print_magic(print_to)); | ||
507 | |||
508 | return size; | ||
509 | } | ||
510 | |||
511 | /* tracedump_deinit deinitializes all tracedump components. | ||
512 | * This must be called, even on error. | ||
513 | */ | ||
514 | int tracedump_deinit(void) | ||
515 | { | ||
516 | TRY(compress_deinit()); | ||
517 | return 0; | ||
518 | } | ||
519 | |||
520 | /* tracedump_reset reinitializes all tracedump components. */ | ||
521 | int tracedump_reset(void) | ||
522 | { | ||
523 | TRY(compress_reset()); | ||
524 | return 0; | ||
525 | } | ||
526 | |||
527 | |||
528 | |||
529 | /* tracedump_open opens the tracedump file for reading. */ | ||
530 | static int tracedump_open(struct inode *inode, struct file *file) | ||
531 | { | ||
532 | int ret; | ||
533 | mutex_lock(&tracedump_proc_lock); | ||
534 | current_format = format_ascii; | ||
535 | ret = tracedump_init(); | ||
536 | if (ret < 0) | ||
537 | goto err; | ||
538 | |||
539 | ret = nonseekable_open(inode, file); | ||
540 | if (ret < 0) | ||
541 | goto err; | ||
542 | return ret; | ||
543 | |||
544 | err: | ||
545 | mutex_unlock(&tracedump_proc_lock); | ||
546 | return ret; | ||
547 | } | ||
548 | |||
549 | /* tracedump_read reads data from tracedump_next and prints | ||
550 | * it to userspace. It will surround the data with MAGIC_NUMBER. | ||
551 | */ | ||
552 | static ssize_t tracedump_read(struct file *file, char __user *buf, | ||
553 | size_t len, loff_t *offset) | ||
554 | { | ||
555 | static int done; | ||
556 | ssize_t size = 0; | ||
557 | |||
558 | pager.ubuf = buf; | ||
559 | |||
560 | if (*offset == 0) { | ||
561 | done = 0; | ||
562 | TRY(size = print_magic(TD_PRINT_USER)); | ||
563 | } else if (!done) { | ||
564 | TRY(size = tracedump_next(len, TD_PRINT_USER)); | ||
565 | if (size == 0) { | ||
566 | TRY(size = print_magic(TD_PRINT_USER)); | ||
567 | done = 1; | ||
568 | } | ||
569 | } | ||
570 | |||
571 | *offset += size; | ||
572 | |||
573 | return size; | ||
574 | } | ||
575 | |||
576 | static int tracedump_release(struct inode *inode, struct file *file) | ||
577 | { | ||
578 | int ret; | ||
579 | ret = tracedump_deinit(); | ||
580 | mutex_unlock(&tracedump_proc_lock); | ||
581 | return ret; | ||
582 | } | ||
583 | |||
584 | /* tracedump_dump dumps all tracing data from the tracing ring buffers | ||
585 | * to all consoles. For details about the output format, see | ||
586 | * tracedump_all. | ||
587 | * | ||
588 | * At most max_out bytes are dumped. To accomplish this, | ||
589 | * tracedump_dump calls tracedump_all several times without writing the data, | ||
590 | * each time tossing out old data until it reaches its goal. | ||
591 | * | ||
592 | * Note: dumping raw pages currently does NOT follow the size limit. | ||
593 | */ | ||
594 | |||
595 | int tracedump_dump(size_t max_out) | ||
596 | { | ||
597 | ssize_t size; | ||
598 | size_t consume; | ||
599 | |||
600 | printk(TAG "\n"); | ||
601 | |||
602 | tracedump_init(); | ||
603 | |||
604 | if (format_ascii) { | ||
605 | size = tracedump_all(TD_NO_PRINT); | ||
606 | if (size < 0) { | ||
607 | printk(TAG "failed to dump\n"); | ||
608 | goto out; | ||
609 | } | ||
610 | while (size > max_out) { | ||
611 | TRY(tracedump_deinit()); | ||
612 | /* Events take more or less 60 ascii bytes each, | ||
613 | not counting compression */ | ||
614 | consume = TD_MIN_CONSUME + (size - max_out) / | ||
615 | (60 / (compress_level + 1)); | ||
616 | TRY(consume_events(consume)); | ||
617 | TRY(tracedump_init()); | ||
618 | size = tracedump_all(TD_NO_PRINT); | ||
619 | if (size < 0) { | ||
620 | printk(TAG "failed to dump\n"); | ||
621 | goto out; | ||
622 | } | ||
623 | } | ||
624 | |||
625 | TRY(tracedump_reset()); | ||
626 | } | ||
627 | size = tracedump_all(TD_PRINT_CONSOLE); | ||
628 | if (size < 0) { | ||
629 | printk(TAG "failed to dump\n"); | ||
630 | goto out; | ||
631 | } | ||
632 | |||
633 | out: | ||
634 | tracedump_deinit(); | ||
635 | printk(KERN_INFO "\n" TAG " end\n"); | ||
636 | return size; | ||
637 | } | ||
638 | |||
639 | static const struct file_operations tracedump_fops = { | ||
640 | .owner = THIS_MODULE, | ||
641 | .open = tracedump_open, | ||
642 | .read = tracedump_read, | ||
643 | .release = tracedump_release, | ||
644 | }; | ||
645 | |||
646 | #ifdef CONFIG_TRACEDUMP_PANIC | ||
647 | static int tracedump_panic_handler(struct notifier_block *this, | ||
648 | unsigned long event, void *unused) | ||
649 | { | ||
650 | tracedump_dump(panic_size); | ||
651 | return 0; | ||
652 | } | ||
653 | |||
654 | static struct notifier_block tracedump_panic_notifier = { | ||
655 | .notifier_call = tracedump_panic_handler, | ||
656 | .next = NULL, | ||
657 | .priority = 150 /* priority: INT_MAX >= x >= 0 */ | ||
658 | }; | ||
659 | #endif | ||
660 | |||
661 | static int __init tracedump_initcall(void) | ||
662 | { | ||
663 | #ifdef CONFIG_TRACEDUMP_PROCFS | ||
664 | struct proc_dir_entry *entry; | ||
665 | |||
666 | /* Create a procfs file for easy dumping */ | ||
667 | entry = create_proc_entry("tracedump", S_IFREG | S_IRUGO, NULL); | ||
668 | if (!entry) | ||
669 | printk(TAG "failed to create proc entry\n"); | ||
670 | else | ||
671 | entry->proc_fops = &tracedump_fops; | ||
672 | #endif | ||
673 | |||
674 | #ifdef CONFIG_TRACEDUMP_PANIC | ||
675 | /* Automatically dump to console on a kernel panic */ | ||
676 | atomic_notifier_chain_register(&panic_notifier_list, | ||
677 | &tracedump_panic_notifier); | ||
678 | #endif | ||
679 | return 0; | ||
680 | } | ||
681 | |||
682 | early_initcall(tracedump_initcall); | ||
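compress_next() above drives zlib in its standard streaming shape: feed input with Z_NO_FLUSH while the producer still has data, switch to Z_FINISH once it runs dry, and stop when zlib_deflate() returns Z_STREAM_END. A self-contained userspace sketch of the finishing half of that loop (compile with -lz; the payload and buffer sizes are arbitrary, illustration only):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
        static const char input[] = "example payload for the deflate loop";
        unsigned char out[64];
        z_stream zs;
        int ret;

        memset(&zs, 0, sizeof(zs));
        if (deflateInit(&zs, 9) != Z_OK)
                return 1;

        zs.next_in = (unsigned char *)input;
        zs.avail_in = sizeof(input);            /* last chunk, so finish immediately */

        do {
                zs.next_out = out;
                zs.avail_out = sizeof(out);
                ret = deflate(&zs, Z_FINISH);   /* consumes avail_in, fills next_out */
                fwrite(out, 1, sizeof(out) - zs.avail_out, stdout);
        } while (ret != Z_STREAM_END);

        deflateEnd(&zs);
        return 0;
}

In the module, the output buffer is out_buf and the producer behind avail_in is data_next(), which alternates between per-entry ASCII (iter_next) and raw ring-buffer pages plus saved cmdlines (pager_next/cmdline_next).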
diff --git a/kernel/trace/tracelevel.c b/kernel/trace/tracelevel.c new file mode 100644 index 00000000000..9f8b8eedbb5 --- /dev/null +++ b/kernel/trace/tracelevel.c | |||
@@ -0,0 +1,142 @@ | |||
1 | /* | ||
2 | * kernel/trace/tracelevel.c | ||
3 | * | ||
4 | * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms and conditions of the GNU General Public License, | ||
8 | * version 2, as published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
13 | * more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License along with | ||
16 | * this program; if not, write to the Free Software Foundation, Inc., | ||
17 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. | ||
18 | * | ||
19 | */ | ||
20 | |||
21 | #include <linux/ftrace_event.h> | ||
22 | #include <linux/list.h> | ||
23 | #include <linux/moduleparam.h> | ||
24 | #include <linux/mutex.h> | ||
25 | #include <linux/tracelevel.h> | ||
26 | #include <linux/vmalloc.h> | ||
27 | |||
28 | #include "trace.h" | ||
29 | |||
30 | #define TAG KERN_ERR "tracelevel: " | ||
31 | |||
32 | struct tracelevel_record { | ||
33 | struct list_head list; | ||
34 | char *name; | ||
35 | int level; | ||
36 | }; | ||
37 | |||
38 | static LIST_HEAD(tracelevel_list); | ||
39 | |||
40 | static bool started; | ||
41 | static unsigned int tracelevel_level = TRACELEVEL_DEFAULT; | ||
42 | |||
43 | static DEFINE_MUTEX(tracelevel_record_lock); | ||
44 | |||
45 | /* tracelevel_set_event sets a single event if set = 1, or | ||
46 | * clears an event if set = 0. | ||
47 | */ | ||
48 | static int tracelevel_set_event(struct tracelevel_record *evt, bool set) | ||
49 | { | ||
50 | if (trace_set_clr_event(NULL, evt->name, set) < 0) { | ||
51 | printk(TAG "failed to set event %s\n", evt->name); | ||
52 | return -EINVAL; | ||
53 | } | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | /* Registers an event. If possible, it also sets it. | ||
58 | * If not, we'll set it in tracelevel_init. | ||
59 | */ | ||
60 | int __tracelevel_register(char *name, unsigned int level) | ||
61 | { | ||
62 | struct tracelevel_record *evt = | ||
63 | vmalloc(sizeof(struct tracelevel_record)); | ||
64 | if (!evt) { | ||
65 | printk(TAG "failed to allocate tracelevel_record for %s\n", | ||
66 | name); | ||
67 | return -ENOMEM; | ||
68 | } | ||
69 | |||
70 | evt->name = name; | ||
71 | evt->level = level; | ||
72 | |||
73 | mutex_lock(&tracelevel_record_lock); | ||
74 | list_add(&evt->list, &tracelevel_list); | ||
75 | mutex_unlock(&tracelevel_record_lock); | ||
76 | |||
77 | if (level >= tracelevel_level && started) | ||
78 | tracelevel_set_event(evt, 1); | ||
79 | return 0; | ||
80 | } | ||
81 | |||
82 | /* tracelevel_set_level sets the global level, clears events | ||
83 | * below that level, and enables events at or above it. | ||
84 | */ | ||
85 | int tracelevel_set_level(int level) | ||
86 | { | ||
87 | struct tracelevel_record *evt = NULL; | ||
88 | |||
89 | if (level < 0 || level > TRACELEVEL_MAX) | ||
90 | return -EINVAL; | ||
91 | tracelevel_level = level; | ||
92 | |||
93 | mutex_lock(&tracelevel_record_lock); | ||
94 | list_for_each_entry(evt, &tracelevel_list, list) { | ||
95 | if (evt->level >= level) | ||
96 | tracelevel_set_event(evt, 1); | ||
97 | else | ||
98 | tracelevel_set_event(evt, 0); | ||
99 | } | ||
100 | mutex_unlock(&tracelevel_record_lock); | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static int param_set_level(const char *val, const struct kernel_param *kp) | ||
105 | { | ||
106 | long level; | ||
107 | int ret = strict_strtol(val, 0, &level); | ||
108 | if (ret < 0) | ||
109 | return ret; | ||
110 | return tracelevel_set_level(level); | ||
111 | } | ||
112 | |||
113 | static int param_get_level(char *buffer, const struct kernel_param *kp) | ||
114 | { | ||
115 | return param_get_int(buffer, kp); | ||
116 | } | ||
117 | |||
118 | static struct kernel_param_ops tracelevel_level_ops = { | ||
119 | .set = param_set_level, | ||
120 | .get = param_get_level | ||
121 | }; | ||
122 | |||
123 | module_param_cb(level, &tracelevel_level_ops, &tracelevel_level, 0644); | ||
124 | |||
125 | /* Turn on the tracing that has been registered thus far. */ | ||
126 | static int __init tracelevel_init(void) | ||
127 | { | ||
128 | int ret; | ||
129 | started = true; | ||
130 | |||
131 | /* The ring buffer is initialized to 1 page until the user sets a tracer. | ||
132 | * Since we're doing this manually, we need to ask for an expanded buffer. | ||
133 | */ | ||
134 | ret = tracing_update_buffers(); | ||
135 | if (ret < 0) | ||
136 | return ret; | ||
137 | |||
138 | return tracelevel_set_level(tracelevel_level); | ||
139 | } | ||
140 | |||
141 | /* The tracing machinery is set up during fs_initcall, so run just after it. */ | ||
142 | fs_initcall_sync(tracelevel_init); | ||
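A hypothetical consumer of the tracelevel API added above could look like the sketch below. The __tracelevel_register() prototype and the TRACELEVEL_DEFAULT constant are assumed to come from the <linux/tracelevel.h> header introduced by this series; the event name and the my_* identifiers are illustrative only.

#include <linux/init.h>
#include <linux/tracelevel.h>

/* The record stores the pointer, not a copy, so the name must stay valid. */
static char my_event_name[] = "sched_switch";

static int __init my_driver_trace_init(void)
{
	/*
	 * The event is enabled whenever its registered level is greater
	 * than or equal to the global level, which can be changed at
	 * runtime through the "level" module parameter defined above.
	 */
	return __tracelevel_register(my_event_name, TRACELEVEL_DEFAULT);
}
device_initcall(my_driver_trace_init);

Since the parameter is registered with module_param_cb(level, ...) in built-in code, the level would typically be adjusted through /sys/module/tracelevel/parameters/level or a tracelevel.level= boot argument.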
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 24dc60d9fa1..5bbfac85866 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -78,6 +78,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | |||
78 | 78 | ||
79 | #define KB 1024 | 79 | #define KB 1024 |
80 | #define MB (1024*KB) | 80 | #define MB (1024*KB) |
81 | #define KB_MASK (~(KB-1)) | ||
81 | /* | 82 | /* |
82 | * fill in extended accounting fields | 83 | * fill in extended accounting fields |
83 | */ | 84 | */ |
@@ -95,14 +96,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | |||
95 | stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; | 96 | stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; |
96 | mmput(mm); | 97 | mmput(mm); |
97 | } | 98 | } |
98 | stats->read_char = p->ioac.rchar; | 99 | stats->read_char = p->ioac.rchar & KB_MASK; |
99 | stats->write_char = p->ioac.wchar; | 100 | stats->write_char = p->ioac.wchar & KB_MASK; |
100 | stats->read_syscalls = p->ioac.syscr; | 101 | stats->read_syscalls = p->ioac.syscr & KB_MASK; |
101 | stats->write_syscalls = p->ioac.syscw; | 102 | stats->write_syscalls = p->ioac.syscw & KB_MASK; |
102 | #ifdef CONFIG_TASK_IO_ACCOUNTING | 103 | #ifdef CONFIG_TASK_IO_ACCOUNTING |
103 | stats->read_bytes = p->ioac.read_bytes; | 104 | stats->read_bytes = p->ioac.read_bytes & KB_MASK; |
104 | stats->write_bytes = p->ioac.write_bytes; | 105 | stats->write_bytes = p->ioac.write_bytes & KB_MASK; |
105 | stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; | 106 | stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK; |
106 | #else | 107 | #else |
107 | stats->read_bytes = 0; | 108 | stats->read_bytes = 0; |
108 | stats->write_bytes = 0; | 109 | stats->write_bytes = 0; |
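The new KB_MASK clears the low 10 bits of each accounting counter, rounding it down to a whole multiple of 1024 before it is reported through taskstats. A stand-alone illustration of the arithmetic (not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define KB      1024
#define KB_MASK (~(KB - 1))

int main(void)
{
	uint64_t rchar = 5000;	/* pretend the task read 5000 bytes */

	/* 5000 & ~1023 == 4096: the value is truncated to KiB granularity. */
	printf("%llu -> %llu\n", (unsigned long long)rchar,
	       (unsigned long long)(rchar & KB_MASK));
	return 0;
}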
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3d0c56ad479..36491cd5b7d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -200,6 +200,7 @@ static int is_softlockup(unsigned long touch_ts) | |||
200 | } | 200 | } |
201 | 201 | ||
202 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 202 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
203 | |||
203 | static struct perf_event_attr wd_hw_attr = { | 204 | static struct perf_event_attr wd_hw_attr = { |
204 | .type = PERF_TYPE_HARDWARE, | 205 | .type = PERF_TYPE_HARDWARE, |
205 | .config = PERF_COUNT_HW_CPU_CYCLES, | 206 | .config = PERF_COUNT_HW_CPU_CYCLES, |
@@ -209,7 +210,7 @@ static struct perf_event_attr wd_hw_attr = { | |||
209 | }; | 210 | }; |
210 | 211 | ||
211 | /* Callback function for perf event subsystem */ | 212 | /* Callback function for perf event subsystem */ |
212 | static void watchdog_overflow_callback(struct perf_event *event, int nmi, | 213 | static void watchdog_overflow_callback(struct perf_event *event, |
213 | struct perf_sample_data *data, | 214 | struct perf_sample_data *data, |
214 | struct pt_regs *regs) | 215 | struct pt_regs *regs) |
215 | { | 216 | { |
@@ -368,10 +369,11 @@ static int watchdog_nmi_enable(int cpu) | |||
368 | if (event != NULL) | 369 | if (event != NULL) |
369 | goto out_enable; | 370 | goto out_enable; |
370 | 371 | ||
371 | /* Try to register using hardware perf events */ | ||
372 | wd_attr = &wd_hw_attr; | 372 | wd_attr = &wd_hw_attr; |
373 | wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); | 373 | wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); |
374 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); | 374 | |
375 | /* Try to register using hardware perf events */ | ||
376 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | ||
375 | if (!IS_ERR(event)) { | 377 | if (!IS_ERR(event)) { |
376 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); | 378 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); |
377 | goto out_save; | 379 | goto out_save; |
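This hunk adapts the watchdog to the reworked perf API of this kernel generation: overflow handlers no longer take an int nmi argument, and perf_event_create_kernel_counter() gained a trailing context pointer that is handed back to the handler. A condensed sketch of the new calling convention; the my_* names and the sample period are invented:

#include <linux/perf_event.h>

/* New-style overflow handler: the former "int nmi" parameter is gone. */
static void my_overflow_handler(struct perf_event *event,
				struct perf_sample_data *data,
				struct pt_regs *regs)
{
	/* e.g. check whether this CPU has made forward progress */
}

static struct perf_event_attr my_hw_attr = {
	.type		= PERF_TYPE_HARDWARE,
	.config		= PERF_COUNT_HW_CPU_CYCLES,
	.size		= sizeof(struct perf_event_attr),
	.pinned		= 1,
	.disabled	= 1,
};

static struct perf_event *my_counter_create(int cpu)
{
	my_hw_attr.sample_period = 10000000;	/* arbitrary example period */

	/* The final argument is the new opaque context pointer (unused here). */
	return perf_event_create_kernel_counter(&my_hw_attr, cpu,
						NULL /* all tasks */,
						my_overflow_handler,
						NULL /* context */);
}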
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0400553f0d0..1783aabc612 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t; | |||
221 | * per-CPU workqueues: | 221 | * per-CPU workqueues: |
222 | */ | 222 | */ |
223 | struct workqueue_struct { | 223 | struct workqueue_struct { |
224 | unsigned int flags; /* I: WQ_* flags */ | 224 | unsigned int flags; /* W: WQ_* flags */ |
225 | union { | 225 | union { |
226 | struct cpu_workqueue_struct __percpu *pcpu; | 226 | struct cpu_workqueue_struct __percpu *pcpu; |
227 | struct cpu_workqueue_struct *single; | 227 | struct cpu_workqueue_struct *single; |
@@ -240,6 +240,7 @@ struct workqueue_struct { | |||
240 | mayday_mask_t mayday_mask; /* cpus requesting rescue */ | 240 | mayday_mask_t mayday_mask; /* cpus requesting rescue */ |
241 | struct worker *rescuer; /* I: rescue worker */ | 241 | struct worker *rescuer; /* I: rescue worker */ |
242 | 242 | ||
243 | int nr_drainers; /* W: drain in progress */ | ||
243 | int saved_max_active; /* W: saved cwq max_active */ | 244 | int saved_max_active; /* W: saved cwq max_active */ |
244 | const char *name; /* I: workqueue name */ | 245 | const char *name; /* I: workqueue name */ |
245 | #ifdef CONFIG_LOCKDEP | 246 | #ifdef CONFIG_LOCKDEP |
@@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
990 | debug_work_activate(work); | 991 | debug_work_activate(work); |
991 | 992 | ||
992 | /* if dying, only works from the same workqueue are allowed */ | 993 | /* if dying, only works from the same workqueue are allowed */ |
993 | if (unlikely(wq->flags & WQ_DYING) && | 994 | if (unlikely(wq->flags & WQ_DRAINING) && |
994 | WARN_ON_ONCE(!is_chained_work(wq))) | 995 | WARN_ON_ONCE(!is_chained_work(wq))) |
995 | return; | 996 | return; |
996 | 997 | ||
@@ -2381,6 +2382,59 @@ out_unlock: | |||
2381 | } | 2382 | } |
2382 | EXPORT_SYMBOL_GPL(flush_workqueue); | 2383 | EXPORT_SYMBOL_GPL(flush_workqueue); |
2383 | 2384 | ||
2385 | /** | ||
2386 | * drain_workqueue - drain a workqueue | ||
2387 | * @wq: workqueue to drain | ||
2388 | * | ||
2389 | * Wait until the workqueue becomes empty. While draining is in progress, | ||
2390 | * only chain queueing is allowed. IOW, only currently pending or running | ||
2391 | * work items on @wq can queue further work items on it. @wq is flushed | ||
2392 | * repeatedly until it becomes empty. The number of flushes is determined | ||
2393 | * by the depth of chaining and should be relatively small. Whine if it | ||
2394 | * takes too long. | ||
2395 | */ | ||
2396 | void drain_workqueue(struct workqueue_struct *wq) | ||
2397 | { | ||
2398 | unsigned int flush_cnt = 0; | ||
2399 | unsigned int cpu; | ||
2400 | |||
2401 | /* | ||
2402 | * __queue_work() needs to test whether a drain is in progress; it is | ||
2403 | * much hotter than drain_workqueue() and already looks at @wq->flags, | ||
2404 | * so use WQ_DRAINING rather than making it also check nr_drainers. | ||
2405 | */ | ||
2406 | spin_lock(&workqueue_lock); | ||
2407 | if (!wq->nr_drainers++) | ||
2408 | wq->flags |= WQ_DRAINING; | ||
2409 | spin_unlock(&workqueue_lock); | ||
2410 | reflush: | ||
2411 | flush_workqueue(wq); | ||
2412 | |||
2413 | for_each_cwq_cpu(cpu, wq) { | ||
2414 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
2415 | bool drained; | ||
2416 | |||
2417 | spin_lock_irq(&cwq->gcwq->lock); | ||
2418 | drained = !cwq->nr_active && list_empty(&cwq->delayed_works); | ||
2419 | spin_unlock_irq(&cwq->gcwq->lock); | ||
2420 | |||
2421 | if (drained) | ||
2422 | continue; | ||
2423 | |||
2424 | if (++flush_cnt == 10 || | ||
2425 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) | ||
2426 | pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", | ||
2427 | wq->name, flush_cnt); | ||
2428 | goto reflush; | ||
2429 | } | ||
2430 | |||
2431 | spin_lock(&workqueue_lock); | ||
2432 | if (!--wq->nr_drainers) | ||
2433 | wq->flags &= ~WQ_DRAINING; | ||
2434 | spin_unlock(&workqueue_lock); | ||
2435 | } | ||
2436 | EXPORT_SYMBOL_GPL(drain_workqueue); | ||
2437 | |||
2384 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, | 2438 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, |
2385 | bool wait_executing) | 2439 | bool wait_executing) |
2386 | { | 2440 | { |
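drain_workqueue() factors the drain loop out of destroy_workqueue() so that callers can quiesce a queue without tearing it down; while WQ_DRAINING is set, only work items already on the queue may requeue themselves. A hypothetical caller, with my_wq standing in for any existing workqueue:

#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;

static void my_quiesce(void)
{
	/*
	 * Block until my_wq is empty, following chains of self-requeueing
	 * work items; unrelated queueing attempts during the drain trip
	 * the WARN_ON_ONCE() in __queue_work().
	 */
	drain_workqueue(my_wq);

	/* The queue remains usable afterwards; no need to recreate it. */
}

As the next hunk shows, destroy_workqueue() itself now simply calls drain_workqueue() before unregistering the queue.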
@@ -3009,34 +3063,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); | |||
3009 | */ | 3063 | */ |
3010 | void destroy_workqueue(struct workqueue_struct *wq) | 3064 | void destroy_workqueue(struct workqueue_struct *wq) |
3011 | { | 3065 | { |
3012 | unsigned int flush_cnt = 0; | ||
3013 | unsigned int cpu; | 3066 | unsigned int cpu; |
3014 | 3067 | ||
3015 | /* | 3068 | /* drain it before proceeding with destruction */ |
3016 | * Mark @wq dying and drain all pending works. Once WQ_DYING is | 3069 | drain_workqueue(wq); |
3017 | * set, only chain queueing is allowed. IOW, only currently | ||
3018 | * pending or running work items on @wq can queue further work | ||
3019 | * items on it. @wq is flushed repeatedly until it becomes empty. | ||
3020 | * The number of flushing is detemined by the depth of chaining and | ||
3021 | * should be relatively short. Whine if it takes too long. | ||
3022 | */ | ||
3023 | wq->flags |= WQ_DYING; | ||
3024 | reflush: | ||
3025 | flush_workqueue(wq); | ||
3026 | |||
3027 | for_each_cwq_cpu(cpu, wq) { | ||
3028 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
3029 | |||
3030 | if (!cwq->nr_active && list_empty(&cwq->delayed_works)) | ||
3031 | continue; | ||
3032 | |||
3033 | if (++flush_cnt == 10 || | ||
3034 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) | ||
3035 | printk(KERN_WARNING "workqueue %s: flush on " | ||
3036 | "destruction isn't complete after %u tries\n", | ||
3037 | wq->name, flush_cnt); | ||
3038 | goto reflush; | ||
3039 | } | ||
3040 | 3070 | ||
3041 | /* | 3071 | /* |
3042 | * wq list is used to freeze wq, remove from list after | 3072 | * wq list is used to freeze wq, remove from list after |