Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt               |   3
-rw-r--r--  kernel/async.c                       |  12
-rw-r--r--  kernel/audit.c                       |  29
-rw-r--r--  kernel/audit_tree.c                  |   8
-rw-r--r--  kernel/cgroup.c                      |   3
-rw-r--r--  kernel/compat.c                      |   1
-rw-r--r--  kernel/delayacct.c                   |   2
-rw-r--r--  kernel/events/Makefile               |   2
-rw-r--r--  kernel/events/core.c                 | 938
-rw-r--r--  kernel/events/hw_breakpoint.c        |  10
-rw-r--r--  kernel/events/internal.h             |  96
-rw-r--r--  kernel/events/ring_buffer.c          | 380
-rw-r--r--  kernel/exit.c                        |  92
-rw-r--r--  kernel/fork.c                        |  37
-rw-r--r--  kernel/futex.c                       |   2
-rw-r--r--  kernel/irq/generic-chip.c            |  18
-rw-r--r--  kernel/kprobes.c                     |  33
-rw-r--r--  kernel/lockdep.c                     |  33
-rw-r--r--  kernel/module.c                      |  80
-rw-r--r--  kernel/nsproxy.c                     |   4
-rw-r--r--  kernel/params.c                      |  18
-rw-r--r--  kernel/pm_qos_params.c               |   6
-rw-r--r--  kernel/power/Kconfig                 |   8
-rw-r--r--  kernel/power/main.c                  |   5
-rw-r--r--  kernel/power/suspend.c               |  20
-rw-r--r--  kernel/printk.c                      |  24
-rw-r--r--  kernel/ptrace.c                      | 197
-rw-r--r--  kernel/rcutree.c                     |  26
-rw-r--r--  kernel/rcutree_plugin.h              |  68
-rw-r--r--  kernel/rtmutex.c                     |   2
-rw-r--r--  kernel/rwsem.c                       |  16
-rw-r--r--  kernel/sched.c                       | 449
-rw-r--r--  kernel/sched_autogroup.h             |   1
-rw-r--r--  kernel/sched_fair.c                  | 118
-rw-r--r--  kernel/sched_features.h              |   6
-rw-r--r--  kernel/sched_rt.c                    |  26
-rw-r--r--  kernel/signal.c                      | 444
-rw-r--r--  kernel/softirq.c                     |  12
-rw-r--r--  kernel/stacktrace.c                  |  12
-rw-r--r--  kernel/stop_machine.c                |  78
-rw-r--r--  kernel/sysctl.c                      |  11
-rw-r--r--  kernel/time/timekeeping.c            |  28
-rw-r--r--  kernel/trace/ftrace.c                | 157
-rw-r--r--  kernel/trace/ring_buffer.c           |  66
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c |   2
-rw-r--r--  kernel/trace/trace.c                 | 326
-rw-r--r--  kernel/trace/trace.h                 |  61
-rw-r--r--  kernel/trace/trace_entries.h         |   3
-rw-r--r--  kernel/trace/trace_events.c          | 139
-rw-r--r--  kernel/trace/trace_events_filter.c   |   6
-rw-r--r--  kernel/trace/trace_functions.c       |   3
-rw-r--r--  kernel/trace/trace_functions_graph.c | 225
-rw-r--r--  kernel/trace/trace_irqsoff.c         |   4
-rw-r--r--  kernel/trace/trace_kprobe.c          | 324
-rw-r--r--  kernel/trace/trace_output.c          |  11
-rw-r--r--  kernel/trace/trace_sched_wakeup.c    |   4
-rw-r--r--  kernel/trace/trace_stack.c           |  13
-rw-r--r--  kernel/watchdog.c                    |   8
-rw-r--r--  kernel/workqueue.c                   |  81
59 files changed, 3011 insertions, 1780 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b356..24e7cb0ba26a 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
35 35
36config PREEMPT 36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)" 37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 select PREEMPT_COUNT
38 help 39 help
39 This option reduces the latency of the kernel by making 40 This option reduces the latency of the kernel by making
40 all kernel code (that is not executing in a critical section) 41 all kernel code (that is not executing in a critical section)
@@ -52,3 +53,5 @@ config PREEMPT
52 53
53endchoice 54endchoice
54 55
56config PREEMPT_COUNT
 57 bool
\ No newline at end of file
diff --git a/kernel/async.c b/kernel/async.c
index cd9dbb913c77..d5fe7af0de2e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,12 +49,13 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h>
53#include <linux/ktime.h>
52#include <linux/module.h> 54#include <linux/module.h>
53#include <linux/wait.h> 55#include <linux/wait.h>
54#include <linux/sched.h> 56#include <linux/sched.h>
55#include <linux/slab.h> 57#include <linux/slab.h>
56#include <linux/workqueue.h> 58#include <linux/workqueue.h>
57#include <asm/atomic.h>
58 59
59static async_cookie_t next_cookie = 1; 60static async_cookie_t next_cookie = 1;
60 61
@@ -128,7 +129,8 @@ static void async_run_entry_fn(struct work_struct *work)
128 129
129 /* 2) run (and print duration) */ 130 /* 2) run (and print duration) */
130 if (initcall_debug && system_state == SYSTEM_BOOTING) { 131 if (initcall_debug && system_state == SYSTEM_BOOTING) {
131 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, 132 printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
133 (long long)entry->cookie,
132 entry->func, task_pid_nr(current)); 134 entry->func, task_pid_nr(current));
133 calltime = ktime_get(); 135 calltime = ktime_get();
134 } 136 }
@@ -136,7 +138,7 @@ static void async_run_entry_fn(struct work_struct *work)
136 if (initcall_debug && system_state == SYSTEM_BOOTING) { 138 if (initcall_debug && system_state == SYSTEM_BOOTING) {
137 rettime = ktime_get(); 139 rettime = ktime_get();
138 delta = ktime_sub(rettime, calltime); 140 delta = ktime_sub(rettime, calltime);
139 printk("initcall %lli_%pF returned 0 after %lld usecs\n", 141 printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
140 (long long)entry->cookie, 142 (long long)entry->cookie,
141 entry->func, 143 entry->func,
142 (long long)ktime_to_ns(delta) >> 10); 144 (long long)ktime_to_ns(delta) >> 10);
@@ -270,7 +272,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
270 ktime_t starttime, delta, endtime; 272 ktime_t starttime, delta, endtime;
271 273
272 if (initcall_debug && system_state == SYSTEM_BOOTING) { 274 if (initcall_debug && system_state == SYSTEM_BOOTING) {
273 printk("async_waiting @ %i\n", task_pid_nr(current)); 275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
274 starttime = ktime_get(); 276 starttime = ktime_get();
275 } 277 }
276 278
@@ -280,7 +282,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
280 endtime = ktime_get(); 282 endtime = ktime_get();
281 delta = ktime_sub(endtime, starttime); 283 delta = ktime_sub(endtime, starttime);
282 284
283 printk("async_continuing @ %i after %lli usec\n", 285 printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
284 task_pid_nr(current), 286 task_pid_nr(current),
285 (long long)ktime_to_ns(delta) >> 10); 287 (long long)ktime_to_ns(delta) >> 10);
286 } 288 }
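The async.c hunks above only add explicit KERN_DEBUG levels to the boot-time debug messages. As a general illustration (a minimal sketch, not taken from this patch; the function name and message text are made up), an unprefixed printk() falls back to the kernel's default log level, while a prefixed call or pr_debug() pins the severity:

#include <linux/kernel.h>
#include <linux/printk.h>

static void report_duration(long long usecs)
{
	/* No level prefix: logged at the kernel's default log level. */
	printk("operation took %lld usecs\n", usecs);

	/* Explicit level, as the patch above does for the async debug output. */
	printk(KERN_DEBUG "operation took %lld usecs\n", usecs);

	/* Debug-level helper; emitted only with DEBUG or dynamic debug enabled. */
	pr_debug("operation took %lld usecs\n", usecs);
}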
diff --git a/kernel/audit.c b/kernel/audit.c
index 939500317066..52501b5d4902 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -55,6 +55,9 @@
55#include <net/sock.h> 55#include <net/sock.h>
56#include <net/netlink.h> 56#include <net/netlink.h>
57#include <linux/skbuff.h> 57#include <linux/skbuff.h>
58#ifdef CONFIG_SECURITY
59#include <linux/security.h>
60#endif
58#include <linux/netlink.h> 61#include <linux/netlink.h>
59#include <linux/freezer.h> 62#include <linux/freezer.h>
60#include <linux/tty.h> 63#include <linux/tty.h>
@@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
1502 } 1505 }
1503} 1506}
1504 1507
1508#ifdef CONFIG_SECURITY
1509/**
1510 * audit_log_secctx - Converts and logs SELinux context
1511 * @ab: audit_buffer
1512 * @secid: security number
1513 *
1514 * This is a helper function that calls security_secid_to_secctx to convert
1515 * secid to secctx and then adds the (converted) SELinux context to the audit
1516 * log by calling audit_log_format, thus also preventing leak of internal secid
1517 * to userspace. If secid cannot be converted audit_panic is called.
1518 */
1519void audit_log_secctx(struct audit_buffer *ab, u32 secid)
1520{
1521 u32 len;
1522 char *secctx;
1523
1524 if (security_secid_to_secctx(secid, &secctx, &len)) {
1525 audit_panic("Cannot convert secid to context");
1526 } else {
1527 audit_log_format(ab, " obj=%s", secctx);
1528 security_release_secctx(secctx, len);
1529 }
1530}
1531EXPORT_SYMBOL(audit_log_secctx);
1532#endif
1533
1505EXPORT_SYMBOL(audit_log_start); 1534EXPORT_SYMBOL(audit_log_start);
1506EXPORT_SYMBOL(audit_log_end); 1535EXPORT_SYMBOL(audit_log_end);
1507EXPORT_SYMBOL(audit_log_format); 1536EXPORT_SYMBOL(audit_log_format);
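A minimal sketch of how a caller might use the new audit_log_secctx() helper, assuming it already holds a secid to report; the function name, message type and "op=" field below are illustrative, not part of this patch:

#include <linux/audit.h>
#include <linux/gfp.h>

static void log_object_context(struct audit_context *ctx, u32 secid)
{
	struct audit_buffer *ab;

	ab = audit_log_start(ctx, GFP_KERNEL, AUDIT_KERNEL);
	if (!ab)
		return;		/* auditing disabled or allocation failed */

	audit_log_format(ab, "op=example");
	/* New helper from the hunk above: converts secid and appends " obj=<secctx>". */
	audit_log_secctx(ab, secid);
	audit_log_end(ab);
}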
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e99dda04b126..5bf0790497e7 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -93,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree)
93 atomic_inc(&tree->count); 93 atomic_inc(&tree->count);
94} 94}
95 95
96static void __put_tree(struct rcu_head *rcu)
97{
98 struct audit_tree *tree = container_of(rcu, struct audit_tree, head);
99 kfree(tree);
100}
101
102static inline void put_tree(struct audit_tree *tree) 96static inline void put_tree(struct audit_tree *tree)
103{ 97{
104 if (atomic_dec_and_test(&tree->count)) 98 if (atomic_dec_and_test(&tree->count))
105 call_rcu(&tree->head, __put_tree); 99 kfree_rcu(tree, head);
106} 100}
107 101
108/* to avoid bringing the entire thing in audit.h */ 102/* to avoid bringing the entire thing in audit.h */
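The audit_tree.c hunk replaces an open-coded call_rcu() callback that only did kfree() with kfree_rcu(). A minimal sketch of the general pattern, using an illustrative structure that is not from this patch:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_node {
	int value;
	struct rcu_head rcu;	/* storage used by the RCU grace-period machinery */
};

static void example_release(struct example_node *node)
{
	/*
	 * Before: a dedicated callback passed to call_rcu() whose only job
	 * was to kfree() the object.  After: kfree_rcu() frees the object
	 * once a grace period has elapsed, with no callback function at all.
	 */
	kfree_rcu(node, rcu);
}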
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5ae71d6e274b..a63507b92ca4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3540,7 +3540,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3540 } 3540 }
3541 3541
3542 /* the process need read permission on control file */ 3542 /* the process need read permission on control file */
3543 ret = file_permission(cfile, MAY_READ); 3543 /* AV: shouldn't we check that it's been opened for read instead? */
3544 ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
3544 if (ret < 0) 3545 if (ret < 0)
3545 goto fail; 3546 goto fail;
3546 3547
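The in-line "AV:" comment in the cgroup.c hunk asks whether the check should instead verify that the control file was actually opened for reading. A hedged sketch of that alternative check (illustrative only; it is not what the patch applies):

#include <linux/fs.h>

/* Return 0 if cfile was opened with read access, else -EBADF. */
static int check_opened_for_read(struct file *cfile)
{
	if (!(cfile->f_mode & FMODE_READ))
		return -EBADF;
	return 0;
}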
diff --git a/kernel/compat.c b/kernel/compat.c
index fc9eb093acd5..18197ae2d465 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -890,6 +890,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
890 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 890 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
891 } 891 }
892} 892}
893EXPORT_SYMBOL_GPL(sigset_from_compat);
893 894
894asmlinkage long 895asmlinkage long
895compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, 896compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b610aa71..418b3f7053aa 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,8 +19,10 @@
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/delayacct.h> 21#include <linux/delayacct.h>
22#include <linux/module.h>
22 23
23int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ 24int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
25EXPORT_SYMBOL_GPL(delayacct_on);
24struct kmem_cache *delayacct_cache; 26struct kmem_cache *delayacct_cache;
25 27
26static int __init delayacct_setup_disable(char *str) 28static int __init delayacct_setup_disable(char *str)
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 1ce23d3d8394..89e5e8aa4c36 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o 5obj-y := core.o ring_buffer.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9efe7108ccaf..b8785e26ee1c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,8 @@
36#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
37#include <linux/hw_breakpoint.h> 37#include <linux/hw_breakpoint.h>
38 38
39#include "internal.h"
40
39#include <asm/irq_regs.h> 41#include <asm/irq_regs.h>
40 42
41struct remote_function_call { 43struct remote_function_call {
@@ -200,6 +202,22 @@ __get_cpu_context(struct perf_event_context *ctx)
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 202 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201} 203}
202 204
205static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
206 struct perf_event_context *ctx)
207{
208 raw_spin_lock(&cpuctx->ctx.lock);
209 if (ctx)
210 raw_spin_lock(&ctx->lock);
211}
212
213static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
214 struct perf_event_context *ctx)
215{
216 if (ctx)
217 raw_spin_unlock(&ctx->lock);
218 raw_spin_unlock(&cpuctx->ctx.lock);
219}
220
203#ifdef CONFIG_CGROUP_PERF 221#ifdef CONFIG_CGROUP_PERF
204 222
205/* 223/*
@@ -340,11 +358,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
340 rcu_read_lock(); 358 rcu_read_lock();
341 359
342 list_for_each_entry_rcu(pmu, &pmus, entry) { 360 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 361 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345 362
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /* 363 /*
349 * perf_cgroup_events says at least one 364 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events. 365 * context on this CPU has cgroup events.
@@ -353,6 +368,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
353 * events for a context. 368 * events for a context.
354 */ 369 */
355 if (cpuctx->ctx.nr_cgroups > 0) { 370 if (cpuctx->ctx.nr_cgroups > 0) {
371 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
372 perf_pmu_disable(cpuctx->ctx.pmu);
356 373
357 if (mode & PERF_CGROUP_SWOUT) { 374 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL); 375 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -372,9 +389,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
372 cpuctx->cgrp = perf_cgroup_from_task(task); 389 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); 390 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 } 391 }
392 perf_pmu_enable(cpuctx->ctx.pmu);
393 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
375 } 394 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 } 395 }
379 396
380 rcu_read_unlock(); 397 rcu_read_unlock();
@@ -731,6 +748,7 @@ static u64 perf_event_time(struct perf_event *event)
731 748
732/* 749/*
733 * Update the total_time_enabled and total_time_running fields for a event. 750 * Update the total_time_enabled and total_time_running fields for a event.
751 * The caller of this function needs to hold the ctx->lock.
734 */ 752 */
735static void update_event_times(struct perf_event *event) 753static void update_event_times(struct perf_event *event)
736{ 754{
@@ -1105,6 +1123,10 @@ static int __perf_remove_from_context(void *info)
1105 raw_spin_lock(&ctx->lock); 1123 raw_spin_lock(&ctx->lock);
1106 event_sched_out(event, cpuctx, ctx); 1124 event_sched_out(event, cpuctx, ctx);
1107 list_del_event(event, ctx); 1125 list_del_event(event, ctx);
1126 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1127 ctx->is_active = 0;
1128 cpuctx->task_ctx = NULL;
1129 }
1108 raw_spin_unlock(&ctx->lock); 1130 raw_spin_unlock(&ctx->lock);
1109 1131
1110 return 0; 1132 return 0;
@@ -1454,8 +1476,24 @@ static void add_event_to_ctx(struct perf_event *event,
1454 event->tstamp_stopped = tstamp; 1476 event->tstamp_stopped = tstamp;
1455} 1477}
1456 1478
1457static void perf_event_context_sched_in(struct perf_event_context *ctx, 1479static void task_ctx_sched_out(struct perf_event_context *ctx);
1458 struct task_struct *tsk); 1480static void
1481ctx_sched_in(struct perf_event_context *ctx,
1482 struct perf_cpu_context *cpuctx,
1483 enum event_type_t event_type,
1484 struct task_struct *task);
1485
1486static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1487 struct perf_event_context *ctx,
1488 struct task_struct *task)
1489{
1490 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1491 if (ctx)
1492 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1493 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1494 if (ctx)
1495 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1496}
1459 1497
1460/* 1498/*
1461 * Cross CPU call to install and enable a performance event 1499 * Cross CPU call to install and enable a performance event
@@ -1466,20 +1504,37 @@ static int __perf_install_in_context(void *info)
1466{ 1504{
1467 struct perf_event *event = info; 1505 struct perf_event *event = info;
1468 struct perf_event_context *ctx = event->ctx; 1506 struct perf_event_context *ctx = event->ctx;
1469 struct perf_event *leader = event->group_leader;
1470 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1507 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1471 int err; 1508 struct perf_event_context *task_ctx = cpuctx->task_ctx;
1509 struct task_struct *task = current;
1510
1511 perf_ctx_lock(cpuctx, task_ctx);
1512 perf_pmu_disable(cpuctx->ctx.pmu);
1472 1513
1473 /* 1514 /*
1474 * In case we're installing a new context to an already running task, 1515 * If there was an active task_ctx schedule it out.
1475 * could also happen before perf_event_task_sched_in() on architectures
1476 * which do context switches with IRQs enabled.
1477 */ 1516 */
1478 if (ctx->task && !cpuctx->task_ctx) 1517 if (task_ctx)
1479 perf_event_context_sched_in(ctx, ctx->task); 1518 task_ctx_sched_out(task_ctx);
1519
1520 /*
1521 * If the context we're installing events in is not the
1522 * active task_ctx, flip them.
1523 */
1524 if (ctx->task && task_ctx != ctx) {
1525 if (task_ctx)
1526 raw_spin_unlock(&task_ctx->lock);
1527 raw_spin_lock(&ctx->lock);
1528 task_ctx = ctx;
1529 }
1530
1531 if (task_ctx) {
1532 cpuctx->task_ctx = task_ctx;
1533 task = task_ctx->task;
1534 }
1535
1536 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
1480 1537
1481 raw_spin_lock(&ctx->lock);
1482 ctx->is_active = 1;
1483 update_context_time(ctx); 1538 update_context_time(ctx);
1484 /* 1539 /*
1485 * update cgrp time only if current cgrp 1540 * update cgrp time only if current cgrp
@@ -1490,43 +1545,13 @@ static int __perf_install_in_context(void *info)
1490 1545
1491 add_event_to_ctx(event, ctx); 1546 add_event_to_ctx(event, ctx);
1492 1547
1493 if (!event_filter_match(event))
1494 goto unlock;
1495
1496 /*
1497 * Don't put the event on if it is disabled or if
1498 * it is in a group and the group isn't on.
1499 */
1500 if (event->state != PERF_EVENT_STATE_INACTIVE ||
1501 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
1502 goto unlock;
1503
1504 /* 1548 /*
1505 * An exclusive event can't go on if there are already active 1549 * Schedule everything back in
1506 * hardware events, and no hardware event can go on if there
1507 * is already an exclusive event on.
1508 */ 1550 */
1509 if (!group_can_go_on(event, cpuctx, 1)) 1551 perf_event_sched_in(cpuctx, task_ctx, task);
1510 err = -EEXIST;
1511 else
1512 err = event_sched_in(event, cpuctx, ctx);
1513
1514 if (err) {
1515 /*
1516 * This event couldn't go on. If it is in a group
1517 * then we have to pull the whole group off.
1518 * If the event group is pinned then put it in error state.
1519 */
1520 if (leader != event)
1521 group_sched_out(leader, cpuctx, ctx);
1522 if (leader->attr.pinned) {
1523 update_group_times(leader);
1524 leader->state = PERF_EVENT_STATE_ERROR;
1525 }
1526 }
1527 1552
1528unlock: 1553 perf_pmu_enable(cpuctx->ctx.pmu);
1529 raw_spin_unlock(&ctx->lock); 1554 perf_ctx_unlock(cpuctx, task_ctx);
1530 1555
1531 return 0; 1556 return 0;
1532} 1557}
@@ -1739,7 +1764,7 @@ out:
1739 raw_spin_unlock_irq(&ctx->lock); 1764 raw_spin_unlock_irq(&ctx->lock);
1740} 1765}
1741 1766
1742static int perf_event_refresh(struct perf_event *event, int refresh) 1767int perf_event_refresh(struct perf_event *event, int refresh)
1743{ 1768{
1744 /* 1769 /*
1745 * not supported on inherited events 1770 * not supported on inherited events
@@ -1752,36 +1777,35 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1752 1777
1753 return 0; 1778 return 0;
1754} 1779}
1780EXPORT_SYMBOL_GPL(perf_event_refresh);
1755 1781
1756static void ctx_sched_out(struct perf_event_context *ctx, 1782static void ctx_sched_out(struct perf_event_context *ctx,
1757 struct perf_cpu_context *cpuctx, 1783 struct perf_cpu_context *cpuctx,
1758 enum event_type_t event_type) 1784 enum event_type_t event_type)
1759{ 1785{
1760 struct perf_event *event; 1786 struct perf_event *event;
1787 int is_active = ctx->is_active;
1761 1788
1762 raw_spin_lock(&ctx->lock); 1789 ctx->is_active &= ~event_type;
1763 perf_pmu_disable(ctx->pmu);
1764 ctx->is_active = 0;
1765 if (likely(!ctx->nr_events)) 1790 if (likely(!ctx->nr_events))
1766 goto out; 1791 return;
1792
1767 update_context_time(ctx); 1793 update_context_time(ctx);
1768 update_cgrp_time_from_cpuctx(cpuctx); 1794 update_cgrp_time_from_cpuctx(cpuctx);
1769
1770 if (!ctx->nr_active) 1795 if (!ctx->nr_active)
1771 goto out; 1796 return;
1772 1797
1773 if (event_type & EVENT_PINNED) { 1798 perf_pmu_disable(ctx->pmu);
1799 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
1774 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1800 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1775 group_sched_out(event, cpuctx, ctx); 1801 group_sched_out(event, cpuctx, ctx);
1776 } 1802 }
1777 1803
1778 if (event_type & EVENT_FLEXIBLE) { 1804 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
1779 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1805 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1780 group_sched_out(event, cpuctx, ctx); 1806 group_sched_out(event, cpuctx, ctx);
1781 } 1807 }
1782out:
1783 perf_pmu_enable(ctx->pmu); 1808 perf_pmu_enable(ctx->pmu);
1784 raw_spin_unlock(&ctx->lock);
1785} 1809}
1786 1810
1787/* 1811/*
@@ -1929,8 +1953,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1929 rcu_read_unlock(); 1953 rcu_read_unlock();
1930 1954
1931 if (do_switch) { 1955 if (do_switch) {
1956 raw_spin_lock(&ctx->lock);
1932 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 1957 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1933 cpuctx->task_ctx = NULL; 1958 cpuctx->task_ctx = NULL;
1959 raw_spin_unlock(&ctx->lock);
1934 } 1960 }
1935} 1961}
1936 1962
@@ -1965,8 +1991,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
1965 perf_cgroup_sched_out(task); 1991 perf_cgroup_sched_out(task);
1966} 1992}
1967 1993
1968static void task_ctx_sched_out(struct perf_event_context *ctx, 1994static void task_ctx_sched_out(struct perf_event_context *ctx)
1969 enum event_type_t event_type)
1970{ 1995{
1971 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1996 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1972 1997
@@ -1976,7 +2001,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1976 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2001 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1977 return; 2002 return;
1978 2003
1979 ctx_sched_out(ctx, cpuctx, event_type); 2004 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1980 cpuctx->task_ctx = NULL; 2005 cpuctx->task_ctx = NULL;
1981} 2006}
1982 2007
@@ -2055,11 +2080,11 @@ ctx_sched_in(struct perf_event_context *ctx,
2055 struct task_struct *task) 2080 struct task_struct *task)
2056{ 2081{
2057 u64 now; 2082 u64 now;
2083 int is_active = ctx->is_active;
2058 2084
2059 raw_spin_lock(&ctx->lock); 2085 ctx->is_active |= event_type;
2060 ctx->is_active = 1;
2061 if (likely(!ctx->nr_events)) 2086 if (likely(!ctx->nr_events))
2062 goto out; 2087 return;
2063 2088
2064 now = perf_clock(); 2089 now = perf_clock();
2065 ctx->timestamp = now; 2090 ctx->timestamp = now;
@@ -2068,15 +2093,12 @@ ctx_sched_in(struct perf_event_context *ctx,
2068 * First go through the list and put on any pinned groups 2093 * First go through the list and put on any pinned groups
2069 * in order to give them the best chance of going on. 2094 * in order to give them the best chance of going on.
2070 */ 2095 */
2071 if (event_type & EVENT_PINNED) 2096 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2072 ctx_pinned_sched_in(ctx, cpuctx); 2097 ctx_pinned_sched_in(ctx, cpuctx);
2073 2098
2074 /* Then walk through the lower prio flexible groups */ 2099 /* Then walk through the lower prio flexible groups */
2075 if (event_type & EVENT_FLEXIBLE) 2100 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2076 ctx_flexible_sched_in(ctx, cpuctx); 2101 ctx_flexible_sched_in(ctx, cpuctx);
2077
2078out:
2079 raw_spin_unlock(&ctx->lock);
2080} 2102}
2081 2103
2082static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2104static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -2088,19 +2110,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2088 ctx_sched_in(ctx, cpuctx, event_type, task); 2110 ctx_sched_in(ctx, cpuctx, event_type, task);
2089} 2111}
2090 2112
2091static void task_ctx_sched_in(struct perf_event_context *ctx,
2092 enum event_type_t event_type)
2093{
2094 struct perf_cpu_context *cpuctx;
2095
2096 cpuctx = __get_cpu_context(ctx);
2097 if (cpuctx->task_ctx == ctx)
2098 return;
2099
2100 ctx_sched_in(ctx, cpuctx, event_type, NULL);
2101 cpuctx->task_ctx = ctx;
2102}
2103
2104static void perf_event_context_sched_in(struct perf_event_context *ctx, 2113static void perf_event_context_sched_in(struct perf_event_context *ctx,
2105 struct task_struct *task) 2114 struct task_struct *task)
2106{ 2115{
@@ -2110,6 +2119,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2110 if (cpuctx->task_ctx == ctx) 2119 if (cpuctx->task_ctx == ctx)
2111 return; 2120 return;
2112 2121
2122 perf_ctx_lock(cpuctx, ctx);
2113 perf_pmu_disable(ctx->pmu); 2123 perf_pmu_disable(ctx->pmu);
2114 /* 2124 /*
2115 * We want to keep the following priority order: 2125 * We want to keep the following priority order:
@@ -2118,18 +2128,18 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2118 */ 2128 */
2119 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2129 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2120 2130
2121 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); 2131 perf_event_sched_in(cpuctx, ctx, task);
2122 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2123 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2124 2132
2125 cpuctx->task_ctx = ctx; 2133 cpuctx->task_ctx = ctx;
2126 2134
2135 perf_pmu_enable(ctx->pmu);
2136 perf_ctx_unlock(cpuctx, ctx);
2137
2127 /* 2138 /*
2128 * Since these rotations are per-cpu, we need to ensure the 2139 * Since these rotations are per-cpu, we need to ensure the
2129 * cpu-context we got scheduled on is actually rotating. 2140 * cpu-context we got scheduled on is actually rotating.
2130 */ 2141 */
2131 perf_pmu_rotate_start(ctx->pmu); 2142 perf_pmu_rotate_start(ctx->pmu);
2132 perf_pmu_enable(ctx->pmu);
2133} 2143}
2134 2144
2135/* 2145/*
@@ -2269,7 +2279,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2269 u64 interrupts, now; 2279 u64 interrupts, now;
2270 s64 delta; 2280 s64 delta;
2271 2281
2272 raw_spin_lock(&ctx->lock);
2273 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2282 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2274 if (event->state != PERF_EVENT_STATE_ACTIVE) 2283 if (event->state != PERF_EVENT_STATE_ACTIVE)
2275 continue; 2284 continue;
@@ -2301,7 +2310,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2301 if (delta > 0) 2310 if (delta > 0)
2302 perf_adjust_period(event, period, delta); 2311 perf_adjust_period(event, period, delta);
2303 } 2312 }
2304 raw_spin_unlock(&ctx->lock);
2305} 2313}
2306 2314
2307/* 2315/*
@@ -2309,16 +2317,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2309 */ 2317 */
2310static void rotate_ctx(struct perf_event_context *ctx) 2318static void rotate_ctx(struct perf_event_context *ctx)
2311{ 2319{
2312 raw_spin_lock(&ctx->lock);
2313
2314 /* 2320 /*
2315 * Rotate the first entry last of non-pinned groups. Rotation might be 2321 * Rotate the first entry last of non-pinned groups. Rotation might be
2316 * disabled by the inheritance code. 2322 * disabled by the inheritance code.
2317 */ 2323 */
2318 if (!ctx->rotate_disable) 2324 if (!ctx->rotate_disable)
2319 list_rotate_left(&ctx->flexible_groups); 2325 list_rotate_left(&ctx->flexible_groups);
2320
2321 raw_spin_unlock(&ctx->lock);
2322} 2326}
2323 2327
2324/* 2328/*
@@ -2345,6 +2349,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2345 rotate = 1; 2349 rotate = 1;
2346 } 2350 }
2347 2351
2352 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2348 perf_pmu_disable(cpuctx->ctx.pmu); 2353 perf_pmu_disable(cpuctx->ctx.pmu);
2349 perf_ctx_adjust_freq(&cpuctx->ctx, interval); 2354 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2350 if (ctx) 2355 if (ctx)
@@ -2355,21 +2360,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2355 2360
2356 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2361 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2357 if (ctx) 2362 if (ctx)
2358 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 2363 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2359 2364
2360 rotate_ctx(&cpuctx->ctx); 2365 rotate_ctx(&cpuctx->ctx);
2361 if (ctx) 2366 if (ctx)
2362 rotate_ctx(ctx); 2367 rotate_ctx(ctx);
2363 2368
2364 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); 2369 perf_event_sched_in(cpuctx, ctx, current);
2365 if (ctx)
2366 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
2367 2370
2368done: 2371done:
2369 if (remove) 2372 if (remove)
2370 list_del_init(&cpuctx->rotation_list); 2373 list_del_init(&cpuctx->rotation_list);
2371 2374
2372 perf_pmu_enable(cpuctx->ctx.pmu); 2375 perf_pmu_enable(cpuctx->ctx.pmu);
2376 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2373} 2377}
2374 2378
2375void perf_event_task_tick(void) 2379void perf_event_task_tick(void)
@@ -2424,9 +2428,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2424 * in. 2428 * in.
2425 */ 2429 */
2426 perf_cgroup_sched_out(current); 2430 perf_cgroup_sched_out(current);
2427 task_ctx_sched_out(ctx, EVENT_ALL);
2428 2431
2429 raw_spin_lock(&ctx->lock); 2432 raw_spin_lock(&ctx->lock);
2433 task_ctx_sched_out(ctx);
2430 2434
2431 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2435 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2432 ret = event_enable_on_exec(event, ctx); 2436 ret = event_enable_on_exec(event, ctx);
@@ -2835,16 +2839,12 @@ retry:
2835 unclone_ctx(ctx); 2839 unclone_ctx(ctx);
2836 ++ctx->pin_count; 2840 ++ctx->pin_count;
2837 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2841 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2838 } 2842 } else {
2839
2840 if (!ctx) {
2841 ctx = alloc_perf_context(pmu, task); 2843 ctx = alloc_perf_context(pmu, task);
2842 err = -ENOMEM; 2844 err = -ENOMEM;
2843 if (!ctx) 2845 if (!ctx)
2844 goto errout; 2846 goto errout;
2845 2847
2846 get_ctx(ctx);
2847
2848 err = 0; 2848 err = 0;
2849 mutex_lock(&task->perf_event_mutex); 2849 mutex_lock(&task->perf_event_mutex);
2850 /* 2850 /*
@@ -2856,14 +2856,14 @@ retry:
2856 else if (task->perf_event_ctxp[ctxn]) 2856 else if (task->perf_event_ctxp[ctxn])
2857 err = -EAGAIN; 2857 err = -EAGAIN;
2858 else { 2858 else {
2859 get_ctx(ctx);
2859 ++ctx->pin_count; 2860 ++ctx->pin_count;
2860 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2861 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2861 } 2862 }
2862 mutex_unlock(&task->perf_event_mutex); 2863 mutex_unlock(&task->perf_event_mutex);
2863 2864
2864 if (unlikely(err)) { 2865 if (unlikely(err)) {
2865 put_task_struct(task); 2866 put_ctx(ctx);
2866 kfree(ctx);
2867 2867
2868 if (err == -EAGAIN) 2868 if (err == -EAGAIN)
2869 goto retry; 2869 goto retry;
@@ -2890,7 +2890,7 @@ static void free_event_rcu(struct rcu_head *head)
2890 kfree(event); 2890 kfree(event);
2891} 2891}
2892 2892
2893static void perf_buffer_put(struct perf_buffer *buffer); 2893static void ring_buffer_put(struct ring_buffer *rb);
2894 2894
2895static void free_event(struct perf_event *event) 2895static void free_event(struct perf_event *event)
2896{ 2896{
@@ -2913,9 +2913,9 @@ static void free_event(struct perf_event *event)
2913 } 2913 }
2914 } 2914 }
2915 2915
2916 if (event->buffer) { 2916 if (event->rb) {
2917 perf_buffer_put(event->buffer); 2917 ring_buffer_put(event->rb);
2918 event->buffer = NULL; 2918 event->rb = NULL;
2919 } 2919 }
2920 2920
2921 if (is_cgroup_event(event)) 2921 if (is_cgroup_event(event))
@@ -2934,12 +2934,6 @@ int perf_event_release_kernel(struct perf_event *event)
2934{ 2934{
2935 struct perf_event_context *ctx = event->ctx; 2935 struct perf_event_context *ctx = event->ctx;
2936 2936
2937 /*
2938 * Remove from the PMU, can't get re-enabled since we got
2939 * here because the last ref went.
2940 */
2941 perf_event_disable(event);
2942
2943 WARN_ON_ONCE(ctx->parent_ctx); 2937 WARN_ON_ONCE(ctx->parent_ctx);
2944 /* 2938 /*
2945 * There are two ways this annotation is useful: 2939 * There are two ways this annotation is useful:
@@ -2956,8 +2950,8 @@ int perf_event_release_kernel(struct perf_event *event)
2956 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 2950 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2957 raw_spin_lock_irq(&ctx->lock); 2951 raw_spin_lock_irq(&ctx->lock);
2958 perf_group_detach(event); 2952 perf_group_detach(event);
2959 list_del_event(event, ctx);
2960 raw_spin_unlock_irq(&ctx->lock); 2953 raw_spin_unlock_irq(&ctx->lock);
2954 perf_remove_from_context(event);
2961 mutex_unlock(&ctx->mutex); 2955 mutex_unlock(&ctx->mutex);
2962 2956
2963 free_event(event); 2957 free_event(event);
@@ -3149,13 +3143,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3149static unsigned int perf_poll(struct file *file, poll_table *wait) 3143static unsigned int perf_poll(struct file *file, poll_table *wait)
3150{ 3144{
3151 struct perf_event *event = file->private_data; 3145 struct perf_event *event = file->private_data;
3152 struct perf_buffer *buffer; 3146 struct ring_buffer *rb;
3153 unsigned int events = POLL_HUP; 3147 unsigned int events = POLL_HUP;
3154 3148
3155 rcu_read_lock(); 3149 rcu_read_lock();
3156 buffer = rcu_dereference(event->buffer); 3150 rb = rcu_dereference(event->rb);
3157 if (buffer) 3151 if (rb)
3158 events = atomic_xchg(&buffer->poll, 0); 3152 events = atomic_xchg(&rb->poll, 0);
3159 rcu_read_unlock(); 3153 rcu_read_unlock();
3160 3154
3161 poll_wait(file, &event->waitq, wait); 3155 poll_wait(file, &event->waitq, wait);
@@ -3358,6 +3352,18 @@ static int perf_event_index(struct perf_event *event)
3358 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; 3352 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3359} 3353}
3360 3354
3355static void calc_timer_values(struct perf_event *event,
3356 u64 *running,
3357 u64 *enabled)
3358{
3359 u64 now, ctx_time;
3360
3361 now = perf_clock();
3362 ctx_time = event->shadow_ctx_time + now;
3363 *enabled = ctx_time - event->tstamp_enabled;
3364 *running = ctx_time - event->tstamp_running;
3365}
3366
3361/* 3367/*
3362 * Callers need to ensure there can be no nesting of this function, otherwise 3368 * Callers need to ensure there can be no nesting of this function, otherwise
3363 * the seqlock logic goes bad. We can not serialize this because the arch 3369 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3366,14 +3372,25 @@ static int perf_event_index(struct perf_event *event)
3366void perf_event_update_userpage(struct perf_event *event) 3372void perf_event_update_userpage(struct perf_event *event)
3367{ 3373{
3368 struct perf_event_mmap_page *userpg; 3374 struct perf_event_mmap_page *userpg;
3369 struct perf_buffer *buffer; 3375 struct ring_buffer *rb;
3376 u64 enabled, running;
3370 3377
3371 rcu_read_lock(); 3378 rcu_read_lock();
3372 buffer = rcu_dereference(event->buffer); 3379 /*
3373 if (!buffer) 3380 * compute total_time_enabled, total_time_running
3381 * based on snapshot values taken when the event
3382 * was last scheduled in.
3383 *
3384 * we cannot simply called update_context_time()
3385 * because of locking issue as we can be called in
3386 * NMI context
3387 */
3388 calc_timer_values(event, &enabled, &running);
3389 rb = rcu_dereference(event->rb);
3390 if (!rb)
3374 goto unlock; 3391 goto unlock;
3375 3392
3376 userpg = buffer->user_page; 3393 userpg = rb->user_page;
3377 3394
3378 /* 3395 /*
3379 * Disable preemption so as to not let the corresponding user-space 3396 * Disable preemption so as to not let the corresponding user-space
@@ -3387,10 +3404,10 @@ void perf_event_update_userpage(struct perf_event *event)
3387 if (event->state == PERF_EVENT_STATE_ACTIVE) 3404 if (event->state == PERF_EVENT_STATE_ACTIVE)
3388 userpg->offset -= local64_read(&event->hw.prev_count); 3405 userpg->offset -= local64_read(&event->hw.prev_count);
3389 3406
3390 userpg->time_enabled = event->total_time_enabled + 3407 userpg->time_enabled = enabled +
3391 atomic64_read(&event->child_total_time_enabled); 3408 atomic64_read(&event->child_total_time_enabled);
3392 3409
3393 userpg->time_running = event->total_time_running + 3410 userpg->time_running = running +
3394 atomic64_read(&event->child_total_time_running); 3411 atomic64_read(&event->child_total_time_running);
3395 3412
3396 barrier(); 3413 barrier();
@@ -3400,220 +3417,10 @@ unlock:
3400 rcu_read_unlock(); 3417 rcu_read_unlock();
3401} 3418}
3402 3419
3403static unsigned long perf_data_size(struct perf_buffer *buffer);
3404
3405static void
3406perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
3407{
3408 long max_size = perf_data_size(buffer);
3409
3410 if (watermark)
3411 buffer->watermark = min(max_size, watermark);
3412
3413 if (!buffer->watermark)
3414 buffer->watermark = max_size / 2;
3415
3416 if (flags & PERF_BUFFER_WRITABLE)
3417 buffer->writable = 1;
3418
3419 atomic_set(&buffer->refcount, 1);
3420}
3421
3422#ifndef CONFIG_PERF_USE_VMALLOC
3423
3424/*
3425 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
3426 */
3427
3428static struct page *
3429perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3430{
3431 if (pgoff > buffer->nr_pages)
3432 return NULL;
3433
3434 if (pgoff == 0)
3435 return virt_to_page(buffer->user_page);
3436
3437 return virt_to_page(buffer->data_pages[pgoff - 1]);
3438}
3439
3440static void *perf_mmap_alloc_page(int cpu)
3441{
3442 struct page *page;
3443 int node;
3444
3445 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
3446 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
3447 if (!page)
3448 return NULL;
3449
3450 return page_address(page);
3451}
3452
3453static struct perf_buffer *
3454perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3455{
3456 struct perf_buffer *buffer;
3457 unsigned long size;
3458 int i;
3459
3460 size = sizeof(struct perf_buffer);
3461 size += nr_pages * sizeof(void *);
3462
3463 buffer = kzalloc(size, GFP_KERNEL);
3464 if (!buffer)
3465 goto fail;
3466
3467 buffer->user_page = perf_mmap_alloc_page(cpu);
3468 if (!buffer->user_page)
3469 goto fail_user_page;
3470
3471 for (i = 0; i < nr_pages; i++) {
3472 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
3473 if (!buffer->data_pages[i])
3474 goto fail_data_pages;
3475 }
3476
3477 buffer->nr_pages = nr_pages;
3478
3479 perf_buffer_init(buffer, watermark, flags);
3480
3481 return buffer;
3482
3483fail_data_pages:
3484 for (i--; i >= 0; i--)
3485 free_page((unsigned long)buffer->data_pages[i]);
3486
3487 free_page((unsigned long)buffer->user_page);
3488
3489fail_user_page:
3490 kfree(buffer);
3491
3492fail:
3493 return NULL;
3494}
3495
3496static void perf_mmap_free_page(unsigned long addr)
3497{
3498 struct page *page = virt_to_page((void *)addr);
3499
3500 page->mapping = NULL;
3501 __free_page(page);
3502}
3503
3504static void perf_buffer_free(struct perf_buffer *buffer)
3505{
3506 int i;
3507
3508 perf_mmap_free_page((unsigned long)buffer->user_page);
3509 for (i = 0; i < buffer->nr_pages; i++)
3510 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
3511 kfree(buffer);
3512}
3513
3514static inline int page_order(struct perf_buffer *buffer)
3515{
3516 return 0;
3517}
3518
3519#else
3520
3521/*
3522 * Back perf_mmap() with vmalloc memory.
3523 *
3524 * Required for architectures that have d-cache aliasing issues.
3525 */
3526
3527static inline int page_order(struct perf_buffer *buffer)
3528{
3529 return buffer->page_order;
3530}
3531
3532static struct page *
3533perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3534{
3535 if (pgoff > (1UL << page_order(buffer)))
3536 return NULL;
3537
3538 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
3539}
3540
3541static void perf_mmap_unmark_page(void *addr)
3542{
3543 struct page *page = vmalloc_to_page(addr);
3544
3545 page->mapping = NULL;
3546}
3547
3548static void perf_buffer_free_work(struct work_struct *work)
3549{
3550 struct perf_buffer *buffer;
3551 void *base;
3552 int i, nr;
3553
3554 buffer = container_of(work, struct perf_buffer, work);
3555 nr = 1 << page_order(buffer);
3556
3557 base = buffer->user_page;
3558 for (i = 0; i < nr + 1; i++)
3559 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
3560
3561 vfree(base);
3562 kfree(buffer);
3563}
3564
3565static void perf_buffer_free(struct perf_buffer *buffer)
3566{
3567 schedule_work(&buffer->work);
3568}
3569
3570static struct perf_buffer *
3571perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3572{
3573 struct perf_buffer *buffer;
3574 unsigned long size;
3575 void *all_buf;
3576
3577 size = sizeof(struct perf_buffer);
3578 size += sizeof(void *);
3579
3580 buffer = kzalloc(size, GFP_KERNEL);
3581 if (!buffer)
3582 goto fail;
3583
3584 INIT_WORK(&buffer->work, perf_buffer_free_work);
3585
3586 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
3587 if (!all_buf)
3588 goto fail_all_buf;
3589
3590 buffer->user_page = all_buf;
3591 buffer->data_pages[0] = all_buf + PAGE_SIZE;
3592 buffer->page_order = ilog2(nr_pages);
3593 buffer->nr_pages = 1;
3594
3595 perf_buffer_init(buffer, watermark, flags);
3596
3597 return buffer;
3598
3599fail_all_buf:
3600 kfree(buffer);
3601
3602fail:
3603 return NULL;
3604}
3605
3606#endif
3607
3608static unsigned long perf_data_size(struct perf_buffer *buffer)
3609{
3610 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
3611}
3612
3613static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 3420static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3614{ 3421{
3615 struct perf_event *event = vma->vm_file->private_data; 3422 struct perf_event *event = vma->vm_file->private_data;
3616 struct perf_buffer *buffer; 3423 struct ring_buffer *rb;
3617 int ret = VM_FAULT_SIGBUS; 3424 int ret = VM_FAULT_SIGBUS;
3618 3425
3619 if (vmf->flags & FAULT_FLAG_MKWRITE) { 3426 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -3623,14 +3430,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3623 } 3430 }
3624 3431
3625 rcu_read_lock(); 3432 rcu_read_lock();
3626 buffer = rcu_dereference(event->buffer); 3433 rb = rcu_dereference(event->rb);
3627 if (!buffer) 3434 if (!rb)
3628 goto unlock; 3435 goto unlock;
3629 3436
3630 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 3437 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3631 goto unlock; 3438 goto unlock;
3632 3439
3633 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); 3440 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
3634 if (!vmf->page) 3441 if (!vmf->page)
3635 goto unlock; 3442 goto unlock;
3636 3443
@@ -3645,35 +3452,35 @@ unlock:
3645 return ret; 3452 return ret;
3646} 3453}
3647 3454
3648static void perf_buffer_free_rcu(struct rcu_head *rcu_head) 3455static void rb_free_rcu(struct rcu_head *rcu_head)
3649{ 3456{
3650 struct perf_buffer *buffer; 3457 struct ring_buffer *rb;
3651 3458
3652 buffer = container_of(rcu_head, struct perf_buffer, rcu_head); 3459 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3653 perf_buffer_free(buffer); 3460 rb_free(rb);
3654} 3461}
3655 3462
3656static struct perf_buffer *perf_buffer_get(struct perf_event *event) 3463static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3657{ 3464{
3658 struct perf_buffer *buffer; 3465 struct ring_buffer *rb;
3659 3466
3660 rcu_read_lock(); 3467 rcu_read_lock();
3661 buffer = rcu_dereference(event->buffer); 3468 rb = rcu_dereference(event->rb);
3662 if (buffer) { 3469 if (rb) {
3663 if (!atomic_inc_not_zero(&buffer->refcount)) 3470 if (!atomic_inc_not_zero(&rb->refcount))
3664 buffer = NULL; 3471 rb = NULL;
3665 } 3472 }
3666 rcu_read_unlock(); 3473 rcu_read_unlock();
3667 3474
3668 return buffer; 3475 return rb;
3669} 3476}
3670 3477
3671static void perf_buffer_put(struct perf_buffer *buffer) 3478static void ring_buffer_put(struct ring_buffer *rb)
3672{ 3479{
3673 if (!atomic_dec_and_test(&buffer->refcount)) 3480 if (!atomic_dec_and_test(&rb->refcount))
3674 return; 3481 return;
3675 3482
3676 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); 3483 call_rcu(&rb->rcu_head, rb_free_rcu);
3677} 3484}
3678 3485
3679static void perf_mmap_open(struct vm_area_struct *vma) 3486static void perf_mmap_open(struct vm_area_struct *vma)
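ring_buffer_get()/ring_buffer_put() above follow the common RCU-protected reference pattern: the pointer is read under rcu_read_lock() and a reference is taken only if the count is still non-zero, so a racing final put cannot be resurrected, and the final put defers the free past a grace period. A generic sketch of that pattern with illustrative names (not the perf code itself):

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct shared_obj {
	atomic_t refcount;
	struct rcu_head rcu_head;
};

static struct shared_obj *shared_obj_get(struct shared_obj __rcu **slot)
{
	struct shared_obj *obj;

	rcu_read_lock();
	obj = rcu_dereference(*slot);
	/* Take a reference only if the count has not already dropped to zero. */
	if (obj && !atomic_inc_not_zero(&obj->refcount))
		obj = NULL;
	rcu_read_unlock();

	return obj;
}

static void shared_obj_put(struct shared_obj *obj)
{
	if (atomic_dec_and_test(&obj->refcount))
		kfree_rcu(obj, rcu_head);	/* free after a grace period */
}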
@@ -3688,16 +3495,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3688 struct perf_event *event = vma->vm_file->private_data; 3495 struct perf_event *event = vma->vm_file->private_data;
3689 3496
3690 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 3497 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
3691 unsigned long size = perf_data_size(event->buffer); 3498 unsigned long size = perf_data_size(event->rb);
3692 struct user_struct *user = event->mmap_user; 3499 struct user_struct *user = event->mmap_user;
3693 struct perf_buffer *buffer = event->buffer; 3500 struct ring_buffer *rb = event->rb;
3694 3501
3695 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3502 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3696 vma->vm_mm->locked_vm -= event->mmap_locked; 3503 vma->vm_mm->locked_vm -= event->mmap_locked;
3697 rcu_assign_pointer(event->buffer, NULL); 3504 rcu_assign_pointer(event->rb, NULL);
3698 mutex_unlock(&event->mmap_mutex); 3505 mutex_unlock(&event->mmap_mutex);
3699 3506
3700 perf_buffer_put(buffer); 3507 ring_buffer_put(rb);
3701 free_uid(user); 3508 free_uid(user);
3702 } 3509 }
3703} 3510}
@@ -3715,7 +3522,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3715 unsigned long user_locked, user_lock_limit; 3522 unsigned long user_locked, user_lock_limit;
3716 struct user_struct *user = current_user(); 3523 struct user_struct *user = current_user();
3717 unsigned long locked, lock_limit; 3524 unsigned long locked, lock_limit;
3718 struct perf_buffer *buffer; 3525 struct ring_buffer *rb;
3719 unsigned long vma_size; 3526 unsigned long vma_size;
3720 unsigned long nr_pages; 3527 unsigned long nr_pages;
3721 long user_extra, extra; 3528 long user_extra, extra;
@@ -3724,7 +3531,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3724 /* 3531 /*
3725 * Don't allow mmap() of inherited per-task counters. This would 3532 * Don't allow mmap() of inherited per-task counters. This would
3726 * create a performance issue due to all children writing to the 3533 * create a performance issue due to all children writing to the
3727 * same buffer. 3534 * same rb.
3728 */ 3535 */
3729 if (event->cpu == -1 && event->attr.inherit) 3536 if (event->cpu == -1 && event->attr.inherit)
3730 return -EINVAL; 3537 return -EINVAL;
@@ -3736,7 +3543,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3736 nr_pages = (vma_size / PAGE_SIZE) - 1; 3543 nr_pages = (vma_size / PAGE_SIZE) - 1;
3737 3544
3738 /* 3545 /*
3739 * If we have buffer pages ensure they're a power-of-two number, so we 3546 * If we have rb pages ensure they're a power-of-two number, so we
3740 * can do bitmasks instead of modulo. 3547 * can do bitmasks instead of modulo.
3741 */ 3548 */
3742 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 3549 if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -3750,9 +3557,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3750 3557
3751 WARN_ON_ONCE(event->ctx->parent_ctx); 3558 WARN_ON_ONCE(event->ctx->parent_ctx);
3752 mutex_lock(&event->mmap_mutex); 3559 mutex_lock(&event->mmap_mutex);
3753 if (event->buffer) { 3560 if (event->rb) {
3754 if (event->buffer->nr_pages == nr_pages) 3561 if (event->rb->nr_pages == nr_pages)
3755 atomic_inc(&event->buffer->refcount); 3562 atomic_inc(&event->rb->refcount);
3756 else 3563 else
3757 ret = -EINVAL; 3564 ret = -EINVAL;
3758 goto unlock; 3565 goto unlock;
@@ -3782,18 +3589,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3782 goto unlock; 3589 goto unlock;
3783 } 3590 }
3784 3591
3785 WARN_ON(event->buffer); 3592 WARN_ON(event->rb);
3786 3593
3787 if (vma->vm_flags & VM_WRITE) 3594 if (vma->vm_flags & VM_WRITE)
3788 flags |= PERF_BUFFER_WRITABLE; 3595 flags |= RING_BUFFER_WRITABLE;
3789 3596
3790 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, 3597 rb = rb_alloc(nr_pages,
3791 event->cpu, flags); 3598 event->attr.watermark ? event->attr.wakeup_watermark : 0,
3792 if (!buffer) { 3599 event->cpu, flags);
3600
3601 if (!rb) {
3793 ret = -ENOMEM; 3602 ret = -ENOMEM;
3794 goto unlock; 3603 goto unlock;
3795 } 3604 }
3796 rcu_assign_pointer(event->buffer, buffer); 3605 rcu_assign_pointer(event->rb, rb);
3797 3606
3798 atomic_long_add(user_extra, &user->locked_vm); 3607 atomic_long_add(user_extra, &user->locked_vm);
3799 event->mmap_locked = extra; 3608 event->mmap_locked = extra;
@@ -3892,117 +3701,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3892} 3701}
3893EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 3702EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3894 3703
3895/*
3896 * Output
3897 */
3898static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
3899 unsigned long offset, unsigned long head)
3900{
3901 unsigned long mask;
3902
3903 if (!buffer->writable)
3904 return true;
3905
3906 mask = perf_data_size(buffer) - 1;
3907
3908 offset = (offset - tail) & mask;
3909 head = (head - tail) & mask;
3910
3911 if ((int)(head - offset) < 0)
3912 return false;
3913
3914 return true;
3915}
3916
3917static void perf_output_wakeup(struct perf_output_handle *handle)
3918{
3919 atomic_set(&handle->buffer->poll, POLL_IN);
3920
3921 if (handle->nmi) {
3922 handle->event->pending_wakeup = 1;
3923 irq_work_queue(&handle->event->pending);
3924 } else
3925 perf_event_wakeup(handle->event);
3926}
3927
3928/*
3929 * We need to ensure a later event_id doesn't publish a head when a former
3930 * event isn't done writing. However since we need to deal with NMIs we
3931 * cannot fully serialize things.
3932 *
3933 * We only publish the head (and generate a wakeup) when the outer-most
3934 * event completes.
3935 */
3936static void perf_output_get_handle(struct perf_output_handle *handle)
3937{
3938 struct perf_buffer *buffer = handle->buffer;
3939
3940 preempt_disable();
3941 local_inc(&buffer->nest);
3942 handle->wakeup = local_read(&buffer->wakeup);
3943}
3944
3945static void perf_output_put_handle(struct perf_output_handle *handle)
3946{
3947 struct perf_buffer *buffer = handle->buffer;
3948 unsigned long head;
3949
3950again:
3951 head = local_read(&buffer->head);
3952
3953 /*
3954 * IRQ/NMI can happen here, which means we can miss a head update.
3955 */
3956
3957 if (!local_dec_and_test(&buffer->nest))
3958 goto out;
3959
3960 /*
3961 * Publish the known good head. Rely on the full barrier implied
3962 * by atomic_dec_and_test() order the buffer->head read and this
3963 * write.
3964 */
3965 buffer->user_page->data_head = head;
3966
3967 /*
3968 * Now check if we missed an update, rely on the (compiler)
3969 * barrier in atomic_dec_and_test() to re-read buffer->head.
3970 */
3971 if (unlikely(head != local_read(&buffer->head))) {
3972 local_inc(&buffer->nest);
3973 goto again;
3974 }
3975
3976 if (handle->wakeup != local_read(&buffer->wakeup))
3977 perf_output_wakeup(handle);
3978
3979out:
3980 preempt_enable();
3981}
3982
3983__always_inline void perf_output_copy(struct perf_output_handle *handle,
3984 const void *buf, unsigned int len)
3985{
3986 do {
3987 unsigned long size = min_t(unsigned long, handle->size, len);
3988
3989 memcpy(handle->addr, buf, size);
3990
3991 len -= size;
3992 handle->addr += size;
3993 buf += size;
3994 handle->size -= size;
3995 if (!handle->size) {
3996 struct perf_buffer *buffer = handle->buffer;
3997
3998 handle->page++;
3999 handle->page &= buffer->nr_pages - 1;
4000 handle->addr = buffer->data_pages[handle->page];
4001 handle->size = PAGE_SIZE << page_order(buffer);
4002 }
4003 } while (len);
4004}
4005
4006static void __perf_event_header__init_id(struct perf_event_header *header, 3704static void __perf_event_header__init_id(struct perf_event_header *header,
4007 struct perf_sample_data *data, 3705 struct perf_sample_data *data,
4008 struct perf_event *event) 3706 struct perf_event *event)
@@ -4033,9 +3731,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4033 } 3731 }
4034} 3732}
4035 3733
4036static void perf_event_header__init_id(struct perf_event_header *header, 3734void perf_event_header__init_id(struct perf_event_header *header,
4037 struct perf_sample_data *data, 3735 struct perf_sample_data *data,
4038 struct perf_event *event) 3736 struct perf_event *event)
4039{ 3737{
4040 if (event->attr.sample_id_all) 3738 if (event->attr.sample_id_all)
4041 __perf_event_header__init_id(header, data, event); 3739 __perf_event_header__init_id(header, data, event);
@@ -4062,121 +3760,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4062 perf_output_put(handle, data->cpu_entry); 3760 perf_output_put(handle, data->cpu_entry);
4063} 3761}
4064 3762
4065static void perf_event__output_id_sample(struct perf_event *event, 3763void perf_event__output_id_sample(struct perf_event *event,
4066 struct perf_output_handle *handle, 3764 struct perf_output_handle *handle,
4067 struct perf_sample_data *sample) 3765 struct perf_sample_data *sample)
4068{ 3766{
4069 if (event->attr.sample_id_all) 3767 if (event->attr.sample_id_all)
4070 __perf_event__output_id_sample(handle, sample); 3768 __perf_event__output_id_sample(handle, sample);
4071} 3769}
4072 3770
4073int perf_output_begin(struct perf_output_handle *handle,
4074 struct perf_event *event, unsigned int size,
4075 int nmi, int sample)
4076{
4077 struct perf_buffer *buffer;
4078 unsigned long tail, offset, head;
4079 int have_lost;
4080 struct perf_sample_data sample_data;
4081 struct {
4082 struct perf_event_header header;
4083 u64 id;
4084 u64 lost;
4085 } lost_event;
4086
4087 rcu_read_lock();
4088 /*
4089 * For inherited events we send all the output towards the parent.
4090 */
4091 if (event->parent)
4092 event = event->parent;
4093
4094 buffer = rcu_dereference(event->buffer);
4095 if (!buffer)
4096 goto out;
4097
4098 handle->buffer = buffer;
4099 handle->event = event;
4100 handle->nmi = nmi;
4101 handle->sample = sample;
4102
4103 if (!buffer->nr_pages)
4104 goto out;
4105
4106 have_lost = local_read(&buffer->lost);
4107 if (have_lost) {
4108 lost_event.header.size = sizeof(lost_event);
4109 perf_event_header__init_id(&lost_event.header, &sample_data,
4110 event);
4111 size += lost_event.header.size;
4112 }
4113
4114 perf_output_get_handle(handle);
4115
4116 do {
4117 /*
4118 * Userspace could choose to issue a mb() before updating the
4119 * tail pointer. So that all reads will be completed before the
4120 * write is issued.
4121 */
4122 tail = ACCESS_ONCE(buffer->user_page->data_tail);
4123 smp_rmb();
4124 offset = head = local_read(&buffer->head);
4125 head += size;
4126 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
4127 goto fail;
4128 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
4129
4130 if (head - local_read(&buffer->wakeup) > buffer->watermark)
4131 local_add(buffer->watermark, &buffer->wakeup);
4132
4133 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
4134 handle->page &= buffer->nr_pages - 1;
4135 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
4136 handle->addr = buffer->data_pages[handle->page];
4137 handle->addr += handle->size;
4138 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
4139
4140 if (have_lost) {
4141 lost_event.header.type = PERF_RECORD_LOST;
4142 lost_event.header.misc = 0;
4143 lost_event.id = event->id;
4144 lost_event.lost = local_xchg(&buffer->lost, 0);
4145
4146 perf_output_put(handle, lost_event);
4147 perf_event__output_id_sample(event, handle, &sample_data);
4148 }
4149
4150 return 0;
4151
4152fail:
4153 local_inc(&buffer->lost);
4154 perf_output_put_handle(handle);
4155out:
4156 rcu_read_unlock();
4157
4158 return -ENOSPC;
4159}
4160
4161void perf_output_end(struct perf_output_handle *handle)
4162{
4163 struct perf_event *event = handle->event;
4164 struct perf_buffer *buffer = handle->buffer;
4165
4166 int wakeup_events = event->attr.wakeup_events;
4167
4168 if (handle->sample && wakeup_events) {
4169 int events = local_inc_return(&buffer->events);
4170 if (events >= wakeup_events) {
4171 local_sub(wakeup_events, &buffer->events);
4172 local_inc(&buffer->wakeup);
4173 }
4174 }
4175
4176 perf_output_put_handle(handle);
4177 rcu_read_unlock();
4178}
4179
4180static void perf_output_read_one(struct perf_output_handle *handle, 3771static void perf_output_read_one(struct perf_output_handle *handle,
4181 struct perf_event *event, 3772 struct perf_event *event,
4182 u64 enabled, u64 running) 3773 u64 enabled, u64 running)
@@ -4197,7 +3788,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
4197 if (read_format & PERF_FORMAT_ID) 3788 if (read_format & PERF_FORMAT_ID)
4198 values[n++] = primary_event_id(event); 3789 values[n++] = primary_event_id(event);
4199 3790
4200 perf_output_copy(handle, values, n * sizeof(u64)); 3791 __output_copy(handle, values, n * sizeof(u64));
4201} 3792}
4202 3793
4203/* 3794/*
@@ -4227,7 +3818,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4227 if (read_format & PERF_FORMAT_ID) 3818 if (read_format & PERF_FORMAT_ID)
4228 values[n++] = primary_event_id(leader); 3819 values[n++] = primary_event_id(leader);
4229 3820
4230 perf_output_copy(handle, values, n * sizeof(u64)); 3821 __output_copy(handle, values, n * sizeof(u64));
4231 3822
4232 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 3823 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4233 n = 0; 3824 n = 0;
@@ -4239,7 +3830,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4239 if (read_format & PERF_FORMAT_ID) 3830 if (read_format & PERF_FORMAT_ID)
4240 values[n++] = primary_event_id(sub); 3831 values[n++] = primary_event_id(sub);
4241 3832
4242 perf_output_copy(handle, values, n * sizeof(u64)); 3833 __output_copy(handle, values, n * sizeof(u64));
4243 } 3834 }
4244} 3835}
4245 3836
@@ -4249,7 +3840,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4249static void perf_output_read(struct perf_output_handle *handle, 3840static void perf_output_read(struct perf_output_handle *handle,
4250 struct perf_event *event) 3841 struct perf_event *event)
4251{ 3842{
4252 u64 enabled = 0, running = 0, now, ctx_time; 3843 u64 enabled = 0, running = 0;
4253 u64 read_format = event->attr.read_format; 3844 u64 read_format = event->attr.read_format;
4254 3845
4255 /* 3846 /*
@@ -4261,12 +3852,8 @@ static void perf_output_read(struct perf_output_handle *handle,
4261 * because of locking issue as we are called in 3852 * because of locking issue as we are called in
4262 * NMI context 3853 * NMI context
4263 */ 3854 */
4264 if (read_format & PERF_FORMAT_TOTAL_TIMES) { 3855 if (read_format & PERF_FORMAT_TOTAL_TIMES)
4265 now = perf_clock(); 3856 calc_timer_values(event, &enabled, &running);
4266 ctx_time = event->shadow_ctx_time + now;
4267 enabled = ctx_time - event->tstamp_enabled;
4268 running = ctx_time - event->tstamp_running;
4269 }
4270 3857
4271 if (event->attr.read_format & PERF_FORMAT_GROUP) 3858 if (event->attr.read_format & PERF_FORMAT_GROUP)
4272 perf_output_read_group(handle, event, enabled, running); 3859 perf_output_read_group(handle, event, enabled, running);
@@ -4319,7 +3906,7 @@ void perf_output_sample(struct perf_output_handle *handle,
4319 3906
4320 size *= sizeof(u64); 3907 size *= sizeof(u64);
4321 3908
4322 perf_output_copy(handle, data->callchain, size); 3909 __output_copy(handle, data->callchain, size);
4323 } else { 3910 } else {
4324 u64 nr = 0; 3911 u64 nr = 0;
4325 perf_output_put(handle, nr); 3912 perf_output_put(handle, nr);
@@ -4329,8 +3916,8 @@ void perf_output_sample(struct perf_output_handle *handle,
4329 if (sample_type & PERF_SAMPLE_RAW) { 3916 if (sample_type & PERF_SAMPLE_RAW) {
4330 if (data->raw) { 3917 if (data->raw) {
4331 perf_output_put(handle, data->raw->size); 3918 perf_output_put(handle, data->raw->size);
4332 perf_output_copy(handle, data->raw->data, 3919 __output_copy(handle, data->raw->data,
4333 data->raw->size); 3920 data->raw->size);
4334 } else { 3921 } else {
4335 struct { 3922 struct {
4336 u32 size; 3923 u32 size;
@@ -4342,6 +3929,20 @@ void perf_output_sample(struct perf_output_handle *handle,
4342 perf_output_put(handle, raw); 3929 perf_output_put(handle, raw);
4343 } 3930 }
4344 } 3931 }
3932
3933 if (!event->attr.watermark) {
3934 int wakeup_events = event->attr.wakeup_events;
3935
3936 if (wakeup_events) {
3937 struct ring_buffer *rb = handle->rb;
3938 int events = local_inc_return(&rb->events);
3939
3940 if (events >= wakeup_events) {
3941 local_sub(wakeup_events, &rb->events);
3942 local_inc(&rb->wakeup);
3943 }
3944 }
3945 }
4345} 3946}
4346 3947
4347void perf_prepare_sample(struct perf_event_header *header, 3948void perf_prepare_sample(struct perf_event_header *header,
@@ -4386,7 +3987,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4386 } 3987 }
4387} 3988}
4388 3989
4389static void perf_event_output(struct perf_event *event, int nmi, 3990static void perf_event_output(struct perf_event *event,
4390 struct perf_sample_data *data, 3991 struct perf_sample_data *data,
4391 struct pt_regs *regs) 3992 struct pt_regs *regs)
4392{ 3993{
@@ -4398,7 +3999,7 @@ static void perf_event_output(struct perf_event *event, int nmi,
4398 3999
4399 perf_prepare_sample(&header, data, event, regs); 4000 perf_prepare_sample(&header, data, event, regs);
4400 4001
4401 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 4002 if (perf_output_begin(&handle, event, header.size))
4402 goto exit; 4003 goto exit;
4403 4004
4404 perf_output_sample(&handle, &header, data, event); 4005 perf_output_sample(&handle, &header, data, event);
@@ -4438,7 +4039,7 @@ perf_event_read_event(struct perf_event *event,
4438 int ret; 4039 int ret;
4439 4040
4440 perf_event_header__init_id(&read_event.header, &sample, event); 4041 perf_event_header__init_id(&read_event.header, &sample, event);
4441 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 4042 ret = perf_output_begin(&handle, event, read_event.header.size);
4442 if (ret) 4043 if (ret)
4443 return; 4044 return;
4444 4045
@@ -4481,7 +4082,7 @@ static void perf_event_task_output(struct perf_event *event,
4481 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4082 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4482 4083
4483 ret = perf_output_begin(&handle, event, 4084 ret = perf_output_begin(&handle, event,
4484 task_event->event_id.header.size, 0, 0); 4085 task_event->event_id.header.size);
4485 if (ret) 4086 if (ret)
4486 goto out; 4087 goto out;
4487 4088
@@ -4618,7 +4219,7 @@ static void perf_event_comm_output(struct perf_event *event,
4618 4219
4619 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 4220 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4620 ret = perf_output_begin(&handle, event, 4221 ret = perf_output_begin(&handle, event,
4621 comm_event->event_id.header.size, 0, 0); 4222 comm_event->event_id.header.size);
4622 4223
4623 if (ret) 4224 if (ret)
4624 goto out; 4225 goto out;
@@ -4627,7 +4228,7 @@ static void perf_event_comm_output(struct perf_event *event,
4627 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4228 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4628 4229
4629 perf_output_put(&handle, comm_event->event_id); 4230 perf_output_put(&handle, comm_event->event_id);
4630 perf_output_copy(&handle, comm_event->comm, 4231 __output_copy(&handle, comm_event->comm,
4631 comm_event->comm_size); 4232 comm_event->comm_size);
4632 4233
4633 perf_event__output_id_sample(event, &handle, &sample); 4234 perf_event__output_id_sample(event, &handle, &sample);
@@ -4765,7 +4366,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4765 4366
4766 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 4367 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4767 ret = perf_output_begin(&handle, event, 4368 ret = perf_output_begin(&handle, event,
4768 mmap_event->event_id.header.size, 0, 0); 4369 mmap_event->event_id.header.size);
4769 if (ret) 4370 if (ret)
4770 goto out; 4371 goto out;
4771 4372
@@ -4773,7 +4374,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4773 mmap_event->event_id.tid = perf_event_tid(event, current); 4374 mmap_event->event_id.tid = perf_event_tid(event, current);
4774 4375
4775 perf_output_put(&handle, mmap_event->event_id); 4376 perf_output_put(&handle, mmap_event->event_id);
4776 perf_output_copy(&handle, mmap_event->file_name, 4377 __output_copy(&handle, mmap_event->file_name,
4777 mmap_event->file_size); 4378 mmap_event->file_size);
4778 4379
4779 perf_event__output_id_sample(event, &handle, &sample); 4380 perf_event__output_id_sample(event, &handle, &sample);
@@ -4829,7 +4430,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4829 4430
4830 if (file) { 4431 if (file) {
4831 /* 4432 /*
4832 * d_path works from the end of the buffer backwards, so we 4433 * d_path works from the end of the rb backwards, so we
4833 * need to add enough zero bytes after the string to handle 4434 * need to add enough zero bytes after the string to handle
4834 * the 64bit alignment we do later. 4435 * the 64bit alignment we do later.
4835 */ 4436 */
@@ -4960,7 +4561,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4960 perf_event_header__init_id(&throttle_event.header, &sample, event); 4561 perf_event_header__init_id(&throttle_event.header, &sample, event);
4961 4562
4962 ret = perf_output_begin(&handle, event, 4563 ret = perf_output_begin(&handle, event,
4963 throttle_event.header.size, 1, 0); 4564 throttle_event.header.size);
4964 if (ret) 4565 if (ret)
4965 return; 4566 return;
4966 4567
@@ -4973,7 +4574,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4973 * Generic event overflow handling, sampling. 4574 * Generic event overflow handling, sampling.
4974 */ 4575 */
4975 4576
4976static int __perf_event_overflow(struct perf_event *event, int nmi, 4577static int __perf_event_overflow(struct perf_event *event,
4977 int throttle, struct perf_sample_data *data, 4578 int throttle, struct perf_sample_data *data,
4978 struct pt_regs *regs) 4579 struct pt_regs *regs)
4979{ 4580{
@@ -5016,34 +4617,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
5016 if (events && atomic_dec_and_test(&event->event_limit)) { 4617 if (events && atomic_dec_and_test(&event->event_limit)) {
5017 ret = 1; 4618 ret = 1;
5018 event->pending_kill = POLL_HUP; 4619 event->pending_kill = POLL_HUP;
5019 if (nmi) { 4620 event->pending_disable = 1;
5020 event->pending_disable = 1; 4621 irq_work_queue(&event->pending);
5021 irq_work_queue(&event->pending);
5022 } else
5023 perf_event_disable(event);
5024 } 4622 }
5025 4623
5026 if (event->overflow_handler) 4624 if (event->overflow_handler)
5027 event->overflow_handler(event, nmi, data, regs); 4625 event->overflow_handler(event, data, regs);
5028 else 4626 else
5029 perf_event_output(event, nmi, data, regs); 4627 perf_event_output(event, data, regs);
5030 4628
5031 if (event->fasync && event->pending_kill) { 4629 if (event->fasync && event->pending_kill) {
5032 if (nmi) { 4630 event->pending_wakeup = 1;
5033 event->pending_wakeup = 1; 4631 irq_work_queue(&event->pending);
5034 irq_work_queue(&event->pending);
5035 } else
5036 perf_event_wakeup(event);
5037 } 4632 }
5038 4633
5039 return ret; 4634 return ret;
5040} 4635}
5041 4636
5042int perf_event_overflow(struct perf_event *event, int nmi, 4637int perf_event_overflow(struct perf_event *event,
5043 struct perf_sample_data *data, 4638 struct perf_sample_data *data,
5044 struct pt_regs *regs) 4639 struct pt_regs *regs)
5045{ 4640{
5046 return __perf_event_overflow(event, nmi, 1, data, regs); 4641 return __perf_event_overflow(event, 1, data, regs);
5047} 4642}
5048 4643
5049/* 4644/*
@@ -5092,7 +4687,7 @@ again:
5092} 4687}
5093 4688
5094static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 4689static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5095 int nmi, struct perf_sample_data *data, 4690 struct perf_sample_data *data,
5096 struct pt_regs *regs) 4691 struct pt_regs *regs)
5097{ 4692{
5098 struct hw_perf_event *hwc = &event->hw; 4693 struct hw_perf_event *hwc = &event->hw;
@@ -5106,7 +4701,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5106 return; 4701 return;
5107 4702
5108 for (; overflow; overflow--) { 4703 for (; overflow; overflow--) {
5109 if (__perf_event_overflow(event, nmi, throttle, 4704 if (__perf_event_overflow(event, throttle,
5110 data, regs)) { 4705 data, regs)) {
5111 /* 4706 /*
5112 * We inhibit the overflow from happening when 4707 * We inhibit the overflow from happening when
@@ -5119,7 +4714,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5119} 4714}
5120 4715
5121static void perf_swevent_event(struct perf_event *event, u64 nr, 4716static void perf_swevent_event(struct perf_event *event, u64 nr,
5122 int nmi, struct perf_sample_data *data, 4717 struct perf_sample_data *data,
5123 struct pt_regs *regs) 4718 struct pt_regs *regs)
5124{ 4719{
5125 struct hw_perf_event *hwc = &event->hw; 4720 struct hw_perf_event *hwc = &event->hw;
@@ -5133,12 +4728,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
5133 return; 4728 return;
5134 4729
5135 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4730 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5136 return perf_swevent_overflow(event, 1, nmi, data, regs); 4731 return perf_swevent_overflow(event, 1, data, regs);
5137 4732
5138 if (local64_add_negative(nr, &hwc->period_left)) 4733 if (local64_add_negative(nr, &hwc->period_left))
5139 return; 4734 return;
5140 4735
5141 perf_swevent_overflow(event, 0, nmi, data, regs); 4736 perf_swevent_overflow(event, 0, data, regs);
5142} 4737}
5143 4738
5144static int perf_exclude_event(struct perf_event *event, 4739static int perf_exclude_event(struct perf_event *event,
@@ -5226,7 +4821,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
5226} 4821}
5227 4822
5228static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4823static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5229 u64 nr, int nmi, 4824 u64 nr,
5230 struct perf_sample_data *data, 4825 struct perf_sample_data *data,
5231 struct pt_regs *regs) 4826 struct pt_regs *regs)
5232{ 4827{
@@ -5242,7 +4837,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5242 4837
5243 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4838 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5244 if (perf_swevent_match(event, type, event_id, data, regs)) 4839 if (perf_swevent_match(event, type, event_id, data, regs))
5245 perf_swevent_event(event, nr, nmi, data, regs); 4840 perf_swevent_event(event, nr, data, regs);
5246 } 4841 }
5247end: 4842end:
5248 rcu_read_unlock(); 4843 rcu_read_unlock();
@@ -5263,8 +4858,7 @@ inline void perf_swevent_put_recursion_context(int rctx)
5263 put_recursion_context(swhash->recursion, rctx); 4858 put_recursion_context(swhash->recursion, rctx);
5264} 4859}
5265 4860
5266void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4861void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5267 struct pt_regs *regs, u64 addr)
5268{ 4862{
5269 struct perf_sample_data data; 4863 struct perf_sample_data data;
5270 int rctx; 4864 int rctx;
@@ -5276,7 +4870,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
5276 4870
5277 perf_sample_data_init(&data, addr); 4871 perf_sample_data_init(&data, addr);
5278 4872
5279 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4873 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5280 4874
5281 perf_swevent_put_recursion_context(rctx); 4875 perf_swevent_put_recursion_context(rctx);
5282 preempt_enable_notrace(); 4876 preempt_enable_notrace();
@@ -5524,7 +5118,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5524 5118
5525 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5119 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5526 if (perf_tp_event_match(event, &data, regs)) 5120 if (perf_tp_event_match(event, &data, regs))
5527 perf_swevent_event(event, count, 1, &data, regs); 5121 perf_swevent_event(event, count, &data, regs);
5528 } 5122 }
5529 5123
5530 perf_swevent_put_recursion_context(rctx); 5124 perf_swevent_put_recursion_context(rctx);
@@ -5617,7 +5211,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
5617 perf_sample_data_init(&sample, bp->attr.bp_addr); 5211 perf_sample_data_init(&sample, bp->attr.bp_addr);
5618 5212
5619 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 5213 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5620 perf_swevent_event(bp, 1, 1, &sample, regs); 5214 perf_swevent_event(bp, 1, &sample, regs);
5621} 5215}
5622#endif 5216#endif
5623 5217
@@ -5646,7 +5240,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5646 5240
5647 if (regs && !perf_exclude_event(event, regs)) { 5241 if (regs && !perf_exclude_event(event, regs)) {
5648 if (!(event->attr.exclude_idle && current->pid == 0)) 5242 if (!(event->attr.exclude_idle && current->pid == 0))
5649 if (perf_event_overflow(event, 0, &data, regs)) 5243 if (perf_event_overflow(event, &data, regs))
5650 ret = HRTIMER_NORESTART; 5244 ret = HRTIMER_NORESTART;
5651 } 5245 }
5652 5246
@@ -5986,6 +5580,7 @@ free_dev:
5986} 5580}
5987 5581
5988static struct lock_class_key cpuctx_mutex; 5582static struct lock_class_key cpuctx_mutex;
5583static struct lock_class_key cpuctx_lock;
5989 5584
5990int perf_pmu_register(struct pmu *pmu, char *name, int type) 5585int perf_pmu_register(struct pmu *pmu, char *name, int type)
5991{ 5586{
@@ -6036,6 +5631,7 @@ skip_type:
6036 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5631 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6037 __perf_event_init_context(&cpuctx->ctx); 5632 __perf_event_init_context(&cpuctx->ctx);
6038 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 5633 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5634 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6039 cpuctx->ctx.type = cpu_context; 5635 cpuctx->ctx.type = cpu_context;
6040 cpuctx->ctx.pmu = pmu; 5636 cpuctx->ctx.pmu = pmu;
6041 cpuctx->jiffies_interval = 1; 5637 cpuctx->jiffies_interval = 1;
@@ -6150,7 +5746,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6150 struct task_struct *task, 5746 struct task_struct *task,
6151 struct perf_event *group_leader, 5747 struct perf_event *group_leader,
6152 struct perf_event *parent_event, 5748 struct perf_event *parent_event,
6153 perf_overflow_handler_t overflow_handler) 5749 perf_overflow_handler_t overflow_handler,
5750 void *context)
6154{ 5751{
6155 struct pmu *pmu; 5752 struct pmu *pmu;
6156 struct perf_event *event; 5753 struct perf_event *event;
@@ -6208,10 +5805,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6208#endif 5805#endif
6209 } 5806 }
6210 5807
6211 if (!overflow_handler && parent_event) 5808 if (!overflow_handler && parent_event) {
6212 overflow_handler = parent_event->overflow_handler; 5809 overflow_handler = parent_event->overflow_handler;
5810 context = parent_event->overflow_handler_context;
5811 }
6213 5812
6214 event->overflow_handler = overflow_handler; 5813 event->overflow_handler = overflow_handler;
5814 event->overflow_handler_context = context;
6215 5815
6216 if (attr->disabled) 5816 if (attr->disabled)
6217 event->state = PERF_EVENT_STATE_OFF; 5817 event->state = PERF_EVENT_STATE_OFF;
@@ -6326,13 +5926,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6326 if (ret) 5926 if (ret)
6327 return -EFAULT; 5927 return -EFAULT;
6328 5928
6329 /*
6330 * If the type exists, the corresponding creation will verify
6331 * the attr->config.
6332 */
6333 if (attr->type >= PERF_TYPE_MAX)
6334 return -EINVAL;
6335
6336 if (attr->__reserved_1) 5929 if (attr->__reserved_1)
6337 return -EINVAL; 5930 return -EINVAL;
6338 5931
@@ -6354,7 +5947,7 @@ err_size:
6354static int 5947static int
6355perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5948perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6356{ 5949{
6357 struct perf_buffer *buffer = NULL, *old_buffer = NULL; 5950 struct ring_buffer *rb = NULL, *old_rb = NULL;
6358 int ret = -EINVAL; 5951 int ret = -EINVAL;
6359 5952
6360 if (!output_event) 5953 if (!output_event)
@@ -6371,7 +5964,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6371 goto out; 5964 goto out;
6372 5965
6373 /* 5966 /*
6374 * If its not a per-cpu buffer, it must be the same task. 5967 * If its not a per-cpu rb, it must be the same task.
6375 */ 5968 */
6376 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 5969 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6377 goto out; 5970 goto out;
@@ -6383,20 +5976,20 @@ set:
6383 goto unlock; 5976 goto unlock;
6384 5977
6385 if (output_event) { 5978 if (output_event) {
6386 /* get the buffer we want to redirect to */ 5979 /* get the rb we want to redirect to */
6387 buffer = perf_buffer_get(output_event); 5980 rb = ring_buffer_get(output_event);
6388 if (!buffer) 5981 if (!rb)
6389 goto unlock; 5982 goto unlock;
6390 } 5983 }
6391 5984
6392 old_buffer = event->buffer; 5985 old_rb = event->rb;
6393 rcu_assign_pointer(event->buffer, buffer); 5986 rcu_assign_pointer(event->rb, rb);
6394 ret = 0; 5987 ret = 0;
6395unlock: 5988unlock:
6396 mutex_unlock(&event->mmap_mutex); 5989 mutex_unlock(&event->mmap_mutex);
6397 5990
6398 if (old_buffer) 5991 if (old_rb)
6399 perf_buffer_put(old_buffer); 5992 ring_buffer_put(old_rb);
6400out: 5993out:
6401 return ret; 5994 return ret;
6402} 5995}
@@ -6478,7 +6071,8 @@ SYSCALL_DEFINE5(perf_event_open,
6478 } 6071 }
6479 } 6072 }
6480 6073
6481 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); 6074 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6075 NULL, NULL);
6482 if (IS_ERR(event)) { 6076 if (IS_ERR(event)) {
6483 err = PTR_ERR(event); 6077 err = PTR_ERR(event);
6484 goto err_task; 6078 goto err_task;
@@ -6663,7 +6257,8 @@ err_fd:
6663struct perf_event * 6257struct perf_event *
6664perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 6258perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6665 struct task_struct *task, 6259 struct task_struct *task,
6666 perf_overflow_handler_t overflow_handler) 6260 perf_overflow_handler_t overflow_handler,
6261 void *context)
6667{ 6262{
6668 struct perf_event_context *ctx; 6263 struct perf_event_context *ctx;
6669 struct perf_event *event; 6264 struct perf_event *event;
@@ -6673,7 +6268,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6673 * Get the target context (task or percpu): 6268 * Get the target context (task or percpu):
6674 */ 6269 */
6675 6270
6676 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); 6271 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
6272 overflow_handler, context);
6677 if (IS_ERR(event)) { 6273 if (IS_ERR(event)) {
6678 err = PTR_ERR(event); 6274 err = PTR_ERR(event);
6679 goto err; 6275 goto err;
@@ -6780,7 +6376,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6780 * our context. 6376 * our context.
6781 */ 6377 */
6782 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); 6378 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6783 task_ctx_sched_out(child_ctx, EVENT_ALL);
6784 6379
6785 /* 6380 /*
6786 * Take the context lock here so that if find_get_context is 6381 * Take the context lock here so that if find_get_context is
@@ -6788,6 +6383,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6788 * incremented the context's refcount before we do put_ctx below. 6383 * incremented the context's refcount before we do put_ctx below.
6789 */ 6384 */
6790 raw_spin_lock(&child_ctx->lock); 6385 raw_spin_lock(&child_ctx->lock);
6386 task_ctx_sched_out(child_ctx);
6791 child->perf_event_ctxp[ctxn] = NULL; 6387 child->perf_event_ctxp[ctxn] = NULL;
6792 /* 6388 /*
6793 * If this context is a clone; unclone it so it can't get 6389 * If this context is a clone; unclone it so it can't get
@@ -6957,7 +6553,7 @@ inherit_event(struct perf_event *parent_event,
6957 parent_event->cpu, 6553 parent_event->cpu,
6958 child, 6554 child,
6959 group_leader, parent_event, 6555 group_leader, parent_event,
6960 NULL); 6556 NULL, NULL);
6961 if (IS_ERR(child_event)) 6557 if (IS_ERR(child_event))
6962 return child_event; 6558 return child_event;
6963 get_ctx(child_ctx); 6559 get_ctx(child_ctx);
@@ -6984,6 +6580,8 @@ inherit_event(struct perf_event *parent_event,
6984 6580
6985 child_event->ctx = child_ctx; 6581 child_event->ctx = child_ctx;
6986 child_event->overflow_handler = parent_event->overflow_handler; 6582 child_event->overflow_handler = parent_event->overflow_handler;
6583 child_event->overflow_handler_context
6584 = parent_event->overflow_handler_context;
6987 6585
6988 /* 6586 /*
6989 * Precalculate sample_data sizes 6587 * Precalculate sample_data sizes
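
The core.c changes above drop the nmi argument from the output/overflow paths and let callers attach an opaque context pointer to the overflow handler, stored in event->overflow_handler_context. A minimal sketch of an in-kernel user under the new prototypes; the names my_overflow_handler, my_cycle_counter and my_ctx are illustrative, not part of the patch.

#include <linux/perf_event.h>
#include <linux/printk.h>

static void my_overflow_handler(struct perf_event *event,
				struct perf_sample_data *data,
				struct pt_regs *regs)
{
	/* the pointer passed as 'context' below comes back here */
	void *ctx = event->overflow_handler_context;

	pr_debug("cycle counter overflow, ctx=%p\n", ctx);
}

static struct perf_event *my_cycle_counter(int cpu, void *my_ctx)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
		.sample_period	= 1000000,
	};

	/* note: no nmi flag anywhere, and the extra context argument */
	return perf_event_create_kernel_counter(&attr, cpu, NULL,
						my_overflow_handler, my_ctx);
}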
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..b7971d6f38bf 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
431struct perf_event * 431struct perf_event *
432register_user_hw_breakpoint(struct perf_event_attr *attr, 432register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 void *context,
434 struct task_struct *tsk) 435 struct task_struct *tsk)
435{ 436{
436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered); 437 return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
438 context);
437} 439}
438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 440EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
439 441
@@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
502 */ 504 */
503struct perf_event * __percpu * 505struct perf_event * __percpu *
504register_wide_hw_breakpoint(struct perf_event_attr *attr, 506register_wide_hw_breakpoint(struct perf_event_attr *attr,
505 perf_overflow_handler_t triggered) 507 perf_overflow_handler_t triggered,
508 void *context)
506{ 509{
507 struct perf_event * __percpu *cpu_events, **pevent, *bp; 510 struct perf_event * __percpu *cpu_events, **pevent, *bp;
508 long err; 511 long err;
@@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
515 get_online_cpus(); 518 get_online_cpus();
516 for_each_online_cpu(cpu) { 519 for_each_online_cpu(cpu) {
517 pevent = per_cpu_ptr(cpu_events, cpu); 520 pevent = per_cpu_ptr(cpu_events, cpu);
518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); 521 bp = perf_event_create_kernel_counter(attr, cpu, NULL,
522 triggered, context);
519 523
520 *pevent = bp; 524 *pevent = bp;
521 525
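
The same context pointer is threaded through the breakpoint API. A hedged sketch of a caller of the wide variant under the new signature; my_bp_handler and my_watch_data are made-up names used only for illustration.

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <linux/printk.h>

static void my_bp_handler(struct perf_event *bp,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	pr_info("write breakpoint hit, ctx=%p\n", bp->overflow_handler_context);
}

static struct perf_event * __percpu *my_watch_data(void *addr, void *ctx)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = (unsigned long)addr;
	attr.bp_len  = HW_BREAKPOINT_LEN_4;
	attr.bp_type = HW_BREAKPOINT_W;

	/* one event per online CPU; ctx is handed back on every hit */
	return register_wide_hw_breakpoint(&attr, my_bp_handler, ctx);
}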
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
new file mode 100644
index 000000000000..09097dd8116c
--- /dev/null
+++ b/kernel/events/internal.h
@@ -0,0 +1,96 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H
3
4#define RING_BUFFER_WRITABLE 0x01
5
6struct ring_buffer {
7 atomic_t refcount;
8 struct rcu_head rcu_head;
9#ifdef CONFIG_PERF_USE_VMALLOC
10 struct work_struct work;
11 int page_order; /* allocation order */
12#endif
13 int nr_pages; /* nr of data pages */
14 int writable; /* are we writable */
15
16 atomic_t poll; /* POLL_ for wakeups */
17
18 local_t head; /* write position */
19 local_t nest; /* nested writers */
20 local_t events; /* event limit */
21 local_t wakeup; /* wakeup stamp */
22 local_t lost; /* nr records lost */
23
24 long watermark; /* wakeup watermark */
25
26 struct perf_event_mmap_page *user_page;
27 void *data_pages[0];
28};
29
30extern void rb_free(struct ring_buffer *rb);
31extern struct ring_buffer *
32rb_alloc(int nr_pages, long watermark, int cpu, int flags);
33extern void perf_event_wakeup(struct perf_event *event);
34
35extern void
36perf_event_header__init_id(struct perf_event_header *header,
37 struct perf_sample_data *data,
38 struct perf_event *event);
39extern void
40perf_event__output_id_sample(struct perf_event *event,
41 struct perf_output_handle *handle,
42 struct perf_sample_data *sample);
43
44extern struct page *
45perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
46
47#ifdef CONFIG_PERF_USE_VMALLOC
48/*
49 * Back perf_mmap() with vmalloc memory.
50 *
51 * Required for architectures that have d-cache aliasing issues.
52 */
53
54static inline int page_order(struct ring_buffer *rb)
55{
56 return rb->page_order;
57}
58
59#else
60
61static inline int page_order(struct ring_buffer *rb)
62{
63 return 0;
64}
65#endif
66
67static unsigned long perf_data_size(struct ring_buffer *rb)
68{
69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
70}
71
72static inline void
73__output_copy(struct perf_output_handle *handle,
74 const void *buf, unsigned int len)
75{
76 do {
77 unsigned long size = min_t(unsigned long, handle->size, len);
78
79 memcpy(handle->addr, buf, size);
80
81 len -= size;
82 handle->addr += size;
83 buf += size;
84 handle->size -= size;
85 if (!handle->size) {
86 struct ring_buffer *rb = handle->rb;
87
88 handle->page++;
89 handle->page &= rb->nr_pages - 1;
90 handle->addr = rb->data_pages[handle->page];
91 handle->size = PAGE_SIZE << page_order(rb);
92 }
93 } while (len);
94}
95
96#endif /* _KERNEL_EVENTS_INTERNAL_H */
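
__output_copy() above splits a record at data-page boundaries and wraps the page index by masking, which works because nr_pages is a power of two. A standalone, userspace-style model of that loop; the toy_* names and the fixed page count are assumptions for illustration, not kernel code.

#include <stddef.h>
#include <string.h>

#define TOY_PAGE_SIZE	4096
#define TOY_NR_PAGES	4		/* must be a power of two */

struct toy_handle {
	unsigned char	*pages[TOY_NR_PAGES];
	int		page;		/* current data page */
	size_t		room;		/* bytes left in that page */
	unsigned char	*addr;		/* write cursor inside it */
};

static void toy_output_copy(struct toy_handle *h,
			    const unsigned char *buf, size_t len)
{
	do {
		size_t chunk = h->room < len ? h->room : len;

		memcpy(h->addr, buf, chunk);
		len     -= chunk;
		buf     += chunk;
		h->addr += chunk;
		h->room -= chunk;

		if (!h->room) {		/* page exhausted: advance and wrap */
			h->page = (h->page + 1) & (TOY_NR_PAGES - 1);
			h->addr = h->pages[h->page];
			h->room = TOY_PAGE_SIZE;
		}
	} while (len);
}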
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
new file mode 100644
index 000000000000..a2a29205cc0f
--- /dev/null
+++ b/kernel/events/ring_buffer.c
@@ -0,0 +1,380 @@
1/*
2 * Performance events ring-buffer code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15
16#include "internal.h"
17
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long mask;
22
23 if (!rb->writable)
24 return true;
25
26 mask = perf_data_size(rb) - 1;
27
28 offset = (offset - tail) & mask;
29 head = (head - tail) & mask;
30
31 if ((int)(head - offset) < 0)
32 return false;
33
34 return true;
35}
36
37static void perf_output_wakeup(struct perf_output_handle *handle)
38{
39 atomic_set(&handle->rb->poll, POLL_IN);
40
41 handle->event->pending_wakeup = 1;
42 irq_work_queue(&handle->event->pending);
43}
44
45/*
46 * We need to ensure a later event_id doesn't publish a head when a former
47 * event isn't done writing. However since we need to deal with NMIs we
48 * cannot fully serialize things.
49 *
50 * We only publish the head (and generate a wakeup) when the outer-most
51 * event completes.
52 */
53static void perf_output_get_handle(struct perf_output_handle *handle)
54{
55 struct ring_buffer *rb = handle->rb;
56
57 preempt_disable();
58 local_inc(&rb->nest);
59 handle->wakeup = local_read(&rb->wakeup);
60}
61
62static void perf_output_put_handle(struct perf_output_handle *handle)
63{
64 struct ring_buffer *rb = handle->rb;
65 unsigned long head;
66
67again:
68 head = local_read(&rb->head);
69
70 /*
71 * IRQ/NMI can happen here, which means we can miss a head update.
72 */
73
74 if (!local_dec_and_test(&rb->nest))
75 goto out;
76
77 /*
78 * Publish the known good head. Rely on the full barrier implied
79 * by local_dec_and_test() to order the rb->head read and this
80 * write.
81 */
82 rb->user_page->data_head = head;
83
84 /*
85 * Now check if we missed an update, rely on the (compiler)
86 * barrier in local_dec_and_test() to re-read rb->head.
87 */
88 if (unlikely(head != local_read(&rb->head))) {
89 local_inc(&rb->nest);
90 goto again;
91 }
92
93 if (handle->wakeup != local_read(&rb->wakeup))
94 perf_output_wakeup(handle);
95
96out:
97 preempt_enable();
98}
99
100int perf_output_begin(struct perf_output_handle *handle,
101 struct perf_event *event, unsigned int size)
102{
103 struct ring_buffer *rb;
104 unsigned long tail, offset, head;
105 int have_lost;
106 struct perf_sample_data sample_data;
107 struct {
108 struct perf_event_header header;
109 u64 id;
110 u64 lost;
111 } lost_event;
112
113 rcu_read_lock();
114 /*
115 * For inherited events we send all the output towards the parent.
116 */
117 if (event->parent)
118 event = event->parent;
119
120 rb = rcu_dereference(event->rb);
121 if (!rb)
122 goto out;
123
124 handle->rb = rb;
125 handle->event = event;
126
127 if (!rb->nr_pages)
128 goto out;
129
130 have_lost = local_read(&rb->lost);
131 if (have_lost) {
132 lost_event.header.size = sizeof(lost_event);
133 perf_event_header__init_id(&lost_event.header, &sample_data,
134 event);
135 size += lost_event.header.size;
136 }
137
138 perf_output_get_handle(handle);
139
140 do {
141 /*
142 * Userspace could choose to issue a mb() before updating the
143 * tail pointer, so that all reads will be completed before the
144 * write is issued.
145 */
146 tail = ACCESS_ONCE(rb->user_page->data_tail);
147 smp_rmb();
148 offset = head = local_read(&rb->head);
149 head += size;
150 if (unlikely(!perf_output_space(rb, tail, offset, head)))
151 goto fail;
152 } while (local_cmpxchg(&rb->head, offset, head) != offset);
153
154 if (head - local_read(&rb->wakeup) > rb->watermark)
155 local_add(rb->watermark, &rb->wakeup);
156
157 handle->page = offset >> (PAGE_SHIFT + page_order(rb));
158 handle->page &= rb->nr_pages - 1;
159 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
160 handle->addr = rb->data_pages[handle->page];
161 handle->addr += handle->size;
162 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
163
164 if (have_lost) {
165 lost_event.header.type = PERF_RECORD_LOST;
166 lost_event.header.misc = 0;
167 lost_event.id = event->id;
168 lost_event.lost = local_xchg(&rb->lost, 0);
169
170 perf_output_put(handle, lost_event);
171 perf_event__output_id_sample(event, handle, &sample_data);
172 }
173
174 return 0;
175
176fail:
177 local_inc(&rb->lost);
178 perf_output_put_handle(handle);
179out:
180 rcu_read_unlock();
181
182 return -ENOSPC;
183}
184
185void perf_output_copy(struct perf_output_handle *handle,
186 const void *buf, unsigned int len)
187{
188 __output_copy(handle, buf, len);
189}
190
191void perf_output_end(struct perf_output_handle *handle)
192{
193 perf_output_put_handle(handle);
194 rcu_read_unlock();
195}
196
197static void
198ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
199{
200 long max_size = perf_data_size(rb);
201
202 if (watermark)
203 rb->watermark = min(max_size, watermark);
204
205 if (!rb->watermark)
206 rb->watermark = max_size / 2;
207
208 if (flags & RING_BUFFER_WRITABLE)
209 rb->writable = 1;
210
211 atomic_set(&rb->refcount, 1);
212}
213
214#ifndef CONFIG_PERF_USE_VMALLOC
215
216/*
217 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
218 */
219
220struct page *
221perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
222{
223 if (pgoff > rb->nr_pages)
224 return NULL;
225
226 if (pgoff == 0)
227 return virt_to_page(rb->user_page);
228
229 return virt_to_page(rb->data_pages[pgoff - 1]);
230}
231
232static void *perf_mmap_alloc_page(int cpu)
233{
234 struct page *page;
235 int node;
236
237 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
238 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
239 if (!page)
240 return NULL;
241
242 return page_address(page);
243}
244
245struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
246{
247 struct ring_buffer *rb;
248 unsigned long size;
249 int i;
250
251 size = sizeof(struct ring_buffer);
252 size += nr_pages * sizeof(void *);
253
254 rb = kzalloc(size, GFP_KERNEL);
255 if (!rb)
256 goto fail;
257
258 rb->user_page = perf_mmap_alloc_page(cpu);
259 if (!rb->user_page)
260 goto fail_user_page;
261
262 for (i = 0; i < nr_pages; i++) {
263 rb->data_pages[i] = perf_mmap_alloc_page(cpu);
264 if (!rb->data_pages[i])
265 goto fail_data_pages;
266 }
267
268 rb->nr_pages = nr_pages;
269
270 ring_buffer_init(rb, watermark, flags);
271
272 return rb;
273
274fail_data_pages:
275 for (i--; i >= 0; i--)
276 free_page((unsigned long)rb->data_pages[i]);
277
278 free_page((unsigned long)rb->user_page);
279
280fail_user_page:
281 kfree(rb);
282
283fail:
284 return NULL;
285}
286
287static void perf_mmap_free_page(unsigned long addr)
288{
289 struct page *page = virt_to_page((void *)addr);
290
291 page->mapping = NULL;
292 __free_page(page);
293}
294
295void rb_free(struct ring_buffer *rb)
296{
297 int i;
298
299 perf_mmap_free_page((unsigned long)rb->user_page);
300 for (i = 0; i < rb->nr_pages; i++)
301 perf_mmap_free_page((unsigned long)rb->data_pages[i]);
302 kfree(rb);
303}
304
305#else
306
307struct page *
308perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
309{
310 if (pgoff > (1UL << page_order(rb)))
311 return NULL;
312
313 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
314}
315
316static void perf_mmap_unmark_page(void *addr)
317{
318 struct page *page = vmalloc_to_page(addr);
319
320 page->mapping = NULL;
321}
322
323static void rb_free_work(struct work_struct *work)
324{
325 struct ring_buffer *rb;
326 void *base;
327 int i, nr;
328
329 rb = container_of(work, struct ring_buffer, work);
330 nr = 1 << page_order(rb);
331
332 base = rb->user_page;
333 for (i = 0; i < nr + 1; i++)
334 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
335
336 vfree(base);
337 kfree(rb);
338}
339
340void rb_free(struct ring_buffer *rb)
341{
342 schedule_work(&rb->work);
343}
344
345struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
346{
347 struct ring_buffer *rb;
348 unsigned long size;
349 void *all_buf;
350
351 size = sizeof(struct ring_buffer);
352 size += sizeof(void *);
353
354 rb = kzalloc(size, GFP_KERNEL);
355 if (!rb)
356 goto fail;
357
358 INIT_WORK(&rb->work, rb_free_work);
359
360 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
361 if (!all_buf)
362 goto fail_all_buf;
363
364 rb->user_page = all_buf;
365 rb->data_pages[0] = all_buf + PAGE_SIZE;
366 rb->page_order = ilog2(nr_pages);
367 rb->nr_pages = 1;
368
369 ring_buffer_init(rb, watermark, flags);
370
371 return rb;
372
373fail_all_buf:
374 kfree(rb);
375
376fail:
377 return NULL;
378}
379
380#endif
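
perf_output_begin() reserves space by reading the consumer's data_tail, tentatively advancing head, and retrying the cmpxchg when another (possibly NMI) writer races in. A compact userspace model of that reservation step, using C11 atomics purely for illustration; the toy_* names are not kernel symbols.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define TOY_BUF_SIZE	65536UL		/* power of two, like nr_pages pages */

struct toy_rb {
	_Atomic unsigned long head;	/* writer position */
	_Atomic unsigned long tail;	/* consumer position (user page) */
};

/* same idea as perf_output_space(): the new head must not pass the tail */
static bool toy_have_space(unsigned long tail, unsigned long offset,
			   unsigned long head)
{
	unsigned long mask = TOY_BUF_SIZE - 1;

	return (long)(((head - tail) & mask) - ((offset - tail) & mask)) >= 0;
}

static bool toy_reserve(struct toy_rb *rb, size_t size, unsigned long *offset)
{
	unsigned long tail, old, new;

	do {
		tail = atomic_load(&rb->tail);
		old  = atomic_load(&rb->head);
		new  = old + size;
		if (!toy_have_space(tail, old, new))
			return false;	/* no room: the record is lost */
	} while (!atomic_compare_exchange_weak(&rb->head, &old, new));

	*offset = old;	/* caller writes its record at this offset */
	return true;
}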
diff --git a/kernel/exit.c b/kernel/exit.c
index 14c9b63a96c3..9ee58bb9e60f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -168,7 +168,6 @@ void release_task(struct task_struct * p)
168 struct task_struct *leader; 168 struct task_struct *leader;
169 int zap_leader; 169 int zap_leader;
170repeat: 170repeat:
171 tracehook_prepare_release_task(p);
172 /* don't need to get the RCU readlock here - the process is dead and 171 /* don't need to get the RCU readlock here - the process is dead and
173 * can't be modifying its own credentials. But shut RCU-lockdep up */ 172 * can't be modifying its own credentials. But shut RCU-lockdep up */
174 rcu_read_lock(); 173 rcu_read_lock();
@@ -178,7 +177,7 @@ repeat:
178 proc_flush_task(p); 177 proc_flush_task(p);
179 178
180 write_lock_irq(&tasklist_lock); 179 write_lock_irq(&tasklist_lock);
181 tracehook_finish_release_task(p); 180 ptrace_release_task(p);
182 __exit_signal(p); 181 __exit_signal(p);
183 182
184 /* 183 /*
@@ -189,22 +188,12 @@ repeat:
189 zap_leader = 0; 188 zap_leader = 0;
190 leader = p->group_leader; 189 leader = p->group_leader;
191 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 190 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
192 BUG_ON(task_detached(leader));
193 do_notify_parent(leader, leader->exit_signal);
194 /* 191 /*
195 * If we were the last child thread and the leader has 192 * If we were the last child thread and the leader has
196 * exited already, and the leader's parent ignores SIGCHLD, 193 * exited already, and the leader's parent ignores SIGCHLD,
197 * then we are the one who should release the leader. 194 * then we are the one who should release the leader.
198 *
199 * do_notify_parent() will have marked it self-reaping in
200 * that case.
201 */
202 zap_leader = task_detached(leader);
203
204 /*
205 * This maintains the invariant that release_task()
206 * only runs on a task in EXIT_DEAD, just for sanity.
207 */ 195 */
196 zap_leader = do_notify_parent(leader, leader->exit_signal);
208 if (zap_leader) 197 if (zap_leader)
209 leader->exit_state = EXIT_DEAD; 198 leader->exit_state = EXIT_DEAD;
210 } 199 }
@@ -276,18 +265,16 @@ int is_current_pgrp_orphaned(void)
276 return retval; 265 return retval;
277} 266}
278 267
279static int has_stopped_jobs(struct pid *pgrp) 268static bool has_stopped_jobs(struct pid *pgrp)
280{ 269{
281 int retval = 0;
282 struct task_struct *p; 270 struct task_struct *p;
283 271
284 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 272 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
285 if (!task_is_stopped(p)) 273 if (p->signal->flags & SIGNAL_STOP_STOPPED)
286 continue; 274 return true;
287 retval = 1;
288 break;
289 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 275 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
290 return retval; 276
277 return false;
291} 278}
292 279
293/* 280/*
@@ -750,7 +737,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
750{ 737{
751 list_move_tail(&p->sibling, &p->real_parent->children); 738 list_move_tail(&p->sibling, &p->real_parent->children);
752 739
753 if (task_detached(p)) 740 if (p->exit_state == EXIT_DEAD)
754 return; 741 return;
755 /* 742 /*
756 * If this is a threaded reparent there is no need to 743 * If this is a threaded reparent there is no need to
@@ -763,10 +750,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
763 p->exit_signal = SIGCHLD; 750 p->exit_signal = SIGCHLD;
764 751
765 /* If it has exited notify the new parent about this child's death. */ 752 /* If it has exited notify the new parent about this child's death. */
766 if (!task_ptrace(p) && 753 if (!p->ptrace &&
767 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 754 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
768 do_notify_parent(p, p->exit_signal); 755 if (do_notify_parent(p, p->exit_signal)) {
769 if (task_detached(p)) {
770 p->exit_state = EXIT_DEAD; 756 p->exit_state = EXIT_DEAD;
771 list_move_tail(&p->sibling, dead); 757 list_move_tail(&p->sibling, dead);
772 } 758 }
@@ -793,7 +779,7 @@ static void forget_original_parent(struct task_struct *father)
793 do { 779 do {
794 t->real_parent = reaper; 780 t->real_parent = reaper;
795 if (t->parent == father) { 781 if (t->parent == father) {
796 BUG_ON(task_ptrace(t)); 782 BUG_ON(t->ptrace);
797 t->parent = t->real_parent; 783 t->parent = t->real_parent;
798 } 784 }
799 if (t->pdeath_signal) 785 if (t->pdeath_signal)
@@ -818,8 +804,7 @@ static void forget_original_parent(struct task_struct *father)
818 */ 804 */
819static void exit_notify(struct task_struct *tsk, int group_dead) 805static void exit_notify(struct task_struct *tsk, int group_dead)
820{ 806{
821 int signal; 807 bool autoreap;
822 void *cookie;
823 808
824 /* 809 /*
825 * This does two things: 810 * This does two things:
@@ -850,26 +835,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
850 * we have changed execution domain as these two values started 835 * we have changed execution domain as these two values started
851 * the same after a fork. 836 * the same after a fork.
852 */ 837 */
853 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && 838 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
854 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 839 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
855 tsk->self_exec_id != tsk->parent_exec_id)) 840 tsk->self_exec_id != tsk->parent_exec_id))
856 tsk->exit_signal = SIGCHLD; 841 tsk->exit_signal = SIGCHLD;
857 842
858 signal = tracehook_notify_death(tsk, &cookie, group_dead); 843 if (unlikely(tsk->ptrace)) {
859 if (signal >= 0) 844 int sig = thread_group_leader(tsk) &&
860 signal = do_notify_parent(tsk, signal); 845 thread_group_empty(tsk) &&
846 !ptrace_reparented(tsk) ?
847 tsk->exit_signal : SIGCHLD;
848 autoreap = do_notify_parent(tsk, sig);
849 } else if (thread_group_leader(tsk)) {
850 autoreap = thread_group_empty(tsk) &&
851 do_notify_parent(tsk, tsk->exit_signal);
852 } else {
853 autoreap = true;
854 }
861 855
862 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 856 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
863 857
864 /* mt-exec, de_thread() is waiting for group leader */ 858 /* mt-exec, de_thread() is waiting for group leader */
865 if (unlikely(tsk->signal->notify_count < 0)) 859 if (unlikely(tsk->signal->notify_count < 0))
866 wake_up_process(tsk->signal->group_exit_task); 860 wake_up_process(tsk->signal->group_exit_task);
867 write_unlock_irq(&tasklist_lock); 861 write_unlock_irq(&tasklist_lock);
868 862
869 tracehook_report_death(tsk, signal, cookie, group_dead);
870
871 /* If the process is dead, release it - nobody will wait for it */ 863 /* If the process is dead, release it - nobody will wait for it */
872 if (signal == DEATH_REAP) 864 if (autoreap)
873 release_task(tsk); 865 release_task(tsk);
874} 866}
875 867
@@ -905,7 +897,6 @@ NORET_TYPE void do_exit(long code)
905 897
906 profile_task_exit(tsk); 898 profile_task_exit(tsk);
907 899
908 WARN_ON(atomic_read(&tsk->fs_excl));
909 WARN_ON(blk_needs_flush_plug(tsk)); 900 WARN_ON(blk_needs_flush_plug(tsk));
910 901
911 if (unlikely(in_interrupt())) 902 if (unlikely(in_interrupt()))
@@ -922,7 +913,7 @@ NORET_TYPE void do_exit(long code)
922 */ 913 */
923 set_fs(USER_DS); 914 set_fs(USER_DS);
924 915
925 tracehook_report_exit(&code); 916 ptrace_event(PTRACE_EVENT_EXIT, code);
926 917
927 validate_creds_for_do_exit(tsk); 918 validate_creds_for_do_exit(tsk);
928 919
@@ -1234,9 +1225,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1234 traced = ptrace_reparented(p); 1225 traced = ptrace_reparented(p);
1235 /* 1226 /*
1236 * It can be ptraced but not reparented, check 1227 * It can be ptraced but not reparented, check
1237 * !task_detached() to filter out sub-threads. 1228 * thread_group_leader() to filter out sub-threads.
1238 */ 1229 */
1239 if (likely(!traced) && likely(!task_detached(p))) { 1230 if (likely(!traced) && thread_group_leader(p)) {
1240 struct signal_struct *psig; 1231 struct signal_struct *psig;
1241 struct signal_struct *sig; 1232 struct signal_struct *sig;
1242 unsigned long maxrss; 1233 unsigned long maxrss;
@@ -1344,16 +1335,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1344 /* We dropped tasklist, ptracer could die and untrace */ 1335 /* We dropped tasklist, ptracer could die and untrace */
1345 ptrace_unlink(p); 1336 ptrace_unlink(p);
1346 /* 1337 /*
1347 * If this is not a detached task, notify the parent. 1338 * If this is not a sub-thread, notify the parent.
1348 * If it's still not detached after that, don't release 1339 * If parent wants a zombie, don't release it now.
1349 * it now.
1350 */ 1340 */
1351 if (!task_detached(p)) { 1341 if (thread_group_leader(p) &&
1352 do_notify_parent(p, p->exit_signal); 1342 !do_notify_parent(p, p->exit_signal)) {
1353 if (!task_detached(p)) { 1343 p->exit_state = EXIT_ZOMBIE;
1354 p->exit_state = EXIT_ZOMBIE; 1344 p = NULL;
1355 p = NULL;
1356 }
1357 } 1345 }
1358 write_unlock_irq(&tasklist_lock); 1346 write_unlock_irq(&tasklist_lock);
1359 } 1347 }
@@ -1366,7 +1354,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1366static int *task_stopped_code(struct task_struct *p, bool ptrace) 1354static int *task_stopped_code(struct task_struct *p, bool ptrace)
1367{ 1355{
1368 if (ptrace) { 1356 if (ptrace) {
1369 if (task_is_stopped_or_traced(p)) 1357 if (task_is_stopped_or_traced(p) &&
1358 !(p->jobctl & JOBCTL_LISTENING))
1370 return &p->exit_code; 1359 return &p->exit_code;
1371 } else { 1360 } else {
1372 if (p->signal->flags & SIGNAL_STOP_STOPPED) 1361 if (p->signal->flags & SIGNAL_STOP_STOPPED)
@@ -1562,7 +1551,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1562 * Notification and reaping will be cascaded to the real 1551 * Notification and reaping will be cascaded to the real
1563 * parent when the ptracer detaches. 1552 * parent when the ptracer detaches.
1564 */ 1553 */
1565 if (likely(!ptrace) && unlikely(task_ptrace(p))) { 1554 if (likely(!ptrace) && unlikely(p->ptrace)) {
1566 /* it will become visible, clear notask_error */ 1555 /* it will become visible, clear notask_error */
1567 wo->notask_error = 0; 1556 wo->notask_error = 0;
1568 return 0; 1557 return 0;
@@ -1605,8 +1594,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1605 * own children, it should create a separate process which 1594 * own children, it should create a separate process which
1606 * takes the role of real parent. 1595 * takes the role of real parent.
1607 */ 1596 */
1608 if (likely(!ptrace) && task_ptrace(p) && 1597 if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
1609 same_thread_group(p->parent, p->real_parent))
1610 return 0; 1598 return 0;
1611 1599
1612 /* 1600 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30401a0..17bf7c8d6511 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,7 +37,6 @@
37#include <linux/swap.h> 37#include <linux/swap.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/jiffies.h> 39#include <linux/jiffies.h>
40#include <linux/tracehook.h>
41#include <linux/futex.h> 40#include <linux/futex.h>
42#include <linux/compat.h> 41#include <linux/compat.h>
43#include <linux/kthread.h> 42#include <linux/kthread.h>
@@ -291,7 +290,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
291 290
292 /* One for us, one for whoever does the "release_task()" (usually parent) */ 291 /* One for us, one for whoever does the "release_task()" (usually parent) */
293 atomic_set(&tsk->usage,2); 292 atomic_set(&tsk->usage,2);
294 atomic_set(&tsk->fs_excl, 0);
295#ifdef CONFIG_BLK_DEV_IO_TRACE 293#ifdef CONFIG_BLK_DEV_IO_TRACE
296 tsk->btrace_seq = 0; 294 tsk->btrace_seq = 0;
297#endif 295#endif
@@ -1013,7 +1011,7 @@ static void rt_mutex_init_task(struct task_struct *p)
1013{ 1011{
1014 raw_spin_lock_init(&p->pi_lock); 1012 raw_spin_lock_init(&p->pi_lock);
1015#ifdef CONFIG_RT_MUTEXES 1013#ifdef CONFIG_RT_MUTEXES
1016 plist_head_init_raw(&p->pi_waiters, &p->pi_lock); 1014 plist_head_init(&p->pi_waiters);
1017 p->pi_blocked_on = NULL; 1015 p->pi_blocked_on = NULL;
1018#endif 1016#endif
1019} 1017}
@@ -1340,7 +1338,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1340 } 1338 }
1341 1339
1342 if (likely(p->pid)) { 1340 if (likely(p->pid)) {
1343 tracehook_finish_clone(p, clone_flags, trace); 1341 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1344 1342
1345 if (thread_group_leader(p)) { 1343 if (thread_group_leader(p)) {
1346 if (is_child_reaper(pid)) 1344 if (is_child_reaper(pid))
@@ -1481,10 +1479,22 @@ long do_fork(unsigned long clone_flags,
1481 } 1479 }
1482 1480
1483 /* 1481 /*
1484 * When called from kernel_thread, don't do user tracing stuff. 1482 * Determine whether and which event to report to ptracer. When
1483 * called from kernel_thread or CLONE_UNTRACED is explicitly
1484 * requested, no event is reported; otherwise, report if the event
1485 * for the type of forking is enabled.
1485 */ 1486 */
1486 if (likely(user_mode(regs))) 1487 if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
1487 trace = tracehook_prepare_clone(clone_flags); 1488 if (clone_flags & CLONE_VFORK)
1489 trace = PTRACE_EVENT_VFORK;
1490 else if ((clone_flags & CSIGNAL) != SIGCHLD)
1491 trace = PTRACE_EVENT_CLONE;
1492 else
1493 trace = PTRACE_EVENT_FORK;
1494
1495 if (likely(!ptrace_event_enabled(current, trace)))
1496 trace = 0;
1497 }
1488 1498
1489 p = copy_process(clone_flags, stack_start, regs, stack_size, 1499 p = copy_process(clone_flags, stack_start, regs, stack_size,
1490 child_tidptr, NULL, trace); 1500 child_tidptr, NULL, trace);
@@ -1508,26 +1518,26 @@ long do_fork(unsigned long clone_flags,
1508 } 1518 }
1509 1519
1510 audit_finish_fork(p); 1520 audit_finish_fork(p);
1511 tracehook_report_clone(regs, clone_flags, nr, p);
1512 1521
1513 /* 1522 /*
1514 * We set PF_STARTING at creation in case tracing wants to 1523 * We set PF_STARTING at creation in case tracing wants to
1515 * use this to distinguish a fully live task from one that 1524 * use this to distinguish a fully live task from one that
1516 * hasn't gotten to tracehook_report_clone() yet. Now we 1525 * hasn't finished SIGSTOP raising yet. Now we clear it
1517 * clear it and set the child going. 1526 * and set the child going.
1518 */ 1527 */
1519 p->flags &= ~PF_STARTING; 1528 p->flags &= ~PF_STARTING;
1520 1529
1521 wake_up_new_task(p); 1530 wake_up_new_task(p);
1522 1531
1523 tracehook_report_clone_complete(trace, regs, 1532 /* forking complete and child started to run, tell ptracer */
1524 clone_flags, nr, p); 1533 if (unlikely(trace))
1534 ptrace_event(trace, nr);
1525 1535
1526 if (clone_flags & CLONE_VFORK) { 1536 if (clone_flags & CLONE_VFORK) {
1527 freezer_do_not_count(); 1537 freezer_do_not_count();
1528 wait_for_completion(&vfork); 1538 wait_for_completion(&vfork);
1529 freezer_count(); 1539 freezer_count();
1530 tracehook_report_vfork_done(p, nr); 1540 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1531 } 1541 }
1532 } else { 1542 } else {
1533 nr = PTR_ERR(p); 1543 nr = PTR_ERR(p);
@@ -1574,6 +1584,7 @@ void __init proc_caches_init(void)
1574 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1584 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1575 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1585 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1576 mmap_init(); 1586 mmap_init();
1587 nsproxy_cache_init();
1577} 1588}
1578 1589
1579/* 1590/*
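
do_fork() now picks the ptrace event inline instead of asking tracehook_prepare_clone(). The mapping it applies can be restated as a tiny helper; toy_fork_trace_event() is illustrative only, and the real code additionally gates the result on ptrace_event_enabled(current, trace).

#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/signal.h>

static int toy_fork_trace_event(unsigned long clone_flags)
{
	if (clone_flags & CLONE_UNTRACED)
		return 0;			/* never reported */
	if (clone_flags & CLONE_VFORK)
		return PTRACE_EVENT_VFORK;
	if ((clone_flags & CSIGNAL) != SIGCHLD)
		return PTRACE_EVENT_CLONE;	/* thread-style clone */
	return PTRACE_EVENT_FORK;
}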
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc282eae..3fbc76cbb9aa 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2697,7 +2697,7 @@ static int __init futex_init(void)
2697 futex_cmpxchg_enabled = 1; 2697 futex_cmpxchg_enabled = 1;
2698 2698
2699 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2699 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2700 plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); 2700 plist_head_init(&futex_queues[i].chain);
2701 spin_lock_init(&futex_queues[i].lock); 2701 spin_lock_init(&futex_queues[i].lock);
2702 } 2702 }
2703 2703
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 31a9db711906..3a2cab407b93 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -101,10 +101,10 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
101} 101}
102 102
103/** 103/**
104 * irq_gc_ack - Ack pending interrupt 104 * irq_gc_ack_set_bit - Ack pending interrupt via setting bit
105 * @d: irq_data 105 * @d: irq_data
106 */ 106 */
107void irq_gc_ack(struct irq_data *d) 107void irq_gc_ack_set_bit(struct irq_data *d)
108{ 108{
109 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 109 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
110 u32 mask = 1 << (d->irq - gc->irq_base); 110 u32 mask = 1 << (d->irq - gc->irq_base);
@@ -115,6 +115,20 @@ void irq_gc_ack(struct irq_data *d)
115} 115}
116 116
117/** 117/**
118 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
119 * @d: irq_data
120 */
121void irq_gc_ack_clr_bit(struct irq_data *d)
122{
123 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
124 u32 mask = ~(1 << (d->irq - gc->irq_base));
125
126 irq_gc_lock(gc);
127 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
128 irq_gc_unlock(gc);
129}
130
131/**
118 * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt 132 * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
119 * @d: irq_data 133 * @d: irq_data
120 */ 134 */
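
With irq_gc_ack() split into set-bit and clear-bit variants, a chip whose hardware acks by writing 0 to the pending bit points its irq_ack callback at the new helper. A hedged fragment showing where the callback is wired up; MY_ACK_REG is a hypothetical register offset and the gc is assumed to come from irq_alloc_generic_chip().

#include <linux/irq.h>

#define MY_ACK_REG	0x10	/* hypothetical ack register offset */

static void my_wire_up_ack(struct irq_chip_generic *gc, bool ack_by_clearing)
{
	struct irq_chip_type *ct = gc->chip_types;

	ct->regs.ack = MY_ACK_REG;
	ct->chip.irq_ack = ack_by_clearing ? irq_gc_ack_clr_bit
					   : irq_gc_ack_set_bit;
}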
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 77981813a1e7..b30fd54eb985 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1255,19 +1255,29 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
1255/* 1255/*
1256 * If we have a symbol_name argument, look it up and add the offset field 1256 * If we have a symbol_name argument, look it up and add the offset field
1257 * to it. This way, we can specify a relative address to a symbol. 1257 * to it. This way, we can specify a relative address to a symbol.
1258 * This returns encoded errors if it fails to look up symbol or invalid
1259 * combination of parameters.
1258 */ 1260 */
1259static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) 1261static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1260{ 1262{
1261 kprobe_opcode_t *addr = p->addr; 1263 kprobe_opcode_t *addr = p->addr;
1264
1265 if ((p->symbol_name && p->addr) ||
1266 (!p->symbol_name && !p->addr))
1267 goto invalid;
1268
1262 if (p->symbol_name) { 1269 if (p->symbol_name) {
1263 if (addr)
1264 return NULL;
1265 kprobe_lookup_name(p->symbol_name, addr); 1270 kprobe_lookup_name(p->symbol_name, addr);
1271 if (!addr)
1272 return ERR_PTR(-ENOENT);
1266 } 1273 }
1267 1274
1268 if (!addr) 1275 addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
1269 return NULL; 1276 if (addr)
1270 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 1277 return addr;
1278
1279invalid:
1280 return ERR_PTR(-EINVAL);
1271} 1281}
1272 1282
1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1283/* Check passed kprobe is valid and return kprobe in kprobe_table. */
@@ -1311,8 +1321,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1311 kprobe_opcode_t *addr; 1321 kprobe_opcode_t *addr;
1312 1322
1313 addr = kprobe_addr(p); 1323 addr = kprobe_addr(p);
1314 if (!addr) 1324 if (IS_ERR(addr))
1315 return -EINVAL; 1325 return PTR_ERR(addr);
1316 p->addr = addr; 1326 p->addr = addr;
1317 1327
1318 ret = check_kprobe_rereg(p); 1328 ret = check_kprobe_rereg(p);
@@ -1335,6 +1345,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1335 */ 1345 */
1336 probed_mod = __module_text_address((unsigned long) p->addr); 1346 probed_mod = __module_text_address((unsigned long) p->addr);
1337 if (probed_mod) { 1347 if (probed_mod) {
1348 /* Return -ENOENT on failure. */
1349 ret = -ENOENT;
1338 /* 1350 /*
1339 * We must hold a refcount of the probed module while updating 1351 * We must hold a refcount of the probed module while updating
1340 * its code to prohibit unexpected unloading. 1352 * its code to prohibit unexpected unloading.
@@ -1351,6 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1351 module_put(probed_mod); 1363 module_put(probed_mod);
1352 goto fail_with_jump_label; 1364 goto fail_with_jump_label;
1353 } 1365 }
1366 /* ret will be updated by following code */
1354 } 1367 }
1355 preempt_enable(); 1368 preempt_enable();
1356 jump_label_unlock(); 1369 jump_label_unlock();
@@ -1399,7 +1412,7 @@ out:
1399fail_with_jump_label: 1412fail_with_jump_label:
1400 preempt_enable(); 1413 preempt_enable();
1401 jump_label_unlock(); 1414 jump_label_unlock();
1402 return -EINVAL; 1415 return ret;
1403} 1416}
1404EXPORT_SYMBOL_GPL(register_kprobe); 1417EXPORT_SYMBOL_GPL(register_kprobe);
1405 1418
@@ -1686,8 +1699,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1686 1699
1687 if (kretprobe_blacklist_size) { 1700 if (kretprobe_blacklist_size) {
1688 addr = kprobe_addr(&rp->kp); 1701 addr = kprobe_addr(&rp->kp);
1689 if (!addr) 1702 if (IS_ERR(addr))
1690 return -EINVAL; 1703 return PTR_ERR(addr);
1691 1704
1692 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 1705 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
1693 if (kretprobe_blacklist[i].addr == addr) 1706 if (kretprobe_blacklist[i].addr == addr)
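
kprobe_addr() now reports failures as ERR_PTR()-encoded errnos, letting register_kprobe() and register_kretprobe() distinguish a bad symbol/address combination (-EINVAL) from a missing symbol (-ENOENT). The stand-alone sketch below restates that caller-side convention with a hypothetical lookup helper rather than the real kprobes code.

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/kallsyms.h>

static void *demo_lookup(const char *name, void *addr)
{
	/* exactly one of name/addr must be given, as in kprobe_addr() */
	if ((name && addr) || (!name && !addr))
		return ERR_PTR(-EINVAL);

	if (name) {
		addr = (void *)kallsyms_lookup_name(name);
		if (!addr)
			return ERR_PTR(-ENOENT);	/* symbol not found */
	}
	return addr;
}

static int demo_register(const char *name)
{
	void *addr = demo_lookup(name, NULL);

	if (IS_ERR(addr))
		return PTR_ERR(addr);	/* propagate -EINVAL or -ENOENT */
	/* ... use addr ... */
	return 0;
}
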
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 298c9276dfdb..3956f5149e25 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2468,6 +2468,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2468 2468
2469 BUG_ON(usage_bit >= LOCK_USAGE_STATES); 2469 BUG_ON(usage_bit >= LOCK_USAGE_STATES);
2470 2470
2471 if (hlock_class(hlock)->key == &__lockdep_no_validate__)
2472 continue;
2473
2471 if (!mark_lock(curr, hlock, usage_bit)) 2474 if (!mark_lock(curr, hlock, usage_bit))
2472 return 0; 2475 return 0;
2473 } 2476 }
@@ -2478,15 +2481,10 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2478/* 2481/*
2479 * Hardirqs will be enabled: 2482 * Hardirqs will be enabled:
2480 */ 2483 */
2481void trace_hardirqs_on_caller(unsigned long ip) 2484static void __trace_hardirqs_on_caller(unsigned long ip)
2482{ 2485{
2483 struct task_struct *curr = current; 2486 struct task_struct *curr = current;
2484 2487
2485 time_hardirqs_on(CALLER_ADDR0, ip);
2486
2487 if (unlikely(!debug_locks || current->lockdep_recursion))
2488 return;
2489
2490 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 2488 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2491 return; 2489 return;
2492 2490
@@ -2502,8 +2500,6 @@ void trace_hardirqs_on_caller(unsigned long ip)
2502 /* we'll do an OFF -> ON transition: */ 2500 /* we'll do an OFF -> ON transition: */
2503 curr->hardirqs_enabled = 1; 2501 curr->hardirqs_enabled = 1;
2504 2502
2505 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2506 return;
2507 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) 2503 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2508 return; 2504 return;
2509 /* 2505 /*
@@ -2525,6 +2521,21 @@ void trace_hardirqs_on_caller(unsigned long ip)
2525 curr->hardirq_enable_event = ++curr->irq_events; 2521 curr->hardirq_enable_event = ++curr->irq_events;
2526 debug_atomic_inc(hardirqs_on_events); 2522 debug_atomic_inc(hardirqs_on_events);
2527} 2523}
2524
2525void trace_hardirqs_on_caller(unsigned long ip)
2526{
2527 time_hardirqs_on(CALLER_ADDR0, ip);
2528
2529 if (unlikely(!debug_locks || current->lockdep_recursion))
2530 return;
2531
2532 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2533 return;
2534
2535 current->lockdep_recursion = 1;
2536 __trace_hardirqs_on_caller(ip);
2537 current->lockdep_recursion = 0;
2538}
2528EXPORT_SYMBOL(trace_hardirqs_on_caller); 2539EXPORT_SYMBOL(trace_hardirqs_on_caller);
2529 2540
2530void trace_hardirqs_on(void) 2541void trace_hardirqs_on(void)
@@ -2574,7 +2585,7 @@ void trace_softirqs_on(unsigned long ip)
2574{ 2585{
2575 struct task_struct *curr = current; 2586 struct task_struct *curr = current;
2576 2587
2577 if (unlikely(!debug_locks)) 2588 if (unlikely(!debug_locks || current->lockdep_recursion))
2578 return; 2589 return;
2579 2590
2580 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2591 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
@@ -2585,6 +2596,7 @@ void trace_softirqs_on(unsigned long ip)
2585 return; 2596 return;
2586 } 2597 }
2587 2598
2599 current->lockdep_recursion = 1;
2588 /* 2600 /*
2589 * We'll do an OFF -> ON transition: 2601 * We'll do an OFF -> ON transition:
2590 */ 2602 */
@@ -2599,6 +2611,7 @@ void trace_softirqs_on(unsigned long ip)
2599 */ 2611 */
2600 if (curr->hardirqs_enabled) 2612 if (curr->hardirqs_enabled)
2601 mark_held_locks(curr, SOFTIRQ); 2613 mark_held_locks(curr, SOFTIRQ);
2614 current->lockdep_recursion = 0;
2602} 2615}
2603 2616
2604/* 2617/*
@@ -2608,7 +2621,7 @@ void trace_softirqs_off(unsigned long ip)
2608{ 2621{
2609 struct task_struct *curr = current; 2622 struct task_struct *curr = current;
2610 2623
2611 if (unlikely(!debug_locks)) 2624 if (unlikely(!debug_locks || current->lockdep_recursion))
2612 return; 2625 return;
2613 2626
2614 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2627 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
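
The lockdep hunks split trace_hardirqs_on_caller() and guard the IRQ/softirq tracing paths with current->lockdep_recursion, so annotations triggered from inside lockdep itself cannot re-enter it. A simplified restatement of that guard idiom, with placeholder function names:

#include <linux/sched.h>

static void __demo_trace_event(unsigned long ip)
{
	/* body that may indirectly trigger demo_trace_event() again */
}

static void demo_trace_event(unsigned long ip)
{
	/* bail out while a previous invocation is still on this task */
	if (current->lockdep_recursion)
		return;

	current->lockdep_recursion = 1;
	__demo_trace_event(ip);
	current->lockdep_recursion = 0;
}
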
diff --git a/kernel/module.c b/kernel/module.c
index 795bdc7f5c3f..04379f92f843 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -545,9 +545,9 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \
545 mod->field = kstrdup(s, GFP_KERNEL); \ 545 mod->field = kstrdup(s, GFP_KERNEL); \
546} \ 546} \
547static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ 547static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
548 struct module *mod, char *buffer) \ 548 struct module_kobject *mk, char *buffer) \
549{ \ 549{ \
550 return sprintf(buffer, "%s\n", mod->field); \ 550 return sprintf(buffer, "%s\n", mk->mod->field); \
551} \ 551} \
552static int modinfo_##field##_exists(struct module *mod) \ 552static int modinfo_##field##_exists(struct module *mod) \
553{ \ 553{ \
@@ -902,9 +902,9 @@ void symbol_put_addr(void *addr)
902EXPORT_SYMBOL_GPL(symbol_put_addr); 902EXPORT_SYMBOL_GPL(symbol_put_addr);
903 903
904static ssize_t show_refcnt(struct module_attribute *mattr, 904static ssize_t show_refcnt(struct module_attribute *mattr,
905 struct module *mod, char *buffer) 905 struct module_kobject *mk, char *buffer)
906{ 906{
907 return sprintf(buffer, "%u\n", module_refcount(mod)); 907 return sprintf(buffer, "%u\n", module_refcount(mk->mod));
908} 908}
909 909
910static struct module_attribute refcnt = { 910static struct module_attribute refcnt = {
@@ -952,11 +952,11 @@ static inline int module_unload_init(struct module *mod)
952#endif /* CONFIG_MODULE_UNLOAD */ 952#endif /* CONFIG_MODULE_UNLOAD */
953 953
954static ssize_t show_initstate(struct module_attribute *mattr, 954static ssize_t show_initstate(struct module_attribute *mattr,
955 struct module *mod, char *buffer) 955 struct module_kobject *mk, char *buffer)
956{ 956{
957 const char *state = "unknown"; 957 const char *state = "unknown";
958 958
959 switch (mod->state) { 959 switch (mk->mod->state) {
960 case MODULE_STATE_LIVE: 960 case MODULE_STATE_LIVE:
961 state = "live"; 961 state = "live";
962 break; 962 break;
@@ -975,10 +975,27 @@ static struct module_attribute initstate = {
975 .show = show_initstate, 975 .show = show_initstate,
976}; 976};
977 977
978static ssize_t store_uevent(struct module_attribute *mattr,
979 struct module_kobject *mk,
980 const char *buffer, size_t count)
981{
982 enum kobject_action action;
983
984 if (kobject_action_type(buffer, count, &action) == 0)
985 kobject_uevent(&mk->kobj, action);
986 return count;
987}
988
989struct module_attribute module_uevent = {
990 .attr = { .name = "uevent", .mode = 0200 },
991 .store = store_uevent,
992};
993
978static struct module_attribute *modinfo_attrs[] = { 994static struct module_attribute *modinfo_attrs[] = {
979 &modinfo_version, 995 &modinfo_version,
980 &modinfo_srcversion, 996 &modinfo_srcversion,
981 &initstate, 997 &initstate,
998 &module_uevent,
982#ifdef CONFIG_MODULE_UNLOAD 999#ifdef CONFIG_MODULE_UNLOAD
983 &refcnt, 1000 &refcnt,
984#endif 1001#endif
@@ -1187,7 +1204,7 @@ struct module_sect_attrs
1187}; 1204};
1188 1205
1189static ssize_t module_sect_show(struct module_attribute *mattr, 1206static ssize_t module_sect_show(struct module_attribute *mattr,
1190 struct module *mod, char *buf) 1207 struct module_kobject *mk, char *buf)
1191{ 1208{
1192 struct module_sect_attr *sattr = 1209 struct module_sect_attr *sattr =
1193 container_of(mattr, struct module_sect_attr, mattr); 1210 container_of(mattr, struct module_sect_attr, mattr);
@@ -1697,6 +1714,15 @@ static void unset_module_core_ro_nx(struct module *mod) { }
1697static void unset_module_init_ro_nx(struct module *mod) { } 1714static void unset_module_init_ro_nx(struct module *mod) { }
1698#endif 1715#endif
1699 1716
1717void __weak module_free(struct module *mod, void *module_region)
1718{
1719 vfree(module_region);
1720}
1721
1722void __weak module_arch_cleanup(struct module *mod)
1723{
1724}
1725
1700/* Free a module, remove from lists, etc. */ 1726/* Free a module, remove from lists, etc. */
1701static void free_module(struct module *mod) 1727static void free_module(struct module *mod)
1702{ 1728{
@@ -1851,6 +1877,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1851 return ret; 1877 return ret;
1852} 1878}
1853 1879
1880int __weak apply_relocate(Elf_Shdr *sechdrs,
1881 const char *strtab,
1882 unsigned int symindex,
1883 unsigned int relsec,
1884 struct module *me)
1885{
1886 pr_err("module %s: REL relocation unsupported\n", me->name);
1887 return -ENOEXEC;
1888}
1889
1890int __weak apply_relocate_add(Elf_Shdr *sechdrs,
1891 const char *strtab,
1892 unsigned int symindex,
1893 unsigned int relsec,
1894 struct module *me)
1895{
1896 pr_err("module %s: RELA relocation unsupported\n", me->name);
1897 return -ENOEXEC;
1898}
1899
1854static int apply_relocations(struct module *mod, const struct load_info *info) 1900static int apply_relocations(struct module *mod, const struct load_info *info)
1855{ 1901{
1856 unsigned int i; 1902 unsigned int i;
@@ -2235,6 +2281,11 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2235 ddebug_remove_module(debug->modname); 2281 ddebug_remove_module(debug->modname);
2236} 2282}
2237 2283
2284void * __weak module_alloc(unsigned long size)
2285{
2286 return size == 0 ? NULL : vmalloc_exec(size);
2287}
2288
2238static void *module_alloc_update_bounds(unsigned long size) 2289static void *module_alloc_update_bounds(unsigned long size)
2239{ 2290{
2240 void *ret = module_alloc(size); 2291 void *ret = module_alloc(size);
@@ -2645,6 +2696,14 @@ static void flush_module_icache(const struct module *mod)
2645 set_fs(old_fs); 2696 set_fs(old_fs);
2646} 2697}
2647 2698
2699int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2700 Elf_Shdr *sechdrs,
2701 char *secstrings,
2702 struct module *mod)
2703{
2704 return 0;
2705}
2706
2648static struct module *layout_and_allocate(struct load_info *info) 2707static struct module *layout_and_allocate(struct load_info *info)
2649{ 2708{
2650 /* Module within temporary copy. */ 2709 /* Module within temporary copy. */
@@ -2716,6 +2775,13 @@ static void module_deallocate(struct module *mod, struct load_info *info)
2716 module_free(mod, mod->module_core); 2775 module_free(mod, mod->module_core);
2717} 2776}
2718 2777
2778int __weak module_finalize(const Elf_Ehdr *hdr,
2779 const Elf_Shdr *sechdrs,
2780 struct module *me)
2781{
2782 return 0;
2783}
2784
2719static int post_relocation(struct module *mod, const struct load_info *info) 2785static int post_relocation(struct module *mod, const struct load_info *info)
2720{ 2786{
2721 /* Sort exception table now relocations are done. */ 2787 /* Sort exception table now relocations are done. */
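
Two themes run through the module.c diff: sysfs module attributes now receive the struct module_kobject (reaching the module via mk->mod), and several architecture hooks (module_alloc(), module_free(), apply_relocate(), apply_relocate_add(), module_frob_arch_sections(), module_finalize()) gain __weak generic fallbacks that an architecture overrides simply by providing its own non-weak definition. Below is a hedged sketch of an attribute written against the new callback signature; the attribute name is made up.

#include <linux/kernel.h>
#include <linux/module.h>

static ssize_t show_demo(struct module_attribute *mattr,
			 struct module_kobject *mk, char *buffer)
{
	/* the old signature passed struct module *mod directly */
	return sprintf(buffer, "%s\n", mk->mod->name);
}

static struct module_attribute demo_attr = {
	.attr = { .name = "demo", .mode = 0444 },
	.show = show_demo,
};
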
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index d6a00f3de15d..9aeab4b98c64 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -271,10 +271,8 @@ out:
271 return err; 271 return err;
272} 272}
273 273
274static int __init nsproxy_cache_init(void) 274int __init nsproxy_cache_init(void)
275{ 275{
276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
277 return 0; 277 return 0;
278} 278}
279
280module_init(nsproxy_cache_init);
diff --git a/kernel/params.c b/kernel/params.c
index ed72e1330862..22df3e0d142a 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -225,8 +225,8 @@ int parse_args(const char *name,
225 int ret; \ 225 int ret; \
226 \ 226 \
227 ret = strtolfn(val, 0, &l); \ 227 ret = strtolfn(val, 0, &l); \
228 if (ret == -EINVAL || ((type)l != l)) \ 228 if (ret < 0 || ((type)l != l)) \
229 return -EINVAL; \ 229 return ret < 0 ? ret : -EINVAL; \
230 *((type *)kp->arg) = l; \ 230 *((type *)kp->arg) = l; \
231 return 0; \ 231 return 0; \
232 } \ 232 } \
@@ -511,7 +511,7 @@ struct module_param_attrs
511#define to_param_attr(n) container_of(n, struct param_attribute, mattr) 511#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
512 512
513static ssize_t param_attr_show(struct module_attribute *mattr, 513static ssize_t param_attr_show(struct module_attribute *mattr,
514 struct module *mod, char *buf) 514 struct module_kobject *mk, char *buf)
515{ 515{
516 int count; 516 int count;
517 struct param_attribute *attribute = to_param_attr(mattr); 517 struct param_attribute *attribute = to_param_attr(mattr);
@@ -531,7 +531,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
531 531
532/* sysfs always hands a nul-terminated string in buf. We rely on that. */ 532/* sysfs always hands a nul-terminated string in buf. We rely on that. */
533static ssize_t param_attr_store(struct module_attribute *mattr, 533static ssize_t param_attr_store(struct module_attribute *mattr,
534 struct module *owner, 534 struct module_kobject *km,
535 const char *buf, size_t len) 535 const char *buf, size_t len)
536{ 536{
537 int err; 537 int err;
@@ -730,6 +730,10 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
730 mk->kobj.kset = module_kset; 730 mk->kobj.kset = module_kset;
731 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, 731 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
732 "%s", name); 732 "%s", name);
733#ifdef CONFIG_MODULES
734 if (!err)
735 err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
736#endif
733 if (err) { 737 if (err) {
734 kobject_put(&mk->kobj); 738 kobject_put(&mk->kobj);
735 printk(KERN_ERR 739 printk(KERN_ERR
@@ -807,7 +811,7 @@ static void __init param_sysfs_builtin(void)
807} 811}
808 812
809ssize_t __modver_version_show(struct module_attribute *mattr, 813ssize_t __modver_version_show(struct module_attribute *mattr,
810 struct module *mod, char *buf) 814 struct module_kobject *mk, char *buf)
811{ 815{
812 struct module_version_attribute *vattr = 816 struct module_version_attribute *vattr =
813 container_of(mattr, struct module_version_attribute, mattr); 817 container_of(mattr, struct module_version_attribute, mattr);
@@ -852,7 +856,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
852 if (!attribute->show) 856 if (!attribute->show)
853 return -EIO; 857 return -EIO;
854 858
855 ret = attribute->show(attribute, mk->mod, buf); 859 ret = attribute->show(attribute, mk, buf);
856 860
857 return ret; 861 return ret;
858} 862}
@@ -871,7 +875,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
871 if (!attribute->store) 875 if (!attribute->store)
872 return -EIO; 876 return -EIO;
873 877
874 ret = attribute->store(attribute, mk->mod, buf, len); 878 ret = attribute->store(attribute, mk, buf, len);
875 879
876 return ret; 880 return ret;
877} 881}
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 6824ca7d4d0c..37f05d0f0793 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -74,7 +74,7 @@ static DEFINE_SPINLOCK(pm_qos_lock);
74static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), 77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests),
78 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
@@ -84,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
84 84
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), 87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests),
88 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
89 .name = "network_latency", 89 .name = "network_latency",
90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
@@ -95,7 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
95 95
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), 98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests),
99 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 100 .name = "network_throughput",
101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index bcd8fce351b7..b1914cb9095c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -224,6 +224,10 @@ config PM_OPP
224 implementations a ready to use framework to manage OPPs. 224 implementations a ready to use framework to manage OPPs.
225 For more information, read <file:Documentation/power/opp.txt> 225 For more information, read <file:Documentation/power/opp.txt>
226 226
227config PM_RUNTIME_CLK 227config PM_CLK
228 def_bool y 228 def_bool y
229 depends on PM_RUNTIME && HAVE_CLK 229 depends on PM && HAVE_CLK
230
231config PM_GENERIC_DOMAINS
232 bool
233 depends on PM
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 2981af4ce7cb..6c601f871964 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -37,8 +37,9 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier);
37 37
38int pm_notifier_call_chain(unsigned long val) 38int pm_notifier_call_chain(unsigned long val)
39{ 39{
40 return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) 40 int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
41 == NOTIFY_BAD) ? -EINVAL : 0; 41
42 return notifier_to_errno(ret);
42} 43}
43 44
44/* If set, devices may be suspended and resumed asynchronously. */ 45/* If set, devices may be suspended and resumed asynchronously. */
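
pm_notifier_call_chain() now returns notifier_to_errno() of the chain result, so a PM notifier that refuses a transition with notifier_from_errno() has its specific error propagated instead of a blanket -EINVAL. A hedged example of a notifier taking advantage of this; the -EBUSY condition is a placeholder.

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/suspend.h>

static int demo_pm_notifier(struct notifier_block *nb,
			    unsigned long event, void *unused)
{
	/* refuse suspend with a meaningful errno when the device is busy */
	if (event == PM_SUSPEND_PREPARE && 0 /* placeholder: device busy? */)
		return notifier_from_errno(-EBUSY);

	return NOTIFY_OK;
}

static struct notifier_block demo_pm_nb = {
	.notifier_call = demo_pm_notifier,
};

static int __init demo_pm_init(void)
{
	return register_pm_notifier(&demo_pm_nb);
}
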
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1c41ba215419..b6b71ad2208f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -44,6 +44,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
44 suspend_ops = ops; 44 suspend_ops = ops;
45 mutex_unlock(&pm_mutex); 45 mutex_unlock(&pm_mutex);
46} 46}
47EXPORT_SYMBOL_GPL(suspend_set_ops);
47 48
48bool valid_state(suspend_state_t state) 49bool valid_state(suspend_state_t state)
49{ 50{
@@ -65,6 +66,7 @@ int suspend_valid_only_mem(suspend_state_t state)
65{ 66{
66 return state == PM_SUSPEND_MEM; 67 return state == PM_SUSPEND_MEM;
67} 68}
69EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
68 70
69static int suspend_test(int level) 71static int suspend_test(int level)
70{ 72{
@@ -126,12 +128,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
126} 128}
127 129
128/** 130/**
129 * suspend_enter - enter the desired system sleep state. 131 * suspend_enter - enter the desired system sleep state.
130 * @state: state to enter 132 * @state: State to enter
133 * @wakeup: Returns information that suspend should not be entered again.
131 * 134 *
132 * This function should be called after devices have been suspended. 135 * This function should be called after devices have been suspended.
133 */ 136 */
134static int suspend_enter(suspend_state_t state) 137static int suspend_enter(suspend_state_t state, bool *wakeup)
135{ 138{
136 int error; 139 int error;
137 140
@@ -165,7 +168,8 @@ static int suspend_enter(suspend_state_t state)
165 168
166 error = syscore_suspend(); 169 error = syscore_suspend();
167 if (!error) { 170 if (!error) {
168 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 171 *wakeup = pm_wakeup_pending();
172 if (!(suspend_test(TEST_CORE) || *wakeup)) {
169 error = suspend_ops->enter(state); 173 error = suspend_ops->enter(state);
170 events_check_enabled = false; 174 events_check_enabled = false;
171 } 175 }
@@ -199,6 +203,7 @@ static int suspend_enter(suspend_state_t state)
199int suspend_devices_and_enter(suspend_state_t state) 203int suspend_devices_and_enter(suspend_state_t state)
200{ 204{
201 int error; 205 int error;
206 bool wakeup = false;
202 207
203 if (!suspend_ops) 208 if (!suspend_ops)
204 return -ENOSYS; 209 return -ENOSYS;
@@ -220,7 +225,10 @@ int suspend_devices_and_enter(suspend_state_t state)
220 if (suspend_test(TEST_DEVICES)) 225 if (suspend_test(TEST_DEVICES))
221 goto Recover_platform; 226 goto Recover_platform;
222 227
223 error = suspend_enter(state); 228 do {
229 error = suspend_enter(state, &wakeup);
230 } while (!error && !wakeup
231 && suspend_ops->suspend_again && suspend_ops->suspend_again());
224 232
225 Resume_devices: 233 Resume_devices:
226 suspend_test_start(); 234 suspend_test_start();
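
suspend_enter() now reports whether a wakeup event was pending, and suspend_devices_and_enter() loops while the platform's new ->suspend_again() callback asks to go back to sleep, so spurious wakeups can be handled without resuming every device. A hedged sketch of a platform_suspend_ops using the new hook; the spurious-wakeup test and the enter routine are placeholders.

#include <linux/init.h>
#include <linux/suspend.h>

static bool demo_wakeup_was_spurious;	/* set by platform wakeup code */

static bool demo_suspend_again(void)
{
	/* return true to re-enter suspend without resuming devices */
	return demo_wakeup_was_spurious;
}

static int demo_suspend_enter(suspend_state_t state)
{
	/* placeholder: program the hardware to sleep */
	return 0;
}

static const struct platform_suspend_ops demo_suspend_ops = {
	.valid		= suspend_valid_only_mem,
	.enter		= demo_suspend_enter,
	.suspend_again	= demo_suspend_again,
};

static int __init demo_pm_setup(void)
{
	suspend_set_ops(&demo_suspend_ops);
	return 0;
}
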
diff --git a/kernel/printk.c b/kernel/printk.c
index 35185392173f..37dff3429adb 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -782,7 +782,7 @@ static inline int can_use_console(unsigned int cpu)
782static int console_trylock_for_printk(unsigned int cpu) 782static int console_trylock_for_printk(unsigned int cpu)
783 __releases(&logbuf_lock) 783 __releases(&logbuf_lock)
784{ 784{
785 int retval = 0; 785 int retval = 0, wake = 0;
786 786
787 if (console_trylock()) { 787 if (console_trylock()) {
788 retval = 1; 788 retval = 1;
@@ -795,12 +795,14 @@ static int console_trylock_for_printk(unsigned int cpu)
795 */ 795 */
796 if (!can_use_console(cpu)) { 796 if (!can_use_console(cpu)) {
797 console_locked = 0; 797 console_locked = 0;
798 up(&console_sem); 798 wake = 1;
799 retval = 0; 799 retval = 0;
800 } 800 }
801 } 801 }
802 printk_cpu = UINT_MAX; 802 printk_cpu = UINT_MAX;
803 spin_unlock(&logbuf_lock); 803 spin_unlock(&logbuf_lock);
804 if (wake)
805 up(&console_sem);
804 return retval; 806 return retval;
805} 807}
806static const char recursion_bug_msg [] = 808static const char recursion_bug_msg [] =
@@ -1242,7 +1244,7 @@ void console_unlock(void)
1242{ 1244{
1243 unsigned long flags; 1245 unsigned long flags;
1244 unsigned _con_start, _log_end; 1246 unsigned _con_start, _log_end;
1245 unsigned wake_klogd = 0; 1247 unsigned wake_klogd = 0, retry = 0;
1246 1248
1247 if (console_suspended) { 1249 if (console_suspended) {
1248 up(&console_sem); 1250 up(&console_sem);
@@ -1251,6 +1253,7 @@ void console_unlock(void)
1251 1253
1252 console_may_schedule = 0; 1254 console_may_schedule = 0;
1253 1255
1256again:
1254 for ( ; ; ) { 1257 for ( ; ; ) {
1255 spin_lock_irqsave(&logbuf_lock, flags); 1258 spin_lock_irqsave(&logbuf_lock, flags);
1256 wake_klogd |= log_start - log_end; 1259 wake_klogd |= log_start - log_end;
@@ -1271,8 +1274,23 @@ void console_unlock(void)
1271 if (unlikely(exclusive_console)) 1274 if (unlikely(exclusive_console))
1272 exclusive_console = NULL; 1275 exclusive_console = NULL;
1273 1276
1277 spin_unlock(&logbuf_lock);
1278
1274 up(&console_sem); 1279 up(&console_sem);
1280
1281 /*
1282 * Someone could have filled up the buffer again, so re-check if there's
1283 * something to flush. In case we cannot trylock the console_sem again,
1284 * there's a new owner and that owner's console_unlock() will do the
1285 * flush, no worries.
1286 */
1287 spin_lock(&logbuf_lock);
1288 if (con_start != log_end)
1289 retry = 1;
1275 spin_unlock_irqrestore(&logbuf_lock, flags); 1290 spin_unlock_irqrestore(&logbuf_lock, flags);
1291 if (retry && console_trylock())
1292 goto again;
1293
1276 if (wake_klogd) 1294 if (wake_klogd)
1277 wake_up_klogd(); 1295 wake_up_klogd();
1278} 1296}
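
The printk changes defer the console_sem release until after logbuf_lock is dropped, and teach console_unlock() to re-check the log buffer after releasing the semaphore, retrying only if it can immediately re-acquire ownership; otherwise the new owner is responsible for flushing. A simplified, stand-alone sketch of that release/recheck/retry idiom (not the printk code itself):

#include <linux/semaphore.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_buf_lock);
static DEFINE_SEMAPHORE(demo_sem);
static unsigned demo_head, demo_tail;		/* ring buffer indices */

static void demo_flush_locked(void)
{
	demo_tail = demo_head;			/* consume everything */
}

static void demo_unlock_and_flush(void)
{
	int retry;
again:
	spin_lock(&demo_buf_lock);
	demo_flush_locked();
	spin_unlock(&demo_buf_lock);

	up(&demo_sem);				/* give up ownership */

	spin_lock(&demo_buf_lock);
	retry = (demo_tail != demo_head);	/* did more work arrive? */
	spin_unlock(&demo_buf_lock);

	if (retry && down_trylock(&demo_sem) == 0)
		goto again;			/* we own it again: flush */
}
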
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2df115790cd9..9de3ecfd20f9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -23,8 +23,15 @@
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h> 25#include <linux/hw_breakpoint.h>
26#include <linux/cn_proc.h>
26 27
27 28
29static int ptrace_trapping_sleep_fn(void *flags)
30{
31 schedule();
32 return 0;
33}
34
28/* 35/*
29 * ptrace a task: make the debugger its new parent and 36 * ptrace a task: make the debugger its new parent and
30 * move it to the ptrace list. 37 * move it to the ptrace list.
@@ -77,13 +84,20 @@ void __ptrace_unlink(struct task_struct *child)
77 spin_lock(&child->sighand->siglock); 84 spin_lock(&child->sighand->siglock);
78 85
79 /* 86 /*
80 * Reinstate GROUP_STOP_PENDING if group stop is in effect and 87 * Clear all pending traps and TRAPPING. TRAPPING should be
88 * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly.
89 */
90 task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
91 task_clear_jobctl_trapping(child);
92
93 /*
94 * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
81 * @child isn't dead. 95 * @child isn't dead.
82 */ 96 */
83 if (!(child->flags & PF_EXITING) && 97 if (!(child->flags & PF_EXITING) &&
84 (child->signal->flags & SIGNAL_STOP_STOPPED || 98 (child->signal->flags & SIGNAL_STOP_STOPPED ||
85 child->signal->group_stop_count)) 99 child->signal->group_stop_count))
86 child->group_stop |= GROUP_STOP_PENDING; 100 child->jobctl |= JOBCTL_STOP_PENDING;
87 101
88 /* 102 /*
89 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick 103 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
@@ -91,16 +105,30 @@ void __ptrace_unlink(struct task_struct *child)
91 * is in TASK_TRACED; otherwise, we might unduly disrupt 105 * is in TASK_TRACED; otherwise, we might unduly disrupt
92 * TASK_KILLABLE sleeps. 106 * TASK_KILLABLE sleeps.
93 */ 107 */
94 if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) 108 if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
95 signal_wake_up(child, task_is_traced(child)); 109 signal_wake_up(child, task_is_traced(child));
96 110
97 spin_unlock(&child->sighand->siglock); 111 spin_unlock(&child->sighand->siglock);
98} 112}
99 113
100/* 114/**
101 * Check that we have indeed attached to the thing.. 115 * ptrace_check_attach - check whether ptracee is ready for ptrace operation
116 * @child: ptracee to check for
117 * @ignore_state: don't check whether @child is currently %TASK_TRACED
118 *
119 * Check whether @child is being ptraced by %current and ready for further
120 * ptrace operations. If @ignore_state is %false, @child also should be in
121 * %TASK_TRACED state and on return the child is guaranteed to be traced
122 * and not executing. If @ignore_state is %true, @child can be in any
123 * state.
124 *
125 * CONTEXT:
126 * Grabs and releases tasklist_lock and @child->sighand->siglock.
127 *
128 * RETURNS:
129 * 0 on success, -ESRCH if %child is not ready.
102 */ 130 */
103int ptrace_check_attach(struct task_struct *child, int kill) 131int ptrace_check_attach(struct task_struct *child, bool ignore_state)
104{ 132{
105 int ret = -ESRCH; 133 int ret = -ESRCH;
106 134
@@ -119,13 +147,14 @@ int ptrace_check_attach(struct task_struct *child, int kill)
119 */ 147 */
120 spin_lock_irq(&child->sighand->siglock); 148 spin_lock_irq(&child->sighand->siglock);
121 WARN_ON_ONCE(task_is_stopped(child)); 149 WARN_ON_ONCE(task_is_stopped(child));
122 if (task_is_traced(child) || kill) 150 if (ignore_state || (task_is_traced(child) &&
151 !(child->jobctl & JOBCTL_LISTENING)))
123 ret = 0; 152 ret = 0;
124 spin_unlock_irq(&child->sighand->siglock); 153 spin_unlock_irq(&child->sighand->siglock);
125 } 154 }
126 read_unlock(&tasklist_lock); 155 read_unlock(&tasklist_lock);
127 156
128 if (!ret && !kill) 157 if (!ret && !ignore_state)
129 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; 158 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
130 159
131 /* All systems go.. */ 160 /* All systems go.. */
@@ -182,11 +211,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
182 return !err; 211 return !err;
183} 212}
184 213
185static int ptrace_attach(struct task_struct *task) 214static int ptrace_attach(struct task_struct *task, long request,
215 unsigned long flags)
186{ 216{
187 bool wait_trap = false; 217 bool seize = (request == PTRACE_SEIZE);
188 int retval; 218 int retval;
189 219
220 /*
221 * SEIZE will enable new ptrace behaviors which will be implemented
222 * gradually. SEIZE_DEVEL is used to prevent applications
223 * expecting full SEIZE behaviors trapping on kernel commits which
224 * are still in the process of implementing them.
225 *
226 * Only test programs for new ptrace behaviors being implemented
227 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
228 *
229 * Once SEIZE behaviors are completely implemented, this flag and
230 * the following test will be removed.
231 */
232 retval = -EIO;
233 if (seize && !(flags & PTRACE_SEIZE_DEVEL))
234 goto out;
235
190 audit_ptrace(task); 236 audit_ptrace(task);
191 237
192 retval = -EPERM; 238 retval = -EPERM;
@@ -218,16 +264,21 @@ static int ptrace_attach(struct task_struct *task)
218 goto unlock_tasklist; 264 goto unlock_tasklist;
219 265
220 task->ptrace = PT_PTRACED; 266 task->ptrace = PT_PTRACED;
267 if (seize)
268 task->ptrace |= PT_SEIZED;
221 if (task_ns_capable(task, CAP_SYS_PTRACE)) 269 if (task_ns_capable(task, CAP_SYS_PTRACE))
222 task->ptrace |= PT_PTRACE_CAP; 270 task->ptrace |= PT_PTRACE_CAP;
223 271
224 __ptrace_link(task, current); 272 __ptrace_link(task, current);
225 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 273
274 /* SEIZE doesn't trap tracee on attach */
275 if (!seize)
276 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
226 277
227 spin_lock(&task->sighand->siglock); 278 spin_lock(&task->sighand->siglock);
228 279
229 /* 280 /*
230 * If the task is already STOPPED, set GROUP_STOP_PENDING and 281 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
231 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING 282 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
232 * will be cleared if the child completes the transition or any 283 * will be cleared if the child completes the transition or any
233 * event which clears the group stop states happens. We'll wait 284 * event which clears the group stop states happens. We'll wait
@@ -243,11 +294,9 @@ static int ptrace_attach(struct task_struct *task)
243 * The following task_is_stopped() test is safe as both transitions 294 * The following task_is_stopped() test is safe as both transitions
244 * in and out of STOPPED are protected by siglock. 295 * in and out of STOPPED are protected by siglock.
245 */ 296 */
246 if (task_is_stopped(task)) { 297 if (task_is_stopped(task) &&
247 task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; 298 task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
248 signal_wake_up(task, 1); 299 signal_wake_up(task, 1);
249 wait_trap = true;
250 }
251 300
252 spin_unlock(&task->sighand->siglock); 301 spin_unlock(&task->sighand->siglock);
253 302
@@ -257,9 +306,12 @@ unlock_tasklist:
257unlock_creds: 306unlock_creds:
258 mutex_unlock(&task->signal->cred_guard_mutex); 307 mutex_unlock(&task->signal->cred_guard_mutex);
259out: 308out:
260 if (wait_trap) 309 if (!retval) {
261 wait_event(current->signal->wait_chldexit, 310 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
262 !(task->group_stop & GROUP_STOP_TRAPPING)); 311 ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
312 proc_ptrace_connector(task, PTRACE_ATTACH);
313 }
314
263 return retval; 315 return retval;
264} 316}
265 317
@@ -322,25 +374,27 @@ static int ignoring_children(struct sighand_struct *sigh)
322 */ 374 */
323static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) 375static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
324{ 376{
377 bool dead;
378
325 __ptrace_unlink(p); 379 __ptrace_unlink(p);
326 380
327 if (p->exit_state == EXIT_ZOMBIE) { 381 if (p->exit_state != EXIT_ZOMBIE)
328 if (!task_detached(p) && thread_group_empty(p)) { 382 return false;
329 if (!same_thread_group(p->real_parent, tracer)) 383
330 do_notify_parent(p, p->exit_signal); 384 dead = !thread_group_leader(p);
331 else if (ignoring_children(tracer->sighand)) { 385
332 __wake_up_parent(p, tracer); 386 if (!dead && thread_group_empty(p)) {
333 p->exit_signal = -1; 387 if (!same_thread_group(p->real_parent, tracer))
334 } 388 dead = do_notify_parent(p, p->exit_signal);
335 } 389 else if (ignoring_children(tracer->sighand)) {
336 if (task_detached(p)) { 390 __wake_up_parent(p, tracer);
337 /* Mark it as in the process of being reaped. */ 391 dead = true;
338 p->exit_state = EXIT_DEAD;
339 return true;
340 } 392 }
341 } 393 }
342 394 /* Mark it as in the process of being reaped. */
343 return false; 395 if (dead)
396 p->exit_state = EXIT_DEAD;
397 return dead;
344} 398}
345 399
346static int ptrace_detach(struct task_struct *child, unsigned int data) 400static int ptrace_detach(struct task_struct *child, unsigned int data)
@@ -365,6 +419,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
365 } 419 }
366 write_unlock_irq(&tasklist_lock); 420 write_unlock_irq(&tasklist_lock);
367 421
422 proc_ptrace_connector(child, PTRACE_DETACH);
368 if (unlikely(dead)) 423 if (unlikely(dead))
369 release_task(child); 424 release_task(child);
370 425
@@ -611,10 +666,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
611int ptrace_request(struct task_struct *child, long request, 666int ptrace_request(struct task_struct *child, long request,
612 unsigned long addr, unsigned long data) 667 unsigned long addr, unsigned long data)
613{ 668{
669 bool seized = child->ptrace & PT_SEIZED;
614 int ret = -EIO; 670 int ret = -EIO;
615 siginfo_t siginfo; 671 siginfo_t siginfo, *si;
616 void __user *datavp = (void __user *) data; 672 void __user *datavp = (void __user *) data;
617 unsigned long __user *datalp = datavp; 673 unsigned long __user *datalp = datavp;
674 unsigned long flags;
618 675
619 switch (request) { 676 switch (request) {
620 case PTRACE_PEEKTEXT: 677 case PTRACE_PEEKTEXT:
@@ -647,6 +704,62 @@ int ptrace_request(struct task_struct *child, long request,
647 ret = ptrace_setsiginfo(child, &siginfo); 704 ret = ptrace_setsiginfo(child, &siginfo);
648 break; 705 break;
649 706
707 case PTRACE_INTERRUPT:
708 /*
709 * Stop tracee without any side-effect on signal or job
710 * control. At least one trap is guaranteed to happen
711 * after this request. If @child is already trapped, the
712 * current trap is not disturbed and another trap will
713 * happen after the current trap is ended with PTRACE_CONT.
714 *
715 * The actual trap might not be PTRACE_EVENT_STOP trap but
716 * the pending condition is cleared regardless.
717 */
718 if (unlikely(!seized || !lock_task_sighand(child, &flags)))
719 break;
720
721 /*
722 * INTERRUPT doesn't disturb existing trap sans one
723 * exception. If ptracer issued LISTEN for the current
724 * STOP, this INTERRUPT should clear LISTEN and re-trap
725 * tracee into STOP.
726 */
727 if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
728 signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
729
730 unlock_task_sighand(child, &flags);
731 ret = 0;
732 break;
733
734 case PTRACE_LISTEN:
735 /*
736 * Listen for events. Tracee must be in STOP. It's not
737 * resumed per se but is not considered to be in TRACED by
738 * wait(2) or ptrace(2). If an async event (e.g. group
739 * stop state change) happens, tracee will enter STOP trap
740 * again. Alternatively, ptracer can issue INTERRUPT to
741 * finish listening and re-trap tracee into STOP.
742 */
743 if (unlikely(!seized || !lock_task_sighand(child, &flags)))
744 break;
745
746 si = child->last_siginfo;
747 if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP))
748 break;
749
750 child->jobctl |= JOBCTL_LISTENING;
751
752 /*
753 * If NOTIFY is set, it means event happened between start
754 * of this trap and now. Trigger re-trap immediately.
755 */
756 if (child->jobctl & JOBCTL_TRAP_NOTIFY)
757 signal_wake_up(child, true);
758
759 unlock_task_sighand(child, &flags);
760 ret = 0;
761 break;
762
650 case PTRACE_DETACH: /* detach a process that was attached. */ 763 case PTRACE_DETACH: /* detach a process that was attached. */
651 ret = ptrace_detach(child, data); 764 ret = ptrace_detach(child, data);
652 break; 765 break;
@@ -761,8 +874,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
761 goto out; 874 goto out;
762 } 875 }
763 876
764 if (request == PTRACE_ATTACH) { 877 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
765 ret = ptrace_attach(child); 878 ret = ptrace_attach(child, request, data);
766 /* 879 /*
767 * Some architectures need to do book-keeping after 880 * Some architectures need to do book-keeping after
768 * a ptrace attach. 881 * a ptrace attach.
@@ -772,7 +885,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
772 goto out_put_task_struct; 885 goto out_put_task_struct;
773 } 886 }
774 887
775 ret = ptrace_check_attach(child, request == PTRACE_KILL); 888 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
889 request == PTRACE_INTERRUPT);
776 if (ret < 0) 890 if (ret < 0)
777 goto out_put_task_struct; 891 goto out_put_task_struct;
778 892
@@ -903,8 +1017,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
903 goto out; 1017 goto out;
904 } 1018 }
905 1019
906 if (request == PTRACE_ATTACH) { 1020 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
907 ret = ptrace_attach(child); 1021 ret = ptrace_attach(child, request, data);
908 /* 1022 /*
909 * Some architectures need to do book-keeping after 1023 * Some architectures need to do book-keeping after
910 * a ptrace attach. 1024 * a ptrace attach.
@@ -914,7 +1028,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
914 goto out_put_task_struct; 1028 goto out_put_task_struct;
915 } 1029 }
916 1030
917 ret = ptrace_check_attach(child, request == PTRACE_KILL); 1031 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
1032 request == PTRACE_INTERRUPT);
918 if (!ret) 1033 if (!ret)
919 ret = compat_arch_ptrace(child, request, addr, data); 1034 ret = compat_arch_ptrace(child, request, addr, data);
920 1035
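
The ptrace rework introduces PTRACE_SEIZE (attach without trapping the tracee), PTRACE_INTERRUPT (trap it on demand) and PTRACE_LISTEN (wait for async events in a stopped tracee without resuming it), all keyed off the new JOBCTL_* bits. A hedged userspace sketch follows; the request numbers are the ones used by this series and are defined locally only in case installed headers predate it, and the whole interface is still gated behind the experimental PTRACE_SEIZE_DEVEL flag.

#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef PTRACE_SEIZE
#define PTRACE_SEIZE		0x4206	/* values used by this series */
#define PTRACE_INTERRUPT	0x4207
#define PTRACE_LISTEN		0x4208
#define PTRACE_SEIZE_DEVEL	0x80000000
#endif

int main(int argc, char **argv)
{
	pid_t pid;
	int status;

	if (argc != 2)
		return 1;
	pid = atoi(argv[1]);

	/* attach without stopping the tracee (unlike PTRACE_ATTACH) */
	if (ptrace(PTRACE_SEIZE, pid, 0, PTRACE_SEIZE_DEVEL) < 0) {
		perror("PTRACE_SEIZE");
		return 1;
	}

	/* request a trap; the tracee stops in PTRACE_EVENT_STOP */
	ptrace(PTRACE_INTERRUPT, pid, 0, 0);
	waitpid(pid, &status, 0);
	printf("tracee stopped, wait status 0x%x\n", status);

	/* watch for async events without resuming the stopped tracee */
	ptrace(PTRACE_LISTEN, pid, 0, 0);

	/* later: finish listening by re-trapping it, then detach */
	ptrace(PTRACE_INTERRUPT, pid, 0, 0);
	waitpid(pid, &status, 0);
	ptrace(PTRACE_DETACH, pid, 0, 0);
	return 0;
}
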
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7e59ffb3d0ba..ba06207b1dd3 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,9 +84,32 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
84 84
85static struct rcu_state *rcu_state; 85static struct rcu_state *rcu_state;
86 86
87/*
88 * The rcu_scheduler_active variable transitions from zero to one just
89 * before the first task is spawned. So when this variable is zero, RCU
90 * can assume that there is but one task, allowing RCU to (for example)
91 * optimized synchronize_sched() to a simple barrier(). When this variable
92 * is one, RCU must actually do all the hard work required to detect real
93 * grace periods. This variable is also used to suppress boot-time false
94 * positives from lockdep-RCU error checking.
95 */
87int rcu_scheduler_active __read_mostly; 96int rcu_scheduler_active __read_mostly;
88EXPORT_SYMBOL_GPL(rcu_scheduler_active); 97EXPORT_SYMBOL_GPL(rcu_scheduler_active);
89 98
99/*
100 * The rcu_scheduler_fully_active variable transitions from zero to one
101 * during the early_initcall() processing, which is after the scheduler
102 * is capable of creating new tasks. So RCU processing (for example,
103 * creating tasks for RCU priority boosting) must be delayed until after
104 * rcu_scheduler_fully_active transitions from zero to one. We also
105 * currently delay invocation of any RCU callbacks until after this point.
106 *
107 * It might later prove better for people registering RCU callbacks during
108 * early boot to take responsibility for these callbacks, but one step at
109 * a time.
110 */
111static int rcu_scheduler_fully_active __read_mostly;
112
90#ifdef CONFIG_RCU_BOOST 113#ifdef CONFIG_RCU_BOOST
91 114
92/* 115/*
@@ -98,7 +121,6 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
98DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); 121DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
99DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); 122DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
100DEFINE_PER_CPU(char, rcu_cpu_has_work); 123DEFINE_PER_CPU(char, rcu_cpu_has_work);
101static char rcu_kthreads_spawnable;
102 124
103#endif /* #ifdef CONFIG_RCU_BOOST */ 125#endif /* #ifdef CONFIG_RCU_BOOST */
104 126
@@ -1467,6 +1489,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1467 */ 1489 */
1468static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1469{ 1491{
1492 if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
1493 return;
1470 if (likely(!rsp->boost)) { 1494 if (likely(!rsp->boost)) {
1471 rcu_do_batch(rsp, rdp); 1495 rcu_do_batch(rsp, rdp);
1472 return; 1496 return;
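
rcu_scheduler_fully_active replaces rcu_kthreads_spawnable and now also gates callback invocation: until an early_initcall() flips it, RCU neither spawns kthreads nor runs callbacks, avoiding boot-time use of a scheduler that cannot create tasks yet. A generic sketch of that gating idiom with placeholder names:

#include <linux/cache.h>
#include <linux/compiler.h>
#include <linux/init.h>

static int demo_fully_active __read_mostly;

static int __init demo_mark_fully_active(void)
{
	demo_fully_active = 1;	/* safe to spawn kthreads from now on */
	return 0;
}
early_initcall(demo_mark_fully_active);

static void demo_process_work(void)
{
	/* too early: defer the work rather than touching the scheduler */
	if (unlikely(!ACCESS_ONCE(demo_fully_active)))
		return;

	/* ... invoke callbacks / wake per-CPU kthreads ... */
}
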
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 14dc7dd00902..8aafbb80b8b0 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -68,6 +68,7 @@ struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state; 69static struct rcu_state *rcu_state = &rcu_preempt_state;
70 70
71static void rcu_read_unlock_special(struct task_struct *t);
71static int rcu_preempted_readers_exp(struct rcu_node *rnp); 72static int rcu_preempted_readers_exp(struct rcu_node *rnp);
72 73
73/* 74/*
@@ -147,7 +148,7 @@ static void rcu_preempt_note_context_switch(int cpu)
147 struct rcu_data *rdp; 148 struct rcu_data *rdp;
148 struct rcu_node *rnp; 149 struct rcu_node *rnp;
149 150
150 if (t->rcu_read_lock_nesting && 151 if (t->rcu_read_lock_nesting > 0 &&
151 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 152 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
152 153
153 /* Possibly blocking in an RCU read-side critical section. */ 154 /* Possibly blocking in an RCU read-side critical section. */
@@ -190,6 +191,14 @@ static void rcu_preempt_note_context_switch(int cpu)
190 rnp->gp_tasks = &t->rcu_node_entry; 191 rnp->gp_tasks = &t->rcu_node_entry;
191 } 192 }
192 raw_spin_unlock_irqrestore(&rnp->lock, flags); 193 raw_spin_unlock_irqrestore(&rnp->lock, flags);
194 } else if (t->rcu_read_lock_nesting < 0 &&
195 t->rcu_read_unlock_special) {
196
197 /*
198 * Complete exit from RCU read-side critical section on
199 * behalf of preempted instance of __rcu_read_unlock().
200 */
201 rcu_read_unlock_special(t);
193 } 202 }
194 203
195 /* 204 /*
@@ -284,7 +293,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
284 * notify RCU core processing or task having blocked during the RCU 293 * notify RCU core processing or task having blocked during the RCU
285 * read-side critical section. 294 * read-side critical section.
286 */ 295 */
287static void rcu_read_unlock_special(struct task_struct *t) 296static noinline void rcu_read_unlock_special(struct task_struct *t)
288{ 297{
289 int empty; 298 int empty;
290 int empty_exp; 299 int empty_exp;
@@ -309,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
309 } 318 }
310 319
311 /* Hardware IRQ handlers cannot block. */ 320 /* Hardware IRQ handlers cannot block. */
312 if (in_irq()) { 321 if (in_irq() || in_serving_softirq()) {
313 local_irq_restore(flags); 322 local_irq_restore(flags);
314 return; 323 return;
315 } 324 }
@@ -342,6 +351,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
342#ifdef CONFIG_RCU_BOOST 351#ifdef CONFIG_RCU_BOOST
343 if (&t->rcu_node_entry == rnp->boost_tasks) 352 if (&t->rcu_node_entry == rnp->boost_tasks)
344 rnp->boost_tasks = np; 353 rnp->boost_tasks = np;
354 /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
355 if (t->rcu_boosted) {
356 special |= RCU_READ_UNLOCK_BOOSTED;
357 t->rcu_boosted = 0;
358 }
345#endif /* #ifdef CONFIG_RCU_BOOST */ 359#endif /* #ifdef CONFIG_RCU_BOOST */
346 t->rcu_blocked_node = NULL; 360 t->rcu_blocked_node = NULL;
347 361
@@ -358,7 +372,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
358#ifdef CONFIG_RCU_BOOST 372#ifdef CONFIG_RCU_BOOST
359 /* Unboost if we were boosted. */ 373 /* Unboost if we were boosted. */
360 if (special & RCU_READ_UNLOCK_BOOSTED) { 374 if (special & RCU_READ_UNLOCK_BOOSTED) {
361 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
362 rt_mutex_unlock(t->rcu_boost_mutex); 375 rt_mutex_unlock(t->rcu_boost_mutex);
363 t->rcu_boost_mutex = NULL; 376 t->rcu_boost_mutex = NULL;
364 } 377 }
@@ -387,13 +400,22 @@ void __rcu_read_unlock(void)
387 struct task_struct *t = current; 400 struct task_struct *t = current;
388 401
389 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ 402 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
390 --t->rcu_read_lock_nesting; 403 if (t->rcu_read_lock_nesting != 1)
391 barrier(); /* decrement before load of ->rcu_read_unlock_special */ 404 --t->rcu_read_lock_nesting;
392 if (t->rcu_read_lock_nesting == 0 && 405 else {
393 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 406 t->rcu_read_lock_nesting = INT_MIN;
394 rcu_read_unlock_special(t); 407 barrier(); /* assign before ->rcu_read_unlock_special load */
408 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
409 rcu_read_unlock_special(t);
410 barrier(); /* ->rcu_read_unlock_special load before assign */
411 t->rcu_read_lock_nesting = 0;
412 }
395#ifdef CONFIG_PROVE_LOCKING 413#ifdef CONFIG_PROVE_LOCKING
396 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); 414 {
415 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
416
417 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
418 }
397#endif /* #ifdef CONFIG_PROVE_LOCKING */ 419#endif /* #ifdef CONFIG_PROVE_LOCKING */
398} 420}
399EXPORT_SYMBOL_GPL(__rcu_read_unlock); 421EXPORT_SYMBOL_GPL(__rcu_read_unlock);
@@ -589,7 +611,8 @@ static void rcu_preempt_check_callbacks(int cpu)
589 rcu_preempt_qs(cpu); 611 rcu_preempt_qs(cpu);
590 return; 612 return;
591 } 613 }
592 if (per_cpu(rcu_preempt_data, cpu).qs_pending) 614 if (t->rcu_read_lock_nesting > 0 &&
615 per_cpu(rcu_preempt_data, cpu).qs_pending)
593 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 616 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
594} 617}
595 618
@@ -695,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
695 718
696 raw_spin_lock_irqsave(&rnp->lock, flags); 719 raw_spin_lock_irqsave(&rnp->lock, flags);
697 for (;;) { 720 for (;;) {
698 if (!sync_rcu_preempt_exp_done(rnp)) 721 if (!sync_rcu_preempt_exp_done(rnp)) {
722 raw_spin_unlock_irqrestore(&rnp->lock, flags);
699 break; 723 break;
724 }
700 if (rnp->parent == NULL) { 725 if (rnp->parent == NULL) {
726 raw_spin_unlock_irqrestore(&rnp->lock, flags);
701 wake_up(&sync_rcu_preempt_exp_wq); 727 wake_up(&sync_rcu_preempt_exp_wq);
702 break; 728 break;
703 } 729 }
@@ -707,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
707 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 733 raw_spin_lock(&rnp->lock); /* irqs already disabled */
708 rnp->expmask &= ~mask; 734 rnp->expmask &= ~mask;
709 } 735 }
710 raw_spin_unlock_irqrestore(&rnp->lock, flags);
711} 736}
712 737
713/* 738/*
@@ -1174,7 +1199,7 @@ static int rcu_boost(struct rcu_node *rnp)
1174 t = container_of(tb, struct task_struct, rcu_node_entry); 1199 t = container_of(tb, struct task_struct, rcu_node_entry);
1175 rt_mutex_init_proxy_locked(&mtx, t); 1200 rt_mutex_init_proxy_locked(&mtx, t);
1176 t->rcu_boost_mutex = &mtx; 1201 t->rcu_boost_mutex = &mtx;
1177 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; 1202 t->rcu_boosted = 1;
1178 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1203 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1179 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1204 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1180 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1205 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
@@ -1532,7 +1557,7 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1532 struct sched_param sp; 1557 struct sched_param sp;
1533 struct task_struct *t; 1558 struct task_struct *t;
1534 1559
1535 if (!rcu_kthreads_spawnable || 1560 if (!rcu_scheduler_fully_active ||
1536 per_cpu(rcu_cpu_kthread_task, cpu) != NULL) 1561 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1537 return 0; 1562 return 0;
1538 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); 1563 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
@@ -1639,7 +1664,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1639 struct sched_param sp; 1664 struct sched_param sp;
1640 struct task_struct *t; 1665 struct task_struct *t;
1641 1666
1642 if (!rcu_kthreads_spawnable || 1667 if (!rcu_scheduler_fully_active ||
1643 rnp->qsmaskinit == 0) 1668 rnp->qsmaskinit == 0)
1644 return 0; 1669 return 0;
1645 if (rnp->node_kthread_task == NULL) { 1670 if (rnp->node_kthread_task == NULL) {
@@ -1665,7 +1690,7 @@ static int __init rcu_spawn_kthreads(void)
1665 int cpu; 1690 int cpu;
1666 struct rcu_node *rnp; 1691 struct rcu_node *rnp;
1667 1692
1668 rcu_kthreads_spawnable = 1; 1693 rcu_scheduler_fully_active = 1;
1669 for_each_possible_cpu(cpu) { 1694 for_each_possible_cpu(cpu) {
1670 per_cpu(rcu_cpu_has_work, cpu) = 0; 1695 per_cpu(rcu_cpu_has_work, cpu) = 0;
1671 if (cpu_online(cpu)) 1696 if (cpu_online(cpu))
@@ -1687,7 +1712,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1687 struct rcu_node *rnp = rdp->mynode; 1712 struct rcu_node *rnp = rdp->mynode;
1688 1713
1689 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1714 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1690 if (rcu_kthreads_spawnable) { 1715 if (rcu_scheduler_fully_active) {
1691 (void)rcu_spawn_one_cpu_kthread(cpu); 1716 (void)rcu_spawn_one_cpu_kthread(cpu);
1692 if (rnp->node_kthread_task == NULL) 1717 if (rnp->node_kthread_task == NULL)
1693 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1718 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
@@ -1726,6 +1751,13 @@ static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1726{ 1751{
1727} 1752}
1728 1753
1754static int __init rcu_scheduler_really_started(void)
1755{
1756 rcu_scheduler_fully_active = 1;
1757 return 0;
1758}
1759early_initcall(rcu_scheduler_really_started);
1760
1729static void __cpuinit rcu_prepare_kthreads(int cpu) 1761static void __cpuinit rcu_prepare_kthreads(int cpu)
1730{ 1762{
1731} 1763}
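
The __rcu_read_unlock() change parks ->rcu_read_lock_nesting at INT_MIN while the outermost unlock runs its slow path, so a preempting context switch (see the rcu_preempt_note_context_switch() hunk) can recognize an in-progress unlock by the negative count and finish the special processing on the task's behalf. A simplified restatement of that sentinel logic; the slow-path call is stubbed out here.

#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/sched.h>

/* stand-in for the static rcu_read_unlock_special() in the hunk above */
static void demo_unlock_special(struct task_struct *t)
{
	/* dequeue from the blocked list, unboost, report the QS, ... */
}

void demo_rcu_read_unlock(void)
{
	struct task_struct *t = current;

	if (t->rcu_read_lock_nesting != 1) {
		--t->rcu_read_lock_nesting;	/* nested: just count down */
	} else {
		t->rcu_read_lock_nesting = INT_MIN;	/* "unlock in progress" */
		barrier();	/* sentinel store before the special-flags load */
		if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
			demo_unlock_special(t);
		barrier();	/* special work done before the nesting reset */
		t->rcu_read_lock_nesting = 0;
	}
}
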
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index ab449117aaf2..255e1662acdb 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
890{ 890{
891 lock->owner = NULL; 891 lock->owner = NULL;
892 raw_spin_lock_init(&lock->wait_lock); 892 raw_spin_lock_init(&lock->wait_lock);
893 plist_head_init_raw(&lock->wait_list, &lock->wait_lock); 893 plist_head_init(&lock->wait_list);
894 894
895 debug_rt_mutex_init(lock, name); 895 debug_rt_mutex_init(lock, name);
896} 896}
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index cae050b05f5e..176e5e56ffab 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -117,15 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
117 117
118EXPORT_SYMBOL(down_read_nested); 118EXPORT_SYMBOL(down_read_nested);
119 119
120void down_read_non_owner(struct rw_semaphore *sem)
121{
122 might_sleep();
123
124 __down_read(sem);
125}
126
127EXPORT_SYMBOL(down_read_non_owner);
128
129void down_write_nested(struct rw_semaphore *sem, int subclass) 120void down_write_nested(struct rw_semaphore *sem, int subclass)
130{ 121{
131 might_sleep(); 122 might_sleep();
@@ -136,13 +127,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
136 127
137EXPORT_SYMBOL(down_write_nested); 128EXPORT_SYMBOL(down_write_nested);
138 129
139void up_read_non_owner(struct rw_semaphore *sem)
140{
141 __up_read(sem);
142}
143
144EXPORT_SYMBOL(up_read_non_owner);
145
146#endif 130#endif
147 131
148 132
diff --git a/kernel/sched.c b/kernel/sched.c
index ad8ab90bb301..ccacdbdecf45 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
75#include <asm/tlb.h> 75#include <asm/tlb.h>
76#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
77#include <asm/mutex.h> 77#include <asm/mutex.h>
78#ifdef CONFIG_PARAVIRT
79#include <asm/paravirt.h>
80#endif
78 81
79#include "sched_cpupri.h" 82#include "sched_cpupri.h"
80#include "workqueue_sched.h" 83#include "workqueue_sched.h"
@@ -124,7 +127,7 @@
124 127
125static inline int rt_policy(int policy) 128static inline int rt_policy(int policy)
126{ 129{
127 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 130 if (policy == SCHED_FIFO || policy == SCHED_RR)
128 return 1; 131 return 1;
129 return 0; 132 return 0;
130} 133}
@@ -422,6 +425,7 @@ struct rt_rq {
422 */ 425 */
423struct root_domain { 426struct root_domain {
424 atomic_t refcount; 427 atomic_t refcount;
428 atomic_t rto_count;
425 struct rcu_head rcu; 429 struct rcu_head rcu;
426 cpumask_var_t span; 430 cpumask_var_t span;
427 cpumask_var_t online; 431 cpumask_var_t online;
@@ -431,7 +435,6 @@ struct root_domain {
431 * one runnable RT task. 435 * one runnable RT task.
432 */ 436 */
433 cpumask_var_t rto_mask; 437 cpumask_var_t rto_mask;
434 atomic_t rto_count;
435 struct cpupri cpupri; 438 struct cpupri cpupri;
436}; 439};
437 440
@@ -528,6 +531,12 @@ struct rq {
528#ifdef CONFIG_IRQ_TIME_ACCOUNTING 531#ifdef CONFIG_IRQ_TIME_ACCOUNTING
529 u64 prev_irq_time; 532 u64 prev_irq_time;
530#endif 533#endif
534#ifdef CONFIG_PARAVIRT
535 u64 prev_steal_time;
536#endif
537#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
538 u64 prev_steal_time_rq;
539#endif
531 540
532 /* calc_load related fields */ 541 /* calc_load related fields */
533 unsigned long calc_load_update; 542 unsigned long calc_load_update;
@@ -1567,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1567 return rq->avg_load_per_task; 1576 return rq->avg_load_per_task;
1568} 1577}
1569 1578
1570#ifdef CONFIG_FAIR_GROUP_SCHED
1571
1572/*
1573 * Compute the cpu's hierarchical load factor for each task group.
1574 * This needs to be done in a top-down fashion because the load of a child
1575 * group is a fraction of its parents load.
1576 */
1577static int tg_load_down(struct task_group *tg, void *data)
1578{
1579 unsigned long load;
1580 long cpu = (long)data;
1581
1582 if (!tg->parent) {
1583 load = cpu_rq(cpu)->load.weight;
1584 } else {
1585 load = tg->parent->cfs_rq[cpu]->h_load;
1586 load *= tg->se[cpu]->load.weight;
1587 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1588 }
1589
1590 tg->cfs_rq[cpu]->h_load = load;
1591
1592 return 0;
1593}
1594
1595static void update_h_load(long cpu)
1596{
1597 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1598}
1599
1600#endif
1601
1602#ifdef CONFIG_PREEMPT 1579#ifdef CONFIG_PREEMPT
1603 1580
1604static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1581static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1952,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr)
1952} 1929}
1953EXPORT_SYMBOL_GPL(account_system_vtime); 1930EXPORT_SYMBOL_GPL(account_system_vtime);
1954 1931
1955static void update_rq_clock_task(struct rq *rq, s64 delta) 1932#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1933
1934#ifdef CONFIG_PARAVIRT
1935static inline u64 steal_ticks(u64 steal)
1956{ 1936{
1957 s64 irq_delta; 1937 if (unlikely(steal > NSEC_PER_SEC))
1938 return div_u64(steal, TICK_NSEC);
1939
1940 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
1941}
1942#endif
1958 1943
1944static void update_rq_clock_task(struct rq *rq, s64 delta)
1945{
1946/*
1947 * In theory, the compiler should just see 0 here, and optimize out the call
1948 * to sched_rt_avg_update. But I don't trust it...
1949 */
1950#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1951 s64 steal = 0, irq_delta = 0;
1952#endif
1953#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1959 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 1954 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1960 1955
1961 /* 1956 /*
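steal_ticks() above converts a nanosecond steal delta into whole ticks: deltas below one second go through __iter_div_u64_rem(), which divides by repeated subtraction and is cheap when the quotient is small (and avoids a full 64-bit division on 32-bit hosts), while larger backlogs fall back to div_u64(). A standalone sketch of the same arithmetic, assuming HZ=1000 for TICK_NSEC:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL
#define TICK_NSEC    (NSEC_PER_SEC / 1000)      /* HZ=1000 assumed for this sketch */

/* models __iter_div_u64_rem(): repeated subtraction, fine for small quotients */
static uint64_t iter_div(uint64_t dividend, uint64_t divisor, uint64_t *rem)
{
        uint64_t q = 0;

        while (dividend >= divisor) {
                dividend -= divisor;
                q++;
        }
        *rem = dividend;
        return q;
}

static uint64_t steal_ticks(uint64_t steal_ns)
{
        uint64_t rem;

        if (steal_ns > NSEC_PER_SEC)            /* large backlog: one real division */
                return steal_ns / TICK_NSEC;
        return iter_div(steal_ns, TICK_NSEC, &rem);
}

int main(void)
{
        printf("%llu\n", (unsigned long long)steal_ticks(2500000));             /* 2 */
        printf("%llu\n", (unsigned long long)steal_ticks(3 * NSEC_PER_SEC));    /* 3000 */
        return 0;
}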
@@ -1978,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1978 1973
1979 rq->prev_irq_time += irq_delta; 1974 rq->prev_irq_time += irq_delta;
1980 delta -= irq_delta; 1975 delta -= irq_delta;
1976#endif
1977#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1978 if (static_branch((&paravirt_steal_rq_enabled))) {
1979 u64 st;
1980
1981 steal = paravirt_steal_clock(cpu_of(rq));
1982 steal -= rq->prev_steal_time_rq;
1983
1984 if (unlikely(steal > delta))
1985 steal = delta;
1986
1987 st = steal_ticks(steal);
1988 steal = st * TICK_NSEC;
1989
1990 rq->prev_steal_time_rq += steal;
1991
1992 delta -= steal;
1993 }
1994#endif
1995
1981 rq->clock_task += delta; 1996 rq->clock_task += delta;
1982 1997
1983 if (irq_delta && sched_feat(NONIRQ_POWER)) 1998#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1984 sched_rt_avg_update(rq, irq_delta); 1999 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
2000 sched_rt_avg_update(rq, irq_delta + steal);
2001#endif
1985} 2002}
1986 2003
2004#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1987static int irqtime_account_hi_update(void) 2005static int irqtime_account_hi_update(void)
1988{ 2006{
1989 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2007 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
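With the paravirt hunks applied, update_rq_clock_task() advances rq->clock_task by the raw delta minus hardirq/softirq time and minus whole stolen ticks, clamping each term so the remaining delta never goes negative, and feeds the combined loss into sched_rt_avg_update() for the NONTASK_POWER scaling. The clamp-and-subtract arithmetic on its own (a simplified model, not the kernel function):

#include <stdio.h>
#include <stdint.h>

/* one accounting step for a runqueue's task clock; all values in nanoseconds */
static uint64_t clock_task_step(uint64_t delta, uint64_t irq_delta, uint64_t steal)
{
        if (irq_delta > delta)          /* irq time can run ahead of the sched clock */
                irq_delta = delta;
        delta -= irq_delta;

        if (steal > delta)              /* never let steal push the delta negative */
                steal = delta;
        delta -= steal;

        return delta;                   /* what rq->clock_task actually advances by */
}

int main(void)
{
        /* 10ms wall delta, 1ms spent in hardirq, 4ms stolen by the hypervisor */
        printf("%llu ns\n",
               (unsigned long long)clock_task_step(10000000, 1000000, 4000000));
        return 0;
}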
@@ -2018,12 +2036,7 @@ static int irqtime_account_si_update(void)
2018 2036
2019#define sched_clock_irqtime (0) 2037#define sched_clock_irqtime (0)
2020 2038
2021static void update_rq_clock_task(struct rq *rq, s64 delta) 2039#endif
2022{
2023 rq->clock_task += delta;
2024}
2025
2026#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2027 2040
2028#include "sched_idletask.c" 2041#include "sched_idletask.c"
2029#include "sched_fair.c" 2042#include "sched_fair.c"
@@ -2219,7 +2232,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2219 2232
2220 if (task_cpu(p) != new_cpu) { 2233 if (task_cpu(p) != new_cpu) {
2221 p->se.nr_migrations++; 2234 p->se.nr_migrations++;
2222 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2235 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
2223 } 2236 }
2224 2237
2225 __set_task_cpu(p, new_cpu); 2238 __set_task_cpu(p, new_cpu);
@@ -2496,7 +2509,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2496 if (p->sched_class->task_woken) 2509 if (p->sched_class->task_woken)
2497 p->sched_class->task_woken(rq, p); 2510 p->sched_class->task_woken(rq, p);
2498 2511
2499 if (unlikely(rq->idle_stamp)) { 2512 if (rq->idle_stamp) {
2500 u64 delta = rq->clock - rq->idle_stamp; 2513 u64 delta = rq->clock - rq->idle_stamp;
2501 u64 max = 2*sysctl_sched_migration_cost; 2514 u64 max = 2*sysctl_sched_migration_cost;
2502 2515
@@ -2543,13 +2556,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
2543} 2556}
2544 2557
2545#ifdef CONFIG_SMP 2558#ifdef CONFIG_SMP
2546static void sched_ttwu_pending(void) 2559static void sched_ttwu_do_pending(struct task_struct *list)
2547{ 2560{
2548 struct rq *rq = this_rq(); 2561 struct rq *rq = this_rq();
2549 struct task_struct *list = xchg(&rq->wake_list, NULL);
2550
2551 if (!list)
2552 return;
2553 2562
2554 raw_spin_lock(&rq->lock); 2563 raw_spin_lock(&rq->lock);
2555 2564
@@ -2562,9 +2571,45 @@ static void sched_ttwu_pending(void)
2562 raw_spin_unlock(&rq->lock); 2571 raw_spin_unlock(&rq->lock);
2563} 2572}
2564 2573
2574#ifdef CONFIG_HOTPLUG_CPU
2575
2576static void sched_ttwu_pending(void)
2577{
2578 struct rq *rq = this_rq();
2579 struct task_struct *list = xchg(&rq->wake_list, NULL);
2580
2581 if (!list)
2582 return;
2583
2584 sched_ttwu_do_pending(list);
2585}
2586
2587#endif /* CONFIG_HOTPLUG_CPU */
2588
2565void scheduler_ipi(void) 2589void scheduler_ipi(void)
2566{ 2590{
2567 sched_ttwu_pending(); 2591 struct rq *rq = this_rq();
2592 struct task_struct *list = xchg(&rq->wake_list, NULL);
2593
2594 if (!list)
2595 return;
2596
2597 /*
2598 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2599 * traditionally all their work was done from the interrupt return
2600 * path. Now that we actually do some work, we need to make sure
2601 * we do call them.
2602 *
2603 * Some archs already do call them, luckily irq_enter/exit nest
2604 * properly.
2605 *
2606 * Arguably we should visit all archs and update all handlers,
2607 * however a fair share of IPIs are still resched only so this would
2608 * somewhat pessimize the simple resched case.
2609 */
2610 irq_enter();
2611 sched_ttwu_do_pending(list);
2612 irq_exit();
2568} 2613}
2569 2614
2570static void ttwu_queue_remote(struct task_struct *p, int cpu) 2615static void ttwu_queue_remote(struct task_struct *p, int cpu)
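scheduler_ipi() now claims the whole rq->wake_list with one xchg() and, since there is real work to do, brackets it with irq_enter()/irq_exit() because not every architecture's resched-IPI handler does that on its own. The claim-everything-then-walk shape, modelled with C11 atomics in place of the kernel's xchg (the producer side below is a plain demo, not the kernel's lock-free push):

#include <stdio.h>
#include <stddef.h>
#include <stdatomic.h>

struct task { struct task *next; const char *name; };

static _Atomic(struct task *) wake_list;        /* stand-in for rq->wake_list */

/* consumer: take everything queued so far in one atomic exchange, then walk it */
static void process_pending(void)
{
        struct task *t = atomic_exchange(&wake_list, NULL);

        if (!t)                 /* nothing queued: the cheap resched-only case */
                return;

        /* the kernel wraps this walk in irq_enter()/irq_exit(); here we just print */
        while (t) {
                struct task *next = t->next;

                printf("wake %s\n", t->name);
                t = next;
        }
}

int main(void)
{
        static struct task a = { .name = "a" }, b = { .name = "b" };

        a.next = NULL;                  /* enqueue a, then b in front of it */
        atomic_store(&wake_list, &a);
        b.next = &a;
        atomic_store(&wake_list, &b);

        process_pending();              /* drains the list: prints b, then a */
        process_pending();              /* list already empty: returns at once */
        return 0;
}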
@@ -2853,7 +2898,7 @@ void sched_fork(struct task_struct *p)
2853#if defined(CONFIG_SMP) 2898#if defined(CONFIG_SMP)
2854 p->on_cpu = 0; 2899 p->on_cpu = 0;
2855#endif 2900#endif
2856#ifdef CONFIG_PREEMPT 2901#ifdef CONFIG_PREEMPT_COUNT
2857 /* Want to start with kernel preemption disabled. */ 2902 /* Want to start with kernel preemption disabled. */
2858 task_thread_info(p)->preempt_count = 1; 2903 task_thread_info(p)->preempt_count = 1;
2859#endif 2904#endif
@@ -3844,6 +3889,25 @@ void account_idle_time(cputime_t cputime)
3844 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3889 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3845} 3890}
3846 3891
3892static __always_inline bool steal_account_process_tick(void)
3893{
3894#ifdef CONFIG_PARAVIRT
3895 if (static_branch(&paravirt_steal_enabled)) {
3896 u64 steal, st = 0;
3897
3898 steal = paravirt_steal_clock(smp_processor_id());
3899 steal -= this_rq()->prev_steal_time;
3900
3901 st = steal_ticks(steal);
3902 this_rq()->prev_steal_time += st * TICK_NSEC;
3903
3904 account_steal_time(st);
3905 return st;
3906 }
3907#endif
3908 return false;
3909}
3910
3847#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3911#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3848 3912
3849#ifdef CONFIG_IRQ_TIME_ACCOUNTING 3913#ifdef CONFIG_IRQ_TIME_ACCOUNTING
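steal_account_process_tick() samples the hypervisor's steal clock, credits whole elapsed ticks to the steal bucket and advances prev_steal_time only by the rounded amount, so sub-tick remainders carry forward instead of being lost. The carry logic in isolation (TICK_NSEC again assumes HZ=1000):

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL            /* HZ=1000 assumed for the example */

static uint64_t prev_steal_time;        /* stand-in for rq->prev_steal_time */

/* returns the number of ticks credited as steal for this sample */
static uint64_t account_steal(uint64_t steal_clock_now)
{
        uint64_t steal = steal_clock_now - prev_steal_time;
        uint64_t ticks = steal / TICK_NSEC;

        prev_steal_time += ticks * TICK_NSEC;   /* keep the sub-tick remainder */
        return ticks;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)account_steal(1500000));   /* 1 tick, 0.5 carried */
        printf("%llu\n", (unsigned long long)account_steal(3000000));   /* 2 ticks, carry used */
        printf("%llu\n", (unsigned long long)account_steal(4000000));   /* 1 tick */
        return 0;
}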
@@ -3875,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3875 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 3939 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3876 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3940 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3877 3941
3942 if (steal_account_process_tick())
3943 return;
3944
3878 if (irqtime_account_hi_update()) { 3945 if (irqtime_account_hi_update()) {
3879 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3946 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3880 } else if (irqtime_account_si_update()) { 3947 } else if (irqtime_account_si_update()) {
@@ -3928,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
3928 return; 3995 return;
3929 } 3996 }
3930 3997
3998 if (steal_account_process_tick())
3999 return;
4000
3931 if (user_tick) 4001 if (user_tick)
3932 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 4002 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3933 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 4003 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -4305,11 +4375,8 @@ EXPORT_SYMBOL(schedule);
4305 4375
4306static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 4376static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4307{ 4377{
4308 bool ret = false;
4309
4310 rcu_read_lock();
4311 if (lock->owner != owner) 4378 if (lock->owner != owner)
4312 goto fail; 4379 return false;
4313 4380
4314 /* 4381 /*
4315 * Ensure we emit the owner->on_cpu, dereference _after_ checking 4382 * Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4319,11 +4386,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4319 */ 4386 */
4320 barrier(); 4387 barrier();
4321 4388
4322 ret = owner->on_cpu; 4389 return owner->on_cpu;
4323fail:
4324 rcu_read_unlock();
4325
4326 return ret;
4327} 4390}
4328 4391
4329/* 4392/*
@@ -4335,21 +4398,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4335 if (!sched_feat(OWNER_SPIN)) 4398 if (!sched_feat(OWNER_SPIN))
4336 return 0; 4399 return 0;
4337 4400
4401 rcu_read_lock();
4338 while (owner_running(lock, owner)) { 4402 while (owner_running(lock, owner)) {
4339 if (need_resched()) 4403 if (need_resched())
4340 return 0; 4404 break;
4341 4405
4342 arch_mutex_cpu_relax(); 4406 arch_mutex_cpu_relax();
4343 } 4407 }
4408 rcu_read_unlock();
4344 4409
4345 /* 4410 /*
4346 * If the owner changed to another task there is likely 4411 * We break out the loop above on need_resched() and when the
4347 * heavy contention, stop spinning. 4412 * owner changed, which is a sign for heavy contention. Return
4413 * success only when lock->owner is NULL.
4348 */ 4414 */
4349 if (lock->owner) 4415 return lock->owner == NULL;
4350 return 0;
4351
4352 return 1;
4353} 4416}
4354#endif 4417#endif
4355 4418
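mutex_spin_on_owner() now holds rcu_read_lock() across the whole spin loop, so the owner task_struct stays valid for every dereference, and the exit logic collapses: the loop breaks on need_resched() or when the owner changes, and spinning is reported as worthwhile only if the lock actually ended up free, i.e. lock->owner == NULL. A control-flow model of that decision (plain C, no real locking or RCU):

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct owner { bool on_cpu; };
struct lock  { struct owner *owner; };

static bool need_resched_now;           /* stand-in for need_resched() */

static bool owner_running(struct lock *l, struct owner *o)
{
        if (l->owner != o)              /* owner changed or lock released: stop */
                return false;
        return o->on_cpu;               /* spin only while the owner is on a CPU */
}

/* returns 1 when it is worth retrying the lock right away, 0 to block instead */
static int spin_on_owner(struct lock *l, struct owner *o)
{
        while (owner_running(l, o)) {
                if (need_resched_now)
                        break;
                /* cpu_relax() would sit here */
        }
        /*
         * We got here because of need_resched() or an owner change, both of
         * which usually mean contention; report success only if the lock is free.
         */
        return l->owner == NULL;
}

int main(void)
{
        struct owner o = { .on_cpu = true };
        struct lock  l = { .owner = &o };

        o.on_cpu = false;                               /* owner scheduled out... */
        printf("%d\n", spin_on_owner(&l, &o));          /* ...still holds the lock: 0 */

        l.owner = NULL;                                 /* owner dropped the lock */
        printf("%d\n", spin_on_owner(&l, &o));          /* worth grabbing now: 1 */
        return 0;
}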
@@ -6556,7 +6619,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6556 break; 6619 break;
6557 } 6620 }
6558 6621
6559 if (!group->cpu_power) { 6622 if (!group->sgp->power) {
6560 printk(KERN_CONT "\n"); 6623 printk(KERN_CONT "\n");
6561 printk(KERN_ERR "ERROR: domain->cpu_power not " 6624 printk(KERN_ERR "ERROR: domain->cpu_power not "
6562 "set\n"); 6625 "set\n");
@@ -6580,9 +6643,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6580 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6643 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6581 6644
6582 printk(KERN_CONT " %s", str); 6645 printk(KERN_CONT " %s", str);
6583 if (group->cpu_power != SCHED_POWER_SCALE) { 6646 if (group->sgp->power != SCHED_POWER_SCALE) {
6584 printk(KERN_CONT " (cpu_power = %d)", 6647 printk(KERN_CONT " (cpu_power = %d)",
6585 group->cpu_power); 6648 group->sgp->power);
6586 } 6649 }
6587 6650
6588 group = group->next; 6651 group = group->next;
@@ -6773,11 +6836,39 @@ static struct root_domain *alloc_rootdomain(void)
6773 return rd; 6836 return rd;
6774} 6837}
6775 6838
6839static void free_sched_groups(struct sched_group *sg, int free_sgp)
6840{
6841 struct sched_group *tmp, *first;
6842
6843 if (!sg)
6844 return;
6845
6846 first = sg;
6847 do {
6848 tmp = sg->next;
6849
6850 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
6851 kfree(sg->sgp);
6852
6853 kfree(sg);
6854 sg = tmp;
6855 } while (sg != first);
6856}
6857
6776static void free_sched_domain(struct rcu_head *rcu) 6858static void free_sched_domain(struct rcu_head *rcu)
6777{ 6859{
6778 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 6860 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6779 if (atomic_dec_and_test(&sd->groups->ref)) 6861
6862 /*
6863 * If its an overlapping domain it has private groups, iterate and
6864 * nuke them all.
6865 */
6866 if (sd->flags & SD_OVERLAP) {
6867 free_sched_groups(sd->groups, 1);
6868 } else if (atomic_dec_and_test(&sd->groups->ref)) {
6869 kfree(sd->groups->sgp);
6780 kfree(sd->groups); 6870 kfree(sd->groups);
6871 }
6781 kfree(sd); 6872 kfree(sd);
6782} 6873}
6783 6874
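free_sched_groups() walks a circular, singly linked ring of groups, freeing each node and dropping the shared sched_group_power only when its refcount reaches zero; since the ring has no NULL terminator, the walk saves ->next before freeing and stops once it is back at the starting node. A toy version with a plain int refcount standing in for atomic_t:

#include <stdio.h>
#include <stdlib.h>

struct power { int ref; };                              /* stand-in for sched_group_power */
struct group { struct group *next; struct power *sgp; };

static void free_groups(struct group *sg, int free_power)
{
        struct group *first = sg, *tmp;

        if (!sg)
                return;
        do {
                tmp = sg->next;                         /* save before freeing the node */
                if (free_power && --sg->sgp->ref == 0)
                        free(sg->sgp);                  /* last reference: free shared data */
                free(sg);
                sg = tmp;
        } while (sg != first);                          /* circular list: stop at the start */
}

int main(void)
{
        struct power *p = malloc(sizeof(*p));
        struct group *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));

        p->ref = 2;                                     /* both groups share one power struct */
        a->sgp = b->sgp = p;
        a->next = b;
        b->next = a;                                    /* two-node ring */

        free_groups(a, 1);
        puts("freed the ring and its shared power");
        return 0;
}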
@@ -6944,6 +7035,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6944struct sd_data { 7035struct sd_data {
6945 struct sched_domain **__percpu sd; 7036 struct sched_domain **__percpu sd;
6946 struct sched_group **__percpu sg; 7037 struct sched_group **__percpu sg;
7038 struct sched_group_power **__percpu sgp;
6947}; 7039};
6948 7040
6949struct s_data { 7041struct s_data {
@@ -6963,15 +7055,73 @@ struct sched_domain_topology_level;
6963typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 7055typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6964typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 7056typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6965 7057
7058#define SDTL_OVERLAP 0x01
7059
6966struct sched_domain_topology_level { 7060struct sched_domain_topology_level {
6967 sched_domain_init_f init; 7061 sched_domain_init_f init;
6968 sched_domain_mask_f mask; 7062 sched_domain_mask_f mask;
7063 int flags;
6969 struct sd_data data; 7064 struct sd_data data;
6970}; 7065};
6971 7066
6972/* 7067static int
6973 * Assumes the sched_domain tree is fully constructed 7068build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6974 */ 7069{
7070 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
7071 const struct cpumask *span = sched_domain_span(sd);
7072 struct cpumask *covered = sched_domains_tmpmask;
7073 struct sd_data *sdd = sd->private;
7074 struct sched_domain *child;
7075 int i;
7076
7077 cpumask_clear(covered);
7078
7079 for_each_cpu(i, span) {
7080 struct cpumask *sg_span;
7081
7082 if (cpumask_test_cpu(i, covered))
7083 continue;
7084
7085 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7086 GFP_KERNEL, cpu_to_node(i));
7087
7088 if (!sg)
7089 goto fail;
7090
7091 sg_span = sched_group_cpus(sg);
7092
7093 child = *per_cpu_ptr(sdd->sd, i);
7094 if (child->child) {
7095 child = child->child;
7096 cpumask_copy(sg_span, sched_domain_span(child));
7097 } else
7098 cpumask_set_cpu(i, sg_span);
7099
7100 cpumask_or(covered, covered, sg_span);
7101
7102 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
7103 atomic_inc(&sg->sgp->ref);
7104
7105 if (cpumask_test_cpu(cpu, sg_span))
7106 groups = sg;
7107
7108 if (!first)
7109 first = sg;
7110 if (last)
7111 last->next = sg;
7112 last = sg;
7113 last->next = first;
7114 }
7115 sd->groups = groups;
7116
7117 return 0;
7118
7119fail:
7120 free_sched_groups(first, 0);
7121
7122 return -ENOMEM;
7123}
7124
6975static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 7125static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6976{ 7126{
6977 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 7127 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
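build_overlap_sched_groups() creates one group per not-yet-covered child span: CPUs already inside some group's span are skipped via the 'covered' cpumask, so an overlapping NUMA level ends up with exactly one group per distinct child domain, linked into a ring with the requesting CPU's group at sd->groups. The covered-mask loop on its own, with a 32-bit mask standing in for a cpumask and made-up spans:

#include <stdio.h>
#include <stdint.h>

#define NCPUS 8

/* the span each CPU's child domain would cover (toy data, not real topology) */
static const uint32_t child_span[NCPUS] = {
        0x0f, 0x0f, 0x0f, 0x0f,         /* cpus 0-3 share one child domain */
        0xf0, 0xf0, 0xf0, 0xf0,         /* cpus 4-7 share another */
};

int main(void)
{
        uint32_t covered = 0;
        int groups = 0;
        int i;

        for (i = 0; i < NCPUS; i++) {
                if (covered & (1u << i))        /* already inside some group: skip */
                        continue;
                covered |= child_span[i];       /* this group covers the child's span */
                groups++;
                printf("group %d covers 0x%02x\n", groups, child_span[i]);
        }
        printf("total groups: %d\n", groups);   /* 2 for this toy topology */
        return 0;
}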
@@ -6980,24 +7130,24 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6980 if (child) 7130 if (child)
6981 cpu = cpumask_first(sched_domain_span(child)); 7131 cpu = cpumask_first(sched_domain_span(child));
6982 7132
6983 if (sg) 7133 if (sg) {
6984 *sg = *per_cpu_ptr(sdd->sg, cpu); 7134 *sg = *per_cpu_ptr(sdd->sg, cpu);
7135 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
7136 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
7137 }
6985 7138
6986 return cpu; 7139 return cpu;
6987} 7140}
6988 7141
6989/* 7142/*
6990 * build_sched_groups takes the cpumask we wish to span, and a pointer
6991 * to a function which identifies what group(along with sched group) a CPU
6992 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6993 * (due to the fact that we keep track of groups covered with a struct cpumask).
6994 *
6995 * build_sched_groups will build a circular linked list of the groups 7143 * build_sched_groups will build a circular linked list of the groups
6996 * covered by the given span, and will set each group's ->cpumask correctly, 7144 * covered by the given span, and will set each group's ->cpumask correctly,
6997 * and ->cpu_power to 0. 7145 * and ->cpu_power to 0.
7146 *
7147 * Assumes the sched_domain tree is fully constructed
6998 */ 7148 */
6999static void 7149static int
7000build_sched_groups(struct sched_domain *sd) 7150build_sched_groups(struct sched_domain *sd, int cpu)
7001{ 7151{
7002 struct sched_group *first = NULL, *last = NULL; 7152 struct sched_group *first = NULL, *last = NULL;
7003 struct sd_data *sdd = sd->private; 7153 struct sd_data *sdd = sd->private;
@@ -7005,6 +7155,12 @@ build_sched_groups(struct sched_domain *sd)
7005 struct cpumask *covered; 7155 struct cpumask *covered;
7006 int i; 7156 int i;
7007 7157
7158 get_group(cpu, sdd, &sd->groups);
7159 atomic_inc(&sd->groups->ref);
7160
7161 if (cpu != cpumask_first(sched_domain_span(sd)))
7162 return 0;
7163
7008 lockdep_assert_held(&sched_domains_mutex); 7164 lockdep_assert_held(&sched_domains_mutex);
7009 covered = sched_domains_tmpmask; 7165 covered = sched_domains_tmpmask;
7010 7166
@@ -7019,7 +7175,7 @@ build_sched_groups(struct sched_domain *sd)
7019 continue; 7175 continue;
7020 7176
7021 cpumask_clear(sched_group_cpus(sg)); 7177 cpumask_clear(sched_group_cpus(sg));
7022 sg->cpu_power = 0; 7178 sg->sgp->power = 0;
7023 7179
7024 for_each_cpu(j, span) { 7180 for_each_cpu(j, span) {
7025 if (get_group(j, sdd, NULL) != group) 7181 if (get_group(j, sdd, NULL) != group)
@@ -7036,6 +7192,8 @@ build_sched_groups(struct sched_domain *sd)
7036 last = sg; 7192 last = sg;
7037 } 7193 }
7038 last->next = first; 7194 last->next = first;
7195
7196 return 0;
7039} 7197}
7040 7198
7041/* 7199/*
@@ -7050,12 +7208,17 @@ build_sched_groups(struct sched_domain *sd)
7050 */ 7208 */
7051static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7209static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7052{ 7210{
7053 WARN_ON(!sd || !sd->groups); 7211 struct sched_group *sg = sd->groups;
7054 7212
7055 if (cpu != group_first_cpu(sd->groups)) 7213 WARN_ON(!sd || !sg);
7056 return;
7057 7214
7058 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7215 do {
7216 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
7217 sg = sg->next;
7218 } while (sg != sd->groups);
7219
7220 if (cpu != group_first_cpu(sg))
7221 return;
7059 7222
7060 update_group_power(sd, cpu); 7223 update_group_power(sd, cpu);
7061} 7224}
@@ -7176,15 +7339,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7176static void claim_allocations(int cpu, struct sched_domain *sd) 7339static void claim_allocations(int cpu, struct sched_domain *sd)
7177{ 7340{
7178 struct sd_data *sdd = sd->private; 7341 struct sd_data *sdd = sd->private;
7179 struct sched_group *sg = sd->groups;
7180 7342
7181 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 7343 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7182 *per_cpu_ptr(sdd->sd, cpu) = NULL; 7344 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7183 7345
7184 if (cpu == cpumask_first(sched_group_cpus(sg))) { 7346 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
7185 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7186 *per_cpu_ptr(sdd->sg, cpu) = NULL; 7347 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7187 } 7348
7349 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
7350 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
7188} 7351}
7189 7352
7190#ifdef CONFIG_SCHED_SMT 7353#ifdef CONFIG_SCHED_SMT
@@ -7209,7 +7372,7 @@ static struct sched_domain_topology_level default_topology[] = {
7209#endif 7372#endif
7210 { sd_init_CPU, cpu_cpu_mask, }, 7373 { sd_init_CPU, cpu_cpu_mask, },
7211#ifdef CONFIG_NUMA 7374#ifdef CONFIG_NUMA
7212 { sd_init_NODE, cpu_node_mask, }, 7375 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
7213 { sd_init_ALLNODES, cpu_allnodes_mask, }, 7376 { sd_init_ALLNODES, cpu_allnodes_mask, },
7214#endif 7377#endif
7215 { NULL, }, 7378 { NULL, },
@@ -7233,9 +7396,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
7233 if (!sdd->sg) 7396 if (!sdd->sg)
7234 return -ENOMEM; 7397 return -ENOMEM;
7235 7398
7399 sdd->sgp = alloc_percpu(struct sched_group_power *);
7400 if (!sdd->sgp)
7401 return -ENOMEM;
7402
7236 for_each_cpu(j, cpu_map) { 7403 for_each_cpu(j, cpu_map) {
7237 struct sched_domain *sd; 7404 struct sched_domain *sd;
7238 struct sched_group *sg; 7405 struct sched_group *sg;
7406 struct sched_group_power *sgp;
7239 7407
7240 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 7408 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7241 GFP_KERNEL, cpu_to_node(j)); 7409 GFP_KERNEL, cpu_to_node(j));
@@ -7250,6 +7418,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
7250 return -ENOMEM; 7418 return -ENOMEM;
7251 7419
7252 *per_cpu_ptr(sdd->sg, j) = sg; 7420 *per_cpu_ptr(sdd->sg, j) = sg;
7421
7422 sgp = kzalloc_node(sizeof(struct sched_group_power),
7423 GFP_KERNEL, cpu_to_node(j));
7424 if (!sgp)
7425 return -ENOMEM;
7426
7427 *per_cpu_ptr(sdd->sgp, j) = sgp;
7253 } 7428 }
7254 } 7429 }
7255 7430
@@ -7265,11 +7440,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
7265 struct sd_data *sdd = &tl->data; 7440 struct sd_data *sdd = &tl->data;
7266 7441
7267 for_each_cpu(j, cpu_map) { 7442 for_each_cpu(j, cpu_map) {
7268 kfree(*per_cpu_ptr(sdd->sd, j)); 7443 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7444 if (sd && (sd->flags & SD_OVERLAP))
7445 free_sched_groups(sd->groups, 0);
7269 kfree(*per_cpu_ptr(sdd->sg, j)); 7446 kfree(*per_cpu_ptr(sdd->sg, j));
7447 kfree(*per_cpu_ptr(sdd->sgp, j));
7270 } 7448 }
7271 free_percpu(sdd->sd); 7449 free_percpu(sdd->sd);
7272 free_percpu(sdd->sg); 7450 free_percpu(sdd->sg);
7451 free_percpu(sdd->sgp);
7273 } 7452 }
7274} 7453}
7275 7454
@@ -7315,8 +7494,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
7315 struct sched_domain_topology_level *tl; 7494 struct sched_domain_topology_level *tl;
7316 7495
7317 sd = NULL; 7496 sd = NULL;
7318 for (tl = sched_domain_topology; tl->init; tl++) 7497 for (tl = sched_domain_topology; tl->init; tl++) {
7319 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 7498 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7499 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7500 sd->flags |= SD_OVERLAP;
7501 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7502 break;
7503 }
7320 7504
7321 while (sd->child) 7505 while (sd->child)
7322 sd = sd->child; 7506 sd = sd->child;
@@ -7328,13 +7512,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
7328 for_each_cpu(i, cpu_map) { 7512 for_each_cpu(i, cpu_map) {
7329 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 7513 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7330 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 7514 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7331 get_group(i, sd->private, &sd->groups); 7515 if (sd->flags & SD_OVERLAP) {
7332 atomic_inc(&sd->groups->ref); 7516 if (build_overlap_sched_groups(sd, i))
7333 7517 goto error;
7334 if (i != cpumask_first(sched_domain_span(sd))) 7518 } else {
7335 continue; 7519 if (build_sched_groups(sd, i))
7336 7520 goto error;
7337 build_sched_groups(sd); 7521 }
7338 } 7522 }
7339 } 7523 }
7340 7524
@@ -7744,18 +7928,14 @@ int in_sched_functions(unsigned long addr)
7744 && addr < (unsigned long)__sched_text_end); 7928 && addr < (unsigned long)__sched_text_end);
7745} 7929}
7746 7930
7747static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7931static void init_cfs_rq(struct cfs_rq *cfs_rq)
7748{ 7932{
7749 cfs_rq->tasks_timeline = RB_ROOT; 7933 cfs_rq->tasks_timeline = RB_ROOT;
7750 INIT_LIST_HEAD(&cfs_rq->tasks); 7934 INIT_LIST_HEAD(&cfs_rq->tasks);
7751#ifdef CONFIG_FAIR_GROUP_SCHED
7752 cfs_rq->rq = rq;
7753 /* allow initial update_cfs_load() to truncate */
7754#ifdef CONFIG_SMP
7755 cfs_rq->load_stamp = 1;
7756#endif
7757#endif
7758 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7935 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7936#ifndef CONFIG_64BIT
7937 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7938#endif
7759} 7939}
7760 7940
7761static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 7941static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -7771,27 +7951,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7771 /* delimiter for bitsearch: */ 7951 /* delimiter for bitsearch: */
7772 __set_bit(MAX_RT_PRIO, array->bitmap); 7952 __set_bit(MAX_RT_PRIO, array->bitmap);
7773 7953
7774#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 7954#if defined CONFIG_SMP
7775 rt_rq->highest_prio.curr = MAX_RT_PRIO; 7955 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7776#ifdef CONFIG_SMP
7777 rt_rq->highest_prio.next = MAX_RT_PRIO; 7956 rt_rq->highest_prio.next = MAX_RT_PRIO;
7778#endif
7779#endif
7780#ifdef CONFIG_SMP
7781 rt_rq->rt_nr_migratory = 0; 7957 rt_rq->rt_nr_migratory = 0;
7782 rt_rq->overloaded = 0; 7958 rt_rq->overloaded = 0;
7783 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 7959 plist_head_init(&rt_rq->pushable_tasks);
7784#endif 7960#endif
7785 7961
7786 rt_rq->rt_time = 0; 7962 rt_rq->rt_time = 0;
7787 rt_rq->rt_throttled = 0; 7963 rt_rq->rt_throttled = 0;
7788 rt_rq->rt_runtime = 0; 7964 rt_rq->rt_runtime = 0;
7789 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 7965 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
7790
7791#ifdef CONFIG_RT_GROUP_SCHED
7792 rt_rq->rt_nr_boosted = 0;
7793 rt_rq->rq = rq;
7794#endif
7795} 7966}
7796 7967
7797#ifdef CONFIG_FAIR_GROUP_SCHED 7968#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7800,11 +7971,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7800 struct sched_entity *parent) 7971 struct sched_entity *parent)
7801{ 7972{
7802 struct rq *rq = cpu_rq(cpu); 7973 struct rq *rq = cpu_rq(cpu);
7803 tg->cfs_rq[cpu] = cfs_rq; 7974
7804 init_cfs_rq(cfs_rq, rq);
7805 cfs_rq->tg = tg; 7975 cfs_rq->tg = tg;
7976 cfs_rq->rq = rq;
7977#ifdef CONFIG_SMP
7978 /* allow initial update_cfs_load() to truncate */
7979 cfs_rq->load_stamp = 1;
7980#endif
7806 7981
7982 tg->cfs_rq[cpu] = cfs_rq;
7807 tg->se[cpu] = se; 7983 tg->se[cpu] = se;
7984
7808 /* se could be NULL for root_task_group */ 7985 /* se could be NULL for root_task_group */
7809 if (!se) 7986 if (!se)
7810 return; 7987 return;
@@ -7827,12 +8004,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7827{ 8004{
7828 struct rq *rq = cpu_rq(cpu); 8005 struct rq *rq = cpu_rq(cpu);
7829 8006
7830 tg->rt_rq[cpu] = rt_rq; 8007 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7831 init_rt_rq(rt_rq, rq); 8008 rt_rq->rt_nr_boosted = 0;
8009 rt_rq->rq = rq;
7832 rt_rq->tg = tg; 8010 rt_rq->tg = tg;
7833 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7834 8011
8012 tg->rt_rq[cpu] = rt_rq;
7835 tg->rt_se[cpu] = rt_se; 8013 tg->rt_se[cpu] = rt_se;
8014
7836 if (!rt_se) 8015 if (!rt_se)
7837 return; 8016 return;
7838 8017
@@ -7914,7 +8093,7 @@ void __init sched_init(void)
7914 rq->nr_running = 0; 8093 rq->nr_running = 0;
7915 rq->calc_load_active = 0; 8094 rq->calc_load_active = 0;
7916 rq->calc_load_update = jiffies + LOAD_FREQ; 8095 rq->calc_load_update = jiffies + LOAD_FREQ;
7917 init_cfs_rq(&rq->cfs, rq); 8096 init_cfs_rq(&rq->cfs);
7918 init_rt_rq(&rq->rt, rq); 8097 init_rt_rq(&rq->rt, rq);
7919#ifdef CONFIG_FAIR_GROUP_SCHED 8098#ifdef CONFIG_FAIR_GROUP_SCHED
7920 root_task_group.shares = root_task_group_load; 8099 root_task_group.shares = root_task_group_load;
@@ -7985,7 +8164,7 @@ void __init sched_init(void)
7985#endif 8164#endif
7986 8165
7987#ifdef CONFIG_RT_MUTEXES 8166#ifdef CONFIG_RT_MUTEXES
7988 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 8167 plist_head_init(&init_task.pi_waiters);
7989#endif 8168#endif
7990 8169
7991 /* 8170 /*
@@ -8028,7 +8207,7 @@ void __init sched_init(void)
8028 scheduler_running = 1; 8207 scheduler_running = 1;
8029} 8208}
8030 8209
8031#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 8210#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8032static inline int preempt_count_equals(int preempt_offset) 8211static inline int preempt_count_equals(int preempt_offset)
8033{ 8212{
8034 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8213 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8038,7 +8217,6 @@ static inline int preempt_count_equals(int preempt_offset)
8038 8217
8039void __might_sleep(const char *file, int line, int preempt_offset) 8218void __might_sleep(const char *file, int line, int preempt_offset)
8040{ 8219{
8041#ifdef in_atomic
8042 static unsigned long prev_jiffy; /* ratelimiting */ 8220 static unsigned long prev_jiffy; /* ratelimiting */
8043 8221
8044 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8222 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@ -8060,7 +8238,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
8060 if (irqs_disabled()) 8238 if (irqs_disabled())
8061 print_irqtrace_events(current); 8239 print_irqtrace_events(current);
8062 dump_stack(); 8240 dump_stack();
8063#endif
8064} 8241}
8065EXPORT_SYMBOL(__might_sleep); 8242EXPORT_SYMBOL(__might_sleep);
8066#endif 8243#endif
@@ -8219,6 +8396,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8219 if (!se) 8396 if (!se)
8220 goto err_free_rq; 8397 goto err_free_rq;
8221 8398
8399 init_cfs_rq(cfs_rq);
8222 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8400 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8223 } 8401 }
8224 8402
@@ -8246,7 +8424,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8246 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8424 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8247 raw_spin_unlock_irqrestore(&rq->lock, flags); 8425 raw_spin_unlock_irqrestore(&rq->lock, flags);
8248} 8426}
8249#else /* !CONFG_FAIR_GROUP_SCHED */ 8427#else /* !CONFIG_FAIR_GROUP_SCHED */
8250static inline void free_fair_sched_group(struct task_group *tg) 8428static inline void free_fair_sched_group(struct task_group *tg)
8251{ 8429{
8252} 8430}
@@ -8267,7 +8445,8 @@ static void free_rt_sched_group(struct task_group *tg)
8267{ 8445{
8268 int i; 8446 int i;
8269 8447
8270 destroy_rt_bandwidth(&tg->rt_bandwidth); 8448 if (tg->rt_se)
8449 destroy_rt_bandwidth(&tg->rt_bandwidth);
8271 8450
8272 for_each_possible_cpu(i) { 8451 for_each_possible_cpu(i) {
8273 if (tg->rt_rq) 8452 if (tg->rt_rq)
@@ -8308,6 +8487,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8308 if (!rt_se) 8487 if (!rt_se)
8309 goto err_free_rq; 8488 goto err_free_rq;
8310 8489
8490 init_rt_rq(rt_rq, cpu_rq(i));
8491 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8311 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 8492 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8312 } 8493 }
8313 8494
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 05577055cfca..c2f0e7248dca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -13,6 +13,7 @@ struct autogroup {
13 int nice; 13 int nice;
14}; 14};
15 15
16static inline bool task_group_is_autogroup(struct task_group *tg);
16static inline struct task_group * 17static inline struct task_group *
17autogroup_task_group(struct task_struct *p, struct task_group *tg); 18autogroup_task_group(struct task_struct *p, struct task_group *tg);
18 19
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 433491c2dc8f..bc8ee9993814 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
135 return grp->my_q; 135 return grp->my_q;
136} 136}
137 137
138/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
139 * another cpu ('this_cpu')
140 */
141static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
142{
143 return cfs_rq->tg->cfs_rq[this_cpu];
144}
145
146static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 138static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
147{ 139{
148 if (!cfs_rq->on_list) { 140 if (!cfs_rq->on_list) {
@@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
271 return NULL; 263 return NULL;
272} 264}
273 265
274static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
275{
276 return &cpu_rq(this_cpu)->cfs;
277}
278
279static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 266static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
280{ 267{
281} 268}
@@ -334,11 +321,6 @@ static inline int entity_before(struct sched_entity *a,
334 return (s64)(a->vruntime - b->vruntime) < 0; 321 return (s64)(a->vruntime - b->vruntime) < 0;
335} 322}
336 323
337static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
338{
339 return se->vruntime - cfs_rq->min_vruntime;
340}
341
342static void update_min_vruntime(struct cfs_rq *cfs_rq) 324static void update_min_vruntime(struct cfs_rq *cfs_rq)
343{ 325{
344 u64 vruntime = cfs_rq->min_vruntime; 326 u64 vruntime = cfs_rq->min_vruntime;
@@ -372,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
372 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 354 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
373 struct rb_node *parent = NULL; 355 struct rb_node *parent = NULL;
374 struct sched_entity *entry; 356 struct sched_entity *entry;
375 s64 key = entity_key(cfs_rq, se);
376 int leftmost = 1; 357 int leftmost = 1;
377 358
378 /* 359 /*
@@ -385,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
385 * We dont care about collisions. Nodes with 366 * We dont care about collisions. Nodes with
386 * the same key stay together. 367 * the same key stay together.
387 */ 368 */
388 if (key < entity_key(cfs_rq, entry)) { 369 if (entity_before(se, entry)) {
389 link = &parent->rb_left; 370 link = &parent->rb_left;
390 } else { 371 } else {
391 link = &parent->rb_right; 372 link = &parent->rb_right;
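__enqueue_entity() now orders the timeline rbtree with entity_before() directly instead of a cached entity_key(); the comparison goes through a signed 64-bit difference, which stays correct even after the unsigned vruntime values wrap (the initial min_vruntime of -(1LL << 20), visible in init_cfs_rq() in the sched.c hunk above, sits deliberately close to wraparound to exercise exactly that). The wrap-safe comparison in isolation:

#include <stdio.h>
#include <stdint.h>

/* true if a's vruntime is "before" b's, even across u64 wraparound */
static int entity_before(uint64_t a_vruntime, uint64_t b_vruntime)
{
        return (int64_t)(a_vruntime - b_vruntime) < 0;
}

int main(void)
{
        uint64_t near_wrap = (uint64_t)-(1LL << 20);    /* the initial min_vruntime */

        /* a plain u64 compare would call the wrapped value "smaller"; this does not */
        printf("%d\n", entity_before(near_wrap, near_wrap + (2 << 20)));        /* 1 */
        printf("%d\n", entity_before(near_wrap + (2 << 20), near_wrap));        /* 0 */
        return 0;
}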
@@ -1336,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1336 } 1317 }
1337 1318
1338 for_each_sched_entity(se) { 1319 for_each_sched_entity(se) {
1339 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1320 cfs_rq = cfs_rq_of(se);
1340 1321
1341 update_cfs_load(cfs_rq, 0); 1322 update_cfs_load(cfs_rq, 0);
1342 update_cfs_shares(cfs_rq); 1323 update_cfs_shares(cfs_rq);
@@ -1370,13 +1351,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1370 */ 1351 */
1371 if (task_sleep && parent_entity(se)) 1352 if (task_sleep && parent_entity(se))
1372 set_next_buddy(parent_entity(se)); 1353 set_next_buddy(parent_entity(se));
1354
1355 /* avoid re-evaluating load for this entity */
1356 se = parent_entity(se);
1373 break; 1357 break;
1374 } 1358 }
1375 flags |= DEQUEUE_SLEEP; 1359 flags |= DEQUEUE_SLEEP;
1376 } 1360 }
1377 1361
1378 for_each_sched_entity(se) { 1362 for_each_sched_entity(se) {
1379 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1363 cfs_rq = cfs_rq_of(se);
1380 1364
1381 update_cfs_load(cfs_rq, 0); 1365 update_cfs_load(cfs_rq, 0);
1382 update_cfs_shares(cfs_rq); 1366 update_cfs_shares(cfs_rq);
@@ -1481,7 +1465,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1481 * effect of the currently running task from the load 1465 * effect of the currently running task from the load
1482 * of the current CPU: 1466 * of the current CPU:
1483 */ 1467 */
1484 rcu_read_lock();
1485 if (sync) { 1468 if (sync) {
1486 tg = task_group(current); 1469 tg = task_group(current);
1487 weight = current->se.load.weight; 1470 weight = current->se.load.weight;
@@ -1517,7 +1500,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1517 balanced = this_eff_load <= prev_eff_load; 1500 balanced = this_eff_load <= prev_eff_load;
1518 } else 1501 } else
1519 balanced = true; 1502 balanced = true;
1520 rcu_read_unlock();
1521 1503
1522 /* 1504 /*
1523 * If the currently running task will sleep within 1505 * If the currently running task will sleep within
@@ -1585,7 +1567,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1585 } 1567 }
1586 1568
1587 /* Adjust by relative CPU power of the group */ 1569 /* Adjust by relative CPU power of the group */
1588 avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; 1570 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
1589 1571
1590 if (local_group) { 1572 if (local_group) {
1591 this_load = avg_load; 1573 this_load = avg_load;
@@ -1921,8 +1903,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1921 if (!sched_feat(WAKEUP_PREEMPT)) 1903 if (!sched_feat(WAKEUP_PREEMPT))
1922 return; 1904 return;
1923 1905
1924 update_curr(cfs_rq);
1925 find_matching_se(&se, &pse); 1906 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se));
1926 BUG_ON(!pse); 1908 BUG_ON(!pse);
1927 if (wakeup_preempt_entity(se, pse) == 1) { 1909 if (wakeup_preempt_entity(se, pse) == 1) {
1928 /* 1910 /*
@@ -2231,11 +2213,43 @@ static void update_shares(int cpu)
2231 struct rq *rq = cpu_rq(cpu); 2213 struct rq *rq = cpu_rq(cpu);
2232 2214
2233 rcu_read_lock(); 2215 rcu_read_lock();
2216 /*
2217 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details.
2219 */
2234 for_each_leaf_cfs_rq(rq, cfs_rq) 2220 for_each_leaf_cfs_rq(rq, cfs_rq)
2235 update_shares_cpu(cfs_rq->tg, cpu); 2221 update_shares_cpu(cfs_rq->tg, cpu);
2236 rcu_read_unlock(); 2222 rcu_read_unlock();
2237} 2223}
2238 2224
2225/*
2226 * Compute the cpu's hierarchical load factor for each task group.
2227 * This needs to be done in a top-down fashion because the load of a child
2228 * group is a fraction of its parents load.
2229 */
2230static int tg_load_down(struct task_group *tg, void *data)
2231{
2232 unsigned long load;
2233 long cpu = (long)data;
2234
2235 if (!tg->parent) {
2236 load = cpu_rq(cpu)->load.weight;
2237 } else {
2238 load = tg->parent->cfs_rq[cpu]->h_load;
2239 load *= tg->se[cpu]->load.weight;
2240 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
2241 }
2242
2243 tg->cfs_rq[cpu]->h_load = load;
2244
2245 return 0;
2246}
2247
2248static void update_h_load(long cpu)
2249{
2250 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
2251}
2252
2239static unsigned long 2253static unsigned long
2240load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2254load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2241 unsigned long max_load_move, 2255 unsigned long max_load_move,
@@ -2243,14 +2257,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2243 int *all_pinned) 2257 int *all_pinned)
2244{ 2258{
2245 long rem_load_move = max_load_move; 2259 long rem_load_move = max_load_move;
2246 int busiest_cpu = cpu_of(busiest); 2260 struct cfs_rq *busiest_cfs_rq;
2247 struct task_group *tg;
2248 2261
2249 rcu_read_lock(); 2262 rcu_read_lock();
2250 update_h_load(busiest_cpu); 2263 update_h_load(cpu_of(busiest));
2251 2264
2252 list_for_each_entry_rcu(tg, &task_groups, list) { 2265 for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
2253 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
2254 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 2266 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
2255 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 2267 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
2256 u64 rem_load, moved_load; 2268 u64 rem_load, moved_load;
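tg_load_down(), now living in sched_fair.c next to its only users, computes each group's hierarchical load top-down: the root contributes the runqueue weight, and every child gets its parent's h_load scaled by the child entity's share of the parent runqueue (the +1 in the divisor only guards against dividing by zero). Worked numerically for a two-level toy hierarchy:

#include <stdio.h>

int main(void)
{
        unsigned long rq_weight       = 3072;   /* root cfs_rq load.weight */
        unsigned long child_se_weight = 1024;   /* the child group's entity weight */
        unsigned long root_cfs_weight = 3072;   /* total weight queued on the root */

        unsigned long root_h_load  = rq_weight;
        unsigned long child_h_load = root_h_load * child_se_weight
                                     / (root_cfs_weight + 1);

        /* the child carries about a third of the cpu: 3072 * 1024 / 3073 = 1023 */
        printf("root h_load %lu, child h_load %lu\n", root_h_load, child_h_load);
        return 0;
}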
@@ -2631,7 +2643,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2631 power >>= SCHED_POWER_SHIFT; 2643 power >>= SCHED_POWER_SHIFT;
2632 } 2644 }
2633 2645
2634 sdg->cpu_power_orig = power; 2646 sdg->sgp->power_orig = power;
2635 2647
2636 if (sched_feat(ARCH_POWER)) 2648 if (sched_feat(ARCH_POWER))
2637 power *= arch_scale_freq_power(sd, cpu); 2649 power *= arch_scale_freq_power(sd, cpu);
@@ -2647,7 +2659,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2647 power = 1; 2659 power = 1;
2648 2660
2649 cpu_rq(cpu)->cpu_power = power; 2661 cpu_rq(cpu)->cpu_power = power;
2650 sdg->cpu_power = power; 2662 sdg->sgp->power = power;
2651} 2663}
2652 2664
2653static void update_group_power(struct sched_domain *sd, int cpu) 2665static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2665,11 +2677,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2665 2677
2666 group = child->groups; 2678 group = child->groups;
2667 do { 2679 do {
2668 power += group->cpu_power; 2680 power += group->sgp->power;
2669 group = group->next; 2681 group = group->next;
2670 } while (group != child->groups); 2682 } while (group != child->groups);
2671 2683
2672 sdg->cpu_power = power; 2684 sdg->sgp->power = power;
2673} 2685}
2674 2686
2675/* 2687/*
@@ -2691,7 +2703,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2691 /* 2703 /*
2692 * If ~90% of the cpu_power is still there, we're good. 2704 * If ~90% of the cpu_power is still there, we're good.
2693 */ 2705 */
2694 if (group->cpu_power * 32 > group->cpu_power_orig * 29) 2706 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
2695 return 1; 2707 return 1;
2696 2708
2697 return 0; 2709 return 0;
@@ -2771,7 +2783,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2771 } 2783 }
2772 2784
2773 /* Adjust by relative CPU power of the group */ 2785 /* Adjust by relative CPU power of the group */
2774 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; 2786 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
2775 2787
2776 /* 2788 /*
2777 * Consider the group unbalanced when the imbalance is larger 2789 * Consider the group unbalanced when the imbalance is larger
@@ -2788,7 +2800,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2788 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 2800 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2789 sgs->group_imb = 1; 2801 sgs->group_imb = 1;
2790 2802
2791 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, 2803 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
2792 SCHED_POWER_SCALE); 2804 SCHED_POWER_SCALE);
2793 if (!sgs->group_capacity) 2805 if (!sgs->group_capacity)
2794 sgs->group_capacity = fix_small_capacity(sd, group); 2806 sgs->group_capacity = fix_small_capacity(sd, group);
@@ -2877,7 +2889,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2877 return; 2889 return;
2878 2890
2879 sds->total_load += sgs.group_load; 2891 sds->total_load += sgs.group_load;
2880 sds->total_pwr += sg->cpu_power; 2892 sds->total_pwr += sg->sgp->power;
2881 2893
2882 /* 2894 /*
2883 * In case the child domain prefers tasks go to siblings 2895 * In case the child domain prefers tasks go to siblings
@@ -2962,7 +2974,7 @@ static int check_asym_packing(struct sched_domain *sd,
2962 if (this_cpu > busiest_cpu) 2974 if (this_cpu > busiest_cpu)
2963 return 0; 2975 return 0;
2964 2976
2965 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, 2977 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
2966 SCHED_POWER_SCALE); 2978 SCHED_POWER_SCALE);
2967 return 1; 2979 return 1;
2968} 2980}
@@ -2993,7 +3005,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2993 3005
2994 scaled_busy_load_per_task = sds->busiest_load_per_task 3006 scaled_busy_load_per_task = sds->busiest_load_per_task
2995 * SCHED_POWER_SCALE; 3007 * SCHED_POWER_SCALE;
2996 scaled_busy_load_per_task /= sds->busiest->cpu_power; 3008 scaled_busy_load_per_task /= sds->busiest->sgp->power;
2997 3009
2998 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3010 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2999 (scaled_busy_load_per_task * imbn)) { 3011 (scaled_busy_load_per_task * imbn)) {
@@ -3007,28 +3019,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3007 * moving them. 3019 * moving them.
3008 */ 3020 */
3009 3021
3010 pwr_now += sds->busiest->cpu_power * 3022 pwr_now += sds->busiest->sgp->power *
3011 min(sds->busiest_load_per_task, sds->max_load); 3023 min(sds->busiest_load_per_task, sds->max_load);
3012 pwr_now += sds->this->cpu_power * 3024 pwr_now += sds->this->sgp->power *
3013 min(sds->this_load_per_task, sds->this_load); 3025 min(sds->this_load_per_task, sds->this_load);
3014 pwr_now /= SCHED_POWER_SCALE; 3026 pwr_now /= SCHED_POWER_SCALE;
3015 3027
3016 /* Amount of load we'd subtract */ 3028 /* Amount of load we'd subtract */
3017 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 3029 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
3018 sds->busiest->cpu_power; 3030 sds->busiest->sgp->power;
3019 if (sds->max_load > tmp) 3031 if (sds->max_load > tmp)
3020 pwr_move += sds->busiest->cpu_power * 3032 pwr_move += sds->busiest->sgp->power *
3021 min(sds->busiest_load_per_task, sds->max_load - tmp); 3033 min(sds->busiest_load_per_task, sds->max_load - tmp);
3022 3034
3023 /* Amount of load we'd add */ 3035 /* Amount of load we'd add */
3024 if (sds->max_load * sds->busiest->cpu_power < 3036 if (sds->max_load * sds->busiest->sgp->power <
3025 sds->busiest_load_per_task * SCHED_POWER_SCALE) 3037 sds->busiest_load_per_task * SCHED_POWER_SCALE)
3026 tmp = (sds->max_load * sds->busiest->cpu_power) / 3038 tmp = (sds->max_load * sds->busiest->sgp->power) /
3027 sds->this->cpu_power; 3039 sds->this->sgp->power;
3028 else 3040 else
3029 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 3041 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
3030 sds->this->cpu_power; 3042 sds->this->sgp->power;
3031 pwr_move += sds->this->cpu_power * 3043 pwr_move += sds->this->sgp->power *
3032 min(sds->this_load_per_task, sds->this_load + tmp); 3044 min(sds->this_load_per_task, sds->this_load + tmp);
3033 pwr_move /= SCHED_POWER_SCALE; 3045 pwr_move /= SCHED_POWER_SCALE;
3034 3046
@@ -3074,7 +3086,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3074 3086
3075 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); 3087 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
3076 3088
3077 load_above_capacity /= sds->busiest->cpu_power; 3089 load_above_capacity /= sds->busiest->sgp->power;
3078 } 3090 }
3079 3091
3080 /* 3092 /*
@@ -3090,8 +3102,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3090 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 3102 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
3091 3103
3092 /* How much load to actually move to equalise the imbalance */ 3104 /* How much load to actually move to equalise the imbalance */
3093 *imbalance = min(max_pull * sds->busiest->cpu_power, 3105 *imbalance = min(max_pull * sds->busiest->sgp->power,
3094 (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3106 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
3095 / SCHED_POWER_SCALE; 3107 / SCHED_POWER_SCALE;
3096 3108
3097 /* 3109 /*
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f7371ee1..2e74677cb040 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,12 +61,14 @@ SCHED_FEAT(LB_BIAS, 1)
61SCHED_FEAT(OWNER_SPIN, 1) 61SCHED_FEAT(OWNER_SPIN, 1)
62 62
63/* 63/*
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on time not spent running tasks
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONTASK_POWER, 1)
67 67
68/* 68/*
69 * Queue remote wakeups on the target CPU and process them 69 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces. 70 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */ 71 */
72SCHED_FEAT(TTWU_QUEUE, 1) 72SCHED_FEAT(TTWU_QUEUE, 1)
73
74SCHED_FEAT(FORCE_SD_OVERLAP, 0)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 10d018212bab..97540f0c9e47 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -185,11 +185,23 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
185 185
186typedef struct task_group *rt_rq_iter_t; 186typedef struct task_group *rt_rq_iter_t;
187 187
188#define for_each_rt_rq(rt_rq, iter, rq) \ 188static inline struct task_group *next_task_group(struct task_group *tg)
189 for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ 189{
190 (&iter->list != &task_groups) && \ 190 do {
191 (rt_rq = iter->rt_rq[cpu_of(rq)]); \ 191 tg = list_entry_rcu(tg->list.next,
192 iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) 192 typeof(struct task_group), list);
193 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
194
195 if (&tg->list == &task_groups)
196 tg = NULL;
197
198 return tg;
199}
200
201#define for_each_rt_rq(rt_rq, iter, rq) \
202 for (iter = container_of(&task_groups, typeof(*iter), list); \
203 (iter = next_task_group(iter)) && \
204 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
193 205
194static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 206static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
195{ 207{
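The rewritten for_each_rt_rq() walks the global task_groups list through next_task_group(), which keeps advancing past autogroup entries (autogroups never carry RT runqueues) and returns NULL at the end of the list. The skip-while-iterating shape, with a plain array standing in for the RCU-protected list:

#include <stdio.h>
#include <stddef.h>
#include <stdbool.h>

struct tg { const char *name; bool autogroup; };

static struct tg groups[] = {
        { "root",      false },
        { "autogrp-1", true  },
        { "cgroup-a",  false },
        { "autogrp-2", true  },
        { "cgroup-b",  false },
};
#define NGROUPS (sizeof(groups) / sizeof(groups[0]))

/* advance past autogroups; NULL once the end of the list is reached */
static struct tg *next_group(size_t *pos)
{
        while (*pos < NGROUPS && groups[*pos].autogroup)
                (*pos)++;
        return *pos < NGROUPS ? &groups[(*pos)++] : NULL;
}

int main(void)
{
        struct tg *tg;
        size_t pos = 0;

        while ((tg = next_group(&pos)))
                printf("visit %s\n", tg->name); /* root, cgroup-a, cgroup-b */
        return 0;
}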
@@ -1126,7 +1138,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1126 1138
1127 rt_rq = &rq->rt; 1139 rt_rq = &rq->rt;
1128 1140
1129 if (unlikely(!rt_rq->rt_nr_running)) 1141 if (!rt_rq->rt_nr_running)
1130 return NULL; 1142 return NULL;
1131 1143
1132 if (rt_rq_throttled(rt_rq)) 1144 if (rt_rq_throttled(rt_rq))
@@ -1548,7 +1560,7 @@ skip:
1548static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) 1560static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1549{ 1561{
1550 /* Try to pull RT tasks here if we lower this rq's prio */ 1562 /* Try to pull RT tasks here if we lower this rq's prio */
1551 if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) 1563 if (rq->rt.highest_prio.curr > prev->prio)
1552 pull_rt_task(rq); 1564 pull_rt_task(rq);
1553} 1565}
1554 1566
diff --git a/kernel/signal.c b/kernel/signal.c
index ff7678603328..d7f70aed1cc0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
87 /* 87 /*
88 * Tracers may want to know about even ignored signals. 88 * Tracers may want to know about even ignored signals.
89 */ 89 */
90 return !tracehook_consider_ignored_signal(t, sig); 90 return !t->ptrace;
91} 91}
92 92
93/* 93/*
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
124 124
125static int recalc_sigpending_tsk(struct task_struct *t) 125static int recalc_sigpending_tsk(struct task_struct *t)
126{ 126{
127 if ((t->group_stop & GROUP_STOP_PENDING) || 127 if ((t->jobctl & JOBCTL_PENDING_MASK) ||
128 PENDING(&t->pending, &t->blocked) || 128 PENDING(&t->pending, &t->blocked) ||
129 PENDING(&t->signal->shared_pending, &t->blocked)) { 129 PENDING(&t->signal->shared_pending, &t->blocked)) {
130 set_tsk_thread_flag(t, TIF_SIGPENDING); 130 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
150 150
151void recalc_sigpending(void) 151void recalc_sigpending(void)
152{ 152{
153 if (unlikely(tracehook_force_sigpending())) 153 if (!recalc_sigpending_tsk(current) && !freezing(current))
154 set_thread_flag(TIF_SIGPENDING);
155 else if (!recalc_sigpending_tsk(current) && !freezing(current))
156 clear_thread_flag(TIF_SIGPENDING); 154 clear_thread_flag(TIF_SIGPENDING);
157 155
158} 156}
@@ -224,47 +222,93 @@ static inline void print_dropped_signal(int sig)
224} 222}
225 223
226/** 224/**
227 * task_clear_group_stop_trapping - clear group stop trapping bit 225 * task_set_jobctl_pending - set jobctl pending bits
228 * @task: target task 226 * @task: target task
227 * @mask: pending bits to set
229 * 228 *
230 * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it 229 * Clear @mask from @task->jobctl. @mask must be subset of
231 * and wake up the ptracer. Note that we don't need any further locking. 230 * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
232 * @task->siglock guarantees that @task->parent points to the ptracer. 231 * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is
232 * cleared. If @task is already being killed or exiting, this function
233 * becomes noop.
234 *
235 * CONTEXT:
236 * Must be called with @task->sighand->siglock held.
237 *
238 * RETURNS:
239 * %true if @mask is set, %false if made noop because @task was dying.
240 */
241bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
242{
243 BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
244 JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
245 BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
246
247 if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
248 return false;
249
250 if (mask & JOBCTL_STOP_SIGMASK)
251 task->jobctl &= ~JOBCTL_STOP_SIGMASK;
252
253 task->jobctl |= mask;
254 return true;
255}
256
257/**
258 * task_clear_jobctl_trapping - clear jobctl trapping bit
259 * @task: target task
260 *
261 * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
262 * Clear it and wake up the ptracer. Note that we don't need any further
263 * locking. @task->siglock guarantees that @task->parent points to the
264 * ptracer.
233 * 265 *
234 * CONTEXT: 266 * CONTEXT:
235 * Must be called with @task->sighand->siglock held. 267 * Must be called with @task->sighand->siglock held.
236 */ 268 */
237static void task_clear_group_stop_trapping(struct task_struct *task) 269void task_clear_jobctl_trapping(struct task_struct *task)
238{ 270{
239 if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { 271 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
240 task->group_stop &= ~GROUP_STOP_TRAPPING; 272 task->jobctl &= ~JOBCTL_TRAPPING;
241 __wake_up_sync_key(&task->parent->signal->wait_chldexit, 273 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
242 TASK_UNINTERRUPTIBLE, 1, task);
243 } 274 }
244} 275}
245 276
246/** 277/**
247 * task_clear_group_stop_pending - clear pending group stop 278 * task_clear_jobctl_pending - clear jobctl pending bits
248 * @task: target task 279 * @task: target task
280 * @mask: pending bits to clear
249 * 281 *
250 * Clear group stop states for @task. 282 * Clear @mask from @task->jobctl. @mask must be subset of
283 * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other
284 * STOP bits are cleared together.
285 *
286 * If clearing of @mask leaves no stop or trap pending, this function calls
287 * task_clear_jobctl_trapping().
251 * 288 *
252 * CONTEXT: 289 * CONTEXT:
253 * Must be called with @task->sighand->siglock held. 290 * Must be called with @task->sighand->siglock held.
254 */ 291 */
255void task_clear_group_stop_pending(struct task_struct *task) 292void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
256{ 293{
257 task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | 294 BUG_ON(mask & ~JOBCTL_PENDING_MASK);
258 GROUP_STOP_DEQUEUED); 295
296 if (mask & JOBCTL_STOP_PENDING)
297 mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
298
299 task->jobctl &= ~mask;
300
301 if (!(task->jobctl & JOBCTL_PENDING_MASK))
302 task_clear_jobctl_trapping(task);
259} 303}
260 304
261/** 305/**
262 * task_participate_group_stop - participate in a group stop 306 * task_participate_group_stop - participate in a group stop
263 * @task: task participating in a group stop 307 * @task: task participating in a group stop
264 * 308 *
265 * @task has GROUP_STOP_PENDING set and is participating in a group stop. 309 * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
266 * Group stop states are cleared and the group stop count is consumed if 310 * Group stop states are cleared and the group stop count is consumed if
267 * %GROUP_STOP_CONSUME was set. If the consumption completes the group 311 * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
268 * stop, the appropriate %SIGNAL_* flags are set. 312 * stop, the appropriate %SIGNAL_* flags are set.
269 * 313 *
270 * CONTEXT: 314 * CONTEXT:
@@ -277,11 +321,11 @@ void task_clear_group_stop_pending(struct task_struct *task)
277static bool task_participate_group_stop(struct task_struct *task) 321static bool task_participate_group_stop(struct task_struct *task)
278{ 322{
279 struct signal_struct *sig = task->signal; 323 struct signal_struct *sig = task->signal;
280 bool consume = task->group_stop & GROUP_STOP_CONSUME; 324 bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
281 325
282 WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); 326 WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
283 327
284 task_clear_group_stop_pending(task); 328 task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
285 329
286 if (!consume) 330 if (!consume)
287 return false; 331 return false;
@@ -449,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
449 return 1; 493 return 1;
450 if (handler != SIG_IGN && handler != SIG_DFL) 494 if (handler != SIG_IGN && handler != SIG_DFL)
451 return 0; 495 return 0;
452 return !tracehook_consider_fatal_signal(tsk, sig); 496 /* if ptraced, let the tracer determine */
497 return !tsk->ptrace;
453} 498}
454 499
455/* 500/*
@@ -604,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
604 * is to alert stop-signal processing code when another 649 * is to alert stop-signal processing code when another
605 * processor has come along and cleared the flag. 650 * processor has come along and cleared the flag.
606 */ 651 */
607 current->group_stop |= GROUP_STOP_DEQUEUED; 652 current->jobctl |= JOBCTL_STOP_DEQUEUED;
608 } 653 }
609 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 654 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
610 /* 655 /*
@@ -773,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info,
773 return security_task_kill(t, info, sig, 0); 818 return security_task_kill(t, info, sig, 0);
774} 819}
775 820
821/**
822 * ptrace_trap_notify - schedule trap to notify ptracer
823 * @t: tracee wanting to notify tracer
824 *
825 * This function schedules sticky ptrace trap which is cleared on the next
826 * TRAP_STOP to notify ptracer of an event. @t must have been seized by
827 * ptracer.
828 *
829 * If @t is running, STOP trap will be taken. If trapped for STOP and
830 * ptracer is listening for events, tracee is woken up so that it can
831 * re-trap for the new event. If trapped otherwise, STOP trap will be
832 * eventually taken without returning to userland after the existing traps
833 * are finished by PTRACE_CONT.
834 *
835 * CONTEXT:
836 * Must be called with @t->sighand->siglock held.
837 */
838static void ptrace_trap_notify(struct task_struct *t)
839{
840 WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
841 assert_spin_locked(&t->sighand->siglock);
842
843 task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
844 signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
845}
846
776/* 847/*
777 * Handle magic process-wide effects of stop/continue signals. Unlike 848 * Handle magic process-wide effects of stop/continue signals. Unlike
778 * the signal actions, these happen immediately at signal-generation 849 * the signal actions, these happen immediately at signal-generation
@@ -809,9 +880,12 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
809 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 880 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
810 t = p; 881 t = p;
811 do { 882 do {
812 task_clear_group_stop_pending(t); 883 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
813 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 884 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
814 wake_up_state(t, __TASK_STOPPED); 885 if (likely(!(t->ptrace & PT_SEIZED)))
886 wake_up_state(t, __TASK_STOPPED);
887 else
888 ptrace_trap_notify(t);
815 } while_each_thread(p, t); 889 } while_each_thread(p, t);
816 890
817 /* 891 /*
@@ -908,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
908 if (sig_fatal(p, sig) && 982 if (sig_fatal(p, sig) &&
909 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 983 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
910 !sigismember(&t->real_blocked, sig) && 984 !sigismember(&t->real_blocked, sig) &&
911 (sig == SIGKILL || 985 (sig == SIGKILL || !t->ptrace)) {
912 !tracehook_consider_fatal_signal(t, sig))) {
913 /* 986 /*
914 * This signal will be fatal to the whole group. 987 * This signal will be fatal to the whole group.
915 */ 988 */
@@ -925,7 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
925 signal->group_stop_count = 0; 998 signal->group_stop_count = 0;
926 t = p; 999 t = p;
927 do { 1000 do {
928 task_clear_group_stop_pending(t); 1001 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
929 sigaddset(&t->pending.signal, SIGKILL); 1002 sigaddset(&t->pending.signal, SIGKILL);
930 signal_wake_up(t, 1); 1003 signal_wake_up(t, 1);
931 } while_each_thread(p, t); 1004 } while_each_thread(p, t);
@@ -1160,7 +1233,7 @@ int zap_other_threads(struct task_struct *p)
1160 p->signal->group_stop_count = 0; 1233 p->signal->group_stop_count = 0;
1161 1234
1162 while_each_thread(p, t) { 1235 while_each_thread(p, t) {
1163 task_clear_group_stop_pending(t); 1236 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
1164 count++; 1237 count++;
1165 1238
1166 /* Don't bother with already dead threads */ 1239 /* Don't bother with already dead threads */
@@ -1178,18 +1251,25 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1178{ 1251{
1179 struct sighand_struct *sighand; 1252 struct sighand_struct *sighand;
1180 1253
1181 rcu_read_lock();
1182 for (;;) { 1254 for (;;) {
1255 local_irq_save(*flags);
1256 rcu_read_lock();
1183 sighand = rcu_dereference(tsk->sighand); 1257 sighand = rcu_dereference(tsk->sighand);
1184 if (unlikely(sighand == NULL)) 1258 if (unlikely(sighand == NULL)) {
1259 rcu_read_unlock();
1260 local_irq_restore(*flags);
1185 break; 1261 break;
1262 }
1186 1263
1187 spin_lock_irqsave(&sighand->siglock, *flags); 1264 spin_lock(&sighand->siglock);
1188 if (likely(sighand == tsk->sighand)) 1265 if (likely(sighand == tsk->sighand)) {
1266 rcu_read_unlock();
1189 break; 1267 break;
1190 spin_unlock_irqrestore(&sighand->siglock, *flags); 1268 }
1269 spin_unlock(&sighand->siglock);
1270 rcu_read_unlock();
1271 local_irq_restore(*flags);
1191 } 1272 }
1192 rcu_read_unlock();
1193 1273
1194 return sighand; 1274 return sighand;
1195} 1275}
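A minimal userspace sketch of the lock-then-revalidate loop that __lock_task_sighand() is reworked into above: take the lock through a pointer other threads may swap, confirm the pointer is still current before trusting the lock, otherwise drop it and retry. The struct and function names are invented for the example, pthreads stands in for siglock, and the RCU lifetime guarantee the kernel relies on is not modelled here.

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct obj {
	pthread_mutex_t lock;
};

struct holder {
	_Atomic(struct obj *) cur;	/* may be replaced concurrently */
};

static struct obj *lock_current_obj(struct holder *h)
{
	for (;;) {
		struct obj *o = atomic_load(&h->cur);

		if (!o)
			return NULL;		/* object went away */

		pthread_mutex_lock(&o->lock);
		if (o == atomic_load(&h->cur))
			return o;		/* still current: keep the lock */

		/* swapped out while we were acquiring: unlock and retry */
		pthread_mutex_unlock(&o->lock);
	}
}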
@@ -1504,22 +1584,22 @@ ret:
1504 * Let a parent know about the death of a child. 1584 * Let a parent know about the death of a child.
1505 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1585 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1506 * 1586 *
1507 * Returns -1 if our parent ignored us and so we've switched to 1587 * Returns true if our parent ignored us and so we've switched to
1508 * self-reaping, or else @sig. 1588 * self-reaping.
1509 */ 1589 */
1510int do_notify_parent(struct task_struct *tsk, int sig) 1590bool do_notify_parent(struct task_struct *tsk, int sig)
1511{ 1591{
1512 struct siginfo info; 1592 struct siginfo info;
1513 unsigned long flags; 1593 unsigned long flags;
1514 struct sighand_struct *psig; 1594 struct sighand_struct *psig;
1515 int ret = sig; 1595 bool autoreap = false;
1516 1596
1517 BUG_ON(sig == -1); 1597 BUG_ON(sig == -1);
1518 1598
1519 /* do_notify_parent_cldstop should have been called instead. */ 1599 /* do_notify_parent_cldstop should have been called instead. */
1520 BUG_ON(task_is_stopped_or_traced(tsk)); 1600 BUG_ON(task_is_stopped_or_traced(tsk));
1521 1601
1522 BUG_ON(!task_ptrace(tsk) && 1602 BUG_ON(!tsk->ptrace &&
1523 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1603 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1524 1604
1525 info.si_signo = sig; 1605 info.si_signo = sig;
@@ -1558,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1558 1638
1559 psig = tsk->parent->sighand; 1639 psig = tsk->parent->sighand;
1560 spin_lock_irqsave(&psig->siglock, flags); 1640 spin_lock_irqsave(&psig->siglock, flags);
1561 if (!task_ptrace(tsk) && sig == SIGCHLD && 1641 if (!tsk->ptrace && sig == SIGCHLD &&
1562 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1642 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1563 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1643 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1564 /* 1644 /*
@@ -1576,16 +1656,16 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1576 * is implementation-defined: we do (if you don't want 1656 * is implementation-defined: we do (if you don't want
1577 * it, just use SIG_IGN instead). 1657 * it, just use SIG_IGN instead).
1578 */ 1658 */
1579 ret = tsk->exit_signal = -1; 1659 autoreap = true;
1580 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) 1660 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1581 sig = -1; 1661 sig = 0;
1582 } 1662 }
1583 if (valid_signal(sig) && sig > 0) 1663 if (valid_signal(sig) && sig)
1584 __group_send_sig_info(sig, &info, tsk->parent); 1664 __group_send_sig_info(sig, &info, tsk->parent);
1585 __wake_up_parent(tsk, tsk->parent); 1665 __wake_up_parent(tsk, tsk->parent);
1586 spin_unlock_irqrestore(&psig->siglock, flags); 1666 spin_unlock_irqrestore(&psig->siglock, flags);
1587 1667
1588 return ret; 1668 return autoreap;
1589} 1669}
1590 1670
1591/** 1671/**
@@ -1658,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1658 1738
1659static inline int may_ptrace_stop(void) 1739static inline int may_ptrace_stop(void)
1660{ 1740{
1661 if (!likely(task_ptrace(current))) 1741 if (!likely(current->ptrace))
1662 return 0; 1742 return 0;
1663 /* 1743 /*
1664 * Are we in the middle of do_coredump? 1744 * Are we in the middle of do_coredump?
@@ -1687,15 +1767,6 @@ static int sigkill_pending(struct task_struct *tsk)
1687} 1767}
1688 1768
1689/* 1769/*
1690 * Test whether the target task of the usual cldstop notification - the
1691 * real_parent of @child - is in the same group as the ptracer.
1692 */
1693static bool real_parent_is_ptracer(struct task_struct *child)
1694{
1695 return same_thread_group(child->parent, child->real_parent);
1696}
1697
1698/*
1699 * This must be called with current->sighand->siglock held. 1770 * This must be called with current->sighand->siglock held.
1700 * 1771 *
1701 * This should be the path for all ptrace stops. 1772 * This should be the path for all ptrace stops.
@@ -1732,31 +1803,34 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1732 } 1803 }
1733 1804
1734 /* 1805 /*
1735 * If @why is CLD_STOPPED, we're trapping to participate in a group 1806 * We're committing to trapping. TRACED should be visible before
1736 * stop. Do the bookkeeping. Note that if SIGCONT was delivered 1807 * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
1737 * while siglock was released for the arch hook, PENDING could be 1808 * Also, transition to TRACED and updates to ->jobctl should be
1738 * clear now. We act as if SIGCONT is received after TASK_TRACED 1809 * atomic with respect to siglock and should be done after the arch
1739 * is entered - ignore it. 1810 * hook as siglock is released and regrabbed across it.
1740 */ 1811 */
1741 if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) 1812 set_current_state(TASK_TRACED);
1742 gstop_done = task_participate_group_stop(current);
1743 1813
1744 current->last_siginfo = info; 1814 current->last_siginfo = info;
1745 current->exit_code = exit_code; 1815 current->exit_code = exit_code;
1746 1816
1747 /* 1817 /*
1748 * TRACED should be visible before TRAPPING is cleared; otherwise, 1818 * If @why is CLD_STOPPED, we're trapping to participate in a group
1749 * the tracer might fail do_wait(). 1819 * stop. Do the bookkeeping. Note that if SIGCONT was delievered
1820 * across siglock relocks since INTERRUPT was scheduled, PENDING
1821 * could be clear now. We act as if SIGCONT is received after
1822 * TASK_TRACED is entered - ignore it.
1750 */ 1823 */
1751 set_current_state(TASK_TRACED); 1824 if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
1825 gstop_done = task_participate_group_stop(current);
1752 1826
1753 /* 1827 /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
1754 * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and 1828 task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
1755 * transition to TASK_TRACED should be atomic with respect to 1829 if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
1756 * siglock. This should be done after the arch hook as siglock is 1830 task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
1757 * released and regrabbed across it. 1831
1758 */ 1832 /* entering a trap, clear TRAPPING */
1759 task_clear_group_stop_trapping(current); 1833 task_clear_jobctl_trapping(current);
1760 1834
1761 spin_unlock_irq(&current->sighand->siglock); 1835 spin_unlock_irq(&current->sighand->siglock);
1762 read_lock(&tasklist_lock); 1836 read_lock(&tasklist_lock);
@@ -1772,7 +1846,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1772 * separately unless they're gonna be duplicates. 1846 * separately unless they're gonna be duplicates.
1773 */ 1847 */
1774 do_notify_parent_cldstop(current, true, why); 1848 do_notify_parent_cldstop(current, true, why);
1775 if (gstop_done && !real_parent_is_ptracer(current)) 1849 if (gstop_done && ptrace_reparented(current))
1776 do_notify_parent_cldstop(current, false, why); 1850 do_notify_parent_cldstop(current, false, why);
1777 1851
1778 /* 1852 /*
@@ -1792,9 +1866,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1792 * 1866 *
1793 * If @gstop_done, the ptracer went away between group stop 1867 * If @gstop_done, the ptracer went away between group stop
1794 * completion and here. During detach, it would have set 1868 * completion and here. During detach, it would have set
1795 * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED 1869 * JOBCTL_STOP_PENDING on us and we'll re-enter
1796 * in do_signal_stop() on return, so notifying the real 1870 * TASK_STOPPED in do_signal_stop() on return, so notifying
1797 * parent of the group stop completion is enough. 1871 * the real parent of the group stop completion is enough.
1798 */ 1872 */
1799 if (gstop_done) 1873 if (gstop_done)
1800 do_notify_parent_cldstop(current, false, why); 1874 do_notify_parent_cldstop(current, false, why);
@@ -1820,6 +1894,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1820 spin_lock_irq(&current->sighand->siglock); 1894 spin_lock_irq(&current->sighand->siglock);
1821 current->last_siginfo = NULL; 1895 current->last_siginfo = NULL;
1822 1896
1897 /* LISTENING can be set only during STOP traps, clear it */
1898 current->jobctl &= ~JOBCTL_LISTENING;
1899
1823 /* 1900 /*
1824 * Queued signals ignored us while we were stopped for tracing. 1901 * Queued signals ignored us while we were stopped for tracing.
1825 * So check for any that we should take before resuming user mode. 1902 * So check for any that we should take before resuming user mode.
@@ -1828,44 +1905,66 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1828 recalc_sigpending_tsk(current); 1905 recalc_sigpending_tsk(current);
1829} 1906}
1830 1907
1831void ptrace_notify(int exit_code) 1908static void ptrace_do_notify(int signr, int exit_code, int why)
1832{ 1909{
1833 siginfo_t info; 1910 siginfo_t info;
1834 1911
1835 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1836
1837 memset(&info, 0, sizeof info); 1912 memset(&info, 0, sizeof info);
1838 info.si_signo = SIGTRAP; 1913 info.si_signo = signr;
1839 info.si_code = exit_code; 1914 info.si_code = exit_code;
1840 info.si_pid = task_pid_vnr(current); 1915 info.si_pid = task_pid_vnr(current);
1841 info.si_uid = current_uid(); 1916 info.si_uid = current_uid();
1842 1917
1843 /* Let the debugger run. */ 1918 /* Let the debugger run. */
1919 ptrace_stop(exit_code, why, 1, &info);
1920}
1921
1922void ptrace_notify(int exit_code)
1923{
1924 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1925
1844 spin_lock_irq(&current->sighand->siglock); 1926 spin_lock_irq(&current->sighand->siglock);
1845 ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); 1927 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
1846 spin_unlock_irq(&current->sighand->siglock); 1928 spin_unlock_irq(&current->sighand->siglock);
1847} 1929}
1848 1930
1849/* 1931/**
1850 * This performs the stopping for SIGSTOP and other stop signals. 1932 * do_signal_stop - handle group stop for SIGSTOP and other stop signals
1851 * We have to stop all threads in the thread group. 1933 * @signr: signr causing group stop if initiating
1852 * Returns non-zero if we've actually stopped and released the siglock. 1934 *
1853 * Returns zero if we didn't stop and still hold the siglock. 1935 * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
1936 * and participate in it. If already set, participate in the existing
1937 * group stop. If participated in a group stop (and thus slept), %true is
1938 * returned with siglock released.
1939 *
1940 * If ptraced, this function doesn't handle stop itself. Instead,
1941 * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
1942 * untouched. The caller must ensure that INTERRUPT trap handling takes
1943 * place afterwards.
1944 *
1945 * CONTEXT:
1946 * Must be called with @current->sighand->siglock held, which is released
1947 * on %true return.
1948 *
1949 * RETURNS:
1950 * %false if group stop is already cancelled or ptrace trap is scheduled.
1951 * %true if participated in group stop.
1854 */ 1952 */
1855static int do_signal_stop(int signr) 1953static bool do_signal_stop(int signr)
1954 __releases(&current->sighand->siglock)
1856{ 1955{
1857 struct signal_struct *sig = current->signal; 1956 struct signal_struct *sig = current->signal;
1858 1957
1859 if (!(current->group_stop & GROUP_STOP_PENDING)) { 1958 if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
1860 unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; 1959 unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
1861 struct task_struct *t; 1960 struct task_struct *t;
1862 1961
1863 /* signr will be recorded in task->group_stop for retries */ 1962 /* signr will be recorded in task->jobctl for retries */
1864 WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); 1963 WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
1865 1964
1866 if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || 1965 if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
1867 unlikely(signal_group_exit(sig))) 1966 unlikely(signal_group_exit(sig)))
1868 return 0; 1967 return false;
1869 /* 1968 /*
1870 * There is no group stop already in progress. We must 1969 * There is no group stop already in progress. We must
1871 * initiate one now. 1970 * initiate one now.
@@ -1888,28 +1987,32 @@ static int do_signal_stop(int signr)
1888 if (!(sig->flags & SIGNAL_STOP_STOPPED)) 1987 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1889 sig->group_exit_code = signr; 1988 sig->group_exit_code = signr;
1890 else 1989 else
1891 WARN_ON_ONCE(!task_ptrace(current)); 1990 WARN_ON_ONCE(!current->ptrace);
1991
1992 sig->group_stop_count = 0;
1993
1994 if (task_set_jobctl_pending(current, signr | gstop))
1995 sig->group_stop_count++;
1892 1996
1893 current->group_stop &= ~GROUP_STOP_SIGMASK;
1894 current->group_stop |= signr | gstop;
1895 sig->group_stop_count = 1;
1896 for (t = next_thread(current); t != current; 1997 for (t = next_thread(current); t != current;
1897 t = next_thread(t)) { 1998 t = next_thread(t)) {
1898 t->group_stop &= ~GROUP_STOP_SIGMASK;
1899 /* 1999 /*
1900 * Setting state to TASK_STOPPED for a group 2000 * Setting state to TASK_STOPPED for a group
1901 * stop is always done with the siglock held, 2001 * stop is always done with the siglock held,
1902 * so this check has no races. 2002 * so this check has no races.
1903 */ 2003 */
1904 if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { 2004 if (!task_is_stopped(t) &&
1905 t->group_stop |= signr | gstop; 2005 task_set_jobctl_pending(t, signr | gstop)) {
1906 sig->group_stop_count++; 2006 sig->group_stop_count++;
1907 signal_wake_up(t, 0); 2007 if (likely(!(t->ptrace & PT_SEIZED)))
2008 signal_wake_up(t, 0);
2009 else
2010 ptrace_trap_notify(t);
1908 } 2011 }
1909 } 2012 }
1910 } 2013 }
1911retry: 2014
1912 if (likely(!task_ptrace(current))) { 2015 if (likely(!current->ptrace)) {
1913 int notify = 0; 2016 int notify = 0;
1914 2017
1915 /* 2018 /*
@@ -1940,43 +2043,65 @@ retry:
1940 2043
1941 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2044 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1942 schedule(); 2045 schedule();
1943 2046 return true;
1944 spin_lock_irq(&current->sighand->siglock);
1945 } else { 2047 } else {
1946 ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, 2048 /*
1947 CLD_STOPPED, 0, NULL); 2049 * While ptraced, group stop is handled by STOP trap.
1948 current->exit_code = 0; 2050 * Schedule it and let the caller deal with it.
2051 */
2052 task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
2053 return false;
1949 } 2054 }
2055}
1950 2056
1951 /* 2057/**
1952 * GROUP_STOP_PENDING could be set if another group stop has 2058 * do_jobctl_trap - take care of ptrace jobctl traps
1953 * started since being woken up or ptrace wants us to transit 2059 *
1954 * between TASK_STOPPED and TRACED. Retry group stop. 2060 * When PT_SEIZED, it's used for both group stop and explicit
1955 */ 2061 * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with
1956 if (current->group_stop & GROUP_STOP_PENDING) { 2062 * accompanying siginfo. If stopped, lower eight bits of exit_code contain
1957 WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); 2063 * the stop signal; otherwise, %SIGTRAP.
1958 goto retry; 2064 *
2065 * When !PT_SEIZED, it's used only for group stop trap with stop signal
2066 * number as exit_code and no siginfo.
2067 *
2068 * CONTEXT:
2069 * Must be called with @current->sighand->siglock held, which may be
2070 * released and re-acquired before returning with intervening sleep.
2071 */
2072static void do_jobctl_trap(void)
2073{
2074 struct signal_struct *signal = current->signal;
2075 int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
2076
2077 if (current->ptrace & PT_SEIZED) {
2078 if (!signal->group_stop_count &&
2079 !(signal->flags & SIGNAL_STOP_STOPPED))
2080 signr = SIGTRAP;
2081 WARN_ON_ONCE(!signr);
2082 ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
2083 CLD_STOPPED);
2084 } else {
2085 WARN_ON_ONCE(!signr);
2086 ptrace_stop(signr, CLD_STOPPED, 0, NULL);
2087 current->exit_code = 0;
1959 } 2088 }
1960
1961 /* PTRACE_ATTACH might have raced with task killing, clear trapping */
1962 task_clear_group_stop_trapping(current);
1963
1964 spin_unlock_irq(&current->sighand->siglock);
1965
1966 tracehook_finish_jctl();
1967
1968 return 1;
1969} 2089}
1970 2090
1971static int ptrace_signal(int signr, siginfo_t *info, 2091static int ptrace_signal(int signr, siginfo_t *info,
1972 struct pt_regs *regs, void *cookie) 2092 struct pt_regs *regs, void *cookie)
1973{ 2093{
1974 if (!task_ptrace(current))
1975 return signr;
1976
1977 ptrace_signal_deliver(regs, cookie); 2094 ptrace_signal_deliver(regs, cookie);
1978 2095 /*
1979 /* Let the debugger run. */ 2096 * We do not check sig_kernel_stop(signr) but set this marker
2097 * unconditionally because we do not know whether debugger will
2098 * change signr. This flag has no meaning unless we are going
2099 * to stop after return from ptrace_stop(). In this case it will
2100 * be checked in do_signal_stop(), we should only stop if it was
2101 * not cleared by SIGCONT while we were sleeping. See also the
2102 * comment in dequeue_signal().
2103 */
2104 current->jobctl |= JOBCTL_STOP_DEQUEUED;
1980 ptrace_stop(signr, CLD_TRAPPED, 0, info); 2105 ptrace_stop(signr, CLD_TRAPPED, 0, info);
1981 2106
1982 /* We're back. Did the debugger cancel the sig? */ 2107 /* We're back. Did the debugger cancel the sig? */
@@ -2032,7 +2157,6 @@ relock:
2032 * the CLD_ si_code into SIGNAL_CLD_MASK bits. 2157 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
2033 */ 2158 */
2034 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { 2159 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
2035 struct task_struct *leader;
2036 int why; 2160 int why;
2037 2161
2038 if (signal->flags & SIGNAL_CLD_CONTINUED) 2162 if (signal->flags & SIGNAL_CLD_CONTINUED)
@@ -2053,13 +2177,11 @@ relock:
2053 * a duplicate. 2177 * a duplicate.
2054 */ 2178 */
2055 read_lock(&tasklist_lock); 2179 read_lock(&tasklist_lock);
2056
2057 do_notify_parent_cldstop(current, false, why); 2180 do_notify_parent_cldstop(current, false, why);
2058 2181
2059 leader = current->group_leader; 2182 if (ptrace_reparented(current->group_leader))
2060 if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) 2183 do_notify_parent_cldstop(current->group_leader,
2061 do_notify_parent_cldstop(leader, true, why); 2184 true, why);
2062
2063 read_unlock(&tasklist_lock); 2185 read_unlock(&tasklist_lock);
2064 2186
2065 goto relock; 2187 goto relock;
@@ -2067,37 +2189,31 @@ relock:
2067 2189
2068 for (;;) { 2190 for (;;) {
2069 struct k_sigaction *ka; 2191 struct k_sigaction *ka;
2070 /* 2192
2071 * Tracing can induce an artificial signal and choose sigaction. 2193 if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
2072 * The return value in @signr determines the default action, 2194 do_signal_stop(0))
2073 * but @info->si_signo is the signal number we will report.
2074 */
2075 signr = tracehook_get_signal(current, regs, info, return_ka);
2076 if (unlikely(signr < 0))
2077 goto relock; 2195 goto relock;
2078 if (unlikely(signr != 0))
2079 ka = return_ka;
2080 else {
2081 if (unlikely(current->group_stop &
2082 GROUP_STOP_PENDING) && do_signal_stop(0))
2083 goto relock;
2084 2196
2085 signr = dequeue_signal(current, &current->blocked, 2197 if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
2086 info); 2198 do_jobctl_trap();
2199 spin_unlock_irq(&sighand->siglock);
2200 goto relock;
2201 }
2087 2202
2088 if (!signr) 2203 signr = dequeue_signal(current, &current->blocked, info);
2089 break; /* will return 0 */
2090 2204
2091 if (signr != SIGKILL) { 2205 if (!signr)
2092 signr = ptrace_signal(signr, info, 2206 break; /* will return 0 */
2093 regs, cookie);
2094 if (!signr)
2095 continue;
2096 }
2097 2207
2098 ka = &sighand->action[signr-1]; 2208 if (unlikely(current->ptrace) && signr != SIGKILL) {
2209 signr = ptrace_signal(signr, info,
2210 regs, cookie);
2211 if (!signr)
2212 continue;
2099 } 2213 }
2100 2214
2215 ka = &sighand->action[signr-1];
2216
2101 /* Trace actually delivered signals. */ 2217 /* Trace actually delivered signals. */
2102 trace_signal_deliver(signr, info, ka); 2218 trace_signal_deliver(signr, info, ka);
2103 2219
@@ -2253,7 +2369,7 @@ void exit_signals(struct task_struct *tsk)
2253 signotset(&unblocked); 2369 signotset(&unblocked);
2254 retarget_shared_pending(tsk, &unblocked); 2370 retarget_shared_pending(tsk, &unblocked);
2255 2371
2256 if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && 2372 if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
2257 task_participate_group_stop(tsk)) 2373 task_participate_group_stop(tsk))
2258 group_stop = CLD_STOPPED; 2374 group_stop = CLD_STOPPED;
2259out: 2375out:
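To make the new jobctl bookkeeping easier to follow, here is a small self-contained C sketch of the same set/clear pattern: pending bits are set only while the task is not dying (a new stop signal replaces any previously recorded one), and clearing STOP_PENDING drags its companion bits with it, dropping TRAPPING once nothing is left pending. The bit values and names are made up for the example; this is not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

#define STOP_SIGMASK	0x0000ffffU	/* low bits record the stop signal */
#define STOP_DEQUEUED	0x00010000U
#define STOP_PENDING	0x00020000U
#define STOP_CONSUME	0x00040000U
#define TRAPPING	0x00080000U
#define PENDING_MASK	STOP_PENDING

static bool set_pending(unsigned int *jobctl, unsigned int mask, bool dying)
{
	if (dying)
		return false;			/* becomes a no-op for dying tasks */
	if (mask & STOP_SIGMASK)
		*jobctl &= ~STOP_SIGMASK;	/* new signo replaces the old one */
	*jobctl |= mask;
	return true;
}

static void clear_pending(unsigned int *jobctl, unsigned int mask)
{
	if (mask & STOP_PENDING)
		mask |= STOP_CONSUME | STOP_DEQUEUED;
	*jobctl &= ~mask;
	if (!(*jobctl & PENDING_MASK))
		*jobctl &= ~TRAPPING;		/* nothing pending: drop TRAPPING */
}

int main(void)
{
	unsigned int jobctl = 0;

	/* 19 stands in for a stop signal number */
	set_pending(&jobctl, 19 | STOP_PENDING | STOP_CONSUME, false);
	clear_pending(&jobctl, STOP_PENDING);
	printf("jobctl after clear: %#x\n", jobctl);	/* prints 0x13: only the signo bits remain */
	return 0;
}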
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 40cf63ddd4b3..fca82c32042b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -315,16 +315,24 @@ static inline void invoke_softirq(void)
315{ 315{
316 if (!force_irqthreads) 316 if (!force_irqthreads)
317 __do_softirq(); 317 __do_softirq();
318 else 318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET);
319 wakeup_softirqd(); 321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET);
323 }
320} 324}
321#else 325#else
322static inline void invoke_softirq(void) 326static inline void invoke_softirq(void)
323{ 327{
324 if (!force_irqthreads) 328 if (!force_irqthreads)
325 do_softirq(); 329 do_softirq();
326 else 330 else {
331 __local_bh_disable((unsigned long)__builtin_return_address(0),
332 SOFTIRQ_OFFSET);
327 wakeup_softirqd(); 333 wakeup_softirqd();
334 __local_bh_enable(SOFTIRQ_OFFSET);
335 }
328} 336}
329#endif 337#endif
330 338
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index eb212f8f8bc8..d20c6983aad9 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -26,12 +26,18 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
26EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
27 27
28/* 28/*
29 * Architectures that do not implement save_stack_trace_tsk get this 29 * Architectures that do not implement save_stack_trace_tsk or
30 * weak alias and a once-per-bootup warning (whenever this facility 30 * save_stack_trace_regs get this weak alias and a once-per-bootup warning
31 * is utilized - for example by procfs): 31 * (whenever this facility is utilized - for example by procfs):
32 */ 32 */
33__weak void 33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{ 35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); 36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37} 37}
38
39__weak void
40save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
41{
42 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
43}
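The save_stack_trace_regs() fallback added here relies on weak linkage, which can be sketched in ordinary userspace C: a weakly bound default definition is used unless a stronger definition elsewhere in the program overrides it at link time. The function name below is illustrative only.

#include <stdio.h>

/* Weak default: overridden if another object file defines the same symbol. */
__attribute__((weak)) void save_regs_backtrace(void)
{
	fprintf(stderr, "save_regs_backtrace() not implemented in this build\n");
}

int main(void)
{
	save_regs_backtrace();	/* falls back to the weak stub above */
	return 0;
}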
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e3516b29076c..c1124752e1d3 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -136,10 +136,11 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
136static DEFINE_MUTEX(stop_cpus_mutex); 136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); 137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138 138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) 139static void queue_stop_cpus_work(const struct cpumask *cpumask,
140 cpu_stop_fn_t fn, void *arg,
141 struct cpu_stop_done *done)
140{ 142{
141 struct cpu_stop_work *work; 143 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu; 144 unsigned int cpu;
144 145
145 /* initialize works and done */ 146 /* initialize works and done */
@@ -147,9 +148,8 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
147 work = &per_cpu(stop_cpus_work, cpu); 148 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn; 149 work->fn = fn;
149 work->arg = arg; 150 work->arg = arg;
150 work->done = &done; 151 work->done = done;
151 } 152 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153 153
154 /* 154 /*
155 * Disable preemption while queueing to avoid getting 155 * Disable preemption while queueing to avoid getting
@@ -161,7 +161,15 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), 161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu)); 162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable(); 163 preempt_enable();
164}
164 165
166static int __stop_cpus(const struct cpumask *cpumask,
167 cpu_stop_fn_t fn, void *arg)
168{
169 struct cpu_stop_done done;
170
171 cpu_stop_init_done(&done, cpumask_weight(cpumask));
172 queue_stop_cpus_work(cpumask, fn, arg, &done);
165 wait_for_completion(&done.completion); 173 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT; 174 return done.executed ? done.ret : -ENOENT;
167} 175}
@@ -431,8 +439,15 @@ static int stop_machine_cpu_stop(void *data)
431 struct stop_machine_data *smdata = data; 439 struct stop_machine_data *smdata = data;
432 enum stopmachine_state curstate = STOPMACHINE_NONE; 440 enum stopmachine_state curstate = STOPMACHINE_NONE;
433 int cpu = smp_processor_id(), err = 0; 441 int cpu = smp_processor_id(), err = 0;
442 unsigned long flags;
434 bool is_active; 443 bool is_active;
435 444
445 /*
446 * When called from stop_machine_from_inactive_cpu(), irq might
447 * already be disabled. Save the state and restore it on exit.
448 */
449 local_save_flags(flags);
450
436 if (!smdata->active_cpus) 451 if (!smdata->active_cpus)
437 is_active = cpu == cpumask_first(cpu_online_mask); 452 is_active = cpu == cpumask_first(cpu_online_mask);
438 else 453 else
@@ -460,7 +475,7 @@ static int stop_machine_cpu_stop(void *data)
460 } 475 }
461 } while (curstate != STOPMACHINE_EXIT); 476 } while (curstate != STOPMACHINE_EXIT);
462 477
463 local_irq_enable(); 478 local_irq_restore(flags);
464 return err; 479 return err;
465} 480}
466 481
@@ -487,4 +502,57 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
487} 502}
488EXPORT_SYMBOL_GPL(stop_machine); 503EXPORT_SYMBOL_GPL(stop_machine);
489 504
505/**
506 * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
507 * @fn: the function to run
508 * @data: the data ptr for the @fn()
509 * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
510 *
511 * This is identical to stop_machine() but can be called from a CPU which
512 * is not active. The local CPU is in the process of hotplug (so no other
513 * CPU hotplug can start) and not marked active and doesn't have enough
514 * context to sleep.
515 *
516 * This function provides stop_machine() functionality for such state by
517 * using busy-wait for synchronization and executing @fn directly for local
518 * CPU.
519 *
520 * CONTEXT:
521 * Local CPU is inactive. Temporarily stops all active CPUs.
522 *
523 * RETURNS:
524 * 0 if all executions of @fn returned 0, any non zero return value if any
525 * returned non zero.
526 */
527int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
528 const struct cpumask *cpus)
529{
530 struct stop_machine_data smdata = { .fn = fn, .data = data,
531 .active_cpus = cpus };
532 struct cpu_stop_done done;
533 int ret;
534
535 /* Local CPU must be inactive and CPU hotplug in progress. */
536 BUG_ON(cpu_active(raw_smp_processor_id()));
537 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */
538
539 /* No proper task established and can't sleep - busy wait for lock. */
540 while (!mutex_trylock(&stop_cpus_mutex))
541 cpu_relax();
542
543 /* Schedule work on other CPUs and execute directly for local CPU */
544 set_state(&smdata, STOPMACHINE_PREPARE);
545 cpu_stop_init_done(&done, num_active_cpus());
546 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
547 &done);
548 ret = stop_machine_cpu_stop(&smdata);
549
550 /* Busy wait for completion. */
551 while (!completion_done(&done.completion))
552 cpu_relax();
553
554 mutex_unlock(&stop_cpus_mutex);
555 return ret ?: done.ret;
556}
557
490#endif /* CONFIG_STOP_MACHINE */ 558#endif /* CONFIG_STOP_MACHINE */
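A rough userspace analogue of the two busy-wait idioms stop_machine_from_inactive_cpu() depends on: spinning on a trylock because the caller cannot sleep on the mutex, and polling a completion counter instead of blocking on it. pthreads and C11 atomics stand in for the kernel primitives; the names are invented, and the non-sleeping constraint itself obviously cannot be reproduced in userspace.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>

static pthread_mutex_t stop_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int remaining;		/* workers still running */

static void *worker(void *arg)
{
	(void)arg;
	/* ... the stop-machine style callback would run here ... */
	atomic_fetch_sub(&remaining, 1);
	return NULL;
}

static void run_polling(int nworkers)	/* assumes nworkers <= 8 */
{
	pthread_t tid[8];
	int i;

	/* cannot block on the lock: busy-wait for it instead */
	while (pthread_mutex_trylock(&stop_lock) != 0)
		sched_yield();			/* stands in for cpu_relax() */

	atomic_store(&remaining, nworkers);
	for (i = 0; i < nworkers; i++)
		pthread_create(&tid[i], NULL, worker, NULL);

	/* poll for completion rather than sleeping on it */
	while (atomic_load(&remaining) > 0)
		sched_yield();

	for (i = 0; i < nworkers; i++)
		pthread_join(tid[i], NULL);
	pthread_mutex_unlock(&stop_lock);
}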
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f175d98bd355..11d65b531e50 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1590,16 +1590,11 @@ void sysctl_head_get(struct ctl_table_header *head)
1590 spin_unlock(&sysctl_lock); 1590 spin_unlock(&sysctl_lock);
1591} 1591}
1592 1592
1593static void free_head(struct rcu_head *rcu)
1594{
1595 kfree(container_of(rcu, struct ctl_table_header, rcu));
1596}
1597
1598void sysctl_head_put(struct ctl_table_header *head) 1593void sysctl_head_put(struct ctl_table_header *head)
1599{ 1594{
1600 spin_lock(&sysctl_lock); 1595 spin_lock(&sysctl_lock);
1601 if (!--head->count) 1596 if (!--head->count)
1602 call_rcu(&head->rcu, free_head); 1597 kfree_rcu(head, rcu);
1603 spin_unlock(&sysctl_lock); 1598 spin_unlock(&sysctl_lock);
1604} 1599}
1605 1600
@@ -1971,10 +1966,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1971 start_unregistering(header); 1966 start_unregistering(header);
1972 if (!--header->parent->count) { 1967 if (!--header->parent->count) {
1973 WARN_ON(1); 1968 WARN_ON(1);
1974 call_rcu(&header->parent->rcu, free_head); 1969 kfree_rcu(header->parent, rcu);
1975 } 1970 }
1976 if (!--header->count) 1971 if (!--header->count)
1977 call_rcu(&header->rcu, free_head); 1972 kfree_rcu(header, rcu);
1978 spin_unlock(&sysctl_lock); 1973 spin_unlock(&sysctl_lock);
1979} 1974}
1980 1975
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 342408cf68dd..2b021b0e8507 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -604,6 +604,12 @@ static struct timespec timekeeping_suspend_time;
604 */ 604 */
605static void __timekeeping_inject_sleeptime(struct timespec *delta) 605static void __timekeeping_inject_sleeptime(struct timespec *delta)
606{ 606{
607 if (!timespec_valid(delta)) {
608 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
609 "sleep delta value!\n");
610 return;
611 }
612
607 xtime = timespec_add(xtime, *delta); 613 xtime = timespec_add(xtime, *delta);
608 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); 614 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
609 total_sleep_time = timespec_add(total_sleep_time, *delta); 615 total_sleep_time = timespec_add(total_sleep_time, *delta);
@@ -686,12 +692,34 @@ static void timekeeping_resume(void)
686static int timekeeping_suspend(void) 692static int timekeeping_suspend(void)
687{ 693{
688 unsigned long flags; 694 unsigned long flags;
695 struct timespec delta, delta_delta;
696 static struct timespec old_delta;
689 697
690 read_persistent_clock(&timekeeping_suspend_time); 698 read_persistent_clock(&timekeeping_suspend_time);
691 699
692 write_seqlock_irqsave(&xtime_lock, flags); 700 write_seqlock_irqsave(&xtime_lock, flags);
693 timekeeping_forward_now(); 701 timekeeping_forward_now();
694 timekeeping_suspended = 1; 702 timekeeping_suspended = 1;
703
704 /*
705 * To avoid drift caused by repeated suspend/resumes,
706 * which each can add ~1 second drift error,
707 * try to compensate so the difference in system time
708 * and persistent_clock time stays close to constant.
709 */
710 delta = timespec_sub(xtime, timekeeping_suspend_time);
711 delta_delta = timespec_sub(delta, old_delta);
712 if (abs(delta_delta.tv_sec) >= 2) {
713 /*
714 * if delta_delta is too large, assume time correction
715 * has occurred and set old_delta to the current delta.
716 */
717 old_delta = delta;
718 } else {
719 /* Otherwise try to adjust old_system to compensate */
720 timekeeping_suspend_time =
721 timespec_add(timekeeping_suspend_time, delta_delta);
722 }
695 write_sequnlock_irqrestore(&xtime_lock, flags); 723 write_sequnlock_irqrestore(&xtime_lock, flags);
696 724
697 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 725 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
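The compensation added to timekeeping_suspend() is easier to see with the arithmetic pulled out on its own: keep (system time - persistent clock) roughly constant by folding small read errors back into the recorded suspend time, and resync when the jump is large enough to indicate the clock was set. A standalone sketch, using plain seconds instead of struct timespec and invented names:

#include <math.h>
#include <stdio.h>

static double old_delta;	/* (system time - persistent clock), remembered */

/* Returns the adjustment folded into the recorded suspend time. */
static double compensate(double xtime, double *suspend_time)
{
	double delta = xtime - *suspend_time;
	double delta_delta = delta - old_delta;

	if (fabs(delta_delta) >= 2.0) {
		/* big jump: assume the clock was corrected, just resync */
		old_delta = delta;
		return 0.0;
	}
	/* small drift: absorb it so the difference stays constant */
	*suspend_time += delta_delta;
	return delta_delta;
}

int main(void)
{
	double suspend_time = 1000.0;	/* coarse persistent-clock reading */

	old_delta = 0.3;		/* remembered from an earlier suspend */
	printf("adjusted by %.3f s\n", compensate(1000.7, &suspend_time));
	return 0;
}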
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 908038f57440..c3e4575e7829 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,7 +32,6 @@
32 32
33#include <trace/events/sched.h> 33#include <trace/events/sched.h>
34 34
35#include <asm/ftrace.h>
36#include <asm/setup.h> 35#include <asm/setup.h>
37 36
38#include "trace_output.h" 37#include "trace_output.h"
@@ -82,14 +81,14 @@ static int ftrace_disabled __read_mostly;
82 81
83static DEFINE_MUTEX(ftrace_lock); 82static DEFINE_MUTEX(ftrace_lock);
84 83
85static struct ftrace_ops ftrace_list_end __read_mostly = 84static struct ftrace_ops ftrace_list_end __read_mostly = {
86{
87 .func = ftrace_stub, 85 .func = ftrace_stub,
88}; 86};
89 87
90static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 88static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
91static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 89static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
92ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 90ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
91static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
93ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
94ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 93ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
95static struct ftrace_ops global_ops; 94static struct ftrace_ops global_ops;
@@ -148,9 +147,11 @@ void clear_ftrace_function(void)
148{ 147{
149 ftrace_trace_function = ftrace_stub; 148 ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function = ftrace_stub; 149 __ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function_delay = ftrace_stub;
151 ftrace_pid_function = ftrace_stub; 151 ftrace_pid_function = ftrace_stub;
152} 152}
153 153
154#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
154#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 155#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
155/* 156/*
156 * For those archs that do not test ftrace_trace_stop in their 157 * For those archs that do not test ftrace_trace_stop in their
@@ -210,7 +211,12 @@ static void update_ftrace_function(void)
210#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 211#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
211 ftrace_trace_function = func; 212 ftrace_trace_function = func;
212#else 213#else
214#ifdef CONFIG_DYNAMIC_FTRACE
215 /* do not update till all functions have been modified */
216 __ftrace_trace_function_delay = func;
217#else
213 __ftrace_trace_function = func; 218 __ftrace_trace_function = func;
219#endif
214 ftrace_trace_function = ftrace_test_stop_func; 220 ftrace_trace_function = ftrace_test_stop_func;
215#endif 221#endif
216} 222}
@@ -785,8 +791,7 @@ static void unregister_ftrace_profiler(void)
785 unregister_ftrace_graph(); 791 unregister_ftrace_graph();
786} 792}
787#else 793#else
788static struct ftrace_ops ftrace_profile_ops __read_mostly = 794static struct ftrace_ops ftrace_profile_ops __read_mostly = {
789{
790 .func = function_profile_call, 795 .func = function_profile_call,
791}; 796};
792 797
@@ -806,19 +811,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
806 size_t cnt, loff_t *ppos) 811 size_t cnt, loff_t *ppos)
807{ 812{
808 unsigned long val; 813 unsigned long val;
809 char buf[64]; /* big enough to hold a number */
810 int ret; 814 int ret;
811 815
812 if (cnt >= sizeof(buf)) 816 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
813 return -EINVAL; 817 if (ret)
814
815 if (copy_from_user(&buf, ubuf, cnt))
816 return -EFAULT;
817
818 buf[cnt] = 0;
819
820 ret = strict_strtoul(buf, 10, &val);
821 if (ret < 0)
822 return ret; 818 return ret;
823 819
824 val = !!val; 820 val = !!val;
@@ -1182,8 +1178,14 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1182 return NULL; 1178 return NULL;
1183} 1179}
1184 1180
1181static void
1182ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash);
1183static void
1184ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash);
1185
1185static int 1186static int
1186ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) 1187ftrace_hash_move(struct ftrace_ops *ops, int enable,
1188 struct ftrace_hash **dst, struct ftrace_hash *src)
1187{ 1189{
1188 struct ftrace_func_entry *entry; 1190 struct ftrace_func_entry *entry;
1189 struct hlist_node *tp, *tn; 1191 struct hlist_node *tp, *tn;
@@ -1193,9 +1195,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1193 unsigned long key; 1195 unsigned long key;
1194 int size = src->count; 1196 int size = src->count;
1195 int bits = 0; 1197 int bits = 0;
1198 int ret;
1196 int i; 1199 int i;
1197 1200
1198 /* 1201 /*
1202 * Remove the current set, update the hash and add
1203 * them back.
1204 */
1205 ftrace_hash_rec_disable(ops, enable);
1206
1207 /*
1199 * If the new source is empty, just free dst and assign it 1208 * If the new source is empty, just free dst and assign it
1200 * the empty_hash. 1209 * the empty_hash.
1201 */ 1210 */
@@ -1215,9 +1224,10 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1215 if (bits > FTRACE_HASH_MAX_BITS) 1224 if (bits > FTRACE_HASH_MAX_BITS)
1216 bits = FTRACE_HASH_MAX_BITS; 1225 bits = FTRACE_HASH_MAX_BITS;
1217 1226
1227 ret = -ENOMEM;
1218 new_hash = alloc_ftrace_hash(bits); 1228 new_hash = alloc_ftrace_hash(bits);
1219 if (!new_hash) 1229 if (!new_hash)
1220 return -ENOMEM; 1230 goto out;
1221 1231
1222 size = 1 << src->size_bits; 1232 size = 1 << src->size_bits;
1223 for (i = 0; i < size; i++) { 1233 for (i = 0; i < size; i++) {
@@ -1236,7 +1246,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1236 rcu_assign_pointer(*dst, new_hash); 1246 rcu_assign_pointer(*dst, new_hash);
1237 free_ftrace_hash_rcu(old_hash); 1247 free_ftrace_hash_rcu(old_hash);
1238 1248
1239 return 0; 1249 ret = 0;
1250 out:
1251 /*
1252 * Enable regardless of ret:
1253 * On success, we enable the new hash.
1254 * On failure, we re-enable the original hash.
1255 */
1256 ftrace_hash_rec_enable(ops, enable);
1257
1258 return ret;
1240} 1259}
1241 1260
1242/* 1261/*
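The ordering that matters in the reworked ftrace_hash_move() is "disable the accounting tied to the old table, swap, then re-enable whichever table ends up live", so a failed swap leaves the old state consistent. A self-contained userspace sketch of that pattern with invented types, not the ftrace code itself:

#include <stdlib.h>
#include <string.h>

struct table {
	int *vals;
	int count;
	int accounted;		/* stands in for the enabled record counts */
};

static void table_account(struct table *t, int on)
{
	t->accounted = on;
}

static int table_replace(struct table **dst, const int *src, int count)
{
	struct table *old = *dst, *new_t;
	int ret = -1;

	table_account(old, 0);		/* drop accounting for the old set */

	new_t = calloc(1, sizeof(*new_t));
	if (new_t) {
		new_t->vals = malloc(sizeof(*src) * count);
		if (new_t->vals) {
			memcpy(new_t->vals, src, sizeof(*src) * count);
			new_t->count = count;
			*dst = new_t;
			free(old->vals);
			free(old);
			ret = 0;
		} else {
			free(new_t);	/* keep the old table on failure */
		}
	}

	/* re-enable regardless: the new table on success, the old one on failure */
	table_account(*dst, 1);
	return ret;
}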
@@ -1596,6 +1615,12 @@ static int __ftrace_modify_code(void *data)
1596{ 1615{
1597 int *command = data; 1616 int *command = data;
1598 1617
1618 /*
1619 * Do not call function tracer while we update the code.
1620 * We are in stop machine, no worrying about races.
1621 */
1622 function_trace_stop++;
1623
1599 if (*command & FTRACE_ENABLE_CALLS) 1624 if (*command & FTRACE_ENABLE_CALLS)
1600 ftrace_replace_code(1); 1625 ftrace_replace_code(1);
1601 else if (*command & FTRACE_DISABLE_CALLS) 1626 else if (*command & FTRACE_DISABLE_CALLS)
@@ -1609,6 +1634,18 @@ static int __ftrace_modify_code(void *data)
1609 else if (*command & FTRACE_STOP_FUNC_RET) 1634 else if (*command & FTRACE_STOP_FUNC_RET)
1610 ftrace_disable_ftrace_graph_caller(); 1635 ftrace_disable_ftrace_graph_caller();
1611 1636
1637#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1638 /*
1639 * For archs that call ftrace_test_stop_func(), we must
1640 * wait till after we update all the function callers
1641 * before we update the callback. This keeps different
1642 * ops that record different functions from corrupting
1643 * each other.
1644 */
1645 __ftrace_trace_function = __ftrace_trace_function_delay;
1646#endif
1647 function_trace_stop--;
1648
1612 return 0; 1649 return 0;
1613} 1650}
1614 1651
@@ -1744,10 +1781,36 @@ static cycle_t ftrace_update_time;
1744static unsigned long ftrace_update_cnt; 1781static unsigned long ftrace_update_cnt;
1745unsigned long ftrace_update_tot_cnt; 1782unsigned long ftrace_update_tot_cnt;
1746 1783
1784static int ops_traces_mod(struct ftrace_ops *ops)
1785{
1786 struct ftrace_hash *hash;
1787
1788 hash = ops->filter_hash;
1789 return !!(!hash || !hash->count);
1790}
1791
1747static int ftrace_update_code(struct module *mod) 1792static int ftrace_update_code(struct module *mod)
1748{ 1793{
1749 struct dyn_ftrace *p; 1794 struct dyn_ftrace *p;
1750 cycle_t start, stop; 1795 cycle_t start, stop;
1796 unsigned long ref = 0;
1797
1798 /*
1799 * When adding a module, we need to check if tracers are
1800 * currently enabled and if they are set to trace all functions.
1801 * If they are, we need to enable the module functions as well
1802 * as update the reference counts for those function records.
1803 */
1804 if (mod) {
1805 struct ftrace_ops *ops;
1806
1807 for (ops = ftrace_ops_list;
1808 ops != &ftrace_list_end; ops = ops->next) {
1809 if (ops->flags & FTRACE_OPS_FL_ENABLED &&
1810 ops_traces_mod(ops))
1811 ref++;
1812 }
1813 }
1751 1814
1752 start = ftrace_now(raw_smp_processor_id()); 1815 start = ftrace_now(raw_smp_processor_id());
1753 ftrace_update_cnt = 0; 1816 ftrace_update_cnt = 0;
@@ -1760,7 +1823,7 @@ static int ftrace_update_code(struct module *mod)
1760 1823
1761 p = ftrace_new_addrs; 1824 p = ftrace_new_addrs;
1762 ftrace_new_addrs = p->newlist; 1825 ftrace_new_addrs = p->newlist;
1763 p->flags = 0L; 1826 p->flags = ref;
1764 1827
1765 /* 1828 /*
1766 * Do the initial record conversion from mcount jump 1829 * Do the initial record conversion from mcount jump
@@ -1783,7 +1846,7 @@ static int ftrace_update_code(struct module *mod)
1783 * conversion puts the module to the correct state, thus 1846 * conversion puts the module to the correct state, thus
1784 * passing the ftrace_make_call check. 1847 * passing the ftrace_make_call check.
1785 */ 1848 */
1786 if (ftrace_start_up) { 1849 if (ftrace_start_up && ref) {
1787 int failed = __ftrace_replace_code(p, 1); 1850 int failed = __ftrace_replace_code(p, 1);
1788 if (failed) { 1851 if (failed) {
1789 ftrace_bug(failed, p->ip); 1852 ftrace_bug(failed, p->ip);
@@ -2407,10 +2470,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
2407 */ 2470 */
2408 2471
2409static int 2472static int
2410ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2473ftrace_mod_callback(struct ftrace_hash *hash,
2474 char *func, char *cmd, char *param, int enable)
2411{ 2475{
2412 struct ftrace_ops *ops = &global_ops;
2413 struct ftrace_hash *hash;
2414 char *mod; 2476 char *mod;
2415 int ret = -EINVAL; 2477 int ret = -EINVAL;
2416 2478
@@ -2430,11 +2492,6 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
2430 if (!strlen(mod)) 2492 if (!strlen(mod))
2431 return ret; 2493 return ret;
2432 2494
2433 if (enable)
2434 hash = ops->filter_hash;
2435 else
2436 hash = ops->notrace_hash;
2437
2438 ret = ftrace_match_module_records(hash, func, mod); 2495 ret = ftrace_match_module_records(hash, func, mod);
2439 if (!ret) 2496 if (!ret)
2440 ret = -EINVAL; 2497 ret = -EINVAL;
@@ -2760,7 +2817,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash,
2760 mutex_lock(&ftrace_cmd_mutex); 2817 mutex_lock(&ftrace_cmd_mutex);
2761 list_for_each_entry(p, &ftrace_commands, list) { 2818 list_for_each_entry(p, &ftrace_commands, list) {
2762 if (strcmp(p->name, command) == 0) { 2819 if (strcmp(p->name, command) == 0) {
2763 ret = p->func(func, command, next, enable); 2820 ret = p->func(hash, func, command, next, enable);
2764 goto out_unlock; 2821 goto out_unlock;
2765 } 2822 }
2766 } 2823 }
@@ -2857,7 +2914,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2857 ftrace_match_records(hash, buf, len); 2914 ftrace_match_records(hash, buf, len);
2858 2915
2859 mutex_lock(&ftrace_lock); 2916 mutex_lock(&ftrace_lock);
2860 ret = ftrace_hash_move(orig_hash, hash); 2917 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
2918 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
2919 && ftrace_enabled)
2920 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2921
2861 mutex_unlock(&ftrace_lock); 2922 mutex_unlock(&ftrace_lock);
2862 2923
2863 mutex_unlock(&ftrace_regex_lock); 2924 mutex_unlock(&ftrace_regex_lock);
@@ -3040,18 +3101,12 @@ ftrace_regex_release(struct inode *inode, struct file *file)
3040 orig_hash = &iter->ops->notrace_hash; 3101 orig_hash = &iter->ops->notrace_hash;
3041 3102
3042 mutex_lock(&ftrace_lock); 3103 mutex_lock(&ftrace_lock);
3043 /* 3104 ret = ftrace_hash_move(iter->ops, filter_hash,
3044 * Remove the current set, update the hash and add 3105 orig_hash, iter->hash);
3045 * them back. 3106 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3046 */ 3107 && ftrace_enabled)
3047 ftrace_hash_rec_disable(iter->ops, filter_hash); 3108 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3048 ret = ftrace_hash_move(orig_hash, iter->hash); 3109
3049 if (!ret) {
3050 ftrace_hash_rec_enable(iter->ops, filter_hash);
3051 if (iter->ops->flags & FTRACE_OPS_FL_ENABLED
3052 && ftrace_enabled)
3053 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3054 }
3055 mutex_unlock(&ftrace_lock); 3110 mutex_unlock(&ftrace_lock);
3056 } 3111 }
3057 free_ftrace_hash(iter->hash); 3112 free_ftrace_hash(iter->hash);
@@ -3330,7 +3385,7 @@ static int ftrace_process_locs(struct module *mod,
3330{ 3385{
3331 unsigned long *p; 3386 unsigned long *p;
3332 unsigned long addr; 3387 unsigned long addr;
3333 unsigned long flags; 3388 unsigned long flags = 0; /* Shut up gcc */
3334 3389
3335 mutex_lock(&ftrace_lock); 3390 mutex_lock(&ftrace_lock);
3336 p = start; 3391 p = start;
@@ -3348,12 +3403,18 @@ static int ftrace_process_locs(struct module *mod,
3348 } 3403 }
3349 3404
3350 /* 3405 /*
3351 * Disable interrupts to prevent interrupts from executing 3406 * We only need to disable interrupts on start up
3352 * code that is being modified. 3407 * because we are modifying code that an interrupt
3408 * may execute, and the modification is not atomic.
3409 * But for modules, nothing runs the code we modify
3410 * until we are finished with it, and there's no
3411 * reason to cause large interrupt latencies while we do it.
3353 */ 3412 */
3354 local_irq_save(flags); 3413 if (!mod)
3414 local_irq_save(flags);
3355 ftrace_update_code(mod); 3415 ftrace_update_code(mod);
3356 local_irq_restore(flags); 3416 if (!mod)
3417 local_irq_restore(flags);
3357 mutex_unlock(&ftrace_lock); 3418 mutex_unlock(&ftrace_lock);
3358 3419
3359 return 0; 3420 return 0;
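Both this file and the ring buffer code below replace an open-coded copy-and-parse with kstrtoul_from_user(). A userspace approximation of what that helper condenses, with an invented function name:

#include <errno.h>
#include <stdlib.h>
#include <string.h>

/* Copy at most cnt bytes of a (possibly unterminated) buffer, then parse it
 * strictly as an unsigned long.  Returns 0 or a negative errno. */
static int parse_ulong(const char *ubuf, size_t cnt, int base, unsigned long *val)
{
	char buf[64], *end;

	if (cnt >= sizeof(buf))
		return -EINVAL;
	memcpy(buf, ubuf, cnt);
	buf[cnt] = '\0';

	errno = 0;
	*val = strtoul(buf, &end, base);
	if (errno)
		return -errno;
	if (end == buf || (*end != '\0' && *end != '\n'))
		return -EINVAL;
	return 0;
}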
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b0c7aa407943..731201bf4acc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -997,15 +997,21 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
997 unsigned nr_pages) 997 unsigned nr_pages)
998{ 998{
999 struct buffer_page *bpage, *tmp; 999 struct buffer_page *bpage, *tmp;
1000 unsigned long addr;
1001 LIST_HEAD(pages); 1000 LIST_HEAD(pages);
1002 unsigned i; 1001 unsigned i;
1003 1002
1004 WARN_ON(!nr_pages); 1003 WARN_ON(!nr_pages);
1005 1004
1006 for (i = 0; i < nr_pages; i++) { 1005 for (i = 0; i < nr_pages; i++) {
1006 struct page *page;
1007 /*
1008 * __GFP_NORETRY flag makes sure that the allocation fails
1009 * gracefully without invoking oom-killer and the system is
1010 * not destabilized.
1011 */
1007 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1012 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1008 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 1013 GFP_KERNEL | __GFP_NORETRY,
1014 cpu_to_node(cpu_buffer->cpu));
1009 if (!bpage) 1015 if (!bpage)
1010 goto free_pages; 1016 goto free_pages;
1011 1017
@@ -1013,10 +1019,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1013 1019
1014 list_add(&bpage->list, &pages); 1020 list_add(&bpage->list, &pages);
1015 1021
1016 addr = __get_free_page(GFP_KERNEL); 1022 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
1017 if (!addr) 1023 GFP_KERNEL | __GFP_NORETRY, 0);
1024 if (!page)
1018 goto free_pages; 1025 goto free_pages;
1019 bpage->page = (void *)addr; 1026 bpage->page = page_address(page);
1020 rb_init_page(bpage->page); 1027 rb_init_page(bpage->page);
1021 } 1028 }
1022 1029
@@ -1045,7 +1052,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1045{ 1052{
1046 struct ring_buffer_per_cpu *cpu_buffer; 1053 struct ring_buffer_per_cpu *cpu_buffer;
1047 struct buffer_page *bpage; 1054 struct buffer_page *bpage;
1048 unsigned long addr; 1055 struct page *page;
1049 int ret; 1056 int ret;
1050 1057
1051 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), 1058 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
@@ -1067,10 +1074,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1067 rb_check_bpage(cpu_buffer, bpage); 1074 rb_check_bpage(cpu_buffer, bpage);
1068 1075
1069 cpu_buffer->reader_page = bpage; 1076 cpu_buffer->reader_page = bpage;
1070 addr = __get_free_page(GFP_KERNEL); 1077 page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
1071 if (!addr) 1078 if (!page)
1072 goto fail_free_reader; 1079 goto fail_free_reader;
1073 bpage->page = (void *)addr; 1080 bpage->page = page_address(page);
1074 rb_init_page(bpage->page); 1081 rb_init_page(bpage->page);
1075 1082
1076 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1083 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -1314,7 +1321,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1314 unsigned nr_pages, rm_pages, new_pages; 1321 unsigned nr_pages, rm_pages, new_pages;
1315 struct buffer_page *bpage, *tmp; 1322 struct buffer_page *bpage, *tmp;
1316 unsigned long buffer_size; 1323 unsigned long buffer_size;
1317 unsigned long addr;
1318 LIST_HEAD(pages); 1324 LIST_HEAD(pages);
1319 int i, cpu; 1325 int i, cpu;
1320 1326
@@ -1375,16 +1381,24 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1375 1381
1376 for_each_buffer_cpu(buffer, cpu) { 1382 for_each_buffer_cpu(buffer, cpu) {
1377 for (i = 0; i < new_pages; i++) { 1383 for (i = 0; i < new_pages; i++) {
1384 struct page *page;
1385 /*
 1386 * The __GFP_NORETRY flag makes sure that the allocation
 1387 * fails gracefully without invoking the OOM killer, so
 1388 * the system is not destabilized.
1389 */
1378 bpage = kzalloc_node(ALIGN(sizeof(*bpage), 1390 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
1379 cache_line_size()), 1391 cache_line_size()),
1380 GFP_KERNEL, cpu_to_node(cpu)); 1392 GFP_KERNEL | __GFP_NORETRY,
1393 cpu_to_node(cpu));
1381 if (!bpage) 1394 if (!bpage)
1382 goto free_pages; 1395 goto free_pages;
1383 list_add(&bpage->list, &pages); 1396 list_add(&bpage->list, &pages);
1384 addr = __get_free_page(GFP_KERNEL); 1397 page = alloc_pages_node(cpu_to_node(cpu),
1385 if (!addr) 1398 GFP_KERNEL | __GFP_NORETRY, 0);
1399 if (!page)
1386 goto free_pages; 1400 goto free_pages;
1387 bpage->page = (void *)addr; 1401 bpage->page = page_address(page);
1388 rb_init_page(bpage->page); 1402 rb_init_page(bpage->page);
1389 } 1403 }
1390 } 1404 }
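
The ring buffer hunks above replace __get_free_page() with a node-local allocation that refuses to retry hard. A minimal sketch of the resulting pattern (hypothetical helper name, kernel context):

static void *rb_data_page_sketch(int cpu)
{
	struct page *page;

	/*
	 * Allocate on the node that owns @cpu; __GFP_NORETRY makes the
	 * allocation fail fast instead of pushing the system toward the
	 * OOM killer when memory is tight.
	 */
	page = alloc_pages_node(cpu_to_node(cpu),
				GFP_KERNEL | __GFP_NORETRY, 0);
	if (!page)
		return NULL;		/* caller unwinds what it allocated so far */

	return page_address(page);	/* replaces the old (void *)addr cast */
}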
@@ -3730,16 +3744,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3730 * Returns: 3744 * Returns:
3731 * The page allocated, or NULL on error. 3745 * The page allocated, or NULL on error.
3732 */ 3746 */
3733void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) 3747void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
3734{ 3748{
3735 struct buffer_data_page *bpage; 3749 struct buffer_data_page *bpage;
3736 unsigned long addr; 3750 struct page *page;
3737 3751
3738 addr = __get_free_page(GFP_KERNEL); 3752 page = alloc_pages_node(cpu_to_node(cpu),
3739 if (!addr) 3753 GFP_KERNEL | __GFP_NORETRY, 0);
3754 if (!page)
3740 return NULL; 3755 return NULL;
3741 3756
3742 bpage = (void *)addr; 3757 bpage = page_address(page);
3743 3758
3744 rb_init_page(bpage); 3759 rb_init_page(bpage);
3745 3760
@@ -3978,20 +3993,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
3978 size_t cnt, loff_t *ppos) 3993 size_t cnt, loff_t *ppos)
3979{ 3994{
3980 unsigned long *p = filp->private_data; 3995 unsigned long *p = filp->private_data;
3981 char buf[64];
3982 unsigned long val; 3996 unsigned long val;
3983 int ret; 3997 int ret;
3984 3998
3985 if (cnt >= sizeof(buf)) 3999 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3986 return -EINVAL; 4000 if (ret)
3987
3988 if (copy_from_user(&buf, ubuf, cnt))
3989 return -EFAULT;
3990
3991 buf[cnt] = 0;
3992
3993 ret = strict_strtoul(buf, 10, &val);
3994 if (ret < 0)
3995 return ret; 4001 return ret;
3996 4002
3997 if (val) 4003 if (val)
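
The same simplification is applied to every numeric control file in this series: the local buf[64], copy_from_user() and strict_strtoul() sequence collapses into a single kstrtoul_from_user() call. A minimal sketch of the resulting write handler shape (hypothetical name, kernel context):

static ssize_t example_write(struct file *filp, const char __user *ubuf,
			     size_t cnt, loff_t *ppos)
{
	unsigned long val;
	int ret;

	/* Copies from user space and parses base-10 in one step. */
	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
	if (ret)
		return ret;

	/* ... act on val ... */

	*ppos += cnt;
	return cnt;
}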
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 302f8a614635..a5457d577b98 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -106,7 +106,7 @@ static enum event_status read_page(int cpu)
106 int inc; 106 int inc;
107 int i; 107 int i;
108 108
109 bpage = ring_buffer_alloc_read_page(buffer); 109 bpage = ring_buffer_alloc_read_page(buffer, cpu);
110 if (!bpage) 110 if (!bpage)
111 return EVENT_DROPPED; 111 return EVENT_DROPPED;
112 112
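
With the extra cpu argument, callers get a read page that lives on the same NUMA node as the per-CPU buffer they are about to drain. A minimal sketch of the alloc/read/free cycle (hypothetical caller, kernel context):

static int read_one_page_sketch(struct ring_buffer *buffer, int cpu)
{
	void *bpage;
	int ret;

	bpage = ring_buffer_alloc_read_page(buffer, cpu);	/* node-local now */
	if (!bpage)
		return -ENOMEM;

	/* Pull up to a page of events from @cpu into the spare page. */
	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 0);

	/* ... consume the events in bpage ... */

	ring_buffer_free_read_page(buffer, bpage);
	return ret < 0 ? ret : 0;
}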
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ee9c921d7f21..e5df02c69b1d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
343static int trace_stop_count; 343static int trace_stop_count;
344static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
345 345
346static void wakeup_work_handler(struct work_struct *work)
347{
348 wake_up(&trace_wait);
349}
350
351static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
352
346/** 353/**
347 * trace_wake_up - wake up tasks waiting for trace input 354 * trace_wake_up - wake up tasks waiting for trace input
348 * 355 *
349 * Simply wakes up any task that is blocked on the trace_wait 356 * Schedules a delayed work to wake up any task that is blocked on the
 350 * queue. This is used with trace_poll for tasks polling the trace. 357 * trace_wait queue. This is used with trace_poll for tasks polling the
358 * trace.
351 */ 359 */
352void trace_wake_up(void) 360void trace_wake_up(void)
353{ 361{
354 int cpu; 362 const unsigned long delay = msecs_to_jiffies(2);
355 363
356 if (trace_flags & TRACE_ITER_BLOCK) 364 if (trace_flags & TRACE_ITER_BLOCK)
357 return; 365 return;
358 /* 366 schedule_delayed_work(&wakeup_work, delay);
359 * The runqueue_is_locked() can fail, but this is the best we
360 * have for now:
361 */
362 cpu = get_cpu();
363 if (!runqueue_is_locked(cpu))
364 wake_up(&trace_wait);
365 put_cpu();
366} 367}
367 368
368static int __init set_buf_size(char *str) 369static int __init set_buf_size(char *str)
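
trace_wake_up() no longer needs the runqueue_is_locked() workaround because the wake-up is deferred to process context through a workqueue. The pattern in isolation (hypothetical names):

static DECLARE_WAIT_QUEUE_HEAD(example_wait);

static void example_wake_handler(struct work_struct *work)
{
	wake_up(&example_wait);		/* safe: runs in worker context */
}
static DECLARE_DELAYED_WORK(example_wake_work, example_wake_handler);

static void example_poke_waiters(void)
{
	/* A ~2ms delay batches bursts of events into one wake-up. */
	schedule_delayed_work(&example_wake_work, msecs_to_jiffies(2));
}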
@@ -424,6 +425,7 @@ static const char *trace_options[] = {
424 "graph-time", 425 "graph-time",
425 "record-cmd", 426 "record-cmd",
426 "overwrite", 427 "overwrite",
428 "disable_on_free",
427 NULL 429 NULL
428}; 430};
429 431
@@ -1191,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
1191} 1193}
1192EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 1194EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1193 1195
1196void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1197 struct ring_buffer_event *event,
1198 unsigned long flags, int pc,
1199 struct pt_regs *regs)
1200{
1201 ring_buffer_unlock_commit(buffer, event);
1202
1203 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1204 ftrace_trace_userstack(buffer, flags, pc);
1205}
1206EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs);
1207
1194void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1208void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1195 struct ring_buffer_event *event) 1209 struct ring_buffer_event *event)
1196{ 1210{
@@ -1234,30 +1248,103 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1234} 1248}
1235 1249
1236#ifdef CONFIG_STACKTRACE 1250#ifdef CONFIG_STACKTRACE
1251
1252#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
1253struct ftrace_stack {
1254 unsigned long calls[FTRACE_STACK_MAX_ENTRIES];
1255};
1256
1257static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
1258static DEFINE_PER_CPU(int, ftrace_stack_reserve);
1259
1237static void __ftrace_trace_stack(struct ring_buffer *buffer, 1260static void __ftrace_trace_stack(struct ring_buffer *buffer,
1238 unsigned long flags, 1261 unsigned long flags,
1239 int skip, int pc) 1262 int skip, int pc, struct pt_regs *regs)
1240{ 1263{
1241 struct ftrace_event_call *call = &event_kernel_stack; 1264 struct ftrace_event_call *call = &event_kernel_stack;
1242 struct ring_buffer_event *event; 1265 struct ring_buffer_event *event;
1243 struct stack_entry *entry; 1266 struct stack_entry *entry;
1244 struct stack_trace trace; 1267 struct stack_trace trace;
1268 int use_stack;
1269 int size = FTRACE_STACK_ENTRIES;
1270
1271 trace.nr_entries = 0;
1272 trace.skip = skip;
1273
1274 /*
1275 * Since events can happen in NMIs there's no safe way to
1276 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
1277 * or NMI comes in, it will just have to use the default
1278 * FTRACE_STACK_SIZE.
1279 */
1280 preempt_disable_notrace();
1281
1282 use_stack = ++__get_cpu_var(ftrace_stack_reserve);
1283 /*
1284 * We don't need any atomic variables, just a barrier.
1285 * If an interrupt comes in, we don't care, because it would
1286 * have exited and put the counter back to what we want.
1287 * We just need a barrier to keep gcc from moving things
1288 * around.
1289 */
1290 barrier();
1291 if (use_stack == 1) {
1292 trace.entries = &__get_cpu_var(ftrace_stack).calls[0];
1293 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
1294
1295 if (regs)
1296 save_stack_trace_regs(regs, &trace);
1297 else
1298 save_stack_trace(&trace);
1299
1300 if (trace.nr_entries > size)
1301 size = trace.nr_entries;
1302 } else
1303 /* From now on, use_stack is a boolean */
1304 use_stack = 0;
1305
1306 size *= sizeof(unsigned long);
1245 1307
1246 event = trace_buffer_lock_reserve(buffer, TRACE_STACK, 1308 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1247 sizeof(*entry), flags, pc); 1309 sizeof(*entry) + size, flags, pc);
1248 if (!event) 1310 if (!event)
1249 return; 1311 goto out;
1250 entry = ring_buffer_event_data(event); 1312 entry = ring_buffer_event_data(event);
1251 memset(&entry->caller, 0, sizeof(entry->caller));
1252 1313
1253 trace.nr_entries = 0; 1314 memset(&entry->caller, 0, size);
1254 trace.max_entries = FTRACE_STACK_ENTRIES; 1315
1255 trace.skip = skip; 1316 if (use_stack)
1256 trace.entries = entry->caller; 1317 memcpy(&entry->caller, trace.entries,
1318 trace.nr_entries * sizeof(unsigned long));
1319 else {
1320 trace.max_entries = FTRACE_STACK_ENTRIES;
1321 trace.entries = entry->caller;
1322 if (regs)
1323 save_stack_trace_regs(regs, &trace);
1324 else
1325 save_stack_trace(&trace);
1326 }
1327
1328 entry->size = trace.nr_entries;
1257 1329
1258 save_stack_trace(&trace);
1259 if (!filter_check_discard(call, entry, buffer, event)) 1330 if (!filter_check_discard(call, entry, buffer, event))
1260 ring_buffer_unlock_commit(buffer, event); 1331 ring_buffer_unlock_commit(buffer, event);
1332
1333 out:
1334 /* Again, don't let gcc optimize things here */
1335 barrier();
1336 __get_cpu_var(ftrace_stack_reserve)--;
1337 preempt_enable_notrace();
1338
1339}
1340
1341void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
1342 int skip, int pc, struct pt_regs *regs)
1343{
1344 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1345 return;
1346
1347 __ftrace_trace_stack(buffer, flags, skip, pc, regs);
1261} 1348}
1262 1349
1263void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 1350void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
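
The stack-trace path above reserves a large per-CPU scratch area with nothing more than a per-CPU counter and a compiler barrier; a nested user (an interrupt or NMI landing in the middle) simply falls back to the small fixed-size record. The core of the trick, reduced to a sketch with hypothetical names:

static DEFINE_PER_CPU(struct ftrace_stack, scratch_sketch);
static DEFINE_PER_CPU(int, scratch_reserve_sketch);

static void use_scratch_sketch(void)
{
	int depth;

	preempt_disable_notrace();		/* stay on this CPU */
	depth = ++__get_cpu_var(scratch_reserve_sketch);
	barrier();				/* only gcc reordering matters here */

	if (depth == 1) {
		/* sole owner of the big per-CPU scratch buffer */
	} else {
		/* nested in irq/NMI: use the small default buffer instead */
	}

	barrier();
	__get_cpu_var(scratch_reserve_sketch)--;
	preempt_enable_notrace();
}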
@@ -1266,13 +1353,13 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1266 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1353 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1267 return; 1354 return;
1268 1355
1269 __ftrace_trace_stack(buffer, flags, skip, pc); 1356 __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
1270} 1357}
1271 1358
1272void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 1359void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1273 int pc) 1360 int pc)
1274{ 1361{
1275 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1362 __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL);
1276} 1363}
1277 1364
1278/** 1365/**
@@ -1288,7 +1375,7 @@ void trace_dump_stack(void)
1288 local_save_flags(flags); 1375 local_save_flags(flags);
1289 1376
1290 /* skipping 3 traces, seems to get us at the caller of this function */ 1377 /* skipping 3 traces, seems to get us at the caller of this function */
1291 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1378 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL);
1292} 1379}
1293 1380
1294static DEFINE_PER_CPU(int, user_stack_count); 1381static DEFINE_PER_CPU(int, user_stack_count);
@@ -1536,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1536 1623
1537 ftrace_enable_cpu(); 1624 ftrace_enable_cpu();
1538 1625
1539 return event ? ring_buffer_event_data(event) : NULL; 1626 if (event) {
1627 iter->ent_size = ring_buffer_event_length(event);
1628 return ring_buffer_event_data(event);
1629 }
1630 iter->ent_size = 0;
1631 return NULL;
1540} 1632}
1541 1633
1542static struct trace_entry * 1634static struct trace_entry *
@@ -2051,6 +2143,9 @@ void trace_default_header(struct seq_file *m)
2051{ 2143{
2052 struct trace_iterator *iter = m->private; 2144 struct trace_iterator *iter = m->private;
2053 2145
2146 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
2147 return;
2148
2054 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2149 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2055 /* print nothing if the buffers are empty */ 2150 /* print nothing if the buffers are empty */
2056 if (trace_empty(iter)) 2151 if (trace_empty(iter))
@@ -2701,20 +2796,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2701 size_t cnt, loff_t *ppos) 2796 size_t cnt, loff_t *ppos)
2702{ 2797{
2703 struct trace_array *tr = filp->private_data; 2798 struct trace_array *tr = filp->private_data;
2704 char buf[64];
2705 unsigned long val; 2799 unsigned long val;
2706 int ret; 2800 int ret;
2707 2801
2708 if (cnt >= sizeof(buf)) 2802 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2709 return -EINVAL; 2803 if (ret)
2710
2711 if (copy_from_user(&buf, ubuf, cnt))
2712 return -EFAULT;
2713
2714 buf[cnt] = 0;
2715
2716 ret = strict_strtoul(buf, 10, &val);
2717 if (ret < 0)
2718 return ret; 2804 return ret;
2719 2805
2720 val = !!val; 2806 val = !!val;
@@ -2767,7 +2853,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
2767 return t->init(tr); 2853 return t->init(tr);
2768} 2854}
2769 2855
2770static int tracing_resize_ring_buffer(unsigned long size) 2856static int __tracing_resize_ring_buffer(unsigned long size)
2771{ 2857{
2772 int ret; 2858 int ret;
2773 2859
@@ -2819,6 +2905,41 @@ static int tracing_resize_ring_buffer(unsigned long size)
2819 return ret; 2905 return ret;
2820} 2906}
2821 2907
2908static ssize_t tracing_resize_ring_buffer(unsigned long size)
2909{
2910 int cpu, ret = size;
2911
2912 mutex_lock(&trace_types_lock);
2913
2914 tracing_stop();
2915
2916 /* disable all cpu buffers */
2917 for_each_tracing_cpu(cpu) {
2918 if (global_trace.data[cpu])
2919 atomic_inc(&global_trace.data[cpu]->disabled);
2920 if (max_tr.data[cpu])
2921 atomic_inc(&max_tr.data[cpu]->disabled);
2922 }
2923
2924 if (size != global_trace.entries)
2925 ret = __tracing_resize_ring_buffer(size);
2926
2927 if (ret < 0)
2928 ret = -ENOMEM;
2929
2930 for_each_tracing_cpu(cpu) {
2931 if (global_trace.data[cpu])
2932 atomic_dec(&global_trace.data[cpu]->disabled);
2933 if (max_tr.data[cpu])
2934 atomic_dec(&max_tr.data[cpu]->disabled);
2935 }
2936
2937 tracing_start();
2938 mutex_unlock(&trace_types_lock);
2939
2940 return ret;
2941}
2942
2822 2943
2823/** 2944/**
2824 * tracing_update_buffers - used by tracing facility to expand ring buffers 2945 * tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -2836,7 +2957,7 @@ int tracing_update_buffers(void)
2836 2957
2837 mutex_lock(&trace_types_lock); 2958 mutex_lock(&trace_types_lock);
2838 if (!ring_buffer_expanded) 2959 if (!ring_buffer_expanded)
2839 ret = tracing_resize_ring_buffer(trace_buf_size); 2960 ret = __tracing_resize_ring_buffer(trace_buf_size);
2840 mutex_unlock(&trace_types_lock); 2961 mutex_unlock(&trace_types_lock);
2841 2962
2842 return ret; 2963 return ret;
@@ -2860,7 +2981,7 @@ static int tracing_set_tracer(const char *buf)
2860 mutex_lock(&trace_types_lock); 2981 mutex_lock(&trace_types_lock);
2861 2982
2862 if (!ring_buffer_expanded) { 2983 if (!ring_buffer_expanded) {
2863 ret = tracing_resize_ring_buffer(trace_buf_size); 2984 ret = __tracing_resize_ring_buffer(trace_buf_size);
2864 if (ret < 0) 2985 if (ret < 0)
2865 goto out; 2986 goto out;
2866 ret = 0; 2987 ret = 0;
@@ -2966,20 +3087,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
2966 size_t cnt, loff_t *ppos) 3087 size_t cnt, loff_t *ppos)
2967{ 3088{
2968 unsigned long *ptr = filp->private_data; 3089 unsigned long *ptr = filp->private_data;
2969 char buf[64];
2970 unsigned long val; 3090 unsigned long val;
2971 int ret; 3091 int ret;
2972 3092
2973 if (cnt >= sizeof(buf)) 3093 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2974 return -EINVAL; 3094 if (ret)
2975
2976 if (copy_from_user(&buf, ubuf, cnt))
2977 return -EFAULT;
2978
2979 buf[cnt] = 0;
2980
2981 ret = strict_strtoul(buf, 10, &val);
2982 if (ret < 0)
2983 return ret; 3095 return ret;
2984 3096
2985 *ptr = val * 1000; 3097 *ptr = val * 1000;
@@ -3434,67 +3546,54 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3434 size_t cnt, loff_t *ppos) 3546 size_t cnt, loff_t *ppos)
3435{ 3547{
3436 unsigned long val; 3548 unsigned long val;
3437 char buf[64]; 3549 int ret;
3438 int ret, cpu;
3439
3440 if (cnt >= sizeof(buf))
3441 return -EINVAL;
3442
3443 if (copy_from_user(&buf, ubuf, cnt))
3444 return -EFAULT;
3445
3446 buf[cnt] = 0;
3447 3550
3448 ret = strict_strtoul(buf, 10, &val); 3551 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3449 if (ret < 0) 3552 if (ret)
3450 return ret; 3553 return ret;
3451 3554
3452 /* must have at least 1 entry */ 3555 /* must have at least 1 entry */
3453 if (!val) 3556 if (!val)
3454 return -EINVAL; 3557 return -EINVAL;
3455 3558
3456 mutex_lock(&trace_types_lock);
3457
3458 tracing_stop();
3459
3460 /* disable all cpu buffers */
3461 for_each_tracing_cpu(cpu) {
3462 if (global_trace.data[cpu])
3463 atomic_inc(&global_trace.data[cpu]->disabled);
3464 if (max_tr.data[cpu])
3465 atomic_inc(&max_tr.data[cpu]->disabled);
3466 }
3467
3468 /* value is in KB */ 3559 /* value is in KB */
3469 val <<= 10; 3560 val <<= 10;
3470 3561
3471 if (val != global_trace.entries) { 3562 ret = tracing_resize_ring_buffer(val);
3472 ret = tracing_resize_ring_buffer(val); 3563 if (ret < 0)
3473 if (ret < 0) { 3564 return ret;
3474 cnt = ret;
3475 goto out;
3476 }
3477 }
3478 3565
3479 *ppos += cnt; 3566 *ppos += cnt;
3480 3567
3481 /* If check pages failed, return ENOMEM */ 3568 return cnt;
3482 if (tracing_disabled) 3569}
3483 cnt = -ENOMEM;
3484 out:
3485 for_each_tracing_cpu(cpu) {
3486 if (global_trace.data[cpu])
3487 atomic_dec(&global_trace.data[cpu]->disabled);
3488 if (max_tr.data[cpu])
3489 atomic_dec(&max_tr.data[cpu]->disabled);
3490 }
3491 3570
3492 tracing_start(); 3571static ssize_t
3493 mutex_unlock(&trace_types_lock); 3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3573 size_t cnt, loff_t *ppos)
3574{
3575 /*
 3576 * There is no need to read what the user has written; this function
 3577 * just makes sure that writing (e.g. with "echo") does not return an error
3578 */
3579
3580 *ppos += cnt;
3494 3581
3495 return cnt; 3582 return cnt;
3496} 3583}
3497 3584
3585static int
3586tracing_free_buffer_release(struct inode *inode, struct file *filp)
3587{
3588 /* disable tracing ? */
3589 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3590 tracing_off();
3591 /* resize the ring buffer to 0 */
3592 tracing_resize_ring_buffer(0);
3593
3594 return 0;
3595}
3596
3498static int mark_printk(const char *fmt, ...) 3597static int mark_printk(const char *fmt, ...)
3499{ 3598{
3500 int ret; 3599 int ret;
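
The new "free_buffer" file does its real work on release, so a tool can hold the fd open while it runs and the kernel reclaims the ring buffer automatically when the fd is closed (or the tool crashes); with the new "disable_on_free" option set, tracing is switched off as well. Reduced to a sketch using the handlers introduced above:

static void on_free_buffer_close_sketch(void)
{
	if (trace_flags & TRACE_ITER_STOP_ON_FREE)	/* "disable_on_free" option */
		tracing_off();
	tracing_resize_ring_buffer(0);			/* free every buffer page */
}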
@@ -3640,6 +3739,11 @@ static const struct file_operations tracing_entries_fops = {
3640 .llseek = generic_file_llseek, 3739 .llseek = generic_file_llseek,
3641}; 3740};
3642 3741
3742static const struct file_operations tracing_free_buffer_fops = {
3743 .write = tracing_free_buffer_write,
3744 .release = tracing_free_buffer_release,
3745};
3746
3643static const struct file_operations tracing_mark_fops = { 3747static const struct file_operations tracing_mark_fops = {
3644 .open = tracing_open_generic, 3748 .open = tracing_open_generic,
3645 .write = tracing_mark_write, 3749 .write = tracing_mark_write,
@@ -3696,7 +3800,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3696 return 0; 3800 return 0;
3697 3801
3698 if (!info->spare) 3802 if (!info->spare)
3699 info->spare = ring_buffer_alloc_read_page(info->tr->buffer); 3803 info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
3700 if (!info->spare) 3804 if (!info->spare)
3701 return -ENOMEM; 3805 return -ENOMEM;
3702 3806
@@ -3853,7 +3957,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3853 3957
3854 ref->ref = 1; 3958 ref->ref = 1;
3855 ref->buffer = info->tr->buffer; 3959 ref->buffer = info->tr->buffer;
3856 ref->page = ring_buffer_alloc_read_page(ref->buffer); 3960 ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
3857 if (!ref->page) { 3961 if (!ref->page) {
3858 kfree(ref); 3962 kfree(ref);
3859 break; 3963 break;
@@ -3862,8 +3966,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3862 r = ring_buffer_read_page(ref->buffer, &ref->page, 3966 r = ring_buffer_read_page(ref->buffer, &ref->page,
3863 len, info->cpu, 1); 3967 len, info->cpu, 1);
3864 if (r < 0) { 3968 if (r < 0) {
3865 ring_buffer_free_read_page(ref->buffer, 3969 ring_buffer_free_read_page(ref->buffer, ref->page);
3866 ref->page);
3867 kfree(ref); 3970 kfree(ref);
3868 break; 3971 break;
3869 } 3972 }
@@ -4099,19 +4202,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
4099{ 4202{
4100 struct trace_option_dentry *topt = filp->private_data; 4203 struct trace_option_dentry *topt = filp->private_data;
4101 unsigned long val; 4204 unsigned long val;
4102 char buf[64];
4103 int ret; 4205 int ret;
4104 4206
4105 if (cnt >= sizeof(buf)) 4207 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4106 return -EINVAL; 4208 if (ret)
4107
4108 if (copy_from_user(&buf, ubuf, cnt))
4109 return -EFAULT;
4110
4111 buf[cnt] = 0;
4112
4113 ret = strict_strtoul(buf, 10, &val);
4114 if (ret < 0)
4115 return ret; 4209 return ret;
4116 4210
4117 if (val != 0 && val != 1) 4211 if (val != 0 && val != 1)
@@ -4159,20 +4253,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4159 loff_t *ppos) 4253 loff_t *ppos)
4160{ 4254{
4161 long index = (long)filp->private_data; 4255 long index = (long)filp->private_data;
4162 char buf[64];
4163 unsigned long val; 4256 unsigned long val;
4164 int ret; 4257 int ret;
4165 4258
4166 if (cnt >= sizeof(buf)) 4259 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4167 return -EINVAL; 4260 if (ret)
4168
4169 if (copy_from_user(&buf, ubuf, cnt))
4170 return -EFAULT;
4171
4172 buf[cnt] = 0;
4173
4174 ret = strict_strtoul(buf, 10, &val);
4175 if (ret < 0)
4176 return ret; 4261 return ret;
4177 4262
4178 if (val != 0 && val != 1) 4263 if (val != 0 && val != 1)
@@ -4365,6 +4450,9 @@ static __init int tracer_init_debugfs(void)
4365 trace_create_file("buffer_size_kb", 0644, d_tracer, 4450 trace_create_file("buffer_size_kb", 0644, d_tracer,
4366 &global_trace, &tracing_entries_fops); 4451 &global_trace, &tracing_entries_fops);
4367 4452
4453 trace_create_file("free_buffer", 0644, d_tracer,
4454 &global_trace, &tracing_free_buffer_fops);
4455
4368 trace_create_file("trace_marker", 0220, d_tracer, 4456 trace_create_file("trace_marker", 0220, d_tracer,
4369 NULL, &tracing_mark_fops); 4457 NULL, &tracing_mark_fops);
4370 4458
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 229f8591f61d..3f381d0b20a8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -278,6 +278,29 @@ struct tracer {
278}; 278};
279 279
280 280
281/* Only current can touch trace_recursion */
282#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
283#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
284
285/* Ring buffer has the 10 LSB bits to count */
286#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
287
288/* for function tracing recursion */
289#define TRACE_INTERNAL_BIT (1<<11)
290#define TRACE_GLOBAL_BIT (1<<12)
291/*
292 * Abuse of the trace_recursion.
 293 * We need a way to maintain state while tracing the function graph
 294 * in irq context, because we may want to trace a particular function
 295 * that was called in irq context even though irq tracing is off. Since
 296 * this can only be modified by current, we can reuse trace_recursion.
297 */
298#define TRACE_IRQ_BIT (1<<13)
299
300#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
301#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
302#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
303
281#define TRACE_PIPE_ALL_CPU -1 304#define TRACE_PIPE_ALL_CPU -1
282 305
283int tracer_init(struct tracer *t, struct trace_array *tr); 306int tracer_init(struct tracer *t, struct trace_array *tr);
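
Moving these macros to the top of trace.h lets code further down the header (ftrace_graph_addr()) and the function-graph tracer use them; the new TRACE_IRQ_BIT is plain task-local state, so set/clear/test need no locking. A minimal sketch of the intended use (hypothetical function name):

static bool trace_this_irq_sketch(void)
{
	/* Only "current" ever touches its own trace_recursion word. */
	if (in_irq())
		trace_recursion_set(TRACE_IRQ_BIT);
	else
		trace_recursion_clear(TRACE_IRQ_BIT);

	/* The graph tracer later keys off the same bit: */
	return !in_irq() || trace_recursion_test(TRACE_IRQ_BIT);
}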
@@ -389,6 +412,9 @@ void update_max_tr_single(struct trace_array *tr,
389void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 412void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
390 int skip, int pc); 413 int skip, int pc);
391 414
415void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
416 int skip, int pc, struct pt_regs *regs);
417
392void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, 418void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
393 int pc); 419 int pc);
394 420
@@ -400,6 +426,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer,
400{ 426{
401} 427}
402 428
429static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer,
430 unsigned long flags, int skip,
431 int pc, struct pt_regs *regs)
432{
433}
434
403static inline void ftrace_trace_userstack(struct ring_buffer *buffer, 435static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
404 unsigned long flags, int pc) 436 unsigned long flags, int pc)
405{ 437{
@@ -507,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr)
507 return 1; 539 return 1;
508 540
509 for (i = 0; i < ftrace_graph_count; i++) { 541 for (i = 0; i < ftrace_graph_count; i++) {
510 if (addr == ftrace_graph_funcs[i]) 542 if (addr == ftrace_graph_funcs[i]) {
543 /*
544 * If no irqs are to be traced, but a set_graph_function
545 * is set, and called by an interrupt handler, we still
546 * want to trace it.
547 */
548 if (in_irq())
549 trace_recursion_set(TRACE_IRQ_BIT);
550 else
551 trace_recursion_clear(TRACE_IRQ_BIT);
511 return 1; 552 return 1;
553 }
512 } 554 }
513 555
514 return 0; 556 return 0;
@@ -609,6 +651,7 @@ enum trace_iterator_flags {
609 TRACE_ITER_GRAPH_TIME = 0x80000, 651 TRACE_ITER_GRAPH_TIME = 0x80000,
610 TRACE_ITER_RECORD_CMD = 0x100000, 652 TRACE_ITER_RECORD_CMD = 0x100000,
611 TRACE_ITER_OVERWRITE = 0x200000, 653 TRACE_ITER_OVERWRITE = 0x200000,
654 TRACE_ITER_STOP_ON_FREE = 0x400000,
612}; 655};
613 656
614/* 657/*
@@ -677,6 +720,7 @@ struct event_subsystem {
677 struct dentry *entry; 720 struct dentry *entry;
678 struct event_filter *filter; 721 struct event_filter *filter;
679 int nr_events; 722 int nr_events;
723 int ref_count;
680}; 724};
681 725
682#define FILTER_PRED_INVALID ((unsigned short)-1) 726#define FILTER_PRED_INVALID ((unsigned short)-1)
@@ -784,19 +828,4 @@ extern const char *__stop___trace_bprintk_fmt[];
784 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 828 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
785#include "trace_entries.h" 829#include "trace_entries.h"
786 830
787/* Only current can touch trace_recursion */
788#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
789#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
790
791/* Ring buffer has the 10 LSB bits to count */
792#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
793
794/* for function tracing recursion */
795#define TRACE_INTERNAL_BIT (1<<11)
796#define TRACE_GLOBAL_BIT (1<<12)
797
798#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
799#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
800#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
801
802#endif /* _LINUX_KERNEL_TRACE_H */ 831#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e32744c84d94..93365907f219 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
161 TRACE_STACK, 161 TRACE_STACK,
162 162
163 F_STRUCT( 163 F_STRUCT(
164 __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) 164 __field( int, size )
165 __dynamic_array(unsigned long, caller )
165 ), 166 ),
166 167
167 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 168 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 686ec399f2a8..581876f9f387 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -244,6 +244,35 @@ static void ftrace_clear_events(void)
244 mutex_unlock(&event_mutex); 244 mutex_unlock(&event_mutex);
245} 245}
246 246
247static void __put_system(struct event_subsystem *system)
248{
249 struct event_filter *filter = system->filter;
250
251 WARN_ON_ONCE(system->ref_count == 0);
252 if (--system->ref_count)
253 return;
254
255 if (filter) {
256 kfree(filter->filter_string);
257 kfree(filter);
258 }
259 kfree(system->name);
260 kfree(system);
261}
262
263static void __get_system(struct event_subsystem *system)
264{
265 WARN_ON_ONCE(system->ref_count == 0);
266 system->ref_count++;
267}
268
269static void put_system(struct event_subsystem *system)
270{
271 mutex_lock(&event_mutex);
272 __put_system(system);
273 mutex_unlock(&event_mutex);
274}
275
247/* 276/*
248 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 277 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
249 */ 278 */
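
The ref count closes a use-after-free window: a subsystem directory could previously be torn down while a task still had its "enable" or "filter" file open. The rule the helpers above implement, as a sketch with hypothetical callers:

static void subsystem_file_open_sketch(struct event_subsystem *system)
{
	mutex_lock(&event_mutex);
	__get_system(system);		/* pin while a file holds the pointer */
	mutex_unlock(&event_mutex);
}

static void subsystem_file_release_sketch(struct event_subsystem *system)
{
	put_system(system);		/* last put frees filter, name and system */
}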
@@ -486,20 +515,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
486 loff_t *ppos) 515 loff_t *ppos)
487{ 516{
488 struct ftrace_event_call *call = filp->private_data; 517 struct ftrace_event_call *call = filp->private_data;
489 char buf[64];
490 unsigned long val; 518 unsigned long val;
491 int ret; 519 int ret;
492 520
493 if (cnt >= sizeof(buf)) 521 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
494 return -EINVAL; 522 if (ret)
495
496 if (copy_from_user(&buf, ubuf, cnt))
497 return -EFAULT;
498
499 buf[cnt] = 0;
500
501 ret = strict_strtoul(buf, 10, &val);
502 if (ret < 0)
503 return ret; 523 return ret;
504 524
505 ret = tracing_update_buffers(); 525 ret = tracing_update_buffers();
@@ -528,7 +548,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
528 loff_t *ppos) 548 loff_t *ppos)
529{ 549{
530 const char set_to_char[4] = { '?', '0', '1', 'X' }; 550 const char set_to_char[4] = { '?', '0', '1', 'X' };
531 const char *system = filp->private_data; 551 struct event_subsystem *system = filp->private_data;
532 struct ftrace_event_call *call; 552 struct ftrace_event_call *call;
533 char buf[2]; 553 char buf[2];
534 int set = 0; 554 int set = 0;
@@ -539,7 +559,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
539 if (!call->name || !call->class || !call->class->reg) 559 if (!call->name || !call->class || !call->class->reg)
540 continue; 560 continue;
541 561
542 if (system && strcmp(call->class->system, system) != 0) 562 if (system && strcmp(call->class->system, system->name) != 0)
543 continue; 563 continue;
544 564
545 /* 565 /*
@@ -569,21 +589,13 @@ static ssize_t
569system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 589system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
570 loff_t *ppos) 590 loff_t *ppos)
571{ 591{
572 const char *system = filp->private_data; 592 struct event_subsystem *system = filp->private_data;
593 const char *name = NULL;
573 unsigned long val; 594 unsigned long val;
574 char buf[64];
575 ssize_t ret; 595 ssize_t ret;
576 596
577 if (cnt >= sizeof(buf)) 597 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
578 return -EINVAL; 598 if (ret)
579
580 if (copy_from_user(&buf, ubuf, cnt))
581 return -EFAULT;
582
583 buf[cnt] = 0;
584
585 ret = strict_strtoul(buf, 10, &val);
586 if (ret < 0)
587 return ret; 599 return ret;
588 600
589 ret = tracing_update_buffers(); 601 ret = tracing_update_buffers();
@@ -593,7 +605,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
593 if (val != 0 && val != 1) 605 if (val != 0 && val != 1)
594 return -EINVAL; 606 return -EINVAL;
595 607
596 ret = __ftrace_set_clr_event(NULL, system, NULL, val); 608 /*
609 * Opening of "enable" adds a ref count to system,
610 * so the name is safe to use.
611 */
612 if (system)
613 name = system->name;
614
615 ret = __ftrace_set_clr_event(NULL, name, NULL, val);
597 if (ret) 616 if (ret)
598 goto out; 617 goto out;
599 618
@@ -826,6 +845,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
826 return cnt; 845 return cnt;
827} 846}
828 847
848static LIST_HEAD(event_subsystems);
849
850static int subsystem_open(struct inode *inode, struct file *filp)
851{
852 struct event_subsystem *system = NULL;
853 int ret;
854
855 if (!inode->i_private)
856 goto skip_search;
857
858 /* Make sure the system still exists */
859 mutex_lock(&event_mutex);
860 list_for_each_entry(system, &event_subsystems, list) {
861 if (system == inode->i_private) {
862 /* Don't open systems with no events */
863 if (!system->nr_events) {
864 system = NULL;
865 break;
866 }
867 __get_system(system);
868 break;
869 }
870 }
871 mutex_unlock(&event_mutex);
872
873 if (system != inode->i_private)
874 return -ENODEV;
875
876 skip_search:
877 ret = tracing_open_generic(inode, filp);
878 if (ret < 0 && system)
879 put_system(system);
880
881 return ret;
882}
883
884static int subsystem_release(struct inode *inode, struct file *file)
885{
886 struct event_subsystem *system = inode->i_private;
887
888 if (system)
889 put_system(system);
890
891 return 0;
892}
893
829static ssize_t 894static ssize_t
830subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 895subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
831 loff_t *ppos) 896 loff_t *ppos)
@@ -963,17 +1028,19 @@ static const struct file_operations ftrace_event_filter_fops = {
963}; 1028};
964 1029
965static const struct file_operations ftrace_subsystem_filter_fops = { 1030static const struct file_operations ftrace_subsystem_filter_fops = {
966 .open = tracing_open_generic, 1031 .open = subsystem_open,
967 .read = subsystem_filter_read, 1032 .read = subsystem_filter_read,
968 .write = subsystem_filter_write, 1033 .write = subsystem_filter_write,
969 .llseek = default_llseek, 1034 .llseek = default_llseek,
1035 .release = subsystem_release,
970}; 1036};
971 1037
972static const struct file_operations ftrace_system_enable_fops = { 1038static const struct file_operations ftrace_system_enable_fops = {
973 .open = tracing_open_generic, 1039 .open = subsystem_open,
974 .read = system_enable_read, 1040 .read = system_enable_read,
975 .write = system_enable_write, 1041 .write = system_enable_write,
976 .llseek = default_llseek, 1042 .llseek = default_llseek,
1043 .release = subsystem_release,
977}; 1044};
978 1045
979static const struct file_operations ftrace_show_header_fops = { 1046static const struct file_operations ftrace_show_header_fops = {
@@ -1002,8 +1069,6 @@ static struct dentry *event_trace_events_dir(void)
1002 return d_events; 1069 return d_events;
1003} 1070}
1004 1071
1005static LIST_HEAD(event_subsystems);
1006
1007static struct dentry * 1072static struct dentry *
1008event_subsystem_dir(const char *name, struct dentry *d_events) 1073event_subsystem_dir(const char *name, struct dentry *d_events)
1009{ 1074{
@@ -1013,6 +1078,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1013 /* First see if we did not already create this dir */ 1078 /* First see if we did not already create this dir */
1014 list_for_each_entry(system, &event_subsystems, list) { 1079 list_for_each_entry(system, &event_subsystems, list) {
1015 if (strcmp(system->name, name) == 0) { 1080 if (strcmp(system->name, name) == 0) {
1081 __get_system(system);
1016 system->nr_events++; 1082 system->nr_events++;
1017 return system->entry; 1083 return system->entry;
1018 } 1084 }
@@ -1035,6 +1101,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1035 } 1101 }
1036 1102
1037 system->nr_events = 1; 1103 system->nr_events = 1;
1104 system->ref_count = 1;
1038 system->name = kstrdup(name, GFP_KERNEL); 1105 system->name = kstrdup(name, GFP_KERNEL);
1039 if (!system->name) { 1106 if (!system->name) {
1040 debugfs_remove(system->entry); 1107 debugfs_remove(system->entry);
@@ -1062,8 +1129,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1062 "'%s/filter' entry\n", name); 1129 "'%s/filter' entry\n", name);
1063 } 1130 }
1064 1131
1065 trace_create_file("enable", 0644, system->entry, 1132 trace_create_file("enable", 0644, system->entry, system,
1066 (void *)system->name,
1067 &ftrace_system_enable_fops); 1133 &ftrace_system_enable_fops);
1068 1134
1069 return system->entry; 1135 return system->entry;
@@ -1184,16 +1250,9 @@ static void remove_subsystem_dir(const char *name)
1184 list_for_each_entry(system, &event_subsystems, list) { 1250 list_for_each_entry(system, &event_subsystems, list) {
1185 if (strcmp(system->name, name) == 0) { 1251 if (strcmp(system->name, name) == 0) {
1186 if (!--system->nr_events) { 1252 if (!--system->nr_events) {
1187 struct event_filter *filter = system->filter;
1188
1189 debugfs_remove_recursive(system->entry); 1253 debugfs_remove_recursive(system->entry);
1190 list_del(&system->list); 1254 list_del(&system->list);
1191 if (filter) { 1255 __put_system(system);
1192 kfree(filter->filter_string);
1193 kfree(filter);
1194 }
1195 kfree(system->name);
1196 kfree(system);
1197 } 1256 }
1198 break; 1257 break;
1199 } 1258 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8008ddcfbf20..256764ecccd6 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1886 1886
1887 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1888 1888
1889 /* Make sure the system still has events */
1890 if (!system->nr_events) {
1891 err = -ENODEV;
1892 goto out_unlock;
1893 }
1894
1889 if (!strcmp(strstrip(filter_string), "0")) { 1895 if (!strcmp(strstrip(filter_string), "0")) {
1890 filter_free_subsystem_preds(system); 1896 filter_free_subsystem_preds(system);
1891 remove_filter_string(system->filter); 1897 remove_filter_string(system->filter);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8d0e1cc4e974..c7b0c6a7db09 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -324,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
324} 324}
325 325
326static int 326static int
327ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) 327ftrace_trace_onoff_callback(struct ftrace_hash *hash,
328 char *glob, char *cmd, char *param, int enable)
328{ 329{
329 struct ftrace_probe_ops *ops; 330 struct ftrace_probe_ops *ops;
330 void *count = (void *)-1; 331 void *count = (void *)-1;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 962cdb24ed81..a7d2a4c653d8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = {
74 74
75static struct trace_array *graph_array; 75static struct trace_array *graph_array;
76 76
77/*
 78 * The DURATION column is also used to display IRQ signs; the
 79 * following values are used by print_graph_irq and others
 80 * to fill in space in the DURATION column.
81 */
82enum {
83 DURATION_FILL_FULL = -1,
84 DURATION_FILL_START = -2,
85 DURATION_FILL_END = -3,
86};
87
88static enum print_line_t
89print_graph_duration(unsigned long long duration, struct trace_seq *s,
90 u32 flags);
77 91
78/* Add a function return address to the trace stack on thread info.*/ 92/* Add a function return address to the trace stack on thread info.*/
79int 93int
@@ -213,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr,
213 227
214static inline int ftrace_graph_ignore_irqs(void) 228static inline int ftrace_graph_ignore_irqs(void)
215{ 229{
216 if (!ftrace_graph_skip_irqs) 230 if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
217 return 0; 231 return 0;
218 232
219 return in_irq(); 233 return in_irq();
@@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter,
577 return next; 591 return next;
578} 592}
579 593
580/* Signal a overhead of time execution to the output */
581static int
582print_graph_overhead(unsigned long long duration, struct trace_seq *s,
583 u32 flags)
584{
585 /* If duration disappear, we don't need anything */
586 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
587 return 1;
588
589 /* Non nested entry or return */
590 if (duration == -1)
591 return trace_seq_printf(s, " ");
592
593 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
594 /* Duration exceeded 100 msecs */
595 if (duration > 100000ULL)
596 return trace_seq_printf(s, "! ");
597
598 /* Duration exceeded 10 msecs */
599 if (duration > 10000ULL)
600 return trace_seq_printf(s, "+ ");
601 }
602
603 return trace_seq_printf(s, " ");
604}
605
606static int print_graph_abs_time(u64 t, struct trace_seq *s) 594static int print_graph_abs_time(u64 t, struct trace_seq *s)
607{ 595{
608 unsigned long usecs_rem; 596 unsigned long usecs_rem;
@@ -625,34 +613,36 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
625 addr >= (unsigned long)__irqentry_text_end) 613 addr >= (unsigned long)__irqentry_text_end)
626 return TRACE_TYPE_UNHANDLED; 614 return TRACE_TYPE_UNHANDLED;
627 615
628 /* Absolute time */ 616 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
629 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 617 /* Absolute time */
630 ret = print_graph_abs_time(iter->ts, s); 618 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
631 if (!ret) 619 ret = print_graph_abs_time(iter->ts, s);
632 return TRACE_TYPE_PARTIAL_LINE; 620 if (!ret)
633 } 621 return TRACE_TYPE_PARTIAL_LINE;
622 }
634 623
635 /* Cpu */ 624 /* Cpu */
636 if (flags & TRACE_GRAPH_PRINT_CPU) { 625 if (flags & TRACE_GRAPH_PRINT_CPU) {
637 ret = print_graph_cpu(s, cpu); 626 ret = print_graph_cpu(s, cpu);
638 if (ret == TRACE_TYPE_PARTIAL_LINE) 627 if (ret == TRACE_TYPE_PARTIAL_LINE)
639 return TRACE_TYPE_PARTIAL_LINE; 628 return TRACE_TYPE_PARTIAL_LINE;
640 } 629 }
641 630
642 /* Proc */ 631 /* Proc */
643 if (flags & TRACE_GRAPH_PRINT_PROC) { 632 if (flags & TRACE_GRAPH_PRINT_PROC) {
644 ret = print_graph_proc(s, pid); 633 ret = print_graph_proc(s, pid);
645 if (ret == TRACE_TYPE_PARTIAL_LINE) 634 if (ret == TRACE_TYPE_PARTIAL_LINE)
646 return TRACE_TYPE_PARTIAL_LINE; 635 return TRACE_TYPE_PARTIAL_LINE;
647 ret = trace_seq_printf(s, " | "); 636 ret = trace_seq_printf(s, " | ");
648 if (!ret) 637 if (!ret)
649 return TRACE_TYPE_PARTIAL_LINE; 638 return TRACE_TYPE_PARTIAL_LINE;
639 }
650 } 640 }
651 641
652 /* No overhead */ 642 /* No overhead */
653 ret = print_graph_overhead(-1, s, flags); 643 ret = print_graph_duration(DURATION_FILL_START, s, flags);
654 if (!ret) 644 if (ret != TRACE_TYPE_HANDLED)
655 return TRACE_TYPE_PARTIAL_LINE; 645 return ret;
656 646
657 if (type == TRACE_GRAPH_ENT) 647 if (type == TRACE_GRAPH_ENT)
658 ret = trace_seq_printf(s, "==========>"); 648 ret = trace_seq_printf(s, "==========>");
@@ -662,9 +652,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
662 if (!ret) 652 if (!ret)
663 return TRACE_TYPE_PARTIAL_LINE; 653 return TRACE_TYPE_PARTIAL_LINE;
664 654
665 /* Don't close the duration column if haven't one */ 655 ret = print_graph_duration(DURATION_FILL_END, s, flags);
666 if (flags & TRACE_GRAPH_PRINT_DURATION) 656 if (ret != TRACE_TYPE_HANDLED)
667 trace_seq_printf(s, " |"); 657 return ret;
658
668 ret = trace_seq_printf(s, "\n"); 659 ret = trace_seq_printf(s, "\n");
669 660
670 if (!ret) 661 if (!ret)
@@ -716,9 +707,49 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
716} 707}
717 708
718static enum print_line_t 709static enum print_line_t
719print_graph_duration(unsigned long long duration, struct trace_seq *s) 710print_graph_duration(unsigned long long duration, struct trace_seq *s,
711 u32 flags)
720{ 712{
721 int ret; 713 int ret = -1;
714
715 if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
716 !(trace_flags & TRACE_ITER_CONTEXT_INFO))
717 return TRACE_TYPE_HANDLED;
718
 719 /* No real data, just filling the column with spaces */
720 switch (duration) {
721 case DURATION_FILL_FULL:
722 ret = trace_seq_printf(s, " | ");
723 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
724 case DURATION_FILL_START:
725 ret = trace_seq_printf(s, " ");
726 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
727 case DURATION_FILL_END:
728 ret = trace_seq_printf(s, " |");
729 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
730 }
731
 732 /* Signal an execution time overhead to the output */
733 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
734 /* Duration exceeded 100 msecs */
735 if (duration > 100000ULL)
736 ret = trace_seq_printf(s, "! ");
737 /* Duration exceeded 10 msecs */
738 else if (duration > 10000ULL)
739 ret = trace_seq_printf(s, "+ ");
740 }
741
742 /*
 743 * The -1 means we either did not exceed the duration thresholds
 744 * or we don't want to print out the overhead. Either way we need
745 * to fill out the space.
746 */
747 if (ret == -1)
748 ret = trace_seq_printf(s, " ");
749
 750 /* Catch here any failure that happened above */
751 if (!ret)
752 return TRACE_TYPE_PARTIAL_LINE;
722 753
723 ret = trace_print_graph_duration(duration, s); 754 ret = trace_print_graph_duration(duration, s);
724 if (ret != TRACE_TYPE_HANDLED) 755 if (ret != TRACE_TYPE_HANDLED)
@@ -767,18 +798,11 @@ print_graph_entry_leaf(struct trace_iterator *iter,
767 cpu_data->enter_funcs[call->depth] = 0; 798 cpu_data->enter_funcs[call->depth] = 0;
768 } 799 }
769 800
770 /* Overhead */ 801 /* Overhead and duration */
771 ret = print_graph_overhead(duration, s, flags); 802 ret = print_graph_duration(duration, s, flags);
772 if (!ret) 803 if (ret == TRACE_TYPE_PARTIAL_LINE)
773 return TRACE_TYPE_PARTIAL_LINE; 804 return TRACE_TYPE_PARTIAL_LINE;
774 805
775 /* Duration */
776 if (flags & TRACE_GRAPH_PRINT_DURATION) {
777 ret = print_graph_duration(duration, s);
778 if (ret == TRACE_TYPE_PARTIAL_LINE)
779 return TRACE_TYPE_PARTIAL_LINE;
780 }
781
782 /* Function */ 806 /* Function */
783 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 807 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
784 ret = trace_seq_printf(s, " "); 808 ret = trace_seq_printf(s, " ");
@@ -815,17 +839,10 @@ print_graph_entry_nested(struct trace_iterator *iter,
815 cpu_data->enter_funcs[call->depth] = call->func; 839 cpu_data->enter_funcs[call->depth] = call->func;
816 } 840 }
817 841
818 /* No overhead */
819 ret = print_graph_overhead(-1, s, flags);
820 if (!ret)
821 return TRACE_TYPE_PARTIAL_LINE;
822
823 /* No time */ 842 /* No time */
824 if (flags & TRACE_GRAPH_PRINT_DURATION) { 843 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
825 ret = trace_seq_printf(s, " | "); 844 if (ret != TRACE_TYPE_HANDLED)
826 if (!ret) 845 return ret;
827 return TRACE_TYPE_PARTIAL_LINE;
828 }
829 846
830 /* Function */ 847 /* Function */
831 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 848 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -865,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
865 return TRACE_TYPE_PARTIAL_LINE; 882 return TRACE_TYPE_PARTIAL_LINE;
866 } 883 }
867 884
885 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
886 return 0;
887
868 /* Absolute time */ 888 /* Absolute time */
869 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 889 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
870 ret = print_graph_abs_time(iter->ts, s); 890 ret = print_graph_abs_time(iter->ts, s);
@@ -1078,18 +1098,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1078 if (print_graph_prologue(iter, s, 0, 0, flags)) 1098 if (print_graph_prologue(iter, s, 0, 0, flags))
1079 return TRACE_TYPE_PARTIAL_LINE; 1099 return TRACE_TYPE_PARTIAL_LINE;
1080 1100
1081 /* Overhead */ 1101 /* Overhead and duration */
1082 ret = print_graph_overhead(duration, s, flags); 1102 ret = print_graph_duration(duration, s, flags);
1083 if (!ret) 1103 if (ret == TRACE_TYPE_PARTIAL_LINE)
1084 return TRACE_TYPE_PARTIAL_LINE; 1104 return TRACE_TYPE_PARTIAL_LINE;
1085 1105
1086 /* Duration */
1087 if (flags & TRACE_GRAPH_PRINT_DURATION) {
1088 ret = print_graph_duration(duration, s);
1089 if (ret == TRACE_TYPE_PARTIAL_LINE)
1090 return TRACE_TYPE_PARTIAL_LINE;
1091 }
1092
1093 /* Closing brace */ 1106 /* Closing brace */
1094 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1107 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
1095 ret = trace_seq_printf(s, " "); 1108 ret = trace_seq_printf(s, " ");
@@ -1146,17 +1159,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1146 if (print_graph_prologue(iter, s, 0, 0, flags)) 1159 if (print_graph_prologue(iter, s, 0, 0, flags))
1147 return TRACE_TYPE_PARTIAL_LINE; 1160 return TRACE_TYPE_PARTIAL_LINE;
1148 1161
1149 /* No overhead */
1150 ret = print_graph_overhead(-1, s, flags);
1151 if (!ret)
1152 return TRACE_TYPE_PARTIAL_LINE;
1153
1154 /* No time */ 1162 /* No time */
1155 if (flags & TRACE_GRAPH_PRINT_DURATION) { 1163 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
1156 ret = trace_seq_printf(s, " | "); 1164 if (ret != TRACE_TYPE_HANDLED)
1157 if (!ret) 1165 return ret;
1158 return TRACE_TYPE_PARTIAL_LINE;
1159 }
1160 1166
1161 /* Indentation */ 1167 /* Indentation */
1162 if (depth > 0) 1168 if (depth > 0)
@@ -1207,7 +1213,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1207 1213
1208 1214
1209enum print_line_t 1215enum print_line_t
1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1216print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1211{ 1217{
1212 struct ftrace_graph_ent_entry *field; 1218 struct ftrace_graph_ent_entry *field;
1213 struct fgraph_data *data = iter->private; 1219 struct fgraph_data *data = iter->private;
@@ -1270,18 +1276,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1270static enum print_line_t 1276static enum print_line_t
1271print_graph_function(struct trace_iterator *iter) 1277print_graph_function(struct trace_iterator *iter)
1272{ 1278{
1273 return __print_graph_function_flags(iter, tracer_flags.val); 1279 return print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1285} 1280}
1286 1281
1287static enum print_line_t 1282static enum print_line_t
@@ -1309,8 +1304,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1309 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); 1304 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
1310 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); 1305 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
1311 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); 1306 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
1312 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); 1307 seq_printf(s, "#%.*s||| / \n", size, spaces);
1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1314} 1308}
1315 1309
1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags) 1310static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
@@ -1329,7 +1323,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1329 if (flags & TRACE_GRAPH_PRINT_PROC) 1323 if (flags & TRACE_GRAPH_PRINT_PROC)
1330 seq_printf(s, " TASK/PID "); 1324 seq_printf(s, " TASK/PID ");
1331 if (lat) 1325 if (lat)
1332 seq_printf(s, "|||||"); 1326 seq_printf(s, "||||");
1333 if (flags & TRACE_GRAPH_PRINT_DURATION) 1327 if (flags & TRACE_GRAPH_PRINT_DURATION)
1334 seq_printf(s, " DURATION "); 1328 seq_printf(s, " DURATION ");
1335 seq_printf(s, " FUNCTION CALLS\n"); 1329 seq_printf(s, " FUNCTION CALLS\n");
@@ -1343,7 +1337,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1343 if (flags & TRACE_GRAPH_PRINT_PROC) 1337 if (flags & TRACE_GRAPH_PRINT_PROC)
1344 seq_printf(s, " | | "); 1338 seq_printf(s, " | | ");
1345 if (lat) 1339 if (lat)
1346 seq_printf(s, "|||||"); 1340 seq_printf(s, "||||");
1347 if (flags & TRACE_GRAPH_PRINT_DURATION) 1341 if (flags & TRACE_GRAPH_PRINT_DURATION)
1348 seq_printf(s, " | | "); 1342 seq_printf(s, " | | ");
1349 seq_printf(s, " | | | |\n"); 1343 seq_printf(s, " | | | |\n");
@@ -1358,15 +1352,16 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{ 1352{
1359 struct trace_iterator *iter = s->private; 1353 struct trace_iterator *iter = s->private;
1360 1354
1355 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
1356 return;
1357
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) { 1358 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */ 1359 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter)) 1360 if (trace_empty(iter))
1364 return; 1361 return;
1365 1362
1366 print_trace_header(s, iter); 1363 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION; 1364 }
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370 1365
1371 __print_graph_headers_flags(s, flags); 1366 __print_graph_headers_flags(s, flags);
1372} 1367}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index c77424be284d..667aa8cc0cfc 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -226,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
226} 226}
227 227
228#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ 228#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
229 TRACE_GRAPH_PRINT_PROC) 229 TRACE_GRAPH_PRINT_PROC | \
230 TRACE_GRAPH_PRINT_ABS_TIME | \
231 TRACE_GRAPH_PRINT_DURATION)
230 232
231static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 233static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
232{ 234{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 27d13b36b8be..5fb3697bf0e5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -343,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref)
343DEFINE_FETCH_deref(string) 343DEFINE_FETCH_deref(string)
344DEFINE_FETCH_deref(string_size) 344DEFINE_FETCH_deref(string_size)
345 345
346static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
347{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
349 update_deref_fetch_param(data->orig.data);
350 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
351 update_symbol_cache(data->orig.data);
352}
353
346static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 354static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
347{ 355{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
@@ -377,6 +385,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield)
377#define fetch_bitfield_string_size NULL 385#define fetch_bitfield_string_size NULL
378 386
379static __kprobes void 387static __kprobes void
388update_bitfield_fetch_param(struct bitfield_fetch_param *data)
389{
390 /*
391 * Don't check the bitfield itself, because this must be the
392 * last fetch function.
393 */
394 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
395 update_deref_fetch_param(data->orig.data);
396 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
397 update_symbol_cache(data->orig.data);
398}
399
400static __kprobes void
380free_bitfield_fetch_param(struct bitfield_fetch_param *data) 401free_bitfield_fetch_param(struct bitfield_fetch_param *data)
381{ 402{
382 /* 403 /*
@@ -389,6 +410,7 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
389 free_symbol_cache(data->orig.data); 410 free_symbol_cache(data->orig.data);
390 kfree(data); 411 kfree(data);
391} 412}
413
392/* Default (unsigned long) fetch type */ 414/* Default (unsigned long) fetch type */
393#define __DEFAULT_FETCH_TYPE(t) u##t 415#define __DEFAULT_FETCH_TYPE(t) u##t
394#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 416#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -536,6 +558,7 @@ struct probe_arg {
536/* Flags for trace_probe */ 558/* Flags for trace_probe */
537#define TP_FLAG_TRACE 1 559#define TP_FLAG_TRACE 1
538#define TP_FLAG_PROFILE 2 560#define TP_FLAG_PROFILE 2
561#define TP_FLAG_REGISTERED 4
539 562
540struct trace_probe { 563struct trace_probe {
541 struct list_head list; 564 struct list_head list;
@@ -555,16 +578,49 @@ struct trace_probe {
555 (sizeof(struct probe_arg) * (n))) 578 (sizeof(struct probe_arg) * (n)))
556 579
557 580
558static __kprobes int probe_is_return(struct trace_probe *tp) 581static __kprobes int trace_probe_is_return(struct trace_probe *tp)
559{ 582{
560 return tp->rp.handler != NULL; 583 return tp->rp.handler != NULL;
561} 584}
562 585
563static __kprobes const char *probe_symbol(struct trace_probe *tp) 586static __kprobes const char *trace_probe_symbol(struct trace_probe *tp)
564{ 587{
565 return tp->symbol ? tp->symbol : "unknown"; 588 return tp->symbol ? tp->symbol : "unknown";
566} 589}
567 590
591static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp)
592{
593 return tp->rp.kp.offset;
594}
595
596static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp)
597{
598 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
599}
600
601static __kprobes bool trace_probe_is_registered(struct trace_probe *tp)
602{
603 return !!(tp->flags & TP_FLAG_REGISTERED);
604}
605
606static __kprobes bool trace_probe_has_gone(struct trace_probe *tp)
607{
608 return !!(kprobe_gone(&tp->rp.kp));
609}
610
611static __kprobes bool trace_probe_within_module(struct trace_probe *tp,
612 struct module *mod)
613{
614 int len = strlen(mod->name);
615 const char *name = trace_probe_symbol(tp);
616 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
617}
618
619static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
620{
621 return !!strchr(trace_probe_symbol(tp), ':');
622}
623
568static int register_probe_event(struct trace_probe *tp); 624static int register_probe_event(struct trace_probe *tp);
569static void unregister_probe_event(struct trace_probe *tp); 625static void unregister_probe_event(struct trace_probe *tp);
570 626
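
The helpers above encode the new MOD:SYMBOL convention: a probe symbol carrying a "module:" prefix belongs to that module. As a quick illustration, here is a minimal userspace sketch of the same prefix check; the function names and the ext4 examples are made up for illustration, they are not part of this patch.

/* Userspace sketch of trace_probe_within_module()/_is_on_module(). */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool symbol_is_on_module(const char *symbol)
{
	return strchr(symbol, ':') != NULL;
}

static bool symbol_within_module(const char *symbol, const char *modname)
{
	size_t len = strlen(modname);

	/* module name must be a prefix, immediately followed by ':' */
	return strncmp(modname, symbol, len) == 0 && symbol[len] == ':';
}

int main(void)
{
	printf("%d\n", symbol_within_module("ext4:ext4_sync_fs", "ext4")); /* 1 */
	printf("%d\n", symbol_within_module("do_fork", "ext4"));           /* 0 */
	printf("%d\n", symbol_is_on_module("do_fork"));                    /* 0 */
	return 0;
}
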
@@ -646,6 +702,16 @@ error:
646 return ERR_PTR(ret); 702 return ERR_PTR(ret);
647} 703}
648 704
705static void update_probe_arg(struct probe_arg *arg)
706{
707 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
708 update_bitfield_fetch_param(arg->fetch.data);
709 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
710 update_deref_fetch_param(arg->fetch.data);
711 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
712 update_symbol_cache(arg->fetch.data);
713}
714
649static void free_probe_arg(struct probe_arg *arg) 715static void free_probe_arg(struct probe_arg *arg)
650{ 716{
651 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) 717 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
@@ -671,7 +737,7 @@ static void free_trace_probe(struct trace_probe *tp)
671 kfree(tp); 737 kfree(tp);
672} 738}
673 739
674static struct trace_probe *find_probe_event(const char *event, 740static struct trace_probe *find_trace_probe(const char *event,
675 const char *group) 741 const char *group)
676{ 742{
677 struct trace_probe *tp; 743 struct trace_probe *tp;
@@ -683,13 +749,96 @@ static struct trace_probe *find_probe_event(const char *event,
683 return NULL; 749 return NULL;
684} 750}
685 751
752/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
753static int enable_trace_probe(struct trace_probe *tp, int flag)
754{
755 int ret = 0;
756
757 tp->flags |= flag;
758 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) &&
759 !trace_probe_has_gone(tp)) {
760 if (trace_probe_is_return(tp))
761 ret = enable_kretprobe(&tp->rp);
762 else
763 ret = enable_kprobe(&tp->rp.kp);
764 }
765
766 return ret;
767}
768
769/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
770static void disable_trace_probe(struct trace_probe *tp, int flag)
771{
772 tp->flags &= ~flag;
773 if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) {
774 if (trace_probe_is_return(tp))
775 disable_kretprobe(&tp->rp);
776 else
777 disable_kprobe(&tp->rp.kp);
778 }
779}
780
781/* Internal register function - just handle k*probes and flags */
782static int __register_trace_probe(struct trace_probe *tp)
783{
784 int i, ret;
785
786 if (trace_probe_is_registered(tp))
787 return -EINVAL;
788
789 for (i = 0; i < tp->nr_args; i++)
790 update_probe_arg(&tp->args[i]);
791
 792 /* Set/clear disabled flag according to tp->flags */
793 if (trace_probe_is_enabled(tp))
794 tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
795 else
796 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
797
798 if (trace_probe_is_return(tp))
799 ret = register_kretprobe(&tp->rp);
800 else
801 ret = register_kprobe(&tp->rp.kp);
802
803 if (ret == 0)
804 tp->flags |= TP_FLAG_REGISTERED;
805 else {
806 pr_warning("Could not insert probe at %s+%lu: %d\n",
807 trace_probe_symbol(tp), trace_probe_offset(tp), ret);
808 if (ret == -ENOENT && trace_probe_is_on_module(tp)) {
 809 pr_warning("This probe might be able to register after "
810 "target module is loaded. Continue.\n");
811 ret = 0;
812 } else if (ret == -EILSEQ) {
813 pr_warning("Probing address(0x%p) is not an "
814 "instruction boundary.\n",
815 tp->rp.kp.addr);
816 ret = -EINVAL;
817 }
818 }
819
820 return ret;
821}
822
823/* Internal unregister function - just handle k*probes and flags */
824static void __unregister_trace_probe(struct trace_probe *tp)
825{
826 if (trace_probe_is_registered(tp)) {
827 if (trace_probe_is_return(tp))
828 unregister_kretprobe(&tp->rp);
829 else
830 unregister_kprobe(&tp->rp.kp);
831 tp->flags &= ~TP_FLAG_REGISTERED;
832 /* Cleanup kprobe for reuse */
833 if (tp->rp.kp.symbol_name)
834 tp->rp.kp.addr = NULL;
835 }
836}
837
686/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 838/* Unregister a trace_probe and probe_event: call with locking probe_lock */
687static void unregister_trace_probe(struct trace_probe *tp) 839static void unregister_trace_probe(struct trace_probe *tp)
688{ 840{
689 if (probe_is_return(tp)) 841 __unregister_trace_probe(tp);
690 unregister_kretprobe(&tp->rp);
691 else
692 unregister_kprobe(&tp->rp.kp);
693 list_del(&tp->list); 842 list_del(&tp->list);
694 unregister_probe_event(tp); 843 unregister_probe_event(tp);
695} 844}
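
enable_trace_probe()/disable_trace_probe() above treat TP_FLAG_TRACE and TP_FLAG_PROFILE as two independent users of the same k*probe: the probe stays armed while either flag is set and is disarmed only when the last user clears its flag (the registered/gone checks are elided here). A minimal userspace model of that state machine, for illustration only:

/* Userspace model of the two-user enable/disable flag logic above. */
#include <assert.h>

#define TP_FLAG_TRACE	1
#define TP_FLAG_PROFILE	2

struct model { unsigned int flags; int armed; };

static void model_enable(struct model *m, int flag)
{
	m->flags |= flag;
	if (m->flags)		/* any user present -> arm the probe */
		m->armed = 1;
}

static void model_disable(struct model *m, int flag)
{
	m->flags &= ~flag;
	if (!m->flags)		/* last user gone -> disarm the probe */
		m->armed = 0;
}

int main(void)
{
	struct model m = { 0, 0 };

	model_enable(&m, TP_FLAG_TRACE);	/* ftrace user */
	model_enable(&m, TP_FLAG_PROFILE);	/* perf user */
	model_disable(&m, TP_FLAG_TRACE);
	assert(m.armed);			/* perf still holds it */
	model_disable(&m, TP_FLAG_PROFILE);
	assert(!m.armed);			/* now fully disabled */
	return 0;
}
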
@@ -702,41 +851,65 @@ static int register_trace_probe(struct trace_probe *tp)
702 851
703 mutex_lock(&probe_lock); 852 mutex_lock(&probe_lock);
704 853
 705 /* register as an event */ 854 /* Delete old (same name) event if it exists */
706 old_tp = find_probe_event(tp->call.name, tp->call.class->system); 855 old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
707 if (old_tp) { 856 if (old_tp) {
708 /* delete old event */
709 unregister_trace_probe(old_tp); 857 unregister_trace_probe(old_tp);
710 free_trace_probe(old_tp); 858 free_trace_probe(old_tp);
711 } 859 }
860
861 /* Register new event */
712 ret = register_probe_event(tp); 862 ret = register_probe_event(tp);
713 if (ret) { 863 if (ret) {
714 pr_warning("Failed to register probe event(%d)\n", ret); 864 pr_warning("Failed to register probe event(%d)\n", ret);
715 goto end; 865 goto end;
716 } 866 }
717 867
718 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; 868 /* Register k*probe */
719 if (probe_is_return(tp)) 869 ret = __register_trace_probe(tp);
720 ret = register_kretprobe(&tp->rp); 870 if (ret < 0)
721 else
722 ret = register_kprobe(&tp->rp.kp);
723
724 if (ret) {
725 pr_warning("Could not insert probe(%d)\n", ret);
726 if (ret == -EILSEQ) {
727 pr_warning("Probing address(0x%p) is not an "
728 "instruction boundary.\n",
729 tp->rp.kp.addr);
730 ret = -EINVAL;
731 }
732 unregister_probe_event(tp); 871 unregister_probe_event(tp);
733 } else 872 else
734 list_add_tail(&tp->list, &probe_list); 873 list_add_tail(&tp->list, &probe_list);
874
735end: 875end:
736 mutex_unlock(&probe_lock); 876 mutex_unlock(&probe_lock);
737 return ret; 877 return ret;
738} 878}
739 879
880/* Module notifier call back, checking event on the module */
881static int trace_probe_module_callback(struct notifier_block *nb,
882 unsigned long val, void *data)
883{
884 struct module *mod = data;
885 struct trace_probe *tp;
886 int ret;
887
888 if (val != MODULE_STATE_COMING)
889 return NOTIFY_DONE;
890
891 /* Update probes on coming module */
892 mutex_lock(&probe_lock);
893 list_for_each_entry(tp, &probe_list, list) {
894 if (trace_probe_within_module(tp, mod)) {
895 __unregister_trace_probe(tp);
896 ret = __register_trace_probe(tp);
897 if (ret)
 898 pr_warning("Failed to re-register probe %s on "
899 "%s: %d\n",
900 tp->call.name, mod->name, ret);
901 }
902 }
903 mutex_unlock(&probe_lock);
904
905 return NOTIFY_DONE;
906}
907
908static struct notifier_block trace_probe_module_nb = {
909 .notifier_call = trace_probe_module_callback,
910 .priority = 1 /* Invoked after kprobe module callback */
911};
912
740/* Split symbol and offset. */ 913/* Split symbol and offset. */
741static int split_symbol_offset(char *symbol, unsigned long *offset) 914static int split_symbol_offset(char *symbol, unsigned long *offset)
742{ 915{
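
The callback above follows the standard module-notifier pattern: register a notifier_block and react when a module reaches MODULE_STATE_COMING. A kernel-style sketch of that pattern with illustrative names (my_module_cb, my_init), not taken from this patch:

#include <linux/module.h>
#include <linux/notifier.h>

static int my_module_cb(struct notifier_block *nb, unsigned long val,
			void *data)
{
	struct module *mod = data;

	if (val != MODULE_STATE_COMING)
		return NOTIFY_DONE;

	pr_info("module %s is coming up\n", mod->name);
	/* re-register anything that was waiting for this module here */
	return NOTIFY_DONE;
}

static struct notifier_block my_module_nb = {
	.notifier_call = my_module_cb,
};

static int __init my_init(void)
{
	return register_module_notifier(&my_module_nb);
}
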
@@ -962,8 +1135,8 @@ static int create_trace_probe(int argc, char **argv)
962{ 1135{
963 /* 1136 /*
964 * Argument syntax: 1137 * Argument syntax:
965 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 1138 * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
966 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 1139 * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
967 * Fetch args: 1140 * Fetch args:
968 * $retval : fetch return value 1141 * $retval : fetch return value
969 * $stack : fetch stack address 1142 * $stack : fetch stack address
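
With the new [MOD:] prefix a probe can be defined before its target module is loaded; the module notifier above then registers it when the module appears. A hedged example of feeding such a definition to the kprobe_events control file from C; the debugfs path, event name and symbol below are assumptions for illustration, not taken from this patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* probe ext4_sync_fs in the (possibly not yet loaded) ext4 module */
	const char *def = "p:myprobe ext4:ext4_sync_fs\n";
	int fd = open("/sys/kernel/debug/tracing/kprobe_events",
		      O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("open kprobe_events");
		return 1;
	}
	if (write(fd, def, strlen(def)) < 0)
		perror("write probe definition");
	close(fd);
	return 0;
}
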
@@ -1025,7 +1198,7 @@ static int create_trace_probe(int argc, char **argv)
1025 return -EINVAL; 1198 return -EINVAL;
1026 } 1199 }
1027 mutex_lock(&probe_lock); 1200 mutex_lock(&probe_lock);
1028 tp = find_probe_event(event, group); 1201 tp = find_trace_probe(event, group);
1029 if (!tp) { 1202 if (!tp) {
1030 mutex_unlock(&probe_lock); 1203 mutex_unlock(&probe_lock);
1031 pr_info("Event %s/%s doesn't exist.\n", group, event); 1204 pr_info("Event %s/%s doesn't exist.\n", group, event);
@@ -1144,7 +1317,7 @@ error:
1144 return ret; 1317 return ret;
1145} 1318}
1146 1319
1147static void cleanup_all_probes(void) 1320static void release_all_trace_probes(void)
1148{ 1321{
1149 struct trace_probe *tp; 1322 struct trace_probe *tp;
1150 1323
@@ -1158,7 +1331,6 @@ static void cleanup_all_probes(void)
1158 mutex_unlock(&probe_lock); 1331 mutex_unlock(&probe_lock);
1159} 1332}
1160 1333
1161
1162/* Probes listing interfaces */ 1334/* Probes listing interfaces */
1163static void *probes_seq_start(struct seq_file *m, loff_t *pos) 1335static void *probes_seq_start(struct seq_file *m, loff_t *pos)
1164{ 1336{
@@ -1181,15 +1353,16 @@ static int probes_seq_show(struct seq_file *m, void *v)
1181 struct trace_probe *tp = v; 1353 struct trace_probe *tp = v;
1182 int i; 1354 int i;
1183 1355
1184 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 1356 seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p');
1185 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); 1357 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
1186 1358
1187 if (!tp->symbol) 1359 if (!tp->symbol)
1188 seq_printf(m, " 0x%p", tp->rp.kp.addr); 1360 seq_printf(m, " 0x%p", tp->rp.kp.addr);
1189 else if (tp->rp.kp.offset) 1361 else if (tp->rp.kp.offset)
1190 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); 1362 seq_printf(m, " %s+%u", trace_probe_symbol(tp),
1363 tp->rp.kp.offset);
1191 else 1364 else
1192 seq_printf(m, " %s", probe_symbol(tp)); 1365 seq_printf(m, " %s", trace_probe_symbol(tp));
1193 1366
1194 for (i = 0; i < tp->nr_args; i++) 1367 for (i = 0; i < tp->nr_args; i++)
1195 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); 1368 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
@@ -1209,7 +1382,7 @@ static int probes_open(struct inode *inode, struct file *file)
1209{ 1382{
1210 if ((file->f_mode & FMODE_WRITE) && 1383 if ((file->f_mode & FMODE_WRITE) &&
1211 (file->f_flags & O_TRUNC)) 1384 (file->f_flags & O_TRUNC))
1212 cleanup_all_probes(); 1385 release_all_trace_probes();
1213 1386
1214 return seq_open(file, &probes_seq_op); 1387 return seq_open(file, &probes_seq_op);
1215} 1388}
@@ -1397,7 +1570,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1397 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1570 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1398 1571
1399 if (!filter_current_check_discard(buffer, call, entry, event)) 1572 if (!filter_current_check_discard(buffer, call, entry, event))
1400 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1573 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1574 irq_flags, pc, regs);
1401} 1575}
1402 1576
1403/* Kretprobe handler */ 1577/* Kretprobe handler */
@@ -1429,7 +1603,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1429 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1603 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1430 1604
1431 if (!filter_current_check_discard(buffer, call, entry, event)) 1605 if (!filter_current_check_discard(buffer, call, entry, event))
1432 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1606 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1607 irq_flags, pc, regs);
1433} 1608}
1434 1609
1435/* Event entry printers */ 1610/* Event entry printers */
@@ -1511,30 +1686,6 @@ partial:
1511 return TRACE_TYPE_PARTIAL_LINE; 1686 return TRACE_TYPE_PARTIAL_LINE;
1512} 1687}
1513 1688
1514static int probe_event_enable(struct ftrace_event_call *call)
1515{
1516 struct trace_probe *tp = (struct trace_probe *)call->data;
1517
1518 tp->flags |= TP_FLAG_TRACE;
1519 if (probe_is_return(tp))
1520 return enable_kretprobe(&tp->rp);
1521 else
1522 return enable_kprobe(&tp->rp.kp);
1523}
1524
1525static void probe_event_disable(struct ftrace_event_call *call)
1526{
1527 struct trace_probe *tp = (struct trace_probe *)call->data;
1528
1529 tp->flags &= ~TP_FLAG_TRACE;
1530 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1531 if (probe_is_return(tp))
1532 disable_kretprobe(&tp->rp);
1533 else
1534 disable_kprobe(&tp->rp.kp);
1535 }
1536}
1537
1538#undef DEFINE_FIELD 1689#undef DEFINE_FIELD
1539#define DEFINE_FIELD(type, item, name, is_signed) \ 1690#define DEFINE_FIELD(type, item, name, is_signed) \
1540 do { \ 1691 do { \
@@ -1596,7 +1747,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1596 1747
1597 const char *fmt, *arg; 1748 const char *fmt, *arg;
1598 1749
1599 if (!probe_is_return(tp)) { 1750 if (!trace_probe_is_return(tp)) {
1600 fmt = "(%lx)"; 1751 fmt = "(%lx)";
1601 arg = "REC->" FIELD_STRING_IP; 1752 arg = "REC->" FIELD_STRING_IP;
1602 } else { 1753 } else {
@@ -1713,49 +1864,25 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1713 head = this_cpu_ptr(call->perf_events); 1864 head = this_cpu_ptr(call->perf_events);
1714 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1865 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1715} 1866}
1716
1717static int probe_perf_enable(struct ftrace_event_call *call)
1718{
1719 struct trace_probe *tp = (struct trace_probe *)call->data;
1720
1721 tp->flags |= TP_FLAG_PROFILE;
1722
1723 if (probe_is_return(tp))
1724 return enable_kretprobe(&tp->rp);
1725 else
1726 return enable_kprobe(&tp->rp.kp);
1727}
1728
1729static void probe_perf_disable(struct ftrace_event_call *call)
1730{
1731 struct trace_probe *tp = (struct trace_probe *)call->data;
1732
1733 tp->flags &= ~TP_FLAG_PROFILE;
1734
1735 if (!(tp->flags & TP_FLAG_TRACE)) {
1736 if (probe_is_return(tp))
1737 disable_kretprobe(&tp->rp);
1738 else
1739 disable_kprobe(&tp->rp.kp);
1740 }
1741}
1742#endif /* CONFIG_PERF_EVENTS */ 1867#endif /* CONFIG_PERF_EVENTS */
1743 1868
1744static __kprobes 1869static __kprobes
1745int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) 1870int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1746{ 1871{
1872 struct trace_probe *tp = (struct trace_probe *)event->data;
1873
1747 switch (type) { 1874 switch (type) {
1748 case TRACE_REG_REGISTER: 1875 case TRACE_REG_REGISTER:
1749 return probe_event_enable(event); 1876 return enable_trace_probe(tp, TP_FLAG_TRACE);
1750 case TRACE_REG_UNREGISTER: 1877 case TRACE_REG_UNREGISTER:
1751 probe_event_disable(event); 1878 disable_trace_probe(tp, TP_FLAG_TRACE);
1752 return 0; 1879 return 0;
1753 1880
1754#ifdef CONFIG_PERF_EVENTS 1881#ifdef CONFIG_PERF_EVENTS
1755 case TRACE_REG_PERF_REGISTER: 1882 case TRACE_REG_PERF_REGISTER:
1756 return probe_perf_enable(event); 1883 return enable_trace_probe(tp, TP_FLAG_PROFILE);
1757 case TRACE_REG_PERF_UNREGISTER: 1884 case TRACE_REG_PERF_UNREGISTER:
1758 probe_perf_disable(event); 1885 disable_trace_probe(tp, TP_FLAG_PROFILE);
1759 return 0; 1886 return 0;
1760#endif 1887#endif
1761 } 1888 }
@@ -1805,7 +1932,7 @@ static int register_probe_event(struct trace_probe *tp)
1805 1932
1806 /* Initialize ftrace_event_call */ 1933 /* Initialize ftrace_event_call */
1807 INIT_LIST_HEAD(&call->class->fields); 1934 INIT_LIST_HEAD(&call->class->fields);
1808 if (probe_is_return(tp)) { 1935 if (trace_probe_is_return(tp)) {
1809 call->event.funcs = &kretprobe_funcs; 1936 call->event.funcs = &kretprobe_funcs;
1810 call->class->define_fields = kretprobe_event_define_fields; 1937 call->class->define_fields = kretprobe_event_define_fields;
1811 } else { 1938 } else {
@@ -1844,6 +1971,9 @@ static __init int init_kprobe_trace(void)
1844 struct dentry *d_tracer; 1971 struct dentry *d_tracer;
1845 struct dentry *entry; 1972 struct dentry *entry;
1846 1973
1974 if (register_module_notifier(&trace_probe_module_nb))
1975 return -EINVAL;
1976
1847 d_tracer = tracing_init_dentry(); 1977 d_tracer = tracing_init_dentry();
1848 if (!d_tracer) 1978 if (!d_tracer)
1849 return 0; 1979 return 0;
@@ -1897,12 +2027,12 @@ static __init int kprobe_trace_self_tests_init(void)
1897 warn++; 2027 warn++;
1898 } else { 2028 } else {
1899 /* Enable trace point */ 2029 /* Enable trace point */
1900 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); 2030 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
1901 if (WARN_ON_ONCE(tp == NULL)) { 2031 if (WARN_ON_ONCE(tp == NULL)) {
1902 pr_warning("error on getting new probe.\n"); 2032 pr_warning("error on getting new probe.\n");
1903 warn++; 2033 warn++;
1904 } else 2034 } else
1905 probe_event_enable(&tp->call); 2035 enable_trace_probe(tp, TP_FLAG_TRACE);
1906 } 2036 }
1907 2037
1908 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 2038 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
@@ -1912,12 +2042,12 @@ static __init int kprobe_trace_self_tests_init(void)
1912 warn++; 2042 warn++;
1913 } else { 2043 } else {
1914 /* Enable trace point */ 2044 /* Enable trace point */
1915 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); 2045 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
1916 if (WARN_ON_ONCE(tp == NULL)) { 2046 if (WARN_ON_ONCE(tp == NULL)) {
1917 pr_warning("error on getting new probe.\n"); 2047 pr_warning("error on getting new probe.\n");
1918 warn++; 2048 warn++;
1919 } else 2049 } else
1920 probe_event_enable(&tp->call); 2050 enable_trace_probe(tp, TP_FLAG_TRACE);
1921 } 2051 }
1922 2052
1923 if (warn) 2053 if (warn)
@@ -1938,7 +2068,7 @@ static __init int kprobe_trace_self_tests_init(void)
1938 } 2068 }
1939 2069
1940end: 2070end:
1941 cleanup_all_probes(); 2071 release_all_trace_probes();
1942 if (warn) 2072 if (warn)
1943 pr_cont("NG: Some tests are failed. Please check them.\n"); 2073 pr_cont("NG: Some tests are failed. Please check them.\n");
1944 else 2074 else
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e37de492a9e1..51999309a6cf 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1107,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1107{ 1107{
1108 struct stack_entry *field; 1108 struct stack_entry *field;
1109 struct trace_seq *s = &iter->seq; 1109 struct trace_seq *s = &iter->seq;
1110 int i; 1110 unsigned long *p;
1111 unsigned long *end;
1111 1112
1112 trace_assign_type(field, iter->ent); 1113 trace_assign_type(field, iter->ent);
1114 end = (unsigned long *)((long)iter->ent + iter->ent_size);
1113 1115
1114 if (!trace_seq_puts(s, "<stack trace>\n")) 1116 if (!trace_seq_puts(s, "<stack trace>\n"))
1115 goto partial; 1117 goto partial;
1116 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1118
1117 if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) 1119 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
1118 break;
1119 if (!trace_seq_puts(s, " => ")) 1120 if (!trace_seq_puts(s, " => "))
1120 goto partial; 1121 goto partial;
1121 1122
1122 if (!seq_print_ip_sym(s, field->caller[i], flags)) 1123 if (!seq_print_ip_sym(s, *p, flags))
1123 goto partial; 1124 goto partial;
1124 if (!trace_seq_puts(s, "\n")) 1125 if (!trace_seq_puts(s, "\n"))
1125 goto partial; 1126 goto partial;
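
The rewritten loop above walks the caller array up to the size recorded in the trace entry instead of a fixed FTRACE_STACK_ENTRIES bound, stopping early at a ULONG_MAX terminator. A small userspace sketch of the same size-bounded walk; the sample addresses are arbitrary:

#include <limits.h>
#include <stdio.h>

/* entry_size is the size in bytes of the variable-length caller array */
static void print_stack(const unsigned long *caller, size_t entry_size)
{
	const unsigned long *end =
		(const unsigned long *)((const char *)caller + entry_size);
	const unsigned long *p;

	for (p = caller; p < end && *p != ULONG_MAX; p++)
		printf(" => %#lx\n", *p);
}

int main(void)
{
	unsigned long stack[] = { 0xc0401000UL, 0xc0402000UL, ULONG_MAX, 0 };

	print_stack(stack, sizeof(stack));
	return 0;
}
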
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index f029dd4fd2ca..e4a70c0c71b6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -227,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter)
227 graph_trace_close(iter); 227 graph_trace_close(iter);
228} 228}
229 229
230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) 230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \
231 TRACE_GRAPH_PRINT_ABS_TIME | \
232 TRACE_GRAPH_PRINT_DURATION)
231 233
232static enum print_line_t wakeup_print_line(struct trace_iterator *iter) 234static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
233{ 235{
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b0b53b8e4c25..77575b386d97 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -156,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
156{ 156{
157 long *ptr = filp->private_data; 157 long *ptr = filp->private_data;
158 unsigned long val, flags; 158 unsigned long val, flags;
159 char buf[64];
160 int ret; 159 int ret;
161 int cpu; 160 int cpu;
162 161
163 if (count >= sizeof(buf)) 162 ret = kstrtoul_from_user(ubuf, count, 10, &val);
164 return -EINVAL; 163 if (ret)
165
166 if (copy_from_user(&buf, ubuf, count))
167 return -EFAULT;
168
169 buf[count] = 0;
170
171 ret = strict_strtoul(buf, 10, &val);
172 if (ret < 0)
173 return ret; 164 return ret;
174 165
175 local_irq_save(flags); 166 local_irq_save(flags);
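
kstrtoul_from_user() collapses the old copy_from_user()/NUL-terminate/strict_strtoul() sequence into a single call that parses straight from the user buffer. A kernel-style sketch (not a complete module) of the resulting write-handler pattern; the handler name and my_val are illustrative, not part of this patch:

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>

static unsigned long my_val;

static ssize_t my_write(struct file *filp, const char __user *ubuf,
			size_t count, loff_t *ppos)
{
	unsigned long val;
	int ret;

	/* parse a base-10 unsigned long directly from the user buffer */
	ret = kstrtoul_from_user(ubuf, count, 10, &val);
	if (ret)
		return ret;	/* -EINVAL, -ERANGE or -EFAULT */

	my_val = val;
	return count;
}
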
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3d0c56ad4792..36491cd5b7d4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -200,6 +200,7 @@ static int is_softlockup(unsigned long touch_ts)
200} 200}
201 201
202#ifdef CONFIG_HARDLOCKUP_DETECTOR 202#ifdef CONFIG_HARDLOCKUP_DETECTOR
203
203static struct perf_event_attr wd_hw_attr = { 204static struct perf_event_attr wd_hw_attr = {
204 .type = PERF_TYPE_HARDWARE, 205 .type = PERF_TYPE_HARDWARE,
205 .config = PERF_COUNT_HW_CPU_CYCLES, 206 .config = PERF_COUNT_HW_CPU_CYCLES,
@@ -209,7 +210,7 @@ static struct perf_event_attr wd_hw_attr = {
209}; 210};
210 211
211/* Callback function for perf event subsystem */ 212/* Callback function for perf event subsystem */
212static void watchdog_overflow_callback(struct perf_event *event, int nmi, 213static void watchdog_overflow_callback(struct perf_event *event,
213 struct perf_sample_data *data, 214 struct perf_sample_data *data,
214 struct pt_regs *regs) 215 struct pt_regs *regs)
215{ 216{
@@ -368,10 +369,11 @@ static int watchdog_nmi_enable(int cpu)
368 if (event != NULL) 369 if (event != NULL)
369 goto out_enable; 370 goto out_enable;
370 371
371 /* Try to register using hardware perf events */
372 wd_attr = &wd_hw_attr; 372 wd_attr = &wd_hw_attr;
373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); 373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
374 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); 374
375 /* Try to register using hardware perf events */
376 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
375 if (!IS_ERR(event)) { 377 if (!IS_ERR(event)) {
376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 378 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
377 goto out_save; 379 goto out_save;
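
This hunk tracks the perf API change in this series: overflow handlers lose the nmi argument and perf_event_create_kernel_counter() gains a trailing context pointer. A kernel-style sketch of a call under the new convention; the attribute values and function names are illustrative, not part of this patch:

#include <linux/perf_event.h>

static void my_overflow(struct perf_event *event,
			struct perf_sample_data *data,
			struct pt_regs *regs)
{
	/* react to the counter overflow */
}

static struct perf_event *create_cycle_counter(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
		.sample_period	= 1000000,
	};

	/* last argument is the new context pointer (unused here) */
	return perf_event_create_kernel_counter(&attr, cpu, NULL,
						my_overflow, NULL);
}
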
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0400553f0d04..25fb1b0e53fa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t;
221 * per-CPU workqueues: 221 * per-CPU workqueues:
222 */ 222 */
223struct workqueue_struct { 223struct workqueue_struct {
224 unsigned int flags; /* I: WQ_* flags */ 224 unsigned int flags; /* W: WQ_* flags */
225 union { 225 union {
226 struct cpu_workqueue_struct __percpu *pcpu; 226 struct cpu_workqueue_struct __percpu *pcpu;
227 struct cpu_workqueue_struct *single; 227 struct cpu_workqueue_struct *single;
@@ -240,6 +240,7 @@ struct workqueue_struct {
240 mayday_mask_t mayday_mask; /* cpus requesting rescue */ 240 mayday_mask_t mayday_mask; /* cpus requesting rescue */
241 struct worker *rescuer; /* I: rescue worker */ 241 struct worker *rescuer; /* I: rescue worker */
242 242
243 int nr_drainers; /* W: drain in progress */
243 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
244 const char *name; /* I: workqueue name */ 245 const char *name; /* I: workqueue name */
245#ifdef CONFIG_LOCKDEP 246#ifdef CONFIG_LOCKDEP
@@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
990 debug_work_activate(work); 991 debug_work_activate(work);
991 992
992 /* if dying, only works from the same workqueue are allowed */ 993 /* if dying, only works from the same workqueue are allowed */
993 if (unlikely(wq->flags & WQ_DYING) && 994 if (unlikely(wq->flags & WQ_DRAINING) &&
994 WARN_ON_ONCE(!is_chained_work(wq))) 995 WARN_ON_ONCE(!is_chained_work(wq)))
995 return; 996 return;
996 997
@@ -2381,6 +2382,54 @@ out_unlock:
2381} 2382}
2382EXPORT_SYMBOL_GPL(flush_workqueue); 2383EXPORT_SYMBOL_GPL(flush_workqueue);
2383 2384
2385/**
2386 * drain_workqueue - drain a workqueue
2387 * @wq: workqueue to drain
2388 *
2389 * Wait until the workqueue becomes empty. While draining is in progress,
2390 * only chain queueing is allowed. IOW, only currently pending or running
2391 * work items on @wq can queue further work items on it. @wq is flushed
 2392 * repeatedly until it becomes empty. The number of flushes is determined
2393 * by the depth of chaining and should be relatively short. Whine if it
2394 * takes too long.
2395 */
2396void drain_workqueue(struct workqueue_struct *wq)
2397{
2398 unsigned int flush_cnt = 0;
2399 unsigned int cpu;
2400
2401 /*
2402 * __queue_work() needs to test whether there are drainers, is much
2403 * hotter than drain_workqueue() and already looks at @wq->flags.
2404 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
2405 */
2406 spin_lock(&workqueue_lock);
2407 if (!wq->nr_drainers++)
2408 wq->flags |= WQ_DRAINING;
2409 spin_unlock(&workqueue_lock);
2410reflush:
2411 flush_workqueue(wq);
2412
2413 for_each_cwq_cpu(cpu, wq) {
2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2415
2416 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
2417 continue;
2418
2419 if (++flush_cnt == 10 ||
2420 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
 2421 pr_warning("workqueue %s: drain isn't complete after %u tries\n",
2422 wq->name, flush_cnt);
2423 goto reflush;
2424 }
2425
2426 spin_lock(&workqueue_lock);
2427 if (!--wq->nr_drainers)
2428 wq->flags &= ~WQ_DRAINING;
2429 spin_unlock(&workqueue_lock);
2430}
2431EXPORT_SYMBOL_GPL(drain_workqueue);
2432
2384static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, 2433static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2385 bool wait_executing) 2434 bool wait_executing)
2386{ 2435{
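
drain_workqueue() is now usable on its own, e.g. to quiesce a queue whose work items may re-queue themselves before tearing down the state they touch. A kernel-style sketch of that pattern, assuming a hypothetical my_wq/my_state; illustration only, not part of this patch:

#include <linux/slab.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;
static void *my_state;

static void my_shutdown(void)
{
	/*
	 * After this returns the queue is empty and only chained work
	 * could have run, so nothing references my_state any more.
	 */
	drain_workqueue(my_wq);

	kfree(my_state);
	my_state = NULL;

	destroy_workqueue(my_wq);	/* drains again internally */
}
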
@@ -3009,34 +3058,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
3009 */ 3058 */
3010void destroy_workqueue(struct workqueue_struct *wq) 3059void destroy_workqueue(struct workqueue_struct *wq)
3011{ 3060{
3012 unsigned int flush_cnt = 0;
3013 unsigned int cpu; 3061 unsigned int cpu;
3014 3062
3015 /* 3063 /* drain it before proceeding with destruction */
3016 * Mark @wq dying and drain all pending works. Once WQ_DYING is 3064 drain_workqueue(wq);
3017 * set, only chain queueing is allowed. IOW, only currently
3018 * pending or running work items on @wq can queue further work
3019 * items on it. @wq is flushed repeatedly until it becomes empty.
3020 * The number of flushing is detemined by the depth of chaining and
3021 * should be relatively short. Whine if it takes too long.
3022 */
3023 wq->flags |= WQ_DYING;
3024reflush:
3025 flush_workqueue(wq);
3026
3027 for_each_cwq_cpu(cpu, wq) {
3028 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3029
3030 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
3031 continue;
3032
3033 if (++flush_cnt == 10 ||
3034 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
3035 printk(KERN_WARNING "workqueue %s: flush on "
3036 "destruction isn't complete after %u tries\n",
3037 wq->name, flush_cnt);
3038 goto reflush;
3039 }
3040 3065
3041 /* 3066 /*
3042 * wq list is used to freeze wq, remove from list after 3067 * wq list is used to freeze wq, remove from list after