path: root/kernel
author     Linus Torvalds <torvalds@linux-foundation.org>  2011-07-22 19:44:39 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2011-07-22 19:44:39 -0400
commit     4d4abdcb1dee03a4f9d6d2021622ed07e14dfd17 (patch)
tree       4ed4c74b70240451065165fda5fb2059f8c6b1e5 /kernel
parent     0342cbcfced2ee937d7c8e1c63f3d3082da7c7dc (diff)
parent     7fcfd1abd6480d3b9ef17f5759c175e036e835cf (diff)
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (123 commits)
  perf: Remove the nmi parameter from the oprofile_perf backend
  x86, perf: Make copy_from_user_nmi() a library function
  perf: Remove perf_event_attr::type check
  x86, perf: P4 PMU - Fix typos in comments and style cleanup
  perf tools: Make test use the preset debugfs path
  perf tools: Add automated tests for events parsing
  perf tools: De-opt the parse_events function
  perf script: Fix display of IP address for non-callchain path
  perf tools: Fix endian conversion reading event attr from file header
  perf tools: Add missing 'node' alias to the hw_cache[] array
  perf probe: Support adding probes on offline kernel modules
  perf probe: Add probed module in front of function
  perf probe: Introduce debuginfo to encapsulate dwarf information
  perf-probe: Move dwarf library routines to dwarf-aux.{c, h}
  perf probe: Remove redundant dwarf functions
  perf probe: Move strtailcmp to string.c
  perf probe: Rename DIE_FIND_CB_FOUND to DIE_FIND_CB_END
  tracing/kprobe: Update symbol reference when loading module
  tracing/kprobes: Support module init function probing
  kprobes: Return -ENOENT if probe point doesn't exist
  ...
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/async.c                          12
-rw-r--r--   kernel/events/Makefile                   2
-rw-r--r--   kernel/events/core.c                   938
-rw-r--r--   kernel/events/hw_breakpoint.c           10
-rw-r--r--   kernel/events/internal.h                96
-rw-r--r--   kernel/events/ring_buffer.c            380
-rw-r--r--   kernel/kprobes.c                        33
-rw-r--r--   kernel/sched.c                           2
-rw-r--r--   kernel/stacktrace.c                     12
-rw-r--r--   kernel/trace/ftrace.c                  157
-rw-r--r--   kernel/trace/ring_buffer.c              66
-rw-r--r--   kernel/trace/ring_buffer_benchmark.c     2
-rw-r--r--   kernel/trace/trace.c                   326
-rw-r--r--   kernel/trace/trace.h                    61
-rw-r--r--   kernel/trace/trace_entries.h             3
-rw-r--r--   kernel/trace/trace_events.c            139
-rw-r--r--   kernel/trace/trace_events_filter.c       6
-rw-r--r--   kernel/trace/trace_functions.c           3
-rw-r--r--   kernel/trace/trace_functions_graph.c   225
-rw-r--r--   kernel/trace/trace_irqsoff.c             4
-rw-r--r--   kernel/trace/trace_kprobe.c            324
-rw-r--r--   kernel/trace/trace_output.c             11
-rw-r--r--   kernel/trace/trace_sched_wakeup.c        4
-rw-r--r--   kernel/trace/trace_stack.c              13
-rw-r--r--   kernel/watchdog.c                        8
25 files changed, 1655 insertions, 1182 deletions
diff --git a/kernel/async.c b/kernel/async.c
index cd9dbb913c77..d5fe7af0de2e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,12 +49,13 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h>
53#include <linux/ktime.h>
52#include <linux/module.h> 54#include <linux/module.h>
53#include <linux/wait.h> 55#include <linux/wait.h>
54#include <linux/sched.h> 56#include <linux/sched.h>
55#include <linux/slab.h> 57#include <linux/slab.h>
56#include <linux/workqueue.h> 58#include <linux/workqueue.h>
57#include <asm/atomic.h>
58 59
59static async_cookie_t next_cookie = 1; 60static async_cookie_t next_cookie = 1;
60 61
@@ -128,7 +129,8 @@ static void async_run_entry_fn(struct work_struct *work)
128 129
129 /* 2) run (and print duration) */ 130 /* 2) run (and print duration) */
130 if (initcall_debug && system_state == SYSTEM_BOOTING) { 131 if (initcall_debug && system_state == SYSTEM_BOOTING) {
131 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, 132 printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
133 (long long)entry->cookie,
132 entry->func, task_pid_nr(current)); 134 entry->func, task_pid_nr(current));
133 calltime = ktime_get(); 135 calltime = ktime_get();
134 } 136 }
@@ -136,7 +138,7 @@ static void async_run_entry_fn(struct work_struct *work)
136 if (initcall_debug && system_state == SYSTEM_BOOTING) { 138 if (initcall_debug && system_state == SYSTEM_BOOTING) {
137 rettime = ktime_get(); 139 rettime = ktime_get();
138 delta = ktime_sub(rettime, calltime); 140 delta = ktime_sub(rettime, calltime);
139 printk("initcall %lli_%pF returned 0 after %lld usecs\n", 141 printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
140 (long long)entry->cookie, 142 (long long)entry->cookie,
141 entry->func, 143 entry->func,
142 (long long)ktime_to_ns(delta) >> 10); 144 (long long)ktime_to_ns(delta) >> 10);
@@ -270,7 +272,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
270 ktime_t starttime, delta, endtime; 272 ktime_t starttime, delta, endtime;
271 273
272 if (initcall_debug && system_state == SYSTEM_BOOTING) { 274 if (initcall_debug && system_state == SYSTEM_BOOTING) {
273 printk("async_waiting @ %i\n", task_pid_nr(current)); 275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
274 starttime = ktime_get(); 276 starttime = ktime_get();
275 } 277 }
276 278
@@ -280,7 +282,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
280 endtime = ktime_get(); 282 endtime = ktime_get();
281 delta = ktime_sub(endtime, starttime); 283 delta = ktime_sub(endtime, starttime);
282 284
283 printk("async_continuing @ %i after %lli usec\n", 285 printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
284 task_pid_nr(current), 286 task_pid_nr(current),
285 (long long)ktime_to_ns(delta) >> 10); 287 (long long)ktime_to_ns(delta) >> 10);
286 } 288 }
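
[Editor's note] The async.c hunks above tag the initcall_debug timing printks with KERN_DEBUG and report durations by shifting nanoseconds right by 10 bits, i.e. dividing by 1024 as a cheap stand-in for microseconds. A minimal, self-contained sketch of that conversion (the helper name is made up for illustration):

    #include <stdio.h>

    /* Approximate ns -> "usecs" the way the async.c printks do: a >> 10
     * divides by 1024 instead of 1000, trading a few percent of accuracy
     * for avoiding a 64-bit division in the timing path. */
    static long long approx_usecs(long long ns)
    {
            return ns >> 10;
    }

    int main(void)
    {
            long long delta_ns = 1500000;   /* 1.5 ms */

            printf("exact: %lld us, shifted: %lld\n",
                   delta_ns / 1000, approx_usecs(delta_ns));
            return 0;
    }
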
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 1ce23d3d8394..89e5e8aa4c36 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o 5obj-y := core.o ring_buffer.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9efe7108ccaf..b8785e26ee1c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,8 @@
36#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
37#include <linux/hw_breakpoint.h> 37#include <linux/hw_breakpoint.h>
38 38
39#include "internal.h"
40
39#include <asm/irq_regs.h> 41#include <asm/irq_regs.h>
40 42
41struct remote_function_call { 43struct remote_function_call {
@@ -200,6 +202,22 @@ __get_cpu_context(struct perf_event_context *ctx)
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 202 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201} 203}
202 204
205static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
206 struct perf_event_context *ctx)
207{
208 raw_spin_lock(&cpuctx->ctx.lock);
209 if (ctx)
210 raw_spin_lock(&ctx->lock);
211}
212
213static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
214 struct perf_event_context *ctx)
215{
216 if (ctx)
217 raw_spin_unlock(&ctx->lock);
218 raw_spin_unlock(&cpuctx->ctx.lock);
219}
220
203#ifdef CONFIG_CGROUP_PERF 221#ifdef CONFIG_CGROUP_PERF
204 222
205/* 223/*
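
[Editor's note] perf_ctx_lock()/perf_ctx_unlock() added above always take the per-CPU context lock first and the (optional) task context lock second, and drop them in the opposite order. Later hunks in this merge wrap their rescheduling work in exactly that pairing; the following is only a sketch of the calling shape, not standalone compilable code:

    /* Illustrative only: the shape of callers added later in this diff
     * (__perf_install_in_context(), perf_rotate_context(), ...).  Lock
     * order is always cpuctx->ctx.lock first, then the task ctx->lock,
     * and task_ctx may be NULL when there is no task context. */
    static void example_resched(struct perf_cpu_context *cpuctx,
                                struct perf_event_context *task_ctx)
    {
            perf_ctx_lock(cpuctx, task_ctx);
            perf_pmu_disable(cpuctx->ctx.pmu);

            /* ... schedule events out and back in while both are stable ... */

            perf_pmu_enable(cpuctx->ctx.pmu);
            perf_ctx_unlock(cpuctx, task_ctx);
    }
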
@@ -340,11 +358,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
340 rcu_read_lock(); 358 rcu_read_lock();
341 359
342 list_for_each_entry_rcu(pmu, &pmus, entry) { 360 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 361 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345 362
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /* 363 /*
349 * perf_cgroup_events says at least one 364 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events. 365 * context on this CPU has cgroup events.
@@ -353,6 +368,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
353 * events for a context. 368 * events for a context.
354 */ 369 */
355 if (cpuctx->ctx.nr_cgroups > 0) { 370 if (cpuctx->ctx.nr_cgroups > 0) {
371 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
372 perf_pmu_disable(cpuctx->ctx.pmu);
356 373
357 if (mode & PERF_CGROUP_SWOUT) { 374 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL); 375 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -372,9 +389,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
372 cpuctx->cgrp = perf_cgroup_from_task(task); 389 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); 390 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 } 391 }
392 perf_pmu_enable(cpuctx->ctx.pmu);
393 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
375 } 394 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 } 395 }
379 396
380 rcu_read_unlock(); 397 rcu_read_unlock();
@@ -731,6 +748,7 @@ static u64 perf_event_time(struct perf_event *event)
731 748
732/* 749/*
733 * Update the total_time_enabled and total_time_running fields for a event. 750 * Update the total_time_enabled and total_time_running fields for a event.
751 * The caller of this function needs to hold the ctx->lock.
734 */ 752 */
735static void update_event_times(struct perf_event *event) 753static void update_event_times(struct perf_event *event)
736{ 754{
@@ -1105,6 +1123,10 @@ static int __perf_remove_from_context(void *info)
1105 raw_spin_lock(&ctx->lock); 1123 raw_spin_lock(&ctx->lock);
1106 event_sched_out(event, cpuctx, ctx); 1124 event_sched_out(event, cpuctx, ctx);
1107 list_del_event(event, ctx); 1125 list_del_event(event, ctx);
1126 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1127 ctx->is_active = 0;
1128 cpuctx->task_ctx = NULL;
1129 }
1108 raw_spin_unlock(&ctx->lock); 1130 raw_spin_unlock(&ctx->lock);
1109 1131
1110 return 0; 1132 return 0;
@@ -1454,8 +1476,24 @@ static void add_event_to_ctx(struct perf_event *event,
1454 event->tstamp_stopped = tstamp; 1476 event->tstamp_stopped = tstamp;
1455} 1477}
1456 1478
1457static void perf_event_context_sched_in(struct perf_event_context *ctx, 1479static void task_ctx_sched_out(struct perf_event_context *ctx);
1458 struct task_struct *tsk); 1480static void
1481ctx_sched_in(struct perf_event_context *ctx,
1482 struct perf_cpu_context *cpuctx,
1483 enum event_type_t event_type,
1484 struct task_struct *task);
1485
1486static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1487 struct perf_event_context *ctx,
1488 struct task_struct *task)
1489{
1490 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1491 if (ctx)
1492 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1493 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1494 if (ctx)
1495 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1496}
1459 1497
1460/* 1498/*
1461 * Cross CPU call to install and enable a performance event 1499 * Cross CPU call to install and enable a performance event
@@ -1466,20 +1504,37 @@ static int __perf_install_in_context(void *info)
1466{ 1504{
1467 struct perf_event *event = info; 1505 struct perf_event *event = info;
1468 struct perf_event_context *ctx = event->ctx; 1506 struct perf_event_context *ctx = event->ctx;
1469 struct perf_event *leader = event->group_leader;
1470 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1507 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1471 int err; 1508 struct perf_event_context *task_ctx = cpuctx->task_ctx;
1509 struct task_struct *task = current;
1510
1511 perf_ctx_lock(cpuctx, task_ctx);
1512 perf_pmu_disable(cpuctx->ctx.pmu);
1472 1513
1473 /* 1514 /*
1474 * In case we're installing a new context to an already running task, 1515 * If there was an active task_ctx schedule it out.
1475 * could also happen before perf_event_task_sched_in() on architectures
1476 * which do context switches with IRQs enabled.
1477 */ 1516 */
1478 if (ctx->task && !cpuctx->task_ctx) 1517 if (task_ctx)
1479 perf_event_context_sched_in(ctx, ctx->task); 1518 task_ctx_sched_out(task_ctx);
1519
1520 /*
1521 * If the context we're installing events in is not the
1522 * active task_ctx, flip them.
1523 */
1524 if (ctx->task && task_ctx != ctx) {
1525 if (task_ctx)
1526 raw_spin_unlock(&task_ctx->lock);
1527 raw_spin_lock(&ctx->lock);
1528 task_ctx = ctx;
1529 }
1530
1531 if (task_ctx) {
1532 cpuctx->task_ctx = task_ctx;
1533 task = task_ctx->task;
1534 }
1535
1536 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
1480 1537
1481 raw_spin_lock(&ctx->lock);
1482 ctx->is_active = 1;
1483 update_context_time(ctx); 1538 update_context_time(ctx);
1484 /* 1539 /*
1485 * update cgrp time only if current cgrp 1540 * update cgrp time only if current cgrp
@@ -1490,43 +1545,13 @@ static int __perf_install_in_context(void *info)
1490 1545
1491 add_event_to_ctx(event, ctx); 1546 add_event_to_ctx(event, ctx);
1492 1547
1493 if (!event_filter_match(event))
1494 goto unlock;
1495
1496 /*
1497 * Don't put the event on if it is disabled or if
1498 * it is in a group and the group isn't on.
1499 */
1500 if (event->state != PERF_EVENT_STATE_INACTIVE ||
1501 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
1502 goto unlock;
1503
1504 /* 1548 /*
1505 * An exclusive event can't go on if there are already active 1549 * Schedule everything back in
1506 * hardware events, and no hardware event can go on if there
1507 * is already an exclusive event on.
1508 */ 1550 */
1509 if (!group_can_go_on(event, cpuctx, 1)) 1551 perf_event_sched_in(cpuctx, task_ctx, task);
1510 err = -EEXIST;
1511 else
1512 err = event_sched_in(event, cpuctx, ctx);
1513
1514 if (err) {
1515 /*
1516 * This event couldn't go on. If it is in a group
1517 * then we have to pull the whole group off.
1518 * If the event group is pinned then put it in error state.
1519 */
1520 if (leader != event)
1521 group_sched_out(leader, cpuctx, ctx);
1522 if (leader->attr.pinned) {
1523 update_group_times(leader);
1524 leader->state = PERF_EVENT_STATE_ERROR;
1525 }
1526 }
1527 1552
1528unlock: 1553 perf_pmu_enable(cpuctx->ctx.pmu);
1529 raw_spin_unlock(&ctx->lock); 1554 perf_ctx_unlock(cpuctx, task_ctx);
1530 1555
1531 return 0; 1556 return 0;
1532} 1557}
@@ -1739,7 +1764,7 @@ out:
1739 raw_spin_unlock_irq(&ctx->lock); 1764 raw_spin_unlock_irq(&ctx->lock);
1740} 1765}
1741 1766
1742static int perf_event_refresh(struct perf_event *event, int refresh) 1767int perf_event_refresh(struct perf_event *event, int refresh)
1743{ 1768{
1744 /* 1769 /*
1745 * not supported on inherited events 1770 * not supported on inherited events
@@ -1752,36 +1777,35 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1752 1777
1753 return 0; 1778 return 0;
1754} 1779}
1780EXPORT_SYMBOL_GPL(perf_event_refresh);
1755 1781
1756static void ctx_sched_out(struct perf_event_context *ctx, 1782static void ctx_sched_out(struct perf_event_context *ctx,
1757 struct perf_cpu_context *cpuctx, 1783 struct perf_cpu_context *cpuctx,
1758 enum event_type_t event_type) 1784 enum event_type_t event_type)
1759{ 1785{
1760 struct perf_event *event; 1786 struct perf_event *event;
1787 int is_active = ctx->is_active;
1761 1788
1762 raw_spin_lock(&ctx->lock); 1789 ctx->is_active &= ~event_type;
1763 perf_pmu_disable(ctx->pmu);
1764 ctx->is_active = 0;
1765 if (likely(!ctx->nr_events)) 1790 if (likely(!ctx->nr_events))
1766 goto out; 1791 return;
1792
1767 update_context_time(ctx); 1793 update_context_time(ctx);
1768 update_cgrp_time_from_cpuctx(cpuctx); 1794 update_cgrp_time_from_cpuctx(cpuctx);
1769
1770 if (!ctx->nr_active) 1795 if (!ctx->nr_active)
1771 goto out; 1796 return;
1772 1797
1773 if (event_type & EVENT_PINNED) { 1798 perf_pmu_disable(ctx->pmu);
1799 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
1774 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1800 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1775 group_sched_out(event, cpuctx, ctx); 1801 group_sched_out(event, cpuctx, ctx);
1776 } 1802 }
1777 1803
1778 if (event_type & EVENT_FLEXIBLE) { 1804 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
1779 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1805 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1780 group_sched_out(event, cpuctx, ctx); 1806 group_sched_out(event, cpuctx, ctx);
1781 } 1807 }
1782out:
1783 perf_pmu_enable(ctx->pmu); 1808 perf_pmu_enable(ctx->pmu);
1784 raw_spin_unlock(&ctx->lock);
1785} 1809}
1786 1810
1787/* 1811/*
@@ -1929,8 +1953,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1929 rcu_read_unlock(); 1953 rcu_read_unlock();
1930 1954
1931 if (do_switch) { 1955 if (do_switch) {
1956 raw_spin_lock(&ctx->lock);
1932 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 1957 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1933 cpuctx->task_ctx = NULL; 1958 cpuctx->task_ctx = NULL;
1959 raw_spin_unlock(&ctx->lock);
1934 } 1960 }
1935} 1961}
1936 1962
@@ -1965,8 +1991,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
1965 perf_cgroup_sched_out(task); 1991 perf_cgroup_sched_out(task);
1966} 1992}
1967 1993
1968static void task_ctx_sched_out(struct perf_event_context *ctx, 1994static void task_ctx_sched_out(struct perf_event_context *ctx)
1969 enum event_type_t event_type)
1970{ 1995{
1971 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1996 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1972 1997
@@ -1976,7 +2001,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1976 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2001 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1977 return; 2002 return;
1978 2003
1979 ctx_sched_out(ctx, cpuctx, event_type); 2004 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1980 cpuctx->task_ctx = NULL; 2005 cpuctx->task_ctx = NULL;
1981} 2006}
1982 2007
@@ -2055,11 +2080,11 @@ ctx_sched_in(struct perf_event_context *ctx,
2055 struct task_struct *task) 2080 struct task_struct *task)
2056{ 2081{
2057 u64 now; 2082 u64 now;
2083 int is_active = ctx->is_active;
2058 2084
2059 raw_spin_lock(&ctx->lock); 2085 ctx->is_active |= event_type;
2060 ctx->is_active = 1;
2061 if (likely(!ctx->nr_events)) 2086 if (likely(!ctx->nr_events))
2062 goto out; 2087 return;
2063 2088
2064 now = perf_clock(); 2089 now = perf_clock();
2065 ctx->timestamp = now; 2090 ctx->timestamp = now;
@@ -2068,15 +2093,12 @@ ctx_sched_in(struct perf_event_context *ctx,
2068 * First go through the list and put on any pinned groups 2093 * First go through the list and put on any pinned groups
2069 * in order to give them the best chance of going on. 2094 * in order to give them the best chance of going on.
2070 */ 2095 */
2071 if (event_type & EVENT_PINNED) 2096 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2072 ctx_pinned_sched_in(ctx, cpuctx); 2097 ctx_pinned_sched_in(ctx, cpuctx);
2073 2098
2074 /* Then walk through the lower prio flexible groups */ 2099 /* Then walk through the lower prio flexible groups */
2075 if (event_type & EVENT_FLEXIBLE) 2100 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2076 ctx_flexible_sched_in(ctx, cpuctx); 2101 ctx_flexible_sched_in(ctx, cpuctx);
2077
2078out:
2079 raw_spin_unlock(&ctx->lock);
2080} 2102}
2081 2103
2082static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2104static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -2088,19 +2110,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2088 ctx_sched_in(ctx, cpuctx, event_type, task); 2110 ctx_sched_in(ctx, cpuctx, event_type, task);
2089} 2111}
2090 2112
2091static void task_ctx_sched_in(struct perf_event_context *ctx,
2092 enum event_type_t event_type)
2093{
2094 struct perf_cpu_context *cpuctx;
2095
2096 cpuctx = __get_cpu_context(ctx);
2097 if (cpuctx->task_ctx == ctx)
2098 return;
2099
2100 ctx_sched_in(ctx, cpuctx, event_type, NULL);
2101 cpuctx->task_ctx = ctx;
2102}
2103
2104static void perf_event_context_sched_in(struct perf_event_context *ctx, 2113static void perf_event_context_sched_in(struct perf_event_context *ctx,
2105 struct task_struct *task) 2114 struct task_struct *task)
2106{ 2115{
@@ -2110,6 +2119,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2110 if (cpuctx->task_ctx == ctx) 2119 if (cpuctx->task_ctx == ctx)
2111 return; 2120 return;
2112 2121
2122 perf_ctx_lock(cpuctx, ctx);
2113 perf_pmu_disable(ctx->pmu); 2123 perf_pmu_disable(ctx->pmu);
2114 /* 2124 /*
2115 * We want to keep the following priority order: 2125 * We want to keep the following priority order:
@@ -2118,18 +2128,18 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2118 */ 2128 */
2119 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2129 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2120 2130
2121 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); 2131 perf_event_sched_in(cpuctx, ctx, task);
2122 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2123 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2124 2132
2125 cpuctx->task_ctx = ctx; 2133 cpuctx->task_ctx = ctx;
2126 2134
2135 perf_pmu_enable(ctx->pmu);
2136 perf_ctx_unlock(cpuctx, ctx);
2137
2127 /* 2138 /*
2128 * Since these rotations are per-cpu, we need to ensure the 2139 * Since these rotations are per-cpu, we need to ensure the
2129 * cpu-context we got scheduled on is actually rotating. 2140 * cpu-context we got scheduled on is actually rotating.
2130 */ 2141 */
2131 perf_pmu_rotate_start(ctx->pmu); 2142 perf_pmu_rotate_start(ctx->pmu);
2132 perf_pmu_enable(ctx->pmu);
2133} 2143}
2134 2144
2135/* 2145/*
@@ -2269,7 +2279,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2269 u64 interrupts, now; 2279 u64 interrupts, now;
2270 s64 delta; 2280 s64 delta;
2271 2281
2272 raw_spin_lock(&ctx->lock);
2273 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2282 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2274 if (event->state != PERF_EVENT_STATE_ACTIVE) 2283 if (event->state != PERF_EVENT_STATE_ACTIVE)
2275 continue; 2284 continue;
@@ -2301,7 +2310,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2301 if (delta > 0) 2310 if (delta > 0)
2302 perf_adjust_period(event, period, delta); 2311 perf_adjust_period(event, period, delta);
2303 } 2312 }
2304 raw_spin_unlock(&ctx->lock);
2305} 2313}
2306 2314
2307/* 2315/*
@@ -2309,16 +2317,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2309 */ 2317 */
2310static void rotate_ctx(struct perf_event_context *ctx) 2318static void rotate_ctx(struct perf_event_context *ctx)
2311{ 2319{
2312 raw_spin_lock(&ctx->lock);
2313
2314 /* 2320 /*
2315 * Rotate the first entry last of non-pinned groups. Rotation might be 2321 * Rotate the first entry last of non-pinned groups. Rotation might be
2316 * disabled by the inheritance code. 2322 * disabled by the inheritance code.
2317 */ 2323 */
2318 if (!ctx->rotate_disable) 2324 if (!ctx->rotate_disable)
2319 list_rotate_left(&ctx->flexible_groups); 2325 list_rotate_left(&ctx->flexible_groups);
2320
2321 raw_spin_unlock(&ctx->lock);
2322} 2326}
2323 2327
2324/* 2328/*
@@ -2345,6 +2349,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2345 rotate = 1; 2349 rotate = 1;
2346 } 2350 }
2347 2351
2352 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2348 perf_pmu_disable(cpuctx->ctx.pmu); 2353 perf_pmu_disable(cpuctx->ctx.pmu);
2349 perf_ctx_adjust_freq(&cpuctx->ctx, interval); 2354 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2350 if (ctx) 2355 if (ctx)
@@ -2355,21 +2360,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2355 2360
2356 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2361 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2357 if (ctx) 2362 if (ctx)
2358 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 2363 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2359 2364
2360 rotate_ctx(&cpuctx->ctx); 2365 rotate_ctx(&cpuctx->ctx);
2361 if (ctx) 2366 if (ctx)
2362 rotate_ctx(ctx); 2367 rotate_ctx(ctx);
2363 2368
2364 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); 2369 perf_event_sched_in(cpuctx, ctx, current);
2365 if (ctx)
2366 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
2367 2370
2368done: 2371done:
2369 if (remove) 2372 if (remove)
2370 list_del_init(&cpuctx->rotation_list); 2373 list_del_init(&cpuctx->rotation_list);
2371 2374
2372 perf_pmu_enable(cpuctx->ctx.pmu); 2375 perf_pmu_enable(cpuctx->ctx.pmu);
2376 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2373} 2377}
2374 2378
2375void perf_event_task_tick(void) 2379void perf_event_task_tick(void)
@@ -2424,9 +2428,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2424 * in. 2428 * in.
2425 */ 2429 */
2426 perf_cgroup_sched_out(current); 2430 perf_cgroup_sched_out(current);
2427 task_ctx_sched_out(ctx, EVENT_ALL);
2428 2431
2429 raw_spin_lock(&ctx->lock); 2432 raw_spin_lock(&ctx->lock);
2433 task_ctx_sched_out(ctx);
2430 2434
2431 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2435 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2432 ret = event_enable_on_exec(event, ctx); 2436 ret = event_enable_on_exec(event, ctx);
@@ -2835,16 +2839,12 @@ retry:
2835 unclone_ctx(ctx); 2839 unclone_ctx(ctx);
2836 ++ctx->pin_count; 2840 ++ctx->pin_count;
2837 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2841 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2838 } 2842 } else {
2839
2840 if (!ctx) {
2841 ctx = alloc_perf_context(pmu, task); 2843 ctx = alloc_perf_context(pmu, task);
2842 err = -ENOMEM; 2844 err = -ENOMEM;
2843 if (!ctx) 2845 if (!ctx)
2844 goto errout; 2846 goto errout;
2845 2847
2846 get_ctx(ctx);
2847
2848 err = 0; 2848 err = 0;
2849 mutex_lock(&task->perf_event_mutex); 2849 mutex_lock(&task->perf_event_mutex);
2850 /* 2850 /*
@@ -2856,14 +2856,14 @@ retry:
2856 else if (task->perf_event_ctxp[ctxn]) 2856 else if (task->perf_event_ctxp[ctxn])
2857 err = -EAGAIN; 2857 err = -EAGAIN;
2858 else { 2858 else {
2859 get_ctx(ctx);
2859 ++ctx->pin_count; 2860 ++ctx->pin_count;
2860 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2861 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2861 } 2862 }
2862 mutex_unlock(&task->perf_event_mutex); 2863 mutex_unlock(&task->perf_event_mutex);
2863 2864
2864 if (unlikely(err)) { 2865 if (unlikely(err)) {
2865 put_task_struct(task); 2866 put_ctx(ctx);
2866 kfree(ctx);
2867 2867
2868 if (err == -EAGAIN) 2868 if (err == -EAGAIN)
2869 goto retry; 2869 goto retry;
@@ -2890,7 +2890,7 @@ static void free_event_rcu(struct rcu_head *head)
2890 kfree(event); 2890 kfree(event);
2891} 2891}
2892 2892
2893static void perf_buffer_put(struct perf_buffer *buffer); 2893static void ring_buffer_put(struct ring_buffer *rb);
2894 2894
2895static void free_event(struct perf_event *event) 2895static void free_event(struct perf_event *event)
2896{ 2896{
@@ -2913,9 +2913,9 @@ static void free_event(struct perf_event *event)
2913 } 2913 }
2914 } 2914 }
2915 2915
2916 if (event->buffer) { 2916 if (event->rb) {
2917 perf_buffer_put(event->buffer); 2917 ring_buffer_put(event->rb);
2918 event->buffer = NULL; 2918 event->rb = NULL;
2919 } 2919 }
2920 2920
2921 if (is_cgroup_event(event)) 2921 if (is_cgroup_event(event))
@@ -2934,12 +2934,6 @@ int perf_event_release_kernel(struct perf_event *event)
2934{ 2934{
2935 struct perf_event_context *ctx = event->ctx; 2935 struct perf_event_context *ctx = event->ctx;
2936 2936
2937 /*
2938 * Remove from the PMU, can't get re-enabled since we got
2939 * here because the last ref went.
2940 */
2941 perf_event_disable(event);
2942
2943 WARN_ON_ONCE(ctx->parent_ctx); 2937 WARN_ON_ONCE(ctx->parent_ctx);
2944 /* 2938 /*
2945 * There are two ways this annotation is useful: 2939 * There are two ways this annotation is useful:
@@ -2956,8 +2950,8 @@ int perf_event_release_kernel(struct perf_event *event)
2956 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 2950 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2957 raw_spin_lock_irq(&ctx->lock); 2951 raw_spin_lock_irq(&ctx->lock);
2958 perf_group_detach(event); 2952 perf_group_detach(event);
2959 list_del_event(event, ctx);
2960 raw_spin_unlock_irq(&ctx->lock); 2953 raw_spin_unlock_irq(&ctx->lock);
2954 perf_remove_from_context(event);
2961 mutex_unlock(&ctx->mutex); 2955 mutex_unlock(&ctx->mutex);
2962 2956
2963 free_event(event); 2957 free_event(event);
@@ -3149,13 +3143,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3149static unsigned int perf_poll(struct file *file, poll_table *wait) 3143static unsigned int perf_poll(struct file *file, poll_table *wait)
3150{ 3144{
3151 struct perf_event *event = file->private_data; 3145 struct perf_event *event = file->private_data;
3152 struct perf_buffer *buffer; 3146 struct ring_buffer *rb;
3153 unsigned int events = POLL_HUP; 3147 unsigned int events = POLL_HUP;
3154 3148
3155 rcu_read_lock(); 3149 rcu_read_lock();
3156 buffer = rcu_dereference(event->buffer); 3150 rb = rcu_dereference(event->rb);
3157 if (buffer) 3151 if (rb)
3158 events = atomic_xchg(&buffer->poll, 0); 3152 events = atomic_xchg(&rb->poll, 0);
3159 rcu_read_unlock(); 3153 rcu_read_unlock();
3160 3154
3161 poll_wait(file, &event->waitq, wait); 3155 poll_wait(file, &event->waitq, wait);
@@ -3358,6 +3352,18 @@ static int perf_event_index(struct perf_event *event)
3358 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; 3352 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3359} 3353}
3360 3354
3355static void calc_timer_values(struct perf_event *event,
3356 u64 *running,
3357 u64 *enabled)
3358{
3359 u64 now, ctx_time;
3360
3361 now = perf_clock();
3362 ctx_time = event->shadow_ctx_time + now;
3363 *enabled = ctx_time - event->tstamp_enabled;
3364 *running = ctx_time - event->tstamp_running;
3365}
3366
3361/* 3367/*
3362 * Callers need to ensure there can be no nesting of this function, otherwise 3368 * Callers need to ensure there can be no nesting of this function, otherwise
3363 * the seqlock logic goes bad. We can not serialize this because the arch 3369 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3366,14 +3372,25 @@ static int perf_event_index(struct perf_event *event)
3366void perf_event_update_userpage(struct perf_event *event) 3372void perf_event_update_userpage(struct perf_event *event)
3367{ 3373{
3368 struct perf_event_mmap_page *userpg; 3374 struct perf_event_mmap_page *userpg;
3369 struct perf_buffer *buffer; 3375 struct ring_buffer *rb;
3376 u64 enabled, running;
3370 3377
3371 rcu_read_lock(); 3378 rcu_read_lock();
3372 buffer = rcu_dereference(event->buffer); 3379 /*
3373 if (!buffer) 3380 * compute total_time_enabled, total_time_running
3381 * based on snapshot values taken when the event
3382 * was last scheduled in.
3383 *
3384 * we cannot simply called update_context_time()
3385 * because of locking issue as we can be called in
3386 * NMI context
3387 */
3388 calc_timer_values(event, &enabled, &running);
3389 rb = rcu_dereference(event->rb);
3390 if (!rb)
3374 goto unlock; 3391 goto unlock;
3375 3392
3376 userpg = buffer->user_page; 3393 userpg = rb->user_page;
3377 3394
3378 /* 3395 /*
3379 * Disable preemption so as to not let the corresponding user-space 3396 * Disable preemption so as to not let the corresponding user-space
@@ -3387,10 +3404,10 @@ void perf_event_update_userpage(struct perf_event *event)
3387 if (event->state == PERF_EVENT_STATE_ACTIVE) 3404 if (event->state == PERF_EVENT_STATE_ACTIVE)
3388 userpg->offset -= local64_read(&event->hw.prev_count); 3405 userpg->offset -= local64_read(&event->hw.prev_count);
3389 3406
3390 userpg->time_enabled = event->total_time_enabled + 3407 userpg->time_enabled = enabled +
3391 atomic64_read(&event->child_total_time_enabled); 3408 atomic64_read(&event->child_total_time_enabled);
3392 3409
3393 userpg->time_running = event->total_time_running + 3410 userpg->time_running = running +
3394 atomic64_read(&event->child_total_time_running); 3411 atomic64_read(&event->child_total_time_running);
3395 3412
3396 barrier(); 3413 barrier();
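
[Editor's note] calc_timer_values(), added two hunks up, derives enabled/running times from snapshots (shadow_ctx_time, tstamp_enabled, tstamp_running) precisely because the caller may be in NMI context and cannot take ctx->lock to run update_context_time(). A small worked example of the arithmetic, with made-up numbers:

    #include <stdio.h>
    #include <stdint.h>

    /* Worked example of the calc_timer_values() arithmetic: adding
     * shadow_ctx_time to the current perf clock reconstructs the context
     * time base captured at sched-in, so enabled/running fall out by
     * subtraction, without any locking. Values are illustrative only. */
    int main(void)
    {
            uint64_t now             = 5000; /* perf_clock() snapshot        */
            uint64_t shadow_ctx_time =  200; /* saved when last scheduled in */
            uint64_t tstamp_enabled  = 1000; /* ctx time at enable           */
            uint64_t tstamp_running  = 3000; /* ctx time at last start       */

            uint64_t ctx_time = shadow_ctx_time + now;

            printf("enabled=%llu running=%llu\n",
                   (unsigned long long)(ctx_time - tstamp_enabled),
                   (unsigned long long)(ctx_time - tstamp_running));
            return 0;
    }
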
@@ -3400,220 +3417,10 @@ unlock:
3400 rcu_read_unlock(); 3417 rcu_read_unlock();
3401} 3418}
3402 3419
3403static unsigned long perf_data_size(struct perf_buffer *buffer);
3404
3405static void
3406perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
3407{
3408 long max_size = perf_data_size(buffer);
3409
3410 if (watermark)
3411 buffer->watermark = min(max_size, watermark);
3412
3413 if (!buffer->watermark)
3414 buffer->watermark = max_size / 2;
3415
3416 if (flags & PERF_BUFFER_WRITABLE)
3417 buffer->writable = 1;
3418
3419 atomic_set(&buffer->refcount, 1);
3420}
3421
3422#ifndef CONFIG_PERF_USE_VMALLOC
3423
3424/*
3425 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
3426 */
3427
3428static struct page *
3429perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3430{
3431 if (pgoff > buffer->nr_pages)
3432 return NULL;
3433
3434 if (pgoff == 0)
3435 return virt_to_page(buffer->user_page);
3436
3437 return virt_to_page(buffer->data_pages[pgoff - 1]);
3438}
3439
3440static void *perf_mmap_alloc_page(int cpu)
3441{
3442 struct page *page;
3443 int node;
3444
3445 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
3446 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
3447 if (!page)
3448 return NULL;
3449
3450 return page_address(page);
3451}
3452
3453static struct perf_buffer *
3454perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3455{
3456 struct perf_buffer *buffer;
3457 unsigned long size;
3458 int i;
3459
3460 size = sizeof(struct perf_buffer);
3461 size += nr_pages * sizeof(void *);
3462
3463 buffer = kzalloc(size, GFP_KERNEL);
3464 if (!buffer)
3465 goto fail;
3466
3467 buffer->user_page = perf_mmap_alloc_page(cpu);
3468 if (!buffer->user_page)
3469 goto fail_user_page;
3470
3471 for (i = 0; i < nr_pages; i++) {
3472 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
3473 if (!buffer->data_pages[i])
3474 goto fail_data_pages;
3475 }
3476
3477 buffer->nr_pages = nr_pages;
3478
3479 perf_buffer_init(buffer, watermark, flags);
3480
3481 return buffer;
3482
3483fail_data_pages:
3484 for (i--; i >= 0; i--)
3485 free_page((unsigned long)buffer->data_pages[i]);
3486
3487 free_page((unsigned long)buffer->user_page);
3488
3489fail_user_page:
3490 kfree(buffer);
3491
3492fail:
3493 return NULL;
3494}
3495
3496static void perf_mmap_free_page(unsigned long addr)
3497{
3498 struct page *page = virt_to_page((void *)addr);
3499
3500 page->mapping = NULL;
3501 __free_page(page);
3502}
3503
3504static void perf_buffer_free(struct perf_buffer *buffer)
3505{
3506 int i;
3507
3508 perf_mmap_free_page((unsigned long)buffer->user_page);
3509 for (i = 0; i < buffer->nr_pages; i++)
3510 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
3511 kfree(buffer);
3512}
3513
3514static inline int page_order(struct perf_buffer *buffer)
3515{
3516 return 0;
3517}
3518
3519#else
3520
3521/*
3522 * Back perf_mmap() with vmalloc memory.
3523 *
3524 * Required for architectures that have d-cache aliasing issues.
3525 */
3526
3527static inline int page_order(struct perf_buffer *buffer)
3528{
3529 return buffer->page_order;
3530}
3531
3532static struct page *
3533perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3534{
3535 if (pgoff > (1UL << page_order(buffer)))
3536 return NULL;
3537
3538 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
3539}
3540
3541static void perf_mmap_unmark_page(void *addr)
3542{
3543 struct page *page = vmalloc_to_page(addr);
3544
3545 page->mapping = NULL;
3546}
3547
3548static void perf_buffer_free_work(struct work_struct *work)
3549{
3550 struct perf_buffer *buffer;
3551 void *base;
3552 int i, nr;
3553
3554 buffer = container_of(work, struct perf_buffer, work);
3555 nr = 1 << page_order(buffer);
3556
3557 base = buffer->user_page;
3558 for (i = 0; i < nr + 1; i++)
3559 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
3560
3561 vfree(base);
3562 kfree(buffer);
3563}
3564
3565static void perf_buffer_free(struct perf_buffer *buffer)
3566{
3567 schedule_work(&buffer->work);
3568}
3569
3570static struct perf_buffer *
3571perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3572{
3573 struct perf_buffer *buffer;
3574 unsigned long size;
3575 void *all_buf;
3576
3577 size = sizeof(struct perf_buffer);
3578 size += sizeof(void *);
3579
3580 buffer = kzalloc(size, GFP_KERNEL);
3581 if (!buffer)
3582 goto fail;
3583
3584 INIT_WORK(&buffer->work, perf_buffer_free_work);
3585
3586 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
3587 if (!all_buf)
3588 goto fail_all_buf;
3589
3590 buffer->user_page = all_buf;
3591 buffer->data_pages[0] = all_buf + PAGE_SIZE;
3592 buffer->page_order = ilog2(nr_pages);
3593 buffer->nr_pages = 1;
3594
3595 perf_buffer_init(buffer, watermark, flags);
3596
3597 return buffer;
3598
3599fail_all_buf:
3600 kfree(buffer);
3601
3602fail:
3603 return NULL;
3604}
3605
3606#endif
3607
3608static unsigned long perf_data_size(struct perf_buffer *buffer)
3609{
3610 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
3611}
3612
3613static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 3420static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3614{ 3421{
3615 struct perf_event *event = vma->vm_file->private_data; 3422 struct perf_event *event = vma->vm_file->private_data;
3616 struct perf_buffer *buffer; 3423 struct ring_buffer *rb;
3617 int ret = VM_FAULT_SIGBUS; 3424 int ret = VM_FAULT_SIGBUS;
3618 3425
3619 if (vmf->flags & FAULT_FLAG_MKWRITE) { 3426 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -3623,14 +3430,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3623 } 3430 }
3624 3431
3625 rcu_read_lock(); 3432 rcu_read_lock();
3626 buffer = rcu_dereference(event->buffer); 3433 rb = rcu_dereference(event->rb);
3627 if (!buffer) 3434 if (!rb)
3628 goto unlock; 3435 goto unlock;
3629 3436
3630 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 3437 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3631 goto unlock; 3438 goto unlock;
3632 3439
3633 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); 3440 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
3634 if (!vmf->page) 3441 if (!vmf->page)
3635 goto unlock; 3442 goto unlock;
3636 3443
@@ -3645,35 +3452,35 @@ unlock:
3645 return ret; 3452 return ret;
3646} 3453}
3647 3454
3648static void perf_buffer_free_rcu(struct rcu_head *rcu_head) 3455static void rb_free_rcu(struct rcu_head *rcu_head)
3649{ 3456{
3650 struct perf_buffer *buffer; 3457 struct ring_buffer *rb;
3651 3458
3652 buffer = container_of(rcu_head, struct perf_buffer, rcu_head); 3459 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3653 perf_buffer_free(buffer); 3460 rb_free(rb);
3654} 3461}
3655 3462
3656static struct perf_buffer *perf_buffer_get(struct perf_event *event) 3463static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3657{ 3464{
3658 struct perf_buffer *buffer; 3465 struct ring_buffer *rb;
3659 3466
3660 rcu_read_lock(); 3467 rcu_read_lock();
3661 buffer = rcu_dereference(event->buffer); 3468 rb = rcu_dereference(event->rb);
3662 if (buffer) { 3469 if (rb) {
3663 if (!atomic_inc_not_zero(&buffer->refcount)) 3470 if (!atomic_inc_not_zero(&rb->refcount))
3664 buffer = NULL; 3471 rb = NULL;
3665 } 3472 }
3666 rcu_read_unlock(); 3473 rcu_read_unlock();
3667 3474
3668 return buffer; 3475 return rb;
3669} 3476}
3670 3477
3671static void perf_buffer_put(struct perf_buffer *buffer) 3478static void ring_buffer_put(struct ring_buffer *rb)
3672{ 3479{
3673 if (!atomic_dec_and_test(&buffer->refcount)) 3480 if (!atomic_dec_and_test(&rb->refcount))
3674 return; 3481 return;
3675 3482
3676 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); 3483 call_rcu(&rb->rcu_head, rb_free_rcu);
3677} 3484}
3678 3485
3679static void perf_mmap_open(struct vm_area_struct *vma) 3486static void perf_mmap_open(struct vm_area_struct *vma)
@@ -3688,16 +3495,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3688 struct perf_event *event = vma->vm_file->private_data; 3495 struct perf_event *event = vma->vm_file->private_data;
3689 3496
3690 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 3497 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
3691 unsigned long size = perf_data_size(event->buffer); 3498 unsigned long size = perf_data_size(event->rb);
3692 struct user_struct *user = event->mmap_user; 3499 struct user_struct *user = event->mmap_user;
3693 struct perf_buffer *buffer = event->buffer; 3500 struct ring_buffer *rb = event->rb;
3694 3501
3695 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3502 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3696 vma->vm_mm->locked_vm -= event->mmap_locked; 3503 vma->vm_mm->locked_vm -= event->mmap_locked;
3697 rcu_assign_pointer(event->buffer, NULL); 3504 rcu_assign_pointer(event->rb, NULL);
3698 mutex_unlock(&event->mmap_mutex); 3505 mutex_unlock(&event->mmap_mutex);
3699 3506
3700 perf_buffer_put(buffer); 3507 ring_buffer_put(rb);
3701 free_uid(user); 3508 free_uid(user);
3702 } 3509 }
3703} 3510}
@@ -3715,7 +3522,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3715 unsigned long user_locked, user_lock_limit; 3522 unsigned long user_locked, user_lock_limit;
3716 struct user_struct *user = current_user(); 3523 struct user_struct *user = current_user();
3717 unsigned long locked, lock_limit; 3524 unsigned long locked, lock_limit;
3718 struct perf_buffer *buffer; 3525 struct ring_buffer *rb;
3719 unsigned long vma_size; 3526 unsigned long vma_size;
3720 unsigned long nr_pages; 3527 unsigned long nr_pages;
3721 long user_extra, extra; 3528 long user_extra, extra;
@@ -3724,7 +3531,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3724 /* 3531 /*
3725 * Don't allow mmap() of inherited per-task counters. This would 3532 * Don't allow mmap() of inherited per-task counters. This would
3726 * create a performance issue due to all children writing to the 3533 * create a performance issue due to all children writing to the
3727 * same buffer. 3534 * same rb.
3728 */ 3535 */
3729 if (event->cpu == -1 && event->attr.inherit) 3536 if (event->cpu == -1 && event->attr.inherit)
3730 return -EINVAL; 3537 return -EINVAL;
@@ -3736,7 +3543,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3736 nr_pages = (vma_size / PAGE_SIZE) - 1; 3543 nr_pages = (vma_size / PAGE_SIZE) - 1;
3737 3544
3738 /* 3545 /*
3739 * If we have buffer pages ensure they're a power-of-two number, so we 3546 * If we have rb pages ensure they're a power-of-two number, so we
3740 * can do bitmasks instead of modulo. 3547 * can do bitmasks instead of modulo.
3741 */ 3548 */
3742 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 3549 if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -3750,9 +3557,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3750 3557
3751 WARN_ON_ONCE(event->ctx->parent_ctx); 3558 WARN_ON_ONCE(event->ctx->parent_ctx);
3752 mutex_lock(&event->mmap_mutex); 3559 mutex_lock(&event->mmap_mutex);
3753 if (event->buffer) { 3560 if (event->rb) {
3754 if (event->buffer->nr_pages == nr_pages) 3561 if (event->rb->nr_pages == nr_pages)
3755 atomic_inc(&event->buffer->refcount); 3562 atomic_inc(&event->rb->refcount);
3756 else 3563 else
3757 ret = -EINVAL; 3564 ret = -EINVAL;
3758 goto unlock; 3565 goto unlock;
@@ -3782,18 +3589,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3782 goto unlock; 3589 goto unlock;
3783 } 3590 }
3784 3591
3785 WARN_ON(event->buffer); 3592 WARN_ON(event->rb);
3786 3593
3787 if (vma->vm_flags & VM_WRITE) 3594 if (vma->vm_flags & VM_WRITE)
3788 flags |= PERF_BUFFER_WRITABLE; 3595 flags |= RING_BUFFER_WRITABLE;
3789 3596
3790 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, 3597 rb = rb_alloc(nr_pages,
3791 event->cpu, flags); 3598 event->attr.watermark ? event->attr.wakeup_watermark : 0,
3792 if (!buffer) { 3599 event->cpu, flags);
3600
3601 if (!rb) {
3793 ret = -ENOMEM; 3602 ret = -ENOMEM;
3794 goto unlock; 3603 goto unlock;
3795 } 3604 }
3796 rcu_assign_pointer(event->buffer, buffer); 3605 rcu_assign_pointer(event->rb, rb);
3797 3606
3798 atomic_long_add(user_extra, &user->locked_vm); 3607 atomic_long_add(user_extra, &user->locked_vm);
3799 event->mmap_locked = extra; 3608 event->mmap_locked = extra;
@@ -3892,117 +3701,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3892} 3701}
3893EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 3702EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3894 3703
3895/*
3896 * Output
3897 */
3898static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
3899 unsigned long offset, unsigned long head)
3900{
3901 unsigned long mask;
3902
3903 if (!buffer->writable)
3904 return true;
3905
3906 mask = perf_data_size(buffer) - 1;
3907
3908 offset = (offset - tail) & mask;
3909 head = (head - tail) & mask;
3910
3911 if ((int)(head - offset) < 0)
3912 return false;
3913
3914 return true;
3915}
3916
3917static void perf_output_wakeup(struct perf_output_handle *handle)
3918{
3919 atomic_set(&handle->buffer->poll, POLL_IN);
3920
3921 if (handle->nmi) {
3922 handle->event->pending_wakeup = 1;
3923 irq_work_queue(&handle->event->pending);
3924 } else
3925 perf_event_wakeup(handle->event);
3926}
3927
3928/*
3929 * We need to ensure a later event_id doesn't publish a head when a former
3930 * event isn't done writing. However since we need to deal with NMIs we
3931 * cannot fully serialize things.
3932 *
3933 * We only publish the head (and generate a wakeup) when the outer-most
3934 * event completes.
3935 */
3936static void perf_output_get_handle(struct perf_output_handle *handle)
3937{
3938 struct perf_buffer *buffer = handle->buffer;
3939
3940 preempt_disable();
3941 local_inc(&buffer->nest);
3942 handle->wakeup = local_read(&buffer->wakeup);
3943}
3944
3945static void perf_output_put_handle(struct perf_output_handle *handle)
3946{
3947 struct perf_buffer *buffer = handle->buffer;
3948 unsigned long head;
3949
3950again:
3951 head = local_read(&buffer->head);
3952
3953 /*
3954 * IRQ/NMI can happen here, which means we can miss a head update.
3955 */
3956
3957 if (!local_dec_and_test(&buffer->nest))
3958 goto out;
3959
3960 /*
3961 * Publish the known good head. Rely on the full barrier implied
3962 * by atomic_dec_and_test() order the buffer->head read and this
3963 * write.
3964 */
3965 buffer->user_page->data_head = head;
3966
3967 /*
3968 * Now check if we missed an update, rely on the (compiler)
3969 * barrier in atomic_dec_and_test() to re-read buffer->head.
3970 */
3971 if (unlikely(head != local_read(&buffer->head))) {
3972 local_inc(&buffer->nest);
3973 goto again;
3974 }
3975
3976 if (handle->wakeup != local_read(&buffer->wakeup))
3977 perf_output_wakeup(handle);
3978
3979out:
3980 preempt_enable();
3981}
3982
3983__always_inline void perf_output_copy(struct perf_output_handle *handle,
3984 const void *buf, unsigned int len)
3985{
3986 do {
3987 unsigned long size = min_t(unsigned long, handle->size, len);
3988
3989 memcpy(handle->addr, buf, size);
3990
3991 len -= size;
3992 handle->addr += size;
3993 buf += size;
3994 handle->size -= size;
3995 if (!handle->size) {
3996 struct perf_buffer *buffer = handle->buffer;
3997
3998 handle->page++;
3999 handle->page &= buffer->nr_pages - 1;
4000 handle->addr = buffer->data_pages[handle->page];
4001 handle->size = PAGE_SIZE << page_order(buffer);
4002 }
4003 } while (len);
4004}
4005
4006static void __perf_event_header__init_id(struct perf_event_header *header, 3704static void __perf_event_header__init_id(struct perf_event_header *header,
4007 struct perf_sample_data *data, 3705 struct perf_sample_data *data,
4008 struct perf_event *event) 3706 struct perf_event *event)
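
[Editor's note] The block removed above (perf_output_space(), the nest-counted get/put handle pair, perf_output_copy()) moves into kernel/events/ring_buffer.c in this merge. perf_output_copy() writes a record in page-sized chunks and wraps the page index with a mask, which is why nr_pages must be a power of two. A simplified userspace sketch of that chunked copy, under that same assumption:

    #include <string.h>

    #define PAGE_SIZE 4096

    struct out_handle {
            unsigned char *pages[8];   /* nr_pages must be a power of two */
            unsigned long nr_pages;
            unsigned long page;        /* current page index              */
            unsigned long offset;      /* offset within the current page  */
    };

    /* Chunked copy in the spirit of the removed perf_output_copy(): split
     * the record at page boundaries and wrap the page index with a mask
     * instead of a modulo. */
    static void output_copy(struct out_handle *h, const void *buf,
                            unsigned long len)
    {
            while (len) {
                    unsigned long space = PAGE_SIZE - h->offset;
                    unsigned long size = len < space ? len : space;

                    memcpy(h->pages[h->page] + h->offset, buf, size);
                    h->offset += size;
                    buf = (const char *)buf + size;
                    len -= size;

                    if (h->offset == PAGE_SIZE) {
                            h->offset = 0;
                            h->page = (h->page + 1) & (h->nr_pages - 1);
                    }
            }
    }
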
@@ -4033,9 +3731,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4033 } 3731 }
4034} 3732}
4035 3733
4036static void perf_event_header__init_id(struct perf_event_header *header, 3734void perf_event_header__init_id(struct perf_event_header *header,
4037 struct perf_sample_data *data, 3735 struct perf_sample_data *data,
4038 struct perf_event *event) 3736 struct perf_event *event)
4039{ 3737{
4040 if (event->attr.sample_id_all) 3738 if (event->attr.sample_id_all)
4041 __perf_event_header__init_id(header, data, event); 3739 __perf_event_header__init_id(header, data, event);
@@ -4062,121 +3760,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4062 perf_output_put(handle, data->cpu_entry); 3760 perf_output_put(handle, data->cpu_entry);
4063} 3761}
4064 3762
4065static void perf_event__output_id_sample(struct perf_event *event, 3763void perf_event__output_id_sample(struct perf_event *event,
4066 struct perf_output_handle *handle, 3764 struct perf_output_handle *handle,
4067 struct perf_sample_data *sample) 3765 struct perf_sample_data *sample)
4068{ 3766{
4069 if (event->attr.sample_id_all) 3767 if (event->attr.sample_id_all)
4070 __perf_event__output_id_sample(handle, sample); 3768 __perf_event__output_id_sample(handle, sample);
4071} 3769}
4072 3770
4073int perf_output_begin(struct perf_output_handle *handle,
4074 struct perf_event *event, unsigned int size,
4075 int nmi, int sample)
4076{
4077 struct perf_buffer *buffer;
4078 unsigned long tail, offset, head;
4079 int have_lost;
4080 struct perf_sample_data sample_data;
4081 struct {
4082 struct perf_event_header header;
4083 u64 id;
4084 u64 lost;
4085 } lost_event;
4086
4087 rcu_read_lock();
4088 /*
4089 * For inherited events we send all the output towards the parent.
4090 */
4091 if (event->parent)
4092 event = event->parent;
4093
4094 buffer = rcu_dereference(event->buffer);
4095 if (!buffer)
4096 goto out;
4097
4098 handle->buffer = buffer;
4099 handle->event = event;
4100 handle->nmi = nmi;
4101 handle->sample = sample;
4102
4103 if (!buffer->nr_pages)
4104 goto out;
4105
4106 have_lost = local_read(&buffer->lost);
4107 if (have_lost) {
4108 lost_event.header.size = sizeof(lost_event);
4109 perf_event_header__init_id(&lost_event.header, &sample_data,
4110 event);
4111 size += lost_event.header.size;
4112 }
4113
4114 perf_output_get_handle(handle);
4115
4116 do {
4117 /*
4118 * Userspace could choose to issue a mb() before updating the
4119 * tail pointer. So that all reads will be completed before the
4120 * write is issued.
4121 */
4122 tail = ACCESS_ONCE(buffer->user_page->data_tail);
4123 smp_rmb();
4124 offset = head = local_read(&buffer->head);
4125 head += size;
4126 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
4127 goto fail;
4128 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
4129
4130 if (head - local_read(&buffer->wakeup) > buffer->watermark)
4131 local_add(buffer->watermark, &buffer->wakeup);
4132
4133 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
4134 handle->page &= buffer->nr_pages - 1;
4135 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
4136 handle->addr = buffer->data_pages[handle->page];
4137 handle->addr += handle->size;
4138 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
4139
4140 if (have_lost) {
4141 lost_event.header.type = PERF_RECORD_LOST;
4142 lost_event.header.misc = 0;
4143 lost_event.id = event->id;
4144 lost_event.lost = local_xchg(&buffer->lost, 0);
4145
4146 perf_output_put(handle, lost_event);
4147 perf_event__output_id_sample(event, handle, &sample_data);
4148 }
4149
4150 return 0;
4151
4152fail:
4153 local_inc(&buffer->lost);
4154 perf_output_put_handle(handle);
4155out:
4156 rcu_read_unlock();
4157
4158 return -ENOSPC;
4159}
4160
4161void perf_output_end(struct perf_output_handle *handle)
4162{
4163 struct perf_event *event = handle->event;
4164 struct perf_buffer *buffer = handle->buffer;
4165
4166 int wakeup_events = event->attr.wakeup_events;
4167
4168 if (handle->sample && wakeup_events) {
4169 int events = local_inc_return(&buffer->events);
4170 if (events >= wakeup_events) {
4171 local_sub(wakeup_events, &buffer->events);
4172 local_inc(&buffer->wakeup);
4173 }
4174 }
4175
4176 perf_output_put_handle(handle);
4177 rcu_read_unlock();
4178}
4179
4180static void perf_output_read_one(struct perf_output_handle *handle, 3771static void perf_output_read_one(struct perf_output_handle *handle,
4181 struct perf_event *event, 3772 struct perf_event *event,
4182 u64 enabled, u64 running) 3773 u64 enabled, u64 running)
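
[Editor's note] The perf_output_begin() removed above (re-homed in kernel/events/ring_buffer.c) reserves buffer space with a compare-and-swap loop: read the user-visible tail, compute a new head, and retry if another writer advanced the head concurrently. A hedged userspace analogue of just that reservation step, using C11 atomics in place of local_cmpxchg(); the memory-ordering details (ACCESS_ONCE on data_tail plus smp_rmb()) and the lost-record accounting are elided:

    #include <stdatomic.h>

    struct rbuf {
            _Atomic unsigned long head;   /* next write position            */
            unsigned long tail;           /* last position read by consumer */
            unsigned long size;           /* total capacity                 */
    };

    /* Claim 'len' bytes, mirroring the cmpxchg loop of the removed
     * perf_output_begin(): recompute on every retry so a concurrent writer
     * cannot hand us a stale offset. Returns the claimed offset, or -1 if
     * the buffer is full (the real code bumps a 'lost' counter instead). */
    static long reserve(struct rbuf *rb, unsigned long len)
    {
            unsigned long offset, head;

            do {
                    offset = atomic_load(&rb->head);
                    head = offset + len;
                    if (head - rb->tail > rb->size)
                            return -1;      /* no space */
            } while (!atomic_compare_exchange_weak(&rb->head, &offset, head));

            return (long)offset;
    }
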
@@ -4197,7 +3788,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
4197 if (read_format & PERF_FORMAT_ID) 3788 if (read_format & PERF_FORMAT_ID)
4198 values[n++] = primary_event_id(event); 3789 values[n++] = primary_event_id(event);
4199 3790
4200 perf_output_copy(handle, values, n * sizeof(u64)); 3791 __output_copy(handle, values, n * sizeof(u64));
4201} 3792}
4202 3793
4203/* 3794/*
@@ -4227,7 +3818,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4227 if (read_format & PERF_FORMAT_ID) 3818 if (read_format & PERF_FORMAT_ID)
4228 values[n++] = primary_event_id(leader); 3819 values[n++] = primary_event_id(leader);
4229 3820
4230 perf_output_copy(handle, values, n * sizeof(u64)); 3821 __output_copy(handle, values, n * sizeof(u64));
4231 3822
4232 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 3823 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4233 n = 0; 3824 n = 0;
@@ -4239,7 +3830,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4239 if (read_format & PERF_FORMAT_ID) 3830 if (read_format & PERF_FORMAT_ID)
4240 values[n++] = primary_event_id(sub); 3831 values[n++] = primary_event_id(sub);
4241 3832
4242 perf_output_copy(handle, values, n * sizeof(u64)); 3833 __output_copy(handle, values, n * sizeof(u64));
4243 } 3834 }
4244} 3835}
4245 3836
@@ -4249,7 +3840,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4249static void perf_output_read(struct perf_output_handle *handle, 3840static void perf_output_read(struct perf_output_handle *handle,
4250 struct perf_event *event) 3841 struct perf_event *event)
4251{ 3842{
4252 u64 enabled = 0, running = 0, now, ctx_time; 3843 u64 enabled = 0, running = 0;
4253 u64 read_format = event->attr.read_format; 3844 u64 read_format = event->attr.read_format;
4254 3845
4255 /* 3846 /*
@@ -4261,12 +3852,8 @@ static void perf_output_read(struct perf_output_handle *handle,
4261 * because of locking issue as we are called in 3852 * because of locking issue as we are called in
4262 * NMI context 3853 * NMI context
4263 */ 3854 */
4264 if (read_format & PERF_FORMAT_TOTAL_TIMES) { 3855 if (read_format & PERF_FORMAT_TOTAL_TIMES)
4265 now = perf_clock(); 3856 calc_timer_values(event, &enabled, &running);
4266 ctx_time = event->shadow_ctx_time + now;
4267 enabled = ctx_time - event->tstamp_enabled;
4268 running = ctx_time - event->tstamp_running;
4269 }
4270 3857
4271 if (event->attr.read_format & PERF_FORMAT_GROUP) 3858 if (event->attr.read_format & PERF_FORMAT_GROUP)
4272 perf_output_read_group(handle, event, enabled, running); 3859 perf_output_read_group(handle, event, enabled, running);
@@ -4319,7 +3906,7 @@ void perf_output_sample(struct perf_output_handle *handle,
4319 3906
4320 size *= sizeof(u64); 3907 size *= sizeof(u64);
4321 3908
4322 perf_output_copy(handle, data->callchain, size); 3909 __output_copy(handle, data->callchain, size);
4323 } else { 3910 } else {
4324 u64 nr = 0; 3911 u64 nr = 0;
4325 perf_output_put(handle, nr); 3912 perf_output_put(handle, nr);
@@ -4329,8 +3916,8 @@ void perf_output_sample(struct perf_output_handle *handle,
4329 if (sample_type & PERF_SAMPLE_RAW) { 3916 if (sample_type & PERF_SAMPLE_RAW) {
4330 if (data->raw) { 3917 if (data->raw) {
4331 perf_output_put(handle, data->raw->size); 3918 perf_output_put(handle, data->raw->size);
4332 perf_output_copy(handle, data->raw->data, 3919 __output_copy(handle, data->raw->data,
4333 data->raw->size); 3920 data->raw->size);
4334 } else { 3921 } else {
4335 struct { 3922 struct {
4336 u32 size; 3923 u32 size;
@@ -4342,6 +3929,20 @@ void perf_output_sample(struct perf_output_handle *handle,
4342 perf_output_put(handle, raw); 3929 perf_output_put(handle, raw);
4343 } 3930 }
4344 } 3931 }
3932
3933 if (!event->attr.watermark) {
3934 int wakeup_events = event->attr.wakeup_events;
3935
3936 if (wakeup_events) {
3937 struct ring_buffer *rb = handle->rb;
3938 int events = local_inc_return(&rb->events);
3939
3940 if (events >= wakeup_events) {
3941 local_sub(wakeup_events, &rb->events);
3942 local_inc(&rb->wakeup);
3943 }
3944 }
3945 }
4345} 3946}
4346 3947
4347void perf_prepare_sample(struct perf_event_header *header, 3948void perf_prepare_sample(struct perf_event_header *header,
@@ -4386,7 +3987,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4386 } 3987 }
4387} 3988}
4388 3989
4389static void perf_event_output(struct perf_event *event, int nmi, 3990static void perf_event_output(struct perf_event *event,
4390 struct perf_sample_data *data, 3991 struct perf_sample_data *data,
4391 struct pt_regs *regs) 3992 struct pt_regs *regs)
4392{ 3993{
@@ -4398,7 +3999,7 @@ static void perf_event_output(struct perf_event *event, int nmi,
4398 3999
4399 perf_prepare_sample(&header, data, event, regs); 4000 perf_prepare_sample(&header, data, event, regs);
4400 4001
4401 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 4002 if (perf_output_begin(&handle, event, header.size))
4402 goto exit; 4003 goto exit;
4403 4004
4404 perf_output_sample(&handle, &header, data, event); 4005 perf_output_sample(&handle, &header, data, event);
@@ -4438,7 +4039,7 @@ perf_event_read_event(struct perf_event *event,
4438 int ret; 4039 int ret;
4439 4040
4440 perf_event_header__init_id(&read_event.header, &sample, event); 4041 perf_event_header__init_id(&read_event.header, &sample, event);
4441 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 4042 ret = perf_output_begin(&handle, event, read_event.header.size);
4442 if (ret) 4043 if (ret)
4443 return; 4044 return;
4444 4045
@@ -4481,7 +4082,7 @@ static void perf_event_task_output(struct perf_event *event,
4481 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4082 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4482 4083
4483 ret = perf_output_begin(&handle, event, 4084 ret = perf_output_begin(&handle, event,
4484 task_event->event_id.header.size, 0, 0); 4085 task_event->event_id.header.size);
4485 if (ret) 4086 if (ret)
4486 goto out; 4087 goto out;
4487 4088
@@ -4618,7 +4219,7 @@ static void perf_event_comm_output(struct perf_event *event,
4618 4219
4619 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 4220 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4620 ret = perf_output_begin(&handle, event, 4221 ret = perf_output_begin(&handle, event,
4621 comm_event->event_id.header.size, 0, 0); 4222 comm_event->event_id.header.size);
4622 4223
4623 if (ret) 4224 if (ret)
4624 goto out; 4225 goto out;
@@ -4627,7 +4228,7 @@ static void perf_event_comm_output(struct perf_event *event,
4627 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4228 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4628 4229
4629 perf_output_put(&handle, comm_event->event_id); 4230 perf_output_put(&handle, comm_event->event_id);
4630 perf_output_copy(&handle, comm_event->comm, 4231 __output_copy(&handle, comm_event->comm,
4631 comm_event->comm_size); 4232 comm_event->comm_size);
4632 4233
4633 perf_event__output_id_sample(event, &handle, &sample); 4234 perf_event__output_id_sample(event, &handle, &sample);
@@ -4765,7 +4366,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4765 4366
4766 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 4367 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4767 ret = perf_output_begin(&handle, event, 4368 ret = perf_output_begin(&handle, event,
4768 mmap_event->event_id.header.size, 0, 0); 4369 mmap_event->event_id.header.size);
4769 if (ret) 4370 if (ret)
4770 goto out; 4371 goto out;
4771 4372
@@ -4773,7 +4374,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4773 mmap_event->event_id.tid = perf_event_tid(event, current); 4374 mmap_event->event_id.tid = perf_event_tid(event, current);
4774 4375
4775 perf_output_put(&handle, mmap_event->event_id); 4376 perf_output_put(&handle, mmap_event->event_id);
4776 perf_output_copy(&handle, mmap_event->file_name, 4377 __output_copy(&handle, mmap_event->file_name,
4777 mmap_event->file_size); 4378 mmap_event->file_size);
4778 4379
4779 perf_event__output_id_sample(event, &handle, &sample); 4380 perf_event__output_id_sample(event, &handle, &sample);
@@ -4829,7 +4430,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4829 4430
4830 if (file) { 4431 if (file) {
4831 /* 4432 /*
4832 * d_path works from the end of the buffer backwards, so we 4433 * d_path works from the end of the rb backwards, so we
4833 * need to add enough zero bytes after the string to handle 4434 * need to add enough zero bytes after the string to handle
4834 * the 64bit alignment we do later. 4435 * the 64bit alignment we do later.
4835 */ 4436 */
@@ -4960,7 +4561,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4960 perf_event_header__init_id(&throttle_event.header, &sample, event); 4561 perf_event_header__init_id(&throttle_event.header, &sample, event);
4961 4562
4962 ret = perf_output_begin(&handle, event, 4563 ret = perf_output_begin(&handle, event,
4963 throttle_event.header.size, 1, 0); 4564 throttle_event.header.size);
4964 if (ret) 4565 if (ret)
4965 return; 4566 return;
4966 4567
@@ -4973,7 +4574,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4973 * Generic event overflow handling, sampling. 4574 * Generic event overflow handling, sampling.
4974 */ 4575 */
4975 4576
4976static int __perf_event_overflow(struct perf_event *event, int nmi, 4577static int __perf_event_overflow(struct perf_event *event,
4977 int throttle, struct perf_sample_data *data, 4578 int throttle, struct perf_sample_data *data,
4978 struct pt_regs *regs) 4579 struct pt_regs *regs)
4979{ 4580{
@@ -5016,34 +4617,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
5016 if (events && atomic_dec_and_test(&event->event_limit)) { 4617 if (events && atomic_dec_and_test(&event->event_limit)) {
5017 ret = 1; 4618 ret = 1;
5018 event->pending_kill = POLL_HUP; 4619 event->pending_kill = POLL_HUP;
5019 if (nmi) { 4620 event->pending_disable = 1;
5020 event->pending_disable = 1; 4621 irq_work_queue(&event->pending);
5021 irq_work_queue(&event->pending);
5022 } else
5023 perf_event_disable(event);
5024 } 4622 }
5025 4623
5026 if (event->overflow_handler) 4624 if (event->overflow_handler)
5027 event->overflow_handler(event, nmi, data, regs); 4625 event->overflow_handler(event, data, regs);
5028 else 4626 else
5029 perf_event_output(event, nmi, data, regs); 4627 perf_event_output(event, data, regs);
5030 4628
5031 if (event->fasync && event->pending_kill) { 4629 if (event->fasync && event->pending_kill) {
5032 if (nmi) { 4630 event->pending_wakeup = 1;
5033 event->pending_wakeup = 1; 4631 irq_work_queue(&event->pending);
5034 irq_work_queue(&event->pending);
5035 } else
5036 perf_event_wakeup(event);
5037 } 4632 }
5038 4633
5039 return ret; 4634 return ret;
5040} 4635}
5041 4636
5042int perf_event_overflow(struct perf_event *event, int nmi, 4637int perf_event_overflow(struct perf_event *event,
5043 struct perf_sample_data *data, 4638 struct perf_sample_data *data,
5044 struct pt_regs *regs) 4639 struct pt_regs *regs)
5045{ 4640{
5046 return __perf_event_overflow(event, nmi, 1, data, regs); 4641 return __perf_event_overflow(event, 1, data, regs);
5047} 4642}
5048 4643
5049/* 4644/*
@@ -5092,7 +4687,7 @@ again:
5092} 4687}
5093 4688
5094static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 4689static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5095 int nmi, struct perf_sample_data *data, 4690 struct perf_sample_data *data,
5096 struct pt_regs *regs) 4691 struct pt_regs *regs)
5097{ 4692{
5098 struct hw_perf_event *hwc = &event->hw; 4693 struct hw_perf_event *hwc = &event->hw;
@@ -5106,7 +4701,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5106 return; 4701 return;
5107 4702
5108 for (; overflow; overflow--) { 4703 for (; overflow; overflow--) {
5109 if (__perf_event_overflow(event, nmi, throttle, 4704 if (__perf_event_overflow(event, throttle,
5110 data, regs)) { 4705 data, regs)) {
5111 /* 4706 /*
5112 * We inhibit the overflow from happening when 4707 * We inhibit the overflow from happening when
@@ -5119,7 +4714,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5119} 4714}
5120 4715
5121static void perf_swevent_event(struct perf_event *event, u64 nr, 4716static void perf_swevent_event(struct perf_event *event, u64 nr,
5122 int nmi, struct perf_sample_data *data, 4717 struct perf_sample_data *data,
5123 struct pt_regs *regs) 4718 struct pt_regs *regs)
5124{ 4719{
5125 struct hw_perf_event *hwc = &event->hw; 4720 struct hw_perf_event *hwc = &event->hw;
@@ -5133,12 +4728,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
5133 return; 4728 return;
5134 4729
5135 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4730 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5136 return perf_swevent_overflow(event, 1, nmi, data, regs); 4731 return perf_swevent_overflow(event, 1, data, regs);
5137 4732
5138 if (local64_add_negative(nr, &hwc->period_left)) 4733 if (local64_add_negative(nr, &hwc->period_left))
5139 return; 4734 return;
5140 4735
5141 perf_swevent_overflow(event, 0, nmi, data, regs); 4736 perf_swevent_overflow(event, 0, data, regs);
5142} 4737}
5143 4738
5144static int perf_exclude_event(struct perf_event *event, 4739static int perf_exclude_event(struct perf_event *event,
@@ -5226,7 +4821,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
5226} 4821}
5227 4822
5228static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4823static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5229 u64 nr, int nmi, 4824 u64 nr,
5230 struct perf_sample_data *data, 4825 struct perf_sample_data *data,
5231 struct pt_regs *regs) 4826 struct pt_regs *regs)
5232{ 4827{
@@ -5242,7 +4837,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5242 4837
5243 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4838 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5244 if (perf_swevent_match(event, type, event_id, data, regs)) 4839 if (perf_swevent_match(event, type, event_id, data, regs))
5245 perf_swevent_event(event, nr, nmi, data, regs); 4840 perf_swevent_event(event, nr, data, regs);
5246 } 4841 }
5247end: 4842end:
5248 rcu_read_unlock(); 4843 rcu_read_unlock();
@@ -5263,8 +4858,7 @@ inline void perf_swevent_put_recursion_context(int rctx)
5263 put_recursion_context(swhash->recursion, rctx); 4858 put_recursion_context(swhash->recursion, rctx);
5264} 4859}
5265 4860
5266void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4861void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5267 struct pt_regs *regs, u64 addr)
5268{ 4862{
5269 struct perf_sample_data data; 4863 struct perf_sample_data data;
5270 int rctx; 4864 int rctx;
@@ -5276,7 +4870,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
5276 4870
5277 perf_sample_data_init(&data, addr); 4871 perf_sample_data_init(&data, addr);
5278 4872
5279 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4873 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5280 4874
5281 perf_swevent_put_recursion_context(rctx); 4875 perf_swevent_put_recursion_context(rctx);
5282 preempt_enable_notrace(); 4876 preempt_enable_notrace();
@@ -5524,7 +5118,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5524 5118
5525 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5119 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5526 if (perf_tp_event_match(event, &data, regs)) 5120 if (perf_tp_event_match(event, &data, regs))
5527 perf_swevent_event(event, count, 1, &data, regs); 5121 perf_swevent_event(event, count, &data, regs);
5528 } 5122 }
5529 5123
5530 perf_swevent_put_recursion_context(rctx); 5124 perf_swevent_put_recursion_context(rctx);
@@ -5617,7 +5211,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
5617 perf_sample_data_init(&sample, bp->attr.bp_addr); 5211 perf_sample_data_init(&sample, bp->attr.bp_addr);
5618 5212
5619 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 5213 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5620 perf_swevent_event(bp, 1, 1, &sample, regs); 5214 perf_swevent_event(bp, 1, &sample, regs);
5621} 5215}
5622#endif 5216#endif
5623 5217
@@ -5646,7 +5240,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5646 5240
5647 if (regs && !perf_exclude_event(event, regs)) { 5241 if (regs && !perf_exclude_event(event, regs)) {
5648 if (!(event->attr.exclude_idle && current->pid == 0)) 5242 if (!(event->attr.exclude_idle && current->pid == 0))
5649 if (perf_event_overflow(event, 0, &data, regs)) 5243 if (perf_event_overflow(event, &data, regs))
5650 ret = HRTIMER_NORESTART; 5244 ret = HRTIMER_NORESTART;
5651 } 5245 }
5652 5246
@@ -5986,6 +5580,7 @@ free_dev:
5986} 5580}
5987 5581
5988static struct lock_class_key cpuctx_mutex; 5582static struct lock_class_key cpuctx_mutex;
5583static struct lock_class_key cpuctx_lock;
5989 5584
5990int perf_pmu_register(struct pmu *pmu, char *name, int type) 5585int perf_pmu_register(struct pmu *pmu, char *name, int type)
5991{ 5586{
@@ -6036,6 +5631,7 @@ skip_type:
6036 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5631 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6037 __perf_event_init_context(&cpuctx->ctx); 5632 __perf_event_init_context(&cpuctx->ctx);
6038 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 5633 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5634 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6039 cpuctx->ctx.type = cpu_context; 5635 cpuctx->ctx.type = cpu_context;
6040 cpuctx->ctx.pmu = pmu; 5636 cpuctx->ctx.pmu = pmu;
6041 cpuctx->jiffies_interval = 1; 5637 cpuctx->jiffies_interval = 1;
@@ -6150,7 +5746,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6150 struct task_struct *task, 5746 struct task_struct *task,
6151 struct perf_event *group_leader, 5747 struct perf_event *group_leader,
6152 struct perf_event *parent_event, 5748 struct perf_event *parent_event,
6153 perf_overflow_handler_t overflow_handler) 5749 perf_overflow_handler_t overflow_handler,
5750 void *context)
6154{ 5751{
6155 struct pmu *pmu; 5752 struct pmu *pmu;
6156 struct perf_event *event; 5753 struct perf_event *event;
@@ -6208,10 +5805,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6208#endif 5805#endif
6209 } 5806 }
6210 5807
6211 if (!overflow_handler && parent_event) 5808 if (!overflow_handler && parent_event) {
6212 overflow_handler = parent_event->overflow_handler; 5809 overflow_handler = parent_event->overflow_handler;
5810 context = parent_event->overflow_handler_context;
5811 }
6213 5812
6214 event->overflow_handler = overflow_handler; 5813 event->overflow_handler = overflow_handler;
5814 event->overflow_handler_context = context;
6215 5815
6216 if (attr->disabled) 5816 if (attr->disabled)
6217 event->state = PERF_EVENT_STATE_OFF; 5817 event->state = PERF_EVENT_STATE_OFF;
@@ -6326,13 +5926,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6326 if (ret) 5926 if (ret)
6327 return -EFAULT; 5927 return -EFAULT;
6328 5928
6329 /*
6330 * If the type exists, the corresponding creation will verify
6331 * the attr->config.
6332 */
6333 if (attr->type >= PERF_TYPE_MAX)
6334 return -EINVAL;
6335
6336 if (attr->__reserved_1) 5929 if (attr->__reserved_1)
6337 return -EINVAL; 5930 return -EINVAL;
6338 5931
@@ -6354,7 +5947,7 @@ err_size:
6354static int 5947static int
6355perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5948perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6356{ 5949{
6357 struct perf_buffer *buffer = NULL, *old_buffer = NULL; 5950 struct ring_buffer *rb = NULL, *old_rb = NULL;
6358 int ret = -EINVAL; 5951 int ret = -EINVAL;
6359 5952
6360 if (!output_event) 5953 if (!output_event)
@@ -6371,7 +5964,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6371 goto out; 5964 goto out;
6372 5965
6373 /* 5966 /*
6374 * If its not a per-cpu buffer, it must be the same task. 5967 * If its not a per-cpu rb, it must be the same task.
6375 */ 5968 */
6376 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 5969 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6377 goto out; 5970 goto out;
@@ -6383,20 +5976,20 @@ set:
6383 goto unlock; 5976 goto unlock;
6384 5977
6385 if (output_event) { 5978 if (output_event) {
6386 /* get the buffer we want to redirect to */ 5979 /* get the rb we want to redirect to */
6387 buffer = perf_buffer_get(output_event); 5980 rb = ring_buffer_get(output_event);
6388 if (!buffer) 5981 if (!rb)
6389 goto unlock; 5982 goto unlock;
6390 } 5983 }
6391 5984
6392 old_buffer = event->buffer; 5985 old_rb = event->rb;
6393 rcu_assign_pointer(event->buffer, buffer); 5986 rcu_assign_pointer(event->rb, rb);
6394 ret = 0; 5987 ret = 0;
6395unlock: 5988unlock:
6396 mutex_unlock(&event->mmap_mutex); 5989 mutex_unlock(&event->mmap_mutex);
6397 5990
6398 if (old_buffer) 5991 if (old_rb)
6399 perf_buffer_put(old_buffer); 5992 ring_buffer_put(old_rb);
6400out: 5993out:
6401 return ret; 5994 return ret;
6402} 5995}
@@ -6478,7 +6071,8 @@ SYSCALL_DEFINE5(perf_event_open,
6478 } 6071 }
6479 } 6072 }
6480 6073
6481 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); 6074 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6075 NULL, NULL);
6482 if (IS_ERR(event)) { 6076 if (IS_ERR(event)) {
6483 err = PTR_ERR(event); 6077 err = PTR_ERR(event);
6484 goto err_task; 6078 goto err_task;
@@ -6663,7 +6257,8 @@ err_fd:
6663struct perf_event * 6257struct perf_event *
6664perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 6258perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6665 struct task_struct *task, 6259 struct task_struct *task,
6666 perf_overflow_handler_t overflow_handler) 6260 perf_overflow_handler_t overflow_handler,
6261 void *context)
6667{ 6262{
6668 struct perf_event_context *ctx; 6263 struct perf_event_context *ctx;
6669 struct perf_event *event; 6264 struct perf_event *event;
@@ -6673,7 +6268,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6673 * Get the target context (task or percpu): 6268 * Get the target context (task or percpu):
6674 */ 6269 */
6675 6270
6676 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); 6271 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
6272 overflow_handler, context);
6677 if (IS_ERR(event)) { 6273 if (IS_ERR(event)) {
6678 err = PTR_ERR(event); 6274 err = PTR_ERR(event);
6679 goto err; 6275 goto err;
@@ -6780,7 +6376,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6780 * our context. 6376 * our context.
6781 */ 6377 */
6782 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); 6378 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6783 task_ctx_sched_out(child_ctx, EVENT_ALL);
6784 6379
6785 /* 6380 /*
6786 * Take the context lock here so that if find_get_context is 6381 * Take the context lock here so that if find_get_context is
@@ -6788,6 +6383,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6788 * incremented the context's refcount before we do put_ctx below. 6383 * incremented the context's refcount before we do put_ctx below.
6789 */ 6384 */
6790 raw_spin_lock(&child_ctx->lock); 6385 raw_spin_lock(&child_ctx->lock);
6386 task_ctx_sched_out(child_ctx);
6791 child->perf_event_ctxp[ctxn] = NULL; 6387 child->perf_event_ctxp[ctxn] = NULL;
6792 /* 6388 /*
6793 * If this context is a clone; unclone it so it can't get 6389 * If this context is a clone; unclone it so it can't get
@@ -6957,7 +6553,7 @@ inherit_event(struct perf_event *parent_event,
6957 parent_event->cpu, 6553 parent_event->cpu,
6958 child, 6554 child,
6959 group_leader, parent_event, 6555 group_leader, parent_event,
6960 NULL); 6556 NULL, NULL);
6961 if (IS_ERR(child_event)) 6557 if (IS_ERR(child_event))
6962 return child_event; 6558 return child_event;
6963 get_ctx(child_ctx); 6559 get_ctx(child_ctx);
@@ -6984,6 +6580,8 @@ inherit_event(struct perf_event *parent_event,
6984 6580
6985 child_event->ctx = child_ctx; 6581 child_event->ctx = child_ctx;
6986 child_event->overflow_handler = parent_event->overflow_handler; 6582 child_event->overflow_handler = parent_event->overflow_handler;
6583 child_event->overflow_handler_context
6584 = parent_event->overflow_handler_context;
6987 6585
6988 /* 6586 /*
6989 * Precalculate sample_data sizes 6587 * Precalculate sample_data sizes
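
Note on the events/core.c hunks above: the `nmi` argument is removed from the overflow and software-event paths (pending work is always deferred via irq_work now), and a new `void *context` is threaded through perf_event_alloc() into event->overflow_handler_context. A minimal sketch of a kernel-side caller of the updated API follows; my_overflow_handler and start_cpu_clock_counter are illustrative names, not part of this commit:

    #include <linux/perf_event.h>
    #include <linux/smp.h>

    static void my_overflow_handler(struct perf_event *event,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
    {
            /* may run in NMI context; the nmi flag is no longer passed */
            pr_debug("sample period elapsed\n");
    }

    static struct perf_event *start_cpu_clock_counter(void *ctx)
    {
            struct perf_event_attr attr = {
                    .type          = PERF_TYPE_SOFTWARE,
                    .config        = PERF_COUNT_SW_CPU_CLOCK,
                    .size          = sizeof(attr),
                    .sample_period = 1000000,
            };

            /* the trailing ctx pointer ends up in
             * event->overflow_handler_context and is inherited by
             * child events (see inherit_event above) */
            return perf_event_create_kernel_counter(&attr, smp_processor_id(),
                                                    NULL, my_overflow_handler,
                                                    ctx);
    }
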
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..b7971d6f38bf 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
431struct perf_event * 431struct perf_event *
432register_user_hw_breakpoint(struct perf_event_attr *attr, 432register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 void *context,
434 struct task_struct *tsk) 435 struct task_struct *tsk)
435{ 436{
436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered); 437 return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
438 context);
437} 439}
438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 440EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
439 441
@@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
502 */ 504 */
503struct perf_event * __percpu * 505struct perf_event * __percpu *
504register_wide_hw_breakpoint(struct perf_event_attr *attr, 506register_wide_hw_breakpoint(struct perf_event_attr *attr,
505 perf_overflow_handler_t triggered) 507 perf_overflow_handler_t triggered,
508 void *context)
506{ 509{
507 struct perf_event * __percpu *cpu_events, **pevent, *bp; 510 struct perf_event * __percpu *cpu_events, **pevent, *bp;
508 long err; 511 long err;
@@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
515 get_online_cpus(); 518 get_online_cpus();
516 for_each_online_cpu(cpu) { 519 for_each_online_cpu(cpu) {
517 pevent = per_cpu_ptr(cpu_events, cpu); 520 pevent = per_cpu_ptr(cpu_events, cpu);
518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); 521 bp = perf_event_create_kernel_counter(attr, cpu, NULL,
522 triggered, context);
519 523
520 *pevent = bp; 524 *pevent = bp;
521 525
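
register_user_hw_breakpoint() and register_wide_hw_breakpoint() gain the same `void *context` parameter and simply forward it to perf_event_create_kernel_counter(). A hedged sketch of a per-CPU write watchpoint using the new signature; wp_handler and install_watchpoint are made-up names:

    #include <linux/hw_breakpoint.h>

    static void wp_handler(struct perf_event *bp,
                           struct perf_sample_data *data,
                           struct pt_regs *regs)
    {
            pr_info("watched word was written\n");
    }

    static struct perf_event * __percpu *
    install_watchpoint(void *addr, void *ctx)
    {
            struct perf_event_attr attr;

            hw_breakpoint_init(&attr);
            attr.bp_addr = (unsigned long)addr;
            attr.bp_len  = HW_BREAKPOINT_LEN_4;
            attr.bp_type = HW_BREAKPOINT_W;

            /* ctx is handed back to wp_handler via overflow_handler_context */
            return register_wide_hw_breakpoint(&attr, wp_handler, ctx);
    }
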
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
new file mode 100644
index 000000000000..09097dd8116c
--- /dev/null
+++ b/kernel/events/internal.h
@@ -0,0 +1,96 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H
3
4#define RING_BUFFER_WRITABLE 0x01
5
6struct ring_buffer {
7 atomic_t refcount;
8 struct rcu_head rcu_head;
9#ifdef CONFIG_PERF_USE_VMALLOC
10 struct work_struct work;
11 int page_order; /* allocation order */
12#endif
13 int nr_pages; /* nr of data pages */
14 int writable; /* are we writable */
15
16 atomic_t poll; /* POLL_ for wakeups */
17
18 local_t head; /* write position */
19 local_t nest; /* nested writers */
20 local_t events; /* event limit */
21 local_t wakeup; /* wakeup stamp */
22 local_t lost; /* nr records lost */
23
24 long watermark; /* wakeup watermark */
25
26 struct perf_event_mmap_page *user_page;
27 void *data_pages[0];
28};
29
30extern void rb_free(struct ring_buffer *rb);
31extern struct ring_buffer *
32rb_alloc(int nr_pages, long watermark, int cpu, int flags);
33extern void perf_event_wakeup(struct perf_event *event);
34
35extern void
36perf_event_header__init_id(struct perf_event_header *header,
37 struct perf_sample_data *data,
38 struct perf_event *event);
39extern void
40perf_event__output_id_sample(struct perf_event *event,
41 struct perf_output_handle *handle,
42 struct perf_sample_data *sample);
43
44extern struct page *
45perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
46
47#ifdef CONFIG_PERF_USE_VMALLOC
48/*
49 * Back perf_mmap() with vmalloc memory.
50 *
51 * Required for architectures that have d-cache aliasing issues.
52 */
53
54static inline int page_order(struct ring_buffer *rb)
55{
56 return rb->page_order;
57}
58
59#else
60
61static inline int page_order(struct ring_buffer *rb)
62{
63 return 0;
64}
65#endif
66
67static unsigned long perf_data_size(struct ring_buffer *rb)
68{
69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
70}
71
72static inline void
73__output_copy(struct perf_output_handle *handle,
74 const void *buf, unsigned int len)
75{
76 do {
77 unsigned long size = min_t(unsigned long, handle->size, len);
78
79 memcpy(handle->addr, buf, size);
80
81 len -= size;
82 handle->addr += size;
83 buf += size;
84 handle->size -= size;
85 if (!handle->size) {
86 struct ring_buffer *rb = handle->rb;
87
88 handle->page++;
89 handle->page &= rb->nr_pages - 1;
90 handle->addr = rb->data_pages[handle->page];
91 handle->size = PAGE_SIZE << page_order(rb);
92 }
93 } while (len);
94}
95
96#endif /* _KERNEL_EVENTS_INTERNAL_H */
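
The header above sizes the data area as a power-of-two number of pages so wrapping can be done with a mask. For concreteness, a quick check of the sizing math (illustrative numbers, not from the commit):

    /* with CONFIG_PERF_USE_VMALLOC off, page_order(rb) == 0; for 4 KiB
     * pages (PAGE_SHIFT == 12) and rb->nr_pages == 8:
     *
     *   perf_data_size(rb) == 8 << (12 + 0) == 32768 bytes (32 KiB)
     *
     * __output_copy() wraps with "handle->page &= rb->nr_pages - 1" (== 7),
     * which only works because the mmap'ed data area is a power of two
     * pages long.
     */
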
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
new file mode 100644
index 000000000000..a2a29205cc0f
--- /dev/null
+++ b/kernel/events/ring_buffer.c
@@ -0,0 +1,380 @@
1/*
2 * Performance events ring-buffer code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15
16#include "internal.h"
17
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long mask;
22
23 if (!rb->writable)
24 return true;
25
26 mask = perf_data_size(rb) - 1;
27
28 offset = (offset - tail) & mask;
29 head = (head - tail) & mask;
30
31 if ((int)(head - offset) < 0)
32 return false;
33
34 return true;
35}
36
37static void perf_output_wakeup(struct perf_output_handle *handle)
38{
39 atomic_set(&handle->rb->poll, POLL_IN);
40
41 handle->event->pending_wakeup = 1;
42 irq_work_queue(&handle->event->pending);
43}
44
45/*
46 * We need to ensure a later event_id doesn't publish a head when a former
47 * event isn't done writing. However since we need to deal with NMIs we
48 * cannot fully serialize things.
49 *
50 * We only publish the head (and generate a wakeup) when the outer-most
51 * event completes.
52 */
53static void perf_output_get_handle(struct perf_output_handle *handle)
54{
55 struct ring_buffer *rb = handle->rb;
56
57 preempt_disable();
58 local_inc(&rb->nest);
59 handle->wakeup = local_read(&rb->wakeup);
60}
61
62static void perf_output_put_handle(struct perf_output_handle *handle)
63{
64 struct ring_buffer *rb = handle->rb;
65 unsigned long head;
66
67again:
68 head = local_read(&rb->head);
69
70 /*
71 * IRQ/NMI can happen here, which means we can miss a head update.
72 */
73
74 if (!local_dec_and_test(&rb->nest))
75 goto out;
76
77 /*
78 * Publish the known good head. Rely on the full barrier implied
79 * by atomic_dec_and_test() order the rb->head read and this
80 * write.
81 */
82 rb->user_page->data_head = head;
83
84 /*
85 * Now check if we missed an update, rely on the (compiler)
86 * barrier in atomic_dec_and_test() to re-read rb->head.
87 */
88 if (unlikely(head != local_read(&rb->head))) {
89 local_inc(&rb->nest);
90 goto again;
91 }
92
93 if (handle->wakeup != local_read(&rb->wakeup))
94 perf_output_wakeup(handle);
95
96out:
97 preempt_enable();
98}
99
100int perf_output_begin(struct perf_output_handle *handle,
101 struct perf_event *event, unsigned int size)
102{
103 struct ring_buffer *rb;
104 unsigned long tail, offset, head;
105 int have_lost;
106 struct perf_sample_data sample_data;
107 struct {
108 struct perf_event_header header;
109 u64 id;
110 u64 lost;
111 } lost_event;
112
113 rcu_read_lock();
114 /*
115 * For inherited events we send all the output towards the parent.
116 */
117 if (event->parent)
118 event = event->parent;
119
120 rb = rcu_dereference(event->rb);
121 if (!rb)
122 goto out;
123
124 handle->rb = rb;
125 handle->event = event;
126
127 if (!rb->nr_pages)
128 goto out;
129
130 have_lost = local_read(&rb->lost);
131 if (have_lost) {
132 lost_event.header.size = sizeof(lost_event);
133 perf_event_header__init_id(&lost_event.header, &sample_data,
134 event);
135 size += lost_event.header.size;
136 }
137
138 perf_output_get_handle(handle);
139
140 do {
141 /*
142 * Userspace could choose to issue a mb() before updating the
143 * tail pointer. So that all reads will be completed before the
144 * write is issued.
145 */
146 tail = ACCESS_ONCE(rb->user_page->data_tail);
147 smp_rmb();
148 offset = head = local_read(&rb->head);
149 head += size;
150 if (unlikely(!perf_output_space(rb, tail, offset, head)))
151 goto fail;
152 } while (local_cmpxchg(&rb->head, offset, head) != offset);
153
154 if (head - local_read(&rb->wakeup) > rb->watermark)
155 local_add(rb->watermark, &rb->wakeup);
156
157 handle->page = offset >> (PAGE_SHIFT + page_order(rb));
158 handle->page &= rb->nr_pages - 1;
159 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
160 handle->addr = rb->data_pages[handle->page];
161 handle->addr += handle->size;
162 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
163
164 if (have_lost) {
165 lost_event.header.type = PERF_RECORD_LOST;
166 lost_event.header.misc = 0;
167 lost_event.id = event->id;
168 lost_event.lost = local_xchg(&rb->lost, 0);
169
170 perf_output_put(handle, lost_event);
171 perf_event__output_id_sample(event, handle, &sample_data);
172 }
173
174 return 0;
175
176fail:
177 local_inc(&rb->lost);
178 perf_output_put_handle(handle);
179out:
180 rcu_read_unlock();
181
182 return -ENOSPC;
183}
184
185void perf_output_copy(struct perf_output_handle *handle,
186 const void *buf, unsigned int len)
187{
188 __output_copy(handle, buf, len);
189}
190
191void perf_output_end(struct perf_output_handle *handle)
192{
193 perf_output_put_handle(handle);
194 rcu_read_unlock();
195}
196
197static void
198ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
199{
200 long max_size = perf_data_size(rb);
201
202 if (watermark)
203 rb->watermark = min(max_size, watermark);
204
205 if (!rb->watermark)
206 rb->watermark = max_size / 2;
207
208 if (flags & RING_BUFFER_WRITABLE)
209 rb->writable = 1;
210
211 atomic_set(&rb->refcount, 1);
212}
213
214#ifndef CONFIG_PERF_USE_VMALLOC
215
216/*
217 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
218 */
219
220struct page *
221perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
222{
223 if (pgoff > rb->nr_pages)
224 return NULL;
225
226 if (pgoff == 0)
227 return virt_to_page(rb->user_page);
228
229 return virt_to_page(rb->data_pages[pgoff - 1]);
230}
231
232static void *perf_mmap_alloc_page(int cpu)
233{
234 struct page *page;
235 int node;
236
237 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
238 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
239 if (!page)
240 return NULL;
241
242 return page_address(page);
243}
244
245struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
246{
247 struct ring_buffer *rb;
248 unsigned long size;
249 int i;
250
251 size = sizeof(struct ring_buffer);
252 size += nr_pages * sizeof(void *);
253
254 rb = kzalloc(size, GFP_KERNEL);
255 if (!rb)
256 goto fail;
257
258 rb->user_page = perf_mmap_alloc_page(cpu);
259 if (!rb->user_page)
260 goto fail_user_page;
261
262 for (i = 0; i < nr_pages; i++) {
263 rb->data_pages[i] = perf_mmap_alloc_page(cpu);
264 if (!rb->data_pages[i])
265 goto fail_data_pages;
266 }
267
268 rb->nr_pages = nr_pages;
269
270 ring_buffer_init(rb, watermark, flags);
271
272 return rb;
273
274fail_data_pages:
275 for (i--; i >= 0; i--)
276 free_page((unsigned long)rb->data_pages[i]);
277
278 free_page((unsigned long)rb->user_page);
279
280fail_user_page:
281 kfree(rb);
282
283fail:
284 return NULL;
285}
286
287static void perf_mmap_free_page(unsigned long addr)
288{
289 struct page *page = virt_to_page((void *)addr);
290
291 page->mapping = NULL;
292 __free_page(page);
293}
294
295void rb_free(struct ring_buffer *rb)
296{
297 int i;
298
299 perf_mmap_free_page((unsigned long)rb->user_page);
300 for (i = 0; i < rb->nr_pages; i++)
301 perf_mmap_free_page((unsigned long)rb->data_pages[i]);
302 kfree(rb);
303}
304
305#else
306
307struct page *
308perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
309{
310 if (pgoff > (1UL << page_order(rb)))
311 return NULL;
312
313 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
314}
315
316static void perf_mmap_unmark_page(void *addr)
317{
318 struct page *page = vmalloc_to_page(addr);
319
320 page->mapping = NULL;
321}
322
323static void rb_free_work(struct work_struct *work)
324{
325 struct ring_buffer *rb;
326 void *base;
327 int i, nr;
328
329 rb = container_of(work, struct ring_buffer, work);
330 nr = 1 << page_order(rb);
331
332 base = rb->user_page;
333 for (i = 0; i < nr + 1; i++)
334 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
335
336 vfree(base);
337 kfree(rb);
338}
339
340void rb_free(struct ring_buffer *rb)
341{
342 schedule_work(&rb->work);
343}
344
345struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
346{
347 struct ring_buffer *rb;
348 unsigned long size;
349 void *all_buf;
350
351 size = sizeof(struct ring_buffer);
352 size += sizeof(void *);
353
354 rb = kzalloc(size, GFP_KERNEL);
355 if (!rb)
356 goto fail;
357
358 INIT_WORK(&rb->work, rb_free_work);
359
360 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
361 if (!all_buf)
362 goto fail_all_buf;
363
364 rb->user_page = all_buf;
365 rb->data_pages[0] = all_buf + PAGE_SIZE;
366 rb->page_order = ilog2(nr_pages);
367 rb->nr_pages = 1;
368
369 ring_buffer_init(rb, watermark, flags);
370
371 return rb;
372
373fail_all_buf:
374 kfree(rb);
375
376fail:
377 return NULL;
378}
379
380#endif
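
Taken together, the new file gives writers a begin/put/copy/end discipline around the lock-free head update. A minimal sketch of how the rest of core.c drives it; my_record and emit_record are illustrative, not part of this commit:

    #include <linux/perf_event.h>

    struct my_record {                       /* hypothetical record layout */
            struct perf_event_header header;
            u64 value;
    };

    static void emit_record(struct perf_event *event, u64 value)
    {
            struct perf_output_handle handle;
            struct my_record rec = {
                    .header = {
                            .type = PERF_RECORD_SAMPLE,
                            .misc = 0,
                            .size = sizeof(rec),
                    },
                    .value  = value,
            };

            /* reserves rec.header.size bytes; fails with -ENOSPC if a
             * writable buffer would overwrite unread data */
            if (perf_output_begin(&handle, event, rec.header.size))
                    return;

            perf_output_put(&handle, rec);   /* __output_copy() underneath */
            perf_output_end(&handle);        /* publish data_head, maybe wake */
    }
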
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 77981813a1e7..b30fd54eb985 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1255,19 +1255,29 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
1255/* 1255/*
1256 * If we have a symbol_name argument, look it up and add the offset field 1256 * If we have a symbol_name argument, look it up and add the offset field
1257 * to it. This way, we can specify a relative address to a symbol. 1257 * to it. This way, we can specify a relative address to a symbol.
1258 * This returns encoded errors if it fails to look up symbol or invalid
1259 * combination of parameters.
1258 */ 1260 */
1259static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) 1261static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1260{ 1262{
1261 kprobe_opcode_t *addr = p->addr; 1263 kprobe_opcode_t *addr = p->addr;
1264
1265 if ((p->symbol_name && p->addr) ||
1266 (!p->symbol_name && !p->addr))
1267 goto invalid;
1268
1262 if (p->symbol_name) { 1269 if (p->symbol_name) {
1263 if (addr)
1264 return NULL;
1265 kprobe_lookup_name(p->symbol_name, addr); 1270 kprobe_lookup_name(p->symbol_name, addr);
1271 if (!addr)
1272 return ERR_PTR(-ENOENT);
1266 } 1273 }
1267 1274
1268 if (!addr) 1275 addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
1269 return NULL; 1276 if (addr)
1270 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 1277 return addr;
1278
1279invalid:
1280 return ERR_PTR(-EINVAL);
1271} 1281}
1272 1282
1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1283/* Check passed kprobe is valid and return kprobe in kprobe_table. */
@@ -1311,8 +1321,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1311 kprobe_opcode_t *addr; 1321 kprobe_opcode_t *addr;
1312 1322
1313 addr = kprobe_addr(p); 1323 addr = kprobe_addr(p);
1314 if (!addr) 1324 if (IS_ERR(addr))
1315 return -EINVAL; 1325 return PTR_ERR(addr);
1316 p->addr = addr; 1326 p->addr = addr;
1317 1327
1318 ret = check_kprobe_rereg(p); 1328 ret = check_kprobe_rereg(p);
@@ -1335,6 +1345,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1335 */ 1345 */
1336 probed_mod = __module_text_address((unsigned long) p->addr); 1346 probed_mod = __module_text_address((unsigned long) p->addr);
1337 if (probed_mod) { 1347 if (probed_mod) {
1348 /* Return -ENOENT if fail. */
1349 ret = -ENOENT;
1338 /* 1350 /*
1339 * We must hold a refcount of the probed module while updating 1351 * We must hold a refcount of the probed module while updating
1340 * its code to prohibit unexpected unloading. 1352 * its code to prohibit unexpected unloading.
@@ -1351,6 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1351 module_put(probed_mod); 1363 module_put(probed_mod);
1352 goto fail_with_jump_label; 1364 goto fail_with_jump_label;
1353 } 1365 }
1366 /* ret will be updated by following code */
1354 } 1367 }
1355 preempt_enable(); 1368 preempt_enable();
1356 jump_label_unlock(); 1369 jump_label_unlock();
@@ -1399,7 +1412,7 @@ out:
1399fail_with_jump_label: 1412fail_with_jump_label:
1400 preempt_enable(); 1413 preempt_enable();
1401 jump_label_unlock(); 1414 jump_label_unlock();
1402 return -EINVAL; 1415 return ret;
1403} 1416}
1404EXPORT_SYMBOL_GPL(register_kprobe); 1417EXPORT_SYMBOL_GPL(register_kprobe);
1405 1418
@@ -1686,8 +1699,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1686 1699
1687 if (kretprobe_blacklist_size) { 1700 if (kretprobe_blacklist_size) {
1688 addr = kprobe_addr(&rp->kp); 1701 addr = kprobe_addr(&rp->kp);
1689 if (!addr) 1702 if (IS_ERR(addr))
1690 return -EINVAL; 1703 return PTR_ERR(addr);
1691 1704
1692 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 1705 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
1693 if (kretprobe_blacklist[i].addr == addr) 1706 if (kretprobe_blacklist[i].addr == addr)
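
With kprobe_addr() now returning ERR_PTR()-encoded errors, register_kprobe() and register_kretprobe() can distinguish a bad argument combination (-EINVAL) from a symbol that simply is not there (-ENOENT), e.g. because its module has not been loaded yet. A sketch of a caller; the probed symbol and handler names are illustrative:

    #include <linux/kprobes.h>

    static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
    {
            return 0;       /* let the probed instruction run */
    }

    static struct kprobe my_probe = {
            .symbol_name = "do_fork",       /* resolved by kprobe_addr() */
            .pre_handler = my_pre_handler,
    };

    static int __init my_probe_init(void)
    {
            int ret = register_kprobe(&my_probe);

            /* -EINVAL: both (or neither) of symbol_name/addr were set
             * -ENOENT: the symbol does not exist (e.g. module not loaded) */
            if (ret)
                    pr_err("register_kprobe failed: %d\n", ret);
            return ret;
    }
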
diff --git a/kernel/sched.c b/kernel/sched.c
index c518b05fd062..84b9e076812e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2220,7 +2220,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2220 2220
2221 if (task_cpu(p) != new_cpu) { 2221 if (task_cpu(p) != new_cpu) {
2222 p->se.nr_migrations++; 2222 p->se.nr_migrations++;
2223 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2223 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
2224 } 2224 }
2225 2225
2226 __set_task_cpu(p, new_cpu); 2226 __set_task_cpu(p, new_cpu);
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index eb212f8f8bc8..d20c6983aad9 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -26,12 +26,18 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
26EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
27 27
28/* 28/*
29 * Architectures that do not implement save_stack_trace_tsk get this 29 * Architectures that do not implement save_stack_trace_tsk or
30 * weak alias and a once-per-bootup warning (whenever this facility 30 * save_stack_trace_regs get this weak alias and a once-per-bootup warning
31 * is utilized - for example by procfs): 31 * (whenever this facility is utilized - for example by procfs):
32 */ 32 */
33__weak void 33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{ 35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); 36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37} 37}
38
39__weak void
40save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
41{
42 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
43}
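
The new __weak save_stack_trace_regs() stub lets generic code call it unconditionally; architectures that have not implemented it just hit the once-per-boot warning. A small sketch of such a caller (dump_stack_from_regs is a made-up helper):

    #include <linux/kernel.h>
    #include <linux/stacktrace.h>

    static void dump_stack_from_regs(struct pt_regs *regs)
    {
            unsigned long entries[16];
            struct stack_trace trace = {
                    .max_entries = ARRAY_SIZE(entries),
                    .entries     = entries,
            };

            /* resolves to the arch implementation, or to the weak stub
             * (and its WARN_ONCE) on architectures lacking one */
            save_stack_trace_regs(regs, &trace);
            print_stack_trace(&trace, 0);
    }
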
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 908038f57440..c3e4575e7829 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,7 +32,6 @@
32 32
33#include <trace/events/sched.h> 33#include <trace/events/sched.h>
34 34
35#include <asm/ftrace.h>
36#include <asm/setup.h> 35#include <asm/setup.h>
37 36
38#include "trace_output.h" 37#include "trace_output.h"
@@ -82,14 +81,14 @@ static int ftrace_disabled __read_mostly;
82 81
83static DEFINE_MUTEX(ftrace_lock); 82static DEFINE_MUTEX(ftrace_lock);
84 83
85static struct ftrace_ops ftrace_list_end __read_mostly = 84static struct ftrace_ops ftrace_list_end __read_mostly = {
86{
87 .func = ftrace_stub, 85 .func = ftrace_stub,
88}; 86};
89 87
90static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 88static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
91static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 89static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
92ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 90ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
91static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
93ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
94ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 93ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
95static struct ftrace_ops global_ops; 94static struct ftrace_ops global_ops;
@@ -148,9 +147,11 @@ void clear_ftrace_function(void)
148{ 147{
149 ftrace_trace_function = ftrace_stub; 148 ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function = ftrace_stub; 149 __ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function_delay = ftrace_stub;
151 ftrace_pid_function = ftrace_stub; 151 ftrace_pid_function = ftrace_stub;
152} 152}
153 153
154#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
154#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 155#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
155/* 156/*
156 * For those archs that do not test ftrace_trace_stop in their 157 * For those archs that do not test ftrace_trace_stop in their
@@ -210,7 +211,12 @@ static void update_ftrace_function(void)
210#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 211#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
211 ftrace_trace_function = func; 212 ftrace_trace_function = func;
212#else 213#else
214#ifdef CONFIG_DYNAMIC_FTRACE
215 /* do not update till all functions have been modified */
216 __ftrace_trace_function_delay = func;
217#else
213 __ftrace_trace_function = func; 218 __ftrace_trace_function = func;
219#endif
214 ftrace_trace_function = ftrace_test_stop_func; 220 ftrace_trace_function = ftrace_test_stop_func;
215#endif 221#endif
216} 222}
@@ -785,8 +791,7 @@ static void unregister_ftrace_profiler(void)
785 unregister_ftrace_graph(); 791 unregister_ftrace_graph();
786} 792}
787#else 793#else
788static struct ftrace_ops ftrace_profile_ops __read_mostly = 794static struct ftrace_ops ftrace_profile_ops __read_mostly = {
789{
790 .func = function_profile_call, 795 .func = function_profile_call,
791}; 796};
792 797
@@ -806,19 +811,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
806 size_t cnt, loff_t *ppos) 811 size_t cnt, loff_t *ppos)
807{ 812{
808 unsigned long val; 813 unsigned long val;
809 char buf[64]; /* big enough to hold a number */
810 int ret; 814 int ret;
811 815
812 if (cnt >= sizeof(buf)) 816 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
813 return -EINVAL; 817 if (ret)
814
815 if (copy_from_user(&buf, ubuf, cnt))
816 return -EFAULT;
817
818 buf[cnt] = 0;
819
820 ret = strict_strtoul(buf, 10, &val);
821 if (ret < 0)
822 return ret; 818 return ret;
823 819
824 val = !!val; 820 val = !!val;
@@ -1182,8 +1178,14 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1182 return NULL; 1178 return NULL;
1183} 1179}
1184 1180
1181static void
1182ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash);
1183static void
1184ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash);
1185
1185static int 1186static int
1186ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) 1187ftrace_hash_move(struct ftrace_ops *ops, int enable,
1188 struct ftrace_hash **dst, struct ftrace_hash *src)
1187{ 1189{
1188 struct ftrace_func_entry *entry; 1190 struct ftrace_func_entry *entry;
1189 struct hlist_node *tp, *tn; 1191 struct hlist_node *tp, *tn;
@@ -1193,9 +1195,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1193 unsigned long key; 1195 unsigned long key;
1194 int size = src->count; 1196 int size = src->count;
1195 int bits = 0; 1197 int bits = 0;
1198 int ret;
1196 int i; 1199 int i;
1197 1200
1198 /* 1201 /*
1202 * Remove the current set, update the hash and add
1203 * them back.
1204 */
1205 ftrace_hash_rec_disable(ops, enable);
1206
1207 /*
1199 * If the new source is empty, just free dst and assign it 1208 * If the new source is empty, just free dst and assign it
1200 * the empty_hash. 1209 * the empty_hash.
1201 */ 1210 */
@@ -1215,9 +1224,10 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1215 if (bits > FTRACE_HASH_MAX_BITS) 1224 if (bits > FTRACE_HASH_MAX_BITS)
1216 bits = FTRACE_HASH_MAX_BITS; 1225 bits = FTRACE_HASH_MAX_BITS;
1217 1226
1227 ret = -ENOMEM;
1218 new_hash = alloc_ftrace_hash(bits); 1228 new_hash = alloc_ftrace_hash(bits);
1219 if (!new_hash) 1229 if (!new_hash)
1220 return -ENOMEM; 1230 goto out;
1221 1231
1222 size = 1 << src->size_bits; 1232 size = 1 << src->size_bits;
1223 for (i = 0; i < size; i++) { 1233 for (i = 0; i < size; i++) {
@@ -1236,7 +1246,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1236 rcu_assign_pointer(*dst, new_hash); 1246 rcu_assign_pointer(*dst, new_hash);
1237 free_ftrace_hash_rcu(old_hash); 1247 free_ftrace_hash_rcu(old_hash);
1238 1248
1239 return 0; 1249 ret = 0;
1250 out:
1251 /*
1252 * Enable regardless of ret:
1253 * On success, we enable the new hash.
1254 * On failure, we re-enable the original hash.
1255 */
1256 ftrace_hash_rec_enable(ops, enable);
1257
1258 return ret;
1240} 1259}
1241 1260
1242/* 1261/*
@@ -1596,6 +1615,12 @@ static int __ftrace_modify_code(void *data)
1596{ 1615{
1597 int *command = data; 1616 int *command = data;
1598 1617
1618 /*
1619 * Do not call function tracer while we update the code.
1620 * We are in stop machine, no worrying about races.
1621 */
1622 function_trace_stop++;
1623
1599 if (*command & FTRACE_ENABLE_CALLS) 1624 if (*command & FTRACE_ENABLE_CALLS)
1600 ftrace_replace_code(1); 1625 ftrace_replace_code(1);
1601 else if (*command & FTRACE_DISABLE_CALLS) 1626 else if (*command & FTRACE_DISABLE_CALLS)
@@ -1609,6 +1634,18 @@ static int __ftrace_modify_code(void *data)
1609 else if (*command & FTRACE_STOP_FUNC_RET) 1634 else if (*command & FTRACE_STOP_FUNC_RET)
1610 ftrace_disable_ftrace_graph_caller(); 1635 ftrace_disable_ftrace_graph_caller();
1611 1636
1637#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1638 /*
1639 * For archs that call ftrace_test_stop_func(), we must
1640 * wait till after we update all the function callers
1641 * before we update the callback. This keeps different
1642 * ops that record different functions from corrupting
1643 * each other.
1644 */
1645 __ftrace_trace_function = __ftrace_trace_function_delay;
1646#endif
1647 function_trace_stop--;
1648
1612 return 0; 1649 return 0;
1613} 1650}
1614 1651
@@ -1744,10 +1781,36 @@ static cycle_t ftrace_update_time;
1744static unsigned long ftrace_update_cnt; 1781static unsigned long ftrace_update_cnt;
1745unsigned long ftrace_update_tot_cnt; 1782unsigned long ftrace_update_tot_cnt;
1746 1783
1784static int ops_traces_mod(struct ftrace_ops *ops)
1785{
1786 struct ftrace_hash *hash;
1787
1788 hash = ops->filter_hash;
1789 return !!(!hash || !hash->count);
1790}
1791
1747static int ftrace_update_code(struct module *mod) 1792static int ftrace_update_code(struct module *mod)
1748{ 1793{
1749 struct dyn_ftrace *p; 1794 struct dyn_ftrace *p;
1750 cycle_t start, stop; 1795 cycle_t start, stop;
1796 unsigned long ref = 0;
1797
1798 /*
1799 * When adding a module, we need to check if tracers are
1800 * currently enabled and if they are set to trace all functions.
1801 * If they are, we need to enable the module functions as well
1802 * as update the reference counts for those function records.
1803 */
1804 if (mod) {
1805 struct ftrace_ops *ops;
1806
1807 for (ops = ftrace_ops_list;
1808 ops != &ftrace_list_end; ops = ops->next) {
1809 if (ops->flags & FTRACE_OPS_FL_ENABLED &&
1810 ops_traces_mod(ops))
1811 ref++;
1812 }
1813 }
1751 1814
1752 start = ftrace_now(raw_smp_processor_id()); 1815 start = ftrace_now(raw_smp_processor_id());
1753 ftrace_update_cnt = 0; 1816 ftrace_update_cnt = 0;
@@ -1760,7 +1823,7 @@ static int ftrace_update_code(struct module *mod)
1760 1823
1761 p = ftrace_new_addrs; 1824 p = ftrace_new_addrs;
1762 ftrace_new_addrs = p->newlist; 1825 ftrace_new_addrs = p->newlist;
1763 p->flags = 0L; 1826 p->flags = ref;
1764 1827
1765 /* 1828 /*
1766 * Do the initial record conversion from mcount jump 1829 * Do the initial record conversion from mcount jump
@@ -1783,7 +1846,7 @@ static int ftrace_update_code(struct module *mod)
1783 * conversion puts the module to the correct state, thus 1846 * conversion puts the module to the correct state, thus
1784 * passing the ftrace_make_call check. 1847 * passing the ftrace_make_call check.
1785 */ 1848 */
1786 if (ftrace_start_up) { 1849 if (ftrace_start_up && ref) {
1787 int failed = __ftrace_replace_code(p, 1); 1850 int failed = __ftrace_replace_code(p, 1);
1788 if (failed) { 1851 if (failed) {
1789 ftrace_bug(failed, p->ip); 1852 ftrace_bug(failed, p->ip);
@@ -2407,10 +2470,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
2407 */ 2470 */
2408 2471
2409static int 2472static int
2410ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2473ftrace_mod_callback(struct ftrace_hash *hash,
2474 char *func, char *cmd, char *param, int enable)
2411{ 2475{
2412 struct ftrace_ops *ops = &global_ops;
2413 struct ftrace_hash *hash;
2414 char *mod; 2476 char *mod;
2415 int ret = -EINVAL; 2477 int ret = -EINVAL;
2416 2478
@@ -2430,11 +2492,6 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
2430 if (!strlen(mod)) 2492 if (!strlen(mod))
2431 return ret; 2493 return ret;
2432 2494
2433 if (enable)
2434 hash = ops->filter_hash;
2435 else
2436 hash = ops->notrace_hash;
2437
2438 ret = ftrace_match_module_records(hash, func, mod); 2495 ret = ftrace_match_module_records(hash, func, mod);
2439 if (!ret) 2496 if (!ret)
2440 ret = -EINVAL; 2497 ret = -EINVAL;
@@ -2760,7 +2817,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash,
2760 mutex_lock(&ftrace_cmd_mutex); 2817 mutex_lock(&ftrace_cmd_mutex);
2761 list_for_each_entry(p, &ftrace_commands, list) { 2818 list_for_each_entry(p, &ftrace_commands, list) {
2762 if (strcmp(p->name, command) == 0) { 2819 if (strcmp(p->name, command) == 0) {
2763 ret = p->func(func, command, next, enable); 2820 ret = p->func(hash, func, command, next, enable);
2764 goto out_unlock; 2821 goto out_unlock;
2765 } 2822 }
2766 } 2823 }
@@ -2857,7 +2914,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2857 ftrace_match_records(hash, buf, len); 2914 ftrace_match_records(hash, buf, len);
2858 2915
2859 mutex_lock(&ftrace_lock); 2916 mutex_lock(&ftrace_lock);
2860 ret = ftrace_hash_move(orig_hash, hash); 2917 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
2918 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
2919 && ftrace_enabled)
2920 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2921
2861 mutex_unlock(&ftrace_lock); 2922 mutex_unlock(&ftrace_lock);
2862 2923
2863 mutex_unlock(&ftrace_regex_lock); 2924 mutex_unlock(&ftrace_regex_lock);
@@ -3040,18 +3101,12 @@ ftrace_regex_release(struct inode *inode, struct file *file)
3040 orig_hash = &iter->ops->notrace_hash; 3101 orig_hash = &iter->ops->notrace_hash;
3041 3102
3042 mutex_lock(&ftrace_lock); 3103 mutex_lock(&ftrace_lock);
3043 /* 3104 ret = ftrace_hash_move(iter->ops, filter_hash,
3044 * Remove the current set, update the hash and add 3105 orig_hash, iter->hash);
3045 * them back. 3106 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3046 */ 3107 && ftrace_enabled)
3047 ftrace_hash_rec_disable(iter->ops, filter_hash); 3108 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3048 ret = ftrace_hash_move(orig_hash, iter->hash); 3109
3049 if (!ret) {
3050 ftrace_hash_rec_enable(iter->ops, filter_hash);
3051 if (iter->ops->flags & FTRACE_OPS_FL_ENABLED
3052 && ftrace_enabled)
3053 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3054 }
3055 mutex_unlock(&ftrace_lock); 3110 mutex_unlock(&ftrace_lock);
3056 } 3111 }
3057 free_ftrace_hash(iter->hash); 3112 free_ftrace_hash(iter->hash);
@@ -3330,7 +3385,7 @@ static int ftrace_process_locs(struct module *mod,
3330{ 3385{
3331 unsigned long *p; 3386 unsigned long *p;
3332 unsigned long addr; 3387 unsigned long addr;
3333 unsigned long flags; 3388 unsigned long flags = 0; /* Shut up gcc */
3334 3389
3335 mutex_lock(&ftrace_lock); 3390 mutex_lock(&ftrace_lock);
3336 p = start; 3391 p = start;
@@ -3348,12 +3403,18 @@ static int ftrace_process_locs(struct module *mod,
3348 } 3403 }
3349 3404
3350 /* 3405 /*
3351 * Disable interrupts to prevent interrupts from executing 3406 * We only need to disable interrupts on start up
3352 * code that is being modified. 3407 * because we are modifying code that an interrupt
3408 * may execute, and the modification is not atomic.
3409 * But for modules, nothing runs the code we modify
3410 * until we are finished with it, and there's no
3411 * reason to cause large interrupt latencies while we do it.
3353 */ 3412 */
3354 local_irq_save(flags); 3413 if (!mod)
3414 local_irq_save(flags);
3355 ftrace_update_code(mod); 3415 ftrace_update_code(mod);
3356 local_irq_restore(flags); 3416 if (!mod)
3417 local_irq_restore(flags);
3357 mutex_unlock(&ftrace_lock); 3418 mutex_unlock(&ftrace_lock);
3358 3419
3359 return 0; 3420 return 0;
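
Besides the hash-move rework, the ftrace.c changes switch the debugfs write handlers to kstrtoul_from_user(), which folds the length check, copy_from_user() and strict_strtoul() into one call; rb_simple_write() below gets the same simplification. A hedged sketch of the resulting pattern (example_bool_write and some_flag are illustrative):

    #include <linux/fs.h>
    #include <linux/kernel.h>

    static unsigned long some_flag;

    static ssize_t example_bool_write(struct file *filp,
                                      const char __user *ubuf,
                                      size_t cnt, loff_t *ppos)
    {
            unsigned long val;
            int ret;

            ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
            if (ret)
                    return ret;         /* -EINVAL, -ERANGE or -EFAULT */

            some_flag = !!val;
            *ppos += cnt;
            return cnt;
    }
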
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b0c7aa407943..731201bf4acc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -997,15 +997,21 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
997 unsigned nr_pages) 997 unsigned nr_pages)
998{ 998{
999 struct buffer_page *bpage, *tmp; 999 struct buffer_page *bpage, *tmp;
1000 unsigned long addr;
1001 LIST_HEAD(pages); 1000 LIST_HEAD(pages);
1002 unsigned i; 1001 unsigned i;
1003 1002
1004 WARN_ON(!nr_pages); 1003 WARN_ON(!nr_pages);
1005 1004
1006 for (i = 0; i < nr_pages; i++) { 1005 for (i = 0; i < nr_pages; i++) {
1006 struct page *page;
1007 /*
1008 * __GFP_NORETRY flag makes sure that the allocation fails
1009 * gracefully without invoking oom-killer and the system is
1010 * not destabilized.
1011 */
1007 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1012 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1008 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 1013 GFP_KERNEL | __GFP_NORETRY,
1014 cpu_to_node(cpu_buffer->cpu));
1009 if (!bpage) 1015 if (!bpage)
1010 goto free_pages; 1016 goto free_pages;
1011 1017
@@ -1013,10 +1019,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1013 1019
1014 list_add(&bpage->list, &pages); 1020 list_add(&bpage->list, &pages);
1015 1021
1016 addr = __get_free_page(GFP_KERNEL); 1022 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
1017 if (!addr) 1023 GFP_KERNEL | __GFP_NORETRY, 0);
1024 if (!page)
1018 goto free_pages; 1025 goto free_pages;
1019 bpage->page = (void *)addr; 1026 bpage->page = page_address(page);
1020 rb_init_page(bpage->page); 1027 rb_init_page(bpage->page);
1021 } 1028 }
1022 1029
@@ -1045,7 +1052,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1045{ 1052{
1046 struct ring_buffer_per_cpu *cpu_buffer; 1053 struct ring_buffer_per_cpu *cpu_buffer;
1047 struct buffer_page *bpage; 1054 struct buffer_page *bpage;
1048 unsigned long addr; 1055 struct page *page;
1049 int ret; 1056 int ret;
1050 1057
1051 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), 1058 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
@@ -1067,10 +1074,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1067 rb_check_bpage(cpu_buffer, bpage); 1074 rb_check_bpage(cpu_buffer, bpage);
1068 1075
1069 cpu_buffer->reader_page = bpage; 1076 cpu_buffer->reader_page = bpage;
1070 addr = __get_free_page(GFP_KERNEL); 1077 page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
1071 if (!addr) 1078 if (!page)
1072 goto fail_free_reader; 1079 goto fail_free_reader;
1073 bpage->page = (void *)addr; 1080 bpage->page = page_address(page);
1074 rb_init_page(bpage->page); 1081 rb_init_page(bpage->page);
1075 1082
1076 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1083 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -1314,7 +1321,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1314 unsigned nr_pages, rm_pages, new_pages; 1321 unsigned nr_pages, rm_pages, new_pages;
1315 struct buffer_page *bpage, *tmp; 1322 struct buffer_page *bpage, *tmp;
1316 unsigned long buffer_size; 1323 unsigned long buffer_size;
1317 unsigned long addr;
1318 LIST_HEAD(pages); 1324 LIST_HEAD(pages);
1319 int i, cpu; 1325 int i, cpu;
1320 1326
@@ -1375,16 +1381,24 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1375 1381
1376 for_each_buffer_cpu(buffer, cpu) { 1382 for_each_buffer_cpu(buffer, cpu) {
1377 for (i = 0; i < new_pages; i++) { 1383 for (i = 0; i < new_pages; i++) {
1384 struct page *page;
1385 /*
 1386 * The __GFP_NORETRY flag makes sure that the allocation
 1387 * fails gracefully without invoking the OOM killer and
 1388 * the system is not destabilized.
1389 */
1378 bpage = kzalloc_node(ALIGN(sizeof(*bpage), 1390 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
1379 cache_line_size()), 1391 cache_line_size()),
1380 GFP_KERNEL, cpu_to_node(cpu)); 1392 GFP_KERNEL | __GFP_NORETRY,
1393 cpu_to_node(cpu));
1381 if (!bpage) 1394 if (!bpage)
1382 goto free_pages; 1395 goto free_pages;
1383 list_add(&bpage->list, &pages); 1396 list_add(&bpage->list, &pages);
1384 addr = __get_free_page(GFP_KERNEL); 1397 page = alloc_pages_node(cpu_to_node(cpu),
1385 if (!addr) 1398 GFP_KERNEL | __GFP_NORETRY, 0);
1399 if (!page)
1386 goto free_pages; 1400 goto free_pages;
1387 bpage->page = (void *)addr; 1401 bpage->page = page_address(page);
1388 rb_init_page(bpage->page); 1402 rb_init_page(bpage->page);
1389 } 1403 }
1390 } 1404 }
@@ -3730,16 +3744,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3730 * Returns: 3744 * Returns:
3731 * The page allocated, or NULL on error. 3745 * The page allocated, or NULL on error.
3732 */ 3746 */
3733void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) 3747void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
3734{ 3748{
3735 struct buffer_data_page *bpage; 3749 struct buffer_data_page *bpage;
3736 unsigned long addr; 3750 struct page *page;
3737 3751
3738 addr = __get_free_page(GFP_KERNEL); 3752 page = alloc_pages_node(cpu_to_node(cpu),
3739 if (!addr) 3753 GFP_KERNEL | __GFP_NORETRY, 0);
3754 if (!page)
3740 return NULL; 3755 return NULL;
3741 3756
3742 bpage = (void *)addr; 3757 bpage = page_address(page);
3743 3758
3744 rb_init_page(bpage); 3759 rb_init_page(bpage);
3745 3760
@@ -3978,20 +3993,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
3978 size_t cnt, loff_t *ppos) 3993 size_t cnt, loff_t *ppos)
3979{ 3994{
3980 unsigned long *p = filp->private_data; 3995 unsigned long *p = filp->private_data;
3981 char buf[64];
3982 unsigned long val; 3996 unsigned long val;
3983 int ret; 3997 int ret;
3984 3998
3985 if (cnt >= sizeof(buf)) 3999 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3986 return -EINVAL; 4000 if (ret)
3987
3988 if (copy_from_user(&buf, ubuf, cnt))
3989 return -EFAULT;
3990
3991 buf[cnt] = 0;
3992
3993 ret = strict_strtoul(buf, 10, &val);
3994 if (ret < 0)
3995 return ret; 4001 return ret;
3996 4002
3997 if (val) 4003 if (val)
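Several hunks in this merge replace the open-coded copy_from_user()/strict_strtoul() sequence with a single kstrtoul_from_user() call. A hedged sketch of the resulting write-handler shape (the handler name and the value being toggled are illustrative):

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>

static ssize_t example_bool_write(struct file *filp, const char __user *ubuf,
				  size_t cnt, loff_t *ppos)
{
	unsigned long val;
	int ret;

	/* copy from user space and parse base 10 in one step */
	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
	if (ret)
		return ret;

	if (val != 0 && val != 1)
		return -EINVAL;

	/* ... apply val to whatever this file controls ... */

	*ppos += cnt;
	return cnt;
}

Besides being shorter, this drops the on-stack 64-byte buffer and the cnt >= sizeof(buf) limit that each handler used to carry.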
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 302f8a614635..a5457d577b98 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -106,7 +106,7 @@ static enum event_status read_page(int cpu)
106 int inc; 106 int inc;
107 int i; 107 int i;
108 108
109 bpage = ring_buffer_alloc_read_page(buffer); 109 bpage = ring_buffer_alloc_read_page(buffer, cpu);
110 if (!bpage) 110 if (!bpage)
111 return EVENT_DROPPED; 111 return EVENT_DROPPED;
112 112
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ee9c921d7f21..e5df02c69b1d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
343static int trace_stop_count; 343static int trace_stop_count;
344static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
345 345
346static void wakeup_work_handler(struct work_struct *work)
347{
348 wake_up(&trace_wait);
349}
350
351static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
352
346/** 353/**
347 * trace_wake_up - wake up tasks waiting for trace input 354 * trace_wake_up - wake up tasks waiting for trace input
348 * 355 *
349 * Simply wakes up any task that is blocked on the trace_wait 356 * Schedules a delayed work to wake up any task that is blocked on the
350 * queue. These is used with trace_poll for tasks polling the trace. 357 * trace_wait queue. This is used with trace_poll for tasks polling the
358 * trace.
351 */ 359 */
352void trace_wake_up(void) 360void trace_wake_up(void)
353{ 361{
354 int cpu; 362 const unsigned long delay = msecs_to_jiffies(2);
355 363
356 if (trace_flags & TRACE_ITER_BLOCK) 364 if (trace_flags & TRACE_ITER_BLOCK)
357 return; 365 return;
358 /* 366 schedule_delayed_work(&wakeup_work, delay);
359 * The runqueue_is_locked() can fail, but this is the best we
360 * have for now:
361 */
362 cpu = get_cpu();
363 if (!runqueue_is_locked(cpu))
364 wake_up(&trace_wait);
365 put_cpu();
366} 367}
367 368
368static int __init set_buf_size(char *str) 369static int __init set_buf_size(char *str)
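trace_wake_up() no longer calls wake_up() directly (and no longer needs the runqueue_is_locked() workaround); it just queues a short delayed work. A standalone sketch of the same DECLARE_DELAYED_WORK pattern, with illustrative names rather than the tracer's own:

#include <linux/jiffies.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wait);

static void example_wakeup_fn(struct work_struct *work)
{
	wake_up(&example_wait);		/* safe: runs in process context */
}

static DECLARE_DELAYED_WORK(example_wakeup_work, example_wakeup_fn);

static void example_poke_waiters(void)
{
	/* defer the wake-up ~2ms so the caller never sleeps or touches rq locks */
	schedule_delayed_work(&example_wakeup_work, msecs_to_jiffies(2));
}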
@@ -424,6 +425,7 @@ static const char *trace_options[] = {
424 "graph-time", 425 "graph-time",
425 "record-cmd", 426 "record-cmd",
426 "overwrite", 427 "overwrite",
428 "disable_on_free",
427 NULL 429 NULL
428}; 430};
429 431
@@ -1191,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
1191} 1193}
1192EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 1194EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1193 1195
1196void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1197 struct ring_buffer_event *event,
1198 unsigned long flags, int pc,
1199 struct pt_regs *regs)
1200{
1201 ring_buffer_unlock_commit(buffer, event);
1202
1203 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1204 ftrace_trace_userstack(buffer, flags, pc);
1205}
1206EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs);
1207
1194void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1208void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1195 struct ring_buffer_event *event) 1209 struct ring_buffer_event *event)
1196{ 1210{
@@ -1234,30 +1248,103 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1234} 1248}
1235 1249
1236#ifdef CONFIG_STACKTRACE 1250#ifdef CONFIG_STACKTRACE
1251
1252#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
1253struct ftrace_stack {
1254 unsigned long calls[FTRACE_STACK_MAX_ENTRIES];
1255};
1256
1257static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
1258static DEFINE_PER_CPU(int, ftrace_stack_reserve);
1259
1237static void __ftrace_trace_stack(struct ring_buffer *buffer, 1260static void __ftrace_trace_stack(struct ring_buffer *buffer,
1238 unsigned long flags, 1261 unsigned long flags,
1239 int skip, int pc) 1262 int skip, int pc, struct pt_regs *regs)
1240{ 1263{
1241 struct ftrace_event_call *call = &event_kernel_stack; 1264 struct ftrace_event_call *call = &event_kernel_stack;
1242 struct ring_buffer_event *event; 1265 struct ring_buffer_event *event;
1243 struct stack_entry *entry; 1266 struct stack_entry *entry;
1244 struct stack_trace trace; 1267 struct stack_trace trace;
1268 int use_stack;
1269 int size = FTRACE_STACK_ENTRIES;
1270
1271 trace.nr_entries = 0;
1272 trace.skip = skip;
1273
1274 /*
1275 * Since events can happen in NMIs there's no safe way to
1276 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
1277 * or NMI comes in, it will just have to use the default
1278 * FTRACE_STACK_SIZE.
1279 */
1280 preempt_disable_notrace();
1281
1282 use_stack = ++__get_cpu_var(ftrace_stack_reserve);
1283 /*
1284 * We don't need any atomic variables, just a barrier.
1285 * If an interrupt comes in, we don't care, because it would
1286 * have exited and put the counter back to what we want.
1287 * We just need a barrier to keep gcc from moving things
1288 * around.
1289 */
1290 barrier();
1291 if (use_stack == 1) {
1292 trace.entries = &__get_cpu_var(ftrace_stack).calls[0];
1293 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
1294
1295 if (regs)
1296 save_stack_trace_regs(regs, &trace);
1297 else
1298 save_stack_trace(&trace);
1299
1300 if (trace.nr_entries > size)
1301 size = trace.nr_entries;
1302 } else
1303 /* From now on, use_stack is a boolean */
1304 use_stack = 0;
1305
1306 size *= sizeof(unsigned long);
1245 1307
1246 event = trace_buffer_lock_reserve(buffer, TRACE_STACK, 1308 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1247 sizeof(*entry), flags, pc); 1309 sizeof(*entry) + size, flags, pc);
1248 if (!event) 1310 if (!event)
1249 return; 1311 goto out;
1250 entry = ring_buffer_event_data(event); 1312 entry = ring_buffer_event_data(event);
1251 memset(&entry->caller, 0, sizeof(entry->caller));
1252 1313
1253 trace.nr_entries = 0; 1314 memset(&entry->caller, 0, size);
1254 trace.max_entries = FTRACE_STACK_ENTRIES; 1315
1255 trace.skip = skip; 1316 if (use_stack)
1256 trace.entries = entry->caller; 1317 memcpy(&entry->caller, trace.entries,
1318 trace.nr_entries * sizeof(unsigned long));
1319 else {
1320 trace.max_entries = FTRACE_STACK_ENTRIES;
1321 trace.entries = entry->caller;
1322 if (regs)
1323 save_stack_trace_regs(regs, &trace);
1324 else
1325 save_stack_trace(&trace);
1326 }
1327
1328 entry->size = trace.nr_entries;
1257 1329
1258 save_stack_trace(&trace);
1259 if (!filter_check_discard(call, entry, buffer, event)) 1330 if (!filter_check_discard(call, entry, buffer, event))
1260 ring_buffer_unlock_commit(buffer, event); 1331 ring_buffer_unlock_commit(buffer, event);
1332
1333 out:
1334 /* Again, don't let gcc optimize things here */
1335 barrier();
1336 __get_cpu_var(ftrace_stack_reserve)--;
1337 preempt_enable_notrace();
1338
1339}
1340
1341void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
1342 int skip, int pc, struct pt_regs *regs)
1343{
1344 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1345 return;
1346
1347 __ftrace_trace_stack(buffer, flags, skip, pc, regs);
1261} 1348}
1262 1349
1263void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 1350void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
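The new __ftrace_trace_stack() claims the per-CPU ftrace_stack with a plain counter plus compiler barriers instead of atomics: only the local context bumps the counter, and any interrupt or NMI that raises it drops it again before returning. A simplified sketch of that reservation idiom (buffer type and sizes are made up for illustration):

#include <linux/percpu.h>
#include <linux/preempt.h>

struct example_scratch {
	unsigned long data[64];
};

static DEFINE_PER_CPU(struct example_scratch, example_scratch);
static DEFINE_PER_CPU(int, example_scratch_reserve);

static void example_use_scratch(void)
{
	int first;

	preempt_disable_notrace();
	first = ++__get_cpu_var(example_scratch_reserve);
	barrier();		/* only stops gcc from reordering around the claim */

	if (first == 1) {
		/* outermost user: the big per-CPU buffer is ours */
		__get_cpu_var(example_scratch).data[0] = 0;
	} else {
		/* nested user (irq/NMI): fall back to a small local buffer */
	}

	barrier();
	__get_cpu_var(example_scratch_reserve)--;
	preempt_enable_notrace();
}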
@@ -1266,13 +1353,13 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1266 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1353 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1267 return; 1354 return;
1268 1355
1269 __ftrace_trace_stack(buffer, flags, skip, pc); 1356 __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
1270} 1357}
1271 1358
1272void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 1359void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1273 int pc) 1360 int pc)
1274{ 1361{
1275 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1362 __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL);
1276} 1363}
1277 1364
1278/** 1365/**
@@ -1288,7 +1375,7 @@ void trace_dump_stack(void)
1288 local_save_flags(flags); 1375 local_save_flags(flags);
1289 1376
1290 /* skipping 3 traces, seems to get us at the caller of this function */ 1377 /* skipping 3 traces, seems to get us at the caller of this function */
1291 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1378 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL);
1292} 1379}
1293 1380
1294static DEFINE_PER_CPU(int, user_stack_count); 1381static DEFINE_PER_CPU(int, user_stack_count);
@@ -1536,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1536 1623
1537 ftrace_enable_cpu(); 1624 ftrace_enable_cpu();
1538 1625
1539 return event ? ring_buffer_event_data(event) : NULL; 1626 if (event) {
1627 iter->ent_size = ring_buffer_event_length(event);
1628 return ring_buffer_event_data(event);
1629 }
1630 iter->ent_size = 0;
1631 return NULL;
1540} 1632}
1541 1633
1542static struct trace_entry * 1634static struct trace_entry *
@@ -2051,6 +2143,9 @@ void trace_default_header(struct seq_file *m)
2051{ 2143{
2052 struct trace_iterator *iter = m->private; 2144 struct trace_iterator *iter = m->private;
2053 2145
2146 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
2147 return;
2148
2054 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2149 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2055 /* print nothing if the buffers are empty */ 2150 /* print nothing if the buffers are empty */
2056 if (trace_empty(iter)) 2151 if (trace_empty(iter))
@@ -2701,20 +2796,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2701 size_t cnt, loff_t *ppos) 2796 size_t cnt, loff_t *ppos)
2702{ 2797{
2703 struct trace_array *tr = filp->private_data; 2798 struct trace_array *tr = filp->private_data;
2704 char buf[64];
2705 unsigned long val; 2799 unsigned long val;
2706 int ret; 2800 int ret;
2707 2801
2708 if (cnt >= sizeof(buf)) 2802 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2709 return -EINVAL; 2803 if (ret)
2710
2711 if (copy_from_user(&buf, ubuf, cnt))
2712 return -EFAULT;
2713
2714 buf[cnt] = 0;
2715
2716 ret = strict_strtoul(buf, 10, &val);
2717 if (ret < 0)
2718 return ret; 2804 return ret;
2719 2805
2720 val = !!val; 2806 val = !!val;
@@ -2767,7 +2853,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
2767 return t->init(tr); 2853 return t->init(tr);
2768} 2854}
2769 2855
2770static int tracing_resize_ring_buffer(unsigned long size) 2856static int __tracing_resize_ring_buffer(unsigned long size)
2771{ 2857{
2772 int ret; 2858 int ret;
2773 2859
@@ -2819,6 +2905,41 @@ static int tracing_resize_ring_buffer(unsigned long size)
2819 return ret; 2905 return ret;
2820} 2906}
2821 2907
2908static ssize_t tracing_resize_ring_buffer(unsigned long size)
2909{
2910 int cpu, ret = size;
2911
2912 mutex_lock(&trace_types_lock);
2913
2914 tracing_stop();
2915
2916 /* disable all cpu buffers */
2917 for_each_tracing_cpu(cpu) {
2918 if (global_trace.data[cpu])
2919 atomic_inc(&global_trace.data[cpu]->disabled);
2920 if (max_tr.data[cpu])
2921 atomic_inc(&max_tr.data[cpu]->disabled);
2922 }
2923
2924 if (size != global_trace.entries)
2925 ret = __tracing_resize_ring_buffer(size);
2926
2927 if (ret < 0)
2928 ret = -ENOMEM;
2929
2930 for_each_tracing_cpu(cpu) {
2931 if (global_trace.data[cpu])
2932 atomic_dec(&global_trace.data[cpu]->disabled);
2933 if (max_tr.data[cpu])
2934 atomic_dec(&max_tr.data[cpu]->disabled);
2935 }
2936
2937 tracing_start();
2938 mutex_unlock(&trace_types_lock);
2939
2940 return ret;
2941}
2942
2822 2943
2823/** 2944/**
2824 * tracing_update_buffers - used by tracing facility to expand ring buffers 2945 * tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -2836,7 +2957,7 @@ int tracing_update_buffers(void)
2836 2957
2837 mutex_lock(&trace_types_lock); 2958 mutex_lock(&trace_types_lock);
2838 if (!ring_buffer_expanded) 2959 if (!ring_buffer_expanded)
2839 ret = tracing_resize_ring_buffer(trace_buf_size); 2960 ret = __tracing_resize_ring_buffer(trace_buf_size);
2840 mutex_unlock(&trace_types_lock); 2961 mutex_unlock(&trace_types_lock);
2841 2962
2842 return ret; 2963 return ret;
@@ -2860,7 +2981,7 @@ static int tracing_set_tracer(const char *buf)
2860 mutex_lock(&trace_types_lock); 2981 mutex_lock(&trace_types_lock);
2861 2982
2862 if (!ring_buffer_expanded) { 2983 if (!ring_buffer_expanded) {
2863 ret = tracing_resize_ring_buffer(trace_buf_size); 2984 ret = __tracing_resize_ring_buffer(trace_buf_size);
2864 if (ret < 0) 2985 if (ret < 0)
2865 goto out; 2986 goto out;
2866 ret = 0; 2987 ret = 0;
@@ -2966,20 +3087,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
2966 size_t cnt, loff_t *ppos) 3087 size_t cnt, loff_t *ppos)
2967{ 3088{
2968 unsigned long *ptr = filp->private_data; 3089 unsigned long *ptr = filp->private_data;
2969 char buf[64];
2970 unsigned long val; 3090 unsigned long val;
2971 int ret; 3091 int ret;
2972 3092
2973 if (cnt >= sizeof(buf)) 3093 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2974 return -EINVAL; 3094 if (ret)
2975
2976 if (copy_from_user(&buf, ubuf, cnt))
2977 return -EFAULT;
2978
2979 buf[cnt] = 0;
2980
2981 ret = strict_strtoul(buf, 10, &val);
2982 if (ret < 0)
2983 return ret; 3095 return ret;
2984 3096
2985 *ptr = val * 1000; 3097 *ptr = val * 1000;
@@ -3434,67 +3546,54 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3434 size_t cnt, loff_t *ppos) 3546 size_t cnt, loff_t *ppos)
3435{ 3547{
3436 unsigned long val; 3548 unsigned long val;
3437 char buf[64]; 3549 int ret;
3438 int ret, cpu;
3439
3440 if (cnt >= sizeof(buf))
3441 return -EINVAL;
3442
3443 if (copy_from_user(&buf, ubuf, cnt))
3444 return -EFAULT;
3445
3446 buf[cnt] = 0;
3447 3550
3448 ret = strict_strtoul(buf, 10, &val); 3551 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3449 if (ret < 0) 3552 if (ret)
3450 return ret; 3553 return ret;
3451 3554
3452 /* must have at least 1 entry */ 3555 /* must have at least 1 entry */
3453 if (!val) 3556 if (!val)
3454 return -EINVAL; 3557 return -EINVAL;
3455 3558
3456 mutex_lock(&trace_types_lock);
3457
3458 tracing_stop();
3459
3460 /* disable all cpu buffers */
3461 for_each_tracing_cpu(cpu) {
3462 if (global_trace.data[cpu])
3463 atomic_inc(&global_trace.data[cpu]->disabled);
3464 if (max_tr.data[cpu])
3465 atomic_inc(&max_tr.data[cpu]->disabled);
3466 }
3467
3468 /* value is in KB */ 3559 /* value is in KB */
3469 val <<= 10; 3560 val <<= 10;
3470 3561
3471 if (val != global_trace.entries) { 3562 ret = tracing_resize_ring_buffer(val);
3472 ret = tracing_resize_ring_buffer(val); 3563 if (ret < 0)
3473 if (ret < 0) { 3564 return ret;
3474 cnt = ret;
3475 goto out;
3476 }
3477 }
3478 3565
3479 *ppos += cnt; 3566 *ppos += cnt;
3480 3567
3481 /* If check pages failed, return ENOMEM */ 3568 return cnt;
3482 if (tracing_disabled) 3569}
3483 cnt = -ENOMEM;
3484 out:
3485 for_each_tracing_cpu(cpu) {
3486 if (global_trace.data[cpu])
3487 atomic_dec(&global_trace.data[cpu]->disabled);
3488 if (max_tr.data[cpu])
3489 atomic_dec(&max_tr.data[cpu]->disabled);
3490 }
3491 3570
3492 tracing_start(); 3571static ssize_t
3493 mutex_unlock(&trace_types_lock); 3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3573 size_t cnt, loff_t *ppos)
3574{
3575 /*
3576 * There is no need to read what the user has written; this function
3577 * only exists so that "echo" into the file does not return an error
3578 */
3579
3580 *ppos += cnt;
3494 3581
3495 return cnt; 3582 return cnt;
3496} 3583}
3497 3584
3585static int
3586tracing_free_buffer_release(struct inode *inode, struct file *filp)
3587{
3588 /* disable tracing ? */
3589 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3590 tracing_off();
3591 /* resize the ring buffer to 0 */
3592 tracing_resize_ring_buffer(0);
3593
3594 return 0;
3595}
3596
3498static int mark_printk(const char *fmt, ...) 3597static int mark_printk(const char *fmt, ...)
3499{ 3598{
3500 int ret; 3599 int ret;
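The new free_buffer file is unusual in that the write is a no-op and the real work happens in ->release, so the buffer only shrinks once the descriptor is closed (a monitor can hold the file open and free the buffer simply by exiting). A hedged sketch of that file_operations shape with placeholder names:

#include <linux/fs.h>
#include <linux/uaccess.h>

static ssize_t example_noop_write(struct file *filp, const char __user *ubuf,
				  size_t cnt, loff_t *ppos)
{
	/* accept and ignore whatever "echo" wrote, just report success */
	*ppos += cnt;
	return cnt;
}

static int example_free_on_release(struct inode *inode, struct file *filp)
{
	/* the interesting side effect belongs here: shrink a buffer,
	 * drop a reference, disable a facility, ... */
	return 0;
}

static const struct file_operations example_free_fops = {
	.write   = example_noop_write,
	.release = example_free_on_release,
};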
@@ -3640,6 +3739,11 @@ static const struct file_operations tracing_entries_fops = {
3640 .llseek = generic_file_llseek, 3739 .llseek = generic_file_llseek,
3641}; 3740};
3642 3741
3742static const struct file_operations tracing_free_buffer_fops = {
3743 .write = tracing_free_buffer_write,
3744 .release = tracing_free_buffer_release,
3745};
3746
3643static const struct file_operations tracing_mark_fops = { 3747static const struct file_operations tracing_mark_fops = {
3644 .open = tracing_open_generic, 3748 .open = tracing_open_generic,
3645 .write = tracing_mark_write, 3749 .write = tracing_mark_write,
@@ -3696,7 +3800,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3696 return 0; 3800 return 0;
3697 3801
3698 if (!info->spare) 3802 if (!info->spare)
3699 info->spare = ring_buffer_alloc_read_page(info->tr->buffer); 3803 info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
3700 if (!info->spare) 3804 if (!info->spare)
3701 return -ENOMEM; 3805 return -ENOMEM;
3702 3806
@@ -3853,7 +3957,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3853 3957
3854 ref->ref = 1; 3958 ref->ref = 1;
3855 ref->buffer = info->tr->buffer; 3959 ref->buffer = info->tr->buffer;
3856 ref->page = ring_buffer_alloc_read_page(ref->buffer); 3960 ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
3857 if (!ref->page) { 3961 if (!ref->page) {
3858 kfree(ref); 3962 kfree(ref);
3859 break; 3963 break;
@@ -3862,8 +3966,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3862 r = ring_buffer_read_page(ref->buffer, &ref->page, 3966 r = ring_buffer_read_page(ref->buffer, &ref->page,
3863 len, info->cpu, 1); 3967 len, info->cpu, 1);
3864 if (r < 0) { 3968 if (r < 0) {
3865 ring_buffer_free_read_page(ref->buffer, 3969 ring_buffer_free_read_page(ref->buffer, ref->page);
3866 ref->page);
3867 kfree(ref); 3970 kfree(ref);
3868 break; 3971 break;
3869 } 3972 }
@@ -4099,19 +4202,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
4099{ 4202{
4100 struct trace_option_dentry *topt = filp->private_data; 4203 struct trace_option_dentry *topt = filp->private_data;
4101 unsigned long val; 4204 unsigned long val;
4102 char buf[64];
4103 int ret; 4205 int ret;
4104 4206
4105 if (cnt >= sizeof(buf)) 4207 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4106 return -EINVAL; 4208 if (ret)
4107
4108 if (copy_from_user(&buf, ubuf, cnt))
4109 return -EFAULT;
4110
4111 buf[cnt] = 0;
4112
4113 ret = strict_strtoul(buf, 10, &val);
4114 if (ret < 0)
4115 return ret; 4209 return ret;
4116 4210
4117 if (val != 0 && val != 1) 4211 if (val != 0 && val != 1)
@@ -4159,20 +4253,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4159 loff_t *ppos) 4253 loff_t *ppos)
4160{ 4254{
4161 long index = (long)filp->private_data; 4255 long index = (long)filp->private_data;
4162 char buf[64];
4163 unsigned long val; 4256 unsigned long val;
4164 int ret; 4257 int ret;
4165 4258
4166 if (cnt >= sizeof(buf)) 4259 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4167 return -EINVAL; 4260 if (ret)
4168
4169 if (copy_from_user(&buf, ubuf, cnt))
4170 return -EFAULT;
4171
4172 buf[cnt] = 0;
4173
4174 ret = strict_strtoul(buf, 10, &val);
4175 if (ret < 0)
4176 return ret; 4261 return ret;
4177 4262
4178 if (val != 0 && val != 1) 4263 if (val != 0 && val != 1)
@@ -4365,6 +4450,9 @@ static __init int tracer_init_debugfs(void)
4365 trace_create_file("buffer_size_kb", 0644, d_tracer, 4450 trace_create_file("buffer_size_kb", 0644, d_tracer,
4366 &global_trace, &tracing_entries_fops); 4451 &global_trace, &tracing_entries_fops);
4367 4452
4453 trace_create_file("free_buffer", 0644, d_tracer,
4454 &global_trace, &tracing_free_buffer_fops);
4455
4368 trace_create_file("trace_marker", 0220, d_tracer, 4456 trace_create_file("trace_marker", 0220, d_tracer,
4369 NULL, &tracing_mark_fops); 4457 NULL, &tracing_mark_fops);
4370 4458
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 229f8591f61d..3f381d0b20a8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -278,6 +278,29 @@ struct tracer {
278}; 278};
279 279
280 280
281/* Only current can touch trace_recursion */
282#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
283#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
284
285/* Ring buffer has the 10 LSB bits to count */
286#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
287
288/* for function tracing recursion */
289#define TRACE_INTERNAL_BIT (1<<11)
290#define TRACE_GLOBAL_BIT (1<<12)
291/*
292 * Abuse of the trace_recursion.
293 * As we need a way to maintain state if we are tracing the function
294 * graph in irq because we want to trace a particular function that
295 * was called in irq context but we have irq tracing off. Since this
296 * can only be modified by current, we can reuse trace_recursion.
297 */
298#define TRACE_IRQ_BIT (1<<13)
299
300#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
301#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
302#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
303
281#define TRACE_PIPE_ALL_CPU -1 304#define TRACE_PIPE_ALL_CPU -1
282 305
283int tracer_init(struct tracer *t, struct trace_array *tr); 306int tracer_init(struct tracer *t, struct trace_array *tr);
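Because trace_recursion lives in task_struct and only current ever touches it, the helpers above need no atomics or locking. An illustrative use of the set/test/clear macros; the bit name is hypothetical, not one defined by this patch:

/* hypothetical spare bit above the ones the patch defines */
#define TRACE_EXAMPLE_BIT	(1<<14)

static void example_mark_irq_context(int in_irq_path)
{
	if (in_irq_path)
		trace_recursion_set(TRACE_EXAMPLE_BIT);
	else
		trace_recursion_clear(TRACE_EXAMPLE_BIT);
}

static int example_in_marked_context(void)
{
	return trace_recursion_test(TRACE_EXAMPLE_BIT) != 0;
}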
@@ -389,6 +412,9 @@ void update_max_tr_single(struct trace_array *tr,
389void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 412void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
390 int skip, int pc); 413 int skip, int pc);
391 414
415void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
416 int skip, int pc, struct pt_regs *regs);
417
392void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, 418void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
393 int pc); 419 int pc);
394 420
@@ -400,6 +426,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer,
400{ 426{
401} 427}
402 428
429static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer,
430 unsigned long flags, int skip,
431 int pc, struct pt_regs *regs)
432{
433}
434
403static inline void ftrace_trace_userstack(struct ring_buffer *buffer, 435static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
404 unsigned long flags, int pc) 436 unsigned long flags, int pc)
405{ 437{
@@ -507,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr)
507 return 1; 539 return 1;
508 540
509 for (i = 0; i < ftrace_graph_count; i++) { 541 for (i = 0; i < ftrace_graph_count; i++) {
510 if (addr == ftrace_graph_funcs[i]) 542 if (addr == ftrace_graph_funcs[i]) {
543 /*
544 * If no irqs are to be traced, but a set_graph_function
545 * is set, and called by an interrupt handler, we still
546 * want to trace it.
547 */
548 if (in_irq())
549 trace_recursion_set(TRACE_IRQ_BIT);
550 else
551 trace_recursion_clear(TRACE_IRQ_BIT);
511 return 1; 552 return 1;
553 }
512 } 554 }
513 555
514 return 0; 556 return 0;
@@ -609,6 +651,7 @@ enum trace_iterator_flags {
609 TRACE_ITER_GRAPH_TIME = 0x80000, 651 TRACE_ITER_GRAPH_TIME = 0x80000,
610 TRACE_ITER_RECORD_CMD = 0x100000, 652 TRACE_ITER_RECORD_CMD = 0x100000,
611 TRACE_ITER_OVERWRITE = 0x200000, 653 TRACE_ITER_OVERWRITE = 0x200000,
654 TRACE_ITER_STOP_ON_FREE = 0x400000,
612}; 655};
613 656
614/* 657/*
@@ -677,6 +720,7 @@ struct event_subsystem {
677 struct dentry *entry; 720 struct dentry *entry;
678 struct event_filter *filter; 721 struct event_filter *filter;
679 int nr_events; 722 int nr_events;
723 int ref_count;
680}; 724};
681 725
682#define FILTER_PRED_INVALID ((unsigned short)-1) 726#define FILTER_PRED_INVALID ((unsigned short)-1)
@@ -784,19 +828,4 @@ extern const char *__stop___trace_bprintk_fmt[];
784 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 828 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
785#include "trace_entries.h" 829#include "trace_entries.h"
786 830
787/* Only current can touch trace_recursion */
788#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
789#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
790
791/* Ring buffer has the 10 LSB bits to count */
792#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
793
794/* for function tracing recursion */
795#define TRACE_INTERNAL_BIT (1<<11)
796#define TRACE_GLOBAL_BIT (1<<12)
797
798#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
799#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
800#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
801
802#endif /* _LINUX_KERNEL_TRACE_H */ 831#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e32744c84d94..93365907f219 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
161 TRACE_STACK, 161 TRACE_STACK,
162 162
163 F_STRUCT( 163 F_STRUCT(
164 __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) 164 __field( int, size )
165 __dynamic_array(unsigned long, caller )
165 ), 166 ),
166 167
167 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 168 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
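With this change a stack_entry carries an explicit size and a __dynamic_array of callers instead of a fixed FTRACE_STACK_ENTRIES array, so each event only stores as many frames as were actually captured. A hedged sketch of how a consumer might walk such an entry, assuming size counts saved frames as __ftrace_trace_stack() fills it in above:

#include <linux/printk.h>

/*
 * Illustrative reader for the reworked layout: the number of captured
 * frames is entry->size and the addresses follow inline in caller[].
 */
static void example_dump_stack_entry(struct stack_entry *entry)
{
	int i;

	for (i = 0; i < entry->size; i++)
		pr_info(" => %pS\n", (void *)entry->caller[i]);
}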
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 686ec399f2a8..581876f9f387 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -244,6 +244,35 @@ static void ftrace_clear_events(void)
244 mutex_unlock(&event_mutex); 244 mutex_unlock(&event_mutex);
245} 245}
246 246
247static void __put_system(struct event_subsystem *system)
248{
249 struct event_filter *filter = system->filter;
250
251 WARN_ON_ONCE(system->ref_count == 0);
252 if (--system->ref_count)
253 return;
254
255 if (filter) {
256 kfree(filter->filter_string);
257 kfree(filter);
258 }
259 kfree(system->name);
260 kfree(system);
261}
262
263static void __get_system(struct event_subsystem *system)
264{
265 WARN_ON_ONCE(system->ref_count == 0);
266 system->ref_count++;
267}
268
269static void put_system(struct event_subsystem *system)
270{
271 mutex_lock(&event_mutex);
272 __put_system(system);
273 mutex_unlock(&event_mutex);
274}
275
247/* 276/*
248 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 277 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
249 */ 278 */
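The subsystem now carries a plain integer ref_count protected by event_mutex rather than a kref, since every path that touches it already holds the mutex. A generic sketch of that idiom (the object type and lock are stand-ins, not the tracer's own):

#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct example_obj {
	int ref_count;			/* protected by example_mutex */
	/* ... payload ... */
};

static DEFINE_MUTEX(example_mutex);

static void example_get(struct example_obj *obj)
{
	WARN_ON_ONCE(obj->ref_count == 0);	/* caller must already hold a ref */
	obj->ref_count++;			/* caller holds example_mutex */
}

static void example_put(struct example_obj *obj)
{
	mutex_lock(&example_mutex);
	if (!--obj->ref_count)
		kfree(obj);			/* last reference frees the object */
	mutex_unlock(&example_mutex);
}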
@@ -486,20 +515,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
486 loff_t *ppos) 515 loff_t *ppos)
487{ 516{
488 struct ftrace_event_call *call = filp->private_data; 517 struct ftrace_event_call *call = filp->private_data;
489 char buf[64];
490 unsigned long val; 518 unsigned long val;
491 int ret; 519 int ret;
492 520
493 if (cnt >= sizeof(buf)) 521 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
494 return -EINVAL; 522 if (ret)
495
496 if (copy_from_user(&buf, ubuf, cnt))
497 return -EFAULT;
498
499 buf[cnt] = 0;
500
501 ret = strict_strtoul(buf, 10, &val);
502 if (ret < 0)
503 return ret; 523 return ret;
504 524
505 ret = tracing_update_buffers(); 525 ret = tracing_update_buffers();
@@ -528,7 +548,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
528 loff_t *ppos) 548 loff_t *ppos)
529{ 549{
530 const char set_to_char[4] = { '?', '0', '1', 'X' }; 550 const char set_to_char[4] = { '?', '0', '1', 'X' };
531 const char *system = filp->private_data; 551 struct event_subsystem *system = filp->private_data;
532 struct ftrace_event_call *call; 552 struct ftrace_event_call *call;
533 char buf[2]; 553 char buf[2];
534 int set = 0; 554 int set = 0;
@@ -539,7 +559,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
539 if (!call->name || !call->class || !call->class->reg) 559 if (!call->name || !call->class || !call->class->reg)
540 continue; 560 continue;
541 561
542 if (system && strcmp(call->class->system, system) != 0) 562 if (system && strcmp(call->class->system, system->name) != 0)
543 continue; 563 continue;
544 564
545 /* 565 /*
@@ -569,21 +589,13 @@ static ssize_t
569system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 589system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
570 loff_t *ppos) 590 loff_t *ppos)
571{ 591{
572 const char *system = filp->private_data; 592 struct event_subsystem *system = filp->private_data;
593 const char *name = NULL;
573 unsigned long val; 594 unsigned long val;
574 char buf[64];
575 ssize_t ret; 595 ssize_t ret;
576 596
577 if (cnt >= sizeof(buf)) 597 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
578 return -EINVAL; 598 if (ret)
579
580 if (copy_from_user(&buf, ubuf, cnt))
581 return -EFAULT;
582
583 buf[cnt] = 0;
584
585 ret = strict_strtoul(buf, 10, &val);
586 if (ret < 0)
587 return ret; 599 return ret;
588 600
589 ret = tracing_update_buffers(); 601 ret = tracing_update_buffers();
@@ -593,7 +605,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
593 if (val != 0 && val != 1) 605 if (val != 0 && val != 1)
594 return -EINVAL; 606 return -EINVAL;
595 607
596 ret = __ftrace_set_clr_event(NULL, system, NULL, val); 608 /*
609 * Opening of "enable" adds a ref count to system,
610 * so the name is safe to use.
611 */
612 if (system)
613 name = system->name;
614
615 ret = __ftrace_set_clr_event(NULL, name, NULL, val);
597 if (ret) 616 if (ret)
598 goto out; 617 goto out;
599 618
@@ -826,6 +845,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
826 return cnt; 845 return cnt;
827} 846}
828 847
848static LIST_HEAD(event_subsystems);
849
850static int subsystem_open(struct inode *inode, struct file *filp)
851{
852 struct event_subsystem *system = NULL;
853 int ret;
854
855 if (!inode->i_private)
856 goto skip_search;
857
858 /* Make sure the system still exists */
859 mutex_lock(&event_mutex);
860 list_for_each_entry(system, &event_subsystems, list) {
861 if (system == inode->i_private) {
862 /* Don't open systems with no events */
863 if (!system->nr_events) {
864 system = NULL;
865 break;
866 }
867 __get_system(system);
868 break;
869 }
870 }
871 mutex_unlock(&event_mutex);
872
873 if (system != inode->i_private)
874 return -ENODEV;
875
876 skip_search:
877 ret = tracing_open_generic(inode, filp);
878 if (ret < 0 && system)
879 put_system(system);
880
881 return ret;
882}
883
884static int subsystem_release(struct inode *inode, struct file *file)
885{
886 struct event_subsystem *system = inode->i_private;
887
888 if (system)
889 put_system(system);
890
891 return 0;
892}
893
829static ssize_t 894static ssize_t
830subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 895subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
831 loff_t *ppos) 896 loff_t *ppos)
@@ -963,17 +1028,19 @@ static const struct file_operations ftrace_event_filter_fops = {
963}; 1028};
964 1029
965static const struct file_operations ftrace_subsystem_filter_fops = { 1030static const struct file_operations ftrace_subsystem_filter_fops = {
966 .open = tracing_open_generic, 1031 .open = subsystem_open,
967 .read = subsystem_filter_read, 1032 .read = subsystem_filter_read,
968 .write = subsystem_filter_write, 1033 .write = subsystem_filter_write,
969 .llseek = default_llseek, 1034 .llseek = default_llseek,
1035 .release = subsystem_release,
970}; 1036};
971 1037
972static const struct file_operations ftrace_system_enable_fops = { 1038static const struct file_operations ftrace_system_enable_fops = {
973 .open = tracing_open_generic, 1039 .open = subsystem_open,
974 .read = system_enable_read, 1040 .read = system_enable_read,
975 .write = system_enable_write, 1041 .write = system_enable_write,
976 .llseek = default_llseek, 1042 .llseek = default_llseek,
1043 .release = subsystem_release,
977}; 1044};
978 1045
979static const struct file_operations ftrace_show_header_fops = { 1046static const struct file_operations ftrace_show_header_fops = {
@@ -1002,8 +1069,6 @@ static struct dentry *event_trace_events_dir(void)
1002 return d_events; 1069 return d_events;
1003} 1070}
1004 1071
1005static LIST_HEAD(event_subsystems);
1006
1007static struct dentry * 1072static struct dentry *
1008event_subsystem_dir(const char *name, struct dentry *d_events) 1073event_subsystem_dir(const char *name, struct dentry *d_events)
1009{ 1074{
@@ -1013,6 +1078,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1013 /* First see if we did not already create this dir */ 1078 /* First see if we did not already create this dir */
1014 list_for_each_entry(system, &event_subsystems, list) { 1079 list_for_each_entry(system, &event_subsystems, list) {
1015 if (strcmp(system->name, name) == 0) { 1080 if (strcmp(system->name, name) == 0) {
1081 __get_system(system);
1016 system->nr_events++; 1082 system->nr_events++;
1017 return system->entry; 1083 return system->entry;
1018 } 1084 }
@@ -1035,6 +1101,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1035 } 1101 }
1036 1102
1037 system->nr_events = 1; 1103 system->nr_events = 1;
1104 system->ref_count = 1;
1038 system->name = kstrdup(name, GFP_KERNEL); 1105 system->name = kstrdup(name, GFP_KERNEL);
1039 if (!system->name) { 1106 if (!system->name) {
1040 debugfs_remove(system->entry); 1107 debugfs_remove(system->entry);
@@ -1062,8 +1129,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1062 "'%s/filter' entry\n", name); 1129 "'%s/filter' entry\n", name);
1063 } 1130 }
1064 1131
1065 trace_create_file("enable", 0644, system->entry, 1132 trace_create_file("enable", 0644, system->entry, system,
1066 (void *)system->name,
1067 &ftrace_system_enable_fops); 1133 &ftrace_system_enable_fops);
1068 1134
1069 return system->entry; 1135 return system->entry;
@@ -1184,16 +1250,9 @@ static void remove_subsystem_dir(const char *name)
1184 list_for_each_entry(system, &event_subsystems, list) { 1250 list_for_each_entry(system, &event_subsystems, list) {
1185 if (strcmp(system->name, name) == 0) { 1251 if (strcmp(system->name, name) == 0) {
1186 if (!--system->nr_events) { 1252 if (!--system->nr_events) {
1187 struct event_filter *filter = system->filter;
1188
1189 debugfs_remove_recursive(system->entry); 1253 debugfs_remove_recursive(system->entry);
1190 list_del(&system->list); 1254 list_del(&system->list);
1191 if (filter) { 1255 __put_system(system);
1192 kfree(filter->filter_string);
1193 kfree(filter);
1194 }
1195 kfree(system->name);
1196 kfree(system);
1197 } 1256 }
1198 break; 1257 break;
1199 } 1258 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8008ddcfbf20..256764ecccd6 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1886 1886
1887 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1888 1888
1889 /* Make sure the system still has events */
1890 if (!system->nr_events) {
1891 err = -ENODEV;
1892 goto out_unlock;
1893 }
1894
1889 if (!strcmp(strstrip(filter_string), "0")) { 1895 if (!strcmp(strstrip(filter_string), "0")) {
1890 filter_free_subsystem_preds(system); 1896 filter_free_subsystem_preds(system);
1891 remove_filter_string(system->filter); 1897 remove_filter_string(system->filter);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8d0e1cc4e974..c7b0c6a7db09 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -324,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
324} 324}
325 325
326static int 326static int
327ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) 327ftrace_trace_onoff_callback(struct ftrace_hash *hash,
328 char *glob, char *cmd, char *param, int enable)
328{ 329{
329 struct ftrace_probe_ops *ops; 330 struct ftrace_probe_ops *ops;
330 void *count = (void *)-1; 331 void *count = (void *)-1;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 962cdb24ed81..a7d2a4c653d8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = {
74 74
75static struct trace_array *graph_array; 75static struct trace_array *graph_array;
76 76
77/*
78 * DURATION column is being also used to display IRQ signs,
79 * following values are used by print_graph_irq and others
80 * to fill in space into DURATION column.
81 */
82enum {
83 DURATION_FILL_FULL = -1,
84 DURATION_FILL_START = -2,
85 DURATION_FILL_END = -3,
86};
87
88static enum print_line_t
89print_graph_duration(unsigned long long duration, struct trace_seq *s,
90 u32 flags);
77 91
78/* Add a function return address to the trace stack on thread info.*/ 92/* Add a function return address to the trace stack on thread info.*/
79int 93int
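The three negative DURATION_FILL_* values are smuggled through print_graph_duration()'s unsigned long long duration argument as sentinels meaning "pad the column" rather than "print this time". A toy sketch of that dispatch, mirroring the patch's sentinel trick but printing into a plain buffer instead of a trace_seq (all names are illustrative):

#include <linux/kernel.h>

enum {
	EXAMPLE_FILL_FULL  = -1,
	EXAMPLE_FILL_START = -2,
	EXAMPLE_FILL_END   = -3,
};

static int example_print_duration(unsigned long long duration,
				  char *buf, size_t len)
{
	/* sentinel values ask for column padding, not a real duration */
	switch (duration) {
	case EXAMPLE_FILL_FULL:
		return snprintf(buf, len, "            |  ");
	case EXAMPLE_FILL_START:
		return snprintf(buf, len, "  ");
	case EXAMPLE_FILL_END:
		return snprintf(buf, len, " |");
	}

	/* mirror the '!' / '+' overhead markers for slow functions */
	if (duration > 100000ULL)
		return snprintf(buf, len, "! %llu", duration);
	if (duration > 10000ULL)
		return snprintf(buf, len, "+ %llu", duration);
	return snprintf(buf, len, "  %llu", duration);
}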
@@ -213,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr,
213 227
214static inline int ftrace_graph_ignore_irqs(void) 228static inline int ftrace_graph_ignore_irqs(void)
215{ 229{
216 if (!ftrace_graph_skip_irqs) 230 if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
217 return 0; 231 return 0;
218 232
219 return in_irq(); 233 return in_irq();
@@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter,
577 return next; 591 return next;
578} 592}
579 593
580/* Signal a overhead of time execution to the output */
581static int
582print_graph_overhead(unsigned long long duration, struct trace_seq *s,
583 u32 flags)
584{
585 /* If duration disappear, we don't need anything */
586 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
587 return 1;
588
589 /* Non nested entry or return */
590 if (duration == -1)
591 return trace_seq_printf(s, " ");
592
593 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
594 /* Duration exceeded 100 msecs */
595 if (duration > 100000ULL)
596 return trace_seq_printf(s, "! ");
597
598 /* Duration exceeded 10 msecs */
599 if (duration > 10000ULL)
600 return trace_seq_printf(s, "+ ");
601 }
602
603 return trace_seq_printf(s, " ");
604}
605
606static int print_graph_abs_time(u64 t, struct trace_seq *s) 594static int print_graph_abs_time(u64 t, struct trace_seq *s)
607{ 595{
608 unsigned long usecs_rem; 596 unsigned long usecs_rem;
@@ -625,34 +613,36 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
625 addr >= (unsigned long)__irqentry_text_end) 613 addr >= (unsigned long)__irqentry_text_end)
626 return TRACE_TYPE_UNHANDLED; 614 return TRACE_TYPE_UNHANDLED;
627 615
628 /* Absolute time */ 616 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
629 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 617 /* Absolute time */
630 ret = print_graph_abs_time(iter->ts, s); 618 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
631 if (!ret) 619 ret = print_graph_abs_time(iter->ts, s);
632 return TRACE_TYPE_PARTIAL_LINE; 620 if (!ret)
633 } 621 return TRACE_TYPE_PARTIAL_LINE;
622 }
634 623
635 /* Cpu */ 624 /* Cpu */
636 if (flags & TRACE_GRAPH_PRINT_CPU) { 625 if (flags & TRACE_GRAPH_PRINT_CPU) {
637 ret = print_graph_cpu(s, cpu); 626 ret = print_graph_cpu(s, cpu);
638 if (ret == TRACE_TYPE_PARTIAL_LINE) 627 if (ret == TRACE_TYPE_PARTIAL_LINE)
639 return TRACE_TYPE_PARTIAL_LINE; 628 return TRACE_TYPE_PARTIAL_LINE;
640 } 629 }
641 630
642 /* Proc */ 631 /* Proc */
643 if (flags & TRACE_GRAPH_PRINT_PROC) { 632 if (flags & TRACE_GRAPH_PRINT_PROC) {
644 ret = print_graph_proc(s, pid); 633 ret = print_graph_proc(s, pid);
645 if (ret == TRACE_TYPE_PARTIAL_LINE) 634 if (ret == TRACE_TYPE_PARTIAL_LINE)
646 return TRACE_TYPE_PARTIAL_LINE; 635 return TRACE_TYPE_PARTIAL_LINE;
647 ret = trace_seq_printf(s, " | "); 636 ret = trace_seq_printf(s, " | ");
648 if (!ret) 637 if (!ret)
649 return TRACE_TYPE_PARTIAL_LINE; 638 return TRACE_TYPE_PARTIAL_LINE;
639 }
650 } 640 }
651 641
652 /* No overhead */ 642 /* No overhead */
653 ret = print_graph_overhead(-1, s, flags); 643 ret = print_graph_duration(DURATION_FILL_START, s, flags);
654 if (!ret) 644 if (ret != TRACE_TYPE_HANDLED)
655 return TRACE_TYPE_PARTIAL_LINE; 645 return ret;
656 646
657 if (type == TRACE_GRAPH_ENT) 647 if (type == TRACE_GRAPH_ENT)
658 ret = trace_seq_printf(s, "==========>"); 648 ret = trace_seq_printf(s, "==========>");
@@ -662,9 +652,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
662 if (!ret) 652 if (!ret)
663 return TRACE_TYPE_PARTIAL_LINE; 653 return TRACE_TYPE_PARTIAL_LINE;
664 654
665 /* Don't close the duration column if haven't one */ 655 ret = print_graph_duration(DURATION_FILL_END, s, flags);
666 if (flags & TRACE_GRAPH_PRINT_DURATION) 656 if (ret != TRACE_TYPE_HANDLED)
667 trace_seq_printf(s, " |"); 657 return ret;
658
668 ret = trace_seq_printf(s, "\n"); 659 ret = trace_seq_printf(s, "\n");
669 660
670 if (!ret) 661 if (!ret)
@@ -716,9 +707,49 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
716} 707}
717 708
718static enum print_line_t 709static enum print_line_t
719print_graph_duration(unsigned long long duration, struct trace_seq *s) 710print_graph_duration(unsigned long long duration, struct trace_seq *s,
711 u32 flags)
720{ 712{
721 int ret; 713 int ret = -1;
714
715 if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
716 !(trace_flags & TRACE_ITER_CONTEXT_INFO))
717 return TRACE_TYPE_HANDLED;
718
 719 /* No real data, just filling the column with spaces */
720 switch (duration) {
721 case DURATION_FILL_FULL:
722 ret = trace_seq_printf(s, " | ");
723 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
724 case DURATION_FILL_START:
725 ret = trace_seq_printf(s, " ");
726 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
727 case DURATION_FILL_END:
728 ret = trace_seq_printf(s, " |");
729 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
730 }
731
 732 /* Signal an overhead of execution time to the output */
733 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
734 /* Duration exceeded 100 msecs */
735 if (duration > 100000ULL)
736 ret = trace_seq_printf(s, "! ");
737 /* Duration exceeded 10 msecs */
738 else if (duration > 10000ULL)
739 ret = trace_seq_printf(s, "+ ");
740 }
741
742 /*
 743 * The -1 means we either did not exceed the duration thresholds
 744 * or we don't want to print out the overhead. Either way we need
745 * to fill out the space.
746 */
747 if (ret == -1)
748 ret = trace_seq_printf(s, " ");
749
 750 /* Catch here any failure that happened above */
751 if (!ret)
752 return TRACE_TYPE_PARTIAL_LINE;
722 753
723 ret = trace_print_graph_duration(duration, s); 754 ret = trace_print_graph_duration(duration, s);
724 if (ret != TRACE_TYPE_HANDLED) 755 if (ret != TRACE_TYPE_HANDLED)
@@ -767,18 +798,11 @@ print_graph_entry_leaf(struct trace_iterator *iter,
767 cpu_data->enter_funcs[call->depth] = 0; 798 cpu_data->enter_funcs[call->depth] = 0;
768 } 799 }
769 800
770 /* Overhead */ 801 /* Overhead and duration */
771 ret = print_graph_overhead(duration, s, flags); 802 ret = print_graph_duration(duration, s, flags);
772 if (!ret) 803 if (ret == TRACE_TYPE_PARTIAL_LINE)
773 return TRACE_TYPE_PARTIAL_LINE; 804 return TRACE_TYPE_PARTIAL_LINE;
774 805
775 /* Duration */
776 if (flags & TRACE_GRAPH_PRINT_DURATION) {
777 ret = print_graph_duration(duration, s);
778 if (ret == TRACE_TYPE_PARTIAL_LINE)
779 return TRACE_TYPE_PARTIAL_LINE;
780 }
781
782 /* Function */ 806 /* Function */
783 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 807 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
784 ret = trace_seq_printf(s, " "); 808 ret = trace_seq_printf(s, " ");
@@ -815,17 +839,10 @@ print_graph_entry_nested(struct trace_iterator *iter,
815 cpu_data->enter_funcs[call->depth] = call->func; 839 cpu_data->enter_funcs[call->depth] = call->func;
816 } 840 }
817 841
818 /* No overhead */
819 ret = print_graph_overhead(-1, s, flags);
820 if (!ret)
821 return TRACE_TYPE_PARTIAL_LINE;
822
823 /* No time */ 842 /* No time */
824 if (flags & TRACE_GRAPH_PRINT_DURATION) { 843 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
825 ret = trace_seq_printf(s, " | "); 844 if (ret != TRACE_TYPE_HANDLED)
826 if (!ret) 845 return ret;
827 return TRACE_TYPE_PARTIAL_LINE;
828 }
829 846
830 /* Function */ 847 /* Function */
831 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 848 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -865,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
865 return TRACE_TYPE_PARTIAL_LINE; 882 return TRACE_TYPE_PARTIAL_LINE;
866 } 883 }
867 884
885 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
886 return 0;
887
868 /* Absolute time */ 888 /* Absolute time */
869 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 889 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
870 ret = print_graph_abs_time(iter->ts, s); 890 ret = print_graph_abs_time(iter->ts, s);
@@ -1078,18 +1098,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1078 if (print_graph_prologue(iter, s, 0, 0, flags)) 1098 if (print_graph_prologue(iter, s, 0, 0, flags))
1079 return TRACE_TYPE_PARTIAL_LINE; 1099 return TRACE_TYPE_PARTIAL_LINE;
1080 1100
1081 /* Overhead */ 1101 /* Overhead and duration */
1082 ret = print_graph_overhead(duration, s, flags); 1102 ret = print_graph_duration(duration, s, flags);
1083 if (!ret) 1103 if (ret == TRACE_TYPE_PARTIAL_LINE)
1084 return TRACE_TYPE_PARTIAL_LINE; 1104 return TRACE_TYPE_PARTIAL_LINE;
1085 1105
1086 /* Duration */
1087 if (flags & TRACE_GRAPH_PRINT_DURATION) {
1088 ret = print_graph_duration(duration, s);
1089 if (ret == TRACE_TYPE_PARTIAL_LINE)
1090 return TRACE_TYPE_PARTIAL_LINE;
1091 }
1092
1093 /* Closing brace */ 1106 /* Closing brace */
1094 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1107 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
1095 ret = trace_seq_printf(s, " "); 1108 ret = trace_seq_printf(s, " ");
@@ -1146,17 +1159,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1146 if (print_graph_prologue(iter, s, 0, 0, flags)) 1159 if (print_graph_prologue(iter, s, 0, 0, flags))
1147 return TRACE_TYPE_PARTIAL_LINE; 1160 return TRACE_TYPE_PARTIAL_LINE;
1148 1161
1149 /* No overhead */
1150 ret = print_graph_overhead(-1, s, flags);
1151 if (!ret)
1152 return TRACE_TYPE_PARTIAL_LINE;
1153
1154 /* No time */ 1162 /* No time */
1155 if (flags & TRACE_GRAPH_PRINT_DURATION) { 1163 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
1156 ret = trace_seq_printf(s, " | "); 1164 if (ret != TRACE_TYPE_HANDLED)
1157 if (!ret) 1165 return ret;
1158 return TRACE_TYPE_PARTIAL_LINE;
1159 }
1160 1166
1161 /* Indentation */ 1167 /* Indentation */
1162 if (depth > 0) 1168 if (depth > 0)
@@ -1207,7 +1213,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1207 1213
1208 1214
1209enum print_line_t 1215enum print_line_t
1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1216print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1211{ 1217{
1212 struct ftrace_graph_ent_entry *field; 1218 struct ftrace_graph_ent_entry *field;
1213 struct fgraph_data *data = iter->private; 1219 struct fgraph_data *data = iter->private;
@@ -1270,18 +1276,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1270static enum print_line_t 1276static enum print_line_t
1271print_graph_function(struct trace_iterator *iter) 1277print_graph_function(struct trace_iterator *iter)
1272{ 1278{
1273 return __print_graph_function_flags(iter, tracer_flags.val); 1279 return print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1285} 1280}
1286 1281
1287static enum print_line_t 1282static enum print_line_t
@@ -1309,8 +1304,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1309 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); 1304 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
1310 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); 1305 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
1311 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); 1306 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
1312 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); 1307 seq_printf(s, "#%.*s||| / \n", size, spaces);
1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1314} 1308}
1315 1309
1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags) 1310static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
@@ -1329,7 +1323,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1329 if (flags & TRACE_GRAPH_PRINT_PROC) 1323 if (flags & TRACE_GRAPH_PRINT_PROC)
1330 seq_printf(s, " TASK/PID "); 1324 seq_printf(s, " TASK/PID ");
1331 if (lat) 1325 if (lat)
1332 seq_printf(s, "|||||"); 1326 seq_printf(s, "||||");
1333 if (flags & TRACE_GRAPH_PRINT_DURATION) 1327 if (flags & TRACE_GRAPH_PRINT_DURATION)
1334 seq_printf(s, " DURATION "); 1328 seq_printf(s, " DURATION ");
1335 seq_printf(s, " FUNCTION CALLS\n"); 1329 seq_printf(s, " FUNCTION CALLS\n");
@@ -1343,7 +1337,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1343 if (flags & TRACE_GRAPH_PRINT_PROC) 1337 if (flags & TRACE_GRAPH_PRINT_PROC)
1344 seq_printf(s, " | | "); 1338 seq_printf(s, " | | ");
1345 if (lat) 1339 if (lat)
1346 seq_printf(s, "|||||"); 1340 seq_printf(s, "||||");
1347 if (flags & TRACE_GRAPH_PRINT_DURATION) 1341 if (flags & TRACE_GRAPH_PRINT_DURATION)
1348 seq_printf(s, " | | "); 1342 seq_printf(s, " | | ");
1349 seq_printf(s, " | | | |\n"); 1343 seq_printf(s, " | | | |\n");
@@ -1358,15 +1352,16 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{ 1352{
1359 struct trace_iterator *iter = s->private; 1353 struct trace_iterator *iter = s->private;
1360 1354
1355 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
1356 return;
1357
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) { 1358 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */ 1359 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter)) 1360 if (trace_empty(iter))
1364 return; 1361 return;
1365 1362
1366 print_trace_header(s, iter); 1363 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION; 1364 }
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370 1365
1371 __print_graph_headers_flags(s, flags); 1366 __print_graph_headers_flags(s, flags);
1372} 1367}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index c77424be284d..667aa8cc0cfc 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -226,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
226} 226}
227 227
228#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ 228#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
229 TRACE_GRAPH_PRINT_PROC) 229 TRACE_GRAPH_PRINT_PROC | \
230 TRACE_GRAPH_PRINT_ABS_TIME | \
231 TRACE_GRAPH_PRINT_DURATION)
230 232
231static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 233static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
232{ 234{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 27d13b36b8be..5fb3697bf0e5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -343,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref)
343DEFINE_FETCH_deref(string) 343DEFINE_FETCH_deref(string)
344DEFINE_FETCH_deref(string_size) 344DEFINE_FETCH_deref(string_size)
345 345
346static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
347{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
349 update_deref_fetch_param(data->orig.data);
350 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
351 update_symbol_cache(data->orig.data);
352}
353
346static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 354static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
347{ 355{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
@@ -377,6 +385,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield)
377#define fetch_bitfield_string_size NULL 385#define fetch_bitfield_string_size NULL
378 386
379static __kprobes void 387static __kprobes void
388update_bitfield_fetch_param(struct bitfield_fetch_param *data)
389{
390 /*
391 * Don't check the bitfield itself, because this must be the
392 * last fetch function.
393 */
394 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
395 update_deref_fetch_param(data->orig.data);
396 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
397 update_symbol_cache(data->orig.data);
398}
399
400static __kprobes void
380free_bitfield_fetch_param(struct bitfield_fetch_param *data) 401free_bitfield_fetch_param(struct bitfield_fetch_param *data)
381{ 402{
382 /* 403 /*
@@ -389,6 +410,7 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
389 free_symbol_cache(data->orig.data); 410 free_symbol_cache(data->orig.data);
390 kfree(data); 411 kfree(data);
391} 412}
413
392/* Default (unsigned long) fetch type */ 414/* Default (unsigned long) fetch type */
393#define __DEFAULT_FETCH_TYPE(t) u##t 415#define __DEFAULT_FETCH_TYPE(t) u##t
394#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 416#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
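A fetch argument can nest (a dereference wrapping another dereference, ending in a symbol), so the new update_*_fetch_param() helpers recurse down the chain until they reach a symbol cache to refresh. A hedged sketch of that traversal over a deliberately simplified node type (nothing here is the tracer's real data structure):

struct example_fetch {
	int is_deref;			/* wraps another fetch node */
	struct example_fetch *inner;	/* valid when is_deref is set */
	int needs_resolve;		/* symbol leaf: address must be re-looked-up */
};

/*
 * Walk a (possibly nested) fetch chain and flag every symbol leaf for
 * re-resolution, e.g. after the module that defines it is re-loaded.
 */
static void example_update_fetch(struct example_fetch *node)
{
	if (node->is_deref)
		example_update_fetch(node->inner);
	else
		node->needs_resolve = 1;
}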
@@ -536,6 +558,7 @@ struct probe_arg {
536/* Flags for trace_probe */ 558/* Flags for trace_probe */
537#define TP_FLAG_TRACE 1 559#define TP_FLAG_TRACE 1
538#define TP_FLAG_PROFILE 2 560#define TP_FLAG_PROFILE 2
561#define TP_FLAG_REGISTERED 4
539 562
540struct trace_probe { 563struct trace_probe {
541 struct list_head list; 564 struct list_head list;
@@ -555,16 +578,49 @@ struct trace_probe {
555 (sizeof(struct probe_arg) * (n))) 578 (sizeof(struct probe_arg) * (n)))
556 579
557 580
558static __kprobes int probe_is_return(struct trace_probe *tp) 581static __kprobes int trace_probe_is_return(struct trace_probe *tp)
559{ 582{
560 return tp->rp.handler != NULL; 583 return tp->rp.handler != NULL;
561} 584}
562 585
563static __kprobes const char *probe_symbol(struct trace_probe *tp) 586static __kprobes const char *trace_probe_symbol(struct trace_probe *tp)
564{ 587{
565 return tp->symbol ? tp->symbol : "unknown"; 588 return tp->symbol ? tp->symbol : "unknown";
566} 589}
567 590
591static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp)
592{
593 return tp->rp.kp.offset;
594}
595
596static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp)
597{
598 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
599}
600
601static __kprobes bool trace_probe_is_registered(struct trace_probe *tp)
602{
603 return !!(tp->flags & TP_FLAG_REGISTERED);
604}
605
606static __kprobes bool trace_probe_has_gone(struct trace_probe *tp)
607{
608 return !!(kprobe_gone(&tp->rp.kp));
609}
610
611static __kprobes bool trace_probe_within_module(struct trace_probe *tp,
612 struct module *mod)
613{
614 int len = strlen(mod->name);
615 const char *name = trace_probe_symbol(tp);
616 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
617}
618
619static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
620{
621 return !!strchr(trace_probe_symbol(tp), ':');
622}
623
568static int register_probe_event(struct trace_probe *tp); 624static int register_probe_event(struct trace_probe *tp);
569static void unregister_probe_event(struct trace_probe *tp); 625static void unregister_probe_event(struct trace_probe *tp);
570 626
@@ -646,6 +702,16 @@ error:
646 return ERR_PTR(ret); 702 return ERR_PTR(ret);
647} 703}
648 704
705static void update_probe_arg(struct probe_arg *arg)
706{
707 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
708 update_bitfield_fetch_param(arg->fetch.data);
709 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
710 update_deref_fetch_param(arg->fetch.data);
711 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
712 update_symbol_cache(arg->fetch.data);
713}
714
649static void free_probe_arg(struct probe_arg *arg) 715static void free_probe_arg(struct probe_arg *arg)
650{ 716{
651 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) 717 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
@@ -671,7 +737,7 @@ static void free_trace_probe(struct trace_probe *tp)
671 kfree(tp); 737 kfree(tp);
672} 738}
673 739
674static struct trace_probe *find_probe_event(const char *event, 740static struct trace_probe *find_trace_probe(const char *event,
675 const char *group) 741 const char *group)
676{ 742{
677 struct trace_probe *tp; 743 struct trace_probe *tp;
@@ -683,13 +749,96 @@ static struct trace_probe *find_probe_event(const char *event,
683 return NULL; 749 return NULL;
684} 750}
685 751
752/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
753static int enable_trace_probe(struct trace_probe *tp, int flag)
754{
755 int ret = 0;
756
757 tp->flags |= flag;
758 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) &&
759 !trace_probe_has_gone(tp)) {
760 if (trace_probe_is_return(tp))
761 ret = enable_kretprobe(&tp->rp);
762 else
763 ret = enable_kprobe(&tp->rp.kp);
764 }
765
766 return ret;
767}
768
769/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
770static void disable_trace_probe(struct trace_probe *tp, int flag)
771{
772 tp->flags &= ~flag;
773 if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) {
774 if (trace_probe_is_return(tp))
775 disable_kretprobe(&tp->rp);
776 else
777 disable_kprobe(&tp->rp.kp);
778 }
779}
780
781/* Internal register function - just handle k*probes and flags */
782static int __register_trace_probe(struct trace_probe *tp)
783{
784 int i, ret;
785
786 if (trace_probe_is_registered(tp))
787 return -EINVAL;
788
789 for (i = 0; i < tp->nr_args; i++)
790 update_probe_arg(&tp->args[i]);
791
792 /* Set/clear disabled flag according to tp->flags */
793 if (trace_probe_is_enabled(tp))
794 tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
795 else
796 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
797
798 if (trace_probe_is_return(tp))
799 ret = register_kretprobe(&tp->rp);
800 else
801 ret = register_kprobe(&tp->rp.kp);
802
803 if (ret == 0)
804 tp->flags |= TP_FLAG_REGISTERED;
805 else {
806 pr_warning("Could not insert probe at %s+%lu: %d\n",
807 trace_probe_symbol(tp), trace_probe_offset(tp), ret);
808 if (ret == -ENOENT && trace_probe_is_on_module(tp)) {
809 pr_warning("This probe might be able to register after"
810 " target module is loaded. Continue.\n");
811 ret = 0;
812 } else if (ret == -EILSEQ) {
813 pr_warning("Probing address(0x%p) is not an "
814 "instruction boundary.\n",
815 tp->rp.kp.addr);
816 ret = -EINVAL;
817 }
818 }
819
820 return ret;
821}
822
823/* Internal unregister function - just handle k*probes and flags */
824static void __unregister_trace_probe(struct trace_probe *tp)
825{
826 if (trace_probe_is_registered(tp)) {
827 if (trace_probe_is_return(tp))
828 unregister_kretprobe(&tp->rp);
829 else
830 unregister_kprobe(&tp->rp.kp);
831 tp->flags &= ~TP_FLAG_REGISTERED;
832 /* Cleanup kprobe for reuse */
833 if (tp->rp.kp.symbol_name)
834 tp->rp.kp.addr = NULL;
835 }
836}
837
686/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 838/* Unregister a trace_probe and probe_event: call with locking probe_lock */
687static void unregister_trace_probe(struct trace_probe *tp) 839static void unregister_trace_probe(struct trace_probe *tp)
688{ 840{
689 if (probe_is_return(tp)) 841 __unregister_trace_probe(tp);
690 unregister_kretprobe(&tp->rp);
691 else
692 unregister_kprobe(&tp->rp.kp);
693 list_del(&tp->list); 842 list_del(&tp->list);
694 unregister_probe_event(tp); 843 unregister_probe_event(tp);
695} 844}
@@ -702,41 +851,65 @@ static int register_trace_probe(struct trace_probe *tp)
702 851
703 mutex_lock(&probe_lock); 852 mutex_lock(&probe_lock);
704 853
705 /* register as an event */ 854 /* Delete old (same name) event if it exists */
706 old_tp = find_probe_event(tp->call.name, tp->call.class->system); 855 old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
707 if (old_tp) { 856 if (old_tp) {
708 /* delete old event */
709 unregister_trace_probe(old_tp); 857 unregister_trace_probe(old_tp);
710 free_trace_probe(old_tp); 858 free_trace_probe(old_tp);
711 } 859 }
860
861 /* Register new event */
712 ret = register_probe_event(tp); 862 ret = register_probe_event(tp);
713 if (ret) { 863 if (ret) {
714 pr_warning("Failed to register probe event(%d)\n", ret); 864 pr_warning("Failed to register probe event(%d)\n", ret);
715 goto end; 865 goto end;
716 } 866 }
717 867
718 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; 868 /* Register k*probe */
719 if (probe_is_return(tp)) 869 ret = __register_trace_probe(tp);
720 ret = register_kretprobe(&tp->rp); 870 if (ret < 0)
721 else
722 ret = register_kprobe(&tp->rp.kp);
723
724 if (ret) {
725 pr_warning("Could not insert probe(%d)\n", ret);
726 if (ret == -EILSEQ) {
727 pr_warning("Probing address(0x%p) is not an "
728 "instruction boundary.\n",
729 tp->rp.kp.addr);
730 ret = -EINVAL;
731 }
732 unregister_probe_event(tp); 871 unregister_probe_event(tp);
733 } else 872 else
734 list_add_tail(&tp->list, &probe_list); 873 list_add_tail(&tp->list, &probe_list);
874
735end: 875end:
736 mutex_unlock(&probe_lock); 876 mutex_unlock(&probe_lock);
737 return ret; 877 return ret;
738} 878}
739 879
880/* Module notifier callback, checking events on the module */
881static int trace_probe_module_callback(struct notifier_block *nb,
882 unsigned long val, void *data)
883{
884 struct module *mod = data;
885 struct trace_probe *tp;
886 int ret;
887
888 if (val != MODULE_STATE_COMING)
889 return NOTIFY_DONE;
890
891 /* Update probes on coming module */
892 mutex_lock(&probe_lock);
893 list_for_each_entry(tp, &probe_list, list) {
894 if (trace_probe_within_module(tp, mod)) {
895 __unregister_trace_probe(tp);
896 ret = __register_trace_probe(tp);
897 if (ret)
898 pr_warning("Failed to re-register probe %s on"
899 " %s: %d\n",
900 tp->call.name, mod->name, ret);
901 }
902 }
903 mutex_unlock(&probe_lock);
904
905 return NOTIFY_DONE;
906}
907
908static struct notifier_block trace_probe_module_nb = {
909 .notifier_call = trace_probe_module_callback,
910 .priority = 1 /* Invoked after kprobe module callback */
911};
912
740/* Split symbol and offset. */ 913/* Split symbol and offset. */
741static int split_symbol_offset(char *symbol, unsigned long *offset) 914static int split_symbol_offset(char *symbol, unsigned long *offset)
742{ 915{
@@ -962,8 +1135,8 @@ static int create_trace_probe(int argc, char **argv)
962{ 1135{
963 /* 1136 /*
964 * Argument syntax: 1137 * Argument syntax:
965 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 1138 * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
966 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 1139 * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
967 * Fetch args: 1140 * Fetch args:
968 * $retval : fetch return value 1141 * $retval : fetch return value
969 * $stack : fetch stack address 1142 * $stack : fetch stack address
@@ -1025,7 +1198,7 @@ static int create_trace_probe(int argc, char **argv)
1025 return -EINVAL; 1198 return -EINVAL;
1026 } 1199 }
1027 mutex_lock(&probe_lock); 1200 mutex_lock(&probe_lock);
1028 tp = find_probe_event(event, group); 1201 tp = find_trace_probe(event, group);
1029 if (!tp) { 1202 if (!tp) {
1030 mutex_unlock(&probe_lock); 1203 mutex_unlock(&probe_lock);
1031 pr_info("Event %s/%s doesn't exist.\n", group, event); 1204 pr_info("Event %s/%s doesn't exist.\n", group, event);
@@ -1144,7 +1317,7 @@ error:
1144 return ret; 1317 return ret;
1145} 1318}
1146 1319
1147static void cleanup_all_probes(void) 1320static void release_all_trace_probes(void)
1148{ 1321{
1149 struct trace_probe *tp; 1322 struct trace_probe *tp;
1150 1323
@@ -1158,7 +1331,6 @@ static void cleanup_all_probes(void)
1158 mutex_unlock(&probe_lock); 1331 mutex_unlock(&probe_lock);
1159} 1332}
1160 1333
1161
1162/* Probes listing interfaces */ 1334/* Probes listing interfaces */
1163static void *probes_seq_start(struct seq_file *m, loff_t *pos) 1335static void *probes_seq_start(struct seq_file *m, loff_t *pos)
1164{ 1336{
@@ -1181,15 +1353,16 @@ static int probes_seq_show(struct seq_file *m, void *v)
1181 struct trace_probe *tp = v; 1353 struct trace_probe *tp = v;
1182 int i; 1354 int i;
1183 1355
1184 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 1356 seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p');
1185 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); 1357 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
1186 1358
1187 if (!tp->symbol) 1359 if (!tp->symbol)
1188 seq_printf(m, " 0x%p", tp->rp.kp.addr); 1360 seq_printf(m, " 0x%p", tp->rp.kp.addr);
1189 else if (tp->rp.kp.offset) 1361 else if (tp->rp.kp.offset)
1190 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); 1362 seq_printf(m, " %s+%u", trace_probe_symbol(tp),
1363 tp->rp.kp.offset);
1191 else 1364 else
1192 seq_printf(m, " %s", probe_symbol(tp)); 1365 seq_printf(m, " %s", trace_probe_symbol(tp));
1193 1366
1194 for (i = 0; i < tp->nr_args; i++) 1367 for (i = 0; i < tp->nr_args; i++)
1195 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); 1368 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
@@ -1209,7 +1382,7 @@ static int probes_open(struct inode *inode, struct file *file)
1209{ 1382{
1210 if ((file->f_mode & FMODE_WRITE) && 1383 if ((file->f_mode & FMODE_WRITE) &&
1211 (file->f_flags & O_TRUNC)) 1384 (file->f_flags & O_TRUNC))
1212 cleanup_all_probes(); 1385 release_all_trace_probes();
1213 1386
1214 return seq_open(file, &probes_seq_op); 1387 return seq_open(file, &probes_seq_op);
1215} 1388}
@@ -1397,7 +1570,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1397 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1570 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1398 1571
1399 if (!filter_current_check_discard(buffer, call, entry, event)) 1572 if (!filter_current_check_discard(buffer, call, entry, event))
1400 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1573 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1574 irq_flags, pc, regs);
1401} 1575}
1402 1576
1403/* Kretprobe handler */ 1577/* Kretprobe handler */
@@ -1429,7 +1603,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1429 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1603 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1430 1604
1431 if (!filter_current_check_discard(buffer, call, entry, event)) 1605 if (!filter_current_check_discard(buffer, call, entry, event))
1432 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1606 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1607 irq_flags, pc, regs);
1433} 1608}
1434 1609
1435/* Event entry printers */ 1610/* Event entry printers */
@@ -1511,30 +1686,6 @@ partial:
1511 return TRACE_TYPE_PARTIAL_LINE; 1686 return TRACE_TYPE_PARTIAL_LINE;
1512} 1687}
1513 1688
1514static int probe_event_enable(struct ftrace_event_call *call)
1515{
1516 struct trace_probe *tp = (struct trace_probe *)call->data;
1517
1518 tp->flags |= TP_FLAG_TRACE;
1519 if (probe_is_return(tp))
1520 return enable_kretprobe(&tp->rp);
1521 else
1522 return enable_kprobe(&tp->rp.kp);
1523}
1524
1525static void probe_event_disable(struct ftrace_event_call *call)
1526{
1527 struct trace_probe *tp = (struct trace_probe *)call->data;
1528
1529 tp->flags &= ~TP_FLAG_TRACE;
1530 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1531 if (probe_is_return(tp))
1532 disable_kretprobe(&tp->rp);
1533 else
1534 disable_kprobe(&tp->rp.kp);
1535 }
1536}
1537
1538#undef DEFINE_FIELD 1689#undef DEFINE_FIELD
1539#define DEFINE_FIELD(type, item, name, is_signed) \ 1690#define DEFINE_FIELD(type, item, name, is_signed) \
1540 do { \ 1691 do { \
@@ -1596,7 +1747,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1596 1747
1597 const char *fmt, *arg; 1748 const char *fmt, *arg;
1598 1749
1599 if (!probe_is_return(tp)) { 1750 if (!trace_probe_is_return(tp)) {
1600 fmt = "(%lx)"; 1751 fmt = "(%lx)";
1601 arg = "REC->" FIELD_STRING_IP; 1752 arg = "REC->" FIELD_STRING_IP;
1602 } else { 1753 } else {
@@ -1713,49 +1864,25 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1713 head = this_cpu_ptr(call->perf_events); 1864 head = this_cpu_ptr(call->perf_events);
1714 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1865 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1715} 1866}
1716
1717static int probe_perf_enable(struct ftrace_event_call *call)
1718{
1719 struct trace_probe *tp = (struct trace_probe *)call->data;
1720
1721 tp->flags |= TP_FLAG_PROFILE;
1722
1723 if (probe_is_return(tp))
1724 return enable_kretprobe(&tp->rp);
1725 else
1726 return enable_kprobe(&tp->rp.kp);
1727}
1728
1729static void probe_perf_disable(struct ftrace_event_call *call)
1730{
1731 struct trace_probe *tp = (struct trace_probe *)call->data;
1732
1733 tp->flags &= ~TP_FLAG_PROFILE;
1734
1735 if (!(tp->flags & TP_FLAG_TRACE)) {
1736 if (probe_is_return(tp))
1737 disable_kretprobe(&tp->rp);
1738 else
1739 disable_kprobe(&tp->rp.kp);
1740 }
1741}
1742#endif /* CONFIG_PERF_EVENTS */ 1867#endif /* CONFIG_PERF_EVENTS */
1743 1868
1744static __kprobes 1869static __kprobes
1745int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) 1870int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1746{ 1871{
1872 struct trace_probe *tp = (struct trace_probe *)event->data;
1873
1747 switch (type) { 1874 switch (type) {
1748 case TRACE_REG_REGISTER: 1875 case TRACE_REG_REGISTER:
1749 return probe_event_enable(event); 1876 return enable_trace_probe(tp, TP_FLAG_TRACE);
1750 case TRACE_REG_UNREGISTER: 1877 case TRACE_REG_UNREGISTER:
1751 probe_event_disable(event); 1878 disable_trace_probe(tp, TP_FLAG_TRACE);
1752 return 0; 1879 return 0;
1753 1880
1754#ifdef CONFIG_PERF_EVENTS 1881#ifdef CONFIG_PERF_EVENTS
1755 case TRACE_REG_PERF_REGISTER: 1882 case TRACE_REG_PERF_REGISTER:
1756 return probe_perf_enable(event); 1883 return enable_trace_probe(tp, TP_FLAG_PROFILE);
1757 case TRACE_REG_PERF_UNREGISTER: 1884 case TRACE_REG_PERF_UNREGISTER:
1758 probe_perf_disable(event); 1885 disable_trace_probe(tp, TP_FLAG_PROFILE);
1759 return 0; 1886 return 0;
1760#endif 1887#endif
1761 } 1888 }
@@ -1805,7 +1932,7 @@ static int register_probe_event(struct trace_probe *tp)
1805 1932
1806 /* Initialize ftrace_event_call */ 1933 /* Initialize ftrace_event_call */
1807 INIT_LIST_HEAD(&call->class->fields); 1934 INIT_LIST_HEAD(&call->class->fields);
1808 if (probe_is_return(tp)) { 1935 if (trace_probe_is_return(tp)) {
1809 call->event.funcs = &kretprobe_funcs; 1936 call->event.funcs = &kretprobe_funcs;
1810 call->class->define_fields = kretprobe_event_define_fields; 1937 call->class->define_fields = kretprobe_event_define_fields;
1811 } else { 1938 } else {
@@ -1844,6 +1971,9 @@ static __init int init_kprobe_trace(void)
1844 struct dentry *d_tracer; 1971 struct dentry *d_tracer;
1845 struct dentry *entry; 1972 struct dentry *entry;
1846 1973
1974 if (register_module_notifier(&trace_probe_module_nb))
1975 return -EINVAL;
1976
1847 d_tracer = tracing_init_dentry(); 1977 d_tracer = tracing_init_dentry();
1848 if (!d_tracer) 1978 if (!d_tracer)
1849 return 0; 1979 return 0;
@@ -1897,12 +2027,12 @@ static __init int kprobe_trace_self_tests_init(void)
1897 warn++; 2027 warn++;
1898 } else { 2028 } else {
1899 /* Enable trace point */ 2029 /* Enable trace point */
1900 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); 2030 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
1901 if (WARN_ON_ONCE(tp == NULL)) { 2031 if (WARN_ON_ONCE(tp == NULL)) {
1902 pr_warning("error on getting new probe.\n"); 2032 pr_warning("error on getting new probe.\n");
1903 warn++; 2033 warn++;
1904 } else 2034 } else
1905 probe_event_enable(&tp->call); 2035 enable_trace_probe(tp, TP_FLAG_TRACE);
1906 } 2036 }
1907 2037
1908 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 2038 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
@@ -1912,12 +2042,12 @@ static __init int kprobe_trace_self_tests_init(void)
1912 warn++; 2042 warn++;
1913 } else { 2043 } else {
1914 /* Enable trace point */ 2044 /* Enable trace point */
1915 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); 2045 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
1916 if (WARN_ON_ONCE(tp == NULL)) { 2046 if (WARN_ON_ONCE(tp == NULL)) {
1917 pr_warning("error on getting new probe.\n"); 2047 pr_warning("error on getting new probe.\n");
1918 warn++; 2048 warn++;
1919 } else 2049 } else
1920 probe_event_enable(&tp->call); 2050 enable_trace_probe(tp, TP_FLAG_TRACE);
1921 } 2051 }
1922 2052
1923 if (warn) 2053 if (warn)
@@ -1938,7 +2068,7 @@ static __init int kprobe_trace_self_tests_init(void)
1938 } 2068 }
1939 2069
1940end: 2070end:
1941 cleanup_all_probes(); 2071 release_all_trace_probes();
1942 if (warn) 2072 if (warn)
1943 pr_cont("NG: Some tests are failed. Please check them.\n"); 2073 pr_cont("NG: Some tests are failed. Please check them.\n");
1944 else 2074 else
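The trace_kprobe.c changes above split registration into __register_trace_probe()/__unregister_trace_probe(), add a module notifier so probes on not-yet-loaded modules are retried at MODULE_STATE_COMING, and extend the command syntax to [MOD:]KSYM. A minimal user-space sketch of defining such a module-scoped probe (illustrative only: the debugfs path is the usual default and the ext4:ext4_sync_file target is a hypothetical example; with the new -ENOENT handling the probe may only attach once the module is loaded):

/* add_module_probe.c - append a kprobe event on a module symbol */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/kprobe_events", "a");

	if (!f) {
		perror("fopen kprobe_events");
		return 1;
	}
	/* p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS] [FETCHARGS], per create_trace_probe() */
	fputs("p:myprobe ext4:ext4_sync_file\n", f);
	fclose(f);
	return 0;
}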
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e37de492a9e1..51999309a6cf 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1107,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1107{ 1107{
1108 struct stack_entry *field; 1108 struct stack_entry *field;
1109 struct trace_seq *s = &iter->seq; 1109 struct trace_seq *s = &iter->seq;
1110 int i; 1110 unsigned long *p;
1111 unsigned long *end;
1111 1112
1112 trace_assign_type(field, iter->ent); 1113 trace_assign_type(field, iter->ent);
1114 end = (unsigned long *)((long)iter->ent + iter->ent_size);
1113 1115
1114 if (!trace_seq_puts(s, "<stack trace>\n")) 1116 if (!trace_seq_puts(s, "<stack trace>\n"))
1115 goto partial; 1117 goto partial;
1116 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1118
1117 if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) 1119 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
1118 break;
1119 if (!trace_seq_puts(s, " => ")) 1120 if (!trace_seq_puts(s, " => "))
1120 goto partial; 1121 goto partial;
1121 1122
1122 if (!seq_print_ip_sym(s, field->caller[i], flags)) 1123 if (!seq_print_ip_sym(s, *p, flags))
1123 goto partial; 1124 goto partial;
1124 if (!trace_seq_puts(s, "\n")) 1125 if (!trace_seq_puts(s, "\n"))
1125 goto partial; 1126 goto partial;
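The rewritten trace_stack_print() no longer assumes a fixed FTRACE_STACK_ENTRIES array; it walks the caller list until it hits a ULONG_MAX sentinel or the end of the variable-sized entry computed from iter->ent_size. A stand-alone sketch of the same bounded-walk pattern (user-space analog, all names illustrative):

/* bounded_walk.c - stop at a ULONG_MAX sentinel or at the end of the record */
#include <limits.h>
#include <stdio.h>

static void print_callers(const unsigned long *callers, size_t record_bytes)
{
	const unsigned long *end =
		(const unsigned long *)((const char *)callers + record_bytes);
	const unsigned long *p;

	for (p = callers; p < end && *p != ULONG_MAX; p++)
		printf(" => %#lx\n", *p);
}

int main(void)
{
	unsigned long record[] = { 0xc0ffee, 0xbadcafe, ULONG_MAX, 0 };

	print_callers(record, sizeof(record));
	return 0;
}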
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index f029dd4fd2ca..e4a70c0c71b6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -227,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter)
227 graph_trace_close(iter); 227 graph_trace_close(iter);
228} 228}
229 229
230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) 230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \
231 TRACE_GRAPH_PRINT_ABS_TIME | \
232 TRACE_GRAPH_PRINT_DURATION)
231 233
232static enum print_line_t wakeup_print_line(struct trace_iterator *iter) 234static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
233{ 235{
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b0b53b8e4c25..77575b386d97 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -156,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
156{ 156{
157 long *ptr = filp->private_data; 157 long *ptr = filp->private_data;
158 unsigned long val, flags; 158 unsigned long val, flags;
159 char buf[64];
160 int ret; 159 int ret;
161 int cpu; 160 int cpu;
162 161
163 if (count >= sizeof(buf)) 162 ret = kstrtoul_from_user(ubuf, count, 10, &val);
164 return -EINVAL; 163 if (ret)
165
166 if (copy_from_user(&buf, ubuf, count))
167 return -EFAULT;
168
169 buf[count] = 0;
170
171 ret = strict_strtoul(buf, 10, &val);
172 if (ret < 0)
173 return ret; 164 return ret;
174 165
175 local_irq_save(flags); 166 local_irq_save(flags);
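stack_max_size_write() now relies on kstrtoul_from_user() instead of open-coding a bounce buffer, copy_from_user() and strict_strtoul(). A user-space analog of what that helper folds together (illustrative only; the real kernel helper is stricter about trailing characters and overflow):

/* parse_bounded_ulong.c - parse an unsigned long from a length-delimited buffer */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_ulong(const char *buf, size_t count, unsigned long *val)
{
	char tmp[64];
	char *end;

	if (count >= sizeof(tmp))
		return -EINVAL;
	memcpy(tmp, buf, count);	/* stands in for copy_from_user() */
	tmp[count] = '\0';

	errno = 0;
	*val = strtoul(tmp, &end, 10);
	if (errno || end == tmp)
		return -EINVAL;
	return 0;
}

int main(void)
{
	unsigned long v;

	if (!parse_ulong("4096\n", 5, &v))
		printf("parsed %lu\n", v);
	return 0;
}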
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3d0c56ad4792..36491cd5b7d4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -200,6 +200,7 @@ static int is_softlockup(unsigned long touch_ts)
200} 200}
201 201
202#ifdef CONFIG_HARDLOCKUP_DETECTOR 202#ifdef CONFIG_HARDLOCKUP_DETECTOR
203
203static struct perf_event_attr wd_hw_attr = { 204static struct perf_event_attr wd_hw_attr = {
204 .type = PERF_TYPE_HARDWARE, 205 .type = PERF_TYPE_HARDWARE,
205 .config = PERF_COUNT_HW_CPU_CYCLES, 206 .config = PERF_COUNT_HW_CPU_CYCLES,
@@ -209,7 +210,7 @@ static struct perf_event_attr wd_hw_attr = {
209}; 210};
210 211
211/* Callback function for perf event subsystem */ 212/* Callback function for perf event subsystem */
212static void watchdog_overflow_callback(struct perf_event *event, int nmi, 213static void watchdog_overflow_callback(struct perf_event *event,
213 struct perf_sample_data *data, 214 struct perf_sample_data *data,
214 struct pt_regs *regs) 215 struct pt_regs *regs)
215{ 216{
@@ -368,10 +369,11 @@ static int watchdog_nmi_enable(int cpu)
368 if (event != NULL) 369 if (event != NULL)
369 goto out_enable; 370 goto out_enable;
370 371
371 /* Try to register using hardware perf events */
372 wd_attr = &wd_hw_attr; 372 wd_attr = &wd_hw_attr;
373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); 373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
374 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); 374
375 /* Try to register using hardware perf events */
376 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
375 if (!IS_ERR(event)) { 377 if (!IS_ERR(event)) {
376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 378 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
377 goto out_save; 379 goto out_save;