author     Peter Zijlstra <peterz@infradead.org>    2017-09-05 08:16:28 -0400
committer  Ingo Molnar <mingo@kernel.org>           2017-10-27 04:31:59 -0400
commit     0d3d73aac2ff05c78387aa9dcc2c8aa3804405e7 (patch)
tree       829a0fcdecec3ba45aac39380fdc8e31acfc286a
parent     0c1cbc18df9e38182a0604b15535699c84d7342a (diff)
perf/core: Rewrite event timekeeping
The current event timekeeping, which computes enabled and running times, uses
3 distinct timestamps to reflect the various event states: OFF (stopped),
INACTIVE (enabled) and ACTIVE (running).

Furthermore, the update rules are such that even INACTIVE events need their
timestamps updated. This is undesirable because we'd like to not touch
INACTIVE events if at all possible; it makes event scheduling (much) more
expensive than needed.

Rewrite the timekeeping to directly use event->state. This greatly simplifies
the code and results in only having to update things when we change state, or
when an up-to-date value is requested (read).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--   include/linux/perf_event.h    19
-rw-r--r--   kernel/events/core.c         385
2 files changed, 130 insertions, 274 deletions
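To make the new scheme concrete before reading the diff, here is a minimal
user-space sketch of the state machine this patch introduces. It mirrors the
logic of __perf_effective_state(), __perf_update_times() and
perf_event_set_state() from the patch below; the trimmed-down struct event,
the plain integer clock and main() are simplified stand-ins for illustration,
not kernel code.

	#include <stdio.h>
	#include <stdint.h>

	enum state { STATE_OFF = -1, STATE_INACTIVE = 0, STATE_ACTIVE = 1 };

	struct event {
		struct event *group_leader;	/* points to itself for a leader */
		enum state state;
		uint64_t tstamp;		/* time of the last state change */
		uint64_t total_time_enabled;
		uint64_t total_time_running;
	};

	/* If the group leader is OFF, the whole group is effectively OFF. */
	static enum state effective_state(const struct event *e)
	{
		if (e->group_leader->state <= STATE_OFF)
			return e->group_leader->state;
		return e->state;
	}

	/* Fold the time since the last state change into the right counters. */
	static void update_times(struct event *e, uint64_t now)
	{
		uint64_t delta = now - e->tstamp;

		if (effective_state(e) >= STATE_INACTIVE)
			e->total_time_enabled += delta;
		if (effective_state(e) >= STATE_ACTIVE)
			e->total_time_running += delta;
		e->tstamp = now;
	}

	/* State changes are the only points where timestamps must be touched. */
	static void set_state(struct event *e, enum state state, uint64_t now)
	{
		if (e->state == state)
			return;
		update_times(e, now);
		e->state = state;
	}

	int main(void)
	{
		struct event ev = { .state = STATE_OFF };
		ev.group_leader = &ev;

		set_state(&ev, STATE_INACTIVE, 100);	/* enabled at t=100 */
		set_state(&ev, STATE_ACTIVE, 150);	/* scheduled in at t=150 */
		set_state(&ev, STATE_INACTIVE, 180);	/* scheduled out at t=180 */
		update_times(&ev, 200);			/* read at t=200 */

		printf("enabled=%llu running=%llu\n",
		       (unsigned long long)ev.total_time_enabled,
		       (unsigned long long)ev.total_time_running);
		return 0;
	}

Compiled with any C compiler this prints enabled=100 running=30: the event was
enabled for the whole 100..200 window but only ran from 150 to 180, and nothing
except a state change or an explicit read ever touches the timestamps.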
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b7532650de47..874b71a70058 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -588,26 +588,10 @@ struct perf_event {
 	 * has been enabled (i.e. eligible to run, and the task has
 	 * been scheduled in, if this is a per-task event)
 	 * and running (scheduled onto the CPU), respectively.
-	 *
-	 * They are computed from tstamp_enabled, tstamp_running and
-	 * tstamp_stopped when the event is in INACTIVE or ACTIVE state.
 	 */
 	u64				total_time_enabled;
 	u64				total_time_running;
-
-	/*
-	 * These are timestamps used for computing total_time_enabled
-	 * and total_time_running when the event is in INACTIVE or
-	 * ACTIVE state, measured in nanoseconds from an arbitrary point
-	 * in time.
-	 * tstamp_enabled: the notional time when the event was enabled
-	 * tstamp_running: the notional time when the event was scheduled on
-	 * tstamp_stopped: in INACTIVE state, the notional time when the
-	 *		   event was scheduled off.
-	 */
-	u64				tstamp_enabled;
-	u64				tstamp_running;
-	u64				tstamp_stopped;
+	u64				tstamp;
 
 	/*
 	 * timestamp shadows the actual context timing but it can
@@ -699,7 +683,6 @@ struct perf_event {
 
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp; /* cgroup event is attach to */
-	int				cgrp_defer_enabled;
 #endif
 
 	struct list_head		sb_list;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6f74f9c35490..2551e8ce7224 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -582,6 +582,88 @@ static inline u64 perf_event_clock(struct perf_event *event)
 	return event->clock();
 }
 
+/*
+ * State based event timekeeping...
+ *
+ * The basic idea is to use event->state to determine which (if any) time
+ * fields to increment with the current delta. This means we only need to
+ * update timestamps when we change state or when they are explicitly requested
+ * (read).
+ *
+ * Event groups make things a little more complicated, but not terribly so. The
+ * rules for a group are that if the group leader is OFF the entire group is
+ * OFF, irrespecive of what the group member states are. This results in
+ * __perf_effective_state().
+ *
+ * A futher ramification is that when a group leader flips between OFF and
+ * !OFF, we need to update all group member times.
+ *
+ *
+ * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
+ * need to make sure the relevant context time is updated before we try and
+ * update our timestamps.
+ */
+
+static __always_inline enum perf_event_state
+__perf_effective_state(struct perf_event *event)
+{
+	struct perf_event *leader = event->group_leader;
+
+	if (leader->state <= PERF_EVENT_STATE_OFF)
+		return leader->state;
+
+	return event->state;
+}
+
+static __always_inline void
+__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
+{
+	enum perf_event_state state = __perf_effective_state(event);
+	u64 delta = now - event->tstamp;
+
+	*enabled = event->total_time_enabled;
+	if (state >= PERF_EVENT_STATE_INACTIVE)
+		*enabled += delta;
+
+	*running = event->total_time_running;
+	if (state >= PERF_EVENT_STATE_ACTIVE)
+		*running += delta;
+}
+
+static void perf_event_update_time(struct perf_event *event)
+{
+	u64 now = perf_event_time(event);
+
+	__perf_update_times(event, now, &event->total_time_enabled,
+					&event->total_time_running);
+	event->tstamp = now;
+}
+
+static void perf_event_update_sibling_time(struct perf_event *leader)
+{
+	struct perf_event *sibling;
+
+	list_for_each_entry(sibling, &leader->sibling_list, group_entry)
+		perf_event_update_time(sibling);
+}
+
+static void
+perf_event_set_state(struct perf_event *event, enum perf_event_state state)
+{
+	if (event->state == state)
+		return;
+
+	perf_event_update_time(event);
+	/*
+	 * If a group leader gets enabled/disabled all its siblings
+	 * are affected too.
+	 */
+	if ((event->state < 0) ^ (state < 0))
+		perf_event_update_sibling_time(event);
+
+	WRITE_ONCE(event->state, state);
+}
+
 #ifdef CONFIG_CGROUP_PERF
 
 static inline bool
@@ -841,40 +923,6 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 	event->shadow_ctx_time = now - t->timestamp;
 }
 
-static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
-	/*
-	 * when the current task's perf cgroup does not match
-	 * the event's, we need to remember to call the
-	 * perf_mark_enable() function the first time a task with
-	 * a matching perf cgroup is scheduled in.
-	 */
-	if (is_cgroup_event(event) && !perf_cgroup_match(event))
-		event->cgrp_defer_enabled = 1;
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
-			 struct perf_event_context *ctx)
-{
-	struct perf_event *sub;
-	u64 tstamp = perf_event_time(event);
-
-	if (!event->cgrp_defer_enabled)
-		return;
-
-	event->cgrp_defer_enabled = 0;
-
-	event->tstamp_enabled = tstamp - event->total_time_enabled;
-	list_for_each_entry(sub, &event->sibling_list, group_entry) {
-		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
-			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
-			sub->cgrp_defer_enabled = 0;
-		}
-	}
-}
-
 /*
  * Update cpuctx->cgrp so that it is set when first cgroup event is added and
  * cleared when last cgroup event is removed.
@@ -973,17 +1021,6 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
 }
 
 static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
-			 struct perf_event_context *ctx)
-{
-}
-
-static inline void
 list_update_cgroup_event(struct perf_event *event,
 			 struct perf_event_context *ctx, bool add)
 {
@@ -1396,60 +1433,6 @@ static u64 perf_event_time(struct perf_event *event)
 	return ctx ? ctx->time : 0;
 }
 
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- */
-static void update_event_times(struct perf_event *event)
-{
-	struct perf_event_context *ctx = event->ctx;
-	u64 run_end;
-
-	lockdep_assert_held(&ctx->lock);
-
-	if (event->state < PERF_EVENT_STATE_INACTIVE ||
-	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
-		return;
-
-	/*
-	 * in cgroup mode, time_enabled represents
-	 * the time the event was enabled AND active
-	 * tasks were in the monitored cgroup. This is
-	 * independent of the activity of the context as
-	 * there may be a mix of cgroup and non-cgroup events.
-	 *
-	 * That is why we treat cgroup events differently
-	 * here.
-	 */
-	if (is_cgroup_event(event))
-		run_end = perf_cgroup_event_time(event);
-	else if (ctx->is_active)
-		run_end = ctx->time;
-	else
-		run_end = event->tstamp_stopped;
-
-	event->total_time_enabled = run_end - event->tstamp_enabled;
-
-	if (event->state == PERF_EVENT_STATE_INACTIVE)
-		run_end = event->tstamp_stopped;
-	else
-		run_end = perf_event_time(event);
-
-	event->total_time_running = run_end - event->tstamp_running;
-
-}
-
-/*
- * Update total_time_enabled and total_time_running for all events in a group.
- */
-static void update_group_times(struct perf_event *leader)
-{
-	struct perf_event *event;
-
-	update_event_times(leader);
-	list_for_each_entry(event, &leader->sibling_list, group_entry)
-		update_event_times(event);
-}
-
 static enum event_type_t get_event_type(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -1492,6 +1475,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
 	event->attach_state |= PERF_ATTACH_CONTEXT;
 
+	event->tstamp = perf_event_time(event);
+
 	/*
 	 * If we're a stand alone event or group leader, we go to the context
 	 * list, group events are kept attached to the group so that
@@ -1699,8 +1684,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (event->group_leader == event)
 		list_del_init(&event->group_entry);
 
-	update_group_times(event);
-
 	/*
 	 * If event was in error state, then keep it
 	 * that way, otherwise bogus counts will be
@@ -1709,7 +1692,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * of the event
 	 */
 	if (event->state > PERF_EVENT_STATE_OFF)
-		event->state = PERF_EVENT_STATE_OFF;
+		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
 
 	ctx->generation++;
 }
@@ -1808,38 +1791,24 @@ event_sched_out(struct perf_event *event,
 		struct perf_cpu_context *cpuctx,
 		struct perf_event_context *ctx)
 {
-	u64 tstamp = perf_event_time(event);
-	u64 delta;
+	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
 
 	WARN_ON_ONCE(event->ctx != ctx);
 	lockdep_assert_held(&ctx->lock);
 
-	/*
-	 * An event which could not be activated because of
-	 * filter mismatch still needs to have its timings
-	 * maintained, otherwise bogus information is return
-	 * via read() for time_enabled, time_running:
-	 */
-	if (event->state == PERF_EVENT_STATE_INACTIVE &&
-	    !event_filter_match(event)) {
-		delta = tstamp - event->tstamp_stopped;
-		event->tstamp_running += delta;
-		event->tstamp_stopped = tstamp;
-	}
-
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return;
 
 	perf_pmu_disable(event->pmu);
 
-	event->tstamp_stopped = tstamp;
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
-	event->state = PERF_EVENT_STATE_INACTIVE;
+
 	if (event->pending_disable) {
 		event->pending_disable = 0;
-		event->state = PERF_EVENT_STATE_OFF;
+		state = PERF_EVENT_STATE_OFF;
 	}
+	perf_event_set_state(event, state);
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu--;
@@ -1859,7 +1828,9 @@ group_sched_out(struct perf_event *group_event,
 		struct perf_event_context *ctx)
 {
 	struct perf_event *event;
-	int state = group_event->state;
+
+	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+		return;
 
 	perf_pmu_disable(ctx->pmu);
 
@@ -1873,7 +1844,7 @@ group_sched_out(struct perf_event *group_event,
 
 	perf_pmu_enable(ctx->pmu);
 
-	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
+	if (group_event->attr.exclusive)
 		cpuctx->exclusive = 0;
 }
 
@@ -1965,12 +1936,12 @@ static void __perf_event_disable(struct perf_event *event,
 		update_cgrp_time_from_event(event);
 	}
 
-	update_group_times(event);
 	if (event == event->group_leader)
 		group_sched_out(event, cpuctx, ctx);
 	else
 		event_sched_out(event, cpuctx, ctx);
-	event->state = PERF_EVENT_STATE_OFF;
+
+	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
 }
 
 /*
@@ -2027,8 +1998,7 @@ void perf_event_disable_inatomic(struct perf_event *event)
 }
 
 static void perf_set_shadow_time(struct perf_event *event,
-				 struct perf_event_context *ctx,
-				 u64 tstamp)
+				 struct perf_event_context *ctx)
 {
 	/*
 	 * use the correct time source for the time snapshot
@@ -2056,9 +2026,9 @@ static void perf_set_shadow_time(struct perf_event *event,
 	 * is cleaner and simpler to understand.
 	 */
 	if (is_cgroup_event(event))
-		perf_cgroup_set_shadow_time(event, tstamp);
+		perf_cgroup_set_shadow_time(event, event->tstamp);
 	else
-		event->shadow_ctx_time = tstamp - ctx->timestamp;
+		event->shadow_ctx_time = event->tstamp - ctx->timestamp;
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -2071,7 +2041,6 @@ event_sched_in(struct perf_event *event,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_event_context *ctx)
 {
-	u64 tstamp = perf_event_time(event);
 	int ret = 0;
 
 	lockdep_assert_held(&ctx->lock);
@@ -2086,7 +2055,7 @@ event_sched_in(struct perf_event *event,
 	 * ->oncpu if it sees ACTIVE.
 	 */
 	smp_wmb();
-	WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
+	perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
 
 	/*
 	 * Unthrottle events, since we scheduled we might have missed several
@@ -2100,19 +2069,17 @@ event_sched_in(struct perf_event *event,
 
 	perf_pmu_disable(event->pmu);
 
-	perf_set_shadow_time(event, ctx, tstamp);
+	perf_set_shadow_time(event, ctx);
 
 	perf_log_itrace_start(event);
 
 	if (event->pmu->add(event, PERF_EF_START)) {
-		event->state = PERF_EVENT_STATE_INACTIVE;
+		perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 		event->oncpu = -1;
 		ret = -EAGAIN;
 		goto out;
 	}
 
-	event->tstamp_running += tstamp - event->tstamp_stopped;
-
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	if (!ctx->nr_active++)
@@ -2136,8 +2103,6 @@ group_sched_in(struct perf_event *group_event,
 {
 	struct perf_event *event, *partial_group = NULL;
 	struct pmu *pmu = ctx->pmu;
-	u64 now = ctx->time;
-	bool simulate = false;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
@@ -2167,27 +2132,13 @@ group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
-	 * The events up to the failed event are scheduled out normally,
-	 * tstamp_stopped will be updated.
-	 *
-	 * The failed events and the remaining siblings need to have
-	 * their timings updated as if they had gone thru event_sched_in()
-	 * and event_sched_out(). This is required to get consistent timings
-	 * across the group. This also takes care of the case where the group
-	 * could never be scheduled by ensuring tstamp_stopped is set to mark
-	 * the time the event was actually stopped, such that time delta
-	 * calculation in update_event_times() is correct.
+	 * The events up to the failed event are scheduled out normally.
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 		if (event == partial_group)
-			simulate = true;
+			break;
 
-		if (simulate) {
-			event->tstamp_running += now - event->tstamp_stopped;
-			event->tstamp_stopped = now;
-		} else {
-			event_sched_out(event, cpuctx, ctx);
-		}
+		event_sched_out(event, cpuctx, ctx);
 	}
 	event_sched_out(group_event, cpuctx, ctx);
 
@@ -2229,46 +2180,11 @@ static int group_can_go_on(struct perf_event *event,
 	return can_add_hw;
 }
 
-/*
- * Complement to update_event_times(). This computes the tstamp_* values to
- * continue 'enabled' state from @now, and effectively discards the time
- * between the prior tstamp_stopped and now (as we were in the OFF state, or
- * just switched (context) time base).
- *
- * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
- * cannot have been scheduled in yet. And going into INACTIVE state means
- * '@event->tstamp_stopped = @now'.
- *
- * Thus given the rules of update_event_times():
- *
- *   total_time_enabled = tstamp_stopped - tstamp_enabled
- *   total_time_running = tstamp_stopped - tstamp_running
- *
- * We can insert 'tstamp_stopped == now' and reverse them to compute new
- * tstamp_* values.
- */
-static void __perf_event_enable_time(struct perf_event *event, u64 now)
-{
-	WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
-
-	event->tstamp_stopped = now;
-	event->tstamp_enabled = now - event->total_time_enabled;
-	event->tstamp_running = now - event->total_time_running;
-}
-
 static void add_event_to_ctx(struct perf_event *event,
 			     struct perf_event_context *ctx)
 {
-	u64 tstamp = perf_event_time(event);
-
 	list_add_event(event, ctx);
 	perf_group_attach(event);
-	/*
-	 * We can be called with event->state == STATE_OFF when we create with
-	 * .disabled = 1. In that case the IOC_ENABLE will call this function.
-	 */
-	if (event->state == PERF_EVENT_STATE_INACTIVE)
-		__perf_event_enable_time(event, tstamp);
 }
 
 static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2500,28 +2416,6 @@ again:
 }
 
 /*
- * Put a event into inactive state and update time fields.
- * Enabling the leader of a group effectively enables all
- * the group members that aren't explicitly disabled, so we
- * have to update their ->tstamp_enabled also.
- * Note: this works for group members as well as group leaders
- * since the non-leader members' sibling_lists will be empty.
- */
-static void __perf_event_mark_enabled(struct perf_event *event)
-{
-	struct perf_event *sub;
-	u64 tstamp = perf_event_time(event);
-
-	event->state = PERF_EVENT_STATE_INACTIVE;
-	__perf_event_enable_time(event, tstamp);
-	list_for_each_entry(sub, &event->sibling_list, group_entry) {
-		/* XXX should not be > INACTIVE if event isn't */
-		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
-			__perf_event_enable_time(sub, tstamp);
-	}
-}
-
-/*
  * Cross CPU call to enable a performance event
  */
 static void __perf_event_enable(struct perf_event *event,
@@ -2539,14 +2433,12 @@ static void __perf_event_enable(struct perf_event *event,
 	if (ctx->is_active)
 		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 
-	__perf_event_mark_enabled(event);
+	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 
 	if (!ctx->is_active)
 		return;
 
 	if (!event_filter_match(event)) {
-		if (is_cgroup_event(event))
-			perf_cgroup_defer_enabled(event);
 		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
 	}
@@ -2866,18 +2758,10 @@ static void __perf_event_sync_stat(struct perf_event *event,
 	 * we know the event must be on the current CPU, therefore we
 	 * don't need to use it.
 	 */
-	switch (event->state) {
-	case PERF_EVENT_STATE_ACTIVE:
+	if (event->state == PERF_EVENT_STATE_ACTIVE)
 		event->pmu->read(event);
-		/* fall-through */
 
-	case PERF_EVENT_STATE_INACTIVE:
-		update_event_times(event);
-		break;
-
-	default:
-		break;
-	}
+	perf_event_update_time(event);
 
 	/*
 	 * In order to keep per-task stats reliable we need to flip the event
@@ -3114,10 +2998,6 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 		if (!event_filter_match(event))
 			continue;
 
-		/* may need to reset tstamp_enabled */
-		if (is_cgroup_event(event))
-			perf_cgroup_mark_enabled(event, ctx);
-
 		if (group_can_go_on(event, cpuctx, 1))
 			group_sched_in(event, cpuctx, ctx);
 
@@ -3125,10 +3005,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 		 * If this pinned group hasn't been scheduled,
 		 * put it in error state.
 		 */
-		if (event->state == PERF_EVENT_STATE_INACTIVE) {
-			update_group_times(event);
-			event->state = PERF_EVENT_STATE_ERROR;
-		}
+		if (event->state == PERF_EVENT_STATE_INACTIVE)
+			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 	}
 }
 
@@ -3150,10 +3028,6 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		if (!event_filter_match(event))
 			continue;
 
-		/* may need to reset tstamp_enabled */
-		if (is_cgroup_event(event))
-			perf_cgroup_mark_enabled(event, ctx);
-
 		if (group_can_go_on(event, cpuctx, can_add_hw)) {
 			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
@@ -3545,7 +3419,7 @@ static int event_enable_on_exec(struct perf_event *event,
 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	__perf_event_mark_enabled(event);
+	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 
 	return 1;
 }
@@ -3644,10 +3518,9 @@ static void __perf_event_read(void *info)
 		update_cgrp_time_from_event(event);
 	}
 
-	if (!data->group)
-		update_event_times(event);
-	else
-		update_group_times(event);
+	perf_event_update_time(event);
+	if (data->group)
+		perf_event_update_sibling_time(event);
 
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		goto unlock;
@@ -3696,7 +3569,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 {
 	unsigned long flags;
 	int ret = 0;
-	u64 now;
 
 	/*
 	 * Disabling interrupts avoids all counter scheduling (context
@@ -3727,23 +3599,26 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 		goto out;
 	}
 
-	now = event->shadow_ctx_time + perf_clock();
-	if (enabled)
-		*enabled = now - event->tstamp_enabled;
+
 	/*
 	 * If the event is currently on this CPU, its either a per-task event,
 	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
 	 * oncpu == -1).
 	 */
-	if (event->oncpu == smp_processor_id()) {
+	if (event->oncpu == smp_processor_id())
 		event->pmu->read(event);
-		if (running)
-			*running = now - event->tstamp_running;
-	} else if (running) {
-		*running = event->total_time_running;
-	}
 
 	*value = local64_read(&event->count);
+	if (enabled || running) {
+		u64 now = event->shadow_ctx_time + perf_clock();
+		u64 __enabled, __running;
+
+		__perf_update_times(event, now, &__enabled, &__running);
+		if (enabled)
+			*enabled = __enabled;
+		if (running)
+			*running = __running;
+	}
 out:
 	local_irq_restore(flags);
 
@@ -3818,10 +3693,9 @@ again:
 		update_cgrp_time_from_event(event);
 	}
 
+	perf_event_update_time(event);
 	if (group)
-		update_group_times(event);
-	else
-		update_event_times(event);
+		perf_event_update_sibling_time(event);
 	raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 
@@ -4945,8 +4819,7 @@ static void calc_timer_values(struct perf_event *event,
 
 	*now = perf_clock();
 	ctx_time = event->shadow_ctx_time + *now;
-	*enabled = ctx_time - event->tstamp_enabled;
-	*running = ctx_time - event->tstamp_running;
+	__perf_update_times(event, ctx_time, enabled, running);
 }
 
 static void perf_event_init_userpage(struct perf_event *event)
@@ -10581,7 +10454,7 @@ perf_event_exit_event(struct perf_event *child_event,
 	if (parent_event)
 		perf_group_detach(child_event);
 	list_del_event(child_event, child_ctx);
-	child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+	perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
 	raw_spin_unlock_irq(&child_ctx->lock);
 
 	/*