author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2010-01-26 12:50:16 -0500
committer	Greg Kroah-Hartman <gregkh@suse.de>	2010-03-15 12:06:17 -0400
commit	21a6adcde06e129b055caa3256e65a97a2986770 (patch)
tree	56663f2682b5114b92335c7c53ce26e1449ac8cf
parent	69cb5f7cdc28a5352a03c16bbaa0a92cdf31b9d4 (diff)
perf: Reimplement frequency driven sampling
commit abd50713944c8ea9e0af5b7bffa0aacae21cc91a upstream.

There was a bug in the old period code that caused intel_pmu_enable_all() or native_write_msr_safe() to show up quite high in the profiles.

Staring at that code made my head hurt, so I rewrote it in a hopefully simpler fashion. It is now fully symmetric between tick and overflow driven adjustments and uses less data to boot.

The only complication is that it basically wants to do a u128 division. The code approximates that in a rather simple truncate-until-it-fits fashion, taking care to balance the terms while truncating.

This version does not generate that sampling artefact.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
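As a rough standalone illustration (not part of the patch), the sketch below mirrors the balanced truncate-until-it-fits logic that the new perf_calculate_period() in kernel/perf_event.c uses, and checks it against an exact 128-bit division. The names approx_period, fls64_approx and REDUCE are invented for this sketch; it is plain user-space C relying on the GCC/Clang __builtin_clzll() and unsigned __int128 extensions.

#include <stdio.h>
#include <stdint.h>

/* fls64() equivalent: 1-based index of the highest set bit, 0 for x == 0. */
static int fls64_approx(uint64_t x)
{
	return x ? 64 - __builtin_clzll(x) : 0;
}

/* period = (count * NSEC_PER_SEC) / (nsec * freq), where either product
 * may overflow 64 bits; drop low-order bits in a balanced way until a
 * plain u64/u64 division is possible. */
static uint64_t approx_period(uint64_t count, uint64_t nsec, uint64_t freq)
{
	uint64_t sec = 1000000000ULL;	/* NSEC_PER_SEC */
	uint64_t dividend, divisor;
	int count_fls = fls64_approx(count);
	int nsec_fls = fls64_approx(nsec);
	int freq_fls = fls64_approx(freq);
	int sec_fls = 30;		/* fls64(NSEC_PER_SEC) == 30 */

	/* Drop one bit from the larger term of a pair, keeping them balanced. */
#define REDUCE(a, b) do {			\
		if (a##_fls > b##_fls) {	\
			a >>= 1; a##_fls--;	\
		} else {			\
			b >>= 1; b##_fls--;	\
		}				\
	} while (0)

	/* Truncate until at least one of the two products fits in 64 bits. */
	while (count_fls + sec_fls > 64 && nsec_fls + freq_fls > 64) {
		REDUCE(nsec, freq);
		REDUCE(sec, count);
	}

	if (count_fls + sec_fls > 64) {
		divisor = nsec * freq;
		while (count_fls + sec_fls > 64) {
			REDUCE(count, sec);
			divisor >>= 1;	/* dividend lost a bit; compensate */
		}
		dividend = count * sec;
	} else {
		dividend = count * sec;
		while (nsec_fls + freq_fls > 64) {
			REDUCE(nsec, freq);
			dividend >>= 1;	/* divisor lost a bit; compensate */
		}
		divisor = nsec * freq;
	}

	return divisor ? dividend / divisor : 0;
}

int main(void)
{
	/* 2,000,000 events counted over 1 ms, with a 4000 Hz sampling target. */
	uint64_t count = 2000000, nsec = 1000000, freq = 4000;
	unsigned __int128 exact = (unsigned __int128)count * 1000000000ULL
				  / ((unsigned __int128)nsec * freq);

	printf("approx period = %llu, exact period = %llu\n",
	       (unsigned long long)approx_period(count, nsec, freq),
	       (unsigned long long)exact);
	return 0;
}

For these sample inputs no truncation is needed and both paths report a period of 500000; bits only start being dropped once count * NSEC_PER_SEC or nsec * sample_freq would overflow 64 bits, and because one bit is always taken from the larger term of a pair, the relative error stays small.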
-rw-r--r--	include/linux/perf_event.h	5
-rw-r--r--	kernel/perf_event.c	132
2 files changed, 94 insertions(+), 43 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a177698d95e2..c8ea0c77a625 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -496,9 +496,8 @@ struct hw_perf_event {
 	atomic64_t		period_left;
 	u64			interrupts;
 
-	u64			freq_count;
-	u64			freq_interrupts;
-	u64			freq_stamp;
+	u64			freq_time_stamp;
+	u64			freq_count_stamp;
 #endif
 };
 
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2fa4301ffadb..b707465b0613 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1350,14 +1350,83 @@ static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 
 static void perf_log_throttle(struct perf_event *event, int enable);
 
-static void perf_adjust_period(struct perf_event *event, u64 events)
+static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
+{
+	u64 frequency = event->attr.sample_freq;
+	u64 sec = NSEC_PER_SEC;
+	u64 divisor, dividend;
+
+	int count_fls, nsec_fls, frequency_fls, sec_fls;
+
+	count_fls = fls64(count);
+	nsec_fls = fls64(nsec);
+	frequency_fls = fls64(frequency);
+	sec_fls = 30;
+
+	/*
+	 * We got @count in @nsec, with a target of sample_freq HZ
+	 * the target period becomes:
+	 *
+	 *             @count * 10^9
+	 * period = -------------------
+	 *          @nsec * sample_freq
+	 *
+	 */
+
+	/*
+	 * Reduce accuracy by one bit such that @a and @b converge
+	 * to a similar magnitude.
+	 */
+#define REDUCE_FLS(a, b)		\
+do {					\
+	if (a##_fls > b##_fls) {	\
+		a >>= 1;		\
+		a##_fls--;		\
+	} else {			\
+		b >>= 1;		\
+		b##_fls--;		\
+	}				\
+} while (0)
+
+	/*
+	 * Reduce accuracy until either term fits in a u64, then proceed with
+	 * the other, so that finally we can do a u64/u64 division.
+	 */
+	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
+		REDUCE_FLS(nsec, frequency);
+		REDUCE_FLS(sec, count);
+	}
+
+	if (count_fls + sec_fls > 64) {
+		divisor = nsec * frequency;
+
+		while (count_fls + sec_fls > 64) {
+			REDUCE_FLS(count, sec);
+			divisor >>= 1;
+		}
+
+		dividend = count * sec;
+	} else {
+		dividend = count * sec;
+
+		while (nsec_fls + frequency_fls > 64) {
+			REDUCE_FLS(nsec, frequency);
+			dividend >>= 1;
+		}
+
+		divisor = nsec * frequency;
+	}
+
+	return div64_u64(dividend, divisor);
+}
+
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	u64 period, sample_period;
 	s64 delta;
 
-	events *= hwc->sample_period;
-	period = div64_u64(events, event->attr.sample_freq);
+	period = perf_calculate_period(event, nsec, count);
 
 	delta = (s64)(period - hwc->sample_period);
 	delta = (delta + 7) / 8; /* low pass filter */
@@ -1368,13 +1437,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
 		sample_period = 1;
 
 	hwc->sample_period = sample_period;
+
+	if (atomic64_read(&hwc->period_left) > 8*sample_period) {
+		perf_disable();
+		event->pmu->disable(event);
+		atomic64_set(&hwc->period_left, 0);
+		event->pmu->enable(event);
+		perf_enable();
+	}
 }
 
 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 {
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
-	u64 interrupts, freq;
+	u64 interrupts, now;
+	s64 delta;
 
 	raw_spin_lock(&ctx->lock);
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1395,44 +1473,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(event, 1);
 			event->pmu->unthrottle(event);
-			interrupts = 2*sysctl_perf_event_sample_rate/HZ;
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
 			continue;
 
-		/*
-		 * if the specified freq < HZ then we need to skip ticks
-		 */
-		if (event->attr.sample_freq < HZ) {
-			freq = event->attr.sample_freq;
-
-			hwc->freq_count += freq;
-			hwc->freq_interrupts += interrupts;
-
-			if (hwc->freq_count < HZ)
-				continue;
-
-			interrupts = hwc->freq_interrupts;
-			hwc->freq_interrupts = 0;
-			hwc->freq_count -= HZ;
-		} else
-			freq = HZ;
-
-		perf_adjust_period(event, freq * interrupts);
+		event->pmu->read(event);
+		now = atomic64_read(&event->count);
+		delta = now - hwc->freq_count_stamp;
+		hwc->freq_count_stamp = now;
 
-		/*
-		 * In order to avoid being stalled by an (accidental) huge
-		 * sample period, force reset the sample period if we didn't
-		 * get any events in this freq period.
-		 */
-		if (!interrupts) {
-			perf_disable();
-			event->pmu->disable(event);
-			atomic64_set(&hwc->period_left, 0);
-			event->pmu->enable(event);
-			perf_enable();
-		}
+		if (delta > 0)
+			perf_adjust_period(event, TICK_NSEC, delta);
 	}
 	raw_spin_unlock(&ctx->lock);
 }
@@ -3688,12 +3740,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 
 	if (event->attr.freq) {
 		u64 now = perf_clock();
-		s64 delta = now - hwc->freq_stamp;
+		s64 delta = now - hwc->freq_time_stamp;
 
-		hwc->freq_stamp = now;
+		hwc->freq_time_stamp = now;
 
-		if (delta > 0 && delta < TICK_NSEC)
-			perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
+		if (delta > 0 && delta < 2*TICK_NSEC)
+			perf_adjust_period(event, delta, hwc->last_period);
 	}
 
 	/*