powerpc/perf_counter: Add generic support for POWER-family PMU hardware

This provides the architecture-specific functions needed to access PMU hardware on the 64-bit PowerPC processors. It has been designed for the IBM POWER family (POWER 4/4+/5/5+/6 and PPC970) but will hopefully also suit other 64-bit PowerPC machines (although probably not Cell given how different it is in this area). This doesn't include back-ends for any specific processors. This implements a system which allows back-ends to express the constraints that their hardware has on what events can be counted simultaneously. The constraints are expressed as a 64-bit mask + 64-bit value for each event, and the encoding is capable of expressing the constraints arising from having a set of multiplexers feeding an event bus, with some events being available through multiple multiplexer settings, such as we get on POWER4 and PPC970. Furthermore, the back-end can supply alternative event codes for each event, and the constraint checking code will try all possible combinations of alternative event codes to try to find a combination that will fit. Signed-off-by: Paul Mackerras <paulus@samba.org>
author: Paul Mackerras <paulus@samba.org> 2009-01-09 04:21:55 -0500
committer: Paul Mackerras <paulus@samba.org> 2009-01-10 00:32:05 -0500
commit: 4574910e5087085a1f330ff8373cee4503f5c77c (patch)
tree: a3bb6c974c2314ca91ee2e3a33a7283187ad19ea /arch/powerpc/kernel/perf_counter.c
parent: 93a6d3ce6962044fe9badf528fed46b455d58292 (diff)
1 files changed, 754 insertions, 0 deletions
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
new file mode 100644
index 000000000000..c7d4c2966a5c
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,754 @@
+/*
+ * Performance counter support - powerpc architecture code
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_counter.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/reg.h>
+#include <asm/pmc.h>
+struct cpu_hw_counters {
+        int n_counters;         /* number of entries in use in counter[] and events[] */
+        int n_percpu;           /* NOTE(review): not referenced by the code visible here */
+        int disabled;           /* set to 1 by hw_perf_save_disable(), cleared by hw_perf_restore() */
+        int n_added;            /* counters added since hw_perf_save_disable() last froze the PMU */
+        struct perf_counter *counter[MAX_HWCOUNTERS];   /* counters queued for this CPU's PMU */
+        unsigned int events[MAX_HWCOUNTERS];            /* event code stored alongside counter[i] by the add paths */
+        u64 mmcr[3];            /* values written to MMCR0, MMCR1 and MMCRA respectively */
+};
+DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+struct power_pmu *ppmu;         /* PMU back-end; hw_perf_counter_init() rejects counters while this is NULL */
+void perf_counter_print_debug(void)
+{       /* empty stub: no debug output is produced by this implementation */
+}
+/*
+ * Classify a counter: returns 1 when it is a software counter (not a
+ * raw event and its event type is negative), 0 when it is a hardware
+ * counter.
+ */
+static inline int is_software_counter(struct perf_counter *counter)
+{
+        if (counter->hw_event.raw)
+                return 0;
+        return counter->hw_event.type < 0;
+}
+/*
+ * Fetch the current value of performance monitor counter PMC<idx>.
+ * Indices 1..8 are handled; any other index is logged and reads as 0.
+ */
+static unsigned long read_pmc(int idx)
+{
+        switch (idx) {
+        case 1:
+                return mfspr(SPRN_PMC1);
+        case 2:
+                return mfspr(SPRN_PMC2);
+        case 3:
+                return mfspr(SPRN_PMC3);
+        case 4:
+                return mfspr(SPRN_PMC4);
+        case 5:
+                return mfspr(SPRN_PMC5);
+        case 6:
+                return mfspr(SPRN_PMC6);
+        case 7:
+                return mfspr(SPRN_PMC7);
+        case 8:
+                return mfspr(SPRN_PMC8);
+        }
+        /* not a PMC number we know how to access */
+        printk(KERN_ERR "oops trying to read PMC%d\n", idx);
+        return 0;
+}
+/*
+ * Store val into performance monitor counter PMC<idx>.
+ * Indices 1..8 are handled; any other index is logged and ignored.
+ */
+static void write_pmc(int idx, unsigned long val)
+{
+        switch (idx) {
+        case 1:
+                mtspr(SPRN_PMC1, val);
+                return;
+        case 2:
+                mtspr(SPRN_PMC2, val);
+                return;
+        case 3:
+                mtspr(SPRN_PMC3, val);
+                return;
+        case 4:
+                mtspr(SPRN_PMC4, val);
+                return;
+        case 5:
+                mtspr(SPRN_PMC5, val);
+                return;
+        case 6:
+                mtspr(SPRN_PMC6, val);
+                return;
+        case 7:
+                mtspr(SPRN_PMC7, val);
+                return;
+        case 8:
+                mtspr(SPRN_PMC8, val);
+                return;
+        }
+        /* not a PMC number we know how to access */
+        printk(KERN_ERR "oops trying to write PMC%d\n", idx);
+}
+/*
+ * Check if a set of events can all go on the PMU at once.
+ * If they can't, this will look at alternative codes for the events
+ * and see if any combination of alternative codes is feasible.
+ * The feasible set is returned in event[].
+ *
+ * Returns 0 when a feasible combination exists and -1 otherwise:
+ * more events than ppmu->n_counter, a get_constraint() failure on one
+ * of the codes passed in, no get_alternatives() hook in the back-end,
+ * or no combination of alternatives that passes the check.
+ * event[] is only rewritten when the alternative search succeeds; if
+ * the codes fit as given it is left untouched.
+ */
+static int power_check_constraints(unsigned int event[], int n_ev)
+{
+        u64 mask, value, nv;
+        unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+        u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+        u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+        u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];    /* value/mask saved per event for backtracking */
+        int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];      /* choice[i]: alternative index picked for event i */
+        int i, j;
+        u64 addf = ppmu->add_fields;
+        u64 tadd = ppmu->test_adder;
+        if (n_ev > ppmu->n_counter)
+                return -1;
+        /* First see if the events will go on as-is */
+        for (i = 0; i < n_ev; ++i) {
+                alternatives[i][0] = event[i];
+                if (ppmu->get_constraint(event[i], &amasks[i][0],
+                                         &avalues[i][0]))
+                        return -1;
+                choice[i] = 0;
+        }
+        value = mask = 0;
+        for (i = 0; i < n_ev; ++i) {
+                nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
+                if ((((nv + tadd) ^ value) & mask) != 0 ||
+                    (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
+                        break;
+                value = nv;
+                mask |= amasks[i][0];
+        }
+        if (i == n_ev)
+                return 0;       /* all OK */
+        /* doesn't work, gather alternatives... */
+        if (!ppmu->get_alternatives)
+                return -1;
+        for (i = 0; i < n_ev; ++i) {
+                n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
+                for (j = 1; j < n_alt[i]; ++j)  /* index 0 keeps the constraint computed above */
+                        ppmu->get_constraint(alternatives[i][j],        /* NOTE(review): return value not checked here */
+                                             &amasks[i][j], &avalues[i][j]);
+        }
+        /* enumerate all possibilities and see if any will work */
+        i = 0;
+        j = -1;
+        value = mask = nv = 0;
+        while (i < n_ev) {
+                if (j >= 0) {
+                        /* we're backtracking, restore context */
+                        value = svalues[i];
+                        mask = smasks[i];
+                        j = choice[i];
+                }
+                /*
+                 * See if any alternative k for event i,
+                 * where k > j, will satisfy the constraints.
+                 */
+                while (++j < n_alt[i]) {
+                        nv = (value | avalues[i][j]) +
+                                (value & avalues[i][j] & addf);
+                        if ((((nv + tadd) ^ value) & mask) == 0 &&
+                            (((nv + tadd) ^ avalues[i][j])
+                             & amasks[i][j]) == 0)
+                                break;
+                }
+                if (j >= n_alt[i]) {
+                        /*
+                         * No feasible alternative, backtrack
+                         * to event i-1 and continue enumerating its
+                         * alternatives from where we got up to.
+                         */
+                        if (--i < 0)
+                                return -1;
+                } else {
+                        /*
+                         * Found a feasible alternative for event i,
+                         * remember where we got up to with this event,
+                         * go on to the next event, and start with
+                         * the first alternative for it.
+                         */
+                        choice[i] = j;
+                        svalues[i] = value;
+                        smasks[i] = mask;
+                        value = nv;
+                        mask |= amasks[i][j];
+                        ++i;
+                        j = -1;
+                }
+        }
+        /* OK, we have a feasible combination, tell the caller the solution */
+        for (i = 0; i < n_ev; ++i)
+                event[i] = alternatives[i][choice[i]];
+        return 0;
+}
+/*
+ * Bring counter->count up to date with the hardware PMC the counter
+ * currently occupies.  Does nothing if the counter is not on a PMC
+ * (hw.idx == 0).
+ */
+static void power_perf_read(struct perf_counter *counter)
+{
+        long now, diff, old;
+        if (counter->hw.idx == 0)
+                return;
+        /*
+         * PMU interrupts are delivered even while interrupts are
+         * soft-disabled (they only need to be hard-enabled), so they
+         * behave like NMIs here: retry until prev_count is swapped
+         * without anyone else having changed it in between.
+         */
+        for (;;) {
+                old = atomic64_read(&counter->hw.prev_count);
+                barrier();
+                now = read_pmc(counter->hw.idx);
+                if (atomic64_cmpxchg(&counter->hw.prev_count, old, now) == old)
+                        break;
+        }
+        /* PMCs are 32 bits wide, so take the difference modulo 2^32 */
+        diff = (now - old) & 0xfffffffful;
+        atomic64_add(diff, &counter->count);
+        atomic64_sub(diff, &counter->hw.period_left);
+}
+/*
+ * Freeze the PMU so that no PMU interrupts arrive and counters can be
+ * added or removed.  Returns the previous value of the per-cpu
+ * 'disabled' flag, to be handed back to hw_perf_restore().
+ */
+u64 hw_perf_save_disable(void)
+{
+        struct cpu_hw_counters *cpuhw;
+        unsigned long was_disabled;
+        unsigned long flags;
+        local_irq_save(flags);
+        cpuhw = &__get_cpu_var(cpu_hw_counters);
+        was_disabled = cpuhw->disabled;
+        if (was_disabled)
+                goto out;
+        cpuhw->disabled = 1;
+        cpuhw->n_added = 0;
+        /*
+         * Turn on the 'freeze counters' bit.  The barrier makes sure
+         * the mtspr has executed and the PMU has actually frozen the
+         * counters before we return.
+         */
+        mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
+        mb();
+ out:
+        local_irq_restore(flags);
+        return was_disabled;
+}
+/*
+ * Re-enable all counters if disable == 0.
+ * If we were previously disabled and counters were added, then
+ * put the new config on the PMU.
+ *
+ * disable is the value returned by the matching hw_perf_save_disable()
+ * call; when it is nonzero the PMU was already disabled before that
+ * call, so this function returns without touching anything.
+ */
+void hw_perf_restore(u64 disable)
+{
+        struct perf_counter *counter;
+        struct cpu_hw_counters *cpuhw;
+        unsigned long flags;
+        long i;
+        unsigned long val;
+        s64 left;
+        unsigned int hwc_index[MAX_HWCOUNTERS]; /* from compute_mmcr(); hwc_index[i] + 1 becomes counter i's PMC number */
+        if (disable)
+                return;
+        local_irq_save(flags);
+        cpuhw = &__get_cpu_var(cpu_hw_counters);
+        cpuhw->disabled = 0;
+        /*
+         * If we didn't change anything, or only removed counters,
+         * no need to recalculate MMCR* settings and reset the PMCs.
+         * Just reenable the PMU with the current MMCR* settings
+         * (possibly updated for removal of counters).
+         */
+        if (!cpuhw->n_added) {
+                mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+                mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+                mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+                goto out;
+        }
+        /*
+         * Compute MMCR* values for the new set of counters
+         */
+        if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
+                               cpuhw->mmcr)) {
+                /* shouldn't ever get here */
+                printk(KERN_ERR "oops compute_mmcr failed\n");
+                goto out;       /* NOTE(review): leaves with cpuhw->disabled cleared but MMCR0 not rewritten */
+        }
+        /*
+         * Write the new configuration to MMCR* with the freeze
+         * bit set and set the hardware counters to their initial values.
+         * Then unfreeze the counters.
+         */
+        mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+        mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+        mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
+                                | MMCR0_FC);
+        /*
+         * Read off any pre-existing counters that need to move
+         * to another PMC.
+         */
+        for (i = 0; i < cpuhw->n_counters; ++i) {
+                counter = cpuhw->counter[i];
+                if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
+                        power_perf_read(counter);
+                        write_pmc(counter->hw.idx, 0);
+                        counter->hw.idx = 0;    /* marks it for (re)initialization in the loop below */
+                }
+        }
+        /*
+         * Initialize the PMCs for all the new and moved counters.
+         */
+        for (i = 0; i < cpuhw->n_counters; ++i) {
+                counter = cpuhw->counter[i];
+                if (counter->hw.idx)
+                        continue;       /* already on the right PMC, leave its value alone */
+                val = 0;
+                if (counter->hw_event.irq_period) {
+                        left = atomic64_read(&counter->hw.period_left);
+                        if (left < 0x80000000L)
+                                val = 0x80000000L - left;       /* PMC reaches 0x80000000 after 'left' more counts */
+                }
+                atomic64_set(&counter->hw.prev_count, val);
+                counter->hw.idx = hwc_index[i] + 1;
+                write_pmc(counter->hw.idx, val);
+        }
+        mb();
+        cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+        mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+ out:
+        local_irq_restore(flags);
+}
+/*
+ * Gather the hardware counters of a group into ctrs[] and their event
+ * codes (hw.config) into events[].  The group leader is taken if it is
+ * a hardware counter; siblings are taken if they are hardware counters
+ * and not in PERF_COUNTER_STATE_OFF.
+ * Returns the number of entries stored, or -1 if more than max_count
+ * entries would be needed.
+ */
+static int collect_events(struct perf_counter *group, int max_count,
+                          struct perf_counter *ctrs[], unsigned int *events)
+{
+        struct perf_counter *sibling;
+        int count = 0;
+        if (!is_software_counter(group)) {
+                if (count >= max_count)
+                        return -1;
+                ctrs[count] = group;
+                events[count] = group->hw.config;
+                ++count;
+        }
+        list_for_each_entry(sibling, &group->sibling_list, list_entry) {
+                if (is_software_counter(sibling))
+                        continue;
+                if (sibling->state == PERF_COUNTER_STATE_OFF)
+                        continue;
+                if (count >= max_count)
+                        return -1;
+                ctrs[count] = sibling;
+                events[count] = sibling->hw.config;
+                ++count;
+        }
+        return count;
+}
+/*
+ * Mark a counter active on the given cpu.  Software counters are
+ * additionally started through their own enable hook; hardware
+ * counters are only marked here.
+ */
+static void counter_sched_in(struct perf_counter *counter, int cpu)
+{
+        counter->state = PERF_COUNTER_STATE_ACTIVE;
+        counter->oncpu = cpu;
+        if (!is_software_counter(counter))
+                return;
+        counter->hw_ops->enable(counter);
+}
+/*
+ * Schedule in a whole group of counters at once.
+ * Returns 1 when the group has been enabled, or -EAGAIN when its
+ * hardware counters do not fit on the PMU together with the counters
+ * that are already there.
+ * The caller is assumed to have interrupts disabled and the PMU frozen
+ * via hw_perf_save_disable.
+ */
+int hw_perf_group_sched_in(struct perf_counter *group_leader,
+               struct perf_cpu_context *cpuctx,
+               struct perf_counter_context *ctx, int cpu)
+{
+        struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
+        struct perf_counter *sibling;
+        long first = cpuhw->n_counters;
+        long n_new, total, slot, n_sched;
+        /* append the group's hardware counters after the existing ones */
+        n_new = collect_events(group_leader, ppmu->n_counter - first,
+                               &cpuhw->counter[first], &cpuhw->events[first]);
+        if (n_new < 0)
+                return -EAGAIN;
+        total = first + n_new;
+        if (power_check_constraints(cpuhw->events, total))
+                return -EAGAIN;
+        cpuhw->n_counters = total;
+        cpuhw->n_added += n_new;
+        /*
+         * The group fits.  Record the event codes that the constraint
+         * check settled on, then update counter states and start any
+         * software counters in the group.
+         */
+        for (slot = first; slot < total; ++slot)
+                cpuhw->counter[slot]->hw.config = cpuhw->events[slot];
+        n_sched = 1;
+        counter_sched_in(group_leader, cpu);
+        list_for_each_entry(sibling, &group_leader->sibling_list, list_entry) {
+                if (sibling->state == PERF_COUNTER_STATE_OFF)
+                        continue;
+                counter_sched_in(sibling, cpu);
+                ++n_sched;
+        }
+        cpuctx->active_oncpu += n_sched;
+        ctx->nr_active += n_sched;
+        return 1;
+}
+/*
+ * Put one counter on the PMU.
+ * The PMU is disabled and re-enabled around the update; if it was not
+ * already frozen by an outer caller, the closing hw_perf_restore does
+ * the actual reprogramming of the hardware.
+ * Returns 0 on success, or -EAGAIN if there is no free slot or the new
+ * set of events fails the constraint check.
+ */
+static int power_perf_enable(struct perf_counter *counter)
+{
+        struct cpu_hw_counters *cpuhw;
+        unsigned long flags;
+        u64 pmudis;
+        int slot;
+        int err = -EAGAIN;
+        local_irq_save(flags);
+        pmudis = hw_perf_save_disable();
+        cpuhw = &__get_cpu_var(cpu_hw_counters);
+        slot = cpuhw->n_counters;
+        /* is there room for one more? */
+        if (slot >= ppmu->n_counter)
+                goto out;
+        /* tentatively append it and see whether the whole set still fits */
+        cpuhw->counter[slot] = counter;
+        cpuhw->events[slot] = counter->hw.config;
+        if (power_check_constraints(cpuhw->events, slot + 1))
+                goto out;
+        /* the check may have substituted an alternative event code */
+        counter->hw.config = cpuhw->events[slot];
+        ++cpuhw->n_counters;
+        ++cpuhw->n_added;
+        err = 0;
+ out:
+        hw_perf_restore(pmudis);
+        local_irq_restore(flags);
+        return err;
+}
+/*
+ * Remove a counter from the PMU.
+ *
+ * The counter's final count is read, its slot is removed from the
+ * per-cpu counter[] and events[] arrays (which are kept parallel, so
+ * both are compacted together), and, if the counter actually occupied
+ * a PMC, that PMC is disabled in the saved MMCR values and zeroed.
+ * A counter that was queued but never placed on a PMC has hw.idx == 0
+ * and needs no hardware cleanup.
+ */
+static void power_perf_disable(struct perf_counter *counter)
+{
+        struct cpu_hw_counters *cpuhw;
+        long i;
+        u64 pmudis;
+        unsigned long flags;
+        local_irq_save(flags);
+        pmudis = hw_perf_save_disable();
+        power_perf_read(counter);
+        cpuhw = &__get_cpu_var(cpu_hw_counters);
+        for (i = 0; i < cpuhw->n_counters; ++i) {
+                if (counter != cpuhw->counter[i])
+                        continue;
+                /* close the gap, keeping events[] in step with counter[] */
+                while (++i < cpuhw->n_counters) {
+                        cpuhw->counter[i-1] = cpuhw->counter[i];
+                        cpuhw->events[i-1] = cpuhw->events[i];
+                }
+                --cpuhw->n_counters;
+                /* only touch the hardware if the counter was on a PMC */
+                if (counter->hw.idx) {
+                        ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
+                        write_pmc(counter->hw.idx, 0);
+                        counter->hw.idx = 0;
+                }
+                break;
+        }
+        if (cpuhw->n_counters == 0) {
+                /* disable exceptions if no counters are running */
+                cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
+        }
+        hw_perf_restore(pmudis);
+        local_irq_restore(flags);
+}
+/*
+ * Operations table returned by hw_perf_counter_init() for counters
+ * that are handled by this PMU code.
+ */
+struct hw_perf_counter_ops power_perf_ops = {
+        .enable         = power_perf_enable,
+        .disable        = power_perf_disable,
+        .read           = power_perf_read,
+};
+/*
+ * Set up the hardware-specific part of a new counter.
+ * Returns the ops table to use for it, or NULL if the counter cannot
+ * be supported: no PMU back-end registered, negative irq_period,
+ * unknown generic event, or an event that cannot be counted together
+ * with the hardware counters already in its group.
+ */
+const struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter)
+{
+        struct perf_counter *group_ctrs[MAX_HWCOUNTERS];
+        unsigned int codes[MAX_HWCOUNTERS];
+        unsigned long ev;
+        int n = 0;
+        if (!ppmu || (s64)counter->hw_event.irq_period < 0)
+                return NULL;
+        ev = counter->hw_event.type;
+        if (!counter->hw_event.raw) {
+                /* translate a generic event number into a PMU event code */
+                if (ev >= ppmu->n_generic)
+                        return NULL;
+                ev = ppmu->generic_events[ev];
+                if (ev == 0)
+                        return NULL;
+        }
+        counter->hw.config_base = ev;
+        counter->hw.idx = 0;
+        /*
+         * For a counter that belongs to a group, make sure it can be
+         * counted at the same time as the group's other hardware
+         * counters.  This assumes the counter has not yet been linked
+         * into its leader's sibling list.
+         */
+        if (counter->group_leader != counter) {
+                n = collect_events(counter->group_leader, ppmu->n_counter - 1,
+                                   group_ctrs, codes);
+                if (n < 0)
+                        return NULL;
+        }
+        codes[n] = ev;
+        ++n;
+        if (power_check_constraints(codes, n))
+                return NULL;
+        /* keep whichever alternative code the constraint check chose */
+        counter->hw.config = codes[n - 1];
+        atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
+        return &power_perf_ops;
+}
+/*
+ * Deliver deferred wakeups: clear the per-cpu pending flag and wake
+ * the waiters of every counter on this cpu that has wakeup_pending set.
+ */
+void perf_counter_do_pending(void)
+{
+        struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
+        struct perf_counter *ctr;
+        int slot;
+        set_perf_counter_pending(0);
+        for (slot = 0; slot < cpuhw->n_counters; ++slot) {
+                ctr = cpuhw->counter[slot];
+                if (!ctr || !ctr->wakeup_pending)
+                        continue;
+                ctr->wakeup_pending = 0;
+                wake_up(&ctr->waitq);
+        }
+}
+/*
+ * Append one u64 to the irq data buffer of a counter, or bump the
+ * buffer's overrun count when there is no room left for it.
+ * (Taken from the x86 code; possibly a candidate for the core.)
+ */
+static void perf_store_irq_data(struct perf_counter *counter, u64 data)
+{
+        struct perf_data *irqdata = counter->irqdata;
+        if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+                irqdata->overrun++;
+                return;
+        }
+        *(u64 *) &irqdata->data[irqdata->len] = data;
+        irqdata->len += sizeof(u64);
+}
+/*
+ * Store an (event type, count) pair in counter's irq data buffer for
+ * every counter on the sibling list of counter's group leader.
+ * Siblings other than counter itself are read first so that their
+ * counts are up to date.
+ */
+static void perf_handle_group(struct perf_counter *counter)
+{
+        struct perf_counter *member;
+        list_for_each_entry(member, &counter->group_leader->sibling_list,
+                            list_entry) {
+                if (member != counter)
+                        member->hw_ops->read(member);
+                perf_store_irq_data(counter, member->hw_event.type);
+                perf_store_irq_data(counter, atomic64_read(&member->count));
+        }
+}
+/*
+ * Service a counter that has overflowed: fold the new hardware value
+ * into its count, reload the PMC for the next period, and record
+ * data if the counter asked for it.  Interrupts are hard-disabled
+ * here, so nothing can interrupt this sequence.
+ */
+static void record_and_restart(struct perf_counter *counter, long val,
+                               struct pt_regs *regs)
+{
+        s64 last, ticks, remaining;
+        int expired = 0;
+        /* no need to guard against interrupts, see above */
+        last = atomic64_read(&counter->hw.prev_count);
+        ticks = (val - last) & 0xfffffffful;
+        atomic64_add(ticks, &counter->count);
+        /*
+         * Work out whether the counter's whole period has run out,
+         * and what the PMC should be restarted from.
+         */
+        val = 0;
+        remaining = atomic64_read(&counter->hw.period_left) - ticks;
+        if (counter->hw_event.irq_period) {
+                if (remaining <= 0) {
+                        remaining += counter->hw_event.irq_period;
+                        if (remaining <= 0)
+                                remaining = counter->hw_event.irq_period;
+                        expired = 1;
+                }
+                if (remaining < 0x80000000L)
+                        val = 0x80000000L - remaining;
+        }
+        write_pmc(counter->hw.idx, val);
+        atomic64_set(&counter->hw.prev_count, val);
+        atomic64_set(&counter->hw.period_left, remaining);
+        if (!expired)
+                return;
+        /*
+         * The period expired: record what was requested.  Only the
+         * IRQ and GROUP record types store data and ask for a wakeup.
+         */
+        switch (counter->hw_event.record_type) {
+        case PERF_RECORD_IRQ:
+                perf_store_irq_data(counter, instruction_pointer(regs));
+                break;
+        case PERF_RECORD_GROUP:
+                perf_handle_group(counter);
+                break;
+        default:
+                return;
+        }
+        counter->wakeup_pending = 1;
+}
+/*
+ * Performance monitor interrupt handler.
+ *
+ * Finds the counters on this cpu whose PMC has overflowed (value
+ * negative when viewed as a 32-bit signed number), services each one
+ * with record_and_restart(), rewrites MMCR0 from the saved value, and
+ * delivers or defers any wakeups that were requested.
+ */
+static void perf_counter_interrupt(struct pt_regs *regs)
+{
+        int i;
+        struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
+        struct perf_counter *counter;
+        long val;
+        int need_wakeup = 0, found = 0;
+        for (i = 0; i < cpuhw->n_counters; ++i) {
+                counter = cpuhw->counter[i];
+                /*
+                 * A counter that was added while the PMU was frozen has
+                 * not been given a PMC yet (hw.idx == 0); there is
+                 * nothing to read for it, and read_pmc(0) would only
+                 * log an error.
+                 */
+                if (!counter->hw.idx)
+                        continue;
+                val = read_pmc(counter->hw.idx);
+                if ((int)val < 0) {
+                        /* counter has overflowed */
+                        found = 1;
+                        record_and_restart(counter, val, regs);
+                        if (counter->wakeup_pending)
+                                need_wakeup = 1;
+                }
+        }
+        /*
+         * In case we didn't find and reset the counter that caused
+         * the interrupt, scan all counters and reset any that are
+         * negative, to avoid getting continual interrupts.
+         * Any that we processed in the previous loop will not be negative.
+         */
+        if (!found) {
+                for (i = 0; i < ppmu->n_counter; ++i) {
+                        val = read_pmc(i + 1);
+                        if ((int)val < 0)
+                                write_pmc(i + 1, 0);
+                }
+        }
+        /*
+         * Reset MMCR0 to its normal value.  This will set PMXE and
+         * clear FC (freeze counters) and PMAO (perf mon alert occurred)
+         * and thus allow interrupts to occur again.
+         * XXX might want to use MSR.PM to keep the counters frozen until
+         * we get back out of this interrupt.
+         */
+        mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+        /*
+         * If we need a wakeup, check whether interrupts were soft-enabled
+         * when we took the interrupt.  If they were, we can wake stuff up
+         * immediately; otherwise we'll have to set a flag and do the
+         * wakeup when interrupts get soft-enabled.
+         */
+        if (need_wakeup) {
+                if (regs->softe) {
+                        irq_enter();
+                        perf_counter_do_pending();
+                        irq_exit();
+                } else {
+                        set_perf_counter_pending(1);
+                }
+        }
+}
+/*
+ * Boot-time setup: claim the PMC hardware and install
+ * perf_counter_interrupt as its handler.
+ * Returns 0 on success, or -EBUSY if the hardware could not be reserved.
+ */
+static int init_perf_counters(void)
+{
+        if (!reserve_pmc_hardware(perf_counter_interrupt))
+                return 0;
+        printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
+        return -EBUSY;
+}
+arch_initcall(init_perf_counters);
author	Paul Mackerras <paulus@samba.org>	2009-01-09 04:21:55 -0500
committer	Paul Mackerras <paulus@samba.org>	2009-01-10 00:32:05 -0500
commit	4574910e5087085a1f330ff8373cee4503f5c77c (patch)
tree	a3bb6c974c2314ca91ee2e3a33a7283187ad19ea /arch/powerpc/kernel/perf_counter.c
parent	93a6d3ce6962044fe9badf528fed46b455d58292 (diff)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c new file mode 100644 index 000000000000..c7d4c2966a5c --- /dev/null +++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,754 @@
	1	/*
	2	* Performance counter support - powerpc architecture code
	3	*
	4	* Copyright 2008-2009 Paul Mackerras, IBM Corporation.
	5	*
	6	* This program is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU General Public License
	8	* as published by the Free Software Foundation; either version
	9	* 2 of the License, or (at your option) any later version.
	10	*/
	11	#include <linux/kernel.h>
	12	#include <linux/sched.h>
	13	#include <linux/perf_counter.h>
	14	#include <linux/percpu.h>
	15	#include <linux/hardirq.h>
	16	#include <asm/reg.h>
	17	#include <asm/pmc.h>
	18
	19	struct cpu_hw_counters {
	20	int n_counters;
	21	int n_percpu;
	22	int disabled;
	23	int n_added;
	24	struct perf_counter *counter[MAX_HWCOUNTERS];
	25	unsigned int events[MAX_HWCOUNTERS];
	26	u64 mmcr[3];
	27	};
	28	DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
	29
	30	struct power_pmu *ppmu;
	31
	32	void perf_counter_print_debug(void)
	33	{
	34	}
	35
	36	/*
	37	* Return 1 for a software counter, 0 for a hardware counter
	38	*/
	39	static inline int is_software_counter(struct perf_counter *counter)
	40	{
	41	return !counter->hw_event.raw && counter->hw_event.type < 0;
	42	}
	43
	44	/*
	45	* Read one performance monitor counter (PMC).
	46	*/
	47	static unsigned long read_pmc(int idx)
	48	{
	49	unsigned long val;
	50
	51	switch (idx) {
	52	case 1:
	53	val = mfspr(SPRN_PMC1);
	54	break;
	55	case 2:
	56	val = mfspr(SPRN_PMC2);
	57	break;
	58	case 3:
	59	val = mfspr(SPRN_PMC3);
	60	break;
	61	case 4:
	62	val = mfspr(SPRN_PMC4);
	63	break;
	64	case 5:
	65	val = mfspr(SPRN_PMC5);
	66	break;
	67	case 6:
	68	val = mfspr(SPRN_PMC6);
	69	break;
	70	case 7:
	71	val = mfspr(SPRN_PMC7);
	72	break;
	73	case 8:
	74	val = mfspr(SPRN_PMC8);
	75	break;
	76	default:
	77	printk(KERN_ERR "oops trying to read PMC%d\n", idx);
	78	val = 0;
	79	}
	80	return val;
	81	}
	82
	83	/*
	84	* Write one PMC.
	85	*/
	86	static void write_pmc(int idx, unsigned long val)
	87	{
	88	switch (idx) {
	89	case 1:
	90	mtspr(SPRN_PMC1, val);
	91	break;
	92	case 2:
	93	mtspr(SPRN_PMC2, val);
	94	break;
	95	case 3:
	96	mtspr(SPRN_PMC3, val);
	97	break;
	98	case 4:
	99	mtspr(SPRN_PMC4, val);
	100	break;
	101	case 5:
	102	mtspr(SPRN_PMC5, val);
	103	break;
	104	case 6:
	105	mtspr(SPRN_PMC6, val);
	106	break;
	107	case 7:
	108	mtspr(SPRN_PMC7, val);
	109	break;
	110	case 8:
	111	mtspr(SPRN_PMC8, val);
	112	break;
	113	default:
	114	printk(KERN_ERR "oops trying to write PMC%d\n", idx);
	115	}
	116	}
	117
	118	/*
	119	* Check if a set of events can all go on the PMU at once.
	120	* If they can't, this will look at alternative codes for the events
	121	* and see if any combination of alternative codes is feasible.
	122	* The feasible set is returned in event[].
	123	*/
	124	static int power_check_constraints(unsigned int event[], int n_ev)
	125	{
	126	u64 mask, value, nv;
	127	unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
	128	u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
	129	u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
	130	u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
	131	int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
	132	int i, j;
	133	u64 addf = ppmu->add_fields;
	134	u64 tadd = ppmu->test_adder;
	135
	136	if (n_ev > ppmu->n_counter)
	137	return -1;
	138
	139	/* First see if the events will go on as-is */
	140	for (i = 0; i < n_ev; ++i) {
	141	alternatives[i][0] = event[i];
	142	if (ppmu->get_constraint(event[i], &amasks[i][0],
	143	&avalues[i][0]))
	144	return -1;
	145	choice[i] = 0;
	146	}
	147	value = mask = 0;
	148	for (i = 0; i < n_ev; ++i) {
	149	nv = (value \| avalues[i][0]) + (value & avalues[i][0] & addf);
	150	if ((((nv + tadd) ^ value) & mask) != 0 \|\|
	151	(((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
	152	break;
	153	value = nv;
	154	mask \|= amasks[i][0];
	155	}
	156	if (i == n_ev)
	157	return 0; /* all OK */
	158
	159	/* doesn't work, gather alternatives... */
	160	if (!ppmu->get_alternatives)
	161	return -1;
	162	for (i = 0; i < n_ev; ++i) {
	163	n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
	164	for (j = 1; j < n_alt[i]; ++j)
	165	ppmu->get_constraint(alternatives[i][j],
	166	&amasks[i][j], &avalues[i][j]);
	167	}
	168
	169	/* enumerate all possibilities and see if any will work */
	170	i = 0;
	171	j = -1;
	172	value = mask = nv = 0;
	173	while (i < n_ev) {
	174	if (j >= 0) {
	175	/* we're backtracking, restore context */
	176	value = svalues[i];
	177	mask = smasks[i];
	178	j = choice[i];
	179	}
	180	/*
	181	* See if any alternative k for event i,
	182	* where k > j, will satisfy the constraints.
	183	*/
	184	while (++j < n_alt[i]) {
	185	nv = (value \| avalues[i][j]) +
	186	(value & avalues[i][j] & addf);
	187	if ((((nv + tadd) ^ value) & mask) == 0 &&
	188	(((nv + tadd) ^ avalues[i][j])
	189	& amasks[i][j]) == 0)
	190	break;
	191	}
	192	if (j >= n_alt[i]) {
	193	/*
	194	* No feasible alternative, backtrack
	195	* to event i-1 and continue enumerating its
	196	* alternatives from where we got up to.
	197	*/
	198	if (--i < 0)
	199	return -1;
	200	} else {
	201	/*
	202	* Found a feasible alternative for event i,
	203	* remember where we got up to with this event,
	204	* go on to the next event, and start with
	205	* the first alternative for it.
	206	*/
	207	choice[i] = j;
	208	svalues[i] = value;
	209	smasks[i] = mask;
	210	value = nv;
	211	mask \|= amasks[i][j];
	212	++i;
	213	j = -1;
	214	}
	215	}
	216
	217	/* OK, we have a feasible combination, tell the caller the solution */
	218	for (i = 0; i < n_ev; ++i)
	219	event[i] = alternatives[i][choice[i]];
	220	return 0;
	221	}
	222
	223	static void power_perf_read(struct perf_counter *counter)
	224	{
	225	long val, delta, prev;
	226
	227	if (!counter->hw.idx)
	228	return;
	229	/*
	230	* Performance monitor interrupts come even when interrupts
	231	* are soft-disabled, as long as interrupts are hard-enabled.
	232	* Therefore we treat them like NMIs.
	233	*/
	234	do {
	235	prev = atomic64_read(&counter->hw.prev_count);
	236	barrier();
	237	val = read_pmc(counter->hw.idx);
	238	} while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
	239
	240	/* The counters are only 32 bits wide */
	241	delta = (val - prev) & 0xfffffffful;
	242	atomic64_add(delta, &counter->count);
	243	atomic64_sub(delta, &counter->hw.period_left);
	244	}
	245
	246	/*
	247	* Disable all counters to prevent PMU interrupts and to allow
	248	* counters to be added or removed.
	249	*/
	250	u64 hw_perf_save_disable(void)
	251	{
	252	struct cpu_hw_counters *cpuhw;
	253	unsigned long ret;
	254	unsigned long flags;
	255
	256	local_irq_save(flags);
	257	cpuhw = &__get_cpu_var(cpu_hw_counters);
	258
	259	ret = cpuhw->disabled;
	260	if (!ret) {
	261	cpuhw->disabled = 1;
	262	cpuhw->n_added = 0;
	263
	264	/*
	265	* Set the 'freeze counters' bit.
	266	* The barrier is to make sure the mtspr has been
	267	* executed and the PMU has frozen the counters
	268	* before we return.
	269	*/
	270	mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) \| MMCR0_FC);
	271	mb();
	272	}
	273	local_irq_restore(flags);
	274	return ret;
	275	}
	276
	277	/*
	278	* Re-enable all counters if disable == 0.
	279	* If we were previously disabled and counters were added, then
	280	* put the new config on the PMU.
	281	*/
	282	void hw_perf_restore(u64 disable)
	283	{
	284	struct perf_counter *counter;
	285	struct cpu_hw_counters *cpuhw;
	286	unsigned long flags;
	287	long i;
	288	unsigned long val;
	289	s64 left;
	290	unsigned int hwc_index[MAX_HWCOUNTERS];
	291
	292	if (disable)
	293	return;
	294	local_irq_save(flags);
	295	cpuhw = &__get_cpu_var(cpu_hw_counters);
	296	cpuhw->disabled = 0;
	297
	298	/*
	299	* If we didn't change anything, or only removed counters,
	300	* no need to recalculate MMCR* settings and reset the PMCs.
	301	* Just reenable the PMU with the current MMCR* settings
	302	* (possibly updated for removal of counters).
	303	*/
	304	if (!cpuhw->n_added) {
	305	mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
	306	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
	307	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
	308	goto out;
	309	}
	310
	311	/*
	312	* Compute MMCR* values for the new set of counters
	313	*/
	314	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
	315	cpuhw->mmcr)) {
	316	/* shouldn't ever get here */
	317	printk(KERN_ERR "oops compute_mmcr failed\n");
	318	goto out;
	319	}
	320
	321	/*
	322	* Write the new configuration to MMCR* with the freeze
	323	* bit set and set the hardware counters to their initial values.
	324	* Then unfreeze the counters.
	325	*/
	326	mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
	327	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
	328	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE \| MMCR0_PMCjCE))
	329	\| MMCR0_FC);
	330
	331	/*
	332	* Read off any pre-existing counters that need to move
	333	* to another PMC.
	334	*/
	335	for (i = 0; i < cpuhw->n_counters; ++i) {
	336	counter = cpuhw->counter[i];
	337	if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
	338	power_perf_read(counter);
	339	write_pmc(counter->hw.idx, 0);
	340	counter->hw.idx = 0;
	341	}
	342	}
	343
	344	/*
	345	* Initialize the PMCs for all the new and moved counters.
	346	*/
	347	for (i = 0; i < cpuhw->n_counters; ++i) {
	348	counter = cpuhw->counter[i];
	349	if (counter->hw.idx)
	350	continue;
	351	val = 0;
	352	if (counter->hw_event.irq_period) {
	353	left = atomic64_read(&counter->hw.period_left);
	354	if (left < 0x80000000L)
	355	val = 0x80000000L - left;
	356	}
	357	atomic64_set(&counter->hw.prev_count, val);
	358	counter->hw.idx = hwc_index[i] + 1;
	359	write_pmc(counter->hw.idx, val);
	360	}
	361	mb();
	362	cpuhw->mmcr[0] \|= MMCR0_PMXE \| MMCR0_FCECE;
	363	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
	364
	365	out:
	366	local_irq_restore(flags);
	367	}
	368
	369	static int collect_events(struct perf_counter *group, int max_count,
	370	struct perf_counter ctrs[], unsigned int events)
	371	{
	372	int n = 0;
	373	struct perf_counter *counter;
	374
	375	if (!is_software_counter(group)) {
	376	if (n >= max_count)
	377	return -1;
	378	ctrs[n] = group;
	379	events[n++] = group->hw.config;
	380	}
	381	list_for_each_entry(counter, &group->sibling_list, list_entry) {
	382	if (!is_software_counter(counter) &&
	383	counter->state != PERF_COUNTER_STATE_OFF) {
	384	if (n >= max_count)
	385	return -1;
	386	ctrs[n] = counter;
	387	events[n++] = counter->hw.config;
	388	}
	389	}
	390	return n;
	391	}
	392
	393	static void counter_sched_in(struct perf_counter *counter, int cpu)
	394	{
	395	counter->state = PERF_COUNTER_STATE_ACTIVE;
	396	counter->oncpu = cpu;
	397	if (is_software_counter(counter))
	398	counter->hw_ops->enable(counter);
	399	}
	400
	401	/*
	402	* Called to enable a whole group of counters.
	403	* Returns 1 if the group was enabled, or -EAGAIN if it could not be.
	404	* Assumes the caller has disabled interrupts and has
	405	* frozen the PMU with hw_perf_save_disable.
	406	*/
	407	int hw_perf_group_sched_in(struct perf_counter *group_leader,
	408	struct perf_cpu_context *cpuctx,
	409	struct perf_counter_context *ctx, int cpu)
	410	{
	411	struct cpu_hw_counters *cpuhw;
	412	long i, n, n0;
	413	struct perf_counter *sub;
	414
	415	cpuhw = &__get_cpu_var(cpu_hw_counters);
	416	n0 = cpuhw->n_counters;
	417	n = collect_events(group_leader, ppmu->n_counter - n0,
	418	&cpuhw->counter[n0], &cpuhw->events[n0]);
	419	if (n < 0)
	420	return -EAGAIN;
	421	if (power_check_constraints(cpuhw->events, n + n0))
	422	return -EAGAIN;
	423	cpuhw->n_counters = n0 + n;
	424	cpuhw->n_added += n;
	425
	426	/*
	427	* OK, this group can go on; update counter states etc.,
	428	* and enable any software counters
	429	*/
	430	for (i = n0; i < n0 + n; ++i)
	431	cpuhw->counter[i]->hw.config = cpuhw->events[i];
	432	n = 1;
	433	counter_sched_in(group_leader, cpu);
	434	list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
	435	if (sub->state != PERF_COUNTER_STATE_OFF) {
	436	counter_sched_in(sub, cpu);
	437	++n;
	438	}
	439	}
	440	cpuctx->active_oncpu += n;
	441	ctx->nr_active += n;
	442
	443	return 1;
	444	}
	445
	446	/*
	447	* Add a counter to the PMU.
	448	* If all counters are not already frozen, then we disable and
	449	* re-enable the PMU in order to get hw_perf_restore to do the
	450	* actual work of reconfiguring the PMU.
	451	*/
	452	static int power_perf_enable(struct perf_counter *counter)
	453	{
	454	struct cpu_hw_counters *cpuhw;
	455	unsigned long flags;
	456	u64 pmudis;
	457	int n0;
	458	int ret = -EAGAIN;
	459
	460	local_irq_save(flags);
	461	pmudis = hw_perf_save_disable();
	462
	463	/*
	464	* Add the counter to the list (if there is room)
	465	* and check whether the total set is still feasible.
	466	*/
	467	cpuhw = &__get_cpu_var(cpu_hw_counters);
	468	n0 = cpuhw->n_counters;
	469	if (n0 >= ppmu->n_counter)
	470	goto out;
	471	cpuhw->counter[n0] = counter;
	472	cpuhw->events[n0] = counter->hw.config;
	473	if (power_check_constraints(cpuhw->events, n0 + 1))
	474	goto out;
	475
	476	counter->hw.config = cpuhw->events[n0];
	477	++cpuhw->n_counters;
	478	++cpuhw->n_added;
	479
	480	ret = 0;
	481	out:
	482	hw_perf_restore(pmudis);
	483	local_irq_restore(flags);
	484	return ret;
	485	}
	486
	487	/*
	488	* Remove a counter from the PMU.
	489	*/
	490	static void power_perf_disable(struct perf_counter *counter)
	491	{
	492	struct cpu_hw_counters *cpuhw;
	493	long i;
	494	u64 pmudis;
	495	unsigned long flags;
	496
	497	local_irq_save(flags);
	498	pmudis = hw_perf_save_disable();
	499
	500	power_perf_read(counter);
	501
	502	cpuhw = &__get_cpu_var(cpu_hw_counters);
	503	for (i = 0; i < cpuhw->n_counters; ++i) {
	504	if (counter == cpuhw->counter[i]) {
	505	while (++i < cpuhw->n_counters)
	506	cpuhw->counter[i-1] = cpuhw->counter[i];
	507	--cpuhw->n_counters;
	508	ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
	509	write_pmc(counter->hw.idx, 0);
	510	counter->hw.idx = 0;
	511	break;
	512	}
	513	}
	514	if (cpuhw->n_counters == 0) {
	515	/* disable exceptions if no counters are running */
	516	cpuhw->mmcr[0] &= ~(MMCR0_PMXE \| MMCR0_FCECE);
	517	}
	518
	519	hw_perf_restore(pmudis);
	520	local_irq_restore(flags);
	521	}
	522
	523	struct hw_perf_counter_ops power_perf_ops = {
	524	.enable = power_perf_enable,
	525	.disable = power_perf_disable,
	526	.read = power_perf_read
	527	};
	528
	529	const struct hw_perf_counter_ops *
	530	hw_perf_counter_init(struct perf_counter *counter)
	531	{
	532	unsigned long ev;
	533	struct perf_counter *ctrs[MAX_HWCOUNTERS];
	534	unsigned int events[MAX_HWCOUNTERS];
	535	int n;
	536
	537	if (!ppmu)
	538	return NULL;
	539	if ((s64)counter->hw_event.irq_period < 0)
	540	return NULL;
	541	ev = counter->hw_event.type;
	542	if (!counter->hw_event.raw) {
	543	if (ev >= ppmu->n_generic \|\|
	544	ppmu->generic_events[ev] == 0)
	545	return NULL;
	546	ev = ppmu->generic_events[ev];
	547	}
	548	counter->hw.config_base = ev;
	549	counter->hw.idx = 0;
	550
	551	/*
	552	* If this is in a group, check if it can go on with all the
	553	* other hardware counters in the group. We assume the counter
	554	* hasn't been linked into its leader's sibling list at this point.
	555	*/
	556	n = 0;
	557	if (counter->group_leader != counter) {
	558	n = collect_events(counter->group_leader, ppmu->n_counter - 1,
	559	ctrs, events);
	560	if (n < 0)
	561	return NULL;
	562	}
	563	events[n++] = ev;
	564	if (power_check_constraints(events, n))
	565	return NULL;
	566
	567	counter->hw.config = events[n - 1];
	568	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
	569	return &power_perf_ops;
	570	}
	571
	572	/*
	573	* Handle wakeups.
	574	*/
	575	void perf_counter_do_pending(void)
	576	{
	577	int i;
	578	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
	579	struct perf_counter *counter;
	580
	581	set_perf_counter_pending(0);
	582	for (i = 0; i < cpuhw->n_counters; ++i) {
	583	counter = cpuhw->counter[i];
	584	if (counter && counter->wakeup_pending) {
	585	counter->wakeup_pending = 0;
	586	wake_up(&counter->waitq);
	587	}
	588	}
	589	}
	590
	591	/*
	592	* Record data for an irq counter.
	593	* This function was lifted from the x86 code; maybe it should
	594	* go in the core?
	595	*/
	596	static void perf_store_irq_data(struct perf_counter *counter, u64 data)
	597	{
	598	struct perf_data *irqdata = counter->irqdata;
	599
	600	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
	601	irqdata->overrun++;
	602	} else {
	603	u64 p = (u64 ) &irqdata->data[irqdata->len];
	604
	605	*p = data;
	606	irqdata->len += sizeof(u64);
	607	}
	608	}
	609
	610	/*
	611	* Record all the values of the counters in a group
	612	*/
	613	static void perf_handle_group(struct perf_counter *counter)
	614	{
	615	struct perf_counter leader, sub;
	616
	617	leader = counter->group_leader;
	618	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
	619	if (sub != counter)
	620	sub->hw_ops->read(sub);
	621	perf_store_irq_data(counter, sub->hw_event.type);
	622	perf_store_irq_data(counter, atomic64_read(&sub->count));
	623	}
	624	}
	625
	626	/*
	627	* A counter has overflowed; update its count and record
	628	* things if requested. Note that interrupts are hard-disabled
	629	* here so there is no possibility of being interrupted.
	630	*/
	631	static void record_and_restart(struct perf_counter *counter, long val,
	632	struct pt_regs *regs)
	633	{
	634	s64 prev, delta, left;
	635	int record = 0;
	636
	637	/* we don't have to worry about interrupts here */
	638	prev = atomic64_read(&counter->hw.prev_count);
	639	delta = (val - prev) & 0xfffffffful;
	640	atomic64_add(delta, &counter->count);
	641
	642	/*
	643	* See if the total period for this counter has expired,
	644	* and update for the next period.
	645	*/
	646	val = 0;
	647	left = atomic64_read(&counter->hw.period_left) - delta;
	648	if (counter->hw_event.irq_period) {
	649	if (left <= 0) {
	650	left += counter->hw_event.irq_period;
	651	if (left <= 0)
	652	left = counter->hw_event.irq_period;
	653	record = 1;
	654	}
	655	if (left < 0x80000000L)
	656	val = 0x80000000L - left;
	657	}
	658	write_pmc(counter->hw.idx, val);
	659	atomic64_set(&counter->hw.prev_count, val);
	660	atomic64_set(&counter->hw.period_left, left);
	661
	662	/*
	663	* Finally record data if requested.
	664	*/
	665	if (record) {
	666	switch (counter->hw_event.record_type) {
	667	case PERF_RECORD_SIMPLE:
	668	break;
	669	case PERF_RECORD_IRQ:
	670	perf_store_irq_data(counter, instruction_pointer(regs));
	671	counter->wakeup_pending = 1;
	672	break;
	673	case PERF_RECORD_GROUP:
	674	perf_handle_group(counter);
	675	counter->wakeup_pending = 1;
	676	break;
	677	}
	678	}
	679	}
	680
	681	/*
	682	* Performance monitor interrupt stuff
	683	*/
	684	static void perf_counter_interrupt(struct pt_regs *regs)
	685	{
	686	int i;
	687	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
	688	struct perf_counter *counter;
	689	long val;
	690	int need_wakeup = 0, found = 0;
	691
	692	for (i = 0; i < cpuhw->n_counters; ++i) {
	693	counter = cpuhw->counter[i];
	694	val = read_pmc(counter->hw.idx);
	695	if ((int)val < 0) {
	696	/* counter has overflowed */
	697	found = 1;
	698	record_and_restart(counter, val, regs);
	699	if (counter->wakeup_pending)
	700	need_wakeup = 1;
	701	}
	702	}
	703
	704	/*
	705	* In case we didn't find and reset the counter that caused
	706	* the interrupt, scan all counters and reset any that are
	707	* negative, to avoid getting continual interrupts.
	708	* Any that we processed in the previous loop will not be negative.
	709	*/
	710	if (!found) {
	711	for (i = 0; i < ppmu->n_counter; ++i) {
	712	val = read_pmc(i + 1);
	713	if ((int)val < 0)
	714	write_pmc(i + 1, 0);
	715	}
	716	}
	717
	718	/*
	719	* Reset MMCR0 to its normal value. This will set PMXE and
	720	* clear FC (freeze counters) and PMAO (perf mon alert occurred)
	721	* and thus allow interrupts to occur again.
	722	* XXX might want to use MSR.PM to keep the counters frozen until
	723	* we get back out of this interrupt.
	724	*/
	725	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
	726
	727	/*
	728	* If we need a wakeup, check whether interrupts were soft-enabled
	729	* when we took the interrupt. If they were, we can wake stuff up
	730	* immediately; otherwise we'll have to set a flag and do the
	731	* wakeup when interrupts get soft-enabled.
	732	*/
	733	if (need_wakeup) {
	734	if (regs->softe) {
	735	irq_enter();
	736	perf_counter_do_pending();
	737	irq_exit();
	738	} else {
	739	set_perf_counter_pending(1);
	740	}
	741	}
	742	}
	743
	744	static int init_perf_counters(void)
	745	{
	746	if (reserve_pmc_hardware(perf_counter_interrupt)) {
	747	printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
	748	return -EBUSY;
	749	}
	750
	751	return 0;
	752	}
	753
	754	arch_initcall(init_perf_counters);