Diffstat (limited to 'arch/powerpc/oprofile/cell/spu_profiler.c')
-rw-r--r-- | arch/powerpc/oprofile/cell/spu_profiler.c | 221
1 files changed, 221 insertions, 0 deletions
diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c
new file mode 100644
index 000000000000..380d7e217531
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -0,0 +1,221 @@
/*
 * Cell Broadband Engine OProfile Support
 *
 * (C) Copyright IBM Corporation 2006
 *
 * Authors: Maynard Johnson <maynardj@us.ibm.com>
 *          Carl Love <carll@us.ibm.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/hrtimer.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <asm/cell-pmu.h>
#include "pr_util.h"

#define TRACE_ARRAY_SIZE 1024
#define SCALE_SHIFT 14

static u32 *samples;

static int spu_prof_running;
static unsigned int profiling_interval;

#define NUM_SPU_BITS_TRBUF 16
#define SPUS_PER_TB_ENTRY 4
#define SPUS_PER_NODE 8

#define SPU_PC_MASK 0xFFFF

static DEFINE_SPINLOCK(sample_array_lock);
unsigned long sample_array_lock_flags;

void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
{
	unsigned long ns_per_cyc;

	if (!freq_khz)
		freq_khz = ppc_proc_freq/1000;

	/* To calculate a timeout in nanoseconds, the basic
	 * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency).
	 * To avoid floating point math, we use the scale math
	 * technique as described in linux/jiffies.h.  We use
	 * a scale factor of SCALE_SHIFT, which provides 4 decimal places
	 * of precision.  This is close enough for the purpose at hand.
	 *
	 * The value of the timeout should be small enough that the hw
	 * trace buffer will not get more than about 1/3 full for the
	 * maximum user specified (the LFSR value) hw sampling frequency.
	 * This is to ensure the trace buffer will never fill even if the
	 * kernel thread scheduling varies under a heavy system load.
	 */
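	/* Worked example with hypothetical numbers (illustrative only, not
	 * taken from this patch): on a 3.2 GHz Cell, freq_khz = 3200000,
	 * so ns_per_cyc = (1000000 << 14) / 3200000 = 5120, i.e. 0.3125 ns
	 * per cycle scaled by 2^14.  With cycles_reset = 100000 the
	 * timeout becomes (5120 * 100000) >> 14 = 31250 ns, matching the
	 * unscaled 100000 * 10^9 / (3.2 * 10^9) = 31250 ns.
	 */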

	ns_per_cyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz;
	profiling_interval = (ns_per_cyc * cycles_reset) >> SCALE_SHIFT;

}

/*
 * Extract SPU PC from trace buffer entry
 */
static void spu_pc_extract(int cpu, int entry)
{
	/* the trace buffer is 128 bits */
	u64 trace_buffer[2];
	u64 spu_mask;
	int spu;

	spu_mask = SPU_PC_MASK;

	/* Each SPU PC is 16 bits; hence, four spus in each of
	 * the two 64-bit buffer entries that make up the
	 * 128-bit trace_buffer entry.  Process two 64-bit values
	 * simultaneously.
	 * trace[0] SPU PC contents are:  0 1 2 3
	 * trace[1] SPU PC contents are:  4 5 6 7
	 */
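	/* As the loop below implies, SPU 0's PC occupies the most
	 * significant 16 bits of trace_buffer[0] and SPU 3's the least
	 * significant 16 bits (likewise SPUs 4-7 in trace_buffer[1]).
	 * Each 16-bit field is the upper part of the 18-bit, 4-byte
	 * aligned SPU PC, so shifting left by 2 restores the byte
	 * address.
	 */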

	cbe_read_trace_buffer(cpu, trace_buffer);

	for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) {
		/* spu PC trace entry is upper 16 bits of the
		 * 18 bit SPU program counter
		 */
		samples[spu * TRACE_ARRAY_SIZE + entry]
			= (spu_mask & trace_buffer[0]) << 2;
		samples[(spu + SPUS_PER_TB_ENTRY) * TRACE_ARRAY_SIZE + entry]
			= (spu_mask & trace_buffer[1]) << 2;

		trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF;
		trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF;
	}
}

static int cell_spu_pc_collection(int cpu)
{
	u32 trace_addr;
	int entry;

	/* process the collected SPU PC for the node */

	entry = 0;

	trace_addr = cbe_read_pm(cpu, trace_address);
	while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) {
		/* there is data in the trace buffer to process */
		spu_pc_extract(cpu, entry);

		entry++;

		if (entry >= TRACE_ARRAY_SIZE)
			/* spu_samples is full */
			break;

		trace_addr = cbe_read_pm(cpu, trace_address);
	}

	return entry;
}


static enum hrtimer_restart profile_spus(struct hrtimer *timer)
{
	ktime_t kt;
	int cpu, node, k, num_samples, spu_num;

	if (!spu_prof_running)
		goto stop;

	for_each_online_cpu(cpu) {
		if (cbe_get_hw_thread_id(cpu))
			continue;

		node = cbe_cpu_to_node(cpu);

		/* There should only be one kernel thread at a time
		 * processing the samples.  In the very unlikely case
		 * that processing takes a long time and multiple kernel
		 * threads are started to process the samples, make sure
		 * only one kernel thread works on the samples array at
		 * a time.  The sample array must be loaded and then
		 * processed for a given cpu; the sample array is not
		 * per cpu.
		 */
		spin_lock_irqsave(&sample_array_lock,
				  sample_array_lock_flags);
		num_samples = cell_spu_pc_collection(cpu);

		if (num_samples == 0) {
			spin_unlock_irqrestore(&sample_array_lock,
					       sample_array_lock_flags);
			continue;
		}

		for (k = 0; k < SPUS_PER_NODE; k++) {
			spu_num = k + (node * SPUS_PER_NODE);
			spu_sync_buffer(spu_num,
					samples + (k * TRACE_ARRAY_SIZE),
					num_samples);
		}

		spin_unlock_irqrestore(&sample_array_lock,
				       sample_array_lock_flags);

	}
	smp_wmb();	/* ensure spu event buffer updates are written */
			/* don't want events intermingled... */

	kt = ktime_set(0, profiling_interval);
	if (!spu_prof_running)
		goto stop;
	hrtimer_forward(timer, timer->base->get_time(), kt);
	return HRTIMER_RESTART;

 stop:
	printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n");
	return HRTIMER_NORESTART;
}

static struct hrtimer timer;
/*
 * Entry point for SPU profiling.
 * NOTE: SPU profiling is done system-wide, not per-CPU.
 *
 * cycles_reset is the count value specified by the user when
 * setting up OProfile to count SPU_CYCLES.
 */
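/*
 * Note (inferred from this file alone; the OProfile setup code is not
 * part of this patch): profiling_interval is used below, so
 * set_spu_profiling_frequency() is expected to have been called before
 * start_spu_profiling().
 */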
int start_spu_profiling(unsigned int cycles_reset)
{
	ktime_t kt;

	pr_debug("timer resolution: %lu\n", TICK_NSEC);
	kt = ktime_set(0, profiling_interval);
	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	timer.expires = kt;
	timer.function = profile_spus;

	/* Allocate arrays for collecting SPU PC samples */
	samples = kzalloc(SPUS_PER_NODE *
			  TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL);

	if (!samples)
		return -ENOMEM;

	spu_prof_running = 1;
	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);

	return 0;
}

void stop_spu_profiling(void)
{
	spu_prof_running = 0;
	hrtimer_cancel(&timer);
	kfree(samples);
	pr_debug("SPU_PROF: stop_spu_profiling issued\n");
}