Diffstat (limited to 'arch/powerpc/oprofile/cell/spu_profiler.c')
-rw-r--r-- | arch/powerpc/oprofile/cell/spu_profiler.c | 221
1 files changed, 221 insertions, 0 deletions
diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c
new file mode 100644
index 000000000000..380d7e217531
--- /dev/null
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -0,0 +1,221 @@
/*
 * Cell Broadband Engine OProfile Support
 *
 * (C) Copyright IBM Corporation 2006
 *
 * Authors: Maynard Johnson <maynardj@us.ibm.com>
 *          Carl Love <carll@us.ibm.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/hrtimer.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <asm/cell-pmu.h>
#include "pr_util.h"

#define TRACE_ARRAY_SIZE 1024
#define SCALE_SHIFT 14

static u32 *samples;

static int spu_prof_running;
static unsigned int profiling_interval;

#define NUM_SPU_BITS_TRBUF 16
#define SPUS_PER_TB_ENTRY 4
#define SPUS_PER_NODE 8

#define SPU_PC_MASK 0xFFFF

static DEFINE_SPINLOCK(sample_array_lock);
unsigned long sample_array_lock_flags;

void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset)
{
	unsigned long ns_per_cyc;

	if (!freq_khz)
		freq_khz = ppc_proc_freq/1000;

	/* To calculate a timeout in nanoseconds, the basic
	 * formula is ns = cycles_reset * (NSEC_PER_SEC / cpu frequency).
	 * To avoid floating point math, we use the scale math
	 * technique as described in linux/jiffies.h.  We use
	 * a scale factor of SCALE_SHIFT, which provides 4 decimal places
	 * of precision.  This is close enough for the purpose at hand.
	 *
	 * The value of the timeout should be small enough that the hw
	 * trace buffer will not get more than about 1/3 full for the
	 * maximum user specified (the LFSR value) hw sampling frequency.
	 * This is to ensure the trace buffer will never fill even if the
	 * kernel thread scheduling varies under a heavy system load.
	 */
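	/* Worked example with hypothetical numbers (illustrative only, not
	 * taken from this patch): on a 3.2 GHz Cell, freq_khz = 3200000,
	 * so ns_per_cyc = (1000000 << 14) / 3200000 = 5120, i.e. 0.3125 ns
	 * per cycle scaled by 2^14.  With cycles_reset = 100000 the
	 * timeout becomes (5120 * 100000) >> 14 = 31250 ns, matching the
	 * unscaled 100000 * 10^9 / (3.2 * 10^9) = 31250 ns.
	 */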

	ns_per_cyc = (USEC_PER_SEC << SCALE_SHIFT)/freq_khz;
	profiling_interval = (ns_per_cyc * cycles_reset) >> SCALE_SHIFT;

}

/*
 * Extract SPU PC from trace buffer entry
 */
static void spu_pc_extract(int cpu, int entry)
{
	/* the trace buffer is 128 bits */
	u64 trace_buffer[2];
	u64 spu_mask;
	int spu;

	spu_mask = SPU_PC_MASK;

	/* Each SPU PC is 16 bits; hence, four spus in each of
	 * the two 64-bit buffer entries that make up the
	 * 128-bit trace_buffer entry.  Process two 64-bit values
	 * simultaneously.
	 * trace[0] SPU PC contents are:  0 1 2 3
	 * trace[1] SPU PC contents are:  4 5 6 7
	 */
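	/* As the loop below implies, SPU 0's PC occupies the most
	 * significant 16 bits of trace_buffer[0] and SPU 3's the least
	 * significant 16 bits (likewise SPUs 4-7 in trace_buffer[1]).
	 * Each 16-bit field is the upper part of the 18-bit, 4-byte
	 * aligned SPU PC, so shifting left by 2 restores the byte
	 * address.
	 */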

	cbe_read_trace_buffer(cpu, trace_buffer);

	for (spu = SPUS_PER_TB_ENTRY-1; spu >= 0; spu--) {
		/* spu PC trace entry is upper 16 bits of the
		 * 18 bit SPU program counter
		 */
		samples[spu * TRACE_ARRAY_SIZE + entry]
			= (spu_mask & trace_buffer[0]) << 2;
		samples[(spu + SPUS_PER_TB_ENTRY) * TRACE_ARRAY_SIZE + entry]
			= (spu_mask & trace_buffer[1]) << 2;

		trace_buffer[0] = trace_buffer[0] >> NUM_SPU_BITS_TRBUF;
		trace_buffer[1] = trace_buffer[1] >> NUM_SPU_BITS_TRBUF;
	}
}

static int cell_spu_pc_collection(int cpu)
{
	u32 trace_addr;
	int entry;

	/* process the collected SPU PC for the node */

	entry = 0;

	trace_addr = cbe_read_pm(cpu, trace_address);
	while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) {
		/* there is data in the trace buffer to process */
		spu_pc_extract(cpu, entry);

		entry++;

		if (entry >= TRACE_ARRAY_SIZE)
			/* spu_samples is full */
			break;

		trace_addr = cbe_read_pm(cpu, trace_address);
	}

	return entry;
}


static enum hrtimer_restart profile_spus(struct hrtimer *timer)
{
	ktime_t kt;
	int cpu, node, k, num_samples, spu_num;

	if (!spu_prof_running)
		goto stop;

	for_each_online_cpu(cpu) {
		if (cbe_get_hw_thread_id(cpu))
			continue;

		node = cbe_cpu_to_node(cpu);

		/* There should only be one kernel thread at a time
		 * processing the samples.  In the very unlikely case
		 * that processing takes a long time and multiple kernel
		 * threads are started to process the samples, make sure
		 * only one kernel thread works on the samples array at
		 * a time.  The sample array must be loaded and then
		 * processed for a given cpu; the sample array is not
		 * per cpu.
		 */
		spin_lock_irqsave(&sample_array_lock,
				  sample_array_lock_flags);
		num_samples = cell_spu_pc_collection(cpu);

		if (num_samples == 0) {
			spin_unlock_irqrestore(&sample_array_lock,
					       sample_array_lock_flags);
			continue;
		}

		for (k = 0; k < SPUS_PER_NODE; k++) {
			spu_num = k + (node * SPUS_PER_NODE);
			spu_sync_buffer(spu_num,
					samples + (k * TRACE_ARRAY_SIZE),
					num_samples);
		}

		spin_unlock_irqrestore(&sample_array_lock,
				       sample_array_lock_flags);

	}
	smp_wmb();	/* ensure spu event buffer updates are written */
			/* don't want events intermingled... */

	kt = ktime_set(0, profiling_interval);
	if (!spu_prof_running)
		goto stop;
	hrtimer_forward(timer, timer->base->get_time(), kt);
	return HRTIMER_RESTART;

 stop:
	printk(KERN_INFO "SPU_PROF: spu-prof timer ending\n");
	return HRTIMER_NORESTART;
}

static struct hrtimer timer;
/*
 * Entry point for SPU profiling.
 * NOTE: SPU profiling is done system-wide, not per-CPU.
 *
 * cycles_reset is the count value specified by the user when
 * setting up OProfile to count SPU_CYCLES.
 */
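/*
 * Note (inferred from this file alone; the OProfile setup code is not
 * part of this patch): profiling_interval is used below, so
 * set_spu_profiling_frequency() is expected to have been called before
 * start_spu_profiling().
 */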
int start_spu_profiling(unsigned int cycles_reset)
{
	ktime_t kt;

	pr_debug("timer resolution: %lu\n", TICK_NSEC);
	kt = ktime_set(0, profiling_interval);
	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	timer.expires = kt;
	timer.function = profile_spus;

	/* Allocate arrays for collecting SPU PC samples */
	samples = kzalloc(SPUS_PER_NODE *
			  TRACE_ARRAY_SIZE * sizeof(u32), GFP_KERNEL);

	if (!samples)
		return -ENOMEM;

	spu_prof_running = 1;
	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);

	return 0;
}

void stop_spu_profiling(void)
{
	spu_prof_running = 0;
	hrtimer_cancel(&timer);
	kfree(samples);
	pr_debug("SPU_PROF: stop_spu_profiling issued\n");
}