diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2009-09-18 14:14:01 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-09-18 14:47:30 -0400 |
commit | def0a9b2573e00ab0b486cb5382625203ab4c4a6 (patch) | |
tree | 1e3086fc320c244297b5b63cce47065bcfb71e8c | |
parent | cf450a7355a116af793998c118a6bcf7f5a8367e (diff) |
sched_clock: Make it NMI safe
Arjan complained about the suckiness of TSC on modern machines, and
asked if we could do something about that for PERF_SAMPLE_TIME.
Make cpu_clock() NMI safe by removing the spinlock and using
cmpxchg. This also makes it smaller and more robust.
Affects architectures that use HAVE_UNSTABLE_SCHED_CLOCK, i.e. IA64
and x86.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | kernel/perf_counter.c | 9 | ||||
-rw-r--r-- | kernel/sched_clock.c | 122 |
2 files changed, 56 insertions, 75 deletions
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 6944bd55ec4e..06d233a06da5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2955,10 +2955,7 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
2955 | } | 2955 | } |
2956 | 2956 | ||
2957 | if (sample_type & PERF_SAMPLE_TIME) { | 2957 | if (sample_type & PERF_SAMPLE_TIME) { |
2958 | /* | 2958 | data->time = perf_clock(); |
2959 | * Maybe do better on x86 and provide cpu_clock_nmi() | ||
2960 | */ | ||
2961 | data->time = sched_clock(); | ||
2962 | 2959 | ||
2963 | header->size += sizeof(data->time); | 2960 | header->size += sizeof(data->time); |
2964 | } | 2961 | } |
@@ -3488,7 +3485,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) | |||
3488 | .misc = 0, | 3485 | .misc = 0, |
3489 | .size = sizeof(throttle_event), | 3486 | .size = sizeof(throttle_event), |
3490 | }, | 3487 | }, |
3491 | .time = sched_clock(), | 3488 | .time = perf_clock(), |
3492 | .id = primary_counter_id(counter), | 3489 | .id = primary_counter_id(counter), |
3493 | .stream_id = counter->id, | 3490 | .stream_id = counter->id, |
3494 | }; | 3491 | }; |
@@ -3540,7 +3537,7 @@ static int __perf_counter_overflow(struct perf_counter *counter, int nmi, | |||
3540 | } | 3537 | } |
3541 | 3538 | ||
3542 | if (counter->attr.freq) { | 3539 | if (counter->attr.freq) { |
3543 | u64 now = sched_clock(); | 3540 | u64 now = perf_clock(); |
3544 | s64 delta = now - hwc->freq_stamp; | 3541 | s64 delta = now - hwc->freq_stamp; |
3545 | 3542 | ||
3546 | hwc->freq_stamp = now; | 3543 | hwc->freq_stamp = now; |
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9a7680..ac2e1dc708bd 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running; | |||
48 | __read_mostly int sched_clock_stable; | 48 | __read_mostly int sched_clock_stable; |
49 | 49 | ||
50 | struct sched_clock_data { | 50 | struct sched_clock_data { |
51 | /* | ||
52 | * Raw spinlock - this is a special case: this might be called | ||
53 | * from within instrumentation code so we dont want to do any | ||
54 | * instrumentation ourselves. | ||
55 | */ | ||
56 | raw_spinlock_t lock; | ||
57 | |||
58 | u64 tick_raw; | 51 | u64 tick_raw; |
59 | u64 tick_gtod; | 52 | u64 tick_gtod; |
60 | u64 clock; | 53 | u64 clock; |
@@ -80,7 +73,6 @@ void sched_clock_init(void) | |||
80 | for_each_possible_cpu(cpu) { | 73 | for_each_possible_cpu(cpu) { |
81 | struct sched_clock_data *scd = cpu_sdc(cpu); | 74 | struct sched_clock_data *scd = cpu_sdc(cpu); |
82 | 75 | ||
83 | scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; | ||
84 | scd->tick_raw = 0; | 76 | scd->tick_raw = 0; |
85 | scd->tick_gtod = ktime_now; | 77 | scd->tick_gtod = ktime_now; |
86 | scd->clock = ktime_now; | 78 | scd->clock = ktime_now; |
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y) | |||
109 | * - filter out backward motion | 101 | * - filter out backward motion |
110 | * - use the GTOD tick value to create a window to filter crazy TSC values | 102 | * - use the GTOD tick value to create a window to filter crazy TSC values |
111 | */ | 103 | */ |
112 | static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) | 104 | static u64 sched_clock_local(struct sched_clock_data *scd) |
113 | { | 105 | { |
114 | s64 delta = now - scd->tick_raw; | 106 | u64 now, clock, old_clock, min_clock, max_clock; |
115 | u64 clock, min_clock, max_clock; | 107 | s64 delta; |
116 | 108 | ||
109 | again: | ||
110 | now = sched_clock(); | ||
111 | delta = now - scd->tick_raw; | ||
117 | if (unlikely(delta < 0)) | 112 | if (unlikely(delta < 0)) |
118 | delta = 0; | 113 | delta = 0; |
119 | 114 | ||
115 | old_clock = scd->clock; | ||
116 | |||
120 | /* | 117 | /* |
121 | * scd->clock = clamp(scd->tick_gtod + delta, | 118 | * scd->clock = clamp(scd->tick_gtod + delta, |
122 | * max(scd->tick_gtod, scd->clock), | 119 | * max(scd->tick_gtod, scd->clock), |
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) | |||
124 | */ | 121 | */ |
125 | 122 | ||
126 | clock = scd->tick_gtod + delta; | 123 | clock = scd->tick_gtod + delta; |
127 | min_clock = wrap_max(scd->tick_gtod, scd->clock); | 124 | min_clock = wrap_max(scd->tick_gtod, old_clock); |
128 | max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); | 125 | max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); |
129 | 126 | ||
130 | clock = wrap_max(clock, min_clock); | 127 | clock = wrap_max(clock, min_clock); |
131 | clock = wrap_min(clock, max_clock); | 128 | clock = wrap_min(clock, max_clock); |
132 | 129 | ||
133 | scd->clock = clock; | 130 | if (cmpxchg(&scd->clock, old_clock, clock) != old_clock) |
131 | goto again; | ||
134 | 132 | ||
135 | return scd->clock; | 133 | return clock; |
136 | } | 134 | } |
137 | 135 | ||
138 | static void lock_double_clock(struct sched_clock_data *data1, | 136 | static u64 sched_clock_remote(struct sched_clock_data *scd) |
139 | struct sched_clock_data *data2) | ||
140 | { | 137 | { |
141 | if (data1 < data2) { | 138 | struct sched_clock_data *my_scd = this_scd(); |
142 | __raw_spin_lock(&data1->lock); | 139 | u64 this_clock, remote_clock; |
143 | __raw_spin_lock(&data2->lock); | 140 | u64 *ptr, old_val, val; |
141 | |||
142 | sched_clock_local(my_scd); | ||
143 | again: | ||
144 | this_clock = my_scd->clock; | ||
145 | remote_clock = scd->clock; | ||
146 | |||
147 | /* | ||
148 | * Use the opportunity that we have both locks | ||
149 | * taken to couple the two clocks: we take the | ||
150 | * larger time as the latest time for both | ||
151 | * runqueues. (this creates monotonic movement) | ||
152 | */ | ||
153 | if (likely((s64)(remote_clock - this_clock) < 0)) { | ||
154 | ptr = &scd->clock; | ||
155 | old_val = remote_clock; | ||
156 | val = this_clock; | ||
144 | } else { | 157 | } else { |
145 | __raw_spin_lock(&data2->lock); | 158 | /* |
146 | __raw_spin_lock(&data1->lock); | 159 | * Should be rare, but possible: |
160 | */ | ||
161 | ptr = &my_scd->clock; | ||
162 | old_val = this_clock; | ||
163 | val = remote_clock; | ||
147 | } | 164 | } |
165 | |||
166 | if (cmpxchg(ptr, old_val, val) != old_val) | ||
167 | goto again; | ||
168 | |||
169 | return val; | ||
148 | } | 170 | } |
149 | 171 | ||
150 | u64 sched_clock_cpu(int cpu) | 172 | u64 sched_clock_cpu(int cpu) |
151 | { | 173 | { |
152 | u64 now, clock, this_clock, remote_clock; | ||
153 | struct sched_clock_data *scd; | 174 | struct sched_clock_data *scd; |
175 | u64 clock; | ||
176 | |||
177 | WARN_ON_ONCE(!irqs_disabled()); | ||
154 | 178 | ||
155 | if (sched_clock_stable) | 179 | if (sched_clock_stable) |
156 | return sched_clock(); | 180 | return sched_clock(); |
157 | 181 | ||
158 | scd = cpu_sdc(cpu); | ||
159 | |||
160 | /* | ||
161 | * Normally this is not called in NMI context - but if it is, | ||
162 | * trying to do any locking here is totally lethal. | ||
163 | */ | ||
164 | if (unlikely(in_nmi())) | ||
165 | return scd->clock; | ||
166 | |||
167 | if (unlikely(!sched_clock_running)) | 182 | if (unlikely(!sched_clock_running)) |
168 | return 0ull; | 183 | return 0ull; |
169 | 184 | ||
170 | WARN_ON_ONCE(!irqs_disabled()); | 185 | scd = cpu_sdc(cpu); |
171 | now = sched_clock(); | ||
172 | |||
173 | if (cpu != raw_smp_processor_id()) { | ||
174 | struct sched_clock_data *my_scd = this_scd(); | ||
175 | |||
176 | lock_double_clock(scd, my_scd); | ||
177 | |||
178 | this_clock = __update_sched_clock(my_scd, now); | ||
179 | remote_clock = scd->clock; | ||
180 | |||
181 | /* | ||
182 | * Use the opportunity that we have both locks | ||
183 | * taken to couple the two clocks: we take the | ||
184 | * larger time as the latest time for both | ||
185 | * runqueues. (this creates monotonic movement) | ||
186 | */ | ||
187 | if (likely((s64)(remote_clock - this_clock) < 0)) { | ||
188 | clock = this_clock; | ||
189 | scd->clock = clock; | ||
190 | } else { | ||
191 | /* | ||
192 | * Should be rare, but possible: | ||
193 | */ | ||
194 | clock = remote_clock; | ||
195 | my_scd->clock = remote_clock; | ||
196 | } | ||
197 | |||
198 | __raw_spin_unlock(&my_scd->lock); | ||
199 | } else { | ||
200 | __raw_spin_lock(&scd->lock); | ||
201 | clock = __update_sched_clock(scd, now); | ||
202 | } | ||
203 | 186 | ||
204 | __raw_spin_unlock(&scd->lock); | 187 | if (cpu != smp_processor_id()) |
188 | clock = sched_clock_remote(scd); | ||
189 | else | ||
190 | clock = sched_clock_local(scd); | ||
205 | 191 | ||
206 | return clock; | 192 | return clock; |
207 | } | 193 | } |
@@ -223,11 +209,9 @@ void sched_clock_tick(void) | |||
223 | now_gtod = ktime_to_ns(ktime_get()); | 209 | now_gtod = ktime_to_ns(ktime_get()); |
224 | now = sched_clock(); | 210 | now = sched_clock(); |
225 | 211 | ||
226 | __raw_spin_lock(&scd->lock); | ||
227 | scd->tick_raw = now; | 212 | scd->tick_raw = now; |
228 | scd->tick_gtod = now_gtod; | 213 | scd->tick_gtod = now_gtod; |
229 | __update_sched_clock(scd, now); | 214 | sched_clock_local(scd); |
230 | __raw_spin_unlock(&scd->lock); | ||
231 | } | 215 | } |
232 | 216 | ||
233 | /* | 217 | /* |