diff options
Diffstat (limited to 'arch/x86_64/kernel')
-rw-r--r-- | arch/x86_64/kernel/Makefile | 2 | ||||
-rw-r--r-- | arch/x86_64/kernel/smpboot.c | 230 | ||||
-rw-r--r-- | arch/x86_64/kernel/time.c | 11 | ||||
-rw-r--r-- | arch/x86_64/kernel/tsc_sync.c | 187 |
4 files changed, 210 insertions, 220 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index ae399458024b..6465eee6d920 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile | |||
@@ -19,7 +19,7 @@ obj-$(CONFIG_ACPI) += acpi/ | |||
19 | obj-$(CONFIG_X86_MSR) += msr.o | 19 | obj-$(CONFIG_X86_MSR) += msr.o |
20 | obj-$(CONFIG_MICROCODE) += microcode.o | 20 | obj-$(CONFIG_MICROCODE) += microcode.o |
21 | obj-$(CONFIG_X86_CPUID) += cpuid.o | 21 | obj-$(CONFIG_X86_CPUID) += cpuid.o |
22 | obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o | 22 | obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o |
23 | obj-y += apic.o nmi.o | 23 | obj-y += apic.o nmi.o |
24 | obj-y += io_apic.o mpparse.o \ | 24 | obj-y += io_apic.o mpparse.o \ |
25 | genapic.o genapic_cluster.o genapic_flat.o | 25 | genapic.o genapic_cluster.o genapic_flat.o |
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index daf19332f0dd..62d828433c30 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c | |||
@@ -148,217 +148,6 @@ static void __cpuinit smp_store_cpu_info(int id) | |||
148 | print_cpu_info(c); | 148 | print_cpu_info(c); |
149 | } | 149 | } |
150 | 150 | ||
151 | /* | ||
152 | * New Funky TSC sync algorithm borrowed from IA64. | ||
153 | * Main advantage is that it doesn't reset the TSCs fully and | ||
154 | * in general looks more robust and it works better than my earlier | ||
155 | * attempts. I believe it was written by David Mosberger. Some minor | ||
156 | * adjustments for x86-64 by me -AK | ||
157 | * | ||
158 | * Original comment reproduced below. | ||
159 | * | ||
160 | * Synchronize TSC of the current (slave) CPU with the TSC of the | ||
161 | * MASTER CPU (normally the time-keeper CPU). We use a closed loop to | ||
162 | * eliminate the possibility of unaccounted-for errors (such as | ||
163 | * getting a machine check in the middle of a calibration step). The | ||
164 | * basic idea is for the slave to ask the master what itc value it has | ||
165 | * and to read its own itc before and after the master responds. Each | ||
166 | * iteration gives us three timestamps: | ||
167 | * | ||
168 | * slave master | ||
169 | * | ||
170 | * t0 ---\ | ||
171 | * ---\ | ||
172 | * ---> | ||
173 | * tm | ||
174 | * /--- | ||
175 | * /--- | ||
176 | * t1 <--- | ||
177 | * | ||
178 | * | ||
179 | * The goal is to adjust the slave's TSC such that tm falls exactly | ||
180 | * half-way between t0 and t1. If we achieve this, the clocks are | ||
181 | * synchronized provided the interconnect between the slave and the | ||
182 | * master is symmetric. Even if the interconnect were asymmetric, we | ||
183 | * would still know that the synchronization error is smaller than the | ||
184 | * roundtrip latency (t1 - t0). | ||
185 | * | ||
186 | * When the interconnect is quiet and symmetric, this lets us | ||
187 | * synchronize the TSC to within one or two cycles. However, we can | ||
188 | * only *guarantee* that the synchronization is accurate to within a | ||
189 | * round-trip time, which is typically in the range of several hundred | ||
190 | * cycles (e.g., ~500 cycles). In practice, this means that the TSCs | ||
191 | * are usually almost perfectly synchronized, but we shouldn't assume | ||
192 | * that the accuracy is much better than half a microsecond or so. | ||
193 | * | ||
194 | * [there are other errors like the latency of RDTSC and of the | ||
195 | * WRMSR. These can also amount to hundreds of cycles. So it's | ||
196 | * probably worse. It claims 153 cycles error on a dual Opteron, | ||
197 | * but I suspect the numbers are actually somewhat worse -AK] | ||
198 | */ | ||
199 | |||
200 | #define MASTER 0 | ||
201 | #define SLAVE (SMP_CACHE_BYTES/8) | ||
202 | |||
203 | /* Intentionally don't use cpu_relax() during TSC synchronization | ||
204 | because we don't want to go into funky power save modes or cause | ||
205 | hypervisors to schedule us away. Going to sleep would likely affect | ||
206 | latency and low latency is the primary objective here. -AK */ | ||
207 | #define no_cpu_relax() barrier() | ||
208 | |||
209 | static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); | ||
210 | static volatile __cpuinitdata unsigned long go[SLAVE + 1]; | ||
211 | static int notscsync __cpuinitdata; | ||
212 | |||
213 | #undef DEBUG_TSC_SYNC | ||
214 | |||
215 | #define NUM_ROUNDS 64 /* magic value */ | ||
216 | #define NUM_ITERS 5 /* likewise */ | ||
217 | |||
218 | /* Callback on boot CPU */ | ||
219 | static __cpuinit void sync_master(void *arg) | ||
220 | { | ||
221 | unsigned long flags, i; | ||
222 | |||
223 | go[MASTER] = 0; | ||
224 | |||
225 | local_irq_save(flags); | ||
226 | { | ||
227 | for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { | ||
228 | while (!go[MASTER]) | ||
229 | no_cpu_relax(); | ||
230 | go[MASTER] = 0; | ||
231 | rdtscll(go[SLAVE]); | ||
232 | } | ||
233 | } | ||
234 | local_irq_restore(flags); | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * Return the number of cycles by which our tsc differs from the tsc | ||
239 | * on the master (time-keeper) CPU. A positive number indicates our | ||
240 | * tsc is ahead of the master, negative that it is behind. | ||
241 | */ | ||
242 | static inline long | ||
243 | get_delta(long *rt, long *master) | ||
244 | { | ||
245 | unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; | ||
246 | unsigned long tcenter, t0, t1, tm; | ||
247 | int i; | ||
248 | |||
249 | for (i = 0; i < NUM_ITERS; ++i) { | ||
250 | rdtscll(t0); | ||
251 | go[MASTER] = 1; | ||
252 | while (!(tm = go[SLAVE])) | ||
253 | no_cpu_relax(); | ||
254 | go[SLAVE] = 0; | ||
255 | rdtscll(t1); | ||
256 | |||
257 | if (t1 - t0 < best_t1 - best_t0) | ||
258 | best_t0 = t0, best_t1 = t1, best_tm = tm; | ||
259 | } | ||
260 | |||
261 | *rt = best_t1 - best_t0; | ||
262 | *master = best_tm - best_t0; | ||
263 | |||
264 | /* average best_t0 and best_t1 without overflow: */ | ||
265 | tcenter = (best_t0/2 + best_t1/2); | ||
266 | if (best_t0 % 2 + best_t1 % 2 == 2) | ||
267 | ++tcenter; | ||
268 | return tcenter - best_tm; | ||
269 | } | ||
270 | |||
271 | static __cpuinit void sync_tsc(unsigned int master) | ||
272 | { | ||
273 | int i, done = 0; | ||
274 | long delta, adj, adjust_latency = 0; | ||
275 | unsigned long flags, rt, master_time_stamp, bound; | ||
276 | #ifdef DEBUG_TSC_SYNC | ||
277 | static struct syncdebug { | ||
278 | long rt; /* roundtrip time */ | ||
279 | long master; /* master's timestamp */ | ||
280 | long diff; /* difference between midpoint and master's timestamp */ | ||
281 | long lat; /* estimate of tsc adjustment latency */ | ||
282 | } t[NUM_ROUNDS] __cpuinitdata; | ||
283 | #endif | ||
284 | |||
285 | printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", | ||
286 | smp_processor_id(), master); | ||
287 | |||
288 | go[MASTER] = 1; | ||
289 | |||
290 | /* It is dangerous to broadcast an IPI while CPUs are coming up, | ||
291 | * as they may not be ready to accept it. Since we only | ||
292 | * need to reach the boot CPU, send the IPI directly to it | ||
293 | * and avoid the race. | ||
294 | */ | ||
295 | smp_call_function_single(master, sync_master, NULL, 1, 0); | ||
296 | |||
297 | while (go[MASTER]) /* wait for master to be ready */ | ||
298 | no_cpu_relax(); | ||
299 | |||
300 | spin_lock_irqsave(&tsc_sync_lock, flags); | ||
301 | { | ||
302 | for (i = 0; i < NUM_ROUNDS; ++i) { | ||
303 | delta = get_delta(&rt, &master_time_stamp); | ||
304 | if (delta == 0) { | ||
305 | done = 1; /* let's lock on to this... */ | ||
306 | bound = rt; | ||
307 | } | ||
308 | |||
309 | if (!done) { | ||
310 | unsigned long t; | ||
311 | if (i > 0) { | ||
312 | adjust_latency += -delta; | ||
313 | adj = -delta + adjust_latency/4; | ||
314 | } else | ||
315 | adj = -delta; | ||
316 | |||
317 | rdtscll(t); | ||
318 | wrmsrl(MSR_IA32_TSC, t + adj); | ||
319 | } | ||
320 | #ifdef DEBUG_TSC_SYNC | ||
321 | t[i].rt = rt; | ||
322 | t[i].master = master_time_stamp; | ||
323 | t[i].diff = delta; | ||
324 | t[i].lat = adjust_latency/4; | ||
325 | #endif | ||
326 | } | ||
327 | } | ||
328 | spin_unlock_irqrestore(&tsc_sync_lock, flags); | ||
329 | |||
330 | #ifdef DEBUG_TSC_SYNC | ||
331 | for (i = 0; i < NUM_ROUNDS; ++i) | ||
332 | printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", | ||
333 | t[i].rt, t[i].master, t[i].diff, t[i].lat); | ||
334 | #endif | ||
335 | |||
336 | printk(KERN_INFO | ||
337 | "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " | ||
338 | "maxerr %lu cycles)\n", | ||
339 | smp_processor_id(), master, delta, rt); | ||
340 | } | ||
341 | |||
342 | static void __cpuinit tsc_sync_wait(void) | ||
343 | { | ||
344 | /* | ||
345 | * When the CPU has synchronized TSCs assume the BIOS | ||
346 | * or the hardware already synced. Otherwise we could | ||
347 | * mess up a possible perfect synchronization with a | ||
348 | * not-quite-perfect algorithm. | ||
349 | */ | ||
350 | if (notscsync || !cpu_has_tsc || !unsynchronized_tsc()) | ||
351 | return; | ||
352 | sync_tsc(0); | ||
353 | } | ||
354 | |||
355 | static __init int notscsync_setup(char *s) | ||
356 | { | ||
357 | notscsync = 1; | ||
358 | return 1; | ||
359 | } | ||
360 | __setup("notscsync", notscsync_setup); | ||
361 | |||
362 | static atomic_t init_deasserted __cpuinitdata; | 151 | static atomic_t init_deasserted __cpuinitdata; |
363 | 152 | ||
364 | /* | 153 | /* |
@@ -546,6 +335,11 @@ void __cpuinit start_secondary(void) | |||
546 | /* otherwise gcc will move up the smp_processor_id before the cpu_init */ | 335 | /* otherwise gcc will move up the smp_processor_id before the cpu_init */ |
547 | barrier(); | 336 | barrier(); |
548 | 337 | ||
338 | /* | ||
339 | * Check TSC sync first: | ||
340 | */ | ||
341 | check_tsc_sync_target(); | ||
342 | |||
549 | Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); | 343 | Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); |
550 | setup_secondary_APIC_clock(); | 344 | setup_secondary_APIC_clock(); |
551 | 345 | ||
@@ -565,14 +359,6 @@ void __cpuinit start_secondary(void) | |||
565 | */ | 359 | */ |
566 | set_cpu_sibling_map(smp_processor_id()); | 360 | set_cpu_sibling_map(smp_processor_id()); |
567 | 361 | ||
568 | /* | ||
569 | * Wait for TSC sync to not schedule things before. | ||
570 | * We still process interrupts, which could see an inconsistent | ||
571 | * time in that window unfortunately. | ||
572 | * Do this here because TSC sync has global unprotected state. | ||
573 | */ | ||
574 | tsc_sync_wait(); | ||
575 | |||
576 | /* | 362 | /* |
577 | * We need to hold call_lock, so there is no inconsistency | 363 | * We need to hold call_lock, so there is no inconsistency |
578 | * between the time smp_call_function() determines number of | 364 | * between the time smp_call_function() determines number of |
@@ -592,6 +378,7 @@ void __cpuinit start_secondary(void) | |||
592 | cpu_set(smp_processor_id(), cpu_online_map); | 378 | cpu_set(smp_processor_id(), cpu_online_map); |
593 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; | 379 | per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; |
594 | spin_unlock(&vector_lock); | 380 | spin_unlock(&vector_lock); |
381 | |||
595 | unlock_ipi_call_lock(); | 382 | unlock_ipi_call_lock(); |
596 | 383 | ||
597 | cpu_idle(); | 384 | cpu_idle(); |
@@ -1168,6 +955,11 @@ int __cpuinit __cpu_up(unsigned int cpu) | |||
1168 | /* Unleash the CPU! */ | 955 | /* Unleash the CPU! */ |
1169 | Dprintk("waiting for cpu %d\n", cpu); | 956 | Dprintk("waiting for cpu %d\n", cpu); |
1170 | 957 | ||
958 | /* | ||
959 | * Make sure and check TSC sync: | ||
960 | */ | ||
961 | check_tsc_sync_source(cpu); | ||
962 | |||
1171 | while (!cpu_isset(cpu, cpu_online_map)) | 963 | while (!cpu_isset(cpu, cpu_online_map)) |
1172 | cpu_relax(); | 964 | cpu_relax(); |
1173 | 965 | ||
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 3cc6886f1fb7..8cb2b2d35f5d 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c | |||
@@ -944,12 +944,23 @@ void __init time_init(void) | |||
944 | #endif | 944 | #endif |
945 | } | 945 | } |
946 | 946 | ||
947 | static int tsc_unstable = 0; | ||
948 | |||
949 | void mark_tsc_unstable(void) | ||
950 | { | ||
951 | tsc_unstable = 1; | ||
952 | } | ||
953 | EXPORT_SYMBOL_GPL(mark_tsc_unstable); | ||
954 | |||
947 | /* | 955 | /* |
948 | * Make an educated guess if the TSC is trustworthy and synchronized | 956 | * Make an educated guess if the TSC is trustworthy and synchronized |
949 | * over all CPUs. | 957 | * over all CPUs. |
950 | */ | 958 | */ |
951 | __cpuinit int unsynchronized_tsc(void) | 959 | __cpuinit int unsynchronized_tsc(void) |
952 | { | 960 | { |
961 | if (tsc_unstable) | ||
962 | return 1; | ||
963 | |||
953 | #ifdef CONFIG_SMP | 964 | #ifdef CONFIG_SMP |
954 | if (apic_is_clustered_box()) | 965 | if (apic_is_clustered_box()) |
955 | return 1; | 966 | return 1; |
diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c new file mode 100644 index 000000000000..014f0db45dfa --- /dev/null +++ b/arch/x86_64/kernel/tsc_sync.c | |||
@@ -0,0 +1,187 @@ | |||
1 | /* | ||
2 | * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization. | ||
3 | * | ||
4 | * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar | ||
5 | * | ||
6 | * We check whether all boot CPUs have their TSCs synchronized, | |||
7 | * print a warning if not and turn off the TSC clock-source. | ||
8 | * | ||
9 | * The warp-check is point-to-point between two CPUs, the CPU | ||
10 | * initiating the bootup is the 'source CPU', the freshly booting | ||
11 | * CPU is the 'target CPU'. | ||
12 | * | ||
13 | * Only two CPUs may participate - they can enter in any order. | ||
14 | * ( The serial nature of the boot logic and the CPU hotplug lock | ||
15 | * protect against more than 2 CPUs entering this code. ) | |||
16 | */ | ||
17 | #include <linux/spinlock.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/smp.h> | ||
21 | #include <linux/nmi.h> | ||
22 | #include <asm/tsc.h> | ||
23 | |||
24 | /* | ||
25 | * Entry/exit counters that make sure that both CPUs | ||
26 | * run the measurement code at once: | ||
27 | */ | ||
28 | static __cpuinitdata atomic_t start_count; | ||
29 | static __cpuinitdata atomic_t stop_count; | ||
30 | |||
31 | /* | ||
32 | * We use a raw spinlock in this exceptional case, because | ||
33 | * we want to have the fastest, inlined, non-debug version | ||
34 | * of a critical section, to be able to prove TSC time-warps: | ||
35 | */ | ||
36 | static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; | ||
37 | static __cpuinitdata cycles_t last_tsc; | ||
38 | static __cpuinitdata cycles_t max_warp; | ||
39 | static __cpuinitdata int nr_warps; | ||
40 | |||
41 | /* | ||
42 | * TSC-warp measurement loop running on both CPUs: | ||
43 | */ | ||
44 | static __cpuinit void check_tsc_warp(void) | ||
45 | { | ||
46 | cycles_t start, now, prev, end; | ||
47 | int i; | ||
48 | |||
49 | start = get_cycles_sync(); | ||
50 | /* | ||
51 | * The measurement runs for 20 msecs: | ||
52 | */ | ||
53 | end = start + cpu_khz * 20ULL; | ||
54 | now = start; | ||
55 | |||
56 | for (i = 0; ; i++) { | ||
57 | /* | ||
58 | * We take the global lock, measure TSC, save the | ||
59 | * previous TSC that was measured (possibly on | ||
60 | * another CPU) and update the previous TSC timestamp. | ||
61 | */ | ||
62 | __raw_spin_lock(&sync_lock); | ||
63 | prev = last_tsc; | ||
64 | now = get_cycles_sync(); | ||
65 | last_tsc = now; | ||
66 | __raw_spin_unlock(&sync_lock); | ||
67 | |||
68 | /* | ||
69 | * Be nice every now and then (and also check whether | ||
70 | * measurement is done [we also insert a 100 million | ||
71 | * loops safety exit, so we dont lock up in case the | ||
72 | * TSC readout is totally broken]): | ||
73 | */ | ||
74 | if (unlikely(!(i & 7))) { | ||
75 | if (now > end || i > 100000000) | ||
76 | break; | ||
77 | cpu_relax(); | ||
78 | touch_nmi_watchdog(); | ||
79 | } | ||
80 | /* | ||
81 | * Outside the critical section we can now see whether | ||
82 | * we saw a time-warp of the TSC going backwards: | ||
83 | */ | ||
84 | if (unlikely(prev > now)) { | ||
85 | __raw_spin_lock(&sync_lock); | ||
86 | max_warp = max(max_warp, prev - now); | ||
87 | nr_warps++; | ||
88 | __raw_spin_unlock(&sync_lock); | ||
89 | } | ||
90 | |||
91 | } | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * Source CPU calls into this - it waits for the freshly booted | ||
96 | * target CPU to arrive and then starts the measurement: | ||
97 | */ | ||
98 | void __cpuinit check_tsc_sync_source(int cpu) | ||
99 | { | ||
100 | int cpus = 2; | ||
101 | |||
102 | /* | ||
103 | * No need to check if we already know that the TSC is not | ||
104 | * synchronized: | ||
105 | */ | ||
106 | if (unsynchronized_tsc()) | ||
107 | return; | ||
108 | |||
109 | printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", | ||
110 | smp_processor_id(), cpu); | ||
111 | |||
112 | /* | ||
113 | * Reset it - in case this is a second bootup: | ||
114 | */ | ||
115 | atomic_set(&stop_count, 0); | ||
116 | |||
117 | /* | ||
118 | * Wait for the target to arrive: | ||
119 | */ | ||
120 | while (atomic_read(&start_count) != cpus-1) | ||
121 | cpu_relax(); | ||
122 | /* | ||
123 | * Trigger the target to continue into the measurement too: | ||
124 | */ | ||
125 | atomic_inc(&start_count); | ||
126 | |||
127 | check_tsc_warp(); | ||
128 | |||
129 | while (atomic_read(&stop_count) != cpus-1) | ||
130 | cpu_relax(); | ||
131 | |||
132 | /* | ||
133 | * Reset it - just in case we boot another CPU later: | ||
134 | */ | ||
135 | atomic_set(&start_count, 0); | ||
136 | |||
137 | if (nr_warps) { | ||
138 | printk("\n"); | ||
139 | printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," | ||
140 | " turning off TSC clock.\n", max_warp); | ||
141 | mark_tsc_unstable(); | ||
142 | nr_warps = 0; | ||
143 | max_warp = 0; | ||
144 | last_tsc = 0; | ||
145 | } else { | ||
146 | printk(" passed.\n"); | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * Let the target continue with the bootup: | ||
151 | */ | ||
152 | atomic_inc(&stop_count); | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * Freshly booted CPUs call into this: | ||
157 | */ | ||
158 | void __cpuinit check_tsc_sync_target(void) | ||
159 | { | ||
160 | int cpus = 2; | ||
161 | |||
162 | if (unsynchronized_tsc()) | ||
163 | return; | ||
164 | |||
165 | /* | ||
166 | * Register this CPU's participation and wait for the | ||
167 | * source CPU to start the measurement: | ||
168 | */ | ||
169 | atomic_inc(&start_count); | ||
170 | while (atomic_read(&start_count) != cpus) | ||
171 | cpu_relax(); | ||
172 | |||
173 | check_tsc_warp(); | ||
174 | |||
175 | /* | ||
176 | * Ok, we are done: | ||
177 | */ | ||
178 | atomic_inc(&stop_count); | ||
179 | |||
180 | /* | ||
181 | * Wait for the source CPU to print stuff: | ||
182 | */ | ||
183 | while (atomic_read(&stop_count) != cpus) | ||
184 | cpu_relax(); | ||
185 | } | ||
186 | #undef NR_LOOPS | ||
187 | |||