aboutsummaryrefslogtreecommitdiffstats
path: root/init
diff options
context:
space:
mode:
author: Andrew Worsley <amworsley@gmail.com> 2011-05-24 20:13:15 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2011-05-25 11:39:46 -0400
commit: d2b463135f84d15808163cd15638b108e323d3e7 (patch)
tree: 35d8af01b9403635235220891beba5c835a91086 /init
parent: 1dbe39424a43e56a6c9aed12661192af51dcdb9f (diff)
init/calibrate.c: fix for critical bogoMIPS intermittent calculation failure
A fix to the TSC (Time Stamp Counter) based bogoMIPS calculation used on secondary CPUs, which has two faults: 1: Not handling wrapping of the lower 32 bits of the TSC counter on a 32-bit kernel - perhaps the TSC is not reset by a warm reset? 2: TSC and jiffies are not incrementing together properly. Either jiffies increment too quickly, or the Time Stamp Counter isn't incremented during an SMI but the real time clock is and jiffies are incremented. Case 1 can result in a factor of 16 too large a value, which makes udelay() values too small and can cause mysterious driver errors. Case 2 appears to give smaller 10-15% errors after averaging, but enough to cause occasional failures on my own board. I have tested this code on my own branch and attach a patch suitable for current kernel code. See below for examples of the failures and how the fix handles these situations now. I reported this issue earlier here: Intermittent problem with BogoMIPs calculation on Intel AP CPUs - http://marc.info/?l=linux-kernel&m=129947246316875&w=4 I suspect this issue has been seen by others, but as it is intermittent and bogoMIPS for secondary CPUs are no longer printed out it might have been difficult to identify this as the cause. Perhaps these unresolved issues, although quite old, might be relevant, as possibly this fault has been around for a while. In particular Case 1 may only be relevant to 32-bit kernels on newer HW (most people run 64-bit kernels?). Case 2 is less dramatic since the earlier fix in this area and is also intermittent. 
Re: bogomips discrepancy on Intel Core2 Quad CPU - http://marc.info/?l=linux-kernel&m=118929277524298&w=4 slow system and bogus bogomips - http://marc.info/?l=linux-kernel&m=116791286716107&w=4 Re: Re: [RFC-PATCH] clocksource: update lpj if clocksource has - http://marc.info/?l=linux-kernel&m=128952775819467&w=4 This issue is masked a little by commit feae3203d711db0a ("timers, init: Limit the number of per cpu calibration bootup messages") which only prints out the first bogoMIPS value, making it much harder to notice other values differing. Perhaps it should be changed to only suppress them when they are similar values? Here are some outputs showing faults occurring and the new code handling them properly. See my earlier message for examples of the original failure. Case 1: A Time Stamp Counter wrap: ... Calibrating delay loop (skipped), value calculated using timer frequency.. 6332.70 BogoMIPS (lpj=31663540) .... calibrate_delay_direct() timer_rate_max=31666493 timer_rate_min=31666151 pre_start=4170369255 pre_end=4202035539 calibrate_delay_direct() timer_rate_max=2425955274 timer_rate_min=2425954941 pre_start=4265368533 pre_end=2396356387 calibrate_delay_direct() ignoring timer_rate as we had a TSC wrap around start=4265368581 >=post_end=2396356511 calibrate_delay_direct() timer_rate_max=31666274 timer_rate_min=31665942 pre_start=2440373374 pre_end=2472039515 calibrate_delay_direct() timer_rate_max=31666492 timer_rate_min=31666160 pre_start=2535372139 pre_end=2567038422 calibrate_delay_direct() timer_rate_max=31666455 timer_rate_min=31666207 pre_start=2630371084 pre_end=2662037415 Calibrating delay using timer specific routine.. 6333.28 BogoMIPS (lpj=31666428) Total of 2 processors activated (12665.99 BogoMIPS). .... Case 2: Something (presumably the SMM interrupt?) causing a very low increase in the TSC counter for the DELAY_CALIBRATION_TICKS increase in jiffies ... Calibrating delay loop (skipped), value calculated using timer frequency.. 
6333.25 BogoMIPS (lpj=31666270) ... calibrate_delay_direct() timer_rate_max=31666483 timer_rate_min=31666074 pre_start=4199536526 pre_end=4231202809 calibrate_delay_direct() timer_rate_max=864348 timer_rate_min=864016 pre_start=2405343672 pre_end=2406207897 calibrate_delay_direct() timer_rate_max=31666483 timer_rate_min=31666179 pre_start=2469540464 pre_end=2501206823 calibrate_delay_direct() timer_rate_max=31666511 timer_rate_min=31666122 pre_start=2564539400 pre_end=2596205712 calibrate_delay_direct() timer_rate_max=31666084 timer_rate_min=31665685 pre_start=2659538782 pre_end=2691204657 calibrate_delay_direct() dropping min bogoMips estimate 1 = 864348 Calibrating delay using timer specific routine.. 6333.27 BogoMIPS (lpj=31666390) Total of 2 processors activated (12666.53 BogoMIPS). ... After 70 boots I saw 2 variations <1% slip through [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix straggly printk mess] Signed-off-by: Andrew Worsley <amworsley@gmail.com> Reviewed-by: Phil Carmody <ext-phil.2.carmody@nokia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'init')
-rw-r--r-- init/calibrate.c 75
1 files changed, 69 insertions, 6 deletions
diff --git a/init/calibrate.c b/init/calibrate.c
index 76ac9194cbc4..cfd7000c9d71 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -38,6 +38,9 @@ static unsigned long __cpuinit calibrate_delay_direct(void)
38 unsigned long timer_rate_min, timer_rate_max; 38 unsigned long timer_rate_min, timer_rate_max;
39 unsigned long good_timer_sum = 0; 39 unsigned long good_timer_sum = 0;
40 unsigned long good_timer_count = 0; 40 unsigned long good_timer_count = 0;
41 unsigned long measured_times[MAX_DIRECT_CALIBRATION_RETRIES];
42 int max = -1; /* index of measured_times with max/min values or not set */
43 int min = -1;
41 int i; 44 int i;
42 45
43 if (read_current_timer(&pre_start) < 0 ) 46 if (read_current_timer(&pre_start) < 0 )
@@ -90,18 +93,78 @@ static unsigned long __cpuinit calibrate_delay_direct(void)
90 * If the upper limit and lower limit of the timer_rate is 93 * If the upper limit and lower limit of the timer_rate is
91 * >= 12.5% apart, redo calibration. 94 * >= 12.5% apart, redo calibration.
92 */ 95 */
93 if (pre_start != 0 && pre_end != 0 && 96 printk(KERN_DEBUG "calibrate_delay_direct() timer_rate_max=%lu "
97 "timer_rate_min=%lu pre_start=%lu pre_end=%lu\n",
98 timer_rate_max, timer_rate_min, pre_start, pre_end);
99 if (start >= post_end)
100 printk(KERN_NOTICE "calibrate_delay_direct() ignoring "
101 "timer_rate as we had a TSC wrap around"
102 " start=%lu >=post_end=%lu\n",
103 start, post_end);
104 if (start < post_end && pre_start != 0 && pre_end != 0 &&
94 (timer_rate_max - timer_rate_min) < (timer_rate_max >> 3)) { 105 (timer_rate_max - timer_rate_min) < (timer_rate_max >> 3)) {
95 good_timer_count++; 106 good_timer_count++;
96 good_timer_sum += timer_rate_max; 107 good_timer_sum += timer_rate_max;
97 } 108 measured_times[i] = timer_rate_max;
109 if (max < 0 || timer_rate_max > measured_times[max])
110 max = i;
111 if (min < 0 || timer_rate_max < measured_times[min])
112 min = i;
113 } else
114 measured_times[i] = 0;
115
98 } 116 }
99 117
100 if (good_timer_count) 118 /*
101 return (good_timer_sum/good_timer_count); 119 * Find the maximum & minimum - if they differ too much throw out the
120 * one with the largest difference from the mean and try again...
121 */
122 while (good_timer_count > 1) {
123 unsigned long estimate;
124 unsigned long maxdiff;
125
126 /* compute the estimate */
127 estimate = (good_timer_sum/good_timer_count);
128 maxdiff = estimate >> 3;
129
130 /* if range is within 12% let's take it */
131 if ((measured_times[max] - measured_times[min]) < maxdiff)
132 return estimate;
133
134 /* ok - drop the worse value and try again... */
135 good_timer_sum = 0;
136 good_timer_count = 0;
137 if ((measured_times[max] - estimate) <
138 (estimate - measured_times[min])) {
139 printk(KERN_NOTICE "calibrate_delay_direct() dropping "
140 "min bogoMips estimate %d = %lu\n",
141 min, measured_times[min]);
142 measured_times[min] = 0;
143 min = max;
144 } else {
145 printk(KERN_NOTICE "calibrate_delay_direct() dropping "
146 "max bogoMips estimate %d = %lu\n",
147 max, measured_times[max]);
148 measured_times[max] = 0;
149 max = min;
150 }
151
152 for (i = 0; i < MAX_DIRECT_CALIBRATION_RETRIES; i++) {
153 if (measured_times[i] == 0)
154 continue;
155 good_timer_count++;
156 good_timer_sum += measured_times[i];
157 if (measured_times[i] < measured_times[min])
158 min = i;
159 if (measured_times[i] > measured_times[max])
160 max = i;
161 }
162
163 }
102 164
103 printk(KERN_WARNING "calibrate_delay_direct() failed to get a good " 165 printk(KERN_NOTICE "calibrate_delay_direct() failed to get a good "
104 "estimate for loops_per_jiffy.\nProbably due to long platform interrupts. Consider using \"lpj=\" boot option.\n"); 166 "estimate for loops_per_jiffy.\nProbably due to long platform "
167 "interrupts. Consider using \"lpj=\" boot option.\n");
105 return 0; 168 return 0;
106} 169}
107#else 170#else