diff options
author | Phil Carmody <ext-phil.2.carmody@nokia.com> | 2011-03-22 19:34:13 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-22 20:44:11 -0400 |
commit | 191e56880a6a638ce931859317f37deb084b6433 (patch) | |
tree | 26853fa62983f12b85badda6b9ee2197c2f10697 | |
parent | 71c696b1d0310da3ab8033d743282959bd49d28b (diff) |
calibrate: home in on correct lpj value more quickly
Binary chop with a jiffy-resync on each step to find an upper bound is
slow, so just race in a tight-ish loop to find an underestimate.
If done with lots of individual steps, sometimes several hundred
iterations would be required, which would impose a significant overhead
and make the initial estimate very low. Taking slowly increasing steps
reduces that overhead.
E.g. an x86_64 2.67GHz machine could have fitted in 613 individual small
delays, but in reality should have been able to fit in a single delay 644
times longer, so it underestimated by 31 steps. To reach the equivalent of
644 small delays, the accelerating scheme now requires about 130
iterations, so it has less than 1/4 of the overhead, and can therefore be
expected to underestimate by only 7 steps.
Now that we have a better initial estimate, we can binary chop over a smaller
range. With the loop overhead in the initial estimate kept low, and the
step sizes moderate, we won't have under-estimated by much, so choose as
tight a range as we can.
Signed-off-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Tested-by: Stephen Boyd <sboyd@codeaurora.org>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | init/calibrate.c | 57 |
1 files changed, 34 insertions, 23 deletions
diff --git a/init/calibrate.c b/init/calibrate.c index b71643a7acae..f9000dfbe227 100644 --- a/init/calibrate.c +++ b/init/calibrate.c | |||
@@ -110,8 +110,8 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} | |||
110 | 110 | ||
111 | /* | 111 | /* |
112 | * This is the number of bits of precision for the loops_per_jiffy. Each | 112 | * This is the number of bits of precision for the loops_per_jiffy. Each |
113 | * bit takes on average 1.5/HZ seconds. This (like the original) is a little | 113 | * time we refine our estimate after the first takes 1.5/HZ seconds, so try |
114 | * better than 1% | 114 | * to start with a good estimate. |
115 | * For the boot cpu we can skip the delay calibration and assign it a value | 115 | * For the boot cpu we can skip the delay calibration and assign it a value |
116 | * calculated based on the timer frequency. | 116 | * calculated based on the timer frequency. |
117 | * For the rest of the CPUs we cannot assume that the timer frequency is same as | 117 | * For the rest of the CPUs we cannot assume that the timer frequency is same as |
@@ -121,38 +121,49 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} | |||
121 | 121 | ||
122 | static unsigned long __cpuinit calibrate_delay_converge(void) | 122 | static unsigned long __cpuinit calibrate_delay_converge(void) |
123 | { | 123 | { |
124 | unsigned long lpj, ticks, loopbit; | 124 | /* First stage - slowly accelerate to find initial bounds */ |
125 | int lps_precision = LPS_PREC; | 125 | unsigned long lpj, ticks, loopadd, chop_limit; |
126 | int trials = 0, band = 0, trial_in_band = 0; | ||
126 | 127 | ||
127 | lpj = (1<<12); | 128 | lpj = (1<<12); |
128 | while ((lpj <<= 1) != 0) { | 129 | |
129 | /* wait for "start of" clock tick */ | 130 | /* wait for "start of" clock tick */ |
130 | ticks = jiffies; | 131 | ticks = jiffies; |
131 | while (ticks == jiffies) | 132 | while (ticks == jiffies) |
132 | /* nothing */; | 133 | ; /* nothing */ |
133 | /* Go .. */ | 134 | /* Go .. */ |
134 | ticks = jiffies; | 135 | ticks = jiffies; |
135 | __delay(lpj); | 136 | do { |
136 | ticks = jiffies - ticks; | 137 | if (++trial_in_band == (1<<band)) { |
137 | if (ticks) | 138 | ++band; |
138 | break; | 139 | trial_in_band = 0; |
139 | } | 140 | } |
141 | __delay(lpj * band); | ||
142 | trials += band; | ||
143 | } while (ticks == jiffies); | ||
144 | /* | ||
145 | * We overshot, so retreat to a clear underestimate. Then estimate | ||
146 | * the largest likely undershoot. This defines our chop bounds. | ||
147 | */ | ||
148 | trials -= band; | ||
149 | loopadd = lpj * band; | ||
150 | lpj *= trials; | ||
151 | chop_limit = lpj >> (LPS_PREC + 1); | ||
140 | 152 | ||
141 | /* | 153 | /* |
142 | * Do a binary approximation to get lpj set to | 154 | * Do a binary approximation to get lpj set to |
143 | * equal one clock (up to lps_precision bits) | 155 | * equal one clock (up to LPS_PREC bits) |
144 | */ | 156 | */ |
145 | lpj >>= 1; | 157 | while (loopadd > chop_limit) { |
146 | loopbit = lpj; | 158 | lpj += loopadd; |
147 | while (lps_precision-- && (loopbit >>= 1)) { | ||
148 | lpj |= loopbit; | ||
149 | ticks = jiffies; | 159 | ticks = jiffies; |
150 | while (ticks == jiffies) | 160 | while (ticks == jiffies) |
151 | /* nothing */; | 161 | ; /* nothing */ |
152 | ticks = jiffies; | 162 | ticks = jiffies; |
153 | __delay(lpj); | 163 | __delay(lpj); |
154 | if (jiffies != ticks) /* longer than 1 tick */ | 164 | if (jiffies != ticks) /* longer than 1 tick */ |
155 | lpj &= ~loopbit; | 165 | lpj -= loopadd; |
166 | loopadd >>= 1; | ||
156 | } | 167 | } |
157 | 168 | ||
158 | return lpj; | 169 | return lpj; |