calibrate: home in on correct lpj value more quickly

Binary chop with a jiffy-resync on each step to find an upper bound is slow, so just race in a tight-ish loop to find an underestimate.

If done with lots of individual steps, sometimes several hundreds of iterations would be required, which would impose a significant overhead, and make the initial estimate very low. By taking slowly increasing steps there will be less overhead.

E.g. an x86_64 2.67GHz could have fitted in 613 individual small delays, but in reality should have been able to fit in a single delay 644 times longer, so underestimated by 31 steps. To reach the equivalent of 644 small delays with the accelerating scheme now requires about 130 iterations, so has less than 1/4 of the overhead, and can therefore be expected to underestimate by only 7 steps.

As we now have a better initial estimate we can binary chop over a smaller range. With the loop overhead in the initial estimate kept low, and the step sizes moderate, we won't have under-estimated by much, so choose as tight a range as we can.

Signed-off-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Tested-by: Stephen Boyd <sboyd@codeaurora.org>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Phil Carmody <ext-phil.2.carmody@nokia.com> 2011-03-22 19:34:13 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2011-03-22 20:44:11 -0400
commit: 191e56880a6a638ce931859317f37deb084b6433 (patch)
tree: 26853fa62983f12b85badda6b9ee2197c2f10697 /init/calibrate.c
parent: 71c696b1d0310da3ab8033d743282959bd49d28b (diff)
1 files changed, 34 insertions, 23 deletions
diff --git a/init/calibrate.c b/init/calibrate.c
index b71643a7acae..f9000dfbe227 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -110,8 +110,8 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
 /*
 * This is the number of bits of precision for the loops_per_jiffy.  Each
- * bit takes on average 1.5/HZ seconds.  This (like the original) is a little
+ * time we refine our estimate after the first takes 1.5/HZ seconds, so try
- * better than 1%
+ * to start with a good estimate.
 * For the boot cpu we can skip the delay calibration and assign it a value
 * calculated based on the timer frequency.
 * For the rest of the CPUs we cannot assume that the timer frequency is same as
@@ -121,38 +121,49 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
 static unsigned long __cpuinit calibrate_delay_converge(void)
 {
-        unsigned long lpj, ticks, loopbit;
+        /* First stage - slowly accelerate to find initial bounds */
-        int lps_precision = LPS_PREC;
+        unsigned long lpj, ticks, loopadd, chop_limit;
+        int trials = 0, band = 0, trial_in_band = 0;
        lpj = (1<<12);
-        while ((lpj <<= 1) != 0) {
-                /* wait for "start of" clock tick */
+        /* wait for "start of" clock tick */
-                ticks = jiffies;
+        ticks = jiffies;
-                while (ticks == jiffies)
+        while (ticks == jiffies)
-                        /* nothing */;
+                ; /* nothing */
-                /* Go .. */
+        /* Go .. */
-                ticks = jiffies;
+        ticks = jiffies;
-                __delay(lpj);
+        do {
-                ticks = jiffies - ticks;
+                if (++trial_in_band == (1<<band)) {
-                if (ticks)
+                        ++band;
-                        break;
+                        trial_in_band = 0;
-        }
+                }
+                __delay(lpj * band);
+                trials += band;
+        } while (ticks == jiffies);
+        /*
+         * We overshot, so retreat to a clear underestimate. Then estimate
+         * the largest likely undershoot. This defines our chop bounds.
+         */
+        trials -= band;
+        loopadd = lpj * band;
+        lpj *= trials;
+        chop_limit = lpj >> (LPS_PREC + 1);
        /*
         * Do a binary approximation to get lpj set to
-         * equal one clock (up to lps_precision bits)
+         * equal one clock (up to LPS_PREC bits)
         */
-        lpj >>= 1;
+        while (loopadd > chop_limit) {
-        loopbit = lpj;
+                lpj += loopadd;
-        while (lps_precision-- && (loopbit >>= 1)) {
-                lpj |= loopbit;
                ticks = jiffies;
                while (ticks == jiffies)
-                        /* nothing */;
+                        ; /* nothing */
                ticks = jiffies;
                __delay(lpj);
                if (jiffies != ticks)   /* longer than 1 tick */
-                        lpj &= ~loopbit;
+                        lpj -= loopadd;
+                loopadd >>= 1;
        }
        return lpj;
author	Phil Carmody <ext-phil.2.carmody@nokia.com>	2011-03-22 19:34:13 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-03-22 20:44:11 -0400
commit	191e56880a6a638ce931859317f37deb084b6433 (patch)
tree	26853fa62983f12b85badda6b9ee2197c2f10697 /init/calibrate.c
parent	71c696b1d0310da3ab8033d743282959bd49d28b (diff)

diff --git a/init/calibrate.c b/init/calibrate.c
index b71643a7acae..f9000dfbe227 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -110,8 +110,8 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
110		110
111	/*	111	/*
112	* This is the number of bits of precision for the loops_per_jiffy. Each	112	* This is the number of bits of precision for the loops_per_jiffy. Each
113	* bit takes on average 1.5/HZ seconds. This (like the original) is a little	113	* time we refine our estimate after the first takes 1.5/HZ seconds, so try
114	* better than 1%	114	* to start with a good estimate.
115	* For the boot cpu we can skip the delay calibration and assign it a value	115	* For the boot cpu we can skip the delay calibration and assign it a value
116	* calculated based on the timer frequency.	116	* calculated based on the timer frequency.
117	* For the rest of the CPUs we cannot assume that the timer frequency is same as	117	* For the rest of the CPUs we cannot assume that the timer frequency is same as
@@ -121,38 +121,49 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
121		121
122	static unsigned long __cpuinit calibrate_delay_converge(void)	122	static unsigned long __cpuinit calibrate_delay_converge(void)
123	{	123	{
124	unsigned long lpj, ticks, loopbit;	124	/* First stage - slowly accelerate to find initial bounds */
125	int lps_precision = LPS_PREC;	125	unsigned long lpj, ticks, loopadd, chop_limit;
		126	int trials = 0, band = 0, trial_in_band = 0;
126		127
127	lpj = (1<<12);	128	lpj = (1<<12);
128	while ((lpj <<= 1) != 0) {	129
129	/* wait for "start of" clock tick */	130	/* wait for "start of" clock tick */
130	ticks = jiffies;	131	ticks = jiffies;
131	while (ticks == jiffies)	132	while (ticks == jiffies)
132	/* nothing */;	133	; /* nothing */
133	/* Go .. */	134	/* Go .. */
134	ticks = jiffies;	135	ticks = jiffies;
135	__delay(lpj);	136	do {
136	ticks = jiffies - ticks;	137	if (++trial_in_band == (1<<band)) {
137	if (ticks)	138	++band;
138	break;	139	trial_in_band = 0;
139	}	140	}
		141	__delay(lpj * band);
		142	trials += band;
		143	} while (ticks == jiffies);
		144	/*
		145	* We overshot, so retreat to a clear underestimate. Then estimate
		146	* the largest likely undershoot. This defines our chop bounds.
		147	*/
		148	trials -= band;
		149	loopadd = lpj * band;
		150	lpj *= trials;
		151	chop_limit = lpj >> (LPS_PREC + 1);
140		152
141	/*	153	/*
142	* Do a binary approximation to get lpj set to	154	* Do a binary approximation to get lpj set to
143	* equal one clock (up to lps_precision bits)	155	* equal one clock (up to LPS_PREC bits)
144	*/	156	*/
145	lpj >>= 1;	157	while (loopadd > chop_limit) {
146	loopbit = lpj;	158	lpj += loopadd;
147	while (lps_precision-- && (loopbit >>= 1)) {
148	lpj |= loopbit;
149	ticks = jiffies;	159	ticks = jiffies;
150	while (ticks == jiffies)	160	while (ticks == jiffies)
151	/* nothing */;	161	; /* nothing */
152	ticks = jiffies;	162	ticks = jiffies;
153	__delay(lpj);	163	__delay(lpj);
154	if (jiffies != ticks) /* longer than 1 tick */	164	if (jiffies != ticks) /* longer than 1 tick */
155	lpj &= ~loopbit;	165	lpj -= loopadd;
		166	loopadd >>= 1;
156	}	167	}
157		168
158	return lpj;	169	return lpj;