diff options
author | Denis Vlasenko <vda.linux@googlemail.com> | 2007-07-16 02:41:56 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-16 12:05:52 -0400 |
commit | 4277eedd7908a0ca8b66fad46ee76b0ad96e6ef2 (patch) | |
tree | 88780b40c23883af5e9958a7f397f23ff5619ff7 /lib | |
parent | b39a734097d5095d63eb9c709a6aaf965633bb01 (diff) |
vsprintf.c: optimizing, part 2: base 10 conversion speedup, v2
Optimize integer-to-string conversion in vsprintf.c for base 10. This is
by far the most used conversion, and in some use cases it impacts
performance. For example, top reads /proc/$PID/stat for every process, and
with 4000 processes decimal conversion alone takes noticeable time.
Using code from
http://www.cs.uiowa.edu/~jones/bcd/decimal.html
(with permission from the author, Douglas W. Jones)
binary-to-decimal-string conversion is done in groups of five digits at
once, using only additions/subtractions/shifts (with -O2; -Os throws in
some multiply instructions).
On i386 arch gcc 4.1.2 -O2 generates ~500 bytes of code.
This patch is run tested. Userspace benchmark/test is also attached.
I tested it on PIII and AMD64 and new code is generally ~2.5 times
faster. On AMD64:
# ./vsprintf_verify-O2
Original decimal conv: .......... 151 ns per iteration
Patched decimal conv: .......... 62 ns per iteration
Testing correctness
12895992590592 ok... [Ctrl-C]
# ./vsprintf_verify-O2
Original decimal conv: .......... 151 ns per iteration
Patched decimal conv: .......... 62 ns per iteration
Testing correctness
26025406464 ok... [Ctrl-C]
More realistic test: top from busybox project was modified to
report how many us it took to scan /proc (this does not account
any processing done after that, like sorting process list),
and then I test it with 4000 processes:
#!/bin/sh
i=4000
while test $i != 0; do
sleep 30 &
let i--
done
busybox top -b -n3 >/dev/null
on unpatched kernel:
top: 4120 processes took 102864 microseconds to scan
top: 4120 processes took 91757 microseconds to scan
top: 4120 processes took 92517 microseconds to scan
top: 4120 processes took 92581 microseconds to scan
on patched kernel:
top: 4120 processes took 75460 microseconds to scan
top: 4120 processes took 66451 microseconds to scan
top: 4120 processes took 67267 microseconds to scan
top: 4120 processes took 67618 microseconds to scan
The speedup comes from much faster generation of /proc/PID/stat
by sprintf() calls inside the kernel.
Signed-off-by: Douglas W Jones <jones@cs.uiowa.edu>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'lib')
-rw-r--r-- | lib/vsprintf.c | 108 |
1 files changed, 105 insertions, 3 deletions
diff --git a/lib/vsprintf.c b/lib/vsprintf.c index e94b4bd25bc5..6b6734df6d2d 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c | |||
@@ -135,6 +135,103 @@ static int skip_atoi(const char **s) | |||
135 | return i; | 135 | return i; |
136 | } | 136 | } |
137 | 137 | ||
138 | /* Decimal conversion is by far the most typical, and is used | ||
139 | * for /proc and /sys data. This directly impacts e.g. top performance | ||
140 | * with many processes running. We optimize it for speed | ||
141 | * using code from | ||
142 | * http://www.cs.uiowa.edu/~jones/bcd/decimal.html | ||
143 | * (with permission from the author, Douglas W. Jones). */ | ||
144 | |||
145 | /* Formats correctly any integer in [0,99999]. | ||
146 | * Outputs from one to five digits depending on input. | ||
147 | * On i386 gcc 4.1.2 -O2: ~250 bytes of code. */ | ||
148 | static char* put_dec_trunc(char *buf, unsigned q) | ||
149 | { | ||
150 | unsigned d3, d2, d1, d0; | ||
151 | d1 = (q>>4) & 0xf; | ||
152 | d2 = (q>>8) & 0xf; | ||
153 | d3 = (q>>12); | ||
154 | |||
155 | d0 = 6*(d3 + d2 + d1) + (q & 0xf); | ||
156 | q = (d0 * 0xcd) >> 11; | ||
157 | d0 = d0 - 10*q; | ||
158 | *buf++ = d0 + '0'; /* least significant digit */ | ||
159 | d1 = q + 9*d3 + 5*d2 + d1; | ||
160 | if (d1 != 0) { | ||
161 | q = (d1 * 0xcd) >> 11; | ||
162 | d1 = d1 - 10*q; | ||
163 | *buf++ = d1 + '0'; /* next digit */ | ||
164 | |||
165 | d2 = q + 2*d2; | ||
166 | if ((d2 != 0) || (d3 != 0)) { | ||
167 | q = (d2 * 0xd) >> 7; | ||
168 | d2 = d2 - 10*q; | ||
169 | *buf++ = d2 + '0'; /* next digit */ | ||
170 | |||
171 | d3 = q + 4*d3; | ||
172 | if (d3 != 0) { | ||
173 | q = (d3 * 0xcd) >> 11; | ||
174 | d3 = d3 - 10*q; | ||
175 | *buf++ = d3 + '0'; /* next digit */ | ||
176 | if (q != 0) | ||
177 | *buf++ = q + '0'; /* most sign. digit */ | ||
178 | } | ||
179 | } | ||
180 | } | ||
181 | return buf; | ||
182 | } | ||
183 | /* Same with if's removed. Always emits five digits */ | ||
184 | static char* put_dec_full(char *buf, unsigned q) | ||
185 | { | ||
186 | /* BTW, if q is in [0,9999], 8-bit ints will be enough, */ | ||
187 | /* but anyway, gcc produces better code with full-sized ints */ | ||
188 | unsigned d3, d2, d1, d0; | ||
189 | d1 = (q>>4) & 0xf; | ||
190 | d2 = (q>>8) & 0xf; | ||
191 | d3 = (q>>12); | ||
192 | |||
193 | /* Possible ways to approx. divide by 10 */ | ||
194 | /* gcc -O2 replaces multiply with shifts and adds */ | ||
195 | // (x * 0xcd) >> 11: 11001101 - shorter code than * 0x67 (on i386) | ||
196 | // (x * 0x67) >> 10: 1100111 | ||
197 | // (x * 0x34) >> 9: 110100 - same | ||
198 | // (x * 0x1a) >> 8: 11010 - same | ||
199 | // (x * 0x0d) >> 7: 1101 - same, shortest code (on i386) | ||
200 | |||
201 | d0 = 6*(d3 + d2 + d1) + (q & 0xf); | ||
202 | q = (d0 * 0xcd) >> 11; | ||
203 | d0 = d0 - 10*q; | ||
204 | *buf++ = d0 + '0'; | ||
205 | d1 = q + 9*d3 + 5*d2 + d1; | ||
206 | q = (d1 * 0xcd) >> 11; | ||
207 | d1 = d1 - 10*q; | ||
208 | *buf++ = d1 + '0'; | ||
209 | |||
210 | d2 = q + 2*d2; | ||
211 | q = (d2 * 0xd) >> 7; | ||
212 | d2 = d2 - 10*q; | ||
213 | *buf++ = d2 + '0'; | ||
214 | |||
215 | d3 = q + 4*d3; | ||
216 | q = (d3 * 0xcd) >> 11; /* - shorter code */ | ||
217 | /* q = (d3 * 0x67) >> 10; - would also work */ | ||
218 | d3 = d3 - 10*q; | ||
219 | *buf++ = d3 + '0'; | ||
220 | *buf++ = q + '0'; | ||
221 | return buf; | ||
222 | } | ||
223 | /* No inlining helps gcc to use registers better */ | ||
224 | static noinline char* put_dec(char *buf, unsigned long long num) | ||
225 | { | ||
226 | while (1) { | ||
227 | unsigned rem; | ||
228 | if (num < 100000) | ||
229 | return put_dec_trunc(buf, num); | ||
230 | rem = do_div(num, 100000); | ||
231 | buf = put_dec_full(buf, rem); | ||
232 | } | ||
233 | } | ||
234 | |||
138 | #define ZEROPAD 1 /* pad with zero */ | 235 | #define ZEROPAD 1 /* pad with zero */ |
139 | #define SIGN 2 /* unsigned/signed long */ | 236 | #define SIGN 2 /* unsigned/signed long */ |
140 | #define PLUS 4 /* show plus */ | 237 | #define PLUS 4 /* show plus */ |
@@ -182,6 +279,11 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int | |||
182 | i = 0; | 279 | i = 0; |
183 | if (num == 0) | 280 | if (num == 0) |
184 | tmp[i++] = '0'; | 281 | tmp[i++] = '0'; |
282 | /* Generic code, for any base: | ||
283 | else do { | ||
284 | tmp[i++] = digits[do_div(num,base)]; | ||
285 | } while (num != 0); | ||
286 | */ | ||
185 | else if (base != 10) { /* 8 or 16 */ | 287 | else if (base != 10) { /* 8 or 16 */ |
186 | int mask = base - 1; | 288 | int mask = base - 1; |
187 | int shift = 3; | 289 | int shift = 3; |
@@ -190,9 +292,9 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int | |||
190 | tmp[i++] = digits[((unsigned char)num) & mask]; | 292 | tmp[i++] = digits[((unsigned char)num) & mask]; |
191 | num >>= shift; | 293 | num >>= shift; |
192 | } while (num); | 294 | } while (num); |
193 | } else do { /* generic code, works for any base */ | 295 | } else { /* base 10 */ |
194 | tmp[i++] = digits[do_div(num,10 /*base*/)]; | 296 | i = put_dec(tmp, num) - tmp; |
195 | } while (num); | 297 | } |
196 | 298 | ||
197 | /* printing 100 using %2d gives "100", not "00" */ | 299 | /* printing 100 using %2d gives "100", not "00" */ |
198 | if (i > precision) | 300 | if (i > precision) |