aboutsummaryrefslogtreecommitdiffstats
path: root/lib/vsprintf.c
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2012-05-31 19:26:08 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-05-31 20:49:27 -0400
commit133fd9f5cda2d86904126f4b9fa4e8f4330c9569 (patch)
tree0e60bfcec85f123243cf1ffe735264527efd6d0e /lib/vsprintf.c
parent725fe002d315c2501c110b7245d3eb4f4535f4d6 (diff)
vsprintf: further optimize decimal conversion
Previous code was using optimizations which were developed to work well even on narrow-word CPUs (by today's standards). But Linux runs only on 32-bit and wider CPUs. We can use that. First: using 32x32->64 multiply and trivial 32-bit shift, we can correctly divide by 10 much larger numbers, and thus we can print groups of 9 digits instead of groups of 5 digits. Next: there are two algorithms to print larger numbers. One is generic: divide by 1000000000 and repeatedly print groups of (up to) 9 digits. It's conceptually simple, but requires an (unsigned long long) / 1000000000 division. Second algorithm splits 64-bit unsigned long long into 16-bit chunks, manipulates them cleverly and generates groups of 4 decimal digits. It so happens that it does NOT require long long division. If long is > 32 bits, division of 64-bit values is relatively easy, and we will use the first algorithm. If long long is > 64 bits (strange architecture with VERY large long long), second algorithm can't be used, and we again use the first one. Else (if long is 32 bits and long long is 64 bits) we use second one. And third: there is a simple optimization which takes fast path not only for zero as was done before, but for all one-digit numbers. In all tested cases new code is faster than old one, in many cases by 30%, in few cases by more than 50% (for example, on x86-32, conversion of 12345678). Code growth is ~0 in 32-bit case and ~130 bytes in 64-bit case. This patch is based upon an original from Michal Nazarewicz. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Michal Nazarewicz <mina86@mina86.com> Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com> Cc: Douglas W Jones <jones@cs.uiowa.edu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'lib/vsprintf.c')
-rw-r--r--lib/vsprintf.c281
1 files changed, 190 insertions, 91 deletions
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index b8fbd275bc46..c3f36d415bdf 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -112,106 +112,199 @@ int skip_atoi(const char **s)
112/* Decimal conversion is by far the most typical, and is used 112/* Decimal conversion is by far the most typical, and is used
113 * for /proc and /sys data. This directly impacts e.g. top performance 113 * for /proc and /sys data. This directly impacts e.g. top performance
114 * with many processes running. We optimize it for speed 114 * with many processes running. We optimize it for speed
115 * using code from 115 * using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
116 * http://www.cs.uiowa.edu/~jones/bcd/decimal.html 116 * (with permission from the author, Douglas W. Jones).
117 * (with permission from the author, Douglas W. Jones). */ 117 */
118 118
119/* Formats correctly any integer in [0,99999]. 119#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64
120 * Outputs from one to five digits depending on input. 120/* Formats correctly any integer in [0, 999999999] */
121 * On i386 gcc 4.1.2 -O2: ~250 bytes of code. */
122static noinline_for_stack 121static noinline_for_stack
123char *put_dec_trunc(char *buf, unsigned q) 122char *put_dec_full9(char *buf, unsigned q)
124{ 123{
125 unsigned d3, d2, d1, d0; 124 unsigned r;
126 d1 = (q>>4) & 0xf;
127 d2 = (q>>8) & 0xf;
128 d3 = (q>>12);
129
130 d0 = 6*(d3 + d2 + d1) + (q & 0xf);
131 q = (d0 * 0xcd) >> 11;
132 d0 = d0 - 10*q;
133 *buf++ = d0 + '0'; /* least significant digit */
134 d1 = q + 9*d3 + 5*d2 + d1;
135 if (d1 != 0) {
136 q = (d1 * 0xcd) >> 11;
137 d1 = d1 - 10*q;
138 *buf++ = d1 + '0'; /* next digit */
139
140 d2 = q + 2*d2;
141 if ((d2 != 0) || (d3 != 0)) {
142 q = (d2 * 0xd) >> 7;
143 d2 = d2 - 10*q;
144 *buf++ = d2 + '0'; /* next digit */
145
146 d3 = q + 4*d3;
147 if (d3 != 0) {
148 q = (d3 * 0xcd) >> 11;
149 d3 = d3 - 10*q;
150 *buf++ = d3 + '0'; /* next digit */
151 if (q != 0)
152 *buf++ = q + '0'; /* most sign. digit */
153 }
154 }
155 }
156 125
126 /*
127 * Possible ways to approx. divide by 10
128 * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit)
129 * (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul)
130 * (x * 0x6667) >> 18 x < 43699
131 * (x * 0x3334) >> 17 x < 16389
132 * (x * 0x199a) >> 16 x < 16389
133 * (x * 0x0ccd) >> 15 x < 16389
134 * (x * 0x0667) >> 14 x < 2739
135 * (x * 0x0334) >> 13 x < 1029
136 * (x * 0x019a) >> 12 x < 1029
137 * (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386)
138 * (x * 0x0067) >> 10 x < 179
139 * (x * 0x0034) >> 9 x < 69 same
140 * (x * 0x001a) >> 8 x < 69 same
141 * (x * 0x000d) >> 7 x < 69 same, shortest code (on i386)
142 * (x * 0x0007) >> 6 x < 19
143 * See <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
144 */
145 r = (q * (uint64_t)0x1999999a) >> 32;
146 *buf++ = (q - 10 * r) + '0'; /* 1 */
147 q = (r * (uint64_t)0x1999999a) >> 32;
148 *buf++ = (r - 10 * q) + '0'; /* 2 */
149 r = (q * (uint64_t)0x1999999a) >> 32;
150 *buf++ = (q - 10 * r) + '0'; /* 3 */
151 q = (r * (uint64_t)0x1999999a) >> 32;
152 *buf++ = (r - 10 * q) + '0'; /* 4 */
153 r = (q * (uint64_t)0x1999999a) >> 32;
154 *buf++ = (q - 10 * r) + '0'; /* 5 */
155 /* Now value is under 10000, can avoid 64-bit multiply */
156 q = (r * 0x199a) >> 16;
157 *buf++ = (r - 10 * q) + '0'; /* 6 */
158 r = (q * 0xcd) >> 11;
159 *buf++ = (q - 10 * r) + '0'; /* 7 */
160 q = (r * 0xcd) >> 11;
161 *buf++ = (r - 10 * q) + '0'; /* 8 */
162 *buf++ = q + '0'; /* 9 */
157 return buf; 163 return buf;
158} 164}
159/* Same with if's removed. Always emits five digits */ 165#endif
166
167/* Similar to above but do not pad with zeros.
168 * Code can be easily arranged to print 9 digits too, but our callers
169 * always call put_dec_full9() instead when the number has 9 decimal digits.
170 */
160static noinline_for_stack 171static noinline_for_stack
161char *put_dec_full(char *buf, unsigned q) 172char *put_dec_trunc8(char *buf, unsigned r)
162{ 173{
163 /* BTW, if q is in [0,9999], 8-bit ints will be enough, */ 174 unsigned q;
164 /* but anyway, gcc produces better code with full-sized ints */ 175
165 unsigned d3, d2, d1, d0; 176 /* Copy of previous function's body with added early returns */
166 d1 = (q>>4) & 0xf; 177 q = (r * (uint64_t)0x1999999a) >> 32;
167 d2 = (q>>8) & 0xf; 178 *buf++ = (r - 10 * q) + '0'; /* 2 */
168 d3 = (q>>12); 179 if (q == 0)
180 return buf;
181 r = (q * (uint64_t)0x1999999a) >> 32;
182 *buf++ = (q - 10 * r) + '0'; /* 3 */
183 if (r == 0)
184 return buf;
185 q = (r * (uint64_t)0x1999999a) >> 32;
186 *buf++ = (r - 10 * q) + '0'; /* 4 */
187 if (q == 0)
188 return buf;
189 r = (q * (uint64_t)0x1999999a) >> 32;
190 *buf++ = (q - 10 * r) + '0'; /* 5 */
191 if (r == 0)
192 return buf;
193 q = (r * 0x199a) >> 16;
194 *buf++ = (r - 10 * q) + '0'; /* 6 */
195 if (q == 0)
196 return buf;
197 r = (q * 0xcd) >> 11;
198 *buf++ = (q - 10 * r) + '0'; /* 7 */
199 if (r == 0)
200 return buf;
201 q = (r * 0xcd) >> 11;
202 *buf++ = (r - 10 * q) + '0'; /* 8 */
203 if (q == 0)
204 return buf;
205 *buf++ = q + '0'; /* 9 */
206 return buf;
207}
169 208
170 /* 209/* There are two algorithms to print larger numbers.
171 * Possible ways to approx. divide by 10 210 * One is generic: divide by 1000000000 and repeatedly print
172 * gcc -O2 replaces multiply with shifts and adds 211 * groups of (up to) 9 digits. It's conceptually simple,
173 * (x * 0xcd) >> 11: 11001101 - shorter code than * 0x67 (on i386) 212 * but requires a (unsigned long long) / 1000000000 division.
174 * (x * 0x67) >> 10: 1100111 213 *
175 * (x * 0x34) >> 9: 110100 - same 214 * Second algorithm splits 64-bit unsigned long long into 16-bit chunks,
176 * (x * 0x1a) >> 8: 11010 - same 215 * manipulates them cleverly and generates groups of 4 decimal digits.
177 * (x * 0x0d) >> 7: 1101 - same, shortest code (on i386) 216 * It so happens that it does NOT require long long division.
178 */ 217 *
179 d0 = 6*(d3 + d2 + d1) + (q & 0xf); 218 * If long is > 32 bits, division of 64-bit values is relatively easy,
180 q = (d0 * 0xcd) >> 11; 219 * and we will use the first algorithm.
181 d0 = d0 - 10*q; 220 * If long long is > 64 bits (strange architecture with VERY large long long),
182 *buf++ = d0 + '0'; 221 * second algorithm can't be used, and we again use the first one.
183 d1 = q + 9*d3 + 5*d2 + d1; 222 *
184 q = (d1 * 0xcd) >> 11; 223 * Else (if long is 32 bits and long long is 64 bits) we use second one.
185 d1 = d1 - 10*q; 224 */
186 *buf++ = d1 + '0';
187
188 d2 = q + 2*d2;
189 q = (d2 * 0xd) >> 7;
190 d2 = d2 - 10*q;
191 *buf++ = d2 + '0';
192
193 d3 = q + 4*d3;
194 q = (d3 * 0xcd) >> 11; /* - shorter code */
195 /* q = (d3 * 0x67) >> 10; - would also work */
196 d3 = d3 - 10*q;
197 *buf++ = d3 + '0';
198 *buf++ = q + '0';
199 225
200 return buf; 226#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64
227
228/* First algorithm: generic */
229
230static
231char *put_dec(char *buf, unsigned long long n)
232{
233 if (n >= 100*1000*1000) {
234 while (n >= 1000*1000*1000)
235 buf = put_dec_full9(buf, do_div(n, 1000*1000*1000));
236 if (n >= 100*1000*1000)
237 return put_dec_full9(buf, n);
238 }
239 return put_dec_trunc8(buf, n);
201} 240}
202/* No inlining helps gcc to use registers better */ 241
242#else
243
244/* Second algorithm: valid only for 64-bit long longs */
245
203static noinline_for_stack 246static noinline_for_stack
204char *put_dec(char *buf, unsigned long long num) 247char *put_dec_full4(char *buf, unsigned q)
205{ 248{
206 while (1) { 249 unsigned r;
207 unsigned rem; 250 r = (q * 0xcccd) >> 19;
208 if (num < 100000) 251 *buf++ = (q - 10 * r) + '0';
209 return put_dec_trunc(buf, num); 252 q = (r * 0x199a) >> 16;
210 rem = do_div(num, 100000); 253 *buf++ = (r - 10 * q) + '0';
211 buf = put_dec_full(buf, rem); 254 r = (q * 0xcd) >> 11;
212 } 255 *buf++ = (q - 10 * r) + '0';
256 *buf++ = r + '0';
257 return buf;
258}
259
260/* Based on code by Douglas W. Jones found at
261 * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
262 * (with permission from the author).
263 * Performs no 64-bit division and hence should be fast on 32-bit machines.
264 */
265static
266char *put_dec(char *buf, unsigned long long n)
267{
268 uint32_t d3, d2, d1, q, h;
269
270 if (n < 100*1000*1000)
271 return put_dec_trunc8(buf, n);
272
273 d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
274 h = (n >> 32);
275 d2 = (h ) & 0xffff;
276 d3 = (h >> 16); /* implicit "& 0xffff" */
277
278 q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
279
280 buf = put_dec_full4(buf, q % 10000);
281 q = q / 10000;
282
283 d1 = q + 7671 * d3 + 9496 * d2 + 6 * d1;
284 buf = put_dec_full4(buf, d1 % 10000);
285 q = d1 / 10000;
286
287 d2 = q + 4749 * d3 + 42 * d2;
288 buf = put_dec_full4(buf, d2 % 10000);
289 q = d2 / 10000;
290
291 d3 = q + 281 * d3;
292 if (!d3)
293 goto done;
294 buf = put_dec_full4(buf, d3 % 10000);
295 q = d3 / 10000;
296 if (!q)
297 goto done;
298 buf = put_dec_full4(buf, q);
299 done:
300 while (buf[-1] == '0')
301 --buf;
302
303 return buf;
213} 304}
214 305
306#endif
307
215/* 308/*
216 * Convert passed number to decimal string. 309 * Convert passed number to decimal string.
217 * Returns the length of string. On buffer overflow, returns 0. 310 * Returns the length of string. On buffer overflow, returns 0.
@@ -220,16 +313,22 @@ char *put_dec(char *buf, unsigned long long num)
220 */ 313 */
221int num_to_str(char *buf, int size, unsigned long long num) 314int num_to_str(char *buf, int size, unsigned long long num)
222{ 315{
223 char tmp[21]; /* Enough for 2^64 in decimal */ 316 char tmp[sizeof(num) * 3];
224 int idx, len; 317 int idx, len;
225 318
226 len = put_dec(tmp, num) - tmp; 319 /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
320 if (num <= 9) {
321 tmp[0] = '0' + num;
322 len = 1;
323 } else {
324 len = put_dec(tmp, num) - tmp;
325 }
227 326
228 if (len > size) 327 if (len > size)
229 return 0; 328 return 0;
230 for (idx = 0; idx < len; ++idx) 329 for (idx = 0; idx < len; ++idx)
231 buf[idx] = tmp[len - idx - 1]; 330 buf[idx] = tmp[len - idx - 1];
232 return len; 331 return len;
233} 332}
234 333
235#define ZEROPAD 1 /* pad with zero */ 334#define ZEROPAD 1 /* pad with zero */
@@ -314,8 +413,8 @@ char *number(char *buf, char *end, unsigned long long num,
314 413
315 /* generate full string in tmp[], in reverse order */ 414 /* generate full string in tmp[], in reverse order */
316 i = 0; 415 i = 0;
317 if (num == 0) 416 if (num < spec.base)
318 tmp[i++] = '0'; 417 tmp[i++] = digits[num] | locase;
319 /* Generic code, for any base: 418 /* Generic code, for any base:
320 else do { 419 else do {
321 tmp[i++] = (digits[do_div(num,base)] | locase); 420 tmp[i++] = (digits[do_div(num,base)] | locase);
@@ -611,7 +710,7 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt)
611 } 710 }
612 for (i = 0; i < 4; i++) { 711 for (i = 0; i < 4; i++) {
613 char temp[3]; /* hold each IP quad in reverse order */ 712 char temp[3]; /* hold each IP quad in reverse order */
614 int digits = put_dec_trunc(temp, addr[index]) - temp; 713 int digits = put_dec_trunc8(temp, addr[index]) - temp;
615 if (leading_zeros) { 714 if (leading_zeros) {
616 if (digits < 3) 715 if (digits < 3)
617 *p++ = '0'; 716 *p++ = '0';