vsprintf: further optimize decimal conversion

Previous code was using optimizations which were developed to work well even on narrow-word CPUs (by today's standards). But Linux runs only on 32-bit and wider CPUs. We can use that. First: using 32x32->64 multiply and trivial 32-bit shift, we can correctly divide by 10 much larger numbers, and thus we can print groups of 9 digits instead of groups of 5 digits. Next: there are two algorithms to print larger numbers. One is generic: divide by 1000000000 and repeatedly print groups of (up to) 9 digits. It's conceptually simple, but requires an (unsigned long long) / 1000000000 division. Second algorithm splits 64-bit unsigned long long into 16-bit chunks, manipulates them cleverly and generates groups of 4 decimal digits. It so happens that it does NOT require long long division. If long is > 32 bits, division of 64-bit values is relatively easy, and we will use the first algorithm. If long long is > 64 bits (strange architecture with VERY large long long), second algorithm can't be used, and we again use the first one. Else (if long is 32 bits and long long is 64 bits) we use second one. And third: there is a simple optimization which takes fast path not only for zero as was done before, but for all one-digit numbers. In all tested cases new code is faster than old one, in many cases by 30%, in few cases by more than 50% (for example, on x86-32, conversion of 12345678). Code growth is ~0 in 32-bit case and ~130 bytes in 64-bit case. This patch is based upon an original from Michal Nazarewicz. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Michal Nazarewicz <mina86@mina86.com> Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com> Cc: Douglas W Jones <jones@cs.uiowa.edu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Denys Vlasenko <vda.linux@googlemail.com> 2012-05-31 19:26:08 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-05-31 20:49:27 -0400
commit: 133fd9f5cda2d86904126f4b9fa4e8f4330c9569 (patch)
tree: 0e60bfcec85f123243cf1ffe735264527efd6d0e /lib/vsprintf.c
parent: 725fe002d315c2501c110b7245d3eb4f4535f4d6 (diff)
1 files changed, 190 insertions, 91 deletions
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index b8fbd275bc46..c3f36d415bdf 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -112,106 +112,199 @@ int skip_atoi(const char **s)
 /* Decimal conversion is by far the most typical, and is used
 * for /proc and /sys data. This directly impacts e.g. top performance
 * with many processes running. We optimize it for speed
- * using code from
+ * using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
- * http://www.cs.uiowa.edu/~jones/bcd/decimal.html
+ * (with permission from the author, Douglas W. Jones).
- * (with permission from the author, Douglas W. Jones). */
+ */
-/* Formats correctly any integer in [0,99999].
+#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64
- * Outputs from one to five digits depending on input.
+/* Formats correctly any integer in [0, 999999999] */
- * On i386 gcc 4.1.2 -O2: ~250 bytes of code. */
 static noinline_for_stack
-char *put_dec_trunc(char *buf, unsigned q)
+char *put_dec_full9(char *buf, unsigned q)
 {
-        unsigned d3, d2, d1, d0;
+        unsigned r;
-        d1 = (q>>4) & 0xf;
-        d2 = (q>>8) & 0xf;
-        d3 = (q>>12);
-        d0 = 6*(d3 + d2 + d1) + (q & 0xf);
-        q = (d0 * 0xcd) >> 11;
-        d0 = d0 - 10*q;
-        *buf++ = d0 + '0'; /* least significant digit */
-        d1 = q + 9*d3 + 5*d2 + d1;
-        if (d1 != 0) {
-                q = (d1 * 0xcd) >> 11;
-                d1 = d1 - 10*q;
-                *buf++ = d1 + '0'; /* next digit */
-                d2 = q + 2*d2;
-                if ((d2 != 0) || (d3 != 0)) {
-                        q = (d2 * 0xd) >> 7;
-                        d2 = d2 - 10*q;
-                        *buf++ = d2 + '0'; /* next digit */
-                        d3 = q + 4*d3;
-                        if (d3 != 0) {
-                                q = (d3 * 0xcd) >> 11;
-                                d3 = d3 - 10*q;
-                                *buf++ = d3 + '0';  /* next digit */
-                                if (q != 0)
-                                        *buf++ = q + '0'; /* most sign. digit */
-                        }
-                }
-        }
+        /*
+         * Possible ways to approx. divide by 10
+         * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit)
+         * (x * 0xcccd) >> 19     x <      81920 (x < 262149 when 64-bit mul)
+         * (x * 0x6667) >> 18     x <      43699
+         * (x * 0x3334) >> 17     x <      16389
+         * (x * 0x199a) >> 16     x <      16389
+         * (x * 0x0ccd) >> 15     x <      16389
+         * (x * 0x0667) >> 14     x <       2739
+         * (x * 0x0334) >> 13     x <       1029
+         * (x * 0x019a) >> 12     x <       1029
+         * (x * 0x00cd) >> 11     x <       1029 shorter code than * 0x67 (on i386)
+         * (x * 0x0067) >> 10     x <        179
+         * (x * 0x0034) >>  9     x <         69 same
+         * (x * 0x001a) >>  8     x <         69 same
+         * (x * 0x000d) >>  7     x <         69 same, shortest code (on i386)
+         * (x * 0x0007) >>  6     x <         19
+         * See <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
+         */
+        r      = (q * (uint64_t)0x1999999a) >> 32;
+        *buf++ = (q - 10 * r) + '0'; /* 1 */
+        q      = (r * (uint64_t)0x1999999a) >> 32;
+        *buf++ = (r - 10 * q) + '0'; /* 2 */
+        r      = (q * (uint64_t)0x1999999a) >> 32;
+        *buf++ = (q - 10 * r) + '0'; /* 3 */
+        q      = (r * (uint64_t)0x1999999a) >> 32;
+        *buf++ = (r - 10 * q) + '0'; /* 4 */
+        r      = (q * (uint64_t)0x1999999a) >> 32;
+        *buf++ = (q - 10 * r) + '0'; /* 5 */
+        /* Now value is under 10000, can avoid 64-bit multiply */
+        q      = (r * 0x199a) >> 16;
+        *buf++ = (r - 10 * q)  + '0'; /* 6 */
+        r      = (q * 0xcd) >> 11;
+        *buf++ = (q - 10 * r)  + '0'; /* 7 */
+        q      = (r * 0xcd) >> 11;
+        *buf++ = (r - 10 * q) + '0'; /* 8 */
+        *buf++ = q + '0'; /* 9 */
        return buf;
 }
-/* Same with if's removed. Always emits five digits */
+#endif
+/* Similar to above but do not pad with zeros.
+ * Code can be easily arranged to print 9 digits too, but our callers
+ * always call put_dec_full9() instead when the number has 9 decimal digits.
+ */
 static noinline_for_stack
-char *put_dec_full(char *buf, unsigned q)
+char *put_dec_trunc8(char *buf, unsigned r)
 {
-        /* BTW, if q is in [0,9999], 8-bit ints will be enough, */
+        unsigned q;
-        /* but anyway, gcc produces better code with full-sized ints */
-        unsigned d3, d2, d1, d0;
+        /* Copy of previous function's body with added early returns */
-        d1 = (q>>4) & 0xf;
+        q      = (r * (uint64_t)0x1999999a) >> 32;
-        d2 = (q>>8) & 0xf;
+        *buf++ = (r - 10 * q) + '0'; /* 2 */
-        d3 = (q>>12);
+        if (q == 0)
+                return buf;
+        r      = (q * (uint64_t)0x1999999a) >> 32;
+        *buf++ = (q - 10 * r) + '0'; /* 3 */
+        if (r == 0)
+                return buf;
+        q      = (r * (uint64_t)0x1999999a) >> 32;
+        *buf++ = (r - 10 * q) + '0'; /* 4 */
+        if (q == 0)
+                return buf;
+        r      = (q * (uint64_t)0x1999999a) >> 32;
+        *buf++ = (q - 10 * r) + '0'; /* 5 */
+        if (r == 0)
+                return buf;
+        q      = (r * 0x199a) >> 16;
+        *buf++ = (r - 10 * q)  + '0'; /* 6 */
+        if (q == 0)
+                return buf;
+        r      = (q * 0xcd) >> 11;
+        *buf++ = (q - 10 * r)  + '0'; /* 7 */
+        if (r == 0)
+                return buf;
+        q      = (r * 0xcd) >> 11;
+        *buf++ = (r - 10 * q) + '0'; /* 8 */
+        if (q == 0)
+                return buf;
+        *buf++ = q + '0'; /* 9 */
+        return buf;
+}
-        /*
+/* There are two algorithms to print larger numbers.
-         * Possible ways to approx. divide by 10
+ * One is generic: divide by 1000000000 and repeatedly print
-         * gcc -O2 replaces multiply with shifts and adds
+ * groups of (up to) 9 digits. It's conceptually simple,
-         * (x * 0xcd) >> 11: 11001101 - shorter code than * 0x67 (on i386)
+ * but requires a (unsigned long long) / 1000000000 division.
-         * (x * 0x67) >> 10:  1100111
+ *
-         * (x * 0x34) >> 9:    110100 - same
+ * Second algorithm splits 64-bit unsigned long long into 16-bit chunks,
-         * (x * 0x1a) >> 8:     11010 - same
+ * manipulates them cleverly and generates groups of 4 decimal digits.
-         * (x * 0x0d) >> 7:      1101 - same, shortest code (on i386)
+ * It so happens that it does NOT require long long division.
-         */
+ *
-        d0 = 6*(d3 + d2 + d1) + (q & 0xf);
+ * If long is > 32 bits, division of 64-bit values is relatively easy,
-        q = (d0 * 0xcd) >> 11;
+ * and we will use the first algorithm.
-        d0 = d0 - 10*q;
+ * If long long is > 64 bits (strange architecture with VERY large long long),
-        *buf++ = d0 + '0';
+ * second algorithm can't be used, and we again use the first one.
-        d1 = q + 9*d3 + 5*d2 + d1;
+ *
-                q = (d1 * 0xcd) >> 11;
+ * Else (if long is 32 bits and long long is 64 bits) we use second one.
-                d1 = d1 - 10*q;
+ */
-                *buf++ = d1 + '0';
-                d2 = q + 2*d2;
-                        q = (d2 * 0xd) >> 7;
-                        d2 = d2 - 10*q;
-                        *buf++ = d2 + '0';
-                        d3 = q + 4*d3;
-                                q = (d3 * 0xcd) >> 11; /* - shorter code */
-                                /* q = (d3 * 0x67) >> 10; - would also work */
-                                d3 = d3 - 10*q;
-                                *buf++ = d3 + '0';
-                                        *buf++ = q + '0';
-        return buf;
+#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64
+/* First algorithm: generic */
+static
+char *put_dec(char *buf, unsigned long long n)
+{
+        if (n >= 100*1000*1000) {
+                while (n >= 1000*1000*1000)
+                        buf = put_dec_full9(buf, do_div(n, 1000*1000*1000));
+                if (n >= 100*1000*1000)
+                        return put_dec_full9(buf, n);
+        }
+        return put_dec_trunc8(buf, n);
 }
-/* No inlining helps gcc to use registers better */
+#else
+/* Second algorithm: valid only for 64-bit long longs */
 static noinline_for_stack
-char *put_dec(char *buf, unsigned long long num)
+char *put_dec_full4(char *buf, unsigned q)
 {
-        while (1) {
+        unsigned r;
-                unsigned rem;
+        r      = (q * 0xcccd) >> 19;
-                if (num < 100000)
+        *buf++ = (q - 10 * r) + '0';
-                        return put_dec_trunc(buf, num);
+        q      = (r * 0x199a) >> 16;
-                rem = do_div(num, 100000);
+        *buf++ = (r - 10 * q)  + '0';
-                buf = put_dec_full(buf, rem);
+        r      = (q * 0xcd) >> 11;
-        }
+        *buf++ = (q - 10 * r)  + '0';
+        *buf++ = r + '0';
+        return buf;
+}
+/* Based on code by Douglas W. Jones found at
+ * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
+ * (with permission from the author).
+ * Performs no 64-bit division and hence should be fast on 32-bit machines.
+ */
+static
+char *put_dec(char *buf, unsigned long long n)
+{
+        uint32_t d3, d2, d1, q, h;
+        if (n < 100*1000*1000)
+                return put_dec_trunc8(buf, n);
+        d1  = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
+        h   = (n >> 32);
+        d2  = (h      ) & 0xffff;
+        d3  = (h >> 16); /* implicit "& 0xffff" */
+        q   = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
+        buf = put_dec_full4(buf, q % 10000);
+        q   = q / 10000;
+        d1  = q + 7671 * d3 + 9496 * d2 + 6 * d1;
+        buf = put_dec_full4(buf, d1 % 10000);
+        q   = d1 / 10000;
+        d2  = q + 4749 * d3 + 42 * d2;
+        buf = put_dec_full4(buf, d2 % 10000);
+        q   = d2 / 10000;
+        d3  = q + 281 * d3;
+        if (!d3)
+                goto done;
+        buf = put_dec_full4(buf, d3 % 10000);
+        q   = d3 / 10000;
+        if (!q)
+                goto done;
+        buf = put_dec_full4(buf, q);
+ done:
+        while (buf[-1] == '0')
+                --buf;
+        return buf;
 }
+#endif
 /*
 * Convert passed number to decimal string.
 * Returns the length of string.  On buffer overflow, returns 0.
@@ -220,16 +313,22 @@ char *put_dec(char *buf, unsigned long long num)
 */
 int num_to_str(char *buf, int size, unsigned long long num)
 {
-        char tmp[21];           /* Enough for 2^64 in decimal */
+        char tmp[sizeof(num) * 3];
        int idx, len;
-        len = put_dec(tmp, num) - tmp;
+        /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
+        if (num <= 9) {
+                tmp[0] = '0' + num;
+                len = 1;
+        } else {
+                len = put_dec(tmp, num) - tmp;
+        }
        if (len > size)
                return 0;
        for (idx = 0; idx < len; ++idx)
                buf[idx] = tmp[len - idx - 1];
-        return  len;
+        return len;
 }
 #define ZEROPAD 1               /* pad with zero */
@@ -314,8 +413,8 @@ char *number(char *buf, char *end, unsigned long long num,
        /* generate full string in tmp[], in reverse order */
        i = 0;
-        if (num == 0)
+        if (num < spec.base)
-                tmp[i++] = '0';
+                tmp[i++] = digits[num] | locase;
        /* Generic code, for any base:
        else do {
                tmp[i++] = (digits[do_div(num,base)] | locase);
@@ -611,7 +710,7 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt)
        }
        for (i = 0; i < 4; i++) {
                char temp[3];   /* hold each IP quad in reverse order */
-                int digits = put_dec_trunc(temp, addr[index]) - temp;
+                int digits = put_dec_trunc8(temp, addr[index]) - temp;
                if (leading_zeros) {
                        if (digits < 3)
                                *p++ = '0';
author	Denys Vlasenko <vda.linux@googlemail.com>	2012-05-31 19:26:08 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-05-31 20:49:27 -0400
commit	133fd9f5cda2d86904126f4b9fa4e8f4330c9569 (patch)
tree	0e60bfcec85f123243cf1ffe735264527efd6d0e /lib/vsprintf.c
parent	725fe002d315c2501c110b7245d3eb4f4535f4d6 (diff)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c index b8fbd275bc46..c3f36d415bdf 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c
@@ -112,106 +112,199 @@ int skip_atoi(const char **s)
112	/* Decimal conversion is by far the most typical, and is used	112	/* Decimal conversion is by far the most typical, and is used
113	* for /proc and /sys data. This directly impacts e.g. top performance	113	* for /proc and /sys data. This directly impacts e.g. top performance
114	* with many processes running. We optimize it for speed	114	* with many processes running. We optimize it for speed
115	* using code from	115	* using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
116	* http://www.cs.uiowa.edu/~jones/bcd/decimal.html	116	* (with permission from the author, Douglas W. Jones).
117	* (with permission from the author, Douglas W. Jones). */	117	*/
118		118
119	/* Formats correctly any integer in [0,99999].	119	#if BITS_PER_LONG != 32 \|\| BITS_PER_LONG_LONG != 64
120	* Outputs from one to five digits depending on input.	120	/* Formats correctly any integer in [0, 999999999] */
121	* On i386 gcc 4.1.2 -O2: ~250 bytes of code. */
122	static noinline_for_stack	121	static noinline_for_stack
123	char put_dec_trunc(char buf, unsigned q)	122	char put_dec_full9(char buf, unsigned q)
124	{	123	{
125	unsigned d3, d2, d1, d0;	124	unsigned r;
126	d1 = (q>>4) & 0xf;
127	d2 = (q>>8) & 0xf;
128	d3 = (q>>12);
129
130	d0 = 6*(d3 + d2 + d1) + (q & 0xf);
131	q = (d0 * 0xcd) >> 11;
132	d0 = d0 - 10*q;
133	buf++ = d0 + '0'; / least significant digit */
134	d1 = q + 9d3 + 5d2 + d1;
135	if (d1 != 0) {
136	q = (d1 * 0xcd) >> 11;
137	d1 = d1 - 10*q;
138	buf++ = d1 + '0'; / next digit */
139
140	d2 = q + 2*d2;
141	if ((d2 != 0) \|\| (d3 != 0)) {
142	q = (d2 * 0xd) >> 7;
143	d2 = d2 - 10*q;
144	buf++ = d2 + '0'; / next digit */
145
146	d3 = q + 4*d3;
147	if (d3 != 0) {
148	q = (d3 * 0xcd) >> 11;
149	d3 = d3 - 10*q;
150	buf++ = d3 + '0'; / next digit */
151	if (q != 0)
152	buf++ = q + '0'; / most sign. digit */
153	}
154	}
155	}
156		125
		126	/*
		127	* Possible ways to approx. divide by 10
		128	* (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit)
		129	* (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul)
		130	* (x * 0x6667) >> 18 x < 43699
		131	* (x * 0x3334) >> 17 x < 16389
		132	* (x * 0x199a) >> 16 x < 16389
		133	* (x * 0x0ccd) >> 15 x < 16389
		134	* (x * 0x0667) >> 14 x < 2739
		135	* (x * 0x0334) >> 13 x < 1029
		136	* (x * 0x019a) >> 12 x < 1029
		137	* (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386)
		138	* (x * 0x0067) >> 10 x < 179
		139	* (x * 0x0034) >> 9 x < 69 same
		140	* (x * 0x001a) >> 8 x < 69 same
		141	* (x * 0x000d) >> 7 x < 69 same, shortest code (on i386)
		142	* (x * 0x0007) >> 6 x < 19
		143	* See <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
		144	*/
		145	r = (q * (uint64_t)0x1999999a) >> 32;
		146	buf++ = (q - 10 r) + '0'; /* 1 */
		147	q = (r * (uint64_t)0x1999999a) >> 32;
		148	buf++ = (r - 10 q) + '0'; /* 2 */
		149	r = (q * (uint64_t)0x1999999a) >> 32;
		150	buf++ = (q - 10 r) + '0'; /* 3 */
		151	q = (r * (uint64_t)0x1999999a) >> 32;
		152	buf++ = (r - 10 q) + '0'; /* 4 */
		153	r = (q * (uint64_t)0x1999999a) >> 32;
		154	buf++ = (q - 10 r) + '0'; /* 5 */
		155	/* Now value is under 10000, can avoid 64-bit multiply */
		156	q = (r * 0x199a) >> 16;
		157	buf++ = (r - 10 q) + '0'; /* 6 */
		158	r = (q * 0xcd) >> 11;
		159	buf++ = (q - 10 r) + '0'; /* 7 */
		160	q = (r * 0xcd) >> 11;
		161	buf++ = (r - 10 q) + '0'; /* 8 */
		162	buf++ = q + '0'; / 9 */
157	return buf;	163	return buf;
158	}	164	}
159	/* Same with if's removed. Always emits five digits */	165	#endif
		166
		167	/* Similar to above but do not pad with zeros.
		168	* Code can be easily arranged to print 9 digits too, but our callers
		169	* always call put_dec_full9() instead when the number has 9 decimal digits.
		170	*/
160	static noinline_for_stack	171	static noinline_for_stack
161	char put_dec_full(char buf, unsigned q)	172	char put_dec_trunc8(char buf, unsigned r)
162	{	173	{
163	/* BTW, if q is in [0,9999], 8-bit ints will be enough, */	174	unsigned q;
164	/* but anyway, gcc produces better code with full-sized ints */	175
165	unsigned d3, d2, d1, d0;	176	/* Copy of previous function's body with added early returns */
166	d1 = (q>>4) & 0xf;	177	q = (r * (uint64_t)0x1999999a) >> 32;
167	d2 = (q>>8) & 0xf;	178	buf++ = (r - 10 q) + '0'; /* 2 */
168	d3 = (q>>12);	179	if (q == 0)
		180	return buf;
		181	r = (q * (uint64_t)0x1999999a) >> 32;
		182	buf++ = (q - 10 r) + '0'; /* 3 */
		183	if (r == 0)
		184	return buf;
		185	q = (r * (uint64_t)0x1999999a) >> 32;
		186	buf++ = (r - 10 q) + '0'; /* 4 */
		187	if (q == 0)
		188	return buf;
		189	r = (q * (uint64_t)0x1999999a) >> 32;
		190	buf++ = (q - 10 r) + '0'; /* 5 */
		191	if (r == 0)
		192	return buf;
		193	q = (r * 0x199a) >> 16;
		194	buf++ = (r - 10 q) + '0'; /* 6 */
		195	if (q == 0)
		196	return buf;
		197	r = (q * 0xcd) >> 11;
		198	buf++ = (q - 10 r) + '0'; /* 7 */
		199	if (r == 0)
		200	return buf;
		201	q = (r * 0xcd) >> 11;
		202	buf++ = (r - 10 q) + '0'; /* 8 */
		203	if (q == 0)
		204	return buf;
		205	buf++ = q + '0'; / 9 */
		206	return buf;
		207	}
169		208
170	/*	209	/* There are two algorithms to print larger numbers.
171	* Possible ways to approx. divide by 10	210	* One is generic: divide by 1000000000 and repeatedly print
172	* gcc -O2 replaces multiply with shifts and adds	211	* groups of (up to) 9 digits. It's conceptually simple,
173	* (x * 0xcd) >> 11: 11001101 - shorter code than * 0x67 (on i386)	212	* but requires a (unsigned long long) / 1000000000 division.
174	* (x * 0x67) >> 10: 1100111	213	*
175	* (x * 0x34) >> 9: 110100 - same	214	* Second algorithm splits 64-bit unsigned long long into 16-bit chunks,
176	* (x * 0x1a) >> 8: 11010 - same	215	* manipulates them cleverly and generates groups of 4 decimal digits.
177	* (x * 0x0d) >> 7: 1101 - same, shortest code (on i386)	216	* It so happens that it does NOT require long long division.
178	*/	217	*
179	d0 = 6*(d3 + d2 + d1) + (q & 0xf);	218	* If long is > 32 bits, division of 64-bit values is relatively easy,
180	q = (d0 * 0xcd) >> 11;	219	* and we will use the first algorithm.
181	d0 = d0 - 10*q;	220	* If long long is > 64 bits (strange architecture with VERY large long long),
182	*buf++ = d0 + '0';	221	* second algorithm can't be used, and we again use the first one.
183	d1 = q + 9d3 + 5d2 + d1;	222	*
184	q = (d1 * 0xcd) >> 11;	223	* Else (if long is 32 bits and long long is 64 bits) we use second one.
185	d1 = d1 - 10*q;	224	*/
186	*buf++ = d1 + '0';
187
188	d2 = q + 2*d2;
189	q = (d2 * 0xd) >> 7;
190	d2 = d2 - 10*q;
191	*buf++ = d2 + '0';
192
193	d3 = q + 4*d3;
194	q = (d3 * 0xcd) >> 11; /* - shorter code */
195	/* q = (d3 * 0x67) >> 10; - would also work */
196	d3 = d3 - 10*q;
197	*buf++ = d3 + '0';
198	*buf++ = q + '0';
199		225
200	return buf;	226	#if BITS_PER_LONG != 32 \|\| BITS_PER_LONG_LONG != 64
		227
		228	/* First algorithm: generic */
		229
		230	static
		231	char put_dec(char buf, unsigned long long n)
		232	{
		233	if (n >= 10010001000) {
		234	while (n >= 100010001000)
		235	buf = put_dec_full9(buf, do_div(n, 100010001000));
		236	if (n >= 10010001000)
		237	return put_dec_full9(buf, n);
		238	}
		239	return put_dec_trunc8(buf, n);
201	}	240	}
202	/* No inlining helps gcc to use registers better */	241
		242	#else
		243
		244	/* Second algorithm: valid only for 64-bit long longs */
		245
203	static noinline_for_stack	246	static noinline_for_stack
204	char put_dec(char buf, unsigned long long num)	247	char put_dec_full4(char buf, unsigned q)
205	{	248	{
206	while (1) {	249	unsigned r;
207	unsigned rem;	250	r = (q * 0xcccd) >> 19;
208	if (num < 100000)	251	buf++ = (q - 10 r) + '0';
209	return put_dec_trunc(buf, num);	252	q = (r * 0x199a) >> 16;
210	rem = do_div(num, 100000);	253	buf++ = (r - 10 q) + '0';
211	buf = put_dec_full(buf, rem);	254	r = (q * 0xcd) >> 11;
212	}	255	buf++ = (q - 10 r) + '0';
		256	*buf++ = r + '0';
		257	return buf;
		258	}
		259
		260	/* Based on code by Douglas W. Jones found at
		261	* <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
		262	* (with permission from the author).
		263	* Performs no 64-bit division and hence should be fast on 32-bit machines.
		264	*/
		265	static
		266	char put_dec(char buf, unsigned long long n)
		267	{
		268	uint32_t d3, d2, d1, q, h;
		269
		270	if (n < 10010001000)
		271	return put_dec_trunc8(buf, n);
		272
		273	d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
		274	h = (n >> 32);
		275	d2 = (h ) & 0xffff;
		276	d3 = (h >> 16); /* implicit "& 0xffff" */
		277
		278	q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
		279
		280	buf = put_dec_full4(buf, q % 10000);
		281	q = q / 10000;
		282
		283	d1 = q + 7671 * d3 + 9496 * d2 + 6 * d1;
		284	buf = put_dec_full4(buf, d1 % 10000);
		285	q = d1 / 10000;
		286
		287	d2 = q + 4749 * d3 + 42 * d2;
		288	buf = put_dec_full4(buf, d2 % 10000);
		289	q = d2 / 10000;
		290
		291	d3 = q + 281 * d3;
		292	if (!d3)
		293	goto done;
		294	buf = put_dec_full4(buf, d3 % 10000);
		295	q = d3 / 10000;
		296	if (!q)
		297	goto done;
		298	buf = put_dec_full4(buf, q);
		299	done:
		300	while (buf[-1] == '0')
		301	--buf;
		302
		303	return buf;
213	}	304	}
214		305
		306	#endif
		307
215	/*	308	/*
216	* Convert passed number to decimal string.	309	* Convert passed number to decimal string.
217	* Returns the length of string. On buffer overflow, returns 0.	310	* Returns the length of string. On buffer overflow, returns 0.
@@ -220,16 +313,22 @@ char put_dec(char buf, unsigned long long num)
220	*/	313	*/
221	int num_to_str(char *buf, int size, unsigned long long num)	314	int num_to_str(char *buf, int size, unsigned long long num)
222	{	315	{
223	char tmp[21]; /* Enough for 2^64 in decimal */	316	char tmp[sizeof(num) * 3];
224	int idx, len;	317	int idx, len;
225		318
226	len = put_dec(tmp, num) - tmp;	319	/* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
		320	if (num <= 9) {
		321	tmp[0] = '0' + num;
		322	len = 1;
		323	} else {
		324	len = put_dec(tmp, num) - tmp;
		325	}
227		326
228	if (len > size)	327	if (len > size)
229	return 0;	328	return 0;
230	for (idx = 0; idx < len; ++idx)	329	for (idx = 0; idx < len; ++idx)
231	buf[idx] = tmp[len - idx - 1];	330	buf[idx] = tmp[len - idx - 1];
232	return len;	331	return len;
233	}	332	}
234		333
235	#define ZEROPAD 1 /* pad with zero */	334	#define ZEROPAD 1 /* pad with zero */
@@ -314,8 +413,8 @@ char number(char buf, char *end, unsigned long long num,
314		413
315	/* generate full string in tmp[], in reverse order */	414	/* generate full string in tmp[], in reverse order */
316	i = 0;	415	i = 0;
317	if (num == 0)	416	if (num < spec.base)
318	tmp[i++] = '0';	417	tmp[i++] = digits[num] \| locase;
319	/* Generic code, for any base:	418	/* Generic code, for any base:
320	else do {	419	else do {
321	tmp[i++] = (digits[do_div(num,base)] \| locase);	420	tmp[i++] = (digits[do_div(num,base)] \| locase);
@@ -611,7 +710,7 @@ char ip4_string(char p, const u8 addr, const char fmt)
611	}	710	}
612	for (i = 0; i < 4; i++) {	711	for (i = 0; i < 4; i++) {
613	char temp[3]; /* hold each IP quad in reverse order */	712	char temp[3]; /* hold each IP quad in reverse order */
614	int digits = put_dec_trunc(temp, addr[index]) - temp;	713	int digits = put_dec_trunc8(temp, addr[index]) - temp;
615	if (leading_zeros) {	714	if (leading_zeros) {
616	if (digits < 3)	715	if (digits < 3)
617	*p++ = '0';	716	*p++ = '0';