author     Rasmus Villemoes <linux@rasmusvillemoes.dk>      2015-04-16 15:43:22 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-04-17 09:03:54 -0400
commit     7c43d9a30c527d9e06e2c55f82b56f28df43caed (patch)
tree       9d83a74720aae5b5ebb84066641392fc68cbebf2 /lib
parent     840620a1596a90636a44d6a593db4041bb28d52e (diff)
lib/vsprintf.c: even faster binary to decimal conversion
The most expensive part of decimal conversion is the divisions by 10 (albeit done using reciprocal multiplication with appropriately chosen constants). I decided to see if one could eliminate around half of these multiplications by emitting two digits at a time, at the cost of a 200 byte lookup table, and it does indeed seem like there is something to be gained, especially on 64 bits. Microbenchmarking shows improvements ranging from -50% (for numbers uniformly distributed in [0, 2^64-1]) to -25% (for numbers heavily biased toward the smaller end, a more realistic distribution).

On a larger scale, perf shows that top, one of the big consumers of /proc data, uses 0.5-1.0% fewer cpu cycles.

I had to jump through some hoops to get the 32 bit code to compile and run on my 64 bit machine, so I'm not sure how relevant these numbers are, but just for comparison the microbenchmark showed improvements between -30% and -10%.

The bloat-o-meter costs are around 150 bytes (the generated code is a little smaller, so it's not the full 200 bytes) on both 32 and 64 bit. I'm aware that extra cache misses won't show up in a microbenchmark as used above, but on the other hand decimal conversions often happen in bulk (for example in the case of top).

I have of course tested that the new code generates the same output as the old, for both the first and last 1e10 numbers in [0, 2^64-1] and 4e9 'random' numbers in-between.

Test and verification code on github: https://github.com/Villemoes/dec

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Tested-by: Jeff Epler <jepler@unpythonic.net>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
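To make the approach easier to follow outside the kernel, here is a minimal user-space sketch of the two-digits-per-step conversion. It is an illustration, not the patch's code: it uses plain /100 and %100 where the kernel replaces the divisions with reciprocal multiplications, builds its digit-pair table at run time instead of using the const decpair[] table in the diff below, and the helper names (init_pairs, to_dec) are made up.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* 200-byte table: entry i holds the two ASCII digits of i, ones digit
 * first, because the conversion emits the least significant digits first
 * and the caller reverses the buffer at the end (as num_to_str() does). */
static char pair[100][2];

static void init_pairs(void)
{
	for (int i = 0; i < 100; i++) {
		pair[i][0] = '0' + i % 10;
		pair[i][1] = '0' + i / 10;
	}
}

/* Convert n to decimal, two digits per iteration; returns the length. */
static int to_dec(char *out, uint64_t n)
{
	char tmp[24];		/* digits in reverse order */
	int len = 0, i;

	do {
		unsigned r = n % 100;	/* kernel: (x * 0x28f5c29) >> 32 and friends */
		n /= 100;
		memcpy(tmp + len, pair[r], 2);
		len += 2;
	} while (n);

	if (len > 1 && tmp[len - 1] == '0')	/* drop the padding zero of the top pair */
		len--;
	for (i = 0; i < len; i++)		/* un-reverse into the caller's buffer */
		out[i] = tmp[len - 1 - i];
	return len;
}

int main(void)
{
	uint64_t samples[] = { 0, 7, 42, 1999, 123456789, 18446744073709551615ULL };
	char buf[24];

	init_pairs();
	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%.*s\n", to_dec(buf, samples[i]), buf);
	return 0;
}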
Diffstat (limited to 'lib')
-rw-r--r--	lib/vsprintf.c	246
1 file changed, 128 insertions(+), 118 deletions(-)
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3a1e0843f9a2..c93ec8a035b3 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -33,6 +33,7 @@
 
 #include <asm/page.h>		/* for PAGE_SIZE */
 #include <asm/sections.h>	/* for dereference_function_descriptor() */
+#include <asm/byteorder.h>	/* cpu_to_le16 */
 
 #include <linux/string_helpers.h>
 #include "kstrtox.h"
@@ -122,142 +123,147 @@ int skip_atoi(const char **s)
 	return i;
 }
 
-/* Decimal conversion is by far the most typical, and is used
- * for /proc and /sys data. This directly impacts e.g. top performance
- * with many processes running. We optimize it for speed
- * using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
- * (with permission from the author, Douglas W. Jones).
+/*
+ * Decimal conversion is by far the most typical, and is used for
+ * /proc and /sys data. This directly impacts e.g. top performance
+ * with many processes running. We optimize it for speed by emitting
+ * two characters at a time, using a 200 byte lookup table. This
+ * roughly halves the number of multiplications compared to computing
+ * the digits one at a time. Implementation strongly inspired by the
+ * previous version, which in turn used ideas described at
+ * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
+ * from the author, Douglas W. Jones).
+ *
+ * It turns out there is precisely one 26 bit fixed-point
+ * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
+ * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
+ * range happens to be somewhat larger (x <= 1073741898), but that's
+ * irrelevant for our purpose.
+ *
+ * For dividing a number in the range [10^4, 10^6-1] by 100, we still
+ * need a 32x32->64 bit multiply, so we simply use the same constant.
+ *
+ * For dividing a number in the range [100, 10^4-1] by 100, there are
+ * several options. The simplest is (x * 0x147b) >> 19, which is valid
+ * for all x <= 43698.
  */
 
-#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64
-/* Formats correctly any integer in [0, 999999999] */
+static const u16 decpair[100] = {
+#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
+	_( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
+	_(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
+	_(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
+	_(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
+	_(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
+	_(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
+	_(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
+	_(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
+	_(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
+	_(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
+#undef _
+};
+
+/*
+ * This will print a single '0' even if r == 0, since we would
+ * immediately jump to out_r where two 0s would be written and one of
+ * them then discarded. This is needed by ip4_string below. All other
+ * callers pass a non-zero value of r.
+*/
 static noinline_for_stack
-char *put_dec_full9(char *buf, unsigned q)
+char *put_dec_trunc8(char *buf, unsigned r)
 {
-	unsigned r;
+	unsigned q;
 
-	/*
-	 * Possible ways to approx. divide by 10
-	 * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit)
-	 * (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul)
-	 * (x * 0x6667) >> 18 x < 43699
-	 * (x * 0x3334) >> 17 x < 16389
-	 * (x * 0x199a) >> 16 x < 16389
-	 * (x * 0x0ccd) >> 15 x < 16389
-	 * (x * 0x0667) >> 14 x < 2739
-	 * (x * 0x0334) >> 13 x < 1029
-	 * (x * 0x019a) >> 12 x < 1029
-	 * (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386)
-	 * (x * 0x0067) >> 10 x < 179
-	 * (x * 0x0034) >> 9 x < 69 same
-	 * (x * 0x001a) >> 8 x < 69 same
-	 * (x * 0x000d) >> 7 x < 69 same, shortest code (on i386)
-	 * (x * 0x0007) >> 6 x < 19
-	 * See <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
-	 */
-	r = (q * (uint64_t)0x1999999a) >> 32;
-	*buf++ = (q - 10 * r) + '0'; /* 1 */
-	q = (r * (uint64_t)0x1999999a) >> 32;
-	*buf++ = (r - 10 * q) + '0'; /* 2 */
-	r = (q * (uint64_t)0x1999999a) >> 32;
-	*buf++ = (q - 10 * r) + '0'; /* 3 */
-	q = (r * (uint64_t)0x1999999a) >> 32;
-	*buf++ = (r - 10 * q) + '0'; /* 4 */
-	r = (q * (uint64_t)0x1999999a) >> 32;
-	*buf++ = (q - 10 * r) + '0'; /* 5 */
-	/* Now value is under 10000, can avoid 64-bit multiply */
-	q = (r * 0x199a) >> 16;
-	*buf++ = (r - 10 * q) + '0'; /* 6 */
-	r = (q * 0xcd) >> 11;
-	*buf++ = (q - 10 * r) + '0'; /* 7 */
-	q = (r * 0xcd) >> 11;
-	*buf++ = (r - 10 * q) + '0'; /* 8 */
-	*buf++ = q + '0'; /* 9 */
+	/* 1 <= r < 10^8 */
+	if (r < 100)
+		goto out_r;
+
+	/* 100 <= r < 10^8 */
+	q = (r * (u64)0x28f5c29) >> 32;
+	*((u16 *)buf) = decpair[r - 100*q];
+	buf += 2;
+
+	/* 1 <= q < 10^6 */
+	if (q < 100)
+		goto out_q;
+
+	/* 100 <= q < 10^6 */
+	r = (q * (u64)0x28f5c29) >> 32;
+	*((u16 *)buf) = decpair[q - 100*r];
+	buf += 2;
+
+	/* 1 <= r < 10^4 */
+	if (r < 100)
+		goto out_r;
+
+	/* 100 <= r < 10^4 */
+	q = (r * 0x147b) >> 19;
+	*((u16 *)buf) = decpair[r - 100*q];
+	buf += 2;
+out_q:
+	/* 1 <= q < 100 */
+	r = q;
+out_r:
+	/* 1 <= r < 100 */
+	*((u16 *)buf) = decpair[r];
+	buf += 2;
+	if (buf[-1] == '0')
+		buf--;
 	return buf;
 }
-#endif
 
-/* Similar to above but do not pad with zeros.
- * Code can be easily arranged to print 9 digits too, but our callers
- * always call put_dec_full9() instead when the number has 9 decimal digits.
- */
+#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
 static noinline_for_stack
-char *put_dec_trunc8(char *buf, unsigned r)
+char *put_dec_full8(char *buf, unsigned r)
 {
 	unsigned q;
 
-	/* Copy of previous function's body with added early returns */
-	while (r >= 10000) {
-		q = r + '0';
-		r = (r * (uint64_t)0x1999999a) >> 32;
-		*buf++ = q - 10*r;
-	}
+	/* 0 <= r < 10^8 */
+	q = (r * (u64)0x28f5c29) >> 32;
+	*((u16 *)buf) = decpair[r - 100*q];
+	buf += 2;
 
-	q = (r * 0x199a) >> 16; /* r <= 9999 */
-	*buf++ = (r - 10 * q) + '0';
-	if (q == 0)
-		return buf;
-	r = (q * 0xcd) >> 11; /* q <= 999 */
-	*buf++ = (q - 10 * r) + '0';
-	if (r == 0)
-		return buf;
-	q = (r * 0xcd) >> 11; /* r <= 99 */
-	*buf++ = (r - 10 * q) + '0';
-	if (q == 0)
-		return buf;
-	*buf++ = q + '0'; /* q <= 9 */
-	return buf;
-}
+	/* 0 <= q < 10^6 */
+	r = (q * (u64)0x28f5c29) >> 32;
+	*((u16 *)buf) = decpair[q - 100*r];
+	buf += 2;
 
-/* There are two algorithms to print larger numbers.
- * One is generic: divide by 1000000000 and repeatedly print
- * groups of (up to) 9 digits. It's conceptually simple,
- * but requires a (unsigned long long) / 1000000000 division.
- *
- * Second algorithm splits 64-bit unsigned long long into 16-bit chunks,
- * manipulates them cleverly and generates groups of 4 decimal digits.
- * It so happens that it does NOT require long long division.
- *
- * If long is > 32 bits, division of 64-bit values is relatively easy,
- * and we will use the first algorithm.
- * If long long is > 64 bits (strange architecture with VERY large long long),
- * second algorithm can't be used, and we again use the first one.
- *
- * Else (if long is 32 bits and long long is 64 bits) we use second one.
- */
+	/* 0 <= r < 10^4 */
+	q = (r * 0x147b) >> 19;
+	*((u16 *)buf) = decpair[r - 100*q];
+	buf += 2;
 
-#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64
-
-/* First algorithm: generic */
+	/* 0 <= q < 100 */
+	*((u16 *)buf) = decpair[q];
+	buf += 2;
+	return buf;
+}
 
-static
+static noinline_for_stack
 char *put_dec(char *buf, unsigned long long n)
 {
-	if (n >= 100*1000*1000) {
-		while (n >= 1000*1000*1000)
-			buf = put_dec_full9(buf, do_div(n, 1000*1000*1000));
-		if (n >= 100*1000*1000)
-			return put_dec_full9(buf, n);
-	}
+	if (n >= 100*1000*1000)
+		buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
+	/* 1 <= n <= 1.6e11 */
+	if (n >= 100*1000*1000)
+		buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
+	/* 1 <= n < 1e8 */
 	return put_dec_trunc8(buf, n);
 }
 
-#else
+#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64
 
-/* Second algorithm: valid only for 64-bit long longs */
-
-/* See comment in put_dec_full9 for choice of constants */
-static noinline_for_stack
-void put_dec_full4(char *buf, unsigned q)
+static void
+put_dec_full4(char *buf, unsigned r)
 {
-	unsigned r;
-	r = (q * 0xccd) >> 15;
-	buf[0] = (q - 10 * r) + '0';
-	q = (r * 0xcd) >> 11;
-	buf[1] = (r - 10 * q) + '0';
-	r = (q * 0xcd) >> 11;
-	buf[2] = (q - 10 * r) + '0';
-	buf[3] = r + '0';
+	unsigned q;
+
+	/* 0 <= r < 10^4 */
+	q = (r * 0x147b) >> 19;
+	*((u16 *)buf) = decpair[r - 100*q];
+	buf += 2;
+	/* 0 <= q < 100 */
+	*((u16 *)buf) = decpair[q];
 }
 
 /*
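The two constants in the new comment block can be checked exhaustively from user space. The author's full test and verification code is at the github URL in the commit message; a quick stand-alone sanity check of just these claims (not part of the patch) could look like this:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t x;

	/* x/100 == (x * 0x28f5c29) >> 32 for all x in [0, 10^8-1]
	 * (and in fact up to 1073741898, as the comment says) */
	for (x = 0; x <= 1073741898; x++)
		if ((uint32_t)((x * (uint64_t)0x28f5c29) >> 32) != x / 100) {
			printf("0x28f5c29 fails at %u\n", x);
			return 1;
		}

	/* x/100 == (x * 0x147b) >> 19 for all x <= 43698 */
	for (x = 0; x <= 43698; x++)
		if (((x * 0x147b) >> 19) != x / 100) {
			printf("0x147b fails at %u\n", x);
			return 1;
		}

	printf("both constants check out\n");
	return 0;
}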
@@ -265,9 +271,9 @@ void put_dec_full4(char *buf, unsigned q)
  * The approximation x/10000 == (x * 0x346DC5D7) >> 43
  * holds for all x < 1,128,869,999. The largest value this
  * helper will ever be asked to convert is 1,125,520,955.
- * (d1 in the put_dec code, assuming n is all-ones).
+ * (second call in the put_dec code, assuming n is all-ones).
  */
-static
+static noinline_for_stack
 unsigned put_dec_helper4(char *buf, unsigned x)
 {
 	uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;
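The same kind of exhaustive loop (again a stand-alone user-space check, not part of the patch) covers the 0x346DC5D7 constant used by put_dec_helper4() above:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t x;

	/* x/10000 == (x * 0x346DC5D7) >> 43 for all x < 1,128,869,999 */
	for (x = 0; x < 1128869999; x++)
		if ((uint32_t)((x * (uint64_t)0x346DC5D7) >> 43) != x / 10000) {
			printf("0x346DC5D7 fails at %u\n", x);
			return 1;
		}
	printf("0x346DC5D7 checks out\n");
	return 0;
}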
@@ -294,6 +300,8 @@ char *put_dec(char *buf, unsigned long long n)
 	d2 = (h ) & 0xffff;
 	d3 = (h >> 16); /* implicit "& 0xffff" */
 
+	/* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
+	     = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
 	q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
 	q = put_dec_helper4(buf, q);
 
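The coefficients 656, 7296 and 5536 in the q = ... line are simply the powers of 2^16 reduced modulo 10^4, as the new comment spells out; a tiny user-space check (not part of the patch):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0; the coefficients of the
	 * lowest four-digit group are the powers of 2^16 modulo 10^4 */
	assert((1ULL << 16) == 65536 && 65536 % 10000 == 5536);
	assert((1ULL << 32) == 4294967296ULL && 4294967296ULL % 10000 == 7296);
	assert((1ULL << 48) == 281474976710656ULL && 281474976710656ULL % 10000 == 656);
	printf("coefficients 5536, 7296 and 656 check out\n");
	return 0;
}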
@@ -323,7 +331,8 @@ char *put_dec(char *buf, unsigned long long n)
  */
 int num_to_str(char *buf, int size, unsigned long long num)
 {
-	char tmp[sizeof(num) * 3];
+	/* put_dec requires 2-byte alignment of the buffer. */
+	char tmp[sizeof(num) * 3] __aligned(2);
 	int idx, len;
 
 	/* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
@@ -384,7 +393,8 @@ static noinline_for_stack
 char *number(char *buf, char *end, unsigned long long num,
 	struct printf_spec spec)
 {
-	char tmp[3 * sizeof(num)];
+	/* put_dec requires 2-byte alignment of the buffer. */
+	char tmp[3 * sizeof(num)] __aligned(2);
 	char sign;
 	char locase;
 	int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
@@ -944,7 +954,7 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt)
 		break;
 	}
 	for (i = 0; i < 4; i++) {
-		char temp[3];	/* hold each IP quad in reverse order */
+		char temp[4] __aligned(2);	/* hold each IP quad in reverse order */
 		int digits = put_dec_trunc8(temp, addr[index]) - temp;
 		if (leading_zeros) {
 			if (digits < 3)
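The switch from temp[3] to temp[4] __aligned(2) follows from the new store pattern: put_dec_trunc8() now writes whole 16-bit digit pairs through an aligned u16 pointer, so converting a 3-digit quad touches four bytes even though at most three are kept, and for a zero quad the padding '0' of the single pair is dropped so that "0.0.0.0" still prints correctly. A rough stand-alone model of that last pair (byte-wise here for simplicity; the kernel uses one aligned u16 store of a cpu_to_le16() decpair[] entry):

#include <stdio.h>

/* Model the final digit pair of the reversed string: the ones digit is
 * stored first, then the tens digit, and a tens digit of '0' is dropped. */
static int emit_last_pair(char *buf, unsigned r)	/* r < 100 */
{
	buf[0] = '0' + r % 10;
	buf[1] = '0' + r / 10;
	return buf[1] == '0' ? 1 : 2;	/* digits actually kept */
}

int main(void)
{
	char temp[4];	/* two pairs: enough for any quad value below 10000 */

	printf("r=0  keeps %d digit(s)\n", emit_last_pair(temp, 0));	/* "0" */
	printf("r=72 keeps %d digit(s)\n", emit_last_pair(temp, 72));	/* stores "27", i.e. 72 reversed */
	return 0;
}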