1 files changed, 126 insertions, 118 deletions
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3a1e0843f9a2..da39c608a28c 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -33,6 +33,7 @@
 #include <asm/page.h>           /* for PAGE_SIZE */
 #include <asm/sections.h>       /* for dereference_function_descriptor() */
+#include <asm/byteorder.h>      /* cpu_to_le16 */
 #include <linux/string_helpers.h>
 #include "kstrtox.h"
@@ -122,142 +123,145 @@ int skip_atoi(const char **s)
        return i;
 }
-/* Decimal conversion is by far the most typical, and is used
+/*
- * for /proc and /sys data. This directly impacts e.g. top performance
+ * Decimal conversion is by far the most typical, and is used for
- * with many processes running. We optimize it for speed
+ * /proc and /sys data. This directly impacts e.g. top performance
- * using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
+ * with many processes running. We optimize it for speed by emitting
- * (with permission from the author, Douglas W. Jones).
+ * two characters at a time, using a 200 byte lookup table. This
+ * roughly halves the number of multiplications compared to computing
+ * the digits one at a time. Implementation strongly inspired by the
+ * previous version, which in turn used ideas described at
+ * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
+ * from the author, Douglas W. Jones).
+ *
+ * It turns out there is precisely one 26 bit fixed-point
+ * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
+ * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
+ * range happens to be somewhat larger (x <= 1073741898), but that's
+ * irrelevant for our purpose.
+ *
+ * For dividing a number in the range [10^4, 10^6-1] by 100, we still
+ * need a 32x32->64 bit multiply, so we simply use the same constant.
+ *
+ * For dividing a number in the range [100, 10^4-1] by 100, there are
+ * several options. The simplest is (x * 0x147b) >> 19, which is valid
+ * for all x <= 43698.
 */
-#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64
+static const u16 decpair[100] = {
-/* Formats correctly any integer in [0, 999999999] */
+#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
+        _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
+        _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
+        _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
+        _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
+        _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
+        _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
+        _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
+        _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
+        _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
+        _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
+#undef _
+};
+/*
+ * This will print a single '0' even if r == 0, since we would
+ * immediately jump to out_r where two 0s would be written but only
+ * one of them accounted for in buf. This is needed by ip4_string
+ * below. All other callers pass a non-zero value of r.
+*/
 static noinline_for_stack
-char *put_dec_full9(char *buf, unsigned q)
+char *put_dec_trunc8(char *buf, unsigned r)
 {
-        unsigned r;
+        unsigned q;
-        /*
+        /* 1 <= r < 10^8 */
-         * Possible ways to approx. divide by 10
+        if (r < 100)
-         * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit)
+                goto out_r;
-         * (x * 0xcccd) >> 19     x <      81920 (x < 262149 when 64-bit mul)
-         * (x * 0x6667) >> 18     x <      43699
+        /* 100 <= r < 10^8 */
-         * (x * 0x3334) >> 17     x <      16389
+        q = (r * (u64)0x28f5c29) >> 32;
-         * (x * 0x199a) >> 16     x <      16389
+        *((u16 *)buf) = decpair[r - 100*q];
-         * (x * 0x0ccd) >> 15     x <      16389
+        buf += 2;
-         * (x * 0x0667) >> 14     x <       2739
-         * (x * 0x0334) >> 13     x <       1029
+        /* 1 <= q < 10^6 */
-         * (x * 0x019a) >> 12     x <       1029
+        if (q < 100)
-         * (x * 0x00cd) >> 11     x <       1029 shorter code than * 0x67 (on i386)
+                goto out_q;
-         * (x * 0x0067) >> 10     x <        179
-         * (x * 0x0034) >>  9     x <         69 same
+        /*  100 <= q < 10^6 */
-         * (x * 0x001a) >>  8     x <         69 same
+        r = (q * (u64)0x28f5c29) >> 32;
-         * (x * 0x000d) >>  7     x <         69 same, shortest code (on i386)
+        *((u16 *)buf) = decpair[q - 100*r];
-         * (x * 0x0007) >>  6     x <         19
+        buf += 2;
-         * See <http://www.cs.uiowa.edu/~jones/bcd/divide.html>
-         */
+        /* 1 <= r < 10^4 */
-        r      = (q * (uint64_t)0x1999999a) >> 32;
+        if (r < 100)
-        *buf++ = (q - 10 * r) + '0'; /* 1 */
+                goto out_r;
-        q      = (r * (uint64_t)0x1999999a) >> 32;
-        *buf++ = (r - 10 * q) + '0'; /* 2 */
+        /* 100 <= r < 10^4 */
-        r      = (q * (uint64_t)0x1999999a) >> 32;
+        q = (r * 0x147b) >> 19;
-        *buf++ = (q - 10 * r) + '0'; /* 3 */
+        *((u16 *)buf) = decpair[r - 100*q];
-        q      = (r * (uint64_t)0x1999999a) >> 32;
+        buf += 2;
-        *buf++ = (r - 10 * q) + '0'; /* 4 */
+out_q:
-        r      = (q * (uint64_t)0x1999999a) >> 32;
+        /* 1 <= q < 100 */
-        *buf++ = (q - 10 * r) + '0'; /* 5 */
+        r = q;
-        /* Now value is under 10000, can avoid 64-bit multiply */
+out_r:
-        q      = (r * 0x199a) >> 16;
+        /* 1 <= r < 100 */
-        *buf++ = (r - 10 * q)  + '0'; /* 6 */
+        *((u16 *)buf) = decpair[r];
-        r      = (q * 0xcd) >> 11;
+        buf += r < 10 ? 1 : 2;
-        *buf++ = (q - 10 * r)  + '0'; /* 7 */
-        q      = (r * 0xcd) >> 11;
-        *buf++ = (r - 10 * q) + '0'; /* 8 */
-        *buf++ = q + '0'; /* 9 */
        return buf;
 }
-#endif
-/* Similar to above but do not pad with zeros.
+#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
- * Code can be easily arranged to print 9 digits too, but our callers
- * always call put_dec_full9() instead when the number has 9 decimal digits.
- */
 static noinline_for_stack
-char *put_dec_trunc8(char *buf, unsigned r)
+char *put_dec_full8(char *buf, unsigned r)
 {
        unsigned q;
-        /* Copy of previous function's body with added early returns */
+        /* 0 <= r < 10^8 */
-        while (r >= 10000) {
+        q = (r * (u64)0x28f5c29) >> 32;
-                q = r + '0';
+        *((u16 *)buf) = decpair[r - 100*q];
-                r  = (r * (uint64_t)0x1999999a) >> 32;
+        buf += 2;
-                *buf++ = q - 10*r;
-        }
-        q      = (r * 0x199a) >> 16;    /* r <= 9999 */
+        /* 0 <= q < 10^6 */
-        *buf++ = (r - 10 * q)  + '0';
+        r = (q * (u64)0x28f5c29) >> 32;
-        if (q == 0)
+        *((u16 *)buf) = decpair[q - 100*r];
-                return buf;
+        buf += 2;
-        r      = (q * 0xcd) >> 11;      /* q <= 999 */
-        *buf++ = (q - 10 * r)  + '0';
-        if (r == 0)
-                return buf;
-        q      = (r * 0xcd) >> 11;      /* r <= 99 */
-        *buf++ = (r - 10 * q) + '0';
-        if (q == 0)
-                return buf;
-        *buf++ = q + '0';                /* q <= 9 */
-        return buf;
-}
-/* There are two algorithms to print larger numbers.
+        /* 0 <= r < 10^4 */
- * One is generic: divide by 1000000000 and repeatedly print
+        q = (r * 0x147b) >> 19;
- * groups of (up to) 9 digits. It's conceptually simple,
+        *((u16 *)buf) = decpair[r - 100*q];
- * but requires a (unsigned long long) / 1000000000 division.
+        buf += 2;
- *
- * Second algorithm splits 64-bit unsigned long long into 16-bit chunks,
- * manipulates them cleverly and generates groups of 4 decimal digits.
- * It so happens that it does NOT require long long division.
- *
- * If long is > 32 bits, division of 64-bit values is relatively easy,
- * and we will use the first algorithm.
- * If long long is > 64 bits (strange architecture with VERY large long long),
- * second algorithm can't be used, and we again use the first one.
- *
- * Else (if long is 32 bits and long long is 64 bits) we use second one.
- */
-#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64
+        /* 0 <= q < 100 */
+        *((u16 *)buf) = decpair[q];
-/* First algorithm: generic */
+        buf += 2;
+        return buf;
+}
-static
+static noinline_for_stack
 char *put_dec(char *buf, unsigned long long n)
 {
-        if (n >= 100*1000*1000) {
+        if (n >= 100*1000*1000)
-                while (n >= 1000*1000*1000)
+                buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
-                        buf = put_dec_full9(buf, do_div(n, 1000*1000*1000));
+        /* 1 <= n <= 1.6e11 */
-                if (n >= 100*1000*1000)
+        if (n >= 100*1000*1000)
-                        return put_dec_full9(buf, n);
+                buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
-        }
+        /* 1 <= n < 1e8 */
        return put_dec_trunc8(buf, n);
 }
-#else
+#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64
-/* Second algorithm: valid only for 64-bit long longs */
+static void
+put_dec_full4(char *buf, unsigned r)
-/* See comment in put_dec_full9 for choice of constants */
-static noinline_for_stack
-void put_dec_full4(char *buf, unsigned q)
 {
-        unsigned r;
+        unsigned q;
-        r      = (q * 0xccd) >> 15;
-        buf[0] = (q - 10 * r) + '0';
+        /* 0 <= r < 10^4 */
-        q      = (r * 0xcd) >> 11;
+        q = (r * 0x147b) >> 19;
-        buf[1] = (r - 10 * q)  + '0';
+        *((u16 *)buf) = decpair[r - 100*q];
-        r      = (q * 0xcd) >> 11;
+        buf += 2;
-        buf[2] = (q - 10 * r)  + '0';
+        /* 0 <= q < 100 */
-        buf[3] = r + '0';
+        *((u16 *)buf) = decpair[q];
 }
 /*
@@ -265,9 +269,9 @@ void put_dec_full4(char *buf, unsigned q)
 * The approximation x/10000 == (x * 0x346DC5D7) >> 43
 * holds for all x < 1,128,869,999.  The largest value this
 * helper will ever be asked to convert is 1,125,520,955.
- * (d1 in the put_dec code, assuming n is all-ones).
+ * (second call in the put_dec code, assuming n is all-ones).
 */
-static
+static noinline_for_stack
 unsigned put_dec_helper4(char *buf, unsigned x)
 {
        uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;
@@ -294,6 +298,8 @@ char *put_dec(char *buf, unsigned long long n)
        d2  = (h      ) & 0xffff;
        d3  = (h >> 16); /* implicit "& 0xffff" */
+        /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
+             = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
        q   = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
        q = put_dec_helper4(buf, q);
@@ -323,7 +329,8 @@ char *put_dec(char *buf, unsigned long long n)
 */
 int num_to_str(char *buf, int size, unsigned long long num)
 {
-        char tmp[sizeof(num) * 3];
+        /* put_dec requires 2-byte alignment of the buffer. */
+        char tmp[sizeof(num) * 3] __aligned(2);
        int idx, len;
        /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
@@ -384,7 +391,8 @@ static noinline_for_stack
 char *number(char *buf, char *end, unsigned long long num,
             struct printf_spec spec)
 {
-        char tmp[3 * sizeof(num)];
+        /* put_dec requires 2-byte alignment of the buffer. */
+        char tmp[3 * sizeof(num)] __aligned(2);
        char sign;
        char locase;
        int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
@@ -944,7 +952,7 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt)
                break;
        }
        for (i = 0; i < 4; i++) {
-                char temp[3];   /* hold each IP quad in reverse order */
+                char temp[4] __aligned(2);      /* hold each IP quad in reverse order */
                int digits = put_dec_trunc8(temp, addr[index]) - temp;
                if (leading_zeros) {
                        if (digits < 3)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 3a1e0843f9a2..da39c608a28c 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c
@@ -33,6 +33,7 @@
33		33
34	#include <asm/page.h> /* for PAGE_SIZE */	34	#include <asm/page.h> /* for PAGE_SIZE */
35	#include <asm/sections.h> /* for dereference_function_descriptor() */	35	#include <asm/sections.h> /* for dereference_function_descriptor() */
		36	#include <asm/byteorder.h> /* cpu_to_le16 */
36		37
37	#include <linux/string_helpers.h>	38	#include <linux/string_helpers.h>
38	#include "kstrtox.h"	39	#include "kstrtox.h"
@@ -122,142 +123,145 @@ int skip_atoi(const char **s)
122	return i;	123	return i;
123	}	124	}
124		125
125	/* Decimal conversion is by far the most typical, and is used	126	/*
126	* for /proc and /sys data. This directly impacts e.g. top performance	127	* Decimal conversion is by far the most typical, and is used for
127	* with many processes running. We optimize it for speed	128	* /proc and /sys data. This directly impacts e.g. top performance
128	* using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html>	129	* with many processes running. We optimize it for speed by emitting
129	* (with permission from the author, Douglas W. Jones).	130	* two characters at a time, using a 200 byte lookup table. This
		131	* roughly halves the number of multiplications compared to computing
		132	* the digits one at a time. Implementation strongly inspired by the
		133	* previous version, which in turn used ideas described at
		134	* <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
		135	* from the author, Douglas W. Jones).
		136	*
		137	* It turns out there is precisely one 26 bit fixed-point
		138	* approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
		139	* holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
		140	* range happens to be somewhat larger (x <= 1073741898), but that's
		141	* irrelevant for our purpose.
		142	*
		143	* For dividing a number in the range [10^4, 10^6-1] by 100, we still
		144	* need a 32x32->64 bit multiply, so we simply use the same constant.
		145	*
		146	* For dividing a number in the range [100, 10^4-1] by 100, there are
		147	* several options. The simplest is (x * 0x147b) >> 19, which is valid
		148	* for all x <= 43698.
130	*/	149	*/
131		150
132	#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64	151	static const u16 decpair[100] = {
133	/* Formats correctly any integer in [0, 999999999] */	152	#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
		153	_( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
		154	_(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
		155	_(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
		156	_(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
		157	_(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
		158	_(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
		159	_(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
		160	_(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
		161	_(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
		162	_(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
		163	#undef _
		164	};
		165
		166	/*
		167	* This will print a single '0' even if r == 0, since we would
		168	* immediately jump to out_r where two 0s would be written but only
		169	* one of them accounted for in buf. This is needed by ip4_string
		170	* below. All other callers pass a non-zero value of r.
		171	*/
134	static noinline_for_stack	172	static noinline_for_stack
135	char *put_dec_full9(char *buf, unsigned q)	173	char *put_dec_trunc8(char *buf, unsigned r)
136	{	174	{
137	unsigned r;	175	unsigned q;
138		176
139	/*	177	/* 1 <= r < 10^8 */
140	* Possible ways to approx. divide by 10	178	if (r < 100)
141	* (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit)	179	goto out_r;
142	* (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul)	180
143	* (x * 0x6667) >> 18 x < 43699	181	/* 100 <= r < 10^8 */
144	* (x * 0x3334) >> 17 x < 16389	182	q = (r * (u64)0x28f5c29) >> 32;
145	* (x * 0x199a) >> 16 x < 16389	183	*((u16 *)buf) = decpair[r - 100*q];
146	* (x * 0x0ccd) >> 15 x < 16389	184	buf += 2;
147	* (x * 0x0667) >> 14 x < 2739	185
148	* (x * 0x0334) >> 13 x < 1029	186	/* 1 <= q < 10^6 */
149	* (x * 0x019a) >> 12 x < 1029	187	if (q < 100)
150	* (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386)	188	goto out_q;
151	* (x * 0x0067) >> 10 x < 179	189
152	* (x * 0x0034) >> 9 x < 69 same	190	/* 100 <= q < 10^6 */
153	* (x * 0x001a) >> 8 x < 69 same	191	r = (q * (u64)0x28f5c29) >> 32;
154	* (x * 0x000d) >> 7 x < 69 same, shortest code (on i386)	192	*((u16 *)buf) = decpair[q - 100*r];
155	* (x * 0x0007) >> 6 x < 19	193	buf += 2;
156	* See <http://www.cs.uiowa.edu/~jones/bcd/divide.html>	194
157	*/	195	/* 1 <= r < 10^4 */
158	r = (q * (uint64_t)0x1999999a) >> 32;	196	if (r < 100)
159	*buf++ = (q - 10 * r) + '0'; /* 1 */	197	goto out_r;
160	q = (r * (uint64_t)0x1999999a) >> 32;	198
161	*buf++ = (r - 10 * q) + '0'; /* 2 */	199	/* 100 <= r < 10^4 */
162	r = (q * (uint64_t)0x1999999a) >> 32;	200	q = (r * 0x147b) >> 19;
163	*buf++ = (q - 10 * r) + '0'; /* 3 */	201	*((u16 *)buf) = decpair[r - 100*q];
164	q = (r * (uint64_t)0x1999999a) >> 32;	202	buf += 2;
165	*buf++ = (r - 10 * q) + '0'; /* 4 */	203	out_q:
166	r = (q * (uint64_t)0x1999999a) >> 32;	204	/* 1 <= q < 100 */
167	*buf++ = (q - 10 * r) + '0'; /* 5 */	205	r = q;
168	/* Now value is under 10000, can avoid 64-bit multiply */	206	out_r:
169	q = (r * 0x199a) >> 16;	207	/* 1 <= r < 100 */
170	*buf++ = (r - 10 * q) + '0'; /* 6 */	208	*((u16 *)buf) = decpair[r];
171	r = (q * 0xcd) >> 11;	209	buf += r < 10 ? 1 : 2;
172	*buf++ = (q - 10 * r) + '0'; /* 7 */
173	q = (r * 0xcd) >> 11;
174	*buf++ = (r - 10 * q) + '0'; /* 8 */
175	*buf++ = q + '0'; /* 9 */
176	return buf;	210	return buf;
177	}	211	}
178	#endif
179		212
180	/* Similar to above but do not pad with zeros.	213	#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
181	* Code can be easily arranged to print 9 digits too, but our callers
182	* always call put_dec_full9() instead when the number has 9 decimal digits.
183	*/
184	static noinline_for_stack	214	static noinline_for_stack
185	char *put_dec_trunc8(char *buf, unsigned r)	215	char *put_dec_full8(char *buf, unsigned r)
186	{	216	{
187	unsigned q;	217	unsigned q;
188		218
189	/* Copy of previous function's body with added early returns */	219	/* 0 <= r < 10^8 */
190	while (r >= 10000) {	220	q = (r * (u64)0x28f5c29) >> 32;
191	q = r + '0';	221	*((u16 *)buf) = decpair[r - 100*q];
192	r = (r * (uint64_t)0x1999999a) >> 32;	222	buf += 2;
193	*buf++ = q - 10*r;
194	}
195		223
196	q = (r * 0x199a) >> 16; /* r <= 9999 */	224	/* 0 <= q < 10^6 */
197	*buf++ = (r - 10 * q) + '0';	225	r = (q * (u64)0x28f5c29) >> 32;
198	if (q == 0)	226	*((u16 *)buf) = decpair[q - 100*r];
199	return buf;	227	buf += 2;
200	r = (q * 0xcd) >> 11; /* q <= 999 */
201	*buf++ = (q - 10 * r) + '0';
202	if (r == 0)
203	return buf;
204	q = (r * 0xcd) >> 11; /* r <= 99 */
205	*buf++ = (r - 10 * q) + '0';
206	if (q == 0)
207	return buf;
208	*buf++ = q + '0'; /* q <= 9 */
209	return buf;
210	}
211		228
212	/* There are two algorithms to print larger numbers.	229	/* 0 <= r < 10^4 */
213	* One is generic: divide by 1000000000 and repeatedly print	230	q = (r * 0x147b) >> 19;
214	* groups of (up to) 9 digits. It's conceptually simple,	231	*((u16 *)buf) = decpair[r - 100*q];
215	* but requires a (unsigned long long) / 1000000000 division.	232	buf += 2;
216	*
217	* Second algorithm splits 64-bit unsigned long long into 16-bit chunks,
218	* manipulates them cleverly and generates groups of 4 decimal digits.
219	* It so happens that it does NOT require long long division.
220	*
221	* If long is > 32 bits, division of 64-bit values is relatively easy,
222	* and we will use the first algorithm.
223	* If long long is > 64 bits (strange architecture with VERY large long long),
224	* second algorithm can't be used, and we again use the first one.
225	*
226	* Else (if long is 32 bits and long long is 64 bits) we use second one.
227	*/
228		233
229	#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64	234	/* 0 <= q < 100 */
230		235	*((u16 *)buf) = decpair[q];
231	/* First algorithm: generic */	236	buf += 2;
		237	return buf;
		238	}
232		239
233	static	240	static noinline_for_stack
234	char *put_dec(char *buf, unsigned long long n)	241	char *put_dec(char *buf, unsigned long long n)
235	{	242	{
236	if (n >= 100*1000*1000) {	243	if (n >= 100*1000*1000)
237	while (n >= 1000*1000*1000)	244	buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
238	buf = put_dec_full9(buf, do_div(n, 1000*1000*1000));	245	/* 1 <= n <= 1.6e11 */
239	if (n >= 100*1000*1000)	246	if (n >= 100*1000*1000)
240	return put_dec_full9(buf, n);	247	buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
241	}	248	/* 1 <= n < 1e8 */
242	return put_dec_trunc8(buf, n);	249	return put_dec_trunc8(buf, n);
243	}	250	}
244		251
245	#else	252	#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64
246		253
247	/* Second algorithm: valid only for 64-bit long longs */	254	static void
248		255	put_dec_full4(char *buf, unsigned r)
249	/* See comment in put_dec_full9 for choice of constants */
250	static noinline_for_stack
251	void put_dec_full4(char *buf, unsigned q)
252	{	256	{
253	unsigned r;	257	unsigned q;
254	r = (q * 0xccd) >> 15;	258
255	buf[0] = (q - 10 * r) + '0';	259	/* 0 <= r < 10^4 */
256	q = (r * 0xcd) >> 11;	260	q = (r * 0x147b) >> 19;
257	buf[1] = (r - 10 * q) + '0';	261	*((u16 *)buf) = decpair[r - 100*q];
258	r = (q * 0xcd) >> 11;	262	buf += 2;
259	buf[2] = (q - 10 * r) + '0';	263	/* 0 <= q < 100 */
260	buf[3] = r + '0';	264	*((u16 *)buf) = decpair[q];
261	}	265	}
262		266
263	/*	267	/*
@@ -265,9 +269,9 @@ void put_dec_full4(char *buf, unsigned q)
265	* The approximation x/10000 == (x * 0x346DC5D7) >> 43	269	* The approximation x/10000 == (x * 0x346DC5D7) >> 43
266	* holds for all x < 1,128,869,999. The largest value this	270	* holds for all x < 1,128,869,999. The largest value this
267	* helper will ever be asked to convert is 1,125,520,955.	271	* helper will ever be asked to convert is 1,125,520,955.
268	* (d1 in the put_dec code, assuming n is all-ones).	272	* (second call in the put_dec code, assuming n is all-ones).
269	*/	273	*/
270	static	274	static noinline_for_stack
271	unsigned put_dec_helper4(char *buf, unsigned x)	275	unsigned put_dec_helper4(char *buf, unsigned x)
272	{	276	{
273	uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;	277	uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;
@@ -294,6 +298,8 @@ char *put_dec(char *buf, unsigned long long n)
294	d2 = (h ) & 0xffff;	298	d2 = (h ) & 0xffff;
295	d3 = (h >> 16); /* implicit "& 0xffff" */	299	d3 = (h >> 16); /* implicit "& 0xffff" */
296		300
		301	/* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
		302	= 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
297	q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);	303	q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
298	q = put_dec_helper4(buf, q);	304	q = put_dec_helper4(buf, q);
299		305
@@ -323,7 +329,8 @@ char *put_dec(char *buf, unsigned long long n)
323	*/	329	*/
324	int num_to_str(char *buf, int size, unsigned long long num)	330	int num_to_str(char *buf, int size, unsigned long long num)
325	{	331	{
326	char tmp[sizeof(num) * 3];	332	/* put_dec requires 2-byte alignment of the buffer. */
		333	char tmp[sizeof(num) * 3] __aligned(2);
327	int idx, len;	334	int idx, len;
328		335
329	/* put_dec() may work incorrectly for num = 0 (generate "", not "0") */	336	/* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
@@ -384,7 +391,8 @@ static noinline_for_stack
384	char *number(char *buf, char *end, unsigned long long num,	391	char *number(char *buf, char *end, unsigned long long num,
385	struct printf_spec spec)	392	struct printf_spec spec)
386	{	393	{
387	char tmp[3 * sizeof(num)];	394	/* put_dec requires 2-byte alignment of the buffer. */
		395	char tmp[3 * sizeof(num)] __aligned(2);
388	char sign;	396	char sign;
389	char locase;	397	char locase;
390	int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);	398	int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
@@ -944,7 +952,7 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt)
944	break;	952	break;
945	}	953	}
946	for (i = 0; i < 4; i++) {	954	for (i = 0; i < 4; i++) {
947	char temp[3]; /* hold each IP quad in reverse order */	955	char temp[4] __aligned(2); /* hold each IP quad in reverse order */
948	int digits = put_dec_trunc8(temp, addr[index]) - temp;	956	int digits = put_dec_trunc8(temp, addr[index]) - temp;
949	if (leading_zeros) {	957	if (leading_zeros) {
950	if (digits < 3)	958	if (digits < 3)