aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--lib/vsprintf.c246
1 files changed, 128 insertions, 118 deletions
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3a1e0843f9a2..c93ec8a035b3 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -33,6 +33,7 @@
33 33
34#include <asm/page.h> /* for PAGE_SIZE */ 34#include <asm/page.h> /* for PAGE_SIZE */
35#include <asm/sections.h> /* for dereference_function_descriptor() */ 35#include <asm/sections.h> /* for dereference_function_descriptor() */
36#include <asm/byteorder.h> /* cpu_to_le16 */
36 37
37#include <linux/string_helpers.h> 38#include <linux/string_helpers.h>
38#include "kstrtox.h" 39#include "kstrtox.h"
@@ -122,142 +123,147 @@ int skip_atoi(const char **s)
122 return i; 123 return i;
123} 124}
124 125
125/* Decimal conversion is by far the most typical, and is used 126/*
126 * for /proc and /sys data. This directly impacts e.g. top performance 127 * Decimal conversion is by far the most typical, and is used for
127 * with many processes running. We optimize it for speed 128 * /proc and /sys data. This directly impacts e.g. top performance
128 * using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html> 129 * with many processes running. We optimize it for speed by emitting
129 * (with permission from the author, Douglas W. Jones). 130 * two characters at a time, using a 200 byte lookup table. This
131 * roughly halves the number of multiplications compared to computing
132 * the digits one at a time. Implementation strongly inspired by the
133 * previous version, which in turn used ideas described at
134 * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
135 * from the author, Douglas W. Jones).
136 *
137 * It turns out there is precisely one 26 bit fixed-point
138 * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
139 * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
140 * range happens to be somewhat larger (x <= 1073741898), but that's
141 * irrelevant for our purpose.
142 *
143 * For dividing a number in the range [10^4, 10^6-1] by 100, we still
144 * need a 32x32->64 bit multiply, so we simply use the same constant.
145 *
146 * For dividing a number in the range [100, 10^4-1] by 100, there are
147 * several options. The simplest is (x * 0x147b) >> 19, which is valid
148 * for all x <= 43698.
130 */ 149 */
131 150
132#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 151static const u16 decpair[100] = {
133/* Formats correctly any integer in [0, 999999999] */ 152#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
153 _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
154 _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
155 _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
156 _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
157 _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
158 _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
159 _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
160 _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
161 _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
162 _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
163#undef _
164};
165
166/*
167 * This will print a single '0' even if r == 0, since we would
168 * immediately jump to out_r where two 0s would be written and one of
169 * them then discarded. This is needed by ip4_string below. All other
170 * callers pass a non-zero value of r.
171*/
134static noinline_for_stack 172static noinline_for_stack
135char *put_dec_full9(char *buf, unsigned q) 173char *put_dec_trunc8(char *buf, unsigned r)
136{ 174{
137 unsigned r; 175 unsigned q;
138 176
139 /* 177 /* 1 <= r < 10^8 */
140 * Possible ways to approx. divide by 10 178 if (r < 100)
141 * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit) 179 goto out_r;
142 * (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul) 180
143 * (x * 0x6667) >> 18 x < 43699 181 /* 100 <= r < 10^8 */
144 * (x * 0x3334) >> 17 x < 16389 182 q = (r * (u64)0x28f5c29) >> 32;
145 * (x * 0x199a) >> 16 x < 16389 183 *((u16 *)buf) = decpair[r - 100*q];
146 * (x * 0x0ccd) >> 15 x < 16389 184 buf += 2;
147 * (x * 0x0667) >> 14 x < 2739 185
148 * (x * 0x0334) >> 13 x < 1029 186 /* 1 <= q < 10^6 */
149 * (x * 0x019a) >> 12 x < 1029 187 if (q < 100)
150 * (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386) 188 goto out_q;
151 * (x * 0x0067) >> 10 x < 179 189
152 * (x * 0x0034) >> 9 x < 69 same 190 /* 100 <= q < 10^6 */
153 * (x * 0x001a) >> 8 x < 69 same 191 r = (q * (u64)0x28f5c29) >> 32;
154 * (x * 0x000d) >> 7 x < 69 same, shortest code (on i386) 192 *((u16 *)buf) = decpair[q - 100*r];
155 * (x * 0x0007) >> 6 x < 19 193 buf += 2;
156 * See <http://www.cs.uiowa.edu/~jones/bcd/divide.html> 194
157 */ 195 /* 1 <= r < 10^4 */
158 r = (q * (uint64_t)0x1999999a) >> 32; 196 if (r < 100)
159 *buf++ = (q - 10 * r) + '0'; /* 1 */ 197 goto out_r;
160 q = (r * (uint64_t)0x1999999a) >> 32; 198
161 *buf++ = (r - 10 * q) + '0'; /* 2 */ 199 /* 100 <= r < 10^4 */
162 r = (q * (uint64_t)0x1999999a) >> 32; 200 q = (r * 0x147b) >> 19;
163 *buf++ = (q - 10 * r) + '0'; /* 3 */ 201 *((u16 *)buf) = decpair[r - 100*q];
164 q = (r * (uint64_t)0x1999999a) >> 32; 202 buf += 2;
165 *buf++ = (r - 10 * q) + '0'; /* 4 */ 203out_q:
166 r = (q * (uint64_t)0x1999999a) >> 32; 204 /* 1 <= q < 100 */
167 *buf++ = (q - 10 * r) + '0'; /* 5 */ 205 r = q;
168 /* Now value is under 10000, can avoid 64-bit multiply */ 206out_r:
169 q = (r * 0x199a) >> 16; 207 /* 1 <= r < 100 */
170 *buf++ = (r - 10 * q) + '0'; /* 6 */ 208 *((u16 *)buf) = decpair[r];
171 r = (q * 0xcd) >> 11; 209 buf += 2;
172 *buf++ = (q - 10 * r) + '0'; /* 7 */ 210 if (buf[-1] == '0')
173 q = (r * 0xcd) >> 11; 211 buf--;
174 *buf++ = (r - 10 * q) + '0'; /* 8 */
175 *buf++ = q + '0'; /* 9 */
176 return buf; 212 return buf;
177} 213}
178#endif
179 214
180/* Similar to above but do not pad with zeros. 215#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
181 * Code can be easily arranged to print 9 digits too, but our callers
182 * always call put_dec_full9() instead when the number has 9 decimal digits.
183 */
184static noinline_for_stack 216static noinline_for_stack
185char *put_dec_trunc8(char *buf, unsigned r) 217char *put_dec_full8(char *buf, unsigned r)
186{ 218{
187 unsigned q; 219 unsigned q;
188 220
189 /* Copy of previous function's body with added early returns */ 221 /* 0 <= r < 10^8 */
190 while (r >= 10000) { 222 q = (r * (u64)0x28f5c29) >> 32;
191 q = r + '0'; 223 *((u16 *)buf) = decpair[r - 100*q];
192 r = (r * (uint64_t)0x1999999a) >> 32; 224 buf += 2;
193 *buf++ = q - 10*r;
194 }
195 225
196 q = (r * 0x199a) >> 16; /* r <= 9999 */ 226 /* 0 <= q < 10^6 */
197 *buf++ = (r - 10 * q) + '0'; 227 r = (q * (u64)0x28f5c29) >> 32;
198 if (q == 0) 228 *((u16 *)buf) = decpair[q - 100*r];
199 return buf; 229 buf += 2;
200 r = (q * 0xcd) >> 11; /* q <= 999 */
201 *buf++ = (q - 10 * r) + '0';
202 if (r == 0)
203 return buf;
204 q = (r * 0xcd) >> 11; /* r <= 99 */
205 *buf++ = (r - 10 * q) + '0';
206 if (q == 0)
207 return buf;
208 *buf++ = q + '0'; /* q <= 9 */
209 return buf;
210}
211 230
212/* There are two algorithms to print larger numbers. 231 /* 0 <= r < 10^4 */
213 * One is generic: divide by 1000000000 and repeatedly print 232 q = (r * 0x147b) >> 19;
214 * groups of (up to) 9 digits. It's conceptually simple, 233 *((u16 *)buf) = decpair[r - 100*q];
215 * but requires a (unsigned long long) / 1000000000 division. 234 buf += 2;
216 *
217 * Second algorithm splits 64-bit unsigned long long into 16-bit chunks,
218 * manipulates them cleverly and generates groups of 4 decimal digits.
219 * It so happens that it does NOT require long long division.
220 *
221 * If long is > 32 bits, division of 64-bit values is relatively easy,
222 * and we will use the first algorithm.
223 * If long long is > 64 bits (strange architecture with VERY large long long),
224 * second algorithm can't be used, and we again use the first one.
225 *
226 * Else (if long is 32 bits and long long is 64 bits) we use second one.
227 */
228 235
229#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 236 /* 0 <= q < 100 */
230 237 *((u16 *)buf) = decpair[q];
231/* First algorithm: generic */ 238 buf += 2;
239 return buf;
240}
232 241
233static 242static noinline_for_stack
234char *put_dec(char *buf, unsigned long long n) 243char *put_dec(char *buf, unsigned long long n)
235{ 244{
236 if (n >= 100*1000*1000) { 245 if (n >= 100*1000*1000)
237 while (n >= 1000*1000*1000) 246 buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
238 buf = put_dec_full9(buf, do_div(n, 1000*1000*1000)); 247 /* 1 <= n <= 1.6e11 */
239 if (n >= 100*1000*1000) 248 if (n >= 100*1000*1000)
240 return put_dec_full9(buf, n); 249 buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
241 } 250 /* 1 <= n < 1e8 */
242 return put_dec_trunc8(buf, n); 251 return put_dec_trunc8(buf, n);
243} 252}
244 253
245#else 254#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64
246 255
247/* Second algorithm: valid only for 64-bit long longs */ 256static void
248 257put_dec_full4(char *buf, unsigned r)
249/* See comment in put_dec_full9 for choice of constants */
250static noinline_for_stack
251void put_dec_full4(char *buf, unsigned q)
252{ 258{
253 unsigned r; 259 unsigned q;
254 r = (q * 0xccd) >> 15; 260
255 buf[0] = (q - 10 * r) + '0'; 261 /* 0 <= r < 10^4 */
256 q = (r * 0xcd) >> 11; 262 q = (r * 0x147b) >> 19;
257 buf[1] = (r - 10 * q) + '0'; 263 *((u16 *)buf) = decpair[r - 100*q];
258 r = (q * 0xcd) >> 11; 264 buf += 2;
259 buf[2] = (q - 10 * r) + '0'; 265 /* 0 <= q < 100 */
260 buf[3] = r + '0'; 266 *((u16 *)buf) = decpair[q];
261} 267}
262 268
263/* 269/*
@@ -265,9 +271,9 @@ void put_dec_full4(char *buf, unsigned q)
265 * The approximation x/10000 == (x * 0x346DC5D7) >> 43 271 * The approximation x/10000 == (x * 0x346DC5D7) >> 43
266 * holds for all x < 1,128,869,999. The largest value this 272 * holds for all x < 1,128,869,999. The largest value this
267 * helper will ever be asked to convert is 1,125,520,955. 273 * helper will ever be asked to convert is 1,125,520,955.
268 * (d1 in the put_dec code, assuming n is all-ones). 274 * (second call in the put_dec code, assuming n is all-ones).
269 */ 275 */
270static 276static noinline_for_stack
271unsigned put_dec_helper4(char *buf, unsigned x) 277unsigned put_dec_helper4(char *buf, unsigned x)
272{ 278{
273 uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; 279 uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;
@@ -294,6 +300,8 @@ char *put_dec(char *buf, unsigned long long n)
294 d2 = (h ) & 0xffff; 300 d2 = (h ) & 0xffff;
295 d3 = (h >> 16); /* implicit "& 0xffff" */ 301 d3 = (h >> 16); /* implicit "& 0xffff" */
296 302
303 /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
304 = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
297 q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); 305 q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
298 q = put_dec_helper4(buf, q); 306 q = put_dec_helper4(buf, q);
299 307
@@ -323,7 +331,8 @@ char *put_dec(char *buf, unsigned long long n)
323 */ 331 */
324int num_to_str(char *buf, int size, unsigned long long num) 332int num_to_str(char *buf, int size, unsigned long long num)
325{ 333{
326 char tmp[sizeof(num) * 3]; 334 /* put_dec requires 2-byte alignment of the buffer. */
335 char tmp[sizeof(num) * 3] __aligned(2);
327 int idx, len; 336 int idx, len;
328 337
329 /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ 338 /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
@@ -384,7 +393,8 @@ static noinline_for_stack
384char *number(char *buf, char *end, unsigned long long num, 393char *number(char *buf, char *end, unsigned long long num,
385 struct printf_spec spec) 394 struct printf_spec spec)
386{ 395{
387 char tmp[3 * sizeof(num)]; 396 /* put_dec requires 2-byte alignment of the buffer. */
397 char tmp[3 * sizeof(num)] __aligned(2);
388 char sign; 398 char sign;
389 char locase; 399 char locase;
390 int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); 400 int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
@@ -944,7 +954,7 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt)
944 break; 954 break;
945 } 955 }
946 for (i = 0; i < 4; i++) { 956 for (i = 0; i < 4; i++) {
947 char temp[3]; /* hold each IP quad in reverse order */ 957 char temp[4] __aligned(2); /* hold each IP quad in reverse order */
948 int digits = put_dec_trunc8(temp, addr[index]) - temp; 958 int digits = put_dec_trunc8(temp, addr[index]) - temp;
949 if (leading_zeros) { 959 if (leading_zeros) {
950 if (digits < 3) 960 if (digits < 3)