diff options
-rw-r--r-- | lib/vsprintf.c | 246 |
1 files changed, 128 insertions, 118 deletions
diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 3a1e0843f9a2..c93ec8a035b3 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c | |||
@@ -33,6 +33,7 @@ | |||
33 | 33 | ||
34 | #include <asm/page.h> /* for PAGE_SIZE */ | 34 | #include <asm/page.h> /* for PAGE_SIZE */ |
35 | #include <asm/sections.h> /* for dereference_function_descriptor() */ | 35 | #include <asm/sections.h> /* for dereference_function_descriptor() */ |
36 | #include <asm/byteorder.h> /* cpu_to_le16 */ | ||
36 | 37 | ||
37 | #include <linux/string_helpers.h> | 38 | #include <linux/string_helpers.h> |
38 | #include "kstrtox.h" | 39 | #include "kstrtox.h" |
@@ -122,142 +123,147 @@ int skip_atoi(const char **s) | |||
122 | return i; | 123 | return i; |
123 | } | 124 | } |
124 | 125 | ||
125 | /* Decimal conversion is by far the most typical, and is used | 126 | /* |
126 | * for /proc and /sys data. This directly impacts e.g. top performance | 127 | * Decimal conversion is by far the most typical, and is used for |
127 | * with many processes running. We optimize it for speed | 128 | * /proc and /sys data. This directly impacts e.g. top performance |
128 | * using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html> | 129 | * with many processes running. We optimize it for speed by emitting |
129 | * (with permission from the author, Douglas W. Jones). | 130 | * two characters at a time, using a 200 byte lookup table. This |
131 | * roughly halves the number of multiplications compared to computing | ||
132 | * the digits one at a time. Implementation strongly inspired by the | ||
133 | * previous version, which in turn used ideas described at | ||
134 | * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission | ||
135 | * from the author, Douglas W. Jones). | ||
136 | * | ||
137 | * It turns out there is precisely one 26 bit fixed-point | ||
138 | * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32 | ||
139 | * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual | ||
140 | * range happens to be somewhat larger (x <= 1073741898), but that's | ||
141 | * irrelevant for our purpose. | ||
142 | * | ||
143 | * For dividing a number in the range [10^4, 10^6-1] by 100, we still | ||
144 | * need a 32x32->64 bit multiply, so we simply use the same constant. | ||
145 | * | ||
146 | * For dividing a number in the range [100, 10^4-1] by 100, there are | ||
147 | * several options. The simplest is (x * 0x147b) >> 19, which is valid | ||
148 | * for all x <= 43698. | ||
130 | */ | 149 | */ |
131 | 150 | ||
132 | #if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 | 151 | static const u16 decpair[100] = { |
133 | /* Formats correctly any integer in [0, 999999999] */ | 152 | #define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030) |
153 | _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9), | ||
154 | _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19), | ||
155 | _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29), | ||
156 | _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39), | ||
157 | _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49), | ||
158 | _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59), | ||
159 | _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69), | ||
160 | _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79), | ||
161 | _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89), | ||
162 | _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99), | ||
163 | #undef _ | ||
164 | }; | ||
165 | |||
166 | /* | ||
167 | * This will print a single '0' even if r == 0, since we would | ||
168 | * immediately jump to out_r where two 0s would be written and one of | ||
169 | * them then discarded. This is needed by ip4_string below. All other | ||
170 | * callers pass a non-zero value of r. | ||
171 | */ | ||
134 | static noinline_for_stack | 172 | static noinline_for_stack |
135 | char *put_dec_full9(char *buf, unsigned q) | 173 | char *put_dec_trunc8(char *buf, unsigned r) |
136 | { | 174 | { |
137 | unsigned r; | 175 | unsigned q; |
138 | 176 | ||
139 | /* | 177 | /* 1 <= r < 10^8 */ |
140 | * Possible ways to approx. divide by 10 | 178 | if (r < 100) |
141 | * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit) | 179 | goto out_r; |
142 | * (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul) | 180 | |
143 | * (x * 0x6667) >> 18 x < 43699 | 181 | /* 100 <= r < 10^8 */ |
144 | * (x * 0x3334) >> 17 x < 16389 | 182 | q = (r * (u64)0x28f5c29) >> 32; |
145 | * (x * 0x199a) >> 16 x < 16389 | 183 | *((u16 *)buf) = decpair[r - 100*q]; |
146 | * (x * 0x0ccd) >> 15 x < 16389 | 184 | buf += 2; |
147 | * (x * 0x0667) >> 14 x < 2739 | 185 | |
148 | * (x * 0x0334) >> 13 x < 1029 | 186 | /* 1 <= q < 10^6 */ |
149 | * (x * 0x019a) >> 12 x < 1029 | 187 | if (q < 100) |
150 | * (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386) | 188 | goto out_q; |
151 | * (x * 0x0067) >> 10 x < 179 | 189 | |
152 | * (x * 0x0034) >> 9 x < 69 same | 190 | /* 100 <= q < 10^6 */ |
153 | * (x * 0x001a) >> 8 x < 69 same | 191 | r = (q * (u64)0x28f5c29) >> 32; |
154 | * (x * 0x000d) >> 7 x < 69 same, shortest code (on i386) | 192 | *((u16 *)buf) = decpair[q - 100*r]; |
155 | * (x * 0x0007) >> 6 x < 19 | 193 | buf += 2; |
156 | * See <http://www.cs.uiowa.edu/~jones/bcd/divide.html> | 194 | |
157 | */ | 195 | /* 1 <= r < 10^4 */ |
158 | r = (q * (uint64_t)0x1999999a) >> 32; | 196 | if (r < 100) |
159 | *buf++ = (q - 10 * r) + '0'; /* 1 */ | 197 | goto out_r; |
160 | q = (r * (uint64_t)0x1999999a) >> 32; | 198 | |
161 | *buf++ = (r - 10 * q) + '0'; /* 2 */ | 199 | /* 100 <= r < 10^4 */ |
162 | r = (q * (uint64_t)0x1999999a) >> 32; | 200 | q = (r * 0x147b) >> 19; |
163 | *buf++ = (q - 10 * r) + '0'; /* 3 */ | 201 | *((u16 *)buf) = decpair[r - 100*q]; |
164 | q = (r * (uint64_t)0x1999999a) >> 32; | 202 | buf += 2; |
165 | *buf++ = (r - 10 * q) + '0'; /* 4 */ | 203 | out_q: |
166 | r = (q * (uint64_t)0x1999999a) >> 32; | 204 | /* 1 <= q < 100 */ |
167 | *buf++ = (q - 10 * r) + '0'; /* 5 */ | 205 | r = q; |
168 | /* Now value is under 10000, can avoid 64-bit multiply */ | 206 | out_r: |
169 | q = (r * 0x199a) >> 16; | 207 | /* 1 <= r < 100 */ |
170 | *buf++ = (r - 10 * q) + '0'; /* 6 */ | 208 | *((u16 *)buf) = decpair[r]; |
171 | r = (q * 0xcd) >> 11; | 209 | buf += 2; |
172 | *buf++ = (q - 10 * r) + '0'; /* 7 */ | 210 | if (buf[-1] == '0') |
173 | q = (r * 0xcd) >> 11; | 211 | buf--; |
174 | *buf++ = (r - 10 * q) + '0'; /* 8 */ | ||
175 | *buf++ = q + '0'; /* 9 */ | ||
176 | return buf; | 212 | return buf; |
177 | } | 213 | } |
178 | #endif | ||
179 | 214 | ||
180 | /* Similar to above but do not pad with zeros. | 215 | #if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64 |
181 | * Code can be easily arranged to print 9 digits too, but our callers | ||
182 | * always call put_dec_full9() instead when the number has 9 decimal digits. | ||
183 | */ | ||
184 | static noinline_for_stack | 216 | static noinline_for_stack |
185 | char *put_dec_trunc8(char *buf, unsigned r) | 217 | char *put_dec_full8(char *buf, unsigned r) |
186 | { | 218 | { |
187 | unsigned q; | 219 | unsigned q; |
188 | 220 | ||
189 | /* Copy of previous function's body with added early returns */ | 221 | /* 0 <= r < 10^8 */ |
190 | while (r >= 10000) { | 222 | q = (r * (u64)0x28f5c29) >> 32; |
191 | q = r + '0'; | 223 | *((u16 *)buf) = decpair[r - 100*q]; |
192 | r = (r * (uint64_t)0x1999999a) >> 32; | 224 | buf += 2; |
193 | *buf++ = q - 10*r; | ||
194 | } | ||
195 | 225 | ||
196 | q = (r * 0x199a) >> 16; /* r <= 9999 */ | 226 | /* 0 <= q < 10^6 */ |
197 | *buf++ = (r - 10 * q) + '0'; | 227 | r = (q * (u64)0x28f5c29) >> 32; |
198 | if (q == 0) | 228 | *((u16 *)buf) = decpair[q - 100*r]; |
199 | return buf; | 229 | buf += 2; |
200 | r = (q * 0xcd) >> 11; /* q <= 999 */ | ||
201 | *buf++ = (q - 10 * r) + '0'; | ||
202 | if (r == 0) | ||
203 | return buf; | ||
204 | q = (r * 0xcd) >> 11; /* r <= 99 */ | ||
205 | *buf++ = (r - 10 * q) + '0'; | ||
206 | if (q == 0) | ||
207 | return buf; | ||
208 | *buf++ = q + '0'; /* q <= 9 */ | ||
209 | return buf; | ||
210 | } | ||
211 | 230 | ||
212 | /* There are two algorithms to print larger numbers. | 231 | /* 0 <= r < 10^4 */ |
213 | * One is generic: divide by 1000000000 and repeatedly print | 232 | q = (r * 0x147b) >> 19; |
214 | * groups of (up to) 9 digits. It's conceptually simple, | 233 | *((u16 *)buf) = decpair[r - 100*q]; |
215 | * but requires a (unsigned long long) / 1000000000 division. | 234 | buf += 2; |
216 | * | ||
217 | * Second algorithm splits 64-bit unsigned long long into 16-bit chunks, | ||
218 | * manipulates them cleverly and generates groups of 4 decimal digits. | ||
219 | * It so happens that it does NOT require long long division. | ||
220 | * | ||
221 | * If long is > 32 bits, division of 64-bit values is relatively easy, | ||
222 | * and we will use the first algorithm. | ||
223 | * If long long is > 64 bits (strange architecture with VERY large long long), | ||
224 | * second algorithm can't be used, and we again use the first one. | ||
225 | * | ||
226 | * Else (if long is 32 bits and long long is 64 bits) we use second one. | ||
227 | */ | ||
228 | 235 | ||
229 | #if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 | 236 | /* 0 <= q < 100 */ |
230 | 237 | *((u16 *)buf) = decpair[q]; | |
231 | /* First algorithm: generic */ | 238 | buf += 2; |
239 | return buf; | ||
240 | } | ||
232 | 241 | ||
233 | static | 242 | static noinline_for_stack |
234 | char *put_dec(char *buf, unsigned long long n) | 243 | char *put_dec(char *buf, unsigned long long n) |
235 | { | 244 | { |
236 | if (n >= 100*1000*1000) { | 245 | if (n >= 100*1000*1000) |
237 | while (n >= 1000*1000*1000) | 246 | buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); |
238 | buf = put_dec_full9(buf, do_div(n, 1000*1000*1000)); | 247 | /* 1 <= n <= 1.6e11 */ |
239 | if (n >= 100*1000*1000) | 248 | if (n >= 100*1000*1000) |
240 | return put_dec_full9(buf, n); | 249 | buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); |
241 | } | 250 | /* 1 <= n < 1e8 */ |
242 | return put_dec_trunc8(buf, n); | 251 | return put_dec_trunc8(buf, n); |
243 | } | 252 | } |
244 | 253 | ||
245 | #else | 254 | #elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64 |
246 | 255 | ||
247 | /* Second algorithm: valid only for 64-bit long longs */ | 256 | static void |
248 | 257 | put_dec_full4(char *buf, unsigned r) | |
249 | /* See comment in put_dec_full9 for choice of constants */ | ||
250 | static noinline_for_stack | ||
251 | void put_dec_full4(char *buf, unsigned q) | ||
252 | { | 258 | { |
253 | unsigned r; | 259 | unsigned q; |
254 | r = (q * 0xccd) >> 15; | 260 | |
255 | buf[0] = (q - 10 * r) + '0'; | 261 | /* 0 <= r < 10^4 */ |
256 | q = (r * 0xcd) >> 11; | 262 | q = (r * 0x147b) >> 19; |
257 | buf[1] = (r - 10 * q) + '0'; | 263 | *((u16 *)buf) = decpair[r - 100*q]; |
258 | r = (q * 0xcd) >> 11; | 264 | buf += 2; |
259 | buf[2] = (q - 10 * r) + '0'; | 265 | /* 0 <= q < 100 */ |
260 | buf[3] = r + '0'; | 266 | *((u16 *)buf) = decpair[q]; |
261 | } | 267 | } |
262 | 268 | ||
263 | /* | 269 | /* |
@@ -265,9 +271,9 @@ void put_dec_full4(char *buf, unsigned q) | |||
265 | * The approximation x/10000 == (x * 0x346DC5D7) >> 43 | 271 | * The approximation x/10000 == (x * 0x346DC5D7) >> 43 |
266 | * holds for all x < 1,128,869,999. The largest value this | 272 | * holds for all x < 1,128,869,999. The largest value this |
267 | * helper will ever be asked to convert is 1,125,520,955. | 273 | * helper will ever be asked to convert is 1,125,520,955. |
268 | * (d1 in the put_dec code, assuming n is all-ones). | 274 | * (second call in the put_dec code, assuming n is all-ones). |
269 | */ | 275 | */ |
270 | static | 276 | static noinline_for_stack |
271 | unsigned put_dec_helper4(char *buf, unsigned x) | 277 | unsigned put_dec_helper4(char *buf, unsigned x) |
272 | { | 278 | { |
273 | uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; | 279 | uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; |
@@ -294,6 +300,8 @@ char *put_dec(char *buf, unsigned long long n) | |||
294 | d2 = (h ) & 0xffff; | 300 | d2 = (h ) & 0xffff; |
295 | d3 = (h >> 16); /* implicit "& 0xffff" */ | 301 | d3 = (h >> 16); /* implicit "& 0xffff" */ |
296 | 302 | ||
303 | /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0 | ||
304 | = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */ | ||
297 | q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); | 305 | q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); |
298 | q = put_dec_helper4(buf, q); | 306 | q = put_dec_helper4(buf, q); |
299 | 307 | ||
@@ -323,7 +331,8 @@ char *put_dec(char *buf, unsigned long long n) | |||
323 | */ | 331 | */ |
324 | int num_to_str(char *buf, int size, unsigned long long num) | 332 | int num_to_str(char *buf, int size, unsigned long long num) |
325 | { | 333 | { |
326 | char tmp[sizeof(num) * 3]; | 334 | /* put_dec requires 2-byte alignment of the buffer. */ |
335 | char tmp[sizeof(num) * 3] __aligned(2); | ||
327 | int idx, len; | 336 | int idx, len; |
328 | 337 | ||
329 | /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ | 338 | /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ |
@@ -384,7 +393,8 @@ static noinline_for_stack | |||
384 | char *number(char *buf, char *end, unsigned long long num, | 393 | char *number(char *buf, char *end, unsigned long long num, |
385 | struct printf_spec spec) | 394 | struct printf_spec spec) |
386 | { | 395 | { |
387 | char tmp[3 * sizeof(num)]; | 396 | /* put_dec requires 2-byte alignment of the buffer. */ |
397 | char tmp[3 * sizeof(num)] __aligned(2); | ||
388 | char sign; | 398 | char sign; |
389 | char locase; | 399 | char locase; |
390 | int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); | 400 | int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); |
@@ -944,7 +954,7 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt) | |||
944 | break; | 954 | break; |
945 | } | 955 | } |
946 | for (i = 0; i < 4; i++) { | 956 | for (i = 0; i < 4; i++) { |
947 | char temp[3]; /* hold each IP quad in reverse order */ | 957 | char temp[4] __aligned(2); /* hold each IP quad in reverse order */ |
948 | int digits = put_dec_trunc8(temp, addr[index]) - temp; | 958 | int digits = put_dec_trunc8(temp, addr[index]) - temp; |
949 | if (leading_zeros) { | 959 | if (leading_zeros) { |
950 | if (digits < 3) | 960 | if (digits < 3) |