diff options
author | Rasmus Villemoes <linux@rasmusvillemoes.dk> | 2015-04-16 15:43:22 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-17 09:03:54 -0400 |
commit | 7c43d9a30c527d9e06e2c55f82b56f28df43caed (patch) | |
tree | 9d83a74720aae5b5ebb84066641392fc68cbebf2 /lib/vsprintf.c | |
parent | 840620a1596a90636a44d6a593db4041bb28d52e (diff) |
lib/vsprintf.c: even faster binary to decimal conversion
The most expensive part of decimal conversion is the divisions by 10
(albeit done using reciprocal multiplication with appropriately chosen
constants). I decided to see if one could eliminate around half of
these multiplications by emitting two digits at a time, at the cost of a
200 byte lookup table, and it does indeed seem like there is something
to be gained, especially on 64 bits. Microbenchmarking shows
improvements ranging from -50% (for numbers uniformly distributed in [0,
2^64-1]) to -25% (for numbers heavily biased toward the smaller end, a
more realistic distribution).
On a larger scale, perf shows that top, one of the big consumers of /proc
data, uses 0.5-1.0% fewer cpu cycles.
I had to jump through some hoops to get the 32 bit code to compile and run
on my 64 bit machine, so I'm not sure how relevant these numbers are, but
just for comparison the microbenchmark showed improvements between -30%
and -10%.
The bloat-o-meter costs are around 150 bytes (the generated code is a
little smaller, so it's not the full 200 bytes) on both 32 and 64 bit.
I'm aware that extra cache misses won't show up in a microbenchmark as
used above, but on the other hand decimal conversions often happen in bulk
(for example in the case of top).
I have of course tested that the new code generates the same output as the
old, for both the first and last 1e10 numbers in [0,2^64-1] and 4e9
'random' numbers in-between.
Test and verification code on github: https://github.com/Villemoes/dec.
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Tested-by: Jeff Epler <jepler@unpythonic.net>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'lib/vsprintf.c')
-rw-r--r-- | lib/vsprintf.c | 246 |
1 files changed, 128 insertions, 118 deletions
diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 3a1e0843f9a2..c93ec8a035b3 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c | |||
@@ -33,6 +33,7 @@ | |||
33 | 33 | ||
34 | #include <asm/page.h> /* for PAGE_SIZE */ | 34 | #include <asm/page.h> /* for PAGE_SIZE */ |
35 | #include <asm/sections.h> /* for dereference_function_descriptor() */ | 35 | #include <asm/sections.h> /* for dereference_function_descriptor() */ |
36 | #include <asm/byteorder.h> /* cpu_to_le16 */ | ||
36 | 37 | ||
37 | #include <linux/string_helpers.h> | 38 | #include <linux/string_helpers.h> |
38 | #include "kstrtox.h" | 39 | #include "kstrtox.h" |
@@ -122,142 +123,147 @@ int skip_atoi(const char **s) | |||
122 | return i; | 123 | return i; |
123 | } | 124 | } |
124 | 125 | ||
125 | /* Decimal conversion is by far the most typical, and is used | 126 | /* |
126 | * for /proc and /sys data. This directly impacts e.g. top performance | 127 | * Decimal conversion is by far the most typical, and is used for |
127 | * with many processes running. We optimize it for speed | 128 | * /proc and /sys data. This directly impacts e.g. top performance |
128 | * using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html> | 129 | * with many processes running. We optimize it for speed by emitting |
129 | * (with permission from the author, Douglas W. Jones). | 130 | * two characters at a time, using a 200 byte lookup table. This |
131 | * roughly halves the number of multiplications compared to computing | ||
132 | * the digits one at a time. Implementation strongly inspired by the | ||
133 | * previous version, which in turn used ideas described at | ||
134 | * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission | ||
135 | * from the author, Douglas W. Jones). | ||
136 | * | ||
137 | * It turns out there is precisely one 26 bit fixed-point | ||
138 | * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32 | ||
139 | * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual | ||
140 | * range happens to be somewhat larger (x <= 1073741898), but that's | ||
141 | * irrelevant for our purpose. | ||
142 | * | ||
143 | * For dividing a number in the range [10^4, 10^6-1] by 100, we still | ||
144 | * need a 32x32->64 bit multiply, so we simply use the same constant. | ||
145 | * | ||
146 | * For dividing a number in the range [100, 10^4-1] by 100, there are | ||
147 | * several options. The simplest is (x * 0x147b) >> 19, which is valid | ||
148 | * for all x <= 43698. | ||
130 | */ | 149 | */ |
131 | 150 | ||
132 | #if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 | 151 | static const u16 decpair[100] = { |
133 | /* Formats correctly any integer in [0, 999999999] */ | 152 | #define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030) |
153 | _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9), | ||
154 | _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19), | ||
155 | _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29), | ||
156 | _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39), | ||
157 | _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49), | ||
158 | _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59), | ||
159 | _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69), | ||
160 | _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79), | ||
161 | _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89), | ||
162 | _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99), | ||
163 | #undef _ | ||
164 | }; | ||
165 | |||
166 | /* | ||
167 | * This will print a single '0' even if r == 0, since we would | ||
168 | * immediately jump to out_r where two 0s would be written and one of | ||
169 | * them then discarded. This is needed by ip4_string below. All other | ||
170 | * callers pass a non-zero value of r. | ||
171 | */ | ||
134 | static noinline_for_stack | 172 | static noinline_for_stack |
135 | char *put_dec_full9(char *buf, unsigned q) | 173 | char *put_dec_trunc8(char *buf, unsigned r) |
136 | { | 174 | { |
137 | unsigned r; | 175 | unsigned q; |
138 | 176 | ||
139 | /* | 177 | /* 1 <= r < 10^8 */ |
140 | * Possible ways to approx. divide by 10 | 178 | if (r < 100) |
141 | * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit) | 179 | goto out_r; |
142 | * (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul) | 180 | |
143 | * (x * 0x6667) >> 18 x < 43699 | 181 | /* 100 <= r < 10^8 */ |
144 | * (x * 0x3334) >> 17 x < 16389 | 182 | q = (r * (u64)0x28f5c29) >> 32; |
145 | * (x * 0x199a) >> 16 x < 16389 | 183 | *((u16 *)buf) = decpair[r - 100*q]; |
146 | * (x * 0x0ccd) >> 15 x < 16389 | 184 | buf += 2; |
147 | * (x * 0x0667) >> 14 x < 2739 | 185 | |
148 | * (x * 0x0334) >> 13 x < 1029 | 186 | /* 1 <= q < 10^6 */ |
149 | * (x * 0x019a) >> 12 x < 1029 | 187 | if (q < 100) |
150 | * (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386) | 188 | goto out_q; |
151 | * (x * 0x0067) >> 10 x < 179 | 189 | |
152 | * (x * 0x0034) >> 9 x < 69 same | 190 | /* 100 <= q < 10^6 */ |
153 | * (x * 0x001a) >> 8 x < 69 same | 191 | r = (q * (u64)0x28f5c29) >> 32; |
154 | * (x * 0x000d) >> 7 x < 69 same, shortest code (on i386) | 192 | *((u16 *)buf) = decpair[q - 100*r]; |
155 | * (x * 0x0007) >> 6 x < 19 | 193 | buf += 2; |
156 | * See <http://www.cs.uiowa.edu/~jones/bcd/divide.html> | 194 | |
157 | */ | 195 | /* 1 <= r < 10^4 */ |
158 | r = (q * (uint64_t)0x1999999a) >> 32; | 196 | if (r < 100) |
159 | *buf++ = (q - 10 * r) + '0'; /* 1 */ | 197 | goto out_r; |
160 | q = (r * (uint64_t)0x1999999a) >> 32; | 198 | |
161 | *buf++ = (r - 10 * q) + '0'; /* 2 */ | 199 | /* 100 <= r < 10^4 */ |
162 | r = (q * (uint64_t)0x1999999a) >> 32; | 200 | q = (r * 0x147b) >> 19; |
163 | *buf++ = (q - 10 * r) + '0'; /* 3 */ | 201 | *((u16 *)buf) = decpair[r - 100*q]; |
164 | q = (r * (uint64_t)0x1999999a) >> 32; | 202 | buf += 2; |
165 | *buf++ = (r - 10 * q) + '0'; /* 4 */ | 203 | out_q: |
166 | r = (q * (uint64_t)0x1999999a) >> 32; | 204 | /* 1 <= q < 100 */ |
167 | *buf++ = (q - 10 * r) + '0'; /* 5 */ | 205 | r = q; |
168 | /* Now value is under 10000, can avoid 64-bit multiply */ | 206 | out_r: |
169 | q = (r * 0x199a) >> 16; | 207 | /* 1 <= r < 100 */ |
170 | *buf++ = (r - 10 * q) + '0'; /* 6 */ | 208 | *((u16 *)buf) = decpair[r]; |
171 | r = (q * 0xcd) >> 11; | 209 | buf += 2; |
172 | *buf++ = (q - 10 * r) + '0'; /* 7 */ | 210 | if (buf[-1] == '0') |
173 | q = (r * 0xcd) >> 11; | 211 | buf--; |
174 | *buf++ = (r - 10 * q) + '0'; /* 8 */ | ||
175 | *buf++ = q + '0'; /* 9 */ | ||
176 | return buf; | 212 | return buf; |
177 | } | 213 | } |
178 | #endif | ||
179 | 214 | ||
180 | /* Similar to above but do not pad with zeros. | 215 | #if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64 |
181 | * Code can be easily arranged to print 9 digits too, but our callers | ||
182 | * always call put_dec_full9() instead when the number has 9 decimal digits. | ||
183 | */ | ||
184 | static noinline_for_stack | 216 | static noinline_for_stack |
185 | char *put_dec_trunc8(char *buf, unsigned r) | 217 | char *put_dec_full8(char *buf, unsigned r) |
186 | { | 218 | { |
187 | unsigned q; | 219 | unsigned q; |
188 | 220 | ||
189 | /* Copy of previous function's body with added early returns */ | 221 | /* 0 <= r < 10^8 */ |
190 | while (r >= 10000) { | 222 | q = (r * (u64)0x28f5c29) >> 32; |
191 | q = r + '0'; | 223 | *((u16 *)buf) = decpair[r - 100*q]; |
192 | r = (r * (uint64_t)0x1999999a) >> 32; | 224 | buf += 2; |
193 | *buf++ = q - 10*r; | ||
194 | } | ||
195 | 225 | ||
196 | q = (r * 0x199a) >> 16; /* r <= 9999 */ | 226 | /* 0 <= q < 10^6 */ |
197 | *buf++ = (r - 10 * q) + '0'; | 227 | r = (q * (u64)0x28f5c29) >> 32; |
198 | if (q == 0) | 228 | *((u16 *)buf) = decpair[q - 100*r]; |
199 | return buf; | 229 | buf += 2; |
200 | r = (q * 0xcd) >> 11; /* q <= 999 */ | ||
201 | *buf++ = (q - 10 * r) + '0'; | ||
202 | if (r == 0) | ||
203 | return buf; | ||
204 | q = (r * 0xcd) >> 11; /* r <= 99 */ | ||
205 | *buf++ = (r - 10 * q) + '0'; | ||
206 | if (q == 0) | ||
207 | return buf; | ||
208 | *buf++ = q + '0'; /* q <= 9 */ | ||
209 | return buf; | ||
210 | } | ||
211 | 230 | ||
212 | /* There are two algorithms to print larger numbers. | 231 | /* 0 <= r < 10^4 */ |
213 | * One is generic: divide by 1000000000 and repeatedly print | 232 | q = (r * 0x147b) >> 19; |
214 | * groups of (up to) 9 digits. It's conceptually simple, | 233 | *((u16 *)buf) = decpair[r - 100*q]; |
215 | * but requires a (unsigned long long) / 1000000000 division. | 234 | buf += 2; |
216 | * | ||
217 | * Second algorithm splits 64-bit unsigned long long into 16-bit chunks, | ||
218 | * manipulates them cleverly and generates groups of 4 decimal digits. | ||
219 | * It so happens that it does NOT require long long division. | ||
220 | * | ||
221 | * If long is > 32 bits, division of 64-bit values is relatively easy, | ||
222 | * and we will use the first algorithm. | ||
223 | * If long long is > 64 bits (strange architecture with VERY large long long), | ||
224 | * second algorithm can't be used, and we again use the first one. | ||
225 | * | ||
226 | * Else (if long is 32 bits and long long is 64 bits) we use second one. | ||
227 | */ | ||
228 | 235 | ||
229 | #if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 | 236 | /* 0 <= q < 100 */ |
230 | 237 | *((u16 *)buf) = decpair[q]; | |
231 | /* First algorithm: generic */ | 238 | buf += 2; |
239 | return buf; | ||
240 | } | ||
232 | 241 | ||
233 | static | 242 | static noinline_for_stack |
234 | char *put_dec(char *buf, unsigned long long n) | 243 | char *put_dec(char *buf, unsigned long long n) |
235 | { | 244 | { |
236 | if (n >= 100*1000*1000) { | 245 | if (n >= 100*1000*1000) |
237 | while (n >= 1000*1000*1000) | 246 | buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); |
238 | buf = put_dec_full9(buf, do_div(n, 1000*1000*1000)); | 247 | /* 1 <= n <= 1.6e11 */ |
239 | if (n >= 100*1000*1000) | 248 | if (n >= 100*1000*1000) |
240 | return put_dec_full9(buf, n); | 249 | buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); |
241 | } | 250 | /* 1 <= n < 1e8 */ |
242 | return put_dec_trunc8(buf, n); | 251 | return put_dec_trunc8(buf, n); |
243 | } | 252 | } |
244 | 253 | ||
245 | #else | 254 | #elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64 |
246 | 255 | ||
247 | /* Second algorithm: valid only for 64-bit long longs */ | 256 | static void |
248 | 257 | put_dec_full4(char *buf, unsigned r) | |
249 | /* See comment in put_dec_full9 for choice of constants */ | ||
250 | static noinline_for_stack | ||
251 | void put_dec_full4(char *buf, unsigned q) | ||
252 | { | 258 | { |
253 | unsigned r; | 259 | unsigned q; |
254 | r = (q * 0xccd) >> 15; | 260 | |
255 | buf[0] = (q - 10 * r) + '0'; | 261 | /* 0 <= r < 10^4 */ |
256 | q = (r * 0xcd) >> 11; | 262 | q = (r * 0x147b) >> 19; |
257 | buf[1] = (r - 10 * q) + '0'; | 263 | *((u16 *)buf) = decpair[r - 100*q]; |
258 | r = (q * 0xcd) >> 11; | 264 | buf += 2; |
259 | buf[2] = (q - 10 * r) + '0'; | 265 | /* 0 <= q < 100 */ |
260 | buf[3] = r + '0'; | 266 | *((u16 *)buf) = decpair[q]; |
261 | } | 267 | } |
262 | 268 | ||
263 | /* | 269 | /* |
@@ -265,9 +271,9 @@ void put_dec_full4(char *buf, unsigned q) | |||
265 | * The approximation x/10000 == (x * 0x346DC5D7) >> 43 | 271 | * The approximation x/10000 == (x * 0x346DC5D7) >> 43 |
266 | * holds for all x < 1,128,869,999. The largest value this | 272 | * holds for all x < 1,128,869,999. The largest value this |
267 | * helper will ever be asked to convert is 1,125,520,955. | 273 | * helper will ever be asked to convert is 1,125,520,955. |
268 | * (d1 in the put_dec code, assuming n is all-ones). | 274 | * (second call in the put_dec code, assuming n is all-ones). |
269 | */ | 275 | */ |
270 | static | 276 | static noinline_for_stack |
271 | unsigned put_dec_helper4(char *buf, unsigned x) | 277 | unsigned put_dec_helper4(char *buf, unsigned x) |
272 | { | 278 | { |
273 | uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; | 279 | uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; |
@@ -294,6 +300,8 @@ char *put_dec(char *buf, unsigned long long n) | |||
294 | d2 = (h ) & 0xffff; | 300 | d2 = (h ) & 0xffff; |
295 | d3 = (h >> 16); /* implicit "& 0xffff" */ | 301 | d3 = (h >> 16); /* implicit "& 0xffff" */ |
296 | 302 | ||
303 | /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0 | ||
304 | = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */ | ||
297 | q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); | 305 | q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); |
298 | q = put_dec_helper4(buf, q); | 306 | q = put_dec_helper4(buf, q); |
299 | 307 | ||
@@ -323,7 +331,8 @@ char *put_dec(char *buf, unsigned long long n) | |||
323 | */ | 331 | */ |
324 | int num_to_str(char *buf, int size, unsigned long long num) | 332 | int num_to_str(char *buf, int size, unsigned long long num) |
325 | { | 333 | { |
326 | char tmp[sizeof(num) * 3]; | 334 | /* put_dec requires 2-byte alignment of the buffer. */ |
335 | char tmp[sizeof(num) * 3] __aligned(2); | ||
327 | int idx, len; | 336 | int idx, len; |
328 | 337 | ||
329 | /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ | 338 | /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ |
@@ -384,7 +393,8 @@ static noinline_for_stack | |||
384 | char *number(char *buf, char *end, unsigned long long num, | 393 | char *number(char *buf, char *end, unsigned long long num, |
385 | struct printf_spec spec) | 394 | struct printf_spec spec) |
386 | { | 395 | { |
387 | char tmp[3 * sizeof(num)]; | 396 | /* put_dec requires 2-byte alignment of the buffer. */ |
397 | char tmp[3 * sizeof(num)] __aligned(2); | ||
388 | char sign; | 398 | char sign; |
389 | char locase; | 399 | char locase; |
390 | int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); | 400 | int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); |
@@ -944,7 +954,7 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt) | |||
944 | break; | 954 | break; |
945 | } | 955 | } |
946 | for (i = 0; i < 4; i++) { | 956 | for (i = 0; i < 4; i++) { |
947 | char temp[3]; /* hold each IP quad in reverse order */ | 957 | char temp[4] __aligned(2); /* hold each IP quad in reverse order */ |
948 | int digits = put_dec_trunc8(temp, addr[index]) - temp; | 958 | int digits = put_dec_trunc8(temp, addr[index]) - temp; |
949 | if (leading_zeros) { | 959 | if (leading_zeros) { |
950 | if (digits < 3) | 960 | if (digits < 3) |