aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRasmus Villemoes <linux@rasmusvillemoes.dk>2015-04-16 15:43:22 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2015-04-17 09:03:54 -0400
commit7c43d9a30c527d9e06e2c55f82b56f28df43caed (patch)
tree9d83a74720aae5b5ebb84066641392fc68cbebf2
parent840620a1596a90636a44d6a593db4041bb28d52e (diff)
lib/vsprintf.c: even faster binary to decimal conversion
The most expensive part of decimal conversion is the divisions by 10 (albeit done using reciprocal multiplication with appropriately chosen constants). I decided to see if one could eliminate around half of these multiplications by emitting two digits at a time, at the cost of a 200 byte lookup table, and it does indeed seem like there is something to be gained, especially on 64 bits. Microbenchmarking shows improvements ranging from -50% (for numbers uniformly distributed in [0, 2^64-1]) to -25% (for numbers heavily biased toward the smaller end, a more realistic distribution). On a larger scale, perf shows that top, one of the big consumers of /proc data, uses 0.5-1.0% fewer cpu cycles. I had to jump through some hoops to get the 32 bit code to compile and run on my 64 bit machine, so I'm not sure how relevant these numbers are, but just for comparison the microbenchmark showed improvements between -30% and -10%. The bloat-o-meter costs are around 150 bytes (the generated code is a little smaller, so it's not the full 200 bytes) on both 32 and 64 bit. I'm aware that extra cache misses won't show up in a microbenchmark as used above, but on the other hand decimal conversions often happen in bulk (for example in the case of top). I have of course tested that the new code generates the same output as the old, for both the first and last 1e10 numbers in [0,2^64-1] and 4e9 'random' numbers in-between. Test and verification code on github: https://github.com/Villemoes/dec. Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk> Tested-by: Jeff Epler <jepler@unpythonic.net> Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org> Cc: Tejun Heo <tj@kernel.org> Cc: Joe Perches <joe@perches.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--lib/vsprintf.c246
1 files changed, 128 insertions, 118 deletions
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3a1e0843f9a2..c93ec8a035b3 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -33,6 +33,7 @@
33 33
34#include <asm/page.h> /* for PAGE_SIZE */ 34#include <asm/page.h> /* for PAGE_SIZE */
35#include <asm/sections.h> /* for dereference_function_descriptor() */ 35#include <asm/sections.h> /* for dereference_function_descriptor() */
36#include <asm/byteorder.h> /* cpu_to_le16 */
36 37
37#include <linux/string_helpers.h> 38#include <linux/string_helpers.h>
38#include "kstrtox.h" 39#include "kstrtox.h"
@@ -122,142 +123,147 @@ int skip_atoi(const char **s)
122 return i; 123 return i;
123} 124}
124 125
125/* Decimal conversion is by far the most typical, and is used 126/*
126 * for /proc and /sys data. This directly impacts e.g. top performance 127 * Decimal conversion is by far the most typical, and is used for
127 * with many processes running. We optimize it for speed 128 * /proc and /sys data. This directly impacts e.g. top performance
128 * using ideas described at <http://www.cs.uiowa.edu/~jones/bcd/divide.html> 129 * with many processes running. We optimize it for speed by emitting
129 * (with permission from the author, Douglas W. Jones). 130 * two characters at a time, using a 200 byte lookup table. This
131 * roughly halves the number of multiplications compared to computing
132 * the digits one at a time. Implementation strongly inspired by the
133 * previous version, which in turn used ideas described at
134 * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
135 * from the author, Douglas W. Jones).
136 *
137 * It turns out there is precisely one 26 bit fixed-point
138 * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
139 * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
140 * range happens to be somewhat larger (x <= 1073741898), but that's
141 * irrelevant for our purpose.
142 *
143 * For dividing a number in the range [10^4, 10^6-1] by 100, we still
144 * need a 32x32->64 bit multiply, so we simply use the same constant.
145 *
146 * For dividing a number in the range [100, 10^4-1] by 100, there are
147 * several options. The simplest is (x * 0x147b) >> 19, which is valid
148 * for all x <= 43698.
130 */ 149 */
131 150
132#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 151static const u16 decpair[100] = {
133/* Formats correctly any integer in [0, 999999999] */ 152#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
153 _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
154 _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
155 _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
156 _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
157 _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
158 _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
159 _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
160 _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
161 _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
162 _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
163#undef _
164};
165
166/*
167 * This will print a single '0' even if r == 0, since we would
168 * immediately jump to out_r where two 0s would be written and one of
169 * them then discarded. This is needed by ip4_string below. All other
170 * callers pass a non-zero value of r.
171*/
134static noinline_for_stack 172static noinline_for_stack
135char *put_dec_full9(char *buf, unsigned q) 173char *put_dec_trunc8(char *buf, unsigned r)
136{ 174{
137 unsigned r; 175 unsigned q;
138 176
139 /* 177 /* 1 <= r < 10^8 */
140 * Possible ways to approx. divide by 10 178 if (r < 100)
141 * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit) 179 goto out_r;
142 * (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul) 180
143 * (x * 0x6667) >> 18 x < 43699 181 /* 100 <= r < 10^8 */
144 * (x * 0x3334) >> 17 x < 16389 182 q = (r * (u64)0x28f5c29) >> 32;
145 * (x * 0x199a) >> 16 x < 16389 183 *((u16 *)buf) = decpair[r - 100*q];
146 * (x * 0x0ccd) >> 15 x < 16389 184 buf += 2;
147 * (x * 0x0667) >> 14 x < 2739 185
148 * (x * 0x0334) >> 13 x < 1029 186 /* 1 <= q < 10^6 */
149 * (x * 0x019a) >> 12 x < 1029 187 if (q < 100)
150 * (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386) 188 goto out_q;
151 * (x * 0x0067) >> 10 x < 179 189
152 * (x * 0x0034) >> 9 x < 69 same 190 /* 100 <= q < 10^6 */
153 * (x * 0x001a) >> 8 x < 69 same 191 r = (q * (u64)0x28f5c29) >> 32;
154 * (x * 0x000d) >> 7 x < 69 same, shortest code (on i386) 192 *((u16 *)buf) = decpair[q - 100*r];
155 * (x * 0x0007) >> 6 x < 19 193 buf += 2;
156 * See <http://www.cs.uiowa.edu/~jones/bcd/divide.html> 194
157 */ 195 /* 1 <= r < 10^4 */
158 r = (q * (uint64_t)0x1999999a) >> 32; 196 if (r < 100)
159 *buf++ = (q - 10 * r) + '0'; /* 1 */ 197 goto out_r;
160 q = (r * (uint64_t)0x1999999a) >> 32; 198
161 *buf++ = (r - 10 * q) + '0'; /* 2 */ 199 /* 100 <= r < 10^4 */
162 r = (q * (uint64_t)0x1999999a) >> 32; 200 q = (r * 0x147b) >> 19;
163 *buf++ = (q - 10 * r) + '0'; /* 3 */ 201 *((u16 *)buf) = decpair[r - 100*q];
164 q = (r * (uint64_t)0x1999999a) >> 32; 202 buf += 2;
165 *buf++ = (r - 10 * q) + '0'; /* 4 */ 203out_q:
166 r = (q * (uint64_t)0x1999999a) >> 32; 204 /* 1 <= q < 100 */
167 *buf++ = (q - 10 * r) + '0'; /* 5 */ 205 r = q;
168 /* Now value is under 10000, can avoid 64-bit multiply */ 206out_r:
169 q = (r * 0x199a) >> 16; 207 /* 1 <= r < 100 */
170 *buf++ = (r - 10 * q) + '0'; /* 6 */ 208 *((u16 *)buf) = decpair[r];
171 r = (q * 0xcd) >> 11; 209 buf += 2;
172 *buf++ = (q - 10 * r) + '0'; /* 7 */ 210 if (buf[-1] == '0')
173 q = (r * 0xcd) >> 11; 211 buf--;
174 *buf++ = (r - 10 * q) + '0'; /* 8 */
175 *buf++ = q + '0'; /* 9 */
176 return buf; 212 return buf;
177} 213}
178#endif
179 214
180/* Similar to above but do not pad with zeros. 215#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
181 * Code can be easily arranged to print 9 digits too, but our callers
182 * always call put_dec_full9() instead when the number has 9 decimal digits.
183 */
184static noinline_for_stack 216static noinline_for_stack
185char *put_dec_trunc8(char *buf, unsigned r) 217char *put_dec_full8(char *buf, unsigned r)
186{ 218{
187 unsigned q; 219 unsigned q;
188 220
189 /* Copy of previous function's body with added early returns */ 221 /* 0 <= r < 10^8 */
190 while (r >= 10000) { 222 q = (r * (u64)0x28f5c29) >> 32;
191 q = r + '0'; 223 *((u16 *)buf) = decpair[r - 100*q];
192 r = (r * (uint64_t)0x1999999a) >> 32; 224 buf += 2;
193 *buf++ = q - 10*r;
194 }
195 225
196 q = (r * 0x199a) >> 16; /* r <= 9999 */ 226 /* 0 <= q < 10^6 */
197 *buf++ = (r - 10 * q) + '0'; 227 r = (q * (u64)0x28f5c29) >> 32;
198 if (q == 0) 228 *((u16 *)buf) = decpair[q - 100*r];
199 return buf; 229 buf += 2;
200 r = (q * 0xcd) >> 11; /* q <= 999 */
201 *buf++ = (q - 10 * r) + '0';
202 if (r == 0)
203 return buf;
204 q = (r * 0xcd) >> 11; /* r <= 99 */
205 *buf++ = (r - 10 * q) + '0';
206 if (q == 0)
207 return buf;
208 *buf++ = q + '0'; /* q <= 9 */
209 return buf;
210}
211 230
212/* There are two algorithms to print larger numbers. 231 /* 0 <= r < 10^4 */
213 * One is generic: divide by 1000000000 and repeatedly print 232 q = (r * 0x147b) >> 19;
214 * groups of (up to) 9 digits. It's conceptually simple, 233 *((u16 *)buf) = decpair[r - 100*q];
215 * but requires a (unsigned long long) / 1000000000 division. 234 buf += 2;
216 *
217 * Second algorithm splits 64-bit unsigned long long into 16-bit chunks,
218 * manipulates them cleverly and generates groups of 4 decimal digits.
219 * It so happens that it does NOT require long long division.
220 *
221 * If long is > 32 bits, division of 64-bit values is relatively easy,
222 * and we will use the first algorithm.
223 * If long long is > 64 bits (strange architecture with VERY large long long),
224 * second algorithm can't be used, and we again use the first one.
225 *
226 * Else (if long is 32 bits and long long is 64 bits) we use second one.
227 */
228 235
229#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 236 /* 0 <= q < 100 */
230 237 *((u16 *)buf) = decpair[q];
231/* First algorithm: generic */ 238 buf += 2;
239 return buf;
240}
232 241
233static 242static noinline_for_stack
234char *put_dec(char *buf, unsigned long long n) 243char *put_dec(char *buf, unsigned long long n)
235{ 244{
236 if (n >= 100*1000*1000) { 245 if (n >= 100*1000*1000)
237 while (n >= 1000*1000*1000) 246 buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
238 buf = put_dec_full9(buf, do_div(n, 1000*1000*1000)); 247 /* 1 <= n <= 1.6e11 */
239 if (n >= 100*1000*1000) 248 if (n >= 100*1000*1000)
240 return put_dec_full9(buf, n); 249 buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
241 } 250 /* 1 <= n < 1e8 */
242 return put_dec_trunc8(buf, n); 251 return put_dec_trunc8(buf, n);
243} 252}
244 253
245#else 254#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64
246 255
247/* Second algorithm: valid only for 64-bit long longs */ 256static void
248 257put_dec_full4(char *buf, unsigned r)
249/* See comment in put_dec_full9 for choice of constants */
250static noinline_for_stack
251void put_dec_full4(char *buf, unsigned q)
252{ 258{
253 unsigned r; 259 unsigned q;
254 r = (q * 0xccd) >> 15; 260
255 buf[0] = (q - 10 * r) + '0'; 261 /* 0 <= r < 10^4 */
256 q = (r * 0xcd) >> 11; 262 q = (r * 0x147b) >> 19;
257 buf[1] = (r - 10 * q) + '0'; 263 *((u16 *)buf) = decpair[r - 100*q];
258 r = (q * 0xcd) >> 11; 264 buf += 2;
259 buf[2] = (q - 10 * r) + '0'; 265 /* 0 <= q < 100 */
260 buf[3] = r + '0'; 266 *((u16 *)buf) = decpair[q];
261} 267}
262 268
263/* 269/*
@@ -265,9 +271,9 @@ void put_dec_full4(char *buf, unsigned q)
265 * The approximation x/10000 == (x * 0x346DC5D7) >> 43 271 * The approximation x/10000 == (x * 0x346DC5D7) >> 43
266 * holds for all x < 1,128,869,999. The largest value this 272 * holds for all x < 1,128,869,999. The largest value this
267 * helper will ever be asked to convert is 1,125,520,955. 273 * helper will ever be asked to convert is 1,125,520,955.
268 * (d1 in the put_dec code, assuming n is all-ones). 274 * (second call in the put_dec code, assuming n is all-ones).
269 */ 275 */
270static 276static noinline_for_stack
271unsigned put_dec_helper4(char *buf, unsigned x) 277unsigned put_dec_helper4(char *buf, unsigned x)
272{ 278{
273 uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; 279 uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;
@@ -294,6 +300,8 @@ char *put_dec(char *buf, unsigned long long n)
294 d2 = (h ) & 0xffff; 300 d2 = (h ) & 0xffff;
295 d3 = (h >> 16); /* implicit "& 0xffff" */ 301 d3 = (h >> 16); /* implicit "& 0xffff" */
296 302
303 /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
304 = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
297 q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); 305 q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
298 q = put_dec_helper4(buf, q); 306 q = put_dec_helper4(buf, q);
299 307
@@ -323,7 +331,8 @@ char *put_dec(char *buf, unsigned long long n)
323 */ 331 */
324int num_to_str(char *buf, int size, unsigned long long num) 332int num_to_str(char *buf, int size, unsigned long long num)
325{ 333{
326 char tmp[sizeof(num) * 3]; 334 /* put_dec requires 2-byte alignment of the buffer. */
335 char tmp[sizeof(num) * 3] __aligned(2);
327 int idx, len; 336 int idx, len;
328 337
329 /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ 338 /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
@@ -384,7 +393,8 @@ static noinline_for_stack
384char *number(char *buf, char *end, unsigned long long num, 393char *number(char *buf, char *end, unsigned long long num,
385 struct printf_spec spec) 394 struct printf_spec spec)
386{ 395{
387 char tmp[3 * sizeof(num)]; 396 /* put_dec requires 2-byte alignment of the buffer. */
397 char tmp[3 * sizeof(num)] __aligned(2);
388 char sign; 398 char sign;
389 char locase; 399 char locase;
390 int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); 400 int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
@@ -944,7 +954,7 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt)
944 break; 954 break;
945 } 955 }
946 for (i = 0; i < 4; i++) { 956 for (i = 0; i < 4; i++) {
947 char temp[3]; /* hold each IP quad in reverse order */ 957 char temp[4] __aligned(2); /* hold each IP quad in reverse order */
948 int digits = put_dec_trunc8(temp, addr[index]) - temp; 958 int digits = put_dec_trunc8(temp, addr[index]) - temp;
949 if (leading_zeros) { 959 if (leading_zeros) {
950 if (digits < 3) 960 if (digits < 3)