aboutsummaryrefslogtreecommitdiffstats
path: root/tools/lib/find_bit.c
diff options
context:
space:
mode:
author: Clement Courbet <courbet@google.com> 2018-02-06 18:38:34 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2018-02-06 21:32:44 -0500
commit: 0ade34c37012ea5c516d9aa4d19a56e9f40a55ed (patch)
tree: e231c127d4a43927f49cf1a9323f9a52c81682ea /tools/lib/find_bit.c
parent: 15ff67bf85c6c02ab7d850deea0199516e8f16a0 (diff)
lib: optimize cpumask_next_and()
We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
It's essentially a joined iteration in search for a non-zero bit, which is
currently implemented as a lookup join (find a nonzero bit on the lhs,
lookup the rhs to see if it's set there).

Implement a direct join (find a nonzero bit on the incrementally built
join). Also add generic bitmap benchmarks in the new `test_find_bit`
module for new function (see `find_next_and_bit` in [2] and [3] below).

For cpumask_next_and, direct benchmarking shows that it's 1.17x to 14x
faster with a geometric mean of 2.1 on 32 CPUs [1]. No impact on memory
usage. Note that on Arm, the new pure-C implementation still outperforms
the old one that uses a mix of C and asm (`find_next_bit`) [3].

[1] Approximate benchmark code:

```
  unsigned long src1p[nr_cpumask_longs] = {pattern1};
  unsigned long src2p[nr_cpumask_longs] = {pattern2};
  for (/*a bunch of repetitions*/) {
    for (int n = -1; n <= nr_cpu_ids; ++n) {
      asm volatile("" : "+rm"(src1p)); // prevent any optimization
      asm volatile("" : "+rm"(src2p));
      unsigned long result = cpumask_next_and(n, src1p, src2p);
      asm volatile("" : "+rm"(result));
    }
  }
```

Results:

  pattern1    pattern2    time_before/time_after
  0x0000ffff  0x0000ffff  1.65
  0x0000ffff  0x00005555  2.24
  0x0000ffff  0x00001111  2.94
  0x0000ffff  0x00000000  14.0
  0x00005555  0x0000ffff  1.67
  0x00005555  0x00005555  1.71
  0x00005555  0x00001111  1.90
  0x00005555  0x00000000  6.58
  0x00001111  0x0000ffff  1.46
  0x00001111  0x00005555  1.49
  0x00001111  0x00001111  1.45
  0x00001111  0x00000000  3.10
  0x00000000  0x0000ffff  1.18
  0x00000000  0x00005555  1.18
  0x00000000  0x00001111  1.17
  0x00000000  0x00000000  1.25
  -----------------------------
  geo.mean                2.06

[2] test_find_next_bit, X86 (skylake)

 [ 3913.477422] Start testing find_bit() with random-filled bitmap
 [ 3913.477847] find_next_bit: 160868 cycles, 16484 iterations
 [ 3913.477933] find_next_zero_bit: 169542 cycles, 16285 iterations
 [ 3913.478036] find_last_bit: 201638 cycles, 16483 iterations
 [ 3913.480214] find_first_bit: 4353244 cycles, 16484 iterations
 [ 3913.480216] Start testing find_next_and_bit() with random-filled bitmap
 [ 3913.481074] find_next_and_bit: 89604 cycles, 8216 iterations
 [ 3913.481075] Start testing find_bit() with sparse bitmap
 [ 3913.481078] find_next_bit: 2536 cycles, 66 iterations
 [ 3913.481252] find_next_zero_bit: 344404 cycles, 32703 iterations
 [ 3913.481255] find_last_bit: 2006 cycles, 66 iterations
 [ 3913.481265] find_first_bit: 17488 cycles, 66 iterations
 [ 3913.481266] Start testing find_next_and_bit() with sparse bitmap
 [ 3913.481272] find_next_and_bit: 764 cycles, 1 iterations

[3] test_find_next_bit, arm (v7 odroid XU3).

 [ 267.206928] Start testing find_bit() with random-filled bitmap
 [ 267.214752] find_next_bit: 4474 cycles, 16419 iterations
 [ 267.221850] find_next_zero_bit: 5976 cycles, 16350 iterations
 [ 267.229294] find_last_bit: 4209 cycles, 16419 iterations
 [ 267.279131] find_first_bit: 1032991 cycles, 16420 iterations
 [ 267.286265] Start testing find_next_and_bit() with random-filled bitmap
 [ 267.302386] find_next_and_bit: 2290 cycles, 8140 iterations
 [ 267.309422] Start testing find_bit() with sparse bitmap
 [ 267.316054] find_next_bit: 191 cycles, 66 iterations
 [ 267.322726] find_next_zero_bit: 8758 cycles, 32703 iterations
 [ 267.329803] find_last_bit: 84 cycles, 66 iterations
 [ 267.336169] find_first_bit: 4118 cycles, 66 iterations
 [ 267.342627] Start testing find_next_and_bit() with sparse bitmap
 [ 267.356919] find_next_and_bit: 91 cycles, 1 iterations

[courbet@google.com: v6]
  Link: http://lkml.kernel.org/r/20171129095715.23430-1-courbet@google.com
[geert@linux-m68k.org: m68k/bitops: always include <asm-generic/bitops/find.h>]
  Link: http://lkml.kernel.org/r/1512556816-28627-1-git-send-email-geert@linux-m68k.org
Link: http://lkml.kernel.org/r/20171128131334.23491-1-courbet@google.com
Signed-off-by: Clement Courbet <courbet@google.com>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Yury Norov <ynorov@caviumnetworks.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'tools/lib/find_bit.c')
-rw-r--r-- tools/lib/find_bit.c 39
1 files changed, 29 insertions, 10 deletions
diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c
index 42c15f906aac..a88bd507091e 100644
--- a/tools/lib/find_bit.c
+++ b/tools/lib/find_bit.c
@@ -22,22 +22,29 @@
 #include <linux/bitmap.h>
 #include <linux/kernel.h>
 
-#if !defined(find_next_bit)
+#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \
+	!defined(find_next_and_bit)
 
 /*
- * This is a common helper function for find_next_bit and
- * find_next_zero_bit. The difference is the "invert" argument, which
- * is XORed with each fetched word before searching it for one bits.
+ * This is a common helper function for find_next_bit, find_next_zero_bit, and
+ * find_next_and_bit. The differences are:
+ *  - The "invert" argument, which is XORed with each fetched word before
+ *    searching it for one bits.
+ *  - The optional "addr2", which is anded with "addr1" if present.
  */
-static unsigned long _find_next_bit(const unsigned long *addr,
-		unsigned long nbits, unsigned long start, unsigned long invert)
+static inline unsigned long _find_next_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long nbits,
+		unsigned long start, unsigned long invert)
 {
 	unsigned long tmp;
 
 	if (unlikely(start >= nbits))
 		return nbits;
 
-	tmp = addr[start / BITS_PER_LONG] ^ invert;
+	tmp = addr1[start / BITS_PER_LONG];
+	if (addr2)
+		tmp &= addr2[start / BITS_PER_LONG];
+	tmp ^= invert;
 
 	/* Handle 1st word. */
 	tmp &= BITMAP_FIRST_WORD_MASK(start);
@@ -48,7 +55,10 @@ static unsigned long _find_next_bit(const unsigned long *addr,
 		if (start >= nbits)
 			return nbits;
 
-		tmp = addr[start / BITS_PER_LONG] ^ invert;
+		tmp = addr1[start / BITS_PER_LONG];
+		if (addr2)
+			tmp &= addr2[start / BITS_PER_LONG];
+		tmp ^= invert;
 	}
 
 	return min(start + __ffs(tmp), nbits);
@@ -62,7 +72,7 @@ static unsigned long _find_next_bit(const unsigned long *addr,
 unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
 			    unsigned long offset)
 {
-	return _find_next_bit(addr, size, offset, 0UL);
+	return _find_next_bit(addr, NULL, size, offset, 0UL);
 }
 #endif
 
@@ -104,6 +114,15 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
 unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
 				 unsigned long offset)
 {
-	return _find_next_bit(addr, size, offset, ~0UL);
+	return _find_next_bit(addr, NULL, size, offset, ~0UL);
+}
+#endif
+
+#ifndef find_next_and_bit
+unsigned long find_next_and_bit(const unsigned long *addr1,
+		const unsigned long *addr2, unsigned long size,
+		unsigned long offset)
+{
+	return _find_next_bit(addr1, addr2, size, offset, 0UL);
 }
 #endif