aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
author: Clement Courbet <courbet@google.com> 2018-02-06 18:38:34 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2018-02-06 21:32:44 -0500
commit: 0ade34c37012ea5c516d9aa4d19a56e9f40a55ed (patch)
tree: e231c127d4a43927f49cf1a9323f9a52c81682ea /lib
parent: 15ff67bf85c6c02ab7d850deea0199516e8f16a0 (diff)
lib: optimize cpumask_next_and()
We've measured that we spend ~0.6% of sys cpu time in cpumask_next_and().
It's essentially a joined iteration in search for a non-zero bit, which is
currently implemented as a lookup join (find a nonzero bit on the lhs,
lookup the rhs to see if it's set there).

Implement a direct join (find a nonzero bit on the incrementally built
join). Also add generic bitmap benchmarks in the new `test_find_bit`
module for new function (see `find_next_and_bit` in [2] and [3] below).

For cpumask_next_and, direct benchmarking shows that it's 1.17x to 14x
faster with a geometric mean of 2.1 on 32 CPUs [1]. No impact on memory
usage.

Note that on Arm, the new pure-C implementation still outperforms the old
one that uses a mix of C and asm (`find_next_bit`) [3].

[1] Approximate benchmark code:

```
unsigned long src1p[nr_cpumask_longs] = {pattern1};
unsigned long src2p[nr_cpumask_longs] = {pattern2};
for (/*a bunch of repetitions*/) {
  for (int n = -1; n <= nr_cpu_ids; ++n) {
    asm volatile("" : "+rm"(src1p)); // prevent any optimization
    asm volatile("" : "+rm"(src2p));
    unsigned long result = cpumask_next_and(n, src1p, src2p);
    asm volatile("" : "+rm"(result));
  }
}
```

Results:
pattern1    pattern2    time_before/time_after
0x0000ffff  0x0000ffff  1.65
0x0000ffff  0x00005555  2.24
0x0000ffff  0x00001111  2.94
0x0000ffff  0x00000000  14.0
0x00005555  0x0000ffff  1.67
0x00005555  0x00005555  1.71
0x00005555  0x00001111  1.90
0x00005555  0x00000000  6.58
0x00001111  0x0000ffff  1.46
0x00001111  0x00005555  1.49
0x00001111  0x00001111  1.45
0x00001111  0x00000000  3.10
0x00000000  0x0000ffff  1.18
0x00000000  0x00005555  1.18
0x00000000  0x00001111  1.17
0x00000000  0x00000000  1.25
-----------------------------
geo.mean                2.06

[2] test_find_next_bit, X86 (skylake)

[ 3913.477422] Start testing find_bit() with random-filled bitmap
[ 3913.477847] find_next_bit: 160868 cycles, 16484 iterations
[ 3913.477933] find_next_zero_bit: 169542 cycles, 16285 iterations
[ 3913.478036] find_last_bit: 201638 cycles, 16483 iterations
[ 3913.480214] find_first_bit: 4353244 cycles, 16484 iterations
[ 3913.480216] Start testing find_next_and_bit() with random-filled bitmap
[ 3913.481074] find_next_and_bit: 89604 cycles, 8216 iterations
[ 3913.481075] Start testing find_bit() with sparse bitmap
[ 3913.481078] find_next_bit: 2536 cycles, 66 iterations
[ 3913.481252] find_next_zero_bit: 344404 cycles, 32703 iterations
[ 3913.481255] find_last_bit: 2006 cycles, 66 iterations
[ 3913.481265] find_first_bit: 17488 cycles, 66 iterations
[ 3913.481266] Start testing find_next_and_bit() with sparse bitmap
[ 3913.481272] find_next_and_bit: 764 cycles, 1 iterations

[3] test_find_next_bit, arm (v7 odroid XU3).

[ 267.206928] Start testing find_bit() with random-filled bitmap
[ 267.214752] find_next_bit: 4474 cycles, 16419 iterations
[ 267.221850] find_next_zero_bit: 5976 cycles, 16350 iterations
[ 267.229294] find_last_bit: 4209 cycles, 16419 iterations
[ 267.279131] find_first_bit: 1032991 cycles, 16420 iterations
[ 267.286265] Start testing find_next_and_bit() with random-filled bitmap
[ 267.302386] find_next_and_bit: 2290 cycles, 8140 iterations
[ 267.309422] Start testing find_bit() with sparse bitmap
[ 267.316054] find_next_bit: 191 cycles, 66 iterations
[ 267.322726] find_next_zero_bit: 8758 cycles, 32703 iterations
[ 267.329803] find_last_bit: 84 cycles, 66 iterations
[ 267.336169] find_first_bit: 4118 cycles, 66 iterations
[ 267.342627] Start testing find_next_and_bit() with sparse bitmap
[ 267.356919] find_next_and_bit: 91 cycles, 1 iterations

[courbet@google.com: v6]
Link: http://lkml.kernel.org/r/20171129095715.23430-1-courbet@google.com
[geert@linux-m68k.org: m68k/bitops: always include <asm-generic/bitops/find.h>]
Link: http://lkml.kernel.org/r/1512556816-28627-1-git-send-email-geert@linux-m68k.org
Link: http://lkml.kernel.org/r/20171128131334.23491-1-courbet@google.com
Signed-off-by: Clement Courbet <courbet@google.com>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Yury Norov <ynorov@caviumnetworks.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'lib')
-rw-r--r-- lib/cpumask.c | 9
-rw-r--r-- lib/find_bit.c | 59
-rw-r--r-- lib/find_bit_benchmark.c | 25
3 files changed, 72 insertions, 21 deletions
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 35fe142ebb5e..beca6244671a 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -33,10 +33,11 @@ EXPORT_SYMBOL(cpumask_next);
33int cpumask_next_and(int n, const struct cpumask *src1p, 33int cpumask_next_and(int n, const struct cpumask *src1p,
34 const struct cpumask *src2p) 34 const struct cpumask *src2p)
35{ 35{
36 while ((n = cpumask_next(n, src1p)) < nr_cpu_ids) 36 /* -1 is a legal arg here. */
37 if (cpumask_test_cpu(n, src2p)) 37 if (n != -1)
38 break; 38 cpumask_check(n);
39 return n; 39 return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
40 nr_cpumask_bits, n + 1);
40} 41}
41EXPORT_SYMBOL(cpumask_next_and); 42EXPORT_SYMBOL(cpumask_next_and);
42 43
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 6ed74f78380c..ee3df93ba69a 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -21,22 +21,29 @@
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23 23
24#if !defined(find_next_bit) || !defined(find_next_zero_bit) 24#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \
25 !defined(find_next_and_bit)
25 26
26/* 27/*
27 * This is a common helper function for find_next_bit and 28 * This is a common helper function for find_next_bit, find_next_zero_bit, and
28 * find_next_zero_bit. The difference is the "invert" argument, which 29 * find_next_and_bit. The differences are:
29 * is XORed with each fetched word before searching it for one bits. 30 * - The "invert" argument, which is XORed with each fetched word before
31 * searching it for one bits.
32 * - The optional "addr2", which is anded with "addr1" if present.
30 */ 33 */
31static unsigned long _find_next_bit(const unsigned long *addr, 34static inline unsigned long _find_next_bit(const unsigned long *addr1,
32 unsigned long nbits, unsigned long start, unsigned long invert) 35 const unsigned long *addr2, unsigned long nbits,
36 unsigned long start, unsigned long invert)
33{ 37{
34 unsigned long tmp; 38 unsigned long tmp;
35 39
36 if (unlikely(start >= nbits)) 40 if (unlikely(start >= nbits))
37 return nbits; 41 return nbits;
38 42
39 tmp = addr[start / BITS_PER_LONG] ^ invert; 43 tmp = addr1[start / BITS_PER_LONG];
44 if (addr2)
45 tmp &= addr2[start / BITS_PER_LONG];
46 tmp ^= invert;
40 47
41 /* Handle 1st word. */ 48 /* Handle 1st word. */
42 tmp &= BITMAP_FIRST_WORD_MASK(start); 49 tmp &= BITMAP_FIRST_WORD_MASK(start);
@@ -47,7 +54,10 @@ static unsigned long _find_next_bit(const unsigned long *addr,
47 if (start >= nbits) 54 if (start >= nbits)
48 return nbits; 55 return nbits;
49 56
50 tmp = addr[start / BITS_PER_LONG] ^ invert; 57 tmp = addr1[start / BITS_PER_LONG];
58 if (addr2)
59 tmp &= addr2[start / BITS_PER_LONG];
60 tmp ^= invert;
51 } 61 }
52 62
53 return min(start + __ffs(tmp), nbits); 63 return min(start + __ffs(tmp), nbits);
@@ -61,7 +71,7 @@ static unsigned long _find_next_bit(const unsigned long *addr,
61unsigned long find_next_bit(const unsigned long *addr, unsigned long size, 71unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
62 unsigned long offset) 72 unsigned long offset)
63{ 73{
64 return _find_next_bit(addr, size, offset, 0UL); 74 return _find_next_bit(addr, NULL, size, offset, 0UL);
65} 75}
66EXPORT_SYMBOL(find_next_bit); 76EXPORT_SYMBOL(find_next_bit);
67#endif 77#endif
@@ -70,11 +80,21 @@ EXPORT_SYMBOL(find_next_bit);
70unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, 80unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
71 unsigned long offset) 81 unsigned long offset)
72{ 82{
73 return _find_next_bit(addr, size, offset, ~0UL); 83 return _find_next_bit(addr, NULL, size, offset, ~0UL);
74} 84}
75EXPORT_SYMBOL(find_next_zero_bit); 85EXPORT_SYMBOL(find_next_zero_bit);
76#endif 86#endif
77 87
88#if !defined(find_next_and_bit)
89unsigned long find_next_and_bit(const unsigned long *addr1,
90 const unsigned long *addr2, unsigned long size,
91 unsigned long offset)
92{
93 return _find_next_bit(addr1, addr2, size, offset, 0UL);
94}
95EXPORT_SYMBOL(find_next_and_bit);
96#endif
97
78#ifndef find_first_bit 98#ifndef find_first_bit
79/* 99/*
80 * Find the first set bit in a memory region. 100 * Find the first set bit in a memory region.
@@ -146,15 +166,19 @@ static inline unsigned long ext2_swab(const unsigned long y)
146} 166}
147 167
148#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) 168#if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le)
149static unsigned long _find_next_bit_le(const unsigned long *addr, 169static inline unsigned long _find_next_bit_le(const unsigned long *addr1,
150 unsigned long nbits, unsigned long start, unsigned long invert) 170 const unsigned long *addr2, unsigned long nbits,
171 unsigned long start, unsigned long invert)
151{ 172{
152 unsigned long tmp; 173 unsigned long tmp;
153 174
154 if (unlikely(start >= nbits)) 175 if (unlikely(start >= nbits))
155 return nbits; 176 return nbits;
156 177
157 tmp = addr[start / BITS_PER_LONG] ^ invert; 178 tmp = addr1[start / BITS_PER_LONG];
179 if (addr2)
180 tmp &= addr2[start / BITS_PER_LONG];
181 tmp ^= invert;
158 182
159 /* Handle 1st word. */ 183 /* Handle 1st word. */
160 tmp &= ext2_swab(BITMAP_FIRST_WORD_MASK(start)); 184 tmp &= ext2_swab(BITMAP_FIRST_WORD_MASK(start));
@@ -165,7 +189,10 @@ static unsigned long _find_next_bit_le(const unsigned long *addr,
165 if (start >= nbits) 189 if (start >= nbits)
166 return nbits; 190 return nbits;
167 191
168 tmp = addr[start / BITS_PER_LONG] ^ invert; 192 tmp = addr1[start / BITS_PER_LONG];
193 if (addr2)
194 tmp &= addr2[start / BITS_PER_LONG];
195 tmp ^= invert;
169 } 196 }
170 197
171 return min(start + __ffs(ext2_swab(tmp)), nbits); 198 return min(start + __ffs(ext2_swab(tmp)), nbits);
@@ -176,7 +203,7 @@ static unsigned long _find_next_bit_le(const unsigned long *addr,
176unsigned long find_next_zero_bit_le(const void *addr, unsigned 203unsigned long find_next_zero_bit_le(const void *addr, unsigned
177 long size, unsigned long offset) 204 long size, unsigned long offset)
178{ 205{
179 return _find_next_bit_le(addr, size, offset, ~0UL); 206 return _find_next_bit_le(addr, NULL, size, offset, ~0UL);
180} 207}
181EXPORT_SYMBOL(find_next_zero_bit_le); 208EXPORT_SYMBOL(find_next_zero_bit_le);
182#endif 209#endif
@@ -185,7 +212,7 @@ EXPORT_SYMBOL(find_next_zero_bit_le);
185unsigned long find_next_bit_le(const void *addr, unsigned 212unsigned long find_next_bit_le(const void *addr, unsigned
186 long size, unsigned long offset) 213 long size, unsigned long offset)
187{ 214{
188 return _find_next_bit_le(addr, size, offset, 0UL); 215 return _find_next_bit_le(addr, NULL, size, offset, 0UL);
189} 216}
190EXPORT_SYMBOL(find_next_bit_le); 217EXPORT_SYMBOL(find_next_bit_le);
191#endif 218#endif
diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c
index 67b19233c28f..5985a25e6cbc 100644
--- a/lib/find_bit_benchmark.c
+++ b/lib/find_bit_benchmark.c
@@ -35,6 +35,7 @@
35#define SPARSE 500 35#define SPARSE 500
36 36
37static DECLARE_BITMAP(bitmap, BITMAP_LEN) __initdata; 37static DECLARE_BITMAP(bitmap, BITMAP_LEN) __initdata;
38static DECLARE_BITMAP(bitmap2, BITMAP_LEN) __initdata;
38 39
39/* 40/*
40 * This is Schlemiel the Painter's algorithm. It should be called after 41 * This is Schlemiel the Painter's algorithm. It should be called after
@@ -103,6 +104,22 @@ static int __init test_find_last_bit(const void *bitmap, unsigned long len)
103 return 0; 104 return 0;
104} 105}
105 106
107static int __init test_find_next_and_bit(const void *bitmap,
108 const void *bitmap2, unsigned long len)
109{
110 unsigned long i, cnt;
111 cycles_t cycles;
112
113 cycles = get_cycles();
114 for (cnt = i = 0; i < BITMAP_LEN; cnt++)
115 i = find_next_and_bit(bitmap, bitmap2, BITMAP_LEN, i+1);
116 cycles = get_cycles() - cycles;
117 pr_err("find_next_and_bit:\t\t%llu cycles, %ld iterations\n",
118 (u64)cycles, cnt);
119
120 return 0;
121}
122
106static int __init find_bit_test(void) 123static int __init find_bit_test(void)
107{ 124{
108 unsigned long nbits = BITMAP_LEN / SPARSE; 125 unsigned long nbits = BITMAP_LEN / SPARSE;
@@ -110,23 +127,29 @@ static int __init find_bit_test(void)
110 pr_err("\nStart testing find_bit() with random-filled bitmap\n"); 127 pr_err("\nStart testing find_bit() with random-filled bitmap\n");
111 128
112 get_random_bytes(bitmap, sizeof(bitmap)); 129 get_random_bytes(bitmap, sizeof(bitmap));
130 get_random_bytes(bitmap2, sizeof(bitmap2));
113 131
114 test_find_next_bit(bitmap, BITMAP_LEN); 132 test_find_next_bit(bitmap, BITMAP_LEN);
115 test_find_next_zero_bit(bitmap, BITMAP_LEN); 133 test_find_next_zero_bit(bitmap, BITMAP_LEN);
116 test_find_last_bit(bitmap, BITMAP_LEN); 134 test_find_last_bit(bitmap, BITMAP_LEN);
117 test_find_first_bit(bitmap, BITMAP_LEN); 135 test_find_first_bit(bitmap, BITMAP_LEN);
136 test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN);
118 137
119 pr_err("\nStart testing find_bit() with sparse bitmap\n"); 138 pr_err("\nStart testing find_bit() with sparse bitmap\n");
120 139
121 bitmap_zero(bitmap, BITMAP_LEN); 140 bitmap_zero(bitmap, BITMAP_LEN);
141 bitmap_zero(bitmap2, BITMAP_LEN);
122 142
123 while (nbits--) 143 while (nbits--) {
124 __set_bit(prandom_u32() % BITMAP_LEN, bitmap); 144 __set_bit(prandom_u32() % BITMAP_LEN, bitmap);
145 __set_bit(prandom_u32() % BITMAP_LEN, bitmap2);
146 }
125 147
126 test_find_next_bit(bitmap, BITMAP_LEN); 148 test_find_next_bit(bitmap, BITMAP_LEN);
127 test_find_next_zero_bit(bitmap, BITMAP_LEN); 149 test_find_next_zero_bit(bitmap, BITMAP_LEN);
128 test_find_last_bit(bitmap, BITMAP_LEN); 150 test_find_last_bit(bitmap, BITMAP_LEN);
129 test_find_first_bit(bitmap, BITMAP_LEN); 151 test_find_first_bit(bitmap, BITMAP_LEN);
152 test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN);
130 153
131 /* 154 /*
132 * Everything is OK. Return error just to let user run benchmark 155 * Everything is OK. Return error just to let user run benchmark