aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
author Anton Blanchard <anton@samba.org> 2010-08-12 12:28:09 -0400
committer Benjamin Herrenschmidt <benh@kernel.crashing.org> 2010-11-28 23:48:17 -0500
commit 64ff31287693c1f325cb9cb049569c1611438ef1 (patch)
tree 3c159d2ca6f967fca13bae17cff19f92e0b3896c /arch
parent 72083646528d4887b920deb71b37e09bc7d227bb (diff)
powerpc: Add support for popcnt instructions
POWER5 added popcntb, and POWER7 added popcntw and popcntd. As a first step, this patch does all the work out of line, but it would be nice to implement them as inlines with an out-of-line fallback.

The performance issue with hweight was noticed when disabling SMT on a large (192-thread) POWER7 box. The patch improves that test case by about 8%.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch')
-rw-r--r-- arch/powerpc/include/asm/bitops.h 9
-rw-r--r-- arch/powerpc/include/asm/cputable.h 9
-rw-r--r-- arch/powerpc/kernel/ppc_ksyms.c 7
-rw-r--r-- arch/powerpc/lib/Makefile 2
-rw-r--r-- arch/powerpc/lib/hweight_64.S 110
5 files changed, 133 insertions, 4 deletions
diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 30964ae2d096..8a7e9314c68a 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -267,7 +267,16 @@ static __inline__ int fls64(__u64 x)
267#include <asm-generic/bitops/fls64.h> 267#include <asm-generic/bitops/fls64.h>
268#endif /* __powerpc64__ */ 268#endif /* __powerpc64__ */
269 269
270#ifdef CONFIG_PPC64
271unsigned int __arch_hweight8(unsigned int w);
272unsigned int __arch_hweight16(unsigned int w);
273unsigned int __arch_hweight32(unsigned int w);
274unsigned long __arch_hweight64(__u64 w);
275#include <asm-generic/bitops/const_hweight.h>
276#else
270#include <asm-generic/bitops/hweight.h> 277#include <asm-generic/bitops/hweight.h>
278#endif
279
271#include <asm-generic/bitops/find.h> 280#include <asm-generic/bitops/find.h>
272 281
273/* Little-endian versions */ 282/* Little-endian versions */
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index f3a1fdd9cf08..f0a211d96923 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -199,6 +199,8 @@ extern const char *powerpc_base_platform;
199#define CPU_FTR_UNALIGNED_LD_STD LONG_ASM_CONST(0x0080000000000000) 199#define CPU_FTR_UNALIGNED_LD_STD LONG_ASM_CONST(0x0080000000000000)
200#define CPU_FTR_ASYM_SMT LONG_ASM_CONST(0x0100000000000000) 200#define CPU_FTR_ASYM_SMT LONG_ASM_CONST(0x0100000000000000)
201#define CPU_FTR_STCX_CHECKS_ADDRESS LONG_ASM_CONST(0x0200000000000000) 201#define CPU_FTR_STCX_CHECKS_ADDRESS LONG_ASM_CONST(0x0200000000000000)
202#define CPU_FTR_POPCNTB LONG_ASM_CONST(0x0400000000000000)
203#define CPU_FTR_POPCNTD LONG_ASM_CONST(0x0800000000000000)
202 204
203#ifndef __ASSEMBLY__ 205#ifndef __ASSEMBLY__
204 206
@@ -403,21 +405,22 @@ extern const char *powerpc_base_platform;
403 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ 405 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
404 CPU_FTR_MMCRA | CPU_FTR_SMT | \ 406 CPU_FTR_MMCRA | CPU_FTR_SMT | \
405 CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \ 407 CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
406 CPU_FTR_PURR | CPU_FTR_STCX_CHECKS_ADDRESS) 408 CPU_FTR_PURR | CPU_FTR_STCX_CHECKS_ADDRESS | \
409 CPU_FTR_POPCNTB)
407#define CPU_FTRS_POWER6 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ 410#define CPU_FTRS_POWER6 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
408 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ 411 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
409 CPU_FTR_MMCRA | CPU_FTR_SMT | \ 412 CPU_FTR_MMCRA | CPU_FTR_SMT | \
410 CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \ 413 CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
411 CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ 414 CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
412 CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \ 415 CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \
413 CPU_FTR_STCX_CHECKS_ADDRESS) 416 CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB)
414#define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ 417#define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
415 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ 418 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
416 CPU_FTR_MMCRA | CPU_FTR_SMT | \ 419 CPU_FTR_MMCRA | CPU_FTR_SMT | \
417 CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \ 420 CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
418 CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ 421 CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
419 CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \ 422 CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \
420 CPU_FTR_STCX_CHECKS_ADDRESS) 423 CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD)
421#define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ 424#define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
422 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ 425 CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
423 CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ 426 CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index ab3e392ac63c..ef3ef566235e 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -186,3 +186,10 @@ EXPORT_SYMBOL(__mtdcr);
186EXPORT_SYMBOL(__mfdcr); 186EXPORT_SYMBOL(__mfdcr);
187#endif 187#endif
188EXPORT_SYMBOL(empty_zero_page); 188EXPORT_SYMBOL(empty_zero_page);
189
190#ifdef CONFIG_PPC64
191EXPORT_SYMBOL(__arch_hweight8);
192EXPORT_SYMBOL(__arch_hweight16);
193EXPORT_SYMBOL(__arch_hweight32);
194EXPORT_SYMBOL(__arch_hweight64);
195#endif
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 889f2bc106dd..166a6a0ad544 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_HAS_IOMEM) += devres.o
16 16
17obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ 17obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \
18 memcpy_64.o usercopy_64.o mem_64.o string.o \ 18 memcpy_64.o usercopy_64.o mem_64.o string.o \
19 checksum_wrappers_64.o 19 checksum_wrappers_64.o hweight_64.o
20obj-$(CONFIG_XMON) += sstep.o ldstfp.o 20obj-$(CONFIG_XMON) += sstep.o ldstfp.o
21obj-$(CONFIG_KPROBES) += sstep.o ldstfp.o 21obj-$(CONFIG_KPROBES) += sstep.o ldstfp.o
22obj-$(CONFIG_HAVE_HW_BREAKPOINT) += sstep.o ldstfp.o 22obj-$(CONFIG_HAVE_HW_BREAKPOINT) += sstep.o ldstfp.o
diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S
new file mode 100644
index 000000000000..ee2320bb5ddf
--- /dev/null
+++ b/arch/powerpc/lib/hweight_64.S
@@ -0,0 +1,110 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2010
17 *
18 * Author: Anton Blanchard <anton@au.ibm.com>
19 */
20#include <asm/processor.h>
21#include <asm/ppc_asm.h>
22
23/* Note: This code relies on -mminimal-toc */
24
/*
 * __arch_hweight8: count the set bits in the low byte of r3; the hardware
 * path returns the count in r3.
 *
 * Two alternatives are provided, keyed on CPU_FTR_POPCNTB. Both are exactly
 * three instructions long.
 */
25_GLOBAL(__arch_hweight8)
26BEGIN_FTR_SECTION
/*
 * Alternative used when CPU_FTR_POPCNTB is clear (per the _IFCLR end macro):
 * plain branch, no link, to .__sw_hweight8, which is defined outside this
 * file. The nops after the unconditional branch are never executed; they
 * presumably exist so this section is as long as the popcntb one.
 */
27 b .__sw_hweight8
28 nop
29 nop
30FTR_SECTION_ELSE
/*
 * popcntb writes, into each byte of r3, the number of set bits in that byte.
 * clrldi then clears the upper 56 bits, leaving only the lowest byte's count.
 */
31 popcntb r3,r3
32 clrldi r3,r3,64-8
33 blr
34ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
35
/*
 * __arch_hweight16: count the set bits in the low 16 bits of r3; the
 * hardware paths return the count in r3.
 *
 * Outer alternative is keyed on CPU_FTR_POPCNTB. Its else-branch holds a
 * nested alternative (label 50) keyed on CPU_FTR_POPCNTD. The fallback
 * section is five instructions, matching the longer nested path.
 */
36_GLOBAL(__arch_hweight16)
37BEGIN_FTR_SECTION
/*
 * Used when CPU_FTR_POPCNTB is clear: branch (no link) to .__sw_hweight16,
 * defined outside this file. The trailing nops are never executed and only
 * pad the section.
 */
38 b .__sw_hweight16
39 nop
40 nop
41 nop
42 nop
43FTR_SECTION_ELSE
44 BEGIN_FTR_SECTION_NESTED(50)
/*
 * Used when CPU_FTR_POPCNTD is clear: popcntb gives per-byte counts, then
 * byte 1's count is added onto byte 0's. clrldi keeps only the low byte, so
 * counts from higher bytes are discarded.
 */
45 popcntb r3,r3
46 srdi r4,r3,8
47 add r3,r4,r3
48 clrldi r3,r3,64-8
49 blr
50 FTR_SECTION_ELSE_NESTED(50)
/*
 * Used when CPU_FTR_POPCNTD is set: clrlwi masks the input down to its low
 * 16 bits so that popcntw, which counts over a whole 32-bit word, only sees
 * the bits of interest. clrldi keeps the low byte of the result.
 */
51 clrlwi r3,r3,16
52 popcntw r3,r3
53 clrldi r3,r3,64-8
54 blr
55 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50)
56ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
57
/*
 * __arch_hweight32: count the set bits in the low 32 bits of r3; the
 * hardware paths return the count in r3.
 *
 * Outer alternative is keyed on CPU_FTR_POPCNTB. Its else-branch holds a
 * nested alternative (label 51) keyed on CPU_FTR_POPCNTD. The fallback
 * section is seven instructions, matching the longer nested path.
 */
58_GLOBAL(__arch_hweight32)
59BEGIN_FTR_SECTION
/*
 * Used when CPU_FTR_POPCNTB is clear: branch (no link) to .__sw_hweight32,
 * defined outside this file. The trailing nops are never executed and only
 * pad the section.
 */
60 b .__sw_hweight32
61 nop
62 nop
63 nop
64 nop
65 nop
66 nop
67FTR_SECTION_ELSE
68 BEGIN_FTR_SECTION_NESTED(51)
/*
 * Used when CPU_FTR_POPCNTD is clear: popcntb gives per-byte counts, then
 * two shift-and-add steps (by 16, then by 8) fold the counts of bytes 0..3
 * into byte 0. clrldi keeps only that byte; whatever accumulated in higher
 * bytes is discarded.
 */
69 popcntb r3,r3
70 srdi r4,r3,16
71 add r3,r4,r3
72 srdi r4,r3,8
73 add r3,r4,r3
74 clrldi r3,r3,64-8
75 blr
76 FTR_SECTION_ELSE_NESTED(51)
/*
 * Used when CPU_FTR_POPCNTD is set: popcntw counts the set bits of each
 * 32-bit word directly. clrldi keeps the low byte of the low word's count.
 */
77 popcntw r3,r3
78 clrldi r3,r3,64-8
79 blr
80 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51)
81ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
82
/*
 * __arch_hweight64: count the set bits in the full 64-bit value in r3; the
 * hardware paths return the count in r3.
 *
 * Outer alternative is keyed on CPU_FTR_POPCNTB. Its else-branch holds a
 * nested alternative (label 52) keyed on CPU_FTR_POPCNTD. The fallback
 * section is nine instructions, matching the longer nested path.
 */
83_GLOBAL(__arch_hweight64)
84BEGIN_FTR_SECTION
/*
 * Used when CPU_FTR_POPCNTB is clear: branch (no link) to .__sw_hweight64,
 * defined outside this file. The trailing nops are never executed and only
 * pad the section.
 */
85 b .__sw_hweight64
86 nop
87 nop
88 nop
89 nop
90 nop
91 nop
92 nop
93 nop
94FTR_SECTION_ELSE
95 BEGIN_FTR_SECTION_NESTED(52)
/*
 * Used when CPU_FTR_POPCNTD is clear: popcntb gives per-byte counts, then
 * three shift-and-add steps (by 32, 16, 8) fold all eight byte counts into
 * byte 0. The total is at most 64, so it fits in the single byte that
 * clrldi keeps.
 */
96 popcntb r3,r3
97 srdi r4,r3,32
98 add r3,r4,r3
99 srdi r4,r3,16
100 add r3,r4,r3
101 srdi r4,r3,8
102 add r3,r4,r3
103 clrldi r3,r3,64-8
104 blr
105 FTR_SECTION_ELSE_NESTED(52)
/*
 * Used when CPU_FTR_POPCNTD is set: popcntd counts the set bits of the whole
 * doubleword in one instruction. clrldi keeps the low byte of the result.
 */
106 popcntd r3,r3
107 clrldi r3,r3,64-8
108 blr
109 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52)
110ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)