aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnton Blanchard <anton@samba.org>2012-05-27 15:54:03 -0400
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2012-07-03 00:14:41 -0400
commit17968fbbd19f1bb281ee4eb2548764ac5664c4ec (patch)
treec6b7a68ea7897e6bf213bffa4795fe209609cba9
parentd136e27326a3bd50d7929a43c018abf13e426b7e (diff)
powerpc: 64bit optimised __clear_user
I noticed __clear_user high up in a profile of one of my RAID stress tests. The testcase was doing a dd from /dev/zero which ends up calling __clear_user. __clear_user is basically a loop with a single 4 byte store which is horribly slow. We can do much better by aligning the destination and doing 32 bytes of 8 byte stores in a loop. The following testcase was used to verify the patch: http://ozlabs.org/~anton/junkcode/stress_clear_user.c To show the improvement in performance I ran a dd from /dev/zero to /dev/null on a POWER7 box: Before: # dd if=/dev/zero of=/dev/null bs=1M count=10000 10485760000 bytes (10 GB) copied, 3.72379 s, 2.8 GB/s After: # time dd if=/dev/zero of=/dev/null bs=1M count=10000 10485760000 bytes (10 GB) copied, 0.728318 s, 14.4 GB/s Over 5x faster. Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
-rw-r--r--arch/powerpc/lib/Makefile2
-rw-r--r--arch/powerpc/lib/string.S2
-rw-r--r--arch/powerpc/lib/string_64.S141
3 files changed, 144 insertions, 1 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 7735a2c2e6d9..f049e339e456 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_HAS_IOMEM) += devres.o
17obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ 17obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \
18 memcpy_64.o usercopy_64.o mem_64.o string.o \ 18 memcpy_64.o usercopy_64.o mem_64.o string.o \
19 checksum_wrappers_64.o hweight_64.o \ 19 checksum_wrappers_64.o hweight_64.o \
20 copyuser_power7.o 20 copyuser_power7.o string_64.o
21obj-$(CONFIG_XMON) += sstep.o ldstfp.o 21obj-$(CONFIG_XMON) += sstep.o ldstfp.o
22obj-$(CONFIG_KPROBES) += sstep.o ldstfp.o 22obj-$(CONFIG_KPROBES) += sstep.o ldstfp.o
23obj-$(CONFIG_HAVE_HW_BREAKPOINT) += sstep.o ldstfp.o 23obj-$(CONFIG_HAVE_HW_BREAKPOINT) += sstep.o ldstfp.o
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 093d6316435c..1b5a0a09d609 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -119,6 +119,7 @@ _GLOBAL(memchr)
1192: li r3,0 1192: li r3,0
120 blr 120 blr
121 121
122#ifdef CONFIG_PPC32
122_GLOBAL(__clear_user) 123_GLOBAL(__clear_user)
123 addi r6,r3,-4 124 addi r6,r3,-4
124 li r3,0 125 li r3,0
@@ -160,3 +161,4 @@ _GLOBAL(__clear_user)
160 PPC_LONG 1b,91b 161 PPC_LONG 1b,91b
161 PPC_LONG 8b,92b 162 PPC_LONG 8b,92b
162 .text 163 .text
164#endif
diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S
new file mode 100644
index 000000000000..6613b9047005
--- /dev/null
+++ b/arch/powerpc/lib/string_64.S
@@ -0,0 +1,141 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2012
17 *
18 * Author: Anton Blanchard <anton@au.ibm.com>
19 */
20
21#include <asm/ppc_asm.h>
22
23/**
24 * __clear_user: - Zero a block of memory in user space, with less checking.
25 * @to: Destination address, in user space.
26 * @n: Number of bytes to zero.
27 *
28 * Zero a block of memory in user space. Caller must check
29 * the specified block with access_ok() before calling this function.
30 *
31 * Returns number of bytes that could not be cleared.
32 * On success, this will be zero.
33 */
34
/*
 * err1: written as a prefix on the same line as a store ("err1; stb ...").
 * Defines local label 100 at the current location, i.e. at the
 * instruction that follows the macro, then records the pair
 * (address of that instruction, .Ldo_err1) as two 64-bit words in the
 * __ex_table section and switches back to the previous section.
 * NOTE(review): presumably the fault handler uses this entry to resume
 * at .Ldo_err1 when the tagged store faults - confirm against the
 * kernel's exception table handling.
 */
35 .macro err1
36100:
37 .section __ex_table,"a"
38 .align 3
39 .llong 100b,.Ldo_err1
40 .previous
41 .endm
42
/*
 * err2: written as a prefix on the same line as a store ("err2; std ...").
 * Defines local label 200 at the current location, i.e. at the
 * instruction that follows the macro, then records the pair
 * (address of that instruction, .Ldo_err2) as two 64-bit words in the
 * __ex_table section and switches back to the previous section.
 * NOTE(review): presumably the fault handler uses this entry to resume
 * at .Ldo_err2 when the tagged store faults - confirm against the
 * kernel's exception table handling.
 */
43 .macro err2
44200:
45 .section __ex_table,"a"
46 .align 3
47 .llong 200b,.Ldo_err2
48 .previous
49 .endm
50
/*
 * err3: written as a prefix on the same line as a store ("err3; stb ...").
 * Defines local label 300 at the current location, i.e. at the
 * instruction that follows the macro, then records the pair
 * (address of that instruction, .Ldo_err3) as two 64-bit words in the
 * __ex_table section and switches back to the previous section.
 * NOTE(review): presumably the fault handler uses this entry to resume
 * at .Ldo_err3 when the tagged store faults - confirm against the
 * kernel's exception table handling.
 */
51 .macro err3
52300:
53 .section __ex_table,"a"
54 .align 3
55 .llong 300b,.Ldo_err3
56 .previous
57 .endm
58
/*
 * Fixup targets named by the __ex_table entries that the err1/err2/err3
 * macros emit. The three labels fall through into one another.
 *
 * Register use on entry (as set up by __clear_user):
 *   r0 = 0 (the value being stored)
 *   r3 = current destination address
 *   r4 = byte count still to be cleared
 *   r8 = destination address saved before a run of stores that do not
 *        update r4 (only meaningful for .Ldo_err1)
 */

/* err1 target: rewind r3 to the address saved in r8, then retry bytewise. */
59.Ldo_err1:
60 mr r3,r8
61
/*
 * err2 target (and fall-through from .Ldo_err1): clear the remaining r4
 * bytes one at a time, using CTR as the loop counter. r4 is decremented
 * after every byte so it always holds the number of bytes not yet cleared.
 */
62.Ldo_err2:
63 mtctr r4
641:
/* This byte store is itself tagged: its table entry points at .Ldo_err3. */
65err3; stb r0,0(r3)
66 addi r3,r3,1
67 addi r4,r4,-1
68 bdnz 1b
69
/*
 * err3 target, also reached by falling out of the byte loop above (in
 * which case r4 has been counted down to 0): return r4, the number of
 * bytes that were not cleared, in r3.
 */
70.Ldo_err3:
71 mr r3,r4
72 blr
73
/*
 * __clear_user: store zeros over a block of memory.
 *
 * Register use in this routine:
 *   r3 = destination address (advanced as stores are done)
 *   r4 = number of bytes still to clear
 *   r0 = 0, the value stored
 *   r6 = scratch (alignment distance, then 32 byte chunk count)
 *   r8 = address saved for the .Ldo_err1 fixup
 *
 * Returns 0 in r3 on the normal path. Stores are tagged with err1/err2;
 * the fixup code they point at returns the count of uncleared bytes.
 *
 * Outline: for counts of 32 or more, align the destination to 8 bytes,
 * then clear 32 bytes per loop iteration; finish with an optional
 * 16 byte step and a 0-15 byte tail selected by the bits of the count.
 */
74_GLOBAL(__clear_user)
75 cmpdi r4,32
/* r6 = -r3; its low 3 bits are the distance to the next 8 byte boundary. */
76 neg r6,r3
77 li r0,0
/* Fewer than 32 bytes requested: skip the alignment and 32 byte loop. */
78 blt .Lshort_clear
/* Remember the start: the err1 stores below do not update r4, so the
 * .Ldo_err1 fixup restarts from r8 with the full count in r4. */
79 mr r8,r3
/* Copy the low 4 bits of r6 into CR7 so they can be tested with bf. */
80 mtocrf 0x01,r6
/* r6 = (-r3) & 7: number of bytes the alignment stores will write. */
81 clrldi r6,r6,(64-3)
82
83 /* Get the destination 8 byte aligned */
/* 1s bit of the alignment distance clear: no single byte store needed. */
84 bf cr7*4+3,1f
85err1; stb r0,0(r3)
86 addi r3,r3,1
87
/* 2s bit: optional halfword store. */
881: bf cr7*4+2,2f
89err1; sth r0,0(r3)
90 addi r3,r3,2
91
/* 4s bit: optional word store. */
922: bf cr7*4+1,3f
93err1; stw r0,0(r3)
94 addi r3,r3,4
95
/* Account for the alignment bytes, then r6 = number of 32 byte chunks. */
963: sub r4,r4,r6
97 srdi r6,r4,5
/* Alignment may have left fewer than 32 bytes: go straight to the tail. */
98 cmpdi r4,32
99 blt .Lshort_clear
100 mtctr r6
101
102 /* Do 32 byte chunks */
/* r3 and r4 are updated once per iteration, so on an err2 fault the
 * .Ldo_err2 fixup retries the current chunk bytewise from r3. */
1034:
104err2; std r0,0(r3)
105err2; std r0,8(r3)
106err2; std r0,16(r3)
107err2; std r0,24(r3)
108 addi r3,r3,32
109 addi r4,r4,-32
110 bdnz 4b
111
112.Lshort_clear:
113 /* up to 31 bytes to go */
/* Clear 16 bytes with two doubleword stores if at least 16 remain. */
114 cmpdi r4,16
115 blt 6f
116err2; std r0,0(r3)
117err2; std r0,8(r3)
118 addi r3,r3,16
119 addi r4,r4,-16
120
121 /* Up to 15 bytes to go */
/* Save the tail start in r8: the err1 stores below do not update r4, so
 * the .Ldo_err1 fixup restarts the whole tail from r8. */
1226: mr r8,r3
/* r4 &= 15, then copy those 4 bits into CR7; each bit selects one store. */
123 clrldi r4,r4,(64-4)
124 mtocrf 0x01,r4
/* 8s bit: optional doubleword store. */
125 bf cr7*4+0,7f
126err1; std r0,0(r3)
127 addi r3,r3,8
128
/* 4s bit: optional word store. */
1297: bf cr7*4+1,8f
130err1; stw r0,0(r3)
131 addi r3,r3,4
132
/* 2s bit: optional halfword store. */
1338: bf cr7*4+2,9f
134err1; sth r0,0(r3)
135 addi r3,r3,2
136
/* 1s bit: optional final byte store. */
1379: bf cr7*4+3,10f
138err1; stb r0,0(r3)
139
/* Everything was stored without taking a fixup path: return 0. */
14010: li r3,0
141 blr