aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Christophe Leroy <christophe.leroy@c-s.fr> 2018-05-30 03:06:13 -0400
committer Michael Ellerman <mpe@ellerman.id.au> 2018-06-03 10:39:21 -0400
commit f36bbf21e8b911b3c629fd36d4d217105b47a20e (patch)
tree 87e644ed266162108da765a94bb975443bff3f60
parent 60f1d2893ee6de65cdea609c84950b133e76a769 (diff)
powerpc/lib: optimise 32 bits __clear_user()
Rewrite clear_user() on the same principle as memset(0), making use of dcbz to clear complete cache lines.

This code is a copy/paste of memset(), with some modifications in order to retrieve the remaining number of bytes to be cleared, as it needs to be returned in case of error.

In the same way as done on PPC64 in commit 17968fbbd19f1 ("powerpc: 64bit optimised __clear_user"), the patch moves __clear_user() into a dedicated file, string_32.S.

On an MPC885, throughput is almost doubled:

Before:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 18.990779 seconds, 52.7MB/s

After:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 9.611468 seconds, 104.0MB/s

On an MPC8321, throughput is multiplied by 2.12:

Before:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 6.844352 seconds, 146.1MB/s

After:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 3.218854 seconds, 310.7MB/s

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
-rw-r--r-- arch/powerpc/lib/Makefile 5
-rw-r--r-- arch/powerpc/lib/string.S 46
-rw-r--r-- arch/powerpc/lib/string_32.S 90
3 files changed, 93 insertions, 48 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 653901042ad7..2c9b8c0adf22 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -26,13 +26,14 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
26 memcpy_power7.o 26 memcpy_power7.o
27 27
28obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ 28obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
29 string_64.o memcpy_64.o memcmp_64.o pmem.o 29 memcpy_64.o memcmp_64.o pmem.o
30 30
31obj64-$(CONFIG_SMP) += locks.o 31obj64-$(CONFIG_SMP) += locks.o
32obj64-$(CONFIG_ALTIVEC) += vmx-helper.o 32obj64-$(CONFIG_ALTIVEC) += vmx-helper.o
33obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o 33obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o
34 34
35obj-y += checksum_$(BITS).o checksum_wrappers.o 35obj-y += checksum_$(BITS).o checksum_wrappers.o \
36 string_$(BITS).o
36 37
37obj-y += sstep.o ldstfp.o quad.o 38obj-y += sstep.o ldstfp.o quad.o
38obj64-y += quad.o 39obj64-y += quad.o
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 0378def28d41..5343a88e619e 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -8,8 +8,6 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11#include <asm/processor.h>
12#include <asm/errno.h>
13#include <asm/ppc_asm.h> 11#include <asm/ppc_asm.h>
14#include <asm/export.h> 12#include <asm/export.h>
15#include <asm/cache.h> 13#include <asm/cache.h>
@@ -86,47 +84,3 @@ _GLOBAL(memchr)
862: li r3,0 842: li r3,0
87 blr 85 blr
88EXPORT_SYMBOL(memchr) 86EXPORT_SYMBOL(memchr)
89
90#ifdef CONFIG_PPC32
91_GLOBAL(__clear_user) /* zero r4 bytes starting at r3; r3 returns 0, or a count of bytes left when a store faults */
92 addi r6,r3,-4 /* r6 = start - 4: every word store below is the update form stwu 4(r6) */
93 li r3,0 /* return value when no store faults */
94 li r5,0 /* r5 = the zero that gets stored */
95 cmplwi 0,r4,4 /* unsigned compare of the size with 4; cr0 is still used by the beqlr after label 11 */
96 blt 7f /* fewer than 4 bytes: byte loop only */
97 /* clear a single word */
9811: stwu r5,4(r6) /* label 11: zero the word at the start address; r6 = start */
99 beqlr /* size == 4 (cr0 from the cmplwi above): done, r3 = 0 */
100 /* clear word sized chunks */
101 andi. r0,r6,3 /* r0 = start & 3 */
102 add r4,r0,r4 /* r4 = size + (start & 3): bytes from the word boundary at or below start to the end */
103 subf r6,r0,r6 /* r6 = start rounded down to a word boundary */
104 srwi r0,r4,2 /* r0 = number of words in that span */
105 andi. r4,r4,3 /* r4 = bytes left over after the whole words */
106 mtctr r0
107 bdz 7f /* ctr - 1 words remain (bdz decrements first); none left: go to the byte tail */
1081: stwu r5,4(r6) /* label 1: zero the next aligned word */
109 bdnz 1b
110 /* clear byte sized chunks */
1117: cmpwi 0,r4,0 /* label 7: any bytes left? */
112 beqlr /* no: done, r3 = 0 */
113 mtctr r4
114 addi r6,r6,3 /* r6 was 4 below the next address; stbu 1(r6) needs it 1 below */
1158: stbu r5,1(r6) /* label 8: zero one byte */
116 bdnz 8b
117 blr
11890: mr r3,r4 /* label 90, fixup for 11: return r4, which is still the original size there */
119 blr
12091: mfctr r3 /* label 91, fixup for the word loop: return ctr * 4 + r4 */
121 slwi r3,r3,2
122 add r3,r3,r4
123 blr
12492: mfctr r3 /* label 92, fixup for the byte loop: return ctr */
125 blr
126
127 EX_TABLE(11b, 90b) /* presumably pairs the store at 11 with fixup 90 (EX_TABLE is defined outside this view) */
128 EX_TABLE(1b, 91b) /* word loop store -> fixup 91 */
129 EX_TABLE(8b, 92b) /* byte loop store -> fixup 92 */
130
131EXPORT_SYMBOL(__clear_user)
132#endif
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
new file mode 100644
index 000000000000..f69a6aab7bfb
--- /dev/null
+++ b/arch/powerpc/lib/string_32.S
@@ -0,0 +1,90 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2
3/*
4 * String handling functions for PowerPC32
5 *
6 * Copyright (C) 1996 Paul Mackerras.
7 *
8 */
9
10#include <asm/ppc_asm.h>
11#include <asm/export.h>
12#include <asm/cache.h>
13
14 .text
15
16CACHELINE_BYTES = L1_CACHE_BYTES /* step added to the store pointer after each dcbz in __clear_user */
17LG_CACHELINE_BYTES = L1_CACHE_SHIFT /* used as a shift count and to build clrlwi masks; presumably log2(CACHELINE_BYTES) */
18CACHELINE_MASK = (L1_CACHE_BYTES-1) /* mask for the offset within a cache line; assumes the line size is a power of two */
19
20_GLOBAL(__clear_user)
21/*
22 * Use dcbz on the complete cache lines in the destination
23 * to set them to zero. This requires that the destination
24 * area is cacheable.
 *
 * In:  r3 = start of the area, r4 = number of bytes to zero.
 * Out: r3 = 0 when every store completes; when a store faults, the
 *      fixups at labels 90/91 return a count of bytes left instead.
 *
 * Register use: r10 = saved start, r6 = store pointer (kept 4 below the
 * next word / 1 below the next byte because the loops use update-form
 * stores), r11 = end - r6, r7 = offset of r6 within its cache line,
 * r8 = end - base of that cache line, r9 = complete cache lines,
 * r0 = scratch, ctr = loop counter.
 *
 * NOTE(review): fixup 91 returns end - r6, but the faulting address is
 * r6 + 4 for stwu/dcbz and r6 + 1 for stbu, so the result looks 4 (or 1)
 * bytes higher than what is really left; a fault on the first stbu at
 * label 9 yields r4 + 1, i.e. more than the requested size. Confirm
 * whether this over-reporting is intended.
25 */
26 cmplwi cr0, r4, 4 /* unsigned compare of the size with 4; cr0 is consumed by blt and by the beqlr after label 11 */
27 mr r10, r3 /* keep the start address; r3 becomes the return value */
28 li r3, 0 /* r3 = 0: the value stored everywhere and the no-fault return value */
29 blt 7f /* fewer than 4 bytes: byte loop at label 7 */
30
3111: stw r3, 0(r10) /* label 11: zero the first word at the (possibly unaligned) start */
32 beqlr /* size == 4: done, r3 = 0 */
33 andi. r0, r10, 3 /* r0 = start & 3 */
34 add r11, r0, r4 /* r11 = size + (start & 3) = end - r6 */
35 subf r6, r0, r10 /* r6 = start rounded down to a word boundary; next store goes to r6 + 4 */
36
37 clrlwi r7, r6, 32 - LG_CACHELINE_BYTES /* r7 = offset of r6 within its cache line */
38 add r8, r7, r11 /* r8 = end - base of the cache line containing r6 */
39 srwi r9, r8, LG_CACHELINE_BYTES /* cache line boundaries between that base and the end */
40 addic. r9, r9, -1 /* total number of complete cachelines */
41 ble 2f /* no complete cache line: plain word/byte loops */
42 xori r0, r7, CACHELINE_MASK & ~3 /* bytes from r6 + 4 up to the next cache line boundary (r7 is a multiple of 4) */
43 srwi. r0, r0, 2 /* ... as a word count */
44 beq 3f /* r6 + 4 is already on a cache line boundary */
45 mtctr r0
464: stwu r3, 4(r6) /* label 4: zero words up to the cache line boundary */
47 bdnz 4b
483: mtctr r9 /* label 3: one iteration per complete cache line */
49 li r7, 4 /* dcbz index so that the effective address is r6 + 4 */
5010: dcbz r7, r6 /* label 10: zero the cache line at r6 + 4 */
51 addi r6, r6, CACHELINE_BYTES
52 bdnz 10b
53 clrlwi r11, r8, 32 - LG_CACHELINE_BYTES /* bytes past the last cache line boundary */
54 addi r11, r11, 4 /* back to the end - r6 convention (r6 is 4 below the next address) */
55
562: srwi r0 ,r11 ,2 /* label 2: r0 = (end - r6) / 4 */
57 mtctr r0
58 bdz 6f /* bdz decrements first, so r0 - 1 words get stored; none: go to the byte tail */
591: stwu r3, 4(r6) /* label 1: zero the remaining whole words */
60 bdnz 1b
616: andi. r11, r11, 3 /* label 6: trailing bytes */
62 beqlr /* none: done, r3 = 0 */
63 mtctr r11
64 addi r6, r6, 3 /* r6 was 4 below the next address; stbu 1(r6) needs it 1 below */
658: stbu r3, 1(r6) /* label 8: zero the trailing bytes */
66 bdnz 8b
67 blr
68
697: cmpwi cr0, r4, 0 /* label 7: size < 4 */
70 beqlr /* size == 0: nothing to do, r3 = 0 */
71 mtctr r4
72 addi r6, r10, -1 /* r6 = start - 1 for stbu 1(r6) */
739: stbu r3, 1(r6) /* label 9: zero one byte */
74 bdnz 9b
75 blr
76
7790: mr r3, r4 /* label 90, fixup for the first stw: return the whole size */
78 blr
7991: add r3, r10, r4 /* label 91, fixup for all the loops: r3 = start + size = end */
80 subf r3, r6, r3 /* r3 = end - r6 (see NOTE(review) in the header comment) */
81 blr
82
83 EX_TABLE(11b, 90b) /* presumably pairs the store at 11 with fixup 90 (EX_TABLE is defined outside this view) */
84 EX_TABLE(4b, 91b) /* word loop before the dcbz loop */
85 EX_TABLE(10b, 91b) /* dcbz loop */
86 EX_TABLE(1b, 91b) /* word loop after the dcbz loop */
87 EX_TABLE(8b, 91b) /* trailing byte loop */
88 EX_TABLE(9b, 91b) /* short (size < 4) byte loop */
89
90EXPORT_SYMBOL(__clear_user)