diff options
Diffstat (limited to 'arch/x86/lib/memset_64.S')
-rw-r--r-- | arch/x86/lib/memset_64.S | 133 |
1 files changed, 133 insertions, 0 deletions
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S new file mode 100644 index 000000000000..2c5948116bd2 --- /dev/null +++ b/arch/x86/lib/memset_64.S | |||
@@ -0,0 +1,133 @@ | |||
1 | /* Copyright 2002 Andi Kleen, SuSE Labs */ | ||
2 | |||
3 | #include <linux/linkage.h> | ||
4 | #include <asm/dwarf2.h> | ||
5 | |||
6 | /* | ||
7 | * ISO C memset - set a memory block to a byte value. | ||
8 | * | ||
9 | * rdi destination | ||
10 | * rsi value (char) | ||
11 | * rdx count (bytes) | ||
12 | * | ||
13 | * rax original destination | ||
14 | */ | ||
/*
 * memset_c - memset variant built on the string-store instructions.
 *
 * In:    rdi = destination, sil = fill byte, rdx = count in bytes
 * Out:   rax = original destination
 * Clobb: rcx, rdx, rsi, rdi, r8, r9, flags
 *
 * The qword count is computed with 64-bit arithmetic so that counts of
 * 4GiB and above are not truncated to their low 32 bits.
 */
	ALIGN
memset_c:
	CFI_STARTPROC
	movq	%rdi,%r9		/* keep dst for the return value */
	movl	%edx,%r8d
	andl	$7,%r8d			/* r8d = count % 8, only low bits needed */
	movq	%rdx,%rcx
	shrq	$3,%rcx			/* rcx = count / 8, full 64-bit count */
	/* expand byte value */
	movzbl	%sil,%esi
	movabs	$0x0101010101010101,%rax
	mulq	%rsi			/* with rax, clobbers rdx */
	rep stosq			/* store rcx qwords of the pattern */
	movl	%r8d,%ecx
	rep stosb			/* store the 0..7 trailing bytes */
	movq	%r9,%rax
	ret
	CFI_ENDPROC
ENDPROC(memset_c)
34 | |||
/*
 * memset / __memset - open-coded store loop version.
 *
 * In:    rdi = destination, sil = fill byte, rdx = count in bytes
 * Out:   rax = original destination
 * Clobb: rcx, rdx, rdi, r8, r9, r10, r11, flags
 *
 * Register roles inside the function:
 *   rax = fill byte replicated into all 8 bytes
 *   r10 = original destination (return value)
 *   r11 = remaining byte count (rdx is clobbered by the mul)
 *   r9d = destination & 7 (misalignment)
 *
 * The 64-byte block count is kept in the full rcx so that counts of
 * 4GiB and above are not truncated to their low 32 bits.
 */
ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq	%rdi,%r10
	movq	%rdx,%r11

	/* expand byte value */
	movzbl	%sil,%ecx
	movabs	$0x0101010101010101,%rax
	mul	%rcx			/* with rax, clobbers rdx */

	/* align dst */
	movl	%edi,%r9d
	andl	$7,%r9d
	jnz	.Lbad_alignment
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	movq	%r11,%rcx
	shrq	$6,%rcx			/* rcx = number of 64-byte blocks */
	jz	.Lhandle_tail

	.p2align 4
.Lloop_64:
	decq	%rcx			/* sets ZF for the jnz; mov/lea keep flags */
	movq	%rax,(%rdi)
	movq	%rax,8(%rdi)
	movq	%rax,16(%rdi)
	movq	%rax,24(%rdi)
	movq	%rax,32(%rdi)
	movq	%rax,40(%rdi)
	movq	%rax,48(%rdi)
	movq	%rax,56(%rdi)
	leaq	64(%rdi),%rdi
	jnz	.Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl	%r11d,%ecx
	andl	$63&(~7),%ecx		/* bytes left in whole qwords (0..56) */
	jz	.Lhandle_7
	shrl	$3,%ecx			/* ecx = number of qwords */
	.p2align 4
.Lloop_8:
	decl	%ecx
	movq	%rax,(%rdi)
	leaq	8(%rdi),%rdi
	jnz	.Lloop_8

.Lhandle_7:
	movl	%r11d,%ecx
	andl	$7,%ecx			/* ecx = trailing bytes (0..7) */
	jz	.Lende
	.p2align 4
.Lloop_1:
	decl	%ecx
	movb	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz	.Lloop_1

.Lende:
	movq	%r10,%rax
	ret

	CFI_RESTORE_STATE
.Lbad_alignment:
	cmpq	$7,%r11
	jbe	.Lhandle_7		/* unsigned: 7 bytes or fewer, byte loop only */
	movq	%rax,(%rdi)		/* unaligned store */
	movq	$8,%r8
	subq	%r9,%r8			/* r8 = bytes up to the next 8-byte boundary */
	addq	%r8,%rdi
	subq	%r8,%r11
	jmp	.Lafter_bad_alignment
.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)
115 | |||
116 | /* Some CPUs run faster using the string instructions. | ||
117 | It is also a lot simpler. Use this when possible */ | ||
118 | |||
119 | #include <asm/cpufeature.h> | ||
120 | |||
121 | .section .altinstr_replacement,"ax" | ||
122 | 1: .byte 0xeb /* jmp <disp8> */ | ||
123 | .byte (memset_c - memset) - (2f - 1b) /* offset */ | ||
124 | 2: | ||
125 | .previous | ||
126 | .section .altinstructions,"a" | ||
127 | .align 8 | ||
128 | .quad memset | ||
129 | .quad 1b | ||
130 | .byte X86_FEATURE_REP_GOOD | ||
131 | .byte .Lfinal - memset | ||
132 | .byte 2b - 1b | ||
133 | .previous | ||