 -rw-r--r--  arch/x86_64/kernel/setup.c      |   6
 -rw-r--r--  arch/x86_64/lib/clear_page.S    |  38
 -rw-r--r--  arch/x86_64/lib/copy_page.S     |  87
 -rw-r--r--  arch/x86_64/lib/copy_user.S     | 247
 -rw-r--r--  arch/x86_64/lib/memcpy.S        |  93
 -rw-r--r--  arch/x86_64/lib/memset.S        |  94
 -rw-r--r--  include/asm-x86_64/cpufeature.h |   2
 7 files changed, 543 insertions(+), 24 deletions(-)
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 28895c03cb11..506f152c2389 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -877,6 +877,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 static int __init init_amd(struct cpuinfo_x86 *c)
 {
         int r;
+        unsigned level;
 
 #ifdef CONFIG_SMP
         unsigned long value;
@@ -899,6 +900,11 @@ static int __init init_amd(struct cpuinfo_x86 *c)
            3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
         clear_bit(0*32+31, &c->x86_capability);
 
+        /* On C+ stepping K8 rep microcode works well for copy/memset */
+        level = cpuid_eax(1);
+        if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
+                set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+
         r = get_model_name(c);
         if (!r) {
                 switch (c->x86) {
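The new check reads the raw CPUID leaf 1 signature: cpuid_eax(1) returns stepping in bits 3:0, model in bits 7:4 and base family in bits 11:8, so 0x0f48 is a family 0xF (K8) part at model 4, stepping 8, and the two bounds select the C-or-later steppings named in the comment. A minimal standalone sketch of the same comparison, assuming invented sample inputs and a made-up helper name (the bounds are the ones from the hunk above):

    #include <stdio.h>

    /* level is the raw CPUID.1:EAX signature, as returned by cpuid_eax(1). */
    static int k8_rep_is_good(unsigned level)
    {
            unsigned family = (level >> 8) & 0xf;   /* base family nibble */

            /* Family 0xF with signature 0x0f48..0x0f4f or >= 0x0f58,
               i.e. the C-or-later K8 steppings checked in the patch. */
            return family == 0xf &&
                   ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58);
    }

    int main(void)
    {
            printf("%d\n", k8_rep_is_good(0x0f48));  /* 1: rev C K8          */
            printf("%d\n", k8_rep_is_good(0x0f41));  /* 0: earlier stepping  */
            return 0;
    }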
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
index 43d9fa136180..1f81b79b796c 100644
--- a/arch/x86_64/lib/clear_page.S
+++ b/arch/x86_64/lib/clear_page.S
@@ -5,8 +5,46 @@
         .globl clear_page
         .p2align 4
 clear_page:
+        xorl %eax,%eax
+        movl $4096/64,%ecx
+        .p2align 4
+.Lloop:
+        decl %ecx
+#define PUT(x) movq %rax,x*8(%rdi)
+        movq %rax,(%rdi)
+        PUT(1)
+        PUT(2)
+        PUT(3)
+        PUT(4)
+        PUT(5)
+        PUT(6)
+        PUT(7)
+        leaq 64(%rdi),%rdi
+        jnz .Lloop
+        nop
+        ret
+clear_page_end:
+
+        /* Some CPUs run faster using the string instructions.
+           It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+        .section .altinstructions,"a"
+        .align 8
+        .quad clear_page
+        .quad clear_page_c
+        .byte X86_FEATURE_REP_GOOD
+        .byte clear_page_end-clear_page
+        .byte clear_page_c_end-clear_page_c
+        .previous
+
+        .section .altinstr_replacement,"ax"
+clear_page_c:
         movl $4096/8,%ecx
         xorl %eax,%eax
         rep
         stosq
         ret
+clear_page_c_end:
+        .previous
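Each record emitted into .altinstructions above packs two addresses and three byte-sized fields: original code, replacement code, the required CPU feature bit, and the two lengths. A self-contained sketch of how such a record is consumed at boot, assuming a simplified struct and a stubbed feature test; the real kernel walks these records in apply_alternatives(), and the layout and nop padding here are an approximation rather than the exact implementation:

    #include <string.h>

    /* Simplified model of one .altinstructions record from above. */
    struct alt_instr {
            unsigned char *instr;          /* .quad clear_page                    */
            unsigned char *replacement;    /* .quad clear_page_c                  */
            unsigned char  cpuid;          /* .byte X86_FEATURE_REP_GOOD          */
            unsigned char  instrlen;       /* .byte clear_page_end-clear_page     */
            unsigned char  replacementlen; /* .byte clear_page_c_end-clear_page_c */
    };

    static int cpu_has(unsigned char bit)  /* stand-in for the real feature test */
    {
            (void)bit;
            return 1;
    }

    /* If the CPU sets the feature bit, copy the short replacement over the
       start of the original body and pad the rest with nop (0x90). */
    static void patch_one(struct alt_instr *a)
    {
            if (!cpu_has(a->cpuid))
                    return;
            memcpy(a->instr, a->replacement, a->replacementlen);
            memset(a->instr + a->replacementlen, 0x90,
                   a->instrlen - a->replacementlen);
    }

    int main(void)
    {
            unsigned char orig[8] = { 0 };
            unsigned char repl[4] = { 1, 2, 3, 4 };
            struct alt_instr a = { orig, repl, 4, sizeof(orig), sizeof(repl) };

            patch_one(&a);                  /* orig is now 01 02 03 04 90 90 90 90 */
            return orig[4] == 0x90 ? 0 : 1;
    }

With the records above, clear_page keeps its unrolled body on CPUs without X86_FEATURE_REP_GOOD and gets the short rep stosq sequence patched over it on CPUs that set the bit.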
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index 621a19769406..8fa19d96a7ee 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -8,7 +8,94 @@
         .globl copy_page
         .p2align 4
 copy_page:
+        subq $3*8,%rsp
+        movq %rbx,(%rsp)
+        movq %r12,1*8(%rsp)
+        movq %r13,2*8(%rsp)
+
+        movl $(4096/64)-5,%ecx
+        .p2align 4
+.Loop64:
+        dec %rcx
+
+        movq (%rsi), %rax
+        movq 8 (%rsi), %rbx
+        movq 16 (%rsi), %rdx
+        movq 24 (%rsi), %r8
+        movq 32 (%rsi), %r9
+        movq 40 (%rsi), %r10
+        movq 48 (%rsi), %r11
+        movq 56 (%rsi), %r12
+
+        prefetcht0 5*64(%rsi)
+
+        movq %rax, (%rdi)
+        movq %rbx, 8 (%rdi)
+        movq %rdx, 16 (%rdi)
+        movq %r8, 24 (%rdi)
+        movq %r9, 32 (%rdi)
+        movq %r10, 40 (%rdi)
+        movq %r11, 48 (%rdi)
+        movq %r12, 56 (%rdi)
+
+        leaq 64 (%rsi), %rsi
+        leaq 64 (%rdi), %rdi
+
+        jnz .Loop64
+
+        movl $5,%ecx
+        .p2align 4
+.Loop2:
+        decl %ecx
+
+        movq (%rsi), %rax
+        movq 8 (%rsi), %rbx
+        movq 16 (%rsi), %rdx
+        movq 24 (%rsi), %r8
+        movq 32 (%rsi), %r9
+        movq 40 (%rsi), %r10
+        movq 48 (%rsi), %r11
+        movq 56 (%rsi), %r12
+
+        movq %rax, (%rdi)
+        movq %rbx, 8 (%rdi)
+        movq %rdx, 16 (%rdi)
+        movq %r8, 24 (%rdi)
+        movq %r9, 32 (%rdi)
+        movq %r10, 40 (%rdi)
+        movq %r11, 48 (%rdi)
+        movq %r12, 56 (%rdi)
+
+        leaq 64(%rdi),%rdi
+        leaq 64(%rsi),%rsi
+
+        jnz .Loop2
+
+        movq (%rsp),%rbx
+        movq 1*8(%rsp),%r12
+        movq 2*8(%rsp),%r13
+        addq $3*8,%rsp
+        ret
+
+        /* Some CPUs run faster using the string copy instructions.
+           It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+        .section .altinstructions,"a"
+        .align 8
+        .quad copy_page
+        .quad copy_page_c
+        .byte X86_FEATURE_REP_GOOD
+        .byte copy_page_c_end-copy_page_c
+        .byte copy_page_c_end-copy_page_c
+        .previous
+
+        .section .altinstr_replacement,"ax"
+copy_page_c:
         movl $4096/8,%ecx
         rep
         movsq
         ret
+copy_page_c_end:
+        .previous
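The unrolled copy above runs (4096/64)-5 = 59 iterations with prefetcht0 5*64(%rsi), i.e. five cache lines ahead of the current read position, and then five final iterations without prefetch so the code never prefetches past the end of the 4 KB source page. A rough C model of that loop structure, illustrative only; it uses the GCC __builtin_prefetch intrinsic to stand in for prefetcht0:

    #include <stdint.h>

    #define BLOCK_WORDS     8               /* eight 8-byte moves per block */
    #define BLOCKS          (4096 / 64)     /* 64 blocks per page           */
    #define PREFETCH_AHEAD  5               /* matches 5*64(%rsi) above     */

    static void copy_page_model(uint64_t *dst, const uint64_t *src)
    {
            int i, j;

            /* main loop: 59 blocks, prefetching five lines ahead */
            for (i = 0; i < BLOCKS - PREFETCH_AHEAD; i++) {
                    __builtin_prefetch(src + PREFETCH_AHEAD * BLOCK_WORDS);
                    for (j = 0; j < BLOCK_WORDS; j++)
                            dst[j] = src[j];
                    src += BLOCK_WORDS;
                    dst += BLOCK_WORDS;
            }
            /* trailer: last five blocks, no prefetch past the page */
            for (i = 0; i < PREFETCH_AHEAD; i++) {
                    for (j = 0; j < BLOCK_WORDS; j++)
                            dst[j] = src[j];
                    src += BLOCK_WORDS;
                    dst += BLOCK_WORDS;
            }
    }

    int main(void)
    {
            static uint64_t from[512], to[512];     /* one 4 KB page each */
            copy_page_model(to, from);
            return 0;
    }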
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index 79422b6559c3..f64569b83b54 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -4,9 +4,12 @@
  * Functions to copy from and to user space.
  */
 
+#define FIX_ALIGNMENT 1
+
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
+#include <asm/cpufeature.h>
 
 /* Standard copy_to_user with segment limit checking */
         .globl copy_to_user
@@ -18,7 +21,23 @@ copy_to_user:
         jc bad_to_user
         cmpq threadinfo_addr_limit(%rax),%rcx
         jae bad_to_user
-        jmp copy_user_generic
+2:
+        .byte 0xe9      /* 32bit jump */
+        .long .Lcug-1f
+1:
+
+        .section .altinstr_replacement,"ax"
+3:      .byte 0xe9                      /* replacement jmp with 8 bit immediate */
+        .long copy_user_generic_c-1b    /* offset */
+        .previous
+        .section .altinstructions,"a"
+        .align 8
+        .quad 2b
+        .quad 3b
+        .byte X86_FEATURE_REP_GOOD
+        .byte 5
+        .byte 5
+        .previous
 
 /* Standard copy_from_user with segment limit checking */
         .globl copy_from_user
@@ -53,44 +72,230 @@ bad_to_user:
  * rsi source
  * rdx count
  *
- * Only 4GB of copy is supported. This shouldn't be a problem
- * because the kernel normally only writes from/to page sized chunks
- * even if user space passed a longer buffer.
- * And more would be dangerous because both Intel and AMD have
- * errata with rep movsq > 4GB. If someone feels the need to fix
- * this please consider this.
- *
  * Output:
  * eax uncopied bytes or 0 if successful.
  */
-
         .globl copy_user_generic
+        .p2align 4
 copy_user_generic:
+        .byte 0x66,0x66,0x90    /* 5 byte nop for replacement jump */
+        .byte 0x66,0x90
+1:
+        .section .altinstr_replacement,"ax"
+2:      .byte 0xe9      /* near jump with 32bit immediate */
+        .long copy_user_generic_c-1b    /* offset */
+        .previous
+        .section .altinstructions,"a"
+        .align 8
+        .quad copy_user_generic
+        .quad 2b
+        .byte X86_FEATURE_REP_GOOD
+        .byte 5
+        .byte 5
+        .previous
+.Lcug:
+        pushq %rbx
+        xorl %eax,%eax          /* zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+        /* check for bad alignment of destination */
+        movl %edi,%ecx
+        andl $7,%ecx
+        jnz .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+        movq %rdx,%rcx
+
+        movl $64,%ebx
+        shrq $6,%rdx
+        decq %rdx
+        js .Lhandle_tail
+
+        .p2align 4
+.Lloop:
+.Ls1:   movq (%rsi),%r11
+.Ls2:   movq 1*8(%rsi),%r8
+.Ls3:   movq 2*8(%rsi),%r9
+.Ls4:   movq 3*8(%rsi),%r10
+.Ld1:   movq %r11,(%rdi)
+.Ld2:   movq %r8,1*8(%rdi)
+.Ld3:   movq %r9,2*8(%rdi)
+.Ld4:   movq %r10,3*8(%rdi)
+
+.Ls5:   movq 4*8(%rsi),%r11
+.Ls6:   movq 5*8(%rsi),%r8
+.Ls7:   movq 6*8(%rsi),%r9
+.Ls8:   movq 7*8(%rsi),%r10
+.Ld5:   movq %r11,4*8(%rdi)
+.Ld6:   movq %r8,5*8(%rdi)
+.Ld7:   movq %r9,6*8(%rdi)
+.Ld8:   movq %r10,7*8(%rdi)
+
+        decq %rdx
+
+        leaq 64(%rsi),%rsi
+        leaq 64(%rdi),%rdi
+
+        jns .Lloop
+
+        .p2align 4
+.Lhandle_tail:
+        movl %ecx,%edx
+        andl $63,%ecx
+        shrl $3,%ecx
+        jz .Lhandle_7
+        movl $8,%ebx
+        .p2align 4
+.Lloop_8:
+.Ls9:   movq (%rsi),%r8
+.Ld9:   movq %r8,(%rdi)
+        decl %ecx
+        leaq 8(%rdi),%rdi
+        leaq 8(%rsi),%rsi
+        jnz .Lloop_8
+
+.Lhandle_7:
+        movl %edx,%ecx
+        andl $7,%ecx
+        jz .Lende
+        .p2align 4
+.Lloop_1:
+.Ls10:  movb (%rsi),%bl
+.Ld10:  movb %bl,(%rdi)
+        incq %rdi
+        incq %rsi
+        decl %ecx
+        jnz .Lloop_1
+
+.Lende:
+        popq %rbx
+        ret
+
+#ifdef FIX_ALIGNMENT
+        /* align destination */
+        .p2align 4
+.Lbad_alignment:
+        movl $8,%r9d
+        subl %ecx,%r9d
+        movl %r9d,%ecx
+        cmpq %r9,%rdx
+        jz .Lhandle_7
+        js .Lhandle_7
+.Lalign_1:
+.Ls11:  movb (%rsi),%bl
+.Ld11:  movb %bl,(%rdi)
+        incq %rsi
+        incq %rdi
+        decl %ecx
+        jnz .Lalign_1
+        subq %r9,%rdx
+        jmp .Lafter_bad_alignment
+#endif
+
+        /* table sorted by exception address */
+        .section __ex_table,"a"
+        .align 8
+        .quad .Ls1,.Ls1e
+        .quad .Ls2,.Ls2e
+        .quad .Ls3,.Ls3e
+        .quad .Ls4,.Ls4e
+        .quad .Ld1,.Ls1e
+        .quad .Ld2,.Ls2e
+        .quad .Ld3,.Ls3e
+        .quad .Ld4,.Ls4e
+        .quad .Ls5,.Ls5e
+        .quad .Ls6,.Ls6e
+        .quad .Ls7,.Ls7e
+        .quad .Ls8,.Ls8e
+        .quad .Ld5,.Ls5e
+        .quad .Ld6,.Ls6e
+        .quad .Ld7,.Ls7e
+        .quad .Ld8,.Ls8e
+        .quad .Ls9,.Le_quad
+        .quad .Ld9,.Le_quad
+        .quad .Ls10,.Le_byte
+        .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+        .quad .Ls11,.Lzero_rest
+        .quad .Ld11,.Lzero_rest
+#endif
+        .quad .Le5,.Le_zero
+        .previous
+
+        /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+           pessimistic side. this is gross. it would be better to fix the
+           interface. */
+        /* eax: zero, ebx: 64 */
+.Ls1e:  addl $8,%eax
+.Ls2e:  addl $8,%eax
+.Ls3e:  addl $8,%eax
+.Ls4e:  addl $8,%eax
+.Ls5e:  addl $8,%eax
+.Ls6e:  addl $8,%eax
+.Ls7e:  addl $8,%eax
+.Ls8e:  addl $8,%eax
+        addq %rbx,%rdi          /* +64 */
+        subq %rax,%rdi          /* correct destination with computed offset */
+
+        shlq $6,%rdx            /* loop counter * 64 (stride length) */
+        addq %rax,%rdx          /* add offset to loopcnt */
+        andl $63,%ecx           /* remaining bytes */
+        addq %rcx,%rdx          /* add them */
+        jmp .Lzero_rest
+
+        /* exception on quad word loop in tail handling */
+        /* ecx: loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+        shll $3,%ecx
+        andl $7,%edx
+        addl %ecx,%edx
+        /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+        movq %rdx,%rcx
+.Le_byte:
+        xorl %eax,%eax
+.Le5:   rep
+        stosb
+        /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+        movq %rdx,%rax
+        jmp .Lende
+
+        /* Some CPUs run faster using the string copy instructions.
+           This is also a lot simpler. Use them when possible.
+           Patch in jmps to this code instead of copying it fully
+           to avoid unwanted aliasing in the exception tables. */
+
+ /* rdi destination
+  * rsi source
+  * rdx count
+  *
+  * Output:
+  * eax uncopied bytes or 0 if successfull.
+  *
+  * Only 4GB of copy is supported. This shouldn't be a problem
+  * because the kernel normally only writes from/to page sized chunks
+  * even if user space passed a longer buffer.
+  * And more would be dangerous because both Intel and AMD have
+  * errata with rep movsq > 4GB. If someone feels the need to fix
+  * this please consider this.
+  */
+copy_user_generic_c:
         movl %edx,%ecx
         shrl $3,%ecx
         andl $7,%edx
-        jz 5f
 1:      rep
         movsq
         movl %edx,%ecx
-        xor %eax,%eax
 2:      rep
         movsb
+4:      movl %ecx,%eax
         ret
-        /* align here? */
-5:      xorl %eax,%eax
-6:      rep movsq
-        ret
-
-        .section .fixup,"ax"
 3:      lea (%rdx,%rcx,8),%rax
         ret
-4:      movl %ecx,%eax
-        ret
-        .previous
 
         .section __ex_table,"a"
         .quad 1b,3b
         .quad 2b,4b
-        .quad 6b,4b
         .previous
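The rep-based copy_user_generic_c leans entirely on the exception table at the end: a fault in the rep movsq at label 1 is redirected to label 3, whose lea (%rdx,%rcx,8),%rax reports the uncopied byte count, since rep movsq counts %rcx down as it copies and %edx still holds the byte remainder split off at entry. A small worked example of that arithmetic, with an invented fault point:

    #include <stdio.h>

    int main(void)
    {
            unsigned long count = 100;              /* bytes requested          */
            unsigned long rcx = count >> 3;         /* 12 quadwords to rep movsq */
            unsigned long rdx = count & 7;          /* 4 trailing bytes          */

            rcx -= 5;                               /* suppose 5 quadwords were
                                                       copied before the fault   */
            printf("%lu\n", rdx + rcx * 8);         /* lea (%rdx,%rcx,8): 60
                                                       uncopied bytes in %rax    */
            return 0;
    }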
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index 92dd80544602..5554948b5554 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,8 +11,6 @@
  *
  * Output:
  * rax original destination
- *
- * TODO: check best memcpy for PSC
  */
 
         .globl __memcpy
@@ -20,6 +18,95 @@
         .p2align 4
 __memcpy:
 memcpy:
+        pushq %rbx
+        movq %rdi,%rax
+
+        movl %edx,%ecx
+        shrl $6,%ecx
+        jz .Lhandle_tail
+
+        .p2align 4
+.Lloop_64:
+        decl %ecx
+
+        movq (%rsi),%r11
+        movq 8(%rsi),%r8
+
+        movq %r11,(%rdi)
+        movq %r8,1*8(%rdi)
+
+        movq 2*8(%rsi),%r9
+        movq 3*8(%rsi),%r10
+
+        movq %r9,2*8(%rdi)
+        movq %r10,3*8(%rdi)
+
+        movq 4*8(%rsi),%r11
+        movq 5*8(%rsi),%r8
+
+        movq %r11,4*8(%rdi)
+        movq %r8,5*8(%rdi)
+
+        movq 6*8(%rsi),%r9
+        movq 7*8(%rsi),%r10
+
+        movq %r9,6*8(%rdi)
+        movq %r10,7*8(%rdi)
+
+        leaq 64(%rsi),%rsi
+        leaq 64(%rdi),%rdi
+        jnz .Lloop_64
+
+.Lhandle_tail:
+        movl %edx,%ecx
+        andl $63,%ecx
+        shrl $3,%ecx
+        jz .Lhandle_7
+        .p2align 4
+.Lloop_8:
+        decl %ecx
+        movq (%rsi),%r8
+        movq %r8,(%rdi)
+        leaq 8(%rdi),%rdi
+        leaq 8(%rsi),%rsi
+        jnz .Lloop_8
+
+.Lhandle_7:
+        movl %edx,%ecx
+        andl $7,%ecx
+        jz .Lende
+        .p2align 4
+.Lloop_1:
+        movb (%rsi),%r8b
+        movb %r8b,(%rdi)
+        incq %rdi
+        incq %rsi
+        decl %ecx
+        jnz .Lloop_1
+
+.Lende:
+        popq %rbx
+        ret
+.Lfinal:
+
+        /* Some CPUs run faster using the string copy instructions.
+           It is also a lot simpler. Use this when possible */
+
+        .section .altinstructions,"a"
+        .align 8
+        .quad memcpy
+        .quad memcpy_c
+        .byte X86_FEATURE_REP_GOOD
+        .byte .Lfinal-memcpy
+        .byte memcpy_c_end-memcpy_c
+        .previous
+
+        .section .altinstr_replacement,"ax"
+ /* rdi destination
+  * rsi source
+  * rdx count
+  */
+memcpy_c:
         movq %rdi,%rax
         movl %edx,%ecx
         shrl $3,%ecx
@@ -30,3 +117,5 @@ memcpy:
         rep
         movsb
         ret
+memcpy_c_end:
+        .previous
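The unrolled memcpy above splits the count into three stages: 64-byte blocks (shrl $6), 8-byte words from the remainder (andl $63 then shrl $3), and single trailing bytes (andl $7). The decomposition for an arbitrary example count, illustration only:

    #include <stdio.h>

    int main(void)
    {
            unsigned count  = 200;                  /* total byte count        */
            unsigned blocks = count >> 6;           /* 3 * 64 = 192 bytes      */
            unsigned words  = (count & 63) >> 3;    /* 1 * 8  =   8 bytes      */
            unsigned bytes  = count & 7;            /* 0 bytes left over       */

            printf("%u blocks, %u words, %u bytes\n", blocks, words, bytes);
            return 0;
    }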
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 2aa48f24ed1e..ad397f2c7de8 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,6 +13,98 @@
         .p2align 4
 memset:
 __memset:
+        movq %rdi,%r10
+        movq %rdx,%r11
+
+        /* expand byte value */
+        movzbl %sil,%ecx
+        movabs $0x0101010101010101,%rax
+        mul %rcx                /* with rax, clobbers rdx */
+
+        /* align dst */
+        movl %edi,%r9d
+        andl $7,%r9d
+        jnz .Lbad_alignment
+.Lafter_bad_alignment:
+
+        movl %r11d,%ecx
+        shrl $6,%ecx
+        jz .Lhandle_tail
+
+        .p2align 4
+.Lloop_64:
+        decl %ecx
+        movq %rax,(%rdi)
+        movq %rax,8(%rdi)
+        movq %rax,16(%rdi)
+        movq %rax,24(%rdi)
+        movq %rax,32(%rdi)
+        movq %rax,40(%rdi)
+        movq %rax,48(%rdi)
+        movq %rax,56(%rdi)
+        leaq 64(%rdi),%rdi
+        jnz .Lloop_64
+
+        /* Handle tail in loops. The loops should be faster than hard
+           to predict jump tables. */
+        .p2align 4
+.Lhandle_tail:
+        movl %r11d,%ecx
+        andl $63&(~7),%ecx
+        jz .Lhandle_7
+        shrl $3,%ecx
+        .p2align 4
+.Lloop_8:
+        decl %ecx
+        movq %rax,(%rdi)
+        leaq 8(%rdi),%rdi
+        jnz .Lloop_8
+
+.Lhandle_7:
+        movl %r11d,%ecx
+        andl $7,%ecx
+        jz .Lende
+        .p2align 4
+.Lloop_1:
+        decl %ecx
+        movb %al,(%rdi)
+        leaq 1(%rdi),%rdi
+        jnz .Lloop_1
+
+.Lende:
+        movq %r10,%rax
+        ret
+
+.Lbad_alignment:
+        cmpq $7,%r11
+        jbe .Lhandle_7
+        movq %rax,(%rdi)        /* unaligned store */
+        movq $8,%r8
+        subq %r9,%r8
+        addq %r8,%rdi
+        subq %r8,%r11
+        jmp .Lafter_bad_alignment
+
+        /* Some CPUs run faster using the string instructions.
+           It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+        .section .altinstructions,"a"
+        .align 8
+        .quad memset
+        .quad memset_c
+        .byte X86_FEATURE_REP_GOOD
+        .byte memset_c_end-memset_c
+        .byte memset_c_end-memset_c
+        .previous
+
+        .section .altinstr_replacement,"ax"
+ /* rdi destination
+  * rsi value
+  * rdx count
+  */
+memset_c:
         movq %rdi,%r9
         movl %edx,%r8d
         andl $7,%r8d
@@ -29,3 +121,5 @@ __memset:
         stosb
         movq %r9,%rax
         ret
+memset_c_end:
+        .previous
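The byte expansion at the top of the new memset body relies on the fact that multiplying a byte value by 0x0101010101010101 replicates it into all eight byte lanes of the 64-bit store pattern, which is what the movzbl %sil,%ecx; movabs $0x0101010101010101,%rax; mul %rcx sequence computes. A one-line check of that identity, illustrative only:

    #include <stdio.h>

    int main(void)
    {
            unsigned char fill = 0xab;
            unsigned long long pattern = 0x0101010101010101ULL * fill;

            printf("%016llx\n", pattern);   /* prints abababababababab */
            return 0;
    }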
diff --git a/include/asm-x86_64/cpufeature.h b/include/asm-x86_64/cpufeature.h
index 41c0ac8559be..76bb6193ae91 100644
--- a/include/asm-x86_64/cpufeature.h
+++ b/include/asm-x86_64/cpufeature.h
@@ -61,7 +61,7 @@
 #define X86_FEATURE_K6_MTRR     (3*32+ 1) /* AMD K6 nonstandard MTRRs */
 #define X86_FEATURE_CYRIX_ARR   (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
 #define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */
-/* 4 free */
+#define X86_FEATURE_REP_GOOD    (3*32+ 4) /* rep microcode works well on this CPU */
 #define X86_FEATURE_CONSTANT_TSC (3*32+5) /* TSC runs at constant rate */
 #define X86_FEATURE_SYNC_RDTSC  (3*32+6)  /* RDTSC syncs CPU core */
 
