-rw-r--r--  arch/x86/lib/memcpy_32.c |   6
-rw-r--r--  arch/x86/lib/memcpy_64.S | 158
2 files changed, 105 insertions(+), 59 deletions(-)
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index be424dfcf365..81130d477ee2 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -36,11 +36,9 @@ void *memmove(void *dest, const void *src, size_t n)
 		 "1" (src),
 		 "2" (dest)
 		 :"memory");
-
 	} else {
-
-	if((src + count) < dest)
-		return memcpy(dest, src, count);
+	if((src + n) < dest)
+		return memcpy(dest, src, n);
 	else
 		__asm__ __volatile__(
 			"std\n\t"
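
The hunk above makes the overlap check in the 32-bit memmove() use the
function's actual size parameter, n (visible in the hunk header), instead of
count, and drops two stray blank lines. As a side note, the decision being
made can be modeled in plain C. This is a minimal illustrative sketch
(memmove_sketch is a hypothetical name; the kernel's backward path really
uses "std" plus reverse string-move inline assembly, not a byte loop):

#include <stddef.h>
#include <string.h>

void *memmove_sketch(void *dest, const void *src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	/* If dest is below src, or lies entirely above the source range,
	 * a forward copy can never clobber bytes not yet read. */
	if (d < s || s + n < (const unsigned char *)d)
		return memcpy(dest, src, n);

	/* Overlapping with dest above src: copy backward so each byte is
	 * read before it is overwritten (hypothetical byte loop standing
	 * in for the kernel's std/rep sequence). */
	while (n--)
		d[n] = s[n];
	return dest;
}
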
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d5..75ef61e35e38 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax
 
 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * Use 32bit CMP here to avoid long NOP padding.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl $6, %ecx
-	jz .Lhandle_tail
+	cmp $0x20, %edx
+	jb .Lhandle_tail
 
-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * We check whether memory false dependece could occur,
+	 * then jump to corresponding copy mode.
 	 */
-	decl %ecx
+	cmp %dil, %sil
+	jl .Lcopy_backward
+	subl $0x20, %edx
+.Lcopy_forward_loop:
+	subq $0x20, %rdx
 
 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Move in blocks of 4x8 bytes:
 	 */
-	movq 0*8(%rsi), %r11
-	movq 1*8(%rsi), %r8
-	movq %r11, 0*8(%rdi)
-	movq %r8, 1*8(%rdi)
-
-	movq 2*8(%rsi), %r9
-	movq 3*8(%rsi), %r10
-	movq %r9, 2*8(%rdi)
-	movq %r10, 3*8(%rdi)
-
-	movq 4*8(%rsi), %r11
-	movq 5*8(%rsi), %r8
-	movq %r11, 4*8(%rdi)
-	movq %r8, 5*8(%rdi)
-
-	movq 6*8(%rsi), %r9
-	movq 7*8(%rsi), %r10
-	movq %r9, 6*8(%rdi)
-	movq %r10, 7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz .Lloop_64
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	leaq 4*8(%rsi), %rsi
+
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, 2*8(%rdi)
+	movq %r11, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae .Lcopy_forward_loop
+	addq $0x20, %rdx
+	jmp .Lhandle_tail
+
+.Lcopy_backward:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * At most 3 ALU operations in one cycle,
+	 * so append NOPS in the same 16bytes trunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r8
+	movq -2*8(%rsi), %r9
+	movq -3*8(%rsi), %r10
+	movq -4*8(%rsi), %r11
+	leaq -4*8(%rsi), %rsi
+	movq %r8, -1*8(%rdi)
+	movq %r9, -2*8(%rdi)
+	movq %r10, -3*8(%rdi)
+	movq %r11, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae .Lcopy_backward_loop
 
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
 .Lhandle_tail:
-	movl %edx, %ecx
-	andl $63, %ecx
-	shrl $3, %ecx
-	jz .Lhandle_7
+	cmpq $16, %rdx
+	jb .Lless_16bytes
 
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq -2*8(%rsi, %rdx), %r10
+	movq -1*8(%rsi, %rdx), %r11
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, -2*8(%rdi, %rdx)
+	movq %r11, -1*8(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi), %r8
-	movq %r8, (%rdi)
-	leaq 8(%rdi), %rdi
-	leaq 8(%rsi), %rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_16bytes:
+	cmpq $8, %rdx
+	jb .Lless_8bytes
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq -1*8(%rsi, %rdx), %r9
+	movq %r8, 0*8(%rdi)
+	movq %r9, -1*8(%rdi, %rdx)
+	retq
+	.p2align 4
+.Lless_8bytes:
+	cmpq $4, %rdx
+	jb .Lless_3bytes
 
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
+.Lless_3bytes:
+	cmpl $0, %edx
+	je .Lend
+	/*
+	 * Move data from 1 bytes to 3 bytes.
+	 */
 .Lloop_1:
 	movb (%rsi), %r8b
 	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
-	decl %ecx
+	decl %edx
 	jnz .Lloop_1
 
 .Lend:
-	ret
+	retq
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
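
Taken together, the new 64-bit routine replaces the old decrement-counted
64-byte loop with: an early exit to tail handling for copies under 32 bytes,
a forward-versus-backward choice driven by a one-byte pointer comparison
(cmp %dil, %sil) meant to dodge the false store-to-load dependences named in
the in-code comment, 4x8-byte unrolled block copies, and branchy tail paths
that finish any 4..31-byte remainder with a pair of overlapping head and
tail moves instead of a loop. The control flow can be modeled in C. This is
an illustrative sketch only (memcpy_model, ld64 and st64 are invented names,
and the low-byte compare is the same rough heuristic the asm uses), not the
kernel implementation:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Unaligned 8-byte load/store helpers (hypothetical). */
static uint64_t ld64(const unsigned char *p)
{
	uint64_t v;
	memcpy(&v, p, 8);
	return v;
}

static void st64(unsigned char *p, uint64_t v)
{
	memcpy(p, &v, 8);
}

void *memcpy_model(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (n >= 0x20) {
		/*
		 * Mirror of "cmp %dil, %sil; jl .Lcopy_backward": compare
		 * only the pointers' low bytes as a cheap predictor of
		 * whether forward stores would falsely alias the next
		 * iteration's loads; if so, copy the blocks backward.
		 */
		if ((int8_t)(uintptr_t)s < (int8_t)(uintptr_t)d) {
			size_t k = n;
			while (k >= 0x20) {	/* backward 4x8-byte blocks */
				k -= 0x20;
				st64(d + k + 24, ld64(s + k + 24));
				st64(d + k + 16, ld64(s + k + 16));
				st64(d + k + 8,  ld64(s + k + 8));
				st64(d + k,      ld64(s + k));
			}
			n = k;			/* head remainder */
		} else {
			while (n >= 0x20) {	/* forward 4x8-byte blocks */
				st64(d,      ld64(s));
				st64(d + 8,  ld64(s + 8));
				st64(d + 16, ld64(s + 16));
				st64(d + 24, ld64(s + 24));
				s += 0x20;
				d += 0x20;
				n -= 0x20;
			}
		}
	}

	/* Tail: overlapping head+tail moves, mirroring .Lhandle_tail. */
	if (n >= 16) {				/* 16..31 bytes */
		uint64_t a = ld64(s),          b = ld64(s + 8);
		uint64_t c = ld64(s + n - 16), e = ld64(s + n - 8);
		st64(d, a);
		st64(d + 8, b);
		st64(d + n - 16, c);
		st64(d + n - 8, e);
	} else if (n >= 8) {			/* 8..15 bytes */
		uint64_t a = ld64(s), b = ld64(s + n - 8);
		st64(d, a);
		st64(d + n - 8, b);
	} else if (n >= 4) {			/* 4..7 bytes */
		uint32_t a, b;
		memcpy(&a, s, 4);
		memcpy(&b, s + n - 4, 4);
		memcpy(d, &a, 4);
		memcpy(d + n - 4, &b, 4);
	} else {				/* 0..3 bytes */
		while (n--)
			*d++ = *s++;
	}
	return dst;
}

The overlapping-tail trick is why no copy loop is needed above 3 bytes: for
any length in [16, 31], for example, the first and last 16-byte moves
together cover the whole range, harmlessly rewriting a few bytes in the
middle.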
