/* Copyright 2002 Andi Kleen */
#include <asm/cpufeature.h>
/*
* memcpy - Copy a memory block.
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* rax original destination
*/
.globl __memcpy
.globl memcpy
.p2align 4
__memcpy:
memcpy:
pushq %rbx
movq %rdi,%rax
movl %edx,%ecx
shrl $6,%ecx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
decl %ecx
movq (%rsi),%r11
movq 8(%rsi),%r8
movq %r11,(%rdi)
movq %r8,1*8(%rdi)
movq 2*8(%rsi),%r9
movq 3*8(%rsi),%r10
movq %r9,2*8(%rdi)
movq %r10,3*8(%rdi)
movq 4*8(%rsi),%r11
movq 5*8(%rsi),%r8
movq %r11,4*8(%rdi)
movq %r8,5*8(%rdi)
movq 6*8(%rsi),%r9
movq 7*8(%rsi),%r10
movq %r9,6*8(%rdi)
movq %r10,7*8(%rdi)
leaq 64(%rsi),%rsi
leaq 64(%rdi),%rdi
jnz .Lloop_64
.Lhandle_tail:
movl %edx,%ecx
andl $63,%ecx
shrl $3,%ecx
jz .Lhandle_7
.p2align 4
.Lloop_8:
decl %ecx
movq (%rsi),%r8
movq %r8,(%rdi)
leaq 8(%rdi),%rdi
leaq 8(%rsi),%rsi
jnz .Lloop_8
.Lhandle_7:
movl %edx,%ecx
andl $7,%ecx
jz .Lende
.p2align 4
.Lloop_1:
movb (%rsi),%r8b
movb %r8b,(%rdi)
incq %rdi
incq %rsi
decl %ecx
jnz .Lloop_1
.Lende:
popq %rbx
ret
.Lfinal:
/* Some CPUs run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */
.section .altinstructions,"a"
.align 8
.quad memcpy
.quad memcpy_c
.byte X86_FEATURE_REP_GOOD
.byte .Lfinal-memcpy
.byte memcpy_c_end-memcpy_c
.previous
.section .altinstr_replacement,"ax"
/* rdi destination
* rsi source
* rdx count
*/
memcpy_c:
movq %rdi,%rax
movl %edx,%ecx
shrl $3,%ecx
andl $7,%edx
rep
movsq
movl %edx,%ecx
rep
movsb
ret
memcpy_c_end:
.previous