Diffstat (limited to 'arch/x86/lib')

-rw-r--r--  arch/x86/lib/Makefile            |   1
-rw-r--r--  arch/x86/lib/atomic64_386_32.S   |   6
-rw-r--r--  arch/x86/lib/atomic64_cx8_32.S   |   6
-rw-r--r--  arch/x86/lib/checksum_32.S       |  63
-rw-r--r--  arch/x86/lib/clear_page_64.S     |  33
-rw-r--r--  arch/x86/lib/cmpxchg16b_emu.S    |  65
-rw-r--r--  arch/x86/lib/copy_user_64.S      |  71
-rw-r--r--  arch/x86/lib/csum-copy_64.S      | 242
-rw-r--r--  arch/x86/lib/csum-partial_64.c   |   2
-rw-r--r--  arch/x86/lib/delay.c             |   2
-rw-r--r--  arch/x86/lib/memcpy_32.c         | 199
-rw-r--r--  arch/x86/lib/memcpy_64.S         | 203
-rw-r--r--  arch/x86/lib/memmove_64.S        | 224
-rw-r--r--  arch/x86/lib/memmove_64.c        |  21
-rw-r--r--  arch/x86/lib/memset_64.S         |  54
-rw-r--r--  arch/x86/lib/rwsem_64.S          |  56
-rw-r--r--  arch/x86/lib/semaphore_32.S      |  38
-rw-r--r--  arch/x86/lib/thunk_32.S          |  18
-rw-r--r--  arch/x86/lib/thunk_64.S          |  27

19 files changed, 926 insertions, 405 deletions
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index e10cf070ede0..f2479f19ddde 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -42,4 +42,5 @@ else
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
         lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
+        lib-y += cmpxchg16b_emu.o
 endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 2cda60a06e65..e8e7e0d06f42 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -15,14 +15,12 @@
 
 /* if you want SMP support, implement these with real spinlocks */
 .macro LOCK reg
-	pushfl
-	CFI_ADJUST_CFA_OFFSET 4
+	pushfl_cfi
 	cli
 .endm
 
 .macro UNLOCK reg
-	popfl
-	CFI_ADJUST_CFA_OFFSET -4
+	popfl_cfi
 .endm
 
 #define BEGIN(op) \
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 71e080de3352..391a083674b4 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -14,14 +14,12 @@
 #include <asm/dwarf2.h>
 
 .macro SAVE reg
-	pushl %\reg
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %\reg
 	CFI_REL_OFFSET \reg, 0
 .endm
 
 .macro RESTORE reg
-	popl %\reg
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %\reg
 	CFI_RESTORE \reg
 .endm
 
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index adbccd0bbb78..78d16a554db0 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -50,11 +50,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
  */
 ENTRY(csum_partial)
 	CFI_STARTPROC
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
@@ -132,11 +130,9 @@ ENTRY(csum_partial)
 	jz 8f
 	roll $8, %eax
 8:
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
 	ret
 	CFI_ENDPROC
@@ -148,11 +144,9 @@ ENDPROC(csum_partial)
 
 ENTRY(csum_partial)
 	CFI_STARTPROC
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
@@ -260,11 +254,9 @@ ENTRY(csum_partial)
 	jz 90f
 	roll $8, %eax
 90: 
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
 	ret
 	CFI_ENDPROC
@@ -309,14 +301,11 @@ ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
 	subl $4,%esp
 	CFI_ADJUST_CFA_OFFSET 4
-	pushl %edi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edi
 	CFI_REL_OFFSET edi, 0
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl ARGBASE+16(%esp),%eax	# sum
 	movl ARGBASE+12(%esp),%ecx	# len
@@ -426,17 +415,13 @@ DST( movb %cl, (%edi) )
 
 .previous
 
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
-	popl %edi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edi
 	CFI_RESTORE edi
-	popl %ecx			# equivalent to addl $4,%esp
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ecx			# equivalent to addl $4,%esp
 	ret
 	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
@@ -459,14 +444,11 @@ ENDPROC(csum_partial_copy_generic)
 
 ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
-	pushl %edi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edi
 	CFI_REL_OFFSET edi, 0
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
 	movl ARGBASE+4(%esp),%esi	#src
 	movl ARGBASE+8(%esp),%edi	#dst
@@ -527,14 +509,11 @@ DST( movb %dl, (%edi) )
 	jmp 7b
 .previous
 
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
-	popl %edi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edi
 	CFI_RESTORE edi
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
 	ret
 	CFI_ENDPROC
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index aa4326bfb24a..f2145cfa12a6 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * Zero a page.
@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
 	CFI_ENDPROC
 ENDPROC(clear_page_c)
 
+ENTRY(clear_page_c_e)
+	CFI_STARTPROC
+	movl $4096,%ecx
+	xorl %eax,%eax
+	rep stosb
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page_c_e)
+
 ENTRY(clear_page)
 	CFI_STARTPROC
 	xorl %eax,%eax
@@ -38,21 +48,26 @@ ENTRY(clear_page)
 .Lclear_page_end:
 ENDPROC(clear_page)
 
-/* Some CPUs run faster using the string instructions.
-   It is also a lot simpler. Use this when possible */
+/*
+ * Some CPUs support enhanced REP MOVSB/STOSB instructions.
+ * It is recommended to use this when possible.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original function.
+ *
+ */
 
 #include <asm/cpufeature.h>
 
 	.section .altinstr_replacement,"ax"
 1:	.byte 0xeb					/* jmp <disp8> */
 	.byte (clear_page_c - clear_page) - (2f - 1b)	/* offset */
-2:
+2:	.byte 0xeb					/* jmp <disp8> */
+	.byte (clear_page_c_e - clear_page) - (3f - 2b)	/* offset */
+3:
 	.previous
 	.section .altinstructions,"a"
-	.align 8
-	.quad clear_page
-	.quad 1b
-	.word X86_FEATURE_REP_GOOD
-	.byte .Lclear_page_end - clear_page
-	.byte 2b - 1b
+	altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
+			     .Lclear_page_end-clear_page, 2b-1b
+	altinstruction_entry clear_page,2b,X86_FEATURE_ERMS,	\
+			     .Lclear_page_end-clear_page,3b-2b
 	.previous
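
Each altinstruction_entry invocation above packs one patch record into the .altinstructions section, replacing the hand-rolled .align/.quad/.word/.byte sequence it sits next to in this hunk. For orientation, here is a C sketch of the record those directives encode; the layout follows this generation's struct alt_instr in arch/x86/include/asm/alternative.h as best it can be reconstructed (later kernels replaced the absolute pointers with relative offsets), so treat the exact field types as an assumption.

    #include <linux/types.h>

    /* One .altinstructions record: the .quad/.quad/.word/.byte/.byte
     * directives above correspond to these five fields, in order. */
    struct alt_instr {
            u8 *instr;              /* original instructions to patch */
            u8 *replacement;        /* bytes copied over them at boot */
            u16 cpuid;              /* X86_FEATURE_* bit gating the patch */
            u8  instrlen;           /* length of original, >= replacementlen */
            u8  replacementlen;     /* length of the replacement code */
    };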
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
new file mode 100644
index 000000000000..1e572c507d06
--- /dev/null
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -0,0 +1,65 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+#ifdef CONFIG_SMP
+#define SEG_PREFIX %gs:
+#else
+#define SEG_PREFIX
+#endif
+
+.text
+
+/*
+ * Inputs:
+ * %rsi : memory location to compare
+ * %rax : low 64 bits of old value
+ * %rdx : high 64 bits of old value
+ * %rbx : low 64 bits of new value
+ * %rcx : high 64 bits of new value
+ * %al  : Operation successful
+ */
+ENTRY(this_cpu_cmpxchg16b_emu)
+CFI_STARTPROC
+
+#
+# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
+# via the ZF. Caller will access %al to get result.
+#
+# Note that this is only useful for a cpuops operation.  Meaning that we
+# do *not* have a fully atomic operation but just an operation that is
+# *atomic* on a single cpu (as provided by the this_cpu_xx class of
+# macros).
+#
+this_cpu_cmpxchg16b_emu:
+	pushf
+	cli
+
+	cmpq SEG_PREFIX(%rsi), %rax
+	jne not_same
+	cmpq SEG_PREFIX 8(%rsi), %rdx
+	jne not_same
+
+	movq %rbx, SEG_PREFIX(%rsi)
+	movq %rcx, SEG_PREFIX 8(%rsi)
+
+	popf
+	mov $1, %al
+	ret
+
+not_same:
+	popf
+	xor %al,%al
+	ret
+
+CFI_ENDPROC
+
+ENDPROC(this_cpu_cmpxchg16b_emu)
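
The new helper emulates cmpxchg16b for the this_cpu_cmpxchg_double() family on CPUs that lack the instruction. A minimal C model of its semantics follows, for illustration only: percpu_ptr stands in for the %gs:(%rsi) per-CPU address, and the irq stubs stand in for the pushf/cli ... popf bracket; none of these names are real kernel interfaces.

    #include <stdbool.h>
    #include <stdint.h>

    static void irq_save(void)    { /* models pushf; cli */ }
    static void irq_restore(void) { /* models popf */ }

    /* Compare-and-swap of a 128-bit per-CPU value, atomic only against
     * code on the same CPU (interrupts are blocked, no LOCK prefix). */
    static bool cmpxchg16b_emu(uint64_t *percpu_ptr,
                               uint64_t old_lo, uint64_t old_hi,
                               uint64_t new_lo, uint64_t new_hi)
    {
            bool ok = false;

            irq_save();
            if (percpu_ptr[0] == old_lo && percpu_ptr[1] == old_hi) {
                    percpu_ptr[0] = new_lo; /* movq %rbx, SEG_PREFIX(%rsi) */
                    percpu_ptr[1] = new_hi; /* movq %rcx, SEG_PREFIX 8(%rsi) */
                    ok = true;
            }
            irq_restore();
            return ok;                      /* handed back in %al */
    }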
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index a460158b5ac5..024840266ba0 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,23 +15,30 @@
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
-	.macro ALTERNATIVE_JUMP feature,orig,alt
+/*
+ * By placing feature2 after feature1 in altinstructions section, we logically
+ * implement:
+ * If CPU has feature2, jmp to alt2 is used
+ * else if CPU has feature1, jmp to alt1 is used
+ * else jmp to orig is used.
+ */
+	.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
 0:
 	.byte 0xe9	/* 32bit jump */
 	.long \orig-1f	/* by default jump to orig */
 1:
 	.section .altinstr_replacement,"ax"
 2:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt-1b			/* offset */   /* or alternatively to alt */
+	.long \alt1-1b			/* offset */   /* or alternatively to alt1 */
+3:	.byte 0xe9			/* near jump with 32bit immediate */
+	.long \alt2-1b			/* offset */   /* or alternatively to alt2 */
 	.previous
+
 	.section .altinstructions,"a"
-	.align 8
-	.quad 0b
-	.quad 2b
-	.word \feature			/* when feature is set */
-	.byte 5
-	.byte 5
+	altinstruction_entry 0b,2b,\feature1,5,5
+	altinstruction_entry 0b,3b,\feature2,5,5
 	.previous
 	.endm
 
@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
 	addq %rdx,%rcx
 	jc bad_to_user
 	cmpq TI_addr_limit(%rax),%rcx
-	jae bad_to_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	ja bad_to_user
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
+		copy_user_generic_unrolled,copy_user_generic_string,	\
+		copy_user_enhanced_fast_string
 	CFI_ENDPROC
 ENDPROC(_copy_to_user)
 
@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
 	addq %rdx,%rcx
 	jc bad_from_user
 	cmpq TI_addr_limit(%rax),%rcx
-	jae bad_from_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	ja bad_from_user
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
+		copy_user_generic_unrolled,copy_user_generic_string,	\
+		copy_user_enhanced_fast_string
 	CFI_ENDPROC
 ENDPROC(_copy_from_user)
 
@@ -117,7 +128,7 @@ ENDPROC(bad_from_user)
  * rdx count
  *
  * Output:
- * eax uncopied bytes or 0 if successfull.
+ * eax uncopied bytes or 0 if successful.
  */
 ENTRY(copy_user_generic_unrolled)
 	CFI_STARTPROC
@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
 	.previous
 	CFI_ENDPROC
 ENDPROC(copy_user_generic_string)
+
+/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_enhanced_fast_string)
+	CFI_STARTPROC
+	andl %edx,%edx
+	jz 2f
+	movl %edx,%ecx
+1:	rep
+	movsb
+2:	xorl %eax,%eax
+	ret
+
+	.section .fixup,"ax"
+12:	movl %ecx,%edx		/* ecx is zerorest also */
+	jmp copy_user_handle_tail
+	.previous
+
+	.section __ex_table,"a"
+	.align 8
+	.quad 1b,12b
+	.previous
+	CFI_ENDPROC
+ENDPROC(copy_user_enhanced_fast_string)
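
Observed from C, the boot-time patching of ALTERNATIVE_JUMP selects among the three copy routines as sketched below. This is only a model: the kernel rewrites the one 5-byte jmp in place rather than testing features per call, and the feature flags and prototypes here are illustrative stand-ins.

    #include <stdbool.h>
    #include <stddef.h>

    static bool cpu_has_erms, cpu_has_rep_good;     /* set once at boot */

    size_t copy_user_generic_unrolled(void *to, const void *from, size_t n);
    size_t copy_user_generic_string(void *to, const void *from, size_t n);
    size_t copy_user_enhanced_fast_string(void *to, const void *from, size_t n);

    /* Later .altinstructions entries are patched last, so ERMS overrides
     * REP_GOOD, which overrides the unrolled default. */
    static size_t copy_user(void *to, const void *from, size_t n)
    {
            if (cpu_has_erms)
                    return copy_user_enhanced_fast_string(to, from, n);
            if (cpu_has_rep_good)
                    return copy_user_generic_string(to, from, n);
            return copy_user_generic_unrolled(to, from, n);
    }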
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index f0dba36578ea..fb903b758da8 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -1,6 +1,6 @@
 /*
- * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file COPYING in the main directory of this archive
  * for more details. No warranty for anything given at all.
@@ -11,82 +11,82 @@
 
 /*
  * Checksum copy with exception handling.
  * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
  * destination is zeroed.
  *
  * Input
  * rdi  source
  * rsi  destination
  * edx  len (32bit)
  * ecx  sum (32bit)
  * r8   src_err_ptr (int)
  * r9   dst_err_ptr (int)
  *
  * Output
  * eax  64bit sum. undefined in case of exception.
  *
  * Wrappers need to take care of valid exception sum and zeroing.
  * They also should align source or destination to 8 bytes.
  */
 
 	.macro source
10:
-	.section __ex_table,"a"
+	.section __ex_table, "a"
 	.align 8
-	.quad 10b,.Lbad_source
+	.quad 10b, .Lbad_source
 	.previous
 	.endm
 
 	.macro dest
20:
-	.section __ex_table,"a"
+	.section __ex_table, "a"
 	.align 8
-	.quad 20b,.Lbad_dest
+	.quad 20b, .Lbad_dest
 	.previous
 	.endm
 
 	.macro ignore L=.Lignore
30:
-	.section __ex_table,"a"
+	.section __ex_table, "a"
 	.align 8
-	.quad 30b,\L
+	.quad 30b, \L
 	.previous
 	.endm
 
 
 ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
-	cmpl $3*64,%edx
+	cmpl $3*64, %edx
 	jle .Lignore
 
 .Lignore:
-	subq $7*8,%rsp
+	subq $7*8, %rsp
 	CFI_ADJUST_CFA_OFFSET 7*8
-	movq %rbx,2*8(%rsp)
+	movq %rbx, 2*8(%rsp)
 	CFI_REL_OFFSET rbx, 2*8
-	movq %r12,3*8(%rsp)
+	movq %r12, 3*8(%rsp)
 	CFI_REL_OFFSET r12, 3*8
-	movq %r14,4*8(%rsp)
+	movq %r14, 4*8(%rsp)
 	CFI_REL_OFFSET r14, 4*8
-	movq %r13,5*8(%rsp)
+	movq %r13, 5*8(%rsp)
 	CFI_REL_OFFSET r13, 5*8
-	movq %rbp,6*8(%rsp)
+	movq %rbp, 6*8(%rsp)
 	CFI_REL_OFFSET rbp, 6*8
 
-	movq %r8,(%rsp)
-	movq %r9,1*8(%rsp)
-
-	movl %ecx,%eax
-	movl %edx,%ecx
+	movq %r8, (%rsp)
+	movq %r9, 1*8(%rsp)
 
-	xorl %r9d,%r9d
-	movq %rcx,%r12
+	movl %ecx, %eax
+	movl %edx, %ecx
 
-	shrq $6,%r12
-	jz .Lhandle_tail	/* < 64 */
+	xorl %r9d, %r9d
+	movq %rcx, %r12
+
+	shrq $6, %r12
+	jz	.Lhandle_tail	/* < 64 */
 
 	clc
 
 	/* main loop. clear in 64 byte blocks */
 	/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
 	/* r11: temp3, rdx: temp4, r12 loopcnt */
@@ -94,156 +94,156 @@ ENTRY(csum_partial_copy_generic)
 	.p2align 4
 .Lloop:
 	source
-	movq (%rdi),%rbx
+	movq (%rdi), %rbx
 	source
-	movq 8(%rdi),%r8
+	movq 8(%rdi), %r8
 	source
-	movq 16(%rdi),%r11
+	movq 16(%rdi), %r11
 	source
-	movq 24(%rdi),%rdx
+	movq 24(%rdi), %rdx
 
 	source
-	movq 32(%rdi),%r10
+	movq 32(%rdi), %r10
 	source
-	movq 40(%rdi),%rbp
+	movq 40(%rdi), %rbp
 	source
-	movq 48(%rdi),%r14
+	movq 48(%rdi), %r14
 	source
-	movq 56(%rdi),%r13
+	movq 56(%rdi), %r13
 
 	ignore 2f
 	prefetcht0 5*64(%rdi)
2:
-	adcq %rbx,%rax
-	adcq %r8,%rax
-	adcq %r11,%rax
-	adcq %rdx,%rax
-	adcq %r10,%rax
-	adcq %rbp,%rax
-	adcq %r14,%rax
-	adcq %r13,%rax
+	adcq %rbx, %rax
+	adcq %r8, %rax
+	adcq %r11, %rax
+	adcq %rdx, %rax
+	adcq %r10, %rax
+	adcq %rbp, %rax
+	adcq %r14, %rax
+	adcq %r13, %rax
 
 	decl %r12d
 
 	dest
-	movq %rbx,(%rsi)
+	movq %rbx, (%rsi)
 	dest
-	movq %r8,8(%rsi)
+	movq %r8, 8(%rsi)
 	dest
-	movq %r11,16(%rsi)
+	movq %r11, 16(%rsi)
 	dest
-	movq %rdx,24(%rsi)
+	movq %rdx, 24(%rsi)
 
 	dest
-	movq %r10,32(%rsi)
+	movq %r10, 32(%rsi)
 	dest
-	movq %rbp,40(%rsi)
+	movq %rbp, 40(%rsi)
 	dest
-	movq %r14,48(%rsi)
+	movq %r14, 48(%rsi)
 	dest
-	movq %r13,56(%rsi)
+	movq %r13, 56(%rsi)
 
3:
-
-	leaq 64(%rdi),%rdi
-	leaq 64(%rsi),%rsi
 
-	jnz .Lloop
+	leaq 64(%rdi), %rdi
+	leaq 64(%rsi), %rsi
 
-	adcq %r9,%rax
+	jnz	.Lloop
 
-	/* do last upto 56 bytes */
+	adcq	%r9, %rax
+
+	/* do last up to 56 bytes */
 .Lhandle_tail:
 	/* ecx: count */
-	movl %ecx,%r10d
-	andl $63,%ecx
-	shrl $3,%ecx
+	movl %ecx, %r10d
+	andl $63, %ecx
+	shrl $3, %ecx
 	jz .Lfold
 	clc
 	.p2align 4
 .Lloop_8:
 	source
-	movq (%rdi),%rbx
-	adcq %rbx,%rax
+	movq (%rdi), %rbx
+	adcq %rbx, %rax
 	decl %ecx
 	dest
-	movq %rbx,(%rsi)
-	leaq 8(%rsi),%rsi /* preserve carry */
-	leaq 8(%rdi),%rdi
+	movq %rbx, (%rsi)
+	leaq 8(%rsi), %rsi /* preserve carry */
+	leaq 8(%rdi), %rdi
 	jnz .Lloop_8
-	adcq %r9,%rax	/* add in carry */
+	adcq %r9, %rax	/* add in carry */
 
 .Lfold:
 	/* reduce checksum to 32bits */
-	movl %eax,%ebx
-	shrq $32,%rax
-	addl %ebx,%eax
-	adcl %r9d,%eax
+	movl %eax, %ebx
+	shrq $32, %rax
+	addl %ebx, %eax
+	adcl %r9d, %eax
 
-	/* do last upto 6 bytes */
+	/* do last up to 6 bytes */
 .Lhandle_7:
-	movl %r10d,%ecx
-	andl $7,%ecx
-	shrl $1,%ecx
+	movl %r10d, %ecx
+	andl $7, %ecx
+	shrl $1, %ecx
 	jz .Lhandle_1
-	movl $2,%edx
-	xorl %ebx,%ebx
+	movl $2, %edx
+	xorl %ebx, %ebx
 	clc
 	.p2align 4
 .Lloop_1:
 	source
-	movw (%rdi),%bx
-	adcl %ebx,%eax
+	movw (%rdi), %bx
+	adcl %ebx, %eax
 	decl %ecx
 	dest
-	movw %bx,(%rsi)
-	leaq 2(%rdi),%rdi
-	leaq 2(%rsi),%rsi
+	movw %bx, (%rsi)
+	leaq 2(%rdi), %rdi
+	leaq 2(%rsi), %rsi
 	jnz .Lloop_1
-	adcl %r9d,%eax	/* add in carry */
+	adcl %r9d, %eax	/* add in carry */
 
 	/* handle last odd byte */
 .Lhandle_1:
-	testl $1,%r10d
+	testl $1, %r10d
 	jz .Lende
-	xorl %ebx,%ebx
+	xorl %ebx, %ebx
 	source
-	movb (%rdi),%bl
+	movb (%rdi), %bl
 	dest
-	movb %bl,(%rsi)
-	addl %ebx,%eax
-	adcl %r9d,%eax	/* carry */
+	movb %bl, (%rsi)
+	addl %ebx, %eax
+	adcl %r9d, %eax		/* carry */
 
 	CFI_REMEMBER_STATE
 .Lende:
-	movq 2*8(%rsp),%rbx
+	movq 2*8(%rsp), %rbx
 	CFI_RESTORE rbx
-	movq 3*8(%rsp),%r12
+	movq 3*8(%rsp), %r12
 	CFI_RESTORE r12
-	movq 4*8(%rsp),%r14
+	movq 4*8(%rsp), %r14
 	CFI_RESTORE r14
-	movq 5*8(%rsp),%r13
+	movq 5*8(%rsp), %r13
 	CFI_RESTORE r13
-	movq 6*8(%rsp),%rbp
+	movq 6*8(%rsp), %rbp
 	CFI_RESTORE rbp
-	addq $7*8,%rsp
+	addq $7*8, %rsp
 	CFI_ADJUST_CFA_OFFSET -7*8
 	ret
 	CFI_RESTORE_STATE
 
 	/* Exception handlers. Very simple, zeroing is done in the wrappers */
 .Lbad_source:
-	movq (%rsp),%rax
-	testq %rax,%rax
+	movq (%rsp), %rax
+	testq %rax, %rax
 	jz .Lende
-	movl $-EFAULT,(%rax)
+	movl $-EFAULT, (%rax)
 	jmp .Lende
 
 .Lbad_dest:
-	movq 8(%rsp),%rax
-	testq %rax,%rax
+	movq 8(%rsp), %rax
+	testq %rax, %rax
 	jz .Lende
-	movl $-EFAULT,(%rax)
+	movl $-EFAULT, (%rax)
 	jmp .Lende
 	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index bf51144d97e1..9845371c5c36 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -84,7 +84,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
 			count64--;
 		}
 
-		/* last upto 7 8byte blocks */
+		/* last up to 7 8byte blocks */
 		count %= 8;
 		while (count) {
 			asm("addq %1,%0\n\t"
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index ff485d361182..fc45ba887d05 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -121,7 +121,7 @@ inline void __const_udelay(unsigned long xloops)
 	asm("mull %%edx"
 		:"=d" (xloops), "=&a" (d0)
 		:"1" (xloops), "0"
-		(cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));
+		(this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4)));
 
 	__delay(++xloops);
 }
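
The delay.c hunk replaces an explicit CPU-id lookup with this_cpu_read(), which compiles down to a single %gs-relative load. A sketch of the equivalence, using only interfaces that appear in the hunk itself:

    /* Before: find this CPU's id, then index into the per-CPU data. */
    unsigned long lpj_old = cpu_data(raw_smp_processor_id()).loops_per_jiffy;

    /* After: one segment-relative load, no separate id lookup. */
    unsigned long lpj_new = this_cpu_read(cpu_info.loops_per_jiffy);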
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 5415a9d06f53..b908a59eccf5 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -22,22 +22,187 @@ EXPORT_SYMBOL(memset);
 
 void *memmove(void *dest, const void *src, size_t n)
 {
-	int d0, d1, d2;
-
-	if (dest < src) {
-		memcpy(dest, src, n);
-	} else {
-		__asm__ __volatile__(
-			"std\n\t"
-			"rep\n\t"
-			"movsb\n\t"
-			"cld"
-			: "=&c" (d0), "=&S" (d1), "=&D" (d2)
-			:"0" (n),
-			 "1" (n-1+src),
-			 "2" (n-1+dest)
-			:"memory");
-	}
-	return dest;
+	int d0,d1,d2,d3,d4,d5;
+	char *ret = dest;
+
+	__asm__ __volatile__(
+		/* Handle more 16bytes in loop */
+		"cmp $0x10, %0\n\t"
+		"jb	1f\n\t"
+
+		/* Decide forward/backward copy mode */
+		"cmp %2, %1\n\t"
+		"jb	2f\n\t"
+
+		/*
+		 * movs instruction have many startup latency
+		 * so we handle small size by general register.
+		 */
+		"cmp  $680, %0\n\t"
+		"jb	3f\n\t"
+		/*
+		 * movs instruction is only good for aligned case.
+		 */
+		"mov %1, %3\n\t"
+		"xor %2, %3\n\t"
+		"and $0xff, %3\n\t"
+		"jz 4f\n\t"
+		"3:\n\t"
+		"sub $0x10, %0\n\t"
+
+		/*
+		 * We gobble 16byts forward in each loop.
+		 */
+		"3:\n\t"
+		"sub $0x10, %0\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov 1*4(%1), %4\n\t"
+		"mov %3, 0*4(%2)\n\t"
+		"mov %4, 1*4(%2)\n\t"
+		"mov 2*4(%1), %3\n\t"
+		"mov 3*4(%1), %4\n\t"
+		"mov %3, 2*4(%2)\n\t"
+		"mov %4, 3*4(%2)\n\t"
+		"lea 0x10(%1), %1\n\t"
+		"lea 0x10(%2), %2\n\t"
+		"jae 3b\n\t"
+		"add $0x10, %0\n\t"
+		"jmp 1f\n\t"
+
+		/*
+		 * Handle data forward by movs.
+		 */
+		".p2align 4\n\t"
+		"4:\n\t"
+		"mov -4(%1, %0), %3\n\t"
+		"lea -4(%2, %0), %4\n\t"
+		"shr $2, %0\n\t"
+		"rep movsl\n\t"
+		"mov %3, (%4)\n\t"
+		"jmp 11f\n\t"
+		/*
+		 * Handle data backward by movs.
+		 */
+		".p2align 4\n\t"
+		"6:\n\t"
+		"mov (%1), %3\n\t"
+		"mov %2, %4\n\t"
+		"lea -4(%1, %0), %1\n\t"
+		"lea -4(%2, %0), %2\n\t"
+		"shr $2, %0\n\t"
+		"std\n\t"
+		"rep movsl\n\t"
+		"mov %3,(%4)\n\t"
+		"cld\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Start to prepare for backward copy.
+		 */
+		".p2align 4\n\t"
+		"2:\n\t"
+		"cmp $680, %0\n\t"
+		"jb 5f\n\t"
+		"mov %1, %3\n\t"
+		"xor %2, %3\n\t"
+		"and $0xff, %3\n\t"
+		"jz 6b\n\t"
+
+		/*
+		 * Calculate copy position to tail.
+		 */
+		"5:\n\t"
+		"add %0, %1\n\t"
+		"add %0, %2\n\t"
+		"sub $0x10, %0\n\t"
+
+		/*
+		 * We gobble 16byts backward in each loop.
+		 */
+		"7:\n\t"
+		"sub $0x10, %0\n\t"
+
+		"mov -1*4(%1), %3\n\t"
+		"mov -2*4(%1), %4\n\t"
+		"mov %3, -1*4(%2)\n\t"
+		"mov %4, -2*4(%2)\n\t"
+		"mov -3*4(%1), %3\n\t"
+		"mov -4*4(%1), %4\n\t"
+		"mov %3, -3*4(%2)\n\t"
+		"mov %4, -4*4(%2)\n\t"
+		"lea -0x10(%1), %1\n\t"
+		"lea -0x10(%2), %2\n\t"
+		"jae 7b\n\t"
+		/*
+		 * Calculate copy position to head.
+		 */
+		"add $0x10, %0\n\t"
+		"sub %0, %1\n\t"
+		"sub %0, %2\n\t"
+
+		/*
+		 * Move data from 8 bytes to 15 bytes.
+		 */
+		".p2align 4\n\t"
+		"1:\n\t"
+		"cmp $8, %0\n\t"
+		"jb 8f\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov 1*4(%1), %4\n\t"
+		"mov -2*4(%1, %0), %5\n\t"
+		"mov -1*4(%1, %0), %1\n\t"
+
+		"mov %3, 0*4(%2)\n\t"
+		"mov %4, 1*4(%2)\n\t"
+		"mov %5, -2*4(%2, %0)\n\t"
+		"mov %1, -1*4(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Move data from 4 bytes to 7 bytes.
+		 */
+		".p2align 4\n\t"
+		"8:\n\t"
+		"cmp $4, %0\n\t"
+		"jb 9f\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov -1*4(%1, %0), %4\n\t"
+		"mov %3, 0*4(%2)\n\t"
+		"mov %4, -1*4(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Move data from 2 bytes to 3 bytes.
+		 */
+		".p2align 4\n\t"
+		"9:\n\t"
+		"cmp $2, %0\n\t"
+		"jb 10f\n\t"
+		"movw 0*2(%1), %%dx\n\t"
+		"movw -1*2(%1, %0), %%bx\n\t"
+		"movw %%dx, 0*2(%2)\n\t"
+		"movw %%bx, -1*2(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Move data for 1 byte.
+		 */
+		".p2align 4\n\t"
+		"10:\n\t"
+		"cmp $1, %0\n\t"
+		"jb 11f\n\t"
+		"movb (%1), %%cl\n\t"
+		"movb %%cl, (%2)\n\t"
+		".p2align 4\n\t"
+		"11:"
+		: "=&c" (d0), "=&S" (d1), "=&D" (d2),
+		  "=r" (d3),"=r" (d4), "=r"(d5)
+		:"0" (n),
+		 "1" (src),
+		 "2" (dest)
+		:"memory");
+
+	return ret;
+
 }
 EXPORT_SYMBOL(memmove);
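
Restated in C, the new 32-bit memmove() dispatches among four copy strategies. The sketch below shows only that dispatch; the helper prototypes are hypothetical stand-ins for the numbered asm labels, while the 16-byte cutoff, the 680-byte threshold, and the low-byte alignment test are taken directly from the asm above.

    #include <stddef.h>
    #include <stdint.h>

    void copy_forward_regs(char *d, const char *s, size_t n);  /* label 3 */
    void copy_forward_movs(char *d, const char *s, size_t n);  /* label 4 */
    void copy_backward_regs(char *d, const char *s, size_t n); /* label 7 */
    void copy_backward_movs(char *d, const char *s, size_t n); /* label 6 */
    void copy_small(char *d, const char *s, size_t n);         /* labels 1,8,9,10 */

    static void *memmove_sketch(void *dest, const void *src, size_t n)
    {
            char *d = dest;
            const char *s = src;
            /* rep movs pays off only for large, mutually aligned buffers */
            int use_movs = n >= 680 &&
                           (((uintptr_t)s ^ (uintptr_t)d) & 0xff) == 0;

            if (n < 0x10) {                 /* tiny: register moves only */
                    copy_small(d, s, n);
            } else if (s >= d) {            /* forward copy is safe */
                    if (use_movs)
                            copy_forward_movs(d, s, n);
                    else
                            copy_forward_regs(d, s, n);
            } else {                        /* overlap: start at the tail */
                    if (use_movs)
                            copy_backward_movs(d, s, n);
                    else
                            copy_backward_regs(d, s, n);
            }
            return dest;
    }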
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d5..efbf2a0ecdea 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * memcpy - Copy a memory block.
@@ -37,107 +38,173 @@
 .Lmemcpy_e:
 	.previous
 
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	rep movsb
+	ret
+.Lmemcpy_e_e:
+	.previous
+
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax
 
 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * Use 32bit CMP here to avoid long NOP padding.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl $6, %ecx
-	jz .Lhandle_tail
+	cmp $0x20, %edx
+	jb .Lhandle_tail
 
-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * We check whether memory false dependence could occur,
+	 * then jump to corresponding copy mode.
 	 */
-	decl %ecx
+	cmp %dil, %sil
+	jl .Lcopy_backward
+	subl $0x20, %edx
+.Lcopy_forward_loop:
+	subq $0x20, %rdx
 
 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Move in blocks of 4x8 bytes:
 	 */
-	movq 0*8(%rsi), %r11
-	movq 1*8(%rsi), %r8
-	movq %r11, 0*8(%rdi)
-	movq %r8, 1*8(%rdi)
-
-	movq 2*8(%rsi), %r9
-	movq 3*8(%rsi), %r10
-	movq %r9, 2*8(%rdi)
-	movq %r10, 3*8(%rdi)
-
-	movq 4*8(%rsi), %r11
-	movq 5*8(%rsi), %r8
-	movq %r11, 4*8(%rdi)
-	movq %r8, 5*8(%rdi)
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	leaq 4*8(%rsi), %rsi
+
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, 2*8(%rdi)
+	movq %r11, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae .Lcopy_forward_loop
+	addq $0x20, %rdx
+	jmp .Lhandle_tail
 
-	movq 6*8(%rsi), %r9
-	movq 7*8(%rsi), %r10
-	movq %r9, 6*8(%rdi)
-	movq %r10, 7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz .Lloop_64
+.Lcopy_backward:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * At most 3 ALU operations in one cycle,
+	 * so append NOPS in the same 16bytes trunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r8
+	movq -2*8(%rsi), %r9
+	movq -3*8(%rsi), %r10
+	movq -4*8(%rsi), %r11
+	leaq -4*8(%rsi), %rsi
+	movq %r8, -1*8(%rdi)
+	movq %r9, -2*8(%rdi)
+	movq %r10, -3*8(%rdi)
+	movq %r11, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae .Lcopy_backward_loop
 
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
 .Lhandle_tail:
-	movl %edx, %ecx
-	andl $63, %ecx
-	shrl $3, %ecx
-	jz .Lhandle_7
+	cmpq $16, %rdx
+	jb .Lless_16bytes
 
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq -2*8(%rsi, %rdx), %r10
+	movq -1*8(%rsi, %rdx), %r11
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, -2*8(%rdi, %rdx)
+	movq %r11, -1*8(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi), %r8
-	movq %r8, (%rdi)
-	leaq 8(%rdi), %rdi
-	leaq 8(%rsi), %rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_16bytes:
+	cmpq $8, %rdx
+	jb .Lless_8bytes
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq -1*8(%rsi, %rdx), %r9
+	movq %r8, 0*8(%rdi)
+	movq %r9, -1*8(%rdi, %rdx)
+	retq
+	.p2align 4
+.Lless_8bytes:
+	cmpq $4, %rdx
+	jb .Lless_3bytes
 
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
+.Lless_3bytes:
+	cmpl $0, %edx
+	je .Lend
+	/*
+	 * Move data from 1 bytes to 3 bytes.
+	 */
 .Lloop_1:
 	movb (%rsi), %r8b
 	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
-	decl %ecx
+	decl %edx
 	jnz .Lloop_1
 
 .Lend:
-	ret
+	retq
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
 /*
- * Some CPUs run faster using the string copy instructions.
- * It is also a lot simpler. Use this when possible:
- */
-
-.section .altinstructions, "a"
-	.align 8
-	.quad memcpy
-	.quad .Lmemcpy_c
-	.word X86_FEATURE_REP_GOOD
-
-/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+ * If the feature is supported, memcpy_c_e() is the first choice.
+ * If enhanced rep movsb copy is not available, use fast string copy
+ * memcpy_c() when possible. This is faster and code is simpler than
+ * original memcpy().
+ * Otherwise, original memcpy() is used.
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ *
  * Replace only beginning, memcpy is used to apply alternatives,
  * so it is silly to overwrite itself with nops - reboot is the
  * only outcome...
  */
-	.byte .Lmemcpy_e - .Lmemcpy_c
-	.byte .Lmemcpy_e - .Lmemcpy_c
+.section .altinstructions, "a"
+	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
 .previous
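
The rewritten tail handling replaces the old 8-byte loop with possibly-overlapping wide moves: for any count from 16 through 31, the first 16 and the last 16 bytes of the buffer together cover every byte, so four 8-byte loads plus four 8-byte stores finish the job with no loop. A self-contained C sketch of that trick:

    #include <stdint.h>
    #include <string.h>

    /* Copy n bytes, 16 <= n <= 31, exactly as .Lhandle_tail does:
     * two 8-byte words from the front, two from the back; the windows
     * may overlap in the middle, which is harmless for a plain copy. */
    static void copy_16_to_31(uint8_t *dst, const uint8_t *src, size_t n)
    {
            uint64_t a, b, c, d;

            memcpy(&a, src, 8);             /* movq  0*8(%rsi), %r8       */
            memcpy(&b, src + 8, 8);         /* movq  1*8(%rsi), %r9       */
            memcpy(&c, src + n - 16, 8);    /* movq -2*8(%rsi,%rdx), %r10 */
            memcpy(&d, src + n - 8, 8);     /* movq -1*8(%rsi,%rdx), %r11 */
            memcpy(dst, &a, 8);
            memcpy(dst + 8, &b, 8);
            memcpy(dst + n - 16, &c, 8);
            memcpy(dst + n - 8, &d, 8);
    }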
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 000000000000..d0ec9c2936d7
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,224 @@
+/*
+ * Normally compiler builtins are used, but sometimes the compiler calls out
+ * of line code. Based on asm-i386/string.h.
+ *
+ * This assembly file is re-written from memmove_64.c file.
+ *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
+ */
+#define _STRING_C
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+
+#undef memmove
+
+/*
+ * Implement memmove(). This can handle overlap between src and dst.
+ *
+ * Input:
+ * rdi: dest
+ * rsi: src
+ * rdx: count
+ *
+ * Output:
+ * rax: dest
+ */
+ENTRY(memmove)
+	CFI_STARTPROC
+
+	/* Handle more 32bytes in loop */
+	mov %rdi, %rax
+	cmp $0x20, %rdx
+	jb	1f
+
+	/* Decide forward/backward copy mode */
+	cmp %rdi, %rsi
+	jge .Lmemmove_begin_forward
+	mov %rsi, %r8
+	add %rdx, %r8
+	cmp %rdi, %r8
+	jg 2f
+
+.Lmemmove_begin_forward:
+	/*
+	 * movsq instruction have many startup latency
+	 * so we handle small size by general register.
+	 */
+	cmp  $680, %rdx
+	jb	3f
+	/*
+	 * movsq instruction is only good for aligned case.
+	 */
+
+	cmpb %dil, %sil
+	je 4f
+3:
+	sub $0x20, %rdx
+	/*
+	 * We gobble 32byts forward in each loop.
+	 */
+5:
+	sub $0x20, %rdx
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq 2*8(%rsi), %r9
+	movq 3*8(%rsi), %r8
+	leaq 4*8(%rsi), %rsi
+
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, 2*8(%rdi)
+	movq %r8, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae 5b
+	addq $0x20, %rdx
+	jmp 1f
+	/*
+	 * Handle data forward by movsq.
+	 */
+	.p2align 4
+4:
+	movq %rdx, %rcx
+	movq -8(%rsi, %rdx), %r11
+	lea -8(%rdi, %rdx), %r10
+	shrq $3, %rcx
+	rep movsq
+	movq %r11, (%r10)
+	jmp 13f
+.Lmemmove_end_forward:
+
+	/*
+	 * Handle data backward by movsq.
+	 */
+	.p2align 4
+7:
+	movq %rdx, %rcx
+	movq (%rsi), %r11
+	movq %rdi, %r10
+	leaq -8(%rsi, %rdx), %rsi
+	leaq -8(%rdi, %rdx), %rdi
+	shrq $3, %rcx
+	std
+	rep movsq
+	cld
+	movq %r11, (%r10)
+	jmp 13f
+
+	/*
+	 * Start to prepare for backward copy.
+	 */
+	.p2align 4
+2:
+	cmp $680, %rdx
+	jb 6f
+	cmp %dil, %sil
+	je 7b
+6:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * We gobble 32byts backward in each loop.
+	 */
+8:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r11
+	movq -2*8(%rsi), %r10
+	movq -3*8(%rsi), %r9
+	movq -4*8(%rsi), %r8
+	leaq -4*8(%rsi), %rsi
+
+	movq %r11, -1*8(%rdi)
+	movq %r10, -2*8(%rdi)
+	movq %r9, -3*8(%rdi)
+	movq %r8, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae 8b
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
+1:
+	cmpq $16, %rdx
+	jb 9f
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq -2*8(%rsi, %rdx), %r9
+	movq -1*8(%rsi, %rdx), %r8
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, -2*8(%rdi, %rdx)
+	movq %r8, -1*8(%rdi, %rdx)
+	jmp 13f
+	.p2align 4
+9:
+	cmpq $8, %rdx
+	jb 10f
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r11
+	movq -1*8(%rsi, %rdx), %r10
+	movq %r11, 0*8(%rdi)
+	movq %r10, -1*8(%rdi, %rdx)
+	jmp 13f
+10:
+	cmpq $4, %rdx
+	jb 11f
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %r11d
+	movl -4(%rsi, %rdx), %r10d
+	movl %r11d, (%rdi)
+	movl %r10d, -4(%rdi, %rdx)
+	jmp 13f
+11:
+	cmp $2, %rdx
+	jb 12f
+	/*
+	 * Move data from 2 bytes to 3 bytes.
+	 */
+	movw (%rsi), %r11w
+	movw -2(%rsi, %rdx), %r10w
+	movw %r11w, (%rdi)
+	movw %r10w, -2(%rdi, %rdx)
+	jmp 13f
+12:
+	cmp $1, %rdx
+	jb 13f
+	/*
+	 * Move data for 1 byte.
+	 */
+	movb (%rsi), %r11b
+	movb %r11b, (%rdi)
+13:
+	retq
+	CFI_ENDPROC
+
+	.section .altinstr_replacement,"ax"
+.Lmemmove_begin_forward_efs:
+	/* Forward moving data. */
+	movq %rdx, %rcx
+	rep movsb
+	retq
+.Lmemmove_end_forward_efs:
+	.previous
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad .Lmemmove_begin_forward
+	.quad .Lmemmove_begin_forward_efs
+	.word X86_FEATURE_ERMS
+	.byte .Lmemmove_end_forward-.Lmemmove_begin_forward
+	.byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+	.previous
+ENDPROC(memmove)
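
One detail worth noting: the .altinstructions record above covers only the .Lmemmove_begin_forward ... .Lmemmove_end_forward span, because rep movsb only copies upward; the backward path for overlapping buffers must survive ERMS patching untouched. The forward/backward decision at the top of memmove() reduces to this predicate (hedged C restatement of the cmp/jge and the src + count vs. dest check):

    #include <stdbool.h>
    #include <stddef.h>

    /* A forward copy is safe when the source starts at or after the
     * destination, or when the two regions do not overlap at all. */
    static bool can_copy_forward(const char *dest, const char *src, size_t n)
    {
            return src >= dest || src + n <= dest;
    }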
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 0a33909bf122..000000000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Normally compiler builtins are used, but sometimes the compiler calls out
-   of line code. Based on asm-i386/string.h.
- */
-#define _STRING_C
-#include <linux/string.h>
-#include <linux/module.h>
-
-#undef memmove
-void *memmove(void *dest, const void *src, size_t count)
-{
-	if (dest < src) {
-		return memcpy(dest, src, count);
-	} else {
-		char *p = dest + count;
-		const char *s = src + count;
-		while (count--)
-			*--p = *--s;
-	}
-	return dest;
-}
-EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 09d344269652..79bd454b78a3 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
 /*
- * ISO C memset - set a memory block to a byte value.
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the orignal function as well.
  *
  * rdi destination
  * rsi value (char)
@@ -31,6 +35,28 @@
 .Lmemset_e:
 	.previous
 
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemset_c_e:
+	movq %rdi,%r9
+	movb %sil,%al
+	movl %edx,%ecx
+	rep stosb
+	movq %r9,%rax
+	ret
+.Lmemset_e_e:
+	.previous
+
 ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC
@@ -112,16 +138,20 @@ ENTRY(__memset)
 ENDPROC(memset)
 ENDPROC(__memset)
 
-/* Some CPUs run faster using the string instructions.
-   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
+/* Some CPUs support enhanced REP MOVSB/STOSB feature.
+ * It is recommended to use this when possible.
+ *
+ * If enhanced REP MOVSB/STOSB feature is not available, use fast string
+ * instructions.
+ *
+ * Otherwise, use original memset function.
+ *
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ */
 	.section .altinstructions,"a"
-	.align 8
-	.quad memset
-	.quad .Lmemset_c
-	.word X86_FEATURE_REP_GOOD
-	.byte .Lfinal - memset
-	.byte .Lmemset_e - .Lmemset_c
+	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
+	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
 	.previous
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
index 41fcf00e49df..67743977398b 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem_64.S
@@ -23,43 +23,50 @@
 #include <asm/dwarf2.h>
 
 #define save_common_regs \
-	pushq %rdi; \
-	pushq %rsi; \
-	pushq %rcx; \
-	pushq %r8; \
-	pushq %r9; \
-	pushq %r10; \
-	pushq %r11
+	pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
+	pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
+	pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
+	pushq_cfi %r8;  CFI_REL_OFFSET r8,  0; \
+	pushq_cfi %r9;  CFI_REL_OFFSET r9,  0; \
+	pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
+	pushq_cfi %r11; CFI_REL_OFFSET r11, 0
 
 #define restore_common_regs \
-	popq %r11; \
-	popq %r10; \
-	popq %r9; \
-	popq %r8; \
-	popq %rcx; \
-	popq %rsi; \
-	popq %rdi
+	popq_cfi %r11; CFI_RESTORE r11; \
+	popq_cfi %r10; CFI_RESTORE r10; \
+	popq_cfi %r9;  CFI_RESTORE r9; \
+	popq_cfi %r8;  CFI_RESTORE r8; \
+	popq_cfi %rcx; CFI_RESTORE rcx; \
+	popq_cfi %rsi; CFI_RESTORE rsi; \
+	popq_cfi %rdi; CFI_RESTORE rdi
 
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_down_read_failed)
+	CFI_STARTPROC
 	save_common_regs
-	pushq %rdx
+	pushq_cfi %rdx
+	CFI_REL_OFFSET rdx, 0
 	movq %rax,%rdi
 	call rwsem_down_read_failed
-	popq %rdx
+	popq_cfi %rdx
+	CFI_RESTORE rdx
 	restore_common_regs
 	ret
-	ENDPROC(call_rwsem_down_read_failed)
+	CFI_ENDPROC
+ENDPROC(call_rwsem_down_read_failed)
 
 ENTRY(call_rwsem_down_write_failed)
+	CFI_STARTPROC
 	save_common_regs
 	movq %rax,%rdi
 	call rwsem_down_write_failed
 	restore_common_regs
 	ret
-	ENDPROC(call_rwsem_down_write_failed)
+	CFI_ENDPROC
+ENDPROC(call_rwsem_down_write_failed)
 
 ENTRY(call_rwsem_wake)
+	CFI_STARTPROC
 	decl %edx	/* do nothing if still outstanding active readers */
 	jnz 1f
 	save_common_regs
@@ -67,15 +74,20 @@ ENTRY(call_rwsem_wake)
 	call rwsem_wake
 	restore_common_regs
 1:	ret
-	ENDPROC(call_rwsem_wake)
+	CFI_ENDPROC
+ENDPROC(call_rwsem_wake)
 
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_downgrade_wake)
+	CFI_STARTPROC
 	save_common_regs
-	pushq %rdx
+	pushq_cfi %rdx
+	CFI_REL_OFFSET rdx, 0
 	movq %rax,%rdi
 	call rwsem_downgrade_wake
-	popq %rdx
+	popq_cfi %rdx
+	CFI_RESTORE rdx
 	restore_common_regs
 	ret
-	ENDPROC(call_rwsem_downgrade_wake)
+	CFI_ENDPROC
+ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 648fe4741782..06691daa4108 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -36,7 +36,7 @@
  */
 #ifdef CONFIG_SMP
 ENTRY(__write_lock_failed)
-	CFI_STARTPROC simple
+	CFI_STARTPROC
 	FRAME
 2: 	LOCK_PREFIX
 	addl	$ RW_LOCK_BIAS,(%eax)
@@ -74,29 +74,23 @@ ENTRY(__read_lock_failed)
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_down_read_failed)
 	CFI_STARTPROC
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx,0
-	push %edx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edx
 	CFI_REL_OFFSET edx,0
 	call rwsem_down_read_failed
-	pop %edx
-	CFI_ADJUST_CFA_OFFSET -4
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edx
+	popl_cfi %ecx
 	ret
 	CFI_ENDPROC
 ENDPROC(call_rwsem_down_read_failed)
 
 ENTRY(call_rwsem_down_write_failed)
 	CFI_STARTPROC
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx,0
 	calll rwsem_down_write_failed
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ecx
 	ret
 	CFI_ENDPROC
 ENDPROC(call_rwsem_down_write_failed)
@@ -105,12 +99,10 @@ ENTRY(call_rwsem_wake)
 	CFI_STARTPROC
 	decw %dx	/* do nothing if still outstanding active readers */
 	jnz 1f
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx,0
 	call rwsem_wake
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ecx
 1:	ret
 	CFI_ENDPROC
 ENDPROC(call_rwsem_wake)
@@ -118,17 +110,13 @@ ENTRY(call_rwsem_wake)
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_downgrade_wake)
 	CFI_STARTPROC
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx,0
-	push %edx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edx
 	CFI_REL_OFFSET edx,0
 	call rwsem_downgrade_wake
-	pop %edx
-	CFI_ADJUST_CFA_OFFSET -4
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edx
+	popl_cfi %ecx
 	ret
 	CFI_ENDPROC
 ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 650b11e00ecc..2930ae05d773 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -7,24 +7,6 @@
 
 #include <linux/linkage.h>
 
-#define ARCH_TRACE_IRQS_ON			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_on;			\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
-#define ARCH_TRACE_IRQS_OFF			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_off;		\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
 #ifdef CONFIG_TRACE_IRQFLAGS
 	/* put return address in eax (arg1) */
 	.macro thunk_ra name,func
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index bf9a7d5a5428..782b082c9ff7 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -22,26 +22,6 @@
 	CFI_ENDPROC
 	.endm
 
-	/* rdi:	arg1 ... normal C conventions. rax is passed from C. */
-	.macro thunk_retrax name,func
-	.globl \name
-\name:
-	CFI_STARTPROC
-	SAVE_ARGS
-	call \func
-	jmp  restore_norax
-	CFI_ENDPROC
-	.endm
-
-
-	.section .sched.text, "ax"
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-	thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
-	thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
-	thunk rwsem_wake_thunk,rwsem_wake
-	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
-#endif
-
 #ifdef CONFIG_TRACE_IRQFLAGS
 	/* put return address in rdi (arg1) */
 	.macro thunk_ra name,func
@@ -72,10 +52,3 @@ restore:
 	RESTORE_ARGS
 	ret
 	CFI_ENDPROC
-
-	CFI_STARTPROC
-	SAVE_ARGS
-restore_norax:
-	RESTORE_ARGS 1
-	ret
-	CFI_ENDPROC