Diffstat (limited to 'arch/x86/lib')

-rw-r--r--  arch/x86/lib/Makefile            |   1
-rw-r--r--  arch/x86/lib/atomic64_386_32.S   |   6
-rw-r--r--  arch/x86/lib/atomic64_cx8_32.S   |   6
-rw-r--r--  arch/x86/lib/checksum_32.S       |  63
-rw-r--r--  arch/x86/lib/clear_page_64.S     |  33
-rw-r--r--  arch/x86/lib/cmpxchg16b_emu.S    |  65
-rw-r--r--  arch/x86/lib/copy_user_64.S      |  71
-rw-r--r--  arch/x86/lib/csum-copy_64.S      | 242
-rw-r--r--  arch/x86/lib/csum-partial_64.c   |   2
-rw-r--r--  arch/x86/lib/delay.c             |   2
-rw-r--r--  arch/x86/lib/memcpy_32.c         | 199
-rw-r--r--  arch/x86/lib/memcpy_64.S         | 203
-rw-r--r--  arch/x86/lib/memmove_64.S        | 224
-rw-r--r--  arch/x86/lib/memmove_64.c        |  21
-rw-r--r--  arch/x86/lib/memset_64.S         |  54
-rw-r--r--  arch/x86/lib/rwsem_64.S          |  56
-rw-r--r--  arch/x86/lib/semaphore_32.S      |  38
-rw-r--r--  arch/x86/lib/thunk_32.S          |  18
-rw-r--r--  arch/x86/lib/thunk_64.S          |  27

19 files changed, 926 insertions, 405 deletions
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index e10cf070ede0..f2479f19ddde 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -42,4 +42,5 @@ else
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
         lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
+        lib-y += cmpxchg16b_emu.o
 endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 2cda60a06e65..e8e7e0d06f42 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -15,14 +15,12 @@
 
 /* if you want SMP support, implement these with real spinlocks */
 .macro LOCK reg
-	pushfl
-	CFI_ADJUST_CFA_OFFSET 4
+	pushfl_cfi
 	cli
 .endm
 
 .macro UNLOCK reg
-	popfl
-	CFI_ADJUST_CFA_OFFSET -4
+	popfl_cfi
 .endm
 
 #define BEGIN(op) \
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 71e080de3352..391a083674b4 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -14,14 +14,12 @@
 #include <asm/dwarf2.h>
 
 .macro SAVE reg
-	pushl %\reg
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %\reg
 	CFI_REL_OFFSET \reg, 0
 .endm
 
 .macro RESTORE reg
-	popl %\reg
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %\reg
 	CFI_RESTORE \reg
 .endm
 
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index adbccd0bbb78..78d16a554db0 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -50,11 +50,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
  */
 ENTRY(csum_partial)
 	CFI_STARTPROC
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
@@ -132,11 +130,9 @@ ENTRY(csum_partial)
 	jz 8f
 	roll $8, %eax
 8:
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
 	ret
 	CFI_ENDPROC
@@ -148,11 +144,9 @@ ENDPROC(csum_partial)
 
 ENTRY(csum_partial)
 	CFI_STARTPROC
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
@@ -260,11 +254,9 @@ ENTRY(csum_partial)
 	jz 90f
 	roll $8, %eax
 90: 
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
 	ret
 	CFI_ENDPROC
@@ -309,14 +301,11 @@ ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
 	subl $4,%esp
 	CFI_ADJUST_CFA_OFFSET 4
-	pushl %edi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edi
 	CFI_REL_OFFSET edi, 0
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
 	movl ARGBASE+16(%esp),%eax	# sum
 	movl ARGBASE+12(%esp),%ecx	# len
@@ -426,17 +415,13 @@ DST( movb %cl, (%edi) )
 
 .previous
 
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
-	popl %edi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edi
 	CFI_RESTORE edi
-	popl %ecx			# equivalent to addl $4,%esp
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ecx			# equivalent to addl $4,%esp
 	ret
 	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
@@ -459,14 +444,11 @@ ENDPROC(csum_partial_copy_generic)
 
 ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ebx
 	CFI_REL_OFFSET ebx, 0
-	pushl %edi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edi
 	CFI_REL_OFFSET edi, 0
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %esi
 	CFI_REL_OFFSET esi, 0
 	movl ARGBASE+4(%esp),%esi	#src
 	movl ARGBASE+8(%esp),%edi	#dst
@@ -527,14 +509,11 @@ DST( movb %dl, (%edi) )
 	jmp 7b
 .previous
 
-	popl %esi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %esi
 	CFI_RESTORE esi
-	popl %edi
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edi
 	CFI_RESTORE edi
-	popl %ebx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ebx
 	CFI_RESTORE ebx
 	ret
 	CFI_ENDPROC
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index aa4326bfb24a..f2145cfa12a6 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * Zero a page.
@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
 	CFI_ENDPROC
 ENDPROC(clear_page_c)
 
+ENTRY(clear_page_c_e)
+	CFI_STARTPROC
+	movl $4096,%ecx
+	xorl %eax,%eax
+	rep stosb
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page_c_e)
+
 ENTRY(clear_page)
 	CFI_STARTPROC
 	xorl %eax,%eax
@@ -38,21 +48,26 @@ ENTRY(clear_page)
 .Lclear_page_end:
 ENDPROC(clear_page)
 
-/* Some CPUs run faster using the string instructions.
-   It is also a lot simpler. Use this when possible */
+/*
+ * Some CPUs support enhanced REP MOVSB/STOSB instructions.
+ * It is recommended to use this when possible.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original function.
+ *
+ */
 
 #include <asm/cpufeature.h>
 
 	.section .altinstr_replacement,"ax"
 1:	.byte 0xeb					/* jmp <disp8> */
 	.byte (clear_page_c - clear_page) - (2f - 1b)	/* offset */
-2:
+2:	.byte 0xeb					/* jmp <disp8> */
+	.byte (clear_page_c_e - clear_page) - (3f - 2b)	/* offset */
+3:
 	.previous
 	.section .altinstructions,"a"
-	.align 8
-	.quad clear_page
-	.quad 1b
-	.word X86_FEATURE_REP_GOOD
-	.byte .Lclear_page_end - clear_page
-	.byte 2b - 1b
+	altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
+			     .Lclear_page_end-clear_page, 2b-1b
+	altinstruction_entry clear_page,2b,X86_FEATURE_ERMS,	\
+			     .Lclear_page_end-clear_page,3b-2b
 	.previous
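
Each altinstruction_entry invocation above packs one patch record into the .altinstructions section, replacing the hand-rolled .align/.quad/.word/.byte sequence it sits next to in this hunk. For orientation, here is a C sketch of the record those directives encode; the layout follows this generation's struct alt_instr in arch/x86/include/asm/alternative.h as best it can be reconstructed (later kernels replaced the absolute pointers with relative offsets), so treat the exact field types as an assumption.

    #include <linux/types.h>

    /* One .altinstructions record: the .quad/.quad/.word/.byte/.byte
     * directives above correspond to these five fields, in order. */
    struct alt_instr {
            u8 *instr;              /* original instructions to patch */
            u8 *replacement;        /* bytes copied over them at boot */
            u16 cpuid;              /* X86_FEATURE_* bit gating the patch */
            u8  instrlen;           /* length of original, >= replacementlen */
            u8  replacementlen;     /* length of the replacement code */
    };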
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
new file mode 100644
index 000000000000..1e572c507d06
--- /dev/null
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -0,0 +1,65 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+#ifdef CONFIG_SMP
+#define SEG_PREFIX %gs:
+#else
+#define SEG_PREFIX
+#endif
+
+.text
+
+/*
+ * Inputs:
+ * %rsi : memory location to compare
+ * %rax : low 64 bits of old value
+ * %rdx : high 64 bits of old value
+ * %rbx : low 64 bits of new value
+ * %rcx : high 64 bits of new value
+ * %al  : Operation successful
+ */
+ENTRY(this_cpu_cmpxchg16b_emu)
+CFI_STARTPROC
+
+#
+# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
+# via the ZF. Caller will access %al to get result.
+#
+# Note that this is only useful for a cpuops operation.  Meaning that we
+# do *not* have a fully atomic operation but just an operation that is
+# *atomic* on a single cpu (as provided by the this_cpu_xx class of
+# macros).
+#
+this_cpu_cmpxchg16b_emu:
+	pushf
+	cli
+
+	cmpq SEG_PREFIX(%rsi), %rax
+	jne not_same
+	cmpq SEG_PREFIX 8(%rsi), %rdx
+	jne not_same
+
+	movq %rbx, SEG_PREFIX(%rsi)
+	movq %rcx, SEG_PREFIX 8(%rsi)
+
+	popf
+	mov $1, %al
+	ret
+
+not_same:
+	popf
+	xor %al,%al
+	ret
+
+CFI_ENDPROC
+
+ENDPROC(this_cpu_cmpxchg16b_emu)
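
The new helper emulates cmpxchg16b for the this_cpu_cmpxchg_double() family on CPUs that lack the instruction. A minimal C model of its semantics follows, for illustration only: percpu_ptr stands in for the %gs:(%rsi) per-CPU address, and the irq stubs stand in for the pushf/cli ... popf bracket; none of these names are real kernel interfaces.

    #include <stdbool.h>
    #include <stdint.h>

    static void irq_save(void)    { /* models pushf; cli */ }
    static void irq_restore(void) { /* models popf */ }

    /* Compare-and-swap of a 128-bit per-CPU value, atomic only against
     * code on the same CPU (interrupts are blocked, no LOCK prefix). */
    static bool cmpxchg16b_emu(uint64_t *percpu_ptr,
                               uint64_t old_lo, uint64_t old_hi,
                               uint64_t new_lo, uint64_t new_hi)
    {
            bool ok = false;

            irq_save();
            if (percpu_ptr[0] == old_lo && percpu_ptr[1] == old_hi) {
                    percpu_ptr[0] = new_lo; /* movq %rbx, SEG_PREFIX(%rsi) */
                    percpu_ptr[1] = new_hi; /* movq %rcx, SEG_PREFIX 8(%rsi) */
                    ok = true;
            }
            irq_restore();
            return ok;                      /* handed back in %al */
    }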
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index a460158b5ac5..024840266ba0 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,23 +15,30 @@
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
-	.macro ALTERNATIVE_JUMP feature,orig,alt
+/*
+ * By placing feature2 after feature1 in altinstructions section, we logically
+ * implement:
+ * If CPU has feature2, jmp to alt2 is used
+ * else if CPU has feature1, jmp to alt1 is used
+ * else jmp to orig is used.
+ */
+	.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
 0:
 	.byte 0xe9	/* 32bit jump */
 	.long \orig-1f	/* by default jump to orig */
 1:
 	.section .altinstr_replacement,"ax"
 2:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt-1b			/* offset */   /* or alternatively to alt */
+	.long \alt1-1b			/* offset */   /* or alternatively to alt1 */
+3:	.byte 0xe9			/* near jump with 32bit immediate */
+	.long \alt2-1b			/* offset */   /* or alternatively to alt2 */
 	.previous
+
 	.section .altinstructions,"a"
-	.align 8
-	.quad 0b
-	.quad 2b
-	.word \feature			/* when feature is set */
-	.byte 5
-	.byte 5
+	altinstruction_entry 0b,2b,\feature1,5,5
+	altinstruction_entry 0b,3b,\feature2,5,5
 	.previous
 	.endm
 
@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
 	addq %rdx,%rcx
 	jc bad_to_user
 	cmpq TI_addr_limit(%rax),%rcx
-	jae bad_to_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	ja bad_to_user
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
+		copy_user_generic_unrolled,copy_user_generic_string,	\
+		copy_user_enhanced_fast_string
 	CFI_ENDPROC
 ENDPROC(_copy_to_user)
 
@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
 	addq %rdx,%rcx
 	jc bad_from_user
 	cmpq TI_addr_limit(%rax),%rcx
-	jae bad_from_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	ja bad_from_user
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
+		copy_user_generic_unrolled,copy_user_generic_string,	\
+		copy_user_enhanced_fast_string
 	CFI_ENDPROC
 ENDPROC(_copy_from_user)
 
@@ -117,7 +128,7 @@ ENDPROC(bad_from_user)
  * rdx count
  *
  * Output:
- * eax uncopied bytes or 0 if successfull.
+ * eax uncopied bytes or 0 if successful.
  */
 ENTRY(copy_user_generic_unrolled)
 	CFI_STARTPROC
@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
 	.previous
 	CFI_ENDPROC
 ENDPROC(copy_user_generic_string)
+
+/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_enhanced_fast_string)
+	CFI_STARTPROC
+	andl %edx,%edx
+	jz 2f
+	movl %edx,%ecx
+1:	rep
+	movsb
+2:	xorl %eax,%eax
+	ret
+
+	.section .fixup,"ax"
+12:	movl %ecx,%edx		/* ecx is zerorest also */
+	jmp copy_user_handle_tail
+	.previous
+
+	.section __ex_table,"a"
+	.align 8
+	.quad 1b,12b
+	.previous
+	CFI_ENDPROC
+ENDPROC(copy_user_enhanced_fast_string)
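
Observed from C, the boot-time patching of ALTERNATIVE_JUMP selects among the three copy routines as sketched below. This is only a model: the kernel rewrites the one 5-byte jmp in place rather than testing features per call, and the feature flags and prototypes here are illustrative stand-ins.

    #include <stdbool.h>
    #include <stddef.h>

    static bool cpu_has_erms, cpu_has_rep_good;     /* set once at boot */

    size_t copy_user_generic_unrolled(void *to, const void *from, size_t n);
    size_t copy_user_generic_string(void *to, const void *from, size_t n);
    size_t copy_user_enhanced_fast_string(void *to, const void *from, size_t n);

    /* Later .altinstructions entries are patched last, so ERMS overrides
     * REP_GOOD, which overrides the unrolled default. */
    static size_t copy_user(void *to, const void *from, size_t n)
    {
            if (cpu_has_erms)
                    return copy_user_enhanced_fast_string(to, from, n);
            if (cpu_has_rep_good)
                    return copy_user_generic_string(to, from, n);
            return copy_user_generic_unrolled(to, from, n);
    }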
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index f0dba36578ea..fb903b758da8 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -1,6 +1,6 @@
 /*
- * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file COPYING in the main directory of this archive
  * for more details. No warranty for anything given at all.
@@ -11,82 +11,82 @@
 
 /*
  * Checksum copy with exception handling.
  * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
  * destination is zeroed.
  *
  * Input
  * rdi  source
  * rsi  destination
  * edx  len (32bit)
  * ecx  sum (32bit)
  * r8   src_err_ptr (int)
  * r9   dst_err_ptr (int)
  *
  * Output
  * eax  64bit sum. undefined in case of exception.
  *
  * Wrappers need to take care of valid exception sum and zeroing.
  * They also should align source or destination to 8 bytes.
  */
 
 	.macro source
10:
-	.section __ex_table,"a"
+	.section __ex_table, "a"
 	.align 8
-	.quad 10b,.Lbad_source
+	.quad 10b, .Lbad_source
 	.previous
 	.endm
 
 	.macro dest
20:
-	.section __ex_table,"a"
+	.section __ex_table, "a"
 	.align 8
-	.quad 20b,.Lbad_dest
+	.quad 20b, .Lbad_dest
 	.previous
 	.endm
 
 	.macro ignore L=.Lignore
30:
-	.section __ex_table,"a"
+	.section __ex_table, "a"
 	.align 8
-	.quad 30b,\L
+	.quad 30b, \L
 	.previous
 	.endm
 
 
 ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
-	cmpl $3*64,%edx
+	cmpl $3*64, %edx
 	jle .Lignore
 
 .Lignore:
-	subq $7*8,%rsp
+	subq $7*8, %rsp
 	CFI_ADJUST_CFA_OFFSET 7*8
-	movq %rbx,2*8(%rsp)
+	movq %rbx, 2*8(%rsp)
 	CFI_REL_OFFSET rbx, 2*8
-	movq %r12,3*8(%rsp)
+	movq %r12, 3*8(%rsp)
 	CFI_REL_OFFSET r12, 3*8
-	movq %r14,4*8(%rsp)
+	movq %r14, 4*8(%rsp)
 	CFI_REL_OFFSET r14, 4*8
-	movq %r13,5*8(%rsp)
+	movq %r13, 5*8(%rsp)
 	CFI_REL_OFFSET r13, 5*8
-	movq %rbp,6*8(%rsp)
+	movq %rbp, 6*8(%rsp)
 	CFI_REL_OFFSET rbp, 6*8
 
-	movq %r8,(%rsp)
-	movq %r9,1*8(%rsp)
-
-	movl %ecx,%eax
-	movl %edx,%ecx
+	movq %r8, (%rsp)
+	movq %r9, 1*8(%rsp)
 
-	xorl %r9d,%r9d
-	movq %rcx,%r12
+	movl %ecx, %eax
+	movl %edx, %ecx
 
-	shrq $6,%r12
-	jz .Lhandle_tail	/* < 64 */
+	xorl %r9d, %r9d
+	movq %rcx, %r12
+
+	shrq $6, %r12
+	jz	.Lhandle_tail	/* < 64 */
 
 	clc
 
 	/* main loop. clear in 64 byte blocks */
 	/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
 	/* r11: temp3, rdx: temp4, r12 loopcnt */
@@ -94,156 +94,156 @@ ENTRY(csum_partial_copy_generic)
 	.p2align 4
 .Lloop:
 	source
-	movq (%rdi),%rbx
+	movq (%rdi), %rbx
 	source
-	movq 8(%rdi),%r8
+	movq 8(%rdi), %r8
 	source
-	movq 16(%rdi),%r11
+	movq 16(%rdi), %r11
 	source
-	movq 24(%rdi),%rdx
+	movq 24(%rdi), %rdx
 
 	source
-	movq 32(%rdi),%r10
+	movq 32(%rdi), %r10
 	source
-	movq 40(%rdi),%rbp
+	movq 40(%rdi), %rbp
 	source
-	movq 48(%rdi),%r14
+	movq 48(%rdi), %r14
 	source
-	movq 56(%rdi),%r13
+	movq 56(%rdi), %r13
 
 	ignore 2f
 	prefetcht0 5*64(%rdi)
2:
-	adcq %rbx,%rax
-	adcq %r8,%rax
-	adcq %r11,%rax
-	adcq %rdx,%rax
-	adcq %r10,%rax
-	adcq %rbp,%rax
-	adcq %r14,%rax
-	adcq %r13,%rax
+	adcq %rbx, %rax
+	adcq %r8, %rax
+	adcq %r11, %rax
+	adcq %rdx, %rax
+	adcq %r10, %rax
+	adcq %rbp, %rax
+	adcq %r14, %rax
+	adcq %r13, %rax
 
 	decl %r12d
 
 	dest
-	movq %rbx,(%rsi)
+	movq %rbx, (%rsi)
 	dest
-	movq %r8,8(%rsi)
+	movq %r8, 8(%rsi)
 	dest
-	movq %r11,16(%rsi)
+	movq %r11, 16(%rsi)
 	dest
-	movq %rdx,24(%rsi)
+	movq %rdx, 24(%rsi)
 
 	dest
-	movq %r10,32(%rsi)
+	movq %r10, 32(%rsi)
 	dest
-	movq %rbp,40(%rsi)
+	movq %rbp, 40(%rsi)
 	dest
-	movq %r14,48(%rsi)
+	movq %r14, 48(%rsi)
 	dest
-	movq %r13,56(%rsi)
+	movq %r13, 56(%rsi)
 
3:
-
-	leaq 64(%rdi),%rdi
-	leaq 64(%rsi),%rsi
 
-	jnz .Lloop
+	leaq 64(%rdi), %rdi
+	leaq 64(%rsi), %rsi
 
-	adcq %r9,%rax
+	jnz	.Lloop
 
-	/* do last upto 56 bytes */
+	adcq	%r9, %rax
+
+	/* do last up to 56 bytes */
 .Lhandle_tail:
 	/* ecx: count */
-	movl %ecx,%r10d
-	andl $63,%ecx
-	shrl $3,%ecx
+	movl %ecx, %r10d
+	andl $63, %ecx
+	shrl $3, %ecx
 	jz .Lfold
 	clc
 	.p2align 4
 .Lloop_8:
 	source
-	movq (%rdi),%rbx
-	adcq %rbx,%rax
+	movq (%rdi), %rbx
+	adcq %rbx, %rax
 	decl %ecx
 	dest
-	movq %rbx,(%rsi)
-	leaq 8(%rsi),%rsi /* preserve carry */
-	leaq 8(%rdi),%rdi
+	movq %rbx, (%rsi)
+	leaq 8(%rsi), %rsi /* preserve carry */
+	leaq 8(%rdi), %rdi
 	jnz .Lloop_8
-	adcq %r9,%rax	/* add in carry */
+	adcq %r9, %rax	/* add in carry */
 
 .Lfold:
 	/* reduce checksum to 32bits */
-	movl %eax,%ebx
-	shrq $32,%rax
-	addl %ebx,%eax
-	adcl %r9d,%eax
+	movl %eax, %ebx
+	shrq $32, %rax
+	addl %ebx, %eax
+	adcl %r9d, %eax
 
-	/* do last upto 6 bytes */
+	/* do last up to 6 bytes */
 .Lhandle_7:
-	movl %r10d,%ecx
-	andl $7,%ecx
-	shrl $1,%ecx
+	movl %r10d, %ecx
+	andl $7, %ecx
+	shrl $1, %ecx
 	jz .Lhandle_1
-	movl $2,%edx
-	xorl %ebx,%ebx
+	movl $2, %edx
+	xorl %ebx, %ebx
 	clc
 	.p2align 4
 .Lloop_1:
 	source
-	movw (%rdi),%bx
-	adcl %ebx,%eax
+	movw (%rdi), %bx
+	adcl %ebx, %eax
 	decl %ecx
 	dest
-	movw %bx,(%rsi)
-	leaq 2(%rdi),%rdi
-	leaq 2(%rsi),%rsi
+	movw %bx, (%rsi)
+	leaq 2(%rdi), %rdi
+	leaq 2(%rsi), %rsi
 	jnz .Lloop_1
-	adcl %r9d,%eax	/* add in carry */
+	adcl %r9d, %eax	/* add in carry */
 
 	/* handle last odd byte */
 .Lhandle_1:
-	testl $1,%r10d
+	testl $1, %r10d
 	jz .Lende
-	xorl %ebx,%ebx
+	xorl %ebx, %ebx
 	source
-	movb (%rdi),%bl
+	movb (%rdi), %bl
 	dest
-	movb %bl,(%rsi)
-	addl %ebx,%eax
-	adcl %r9d,%eax	/* carry */
+	movb %bl, (%rsi)
+	addl %ebx, %eax
+	adcl %r9d, %eax		/* carry */
 
 	CFI_REMEMBER_STATE
 .Lende:
-	movq 2*8(%rsp),%rbx
+	movq 2*8(%rsp), %rbx
 	CFI_RESTORE rbx
-	movq 3*8(%rsp),%r12
+	movq 3*8(%rsp), %r12
 	CFI_RESTORE r12
-	movq 4*8(%rsp),%r14
+	movq 4*8(%rsp), %r14
 	CFI_RESTORE r14
-	movq 5*8(%rsp),%r13
+	movq 5*8(%rsp), %r13
 	CFI_RESTORE r13
-	movq 6*8(%rsp),%rbp
+	movq 6*8(%rsp), %rbp
 	CFI_RESTORE rbp
-	addq $7*8,%rsp
+	addq $7*8, %rsp
 	CFI_ADJUST_CFA_OFFSET -7*8
 	ret
 	CFI_RESTORE_STATE
 
 	/* Exception handlers. Very simple, zeroing is done in the wrappers */
 .Lbad_source:
-	movq (%rsp),%rax
-	testq %rax,%rax
+	movq (%rsp), %rax
+	testq %rax, %rax
 	jz .Lende
-	movl $-EFAULT,(%rax)
+	movl $-EFAULT, (%rax)
 	jmp .Lende
 
 .Lbad_dest:
-	movq 8(%rsp),%rax
-	testq %rax,%rax
+	movq 8(%rsp), %rax
+	testq %rax, %rax
 	jz .Lende
-	movl $-EFAULT,(%rax)
+	movl $-EFAULT, (%rax)
 	jmp .Lende
 	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index bf51144d97e1..9845371c5c36 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -84,7 +84,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
 			count64--;
 		}
 
-		/* last upto 7 8byte blocks */
+		/* last up to 7 8byte blocks */
 		count %= 8;
 		while (count) {
 			asm("addq %1,%0\n\t"
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index ff485d361182..fc45ba887d05 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -121,7 +121,7 @@ inline void __const_udelay(unsigned long xloops)
 	asm("mull %%edx"
 		:"=d" (xloops), "=&a" (d0)
 		:"1" (xloops), "0"
-		(cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));
+		(this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4)));
 
 	__delay(++xloops);
 }
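
The delay.c hunk replaces an explicit CPU-id lookup with this_cpu_read(), which compiles down to a single %gs-relative load. A sketch of the equivalence, using only interfaces that appear in the hunk itself:

    /* Before: find this CPU's id, then index into the per-CPU data. */
    unsigned long lpj_old = cpu_data(raw_smp_processor_id()).loops_per_jiffy;

    /* After: one segment-relative load, no separate id lookup. */
    unsigned long lpj_new = this_cpu_read(cpu_info.loops_per_jiffy);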
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 5415a9d06f53..b908a59eccf5 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -22,22 +22,187 @@ EXPORT_SYMBOL(memset);
 
 void *memmove(void *dest, const void *src, size_t n)
 {
-	int d0, d1, d2;
-
-	if (dest < src) {
-		memcpy(dest, src, n);
-	} else {
-		__asm__ __volatile__(
-			"std\n\t"
-			"rep\n\t"
-			"movsb\n\t"
-			"cld"
-			: "=&c" (d0), "=&S" (d1), "=&D" (d2)
-			:"0" (n),
-			 "1" (n-1+src),
-			 "2" (n-1+dest)
-			:"memory");
-	}
-	return dest;
+	int d0,d1,d2,d3,d4,d5;
+	char *ret = dest;
+
+	__asm__ __volatile__(
+		/* Handle more 16bytes in loop */
+		"cmp $0x10, %0\n\t"
+		"jb	1f\n\t"
+
+		/* Decide forward/backward copy mode */
+		"cmp %2, %1\n\t"
+		"jb	2f\n\t"
+
+		/*
+		 * movs instruction have many startup latency
+		 * so we handle small size by general register.
+		 */
+		"cmp  $680, %0\n\t"
+		"jb	3f\n\t"
+		/*
+		 * movs instruction is only good for aligned case.
+		 */
+		"mov %1, %3\n\t"
+		"xor %2, %3\n\t"
+		"and $0xff, %3\n\t"
+		"jz 4f\n\t"
+		"3:\n\t"
+		"sub $0x10, %0\n\t"
+
+		/*
+		 * We gobble 16byts forward in each loop.
+		 */
+		"3:\n\t"
+		"sub $0x10, %0\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov 1*4(%1), %4\n\t"
+		"mov %3, 0*4(%2)\n\t"
+		"mov %4, 1*4(%2)\n\t"
+		"mov 2*4(%1), %3\n\t"
+		"mov 3*4(%1), %4\n\t"
+		"mov %3, 2*4(%2)\n\t"
+		"mov %4, 3*4(%2)\n\t"
+		"lea 0x10(%1), %1\n\t"
+		"lea 0x10(%2), %2\n\t"
+		"jae 3b\n\t"
+		"add $0x10, %0\n\t"
+		"jmp 1f\n\t"
+
+		/*
+		 * Handle data forward by movs.
+		 */
+		".p2align 4\n\t"
+		"4:\n\t"
+		"mov -4(%1, %0), %3\n\t"
+		"lea -4(%2, %0), %4\n\t"
+		"shr $2, %0\n\t"
+		"rep movsl\n\t"
+		"mov %3, (%4)\n\t"
+		"jmp 11f\n\t"
+		/*
+		 * Handle data backward by movs.
+		 */
+		".p2align 4\n\t"
+		"6:\n\t"
+		"mov (%1), %3\n\t"
+		"mov %2, %4\n\t"
+		"lea -4(%1, %0), %1\n\t"
+		"lea -4(%2, %0), %2\n\t"
+		"shr $2, %0\n\t"
+		"std\n\t"
+		"rep movsl\n\t"
+		"mov %3,(%4)\n\t"
+		"cld\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Start to prepare for backward copy.
+		 */
+		".p2align 4\n\t"
+		"2:\n\t"
+		"cmp $680, %0\n\t"
+		"jb 5f\n\t"
+		"mov %1, %3\n\t"
+		"xor %2, %3\n\t"
+		"and $0xff, %3\n\t"
+		"jz 6b\n\t"
+
+		/*
+		 * Calculate copy position to tail.
+		 */
+		"5:\n\t"
+		"add %0, %1\n\t"
+		"add %0, %2\n\t"
+		"sub $0x10, %0\n\t"
+
+		/*
+		 * We gobble 16byts backward in each loop.
+		 */
+		"7:\n\t"
+		"sub $0x10, %0\n\t"
+
+		"mov -1*4(%1), %3\n\t"
+		"mov -2*4(%1), %4\n\t"
+		"mov %3, -1*4(%2)\n\t"
+		"mov %4, -2*4(%2)\n\t"
+		"mov -3*4(%1), %3\n\t"
+		"mov -4*4(%1), %4\n\t"
+		"mov %3, -3*4(%2)\n\t"
+		"mov %4, -4*4(%2)\n\t"
+		"lea -0x10(%1), %1\n\t"
+		"lea -0x10(%2), %2\n\t"
+		"jae 7b\n\t"
+		/*
+		 * Calculate copy position to head.
+		 */
+		"add $0x10, %0\n\t"
+		"sub %0, %1\n\t"
+		"sub %0, %2\n\t"
+
+		/*
+		 * Move data from 8 bytes to 15 bytes.
+		 */
+		".p2align 4\n\t"
+		"1:\n\t"
+		"cmp $8, %0\n\t"
+		"jb 8f\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov 1*4(%1), %4\n\t"
+		"mov -2*4(%1, %0), %5\n\t"
+		"mov -1*4(%1, %0), %1\n\t"
+
+		"mov %3, 0*4(%2)\n\t"
+		"mov %4, 1*4(%2)\n\t"
+		"mov %5, -2*4(%2, %0)\n\t"
+		"mov %1, -1*4(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Move data from 4 bytes to 7 bytes.
+		 */
+		".p2align 4\n\t"
+		"8:\n\t"
+		"cmp $4, %0\n\t"
+		"jb 9f\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov -1*4(%1, %0), %4\n\t"
+		"mov %3, 0*4(%2)\n\t"
+		"mov %4, -1*4(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Move data from 2 bytes to 3 bytes.
+		 */
+		".p2align 4\n\t"
+		"9:\n\t"
+		"cmp $2, %0\n\t"
+		"jb 10f\n\t"
+		"movw 0*2(%1), %%dx\n\t"
+		"movw -1*2(%1, %0), %%bx\n\t"
+		"movw %%dx, 0*2(%2)\n\t"
+		"movw %%bx, -1*2(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/*
+		 * Move data for 1 byte.
+		 */
+		".p2align 4\n\t"
+		"10:\n\t"
+		"cmp $1, %0\n\t"
+		"jb 11f\n\t"
+		"movb (%1), %%cl\n\t"
+		"movb %%cl, (%2)\n\t"
+		".p2align 4\n\t"
+		"11:"
+		: "=&c" (d0), "=&S" (d1), "=&D" (d2),
+		  "=r" (d3),"=r" (d4), "=r"(d5)
+		:"0" (n),
+		 "1" (src),
+		 "2" (dest)
+		:"memory");
+
+	return ret;
+
 }
 EXPORT_SYMBOL(memmove);
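
Restated in C, the new 32-bit memmove() dispatches among four copy strategies. The sketch below shows only that dispatch; the helper prototypes are hypothetical stand-ins for the numbered asm labels, while the 16-byte cutoff, the 680-byte threshold, and the low-byte alignment test are taken directly from the asm above.

    #include <stddef.h>
    #include <stdint.h>

    void copy_forward_regs(char *d, const char *s, size_t n);  /* label 3 */
    void copy_forward_movs(char *d, const char *s, size_t n);  /* label 4 */
    void copy_backward_regs(char *d, const char *s, size_t n); /* label 7 */
    void copy_backward_movs(char *d, const char *s, size_t n); /* label 6 */
    void copy_small(char *d, const char *s, size_t n);         /* labels 1,8,9,10 */

    static void *memmove_sketch(void *dest, const void *src, size_t n)
    {
            char *d = dest;
            const char *s = src;
            /* rep movs pays off only for large, mutually aligned buffers */
            int use_movs = n >= 680 &&
                           (((uintptr_t)s ^ (uintptr_t)d) & 0xff) == 0;

            if (n < 0x10) {                 /* tiny: register moves only */
                    copy_small(d, s, n);
            } else if (s >= d) {            /* forward copy is safe */
                    if (use_movs)
                            copy_forward_movs(d, s, n);
                    else
                            copy_forward_regs(d, s, n);
            } else {                        /* overlap: start at the tail */
                    if (use_movs)
                            copy_backward_movs(d, s, n);
                    else
                            copy_backward_regs(d, s, n);
            }
            return dest;
    }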
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d5..efbf2a0ecdea 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * memcpy - Copy a memory block.
@@ -37,107 +38,173 @@
 .Lmemcpy_e:
 	.previous
 
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	rep movsb
+	ret
+.Lmemcpy_e_e:
+	.previous
+
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
+	movq %rdi, %rax
 
 	/*
-	 * Put the number of full 64-byte blocks into %ecx.
-	 * Tail portion is handled at the end:
+	 * Use 32bit CMP here to avoid long NOP padding.
 	 */
-	movq %rdi, %rax
-	movl %edx, %ecx
-	shrl $6, %ecx
-	jz .Lhandle_tail
+	cmp $0x20, %edx
+	jb .Lhandle_tail
 
-	.p2align 4
-.Lloop_64:
 	/*
-	 * We decrement the loop index here - and the zero-flag is
-	 * checked at the end of the loop (instructions inbetween do
-	 * not change the zero flag):
+	 * We check whether memory false dependence could occur,
+	 * then jump to corresponding copy mode.
 	 */
-	decl %ecx
+	cmp %dil, %sil
+	jl .Lcopy_backward
+	subl $0x20, %edx
+.Lcopy_forward_loop:
+	subq $0x20, %rdx
 
 	/*
-	 * Move in blocks of 4x16 bytes:
+	 * Move in blocks of 4x8 bytes:
 	 */
-	movq 0*8(%rsi), %r11
-	movq 1*8(%rsi), %r8
-	movq %r11, 0*8(%rdi)
-	movq %r8, 1*8(%rdi)
-
-	movq 2*8(%rsi), %r9
-	movq 3*8(%rsi), %r10
-	movq %r9, 2*8(%rdi)
-	movq %r10, 3*8(%rdi)
-
-	movq 4*8(%rsi), %r11
-	movq 5*8(%rsi), %r8
-	movq %r11, 4*8(%rdi)
-	movq %r8, 5*8(%rdi)
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	leaq 4*8(%rsi), %rsi
+
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, 2*8(%rdi)
+	movq %r11, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae .Lcopy_forward_loop
+	addq $0x20, %rdx
+	jmp .Lhandle_tail
 
-	movq 6*8(%rsi), %r9
-	movq 7*8(%rsi), %r10
-	movq %r9, 6*8(%rdi)
-	movq %r10, 7*8(%rdi)
-
-	leaq 64(%rsi), %rsi
-	leaq 64(%rdi), %rdi
-
-	jnz .Lloop_64
+.Lcopy_backward:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * At most 3 ALU operations in one cycle,
+	 * so append NOPS in the same 16bytes trunk.
+	 */
+	.p2align 4
+.Lcopy_backward_loop:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r8
+	movq -2*8(%rsi), %r9
+	movq -3*8(%rsi), %r10
+	movq -4*8(%rsi), %r11
+	leaq -4*8(%rsi), %rsi
+	movq %r8, -1*8(%rdi)
+	movq %r9, -2*8(%rdi)
+	movq %r10, -3*8(%rdi)
+	movq %r11, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae .Lcopy_backward_loop
 
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
 .Lhandle_tail:
-	movl %edx, %ecx
-	andl $63, %ecx
-	shrl $3, %ecx
-	jz .Lhandle_7
+	cmpq $16, %rdx
+	jb .Lless_16bytes
 
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq 1*8(%rsi), %r9
+	movq -2*8(%rsi, %rdx), %r10
+	movq -1*8(%rsi, %rdx), %r11
+	movq %r8, 0*8(%rdi)
+	movq %r9, 1*8(%rdi)
+	movq %r10, -2*8(%rdi, %rdx)
+	movq %r11, -1*8(%rdi, %rdx)
+	retq
 	.p2align 4
-.Lloop_8:
-	decl %ecx
-	movq (%rsi), %r8
-	movq %r8, (%rdi)
-	leaq 8(%rdi), %rdi
-	leaq 8(%rsi), %rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx, %ecx
-	andl $7, %ecx
-	jz .Lend
+.Lless_16bytes:
+	cmpq $8, %rdx
+	jb .Lless_8bytes
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r8
+	movq -1*8(%rsi, %rdx), %r9
+	movq %r8, 0*8(%rdi)
+	movq %r9, -1*8(%rdi, %rdx)
+	retq
+	.p2align 4
+.Lless_8bytes:
+	cmpq $4, %rdx
+	jb .Lless_3bytes
 
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %ecx
+	movl -4(%rsi, %rdx), %r8d
+	movl %ecx, (%rdi)
+	movl %r8d, -4(%rdi, %rdx)
+	retq
 	.p2align 4
+.Lless_3bytes:
+	cmpl $0, %edx
+	je .Lend
+	/*
+	 * Move data from 1 bytes to 3 bytes.
+	 */
 .Lloop_1:
 	movb (%rsi), %r8b
 	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
-	decl %ecx
+	decl %edx
 	jnz .Lloop_1
 
 .Lend:
-	ret
+	retq
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
 /*
- * Some CPUs run faster using the string copy instructions.
- * It is also a lot simpler. Use this when possible:
- */
-
-.section .altinstructions, "a"
-	.align 8
-	.quad memcpy
-	.quad .Lmemcpy_c
-	.word X86_FEATURE_REP_GOOD
-
-/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+ * If the feature is supported, memcpy_c_e() is the first choice.
+ * If enhanced rep movsb copy is not available, use fast string copy
+ * memcpy_c() when possible. This is faster and code is simpler than
+ * original memcpy().
+ * Otherwise, original memcpy() is used.
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ *
  * Replace only beginning, memcpy is used to apply alternatives,
  * so it is silly to overwrite itself with nops - reboot is the
  * only outcome...
  */
-	.byte .Lmemcpy_e - .Lmemcpy_c
-	.byte .Lmemcpy_e - .Lmemcpy_c
+.section .altinstructions, "a"
+	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
 .previous
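
The rewritten tail handling replaces the old 8-byte loop with possibly-overlapping wide moves: for any count from 16 through 31, the first 16 and the last 16 bytes of the buffer together cover every byte, so four 8-byte loads plus four 8-byte stores finish the job with no loop. A self-contained C sketch of that trick:

    #include <stdint.h>
    #include <string.h>

    /* Copy n bytes, 16 <= n <= 31, exactly as .Lhandle_tail does:
     * two 8-byte words from the front, two from the back; the windows
     * may overlap in the middle, which is harmless for a plain copy. */
    static void copy_16_to_31(uint8_t *dst, const uint8_t *src, size_t n)
    {
            uint64_t a, b, c, d;

            memcpy(&a, src, 8);             /* movq  0*8(%rsi), %r8       */
            memcpy(&b, src + 8, 8);         /* movq  1*8(%rsi), %r9       */
            memcpy(&c, src + n - 16, 8);    /* movq -2*8(%rsi,%rdx), %r10 */
            memcpy(&d, src + n - 8, 8);     /* movq -1*8(%rsi,%rdx), %r11 */
            memcpy(dst, &a, 8);
            memcpy(dst + 8, &b, 8);
            memcpy(dst + n - 16, &c, 8);
            memcpy(dst + n - 8, &d, 8);
    }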
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 000000000000..d0ec9c2936d7
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,224 @@
+/*
+ * Normally compiler builtins are used, but sometimes the compiler calls out
+ * of line code. Based on asm-i386/string.h.
+ *
+ * This assembly file is re-written from memmove_64.c file.
+ *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
+ */
+#define _STRING_C
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+
+#undef memmove
+
+/*
+ * Implement memmove(). This can handle overlap between src and dst.
+ *
+ * Input:
+ * rdi: dest
+ * rsi: src
+ * rdx: count
+ *
+ * Output:
+ * rax: dest
+ */
+ENTRY(memmove)
+	CFI_STARTPROC
+
+	/* Handle more 32bytes in loop */
+	mov %rdi, %rax
+	cmp $0x20, %rdx
+	jb	1f
+
+	/* Decide forward/backward copy mode */
+	cmp %rdi, %rsi
+	jge .Lmemmove_begin_forward
+	mov %rsi, %r8
+	add %rdx, %r8
+	cmp %rdi, %r8
+	jg 2f
+
+.Lmemmove_begin_forward:
+	/*
+	 * movsq instruction have many startup latency
+	 * so we handle small size by general register.
+	 */
+	cmp  $680, %rdx
+	jb	3f
+	/*
+	 * movsq instruction is only good for aligned case.
+	 */
+
+	cmpb %dil, %sil
+	je 4f
+3:
+	sub $0x20, %rdx
+	/*
+	 * We gobble 32byts forward in each loop.
+	 */
+5:
+	sub $0x20, %rdx
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq 2*8(%rsi), %r9
+	movq 3*8(%rsi), %r8
+	leaq 4*8(%rsi), %rsi
+
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, 2*8(%rdi)
+	movq %r8, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae 5b
+	addq $0x20, %rdx
+	jmp 1f
+	/*
+	 * Handle data forward by movsq.
+	 */
+	.p2align 4
+4:
+	movq %rdx, %rcx
+	movq -8(%rsi, %rdx), %r11
+	lea -8(%rdi, %rdx), %r10
+	shrq $3, %rcx
+	rep movsq
+	movq %r11, (%r10)
+	jmp 13f
+.Lmemmove_end_forward:
+
+	/*
+	 * Handle data backward by movsq.
+	 */
+	.p2align 4
+7:
+	movq %rdx, %rcx
+	movq (%rsi), %r11
+	movq %rdi, %r10
+	leaq -8(%rsi, %rdx), %rsi
+	leaq -8(%rdi, %rdx), %rdi
+	shrq $3, %rcx
+	std
+	rep movsq
+	cld
+	movq %r11, (%r10)
+	jmp 13f
+
+	/*
+	 * Start to prepare for backward copy.
+	 */
+	.p2align 4
+2:
+	cmp $680, %rdx
+	jb 6f
+	cmp %dil, %sil
+	je 7b
+6:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * We gobble 32byts backward in each loop.
+	 */
+8:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r11
+	movq -2*8(%rsi), %r10
+	movq -3*8(%rsi), %r9
+	movq -4*8(%rsi), %r8
+	leaq -4*8(%rsi), %rsi
+
+	movq %r11, -1*8(%rdi)
+	movq %r10, -2*8(%rdi)
+	movq %r9, -3*8(%rdi)
+	movq %r8, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae 8b
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
+1:
+	cmpq $16, %rdx
+	jb 9f
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq -2*8(%rsi, %rdx), %r9
+	movq -1*8(%rsi, %rdx), %r8
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, -2*8(%rdi, %rdx)
+	movq %r8, -1*8(%rdi, %rdx)
+	jmp 13f
+	.p2align 4
+9:
+	cmpq $8, %rdx
+	jb 10f
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r11
+	movq -1*8(%rsi, %rdx), %r10
+	movq %r11, 0*8(%rdi)
+	movq %r10, -1*8(%rdi, %rdx)
+	jmp 13f
+10:
+	cmpq $4, %rdx
+	jb 11f
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %r11d
+	movl -4(%rsi, %rdx), %r10d
+	movl %r11d, (%rdi)
+	movl %r10d, -4(%rdi, %rdx)
+	jmp 13f
+11:
+	cmp $2, %rdx
+	jb 12f
+	/*
+	 * Move data from 2 bytes to 3 bytes.
+	 */
+	movw (%rsi), %r11w
+	movw -2(%rsi, %rdx), %r10w
+	movw %r11w, (%rdi)
+	movw %r10w, -2(%rdi, %rdx)
+	jmp 13f
+12:
+	cmp $1, %rdx
+	jb 13f
+	/*
+	 * Move data for 1 byte.
+	 */
+	movb (%rsi), %r11b
+	movb %r11b, (%rdi)
+13:
+	retq
+	CFI_ENDPROC
+
+	.section .altinstr_replacement,"ax"
+.Lmemmove_begin_forward_efs:
+	/* Forward moving data. */
+	movq %rdx, %rcx
+	rep movsb
+	retq
+.Lmemmove_end_forward_efs:
+	.previous
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad .Lmemmove_begin_forward
+	.quad .Lmemmove_begin_forward_efs
+	.word X86_FEATURE_ERMS
+	.byte .Lmemmove_end_forward-.Lmemmove_begin_forward
+	.byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+	.previous
+ENDPROC(memmove)
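
One detail worth noting: the .altinstructions record above covers only the .Lmemmove_begin_forward ... .Lmemmove_end_forward span, because rep movsb only copies upward; the backward path for overlapping buffers must survive ERMS patching untouched. The forward/backward decision at the top of memmove() reduces to this predicate (hedged C restatement of the cmp/jge and the src + count vs. dest check):

    #include <stdbool.h>
    #include <stddef.h>

    /* A forward copy is safe when the source starts at or after the
     * destination, or when the two regions do not overlap at all. */
    static bool can_copy_forward(const char *dest, const char *src, size_t n)
    {
            return src >= dest || src + n <= dest;
    }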
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 0a33909bf122..000000000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Normally compiler builtins are used, but sometimes the compiler calls out
-   of line code. Based on asm-i386/string.h.
- */
-#define _STRING_C
-#include <linux/string.h>
-#include <linux/module.h>
-
-#undef memmove
-void *memmove(void *dest, const void *src, size_t count)
-{
-	if (dest < src) {
-		return memcpy(dest, src, count);
-	} else {
-		char *p = dest + count;
-		const char *s = src + count;
-		while (count--)
-			*--p = *--s;
-	}
-	return dest;
-}
-EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 09d344269652..79bd454b78a3 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
 /*
- * ISO C memset - set a memory block to a byte value.
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the orignal function as well.
  *
  * rdi destination
  * rsi value (char)
@@ -31,6 +35,28 @@
 .Lmemset_e:
 	.previous
 
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemset_c_e:
+	movq %rdi,%r9
+	movb %sil,%al
+	movl %edx,%ecx
+	rep stosb
+	movq %r9,%rax
+	ret
+.Lmemset_e_e:
+	.previous
+
 ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC
@@ -112,16 +138,20 @@ ENTRY(__memset)
 ENDPROC(memset)
 ENDPROC(__memset)
 
-/* Some CPUs run faster using the string instructions.
-   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
+/* Some CPUs support enhanced REP MOVSB/STOSB feature.
+ * It is recommended to use this when possible.
+ *
+ * If enhanced REP MOVSB/STOSB feature is not available, use fast string
+ * instructions.
+ *
+ * Otherwise, use original memset function.
+ *
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ */
 	.section .altinstructions,"a"
-	.align 8
-	.quad memset
-	.quad .Lmemset_c
-	.word X86_FEATURE_REP_GOOD
-	.byte .Lfinal - memset
-	.byte .Lmemset_e - .Lmemset_c
+	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
+	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
 	.previous
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
index 41fcf00e49df..67743977398b 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem_64.S
@@ -23,43 +23,50 @@
 #include <asm/dwarf2.h>
 
 #define save_common_regs \
-	pushq %rdi; \
-	pushq %rsi; \
-	pushq %rcx; \
-	pushq %r8; \
-	pushq %r9; \
-	pushq %r10; \
-	pushq %r11
+	pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
+	pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
+	pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
+	pushq_cfi %r8;  CFI_REL_OFFSET r8,  0; \
+	pushq_cfi %r9;  CFI_REL_OFFSET r9,  0; \
+	pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
+	pushq_cfi %r11; CFI_REL_OFFSET r11, 0
 
 #define restore_common_regs \
-	popq %r11; \
-	popq %r10; \
-	popq %r9; \
-	popq %r8; \
-	popq %rcx; \
-	popq %rsi; \
-	popq %rdi
+	popq_cfi %r11; CFI_RESTORE r11; \
+	popq_cfi %r10; CFI_RESTORE r10; \
+	popq_cfi %r9;  CFI_RESTORE r9; \
+	popq_cfi %r8;  CFI_RESTORE r8; \
+	popq_cfi %rcx; CFI_RESTORE rcx; \
+	popq_cfi %rsi; CFI_RESTORE rsi; \
+	popq_cfi %rdi; CFI_RESTORE rdi
 
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_down_read_failed)
+	CFI_STARTPROC
 	save_common_regs
-	pushq %rdx
+	pushq_cfi %rdx
+	CFI_REL_OFFSET rdx, 0
 	movq %rax,%rdi
 	call rwsem_down_read_failed
-	popq %rdx
+	popq_cfi %rdx
+	CFI_RESTORE rdx
 	restore_common_regs
 	ret
-	ENDPROC(call_rwsem_down_read_failed)
+	CFI_ENDPROC
+ENDPROC(call_rwsem_down_read_failed)
 
 ENTRY(call_rwsem_down_write_failed)
+	CFI_STARTPROC
 	save_common_regs
 	movq %rax,%rdi
 	call rwsem_down_write_failed
 	restore_common_regs
 	ret
-	ENDPROC(call_rwsem_down_write_failed)
+	CFI_ENDPROC
+ENDPROC(call_rwsem_down_write_failed)
 
 ENTRY(call_rwsem_wake)
+	CFI_STARTPROC
 	decl %edx	/* do nothing if still outstanding active readers */
 	jnz 1f
 	save_common_regs
@@ -67,15 +74,20 @@ ENTRY(call_rwsem_wake)
 	call rwsem_wake
 	restore_common_regs
 1:	ret
-	ENDPROC(call_rwsem_wake)
+	CFI_ENDPROC
+ENDPROC(call_rwsem_wake)
 
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_downgrade_wake)
+	CFI_STARTPROC
 	save_common_regs
-	pushq %rdx
+	pushq_cfi %rdx
+	CFI_REL_OFFSET rdx, 0
 	movq %rax,%rdi
 	call rwsem_downgrade_wake
-	popq %rdx
+	popq_cfi %rdx
+	CFI_RESTORE rdx
 	restore_common_regs
 	ret
-	ENDPROC(call_rwsem_downgrade_wake)
+	CFI_ENDPROC
+ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 648fe4741782..06691daa4108 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -36,7 +36,7 @@
  */
 #ifdef CONFIG_SMP
 ENTRY(__write_lock_failed)
-	CFI_STARTPROC simple
+	CFI_STARTPROC
 	FRAME
 2: 	LOCK_PREFIX
 	addl	$ RW_LOCK_BIAS,(%eax)
@@ -74,29 +74,23 @@ ENTRY(__read_lock_failed)
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_down_read_failed)
 	CFI_STARTPROC
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx,0
-	push %edx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edx
 	CFI_REL_OFFSET edx,0
 	call rwsem_down_read_failed
-	pop %edx
-	CFI_ADJUST_CFA_OFFSET -4
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edx
+	popl_cfi %ecx
 	ret
 	CFI_ENDPROC
 ENDPROC(call_rwsem_down_read_failed)
 
 ENTRY(call_rwsem_down_write_failed)
 	CFI_STARTPROC
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx,0
 	calll rwsem_down_write_failed
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ecx
 	ret
 	CFI_ENDPROC
 ENDPROC(call_rwsem_down_write_failed)
@@ -105,12 +99,10 @@ ENTRY(call_rwsem_wake)
 	CFI_STARTPROC
 	decw %dx	/* do nothing if still outstanding active readers */
 	jnz 1f
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx,0
 	call rwsem_wake
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %ecx
 1:	ret
 	CFI_ENDPROC
 ENDPROC(call_rwsem_wake)
@@ -118,17 +110,13 @@ ENTRY(call_rwsem_wake)
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_downgrade_wake)
 	CFI_STARTPROC
-	push %ecx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %ecx
 	CFI_REL_OFFSET ecx,0
-	push %edx
-	CFI_ADJUST_CFA_OFFSET 4
+	pushl_cfi %edx
 	CFI_REL_OFFSET edx,0
 	call rwsem_downgrade_wake
-	pop %edx
-	CFI_ADJUST_CFA_OFFSET -4
-	pop %ecx
-	CFI_ADJUST_CFA_OFFSET -4
+	popl_cfi %edx
+	popl_cfi %ecx
 	ret
 	CFI_ENDPROC
 ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 650b11e00ecc..2930ae05d773 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -7,24 +7,6 @@
 
 #include <linux/linkage.h>
 
-#define ARCH_TRACE_IRQS_ON			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_on;			\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
-#define ARCH_TRACE_IRQS_OFF			\
-	pushl %eax;				\
-	pushl %ecx;				\
-	pushl %edx;				\
-	call trace_hardirqs_off;		\
-	popl %edx;				\
-	popl %ecx;				\
-	popl %eax;
-
 #ifdef CONFIG_TRACE_IRQFLAGS
 	/* put return address in eax (arg1) */
 	.macro thunk_ra name,func
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index bf9a7d5a5428..782b082c9ff7 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -22,26 +22,6 @@
 	CFI_ENDPROC
 	.endm
 
-	/* rdi:	arg1 ... normal C conventions. rax is passed from C. */
-	.macro thunk_retrax name,func
-	.globl \name
-\name:
-	CFI_STARTPROC
-	SAVE_ARGS
-	call \func
-	jmp  restore_norax
-	CFI_ENDPROC
-	.endm
-
-
-	.section .sched.text, "ax"
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-	thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
-	thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
-	thunk rwsem_wake_thunk,rwsem_wake
-	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
-#endif
-
 #ifdef CONFIG_TRACE_IRQFLAGS
 	/* put return address in rdi (arg1) */
 	.macro thunk_ra name,func
@@ -72,10 +52,3 @@ restore:
 	RESTORE_ARGS
 	ret
 	CFI_ENDPROC
-
-	CFI_STARTPROC
-	SAVE_ARGS
-restore_norax:
-	RESTORE_ARGS 1
-	ret
-	CFI_ENDPROC