Diffstat (limited to 'arch/alpha/lib/memcpy.c')
-rw-r--r--   arch/alpha/lib/memcpy.c   163
1 file changed, 163 insertions, 0 deletions
diff --git a/arch/alpha/lib/memcpy.c b/arch/alpha/lib/memcpy.c
new file mode 100644
index 000000000000..64083fc73238
--- /dev/null
+++ b/arch/alpha/lib/memcpy.c
@@ -0,0 +1,163 @@
/*
 * linux/arch/alpha/lib/memcpy.c
 *
 * Copyright (C) 1995 Linus Torvalds
 */

/*
 * This is a reasonably optimized memcpy() routine.
 */

/*
 * Note that the C code is written to be optimized into good assembly. However,
 * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an
 * explicit compare against 0 (instead of just using the proper "blt reg, xx" or
 * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually..
 */

#include <linux/types.h>

/*
 * This should be done in one go with ldq_u*2/mask/stq_u. Do it
 * with a macro so that we can fix it up later..
 */
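/*
 * (ldq_u loads the aligned quadword containing an address, ignoring the
 * low three bits, so a pair of such loads plus extract/mask instructions
 * can assemble any unaligned quadword without taking alignment traps.)
 */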
#define ALIGN_DEST_TO8_UP(d,s,n) \
        while (d & 7) { \
                if (n <= 0) return; \
                n--; \
                *(char *) d = *(char *) s; \
                d++; s++; \
        }
#define ALIGN_DEST_TO8_DN(d,s,n) \
        while (d & 7) { \
                if (n <= 0) return; \
                n--; \
                d--; s--; \
                *(char *) d = *(char *) s; \
        }

/*
 * This should similarly be done with ldq_u*2/mask/stq. The destination
 * is aligned, but we don't fill in a full quad-word
 */
#define DO_REST_UP(d,s,n) \
        while (n > 0) { \
                n--; \
                *(char *) d = *(char *) s; \
                d++; s++; \
        }
#define DO_REST_DN(d,s,n) \
        while (n > 0) { \
                n--; \
                d--; s--; \
                *(char *) d = *(char *) s; \
        }

/*
 * This should be done with ldq/mask/stq. The source and destination are
 * aligned, but we don't fill in a full quad-word
 */
#define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n)
#define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n)

/*
 * This does unaligned memory copies. We want to avoid storing to
 * an unaligned address, as that would do a read-modify-write cycle.
 * We also want to avoid double-reading the unaligned reads.
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
static inline void __memcpy_unaligned_up (unsigned long d, unsigned long s,
                                          long n)
{
        ALIGN_DEST_TO8_UP(d,s,n);
        n -= 8;                 /* to avoid compare against 8 in the loop */
        if (n >= 0) {
                unsigned long low_word, high_word;
                __asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
                do {
                        unsigned long tmp;
                        __asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
                        n -= 8;
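                        /*
                         * extql shifts low_word down by the byte offset of s;
                         * extqh shifts high_word up by the complementary
                         * amount, so OR-ing the two rebuilds the unaligned
                         * quadword that starts at s.
                         */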
                        __asm__("extql %1,%2,%0"
                                :"=r" (low_word)
                                :"r" (low_word), "r" (s));
                        __asm__("extqh %1,%2,%0"
                                :"=r" (tmp)
                                :"r" (high_word), "r" (s));
                        s += 8;
                        *(unsigned long *) d = low_word | tmp;
                        d += 8;
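                        /*
                         * Carry the quadword just loaded into the next
                         * iteration, so each source word is read only once.
                         */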
                        low_word = high_word;
                } while (n >= 0);
        }
        n += 8;
        DO_REST_UP(d,s,n);
}

static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s,
                                          long n)
{
        /* I don't understand AXP assembler well enough for this. -Tim */
        s += n;
        d += n;
        while (n--)
                * (char *) --d = * (char *) --s;
}

/*
 * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
 * for the load-store. I don't know why, but it would seem that using a floating
 * point register for the move slows things down (very small difference,
 * though).
 *
 * Note the ordering to try to avoid load (and address generation) latencies.
 */
static inline void __memcpy_aligned_up (unsigned long d, unsigned long s,
                                        long n)
{
        ALIGN_DEST_TO8_UP(d,s,n);
        n -= 8;
        while (n >= 0) {
                unsigned long tmp;
                __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
                n -= 8;
                s += 8;
                *(unsigned long *) d = tmp;
                d += 8;
        }
        n += 8;
        DO_REST_ALIGNED_UP(d,s,n);
}
static inline void __memcpy_aligned_dn (unsigned long d, unsigned long s,
                                        long n)
{
        s += n;
        d += n;
        ALIGN_DEST_TO8_DN(d,s,n);
        n -= 8;
        while (n >= 0) {
                unsigned long tmp;
                s -= 8;
                __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
                n -= 8;
                d -= 8;
                *(unsigned long *) d = tmp;
        }
        n += 8;
        DO_REST_ALIGNED_DN(d,s,n);
}

void * memcpy(void * dest, const void *src, size_t n)
{
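        /*
         * If dest and src have the same alignment within a quadword
         * (low three address bits match), aligning the destination also
         * aligns the source, so the fully aligned copy can be used.
         */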
        if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) {
                __memcpy_aligned_up ((unsigned long) dest, (unsigned long) src,
                                     n);
                return dest;
        }
        __memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n);
        return dest;
}

/* For backward modules compatibility, define __memcpy. */
asm("__memcpy = memcpy; .globl __memcpy");