diff options
author | Stuart Menefy <stuart.menefy@st.com> | 2007-09-27 23:36:35 -0400 |
---|---|---|
committer | Paul Mundt <lethal@linux-sh.org> | 2007-09-27 23:36:35 -0400 |
commit | 023ef184fff6ac2e7cba345708f35536a2a419cb (patch) | |
tree | c80cc81eeb473a214fee0d2a0e952448071cf154 /arch | |
parent | 24eb17e0813490497f4d5b2fad218bdba402cece (diff) |
sh: __copy_user() optimizations for small copies.
This implements a fast path for small copies (fewer than 12 bytes).
The existing path is kept as the slow path and remains the default
behaviour for all other copy sizes.
Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/sh/mm/copy_page.S | 169 |
1 file changed, 108 insertions, 61 deletions
diff --git a/arch/sh/mm/copy_page.S b/arch/sh/mm/copy_page.S
index ae039f2da162..a81dbdb05596 100644
--- a/arch/sh/mm/copy_page.S
+++ b/arch/sh/mm/copy_page.S
@@ -141,47 +141,38 @@ ENTRY(__copy_user_page) | |||
141 | .long 9999b, 6000f ; \ | 141 | .long 9999b, 6000f ; \ |
142 | .previous | 142 | .previous |
143 | ENTRY(__copy_user) | 143 | ENTRY(__copy_user) |
144 | tst r6,r6 ! Check explicitly for zero | 144 | ! Check if small number of bytes |
145 | bf 1f | 145 | mov #11,r0 |
146 | rts | ||
147 | mov #0,r0 ! normal return | ||
148 | 1: | ||
149 | mov.l r10,@-r15 | ||
150 | mov.l r9,@-r15 | ||
151 | mov.l r8,@-r15 | ||
152 | mov r4,r3 | 146 | mov r4,r3 |
153 | add r6,r3 ! last destination address | 147 | cmp/gt r0,r6 ! r6 (len) > r0 (11) |
154 | mov #12,r0 ! Check if small number of bytes | 148 | bf/s .L_cleanup_loop_no_pop |
155 | cmp/gt r0,r6 | 149 | add r6,r3 ! last destination address |
156 | bt 2f | 150 | |
157 | bra .L_cleanup_loop | 151 | ! Calculate bytes needed to align to src |
158 | nop | 152 | mov.l r11,@-r15 |
159 | 2: | 153 | neg r5,r0 |
160 | neg r5,r0 ! Calculate bytes needed to align source | 154 | mov.l r10,@-r15 |
161 | add #4,r0 | 155 | add #4,r0 |
156 | mov.l r9,@-r15 | ||
162 | and #3,r0 | 157 | and #3,r0 |
158 | mov.l r8,@-r15 | ||
163 | tst r0,r0 | 159 | tst r0,r0 |
164 | bt .L_jump | 160 | bt 2f |
165 | mov r0,r1 | ||
166 | 161 | ||
167 | .L_loop1: | 162 | 1: |
168 | ! Copy bytes to align source | 163 | ! Copy bytes to long word align src |
169 | EX( mov.b @r5+,r0 ) | 164 | EX( mov.b @r5+,r1 ) |
170 | dt r1 | 165 | dt r0 |
171 | EX( mov.b r0,@r4 ) | ||
172 | add #-1,r6 | 166 | add #-1,r6 |
173 | bf/s .L_loop1 | 167 | EX( mov.b r1,@r4 ) |
168 | bf/s 1b | ||
174 | add #1,r4 | 169 | add #1,r4 |
175 | 170 | ||
176 | .L_jump: | 171 | ! Jump to appropriate routine depending on dest |
177 | mov r6,r2 ! Calculate number of longwords to copy | 172 | 2: mov #3,r1 |
173 | mov r6, r2 | ||
174 | and r4,r1 | ||
178 | shlr2 r2 | 175 | shlr2 r2 |
179 | tst r2,r2 | ||
180 | bt .L_cleanup | ||
181 | |||
182 | mov r4,r0 ! Jump to appropriate routine | ||
183 | and #3,r0 | ||
184 | mov r0,r1 | ||
185 | shll2 r1 | 176 | shll2 r1 |
186 | mova .L_jump_tbl,r0 | 177 | mova .L_jump_tbl,r0 |
187 | mov.l @(r0,r1),r1 | 178 | mov.l @(r0,r1),r1 |
@@ -195,43 +186,97 @@ EX( mov.b r0,@r4 ) | |||
195 | .long .L_dest10 | 186 | .long .L_dest10 |
196 | .long .L_dest11 | 187 | .long .L_dest11 |
197 | 188 | ||
189 | /* | ||
190 | * Come here if there are less than 12 bytes to copy | ||
191 | * | ||
192 | * Keep the branch target close, so the bf/s callee doesn't overflow | ||
193 | * and result in a more expensive branch being inserted. This is the | ||
194 | * fast-path for small copies, the jump via the jump table will hit the | ||
195 | * default slow-path cleanup. -PFM. | ||
196 | */ | ||
197 | .L_cleanup_loop_no_pop: | ||
198 | tst r6,r6 ! Check explicitly for zero | ||
199 | bt 1f | ||
200 | |||
201 | 2: | ||
202 | EX( mov.b @r5+,r0 ) | ||
203 | dt r6 | ||
204 | EX( mov.b r0,@r4 ) | ||
205 | bf/s 2b | ||
206 | add #1,r4 | ||
207 | |||
208 | 1: mov #0,r0 ! normal return | ||
209 | 5000: | ||
210 | |||
211 | # Exception handler: | ||
212 | .section .fixup, "ax" | ||
213 | 6000: | ||
214 | mov.l 8000f,r1 | ||
215 | mov r3,r0 | ||
216 | jmp @r1 | ||
217 | sub r4,r0 | ||
218 | .align 2 | ||
219 | 8000: .long 5000b | ||
220 | |||
221 | .previous | ||
222 | rts | ||
223 | nop | ||
224 | |||
198 | ! Destination = 00 | 225 | ! Destination = 00 |
199 | 226 | ||
200 | .L_dest00: | 227 | .L_dest00: |
201 | mov r2,r7 | 228 | ! Skip the large copy for small transfers |
202 | shlr2 r7 | 229 | mov #(32+32-4), r0 |
203 | shlr r7 | 230 | cmp/gt r6, r0 ! r0 (60) > r6 (len) |
204 | tst r7,r7 | 231 | bt 1f |
205 | mov #7,r0 | 232 | |
206 | bt/s 1f | 233 | ! Align dest to a 32 byte boundary |
207 | and r0,r2 | 234 | neg r4,r0 |
208 | .align 2 | 235 | add #0x20, r0 |
236 | and #0x1f, r0 | ||
237 | tst r0, r0 | ||
238 | bt 2f | ||
239 | |||
240 | sub r0, r6 | ||
241 | shlr2 r0 | ||
242 | 3: | ||
243 | EX( mov.l @r5+,r1 ) | ||
244 | dt r0 | ||
245 | EX( mov.l r1,@r4 ) | ||
246 | bf/s 3b | ||
247 | add #4,r4 | ||
248 | |||
209 | 2: | 249 | 2: |
210 | EX( mov.l @r5+,r0 ) | 250 | EX( mov.l @r5+,r0 ) |
251 | EX( mov.l @r5+,r1 ) | ||
252 | EX( mov.l @r5+,r2 ) | ||
253 | EX( mov.l @r5+,r7 ) | ||
211 | EX( mov.l @r5+,r8 ) | 254 | EX( mov.l @r5+,r8 ) |
212 | EX( mov.l @r5+,r9 ) | 255 | EX( mov.l @r5+,r9 ) |
213 | EX( mov.l @r5+,r10 ) | 256 | EX( mov.l @r5+,r10 ) |
214 | EX( mov.l r0,@r4 ) | 257 | EX( mov.l @r5+,r11 ) |
215 | EX( mov.l r8,@(4,r4) ) | 258 | EX( movca.l r0,@r4 ) |
216 | EX( mov.l r9,@(8,r4) ) | 259 | add #-32, r6 |
217 | EX( mov.l r10,@(12,r4) ) | 260 | EX( mov.l r1,@(4,r4) ) |
218 | EX( mov.l @r5+,r0 ) | 261 | mov #32, r0 |
219 | EX( mov.l @r5+,r8 ) | 262 | EX( mov.l r2,@(8,r4) ) |
220 | EX( mov.l @r5+,r9 ) | 263 | cmp/gt r6, r0 ! r0 (32) > r6 (len) |
221 | EX( mov.l @r5+,r10 ) | 264 | EX( mov.l r7,@(12,r4) ) |
222 | dt r7 | 265 | EX( mov.l r8,@(16,r4) ) |
223 | EX( mov.l r0,@(16,r4) ) | 266 | EX( mov.l r9,@(20,r4) ) |
224 | EX( mov.l r8,@(20,r4) ) | 267 | EX( mov.l r10,@(24,r4) ) |
225 | EX( mov.l r9,@(24,r4) ) | 268 | EX( mov.l r11,@(28,r4) ) |
226 | EX( mov.l r10,@(28,r4) ) | ||
227 | bf/s 2b | 269 | bf/s 2b |
228 | add #32,r4 | 270 | add #32,r4 |
229 | tst r2,r2 | 271 | |
272 | 1: mov r6, r0 | ||
273 | shlr2 r0 | ||
274 | tst r0, r0 | ||
230 | bt .L_cleanup | 275 | bt .L_cleanup |
231 | 1: | 276 | 1: |
232 | EX( mov.l @r5+,r0 ) | 277 | EX( mov.l @r5+,r1 ) |
233 | dt r2 | 278 | dt r0 |
234 | EX( mov.l r0,@r4 ) | 279 | EX( mov.l r1,@r4 ) |
235 | bf/s 1b | 280 | bf/s 1b |
236 | add #4,r4 | 281 | add #4,r4 |
237 | 282 | ||
@@ -250,7 +295,7 @@ EX( mov.l r0,@r4 ) | |||
250 | and r0,r2 | 295 | and r0,r2 |
251 | 2: | 296 | 2: |
252 | dt r7 | 297 | dt r7 |
253 | #ifdef __LITTLE_ENDIAN__ | 298 | #ifdef CONFIG_CPU_LITTLE_ENDIAN |
254 | EX( mov.l @r5+,r0 ) | 299 | EX( mov.l @r5+,r0 ) |
255 | EX( mov.l @r5+,r1 ) | 300 | EX( mov.l @r5+,r1 ) |
256 | EX( mov.l @r5+,r8 ) | 301 | EX( mov.l @r5+,r8 ) |
@@ -320,7 +365,7 @@ EX( mov.w r0,@(2,r4) ) | |||
320 | 1: ! Read longword, write two words per iteration | 365 | 1: ! Read longword, write two words per iteration |
321 | EX( mov.l @r5+,r0 ) | 366 | EX( mov.l @r5+,r0 ) |
322 | dt r2 | 367 | dt r2 |
323 | #ifdef __LITTLE_ENDIAN__ | 368 | #ifdef CONFIG_CPU_LITTLE_ENDIAN |
324 | EX( mov.w r0,@r4 ) | 369 | EX( mov.w r0,@r4 ) |
325 | shlr16 r0 | 370 | shlr16 r0 |
326 | EX( mov.w r0,@(2,r4) ) | 371 | EX( mov.w r0,@(2,r4) ) |
@@ -342,7 +387,7 @@ EX( mov.w r0,@r4 ) | |||
342 | ! Read longword, write byte, word, byte per iteration | 387 | ! Read longword, write byte, word, byte per iteration |
343 | EX( mov.l @r5+,r0 ) | 388 | EX( mov.l @r5+,r0 ) |
344 | dt r2 | 389 | dt r2 |
345 | #ifdef __LITTLE_ENDIAN__ | 390 | #ifdef CONFIG_CPU_LITTLE_ENDIAN |
346 | EX( mov.b r0,@r4 ) | 391 | EX( mov.b r0,@r4 ) |
347 | shlr8 r0 | 392 | shlr8 r0 |
348 | add #1,r4 | 393 | add #1,r4 |
@@ -379,6 +424,7 @@ EX( mov.b r0,@r4 ) | |||
379 | 424 | ||
380 | .L_exit: | 425 | .L_exit: |
381 | mov #0,r0 ! normal return | 426 | mov #0,r0 ! normal return |
427 | |||
382 | 5000: | 428 | 5000: |
383 | 429 | ||
384 | # Exception handler: | 430 | # Exception handler: |
@@ -394,5 +440,6 @@ EX( mov.b r0,@r4 ) | |||
394 | .previous | 440 | .previous |
395 | mov.l @r15+,r8 | 441 | mov.l @r15+,r8 |
396 | mov.l @r15+,r9 | 442 | mov.l @r15+,r9 |
443 | mov.l @r15+,r10 | ||
397 | rts | 444 | rts |
398 | mov.l @r15+,r10 | 445 | mov.l @r15+,r11 |