Diffstat (limited to 'arch/mips/lib/memcpy-inatomic.S')
-rw-r--r--	arch/mips/lib/memcpy-inatomic.S	141
1 file changed, 78 insertions(+), 63 deletions(-)
diff --git a/arch/mips/lib/memcpy-inatomic.S b/arch/mips/lib/memcpy-inatomic.S
index 3a534b2baa0f..736d0fb56a94 100644
--- a/arch/mips/lib/memcpy-inatomic.S
+++ b/arch/mips/lib/memcpy-inatomic.S
@@ -9,6 +9,7 @@
  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  * Copyright (C) 2002 Broadcom, Inc.
  *   memcpy/copy_user author: Mark Vandevoorde
+ * Copyright (C) 2007  Maciej W. Rozycki
  *
  * Mnemonic names for arguments to memcpy/__copy_user
  */
@@ -175,7 +176,11 @@
 
 	.text
 	.set	noreorder
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 	.set	noat
+#else
+	.set	at=v1
+#endif
 
 /*
  * A combined memcpy/__copy_user
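
A note on the hunk above: with CONFIG_CPU_DADDI_WORKAROUNDS enabled, the ADD/SUB macros used throughout this file (64-bit adds with immediate operands) can no longer be assembled as a single daddiu, since the R4000/R4400 daddi/daddiu erratum makes that instruction unsafe. The assembler expands each such macro into a short sequence that routes the immediate through the assembler temporary, so `.set noat` would make every expansion fail. Remapping the temporary with `.set at=v1` keeps a scratch register available for those expansions, using a register this routine never touches. A hedged sketch of the kind of expansion involved (exact output depends on the assembler and the immediate):

	# Without the workaround, the macro assembles directly:
	#	ADD	dst, dst, 8*NBYTES	->	daddiu	dst, dst, 64	(NBYTES == 8)
	# With the workaround, daddiu is avoided and AT (here v1) is used:
	#	li	v1, 64			# immediate into the AT register
	#	daddu	dst, dst, v1		# erratum-free register-register add
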
@@ -204,36 +209,36 @@ LEAF(__copy_user_inatomic)
 	and	t1, dst, ADDRMASK
 	PREF(	0, 1*32(src) )
 	PREF(	1, 1*32(dst) )
-	bnez	t2, copy_bytes_checklen
+	bnez	t2, .Lcopy_bytes_checklen
 	and	t0, src, ADDRMASK
 	PREF(	0, 2*32(src) )
 	PREF(	1, 2*32(dst) )
-	bnez	t1, dst_unaligned
+	bnez	t1, .Ldst_unaligned
 	nop
-	bnez	t0, src_unaligned_dst_aligned
+	bnez	t0, .Lsrc_unaligned_dst_aligned
 	/*
 	 * use delay slot for fall-through
 	 * src and dst are aligned; need to compute rem
 	 */
-both_aligned:
+.Lboth_aligned:
 	SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
-	beqz	t0, cleanup_both_aligned	# len < 8*NBYTES
+	beqz	t0, .Lcleanup_both_aligned	# len < 8*NBYTES
 	and	rem, len, (8*NBYTES-1)		# rem = len % (8*NBYTES)
 	PREF(	0, 3*32(src) )
 	PREF(	1, 3*32(dst) )
 	.align	4
 1:
-EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
-EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
-EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
-EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
+EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
+EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
+EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
+EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
 	SUB	len, len, 8*NBYTES
-EXC(	LOAD	t4, UNIT(4)(src),	l_exc_copy)
-EXC(	LOAD	t7, UNIT(5)(src),	l_exc_copy)
+EXC(	LOAD	t4, UNIT(4)(src),	.Ll_exc_copy)
+EXC(	LOAD	t7, UNIT(5)(src),	.Ll_exc_copy)
 	STORE	t0, UNIT(0)(dst)
 	STORE	t1, UNIT(1)(dst)
-EXC(	LOAD	t0, UNIT(6)(src),	l_exc_copy)
-EXC(	LOAD	t1, UNIT(7)(src),	l_exc_copy)
+EXC(	LOAD	t0, UNIT(6)(src),	.Ll_exc_copy)
+EXC(	LOAD	t1, UNIT(7)(src),	.Ll_exc_copy)
 	ADD	src, src, 8*NBYTES
 	ADD	dst, dst, 8*NBYTES
 	STORE	t2, UNIT(-6)(dst)
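
Two details in this hunk are worth spelling out. First, every branch target gains a `.L` prefix: GNU as treats `.L` labels as assembler-local, so they never enter the object file's symbol table, and profilers and backtraces keep attributing every address in the copy loop to __copy_user_inatomic instead of splitting it at internal labels. Second, the retargeted EXC() wrapper is what makes each load recoverable; its definition sits earlier in this file, outside the hunks shown here, and is roughly:

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous

Each wrapped load gets a numbered local label, and a (faulting instruction, fixup handler) address pair is recorded in the __ex_table section; on a faulting user access the exception handler looks the PC up in that table and resumes at .Ll_exc or .Ll_exc_copy.
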
@@ -250,39 +255,43 @@ EXC(	LOAD	t1, UNIT(7)(src),	l_exc_copy)
 	/*
 	 * len == rem == the number of bytes left to copy < 8*NBYTES
 	 */
-cleanup_both_aligned:
-	beqz	len, done
+.Lcleanup_both_aligned:
+	beqz	len, .Ldone
 	sltu	t0, len, 4*NBYTES
-	bnez	t0, less_than_4units
+	bnez	t0, .Lless_than_4units
 	and	rem, len, (NBYTES-1)	# rem = len % NBYTES
 	/*
 	 * len >= 4*NBYTES
 	 */
-EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
-EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
-EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
-EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
+EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
+EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
+EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
+EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
 	SUB	len, len, 4*NBYTES
 	ADD	src, src, 4*NBYTES
 	STORE	t0, UNIT(0)(dst)
 	STORE	t1, UNIT(1)(dst)
 	STORE	t2, UNIT(2)(dst)
 	STORE	t3, UNIT(3)(dst)
-	beqz	len, done
-	ADD	dst, dst, 4*NBYTES
-less_than_4units:
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 4*NBYTES
+	beqz	len, .Ldone
+	.set	noreorder
+.Lless_than_4units:
 	/*
 	 * rem = len % NBYTES
 	 */
-	beq	rem, len, copy_bytes
+	beq	rem, len, .Lcopy_bytes
 	nop
 1:
-EXC(	LOAD	t0, 0(src),		l_exc)
+EXC(	LOAD	t0, 0(src),		.Ll_exc)
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
 	STORE	t0, 0(dst)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	rem, len, 1b
-	ADD	dst, dst, NBYTES
+	.set	noreorder
 
 	/*
 	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
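
The `.set reorder` / `.set noreorder` bracketing marked DADDI_WAR in this hunk is the heart of the workaround. A branch delay slot can hold exactly one instruction, and once ADD may expand to several (see the note after the first hunk), it can no longer live there. The pattern hoists the ADD above the branch, which is safe because the branch condition does not read dst, and briefly switches to `.set reorder` so the assembler schedules the branch and fills the vacated delay slot itself. Side by side:

	# noreorder form: only valid while ADD is one instruction
	bne	rem, len, 1b
	ADD	dst, dst, NBYTES	# sits in the delay slot

	# DADDI_WAR form: safe when ADD may expand to several instructions
	.set	reorder
	ADD	dst, dst, NBYTES	# runs before the branch
	bne	rem, len, 1b		# assembler fills the delay slot
	.set	noreorder
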
@@ -296,17 +305,17 @@ EXC(	LOAD	t0, 0(src),		l_exc)
 	 * more instruction-level parallelism.
 	 */
 #define bits t2
-	beqz	len, done
+	beqz	len, .Ldone
 	ADD	t1, dst, len	# t1 is just past last byte of dst
 	li	bits, 8*NBYTES
 	SLL	rem, len, 3	# rem = number of bits to keep
-EXC(	LOAD	t0, 0(src),		l_exc)
+EXC(	LOAD	t0, 0(src),		.Ll_exc)
 	SUB	bits, bits, rem	# bits = number of bits to discard
 	SHIFT_DISCARD t0, t0, bits
 	STREST	t0, -1(t1)
 	jr	ra
 	move	len, zero
-dst_unaligned:
+.Ldst_unaligned:
 	/*
 	 * dst is unaligned
 	 * t0 = src & ADDRMASK
@@ -317,22 +326,22 @@ dst_unaligned:
 	 * Set match = (src and dst have same alignment)
 	 */
 #define match rem
-EXC(	LDFIRST	t3, FIRST(0)(src),	l_exc)
+EXC(	LDFIRST	t3, FIRST(0)(src),	.Ll_exc)
 	ADD	t2, zero, NBYTES
-EXC(	LDREST	t3, REST(0)(src),	l_exc_copy)
+EXC(	LDREST	t3, REST(0)(src),	.Ll_exc_copy)
 	SUB	t2, t2, t1	# t2 = number of bytes copied
 	xor	match, t0, t1
 	STFIRST t3, FIRST(0)(dst)
-	beq	len, t2, done
+	beq	len, t2, .Ldone
 	SUB	len, len, t2
 	ADD	dst, dst, t2
-	beqz	match, both_aligned
+	beqz	match, .Lboth_aligned
 	ADD	src, src, t2
 
-src_unaligned_dst_aligned:
+.Lsrc_unaligned_dst_aligned:
 	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
 	PREF(	0, 3*32(src) )
-	beqz	t0, cleanup_src_unaligned
+	beqz	t0, .Lcleanup_src_unaligned
 	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
 	PREF(	1, 3*32(dst) )
 1:
@@ -342,15 +351,15 @@ src_unaligned_dst_aligned:
 	 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 	 * are to the same unit (unless src is aligned, but it's not).
 	 */
-EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
-EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
+EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
+EXC(	LDFIRST	t1, FIRST(1)(src),	.Ll_exc_copy)
 	SUB	len, len, 4*NBYTES
-EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
-EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
-EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
-EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
-EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
-EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
+EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
+EXC(	LDREST	t1, REST(1)(src),	.Ll_exc_copy)
+EXC(	LDFIRST	t2, FIRST(2)(src),	.Ll_exc_copy)
+EXC(	LDFIRST	t3, FIRST(3)(src),	.Ll_exc_copy)
+EXC(	LDREST	t2, REST(2)(src),	.Ll_exc_copy)
+EXC(	LDREST	t3, REST(3)(src),	.Ll_exc_copy)
 	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
 	ADD	src, src, 4*NBYTES
 #ifdef CONFIG_CPU_SB1
@@ -361,32 +370,36 @@ EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
 	STORE	t2, UNIT(2)(dst)
 	STORE	t3, UNIT(3)(dst)
 	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 4*NBYTES
 	bne	len, rem, 1b
-	ADD	dst, dst, 4*NBYTES
+	.set	noreorder
 
-cleanup_src_unaligned:
-	beqz	len, done
+.Lcleanup_src_unaligned:
+	beqz	len, .Ldone
 	and	rem, len, NBYTES-1	# rem = len % NBYTES
-	beq	rem, len, copy_bytes
+	beq	rem, len, .Lcopy_bytes
 	nop
 1:
-EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
-EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
+EXC(	LDFIRST t0, FIRST(0)(src),	.Ll_exc)
+EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
 	ADD	src, src, NBYTES
 	SUB	len, len, NBYTES
 	STORE	t0, 0(dst)
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, NBYTES
 	bne	len, rem, 1b
-	ADD	dst, dst, NBYTES
+	.set	noreorder
 
-copy_bytes_checklen:
-	beqz	len, done
+.Lcopy_bytes_checklen:
+	beqz	len, .Ldone
 	nop
-copy_bytes:
+.Lcopy_bytes:
 	/* 0 < len < NBYTES */
 #define COPY_BYTE(N)			\
-EXC(	lb	t0, N(src), l_exc);	\
+EXC(	lb	t0, N(src), .Ll_exc);	\
 	SUB	len, len, 1;		\
-	beqz	len, done;		\
+	beqz	len, .Ldone;		\
 	sb	t0, N(dst)
 
 	COPY_BYTE(0)
@@ -397,16 +410,16 @@ EXC(	lb	t0, N(src), l_exc);	\
 	COPY_BYTE(4)
 	COPY_BYTE(5)
 #endif
-EXC(	lb	t0, NBYTES-2(src), l_exc)
+EXC(	lb	t0, NBYTES-2(src), .Ll_exc)
 	SUB	len, len, 1
 	jr	ra
 	sb	t0, NBYTES-2(dst)
-done:
+.Ldone:
 	jr	ra
 	nop
 	END(__copy_user_inatomic)
 
-l_exc_copy:
+.Ll_exc_copy:
 	/*
 	 * Copy bytes from src until faulting load address (or until a
 	 * lb faults)
@@ -421,12 +434,14 @@ l_exc_copy:
 	nop
 	LOAD	t0, THREAD_BUADDR(t0)
 1:
-EXC(	lb	t1, 0(src),	l_exc)
+EXC(	lb	t1, 0(src),	.Ll_exc)
 	ADD	src, src, 1
 	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
+	.set	reorder				/* DADDI_WAR */
+	ADD	dst, dst, 1
 	bne	src, t0, 1b
-	ADD	dst, dst, 1
-l_exc:
+	.set	noreorder
+.Ll_exc:
 	LOAD	t0, TI_TASK($28)
 	nop
 	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
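
A closing note on the fixup path, sketched only from the names visible in this excerpt. When a wrapped load faults, the machine exception handler records the bad user address in the task's thread structure before dispatching to the fixup; THREAD_BUADDR(t0) is the offset of that saved slot. .Ll_exc_copy replays the copy one byte at a time until src reaches the recorded address, so every byte that was readable is still delivered, then falls through into .Ll_exc, which derives the number of uncopied bytes into len for the caller. Unlike __copy_user, this inatomic variant does not zero the uncopied tail of the destination; handling a short copy is the caller's contract. In outline:

	# .Ll_exc_copy:	t0 = saved faulting address (THREAD_BUADDR slot)
	# 1:	lb/sb one byte; src++, dst++
	#	bne	src, t0, 1b	# stop at the faulting address
	# .Ll_exc:	reload t0, compute residue into len, jr ra
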