Diffstat (limited to 'arch/mips/lib/csum_partial.S')

-rw-r--r--	arch/mips/lib/csum_partial.S	442
1 file changed, 442 insertions, 0 deletions
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index 9db357294be1..c0a77fe038be 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -8,7 +8,9 @@
  * Copyright (C) 1998, 1999 Ralf Baechle
  * Copyright (C) 1999 Silicon Graphics, Inc.
  */
+#include <linux/errno.h>
 #include <asm/asm.h>
+#include <asm/asm-offsets.h>
 #include <asm/regdef.h>
 
 #ifdef CONFIG_64BIT
@@ -271,3 +273,443 @@ small_csumcpy:
 	jr	ra
 	.set	noreorder
 	END(csum_partial)
+
+
+/*
+ * checksum and copy routines based on memcpy.S
+ *
+ *	csum_partial_copy_nocheck(src, dst, len, sum)
+ *	__csum_partial_copy_user(src, dst, len, sum, errp)
+ *
+ * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
+ * functions in this file use the standard calling convention.
+ */
+
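For reference, a C-level sketch of the two entry points named in the comment above. The prototypes are inferred from the argument lists shown there and from the errptr handling below; the exact kernel declarations may differ.

    /* Sketch only: prototypes inferred from the comment above. */
    unsigned int csum_partial_copy_nocheck(const void *src, void *dst,
                                           int len, unsigned int sum);

    /* On a faulting user access, -EFAULT is stored through errp
     * (see the l_exc/s_exc handlers at the end of this file). */
    unsigned int __csum_partial_copy_user(const void *src, void *dst,
                                          int len, unsigned int sum,
                                          int *errp);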
+#define src a0
+#define dst a1
+#define len a2
+#define psum a3
+#define sum v0
+#define odd t8
+#define errptr t9
+
+/*
+ * The exception handler for loads requires that:
+ *  1- AT contains the address of the byte just past the end of the source
+ *     of the copy,
+ *  2- src_entry <= src < AT, and
+ *  3- (dst - src) == (dst_entry - src_entry),
+ * The _entry suffix denotes values when __copy_user was called.
+ *
+ * (1) is set up by __csum_partial_copy_from_user and maintained by
+ *	not writing AT in __csum_partial_copy
+ * (2) is met by incrementing src by the number of bytes copied
+ * (3) is met by not doing loads between a pair of increments of dst and src
+ *
+ * The exception handlers for stores store -EFAULT to errptr and return.
+ * These handlers do not need to overwrite any data.
+ */
+
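A minimal C sketch of the recovery arithmetic these invariants enable; recover_after_load_fault is a hypothetical helper, not kernel code, and mirrors what l_exc does at the end of this file.

    #include <errno.h>

    /* Invariants: at == src_entry + len_entry (1), and dst - src is
     * constant (3). 'fault' is the first unreadable source byte. */
    static void recover_after_load_fault(unsigned char *dst,
                                         const unsigned char *src,
                                         const unsigned char *fault,
                                         const unsigned char *at,
                                         int *errp)
    {
        long uncopied = at - fault;                  /* SUB len, AT, t0 */
        unsigned char *clear = dst + (fault - src);  /* ADD dst, t0; SUB dst, src */

        while (uncopied-- > 0)   /* zero the unwritten tail, as l_exc does */
            *clear++ = 0;
        *errp = -EFAULT;
    }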
+#define EXC(inst_reg,addr,handler)	\
+9:	inst_reg, addr;			\
+	.section __ex_table,"a";	\
+	PTR	9b, handler;		\
+	.previous
+
+#ifdef USE_DOUBLE
+
+#define LOAD	ld
+#define LOADL	ldl
+#define LOADR	ldr
+#define STOREL	sdl
+#define STORER	sdr
+#define STORE	sd
+#define ADD	daddu
+#define SUB	dsubu
+#define SRL	dsrl
+#define SLL	dsll
+#define SLLV	dsllv
+#define SRLV	dsrlv
+#define NBYTES	8
+#define LOG_NBYTES	3
+
+#else
+
+#define LOAD	lw
+#define LOADL	lwl
+#define LOADR	lwr
+#define STOREL	swl
+#define STORER	swr
+#define STORE	sw
+#define ADD	addu
+#define SUB	subu
+#define SRL	srl
+#define SLL	sll
+#define SLLV	sllv
+#define SRLV	srlv
+#define NBYTES	4
+#define LOG_NBYTES	2
+
+#endif /* USE_DOUBLE */
+
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+#define LDFIRST	LOADR
+#define LDREST	LOADL
+#define STFIRST	STORER
+#define STREST	STOREL
+#define SHIFT_DISCARD	SLLV
+#define SHIFT_DISCARD_REVERT	SRLV
+#else
+#define LDFIRST	LOADL
+#define LDREST	LOADR
+#define STFIRST	STOREL
+#define STREST	STORER
+#define SHIFT_DISCARD	SRLV
+#define SHIFT_DISCARD_REVERT	SLLV
+#endif
+
+#define FIRST(unit)	((unit)*NBYTES)
+#define REST(unit)	(FIRST(unit)+NBYTES-1)
+
+#define ADDRMASK	(NBYTES-1)
+
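The LDFIRST/LDREST (and STFIRST/STREST) pairs defined above together perform one full NBYTES-wide access at an unaligned address. A portable C sketch of the net effect, for the 32-bit case:

    #include <stdint.h>
    #include <string.h>

    /* Sketch: what an LDFIRST/LDREST pair accomplishes. On MIPS a
     * compiler may lower this memcpy to lwl/lwr-style instruction
     * pairs; the assembly below writes them out explicitly. */
    static inline uint32_t load_word_unaligned(const void *p)
    {
        uint32_t w;
        memcpy(&w, p, sizeof(w));
        return w;
    }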
+	.set	noat
+
+LEAF(__csum_partial_copy_user)
+	PTR_ADDU	AT, src, len	/* See (1) above. */
+#ifdef CONFIG_64BIT
+	move	errptr, a4
+#else
+	lw	errptr, 16(sp)
+#endif
+FEXPORT(csum_partial_copy_nocheck)
+	move	sum, zero
+	move	odd, zero
+	/*
+	 * Note: dst & src may be unaligned, len may be 0
+	 * Temps
+	 */
+	/*
+	 * The "issue break"s below are very approximate.
+	 * Issue delays for dcache fills will perturb the schedule, as will
+	 * load queue full replay traps, etc.
+	 *
+	 * If len < NBYTES use byte operations.
+	 */
+	sltu	t2, len, NBYTES
+	and	t1, dst, ADDRMASK
+	bnez	t2, copy_bytes_checklen
+	and	t0, src, ADDRMASK
+	andi	odd, dst, 0x1			/* odd buffer? */
+	bnez	t1, dst_unaligned
+	nop
+	bnez	t0, src_unaligned_dst_aligned
+	/*
+	 * use delay slot for fall-through
+	 * src and dst are aligned; need to compute rem
+	 */
+both_aligned:
+	SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
+	beqz	t0, cleanup_both_aligned	# len < 8*NBYTES
+	nop
+	SUB	len, 8*NBYTES			# subtract here for bgez loop
+	.align	4
+1:
+EXC(	LOAD	t0, UNIT(0)(src), l_exc)
+EXC(	LOAD	t1, UNIT(1)(src), l_exc_copy)
+EXC(	LOAD	t2, UNIT(2)(src), l_exc_copy)
+EXC(	LOAD	t3, UNIT(3)(src), l_exc_copy)
+EXC(	LOAD	t4, UNIT(4)(src), l_exc_copy)
+EXC(	LOAD	t5, UNIT(5)(src), l_exc_copy)
+EXC(	LOAD	t6, UNIT(6)(src), l_exc_copy)
+EXC(	LOAD	t7, UNIT(7)(src), l_exc_copy)
+	SUB	len, len, 8*NBYTES
+	ADD	src, src, 8*NBYTES
+EXC(	STORE	t0, UNIT(0)(dst), s_exc)
+	ADDC(sum, t0)
+EXC(	STORE	t1, UNIT(1)(dst), s_exc)
+	ADDC(sum, t1)
+EXC(	STORE	t2, UNIT(2)(dst), s_exc)
+	ADDC(sum, t2)
+EXC(	STORE	t3, UNIT(3)(dst), s_exc)
+	ADDC(sum, t3)
+EXC(	STORE	t4, UNIT(4)(dst), s_exc)
+	ADDC(sum, t4)
+EXC(	STORE	t5, UNIT(5)(dst), s_exc)
+	ADDC(sum, t5)
+EXC(	STORE	t6, UNIT(6)(dst), s_exc)
+	ADDC(sum, t6)
+EXC(	STORE	t7, UNIT(7)(dst), s_exc)
+	ADDC(sum, t7)
+	bgez	len, 1b
+	ADD	dst, dst, 8*NBYTES
+	ADD	len, 8*NBYTES		# revert len (see above)
+
+	/*
+	 * len == the number of bytes left to copy < 8*NBYTES
+	 */
+cleanup_both_aligned:
+#define rem t7
+	beqz	len, done
+	sltu	t0, len, 4*NBYTES
+	bnez	t0, less_than_4units
+	and	rem, len, (NBYTES-1)	# rem = len % NBYTES
+	/*
+	 * len >= 4*NBYTES
+	 */
+EXC(	LOAD	t0, UNIT(0)(src), l_exc)
+EXC(	LOAD	t1, UNIT(1)(src), l_exc_copy)
+EXC(	LOAD	t2, UNIT(2)(src), l_exc_copy)
+EXC(	LOAD	t3, UNIT(3)(src), l_exc_copy)
+	SUB	len, len, 4*NBYTES
+	ADD	src, src, 4*NBYTES
+EXC(	STORE	t0, UNIT(0)(dst), s_exc)
+	ADDC(sum, t0)
+EXC(	STORE	t1, UNIT(1)(dst), s_exc)
+	ADDC(sum, t1)
+EXC(	STORE	t2, UNIT(2)(dst), s_exc)
+	ADDC(sum, t2)
+EXC(	STORE	t3, UNIT(3)(dst), s_exc)
+	ADDC(sum, t3)
+	beqz	len, done
+	ADD	dst, dst, 4*NBYTES
+less_than_4units:
+	/*
+	 * rem = len % NBYTES
+	 */
+	beq	rem, len, copy_bytes
+	nop
+1:
+EXC(	LOAD	t0, 0(src), l_exc)
+	ADD	src, src, NBYTES
+	SUB	len, len, NBYTES
+EXC(	STORE	t0, 0(dst), s_exc)
+	ADDC(sum, t0)
+	bne	rem, len, 1b
+	ADD	dst, dst, NBYTES
+
+	/*
+	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
+	 * A loop would do only a byte at a time with possible branch
+	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
+	 * because can't assume read-access to dst.  Instead, use
+	 * STREST dst, which doesn't require read access to dst.
+	 *
+	 * This code should perform better than a simple loop on modern,
+	 * wide-issue mips processors because the code has fewer branches and
+	 * more instruction-level parallelism.
+	 */
+#define bits t2
+	beqz	len, done
+	ADD	t1, dst, len	# t1 is just past last byte of dst
+	li	bits, 8*NBYTES
+	SLL	rem, len, 3	# rem = number of bits to keep
+EXC(	LOAD	t0, 0(src), l_exc)
+	SUB	bits, bits, rem	# bits = number of bits to discard
+	SHIFT_DISCARD t0, t0, bits
+EXC(	STREST	t0, -1(t1), s_exc)
+	SHIFT_DISCARD_REVERT t0, t0, bits
+	.set	reorder
+	ADDC(sum, t0)
+	b	done
+	.set	noreorder
+dst_unaligned:
+	/*
+	 * dst is unaligned
+	 * t0 = src & ADDRMASK
+	 * t1 = dst & ADDRMASK; t1 > 0
+	 * len >= NBYTES
+	 *
+	 * Copy enough bytes to align dst
+	 * Set match = (src and dst have same alignment)
+	 */
+#define match rem
+EXC(	LDFIRST	t3, FIRST(0)(src), l_exc)
+	ADD	t2, zero, NBYTES
+EXC(	LDREST	t3, REST(0)(src), l_exc_copy)
+	SUB	t2, t2, t1	# t2 = number of bytes copied
+	xor	match, t0, t1
+EXC(	STFIRST	t3, FIRST(0)(dst), s_exc)
+	SLL	t4, t1, 3	# t4 = number of bits to discard
+	SHIFT_DISCARD t3, t3, t4
+	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
+	ADDC(sum, t3)
+	beq	len, t2, done
+	SUB	len, len, t2
+	ADD	dst, dst, t2
+	beqz	match, both_aligned
+	ADD	src, src, t2
+
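In C terms, the dst_unaligned prologue above computes roughly the following (a sketch; align_prologue is a hypothetical name, and the 32-bit NBYTES is assumed):

    #include <stdint.h>

    #define NBYTES   4u              /* assumption: 32-bit build */
    #define ADDRMASK (NBYTES - 1)

    /* How many bytes STFIRST copies to align dst, and whether src then
     * shares dst's alignment (the 'match' that re-enters both_aligned). */
    static unsigned align_prologue(uintptr_t src, uintptr_t dst, int *match)
    {
        uintptr_t t0 = src & ADDRMASK;   /* src misalignment */
        uintptr_t t1 = dst & ADDRMASK;   /* dst misalignment, > 0 here */

        *match = (t0 ^ t1) == 0;         /* equal residues -> both_aligned */
        return (unsigned)(NBYTES - t1);  /* bytes copied by the prologue */
    }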
+src_unaligned_dst_aligned:
+	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
+	beqz	t0, cleanup_src_unaligned
+	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
+1:
+	/*
+	 * Avoid consecutive LD*'s to the same register since some mips
+	 * implementations can't issue them in the same cycle.
+	 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
+	 * are to the same unit (unless src is aligned, but it's not).
+	 */
+EXC(	LDFIRST	t0, FIRST(0)(src), l_exc)
+EXC(	LDFIRST	t1, FIRST(1)(src), l_exc_copy)
+	SUB	len, len, 4*NBYTES
+EXC(	LDREST	t0, REST(0)(src), l_exc_copy)
+EXC(	LDREST	t1, REST(1)(src), l_exc_copy)
+EXC(	LDFIRST	t2, FIRST(2)(src), l_exc_copy)
+EXC(	LDFIRST	t3, FIRST(3)(src), l_exc_copy)
+EXC(	LDREST	t2, REST(2)(src), l_exc_copy)
+EXC(	LDREST	t3, REST(3)(src), l_exc_copy)
+	ADD	src, src, 4*NBYTES
+#ifdef CONFIG_CPU_SB1
+	nop				# improves slotting
+#endif
+EXC(	STORE	t0, UNIT(0)(dst), s_exc)
+	ADDC(sum, t0)
+EXC(	STORE	t1, UNIT(1)(dst), s_exc)
+	ADDC(sum, t1)
+EXC(	STORE	t2, UNIT(2)(dst), s_exc)
+	ADDC(sum, t2)
+EXC(	STORE	t3, UNIT(3)(dst), s_exc)
+	ADDC(sum, t3)
+	bne	len, rem, 1b
+	ADD	dst, dst, 4*NBYTES
+
+cleanup_src_unaligned:
+	beqz	len, done
+	and	rem, len, NBYTES-1	# rem = len % NBYTES
+	beq	rem, len, copy_bytes
+	nop
+1:
+EXC(	LDFIRST	t0, FIRST(0)(src), l_exc)
+EXC(	LDREST	t0, REST(0)(src), l_exc_copy)
+	ADD	src, src, NBYTES
+	SUB	len, len, NBYTES
+EXC(	STORE	t0, 0(dst), s_exc)
+	ADDC(sum, t0)
+	bne	len, rem, 1b
+	ADD	dst, dst, NBYTES
+
+copy_bytes_checklen:
+	beqz	len, done
+	nop
+copy_bytes:
+	/* 0 < len < NBYTES */
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+#define SHIFT_START 0
+#define SHIFT_INC 8
+#else
+#define SHIFT_START 8*(NBYTES-1)
+#define SHIFT_INC -8
+#endif
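What the COPY_BYTE sequence below builds up, sketched in C for the little-endian case (SHIFT_START 0, SHIFT_INC 8); pack_tail_bytes is a hypothetical name:

    #include <stddef.h>
    #include <stdint.h>

    /* Pack the last 0 < len < NBYTES bytes into one word, each byte in
     * the position a full-word load would have given it, so a single
     * ADDC can fold the word into the checksum. */
    static uint32_t pack_tail_bytes(const uint8_t *src, uint8_t *dst,
                                    size_t len)
    {
        uint32_t word = 0;
        int shift = 0;                      /* SHIFT_START */

        while (len--) {
            uint8_t b = *src++;
            *dst++ = b;                     /* copy as we go, like sb */
            word |= (uint32_t)b << shift;   /* place byte as in memory */
            shift += 8;                     /* SHIFT_INC */
        }
        return word;                        /* caller: ADDC(sum, word) */
    }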
+	move	t2, zero	# partial word
+	li	t3, SHIFT_START	# shift
+/* use l_exc_copy here to return correct sum on fault */
+#define COPY_BYTE(N)			\
+EXC(	lbu	t0, N(src), l_exc_copy);	\
+	SUB	len, len, 1;		\
+EXC(	sb	t0, N(dst), s_exc);	\
+	SLLV	t0, t0, t3;		\
+	addu	t3, SHIFT_INC;		\
+	beqz	len, copy_bytes_done;	\
+	or	t2, t0
+
+	COPY_BYTE(0)
+	COPY_BYTE(1)
+#ifdef USE_DOUBLE
+	COPY_BYTE(2)
+	COPY_BYTE(3)
+	COPY_BYTE(4)
+	COPY_BYTE(5)
+#endif
+EXC(	lbu	t0, NBYTES-2(src), l_exc_copy)
+	SUB	len, len, 1
+EXC(	sb	t0, NBYTES-2(dst), s_exc)
+	SLLV	t0, t0, t3
+	or	t2, t0
+copy_bytes_done:
+	ADDC(sum, t2)
+done:
+	/* fold checksum */
+#ifdef USE_DOUBLE
+	dsll32	v1, sum, 0
+	daddu	sum, v1
+	sltu	v1, sum, v1
+	dsra32	sum, sum, 0
+	addu	sum, v1
+#endif
+	sll	v1, sum, 16
+	addu	sum, v1
+	sltu	v1, sum, v1
+	srl	sum, sum, 16
+	addu	sum, v1
+
+	/* odd buffer alignment? */
+	beqz	odd, 1f
+	nop
+	sll	v1, sum, 8
+	srl	sum, sum, 8
+	or	sum, v1
+	andi	sum, 0xffff
+1:
+	.set	reorder
+	ADDC(sum, psum)
+	jr	ra
+	.set	noreorder
+
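The done: epilogue folds the running sum to 16 bits with end-around carry and, if the destination started on an odd address, swaps the result's bytes; a C sketch of the 32-bit case follows (fold_and_fix is a hypothetical name). The assembly then ADDCs the caller-supplied psum before returning.

    #include <stdint.h>

    /* Fold to 16 bits with end-around carry, then byte-swap if the
     * buffer was odd-aligned (the 'odd' flag set near the top). */
    static uint16_t fold_and_fix(uint32_t sum, int odd)
    {
        sum = (sum & 0xffff) + (sum >> 16);   /* fold; may leave a carry */
        sum = (sum & 0xffff) + (sum >> 16);   /* fold the carry back in */
        if (odd)
            sum = ((sum & 0xff) << 8) | (sum >> 8);
        return (uint16_t)sum;
    }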
+l_exc_copy:
+	/*
+	 * Copy bytes from src until faulting load address (or until a
+	 * lb faults)
+	 *
+	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
+	 * may be more than a byte beyond the last address.
+	 * Hence, the lb below may get an exception.
+	 *
+	 * Assumes src < THREAD_BUADDR($28)
+	 */
+	LOAD	t0, TI_TASK($28)
+	li	t2, SHIFT_START
+	LOAD	t0, THREAD_BUADDR(t0)
+1:
+EXC(	lbu	t1, 0(src), l_exc)
+	ADD	src, src, 1
+	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
+	SLLV	t1, t1, t2
+	addu	t2, SHIFT_INC
+	ADDC(sum, t1)
+	bne	src, t0, 1b
+	ADD	dst, dst, 1
+l_exc:
+	LOAD	t0, TI_TASK($28)
+	nop
+	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
+	nop
+	SUB	len, AT, t0		# len = number of uncopied bytes
+	/*
+	 * Here's where we rely on src and dst being incremented in tandem;
+	 * see (3) above.
+	 * dst += (fault addr - src) to put dst at first byte to clear
+	 */
+	ADD	dst, t0			# compute start address in a1
+	SUB	dst, src
+	/*
+	 * Clear len bytes starting at dst.  Can't call __bzero because it
+	 * might modify len.  An inefficient loop for these rare times...
+	 */
+	beqz	len, done
+	SUB	src, len, 1
+1:	sb	zero, 0(dst)
+	ADD	dst, dst, 1
+	bnez	src, 1b
+	SUB	src, src, 1
+	li	v1, -EFAULT
+	b	done
+	sw	v1, (errptr)
+
+s_exc:
+	li	v0, -1	/* invalid checksum */
+	li	v1, -EFAULT
+	jr	ra
+	sw	v1, (errptr)
+END(__csum_partial_copy_user)