aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorAtsushi Nemoto <anemo@mba.ocn.ne.jp>2006-12-12 11:22:06 -0500
committerRalf Baechle <ralf@linux-mips.org>2007-01-08 16:41:04 -0500
commitf860c90bd6ce22c6a0a352cc16acc74fba3d628e (patch)
treefee63d3ff954439d19932beeb0109508a0bc899c /arch
parent61e84f99877fa8caaf1be86d51d825406e8d8bc1 (diff)
[MIPS] csum_partial and copy in parallel
Implement optimized asm version of csum_partial_copy_nocheck, csum_partial_copy_from_user and csum_and_copy_to_user which can do calculate and copy in parallel, based on memcpy.S. Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp> Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/mips/kernel/mips_ksyms.c2
-rw-r--r--arch/mips/lib/Makefile2
-rw-r--r--arch/mips/lib/csum_partial.S442
-rw-r--r--arch/mips/lib/csum_partial_copy.c52
4 files changed, 445 insertions, 53 deletions
diff --git a/arch/mips/kernel/mips_ksyms.c b/arch/mips/kernel/mips_ksyms.c
index f44a01357ada..2ef857c3ee53 100644
--- a/arch/mips/kernel/mips_ksyms.c
+++ b/arch/mips/kernel/mips_ksyms.c
@@ -46,5 +46,7 @@ EXPORT_SYMBOL(__strnlen_user_nocheck_asm);
46EXPORT_SYMBOL(__strnlen_user_asm); 46EXPORT_SYMBOL(__strnlen_user_asm);
47 47
48EXPORT_SYMBOL(csum_partial); 48EXPORT_SYMBOL(csum_partial);
49EXPORT_SYMBOL(csum_partial_copy_nocheck);
50EXPORT_SYMBOL(__csum_partial_copy_user);
49 51
50EXPORT_SYMBOL(invalid_pte_table); 52EXPORT_SYMBOL(invalid_pte_table);
diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile
index 888b61ea12fe..989c900b8b14 100644
--- a/arch/mips/lib/Makefile
+++ b/arch/mips/lib/Makefile
@@ -2,7 +2,7 @@
2# Makefile for MIPS-specific library files.. 2# Makefile for MIPS-specific library files..
3# 3#
4 4
5lib-y += csum_partial.o csum_partial_copy.o memcpy.o promlib.o \ 5lib-y += csum_partial.o memcpy.o promlib.o \
6 strlen_user.o strncpy_user.o strnlen_user.o uncached.o 6 strlen_user.o strncpy_user.o strnlen_user.o uncached.o
7 7
8obj-y += iomap.o 8obj-y += iomap.o
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index 9db357294be1..c0a77fe038be 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -8,7 +8,9 @@
8 * Copyright (C) 1998, 1999 Ralf Baechle 8 * Copyright (C) 1998, 1999 Ralf Baechle
9 * Copyright (C) 1999 Silicon Graphics, Inc. 9 * Copyright (C) 1999 Silicon Graphics, Inc.
10 */ 10 */
11#include <linux/errno.h>
11#include <asm/asm.h> 12#include <asm/asm.h>
13#include <asm/asm-offsets.h>
12#include <asm/regdef.h> 14#include <asm/regdef.h>
13 15
14#ifdef CONFIG_64BIT 16#ifdef CONFIG_64BIT
@@ -271,3 +273,443 @@ small_csumcpy:
271 jr ra 273 jr ra
272 .set noreorder 274 .set noreorder
273 END(csum_partial) 275 END(csum_partial)
276
277
278/*
279 * checksum and copy routines based on memcpy.S
280 *
281 * csum_partial_copy_nocheck(src, dst, len, sum)
282 * __csum_partial_copy_user(src, dst, len, sum, errp)
283 *
284 * See "Spec" in memcpy.S for details. Unlike __copy_user, all
285 * function in this file use the standard calling convention.
286 */
287
288#define src a0
289#define dst a1
290#define len a2
291#define psum a3
292#define sum v0
293#define odd t8
294#define errptr t9
295
296/*
297 * The exception handler for loads requires that:
298 * 1- AT contain the address of the byte just past the end of the source
299 * of the copy,
300 * 2- src_entry <= src < AT, and
301 * 3- (dst - src) == (dst_entry - src_entry),
302 * The _entry suffix denotes values when __copy_user was called.
303 *
304 * (1) is set up up by __csum_partial_copy_from_user and maintained by
305 * not writing AT in __csum_partial_copy
306 * (2) is met by incrementing src by the number of bytes copied
307 * (3) is met by not doing loads between a pair of increments of dst and src
308 *
309 * The exception handlers for stores stores -EFAULT to errptr and return.
310 * These handlers do not need to overwrite any data.
311 */
312
313#define EXC(inst_reg,addr,handler) \
3149: inst_reg, addr; \
315 .section __ex_table,"a"; \
316 PTR 9b, handler; \
317 .previous
318
319#ifdef USE_DOUBLE
320
321#define LOAD ld
322#define LOADL ldl
323#define LOADR ldr
324#define STOREL sdl
325#define STORER sdr
326#define STORE sd
327#define ADD daddu
328#define SUB dsubu
329#define SRL dsrl
330#define SLL dsll
331#define SLLV dsllv
332#define SRLV dsrlv
333#define NBYTES 8
334#define LOG_NBYTES 3
335
336#else
337
338#define LOAD lw
339#define LOADL lwl
340#define LOADR lwr
341#define STOREL swl
342#define STORER swr
343#define STORE sw
344#define ADD addu
345#define SUB subu
346#define SRL srl
347#define SLL sll
348#define SLLV sllv
349#define SRLV srlv
350#define NBYTES 4
351#define LOG_NBYTES 2
352
353#endif /* USE_DOUBLE */
354
355#ifdef CONFIG_CPU_LITTLE_ENDIAN
356#define LDFIRST LOADR
357#define LDREST LOADL
358#define STFIRST STORER
359#define STREST STOREL
360#define SHIFT_DISCARD SLLV
361#define SHIFT_DISCARD_REVERT SRLV
362#else
363#define LDFIRST LOADL
364#define LDREST LOADR
365#define STFIRST STOREL
366#define STREST STORER
367#define SHIFT_DISCARD SRLV
368#define SHIFT_DISCARD_REVERT SLLV
369#endif
370
371#define FIRST(unit) ((unit)*NBYTES)
372#define REST(unit) (FIRST(unit)+NBYTES-1)
373
374#define ADDRMASK (NBYTES-1)
375
376 .set noat
377
378LEAF(__csum_partial_copy_user)
379 PTR_ADDU AT, src, len /* See (1) above. */
380#ifdef CONFIG_64BIT
381 move errptr, a4
382#else
383 lw errptr, 16(sp)
384#endif
385FEXPORT(csum_partial_copy_nocheck)
386 move sum, zero
387 move odd, zero
388 /*
389 * Note: dst & src may be unaligned, len may be 0
390 * Temps
391 */
392 /*
393 * The "issue break"s below are very approximate.
394 * Issue delays for dcache fills will perturb the schedule, as will
395 * load queue full replay traps, etc.
396 *
397 * If len < NBYTES use byte operations.
398 */
399 sltu t2, len, NBYTES
400 and t1, dst, ADDRMASK
401 bnez t2, copy_bytes_checklen
402 and t0, src, ADDRMASK
403 andi odd, dst, 0x1 /* odd buffer? */
404 bnez t1, dst_unaligned
405 nop
406 bnez t0, src_unaligned_dst_aligned
407 /*
408 * use delay slot for fall-through
409 * src and dst are aligned; need to compute rem
410 */
411both_aligned:
412 SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
413 beqz t0, cleanup_both_aligned # len < 8*NBYTES
414 nop
415 SUB len, 8*NBYTES # subtract here for bgez loop
416 .align 4
4171:
418EXC( LOAD t0, UNIT(0)(src), l_exc)
419EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
420EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
421EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
422EXC( LOAD t4, UNIT(4)(src), l_exc_copy)
423EXC( LOAD t5, UNIT(5)(src), l_exc_copy)
424EXC( LOAD t6, UNIT(6)(src), l_exc_copy)
425EXC( LOAD t7, UNIT(7)(src), l_exc_copy)
426 SUB len, len, 8*NBYTES
427 ADD src, src, 8*NBYTES
428EXC( STORE t0, UNIT(0)(dst), s_exc)
429 ADDC(sum, t0)
430EXC( STORE t1, UNIT(1)(dst), s_exc)
431 ADDC(sum, t1)
432EXC( STORE t2, UNIT(2)(dst), s_exc)
433 ADDC(sum, t2)
434EXC( STORE t3, UNIT(3)(dst), s_exc)
435 ADDC(sum, t3)
436EXC( STORE t4, UNIT(4)(dst), s_exc)
437 ADDC(sum, t4)
438EXC( STORE t5, UNIT(5)(dst), s_exc)
439 ADDC(sum, t5)
440EXC( STORE t6, UNIT(6)(dst), s_exc)
441 ADDC(sum, t6)
442EXC( STORE t7, UNIT(7)(dst), s_exc)
443 ADDC(sum, t7)
444 bgez len, 1b
445 ADD dst, dst, 8*NBYTES
446 ADD len, 8*NBYTES # revert len (see above)
447
448 /*
449 * len == the number of bytes left to copy < 8*NBYTES
450 */
451cleanup_both_aligned:
452#define rem t7
453 beqz len, done
454 sltu t0, len, 4*NBYTES
455 bnez t0, less_than_4units
456 and rem, len, (NBYTES-1) # rem = len % NBYTES
457 /*
458 * len >= 4*NBYTES
459 */
460EXC( LOAD t0, UNIT(0)(src), l_exc)
461EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
462EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
463EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
464 SUB len, len, 4*NBYTES
465 ADD src, src, 4*NBYTES
466EXC( STORE t0, UNIT(0)(dst), s_exc)
467 ADDC(sum, t0)
468EXC( STORE t1, UNIT(1)(dst), s_exc)
469 ADDC(sum, t1)
470EXC( STORE t2, UNIT(2)(dst), s_exc)
471 ADDC(sum, t2)
472EXC( STORE t3, UNIT(3)(dst), s_exc)
473 ADDC(sum, t3)
474 beqz len, done
475 ADD dst, dst, 4*NBYTES
476less_than_4units:
477 /*
478 * rem = len % NBYTES
479 */
480 beq rem, len, copy_bytes
481 nop
4821:
483EXC( LOAD t0, 0(src), l_exc)
484 ADD src, src, NBYTES
485 SUB len, len, NBYTES
486EXC( STORE t0, 0(dst), s_exc)
487 ADDC(sum, t0)
488 bne rem, len, 1b
489 ADD dst, dst, NBYTES
490
491 /*
492 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
493 * A loop would do only a byte at a time with possible branch
494 * mispredicts. Can't do an explicit LOAD dst,mask,or,STORE
495 * because can't assume read-access to dst. Instead, use
496 * STREST dst, which doesn't require read access to dst.
497 *
498 * This code should perform better than a simple loop on modern,
499 * wide-issue mips processors because the code has fewer branches and
500 * more instruction-level parallelism.
501 */
502#define bits t2
503 beqz len, done
504 ADD t1, dst, len # t1 is just past last byte of dst
505 li bits, 8*NBYTES
506 SLL rem, len, 3 # rem = number of bits to keep
507EXC( LOAD t0, 0(src), l_exc)
508 SUB bits, bits, rem # bits = number of bits to discard
509 SHIFT_DISCARD t0, t0, bits
510EXC( STREST t0, -1(t1), s_exc)
511 SHIFT_DISCARD_REVERT t0, t0, bits
512 .set reorder
513 ADDC(sum, t0)
514 b done
515 .set noreorder
516dst_unaligned:
517 /*
518 * dst is unaligned
519 * t0 = src & ADDRMASK
520 * t1 = dst & ADDRMASK; T1 > 0
521 * len >= NBYTES
522 *
523 * Copy enough bytes to align dst
524 * Set match = (src and dst have same alignment)
525 */
526#define match rem
527EXC( LDFIRST t3, FIRST(0)(src), l_exc)
528 ADD t2, zero, NBYTES
529EXC( LDREST t3, REST(0)(src), l_exc_copy)
530 SUB t2, t2, t1 # t2 = number of bytes copied
531 xor match, t0, t1
532EXC( STFIRST t3, FIRST(0)(dst), s_exc)
533 SLL t4, t1, 3 # t4 = number of bits to discard
534 SHIFT_DISCARD t3, t3, t4
535 /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
536 ADDC(sum, t3)
537 beq len, t2, done
538 SUB len, len, t2
539 ADD dst, dst, t2
540 beqz match, both_aligned
541 ADD src, src, t2
542
543src_unaligned_dst_aligned:
544 SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
545 beqz t0, cleanup_src_unaligned
546 and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES
5471:
548/*
549 * Avoid consecutive LD*'s to the same register since some mips
550 * implementations can't issue them in the same cycle.
551 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
552 * are to the same unit (unless src is aligned, but it's not).
553 */
554EXC( LDFIRST t0, FIRST(0)(src), l_exc)
555EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy)
556 SUB len, len, 4*NBYTES
557EXC( LDREST t0, REST(0)(src), l_exc_copy)
558EXC( LDREST t1, REST(1)(src), l_exc_copy)
559EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy)
560EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy)
561EXC( LDREST t2, REST(2)(src), l_exc_copy)
562EXC( LDREST t3, REST(3)(src), l_exc_copy)
563 ADD src, src, 4*NBYTES
564#ifdef CONFIG_CPU_SB1
565 nop # improves slotting
566#endif
567EXC( STORE t0, UNIT(0)(dst), s_exc)
568 ADDC(sum, t0)
569EXC( STORE t1, UNIT(1)(dst), s_exc)
570 ADDC(sum, t1)
571EXC( STORE t2, UNIT(2)(dst), s_exc)
572 ADDC(sum, t2)
573EXC( STORE t3, UNIT(3)(dst), s_exc)
574 ADDC(sum, t3)
575 bne len, rem, 1b
576 ADD dst, dst, 4*NBYTES
577
578cleanup_src_unaligned:
579 beqz len, done
580 and rem, len, NBYTES-1 # rem = len % NBYTES
581 beq rem, len, copy_bytes
582 nop
5831:
584EXC( LDFIRST t0, FIRST(0)(src), l_exc)
585EXC( LDREST t0, REST(0)(src), l_exc_copy)
586 ADD src, src, NBYTES
587 SUB len, len, NBYTES
588EXC( STORE t0, 0(dst), s_exc)
589 ADDC(sum, t0)
590 bne len, rem, 1b
591 ADD dst, dst, NBYTES
592
593copy_bytes_checklen:
594 beqz len, done
595 nop
596copy_bytes:
597 /* 0 < len < NBYTES */
598#ifdef CONFIG_CPU_LITTLE_ENDIAN
599#define SHIFT_START 0
600#define SHIFT_INC 8
601#else
602#define SHIFT_START 8*(NBYTES-1)
603#define SHIFT_INC -8
604#endif
605 move t2, zero # partial word
606 li t3, SHIFT_START # shift
607/* use l_exc_copy here to return correct sum on fault */
608#define COPY_BYTE(N) \
609EXC( lbu t0, N(src), l_exc_copy); \
610 SUB len, len, 1; \
611EXC( sb t0, N(dst), s_exc); \
612 SLLV t0, t0, t3; \
613 addu t3, SHIFT_INC; \
614 beqz len, copy_bytes_done; \
615 or t2, t0
616
617 COPY_BYTE(0)
618 COPY_BYTE(1)
619#ifdef USE_DOUBLE
620 COPY_BYTE(2)
621 COPY_BYTE(3)
622 COPY_BYTE(4)
623 COPY_BYTE(5)
624#endif
625EXC( lbu t0, NBYTES-2(src), l_exc_copy)
626 SUB len, len, 1
627EXC( sb t0, NBYTES-2(dst), s_exc)
628 SLLV t0, t0, t3
629 or t2, t0
630copy_bytes_done:
631 ADDC(sum, t2)
632done:
633 /* fold checksum */
634#ifdef USE_DOUBLE
635 dsll32 v1, sum, 0
636 daddu sum, v1
637 sltu v1, sum, v1
638 dsra32 sum, sum, 0
639 addu sum, v1
640#endif
641 sll v1, sum, 16
642 addu sum, v1
643 sltu v1, sum, v1
644 srl sum, sum, 16
645 addu sum, v1
646
647 /* odd buffer alignment? */
648 beqz odd, 1f
649 nop
650 sll v1, sum, 8
651 srl sum, sum, 8
652 or sum, v1
653 andi sum, 0xffff
6541:
655 .set reorder
656 ADDC(sum, psum)
657 jr ra
658 .set noreorder
659
660l_exc_copy:
661 /*
662 * Copy bytes from src until faulting load address (or until a
663 * lb faults)
664 *
665 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
666 * may be more than a byte beyond the last address.
667 * Hence, the lb below may get an exception.
668 *
669 * Assumes src < THREAD_BUADDR($28)
670 */
671 LOAD t0, TI_TASK($28)
672 li t2, SHIFT_START
673 LOAD t0, THREAD_BUADDR(t0)
6741:
675EXC( lbu t1, 0(src), l_exc)
676 ADD src, src, 1
677 sb t1, 0(dst) # can't fault -- we're copy_from_user
678 SLLV t1, t1, t2
679 addu t2, SHIFT_INC
680 ADDC(sum, t1)
681 bne src, t0, 1b
682 ADD dst, dst, 1
683l_exc:
684 LOAD t0, TI_TASK($28)
685 nop
686 LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
687 nop
688 SUB len, AT, t0 # len number of uncopied bytes
689 /*
690 * Here's where we rely on src and dst being incremented in tandem,
691 * See (3) above.
692 * dst += (fault addr - src) to put dst at first byte to clear
693 */
694 ADD dst, t0 # compute start address in a1
695 SUB dst, src
696 /*
697 * Clear len bytes starting at dst. Can't call __bzero because it
698 * might modify len. An inefficient loop for these rare times...
699 */
700 beqz len, done
701 SUB src, len, 1
7021: sb zero, 0(dst)
703 ADD dst, dst, 1
704 bnez src, 1b
705 SUB src, src, 1
706 li v1, -EFAULT
707 b done
708 sw v1, (errptr)
709
710s_exc:
711 li v0, -1 /* invalid checksum */
712 li v1, -EFAULT
713 jr ra
714 sw v1, (errptr)
715 END(__csum_partial_copy_user)
diff --git a/arch/mips/lib/csum_partial_copy.c b/arch/mips/lib/csum_partial_copy.c
deleted file mode 100644
index 06771040a267..000000000000
--- a/arch/mips/lib/csum_partial_copy.c
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (C) 1994, 1995 Waldorf Electronics GmbH
7 * Copyright (C) 1998, 1999 Ralf Baechle
8 */
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/types.h>
12#include <asm/byteorder.h>
13#include <asm/string.h>
14#include <asm/uaccess.h>
15#include <net/checksum.h>
16
17/*
18 * copy while checksumming, otherwise like csum_partial
19 */
20__wsum csum_partial_copy_nocheck(const void *src,
21 void *dst, int len, __wsum sum)
22{
23 /*
24 * It's 2:30 am and I don't feel like doing it real ...
25 * This is lots slower than the real thing (tm)
26 */
27 sum = csum_partial(src, len, sum);
28 memcpy(dst, src, len);
29
30 return sum;
31}
32
33EXPORT_SYMBOL(csum_partial_copy_nocheck);
34
35/*
36 * Copy from userspace and compute checksum. If we catch an exception
37 * then zero the rest of the buffer.
38 */
39__wsum csum_partial_copy_from_user (const void __user *src,
40 void *dst, int len, __wsum sum, int *err_ptr)
41{
42 int missing;
43
44 might_sleep();
45 missing = copy_from_user(dst, src, len);
46 if (missing) {
47 memset(dst + len - missing, 0, missing);
48 *err_ptr = -EFAULT;
49 }
50
51 return csum_partial(dst, len, sum);
52}