CRIS v32: Update lib/checksum.S and lib/checksumcopy.S

- Slight tweaks; use $acr + addoq to propagate carry across the loop boundary.
- Better use of latency cycles.
- Remove duplicate folding of carry; it is not needed.
author: Jesper Nilsson <jesper.nilsson@axis.com> 2008-01-25 11:54:14 -0500
committer: Jesper Nilsson <jesper.nilsson@axis.com> 2008-02-08 05:06:35 -0500
commit: 41f9412b206985a36145b423f58bf8b46085358e (patch)
tree: 4d573cfdbe8d7dd066bf78f3af31cf90effaf9e0
parent: 7674464cb31ff652d2eda69783ef61640eae4c3c (diff)
2 files changed, 43 insertions, 98 deletions
diff --git a/arch/cris/arch-v32/lib/checksum.S b/arch/cris/arch-v32/lib/checksum.S
index 32e66181b826..87f3fd71ab10 100644
--- a/arch/cris/arch-v32/lib/checksum.S
+++ b/arch/cris/arch-v32/lib/checksum.S
@@ -1,6 +1,6 @@
 /*
 * A fast checksum routine using movem
- * Copyright (c) 1998-2001, 2003 Axis Communications AB
+ * Copyright (c) 1998-2007 Axis Communications AB
 *
 * csum_partial(const unsigned char * buff, int len, unsigned int sum)
 */
@@ -12,30 +12,23 @@ csum_partial:
        ;; r11 - length
        ;; r12 - checksum
-        ;; check for breakeven length between movem and normal word looping versions
+        ;; Optimized for large packets
-        ;; we also do _NOT_ want to compute a checksum over more than the
+        subq    10*4, $r11
-        ;; actual length when length < 40
+        blt     _word_loop
+        move.d  $r11, $acr
-        cmpu.w  80,$r11
-        blo     _word_loop
-        nop
-        ;; need to save the registers we use below in the movem loop
-        ;; this overhead is why we have a check above for breakeven length
-        ;; only r0 - r8 have to be saved, the other ones are clobber-able
-        ;; according to the ABI
        subq    9*4,$sp
-        subq    10*4,$r11       ; update length for the first loop
+        clearf  c
        movem   $r8,[$sp]
        ;; do a movem checksum
 _mloop: movem   [$r10+],$r9     ; read 10 longwords
+        ;; Loop count without touching the c flag.
+        addoq   -10*4, $acr, $acr
        ;; perform dword checksumming on the 10 longwords
-        add.d   $r0,$r12
+        addc    $r0,$r12
        addc    $r1,$r12
        addc    $r2,$r12
        addc    $r3,$r12
@@ -46,60 +39,41 @@ _mloop:	movem	[$r10+],$r9	; read 10 longwords
        addc    $r8,$r12
        addc    $r9,$r12
-        ;; fold the carry into the checksum, to avoid having to loop the carry
+        ;; test $acr without trashing carry.
-        ;; back into the top
+        move.d  $acr, $acr
+        bpl     _mloop
-        addc    0,$r12
+        ;; r11 <= acr  is not really needed in the mloop, just using the dslot
-        addc    0,$r12          ; do it again, since we might have generated a carry
+        ;; to prepare for what is needed after mloop.
+        move.d  $acr, $r11
-        subq    10*4,$r11
-        bge     _mloop
-        nop
-        addq    10*4,$r11       ; compensate for last loop underflowing length
+        ;; fold the last carry into r12
+        addc    0, $r12
        movem   [$sp+],$r8      ; restore regs
 _word_loop:
-        ;; only fold if there is anything to fold.
+        addq    10*4,$r11       ; compensate for last loop underflowing length
-        cmpq    0,$r12
-        beq     _no_fold
-        ;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below.
-        ;; r9 and r13 can be used as temporaries.
        moveq   -1,$r9          ; put 0xffff in r9, faster than move.d 0xffff,r9
        lsrq    16,$r9
        move.d  $r12,$r13
        lsrq    16,$r13         ; r13 = checksum >> 16
-        and.d   $r9,$r12                ; checksum = checksum & 0xffff
+        and.d   $r9,$r12        ; checksum = checksum & 0xffff
-        add.d   $r13,$r12               ; checksum += r13
-        move.d  $r12,$r13               ; do the same again, maybe we got a carry last add
-        lsrq    16,$r13
-        and.d   $r9,$r12
-        add.d   $r13,$r12
 _no_fold:
-        cmpq    2,$r11
+        subq    2,$r11
        blt     _no_words
-        nop
+        add.d   $r13,$r12       ; checksum += r13
        ;; checksum the rest of the words
-        subq    2,$r11
 _wloop: subq    2,$r11
        bge     _wloop
        addu.w  [$r10+],$r12
-        addq    2,$r11
 _no_words:
+        addq    2,$r11
        ;; see if we have one odd byte more
-        cmpq    1,$r11
+        bne     _do_byte
-        beq     _do_byte
        nop
        ret
        move.d  $r12,$r10
diff --git a/arch/cris/arch-v32/lib/checksumcopy.S b/arch/cris/arch-v32/lib/checksumcopy.S
index 9303ccbadc6d..21aabe91489b 100644
--- a/arch/cris/arch-v32/lib/checksumcopy.S
+++ b/arch/cris/arch-v32/lib/checksumcopy.S
@@ -1,6 +1,6 @@
 /*
 * A fast checksum+copy routine using movem
- * Copyright (c) 1998, 2001, 2003 Axis Communications AB
+ * Copyright (c) 1998-2007 Axis Communications AB
 *
 * Authors:     Bjorn Wesen
 *
@@ -16,32 +16,23 @@ csum_partial_copy_nocheck:
        ;; r12 - length
        ;; r13 - checksum
-        ;; check for breakeven length between movem and normal word looping versions
+        ;; Optimized for large packets
-        ;; we also do _NOT_ want to compute a checksum over more than the
+        subq    10*4, $r12
-        ;; actual length when length < 40
+        blt     _word_loop
+        move.d  $r12, $acr
-        cmpu.w  80,$r12
-        blo     _word_loop
-        nop
-        ;; need to save the registers we use below in the movem loop
-        ;; this overhead is why we have a check above for breakeven length
-        ;; only r0 - r8 have to be saved, the other ones are clobber-able
-        ;; according to the ABI
        subq    9*4,$sp
-        subq    10*4,$r12       ; update length for the first loop
+        clearf  c
        movem   $r8,[$sp]
        ;; do a movem copy and checksum
 1:      ;; A failing userspace access (the read) will have this as PC.
 _mloop: movem   [$r10+],$r9     ; read 10 longwords
+        addoq   -10*4, $acr, $acr ; loop counter in latency cycle
        movem   $r9,[$r11+]     ; write 10 longwords
        ;; perform dword checksumming on the 10 longwords
+        addc    $r0,$r13
-        add.d   $r0,$r13
        addc    $r1,$r13
        addc    $r2,$r13
        addc    $r3,$r13
@@ -52,47 +43,30 @@ _mloop:	movem	[$r10+],$r9	; read 10 longwords
        addc    $r8,$r13
        addc    $r9,$r13
-        ;; fold the carry into the checksum, to avoid having to loop the carry
+        ;; test $acr, without trashing carry.
-        ;; back into the top
+        move.d  $acr, $acr
+        bpl     _mloop
-        addc    0,$r13
+        ;; r12 <= acr  is needed after mloop and in the exception handlers.
-        addc    0,$r13          ; do it again, since we might have generated a carry
+        move.d  $acr, $r12
-        subq    10*4,$r12
-        bge     _mloop
-        nop
-        addq    10*4,$r12       ; compensate for last loop underflowing length
+        ;; fold the last carry into r13
+        addc    0, $r13
        movem   [$sp+],$r8      ; restore regs
 _word_loop:
-        ;; only fold if there is anything to fold.
+        addq    10*4,$r12       ; compensate for last loop underflowing length
-        cmpq    0,$r13
-        beq     _no_fold
        ;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below
        ;; r9 can be used as temporary.
        move.d  $r13,$r9
        lsrq    16,$r9          ; r0 = checksum >> 16
        and.d   0xffff,$r13     ; checksum = checksum & 0xffff
-        add.d   $r9,$r13        ; checksum += r0
-        move.d  $r13,$r9        ; do the same again, maybe we got a carry last add
-        lsrq    16,$r9
-        and.d   0xffff,$r13
-        add.d   $r9,$r13
-_no_fold:
+        subq    2, $r12
-        cmpq    2,$r12
        blt     _no_words
-        nop
+        add.d   $r9,$r13        ; checksum += r9
        ;; copy and checksum the rest of the words
-        subq    2,$r12
 2:      ;; A failing userspace access for the read below will have this as PC.
 _wloop: move.w  [$r10+],$r9
        addu.w  $r9,$r13
@@ -100,12 +74,9 @@ _wloop:	move.w	[$r10+],$r9
        bge     _wloop
        move.w  $r9,[$r11+]
-        addq    2,$r12
 _no_words:
-        ;; see if we have one odd byte more
+        addq    2,$r12
-        cmpq    1,$r12
+        bne     _do_byte
-        beq     _do_byte
        nop
        ret
        move.d  $r13,$r10
author	Jesper Nilsson <jesper.nilsson@axis.com>	2008-01-25 11:54:14 -0500
committer	Jesper Nilsson <jesper.nilsson@axis.com>	2008-02-08 05:06:35 -0500
commit	41f9412b206985a36145b423f58bf8b46085358e (patch)
tree	4d573cfdbe8d7dd066bf78f3af31cf90effaf9e0
parent	7674464cb31ff652d2eda69783ef61640eae4c3c (diff)

diff --git a/arch/cris/arch-v32/lib/checksum.S b/arch/cris/arch-v32/lib/checksum.S index 32e66181b826..87f3fd71ab10 100644 --- a/arch/cris/arch-v32/lib/checksum.S +++ b/arch/cris/arch-v32/lib/checksum.S
@@ -1,6 +1,6 @@
1	/*	1	/*
2	* A fast checksum routine using movem	2	* A fast checksum routine using movem
3	* Copyright (c) 1998-2001, 2003 Axis Communications AB	3	* Copyright (c) 1998-2007 Axis Communications AB
4	*	4	*
5	* csum_partial(const unsigned char * buff, int len, unsigned int sum)	5	* csum_partial(const unsigned char * buff, int len, unsigned int sum)
6	*/	6	*/
@@ -12,30 +12,23 @@ csum_partial:
12	;; r11 - length	12	;; r11 - length
13	;; r12 - checksum	13	;; r12 - checksum
14		14
15	;; check for breakeven length between movem and normal word looping versions	15	;; Optimized for large packets
16	;; we also do _NOT_ want to compute a checksum over more than the	16	subq 10*4, $r11
17	;; actual length when length < 40	17	blt _word_loop
18		18	move.d $r11, $acr
19	cmpu.w 80,$r11
20	blo _word_loop
21	nop
22
23	;; need to save the registers we use below in the movem loop
24	;; this overhead is why we have a check above for breakeven length
25	;; only r0 - r8 have to be saved, the other ones are clobber-able
26	;; according to the ABI
27		19
28	subq 9*4,$sp	20	subq 9*4,$sp
29	subq 10*4,$r11 ; update length for the first loop	21	clearf c
30	movem $r8,[$sp]	22	movem $r8,[$sp]
31		23
32	;; do a movem checksum	24	;; do a movem checksum
33		25
34	_mloop: movem [$r10+],$r9 ; read 10 longwords	26	_mloop: movem [$r10+],$r9 ; read 10 longwords
35		27	;; Loop count without touching the c flag.
		28	addoq -10*4, $acr, $acr
36	;; perform dword checksumming on the 10 longwords	29	;; perform dword checksumming on the 10 longwords
37		30
38	add.d $r0,$r12	31	addc $r0,$r12
39	addc $r1,$r12	32	addc $r1,$r12
40	addc $r2,$r12	33	addc $r2,$r12
41	addc $r3,$r12	34	addc $r3,$r12
@@ -46,60 +39,41 @@ _mloop: movem [$r10+],$r9 ; read 10 longwords
46	addc $r8,$r12	39	addc $r8,$r12
47	addc $r9,$r12	40	addc $r9,$r12
48		41
49	;; fold the carry into the checksum, to avoid having to loop the carry	42	;; test $acr without trashing carry.
50	;; back into the top	43	move.d $acr, $acr
51		44	bpl _mloop
52	addc 0,$r12	45	;; r11 <= acr is not really needed in the mloop, just using the dslot
53	addc 0,$r12 ; do it again, since we might have generated a carry	46	;; to prepare for what is needed after mloop.
54		47	move.d $acr, $r11
55	subq 10*4,$r11
56	bge _mloop
57	nop
58
59	addq 10*4,$r11 ; compensate for last loop underflowing length
60		48
		49	;; fold the last carry into r12
		50	addc 0, $r12
61	movem [$sp+],$r8 ; restore regs	51	movem [$sp+],$r8 ; restore regs
62		52
63	_word_loop:	53	_word_loop:
64	;; only fold if there is anything to fold.	54	addq 10*4,$r11 ; compensate for last loop underflowing length
65
66	cmpq 0,$r12
67	beq _no_fold
68
69	;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below.
70	;; r9 and r13 can be used as temporaries.
71		55
72	moveq -1,$r9 ; put 0xffff in r9, faster than move.d 0xffff,r9	56	moveq -1,$r9 ; put 0xffff in r9, faster than move.d 0xffff,r9
73	lsrq 16,$r9	57	lsrq 16,$r9
74		58
75	move.d $r12,$r13	59	move.d $r12,$r13
76	lsrq 16,$r13 ; r13 = checksum >> 16	60	lsrq 16,$r13 ; r13 = checksum >> 16
77	and.d $r9,$r12 ; checksum = checksum & 0xffff	61	and.d $r9,$r12 ; checksum = checksum & 0xffff
78	add.d $r13,$r12 ; checksum += r13
79	move.d $r12,$r13 ; do the same again, maybe we got a carry last add
80	lsrq 16,$r13
81	and.d $r9,$r12
82	add.d $r13,$r12
83		62
84	_no_fold:	63	_no_fold:
85	cmpq 2,$r11	64	subq 2,$r11
86	blt _no_words	65	blt _no_words
87	nop	66	add.d $r13,$r12 ; checksum += r13
88		67
89	;; checksum the rest of the words	68	;; checksum the rest of the words
90
91	subq 2,$r11
92
93	_wloop: subq 2,$r11	69	_wloop: subq 2,$r11
94	bge _wloop	70	bge _wloop
95	addu.w [$r10+],$r12	71	addu.w [$r10+],$r12
96		72
97	addq 2,$r11
98
99	_no_words:	73	_no_words:
		74	addq 2,$r11
100	;; see if we have one odd byte more	75	;; see if we have one odd byte more
101	cmpq 1,$r11	76	bne _do_byte
102	beq _do_byte
103	nop	77	nop
104	ret	78	ret
105	move.d $r12,$r10	79	move.d $r12,$r10


diff --git a/arch/cris/arch-v32/lib/checksumcopy.S b/arch/cris/arch-v32/lib/checksumcopy.S index 9303ccbadc6d..21aabe91489b 100644 --- a/arch/cris/arch-v32/lib/checksumcopy.S +++ b/arch/cris/arch-v32/lib/checksumcopy.S
@@ -1,6 +1,6 @@
1	/*	1	/*
2	* A fast checksum+copy routine using movem	2	* A fast checksum+copy routine using movem
3	* Copyright (c) 1998, 2001, 2003 Axis Communications AB	3	* Copyright (c) 1998-2007 Axis Communications AB
4	*	4	*
5	* Authors: Bjorn Wesen	5	* Authors: Bjorn Wesen
6	*	6	*
@@ -16,32 +16,23 @@ csum_partial_copy_nocheck:
16	;; r12 - length	16	;; r12 - length
17	;; r13 - checksum	17	;; r13 - checksum
18		18
19	;; check for breakeven length between movem and normal word looping versions	19	;; Optimized for large packets
20	;; we also do _NOT_ want to compute a checksum over more than the	20	subq 10*4, $r12
21	;; actual length when length < 40	21	blt _word_loop
22		22	move.d $r12, $acr
23	cmpu.w 80,$r12
24	blo _word_loop
25	nop
26
27	;; need to save the registers we use below in the movem loop
28	;; this overhead is why we have a check above for breakeven length
29	;; only r0 - r8 have to be saved, the other ones are clobber-able
30	;; according to the ABI
31		23
32	subq 9*4,$sp	24	subq 9*4,$sp
33	subq 10*4,$r12 ; update length for the first loop	25	clearf c
34	movem $r8,[$sp]	26	movem $r8,[$sp]
35		27
36	;; do a movem copy and checksum	28	;; do a movem copy and checksum
37
38	1: ;; A failing userspace access (the read) will have this as PC.	29	1: ;; A failing userspace access (the read) will have this as PC.
39	_mloop: movem [$r10+],$r9 ; read 10 longwords	30	_mloop: movem [$r10+],$r9 ; read 10 longwords
		31	addoq -10*4, $acr, $acr ; loop counter in latency cycle
40	movem $r9,[$r11+] ; write 10 longwords	32	movem $r9,[$r11+] ; write 10 longwords
41		33
42	;; perform dword checksumming on the 10 longwords	34	;; perform dword checksumming on the 10 longwords
43		35	addc $r0,$r13
44	add.d $r0,$r13
45	addc $r1,$r13	36	addc $r1,$r13
46	addc $r2,$r13	37	addc $r2,$r13
47	addc $r3,$r13	38	addc $r3,$r13
@@ -52,47 +43,30 @@ _mloop: movem [$r10+],$r9 ; read 10 longwords
52	addc $r8,$r13	43	addc $r8,$r13
53	addc $r9,$r13	44	addc $r9,$r13
54		45
55	;; fold the carry into the checksum, to avoid having to loop the carry	46	;; test $acr, without trashing carry.
56	;; back into the top	47	move.d $acr, $acr
57		48	bpl _mloop
58	addc 0,$r13	49	;; r12 <= acr is needed after mloop and in the exception handlers.
59	addc 0,$r13 ; do it again, since we might have generated a carry	50	move.d $acr, $r12
60
61	subq 10*4,$r12
62	bge _mloop
63	nop
64
65	addq 10*4,$r12 ; compensate for last loop underflowing length
66		51
		52	;; fold the last carry into r13
		53	addc 0, $r13
67	movem [$sp+],$r8 ; restore regs	54	movem [$sp+],$r8 ; restore regs
68		55
69	_word_loop:	56	_word_loop:
70	;; only fold if there is anything to fold.	57	addq 10*4,$r12 ; compensate for last loop underflowing length
71
72	cmpq 0,$r13
73	beq _no_fold
74		58
75	;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below	59	;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below
76	;; r9 can be used as temporary.	60	;; r9 can be used as temporary.
77
78	move.d $r13,$r9	61	move.d $r13,$r9
79	lsrq 16,$r9 ; r0 = checksum >> 16	62	lsrq 16,$r9 ; r0 = checksum >> 16
80	and.d 0xffff,$r13 ; checksum = checksum & 0xffff	63	and.d 0xffff,$r13 ; checksum = checksum & 0xffff
81	add.d $r9,$r13 ; checksum += r0
82	move.d $r13,$r9 ; do the same again, maybe we got a carry last add
83	lsrq 16,$r9
84	and.d 0xffff,$r13
85	add.d $r9,$r13
86		64
87	_no_fold:	65	subq 2, $r12
88	cmpq 2,$r12
89	blt _no_words	66	blt _no_words
90	nop	67	add.d $r9,$r13 ; checksum += r0
91		68
92	;; copy and checksum the rest of the words	69	;; copy and checksum the rest of the words
93
94	subq 2,$r12
95
96	2: ;; A failing userspace access for the read below will have this as PC.	70	2: ;; A failing userspace access for the read below will have this as PC.
97	_wloop: move.w [$r10+],$r9	71	_wloop: move.w [$r10+],$r9
98	addu.w $r9,$r13	72	addu.w $r9,$r13
@@ -100,12 +74,9 @@ _wloop: move.w [$r10+],$r9
100	bge _wloop	74	bge _wloop
101	move.w $r9,[$r11+]	75	move.w $r9,[$r11+]
102		76
103	addq 2,$r12
104
105	_no_words:	77	_no_words:
106	;; see if we have one odd byte more	78	addq 2,$r12
107	cmpq 1,$r12	79	bne _do_byte
108	beq _do_byte
109	nop	80	nop
110	ret	81	ret
111	move.d $r13,$r10	82	move.d $r13,$r10