Diffstat (limited to 'arch/sparc64/lib/checksum.S'):
 arch/sparc64/lib/checksum.S | 172 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+), 0 deletions(-)
diff --git a/arch/sparc64/lib/checksum.S b/arch/sparc64/lib/checksum.S
new file mode 100644
index 000000000000..ba9cd3ccc2b2
--- /dev/null
+++ b/arch/sparc64/lib/checksum.S
@@ -0,0 +1,172 @@
/* checksum.S: Sparc V9 optimized checksum code.
 *
 *  Copyright(C) 1995 Linus Torvalds
 *  Copyright(C) 1995 Miguel de Icaza
 *  Copyright(C) 1996, 2000 David S. Miller
 *  Copyright(C) 1997 Jakub Jelinek
 *
 * derived from:
 *      Linux/Alpha checksum c-code
 *      Linux/ix86 inline checksum assembly
 *      RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
 *      David Mosberger-Tang for optimized reference c-code
 *      BSD4.4 portable checksum routine
 */

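/* Reference sketch (illustrative only, not from the original file):
 * csum_partial() accumulates an RFC 1071 one's-complement partial
 * checksum over a buffer and folds it into the incoming sum.  The core
 * arithmetic, stripped of alignment and scheduling concerns, looks
 * roughly like this hypothetical C helper:
 *
 *      unsigned int csum32(const unsigned int *w, int nwords,
 *                          unsigned int sum)
 *      {
 *              unsigned long long acc = sum;
 *
 *              while (nwords--)
 *                      acc += *w++;    // carries pile up above bit 31
 *              while (acc >> 32)       // fold them back in
 *                      acc = (acc & 0xffffffffULL) + (acc >> 32);
 *              return (unsigned int)acc;
 *      }
 *
 * The assembly below performs the same arithmetic with prefetching,
 * loop unrolling, and explicit fixups for unaligned buffers.
 */
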
        .text

csum_partial_fix_alignment:
        /* We checked for zero length already, so there must be
         * at least one byte.
         */
        be,pt           %icc, 1f
         nop
        ldub            [%o0 + 0x00], %o4
        add             %o0, 1, %o0
        sub             %o1, 1, %o1
1:      andcc           %o0, 0x2, %g0
        be,pn           %icc, csum_partial_post_align
         cmp            %o1, 2
        blu,pn          %icc, csum_partial_end_cruft
         nop
        lduh            [%o0 + 0x00], %o5
        add             %o0, 2, %o0
        sub             %o1, 2, %o1
        ba,pt           %xcc, csum_partial_post_align
         add            %o5, %o4, %o4
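
        /* The peel above, sketched in hypothetical C (not kernel code):
         * strip a leading byte and/or halfword so the main loop sees a
         * 4-byte-aligned pointer; the odd-start byte lands in the wrong
         * lane and is compensated by the final byte-swap:
         *
         *      if ((unsigned long)p & 1) { acc += *p++; len--; }
         *      if (((unsigned long)p & 2) && len >= 2) {
         *              acc += *(const unsigned short *)p;
         *              p += 2; len -= 2;
         *      }
         */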

        .align          32
        .globl          csum_partial
csum_partial:   /* %o0=buff, %o1=len, %o2=sum */
        prefetch        [%o0 + 0x000], #n_reads
        clr             %o4
        prefetch        [%o0 + 0x040], #n_reads
        brz,pn          %o1, csum_partial_finish
         andcc          %o0, 0x3, %g0

        /* We remember in %g7 whether the lowest bit of the address
         * was set; if it was, we must swap the upper and lower 8-bit
         * halves of the sum we calculate (see the byte-swap at the end).
         */
        bne,pn          %icc, csum_partial_fix_alignment
         andcc          %o0, 0x1, %g7
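
        /* Why one swap at the end suffices (illustrative note): per
         * RFC 1071's byte-order independence, one's-complement addition
         * commutes with byte-swapping,
         *
         *      swap16(x) +' swap16(y) == swap16(x +' y)
         *
         * so a sum taken with every halfword's lanes flipped is repaired
         * by byte-swapping the final 16-bit result once.
         */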

csum_partial_post_align:
        prefetch        [%o0 + 0x080], #n_reads
        andncc          %o1, 0x3f, %o3

        prefetch        [%o0 + 0x0c0], #n_reads
        sub             %o1, %o3, %o1
        brz,pn          %o3, 2f
         prefetch       [%o0 + 0x100], #n_reads

        /* So that we don't need to use the non-pairing
         * add-with-carry instructions we accumulate 32-bit
         * values into a 64-bit register.  At the end of the
         * loop we fold it down to 32 bits, and later to 16.
         */
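        /* The same trick in hypothetical C: 32-bit addends cannot
         * overflow a 64-bit accumulator at these lengths, so the loop
         * needs no add-with-carry at all:
         *
         *      unsigned long long acc = 0;
         *      while (nwords--)
         *              acc += *w++;    // plain adds; carries collect
         *                              // in the upper 32 bits
         */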
        prefetch        [%o0 + 0x140], #n_reads
1:      lduw            [%o0 + 0x00], %o5
        lduw            [%o0 + 0x04], %g1
        lduw            [%o0 + 0x08], %g2
        add             %o4, %o5, %o4
        lduw            [%o0 + 0x0c], %g3
        add             %o4, %g1, %o4
        lduw            [%o0 + 0x10], %o5
        add             %o4, %g2, %o4
        lduw            [%o0 + 0x14], %g1
        add             %o4, %g3, %o4
        lduw            [%o0 + 0x18], %g2
        add             %o4, %o5, %o4
        lduw            [%o0 + 0x1c], %g3
        add             %o4, %g1, %o4
        lduw            [%o0 + 0x20], %o5
        add             %o4, %g2, %o4
        lduw            [%o0 + 0x24], %g1
        add             %o4, %g3, %o4
        lduw            [%o0 + 0x28], %g2
        add             %o4, %o5, %o4
        lduw            [%o0 + 0x2c], %g3
        add             %o4, %g1, %o4
        lduw            [%o0 + 0x30], %o5
        add             %o4, %g2, %o4
        lduw            [%o0 + 0x34], %g1
        add             %o4, %g3, %o4
        lduw            [%o0 + 0x38], %g2
        add             %o4, %o5, %o4
        lduw            [%o0 + 0x3c], %g3
        add             %o4, %g1, %o4
        prefetch        [%o0 + 0x180], #n_reads
        add             %o4, %g2, %o4
        subcc           %o3, 0x40, %o3
        add             %o0, 0x40, %o0
        bne,pt          %icc, 1b
         add            %o4, %g3, %o4

2:      and             %o1, 0x3c, %o3
        brz,pn          %o3, 2f
         sub            %o1, %o3, %o1
1:      lduw            [%o0 + 0x00], %o5
        subcc           %o3, 0x4, %o3
        add             %o0, 0x4, %o0
        bne,pt          %icc, 1b
         add            %o4, %o5, %o4

2:
        /* fold 64-->32 */
        srlx            %o4, 32, %o5
        srl             %o4, 0, %o4
        add             %o4, %o5, %o4
        srlx            %o4, 32, %o5
        srl             %o4, 0, %o4
        add             %o4, %o5, %o4
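
        /* C equivalent of the fold above (illustrative); two rounds,
         * because the first add can itself carry into bit 32:
         *
         *      acc = (acc & 0xffffffffULL) + (acc >> 32);
         *      acc = (acc & 0xffffffffULL) + (acc >> 32);
         */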

        /* fold 32-->16 */
        sethi           %hi(0xffff0000), %g1
        srl             %o4, 16, %o5
        andn            %o4, %g1, %g2
        add             %o5, %g2, %o4
        srl             %o4, 16, %o5
        andn            %o4, %g1, %g2
        add             %o5, %g2, %o4
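
        /* C equivalent (illustrative): %g1 holds the mask 0xffff0000 and
         * andn extracts the low halfword; again two rounds:
         *
         *      sum = (sum & 0xffff) + (sum >> 16);
         *      sum = (sum & 0xffff) + (sum >> 16);
         */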

csum_partial_end_cruft:
        /* %o4 has the 16-bit sum we have calculated so far. */
        cmp             %o1, 2
        blu,pt          %icc, 1f
         nop
        lduh            [%o0 + 0x00], %o5
        sub             %o1, 2, %o1
        add             %o0, 2, %o0
        add             %o4, %o5, %o4
1:      brz,pt          %o1, 1f
         nop
        ldub            [%o0 + 0x00], %o5
        sub             %o1, 1, %o1
        add             %o0, 1, %o0
        sllx            %o5, 8, %o5
        add             %o4, %o5, %o4
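
        /* The tail in hypothetical C: a trailing halfword adds directly,
         * while a final odd byte is the high lane of a zero-padded
         * halfword on this big-endian CPU, hence the shift by 8:
         *
         *      if (len >= 2) { sum += *(const unsigned short *)p; p += 2; len -= 2; }
         *      if (len)
         *              sum += (unsigned int)*p << 8;
         */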
1:
        /* fold 32-->16 */
        sethi           %hi(0xffff0000), %g1
        srl             %o4, 16, %o5
        andn            %o4, %g1, %g2
        add             %o5, %g2, %o4
        srl             %o4, 16, %o5
        andn            %o4, %g1, %g2
        add             %o5, %g2, %o4

1:      brz,pt          %g7, 1f
         nop

        /* We started with an odd byte; byte-swap the result. */
        srl             %o4, 8, %o5
        and             %o4, 0xff, %g1
        sll             %g1, 8, %g1
        or              %o5, %g1, %o4
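
        /* The swap in hypothetical C, a byte-rotate of the 16-bit result:
         *
         *      sum = ((sum & 0xff) << 8) | ((sum >> 8) & 0xff);
         */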

1:      add             %o2, %o4, %o2

csum_partial_finish:
        retl
         mov            %o2, %o0