Diffstat (limited to 'arch/sparc64/lib/checksum.S'):
 arch/sparc64/lib/checksum.S | 172 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+), 0 deletions(-)
diff --git a/arch/sparc64/lib/checksum.S b/arch/sparc64/lib/checksum.S
new file mode 100644
index 000000000000..ba9cd3ccc2b2
--- /dev/null
+++ b/arch/sparc64/lib/checksum.S
@@ -0,0 +1,172 @@
/* checksum.S: Sparc V9 optimized checksum code.
 *
 *  Copyright(C) 1995 Linus Torvalds
 *  Copyright(C) 1995 Miguel de Icaza
 *  Copyright(C) 1996, 2000 David S. Miller
 *  Copyright(C) 1997 Jakub Jelinek
 *
 * derived from:
 *      Linux/Alpha checksum c-code
 *      Linux/ix86 inline checksum assembly
 *      RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
 *      David Mosberger-Tang for optimized reference c-code
 *      BSD4.4 portable checksum routine
 */

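/* Reference sketch (illustrative only, not from the original file):
 * csum_partial() accumulates an RFC 1071 one's-complement partial
 * checksum over a buffer and folds it into the incoming sum.  The core
 * arithmetic, stripped of alignment and scheduling concerns, looks
 * roughly like this hypothetical C helper:
 *
 *      unsigned int csum32(const unsigned int *w, int nwords,
 *                          unsigned int sum)
 *      {
 *              unsigned long long acc = sum;
 *
 *              while (nwords--)
 *                      acc += *w++;    // carries pile up above bit 31
 *              while (acc >> 32)       // fold them back in
 *                      acc = (acc & 0xffffffffULL) + (acc >> 32);
 *              return (unsigned int)acc;
 *      }
 *
 * The assembly below performs the same arithmetic with prefetching,
 * loop unrolling, and explicit fixups for unaligned buffers.
 */
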
        .text

csum_partial_fix_alignment:
        /* We checked for zero length already, so there must be
         * at least one byte.
         */
        be,pt           %icc, 1f
         nop
        ldub            [%o0 + 0x00], %o4
        add             %o0, 1, %o0
        sub             %o1, 1, %o1
1:      andcc           %o0, 0x2, %g0
        be,pn           %icc, csum_partial_post_align
         cmp            %o1, 2
        blu,pn          %icc, csum_partial_end_cruft
         nop
        lduh            [%o0 + 0x00], %o5
        add             %o0, 2, %o0
        sub             %o1, 2, %o1
        ba,pt           %xcc, csum_partial_post_align
         add            %o5, %o4, %o4
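
        /* The peel above, sketched in hypothetical C (not kernel code):
         * strip a leading byte and/or halfword so the main loop sees a
         * 4-byte-aligned pointer; the odd-start byte lands in the wrong
         * lane and is compensated by the final byte-swap:
         *
         *      if ((unsigned long)p & 1) { acc += *p++; len--; }
         *      if (((unsigned long)p & 2) && len >= 2) {
         *              acc += *(const unsigned short *)p;
         *              p += 2; len -= 2;
         *      }
         */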

        .align          32
        .globl          csum_partial
csum_partial:   /* %o0=buff, %o1=len, %o2=sum */
        prefetch        [%o0 + 0x000], #n_reads
        clr             %o4
        prefetch        [%o0 + 0x040], #n_reads
        brz,pn          %o1, csum_partial_finish
         andcc          %o0, 0x3, %g0

        /* We remember in %g7 whether the lowest bit of the address
         * was set; if it was, we must swap the upper and lower 8-bit
         * halves of the sum we calculate (see the byte-swap at the end).
         */
        bne,pn          %icc, csum_partial_fix_alignment
         andcc          %o0, 0x1, %g7
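
        /* Why one swap at the end suffices (illustrative note): per
         * RFC 1071's byte-order independence, one's-complement addition
         * commutes with byte-swapping,
         *
         *      swap16(x) +' swap16(y) == swap16(x +' y)
         *
         * so a sum taken with every halfword's lanes flipped is repaired
         * by byte-swapping the final 16-bit result once.
         */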

csum_partial_post_align:
        prefetch        [%o0 + 0x080], #n_reads
        andncc          %o1, 0x3f, %o3

        prefetch        [%o0 + 0x0c0], #n_reads
        sub             %o1, %o3, %o1
        brz,pn          %o3, 2f
         prefetch       [%o0 + 0x100], #n_reads

        /* So that we don't need to use the non-pairing
         * add-with-carry instructions we accumulate 32-bit
         * values into a 64-bit register.  At the end of the
         * loop we fold it down to 32 bits, and later to 16.
         */
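        /* The same trick in hypothetical C: 32-bit addends cannot
         * overflow a 64-bit accumulator at these lengths, so the loop
         * needs no add-with-carry at all:
         *
         *      unsigned long long acc = 0;
         *      while (nwords--)
         *              acc += *w++;    // plain adds; carries collect
         *                              // in the upper 32 bits
         */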
        prefetch        [%o0 + 0x140], #n_reads
1:      lduw            [%o0 + 0x00], %o5
        lduw            [%o0 + 0x04], %g1
        lduw            [%o0 + 0x08], %g2
        add             %o4, %o5, %o4
        lduw            [%o0 + 0x0c], %g3
        add             %o4, %g1, %o4
        lduw            [%o0 + 0x10], %o5
        add             %o4, %g2, %o4
        lduw            [%o0 + 0x14], %g1
        add             %o4, %g3, %o4
        lduw            [%o0 + 0x18], %g2
        add             %o4, %o5, %o4
        lduw            [%o0 + 0x1c], %g3
        add             %o4, %g1, %o4
        lduw            [%o0 + 0x20], %o5
        add             %o4, %g2, %o4
        lduw            [%o0 + 0x24], %g1
        add             %o4, %g3, %o4
        lduw            [%o0 + 0x28], %g2
        add             %o4, %o5, %o4
        lduw            [%o0 + 0x2c], %g3
        add             %o4, %g1, %o4
        lduw            [%o0 + 0x30], %o5
        add             %o4, %g2, %o4
        lduw            [%o0 + 0x34], %g1
        add             %o4, %g3, %o4
        lduw            [%o0 + 0x38], %g2
        add             %o4, %o5, %o4
        lduw            [%o0 + 0x3c], %g3
        add             %o4, %g1, %o4
        prefetch        [%o0 + 0x180], #n_reads
        add             %o4, %g2, %o4
        subcc           %o3, 0x40, %o3
        add             %o0, 0x40, %o0
        bne,pt          %icc, 1b
         add            %o4, %g3, %o4

2:      and             %o1, 0x3c, %o3
        brz,pn          %o3, 2f
         sub            %o1, %o3, %o1
1:      lduw            [%o0 + 0x00], %o5
        subcc           %o3, 0x4, %o3
        add             %o0, 0x4, %o0
        bne,pt          %icc, 1b
         add            %o4, %o5, %o4

2:
        /* fold 64-->32 */
        srlx            %o4, 32, %o5
        srl             %o4, 0, %o4
        add             %o4, %o5, %o4
        srlx            %o4, 32, %o5
        srl             %o4, 0, %o4
        add             %o4, %o5, %o4
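
        /* C equivalent of the fold above (illustrative); two rounds,
         * because the first add can itself carry into bit 32:
         *
         *      acc = (acc & 0xffffffffULL) + (acc >> 32);
         *      acc = (acc & 0xffffffffULL) + (acc >> 32);
         */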

        /* fold 32-->16 */
        sethi           %hi(0xffff0000), %g1
        srl             %o4, 16, %o5
        andn            %o4, %g1, %g2
        add             %o5, %g2, %o4
        srl             %o4, 16, %o5
        andn            %o4, %g1, %g2
        add             %o5, %g2, %o4
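
        /* C equivalent (illustrative): %g1 holds the mask 0xffff0000 and
         * andn extracts the low halfword; again two rounds:
         *
         *      sum = (sum & 0xffff) + (sum >> 16);
         *      sum = (sum & 0xffff) + (sum >> 16);
         */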

csum_partial_end_cruft:
        /* %o4 has the 16-bit sum we have calculated so far. */
        cmp             %o1, 2
        blu,pt          %icc, 1f
         nop
        lduh            [%o0 + 0x00], %o5
        sub             %o1, 2, %o1
        add             %o0, 2, %o0
        add             %o4, %o5, %o4
1:      brz,pt          %o1, 1f
         nop
        ldub            [%o0 + 0x00], %o5
        sub             %o1, 1, %o1
        add             %o0, 1, %o0
        sllx            %o5, 8, %o5
        add             %o4, %o5, %o4
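
        /* The tail in hypothetical C: a trailing halfword adds directly,
         * while a final odd byte is the high lane of a zero-padded
         * halfword on this big-endian CPU, hence the shift by 8:
         *
         *      if (len >= 2) { sum += *(const unsigned short *)p; p += 2; len -= 2; }
         *      if (len)
         *              sum += (unsigned int)*p << 8;
         */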
1:
        /* fold 32-->16 */
        sethi           %hi(0xffff0000), %g1
        srl             %o4, 16, %o5
        andn            %o4, %g1, %g2
        add             %o5, %g2, %o4
        srl             %o4, 16, %o5
        andn            %o4, %g1, %g2
        add             %o5, %g2, %o4

1:      brz,pt          %g7, 1f
         nop

        /* We started with an odd byte; byte-swap the result. */
        srl             %o4, 8, %o5
        and             %o4, 0xff, %g1
        sll             %g1, 8, %g1
        or              %o5, %g1, %o4
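
        /* The swap in hypothetical C, a byte-rotate of the 16-bit result:
         *
         *      sum = ((sum & 0xff) << 8) | ((sum >> 8) & 0xff);
         */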

1:      add             %o2, %o4, %o2

csum_partial_finish:
        retl
         mov            %o2, %o0