diff options
Diffstat (limited to 'arch/alpha/lib')
-rw-r--r-- | arch/alpha/lib/csum_ipv6_magic.S | 51 | ||||
-rw-r--r-- | arch/alpha/lib/ev6-csum_ipv6_magic.S | 42 |
2 files changed, 70 insertions, 23 deletions
diff --git a/arch/alpha/lib/csum_ipv6_magic.S b/arch/alpha/lib/csum_ipv6_magic.S index e09748dbf2ed..2c2acb96deb6 100644 --- a/arch/alpha/lib/csum_ipv6_magic.S +++ b/arch/alpha/lib/csum_ipv6_magic.S | |||
@@ -7,6 +7,9 @@ | |||
7 | * __u32 len, | 7 | * __u32 len, |
8 | * unsigned short proto, | 8 | * unsigned short proto, |
9 | * unsigned int csum); | 9 | * unsigned int csum); |
10 | * | ||
11 | * Misalignment handling (which costs 16 instructions / 8 cycles) | ||
12 | * added by Ivan Kokshaysky <ink@jurassic.park.msu.ru> | ||
10 | */ | 13 | */ |
11 | 14 | ||
12 | .globl csum_ipv6_magic | 15 | .globl csum_ipv6_magic |
@@ -16,37 +19,57 @@ | |||
16 | csum_ipv6_magic: | 19 | csum_ipv6_magic: |
17 | .prologue 0 | 20 | .prologue 0 |
18 | 21 | ||
19 | ldq $0,0($16) # e0 : load src & dst addr words | 22 | ldq_u $0,0($16) # e0 : load src & dst addr words |
20 | zapnot $20,15,$20 # .. e1 : zero extend incoming csum | 23 | zapnot $20,15,$20 # .. e1 : zero extend incoming csum |
21 | extqh $18,1,$4 # e0 : byte swap len & proto while we wait | 24 | extqh $18,1,$4 # e0 : byte swap len & proto while we wait |
22 | ldq $1,8($16) # .. e1 : | 25 | ldq_u $21,7($16) # .. e1 : handle misalignment |
23 | 26 | ||
24 | extbl $18,1,$5 # e0 : | 27 | extbl $18,1,$5 # e0 : |
25 | ldq $2,0($17) # .. e1 : | 28 | ldq_u $1,8($16) # .. e1 : |
26 | extbl $18,2,$6 # e0 : | 29 | extbl $18,2,$6 # e0 : |
27 | ldq $3,8($17) # .. e1 : | 30 | ldq_u $22,15($16) # .. e1 : |
28 | 31 | ||
29 | extbl $18,3,$18 # e0 : | 32 | extbl $18,3,$18 # e0 : |
33 | ldq_u $2,0($17) # .. e1 : | ||
30 | sra $4,32,$4 # e0 : | 34 | sra $4,32,$4 # e0 : |
35 | ldq_u $23,7($17) # .. e1 : | ||
36 | |||
37 | extql $0,$16,$0 # e0 : | ||
38 | ldq_u $3,8($17) # .. e1 : | ||
39 | extqh $21,$16,$21 # e0 : | ||
40 | ldq_u $24,15($17) # .. e1 : | ||
41 | |||
31 | sll $5,16,$5 # e0 : | 42 | sll $5,16,$5 # e0 : |
43 | or $0,$21,$0 # .. e1 : 1st src word complete | ||
44 | extql $1,$16,$1 # e0 : | ||
32 | addq $20,$0,$20 # .. e1 : begin summing the words | 45 | addq $20,$0,$20 # .. e1 : begin summing the words |
33 | 46 | ||
34 | sll $6,8,$6 # e0 : | 47 | extqh $22,$16,$22 # e0 : |
35 | cmpult $20,$0,$0 # .. e1 : | 48 | cmpult $20,$0,$0 # .. e1 : |
36 | extwh $19,7,$7 # e0 : | 49 | sll $6,8,$6 # e0 : |
37 | or $4,$18,$18 # .. e1 : | 50 | or $1,$22,$1 # .. e1 : 2nd src word complete |
38 | 51 | ||
39 | extbl $19,1,$19 # e0 : | 52 | extql $2,$17,$2 # e0 : |
53 | or $4,$18,$18 # .. e1 : | ||
54 | extqh $23,$17,$23 # e0 : | ||
40 | or $5,$6,$5 # .. e1 : | 55 | or $5,$6,$5 # .. e1 : |
41 | or $18,$5,$18 # e0 : len complete | ||
42 | or $19,$7,$19 # .. e1 : | ||
43 | 56 | ||
44 | sll $19,48,$19 # e0 : | 57 | extql $3,$17,$3 # e0 : |
58 | or $2,$23,$2 # .. e1 : 1st dst word complete | ||
59 | extqh $24,$17,$24 # e0 : | ||
60 | or $18,$5,$18 # .. e1 : len complete | ||
61 | |||
62 | extwh $19,7,$7 # e0 : | ||
63 | or $3,$24,$3 # .. e1 : 2nd dst word complete | ||
64 | extbl $19,1,$19 # e0 : | ||
45 | addq $20,$1,$20 # .. e1 : | 65 | addq $20,$1,$20 # .. e1 : |
46 | sra $19,32,$19 # e0 : proto complete | 66 | |
67 | or $19,$7,$19 # e0 : | ||
47 | cmpult $20,$1,$1 # .. e1 : | 68 | cmpult $20,$1,$1 # .. e1 : |
69 | sll $19,48,$19 # e0 : | ||
70 | nop # .. e0 : | ||
48 | 71 | ||
49 | nop # e0 : | 72 | sra $19,32,$19 # e0 : proto complete |
50 | addq $20,$2,$20 # .. e1 : | 73 | addq $20,$2,$20 # .. e1 : |
51 | cmpult $20,$2,$2 # e0 : | 74 | cmpult $20,$2,$2 # e0 : |
52 | addq $20,$3,$20 # .. e1 : | 75 | addq $20,$3,$20 # .. e1 : |
@@ -84,7 +107,7 @@ csum_ipv6_magic: | |||
84 | extwl $0,2,$1 # e0 : fold 17-bit value | 107 | extwl $0,2,$1 # e0 : fold 17-bit value |
85 | zapnot $0,3,$0 # .. e1 : | 108 | zapnot $0,3,$0 # .. e1 : |
86 | addq $0,$1,$0 # e0 : | 109 | addq $0,$1,$0 # e0 : |
87 | not $0,$0 # e1 : and complement. | 110 | not $0,$0 # .. e1 : and complement. |
88 | 111 | ||
89 | zapnot $0,3,$0 # e0 : | 112 | zapnot $0,3,$0 # e0 : |
90 | ret # .. e1 : | 113 | ret # .. e1 : |
diff --git a/arch/alpha/lib/ev6-csum_ipv6_magic.S b/arch/alpha/lib/ev6-csum_ipv6_magic.S index de1948a69118..fc0bc399f872 100644 --- a/arch/alpha/lib/ev6-csum_ipv6_magic.S +++ b/arch/alpha/lib/ev6-csum_ipv6_magic.S | |||
@@ -46,6 +46,10 @@ | |||
46 | * add the 3 low ushorts together, generating a uint | 46 | * add the 3 low ushorts together, generating a uint |
47 | * a final add of the 2 lower ushorts | 47 | * a final add of the 2 lower ushorts |
48 | * truncating the result. | 48 | * truncating the result. |
49 | * | ||
50 | * Misalignment handling added by Ivan Kokshaysky <ink@jurassic.park.msu.ru> | ||
51 | * The cost is 16 instructions (~8 cycles), including two extra loads which | ||
52 | * may cause additional delay in rare cases (load-load replay traps). | ||
49 | */ | 53 | */ |
50 | 54 | ||
51 | .globl csum_ipv6_magic | 55 | .globl csum_ipv6_magic |
@@ -55,25 +59,45 @@ | |||
55 | csum_ipv6_magic: | 59 | csum_ipv6_magic: |
56 | .prologue 0 | 60 | .prologue 0 |
57 | 61 | ||
58 | ldq $0,0($16) # L : Latency: 3 | 62 | ldq_u $0,0($16) # L : Latency: 3 |
59 | inslh $18,7,$4 # U : 0000000000AABBCC | 63 | inslh $18,7,$4 # U : 0000000000AABBCC |
60 | ldq $1,8($16) # L : Latency: 3 | 64 | ldq_u $1,8($16) # L : Latency: 3 |
61 | sll $19,8,$7 # U : U L U L : 0x00000000 00aabb00 | 65 | sll $19,8,$7 # U : U L U L : 0x00000000 00aabb00 |
62 | 66 | ||
67 | and $16,7,$6 # E : src misalignment | ||
68 | ldq_u $5,15($16) # L : Latency: 3 | ||
63 | zapnot $20,15,$20 # U : zero extend incoming csum | 69 | zapnot $20,15,$20 # U : zero extend incoming csum |
64 | ldq $2,0($17) # L : Latency: 3 | 70 | ldq_u $2,0($17) # L : U L U L : Latency: 3 |
65 | sll $19,24,$19 # U : U L L U : 0x000000aa bb000000 | 71 | |
72 | extql $0,$6,$0 # U : | ||
73 | extqh $1,$6,$22 # U : | ||
74 | ldq_u $3,8($17) # L : Latency: 3 | ||
75 | sll $19,24,$19 # U : U U L U : 0x000000aa bb000000 | ||
76 | |||
77 | cmoveq $6,$31,$22 # E : src aligned? | ||
78 | ldq_u $23,15($17) # L : Latency: 3 | ||
66 | inswl $18,3,$18 # U : 000000CCDD000000 | 79 | inswl $18,3,$18 # U : 000000CCDD000000 |
80 | addl $19,$7,$19 # E : U L U L : <sign bits>bbaabb00 | ||
67 | 81 | ||
68 | ldq $3,8($17) # L : Latency: 3 | 82 | or $0,$22,$0 # E : 1st src word complete |
69 | bis $18,$4,$18 # E : 000000CCDDAABBCC | 83 | extql $1,$6,$1 # U : |
70 | addl $19,$7,$19 # E : <sign bits>bbaabb00 | 84 | or $18,$4,$18 # E : 000000CCDDAABBCC |
71 | nop # E : U L U L | 85 | extqh $5,$6,$5 # U : L U L U |
72 | 86 | ||
87 | and $17,7,$6 # E : dst misalignment | ||
88 | extql $2,$6,$2 # U : | ||
89 | or $1,$5,$1 # E : 2nd src word complete | ||
90 | extqh $3,$6,$22 # U : L U L U : | ||
91 | |||
92 | cmoveq $6,$31,$22 # E : dst aligned? | ||
93 | extql $3,$6,$3 # U : | ||
73 | addq $20,$0,$20 # E : begin summing the words | 94 | addq $20,$0,$20 # E : begin summing the words |
95 | extqh $23,$6,$23 # U : L U L U : | ||
96 | |||
74 | srl $18,16,$4 # U : 0000000000CCDDAA | 97 | srl $18,16,$4 # U : 0000000000CCDDAA |
98 | or $2,$22,$2 # E : 1st dst word complete | ||
75 | zap $19,0x3,$19 # U : <sign bits>bbaa0000 | 99 | zap $19,0x3,$19 # U : <sign bits>bbaa0000 |
76 | nop # E : L U U L | 100 | or $3,$23,$3 # E : U L U L : 2nd dst word complete |
77 | 101 | ||
78 | cmpult $20,$0,$0 # E : | 102 | cmpult $20,$0,$0 # E : |
79 | addq $20,$1,$20 # E : | 103 | addq $20,$1,$20 # E : |