diff options
Diffstat (limited to 'arch/alpha/lib/ev6-csum_ipv6_magic.S')
-rw-r--r-- | arch/alpha/lib/ev6-csum_ipv6_magic.S | 42 |
1 files changed, 33 insertions, 9 deletions
diff --git a/arch/alpha/lib/ev6-csum_ipv6_magic.S b/arch/alpha/lib/ev6-csum_ipv6_magic.S
index de1948a6911..fc0bc399f87 100644
--- a/arch/alpha/lib/ev6-csum_ipv6_magic.S
+++ b/arch/alpha/lib/ev6-csum_ipv6_magic.S
@@ -46,6 +46,10 @@ | |||
46 | * add the 3 low ushorts together, generating a uint | 46 | * add the 3 low ushorts together, generating a uint |
47 | * a final add of the 2 lower ushorts | 47 | * a final add of the 2 lower ushorts |
48 | * truncating the result. | 48 | * truncating the result. |
49 | * | ||
50 | * Misalignment handling added by Ivan Kokshaysky <ink@jurassic.park.msu.ru> | ||
51 | * The cost is 16 instructions (~8 cycles), including two extra loads which | ||
52 | * may cause additional delay in rare cases (load-load replay traps). | ||
49 | */ | 53 | */ |
50 | 54 | ||
51 | .globl csum_ipv6_magic | 55 | .globl csum_ipv6_magic |
@@ -55,25 +59,45 @@ | |||
55 | csum_ipv6_magic: | 59 | csum_ipv6_magic: |
56 | .prologue 0 | 60 | .prologue 0 |
57 | 61 | ||
58 | ldq $0,0($16) # L : Latency: 3 | 62 | ldq_u $0,0($16) # L : Latency: 3 |
59 | inslh $18,7,$4 # U : 0000000000AABBCC | 63 | inslh $18,7,$4 # U : 0000000000AABBCC |
60 | ldq $1,8($16) # L : Latency: 3 | 64 | ldq_u $1,8($16) # L : Latency: 3 |
61 | sll $19,8,$7 # U : U L U L : 0x00000000 00aabb00 | 65 | sll $19,8,$7 # U : U L U L : 0x00000000 00aabb00 |
62 | 66 | ||
67 | and $16,7,$6 # E : src misalignment | ||
68 | ldq_u $5,15($16) # L : Latency: 3 | ||
63 | zapnot $20,15,$20 # U : zero extend incoming csum | 69 | zapnot $20,15,$20 # U : zero extend incoming csum |
64 | ldq $2,0($17) # L : Latency: 3 | 70 | ldq_u $2,0($17) # L : U L U L : Latency: 3 |
65 | sll $19,24,$19 # U : U L L U : 0x000000aa bb000000 | 71 | |
72 | extql $0,$6,$0 # U : | ||
73 | extqh $1,$6,$22 # U : | ||
74 | ldq_u $3,8($17) # L : Latency: 3 | ||
75 | sll $19,24,$19 # U : U U L U : 0x000000aa bb000000 | ||
76 | |||
77 | cmoveq $6,$31,$22 # E : src aligned? | ||
78 | ldq_u $23,15($17) # L : Latency: 3 | ||
66 | inswl $18,3,$18 # U : 000000CCDD000000 | 79 | inswl $18,3,$18 # U : 000000CCDD000000 |
80 | addl $19,$7,$19 # E : U L U L : <sign bits>bbaabb00 | ||
67 | 81 | ||
68 | ldq $3,8($17) # L : Latency: 3 | 82 | or $0,$22,$0 # E : 1st src word complete |
69 | bis $18,$4,$18 # E : 000000CCDDAABBCC | 83 | extql $1,$6,$1 # U : |
70 | addl $19,$7,$19 # E : <sign bits>bbaabb00 | 84 | or $18,$4,$18 # E : 000000CCDDAABBCC |
71 | nop # E : U L U L | 85 | extqh $5,$6,$5 # U : L U L U |
72 | 86 | ||
87 | and $17,7,$6 # E : dst misalignment | ||
88 | extql $2,$6,$2 # U : | ||
89 | or $1,$5,$1 # E : 2nd src word complete | ||
90 | extqh $3,$6,$22 # U : L U L U : | ||
91 | |||
92 | cmoveq $6,$31,$22 # E : dst aligned? | ||
93 | extql $3,$6,$3 # U : | ||
73 | addq $20,$0,$20 # E : begin summing the words | 94 | addq $20,$0,$20 # E : begin summing the words |
95 | extqh $23,$6,$23 # U : L U L U : | ||
96 | |||
74 | srl $18,16,$4 # U : 0000000000CCDDAA | 97 | srl $18,16,$4 # U : 0000000000CCDDAA |
98 | or $2,$22,$2 # E : 1st dst word complete | ||
75 | zap $19,0x3,$19 # U : <sign bits>bbaa0000 | 99 | zap $19,0x3,$19 # U : <sign bits>bbaa0000 |
76 | nop # E : L U U L | 100 | or $3,$23,$3 # E : U L U L : 2nd dst word complete |
77 | 101 | ||
78 | cmpult $20,$0,$0 # E : | 102 | cmpult $20,$0,$0 # E : |
79 | addq $20,$1,$20 # E : | 103 | addq $20,$1,$20 # E : |