about | summary | refs | log | tree | commit | diff | stats
path: root/arch/alpha/lib
diff options
context:
space:
mode:
author: Ivan Kokshaysky <ink@jurassic.park.msu.ru> 2007-06-23 20:16:35 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-06-24 11:59:11 -0400
commit: 58ed2f9c75b719da4f494f24ed2d56d45f5b4959 (patch)
tree: 6ecb60cf2d7b18da36fd29f2a6bd03d15a8cfefb /arch/alpha/lib
parent: 653d4876b730fedca8473481863cf700245e3582 (diff)
alpha: fix alignment problem in csum_ipv6_magic()
Hopefully this fixes http://bugzilla.kernel.org/show_bug.cgi?id=8635

The struct in6_addr passed to csum_ipv6_magic() is 4 byte aligned, so we can't use the regular 64-bit loads. Since the cost of handling of 4 byte and 1 byte aligned 64-bit data is roughly the same, this code can cope with any src/dst [mis]alignment.

Signed-off-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Dustin Marquess <jailbird@alcatraz.fdf.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/alpha/lib')
-rw-r--r--  arch/alpha/lib/csum_ipv6_magic.S      51
-rw-r--r--  arch/alpha/lib/ev6-csum_ipv6_magic.S  42
2 files changed, 70 insertions, 23 deletions
diff --git a/arch/alpha/lib/csum_ipv6_magic.S b/arch/alpha/lib/csum_ipv6_magic.S
index e09748dbf2ed..2c2acb96deb6 100644
--- a/arch/alpha/lib/csum_ipv6_magic.S
+++ b/arch/alpha/lib/csum_ipv6_magic.S
@@ -7,6 +7,9 @@
7 * __u32 len, 7 * __u32 len,
8 * unsigned short proto, 8 * unsigned short proto,
9 * unsigned int csum); 9 * unsigned int csum);
10 *
11 * Misalignment handling (which costs 16 instructions / 8 cycles)
12 * added by Ivan Kokshaysky <ink@jurassic.park.msu.ru>
10 */ 13 */
11 14
12 .globl csum_ipv6_magic 15 .globl csum_ipv6_magic
@@ -16,37 +19,57 @@
16csum_ipv6_magic: 19csum_ipv6_magic:
17 .prologue 0 20 .prologue 0
18 21
19 ldq $0,0($16) # e0 : load src & dst addr words 22 ldq_u $0,0($16) # e0 : load src & dst addr words
20 zapnot $20,15,$20 # .. e1 : zero extend incoming csum 23 zapnot $20,15,$20 # .. e1 : zero extend incoming csum
21 extqh $18,1,$4 # e0 : byte swap len & proto while we wait 24 extqh $18,1,$4 # e0 : byte swap len & proto while we wait
22 ldq $1,8($16) # .. e1 : 25 ldq_u $21,7($16) # .. e1 : handle misalignment
23 26
24 extbl $18,1,$5 # e0 : 27 extbl $18,1,$5 # e0 :
25 ldq $2,0($17) # .. e1 : 28 ldq_u $1,8($16) # .. e1 :
26 extbl $18,2,$6 # e0 : 29 extbl $18,2,$6 # e0 :
27 ldq $3,8($17) # .. e1 : 30 ldq_u $22,15($16) # .. e1 :
28 31
29 extbl $18,3,$18 # e0 : 32 extbl $18,3,$18 # e0 :
33 ldq_u $2,0($17) # .. e1 :
30 sra $4,32,$4 # e0 : 34 sra $4,32,$4 # e0 :
35 ldq_u $23,7($17) # .. e1 :
36
37 extql $0,$16,$0 # e0 :
38 ldq_u $3,8($17) # .. e1 :
39 extqh $21,$16,$21 # e0 :
40 ldq_u $24,15($17) # .. e1 :
41
31 sll $5,16,$5 # e0 : 42 sll $5,16,$5 # e0 :
43 or $0,$21,$0 # .. e1 : 1st src word complete
44 extql $1,$16,$1 # e0 :
32 addq $20,$0,$20 # .. e1 : begin summing the words 45 addq $20,$0,$20 # .. e1 : begin summing the words
33 46
34 sll $6,8,$6 # e0 : 47 extqh $22,$16,$22 # e0 :
35 cmpult $20,$0,$0 # .. e1 : 48 cmpult $20,$0,$0 # .. e1 :
36 extwh $19,7,$7 # e0 : 49 sll $6,8,$6 # e0 :
37 or $4,$18,$18 # .. e1 : 50 or $1,$22,$1 # .. e1 : 2nd src word complete
38 51
39 extbl $19,1,$19 # e0 : 52 extql $2,$17,$2 # e0 :
53 or $4,$18,$18 # .. e1 :
54 extqh $23,$17,$23 # e0 :
40 or $5,$6,$5 # .. e1 : 55 or $5,$6,$5 # .. e1 :
41 or $18,$5,$18 # e0 : len complete
42 or $19,$7,$19 # .. e1 :
43 56
44 sll $19,48,$19 # e0 : 57 extql $3,$17,$3 # e0 :
58 or $2,$23,$2 # .. e1 : 1st dst word complete
59 extqh $24,$17,$24 # e0 :
60 or $18,$5,$18 # .. e1 : len complete
61
62 extwh $19,7,$7 # e0 :
63 or $3,$24,$3 # .. e1 : 2nd dst word complete
64 extbl $19,1,$19 # e0 :
45 addq $20,$1,$20 # .. e1 : 65 addq $20,$1,$20 # .. e1 :
46 sra $19,32,$19 # e0 : proto complete 66
67 or $19,$7,$19 # e0 :
47 cmpult $20,$1,$1 # .. e1 : 68 cmpult $20,$1,$1 # .. e1 :
69 sll $19,48,$19 # e0 :
70 nop # .. e0 :
48 71
49 nop # e0 : 72 sra $19,32,$19 # e0 : proto complete
50 addq $20,$2,$20 # .. e1 : 73 addq $20,$2,$20 # .. e1 :
51 cmpult $20,$2,$2 # e0 : 74 cmpult $20,$2,$2 # e0 :
52 addq $20,$3,$20 # .. e1 : 75 addq $20,$3,$20 # .. e1 :
@@ -84,7 +107,7 @@ csum_ipv6_magic:
84 extwl $0,2,$1 # e0 : fold 17-bit value 107 extwl $0,2,$1 # e0 : fold 17-bit value
85 zapnot $0,3,$0 # .. e1 : 108 zapnot $0,3,$0 # .. e1 :
86 addq $0,$1,$0 # e0 : 109 addq $0,$1,$0 # e0 :
87 not $0,$0 # e1 : and complement. 110 not $0,$0 # .. e1 : and complement.
88 111
89 zapnot $0,3,$0 # e0 : 112 zapnot $0,3,$0 # e0 :
90 ret # .. e1 : 113 ret # .. e1 :
diff --git a/arch/alpha/lib/ev6-csum_ipv6_magic.S b/arch/alpha/lib/ev6-csum_ipv6_magic.S
index de1948a69118..fc0bc399f872 100644
--- a/arch/alpha/lib/ev6-csum_ipv6_magic.S
+++ b/arch/alpha/lib/ev6-csum_ipv6_magic.S
@@ -46,6 +46,10 @@
46 * add the 3 low ushorts together, generating a uint 46 * add the 3 low ushorts together, generating a uint
47 * a final add of the 2 lower ushorts 47 * a final add of the 2 lower ushorts
48 * truncating the result. 48 * truncating the result.
49 *
50 * Misalignment handling added by Ivan Kokshaysky <ink@jurassic.park.msu.ru>
51 * The cost is 16 instructions (~8 cycles), including two extra loads which
52 * may cause additional delay in rare cases (load-load replay traps).
49 */ 53 */
50 54
51 .globl csum_ipv6_magic 55 .globl csum_ipv6_magic
@@ -55,25 +59,45 @@
55csum_ipv6_magic: 59csum_ipv6_magic:
56 .prologue 0 60 .prologue 0
57 61
58 ldq $0,0($16) # L : Latency: 3 62 ldq_u $0,0($16) # L : Latency: 3
59 inslh $18,7,$4 # U : 0000000000AABBCC 63 inslh $18,7,$4 # U : 0000000000AABBCC
60 ldq $1,8($16) # L : Latency: 3 64 ldq_u $1,8($16) # L : Latency: 3
61 sll $19,8,$7 # U : U L U L : 0x00000000 00aabb00 65 sll $19,8,$7 # U : U L U L : 0x00000000 00aabb00
62 66
67 and $16,7,$6 # E : src misalignment
68 ldq_u $5,15($16) # L : Latency: 3
63 zapnot $20,15,$20 # U : zero extend incoming csum 69 zapnot $20,15,$20 # U : zero extend incoming csum
64 ldq $2,0($17) # L : Latency: 3 70 ldq_u $2,0($17) # L : U L U L : Latency: 3
65 sll $19,24,$19 # U : U L L U : 0x000000aa bb000000 71
72 extql $0,$6,$0 # U :
73 extqh $1,$6,$22 # U :
74 ldq_u $3,8($17) # L : Latency: 3
75 sll $19,24,$19 # U : U U L U : 0x000000aa bb000000
76
77 cmoveq $6,$31,$22 # E : src aligned?
78 ldq_u $23,15($17) # L : Latency: 3
66 inswl $18,3,$18 # U : 000000CCDD000000 79 inswl $18,3,$18 # U : 000000CCDD000000
80 addl $19,$7,$19 # E : U L U L : <sign bits>bbaabb00
67 81
68 ldq $3,8($17) # L : Latency: 3 82 or $0,$22,$0 # E : 1st src word complete
69 bis $18,$4,$18 # E : 000000CCDDAABBCC 83 extql $1,$6,$1 # U :
70 addl $19,$7,$19 # E : <sign bits>bbaabb00 84 or $18,$4,$18 # E : 000000CCDDAABBCC
71 nop # E : U L U L 85 extqh $5,$6,$5 # U : L U L U
72 86
87 and $17,7,$6 # E : dst misalignment
88 extql $2,$6,$2 # U :
89 or $1,$5,$1 # E : 2nd src word complete
90 extqh $3,$6,$22 # U : L U L U :
91
92 cmoveq $6,$31,$22 # E : dst aligned?
93 extql $3,$6,$3 # U :
73 addq $20,$0,$20 # E : begin summing the words 94 addq $20,$0,$20 # E : begin summing the words
95 extqh $23,$6,$23 # U : L U L U :
96
74 srl $18,16,$4 # U : 0000000000CCDDAA 97 srl $18,16,$4 # U : 0000000000CCDDAA
98 or $2,$22,$2 # E : 1st dst word complete
75 zap $19,0x3,$19 # U : <sign bits>bbaa0000 99 zap $19,0x3,$19 # U : <sign bits>bbaa0000
76 nop # E : L U U L 100 or $3,$23,$3 # E : U L U L : 2nd dst word complete
77 101
78 cmpult $20,$0,$0 # E : 102 cmpult $20,$0,$0 # E :
79 addq $20,$1,$20 # E : 103 addq $20,$1,$20 # E :