diff options
Diffstat (limited to 'arch/alpha/lib/ev6-copy_page.S')
-rw-r--r-- | arch/alpha/lib/ev6-copy_page.S | 203 |
1 files changed, 203 insertions, 0 deletions
diff --git a/arch/alpha/lib/ev6-copy_page.S b/arch/alpha/lib/ev6-copy_page.S new file mode 100644 index 000000000000..b789db192754 --- /dev/null +++ b/arch/alpha/lib/ev6-copy_page.S | |||
@@ -0,0 +1,203 @@ | |||
1 | /* | ||
2 | * arch/alpha/lib/ev6-copy_page.S | ||
3 | * | ||
4 | * Copy an entire page. | ||
5 | */ | ||
6 | |||
7 | /* The following comparison of this routine vs the normal copy_page.S | ||
8 | was written by an unnamed ev6 hardware designer and forwarded to me | ||
9 | via Steven Hobbs <hobbs@steven.zko.dec.com>. | ||
10 | |||
11 | First Problem: STQ overflows. | ||
12 | ----------------------------- | ||
13 | |||
14 | It would be nice if EV6 handled every resource overflow efficiently, | ||
15 | but for some it doesn't. Including store queue overflows. It causes | ||
16 | a trap and a restart of the pipe. | ||
17 | |||
18 | To get around this we sometimes use (to borrow a term from a VSSAD | ||
19 | researcher) "aeration". The idea is to slow the rate at which the | ||
20 | processor receives valid instructions by inserting nops in the fetch | ||
21 | path. In doing so, you can prevent the overflow and actually make | ||
22 | the code run faster. You can, of course, take advantage of the fact | ||
23 | that the processor can fetch at most 4 aligned instructions per cycle. | ||
24 | |||
25 | I inserted enough nops to force it to take 10 cycles to fetch the | ||
26 | loop code. In theory, EV6 should be able to execute this loop in | ||
27 | 9 cycles but I was not able to get it to run that fast -- the initial | ||
28 | conditions were such that I could not reach this optimum rate on | ||
29 | (chaotic) EV6. I wrote the code such that everything would issue | ||
30 | in order. | ||
31 | |||
32 | Second Problem: Dcache index matches. | ||
33 | ------------------------------------- | ||
34 | |||
35 | If you are going to use this routine on random aligned pages, there | ||
36 | is a 25% chance that the pages will be at the same dcache indices. | ||
37 | This results in many nasty memory traps without care. | ||
38 | |||
39 | The solution is to schedule the prefetches to avoid the memory | ||
40 | conflicts. I schedule the wh64 prefetches farther ahead of the | ||
41 | read prefetches to avoid this problem. | ||
42 | |||
43 | Third Problem: Needs more prefetching. | ||
44 | -------------------------------------- | ||
45 | |||
46 | In order to improve the code I added deeper prefetching to take the | ||
47 | most advantage of EV6's bandwidth. | ||
48 | |||
49 | I also prefetched the read stream. Note that adding the read prefetch | ||
50 | forced me to add another cycle to the inner-most kernel - up to 11 | ||
51 | from the original 8 cycles per iteration. We could improve performance | ||
52 | further by unrolling the loop and doing multiple prefetches per cycle. | ||
53 | |||
54 | I think that the code below will be very robust and fast code for the | ||
55 | purposes of copying aligned pages. It is slower when both source and | ||
56 | destination pages are in the dcache, but it is my guess that this is | ||
57 | less important than the dcache miss case. */ | ||
58 | |||
59 | |||
60 | .text | ||
61 | .align 4 | ||
62 | .global copy_page | ||
63 | .ent copy_page | ||
64 | copy_page: | ||
65 | .prologue 0 | ||
66 | |||
67 | /* Prefetch 5 read cachelines; write-hint 10 cache lines. */ | ||
68 | wh64 ($16) | ||
69 | ldl $31,0($17) | ||
70 | ldl $31,64($17) | ||
71 | lda $1,1*64($16) | ||
72 | |||
73 | wh64 ($1) | ||
74 | ldl $31,128($17) | ||
75 | ldl $31,192($17) | ||
76 | lda $1,2*64($16) | ||
77 | |||
78 | wh64 ($1) | ||
79 | ldl $31,256($17) | ||
80 | lda $18,118 | ||
81 | lda $1,3*64($16) | ||
82 | |||
83 | wh64 ($1) | ||
84 | nop | ||
85 | lda $1,4*64($16) | ||
86 | lda $2,5*64($16) | ||
87 | |||
88 | wh64 ($1) | ||
89 | wh64 ($2) | ||
90 | lda $1,6*64($16) | ||
91 | lda $2,7*64($16) | ||
92 | |||
93 | wh64 ($1) | ||
94 | wh64 ($2) | ||
95 | lda $1,8*64($16) | ||
96 | lda $2,9*64($16) | ||
97 | |||
98 | wh64 ($1) | ||
99 | wh64 ($2) | ||
100 | lda $19,10*64($16) | ||
101 | nop | ||
102 | |||
103 | /* Main prefetching/write-hinting loop. */ | ||
104 | 1: ldq $0,0($17) | ||
105 | ldq $1,8($17) | ||
106 | unop | ||
107 | unop | ||
108 | |||
109 | unop | ||
110 | unop | ||
111 | ldq $2,16($17) | ||
112 | ldq $3,24($17) | ||
113 | |||
114 | ldq $4,32($17) | ||
115 | ldq $5,40($17) | ||
116 | unop | ||
117 | unop | ||
118 | |||
119 | unop | ||
120 | unop | ||
121 | ldq $6,48($17) | ||
122 | ldq $7,56($17) | ||
123 | |||
124 | ldl $31,320($17) | ||
125 | unop | ||
126 | unop | ||
127 | unop | ||
128 | |||
129 | /* This gives the extra cycle of aeration above the minimum. */ | ||
130 | unop | ||
131 | unop | ||
132 | unop | ||
133 | unop | ||
134 | |||
135 | wh64 ($19) | ||
136 | unop | ||
137 | unop | ||
138 | unop | ||
139 | |||
140 | stq $0,0($16) | ||
141 | subq $18,1,$18 | ||
142 | stq $1,8($16) | ||
143 | unop | ||
144 | |||
145 | unop | ||
146 | stq $2,16($16) | ||
147 | addq $17,64,$17 | ||
148 | stq $3,24($16) | ||
149 | |||
150 | stq $4,32($16) | ||
151 | stq $5,40($16) | ||
152 | addq $19,64,$19 | ||
153 | unop | ||
154 | |||
155 | stq $6,48($16) | ||
156 | stq $7,56($16) | ||
157 | addq $16,64,$16 | ||
158 | bne $18, 1b | ||
159 | |||
160 | /* Prefetch the final 5 cache lines of the read stream. */ | ||
161 | lda $18,10 | ||
162 | ldl $31,320($17) | ||
163 | ldl $31,384($17) | ||
164 | ldl $31,448($17) | ||
165 | |||
166 | ldl $31,512($17) | ||
167 | ldl $31,576($17) | ||
168 | nop | ||
169 | nop | ||
170 | |||
171 | /* Non-prefetching, non-write-hinting cleanup loop for the | ||
172 | final 10 cache lines. */ | ||
173 | 2: ldq $0,0($17) | ||
174 | ldq $1,8($17) | ||
175 | ldq $2,16($17) | ||
176 | ldq $3,24($17) | ||
177 | |||
178 | ldq $4,32($17) | ||
179 | ldq $5,40($17) | ||
180 | ldq $6,48($17) | ||
181 | ldq $7,56($17) | ||
182 | |||
183 | stq $0,0($16) | ||
184 | subq $18,1,$18 | ||
185 | stq $1,8($16) | ||
186 | addq $17,64,$17 | ||
187 | |||
188 | stq $2,16($16) | ||
189 | stq $3,24($16) | ||
190 | stq $4,32($16) | ||
191 | stq $5,40($16) | ||
192 | |||
193 | stq $6,48($16) | ||
194 | stq $7,56($16) | ||
195 | addq $16,64,$16 | ||
196 | bne $18, 2b | ||
197 | |||
198 | ret | ||
199 | nop | ||
200 | unop | ||
201 | nop | ||
202 | |||
203 | .end copy_page | ||