diff options
Diffstat (limited to 'arch/ia64/lib/clear_user.S')
-rw-r--r-- | arch/ia64/lib/clear_user.S | 209 |
1 files changed, 209 insertions, 0 deletions
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S new file mode 100644 index 000000000000..eecd8577b209 --- /dev/null +++ b/arch/ia64/lib/clear_user.S | |||
@@ -0,0 +1,209 @@ | |||
1 | /* | ||
2 | * This routine clears to zero a linear memory buffer in user space. | ||
3 | * | ||
4 | * Inputs: | ||
5 | * in0: address of buffer | ||
6 | * in1: length of buffer in bytes | ||
7 | * Outputs: | ||
8 | * r8: number of bytes that didn't get cleared due to a fault | ||
9 | * | ||
10 | * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co | ||
11 | * Stephane Eranian <eranian@hpl.hp.com> | ||
12 | */ | ||
13 | |||
14 | #include <asm/asmmacro.h> | ||
15 | |||
16 | // | ||
17 | // arguments: in0 (buffer address) and in1 (length in bytes) of the routine | ||
18 | // | ||
19 | #define buf r32 | ||
20 | #define len r33 | ||
21 | |||
22 | // | ||
23 | // local registers: saved_lc/saved_pfs hold the caller's ar.lc and ar.pfs | ||
24 | // | ||
25 | #define cnt r16 | ||
26 | #define buf2 r17 | ||
27 | #define saved_lc r18 | ||
28 | #define saved_pfs r19 | ||
29 | #define tmp r20 | ||
30 | #define len2 r21 | ||
31 | #define len3 r22 | ||
32 | |||
33 | // | ||
34 | // Theory of operations: | ||
35 | // - we check whether or not the buffer is small, i.e., less than 17 | ||
36 | // in which case we do the byte by byte loop. | ||
37 | // | ||
38 | // - Otherwise we go progressively from 1 byte store to 8byte store in | ||
39 | // the head part, the body is a 16byte store loop and we finish with the | ||
40 | // tail for the last 15 bytes. | ||
41 | // The good point about this breakdown is that the long buffer handling | ||
42 | // contains only 2 branches. | ||
43 | // | ||
44 | // The reason for not using shifting & masking for both the head and the | ||
45 | // tail is to stay semantically correct. This routine is not supposed | ||
46 | // to write bytes outside of the buffer. While most of the time this would | ||
47 | // be ok, we can't tolerate a mistake. A classical example is the case | ||
48 | // of multithreaded code where the extra bytes touched are actually owned | ||
49 | // by another thread which runs concurrently to ours. Another, less likely, | ||
50 | // example is with device drivers where reading an I/O mapped location may | ||
51 | // have side effects (same thing for writing). | ||
52 | // | ||
53 | |||
//
// __do_clear_user: zero `len` bytes of user memory starting at `buf`.
//
// On return ret0 holds the number of bytes that were NOT cleared
// (0 on success).  Every store to user memory is wrapped in EX()
// (from <asm/asmmacro.h>) naming the .Lexit label at which execution
// continues if that store faults.
//
// ar.lc is written in the entry sequence *before* the short/long
// decision, so every path that gets past the zero-length check must
// put saved_lc back into ar.lc before returning.
//
GLOBAL_ENTRY(__do_clear_user)
	.prologue
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,2,0,0,0
	cmp.eq p6,p0=r0,len		// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc		// preserve ar.lc (slow)
	.body
	;;				// avoid WAW on CFM
	adds tmp=-1,len			// br.cloop is repeat/until
	mov ret0=len			// return value is length at this point
(p6)	br.ret.spnt.many rp		// len == 0: ar.lc not touched yet
	;;
	cmp.lt p6,p0=16,len		// if len > 16 then long memset
	mov ar.lc=tmp			// initialize lc for small count
					// (also executed when the branch below
					//  is taken: long path owns a dirty lc)
(p6)	br.cond.dptk .long_do_clear
	;;				// WAR on ar.lc
	//
	// worst case 16 iterations, avg 8 iterations
	//
	// We could have played with the predicates to use the extra
	// M slot for 2 stores/iteration but the cost of initializing
	// the various counters compared to how long the loop is supposed
	// to last on average does not make this solution viable.
	//
1:
	EX( .Lexit1, st1 [buf]=r0,1 )
	adds len=-1,len			// countdown length using len
	br.cloop.dptk 1b
	;;				// avoid RAW on ar.lc
	//
	// .Lexit1: comes from byte by byte loop
	//	    len contains bytes left
.Lexit1:
	mov ret0=len			// faster than using ar.lc
	mov ar.lc=saved_lc
	br.ret.sptk.many rp		// end of short clear_user


	//
	// At this point we know we have more than 16 bytes to clear
	// so we focus on alignment (no branches required)
	//
	// The use of len/len2 for countdown of the number of bytes left
	// instead of ret0 is due to the fact that the exception code
	// changes the values of r8.
	//
.long_do_clear:
	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
	;;
	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
(p6)	adds len=-1,len;;		// sync because buf is modified
	tbit.nz p6,p0=buf,1
	;;
	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
(p6)	adds len=-2,len;;
	tbit.nz p6,p0=buf,2
	;;
	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
(p6)	adds len=-4,len;;
	tbit.nz p6,p0=buf,3
	;;
	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
(p6)	adds len=-8,len;;
	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
	;;
	cmp.eq p6,p0=r0,cnt
	adds tmp=-1,cnt
(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
					// (ar.lc still holds the entry value,
					//  .dotail restores it)
	;;
	adds buf2=8,buf			// setup second base pointer
	mov ar.lc=tmp
	;;

	//
	// 16bytes/iteration core loop
	//
	// The second store can never generate a fault because
	// we come into the loop only when we are 16-byte aligned.
	// This means that if we cross a page then it will always be
	// in the first store and never in the second.
	//
	//
	// We need to keep track of the remaining length. A possible (optimistic)
	// way would be to use ar.lc and derive how many bytes were left by
	// doing : left= 16*ar.lc + 16. this would avoid the addition at
	// every iteration.
	// However we need to keep the synchronization point. A template
	// M;;MB does not exist and thus we can keep the addition at no
	// extra cycle cost (use a nop slot anyway). It also simplifies the
	// (unlikely) error recovery code
	//

2:	EX(.Lexit3, st8 [buf]=r0,16 )
	;;				// needed to get len correct when error
	st8 [buf2]=r0,16
	adds len=-16,len
	br.cloop.dptk 2b
	;;				// avoid RAW on ar.lc / len
	//
	// tail correction based on len only
	//
	// Reached both from the core loop and directly from the head when
	// fewer than 16 bytes remain.  In both cases ar.lc no longer holds
	// the caller's value, so it is restored once, at the end of the tail.
	//
	// We alternate the use of len3,len2 to allow parallelism and correct
	// error handling. We also reuse p6/p7 to return correct value.
	// The addition of len2/len3 does not cost anything more compared to
	// the regular memset as we had empty slots.
	//
	// NOTE(review): the len2/len3 hand-off is exact only when the first
	// store executed in the tail is the one that faults.  buf is 16-byte
	// aligned here, so the whole tail sits in one aligned 16-byte chunk;
	// presumably that is why a later store is not expected to fault.
	//
.dotail:
	mov len2=len			// for parallelization of error handling
	mov len3=len
	tbit.nz p6,p7=len,3		// p7 = !p6: keeps p6/p7 mutually
					// exclusive if the st8 below faults
	;;
	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
(p6)	adds len3=-8,len2
	tbit.nz p7,p6=len,2
	;;
	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
(p7)	adds len2=-4,len3
	tbit.nz p6,p7=len,1
	;;
	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
(p6)	adds len3=-2,len2
	tbit.nz p7,p6=len,0
	;;
	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
	mov ar.lc=saved_lc			// lc was clobbered on entry
	mov ret0=r0				// success
	br.ret.sptk.many rp			// end of most likely path

	//
	// Outlined error handling code
	//

	//
	// .Lexit2: comes from the tail
	//	if p6 -> coming from st8 or st2 : len2 contains what's left
	//	if p7 -> coming from st4 or st1 : len3 contains what's left
	//	Falls through to .Lexit3, which restores ar.lc.
	//
.Lexit2:
	.pred.rel "mutex", p6, p7
(p6)	mov len=len2
(p7)	mov len=len3
	;;
	//
	// .Lexit3: comes from the head or the core loop
	//	    len contains bytes left
	//	    ar.lc was written on entry, so it must be restored here
	//
.Lexit3:
	mov ret0=len
	mov ar.lc=saved_lc
	br.ret.sptk.many rp
END(__do_clear_user)