diff options
Diffstat (limited to 'arch/ia64/kernel/fsys.S')
-rw-r--r-- | arch/ia64/kernel/fsys.S | 179 |
1 files changed, 88 insertions, 91 deletions
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 3f926c2dc708..44841971f077 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S | |||
@@ -147,12 +147,11 @@ ENTRY(fsys_set_tid_address) | |||
147 | FSYS_RETURN | 147 | FSYS_RETURN |
148 | END(fsys_set_tid_address) | 148 | END(fsys_set_tid_address) |
149 | 149 | ||
150 | /* | 150 | #if IA64_GTOD_LOCK_OFFSET !=0 |
151 | * Ensure that the time interpolator structure is compatible with the asm code | 151 | #error fsys_gettimeofday incompatible with changes to struct fsyscall_gtod_data_t |
152 | */ | 152 | #endif |
153 | #if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \ | 153 | #if IA64_ITC_JITTER_OFFSET !=0 |
154 | || IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4 | 154 | #error fsys_gettimeofday incompatible with changes to struct itc_jitter_data_t |
155 | #error fsys_gettimeofday incompatible with changes to struct time_interpolator | ||
156 | #endif | 155 | #endif |
157 | #define CLOCK_REALTIME 0 | 156 | #define CLOCK_REALTIME 0 |
158 | #define CLOCK_MONOTONIC 1 | 157 | #define CLOCK_MONOTONIC 1 |
@@ -179,126 +178,124 @@ ENTRY(fsys_gettimeofday) | |||
179 | // r11 = preserved: saved ar.pfs | 178 | // r11 = preserved: saved ar.pfs |
180 | // r12 = preserved: memory stack | 179 | // r12 = preserved: memory stack |
181 | // r13 = preserved: thread pointer | 180 | // r13 = preserved: thread pointer |
182 | // r14 = address of mask / mask | 181 | // r14 = address of mask / mask value |
183 | // r15 = preserved: system call number | 182 | // r15 = preserved: system call number |
184 | // r16 = preserved: current task pointer | 183 | // r16 = preserved: current task pointer |
185 | // r17 = wall to monotonic use | 184 | // r17 = (not used) |
186 | // r18 = time_interpolator->offset | 185 | // r18 = (not used) |
187 | // r19 = address of wall_to_monotonic | 186 | // r19 = address of itc_lastcycle |
188 | // r20 = pointer to struct time_interpolator / pointer to time_interpolator->address | 187 | // r20 = struct fsyscall_gtod_data (= address of gtod_lock.sequence) |
189 | // r21 = shift factor | 188 | // r21 = address of mmio_ptr |
190 | // r22 = address of time interpolator->last_counter | 189 | // r22 = address of wall_time or monotonic_time |
191 | // r23 = address of time_interpolator->last_cycle | 190 | // r23 = address of shift / value |
192 | // r24 = address of time_interpolator->offset | 191 | // r24 = address mult factor / cycle_last value |
193 | // r25 = last_cycle value | 192 | // r25 = itc_lastcycle value |
194 | // r26 = last_counter value | 193 | // r26 = address clocksource cycle_last |
195 | // r27 = pointer to xtime | 194 | // r27 = (not used) |
196 | // r28 = sequence number at the beginning of critical section | 195 | // r28 = sequence number at the beginning of critical section |
197 | // r29 = address of seqlock | 196 | // r29 = address of itc_jitter |
198 | // r30 = time processing flags / memory address | 197 | // r30 = time processing flags / memory address |
199 | // r31 = pointer to result | 198 | // r31 = pointer to result |
200 | // Predicates | 199 | // Predicates |
201 | // p6,p7 short term use | 200 | // p6,p7 short term use |
202 | // p8 = timesource ar.itc | 201 | // p8 = timesource ar.itc |
203 | // p9 = timesource mmio64 | 202 | // p9 = timesource mmio64 |
204 | // p10 = timesource mmio32 | 203 | // p10 = timesource mmio32 - not used |
205 | // p11 = timesource not to be handled by asm code | 204 | // p11 = timesource not to be handled by asm code |
206 | // p12 = memory time source ( = p9 | p10) | 205 | // p12 = memory time source ( = p9 | p10) - not used |
207 | // p13 = do cmpxchg with time_interpolator_last_cycle | 206 | // p13 = do cmpxchg with itc_lastcycle |
208 | // p14 = Divide by 1000 | 207 | // p14 = Divide by 1000 |
209 | // p15 = Add monotonic | 208 | // p15 = Add monotonic |
210 | // | 209 | // |
211 | // Note that instructions are optimized for McKinley. McKinley can process two | 210 | // Note that instructions are optimized for McKinley. McKinley can |
212 | // bundles simultaneously and therefore we continuously try to feed the CPU | 211 | // process two bundles simultaneously and therefore we continuously |
213 | // two bundles and then a stop. | 212 | // try to feed the CPU two bundles and then a stop. |
214 | tnat.nz p6,p0 = r31 // branch deferred since it does not fit into bundle structure | 213 | // |
214 | // Additional note that code has changed a lot. Optimization is TBD. | ||
215 | // Comments beginning with "?" may be outdated. | ||
216 | tnat.nz p6,p0 = r31 // ? branch deferred to fit later bundle | ||
215 | mov pr = r30,0xc000 // Set predicates according to function | 217 | mov pr = r30,0xc000 // Set predicates according to function |
216 | add r2 = TI_FLAGS+IA64_TASK_SIZE,r16 | 218 | add r2 = TI_FLAGS+IA64_TASK_SIZE,r16 |
217 | movl r20 = time_interpolator | 219 | movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address |
218 | ;; | 220 | ;; |
219 | ld8 r20 = [r20] // get pointer to time_interpolator structure | 221 | movl r29 = itc_jitter_data // itc_jitter |
220 | movl r29 = xtime_lock | 222 | add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20 // wall_time |
221 | ld4 r2 = [r2] // process work pending flags | 223 | ld4 r2 = [r2] // process work pending flags |
222 | movl r27 = xtime | 224 | ;; |
223 | ;; // only one bundle here | 225 | (p15) add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20 // monotonic_time |
224 | ld8 r21 = [r20] // first quad with control information | 226 | add r21 = IA64_CLKSRC_MMIO_OFFSET,r20 |
227 | add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29 | ||
225 | and r2 = TIF_ALLWORK_MASK,r2 | 228 | and r2 = TIF_ALLWORK_MASK,r2 |
226 | (p6) br.cond.spnt.few .fail_einval // deferred branch | 229 | (p6) br.cond.spnt.few .fail_einval // ? deferred branch |
227 | ;; | 230 | ;; |
228 | add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20 | 231 | add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last |
229 | extr r3 = r21,32,32 // time_interpolator->nsec_per_cyc | ||
230 | extr r8 = r21,0,16 // time_interpolator->source | ||
231 | cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled | 232 | cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled |
232 | (p6) br.cond.spnt.many fsys_fallback_syscall | 233 | (p6) br.cond.spnt.many fsys_fallback_syscall |
233 | ;; | 234 | ;; |
234 | cmp.eq p8,p12 = 0,r8 // Check for cpu timer | 235 | // Begin critical section |
235 | cmp.eq p9,p0 = 1,r8 // MMIO64 ? | 236 | .time_redo: |
236 | extr r2 = r21,24,8 // time_interpolator->jitter | 237 | ld4.acq r28 = [r20] // gtod_lock.sequence, Must take first |
237 | cmp.eq p10,p0 = 2,r8 // MMIO32 ? | 238 | ;; |
238 | cmp.ltu p11,p0 = 2,r8 // function or other clock | 239 | and r28 = ~1,r28 // And make sequence even to force retry if odd |
239 | (p11) br.cond.spnt.many fsys_fallback_syscall | ||
240 | ;; | 240 | ;; |
241 | setf.sig f7 = r3 // Setup for scaling of counter | 241 | ld8 r30 = [r21] // clocksource->mmio_ptr |
242 | (p15) movl r19 = wall_to_monotonic | 242 | add r24 = IA64_CLKSRC_MULT_OFFSET,r20 |
243 | (p12) ld8 r30 = [r10] | 243 | ld4 r2 = [r29] // itc_jitter value |
244 | cmp.ne p13,p0 = r2,r0 // need jitter compensation? | 244 | add r23 = IA64_CLKSRC_SHIFT_OFFSET,r20 |
245 | extr r21 = r21,16,8 // shift factor | 245 | add r14 = IA64_CLKSRC_MASK_OFFSET,r20 |
246 | ;; | 246 | ;; |
247 | .time_redo: | 247 | ld4 r3 = [r24] // clocksource mult value |
248 | .pred.rel.mutex p8,p9,p10 | 248 | ld8 r14 = [r14] // clocksource mask value |
249 | ld4.acq r28 = [r29] // xtime_lock.sequence. Must come first for locking purposes | 249 | cmp.eq p8,p9 = 0,r30 // use cpu timer if no mmio_ptr |
250 | ;; | 250 | ;; |
251 | and r28 = ~1,r28 // Make sequence even to force retry if odd | 251 | setf.sig f7 = r3 // Setup for mult scaling of counter |
252 | (p8) cmp.ne p13,p0 = r2,r0 // need itc_jitter compensation, set p13 | ||
253 | ld4 r23 = [r23] // clocksource shift value | ||
254 | ld8 r24 = [r26] // get clksrc_cycle_last value | ||
255 | (p9) cmp.eq p13,p0 = 0,r30 // if mmio_ptr, clear p13 jitter control | ||
252 | ;; | 256 | ;; |
257 | .pred.rel.mutex p8,p9 | ||
253 | (p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! | 258 | (p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! |
254 | add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20 | 259 | (p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues.. |
255 | (p9) ld8 r2 = [r30] // readq(ti->address). Could also have latency issues.. | 260 | (p13) ld8 r25 = [r19] // get itc_lastcycle value |
256 | (p10) ld4 r2 = [r30] // readw(ti->address) | 261 | ;; // ? could be removed by moving the last add upward |
257 | (p13) add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20 | 262 | ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec |
258 | ;; // could be removed by moving the last add upward | 263 | ;; |
259 | ld8 r26 = [r22] // time_interpolator->last_counter | 264 | ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET // tv_nsec |
260 | (p13) ld8 r25 = [r23] // time interpolator->last_cycle | 265 | (p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) |
261 | add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20 | 266 | ;; |
262 | (p15) ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET | 267 | (p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared |
263 | ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET | 268 | sub r10 = r2,r24 // current_cycle - last_cycle |
264 | add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20 | 269 | ;; |
265 | ;; | 270 | (p6) sub r10 = r25,r24 // time we got was less than last_cycle |
266 | ld8 r18 = [r24] // time_interpolator->offset | ||
267 | ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET // xtime.tv_nsec | ||
268 | (p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) | ||
269 | ;; | ||
270 | ld8 r14 = [r14] // time_interpolator->mask | ||
271 | (p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared | ||
272 | sub r10 = r2,r26 // current_counter - last_counter | ||
273 | ;; | ||
274 | (p6) sub r10 = r25,r26 // time we got was less than last_cycle | ||
275 | (p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg | 271 | (p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg |
276 | ;; | 272 | ;; |
273 | (p7) cmpxchg8.rel r3 = [r19],r2,ar.ccv | ||
274 | ;; | ||
275 | (p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful | ||
276 | ;; | ||
277 | (p7) sub r10 = r3,r24 // then use new last_cycle instead | ||
278 | ;; | ||
277 | and r10 = r10,r14 // Apply mask | 279 | and r10 = r10,r14 // Apply mask |
278 | ;; | 280 | ;; |
279 | setf.sig f8 = r10 | 281 | setf.sig f8 = r10 |
280 | nop.i 123 | 282 | nop.i 123 |
281 | ;; | 283 | ;; |
282 | (p7) cmpxchg8.rel r3 = [r23],r2,ar.ccv | 284 | // fault check takes 5 cycles and we have spare time |
283 | EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare time | 285 | EX(.fail_efault, probe.w.fault r31, 3) |
284 | xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter) | 286 | xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter) |
285 | (p15) add r9 = r9,r17 // Add wall to monotonic.secs to result secs | ||
286 | ;; | 287 | ;; |
287 | (p15) ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET | 288 | // ? simulate tbit.nz.or p7,p0 = r28,0 |
288 | (p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful redo | ||
289 | // simulate tbit.nz.or p7,p0 = r28,0 | ||
290 | getf.sig r2 = f8 | 289 | getf.sig r2 = f8 |
291 | mf | 290 | mf |
292 | add r8 = r8,r18 // Add time interpolator offset | ||
293 | ;; | 291 | ;; |
294 | ld4 r10 = [r29] // xtime_lock.sequence | 292 | ld4 r10 = [r20] // gtod_lock.sequence |
295 | (p15) add r8 = r8, r17 // Add monotonic.nsecs to nsecs | 293 | shr.u r2 = r2,r23 // shift by factor |
296 | shr.u r2 = r2,r21 | 294 | ;; // ? overloaded 3 bundles! |
297 | ;; // overloaded 3 bundles! | ||
298 | // End critical section. | ||
299 | add r8 = r8,r2 // Add xtime.nsecs | 295 | add r8 = r8,r2 // Add xtime.nsecs |
300 | cmp4.ne.or p7,p0 = r28,r10 | 296 | cmp4.ne p7,p0 = r28,r10 |
301 | (p7) br.cond.dpnt.few .time_redo // sequence number changed ? | 297 | (p7) br.cond.dpnt.few .time_redo // sequence number changed, redo |
298 | // End critical section. | ||
302 | // Now r8=tv->tv_nsec and r9=tv->tv_sec | 299 | // Now r8=tv->tv_nsec and r9=tv->tv_sec |
303 | mov r10 = r0 | 300 | mov r10 = r0 |
304 | movl r2 = 1000000000 | 301 | movl r2 = 1000000000 |
@@ -308,19 +305,19 @@ EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare | |||
308 | .time_normalize: | 305 | .time_normalize: |
309 | mov r21 = r8 | 306 | mov r21 = r8 |
310 | cmp.ge p6,p0 = r8,r2 | 307 | cmp.ge p6,p0 = r8,r2 |
311 | (p14) shr.u r20 = r8, 3 // We can repeat this if necessary just wasting some time | 308 | (p14) shr.u r20 = r8, 3 // We can repeat this if necessary just wasting time |
312 | ;; | 309 | ;; |
313 | (p14) setf.sig f8 = r20 | 310 | (p14) setf.sig f8 = r20 |
314 | (p6) sub r8 = r8,r2 | 311 | (p6) sub r8 = r8,r2 |
315 | (p6) add r9 = 1,r9 // two nops before the branch. | 312 | (p6) add r9 = 1,r9 // two nops before the branch. |
316 | (p14) setf.sig f7 = r3 // Chances for repeats are 1 in 10000 for gettod | 313 | (p14) setf.sig f7 = r3 // Chances for repeats are 1 in 10000 for gettod |
317 | (p6) br.cond.dpnt.few .time_normalize | 314 | (p6) br.cond.dpnt.few .time_normalize |
318 | ;; | 315 | ;; |
319 | // Divided by 8 through shift. Now divide by 125 | 316 | // Divided by 8 through shift. Now divide by 125 |
320 | // The compiler was able to do that with a multiply | 317 | // The compiler was able to do that with a multiply |
321 | // and a shift and we do the same | 318 | // and a shift and we do the same |
322 | EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles | 319 | EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles |
323 | (p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it... | 320 | (p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it |
324 | ;; | 321 | ;; |
325 | mov r8 = r0 | 322 | mov r8 = r0 |
326 | (p14) getf.sig r2 = f8 | 323 | (p14) getf.sig r2 = f8 |