aboutsummaryrefslogtreecommitdiffstats
path: root/arch/ia64/kernel/fsys.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/ia64/kernel/fsys.S')
-rw-r--r--arch/ia64/kernel/fsys.S179
1 files changed, 88 insertions, 91 deletions
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index 3f926c2dc708..44841971f077 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -147,12 +147,11 @@ ENTRY(fsys_set_tid_address)
147 FSYS_RETURN 147 FSYS_RETURN
148END(fsys_set_tid_address) 148END(fsys_set_tid_address)
149 149
150/* 150#if IA64_GTOD_LOCK_OFFSET !=0
151 * Ensure that the time interpolator structure is compatible with the asm code 151#error fsys_gettimeofday incompatible with changes to struct fsyscall_gtod_data_t
152 */ 152#endif
153#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \ 153#if IA64_ITC_JITTER_OFFSET !=0
154 || IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4 154#error fsys_gettimeofday incompatible with changes to struct itc_jitter_data_t
155#error fsys_gettimeofday incompatible with changes to struct time_interpolator
156#endif 155#endif
157#define CLOCK_REALTIME 0 156#define CLOCK_REALTIME 0
158#define CLOCK_MONOTONIC 1 157#define CLOCK_MONOTONIC 1
@@ -179,126 +178,124 @@ ENTRY(fsys_gettimeofday)
179 // r11 = preserved: saved ar.pfs 178 // r11 = preserved: saved ar.pfs
180 // r12 = preserved: memory stack 179 // r12 = preserved: memory stack
181 // r13 = preserved: thread pointer 180 // r13 = preserved: thread pointer
182 // r14 = address of mask / mask 181 // r14 = address of mask / mask value
183 // r15 = preserved: system call number 182 // r15 = preserved: system call number
184 // r16 = preserved: current task pointer 183 // r16 = preserved: current task pointer
185 // r17 = wall to monotonic use 184 // r17 = (not used)
186 // r18 = time_interpolator->offset 185 // r18 = (not used)
187 // r19 = address of wall_to_monotonic 186 // r19 = address of itc_lastcycle
188 // r20 = pointer to struct time_interpolator / pointer to time_interpolator->address 187 // r20 = struct fsyscall_gtod_data (= address of gtod_lock.sequence)
189 // r21 = shift factor 188 // r21 = address of mmio_ptr
190 // r22 = address of time interpolator->last_counter 189 // r22 = address of wall_time or monotonic_time
191 // r23 = address of time_interpolator->last_cycle 190 // r23 = address of shift / value
192 // r24 = adress of time_interpolator->offset 191 // r24 = address mult factor / cycle_last value
193 // r25 = last_cycle value 192 // r25 = itc_lastcycle value
194 // r26 = last_counter value 193 // r26 = address clocksource cycle_last
195 // r27 = pointer to xtime 194 // r27 = (not used)
196 // r28 = sequence number at the beginning of critcal section 195 // r28 = sequence number at the beginning of critcal section
197 // r29 = address of seqlock 196 // r29 = address of itc_jitter
198 // r30 = time processing flags / memory address 197 // r30 = time processing flags / memory address
199 // r31 = pointer to result 198 // r31 = pointer to result
200 // Predicates 199 // Predicates
201 // p6,p7 short term use 200 // p6,p7 short term use
202 // p8 = timesource ar.itc 201 // p8 = timesource ar.itc
203 // p9 = timesource mmio64 202 // p9 = timesource mmio64
204 // p10 = timesource mmio32 203 // p10 = timesource mmio32 - not used
205 // p11 = timesource not to be handled by asm code 204 // p11 = timesource not to be handled by asm code
206 // p12 = memory time source ( = p9 | p10) 205 // p12 = memory time source ( = p9 | p10) - not used
207 // p13 = do cmpxchg with time_interpolator_last_cycle 206 // p13 = do cmpxchg with itc_lastcycle
208 // p14 = Divide by 1000 207 // p14 = Divide by 1000
209 // p15 = Add monotonic 208 // p15 = Add monotonic
210 // 209 //
211 // Note that instructions are optimized for McKinley. McKinley can process two 210 // Note that instructions are optimized for McKinley. McKinley can
212 // bundles simultaneously and therefore we continuously try to feed the CPU 211 // process two bundles simultaneously and therefore we continuously
213 // two bundles and then a stop. 212 // try to feed the CPU two bundles and then a stop.
214 tnat.nz p6,p0 = r31 // branch deferred since it does not fit into bundle structure 213 //
214 // Additional note that code has changed a lot. Optimization is TBD.
215 // Comments begin with "?" are maybe outdated.
216 tnat.nz p6,p0 = r31 // ? branch deferred to fit later bundle
215 mov pr = r30,0xc000 // Set predicates according to function 217 mov pr = r30,0xc000 // Set predicates according to function
216 add r2 = TI_FLAGS+IA64_TASK_SIZE,r16 218 add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
217 movl r20 = time_interpolator 219 movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address
218 ;; 220 ;;
219 ld8 r20 = [r20] // get pointer to time_interpolator structure 221 movl r29 = itc_jitter_data // itc_jitter
220 movl r29 = xtime_lock 222 add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20 // wall_time
221 ld4 r2 = [r2] // process work pending flags 223 ld4 r2 = [r2] // process work pending flags
222 movl r27 = xtime 224 ;;
223 ;; // only one bundle here 225(p15) add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20 // monotonic_time
224 ld8 r21 = [r20] // first quad with control information 226 add r21 = IA64_CLKSRC_MMIO_OFFSET,r20
227 add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29
225 and r2 = TIF_ALLWORK_MASK,r2 228 and r2 = TIF_ALLWORK_MASK,r2
226(p6) br.cond.spnt.few .fail_einval // deferred branch 229(p6) br.cond.spnt.few .fail_einval // ? deferred branch
227 ;; 230 ;;
228 add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20 231 add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last
229 extr r3 = r21,32,32 // time_interpolator->nsec_per_cyc
230 extr r8 = r21,0,16 // time_interpolator->source
231 cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled 232 cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled
232(p6) br.cond.spnt.many fsys_fallback_syscall 233(p6) br.cond.spnt.many fsys_fallback_syscall
233 ;; 234 ;;
234 cmp.eq p8,p12 = 0,r8 // Check for cpu timer 235 // Begin critical section
235 cmp.eq p9,p0 = 1,r8 // MMIO64 ? 236.time_redo:
236 extr r2 = r21,24,8 // time_interpolator->jitter 237 ld4.acq r28 = [r20] // gtod_lock.sequence, Must take first
237 cmp.eq p10,p0 = 2,r8 // MMIO32 ? 238 ;;
238 cmp.ltu p11,p0 = 2,r8 // function or other clock 239 and r28 = ~1,r28 // And make sequence even to force retry if odd
239(p11) br.cond.spnt.many fsys_fallback_syscall
240 ;; 240 ;;
241 setf.sig f7 = r3 // Setup for scaling of counter 241 ld8 r30 = [r21] // clocksource->mmio_ptr
242(p15) movl r19 = wall_to_monotonic 242 add r24 = IA64_CLKSRC_MULT_OFFSET,r20
243(p12) ld8 r30 = [r10] 243 ld4 r2 = [r29] // itc_jitter value
244 cmp.ne p13,p0 = r2,r0 // need jitter compensation? 244 add r23 = IA64_CLKSRC_SHIFT_OFFSET,r20
245 extr r21 = r21,16,8 // shift factor 245 add r14 = IA64_CLKSRC_MASK_OFFSET,r20
246 ;; 246 ;;
247.time_redo: 247 ld4 r3 = [r24] // clocksource mult value
248 .pred.rel.mutex p8,p9,p10 248 ld8 r14 = [r14] // clocksource mask value
249 ld4.acq r28 = [r29] // xtime_lock.sequence. Must come first for locking purposes 249 cmp.eq p8,p9 = 0,r30 // use cpu timer if no mmio_ptr
250 ;; 250 ;;
251 and r28 = ~1,r28 // Make sequence even to force retry if odd 251 setf.sig f7 = r3 // Setup for mult scaling of counter
252(p8) cmp.ne p13,p0 = r2,r0 // need itc_jitter compensation, set p13
253 ld4 r23 = [r23] // clocksource shift value
254 ld8 r24 = [r26] // get clksrc_cycle_last value
255(p9) cmp.eq p13,p0 = 0,r30 // if mmio_ptr, clear p13 jitter control
252 ;; 256 ;;
257 .pred.rel.mutex p8,p9
253(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! 258(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!!
254 add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20 259(p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues..
255(p9) ld8 r2 = [r30] // readq(ti->address). Could also have latency issues.. 260(p13) ld8 r25 = [r19] // get itc_lastcycle value
256(p10) ld4 r2 = [r30] // readw(ti->address) 261 ;; // ? could be removed by moving the last add upward
257(p13) add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20 262 ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec
258 ;; // could be removed by moving the last add upward 263 ;;
259 ld8 r26 = [r22] // time_interpolator->last_counter 264 ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET // tv_nsec
260(p13) ld8 r25 = [r23] // time interpolator->last_cycle 265(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm)
261 add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20 266 ;;
262(p15) ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET 267(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared
263 ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET 268 sub r10 = r2,r24 // current_cycle - last_cycle
264 add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20 269 ;;
265 ;; 270(p6) sub r10 = r25,r24 // time we got was less than last_cycle
266 ld8 r18 = [r24] // time_interpolator->offset
267 ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET // xtime.tv_nsec
268(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm)
269 ;;
270 ld8 r14 = [r14] // time_interpolator->mask
271(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared
272 sub r10 = r2,r26 // current_counter - last_counter
273 ;;
274(p6) sub r10 = r25,r26 // time we got was less than last_cycle
275(p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg 271(p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg
276 ;; 272 ;;
273(p7) cmpxchg8.rel r3 = [r19],r2,ar.ccv
274 ;;
275(p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful
276 ;;
277(p7) sub r10 = r3,r24 // then use new last_cycle instead
278 ;;
277 and r10 = r10,r14 // Apply mask 279 and r10 = r10,r14 // Apply mask
278 ;; 280 ;;
279 setf.sig f8 = r10 281 setf.sig f8 = r10
280 nop.i 123 282 nop.i 123
281 ;; 283 ;;
282(p7) cmpxchg8.rel r3 = [r23],r2,ar.ccv 284 // fault check takes 5 cycles and we have spare time
283EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare time 285EX(.fail_efault, probe.w.fault r31, 3)
284 xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter) 286 xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter)
285(p15) add r9 = r9,r17 // Add wall to monotonic.secs to result secs
286 ;; 287 ;;
287(p15) ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET 288 // ? simulate tbit.nz.or p7,p0 = r28,0
288(p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful redo
289 // simulate tbit.nz.or p7,p0 = r28,0
290 getf.sig r2 = f8 289 getf.sig r2 = f8
291 mf 290 mf
292 add r8 = r8,r18 // Add time interpolator offset
293 ;; 291 ;;
294 ld4 r10 = [r29] // xtime_lock.sequence 292 ld4 r10 = [r20] // gtod_lock.sequence
295(p15) add r8 = r8, r17 // Add monotonic.nsecs to nsecs 293 shr.u r2 = r2,r23 // shift by factor
296 shr.u r2 = r2,r21 294 ;; // ? overloaded 3 bundles!
297 ;; // overloaded 3 bundles!
298 // End critical section.
299 add r8 = r8,r2 // Add xtime.nsecs 295 add r8 = r8,r2 // Add xtime.nsecs
300 cmp4.ne.or p7,p0 = r28,r10 296 cmp4.ne p7,p0 = r28,r10
301(p7) br.cond.dpnt.few .time_redo // sequence number changed ? 297(p7) br.cond.dpnt.few .time_redo // sequence number changed, redo
298 // End critical section.
302 // Now r8=tv->tv_nsec and r9=tv->tv_sec 299 // Now r8=tv->tv_nsec and r9=tv->tv_sec
303 mov r10 = r0 300 mov r10 = r0
304 movl r2 = 1000000000 301 movl r2 = 1000000000
@@ -308,19 +305,19 @@ EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare
308.time_normalize: 305.time_normalize:
309 mov r21 = r8 306 mov r21 = r8
310 cmp.ge p6,p0 = r8,r2 307 cmp.ge p6,p0 = r8,r2
311(p14) shr.u r20 = r8, 3 // We can repeat this if necessary just wasting some time 308(p14) shr.u r20 = r8, 3 // We can repeat this if necessary just wasting time
312 ;; 309 ;;
313(p14) setf.sig f8 = r20 310(p14) setf.sig f8 = r20
314(p6) sub r8 = r8,r2 311(p6) sub r8 = r8,r2
315(p6) add r9 = 1,r9 // two nops before the branch. 312(p6) add r9 = 1,r9 // two nops before the branch.
316(p14) setf.sig f7 = r3 // Chances for repeats are 1 in 10000 for gettod 313(p14) setf.sig f7 = r3 // Chances for repeats are 1 in 10000 for gettod
317(p6) br.cond.dpnt.few .time_normalize 314(p6) br.cond.dpnt.few .time_normalize
318 ;; 315 ;;
319 // Divided by 8 though shift. Now divide by 125 316 // Divided by 8 though shift. Now divide by 125
320 // The compiler was able to do that with a multiply 317 // The compiler was able to do that with a multiply
321 // and a shift and we do the same 318 // and a shift and we do the same
322EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles 319EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles
323(p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it... 320(p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it
324 ;; 321 ;;
325 mov r8 = r0 322 mov r8 = r0
326(p14) getf.sig r2 = f8 323(p14) getf.sig r2 = f8