1 files changed, 88 insertions, 91 deletions
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index 3f926c2dc708..44841971f077 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -147,12 +147,11 @@ ENTRY(fsys_set_tid_address)
        FSYS_RETURN
 END(fsys_set_tid_address)
-/*
+#if IA64_GTOD_LOCK_OFFSET !=0
- * Ensure that the time interpolator structure is compatible with the asm code
+#error fsys_gettimeofday incompatible with changes to struct fsyscall_gtod_data_t
- */
+#endif
-#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \
+#if IA64_ITC_JITTER_OFFSET !=0
-        || IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4
+#error fsys_gettimeofday incompatible with changes to struct itc_jitter_data_t
-#error fsys_gettimeofday incompatible with changes to struct time_interpolator
 #endif
 #define CLOCK_REALTIME 0
 #define CLOCK_MONOTONIC 1
@@ -179,126 +178,124 @@ ENTRY(fsys_gettimeofday)
        // r11 = preserved: saved ar.pfs
        // r12 = preserved: memory stack
        // r13 = preserved: thread pointer
-        // r14 = address of mask / mask
+        // r14 = address of mask / mask value
        // r15 = preserved: system call number
        // r16 = preserved: current task pointer
-        // r17 = wall to monotonic use
+        // r17 = (not used)
-        // r18 = time_interpolator->offset
+        // r18 = (not used)
-        // r19 = address of wall_to_monotonic
+        // r19 = address of itc_lastcycle
-        // r20 = pointer to struct time_interpolator / pointer to time_interpolator->address
+        // r20 = struct fsyscall_gtod_data (= address of gtod_lock.sequence)
-        // r21 = shift factor
+        // r21 = address of mmio_ptr
-        // r22 = address of time interpolator->last_counter
+        // r22 = address of wall_time or monotonic_time
-        // r23 = address of time_interpolator->last_cycle
+        // r23 = address of shift / value
-        // r24 = adress of time_interpolator->offset
+        // r24 = address mult factor / cycle_last value
-        // r25 = last_cycle value
+        // r25 = itc_lastcycle value
-        // r26 = last_counter value
+        // r26 = address clocksource cycle_last
-        // r27 = pointer to xtime
+        // r27 = (not used)
        // r28 = sequence number at the beginning of critcal section
-        // r29 = address of seqlock
+        // r29 = address of itc_jitter
        // r30 = time processing flags / memory address
        // r31 = pointer to result
        // Predicates
        // p6,p7 short term use
        // p8 = timesource ar.itc
        // p9 = timesource mmio64
-        // p10 = timesource mmio32
+        // p10 = timesource mmio32 - not used
        // p11 = timesource not to be handled by asm code
-        // p12 = memory time source ( = p9 | p10)
+        // p12 = memory time source ( = p9 | p10) - not used
-        // p13 = do cmpxchg with time_interpolator_last_cycle
+        // p13 = do cmpxchg with itc_lastcycle
        // p14 = Divide by 1000
        // p15 = Add monotonic
        //
-        // Note that instructions are optimized for McKinley. McKinley can process two
+        // Note that instructions are optimized for McKinley. McKinley can
-        // bundles simultaneously and therefore we continuously try to feed the CPU
+        // process two bundles simultaneously and therefore we continuously
-        // two bundles and then a stop.
+        // try to feed the CPU two bundles and then a stop.
-        tnat.nz p6,p0 = r31     // branch deferred since it does not fit into bundle structure
+        //
+        // Additional note that code has changed a lot. Optimization is TBD.
+        // Comments beginning with "?" may be outdated.
+        tnat.nz p6,p0 = r31     // ? branch deferred to fit later bundle
        mov pr = r30,0xc000     // Set predicates according to function
        add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
-        movl r20 = time_interpolator
+        movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address
        ;;
-        ld8 r20 = [r20]         // get pointer to time_interpolator structure
+        movl r29 = itc_jitter_data      // itc_jitter
-        movl r29 = xtime_lock
+        add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20        // wall_time
        ld4 r2 = [r2]           // process work pending flags
-        movl r27 = xtime
+        ;;
-        ;;      // only one bundle here
+(p15)   add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20        // monotonic_time
-        ld8 r21 = [r20]         // first quad with control information
+        add r21 = IA64_CLKSRC_MMIO_OFFSET,r20
+        add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29
        and r2 = TIF_ALLWORK_MASK,r2
-(p6)    br.cond.spnt.few .fail_einval   // deferred branch
+(p6)    br.cond.spnt.few .fail_einval   // ? deferred branch
        ;;
-        add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20
+        add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last
-        extr r3 = r21,32,32     // time_interpolator->nsec_per_cyc
-        extr r8 = r21,0,16      // time_interpolator->source
        cmp.ne p6, p0 = 0, r2   // Fallback if work is scheduled
 (p6)    br.cond.spnt.many fsys_fallback_syscall
        ;;
-        cmp.eq p8,p12 = 0,r8    // Check for cpu timer
+        // Begin critical section
-        cmp.eq p9,p0 = 1,r8     // MMIO64 ?
+.time_redo:
-        extr r2 = r21,24,8      // time_interpolator->jitter
+        ld4.acq r28 = [r20]     // gtod_lock.sequence, Must take first
-        cmp.eq p10,p0 = 2,r8    // MMIO32 ?
+        ;;
-        cmp.ltu p11,p0 = 2,r8   // function or other clock
+        and r28 = ~1,r28        // And make sequence even to force retry if odd
-(p11)   br.cond.spnt.many fsys_fallback_syscall
        ;;
-        setf.sig f7 = r3        // Setup for scaling of counter
+        ld8 r30 = [r21]         // clocksource->mmio_ptr
-(p15)   movl r19 = wall_to_monotonic
+        add r24 = IA64_CLKSRC_MULT_OFFSET,r20
-(p12)   ld8 r30 = [r10]
+        ld4 r2 = [r29]          // itc_jitter value
-        cmp.ne p13,p0 = r2,r0   // need jitter compensation?
+        add r23 = IA64_CLKSRC_SHIFT_OFFSET,r20
-        extr r21 = r21,16,8     // shift factor
+        add r14 = IA64_CLKSRC_MASK_OFFSET,r20
        ;;
-.time_redo:
+        ld4 r3 = [r24]          // clocksource mult value
-        .pred.rel.mutex p8,p9,p10
+        ld8 r14 = [r14]         // clocksource mask value
-        ld4.acq r28 = [r29]     // xtime_lock.sequence. Must come first for locking purposes
+        cmp.eq p8,p9 = 0,r30    // use cpu timer if no mmio_ptr
        ;;
-        and r28 = ~1,r28        // Make sequence even to force retry if odd
+        setf.sig f7 = r3        // Setup for mult scaling of counter
+(p8)    cmp.ne p13,p0 = r2,r0   // need itc_jitter compensation, set p13
+        ld4 r23 = [r23]         // clocksource shift value
+        ld8 r24 = [r26]         // get clksrc_cycle_last value
+(p9)    cmp.eq p13,p0 = 0,r30   // if mmio_ptr, clear p13 jitter control
        ;;
+        .pred.rel.mutex p8,p9
 (p8)    mov r2 = ar.itc         // CPU_TIMER. 36 clocks latency!!!
-        add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20
+(p9)    ld8 r2 = [r30]          // MMIO_TIMER. Could also have latency issues..
-(p9)    ld8 r2 = [r30]          // readq(ti->address). Could also have latency issues..
+(p13)   ld8 r25 = [r19]         // get itc_lastcycle value
-(p10)   ld4 r2 = [r30]          // readw(ti->address)
+        ;;              // ? could be removed by moving the last add upward
-(p13)   add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20
+        ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET     // tv_sec
-        ;;                      // could be removed by moving the last add upward
+        ;;
-        ld8 r26 = [r22]         // time_interpolator->last_counter
+        ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET    // tv_nsec
-(p13)   ld8 r25 = [r23]         // time interpolator->last_cycle
+(p13)   sub r3 = r25,r2         // Diff needed before comparison (thanks davidm)
-        add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20
+        ;;
-(p15)   ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET
+(p13)   cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared
-        ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET
+        sub r10 = r2,r24        // current_cycle - last_cycle
-        add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20
+        ;;
-        ;;
+(p6)    sub r10 = r25,r24       // time we got was less than last_cycle
-        ld8 r18 = [r24]         // time_interpolator->offset
-        ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET    // xtime.tv_nsec
-(p13)   sub r3 = r25,r2 // Diff needed before comparison (thanks davidm)
-        ;;
-        ld8 r14 = [r14]         // time_interpolator->mask
-(p13)   cmp.gt.unc p6,p7 = r3,r0        // check if it is less than last. p6,p7 cleared
-        sub r10 = r2,r26        // current_counter - last_counter
-        ;;
-(p6)    sub r10 = r25,r26       // time we got was less than last_cycle
 (p7)    mov ar.ccv = r25        // more than last_cycle. Prep for cmpxchg
        ;;
+(p7)    cmpxchg8.rel r3 = [r19],r2,ar.ccv
+        ;;
+(p7)    cmp.ne p7,p0 = r25,r3   // if cmpxchg not successful
+        ;;
+(p7)    sub r10 = r3,r24        // then use new last_cycle instead
+        ;;
        and r10 = r10,r14       // Apply mask
        ;;
        setf.sig f8 = r10
        nop.i 123
        ;;
-(p7)    cmpxchg8.rel r3 = [r23],r2,ar.ccv
+        // fault check takes 5 cycles and we have spare time
-EX(.fail_efault, probe.w.fault r31, 3)  // This takes 5 cycles and we have spare time
+EX(.fail_efault, probe.w.fault r31, 3)
        xmpy.l f8 = f8,f7       // nsec_per_cyc*(counter-last_counter)
-(p15)   add r9 = r9,r17         // Add wall to monotonic.secs to result secs
        ;;
-(p15)   ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET
+        // ? simulate tbit.nz.or p7,p0 = r28,0
-(p7)    cmp.ne p7,p0 = r25,r3   // if cmpxchg not successful redo
-        // simulate tbit.nz.or p7,p0 = r28,0
        getf.sig r2 = f8
        mf
-        add r8 = r8,r18         // Add time interpolator offset
        ;;
-        ld4 r10 = [r29]         // xtime_lock.sequence
+        ld4 r10 = [r20]         // gtod_lock.sequence
-(p15)   add r8 = r8, r17        // Add monotonic.nsecs to nsecs
+        shr.u r2 = r2,r23       // shift by factor
-        shr.u r2 = r2,r21
+        ;;              // ? overloaded 3 bundles!
-        ;;              // overloaded 3 bundles!
-        // End critical section.
        add r8 = r8,r2          // Add xtime.nsecs
-        cmp4.ne.or p7,p0 = r28,r10
+        cmp4.ne p7,p0 = r28,r10
-(p7)    br.cond.dpnt.few .time_redo     // sequence number changed ?
+(p7)    br.cond.dpnt.few .time_redo     // sequence number changed, redo
+        // End critical section.
        // Now r8=tv->tv_nsec and r9=tv->tv_sec
        mov r10 = r0
        movl r2 = 1000000000
@@ -308,19 +305,19 @@ EX(.fail_efault, probe.w.fault r31, 3)	// This takes 5 cycles and we have spare
 .time_normalize:
        mov r21 = r8
        cmp.ge p6,p0 = r8,r2
-(p14)   shr.u r20 = r8, 3               // We can repeat this if necessary just wasting some time
+(p14)   shr.u r20 = r8, 3 // We can repeat this if necessary just wasting time
        ;;
 (p14)   setf.sig f8 = r20
 (p6)    sub r8 = r8,r2
-(p6)    add r9 = 1,r9                   // two nops before the branch.
+(p6)    add r9 = 1,r9           // two nops before the branch.
-(p14)   setf.sig f7 = r3                // Chances for repeats are 1 in 10000 for gettod
+(p14)   setf.sig f7 = r3        // Chances for repeats are 1 in 10000 for gettod
 (p6)    br.cond.dpnt.few .time_normalize
        ;;
        // Divided by 8 though shift. Now divide by 125
        // The compiler was able to do that with a multiply
        // and a shift and we do the same
-EX(.fail_efault, probe.w.fault r23, 3)          // This also costs 5 cycles
+EX(.fail_efault, probe.w.fault r23, 3)  // This also costs 5 cycles
-(p14)   xmpy.hu f8 = f8, f7                     // xmpy has 5 cycles latency so use it...
+(p14)   xmpy.hu f8 = f8, f7             // xmpy has 5 cycles latency so use it
        ;;
        mov r8 = r0
 (p14)   getf.sig r2 = f8

diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 3f926c2dc708..44841971f077 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S
@@ -147,12 +147,11 @@ ENTRY(fsys_set_tid_address)
147	FSYS_RETURN	147	FSYS_RETURN
148	END(fsys_set_tid_address)	148	END(fsys_set_tid_address)
149		149
150	/*	150	#if IA64_GTOD_LOCK_OFFSET !=0
151	* Ensure that the time interpolator structure is compatible with the asm code	151	#error fsys_gettimeofday incompatible with changes to struct fsyscall_gtod_data_t
152	*/	152	#endif
153	#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 \|\| IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \	153	#if IA64_ITC_JITTER_OFFSET !=0
154	\|\| IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 \|\| IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4	154	#error fsys_gettimeofday incompatible with changes to struct itc_jitter_data_t
155	#error fsys_gettimeofday incompatible with changes to struct time_interpolator
156	#endif	155	#endif
157	#define CLOCK_REALTIME 0	156	#define CLOCK_REALTIME 0
158	#define CLOCK_MONOTONIC 1	157	#define CLOCK_MONOTONIC 1
@@ -179,126 +178,124 @@ ENTRY(fsys_gettimeofday)
179	// r11 = preserved: saved ar.pfs	178	// r11 = preserved: saved ar.pfs
180	// r12 = preserved: memory stack	179	// r12 = preserved: memory stack
181	// r13 = preserved: thread pointer	180	// r13 = preserved: thread pointer
182	// r14 = address of mask / mask	181	// r14 = address of mask / mask value
183	// r15 = preserved: system call number	182	// r15 = preserved: system call number
184	// r16 = preserved: current task pointer	183	// r16 = preserved: current task pointer
185	// r17 = wall to monotonic use	184	// r17 = (not used)
186	// r18 = time_interpolator->offset	185	// r18 = (not used)
187	// r19 = address of wall_to_monotonic	186	// r19 = address of itc_lastcycle
188	// r20 = pointer to struct time_interpolator / pointer to time_interpolator->address	187	// r20 = struct fsyscall_gtod_data (= address of gtod_lock.sequence)
189	// r21 = shift factor	188	// r21 = address of mmio_ptr
190	// r22 = address of time interpolator->last_counter	189	// r22 = address of wall_time or monotonic_time
191	// r23 = address of time_interpolator->last_cycle	190	// r23 = address of shift / value
192	// r24 = adress of time_interpolator->offset	191	// r24 = address mult factor / cycle_last value
193	// r25 = last_cycle value	192	// r25 = itc_lastcycle value
194	// r26 = last_counter value	193	// r26 = address clocksource cycle_last
195	// r27 = pointer to xtime	194	// r27 = (not used)
196	// r28 = sequence number at the beginning of critcal section	195	// r28 = sequence number at the beginning of critcal section
197	// r29 = address of seqlock	196	// r29 = address of itc_jitter
198	// r30 = time processing flags / memory address	197	// r30 = time processing flags / memory address
199	// r31 = pointer to result	198	// r31 = pointer to result
200	// Predicates	199	// Predicates
201	// p6,p7 short term use	200	// p6,p7 short term use
202	// p8 = timesource ar.itc	201	// p8 = timesource ar.itc
203	// p9 = timesource mmio64	202	// p9 = timesource mmio64
204	// p10 = timesource mmio32	203	// p10 = timesource mmio32 - not used
205	// p11 = timesource not to be handled by asm code	204	// p11 = timesource not to be handled by asm code
206	// p12 = memory time source ( = p9 \| p10)	205	// p12 = memory time source ( = p9 \| p10) - not used
207	// p13 = do cmpxchg with time_interpolator_last_cycle	206	// p13 = do cmpxchg with itc_lastcycle
208	// p14 = Divide by 1000	207	// p14 = Divide by 1000
209	// p15 = Add monotonic	208	// p15 = Add monotonic
210	//	209	//
211	// Note that instructions are optimized for McKinley. McKinley can process two	210	// Note that instructions are optimized for McKinley. McKinley can
212	// bundles simultaneously and therefore we continuously try to feed the CPU	211	// process two bundles simultaneously and therefore we continuously
213	// two bundles and then a stop.	212	// try to feed the CPU two bundles and then a stop.
214	tnat.nz p6,p0 = r31 // branch deferred since it does not fit into bundle structure	213	//
		214	// Additional note that code has changed a lot. Optimization is TBD.
		215	// Comments beginning with "?" may be outdated.
		216	tnat.nz p6,p0 = r31 // ? branch deferred to fit later bundle
215	mov pr = r30,0xc000 // Set predicates according to function	217	mov pr = r30,0xc000 // Set predicates according to function
216	add r2 = TI_FLAGS+IA64_TASK_SIZE,r16	218	add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
217	movl r20 = time_interpolator	219	movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address
218	;;	220	;;
219	ld8 r20 = [r20] // get pointer to time_interpolator structure	221	movl r29 = itc_jitter_data // itc_jitter
220	movl r29 = xtime_lock	222	add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20 // wall_time
221	ld4 r2 = [r2] // process work pending flags	223	ld4 r2 = [r2] // process work pending flags
222	movl r27 = xtime	224	;;
223	;; // only one bundle here	225	(p15) add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20 // monotonic_time
224	ld8 r21 = [r20] // first quad with control information	226	add r21 = IA64_CLKSRC_MMIO_OFFSET,r20
		227	add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29
225	and r2 = TIF_ALLWORK_MASK,r2	228	and r2 = TIF_ALLWORK_MASK,r2
226	(p6) br.cond.spnt.few .fail_einval // deferred branch	229	(p6) br.cond.spnt.few .fail_einval // ? deferred branch
227	;;	230	;;
228	add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20	231	add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last
229	extr r3 = r21,32,32 // time_interpolator->nsec_per_cyc
230	extr r8 = r21,0,16 // time_interpolator->source
231	cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled	232	cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled
232	(p6) br.cond.spnt.many fsys_fallback_syscall	233	(p6) br.cond.spnt.many fsys_fallback_syscall
233	;;	234	;;
234	cmp.eq p8,p12 = 0,r8 // Check for cpu timer	235	// Begin critical section
235	cmp.eq p9,p0 = 1,r8 // MMIO64 ?	236	.time_redo:
236	extr r2 = r21,24,8 // time_interpolator->jitter	237	ld4.acq r28 = [r20] // gtod_lock.sequence, Must take first
237	cmp.eq p10,p0 = 2,r8 // MMIO32 ?	238	;;
238	cmp.ltu p11,p0 = 2,r8 // function or other clock	239	and r28 = ~1,r28 // And make sequence even to force retry if odd
239	(p11) br.cond.spnt.many fsys_fallback_syscall
240	;;	240	;;
241	setf.sig f7 = r3 // Setup for scaling of counter	241	ld8 r30 = [r21] // clocksource->mmio_ptr
242	(p15) movl r19 = wall_to_monotonic	242	add r24 = IA64_CLKSRC_MULT_OFFSET,r20
243	(p12) ld8 r30 = [r10]	243	ld4 r2 = [r29] // itc_jitter value
244	cmp.ne p13,p0 = r2,r0 // need jitter compensation?	244	add r23 = IA64_CLKSRC_SHIFT_OFFSET,r20
245	extr r21 = r21,16,8 // shift factor	245	add r14 = IA64_CLKSRC_MASK_OFFSET,r20
246	;;	246	;;
247	.time_redo:	247	ld4 r3 = [r24] // clocksource mult value
248	.pred.rel.mutex p8,p9,p10	248	ld8 r14 = [r14] // clocksource mask value
249	ld4.acq r28 = [r29] // xtime_lock.sequence. Must come first for locking purposes	249	cmp.eq p8,p9 = 0,r30 // use cpu timer if no mmio_ptr
250	;;	250	;;
251	and r28 = ~1,r28 // Make sequence even to force retry if odd	251	setf.sig f7 = r3 // Setup for mult scaling of counter
		252	(p8) cmp.ne p13,p0 = r2,r0 // need itc_jitter compensation, set p13
		253	ld4 r23 = [r23] // clocksource shift value
		254	ld8 r24 = [r26] // get clksrc_cycle_last value
		255	(p9) cmp.eq p13,p0 = 0,r30 // if mmio_ptr, clear p13 jitter control
252	;;	256	;;
		257	.pred.rel.mutex p8,p9
253	(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!!	258	(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!!
254	add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20	259	(p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues..
255	(p9) ld8 r2 = [r30] // readq(ti->address). Could also have latency issues..	260	(p13) ld8 r25 = [r19] // get itc_lastcycle value
256	(p10) ld4 r2 = [r30] // readw(ti->address)	261	;; // ? could be removed by moving the last add upward
257	(p13) add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20	262	ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec
258	;; // could be removed by moving the last add upward	263	;;
259	ld8 r26 = [r22] // time_interpolator->last_counter	264	ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET // tv_nsec
260	(p13) ld8 r25 = [r23] // time interpolator->last_cycle	265	(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm)
261	add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20	266	;;
262	(p15) ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET	267	(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared
263	ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET	268	sub r10 = r2,r24 // current_cycle - last_cycle
264	add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20	269	;;
265	;;	270	(p6) sub r10 = r25,r24 // time we got was less than last_cycle
266	ld8 r18 = [r24] // time_interpolator->offset
267	ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET // xtime.tv_nsec
268	(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm)
269	;;
270	ld8 r14 = [r14] // time_interpolator->mask
271	(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared
272	sub r10 = r2,r26 // current_counter - last_counter
273	;;
274	(p6) sub r10 = r25,r26 // time we got was less than last_cycle
275	(p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg	271	(p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg
276	;;	272	;;
		273	(p7) cmpxchg8.rel r3 = [r19],r2,ar.ccv
		274	;;
		275	(p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful
		276	;;
		277	(p7) sub r10 = r3,r24 // then use new last_cycle instead
		278	;;
277	and r10 = r10,r14 // Apply mask	279	and r10 = r10,r14 // Apply mask
278	;;	280	;;
279	setf.sig f8 = r10	281	setf.sig f8 = r10
280	nop.i 123	282	nop.i 123
281	;;	283	;;
282	(p7) cmpxchg8.rel r3 = [r23],r2,ar.ccv	284	// fault check takes 5 cycles and we have spare time
283	EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare time	285	EX(.fail_efault, probe.w.fault r31, 3)
284	xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter)	286	xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter)
285	(p15) add r9 = r9,r17 // Add wall to monotonic.secs to result secs
286	;;	287	;;
287	(p15) ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET	288	// ? simulate tbit.nz.or p7,p0 = r28,0
288	(p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful redo
289	// simulate tbit.nz.or p7,p0 = r28,0
290	getf.sig r2 = f8	289	getf.sig r2 = f8
291	mf	290	mf
292	add r8 = r8,r18 // Add time interpolator offset
293	;;	291	;;
294	ld4 r10 = [r29] // xtime_lock.sequence	292	ld4 r10 = [r20] // gtod_lock.sequence
295	(p15) add r8 = r8, r17 // Add monotonic.nsecs to nsecs	293	shr.u r2 = r2,r23 // shift by factor
296	shr.u r2 = r2,r21	294	;; // ? overloaded 3 bundles!
297	;; // overloaded 3 bundles!
298	// End critical section.
299	add r8 = r8,r2 // Add xtime.nsecs	295	add r8 = r8,r2 // Add xtime.nsecs
300	cmp4.ne.or p7,p0 = r28,r10	296	cmp4.ne p7,p0 = r28,r10
301	(p7) br.cond.dpnt.few .time_redo // sequence number changed ?	297	(p7) br.cond.dpnt.few .time_redo // sequence number changed, redo
		298	// End critical section.
302	// Now r8=tv->tv_nsec and r9=tv->tv_sec	299	// Now r8=tv->tv_nsec and r9=tv->tv_sec
303	mov r10 = r0	300	mov r10 = r0
304	movl r2 = 1000000000	301	movl r2 = 1000000000
@@ -308,19 +305,19 @@ EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare
308	.time_normalize:	305	.time_normalize:
309	mov r21 = r8	306	mov r21 = r8
310	cmp.ge p6,p0 = r8,r2	307	cmp.ge p6,p0 = r8,r2
311	(p14) shr.u r20 = r8, 3 // We can repeat this if necessary just wasting some time	308	(p14) shr.u r20 = r8, 3 // We can repeat this if necessary just wasting time
312	;;	309	;;
313	(p14) setf.sig f8 = r20	310	(p14) setf.sig f8 = r20
314	(p6) sub r8 = r8,r2	311	(p6) sub r8 = r8,r2
315	(p6) add r9 = 1,r9 // two nops before the branch.	312	(p6) add r9 = 1,r9 // two nops before the branch.
316	(p14) setf.sig f7 = r3 // Chances for repeats are 1 in 10000 for gettod	313	(p14) setf.sig f7 = r3 // Chances for repeats are 1 in 10000 for gettod
317	(p6) br.cond.dpnt.few .time_normalize	314	(p6) br.cond.dpnt.few .time_normalize
318	;;	315	;;
319	// Divided by 8 though shift. Now divide by 125	316	// Divided by 8 though shift. Now divide by 125
320	// The compiler was able to do that with a multiply	317	// The compiler was able to do that with a multiply
321	// and a shift and we do the same	318	// and a shift and we do the same
322	EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles	319	EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles
323	(p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it...	320	(p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it
324	;;	321	;;
325	mov r8 = r0	322	mov r8 = r0
326	(p14) getf.sig r2 = f8	323	(p14) getf.sig r2 = f8