Diffstat (limited to 'arch/tile/lib/atomic_asm_32.S')
-rw-r--r--  arch/tile/lib/atomic_asm_32.S  196
1 files changed, 196 insertions, 0 deletions
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
new file mode 100644
index 00000000000..5a5514b77e7
--- /dev/null
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -0,0 +1,196 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ *
+ * Support routines for atomic operations. Each function takes:
+ *
+ * r0: address to manipulate
+ * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG)
+ * r2: new value to write, or for cmpxchg/add_unless, value to compare against
+ * r3: (cmpxchg/xchg_add_unless) new value to write or add;
+ *     (atomic64 ops) high word of value to write
+ * r4/r5: (cmpxchg64/add_unless64) new value to write or add
+ *
+ * The 32-bit routines return a "struct __get_user" so that the futex code
+ * has an opportunity to return -EFAULT to the user if needed.
+ * The 64-bit routines just return a "long long" with the value,
+ * since they are only used from kernel space and don't expect to fault.
+ * Support for 16-bit ops is included in the framework but we don't provide
+ * any (x86_64 has an atomic_inc_short(), so we might want to some day).
+ *
+ * Note that the caller is advised to issue a suitable L1 or L2
+ * prefetch on the address being manipulated to avoid extra stalls.
+ * In addition, the hot path is on two icache lines, and we start with
+ * a jump to the second line to make sure they are both in cache so
+ * that we never stall waiting on icache fill while holding the lock.
+ * (This doesn't work out with most 64-bit ops, since they consume
+ * too many bundles, so may take an extra i-cache stall.)
+ *
+ * These routines set the INTERRUPT_CRITICAL_SECTION bit, just
+ * like sys_cmpxchg(), so that NMIs like PERF_COUNT will not interrupt
+ * the code, just page faults.
+ *
+ * If the load or store faults in a way that can be directly fixed in
+ * the do_page_fault_ics() handler (e.g. a vmalloc reference) we fix it
+ * directly, return to the instruction that faulted, and retry it.
+ *
+ * If the load or store faults in a way that potentially requires us
+ * to release the atomic lock, then retry (e.g. a migrating PTE), we
+ * reset the PC in do_page_fault_ics() to the "tns" instruction so
+ * that on return we will reacquire the lock and restart the op. We
+ * are somewhat overloading the exception_table_entry notion by doing
+ * this, since those entries are not normally used for migrating PTEs.
+ *
+ * If the main page fault handler discovers a bad address, it will see
+ * the PC pointing to the "tns" instruction (due to the earlier
+ * exception_table_entry processing in do_page_fault_ics), and
+ * re-reset the PC to the fault handler, atomic_bad_address(), which
+ * effectively takes over from the atomic op and can either return a
+ * bad "struct __get_user" (for user addresses) or can just panic (for
+ * bad kernel addresses).
+ *
+ * Note that if the value we would store is the same as what we
+ * loaded, we bypass the store. Other platforms with true atomics can
+ * make the guarantee that a non-atomic __clear_bit(), for example,
+ * can safely race with an atomic test_and_set_bit(); this example is
+ * from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do
+ * that on Tile since the "atomic" op is really just a
+ * read/modify/write, and can race with the non-atomic
+ * read/modify/write. However, if we can short-circuit the write when
+ * it is not needed, in the atomic case, we avoid the race.
+ */
+
+#include <linux/linkage.h>
+#include <asm/atomic.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+
+	.section .text.atomic,"ax"
+ENTRY(__start_atomic_asm_code)
+
+	.macro atomic_op, name, bitwidth, body
+	.align 64
+STD_ENTRY_SECTION(__atomic\name, .text.atomic)
+	{
+	movei	r24, 1
+	j	4f		/* branch to second cache line */
+	}
+1:	{
+	.ifc \bitwidth,16
+	lh	r22, r0
+	.else
+	lw	r22, r0
+	addi	r28, r0, 4
+	.endif
+	}
+	.ifc \bitwidth,64
+	lw	r23, r28
+	.endif
+	\body /* set r24, and r25 if 64-bit */
+	{
+	seq	r26, r22, r24
+	seq	r27, r23, r25
+	}
+	.ifc \bitwidth,64
+	bbnst	r27, 2f
+	.endif
+	bbs	r26, 3f		/* skip write-back if it's the same value */
+2:	{
+	.ifc \bitwidth,16
+	sh	r0, r24
+	.else
+	sw	r0, r24
+	.endif
+	}
+	.ifc \bitwidth,64
+	sw	r28, r25
+	.endif
+	mf
+3:	{
+	move	r0, r22
+	.ifc \bitwidth,64
+	move	r1, r23
+	.else
+	move	r1, zero
+	.endif
+	sw	ATOMIC_LOCK_REG_NAME, zero
+	}
+	mtspr	INTERRUPT_CRITICAL_SECTION, zero
+	jrp	lr
+4:	{
+	move	ATOMIC_LOCK_REG_NAME, r1
+	mtspr	INTERRUPT_CRITICAL_SECTION, r24
+	}
+#ifndef CONFIG_SMP
+	j	1b		/* no atomic locks */
+#else
+	{
+	tns	r21, ATOMIC_LOCK_REG_NAME
+	moveli	r23, 2048	/* maximum backoff time in cycles */
+	}
+	{
+	bzt	r21, 1b		/* branch if lock acquired */
+	moveli	r25, 32		/* starting backoff time in cycles */
+	}
+5:	mtspr	INTERRUPT_CRITICAL_SECTION, zero
+	mfspr	r26, CYCLE_LOW	/* get start point for this backoff */
+6:	mfspr	r22, CYCLE_LOW	/* test to see if we've backed off enough */
+	sub	r22, r22, r26
+	slt	r22, r22, r25
+	bbst	r22, 6b
+	{
+	mtspr	INTERRUPT_CRITICAL_SECTION, r24
+	shli	r25, r25, 1	/* double the backoff; retry the tns */
+	}
+	{
+	tns	r21, ATOMIC_LOCK_REG_NAME
+	slt	r26, r23, r25	/* is the proposed backoff too big? */
+	}
+	{
+	bzt	r21, 1b		/* branch if lock acquired */
+	mvnz	r25, r26, r23
+	}
+	j	5b
+#endif
+	STD_ENDPROC(__atomic\name)
+	.ifc \bitwidth,32
+	.pushsection __ex_table,"a"
+	.word	1b, __atomic\name
+	.word	2b, __atomic\name
+	.word	__atomic\name, __atomic_bad_address
+	.popsection
+	.endif
+	.endm
+
+atomic_op _cmpxchg, 32, "seq r26, r22, r2; { bbns r26, 3f; move r24, r3 }"
+atomic_op _xchg, 32, "move r24, r2"
+atomic_op _xchg_add, 32, "add r24, r22, r2"
+atomic_op _xchg_add_unless, 32, \
+	"sne r26, r22, r2; { bbns r26, 3f; add r24, r22, r3 }"
+atomic_op _or, 32, "or r24, r22, r2"
+atomic_op _andn, 32, "nor r2, r2, zero; and r24, r22, r2"
+atomic_op _xor, 32, "xor r24, r22, r2"
+
+atomic_op 64_cmpxchg, 64, "{ seq r26, r22, r2; seq r27, r23, r3 }; \
+	{ bbns r26, 3f; move r24, r4 }; { bbns r27, 3f; move r25, r5 }"
+atomic_op 64_xchg, 64, "{ move r24, r2; move r25, r3 }"
+atomic_op 64_xchg_add, 64, "{ add r24, r22, r2; add r25, r23, r3 }; \
+	slt_u r26, r24, r22; add r25, r25, r26"
+atomic_op 64_xchg_add_unless, 64, \
+	"{ sne r26, r22, r2; sne r27, r23, r3 }; \
+	{ bbns r26, 3f; add r24, r22, r4 }; \
+	{ bbns r27, 3f; add r25, r23, r5 }; \
+	slt_u r26, r24, r22; add r25, r25, r26"
+
+	jrp	lr		/* happy backtracer */
+
+ENTRY(__end_atomic_asm_code)
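
A note on the calling convention described in the header comment: each 32-bit routine takes the target address in r0, a pointer to the guarding lock in r1, and its operands in r2/r3, and returns a "struct __get_user" so the futex path can report -EFAULT. A rough C-level view of a caller is sketched below; the struct layout, the __atomic_xchg_add() prototype, and the lock-lookup helper are assumptions made for illustration, not text copied from the Tile headers.

/*
 * Illustrative sketch only: the field names, the C prototype of
 * __atomic_xchg_add() (the symbol defined above by "atomic_op _xchg_add"),
 * and the lock-lookup helper are assumed from the register convention in
 * the header comment, not taken verbatim from arch/tile headers.
 */
struct __get_user {
	unsigned long val;	/* value loaded from the target word */
	long err;		/* 0, or -EFAULT if the access faulted */
};

/* Assembly routine generated by the atomic_op macro above. */
extern struct __get_user __atomic_xchg_add(volatile int *p, int *lock, int n);

/* Hypothetical hash from the target address to its guard lock. */
extern int *example_atomic_lock_for(volatile int *p);

static inline int example_atomic_add_return(int i, volatile int *p)
{
	/* r0 = p, r1 = guard lock, r2 = addend; old value comes back in .val */
	return __atomic_xchg_add(p, example_atomic_lock_for(p), i).val + i;
}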
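
The write short-circuit mentioned at the end of the header comment is the "bbs r26, 3f" bundle: when the computed value equals the loaded value, the store and the following memory fence are skipped, so the op cannot clobber a racing non-atomic write. A minimal pseudo-C sketch of the locked read-modify-write core, with invented names, is:

/*
 * Pseudo-C shape of the per-op body run under the lock; "compute" stands
 * in for whatever the atomic_op macro pastes in (add, or, xor, ...).
 * Lock handling and fault fixup are omitted.
 */
static int locked_rmw(volatile int *p, int (*compute)(int old))
{
	int old = *p;			/* lw r22, r0 */
	int newval = compute(old);	/* \body sets r24 */

	if (newval != old)		/* bbs r26, 3f: skip write-back */
		*p = newval;		/* sw r0, r24, then mf */
	return old;			/* old value returned in r0 */
}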
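
The CONFIG_SMP branch acquires the lock with "tns" and, on contention, backs off exponentially: it drops INTERRUPT_CRITICAL_SECTION, spins on CYCLE_LOW for the current interval, doubles the interval, and clamps it at 2048 cycles before retrying. The following C sketch mirrors that control flow; the helpers try_test_and_set(), read_cycle_counter(), and set_ics() are invented stand-ins for the tns instruction, the CYCLE_LOW SPR, and the INTERRUPT_CRITICAL_SECTION SPR.

/* Backoff parameters taken from the moveli immediates above. */
#define BACKOFF_START	32	/* cycles */
#define BACKOFF_MAX	2048	/* cycles */

/* Hypothetical helpers standing in for tns, CYCLE_LOW and the ICS SPR. */
extern int try_test_and_set(int *lock);	/* returns the old value; 0 = acquired */
extern unsigned int read_cycle_counter(void);
extern void set_ics(int on);

static void acquire_atomic_lock(int *lock)
{
	unsigned int backoff = BACKOFF_START;

	set_ics(1);
	while (try_test_and_set(lock) != 0) {
		unsigned int start;

		/* Back off with the critical section released, as at label 5: above. */
		set_ics(0);
		start = read_cycle_counter();
		while (read_cycle_counter() - start < backoff)
			;		/* spin for "backoff" cycles */
		set_ics(1);

		/* Double the backoff, but never beyond the 2048-cycle cap. */
		backoff *= 2;
		if (backoff > BACKOFF_MAX)
			backoff = BACKOFF_MAX;
	}
}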