author     Chris Metcalf <cmetcalf@tilera.com>   2010-05-28 23:09:12 -0400
committer  Chris Metcalf <cmetcalf@tilera.com>   2010-06-04 17:11:18 -0400
commit     867e359b97c970a60626d5d76bbe2a8fadbf38fb
tree       c5ccbb7f5172e8555977119608ecb1eee3cc37e3 /arch/tile/lib/atomic_asm_32.S
parent     5360bd776f73d0a7da571d72a09a03f237e99900
arch/tile: core support for Tilera 32-bit chips.
This change is the core kernel support for TILEPro and TILE64 chips. No driver support (except the console driver) is included yet.

This includes the relevant Linux headers in asm/; the low-level "Tile architecture" headers in arch/, which are shared with the hypervisor, etc., and are build-system agnostic; and the relevant hypervisor headers in hv/.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Reviewed-by: Paul Mundt <lethal@linux-sh.org>
Diffstat (limited to 'arch/tile/lib/atomic_asm_32.S')
-rw-r--r--   arch/tile/lib/atomic_asm_32.S   197
1 file changed, 197 insertions, 0 deletions
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
new file mode 100644
index 000000000000..c0d058578192
--- /dev/null
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -0,0 +1,197 @@
/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 *
 * Support routines for atomic operations.  Each function takes:
 *
 * r0: address to manipulate
 * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG)
 * r2: new value to write, or for cmpxchg/add_unless, value to compare against
 * r3: (cmpxchg/xchg_add_unless) new value to write or add;
 *     (atomic64 ops) high word of value to write
 * r4/r5: (cmpxchg64/add_unless64) new value to write or add
 *
 * The 32-bit routines return a "struct __get_user" so that the futex code
 * has an opportunity to return -EFAULT to the user if needed.
 * The 64-bit routines just return a "long long" with the value,
 * since they are only used from kernel space and don't expect to fault.
 * Support for 16-bit ops is included in the framework but we don't provide
 * any (x86_64 has an atomic_inc_short(), so we might want to add one
 * some day).
 *
 * Note that the caller is advised to issue a suitable L1 or L2
 * prefetch on the address being manipulated to avoid extra stalls.
 * In addition, the hot path is on two icache lines, and we start with
 * a jump to the second line to make sure they are both in cache so
 * that we never stall waiting on icache fill while holding the lock.
 * (This doesn't work out with most 64-bit ops, since they consume
 * too many bundles, so they may take an extra i-cache stall.)
 *
 * These routines set the INTERRUPT_CRITICAL_SECTION bit, just
 * like sys_cmpxchg(), so that NMIs like PERF_COUNT will not interrupt
 * the code, just page faults.
 *
 * If the load or store faults in a way that can be directly fixed in
 * the do_page_fault_ics() handler (e.g. a vmalloc reference), we fix it
 * directly, return to the instruction that faulted, and retry it.
 *
 * If the load or store faults in a way that potentially requires us
 * to release the atomic lock and then retry (e.g. a migrating PTE), we
 * reset the PC in do_page_fault_ics() to the "tns" instruction so
 * that on return we will reacquire the lock and restart the op.  We
 * are somewhat overloading the exception_table_entry notion by doing
 * this, since those entries are not normally used for migrating PTEs.
 *
 * If the main page fault handler discovers a bad address, it will see
 * the PC pointing to the "tns" instruction (due to the earlier
 * exception_table_entry processing in do_page_fault_ics), and
 * re-reset the PC to the fault handler, atomic_bad_address(), which
 * effectively takes over from the atomic op and can either return a
 * bad "struct __get_user" (for user addresses) or can just panic (for
 * bad kernel addresses).
 *
 * Note that if the value we would store is the same as what we
 * loaded, we bypass the store.  Other platforms with true atomics can
 * make the guarantee that a non-atomic __clear_bit(), for example,
 * can safely race with an atomic test_and_set_bit(); this example is
 * from bit_spinlock.h in slub_lock() / slub_unlock().  We can't do
 * that on Tile since the "atomic" op is really just a
 * read/modify/write, and can race with the non-atomic
 * read/modify/write.  However, if we can short-circuit the write when
 * it is not needed, in the atomic case, we avoid the race.
 */

#include <linux/linkage.h>
#include <asm/atomic.h>
#include <asm/page.h>
#include <asm/processor.h>

        .section .text.atomic,"ax"
ENTRY(__start_atomic_asm_code)

        .macro  atomic_op, name, bitwidth, body
        .align  64
STD_ENTRY_SECTION(__atomic\name, .text.atomic)
        {
         movei  r24, 1
         j      4f              /* branch to second cache line */
        }
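        /*
         * 1: body of the operation, reached from label 4 below with
         * INTERRUPT_CRITICAL_SECTION set (and, on SMP, the atomic lock
         * passed in r1 held): load the old value for \body to use.
         */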
1:      {
         .ifc \bitwidth,16
         lh     r22, r0
         .else
         lw     r22, r0
         addi   r23, r0, 4
         .endif
        }
        .ifc \bitwidth,64
         lw     r23, r23
        .endif
        \body /* set r24, and r25 if 64-bit */
        {
         seq    r26, r22, r24
         seq    r27, r23, r25
        }
        .ifc \bitwidth,64
         bbnst  r27, 2f
        .endif
        bbs     r26, 3f         /* skip write-back if it's the same value */
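        /*
         * 2: write back the new value (low word, then high word for
         * 64-bit ops); the "mf" below orders these stores before the
         * lock release at label 3.
         */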
2:      {
         .ifc \bitwidth,16
         sh     r0, r24
         .else
         sw     r0, r24
         addi   r23, r0, 4
         .endif
        }
        .ifc \bitwidth,64
         sw     r23, r25
        .endif
        mf
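        /*
         * 3: return the old value (r0 = low word; r1 = high word for
         * 64-bit ops, else zero so the futex code sees no fault in the
         * "struct __get_user" return), release the lock, clear
         * INTERRUPT_CRITICAL_SECTION, and return to the caller.
         */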
3:      {
         move   r0, r22
         .ifc \bitwidth,64
         move   r1, r23
         .else
         move   r1, zero
         .endif
         sw     ATOMIC_LOCK_REG_NAME, zero
        }
        mtspr   INTERRUPT_CRITICAL_SECTION, zero
        jrp     lr
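        /*
         * 4: second cache line, reached via the jump at the top of the
         * routine: set INTERRUPT_CRITICAL_SECTION and then (on SMP)
         * acquire the atomic lock with "tns", spinning with bounded
         * exponential backoff on contention.
         */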
4:      {
         move   ATOMIC_LOCK_REG_NAME, r1
         mtspr  INTERRUPT_CRITICAL_SECTION, r24
        }
#ifndef CONFIG_SMP
        j       1b              /* no atomic locks */
#else
        {
         tns    r21, ATOMIC_LOCK_REG_NAME
         moveli r23, 2048       /* maximum backoff time in cycles */
        }
        {
         bzt    r21, 1b         /* branch if lock acquired */
         moveli r25, 32         /* starting backoff time in cycles */
        }
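        /*
         * 5/6: backoff loop: drop INTERRUPT_CRITICAL_SECTION while
         * waiting, busy-wait on CYCLE_LOW for the current backoff
         * interval, then re-set ICS and retry the "tns", doubling the
         * backoff (capped at 2048 cycles) each time around.
         */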
5:      mtspr   INTERRUPT_CRITICAL_SECTION, zero
        mfspr   r26, CYCLE_LOW  /* get start point for this backoff */
6:      mfspr   r22, CYCLE_LOW  /* test to see if we've backed off enough */
        sub     r22, r22, r26
        slt     r22, r22, r25
        bbst    r22, 6b
        {
         mtspr  INTERRUPT_CRITICAL_SECTION, r24
         shli   r25, r25, 1     /* double the backoff; retry the tns */
        }
        {
         tns    r21, ATOMIC_LOCK_REG_NAME
         slt    r26, r23, r25   /* is the proposed backoff too big? */
        }
        {
         bzt    r21, 1b         /* branch if lock acquired */
         mvnz   r25, r26, r23
        }
        j       5b
#endif
        STD_ENDPROC(__atomic\name)
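        /*
         * For 32-bit ops, register the faulting load (1b) and store (2b)
         * in the exception table with a fixup that restarts the routine
         * (reacquiring the lock), and map the restart address itself to
         * __atomic_bad_address for genuinely bad addresses, as described
         * in the header comment above.
         */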
        .ifc \bitwidth,32
        .pushsection __ex_table,"a"
        .word 1b, __atomic\name
        .word 2b, __atomic\name
        .word __atomic\name, __atomic_bad_address
        .popsection
        .endif
        .endm

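/*
 * Each \body below computes the new value in r24 (and the high word in
 * r25 for 64-bit ops) from the old value in r22/r23 and the operands in
 * r2-r5.  The cmpxchg and add_unless bodies branch to 3f to skip the
 * write-back when their comparison fails, so the old value is returned
 * unchanged.
 */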
atomic_op _cmpxchg, 32, "seq r26, r22, r2; { bbns r26, 3f; move r24, r3 }"
atomic_op _xchg, 32, "move r24, r2"
atomic_op _xchg_add, 32, "add r24, r22, r2"
atomic_op _xchg_add_unless, 32, \
        "sne r26, r22, r2; { bbns r26, 3f; add r24, r22, r3 }"
atomic_op _or, 32, "or r24, r22, r2"
atomic_op _andn, 32, "nor r2, r2, zero; and r24, r22, r2"
atomic_op _xor, 32, "xor r24, r22, r2"

atomic_op 64_cmpxchg, 64, "{ seq r26, r22, r2; seq r27, r23, r3 }; \
        { bbns r26, 3f; move r24, r4 }; { bbns r27, 3f; move r25, r5 }"
atomic_op 64_xchg, 64, "{ move r24, r2; move r25, r3 }"
atomic_op 64_xchg_add, 64, "{ add r24, r22, r2; add r25, r23, r3 }; \
        slt_u r26, r24, r22; add r25, r25, r26"
atomic_op 64_xchg_add_unless, 64, \
        "{ sne r26, r22, r2; sne r27, r23, r3 }; \
        { bbns r26, 3f; add r24, r22, r4 }; \
        { bbns r27, 3f; add r25, r23, r5 }; \
        slt_u r26, r24, r22; add r25, r25, r26"

        jrp     lr              /* happy backtracer */

ENTRY(__end_atomic_asm_code)
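
To make the register convention in the header comment concrete, the sketch below shows roughly how a C-level kernel wrapper might drive the 32-bit __atomic_xchg_add routine defined above. It is illustrative only: the struct __get_user layout, the prototype of __atomic_xchg_add(), and the helper __atomic_hashed_lock() are assumptions chosen for the example, not definitions taken from this patch.

    /*
     * Illustrative sketch, not part of this patch: the names and
     * prototypes below are assumed, and only mirror the r0-r2 calling
     * convention documented in atomic_asm_32.S.
     */
    struct __get_user {
            unsigned long val;      /* old value read under the lock (r0) */
            int err;                /* 0 on success, or -EFAULT (r1) */
    };

    /* asm routine above: r0 = address, r1 = atomic lock, r2 = addend */
    extern struct __get_user __atomic_xchg_add(volatile int *p, int *lock,
                                               int n);

    /* hypothetical helper mapping an address to its guarding atomic lock */
    extern int *__atomic_hashed_lock(volatile void *p);

    static inline int example_atomic_add_return(int i, volatile int *p)
    {
            struct __get_user g;

            g = __atomic_xchg_add(p, __atomic_hashed_lock(p), i);
            return g.val + i;       /* routine returns the pre-add value */
    }

Since the routine hands back the value it loaded before the update, a "return new value" wrapper re-applies the operation to the returned old value, while xchg- and cmpxchg-style wrappers can return g.val directly.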