author    Chris Metcalf <cmetcalf@tilera.com>    2011-05-02 15:13:13 -0400
committer Chris Metcalf <cmetcalf@tilera.com>    2011-05-04 14:40:07 -0400
commit    df29ccb6c06dcb65867d4fd3c2fa473017f60ecc (patch)
tree      dd05474f47c908f4167cbcd29f5cd41d66e6eb2f /arch/tile/kernel
parent    398fa5a9319797e43f67b215337afe62e39475ef (diff)
arch/tile: allow nonatomic stores to interoperate with fast atomic syscalls
This semantic was already true for atomic operations within the kernel, and
this change makes it true for the fast atomic syscalls (__NR_cmpxchg and
__NR_atomic_update) as well.

Previously, user space had to use the fast atomic syscalls exclusively to
update memory, since raw stores could lose a race with the atomic update
code even when the atomic update hadn't actually modified the value.  With
this change, we no longer write the value back to memory if it hasn't
changed.  This allows certain types of idioms in user space to work as
expected, e.g. "atomic exchange" to acquire a spinlock, followed by a raw
store of zero to release the lock.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
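For illustration, here is a minimal user-space sketch of the idiom the commit message describes: an atomic exchange to take a spinlock, then a plain store to drop it. It assumes the compiler's __sync builtins are backed by the fast atomic syscalls on this platform (an assumption made for the sketch, not something stated in the patch); spin_acquire/spin_release are hypothetical names.

    /*
     * Hypothetical user-space spinlock: atomic exchange for acquire,
     * raw (nonatomic) store of zero for release -- the pattern this
     * change makes safe alongside the fast atomic syscalls.
     */
    #include <sched.h>

    static volatile int lock;               /* 0 = free, 1 = held */

    static void spin_acquire(void)
    {
            /* Atomic exchange: spin until the previous value was 0. */
            while (__sync_lock_test_and_set(&lock, 1) != 0)
                    sched_yield();
    }

    static void spin_release(void)
    {
            /* Raw store of zero releases the lock. */
            lock = 0;
    }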
Diffstat (limited to 'arch/tile/kernel')
-rw-r--r--    arch/tile/kernel/intvec_32.S    61
1 file changed, 26 insertions(+), 35 deletions(-)
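The heart of the change shows up in the diff below as the added "seq"/"bbs" pair in front of the "sw r0, r24" store: while holding the per-address hash lock, the kernel now skips the write-back when the value it would store equals the value it just loaded. A C paraphrase for orientation only (the real code is TILEPro assembly; fast_atomic_op_under_lock is a hypothetical name):

    /* Illustrative paraphrase of the store-bypass, not the kernel's code. */
    static int fast_atomic_op_under_lock(int *addr, int newval)
    {
            int oldval = *addr;      /* loaded while holding the hash lock */

            if (newval != oldval)    /* bypass the store if nothing changed */
                    *addr = newval;
            return oldval;           /* the old value is returned to the caller */
    }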
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index f35c3124fa62..72ade79b621b 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -1470,7 +1470,10 @@ STD_ENTRY(_sys_clone)
  * We place it in the __HEAD section to ensure it is relatively
  * near to the intvec_SWINT_1 code (reachable by a conditional branch).
  *
- * Must match register usage in do_page_fault().
+ * Our use of ATOMIC_LOCK_REG here must match do_page_fault_ics().
+ *
+ * As we do in lib/atomic_asm_32.S, we bypass a store if the value we
+ * would store is the same as the value we just loaded.
  */
         __HEAD
         .align 64
@@ -1531,17 +1534,7 @@ ENTRY(sys_cmpxchg)
         {
                 shri    r20, r25, 32 - ATOMIC_HASH_L1_SHIFT
                 slt_u   r23, r0, r23
-
-                /*
-                 * Ensure that the TLB is loaded before we take out the lock.
-                 * On TILEPro, this will start fetching the value all the way
-                 * into our L1 as well (and if it gets modified before we
-                 * grab the lock, it will be invalidated from our cache
-                 * before we reload it).  On tile64, we'll start fetching it
-                 * into our L1 if we're the home, and if we're not, we'll
-                 * still at least start fetching it into the home's L2.
-                 */
-                lw      r26, r0
+                lw      r26, r0  /* see comment in the "#else" for the "lw r26". */
         }
         {
                 s2a     r21, r20, r21
@@ -1557,18 +1550,9 @@ ENTRY(sys_cmpxchg)
                 bbs     r23, .Lcmpxchg64
                 andi    r23, r0, 7       /* Precompute alignment for cmpxchg64. */
         }
-
         {
-                /*
-                 * We very carefully align the code that actually runs with
-                 * the lock held (nine bundles) so that we know it is all in
-                 * the icache when we start.  This instruction (the jump) is
-                 * at the start of the first cache line, address zero mod 64;
-                 * we jump to somewhere in the second cache line to issue the
-                 * tns, then jump back to finish up.
-                 */
                 s2a     ATOMIC_LOCK_REG_NAME, r25, r21
-                j       .Lcmpxchg32_tns
+                j       .Lcmpxchg32_tns  /* see comment in the #else for the jump. */
         }
 
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
@@ -1633,24 +1617,25 @@ ENTRY(sys_cmpxchg)
         {
                 /*
                  * We very carefully align the code that actually runs with
-                 * the lock held (nine bundles) so that we know it is all in
+                 * the lock held (twelve bundles) so that we know it is all in
                  * the icache when we start.  This instruction (the jump) is
                  * at the start of the first cache line, address zero mod 64;
-                 * we jump to somewhere in the second cache line to issue the
-                 * tns, then jump back to finish up.
+                 * we jump to the very end of the second cache line to get that
+                 * line loaded in the icache, then fall through to issue the tns
+                 * in the third cache line, at which point it's all cached.
+                 * Note that is for performance, not correctness.
                  */
                 j       .Lcmpxchg32_tns
         }
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
-ENTRY(__sys_cmpxchg_grab_lock)
+/* Symbol for do_page_fault_ics() to use to compare against the PC. */
+.global __sys_cmpxchg_grab_lock
+__sys_cmpxchg_grab_lock:
 
         /*
          * Perform the actual cmpxchg or atomic_update.
-         * Note that the system <arch/atomic.h> header relies on
-         * atomic_update() to always perform an "mf", so don't make
-         * it optional or conditional without modifying that code.
          */
 .Ldo_cmpxchg32:
         {
@@ -1668,10 +1653,13 @@ ENTRY(sys_cmpxchg)
         }
         {
                 mvnz    r24, r23, r25    /* Use atomic_update value if appropriate. */
-                bbns    r22, .Lcmpxchg32_mismatch
+                bbns    r22, .Lcmpxchg32_nostore
         }
+        seq     r22, r24, r21    /* Are we storing the value we loaded? */
+        bbs     r22, .Lcmpxchg32_nostore
         sw      r0, r24
 
+        /* The following instruction is the start of the second cache line. */
         /* Do slow mtspr here so the following "mf" waits less. */
         {
                 move    sp, r27
@@ -1679,7 +1667,6 @@ ENTRY(sys_cmpxchg)
         }
         mf
 
-        /* The following instruction is the start of the second cache line. */
         {
                 move    r0, r21
                 sw      ATOMIC_LOCK_REG_NAME, zero
@@ -1687,7 +1674,7 @@ ENTRY(sys_cmpxchg)
         iret
 
         /* Duplicated code here in the case where we don't overlap "mf" */
-.Lcmpxchg32_mismatch:
+.Lcmpxchg32_nostore:
         {
                 move    r0, r21
                 sw      ATOMIC_LOCK_REG_NAME, zero
@@ -1703,8 +1690,6 @@ ENTRY(sys_cmpxchg)
  * and for 64-bit cmpxchg.  We provide it as a macro and put
  * it into both versions.  We can't share the code literally
  * since it depends on having the right branch-back address.
- * Note that the first few instructions should share the cache
- * line with the second half of the actual locked code.
  */
         .macro  cmpxchg_lock, bitwidth
 
@@ -1730,7 +1715,7 @@ ENTRY(sys_cmpxchg)
         }
         /*
          * The preceding instruction is the last thing that must be
-         * on the second cache line.
+         * hot in the icache before we do the "tns" above.
          */
 
 #ifdef CONFIG_SMP
@@ -1761,6 +1746,12 @@ ENTRY(sys_cmpxchg)
         .endm
 
 .Lcmpxchg32_tns:
+        /*
+         * This is the last instruction on the second cache line.
+         * The nop here loads the second line, then we fall through
+         * to the tns to load the third line before we take the lock.
+         */
+        nop
         cmpxchg_lock 32
 
         /*