Diffstat (limited to 'arch/tile')
-rw-r--r--  arch/tile/kernel/intvec_32.S    61
-rw-r--r--  arch/tile/lib/atomic_asm_32.S    2
2 files changed, 27 insertions, 36 deletions
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index f35c3124fa62..72ade79b621b 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -1470,7 +1470,10 @@ STD_ENTRY(_sys_clone)
  * We place it in the __HEAD section to ensure it is relatively
  * near to the intvec_SWINT_1 code (reachable by a conditional branch).
  *
- * Must match register usage in do_page_fault().
+ * Our use of ATOMIC_LOCK_REG here must match do_page_fault_ics().
+ *
+ * As we do in lib/atomic_asm_32.S, we bypass a store if the value we
+ * would store is the same as the value we just loaded.
  */
 	__HEAD
 	.align 64
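
A minimal C sketch of the store-bypass rule the new comment describes: skip
the store when the value we would write back is the value we just loaded.
This is illustrative only and not part of the patch; the function name and
types here are made up.

    /*
     * Illustrative only -- not kernel code.  Sketch of the 32-bit fastpath's
     * store-bypass behavior, assuming the hashed atomic lock is already held.
     */
    static inline unsigned int cmpxchg32_sketch(unsigned int *addr,
                                                unsigned int expect,
                                                unsigned int newval)
    {
            unsigned int loaded = *addr;    /* the "lw" */

            if (loaded != expect)
                    return loaded;          /* mismatch: nothing to store */
            if (newval == loaded)
                    return loaded;          /* would re-store what we loaded: bypass */
            *addr = newval;                 /* the "sw" case */
            return loaded;
    }
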
@@ -1531,17 +1534,7 @@ ENTRY(sys_cmpxchg)
 	{
 		shri	r20, r25, 32 - ATOMIC_HASH_L1_SHIFT
 		slt_u	r23, r0, r23
-
-		/*
-		 * Ensure that the TLB is loaded before we take out the lock.
-		 * On TILEPro, this will start fetching the value all the way
-		 * into our L1 as well (and if it gets modified before we
-		 * grab the lock, it will be invalidated from our cache
-		 * before we reload it).  On tile64, we'll start fetching it
-		 * into our L1 if we're the home, and if we're not, we'll
-		 * still at least start fetching it into the home's L2.
-		 */
-		lw	r26, r0
+		lw	r26, r0  /* see comment in the "#else" for the "lw r26". */
 	}
 	{
 		s2a	r21, r20, r21
@@ -1557,18 +1550,9 @@ ENTRY(sys_cmpxchg)
 		bbs	r23, .Lcmpxchg64
 		andi	r23, r0, 7	/* Precompute alignment for cmpxchg64. */
 	}
-
 	{
-		/*
-		 * We very carefully align the code that actually runs with
-		 * the lock held (nine bundles) so that we know it is all in
-		 * the icache when we start.  This instruction (the jump) is
-		 * at the start of the first cache line, address zero mod 64;
-		 * we jump to somewhere in the second cache line to issue the
-		 * tns, then jump back to finish up.
-		 */
 		s2a	ATOMIC_LOCK_REG_NAME, r25, r21
-		j	.Lcmpxchg32_tns
+		j	.Lcmpxchg32_tns	/* see comment in the #else for the jump. */
 	}
 
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
@@ -1633,24 +1617,25 @@ ENTRY(sys_cmpxchg)
 	{
 		/*
 		 * We very carefully align the code that actually runs with
-		 * the lock held (nine bundles) so that we know it is all in
+		 * the lock held (twelve bundles) so that we know it is all in
 		 * the icache when we start.  This instruction (the jump) is
 		 * at the start of the first cache line, address zero mod 64;
-		 * we jump to somewhere in the second cache line to issue the
-		 * tns, then jump back to finish up.
+		 * we jump to the very end of the second cache line to get that
+		 * line loaded in the icache, then fall through to issue the tns
+		 * in the third cache line, at which point it's all cached.
+		 * Note that is for performance, not correctness.
 		 */
 		j	.Lcmpxchg32_tns
 	}
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
-ENTRY(__sys_cmpxchg_grab_lock)
+/* Symbol for do_page_fault_ics() to use to compare against the PC. */
+.global __sys_cmpxchg_grab_lock
+__sys_cmpxchg_grab_lock:
 
 	/*
 	 * Perform the actual cmpxchg or atomic_update.
-	 * Note that the system <arch/atomic.h> header relies on
-	 * atomic_update() to always perform an "mf", so don't make
-	 * it optional or conditional without modifying that code.
 	 */
 .Ldo_cmpxchg32:
 	{
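
The new global label above exists so the page-fault path can compare the
faulting PC against it.  As a rough illustration of that kind of check (the
helper name and the exact policy are assumptions here, not taken from the
real do_page_fault_ics(), which differs in detail):

    /* Assumption-laden sketch only: test which side of the lock-grab point
     * a faulting PC falls on, using the label defined above. */
    extern char __sys_cmpxchg_grab_lock[];

    static int pc_is_before_lock_grab(unsigned long pc)
    {
            return pc < (unsigned long)__sys_cmpxchg_grab_lock;
    }
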
@@ -1668,10 +1653,13 @@ ENTRY(sys_cmpxchg)
 	}
 	{
 		mvnz	r24, r23, r25	/* Use atomic_update value if appropriate. */
-		bbns	r22, .Lcmpxchg32_mismatch
+		bbns	r22, .Lcmpxchg32_nostore
 	}
+	seq	r22, r24, r21	/* Are we storing the value we loaded? */
+	bbs	r22, .Lcmpxchg32_nostore
 	sw	r0, r24
 
+	/* The following instruction is the start of the second cache line. */
 	/* Do slow mtspr here so the following "mf" waits less. */
 	{
 		move	sp, r27
@@ -1679,7 +1667,6 @@ ENTRY(sys_cmpxchg)
 	}
 	mf
 
-	/* The following instruction is the start of the second cache line. */
 	{
 		move	r0, r21
 		sw	ATOMIC_LOCK_REG_NAME, zero
@@ -1687,7 +1674,7 @@ ENTRY(sys_cmpxchg)
 	iret
 
 	/* Duplicated code here in the case where we don't overlap "mf" */
-.Lcmpxchg32_mismatch:
+.Lcmpxchg32_nostore:
 	{
 		move	r0, r21
 		sw	ATOMIC_LOCK_REG_NAME, zero
@@ -1703,8 +1690,6 @@ ENTRY(sys_cmpxchg)
 	 * and for 64-bit cmpxchg.  We provide it as a macro and put
 	 * it into both versions.  We can't share the code literally
 	 * since it depends on having the right branch-back address.
-	 * Note that the first few instructions should share the cache
-	 * line with the second half of the actual locked code.
 	 */
 	.macro	cmpxchg_lock, bitwidth
 
@@ -1730,7 +1715,7 @@ ENTRY(sys_cmpxchg)
 	}
 	/*
 	 * The preceding instruction is the last thing that must be
-	 * on the second cache line.
+	 * hot in the icache before we do the "tns" above.
 	 */
 
 #ifdef CONFIG_SMP
@@ -1761,6 +1746,12 @@ ENTRY(sys_cmpxchg)
 	.endm
 
 .Lcmpxchg32_tns:
+	/*
+	 * This is the last instruction on the second cache line.
+	 * The nop here loads the second line, then we fall through
+	 * to the tns to load the third line before we take the lock.
+	 */
+	nop
 	cmpxchg_lock 32
 
 	/*
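
A back-of-the-envelope size check of why the locked sequence now spills
past a single cache line.  The 12-bundle count comes from the comment in
the hunk above; the 8-byte bundle size and 64-byte instruction-cache line
size are assumptions here (the ".align 64" and "zero mod 64" wording in the
patch is consistent with 64-byte lines), so treat this as a sketch only.

    /* Rough arithmetic only, under the assumptions stated above. */
    enum {
            LOCKED_BUNDLES  = 12,
            BUNDLE_BYTES    = 8,
            ICACHE_LINE     = 64,
            LOCKED_BYTES    = LOCKED_BUNDLES * BUNDLE_BYTES,        /* 96 bytes */
            MIN_LINES       = (LOCKED_BYTES + ICACHE_LINE - 1) / ICACHE_LINE, /* 2 */
    };

Since 96 bytes cannot fit in one 64-byte line, the code run under the lock
necessarily straddles lines, which is why the jump, the nop, and the tns
together touch the later cache lines before the lock is actually taken.
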
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 82f64cc63658..24448734f6f1 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -59,7 +59,7 @@
  * bad kernel addresses).
  *
  * Note that if the value we would store is the same as what we
- * loaded, we bypass the load.  Other platforms with true atomics can
+ * loaded, we bypass the store.  Other platforms with true atomics can
  * make the guarantee that a non-atomic __clear_bit(), for example,
  * can safely race with an atomic test_and_set_bit(); this example is
  * from bit_spinlock.h in slub_lock() / slub_unlock().  We can't do
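
The race this comment alludes to can be shown with a small illustrative
sketch (C, not taken from the kernel; the hashed-lock handling that the real
fastpath does with "tns" is assumed and omitted).  If test_and_set_bit()
found the bit already set and still wrote the word back, that redundant
store could overwrite a concurrent non-atomic __clear_bit() of a different
bit in the same word, which is why the unchanged value is never stored.

    /* Illustrative sketch only -- assumes the caller already holds the
     * per-hash atomic lock protecting *word. */
    static inline int test_and_set_bit_sketch(unsigned long nr,
                                              unsigned long *word)
    {
            unsigned long mask = 1UL << nr;
            unsigned long old = *word;

            if (!(old & mask))
                    *word = old | mask;     /* store only when the word changes */
            return (old & mask) != 0;       /* report the previous bit value */
    }
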