author    Chris Metcalf <cmetcalf@tilera.com>  2011-05-02 15:13:13 -0400
committer Chris Metcalf <cmetcalf@tilera.com>  2011-05-04 14:40:07 -0400
commit    df29ccb6c06dcb65867d4fd3c2fa473017f60ecc
tree      dd05474f47c908f4167cbcd29f5cd41d66e6eb2f
parent    398fa5a9319797e43f67b215337afe62e39475ef
arch/tile: allow nonatomic stores to interoperate with fast atomic syscalls
This semantic was already true for atomic operations within the kernel, and this change makes it true for the fast atomic syscalls (__NR_cmpxchg and __NR_atomic_update) as well.

Previously, user-space had to use the fast atomic syscalls exclusively to update memory, since raw stores could lose a race with the atomic update code even when the atomic update hadn't actually modified the value. With this change, we no longer write back the value to memory if it hasn't changed. This allows certain types of idioms in user space to work as expected, e.g. "atomic exchange" to acquire a spinlock, followed by a raw store of zero to release the lock.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
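For illustration only, the user-space idiom the commit message refers to might look roughly like the C sketch below. This is not code from the commit: it assumes GCC's __sync builtins on TILEPro end up going through the fast atomic path, and the lock_word variable and helper names are hypothetical.

#include <sched.h>

/* Hypothetical user-space lock word: 0 = unlocked, 1 = locked. */
static volatile int lock_word;

static void lock(void)
{
        /* "Atomic exchange" to acquire the spinlock. */
        while (__sync_lock_test_and_set(&lock_word, 1) != 0)
                sched_yield();  /* back off until the holder releases it */
}

static void unlock(void)
{
        /* Raw (nonatomic) store of zero to release the lock.  Before this
         * change, a racing fast-atomic operation that left the value
         * unchanged would still write the old value back and could clobber
         * this store; now the kernel skips that write-back. */
        lock_word = 0;
}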
Diffstat (limited to 'arch/tile')
-rw-r--r--  arch/tile/kernel/intvec_32.S   | 61
-rw-r--r--  arch/tile/lib/atomic_asm_32.S  |  2
2 files changed, 27 insertions, 36 deletions
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index f35c3124fa62..72ade79b621b 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -1470,7 +1470,10 @@ STD_ENTRY(_sys_clone)
  * We place it in the __HEAD section to ensure it is relatively
  * near to the intvec_SWINT_1 code (reachable by a conditional branch).
  *
- * Must match register usage in do_page_fault().
+ * Our use of ATOMIC_LOCK_REG here must match do_page_fault_ics().
+ *
+ * As we do in lib/atomic_asm_32.S, we bypass a store if the value we
+ * would store is the same as the value we just loaded.
  */
         __HEAD
         .align 64
@@ -1531,17 +1534,7 @@ ENTRY(sys_cmpxchg)
         {
                 shri    r20, r25, 32 - ATOMIC_HASH_L1_SHIFT
                 slt_u   r23, r0, r23
-
-                /*
-                 * Ensure that the TLB is loaded before we take out the lock.
-                 * On TILEPro, this will start fetching the value all the way
-                 * into our L1 as well (and if it gets modified before we
-                 * grab the lock, it will be invalidated from our cache
-                 * before we reload it). On tile64, we'll start fetching it
-                 * into our L1 if we're the home, and if we're not, we'll
-                 * still at least start fetching it into the home's L2.
-                 */
-                lw      r26, r0
+                lw      r26, r0  /* see comment in the "#else" for the "lw r26". */
         }
         {
                 s2a     r21, r20, r21
@@ -1557,18 +1550,9 @@ ENTRY(sys_cmpxchg)
                 bbs     r23, .Lcmpxchg64
                 andi    r23, r0, 7       /* Precompute alignment for cmpxchg64. */
         }
-
         {
-                /*
-                 * We very carefully align the code that actually runs with
-                 * the lock held (nine bundles) so that we know it is all in
-                 * the icache when we start.  This instruction (the jump) is
-                 * at the start of the first cache line, address zero mod 64;
-                 * we jump to somewhere in the second cache line to issue the
-                 * tns, then jump back to finish up.
-                 */
                 s2a     ATOMIC_LOCK_REG_NAME, r25, r21
-                j       .Lcmpxchg32_tns
+                j       .Lcmpxchg32_tns  /* see comment in the #else for the jump. */
         }
 
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
@@ -1633,24 +1617,25 @@ ENTRY(sys_cmpxchg)
         {
                 /*
                  * We very carefully align the code that actually runs with
-                 * the lock held (nine bundles) so that we know it is all in
+                 * the lock held (twelve bundles) so that we know it is all in
                  * the icache when we start.  This instruction (the jump) is
                  * at the start of the first cache line, address zero mod 64;
-                 * we jump to somewhere in the second cache line to issue the
-                 * tns, then jump back to finish up.
+                 * we jump to the very end of the second cache line to get that
+                 * line loaded in the icache, then fall through to issue the tns
+                 * in the third cache line, at which point it's all cached.
+                 * Note that is for performance, not correctness.
                  */
                 j       .Lcmpxchg32_tns
         }
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
-ENTRY(__sys_cmpxchg_grab_lock)
+/* Symbol for do_page_fault_ics() to use to compare against the PC. */
+.global __sys_cmpxchg_grab_lock
+__sys_cmpxchg_grab_lock:
 
         /*
          * Perform the actual cmpxchg or atomic_update.
-         * Note that the system <arch/atomic.h> header relies on
-         * atomic_update() to always perform an "mf", so don't make
-         * it optional or conditional without modifying that code.
          */
 .Ldo_cmpxchg32:
         {
@@ -1668,10 +1653,13 @@ ENTRY(sys_cmpxchg)
         }
         {
                 mvnz    r24, r23, r25    /* Use atomic_update value if appropriate. */
-                bbns    r22, .Lcmpxchg32_mismatch
+                bbns    r22, .Lcmpxchg32_nostore
         }
+        seq     r22, r24, r21    /* Are we storing the value we loaded? */
+        bbs     r22, .Lcmpxchg32_nostore
         sw      r0, r24
 
+        /* The following instruction is the start of the second cache line. */
         /* Do slow mtspr here so the following "mf" waits less. */
         {
                 move    sp, r27
@@ -1679,7 +1667,6 @@ ENTRY(sys_cmpxchg)
         }
         mf
 
-        /* The following instruction is the start of the second cache line. */
         {
                 move    r0, r21
                 sw      ATOMIC_LOCK_REG_NAME, zero
@@ -1687,7 +1674,7 @@ ENTRY(sys_cmpxchg)
         iret
 
         /* Duplicated code here in the case where we don't overlap "mf" */
-.Lcmpxchg32_mismatch:
+.Lcmpxchg32_nostore:
         {
                 move    r0, r21
                 sw      ATOMIC_LOCK_REG_NAME, zero
@@ -1703,8 +1690,6 @@ ENTRY(sys_cmpxchg)
          * and for 64-bit cmpxchg.  We provide it as a macro and put
          * it into both versions.  We can't share the code literally
          * since it depends on having the right branch-back address.
-         * Note that the first few instructions should share the cache
-         * line with the second half of the actual locked code.
          */
         .macro cmpxchg_lock, bitwidth
 
@@ -1730,7 +1715,7 @@ ENTRY(sys_cmpxchg)
         }
         /*
          * The preceding instruction is the last thing that must be
-         * on the second cache line.
+         * hot in the icache before we do the "tns" above.
          */
 
 #ifdef CONFIG_SMP
@@ -1761,6 +1746,12 @@ ENTRY(sys_cmpxchg)
         .endm
 
 .Lcmpxchg32_tns:
+        /*
+         * This is the last instruction on the second cache line.
+         * The nop here loads the second line, then we fall through
+         * to the tns to load the third line before we take the lock.
+         */
+        nop
         cmpxchg_lock 32
 
         /*
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 82f64cc63658..24448734f6f1 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -59,7 +59,7 @@
  * bad kernel addresses).
  *
  * Note that if the value we would store is the same as what we
- * loaded, we bypass the load.  Other platforms with true atomics can
+ * loaded, we bypass the store.  Other platforms with true atomics can
  * make the guarantee that a non-atomic __clear_bit(), for example,
  * can safely race with an atomic test_and_set_bit(); this example is
  * from bit_spinlock.h in slub_lock() / slub_unlock().  We can't do