diff options
author | Chris Metcalf <cmetcalf@tilera.com> | 2011-05-02 15:13:13 -0400 |
---|---|---|
committer | Chris Metcalf <cmetcalf@tilera.com> | 2011-05-04 14:40:07 -0400 |
commit | df29ccb6c06dcb65867d4fd3c2fa473017f60ecc (patch) | |
tree | dd05474f47c908f4167cbcd29f5cd41d66e6eb2f | |
parent | 398fa5a9319797e43f67b215337afe62e39475ef (diff) |
arch/tile: allow nonatomic stores to interoperate with fast atomic syscalls
This semantic was already true for atomic operations within the kernel,
and this change makes it true for the fast atomic syscalls (__NR_cmpxchg
and __NR_atomic_update) as well. Previously, user-space had to use
the fast atomic syscalls exclusively to update memory, since raw stores
could lose a race with the atomic update code even when the atomic update
hadn't actually modified the value.
With this change, we no longer write back the value to memory if it
hasn't changed. This allows certain types of idioms in user space to
work as expected, e.g. "atomic exchange" to acquire a spinlock, followed
by a raw store of zero to release the lock.
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
-rw-r--r-- | arch/tile/kernel/intvec_32.S | 61 | ||||
-rw-r--r-- | arch/tile/lib/atomic_asm_32.S | 2 |
2 files changed, 27 insertions, 36 deletions
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S index f35c3124fa62..72ade79b621b 100644 --- a/arch/tile/kernel/intvec_32.S +++ b/arch/tile/kernel/intvec_32.S | |||
@@ -1470,7 +1470,10 @@ STD_ENTRY(_sys_clone) | |||
1470 | * We place it in the __HEAD section to ensure it is relatively | 1470 | * We place it in the __HEAD section to ensure it is relatively |
1471 | * near to the intvec_SWINT_1 code (reachable by a conditional branch). | 1471 | * near to the intvec_SWINT_1 code (reachable by a conditional branch). |
1472 | * | 1472 | * |
1473 | * Must match register usage in do_page_fault(). | 1473 | * Our use of ATOMIC_LOCK_REG here must match do_page_fault_ics(). |
1474 | * | ||
1475 | * As we do in lib/atomic_asm_32.S, we bypass a store if the value we | ||
1476 | * would store is the same as the value we just loaded. | ||
1474 | */ | 1477 | */ |
1475 | __HEAD | 1478 | __HEAD |
1476 | .align 64 | 1479 | .align 64 |
@@ -1531,17 +1534,7 @@ ENTRY(sys_cmpxchg) | |||
1531 | { | 1534 | { |
1532 | shri r20, r25, 32 - ATOMIC_HASH_L1_SHIFT | 1535 | shri r20, r25, 32 - ATOMIC_HASH_L1_SHIFT |
1533 | slt_u r23, r0, r23 | 1536 | slt_u r23, r0, r23 |
1534 | 1537 | lw r26, r0 /* see comment in the "#else" for the "lw r26". */ | |
1535 | /* | ||
1536 | * Ensure that the TLB is loaded before we take out the lock. | ||
1537 | * On TILEPro, this will start fetching the value all the way | ||
1538 | * into our L1 as well (and if it gets modified before we | ||
1539 | * grab the lock, it will be invalidated from our cache | ||
1540 | * before we reload it). On tile64, we'll start fetching it | ||
1541 | * into our L1 if we're the home, and if we're not, we'll | ||
1542 | * still at least start fetching it into the home's L2. | ||
1543 | */ | ||
1544 | lw r26, r0 | ||
1545 | } | 1538 | } |
1546 | { | 1539 | { |
1547 | s2a r21, r20, r21 | 1540 | s2a r21, r20, r21 |
@@ -1557,18 +1550,9 @@ ENTRY(sys_cmpxchg) | |||
1557 | bbs r23, .Lcmpxchg64 | 1550 | bbs r23, .Lcmpxchg64 |
1558 | andi r23, r0, 7 /* Precompute alignment for cmpxchg64. */ | 1551 | andi r23, r0, 7 /* Precompute alignment for cmpxchg64. */ |
1559 | } | 1552 | } |
1560 | |||
1561 | { | 1553 | { |
1562 | /* | ||
1563 | * We very carefully align the code that actually runs with | ||
1564 | * the lock held (nine bundles) so that we know it is all in | ||
1565 | * the icache when we start. This instruction (the jump) is | ||
1566 | * at the start of the first cache line, address zero mod 64; | ||
1567 | * we jump to somewhere in the second cache line to issue the | ||
1568 | * tns, then jump back to finish up. | ||
1569 | */ | ||
1570 | s2a ATOMIC_LOCK_REG_NAME, r25, r21 | 1554 | s2a ATOMIC_LOCK_REG_NAME, r25, r21 |
1571 | j .Lcmpxchg32_tns | 1555 | j .Lcmpxchg32_tns /* see comment in the #else for the jump. */ |
1572 | } | 1556 | } |
1573 | 1557 | ||
1574 | #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ | 1558 | #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ |
@@ -1633,24 +1617,25 @@ ENTRY(sys_cmpxchg) | |||
1633 | { | 1617 | { |
1634 | /* | 1618 | /* |
1635 | * We very carefully align the code that actually runs with | 1619 | * We very carefully align the code that actually runs with |
1636 | * the lock held (nine bundles) so that we know it is all in | 1620 | * the lock held (twelve bundles) so that we know it is all in |
1637 | * the icache when we start. This instruction (the jump) is | 1621 | * the icache when we start. This instruction (the jump) is |
1638 | * at the start of the first cache line, address zero mod 64; | 1622 | * at the start of the first cache line, address zero mod 64; |
1639 | * we jump to somewhere in the second cache line to issue the | 1623 | * we jump to the very end of the second cache line to get that |
1640 | * tns, then jump back to finish up. | 1624 | * line loaded in the icache, then fall through to issue the tns |
1625 | * in the third cache line, at which point it's all cached. | ||
1626 | * Note that this is for performance, not correctness. | ||
1641 | */ | 1627 | */ |
1642 | j .Lcmpxchg32_tns | 1628 | j .Lcmpxchg32_tns |
1643 | } | 1629 | } |
1644 | 1630 | ||
1645 | #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ | 1631 | #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ |
1646 | 1632 | ||
1647 | ENTRY(__sys_cmpxchg_grab_lock) | 1633 | /* Symbol for do_page_fault_ics() to use to compare against the PC. */ |
1634 | .global __sys_cmpxchg_grab_lock | ||
1635 | __sys_cmpxchg_grab_lock: | ||
1648 | 1636 | ||
1649 | /* | 1637 | /* |
1650 | * Perform the actual cmpxchg or atomic_update. | 1638 | * Perform the actual cmpxchg or atomic_update. |
1651 | * Note that the system <arch/atomic.h> header relies on | ||
1652 | * atomic_update() to always perform an "mf", so don't make | ||
1653 | * it optional or conditional without modifying that code. | ||
1654 | */ | 1639 | */ |
1655 | .Ldo_cmpxchg32: | 1640 | .Ldo_cmpxchg32: |
1656 | { | 1641 | { |
@@ -1668,10 +1653,13 @@ ENTRY(sys_cmpxchg) | |||
1668 | } | 1653 | } |
1669 | { | 1654 | { |
1670 | mvnz r24, r23, r25 /* Use atomic_update value if appropriate. */ | 1655 | mvnz r24, r23, r25 /* Use atomic_update value if appropriate. */ |
1671 | bbns r22, .Lcmpxchg32_mismatch | 1656 | bbns r22, .Lcmpxchg32_nostore |
1672 | } | 1657 | } |
1658 | seq r22, r24, r21 /* Are we storing the value we loaded? */ | ||
1659 | bbs r22, .Lcmpxchg32_nostore | ||
1673 | sw r0, r24 | 1660 | sw r0, r24 |
1674 | 1661 | ||
1662 | /* The following instruction is the start of the second cache line. */ | ||
1675 | /* Do slow mtspr here so the following "mf" waits less. */ | 1663 | /* Do slow mtspr here so the following "mf" waits less. */ |
1676 | { | 1664 | { |
1677 | move sp, r27 | 1665 | move sp, r27 |
@@ -1679,7 +1667,6 @@ ENTRY(sys_cmpxchg) | |||
1679 | } | 1667 | } |
1680 | mf | 1668 | mf |
1681 | 1669 | ||
1682 | /* The following instruction is the start of the second cache line. */ | ||
1683 | { | 1670 | { |
1684 | move r0, r21 | 1671 | move r0, r21 |
1685 | sw ATOMIC_LOCK_REG_NAME, zero | 1672 | sw ATOMIC_LOCK_REG_NAME, zero |
@@ -1687,7 +1674,7 @@ ENTRY(sys_cmpxchg) | |||
1687 | iret | 1674 | iret |
1688 | 1675 | ||
1689 | /* Duplicated code here in the case where we don't overlap "mf" */ | 1676 | /* Duplicated code here in the case where we don't overlap "mf" */ |
1690 | .Lcmpxchg32_mismatch: | 1677 | .Lcmpxchg32_nostore: |
1691 | { | 1678 | { |
1692 | move r0, r21 | 1679 | move r0, r21 |
1693 | sw ATOMIC_LOCK_REG_NAME, zero | 1680 | sw ATOMIC_LOCK_REG_NAME, zero |
@@ -1703,8 +1690,6 @@ ENTRY(sys_cmpxchg) | |||
1703 | * and for 64-bit cmpxchg. We provide it as a macro and put | 1690 | * and for 64-bit cmpxchg. We provide it as a macro and put |
1704 | * it into both versions. We can't share the code literally | 1691 | * it into both versions. We can't share the code literally |
1705 | * since it depends on having the right branch-back address. | 1692 | * since it depends on having the right branch-back address. |
1706 | * Note that the first few instructions should share the cache | ||
1707 | * line with the second half of the actual locked code. | ||
1708 | */ | 1693 | */ |
1709 | .macro cmpxchg_lock, bitwidth | 1694 | .macro cmpxchg_lock, bitwidth |
1710 | 1695 | ||
@@ -1730,7 +1715,7 @@ ENTRY(sys_cmpxchg) | |||
1730 | } | 1715 | } |
1731 | /* | 1716 | /* |
1732 | * The preceding instruction is the last thing that must be | 1717 | * The preceding instruction is the last thing that must be |
1733 | * on the second cache line. | 1718 | * hot in the icache before we do the "tns" above. |
1734 | */ | 1719 | */ |
1735 | 1720 | ||
1736 | #ifdef CONFIG_SMP | 1721 | #ifdef CONFIG_SMP |
@@ -1761,6 +1746,12 @@ ENTRY(sys_cmpxchg) | |||
1761 | .endm | 1746 | .endm |
1762 | 1747 | ||
1763 | .Lcmpxchg32_tns: | 1748 | .Lcmpxchg32_tns: |
1749 | /* | ||
1750 | * This is the last instruction on the second cache line. | ||
1751 | * The nop here loads the second line, then we fall through | ||
1752 | * to the tns to load the third line before we take the lock. | ||
1753 | */ | ||
1754 | nop | ||
1764 | cmpxchg_lock 32 | 1755 | cmpxchg_lock 32 |
1765 | 1756 | ||
1766 | /* | 1757 | /* |
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S index 82f64cc63658..24448734f6f1 100644 --- a/arch/tile/lib/atomic_asm_32.S +++ b/arch/tile/lib/atomic_asm_32.S | |||
@@ -59,7 +59,7 @@ | |||
59 | * bad kernel addresses). | 59 | * bad kernel addresses). |
60 | * | 60 | * |
61 | * Note that if the value we would store is the same as what we | 61 | * Note that if the value we would store is the same as what we |
62 | * loaded, we bypass the load. Other platforms with true atomics can | 62 | * loaded, we bypass the store. Other platforms with true atomics can |
63 | * make the guarantee that a non-atomic __clear_bit(), for example, | 63 | * make the guarantee that a non-atomic __clear_bit(), for example, |
64 | * can safely race with an atomic test_and_set_bit(); this example is | 64 | * can safely race with an atomic test_and_set_bit(); this example is |
65 | * from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do | 65 | * from bit_spinlock.h in slub_lock() / slub_unlock(). We can't do |