Diffstat (limited to 'arch/tile')
-rw-r--r--  arch/tile/kernel/intvec_32.S    61
-rw-r--r--  arch/tile/lib/atomic_asm_32.S    2
2 files changed, 27 insertions, 36 deletions
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index f35c3124fa62..72ade79b621b 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -1470,7 +1470,10 @@ STD_ENTRY(_sys_clone)
  * We place it in the __HEAD section to ensure it is relatively
  * near to the intvec_SWINT_1 code (reachable by a conditional branch).
  *
- * Must match register usage in do_page_fault().
+ * Our use of ATOMIC_LOCK_REG here must match do_page_fault_ics().
+ *
+ * As we do in lib/atomic_asm_32.S, we bypass a store if the value we
+ * would store is the same as the value we just loaded.
  */
 	__HEAD
 	.align 64
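
A minimal C sketch of the store-bypass rule the new comment describes: skip
the store when the value we would write back is the value we just loaded.
This is illustrative only and not part of the patch; the function name and
types here are made up.

    /*
     * Illustrative only -- not kernel code.  Sketch of the 32-bit fastpath's
     * store-bypass behavior, assuming the hashed atomic lock is already held.
     */
    static inline unsigned int cmpxchg32_sketch(unsigned int *addr,
                                                unsigned int expect,
                                                unsigned int newval)
    {
            unsigned int loaded = *addr;    /* the "lw" */

            if (loaded != expect)
                    return loaded;          /* mismatch: nothing to store */
            if (newval == loaded)
                    return loaded;          /* would re-store what we loaded: bypass */
            *addr = newval;                 /* the "sw" case */
            return loaded;
    }
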
@@ -1531,17 +1534,7 @@ ENTRY(sys_cmpxchg)
 	{
 		shri	r20, r25, 32 - ATOMIC_HASH_L1_SHIFT
 		slt_u	r23, r0, r23
-
-		/*
-		 * Ensure that the TLB is loaded before we take out the lock.
-		 * On TILEPro, this will start fetching the value all the way
-		 * into our L1 as well (and if it gets modified before we
-		 * grab the lock, it will be invalidated from our cache
-		 * before we reload it).  On tile64, we'll start fetching it
-		 * into our L1 if we're the home, and if we're not, we'll
-		 * still at least start fetching it into the home's L2.
-		 */
-		lw	r26, r0
+		lw	r26, r0  /* see comment in the "#else" for the "lw r26". */
 	}
 	{
 		s2a	r21, r20, r21
@@ -1557,18 +1550,9 @@ ENTRY(sys_cmpxchg)
 		bbs	r23, .Lcmpxchg64
 		andi	r23, r0, 7	/* Precompute alignment for cmpxchg64. */
 	}
-
 	{
-		/*
-		 * We very carefully align the code that actually runs with
-		 * the lock held (nine bundles) so that we know it is all in
-		 * the icache when we start.  This instruction (the jump) is
-		 * at the start of the first cache line, address zero mod 64;
-		 * we jump to somewhere in the second cache line to issue the
-		 * tns, then jump back to finish up.
-		 */
 		s2a	ATOMIC_LOCK_REG_NAME, r25, r21
-		j	.Lcmpxchg32_tns
+		j	.Lcmpxchg32_tns	/* see comment in the #else for the jump. */
 	}
 
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
@@ -1633,24 +1617,25 @@ ENTRY(sys_cmpxchg)
 	{
 		/*
 		 * We very carefully align the code that actually runs with
-		 * the lock held (nine bundles) so that we know it is all in
+		 * the lock held (twelve bundles) so that we know it is all in
 		 * the icache when we start.  This instruction (the jump) is
 		 * at the start of the first cache line, address zero mod 64;
-		 * we jump to somewhere in the second cache line to issue the
-		 * tns, then jump back to finish up.
+		 * we jump to the very end of the second cache line to get that
+		 * line loaded in the icache, then fall through to issue the tns
+		 * in the third cache line, at which point it's all cached.
+		 * Note that is for performance, not correctness.
 		 */
 		j	.Lcmpxchg32_tns
 	}
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
-ENTRY(__sys_cmpxchg_grab_lock)
+/* Symbol for do_page_fault_ics() to use to compare against the PC. */
+.global __sys_cmpxchg_grab_lock
+__sys_cmpxchg_grab_lock:
 
 	/*
 	 * Perform the actual cmpxchg or atomic_update.
-	 * Note that the system <arch/atomic.h> header relies on
-	 * atomic_update() to always perform an "mf", so don't make
-	 * it optional or conditional without modifying that code.
 	 */
 .Ldo_cmpxchg32:
 	{
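
The new global label above exists so the page-fault path can compare the
faulting PC against it.  As a rough illustration of that kind of check (the
helper name and the exact policy are assumptions here, not taken from the
real do_page_fault_ics(), which differs in detail):

    /* Assumption-laden sketch only: test which side of the lock-grab point
     * a faulting PC falls on, using the label defined above. */
    extern char __sys_cmpxchg_grab_lock[];

    static int pc_is_before_lock_grab(unsigned long pc)
    {
            return pc < (unsigned long)__sys_cmpxchg_grab_lock;
    }
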
@@ -1668,10 +1653,13 @@ ENTRY(sys_cmpxchg)
 	}
 	{
 		mvnz	r24, r23, r25	/* Use atomic_update value if appropriate. */
-		bbns	r22, .Lcmpxchg32_mismatch
+		bbns	r22, .Lcmpxchg32_nostore
 	}
+	seq	r22, r24, r21	/* Are we storing the value we loaded? */
+	bbs	r22, .Lcmpxchg32_nostore
 	sw	r0, r24
 
+	/* The following instruction is the start of the second cache line. */
 	/* Do slow mtspr here so the following "mf" waits less. */
 	{
 		move	sp, r27
@@ -1679,7 +1667,6 @@ ENTRY(sys_cmpxchg)
 	}
 	mf
 
-	/* The following instruction is the start of the second cache line. */
 	{
 		move	r0, r21
 		sw	ATOMIC_LOCK_REG_NAME, zero
@@ -1687,7 +1674,7 @@ ENTRY(sys_cmpxchg)
 	iret
 
 	/* Duplicated code here in the case where we don't overlap "mf" */
-.Lcmpxchg32_mismatch:
+.Lcmpxchg32_nostore:
 	{
 		move	r0, r21
 		sw	ATOMIC_LOCK_REG_NAME, zero
@@ -1703,8 +1690,6 @@ ENTRY(sys_cmpxchg)
 	 * and for 64-bit cmpxchg.  We provide it as a macro and put
 	 * it into both versions.  We can't share the code literally
 	 * since it depends on having the right branch-back address.
-	 * Note that the first few instructions should share the cache
-	 * line with the second half of the actual locked code.
 	 */
 	.macro	cmpxchg_lock, bitwidth
 
@@ -1730,7 +1715,7 @@ ENTRY(sys_cmpxchg)
 	}
 	/*
 	 * The preceding instruction is the last thing that must be
-	 * on the second cache line.
+	 * hot in the icache before we do the "tns" above.
 	 */
 
 #ifdef CONFIG_SMP
@@ -1761,6 +1746,12 @@ ENTRY(sys_cmpxchg)
 	.endm
 
 .Lcmpxchg32_tns:
+	/*
+	 * This is the last instruction on the second cache line.
+	 * The nop here loads the second line, then we fall through
+	 * to the tns to load the third line before we take the lock.
+	 */
+	nop
 	cmpxchg_lock 32
 
 	/*
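
A back-of-the-envelope size check of why the locked sequence now spills
past a single cache line.  The 12-bundle count comes from the comment in
the hunk above; the 8-byte bundle size and 64-byte instruction-cache line
size are assumptions here (the ".align 64" and "zero mod 64" wording in the
patch is consistent with 64-byte lines), so treat this as a sketch only.

    /* Rough arithmetic only, under the assumptions stated above. */
    enum {
            LOCKED_BUNDLES  = 12,
            BUNDLE_BYTES    = 8,
            ICACHE_LINE     = 64,
            LOCKED_BYTES    = LOCKED_BUNDLES * BUNDLE_BYTES,        /* 96 bytes */
            MIN_LINES       = (LOCKED_BYTES + ICACHE_LINE - 1) / ICACHE_LINE, /* 2 */
    };

Since 96 bytes cannot fit in one 64-byte line, the code run under the lock
necessarily straddles lines, which is why the jump, the nop, and the tns
together touch the later cache lines before the lock is actually taken.
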
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 82f64cc63658..24448734f6f1 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -59,7 +59,7 @@
  * bad kernel addresses).
  *
  * Note that if the value we would store is the same as what we
- * loaded, we bypass the load.  Other platforms with true atomics can
+ * loaded, we bypass the store.  Other platforms with true atomics can
  * make the guarantee that a non-atomic __clear_bit(), for example,
  * can safely race with an atomic test_and_set_bit(); this example is
  * from bit_spinlock.h in slub_lock() / slub_unlock().  We can't do
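
The race this comment alludes to can be shown with a small illustrative
sketch (C, not taken from the kernel; the hashed-lock handling that the real
fastpath does with "tns" is assumed and omitted).  If test_and_set_bit()
found the bit already set and still wrote the word back, that redundant
store could overwrite a concurrent non-atomic __clear_bit() of a different
bit in the same word, which is why the unchanged value is never stored.

    /* Illustrative sketch only -- assumes the caller already holds the
     * per-hash atomic lock protecting *word. */
    static inline int test_and_set_bit_sketch(unsigned long nr,
                                              unsigned long *word)
    {
            unsigned long mask = 1UL << nr;
            unsigned long old = *word;

            if (!(old & mask))
                    *word = old | mask;     /* store only when the word changes */
            return (old & mask) != 0;       /* report the previous bit value */
    }
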