aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: David Acker <dacker@roinet.com> 2007-11-08 13:17:41 -0500
committer: David S. Miller <davem@davemloft.net> 2008-01-28 18:03:46 -0500
commit: 7734f6e6bcd7ba78b00e93e74a4ddafd9886cdea (patch)
tree: a9f9dceffe2816e65d666a592d441f95f7275c27
parent: 3627947e84dd2f6dbfd01f796139949f65a43b31 (diff)
Fix e100 on systems that have cache incoherent DMA
On systems that have cache-incoherent DMA, including ARM, there is a race condition between software allocating a new receive buffer and hardware writing into a buffer. The two race on touching the last Receive Frame Descriptor (RFD). It has its el-bit set and its next link equal to 0. When hardware encounters this buffer it attempts to write data to it and then update the Status Word bits and Actual Count in the RFD. At the same time software may try to clear the el-bit and set the link address to a new buffer. Since the entire RFD is one cache line, the two write operations can collide. This can lead to the receive unit stalling or interpreting random memory as its receive area.

The fix is to set the el-bit and set the size to 0 on the next-to-last buffer in the chain. When the hardware encounters this buffer it stops and does not write to it at all. The hardware issues an RNR interrupt with the receive unit in the No Resources state. Software can write to the tail of the list because it knows hardware will stop on the previous descriptor that was marked as the end of list. Once it has a new next-to-last buffer prepared, it can clear the el-bit and set the size on the previous one. The race on this buffer is safe since the link already points to a valid next buffer and the software can handle the race on setting the size (assuming aligned 16-bit writes are atomic with respect to the DMA read). If the hardware sees the el-bit cleared without the size set, it will move on to the next buffer and skip this one. If it sees the size set but the el-bit still set, it will complete that buffer and then raise an RNR interrupt and wait.

Signed-off-by: David Acker <dacker@roinet.com>
Signed-off-by: Auke Kok <auke-jan.h.kok@intel.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
-rw-r--r-- drivers/net/e100.c 128
1 files changed, 99 insertions, 29 deletions
diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 597fd2953658..d87636dbdea5 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -106,6 +106,13 @@
106 * the RFD, the RFD must be dma_sync'ed to maintain a consistent 106 * the RFD, the RFD must be dma_sync'ed to maintain a consistent
107 * view from software and hardware. 107 * view from software and hardware.
108 * 108 *
109 * In order to keep updates to the RFD link field from colliding with
110 * hardware writes to mark packets complete, we use the feature that
111 * hardware will not write to a size 0 descriptor and mark the previous
112 * packet as end-of-list (EL). After updating the link, we remove EL
113 * and only then restore the size such that hardware may use the
114 * previous-to-end RFD.
115 *
109 * Under typical operation, the receive unit (RU) is start once, 116 * Under typical operation, the receive unit (RU) is start once,
110 * and the controller happily fills RFDs as frames arrive. If 117 * and the controller happily fills RFDs as frames arrive. If
111 * replacement RFDs cannot be allocated, or the RU goes non-active, 118 * replacement RFDs cannot be allocated, or the RU goes non-active,
@@ -281,6 +288,7 @@ struct csr {
281}; 288};
282 289
283enum scb_status { 290enum scb_status {
291 rus_no_res = 0x08,
284 rus_ready = 0x10, 292 rus_ready = 0x10,
285 rus_mask = 0x3C, 293 rus_mask = 0x3C,
286}; 294};
@@ -952,7 +960,7 @@ static void e100_get_defaults(struct nic *nic)
952 ((nic->mac >= mac_82558_D101_A4) ? cb_cid : cb_i)); 960 ((nic->mac >= mac_82558_D101_A4) ? cb_cid : cb_i));
953 961
954 /* Template for a freshly allocated RFD */ 962 /* Template for a freshly allocated RFD */
955 nic->blank_rfd.command = cpu_to_le16(cb_el); 963 nic->blank_rfd.command = 0;
956 nic->blank_rfd.rbd = 0xFFFFFFFF; 964 nic->blank_rfd.rbd = 0xFFFFFFFF;
957 nic->blank_rfd.size = cpu_to_le16(VLAN_ETH_FRAME_LEN); 965 nic->blank_rfd.size = cpu_to_le16(VLAN_ETH_FRAME_LEN);
958 966
@@ -1791,15 +1799,12 @@ static int e100_rx_alloc_skb(struct nic *nic, struct rx *rx)
1791 } 1799 }
1792 1800
1793 /* Link the RFD to end of RFA by linking previous RFD to 1801 /* Link the RFD to end of RFA by linking previous RFD to
1794 * this one, and clearing EL bit of previous. */ 1802 * this one. We are safe to touch the previous RFD because
1803 * it is protected by the before last buffer's el bit being set */
1795 if(rx->prev->skb) { 1804 if(rx->prev->skb) {
1796 struct rfd *prev_rfd = (struct rfd *)rx->prev->skb->data; 1805 struct rfd *prev_rfd = (struct rfd *)rx->prev->skb->data;
1797 put_unaligned(cpu_to_le32(rx->dma_addr), 1806 put_unaligned(cpu_to_le32(rx->dma_addr),
1798 (u32 *)&prev_rfd->link); 1807 (u32 *)&prev_rfd->link);
1799 wmb();
1800 prev_rfd->command &= ~cpu_to_le16(cb_el);
1801 pci_dma_sync_single_for_device(nic->pdev, rx->prev->dma_addr,
1802 sizeof(struct rfd), PCI_DMA_TODEVICE);
1803 } 1808 }
1804 1809
1805 return 0; 1810 return 0;
@@ -1824,8 +1829,19 @@ static int e100_rx_indicate(struct nic *nic, struct rx *rx,
1824 DPRINTK(RX_STATUS, DEBUG, "status=0x%04X\n", rfd_status); 1829 DPRINTK(RX_STATUS, DEBUG, "status=0x%04X\n", rfd_status);
1825 1830
1826 /* If data isn't ready, nothing to indicate */ 1831 /* If data isn't ready, nothing to indicate */
1827 if(unlikely(!(rfd_status & cb_complete))) 1832 if (unlikely(!(rfd_status & cb_complete))) {
1833 /* If the next buffer has the el bit, but we think the receiver
1834 * is still running, check to see if it really stopped while
1835 * we had interrupts off.
1836 * This allows for a fast restart without re-enabling
1837 * interrupts */
1838 if ((le16_to_cpu(rfd->command) & cb_el) &&
1839 (RU_RUNNING == nic->ru_running))
1840
1841 if (readb(&nic->csr->scb.status) & rus_no_res)
1842 nic->ru_running = RU_SUSPENDED;
1828 return -ENODATA; 1843 return -ENODATA;
1844 }
1829 1845
1830 /* Get actual data size */ 1846 /* Get actual data size */
1831 actual_size = le16_to_cpu(rfd->actual_size) & 0x3FFF; 1847 actual_size = le16_to_cpu(rfd->actual_size) & 0x3FFF;
@@ -1836,9 +1852,18 @@ static int e100_rx_indicate(struct nic *nic, struct rx *rx,
1836 pci_unmap_single(nic->pdev, rx->dma_addr, 1852 pci_unmap_single(nic->pdev, rx->dma_addr,
1837 RFD_BUF_LEN, PCI_DMA_FROMDEVICE); 1853 RFD_BUF_LEN, PCI_DMA_FROMDEVICE);
1838 1854
1839 /* this allows for a fast restart without re-enabling interrupts */ 1855 /* If this buffer has the el bit, but we think the receiver
1840 if(le16_to_cpu(rfd->command) & cb_el) 1856 * is still running, check to see if it really stopped while
1857 * we had interrupts off.
1858 * This allows for a fast restart without re-enabling interrupts.
1859 * This can happen when the RU sees the size change but also sees
1860 * the el bit set. */
1861 if ((le16_to_cpu(rfd->command) & cb_el) &&
1862 (RU_RUNNING == nic->ru_running)) {
1863
1864 if (readb(&nic->csr->scb.status) & rus_no_res)
1841 nic->ru_running = RU_SUSPENDED; 1865 nic->ru_running = RU_SUSPENDED;
1866 }
1842 1867
1843 /* Pull off the RFD and put the actual data (minus eth hdr) */ 1868 /* Pull off the RFD and put the actual data (minus eth hdr) */
1844 skb_reserve(skb, sizeof(struct rfd)); 1869 skb_reserve(skb, sizeof(struct rfd));
@@ -1870,31 +1895,30 @@ static void e100_rx_clean(struct nic *nic, unsigned int *work_done,
1870 unsigned int work_to_do) 1895 unsigned int work_to_do)
1871{ 1896{
1872 struct rx *rx; 1897 struct rx *rx;
1873 int restart_required = 0; 1898 int restart_required = 0, err = 0;
1874 struct rx *rx_to_start = NULL; 1899 struct rx *old_before_last_rx, *new_before_last_rx;
1875 1900 struct rfd *old_before_last_rfd, *new_before_last_rfd;
1876 /* are we already rnr? then pay attention!!! this ensures that
1877 * the state machine progression never allows a start with a
1878 * partially cleaned list, avoiding a race between hardware
1879 * and rx_to_clean when in NAPI mode */
1880 if(RU_SUSPENDED == nic->ru_running)
1881 restart_required = 1;
1882 1901
1883 /* Indicate newly arrived packets */ 1902 /* Indicate newly arrived packets */
1884 for(rx = nic->rx_to_clean; rx->skb; rx = nic->rx_to_clean = rx->next) { 1903 for(rx = nic->rx_to_clean; rx->skb; rx = nic->rx_to_clean = rx->next) {
1885 int err = e100_rx_indicate(nic, rx, work_done, work_to_do); 1904 err = e100_rx_indicate(nic, rx, work_done, work_to_do);
1886 if(-EAGAIN == err) { 1905 /* Hit quota or no more to clean */
1887 /* hit quota so have more work to do, restart once 1906 if (-EAGAIN == err || -ENODATA == err)
1888 * cleanup is complete */
1889 restart_required = 0;
1890 break; 1907 break;
1891 } else if(-ENODATA == err)
1892 break; /* No more to clean */
1893 } 1908 }
1894 1909
1895 /* save our starting point as the place we'll restart the receiver */ 1910
1896 if(restart_required) 1911 /* On EAGAIN, hit quota so have more work to do, restart once
1897 rx_to_start = nic->rx_to_clean; 1912 * cleanup is complete.
1913 * Else, are we already rnr? then pay attention!!! this ensures that
1914 * the state machine progression never allows a start with a
1915 * partially cleaned list, avoiding a race between hardware
1916 * and rx_to_clean when in NAPI mode */
1917 if (-EAGAIN != err && RU_SUSPENDED == nic->ru_running)
1918 restart_required = 1;
1919
1920 old_before_last_rx = nic->rx_to_use->prev->prev;
1921 old_before_last_rfd = (struct rfd *)old_before_last_rx->skb->data;
1898 1922
1899 /* Alloc new skbs to refill list */ 1923 /* Alloc new skbs to refill list */
1900 for(rx = nic->rx_to_use; !rx->skb; rx = nic->rx_to_use = rx->next) { 1924 for(rx = nic->rx_to_use; !rx->skb; rx = nic->rx_to_use = rx->next) {
@@ -1902,10 +1926,42 @@ static void e100_rx_clean(struct nic *nic, unsigned int *work_done,
1902 break; /* Better luck next time (see watchdog) */ 1926 break; /* Better luck next time (see watchdog) */
1903 } 1927 }
1904 1928
1929 new_before_last_rx = nic->rx_to_use->prev->prev;
1930 if (new_before_last_rx != old_before_last_rx) {
1931 /* Set the el-bit on the buffer that is before the last buffer.
1932 * This lets us update the next pointer on the last buffer
1933 * without worrying about hardware touching it.
1934 * We set the size to 0 to prevent hardware from touching this
1935 * buffer.
1936 * When the hardware hits the before last buffer with el-bit
1937 * and size of 0, it will RNR interrupt, the RUS will go into
1938 * the No Resources state. It will not complete nor write to
1939 * this buffer. */
1940 new_before_last_rfd =
1941 (struct rfd *)new_before_last_rx->skb->data;
1942 new_before_last_rfd->size = 0;
1943 new_before_last_rfd->command |= cpu_to_le16(cb_el);
1944 pci_dma_sync_single_for_device(nic->pdev,
1945 new_before_last_rx->dma_addr, sizeof(struct rfd),
1946 PCI_DMA_TODEVICE);
1947
1948 /* Now that we have a new stopping point, we can clear the old
1949 * stopping point. We must sync twice to get the proper
1950 * ordering on the hardware side of things. */
1951 old_before_last_rfd->command &= ~cpu_to_le16(cb_el);
1952 pci_dma_sync_single_for_device(nic->pdev,
1953 old_before_last_rx->dma_addr, sizeof(struct rfd),
1954 PCI_DMA_TODEVICE);
1955 old_before_last_rfd->size = cpu_to_le16(VLAN_ETH_FRAME_LEN);
1956 pci_dma_sync_single_for_device(nic->pdev,
1957 old_before_last_rx->dma_addr, sizeof(struct rfd),
1958 PCI_DMA_TODEVICE);
1959 }
1960
1905 if(restart_required) { 1961 if(restart_required) {
1906 // ack the rnr? 1962 // ack the rnr?
1907 writeb(stat_ack_rnr, &nic->csr->scb.stat_ack); 1963 writeb(stat_ack_rnr, &nic->csr->scb.stat_ack);
1908 e100_start_receiver(nic, rx_to_start); 1964 e100_start_receiver(nic, nic->rx_to_clean);
1909 if(work_done) 1965 if(work_done)
1910 (*work_done)++; 1966 (*work_done)++;
1911 } 1967 }
@@ -1937,6 +1993,7 @@ static int e100_rx_alloc_list(struct nic *nic)
1937{ 1993{
1938 struct rx *rx; 1994 struct rx *rx;
1939 unsigned int i, count = nic->params.rfds.count; 1995 unsigned int i, count = nic->params.rfds.count;
1996 struct rfd *before_last;
1940 1997
1941 nic->rx_to_use = nic->rx_to_clean = NULL; 1998 nic->rx_to_use = nic->rx_to_clean = NULL;
1942 nic->ru_running = RU_UNINITIALIZED; 1999 nic->ru_running = RU_UNINITIALIZED;
@@ -1952,6 +2009,19 @@ static int e100_rx_alloc_list(struct nic *nic)
1952 return -ENOMEM; 2009 return -ENOMEM;
1953 } 2010 }
1954 } 2011 }
2012 /* Set the el-bit on the buffer that is before the last buffer.
2013 * This lets us update the next pointer on the last buffer without
2014 * worrying about hardware touching it.
2015 * We set the size to 0 to prevent hardware from touching this buffer.
2016 * When the hardware hits the before last buffer with el-bit and size
2017 * of 0, it will RNR interrupt, the RU will go into the No Resources
2018 * state. It will not complete nor write to this buffer. */
2019 rx = nic->rxs->prev->prev;
2020 before_last = (struct rfd *)rx->skb->data;
2021 before_last->command |= cpu_to_le16(cb_el);
2022 before_last->size = 0;
2023 pci_dma_sync_single_for_device(nic->pdev, rx->dma_addr,
2024 sizeof(struct rfd), PCI_DMA_TODEVICE);
1955 2025
1956 nic->rx_to_use = nic->rx_to_clean = nic->rxs; 2026 nic->rx_to_use = nic->rx_to_clean = nic->rxs;
1957 nic->ru_running = RU_SUSPENDED; 2027 nic->ru_running = RU_SUSPENDED;