summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
authorLukas Wunner <lukas@wunner.de>2019-09-11 06:15:30 -0400
committerMark Brown <broonie@kernel.org>2019-09-11 10:57:46 -0400
commit2b8279aec1829da4dc645e8731c28d2f5458d652 (patch)
tree2fe66ade86b3267c5455b6d6d691fc36d40888fd
parent8259bf667a0f9ea1a37bb71c7af9ebd550e9251d (diff)
spi: bcm2835: Speed up RX-only DMA transfers by zero-filling TX FIFO
The BCM2835 SPI driver currently sets the SPI_CONTROLLER_MUST_TX flag. When performing an RX-only transfer, this flag causes the SPI core to allocate and DMA-map a dummy buffer which is copied to the TX FIFO. The dummy buffer is necessary because the chip is not capable of automatically clocking out null bytes. Avoid the overhead induced by the dummy buffer by preallocating a reusable DMA transaction which fills the TX FIFO by cyclically copying from the zero page. The transaction requires very little CPU time to submit and generates no interrupts while running. Specifics are provided in kerneldoc comments. [Nathan Chancellor contributed a DMA mapping fixup for an early version of this commit, hence his Signed-off-by.] Tested-by: Nuno Sá <nuno.sa@analog.com> Tested-by: Noralf Trønnes <noralf@tronnes.org> Signed-off-by: Nathan Chancellor <natechancellor@gmail.com> Signed-off-by: Lukas Wunner <lukas@wunner.de> Acked-by: Stefan Wahren <wahrenst@gmx.net> Acked-by: Martin Sperl <kernel@martin.sperl.org> Cc: Robert Jarzmik <robert.jarzmik@free.fr> Link: https://lore.kernel.org/r/f45920af18dbf06e34129bbc406f53dc9c5d1075.1568187525.git.lukas@wunner.de Signed-off-by: Mark Brown <broonie@kernel.org>
-rw-r--r-- drivers/spi/spi-bcm2835.c | 93
1 file changed, 82 insertions(+), 11 deletions(-)
diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c
index 3fe823891861..8a0ea465cbe0 100644
--- a/drivers/spi/spi-bcm2835.c
+++ b/drivers/spi/spi-bcm2835.c
@@ -111,6 +111,9 @@ MODULE_PARM_DESC(polling_limit_us,
111 * @tx_dma_active: whether a TX DMA descriptor is in progress 111 * @tx_dma_active: whether a TX DMA descriptor is in progress
112 * @rx_dma_active: whether a RX DMA descriptor is in progress 112 * @rx_dma_active: whether a RX DMA descriptor is in progress
113 * (used by bcm2835_spi_dma_tx_done() to handle a race) 113 * (used by bcm2835_spi_dma_tx_done() to handle a race)
114 * @fill_tx_desc: preallocated TX DMA descriptor used for RX-only transfers
115 * (cyclically copies from zero page to TX FIFO)
116 * @fill_tx_addr: bus address of zero page
114 * @clear_rx_desc: preallocated RX DMA descriptor used for TX-only transfers 117 * @clear_rx_desc: preallocated RX DMA descriptor used for TX-only transfers
115 * (cyclically clears RX FIFO by writing @clear_rx_cs to CS register) 118 * (cyclically clears RX FIFO by writing @clear_rx_cs to CS register)
116 * @clear_rx_addr: bus address of @clear_rx_cs 119 * @clear_rx_addr: bus address of @clear_rx_cs
@@ -140,6 +143,8 @@ struct bcm2835_spi {
140 u8 chip_select; 143 u8 chip_select;
141 unsigned int tx_dma_active; 144 unsigned int tx_dma_active;
142 unsigned int rx_dma_active; 145 unsigned int rx_dma_active;
146 struct dma_async_tx_descriptor *fill_tx_desc;
147 dma_addr_t fill_tx_addr;
143 struct dma_async_tx_descriptor *clear_rx_desc[BCM2835_SPI_NUM_CS]; 148 struct dma_async_tx_descriptor *clear_rx_desc[BCM2835_SPI_NUM_CS];
144 dma_addr_t clear_rx_addr; 149 dma_addr_t clear_rx_addr;
145 u32 clear_rx_cs[BCM2835_SPI_NUM_CS] ____cacheline_aligned; 150 u32 clear_rx_cs[BCM2835_SPI_NUM_CS] ____cacheline_aligned;
@@ -469,14 +474,14 @@ static void bcm2835_spi_transfer_prologue(struct spi_controller *ctlr,
469 bs->rx_prologue = 0; 474 bs->rx_prologue = 0;
470 bs->tx_spillover = false; 475 bs->tx_spillover = false;
471 476
472 if (!sg_is_last(&tfr->tx_sg.sgl[0])) 477 if (bs->tx_buf && !sg_is_last(&tfr->tx_sg.sgl[0]))
473 bs->tx_prologue = sg_dma_len(&tfr->tx_sg.sgl[0]) & 3; 478 bs->tx_prologue = sg_dma_len(&tfr->tx_sg.sgl[0]) & 3;
474 479
475 if (bs->rx_buf && !sg_is_last(&tfr->rx_sg.sgl[0])) { 480 if (bs->rx_buf && !sg_is_last(&tfr->rx_sg.sgl[0])) {
476 bs->rx_prologue = sg_dma_len(&tfr->rx_sg.sgl[0]) & 3; 481 bs->rx_prologue = sg_dma_len(&tfr->rx_sg.sgl[0]) & 3;
477 482
478 if (bs->rx_prologue > bs->tx_prologue) { 483 if (bs->rx_prologue > bs->tx_prologue) {
479 if (sg_is_last(&tfr->tx_sg.sgl[0])) { 484 if (!bs->tx_buf || sg_is_last(&tfr->tx_sg.sgl[0])) {
480 bs->tx_prologue = bs->rx_prologue; 485 bs->tx_prologue = bs->rx_prologue;
481 } else { 486 } else {
482 bs->tx_prologue += 4; 487 bs->tx_prologue += 4;
@@ -508,6 +513,9 @@ static void bcm2835_spi_transfer_prologue(struct spi_controller *ctlr,
508 sg_dma_len(&tfr->rx_sg.sgl[0]) -= bs->rx_prologue; 513 sg_dma_len(&tfr->rx_sg.sgl[0]) -= bs->rx_prologue;
509 } 514 }
510 515
516 if (!bs->tx_buf)
517 return;
518
511 /* 519 /*
512 * Write remaining TX prologue. Adjust first entry in TX sglist. 520 * Write remaining TX prologue. Adjust first entry in TX sglist.
513 * Also adjust second entry if prologue spills over to it. 521 * Also adjust second entry if prologue spills over to it.
@@ -552,6 +560,9 @@ static void bcm2835_spi_undo_prologue(struct bcm2835_spi *bs)
552 sg_dma_len(&tfr->rx_sg.sgl[0]) += bs->rx_prologue; 560 sg_dma_len(&tfr->rx_sg.sgl[0]) += bs->rx_prologue;
553 } 561 }
554 562
563 if (!bs->tx_buf)
564 goto out;
565
555 if (likely(!bs->tx_spillover)) { 566 if (likely(!bs->tx_spillover)) {
556 sg_dma_address(&tfr->tx_sg.sgl[0]) -= bs->tx_prologue; 567 sg_dma_address(&tfr->tx_sg.sgl[0]) -= bs->tx_prologue;
557 sg_dma_len(&tfr->tx_sg.sgl[0]) += bs->tx_prologue; 568 sg_dma_len(&tfr->tx_sg.sgl[0]) += bs->tx_prologue;
@@ -560,7 +571,7 @@ static void bcm2835_spi_undo_prologue(struct bcm2835_spi *bs)
560 sg_dma_address(&tfr->tx_sg.sgl[1]) -= 4; 571 sg_dma_address(&tfr->tx_sg.sgl[1]) -= 4;
561 sg_dma_len(&tfr->tx_sg.sgl[1]) += 4; 572 sg_dma_len(&tfr->tx_sg.sgl[1]) += 4;
562 } 573 }
563 574out:
564 bs->tx_prologue = 0; 575 bs->tx_prologue = 0;
565} 576}
566 577
@@ -575,10 +586,7 @@ static void bcm2835_spi_dma_rx_done(void *data)
575 struct spi_controller *ctlr = data; 586 struct spi_controller *ctlr = data;
576 struct bcm2835_spi *bs = spi_controller_get_devdata(ctlr); 587 struct bcm2835_spi *bs = spi_controller_get_devdata(ctlr);
577 588
578 /* reset fifo and HW */ 589 /* terminate tx-dma as we do not have an irq for it
579 bcm2835_spi_reset_hw(ctlr);
580
581 /* and terminate tx-dma as we do not have an irq for it
582 * because when the rx dma will terminate and this callback 590 * because when the rx dma will terminate and this callback
583 * is called the tx-dma must have finished - can't get to this 591 * is called the tx-dma must have finished - can't get to this
584 * situation otherwise... 592 * situation otherwise...
@@ -588,6 +596,9 @@ static void bcm2835_spi_dma_rx_done(void *data)
588 bs->rx_dma_active = false; 596 bs->rx_dma_active = false;
589 bcm2835_spi_undo_prologue(bs); 597 bcm2835_spi_undo_prologue(bs);
590 598
599 /* reset fifo and HW */
600 bcm2835_spi_reset_hw(ctlr);
601
591 /* and mark as completed */; 602 /* and mark as completed */;
592 complete(&ctlr->xfer_completion); 603 complete(&ctlr->xfer_completion);
593} 604}
@@ -715,6 +726,24 @@ static int bcm2835_spi_prepare_sg(struct spi_controller *ctlr,
715 * register.) Reading 32 bytes from the RX FIFO would normally require 8 bus 726 * register.) Reading 32 bytes from the RX FIFO would normally require 8 bus
716 * accesses, whereas clearing it requires only 1 bus access. So an 8-fold 727 * accesses, whereas clearing it requires only 1 bus access. So an 8-fold
717 * reduction in bus traffic and thus energy consumption is achieved. 728 * reduction in bus traffic and thus energy consumption is achieved.
729 *
730 * For *RX-only* transfers (tx_buf is %NULL), fill the TX FIFO by cyclically
731 * copying from the zero page. The DMA descriptor to do this is preallocated
732 * in bcm2835_dma_init(). It must be terminated once the RX DMA channel is
733 * done and can then be reused.
734 *
735 * The BCM2835 DMA driver autodetects when a transaction copies from the zero
736 * page and utilizes the DMA controller's ability to synthesize zeroes instead
737 * of copying them from memory. This reduces traffic on the memory bus. The
738 * feature is not available on so-called "lite" channels, but normally TX DMA
739 * is backed by a full-featured channel.
740 *
741 * Zero-filling the TX FIFO is paced by the DREQ signal. Unfortunately the
742 * BCM2835 SPI controller continues to assert DREQ even after the DLEN register
743 * has been counted down to zero (hardware erratum). Thus, when the transfer
744 * has finished, the DMA engine zero-fills the TX FIFO until it is half full.
745 * (Tuneable with the DC register.) So up to 9 gratuitous bus accesses are
746 * performed at the end of an RX-only transfer.
718 */ 747 */
719static int bcm2835_spi_transfer_one_dma(struct spi_controller *ctlr, 748static int bcm2835_spi_transfer_one_dma(struct spi_controller *ctlr,
720 struct spi_device *spi, 749 struct spi_device *spi,
@@ -735,7 +764,12 @@ static int bcm2835_spi_transfer_one_dma(struct spi_controller *ctlr,
735 bcm2835_spi_transfer_prologue(ctlr, tfr, bs, cs); 764 bcm2835_spi_transfer_prologue(ctlr, tfr, bs, cs);
736 765
737 /* setup tx-DMA */ 766 /* setup tx-DMA */
738 ret = bcm2835_spi_prepare_sg(ctlr, spi, tfr, bs, true); 767 if (bs->tx_buf) {
768 ret = bcm2835_spi_prepare_sg(ctlr, spi, tfr, bs, true);
769 } else {
770 cookie = dmaengine_submit(bs->fill_tx_desc);
771 ret = dma_submit_error(cookie);
772 }
739 if (ret) 773 if (ret)
740 goto err_reset_hw; 774 goto err_reset_hw;
741 775
@@ -812,6 +846,16 @@ static void bcm2835_dma_release(struct spi_controller *ctlr,
812 846
813 if (ctlr->dma_tx) { 847 if (ctlr->dma_tx) {
814 dmaengine_terminate_sync(ctlr->dma_tx); 848 dmaengine_terminate_sync(ctlr->dma_tx);
849
850 if (bs->fill_tx_desc)
851 dmaengine_desc_free(bs->fill_tx_desc);
852
853 if (bs->fill_tx_addr)
854 dma_unmap_page_attrs(ctlr->dma_tx->device->dev,
855 bs->fill_tx_addr, sizeof(u32),
856 DMA_TO_DEVICE,
857 DMA_ATTR_SKIP_CPU_SYNC);
858
815 dma_release_channel(ctlr->dma_tx); 859 dma_release_channel(ctlr->dma_tx);
816 ctlr->dma_tx = NULL; 860 ctlr->dma_tx = NULL;
817 } 861 }
@@ -862,7 +906,11 @@ static void bcm2835_dma_init(struct spi_controller *ctlr, struct device *dev,
862 goto err_release; 906 goto err_release;
863 } 907 }
864 908
865 /* configure DMAs */ 909 /*
910 * The TX DMA channel either copies a transfer's TX buffer to the FIFO
911 * or, in case of an RX-only transfer, cyclically copies from the zero
912 * page to the FIFO using a preallocated, reusable descriptor.
913 */
866 slave_config.dst_addr = (u32)(dma_reg_base + BCM2835_SPI_FIFO); 914 slave_config.dst_addr = (u32)(dma_reg_base + BCM2835_SPI_FIFO);
867 slave_config.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; 915 slave_config.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
868 916
@@ -870,6 +918,31 @@ static void bcm2835_dma_init(struct spi_controller *ctlr, struct device *dev,
870 if (ret) 918 if (ret)
871 goto err_config; 919 goto err_config;
872 920
921 bs->fill_tx_addr = dma_map_page_attrs(ctlr->dma_tx->device->dev,
922 ZERO_PAGE(0), 0, sizeof(u32),
923 DMA_TO_DEVICE,
924 DMA_ATTR_SKIP_CPU_SYNC);
925 if (dma_mapping_error(ctlr->dma_tx->device->dev, bs->fill_tx_addr)) {
926 dev_err(dev, "cannot map zero page - not using DMA mode\n");
927 bs->fill_tx_addr = 0;
928 goto err_release;
929 }
930
931 bs->fill_tx_desc = dmaengine_prep_dma_cyclic(ctlr->dma_tx,
932 bs->fill_tx_addr,
933 sizeof(u32), 0,
934 DMA_MEM_TO_DEV, 0);
935 if (!bs->fill_tx_desc) {
936 dev_err(dev, "cannot prepare fill_tx_desc - not using DMA mode\n");
937 goto err_release;
938 }
939
940 ret = dmaengine_desc_set_reuse(bs->fill_tx_desc);
941 if (ret) {
942 dev_err(dev, "cannot reuse fill_tx_desc - not using DMA mode\n");
943 goto err_release;
944 }
945
873 /* 946 /*
874 * The RX DMA channel is used bidirectionally: It either reads the 947 * The RX DMA channel is used bidirectionally: It either reads the
875 * RX FIFO or, in case of a TX-only transfer, cyclically writes a 948 * RX FIFO or, in case of a TX-only transfer, cyclically writes a
@@ -913,8 +986,6 @@ static void bcm2835_dma_init(struct spi_controller *ctlr, struct device *dev,
913 986
914 /* all went well, so set can_dma */ 987 /* all went well, so set can_dma */
915 ctlr->can_dma = bcm2835_spi_can_dma; 988 ctlr->can_dma = bcm2835_spi_can_dma;
916 /* need to do TX DMA, so we need a dummy buffer */
917 ctlr->flags = SPI_CONTROLLER_MUST_TX;
918 989
919 return; 990 return;
920 991