author	Yuval Shaia <yuval.shaia@oracle.com>	2015-07-12 04:24:09 -0400
committer	Doug Ledford <dledford@redhat.com>	2015-07-14 13:20:13 -0400
commit	c42687784b9a6b9733fee701ed236a5fe088fac4 (patch)
tree	14154e185cd0366a67c895a826a46ac9621d9c6f
parent	59d40dd92ca00df317ad44fa44c27fe38c58e477 (diff)
IB/ipoib: Scatter-Gather support in connected mode
By default, the IPoIB-CM driver uses a 64K MTU; a larger MTU gives better performance. This MTU plus overhead puts the memory allocation for IP-based packets at 32 contiguous 4K pages (order 5). When system memory is under pressure, it was observed that allocating 128K of contiguous physical memory is difficult and causes serious errors (such as the system becoming unusable).

This enhancement resolves the issue by removing the physically-contiguous-memory requirement, using the Scatter/Gather support that exists in the Linux stack. With this fix, Scatter-Gather is also supported in connected mode.

This change reverts some of the changes made in commit e112373fd6aa ("IPoIB/cm: Reduce connected mode TX object size").

Using SG in IPoIB CM is possible because the coupling between NETIF_F_SG and NETIF_F_CSUM was removed in commit ec5f06156423 ("net: Kill link between CSUM and SG features.").

Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
Acked-by: Christian Marie <christian@ponies.io>
Signed-off-by: Doug Ledford <dledford@redhat.com>
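Where does order 5 come from? The sketch below is a minimal userspace illustration of the arithmetic, assuming 4 KB pages and a small, made-up skb overhead: once the buffer exceeds 64 KB, the buddy allocator must round up to the next power-of-two block, 128 KB (32 pages).

/* Illustrative only: page size and overhead are assumptions, not values
 * taken from the patch. */
#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;      /* assumed 4 KB pages */
	const unsigned long mtu       = 64 * 1024; /* IPoIB-CM default MTU */
	const unsigned long overhead  = 256;       /* assumed skb overhead */
	unsigned long alloc = page_size;
	unsigned int order = 0;

	/* smallest power-of-two block that fits mtu + overhead */
	while (alloc < mtu + overhead) {
		alloc <<= 1;
		++order;
	}
	printf("need %lu bytes -> %lu pages, order %u\n",
	       mtu + overhead, alloc / page_size, order);
	/* prints: need 65792 bytes -> 32 pages, order 5 */
	return 0;
}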
-rw-r--r--	drivers/infiniband/ulp/ipoib/ipoib.h	29
-rw-r--r--	drivers/infiniband/ulp/ipoib/ipoib_cm.c	33
-rw-r--r--	drivers/infiniband/ulp/ipoib/ipoib_ib.c	36
-rw-r--r--	drivers/infiniband/ulp/ipoib/ipoib_main.c	2
4 files changed, 54 insertions(+), 46 deletions(-)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index bd94b0a6e9e5..79859c4d43c9 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -239,7 +239,7 @@ struct ipoib_cm_tx {
 	struct net_device   *dev;
 	struct ipoib_neigh  *neigh;
 	struct ipoib_path   *path;
-	struct ipoib_cm_tx_buf *tx_ring;
+	struct ipoib_tx_buf *tx_ring;
 	unsigned	     tx_head;
 	unsigned	     tx_tail;
 	unsigned long	     flags;
@@ -504,6 +504,33 @@ int ipoib_mcast_stop_thread(struct net_device *dev);
 void ipoib_mcast_dev_down(struct net_device *dev);
 void ipoib_mcast_dev_flush(struct net_device *dev);
 
+int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
+void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
+			struct ipoib_tx_buf *tx_req);
+
+static inline void ipoib_build_sge(struct ipoib_dev_priv *priv,
+				   struct ipoib_tx_buf *tx_req)
+{
+	int i, off;
+	struct sk_buff *skb = tx_req->skb;
+	skb_frag_t *frags = skb_shinfo(skb)->frags;
+	int nr_frags = skb_shinfo(skb)->nr_frags;
+	u64 *mapping = tx_req->mapping;
+
+	if (skb_headlen(skb)) {
+		priv->tx_sge[0].addr         = mapping[0];
+		priv->tx_sge[0].length       = skb_headlen(skb);
+		off = 1;
+	} else
+		off = 0;
+
+	for (i = 0; i < nr_frags; ++i) {
+		priv->tx_sge[i + off].addr   = mapping[i + off];
+		priv->tx_sge[i + off].length = skb_frag_size(&frags[i]);
+	}
+	priv->tx_wr.num_sge = nr_frags + off;
+}
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
 int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
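The new ipoib_build_sge() helper maps the skb's linear head (when present) into tx_sge[0] and shifts the page fragments up by one slot, so mapping[] and tx_sge[] stay index-aligned. Below is a self-contained userspace sketch of that indexing, with toy types standing in for the kernel's ib_sge and skb structures:

/* Toy model of the head-plus-fragments SGE layout; addresses and lengths
 * are made up for illustration. */
#include <stdio.h>

struct toy_sge { unsigned long addr; unsigned int length; };

int main(void)
{
	unsigned long mapping[4] = { 0x1000, 0x2000, 0x3000, 0x4000 };
	unsigned int head_len = 128;               /* skb_headlen(skb) */
	unsigned int frag_len[3] = { 4096, 4096, 512 };
	int nr_frags = 3, i, off;
	struct toy_sge sge[4];

	if (head_len) {                            /* linear head takes slot 0 */
		sge[0].addr   = mapping[0];
		sge[0].length = head_len;
		off = 1;
	} else
		off = 0;

	for (i = 0; i < nr_frags; ++i) {           /* fragments follow the head */
		sge[i + off].addr   = mapping[i + off];
		sge[i + off].length = frag_len[i];
	}

	for (i = 0; i < nr_frags + off; ++i)       /* num_sge = nr_frags + off */
		printf("sge[%d]: addr=0x%lx len=%u\n",
		       i, sge[i].addr, sge[i].length);
	return 0;
}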
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index cf32a778e7d0..ee39be6ccfb0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -694,14 +694,12 @@ repost:
 static inline int post_send(struct ipoib_dev_priv *priv,
 			    struct ipoib_cm_tx *tx,
 			    unsigned int wr_id,
-			    u64 addr, int len)
+			    struct ipoib_tx_buf *tx_req)
 {
 	struct ib_send_wr *bad_wr;
 
-	priv->tx_sge[0].addr	= addr;
-	priv->tx_sge[0].length	= len;
+	ipoib_build_sge(priv, tx_req);
 
-	priv->tx_wr.num_sge	= 1;
 	priv->tx_wr.wr_id	= wr_id | IPOIB_OP_CM;
 
 	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
@@ -710,8 +708,7 @@ static inline int post_send(struct ipoib_dev_priv *priv,
 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_cm_tx_buf *tx_req;
-	u64 addr;
+	struct ipoib_tx_buf *tx_req;
 	int rc;
 
 	if (unlikely(skb->len > tx->mtu)) {
@@ -735,24 +732,21 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 	 */
 	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
 	tx_req->skb = skb;
-	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
-	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+
+	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
 		++dev->stats.tx_errors;
 		dev_kfree_skb_any(skb);
 		return;
 	}
 
-	tx_req->mapping = addr;
-
 	skb_orphan(skb);
 	skb_dst_drop(skb);
 
-	rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
-		       addr, skb->len);
+	rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req);
 	if (unlikely(rc)) {
 		ipoib_warn(priv, "post_send failed, error %d\n", rc);
 		++dev->stats.tx_errors;
-		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
+		ipoib_dma_unmap_tx(priv, tx_req);
 		dev_kfree_skb_any(skb);
 	} else {
 		dev->trans_start = jiffies;
@@ -777,7 +771,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_cm_tx *tx = wc->qp->qp_context;
 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
-	struct ipoib_cm_tx_buf *tx_req;
+	struct ipoib_tx_buf *tx_req;
 	unsigned long flags;
 
 	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
@@ -791,7 +785,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
 	tx_req = &tx->tx_ring[wr_id];
 
-	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
+	ipoib_dma_unmap_tx(priv, tx_req);
 
 	/* FIXME: is this right? Shouldn't we only increment on success? */
 	++dev->stats.tx_packets;
@@ -1036,6 +1030,9 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
 
 	struct ib_qp *tx_qp;
 
+	if (dev->features & NETIF_F_SG)
+		attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
+
 	tx_qp = ib_create_qp(priv->pd, &attr);
 	if (PTR_ERR(tx_qp) == -EINVAL) {
 		ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
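The CM TX QP must now advertise enough send SGEs for the worst case: one per page fragment plus one for the skb's linear head, hence MAX_SKB_FRAGS + 1. A sketch of that accounting, assuming the usual 4 KB-page definition of MAX_SKB_FRAGS:

/* Assumed values: with 4 KB pages, kernels of this era define
 * MAX_SKB_FRAGS as 65536 / PAGE_SIZE + 1 = 17, giving 18 send SGEs. */
#include <stdio.h>

#define PAGE_SIZE     4096UL
#define MAX_SKB_FRAGS (65536UL / PAGE_SIZE + 1)  /* 17 */

int main(void)
{
	/* one SGE per possible fragment + one for the linear head */
	printf("max_send_sge = %lu\n", MAX_SKB_FRAGS + 1);  /* 18 */
	return 0;
}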
@@ -1170,7 +1167,7 @@ err_tx:
 static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
-	struct ipoib_cm_tx_buf *tx_req;
+	struct ipoib_tx_buf *tx_req;
 	unsigned long begin;
 
 	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
@@ -1197,8 +1194,7 @@ timeout:
 
 	while ((int) p->tx_tail - (int) p->tx_head < 0) {
 		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
-		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
-				    DMA_TO_DEVICE);
+		ipoib_dma_unmap_tx(priv, tx_req);
 		dev_kfree_skb_any(tx_req->skb);
 		++p->tx_tail;
 		netif_tx_lock_bh(p->dev);
@@ -1455,7 +1451,6 @@ static void ipoib_cm_stale_task(struct work_struct *work)
 	spin_unlock_irq(&priv->lock);
 }
 
-
 static ssize_t show_mode(struct device *d, struct device_attribute *attr,
 			 char *buf)
 {
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 058150ff5aa1..d266667ca9b8 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -263,8 +263,7 @@ repost:
263 "for buf %d\n", wr_id); 263 "for buf %d\n", wr_id);
264} 264}
265 265
266static int ipoib_dma_map_tx(struct ib_device *ca, 266int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
267 struct ipoib_tx_buf *tx_req)
268{ 267{
269 struct sk_buff *skb = tx_req->skb; 268 struct sk_buff *skb = tx_req->skb;
270 u64 *mapping = tx_req->mapping; 269 u64 *mapping = tx_req->mapping;
@@ -305,8 +304,8 @@ partial_error:
 	return -EIO;
 }
 
-static void ipoib_dma_unmap_tx(struct ib_device *ca,
-			       struct ipoib_tx_buf *tx_req)
+void ipoib_dma_unmap_tx(struct ipoib_dev_priv *priv,
+			struct ipoib_tx_buf *tx_req)
 {
 	struct sk_buff *skb = tx_req->skb;
 	u64 *mapping = tx_req->mapping;
@@ -314,7 +313,8 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
 	int off;
 
 	if (skb_headlen(skb)) {
-		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+		ib_dma_unmap_single(priv->ca, mapping[0], skb_headlen(skb),
+				    DMA_TO_DEVICE);
 		off = 1;
 	} else
 		off = 0;
@@ -322,8 +322,8 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
 		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
-		ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag),
-				  DMA_TO_DEVICE);
+		ib_dma_unmap_page(priv->ca, mapping[i + off],
+				  skb_frag_size(frag), DMA_TO_DEVICE);
 	}
 }
 
@@ -389,7 +389,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
 	tx_req = &priv->tx_ring[wr_id];
 
-	ipoib_dma_unmap_tx(priv->ca, tx_req);
+	ipoib_dma_unmap_tx(priv, tx_req);
 
 	++dev->stats.tx_packets;
 	dev->stats.tx_bytes += tx_req->skb->len;
@@ -514,24 +514,10 @@ static inline int post_send(struct ipoib_dev_priv *priv,
 			   void *head, int hlen)
 {
 	struct ib_send_wr *bad_wr;
-	int i, off;
 	struct sk_buff *skb = tx_req->skb;
-	skb_frag_t *frags = skb_shinfo(skb)->frags;
-	int nr_frags = skb_shinfo(skb)->nr_frags;
-	u64 *mapping = tx_req->mapping;
 
-	if (skb_headlen(skb)) {
-		priv->tx_sge[0].addr         = mapping[0];
-		priv->tx_sge[0].length       = skb_headlen(skb);
-		off = 1;
-	} else
-		off = 0;
+	ipoib_build_sge(priv, tx_req);
 
-	for (i = 0; i < nr_frags; ++i) {
-		priv->tx_sge[i + off].addr   = mapping[i + off];
-		priv->tx_sge[i + off].length = skb_frag_size(&frags[i]);
-	}
-	priv->tx_wr.num_sge	     = nr_frags + off;
 	priv->tx_wr.wr_id	     = wr_id;
 	priv->tx_wr.wr.ud.remote_qpn = qpn;
 	priv->tx_wr.wr.ud.ah	     = address;
@@ -617,7 +603,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		ipoib_warn(priv, "post_send failed, error %d\n", rc);
 		++dev->stats.tx_errors;
 		--priv->tx_outstanding;
-		ipoib_dma_unmap_tx(priv->ca, tx_req);
+		ipoib_dma_unmap_tx(priv, tx_req);
 		dev_kfree_skb_any(skb);
 		if (netif_queue_stopped(dev))
 			netif_wake_queue(dev);
@@ -868,7 +854,7 @@ int ipoib_ib_dev_stop(struct net_device *dev)
 	while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
 		tx_req = &priv->tx_ring[priv->tx_tail &
 					(ipoib_sendq_size - 1)];
-		ipoib_dma_unmap_tx(priv->ca, tx_req);
+		ipoib_dma_unmap_tx(priv, tx_req);
 		dev_kfree_skb_any(tx_req->skb);
 		++priv->tx_tail;
 		--priv->tx_outstanding;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index da3e7e7d5e80..59604a7e5e3e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -190,7 +190,7 @@ static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
 	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
-		features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
+		features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
 
 	return features;
 }
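With NETIF_F_SG dropped from the cleared mask, switching to connected mode now strips only checksum offload and TSO while leaving scatter/gather enabled. A toy sketch of the mask logic, with stand-in flag values rather than the kernel's netdev feature bits:

/* Feature bits below are illustrative stand-ins, not kernel values. */
#include <stdio.h>

#define NETIF_F_SG      (1u << 0)
#define NETIF_F_IP_CSUM (1u << 1)
#define NETIF_F_TSO     (1u << 2)

int main(void)
{
	unsigned int features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;

	/* connected mode after this patch: clear CSUM and TSO, keep SG */
	features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);

	printf("SG still set: %s\n", (features & NETIF_F_SG) ? "yes" : "no");
	return 0;
}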