author		David L Stevens <david.stevens@oracle.com>	2014-09-29 19:48:11 -0400
committer	David S. Miller <davem@davemloft.net>	2014-09-30 17:10:39 -0400
commit		8e845f4cbbd2ef81846c2ab5dca46d88fb2717ee (patch)
tree		59f30a451145d5e95bf308c4b06cacb61a996417 /drivers/net/ethernet/sun/sunvnet.c
parent		e4defc775424a3501caf98d266a8d7616fa53819 (diff)
sunvnet: make transmit path zero-copy in the kernel
This patch removes the pre-allocated transmit buffers and instead directly maps
pending packets on demand. This saves O(n^2) maximum-sized transmit buffers for
n hosts on a vswitch, as well as the copy into those buffers.
Single-stream TCP throughput between Linux and Solaris dropped ~5% at 1500-byte
MTU, but Linux-to-Linux throughput at 1500-byte MTU increased ~20%.
Signed-off-by: David L Stevens <david.stevens@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/sun/sunvnet.c')
-rw-r--r--	drivers/net/ethernet/sun/sunvnet.c	218
1 file changed, 174 insertions(+), 44 deletions(-)
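In outline, the change replaces a per-packet copy into a pre-mapped ring buffer with an on-demand LDC mapping of the skb data itself; the mapped skb is parked in its ring slot and reclaimed once the peer marks the descriptor VIO_DESC_DONE. A condensed before/after sketch of the transmit path (not a literal excerpt; declarations and error handling are omitted, and all names are as in the diff below):

	/* Before: copy each packet into a preallocated, pre-mapped buffer */
	tx_buf = port->tx_bufs[dr->prod].buf;
	skb_copy_from_linear_data(skb, tx_buf + VNET_PACKET_SKIP, skb->len);

	/* After: align/pad the skb via vnet_skb_shape(), then map its data
	 * directly; ldc_map_single() returns the cookie count, and
	 * vnet_clean_tx_ring() later unmaps and frees the skb once the
	 * descriptor reaches VIO_DESC_DONE.
	 */
	skb = vnet_skb_shape(skb, &start, &nlen);
	port->tx_bufs[txi].skb = skb;
	err = ldc_map_single(port->vio.lp, start, nlen,
			     port->tx_bufs[txi].cookies, 2,
			     LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_RW);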
diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index b1abcadb42ff..8f5f4e3291a4 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -780,6 +780,117 @@ struct vnet_port *tx_port_find(struct vnet *vp, struct sk_buff *skb)
 	return ret;
 }
 
+static struct sk_buff *vnet_clean_tx_ring(struct vnet_port *port,
+					  unsigned *pending)
+{
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+	struct sk_buff *skb = NULL;
+	int i, txi;
+
+	*pending = 0;
+
+	txi = dr->prod-1;
+	if (txi < 0)
+		txi = VNET_TX_RING_SIZE-1;
+
+	for (i = 0; i < VNET_TX_RING_SIZE; ++i) {
+		struct vio_net_desc *d;
+
+		d = vio_dring_entry(dr, txi);
+
+		if (d->hdr.state == VIO_DESC_DONE) {
+			if (port->tx_bufs[txi].skb) {
+				BUG_ON(port->tx_bufs[txi].skb->next);
+
+				port->tx_bufs[txi].skb->next = skb;
+				skb = port->tx_bufs[txi].skb;
+				port->tx_bufs[txi].skb = NULL;
+
+				ldc_unmap(port->vio.lp,
+					  port->tx_bufs[txi].cookies,
+					  port->tx_bufs[txi].ncookies);
+			}
+			d->hdr.state = VIO_DESC_FREE;
+		} else if (d->hdr.state == VIO_DESC_READY) {
+			(*pending)++;
+		} else if (d->hdr.state == VIO_DESC_FREE) {
+			break;
+		}
+		--txi;
+		if (txi < 0)
+			txi = VNET_TX_RING_SIZE-1;
+	}
+	return skb;
+}
+
+static inline void vnet_free_skbs(struct sk_buff *skb)
+{
+	struct sk_buff *next;
+
+	while (skb) {
+		next = skb->next;
+		skb->next = NULL;
+		dev_kfree_skb(skb);
+		skb = next;
+	}
+}
+
+static void vnet_clean_timer_expire(unsigned long port0)
+{
+	struct vnet_port *port = (struct vnet_port *)port0;
+	struct sk_buff *freeskbs;
+	unsigned pending;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->vio.lock, flags);
+	freeskbs = vnet_clean_tx_ring(port, &pending);
+	spin_unlock_irqrestore(&port->vio.lock, flags);
+
+	vnet_free_skbs(freeskbs);
+
+	if (pending)
+		(void)mod_timer(&port->clean_timer,
+				jiffies + VNET_CLEAN_TIMEOUT);
+	else
+		del_timer(&port->clean_timer);
+}
+
+static inline struct sk_buff *vnet_skb_shape(struct sk_buff *skb, void **pstart,
+					     int *plen)
+{
+	struct sk_buff *nskb;
+	int len, pad;
+
+	len = skb->len;
+	pad = 0;
+	if (len < ETH_ZLEN) {
+		pad += ETH_ZLEN - skb->len;
+		len += pad;
+	}
+	len += VNET_PACKET_SKIP;
+	pad += 8 - (len & 7);
+	len += 8 - (len & 7);
+
+	if (((unsigned long)skb->data & 7) != VNET_PACKET_SKIP ||
+	    skb_tailroom(skb) < pad ||
+	    skb_headroom(skb) < VNET_PACKET_SKIP) {
+		nskb = alloc_and_align_skb(skb->dev, skb->len);
+		skb_reserve(nskb, VNET_PACKET_SKIP);
+		if (skb_copy_bits(skb, 0, nskb->data, skb->len)) {
+			dev_kfree_skb(nskb);
+			dev_kfree_skb(skb);
+			return NULL;
+		}
+		(void)skb_put(nskb, skb->len);
+		dev_kfree_skb(skb);
+		skb = nskb;
+	}
+
+	*pstart = skb->data - VNET_PACKET_SKIP;
+	*plen = len;
+	return skb;
+}
+
 static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct vnet *vp = netdev_priv(dev);
@@ -788,12 +899,20 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct vio_net_desc *d;
 	unsigned long flags;
 	unsigned int len;
-	void *tx_buf;
-	int i, err;
+	struct sk_buff *freeskbs = NULL;
+	int i, err, txi;
+	void *start = NULL;
+	int nlen = 0;
+	unsigned pending = 0;
 
 	if (unlikely(!port))
 		goto out_dropped;
 
+	skb = vnet_skb_shape(skb, &start, &nlen);
+
+	if (unlikely(!skb))
+		goto out_dropped;
+
 	spin_lock_irqsave(&port->vio.lock, flags);
 
 	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
@@ -811,14 +930,27 @@
 
 	d = vio_dring_cur(dr);
 
-	tx_buf = port->tx_bufs[dr->prod].buf;
-	skb_copy_from_linear_data(skb, tx_buf + VNET_PACKET_SKIP, skb->len);
+	txi = dr->prod;
+
+	freeskbs = vnet_clean_tx_ring(port, &pending);
+
+	BUG_ON(port->tx_bufs[txi].skb);
 
 	len = skb->len;
-	if (len < ETH_ZLEN) {
+	if (len < ETH_ZLEN)
 		len = ETH_ZLEN;
-		memset(tx_buf+VNET_PACKET_SKIP+skb->len, 0, len - skb->len);
+
+	port->tx_bufs[txi].skb = skb;
+	skb = NULL;
+
+	err = ldc_map_single(port->vio.lp, start, nlen,
+			     port->tx_bufs[txi].cookies, 2,
+			     (LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_RW));
+	if (err < 0) {
+		netdev_info(dev, "tx buffer map error %d\n", err);
+		goto out_dropped_unlock;
 	}
+	port->tx_bufs[txi].ncookies = err;
 
 	/* We don't rely on the ACKs to free the skb in vnet_start_xmit(),
 	 * thus it is safe to not set VIO_ACK_ENABLE for each transmission:
@@ -830,9 +962,9 @@
 	 */
 	d->hdr.ack = VIO_ACK_DISABLE;
 	d->size = len;
-	d->ncookies = port->tx_bufs[dr->prod].ncookies;
+	d->ncookies = port->tx_bufs[txi].ncookies;
 	for (i = 0; i < d->ncookies; i++)
-		d->cookies[i] = port->tx_bufs[dr->prod].cookies[i];
+		d->cookies[i] = port->tx_bufs[txi].cookies[i];
 
 	/* This has to be a non-SMP write barrier because we are writing
 	 * to memory which is shared with the peer LDOM.
@@ -876,7 +1008,7 @@ ldc_start_done:
 	port->start_cons = false;
 
 	dev->stats.tx_packets++;
-	dev->stats.tx_bytes += skb->len;
+	dev->stats.tx_bytes += port->tx_bufs[txi].skb->len;
 
 	dr->prod = (dr->prod + 1) & (VNET_TX_RING_SIZE - 1);
 	if (unlikely(vnet_tx_dring_avail(dr) < 2)) {
@@ -887,7 +1019,9 @@
 
 	spin_unlock_irqrestore(&port->vio.lock, flags);
 
-	dev_kfree_skb(skb);
+	vnet_free_skbs(freeskbs);
+
+	(void)mod_timer(&port->clean_timer, jiffies + VNET_CLEAN_TIMEOUT);
 
 	return NETDEV_TX_OK;
 
@@ -895,7 +1029,14 @@ out_dropped_unlock:
 	spin_unlock_irqrestore(&port->vio.lock, flags);
 
 out_dropped:
-	dev_kfree_skb(skb);
+	if (skb)
+		dev_kfree_skb(skb);
+	vnet_free_skbs(freeskbs);
+	if (pending)
+		(void)mod_timer(&port->clean_timer,
+				jiffies + VNET_CLEAN_TIMEOUT);
+	else
+		del_timer(&port->clean_timer);
 	dev->stats.tx_dropped++;
 	return NETDEV_TX_OK;
 }
@@ -1097,17 +1238,22 @@ static void vnet_port_free_tx_bufs(struct vnet_port *port)
 	}
 
 	for (i = 0; i < VNET_TX_RING_SIZE; i++) {
-		void *buf = port->tx_bufs[i].buf;
+		struct vio_net_desc *d;
+		void *skb = port->tx_bufs[i].skb;
 
-		if (!buf)
+		if (!skb)
 			continue;
 
+		d = vio_dring_entry(dr, i);
+		if (d->hdr.state == VIO_DESC_READY)
+			pr_warn("active transmit buffers freed\n");
+
 		ldc_unmap(port->vio.lp,
 			  port->tx_bufs[i].cookies,
 			  port->tx_bufs[i].ncookies);
-
-		kfree(buf);
-		port->tx_bufs[i].buf = NULL;
+		dev_kfree_skb(skb);
+		port->tx_bufs[i].skb = NULL;
+		d->hdr.state = VIO_DESC_FREE;
 	}
 }
 
@@ -1118,34 +1264,6 @@ static int vnet_port_alloc_tx_bufs(struct vnet_port *port)
 	int i, err, ncookies;
 	void *dring;
 
-	for (i = 0; i < VNET_TX_RING_SIZE; i++) {
-		void *buf = kzalloc(VNET_MAXPACKET + 8, GFP_KERNEL);
-		int map_len = (VNET_MAXPACKET + 7) & ~7;
-
-		err = -ENOMEM;
-		if (!buf)
-			goto err_out;
-
-		err = -EFAULT;
-		if ((unsigned long)buf & (8UL - 1)) {
-			pr_err("TX buffer misaligned\n");
-			kfree(buf);
-			goto err_out;
-		}
-
-		err = ldc_map_single(port->vio.lp, buf, map_len,
-				     port->tx_bufs[i].cookies, 2,
-				     (LDC_MAP_SHADOW |
-				      LDC_MAP_DIRECT |
-				      LDC_MAP_RW));
-		if (err < 0) {
-			kfree(buf);
-			goto err_out;
-		}
-		port->tx_bufs[i].buf = buf;
-		port->tx_bufs[i].ncookies = err;
-	}
-
 	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
 
 	len = (VNET_TX_RING_SIZE *
@@ -1172,6 +1290,12 @@ static int vnet_port_alloc_tx_bufs(struct vnet_port *port)
 	dr->pending = VNET_TX_RING_SIZE;
 	dr->ncookies = ncookies;
 
+	for (i = 0; i < VNET_TX_RING_SIZE; ++i) {
+		struct vio_net_desc *d;
+
+		d = vio_dring_entry(dr, i);
+		d->hdr.state = VIO_DESC_FREE;
+	}
 	return 0;
 
 err_out:
@@ -1203,6 +1327,8 @@ static struct vnet *vnet_new(const u64 *local_mac)
 	dev = alloc_etherdev(sizeof(*vp));
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
+	dev->needed_headroom = VNET_PACKET_SKIP + 8;
+	dev->needed_tailroom = 8;
 
 	for (i = 0; i < ETH_ALEN; i++)
 		dev->dev_addr[i] = (*local_mac >> (5 - i) * 8) & 0xff;
@@ -1397,6 +1523,9 @@ static int vnet_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	pr_info("%s: PORT ( remote-mac %pM%s )\n",
 		vp->dev->name, port->raddr, switch_port ? " switch-port" : "");
 
+	setup_timer(&port->clean_timer, vnet_clean_timer_expire,
+		    (unsigned long)port);
+
 	vio_port_up(&port->vio);
 
 	mdesc_release(hp);
@@ -1423,6 +1552,7 @@ static int vnet_port_remove(struct vio_dev *vdev)
 	unsigned long flags;
 
 	del_timer_sync(&port->vio.timer);
+	del_timer_sync(&port->clean_timer);
 
 	spin_lock_irqsave(&vp->lock, flags);
 	list_del(&port->list);
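In outline, the reclaim path these hunks add works as follows (names as in the patch): vnet_start_xmit() sweeps completed descriptors via vnet_clean_tx_ring() on every transmit and re-arms port->clean_timer; if transmission stops while descriptors are still outstanding, vnet_clean_timer_expire() fires after VNET_CLEAN_TIMEOUT, performs the same sweep under the vio lock, and re-arms itself only while some descriptor remains VIO_DESC_READY.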