aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorAnanda Raju <ananda.raju@neterion.com>2005-10-18 18:46:41 -0400
committerArnaldo Carvalho de Melo <acme@mandriva.com>2005-10-28 14:30:00 -0400
commite89e9cf539a28df7d0eb1d0a545368e9920b34ac (patch)
treeaae6a825f351ce931fcd30f1a865ebe65227c4b8 /net
parentde5144164f6242ccfa8c9b64eec570564f5eaf14 (diff)
[IPv4/IPv6]: UFO Scatter-gather approach
Attached is kernel patch for UDP Fragmentation Offload (UFO) feature. 1. This patch incorporate the review comments by Jeff Garzik. 2. Renamed USO as UFO (UDP Fragmentation Offload) 3. udp sendfile support with UFO This patches uses scatter-gather feature of skb to generate large UDP datagram. Below is a "how-to" on changes required in network device driver to use the UFO interface. UDP Fragmentation Offload (UFO) Interface: ------------------------------------------- UFO is a feature wherein the Linux kernel network stack will offload the IP fragmentation functionality of large UDP datagram to hardware. This will reduce the overhead of stack in fragmenting the large UDP datagram to MTU sized packets 1) Drivers indicate their capability of UFO using dev->features |= NETIF_F_UFO | NETIF_F_HW_CSUM | NETIF_F_SG NETIF_F_HW_CSUM is required for UFO over ipv6. 2) UFO packet will be submitted for transmission using driver xmit routine. UFO packet will have a non-zero value for "skb_shinfo(skb)->ufo_size" skb_shinfo(skb)->ufo_size will indicate the length of data part in each IP fragment going out of the adapter after IP fragmentation by hardware. skb->data will contain MAC/IP/UDP header and skb_shinfo(skb)->frags[] contains the data payload. The skb->ip_summed will be set to CHECKSUM_HW indicating that hardware has to do checksum calculation. Hardware should compute the UDP checksum of complete datagram and also ip header checksum of each fragmented IP packet. For IPV6 the UFO provides the fragment identification-id in skb_shinfo(skb)->ip6_frag_id. The adapter should use this ID for generating IPv6 fragments. Signed-off-by: Ananda Raju <ananda.raju@neterion.com> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (forwarded) Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Diffstat (limited to 'net')
-rw-r--r--net/core/dev.c14
-rw-r--r--net/core/ethtool.c53
-rw-r--r--net/core/skbuff.c75
-rw-r--r--net/ipv4/ip_output.c83
-rw-r--r--net/ipv6/ip6_output.c71
5 files changed, 290 insertions, 6 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index a44eeef24edf..8d1541595277 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2717,6 +2717,20 @@ int register_netdevice(struct net_device *dev)
2717 dev->name); 2717 dev->name);
2718 dev->features &= ~NETIF_F_TSO; 2718 dev->features &= ~NETIF_F_TSO;
2719 } 2719 }
2720 if (dev->features & NETIF_F_UFO) {
2721 if (!(dev->features & NETIF_F_HW_CSUM)) {
2722 printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
2723 "NETIF_F_HW_CSUM feature.\n",
2724 dev->name);
2725 dev->features &= ~NETIF_F_UFO;
2726 }
2727 if (!(dev->features & NETIF_F_SG)) {
2728 printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
2729 "NETIF_F_SG feature.\n",
2730 dev->name);
2731 dev->features &= ~NETIF_F_UFO;
2732 }
2733 }
2720 2734
2721 /* 2735 /*
2722 * nil rebuild_header routine, 2736 * nil rebuild_header routine,
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 404b761e82ce..0350586e9195 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -93,6 +93,20 @@ int ethtool_op_get_perm_addr(struct net_device *dev, struct ethtool_perm_addr *a
93} 93}
94 94
95 95
96u32 ethtool_op_get_ufo(struct net_device *dev)
97{
98 return (dev->features & NETIF_F_UFO) != 0;
99}
100
101int ethtool_op_set_ufo(struct net_device *dev, u32 data)
102{
103 if (data)
104 dev->features |= NETIF_F_UFO;
105 else
106 dev->features &= ~NETIF_F_UFO;
107 return 0;
108}
109
96/* Handlers for each ethtool command */ 110/* Handlers for each ethtool command */
97 111
98static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) 112static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
@@ -483,6 +497,11 @@ static int __ethtool_set_sg(struct net_device *dev, u32 data)
483 return err; 497 return err;
484 } 498 }
485 499
500 if (!data && dev->ethtool_ops->set_ufo) {
501 err = dev->ethtool_ops->set_ufo(dev, 0);
502 if (err)
503 return err;
504 }
486 return dev->ethtool_ops->set_sg(dev, data); 505 return dev->ethtool_ops->set_sg(dev, data);
487} 506}
488 507
@@ -569,6 +588,32 @@ static int ethtool_set_tso(struct net_device *dev, char __user *useraddr)
569 return dev->ethtool_ops->set_tso(dev, edata.data); 588 return dev->ethtool_ops->set_tso(dev, edata.data);
570} 589}
571 590
591static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr)
592{
593 struct ethtool_value edata = { ETHTOOL_GTSO };
594
595 if (!dev->ethtool_ops->get_ufo)
596 return -EOPNOTSUPP;
597 edata.data = dev->ethtool_ops->get_ufo(dev);
598 if (copy_to_user(useraddr, &edata, sizeof(edata)))
599 return -EFAULT;
600 return 0;
601}
602static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr)
603{
604 struct ethtool_value edata;
605
606 if (!dev->ethtool_ops->set_ufo)
607 return -EOPNOTSUPP;
608 if (copy_from_user(&edata, useraddr, sizeof(edata)))
609 return -EFAULT;
610 if (edata.data && !(dev->features & NETIF_F_SG))
611 return -EINVAL;
612 if (edata.data && !(dev->features & NETIF_F_HW_CSUM))
613 return -EINVAL;
614 return dev->ethtool_ops->set_ufo(dev, edata.data);
615}
616
572static int ethtool_self_test(struct net_device *dev, char __user *useraddr) 617static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
573{ 618{
574 struct ethtool_test test; 619 struct ethtool_test test;
@@ -854,6 +899,12 @@ int dev_ethtool(struct ifreq *ifr)
854 case ETHTOOL_GPERMADDR: 899 case ETHTOOL_GPERMADDR:
855 rc = ethtool_get_perm_addr(dev, useraddr); 900 rc = ethtool_get_perm_addr(dev, useraddr);
856 break; 901 break;
902 case ETHTOOL_GUFO:
903 rc = ethtool_get_ufo(dev, useraddr);
904 break;
905 case ETHTOOL_SUFO:
906 rc = ethtool_set_ufo(dev, useraddr);
907 break;
857 default: 908 default:
858 rc = -EOPNOTSUPP; 909 rc = -EOPNOTSUPP;
859 } 910 }
@@ -882,3 +933,5 @@ EXPORT_SYMBOL(ethtool_op_set_sg);
882EXPORT_SYMBOL(ethtool_op_set_tso); 933EXPORT_SYMBOL(ethtool_op_set_tso);
883EXPORT_SYMBOL(ethtool_op_set_tx_csum); 934EXPORT_SYMBOL(ethtool_op_set_tx_csum);
884EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); 935EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
936EXPORT_SYMBOL(ethtool_op_set_ufo);
937EXPORT_SYMBOL(ethtool_op_get_ufo);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ef9d46b91eb9..95501e40100e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -176,6 +176,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
176 skb_shinfo(skb)->tso_size = 0; 176 skb_shinfo(skb)->tso_size = 0;
177 skb_shinfo(skb)->tso_segs = 0; 177 skb_shinfo(skb)->tso_segs = 0;
178 skb_shinfo(skb)->frag_list = NULL; 178 skb_shinfo(skb)->frag_list = NULL;
179 skb_shinfo(skb)->ufo_size = 0;
180 skb_shinfo(skb)->ip6_frag_id = 0;
179out: 181out:
180 return skb; 182 return skb;
181nodata: 183nodata:
@@ -1696,6 +1698,78 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
1696 return textsearch_find(config, state); 1698 return textsearch_find(config, state);
1697} 1699}
1698 1700
1701/**
1702 * skb_append_datato_frags: - append the user data to a skb
1703 * @sk: sock structure
1704 * @skb: skb structure to be appened with user data.
1705 * @getfrag: call back function to be used for getting the user data
1706 * @from: pointer to user message iov
1707 * @length: length of the iov message
1708 *
1709 * Description: This procedure append the user data in the fragment part
1710 * of the skb if any page alloc fails user this procedure returns -ENOMEM
1711 */
1712int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
1713 int getfrag(void *from, char *to, int offset,
1714 int len, int odd, struct sk_buff *skb),
1715 void *from, int length)
1716{
1717 int frg_cnt = 0;
1718 skb_frag_t *frag = NULL;
1719 struct page *page = NULL;
1720 int copy, left;
1721 int offset = 0;
1722 int ret;
1723
1724 do {
1725 /* Return error if we don't have space for new frag */
1726 frg_cnt = skb_shinfo(skb)->nr_frags;
1727 if (frg_cnt >= MAX_SKB_FRAGS)
1728 return -EFAULT;
1729
1730 /* allocate a new page for next frag */
1731 page = alloc_pages(sk->sk_allocation, 0);
1732
1733 /* If alloc_page fails just return failure and caller will
1734 * free previous allocated pages by doing kfree_skb()
1735 */
1736 if (page == NULL)
1737 return -ENOMEM;
1738
1739 /* initialize the next frag */
1740 sk->sk_sndmsg_page = page;
1741 sk->sk_sndmsg_off = 0;
1742 skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
1743 skb->truesize += PAGE_SIZE;
1744 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1745
1746 /* get the new initialized frag */
1747 frg_cnt = skb_shinfo(skb)->nr_frags;
1748 frag = &skb_shinfo(skb)->frags[frg_cnt - 1];
1749
1750 /* copy the user data to page */
1751 left = PAGE_SIZE - frag->page_offset;
1752 copy = (length > left)? left : length;
1753
1754 ret = getfrag(from, (page_address(frag->page) +
1755 frag->page_offset + frag->size),
1756 offset, copy, 0, skb);
1757 if (ret < 0)
1758 return -EFAULT;
1759
1760 /* copy was successful so update the size parameters */
1761 sk->sk_sndmsg_off += copy;
1762 frag->size += copy;
1763 skb->len += copy;
1764 skb->data_len += copy;
1765 offset += copy;
1766 length -= copy;
1767
1768 } while (length > 0);
1769
1770 return 0;
1771}
1772
1699void __init skb_init(void) 1773void __init skb_init(void)
1700{ 1774{
1701 skbuff_head_cache = kmem_cache_create("skbuff_head_cache", 1775 skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1747,3 +1821,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read);
1747EXPORT_SYMBOL(skb_seq_read); 1821EXPORT_SYMBOL(skb_seq_read);
1748EXPORT_SYMBOL(skb_abort_seq_read); 1822EXPORT_SYMBOL(skb_abort_seq_read);
1749EXPORT_SYMBOL(skb_find_text); 1823EXPORT_SYMBOL(skb_find_text);
1824EXPORT_SYMBOL(skb_append_datato_frags);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 87e350069abb..17758234a3e3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -275,7 +275,8 @@ int ip_output(struct sk_buff *skb)
275{ 275{
276 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 276 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
277 277
278 if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size) 278 if (skb->len > dst_mtu(skb->dst) &&
279 !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
279 return ip_fragment(skb, ip_finish_output); 280 return ip_fragment(skb, ip_finish_output);
280 else 281 else
281 return ip_finish_output(skb); 282 return ip_finish_output(skb);
@@ -688,6 +689,60 @@ csum_page(struct page *page, int offset, int copy)
688 return csum; 689 return csum;
689} 690}
690 691
692inline int ip_ufo_append_data(struct sock *sk,
693 int getfrag(void *from, char *to, int offset, int len,
694 int odd, struct sk_buff *skb),
695 void *from, int length, int hh_len, int fragheaderlen,
696 int transhdrlen, int mtu,unsigned int flags)
697{
698 struct sk_buff *skb;
699 int err;
700
701 /* There is support for UDP fragmentation offload by network
702 * device, so create one single skb packet containing complete
703 * udp datagram
704 */
705 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
706 skb = sock_alloc_send_skb(sk,
707 hh_len + fragheaderlen + transhdrlen + 20,
708 (flags & MSG_DONTWAIT), &err);
709
710 if (skb == NULL)
711 return err;
712
713 /* reserve space for Hardware header */
714 skb_reserve(skb, hh_len);
715
716 /* create space for UDP/IP header */
717 skb_put(skb,fragheaderlen + transhdrlen);
718
719 /* initialize network header pointer */
720 skb->nh.raw = skb->data;
721
722 /* initialize protocol header pointer */
723 skb->h.raw = skb->data + fragheaderlen;
724
725 skb->ip_summed = CHECKSUM_HW;
726 skb->csum = 0;
727 sk->sk_sndmsg_off = 0;
728 }
729
730 err = skb_append_datato_frags(sk,skb, getfrag, from,
731 (length - transhdrlen));
732 if (!err) {
733 /* specify the length of each IP datagram fragment*/
734 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
735 __skb_queue_tail(&sk->sk_write_queue, skb);
736
737 return 0;
738 }
739 /* There is not enough support do UFO ,
740 * so follow normal path
741 */
742 kfree_skb(skb);
743 return err;
744}
745
691/* 746/*
692 * ip_append_data() and ip_append_page() can make one large IP datagram 747 * ip_append_data() and ip_append_page() can make one large IP datagram
693 * from many pieces of data. Each pieces will be holded on the socket 748 * from many pieces of data. Each pieces will be holded on the socket
@@ -777,6 +832,15 @@ int ip_append_data(struct sock *sk,
777 csummode = CHECKSUM_HW; 832 csummode = CHECKSUM_HW;
778 833
779 inet->cork.length += length; 834 inet->cork.length += length;
835 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
836 (rt->u.dst.dev->features & NETIF_F_UFO)) {
837
838 if(ip_ufo_append_data(sk, getfrag, from, length, hh_len,
839 fragheaderlen, transhdrlen, mtu, flags))
840 goto error;
841
842 return 0;
843 }
780 844
781 /* So, what's going on in the loop below? 845 /* So, what's going on in the loop below?
782 * 846 *
@@ -1008,14 +1072,23 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1008 return -EINVAL; 1072 return -EINVAL;
1009 1073
1010 inet->cork.length += size; 1074 inet->cork.length += size;
1075 if ((sk->sk_protocol == IPPROTO_UDP) &&
1076 (rt->u.dst.dev->features & NETIF_F_UFO))
1077 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
1078
1011 1079
1012 while (size > 0) { 1080 while (size > 0) {
1013 int i; 1081 int i;
1014 1082
1015 /* Check if the remaining data fits into current packet. */ 1083 if (skb_shinfo(skb)->ufo_size)
1016 len = mtu - skb->len; 1084 len = size;
1017 if (len < size) 1085 else {
1018 len = maxfraglen - skb->len; 1086
1087 /* Check if the remaining data fits into current packet. */
1088 len = mtu - skb->len;
1089 if (len < size)
1090 len = maxfraglen - skb->len;
1091 }
1019 if (len <= 0) { 1092 if (len <= 0) {
1020 struct sk_buff *skb_prev; 1093 struct sk_buff *skb_prev;
1021 char *data; 1094 char *data;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 563b442ffab8..614296a920c6 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -147,7 +147,8 @@ static int ip6_output2(struct sk_buff *skb)
147 147
148int ip6_output(struct sk_buff *skb) 148int ip6_output(struct sk_buff *skb)
149{ 149{
150 if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst)) 150 if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
151 dst_allfrag(skb->dst))
151 return ip6_fragment(skb, ip6_output2); 152 return ip6_fragment(skb, ip6_output2);
152 else 153 else
153 return ip6_output2(skb); 154 return ip6_output2(skb);
@@ -768,6 +769,65 @@ out_err_release:
768 *dst = NULL; 769 *dst = NULL;
769 return err; 770 return err;
770} 771}
772inline int ip6_ufo_append_data(struct sock *sk,
773 int getfrag(void *from, char *to, int offset, int len,
774 int odd, struct sk_buff *skb),
775 void *from, int length, int hh_len, int fragheaderlen,
776 int transhdrlen, int mtu,unsigned int flags)
777
778{
779 struct sk_buff *skb;
780 int err;
781
782 /* There is support for UDP large send offload by network
783 * device, so create one single skb packet containing complete
784 * udp datagram
785 */
786 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
787 skb = sock_alloc_send_skb(sk,
788 hh_len + fragheaderlen + transhdrlen + 20,
789 (flags & MSG_DONTWAIT), &err);
790 if (skb == NULL)
791 return -ENOMEM;
792
793 /* reserve space for Hardware header */
794 skb_reserve(skb, hh_len);
795
796 /* create space for UDP/IP header */
797 skb_put(skb,fragheaderlen + transhdrlen);
798
799 /* initialize network header pointer */
800 skb->nh.raw = skb->data;
801
802 /* initialize protocol header pointer */
803 skb->h.raw = skb->data + fragheaderlen;
804
805 skb->ip_summed = CHECKSUM_HW;
806 skb->csum = 0;
807 sk->sk_sndmsg_off = 0;
808 }
809
810 err = skb_append_datato_frags(sk,skb, getfrag, from,
811 (length - transhdrlen));
812 if (!err) {
813 struct frag_hdr fhdr;
814
815 /* specify the length of each IP datagram fragment*/
816 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
817 sizeof(struct frag_hdr);
818 ipv6_select_ident(skb, &fhdr);
819 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
820 __skb_queue_tail(&sk->sk_write_queue, skb);
821
822 return 0;
823 }
824 /* There is not enough support do UPD LSO,
825 * so follow normal path
826 */
827 kfree_skb(skb);
828
829 return err;
830}
771 831
772int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, 832int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
773 int offset, int len, int odd, struct sk_buff *skb), 833 int offset, int len, int odd, struct sk_buff *skb),
@@ -860,6 +920,15 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
860 */ 920 */
861 921
862 inet->cork.length += length; 922 inet->cork.length += length;
923 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
924 (rt->u.dst.dev->features & NETIF_F_UFO)) {
925
926 if(ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
927 fragheaderlen, transhdrlen, mtu, flags))
928 goto error;
929
930 return 0;
931 }
863 932
864 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) 933 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
865 goto alloc_new_skb; 934 goto alloc_new_skb;