author	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-18 23:08:47 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-18 23:08:47 -0400
commit	ecb2cf1a6b63825a258ff4fe0d7f3070fbe4676b (patch)
tree	4b03d332066d148f0d6c416528c6ba5e874d466a
parent	ee114b97e67b2a572f94982567a21ac4ee17c133 (diff)
parent	3e3aac497513c669e1c62c71e1d552ea85c1d974 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net

Pull networking fixes from David Miller:
 "A couple interesting SKB fragment handling fixes, plus the usual
  small bits here and there:

   1) Fix 64-bit divide build failure on 32-bit platforms in mlx5,
      from Tim Gardner.

   2) Get rid of a stupid reimplementation on "%*phC" in our sysfs MAC
      address printing helper.

   3) Fix NETIF_F_SG capability advertisement in hyperv driver, if the
      device can't do checksumming offloads then it shouldn't say it
      can do SG either.  From Haiyang Zhang.

   4) bgmac needs to depend on PHYLIB, from Hauke Mehrtens.

   5) Don't leak DMA mappings on mapping failures, from Neil Horman.

   6) We need to reset the transport header of SKBs in ipv4 before we
      attempt to perform early socket demux, just like ipv6 does.
      From Eric Dumazet.

   7) Add missing locking on vxlan device removal, from Stephen
      Hemminger.

   8) xen-netfront has to make two passes over an SKB to prepare it
      for transfer.  One pass calculates the number of slots needed,
      the second massages the SKB and fills the slots.  Unfortunately,
      the first pass doesn't calculate the number of slots properly so
      we can end up trying to build a MAX_SKB_FRAGS + 1 SKB which
      doesn't work out so well.  Fix from Jan Beulich with help and
      discussion with several others.

   9) Fix a similar problem in tun and macvtap, which have to split up
      scatter-gather elements at PAGE_SIZE boundaries.  Don't do
      zerocopy if it would result in a > MAX_SKB_FRAGS skb.  Fixes
      from Jason Wang.

  10) On receive, once we've decoded the VLAN state completely, clear
      skb->vlan_tci.  Otherwise demuxed tunnels underneath can trigger
      the VLAN code again, corrupting the packet.  Fix from Eric
      Dumazet"

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net:
  vlan: fix a race in egress prio management
  vlan: mask vlan prio bits
  macvtap: do not zerocopy if iov needs more pages than MAX_SKB_FRAGS
  tuntap: do not zerocopy if iov needs more pages than MAX_SKB_FRAGS
  pkt_sched: sch_qfq: remove a source of high packet delay/jitter
  xen-netfront: pull on receive skb may need to happen earlier
  vxlan: add necessary locking on device removal
  hyperv: Fix the NETIF_F_SG flag setting in netvsc
  net: Fix sysfs_format_mac() code duplication.
  be2net: Fix to avoid hardware workaround when not needed
  macvtap: do not assume 802.1Q when send vlan packets
  macvtap: fix the missing ret value of TUNSETQUEUE
  ipv4: set transport header earlier
  mlx5 core: Fix __udivdi3 when compiling for 32 bit arches
  bgmac: add dependency to phylib
  net/irda: fixed style issues in irlan_eth
  ethtool: fixed trailing statements in ethtool
  ndisc: bool initializations should use true and false
  atl1e: unmap partially mapped skb on dma error and free skb
 drivers/net/ethernet/atheros/atl1e/atl1e_main.c   | 24
 drivers/net/ethernet/broadcom/Kconfig             |  1
 drivers/net/ethernet/emulex/benet/be_main.c       | 14
 drivers/net/ethernet/mellanox/mlx5/core/debugfs.c |  2
 drivers/net/hyperv/netvsc_drv.c                   |  4
 drivers/net/macvtap.c                             | 65
 drivers/net/tun.c                                 | 62
 drivers/net/vxlan.c                               |  6
 drivers/net/xen-netfront.c                        | 31
 include/linux/if_vlan.h                           |  3
 net/8021q/vlan_core.c                             |  2
 net/8021q/vlan_dev.c                              |  7
 net/core/dev.c                                    | 11
 net/core/ethtool.c                                | 30
 net/ethernet/eth.c                                | 21
 net/ipv4/ip_input.c                               |  7
 net/ipv6/ndisc.c                                  |  6
 net/irda/irlan/irlan_eth.c                        | 31
 net/sched/sch_qfq.c                               | 85
 19 files changed, 248 insertions(+), 164 deletions(-)
diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
index 6d1a62a84c9d..1966444590f6 100644
--- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
+++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
@@ -1678,6 +1678,7 @@ static int atl1e_tx_map(struct atl1e_adapter *adapter,
 	u16 f;
 	int segment;
 	int ring_start = adapter->tx_ring.next_to_use;
+	int ring_end;
 
 	nr_frags = skb_shinfo(skb)->nr_frags;
 	segment = (tpd->word3 >> TPD_SEGMENT_EN_SHIFT) & TPD_SEGMENT_EN_MASK;
@@ -1721,6 +1722,15 @@ static int atl1e_tx_map(struct atl1e_adapter *adapter,
 					    map_len, PCI_DMA_TODEVICE);
 
 		if (dma_mapping_error(&adapter->pdev->dev, tx_buffer->dma)) {
+			/* We need to unwind the mappings we've done */
+			ring_end = adapter->tx_ring.next_to_use;
+			adapter->tx_ring.next_to_use = ring_start;
+			while (adapter->tx_ring.next_to_use != ring_end) {
+				tpd = atl1e_get_tpd(adapter);
+				tx_buffer = atl1e_get_tx_buffer(adapter, tpd);
+				pci_unmap_single(adapter->pdev, tx_buffer->dma,
+						 tx_buffer->length, PCI_DMA_TODEVICE);
+			}
 			/* Reset the tx rings next pointer */
 			adapter->tx_ring.next_to_use = ring_start;
 			return -ENOSPC;
@@ -1763,6 +1773,16 @@ static int atl1e_tx_map(struct atl1e_adapter *adapter,
 					  DMA_TO_DEVICE);
 
 		if (dma_mapping_error(&adapter->pdev->dev, tx_buffer->dma)) {
+			/* We need to unwind the mappings we've done */
+			ring_end = adapter->tx_ring.next_to_use;
+			adapter->tx_ring.next_to_use = ring_start;
+			while (adapter->tx_ring.next_to_use != ring_end) {
+				tpd = atl1e_get_tpd(adapter);
+				tx_buffer = atl1e_get_tx_buffer(adapter, tpd);
+				dma_unmap_page(&adapter->pdev->dev, tx_buffer->dma,
+					       tx_buffer->length, DMA_TO_DEVICE);
+			}
+
 			/* Reset the ring next to use pointer */
 			adapter->tx_ring.next_to_use = ring_start;
 			return -ENOSPC;
@@ -1853,8 +1873,10 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb,
 		return NETDEV_TX_OK;
 	}
 
-	if (atl1e_tx_map(adapter, skb, tpd))
+	if (atl1e_tx_map(adapter, skb, tpd)) {
+		dev_kfree_skb_any(skb);
 		goto out;
+	}
 
 	atl1e_tx_queue(adapter, tpd_req, tpd);
 
diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
index 1d680baf43d6..52c96036dcc4 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -131,6 +131,7 @@ config BNX2X_SRIOV
 config BGMAC
 	tristate "BCMA bus GBit core support"
 	depends on BCMA_HOST_SOC && HAS_DMA
+	select PHYLIB
 	---help---
 	  This driver supports GBit MAC and BCM4706 GBit MAC cores on BCMA bus.
 	  They can be found on BCM47xx SoCs and provide gigabit ethernet.
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 2df48bb0f1ca..181edb522450 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -782,16 +782,22 @@ static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter,
 
 	if (vlan_tx_tag_present(skb))
 		vlan_tag = be_get_tx_vlan_tag(adapter, skb);
-	else if (qnq_async_evt_rcvd(adapter) && adapter->pvid)
-		vlan_tag = adapter->pvid;
+
+	if (qnq_async_evt_rcvd(adapter) && adapter->pvid) {
+		if (!vlan_tag)
+			vlan_tag = adapter->pvid;
+		/* f/w workaround to set skip_hw_vlan = 1, informs the F/W to
+		 * skip VLAN insertion
+		 */
+		if (skip_hw_vlan)
+			*skip_hw_vlan = true;
+	}
 
 	if (vlan_tag) {
 		skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
 		if (unlikely(!skb))
 			return skb;
 		skb->vlan_tci = 0;
-		if (skip_hw_vlan)
-			*skip_hw_vlan = true;
 	}
 
 	/* Insert the outer VLAN, if any */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
index 4273c06e2e96..9c7194b26ee2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
@@ -156,7 +156,7 @@ static ssize_t average_read(struct file *filp, char __user *buf, size_t count,
 	stats = filp->private_data;
 	spin_lock(&stats->lock);
 	if (stats->n)
-		field = stats->sum / stats->n;
+		field = div64_u64(stats->sum, stats->n);
 	spin_unlock(&stats->lock);
 	ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field);
 	if (ret > 0) {
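The mlx5 change above fixes a link failure rather than a runtime bug: on 32-bit targets, a plain '/' on two u64 operands makes gcc emit a call to libgcc's __udivdi3, which the kernel does not link against, so the build breaks; div64_u64() is the kernel's portable 64-by-64 division helper (lib/div64.c). A minimal userspace sketch of the pattern, with the helper body as a stand-in (userspace does link libgcc, so plain division is fine there):

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's div64_u64(); the real helper
 * avoids __udivdi3 on 32-bit architectures. */
static uint64_t div64_u64(uint64_t dividend, uint64_t divisor)
{
	return dividend / divisor;
}

int main(void)
{
	uint64_t sum = 123456789ULL, n = 37;

	/* The pattern the patch introduces: divide via the helper
	 * instead of writing "sum / n" directly in kernel code. */
	printf("average = %llu\n", (unsigned long long)div64_u64(sum, n));
	return 0;
}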
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 4dccead586be..23a0fff0df52 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -431,8 +431,8 @@ static int netvsc_probe(struct hv_device *dev,
 	net->netdev_ops = &device_ops;
 
 	/* TODO: Add GSO and Checksum offload */
-	net->hw_features = NETIF_F_SG;
-	net->features = NETIF_F_SG | NETIF_F_HW_VLAN_CTAG_TX;
+	net->hw_features = 0;
+	net->features = NETIF_F_HW_VLAN_CTAG_TX;
 
 	SET_ETHTOOL_OPS(net, &ethtool_ops);
 	SET_NETDEV_DEV(net, &dev->device);
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 876c72246ae9..a98fb0ed6aef 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -698,6 +698,28 @@ static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
 	return 0;
 }
 
+static unsigned long iov_pages(const struct iovec *iv, int offset,
+			       unsigned long nr_segs)
+{
+	unsigned long seg, base;
+	int pages = 0, len, size;
+
+	while (nr_segs && (offset >= iv->iov_len)) {
+		offset -= iv->iov_len;
+		++iv;
+		--nr_segs;
+	}
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		base = (unsigned long)iv[seg].iov_base + offset;
+		len = iv[seg].iov_len - offset;
+		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		pages += size;
+		offset = 0;
+	}
+
+	return pages;
+}
 
 /* Get packet from user space buffer */
 static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
@@ -744,31 +766,15 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 	if (unlikely(count > UIO_MAXIOV))
 		goto err;
 
-	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY))
-		zerocopy = true;
-
-	if (zerocopy) {
-		/* Userspace may produce vectors with count greater than
-		 * MAX_SKB_FRAGS, so we need to linearize parts of the skb
-		 * to let the rest of data to be fit in the frags.
-		 */
-		if (count > MAX_SKB_FRAGS) {
-			copylen = iov_length(iv, count - MAX_SKB_FRAGS);
-			if (copylen < vnet_hdr_len)
-				copylen = 0;
-			else
-				copylen -= vnet_hdr_len;
-		}
-		/* There are 256 bytes to be copied in skb, so there is enough
-		 * room for skb expand head in case it is used.
-		 * The rest buffer is mapped from userspace.
-		 */
-		if (copylen < vnet_hdr.hdr_len)
-			copylen = vnet_hdr.hdr_len;
-		if (!copylen)
-			copylen = GOODCOPY_LEN;
+	if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
+		copylen = vnet_hdr.hdr_len ? vnet_hdr.hdr_len : GOODCOPY_LEN;
 		linear = copylen;
-	} else {
+		if (iov_pages(iv, vnet_hdr_len + copylen, count)
+		    <= MAX_SKB_FRAGS)
+			zerocopy = true;
+	}
+
+	if (!zerocopy) {
 		copylen = len;
 		linear = vnet_hdr.hdr_len;
 	}
@@ -780,9 +786,15 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
 
 	if (zerocopy)
 		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
-	else
+	else {
 		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
 						   len);
+		if (!err && m && m->msg_control) {
+			struct ubuf_info *uarg = m->msg_control;
+			uarg->callback(uarg, false);
+		}
+	}
+
 	if (err)
 		goto err_kfree;
 
@@ -873,7 +885,7 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 		__be16 h_vlan_proto;
 		__be16 h_vlan_TCI;
 	} veth;
-	veth.h_vlan_proto = htons(ETH_P_8021Q);
+	veth.h_vlan_proto = skb->vlan_proto;
 	veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));
 
 	vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
@@ -1107,6 +1119,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd,
 		rtnl_lock();
 		ret = macvtap_ioctl_set_queue(file, u);
 		rtnl_unlock();
+		return ret;
 
 	case TUNGETFEATURES:
 		if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR |
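The iov_pages() helper added to both macvtap above and tun below is what decides whether zerocopy is safe: if the user's iovec (past the copied header) spans more pages than MAX_SKB_FRAGS, the skb cannot hold one fragment per page, so the driver falls back to copying. The rounding trick is that (base & ~PAGE_MASK) is the offset into the first page, and adding ~PAGE_MASK (i.e. PAGE_SIZE - 1) before the shift rounds up to whole pages. A self-contained userspace sketch of the same arithmetic, assuming 4 KiB pages:

#include <stdio.h>
#include <sys/uio.h>

#define PAGE_SHIFT 12			/* assume 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* Same arithmetic as the iov_pages() in the patch: count the pages
 * spanned by each segment, starting "offset" bytes into the vector. */
static unsigned long iov_pages(const struct iovec *iv, int offset,
			       unsigned long nr_segs)
{
	unsigned long seg, base;
	int pages = 0, len, size;

	while (nr_segs && (offset >= (int)iv->iov_len)) {
		offset -= iv->iov_len;
		++iv;
		--nr_segs;
	}

	for (seg = 0; seg < nr_segs; seg++) {
		base = (unsigned long)iv[seg].iov_base + offset;
		len = iv[seg].iov_len - offset;
		/* in-page offset + length, rounded up to whole pages */
		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
		pages += size;
		offset = 0;
	}

	return pages;
}

int main(void)
{
	/* A 6000-byte segment starting 100 bytes into a page spans two
	 * pages; a second one-byte segment adds a third. */
	struct iovec iv[2] = {
		{ .iov_base = (void *)0x10064UL, .iov_len = 6000 },
		{ .iov_base = (void *)0x20000UL, .iov_len = 1    },
	};

	printf("pages = %lu\n", iov_pages(iv, 0, 2));	/* prints 3 */
	return 0;
}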
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5cdcf92eb310..db690a372260 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1035,6 +1035,29 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
 	return 0;
 }
 
+static unsigned long iov_pages(const struct iovec *iv, int offset,
+			       unsigned long nr_segs)
+{
+	unsigned long seg, base;
+	int pages = 0, len, size;
+
+	while (nr_segs && (offset >= iv->iov_len)) {
+		offset -= iv->iov_len;
+		++iv;
+		--nr_segs;
+	}
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		base = (unsigned long)iv[seg].iov_base + offset;
+		len = iv[seg].iov_len - offset;
+		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		pages += size;
+		offset = 0;
+	}
+
+	return pages;
+}
+
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			    void *msg_control, const struct iovec *iv,
@@ -1082,32 +1105,18 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		return -EINVAL;
 	}
 
-	if (msg_control)
-		zerocopy = true;
-
-	if (zerocopy) {
-		/* Userspace may produce vectors with count greater than
-		 * MAX_SKB_FRAGS, so we need to linearize parts of the skb
-		 * to let the rest of data to be fit in the frags.
-		 */
-		if (count > MAX_SKB_FRAGS) {
-			copylen = iov_length(iv, count - MAX_SKB_FRAGS);
-			if (copylen < offset)
-				copylen = 0;
-			else
-				copylen -= offset;
-		} else
-			copylen = 0;
-		/* There are 256 bytes to be copied in skb, so there is enough
-		 * room for skb expand head in case it is used.
+	if (msg_control) {
+		/* There are 256 bytes to be copied in skb, so there is
+		 * enough room for skb expand head in case it is used.
 		 * The rest of the buffer is mapped from userspace.
 		 */
-		if (copylen < gso.hdr_len)
-			copylen = gso.hdr_len;
-		if (!copylen)
-			copylen = GOODCOPY_LEN;
+		copylen = gso.hdr_len ? gso.hdr_len : GOODCOPY_LEN;
 		linear = copylen;
-	} else {
+		if (iov_pages(iv, offset + copylen, count) <= MAX_SKB_FRAGS)
+			zerocopy = true;
+	}
+
+	if (!zerocopy) {
 		copylen = len;
 		linear = gso.hdr_len;
 	}
@@ -1121,8 +1130,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 
 	if (zerocopy)
 		err = zerocopy_sg_from_iovec(skb, iv, offset, count);
-	else
+	else {
 		err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len);
+		if (!err && msg_control) {
+			struct ubuf_info *uarg = msg_control;
+			uarg->callback(uarg, false);
+		}
+	}
 
 	if (err) {
 		tun->dev->stats.rx_dropped++;
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 0ba1e7edbb1b..a5ba8dd7e6be 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1767,9 +1767,15 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
 
 static void vxlan_dellink(struct net_device *dev, struct list_head *head)
 {
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 
+	flush_workqueue(vxlan_wq);
+
+	spin_lock(&vn->sock_lock);
 	hlist_del_rcu(&vxlan->hlist);
+	spin_unlock(&vn->sock_lock);
+
 	list_del(&vxlan->next);
 	unregister_netdevice_queue(dev, head);
 }
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index ff7f111fffee..36808bf25677 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -286,8 +286,7 @@ no_skb:
 			break;
 		}
 
-		__skb_fill_page_desc(skb, 0, page, 0, 0);
-		skb_shinfo(skb)->nr_frags = 1;
+		skb_add_rx_frag(skb, 0, page, 0, 0, PAGE_SIZE);
 		__skb_queue_tail(&np->rx_batch, skb);
 	}
293 292
@@ -831,7 +830,6 @@ static RING_IDX xennet_fill_frags(struct netfront_info *np,
 				  struct sk_buff_head *list)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
-	int nr_frags = shinfo->nr_frags;
 	RING_IDX cons = np->rx.rsp_cons;
 	struct sk_buff *nskb;
 
@@ -840,19 +838,21 @@ static RING_IDX xennet_fill_frags(struct netfront_info *np,
 			RING_GET_RESPONSE(&np->rx, ++cons);
 		skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0];
 
-		__skb_fill_page_desc(skb, nr_frags,
-				     skb_frag_page(nfrag),
-				     rx->offset, rx->status);
+		if (shinfo->nr_frags == MAX_SKB_FRAGS) {
+			unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to;
 
-		skb->data_len += rx->status;
+			BUG_ON(pull_to <= skb_headlen(skb));
+			__pskb_pull_tail(skb, pull_to - skb_headlen(skb));
+		}
+		BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS);
+
+		skb_add_rx_frag(skb, shinfo->nr_frags, skb_frag_page(nfrag),
+				rx->offset, rx->status, PAGE_SIZE);
 
 		skb_shinfo(nskb)->nr_frags = 0;
 		kfree_skb(nskb);
-
-		nr_frags++;
 	}
 
-	shinfo->nr_frags = nr_frags;
 	return cons;
 }
 
@@ -933,7 +933,8 @@ static int handle_incoming_queue(struct net_device *dev,
 	while ((skb = __skb_dequeue(rxq)) != NULL) {
 		int pull_to = NETFRONT_SKB_CB(skb)->pull_to;
 
-		__pskb_pull_tail(skb, pull_to - skb_headlen(skb));
+		if (pull_to > skb_headlen(skb))
+			__pskb_pull_tail(skb, pull_to - skb_headlen(skb));
 
 		/* Ethernet work: Delayed to here as it peeks the header. */
 		skb->protocol = eth_type_trans(skb, dev);
@@ -1019,16 +1020,10 @@ err:
 		skb_shinfo(skb)->frags[0].page_offset = rx->offset;
 		skb_frag_size_set(&skb_shinfo(skb)->frags[0], rx->status);
 		skb->data_len = rx->status;
+		skb->len += rx->status;
 
 		i = xennet_fill_frags(np, skb, &tmpq);
 
-		/*
-		 * Truesize is the actual allocation size, even if the
-		 * allocation is only partially used.
-		 */
-		skb->truesize += PAGE_SIZE * skb_shinfo(skb)->nr_frags;
-		skb->len += skb->data_len;
-
 		if (rx->flags & XEN_NETRXF_csum_blank)
 			skb->ip_summed = CHECKSUM_PARTIAL;
 		else if (rx->flags & XEN_NETRXF_data_validated)
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index cdcbafa9b39a..715c343f7c00 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -79,9 +79,8 @@ static inline int is_vlan_dev(struct net_device *dev)
 }
 
 #define vlan_tx_tag_present(__skb)	((__skb)->vlan_tci & VLAN_TAG_PRESENT)
-#define vlan_tx_nonzero_tag_present(__skb) \
-	(vlan_tx_tag_present(__skb) && ((__skb)->vlan_tci & VLAN_VID_MASK))
 #define vlan_tx_tag_get(__skb)		((__skb)->vlan_tci & ~VLAN_TAG_PRESENT)
+#define vlan_tx_tag_get_id(__skb)	((__skb)->vlan_tci & VLAN_VID_MASK)
 
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
 
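Background for the if_vlan.h and net/core/dev.c changes in this pull: the 16-bit VLAN TCI packs a 3-bit Priority Code Point, a 1-bit DEI/CFI and a 12-bit VLAN ID, so a tagged frame can carry a non-zero priority while its VID is 0. That is why the receive path must mask with VLAN_VID_MASK (the new vlan_tx_tag_get_id()) before deciding a frame is for another host, and must clear skb->vlan_tci once the VLAN state has been consumed. A standalone illustration of the masking, with mask values as defined in if_vlan.h:

#include <stdint.h>
#include <stdio.h>

#define VLAN_PRIO_MASK	0xe000	/* Priority Code Point, bits 15..13 */
#define VLAN_PRIO_SHIFT	13
#define VLAN_CFI_MASK	0x1000	/* Canonical Format Indicator / DEI */
#define VLAN_VID_MASK	0x0fff	/* VLAN Identifier, bits 11..0 */

int main(void)
{
	/* TCI for priority 5, VID 0: a priority-tagged frame */
	uint16_t tci = 5 << VLAN_PRIO_SHIFT;

	/* Masking with VLAN_VID_MASK shows the VID really is 0, so the
	 * frame must not be treated as addressed to another host even
	 * though the raw TCI is non-zero. */
	printf("prio=%u vid=%u\n",
	       (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT,
	       tci & VLAN_VID_MASK);
	return 0;
}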
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 8a15eaadc4bd..4a78c4de9f20 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -9,7 +9,7 @@ bool vlan_do_receive(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	__be16 vlan_proto = skb->vlan_proto;
-	u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK;
+	u16 vlan_id = vlan_tx_tag_get_id(skb);
 	struct net_device *vlan_dev;
 	struct vlan_pcpu_stats *rx_stats;
 
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 3a8c8fd63c88..1cd3d2a406f5 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -73,6 +73,8 @@ vlan_dev_get_egress_qos_mask(struct net_device *dev, struct sk_buff *skb)
 {
 	struct vlan_priority_tci_mapping *mp;
 
+	smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */
+
 	mp = vlan_dev_priv(dev)->egress_priority_map[(skb->priority & 0xF)];
 	while (mp) {
 		if (mp->priority == skb->priority) {
@@ -249,6 +251,11 @@ int vlan_dev_set_egress_priority(const struct net_device *dev,
 	np->next = mp;
 	np->priority = skb_prio;
 	np->vlan_qos = vlan_qos;
+	/* Before inserting this element in hash table, make sure all its fields
+	 * are committed to memory.
+	 * coupled with smp_rmb() in vlan_dev_get_egress_qos_mask()
+	 */
+	smp_wmb();
 	vlan->egress_priority_map[skb_prio & 0xF] = np;
 	if (vlan_qos)
 		vlan->nr_egress_mappings++;
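The barrier pair added to net/8021q/vlan_dev.c is the standard lockless publication idiom: fully initialize the new mapping, order those stores with a write barrier, and only then make the node reachable, so a reader that observes the pointer (after its read barrier) also observes the initialized fields. A rough userspace analogue, deliberately using C11 release/acquire atomics in place of smp_wmb()/smp_rmb(); the types and names here are illustrative, not the kernel's:

#include <stdatomic.h>
#include <stddef.h>

struct mapping {
	struct mapping *next;
	unsigned int priority;
	unsigned int vlan_qos;
};

static _Atomic(struct mapping *) head;

/* Writer: fill in the node, then publish it with release semantics
 * (the counterpart of the smp_wmb() before the pointer store). */
static void publish(struct mapping *np, unsigned int prio, unsigned int qos)
{
	np->priority = prio;
	np->vlan_qos = qos;
	atomic_store_explicit(&head, np, memory_order_release);
}

/* Reader: the acquire load pairs with the release store above (the
 * counterpart of the smp_rmb() on the lookup side), guaranteeing the
 * fields read below are the ones the writer initialized. */
static unsigned int lookup_qos(unsigned int prio)
{
	struct mapping *mp = atomic_load_explicit(&head, memory_order_acquire);

	for (; mp; mp = mp->next)
		if (mp->priority == prio)
			return mp->vlan_qos;
	return 0;
}

int main(void)
{
	static struct mapping node;	/* next is zero-initialized */

	publish(&node, 5, 0xa000);
	return lookup_qos(5) == 0xa000 ? 0 : 1;
}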
diff --git a/net/core/dev.c b/net/core/dev.c
index a3d8d44cb7f4..26755dd40daa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3580,8 +3580,15 @@ ncls:
 		}
 	}
 
-	if (vlan_tx_nonzero_tag_present(skb))
-		skb->pkt_type = PACKET_OTHERHOST;
+	if (unlikely(vlan_tx_tag_present(skb))) {
+		if (vlan_tx_tag_get_id(skb))
+			skb->pkt_type = PACKET_OTHERHOST;
+		/* Note: we might in the future use prio bits
+		 * and set skb->priority like in vlan_do_receive()
+		 * For the time being, just ignore Priority Code Point
+		 */
+		skb->vlan_tci = 0;
+	}
 
 	/* deliver only exact match when indicated */
 	null_or_dev = deliver_exact ? skb->dev : NULL;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index ab5fa6336c84..78e9d9223e40 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -279,11 +279,16 @@ static u32 __ethtool_get_flags(struct net_device *dev)
 {
 	u32 flags = 0;
 
-	if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO;
-	if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) flags |= ETH_FLAG_RXVLAN;
-	if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) flags |= ETH_FLAG_TXVLAN;
-	if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE;
-	if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH;
+	if (dev->features & NETIF_F_LRO)
+		flags |= ETH_FLAG_LRO;
+	if (dev->features & NETIF_F_HW_VLAN_CTAG_RX)
+		flags |= ETH_FLAG_RXVLAN;
+	if (dev->features & NETIF_F_HW_VLAN_CTAG_TX)
+		flags |= ETH_FLAG_TXVLAN;
+	if (dev->features & NETIF_F_NTUPLE)
+		flags |= ETH_FLAG_NTUPLE;
+	if (dev->features & NETIF_F_RXHASH)
+		flags |= ETH_FLAG_RXHASH;
 
 	return flags;
 }
@@ -295,11 +300,16 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data)
 	if (data & ~ETH_ALL_FLAGS)
 		return -EINVAL;
 
-	if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO;
-	if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_CTAG_RX;
-	if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_CTAG_TX;
-	if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE;
-	if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH;
+	if (data & ETH_FLAG_LRO)
+		features |= NETIF_F_LRO;
+	if (data & ETH_FLAG_RXVLAN)
+		features |= NETIF_F_HW_VLAN_CTAG_RX;
+	if (data & ETH_FLAG_TXVLAN)
+		features |= NETIF_F_HW_VLAN_CTAG_TX;
+	if (data & ETH_FLAG_NTUPLE)
+		features |= NETIF_F_NTUPLE;
+	if (data & ETH_FLAG_RXHASH)
+		features |= NETIF_F_RXHASH;
 
 	/* allow changing only bits set in hw_features */
 	changed = (features ^ dev->features) & ETH_ALL_FEATURES;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 5359560926bc..be1f64d35358 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -401,27 +401,8 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
 }
 EXPORT_SYMBOL(alloc_etherdev_mqs);
 
-static size_t _format_mac_addr(char *buf, int buflen,
-			       const unsigned char *addr, int len)
-{
-	int i;
-	char *cp = buf;
-
-	for (i = 0; i < len; i++) {
-		cp += scnprintf(cp, buflen - (cp - buf), "%02x", addr[i]);
-		if (i == len - 1)
-			break;
-		cp += scnprintf(cp, buflen - (cp - buf), ":");
-	}
-	return cp - buf;
-}
-
 ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
 {
-	size_t l;
-
-	l = _format_mac_addr(buf, PAGE_SIZE, addr, len);
-	l += scnprintf(buf + l, PAGE_SIZE - l, "\n");
-	return (ssize_t)l;
+	return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr);
 }
 EXPORT_SYMBOL(sysfs_format_mac);
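The eth.c cleanup works because the kernel's vsprintf implements a "%*ph" extension family for printing small hex buffers, where the 'C' suffix selects ':' as the byte separator, so "%*phC" with a length of 6 yields exactly the canonical MAC format. Standard C has no such specifier; a userspace equivalent of what the deleted helper did (and what the single scnprintf() now does) might look like this sketch:

#include <stdio.h>

/* Userspace stand-in for the kernel's "%*phC" specifier: print "len"
 * bytes as lowercase hex, colon-separated, followed by a newline. */
static int format_mac(char *buf, size_t buflen,
		      const unsigned char *addr, int len)
{
	int i, n = 0;

	for (i = 0; i < len; i++)
		n += snprintf(buf + n, buflen - n, "%02x%c",
			      addr[i], i == len - 1 ? '\n' : ':');
	return n;
}

int main(void)
{
	unsigned char mac[6] = { 0x00, 0x1b, 0x21, 0xab, 0xcd, 0xef };
	char buf[32];

	format_mac(buf, sizeof(buf), mac, 6);
	fputs(buf, stdout);	/* prints 00:1b:21:ab:cd:ef */
	return 0;
}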
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3da817b89e9b..15e3e683adec 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -190,10 +190,7 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 
-	__skb_pull(skb, ip_hdrlen(skb));
-
-	/* Point into the IP datagram, just past the header. */
-	skb_reset_transport_header(skb);
+	__skb_pull(skb, skb_network_header_len(skb));
 
 	rcu_read_lock();
 	{
@@ -437,6 +434,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 		goto drop;
 	}
 
+	skb->transport_header = skb->network_header + iph->ihl*4;
+
 	/* Remove any debris in the socket control block */
 	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
 
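The ip_input.c change can set the transport header this early because IPv4's IHL field gives the header length in 32-bit words, so the transport header always begins iph->ihl * 4 bytes past the network header, options included; computing it in ip_rcv() means early socket demux sees a valid transport header, matching what IPv6 already did. A tiny standalone illustration of the arithmetic (the struct here abbreviates the real iphdr to its first byte):

#include <stdint.h>
#include <stdio.h>

/* First byte of an IPv4 header: version in the high nibble, IHL (in
 * 32-bit words) in the low nibble. */
struct iphdr_start {
	uint8_t version_ihl;
};

int main(void)
{
	/* 0x45 = IPv4 with IHL 5 -> a 20-byte header, no options */
	struct iphdr_start iph = { .version_ihl = 0x45 };
	unsigned int ihl = iph.version_ihl & 0x0f;

	/* Same arithmetic as the patch: the transport header begins
	 * ihl * 4 bytes past the network header. */
	printf("transport offset = %u bytes\n", ihl * 4);
	return 0;
}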
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b3b5730b48c5..24c03396e008 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -479,7 +479,7 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
 	if (ifp) {
 		src_addr = solicited_addr;
 		if (ifp->flags & IFA_F_OPTIMISTIC)
-			override = 0;
+			override = false;
 		inc_opt |= ifp->idev->cnf.force_tllao;
 		in6_ifa_put(ifp);
 	} else {
@@ -557,7 +557,7 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
 	}
 
 	if (ipv6_addr_any(saddr))
-		inc_opt = 0;
+		inc_opt = false;
 	if (inc_opt)
 		optlen += ndisc_opt_addr_space(dev);
 
@@ -790,7 +790,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 	    (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) {
 		if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
 		    skb->pkt_type != PACKET_HOST &&
-		    inc != 0 &&
+		    inc &&
 		    idev->nd_parms->proxy_delay != 0) {
 			/*
 			 * for anycast or proxy,
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index d14152e866d9..ffcec225b5d9 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -44,12 +44,12 @@ static int irlan_eth_open(struct net_device *dev);
 static int irlan_eth_close(struct net_device *dev);
 static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
 				  struct net_device *dev);
-static void irlan_eth_set_multicast_list( struct net_device *dev);
+static void irlan_eth_set_multicast_list(struct net_device *dev);
 
 static const struct net_device_ops irlan_eth_netdev_ops = {
 	.ndo_open		= irlan_eth_open,
 	.ndo_stop		= irlan_eth_close,
 	.ndo_start_xmit		= irlan_eth_xmit,
 	.ndo_set_rx_mode	= irlan_eth_set_multicast_list,
 	.ndo_change_mtu		= eth_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
@@ -110,7 +110,7 @@ static int irlan_eth_open(struct net_device *dev)
 {
 	struct irlan_cb *self = netdev_priv(dev);
 
-	IRDA_DEBUG(2, "%s()\n", __func__ );
+	IRDA_DEBUG(2, "%s()\n", __func__);
 
 	/* Ready to play! */
 	netif_stop_queue(dev); /* Wait until data link is ready */
@@ -137,7 +137,7 @@ static int irlan_eth_close(struct net_device *dev)
 {
 	struct irlan_cb *self = netdev_priv(dev);
 
-	IRDA_DEBUG(2, "%s()\n", __func__ );
+	IRDA_DEBUG(2, "%s()\n", __func__);
 
 	/* Stop device */
 	netif_stop_queue(dev);
@@ -310,35 +310,32 @@ static void irlan_eth_set_multicast_list(struct net_device *dev)
 {
 	struct irlan_cb *self = netdev_priv(dev);
 
-	IRDA_DEBUG(2, "%s()\n", __func__ );
+	IRDA_DEBUG(2, "%s()\n", __func__);
 
 	/* Check if data channel has been connected yet */
 	if (self->client.state != IRLAN_DATA) {
-		IRDA_DEBUG(1, "%s(), delaying!\n", __func__ );
+		IRDA_DEBUG(1, "%s(), delaying!\n", __func__);
 		return;
 	}
 
 	if (dev->flags & IFF_PROMISC) {
 		/* Enable promiscuous mode */
 		IRDA_WARNING("Promiscuous mode not implemented by IrLAN!\n");
-	}
-	else if ((dev->flags & IFF_ALLMULTI) ||
+	} else if ((dev->flags & IFF_ALLMULTI) ||
 		 netdev_mc_count(dev) > HW_MAX_ADDRS) {
 		/* Disable promiscuous mode, use normal mode. */
-		IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __func__ );
+		IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __func__);
 		/* hardware_set_filter(NULL); */
 
 		irlan_set_multicast_filter(self, TRUE);
-	}
-	else if (!netdev_mc_empty(dev)) {
-		IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __func__ );
+	} else if (!netdev_mc_empty(dev)) {
+		IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __func__);
 		/* Walk the address list, and load the filter */
 		/* hardware_set_filter(dev->mc_list); */
 
 		irlan_set_multicast_filter(self, TRUE);
-	}
-	else {
-		IRDA_DEBUG(4, "%s(), Clearing multicast filter\n", __func__ );
+	} else {
+		IRDA_DEBUG(4, "%s(), Clearing multicast filter\n", __func__);
 		irlan_set_multicast_filter(self, FALSE);
 	}
 
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index a7ab323849b6..8056fb4e618a 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -113,7 +113,6 @@
 
 #define FRAC_BITS		30	/* fixed point arithmetic */
 #define ONE_FP			(1UL << FRAC_BITS)
-#define IWSUM			(ONE_FP/QFQ_MAX_WSUM)
 
 #define QFQ_MTU_SHIFT		16	/* to support TSO/GSO */
 #define QFQ_MIN_LMAX		512	/* see qfq_slot_insert */
@@ -189,6 +188,7 @@ struct qfq_sched {
 	struct qfq_aggregate	*in_serv_agg;	/* Aggregate being served. */
 	u32			num_active_agg;	/* Num. of active aggregates */
 	u32			wsum;		/* weight sum */
+	u32			iwsum;		/* inverse weight sum */
 
 	unsigned long bitmaps[QFQ_MAX_STATE];	    /* Group bitmaps. */
 	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
@@ -314,6 +314,7 @@ static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg,
 
 	q->wsum +=
 		(int) agg->class_weight * (new_num_classes - agg->num_classes);
+	q->iwsum = ONE_FP / q->wsum;
 
 	agg->num_classes = new_num_classes;
 }
@@ -340,6 +341,10 @@ static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg)
 {
 	if (!hlist_unhashed(&agg->nonfull_next))
 		hlist_del_init(&agg->nonfull_next);
+	q->wsum -= agg->class_weight;
+	if (q->wsum != 0)
+		q->iwsum = ONE_FP / q->wsum;
+
 	if (q->in_serv_agg == agg)
 		q->in_serv_agg = qfq_choose_next_agg(q);
 	kfree(agg);
@@ -834,38 +839,60 @@ static void qfq_make_eligible(struct qfq_sched *q)
 	}
 }
 
-
 /*
- * The index of the slot in which the aggregate is to be inserted must
- * not be higher than QFQ_MAX_SLOTS-2. There is a '-2' and not a '-1'
- * because the start time of the group may be moved backward by one
- * slot after the aggregate has been inserted, and this would cause
- * non-empty slots to be right-shifted by one position.
+ * The index of the slot in which the input aggregate agg is to be
+ * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
+ * and not a '-1' because the start time of the group may be moved
+ * backward by one slot after the aggregate has been inserted, and
+ * this would cause non-empty slots to be right-shifted by one
+ * position.
+ *
+ * QFQ+ fully satisfies this bound to the slot index if the parameters
+ * of the classes are not changed dynamically, and if QFQ+ never
+ * happens to postpone the service of agg unjustly, i.e., it never
+ * happens that the aggregate becomes backlogged and eligible, or just
+ * eligible, while an aggregate with a higher approximated finish time
+ * is being served. In particular, in this case QFQ+ guarantees that
+ * the timestamps of agg are low enough that the slot index is never
+ * higher than 2. Unfortunately, QFQ+ cannot provide the same
+ * guarantee if it happens to unjustly postpone the service of agg, or
+ * if the parameters of some class are changed.
+ *
+ * As for the first event, i.e., an out-of-order service, the
+ * upper bound to the slot index guaranteed by QFQ+ grows to
+ * 2 +
+ * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
+ * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
  *
- * If the weight and lmax (max_pkt_size) of the classes do not change,
- * then QFQ+ does meet the above contraint according to the current
- * values of its parameters. In fact, if the weight and lmax of the
- * classes do not change, then, from the theory, QFQ+ guarantees that
- * the slot index is never higher than
- * 2 + QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
- * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM) = 2 + 8 * 128 * (1 / 64) = 18
+ * The following function deals with this problem by backward-shifting
+ * the timestamps of agg, if needed, so as to guarantee that the slot
+ * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
+ * cause the service of other aggregates to be postponed, yet the
+ * worst-case guarantees of these aggregates are not violated. In
+ * fact, in case of no out-of-order service, the timestamps of agg
+ * would have been even lower than they are after the backward shift,
+ * because QFQ+ would have guaranteed a maximum value equal to 2 for
+ * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
+ * service is postponed because of the backward-shift would have
+ * however waited for the service of agg before being served.
  *
- * When the weight of a class is increased or the lmax of the class is
- * decreased, a new aggregate with smaller slot size than the original
- * parent aggregate of the class may happen to be activated. The
- * activation of this aggregate should be properly delayed to when the
- * service of the class has finished in the ideal system tracked by
- * QFQ+. If the activation of the aggregate is not delayed to this
- * reference time instant, then this aggregate may be unjustly served
- * before other aggregates waiting for service. This may cause the
- * above bound to the slot index to be violated for some of these
- * unlucky aggregates.
+ * The other event that may cause the slot index to be higher than 2
+ * for agg is a recent change of the parameters of some class. If the
+ * weight of a class is increased or the lmax (max_pkt_size) of the
+ * class is decreased, then a new aggregate with smaller slot size
+ * than the original parent aggregate of the class may happen to be
+ * activated. The activation of this aggregate should be properly
+ * delayed to when the service of the class has finished in the ideal
+ * system tracked by QFQ+. If the activation of the aggregate is not
+ * delayed to this reference time instant, then this aggregate may be
+ * unjustly served before other aggregates waiting for service. This
+ * may cause the above bound to the slot index to be violated for some
+ * of these unlucky aggregates.
  *
  * Instead of delaying the activation of the new aggregate, which is
- * quite complex, the following inaccurate but simple solution is used:
- * if the slot index is higher than QFQ_MAX_SLOTS-2, then the
- * timestamps of the aggregate are shifted backward so as to let the
- * slot index become equal to QFQ_MAX_SLOTS-2.
+ * quite complex, the above-discussed capping of the slot index is
+ * used to handle also the consequences of a change of the parameters
+ * of a class.
  */
 static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg,
 			    u64 roundedS)
@@ -1136,7 +1163,7 @@ static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
 	else
 		in_serv_agg->budget -= len;
 
-	q->V += (u64)len * IWSUM;
+	q->V += (u64)len * q->iwsum;
 	pr_debug("qfq dequeue: len %u F %lld now %lld\n",
 		 len, (unsigned long long) in_serv_agg->F,
 		 (unsigned long long) q->V);
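The net effect of replacing the compile-time IWSUM with the per-qdisc q->iwsum is that the virtual time V now advances in proportion to the actual inverse weight sum rather than the worst-case ONE_FP/QFQ_MAX_WSUM, which is what removed the extra packet delay/jitter. A toy calculation with the same fixed-point scheme; the constants mirror sch_qfq.c (QFQ_MAX_WSUM expands to 64 * 1024 there) and the two-class scenario is made up for illustration:

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS	30			/* fixed point arithmetic */
#define ONE_FP		(1UL << FRAC_BITS)
#define QFQ_MAX_WSUM	(64 * 1024)		/* worst-case weight sum */

int main(void)
{
	unsigned long wsum = 4;			/* two classes of weight 2 */
	unsigned long iwsum = ONE_FP / wsum;	/* what the patch computes */
	unsigned long iwsum_old = ONE_FP / QFQ_MAX_WSUM; /* old IWSUM */
	uint64_t len = 1500;			/* one dequeued packet */

	/* With the real weight sum, V advances ~16384x faster here, so
	 * the in-service aggregate's finish time is reached without the
	 * artificial delay the fixed IWSUM introduced. */
	printf("dV new = %llu, dV old = %llu\n",
	       (unsigned long long)(len * iwsum),
	       (unsigned long long)(len * iwsum_old));
	return 0;
}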