author     Linus Torvalds <torvalds@linux-foundation.org>  2014-10-08 21:40:54 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-10-08 21:40:54 -0400
commit     35a9ad8af0bb0fa3525e6d0d20e32551d226f38e (patch)
tree       15b4b33206818886d9cff371fd2163e073b70568 /net/core
parent     d5935b07da53f74726e2a65dd4281d0f2c70e5d4 (diff)
parent     64b1f00a0830e1c53874067273a096b228d83d36 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
 "Most notable changes in here:

  1) By far the biggest accomplishment, thanks to a large range of contributors, is the addition of multi-send for transmit. This is the result of discussions back in Chicago, and the hard work of several individuals.

     Now, when the ->ndo_start_xmit() method of a driver sees skb->xmit_more as true, it can choose to defer the doorbell that tells the device to start processing the new TX queue entries. skb->xmit_more means that the generic networking code is guaranteed to call the driver immediately with another SKB to send.

     There is logic added to the qdisc layer to dequeue multiple packets at a time, and the handling of mis-predicted offloads in software is now done with no locks held. Finally, pktgen is extended with a "burst" parameter that can be used to test a multi-send implementation.

     Several drivers have xmit_more support: i40e, igb, ixgbe, mlx4, virtio_net. Adding support is almost trivial, so expect more drivers to support this optimization soon.

     I want to thank, in no particular or implied order, Jesper Dangaard Brouer, Eric Dumazet, Alexander Duyck, Tom Herbert, Jamal Hadi Salim, John Fastabend, Florian Westphal, Daniel Borkmann, David Tat, Hannes Frederic Sowa, and Rusty Russell.

  2) PTP and timestamping support in bnx2x, from Michal Kalderon.

  3) Allow adjusting the rx_copybreak threshold for a driver via ethtool, and add rx_copybreak support to the enic driver. From Govindarajulu Varadarajan.

  4) Significant enhancements to the generic PHY layer and the bcm7xxx driver in particular (EEE support, auto power down, etc.), from Florian Fainelli.

  5) Allow raw buffers to be used for flow dissection, allowing drivers to determine the optimal "linear pull" size for devices that DMA into pools of pages. The objective is to get exactly the necessary amount of headers into the linear SKB area pre-pulled, but no more. The new interface drivers use is eth_get_headlen(). From WANG Cong, with driver conversions (several had their own by-hand duplicated implementations) by Alexander Duyck and Eric Dumazet.

  6) Support checksumming more smoothly and efficiently for encapsulations, and add a "foo over UDP" facility. From Tom Herbert.

  7) Add Broadcom SF2 switch driver to the DSA layer, from Florian Fainelli.

  8) eBPF can now load programs via a system call and has an extensive test suite. From Alexei Starovoitov and Daniel Borkmann.

  9) Major overhaul of the packet scheduler to use RCU in several major areas such as the classifiers and rate estimators. From John Fastabend.

 10) Add driver for the Intel FM10000 Ethernet Switch, from Alexander Duyck.

 11) Rearrange TCP_SKB_CB() to reduce cache line misses, from Eric Dumazet.

 12) Add Datacenter TCP congestion control algorithm support, from Florian Westphal.

 13) Reorganize sk_buff so that __copy_skb_header() is significantly faster. From Eric Dumazet"

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1558 commits)
  netlabel: directly return netlbl_unlabel_genl_init()
  net: add netdev_txq_bql_{enqueue, complete}_prefetchw() helpers
  net: description of dma_cookie cause make xmldocs warning
  cxgb4: clean up a type issue
  cxgb4: potential shift wrapping bug
  i40e: skb->xmit_more support
  net: fs_enet: Add NAPI TX
  net: fs_enet: Remove non NAPI RX
  r8169: add support for RTL8168EP
  net_sched: copy exts->type in tcf_exts_change()
  wimax: convert printk to pr_foo()
  af_unix: remove 0 assignment on static
  ipv6: Do not warn for informational ICMP messages, regardless of type.
  Update Intel Ethernet Driver maintainers list
  bridge: Save frag_max_size between PRE_ROUTING and POST_ROUTING
  tipc: fix bug in multicast congestion handling
  net: better IFF_XMIT_DST_RELEASE support
  net/mlx4_en: remove NETDEV_TX_BUSY
  3c59x: fix bad split of cpu_to_le32(pci_map_single())
  net: bcmgenet: fix Tx ring priority programming
  ...
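
To make item 1 above concrete, here is a minimal sketch of the xmit_more pattern in a driver's ->ndo_start_xmit(). The my_* ring and doorbell helpers are hypothetical placeholders and not part of this series; only skb->xmit_more, netif_xmit_stopped() and the netdev_tx_t return value are real kernel interfaces touched by this merge.

static netdev_tx_t my_ndo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* Hypothetical ring lookup and descriptor posting; no MMIO yet. */
	struct my_tx_ring *ring = my_pick_tx_ring(dev, skb);

	my_post_tx_descriptors(ring, skb);

	/* Ring the doorbell only when the stack promises no follow-up packet
	 * (skb->xmit_more is false) or the queue has just stopped; otherwise
	 * the expensive MMIO write is deferred and batched with later packets.
	 */
	if (!skb->xmit_more || netif_xmit_stopped(my_ring_to_txq(ring)))
		my_ring_tx_doorbell(ring);

	return NETDEV_TX_OK;
}

The payoff is that the doorbell write is amortized over the burst of packets the qdisc layer now dequeues at once, which is also what the new pktgen "burst" parameter is meant to exercise.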
Diffstat (limited to 'net/core')
-rw-r--r--  net/core/dev.c             | 459
-rw-r--r--  net/core/dev_ioctl.c       |   7
-rw-r--r--  net/core/ethtool.c         |  82
-rw-r--r--  net/core/filter.c          |  45
-rw-r--r--  net/core/flow_dissector.c  | 115
-rw-r--r--  net/core/gen_estimator.c   |  29
-rw-r--r--  net/core/gen_stats.c       | 112
-rw-r--r--  net/core/net_namespace.c   |   2
-rw-r--r--  net/core/netpoll.c         |   7
-rw-r--r--  net/core/pktgen.c          |  76
-rw-r--r--  net/core/rtnetlink.c       |  66
-rw-r--r--  net/core/secure_seq.c      |   6
-rw-r--r--  net/core/skbuff.c          | 395
-rw-r--r--  net/core/sock.c            | 110
-rw-r--r--  net/core/timestamping.c    |  43
-rw-r--r--  net/core/utils.c           |  12
16 files changed, 969 insertions(+), 597 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 130d64220229..4699dcfdc4ab 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -897,23 +897,25 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
897EXPORT_SYMBOL(dev_getfirstbyhwtype); 897EXPORT_SYMBOL(dev_getfirstbyhwtype);
898 898
899/** 899/**
900 * dev_get_by_flags_rcu - find any device with given flags 900 * __dev_get_by_flags - find any device with given flags
901 * @net: the applicable net namespace 901 * @net: the applicable net namespace
902 * @if_flags: IFF_* values 902 * @if_flags: IFF_* values
903 * @mask: bitmask of bits in if_flags to check 903 * @mask: bitmask of bits in if_flags to check
904 * 904 *
905 * Search for any interface with the given flags. Returns NULL if a device 905 * Search for any interface with the given flags. Returns NULL if a device
906 * is not found or a pointer to the device. Must be called inside 906 * is not found or a pointer to the device. Must be called inside
907 * rcu_read_lock(), and result refcount is unchanged. 907 * rtnl_lock(), and result refcount is unchanged.
908 */ 908 */
909 909
910struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, 910struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
911 unsigned short mask) 911 unsigned short mask)
912{ 912{
913 struct net_device *dev, *ret; 913 struct net_device *dev, *ret;
914 914
915 ASSERT_RTNL();
916
915 ret = NULL; 917 ret = NULL;
916 for_each_netdev_rcu(net, dev) { 918 for_each_netdev(net, dev) {
917 if (((dev->flags ^ if_flags) & mask) == 0) { 919 if (((dev->flags ^ if_flags) & mask) == 0) {
918 ret = dev; 920 ret = dev;
919 break; 921 break;
@@ -921,7 +923,7 @@ struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags
921 } 923 }
922 return ret; 924 return ret;
923} 925}
924EXPORT_SYMBOL(dev_get_by_flags_rcu); 926EXPORT_SYMBOL(__dev_get_by_flags);
925 927
926/** 928/**
927 * dev_valid_name - check if name is okay for network device 929 * dev_valid_name - check if name is okay for network device
@@ -2175,6 +2177,53 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2175 return (struct dev_kfree_skb_cb *)skb->cb; 2177 return (struct dev_kfree_skb_cb *)skb->cb;
2176} 2178}
2177 2179
2180void netif_schedule_queue(struct netdev_queue *txq)
2181{
2182 rcu_read_lock();
2183 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2184 struct Qdisc *q = rcu_dereference(txq->qdisc);
2185
2186 __netif_schedule(q);
2187 }
2188 rcu_read_unlock();
2189}
2190EXPORT_SYMBOL(netif_schedule_queue);
2191
2192/**
2193 * netif_wake_subqueue - allow sending packets on subqueue
2194 * @dev: network device
2195 * @queue_index: sub queue index
2196 *
2197 * Resume individual transmit queue of a device with multiple transmit queues.
2198 */
2199void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2200{
2201 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2202
2203 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2204 struct Qdisc *q;
2205
2206 rcu_read_lock();
2207 q = rcu_dereference(txq->qdisc);
2208 __netif_schedule(q);
2209 rcu_read_unlock();
2210 }
2211}
2212EXPORT_SYMBOL(netif_wake_subqueue);
2213
2214void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2215{
2216 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2217 struct Qdisc *q;
2218
2219 rcu_read_lock();
2220 q = rcu_dereference(dev_queue->qdisc);
2221 __netif_schedule(q);
2222 rcu_read_unlock();
2223 }
2224}
2225EXPORT_SYMBOL(netif_tx_wake_queue);
2226
2178void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) 2227void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2179{ 2228{
2180 unsigned long flags; 2229 unsigned long flags;
@@ -2371,16 +2420,6 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2371 rcu_read_lock(); 2420 rcu_read_lock();
2372 list_for_each_entry_rcu(ptype, &offload_base, list) { 2421 list_for_each_entry_rcu(ptype, &offload_base, list) {
2373 if (ptype->type == type && ptype->callbacks.gso_segment) { 2422 if (ptype->type == type && ptype->callbacks.gso_segment) {
2374 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2375 int err;
2376
2377 err = ptype->callbacks.gso_send_check(skb);
2378 segs = ERR_PTR(err);
2379 if (err || skb_gso_ok(skb, features))
2380 break;
2381 __skb_push(skb, (skb->data -
2382 skb_network_header(skb)));
2383 }
2384 segs = ptype->callbacks.gso_segment(skb, features); 2423 segs = ptype->callbacks.gso_segment(skb, features);
2385 break; 2424 break;
2386 } 2425 }
@@ -2483,52 +2522,6 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2483 return 0; 2522 return 0;
2484} 2523}
2485 2524
2486struct dev_gso_cb {
2487 void (*destructor)(struct sk_buff *skb);
2488};
2489
2490#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2491
2492static void dev_gso_skb_destructor(struct sk_buff *skb)
2493{
2494 struct dev_gso_cb *cb;
2495
2496 kfree_skb_list(skb->next);
2497 skb->next = NULL;
2498
2499 cb = DEV_GSO_CB(skb);
2500 if (cb->destructor)
2501 cb->destructor(skb);
2502}
2503
2504/**
2505 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2506 * @skb: buffer to segment
2507 * @features: device features as applicable to this skb
2508 *
2509 * This function segments the given skb and stores the list of segments
2510 * in skb->next.
2511 */
2512static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2513{
2514 struct sk_buff *segs;
2515
2516 segs = skb_gso_segment(skb, features);
2517
2518 /* Verifying header integrity only. */
2519 if (!segs)
2520 return 0;
2521
2522 if (IS_ERR(segs))
2523 return PTR_ERR(segs);
2524
2525 skb->next = segs;
2526 DEV_GSO_CB(skb)->destructor = skb->destructor;
2527 skb->destructor = dev_gso_skb_destructor;
2528
2529 return 0;
2530}
2531
2532/* If MPLS offload request, verify we are testing hardware MPLS features 2525/* If MPLS offload request, verify we are testing hardware MPLS features
2533 * instead of standard features for the netdev. 2526 * instead of standard features for the netdev.
2534 */ 2527 */
@@ -2572,10 +2565,12 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
2572 2565
2573netdev_features_t netif_skb_features(struct sk_buff *skb) 2566netdev_features_t netif_skb_features(struct sk_buff *skb)
2574{ 2567{
2568 const struct net_device *dev = skb->dev;
2569 netdev_features_t features = dev->features;
2570 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2575 __be16 protocol = skb->protocol; 2571 __be16 protocol = skb->protocol;
2576 netdev_features_t features = skb->dev->features;
2577 2572
2578 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) 2573 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2579 features &= ~NETIF_F_GSO_MASK; 2574 features &= ~NETIF_F_GSO_MASK;
2580 2575
2581 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { 2576 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
@@ -2586,7 +2581,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
2586 } 2581 }
2587 2582
2588 features = netdev_intersect_features(features, 2583 features = netdev_intersect_features(features,
2589 skb->dev->vlan_features | 2584 dev->vlan_features |
2590 NETIF_F_HW_VLAN_CTAG_TX | 2585 NETIF_F_HW_VLAN_CTAG_TX |
2591 NETIF_F_HW_VLAN_STAG_TX); 2586 NETIF_F_HW_VLAN_STAG_TX);
2592 2587
@@ -2603,119 +2598,149 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
2603} 2598}
2604EXPORT_SYMBOL(netif_skb_features); 2599EXPORT_SYMBOL(netif_skb_features);
2605 2600
2606int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, 2601static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2607 struct netdev_queue *txq) 2602 struct netdev_queue *txq, bool more)
2608{ 2603{
2609 const struct net_device_ops *ops = dev->netdev_ops; 2604 unsigned int len;
2610 int rc = NETDEV_TX_OK; 2605 int rc;
2611 unsigned int skb_len;
2612 2606
2613 if (likely(!skb->next)) { 2607 if (!list_empty(&ptype_all))
2614 netdev_features_t features; 2608 dev_queue_xmit_nit(skb, dev);
2615 2609
2616 /* 2610 len = skb->len;
2617 * If device doesn't need skb->dst, release it right now while 2611 trace_net_dev_start_xmit(skb, dev);
2618 * its hot in this cpu cache 2612 rc = netdev_start_xmit(skb, dev, txq, more);
2619 */ 2613 trace_net_dev_xmit(skb, rc, dev, len);
2620 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2621 skb_dst_drop(skb);
2622 2614
2623 features = netif_skb_features(skb); 2615 return rc;
2624 2616}
2625 if (vlan_tx_tag_present(skb) &&
2626 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2627 skb = __vlan_put_tag(skb, skb->vlan_proto,
2628 vlan_tx_tag_get(skb));
2629 if (unlikely(!skb))
2630 goto out;
2631 2617
2632 skb->vlan_tci = 0; 2618struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2633 } 2619 struct netdev_queue *txq, int *ret)
2620{
2621 struct sk_buff *skb = first;
2622 int rc = NETDEV_TX_OK;
2634 2623
2635 /* If encapsulation offload request, verify we are testing 2624 while (skb) {
2636 * hardware encapsulation features instead of standard 2625 struct sk_buff *next = skb->next;
2637 * features for the netdev
2638 */
2639 if (skb->encapsulation)
2640 features &= dev->hw_enc_features;
2641 2626
2642 if (netif_needs_gso(skb, features)) { 2627 skb->next = NULL;
2643 if (unlikely(dev_gso_segment(skb, features))) 2628 rc = xmit_one(skb, dev, txq, next != NULL);
2644 goto out_kfree_skb; 2629 if (unlikely(!dev_xmit_complete(rc))) {
2645 if (skb->next) 2630 skb->next = next;
2646 goto gso; 2631 goto out;
2647 } else { 2632 }
2648 if (skb_needs_linearize(skb, features) &&
2649 __skb_linearize(skb))
2650 goto out_kfree_skb;
2651 2633
2652 /* If packet is not checksummed and device does not 2634 skb = next;
2653 * support checksumming for this protocol, complete 2635 if (netif_xmit_stopped(txq) && skb) {
2654 * checksumming here. 2636 rc = NETDEV_TX_BUSY;
2655 */ 2637 break;
2656 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2657 if (skb->encapsulation)
2658 skb_set_inner_transport_header(skb,
2659 skb_checksum_start_offset(skb));
2660 else
2661 skb_set_transport_header(skb,
2662 skb_checksum_start_offset(skb));
2663 if (!(features & NETIF_F_ALL_CSUM) &&
2664 skb_checksum_help(skb))
2665 goto out_kfree_skb;
2666 }
2667 } 2638 }
2639 }
2668 2640
2669 if (!list_empty(&ptype_all)) 2641out:
2670 dev_queue_xmit_nit(skb, dev); 2642 *ret = rc;
2643 return skb;
2644}
2671 2645
2672 skb_len = skb->len; 2646static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2673 trace_net_dev_start_xmit(skb, dev); 2647 netdev_features_t features)
2674 rc = ops->ndo_start_xmit(skb, dev); 2648{
2675 trace_net_dev_xmit(skb, rc, dev, skb_len); 2649 if (vlan_tx_tag_present(skb) &&
2676 if (rc == NETDEV_TX_OK) 2650 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2677 txq_trans_update(txq); 2651 skb = __vlan_put_tag(skb, skb->vlan_proto,
2678 return rc; 2652 vlan_tx_tag_get(skb));
2653 if (skb)
2654 skb->vlan_tci = 0;
2679 } 2655 }
2656 return skb;
2657}
2680 2658
2681gso: 2659static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2682 do { 2660{
2683 struct sk_buff *nskb = skb->next; 2661 netdev_features_t features;
2684 2662
2685 skb->next = nskb->next; 2663 if (skb->next)
2686 nskb->next = NULL; 2664 return skb;
2687 2665
2688 if (!list_empty(&ptype_all)) 2666 features = netif_skb_features(skb);
2689 dev_queue_xmit_nit(nskb, dev); 2667 skb = validate_xmit_vlan(skb, features);
2690 2668 if (unlikely(!skb))
2691 skb_len = nskb->len; 2669 goto out_null;
2692 trace_net_dev_start_xmit(nskb, dev); 2670
2693 rc = ops->ndo_start_xmit(nskb, dev); 2671 /* If encapsulation offload request, verify we are testing
2694 trace_net_dev_xmit(nskb, rc, dev, skb_len); 2672 * hardware encapsulation features instead of standard
2695 if (unlikely(rc != NETDEV_TX_OK)) { 2673 * features for the netdev
2696 if (rc & ~NETDEV_TX_MASK) 2674 */
2697 goto out_kfree_gso_skb; 2675 if (skb->encapsulation)
2698 nskb->next = skb->next; 2676 features &= dev->hw_enc_features;
2699 skb->next = nskb; 2677
2700 return rc; 2678 if (netif_needs_gso(skb, features)) {
2679 struct sk_buff *segs;
2680
2681 segs = skb_gso_segment(skb, features);
2682 if (IS_ERR(segs)) {
2683 segs = NULL;
2684 } else if (segs) {
2685 consume_skb(skb);
2686 skb = segs;
2701 } 2687 }
2702 txq_trans_update(txq); 2688 } else {
2703 if (unlikely(netif_xmit_stopped(txq) && skb->next)) 2689 if (skb_needs_linearize(skb, features) &&
2704 return NETDEV_TX_BUSY; 2690 __skb_linearize(skb))
2705 } while (skb->next); 2691 goto out_kfree_skb;
2706 2692
2707out_kfree_gso_skb: 2693 /* If packet is not checksummed and device does not
2708 if (likely(skb->next == NULL)) { 2694 * support checksumming for this protocol, complete
2709 skb->destructor = DEV_GSO_CB(skb)->destructor; 2695 * checksumming here.
2710 consume_skb(skb); 2696 */
2711 return rc; 2697 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2698 if (skb->encapsulation)
2699 skb_set_inner_transport_header(skb,
2700 skb_checksum_start_offset(skb));
2701 else
2702 skb_set_transport_header(skb,
2703 skb_checksum_start_offset(skb));
2704 if (!(features & NETIF_F_ALL_CSUM) &&
2705 skb_checksum_help(skb))
2706 goto out_kfree_skb;
2707 }
2712 } 2708 }
2709
2710 return skb;
2711
2713out_kfree_skb: 2712out_kfree_skb:
2714 kfree_skb(skb); 2713 kfree_skb(skb);
2715out: 2714out_null:
2716 return rc; 2715 return NULL;
2716}
2717
2718struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2719{
2720 struct sk_buff *next, *head = NULL, *tail;
2721
2722 for (; skb != NULL; skb = next) {
2723 next = skb->next;
2724 skb->next = NULL;
2725
2726 /* in case skb wont be segmented, point to itself */
2727 skb->prev = skb;
2728
2729 skb = validate_xmit_skb(skb, dev);
2730 if (!skb)
2731 continue;
2732
2733 if (!head)
2734 head = skb;
2735 else
2736 tail->next = skb;
2737 /* If skb was segmented, skb->prev points to
2738 * the last segment. If not, it still contains skb.
2739 */
2740 tail = skb->prev;
2741 }
2742 return head;
2717} 2743}
2718EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2719 2744
2720static void qdisc_pkt_len_init(struct sk_buff *skb) 2745static void qdisc_pkt_len_init(struct sk_buff *skb)
2721{ 2746{
@@ -2778,12 +2803,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2778 * waiting to be sent out; and the qdisc is not running - 2803 * waiting to be sent out; and the qdisc is not running -
2779 * xmit the skb directly. 2804 * xmit the skb directly.
2780 */ 2805 */
2781 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2782 skb_dst_force(skb);
2783 2806
2784 qdisc_bstats_update(q, skb); 2807 qdisc_bstats_update(q, skb);
2785 2808
2786 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { 2809 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2787 if (unlikely(contended)) { 2810 if (unlikely(contended)) {
2788 spin_unlock(&q->busylock); 2811 spin_unlock(&q->busylock);
2789 contended = false; 2812 contended = false;
@@ -2794,7 +2817,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2794 2817
2795 rc = NET_XMIT_SUCCESS; 2818 rc = NET_XMIT_SUCCESS;
2796 } else { 2819 } else {
2797 skb_dst_force(skb);
2798 rc = q->enqueue(skb, q) & NET_XMIT_MASK; 2820 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2799 if (qdisc_run_begin(q)) { 2821 if (qdisc_run_begin(q)) {
2800 if (unlikely(contended)) { 2822 if (unlikely(contended)) {
@@ -2891,6 +2913,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2891 2913
2892 skb_update_prio(skb); 2914 skb_update_prio(skb);
2893 2915
2916 /* If device/qdisc don't need skb->dst, release it right now while
2917 * its hot in this cpu cache.
2918 */
2919 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2920 skb_dst_drop(skb);
2921 else
2922 skb_dst_force(skb);
2923
2894 txq = netdev_pick_tx(dev, skb, accel_priv); 2924 txq = netdev_pick_tx(dev, skb, accel_priv);
2895 q = rcu_dereference_bh(txq->qdisc); 2925 q = rcu_dereference_bh(txq->qdisc);
2896 2926
@@ -2923,11 +2953,15 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2923 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) 2953 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2924 goto recursion_alert; 2954 goto recursion_alert;
2925 2955
2956 skb = validate_xmit_skb(skb, dev);
2957 if (!skb)
2958 goto drop;
2959
2926 HARD_TX_LOCK(dev, txq, cpu); 2960 HARD_TX_LOCK(dev, txq, cpu);
2927 2961
2928 if (!netif_xmit_stopped(txq)) { 2962 if (!netif_xmit_stopped(txq)) {
2929 __this_cpu_inc(xmit_recursion); 2963 __this_cpu_inc(xmit_recursion);
2930 rc = dev_hard_start_xmit(skb, dev, txq); 2964 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2931 __this_cpu_dec(xmit_recursion); 2965 __this_cpu_dec(xmit_recursion);
2932 if (dev_xmit_complete(rc)) { 2966 if (dev_xmit_complete(rc)) {
2933 HARD_TX_UNLOCK(dev, txq); 2967 HARD_TX_UNLOCK(dev, txq);
@@ -2948,10 +2982,11 @@ recursion_alert:
2948 } 2982 }
2949 2983
2950 rc = -ENETDOWN; 2984 rc = -ENETDOWN;
2985drop:
2951 rcu_read_unlock_bh(); 2986 rcu_read_unlock_bh();
2952 2987
2953 atomic_long_inc(&dev->tx_dropped); 2988 atomic_long_inc(&dev->tx_dropped);
2954 kfree_skb(skb); 2989 kfree_skb_list(skb);
2955 return rc; 2990 return rc;
2956out: 2991out:
2957 rcu_read_unlock_bh(); 2992 rcu_read_unlock_bh();
@@ -3128,8 +3163,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3128 } 3163 }
3129 3164
3130 if (map) { 3165 if (map) {
3131 tcpu = map->cpus[((u64) hash * map->len) >> 32]; 3166 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3132
3133 if (cpu_online(tcpu)) { 3167 if (cpu_online(tcpu)) {
3134 cpu = tcpu; 3168 cpu = tcpu;
3135 goto done; 3169 goto done;
@@ -3465,7 +3499,7 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3465 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); 3499 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3466 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 3500 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3467 3501
3468 q = rxq->qdisc; 3502 q = rcu_dereference(rxq->qdisc);
3469 if (q != &noop_qdisc) { 3503 if (q != &noop_qdisc) {
3470 spin_lock(qdisc_lock(q)); 3504 spin_lock(qdisc_lock(q));
3471 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) 3505 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
@@ -3482,7 +3516,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3482{ 3516{
3483 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); 3517 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3484 3518
3485 if (!rxq || rxq->qdisc == &noop_qdisc) 3519 if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3486 goto out; 3520 goto out;
3487 3521
3488 if (*pt_prev) { 3522 if (*pt_prev) {
@@ -3963,11 +3997,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
3963 if (!(skb->dev->features & NETIF_F_GRO)) 3997 if (!(skb->dev->features & NETIF_F_GRO))
3964 goto normal; 3998 goto normal;
3965 3999
3966 if (skb_is_gso(skb) || skb_has_frag_list(skb)) 4000 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
3967 goto normal; 4001 goto normal;
3968 4002
3969 gro_list_prepare(napi, skb); 4003 gro_list_prepare(napi, skb);
3970 NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
3971 4004
3972 rcu_read_lock(); 4005 rcu_read_lock();
3973 list_for_each_entry_rcu(ptype, head, list) { 4006 list_for_each_entry_rcu(ptype, head, list) {
@@ -3981,6 +4014,22 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
3981 NAPI_GRO_CB(skb)->free = 0; 4014 NAPI_GRO_CB(skb)->free = 0;
3982 NAPI_GRO_CB(skb)->udp_mark = 0; 4015 NAPI_GRO_CB(skb)->udp_mark = 0;
3983 4016
4017 /* Setup for GRO checksum validation */
4018 switch (skb->ip_summed) {
4019 case CHECKSUM_COMPLETE:
4020 NAPI_GRO_CB(skb)->csum = skb->csum;
4021 NAPI_GRO_CB(skb)->csum_valid = 1;
4022 NAPI_GRO_CB(skb)->csum_cnt = 0;
4023 break;
4024 case CHECKSUM_UNNECESSARY:
4025 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4026 NAPI_GRO_CB(skb)->csum_valid = 0;
4027 break;
4028 default:
4029 NAPI_GRO_CB(skb)->csum_cnt = 0;
4030 NAPI_GRO_CB(skb)->csum_valid = 0;
4031 }
4032
3984 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); 4033 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3985 break; 4034 break;
3986 } 4035 }
@@ -4210,6 +4259,31 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
4210} 4259}
4211EXPORT_SYMBOL(napi_gro_frags); 4260EXPORT_SYMBOL(napi_gro_frags);
4212 4261
4262/* Compute the checksum from gro_offset and return the folded value
4263 * after adding in any pseudo checksum.
4264 */
4265__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4266{
4267 __wsum wsum;
4268 __sum16 sum;
4269
4270 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4271
4272 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4273 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4274 if (likely(!sum)) {
4275 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4276 !skb->csum_complete_sw)
4277 netdev_rx_csum_fault(skb->dev);
4278 }
4279
4280 NAPI_GRO_CB(skb)->csum = wsum;
4281 NAPI_GRO_CB(skb)->csum_valid = 1;
4282
4283 return sum;
4284}
4285EXPORT_SYMBOL(__skb_gro_checksum_complete);
4286
4213/* 4287/*
4214 * net_rps_action_and_irq_enable sends any pending IPI's for rps. 4288 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4215 * Note: called with local irq disabled, but exits with local irq enabled. 4289 * Note: called with local irq disabled, but exits with local irq enabled.
@@ -6579,6 +6653,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6579 6653
6580 dev->gso_max_size = GSO_MAX_SIZE; 6654 dev->gso_max_size = GSO_MAX_SIZE;
6581 dev->gso_max_segs = GSO_MAX_SEGS; 6655 dev->gso_max_segs = GSO_MAX_SEGS;
6656 dev->gso_min_segs = 0;
6582 6657
6583 INIT_LIST_HEAD(&dev->napi_list); 6658 INIT_LIST_HEAD(&dev->napi_list);
6584 INIT_LIST_HEAD(&dev->unreg_list); 6659 INIT_LIST_HEAD(&dev->unreg_list);
@@ -6588,7 +6663,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6588 INIT_LIST_HEAD(&dev->adj_list.lower); 6663 INIT_LIST_HEAD(&dev->adj_list.lower);
6589 INIT_LIST_HEAD(&dev->all_adj_list.upper); 6664 INIT_LIST_HEAD(&dev->all_adj_list.upper);
6590 INIT_LIST_HEAD(&dev->all_adj_list.lower); 6665 INIT_LIST_HEAD(&dev->all_adj_list.lower);
6591 dev->priv_flags = IFF_XMIT_DST_RELEASE; 6666 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6592 setup(dev); 6667 setup(dev);
6593 6668
6594 dev->num_tx_queues = txqs; 6669 dev->num_tx_queues = txqs;
@@ -7010,53 +7085,45 @@ const char *netdev_drivername(const struct net_device *dev)
7010 return empty; 7085 return empty;
7011} 7086}
7012 7087
7013static int __netdev_printk(const char *level, const struct net_device *dev, 7088static void __netdev_printk(const char *level, const struct net_device *dev,
7014 struct va_format *vaf) 7089 struct va_format *vaf)
7015{ 7090{
7016 int r;
7017
7018 if (dev && dev->dev.parent) { 7091 if (dev && dev->dev.parent) {
7019 r = dev_printk_emit(level[1] - '0', 7092 dev_printk_emit(level[1] - '0',
7020 dev->dev.parent, 7093 dev->dev.parent,
7021 "%s %s %s%s: %pV", 7094 "%s %s %s%s: %pV",
7022 dev_driver_string(dev->dev.parent), 7095 dev_driver_string(dev->dev.parent),
7023 dev_name(dev->dev.parent), 7096 dev_name(dev->dev.parent),
7024 netdev_name(dev), netdev_reg_state(dev), 7097 netdev_name(dev), netdev_reg_state(dev),
7025 vaf); 7098 vaf);
7026 } else if (dev) { 7099 } else if (dev) {
7027 r = printk("%s%s%s: %pV", level, netdev_name(dev), 7100 printk("%s%s%s: %pV",
7028 netdev_reg_state(dev), vaf); 7101 level, netdev_name(dev), netdev_reg_state(dev), vaf);
7029 } else { 7102 } else {
7030 r = printk("%s(NULL net_device): %pV", level, vaf); 7103 printk("%s(NULL net_device): %pV", level, vaf);
7031 } 7104 }
7032
7033 return r;
7034} 7105}
7035 7106
7036int netdev_printk(const char *level, const struct net_device *dev, 7107void netdev_printk(const char *level, const struct net_device *dev,
7037 const char *format, ...) 7108 const char *format, ...)
7038{ 7109{
7039 struct va_format vaf; 7110 struct va_format vaf;
7040 va_list args; 7111 va_list args;
7041 int r;
7042 7112
7043 va_start(args, format); 7113 va_start(args, format);
7044 7114
7045 vaf.fmt = format; 7115 vaf.fmt = format;
7046 vaf.va = &args; 7116 vaf.va = &args;
7047 7117
7048 r = __netdev_printk(level, dev, &vaf); 7118 __netdev_printk(level, dev, &vaf);
7049 7119
7050 va_end(args); 7120 va_end(args);
7051
7052 return r;
7053} 7121}
7054EXPORT_SYMBOL(netdev_printk); 7122EXPORT_SYMBOL(netdev_printk);
7055 7123
7056#define define_netdev_printk_level(func, level) \ 7124#define define_netdev_printk_level(func, level) \
7057int func(const struct net_device *dev, const char *fmt, ...) \ 7125void func(const struct net_device *dev, const char *fmt, ...) \
7058{ \ 7126{ \
7059 int r; \
7060 struct va_format vaf; \ 7127 struct va_format vaf; \
7061 va_list args; \ 7128 va_list args; \
7062 \ 7129 \
@@ -7065,11 +7132,9 @@ int func(const struct net_device *dev, const char *fmt, ...) \
7065 vaf.fmt = fmt; \ 7132 vaf.fmt = fmt; \
7066 vaf.va = &args; \ 7133 vaf.va = &args; \
7067 \ 7134 \
7068 r = __netdev_printk(level, dev, &vaf); \ 7135 __netdev_printk(level, dev, &vaf); \
7069 \ 7136 \
7070 va_end(args); \ 7137 va_end(args); \
7071 \
7072 return r; \
7073} \ 7138} \
7074EXPORT_SYMBOL(func); 7139EXPORT_SYMBOL(func);
7075 7140
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index cf999e09bcd2..72e899a3efda 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -365,11 +365,8 @@ void dev_load(struct net *net, const char *name)
365 no_module = !dev; 365 no_module = !dev;
366 if (no_module && capable(CAP_NET_ADMIN)) 366 if (no_module && capable(CAP_NET_ADMIN))
367 no_module = request_module("netdev-%s", name); 367 no_module = request_module("netdev-%s", name);
368 if (no_module && capable(CAP_SYS_MODULE)) { 368 if (no_module && capable(CAP_SYS_MODULE))
369 if (!request_module("%s", name)) 369 request_module("%s", name);
370 pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
371 name);
372 }
373} 370}
374EXPORT_SYMBOL(dev_load); 371EXPORT_SYMBOL(dev_load);
375 372
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 17cb912793fa..1600aa24d36b 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1621,6 +1621,81 @@ static int ethtool_get_module_eeprom(struct net_device *dev,
1621 modinfo.eeprom_len); 1621 modinfo.eeprom_len);
1622} 1622}
1623 1623
1624static int ethtool_tunable_valid(const struct ethtool_tunable *tuna)
1625{
1626 switch (tuna->id) {
1627 case ETHTOOL_RX_COPYBREAK:
1628 case ETHTOOL_TX_COPYBREAK:
1629 if (tuna->len != sizeof(u32) ||
1630 tuna->type_id != ETHTOOL_TUNABLE_U32)
1631 return -EINVAL;
1632 break;
1633 default:
1634 return -EINVAL;
1635 }
1636
1637 return 0;
1638}
1639
1640static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr)
1641{
1642 int ret;
1643 struct ethtool_tunable tuna;
1644 const struct ethtool_ops *ops = dev->ethtool_ops;
1645 void *data;
1646
1647 if (!ops->get_tunable)
1648 return -EOPNOTSUPP;
1649 if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
1650 return -EFAULT;
1651 ret = ethtool_tunable_valid(&tuna);
1652 if (ret)
1653 return ret;
1654 data = kmalloc(tuna.len, GFP_USER);
1655 if (!data)
1656 return -ENOMEM;
1657 ret = ops->get_tunable(dev, &tuna, data);
1658 if (ret)
1659 goto out;
1660 useraddr += sizeof(tuna);
1661 ret = -EFAULT;
1662 if (copy_to_user(useraddr, data, tuna.len))
1663 goto out;
1664 ret = 0;
1665
1666out:
1667 kfree(data);
1668 return ret;
1669}
1670
1671static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr)
1672{
1673 int ret;
1674 struct ethtool_tunable tuna;
1675 const struct ethtool_ops *ops = dev->ethtool_ops;
1676 void *data;
1677
1678 if (!ops->set_tunable)
1679 return -EOPNOTSUPP;
1680 if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
1681 return -EFAULT;
1682 ret = ethtool_tunable_valid(&tuna);
1683 if (ret)
1684 return ret;
1685 data = kmalloc(tuna.len, GFP_USER);
1686 if (!data)
1687 return -ENOMEM;
1688 useraddr += sizeof(tuna);
1689 ret = -EFAULT;
1690 if (copy_from_user(data, useraddr, tuna.len))
1691 goto out;
1692 ret = ops->set_tunable(dev, &tuna, data);
1693
1694out:
1695 kfree(data);
1696 return ret;
1697}
1698
1624/* The main entry point in this file. Called from net/core/dev_ioctl.c */ 1699/* The main entry point in this file. Called from net/core/dev_ioctl.c */
1625 1700
1626int dev_ethtool(struct net *net, struct ifreq *ifr) 1701int dev_ethtool(struct net *net, struct ifreq *ifr)
@@ -1670,6 +1745,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1670 case ETHTOOL_GCHANNELS: 1745 case ETHTOOL_GCHANNELS:
1671 case ETHTOOL_GET_TS_INFO: 1746 case ETHTOOL_GET_TS_INFO:
1672 case ETHTOOL_GEEE: 1747 case ETHTOOL_GEEE:
1748 case ETHTOOL_GTUNABLE:
1673 break; 1749 break;
1674 default: 1750 default:
1675 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 1751 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -1857,6 +1933,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1857 case ETHTOOL_GMODULEEEPROM: 1933 case ETHTOOL_GMODULEEEPROM:
1858 rc = ethtool_get_module_eeprom(dev, useraddr); 1934 rc = ethtool_get_module_eeprom(dev, useraddr);
1859 break; 1935 break;
1936 case ETHTOOL_GTUNABLE:
1937 rc = ethtool_get_tunable(dev, useraddr);
1938 break;
1939 case ETHTOOL_STUNABLE:
1940 rc = ethtool_set_tunable(dev, useraddr);
1941 break;
1860 default: 1942 default:
1861 rc = -EOPNOTSUPP; 1943 rc = -EOPNOTSUPP;
1862 } 1944 }
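
For context on how the ETHTOOL_GTUNABLE path added above is reached from user space, a small sketch follows. It is not part of the patch and assumes 3.18-era or newer uapi headers plus an interface name on the command line; the layout (a u32 payload immediately following struct ethtool_tunable) mirrors the kernel-side copy_to_user() at useraddr + sizeof(tuna).

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(int argc, char **argv)
{
	struct {
		struct ethtool_tunable hdr;
		uint32_t value;		/* payload copied at useraddr + sizeof(hdr) */
	} req = {
		.hdr = {
			.cmd     = ETHTOOL_GTUNABLE,
			.id      = ETHTOOL_RX_COPYBREAK,
			.type_id = ETHTOOL_TUNABLE_U32,
			.len     = sizeof(uint32_t),
		},
	};
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, argc > 1 ? argv[1] : "eth0", IFNAMSIZ - 1);
	ifr.ifr_data = (char *)&req;

	if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
		printf("%s rx_copybreak: %u\n", ifr.ifr_name, req.value);
	else
		perror("ETHTOOL_GTUNABLE");

	close(fd);
	return 0;
}

Switching .cmd to ETHTOOL_STUNABLE and pre-loading .value would exercise the ethtool_set_tunable() path instead.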
diff --git a/net/core/filter.c b/net/core/filter.c
index d814b8a89d0f..fcd3f6742a6a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -87,33 +87,9 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
87} 87}
88EXPORT_SYMBOL(sk_filter); 88EXPORT_SYMBOL(sk_filter);
89 89
90/* Helper to find the offset of pkt_type in sk_buff structure. We want
91 * to make sure its still a 3bit field starting at a byte boundary;
92 * taken from arch/x86/net/bpf_jit_comp.c.
93 */
94#ifdef __BIG_ENDIAN_BITFIELD
95#define PKT_TYPE_MAX (7 << 5)
96#else
97#define PKT_TYPE_MAX 7
98#endif
99static unsigned int pkt_type_offset(void)
100{
101 struct sk_buff skb_probe = { .pkt_type = ~0, };
102 u8 *ct = (u8 *) &skb_probe;
103 unsigned int off;
104
105 for (off = 0; off < sizeof(struct sk_buff); off++) {
106 if (ct[off] == PKT_TYPE_MAX)
107 return off;
108 }
109
110 pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__);
111 return -1;
112}
113
114static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) 90static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
115{ 91{
116 return __skb_get_poff((struct sk_buff *)(unsigned long) ctx); 92 return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
117} 93}
118 94
119static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) 95static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
@@ -190,11 +166,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
190 break; 166 break;
191 167
192 case SKF_AD_OFF + SKF_AD_PKTTYPE: 168 case SKF_AD_OFF + SKF_AD_PKTTYPE:
193 *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, 169 *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX,
194 pkt_type_offset()); 170 PKT_TYPE_OFFSET());
195 if (insn->off < 0)
196 return false;
197 insn++;
198 *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); 171 *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX);
199#ifdef __BIG_ENDIAN_BITFIELD 172#ifdef __BIG_ENDIAN_BITFIELD
200 insn++; 173 insn++;
@@ -933,7 +906,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
933 906
934 /* Expand fp for appending the new filter representation. */ 907 /* Expand fp for appending the new filter representation. */
935 old_fp = fp; 908 old_fp = fp;
936 fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL); 909 fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
937 if (!fp) { 910 if (!fp) {
938 /* The old_fp is still around in case we couldn't 911 /* The old_fp is still around in case we couldn't
939 * allocate new memory, so uncharge on that one. 912 * allocate new memory, so uncharge on that one.
@@ -972,7 +945,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
972 int err; 945 int err;
973 946
974 fp->bpf_func = NULL; 947 fp->bpf_func = NULL;
975 fp->jited = 0; 948 fp->jited = false;
976 949
977 err = bpf_check_classic(fp->insns, fp->len); 950 err = bpf_check_classic(fp->insns, fp->len);
978 if (err) { 951 if (err) {
@@ -1013,7 +986,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
1013 if (fprog->filter == NULL) 986 if (fprog->filter == NULL)
1014 return -EINVAL; 987 return -EINVAL;
1015 988
1016 fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL); 989 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1017 if (!fp) 990 if (!fp)
1018 return -ENOMEM; 991 return -ENOMEM;
1019 992
@@ -1069,12 +1042,12 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1069 if (fprog->filter == NULL) 1042 if (fprog->filter == NULL)
1070 return -EINVAL; 1043 return -EINVAL;
1071 1044
1072 prog = kmalloc(bpf_fsize, GFP_KERNEL); 1045 prog = bpf_prog_alloc(bpf_fsize, 0);
1073 if (!prog) 1046 if (!prog)
1074 return -ENOMEM; 1047 return -ENOMEM;
1075 1048
1076 if (copy_from_user(prog->insns, fprog->filter, fsize)) { 1049 if (copy_from_user(prog->insns, fprog->filter, fsize)) {
1077 kfree(prog); 1050 __bpf_prog_free(prog);
1078 return -EFAULT; 1051 return -EFAULT;
1079 } 1052 }
1080 1053
@@ -1082,7 +1055,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1082 1055
1083 err = bpf_prog_store_orig_filter(prog, fprog); 1056 err = bpf_prog_store_orig_filter(prog, fprog);
1084 if (err) { 1057 if (err) {
1085 kfree(prog); 1058 __bpf_prog_free(prog);
1086 return -ENOMEM; 1059 return -ENOMEM;
1087 } 1060 }
1088 1061
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 5f362c1d0332..8560dea58803 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -13,6 +13,7 @@
13#include <linux/if_pppox.h> 13#include <linux/if_pppox.h>
14#include <linux/ppp_defs.h> 14#include <linux/ppp_defs.h>
15#include <net/flow_keys.h> 15#include <net/flow_keys.h>
16#include <scsi/fc/fc_fcoe.h>
16 17
17/* copy saddr & daddr, possibly using 64bit load/store 18/* copy saddr & daddr, possibly using 64bit load/store
18 * Equivalent to : flow->src = iph->saddr; 19 * Equivalent to : flow->src = iph->saddr;
@@ -26,36 +27,61 @@ static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *i
26} 27}
27 28
28/** 29/**
29 * skb_flow_get_ports - extract the upper layer ports and return them 30 * __skb_flow_get_ports - extract the upper layer ports and return them
30 * @skb: buffer to extract the ports from 31 * @skb: sk_buff to extract the ports from
31 * @thoff: transport header offset 32 * @thoff: transport header offset
32 * @ip_proto: protocol for which to get port offset 33 * @ip_proto: protocol for which to get port offset
34 * @data: raw buffer pointer to the packet, if NULL use skb->data
35 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
33 * 36 *
34 * The function will try to retrieve the ports at offset thoff + poff where poff 37 * The function will try to retrieve the ports at offset thoff + poff where poff
35 * is the protocol port offset returned from proto_ports_offset 38 * is the protocol port offset returned from proto_ports_offset
36 */ 39 */
37__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) 40__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
41 void *data, int hlen)
38{ 42{
39 int poff = proto_ports_offset(ip_proto); 43 int poff = proto_ports_offset(ip_proto);
40 44
45 if (!data) {
46 data = skb->data;
47 hlen = skb_headlen(skb);
48 }
49
41 if (poff >= 0) { 50 if (poff >= 0) {
42 __be32 *ports, _ports; 51 __be32 *ports, _ports;
43 52
44 ports = skb_header_pointer(skb, thoff + poff, 53 ports = __skb_header_pointer(skb, thoff + poff,
45 sizeof(_ports), &_ports); 54 sizeof(_ports), data, hlen, &_ports);
46 if (ports) 55 if (ports)
47 return *ports; 56 return *ports;
48 } 57 }
49 58
50 return 0; 59 return 0;
51} 60}
52EXPORT_SYMBOL(skb_flow_get_ports); 61EXPORT_SYMBOL(__skb_flow_get_ports);
53 62
54bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) 63/**
64 * __skb_flow_dissect - extract the flow_keys struct and return it
65 * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
66 * @data: raw buffer pointer to the packet, if NULL use skb->data
67 * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
68 * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
69 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
70 *
71 * The function will try to retrieve the struct flow_keys from either the skbuff
72 * or a raw buffer specified by the rest parameters
73 */
74bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
75 void *data, __be16 proto, int nhoff, int hlen)
55{ 76{
56 int nhoff = skb_network_offset(skb);
57 u8 ip_proto; 77 u8 ip_proto;
58 __be16 proto = skb->protocol; 78
79 if (!data) {
80 data = skb->data;
81 proto = skb->protocol;
82 nhoff = skb_network_offset(skb);
83 hlen = skb_headlen(skb);
84 }
59 85
60 memset(flow, 0, sizeof(*flow)); 86 memset(flow, 0, sizeof(*flow));
61 87
@@ -65,7 +91,7 @@ again:
65 const struct iphdr *iph; 91 const struct iphdr *iph;
66 struct iphdr _iph; 92 struct iphdr _iph;
67ip: 93ip:
68 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); 94 iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
69 if (!iph || iph->ihl < 5) 95 if (!iph || iph->ihl < 5)
70 return false; 96 return false;
71 nhoff += iph->ihl * 4; 97 nhoff += iph->ihl * 4;
@@ -83,7 +109,7 @@ ip:
83 __be32 flow_label; 109 __be32 flow_label;
84 110
85ipv6: 111ipv6:
86 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); 112 iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
87 if (!iph) 113 if (!iph)
88 return false; 114 return false;
89 115
@@ -92,6 +118,13 @@ ipv6:
92 flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); 118 flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
93 nhoff += sizeof(struct ipv6hdr); 119 nhoff += sizeof(struct ipv6hdr);
94 120
121 /* skip the flow label processing if skb is NULL. The
122 * assumption here is that if there is no skb we are not
123 * looking for flow info as much as we are length.
124 */
125 if (!skb)
126 break;
127
95 flow_label = ip6_flowlabel(iph); 128 flow_label = ip6_flowlabel(iph);
96 if (flow_label) { 129 if (flow_label) {
97 /* Awesome, IPv6 packet has a flow label so we can 130 /* Awesome, IPv6 packet has a flow label so we can
@@ -113,7 +146,7 @@ ipv6:
113 const struct vlan_hdr *vlan; 146 const struct vlan_hdr *vlan;
114 struct vlan_hdr _vlan; 147 struct vlan_hdr _vlan;
115 148
116 vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); 149 vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan);
117 if (!vlan) 150 if (!vlan)
118 return false; 151 return false;
119 152
@@ -126,7 +159,7 @@ ipv6:
126 struct pppoe_hdr hdr; 159 struct pppoe_hdr hdr;
127 __be16 proto; 160 __be16 proto;
128 } *hdr, _hdr; 161 } *hdr, _hdr;
129 hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); 162 hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
130 if (!hdr) 163 if (!hdr)
131 return false; 164 return false;
132 proto = hdr->proto; 165 proto = hdr->proto;
@@ -140,6 +173,9 @@ ipv6:
140 return false; 173 return false;
141 } 174 }
142 } 175 }
176 case htons(ETH_P_FCOE):
177 flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
178 /* fall through */
143 default: 179 default:
144 return false; 180 return false;
145 } 181 }
@@ -151,7 +187,7 @@ ipv6:
151 __be16 proto; 187 __be16 proto;
152 } *hdr, _hdr; 188 } *hdr, _hdr;
153 189
154 hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); 190 hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
155 if (!hdr) 191 if (!hdr)
156 return false; 192 return false;
157 /* 193 /*
@@ -171,8 +207,9 @@ ipv6:
171 const struct ethhdr *eth; 207 const struct ethhdr *eth;
172 struct ethhdr _eth; 208 struct ethhdr _eth;
173 209
174 eth = skb_header_pointer(skb, nhoff, 210 eth = __skb_header_pointer(skb, nhoff,
175 sizeof(_eth), &_eth); 211 sizeof(_eth),
212 data, hlen, &_eth);
176 if (!eth) 213 if (!eth)
177 return false; 214 return false;
178 proto = eth->h_proto; 215 proto = eth->h_proto;
@@ -194,12 +231,12 @@ ipv6:
194 231
195 flow->n_proto = proto; 232 flow->n_proto = proto;
196 flow->ip_proto = ip_proto; 233 flow->ip_proto = ip_proto;
197 flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto); 234 flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen);
198 flow->thoff = (u16) nhoff; 235 flow->thoff = (u16) nhoff;
199 236
200 return true; 237 return true;
201} 238}
202EXPORT_SYMBOL(skb_flow_dissect); 239EXPORT_SYMBOL(__skb_flow_dissect);
203 240
204static u32 hashrnd __read_mostly; 241static u32 hashrnd __read_mostly;
205static __always_inline void __flow_hash_secret_init(void) 242static __always_inline void __flow_hash_secret_init(void)
@@ -286,30 +323,22 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
286 qcount = dev->tc_to_txq[tc].count; 323 qcount = dev->tc_to_txq[tc].count;
287 } 324 }
288 325
289 return (u16) (((u64)skb_get_hash(skb) * qcount) >> 32) + qoffset; 326 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
290} 327}
291EXPORT_SYMBOL(__skb_tx_hash); 328EXPORT_SYMBOL(__skb_tx_hash);
292 329
293/* __skb_get_poff() returns the offset to the payload as far as it could 330u32 __skb_get_poff(const struct sk_buff *skb, void *data,
294 * be dissected. The main user is currently BPF, so that we can dynamically 331 const struct flow_keys *keys, int hlen)
295 * truncate packets without needing to push actual payload to the user
296 * space and can analyze headers only, instead.
297 */
298u32 __skb_get_poff(const struct sk_buff *skb)
299{ 332{
300 struct flow_keys keys; 333 u32 poff = keys->thoff;
301 u32 poff = 0;
302
303 if (!skb_flow_dissect(skb, &keys))
304 return 0;
305 334
306 poff += keys.thoff; 335 switch (keys->ip_proto) {
307 switch (keys.ip_proto) {
308 case IPPROTO_TCP: { 336 case IPPROTO_TCP: {
309 const struct tcphdr *tcph; 337 const struct tcphdr *tcph;
310 struct tcphdr _tcph; 338 struct tcphdr _tcph;
311 339
312 tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph); 340 tcph = __skb_header_pointer(skb, poff, sizeof(_tcph),
341 data, hlen, &_tcph);
313 if (!tcph) 342 if (!tcph)
314 return poff; 343 return poff;
315 344
@@ -343,6 +372,21 @@ u32 __skb_get_poff(const struct sk_buff *skb)
343 return poff; 372 return poff;
344} 373}
345 374
375/* skb_get_poff() returns the offset to the payload as far as it could
376 * be dissected. The main user is currently BPF, so that we can dynamically
377 * truncate packets without needing to push actual payload to the user
378 * space and can analyze headers only, instead.
379 */
380u32 skb_get_poff(const struct sk_buff *skb)
381{
382 struct flow_keys keys;
383
384 if (!skb_flow_dissect(skb, &keys))
385 return 0;
386
387 return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
388}
389
346static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) 390static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
347{ 391{
348#ifdef CONFIG_XPS 392#ifdef CONFIG_XPS
@@ -359,9 +403,8 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
359 if (map->len == 1) 403 if (map->len == 1)
360 queue_index = map->queues[0]; 404 queue_index = map->queues[0];
361 else 405 else
362 queue_index = map->queues[ 406 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
363 ((u64)skb_get_hash(skb) * map->len) >> 32]; 407 map->len)];
364
365 if (unlikely(queue_index >= dev->real_num_tx_queues)) 408 if (unlikely(queue_index >= dev->real_num_tx_queues))
366 queue_index = -1; 409 queue_index = -1;
367 } 410 }
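
As a side note on how drivers are expected to consume the raw-buffer dissection above (via the eth_get_headlen() helper mentioned in the pull message), a hedged fragment of a hypothetical RX path is shown here. rx_page, rx_offset, rx_len, truesize and skb are assumed driver state; eth_get_headlen(), __skb_put(), skb_add_rx_frag() and min_t() are existing kernel APIs.

/* Pre-pull exactly the packet headers into the linear area and leave the
 * payload in the DMA page fragment; 256 caps how far the dissector may look.
 */
void *va = page_address(rx_page) + rx_offset;
unsigned int pull_len = eth_get_headlen(va, min_t(unsigned int, rx_len, 256));

memcpy(__skb_put(skb, pull_len), va, pull_len);
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_page,
		rx_offset + pull_len, rx_len - pull_len, truesize);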
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 9d33dfffca19..9dfb88a933e7 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -91,6 +91,8 @@ struct gen_estimator
91 u32 avpps; 91 u32 avpps;
92 struct rcu_head e_rcu; 92 struct rcu_head e_rcu;
93 struct rb_node node; 93 struct rb_node node;
94 struct gnet_stats_basic_cpu __percpu *cpu_bstats;
95 struct rcu_head head;
94}; 96};
95 97
96struct gen_estimator_head 98struct gen_estimator_head
@@ -115,9 +117,8 @@ static void est_timer(unsigned long arg)
115 117
116 rcu_read_lock(); 118 rcu_read_lock();
117 list_for_each_entry_rcu(e, &elist[idx].list, list) { 119 list_for_each_entry_rcu(e, &elist[idx].list, list) {
118 u64 nbytes; 120 struct gnet_stats_basic_packed b = {0};
119 u64 brate; 121 u64 brate;
120 u32 npackets;
121 u32 rate; 122 u32 rate;
122 123
123 spin_lock(e->stats_lock); 124 spin_lock(e->stats_lock);
@@ -125,15 +126,15 @@ static void est_timer(unsigned long arg)
125 if (e->bstats == NULL) 126 if (e->bstats == NULL)
126 goto skip; 127 goto skip;
127 128
128 nbytes = e->bstats->bytes; 129 __gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
129 npackets = e->bstats->packets; 130
130 brate = (nbytes - e->last_bytes)<<(7 - idx); 131 brate = (b.bytes - e->last_bytes)<<(7 - idx);
131 e->last_bytes = nbytes; 132 e->last_bytes = b.bytes;
132 e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); 133 e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
133 e->rate_est->bps = (e->avbps+0xF)>>5; 134 e->rate_est->bps = (e->avbps+0xF)>>5;
134 135
135 rate = (npackets - e->last_packets)<<(12 - idx); 136 rate = (b.packets - e->last_packets)<<(12 - idx);
136 e->last_packets = npackets; 137 e->last_packets = b.packets;
137 e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); 138 e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
138 e->rate_est->pps = (e->avpps+0x1FF)>>10; 139 e->rate_est->pps = (e->avpps+0x1FF)>>10;
139skip: 140skip:
@@ -203,12 +204,14 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
203 * 204 *
204 */ 205 */
205int gen_new_estimator(struct gnet_stats_basic_packed *bstats, 206int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
207 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
206 struct gnet_stats_rate_est64 *rate_est, 208 struct gnet_stats_rate_est64 *rate_est,
207 spinlock_t *stats_lock, 209 spinlock_t *stats_lock,
208 struct nlattr *opt) 210 struct nlattr *opt)
209{ 211{
210 struct gen_estimator *est; 212 struct gen_estimator *est;
211 struct gnet_estimator *parm = nla_data(opt); 213 struct gnet_estimator *parm = nla_data(opt);
214 struct gnet_stats_basic_packed b = {0};
212 int idx; 215 int idx;
213 216
214 if (nla_len(opt) < sizeof(*parm)) 217 if (nla_len(opt) < sizeof(*parm))
@@ -221,15 +224,18 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
221 if (est == NULL) 224 if (est == NULL)
222 return -ENOBUFS; 225 return -ENOBUFS;
223 226
227 __gnet_stats_copy_basic(&b, cpu_bstats, bstats);
228
224 idx = parm->interval + 2; 229 idx = parm->interval + 2;
225 est->bstats = bstats; 230 est->bstats = bstats;
226 est->rate_est = rate_est; 231 est->rate_est = rate_est;
227 est->stats_lock = stats_lock; 232 est->stats_lock = stats_lock;
228 est->ewma_log = parm->ewma_log; 233 est->ewma_log = parm->ewma_log;
229 est->last_bytes = bstats->bytes; 234 est->last_bytes = b.bytes;
230 est->avbps = rate_est->bps<<5; 235 est->avbps = rate_est->bps<<5;
231 est->last_packets = bstats->packets; 236 est->last_packets = b.packets;
232 est->avpps = rate_est->pps<<10; 237 est->avpps = rate_est->pps<<10;
238 est->cpu_bstats = cpu_bstats;
233 239
234 spin_lock_bh(&est_tree_lock); 240 spin_lock_bh(&est_tree_lock);
235 if (!elist[idx].timer.function) { 241 if (!elist[idx].timer.function) {
@@ -290,11 +296,12 @@ EXPORT_SYMBOL(gen_kill_estimator);
290 * Returns 0 on success or a negative error code. 296 * Returns 0 on success or a negative error code.
291 */ 297 */
292int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, 298int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
299 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
293 struct gnet_stats_rate_est64 *rate_est, 300 struct gnet_stats_rate_est64 *rate_est,
294 spinlock_t *stats_lock, struct nlattr *opt) 301 spinlock_t *stats_lock, struct nlattr *opt)
295{ 302{
296 gen_kill_estimator(bstats, rate_est); 303 gen_kill_estimator(bstats, rate_est);
297 return gen_new_estimator(bstats, rate_est, stats_lock, opt); 304 return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
298} 305}
299EXPORT_SYMBOL(gen_replace_estimator); 306EXPORT_SYMBOL(gen_replace_estimator);
300 307
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 2ddbce4cce14..0c08062d1796 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -97,6 +97,43 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
97} 97}
98EXPORT_SYMBOL(gnet_stats_start_copy); 98EXPORT_SYMBOL(gnet_stats_start_copy);
99 99
100static void
101__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
102 struct gnet_stats_basic_cpu __percpu *cpu)
103{
104 int i;
105
106 for_each_possible_cpu(i) {
107 struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
108 unsigned int start;
109 u64 bytes;
110 u32 packets;
111
112 do {
113 start = u64_stats_fetch_begin_irq(&bcpu->syncp);
114 bytes = bcpu->bstats.bytes;
115 packets = bcpu->bstats.packets;
116 } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
117
118 bstats->bytes += bytes;
119 bstats->packets += packets;
120 }
121}
122
123void
124__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
125 struct gnet_stats_basic_cpu __percpu *cpu,
126 struct gnet_stats_basic_packed *b)
127{
128 if (cpu) {
129 __gnet_stats_copy_basic_cpu(bstats, cpu);
130 } else {
131 bstats->bytes = b->bytes;
132 bstats->packets = b->packets;
133 }
134}
135EXPORT_SYMBOL(__gnet_stats_copy_basic);
136
100/** 137/**
101 * gnet_stats_copy_basic - copy basic statistics into statistic TLV 138 * gnet_stats_copy_basic - copy basic statistics into statistic TLV
102 * @d: dumping handle 139 * @d: dumping handle
@@ -109,19 +146,25 @@ EXPORT_SYMBOL(gnet_stats_start_copy);
109 * if the room in the socket buffer was not sufficient. 146 * if the room in the socket buffer was not sufficient.
110 */ 147 */
111int 148int
112gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) 149gnet_stats_copy_basic(struct gnet_dump *d,
150 struct gnet_stats_basic_cpu __percpu *cpu,
151 struct gnet_stats_basic_packed *b)
113{ 152{
153 struct gnet_stats_basic_packed bstats = {0};
154
155 __gnet_stats_copy_basic(&bstats, cpu, b);
156
114 if (d->compat_tc_stats) { 157 if (d->compat_tc_stats) {
115 d->tc_stats.bytes = b->bytes; 158 d->tc_stats.bytes = bstats.bytes;
116 d->tc_stats.packets = b->packets; 159 d->tc_stats.packets = bstats.packets;
117 } 160 }
118 161
119 if (d->tail) { 162 if (d->tail) {
120 struct gnet_stats_basic sb; 163 struct gnet_stats_basic sb;
121 164
122 memset(&sb, 0, sizeof(sb)); 165 memset(&sb, 0, sizeof(sb));
123 sb.bytes = b->bytes; 166 sb.bytes = bstats.bytes;
124 sb.packets = b->packets; 167 sb.packets = bstats.packets;
125 return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); 168 return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb));
126 } 169 }
127 return 0; 170 return 0;
@@ -172,29 +215,74 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
172} 215}
173EXPORT_SYMBOL(gnet_stats_copy_rate_est); 216EXPORT_SYMBOL(gnet_stats_copy_rate_est);
174 217
218static void
219__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
220 const struct gnet_stats_queue __percpu *q)
221{
222 int i;
223
224 for_each_possible_cpu(i) {
225 const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);
226
227 qstats->qlen = 0;
228 qstats->backlog += qcpu->backlog;
229 qstats->drops += qcpu->drops;
230 qstats->requeues += qcpu->requeues;
231 qstats->overlimits += qcpu->overlimits;
232 }
233}
234
235static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
236 const struct gnet_stats_queue __percpu *cpu,
237 const struct gnet_stats_queue *q,
238 __u32 qlen)
239{
240 if (cpu) {
241 __gnet_stats_copy_queue_cpu(qstats, cpu);
242 } else {
243 qstats->qlen = q->qlen;
244 qstats->backlog = q->backlog;
245 qstats->drops = q->drops;
246 qstats->requeues = q->requeues;
247 qstats->overlimits = q->overlimits;
248 }
249
250 qstats->qlen = qlen;
251}
252
175/** 253/**
176 * gnet_stats_copy_queue - copy queue statistics into statistics TLV 254 * gnet_stats_copy_queue - copy queue statistics into statistics TLV
177 * @d: dumping handle 255 * @d: dumping handle
256 * @cpu_q: per cpu queue statistics
178 * @q: queue statistics 257 * @q: queue statistics
258 * @qlen: queue length statistics
179 * 259 *
180 * Appends the queue statistics to the top level TLV created by 260 * Appends the queue statistics to the top level TLV created by
 181 * gnet_stats_start_copy(). 261 * gnet_stats_start_copy(). Per-CPU queue statistics are
 262 * used if they are available.
182 * 263 *
183 * Returns 0 on success or -1 with the statistic lock released 264 * Returns 0 on success or -1 with the statistic lock released
184 * if the room in the socket buffer was not sufficient. 265 * if the room in the socket buffer was not sufficient.
185 */ 266 */
186int 267int
187gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q) 268gnet_stats_copy_queue(struct gnet_dump *d,
269 struct gnet_stats_queue __percpu *cpu_q,
270 struct gnet_stats_queue *q, __u32 qlen)
188{ 271{
272 struct gnet_stats_queue qstats = {0};
273
274 __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen);
275
189 if (d->compat_tc_stats) { 276 if (d->compat_tc_stats) {
190 d->tc_stats.drops = q->drops; 277 d->tc_stats.drops = qstats.drops;
191 d->tc_stats.qlen = q->qlen; 278 d->tc_stats.qlen = qstats.qlen;
192 d->tc_stats.backlog = q->backlog; 279 d->tc_stats.backlog = qstats.backlog;
193 d->tc_stats.overlimits = q->overlimits; 280 d->tc_stats.overlimits = qstats.overlimits;
194 } 281 }
195 282
196 if (d->tail) 283 if (d->tail)
197 return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q)); 284 return gnet_stats_copy(d, TCA_STATS_QUEUE,
285 &qstats, sizeof(qstats));
198 286
199 return 0; 287 return 0;
200} 288}
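Putting the two new signatures together, a dump path is expected to look roughly like this (the function and the qlen argument are assumptions for the sketch):

static int my_dump_stats(struct gnet_dump *d,
			 struct gnet_stats_basic_cpu __percpu *cpu_bstats,
			 struct gnet_stats_basic_packed *bstats,
			 struct gnet_stats_queue __percpu *cpu_qstats,
			 struct gnet_stats_queue *qstats, __u32 qlen)
{
	/* Both helpers accept NULL per-CPU pointers and then fall back to
	 * the packed counters, so one dump path serves both kinds of qdisc.
	 */
	if (gnet_stats_copy_basic(d, cpu_bstats, bstats) < 0 ||
	    gnet_stats_copy_queue(d, cpu_qstats, qstats, qlen) < 0)
		return -1;

	return 0;
}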
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7c6b51a58968..7f155175bba8 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -224,7 +224,7 @@ static void net_free(struct net *net)
224 return; 224 return;
225 } 225 }
226#endif 226#endif
227 kfree(net->gen); 227 kfree(rcu_access_pointer(net->gen));
228 kmem_cache_free(net_cachep, net); 228 kmem_cache_free(net_cachep, net);
229} 229}
230 230
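The kfree() change above is the usual pattern for releasing an __rcu-annotated pointer on a teardown path where no readers can remain: rcu_access_pointer() fetches the raw value without the rcu_read_lock()/lockdep conditions that rcu_dereference() would impose. A generic sketch with hypothetical types:

struct my_obj {
	int payload;
};

struct my_holder {
	struct my_obj __rcu *ptr;	/* published with rcu_assign_pointer() */
};

static void my_holder_free(struct my_holder *h)
{
	/* Teardown path: no readers can exist any more, so only the raw
	 * pointer value is needed, not an RCU-protected dereference.
	 */
	kfree(rcu_access_pointer(h->ptr));
	kfree(h);
}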
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 907fb5e36c02..e6645b4f330a 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -72,7 +72,6 @@ module_param(carrier_timeout, uint, 0644);
72static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, 72static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
73 struct netdev_queue *txq) 73 struct netdev_queue *txq)
74{ 74{
75 const struct net_device_ops *ops = dev->netdev_ops;
76 int status = NETDEV_TX_OK; 75 int status = NETDEV_TX_OK;
77 netdev_features_t features; 76 netdev_features_t features;
78 77
@@ -92,9 +91,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
92 skb->vlan_tci = 0; 91 skb->vlan_tci = 0;
93 } 92 }
94 93
95 status = ops->ndo_start_xmit(skb, dev); 94 status = netdev_start_xmit(skb, dev, txq, false);
96 if (status == NETDEV_TX_OK)
97 txq_trans_update(txq);
98 95
99out: 96out:
100 return status; 97 return status;
@@ -116,7 +113,7 @@ static void queue_process(struct work_struct *work)
116 continue; 113 continue;
117 } 114 }
118 115
119 txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); 116 txq = skb_get_tx_queue(dev, skb);
120 117
121 local_irq_save(flags); 118 local_irq_save(flags);
122 HARD_TX_LOCK(dev, txq, smp_processor_id()); 119 HARD_TX_LOCK(dev, txq, smp_processor_id());
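netpoll now goes through the new netdev_start_xmit() helper instead of calling ->ndo_start_xmit() and txq_trans_update() by hand. A rough sketch of what the wrapper is assumed to do (the real helper lives in netdevice.h; this is a simplified, renamed stand-in):

static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *dev,
				 struct netdev_queue *txq, bool more)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	netdev_tx_t rc;

	skb->xmit_more = more;			/* hint: another skb follows */
	rc = ops->ndo_start_xmit(skb, dev);
	if (rc == NETDEV_TX_OK)
		txq_trans_update(txq);		/* refresh the TX watchdog stamp */
	return rc;
}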
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 8b849ddfef2e..443256bdcddc 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -202,6 +202,7 @@
202#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ 202#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */
203#define F_NODE (1<<15) /* Node memory alloc*/ 203#define F_NODE (1<<15) /* Node memory alloc*/
204#define F_UDPCSUM (1<<16) /* Include UDP checksum */ 204#define F_UDPCSUM (1<<16) /* Include UDP checksum */
205#define F_NO_TIMESTAMP (1<<17) /* Don't timestamp packets (default TS) */
205 206
206/* Thread control flag bits */ 207/* Thread control flag bits */
207#define T_STOP (1<<0) /* Stop run */ 208#define T_STOP (1<<0) /* Stop run */
@@ -386,6 +387,7 @@ struct pktgen_dev {
386 u16 queue_map_min; 387 u16 queue_map_min;
387 u16 queue_map_max; 388 u16 queue_map_max;
388 __u32 skb_priority; /* skb priority field */ 389 __u32 skb_priority; /* skb priority field */
390 unsigned int burst; /* number of duplicated packets to burst */
389 int node; /* Memory node */ 391 int node; /* Memory node */
390 392
391#ifdef CONFIG_XFRM 393#ifdef CONFIG_XFRM
@@ -505,7 +507,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
505 pktgen_reset_all_threads(pn); 507 pktgen_reset_all_threads(pn);
506 508
507 else 509 else
508 pr_warning("Unknown command: %s\n", data); 510 pr_warn("Unknown command: %s\n", data);
509 511
510 return count; 512 return count;
511} 513}
@@ -612,6 +614,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
612 if (pkt_dev->traffic_class) 614 if (pkt_dev->traffic_class)
613 seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class); 615 seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class);
614 616
617 if (pkt_dev->burst > 1)
618 seq_printf(seq, " burst: %d\n", pkt_dev->burst);
619
615 if (pkt_dev->node >= 0) 620 if (pkt_dev->node >= 0)
616 seq_printf(seq, " node: %d\n", pkt_dev->node); 621 seq_printf(seq, " node: %d\n", pkt_dev->node);
617 622
@@ -638,6 +643,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
638 if (pkt_dev->flags & F_UDPCSUM) 643 if (pkt_dev->flags & F_UDPCSUM)
639 seq_puts(seq, "UDPCSUM "); 644 seq_puts(seq, "UDPCSUM ");
640 645
646 if (pkt_dev->flags & F_NO_TIMESTAMP)
647 seq_puts(seq, "NO_TIMESTAMP ");
648
641 if (pkt_dev->flags & F_MPLS_RND) 649 if (pkt_dev->flags & F_MPLS_RND)
642 seq_puts(seq, "MPLS_RND "); 650 seq_puts(seq, "MPLS_RND ");
643 651
@@ -857,14 +865,14 @@ static ssize_t pktgen_if_write(struct file *file,
857 pg_result = &(pkt_dev->result[0]); 865 pg_result = &(pkt_dev->result[0]);
858 866
859 if (count < 1) { 867 if (count < 1) {
860 pr_warning("wrong command format\n"); 868 pr_warn("wrong command format\n");
861 return -EINVAL; 869 return -EINVAL;
862 } 870 }
863 871
864 max = count; 872 max = count;
865 tmp = count_trail_chars(user_buffer, max); 873 tmp = count_trail_chars(user_buffer, max);
866 if (tmp < 0) { 874 if (tmp < 0) {
867 pr_warning("illegal format\n"); 875 pr_warn("illegal format\n");
868 return tmp; 876 return tmp;
869 } 877 }
870 i = tmp; 878 i = tmp;
@@ -1120,6 +1128,16 @@ static ssize_t pktgen_if_write(struct file *file,
1120 pkt_dev->dst_mac_count); 1128 pkt_dev->dst_mac_count);
1121 return count; 1129 return count;
1122 } 1130 }
1131 if (!strcmp(name, "burst")) {
1132 len = num_arg(&user_buffer[i], 10, &value);
1133 if (len < 0)
1134 return len;
1135
1136 i += len;
1137 pkt_dev->burst = value < 1 ? 1 : value;
1138 sprintf(pg_result, "OK: burst=%d", pkt_dev->burst);
1139 return count;
1140 }
1123 if (!strcmp(name, "node")) { 1141 if (!strcmp(name, "node")) {
1124 len = num_arg(&user_buffer[i], 10, &value); 1142 len = num_arg(&user_buffer[i], 10, &value);
1125 if (len < 0) 1143 if (len < 0)
@@ -1243,6 +1261,9 @@ static ssize_t pktgen_if_write(struct file *file,
1243 else if (strcmp(f, "!UDPCSUM") == 0) 1261 else if (strcmp(f, "!UDPCSUM") == 0)
1244 pkt_dev->flags &= ~F_UDPCSUM; 1262 pkt_dev->flags &= ~F_UDPCSUM;
1245 1263
1264 else if (strcmp(f, "NO_TIMESTAMP") == 0)
1265 pkt_dev->flags |= F_NO_TIMESTAMP;
1266
1246 else { 1267 else {
1247 sprintf(pg_result, 1268 sprintf(pg_result,
1248 "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", 1269 "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
@@ -1251,6 +1272,7 @@ static ssize_t pktgen_if_write(struct file *file,
1251 "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, " 1272 "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
1252 "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, " 1273 "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
1253 "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, " 1274 "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
1275 "NO_TIMESTAMP, "
1254#ifdef CONFIG_XFRM 1276#ifdef CONFIG_XFRM
1255 "IPSEC, " 1277 "IPSEC, "
1256#endif 1278#endif
@@ -2048,15 +2070,15 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
2048 ntxq = pkt_dev->odev->real_num_tx_queues; 2070 ntxq = pkt_dev->odev->real_num_tx_queues;
2049 2071
2050 if (ntxq <= pkt_dev->queue_map_min) { 2072 if (ntxq <= pkt_dev->queue_map_min) {
2051 pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", 2073 pr_warn("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
2052 pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, 2074 pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
2053 pkt_dev->odevname); 2075 pkt_dev->odevname);
2054 pkt_dev->queue_map_min = (ntxq ?: 1) - 1; 2076 pkt_dev->queue_map_min = (ntxq ?: 1) - 1;
2055 } 2077 }
2056 if (pkt_dev->queue_map_max >= ntxq) { 2078 if (pkt_dev->queue_map_max >= ntxq) {
2057 pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", 2079 pr_warn("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
2058 pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, 2080 pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
2059 pkt_dev->odevname); 2081 pkt_dev->odevname);
2060 pkt_dev->queue_map_max = (ntxq ?: 1) - 1; 2082 pkt_dev->queue_map_max = (ntxq ?: 1) - 1;
2061 } 2083 }
2062 2084
@@ -2685,9 +2707,14 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
2685 pgh->pgh_magic = htonl(PKTGEN_MAGIC); 2707 pgh->pgh_magic = htonl(PKTGEN_MAGIC);
2686 pgh->seq_num = htonl(pkt_dev->seq_num); 2708 pgh->seq_num = htonl(pkt_dev->seq_num);
2687 2709
2688 do_gettimeofday(&timestamp); 2710 if (pkt_dev->flags & F_NO_TIMESTAMP) {
2689 pgh->tv_sec = htonl(timestamp.tv_sec); 2711 pgh->tv_sec = 0;
2690 pgh->tv_usec = htonl(timestamp.tv_usec); 2712 pgh->tv_usec = 0;
2713 } else {
2714 do_gettimeofday(&timestamp);
2715 pgh->tv_sec = htonl(timestamp.tv_sec);
2716 pgh->tv_usec = htonl(timestamp.tv_usec);
2717 }
2691} 2718}
2692 2719
2693static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, 2720static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
@@ -3160,8 +3187,8 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
3160 int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1; 3187 int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1;
3161 3188
3162 if (!pkt_dev->running) { 3189 if (!pkt_dev->running) {
3163 pr_warning("interface: %s is already stopped\n", 3190 pr_warn("interface: %s is already stopped\n",
3164 pkt_dev->odevname); 3191 pkt_dev->odevname);
3165 return -EINVAL; 3192 return -EINVAL;
3166 } 3193 }
3167 3194
@@ -3284,11 +3311,9 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
3284 3311
3285static void pktgen_xmit(struct pktgen_dev *pkt_dev) 3312static void pktgen_xmit(struct pktgen_dev *pkt_dev)
3286{ 3313{
3314 unsigned int burst = ACCESS_ONCE(pkt_dev->burst);
3287 struct net_device *odev = pkt_dev->odev; 3315 struct net_device *odev = pkt_dev->odev;
3288 netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *)
3289 = odev->netdev_ops->ndo_start_xmit;
3290 struct netdev_queue *txq; 3316 struct netdev_queue *txq;
3291 u16 queue_map;
3292 int ret; 3317 int ret;
3293 3318
3294 /* If device is offline, then don't send */ 3319 /* If device is offline, then don't send */
@@ -3326,8 +3351,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
3326 if (pkt_dev->delay && pkt_dev->last_ok) 3351 if (pkt_dev->delay && pkt_dev->last_ok)
3327 spin(pkt_dev, pkt_dev->next_tx); 3352 spin(pkt_dev, pkt_dev->next_tx);
3328 3353
3329 queue_map = skb_get_queue_mapping(pkt_dev->skb); 3354 txq = skb_get_tx_queue(odev, pkt_dev->skb);
3330 txq = netdev_get_tx_queue(odev, queue_map);
3331 3355
3332 local_bh_disable(); 3356 local_bh_disable();
3333 3357
@@ -3338,16 +3362,19 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
3338 pkt_dev->last_ok = 0; 3362 pkt_dev->last_ok = 0;
3339 goto unlock; 3363 goto unlock;
3340 } 3364 }
3341 atomic_inc(&(pkt_dev->skb->users)); 3365 atomic_add(burst, &pkt_dev->skb->users);
3342 ret = (*xmit)(pkt_dev->skb, odev); 3366
3367xmit_more:
3368 ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
3343 3369
3344 switch (ret) { 3370 switch (ret) {
3345 case NETDEV_TX_OK: 3371 case NETDEV_TX_OK:
3346 txq_trans_update(txq);
3347 pkt_dev->last_ok = 1; 3372 pkt_dev->last_ok = 1;
3348 pkt_dev->sofar++; 3373 pkt_dev->sofar++;
3349 pkt_dev->seq_num++; 3374 pkt_dev->seq_num++;
3350 pkt_dev->tx_bytes += pkt_dev->last_pkt_size; 3375 pkt_dev->tx_bytes += pkt_dev->last_pkt_size;
3376 if (burst > 0 && !netif_xmit_frozen_or_drv_stopped(txq))
3377 goto xmit_more;
3351 break; 3378 break;
3352 case NET_XMIT_DROP: 3379 case NET_XMIT_DROP:
3353 case NET_XMIT_CN: 3380 case NET_XMIT_CN:
@@ -3366,6 +3393,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
3366 atomic_dec(&(pkt_dev->skb->users)); 3393 atomic_dec(&(pkt_dev->skb->users));
3367 pkt_dev->last_ok = 0; 3394 pkt_dev->last_ok = 0;
3368 } 3395 }
3396 if (unlikely(burst))
3397 atomic_sub(burst, &pkt_dev->skb->users);
3369unlock: 3398unlock:
3370 HARD_TX_UNLOCK(odev, txq); 3399 HARD_TX_UNLOCK(odev, txq);
3371 3400
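pktgen's "burst" mode exercises the xmit_more path: the skb's refcount is bumped by the whole burst up front and the same skb is submitted repeatedly, with xmit_more set for all but the last copy. On the driver side the hint is typically consumed as in the following sketch, where my_hw_kick_doorbell() stands in for a device-specific MMIO write:

static void my_hw_kick_doorbell(struct net_device *dev)
{
	/* stands in for the device-specific MMIO doorbell write */
}

static netdev_tx_t my_ndo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct netdev_queue *txq = skb_get_tx_queue(dev, skb);

	/* ... post the skb to the hardware TX ring here ... */

	/* Defer the doorbell while the stack promises more packets and the
	 * queue is still running; the last packet of a burst flushes them all.
	 */
	if (!skb->xmit_more || netif_xmit_stopped(txq))
		my_hw_kick_doorbell(dev);

	return NETDEV_TX_OK;
}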
@@ -3564,6 +3593,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
3564 pkt_dev->svlan_p = 0; 3593 pkt_dev->svlan_p = 0;
3565 pkt_dev->svlan_cfi = 0; 3594 pkt_dev->svlan_cfi = 0;
3566 pkt_dev->svlan_id = 0xffff; 3595 pkt_dev->svlan_id = 0xffff;
3596 pkt_dev->burst = 1;
3567 pkt_dev->node = -1; 3597 pkt_dev->node = -1;
3568 3598
3569 err = pktgen_setup_dev(t->net, pkt_dev, ifname); 3599 err = pktgen_setup_dev(t->net, pkt_dev, ifname);
@@ -3684,7 +3714,7 @@ static int pktgen_remove_device(struct pktgen_thread *t,
3684 pr_debug("remove_device pkt_dev=%p\n", pkt_dev); 3714 pr_debug("remove_device pkt_dev=%p\n", pkt_dev);
3685 3715
3686 if (pkt_dev->running) { 3716 if (pkt_dev->running) {
3687 pr_warning("WARNING: trying to remove a running interface, stopping it now\n"); 3717 pr_warn("WARNING: trying to remove a running interface, stopping it now\n");
3688 pktgen_stop_device(pkt_dev); 3718 pktgen_stop_device(pkt_dev);
3689 } 3719 }
3690 3720
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f0493e3b7471..a6882686ca3a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1481,9 +1481,12 @@ static int do_set_master(struct net_device *dev, int ifindex)
1481 return 0; 1481 return 0;
1482} 1482}
1483 1483
1484#define DO_SETLINK_MODIFIED 0x01
1485/* notify flag means notify + modified. */
1486#define DO_SETLINK_NOTIFY 0x03
1484static int do_setlink(const struct sk_buff *skb, 1487static int do_setlink(const struct sk_buff *skb,
1485 struct net_device *dev, struct ifinfomsg *ifm, 1488 struct net_device *dev, struct ifinfomsg *ifm,
1486 struct nlattr **tb, char *ifname, int modified) 1489 struct nlattr **tb, char *ifname, int status)
1487{ 1490{
1488 const struct net_device_ops *ops = dev->netdev_ops; 1491 const struct net_device_ops *ops = dev->netdev_ops;
1489 int err; 1492 int err;
@@ -1502,7 +1505,7 @@ static int do_setlink(const struct sk_buff *skb,
1502 put_net(net); 1505 put_net(net);
1503 if (err) 1506 if (err)
1504 goto errout; 1507 goto errout;
1505 modified = 1; 1508 status |= DO_SETLINK_MODIFIED;
1506 } 1509 }
1507 1510
1508 if (tb[IFLA_MAP]) { 1511 if (tb[IFLA_MAP]) {
@@ -1531,7 +1534,7 @@ static int do_setlink(const struct sk_buff *skb,
1531 if (err < 0) 1534 if (err < 0)
1532 goto errout; 1535 goto errout;
1533 1536
1534 modified = 1; 1537 status |= DO_SETLINK_NOTIFY;
1535 } 1538 }
1536 1539
1537 if (tb[IFLA_ADDRESS]) { 1540 if (tb[IFLA_ADDRESS]) {
@@ -1551,19 +1554,19 @@ static int do_setlink(const struct sk_buff *skb,
1551 kfree(sa); 1554 kfree(sa);
1552 if (err) 1555 if (err)
1553 goto errout; 1556 goto errout;
1554 modified = 1; 1557 status |= DO_SETLINK_MODIFIED;
1555 } 1558 }
1556 1559
1557 if (tb[IFLA_MTU]) { 1560 if (tb[IFLA_MTU]) {
1558 err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); 1561 err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
1559 if (err < 0) 1562 if (err < 0)
1560 goto errout; 1563 goto errout;
1561 modified = 1; 1564 status |= DO_SETLINK_MODIFIED;
1562 } 1565 }
1563 1566
1564 if (tb[IFLA_GROUP]) { 1567 if (tb[IFLA_GROUP]) {
1565 dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); 1568 dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
1566 modified = 1; 1569 status |= DO_SETLINK_NOTIFY;
1567 } 1570 }
1568 1571
1569 /* 1572 /*
@@ -1575,7 +1578,7 @@ static int do_setlink(const struct sk_buff *skb,
1575 err = dev_change_name(dev, ifname); 1578 err = dev_change_name(dev, ifname);
1576 if (err < 0) 1579 if (err < 0)
1577 goto errout; 1580 goto errout;
1578 modified = 1; 1581 status |= DO_SETLINK_MODIFIED;
1579 } 1582 }
1580 1583
1581 if (tb[IFLA_IFALIAS]) { 1584 if (tb[IFLA_IFALIAS]) {
@@ -1583,7 +1586,7 @@ static int do_setlink(const struct sk_buff *skb,
1583 nla_len(tb[IFLA_IFALIAS])); 1586 nla_len(tb[IFLA_IFALIAS]));
1584 if (err < 0) 1587 if (err < 0)
1585 goto errout; 1588 goto errout;
1586 modified = 1; 1589 status |= DO_SETLINK_NOTIFY;
1587 } 1590 }
1588 1591
1589 if (tb[IFLA_BROADCAST]) { 1592 if (tb[IFLA_BROADCAST]) {
@@ -1601,25 +1604,35 @@ static int do_setlink(const struct sk_buff *skb,
1601 err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); 1604 err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]));
1602 if (err) 1605 if (err)
1603 goto errout; 1606 goto errout;
1604 modified = 1; 1607 status |= DO_SETLINK_MODIFIED;
1605 } 1608 }
1606 1609
1607 if (tb[IFLA_CARRIER]) { 1610 if (tb[IFLA_CARRIER]) {
1608 err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); 1611 err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
1609 if (err) 1612 if (err)
1610 goto errout; 1613 goto errout;
1611 modified = 1; 1614 status |= DO_SETLINK_MODIFIED;
1612 } 1615 }
1613 1616
1614 if (tb[IFLA_TXQLEN]) 1617 if (tb[IFLA_TXQLEN]) {
1615 dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); 1618 unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
1619
1620 if (dev->tx_queue_len ^ value)
1621 status |= DO_SETLINK_NOTIFY;
1622
1623 dev->tx_queue_len = value;
1624 }
1616 1625
1617 if (tb[IFLA_OPERSTATE]) 1626 if (tb[IFLA_OPERSTATE])
1618 set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); 1627 set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
1619 1628
1620 if (tb[IFLA_LINKMODE]) { 1629 if (tb[IFLA_LINKMODE]) {
1630 unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);
1631
1621 write_lock_bh(&dev_base_lock); 1632 write_lock_bh(&dev_base_lock);
1622 dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); 1633 if (dev->link_mode ^ value)
1634 status |= DO_SETLINK_NOTIFY;
1635 dev->link_mode = value;
1623 write_unlock_bh(&dev_base_lock); 1636 write_unlock_bh(&dev_base_lock);
1624 } 1637 }
1625 1638
@@ -1634,7 +1647,7 @@ static int do_setlink(const struct sk_buff *skb,
1634 err = do_setvfinfo(dev, attr); 1647 err = do_setvfinfo(dev, attr);
1635 if (err < 0) 1648 if (err < 0)
1636 goto errout; 1649 goto errout;
1637 modified = 1; 1650 status |= DO_SETLINK_NOTIFY;
1638 } 1651 }
1639 } 1652 }
1640 err = 0; 1653 err = 0;
@@ -1664,7 +1677,7 @@ static int do_setlink(const struct sk_buff *skb,
1664 err = ops->ndo_set_vf_port(dev, vf, port); 1677 err = ops->ndo_set_vf_port(dev, vf, port);
1665 if (err < 0) 1678 if (err < 0)
1666 goto errout; 1679 goto errout;
1667 modified = 1; 1680 status |= DO_SETLINK_NOTIFY;
1668 } 1681 }
1669 } 1682 }
1670 err = 0; 1683 err = 0;
@@ -1682,7 +1695,7 @@ static int do_setlink(const struct sk_buff *skb,
1682 err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); 1695 err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port);
1683 if (err < 0) 1696 if (err < 0)
1684 goto errout; 1697 goto errout;
1685 modified = 1; 1698 status |= DO_SETLINK_NOTIFY;
1686 } 1699 }
1687 1700
1688 if (tb[IFLA_AF_SPEC]) { 1701 if (tb[IFLA_AF_SPEC]) {
@@ -1699,15 +1712,20 @@ static int do_setlink(const struct sk_buff *skb,
1699 if (err < 0) 1712 if (err < 0)
1700 goto errout; 1713 goto errout;
1701 1714
1702 modified = 1; 1715 status |= DO_SETLINK_NOTIFY;
1703 } 1716 }
1704 } 1717 }
1705 err = 0; 1718 err = 0;
1706 1719
1707errout: 1720errout:
1708 if (err < 0 && modified) 1721 if (status & DO_SETLINK_MODIFIED) {
1709 net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", 1722 if (status & DO_SETLINK_NOTIFY)
1710 dev->name); 1723 netdev_state_change(dev);
1724
1725 if (err < 0)
1726 net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
1727 dev->name);
1728 }
1711 1729
1712 return err; 1730 return err;
1713} 1731}
@@ -1989,7 +2007,7 @@ replay:
1989 } 2007 }
1990 2008
1991 if (dev) { 2009 if (dev) {
1992 int modified = 0; 2010 int status = 0;
1993 2011
1994 if (nlh->nlmsg_flags & NLM_F_EXCL) 2012 if (nlh->nlmsg_flags & NLM_F_EXCL)
1995 return -EEXIST; 2013 return -EEXIST;
@@ -2004,7 +2022,7 @@ replay:
2004 err = ops->changelink(dev, tb, data); 2022 err = ops->changelink(dev, tb, data);
2005 if (err < 0) 2023 if (err < 0)
2006 return err; 2024 return err;
2007 modified = 1; 2025 status |= DO_SETLINK_NOTIFY;
2008 } 2026 }
2009 2027
2010 if (linkinfo[IFLA_INFO_SLAVE_DATA]) { 2028 if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
@@ -2015,10 +2033,10 @@ replay:
2015 tb, slave_data); 2033 tb, slave_data);
2016 if (err < 0) 2034 if (err < 0)
2017 return err; 2035 return err;
2018 modified = 1; 2036 status |= DO_SETLINK_NOTIFY;
2019 } 2037 }
2020 2038
2021 return do_setlink(skb, dev, ifm, tb, ifname, modified); 2039 return do_setlink(skb, dev, ifm, tb, ifname, status);
2022 } 2040 }
2023 2041
2024 if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { 2042 if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
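The two status bits replace the old all-or-nothing "modified" flag: DO_SETLINK_MODIFIED only records that device state was touched, so the partial-failure warning still fires, while DO_SETLINK_NOTIFY additionally requests a netlink notification and is defined to include the MODIFIED bit. A condensed sketch of the resulting tail logic (hypothetical helper name):

static int my_setlink_tail(struct net_device *dev, int err, int status)
{
	if (status & DO_SETLINK_MODIFIED) {	/* true for either bit */
		if (status & DO_SETLINK_NOTIFY)
			netdev_state_change(dev);
		if (err < 0)
			net_warn_ratelimited("partial link change on %s\n",
					     dev->name);
	}
	return err;
}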
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index ba71212f0251..51dd3193a33e 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -35,7 +35,7 @@ static u32 seq_scale(u32 seq)
35 * overlaps less than one time per MSL (2 minutes). 35 * overlaps less than one time per MSL (2 minutes).
36 * Choosing a clock of 64 ns period is OK. (period of 274 s) 36 * Choosing a clock of 64 ns period is OK. (period of 274 s)
37 */ 37 */
38 return seq + (ktime_to_ns(ktime_get_real()) >> 6); 38 return seq + (ktime_get_real_ns() >> 6);
39} 39}
40#endif 40#endif
41 41
@@ -135,7 +135,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
135 md5_transform(hash, net_secret); 135 md5_transform(hash, net_secret);
136 136
137 seq = hash[0] | (((u64)hash[1]) << 32); 137 seq = hash[0] | (((u64)hash[1]) << 32);
138 seq += ktime_to_ns(ktime_get_real()); 138 seq += ktime_get_real_ns();
139 seq &= (1ull << 48) - 1; 139 seq &= (1ull << 48) - 1;
140 140
141 return seq; 141 return seq;
@@ -163,7 +163,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
163 md5_transform(hash, secret); 163 md5_transform(hash, secret);
164 164
165 seq = hash[0] | (((u64)hash[1]) << 32); 165 seq = hash[0] | (((u64)hash[1]) << 32);
166 seq += ktime_to_ns(ktime_get_real()); 166 seq += ktime_get_real_ns();
167 seq &= (1ull << 48) - 1; 167 seq &= (1ull << 48) - 1;
168 168
169 return seq; 169 return seq;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8d289697cc7a..7b3df0d518ab 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -257,16 +257,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
257 kmemcheck_annotate_variable(shinfo->destructor_arg); 257 kmemcheck_annotate_variable(shinfo->destructor_arg);
258 258
259 if (flags & SKB_ALLOC_FCLONE) { 259 if (flags & SKB_ALLOC_FCLONE) {
260 struct sk_buff *child = skb + 1; 260 struct sk_buff_fclones *fclones;
261 atomic_t *fclone_ref = (atomic_t *) (child + 1);
262 261
263 kmemcheck_annotate_bitfield(child, flags1); 262 fclones = container_of(skb, struct sk_buff_fclones, skb1);
264 kmemcheck_annotate_bitfield(child, flags2); 263
264 kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
265 skb->fclone = SKB_FCLONE_ORIG; 265 skb->fclone = SKB_FCLONE_ORIG;
266 atomic_set(fclone_ref, 1); 266 atomic_set(&fclones->fclone_ref, 1);
267 267
268 child->fclone = SKB_FCLONE_UNAVAILABLE; 268 fclones->skb2.fclone = SKB_FCLONE_FREE;
269 child->pfmemalloc = pfmemalloc; 269 fclones->skb2.pfmemalloc = pfmemalloc;
270 } 270 }
271out: 271out:
272 return skb; 272 return skb;
@@ -491,32 +491,33 @@ static void skb_free_head(struct sk_buff *skb)
491 491
492static void skb_release_data(struct sk_buff *skb) 492static void skb_release_data(struct sk_buff *skb)
493{ 493{
494 if (!skb->cloned || 494 struct skb_shared_info *shinfo = skb_shinfo(skb);
495 !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, 495 int i;
496 &skb_shinfo(skb)->dataref)) {
497 if (skb_shinfo(skb)->nr_frags) {
498 int i;
499 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
500 skb_frag_unref(skb, i);
501 }
502 496
503 /* 497 if (skb->cloned &&
504 * If skb buf is from userspace, we need to notify the caller 498 atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
505 * the lower device DMA has done; 499 &shinfo->dataref))
506 */ 500 return;
507 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
508 struct ubuf_info *uarg;
509 501
510 uarg = skb_shinfo(skb)->destructor_arg; 502 for (i = 0; i < shinfo->nr_frags; i++)
511 if (uarg->callback) 503 __skb_frag_unref(&shinfo->frags[i]);
512 uarg->callback(uarg, true);
513 }
514 504
515 if (skb_has_frag_list(skb)) 505 /*
516 skb_drop_fraglist(skb); 506 * If skb buf is from userspace, we need to notify the caller
507 * the lower device DMA has done;
508 */
509 if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
510 struct ubuf_info *uarg;
517 511
518 skb_free_head(skb); 512 uarg = shinfo->destructor_arg;
513 if (uarg->callback)
514 uarg->callback(uarg, true);
519 } 515 }
516
517 if (shinfo->frag_list)
518 kfree_skb_list(shinfo->frag_list);
519
520 skb_free_head(skb);
520} 521}
521 522
522/* 523/*
@@ -524,8 +525,7 @@ static void skb_release_data(struct sk_buff *skb)
524 */ 525 */
525static void kfree_skbmem(struct sk_buff *skb) 526static void kfree_skbmem(struct sk_buff *skb)
526{ 527{
527 struct sk_buff *other; 528 struct sk_buff_fclones *fclones;
528 atomic_t *fclone_ref;
529 529
530 switch (skb->fclone) { 530 switch (skb->fclone) {
531 case SKB_FCLONE_UNAVAILABLE: 531 case SKB_FCLONE_UNAVAILABLE:
@@ -533,22 +533,28 @@ static void kfree_skbmem(struct sk_buff *skb)
533 break; 533 break;
534 534
535 case SKB_FCLONE_ORIG: 535 case SKB_FCLONE_ORIG:
536 fclone_ref = (atomic_t *) (skb + 2); 536 fclones = container_of(skb, struct sk_buff_fclones, skb1);
537 if (atomic_dec_and_test(fclone_ref)) 537 if (atomic_dec_and_test(&fclones->fclone_ref))
538 kmem_cache_free(skbuff_fclone_cache, skb); 538 kmem_cache_free(skbuff_fclone_cache, fclones);
539 break; 539 break;
540 540
541 case SKB_FCLONE_CLONE: 541 case SKB_FCLONE_CLONE:
542 fclone_ref = (atomic_t *) (skb + 1); 542 fclones = container_of(skb, struct sk_buff_fclones, skb2);
543 other = skb - 1;
544 543
545 /* The clone portion is available for 544 /* Warning : We must perform the atomic_dec_and_test() before
546 * fast-cloning again. 545 * setting skb->fclone back to SKB_FCLONE_FREE, otherwise
546 * skb_clone() could set clone_ref to 2 before our decrement.
547 * Anyway, if we are going to free the structure, no need to
548 * rewrite skb->fclone.
547 */ 549 */
548 skb->fclone = SKB_FCLONE_UNAVAILABLE; 550 if (atomic_dec_and_test(&fclones->fclone_ref)) {
549 551 kmem_cache_free(skbuff_fclone_cache, fclones);
550 if (atomic_dec_and_test(fclone_ref)) 552 } else {
551 kmem_cache_free(skbuff_fclone_cache, other); 553 /* The clone portion is available for
554 * fast-cloning again.
555 */
556 skb->fclone = SKB_FCLONE_FREE;
557 }
552 break; 558 break;
553 } 559 }
554} 560}
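The container_of() conversions above assume that the fast-clone slab object is now described by a dedicated structure rather than by pointer arithmetic on adjacent sk_buffs. Roughly, as a sketch (the authoritative definition is in include/linux/skbuff.h):

struct sk_buff_fclones {
	struct sk_buff	skb1;		/* the original skb (SKB_FCLONE_ORIG) */
	struct sk_buff	skb2;		/* its fast clone (SKB_FCLONE_CLONE)  */
	atomic_t	fclone_ref;	/* refcount shared by the pair        */
};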
@@ -566,7 +572,7 @@ static void skb_release_head_state(struct sk_buff *skb)
566#if IS_ENABLED(CONFIG_NF_CONNTRACK) 572#if IS_ENABLED(CONFIG_NF_CONNTRACK)
567 nf_conntrack_put(skb->nfct); 573 nf_conntrack_put(skb->nfct);
568#endif 574#endif
569#ifdef CONFIG_BRIDGE_NETFILTER 575#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
570 nf_bridge_put(skb->nf_bridge); 576 nf_bridge_put(skb->nf_bridge);
571#endif 577#endif
572/* XXX: IS this still necessary? - JHS */ 578/* XXX: IS this still necessary? - JHS */
@@ -674,57 +680,61 @@ void consume_skb(struct sk_buff *skb)
674} 680}
675EXPORT_SYMBOL(consume_skb); 681EXPORT_SYMBOL(consume_skb);
676 682
683/* Make sure a field is enclosed inside headers_start/headers_end section */
684#define CHECK_SKB_FIELD(field) \
685 BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
686 offsetof(struct sk_buff, headers_start)); \
687 BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
688 offsetof(struct sk_buff, headers_end)); \
689
677static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) 690static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
678{ 691{
679 new->tstamp = old->tstamp; 692 new->tstamp = old->tstamp;
693 /* We do not copy old->sk */
680 new->dev = old->dev; 694 new->dev = old->dev;
681 new->transport_header = old->transport_header; 695 memcpy(new->cb, old->cb, sizeof(old->cb));
682 new->network_header = old->network_header;
683 new->mac_header = old->mac_header;
684 new->inner_protocol = old->inner_protocol;
685 new->inner_transport_header = old->inner_transport_header;
686 new->inner_network_header = old->inner_network_header;
687 new->inner_mac_header = old->inner_mac_header;
688 skb_dst_copy(new, old); 696 skb_dst_copy(new, old);
689 skb_copy_hash(new, old);
690 new->ooo_okay = old->ooo_okay;
691 new->no_fcs = old->no_fcs;
692 new->encapsulation = old->encapsulation;
693 new->encap_hdr_csum = old->encap_hdr_csum;
694 new->csum_valid = old->csum_valid;
695 new->csum_complete_sw = old->csum_complete_sw;
696#ifdef CONFIG_XFRM 697#ifdef CONFIG_XFRM
697 new->sp = secpath_get(old->sp); 698 new->sp = secpath_get(old->sp);
698#endif 699#endif
699 memcpy(new->cb, old->cb, sizeof(old->cb)); 700 __nf_copy(new, old, false);
700 new->csum = old->csum; 701
701 new->ignore_df = old->ignore_df; 702 /* Note : this field could be in headers_start/headers_end section
702 new->pkt_type = old->pkt_type; 703 * It is not yet because we do not want to have a 16 bit hole
703 new->ip_summed = old->ip_summed; 704 */
704 skb_copy_queue_mapping(new, old); 705 new->queue_mapping = old->queue_mapping;
705 new->priority = old->priority; 706
706#if IS_ENABLED(CONFIG_IP_VS) 707 memcpy(&new->headers_start, &old->headers_start,
707 new->ipvs_property = old->ipvs_property; 708 offsetof(struct sk_buff, headers_end) -
709 offsetof(struct sk_buff, headers_start));
710 CHECK_SKB_FIELD(protocol);
711 CHECK_SKB_FIELD(csum);
712 CHECK_SKB_FIELD(hash);
713 CHECK_SKB_FIELD(priority);
714 CHECK_SKB_FIELD(skb_iif);
715 CHECK_SKB_FIELD(vlan_proto);
716 CHECK_SKB_FIELD(vlan_tci);
717 CHECK_SKB_FIELD(transport_header);
718 CHECK_SKB_FIELD(network_header);
719 CHECK_SKB_FIELD(mac_header);
720 CHECK_SKB_FIELD(inner_protocol);
721 CHECK_SKB_FIELD(inner_transport_header);
722 CHECK_SKB_FIELD(inner_network_header);
723 CHECK_SKB_FIELD(inner_mac_header);
724 CHECK_SKB_FIELD(mark);
725#ifdef CONFIG_NETWORK_SECMARK
726 CHECK_SKB_FIELD(secmark);
727#endif
728#ifdef CONFIG_NET_RX_BUSY_POLL
729 CHECK_SKB_FIELD(napi_id);
708#endif 730#endif
709 new->pfmemalloc = old->pfmemalloc;
710 new->protocol = old->protocol;
711 new->mark = old->mark;
712 new->skb_iif = old->skb_iif;
713 __nf_copy(new, old);
714#ifdef CONFIG_NET_SCHED 731#ifdef CONFIG_NET_SCHED
715 new->tc_index = old->tc_index; 732 CHECK_SKB_FIELD(tc_index);
716#ifdef CONFIG_NET_CLS_ACT 733#ifdef CONFIG_NET_CLS_ACT
717 new->tc_verd = old->tc_verd; 734 CHECK_SKB_FIELD(tc_verd);
718#endif 735#endif
719#endif 736#endif
720 new->vlan_proto = old->vlan_proto;
721 new->vlan_tci = old->vlan_tci;
722 737
723 skb_copy_secmark(new, old);
724
725#ifdef CONFIG_NET_RX_BUSY_POLL
726 new->napi_id = old->napi_id;
727#endif
728} 738}
729 739
730/* 740/*
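The single memcpy() in __copy_skb_header() relies on two zero-length markers that bracket the copyable region of struct sk_buff, and CHECK_SKB_FIELD() asserts at build time that each individually named field really lies between them. Schematically, with the field list heavily abbreviated:

struct sk_buff {
	/* ... fields that must not be copied wholesale: next, prev, sk, ... */
	__u32			headers_start[0];
	/* protocol, csum, hash, priority, header offsets, mark, ... */
	__u32			headers_end[0];
	/* ... tail, end, head, data, truesize, users ... */
};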
@@ -855,17 +865,22 @@ EXPORT_SYMBOL_GPL(skb_copy_ubufs);
855 865
856struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) 866struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
857{ 867{
858 struct sk_buff *n; 868 struct sk_buff_fclones *fclones = container_of(skb,
869 struct sk_buff_fclones,
870 skb1);
871 struct sk_buff *n = &fclones->skb2;
859 872
860 if (skb_orphan_frags(skb, gfp_mask)) 873 if (skb_orphan_frags(skb, gfp_mask))
861 return NULL; 874 return NULL;
862 875
863 n = skb + 1;
864 if (skb->fclone == SKB_FCLONE_ORIG && 876 if (skb->fclone == SKB_FCLONE_ORIG &&
865 n->fclone == SKB_FCLONE_UNAVAILABLE) { 877 n->fclone == SKB_FCLONE_FREE) {
866 atomic_t *fclone_ref = (atomic_t *) (n + 1);
867 n->fclone = SKB_FCLONE_CLONE; 878 n->fclone = SKB_FCLONE_CLONE;
868 atomic_inc(fclone_ref); 879 /* As our fastclone was free, clone_ref must be 1 at this point.
880 * We could use atomic_inc() here, but it is faster
881 * to set the final value.
882 */
883 atomic_set(&fclones->fclone_ref, 2);
869 } else { 884 } else {
870 if (skb_pfmemalloc(skb)) 885 if (skb_pfmemalloc(skb))
871 gfp_mask |= __GFP_MEMALLOC; 886 gfp_mask |= __GFP_MEMALLOC;
@@ -875,7 +890,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
875 return NULL; 890 return NULL;
876 891
877 kmemcheck_annotate_bitfield(n, flags1); 892 kmemcheck_annotate_bitfield(n, flags1);
878 kmemcheck_annotate_bitfield(n, flags2);
879 n->fclone = SKB_FCLONE_UNAVAILABLE; 893 n->fclone = SKB_FCLONE_UNAVAILABLE;
880 } 894 }
881 895
@@ -3069,6 +3083,11 @@ perform_csum_check:
3069 } 3083 }
3070 } while ((offset += len) < head_skb->len); 3084 } while ((offset += len) < head_skb->len);
3071 3085
3086 /* Some callers want to get the end of the list.
3087 * Put it in segs->prev to avoid walking the list.
3088 * (see validate_xmit_skb_list() for example)
3089 */
3090 segs->prev = tail;
3072 return segs; 3091 return segs;
3073 3092
3074err: 3093err:
@@ -3182,7 +3201,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
3182 skb_shinfo(nskb)->frag_list = p; 3201 skb_shinfo(nskb)->frag_list = p;
3183 skb_shinfo(nskb)->gso_size = pinfo->gso_size; 3202 skb_shinfo(nskb)->gso_size = pinfo->gso_size;
3184 pinfo->gso_size = 0; 3203 pinfo->gso_size = 0;
3185 skb_header_release(p); 3204 __skb_header_release(p);
3186 NAPI_GRO_CB(nskb)->last = p; 3205 NAPI_GRO_CB(nskb)->last = p;
3187 3206
3188 nskb->data_len += p->len; 3207 nskb->data_len += p->len;
@@ -3214,7 +3233,7 @@ merge:
3214 else 3233 else
3215 NAPI_GRO_CB(p)->last->next = skb; 3234 NAPI_GRO_CB(p)->last->next = skb;
3216 NAPI_GRO_CB(p)->last = skb; 3235 NAPI_GRO_CB(p)->last = skb;
3217 skb_header_release(skb); 3236 __skb_header_release(skb);
3218 lp = p; 3237 lp = p;
3219 3238
3220done: 3239done:
@@ -3230,7 +3249,6 @@ done:
3230 NAPI_GRO_CB(skb)->same_flow = 1; 3249 NAPI_GRO_CB(skb)->same_flow = 1;
3231 return 0; 3250 return 0;
3232} 3251}
3233EXPORT_SYMBOL_GPL(skb_gro_receive);
3234 3252
3235void __init skb_init(void) 3253void __init skb_init(void)
3236{ 3254{
@@ -3240,8 +3258,7 @@ void __init skb_init(void)
3240 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 3258 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
3241 NULL); 3259 NULL);
3242 skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", 3260 skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
3243 (2*sizeof(struct sk_buff)) + 3261 sizeof(struct sk_buff_fclones),
3244 sizeof(atomic_t),
3245 0, 3262 0,
3246 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 3263 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
3247 NULL); 3264 NULL);
@@ -3494,32 +3511,66 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
3494} 3511}
3495EXPORT_SYMBOL(sock_queue_err_skb); 3512EXPORT_SYMBOL(sock_queue_err_skb);
3496 3513
3497void __skb_tstamp_tx(struct sk_buff *orig_skb, 3514struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
3498 struct skb_shared_hwtstamps *hwtstamps,
3499 struct sock *sk, int tstype)
3500{ 3515{
3501 struct sock_exterr_skb *serr; 3516 struct sk_buff_head *q = &sk->sk_error_queue;
3502 struct sk_buff *skb; 3517 struct sk_buff *skb, *skb_next;
3503 int err; 3518 int err = 0;
3504 3519
3505 if (!sk) 3520 spin_lock_bh(&q->lock);
3506 return; 3521 skb = __skb_dequeue(q);
3522 if (skb && (skb_next = skb_peek(q)))
3523 err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
3524 spin_unlock_bh(&q->lock);
3507 3525
3508 if (hwtstamps) { 3526 sk->sk_err = err;
3509 *skb_hwtstamps(orig_skb) = 3527 if (err)
3510 *hwtstamps; 3528 sk->sk_error_report(sk);
3511 } else { 3529
3512 /* 3530 return skb;
3513 * no hardware time stamps available, 3531}
3514 * so keep the shared tx_flags and only 3532EXPORT_SYMBOL(sock_dequeue_err_skb);
3515 * store software time stamp 3533
3516 */ 3534/**
3517 orig_skb->tstamp = ktime_get_real(); 3535 * skb_clone_sk - create clone of skb, and take reference to socket
3536 * @skb: the skb to clone
3537 *
3538 * This function creates a clone of a buffer that holds a reference on
3539 * sk_refcnt. Buffers created via this function are meant to be
 3540 * returned using sock_queue_err_skb, or freed via kfree_skb.
3541 *
3542 * When passing buffers allocated with this function to sock_queue_err_skb
3543 * it is necessary to wrap the call with sock_hold/sock_put in order to
3544 * prevent the socket from being released prior to being enqueued on
3545 * the sk_error_queue.
3546 */
3547struct sk_buff *skb_clone_sk(struct sk_buff *skb)
3548{
3549 struct sock *sk = skb->sk;
3550 struct sk_buff *clone;
3551
3552 if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt))
3553 return NULL;
3554
3555 clone = skb_clone(skb, GFP_ATOMIC);
3556 if (!clone) {
3557 sock_put(sk);
3558 return NULL;
3518 } 3559 }
3519 3560
3520 skb = skb_clone(orig_skb, GFP_ATOMIC); 3561 clone->sk = sk;
3521 if (!skb) 3562 clone->destructor = sock_efree;
3522 return; 3563
3564 return clone;
3565}
3566EXPORT_SYMBOL(skb_clone_sk);
3567
3568static void __skb_complete_tx_timestamp(struct sk_buff *skb,
3569 struct sock *sk,
3570 int tstype)
3571{
3572 struct sock_exterr_skb *serr;
3573 int err;
3523 3574
3524 serr = SKB_EXT_ERR(skb); 3575 serr = SKB_EXT_ERR(skb);
3525 memset(serr, 0, sizeof(*serr)); 3576 memset(serr, 0, sizeof(*serr));
@@ -3537,6 +3588,42 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
3537 if (err) 3588 if (err)
3538 kfree_skb(skb); 3589 kfree_skb(skb);
3539} 3590}
3591
3592void skb_complete_tx_timestamp(struct sk_buff *skb,
3593 struct skb_shared_hwtstamps *hwtstamps)
3594{
3595 struct sock *sk = skb->sk;
3596
3597 /* take a reference to prevent skb_orphan() from freeing the socket */
3598 sock_hold(sk);
3599
3600 *skb_hwtstamps(skb) = *hwtstamps;
3601 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
3602
3603 sock_put(sk);
3604}
3605EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
3606
3607void __skb_tstamp_tx(struct sk_buff *orig_skb,
3608 struct skb_shared_hwtstamps *hwtstamps,
3609 struct sock *sk, int tstype)
3610{
3611 struct sk_buff *skb;
3612
3613 if (!sk)
3614 return;
3615
3616 if (hwtstamps)
3617 *skb_hwtstamps(orig_skb) = *hwtstamps;
3618 else
3619 orig_skb->tstamp = ktime_get_real();
3620
3621 skb = skb_clone(orig_skb, GFP_ATOMIC);
3622 if (!skb)
3623 return;
3624
3625 __skb_complete_tx_timestamp(skb, sk, tstype);
3626}
3540EXPORT_SYMBOL_GPL(__skb_tstamp_tx); 3627EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
3541 3628
3542void skb_tstamp_tx(struct sk_buff *orig_skb, 3629void skb_tstamp_tx(struct sk_buff *orig_skb,
@@ -3561,9 +3648,14 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
3561 serr->ee.ee_errno = ENOMSG; 3648 serr->ee.ee_errno = ENOMSG;
3562 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; 3649 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
3563 3650
3651 /* take a reference to prevent skb_orphan() from freeing the socket */
3652 sock_hold(sk);
3653
3564 err = sock_queue_err_skb(sk, skb); 3654 err = sock_queue_err_skb(sk, skb);
3565 if (err) 3655 if (err)
3566 kfree_skb(skb); 3656 kfree_skb(skb);
3657
3658 sock_put(sk);
3567} 3659}
3568EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); 3660EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
3569 3661
@@ -3864,7 +3956,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
3864 return false; 3956 return false;
3865 3957
3866 if (len <= skb_tailroom(to)) { 3958 if (len <= skb_tailroom(to)) {
3867 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); 3959 if (len)
3960 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
3868 *delta_truesize = 0; 3961 *delta_truesize = 0;
3869 return true; 3962 return true;
3870 } 3963 }
@@ -4029,3 +4122,81 @@ err_free:
4029 return NULL; 4122 return NULL;
4030} 4123}
4031EXPORT_SYMBOL(skb_vlan_untag); 4124EXPORT_SYMBOL(skb_vlan_untag);
4125
4126/**
4127 * alloc_skb_with_frags - allocate skb with page frags
4128 *
 4129 * @header_len: size of linear part
 4130 * @data_len: needed length in frags
 4131 * @max_page_order: max page order desired.
 4132 * @errcode: pointer to error code if any
 4133 * @gfp_mask: allocation mask
4134 *
4135 * This can be used to allocate a paged skb, given a maximal order for frags.
4136 */
4137struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
4138 unsigned long data_len,
4139 int max_page_order,
4140 int *errcode,
4141 gfp_t gfp_mask)
4142{
4143 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
4144 unsigned long chunk;
4145 struct sk_buff *skb;
4146 struct page *page;
4147 gfp_t gfp_head;
4148 int i;
4149
4150 *errcode = -EMSGSIZE;
 4151 /* Note this test could be relaxed, if we manage to allocate
4152 * high order pages...
4153 */
4154 if (npages > MAX_SKB_FRAGS)
4155 return NULL;
4156
4157 gfp_head = gfp_mask;
4158 if (gfp_head & __GFP_WAIT)
4159 gfp_head |= __GFP_REPEAT;
4160
4161 *errcode = -ENOBUFS;
4162 skb = alloc_skb(header_len, gfp_head);
4163 if (!skb)
4164 return NULL;
4165
4166 skb->truesize += npages << PAGE_SHIFT;
4167
4168 for (i = 0; npages > 0; i++) {
4169 int order = max_page_order;
4170
4171 while (order) {
4172 if (npages >= 1 << order) {
4173 page = alloc_pages(gfp_mask |
4174 __GFP_COMP |
4175 __GFP_NOWARN |
4176 __GFP_NORETRY,
4177 order);
4178 if (page)
4179 goto fill_page;
4180 /* Do not retry other high order allocations */
4181 order = 1;
4182 max_page_order = 0;
4183 }
4184 order--;
4185 }
4186 page = alloc_page(gfp_mask);
4187 if (!page)
4188 goto failure;
4189fill_page:
4190 chunk = min_t(unsigned long, data_len,
4191 PAGE_SIZE << order);
4192 skb_fill_page_desc(skb, i, page, 0, chunk);
4193 data_len -= chunk;
4194 npages -= 1 << order;
4195 }
4196 return skb;
4197
4198failure:
4199 kfree_skb(skb);
4200 return NULL;
4201}
4202EXPORT_SYMBOL(alloc_skb_with_frags);
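A typical call mirrors what sock_alloc_send_pskb() does further down in this series: on failure, *errcode distinguishes an over-sized request (-EMSGSIZE) from allocation pressure (-ENOBUFS). A minimal sketch with an illustrative wrapper:

static struct sk_buff *my_big_alloc(unsigned long linear, unsigned long paged)
{
	struct sk_buff *skb;
	int err;

	/* Ask for up to order-3 pages; the helper falls back to order-0
	 * pages by itself when high-order allocations fail.
	 */
	skb = alloc_skb_with_frags(linear, paged, 3, &err, GFP_KERNEL);
	if (!skb)
		return ERR_PTR(err);

	return skb;
}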
diff --git a/net/core/sock.c b/net/core/sock.c
index 611f424fb76b..b4f3ea2fce60 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -437,7 +437,6 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
437int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 437int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
438{ 438{
439 int err; 439 int err;
440 int skb_len;
441 unsigned long flags; 440 unsigned long flags;
442 struct sk_buff_head *list = &sk->sk_receive_queue; 441 struct sk_buff_head *list = &sk->sk_receive_queue;
443 442
@@ -459,13 +458,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
459 skb->dev = NULL; 458 skb->dev = NULL;
460 skb_set_owner_r(skb, sk); 459 skb_set_owner_r(skb, sk);
461 460
462 /* Cache the SKB length before we tack it onto the receive
463 * queue. Once it is added it no longer belongs to us and
464 * may be freed by other threads of control pulling packets
465 * from the queue.
466 */
467 skb_len = skb->len;
468
469 /* we escape from rcu protected region, make sure we dont leak 461 /* we escape from rcu protected region, make sure we dont leak
470 * a norefcounted dst 462 * a norefcounted dst
471 */ 463 */
@@ -1642,18 +1634,24 @@ void sock_rfree(struct sk_buff *skb)
1642} 1634}
1643EXPORT_SYMBOL(sock_rfree); 1635EXPORT_SYMBOL(sock_rfree);
1644 1636
1637void sock_efree(struct sk_buff *skb)
1638{
1639 sock_put(skb->sk);
1640}
1641EXPORT_SYMBOL(sock_efree);
1642
1643#ifdef CONFIG_INET
1645void sock_edemux(struct sk_buff *skb) 1644void sock_edemux(struct sk_buff *skb)
1646{ 1645{
1647 struct sock *sk = skb->sk; 1646 struct sock *sk = skb->sk;
1648 1647
1649#ifdef CONFIG_INET
1650 if (sk->sk_state == TCP_TIME_WAIT) 1648 if (sk->sk_state == TCP_TIME_WAIT)
1651 inet_twsk_put(inet_twsk(sk)); 1649 inet_twsk_put(inet_twsk(sk));
1652 else 1650 else
1653#endif
1654 sock_put(sk); 1651 sock_put(sk);
1655} 1652}
1656EXPORT_SYMBOL(sock_edemux); 1653EXPORT_SYMBOL(sock_edemux);
1654#endif
1657 1655
1658kuid_t sock_i_uid(struct sock *sk) 1656kuid_t sock_i_uid(struct sock *sk)
1659{ 1657{
@@ -1761,21 +1759,12 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1761 unsigned long data_len, int noblock, 1759 unsigned long data_len, int noblock,
1762 int *errcode, int max_page_order) 1760 int *errcode, int max_page_order)
1763{ 1761{
1764 struct sk_buff *skb = NULL; 1762 struct sk_buff *skb;
1765 unsigned long chunk;
1766 gfp_t gfp_mask;
1767 long timeo; 1763 long timeo;
1768 int err; 1764 int err;
1769 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1770 struct page *page;
1771 int i;
1772
1773 err = -EMSGSIZE;
1774 if (npages > MAX_SKB_FRAGS)
1775 goto failure;
1776 1765
1777 timeo = sock_sndtimeo(sk, noblock); 1766 timeo = sock_sndtimeo(sk, noblock);
1778 while (!skb) { 1767 for (;;) {
1779 err = sock_error(sk); 1768 err = sock_error(sk);
1780 if (err != 0) 1769 if (err != 0)
1781 goto failure; 1770 goto failure;
@@ -1784,66 +1773,27 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1784 if (sk->sk_shutdown & SEND_SHUTDOWN) 1773 if (sk->sk_shutdown & SEND_SHUTDOWN)
1785 goto failure; 1774 goto failure;
1786 1775
1787 if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) { 1776 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1788 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 1777 break;
1789 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1790 err = -EAGAIN;
1791 if (!timeo)
1792 goto failure;
1793 if (signal_pending(current))
1794 goto interrupted;
1795 timeo = sock_wait_for_wmem(sk, timeo);
1796 continue;
1797 }
1798
1799 err = -ENOBUFS;
1800 gfp_mask = sk->sk_allocation;
1801 if (gfp_mask & __GFP_WAIT)
1802 gfp_mask |= __GFP_REPEAT;
1803 1778
1804 skb = alloc_skb(header_len, gfp_mask); 1779 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1805 if (!skb) 1780 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1781 err = -EAGAIN;
1782 if (!timeo)
1806 goto failure; 1783 goto failure;
1807 1784 if (signal_pending(current))
1808 skb->truesize += data_len; 1785 goto interrupted;
1809 1786 timeo = sock_wait_for_wmem(sk, timeo);
1810 for (i = 0; npages > 0; i++) {
1811 int order = max_page_order;
1812
1813 while (order) {
1814 if (npages >= 1 << order) {
1815 page = alloc_pages(sk->sk_allocation |
1816 __GFP_COMP |
1817 __GFP_NOWARN |
1818 __GFP_NORETRY,
1819 order);
1820 if (page)
1821 goto fill_page;
1822 /* Do not retry other high order allocations */
1823 order = 1;
1824 max_page_order = 0;
1825 }
1826 order--;
1827 }
1828 page = alloc_page(sk->sk_allocation);
1829 if (!page)
1830 goto failure;
1831fill_page:
1832 chunk = min_t(unsigned long, data_len,
1833 PAGE_SIZE << order);
1834 skb_fill_page_desc(skb, i, page, 0, chunk);
1835 data_len -= chunk;
1836 npages -= 1 << order;
1837 }
1838 } 1787 }
1839 1788 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1840 skb_set_owner_w(skb, sk); 1789 errcode, sk->sk_allocation);
1790 if (skb)
1791 skb_set_owner_w(skb, sk);
1841 return skb; 1792 return skb;
1842 1793
1843interrupted: 1794interrupted:
1844 err = sock_intr_errno(timeo); 1795 err = sock_intr_errno(timeo);
1845failure: 1796failure:
1846 kfree_skb(skb);
1847 *errcode = err; 1797 *errcode = err;
1848 return NULL; 1798 return NULL;
1849} 1799}
@@ -2492,11 +2442,11 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2492 int level, int type) 2442 int level, int type)
2493{ 2443{
2494 struct sock_exterr_skb *serr; 2444 struct sock_exterr_skb *serr;
2495 struct sk_buff *skb, *skb2; 2445 struct sk_buff *skb;
2496 int copied, err; 2446 int copied, err;
2497 2447
2498 err = -EAGAIN; 2448 err = -EAGAIN;
2499 skb = skb_dequeue(&sk->sk_error_queue); 2449 skb = sock_dequeue_err_skb(sk);
2500 if (skb == NULL) 2450 if (skb == NULL)
2501 goto out; 2451 goto out;
2502 2452
@@ -2517,16 +2467,6 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2517 msg->msg_flags |= MSG_ERRQUEUE; 2467 msg->msg_flags |= MSG_ERRQUEUE;
2518 err = copied; 2468 err = copied;
2519 2469
2520 /* Reset and regenerate socket error */
2521 spin_lock_bh(&sk->sk_error_queue.lock);
2522 sk->sk_err = 0;
2523 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2524 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2525 spin_unlock_bh(&sk->sk_error_queue.lock);
2526 sk->sk_error_report(sk);
2527 } else
2528 spin_unlock_bh(&sk->sk_error_queue.lock);
2529
2530out_free_skb: 2470out_free_skb:
2531 kfree_skb(skb); 2471 kfree_skb(skb);
2532out: 2472out:
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index a8770391ea5b..43d3dd62fcc8 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -36,10 +36,9 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
36{ 36{
37 struct phy_device *phydev; 37 struct phy_device *phydev;
38 struct sk_buff *clone; 38 struct sk_buff *clone;
39 struct sock *sk = skb->sk;
40 unsigned int type; 39 unsigned int type;
41 40
42 if (!sk) 41 if (!skb->sk)
43 return; 42 return;
44 43
45 type = classify(skb); 44 type = classify(skb);
@@ -48,50 +47,14 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
48 47
49 phydev = skb->dev->phydev; 48 phydev = skb->dev->phydev;
50 if (likely(phydev->drv->txtstamp)) { 49 if (likely(phydev->drv->txtstamp)) {
51 if (!atomic_inc_not_zero(&sk->sk_refcnt)) 50 clone = skb_clone_sk(skb);
51 if (!clone)
52 return; 52 return;
53
54 clone = skb_clone(skb, GFP_ATOMIC);
55 if (!clone) {
56 sock_put(sk);
57 return;
58 }
59
60 clone->sk = sk;
61 phydev->drv->txtstamp(phydev, clone, type); 53 phydev->drv->txtstamp(phydev, clone, type);
62 } 54 }
63} 55}
64EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); 56EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
65 57
66void skb_complete_tx_timestamp(struct sk_buff *skb,
67 struct skb_shared_hwtstamps *hwtstamps)
68{
69 struct sock *sk = skb->sk;
70 struct sock_exterr_skb *serr;
71 int err;
72
73 if (!hwtstamps) {
74 sock_put(sk);
75 kfree_skb(skb);
76 return;
77 }
78
79 *skb_hwtstamps(skb) = *hwtstamps;
80
81 serr = SKB_EXT_ERR(skb);
82 memset(serr, 0, sizeof(*serr));
83 serr->ee.ee_errno = ENOMSG;
84 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
85 skb->sk = NULL;
86
87 err = sock_queue_err_skb(sk, skb);
88
89 sock_put(sk);
90 if (err)
91 kfree_skb(skb);
92}
93EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
94
95bool skb_defer_rx_timestamp(struct sk_buff *skb) 58bool skb_defer_rx_timestamp(struct sk_buff *skb)
96{ 59{
97 struct phy_device *phydev; 60 struct phy_device *phydev;
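With skb_clone_sk() and skb_complete_tx_timestamp() now living in the core, a PHY driver's deferred TX timestamp path reduces to something like the following sketch (the priv structure and the nanosecond source are assumptions, not part of this patch):

struct my_phy_priv {
	struct sk_buff *tx_skb;		/* clone saved from the ->txtstamp() hook */
};

static void my_phy_tx_tstamp_done(struct my_phy_priv *priv, u64 ns)
{
	struct skb_shared_hwtstamps hwts = {
		.hwtstamp = ns_to_ktime(ns),
	};

	/* Consumes the clone that skb_clone_sk() produced for this packet. */
	skb_complete_tx_timestamp(priv->tx_skb, &hwts);
	priv->tx_skb = NULL;
}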
diff --git a/net/core/utils.c b/net/core/utils.c
index eed34338736c..efc76dd9dcd1 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -306,16 +306,14 @@ EXPORT_SYMBOL(in6_pton);
306void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, 306void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
307 __be32 from, __be32 to, int pseudohdr) 307 __be32 from, __be32 to, int pseudohdr)
308{ 308{
309 __be32 diff[] = { ~from, to };
310 if (skb->ip_summed != CHECKSUM_PARTIAL) { 309 if (skb->ip_summed != CHECKSUM_PARTIAL) {
311 *sum = csum_fold(csum_partial(diff, sizeof(diff), 310 *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from),
312 ~csum_unfold(*sum))); 311 to));
313 if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) 312 if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
314 skb->csum = ~csum_partial(diff, sizeof(diff), 313 skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to);
315 ~skb->csum);
316 } else if (pseudohdr) 314 } else if (pseudohdr)
317 *sum = ~csum_fold(csum_partial(diff, sizeof(diff), 315 *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from),
318 csum_unfold(*sum))); 316 to));
319} 317}
320EXPORT_SYMBOL(inet_proto_csum_replace4); 318EXPORT_SYMBOL(inet_proto_csum_replace4);
321 319
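Both the old and the new body of inet_proto_csum_replace4() implement the RFC 1624 incremental update HC' = ~(~HC + ~m + m'); the rewrite only trades the temporary { ~from, to } array fed to csum_partial() for direct csum_sub()/csum_add() arithmetic. A sketch of the non-pseudo-header case showing the equivalence (function names are illustrative):

static __sum16 my_replace4_old(__sum16 sum, __be32 from, __be32 to)
{
	__be32 diff[] = { ~from, to };

	return csum_fold(csum_partial(diff, sizeof(diff), ~csum_unfold(sum)));
}

static __sum16 my_replace4_new(__sum16 sum, __be32 from, __be32 to)
{
	return csum_fold(csum_add(csum_sub(~csum_unfold(sum),
					   (__force __wsum)from),
				  (__force __wsum)to));
}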