diff options
author | Eric Dumazet <edumazet@google.com> | 2014-10-05 21:38:35 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-10-07 13:22:11 -0400 |
commit | 0287587884b15041203b3a362d485e1ab1f24445 (patch) | |
tree | 675ae57663c1ba3ee8768e65e7fb0e6d0259e04c /net | |
parent | fe971b95c22578456ff7198537827841c726d3f7 (diff) |
net: better IFF_XMIT_DST_RELEASE support
Testing xmit_more support with netperf and connected UDP sockets,
I found strange dst refcount false sharing.
Current handling of IFF_XMIT_DST_RELEASE is not optimal.
Dropping dst in validate_xmit_skb() is certainly too late in case
the packet was queued by cpu X but dequeued by cpu Y.
The logical point to take care of drop/force is in __dev_queue_xmit(),
before even taking the qdisc lock.
As Julian Anastasov pointed out, the need for skb_dst() might come from
some packet schedulers or classifiers.
This patch adds a new helper to cleanly express the needs of various drivers
or qdiscs/classifiers.
Drivers that need skb_dst() in their ndo_start_xmit() should call the
following helper in their setup instead of the prior idiom:
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
->
netif_keep_dst(dev);
Instead of using a single bit, we use two bits, one being
eventually rebuilt in bonding/team drivers.
The other one is permanent and blocks IFF_XMIT_DST_RELEASE from being
rebuilt in bonding/team. Eventually, we could add something
smarter.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/8021q/vlan_dev.c | 3 | ||||
-rw-r--r-- | net/atm/clip.c | 2 | ||||
-rw-r--r-- | net/core/dev.c | 19 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_vti.c | 2 | ||||
-rw-r--r-- | net/ipv4/ipip.c | 2 | ||||
-rw-r--r-- | net/ipv6/ip6_gre.c | 2 | ||||
-rw-r--r-- | net/ipv6/ip6_tunnel.c | 2 | ||||
-rw-r--r-- | net/ipv6/ip6_vti.c | 2 | ||||
-rw-r--r-- | net/ipv6/sit.c | 2 | ||||
-rw-r--r-- | net/sched/cls_flow.c | 2 | ||||
-rw-r--r-- | net/sched/cls_route.c | 1 | ||||
-rw-r--r-- | net/sched/sch_generic.c | 3 | ||||
-rw-r--r-- | net/sched/sch_teql.c | 2 |
14 files changed, 23 insertions, 23 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 35a6b6b15e8a..0d441ec8763e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c | |||
@@ -799,7 +799,8 @@ void vlan_setup(struct net_device *dev) | |||
799 | ether_setup(dev); | 799 | ether_setup(dev); |
800 | 800 | ||
801 | dev->priv_flags |= IFF_802_1Q_VLAN; | 801 | dev->priv_flags |= IFF_802_1Q_VLAN; |
802 | dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); | 802 | dev->priv_flags &= ~IFF_TX_SKB_SHARING; |
803 | netif_keep_dst(dev); | ||
803 | dev->tx_queue_len = 0; | 804 | dev->tx_queue_len = 0; |
804 | 805 | ||
805 | dev->netdev_ops = &vlan_netdev_ops; | 806 | dev->netdev_ops = &vlan_netdev_ops; |
diff --git a/net/atm/clip.c b/net/atm/clip.c index 1d9eaa4f041a..17e55dfecbe2 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c | |||
@@ -501,7 +501,7 @@ static void clip_setup(struct net_device *dev) | |||
501 | /* without any more elaborate queuing. 100 is a reasonable */ | 501 | /* without any more elaborate queuing. 100 is a reasonable */ |
502 | /* compromise between decent burst-tolerance and protection */ | 502 | /* compromise between decent burst-tolerance and protection */ |
503 | /* against memory hogs. */ | 503 | /* against memory hogs. */ |
504 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 504 | netif_keep_dst(dev); |
505 | } | 505 | } |
506 | 506 | ||
507 | static int clip_create(int number) | 507 | static int clip_create(int number) |
diff --git a/net/core/dev.c b/net/core/dev.c index a63b8c43c1b6..3c5bdaa44486 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -2665,12 +2665,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device | |||
2665 | if (skb->next) | 2665 | if (skb->next) |
2666 | return skb; | 2666 | return skb; |
2667 | 2667 | ||
2668 | /* If device doesn't need skb->dst, release it right now while | ||
2669 | * its hot in this cpu cache | ||
2670 | */ | ||
2671 | if (dev->priv_flags & IFF_XMIT_DST_RELEASE) | ||
2672 | skb_dst_drop(skb); | ||
2673 | |||
2674 | features = netif_skb_features(skb); | 2668 | features = netif_skb_features(skb); |
2675 | skb = validate_xmit_vlan(skb, features); | 2669 | skb = validate_xmit_vlan(skb, features); |
2676 | if (unlikely(!skb)) | 2670 | if (unlikely(!skb)) |
@@ -2811,8 +2805,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, | |||
2811 | * waiting to be sent out; and the qdisc is not running - | 2805 | * waiting to be sent out; and the qdisc is not running - |
2812 | * xmit the skb directly. | 2806 | * xmit the skb directly. |
2813 | */ | 2807 | */ |
2814 | if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) | ||
2815 | skb_dst_force(skb); | ||
2816 | 2808 | ||
2817 | qdisc_bstats_update(q, skb); | 2809 | qdisc_bstats_update(q, skb); |
2818 | 2810 | ||
@@ -2827,7 +2819,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, | |||
2827 | 2819 | ||
2828 | rc = NET_XMIT_SUCCESS; | 2820 | rc = NET_XMIT_SUCCESS; |
2829 | } else { | 2821 | } else { |
2830 | skb_dst_force(skb); | ||
2831 | rc = q->enqueue(skb, q) & NET_XMIT_MASK; | 2822 | rc = q->enqueue(skb, q) & NET_XMIT_MASK; |
2832 | if (qdisc_run_begin(q)) { | 2823 | if (qdisc_run_begin(q)) { |
2833 | if (unlikely(contended)) { | 2824 | if (unlikely(contended)) { |
@@ -2924,6 +2915,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) | |||
2924 | 2915 | ||
2925 | skb_update_prio(skb); | 2916 | skb_update_prio(skb); |
2926 | 2917 | ||
2918 | /* If device/qdisc don't need skb->dst, release it right now while | ||
2919 | * its hot in this cpu cache. | ||
2920 | */ | ||
2921 | if (dev->priv_flags & IFF_XMIT_DST_RELEASE) | ||
2922 | skb_dst_drop(skb); | ||
2923 | else | ||
2924 | skb_dst_force(skb); | ||
2925 | |||
2927 | txq = netdev_pick_tx(dev, skb, accel_priv); | 2926 | txq = netdev_pick_tx(dev, skb, accel_priv); |
2928 | q = rcu_dereference_bh(txq->qdisc); | 2927 | q = rcu_dereference_bh(txq->qdisc); |
2929 | 2928 | ||
@@ -6674,7 +6673,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, | |||
6674 | INIT_LIST_HEAD(&dev->adj_list.lower); | 6673 | INIT_LIST_HEAD(&dev->adj_list.lower); |
6675 | INIT_LIST_HEAD(&dev->all_adj_list.upper); | 6674 | INIT_LIST_HEAD(&dev->all_adj_list.upper); |
6676 | INIT_LIST_HEAD(&dev->all_adj_list.lower); | 6675 | INIT_LIST_HEAD(&dev->all_adj_list.lower); |
6677 | dev->priv_flags = IFF_XMIT_DST_RELEASE; | 6676 | dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; |
6678 | setup(dev); | 6677 | setup(dev); |
6679 | 6678 | ||
6680 | dev->num_tx_queues = txqs; | 6679 | dev->num_tx_queues = txqs; |
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0485ef18d254..12055fdbe716 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c | |||
@@ -510,7 +510,7 @@ static int ipgre_tunnel_init(struct net_device *dev) | |||
510 | memcpy(dev->broadcast, &iph->daddr, 4); | 510 | memcpy(dev->broadcast, &iph->daddr, 4); |
511 | 511 | ||
512 | dev->flags = IFF_NOARP; | 512 | dev->flags = IFF_NOARP; |
513 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 513 | netif_keep_dst(dev); |
514 | dev->addr_len = 4; | 514 | dev->addr_len = 4; |
515 | 515 | ||
516 | if (iph->daddr) { | 516 | if (iph->daddr) { |
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index e453cb724a95..3e861011e4a3 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c | |||
@@ -364,7 +364,7 @@ static int vti_tunnel_init(struct net_device *dev) | |||
364 | dev->iflink = 0; | 364 | dev->iflink = 0; |
365 | dev->addr_len = 4; | 365 | dev->addr_len = 4; |
366 | dev->features |= NETIF_F_LLTX; | 366 | dev->features |= NETIF_F_LLTX; |
367 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 367 | netif_keep_dst(dev); |
368 | 368 | ||
369 | return ip_tunnel_init(dev); | 369 | return ip_tunnel_init(dev); |
370 | } | 370 | } |
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ea88ab3102a8..37096d64730e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c | |||
@@ -289,7 +289,7 @@ static void ipip_tunnel_setup(struct net_device *dev) | |||
289 | dev->iflink = 0; | 289 | dev->iflink = 0; |
290 | dev->addr_len = 4; | 290 | dev->addr_len = 4; |
291 | dev->features |= NETIF_F_LLTX; | 291 | dev->features |= NETIF_F_LLTX; |
292 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 292 | netif_keep_dst(dev); |
293 | 293 | ||
294 | dev->features |= IPIP_FEATURES; | 294 | dev->features |= IPIP_FEATURES; |
295 | dev->hw_features |= IPIP_FEATURES; | 295 | dev->hw_features |= IPIP_FEATURES; |
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 74b677916a70..de3b1c86b8d3 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c | |||
@@ -1242,7 +1242,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) | |||
1242 | dev->flags |= IFF_NOARP; | 1242 | dev->flags |= IFF_NOARP; |
1243 | dev->iflink = 0; | 1243 | dev->iflink = 0; |
1244 | dev->addr_len = sizeof(struct in6_addr); | 1244 | dev->addr_len = sizeof(struct in6_addr); |
1245 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 1245 | netif_keep_dst(dev); |
1246 | } | 1246 | } |
1247 | 1247 | ||
1248 | static int ip6gre_tunnel_init(struct net_device *dev) | 1248 | static int ip6gre_tunnel_init(struct net_device *dev) |
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index d3e8888ad611..9409887fb664 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c | |||
@@ -1493,7 +1493,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev) | |||
1493 | dev->mtu -= 8; | 1493 | dev->mtu -= 8; |
1494 | dev->flags |= IFF_NOARP; | 1494 | dev->flags |= IFF_NOARP; |
1495 | dev->addr_len = sizeof(struct in6_addr); | 1495 | dev->addr_len = sizeof(struct in6_addr); |
1496 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 1496 | netif_keep_dst(dev); |
1497 | /* This perm addr will be used as interface identifier by IPv6 */ | 1497 | /* This perm addr will be used as interface identifier by IPv6 */ |
1498 | dev->addr_assign_type = NET_ADDR_RANDOM; | 1498 | dev->addr_assign_type = NET_ADDR_RANDOM; |
1499 | eth_random_addr(dev->perm_addr); | 1499 | eth_random_addr(dev->perm_addr); |
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 5833a2244467..d440bb585524 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c | |||
@@ -807,7 +807,7 @@ static void vti6_dev_setup(struct net_device *dev) | |||
807 | dev->mtu = ETH_DATA_LEN; | 807 | dev->mtu = ETH_DATA_LEN; |
808 | dev->flags |= IFF_NOARP; | 808 | dev->flags |= IFF_NOARP; |
809 | dev->addr_len = sizeof(struct in6_addr); | 809 | dev->addr_len = sizeof(struct in6_addr); |
810 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 810 | netif_keep_dst(dev); |
811 | } | 811 | } |
812 | 812 | ||
813 | /** | 813 | /** |
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 0d4e27466f82..6eab37cf5345 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c | |||
@@ -1364,7 +1364,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) | |||
1364 | dev->hard_header_len = LL_MAX_HEADER + t_hlen; | 1364 | dev->hard_header_len = LL_MAX_HEADER + t_hlen; |
1365 | dev->mtu = ETH_DATA_LEN - t_hlen; | 1365 | dev->mtu = ETH_DATA_LEN - t_hlen; |
1366 | dev->flags = IFF_NOARP; | 1366 | dev->flags = IFF_NOARP; |
1367 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 1367 | netif_keep_dst(dev); |
1368 | dev->iflink = 0; | 1368 | dev->iflink = 0; |
1369 | dev->addr_len = 4; | 1369 | dev->addr_len = 4; |
1370 | dev->features |= NETIF_F_LLTX; | 1370 | dev->features |= NETIF_F_LLTX; |
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index a5d2b20db560..4ac515f2a6ce 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c | |||
@@ -493,6 +493,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, | |||
493 | tcf_exts_change(tp, &fnew->exts, &e); | 493 | tcf_exts_change(tp, &fnew->exts, &e); |
494 | tcf_em_tree_change(tp, &fnew->ematches, &t); | 494 | tcf_em_tree_change(tp, &fnew->ematches, &t); |
495 | 495 | ||
496 | netif_keep_dst(qdisc_dev(tp->q)); | ||
497 | |||
496 | if (tb[TCA_FLOW_KEYS]) { | 498 | if (tb[TCA_FLOW_KEYS]) { |
497 | fnew->keymask = keymask; | 499 | fnew->keymask = keymask; |
498 | fnew->nkeys = nkeys; | 500 | fnew->nkeys = nkeys; |
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 6f22baae0afa..109a329b7198 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c | |||
@@ -524,6 +524,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, | |||
524 | if (f->handle < f1->handle) | 524 | if (f->handle < f1->handle) |
525 | break; | 525 | break; |
526 | 526 | ||
527 | netif_keep_dst(qdisc_dev(tp->q)); | ||
527 | rcu_assign_pointer(f->next, f1); | 528 | rcu_assign_pointer(f->next, f1); |
528 | rcu_assign_pointer(*fp, f); | 529 | rcu_assign_pointer(*fp, f); |
529 | 530 | ||
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2b349a4de3c8..38d58e6cef07 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c | |||
@@ -47,7 +47,6 @@ EXPORT_SYMBOL(default_qdisc_ops); | |||
47 | 47 | ||
48 | static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) | 48 | static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) |
49 | { | 49 | { |
50 | skb_dst_force(skb); | ||
51 | q->gso_skb = skb; | 50 | q->gso_skb = skb; |
52 | q->qstats.requeues++; | 51 | q->qstats.requeues++; |
53 | q->q.qlen++; /* it's still part of the queue */ | 52 | q->q.qlen++; /* it's still part of the queue */ |
@@ -218,8 +217,6 @@ static inline int qdisc_restart(struct Qdisc *q) | |||
218 | if (unlikely(!skb)) | 217 | if (unlikely(!skb)) |
219 | return 0; | 218 | return 0; |
220 | 219 | ||
221 | WARN_ON_ONCE(skb_dst_is_noref(skb)); | ||
222 | |||
223 | root_lock = qdisc_lock(q); | 220 | root_lock = qdisc_lock(q); |
224 | dev = qdisc_dev(q); | 221 | dev = qdisc_dev(q); |
225 | txq = skb_get_tx_queue(dev, skb); | 222 | txq = skb_get_tx_queue(dev, skb); |
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 5cd291bd00e4..6ada42396a24 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c | |||
@@ -470,7 +470,7 @@ static __init void teql_master_setup(struct net_device *dev) | |||
470 | dev->tx_queue_len = 100; | 470 | dev->tx_queue_len = 100; |
471 | dev->flags = IFF_NOARP; | 471 | dev->flags = IFF_NOARP; |
472 | dev->hard_header_len = LL_MAX_HEADER; | 472 | dev->hard_header_len = LL_MAX_HEADER; |
473 | dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; | 473 | netif_keep_dst(dev); |
474 | } | 474 | } |
475 | 475 | ||
476 | static LIST_HEAD(master_dev_list); | 476 | static LIST_HEAD(master_dev_list); |