author	Eric Dumazet <edumazet@google.com>	2015-02-04 02:48:24 -0500
committer	David S. Miller <davem@davemloft.net>	2015-02-04 16:02:54 -0500
commit	2bd82484bb4c5db1d5dc983ac7c409b2782e0154
tree	747ea7398b64904505419a0c6843c16c1452b5c9 /net/core
parent	7e8acbb69ee2b855288afc61e3794a30e3582977
xps: fix xps for stacked devices
A typical qdisc setup is the following:

  bond0     : bonding device, using HTB hierarchy
  eth1/eth2 : slaves, multiqueue NIC, using MQ + FQ qdisc

XPS spreads packets over specific tx queues based on the cpu doing the
send. The problem is that dequeues from the bond0 qdisc can happen on
random cpus, because qdisc_run() can dequeue a batch of packets:

  CPUA -> queue packet P1 on bond0 qdisc, P1->ooo_okay=1
  CPUA -> queue packet P2 on bond0 qdisc, P2->ooo_okay=0

  CPUB -> dequeue packet P1 from bond0
          enqueue packet on eth1/eth2
  CPUC -> dequeue packet P2 from bond0
          enqueue packet on eth1/eth2 using sk cache (ooo_okay is 0)

get_xps_queue() might then select the wrong queue for P1, since the
current cpu might be different from CPUA. P2 might be sent on the old
queue (stored in sk->sk_tx_queue_mapping) if CPUC runs a bit faster
(or CPUB spins a bit on the qdisc lock).

The effect of this bug is TCP reordering and, more generally,
suboptimal TX queue placement (a victim bulk flow can be migrated to
the wrong TX queue for a while).

To fix this, record the sending cpu number the first time
dev_queue_xmit() is called for a tx skb. We can union napi_id (used on
the receive path) and sender_cpu, provided we clear sender_cpu in
skb_scrub_packet() (credit to Willem for this union idea).

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
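The union mentioned above lives in include/linux/skbuff.h and is therefore outside this net/core diff. A minimal sketch of the layout the message describes (field placement and config guards are assumptions, not taken from this patch):

/* Assumed excerpt of struct sk_buff: napi_id is only meaningful on the
 * receive path and sender_cpu only on transmit, so they can share storage.
 */
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
	union {
		unsigned int	napi_id;	/* rx: id of the NAPI context */
		unsigned int	sender_cpu;	/* tx: sending cpu + 1; 0 = unset */
	};
#endif

Sharing the word is safe precisely because skb_scrub_packet() (second hunk in net/core/skbuff.c below) clears it when an skb is reinjected into the stack.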
Diffstat (limited to 'net/core')
-rw-r--r--	net/core/flow_dissector.c	7
-rw-r--r--	net/core/skbuff.c	4
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index beb83d1ac1c6..2c35c02a931e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -422,7 +422,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 	dev_maps = rcu_dereference(dev->xps_maps);
 	if (dev_maps) {
 		map = rcu_dereference(
-			dev_maps->cpu_map[raw_smp_processor_id()]);
+			dev_maps->cpu_map[skb->sender_cpu - 1]);
 		if (map) {
 			if (map->len == 1)
 				queue_index = map->queues[0];
@@ -468,6 +468,11 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 {
 	int queue_index = 0;
 
+#ifdef CONFIG_XPS
+	if (skb->sender_cpu == 0)
+		skb->sender_cpu = raw_smp_processor_id() + 1;
+#endif
+
 	if (dev->real_num_tx_queues != 1) {
 		const struct net_device_ops *ops = dev->netdev_ops;
 		if (ops->ndo_select_queue)
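Together, the two hunks above implement a one-based encoding: netdev_pick_tx() records raw_smp_processor_id() + 1 on the first dev_queue_xmit() of an skb, so that zero can mean "not recorded yet", and get_xps_queue() undoes the offset with skb->sender_cpu - 1. A standalone sketch of that convention (the helper names are hypothetical, not part of the patch):

/* Hypothetical helpers restating the sentinel encoding used above:
 * 0 means "no sender cpu recorded"; cpu N is stored as N + 1.
 */
static inline void skb_record_sender_cpu(struct sk_buff *skb)
{
	if (skb->sender_cpu == 0)	/* only the first dev_queue_xmit() */
		skb->sender_cpu = raw_smp_processor_id() + 1;
}

static inline unsigned int skb_sender_cpu(const struct sk_buff *skb)
{
	return skb->sender_cpu - 1;	/* callers check it is non-zero */
}

The effect is that the XPS lookup keys on the cpu that originally queued the skb on bond0, not on whichever cpu happens to run qdisc_run() for the slave device.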
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a5bff2767f15..88c613eab142 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -825,6 +825,9 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	CHECK_SKB_FIELD(napi_id);
 #endif
+#ifdef CONFIG_XPS
+	CHECK_SKB_FIELD(sender_cpu);
+#endif
 #ifdef CONFIG_NET_SCHED
 	CHECK_SKB_FIELD(tc_index);
 #ifdef CONFIG_NET_CLS_ACT
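For context, CHECK_SKB_FIELD() does not copy anything itself: __copy_skb_header() bulk-copies the headers_start..headers_end region of struct sk_buff, and the macro is a build-time assertion that the named field lies inside that region, so clones and copies automatically keep the recorded sender_cpu. A sketch of the macro (the exact body is an assumption recalled from net/core/skbuff.c of this era):

/* Assumed shape of CHECK_SKB_FIELD(): compile-time proof that the field
 * sits inside the block memcpy()ed by __copy_skb_header().
 */
#define CHECK_SKB_FIELD(field)					\
	BUILD_BUG_ON(offsetof(struct sk_buff, field) <		\
		     offsetof(struct sk_buff, headers_start));	\
	BUILD_BUG_ON(offsetof(struct sk_buff, field) >		\
		     offsetof(struct sk_buff, headers_end));	\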
@@ -4169,6 +4172,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 	skb->ignore_df = 0;
 	skb_dst_drop(skb);
 	skb->mark = 0;
+	skb->sender_cpu = 0;
 	skb_init_secmark(skb);
 	secpath_reset(skb);
 	nf_reset(skb);
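Clearing sender_cpu here is what makes the union safe: an skb that arrived on the receive path may carry a napi_id in the shared word, and without this reset a forwarded packet would have that value misread as a stale sender cpu. A minimal, hypothetical illustration of the forwarding case (forward_skb_example() is invented for this sketch):

/* Hypothetical illustration: an skb crossing from rx to tx must have the
 * shared napi_id/sender_cpu word cleared before netdev_pick_tx() sees it.
 */
static void forward_skb_example(struct sk_buff *skb, struct net_device *out)
{
	skb_scrub_packet(skb, true);	/* zeroes sender_cpu (aliases napi_id) */
	skb->dev = out;
	dev_queue_xmit(skb);		/* netdev_pick_tx() records a fresh cpu */
}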