aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Eric Dumazet <edumazet@google.com> 2012-10-06 04:08:49 -0400
committer: David S. Miller <davem@davemloft.net> 2012-10-08 14:51:51 -0400
commit: 2e71a6f8084e7ac87166dd77d99c44190fb844fc (patch)
tree: eb2e2d47361b35b2b5a3f26beac3d1fbd888c372
parent: a2af139ff1cd85df586690ff626619ab1ee88b0a (diff)
net: gro: selective flush of packets
Current GRO can hold packets in gro_list for an almost unlimited time if the napi->poll() handler consumes its budget over and over. In this case, napi_complete()/napi_gro_flush() are not called.

Another problem is that gro_list is flushed in an unfriendly way: we scan the list and complete packets in the reverse order (youngest packets first, oldest packets last). This defeats priorities that the sender could have cooked.

Since GRO currently only stores TCP packets, we don't really notice the bug because of retransmits, but this behavior can add unexpected latencies, particularly on mice flows clamped by elephant flows.

This patch makes sure no packet can stay more than 1 ms in the queue, and only in stress situations. It also completes packets in the right order to minimize latencies.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Jesse Gross <jesse@nicira.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--  drivers/net/ethernet/marvell/skge.c     2
-rw-r--r--  drivers/net/ethernet/realtek/8139cp.c   2
-rw-r--r--  include/linux/netdevice.h              15
-rw-r--r--  net/core/dev.c                         38
4 files changed, 42 insertions, 15 deletions
diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c
index 3f7dab46626b..9b9c2ac5c4c2 100644
--- a/drivers/net/ethernet/marvell/skge.c
+++ b/drivers/net/ethernet/marvell/skge.c
@@ -3189,7 +3189,7 @@ static int skge_poll(struct napi_struct *napi, int to_do)
3189 if (work_done < to_do) { 3189 if (work_done < to_do) {
3190 unsigned long flags; 3190 unsigned long flags;
3191 3191
3192 napi_gro_flush(napi); 3192 napi_gro_flush(napi, false);
3193 spin_lock_irqsave(&hw->hw_lock, flags); 3193 spin_lock_irqsave(&hw->hw_lock, flags);
3194 __napi_complete(napi); 3194 __napi_complete(napi);
3195 hw->intr_mask |= napimask[skge->port]; 3195 hw->intr_mask |= napimask[skge->port];
diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c
index 995d0cfc4c06..1c818254b7be 100644
--- a/drivers/net/ethernet/realtek/8139cp.c
+++ b/drivers/net/ethernet/realtek/8139cp.c
@@ -563,7 +563,7 @@ rx_next:
563 if (cpr16(IntrStatus) & cp_rx_intr_mask) 563 if (cpr16(IntrStatus) & cp_rx_intr_mask)
564 goto rx_status_loop; 564 goto rx_status_loop;
565 565
566 napi_gro_flush(napi); 566 napi_gro_flush(napi, false);
567 spin_lock_irqsave(&cp->lock, flags); 567 spin_lock_irqsave(&cp->lock, flags);
568 __napi_complete(napi); 568 __napi_complete(napi);
569 cpw16_f(IntrMask, cp_intr_mask); 569 cpw16_f(IntrMask, cp_intr_mask);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a659fd0ba965..0a36fff75bd5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1497,19 +1497,22 @@ struct napi_gro_cb {
1497 /* This indicates where we are processing relative to skb->data. */ 1497 /* This indicates where we are processing relative to skb->data. */
1498 int data_offset; 1498 int data_offset;
1499 1499
1500 /* This is non-zero if the packet may be of the same flow. */
1501 int same_flow;
1502
1503 /* This is non-zero if the packet cannot be merged with the new skb. */ 1500 /* This is non-zero if the packet cannot be merged with the new skb. */
1504 int flush; 1501 int flush;
1505 1502
1506 /* Number of segments aggregated. */ 1503 /* Number of segments aggregated. */
1507 int count; 1504 u16 count;
1505
1506 /* This is non-zero if the packet may be of the same flow. */
1507 u8 same_flow;
1508 1508
1509 /* Free the skb? */ 1509 /* Free the skb? */
1510 int free; 1510 u8 free;
1511#define NAPI_GRO_FREE 1 1511#define NAPI_GRO_FREE 1
1512#define NAPI_GRO_FREE_STOLEN_HEAD 2 1512#define NAPI_GRO_FREE_STOLEN_HEAD 2
1513
1514 /* jiffies when first packet was created/queued */
1515 unsigned long age;
1513}; 1516};
1514 1517
1515#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) 1518#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
@@ -2156,7 +2159,7 @@ extern gro_result_t dev_gro_receive(struct napi_struct *napi,
2156extern gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb); 2159extern gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb);
2157extern gro_result_t napi_gro_receive(struct napi_struct *napi, 2160extern gro_result_t napi_gro_receive(struct napi_struct *napi,
2158 struct sk_buff *skb); 2161 struct sk_buff *skb);
2159extern void napi_gro_flush(struct napi_struct *napi); 2162extern void napi_gro_flush(struct napi_struct *napi, bool flush_old);
2160extern struct sk_buff * napi_get_frags(struct napi_struct *napi); 2163extern struct sk_buff * napi_get_frags(struct napi_struct *napi);
2161extern gro_result_t napi_frags_finish(struct napi_struct *napi, 2164extern gro_result_t napi_frags_finish(struct napi_struct *napi,
2162 struct sk_buff *skb, 2165 struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index de2bad717d56..d44668f63c88 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3471,17 +3471,31 @@ out:
3471 return netif_receive_skb(skb); 3471 return netif_receive_skb(skb);
3472} 3472}
3473 3473
3474inline void napi_gro_flush(struct napi_struct *napi) 3474/* napi->gro_list contains packets ordered by age.
3475 * youngest packets at the head of it.
3476 * Complete skbs in reverse order to reduce latencies.
3477 */
3478void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3475{ 3479{
3476 struct sk_buff *skb, *next; 3480 struct sk_buff *skb, *prev = NULL;
3477 3481
3478 for (skb = napi->gro_list; skb; skb = next) { 3482 /* scan list and build reverse chain */
3479 next = skb->next; 3483 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3484 skb->prev = prev;
3485 prev = skb;
3486 }
3487
3488 for (skb = prev; skb; skb = prev) {
3480 skb->next = NULL; 3489 skb->next = NULL;
3490
3491 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3492 return;
3493
3494 prev = skb->prev;
3481 napi_gro_complete(skb); 3495 napi_gro_complete(skb);
3496 napi->gro_count--;
3482 } 3497 }
3483 3498
3484 napi->gro_count = 0;
3485 napi->gro_list = NULL; 3499 napi->gro_list = NULL;
3486} 3500}
3487EXPORT_SYMBOL(napi_gro_flush); 3501EXPORT_SYMBOL(napi_gro_flush);
@@ -3542,6 +3556,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3542 3556
3543 napi->gro_count++; 3557 napi->gro_count++;
3544 NAPI_GRO_CB(skb)->count = 1; 3558 NAPI_GRO_CB(skb)->count = 1;
3559 NAPI_GRO_CB(skb)->age = jiffies;
3545 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 3560 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3546 skb->next = napi->gro_list; 3561 skb->next = napi->gro_list;
3547 napi->gro_list = skb; 3562 napi->gro_list = skb;
@@ -3878,7 +3893,7 @@ void napi_complete(struct napi_struct *n)
3878 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 3893 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3879 return; 3894 return;
3880 3895
3881 napi_gro_flush(n); 3896 napi_gro_flush(n, false);
3882 local_irq_save(flags); 3897 local_irq_save(flags);
3883 __napi_complete(n); 3898 __napi_complete(n);
3884 local_irq_restore(flags); 3899 local_irq_restore(flags);
@@ -3983,8 +3998,17 @@ static void net_rx_action(struct softirq_action *h)
3983 local_irq_enable(); 3998 local_irq_enable();
3984 napi_complete(n); 3999 napi_complete(n);
3985 local_irq_disable(); 4000 local_irq_disable();
3986 } else 4001 } else {
4002 if (n->gro_list) {
4003 /* flush too old packets
4004 * If HZ < 1000, flush all packets.
4005 */
4006 local_irq_enable();
4007 napi_gro_flush(n, HZ >= 1000);
4008 local_irq_disable();
4009 }
3987 list_move_tail(&n->poll_list, &sd->poll_list); 4010 list_move_tail(&n->poll_list, &sd->poll_list);
4011 }
3988 } 4012 }
3989 4013
3990 netpoll_poll_unlock(have); 4014 netpoll_poll_unlock(have);