author      Eric Dumazet <edumazet@google.com>        2012-04-23 03:11:42 -0400
committer   David S. Miller <davem@davemloft.net>     2012-04-23 22:42:49 -0400
commit      1402d366019fedaa2b024f2bac06b7cc9a8782e1 (patch)
tree        f74c00a2308ad13d6970cdc7e5cc29d2f0f809fa /net
parent      e29ecd51de1683e6aeb88d76251f194b0311f749 (diff)
tcp: introduce tcp_try_coalesce
commit c8628155ece3 (tcp: reduce out_of_order memory use) took care of
coalescing tcp segments provided by legacy devices (linear skbs).

We extend this idea to fragged skbs, as their truesize can be heavy;
ixgbe for example uses 256+1024+PAGE_SIZE/2 = 3328 bytes per segment.

Use this coalescing strategy for the receive queue too.

This helps reduce the number of tcp collapses, at minimal cost, and
reduces memory overhead and packet drops.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
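For context, a back-of-the-envelope userspace sketch of the per-segment truesize
arithmetic quoted above. The 256 and 1024 byte components and the 4 KB page size
are illustrative assumptions (the real cost depends on the driver and on
SKB_DATA_ALIGN rounding), not values taken from the ixgbe source:

/* Rough check of the "256+1024+PAGE_SIZE/2 = 3328" figure above.
 * All constants here are assumptions for illustration only.
 */
#include <stdio.h>

#define PAGE_SIZE 4096U                 /* assuming 4 KB pages */

int main(void)
{
        unsigned int skb_meta   = 256;           /* approx. struct sk_buff overhead */
        unsigned int head_alloc = 1024;          /* linear skb->head allocation */
        unsigned int frag       = PAGE_SIZE / 2; /* half-page rx fragment */

        /* truesize charged to the socket per received segment */
        printf("~%u bytes per segment\n", skb_meta + head_alloc + frag);
        return 0;
}

Coalescing recovers roughly the first two components: after tcp_try_coalesce()
(introduced below) merges @from into @to, only delta, essentially the fragment
pages of @from, stays charged against the socket, and @from itself can be freed.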
Diffstat (limited to 'net')
-rw-r--r--   net/ipv4/tcp_input.c   79
1 file changed, 62 insertions(+), 17 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 37e1c5cd2c01..bd7aef59c385 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4449,6 +4449,58 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
 	return 0;
 }
 
+/**
+ * tcp_try_coalesce - try to merge skb to prior one
+ * @sk: socket
+ * @to: prior buffer
+ * @from: buffer to add in queue
+ *
+ * Before queueing skb @from after @to, try to merge them
+ * to reduce overall memory use and queue lengths, if cost is small.
+ * Packets in ofo or receive queues can stay a long time.
+ * Better try to coalesce them right now to avoid future collapses.
+ * Returns > 0 value if caller should free @from instead of queueing it
+ */
+static int tcp_try_coalesce(struct sock *sk,
+			    struct sk_buff *to,
+			    struct sk_buff *from)
+{
+	int len = from->len;
+
+	if (tcp_hdr(from)->fin)
+		return 0;
+	if (len <= skb_tailroom(to)) {
+		BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
+merge:
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+		TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+		TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+		return 1;
+	}
+	if (skb_headlen(from) == 0 &&
+	    !skb_has_frag_list(to) &&
+	    !skb_has_frag_list(from) &&
+	    (skb_shinfo(to)->nr_frags +
+	     skb_shinfo(from)->nr_frags <= MAX_SKB_FRAGS)) {
+		int delta = from->truesize - ksize(from->head) -
+			    SKB_DATA_ALIGN(sizeof(struct sk_buff));
+
+		WARN_ON_ONCE(delta < len);
+		memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
+		       skb_shinfo(from)->frags,
+		       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
+		skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
+		skb_shinfo(from)->nr_frags = 0;
+		to->truesize += delta;
+		atomic_add(delta, &sk->sk_rmem_alloc);
+		sk_mem_charge(sk, delta);
+		to->len += len;
+		to->data_len += len;
+		goto merge;
+	}
+	return 0;
+}
+
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -4487,23 +4539,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	end_seq = TCP_SKB_CB(skb)->end_seq;
 
 	if (seq == TCP_SKB_CB(skb1)->end_seq) {
-		/* Packets in ofo can stay in queue a long time.
-		 * Better try to coalesce them right now
-		 * to avoid future tcp_collapse_ofo_queue(),
-		 * probably the most expensive function in tcp stack.
-		 */
-		if (skb->len <= skb_tailroom(skb1) && !tcp_hdr(skb)->fin) {
-			NET_INC_STATS_BH(sock_net(sk),
-					 LINUX_MIB_TCPRCVCOALESCE);
-			BUG_ON(skb_copy_bits(skb, 0,
-					     skb_put(skb1, skb->len),
-					     skb->len));
-			TCP_SKB_CB(skb1)->end_seq = end_seq;
-			TCP_SKB_CB(skb1)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+		if (tcp_try_coalesce(sk, skb1, skb) <= 0) {
+			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+		} else {
 			__kfree_skb(skb);
 			skb = NULL;
-		} else {
-			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 		}
 
 		if (!tp->rx_opt.num_sacks ||
@@ -4624,13 +4664,18 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	}
 
 	if (eaten <= 0) {
+		struct sk_buff *tail;
 queue_and_out:
 		if (eaten < 0 &&
 		    tcp_try_rmem_schedule(sk, skb->truesize))
 			goto drop;
 
-		skb_set_owner_r(skb, sk);
-		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		tail = skb_peek_tail(&sk->sk_receive_queue);
+		eaten = tail ? tcp_try_coalesce(sk, tail, skb) : -1;
+		if (eaten <= 0) {
+			skb_set_owner_r(skb, sk);
+			__skb_queue_tail(&sk->sk_receive_queue, skb);
+		}
 	}
 	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 	if (skb->len)