path: root/net/ipv4/tcp_input.c
author     Yaogong Wang <wygivan@google.com>      2016-09-07 17:49:28 -0400
committer  David S. Miller <davem@davemloft.net>  2016-09-08 20:25:58 -0400
commit     9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a (patch)
tree       f434343314c30020025c7a84507107c4aad60fd4 /net/ipv4/tcp_input.c
parent     3b61075be0929569e4de8b905ae6628d3285442f (diff)
tcp: use an RB tree for ooo receive queue
Over the years, TCP BDP has increased by several orders of magnitude, and some people are considering reaching the 2 GB limit. Even with the current window scale limit of 14, ~1 GB maps to ~740,000 MSS.

In the presence of packet losses (or reorders), TCP stores incoming packets into an out-of-order queue, and the number of skbs sitting there waiting for the missing packets to be received can be in the 10^5 range.

Most packets are appended to the tail of this queue, and when packets can finally be transferred to the receive queue, we scan the queue from its head. However, in the presence of heavy losses, we might have to find an arbitrary point in this queue, involving a linear scan for every incoming packet and throwing away cpu caches.

This patch converts the out-of-order queue to an RB tree, to get bounded latencies.

Yaogong wrote a preliminary patch about 2 years ago. Eric did the rebase, added the ooo_last_skb cache, polishing and tests.

Tested with a network dropping between 1 and 10% of packets, with good success (about a 30% throughput increase in stress tests).

Next step would be to also use an RB tree for the write queue at the sender side ;)

Signed-off-by: Yaogong Wang <wygivan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Acked-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
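To make the data-structure change concrete, here is a minimal, self-contained userspace sketch of the idea only; it is not the kernel code. Segments are kept in a search tree keyed by their 32-bit sequence number, compared with the same wrap-safe arithmetic TCP uses, so the insertion point for an out-of-order segment is found in O(log N) steps instead of a linear walk. All names here (ooo_node, seq_before, find_link) are invented for the sketch, and a plain unbalanced BST stands in for the kernel's red-black tree purely for brevity; the real patch embeds an rb_node in struct sk_buff, keys the tree on TCP_SKB_CB(skb)->seq, and relies on rb_link_node()/rb_insert_color() to keep the tree balanced (see tcp_rbtree_insert() and tcp_data_queue_ofo() in the diff below).

/* Illustrative userspace sketch only -- not kernel code. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Wrap-safe sequence comparison, same idea as the kernel's before()/after(). */
static int seq_before(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) < 0;
}

struct ooo_node {                       /* stands in for an out-of-order skb */
        uint32_t seq, end_seq;
        struct ooo_node *left, *right;  /* plain BST here; the kernel uses a
                                         * balanced red-black tree (rb_node) */
};

/* Descend the tree and return the link where a segment with this seq goes.
 * This is the O(log N) lookup that replaces the old linear list scan.
 */
static struct ooo_node **find_link(struct ooo_node **p, uint32_t seq)
{
        while (*p)
                p = seq_before(seq, (*p)->seq) ? &(*p)->left : &(*p)->right;
        return p;
}

int main(void)
{
        struct ooo_node *root = NULL;
        uint32_t segs[][2] = { {3000, 4000}, {1000, 2000}, {5000, 6000} };

        for (size_t i = 0; i < sizeof(segs) / sizeof(segs[0]); i++) {
                struct ooo_node *n = calloc(1, sizeof(*n));
                n->seq = segs[i][0];
                n->end_seq = segs[i][1];
                *find_link(&root, n->seq) = n;  /* link the new node in place */
        }

        /* The left-most node is the next segment eligible to move to the
         * receive queue, like rb_first() in the new tcp_ofo_queue().
         */
        struct ooo_node *n = root;
        while (n->left)
                n = n->left;
        printf("first pending segment: %u-%u\n",
               (unsigned)n->seq, (unsigned)n->end_seq);
        return 0;
}

On top of this, the patch caches the right-most node in tp->ooo_last_skb, so the common case of in-order arrival after a hole first tries a direct coalesce with that last skb and skips even the O(log N) descent; tcp_prune_ofo_queue() uses the same cache to drop segments from the tail backwards.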
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--   net/ipv4/tcp_input.c   330
1 file changed, 190 insertions(+), 140 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8cd02c0b056c..a5934c4c8cd4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4108,7 +4108,7 @@ void tcp_fin(struct sock *sk)
         /* It _is_ possible, that we have something out-of-order _after_ FIN.
          * Probably, we should reset in this case. For now drop them.
          */
-        __skb_queue_purge(&tp->out_of_order_queue);
+        skb_rbtree_purge(&tp->out_of_order_queue);
         if (tcp_is_sack(tp))
                 tcp_sack_reset(&tp->rx_opt);
         sk_mem_reclaim(sk);
@@ -4268,7 +4268,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
         int this_sack;

         /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
-        if (skb_queue_empty(&tp->out_of_order_queue)) {
+        if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                 tp->rx_opt.num_sacks = 0;
                 return;
         }
@@ -4344,10 +4344,13 @@ static void tcp_ofo_queue(struct sock *sk)
 {
         struct tcp_sock *tp = tcp_sk(sk);
         __u32 dsack_high = tp->rcv_nxt;
+        bool fin, fragstolen, eaten;
         struct sk_buff *skb, *tail;
-        bool fragstolen, eaten;
+        struct rb_node *p;

-        while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+        p = rb_first(&tp->out_of_order_queue);
+        while (p) {
+                skb = rb_entry(p, struct sk_buff, rbnode);
                 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
                         break;

@@ -4357,9 +4360,10 @@ static void tcp_ofo_queue(struct sock *sk)
                         dsack_high = TCP_SKB_CB(skb)->end_seq;
                         tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
                 }
+                p = rb_next(p);
+                rb_erase(&skb->rbnode, &tp->out_of_order_queue);

-                __skb_unlink(skb, &tp->out_of_order_queue);
-                if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+                if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
                         SOCK_DEBUG(sk, "ofo packet was already received\n");
                         tcp_drop(sk, skb);
                         continue;
@@ -4371,12 +4375,19 @@ static void tcp_ofo_queue(struct sock *sk)
                 tail = skb_peek_tail(&sk->sk_receive_queue);
                 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
                 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
+                fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
                 if (!eaten)
                         __skb_queue_tail(&sk->sk_receive_queue, skb);
-                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
-                        tcp_fin(sk);
-                if (eaten)
+                else
                         kfree_skb_partial(skb, fragstolen);
+
+                if (unlikely(fin)) {
+                        tcp_fin(sk);
+                        /* tcp_fin() purges tp->out_of_order_queue,
+                         * so we must end this loop right now.
+                         */
+                        break;
+                }
         }
 }

@@ -4403,8 +4414,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
         struct tcp_sock *tp = tcp_sk(sk);
+        struct rb_node **p, *q, *parent;
         struct sk_buff *skb1;
         u32 seq, end_seq;
+        bool fragstolen;

         tcp_ecn_check_ce(tp, skb);

@@ -4419,88 +4432,85 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
         inet_csk_schedule_ack(sk);

         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+        seq = TCP_SKB_CB(skb)->seq;
+        end_seq = TCP_SKB_CB(skb)->end_seq;
         SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
-                   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+                   tp->rcv_nxt, seq, end_seq);

-        skb1 = skb_peek_tail(&tp->out_of_order_queue);
-        if (!skb1) {
+        p = &tp->out_of_order_queue.rb_node;
+        if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                 /* Initial out of order segment, build 1 SACK. */
                 if (tcp_is_sack(tp)) {
                         tp->rx_opt.num_sacks = 1;
-                        tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
-                        tp->selective_acks[0].end_seq =
-                                                TCP_SKB_CB(skb)->end_seq;
+                        tp->selective_acks[0].start_seq = seq;
+                        tp->selective_acks[0].end_seq = end_seq;
                 }
-                __skb_queue_head(&tp->out_of_order_queue, skb);
+                rb_link_node(&skb->rbnode, NULL, p);
+                rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
+                tp->ooo_last_skb = skb;
                 goto end;
         }

-        seq = TCP_SKB_CB(skb)->seq;
-        end_seq = TCP_SKB_CB(skb)->end_seq;
-
-        if (seq == TCP_SKB_CB(skb1)->end_seq) {
-                bool fragstolen;
-
-                if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
-                        __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
-                } else {
-                        tcp_grow_window(sk, skb);
-                        kfree_skb_partial(skb, fragstolen);
-                        skb = NULL;
+        /* In the typical case, we are adding an skb to the end of the list.
+         * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+         */
+        if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
+coalesce_done:
+                tcp_grow_window(sk, skb);
+                kfree_skb_partial(skb, fragstolen);
+                skb = NULL;
+                goto add_sack;
+        }
+
+        /* Find place to insert this segment. Handle overlaps on the way. */
+        parent = NULL;
+        while (*p) {
+                parent = *p;
+                skb1 = rb_entry(parent, struct sk_buff, rbnode);
+                if (before(seq, TCP_SKB_CB(skb1)->seq)) {
+                        p = &parent->rb_left;
+                        continue;
                 }
-
-                if (!tp->rx_opt.num_sacks ||
-                    tp->selective_acks[0].end_seq != seq)
-                        goto add_sack;
-
-                /* Common case: data arrive in order after hole. */
-                tp->selective_acks[0].end_seq = end_seq;
-                goto end;
-        }
-
-        /* Find place to insert this segment. */
-        while (1) {
-                if (!after(TCP_SKB_CB(skb1)->seq, seq))
-                        break;
-                if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
-                        skb1 = NULL;
-                        break;
+                if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+                        if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+                                /* All the bits are present. Drop. */
+                                NET_INC_STATS(sock_net(sk),
+                                              LINUX_MIB_TCPOFOMERGE);
+                                __kfree_skb(skb);
+                                skb = NULL;
+                                tcp_dsack_set(sk, seq, end_seq);
+                                goto add_sack;
+                        }
+                        if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+                                /* Partial overlap. */
+                                tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
+                        } else {
+                                /* skb's seq == skb1's seq and skb covers skb1.
+                                 * Replace skb1 with skb.
+                                 */
+                                rb_replace_node(&skb1->rbnode, &skb->rbnode,
+                                                &tp->out_of_order_queue);
+                                tcp_dsack_extend(sk,
+                                                 TCP_SKB_CB(skb1)->seq,
+                                                 TCP_SKB_CB(skb1)->end_seq);
+                                NET_INC_STATS(sock_net(sk),
+                                              LINUX_MIB_TCPOFOMERGE);
+                                __kfree_skb(skb1);
+                                goto add_sack;
+                        }
+                } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
+                        goto coalesce_done;
                 }
-                skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
+                p = &parent->rb_right;
         }

-        /* Do skb overlap to previous one? */
-        if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
-                if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
-                        /* All the bits are present. Drop. */
-                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
-                        tcp_drop(sk, skb);
-                        skb = NULL;
-                        tcp_dsack_set(sk, seq, end_seq);
-                        goto add_sack;
-                }
-                if (after(seq, TCP_SKB_CB(skb1)->seq)) {
-                        /* Partial overlap. */
-                        tcp_dsack_set(sk, seq,
-                                      TCP_SKB_CB(skb1)->end_seq);
-                } else {
-                        if (skb_queue_is_first(&tp->out_of_order_queue,
-                                               skb1))
-                                skb1 = NULL;
-                        else
-                                skb1 = skb_queue_prev(
-                                        &tp->out_of_order_queue,
-                                        skb1);
-                }
-        }
-        if (!skb1)
-                __skb_queue_head(&tp->out_of_order_queue, skb);
-        else
-                __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+        /* Insert segment into RB tree. */
+        rb_link_node(&skb->rbnode, parent, p);
+        rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);

-        /* And clean segments covered by new one as whole. */
-        while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
-                skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+        /* Remove other segments covered by skb. */
+        while ((q = rb_next(&skb->rbnode)) != NULL) {
+                skb1 = rb_entry(q, struct sk_buff, rbnode);

                 if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
                         break;
@@ -4509,12 +4519,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
                                          end_seq);
                         break;
                 }
-                __skb_unlink(skb1, &tp->out_of_order_queue);
+                rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
                 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
                                  TCP_SKB_CB(skb1)->end_seq);
                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
                 tcp_drop(sk, skb1);
         }
+        /* If there is no skb after us, we are the last_skb ! */
+        if (!q)
+                tp->ooo_last_skb = skb;

 add_sack:
         if (tcp_is_sack(tp))
@@ -4651,13 +4664,13 @@ queue_and_out:
                 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                         tcp_fin(sk);

-                if (!skb_queue_empty(&tp->out_of_order_queue)) {
+                if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                         tcp_ofo_queue(sk);

                         /* RFC2581. 4.2. SHOULD send immediate ACK, when
                          * gap in queue is filled.
                          */
-                        if (skb_queue_empty(&tp->out_of_order_queue))
+                        if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
                                 inet_csk(sk)->icsk_ack.pingpong = 0;
                 }

@@ -4711,48 +4724,76 @@ drop:
         tcp_data_queue_ofo(sk, skb);
 }

+static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
+{
+        if (list)
+                return !skb_queue_is_last(list, skb) ? skb->next : NULL;
+
+        return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+}
+
 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
-                                        struct sk_buff_head *list)
+                                        struct sk_buff_head *list,
+                                        struct rb_root *root)
 {
-        struct sk_buff *next = NULL;
+        struct sk_buff *next = tcp_skb_next(skb, list);

-        if (!skb_queue_is_last(list, skb))
-                next = skb_queue_next(list, skb);
+        if (list)
+                __skb_unlink(skb, list);
+        else
+                rb_erase(&skb->rbnode, root);

-        __skb_unlink(skb, list);
         __kfree_skb(skb);
         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);

         return next;
 }

+/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
+static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+{
+        struct rb_node **p = &root->rb_node;
+        struct rb_node *parent = NULL;
+        struct sk_buff *skb1;
+
+        while (*p) {
+                parent = *p;
+                skb1 = rb_entry(parent, struct sk_buff, rbnode);
+                if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+                        p = &parent->rb_left;
+                else
+                        p = &parent->rb_right;
+        }
+        rb_link_node(&skb->rbnode, parent, p);
+        rb_insert_color(&skb->rbnode, root);
+}
+
 /* Collapse contiguous sequence of skbs head..tail with
  * sequence numbers start..end.
  *
- * If tail is NULL, this means until the end of the list.
+ * If tail is NULL, this means until the end of the queue.
  *
  * Segments with FIN/SYN are not collapsed (only because this
  * simplifies code)
  */
 static void
-tcp_collapse(struct sock *sk, struct sk_buff_head *list,
-             struct sk_buff *head, struct sk_buff *tail,
-             u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+             struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
 {
-        struct sk_buff *skb, *n;
+        struct sk_buff *skb = head, *n;
+        struct sk_buff_head tmp;
         bool end_of_skbs;

         /* First, check that queue is collapsible and find
-         * the point where collapsing can be useful. */
-        skb = head;
+         * the point where collapsing can be useful.
+         */
 restart:
-        end_of_skbs = true;
-        skb_queue_walk_from_safe(list, skb, n) {
-                if (skb == tail)
-                        break;
+        for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
+                n = tcp_skb_next(skb, list);
+
                 /* No new bits? It is possible on ofo queue. */
                 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
-                        skb = tcp_collapse_one(sk, skb, list);
+                        skb = tcp_collapse_one(sk, skb, list, root);
                         if (!skb)
                                 break;
                         goto restart;
@@ -4770,13 +4811,10 @@ restart:
                         break;
                 }

-                if (!skb_queue_is_last(list, skb)) {
-                        struct sk_buff *next = skb_queue_next(list, skb);
-                        if (next != tail &&
-                            TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
-                                end_of_skbs = false;
-                                break;
-                        }
+                if (n && n != tail &&
+                    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
+                        end_of_skbs = false;
+                        break;
                 }

                 /* Decided to skip this, advance start seq. */
@@ -4786,17 +4824,22 @@ restart:
             (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))
                 return;

+        __skb_queue_head_init(&tmp);
+
         while (before(start, end)) {
                 int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
                 struct sk_buff *nskb;

                 nskb = alloc_skb(copy, GFP_ATOMIC);
                 if (!nskb)
-                        return;
+                        break;

                 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
                 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
-                __skb_queue_before(list, skb, nskb);
+                if (list)
+                        __skb_queue_before(list, skb, nskb);
+                else
+                        __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
                 skb_set_owner_r(nskb, sk);

                 /* Copy data, releasing collapsed skbs. */
@@ -4814,14 +4857,17 @@ restart:
                                 start += size;
                         }
                         if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
-                                skb = tcp_collapse_one(sk, skb, list);
+                                skb = tcp_collapse_one(sk, skb, list, root);
                                 if (!skb ||
                                     skb == tail ||
                                     (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))
-                                        return;
+                                        goto end;
                         }
                 }
         }
+end:
+        skb_queue_walk_safe(&tmp, skb, n)
+                tcp_rbtree_insert(root, skb);
 }

 /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
@@ -4830,43 +4876,43 @@ restart:
 static void tcp_collapse_ofo_queue(struct sock *sk)
 {
         struct tcp_sock *tp = tcp_sk(sk);
-        struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
-        struct sk_buff *head;
+        struct sk_buff *skb, *head;
+        struct rb_node *p;
         u32 start, end;

-        if (!skb)
+        p = rb_first(&tp->out_of_order_queue);
+        skb = rb_entry_safe(p, struct sk_buff, rbnode);
+new_range:
+        if (!skb) {
+                p = rb_last(&tp->out_of_order_queue);
+                /* Note: This is possible p is NULL here. We do not
+                 * use rb_entry_safe(), as ooo_last_skb is valid only
+                 * if rbtree is not empty.
+                 */
+                tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
                 return;
-
+        }
         start = TCP_SKB_CB(skb)->seq;
         end = TCP_SKB_CB(skb)->end_seq;
-        head = skb;
-
-        for (;;) {
-                struct sk_buff *next = NULL;

-                if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
-                        next = skb_queue_next(&tp->out_of_order_queue, skb);
-                skb = next;
+        for (head = skb;;) {
+                skb = tcp_skb_next(skb, NULL);

-                /* Segment is terminated when we see gap or when
-                 * we are at the end of all the queue. */
+                /* Range is terminated when we see a gap or when
+                 * we are at the queue end.
+                 */
                 if (!skb ||
                     after(TCP_SKB_CB(skb)->seq, end) ||
                     before(TCP_SKB_CB(skb)->end_seq, start)) {
-                        tcp_collapse(sk, &tp->out_of_order_queue,
+                        tcp_collapse(sk, NULL, &tp->out_of_order_queue,
                                      head, skb, start, end);
-                        head = skb;
-                        if (!skb)
-                                break;
-                        /* Start new segment */
+                        goto new_range;
+                }
+
+                if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
                         start = TCP_SKB_CB(skb)->seq;
+                if (after(TCP_SKB_CB(skb)->end_seq, end))
                         end = TCP_SKB_CB(skb)->end_seq;
-                } else {
-                        if (before(TCP_SKB_CB(skb)->seq, start))
-                                start = TCP_SKB_CB(skb)->seq;
-                        if (after(TCP_SKB_CB(skb)->end_seq, end))
-                                end = TCP_SKB_CB(skb)->end_seq;
-                }
         }
 }

@@ -4883,20 +4929,24 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 static bool tcp_prune_ofo_queue(struct sock *sk)
 {
         struct tcp_sock *tp = tcp_sk(sk);
-        struct sk_buff *skb;
+        struct rb_node *node, *prev;

-        if (skb_queue_empty(&tp->out_of_order_queue))
+        if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
                 return false;

         NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
-
-        while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
-                tcp_drop(sk, skb);
+        node = &tp->ooo_last_skb->rbnode;
+        do {
+                prev = rb_prev(node);
+                rb_erase(node, &tp->out_of_order_queue);
+                tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
                 sk_mem_reclaim(sk);
                 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
                     !tcp_under_memory_pressure(sk))
                         break;
-        }
+                node = prev;
+        } while (node);
+        tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);

         /* Reset SACK state. A conforming SACK implementation will
          * do the same at a timeout based retransmit. When a connection
@@ -4930,7 +4980,7 @@ static int tcp_prune_queue(struct sock *sk)

         tcp_collapse_ofo_queue(sk);
         if (!skb_queue_empty(&sk->sk_receive_queue))
-                tcp_collapse(sk, &sk->sk_receive_queue,
+                tcp_collapse(sk, &sk->sk_receive_queue, NULL,
                              skb_peek(&sk->sk_receive_queue),
                              NULL,
                              tp->copied_seq, tp->rcv_nxt);
@@ -5035,7 +5085,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
             /* We ACK each frame or... */
             tcp_in_quickack_mode(sk) ||
             /* We have out of order data. */
-            (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
+            (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
                 /* Then ack it now */
                 tcp_send_ack(sk);
         } else {