Diffstat (limited to 'net/ipv4/tcp_input.c')

 net/ipv4/tcp_input.c | 334
 1 file changed, 189 insertions(+), 145 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2877c3e09587..e886e2f7fa8d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,6 +61,8 @@
  *		Pasi Sarolahti:	F-RTO for dealing with spurious RTOs
  */
 
+#define pr_fmt(fmt) "TCP: " fmt
+
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/module.h>
@@ -105,7 +107,6 @@ int sysctl_tcp_abc __read_mostly;
 #define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN. */
 #define FLAG_DATA_SACKED	0x20 /* New SACK. */
 #define FLAG_ECE		0x40 /* ECE in this ACK */
-#define FLAG_DATA_LOST		0x80 /* SACK detected data lossage. */
 #define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
 #define FLAG_ONLY_ORIG_SACKED	0x200 /* SACKs only non-rexmit sent before RTO */
 #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -1040,13 +1041,11 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
  * These 6 states form finite state machine, controlled by the following events:
  * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
  * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
- * 3. Loss detection event of one of three flavors:
+ * 3. Loss detection event of two flavors:
  *	A. Scoreboard estimator decided the packet is lost.
  *	   A'. Reno "three dupacks" marks head of queue lost.
- *	   A''. Its FACK modfication, head until snd.fack is lost.
- *	B. SACK arrives sacking data transmitted after never retransmitted
- *	   hole was sent out.
- *	C. SACK arrives sacking SND.NXT at the moment, when the
+ *	   A''. Its FACK modification, head until snd.fack is lost.
+ *	B. SACK arrives sacking SND.NXT at the moment, when the
  *	   segment was retransmitted.
  * 4. D-SACK added new rule: D-SACK changes any tag to S.
  *
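
Aside: the tags "L", "S" and "R" this state machine manipulates are bits in
TCP_SKB_CB(skb)->sacked. For orientation, these are the tag-bit definitions
from include/net/tcp.h of this era (shown for reference; not part of this diff):

/* Scoreboard tag bits kept in TCP_SKB_CB(skb)->sacked: "S" above is
 * TCPCB_SACKED_ACKED, "R" is TCPCB_SACKED_RETRANS, "L" is TCPCB_LOST.
 */
#define TCPCB_SACKED_ACKED	0x01	/* SKB ACK'd by a SACK block	*/
#define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted		*/
#define TCPCB_LOST		0x04	/* SKB is lost			*/
#define TCPCB_TAGBITS		0x07	/* All tag bits			*/
#define TCPCB_EVER_RETRANS	0x80	/* Ever retransmitted frame	*/
#define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
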
@@ -1153,7 +1152,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
 }
 
 /* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
- * Event "C". Later note: FACK people cheated me again 8), we have to account
+ * Event "B". Later note: FACK people cheated me again 8), we have to account
  * for reordering! Ugly, but should help.
  *
  * Search retransmitted skbs from write_queue that were sent when snd_nxt was
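
Aside: the seq/end_seq tests used throughout these hunks rely on the kernel's
wrap-safe 32-bit sequence helpers from include/net/tcp.h. A minimal userspace
model of them:

#include <stdbool.h>
#include <stdint.h>

/* Wrap-safe sequence comparison: seq1 precedes seq2 in 32-bit sequence
 * space even across wraparound, because the difference is evaluated as
 * a signed 32-bit value.
 */
static inline bool before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}
#define after(seq2, seq1)	before(seq1, seq2)
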
@@ -1310,25 +1309,26 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 	return in_sack;
 }
 
-static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
-			  struct tcp_sacktag_state *state,
+/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
+static u8 tcp_sacktag_one(struct sock *sk,
+			  struct tcp_sacktag_state *state, u8 sacked,
+			  u32 start_seq, u32 end_seq,
 			  int dup_sack, int pcount)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u8 sacked = TCP_SKB_CB(skb)->sacked;
 	int fack_count = state->fack_count;
 
 	/* Account D-SACK for retransmitted packet. */
 	if (dup_sack && (sacked & TCPCB_RETRANS)) {
 		if (tp->undo_marker && tp->undo_retrans &&
-		    after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
+		    after(end_seq, tp->undo_marker))
 			tp->undo_retrans--;
 		if (sacked & TCPCB_SACKED_ACKED)
 			state->reord = min(fack_count, state->reord);
 	}
 
 	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
-	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+	if (!after(end_seq, tp->snd_una))
 		return sacked;
 
 	if (!(sacked & TCPCB_SACKED_ACKED)) {
@@ -1347,13 +1347,13 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
 			/* New sack for not retransmitted frame,
 			 * which was in hole. It is reordering.
 			 */
-			if (before(TCP_SKB_CB(skb)->seq,
+			if (before(start_seq,
 				   tcp_highest_sack_seq(tp)))
 				state->reord = min(fack_count,
 						   state->reord);
 
 			/* SACK enhanced F-RTO (RFC4138; Appendix B) */
-			if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
+			if (!after(end_seq, tp->frto_highmark))
 				state->flag |= FLAG_ONLY_ORIG_SACKED;
 		}
 
@@ -1371,8 +1371,7 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
 
 		/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
 		if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
-		    before(TCP_SKB_CB(skb)->seq,
-			   TCP_SKB_CB(tp->lost_skb_hint)->seq))
+		    before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
 			tp->lost_cnt_hint += pcount;
 
 		if (fack_count > tp->fackets_out)
@@ -1391,6 +1390,9 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
 	return sacked;
 }
 
+/* Shift newly-SACKed bytes from this skb to the immediately previous
+ * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
+ */
 static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 			   struct tcp_sacktag_state *state,
 			   unsigned int pcount, int shifted, int mss,
@@ -1398,9 +1400,20 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
+	u32 start_seq = TCP_SKB_CB(skb)->seq;	/* start of newly-SACKed */
+	u32 end_seq = start_seq + shifted;	/* end of newly-SACKed */
 
 	BUG_ON(!pcount);
 
+	/* Adjust counters and hints for the newly sacked sequence
+	 * range but discard the return value since prev is already
+	 * marked. We must tag the range first because the seq
+	 * advancement below implicitly advances
+	 * tcp_highest_sack_seq() when skb is highest_sack.
+	 */
+	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
+			start_seq, end_seq, dup_sack, pcount);
+
 	if (skb == tp->lost_skb_hint)
 		tp->lost_cnt_hint += pcount;
 
@@ -1427,9 +1440,6 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 		skb_shinfo(skb)->gso_type = 0;
 	}
 
-	/* We discard results */
-	tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
-
 	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
 	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
 
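
The two hunks above are the heart of this change: tcp_shifted_skb() now
computes the newly-SACKed range itself and tags it *before* the shift,
because advancing TCP_SKB_CB(skb)->seq would otherwise move
tcp_highest_sack_seq() out from under the tagging step. A rough userspace
sketch of the range arithmetic (struct seg and shift_and_tag are
illustrative stand-ins, not kernel types):

#include <stdint.h>
#include <stdio.h>

/* When `shifted` bytes move from the front of `skb` into an
 * already-SACKed `prev`, only [seq, seq + shifted) is newly SACKed,
 * so that is the range handed to the tagging step -- before seq is
 * advanced.
 */
struct seg { uint32_t seq, end_seq; };

static void shift_and_tag(struct seg *prev, struct seg *skb, uint32_t shifted)
{
	uint32_t start_seq = skb->seq;		/* start of newly-SACKed */
	uint32_t end_seq = start_seq + shifted;	/* end of newly-SACKed */

	printf("tagging newly-SACKed range [%u, %u)\n", start_seq, end_seq);

	/* Only now advance the boundary between the two segments. */
	prev->end_seq += shifted;
	skb->seq += shifted;
}

int main(void)
{
	struct seg prev = { 1000, 2000 }, skb = { 2000, 3000 };

	shift_and_tag(&prev, &skb, 500);	/* tags [2000, 2500) */
	return 0;
}
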
@@ -1577,6 +1587,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
+	/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
+	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
+		goto fallback;
+
 	if (!skb_shift(prev, skb, len))
 		goto fallback;
 	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
@@ -1667,10 +1681,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 			break;
 
 		if (in_sack) {
-			TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk,
-								  state,
-								  dup_sack,
-								  tcp_skb_pcount(skb));
+			TCP_SKB_CB(skb)->sacked =
+				tcp_sacktag_one(sk,
+						state,
+						TCP_SKB_CB(skb)->sacked,
+						TCP_SKB_CB(skb)->seq,
+						TCP_SKB_CB(skb)->end_seq,
+						dup_sack,
+						tcp_skb_pcount(skb));
 
 			if (!before(TCP_SKB_CB(skb)->seq,
 				    tcp_highest_sack_seq(tp)))
@@ -1844,10 +1862,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 		if (found_dup_sack && ((i + 1) == first_sack_index))
 			next_dup = &sp[i + 1];
 
-		/* Event "B" in the comment above. */
-		if (after(end_seq, tp->high_seq))
-			state.flag |= FLAG_DATA_LOST;
-
 		/* Skip too early cached blocks */
 		while (tcp_sack_cache_ok(tp, cache) &&
 		       !before(start_seq, cache->end_seq))
@@ -2515,8 +2529,11 @@ static void tcp_timeout_skbs(struct sock *sk)
 	tcp_verify_left_out(tp);
 }
 
-/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
- * is against sacked "cnt", otherwise it's against facked "cnt"
+/* Detect loss in event "A" above by marking head of queue up as lost.
+ * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
+ * are considered lost. For RFC3517 SACK, a segment is considered lost if it
+ * has at least tp->reordering SACKed segments above it; "packets" refers to
+ * the maximum SACKed segments to pass before reaching this limit.
  */
 static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 {
@@ -2525,6 +2542,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 	int cnt, oldcnt;
 	int err;
 	unsigned int mss;
+	/* Use SACK to deduce losses of new sequences sent during recovery */
+	const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
 
 	WARN_ON(packets > tp->packets_out);
 	if (tp->lost_skb_hint) {
@@ -2546,7 +2565,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 		tp->lost_skb_hint = skb;
 		tp->lost_cnt_hint = cnt;
 
-		if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
+		if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
 			break;
 
 		oldcnt = cnt;
@@ -2556,6 +2575,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 
 		if (cnt > packets) {
 			if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+			    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
 			    (oldcnt >= packets))
 				break;
 
@@ -3033,19 +3053,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 	if (tcp_check_sack_reneging(sk, flag))
 		return;
 
-	/* C. Process data loss notification, provided it is valid. */
-	if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) &&
-	    before(tp->snd_una, tp->high_seq) &&
-	    icsk->icsk_ca_state != TCP_CA_Open &&
-	    tp->fackets_out > tp->reordering) {
-		tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
-	}
-
-	/* D. Check consistency of the current state. */
+	/* C. Check consistency of the current state. */
 	tcp_verify_left_out(tp);
 
-	/* E. Check state exit conditions. State can be terminated
+	/* D. Check state exit conditions. State can be terminated
 	 * when high_seq is ACKed. */
 	if (icsk->icsk_ca_state == TCP_CA_Open) {
 		WARN_ON(tp->retrans_out != 0);
@@ -3077,7 +3088,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 		}
 	}
 
-	/* F. Process state. */
+	/* E. Process state. */
 	switch (icsk->icsk_ca_state) {
 	case TCP_CA_Recovery:
 		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
@@ -3858,9 +3869,9 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
 				opt_rx->wscale_ok = 1;
 				if (snd_wscale > 14) {
 					if (net_ratelimit())
-						printk(KERN_INFO "tcp_parse_options: Illegal window "
-						       "scaling value %d >14 received.\n",
-						       snd_wscale);
+						pr_info("%s: Illegal window scaling value %d >14 received\n",
+							__func__,
+							snd_wscale);
 					snd_wscale = 14;
 				}
 				opt_rx->snd_wscale = snd_wscale;
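
The printk conversions in this and the next hunk lean on the pr_fmt define
added at the top of the file: each pr_*() macro expands its format string
through pr_fmt(), so every message in the file gains the "TCP: " prefix at
compile time. A minimal userspace model of that macro layering, with printf
standing in for printk:

#include <stdio.h>

/* Defining pr_fmt before the pr_*() macros prefixes every message in
 * the translation unit, exactly once, at compile time.
 */
#define pr_fmt(fmt) "TCP: " fmt
#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	int snd_wscale = 15;

	/* prints: TCP: main: Illegal window scaling value 15 >14 received */
	pr_info("%s: Illegal window scaling value %d >14 received\n",
		__func__, snd_wscale);
	return 0;
}
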
@@ -4182,7 +4193,7 @@ static void tcp_fin(struct sock *sk)
 		/* Only TCP_LISTEN and TCP_CLOSE are left, in these
 		 * cases we should never reach this piece of code.
 		 */
-		printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
+		pr_err("%s: Impossible, sk->sk_state=%d\n",
 		       __func__, sk->sk_state);
 		break;
 	}
@@ -4435,6 +4446,137 @@ static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
 	return 0;
 }
 
+static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb1;
+	u32 seq, end_seq;
+
+	TCP_ECN_check_ce(tp, skb);
+
+	if (tcp_try_rmem_schedule(sk, skb->truesize)) {
+		/* TODO: should increment a counter */
+		__kfree_skb(skb);
+		return;
+	}
+
+	/* Disable header prediction. */
+	tp->pred_flags = 0;
+	inet_csk_schedule_ack(sk);
+
+	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
+		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+	skb1 = skb_peek_tail(&tp->out_of_order_queue);
+	if (!skb1) {
+		/* Initial out of order segment, build 1 SACK. */
+		if (tcp_is_sack(tp)) {
+			tp->rx_opt.num_sacks = 1;
+			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
+			tp->selective_acks[0].end_seq =
+						TCP_SKB_CB(skb)->end_seq;
+		}
+		__skb_queue_head(&tp->out_of_order_queue, skb);
+		goto end;
+	}
+
+	seq = TCP_SKB_CB(skb)->seq;
+	end_seq = TCP_SKB_CB(skb)->end_seq;
+
+	if (seq == TCP_SKB_CB(skb1)->end_seq) {
+		/* Packets in ofo can stay in queue a long time.
+		 * Better try to coalesce them right now
+		 * to avoid future tcp_collapse_ofo_queue(),
+		 * probably the most expensive function in tcp stack.
+		 */
+		if (skb->len <= skb_tailroom(skb1) && !tcp_hdr(skb)->fin) {
+			NET_INC_STATS_BH(sock_net(sk),
+					 LINUX_MIB_TCPRCVCOALESCE);
+			BUG_ON(skb_copy_bits(skb, 0,
+					     skb_put(skb1, skb->len),
+					     skb->len));
+			TCP_SKB_CB(skb1)->end_seq = end_seq;
+			TCP_SKB_CB(skb1)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+			__kfree_skb(skb);
+			skb = NULL;
+		} else {
+			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+		}
+
+		if (!tp->rx_opt.num_sacks ||
+		    tp->selective_acks[0].end_seq != seq)
+			goto add_sack;
+
+		/* Common case: data arrive in order after hole. */
+		tp->selective_acks[0].end_seq = end_seq;
+		goto end;
+	}
+
+	/* Find place to insert this segment. */
+	while (1) {
+		if (!after(TCP_SKB_CB(skb1)->seq, seq))
+			break;
+		if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
+			skb1 = NULL;
+			break;
+		}
+		skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
+	}
+
+	/* Do skb overlap to previous one? */
+	if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+		if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+			/* All the bits are present. Drop. */
+			__kfree_skb(skb);
+			skb = NULL;
+			tcp_dsack_set(sk, seq, end_seq);
+			goto add_sack;
+		}
+		if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+			/* Partial overlap. */
+			tcp_dsack_set(sk, seq,
+				      TCP_SKB_CB(skb1)->end_seq);
+		} else {
+			if (skb_queue_is_first(&tp->out_of_order_queue,
+					       skb1))
+				skb1 = NULL;
+			else
+				skb1 = skb_queue_prev(
+					&tp->out_of_order_queue,
+					skb1);
+		}
+	}
+	if (!skb1)
+		__skb_queue_head(&tp->out_of_order_queue, skb);
+	else
+		__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+
+	/* And clean segments covered by new one as whole. */
+	while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
+		skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
+
+		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+			break;
+		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+					 end_seq);
+			break;
+		}
+		__skb_unlink(skb1, &tp->out_of_order_queue);
+		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+				 TCP_SKB_CB(skb1)->end_seq);
+		__kfree_skb(skb1);
+	}
+
+add_sack:
+	if (tcp_is_sack(tp))
+		tcp_sack_new_ofo_skb(sk, seq, end_seq);
+end:
+	if (skb)
+		skb_set_owner_r(skb, sk);
+}
+
+
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
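
The coalescing fast path in tcp_data_queue_ofo() above appends an
in-sequence ofo segment's payload directly into the queue's tail skb when it
fits in the tailroom, saving a queue entry and later collapse work. A rough
userspace model of that decision, with plain buffers standing in for
sk_buffs (struct buf and try_coalesce are illustrative, not kernel types):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct buf {
	uint32_t seq, end_seq;
	size_t len, cap;		/* cap - len models skb_tailroom() */
	unsigned char data[4096];
};

/* A segment that starts exactly at the tail buffer's end_seq and fits
 * in its spare room is copied in and the tail's end_seq extended,
 * instead of being queued as a new buffer. Segments carrying FIN are
 * kept separate, as in the kernel code above.
 */
static bool try_coalesce(struct buf *tail, const struct buf *skb, bool fin)
{
	if (skb->seq != tail->end_seq)		/* not contiguous */
		return false;
	if (skb->len > tail->cap - tail->len || fin)
		return false;			/* no room, or FIN */

	memcpy(tail->data + tail->len, skb->data, skb->len);
	tail->len += skb->len;
	tail->end_seq = skb->end_seq;		/* like the TCP_SKB_CB update */
	return true;
}
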
@@ -4550,105 +4692,7 @@ drop:
 		goto queue_and_out;
 	}
 
-	TCP_ECN_check_ce(tp, skb);
-
-	if (tcp_try_rmem_schedule(sk, skb->truesize))
-		goto drop;
-
-	/* Disable header prediction. */
-	tp->pred_flags = 0;
-	inet_csk_schedule_ack(sk);
-
-	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
-		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
-
-	skb_set_owner_r(skb, sk);
-
-	if (!skb_peek(&tp->out_of_order_queue)) {
-		/* Initial out of order segment, build 1 SACK. */
-		if (tcp_is_sack(tp)) {
-			tp->rx_opt.num_sacks = 1;
-			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
-			tp->selective_acks[0].end_seq =
-						TCP_SKB_CB(skb)->end_seq;
-		}
-		__skb_queue_head(&tp->out_of_order_queue, skb);
-	} else {
-		struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue);
-		u32 seq = TCP_SKB_CB(skb)->seq;
-		u32 end_seq = TCP_SKB_CB(skb)->end_seq;
-
-		if (seq == TCP_SKB_CB(skb1)->end_seq) {
-			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
-
-			if (!tp->rx_opt.num_sacks ||
-			    tp->selective_acks[0].end_seq != seq)
-				goto add_sack;
-
-			/* Common case: data arrive in order after hole. */
-			tp->selective_acks[0].end_seq = end_seq;
-			return;
-		}
-
-		/* Find place to insert this segment. */
-		while (1) {
-			if (!after(TCP_SKB_CB(skb1)->seq, seq))
-				break;
-			if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
-				skb1 = NULL;
-				break;
-			}
-			skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
-		}
-
-		/* Do skb overlap to previous one? */
-		if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
-			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
-				/* All the bits are present. Drop. */
-				__kfree_skb(skb);
-				tcp_dsack_set(sk, seq, end_seq);
-				goto add_sack;
-			}
-			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
-				/* Partial overlap. */
-				tcp_dsack_set(sk, seq,
-					      TCP_SKB_CB(skb1)->end_seq);
-			} else {
-				if (skb_queue_is_first(&tp->out_of_order_queue,
-						       skb1))
-					skb1 = NULL;
-				else
-					skb1 = skb_queue_prev(
-						&tp->out_of_order_queue,
-						skb1);
-			}
-		}
-		if (!skb1)
-			__skb_queue_head(&tp->out_of_order_queue, skb);
-		else
-			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
-
-		/* And clean segments covered by new one as whole. */
-		while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
-			skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
-
-			if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
-				break;
-			if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
-				tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
-						 end_seq);
-				break;
-			}
-			__skb_unlink(skb1, &tp->out_of_order_queue);
-			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
-					 TCP_SKB_CB(skb1)->end_seq);
-			__kfree_skb(skb1);
-		}
-
-add_sack:
-		if (tcp_is_sack(tp))
-			tcp_sack_new_ofo_skb(sk, seq, end_seq);
-	}
+	tcp_data_queue_ofo(sk, skb);
 }
 
 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
