aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIlpo Järvinen <ilpo.jarvinen@helsinki.fi>2008-05-08 04:09:11 -0400
committerDavid S. Miller <davem@davemloft.net>2008-05-08 04:09:11 -0400
commit62ab22278308a40bcb7f4079e9719ab8b7fe11b5 (patch)
treecb8153cfa53fd9416fc0a748c0c8eea90b4e9b38
parent9d1045ad68fcccfaf1393cc463ab6357693e8d1d (diff)
tcp FRTO: SACK variant is errorneously used with NewReno
Note: there's actually another bug in FRTO's SACK variant, which is the causing failure in NewReno case because of the error that's fixed here. I'll fix the SACK case separately (it's a separate bug really, though related, but in order to fix that I need to audit tp->snd_nxt usage a bit). There were two places where SACK variant of FRTO is getting incorrectly used even if SACK wasn't negotiated by the TCP flow. This leads to incorrect setting of frto_highmark with NewReno if a previous recovery was interrupted by another RTO. An eventual fallback to conventional recovery then incorrectly considers one or couple of segments as forward transmissions though they weren't, which then are not LOST marked during fallback making them "non-retransmittable" until the next RTO. In a bad case, those segments are really lost and are the only one left in the window. Thus TCP needs another RTO to continue. The next FRTO, however, could again repeat the same events making the progress of the TCP flow extremely slow. In order for these events to occur at all, FRTO must occur again in FRTOs step 3 while the key segments must be lost as well, which is not too likely in practice. It seems to most frequently with some small devices such as network printers that *seem* to accept TCP segments only in-order. In cases were key segments weren't lost, things get automatically resolved because those wrongly marked segments don't need to be retransmitted in order to continue. I found a reproducer after digging up relevant reports (few reports in total, none at netdev or lkml I know of), some cases seemed to indicate middlebox issues which seems now to be a false assumption some people had made. Bugzilla #10063 _might_ be related. Damon L. Chesser <damon@damtek.com> had a reproducable case and was kind enough to tcpdump it for me. With the tcpdump log it was quite trivial to figure out. Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/ipv4/tcp_input.c13
1 files changed, 8 insertions, 5 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8ac15a604e08..26c936930e92 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -114,8 +114,6 @@ int sysctl_tcp_abc __read_mostly;
114#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) 114#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
115#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) 115#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
116 116
117#define IsSackFrto() (sysctl_tcp_frto == 0x2)
118
119#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) 117#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
120#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) 118#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
121 119
@@ -1686,6 +1684,11 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
1686 tp->sacked_out = 0; 1684 tp->sacked_out = 0;
1687} 1685}
1688 1686
1687static int tcp_is_sackfrto(const struct tcp_sock *tp)
1688{
1689 return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp);
1690}
1691
1689/* F-RTO can only be used if TCP has never retransmitted anything other than 1692/* F-RTO can only be used if TCP has never retransmitted anything other than
1690 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) 1693 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
1691 */ 1694 */
@@ -1702,7 +1705,7 @@ int tcp_use_frto(struct sock *sk)
1702 if (icsk->icsk_mtup.probe_size) 1705 if (icsk->icsk_mtup.probe_size)
1703 return 0; 1706 return 0;
1704 1707
1705 if (IsSackFrto()) 1708 if (tcp_is_sackfrto(tp))
1706 return 1; 1709 return 1;
1707 1710
1708 /* Avoid expensive walking of rexmit queue if possible */ 1711 /* Avoid expensive walking of rexmit queue if possible */
@@ -1792,7 +1795,7 @@ void tcp_enter_frto(struct sock *sk)
1792 /* Earlier loss recovery underway (see RFC4138; Appendix B). 1795 /* Earlier loss recovery underway (see RFC4138; Appendix B).
1793 * The last condition is necessary at least in tp->frto_counter case. 1796 * The last condition is necessary at least in tp->frto_counter case.
1794 */ 1797 */
1795 if (IsSackFrto() && (tp->frto_counter || 1798 if (tcp_is_sackfrto(tp) && (tp->frto_counter ||
1796 ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && 1799 ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
1797 after(tp->high_seq, tp->snd_una)) { 1800 after(tp->high_seq, tp->snd_una)) {
1798 tp->frto_highmark = tp->high_seq; 1801 tp->frto_highmark = tp->high_seq;
@@ -3124,7 +3127,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
3124 return 1; 3127 return 1;
3125 } 3128 }
3126 3129
3127 if (!IsSackFrto() || tcp_is_reno(tp)) { 3130 if (!tcp_is_sackfrto(tp)) {
3128 /* RFC4138 shortcoming in step 2; should also have case c): 3131 /* RFC4138 shortcoming in step 2; should also have case c):
3129 * ACK isn't duplicate nor advances window, e.g., opposite dir 3132 * ACK isn't duplicate nor advances window, e.g., opposite dir
3130 * data, winupdate 3133 * data, winupdate