diff options
author | Eric Dumazet <edumazet@google.com> | 2012-07-23 03:48:52 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-07-23 03:58:46 -0400 |
commit | 563d34d05786263893ba4a1042eb9b9374127cf5 (patch) | |
tree | e9ce502c1f32bea966c81d5597d0a29eb4b9d244 /net/ipv4 | |
parent | c3def943c7117d42caaed3478731ea7c3c87190e (diff) |
tcp: don't drop MTU reduction indications
ICMP messages generated in the output path when the frame length is bigger
than the MTU are actually lost, because the socket is owned by the user (doing the xmit).
One example is ipgre_tunnel_xmit() calling
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
We had a similar case fixed in commit a34a101e1e6 (ipv6: disable GSO on
sockets hitting dst_allfrag).
The problem with that fix is that it relied on retransmit timers, so short TCP
sessions paid too high a price in added latency.
This patch uses the tcp_release_cb() infrastructure so that MTU
reduction messages (ICMP messages) are not lost, and no extra delay
is added to TCP transmits.
Reported-by: Maciej Żenczykowski <maze@google.com>
Diagnosed-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Tore Anderson <tore@fud.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 19 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 6 |
2 files changed, 20 insertions, 5 deletions
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 59110caeb074..bc5432e3c778 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -275,12 +275,15 @@ failure: | |||
275 | EXPORT_SYMBOL(tcp_v4_connect); | 275 | EXPORT_SYMBOL(tcp_v4_connect); |
276 | 276 | ||
277 | /* | 277 | /* |
278 | * This routine does path mtu discovery as defined in RFC1191. | 278 | * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. |
279 | * It can be called through tcp_release_cb() if socket was owned by user | ||
280 | * at the time tcp_v4_err() was called to handle ICMP message. | ||
279 | */ | 281 | */ |
280 | static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) | 282 | static void tcp_v4_mtu_reduced(struct sock *sk) |
281 | { | 283 | { |
282 | struct dst_entry *dst; | 284 | struct dst_entry *dst; |
283 | struct inet_sock *inet = inet_sk(sk); | 285 | struct inet_sock *inet = inet_sk(sk); |
286 | u32 mtu = tcp_sk(sk)->mtu_info; | ||
284 | 287 | ||
285 | /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs | 288 | /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs |
286 | * send out by Linux are always <576bytes so they should go through | 289 | * send out by Linux are always <576bytes so they should go through |
@@ -373,8 +376,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
373 | bh_lock_sock(sk); | 376 | bh_lock_sock(sk); |
374 | /* If too many ICMPs get dropped on busy | 377 | /* If too many ICMPs get dropped on busy |
375 | * servers this needs to be solved differently. | 378 | * servers this needs to be solved differently. |
379 | * We do take care of PMTU discovery (RFC1191) special case : | ||
380 | * we can receive locally generated ICMP messages while socket is held. | ||
376 | */ | 381 | */ |
377 | if (sock_owned_by_user(sk)) | 382 | if (sock_owned_by_user(sk) && |
383 | type != ICMP_DEST_UNREACH && | ||
384 | code != ICMP_FRAG_NEEDED) | ||
378 | NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); | 385 | NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); |
379 | 386 | ||
380 | if (sk->sk_state == TCP_CLOSE) | 387 | if (sk->sk_state == TCP_CLOSE) |
@@ -409,8 +416,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
409 | goto out; | 416 | goto out; |
410 | 417 | ||
411 | if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ | 418 | if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ |
419 | tp->mtu_info = info; | ||
412 | if (!sock_owned_by_user(sk)) | 420 | if (!sock_owned_by_user(sk)) |
413 | do_pmtu_discovery(sk, iph, info); | 421 | tcp_v4_mtu_reduced(sk); |
422 | else | ||
423 | set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); | ||
414 | goto out; | 424 | goto out; |
415 | } | 425 | } |
416 | 426 | ||
@@ -2596,6 +2606,7 @@ struct proto tcp_prot = { | |||
2596 | .sendpage = tcp_sendpage, | 2606 | .sendpage = tcp_sendpage, |
2597 | .backlog_rcv = tcp_v4_do_rcv, | 2607 | .backlog_rcv = tcp_v4_do_rcv, |
2598 | .release_cb = tcp_release_cb, | 2608 | .release_cb = tcp_release_cb, |
2609 | .mtu_reduced = tcp_v4_mtu_reduced, | ||
2599 | .hash = inet_hash, | 2610 | .hash = inet_hash, |
2600 | .unhash = inet_unhash, | 2611 | .unhash = inet_unhash, |
2601 | .get_port = inet_csk_get_port, | 2612 | .get_port = inet_csk_get_port, |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 950aebfd9967..33cd065cfbd8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -885,7 +885,8 @@ static void tcp_tasklet_func(unsigned long data) | |||
885 | 885 | ||
886 | #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ | 886 | #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ |
887 | (1UL << TCP_WRITE_TIMER_DEFERRED) | \ | 887 | (1UL << TCP_WRITE_TIMER_DEFERRED) | \ |
888 | (1UL << TCP_DELACK_TIMER_DEFERRED)) | 888 | (1UL << TCP_DELACK_TIMER_DEFERRED) | \ |
889 | (1UL << TCP_MTU_REDUCED_DEFERRED)) | ||
889 | /** | 890 | /** |
890 | * tcp_release_cb - tcp release_sock() callback | 891 | * tcp_release_cb - tcp release_sock() callback |
891 | * @sk: socket | 892 | * @sk: socket |
@@ -914,6 +915,9 @@ void tcp_release_cb(struct sock *sk) | |||
914 | 915 | ||
915 | if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) | 916 | if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) |
916 | tcp_delack_timer_handler(sk); | 917 | tcp_delack_timer_handler(sk); |
918 | |||
919 | if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) | ||
920 | sk->sk_prot->mtu_reduced(sk); | ||
917 | } | 921 | } |
918 | EXPORT_SYMBOL(tcp_release_cb); | 922 | EXPORT_SYMBOL(tcp_release_cb); |
919 | 923 | ||