author	Eric Dumazet <edumazet@google.com>	2012-07-23 03:48:52 -0400
committer	David S. Miller <davem@davemloft.net>	2012-07-23 03:58:46 -0400
commit	563d34d05786263893ba4a1042eb9b9374127cf5 (patch)
tree	e9ce502c1f32bea966c81d5597d0a29eb4b9d244
parent	c3def943c7117d42caaed3478731ea7c3c87190e (diff)
tcp: dont drop MTU reduction indications
ICMP messages generated in the output path when the frame length is bigger than the MTU are currently lost, because the socket is owned by the user (doing the xmit). One example is ipgre_tunnel_xmit() calling icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));

A similar case was fixed in commit a34a101e1e6 (ipv6: disable GSO on sockets hitting dst_allfrag). The problem with that fix is that it relied on retransmit timers, so short TCP sessions paid too high a latency price.

This patch uses the tcp_release_cb() infrastructure so that MTU reduction indications (ICMP messages) are not lost, and no extra delay is added to TCP transmits.

Reported-by: Maciej Żenczykowski <maze@google.com>
Diagnosed-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Tore Anderson <tore@fud.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
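The deferral pattern the patch implements is easy to see in miniature. The following stand-alone C sketch models the dance between the ICMP handler, the deferred-work bitmask, and the release callback (sock_owned_by_user(), tsq_flags, tcp_release_cb() in the real code); the struct and function names below are illustrative stand-ins, not the kernel's own:

#include <stdio.h>

/* Illustrative stand-in for the relevant bits of socket state. */
struct fake_sock {
	int owned_by_user;	/* models sock_owned_by_user(sk) */
	unsigned long flags;	/* models tp->tsq_flags */
	unsigned int mtu_info;	/* models tp->mtu_info */
};

#define MTU_REDUCED_DEFERRED 0	/* bit index, like TCP_MTU_REDUCED_DEFERRED */

static void mtu_reduced(struct fake_sock *sk)
{
	printf("shrinking MSS for new mtu %u\n", sk->mtu_info);
}

/* ICMP handler: act now if possible, otherwise record and defer. */
static void icmp_frag_needed(struct fake_sock *sk, unsigned int mtu)
{
	sk->mtu_info = mtu;			/* always store the new MTU */
	if (!sk->owned_by_user)
		mtu_reduced(sk);		/* safe: socket not held by user */
	else
		sk->flags |= 1UL << MTU_REDUCED_DEFERRED; /* handle later */
}

/* Runs when the user releases the socket, like tcp_release_cb(). */
static void release_cb(struct fake_sock *sk)
{
	if (sk->flags & (1UL << MTU_REDUCED_DEFERRED)) {
		sk->flags &= ~(1UL << MTU_REDUCED_DEFERRED);
		mtu_reduced(sk);		/* no retransmit timer needed */
	}
}

int main(void)
{
	struct fake_sock sk = { .owned_by_user = 1 };

	icmp_frag_needed(&sk, 1280);	/* arrives while user owns the socket */
	sk.owned_by_user = 0;
	release_cb(&sk);		/* deferred work runs here, not at RTO */
	return 0;
}

Because the release callback fires as soon as the user path drops the socket lock, the deferred MTU handling happens immediately rather than waiting for a retransmit timeout, which is what made the earlier a34a101e1e6 approach costly for short sessions.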
-rw-r--r--	include/linux/tcp.h	6
-rw-r--r--	include/net/sock.h	1
-rw-r--r--	net/ipv4/tcp_ipv4.c	19
-rw-r--r--	net/ipv4/tcp_output.c	6
-rw-r--r--	net/ipv6/tcp_ipv6.c	40
5 files changed, 51 insertions(+), 21 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 2761856987b2..eb125a4c30b3 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -493,6 +493,9 @@ struct tcp_sock {
 		u32	  probe_seq_start;
 		u32	  probe_seq_end;
 	} mtu_probe;
+	u32	mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
+			   * while socket was owned by user.
+			   */
 
 #ifdef CONFIG_TCP_MD5SIG
 /* TCP AF-Specific parts; only used by MD5 Signature support so far */
@@ -518,6 +521,9 @@ enum tsq_flags {
 	TCP_TSQ_DEFERRED,	   /* tcp_tasklet_func() found socket was owned */
 	TCP_WRITE_TIMER_DEFERRED,  /* tcp_write_timer() found socket was owned */
 	TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */
+	TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
+				    * tcp_v{4|6}_mtu_reduced()
+				    */
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff --git a/include/net/sock.h b/include/net/sock.h
index 88de092df50f..e067f8c18f88 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -859,6 +859,7 @@ struct proto {
 						struct sk_buff *skb);
 
 	void		(*release_cb)(struct sock *sk);
+	void		(*mtu_reduced)(struct sock *sk);
 
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 59110caeb074..bc5432e3c778 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -275,12 +275,15 @@ failure:
 EXPORT_SYMBOL(tcp_v4_connect);
 
 /*
- * This routine does path mtu discovery as defined in RFC1191.
+ * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
+ * It can be called through tcp_release_cb() if socket was owned by user
+ * at the time tcp_v4_err() was called to handle ICMP message.
  */
-static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
+static void tcp_v4_mtu_reduced(struct sock *sk)
 {
 	struct dst_entry *dst;
 	struct inet_sock *inet = inet_sk(sk);
+	u32 mtu = tcp_sk(sk)->mtu_info;
 
 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 	 * send out by Linux are always <576bytes so they should go through
@@ -373,8 +376,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 	bh_lock_sock(sk);
 	/* If too many ICMPs get dropped on busy
 	 * servers this needs to be solved differently.
+	 * We do take care of PMTU discovery (RFC1191) special case :
+	 * we can receive locally generated ICMP messages while socket is held.
 	 */
-	if (sock_owned_by_user(sk))
+	if (sock_owned_by_user(sk) &&
+	    type != ICMP_DEST_UNREACH &&
+	    code != ICMP_FRAG_NEEDED)
 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 
 	if (sk->sk_state == TCP_CLOSE)
@@ -409,8 +416,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		goto out;
 
 	if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+		tp->mtu_info = info;
 		if (!sock_owned_by_user(sk))
-			do_pmtu_discovery(sk, iph, info);
+			tcp_v4_mtu_reduced(sk);
+		else
+			set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
 		goto out;
 	}
 
@@ -2596,6 +2606,7 @@ struct proto tcp_prot = {
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.release_cb		= tcp_release_cb,
+	.mtu_reduced		= tcp_v4_mtu_reduced,
 	.hash			= inet_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 950aebfd9967..33cd065cfbd8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -885,7 +885,8 @@ static void tcp_tasklet_func(unsigned long data)
 
 #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
 			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
-			  (1UL << TCP_DELACK_TIMER_DEFERRED))
+			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\
+			  (1UL << TCP_MTU_REDUCED_DEFERRED))
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -914,6 +915,9 @@ void tcp_release_cb(struct sock *sk)
 
 	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED))
 		tcp_delack_timer_handler(sk);
+
+	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED))
+		sk->sk_prot->mtu_reduced(sk);
 }
 EXPORT_SYMBOL(tcp_release_cb);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0302ec3fecfc..f49476e2d884 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -315,6 +315,23 @@ failure:
 	return err;
 }
 
+static void tcp_v6_mtu_reduced(struct sock *sk)
+{
+	struct dst_entry *dst;
+
+	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+		return;
+
+	dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info);
+	if (!dst)
+		return;
+
+	if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+		tcp_sync_mss(sk, dst_mtu(dst));
+		tcp_simple_retransmit(sk);
+	}
+}
+
 static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		u8 type, u8 code, int offset, __be32 info)
 {
@@ -342,7 +359,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	}
 
 	bh_lock_sock(sk);
-	if (sock_owned_by_user(sk))
+	if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 
 	if (sk->sk_state == TCP_CLOSE)
@@ -371,21 +388,11 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	}
 
 	if (type == ICMPV6_PKT_TOOBIG) {
-		struct dst_entry *dst;
-
-		if (sock_owned_by_user(sk))
-			goto out;
-		if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
-			goto out;
-
-		dst = inet6_csk_update_pmtu(sk, ntohl(info));
-		if (!dst)
-			goto out;
-
-		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
-			tcp_sync_mss(sk, dst_mtu(dst));
-			tcp_simple_retransmit(sk);
-		}
+		tp->mtu_info = ntohl(info);
+		if (!sock_owned_by_user(sk))
+			tcp_v6_mtu_reduced(sk);
+		else
+			set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
 		goto out;
 	}
 
@@ -1949,6 +1956,7 @@ struct proto tcpv6_prot = {
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.release_cb		= tcp_release_cb,
+	.mtu_reduced		= tcp_v6_mtu_reduced,
 	.hash			= tcp_v6_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,