Diffstat (limited to 'net/ipv4/tcp_output.c')
 net/ipv4/tcp_output.c | 241 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 165 insertions(+), 76 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a2a796c5536b..a369e8a70b2c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -518,17 +518,26 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 
         if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
                 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+                u8 *p = (u8 *)ptr;
+                u32 len; /* Fast Open option length */
+
+                if (foc->exp) {
+                        len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+                        *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
+                                     TCPOPT_FASTOPEN_MAGIC);
+                        p += TCPOLEN_EXP_FASTOPEN_BASE;
+                } else {
+                        len = TCPOLEN_FASTOPEN_BASE + foc->len;
+                        *p++ = TCPOPT_FASTOPEN;
+                        *p++ = len;
+                }
 
-                *ptr++ = htonl((TCPOPT_EXP << 24) |
-                               ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
-                               TCPOPT_FASTOPEN_MAGIC);
-
-                memcpy(ptr, foc->val, foc->len);
-                if ((foc->len & 3) == 2) {
-                        u8 *align = ((u8 *)ptr) + foc->len;
-                        align[0] = align[1] = TCPOPT_NOP;
+                memcpy(p, foc->val, foc->len);
+                if ((len & 3) == 2) {
+                        p[foc->len] = TCPOPT_NOP;
+                        p[foc->len + 1] = TCPOPT_NOP;
                 }
-                ptr += (foc->len + 3) >> 2;
+                ptr += (len + 3) >> 2;
         }
 }
 
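Both encodings carry the cookie verbatim; only the option header differs. A minimal sketch of the two wire layouts, assuming an 8-byte cookie (kinds and base lengths as defined in include/net/tcp.h):

/* RFC 7413 form (foc->exp == 0), TCPOLEN_FASTOPEN_BASE = 2:
 *     kind=34 | len=10 | cookie[8]                     -> 10 bytes
 * experimental form (foc->exp != 0), TCPOLEN_EXP_FASTOPEN_BASE = 4:
 *     kind=254 | len=12 | magic 0xF989 | cookie[8]     -> 12 bytes
 *
 * 10 & 3 == 2, so only the RFC 7413 form needs the two TCPOPT_NOP
 * pad bytes before ptr advances by (len + 3) >> 2 32-bit words.
 */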
@@ -565,7 +574,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
         opts->mss = tcp_advertise_mss(sk);
         remaining -= TCPOLEN_MSS_ALIGNED;
 
-        if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+        if (likely(sysctl_tcp_timestamps && !*md5)) {
                 opts->options |= OPTION_TS;
                 opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
                 opts->tsecr = tp->rx_opt.ts_recent;
@@ -583,13 +592,17 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
         }
 
         if (fastopen && fastopen->cookie.len >= 0) {
-                u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+                u32 need = fastopen->cookie.len;
+
+                need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+                                               TCPOLEN_FASTOPEN_BASE;
                 need = (need + 3) & ~3U;  /* Align to 32 bits */
                 if (remaining >= need) {
                         opts->options |= OPTION_FAST_OPEN_COOKIE;
                         opts->fastopen_cookie = &fastopen->cookie;
                         remaining -= need;
                         tp->syn_fastopen = 1;
+                        tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
                 }
         }
 
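The accounting here mirrors the bytes tcp_options_write() emits above. A standalone sketch of the rounding, using the same constants, with an 8-byte cookie as an assumed worked example:

static unsigned int fastopen_opt_space(unsigned int cookie_len, bool exp)
{
        unsigned int need = cookie_len +
                            (exp ? TCPOLEN_EXP_FASTOPEN_BASE  /* 4 */
                                 : TCPOLEN_FASTOPEN_BASE);    /* 2 */

        return (need + 3) & ~3U;  /* e.g. 8 + 2 = 10 -> 12 bytes */
}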
@@ -601,15 +614,14 @@ static unsigned int tcp_synack_options(struct sock *sk,
                                         struct request_sock *req,
                                         unsigned int mss, struct sk_buff *skb,
                                         struct tcp_out_options *opts,
-                                        struct tcp_md5sig_key **md5,
+                                        const struct tcp_md5sig_key *md5,
                                         struct tcp_fastopen_cookie *foc)
 {
         struct inet_request_sock *ireq = inet_rsk(req);
         unsigned int remaining = MAX_TCP_OPTION_SPACE;
 
 #ifdef CONFIG_TCP_MD5SIG
-        *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
-        if (*md5) {
+        if (md5) {
                 opts->options |= OPTION_MD5;
                 remaining -= TCPOLEN_MD5SIG_ALIGNED;
 
@@ -620,8 +632,6 @@ static unsigned int tcp_synack_options(struct sock *sk,
          */
         ireq->tstamp_ok &= !ireq->sack_ok;
         }
-#else
-        *md5 = NULL;
 #endif
 
         /* We always send an MSS option. */
@@ -645,7 +655,10 @@ static unsigned int tcp_synack_options(struct sock *sk,
                 remaining -= TCPOLEN_SACKPERM_ALIGNED;
         }
         if (foc != NULL && foc->len >= 0) {
-                u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+                u32 need = foc->len;
+
+                need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+                                   TCPOLEN_FASTOPEN_BASE;
                 need = (need + 3) & ~3U;  /* Align to 32 bits */
                 if (remaining >= need) {
                         opts->options |= OPTION_FAST_OPEN_COOKIE;
@@ -989,7 +1002,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
         if (md5) {
                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                 tp->af_specific->calc_md5_hash(opts.hash_location,
-                                               md5, sk, NULL, skb);
+                                               md5, sk, skb);
         }
 #endif
 
@@ -1151,7 +1164,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 
         /* Get a new skb... force flag on. */
         buff = sk_stream_alloc_skb(sk, nsize, gfp);
-        if (buff == NULL)
+        if (!buff)
                 return -ENOMEM; /* We'll just try again later. */
 
         sk->sk_wmem_queued += buff->truesize;
@@ -1354,6 +1367,8 @@ void tcp_mtup_init(struct sock *sk)
                                icsk->icsk_af_ops->net_header_len;
         icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
         icsk->icsk_mtup.probe_size = 0;
+        if (icsk->icsk_mtup.enabled)
+                icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
 }
 EXPORT_SYMBOL(tcp_mtup_init);
 
@@ -1708,7 +1723,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
                 return tcp_fragment(sk, skb, len, mss_now, gfp);
 
         buff = sk_stream_alloc_skb(sk, 0, gfp);
-        if (unlikely(buff == NULL))
+        if (unlikely(!buff))
                 return -ENOMEM;
 
         sk->sk_wmem_queued += buff->truesize;
@@ -1752,20 +1767,23 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
                                  bool *is_cwnd_limited, u32 max_segs)
 {
-        struct tcp_sock *tp = tcp_sk(sk);
         const struct inet_connection_sock *icsk = inet_csk(sk);
-        u32 send_win, cong_win, limit, in_flight;
+        u32 age, send_win, cong_win, limit, in_flight;
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct skb_mstamp now;
+        struct sk_buff *head;
         int win_divisor;
 
         if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                 goto send_now;
 
-        if (icsk->icsk_ca_state != TCP_CA_Open)
+        if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
                 goto send_now;
 
-        /* Defer for less than two clock ticks. */
-        if (tp->tso_deferred &&
-            (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
+        /* Avoid bursty behavior by allowing defer
+         * only if the last write was recent.
+         */
+        if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
                 goto send_now;
 
         in_flight = tcp_packets_in_flight(tp);
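The jiffies arithmetic being deleted packed a flag bit and a timestamp into tp->tso_deferred; the replacement needs no extra per-socket state. Since tcp_time_stamp and tp->lsndtime both count in jiffies, the new test reduces to:

/* Defer only if the last write happened within the current jiffy;
 * if the flow went idle for a tick or more, deferring would just
 * add burstiness, so transmit immediately.
 */
if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)   /* last write >= 1 tick old */
        goto send_now;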
@@ -1807,11 +1825,14 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
                         goto send_now;
         }
 
-        /* Ok, it looks like it is advisable to defer.
-         * Do not rearm the timer if already set to not break TCP ACK clocking.
-         */
-        if (!tp->tso_deferred)
-                tp->tso_deferred = 1 | (jiffies << 1);
+        head = tcp_write_queue_head(sk);
+        skb_mstamp_get(&now);
+        age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
+        /* If next ACK is likely to come too late (half srtt), do not defer */
+        if (age < (tp->srtt_us >> 4))
+                goto send_now;
+
+        /* Ok, it looks like it is advisable to defer. */
 
         if (cong_win < send_win && cong_win < skb->len)
                 *is_cwnd_limited = true;
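One scaling detail worth spelling out: tp->srtt_us holds the smoothed RTT in microseconds, left-shifted by 3 (i.e. 8 * srtt), so:

u32 half_srtt = tp->srtt_us >> 4;       /* (8 * srtt) / 16 = srtt / 2 */

If the skb at the head of the write queue was transmitted less than half an RTT ago, the ACK that would open more window is still at least half an RTT away, so deferring this packet cannot pay off and it is sent at once.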
@@ -1819,10 +1840,34 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
         return true;
 
 send_now:
-        tp->tso_deferred = 0;
         return false;
 }
 
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+        struct inet_connection_sock *icsk = inet_csk(sk);
+        struct tcp_sock *tp = tcp_sk(sk);
+        struct net *net = sock_net(sk);
+        u32 interval;
+        s32 delta;
+
+        interval = net->ipv4.sysctl_tcp_probe_interval;
+        delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+        if (unlikely(delta >= interval * HZ)) {
+                int mss = tcp_current_mss(sk);
+
+                /* Update current search range */
+                icsk->icsk_mtup.probe_size = 0;
+                icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+                                              sizeof(struct tcphdr) +
+                                              icsk->icsk_af_ops->net_header_len;
+                icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+                /* Update probe time stamp */
+                icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+        }
+}
+
 /* Create a new MTU probe if we are ready.
  * MTU probe is regularly attempting to increase the path MTU by
  * deliberately sending larger packets. This discovers routing
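Units matter in the new helper: icsk_mtup.probe_timestamp is recorded in jiffies via tcp_time_stamp, while the sysctl is configured in seconds, hence the HZ conversion. A brief annotation (the 600 s default is an assumption based on the companion sysctl change, not visible in this diff):

/* delta (jiffies) >= interval (seconds) * HZ  ->  enough time passed:
 * reset probe_size, re-widen search_low/search_high so a later
 * path-MTU increase can still be discovered, and restart the clock.
 */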
@@ -1837,11 +1882,13 @@ static int tcp_mtu_probe(struct sock *sk)
         struct tcp_sock *tp = tcp_sk(sk);
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct sk_buff *skb, *nskb, *next;
+        struct net *net = sock_net(sk);
         int len;
         int probe_size;
         int size_needed;
         int copy;
         int mss_now;
+        int interval;
 
         /* Not currently probing/verifying,
          * not in recovery,
@@ -1854,12 +1901,25 @@
             tp->rx_opt.num_sacks || tp->rx_opt.dsack)
                 return -1;
 
-        /* Very simple search strategy: just double the MSS. */
+        /* Use binary search for probe_size between tcp_mss_base
+         * and current mss_clamp. If (search_high - search_low) is
+         * smaller than a threshold, back off from probing.
+         */
         mss_now = tcp_current_mss(sk);
-        probe_size = 2 * tp->mss_cache;
+        probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+                                         icsk->icsk_mtup.search_low) >> 1);
         size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
-        if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
-                /* TODO: set timer for probe_converge_event */
+        interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+        /* When misfortune happens, we are reprobing actively,
+         * and then the reprobe timer has expired. We stick with the
+         * current probing process by not resetting the search range
+         * to its original value.
+         */
+        if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+            interval < net->ipv4.sysctl_tcp_probe_threshold) {
+                /* Check whether enough time has elapsed for
+                 * another round of probing.
+                 */
+                tcp_mtu_check_reprobe(sk);
                 return -1;
         }
 
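A worked example of the bisection, with assumed bounds:

/* Assume search_low = 1052 and search_high = 1500 (MTU bytes):
 *   probe MTU  = (1500 + 1052) >> 1 = 1276
 *   probe_size = tcp_mtu_to_mss(sk, 1276)   (MTU -> MSS conversion)
 *   interval   = 1500 - 1052 = 448
 * Each probe outcome moves one bound to the midpoint, so the search
 * converges in O(log2(range)) probes and stops once interval drops
 * below sysctl_tcp_probe_threshold (8 by default per the companion
 * patch -- an assumption, as the default is not visible here).
 */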
@@ -1881,7 +1941,8 @@ static int tcp_mtu_probe(struct sock *sk)
         }
 
         /* We're allowed to probe. Build it now. */
-        if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+        nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC);
+        if (!nskb)
                 return -1;
         sk->sk_wmem_queued += nskb->truesize;
         sk_mem_charge(sk, nskb->truesize);
@@ -2179,7 +2240,7 @@ void tcp_send_loss_probe(struct sock *sk)
         int mss = tcp_current_mss(sk);
         int err = -1;
 
-        if (tcp_send_head(sk) != NULL) {
+        if (tcp_send_head(sk)) {
                 err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
                 goto rearm_timer;
         }
@@ -2689,7 +2750,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                 if (skb == tcp_send_head(sk))
                         break;
                 /* we could do better than to assign each time */
-                if (hole == NULL)
+                if (!hole)
                         tp->retransmit_skb_hint = skb;
 
                 /* Assume this retransmit will generate
@@ -2713,7 +2774,7 @@ begin_fwd:
                         if (!tcp_can_forward_retransmit(sk))
                                 break;
                         /* Backtrack if necessary to non-L'ed skb */
-                        if (hole != NULL) {
+                        if (hole) {
                                 skb = hole;
                                 hole = NULL;
                         }
@@ -2721,7 +2782,7 @@
                         goto begin_fwd;
 
                 } else if (!(sacked & TCPCB_LOST)) {
-                        if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
+                        if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
                                 hole = skb;
                         continue;
 
@@ -2751,43 +2812,65 @@ begin_fwd:
         }
 }
 
-/* Send a fin. The caller locks the socket for us. This cannot be
- * allowed to fail queueing a FIN frame under any circumstances.
+/* We allow exceeding memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ */
+static void sk_forced_wmem_schedule(struct sock *sk, int size)
+{
+        int amt, status;
+
+        if (size <= sk->sk_forward_alloc)
+                return;
+        amt = sk_mem_pages(size);
+        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+        sk_memory_allocated_add(sk, amt, &status);
+}
+
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
  */
 void tcp_send_fin(struct sock *sk)
 {
+        struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
         struct tcp_sock *tp = tcp_sk(sk);
-        struct sk_buff *skb = tcp_write_queue_tail(sk);
-        int mss_now;
 
-        /* Optimization, tack on the FIN if we have a queue of
-         * unsent frames. But be careful about outgoing SACKS
-         * and IP options.
+        /* Optimization, tack on the FIN if we have one skb in write queue and
+         * this skb was not yet sent, or we are under memory pressure.
+         * Note: in the latter case, FIN packet will be sent after a timeout,
+         * as TCP stack thinks it has already been transmitted.
          */
-        mss_now = tcp_current_mss(sk);
-
-        if (tcp_send_head(sk) != NULL) {
-                TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
-                TCP_SKB_CB(skb)->end_seq++;
+        if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+coalesce:
+                TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+                TCP_SKB_CB(tskb)->end_seq++;
                 tp->write_seq++;
+                if (!tcp_send_head(sk)) {
+                        /* This means tskb was already sent.
+                         * Pretend we included the FIN on previous transmit.
+                         * We need to set tp->snd_nxt to the value it would have
+                         * if FIN had been sent. This is because retransmit path
+                         * does not change tp->snd_nxt.
+                         */
+                        tp->snd_nxt++;
+                        return;
+                }
         } else {
-                /* Socket is locked, keep trying until memory is available. */
-                for (;;) {
-                        skb = alloc_skb_fclone(MAX_TCP_HEADER,
-                                               sk->sk_allocation);
-                        if (skb)
-                                break;
-                        yield();
+                skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+                if (unlikely(!skb)) {
+                        if (tskb)
+                                goto coalesce;
+                        return;
                 }
-
-                /* Reserve space for headers and prepare control bits. */
                 skb_reserve(skb, MAX_TCP_HEADER);
+                sk_forced_wmem_schedule(sk, skb->truesize);
                 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
                 tcp_init_nondata_skb(skb, tp->write_seq,
                                      TCPHDR_ACK | TCPHDR_FIN);
                 tcp_queue_skb(sk, skb);
         }
-        __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
+        __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
 }
 
 /* We get here when a process closes a file descriptor (either due to
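Two details in this hunk are easy to miss:

/* 1) sk_forced_wmem_schedule() cannot fail: sk_mem_pages() rounds
 *    truesize up to whole SK_MEM_QUANTUM units (PAGE_SIZE in this
 *    era of the tree) and charges them unconditionally, so the FIN
 *    skb can always be queued, even under memory pressure.
 *
 * 2) In the coalesce path, when the tail skb was already sent
 *    (!tcp_send_head(sk)), tp->snd_nxt is bumped by hand: the FIN
 *    will ride on a retransmit of tskb, and the retransmit path
 *    never advances snd_nxt itself.
 */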
@@ -2828,14 +2911,14 @@ int tcp_send_synack(struct sock *sk)
         struct sk_buff *skb;
 
         skb = tcp_write_queue_head(sk);
-        if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+        if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
                 pr_debug("%s: wrong queue state\n", __func__);
                 return -EFAULT;
         }
         if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
                 if (skb_cloned(skb)) {
                         struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
-                        if (nskb == NULL)
+                        if (!nskb)
                                 return -ENOMEM;
                         tcp_unlink_write_queue(skb, sk);
                         __skb_header_release(nskb);
@@ -2870,7 +2953,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
         struct tcp_sock *tp = tcp_sk(sk);
         struct tcphdr *th;
         struct sk_buff *skb;
-        struct tcp_md5sig_key *md5;
+        struct tcp_md5sig_key *md5 = NULL;
         int tcp_header_size;
         int mss;
 
@@ -2883,7 +2966,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
         skb_reserve(skb, MAX_TCP_HEADER);
 
         skb_dst_set(skb, dst);
-        security_skb_owned_by(skb, sk);
 
         mss = dst_metric_advmss(dst);
         if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
@@ -2896,7 +2978,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
         else
 #endif
         skb_mstamp_get(&skb->skb_mstamp);
-        tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
+
+#ifdef CONFIG_TCP_MD5SIG
+        rcu_read_lock();
+        md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
+#endif
+        tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
                                              foc) + sizeof(*th);
 
         skb_push(skb, tcp_header_size);
@@ -2927,12 +3014,14 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
 #ifdef CONFIG_TCP_MD5SIG
         /* Okay, we have all we need - do the md5 hash if needed */
-        if (md5) {
+        if (md5)
                 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
-                                               md5, NULL, req, skb);
-        }
+                                                         md5, req_to_sk(req), skb);
+        rcu_read_unlock();
 #endif
 
+        /* Do not fool tcpdump (if any), clean our debris */
+        skb->tstamp.tv64 = 0;
         return skb;
 }
 EXPORT_SYMBOL(tcp_make_synack);
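The MD5 rework also changes a lifetime rule: the key returned by req_md5_lookup() is RCU-protected, so the read-side section opened in the previous hunk must stay open until calc_md5_hash() has consumed the key. The resulting flow, condensed:

/* rcu_read_lock();                      <- taken before the key lookup
 * md5 = ...->req_md5_lookup(sk, req_to_sk(req));
 * ... build the SYNACK header and options ...
 * if (md5)
 *         ...->calc_md5_hash(opts.hash_location, md5, req_to_sk(req), skb);
 * rcu_read_unlock();                    <- key may be freed after this
 */

Clearing skb->tstamp.tv64 on the way out erases the skb_mstamp value that shares the same union, so packet captures of the SYNACK do not report a bogus transmit timestamp.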
@@ -2970,7 +3059,7 @@ static void tcp_connect_init(struct sock *sk)
                 (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
 
 #ifdef CONFIG_TCP_MD5SIG
-        if (tp->af_specific->md5_lookup(sk, sk) != NULL)
+        if (tp->af_specific->md5_lookup(sk, sk))
                 tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
 #endif
 
@@ -3256,7 +3345,7 @@ void tcp_send_ack(struct sock *sk)
          * sock.
          */
         buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
-        if (buff == NULL) {
+        if (!buff) {
                 inet_csk_schedule_ack(sk);
                 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
@@ -3300,7 +3389,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
 
         /* We don't queue it, tcp_transmit_skb() sets ownership. */
         skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
-        if (skb == NULL)
+        if (!skb)
                 return -1;
 
         /* Reserve space for headers and set control bits. */
@@ -3331,8 +3420,8 @@ int tcp_write_wakeup(struct sock *sk)
         if (sk->sk_state == TCP_CLOSE)
                 return -1;
 
-        if ((skb = tcp_send_head(sk)) != NULL &&
-            before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
+        skb = tcp_send_head(sk);
+        if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
                 int err;
                 unsigned int mss = tcp_current_mss(sk);
                 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
