Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--   net/ipv4/tcp_output.c   219
1 files changed, 88 insertions, 131 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fe3b4bdfd251..557fe16cbfb0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -42,7 +42,7 @@
 /* People can turn this off for buggy TCP's found in printers etc. */
 int sysctl_tcp_retrans_collapse __read_mostly = 1;

 /* People can turn this on to work with those rare, broken TCPs that
  * interpret the window field as a signed quantity.
  */
 int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
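Note: both knobs above are runtime sysctls (net.ipv4.tcp_retrans_collapse and net.ipv4.tcp_workaround_signed_windows). A minimal user-space check of the collapse setting, assuming the conventional /proc/sys path for net.ipv4.* sysctls:

#include <stdio.h>

int main(void)
{
        /* net.ipv4.tcp_retrans_collapse, declared in the hunk above */
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_retrans_collapse", "r");
        int val = -1;

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fscanf(f, "%d", &val) == 1)
                printf("tcp_retrans_collapse = %d\n", val);
        fclose(f);
        return 0;
}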
@@ -484,7 +484,7 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
         }
         if (likely(sysctl_tcp_window_scaling)) {
                 opts->ws = tp->rx_opt.rcv_wscale;
-                if(likely(opts->ws))
+                if (likely(opts->ws))
                         size += TCPOLEN_WSCALE_ALIGNED;
         }
         if (likely(sysctl_tcp_sack)) {
@@ -526,7 +526,7 @@ static unsigned tcp_synack_options(struct sock *sk,

         if (likely(ireq->wscale_ok)) {
                 opts->ws = ireq->rcv_wscale;
-                if(likely(opts->ws))
+                if (likely(opts->ws))
                         size += TCPOLEN_WSCALE_ALIGNED;
         }
         if (likely(doing_ts)) {
@@ -663,10 +663,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
         th->urg_ptr = 0;

         /* The urg_mode check is necessary during a below snd_una win probe */
-        if (unlikely(tcp_urg_mode(tp) &&
-                     between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
-                th->urg_ptr = htons(tp->snd_up - tcb->seq);
-                th->urg = 1;
+        if (unlikely(tcp_urg_mode(tp))) {
+                if (between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF)) {
+                        th->urg_ptr = htons(tp->snd_up - tcb->seq);
+                        th->urg = 1;
+                } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
+                        th->urg_ptr = 0xFFFF;
+                        th->urg = 1;
+                }
         }

         tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
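The added else-branch covers urgent data that lies beyond what a 16-bit urgent pointer can express for this segment: as long as tcb->seq + 0xFFFF is still past snd_nxt, the sender advertises the maximum offset with URG set instead of sending the segment with URG clear. A standalone user-space sketch of the offset arithmetic; between() and after() are re-implemented here for illustration, and the returned offset is host byte order, unlike the htons() in the real code:

#include <stdint.h>
#include <stdio.h>

/* Sequence-space helpers modelled on the kernel's: all arithmetic is
 * mod 2^32, so wraparound is handled by the signed-difference trick. */
static int seq_after(uint32_t seq1, uint32_t seq2)
{
        return (int32_t)(seq2 - seq1) < 0;
}

static int seq_between(uint32_t seq, uint32_t low, uint32_t high)
{
        return high - low >= seq - low;
}

/* Pick the 16-bit urgent offset for a segment starting at 'seq' when
 * urgent data ends at 'snd_up'; returns -1 when URG stays clear. */
static int urg_offset(uint32_t snd_up, uint32_t seq, uint32_t snd_nxt)
{
        if (seq_between(snd_up, seq + 1, seq + 0xFFFF))
                return snd_up - seq;    /* exact offset fits in 16 bits */
        if (seq_after(seq + 0xFFFF, snd_nxt))
                return 0xFFFF;          /* urgent data beyond snd_nxt: clamp */
        return -1;
}

int main(void)
{
        printf("%d\n", urg_offset(1000, 900, 2000));         /* 100 */
        printf("%d\n", urg_offset(0x20000, 0x100, 0x10000)); /* 65535 (clamped) */
        return 0;
}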
@@ -1168,7 +1172,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,

 static inline int tcp_minshall_check(const struct tcp_sock *tp)
 {
-        return after(tp->snd_sml,tp->snd_una) &&
+        return after(tp->snd_sml, tp->snd_una) &&
                 !after(tp->snd_sml, tp->snd_nxt);
 }

@@ -1334,7 +1338,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)

         /* Defer for less than two clock ticks. */
         if (tp->tso_deferred &&
-            ((jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
+            (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
                 goto send_now;

         in_flight = tcp_packets_in_flight(tp);
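The (u32) cast matters on 64-bit builds, where jiffies is an unsigned long but tp->tso_deferred is a u32 holding 1 | (jiffies << 1) (bit 0 flags a pending deferral, the upper bits are a 31-bit timestamp). Without the cast the left side keeps 63 bits while the stored stamp has only 31, so the computed age is almost always huge and the deferral is aborted immediately. A standalone sketch of the arithmetic (user-space, assumes a 64-bit unsigned long as on x86-64):

#include <stdint.h>
#include <stdio.h>

/* tcp_tso_should_defer() records the deferral time as
 *     tp->tso_deferred = 1 | (jiffies << 1);
 * in a 32-bit field. */
static unsigned long age_uncast(unsigned long now, uint32_t tso_deferred)
{
        /* Old expression: keeps 63 bits of 'now' but only 31 bits of the
         * stored stamp, so the difference is huge once jiffies > 2^31. */
        return ((now << 1) >> 1) - (tso_deferred >> 1);
}

static uint32_t age_cast(unsigned long now, uint32_t tso_deferred)
{
        /* Patched expression: both operands reduced to 31 bits first. */
        return (((uint32_t)now << 1) >> 1) - (tso_deferred >> 1);
}

int main(void)
{
        unsigned long then = 0x123456789AUL;           /* jiffies when we deferred */
        uint32_t deferred = (uint32_t)(1 | (then << 1));
        unsigned long now = then + 1;                  /* one tick later */

        printf("uncast age: %lu (deferral always aborted)\n",
               age_uncast(now, deferred));
        printf("cast age:   %u\n", age_cast(now, deferred));
        return 0;
}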
@@ -1519,7 +1523,8 @@ static int tcp_mtu_probe(struct sock *sk)
  * Returns 1, if no segments are in flight and we have queued segments, but
  * cannot send anything now because of SWS or another problem.
  */
-static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
+static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+                          int push_one, gfp_t gfp)
 {
         struct tcp_sock *tp = tcp_sk(sk);
         struct sk_buff *skb;
@@ -1527,20 +1532,16 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
         int cwnd_quota;
         int result;

-        /* If we are closed, the bytes will have to remain here.
-         * In time closedown will finish, we empty the write queue and all
-         * will be happy.
-         */
-        if (unlikely(sk->sk_state == TCP_CLOSE))
-                return 0;
-
         sent_pkts = 0;

-        /* Do MTU probing. */
-        if ((result = tcp_mtu_probe(sk)) == 0) {
-                return 0;
-        } else if (result > 0) {
-                sent_pkts = 1;
+        if (!push_one) {
+                /* Do MTU probing. */
+                result = tcp_mtu_probe(sk);
+                if (!result) {
+                        return 0;
+                } else if (result > 0) {
+                        sent_pkts = 1;
+                }
         }

         while ((skb = tcp_send_head(sk))) {
@@ -1562,7 +1563,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
                                                       nonagle : TCP_NAGLE_PUSH))))
                                 break;
                 } else {
-                        if (tcp_tso_should_defer(sk, skb))
+                        if (!push_one && tcp_tso_should_defer(sk, skb))
                                 break;
                 }

@@ -1577,7 +1578,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)

                 TCP_SKB_CB(skb)->when = tcp_time_stamp;

-                if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
+                if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
                         break;

                 /* Advance the send_head. This one is sent out.
@@ -1587,6 +1588,9 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)

                 tcp_minshall_update(tp, mss_now, skb);
                 sent_pkts++;
+
+                if (push_one)
+                        break;
         }

         if (likely(sent_pkts)) {
@@ -1605,10 +1609,18 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
 {
         struct sk_buff *skb = tcp_send_head(sk);

-        if (skb) {
-                if (tcp_write_xmit(sk, cur_mss, nonagle))
-                        tcp_check_probe_timer(sk);
-        }
+        if (!skb)
+                return;
+
+        /* If we are closed, the bytes will have to remain here.
+         * In time closedown will finish, we empty the write queue and
+         * all will be happy.
+         */
+        if (unlikely(sk->sk_state == TCP_CLOSE))
+                return;
+
+        if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
+                tcp_check_probe_timer(sk);
 }

 /* Send _single_ skb sitting at the send head. This function requires
@@ -1616,38 +1628,11 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
  */
 void tcp_push_one(struct sock *sk, unsigned int mss_now)
 {
-        struct tcp_sock *tp = tcp_sk(sk);
         struct sk_buff *skb = tcp_send_head(sk);
-        unsigned int tso_segs, cwnd_quota;

         BUG_ON(!skb || skb->len < mss_now);

-        tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
-        cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
-
-        if (likely(cwnd_quota)) {
-                unsigned int limit;
-
-                BUG_ON(!tso_segs);
-
-                limit = mss_now;
-                if (tso_segs > 1 && !tcp_urg_mode(tp))
-                        limit = tcp_mss_split_point(sk, skb, mss_now,
-                                                    cwnd_quota);
-
-                if (skb->len > limit &&
-                    unlikely(tso_fragment(sk, skb, limit, mss_now)))
-                        return;
-
-                /* Send it out now. */
-                TCP_SKB_CB(skb)->when = tcp_time_stamp;
-
-                if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
-                        tcp_event_new_data_sent(sk, skb);
-                        tcp_cwnd_validate(sk);
-                        return;
-                }
-        }
+        tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
 }

 /* This function returns the amount that we can raise the
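With the new push_one and gfp arguments, tcp_write_xmit() now serves both callers: __tcp_push_pending_frames() passes (0, GFP_ATOMIC) to flush whatever the congestion window and Nagle allow, while tcp_push_one() passes (1, sk->sk_allocation) to send exactly the segment at the send head, skipping MTU probing and TSO deferral. A rough user-space model of the control flow (stubbed scaffolding, not the kernel's types):

#include <stdio.h>

/* Toy model of the unified tcp_write_xmit() loop: 'push_one' stops the
 * loop after the first segment.  The queue is just a count of ready
 * segments; window, Nagle and TSO handling are elided. */
static int write_xmit(int queued, int push_one)
{
        int sent = 0;

        /* When push_one is clear, tcp_mtu_probe() runs first and
         * tcp_tso_should_defer() may postpone large segments. */

        while (queued-- > 0) {
                sent++;                 /* tcp_transmit_skb() succeeded */
                if (push_one)
                        break;          /* tcp_push_one(): one segment only */
        }
        return sent;
}

int main(void)
{
        printf("__tcp_push_pending_frames: %d segments\n", write_xmit(3, 0));
        printf("tcp_push_one:              %d segment\n", write_xmit(3, 1));
        return 0;
}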
@@ -1767,46 +1752,22 @@ u32 __tcp_select_window(struct sock *sk)
         return window;
 }

-/* Attempt to collapse two adjacent SKB's during retransmission. */
-static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
-                                     int mss_now)
+/* Collapses two adjacent SKB's during retransmission. */
+static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
         struct tcp_sock *tp = tcp_sk(sk);
         struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
         int skb_size, next_skb_size;
         u16 flags;

-        /* The first test we must make is that neither of these two
-         * SKB's are still referenced by someone else.
-         */
-        if (skb_cloned(skb) || skb_cloned(next_skb))
-                return;
-
         skb_size = skb->len;
         next_skb_size = next_skb->len;
         flags = TCP_SKB_CB(skb)->flags;

-        /* Also punt if next skb has been SACK'd. */
-        if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
-                return;
-
-        /* Next skb is out of window. */
-        if (after(TCP_SKB_CB(next_skb)->end_seq, tcp_wnd_end(tp)))
-                return;
-
-        /* Punt if not enough space exists in the first SKB for
-         * the data in the second, or the total combined payload
-         * would exceed the MSS.
-         */
-        if ((next_skb_size > skb_tailroom(skb)) ||
-            ((skb_size + next_skb_size) > mss_now))
-                return;
-
         BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);

         tcp_highest_sack_combine(sk, next_skb, skb);

-        /* Ok. We will be able to collapse the packet. */
         tcp_unlink_write_queue(next_skb, sk);

         skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
@@ -1848,54 +1809,60 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
         sk_wmem_free_skb(sk, next_skb);
 }

-/* Do a simple retransmit without using the backoff mechanisms in
- * tcp_timer. This is used for path mtu discovery.
- * The socket is already locked here.
- */
-void tcp_simple_retransmit(struct sock *sk)
+static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
+{
+        if (tcp_skb_pcount(skb) > 1)
+                return 0;
+        /* TODO: SACK collapsing could be used to remove this condition */
+        if (skb_shinfo(skb)->nr_frags != 0)
+                return 0;
+        if (skb_cloned(skb))
+                return 0;
+        if (skb == tcp_send_head(sk))
+                return 0;
+        /* Some heurestics for collapsing over SACK'd could be invented */
+        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+                return 0;
+
+        return 1;
+}
+
+static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
+                                     int space)
 {
-        const struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
-        struct sk_buff *skb;
-        unsigned int mss = tcp_current_mss(sk, 0);
-        u32 prior_lost = tp->lost_out;
+        struct sk_buff *skb = to, *tmp;
+        int first = 1;

-        tcp_for_write_queue(skb, sk) {
-                if (skb == tcp_send_head(sk))
+        if (!sysctl_tcp_retrans_collapse)
+                return;
+        if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)
+                return;
+
+        tcp_for_write_queue_from_safe(skb, tmp, sk) {
+                if (!tcp_can_collapse(sk, skb))
                         break;
-                if (skb->len > mss &&
-                    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
-                        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
-                                TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
-                                tp->retrans_out -= tcp_skb_pcount(skb);
-                        }
-                        tcp_skb_mark_lost_uncond_verify(tp, skb);
-                }
-        }

-        tcp_clear_retrans_hints_partial(tp);
+                space -= skb->len;

-        if (prior_lost == tp->lost_out)
-                return;
+                if (first) {
+                        first = 0;
+                        continue;
+                }

-        if (tcp_is_reno(tp))
-                tcp_limit_reno_sacked(tp);
+                if (space < 0)
+                        break;
+                /* Punt if not enough space exists in the first SKB for
+                 * the data in the second
+                 */
+                if (skb->len > skb_tailroom(to))
+                        break;

-        tcp_verify_left_out(tp);
+                if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
+                        break;

-        /* Don't muck with the congestion window here.
-         * Reason is that we do not increase amount of _data_
-         * in network, but units changed and effective
-         * cwnd/ssthresh really reduced now.
-         */
-        if (icsk->icsk_ca_state != TCP_CA_Loss) {
-                tp->high_seq = tp->snd_nxt;
-                tp->snd_ssthresh = tcp_current_ssthresh(sk);
-                tp->prior_ssthresh = 0;
-                tp->undo_marker = 0;
-                tcp_set_ca_state(sk, TCP_CA_Loss);
+                tcp_collapse_retrans(sk, to);
         }
-        tcp_xmit_retransmit_queue(sk);
 }

 /* This retransmits one SKB. Policy decisions and retransmit queue
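The rework lets one retransmission absorb several following segments: tcp_can_collapse() centralizes the per-skb eligibility checks (single pcount, linear data, not cloned, not the send head, not SACKed), and tcp_retrans_try_collapse() walks forward from the skb being retransmitted with a byte budget ('space', cur_mss at the call site) that each candidate is charged against. A standalone sketch of the budget walk over a toy queue (segment lengths only; eligibility and tailroom checks elided):

#include <stdio.h>

/* Toy model of the budget-driven collapse walk: starting from the
 * segment being retransmitted, keep absorbing following segments while
 * the combined payload stays within 'space' bytes. */
static int collapse_walk(const int *len, int nsegs, int space)
{
        int absorbed = 0;
        int first = 1;

        for (int i = 0; i < nsegs; i++) {
                space -= len[i];

                if (first) {            /* the skb we are retransmitting */
                        first = 0;
                        continue;
                }
                if (space < 0)          /* next segment would not fit */
                        break;

                absorbed++;             /* tcp_collapse_retrans(sk, to) */
        }
        return absorbed;
}

int main(void)
{
        int queue[] = { 300, 300, 300, 300 };   /* segment payload sizes */

        /* With a 1460-byte budget, three trailing segments fit behind
         * the first; the old code collapsed at most one. */
        printf("absorbed %d segments\n", collapse_walk(queue, 4, 1460));
        return 0;
}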
@@ -1947,17 +1914,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                         return -ENOMEM; /* We'll try again later. */
         }

-        /* Collapse two adjacent packets if worthwhile and we can. */
-        if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
-            (skb->len < (cur_mss >> 1)) &&
-            (!tcp_skb_is_last(sk, skb)) &&
-            (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
-            (skb_shinfo(skb)->nr_frags == 0 &&
-             skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
-            (tcp_skb_pcount(skb) == 1 &&
-             tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
-            (sysctl_tcp_retrans_collapse != 0))
-                tcp_retrans_try_collapse(sk, skb, cur_mss);
+        tcp_retrans_try_collapse(sk, skb, cur_mss);

         /* Some Solaris stacks overoptimize and ignore the FIN on a
          * retransmit when old data is attached. So strip it off