aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIlpo Järvinen <ilpo.jarvinen@helsinki.fi>2008-11-25 00:03:43 -0500
committerDavid S. Miller <davem@davemloft.net>2008-11-25 00:03:43 -0500
commit4a17fc3add594fcc1c778e93a95b6ecf47f630e5 (patch)
treeb82a594e347812c130765bcc85194a7ef96766df
parenta21bba945430f3f5e00c349665f88cdacdb32a8d (diff)
tcp: collapse more than two on retransmission
I always had thought that collapsing up to two at a time was intentional decision to avoid excessive processing if 1 byte sized skbs are to be combined for a full mtu, and consecutive retransmissions would make the size of the retransmittee double each round anyway, but some recent discussion made me to understand that was not the case. Thus make collapse work more and wait less. It would be possible to take advantage of the shifting machinery (added in the later patch) in the case of paged data but that can be implemented on top of this change. tcp_skb_is_last check is now provided by the loop. I tested a bit (ss-after-idle-off, fill 4096x4096B xfer, 10s sleep + 4096 x 1byte writes while dropping them for some a while with netem): . 16774097:16775545(1448) ack 1 win 46 . 16775545:16776993(1448) ack 1 win 46 . ack 16759617 win 2399 P 16776993:16777217(224) ack 1 win 46 . ack 16762513 win 2399 . ack 16765409 win 2399 . ack 16768305 win 2399 . ack 16771201 win 2399 . ack 16774097 win 2399 . ack 16776993 win 2399 . ack 16777217 win 2399 P 16777217:16777257(40) ack 1 win 46 . ack 16777257 win 2399 P 16777257:16778705(1448) ack 1 win 46 P 16778705:16780153(1448) ack 1 win 46 FP 16780153:16781313(1160) ack 1 win 46 . ack 16778705 win 2399 . ack 16780153 win 2399 F 1:1(0) ack 16781314 win 2399 While without drop-all period I get this: . 16773585:16775033(1448) ack 1 win 46 . ack 16764897 win 9367 . ack 16767793 win 9367 . ack 16770689 win 9367 . ack 16773585 win 9367 . 16775033:16776481(1448) ack 1 win 46 P 16776481:16777217(736) ack 1 win 46 . ack 16776481 win 9367 . ack 16777217 win 9367 P 16777217:16777218(1) ack 1 win 46 P 16777218:16777219(1) ack 1 win 46 P 16777219:16777220(1) ack 1 win 46 ... P 16777247:16777248(1) ack 1 win 46 . ack 16777218 win 9367 . ack 16777219 win 9367 ... . ack 16777233 win 9367 . ack 16777248 win 9367 P 16777248:16778696(1448) ack 1 win 46 P 16778696:16780144(1448) ack 1 win 46 FP 16780144:16781313(1169) ack 1 win 46 . ack 16780144 win 9367 F 1:1(0) ack 16781314 win 9367 The window seems to be 30-40 segments, which were successfully combined into: P 16777217:16777257(40) ack 1 win 46 Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/ipv4/tcp_output.c96
1 files changed, 59 insertions, 37 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a524627923ae..86ef98975e94 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1766,46 +1766,22 @@ u32 __tcp_select_window(struct sock *sk)
1766 return window; 1766 return window;
1767} 1767}
1768 1768
1769/* Attempt to collapse two adjacent SKB's during retransmission. */ 1769/* Collapses two adjacent SKB's during retransmission. */
1770static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, 1770static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
1771 int mss_now)
1772{ 1771{
1773 struct tcp_sock *tp = tcp_sk(sk); 1772 struct tcp_sock *tp = tcp_sk(sk);
1774 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); 1773 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
1775 int skb_size, next_skb_size; 1774 int skb_size, next_skb_size;
1776 u16 flags; 1775 u16 flags;
1777 1776
1778 /* The first test we must make is that neither of these two
1779 * SKB's are still referenced by someone else.
1780 */
1781 if (skb_cloned(skb) || skb_cloned(next_skb))
1782 return;
1783
1784 skb_size = skb->len; 1777 skb_size = skb->len;
1785 next_skb_size = next_skb->len; 1778 next_skb_size = next_skb->len;
1786 flags = TCP_SKB_CB(skb)->flags; 1779 flags = TCP_SKB_CB(skb)->flags;
1787 1780
1788 /* Also punt if next skb has been SACK'd. */
1789 if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
1790 return;
1791
1792 /* Next skb is out of window. */
1793 if (after(TCP_SKB_CB(next_skb)->end_seq, tcp_wnd_end(tp)))
1794 return;
1795
1796 /* Punt if not enough space exists in the first SKB for
1797 * the data in the second, or the total combined payload
1798 * would exceed the MSS.
1799 */
1800 if ((next_skb_size > skb_tailroom(skb)) ||
1801 ((skb_size + next_skb_size) > mss_now))
1802 return;
1803
1804 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); 1781 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
1805 1782
1806 tcp_highest_sack_combine(sk, next_skb, skb); 1783 tcp_highest_sack_combine(sk, next_skb, skb);
1807 1784
1808 /* Ok. We will be able to collapse the packet. */
1809 tcp_unlink_write_queue(next_skb, sk); 1785 tcp_unlink_write_queue(next_skb, sk);
1810 1786
1811 skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), 1787 skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
@@ -1847,6 +1823,62 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
1847 sk_wmem_free_skb(sk, next_skb); 1823 sk_wmem_free_skb(sk, next_skb);
1848} 1824}
1849 1825
1826static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
1827{
1828 if (tcp_skb_pcount(skb) > 1)
1829 return 0;
1830 /* TODO: SACK collapsing could be used to remove this condition */
1831 if (skb_shinfo(skb)->nr_frags != 0)
1832 return 0;
1833 if (skb_cloned(skb))
1834 return 0;
1835 if (skb == tcp_send_head(sk))
1836 return 0;
1837 /* Some heurestics for collapsing over SACK'd could be invented */
1838 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1839 return 0;
1840
1841 return 1;
1842}
1843
1844static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
1845 int space)
1846{
1847 struct tcp_sock *tp = tcp_sk(sk);
1848 struct sk_buff *skb = to, *tmp;
1849 int first = 1;
1850
1851 if (!sysctl_tcp_retrans_collapse)
1852 return;
1853 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)
1854 return;
1855
1856 tcp_for_write_queue_from_safe(skb, tmp, sk) {
1857 if (!tcp_can_collapse(sk, skb))
1858 break;
1859
1860 space -= skb->len;
1861
1862 if (first) {
1863 first = 0;
1864 continue;
1865 }
1866
1867 if (space < 0)
1868 break;
1869 /* Punt if not enough space exists in the first SKB for
1870 * the data in the second
1871 */
1872 if (skb->len > skb_tailroom(to))
1873 break;
1874
1875 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
1876 break;
1877
1878 tcp_collapse_retrans(sk, to);
1879 }
1880}
1881
1850/* Do a simple retransmit without using the backoff mechanisms in 1882/* Do a simple retransmit without using the backoff mechanisms in
1851 * tcp_timer. This is used for path mtu discovery. 1883 * tcp_timer. This is used for path mtu discovery.
1852 * The socket is already locked here. 1884 * The socket is already locked here.
@@ -1946,17 +1978,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1946 return -ENOMEM; /* We'll try again later. */ 1978 return -ENOMEM; /* We'll try again later. */
1947 } 1979 }
1948 1980
1949 /* Collapse two adjacent packets if worthwhile and we can. */ 1981 tcp_retrans_try_collapse(sk, skb, cur_mss);
1950 if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
1951 (skb->len < (cur_mss >> 1)) &&
1952 (!tcp_skb_is_last(sk, skb)) &&
1953 (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
1954 (skb_shinfo(skb)->nr_frags == 0 &&
1955 skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
1956 (tcp_skb_pcount(skb) == 1 &&
1957 tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
1958 (sysctl_tcp_retrans_collapse != 0))
1959 tcp_retrans_try_collapse(sk, skb, cur_mss);
1960 1982
1961 /* Some Solaris stacks overoptimize and ignore the FIN on a 1983 /* Some Solaris stacks overoptimize and ignore the FIN on a
1962 * retransmit when old data is attached. So strip it off 1984 * retransmit when old data is attached. So strip it off