Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--  net/ipv4/tcp_output.c  189
1 file changed, 120 insertions(+), 69 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7820f3a7dd70..17a11e65e57f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -363,15 +363,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
  */
 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 {
+        struct skb_shared_info *shinfo = skb_shinfo(skb);
+
         skb->ip_summed = CHECKSUM_PARTIAL;
         skb->csum = 0;
 
         TCP_SKB_CB(skb)->tcp_flags = flags;
         TCP_SKB_CB(skb)->sacked = 0;
 
-        skb_shinfo(skb)->gso_segs = 1;
-        skb_shinfo(skb)->gso_size = 0;
-        skb_shinfo(skb)->gso_type = 0;
+        shinfo->gso_segs = 1;
+        shinfo->gso_size = 0;
+        shinfo->gso_type = 0;
 
         TCP_SKB_CB(skb)->seq = seq;
         if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -406,7 +408,7 @@ struct tcp_out_options {
  * Beware: Something in the Internet is very sensitive to the ordering of
  * TCP options, we learned this through the hard way, so be careful here.
  * Luckily we can at least blame others for their non-compliance but from
- * inter-operatibility perspective it seems that we're somewhat stuck with
+ * inter-operability perspective it seems that we're somewhat stuck with
  * the ordering which we have been using if we want to keep working with
  * those broken things (not that it currently hurts anybody as there isn't
  * particular reason why the ordering would need to be changed).
@@ -679,7 +681,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
  *
  * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
  * needs to be reallocated in a driver.
- * The invariant being skb->truesize substracted from sk->sk_wmem_alloc
+ * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
  *
  * Since transmit from skb destructor is forbidden, we use a tasklet
  * to process all sockets that eventually need to send more skbs.
@@ -696,12 +698,13 @@ static void tcp_tsq_handler(struct sock *sk)
         if ((1 << sk->sk_state) &
             (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
              TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
-                tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
+                tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
+                               0, GFP_ATOMIC);
 }
 /*
- * One tasklest per cpu tries to send more skbs.
+ * One tasklet per cpu tries to send more skbs.
  * We run in tasklet context but need to disable irqs when
- * transfering tsq->head because tcp_wfree() might
+ * transferring tsq->head because tcp_wfree() might
  * interrupt us (non NAPI drivers)
  */
 static void tcp_tasklet_func(unsigned long data)
@@ -764,6 +767,17 @@ void tcp_release_cb(struct sock *sk)
         if (flags & (1UL << TCP_TSQ_DEFERRED))
                 tcp_tsq_handler(sk);
 
+        /* Here begins the tricky part :
+         * We are called from release_sock() with :
+         * 1) BH disabled
+         * 2) sk_lock.slock spinlock held
+         * 3) socket owned by us (sk->sk_lock.owned == 1)
+         *
+         * But following code is meant to be called from BH handlers,
+         * so we should keep BH disabled, but early release socket ownership
+         */
+        sock_release_ownership(sk);
+
         if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
                 tcp_write_timer_handler(sk);
                 __sock_put(sk);
@@ -795,7 +809,7 @@ void __init tcp_tasklet_init(void)
 
 /*
  * Write buffer destructor automatically called from kfree_skb.
- * We cant xmit new skbs from this context, as we might already
+ * We can't xmit new skbs from this context, as we might already
  * hold qdisc lock.
  */
 void tcp_wfree(struct sk_buff *skb)
@@ -861,8 +875,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
         if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
                      fclone->fclone == SKB_FCLONE_CLONE))
-                NET_INC_STATS_BH(sock_net(sk),
-                                 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+                NET_INC_STATS(sock_net(sk),
+                              LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
 
         if (unlikely(skb_cloned(skb)))
                 skb = pskb_copy(skb, gfp_mask);
@@ -986,6 +1000,8 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
                                  unsigned int mss_now)
 {
+        struct skb_shared_info *shinfo = skb_shinfo(skb);
+
         /* Make sure we own this skb before messing gso_size/gso_segs */
         WARN_ON_ONCE(skb_cloned(skb));
 
@@ -993,13 +1009,13 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
                 /* Avoid the costly divide in the normal
                  * non-TSO case.
                  */
-                skb_shinfo(skb)->gso_segs = 1;
-                skb_shinfo(skb)->gso_size = 0;
-                skb_shinfo(skb)->gso_type = 0;
+                shinfo->gso_segs = 1;
+                shinfo->gso_size = 0;
+                shinfo->gso_type = 0;
         } else {
-                skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
-                skb_shinfo(skb)->gso_size = mss_now;
-                skb_shinfo(skb)->gso_type = sk->sk_gso_type;
+                shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
+                shinfo->gso_size = mss_now;
+                shinfo->gso_type = sk->sk_gso_type;
         }
 }
 
@@ -1146,6 +1162,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
  */
 static void __pskb_trim_head(struct sk_buff *skb, int len)
 {
+        struct skb_shared_info *shinfo;
         int i, k, eat;
 
         eat = min_t(int, len, skb_headlen(skb));
@@ -1157,23 +1174,24 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
         }
         eat = len;
         k = 0;
-        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+        shinfo = skb_shinfo(skb);
+        for (i = 0; i < shinfo->nr_frags; i++) {
+                int size = skb_frag_size(&shinfo->frags[i]);
 
                 if (size <= eat) {
                         skb_frag_unref(skb, i);
                         eat -= size;
                 } else {
-                        skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+                        shinfo->frags[k] = shinfo->frags[i];
                         if (eat) {
-                                skb_shinfo(skb)->frags[k].page_offset += eat;
-                                skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+                                shinfo->frags[k].page_offset += eat;
+                                skb_frag_size_sub(&shinfo->frags[k], eat);
                                 eat = 0;
                         }
                         k++;
                 }
         }
-        skb_shinfo(skb)->nr_frags = k;
+        shinfo->nr_frags = k;
 
         skb_reset_tail_pointer(skb);
         skb->data_len -= len;
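The loop in __pskb_trim_head() above eats a number of bytes from the front of the paged data: fully consumed fragments are dropped, the first surviving fragment is advanced in place, and the frag array is compacted. The same compaction on a plain array of lengths, as a userspace sketch (trim_frags() is a local stand-in; page offsets and refcounting are omitted):

#include <stdio.h>

/* Trim 'eat' bytes from the front of an array of fragment lengths,
 * compacting surviving fragments to the start, the same shape as the
 * loop in __pskb_trim_head() shown above.
 */
static int trim_frags(unsigned int *size, int nr_frags, unsigned int eat)
{
        int i, k = 0;

        for (i = 0; i < nr_frags; i++) {
                if (size[i] <= eat) {
                        eat -= size[i];          /* fragment fully consumed */
                } else {
                        size[k] = size[i] - eat; /* first survivor shrinks, later ones copy as-is */
                        eat = 0;
                        k++;
                }
        }
        return k;                                /* new nr_frags */
}

int main(void)
{
        unsigned int frags[] = { 500, 1000, 2000, 300 };
        int n = trim_frags(frags, 4, 1700);

        /* 500 and 1000 are gone, 2000 shrinks to 1800, 300 survives: 2 frags left */
        printf("nr_frags=%d first=%u second=%u\n", n, frags[0], frags[1]);
        return 0;
}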
@@ -1378,23 +1396,51 @@ static void tcp_cwnd_validate(struct sock *sk)
         }
 }
 
-/* Returns the portion of skb which can be sent right away without
- * introducing MSS oddities to segment boundaries. In rare cases where
- * mss_now != mss_cache, we will request caller to create a small skb
- * per input skb which could be mostly avoided here (if desired).
- *
- * We explicitly want to create a request for splitting write queue tail
- * to a small skb for Nagle purposes while avoiding unnecessary modulos,
- * thus all the complexity (cwnd_len is always MSS multiple which we
- * return whenever allowed by the other factors). Basically we need the
- * modulo only when the receiver window alone is the limiting factor or
- * when we would be allowed to send the split-due-to-Nagle skb fully.
+/* Minshall's variant of the Nagle send check. */
+static bool tcp_minshall_check(const struct tcp_sock *tp)
+{
+        return after(tp->snd_sml, tp->snd_una) &&
+                !after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Update snd_sml if this skb is under mss
+ * Note that a TSO packet might end with a sub-mss segment
+ * The test is really :
+ *      if ((skb->len % mss) != 0)
+ *              tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+ * But we can avoid doing the divide again given we already have
+ *  skb_pcount = skb->len / mss_now
+ */
+static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+                                const struct sk_buff *skb)
+{
+        if (skb->len < tcp_skb_pcount(skb) * mss_now)
+                tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+}
+
+/* Return false, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized. (provided by caller in %partial bool)
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ * With Minshall's modification: all sent small packets are ACKed.
  */
-static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
-                                        unsigned int mss_now, unsigned int max_segs)
+static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+                            unsigned int mss_now, int nonagle)
+{
+        return partial &&
+                ((nonagle & TCP_NAGLE_CORK) ||
+                 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
+/* Returns the portion of skb which can be sent right away */
+static unsigned int tcp_mss_split_point(const struct sock *sk,
+                                        const struct sk_buff *skb,
+                                        unsigned int mss_now,
+                                        unsigned int max_segs,
+                                        int nonagle)
 {
         const struct tcp_sock *tp = tcp_sk(sk);
-        u32 needed, window, max_len;
+        u32 partial, needed, window, max_len;
 
         window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
         max_len = mss_now * max_segs;
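The tcp_minshall_update() comment above leans on gso_segs already holding skb->len / mss_now rounded up, so the cheap test skb->len < tcp_skb_pcount(skb) * mss_now matches (skb->len % mss_now) != 0 without a second divide. A standalone check of that equivalence in plain userspace C (DIV_ROUND_UP and pcount() are local copies, the MSS value is illustrative only):

#include <assert.h>
#include <stdio.h>

/* Local stand-in for the kernel macro. */
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

/* pcount mirrors what tcp_set_skb_tso_segs() stores in gso_segs. */
static unsigned int pcount(unsigned int len, unsigned int mss)
{
        return DIV_ROUND_UP(len, mss);
}

int main(void)
{
        unsigned int mss = 1448;        /* typical Ethernet MSS, illustrative */
        unsigned int len;

        for (len = 1; len < 10 * mss; len++) {
                int cheap  = len < pcount(len, mss) * mss;      /* no extra divide */
                int modulo = (len % mss) != 0;                  /* the "real" test */

                assert(cheap == modulo);
        }
        printf("len %% mss != 0  <=>  len < pcount(len) * mss, for all tested lengths\n");
        return 0;
}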
@@ -1407,7 +1453,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b
         if (max_len <= needed)
                 return max_len;
 
-        return needed - needed % mss_now;
+        partial = needed % mss_now;
+        /* If last segment is not a full MSS, check if Nagle rules allow us
+         * to include this last segment in this skb.
+         * Otherwise, we'll split the skb at last MSS boundary
+         */
+        if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle))
+                return needed - partial;
+
+        return needed;
 }
 
 /* Can at least one segment of SKB be sent right now, according to the
@@ -1447,28 +1501,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
         return tso_segs;
 }
 
-/* Minshall's variant of the Nagle send check. */
-static inline bool tcp_minshall_check(const struct tcp_sock *tp)
-{
-        return after(tp->snd_sml, tp->snd_una) &&
-                !after(tp->snd_sml, tp->snd_nxt);
-}
-
-/* Return false, if packet can be sent now without violation Nagle's rules:
- * 1. It is full sized.
- * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
- * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- * With Minshall's modification: all sent small packets are ACKed.
- */
-static inline bool tcp_nagle_check(const struct tcp_sock *tp,
-                                   const struct sk_buff *skb,
-                                   unsigned int mss_now, int nonagle)
-{
-        return skb->len < mss_now &&
-                ((nonagle & TCP_NAGLE_CORK) ||
-                 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
-}
 
 /* Return true if the Nagle test allows this packet to be
  * sent now.
@@ -1489,7 +1521,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
         if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
                 return true;
 
-        if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
+        if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle))
                 return true;
 
         return false;
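After this refactor, tcp_nagle_check() only receives the result of the sub-MSS test as a bool, and tcp_mss_split_point() trims back to an MSS boundary only when that check says the tail must be held back. A simplified userspace sketch of that decision (nagle_defers_tail() and split_point() are local stand-ins, not the kernel helpers; receive-window and cwnd clamping are left out):

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for tcp_nagle_check(): defer the sub-MSS tail only
 * when it exists and either TCP_CORK is set or small packets are still
 * unacknowledged (Minshall's rule).
 */
static bool nagle_defers_tail(bool partial, bool corked, bool small_pkt_in_flight)
{
        return partial && (corked || small_pkt_in_flight);
}

/* Mirror of the new split-point logic: how many bytes may go out now. */
static unsigned int split_point(unsigned int needed, unsigned int mss,
                                bool corked, bool small_pkt_in_flight)
{
        unsigned int partial = needed % mss;

        if (nagle_defers_tail(partial != 0, corked, small_pkt_in_flight))
                return needed - partial;        /* stop at the last MSS boundary */
        return needed;                          /* send the sub-MSS tail too */
}

int main(void)
{
        unsigned int mss = 1448;        /* illustrative MSS */

        /* 3 full segments plus a 100-byte tail */
        printf("%u\n", split_point(3 * mss + 100, mss, false, false)); /* 4444: tail sent */
        printf("%u\n", split_point(3 * mss + 100, mss, false, true));  /* 4344: tail deferred */
        return 0;
}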
@@ -1884,7 +1916,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
                 if (atomic_read(&sk->sk_wmem_alloc) > limit) {
                         set_bit(TSQ_THROTTLED, &tp->tsq_flags);
-                        break;
+                        /* It is possible TX completion already happened
+                         * before we set TSQ_THROTTLED, so we must
+                         * test again the condition.
+                         * We abuse smp_mb__after_clear_bit() because
+                         * there is no smp_mb__after_set_bit() yet
+                         */
+                        smp_mb__after_clear_bit();
+                        if (atomic_read(&sk->sk_wmem_alloc) > limit)
+                                break;
                 }
 
                 limit = mss_now;
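The re-check added above closes a lost-wakeup window: a TX completion can drop sk_wmem_alloc and test TSQ_THROTTLED before the freshly set bit is visible, leaving nobody to restart the sender. The same set-flag-then-recheck shape in a userspace sketch with C11 atomics (should_stop_sending() and tx_completion() are illustrative stand-ins, not the kernel's set_bit()/smp_mb__after_clear_bit() machinery):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the state touched by the hunk above (illustrative only). */
static atomic_uint wmem_alloc = 4096;   /* bytes still queued below the socket */
static atomic_bool throttled;           /* TSQ_THROTTLED stand-in */
static const unsigned int limit = 1024;

/* Producer side, mirroring tcp_write_xmit(): set the flag, then re-check. */
static bool should_stop_sending(void)
{
        if (atomic_load(&wmem_alloc) <= limit)
                return false;

        atomic_store(&throttled, true);
        /* Full barrier: the flag must be visible before wmem_alloc is re-read. */
        atomic_thread_fence(memory_order_seq_cst);

        /* A completion may have run between the first test and the store;
         * without this second look we would stop with nobody left to restart us.
         */
        return atomic_load(&wmem_alloc) > limit;
}

/* Completion side, mirroring tcp_wfree(): drop bytes, then check the flag. */
static void tx_completion(unsigned int freed)
{
        atomic_fetch_sub(&wmem_alloc, freed);
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_exchange(&throttled, false))
                printf("completion: restart the sender\n");
}

int main(void)
{
        printf("stop? %d\n", should_stop_sending());    /* 1: over limit, throttled */
        tx_completion(4096);                            /* clears flag, reschedules */
        printf("stop? %d\n", should_stop_sending());    /* 0: below limit */
        return 0;
}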
@@ -1892,7 +1932,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                         limit = tcp_mss_split_point(sk, skb, mss_now,
                                                     min_t(unsigned int,
                                                           cwnd_quota,
-                                                          sk->sk_gso_max_segs));
+                                                          sk->sk_gso_max_segs),
+                                                     nonagle);
 
                 if (skb->len > limit &&
                     unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
@@ -1956,7 +1997,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
         /* Schedule a loss probe in 2*RTT for SACK capable connections
          * in Open state, that are either limited by cwnd or application.
          */
-        if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
+        if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out ||
             !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
                 return false;
 
@@ -2307,6 +2348,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
         struct tcp_sock *tp = tcp_sk(sk);
         struct inet_connection_sock *icsk = inet_csk(sk);
         unsigned int cur_mss;
+        int err;
 
         /* Inconslusive MTU probe */
         if (icsk->icsk_mtup.probe_size) {
@@ -2370,11 +2412,15 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                      skb_headroom(skb) >= 0xFFFF)) {
                 struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
                                                    GFP_ATOMIC);
-                return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-                              -ENOBUFS;
+                err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+                             -ENOBUFS;
         } else {
-                return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+                err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
         }
+
+        if (likely(!err))
+                TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+        return err;
 }
 
 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
@@ -2756,7 +2802,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 EXPORT_SYMBOL(tcp_make_synack);
 
 /* Do all connect socket setups that can be done AF independent. */
-void tcp_connect_init(struct sock *sk)
+static void tcp_connect_init(struct sock *sk)
 {
         const struct dst_entry *dst = __sk_dst_get(sk);
         struct tcp_sock *tp = tcp_sk(sk);
@@ -2878,7 +2924,12 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
         space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
                 MAX_TCP_OPTION_SPACE;
 
-        syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
+        space = min_t(size_t, space, fo->size);
+
+        /* limit to order-0 allocations */
+        space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
+
+        syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space,
                                    sk->sk_allocation);
         if (syn_data == NULL)
                 goto fallback;