Diffstat (limited to 'net/ipv4/tcp_output.c')
 -rw-r--r--  net/ipv4/tcp_output.c  141
 1 file changed, 81 insertions(+), 60 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7820f3a7dd70..03d26b85eab8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -363,15 +363,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
  */
 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 {
+        struct skb_shared_info *shinfo = skb_shinfo(skb);
+
         skb->ip_summed = CHECKSUM_PARTIAL;
         skb->csum = 0;
 
         TCP_SKB_CB(skb)->tcp_flags = flags;
         TCP_SKB_CB(skb)->sacked = 0;
 
-        skb_shinfo(skb)->gso_segs = 1;
-        skb_shinfo(skb)->gso_size = 0;
-        skb_shinfo(skb)->gso_type = 0;
+        shinfo->gso_segs = 1;
+        shinfo->gso_size = 0;
+        shinfo->gso_type = 0;
 
         TCP_SKB_CB(skb)->seq = seq;
         if (flags & (TCPHDR_SYN | TCPHDR_FIN))
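This hunk (and the matching ones below) hoists the skb_shinfo(skb) lookup into a local shinfo pointer: skb_shinfo() derives the shared-info pointer from skb->head and skb->end on every call, so caching it lets the compiler load it once per function. A minimal standalone sketch of the pattern, using simplified mock structs rather than the real sk_buff (practically fine here, though a real buffer would need explicit alignment):

#include <stdio.h>

struct mock_shared_info {
        unsigned short gso_segs;
        unsigned short gso_size;
        unsigned int gso_type;
};

struct mock_skb {
        unsigned char *head;
        unsigned int end;       /* offset of the shared info from head */
};

/* Mirrors skb_shinfo(): pointer arithmetic on every call. */
static struct mock_shared_info *mock_shinfo(const struct mock_skb *skb)
{
        return (struct mock_shared_info *)(skb->head + skb->end);
}

static void init_gso(struct mock_skb *skb)
{
        struct mock_shared_info *shinfo = mock_shinfo(skb); /* load once */

        shinfo->gso_segs = 1;
        shinfo->gso_size = 0;
        shinfo->gso_type = 0;
}

int main(void)
{
        static unsigned char buf[64];
        struct mock_skb skb = { .head = buf, .end = 16 };

        init_gso(&skb);
        printf("gso_segs=%u\n", mock_shinfo(&skb)->gso_segs);
        return 0;
}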
@@ -406,7 +408,7 @@ struct tcp_out_options {
  * Beware: Something in the Internet is very sensitive to the ordering of
  * TCP options, we learned this through the hard way, so be careful here.
  * Luckily we can at least blame others for their non-compliance but from
- * inter-operatibility perspective it seems that we're somewhat stuck with
+ * inter-operability perspective it seems that we're somewhat stuck with
  * the ordering which we have been using if we want to keep working with
  * those broken things (not that it currently hurts anybody as there isn't
  * particular reason why the ordering would need to be changed).
@@ -679,7 +681,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
  *
  * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
  * needs to be reallocated in a driver.
- * The invariant being skb->truesize substracted from sk->sk_wmem_alloc
+ * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
  *
  * Since transmit from skb destructor is forbidden, we use a tasklet
  * to process all sockets that eventually need to send more skbs.
@@ -699,9 +701,9 @@ static void tcp_tsq_handler(struct sock *sk)
                 tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
 }
 /*
- * One tasklest per cpu tries to send more skbs.
+ * One tasklet per cpu tries to send more skbs.
  * We run in tasklet context but need to disable irqs when
- * transfering tsq->head because tcp_wfree() might
+ * transferring tsq->head because tcp_wfree() might
  * interrupt us (non NAPI drivers)
  */
 static void tcp_tasklet_func(unsigned long data)
@@ -795,7 +797,7 @@ void __init tcp_tasklet_init(void)
 
 /*
  * Write buffer destructor automatically called from kfree_skb.
- * We cant xmit new skbs from this context, as we might already
+ * We can't xmit new skbs from this context, as we might already
  * hold qdisc lock.
  */
 void tcp_wfree(struct sk_buff *skb)
@@ -986,6 +988,8 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
                                  unsigned int mss_now)
 {
+        struct skb_shared_info *shinfo = skb_shinfo(skb);
+
         /* Make sure we own this skb before messing gso_size/gso_segs */
         WARN_ON_ONCE(skb_cloned(skb));
 
@@ -993,13 +997,13 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
                 /* Avoid the costly divide in the normal
                  * non-TSO case.
                  */
-                skb_shinfo(skb)->gso_segs = 1;
-                skb_shinfo(skb)->gso_size = 0;
-                skb_shinfo(skb)->gso_type = 0;
+                shinfo->gso_segs = 1;
+                shinfo->gso_size = 0;
+                shinfo->gso_type = 0;
         } else {
-                skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
-                skb_shinfo(skb)->gso_size = mss_now;
-                skb_shinfo(skb)->gso_type = sk->sk_gso_type;
+                shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
+                shinfo->gso_size = mss_now;
+                shinfo->gso_type = sk->sk_gso_type;
         }
 }
 
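In the TSO branch above, gso_segs is the number of MSS-sized segments the skb covers, computed with the kernel's round-up divide. A quick illustrative check in plain C (the DIV_ROUND_UP definition matches the kernel's; the byte counts are made up):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned int len = 4000, mss = 1448;

        /* 4000 bytes at MSS 1448 -> 2 full segments + 1 partial = 3 */
        printf("gso_segs = %u\n", DIV_ROUND_UP(len, mss));
        return 0;
}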
@@ -1146,6 +1150,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
  */
 static void __pskb_trim_head(struct sk_buff *skb, int len)
 {
+        struct skb_shared_info *shinfo;
         int i, k, eat;
 
         eat = min_t(int, len, skb_headlen(skb));
@@ -1157,23 +1162,24 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
         }
         eat = len;
         k = 0;
-        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+        shinfo = skb_shinfo(skb);
+        for (i = 0; i < shinfo->nr_frags; i++) {
+                int size = skb_frag_size(&shinfo->frags[i]);
 
                 if (size <= eat) {
                         skb_frag_unref(skb, i);
                         eat -= size;
                 } else {
-                        skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+                        shinfo->frags[k] = shinfo->frags[i];
                         if (eat) {
-                                skb_shinfo(skb)->frags[k].page_offset += eat;
-                                skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+                                shinfo->frags[k].page_offset += eat;
+                                skb_frag_size_sub(&shinfo->frags[k], eat);
                                 eat = 0;
                         }
                         k++;
                 }
         }
-        skb_shinfo(skb)->nr_frags = k;
+        shinfo->nr_frags = k;
 
         skb_reset_tail_pointer(skb);
         skb->data_len -= len;
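The loop above trims eat bytes off the front of the frag array with a two-index compaction: frags wholly consumed are dropped, the first survivor is advanced by the remainder, and survivors are packed down with index k. A standalone sketch of the same algorithm on a simplified mock frag array (the kernel version additionally drops page references via skb_frag_unref):

#include <stdio.h>

struct mock_frag {
        unsigned int page_offset;
        unsigned int size;
};

static int trim_frags(struct mock_frag *frags, int nr_frags, int eat)
{
        int i, k = 0;

        for (i = 0; i < nr_frags; i++) {
                int size = frags[i].size;

                if (size <= eat) {
                        eat -= size;    /* frag fully eaten: drop it */
                } else {
                        frags[k] = frags[i];
                        if (eat) {      /* partially eaten: advance offset */
                                frags[k].page_offset += eat;
                                frags[k].size -= eat;
                                eat = 0;
                        }
                        k++;
                }
        }
        return k;       /* new nr_frags */
}

int main(void)
{
        struct mock_frag frags[] = { {0, 100}, {0, 200}, {0, 300} };
        int n = trim_frags(frags, 3, 150);

        /* eats all of frag 0 and 50 bytes of frag 1 */
        printf("nr_frags=%d first: off=%u size=%u\n",
               n, frags[0].page_offset, frags[0].size);
        return 0;
}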
@@ -1378,23 +1384,51 @@ static void tcp_cwnd_validate(struct sock *sk)
         }
 }
 
-/* Returns the portion of skb which can be sent right away without
- * introducing MSS oddities to segment boundaries. In rare cases where
- * mss_now != mss_cache, we will request caller to create a small skb
- * per input skb which could be mostly avoided here (if desired).
- *
- * We explicitly want to create a request for splitting write queue tail
- * to a small skb for Nagle purposes while avoiding unnecessary modulos,
- * thus all the complexity (cwnd_len is always MSS multiple which we
- * return whenever allowed by the other factors). Basically we need the
- * modulo only when the receiver window alone is the limiting factor or
- * when we would be allowed to send the split-due-to-Nagle skb fully.
- */
-static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
-                                        unsigned int mss_now, unsigned int max_segs)
+/* Minshall's variant of the Nagle send check. */
+static bool tcp_minshall_check(const struct tcp_sock *tp)
+{
+        return after(tp->snd_sml, tp->snd_una) &&
+                !after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Update snd_sml if this skb is under mss
+ * Note that a TSO packet might end with a sub-mss segment
+ * The test is really :
+ * if ((skb->len % mss) != 0)
+ *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+ * But we can avoid doing the divide again given we already have
+ *  skb_pcount = skb->len / mss_now
+ */
+static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+                                const struct sk_buff *skb)
+{
+        if (skb->len < tcp_skb_pcount(skb) * mss_now)
+                tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+}
+
+/* Return false, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized. (provided by caller in %partial bool)
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ *    With Minshall's modification: all sent small packets are ACKed.
+ */
+static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+                            unsigned int mss_now, int nonagle)
+{
+        return partial &&
+                ((nonagle & TCP_NAGLE_CORK) ||
+                 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
+/* Returns the portion of skb which can be sent right away */
+static unsigned int tcp_mss_split_point(const struct sock *sk,
+                                        const struct sk_buff *skb,
+                                        unsigned int mss_now,
+                                        unsigned int max_segs,
+                                        int nonagle)
 {
         const struct tcp_sock *tp = tcp_sk(sk);
-        u32 needed, window, max_len;
+        u32 partial, needed, window, max_len;
 
         window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
         max_len = mss_now * max_segs;
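The tcp_minshall_update() comment above claims that, given skb_pcount = DIV_ROUND_UP(skb->len, mss_now), the test skb->len < tcp_skb_pcount(skb) * mss_now is equivalent to skb->len % mss_now != 0, so the divide need not be repeated. A standalone brute-force check of that equivalence (illustrative plain C, not kernel code):

#include <assert.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned int mss = 1448;

        for (unsigned int len = 1; len < 100000; len++) {
                unsigned int pcount = DIV_ROUND_UP(len, mss);

                /* "ends on a sub-mss segment" <=> "shorter than pcount full segments" */
                assert((len % mss != 0) == (len < pcount * mss));
        }
        printf("equivalence holds\n");
        return 0;
}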
@@ -1407,7 +1441,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b
         if (max_len <= needed)
                 return max_len;
 
-        return needed - needed % mss_now;
+        partial = needed % mss_now;
+        /* If last segment is not a full MSS, check if Nagle rules allow us
+         * to include this last segment in this skb.
+         * Otherwise, we'll split the skb at last MSS boundary
+         */
+        if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle))
+                return needed - partial;
+
+        return needed;
 }
 
 /* Can at least one segment of SKB be sent right now, according to the
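Taken together, the refined tcp_mss_split_point() now keeps a trailing sub-MSS segment in the skb unless cork or Nagle rules say to defer it. A standalone sketch of that decision with mocked-up connection state (the TCP_NAGLE_CORK value is copied from the kernel's define; small_pkt_in_flight stands in for the tp->packets_out && tcp_minshall_check(tp) condition):

#include <stdbool.h>
#include <stdio.h>

#define TCP_NAGLE_CORK 2        /* value matches the kernel's define */

static unsigned int split_point(unsigned int needed, unsigned int mss,
                                int nonagle, bool small_pkt_in_flight)
{
        unsigned int partial = needed % mss;
        bool defer_partial = partial &&
                ((nonagle & TCP_NAGLE_CORK) ||
                 (!nonagle && small_pkt_in_flight));

        /* defer: split at the last MSS boundary; else send everything */
        return defer_partial ? needed - partial : needed;
}

int main(void)
{
        /* 3000 bytes at MSS 1448: partial tail segment of 104 bytes */
        printf("nodelay: %u\n", split_point(3000, 1448, 0, false));              /* 3000 */
        printf("corked:  %u\n", split_point(3000, 1448, TCP_NAGLE_CORK, false)); /* 2896 */
        printf("nagle, small pkt in flight: %u\n", split_point(3000, 1448, 0, true)); /* 2896 */
        return 0;
}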
@@ -1447,28 +1489,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
         return tso_segs;
 }
 
-/* Minshall's variant of the Nagle send check. */
-static inline bool tcp_minshall_check(const struct tcp_sock *tp)
-{
-        return after(tp->snd_sml, tp->snd_una) &&
-                !after(tp->snd_sml, tp->snd_nxt);
-}
-
-/* Return false, if packet can be sent now without violation Nagle's rules:
- * 1. It is full sized.
- * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
- * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- *    With Minshall's modification: all sent small packets are ACKed.
- */
-static inline bool tcp_nagle_check(const struct tcp_sock *tp,
-                                   const struct sk_buff *skb,
-                                   unsigned int mss_now, int nonagle)
-{
-        return skb->len < mss_now &&
-               ((nonagle & TCP_NAGLE_CORK) ||
-                (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
-}
 
 /* Return true if the Nagle test allows this packet to be
  * sent now.
@@ -1489,7 +1509,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
         if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
                 return true;
 
-        if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
+        if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle))
                 return true;
 
         return false;
@@ -1892,7 +1912,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                 limit = tcp_mss_split_point(sk, skb, mss_now,
                                             min_t(unsigned int,
                                                   cwnd_quota,
-                                                  sk->sk_gso_max_segs));
+                                                  sk->sk_gso_max_segs),
+                                            nonagle);
 
                 if (skb->len > limit &&
                     unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
@@ -2756,7 +2777,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 EXPORT_SYMBOL(tcp_make_synack);
 
 /* Do all connect socket setups that can be done AF independent. */
-void tcp_connect_init(struct sock *sk)
+static void tcp_connect_init(struct sock *sk)
 {
         const struct dst_entry *dst = __sk_dst_get(sk);
         struct tcp_sock *tp = tcp_sk(sk);