Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r-- | net/ipv4/tcp_output.c | 141
1 file changed, 81 insertions, 60 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7820f3a7dd70..03d26b85eab8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -363,15 +363,17 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
  */
 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 {
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
 	skb->ip_summed = CHECKSUM_PARTIAL;
 	skb->csum = 0;
 
 	TCP_SKB_CB(skb)->tcp_flags = flags;
 	TCP_SKB_CB(skb)->sacked = 0;
 
-	skb_shinfo(skb)->gso_segs = 1;
-	skb_shinfo(skb)->gso_size = 0;
-	skb_shinfo(skb)->gso_type = 0;
+	shinfo->gso_segs = 1;
+	shinfo->gso_size = 0;
+	shinfo->gso_type = 0;
 
 	TCP_SKB_CB(skb)->seq = seq;
 	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -406,7 +408,7 @@ struct tcp_out_options {
  * Beware: Something in the Internet is very sensitive to the ordering of
  * TCP options, we learned this through the hard way, so be careful here.
  * Luckily we can at least blame others for their non-compliance but from
- * inter-operatibility perspective it seems that we're somewhat stuck with
+ * inter-operability perspective it seems that we're somewhat stuck with
  * the ordering which we have been using if we want to keep working with
  * those broken things (not that it currently hurts anybody as there isn't
  * particular reason why the ordering would need to be changed).
@@ -679,7 +681,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
  *
  * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
  * needs to be reallocated in a driver.
- * The invariant being skb->truesize substracted from sk->sk_wmem_alloc
+ * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
  *
  * Since transmit from skb destructor is forbidden, we use a tasklet
  * to process all sockets that eventually need to send more skbs.
@@ -699,9 +701,9 @@ static void tcp_tsq_handler(struct sock *sk)
 		tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
 }
 /*
- * One tasklest per cpu tries to send more skbs.
+ * One tasklet per cpu tries to send more skbs.
  * We run in tasklet context but need to disable irqs when
- * transfering tsq->head because tcp_wfree() might
+ * transferring tsq->head because tcp_wfree() might
  * interrupt us (non NAPI drivers)
  */
 static void tcp_tasklet_func(unsigned long data)
@@ -795,7 +797,7 @@ void __init tcp_tasklet_init(void)
 
 /*
  * Write buffer destructor automatically called from kfree_skb.
- * We cant xmit new skbs from this context, as we might already
+ * We can't xmit new skbs from this context, as we might already
  * hold qdisc lock.
  */
 void tcp_wfree(struct sk_buff *skb)
@@ -986,6 +988,8 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
 				 unsigned int mss_now)
 {
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
 	/* Make sure we own this skb before messing gso_size/gso_segs */
 	WARN_ON_ONCE(skb_cloned(skb));
 
@@ -993,13 +997,13 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
 		 */
-		skb_shinfo(skb)->gso_segs = 1;
-		skb_shinfo(skb)->gso_size = 0;
-		skb_shinfo(skb)->gso_type = 0;
+		shinfo->gso_segs = 1;
+		shinfo->gso_size = 0;
+		shinfo->gso_type = 0;
 	} else {
-		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
-		skb_shinfo(skb)->gso_size = mss_now;
-		skb_shinfo(skb)->gso_type = sk->sk_gso_type;
+		shinfo->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
+		shinfo->gso_size = mss_now;
+		shinfo->gso_type = sk->sk_gso_type;
 	}
 }
 
@@ -1146,6 +1150,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
  */
 static void __pskb_trim_head(struct sk_buff *skb, int len)
 {
+	struct skb_shared_info *shinfo;
 	int i, k, eat;
 
 	eat = min_t(int, len, skb_headlen(skb));
@@ -1157,23 +1162,24 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
 	}
 	eat = len;
 	k = 0;
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+	shinfo = skb_shinfo(skb);
+	for (i = 0; i < shinfo->nr_frags; i++) {
+		int size = skb_frag_size(&shinfo->frags[i]);
 
 		if (size <= eat) {
 			skb_frag_unref(skb, i);
 			eat -= size;
 		} else {
-			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+			shinfo->frags[k] = shinfo->frags[i];
 			if (eat) {
-				skb_shinfo(skb)->frags[k].page_offset += eat;
-				skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+				shinfo->frags[k].page_offset += eat;
+				skb_frag_size_sub(&shinfo->frags[k], eat);
 				eat = 0;
 			}
 			k++;
 		}
 	}
-	skb_shinfo(skb)->nr_frags = k;
+	shinfo->nr_frags = k;
 
 	skb_reset_tail_pointer(skb);
 	skb->data_len -= len;
@@ -1378,23 +1384,51 @@ static void tcp_cwnd_validate(struct sock *sk)
 	}
 }
 
-/* Returns the portion of skb which can be sent right away without
- * introducing MSS oddities to segment boundaries. In rare cases where
- * mss_now != mss_cache, we will request caller to create a small skb
- * per input skb which could be mostly avoided here (if desired).
- *
- * We explicitly want to create a request for splitting write queue tail
- * to a small skb for Nagle purposes while avoiding unnecessary modulos,
- * thus all the complexity (cwnd_len is always MSS multiple which we
- * return whenever allowed by the other factors). Basically we need the
- * modulo only when the receiver window alone is the limiting factor or
- * when we would be allowed to send the split-due-to-Nagle skb fully.
+/* Minshall's variant of the Nagle send check. */
+static bool tcp_minshall_check(const struct tcp_sock *tp)
+{
+	return after(tp->snd_sml, tp->snd_una) &&
+		!after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Update snd_sml if this skb is under mss
+ * Note that a TSO packet might end with a sub-mss segment
+ * The test is really :
+ * if ((skb->len % mss) != 0)
+ *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+ * But we can avoid doing the divide again given we already have
+ *  skb_pcount = skb->len / mss_now
  */
-static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
-					unsigned int mss_now, unsigned int max_segs)
+static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+				const struct sk_buff *skb)
+{
+	if (skb->len < tcp_skb_pcount(skb) * mss_now)
+		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+}
+
+/* Return false, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized. (provided by caller in %partial bool)
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ *    With Minshall's modification: all sent small packets are ACKed.
+ */
+static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+			    unsigned int mss_now, int nonagle)
+{
+	return partial &&
+		((nonagle & TCP_NAGLE_CORK) ||
+		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
+/* Returns the portion of skb which can be sent right away */
+static unsigned int tcp_mss_split_point(const struct sock *sk,
+					const struct sk_buff *skb,
+					unsigned int mss_now,
+					unsigned int max_segs,
+					int nonagle)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
-	u32 needed, window, max_len;
+	u32 partial, needed, window, max_len;
 
 	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
 	max_len = mss_now * max_segs;
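An aside on the divide avoidance described in the tcp_minshall_update() comment above: because gso_segs for a TSO skb is DIV_ROUND_UP(skb->len, mss_now), the cheap comparison skb->len < tcp_skb_pcount(skb) * mss_now is true exactly when skb->len % mss_now != 0. A standalone userspace sketch, not part of the patch, with mss and length values invented for illustration:

#include <assert.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int mss = 1448, len = 10000;
	unsigned int pcount = DIV_ROUND_UP(len, mss);	/* 7, like gso_segs */

	/* 10000 < 7 * 1448 = 10136, and 10000 % 1448 = 1312 != 0 */
	assert((len < pcount * mss) == (len % mss != 0));
	return 0;
}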
@@ -1407,7 +1441,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b
 	if (max_len <= needed)
 		return max_len;
 
-	return needed - needed % mss_now;
+	partial = needed % mss_now;
+	/* If last segment is not a full MSS, check if Nagle rules allow us
+	 * to include this last segment in this skb.
+	 * Otherwise, we'll split the skb at last MSS boundary
+	 */
+	if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle))
+		return needed - partial;
+
+	return needed;
 }
 
 /* Can at least one segment of SKB be sent right now, according to the
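To see the effect of the hunk above in isolation: once the window has produced `needed`, the only new decision is whether the sub-MSS tail may ride along in the same skb. The following is a hedged, standalone userspace sketch of just that arithmetic; the function and parameter names are ours, not the kernel's, the constants are invented, and the real code is additionally capped by cwnd_quota and sk->sk_gso_max_segs in the tcp_write_xmit() caller shown further down:

#include <stdbool.h>
#include <stdio.h>

/* Illustration of the split decision: keep the partial tail in this skb
 * unless the Nagle/Minshall rules say it has to wait.
 */
static unsigned int split_point(unsigned int needed, unsigned int mss,
				bool nagle_forbids_partial)
{
	unsigned int partial = needed % mss;

	if (partial && nagle_forbids_partial)
		return needed - partial;	/* stop at the last full MSS */
	return needed;				/* send the sub-MSS tail too */
}

int main(void)
{
	/* 10000 bytes sendable, mss 1448: 6 full segments plus 1312 bytes */
	printf("%u\n", split_point(10000, 1448, false));	/* 10000 */
	printf("%u\n", split_point(10000, 1448, true));		/* 8688 */
	return 0;
}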
@@ -1447,28 +1489,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
 	return tso_segs;
 }
 
-/* Minshall's variant of the Nagle send check. */
-static inline bool tcp_minshall_check(const struct tcp_sock *tp)
-{
-	return after(tp->snd_sml, tp->snd_una) &&
-		!after(tp->snd_sml, tp->snd_nxt);
-}
-
-/* Return false, if packet can be sent now without violation Nagle's rules:
- * 1. It is full sized.
- * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
- * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- *    With Minshall's modification: all sent small packets are ACKed.
- */
-static inline bool tcp_nagle_check(const struct tcp_sock *tp,
-				   const struct sk_buff *skb,
-				   unsigned int mss_now, int nonagle)
-{
-	return skb->len < mss_now &&
-		((nonagle & TCP_NAGLE_CORK) ||
-		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
-}
 
 /* Return true if the Nagle test allows this packet to be
  * sent now.
@@ -1489,7 +1509,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
 	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
 		return true;
 
-	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
+	if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle))
 		return true;
 
 	return false;
@@ -1892,7 +1912,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		limit = tcp_mss_split_point(sk, skb, mss_now,
 					    min_t(unsigned int,
 						  cwnd_quota,
-						  sk->sk_gso_max_segs));
+						  sk->sk_gso_max_segs),
+					    nonagle);
 
 		if (skb->len > limit &&
 		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
@@ -2756,7 +2777,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 EXPORT_SYMBOL(tcp_make_synack);
 
 /* Do all connect socket setups that can be done AF independent. */
-void tcp_connect_init(struct sock *sk)
+static void tcp_connect_init(struct sock *sk)
 {
 	const struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_sock *tp = tcp_sk(sk);