author		Eric Dumazet <edumazet@google.com>	2013-12-13 16:51:23 -0500
committer	David S. Miller <davem@davemloft.net>	2013-12-17 15:15:25 -0500
commit		d4589926d7a9a4b88e650bb9154bab71e8d2a7dd (patch)
tree		692b45bcf6e29af4f1a33f996aef054756574bee /net/ipv4/tcp_output.c
parent		477bb93320cec7ae74d5ccfad4f2bfa0b28fbe90 (diff)
tcp: refine TSO splits
While investigating performance problems on small RPC workloads,
I noticed the Linux TCP stack was always splitting the last TSO skb
into two parts (skbs): one a multiple of MSS, and a small one carrying
the Push flag. This split happens even if TCP_NODELAY is set, or if no
small packet is in flight.
Example with a request/response of 4K/4K:
IP A > B: . ack 68432 win 2783 <nop,nop,timestamp 6524593 6525001>
IP A > B: . 65537:68433(2896) ack 69632 win 2783 <nop,nop,timestamp 6524593 6525001>
IP A > B: P 68433:69633(1200) ack 69632 win 2783 <nop,nop,timestamp 6524593 6525001>
IP B > A: . ack 68433 win 2768 <nop,nop,timestamp 6525001 6524593>
IP B > A: . 69632:72528(2896) ack 69633 win 2768 <nop,nop,timestamp 6525001 6524593>
IP B > A: P 72528:73728(1200) ack 69633 win 2768 <nop,nop,timestamp 6525001 6524593>
IP A > B: . ack 72528 win 2783 <nop,nop,timestamp 6524593 6525001>
IP A > B: . 69633:72529(2896) ack 73728 win 2783 <nop,nop,timestamp 6524593 6525001>
IP A > B: P 72529:73729(1200) ack 73728 win 2783 <nop,nop,timestamp 6524593 6525001>
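(The MSS here is evidently 1448 bytes: each 4096-byte response goes out as
one skb of 2896 = 2 * 1448 bytes, plus a second 1200-byte skb carrying the
Push flag, since 4096 = 2 * 1448 + 1200.)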
We can avoid this split by including the Nagle tests at the right place.
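In outline, the change makes the split-point computation behave like this
minimal userspace sketch (split_point() and its bool parameter are
illustrative names, not the kernel's; the real code is tcp_mss_split_point()
in the diff below):

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the new decision: keep the sub-MSS tail inside the TSO skb
 * unless Nagle/cork rules say a partial segment must be held back.
 */
static unsigned int split_point(unsigned int needed, unsigned int mss,
                                bool nagle_blocks_partial)
{
        unsigned int partial = needed % mss;

        if (partial && nagle_blocks_partial)
                return needed - partial;  /* cut at the last MSS boundary */
        return needed;                    /* send the partial tail as well */
}

int main(void)
{
        /* 4096-byte write, MSS 1448: the old code always took the first branch */
        printf("%u\n", split_point(4096, 1448, true));  /* 2896 */
        printf("%u\n", split_point(4096, 1448, false)); /* 4096 */
        return 0;
}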
Note: if some NIC had trouble sending TSO packets with a partial last
segment, we would have hit the problem in GRO/forwarding workloads already.
tcp_minshall_update() is moved to tcp_output.c and updated, as we might
now feed it a TSO packet with a partial last segment.
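The divide avoidance in the updated helper relies on tcp_skb_pcount() holding
DIV_ROUND_UP(skb->len, mss_now); under that assumption,
skb->len < tcp_skb_pcount(skb) * mss_now is exactly (skb->len % mss_now) != 0.
A throwaway userspace check of that identity (all names local to the sketch):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        const unsigned int mss = 1448;
        const unsigned int lens[] = { 1448, 2896, 4096, 1200 };

        for (unsigned int i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
                unsigned int len = lens[i];
                unsigned int pcount = DIV_ROUND_UP(len, mss);

                /* new multiply-based test  vs  original modulo test */
                printf("len=%4u: %d == %d\n", len,
                       len < pcount * mss, (len % mss) != 0);
        }
        return 0;
}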
This patch tremendously improves performance, as the traffic now looks like:
IP A > B: . ack 98304 win 2783 <nop,nop,timestamp 6834277 6834685>
IP A > B: P 94209:98305(4096) ack 98304 win 2783 <nop,nop,timestamp 6834277 6834685>
IP B > A: . ack 98305 win 2768 <nop,nop,timestamp 6834686 6834277>
IP B > A: P 98304:102400(4096) ack 98305 win 2768 <nop,nop,timestamp 6834686 6834277>
IP A > B: . ack 102400 win 2783 <nop,nop,timestamp 6834279 6834686>
IP A > B: P 98305:102401(4096) ack 102400 win 2783 <nop,nop,timestamp 6834279 6834686>
IP B > A: . ack 102401 win 2768 <nop,nop,timestamp 6834687 6834279>
IP B > A: P 102400:106496(4096) ack 102401 win 2768 <nop,nop,timestamp 6834687 6834279>
IP A > B: . ack 106496 win 2783 <nop,nop,timestamp 6834280 6834687>
IP A > B: P 102401:106497(4096) ack 106496 win 2783 <nop,nop,timestamp 6834280 6834687>
IP B > A: . ack 106497 win 2768 <nop,nop,timestamp 6834688 6834280>
IP B > A: P 106496:110592(4096) ack 106497 win 2768 <nop,nop,timestamp 6834688 6834280>
Before:
lpq83:~# nstat >/dev/null;perf stat ./super_netperf 200 -t TCP_RR -H lpq84 -l 20 -- -r 4K,4K
280774
Performance counter stats for './super_netperf 200 -t TCP_RR -H lpq84 -l 20 -- -r 4K,4K':
205719.049006 task-clock # 9.278 CPUs utilized
8,449,968 context-switches # 0.041 M/sec
1,935,997 CPU-migrations # 0.009 M/sec
160,541 page-faults # 0.780 K/sec
548,478,722,290 cycles # 2.666 GHz [83.20%]
455,240,670,857 stalled-cycles-frontend # 83.00% frontend cycles idle [83.48%]
272,881,454,275 stalled-cycles-backend # 49.75% backend cycles idle [66.73%]
166,091,460,030 instructions # 0.30 insns per cycle
# 2.74 stalled cycles per insn [83.39%]
29,150,229,399 branches # 141.699 M/sec [83.30%]
1,943,814,026 branch-misses # 6.67% of all branches [83.32%]
22.173517844 seconds time elapsed
lpq83:~# nstat | egrep "IpOutRequests|IpExtOutOctets"
IpOutRequests 16851063 0.0
IpExtOutOctets 23878580777 0.0
After patch:
lpq83:~# nstat >/dev/null;perf stat ./super_netperf 200 -t TCP_RR -H lpq84 -l 20 -- -r 4K,4K
280877
Performance counter stats for './super_netperf 200 -t TCP_RR -H lpq84 -l 20 -- -r 4K,4K':
107496.071918 task-clock # 4.847 CPUs utilized
5,635,458 context-switches # 0.052 M/sec
1,374,707 CPU-migrations # 0.013 M/sec
160,920 page-faults # 0.001 M/sec
281,500,010,924 cycles # 2.619 GHz [83.28%]
228,865,069,307 stalled-cycles-frontend # 81.30% frontend cycles idle [83.38%]
142,462,742,658 stalled-cycles-backend # 50.61% backend cycles idle [66.81%]
95,227,712,566 instructions # 0.34 insns per cycle
# 2.40 stalled cycles per insn [83.43%]
16,209,868,171 branches # 150.795 M/sec [83.20%]
874,252,952 branch-misses # 5.39% of all branches [83.37%]
22.175821286 seconds time elapsed
lpq83:~# nstat | egrep "IpOutRequests|IpExtOutOctets"
IpOutRequests 11239428 0.0
IpExtOutOctets 23595191035 0.0
Indeed, the occupancy of tx skbs (IpExtOutOctets/IpOutRequests) is higher:
2099 octets instead of 1417, thus helping GRO to be more efficient when using
the FQ packet scheduler.
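These averages come straight from the counters above:

  before : 23878580777 / 16851063 ~= 1417 octets per IP packet
  after  : 23595191035 / 11239428 ~= 2099 octets per IP packet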
Many thanks to Neal for review and ideas.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Van Jacobson <vanj@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Tested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--	net/ipv4/tcp_output.c	93
1 file changed, 54 insertions(+), 39 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2a69f42e51ca..9e7aec7ee67e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1384,23 +1384,51 @@ static void tcp_cwnd_validate(struct sock *sk)
 	}
 }
 
-/* Returns the portion of skb which can be sent right away without
- * introducing MSS oddities to segment boundaries. In rare cases where
- * mss_now != mss_cache, we will request caller to create a small skb
- * per input skb which could be mostly avoided here (if desired).
- *
- * We explicitly want to create a request for splitting write queue tail
- * to a small skb for Nagle purposes while avoiding unnecessary modulos,
- * thus all the complexity (cwnd_len is always MSS multiple which we
- * return whenever allowed by the other factors). Basically we need the
- * modulo only when the receiver window alone is the limiting factor or
- * when we would be allowed to send the split-due-to-Nagle skb fully.
+/* Minshall's variant of the Nagle send check. */
+static bool tcp_minshall_check(const struct tcp_sock *tp)
+{
+	return after(tp->snd_sml, tp->snd_una) &&
+		!after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Update snd_sml if this skb is under mss
+ * Note that a TSO packet might end with a sub-mss segment
+ * The test is really :
+ * if ((skb->len % mss) != 0)
+ *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+ * But we can avoid doing the divide again given we already have
+ *  skb_pcount = skb->len / mss_now
  */
-static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
-					unsigned int mss_now, unsigned int max_segs)
+static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+				const struct sk_buff *skb)
+{
+	if (skb->len < tcp_skb_pcount(skb) * mss_now)
+		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
+}
+
+/* Return false, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized. (provided by caller in %partial bool)
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ *    With Minshall's modification: all sent small packets are ACKed.
+ */
+static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
+			    unsigned int mss_now, int nonagle)
+{
+	return partial &&
+		((nonagle & TCP_NAGLE_CORK) ||
+		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
+}
+/* Returns the portion of skb which can be sent right away */
+static unsigned int tcp_mss_split_point(const struct sock *sk,
+					const struct sk_buff *skb,
+					unsigned int mss_now,
+					unsigned int max_segs,
+					int nonagle)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
-	u32 needed, window, max_len;
+	u32 partial, needed, window, max_len;
 
 	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
 	max_len = mss_now * max_segs;
@@ -1413,7 +1441,15 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b
 	if (max_len <= needed)
 		return max_len;
 
-	return needed - needed % mss_now;
+	partial = needed % mss_now;
+	/* If last segment is not a full MSS, check if Nagle rules allow us
+	 * to include this last segment in this skb.
+	 * Otherwise, we'll split the skb at last MSS boundary
+	 */
+	if (tcp_nagle_check(partial != 0, tp, mss_now, nonagle))
+		return needed - partial;
+
+	return needed;
 }
 
 /* Can at least one segment of SKB be sent right now, according to the
@@ -1453,28 +1489,6 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
 	return tso_segs;
 }
 
-/* Minshall's variant of the Nagle send check. */
-static inline bool tcp_minshall_check(const struct tcp_sock *tp)
-{
-	return after(tp->snd_sml, tp->snd_una) &&
-	       !after(tp->snd_sml, tp->snd_nxt);
-}
-
-/* Return false, if packet can be sent now without violation Nagle's rules:
- * 1. It is full sized.
- * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
- * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- *    With Minshall's modification: all sent small packets are ACKed.
- */
-static inline bool tcp_nagle_check(const struct tcp_sock *tp,
-				   const struct sk_buff *skb,
-				   unsigned int mss_now, int nonagle)
-{
-	return skb->len < mss_now &&
-		((nonagle & TCP_NAGLE_CORK) ||
-		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
-}
 
 /* Return true if the Nagle test allows this packet to be
  * sent now.
@@ -1495,7 +1509,7 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
 	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
 		return true;
 
-	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
+	if (!tcp_nagle_check(skb->len < cur_mss, tp, cur_mss, nonagle))
 		return true;
 
 	return false;
@@ -1898,7 +1912,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		limit = tcp_mss_split_point(sk, skb, mss_now,
 					    min_t(unsigned int,
 						  cwnd_quota,
-						  sk->sk_gso_max_segs));
+						  sk->sk_gso_max_segs),
+					    nonagle);
 
 		if (skb->len > limit &&
 		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))