diff options
author | Daniel Borkmann <dborkman@redhat.com> | 2014-04-14 15:45:17 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-04-14 16:26:48 -0400 |
commit | 362d52040c71f6e8d8158be48c812d7729cb8df1 (patch) | |
tree | 17116c15ee8030175ac6164f5072025ec9f6b635 /net | |
parent | bfae23249955819a42aa6c23d93708c818eff5c9 (diff) |
Revert "net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer"
This reverts commit ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management
to reflect real state of the receiver's buffer") as it introduced a
serious performance regression on SCTP over IPv4 and IPv6, though a not
as dramatic on the latter. Measurements are on 10Gbit/s with ixgbe NICs.
Current state:
[root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.241.3 -V -l 1452 -t 60
iperf version 3.0.1 (10 January 2014)
Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64
Time: Fri, 11 Apr 2014 17:56:21 GMT
Connecting to host 192.168.241.3, port 5201
Cookie: Lab200slot2.1397238981.812898.548918
[ 4] local 192.168.241.2 port 38616 connected to 192.168.241.3 port 5201
Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test
[ ID] Interval Transfer Bandwidth
[ 4] 0.00-1.09 sec 20.8 MBytes 161 Mbits/sec
[ 4] 1.09-2.13 sec 10.8 MBytes 86.8 Mbits/sec
[ 4] 2.13-3.15 sec 3.57 MBytes 29.5 Mbits/sec
[ 4] 3.15-4.16 sec 4.33 MBytes 35.7 Mbits/sec
[ 4] 4.16-6.21 sec 10.4 MBytes 42.7 Mbits/sec
[ 4] 6.21-6.21 sec 0.00 Bytes 0.00 bits/sec
[ 4] 6.21-7.35 sec 34.6 MBytes 253 Mbits/sec
[ 4] 7.35-11.45 sec 22.0 MBytes 45.0 Mbits/sec
[ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec
[ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec
[ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec
[ 4] 11.45-12.51 sec 16.0 MBytes 126 Mbits/sec
[ 4] 12.51-13.59 sec 20.3 MBytes 158 Mbits/sec
[ 4] 13.59-14.65 sec 13.4 MBytes 107 Mbits/sec
[ 4] 14.65-16.79 sec 33.3 MBytes 130 Mbits/sec
[ 4] 16.79-16.79 sec 0.00 Bytes 0.00 bits/sec
[ 4] 16.79-17.82 sec 5.94 MBytes 48.7 Mbits/sec
(etc)
[root@Lab200slot2 ~]# iperf3 --sctp -6 -c 2001:db8:0:f101::1 -V -l 1400 -t 60
iperf version 3.0.1 (10 January 2014)
Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64
Time: Fri, 11 Apr 2014 19:08:41 GMT
Connecting to host 2001:db8:0:f101::1, port 5201
Cookie: Lab200slot2.1397243321.714295.2b3f7c
[ 4] local 2001:db8:0:f101::2 port 55804 connected to 2001:db8:0:f101::1 port 5201
Starting Test: protocol: SCTP, 1 streams, 1400 byte blocks, omitting 0 seconds, 60 second test
[ ID] Interval Transfer Bandwidth
[ 4] 0.00-1.00 sec 169 MBytes 1.42 Gbits/sec
[ 4] 1.00-2.00 sec 201 MBytes 1.69 Gbits/sec
[ 4] 2.00-3.00 sec 188 MBytes 1.58 Gbits/sec
[ 4] 3.00-4.00 sec 174 MBytes 1.46 Gbits/sec
[ 4] 4.00-5.00 sec 165 MBytes 1.39 Gbits/sec
[ 4] 5.00-6.00 sec 199 MBytes 1.67 Gbits/sec
[ 4] 6.00-7.00 sec 163 MBytes 1.36 Gbits/sec
[ 4] 7.00-8.00 sec 174 MBytes 1.46 Gbits/sec
[ 4] 8.00-9.00 sec 193 MBytes 1.62 Gbits/sec
[ 4] 9.00-10.00 sec 196 MBytes 1.65 Gbits/sec
[ 4] 10.00-11.00 sec 157 MBytes 1.31 Gbits/sec
[ 4] 11.00-12.00 sec 175 MBytes 1.47 Gbits/sec
[ 4] 12.00-13.00 sec 192 MBytes 1.61 Gbits/sec
[ 4] 13.00-14.00 sec 199 MBytes 1.67 Gbits/sec
(etc)
After patch:
[root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.240.3 -V -l 1452 -t 60
iperf version 3.0.1 (10 January 2014)
Linux Lab200slot2 3.14.0+ #1 SMP Mon Apr 14 12:06:40 EDT 2014 x86_64
Time: Mon, 14 Apr 2014 16:40:48 GMT
Connecting to host 192.168.240.3, port 5201
Cookie: Lab200slot2.1397493648.413274.65e131
[ 4] local 192.168.240.2 port 50548 connected to 192.168.240.3 port 5201
Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test
[ ID] Interval Transfer Bandwidth
[ 4] 0.00-1.00 sec 240 MBytes 2.02 Gbits/sec
[ 4] 1.00-2.00 sec 239 MBytes 2.01 Gbits/sec
[ 4] 2.00-3.00 sec 240 MBytes 2.01 Gbits/sec
[ 4] 3.00-4.00 sec 239 MBytes 2.00 Gbits/sec
[ 4] 4.00-5.00 sec 245 MBytes 2.05 Gbits/sec
[ 4] 5.00-6.00 sec 240 MBytes 2.01 Gbits/sec
[ 4] 6.00-7.00 sec 240 MBytes 2.02 Gbits/sec
[ 4] 7.00-8.00 sec 239 MBytes 2.01 Gbits/sec
With the reverted patch applied, the SCTP/IPv4 performance is back
to normal on latest upstream for IPv4 and IPv6 and has same throughput
as 3.4.2 test kernel, steady and interval reports are smooth again.
Fixes: ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer")
Reported-by: Peter Butler <pbutler@sonusnet.com>
Reported-by: Dongsheng Song <dongsheng.song@gmail.com>
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Tested-by: Peter Butler <pbutler@sonusnet.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Matija Glavinic Pecotic <matija.glavinic-pecotic.ext@nsn.com>
Cc: Alexander Sverdlin <alexander.sverdlin@nsn.com>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/sctp/associola.c | 82 | ||||
-rw-r--r-- | net/sctp/sm_statefuns.c | 2 | ||||
-rw-r--r-- | net/sctp/socket.c | 6 | ||||
-rw-r--r-- | net/sctp/ulpevent.c | 8 |
4 files changed, 74 insertions, 24 deletions
diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 4f6d6f9d1274..39579c3e0d14 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c | |||
@@ -1395,35 +1395,44 @@ static inline bool sctp_peer_needs_update(struct sctp_association *asoc) | |||
1395 | return false; | 1395 | return false; |
1396 | } | 1396 | } |
1397 | 1397 | ||
1398 | /* Update asoc's rwnd for the approximated state in the buffer, | 1398 | /* Increase asoc's rwnd by len and send any window update SACK if needed. */ |
1399 | * and check whether SACK needs to be sent. | 1399 | void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) |
1400 | */ | ||
1401 | void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer) | ||
1402 | { | 1400 | { |
1403 | int rx_count; | ||
1404 | struct sctp_chunk *sack; | 1401 | struct sctp_chunk *sack; |
1405 | struct timer_list *timer; | 1402 | struct timer_list *timer; |
1406 | 1403 | ||
1407 | if (asoc->ep->rcvbuf_policy) | 1404 | if (asoc->rwnd_over) { |
1408 | rx_count = atomic_read(&asoc->rmem_alloc); | 1405 | if (asoc->rwnd_over >= len) { |
1409 | else | 1406 | asoc->rwnd_over -= len; |
1410 | rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); | 1407 | } else { |
1408 | asoc->rwnd += (len - asoc->rwnd_over); | ||
1409 | asoc->rwnd_over = 0; | ||
1410 | } | ||
1411 | } else { | ||
1412 | asoc->rwnd += len; | ||
1413 | } | ||
1411 | 1414 | ||
1412 | if ((asoc->base.sk->sk_rcvbuf - rx_count) > 0) | 1415 | /* If we had window pressure, start recovering it |
1413 | asoc->rwnd = (asoc->base.sk->sk_rcvbuf - rx_count) >> 1; | 1416 | * once our rwnd had reached the accumulated pressure |
1414 | else | 1417 | * threshold. The idea is to recover slowly, but up |
1415 | asoc->rwnd = 0; | 1418 | * to the initial advertised window. |
1419 | */ | ||
1420 | if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) { | ||
1421 | int change = min(asoc->pathmtu, asoc->rwnd_press); | ||
1422 | asoc->rwnd += change; | ||
1423 | asoc->rwnd_press -= change; | ||
1424 | } | ||
1416 | 1425 | ||
1417 | pr_debug("%s: asoc:%p rwnd=%u, rx_count=%d, sk_rcvbuf=%d\n", | 1426 | pr_debug("%s: asoc:%p rwnd increased by %d to (%u, %u) - %u\n", |
1418 | __func__, asoc, asoc->rwnd, rx_count, | 1427 | __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, |
1419 | asoc->base.sk->sk_rcvbuf); | 1428 | asoc->a_rwnd); |
1420 | 1429 | ||
1421 | /* Send a window update SACK if the rwnd has increased by at least the | 1430 | /* Send a window update SACK if the rwnd has increased by at least the |
1422 | * minimum of the association's PMTU and half of the receive buffer. | 1431 | * minimum of the association's PMTU and half of the receive buffer. |
1423 | * The algorithm used is similar to the one described in | 1432 | * The algorithm used is similar to the one described in |
1424 | * Section 4.2.3.3 of RFC 1122. | 1433 | * Section 4.2.3.3 of RFC 1122. |
1425 | */ | 1434 | */ |
1426 | if (update_peer && sctp_peer_needs_update(asoc)) { | 1435 | if (sctp_peer_needs_update(asoc)) { |
1427 | asoc->a_rwnd = asoc->rwnd; | 1436 | asoc->a_rwnd = asoc->rwnd; |
1428 | 1437 | ||
1429 | pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u " | 1438 | pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u " |
@@ -1445,6 +1454,45 @@ void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer) | |||
1445 | } | 1454 | } |
1446 | } | 1455 | } |
1447 | 1456 | ||
1457 | /* Decrease asoc's rwnd by len. */ | ||
1458 | void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len) | ||
1459 | { | ||
1460 | int rx_count; | ||
1461 | int over = 0; | ||
1462 | |||
1463 | if (unlikely(!asoc->rwnd || asoc->rwnd_over)) | ||
1464 | pr_debug("%s: association:%p has asoc->rwnd:%u, " | ||
1465 | "asoc->rwnd_over:%u!\n", __func__, asoc, | ||
1466 | asoc->rwnd, asoc->rwnd_over); | ||
1467 | |||
1468 | if (asoc->ep->rcvbuf_policy) | ||
1469 | rx_count = atomic_read(&asoc->rmem_alloc); | ||
1470 | else | ||
1471 | rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); | ||
1472 | |||
1473 | /* If we've reached or overflowed our receive buffer, announce | ||
1474 | * a 0 rwnd if rwnd would still be positive. Store the | ||
1475 | * the potential pressure overflow so that the window can be restored | ||
1476 | * back to original value. | ||
1477 | */ | ||
1478 | if (rx_count >= asoc->base.sk->sk_rcvbuf) | ||
1479 | over = 1; | ||
1480 | |||
1481 | if (asoc->rwnd >= len) { | ||
1482 | asoc->rwnd -= len; | ||
1483 | if (over) { | ||
1484 | asoc->rwnd_press += asoc->rwnd; | ||
1485 | asoc->rwnd = 0; | ||
1486 | } | ||
1487 | } else { | ||
1488 | asoc->rwnd_over = len - asoc->rwnd; | ||
1489 | asoc->rwnd = 0; | ||
1490 | } | ||
1491 | |||
1492 | pr_debug("%s: asoc:%p rwnd decreased by %d to (%u, %u, %u)\n", | ||
1493 | __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, | ||
1494 | asoc->rwnd_press); | ||
1495 | } | ||
1448 | 1496 | ||
1449 | /* Build the bind address list for the association based on info from the | 1497 | /* Build the bind address list for the association based on info from the |
1450 | * local endpoint and the remote peer. | 1498 | * local endpoint and the remote peer. |
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 01e002430c85..ae9fbeba40b0 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c | |||
@@ -6178,7 +6178,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, | |||
6178 | * PMTU. In cases, such as loopback, this might be a rather | 6178 | * PMTU. In cases, such as loopback, this might be a rather |
6179 | * large spill over. | 6179 | * large spill over. |
6180 | */ | 6180 | */ |
6181 | if ((!chunk->data_accepted) && (!asoc->rwnd || | 6181 | if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over || |
6182 | (datalen > asoc->rwnd + asoc->frag_point))) { | 6182 | (datalen > asoc->rwnd + asoc->frag_point))) { |
6183 | 6183 | ||
6184 | /* If this is the next TSN, consider reneging to make | 6184 | /* If this is the next TSN, consider reneging to make |
diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e13519e9df80..ff20e2dbbbc7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c | |||
@@ -2115,6 +2115,12 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, | |||
2115 | sctp_skb_pull(skb, copied); | 2115 | sctp_skb_pull(skb, copied); |
2116 | skb_queue_head(&sk->sk_receive_queue, skb); | 2116 | skb_queue_head(&sk->sk_receive_queue, skb); |
2117 | 2117 | ||
2118 | /* When only partial message is copied to the user, increase | ||
2119 | * rwnd by that amount. If all the data in the skb is read, | ||
2120 | * rwnd is updated when the event is freed. | ||
2121 | */ | ||
2122 | if (!sctp_ulpevent_is_notification(event)) | ||
2123 | sctp_assoc_rwnd_increase(event->asoc, copied); | ||
2118 | goto out; | 2124 | goto out; |
2119 | } else if ((event->msg_flags & MSG_NOTIFICATION) || | 2125 | } else if ((event->msg_flags & MSG_NOTIFICATION) || |
2120 | (event->msg_flags & MSG_EOR)) | 2126 | (event->msg_flags & MSG_EOR)) |
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 8d198ae03606..85c64658bd0b 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c | |||
@@ -989,7 +989,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, | |||
989 | skb = sctp_event2skb(event); | 989 | skb = sctp_event2skb(event); |
990 | /* Set the owner and charge rwnd for bytes received. */ | 990 | /* Set the owner and charge rwnd for bytes received. */ |
991 | sctp_ulpevent_set_owner(event, asoc); | 991 | sctp_ulpevent_set_owner(event, asoc); |
992 | sctp_assoc_rwnd_update(asoc, false); | 992 | sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb)); |
993 | 993 | ||
994 | if (!skb->data_len) | 994 | if (!skb->data_len) |
995 | return; | 995 | return; |
@@ -1011,7 +1011,6 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) | |||
1011 | { | 1011 | { |
1012 | struct sk_buff *skb, *frag; | 1012 | struct sk_buff *skb, *frag; |
1013 | unsigned int len; | 1013 | unsigned int len; |
1014 | struct sctp_association *asoc; | ||
1015 | 1014 | ||
1016 | /* Current stack structures assume that the rcv buffer is | 1015 | /* Current stack structures assume that the rcv buffer is |
1017 | * per socket. For UDP style sockets this is not true as | 1016 | * per socket. For UDP style sockets this is not true as |
@@ -1036,11 +1035,8 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) | |||
1036 | } | 1035 | } |
1037 | 1036 | ||
1038 | done: | 1037 | done: |
1039 | asoc = event->asoc; | 1038 | sctp_assoc_rwnd_increase(event->asoc, len); |
1040 | sctp_association_hold(asoc); | ||
1041 | sctp_ulpevent_release_owner(event); | 1039 | sctp_ulpevent_release_owner(event); |
1042 | sctp_assoc_rwnd_update(asoc, true); | ||
1043 | sctp_association_put(asoc); | ||
1044 | } | 1040 | } |
1045 | 1041 | ||
1046 | static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) | 1042 | static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) |