aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorDaniel Borkmann <dborkman@redhat.com>2014-04-14 15:45:17 -0400
committerDavid S. Miller <davem@davemloft.net>2014-04-14 16:26:48 -0400
commit362d52040c71f6e8d8158be48c812d7729cb8df1 (patch)
tree17116c15ee8030175ac6164f5072025ec9f6b635 /net
parentbfae23249955819a42aa6c23d93708c818eff5c9 (diff)
Revert "net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer"
This reverts commit ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer") as it introduced a serious performance regression on SCTP over IPv4 and IPv6, though a not as dramatic on the latter. Measurements are on 10Gbit/s with ixgbe NICs. Current state: [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.241.3 -V -l 1452 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64 Time: Fri, 11 Apr 2014 17:56:21 GMT Connecting to host 192.168.241.3, port 5201 Cookie: Lab200slot2.1397238981.812898.548918 [ 4] local 192.168.241.2 port 38616 connected to 192.168.241.3 port 5201 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.09 sec 20.8 MBytes 161 Mbits/sec [ 4] 1.09-2.13 sec 10.8 MBytes 86.8 Mbits/sec [ 4] 2.13-3.15 sec 3.57 MBytes 29.5 Mbits/sec [ 4] 3.15-4.16 sec 4.33 MBytes 35.7 Mbits/sec [ 4] 4.16-6.21 sec 10.4 MBytes 42.7 Mbits/sec [ 4] 6.21-6.21 sec 0.00 Bytes 0.00 bits/sec [ 4] 6.21-7.35 sec 34.6 MBytes 253 Mbits/sec [ 4] 7.35-11.45 sec 22.0 MBytes 45.0 Mbits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-12.51 sec 16.0 MBytes 126 Mbits/sec [ 4] 12.51-13.59 sec 20.3 MBytes 158 Mbits/sec [ 4] 13.59-14.65 sec 13.4 MBytes 107 Mbits/sec [ 4] 14.65-16.79 sec 33.3 MBytes 130 Mbits/sec [ 4] 16.79-16.79 sec 0.00 Bytes 0.00 bits/sec [ 4] 16.79-17.82 sec 5.94 MBytes 48.7 Mbits/sec (etc) [root@Lab200slot2 ~]# iperf3 --sctp -6 -c 2001:db8:0:f101::1 -V -l 1400 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64 Time: Fri, 11 Apr 2014 19:08:41 GMT Connecting to host 2001:db8:0:f101::1, port 5201 Cookie: Lab200slot2.1397243321.714295.2b3f7c [ 4] local 2001:db8:0:f101::2 port 55804 connected to 2001:db8:0:f101::1 port 5201 Starting Test: protocol: SCTP, 1 streams, 1400 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 169 MBytes 1.42 Gbits/sec [ 4] 1.00-2.00 sec 201 MBytes 1.69 Gbits/sec [ 4] 2.00-3.00 sec 188 MBytes 1.58 Gbits/sec [ 4] 3.00-4.00 sec 174 MBytes 1.46 Gbits/sec [ 4] 4.00-5.00 sec 165 MBytes 1.39 Gbits/sec [ 4] 5.00-6.00 sec 199 MBytes 1.67 Gbits/sec [ 4] 6.00-7.00 sec 163 MBytes 1.36 Gbits/sec [ 4] 7.00-8.00 sec 174 MBytes 1.46 Gbits/sec [ 4] 8.00-9.00 sec 193 MBytes 1.62 Gbits/sec [ 4] 9.00-10.00 sec 196 MBytes 1.65 Gbits/sec [ 4] 10.00-11.00 sec 157 MBytes 1.31 Gbits/sec [ 4] 11.00-12.00 sec 175 MBytes 1.47 Gbits/sec [ 4] 12.00-13.00 sec 192 MBytes 1.61 Gbits/sec [ 4] 13.00-14.00 sec 199 MBytes 1.67 Gbits/sec (etc) After patch: [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.240.3 -V -l 1452 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0+ #1 SMP Mon Apr 14 12:06:40 EDT 2014 x86_64 Time: Mon, 14 Apr 2014 16:40:48 GMT Connecting to host 192.168.240.3, port 5201 Cookie: Lab200slot2.1397493648.413274.65e131 [ 4] local 192.168.240.2 port 50548 connected to 192.168.240.3 port 5201 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 240 MBytes 2.02 Gbits/sec [ 4] 1.00-2.00 sec 239 MBytes 2.01 Gbits/sec [ 4] 2.00-3.00 sec 240 MBytes 2.01 Gbits/sec [ 4] 3.00-4.00 sec 239 MBytes 2.00 Gbits/sec [ 4] 4.00-5.00 sec 245 MBytes 2.05 Gbits/sec [ 4] 5.00-6.00 sec 240 MBytes 2.01 Gbits/sec [ 4] 6.00-7.00 sec 240 MBytes 2.02 Gbits/sec [ 4] 7.00-8.00 sec 239 MBytes 2.01 Gbits/sec With the reverted patch applied, the SCTP/IPv4 performance is back to normal on latest upstream for IPv4 and IPv6 and has same throughput as 3.4.2 test kernel, steady and interval reports are smooth again. Fixes: ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer") Reported-by: Peter Butler <pbutler@sonusnet.com> Reported-by: Dongsheng Song <dongsheng.song@gmail.com> Reported-by: Fengguang Wu <fengguang.wu@intel.com> Tested-by: Peter Butler <pbutler@sonusnet.com> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Cc: Matija Glavinic Pecotic <matija.glavinic-pecotic.ext@nsn.com> Cc: Alexander Sverdlin <alexander.sverdlin@nsn.com> Cc: Vlad Yasevich <vyasevich@gmail.com> Acked-by: Vlad Yasevich <vyasevich@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/sctp/associola.c82
-rw-r--r--net/sctp/sm_statefuns.c2
-rw-r--r--net/sctp/socket.c6
-rw-r--r--net/sctp/ulpevent.c8
4 files changed, 74 insertions, 24 deletions
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 4f6d6f9d1274..39579c3e0d14 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1395,35 +1395,44 @@ static inline bool sctp_peer_needs_update(struct sctp_association *asoc)
1395 return false; 1395 return false;
1396} 1396}
1397 1397
1398/* Update asoc's rwnd for the approximated state in the buffer, 1398/* Increase asoc's rwnd by len and send any window update SACK if needed. */
1399 * and check whether SACK needs to be sent. 1399void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
1400 */
1401void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer)
1402{ 1400{
1403 int rx_count;
1404 struct sctp_chunk *sack; 1401 struct sctp_chunk *sack;
1405 struct timer_list *timer; 1402 struct timer_list *timer;
1406 1403
1407 if (asoc->ep->rcvbuf_policy) 1404 if (asoc->rwnd_over) {
1408 rx_count = atomic_read(&asoc->rmem_alloc); 1405 if (asoc->rwnd_over >= len) {
1409 else 1406 asoc->rwnd_over -= len;
1410 rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); 1407 } else {
1408 asoc->rwnd += (len - asoc->rwnd_over);
1409 asoc->rwnd_over = 0;
1410 }
1411 } else {
1412 asoc->rwnd += len;
1413 }
1411 1414
1412 if ((asoc->base.sk->sk_rcvbuf - rx_count) > 0) 1415 /* If we had window pressure, start recovering it
1413 asoc->rwnd = (asoc->base.sk->sk_rcvbuf - rx_count) >> 1; 1416 * once our rwnd had reached the accumulated pressure
1414 else 1417 * threshold. The idea is to recover slowly, but up
1415 asoc->rwnd = 0; 1418 * to the initial advertised window.
1419 */
1420 if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) {
1421 int change = min(asoc->pathmtu, asoc->rwnd_press);
1422 asoc->rwnd += change;
1423 asoc->rwnd_press -= change;
1424 }
1416 1425
1417 pr_debug("%s: asoc:%p rwnd=%u, rx_count=%d, sk_rcvbuf=%d\n", 1426 pr_debug("%s: asoc:%p rwnd increased by %d to (%u, %u) - %u\n",
1418 __func__, asoc, asoc->rwnd, rx_count, 1427 __func__, asoc, len, asoc->rwnd, asoc->rwnd_over,
1419 asoc->base.sk->sk_rcvbuf); 1428 asoc->a_rwnd);
1420 1429
1421 /* Send a window update SACK if the rwnd has increased by at least the 1430 /* Send a window update SACK if the rwnd has increased by at least the
1422 * minimum of the association's PMTU and half of the receive buffer. 1431 * minimum of the association's PMTU and half of the receive buffer.
1423 * The algorithm used is similar to the one described in 1432 * The algorithm used is similar to the one described in
1424 * Section 4.2.3.3 of RFC 1122. 1433 * Section 4.2.3.3 of RFC 1122.
1425 */ 1434 */
1426 if (update_peer && sctp_peer_needs_update(asoc)) { 1435 if (sctp_peer_needs_update(asoc)) {
1427 asoc->a_rwnd = asoc->rwnd; 1436 asoc->a_rwnd = asoc->rwnd;
1428 1437
1429 pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u " 1438 pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u "
@@ -1445,6 +1454,45 @@ void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer)
1445 } 1454 }
1446} 1455}
1447 1456
1457/* Decrease asoc's rwnd by len. */
1458void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len)
1459{
1460 int rx_count;
1461 int over = 0;
1462
1463 if (unlikely(!asoc->rwnd || asoc->rwnd_over))
1464 pr_debug("%s: association:%p has asoc->rwnd:%u, "
1465 "asoc->rwnd_over:%u!\n", __func__, asoc,
1466 asoc->rwnd, asoc->rwnd_over);
1467
1468 if (asoc->ep->rcvbuf_policy)
1469 rx_count = atomic_read(&asoc->rmem_alloc);
1470 else
1471 rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);
1472
1473 /* If we've reached or overflowed our receive buffer, announce
1474 * a 0 rwnd if rwnd would still be positive. Store the
1475 * the potential pressure overflow so that the window can be restored
1476 * back to original value.
1477 */
1478 if (rx_count >= asoc->base.sk->sk_rcvbuf)
1479 over = 1;
1480
1481 if (asoc->rwnd >= len) {
1482 asoc->rwnd -= len;
1483 if (over) {
1484 asoc->rwnd_press += asoc->rwnd;
1485 asoc->rwnd = 0;
1486 }
1487 } else {
1488 asoc->rwnd_over = len - asoc->rwnd;
1489 asoc->rwnd = 0;
1490 }
1491
1492 pr_debug("%s: asoc:%p rwnd decreased by %d to (%u, %u, %u)\n",
1493 __func__, asoc, len, asoc->rwnd, asoc->rwnd_over,
1494 asoc->rwnd_press);
1495}
1448 1496
1449/* Build the bind address list for the association based on info from the 1497/* Build the bind address list for the association based on info from the
1450 * local endpoint and the remote peer. 1498 * local endpoint and the remote peer.
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 01e002430c85..ae9fbeba40b0 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -6178,7 +6178,7 @@ static int sctp_eat_data(const struct sctp_association *asoc,
6178 * PMTU. In cases, such as loopback, this might be a rather 6178 * PMTU. In cases, such as loopback, this might be a rather
6179 * large spill over. 6179 * large spill over.
6180 */ 6180 */
6181 if ((!chunk->data_accepted) && (!asoc->rwnd || 6181 if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over ||
6182 (datalen > asoc->rwnd + asoc->frag_point))) { 6182 (datalen > asoc->rwnd + asoc->frag_point))) {
6183 6183
6184 /* If this is the next TSN, consider reneging to make 6184 /* If this is the next TSN, consider reneging to make
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index e13519e9df80..ff20e2dbbbc7 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2115,6 +2115,12 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk,
2115 sctp_skb_pull(skb, copied); 2115 sctp_skb_pull(skb, copied);
2116 skb_queue_head(&sk->sk_receive_queue, skb); 2116 skb_queue_head(&sk->sk_receive_queue, skb);
2117 2117
2118 /* When only partial message is copied to the user, increase
2119 * rwnd by that amount. If all the data in the skb is read,
2120 * rwnd is updated when the event is freed.
2121 */
2122 if (!sctp_ulpevent_is_notification(event))
2123 sctp_assoc_rwnd_increase(event->asoc, copied);
2118 goto out; 2124 goto out;
2119 } else if ((event->msg_flags & MSG_NOTIFICATION) || 2125 } else if ((event->msg_flags & MSG_NOTIFICATION) ||
2120 (event->msg_flags & MSG_EOR)) 2126 (event->msg_flags & MSG_EOR))
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 8d198ae03606..85c64658bd0b 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -989,7 +989,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
989 skb = sctp_event2skb(event); 989 skb = sctp_event2skb(event);
990 /* Set the owner and charge rwnd for bytes received. */ 990 /* Set the owner and charge rwnd for bytes received. */
991 sctp_ulpevent_set_owner(event, asoc); 991 sctp_ulpevent_set_owner(event, asoc);
992 sctp_assoc_rwnd_update(asoc, false); 992 sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb));
993 993
994 if (!skb->data_len) 994 if (!skb->data_len)
995 return; 995 return;
@@ -1011,7 +1011,6 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
1011{ 1011{
1012 struct sk_buff *skb, *frag; 1012 struct sk_buff *skb, *frag;
1013 unsigned int len; 1013 unsigned int len;
1014 struct sctp_association *asoc;
1015 1014
1016 /* Current stack structures assume that the rcv buffer is 1015 /* Current stack structures assume that the rcv buffer is
1017 * per socket. For UDP style sockets this is not true as 1016 * per socket. For UDP style sockets this is not true as
@@ -1036,11 +1035,8 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
1036 } 1035 }
1037 1036
1038done: 1037done:
1039 asoc = event->asoc; 1038 sctp_assoc_rwnd_increase(event->asoc, len);
1040 sctp_association_hold(asoc);
1041 sctp_ulpevent_release_owner(event); 1039 sctp_ulpevent_release_owner(event);
1042 sctp_assoc_rwnd_update(asoc, true);
1043 sctp_association_put(asoc);
1044} 1040}
1045 1041
1046static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) 1042static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event)