author:    Florian Westphal <fw@strlen.de>  2017-07-29 21:57:18 -0400
committer: David S. Miller <davem@davemloft.net>  2017-07-31 17:37:49 -0400
commit:    e7942d0633c47c791ece6afa038be9cf977226de (patch)
tree:      27dddb46a5358137f6cb6e63bddab14a77a840ec /net/ipv4
parent:    764646b08d09d29adced740c26447ecdaabc9088 (diff)
tcp: remove prequeue support
prequeue is a tcp receive optimization that moves part of rx processing from bh to process context.

This only works if the socket being processed belongs to a process that is blocked in recv on that socket.

In practice, this doesn't happen that often anymore, because nowadays servers tend to use an event-driven (epoll) model, and even normal client applications (web browsers) commonly use many tcp connections in parallel.

This has a measurable impact only in netperf (which uses plain recv and thus allows prequeue use) from host to a locally running vm (~4%); there were no changes when using netperf between two physical hosts with ixgbe interfaces.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
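For orientation, the behavioural core of the change is visible in the tcp_ipv4.c hunk below: when the socket is not owned by a user-space task, an incoming segment is now always handed directly to tcp_v4_do_rcv() instead of possibly being parked on the per-socket prequeue for a blocked reader. A minimal, schematic C sketch of that dispatch decision (the callee names are taken from the diff; the wrapper function itself is hypothetical, and locking, statistics and the refcount/drop paths are omitted):

	#include <net/tcp.h>	/* tcp_v4_do_rcv(), tcp_add_backlog() */

	/* Hypothetical helper, only to illustrate the post-patch dispatch. */
	static void rx_dispatch_sketch(struct sock *sk, struct sk_buff *skb)
	{
		if (!sock_owned_by_user(sk)) {
			/* Pre-patch, tcp_prequeue() could queue the skb on
			 * tp->ucopy.prequeue when a reader was blocked in
			 * recv(); post-patch the segment is processed here
			 * directly in softirq context.
			 */
			tcp_v4_do_rcv(sk, skb);
		} else if (tcp_add_backlog(sk, skb)) {
			/* Socket owned by a process and backlog limit
			 * exceeded; the real code jumps to discard_and_relse.
			 */
			kfree_skb(skb);
		}
	}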
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/tcp.c            105
-rw-r--r--  net/ipv4/tcp_input.c       62
-rw-r--r--  net/ipv4/tcp_ipv4.c        61
-rw-r--r--  net/ipv4/tcp_minisocks.c    1
-rw-r--r--  net/ipv4/tcp_timer.c       12
5 files changed, 1 insertion(+), 240 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 71ce33decd97..62018ea6f45f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -400,7 +400,6 @@ void tcp_init_sock(struct sock *sk)
 
 	tp->out_of_order_queue = RB_ROOT;
 	tcp_init_xmit_timers(sk);
-	tcp_prequeue_init(tp);
 	INIT_LIST_HEAD(&tp->tsq_node);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
@@ -1525,20 +1524,6 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
 		tcp_send_ack(sk);
 }
 
-static void tcp_prequeue_process(struct sock *sk)
-{
-	struct sk_buff *skb;
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
-
-	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
-		sk_backlog_rcv(sk, skb);
-
-	/* Clear memory counter. */
-	tp->ucopy.memory = 0;
-}
-
 static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 {
 	struct sk_buff *skb;
@@ -1671,7 +1656,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	int err;
 	int target;		/* Read at least this many bytes */
 	long timeo;
-	struct task_struct *user_recv = NULL;
 	struct sk_buff *skb, *last;
 	u32 urg_hole = 0;
 
@@ -1806,51 +1790,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 
 		tcp_cleanup_rbuf(sk, copied);
 
-		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
-			/* Install new reader */
-			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
-				user_recv = current;
-				tp->ucopy.task = user_recv;
-				tp->ucopy.msg = msg;
-			}
-
-			tp->ucopy.len = len;
-
-			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
-				!(flags & (MSG_PEEK | MSG_TRUNC)));
-
-			/* Ugly... If prequeue is not empty, we have to
-			 * process it before releasing socket, otherwise
-			 * order will be broken at second iteration.
-			 * More elegant solution is required!!!
-			 *
-			 * Look: we have the following (pseudo)queues:
-			 *
-			 * 1. packets in flight
-			 * 2. backlog
-			 * 3. prequeue
-			 * 4. receive_queue
-			 *
-			 * Each queue can be processed only if the next ones
-			 * are empty. At this point we have empty receive_queue.
-			 * But prequeue _can_ be not empty after 2nd iteration,
-			 * when we jumped to start of loop because backlog
-			 * processing added something to receive_queue.
-			 * We cannot release_sock(), because backlog contains
-			 * packets arrived _after_ prequeued ones.
-			 *
-			 * Shortly, algorithm is clear --- to process all
-			 * the queues in order. We could make it more directly,
-			 * requeueing packets from backlog to prequeue, if
-			 * is not empty. It is more elegant, but eats cycles,
-			 * unfortunately.
-			 */
-			if (!skb_queue_empty(&tp->ucopy.prequeue))
-				goto do_prequeue;
-
-			/* __ Set realtime policy in scheduler __ */
-		}
-
 		if (copied >= target) {
 			/* Do not sleep, just process backlog. */
 			release_sock(sk);
@@ -1859,31 +1798,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 			sk_wait_data(sk, &timeo, last);
 		}
 
-		if (user_recv) {
-			int chunk;
-
-			/* __ Restore normal policy in scheduler __ */
-
-			chunk = len - tp->ucopy.len;
-			if (chunk != 0) {
-				NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
-				len -= chunk;
-				copied += chunk;
-			}
-
-			if (tp->rcv_nxt == tp->copied_seq &&
-			    !skb_queue_empty(&tp->ucopy.prequeue)) {
-do_prequeue:
-				tcp_prequeue_process(sk);
-
-				chunk = len - tp->ucopy.len;
-				if (chunk != 0) {
-					NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
-					len -= chunk;
-					copied += chunk;
-				}
-			}
-		}
 		if ((flags & MSG_PEEK) &&
 		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
 			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
@@ -1955,25 +1869,6 @@ skip_copy:
 			break;
 	} while (len > 0);
 
-	if (user_recv) {
-		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
-			int chunk;
-
-			tp->ucopy.len = copied > 0 ? len : 0;
-
-			tcp_prequeue_process(sk);
-
-			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
-				NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
-				len -= chunk;
-				copied += chunk;
-			}
-		}
-
-		tp->ucopy.task = NULL;
-		tp->ucopy.len = 0;
-	}
-
 	/* According to UNIX98, msg_name/msg_namelen are ignored
 	 * on connected socket. I was just happy when found this 8) --ANK
 	 */
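The long comment removed in the tcp_recvmsg() hunk above describes why the old code had to drain four (pseudo)queues in order: packets in flight, backlog, prequeue and receive_queue. With the prequeue gone, the blocking step of the loop reduces to the context lines this hunk keeps; roughly, as a schematic sketch (the wrapper and its parameter list are invented for illustration, the calls inside it match the surviving code):

	/* Schematic sketch of the post-patch wait step of tcp_recvmsg(). */
	static void recvmsg_wait_sketch(struct sock *sk, int copied, int target,
					long *timeo, struct sk_buff *last)
	{
		if (copied >= target) {
			/* Enough data copied: do not sleep, just let
			 * release_sock() run the backlog via sk_backlog_rcv().
			 */
			release_sock(sk);
			lock_sock(sk);
		} else {
			/* Sleep for more data; the socket lock is dropped
			 * while waiting, so backlogged segments still get
			 * processed without a separate prequeue pass.
			 */
			sk_wait_data(sk, timeo, last);
		}
	}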
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index adc3f3e9468c..770ce6cb3eca 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4611,22 +4611,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 			goto out_of_window;
 
 		/* Ok. In sequence. In window. */
-		if (tp->ucopy.task == current &&
-		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
-		    sock_owned_by_user(sk) && !tp->urg_data) {
-			int chunk = min_t(unsigned int, skb->len,
-					  tp->ucopy.len);
-
-			__set_current_state(TASK_RUNNING);
-
-			if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
-				tp->ucopy.len -= chunk;
-				tp->copied_seq += chunk;
-				eaten = (chunk == skb->len);
-				tcp_rcv_space_adjust(sk);
-			}
-		}
-
 		if (eaten <= 0) {
 queue_and_out:
 			if (eaten < 0) {
@@ -5186,26 +5170,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
 	}
 }
 
-static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	int chunk = skb->len - hlen;
-	int err;
-
-	if (skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
-	else
-		err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
-
-	if (!err) {
-		tp->ucopy.len -= chunk;
-		tp->copied_seq += chunk;
-		tcp_rcv_space_adjust(sk);
-	}
-
-	return err;
-}
-
 /* Accept RST for rcv_nxt - 1 after a FIN.
  * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
  * FIN is sent followed by a RST packet. The RST is sent with the same
@@ -5446,32 +5410,6 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			int eaten = 0;
 			bool fragstolen = false;
 
-			if (tp->ucopy.task == current &&
-			    tp->copied_seq == tp->rcv_nxt &&
-			    len - tcp_header_len <= tp->ucopy.len &&
-			    sock_owned_by_user(sk)) {
-				__set_current_state(TASK_RUNNING);
-
-				if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
-					/* Predicted packet is in window by definition.
-					 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
-					 * Hence, check seq<=rcv_wup reduces to:
-					 */
-					if (tcp_header_len ==
-					    (sizeof(struct tcphdr) +
-					     TCPOLEN_TSTAMP_ALIGNED) &&
-					    tp->rcv_nxt == tp->rcv_wup)
-						tcp_store_ts_recent(tp);
-
-					tcp_rcv_rtt_measure_ts(sk, skb);
-
-					__skb_pull(skb, tcp_header_len);
-					tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
-					NET_INC_STATS(sock_net(sk),
-						      LINUX_MIB_TCPHPHITSTOUSER);
-					eaten = 1;
-				}
-			}
 			if (!eaten) {
 				if (tcp_checksum_complete(skb))
 					goto csum_error;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3a19ea28339f..a68eb4577d36 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1541,61 +1541,6 @@ void tcp_v4_early_demux(struct sk_buff *skb)
 	}
 }
 
-/* Packet is added to VJ-style prequeue for processing in process
- * context, if a reader task is waiting. Apparently, this exciting
- * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
- * failed somewhere. Latency? Burstiness? Well, at least now we will
- * see, why it failed. 8)8) --ANK
- *
- */
-bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (sysctl_tcp_low_latency || !tp->ucopy.task)
-		return false;
-
-	if (skb->len <= tcp_hdrlen(skb) &&
-	    skb_queue_len(&tp->ucopy.prequeue) == 0)
-		return false;
-
-	/* Before escaping RCU protected region, we need to take care of skb
-	 * dst. Prequeue is only enabled for established sockets.
-	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
-	 * Instead of doing full sk_rx_dst validity here, let's perform
-	 * an optimistic check.
-	 */
-	if (likely(sk->sk_rx_dst))
-		skb_dst_drop(skb);
-	else
-		skb_dst_force_safe(skb);
-
-	__skb_queue_tail(&tp->ucopy.prequeue, skb);
-	tp->ucopy.memory += skb->truesize;
-	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
-	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
-		struct sk_buff *skb1;
-
-		BUG_ON(sock_owned_by_user(sk));
-		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
-				skb_queue_len(&tp->ucopy.prequeue));
-
-		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
-			sk_backlog_rcv(sk, skb1);
-
-		tp->ucopy.memory = 0;
-	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
-		wake_up_interruptible_sync_poll(sk_sleep(sk),
-						POLLIN | POLLRDNORM | POLLRDBAND);
-		if (!inet_csk_ack_scheduled(sk))
-			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
-						  (3 * tcp_rto_min(sk)) / 4,
-						  TCP_RTO_MAX);
-	}
-	return true;
-}
-EXPORT_SYMBOL(tcp_prequeue);
-
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
@@ -1770,8 +1715,7 @@ process:
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
-		if (!tcp_prequeue(sk, skb))
-			ret = tcp_v4_do_rcv(sk, skb);
+		ret = tcp_v4_do_rcv(sk, skb);
 	} else if (tcp_add_backlog(sk, skb)) {
 		goto discard_and_relse;
 	}
@@ -1936,9 +1880,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
 	}
 #endif
 
-	/* Clean prequeue, it must be empty really */
-	__skb_queue_purge(&tp->ucopy.prequeue);
-
 	/* Clean up a referenced TCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash)
 		inet_put_port(sk);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 0ff83c1637d8..188a6f31356d 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -445,7 +445,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->snd_sml = newtp->snd_una =
 		newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
 
-		tcp_prequeue_init(newtp);
 		INIT_LIST_HEAD(&newtp->tsq_node);
 
 		tcp_init_wl(newtp, treq->rcv_isn);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c0feeeef962a..f753f9d2fee3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -239,7 +239,6 @@ static int tcp_write_timeout(struct sock *sk)
 /* Called with BH disabled */
 void tcp_delack_timer_handler(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	sk_mem_reclaim_partial(sk);
@@ -254,17 +253,6 @@ void tcp_delack_timer_handler(struct sock *sk)
 	}
 	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
 
-	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
-		struct sk_buff *skb;
-
-		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
-
-		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
-			sk_backlog_rcv(sk, skb);
-
-		tp->ucopy.memory = 0;
-	}
-
 	if (inet_csk_ack_scheduled(sk)) {
 		if (!icsk->icsk_ack.pingpong) {
 			/* Delayed ACK missed: inflate ATO. */