author:    Florian Westphal <fw@strlen.de>  2017-07-29 21:57:18 -0400
committer: David S. Miller <davem@davemloft.net>  2017-07-31 17:37:49 -0400
commit:    e7942d0633c47c791ece6afa038be9cf977226de (patch)
tree:      27dddb46a5358137f6cb6e63bddab14a77a840ec /net/ipv4
parent:    764646b08d09d29adced740c26447ecdaabc9088 (diff)
tcp: remove prequeue support
prequeue is a tcp receive optimization that moves part of rx processing from bh to process context.

This only works if the socket being processed belongs to a process that is blocked in recv on that socket.

In practice, this doesn't happen that often anymore, because nowadays servers tend to use an event-driven (epoll) model, and even normal client applications (web browsers) commonly use many tcp connections in parallel.

This has a measurable impact only in netperf (which uses plain recv and thus allows prequeue use) from host to a locally running vm (~4%); there were no changes when using netperf between two physical hosts with ixgbe interfaces.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
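For orientation, the behavioural core of the change is visible in the tcp_ipv4.c hunk below: when the socket is not owned by a user-space task, an incoming segment is now always handed directly to tcp_v4_do_rcv() instead of possibly being parked on the per-socket prequeue for a blocked reader. A minimal, schematic C sketch of that dispatch decision (the callee names are taken from the diff; the wrapper function itself is hypothetical, and locking, statistics and the refcount/drop paths are omitted):

	#include <net/tcp.h>	/* tcp_v4_do_rcv(), tcp_add_backlog() */

	/* Hypothetical helper, only to illustrate the post-patch dispatch. */
	static void rx_dispatch_sketch(struct sock *sk, struct sk_buff *skb)
	{
		if (!sock_owned_by_user(sk)) {
			/* Pre-patch, tcp_prequeue() could queue the skb on
			 * tp->ucopy.prequeue when a reader was blocked in
			 * recv(); post-patch the segment is processed here
			 * directly in softirq context.
			 */
			tcp_v4_do_rcv(sk, skb);
		} else if (tcp_add_backlog(sk, skb)) {
			/* Socket owned by a process and backlog limit
			 * exceeded; the real code jumps to discard_and_relse.
			 */
			kfree_skb(skb);
		}
	}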
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/tcp.c            105
-rw-r--r--  net/ipv4/tcp_input.c       62
-rw-r--r--  net/ipv4/tcp_ipv4.c        61
-rw-r--r--  net/ipv4/tcp_minisocks.c    1
-rw-r--r--  net/ipv4/tcp_timer.c       12
5 files changed, 1 insertion(+), 240 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 71ce33decd97..62018ea6f45f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -400,7 +400,6 @@ void tcp_init_sock(struct sock *sk)
 
 	tp->out_of_order_queue = RB_ROOT;
 	tcp_init_xmit_timers(sk);
-	tcp_prequeue_init(tp);
 	INIT_LIST_HEAD(&tp->tsq_node);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
@@ -1525,20 +1524,6 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
 		tcp_send_ack(sk);
 }
 
-static void tcp_prequeue_process(struct sock *sk)
-{
-	struct sk_buff *skb;
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
-
-	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
-		sk_backlog_rcv(sk, skb);
-
-	/* Clear memory counter. */
-	tp->ucopy.memory = 0;
-}
-
 static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 {
 	struct sk_buff *skb;
@@ -1671,7 +1656,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	int err;
 	int target;		/* Read at least this many bytes */
 	long timeo;
-	struct task_struct *user_recv = NULL;
 	struct sk_buff *skb, *last;
 	u32 urg_hole = 0;
 
@@ -1806,51 +1790,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 
 		tcp_cleanup_rbuf(sk, copied);
 
-		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
-			/* Install new reader */
-			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
-				user_recv = current;
-				tp->ucopy.task = user_recv;
-				tp->ucopy.msg = msg;
-			}
-
-			tp->ucopy.len = len;
-
-			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
-				!(flags & (MSG_PEEK | MSG_TRUNC)));
-
-			/* Ugly... If prequeue is not empty, we have to
-			 * process it before releasing socket, otherwise
-			 * order will be broken at second iteration.
-			 * More elegant solution is required!!!
-			 *
-			 * Look: we have the following (pseudo)queues:
-			 *
-			 * 1. packets in flight
-			 * 2. backlog
-			 * 3. prequeue
-			 * 4. receive_queue
-			 *
-			 * Each queue can be processed only if the next ones
-			 * are empty. At this point we have empty receive_queue.
-			 * But prequeue _can_ be not empty after 2nd iteration,
-			 * when we jumped to start of loop because backlog
-			 * processing added something to receive_queue.
-			 * We cannot release_sock(), because backlog contains
-			 * packets arrived _after_ prequeued ones.
-			 *
-			 * Shortly, algorithm is clear --- to process all
-			 * the queues in order. We could make it more directly,
-			 * requeueing packets from backlog to prequeue, if
-			 * is not empty. It is more elegant, but eats cycles,
-			 * unfortunately.
-			 */
-			if (!skb_queue_empty(&tp->ucopy.prequeue))
-				goto do_prequeue;
-
-			/* __ Set realtime policy in scheduler __ */
-		}
-
 		if (copied >= target) {
 			/* Do not sleep, just process backlog. */
 			release_sock(sk);
@@ -1859,31 +1798,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 			sk_wait_data(sk, &timeo, last);
 		}
 
-		if (user_recv) {
-			int chunk;
-
-			/* __ Restore normal policy in scheduler __ */
-
-			chunk = len - tp->ucopy.len;
-			if (chunk != 0) {
-				NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
-				len -= chunk;
-				copied += chunk;
-			}
-
-			if (tp->rcv_nxt == tp->copied_seq &&
-			    !skb_queue_empty(&tp->ucopy.prequeue)) {
-do_prequeue:
-				tcp_prequeue_process(sk);
-
-				chunk = len - tp->ucopy.len;
-				if (chunk != 0) {
-					NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
-					len -= chunk;
-					copied += chunk;
-				}
-			}
-		}
 		if ((flags & MSG_PEEK) &&
 		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
 			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
@@ -1955,25 +1869,6 @@ skip_copy:
 			break;
 	} while (len > 0);
 
-	if (user_recv) {
-		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
-			int chunk;
-
-			tp->ucopy.len = copied > 0 ? len : 0;
-
-			tcp_prequeue_process(sk);
-
-			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
-				NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
-				len -= chunk;
-				copied += chunk;
-			}
-		}
-
-		tp->ucopy.task = NULL;
-		tp->ucopy.len = 0;
-	}
-
 	/* According to UNIX98, msg_name/msg_namelen are ignored
 	 * on connected socket. I was just happy when found this 8) --ANK
 	 */
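The long comment removed in the tcp_recvmsg() hunk above describes why the old code had to drain four (pseudo)queues in order: packets in flight, backlog, prequeue and receive_queue. With the prequeue gone, the blocking step of the loop reduces to the context lines this hunk keeps; roughly, as a schematic sketch (the wrapper and its parameter list are invented for illustration, the calls inside it match the surviving code):

	/* Schematic sketch of the post-patch wait step of tcp_recvmsg(). */
	static void recvmsg_wait_sketch(struct sock *sk, int copied, int target,
					long *timeo, struct sk_buff *last)
	{
		if (copied >= target) {
			/* Enough data copied: do not sleep, just let
			 * release_sock() run the backlog via sk_backlog_rcv().
			 */
			release_sock(sk);
			lock_sock(sk);
		} else {
			/* Sleep for more data; the socket lock is dropped
			 * while waiting, so backlogged segments still get
			 * processed without a separate prequeue pass.
			 */
			sk_wait_data(sk, timeo, last);
		}
	}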
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index adc3f3e9468c..770ce6cb3eca 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4611,22 +4611,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 			goto out_of_window;
 
 		/* Ok. In sequence. In window. */
-		if (tp->ucopy.task == current &&
-		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
-		    sock_owned_by_user(sk) && !tp->urg_data) {
-			int chunk = min_t(unsigned int, skb->len,
-					  tp->ucopy.len);
-
-			__set_current_state(TASK_RUNNING);
-
-			if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
-				tp->ucopy.len -= chunk;
-				tp->copied_seq += chunk;
-				eaten = (chunk == skb->len);
-				tcp_rcv_space_adjust(sk);
-			}
-		}
-
 		if (eaten <= 0) {
 queue_and_out:
 			if (eaten < 0) {
@@ -5186,26 +5170,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
 	}
 }
 
-static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	int chunk = skb->len - hlen;
-	int err;
-
-	if (skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
-	else
-		err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
-
-	if (!err) {
-		tp->ucopy.len -= chunk;
-		tp->copied_seq += chunk;
-		tcp_rcv_space_adjust(sk);
-	}
-
-	return err;
-}
-
 /* Accept RST for rcv_nxt - 1 after a FIN.
  * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
  * FIN is sent followed by a RST packet. The RST is sent with the same
@@ -5446,32 +5410,6 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			int eaten = 0;
 			bool fragstolen = false;
 
-			if (tp->ucopy.task == current &&
-			    tp->copied_seq == tp->rcv_nxt &&
-			    len - tcp_header_len <= tp->ucopy.len &&
-			    sock_owned_by_user(sk)) {
-				__set_current_state(TASK_RUNNING);
-
-				if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
-					/* Predicted packet is in window by definition.
-					 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
-					 * Hence, check seq<=rcv_wup reduces to:
-					 */
-					if (tcp_header_len ==
-					    (sizeof(struct tcphdr) +
-					     TCPOLEN_TSTAMP_ALIGNED) &&
-					    tp->rcv_nxt == tp->rcv_wup)
-						tcp_store_ts_recent(tp);
-
-					tcp_rcv_rtt_measure_ts(sk, skb);
-
-					__skb_pull(skb, tcp_header_len);
-					tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
-					NET_INC_STATS(sock_net(sk),
-						      LINUX_MIB_TCPHPHITSTOUSER);
-					eaten = 1;
-				}
-			}
 			if (!eaten) {
 				if (tcp_checksum_complete(skb))
 					goto csum_error;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3a19ea28339f..a68eb4577d36 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1541,61 +1541,6 @@ void tcp_v4_early_demux(struct sk_buff *skb)
 	}
 }
 
-/* Packet is added to VJ-style prequeue for processing in process
- * context, if a reader task is waiting. Apparently, this exciting
- * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
- * failed somewhere. Latency? Burstiness? Well, at least now we will
- * see, why it failed. 8)8) --ANK
- *
- */
-bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (sysctl_tcp_low_latency || !tp->ucopy.task)
-		return false;
-
-	if (skb->len <= tcp_hdrlen(skb) &&
-	    skb_queue_len(&tp->ucopy.prequeue) == 0)
-		return false;
-
-	/* Before escaping RCU protected region, we need to take care of skb
-	 * dst. Prequeue is only enabled for established sockets.
-	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
-	 * Instead of doing full sk_rx_dst validity here, let's perform
-	 * an optimistic check.
-	 */
-	if (likely(sk->sk_rx_dst))
-		skb_dst_drop(skb);
-	else
-		skb_dst_force_safe(skb);
-
-	__skb_queue_tail(&tp->ucopy.prequeue, skb);
-	tp->ucopy.memory += skb->truesize;
-	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
-	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
-		struct sk_buff *skb1;
-
-		BUG_ON(sock_owned_by_user(sk));
-		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
-				skb_queue_len(&tp->ucopy.prequeue));
-
-		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
-			sk_backlog_rcv(sk, skb1);
-
-		tp->ucopy.memory = 0;
-	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
-		wake_up_interruptible_sync_poll(sk_sleep(sk),
-						POLLIN | POLLRDNORM | POLLRDBAND);
-		if (!inet_csk_ack_scheduled(sk))
-			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
-						  (3 * tcp_rto_min(sk)) / 4,
-						  TCP_RTO_MAX);
-	}
-	return true;
-}
-EXPORT_SYMBOL(tcp_prequeue);
-
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
@@ -1770,8 +1715,7 @@ process:
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
-		if (!tcp_prequeue(sk, skb))
-			ret = tcp_v4_do_rcv(sk, skb);
+		ret = tcp_v4_do_rcv(sk, skb);
 	} else if (tcp_add_backlog(sk, skb)) {
 		goto discard_and_relse;
 	}
@@ -1936,9 +1880,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
 	}
 #endif
 
-	/* Clean prequeue, it must be empty really */
-	__skb_queue_purge(&tp->ucopy.prequeue);
-
 	/* Clean up a referenced TCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash)
 		inet_put_port(sk);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 0ff83c1637d8..188a6f31356d 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -445,7 +445,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->snd_sml = newtp->snd_una =
 		newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
 
-		tcp_prequeue_init(newtp);
 		INIT_LIST_HEAD(&newtp->tsq_node);
 
 		tcp_init_wl(newtp, treq->rcv_isn);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c0feeeef962a..f753f9d2fee3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -239,7 +239,6 @@ static int tcp_write_timeout(struct sock *sk)
 /* Called with BH disabled */
 void tcp_delack_timer_handler(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	sk_mem_reclaim_partial(sk);
@@ -254,17 +253,6 @@ void tcp_delack_timer_handler(struct sock *sk)
 	}
 	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
 
-	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
-		struct sk_buff *skb;
-
-		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
-
-		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
-			sk_backlog_rcv(sk, skb);
-
-		tp->ucopy.memory = 0;
-	}
-
 	if (inet_csk_ack_scheduled(sk)) {
 		if (!icsk->icsk_ack.pingpong) {
 			/* Delayed ACK missed: inflate ATO. */