Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--	net/ipv4/tcp.c	67
1 file changed, 47 insertions(+), 20 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8e053ad7cae2..c8ed3a04b504 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,8 +283,6 @@
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
 
-#include <trace/events/tcp.h>
-
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
@@ -465,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
 	tcp_mtup_init(sk);
 	icsk->icsk_af_ops->rebuild_header(sk);
 	tcp_init_metrics(sk);
-	tcp_call_bpf(sk, bpf_op);
+	tcp_call_bpf(sk, bpf_op, 0, NULL);
 	tcp_init_congestion_control(sk);
 	tcp_init_buffer_space(sk);
 }
@@ -500,11 +498,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int state;
 
-	sock_rps_record_flow(sk);
-
 	sock_poll_wait(file, sk_sleep(sk), wait);
 
-	state = sk_state_load(sk);
+	state = inet_sk_state_load(sk);
 	if (state == TCP_LISTEN)
 		return inet_csk_listen_poll(sk);
 
@@ -1106,12 +1102,15 @@ static int linear_payload_sz(bool first_skb)
 	return 0;
 }
 
-static int select_size(const struct sock *sk, bool sg, bool first_skb)
+static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int tmp = tp->mss_cache;
 
 	if (sg) {
+		if (zc)
+			return 0;
+
 		if (sk_can_gso(sk)) {
 			tmp = linear_payload_sz(first_skb);
 		} else {
@@ -1188,7 +1187,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 	int flags, err, copied = 0;
 	int mss_now = 0, size_goal, copied_syn = 0;
 	bool process_backlog = false;
-	bool sg;
+	bool sg, zc = false;
 	long timeo;
 
 	flags = msg->msg_flags;
@@ -1206,7 +1205,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			goto out_err;
 		}
 
-		if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
+		zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG;
+		if (!zc)
 			uarg->zerocopy = 0;
 	}
 
@@ -1283,6 +1283,7 @@ restart:
 
 		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
 			bool first_skb;
+			int linear;
 
 new_segment:
 			/* Allocate new segment. If the interface is SG,
@@ -1296,9 +1297,8 @@ new_segment:
 				goto restart;
 			}
 			first_skb = tcp_rtx_and_write_queues_empty(sk);
-			skb = sk_stream_alloc_skb(sk,
-						  select_size(sk, sg, first_skb),
-						  sk->sk_allocation,
+			linear = select_size(sk, sg, first_skb, zc);
+			skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
 						  first_skb);
 			if (!skb)
 				goto wait_for_memory;
@@ -1327,13 +1327,13 @@ new_segment:
 		copy = msg_data_left(msg);
 
 		/* Where to copy to? */
-		if (skb_availroom(skb) > 0) {
+		if (skb_availroom(skb) > 0 && !zc) {
 			/* We have some space in skb head. Superb! */
 			copy = min_t(int, copy, skb_availroom(skb));
 			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
 			if (err)
 				goto do_fault;
-		} else if (!uarg || !uarg->zerocopy) {
+		} else if (!zc) {
 			bool merge = true;
 			int i = skb_shinfo(skb)->nr_frags;
 			struct page_frag *pfrag = sk_page_frag(sk);
@@ -1373,8 +1373,10 @@ new_segment:
 			pfrag->offset += copy;
 		} else {
 			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
-			if (err == -EMSGSIZE || err == -EEXIST)
+			if (err == -EMSGSIZE || err == -EEXIST) {
+				tcp_mark_push(tp, skb);
 				goto new_segment;
+			}
 			if (err < 0)
 				goto do_error;
 			copy = err;
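
The tcp_sendmsg_locked() changes above are what route MSG_ZEROCOPY traffic through the normal TCP send path: zc is true only when MSG_ZEROCOPY was requested and the route supports scatter-gather plus checksum offload, select_size() then returns 0 so new skbs carry no linear payload, and user pages are attached by skb_zerocopy_iter_stream() instead of being copied into page frags. A minimal user-space sketch of driving this path follows; it is illustrative only (not part of this patch) and assumes a connected TCP socket and uapi headers that define SO_ZEROCOPY and MSG_ZEROCOPY (v4.14 or later).

/* Illustrative user-space sketch only; not part of this patch. */
#include <errno.h>
#include <poll.h>
#include <stddef.h>
#include <sys/socket.h>

static int send_zerocopy(int fd, const void *buf, size_t len)
{
	struct pollfd pfd = { .fd = fd, .events = 0 };
	struct msghdr msg = { 0 };
	char control[128];
	int one = 1;

	/* Opt in once per socket before using MSG_ZEROCOPY. */
	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)))
		return -errno;

	/* The pages behind buf stay referenced until completion, so the
	 * buffer must not be reused before the notification arrives.
	 */
	if (send(fd, buf, len, MSG_ZEROCOPY) < 0)
		return -errno;

	/* Completions are delivered on the socket error queue and are
	 * signalled by POLLERR.
	 */
	if (poll(&pfd, 1, -1) < 0)
		return -errno;

	msg.msg_control = control;
	msg.msg_controllen = sizeof(control);
	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return -errno;

	/* The cmsg carries a sock_extended_err with ee_origin ==
	 * SO_EE_ORIGIN_ZEROCOPY describing which sends completed; the
	 * buffer may be reused afterwards.
	 */
	return 0;
}

For small writes the plain copy path is usually cheaper; the error-queue notification round trip only pays off for large buffers.
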
@@ -1731,8 +1733,8 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb,
 }
 
 /* Similar to __sock_recv_timestamp, but does not require an skb */
-void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
-			struct scm_timestamping *tss)
+static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+			       struct scm_timestamping *tss)
 {
 	struct timeval tv;
 	bool has_timestamping = false;
@@ -2040,7 +2042,29 @@ void tcp_set_state(struct sock *sk, int state)
 {
 	int oldstate = sk->sk_state;
 
-	trace_tcp_set_state(sk, oldstate, state);
+	/* We defined a new enum for TCP states that are exported in BPF
+	 * so as not force the internal TCP states to be frozen. The
+	 * following checks will detect if an internal state value ever
+	 * differs from the BPF value. If this ever happens, then we will
+	 * need to remap the internal value to the BPF value before calling
+	 * tcp_call_bpf_2arg.
+	 */
+	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
+	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
+	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
+	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
 
 	switch (state) {
 	case TCP_ESTABLISHED:
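
The hook added to tcp_set_state() is opt-in: tcp_call_bpf_2arg() only runs when a sockops program has set BPF_SOCK_OPS_STATE_CB_FLAG on the socket, and the old/new states are reported using the BPF_TCP_* values that the BUILD_BUG_ON() lines pin to the kernel's TCP_* numbering. A rough sketch of a cgroup sockops program consuming the callback might look like the following; it is not part of this patch and assumes the BPF_SOCK_OPS_* UAPI introduced alongside this change plus libbpf's bpf_helpers.h.

/* Sketch of a sockops program using the new state callback; assumes
 * the BPF_SOCK_OPS_* UAPI from this series. Not part of the patch.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sockops")
int tcp_state_monitor(struct bpf_sock_ops *skops)
{
	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* Opt in: tcp_set_state() only calls back once
		 * BPF_SOCK_OPS_STATE_CB_FLAG is set on the socket.
		 */
		bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG);
		break;
	case BPF_SOCK_OPS_STATE_CB:
		/* args[0] = old state, args[1] = new state, both using the
		 * BPF_TCP_* values checked by the BUILD_BUG_ON()s above.
		 */
		if (skops->args[1] == BPF_TCP_CLOSE)
			bpf_printk("socket moved to CLOSE\n");
		break;
	}
	return 1;
}

char _license[] SEC("license") = "GPL";

Such a program is attached with attach type BPF_CGROUP_SOCK_OPS, after which every TCP state transition for sockets in that cgroup reaches the BPF_SOCK_OPS_STATE_CB branch.
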
@@ -2065,7 +2089,7 @@ void tcp_set_state(struct sock *sk, int state)
 	/* Change state AFTER socket is unhashed to avoid closed
 	 * socket sitting in hash tables.
 	 */
-	sk_state_store(sk, state);
+	inet_sk_state_store(sk, state);
 
 #ifdef STATE_TRACE
 	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
@@ -2923,7 +2947,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	if (sk->sk_type != SOCK_STREAM)
 		return;
 
-	info->tcpi_state = sk_state_load(sk);
+	info->tcpi_state = inet_sk_state_load(sk);
 
 	/* Report meaningful fields for all TCP states, including listeners */
 	rate = READ_ONCE(sk->sk_pacing_rate);
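
The sk_state_load()/sk_state_store() accessors being replaced here are the acquire/release wrappers from include/net/sock.h; the inet_sk_state_load()/inet_sk_state_store() pair keeps the same memory-ordering contract but moves it into the inet layer, where the store side also emits the new inet_sock_set_state tracepoint (which is why trace_tcp_set_state() and the trace/events/tcp.h include disappear above). Roughly, as a sketch rather than a verbatim copy from this tree:

/* Approximate shape of the new accessors (sketch, not verbatim). */
static inline int inet_sk_state_load(const struct sock *sk)
{
	/* Pairs with the release in inet_sk_state_store(), so lockless
	 * readers such as tcp_poll() and tcp_get_info() see a stable
	 * sk_state value.
	 */
	return smp_load_acquire(&sk->sk_state);
}

void inet_sk_state_store(struct sock *sk, int newstate)
{
	trace_inet_sock_set_state(sk, sk->sk_state, newstate);
	smp_store_release(&sk->sk_state, newstate);
}
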
@@ -3581,6 +3605,9 @@ void __init tcp_init(void)
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
 	inet_hashinfo_init(&tcp_hashinfo);
+	inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+			    thash_entries, 21, /* one slot per 2 MB*/
+			    0, 64 * 1024);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,