Diffstat (limited to 'net/ipv4/tcp.c'):

 net/ipv4/tcp.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 20 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8e053ad7cae2..c8ed3a04b504 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,8 +283,6 @@
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
 
-#include <trace/events/tcp.h>
-
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
@@ -465,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
         tcp_mtup_init(sk);
         icsk->icsk_af_ops->rebuild_header(sk);
         tcp_init_metrics(sk);
-        tcp_call_bpf(sk, bpf_op);
+        tcp_call_bpf(sk, bpf_op, 0, NULL);
         tcp_init_congestion_control(sk);
         tcp_init_buffer_space(sk);
 }
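
A note on the tcp_call_bpf() change above: the helper gains a count and an array of u32 arguments so call sites can pass operation-specific values down to BPF sockops programs; callers with nothing extra to report, like tcp_init_transfer() here, pass 0 and NULL. A minimal sketch of the two-argument wrapper used by a later hunk of this diff, assumed (per the same patch series) to live in include/net/tcp.h:

    /* Sketch: tcp_call_bpf() copies nargs u32 values into the sock_ops
     * context before the sockops program runs; this wrapper packs two.
     */
    static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
    {
            u32 args[2] = {arg1, arg2};

            return tcp_call_bpf(sk, op, 2, args);
    }
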
@@ -500,11 +498,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
         const struct tcp_sock *tp = tcp_sk(sk);
         int state;
 
-        sock_rps_record_flow(sk);
-
         sock_poll_wait(file, sk_sleep(sk), wait);
 
-        state = sk_state_load(sk);
+        state = inet_sk_state_load(sk);
         if (state == TCP_LISTEN)
                 return inet_csk_listen_poll(sk);
 
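
The sk_state_load() to inet_sk_state_load() switch here (and the matching store in tcp_set_state() further down) renames the lockless state accessors and moves them from the generic socket layer into the inet layer. As best I recall the include/net/inet_sock.h side of that series, which is not part of this diff, they are a plain acquire/release pair:

    /* Sketch of the renamed accessors: the acquire/release pairing lets
     * lockless readers such as tcp_poll() observe a consistent sk_state
     * without taking the socket lock.
     */
    static inline int inet_sk_state_load(const struct sock *sk)
    {
            return smp_load_acquire(&sk->sk_state);
    }

    void inet_sk_state_store(struct sock *sk, int newstate); /* smp_store_release() */
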
@@ -1106,12 +1102,15 @@ static int linear_payload_sz(bool first_skb)
         return 0;
 }
 
-static int select_size(const struct sock *sk, bool sg, bool first_skb)
+static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc)
 {
         const struct tcp_sock *tp = tcp_sk(sk);
         int tmp = tp->mss_cache;
 
         if (sg) {
+                if (zc)
+                        return 0;
+
                 if (sk_can_gso(sk)) {
                         tmp = linear_payload_sz(first_skb);
                 } else {
@@ -1188,7 +1187,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
         int flags, err, copied = 0;
         int mss_now = 0, size_goal, copied_syn = 0;
         bool process_backlog = false;
-        bool sg;
+        bool sg, zc = false;
         long timeo;
 
         flags = msg->msg_flags;
@@ -1206,7 +1205,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
                         goto out_err;
                 }
 
-                if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
+                zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG;
+                if (!zc)
                         uarg->zerocopy = 0;
         }
 
@@ -1283,6 +1283,7 @@ restart:
 
                 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
                         bool first_skb;
+                        int linear;
 
 new_segment:
                         /* Allocate new segment. If the interface is SG,
@@ -1296,9 +1297,8 @@ new_segment:
                                 goto restart;
                         }
                         first_skb = tcp_rtx_and_write_queues_empty(sk);
-                        skb = sk_stream_alloc_skb(sk,
-                                                  select_size(sk, sg, first_skb),
-                                                  sk->sk_allocation,
+                        linear = select_size(sk, sg, first_skb, zc);
+                        skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
                                                   first_skb);
                         if (!skb)
                                 goto wait_for_memory;
@@ -1327,13 +1327,13 @@ new_segment:
                 copy = msg_data_left(msg);
 
                 /* Where to copy to? */
-                if (skb_availroom(skb) > 0) {
+                if (skb_availroom(skb) > 0 && !zc) {
                         /* We have some space in skb head. Superb! */
                         copy = min_t(int, copy, skb_availroom(skb));
                         err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
                         if (err)
                                 goto do_fault;
-                } else if (!uarg || !uarg->zerocopy) {
+                } else if (!zc) {
                         bool merge = true;
                         int i = skb_shinfo(skb)->nr_frags;
                         struct page_frag *pfrag = sk_page_frag(sk);
@@ -1373,8 +1373,10 @@ new_segment:
                         pfrag->offset += copy;
                 } else {
                         err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
-                        if (err == -EMSGSIZE || err == -EEXIST)
+                        if (err == -EMSGSIZE || err == -EEXIST) {
+                                tcp_mark_push(tp, skb);
                                 goto new_segment;
+                        }
                         if (err < 0)
                                 goto do_error;
                         copy = err;
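
The zc plumbing in the hunks above is what lets MSG_ZEROCOPY payloads bypass both the skb head and the page-frag copy path on TCP. For orientation, userspace drives the feature roughly as follows; this is a sketch based on Documentation/networking/msg_zerocopy.txt, assuming headers that define SO_ZEROCOPY and MSG_ZEROCOPY, with error handling trimmed:

    #include <errno.h>
    #include <sys/socket.h>
    #include <linux/errqueue.h>

    /* Sketch: zerocopy send on an already-connected TCP socket. The buffer
     * must stay untouched until the kernel signals completion on the error
     * queue (a sock_extended_err with ee_origin == SO_EE_ORIGIN_ZEROCOPY).
     */
    static int send_zerocopy(int fd, const char *buf, size_t len)
    {
            int one = 1;
            char control[128];
            struct msghdr msg = { .msg_control = control,
                                  .msg_controllen = sizeof(control) };

            if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)))
                    return -1;
            if (send(fd, buf, len, MSG_ZEROCOPY) != (ssize_t)len)
                    return -1;

            /* Reads of the error queue never block; real code should
             * poll() for POLLERR rather than spin like this. */
            while (recvmsg(fd, &msg, MSG_ERRQUEUE) == -1 && errno == EAGAIN)
                    ;
            return 0;
    }
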
@@ -1731,8 +1733,8 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb,
 }
 
 /* Similar to __sock_recv_timestamp, but does not require an skb */
-void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
-                        struct scm_timestamping *tss)
+static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+                               struct scm_timestamping *tss)
 {
         struct timeval tv;
         bool has_timestamping = false;
@@ -2040,7 +2042,29 @@ void tcp_set_state(struct sock *sk, int state)
 {
         int oldstate = sk->sk_state;
 
-        trace_tcp_set_state(sk, oldstate, state);
+        /* We defined a new enum for TCP states that are exported in BPF
+         * so as not to force the internal TCP states to be frozen. The
+         * following checks will detect if an internal state value ever
+         * differs from the BPF value. If this ever happens, then we will
+         * need to remap the internal value to the BPF value before calling
+         * tcp_call_bpf_2arg.
+         */
+        BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
+        BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
+        BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
+        BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
+        BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
+        BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
+        BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
+        BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
+        BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
+        BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
+        BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
+        BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+        BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
+
+        if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+                tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
 
         switch (state) {
         case TCP_ESTABLISHED:
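
The BPF_SOCK_OPS_STATE_CB hook added above only fires for sockets whose sockops program has opted in by setting BPF_SOCK_OPS_STATE_CB_FLAG. A hypothetical consumer, sketched against the sockops API from the same patch series (program name invented; assumes libbpf's bpf_helpers.h):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    /* Hypothetical sockops sketch: request state callbacks once the
     * connection is established, then log every transition. args[0] and
     * args[1] carry oldstate/newstate as packed by tcp_call_bpf_2arg().
     */
    SEC("sockops")
    int log_tcp_states(struct bpf_sock_ops *skops)
    {
            switch (skops->op) {
            case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
            case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
                    bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG);
                    break;
            case BPF_SOCK_OPS_STATE_CB:
                    bpf_printk("tcp state %d -> %d", skops->args[0], skops->args[1]);
                    break;
            }
            return 1;
    }

    char _license[] SEC("license") = "GPL";
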
@@ -2065,7 +2089,7 @@ void tcp_set_state(struct sock *sk, int state)
         /* Change state AFTER socket is unhashed to avoid closed
          * socket sitting in hash tables.
          */
-        sk_state_store(sk, state);
+        inet_sk_state_store(sk, state);
 
 #ifdef STATE_TRACE
         SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
@@ -2923,7 +2947,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
         if (sk->sk_type != SOCK_STREAM)
                 return;
 
-        info->tcpi_state = sk_state_load(sk);
+        info->tcpi_state = inet_sk_state_load(sk);
 
         /* Report meaningful fields for all TCP states, including listeners */
         rate = READ_ONCE(sk->sk_pacing_rate);
@@ -3581,6 +3605,9 @@ void __init tcp_init(void)
         percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
         percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
         inet_hashinfo_init(&tcp_hashinfo);
+        inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+                            thash_entries, 21, /* one slot per 2 MB */
+                            0, 64 * 1024);
         tcp_hashinfo.bind_bucket_cachep =
                 kmem_cache_create("tcp_bind_bucket",
                                   sizeof(struct inet_bind_bucket), 0,