diff options
author | Eric Dumazet <edumazet@google.com> | 2019-10-10 23:17:39 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2019-10-13 13:13:08 -0400 |
commit | dba7d9b8c739df27ff3a234c81d6c6b23e3986fa (patch) | |
tree | 2550420425234768ffbfcdbe2295d1a8b045dfe5 | |
parent | d983ea6f16b835dcde2ee9a58a1e764ce68bfccc (diff) |
tcp: annotate tp->rcv_nxt lockless reads
There are few places where we fetch tp->rcv_nxt while
this field can change from IRQ or other cpu.
We need to add READ_ONCE() annotations, and also make
sure write sides use corresponding WRITE_ONCE() to avoid
store-tearing.
Note that tcp_inq_hint() was already using READ_ONCE(tp->rcv_nxt)
syzbot reported :
BUG: KCSAN: data-race in tcp_poll / tcp_queue_rcv
write to 0xffff888120425770 of 4 bytes by interrupt on cpu 0:
tcp_rcv_nxt_update net/ipv4/tcp_input.c:3365 [inline]
tcp_queue_rcv+0x180/0x380 net/ipv4/tcp_input.c:4638
tcp_rcv_established+0xbf1/0xf50 net/ipv4/tcp_input.c:5616
tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1542
tcp_v4_rcv+0x1a03/0x1bf0 net/ipv4/tcp_ipv4.c:1923
ip_protocol_deliver_rcu+0x51/0x470 net/ipv4/ip_input.c:204
ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231
NF_HOOK include/linux/netfilter.h:305 [inline]
NF_HOOK include/linux/netfilter.h:299 [inline]
ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252
dst_input include/net/dst.h:442 [inline]
ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413
NF_HOOK include/linux/netfilter.h:305 [inline]
NF_HOOK include/linux/netfilter.h:299 [inline]
ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523
__netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5004
__netif_receive_skb+0x37/0xf0 net/core/dev.c:5118
netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5208
napi_skb_finish net/core/dev.c:5671 [inline]
napi_gro_receive+0x28f/0x330 net/core/dev.c:5704
receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061
read to 0xffff888120425770 of 4 bytes by task 7254 on cpu 1:
tcp_stream_is_readable net/ipv4/tcp.c:480 [inline]
tcp_poll+0x204/0x6b0 net/ipv4/tcp.c:554
sock_poll+0xed/0x250 net/socket.c:1256
vfs_poll include/linux/poll.h:90 [inline]
ep_item_poll.isra.0+0x90/0x190 fs/eventpoll.c:892
ep_send_events_proc+0x113/0x5c0 fs/eventpoll.c:1749
ep_scan_ready_list.constprop.0+0x189/0x500 fs/eventpoll.c:704
ep_send_events fs/eventpoll.c:1793 [inline]
ep_poll+0xe3/0x900 fs/eventpoll.c:1930
do_epoll_wait+0x162/0x180 fs/eventpoll.c:2294
__do_sys_epoll_pwait fs/eventpoll.c:2325 [inline]
__se_sys_epoll_pwait fs/eventpoll.c:2311 [inline]
__x64_sys_epoll_pwait+0xcd/0x170 fs/eventpoll.c:2311
do_syscall_64+0xcf/0x2f0 arch/x86/entry/common.c:296
entry_SYSCALL_64_after_hwframe+0x44/0xa9
Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 7254 Comm: syz-fuzzer Not tainted 5.3.0+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/ipv4/tcp.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_diag.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 6 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 7 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 3 |
6 files changed, 15 insertions, 10 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c59d0bd29c5c..883ee863db43 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -477,7 +477,7 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) | |||
477 | static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, | 477 | static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, |
478 | int target, struct sock *sk) | 478 | int target, struct sock *sk) |
479 | { | 479 | { |
480 | return (tp->rcv_nxt - tp->copied_seq >= target) || | 480 | return (READ_ONCE(tp->rcv_nxt) - tp->copied_seq >= target) || |
481 | (sk->sk_prot->stream_memory_read ? | 481 | (sk->sk_prot->stream_memory_read ? |
482 | sk->sk_prot->stream_memory_read(sk) : false); | 482 | sk->sk_prot->stream_memory_read(sk) : false); |
483 | } | 483 | } |
@@ -2935,7 +2935,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2935 | else if (tp->repair_queue == TCP_SEND_QUEUE) | 2935 | else if (tp->repair_queue == TCP_SEND_QUEUE) |
2936 | tp->write_seq = val; | 2936 | tp->write_seq = val; |
2937 | else if (tp->repair_queue == TCP_RECV_QUEUE) | 2937 | else if (tp->repair_queue == TCP_RECV_QUEUE) |
2938 | tp->rcv_nxt = val; | 2938 | WRITE_ONCE(tp->rcv_nxt, val); |
2939 | else | 2939 | else |
2940 | err = -EINVAL; | 2940 | err = -EINVAL; |
2941 | break; | 2941 | break; |
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 81a8221d650a..cd219161f106 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c | |||
@@ -26,7 +26,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, | |||
26 | } else if (sk->sk_type == SOCK_STREAM) { | 26 | } else if (sk->sk_type == SOCK_STREAM) { |
27 | const struct tcp_sock *tp = tcp_sk(sk); | 27 | const struct tcp_sock *tp = tcp_sk(sk); |
28 | 28 | ||
29 | r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); | 29 | r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - tp->copied_seq, 0); |
30 | r->idiag_wqueue = tp->write_seq - tp->snd_una; | 30 | r->idiag_wqueue = tp->write_seq - tp->snd_una; |
31 | } | 31 | } |
32 | if (info) | 32 | if (info) |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5f9b102c3b55..5b7c8768ed5f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -3362,7 +3362,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) | |||
3362 | 3362 | ||
3363 | sock_owned_by_me((struct sock *)tp); | 3363 | sock_owned_by_me((struct sock *)tp); |
3364 | tp->bytes_received += delta; | 3364 | tp->bytes_received += delta; |
3365 | tp->rcv_nxt = seq; | 3365 | WRITE_ONCE(tp->rcv_nxt, seq); |
3366 | } | 3366 | } |
3367 | 3367 | ||
3368 | /* Update our send window. | 3368 | /* Update our send window. |
@@ -5932,7 +5932,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5932 | /* Ok.. it's good. Set up sequence numbers and | 5932 | /* Ok.. it's good. Set up sequence numbers and |
5933 | * move to established. | 5933 | * move to established. |
5934 | */ | 5934 | */ |
5935 | tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; | 5935 | WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); |
5936 | tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; | 5936 | tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; |
5937 | 5937 | ||
5938 | /* RFC1323: The window in SYN & SYN/ACK segments is | 5938 | /* RFC1323: The window in SYN & SYN/ACK segments is |
@@ -6035,7 +6035,7 @@ discard: | |||
6035 | tp->tcp_header_len = sizeof(struct tcphdr); | 6035 | tp->tcp_header_len = sizeof(struct tcphdr); |
6036 | } | 6036 | } |
6037 | 6037 | ||
6038 | tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; | 6038 | WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); |
6039 | tp->copied_seq = tp->rcv_nxt; | 6039 | tp->copied_seq = tp->rcv_nxt; |
6040 | tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; | 6040 | tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; |
6041 | 6041 | ||
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ffa366099eb2..5089dd6bee0f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -2455,7 +2455,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) | |||
2455 | /* Because we don't lock the socket, | 2455 | /* Because we don't lock the socket, |
2456 | * we might find a transient negative value. | 2456 | * we might find a transient negative value. |
2457 | */ | 2457 | */ |
2458 | rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); | 2458 | rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - |
2459 | tp->copied_seq, 0); | ||
2459 | 2460 | ||
2460 | seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " | 2461 | seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " |
2461 | "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", | 2462 | "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d", |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 5401dbd39c8f..adc6ce486a38 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -462,6 +462,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, | |||
462 | struct tcp_request_sock *treq = tcp_rsk(req); | 462 | struct tcp_request_sock *treq = tcp_rsk(req); |
463 | struct inet_connection_sock *newicsk; | 463 | struct inet_connection_sock *newicsk; |
464 | struct tcp_sock *oldtp, *newtp; | 464 | struct tcp_sock *oldtp, *newtp; |
465 | u32 seq; | ||
465 | 466 | ||
466 | if (!newsk) | 467 | if (!newsk) |
467 | return NULL; | 468 | return NULL; |
@@ -475,8 +476,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, | |||
475 | /* Now setup tcp_sock */ | 476 | /* Now setup tcp_sock */ |
476 | newtp->pred_flags = 0; | 477 | newtp->pred_flags = 0; |
477 | 478 | ||
478 | newtp->rcv_wup = newtp->copied_seq = | 479 | seq = treq->rcv_isn + 1; |
479 | newtp->rcv_nxt = treq->rcv_isn + 1; | 480 | newtp->rcv_wup = seq; |
481 | newtp->copied_seq = seq; | ||
482 | WRITE_ONCE(newtp->rcv_nxt, seq); | ||
480 | newtp->segs_in = 1; | 483 | newtp->segs_in = 1; |
481 | 484 | ||
482 | newtp->snd_sml = newtp->snd_una = | 485 | newtp->snd_sml = newtp->snd_una = |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 45a95e032bdf..89ea0a7018b5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -1895,7 +1895,8 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) | |||
1895 | /* Because we don't lock the socket, | 1895 | /* Because we don't lock the socket, |
1896 | * we might find a transient negative value. | 1896 | * we might find a transient negative value. |
1897 | */ | 1897 | */ |
1898 | rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); | 1898 | rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - |
1899 | tp->copied_seq, 0); | ||
1899 | 1900 | ||
1900 | seq_printf(seq, | 1901 | seq_printf(seq, |
1901 | "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " | 1902 | "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " |