diff options
author | Jiri Olsa <jolsa@redhat.com> | 2009-07-08 08:09:13 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2009-07-09 20:06:57 -0400 |
commit | a57de0b4336e48db2811a2030bb68dba8dd09d88 (patch) | |
tree | a01c189d5fd55c69c9e2e842241e84b46728bc60 | |
parent | 1b614fb9a00e97b1eab54d4e442d405229c059dd (diff) |
net: adding memory barrier to the poll and receive callbacks
Adding memory barrier after the poll_wait function, paired with
receive callbacks. Adding fuctions sock_poll_wait and sk_has_sleeper
to wrap the memory barrier.
Without the memory barrier, following race can happen.
The race fires, when following code paths meet, and the tp->rcv_nxt
and __add_wait_queue updates stay in CPU caches.
CPU1 CPU2
sys_select receive packet
... ...
__add_wait_queue update tp->rcv_nxt
... ...
tp->rcv_nxt check sock_def_readable
... {
schedule ...
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
wake_up_interruptible(sk->sk_sleep)
...
}
If there was no cache the code would work ok, since the wait_queue and
rcv_nxt are opposit to each other.
Meaning that once tp->rcv_nxt is updated by CPU2, the CPU1 either already
passed the tp->rcv_nxt check and sleeps, or will get the new value for
tp->rcv_nxt and will return with new data mask.
In both cases the process (CPU1) is being added to the wait queue, so the
waitqueue_active (CPU2) call cannot miss and will wake up CPU1.
The bad case is when the __add_wait_queue changes done by CPU1 stay in its
cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 will then
endup calling schedule and sleep forever if there are no more data on the
socket.
Calls to poll_wait in following modules were ommited:
net/bluetooth/af_bluetooth.c
net/irda/af_irda.c
net/irda/irnet/irnet_ppp.c
net/mac80211/rc80211_pid_debugfs.c
net/phonet/socket.c
net/rds/af_rds.c
net/rfkill/core.c
net/sunrpc/cache.c
net/sunrpc/rpc_pipe.c
net/tipc/socket.c
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/sock.h | 66 | ||||
-rw-r--r-- | net/atm/common.c | 6 | ||||
-rw-r--r-- | net/core/datagram.c | 2 | ||||
-rw-r--r-- | net/core/sock.c | 8 | ||||
-rw-r--r-- | net/dccp/output.c | 2 | ||||
-rw-r--r-- | net/dccp/proto.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 2 | ||||
-rw-r--r-- | net/iucv/af_iucv.c | 4 | ||||
-rw-r--r-- | net/rxrpc/af_rxrpc.c | 4 | ||||
-rw-r--r-- | net/unix/af_unix.c | 8 |
10 files changed, 85 insertions, 19 deletions
diff --git a/include/net/sock.h b/include/net/sock.h index 352f06bbd7a9..4eb8409249f6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h | |||
@@ -54,6 +54,7 @@ | |||
54 | 54 | ||
55 | #include <linux/filter.h> | 55 | #include <linux/filter.h> |
56 | #include <linux/rculist_nulls.h> | 56 | #include <linux/rculist_nulls.h> |
57 | #include <linux/poll.h> | ||
57 | 58 | ||
58 | #include <asm/atomic.h> | 59 | #include <asm/atomic.h> |
59 | #include <net/dst.h> | 60 | #include <net/dst.h> |
@@ -1241,6 +1242,71 @@ static inline int sk_has_allocations(const struct sock *sk) | |||
1241 | return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); | 1242 | return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); |
1242 | } | 1243 | } |
1243 | 1244 | ||
1245 | /** | ||
1246 | * sk_has_sleeper - check if there are any waiting processes | ||
1247 | * @sk: socket | ||
1248 | * | ||
1249 | * Returns true if socket has waiting processes | ||
1250 | * | ||
1251 | * The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory | ||
1252 | * barrier call. They were added due to the race found within the tcp code. | ||
1253 | * | ||
1254 | * Consider following tcp code paths: | ||
1255 | * | ||
1256 | * CPU1 CPU2 | ||
1257 | * | ||
1258 | * sys_select receive packet | ||
1259 | * ... ... | ||
1260 | * __add_wait_queue update tp->rcv_nxt | ||
1261 | * ... ... | ||
1262 | * tp->rcv_nxt check sock_def_readable | ||
1263 | * ... { | ||
1264 | * schedule ... | ||
1265 | * if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | ||
1266 | * wake_up_interruptible(sk->sk_sleep) | ||
1267 | * ... | ||
1268 | * } | ||
1269 | * | ||
1270 | * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay | ||
1271 | * in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 | ||
1272 | * could then endup calling schedule and sleep forever if there are no more | ||
1273 | * data on the socket. | ||
1274 | */ | ||
1275 | static inline int sk_has_sleeper(struct sock *sk) | ||
1276 | { | ||
1277 | /* | ||
1278 | * We need to be sure we are in sync with the | ||
1279 | * add_wait_queue modifications to the wait queue. | ||
1280 | * | ||
1281 | * This memory barrier is paired in the sock_poll_wait. | ||
1282 | */ | ||
1283 | smp_mb(); | ||
1284 | return sk->sk_sleep && waitqueue_active(sk->sk_sleep); | ||
1285 | } | ||
1286 | |||
1287 | /** | ||
1288 | * sock_poll_wait - place memory barrier behind the poll_wait call. | ||
1289 | * @filp: file | ||
1290 | * @wait_address: socket wait queue | ||
1291 | * @p: poll_table | ||
1292 | * | ||
1293 | * See the comments in the sk_has_sleeper function. | ||
1294 | */ | ||
1295 | static inline void sock_poll_wait(struct file *filp, | ||
1296 | wait_queue_head_t *wait_address, poll_table *p) | ||
1297 | { | ||
1298 | if (p && wait_address) { | ||
1299 | poll_wait(filp, wait_address, p); | ||
1300 | /* | ||
1301 | * We need to be sure we are in sync with the | ||
1302 | * socket flags modification. | ||
1303 | * | ||
1304 | * This memory barrier is paired in the sk_has_sleeper. | ||
1305 | */ | ||
1306 | smp_mb(); | ||
1307 | } | ||
1308 | } | ||
1309 | |||
1244 | /* | 1310 | /* |
1245 | * Queue a received datagram if it will fit. Stream and sequenced | 1311 | * Queue a received datagram if it will fit. Stream and sequenced |
1246 | * protocols can't normally use this as they need to fit buffers in | 1312 | * protocols can't normally use this as they need to fit buffers in |
diff --git a/net/atm/common.c b/net/atm/common.c index c1c97936192c..8c4d843eb17f 100644 --- a/net/atm/common.c +++ b/net/atm/common.c | |||
@@ -92,7 +92,7 @@ static void vcc_sock_destruct(struct sock *sk) | |||
92 | static void vcc_def_wakeup(struct sock *sk) | 92 | static void vcc_def_wakeup(struct sock *sk) |
93 | { | 93 | { |
94 | read_lock(&sk->sk_callback_lock); | 94 | read_lock(&sk->sk_callback_lock); |
95 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 95 | if (sk_has_sleeper(sk)) |
96 | wake_up(sk->sk_sleep); | 96 | wake_up(sk->sk_sleep); |
97 | read_unlock(&sk->sk_callback_lock); | 97 | read_unlock(&sk->sk_callback_lock); |
98 | } | 98 | } |
@@ -110,7 +110,7 @@ static void vcc_write_space(struct sock *sk) | |||
110 | read_lock(&sk->sk_callback_lock); | 110 | read_lock(&sk->sk_callback_lock); |
111 | 111 | ||
112 | if (vcc_writable(sk)) { | 112 | if (vcc_writable(sk)) { |
113 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 113 | if (sk_has_sleeper(sk)) |
114 | wake_up_interruptible(sk->sk_sleep); | 114 | wake_up_interruptible(sk->sk_sleep); |
115 | 115 | ||
116 | sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); | 116 | sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); |
@@ -594,7 +594,7 @@ unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
594 | struct atm_vcc *vcc; | 594 | struct atm_vcc *vcc; |
595 | unsigned int mask; | 595 | unsigned int mask; |
596 | 596 | ||
597 | poll_wait(file, sk->sk_sleep, wait); | 597 | sock_poll_wait(file, sk->sk_sleep, wait); |
598 | mask = 0; | 598 | mask = 0; |
599 | 599 | ||
600 | vcc = ATM_SD(sock); | 600 | vcc = ATM_SD(sock); |
diff --git a/net/core/datagram.c b/net/core/datagram.c index 58abee1f1df1..b0fe69211eef 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c | |||
@@ -712,7 +712,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, | |||
712 | struct sock *sk = sock->sk; | 712 | struct sock *sk = sock->sk; |
713 | unsigned int mask; | 713 | unsigned int mask; |
714 | 714 | ||
715 | poll_wait(file, sk->sk_sleep, wait); | 715 | sock_poll_wait(file, sk->sk_sleep, wait); |
716 | mask = 0; | 716 | mask = 0; |
717 | 717 | ||
718 | /* exceptional events? */ | 718 | /* exceptional events? */ |
diff --git a/net/core/sock.c b/net/core/sock.c index b0ba569bc973..6354863b1c68 100644 --- a/net/core/sock.c +++ b/net/core/sock.c | |||
@@ -1715,7 +1715,7 @@ EXPORT_SYMBOL(sock_no_sendpage); | |||
1715 | static void sock_def_wakeup(struct sock *sk) | 1715 | static void sock_def_wakeup(struct sock *sk) |
1716 | { | 1716 | { |
1717 | read_lock(&sk->sk_callback_lock); | 1717 | read_lock(&sk->sk_callback_lock); |
1718 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 1718 | if (sk_has_sleeper(sk)) |
1719 | wake_up_interruptible_all(sk->sk_sleep); | 1719 | wake_up_interruptible_all(sk->sk_sleep); |
1720 | read_unlock(&sk->sk_callback_lock); | 1720 | read_unlock(&sk->sk_callback_lock); |
1721 | } | 1721 | } |
@@ -1723,7 +1723,7 @@ static void sock_def_wakeup(struct sock *sk) | |||
1723 | static void sock_def_error_report(struct sock *sk) | 1723 | static void sock_def_error_report(struct sock *sk) |
1724 | { | 1724 | { |
1725 | read_lock(&sk->sk_callback_lock); | 1725 | read_lock(&sk->sk_callback_lock); |
1726 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 1726 | if (sk_has_sleeper(sk)) |
1727 | wake_up_interruptible_poll(sk->sk_sleep, POLLERR); | 1727 | wake_up_interruptible_poll(sk->sk_sleep, POLLERR); |
1728 | sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); | 1728 | sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); |
1729 | read_unlock(&sk->sk_callback_lock); | 1729 | read_unlock(&sk->sk_callback_lock); |
@@ -1732,7 +1732,7 @@ static void sock_def_error_report(struct sock *sk) | |||
1732 | static void sock_def_readable(struct sock *sk, int len) | 1732 | static void sock_def_readable(struct sock *sk, int len) |
1733 | { | 1733 | { |
1734 | read_lock(&sk->sk_callback_lock); | 1734 | read_lock(&sk->sk_callback_lock); |
1735 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 1735 | if (sk_has_sleeper(sk)) |
1736 | wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN | | 1736 | wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN | |
1737 | POLLRDNORM | POLLRDBAND); | 1737 | POLLRDNORM | POLLRDBAND); |
1738 | sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); | 1738 | sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); |
@@ -1747,7 +1747,7 @@ static void sock_def_write_space(struct sock *sk) | |||
1747 | * progress. --DaveM | 1747 | * progress. --DaveM |
1748 | */ | 1748 | */ |
1749 | if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { | 1749 | if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { |
1750 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 1750 | if (sk_has_sleeper(sk)) |
1751 | wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT | | 1751 | wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT | |
1752 | POLLWRNORM | POLLWRBAND); | 1752 | POLLWRNORM | POLLWRBAND); |
1753 | 1753 | ||
diff --git a/net/dccp/output.c b/net/dccp/output.c index c0e88c16d088..c96119fda688 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c | |||
@@ -196,7 +196,7 @@ void dccp_write_space(struct sock *sk) | |||
196 | { | 196 | { |
197 | read_lock(&sk->sk_callback_lock); | 197 | read_lock(&sk->sk_callback_lock); |
198 | 198 | ||
199 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 199 | if (sk_has_sleeper(sk)) |
200 | wake_up_interruptible(sk->sk_sleep); | 200 | wake_up_interruptible(sk->sk_sleep); |
201 | /* Should agree with poll, otherwise some programs break */ | 201 | /* Should agree with poll, otherwise some programs break */ |
202 | if (sock_writeable(sk)) | 202 | if (sock_writeable(sk)) |
diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 314a1b5c033c..94ca8eaace7d 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c | |||
@@ -311,7 +311,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock, | |||
311 | unsigned int mask; | 311 | unsigned int mask; |
312 | struct sock *sk = sock->sk; | 312 | struct sock *sk = sock->sk; |
313 | 313 | ||
314 | poll_wait(file, sk->sk_sleep, wait); | 314 | sock_poll_wait(file, sk->sk_sleep, wait); |
315 | if (sk->sk_state == DCCP_LISTEN) | 315 | if (sk->sk_state == DCCP_LISTEN) |
316 | return inet_csk_listen_poll(sk); | 316 | return inet_csk_listen_poll(sk); |
317 | 317 | ||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7870a535dac6..91145244ea63 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -339,7 +339,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
339 | struct sock *sk = sock->sk; | 339 | struct sock *sk = sock->sk; |
340 | struct tcp_sock *tp = tcp_sk(sk); | 340 | struct tcp_sock *tp = tcp_sk(sk); |
341 | 341 | ||
342 | poll_wait(file, sk->sk_sleep, wait); | 342 | sock_poll_wait(file, sk->sk_sleep, wait); |
343 | if (sk->sk_state == TCP_LISTEN) | 343 | if (sk->sk_state == TCP_LISTEN) |
344 | return inet_csk_listen_poll(sk); | 344 | return inet_csk_listen_poll(sk); |
345 | 345 | ||
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 6be5f92d1094..49c15b48408e 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c | |||
@@ -306,7 +306,7 @@ static inline int iucv_below_msglim(struct sock *sk) | |||
306 | static void iucv_sock_wake_msglim(struct sock *sk) | 306 | static void iucv_sock_wake_msglim(struct sock *sk) |
307 | { | 307 | { |
308 | read_lock(&sk->sk_callback_lock); | 308 | read_lock(&sk->sk_callback_lock); |
309 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 309 | if (sk_has_sleeper(sk)) |
310 | wake_up_interruptible_all(sk->sk_sleep); | 310 | wake_up_interruptible_all(sk->sk_sleep); |
311 | sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); | 311 | sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); |
312 | read_unlock(&sk->sk_callback_lock); | 312 | read_unlock(&sk->sk_callback_lock); |
@@ -1256,7 +1256,7 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock, | |||
1256 | struct sock *sk = sock->sk; | 1256 | struct sock *sk = sock->sk; |
1257 | unsigned int mask = 0; | 1257 | unsigned int mask = 0; |
1258 | 1258 | ||
1259 | poll_wait(file, sk->sk_sleep, wait); | 1259 | sock_poll_wait(file, sk->sk_sleep, wait); |
1260 | 1260 | ||
1261 | if (sk->sk_state == IUCV_LISTEN) | 1261 | if (sk->sk_state == IUCV_LISTEN) |
1262 | return iucv_accept_poll(sk); | 1262 | return iucv_accept_poll(sk); |
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index eac5e7bb7365..bfe493ebf27c 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c | |||
@@ -63,7 +63,7 @@ static void rxrpc_write_space(struct sock *sk) | |||
63 | _enter("%p", sk); | 63 | _enter("%p", sk); |
64 | read_lock(&sk->sk_callback_lock); | 64 | read_lock(&sk->sk_callback_lock); |
65 | if (rxrpc_writable(sk)) { | 65 | if (rxrpc_writable(sk)) { |
66 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 66 | if (sk_has_sleeper(sk)) |
67 | wake_up_interruptible(sk->sk_sleep); | 67 | wake_up_interruptible(sk->sk_sleep); |
68 | sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); | 68 | sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); |
69 | } | 69 | } |
@@ -588,7 +588,7 @@ static unsigned int rxrpc_poll(struct file *file, struct socket *sock, | |||
588 | unsigned int mask; | 588 | unsigned int mask; |
589 | struct sock *sk = sock->sk; | 589 | struct sock *sk = sock->sk; |
590 | 590 | ||
591 | poll_wait(file, sk->sk_sleep, wait); | 591 | sock_poll_wait(file, sk->sk_sleep, wait); |
592 | mask = 0; | 592 | mask = 0; |
593 | 593 | ||
594 | /* the socket is readable if there are any messages waiting on the Rx | 594 | /* the socket is readable if there are any messages waiting on the Rx |
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 36d4e44d6233..fc3ebb906911 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c | |||
@@ -315,7 +315,7 @@ static void unix_write_space(struct sock *sk) | |||
315 | { | 315 | { |
316 | read_lock(&sk->sk_callback_lock); | 316 | read_lock(&sk->sk_callback_lock); |
317 | if (unix_writable(sk)) { | 317 | if (unix_writable(sk)) { |
318 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | 318 | if (sk_has_sleeper(sk)) |
319 | wake_up_interruptible_sync(sk->sk_sleep); | 319 | wake_up_interruptible_sync(sk->sk_sleep); |
320 | sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); | 320 | sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); |
321 | } | 321 | } |
@@ -1985,7 +1985,7 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table | |||
1985 | struct sock *sk = sock->sk; | 1985 | struct sock *sk = sock->sk; |
1986 | unsigned int mask; | 1986 | unsigned int mask; |
1987 | 1987 | ||
1988 | poll_wait(file, sk->sk_sleep, wait); | 1988 | sock_poll_wait(file, sk->sk_sleep, wait); |
1989 | mask = 0; | 1989 | mask = 0; |
1990 | 1990 | ||
1991 | /* exceptional events? */ | 1991 | /* exceptional events? */ |
@@ -2022,7 +2022,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, | |||
2022 | struct sock *sk = sock->sk, *other; | 2022 | struct sock *sk = sock->sk, *other; |
2023 | unsigned int mask, writable; | 2023 | unsigned int mask, writable; |
2024 | 2024 | ||
2025 | poll_wait(file, sk->sk_sleep, wait); | 2025 | sock_poll_wait(file, sk->sk_sleep, wait); |
2026 | mask = 0; | 2026 | mask = 0; |
2027 | 2027 | ||
2028 | /* exceptional events? */ | 2028 | /* exceptional events? */ |
@@ -2053,7 +2053,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, | |||
2053 | other = unix_peer_get(sk); | 2053 | other = unix_peer_get(sk); |
2054 | if (other) { | 2054 | if (other) { |
2055 | if (unix_peer(other) != sk) { | 2055 | if (unix_peer(other) != sk) { |
2056 | poll_wait(file, &unix_sk(other)->peer_wait, | 2056 | sock_poll_wait(file, &unix_sk(other)->peer_wait, |
2057 | wait); | 2057 | wait); |
2058 | if (unix_recvq_full(other)) | 2058 | if (unix_recvq_full(other)) |
2059 | writable = 0; | 2059 | writable = 0; |