author     Mel Gorman <mgorman@suse.de>                    2012-07-31 19:44:41 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-07-31 21:42:47 -0400
commit     c76562b6709fee5eff8a6a779be41c0bce661fd7 (patch)
tree       18e4a3c71b6d859cc58b7d1156bd1d7bfd1d6a5c
parent     68243e76ee343d63c6cf76978588a885951e2818 (diff)
netvm: prevent a stream-specific deadlock
This patch series is based on top of "Swap-over-NBD without deadlocking v15" as it depends on the same reservation of PF_MEMALLOC reserves logic.

When a user or administrator requires swap for their application, they create a swap partition or file, format it with mkswap and activate it with swapon. In diskless systems this is not an option, so if swap is required then swapping over the network is considered. The two likely scenarios are blade servers used as part of a cluster, where the form factor or maintenance costs do not allow the use of disks, and thin clients.

The Linux Terminal Server Project recommends the use of the Network Block Device (NBD) for swap, but this is not always an option. There is no guarantee that the network attached storage (NAS) device is running Linux or supports NBD. However, it is likely that it supports NFS, so there are users that want support for swapping over NFS despite any performance concern. Some distributions currently carry patches that support swapping over NFS, but it would be preferable to support it in the mainline kernel.

Patch 1 avoids a stream-specific deadlock that potentially affects TCP.

Patch 2 is a small modification to SELinux to avoid using PF_MEMALLOC reserves.

Patch 3 adds three helpers for filesystems to handle swap cache pages. For example, page_file_mapping() returns page->mapping for file-backed pages and the address_space of the underlying swap file for swap cache pages.

Patch 4 adds two address_space_operations to allow a filesystem to pin all metadata relevant to a swapfile in memory. Upon successful activation, the swapfile is marked SWP_FILE and the address_space operation ->direct_IO is used for writing and ->readpage for reading in swap pages.

Patch 5 notes that patch 3 is bolting filesystem-specific swapfile support onto the side and that the default handlers have different information available to them than the filesystem does. It refactors the code so that there are generic handlers for each of the new address_space operations.

Patch 6 adds an API to allow a vector of kernel addresses to be translated to struct pages and pinned for IO.

Patch 7 adds support for using highmem pages for swap by kmapping the pages before calling the direct_IO handler.

Patch 8 updates NFS to use the helpers from patch 3 where necessary.

Patch 9 avoids setting PG_private on PG_swapcache pages within NFS.

Patch 10 implements the new swapfile-related address_space operations for NFS and teaches the direct IO handler how to manage kernel addresses.

Patch 11 prevents page allocator recursions in NFS by using GFP_NOIO where appropriate.

Patch 12 fixes a NULL pointer dereference that occurs when using swap-over-NFS.

With the patches applied, it is possible to mount a swapfile that is on an NFS filesystem. Swap performance is not great, with a swap stress test taking roughly twice as long to complete as it does when the swap device is backed by NBD.

This patch: netvm: prevent a stream-specific deadlock

It could happen that all !SOCK_MEMALLOC sockets have buffered so much data that we're over the global rmem limit. This will prevent SOCK_MEMALLOC buffers from receiving data, which will prevent userspace from running, which is needed to reduce the buffered data. Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit. Once this change is applied, it is important that sockets that set SOCK_MEMALLOC do not clear the flag until the socket is being torn down.
If this happens, a warning is generated and the tokens are reclaimed to avoid accounting errors until the bug is fixed.

[davem@davemloft.net: Warning about clearing SOCK_MEMALLOC]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
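For illustration only, a minimal sketch (not part of this commit) of how a swap-over-network transport is expected to use the machinery this patch exempts from the rmem limit. The function names example_mark_swap_socket() and example_release_swap_socket() are hypothetical; sk_set_memalloc() and sk_clear_memalloc() come from the earlier "Swap-over-NBD without deadlocking" series that this patch builds on.

	/*
	 * Hypothetical usage sketch, not part of this commit: a
	 * swap-over-network transport marks its data socket as
	 * SOCK_MEMALLOC so that, after this patch, pfmemalloc skbs
	 * queued to it pass sk_rmem_schedule() even when the protocol
	 * is over its rmem limit.
	 */
	#include <linux/net.h>
	#include <net/sock.h>

	static void example_mark_swap_socket(struct socket *sock)
	{
		/* Allocations for this socket may dip into PF_MEMALLOC
		 * reserves, and its receive path is exempt from the
		 * global rmem limit. */
		sk_set_memalloc(sock->sk);
	}

	static void example_release_swap_socket(struct socket *sock)
	{
		/* Only on teardown: clearing the flag while receive
		 * memory is still accounted triggers the WARN_ON added
		 * to sk_clear_memalloc() by this patch, and the
		 * accounting is reclaimed. */
		sk_clear_memalloc(sock->sk);
	}

The NBD and NFS patches later in the series follow this pattern on their transport sockets; the sk_rmem_schedule() change below is what lets such a socket keep receiving after ordinary sockets have exhausted the global rmem budget.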
-rw-r--r--  include/net/sock.h      |  8
-rw-r--r--  net/caif/caif_socket.c  |  2
-rw-r--r--  net/core/sock.c         | 14
-rw-r--r--  net/ipv4/tcp_input.c    | 21
-rw-r--r--  net/sctp/ulpevent.c     |  3
5 files changed, 32 insertions, 16 deletions
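As a side note on the series summary above, a rough sketch (not from this commit's diff) of what the page_file_mapping() helper described for Patch 3 boils down to. example_page_file_mapping() is a hypothetical stand-in, and __page_file_mapping() refers to the swap-side lookup added elsewhere in the series; treat both names as assumptions here.

	#include <linux/mm.h>
	#include <linux/page-flags.h>
	#include <linux/swap.h>

	/*
	 * Sketch only: mirrors what the page_file_mapping() helper from
	 * Patch 3 of the series is described as doing.
	 */
	static inline struct address_space *example_page_file_mapping(struct page *page)
	{
		if (unlikely(PageSwapCache(page)))
			/* Swap cache page: the backing store is the swap
			 * file, so return its address_space rather than
			 * page->mapping. */
			return __page_file_mapping(page);

		/* Ordinary file-backed page. */
		return page->mapping;
	}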
diff --git a/include/net/sock.h b/include/net/sock.h
index 43a470d40d76..b3730239bf18 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1329,12 +1329,14 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size)
 		__sk_mem_schedule(sk, size, SK_MEM_SEND);
 }
 
-static inline bool sk_rmem_schedule(struct sock *sk, int size)
+static inline bool
+sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size)
 {
 	if (!sk_has_account(sk))
 		return true;
-	return size <= sk->sk_forward_alloc ||
-		__sk_mem_schedule(sk, size, SK_MEM_RECV);
+	return size <= sk->sk_forward_alloc ||
+		__sk_mem_schedule(sk, size, SK_MEM_RECV) ||
+		skb_pfmemalloc(skb);
 }
 
 static inline void sk_mem_reclaim(struct sock *sk)
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 78f1cdad5b33..095259f83902 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -141,7 +141,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	err = sk_filter(sk, skb);
 	if (err)
 		return err;
-	if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) {
+	if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) {
 		set_rx_flow_off(cf_sk);
 		net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n");
 		caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
diff --git a/net/core/sock.c b/net/core/sock.c
index 32fdcd2d6e8f..6b654b3ddfda 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -295,6 +295,18 @@ void sk_clear_memalloc(struct sock *sk)
 	sock_reset_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation &= ~__GFP_MEMALLOC;
 	static_key_slow_dec(&memalloc_socks);
+
+	/*
+	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
+	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
+	 * it has rmem allocations there is a risk that the user of the
+	 * socket cannot make forward progress due to exceeding the rmem
+	 * limits. By rights, sk_clear_memalloc() should only be called
+	 * on sockets being torn down but warn and reset the accounting if
+	 * that assumption breaks.
+	 */
+	if (WARN_ON(sk->sk_forward_alloc))
+		sk_mem_reclaim(sk);
 }
 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 
@@ -396,7 +408,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (err)
 		return err;
 
-	if (!sk_rmem_schedule(sk, skb->truesize)) {
+	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 		atomic_inc(&sk->sk_drops);
 		return -ENOBUFS;
 	}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a356e1fecf9a..00b91b4b8665 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4351,19 +4351,20 @@ static void tcp_ofo_queue(struct sock *sk)
 static bool tcp_prune_ofo_queue(struct sock *sk);
 static int tcp_prune_queue(struct sock *sk);
 
-static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+				 unsigned int size)
 {
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	    !sk_rmem_schedule(sk, size)) {
+	    !sk_rmem_schedule(sk, skb, size)) {
 
 		if (tcp_prune_queue(sk) < 0)
 			return -1;
 
-		if (!sk_rmem_schedule(sk, size)) {
+		if (!sk_rmem_schedule(sk, skb, size)) {
 			if (!tcp_prune_ofo_queue(sk))
 				return -1;
 
-			if (!sk_rmem_schedule(sk, size))
+			if (!sk_rmem_schedule(sk, skb, size))
 				return -1;
 		}
 	}
@@ -4418,7 +4419,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 
 	TCP_ECN_check_ce(tp, skb);
 
-	if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) {
+	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
 		__kfree_skb(skb);
 		return;
@@ -4552,17 +4553,17 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
 
 int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 {
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	struct tcphdr *th;
 	bool fragstolen;
 
-	if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
-		goto err;
-
 	skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
 	if (!skb)
 		goto err;
 
+	if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
+		goto err_free;
+
 	th = (struct tcphdr *)skb_put(skb, sizeof(*th));
 	skb_reset_transport_header(skb);
 	memset(th, 0, sizeof(*th));
@@ -4633,7 +4634,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	if (eaten <= 0) {
 queue_and_out:
 		if (eaten < 0 &&
-		    tcp_try_rmem_schedule(sk, skb->truesize))
+		    tcp_try_rmem_schedule(sk, skb, skb->truesize))
 			goto drop;
 
 		eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 33d894776192..10c018a5b9fe 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -702,7 +702,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
 	if (rx_count >= asoc->base.sk->sk_rcvbuf) {
 
 		if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
-		    (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize)))
+		    (!sk_rmem_schedule(asoc->base.sk, chunk->skb,
+				       chunk->skb->truesize)))
 			goto fail;
 	}
 