about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Mel Gorman <mgorman@suse.de> 2012-07-31 19:44:26 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-31 21:42:46 -0400
commit: b4b9e3558508980fc0cd161a545ffb55a1f13ee9 (patch)
tree: abb2ab54f4b201b1cbdaf181ec16912c3dd889eb
parent: 0614002bb5f7411e61ffa0dfe5be1f2c84df3da3 (diff)
netvm: set PF_MEMALLOC as appropriate during SKB processing
In order to make sure pfmemalloc packets receive all memory needed to proceed, ensure processing of pfmemalloc SKBs happens under PF_MEMALLOC. This is limited to a subset of protocols that are expected to be used for writing to swap. Taps are not allowed to use PF_MEMALLOC as these are expected to communicate with userspace processes which could be paged out.

[a.p.zijlstra@chello.nl: Ideas taken from various patches]
[jslaby@suse.cz: Lock imbalance fix]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  include/net/sock.h  5
-rw-r--r--  net/core/dev.c  53
-rw-r--r--  net/core/sock.c  16
3 files changed, 68 insertions, 6 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index 81198632ac2a..43a470d40d76 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -754,8 +754,13 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s
754 return 0; 754 return 0;
755} 755}
756 756
757extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
758
757static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 759static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
758{ 760{
761 if (sk_memalloc_socks() && skb_pfmemalloc(skb))
762 return __sk_backlog_rcv(sk, skb);
763
759 return sk->sk_backlog_rcv(sk, skb); 764 return sk->sk_backlog_rcv(sk, skb);
760} 765}
761 766
diff --git a/net/core/dev.c b/net/core/dev.c
index 0ebaea16632f..ce132443d5d1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3155,6 +3155,23 @@ void netdev_rx_handler_unregister(struct net_device *dev)
3155} 3155}
3156EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 3156EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3157 3157
3158/*
3159 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3160 * the special handling of PFMEMALLOC skbs.
3161 */
3162static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3163{
3164 switch (skb->protocol) {
3165 case __constant_htons(ETH_P_ARP):
3166 case __constant_htons(ETH_P_IP):
3167 case __constant_htons(ETH_P_IPV6):
3168 case __constant_htons(ETH_P_8021Q):
3169 return true;
3170 default:
3171 return false;
3172 }
3173}
3174
3158static int __netif_receive_skb(struct sk_buff *skb) 3175static int __netif_receive_skb(struct sk_buff *skb)
3159{ 3176{
3160 struct packet_type *ptype, *pt_prev; 3177 struct packet_type *ptype, *pt_prev;
@@ -3164,14 +3181,27 @@ static int __netif_receive_skb(struct sk_buff *skb)
3164 bool deliver_exact = false; 3181 bool deliver_exact = false;
3165 int ret = NET_RX_DROP; 3182 int ret = NET_RX_DROP;
3166 __be16 type; 3183 __be16 type;
3184 unsigned long pflags = current->flags;
3167 3185
3168 net_timestamp_check(!netdev_tstamp_prequeue, skb); 3186 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3169 3187
3170 trace_netif_receive_skb(skb); 3188 trace_netif_receive_skb(skb);
3171 3189
3190 /*
3191 * PFMEMALLOC skbs are special, they should
3192 * - be delivered to SOCK_MEMALLOC sockets only
3193 * - stay away from userspace
3194 * - have bounded memory usage
3195 *
3196 * Use PF_MEMALLOC as this saves us from propagating the allocation
3197 * context down to all allocation sites.
3198 */
3199 if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3200 current->flags |= PF_MEMALLOC;
3201
3172 /* if we've gotten here through NAPI, check netpoll */ 3202 /* if we've gotten here through NAPI, check netpoll */
3173 if (netpoll_receive_skb(skb)) 3203 if (netpoll_receive_skb(skb))
3174 return NET_RX_DROP; 3204 goto out;
3175 3205
3176 orig_dev = skb->dev; 3206 orig_dev = skb->dev;
3177 3207
@@ -3191,7 +3221,7 @@ another_round:
3191 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { 3221 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3192 skb = vlan_untag(skb); 3222 skb = vlan_untag(skb);
3193 if (unlikely(!skb)) 3223 if (unlikely(!skb))
3194 goto out; 3224 goto unlock;
3195 } 3225 }
3196 3226
3197#ifdef CONFIG_NET_CLS_ACT 3227#ifdef CONFIG_NET_CLS_ACT
@@ -3201,6 +3231,9 @@ another_round:
3201 } 3231 }
3202#endif 3232#endif
3203 3233
3234 if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3235 goto skip_taps;
3236
3204 list_for_each_entry_rcu(ptype, &ptype_all, list) { 3237 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3205 if (!ptype->dev || ptype->dev == skb->dev) { 3238 if (!ptype->dev || ptype->dev == skb->dev) {
3206 if (pt_prev) 3239 if (pt_prev)
@@ -3209,13 +3242,18 @@ another_round:
3209 } 3242 }
3210 } 3243 }
3211 3244
3245skip_taps:
3212#ifdef CONFIG_NET_CLS_ACT 3246#ifdef CONFIG_NET_CLS_ACT
3213 skb = handle_ing(skb, &pt_prev, &ret, orig_dev); 3247 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3214 if (!skb) 3248 if (!skb)
3215 goto out; 3249 goto unlock;
3216ncls: 3250ncls:
3217#endif 3251#endif
3218 3252
3253 if (sk_memalloc_socks() && skb_pfmemalloc(skb)
3254 && !skb_pfmemalloc_protocol(skb))
3255 goto drop;
3256
3219 rx_handler = rcu_dereference(skb->dev->rx_handler); 3257 rx_handler = rcu_dereference(skb->dev->rx_handler);
3220 if (vlan_tx_tag_present(skb)) { 3258 if (vlan_tx_tag_present(skb)) {
3221 if (pt_prev) { 3259 if (pt_prev) {
@@ -3225,7 +3263,7 @@ ncls:
3225 if (vlan_do_receive(&skb, !rx_handler)) 3263 if (vlan_do_receive(&skb, !rx_handler))
3226 goto another_round; 3264 goto another_round;
3227 else if (unlikely(!skb)) 3265 else if (unlikely(!skb))
3228 goto out; 3266 goto unlock;
3229 } 3267 }
3230 3268
3231 if (rx_handler) { 3269 if (rx_handler) {
@@ -3235,7 +3273,7 @@ ncls:
3235 } 3273 }
3236 switch (rx_handler(&skb)) { 3274 switch (rx_handler(&skb)) {
3237 case RX_HANDLER_CONSUMED: 3275 case RX_HANDLER_CONSUMED:
3238 goto out; 3276 goto unlock;
3239 case RX_HANDLER_ANOTHER: 3277 case RX_HANDLER_ANOTHER:
3240 goto another_round; 3278 goto another_round;
3241 case RX_HANDLER_EXACT: 3279 case RX_HANDLER_EXACT:
@@ -3268,6 +3306,7 @@ ncls:
3268 else 3306 else
3269 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 3307 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3270 } else { 3308 } else {
3309drop:
3271 atomic_long_inc(&skb->dev->rx_dropped); 3310 atomic_long_inc(&skb->dev->rx_dropped);
3272 kfree_skb(skb); 3311 kfree_skb(skb);
3273 /* Jamal, now you will not able to escape explaining 3312 /* Jamal, now you will not able to escape explaining
@@ -3276,8 +3315,10 @@ ncls:
3276 ret = NET_RX_DROP; 3315 ret = NET_RX_DROP;
3277 } 3316 }
3278 3317
3279out: 3318unlock:
3280 rcu_read_unlock(); 3319 rcu_read_unlock();
3320out:
3321 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3281 return ret; 3322 return ret;
3282} 3323}
3283 3324
diff --git a/net/core/sock.c b/net/core/sock.c
index c8c5816289fe..32fdcd2d6e8f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -298,6 +298,22 @@ void sk_clear_memalloc(struct sock *sk)
298} 298}
299EXPORT_SYMBOL_GPL(sk_clear_memalloc); 299EXPORT_SYMBOL_GPL(sk_clear_memalloc);
300 300
301int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
302{
303 int ret;
304 unsigned long pflags = current->flags;
305
306 /* these should have been dropped before queueing */
307 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
308
309 current->flags |= PF_MEMALLOC;
310 ret = sk->sk_backlog_rcv(sk, skb);
311 tsk_restore_flags(current, pflags, PF_MEMALLOC);
312
313 return ret;
314}
315EXPORT_SYMBOL(__sk_backlog_rcv);
316
301#if defined(CONFIG_CGROUPS) 317#if defined(CONFIG_CGROUPS)
302#if !defined(CONFIG_NET_CLS_CGROUP) 318#if !defined(CONFIG_NET_CLS_CGROUP)
303int net_cls_subsys_id = -1; 319int net_cls_subsys_id = -1;