diff options
author | Mel Gorman <mgorman@suse.de> | 2012-07-31 19:44:26 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-31 21:42:46 -0400 |
commit | b4b9e3558508980fc0cd161a545ffb55a1f13ee9 (patch) | |
tree | abb2ab54f4b201b1cbdaf181ec16912c3dd889eb | |
parent | 0614002bb5f7411e61ffa0dfe5be1f2c84df3da3 (diff) |
netvm: set PF_MEMALLOC as appropriate during SKB processing

In order to make sure pfmemalloc packets receive all memory needed to
proceed, ensure processing of pfmemalloc SKBs happens under PF_MEMALLOC.
This is limited to a subset of protocols that are expected to be used for
writing to swap. Taps are not allowed to use PF_MEMALLOC as these are
expected to communicate with userspace processes which could be paged out.

[a.p.zijlstra@chello.nl: Ideas taken from various patches]
[jslaby@suse.cz: Lock imbalance fix]

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/net/sock.h | 5 | ||||
-rw-r--r-- | net/core/dev.c | 53 | ||||
-rw-r--r-- | net/core/sock.c | 16 |
3 files changed, 68 insertions, 6 deletions
diff --git a/include/net/sock.h b/include/net/sock.h index 81198632ac2a..43a470d40d76 100644 --- a/include/net/sock.h +++ b/include/net/sock.h | |||
@@ -754,8 +754,13 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s | |||
754 | return 0; | 754 | return 0; |
755 | } | 755 | } |
756 | 756 | ||
757 | extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); | ||
758 | |||
757 | static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) | 759 | static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) |
758 | { | 760 | { |
761 | if (sk_memalloc_socks() && skb_pfmemalloc(skb)) | ||
762 | return __sk_backlog_rcv(sk, skb); | ||
763 | |||
759 | return sk->sk_backlog_rcv(sk, skb); | 764 | return sk->sk_backlog_rcv(sk, skb); |
760 | } | 765 | } |
761 | 766 | ||
diff --git a/net/core/dev.c b/net/core/dev.c index 0ebaea16632f..ce132443d5d1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -3155,6 +3155,23 @@ void netdev_rx_handler_unregister(struct net_device *dev) | |||
3155 | } | 3155 | } |
3156 | EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); | 3156 | EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); |
3157 | 3157 | ||
3158 | /* | ||
3159 | * Limit the use of PFMEMALLOC reserves to those protocols that implement | ||
3160 | * the special handling of PFMEMALLOC skbs. | ||
3161 | */ | ||
3162 | static bool skb_pfmemalloc_protocol(struct sk_buff *skb) | ||
3163 | { | ||
3164 | switch (skb->protocol) { | ||
3165 | case __constant_htons(ETH_P_ARP): | ||
3166 | case __constant_htons(ETH_P_IP): | ||
3167 | case __constant_htons(ETH_P_IPV6): | ||
3168 | case __constant_htons(ETH_P_8021Q): | ||
3169 | return true; | ||
3170 | default: | ||
3171 | return false; | ||
3172 | } | ||
3173 | } | ||
3174 | |||
3158 | static int __netif_receive_skb(struct sk_buff *skb) | 3175 | static int __netif_receive_skb(struct sk_buff *skb) |
3159 | { | 3176 | { |
3160 | struct packet_type *ptype, *pt_prev; | 3177 | struct packet_type *ptype, *pt_prev; |
@@ -3164,14 +3181,27 @@ static int __netif_receive_skb(struct sk_buff *skb) | |||
3164 | bool deliver_exact = false; | 3181 | bool deliver_exact = false; |
3165 | int ret = NET_RX_DROP; | 3182 | int ret = NET_RX_DROP; |
3166 | __be16 type; | 3183 | __be16 type; |
3184 | unsigned long pflags = current->flags; | ||
3167 | 3185 | ||
3168 | net_timestamp_check(!netdev_tstamp_prequeue, skb); | 3186 | net_timestamp_check(!netdev_tstamp_prequeue, skb); |
3169 | 3187 | ||
3170 | trace_netif_receive_skb(skb); | 3188 | trace_netif_receive_skb(skb); |
3171 | 3189 | ||
3190 | /* | ||
3191 | * PFMEMALLOC skbs are special, they should | ||
3192 | * - be delivered to SOCK_MEMALLOC sockets only | ||
3193 | * - stay away from userspace | ||
3194 | * - have bounded memory usage | ||
3195 | * | ||
3196 | * Use PF_MEMALLOC as this saves us from propagating the allocation | ||
3197 | * context down to all allocation sites. | ||
3198 | */ | ||
3199 | if (sk_memalloc_socks() && skb_pfmemalloc(skb)) | ||
3200 | current->flags |= PF_MEMALLOC; | ||
3201 | |||
3172 | /* if we've gotten here through NAPI, check netpoll */ | 3202 | /* if we've gotten here through NAPI, check netpoll */ |
3173 | if (netpoll_receive_skb(skb)) | 3203 | if (netpoll_receive_skb(skb)) |
3174 | return NET_RX_DROP; | 3204 | goto out; |
3175 | 3205 | ||
3176 | orig_dev = skb->dev; | 3206 | orig_dev = skb->dev; |
3177 | 3207 | ||
@@ -3191,7 +3221,7 @@ another_round: | |||
3191 | if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { | 3221 | if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { |
3192 | skb = vlan_untag(skb); | 3222 | skb = vlan_untag(skb); |
3193 | if (unlikely(!skb)) | 3223 | if (unlikely(!skb)) |
3194 | goto out; | 3224 | goto unlock; |
3195 | } | 3225 | } |
3196 | 3226 | ||
3197 | #ifdef CONFIG_NET_CLS_ACT | 3227 | #ifdef CONFIG_NET_CLS_ACT |
@@ -3201,6 +3231,9 @@ another_round: | |||
3201 | } | 3231 | } |
3202 | #endif | 3232 | #endif |
3203 | 3233 | ||
3234 | if (sk_memalloc_socks() && skb_pfmemalloc(skb)) | ||
3235 | goto skip_taps; | ||
3236 | |||
3204 | list_for_each_entry_rcu(ptype, &ptype_all, list) { | 3237 | list_for_each_entry_rcu(ptype, &ptype_all, list) { |
3205 | if (!ptype->dev || ptype->dev == skb->dev) { | 3238 | if (!ptype->dev || ptype->dev == skb->dev) { |
3206 | if (pt_prev) | 3239 | if (pt_prev) |
@@ -3209,13 +3242,18 @@ another_round: | |||
3209 | } | 3242 | } |
3210 | } | 3243 | } |
3211 | 3244 | ||
3245 | skip_taps: | ||
3212 | #ifdef CONFIG_NET_CLS_ACT | 3246 | #ifdef CONFIG_NET_CLS_ACT |
3213 | skb = handle_ing(skb, &pt_prev, &ret, orig_dev); | 3247 | skb = handle_ing(skb, &pt_prev, &ret, orig_dev); |
3214 | if (!skb) | 3248 | if (!skb) |
3215 | goto out; | 3249 | goto unlock; |
3216 | ncls: | 3250 | ncls: |
3217 | #endif | 3251 | #endif |
3218 | 3252 | ||
3253 | if (sk_memalloc_socks() && skb_pfmemalloc(skb) | ||
3254 | && !skb_pfmemalloc_protocol(skb)) | ||
3255 | goto drop; | ||
3256 | |||
3219 | rx_handler = rcu_dereference(skb->dev->rx_handler); | 3257 | rx_handler = rcu_dereference(skb->dev->rx_handler); |
3220 | if (vlan_tx_tag_present(skb)) { | 3258 | if (vlan_tx_tag_present(skb)) { |
3221 | if (pt_prev) { | 3259 | if (pt_prev) { |
@@ -3225,7 +3263,7 @@ ncls: | |||
3225 | if (vlan_do_receive(&skb, !rx_handler)) | 3263 | if (vlan_do_receive(&skb, !rx_handler)) |
3226 | goto another_round; | 3264 | goto another_round; |
3227 | else if (unlikely(!skb)) | 3265 | else if (unlikely(!skb)) |
3228 | goto out; | 3266 | goto unlock; |
3229 | } | 3267 | } |
3230 | 3268 | ||
3231 | if (rx_handler) { | 3269 | if (rx_handler) { |
@@ -3235,7 +3273,7 @@ ncls: | |||
3235 | } | 3273 | } |
3236 | switch (rx_handler(&skb)) { | 3274 | switch (rx_handler(&skb)) { |
3237 | case RX_HANDLER_CONSUMED: | 3275 | case RX_HANDLER_CONSUMED: |
3238 | goto out; | 3276 | goto unlock; |
3239 | case RX_HANDLER_ANOTHER: | 3277 | case RX_HANDLER_ANOTHER: |
3240 | goto another_round; | 3278 | goto another_round; |
3241 | case RX_HANDLER_EXACT: | 3279 | case RX_HANDLER_EXACT: |
@@ -3268,6 +3306,7 @@ ncls: | |||
3268 | else | 3306 | else |
3269 | ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); | 3307 | ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); |
3270 | } else { | 3308 | } else { |
3309 | drop: | ||
3271 | atomic_long_inc(&skb->dev->rx_dropped); | 3310 | atomic_long_inc(&skb->dev->rx_dropped); |
3272 | kfree_skb(skb); | 3311 | kfree_skb(skb); |
3273 | /* Jamal, now you will not able to escape explaining | 3312 | /* Jamal, now you will not able to escape explaining |
@@ -3276,8 +3315,10 @@ ncls: | |||
3276 | ret = NET_RX_DROP; | 3315 | ret = NET_RX_DROP; |
3277 | } | 3316 | } |
3278 | 3317 | ||
3279 | out: | 3318 | unlock: |
3280 | rcu_read_unlock(); | 3319 | rcu_read_unlock(); |
3320 | out: | ||
3321 | tsk_restore_flags(current, pflags, PF_MEMALLOC); | ||
3281 | return ret; | 3322 | return ret; |
3282 | } | 3323 | } |
3283 | 3324 | ||
diff --git a/net/core/sock.c b/net/core/sock.c index c8c5816289fe..32fdcd2d6e8f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c | |||
@@ -298,6 +298,22 @@ void sk_clear_memalloc(struct sock *sk) | |||
298 | } | 298 | } |
299 | EXPORT_SYMBOL_GPL(sk_clear_memalloc); | 299 | EXPORT_SYMBOL_GPL(sk_clear_memalloc); |
300 | 300 | ||
301 | int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) | ||
302 | { | ||
303 | int ret; | ||
304 | unsigned long pflags = current->flags; | ||
305 | |||
306 | /* these should have been dropped before queueing */ | ||
307 | BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); | ||
308 | |||
309 | current->flags |= PF_MEMALLOC; | ||
310 | ret = sk->sk_backlog_rcv(sk, skb); | ||
311 | tsk_restore_flags(current, pflags, PF_MEMALLOC); | ||
312 | |||
313 | return ret; | ||
314 | } | ||
315 | EXPORT_SYMBOL(__sk_backlog_rcv); | ||
316 | |||
301 | #if defined(CONFIG_CGROUPS) | 317 | #if defined(CONFIG_CGROUPS) |
302 | #if !defined(CONFIG_NET_CLS_CGROUP) | 318 | #if !defined(CONFIG_NET_CLS_CGROUP) |
303 | int net_cls_subsys_id = -1; | 319 | int net_cls_subsys_id = -1; |