author    Mel Gorman <mgorman@suse.de>  2012-07-31 19:44:19 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-07-31 21:42:46 -0400
commit    c93bdd0e03e848555d144eb44a1f275b871a8dd5 (patch)
tree      8aff3bc2b9ff9f32e51040a7652bfb31257db626 /net
parent    7cb0240492caea2f6467f827313478f41877e6ef (diff)
netvm: allow skb allocation to use PFMEMALLOC reserves
Change the skb allocation API to indicate RX usage and use this to fall
back to the PFMEMALLOC reserve when needed. SKBs allocated from the
reserve are tagged in skb->pfmemalloc. If an SKB is allocated from the
reserve and the socket is later found to be unrelated to page reclaim,
the packet is dropped so that the memory remains available for page
reclaim. Network protocols are expected to recover from this packet
loss.

[a.p.zijlstra@chello.nl: Ideas taken from various patches]
[davem@davemloft.net: Use static branches, coding style corrections]
[sebastian@breakpoint.cc: Avoid unnecessary cast, fix !CONFIG_NET build]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
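The intended consumer is a socket doing I/O on behalf of page reclaim, such as the swap-over-network users later in this series. As a minimal sketch of the opt-in (hypothetical setup code, using only the sk_set_memalloc() exported by this patch):

    #include <net/sock.h>

    /* Hypothetical example: mark a kernel socket that writes out
     * reclaimed pages. SOCK_MEMALLOC entitles its allocations to the
     * PFMEMALLOC reserves and makes sk_filter() accept pfmemalloc skbs.
     */
    static void example_mark_reclaim_socket(struct socket *sock)
    {
            sk_set_memalloc(sock->sk);
    }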
Diffstat (limited to 'net')
-rw-r--r--   net/core/filter.c     8
-rw-r--r--   net/core/skbuff.c   124
-rw-r--r--   net/core/sock.c       5
3 files changed, 112 insertions, 25 deletions
diff --git a/net/core/filter.c b/net/core/filter.c
index d4ce2dc712e3..907efd27ec77 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -83,6 +83,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
 	int err;
 	struct sk_filter *filter;
 
+	/*
+	 * If the skb was allocated from pfmemalloc reserves, only
+	 * allow SOCK_MEMALLOC sockets to use it as this socket is
+	 * helping free memory
+	 */
+	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+		return -ENOMEM;
+
 	err = security_sock_rcv_skb(sk, skb);
 	if (err)
 		return err;
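The -ENOMEM return is how the changelog's "drop and recover" policy is enforced: callers of sk_filter() already treat a non-zero return as "free the skb", so reserve-backed packets delivered to an ordinary socket are simply discarded. A hedged sketch of a hypothetical caller (the real callers, e.g. sock_queue_rcv_skb(), follow the same shape):

    /* Hypothetical receive path: on -ENOMEM the pfmemalloc-backed
     * packet is dropped and its memory returns to the reserves.
     */
    static int example_deliver(struct sock *sk, struct sk_buff *skb)
    {
            int err = sk_filter(sk, skb);

            if (err) {
                    kfree_skb(skb);  /* protocol must recover from the loss */
                    return err;
            }
            /* ... queue skb onto the socket as usual ... */
            return 0;
    }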
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 368f65c15e4f..fe00d1208167 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -145,6 +145,43 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 	BUG();
 }
 
+
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
+	 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
+void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip,
+			bool *pfmemalloc)
+{
+	void *obj;
+	bool ret_pfmemalloc = false;
+
+	/*
+	 * Try a regular allocation, when that fails and we're not entitled
+	 * to the reserves, fail.
+	 */
+	obj = kmalloc_node_track_caller(size,
+					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+					node);
+	if (obj || !(gfp_pfmemalloc_allowed(flags)))
+		goto out;
+
+	/* Try again but now we are using pfmemalloc reserves */
+	ret_pfmemalloc = true;
+	obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+	if (pfmemalloc)
+		*pfmemalloc = ret_pfmemalloc;
+
+	return obj;
+}
+
 /* Allocate a new skbuff. We do this ourselves so we can fill in a few
  * 'private' fields and also do memory statistics to find all the
  * [BEEP] leaks.
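Note that the first attempt ORs in __GFP_NOMEMALLOC, which overrides any implicit entitlement (such as PF_MEMALLOC on the current task), so the reserves are touched only when a normal allocation genuinely fails and gfp_pfmemalloc_allowed() permits the retry. The same two-pass pattern in isolation (a hedged, self-contained sketch with plain kmalloc() standing in for the track-caller variant):

    /* Hypothetical illustration of the try-normal-then-reserve pattern. */
    static void *example_alloc_with_fallback(size_t size, gfp_t flags,
                                             bool *used_reserve)
    {
            /* First pass: explicitly forbid the emergency reserves. */
            void *obj = kmalloc(size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN);

            *used_reserve = false;
            if (obj || !gfp_pfmemalloc_allowed(flags))
                    return obj;

            /* Second pass: dip into the reserves and report that we did. */
            *used_reserve = true;
            return kmalloc(size, flags);
    }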
@@ -155,8 +192,10 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  * __alloc_skb - allocate a network buffer
  * @size: size to allocate
  * @gfp_mask: allocation mask
- * @fclone: allocate from fclone cache instead of head cache
- *	and allocate a cloned (child) skb
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ *	instead of head cache and allocate a cloned (child) skb.
+ *	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ *	allocations in case the data is required for writeback
  * @node: numa node to allocate memory on
  *
  * Allocate a new &sk_buff. The returned buffer has no headroom and a
@@ -167,14 +206,19 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  * %GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-			    int fclone, int node)
+			    int flags, int node)
 {
 	struct kmem_cache *cache;
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
+	bool pfmemalloc;
 
-	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+	cache = (flags & SKB_ALLOC_FCLONE)
+		? skbuff_fclone_cache : skbuff_head_cache;
+
+	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+		gfp_mask |= __GFP_MEMALLOC;
 
 	/* Get the HEAD */
 	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
@@ -189,7 +233,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	 */
 	size = SKB_DATA_ALIGN(size);
 	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	data = kmalloc_node_track_caller(size, gfp_mask, node);
+	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
 	if (!data)
 		goto nodata;
 	/* kmalloc(size) might give us more room than requested.
@@ -207,6 +251,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	memset(skb, 0, offsetof(struct sk_buff, tail));
 	/* Account for allocated memory : skb + skb->head */
 	skb->truesize = SKB_TRUESIZE(size);
+	skb->pfmemalloc = pfmemalloc;
 	atomic_set(&skb->users, 1);
 	skb->head = data;
 	skb->data = data;
@@ -222,7 +267,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	atomic_set(&shinfo->dataref, 1);
 	kmemcheck_annotate_variable(shinfo->destructor_arg);
 
-	if (fclone) {
+	if (flags & SKB_ALLOC_FCLONE) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);
 
@@ -232,6 +277,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		atomic_set(fclone_ref, 1);
 
 		child->fclone = SKB_FCLONE_UNAVAILABLE;
+		child->pfmemalloc = pfmemalloc;
 	}
 out:
 	return skb;
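With the fclone parameter widened into a flags word, receive-path callers pass SKB_ALLOC_RX to become eligible for the reserves, while transmit-path callers pass 0 and keep the old behaviour. A hedged sketch of the two call styles (hypothetical wrappers):

    /* Hypothetical RX allocation: may use PFMEMALLOC reserves once any
     * SOCK_MEMALLOC socket exists. */
    static struct sk_buff *example_rx_skb(unsigned int len)
    {
            return __alloc_skb(len, GFP_ATOMIC, SKB_ALLOC_RX, NUMA_NO_NODE);
    }

    /* Hypothetical TX allocation: flags == 0 matches the old fclone == 0. */
    static struct sk_buff *example_tx_skb(unsigned int len)
    {
            return __alloc_skb(len, GFP_KERNEL, 0, NUMA_NO_NODE);
    }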
@@ -302,14 +348,7 @@ static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
 
 #define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES)
 
-/**
- * netdev_alloc_frag - allocate a page fragment
- * @fragsz: fragment size
- *
- * Allocates a frag from a page for receive buffer.
- * Uses GFP_ATOMIC allocations.
- */
-void *netdev_alloc_frag(unsigned int fragsz)
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
 	struct netdev_alloc_cache *nc;
 	void *data = NULL;
@@ -319,7 +358,7 @@ void *netdev_alloc_frag(unsigned int fragsz)
 	nc = &__get_cpu_var(netdev_alloc_cache);
 	if (unlikely(!nc->page)) {
 refill:
-		nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+		nc->page = alloc_page(gfp_mask);
 		if (unlikely(!nc->page))
 			goto end;
 recycle:
@@ -343,6 +382,18 @@ end:
 	local_irq_restore(flags);
 	return data;
 }
+
+/**
+ * netdev_alloc_frag - allocate a page fragment
+ * @fragsz: fragment size
+ *
+ * Allocates a frag from a page for receive buffer.
+ * Uses GFP_ATOMIC allocations.
+ */
+void *netdev_alloc_frag(unsigned int fragsz)
+{
+	return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
 EXPORT_SYMBOL(netdev_alloc_frag);
 
 /**
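Drivers keep calling the unchanged netdev_alloc_frag() wrapper; only __netdev_alloc_skb() below reaches the new __netdev_alloc_frag() with extra gfp flags. A hedged driver-side sketch of the usual frag-plus-build_skb pairing (hypothetical function, error handling trimmed):

    /* Hypothetical RX buffer construction from a page fragment. */
    static struct sk_buff *example_rx_build(unsigned int buf_len)
    {
            void *frag = netdev_alloc_frag(buf_len); /* GFP_ATOMIC | __GFP_COLD */
            struct sk_buff *skb;

            if (!frag)
                    return NULL;
            skb = build_skb(frag, buf_len);
            if (!skb)
                    put_page(virt_to_head_page(frag)); /* drop the frag's page ref */
            return skb;
    }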
@@ -366,7 +417,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 			SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
 	if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
-		void *data = netdev_alloc_frag(fragsz);
+		void *data;
+
+		if (sk_memalloc_socks())
+			gfp_mask |= __GFP_MEMALLOC;
+
+		data = __netdev_alloc_frag(fragsz, gfp_mask);
 
 		if (likely(data)) {
 			skb = build_skb(data, fragsz);
@@ -374,7 +430,8 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 			put_page(virt_to_head_page(data));
 		}
 	} else {
-		skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
+		skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+				  SKB_ALLOC_RX, NUMA_NO_NODE);
 	}
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
@@ -656,6 +713,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #if IS_ENABLED(CONFIG_IP_VS)
 	new->ipvs_property = old->ipvs_property;
 #endif
+	new->pfmemalloc = old->pfmemalloc;
 	new->protocol = old->protocol;
 	new->mark = old->mark;
 	new->skb_iif = old->skb_iif;
@@ -814,6 +872,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 		n->fclone = SKB_FCLONE_CLONE;
 		atomic_inc(fclone_ref);
 	} else {
+		if (skb_pfmemalloc(skb))
+			gfp_mask |= __GFP_MEMALLOC;
+
 		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 		if (!n)
 			return NULL;
@@ -850,6 +911,13 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 }
 
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+	if (skb_pfmemalloc(skb))
+		return SKB_ALLOC_RX;
+	return 0;
+}
+
 /**
  * skb_copy - create private copy of an sk_buff
  * @skb: buffer to copy
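skb_alloc_rx_flag() is what lets the copy helpers below preserve the entitlement: a copy of a pfmemalloc skb is itself allocated with SKB_ALLOC_RX, and __copy_skb_header() above carries the skb->pfmemalloc tag across. A hedged sketch of the resulting invariant (hypothetical check, not part of the patch):

    /* Hypothetical invariant check: the pfmemalloc tag survives skb_copy(). */
    static void example_check_copy(struct sk_buff *skb)
    {
            struct sk_buff *n = skb_copy(skb, GFP_ATOMIC);

            if (n && skb_pfmemalloc(skb))
                    WARN_ON(!skb_pfmemalloc(n));
            kfree_skb(n);   /* kfree_skb() tolerates NULL */
    }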
@@ -871,7 +939,8 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 {
 	int headerlen = skb_headroom(skb);
 	unsigned int size = skb_end_offset(skb) + skb->data_len;
-	struct sk_buff *n = alloc_skb(size, gfp_mask);
+	struct sk_buff *n = __alloc_skb(size, gfp_mask,
+					skb_alloc_rx_flag(skb), NUMA_NO_NODE);
 
 	if (!n)
 		return NULL;
@@ -906,7 +975,8 @@ EXPORT_SYMBOL(skb_copy);
 struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
 {
 	unsigned int size = skb_headlen(skb) + headroom;
-	struct sk_buff *n = alloc_skb(size, gfp_mask);
+	struct sk_buff *n = __alloc_skb(size, gfp_mask,
+					skb_alloc_rx_flag(skb), NUMA_NO_NODE);
 
 	if (!n)
 		goto out;
@@ -979,8 +1049,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 
 	size = SKB_DATA_ALIGN(size);
 
-	data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
-		       gfp_mask);
+	if (skb_pfmemalloc(skb))
+		gfp_mask |= __GFP_MEMALLOC;
+	data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+			       gfp_mask, NUMA_NO_NODE, NULL);
 	if (!data)
 		goto nodata;
 	size = SKB_WITH_OVERHEAD(ksize(data));
@@ -1092,8 +1164,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 	/*
 	 * Allocate the copy buffer
 	 */
-	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
-				      gfp_mask);
+	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+					gfp_mask, skb_alloc_rx_flag(skb),
+					NUMA_NO_NODE);
 	int oldheadroom = skb_headroom(skb);
 	int head_copy_len, head_copy_off;
 	int off;
@@ -2775,8 +2848,9 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
 			skb_release_head_state(nskb);
 			__skb_push(nskb, doffset);
 		} else {
-			nskb = alloc_skb(hsize + doffset + headroom,
-					 GFP_ATOMIC);
+			nskb = __alloc_skb(hsize + doffset + headroom,
+					   GFP_ATOMIC, skb_alloc_rx_flag(skb),
+					   NUMA_NO_NODE);
 
 			if (unlikely(!nskb))
 				goto err;
diff --git a/net/core/sock.c b/net/core/sock.c
index 3617f652f6b0..c8c5816289fe 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -271,6 +271,9 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 EXPORT_SYMBOL(sysctl_optmem_max);
 
+struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(memalloc_socks);
+
 /**
  * sk_set_memalloc - sets %SOCK_MEMALLOC
  * @sk: socket to set it on
@@ -283,6 +286,7 @@ void sk_set_memalloc(struct sock *sk)
 {
 	sock_set_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation |= __GFP_MEMALLOC;
+	static_key_slow_inc(&memalloc_socks);
 }
 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 
@@ -290,6 +294,7 @@ void sk_clear_memalloc(struct sock *sk)
 {
 	sock_reset_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation &= ~__GFP_MEMALLOC;
+	static_key_slow_dec(&memalloc_socks);
 }
 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 
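Because memalloc_socks is a static key, the sk_memalloc_socks() checks added to the skb allocators compile down to a patched no-op branch until the first sk_set_memalloc() call, so the common case pays nothing. The header counterpart is outside this 'net'-limited diffstat; it presumably reads along these lines (a hedged sketch, not the verbatim include/net/sock.h change):

    extern struct static_key memalloc_socks;

    static inline int sk_memalloc_socks(void)
    {
            /* static_key_false() is a no-op jump until static_key_slow_inc()
             * flips the key in sk_set_memalloc(). */
            return static_key_false(&memalloc_socks);
    }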