author    Alexander Duyck <alexander.h.duyck@redhat.com>    2014-12-09 22:40:42 -0500
committer David S. Miller <davem@davemloft.net>    2014-12-10 13:31:57 -0500
commit    ffde7328a36d16e626bae8468571858d71cd010b (patch)
tree      c84689c687b51d6fe70306f28cb045fa43f9ba59
parent    6e5f59aacbf9527dfe425541c78cb8c56623e7eb (diff)
net: Split netdev_alloc_frag into __alloc_page_frag and add __napi_alloc_frag
This patch splits the netdev_alloc_frag function up so that it can be used on one of two page frag pools instead of being fixed on the netdev_alloc_cache.

By doing this we can add a NAPI-specific function, __napi_alloc_frag, that accesses a pool which is only used from softirq context. The advantage is that we do not need to call local_irq_save/restore, which can be a significant savings.

I also took the opportunity to refactor the core bits that were placed in __alloc_page_frag. First, I updated the allocation to do either a 32K allocation or an order-0 page. This is based on the changes in commit d9b2938aa, where it was found that latencies could be reduced in case of failures. Then I rewrote the logic to work from the end of the page to the start. By doing this the size value doesn't have to be used unless we have run out of space for page fragments. Finally, I cleaned up the atomic bits so that we just do an atomic_sub_and_test, and if that returns true we set page->_count via an atomic_set. This way we can remove the extra conditional for the atomic_read, since it would have led to an atomic_inc in the case of success anyway.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
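For reference, a minimal usage sketch (not part of the patch): a hypothetical NAPI poll routine carving its receive buffers out of the new softirq-only pool with napi_alloc_frag(). EXAMPLE_RX_BUF_LEN, example_poll() and the surrounding driver logic are made up for illustration; the point is that the call happens strictly in softirq context, so the napi_alloc_cache path never has to disable interrupts.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical receive buffer length, for illustration only. */
#define EXAMPLE_RX_BUF_LEN	2048

static int example_poll(struct napi_struct *napi, int budget)
{
	int work_done = 0;

	while (work_done < budget) {
		/* Reserve room for the packet data plus the shared info
		 * block that build_skb() expects at the end of the buffer.
		 */
		void *frag = napi_alloc_frag(SKB_DATA_ALIGN(EXAMPLE_RX_BUF_LEN) +
				SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));

		if (unlikely(!frag))
			break;	/* out of memory; try again on the next poll */

		/* ... post frag to the hardware and later wrap the filled
		 * buffer with build_skb() ...
		 */
		work_done++;
	}

	if (work_done < budget)
		napi_complete(napi);

	return work_done;
}

Callers that cannot guarantee softirq context should keep using netdev_alloc_frag(), which still takes the local_irq_save()/restore() path.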
-rw-r--r--  include/linux/skbuff.h |   2
-rw-r--r--  net/core/skbuff.c      | 117
2 files changed, 79 insertions, 40 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ab0bc43c82a4..736cc99f3f6c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2164,6 +2164,8 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
 	return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
 }
 
+void *napi_alloc_frag(unsigned int fragsz);
+
 /**
  * __dev_alloc_pages - allocate page for network Rx
  * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7a338fb55cc4..56ed17cd2151 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -336,59 +336,85 @@ struct netdev_alloc_cache {
 	unsigned int		pagecnt_bias;
 };
 static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
 
-static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
+				       gfp_t gfp_mask)
 {
-	struct netdev_alloc_cache *nc;
-	void *data = NULL;
-	int order;
-	unsigned long flags;
+	const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER;
+	struct page *page = NULL;
+	gfp_t gfp = gfp_mask;
+
+	if (order) {
+		gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
+		page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
+		nc->frag.size = PAGE_SIZE << (page ? order : 0);
+	}
 
-	local_irq_save(flags);
-	nc = this_cpu_ptr(&netdev_alloc_cache);
-	if (unlikely(!nc->frag.page)) {
+	if (unlikely(!page))
+		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+	nc->frag.page = page;
+
+	return page;
+}
+
+static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache,
+			       unsigned int fragsz, gfp_t gfp_mask)
+{
+	struct netdev_alloc_cache *nc = this_cpu_ptr(cache);
+	struct page *page = nc->frag.page;
+	unsigned int size;
+	int offset;
+
+	if (unlikely(!page)) {
 refill:
-		for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
-			gfp_t gfp = gfp_mask;
+		page = __page_frag_refill(nc, gfp_mask);
+		if (!page)
+			return NULL;
+
+		/* if size can vary use frag.size else just use PAGE_SIZE */
+		size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
 
-			if (order)
-				gfp |= __GFP_COMP | __GFP_NOWARN;
-			nc->frag.page = alloc_pages(gfp, order);
-			if (likely(nc->frag.page))
-				break;
-			if (--order < 0)
-				goto end;
-		}
-		nc->frag.size = PAGE_SIZE << order;
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
-		atomic_add(NETDEV_PAGECNT_MAX_BIAS - 1,
-			   &nc->frag.page->_count);
-		nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
-		nc->frag.offset = 0;
+		atomic_add(size - 1, &page->_count);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pagecnt_bias = size;
+		nc->frag.offset = size;
 	}
 
-	if (nc->frag.offset + fragsz > nc->frag.size) {
-		if (atomic_read(&nc->frag.page->_count) != nc->pagecnt_bias) {
-			if (!atomic_sub_and_test(nc->pagecnt_bias,
-						 &nc->frag.page->_count))
-				goto refill;
-			/* OK, page count is 0, we can safely set it */
-			atomic_set(&nc->frag.page->_count,
-				   NETDEV_PAGECNT_MAX_BIAS);
-		} else {
-			atomic_add(NETDEV_PAGECNT_MAX_BIAS - nc->pagecnt_bias,
-				   &nc->frag.page->_count);
-		}
-		nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
-		nc->frag.offset = 0;
+	offset = nc->frag.offset - fragsz;
+	if (unlikely(offset < 0)) {
+		if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+			goto refill;
+
+		/* if size can vary use frag.size else just use PAGE_SIZE */
+		size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
+
+		/* OK, page count is 0, we can safely set it */
+		atomic_set(&page->_count, size);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pagecnt_bias = size;
+		offset = size - fragsz;
 	}
 
-	data = page_address(nc->frag.page) + nc->frag.offset;
-	nc->frag.offset += fragsz;
 	nc->pagecnt_bias--;
-end:
+	nc->frag.offset = offset;
+
+	return page_address(page) + offset;
+}
+
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+	unsigned long flags;
+	void *data;
+
+	local_irq_save(flags);
+	data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask);
 	local_irq_restore(flags);
 	return data;
 }
@@ -406,6 +432,17 @@ void *netdev_alloc_frag(unsigned int fragsz)
 }
 EXPORT_SYMBOL(netdev_alloc_frag);
 
+static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+	return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask);
+}
+
+void *napi_alloc_frag(unsigned int fragsz)
+{
+	return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
+EXPORT_SYMBOL(napi_alloc_frag);
+
 /**
  * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
  * @dev: network device to receive on