author     David S. Miller <davem@davemloft.net>  2014-12-10 13:32:02 -0500
committer  David S. Miller <davem@davemloft.net>  2014-12-10 13:32:02 -0500
commit     7dbea3e8662eb491e9da109cf8ffc372e160fe9a
tree       e69f89b43fa65a1f4279bc6aa7dc068c0817c208 /net
parent     6e5f59aacbf9527dfe425541c78cb8c56623e7eb
parent     45abfb1069e4c365f6c1e2fc97c5927272725bfa
Merge branch 'napi_page_frags'
Alexander Duyck says:
====================
net: Alloc NAPI page frags from their own pool
This patch series implements a means of allocating page fragments without
the need for the local_irq_save/restore in __netdev_alloc_frag. By doing
this I am able to decrease packet processing time by 11ns per packet in my
test environment.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
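For readers unfamiliar with the new helpers, the sketch below shows roughly how a driver's NAPI poll routine could use napi_alloc_skb() once this series is applied. It is an illustration only: the my_ring/my_rx_desc structures and the my_ring_next_rx() accessor are hypothetical, and the copy-based receive path is deliberately simplified. The point is that the allocation runs in softirq context, so the head comes from the per-CPU napi_alloc_cache without an IRQ save/restore.

/* Illustrative sketch only; driver structures and accessors are invented. */
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/etherdevice.h>

struct my_rx_desc {
        unsigned int len;
        void *data;
};

struct my_ring {
        struct napi_struct napi;
        /* ... device-specific receive ring state ... */
};

/* hypothetical, device-specific ring walker */
static struct my_rx_desc *my_ring_next_rx(struct my_ring *ring);

static int my_driver_poll(struct napi_struct *napi, int budget)
{
        struct my_ring *ring = container_of(napi, struct my_ring, napi);
        int work_done = 0;

        while (work_done < budget) {
                struct my_rx_desc *desc = my_ring_next_rx(ring);
                struct sk_buff *skb;

                if (!desc)
                        break;

                /* head allocated from the per-CPU napi_alloc_cache,
                 * no local_irq_save()/restore() needed here */
                skb = napi_alloc_skb(napi, desc->len);
                if (unlikely(!skb))
                        break;

                memcpy(skb_put(skb, desc->len), desc->data, desc->len);
                skb->protocol = eth_type_trans(skb, napi->dev);
                napi_gro_receive(napi, skb);
                work_done++;
        }

        if (work_done < budget)
                napi_complete(napi);

        return work_done;
}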
Diffstat (limited to 'net')
 net/core/dev.c    |   2
 net/core/skbuff.c | 191
 2 files changed, 145 insertions(+), 48 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 3f191da383f6..80f798da3d9f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4172,7 +4172,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
         struct sk_buff *skb = napi->skb;
 
         if (!skb) {
-                skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
+                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
                 napi->skb = skb;
         }
         return skb;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7a338fb55cc4..ae13ef6b3ea7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -336,59 +336,85 @@ struct netdev_alloc_cache {
         unsigned int pagecnt_bias;
 };
 static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
 
-static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
+                                       gfp_t gfp_mask)
 {
-        struct netdev_alloc_cache *nc;
-        void *data = NULL;
-        int order;
-        unsigned long flags;
+        const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER;
+        struct page *page = NULL;
+        gfp_t gfp = gfp_mask;
+
+        if (order) {
+                gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
+                page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
+                nc->frag.size = PAGE_SIZE << (page ? order : 0);
+        }
 
-        local_irq_save(flags);
-        nc = this_cpu_ptr(&netdev_alloc_cache);
-        if (unlikely(!nc->frag.page)) {
+        if (unlikely(!page))
+                page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+        nc->frag.page = page;
+
+        return page;
+}
+
+static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache,
+                               unsigned int fragsz, gfp_t gfp_mask)
+{
+        struct netdev_alloc_cache *nc = this_cpu_ptr(cache);
+        struct page *page = nc->frag.page;
+        unsigned int size;
+        int offset;
+
+        if (unlikely(!page)) {
 refill:
-                for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
-                        gfp_t gfp = gfp_mask;
+                page = __page_frag_refill(nc, gfp_mask);
+                if (!page)
+                        return NULL;
+
+                /* if size can vary use frag.size else just use PAGE_SIZE */
+                size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
 
-                        if (order)
-                                gfp |= __GFP_COMP | __GFP_NOWARN;
-                        nc->frag.page = alloc_pages(gfp, order);
-                        if (likely(nc->frag.page))
-                                break;
-                        if (--order < 0)
-                                goto end;
-                }
-                nc->frag.size = PAGE_SIZE << order;
                 /* Even if we own the page, we do not use atomic_set().
                  * This would break get_page_unless_zero() users.
                  */
-                atomic_add(NETDEV_PAGECNT_MAX_BIAS - 1,
-                           &nc->frag.page->_count);
-                nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
-                nc->frag.offset = 0;
+                atomic_add(size - 1, &page->_count);
+
+                /* reset page count bias and offset to start of new frag */
+                nc->pagecnt_bias = size;
+                nc->frag.offset = size;
         }
 
-        if (nc->frag.offset + fragsz > nc->frag.size) {
-                if (atomic_read(&nc->frag.page->_count) != nc->pagecnt_bias) {
-                        if (!atomic_sub_and_test(nc->pagecnt_bias,
-                                                 &nc->frag.page->_count))
-                                goto refill;
-                        /* OK, page count is 0, we can safely set it */
-                        atomic_set(&nc->frag.page->_count,
-                                   NETDEV_PAGECNT_MAX_BIAS);
-                } else {
-                        atomic_add(NETDEV_PAGECNT_MAX_BIAS - nc->pagecnt_bias,
-                                   &nc->frag.page->_count);
-                }
-                nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
-                nc->frag.offset = 0;
+        offset = nc->frag.offset - fragsz;
+        if (unlikely(offset < 0)) {
+                if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+                        goto refill;
+
+                /* if size can vary use frag.size else just use PAGE_SIZE */
+                size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
+
+                /* OK, page count is 0, we can safely set it */
+                atomic_set(&page->_count, size);
+
+                /* reset page count bias and offset to start of new frag */
+                nc->pagecnt_bias = size;
+                offset = size - fragsz;
         }
 
-        data = page_address(nc->frag.page) + nc->frag.offset;
-        nc->frag.offset += fragsz;
         nc->pagecnt_bias--;
-end:
+        nc->frag.offset = offset;
+
+        return page_address(page) + offset;
+}
+
+static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+        unsigned long flags;
+        void *data;
+
+        local_irq_save(flags);
+        data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask);
         local_irq_restore(flags);
         return data;
 }
@@ -406,11 +432,25 @@ void *netdev_alloc_frag(unsigned int fragsz)
 }
 EXPORT_SYMBOL(netdev_alloc_frag);
 
+static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+{
+        return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask);
+}
+
+void *napi_alloc_frag(unsigned int fragsz)
+{
+        return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+}
+EXPORT_SYMBOL(napi_alloc_frag);
+
 /**
- * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
- * @dev: network device to receive on
+ * __alloc_rx_skb - allocate an skbuff for rx
  * @length: length to allocate
  * @gfp_mask: get_free_pages mask, passed to alloc_skb
+ * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ *         allocations in case we have to fallback to __alloc_skb()
+ *         If SKB_ALLOC_NAPI is set, page fragment will be allocated
+ *         from napi_cache instead of netdev_cache.
  *
  * Allocate a new &sk_buff and assign it a usage count of one. The
  * buffer has unspecified headroom built in. Users should allocate
@@ -419,11 +459,11 @@ EXPORT_SYMBOL(netdev_alloc_frag);
  *
  * %NULL is returned if there is no free memory.
  */
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
-                                   unsigned int length, gfp_t gfp_mask)
+static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask,
+                                      int flags)
 {
         struct sk_buff *skb = NULL;
-        unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
+        unsigned int fragsz = SKB_DATA_ALIGN(length) +
                               SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
         if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
@@ -432,7 +472,9 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
                 if (sk_memalloc_socks())
                         gfp_mask |= __GFP_MEMALLOC;
 
-                data = __netdev_alloc_frag(fragsz, gfp_mask);
+                data = (flags & SKB_ALLOC_NAPI) ?
+                        __napi_alloc_frag(fragsz, gfp_mask) :
+                        __netdev_alloc_frag(fragsz, gfp_mask);
 
                 if (likely(data)) {
                         skb = build_skb(data, fragsz);
@@ -440,17 +482,72 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
                         put_page(virt_to_head_page(data));
                 }
         } else {
-                skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+                skb = __alloc_skb(length, gfp_mask,
                                   SKB_ALLOC_RX, NUMA_NO_NODE);
         }
+        return skb;
+}
+
+/**
+ * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ * @dev: network device to receive on
+ * @length: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb
+ *
+ * Allocate a new &sk_buff and assign it a usage count of one. The
+ * buffer has NET_SKB_PAD headroom built in. Users should allocate
+ * the headroom they think they need without accounting for the
+ * built in space. The built in space is used for optimisations.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+                                   unsigned int length, gfp_t gfp_mask)
+{
+        struct sk_buff *skb;
+
+        length += NET_SKB_PAD;
+        skb = __alloc_rx_skb(length, gfp_mask, 0);
+
         if (likely(skb)) {
                 skb_reserve(skb, NET_SKB_PAD);
                 skb->dev = dev;
         }
+
         return skb;
 }
 EXPORT_SYMBOL(__netdev_alloc_skb);
 
+/**
+ * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ * @napi: napi instance this buffer was allocated for
+ * @length: length to allocate
+ * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
+ *
+ * Allocate a new sk_buff for use in NAPI receive.  This buffer will
+ * attempt to allocate the head from a special reserved region used
+ * only for NAPI Rx allocation.  By doing this we can save several
+ * CPU cycles by avoiding having to disable and re-enable IRQs.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
+                                 unsigned int length, gfp_t gfp_mask)
+{
+        struct sk_buff *skb;
+
+        length += NET_SKB_PAD + NET_IP_ALIGN;
+        skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI);
+
+        if (likely(skb)) {
+                skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+                skb->dev = napi->dev;
+        }
+
+        return skb;
+}
+EXPORT_SYMBOL(__napi_alloc_skb);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
                      int size, unsigned int truesize)
 {
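As an aside for anyone tracing the reference counting in __alloc_page_frag() above: the trick is to bias page->_count once per refill and then hand out fragments by decrementing the CPU-local pagecnt_bias, so the shared atomic is only touched when the page is exhausted or a consumer releases its fragment. The userspace model below walks through the same accounting; it is not kernel code, all names are invented, and C11 atomics plus malloc() stand in for struct page and the page allocator. fragsz is assumed to be at most MODEL_PAGE_SIZE, mirroring the fragsz <= PAGE_SIZE check in __alloc_rx_skb().

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define MODEL_PAGE_SIZE 32768u          /* stands in for PAGE_SIZE << order */

/* Stand-in for struct page: backing storage plus a shared reference count. */
struct model_page {
        atomic_uint count;              /* models page->_count */
        unsigned char data[MODEL_PAGE_SIZE];
};

/* Stand-in for struct netdev_alloc_cache. */
struct frag_cache {
        struct model_page *page;
        unsigned int pagecnt_bias;      /* references still owned by the cache */
        int offset;
};

/* Drop 'refs' references; the last one frees the page, like put_page(). */
static void model_page_put(struct model_page *page, unsigned int refs)
{
        if (atomic_fetch_sub(&page->count, refs) == refs)
                free(page);
}

static struct model_page *model_refill(struct frag_cache *nc)
{
        struct model_page *page = malloc(sizeof(*page));

        if (!page)
                return NULL;
        /* one shared-counter update per refill, not one per fragment */
        atomic_init(&page->count, MODEL_PAGE_SIZE);
        nc->page = page;
        nc->pagecnt_bias = MODEL_PAGE_SIZE;
        nc->offset = MODEL_PAGE_SIZE;
        return page;
}

/* Mirrors the flow of __alloc_page_frag(): carve fragments from the end of
 * the page and only touch page->count when the page runs out. */
static void *model_alloc_frag(struct frag_cache *nc, unsigned int fragsz)
{
        struct model_page *page = nc->page;
        int offset;

        if (!page && !(page = model_refill(nc)))
                return NULL;

        offset = nc->offset - (int)fragsz;
        if (offset < 0) {
                unsigned int bias = nc->pagecnt_bias;

                if (atomic_fetch_sub(&page->count, bias) != bias) {
                        /* consumers still hold fragments: switch pages; the
                         * old one is freed by its last model_page_put() */
                        if (!(page = model_refill(nc)))
                                return NULL;
                } else {
                        /* count hit zero: we own the page again, recycle it */
                        atomic_store(&page->count, MODEL_PAGE_SIZE);
                        nc->pagecnt_bias = MODEL_PAGE_SIZE;
                }
                offset = MODEL_PAGE_SIZE - fragsz;
        }

        nc->pagecnt_bias--;             /* CPU-local, no atomic needed */
        nc->offset = offset;
        return page->data + offset;
}

int main(void)
{
        struct frag_cache nc = { 0 };
        int i;

        for (i = 0; i < 64; i++) {
                void *frag = model_alloc_frag(&nc, 2048);
                struct model_page *page = nc.page;

                printf("frag %2d -> %p (bias %u)\n", i, frag, nc.pagecnt_bias);
                model_page_put(page, 1);        /* consumer releases its fragment */
        }
        return 0;
}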
