author    Linus Torvalds <torvalds@linux-foundation.org>    2014-10-10 07:26:02 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2014-10-10 07:26:02 -0400
commit    c798360cd1438090d51eeaa8e67985da11362eba (patch)
tree      0107d3b9ee7476264c3357287787d393545bd2d9 /mm/percpu.c
parent    b211e9d7c861bdb37b86d6384da9edfb80949ceb (diff)
parent    6ae833c7fe0c6ef1f0ab13cc775da230d6f4c256 (diff)
Merge branch 'for-3.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
Pull percpu updates from Tejun Heo:
 "A lot of activities on percpu front.  Notable changes are...

  - percpu allocator now can take @gfp.  If @gfp doesn't contain
    GFP_KERNEL, it tries to allocate from what's already available to
    the allocator and a work item tries to keep the reserve around
    certain level so that these atomic allocations usually succeed.

    This will replace the ad-hoc percpu memory pool used by
    blk-throttle and also be used by the planned blkcg support for
    writeback IOs.

    Please note that I noticed a bug in how @gfp is interpreted while
    preparing this pull request and applied the fix 6ae833c7fe0c
    ("percpu: fix how @gfp is interpreted by the percpu allocator")
    just now.

  - percpu_ref now uses longs for percpu and global counters instead
    of ints.  It leads to more sparse packing of the percpu counters
    on 64bit machines but the overhead should be negligible and this
    allows using percpu_ref for refcnting pages and in-memory objects
    directly.

  - The switching between percpu and single counter modes of a
    percpu_ref is made independent of putting the base ref and a
    percpu_ref can now optionally be initialized in single or killed
    mode.  This allows avoiding percpu shutdown latency for cases
    where the refcounted objects may be synchronously created and
    destroyed in rapid succession with only a fraction of them
    reaching fully operational status (SCSI probing does this when
    combined with blk-mq support).  It's also planned to be used to
    implement forced single mode to detect underflow more timely for
    debugging.

  There's a separate branch percpu/for-3.18-consistent-ops which
  cleans up the duplicate percpu accessors.  That branch causes a
  number of conflicts with s390 and other trees.  I'll send a separate
  pull request w/ resolutions once other branches are merged"

* 'for-3.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu: (33 commits)
  percpu: fix how @gfp is interpreted by the percpu allocator
  blk-mq, percpu_ref: start q->mq_usage_counter in atomic mode
  percpu_ref: make INIT_ATOMIC and switch_to_atomic() sticky
  percpu_ref: add PERCPU_REF_INIT_* flags
  percpu_ref: decouple switching to percpu mode and reinit
  percpu_ref: decouple switching to atomic mode and killing
  percpu_ref: add PCPU_REF_DEAD
  percpu_ref: rename things to prepare for decoupling percpu/atomic mode switch
  percpu_ref: replace pcpu_ prefix with percpu_
  percpu_ref: minor code and comment updates
  percpu_ref: relocate percpu_ref_reinit()
  Revert "blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe"
  Revert "percpu: free percpu allocation info for uniprocessor system"
  percpu-refcount: make percpu_ref based on longs instead of ints
  percpu-refcount: improve WARN messages
  percpu: fix locking regression in the failure path of pcpu_alloc()
  percpu-refcount: add @gfp to percpu_ref_init()
  proportions: add @gfp to init functions
  percpu_counter: add @gfp to percpu_counter_init()
  percpu_counter: make percpu_counters_lock irq-safe
  ...
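As a rough illustration of what the new @gfp parameter buys callers (a minimal sketch, not code from this tree: the struct foo_stats type, the function names, and the error handling are invented for the example; only __alloc_percpu_gfp() and free_percpu() are real interfaces exported by this branch):

#include <linux/types.h>
#include <linux/gfp.h>
#include <linux/percpu.h>

/* hypothetical per-CPU stats object, used only for illustration */
struct foo_stats {
	u64	hits;
	u64	misses;
};

static struct foo_stats __percpu *foo_stats_alloc_atomic(void)
{
	struct foo_stats __percpu *stats;

	/*
	 * GFP_NOWAIT doesn't contain GFP_KERNEL, so pcpu_alloc() treats
	 * this as an atomic allocation: the area is carved only out of
	 * already populated pages and NULL is returned instead of
	 * sleeping.  The balance work item then tries to refill the
	 * populated-page reserve for later atomic requests.
	 */
	stats = __alloc_percpu_gfp(sizeof(*stats), __alignof__(*stats),
				   GFP_NOWAIT);
	return stats;	/* may be NULL; the caller must cope */
}

static void foo_stats_free(struct foo_stats __percpu *stats)
{
	free_percpu(stats);
}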
Diffstat (limited to 'mm/percpu.c')
-rw-r--r--    mm/percpu.c    526
1 file changed, 427 insertions(+), 99 deletions(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index da997f9800bd..014bab65e0ff 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -76,6 +76,10 @@
76 76
77#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 77#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
78#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 78#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
79#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
80#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
81#define PCPU_EMPTY_POP_PAGES_LOW 2
82#define PCPU_EMPTY_POP_PAGES_HIGH 4
79 83
80#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
81/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 85/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
@@ -102,12 +106,16 @@ struct pcpu_chunk {
102 int free_size; /* free bytes in the chunk */ 106 int free_size; /* free bytes in the chunk */
103 int contig_hint; /* max contiguous size hint */ 107 int contig_hint; /* max contiguous size hint */
104 void *base_addr; /* base address of this chunk */ 108 void *base_addr; /* base address of this chunk */
109
105 int map_used; /* # of map entries used before the sentry */ 110 int map_used; /* # of map entries used before the sentry */
106 int map_alloc; /* # of map entries allocated */ 111 int map_alloc; /* # of map entries allocated */
107 int *map; /* allocation map */ 112 int *map; /* allocation map */
113 struct work_struct map_extend_work;/* async ->map[] extension */
114
108 void *data; /* chunk data */ 115 void *data; /* chunk data */
109 int first_free; /* no free below this */ 116 int first_free; /* no free below this */
110 bool immutable; /* no [de]population allowed */ 117 bool immutable; /* no [de]population allowed */
118 int nr_populated; /* # of populated pages */
111 unsigned long populated[]; /* populated bitmap */ 119 unsigned long populated[]; /* populated bitmap */
112}; 120};
113 121
@@ -151,38 +159,33 @@ static struct pcpu_chunk *pcpu_first_chunk;
151static struct pcpu_chunk *pcpu_reserved_chunk; 159static struct pcpu_chunk *pcpu_reserved_chunk;
152static int pcpu_reserved_chunk_limit; 160static int pcpu_reserved_chunk_limit;
153 161
162static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
163static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
164
165static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
166
154/* 167/*
155 * Synchronization rules. 168 * The number of empty populated pages, protected by pcpu_lock. The
156 * 169 * reserved chunk doesn't contribute to the count.
157 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
158 * protects allocation/reclaim paths, chunks, populated bitmap and
159 * vmalloc mapping. The latter is a spinlock and protects the index
160 * data structures - chunk slots, chunks and area maps in chunks.
161 *
162 * During allocation, pcpu_alloc_mutex is kept locked all the time and
163 * pcpu_lock is grabbed and released as necessary. All actual memory
164 * allocations are done using GFP_KERNEL with pcpu_lock released. In
165 * general, percpu memory can't be allocated with irq off but
166 * irqsave/restore are still used in alloc path so that it can be used
167 * from early init path - sched_init() specifically.
168 *
169 * Free path accesses and alters only the index data structures, so it
170 * can be safely called from atomic context. When memory needs to be
171 * returned to the system, free path schedules reclaim_work which
172 * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
173 * reclaimed, release both locks and frees the chunks. Note that it's
174 * necessary to grab both locks to remove a chunk from circulation as
175 * allocation path might be referencing the chunk with only
176 * pcpu_alloc_mutex locked.
177 */ 170 */
178static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ 171static int pcpu_nr_empty_pop_pages;
179static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
180 172
181static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ 173/*
174 * Balance work is used to populate or destroy chunks asynchronously. We
175 * try to keep the number of populated free pages between
176 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
177 * empty chunk.
178 */
179static void pcpu_balance_workfn(struct work_struct *work);
180static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
181static bool pcpu_async_enabled __read_mostly;
182static bool pcpu_atomic_alloc_failed;
182 183
183/* reclaim work to release fully free chunks, scheduled from free path */ 184static void pcpu_schedule_balance_work(void)
184static void pcpu_reclaim(struct work_struct *work); 185{
185static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); 186 if (pcpu_async_enabled)
187 schedule_work(&pcpu_balance_work);
188}
186 189
187static bool pcpu_addr_in_first_chunk(void *addr) 190static bool pcpu_addr_in_first_chunk(void *addr)
188{ 191{
@@ -315,6 +318,38 @@ static void pcpu_mem_free(void *ptr, size_t size)
315} 318}
316 319
317/** 320/**
321 * pcpu_count_occupied_pages - count the number of pages an area occupies
322 * @chunk: chunk of interest
323 * @i: index of the area in question
324 *
325 * Count the number of pages chunk's @i'th area occupies. When the area's
326 * start and/or end address isn't aligned to page boundary, the straddled
327 * page is included in the count iff the rest of the page is free.
328 */
329static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
330{
331 int off = chunk->map[i] & ~1;
332 int end = chunk->map[i + 1] & ~1;
333
334 if (!PAGE_ALIGNED(off) && i > 0) {
335 int prev = chunk->map[i - 1];
336
337 if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
338 off = round_down(off, PAGE_SIZE);
339 }
340
341 if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
342 int next = chunk->map[i + 1];
343 int nend = chunk->map[i + 2] & ~1;
344
345 if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
346 end = round_up(end, PAGE_SIZE);
347 }
348
349 return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
350}
351
352/**
318 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot 353 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
319 * @chunk: chunk of interest 354 * @chunk: chunk of interest
320 * @oslot: the previous slot it was on 355 * @oslot: the previous slot it was on
@@ -342,9 +377,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
342/** 377/**
343 * pcpu_need_to_extend - determine whether chunk area map needs to be extended 378 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
344 * @chunk: chunk of interest 379 * @chunk: chunk of interest
380 * @is_atomic: the allocation context
345 * 381 *
346 * Determine whether area map of @chunk needs to be extended to 382 * Determine whether area map of @chunk needs to be extended. If
347 * accommodate a new allocation. 383 * @is_atomic, only the amount necessary for a new allocation is
384 * considered; however, async extension is scheduled if the left amount is
385 * low. If !@is_atomic, it aims for more empty space. Combined, this
386 * ensures that the map is likely to have enough available space to
387 * accommodate atomic allocations which can't extend maps directly.
348 * 388 *
349 * CONTEXT: 389 * CONTEXT:
350 * pcpu_lock. 390 * pcpu_lock.
@@ -353,15 +393,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
353 * New target map allocation length if extension is necessary, 0 393 * New target map allocation length if extension is necessary, 0
354 * otherwise. 394 * otherwise.
355 */ 395 */
356static int pcpu_need_to_extend(struct pcpu_chunk *chunk) 396static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
357{ 397{
358 int new_alloc; 398 int margin, new_alloc;
399
400 if (is_atomic) {
401 margin = 3;
402
403 if (chunk->map_alloc <
404 chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
405 pcpu_async_enabled)
406 schedule_work(&chunk->map_extend_work);
407 } else {
408 margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
409 }
359 410
360 if (chunk->map_alloc >= chunk->map_used + 3) 411 if (chunk->map_alloc >= chunk->map_used + margin)
361 return 0; 412 return 0;
362 413
363 new_alloc = PCPU_DFL_MAP_ALLOC; 414 new_alloc = PCPU_DFL_MAP_ALLOC;
364 while (new_alloc < chunk->map_used + 3) 415 while (new_alloc < chunk->map_used + margin)
365 new_alloc *= 2; 416 new_alloc *= 2;
366 417
367 return new_alloc; 418 return new_alloc;
@@ -418,11 +469,76 @@ out_unlock:
418 return 0; 469 return 0;
419} 470}
420 471
472static void pcpu_map_extend_workfn(struct work_struct *work)
473{
474 struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
475 map_extend_work);
476 int new_alloc;
477
478 spin_lock_irq(&pcpu_lock);
479 new_alloc = pcpu_need_to_extend(chunk, false);
480 spin_unlock_irq(&pcpu_lock);
481
482 if (new_alloc)
483 pcpu_extend_area_map(chunk, new_alloc);
484}
485
486/**
487 * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
488 * @chunk: chunk the candidate area belongs to
489 * @off: the offset to the start of the candidate area
490 * @this_size: the size of the candidate area
491 * @size: the size of the target allocation
492 * @align: the alignment of the target allocation
493 * @pop_only: only allocate from already populated region
494 *
495 * We're trying to allocate @size bytes aligned at @align. @chunk's area
496 * at @off sized @this_size is a candidate. This function determines
497 * whether the target allocation fits in the candidate area and returns the
498 * number of bytes to pad after @off. If the target area doesn't fit, -1
499 * is returned.
500 *
501 * If @pop_only is %true, this function only considers the already
502 * populated part of the candidate area.
503 */
504static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
505 int size, int align, bool pop_only)
506{
507 int cand_off = off;
508
509 while (true) {
510 int head = ALIGN(cand_off, align) - off;
511 int page_start, page_end, rs, re;
512
513 if (this_size < head + size)
514 return -1;
515
516 if (!pop_only)
517 return head;
518
519 /*
520 * If the first unpopulated page is beyond the end of the
521 * allocation, the whole allocation is populated;
522 * otherwise, retry from the end of the unpopulated area.
523 */
524 page_start = PFN_DOWN(head + off);
525 page_end = PFN_UP(head + off + size);
526
527 rs = page_start;
528 pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
529 if (rs >= page_end)
530 return head;
531 cand_off = re * PAGE_SIZE;
532 }
533}
534
421/** 535/**
422 * pcpu_alloc_area - allocate area from a pcpu_chunk 536 * pcpu_alloc_area - allocate area from a pcpu_chunk
423 * @chunk: chunk of interest 537 * @chunk: chunk of interest
424 * @size: wanted size in bytes 538 * @size: wanted size in bytes
425 * @align: wanted align 539 * @align: wanted align
540 * @pop_only: allocate only from the populated area
541 * @occ_pages_p: out param for the number of pages the area occupies
426 * 542 *
427 * Try to allocate @size bytes area aligned at @align from @chunk. 543 * Try to allocate @size bytes area aligned at @align from @chunk.
428 * Note that this function only allocates the offset. It doesn't 544 * Note that this function only allocates the offset. It doesn't
@@ -437,7 +553,8 @@ out_unlock:
437 * Allocated offset in @chunk on success, -1 if no matching area is 553 * Allocated offset in @chunk on success, -1 if no matching area is
438 * found. 554 * found.
439 */ 555 */
440static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) 556static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
557 bool pop_only, int *occ_pages_p)
441{ 558{
442 int oslot = pcpu_chunk_slot(chunk); 559 int oslot = pcpu_chunk_slot(chunk);
443 int max_contig = 0; 560 int max_contig = 0;
@@ -453,11 +570,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
453 if (off & 1) 570 if (off & 1)
454 continue; 571 continue;
455 572
456 /* extra for alignment requirement */
457 head = ALIGN(off, align) - off;
458
459 this_size = (p[1] & ~1) - off; 573 this_size = (p[1] & ~1) - off;
460 if (this_size < head + size) { 574
575 head = pcpu_fit_in_area(chunk, off, this_size, size, align,
576 pop_only);
577 if (head < 0) {
461 if (!seen_free) { 578 if (!seen_free) {
462 chunk->first_free = i; 579 chunk->first_free = i;
463 seen_free = true; 580 seen_free = true;
@@ -526,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
526 chunk->free_size -= size; 643 chunk->free_size -= size;
527 *p |= 1; 644 *p |= 1;
528 645
646 *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
529 pcpu_chunk_relocate(chunk, oslot); 647 pcpu_chunk_relocate(chunk, oslot);
530 return off; 648 return off;
531 } 649 }
@@ -541,6 +659,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
541 * pcpu_free_area - free area to a pcpu_chunk 659 * pcpu_free_area - free area to a pcpu_chunk
542 * @chunk: chunk of interest 660 * @chunk: chunk of interest
543 * @freeme: offset of area to free 661 * @freeme: offset of area to free
662 * @occ_pages_p: out param for the number of pages the area occupies
544 * 663 *
545 * Free area starting from @freeme to @chunk. Note that this function 664 * Free area starting from @freeme to @chunk. Note that this function
546 * only modifies the allocation map. It doesn't depopulate or unmap 665 * only modifies the allocation map. It doesn't depopulate or unmap
@@ -549,7 +668,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
549 * CONTEXT: 668 * CONTEXT:
550 * pcpu_lock. 669 * pcpu_lock.
551 */ 670 */
552static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) 671static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
672 int *occ_pages_p)
553{ 673{
554 int oslot = pcpu_chunk_slot(chunk); 674 int oslot = pcpu_chunk_slot(chunk);
555 int off = 0; 675 int off = 0;
@@ -580,6 +700,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
580 *p = off &= ~1; 700 *p = off &= ~1;
581 chunk->free_size += (p[1] & ~1) - off; 701 chunk->free_size += (p[1] & ~1) - off;
582 702
703 *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
704
583 /* merge with next? */ 705 /* merge with next? */
584 if (!(p[1] & 1)) 706 if (!(p[1] & 1))
585 to_free++; 707 to_free++;
@@ -620,6 +742,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
620 chunk->map_used = 1; 742 chunk->map_used = 1;
621 743
622 INIT_LIST_HEAD(&chunk->list); 744 INIT_LIST_HEAD(&chunk->list);
745 INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
623 chunk->free_size = pcpu_unit_size; 746 chunk->free_size = pcpu_unit_size;
624 chunk->contig_hint = pcpu_unit_size; 747 chunk->contig_hint = pcpu_unit_size;
625 748
@@ -634,6 +757,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
634 pcpu_mem_free(chunk, pcpu_chunk_struct_size); 757 pcpu_mem_free(chunk, pcpu_chunk_struct_size);
635} 758}
636 759
760/**
761 * pcpu_chunk_populated - post-population bookkeeping
762 * @chunk: pcpu_chunk which got populated
763 * @page_start: the start page
764 * @page_end: the end page
765 *
766 * Pages in [@page_start,@page_end) have been populated to @chunk. Update
767 * the bookkeeping information accordingly. Must be called after each
768 * successful population.
769 */
770static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
771 int page_start, int page_end)
772{
773 int nr = page_end - page_start;
774
775 lockdep_assert_held(&pcpu_lock);
776
777 bitmap_set(chunk->populated, page_start, nr);
778 chunk->nr_populated += nr;
779 pcpu_nr_empty_pop_pages += nr;
780}
781
782/**
783 * pcpu_chunk_depopulated - post-depopulation bookkeeping
784 * @chunk: pcpu_chunk which got depopulated
785 * @page_start: the start page
786 * @page_end: the end page
787 *
788 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
789 * Update the bookkeeping information accordingly. Must be called after
790 * each successful depopulation.
791 */
792static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
793 int page_start, int page_end)
794{
795 int nr = page_end - page_start;
796
797 lockdep_assert_held(&pcpu_lock);
798
799 bitmap_clear(chunk->populated, page_start, nr);
800 chunk->nr_populated -= nr;
801 pcpu_nr_empty_pop_pages -= nr;
802}
803
637/* 804/*
638 * Chunk management implementation. 805 * Chunk management implementation.
639 * 806 *
@@ -695,21 +862,23 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
695 * @size: size of area to allocate in bytes 862 * @size: size of area to allocate in bytes
696 * @align: alignment of area (max PAGE_SIZE) 863 * @align: alignment of area (max PAGE_SIZE)
697 * @reserved: allocate from the reserved chunk if available 864 * @reserved: allocate from the reserved chunk if available
865 * @gfp: allocation flags
698 * 866 *
699 * Allocate percpu area of @size bytes aligned at @align. 867 * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't
700 * 868 * contain %GFP_KERNEL, the allocation is atomic.
701 * CONTEXT:
702 * Does GFP_KERNEL allocation.
703 * 869 *
704 * RETURNS: 870 * RETURNS:
705 * Percpu pointer to the allocated area on success, NULL on failure. 871 * Percpu pointer to the allocated area on success, NULL on failure.
706 */ 872 */
707static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) 873static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
874 gfp_t gfp)
708{ 875{
709 static int warn_limit = 10; 876 static int warn_limit = 10;
710 struct pcpu_chunk *chunk; 877 struct pcpu_chunk *chunk;
711 const char *err; 878 const char *err;
712 int slot, off, new_alloc; 879 bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
880 int occ_pages = 0;
881 int slot, off, new_alloc, cpu, ret;
713 unsigned long flags; 882 unsigned long flags;
714 void __percpu *ptr; 883 void __percpu *ptr;
715 884
@@ -728,7 +897,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
728 return NULL; 897 return NULL;
729 } 898 }
730 899
731 mutex_lock(&pcpu_alloc_mutex);
732 spin_lock_irqsave(&pcpu_lock, flags); 900 spin_lock_irqsave(&pcpu_lock, flags);
733 901
734 /* serve reserved allocations from the reserved chunk if available */ 902 /* serve reserved allocations from the reserved chunk if available */
@@ -740,16 +908,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
740 goto fail_unlock; 908 goto fail_unlock;
741 } 909 }
742 910
743 while ((new_alloc = pcpu_need_to_extend(chunk))) { 911 while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
744 spin_unlock_irqrestore(&pcpu_lock, flags); 912 spin_unlock_irqrestore(&pcpu_lock, flags);
745 if (pcpu_extend_area_map(chunk, new_alloc) < 0) { 913 if (is_atomic ||
914 pcpu_extend_area_map(chunk, new_alloc) < 0) {
746 err = "failed to extend area map of reserved chunk"; 915 err = "failed to extend area map of reserved chunk";
747 goto fail_unlock_mutex; 916 goto fail;
748 } 917 }
749 spin_lock_irqsave(&pcpu_lock, flags); 918 spin_lock_irqsave(&pcpu_lock, flags);
750 } 919 }
751 920
752 off = pcpu_alloc_area(chunk, size, align); 921 off = pcpu_alloc_area(chunk, size, align, is_atomic,
922 &occ_pages);
753 if (off >= 0) 923 if (off >= 0)
754 goto area_found; 924 goto area_found;
755 925
@@ -764,13 +934,15 @@ restart:
764 if (size > chunk->contig_hint) 934 if (size > chunk->contig_hint)
765 continue; 935 continue;
766 936
767 new_alloc = pcpu_need_to_extend(chunk); 937 new_alloc = pcpu_need_to_extend(chunk, is_atomic);
768 if (new_alloc) { 938 if (new_alloc) {
939 if (is_atomic)
940 continue;
769 spin_unlock_irqrestore(&pcpu_lock, flags); 941 spin_unlock_irqrestore(&pcpu_lock, flags);
770 if (pcpu_extend_area_map(chunk, 942 if (pcpu_extend_area_map(chunk,
771 new_alloc) < 0) { 943 new_alloc) < 0) {
772 err = "failed to extend area map"; 944 err = "failed to extend area map";
773 goto fail_unlock_mutex; 945 goto fail;
774 } 946 }
775 spin_lock_irqsave(&pcpu_lock, flags); 947 spin_lock_irqsave(&pcpu_lock, flags);
776 /* 948 /*
@@ -780,74 +952,134 @@ restart:
780 goto restart; 952 goto restart;
781 } 953 }
782 954
783 off = pcpu_alloc_area(chunk, size, align); 955 off = pcpu_alloc_area(chunk, size, align, is_atomic,
956 &occ_pages);
784 if (off >= 0) 957 if (off >= 0)
785 goto area_found; 958 goto area_found;
786 } 959 }
787 } 960 }
788 961
789 /* hmmm... no space left, create a new chunk */
790 spin_unlock_irqrestore(&pcpu_lock, flags); 962 spin_unlock_irqrestore(&pcpu_lock, flags);
791 963
792 chunk = pcpu_create_chunk(); 964 /*
793 if (!chunk) { 965 * No space left. Create a new chunk. We don't want multiple
794 err = "failed to allocate new chunk"; 966 * tasks to create chunks simultaneously. Serialize and create iff
795 goto fail_unlock_mutex; 967 * there's still no empty chunk after grabbing the mutex.
968 */
969 if (is_atomic)
970 goto fail;
971
972 mutex_lock(&pcpu_alloc_mutex);
973
974 if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
975 chunk = pcpu_create_chunk();
976 if (!chunk) {
977 mutex_unlock(&pcpu_alloc_mutex);
978 err = "failed to allocate new chunk";
979 goto fail;
980 }
981
982 spin_lock_irqsave(&pcpu_lock, flags);
983 pcpu_chunk_relocate(chunk, -1);
984 } else {
985 spin_lock_irqsave(&pcpu_lock, flags);
796 } 986 }
797 987
798 spin_lock_irqsave(&pcpu_lock, flags); 988 mutex_unlock(&pcpu_alloc_mutex);
799 pcpu_chunk_relocate(chunk, -1);
800 goto restart; 989 goto restart;
801 990
802area_found: 991area_found:
803 spin_unlock_irqrestore(&pcpu_lock, flags); 992 spin_unlock_irqrestore(&pcpu_lock, flags);
804 993
805 /* populate, map and clear the area */ 994 /* populate if not all pages are already there */
806 if (pcpu_populate_chunk(chunk, off, size)) { 995 if (!is_atomic) {
807 spin_lock_irqsave(&pcpu_lock, flags); 996 int page_start, page_end, rs, re;
808 pcpu_free_area(chunk, off); 997
809 err = "failed to populate"; 998 mutex_lock(&pcpu_alloc_mutex);
810 goto fail_unlock; 999
1000 page_start = PFN_DOWN(off);
1001 page_end = PFN_UP(off + size);
1002
1003 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
1004 WARN_ON(chunk->immutable);
1005
1006 ret = pcpu_populate_chunk(chunk, rs, re);
1007
1008 spin_lock_irqsave(&pcpu_lock, flags);
1009 if (ret) {
1010 mutex_unlock(&pcpu_alloc_mutex);
1011 pcpu_free_area(chunk, off, &occ_pages);
1012 err = "failed to populate";
1013 goto fail_unlock;
1014 }
1015 pcpu_chunk_populated(chunk, rs, re);
1016 spin_unlock_irqrestore(&pcpu_lock, flags);
1017 }
1018
1019 mutex_unlock(&pcpu_alloc_mutex);
811 } 1020 }
812 1021
813 mutex_unlock(&pcpu_alloc_mutex); 1022 if (chunk != pcpu_reserved_chunk)
1023 pcpu_nr_empty_pop_pages -= occ_pages;
1024
1025 if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
1026 pcpu_schedule_balance_work();
1027
1028 /* clear the areas and return address relative to base address */
1029 for_each_possible_cpu(cpu)
1030 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
814 1031
815 /* return address relative to base address */
816 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); 1032 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
817 kmemleak_alloc_percpu(ptr, size); 1033 kmemleak_alloc_percpu(ptr, size);
818 return ptr; 1034 return ptr;
819 1035
820fail_unlock: 1036fail_unlock:
821 spin_unlock_irqrestore(&pcpu_lock, flags); 1037 spin_unlock_irqrestore(&pcpu_lock, flags);
822fail_unlock_mutex: 1038fail:
823 mutex_unlock(&pcpu_alloc_mutex); 1039 if (!is_atomic && warn_limit) {
824 if (warn_limit) { 1040 pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
825 pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " 1041 size, align, is_atomic, err);
826 "%s\n", size, align, err);
827 dump_stack(); 1042 dump_stack();
828 if (!--warn_limit) 1043 if (!--warn_limit)
829 pr_info("PERCPU: limit reached, disable warning\n"); 1044 pr_info("PERCPU: limit reached, disable warning\n");
830 } 1045 }
1046 if (is_atomic) {
1047 /* see the flag handling in pcpu_balance_workfn() */
1048 pcpu_atomic_alloc_failed = true;
1049 pcpu_schedule_balance_work();
1050 }
831 return NULL; 1051 return NULL;
832} 1052}
833 1053
834/** 1054/**
835 * __alloc_percpu - allocate dynamic percpu area 1055 * __alloc_percpu_gfp - allocate dynamic percpu area
836 * @size: size of area to allocate in bytes 1056 * @size: size of area to allocate in bytes
837 * @align: alignment of area (max PAGE_SIZE) 1057 * @align: alignment of area (max PAGE_SIZE)
1058 * @gfp: allocation flags
838 * 1059 *
839 * Allocate zero-filled percpu area of @size bytes aligned at @align. 1060 * Allocate zero-filled percpu area of @size bytes aligned at @align. If
840 * Might sleep. Might trigger writeouts. 1061 * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
841 * 1062 * be called from any context but is a lot more likely to fail.
842 * CONTEXT:
843 * Does GFP_KERNEL allocation.
844 * 1063 *
845 * RETURNS: 1064 * RETURNS:
846 * Percpu pointer to the allocated area on success, NULL on failure. 1065 * Percpu pointer to the allocated area on success, NULL on failure.
847 */ 1066 */
1067void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
1068{
1069 return pcpu_alloc(size, align, false, gfp);
1070}
1071EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
1072
1073/**
1074 * __alloc_percpu - allocate dynamic percpu area
1075 * @size: size of area to allocate in bytes
1076 * @align: alignment of area (max PAGE_SIZE)
1077 *
1078 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
1079 */
848void __percpu *__alloc_percpu(size_t size, size_t align) 1080void __percpu *__alloc_percpu(size_t size, size_t align)
849{ 1081{
850 return pcpu_alloc(size, align, false); 1082 return pcpu_alloc(size, align, false, GFP_KERNEL);
851} 1083}
852EXPORT_SYMBOL_GPL(__alloc_percpu); 1084EXPORT_SYMBOL_GPL(__alloc_percpu);
853 1085
@@ -869,44 +1101,121 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
869 */ 1101 */
870void __percpu *__alloc_reserved_percpu(size_t size, size_t align) 1102void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
871{ 1103{
872 return pcpu_alloc(size, align, true); 1104 return pcpu_alloc(size, align, true, GFP_KERNEL);
873} 1105}
874 1106
875/** 1107/**
876 * pcpu_reclaim - reclaim fully free chunks, workqueue function 1108 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
877 * @work: unused 1109 * @work: unused
878 * 1110 *
879 * Reclaim all fully free chunks except for the first one. 1111 * Reclaim all fully free chunks except for the first one.
880 *
881 * CONTEXT:
882 * workqueue context.
883 */ 1112 */
884static void pcpu_reclaim(struct work_struct *work) 1113static void pcpu_balance_workfn(struct work_struct *work)
885{ 1114{
886 LIST_HEAD(todo); 1115 LIST_HEAD(to_free);
887 struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; 1116 struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
888 struct pcpu_chunk *chunk, *next; 1117 struct pcpu_chunk *chunk, *next;
1118 int slot, nr_to_pop, ret;
889 1119
1120 /*
1121 * There's no reason to keep around multiple unused chunks and VM
1122 * areas can be scarce. Destroy all free chunks except for one.
1123 */
890 mutex_lock(&pcpu_alloc_mutex); 1124 mutex_lock(&pcpu_alloc_mutex);
891 spin_lock_irq(&pcpu_lock); 1125 spin_lock_irq(&pcpu_lock);
892 1126
893 list_for_each_entry_safe(chunk, next, head, list) { 1127 list_for_each_entry_safe(chunk, next, free_head, list) {
894 WARN_ON(chunk->immutable); 1128 WARN_ON(chunk->immutable);
895 1129
896 /* spare the first one */ 1130 /* spare the first one */
897 if (chunk == list_first_entry(head, struct pcpu_chunk, list)) 1131 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
898 continue; 1132 continue;
899 1133
900 list_move(&chunk->list, &todo); 1134 list_move(&chunk->list, &to_free);
901 } 1135 }
902 1136
903 spin_unlock_irq(&pcpu_lock); 1137 spin_unlock_irq(&pcpu_lock);
904 1138
905 list_for_each_entry_safe(chunk, next, &todo, list) { 1139 list_for_each_entry_safe(chunk, next, &to_free, list) {
906 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); 1140 int rs, re;
1141
1142 pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1143 pcpu_depopulate_chunk(chunk, rs, re);
1144 spin_lock_irq(&pcpu_lock);
1145 pcpu_chunk_depopulated(chunk, rs, re);
1146 spin_unlock_irq(&pcpu_lock);
1147 }
907 pcpu_destroy_chunk(chunk); 1148 pcpu_destroy_chunk(chunk);
908 } 1149 }
909 1150
1151 /*
1152 * Ensure there are certain number of free populated pages for
1153 * atomic allocs. Fill up from the most packed so that atomic
1154 * allocs don't increase fragmentation. If atomic allocation
1155 * failed previously, always populate the maximum amount. This
1156 * should prevent atomic allocs larger than PAGE_SIZE from keeping
1157 * failing indefinitely; however, large atomic allocs are not
1158 * something we support properly and can be highly unreliable and
1159 * inefficient.
1160 */
1161retry_pop:
1162 if (pcpu_atomic_alloc_failed) {
1163 nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
1164 /* best effort anyway, don't worry about synchronization */
1165 pcpu_atomic_alloc_failed = false;
1166 } else {
1167 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
1168 pcpu_nr_empty_pop_pages,
1169 0, PCPU_EMPTY_POP_PAGES_HIGH);
1170 }
1171
1172 for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
1173 int nr_unpop = 0, rs, re;
1174
1175 if (!nr_to_pop)
1176 break;
1177
1178 spin_lock_irq(&pcpu_lock);
1179 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
1180 nr_unpop = pcpu_unit_pages - chunk->nr_populated;
1181 if (nr_unpop)
1182 break;
1183 }
1184 spin_unlock_irq(&pcpu_lock);
1185
1186 if (!nr_unpop)
1187 continue;
1188
1189 /* @chunk can't go away while pcpu_alloc_mutex is held */
1190 pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1191 int nr = min(re - rs, nr_to_pop);
1192
1193 ret = pcpu_populate_chunk(chunk, rs, rs + nr);
1194 if (!ret) {
1195 nr_to_pop -= nr;
1196 spin_lock_irq(&pcpu_lock);
1197 pcpu_chunk_populated(chunk, rs, rs + nr);
1198 spin_unlock_irq(&pcpu_lock);
1199 } else {
1200 nr_to_pop = 0;
1201 }
1202
1203 if (!nr_to_pop)
1204 break;
1205 }
1206 }
1207
1208 if (nr_to_pop) {
1209 /* ran out of chunks to populate, create a new one and retry */
1210 chunk = pcpu_create_chunk();
1211 if (chunk) {
1212 spin_lock_irq(&pcpu_lock);
1213 pcpu_chunk_relocate(chunk, -1);
1214 spin_unlock_irq(&pcpu_lock);
1215 goto retry_pop;
1216 }
1217 }
1218
910 mutex_unlock(&pcpu_alloc_mutex); 1219 mutex_unlock(&pcpu_alloc_mutex);
911} 1220}
912 1221
@@ -924,7 +1233,7 @@ void free_percpu(void __percpu *ptr)
924 void *addr; 1233 void *addr;
925 struct pcpu_chunk *chunk; 1234 struct pcpu_chunk *chunk;
926 unsigned long flags; 1235 unsigned long flags;
927 int off; 1236 int off, occ_pages;
928 1237
929 if (!ptr) 1238 if (!ptr)
930 return; 1239 return;
@@ -938,7 +1247,10 @@ void free_percpu(void __percpu *ptr)
938 chunk = pcpu_chunk_addr_search(addr); 1247 chunk = pcpu_chunk_addr_search(addr);
939 off = addr - chunk->base_addr; 1248 off = addr - chunk->base_addr;
940 1249
941 pcpu_free_area(chunk, off); 1250 pcpu_free_area(chunk, off, &occ_pages);
1251
1252 if (chunk != pcpu_reserved_chunk)
1253 pcpu_nr_empty_pop_pages += occ_pages;
942 1254
943 /* if there are more than one fully free chunks, wake up grim reaper */ 1255 /* if there are more than one fully free chunks, wake up grim reaper */
944 if (chunk->free_size == pcpu_unit_size) { 1256 if (chunk->free_size == pcpu_unit_size) {
@@ -946,7 +1258,7 @@ void free_percpu(void __percpu *ptr)
946 1258
947 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) 1259 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
948 if (pos != chunk) { 1260 if (pos != chunk) {
949 schedule_work(&pcpu_reclaim_work); 1261 pcpu_schedule_balance_work();
950 break; 1262 break;
951 } 1263 }
952 } 1264 }
@@ -1336,11 +1648,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1336 */ 1648 */
1337 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1649 schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1338 INIT_LIST_HEAD(&schunk->list); 1650 INIT_LIST_HEAD(&schunk->list);
1651 INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
1339 schunk->base_addr = base_addr; 1652 schunk->base_addr = base_addr;
1340 schunk->map = smap; 1653 schunk->map = smap;
1341 schunk->map_alloc = ARRAY_SIZE(smap); 1654 schunk->map_alloc = ARRAY_SIZE(smap);
1342 schunk->immutable = true; 1655 schunk->immutable = true;
1343 bitmap_fill(schunk->populated, pcpu_unit_pages); 1656 bitmap_fill(schunk->populated, pcpu_unit_pages);
1657 schunk->nr_populated = pcpu_unit_pages;
1344 1658
1345 if (ai->reserved_size) { 1659 if (ai->reserved_size) {
1346 schunk->free_size = ai->reserved_size; 1660 schunk->free_size = ai->reserved_size;
@@ -1364,11 +1678,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1364 if (dyn_size) { 1678 if (dyn_size) {
1365 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); 1679 dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
1366 INIT_LIST_HEAD(&dchunk->list); 1680 INIT_LIST_HEAD(&dchunk->list);
1681 INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
1367 dchunk->base_addr = base_addr; 1682 dchunk->base_addr = base_addr;
1368 dchunk->map = dmap; 1683 dchunk->map = dmap;
1369 dchunk->map_alloc = ARRAY_SIZE(dmap); 1684 dchunk->map_alloc = ARRAY_SIZE(dmap);
1370 dchunk->immutable = true; 1685 dchunk->immutable = true;
1371 bitmap_fill(dchunk->populated, pcpu_unit_pages); 1686 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1687 dchunk->nr_populated = pcpu_unit_pages;
1372 1688
1373 dchunk->contig_hint = dchunk->free_size = dyn_size; 1689 dchunk->contig_hint = dchunk->free_size = dyn_size;
1374 dchunk->map[0] = 1; 1690 dchunk->map[0] = 1;
@@ -1379,6 +1695,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1379 1695
1380 /* link the first chunk in */ 1696 /* link the first chunk in */
1381 pcpu_first_chunk = dchunk ?: schunk; 1697 pcpu_first_chunk = dchunk ?: schunk;
1698 pcpu_nr_empty_pop_pages +=
1699 pcpu_count_occupied_pages(pcpu_first_chunk, 1);
1382 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1700 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1383 1701
1384 /* we're done */ 1702 /* we're done */
@@ -1932,8 +2250,6 @@ void __init setup_per_cpu_areas(void)
1932 2250
1933 if (pcpu_setup_first_chunk(ai, fc) < 0) 2251 if (pcpu_setup_first_chunk(ai, fc) < 0)
1934 panic("Failed to initialize percpu areas."); 2252 panic("Failed to initialize percpu areas.");
1935
1936 pcpu_free_alloc_info(ai);
1937} 2253}
1938 2254
1939#endif /* CONFIG_SMP */ 2255#endif /* CONFIG_SMP */
@@ -1967,3 +2283,15 @@ void __init percpu_init_late(void)
1967 spin_unlock_irqrestore(&pcpu_lock, flags); 2283 spin_unlock_irqrestore(&pcpu_lock, flags);
1968 } 2284 }
1969} 2285}
2286
2287/*
2288 * Percpu allocator is initialized early during boot when neither slab or
2289 * workqueue is available. Plug async management until everything is up
2290 * and running.
2291 */
2292static int __init percpu_enable_async(void)
2293{
2294 pcpu_async_enabled = true;
2295 return 0;
2296}
2297subsys_initcall(percpu_enable_async);
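On the percpu_ref side described in the pull message above, a refcount can now start out in atomic mode and be flipped to percpu mode only once the object is fully operational, which is roughly the pattern the "blk-mq, percpu_ref: start q->mq_usage_counter in atomic mode" commit applies to q->mq_usage_counter. The sketch below assumes the PERCPU_REF_INIT_ATOMIC flag and percpu_ref_switch_to_percpu() added by this branch; foo_ref, foo_ref_release() and foo_probe_done() are invented names, not code from the tree:

#include <linux/gfp.h>
#include <linux/percpu-refcount.h>

static struct percpu_ref foo_ref;

static void foo_ref_release(struct percpu_ref *ref)
{
	/* last reference is gone; tear the object down here */
}

static int foo_create(void)
{
	/*
	 * Start in atomic mode: if the object dies shortly after
	 * creation (e.g. a failed probe), no percpu shutdown latency
	 * is paid.  @gfp is GFP_KERNEL since creation may sleep; the
	 * flag, not the gfp mask, controls the starting mode.
	 */
	return percpu_ref_init(&foo_ref, foo_ref_release,
			       PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
}

static void foo_probe_done(void)
{
	/* fully operational now; make gets/puts percpu-fast */
	percpu_ref_switch_to_percpu(&foo_ref);
}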