path: root/mm/percpu.c
Diffstat (limited to 'mm/percpu.c')
-rw-r--r--	mm/percpu.c	526
1 file changed, 427 insertions, 99 deletions
diff --git a/mm/percpu.c b/mm/percpu.c
index da997f9800bd..014bab65e0ff 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -76,6 +76,10 @@
 
 #define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
 #define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
+#define PCPU_ATOMIC_MAP_MARGIN_LOW	32
+#define PCPU_ATOMIC_MAP_MARGIN_HIGH	64
+#define PCPU_EMPTY_POP_PAGES_LOW	2
+#define PCPU_EMPTY_POP_PAGES_HIGH	4
 
 #ifdef CONFIG_SMP
 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
@@ -102,12 +106,16 @@ struct pcpu_chunk {
 	int			free_size;	/* free bytes in the chunk */
 	int			contig_hint;	/* max contiguous size hint */
 	void			*base_addr;	/* base address of this chunk */
+
 	int			map_used;	/* # of map entries used before the sentry */
 	int			map_alloc;	/* # of map entries allocated */
 	int			*map;		/* allocation map */
+	struct work_struct	map_extend_work;/* async ->map[] extension */
+
 	void			*data;		/* chunk data */
 	int			first_free;	/* no free below this */
 	bool			immutable;	/* no [de]population allowed */
+	int			nr_populated;	/* # of populated pages */
 	unsigned long		populated[];	/* populated bitmap */
 };
 
@@ -151,38 +159,33 @@ static struct pcpu_chunk *pcpu_first_chunk;
 static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 
+static DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
+static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop */
+
+static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+
 /*
- * Synchronization rules.
- *
- * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
- * protects allocation/reclaim paths, chunks, populated bitmap and
- * vmalloc mapping.  The latter is a spinlock and protects the index
- * data structures - chunk slots, chunks and area maps in chunks.
- *
- * During allocation, pcpu_alloc_mutex is kept locked all the time and
- * pcpu_lock is grabbed and released as necessary.  All actual memory
- * allocations are done using GFP_KERNEL with pcpu_lock released.  In
- * general, percpu memory can't be allocated with irq off but
- * irqsave/restore are still used in alloc path so that it can be used
- * from early init path - sched_init() specifically.
- *
- * Free path accesses and alters only the index data structures, so it
- * can be safely called from atomic context.  When memory needs to be
- * returned to the system, free path schedules reclaim_work which
- * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
- * reclaimed, release both locks and frees the chunks.  Note that it's
- * necessary to grab both locks to remove a chunk from circulation as
- * allocation path might be referencing the chunk with only
- * pcpu_alloc_mutex locked.
+ * The number of empty populated pages, protected by pcpu_lock.  The
+ * reserved chunk doesn't contribute to the count.
  */
-static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
-static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
+static int pcpu_nr_empty_pop_pages;
 
-static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+/*
+ * Balance work is used to populate or destroy chunks asynchronously.  We
+ * try to keep the number of populated free pages between
+ * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
+ * empty chunk.
+ */
+static void pcpu_balance_workfn(struct work_struct *work);
+static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
+static bool pcpu_async_enabled __read_mostly;
+static bool pcpu_atomic_alloc_failed;
 
-/* reclaim work to release fully free chunks, scheduled from free path */
-static void pcpu_reclaim(struct work_struct *work);
-static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
+static void pcpu_schedule_balance_work(void)
+{
+	if (pcpu_async_enabled)
+		schedule_work(&pcpu_balance_work);
+}
 
 static bool pcpu_addr_in_first_chunk(void *addr)
 {
@@ -315,6 +318,38 @@ static void pcpu_mem_free(void *ptr, size_t size)
 }
 
 /**
+ * pcpu_count_occupied_pages - count the number of pages an area occupies
+ * @chunk: chunk of interest
+ * @i: index of the area in question
+ *
+ * Count the number of pages chunk's @i'th area occupies.  When the area's
+ * start and/or end address isn't aligned to page boundary, the straddled
+ * page is included in the count iff the rest of the page is free.
+ */
+static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
+{
+	int off = chunk->map[i] & ~1;
+	int end = chunk->map[i + 1] & ~1;
+
+	if (!PAGE_ALIGNED(off) && i > 0) {
+		int prev = chunk->map[i - 1];
+
+		if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
+			off = round_down(off, PAGE_SIZE);
+	}
+
+	if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
+		int next = chunk->map[i + 1];
+		int nend = chunk->map[i + 2] & ~1;
+
+		if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
+			end = round_up(end, PAGE_SIZE);
+	}
+
+	return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
+}
+
+/**
  * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
  * @chunk: chunk of interest
  * @oslot: the previous slot it was on
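The straddled-page rule in pcpu_count_occupied_pages() above is easiest to see with concrete numbers. What follows is a minimal userspace model, not part of the patch: the area-map encoding matches the kernel's (each entry is a byte offset with bit 0 set when the preceding area is in use, and map[i + 1] ends area i), while PAGE_SIZE, the rounding helpers and the PFN macros are local stand-ins for the kernel definitions.

#include <stdio.h>

#define PAGE_SIZE		4096
#define PAGE_ALIGNED(x)		(((x) & (PAGE_SIZE - 1)) == 0)
#define round_down(x, a)	((x) & ~((a) - 1))
#define round_up(x, a)		round_down((x) + (a) - 1, a)
#define PFN_DOWN(x)		((x) / PAGE_SIZE)
#define PFN_UP(x)		(((x) + PAGE_SIZE - 1) / PAGE_SIZE)

static int count_occupied_pages(const int *map, int map_used, int i)
{
	int off = map[i] & ~1;
	int end = map[i + 1] & ~1;

	/* the straddled first page counts only if the bytes before it are free */
	if (!PAGE_ALIGNED(off) && i > 0) {
		int prev = map[i - 1];

		if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
			off = round_down(off, PAGE_SIZE);
	}

	/* same for the straddled last page */
	if (!PAGE_ALIGNED(end) && i + 1 < map_used) {
		int next = map[i + 1];
		int nend = map[i + 2] & ~1;

		if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
			end = round_up(end, PAGE_SIZE);
	}

	return PFN_DOWN(end) - PFN_UP(off) > 0 ? PFN_DOWN(end) - PFN_UP(off) : 0;
}

int main(void)
{
	/* free [0,100), allocated [100,5000), free [5000,8192), sentry */
	int map[] = { 0, 100 | 1, 5000, 8192 | 1 };

	/*
	 * The [100,5000) allocation straddles pages 0 and 1; because the
	 * bytes around it on both pages are free, both straddled pages are
	 * attributed to it: prints 2.
	 */
	printf("%d\n", count_occupied_pages(map, 3, 1));
	return 0;
}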
@@ -342,9 +377,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 /**
  * pcpu_need_to_extend - determine whether chunk area map needs to be extended
  * @chunk: chunk of interest
+ * @is_atomic: the allocation context
  *
- * Determine whether area map of @chunk needs to be extended to
- * accommodate a new allocation.
+ * Determine whether area map of @chunk needs to be extended.  If
+ * @is_atomic, only the amount necessary for a new allocation is
+ * considered; however, async extension is scheduled if the left amount is
+ * low.  If !@is_atomic, it aims for more empty space.  Combined, this
+ * ensures that the map is likely to have enough available space to
+ * accommodate atomic allocations which can't extend maps directly.
  *
  * CONTEXT:
  * pcpu_lock.
@@ -353,15 +393,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
  * New target map allocation length if extension is necessary, 0
  * otherwise.
  */
-static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
+static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
 {
-	int new_alloc;
+	int margin, new_alloc;
+
+	if (is_atomic) {
+		margin = 3;
+
+		if (chunk->map_alloc <
+		    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
+		    pcpu_async_enabled)
+			schedule_work(&chunk->map_extend_work);
+	} else {
+		margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
+	}
 
-	if (chunk->map_alloc >= chunk->map_used + 3)
+	if (chunk->map_alloc >= chunk->map_used + margin)
 		return 0;
 
 	new_alloc = PCPU_DFL_MAP_ALLOC;
-	while (new_alloc < chunk->map_used + 3)
+	while (new_alloc < chunk->map_used + margin)
 		new_alloc *= 2;
 
 	return new_alloc;
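The sizing rule above (keep map_alloc at least map_used plus a margin, and grow by doubling from PCPU_DFL_MAP_ALLOC) is easier to follow with numbers. Below is an illustrative userspace model, not part of the patch; the constants mirror the ones defined earlier in this file and the margin values follow the kernel code shown above.

#include <stdio.h>

#define PCPU_DFL_MAP_ALLOC		16
#define PCPU_ATOMIC_MAP_MARGIN_HIGH	64

static int need_to_extend(int map_alloc, int map_used, int is_atomic)
{
	/*
	 * Atomic callers only need enough spare slots for the single
	 * allocation at hand; sleepable callers aim for a much larger
	 * cushion so later atomic allocations don't run the map dry.
	 */
	int margin = is_atomic ? 3 : PCPU_ATOMIC_MAP_MARGIN_HIGH;
	int new_alloc;

	if (map_alloc >= map_used + margin)
		return 0;

	new_alloc = PCPU_DFL_MAP_ALLOC;
	while (new_alloc < map_used + margin)
		new_alloc *= 2;

	return new_alloc;
}

int main(void)
{
	/* 100 entries used, 128 allocated: enough for an atomic alloc ... */
	printf("%d\n", need_to_extend(128, 100, 1));	/* 0 */
	/* ... but a sleepable alloc wants 100 + 64 slots: 16 doubles to 256 */
	printf("%d\n", need_to_extend(128, 100, 0));	/* 256 */
	return 0;
}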
@@ -418,11 +469,76 @@ out_unlock:
 	return 0;
 }
 
+static void pcpu_map_extend_workfn(struct work_struct *work)
+{
+	struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
+						map_extend_work);
+	int new_alloc;
+
+	spin_lock_irq(&pcpu_lock);
+	new_alloc = pcpu_need_to_extend(chunk, false);
+	spin_unlock_irq(&pcpu_lock);
+
+	if (new_alloc)
+		pcpu_extend_area_map(chunk, new_alloc);
+}
+
+/**
+ * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
+ * @chunk: chunk the candidate area belongs to
+ * @off: the offset to the start of the candidate area
+ * @this_size: the size of the candidate area
+ * @size: the size of the target allocation
+ * @align: the alignment of the target allocation
+ * @pop_only: only allocate from already populated region
+ *
+ * We're trying to allocate @size bytes aligned at @align.  @chunk's area
+ * at @off sized @this_size is a candidate.  This function determines
+ * whether the target allocation fits in the candidate area and returns the
+ * number of bytes to pad after @off.  If the target area doesn't fit, -1
+ * is returned.
+ *
+ * If @pop_only is %true, this function only considers the already
+ * populated part of the candidate area.
+ */
+static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
+			    int size, int align, bool pop_only)
+{
+	int cand_off = off;
+
+	while (true) {
+		int head = ALIGN(cand_off, align) - off;
+		int page_start, page_end, rs, re;
+
+		if (this_size < head + size)
+			return -1;
+
+		if (!pop_only)
+			return head;
+
+		/*
+		 * If the first unpopulated page is beyond the end of the
+		 * allocation, the whole allocation is populated;
+		 * otherwise, retry from the end of the unpopulated area.
+		 */
+		page_start = PFN_DOWN(head + off);
+		page_end = PFN_UP(head + off + size);
+
+		rs = page_start;
+		pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
+		if (rs >= page_end)
+			return head;
+		cand_off = re * PAGE_SIZE;
+	}
+}
+
 /**
  * pcpu_alloc_area - allocate area from a pcpu_chunk
  * @chunk: chunk of interest
  * @size: wanted size in bytes
  * @align: wanted align
+ * @pop_only: allocate only from the populated area
+ * @occ_pages_p: out param for the number of pages the area occupies
  *
  * Try to allocate @size bytes area aligned at @align from @chunk.
  * Note that this function only allocates the offset.  It doesn't
@@ -437,7 +553,8 @@ out_unlock:
  * Allocated offset in @chunk on success, -1 if no matching area is
  * found.
  */
-static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
+			   bool pop_only, int *occ_pages_p)
 {
 	int oslot = pcpu_chunk_slot(chunk);
 	int max_contig = 0;
@@ -453,11 +570,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 		if (off & 1)
 			continue;
 
-		/* extra for alignment requirement */
-		head = ALIGN(off, align) - off;
-
 		this_size = (p[1] & ~1) - off;
-		if (this_size < head + size) {
+
+		head = pcpu_fit_in_area(chunk, off, this_size, size, align,
+					pop_only);
+		if (head < 0) {
 			if (!seen_free) {
 				chunk->first_free = i;
 				seen_free = true;
@@ -526,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 		chunk->free_size -= size;
 		*p |= 1;
 
+		*occ_pages_p = pcpu_count_occupied_pages(chunk, i);
 		pcpu_chunk_relocate(chunk, oslot);
 		return off;
 	}
@@ -541,6 +659,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
  * pcpu_free_area - free area to a pcpu_chunk
  * @chunk: chunk of interest
  * @freeme: offset of area to free
+ * @occ_pages_p: out param for the number of pages the area occupies
  *
  * Free area starting from @freeme to @chunk.  Note that this function
  * only modifies the allocation map.  It doesn't depopulate or unmap
@@ -549,7 +668,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
  * CONTEXT:
  * pcpu_lock.
  */
-static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
+static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
+			   int *occ_pages_p)
 {
 	int oslot = pcpu_chunk_slot(chunk);
 	int off = 0;
@@ -580,6 +700,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 	*p = off &= ~1;
 	chunk->free_size += (p[1] & ~1) - off;
 
+	*occ_pages_p = pcpu_count_occupied_pages(chunk, i);
+
 	/* merge with next? */
 	if (!(p[1] & 1))
 		to_free++;
@@ -620,6 +742,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
 	chunk->map_used = 1;
 
 	INIT_LIST_HEAD(&chunk->list);
+	INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
 	chunk->free_size = pcpu_unit_size;
 	chunk->contig_hint = pcpu_unit_size;
 
@@ -634,6 +757,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
 	pcpu_mem_free(chunk, pcpu_chunk_struct_size);
 }
 
+/**
+ * pcpu_chunk_populated - post-population bookkeeping
+ * @chunk: pcpu_chunk which got populated
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
+ * the bookkeeping information accordingly.  Must be called after each
+ * successful population.
+ */
+static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
+				 int page_start, int page_end)
+{
+	int nr = page_end - page_start;
+
+	lockdep_assert_held(&pcpu_lock);
+
+	bitmap_set(chunk->populated, page_start, nr);
+	chunk->nr_populated += nr;
+	pcpu_nr_empty_pop_pages += nr;
+}
+
+/**
+ * pcpu_chunk_depopulated - post-depopulation bookkeeping
+ * @chunk: pcpu_chunk which got depopulated
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * Pages in [@page_start,@page_end) have been depopulated from @chunk.
+ * Update the bookkeeping information accordingly.  Must be called after
+ * each successful depopulation.
+ */
+static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
+				   int page_start, int page_end)
+{
+	int nr = page_end - page_start;
+
+	lockdep_assert_held(&pcpu_lock);
+
+	bitmap_clear(chunk->populated, page_start, nr);
+	chunk->nr_populated -= nr;
+	pcpu_nr_empty_pop_pages -= nr;
+}
+
 /*
  * Chunk management implementation.
  *
@@ -695,21 +862,23 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  * @size: size of area to allocate in bytes
  * @align: alignment of area (max PAGE_SIZE)
  * @reserved: allocate from the reserved chunk if available
+ * @gfp: allocation flags
  *
- * Allocate percpu area of @size bytes aligned at @align.
- *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
+ * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
+ * contain %GFP_KERNEL, the allocation is atomic.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
-static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
+static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
+				 gfp_t gfp)
 {
 	static int warn_limit = 10;
 	struct pcpu_chunk *chunk;
 	const char *err;
-	int slot, off, new_alloc;
+	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
+	int occ_pages = 0;
+	int slot, off, new_alloc, cpu, ret;
 	unsigned long flags;
 	void __percpu *ptr;
 
@@ -728,7 +897,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
 		return NULL;
 	}
 
-	mutex_lock(&pcpu_alloc_mutex);
 	spin_lock_irqsave(&pcpu_lock, flags);
 
 	/* serve reserved allocations from the reserved chunk if available */
@@ -740,16 +908,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
 			goto fail_unlock;
 		}
 
-		while ((new_alloc = pcpu_need_to_extend(chunk))) {
+		while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
 			spin_unlock_irqrestore(&pcpu_lock, flags);
-			if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
+			if (is_atomic ||
+			    pcpu_extend_area_map(chunk, new_alloc) < 0) {
 				err = "failed to extend area map of reserved chunk";
-				goto fail_unlock_mutex;
+				goto fail;
 			}
 			spin_lock_irqsave(&pcpu_lock, flags);
 		}
 
-		off = pcpu_alloc_area(chunk, size, align);
+		off = pcpu_alloc_area(chunk, size, align, is_atomic,
+				      &occ_pages);
 		if (off >= 0)
 			goto area_found;
 
@@ -764,13 +934,15 @@ restart:
 			if (size > chunk->contig_hint)
 				continue;
 
-			new_alloc = pcpu_need_to_extend(chunk);
+			new_alloc = pcpu_need_to_extend(chunk, is_atomic);
 			if (new_alloc) {
+				if (is_atomic)
+					continue;
 				spin_unlock_irqrestore(&pcpu_lock, flags);
 				if (pcpu_extend_area_map(chunk,
 							 new_alloc) < 0) {
 					err = "failed to extend area map";
-					goto fail_unlock_mutex;
+					goto fail;
 				}
 				spin_lock_irqsave(&pcpu_lock, flags);
 				/*
@@ -780,74 +952,134 @@ restart:
 				goto restart;
 			}
 
-			off = pcpu_alloc_area(chunk, size, align);
+			off = pcpu_alloc_area(chunk, size, align, is_atomic,
+					      &occ_pages);
 			if (off >= 0)
 				goto area_found;
 		}
 	}
 
-	/* hmmm... no space left, create a new chunk */
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 
-	chunk = pcpu_create_chunk();
-	if (!chunk) {
-		err = "failed to allocate new chunk";
-		goto fail_unlock_mutex;
+	/*
+	 * No space left.  Create a new chunk.  We don't want multiple
+	 * tasks to create chunks simultaneously.  Serialize and create iff
+	 * there's still no empty chunk after grabbing the mutex.
+	 */
+	if (is_atomic)
+		goto fail;
+
+	mutex_lock(&pcpu_alloc_mutex);
+
+	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
+		chunk = pcpu_create_chunk();
+		if (!chunk) {
+			mutex_unlock(&pcpu_alloc_mutex);
+			err = "failed to allocate new chunk";
+			goto fail;
+		}
+
+		spin_lock_irqsave(&pcpu_lock, flags);
+		pcpu_chunk_relocate(chunk, -1);
+	} else {
+		spin_lock_irqsave(&pcpu_lock, flags);
 	}
 
-	spin_lock_irqsave(&pcpu_lock, flags);
-	pcpu_chunk_relocate(chunk, -1);
+	mutex_unlock(&pcpu_alloc_mutex);
 	goto restart;
 
 area_found:
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 
-	/* populate, map and clear the area */
-	if (pcpu_populate_chunk(chunk, off, size)) {
-		spin_lock_irqsave(&pcpu_lock, flags);
-		pcpu_free_area(chunk, off);
-		err = "failed to populate";
-		goto fail_unlock;
+	/* populate if not all pages are already there */
+	if (!is_atomic) {
+		int page_start, page_end, rs, re;
+
+		mutex_lock(&pcpu_alloc_mutex);
+
+		page_start = PFN_DOWN(off);
+		page_end = PFN_UP(off + size);
+
+		pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+			WARN_ON(chunk->immutable);
+
+			ret = pcpu_populate_chunk(chunk, rs, re);
+
+			spin_lock_irqsave(&pcpu_lock, flags);
+			if (ret) {
+				mutex_unlock(&pcpu_alloc_mutex);
+				pcpu_free_area(chunk, off, &occ_pages);
+				err = "failed to populate";
+				goto fail_unlock;
+			}
+			pcpu_chunk_populated(chunk, rs, re);
+			spin_unlock_irqrestore(&pcpu_lock, flags);
+		}
+
+		mutex_unlock(&pcpu_alloc_mutex);
 	}
 
-	mutex_unlock(&pcpu_alloc_mutex);
+	if (chunk != pcpu_reserved_chunk)
+		pcpu_nr_empty_pop_pages -= occ_pages;
+
+	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
+		pcpu_schedule_balance_work();
+
+	/* clear the areas and return address relative to base address */
+	for_each_possible_cpu(cpu)
+		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
 
-	/* return address relative to base address */
 	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
 	kmemleak_alloc_percpu(ptr, size);
 	return ptr;
 
 fail_unlock:
 	spin_unlock_irqrestore(&pcpu_lock, flags);
-fail_unlock_mutex:
-	mutex_unlock(&pcpu_alloc_mutex);
-	if (warn_limit) {
-		pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
-			   "%s\n", size, align, err);
+fail:
+	if (!is_atomic && warn_limit) {
+		pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+			   size, align, is_atomic, err);
 		dump_stack();
 		if (!--warn_limit)
 			pr_info("PERCPU: limit reached, disable warning\n");
 	}
+	if (is_atomic) {
+		/* see the flag handling in pcpu_balance_workfn() */
+		pcpu_atomic_alloc_failed = true;
+		pcpu_schedule_balance_work();
+	}
 	return NULL;
 }
 
 /**
- * __alloc_percpu - allocate dynamic percpu area
+ * __alloc_percpu_gfp - allocate dynamic percpu area
  * @size: size of area to allocate in bytes
  * @align: alignment of area (max PAGE_SIZE)
+ * @gfp: allocation flags
  *
- * Allocate zero-filled percpu area of @size bytes aligned at @align.
- * Might sleep.  Might trigger writeouts.
- *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
+ * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
+ * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
+ * be called from any context but is a lot more likely to fail.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
+void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
+{
+	return pcpu_alloc(size, align, false, gfp);
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
+
+/**
+ * __alloc_percpu - allocate dynamic percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
+ */
 void __percpu *__alloc_percpu(size_t size, size_t align)
 {
-	return pcpu_alloc(size, align, false);
+	return pcpu_alloc(size, align, false, GFP_KERNEL);
 }
 EXPORT_SYMBOL_GPL(__alloc_percpu);
 
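With the gfp-aware entry point above, code that cannot sleep can now request percpu memory directly. The following is a usage sketch only, not taken from the patch: the struct, the wrapper function and the error handling are hypothetical, while __alloc_percpu_gfp() and free_percpu() are the real interfaces shown in this diff. Passing a gfp mask without GFP_KERNEL (for example GFP_NOWAIT) makes the allocation atomic, at the cost of a noticeably higher chance of failure.

/* hypothetical caller-side sketch; assumes the usual kernel headers */
struct foo_counters {
	u64	packets;
	u64	bytes;
};

static struct foo_counters __percpu *foo_alloc_counters_atomic(void)
{
	struct foo_counters __percpu *ctrs;

	/* atomic percpu allocation; may fail more often than GFP_KERNEL */
	ctrs = __alloc_percpu_gfp(sizeof(struct foo_counters),
				  __alignof__(struct foo_counters),
				  GFP_NOWAIT);
	if (!ctrs)
		return NULL;	/* caller must cope, e.g. retry from task context */

	return ctrs;
}

The result is released with free_percpu() exactly as before; only the allocation side changes.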
@@ -869,44 +1101,121 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  */
 void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 {
-	return pcpu_alloc(size, align, true);
+	return pcpu_alloc(size, align, true, GFP_KERNEL);
 }
 
 /**
- * pcpu_reclaim - reclaim fully free chunks, workqueue function
+ * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
- *
- * CONTEXT:
- * workqueue context.
  */
-static void pcpu_reclaim(struct work_struct *work)
+static void pcpu_balance_workfn(struct work_struct *work)
 {
-	LIST_HEAD(todo);
-	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
+	LIST_HEAD(to_free);
+	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
 	struct pcpu_chunk *chunk, *next;
+	int slot, nr_to_pop, ret;
 
+	/*
+	 * There's no reason to keep around multiple unused chunks and VM
+	 * areas can be scarce.  Destroy all free chunks except for one.
+	 */
 	mutex_lock(&pcpu_alloc_mutex);
 	spin_lock_irq(&pcpu_lock);
 
-	list_for_each_entry_safe(chunk, next, head, list) {
+	list_for_each_entry_safe(chunk, next, free_head, list) {
 		WARN_ON(chunk->immutable);
 
 		/* spare the first one */
-		if (chunk == list_first_entry(head, struct pcpu_chunk, list))
+		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
 			continue;
 
-		list_move(&chunk->list, &todo);
+		list_move(&chunk->list, &to_free);
 	}
 
 	spin_unlock_irq(&pcpu_lock);
 
-	list_for_each_entry_safe(chunk, next, &todo, list) {
-		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
+	list_for_each_entry_safe(chunk, next, &to_free, list) {
+		int rs, re;
+
+		pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+			pcpu_depopulate_chunk(chunk, rs, re);
+			spin_lock_irq(&pcpu_lock);
+			pcpu_chunk_depopulated(chunk, rs, re);
+			spin_unlock_irq(&pcpu_lock);
+		}
 		pcpu_destroy_chunk(chunk);
 	}
 
+	/*
+	 * Ensure there are certain number of free populated pages for
+	 * atomic allocs.  Fill up from the most packed so that atomic
+	 * allocs don't increase fragmentation.  If atomic allocation
+	 * failed previously, always populate the maximum amount.  This
+	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
+	 * failing indefinitely; however, large atomic allocs are not
+	 * something we support properly and can be highly unreliable and
+	 * inefficient.
+	 */
+retry_pop:
+	if (pcpu_atomic_alloc_failed) {
+		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+		/* best effort anyway, don't worry about synchronization */
+		pcpu_atomic_alloc_failed = false;
+	} else {
+		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
+				  pcpu_nr_empty_pop_pages,
+				  0, PCPU_EMPTY_POP_PAGES_HIGH);
+	}
+
+	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+		int nr_unpop = 0, rs, re;
+
+		if (!nr_to_pop)
+			break;
+
+		spin_lock_irq(&pcpu_lock);
+		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			nr_unpop = pcpu_unit_pages - chunk->nr_populated;
+			if (nr_unpop)
+				break;
+		}
+		spin_unlock_irq(&pcpu_lock);
+
+		if (!nr_unpop)
+			continue;
+
+		/* @chunk can't go away while pcpu_alloc_mutex is held */
+		pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+			int nr = min(re - rs, nr_to_pop);
+
+			ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+			if (!ret) {
+				nr_to_pop -= nr;
+				spin_lock_irq(&pcpu_lock);
+				pcpu_chunk_populated(chunk, rs, rs + nr);
+				spin_unlock_irq(&pcpu_lock);
+			} else {
+				nr_to_pop = 0;
+			}
+
+			if (!nr_to_pop)
+				break;
+		}
+	}
+
+	if (nr_to_pop) {
+		/* ran out of chunks to populate, create a new one and retry */
+		chunk = pcpu_create_chunk();
+		if (chunk) {
+			spin_lock_irq(&pcpu_lock);
+			pcpu_chunk_relocate(chunk, -1);
+			spin_unlock_irq(&pcpu_lock);
+			goto retry_pop;
+		}
+	}
+
 	mutex_unlock(&pcpu_alloc_mutex);
 }
 
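The refill target computed at the top of the repopulation loop above is a simple watermark scheme: top the pool of empty populated pages back up toward PCPU_EMPTY_POP_PAGES_HIGH, or all the way to the maximum after an atomic allocation has failed. Below is an illustrative userspace model, not part of the patch; clamp() is open-coded and the constants mirror the ones defined at the top of the file.

#include <stdio.h>

#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4

static int nr_pages_to_populate(int nr_empty_pop_pages, int atomic_alloc_failed)
{
	int want;

	/* after an atomic failure, repopulate the maximum amount */
	if (atomic_alloc_failed)
		return PCPU_EMPTY_POP_PAGES_HIGH;

	/* otherwise top up the deficit, clamped to [0, HIGH] */
	want = PCPU_EMPTY_POP_PAGES_HIGH - nr_empty_pop_pages;
	if (want < 0)
		want = 0;
	if (want > PCPU_EMPTY_POP_PAGES_HIGH)
		want = PCPU_EMPTY_POP_PAGES_HIGH;
	return want;
}

int main(void)
{
	/*
	 * The allocator schedules this work once the count drops below
	 * PCPU_EMPTY_POP_PAGES_LOW (2); with one empty populated page
	 * left it asks for three more.
	 */
	printf("%d\n", nr_pages_to_populate(1, 0));	/* 3 */
	printf("%d\n", nr_pages_to_populate(5, 0));	/* 0 */
	printf("%d\n", nr_pages_to_populate(5, 1));	/* 4 */
	return 0;
}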
@@ -924,7 +1233,7 @@ void free_percpu(void __percpu *ptr)
 	void *addr;
 	struct pcpu_chunk *chunk;
 	unsigned long flags;
-	int off;
+	int off, occ_pages;
 
 	if (!ptr)
 		return;
@@ -938,7 +1247,10 @@ void free_percpu(void __percpu *ptr)
 	chunk = pcpu_chunk_addr_search(addr);
 	off = addr - chunk->base_addr;
 
-	pcpu_free_area(chunk, off);
+	pcpu_free_area(chunk, off, &occ_pages);
+
+	if (chunk != pcpu_reserved_chunk)
+		pcpu_nr_empty_pop_pages += occ_pages;
 
 	/* if there are more than one fully free chunks, wake up grim reaper */
 	if (chunk->free_size == pcpu_unit_size) {
@@ -946,7 +1258,7 @@ void free_percpu(void __percpu *ptr)
 
 		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
 			if (pos != chunk) {
-				schedule_work(&pcpu_reclaim_work);
+				pcpu_schedule_balance_work();
 				break;
 			}
 	}
@@ -1336,11 +1648,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	 */
 	schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
 	INIT_LIST_HEAD(&schunk->list);
+	INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
 	schunk->base_addr = base_addr;
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
 	schunk->immutable = true;
 	bitmap_fill(schunk->populated, pcpu_unit_pages);
+	schunk->nr_populated = pcpu_unit_pages;
 
 	if (ai->reserved_size) {
 		schunk->free_size = ai->reserved_size;
@@ -1364,11 +1678,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	if (dyn_size) {
 		dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
 		INIT_LIST_HEAD(&dchunk->list);
+		INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
 		dchunk->base_addr = base_addr;
 		dchunk->map = dmap;
 		dchunk->map_alloc = ARRAY_SIZE(dmap);
 		dchunk->immutable = true;
 		bitmap_fill(dchunk->populated, pcpu_unit_pages);
+		dchunk->nr_populated = pcpu_unit_pages;
 
 		dchunk->contig_hint = dchunk->free_size = dyn_size;
 		dchunk->map[0] = 1;
@@ -1379,6 +1695,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 
 	/* link the first chunk in */
 	pcpu_first_chunk = dchunk ?: schunk;
+	pcpu_nr_empty_pop_pages +=
+		pcpu_count_occupied_pages(pcpu_first_chunk, 1);
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
 	/* we're done */
@@ -1932,8 +2250,6 @@ void __init setup_per_cpu_areas(void)
 
 	if (pcpu_setup_first_chunk(ai, fc) < 0)
 		panic("Failed to initialize percpu areas.");
-
-	pcpu_free_alloc_info(ai);
 }
 
 #endif /* CONFIG_SMP */
@@ -1967,3 +2283,15 @@ void __init percpu_init_late(void)
 		spin_unlock_irqrestore(&pcpu_lock, flags);
 	}
 }
+
+/*
+ * Percpu allocator is initialized early during boot when neither slab nor
+ * workqueue is available.  Plug async management until everything is up
+ * and running.
+ */
+static int __init percpu_enable_async(void)
+{
+	pcpu_async_enabled = true;
+	return 0;
+}
+subsys_initcall(percpu_enable_async);