author     Tejun Heo <tj@kernel.org>   2009-07-03 19:11:00 -0400
committer  Tejun Heo <tj@kernel.org>   2009-07-03 19:11:00 -0400
commit     ce3141a277ff6cc37e51008b8888dc2cb7456ef1 (patch)
tree       8bd43d595d85fa37de5f3a7030580aa56b590028 /mm
parent     c8a51be4cabb7009db5f865169389242d49c4c60 (diff)
percpu: drop pcpu_chunk->page[]
percpu core doesn't need to track all the allocated pages. It needs to know whether certain pages are populated and a way to reverse-map an address to its page when freeing. This patch drops pcpu_chunk->page[] and uses a populated bitmap plus vmalloc_to_page() lookup instead. Using vmalloc_to_page() exclusively is also possible but complicates first chunk handling, inflates cache footprint and prevents non-standard memory allocation for percpu memory.

pcpu_chunk->page[] was used to track each page's allocation and allowed the asymmetric population which happens on the failure path; however, with a single bitmap for all units, this is no longer possible. Bite the bullet and rewrite the (de)populate functions so that things are done in clearly separated steps and asymmetric population doesn't happen. This makes the (de)population process much more modular and will also ease implementing non-standard memory usage in the future (e.g. large pages).

This makes the @get_page_fn parameter to pcpu_setup_first_chunk() unnecessary. The parameter is dropped and all first chunk helpers are updated accordingly. Please note that despite the volume, most changes to the first chunk helpers are symbol renames for variables which no longer need to be referenced outside of the helper.

This change reduces the memory usage and cache footprint of pcpu_chunk. Now only #unit_pages bits are necessary per chunk.

[ Impact: reduced memory usage and cache footprint for bookkeeping ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
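[Editor's note] The bookkeeping change above boils down to walking the per-chunk populated bitmap as [rs, re) page-index regions, which is what the pcpu_for_each_pop_region()/pcpu_for_each_unpop_region() iterators added by the patch do. The following is a minimal userspace sketch of that iteration pattern, assuming made-up helpers next_bit() and next_pop() in place of the kernel's find_next_bit()/find_next_zero_bit() and pcpu_next_pop(); it is an illustration, not code from the patch.

/*
 * Userspace sketch of bitmap-region iteration (not kernel code).
 * One bit per unit page; set == populated.  next_bit() and next_pop()
 * are hypothetical stand-ins for find_next_bit()/find_next_zero_bit()
 * and the patch's pcpu_next_pop() helper.
 */
#include <stdio.h>

#define UNIT_PAGES    16
#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

/* return the first index >= start whose bit equals @want, or @size */
static int next_bit(const unsigned long *map, int size, int start, int want)
{
        int i;

        for (i = start; i < size; i++)
                if ((int)((map[i / BITS_PER_LONG] >>
                           (i % BITS_PER_LONG)) & 1UL) == want)
                        return i;
        return size;
}

/* advance (*rs, *re) to the next populated region, as pcpu_next_pop() does */
static void next_pop(const unsigned long *map, int *rs, int *re, int end)
{
        *rs = next_bit(map, end, *rs, 1);
        *re = next_bit(map, end, *rs + 1, 0);
}

int main(void)
{
        unsigned long populated[1] = { 0x3cUL | (0x7UL << 9) }; /* pages 2-5, 9-11 */
        int rs, re;

        /* mirrors pcpu_for_each_pop_region(chunk, rs, re, 0, UNIT_PAGES) */
        for (rs = 0, next_pop(populated, &rs, &re, UNIT_PAGES); rs < re;
             rs = re + 1, next_pop(populated, &rs, &re, UNIT_PAGES))
                printf("populated region: [%d, %d)\n", rs, re);

        return 0;
}

Only BITS_TO_LONGS(pcpu_unit_pages) longs of bitmap are needed per chunk, which is where the reduced memory usage and cache footprint come from.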
Diffstat (limited to 'mm')
-rw-r--r--  mm/percpu.c | 604
1 file changed, 384 insertions(+), 220 deletions(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index 639fce4d2caf..21756814d99f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -94,8 +94,7 @@ struct pcpu_chunk {
94 int map_alloc; /* # of map entries allocated */ 94 int map_alloc; /* # of map entries allocated */
95 int *map; /* allocation map */ 95 int *map; /* allocation map */
96 bool immutable; /* no [de]population allowed */ 96 bool immutable; /* no [de]population allowed */
97 struct page **page; /* points to page array */ 97 unsigned long populated[]; /* populated bitmap */
98 struct page *page_ar[]; /* #cpus * UNIT_PAGES */
99}; 98};
100 99
101static int pcpu_unit_pages __read_mostly; 100static int pcpu_unit_pages __read_mostly;
@@ -129,9 +128,9 @@ static int pcpu_reserved_chunk_limit;
129 * Synchronization rules. 128 * Synchronization rules.
130 * 129 *
131 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 130 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
132 * protects allocation/reclaim paths, chunks and chunk->page arrays. 131 * protects allocation/reclaim paths, chunks, populated bitmap and
133 * The latter is a spinlock and protects the index data structures - 132 * vmalloc mapping. The latter is a spinlock and protects the index
134 * chunk slots, chunks and area maps in chunks. 133 * data structures - chunk slots, chunks and area maps in chunks.
135 * 134 *
136 * During allocation, pcpu_alloc_mutex is kept locked all the time and 135 * During allocation, pcpu_alloc_mutex is kept locked all the time and
137 * pcpu_lock is grabbed and released as necessary. All actual memory 136 * pcpu_lock is grabbed and released as necessary. All actual memory
@@ -188,16 +187,13 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
188 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); 187 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
189} 188}
190 189
191static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, 190static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
192 unsigned int cpu, int page_idx) 191 unsigned int cpu, int page_idx)
193{ 192{
194 return &chunk->page[pcpu_page_idx(cpu, page_idx)]; 193 /* must not be used on pre-mapped chunk */
195} 194 WARN_ON(chunk->immutable);
196 195
197static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, 196 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
198 int page_idx)
199{
200 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
201} 197}
202 198
203/* set the pointer to a chunk in a page struct */ 199/* set the pointer to a chunk in a page struct */
@@ -212,6 +208,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
212 return (struct pcpu_chunk *)page->index; 208 return (struct pcpu_chunk *)page->index;
213} 209}
214 210
211static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
212{
213 *rs = find_next_zero_bit(chunk->populated, end, *rs);
214 *re = find_next_bit(chunk->populated, end, *rs + 1);
215}
216
217static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
218{
219 *rs = find_next_bit(chunk->populated, end, *rs);
220 *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
221}
222
223/*
224 * (Un)populated page region iterators. Iterate over (un)populated
225 * page regions between @start and @end in @chunk. @rs and @re should
226 * be integer variables and will be set to start and end page index of
227 * the current region.
228 */
229#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
230 for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
231 (rs) < (re); \
232 (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
233
234#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
235 for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
236 (rs) < (re); \
237 (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
238
215/** 239/**
216 * pcpu_mem_alloc - allocate memory 240 * pcpu_mem_alloc - allocate memory
217 * @size: bytes to allocate 241 * @size: bytes to allocate
@@ -545,42 +569,197 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
545} 569}
546 570
547/** 571/**
548 * pcpu_unmap - unmap pages out of a pcpu_chunk 572 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
573 * @chunk: chunk of interest
574 * @bitmapp: output parameter for bitmap
575 * @may_alloc: may allocate the array
576 *
577 * Returns pointer to array of pointers to struct page and bitmap,
578 * both of which can be indexed with pcpu_page_idx(). The returned
579 * array is cleared to zero and *@bitmapp is copied from
580 * @chunk->populated. Note that there is only one array and bitmap
581 * and access exclusion is the caller's responsibility.
582 *
583 * CONTEXT:
584 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
585 * Otherwise, don't care.
586 *
587 * RETURNS:
588 * Pointer to temp pages array on success, NULL on failure.
589 */
590static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
591 unsigned long **bitmapp,
592 bool may_alloc)
593{
594 static struct page **pages;
595 static unsigned long *bitmap;
596 size_t pages_size = num_possible_cpus() * pcpu_unit_pages *
597 sizeof(pages[0]);
598 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
599 sizeof(unsigned long);
600
601 if (!pages || !bitmap) {
602 if (may_alloc && !pages)
603 pages = pcpu_mem_alloc(pages_size);
604 if (may_alloc && !bitmap)
605 bitmap = pcpu_mem_alloc(bitmap_size);
606 if (!pages || !bitmap)
607 return NULL;
608 }
609
610 memset(pages, 0, pages_size);
611 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
612
613 *bitmapp = bitmap;
614 return pages;
615}
616
617/**
618 * pcpu_free_pages - free pages which were allocated for @chunk
619 * @chunk: chunk pages were allocated for
620 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
621 * @populated: populated bitmap
622 * @page_start: page index of the first page to be freed
623 * @page_end: page index of the last page to be freed + 1
624 *
625 * Free pages [@page_start and @page_end) in @pages for all units.
626 * The pages were allocated for @chunk.
627 */
628static void pcpu_free_pages(struct pcpu_chunk *chunk,
629 struct page **pages, unsigned long *populated,
630 int page_start, int page_end)
631{
632 unsigned int cpu;
633 int i;
634
635 for_each_possible_cpu(cpu) {
636 for (i = page_start; i < page_end; i++) {
637 struct page *page = pages[pcpu_page_idx(cpu, i)];
638
639 if (page)
640 __free_page(page);
641 }
642 }
643}
644
645/**
646 * pcpu_alloc_pages - allocates pages for @chunk
647 * @chunk: target chunk
648 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
649 * @populated: populated bitmap
650 * @page_start: page index of the first page to be allocated
651 * @page_end: page index of the last page to be allocated + 1
652 *
653 * Allocate pages [@page_start,@page_end) into @pages for all units.
654 * The allocation is for @chunk. Percpu core doesn't care about the
655 * content of @pages and will pass it verbatim to pcpu_map_pages().
656 */
657static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
658 struct page **pages, unsigned long *populated,
659 int page_start, int page_end)
660{
661 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
662 unsigned int cpu;
663 int i;
664
665 for_each_possible_cpu(cpu) {
666 for (i = page_start; i < page_end; i++) {
667 struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
668
669 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
670 if (!*pagep) {
671 pcpu_free_pages(chunk, pages, populated,
672 page_start, page_end);
673 return -ENOMEM;
674 }
675 }
676 }
677 return 0;
678}
679
680/**
681 * pcpu_pre_unmap_flush - flush cache prior to unmapping
682 * @chunk: chunk the regions to be flushed belongs to
683 * @page_start: page index of the first page to be flushed
684 * @page_end: page index of the last page to be flushed + 1
685 *
686 * Pages in [@page_start,@page_end) of @chunk are about to be
687 * unmapped. Flush cache. As each flushing trial can be very
688 * expensive, issue flush on the whole region at once rather than
689 * doing it for each cpu. This could be an overkill but is more
690 * scalable.
691 */
692static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
693 int page_start, int page_end)
694{
695 unsigned int last = num_possible_cpus() - 1;
696
697 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
698 pcpu_chunk_addr(chunk, last, page_end));
699}
700
701static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
702{
703 unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
704}
705
706/**
707 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
549 * @chunk: chunk of interest 708 * @chunk: chunk of interest
709 * @pages: pages array which can be used to pass information to free
710 * @populated: populated bitmap
550 * @page_start: page index of the first page to unmap 711 * @page_start: page index of the first page to unmap
551 * @page_end: page index of the last page to unmap + 1 712 * @page_end: page index of the last page to unmap + 1
552 * @flush_tlb: whether to flush tlb or not
553 * 713 *
554 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 714 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
555 * If @flush is true, vcache is flushed before unmapping and tlb 715 * Corresponding elements in @pages were cleared by the caller and can
556 * after. 716 * be used to carry information to pcpu_free_pages() which will be
717 * called after all unmaps are finished. The caller should call
718 * proper pre/post flush functions.
557 */ 719 */
558static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, 720static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
559 bool flush_tlb) 721 struct page **pages, unsigned long *populated,
722 int page_start, int page_end)
560{ 723{
561 unsigned int last = num_possible_cpus() - 1;
562 unsigned int cpu; 724 unsigned int cpu;
725 int i;
563 726
564 /* unmap must not be done on immutable chunk */ 727 for_each_possible_cpu(cpu) {
565 WARN_ON(chunk->immutable); 728 for (i = page_start; i < page_end; i++) {
729 struct page *page;
566 730
567 /* 731 page = pcpu_chunk_page(chunk, cpu, i);
568 * Each flushing trial can be very expensive, issue flush on 732 WARN_ON(!page);
569 * the whole region at once rather than doing it for each cpu. 733 pages[pcpu_page_idx(cpu, i)] = page;
570 * This could be an overkill but is more scalable. 734 }
571 */ 735 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
572 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), 736 page_end - page_start);
573 pcpu_chunk_addr(chunk, last, page_end)); 737 }
574 738
575 for_each_possible_cpu(cpu) 739 for (i = page_start; i < page_end; i++)
576 unmap_kernel_range_noflush( 740 __clear_bit(i, populated);
577 pcpu_chunk_addr(chunk, cpu, page_start), 741}
578 (page_end - page_start) << PAGE_SHIFT); 742
579 743/**
580 /* ditto as flush_cache_vunmap() */ 744 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
581 if (flush_tlb) 745 * @chunk: pcpu_chunk the regions to be flushed belong to
582 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), 746 * @page_start: page index of the first page to be flushed
583 pcpu_chunk_addr(chunk, last, page_end)); 747 * @page_end: page index of the last page to be flushed + 1
748 *
749 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
750 * TLB for the regions. This can be skipped if the area is to be
751 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
752 *
753 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
754 * for the whole region.
755 */
756static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
757 int page_start, int page_end)
758{
759 unsigned int last = num_possible_cpus() - 1;
760
761 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
762 pcpu_chunk_addr(chunk, last, page_end));
584} 763}
585 764
586static int __pcpu_map_pages(unsigned long addr, struct page **pages, 765static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -591,35 +770,76 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
591} 770}
592 771
593/** 772/**
594 * pcpu_map - map pages into a pcpu_chunk 773 * pcpu_map_pages - map pages into a pcpu_chunk
595 * @chunk: chunk of interest 774 * @chunk: chunk of interest
775 * @pages: pages array containing pages to be mapped
776 * @populated: populated bitmap
596 * @page_start: page index of the first page to map 777 * @page_start: page index of the first page to map
597 * @page_end: page index of the last page to map + 1 778 * @page_end: page index of the last page to map + 1
598 * 779 *
599 * For each cpu, map pages [@page_start,@page_end) into @chunk. 780 * For each cpu, map pages [@page_start,@page_end) into @chunk. The
600 * vcache is flushed afterwards. 781 * caller is responsible for calling pcpu_post_map_flush() after all
782 * mappings are complete.
783 *
784 * This function is responsible for setting corresponding bits in
785 * @chunk->populated bitmap and whatever is necessary for reverse
786 * lookup (addr -> chunk).
601 */ 787 */
602static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) 788static int pcpu_map_pages(struct pcpu_chunk *chunk,
789 struct page **pages, unsigned long *populated,
790 int page_start, int page_end)
603{ 791{
604 unsigned int last = num_possible_cpus() - 1; 792 unsigned int cpu, tcpu;
605 unsigned int cpu; 793 int i, err;
606 int err;
607
608 /* map must not be done on immutable chunk */
609 WARN_ON(chunk->immutable);
610 794
611 for_each_possible_cpu(cpu) { 795 for_each_possible_cpu(cpu) {
612 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), 796 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
613 pcpu_chunk_pagep(chunk, cpu, page_start), 797 &pages[pcpu_page_idx(cpu, page_start)],
614 page_end - page_start); 798 page_end - page_start);
615 if (err < 0) 799 if (err < 0)
616 return err; 800 goto err;
617 } 801 }
618 802
803 /* mapping successful, link chunk and mark populated */
804 for (i = page_start; i < page_end; i++) {
805 for_each_possible_cpu(cpu)
806 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
807 chunk);
808 __set_bit(i, populated);
809 }
810
811 return 0;
812
813err:
814 for_each_possible_cpu(tcpu) {
815 if (tcpu == cpu)
816 break;
817 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
818 page_end - page_start);
819 }
820 return err;
821}
822
823/**
824 * pcpu_post_map_flush - flush cache after mapping
825 * @chunk: pcpu_chunk the regions to be flushed belong to
826 * @page_start: page index of the first page to be flushed
827 * @page_end: page index of the last page to be flushed + 1
828 *
829 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
830 * cache.
831 *
832 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
833 * for the whole region.
834 */
835static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
836 int page_start, int page_end)
837{
838 unsigned int last = num_possible_cpus() - 1;
839
619 /* flush at once, please read comments in pcpu_unmap() */ 840 /* flush at once, please read comments in pcpu_unmap() */
620 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), 841 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
621 pcpu_chunk_addr(chunk, last, page_end)); 842 pcpu_chunk_addr(chunk, last, page_end));
622 return 0;
623} 843}
624 844
625/** 845/**
@@ -636,39 +856,45 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
636 * CONTEXT: 856 * CONTEXT:
637 * pcpu_alloc_mutex. 857 * pcpu_alloc_mutex.
638 */ 858 */
639static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, 859static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
640 bool flush)
641{ 860{
642 int page_start = PFN_DOWN(off); 861 int page_start = PFN_DOWN(off);
643 int page_end = PFN_UP(off + size); 862 int page_end = PFN_UP(off + size);
644 int unmap_start = -1; 863 struct page **pages;
645 int uninitialized_var(unmap_end); 864 unsigned long *populated;
646 unsigned int cpu; 865 int rs, re;
647 int i; 866
867 /* quick path, check whether it's empty already */
868 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
869 if (rs == page_start && re == page_end)
870 return;
871 break;
872 }
648 873
649 for (i = page_start; i < page_end; i++) { 874 /* immutable chunks can't be depopulated */
650 for_each_possible_cpu(cpu) { 875 WARN_ON(chunk->immutable);
651 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
652 876
653 if (!*pagep) 877 /*
654 continue; 878 * If control reaches here, there must have been at least one
879 * successful population attempt so the temp pages array must
880 * be available now.
881 */
882 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
883 BUG_ON(!pages);
655 884
656 __free_page(*pagep); 885 /* unmap and free */
886 pcpu_pre_unmap_flush(chunk, page_start, page_end);
657 887
658 /* 888 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
659 * If it's partial depopulation, it might get 889 pcpu_unmap_pages(chunk, pages, populated, rs, re);
660 * populated or depopulated again. Mark the
661 * page gone.
662 */
663 *pagep = NULL;
664 890
665 unmap_start = unmap_start < 0 ? i : unmap_start; 891 /* no need to flush tlb, vmalloc will handle it lazily */
666 unmap_end = i + 1; 892
667 } 893 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
668 } 894 pcpu_free_pages(chunk, pages, populated, rs, re);
669 895
670 if (unmap_start >= 0) 896 /* commit new bitmap */
671 pcpu_unmap(chunk, unmap_start, unmap_end, flush); 897 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
672} 898}
673 899
674/** 900/**
@@ -685,50 +911,61 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
685 */ 911 */
686static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 912static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
687{ 913{
688 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
689 int page_start = PFN_DOWN(off); 914 int page_start = PFN_DOWN(off);
690 int page_end = PFN_UP(off + size); 915 int page_end = PFN_UP(off + size);
691 int map_start = -1; 916 int free_end = page_start, unmap_end = page_start;
692 int uninitialized_var(map_end); 917 struct page **pages;
918 unsigned long *populated;
693 unsigned int cpu; 919 unsigned int cpu;
694 int i; 920 int rs, re, rc;
695 921
696 for (i = page_start; i < page_end; i++) { 922 /* quick path, check whether all pages are already there */
697 if (pcpu_chunk_page_occupied(chunk, i)) { 923 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
698 if (map_start >= 0) { 924 if (rs == page_start && re == page_end)
699 if (pcpu_map(chunk, map_start, map_end)) 925 goto clear;
700 goto err; 926 break;
701 map_start = -1; 927 }
702 }
703 continue;
704 }
705 928
706 map_start = map_start < 0 ? i : map_start; 929 /* need to allocate and map pages, this chunk can't be immutable */
707 map_end = i + 1; 930 WARN_ON(chunk->immutable);
708 931
709 for_each_possible_cpu(cpu) { 932 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
710 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 933 if (!pages)
934 return -ENOMEM;
711 935
712 *pagep = alloc_pages_node(cpu_to_node(cpu), 936 /* alloc and map */
713 alloc_mask, 0); 937 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
714 if (!*pagep) 938 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
715 goto err; 939 if (rc)
716 pcpu_set_page_chunk(*pagep, chunk); 940 goto err_free;
717 } 941 free_end = re;
718 } 942 }
719 943
720 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) 944 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
721 goto err; 945 rc = pcpu_map_pages(chunk, pages, populated, rs, re);
946 if (rc)
947 goto err_unmap;
948 unmap_end = re;
949 }
950 pcpu_post_map_flush(chunk, page_start, page_end);
722 951
952 /* commit new bitmap */
953 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
954clear:
723 for_each_possible_cpu(cpu) 955 for_each_possible_cpu(cpu)
724 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, 956 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
725 size); 957 size);
726
727 return 0; 958 return 0;
728err: 959
729 /* likely under heavy memory pressure, give memory back */ 960err_unmap:
730 pcpu_depopulate_chunk(chunk, off, size, true); 961 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
731 return -ENOMEM; 962 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
963 pcpu_unmap_pages(chunk, pages, populated, rs, re);
964 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
965err_free:
966 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
967 pcpu_free_pages(chunk, pages, populated, rs, re);
968 return rc;
732} 969}
733 970
734static void free_pcpu_chunk(struct pcpu_chunk *chunk) 971static void free_pcpu_chunk(struct pcpu_chunk *chunk)
@@ -752,7 +989,6 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
752 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); 989 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
753 chunk->map_alloc = PCPU_DFL_MAP_ALLOC; 990 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
754 chunk->map[chunk->map_used++] = pcpu_unit_size; 991 chunk->map[chunk->map_used++] = pcpu_unit_size;
755 chunk->page = chunk->page_ar;
756 992
757 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); 993 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
758 if (!chunk->vm) { 994 if (!chunk->vm) {
@@ -933,7 +1169,7 @@ static void pcpu_reclaim(struct work_struct *work)
933 mutex_unlock(&pcpu_alloc_mutex); 1169 mutex_unlock(&pcpu_alloc_mutex);
934 1170
935 list_for_each_entry_safe(chunk, next, &todo, list) { 1171 list_for_each_entry_safe(chunk, next, &todo, list) {
936 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); 1172 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
937 free_pcpu_chunk(chunk); 1173 free_pcpu_chunk(chunk);
938 } 1174 }
939} 1175}
@@ -981,7 +1217,6 @@ EXPORT_SYMBOL_GPL(free_percpu);
981 1217
982/** 1218/**
983 * pcpu_setup_first_chunk - initialize the first percpu chunk 1219 * pcpu_setup_first_chunk - initialize the first percpu chunk
984 * @get_page_fn: callback to fetch page pointer
985 * @static_size: the size of static percpu area in bytes 1220 * @static_size: the size of static percpu area in bytes
986 * @reserved_size: the size of reserved percpu area in bytes, 0 for none 1221 * @reserved_size: the size of reserved percpu area in bytes, 0 for none
987 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1222 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
@@ -992,14 +1227,6 @@ EXPORT_SYMBOL_GPL(free_percpu);
992 * perpcu area. This function is to be called from arch percpu area 1227 * perpcu area. This function is to be called from arch percpu area
993 * setup path. 1228 * setup path.
994 * 1229 *
995 * @get_page_fn() should return pointer to percpu page given cpu
996 * number and page number. It should at least return enough pages to
997 * cover the static area. The returned pages for static area should
998 * have been initialized with valid data. It can also return pages
999 * after the static area. NULL return indicates end of pages for the
1000 * cpu. Note that @get_page_fn() must return the same number of pages
1001 * for all cpus.
1002 *
1003 * @reserved_size, if non-zero, specifies the amount of bytes to 1230 * @reserved_size, if non-zero, specifies the amount of bytes to
1004 * reserve after the static area in the first chunk. This reserves 1231 * reserve after the static area in the first chunk. This reserves
1005 * the first chunk such that it's available only through reserved 1232 * the first chunk such that it's available only through reserved
@@ -1031,8 +1258,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
1031 * The determined pcpu_unit_size which can be used to initialize 1258 * The determined pcpu_unit_size which can be used to initialize
1032 * percpu access. 1259 * percpu access.
1033 */ 1260 */
1034size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, 1261size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
1035 size_t static_size, size_t reserved_size,
1036 ssize_t dyn_size, size_t unit_size, 1262 ssize_t dyn_size, size_t unit_size,
1037 void *base_addr) 1263 void *base_addr)
1038{ 1264{
@@ -1041,8 +1267,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1041 size_t size_sum = static_size + reserved_size + 1267 size_t size_sum = static_size + reserved_size +
1042 (dyn_size >= 0 ? dyn_size : 0); 1268 (dyn_size >= 0 ? dyn_size : 0);
1043 struct pcpu_chunk *schunk, *dchunk = NULL; 1269 struct pcpu_chunk *schunk, *dchunk = NULL;
1044 unsigned int cpu; 1270 int i;
1045 int i, nr_pages;
1046 1271
1047 /* santiy checks */ 1272 /* santiy checks */
1048 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || 1273 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
@@ -1056,8 +1281,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1056 pcpu_unit_pages = unit_size >> PAGE_SHIFT; 1281 pcpu_unit_pages = unit_size >> PAGE_SHIFT;
1057 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1282 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1058 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; 1283 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
1059 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) 1284 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1060 + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); 1285 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1061 1286
1062 if (dyn_size < 0) 1287 if (dyn_size < 0)
1063 dyn_size = pcpu_unit_size - static_size - reserved_size; 1288 dyn_size = pcpu_unit_size - static_size - reserved_size;
@@ -1087,8 +1312,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1087 schunk->vm = &first_vm; 1312 schunk->vm = &first_vm;
1088 schunk->map = smap; 1313 schunk->map = smap;
1089 schunk->map_alloc = ARRAY_SIZE(smap); 1314 schunk->map_alloc = ARRAY_SIZE(smap);
1090 schunk->page = schunk->page_ar;
1091 schunk->immutable = true; 1315 schunk->immutable = true;
1316 bitmap_fill(schunk->populated, pcpu_unit_pages);
1092 1317
1093 if (reserved_size) { 1318 if (reserved_size) {
1094 schunk->free_size = reserved_size; 1319 schunk->free_size = reserved_size;
@@ -1106,38 +1331,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1106 1331
1107 /* init dynamic chunk if necessary */ 1332 /* init dynamic chunk if necessary */
1108 if (dyn_size) { 1333 if (dyn_size) {
1109 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); 1334 dchunk = alloc_bootmem(pcpu_chunk_struct_size);
1110 INIT_LIST_HEAD(&dchunk->list); 1335 INIT_LIST_HEAD(&dchunk->list);
1111 dchunk->vm = &first_vm; 1336 dchunk->vm = &first_vm;
1112 dchunk->map = dmap; 1337 dchunk->map = dmap;
1113 dchunk->map_alloc = ARRAY_SIZE(dmap); 1338 dchunk->map_alloc = ARRAY_SIZE(dmap);
1114 dchunk->page = schunk->page_ar; /* share page map with schunk */
1115 dchunk->immutable = true; 1339 dchunk->immutable = true;
1340 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1116 1341
1117 dchunk->contig_hint = dchunk->free_size = dyn_size; 1342 dchunk->contig_hint = dchunk->free_size = dyn_size;
1118 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; 1343 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
1119 dchunk->map[dchunk->map_used++] = dchunk->free_size; 1344 dchunk->map[dchunk->map_used++] = dchunk->free_size;
1120 } 1345 }
1121 1346
1122 /* assign pages */
1123 nr_pages = -1;
1124 for_each_possible_cpu(cpu) {
1125 for (i = 0; i < pcpu_unit_pages; i++) {
1126 struct page *page = get_page_fn(cpu, i);
1127
1128 if (!page)
1129 break;
1130 *pcpu_chunk_pagep(schunk, cpu, i) = page;
1131 }
1132
1133 BUG_ON(i < PFN_UP(static_size));
1134
1135 if (nr_pages < 0)
1136 nr_pages = i;
1137 else
1138 BUG_ON(nr_pages != i);
1139 }
1140
1141 /* link the first chunk in */ 1347 /* link the first chunk in */
1142 pcpu_first_chunk = dchunk ?: schunk; 1348 pcpu_first_chunk = dchunk ?: schunk;
1143 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1349 pcpu_chunk_relocate(pcpu_first_chunk, -1);
@@ -1160,23 +1366,6 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
1160 return size_sum; 1366 return size_sum;
1161} 1367}
1162 1368
1163/*
1164 * Embedding first chunk setup helper.
1165 */
1166static void *pcpue_ptr __initdata;
1167static size_t pcpue_size __initdata;
1168static size_t pcpue_unit_size __initdata;
1169
1170static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1171{
1172 size_t off = (size_t)pageno << PAGE_SHIFT;
1173
1174 if (off >= pcpue_size)
1175 return NULL;
1176
1177 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
1178}
1179
1180/** 1369/**
1181 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1370 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1182 * @static_size: the size of static percpu area in bytes 1371 * @static_size: the size of static percpu area in bytes
@@ -1207,18 +1396,19 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1207ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, 1396ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1208 ssize_t dyn_size) 1397 ssize_t dyn_size)
1209{ 1398{
1210 size_t chunk_size; 1399 size_t size_sum, unit_size, chunk_size;
1400 void *base;
1211 unsigned int cpu; 1401 unsigned int cpu;
1212 1402
1213 /* determine parameters and allocate */ 1403 /* determine parameters and allocate */
1214 pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); 1404 size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
1215 1405
1216 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); 1406 unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1217 chunk_size = pcpue_unit_size * num_possible_cpus(); 1407 chunk_size = unit_size * num_possible_cpus();
1218 1408
1219 pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, 1409 base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
1220 __pa(MAX_DMA_ADDRESS)); 1410 __pa(MAX_DMA_ADDRESS));
1221 if (!pcpue_ptr) { 1411 if (!base) {
1222 pr_warning("PERCPU: failed to allocate %zu bytes for " 1412 pr_warning("PERCPU: failed to allocate %zu bytes for "
1223 "embedding\n", chunk_size); 1413 "embedding\n", chunk_size);
1224 return -ENOMEM; 1414 return -ENOMEM;
@@ -1226,33 +1416,18 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1226 1416
1227 /* return the leftover and copy */ 1417 /* return the leftover and copy */
1228 for_each_possible_cpu(cpu) { 1418 for_each_possible_cpu(cpu) {
1229 void *ptr = pcpue_ptr + cpu * pcpue_unit_size; 1419 void *ptr = base + cpu * unit_size;
1230 1420
1231 free_bootmem(__pa(ptr + pcpue_size), 1421 free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
1232 pcpue_unit_size - pcpue_size);
1233 memcpy(ptr, __per_cpu_load, static_size); 1422 memcpy(ptr, __per_cpu_load, static_size);
1234 } 1423 }
1235 1424
1236 /* we're ready, commit */ 1425 /* we're ready, commit */
1237 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", 1426 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
1238 pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); 1427 size_sum >> PAGE_SHIFT, base, static_size);
1239 1428
1240 return pcpu_setup_first_chunk(pcpue_get_page, static_size, 1429 return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
1241 reserved_size, dyn_size, 1430 unit_size, base);
1242 pcpue_unit_size, pcpue_ptr);
1243}
1244
1245/*
1246 * 4k page first chunk setup helper.
1247 */
1248static struct page **pcpu4k_pages __initdata;
1249static int pcpu4k_unit_pages __initdata;
1250
1251static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
1252{
1253 if (pageno < pcpu4k_unit_pages)
1254 return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno];
1255 return NULL;
1256} 1431}
1257 1432
1258/** 1433/**
@@ -1279,23 +1454,25 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
1279 pcpu_fc_populate_pte_fn_t populate_pte_fn) 1454 pcpu_fc_populate_pte_fn_t populate_pte_fn)
1280{ 1455{
1281 static struct vm_struct vm; 1456 static struct vm_struct vm;
1457 int unit_pages;
1282 size_t pages_size; 1458 size_t pages_size;
1459 struct page **pages;
1283 unsigned int cpu; 1460 unsigned int cpu;
1284 int i, j; 1461 int i, j;
1285 ssize_t ret; 1462 ssize_t ret;
1286 1463
1287 pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, 1464 unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
1288 PCPU_MIN_UNIT_SIZE)); 1465 PCPU_MIN_UNIT_SIZE));
1289 1466
1290 /* unaligned allocations can't be freed, round up to page size */ 1467 /* unaligned allocations can't be freed, round up to page size */
1291 pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() * 1468 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
1292 sizeof(pcpu4k_pages[0])); 1469 sizeof(pages[0]));
1293 pcpu4k_pages = alloc_bootmem(pages_size); 1470 pages = alloc_bootmem(pages_size);
1294 1471
1295 /* allocate pages */ 1472 /* allocate pages */
1296 j = 0; 1473 j = 0;
1297 for_each_possible_cpu(cpu) 1474 for_each_possible_cpu(cpu)
1298 for (i = 0; i < pcpu4k_unit_pages; i++) { 1475 for (i = 0; i < unit_pages; i++) {
1299 void *ptr; 1476 void *ptr;
1300 1477
1301 ptr = alloc_fn(cpu, PAGE_SIZE); 1478 ptr = alloc_fn(cpu, PAGE_SIZE);
@@ -1304,25 +1481,24 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
1304 "4k page for cpu%u\n", cpu); 1481 "4k page for cpu%u\n", cpu);
1305 goto enomem; 1482 goto enomem;
1306 } 1483 }
1307 pcpu4k_pages[j++] = virt_to_page(ptr); 1484 pages[j++] = virt_to_page(ptr);
1308 } 1485 }
1309 1486
1310 /* allocate vm area, map the pages and copy static data */ 1487 /* allocate vm area, map the pages and copy static data */
1311 vm.flags = VM_ALLOC; 1488 vm.flags = VM_ALLOC;
1312 vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT; 1489 vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
1313 vm_area_register_early(&vm, PAGE_SIZE); 1490 vm_area_register_early(&vm, PAGE_SIZE);
1314 1491
1315 for_each_possible_cpu(cpu) { 1492 for_each_possible_cpu(cpu) {
1316 unsigned long unit_addr = (unsigned long)vm.addr + 1493 unsigned long unit_addr = (unsigned long)vm.addr +
1317 (cpu * pcpu4k_unit_pages << PAGE_SHIFT); 1494 (cpu * unit_pages << PAGE_SHIFT);
1318 1495
1319 for (i = 0; i < pcpu4k_unit_pages; i++) 1496 for (i = 0; i < unit_pages; i++)
1320 populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); 1497 populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
1321 1498
1322 /* pte already populated, the following shouldn't fail */ 1499 /* pte already populated, the following shouldn't fail */
1323 ret = __pcpu_map_pages(unit_addr, 1500 ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
1324 &pcpu4k_pages[cpu * pcpu4k_unit_pages], 1501 unit_pages);
1325 pcpu4k_unit_pages);
1326 if (ret < 0) 1502 if (ret < 0)
1327 panic("failed to map percpu area, err=%zd\n", ret); 1503 panic("failed to map percpu area, err=%zd\n", ret);
1328 1504
@@ -1340,19 +1516,18 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
1340 1516
1341 /* we're ready, commit */ 1517 /* we're ready, commit */
1342 pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n", 1518 pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
1343 pcpu4k_unit_pages, static_size); 1519 unit_pages, static_size);
1344 1520
1345 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 1521 ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
1346 reserved_size, -1, 1522 unit_pages << PAGE_SHIFT, vm.addr);
1347 pcpu4k_unit_pages << PAGE_SHIFT, vm.addr);
1348 goto out_free_ar; 1523 goto out_free_ar;
1349 1524
1350enomem: 1525enomem:
1351 while (--j >= 0) 1526 while (--j >= 0)
1352 free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); 1527 free_fn(page_address(pages[j]), PAGE_SIZE);
1353 ret = -ENOMEM; 1528 ret = -ENOMEM;
1354out_free_ar: 1529out_free_ar:
1355 free_bootmem(__pa(pcpu4k_pages), pages_size); 1530 free_bootmem(__pa(pages), pages_size);
1356 return ret; 1531 return ret;
1357} 1532}
1358 1533
@@ -1370,16 +1545,6 @@ static size_t pcpul_unit_size;
1370static struct pcpul_ent *pcpul_map; 1545static struct pcpul_ent *pcpul_map;
1371static struct vm_struct pcpul_vm; 1546static struct vm_struct pcpul_vm;
1372 1547
1373static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
1374{
1375 size_t off = (size_t)pageno << PAGE_SHIFT;
1376
1377 if (off >= pcpul_size)
1378 return NULL;
1379
1380 return virt_to_page(pcpul_map[cpu].ptr + off);
1381}
1382
1383/** 1548/**
1384 * pcpu_lpage_first_chunk - remap the first percpu chunk using large page 1549 * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
1385 * @static_size: the size of static percpu area in bytes 1550 * @static_size: the size of static percpu area in bytes
@@ -1475,9 +1640,8 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
1475 pr_info("PERCPU: Remapped at %p with large pages, static data " 1640 pr_info("PERCPU: Remapped at %p with large pages, static data "
1476 "%zu bytes\n", pcpul_vm.addr, static_size); 1641 "%zu bytes\n", pcpul_vm.addr, static_size);
1477 1642
1478 ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, 1643 ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
1479 reserved_size, dyn_size, pcpul_unit_size, 1644 pcpul_unit_size, pcpul_vm.addr);
1480 pcpul_vm.addr);
1481 1645
1482 /* sort pcpul_map array for pcpu_lpage_remapped() */ 1646 /* sort pcpul_map array for pcpu_lpage_remapped() */
1483 for (i = 0; i < num_possible_cpus() - 1; i++) 1647 for (i = 0; i < num_possible_cpus() - 1; i++)
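
[Editor's note] To summarize the control flow of the rewritten populate path above in one place: pages are allocated for every unpopulated region first, then mapped region by region, the cache is flushed once, and only then is the populated bitmap committed; on a mapping failure the already-mapped regions are unmapped and everything allocated is freed. The sketch below illustrates that ordering with made-up stubs (alloc_region(), map_region(), unmap_region(), free_region(), and a fixed 4-page region size) standing in for pcpu_alloc_pages(), pcpu_map_pages() and friends; it is not kernel code.

/*
 * Userspace sketch (assumed helpers, not the kernel API) of the populate
 * ordering introduced by this patch: alloc everything, then map
 * everything, flush once, then commit; unwind in reverse on failure.
 */
#include <stdio.h>

static int alloc_region(int rs, int re)  { printf("alloc  [%d,%d)\n", rs, re); return 0; }
static int map_region(int rs, int re)    { printf("map    [%d,%d)\n", rs, re); return re > 8 ? -1 : 0; }
static void unmap_region(int rs, int re) { printf("unmap  [%d,%d)\n", rs, re); }
static void free_region(int rs, int re)  { printf("free   [%d,%d)\n", rs, re); }

#define REGION 4        /* pretend every unpopulated region is 4 pages */

static int populate(int start, int end)
{
        int free_end = start, unmap_end = start;
        int rs, re, rc = 0;

        /* step 1: allocate backing pages for each unpopulated region */
        for (rs = start; rs < end; rs = re) {
                re = rs + REGION < end ? rs + REGION : end;
                rc = alloc_region(rs, re);
                if (rc)
                        goto err_free;
                free_end = re;
        }

        /* step 2: map them; the bitmap is committed only if all maps succeed */
        for (rs = start; rs < end; rs = re) {
                re = rs + REGION < end ? rs + REGION : end;
                rc = map_region(rs, re);
                if (rc)
                        goto err_unmap;
                unmap_end = re;
        }
        printf("flush caches, commit populated bitmap\n");
        return 0;

err_unmap:
        /* unmap only the regions that were actually mapped */
        for (rs = start; rs < unmap_end; rs = re) {
                re = rs + REGION < unmap_end ? rs + REGION : unmap_end;
                unmap_region(rs, re);
        }
err_free:
        /* free every region that was allocated, mapped or not */
        for (rs = start; rs < free_end; rs = re) {
                re = rs + REGION < free_end ? rs + REGION : free_end;
                free_region(rs, re);
        }
        return rc;
}

int main(void)
{
        return populate(0, 12) ? 1 : 0;
}

Keeping allocation, mapping and bitmap commit in separate passes is what guarantees all units stay symmetric even when a step fails midway.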