Diffstat (limited to 'mm/percpu.c')
 mm/percpu.c | 604
 1 file changed, 384 insertions(+), 220 deletions(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index 639fce4d2caf..21756814d99f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -94,8 +94,7 @@ struct pcpu_chunk {
94 int map_alloc; /* # of map entries allocated */ 94 int map_alloc; /* # of map entries allocated */
95 int *map; /* allocation map */ 95 int *map; /* allocation map */
96 bool immutable; /* no [de]population allowed */ 96 bool immutable; /* no [de]population allowed */
97 struct page **page; /* points to page array */ 97 unsigned long populated[]; /* populated bitmap */
98 struct page *page_ar[]; /* #cpus * UNIT_PAGES */
99}; 98};
100 99
101static int pcpu_unit_pages __read_mostly; 100static int pcpu_unit_pages __read_mostly;
@@ -129,9 +128,9 @@ static int pcpu_reserved_chunk_limit;
129 * Synchronization rules. 128 * Synchronization rules.
130 * 129 *
131 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 130 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
132 * protects allocation/reclaim paths, chunks and chunk->page arrays. 131 * protects allocation/reclaim paths, chunks, populated bitmap and
133 * The latter is a spinlock and protects the index data structures - 132 * vmalloc mapping. The latter is a spinlock and protects the index
134 * chunk slots, chunks and area maps in chunks. 133 * data structures - chunk slots, chunks and area maps in chunks.
135 * 134 *
136 * During allocation, pcpu_alloc_mutex is kept locked all the time and 135 * During allocation, pcpu_alloc_mutex is kept locked all the time and
137 * pcpu_lock is grabbed and released as necessary. All actual memory 136 * pcpu_lock is grabbed and released as necessary. All actual memory
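A minimal sketch of the locking pattern the comment above describes (not part of the patch; pcpu_alloc_mutex and pcpu_lock are the existing locks named in the comment):

	/*
	 * Sketch only: the mutex is held across the whole allocation,
	 * the spinlock only around index updates (chunk slots, area maps).
	 */
	mutex_lock(&pcpu_alloc_mutex);

	spin_lock_irq(&pcpu_lock);
	/* ... search/update area map and chunk slots ... */
	spin_unlock_irq(&pcpu_lock);

	/* ... GFP_KERNEL page allocation, vmalloc mapping, populated bitmap ... */

	mutex_unlock(&pcpu_alloc_mutex);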
@@ -188,16 +187,13 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
188 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); 187 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
189} 188}
190 189
191static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, 190static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
192 unsigned int cpu, int page_idx) 191 unsigned int cpu, int page_idx)
193{ 192{
194 return &chunk->page[pcpu_page_idx(cpu, page_idx)]; 193 /* must not be used on pre-mapped chunk */
195} 194 WARN_ON(chunk->immutable);
196 195
197static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, 196 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
198 int page_idx)
199{
200 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
201} 197}
202 198
203/* set the pointer to a chunk in a page struct */ 199/* set the pointer to a chunk in a page struct */
@@ -212,6 +208,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
212 return (struct pcpu_chunk *)page->index; 208 return (struct pcpu_chunk *)page->index;
213} 209}
214 210
211static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
212{
213 *rs = find_next_zero_bit(chunk->populated, end, *rs);
214 *re = find_next_bit(chunk->populated, end, *rs + 1);
215}
216
217static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
218{
219 *rs = find_next_bit(chunk->populated, end, *rs);
220 *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
221}
222
223/*
224 * (Un)populated page region iterators. Iterate over (un)populated
 225 * page regions between @start and @end in @chunk. @rs and @re should
226 * be integer variables and will be set to start and end page index of
227 * the current region.
228 */
229#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
230 for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
231 (rs) < (re); \
232 (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
233
234#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
235 for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
236 (rs) < (re); \
237 (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
238
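For reference, a minimal usage sketch of the iterators defined above (assuming chunk, page_start and page_end are supplied by the caller, as in the populate/depopulate paths further down):

	int rs, re, nr_unpop = 0;

	/* walk every not-yet-populated page run in [page_start, page_end) */
	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end)
		nr_unpop += re - rs;	/* pages [rs, re) of each unit lack backing pages */

	/* walk every populated page run in the same range */
	pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
		pr_debug("populated run: [%d, %d)\n", rs, re);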
215/** 239/**
216 * pcpu_mem_alloc - allocate memory 240 * pcpu_mem_alloc - allocate memory
217 * @size: bytes to allocate 241 * @size: bytes to allocate
@@ -545,42 +569,197 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
545} 569}
546 570
547/** 571/**
548 * pcpu_unmap - unmap pages out of a pcpu_chunk 572 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
573 * @chunk: chunk of interest
574 * @bitmapp: output parameter for bitmap
575 * @may_alloc: may allocate the array
576 *
577 * Returns pointer to array of pointers to struct page and bitmap,
578 * both of which can be indexed with pcpu_page_idx(). The returned
579 * array is cleared to zero and *@bitmapp is copied from
580 * @chunk->populated. Note that there is only one array and bitmap
581 * and access exclusion is the caller's responsibility.
582 *
583 * CONTEXT:
584 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
585 * Otherwise, don't care.
586 *
587 * RETURNS:
588 * Pointer to temp pages array on success, NULL on failure.
589 */
590static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
591 unsigned long **bitmapp,
592 bool may_alloc)
593{
594 static struct page **pages;
595 static unsigned long *bitmap;
596 size_t pages_size = num_possible_cpus() * pcpu_unit_pages *
597 sizeof(pages[0]);
598 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
599 sizeof(unsigned long);
600
601 if (!pages || !bitmap) {
602 if (may_alloc && !pages)
603 pages = pcpu_mem_alloc(pages_size);
604 if (may_alloc && !bitmap)
605 bitmap = pcpu_mem_alloc(bitmap_size);
606 if (!pages || !bitmap)
607 return NULL;
608 }
609
610 memset(pages, 0, pages_size);
611 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
612
613 *bitmapp = bitmap;
614 return pages;
615}
616
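The caller pattern for the helper above, as used by the populate and depopulate paths below, boils down to the following sketch (error handling elided; all identifiers are those introduced by this patch):

	struct page **pages;
	unsigned long *populated;

	pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
	if (!pages)
		return -ENOMEM;

	/* fill @pages / update @populated via pcpu_alloc_pages() and
	 * pcpu_map_pages(), or pcpu_unmap_pages() and pcpu_free_pages() */

	/* on success, commit the updated bitmap back into the chunk */
	bitmap_copy(chunk->populated, populated, pcpu_unit_pages);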
617/**
618 * pcpu_free_pages - free pages which were allocated for @chunk
619 * @chunk: chunk pages were allocated for
620 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
621 * @populated: populated bitmap
622 * @page_start: page index of the first page to be freed
623 * @page_end: page index of the last page to be freed + 1
624 *
 625 * Free pages [@page_start, @page_end) in @pages for all units.
626 * The pages were allocated for @chunk.
627 */
628static void pcpu_free_pages(struct pcpu_chunk *chunk,
629 struct page **pages, unsigned long *populated,
630 int page_start, int page_end)
631{
632 unsigned int cpu;
633 int i;
634
635 for_each_possible_cpu(cpu) {
636 for (i = page_start; i < page_end; i++) {
637 struct page *page = pages[pcpu_page_idx(cpu, i)];
638
639 if (page)
640 __free_page(page);
641 }
642 }
643}
644
645/**
646 * pcpu_alloc_pages - allocates pages for @chunk
647 * @chunk: target chunk
648 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
649 * @populated: populated bitmap
650 * @page_start: page index of the first page to be allocated
651 * @page_end: page index of the last page to be allocated + 1
652 *
653 * Allocate pages [@page_start,@page_end) into @pages for all units.
654 * The allocation is for @chunk. Percpu core doesn't care about the
655 * content of @pages and will pass it verbatim to pcpu_map_pages().
656 */
657static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
658 struct page **pages, unsigned long *populated,
659 int page_start, int page_end)
660{
661 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
662 unsigned int cpu;
663 int i;
664
665 for_each_possible_cpu(cpu) {
666 for (i = page_start; i < page_end; i++) {
667 struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
668
669 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
670 if (!*pagep) {
671 pcpu_free_pages(chunk, pages, populated,
672 page_start, page_end);
673 return -ENOMEM;
674 }
675 }
676 }
677 return 0;
678}
679
680/**
681 * pcpu_pre_unmap_flush - flush cache prior to unmapping
 682 * @chunk: chunk the regions to be flushed belong to
683 * @page_start: page index of the first page to be flushed
684 * @page_end: page index of the last page to be flushed + 1
685 *
686 * Pages in [@page_start,@page_end) of @chunk are about to be
687 * unmapped. Flush cache. As each flushing trial can be very
688 * expensive, issue flush on the whole region at once rather than
689 * doing it for each cpu. This could be an overkill but is more
690 * scalable.
691 */
692static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
693 int page_start, int page_end)
694{
695 unsigned int last = num_possible_cpus() - 1;
696
697 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
698 pcpu_chunk_addr(chunk, last, page_end));
699}
700
701static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
702{
703 unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
704}
705
706/**
707 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
549 * @chunk: chunk of interest 708 * @chunk: chunk of interest
709 * @pages: pages array which can be used to pass information to free
710 * @populated: populated bitmap
550 * @page_start: page index of the first page to unmap 711 * @page_start: page index of the first page to unmap
551 * @page_end: page index of the last page to unmap + 1 712 * @page_end: page index of the last page to unmap + 1
552 * @flush_tlb: whether to flush tlb or not
553 * 713 *
554 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 714 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
555 * If @flush is true, vcache is flushed before unmapping and tlb 715 * Corresponding elements in @pages were cleared by the caller and can
556 * after. 716 * be used to carry information to pcpu_free_pages() which will be
717 * called after all unmaps are finished. The caller should call
718 * proper pre/post flush functions.
557 */ 719 */
558static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, 720static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
559 bool flush_tlb) 721 struct page **pages, unsigned long *populated,
722 int page_start, int page_end)
560{ 723{
561 unsigned int last = num_possible_cpus() - 1;
562 unsigned int cpu; 724 unsigned int cpu;
725 int i;
563 726
564 /* unmap must not be done on immutable chunk */ 727 for_each_possible_cpu(cpu) {
565 WARN_ON(chunk->immutable); 728 for (i = page_start; i < page_end; i++) {
729 struct page *page;
566 730
567 /* 731 page = pcpu_chunk_page(chunk, cpu, i);
568 * Each flushing trial can be very expensive, issue flush on 732 WARN_ON(!page);
569 * the whole region at once rather than doing it for each cpu. 733 pages[pcpu_page_idx(cpu, i)] = page;
570 * This could be an overkill but is more scalable. 734 }
571 */ 735 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
572 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), 736 page_end - page_start);
573 pcpu_chunk_addr(chunk, last, page_end)); 737 }
574 738
575 for_each_possible_cpu(cpu) 739 for (i = page_start; i < page_end; i++)
576 unmap_kernel_range_noflush( 740 __clear_bit(i, populated);
577 pcpu_chunk_addr(chunk, cpu, page_start), 741}
578 (page_end - page_start) << PAGE_SHIFT); 742
579 743/**
580 /* ditto as flush_cache_vunmap() */ 744 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
581 if (flush_tlb) 745 * @chunk: pcpu_chunk the regions to be flushed belong to
582 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), 746 * @page_start: page index of the first page to be flushed
583 pcpu_chunk_addr(chunk, last, page_end)); 747 * @page_end: page index of the last page to be flushed + 1
748 *
749 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
750 * TLB for the regions. This can be skipped if the area is to be
751 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
752 *
753 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
754 * for the whole region.
755 */
756static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
757 int page_start, int page_end)
758{
759 unsigned int last = num_possible_cpus() - 1;
760
761 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
762 pcpu_chunk_addr(chunk, last, page_end));
584} 763}
585 764
586static int __pcpu_map_pages(unsigned long addr, struct page **pages, 765static int __pcpu_map_pages(unsigned long addr, struct page **pages,
@@ -591,35 +770,76 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
591} 770}
592 771
593/** 772/**
594 * pcpu_map - map pages into a pcpu_chunk 773 * pcpu_map_pages - map pages into a pcpu_chunk
595 * @chunk: chunk of interest 774 * @chunk: chunk of interest
775 * @pages: pages array containing pages to be mapped
776 * @populated: populated bitmap
596 * @page_start: page index of the first page to map 777 * @page_start: page index of the first page to map
597 * @page_end: page index of the last page to map + 1 778 * @page_end: page index of the last page to map + 1
598 * 779 *
599 * For each cpu, map pages [@page_start,@page_end) into @chunk. 780 * For each cpu, map pages [@page_start,@page_end) into @chunk. The
600 * vcache is flushed afterwards. 781 * caller is responsible for calling pcpu_post_map_flush() after all
782 * mappings are complete.
783 *
784 * This function is responsible for setting corresponding bits in
785 * @chunk->populated bitmap and whatever is necessary for reverse
786 * lookup (addr -> chunk).
601 */ 787 */
602static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) 788static int pcpu_map_pages(struct pcpu_chunk *chunk,
789 struct page **pages, unsigned long *populated,
790 int page_start, int page_end)
603{ 791{
604 unsigned int last = num_possible_cpus() - 1; 792 unsigned int cpu, tcpu;
605 unsigned int cpu; 793 int i, err;
606 int err;
607
608 /* map must not be done on immutable chunk */
609 WARN_ON(chunk->immutable);
610 794
611 for_each_possible_cpu(cpu) { 795 for_each_possible_cpu(cpu) {
612 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), 796 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
613 pcpu_chunk_pagep(chunk, cpu, page_start), 797 &pages[pcpu_page_idx(cpu, page_start)],
614 page_end - page_start); 798 page_end - page_start);
615 if (err < 0) 799 if (err < 0)
616 return err; 800 goto err;
617 } 801 }
618 802
803 /* mapping successful, link chunk and mark populated */
804 for (i = page_start; i < page_end; i++) {
805 for_each_possible_cpu(cpu)
806 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
807 chunk);
808 __set_bit(i, populated);
809 }
810
811 return 0;
812
813err:
814 for_each_possible_cpu(tcpu) {
815 if (tcpu == cpu)
816 break;
817 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
818 page_end - page_start);
819 }
820 return err;
821}
822
823/**
824 * pcpu_post_map_flush - flush cache after mapping
825 * @chunk: pcpu_chunk the regions to be flushed belong to
826 * @page_start: page index of the first page to be flushed
827 * @page_end: page index of the last page to be flushed + 1
828 *
829 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
830 * cache.
831 *
832 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
833 * for the whole region.
834 */
835static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
836 int page_start, int page_end)
837{
838 unsigned int last = num_possible_cpus() - 1;
839
619 /* flush at once, please read comments in pcpu_unmap() */ 840 /* flush at once, please read comments in pcpu_unmap() */
620 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), 841 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
621 pcpu_chunk_addr(chunk, last, page_end)); 842 pcpu_chunk_addr(chunk, last, page_end));
622 return 0;
623} 843}
624 844
625/** 845/**
@@ -636,39 +856,45 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
636 * CONTEXT: 856 * CONTEXT:
637 * pcpu_alloc_mutex. 857 * pcpu_alloc_mutex.
638 */ 858 */
639static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, 859static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
640 bool flush)
641{ 860{
642 int page_start = PFN_DOWN(off); 861 int page_start = PFN_DOWN(off);
643 int page_end = PFN_UP(off + size); 862 int page_end = PFN_UP(off + size);
644 int unmap_start = -1; 863 struct page **pages;
645 int uninitialized_var(unmap_end); 864 unsigned long *populated;
646 unsigned int cpu; 865 int rs, re;
647 int i; 866
867 /* quick path, check whether it's empty already */
868 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
869 if (rs == page_start && re == page_end)
870 return;
871 break;
872 }
648 873
649 for (i = page_start; i < page_end; i++) { 874 /* immutable chunks can't be depopulated */
650 for_each_possible_cpu(cpu) { 875 WARN_ON(chunk->immutable);
651 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
652 876
653 if (!*pagep) 877 /*
654 continue; 878 * If control reaches here, there must have been at least one
879 * successful population attempt so the temp pages array must
880 * be available now.
881 */
882 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
883 BUG_ON(!pages);
655 884
656 __free_page(*pagep); 885 /* unmap and free */
886 pcpu_pre_unmap_flush(chunk, page_start, page_end);
657 887
658 /* 888 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
659 * If it's partial depopulation, it might get 889 pcpu_unmap_pages(chunk, pages, populated, rs, re);
660 * populated or depopulated again. Mark the
661 * page gone.
662 */
663 *pagep = NULL;
664 890
665 unmap_start = unmap_start < 0 ? i : unmap_start; 891 /* no need to flush tlb, vmalloc will handle it lazily */
666 unmap_end = i + 1; 892
667 } 893 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
668 } 894 pcpu_free_pages(chunk, pages, populated, rs, re);
669 895
670 if (unmap_start >= 0) 896 /* commit new bitmap */
671 pcpu_unmap(chunk, unmap_start, unmap_end, flush); 897 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
672} 898}
673 899
674/** 900/**
@@ -685,50 +911,61 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
685 */ 911 */
686static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 912static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
687{ 913{
688 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
689 int page_start = PFN_DOWN(off); 914 int page_start = PFN_DOWN(off);
690 int page_end = PFN_UP(off + size); 915 int page_end = PFN_UP(off + size);
691 int map_start = -1; 916 int free_end = page_start, unmap_end = page_start;
692 int uninitialized_var(map_end); 917 struct page **pages;
918 unsigned long *populated;
693 unsigned int cpu; 919 unsigned int cpu;
694 int i; 920 int rs, re, rc;
695 921
696 for (i = page_start; i < page_end; i++) { 922 /* quick path, check whether all pages are already there */
697 if (pcpu_chunk_page_occupied(chunk, i)) { 923 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
698 if (map_start >= 0) { 924 if (rs == page_start && re == page_end)
699 if (pcpu_map(chunk, map_start, map_end)) 925 goto clear;
700 goto err; 926 break;
701 map_start = -1; 927 }
702 }
703 continue;
704 }
705 928
706 map_start = map_start < 0 ? i : map_start; 929 /* need to allocate and map pages, this chunk can't be immutable */
707 map_end = i + 1; 930 WARN_ON(chunk->immutable);
708 931
709 for_each_possible_cpu(cpu) { 932 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
710 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 933 if (!pages)
934 return -ENOMEM;
711 935
712 *pagep = alloc_pages_node(cpu_to_node(cpu), 936 /* alloc and map */
713 alloc_mask, 0); 937 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
714 if (!*pagep) 938 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
715 goto err; 939 if (rc)
716 pcpu_set_page_chunk(*pagep, chunk); 940 goto err_free;
717 } 941 free_end = re;
718 } 942 }
719 943
720 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) 944 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
721 goto err; 945 rc = pcpu_map_pages(chunk, pages, populated, rs, re);
946 if (rc)
947 goto err_unmap;
948 unmap_end = re;
949 }
950 pcpu_post_map_flush(chunk, page_start, page_end);
722 951
952 /* commit new bitmap */
953 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
954clear:
723 for_each_possible_cpu(cpu) 955 for_each_possible_cpu(cpu)
724 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, 956 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
725 size); 957 size);
726
727 return 0; 958 return 0;
728err: 959
729 /* likely under heavy memory pressure, give memory back */ 960err_unmap:
730 pcpu_depopulate_chunk(chunk, off, size, true); 961 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
731 return -ENOMEM; 962 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
963 pcpu_unmap_pages(chunk, pages, populated, rs, re);
964 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
965err_free:
966 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
967 pcpu_free_pages(chunk, pages, populated, rs, re);
968 return rc;
732} 969}
733 970
734static void free_pcpu_chunk(struct pcpu_chunk *chunk) 971static void free_pcpu_chunk(struct pcpu_chunk *chunk)
@@ -752,7 +989,6 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
752 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); 989 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
753 chunk->map_alloc = PCPU_DFL_MAP_ALLOC; 990 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
754 chunk->map[chunk->map_used++] = pcpu_unit_size; 991 chunk->map[chunk->map_used++] = pcpu_unit_size;
755 chunk->page = chunk->page_ar;
756 992
757 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); 993 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
758 if (!chunk->vm) { 994 if (!chunk->vm) {
@@ -933,7 +1169,7 @@ static void pcpu_reclaim(struct work_struct *work)
933 mutex_unlock(&pcpu_alloc_mutex); 1169 mutex_unlock(&pcpu_alloc_mutex);
934 1170
935 list_for_each_entry_safe(chunk, next, &todo, list) { 1171 list_for_each_entry_safe(chunk, next, &todo, list) {
936 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); 1172 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
937 free_pcpu_chunk(chunk); 1173 free_pcpu_chunk(chunk);
938 } 1174 }
939} 1175}
@@ -981,7 +1217,6 @@ EXPORT_SYMBOL_GPL(free_percpu);
981 1217
982/** 1218/**
983 * pcpu_setup_first_chunk - initialize the first percpu chunk 1219 * pcpu_setup_first_chunk - initialize the first percpu chunk
984 * @get_page_fn: callback to fetch page pointer
985 * @static_size: the size of static percpu area in bytes 1220 * @static_size: the size of static percpu area in bytes
986 * @reserved_size: the size of reserved percpu area in bytes, 0 for none 1221 * @reserved_size: the size of reserved percpu area in bytes, 0 for none
987 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1222 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
@@ -992,14 +1227,6 @@ EXPORT_SYMBOL_GPL(free_percpu);
 992 * percpu area. This function is to be called from arch percpu area 1227 * percpu area. This function is to be called from arch percpu area
993 * setup path. 1228 * setup path.
994 * 1229 *
995 * @get_page_fn() should return pointer to percpu page given cpu
996 * number and page number. It should at least return enough pages to
997 * cover the static area. The returned pages for static area should
998 * have been initialized with valid data. It can also return pages
999 * after the static area. NULL return indicates end of pages for the
1000 * cpu. Note that @get_page_fn() must return the same number of pages
1001 * for all cpus.
1002 *
1003 * @reserved_size, if non-zero, specifies the amount of bytes to 1230 * @reserved_size, if non-zero, specifies the amount of bytes to
1004 * reserve after the static area in the first chunk. This reserves 1231 * reserve after the static area in the first chunk. This reserves
1005 * the first chunk such that it's available only through reserved 1232 * the first chunk such that it's available only through reserved
@@ -1031,8 +1258,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
1031 * The determined pcpu_unit_size which can be used to initialize 1258 * The determined pcpu_unit_size which can be used to initialize
1032 * percpu access. 1259 * percpu access.
1033 */ 1260 */
1034size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, 1261size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
1035 size_t static_size, size_t reserved_size,
1036 ssize_t dyn_size, size_t unit_size, 1262 ssize_t dyn_size, size_t unit_size,
1037 void *base_addr) 1263 void *base_addr)
1038{ 1264{
@@ -1041,8 +1267,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1041 size_t size_sum = static_size + reserved_size + 1267 size_t size_sum = static_size + reserved_size +
1042 (dyn_size >= 0 ? dyn_size : 0); 1268 (dyn_size >= 0 ? dyn_size : 0);
1043 struct pcpu_chunk *schunk, *dchunk = NULL; 1269 struct pcpu_chunk *schunk, *dchunk = NULL;
1044 unsigned int cpu; 1270 int i;
1045 int i, nr_pages;
1046 1271
1047 /* sanity checks */ 1272 /* sanity checks */
1048 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || 1273 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
@@ -1056,8 +1281,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1056 pcpu_unit_pages = unit_size >> PAGE_SHIFT; 1281 pcpu_unit_pages = unit_size >> PAGE_SHIFT;
1057 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1282 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1058 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; 1283 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
1059 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) 1284 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1060 + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); 1285 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1061 1286
1062 if (dyn_size < 0) 1287 if (dyn_size < 0)
1063 dyn_size = pcpu_unit_size - static_size - reserved_size; 1288 dyn_size = pcpu_unit_size - static_size - reserved_size;
@@ -1087,8 +1312,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1087 schunk->vm = &first_vm; 1312 schunk->vm = &first_vm;
1088 schunk->map = smap; 1313 schunk->map = smap;
1089 schunk->map_alloc = ARRAY_SIZE(smap); 1314 schunk->map_alloc = ARRAY_SIZE(smap);
1090 schunk->page = schunk->page_ar;
1091 schunk->immutable = true; 1315 schunk->immutable = true;
1316 bitmap_fill(schunk->populated, pcpu_unit_pages);
1092 1317
1093 if (reserved_size) { 1318 if (reserved_size) {
1094 schunk->free_size = reserved_size; 1319 schunk->free_size = reserved_size;
@@ -1106,38 +1331,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1106 1331
1107 /* init dynamic chunk if necessary */ 1332 /* init dynamic chunk if necessary */
1108 if (dyn_size) { 1333 if (dyn_size) {
1109 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); 1334 dchunk = alloc_bootmem(pcpu_chunk_struct_size);
1110 INIT_LIST_HEAD(&dchunk->list); 1335 INIT_LIST_HEAD(&dchunk->list);
1111 dchunk->vm = &first_vm; 1336 dchunk->vm = &first_vm;
1112 dchunk->map = dmap; 1337 dchunk->map = dmap;
1113 dchunk->map_alloc = ARRAY_SIZE(dmap); 1338 dchunk->map_alloc = ARRAY_SIZE(dmap);
1114 dchunk->page = schunk->page_ar; /* share page map with schunk */
1115 dchunk->immutable = true; 1339 dchunk->immutable = true;
1340 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1116 1341
1117 dchunk->contig_hint = dchunk->free_size = dyn_size; 1342 dchunk->contig_hint = dchunk->free_size = dyn_size;
1118 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; 1343 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
1119 dchunk->map[dchunk->map_used++] = dchunk->free_size; 1344 dchunk->map[dchunk->map_used++] = dchunk->free_size;
1120 } 1345 }
1121 1346
1122 /* assign pages */
1123 nr_pages = -1;
1124 for_each_possible_cpu(cpu) {
1125 for (i = 0; i < pcpu_unit_pages; i++) {
1126 struct page *page = get_page_fn(cpu, i);
1127
1128 if (!page)
1129 break;
1130 *pcpu_chunk_pagep(schunk, cpu, i) = page;
1131 }
1132
1133 BUG_ON(i < PFN_UP(static_size));
1134
1135 if (nr_pages < 0)
1136 nr_pages = i;
1137 else
1138 BUG_ON(nr_pages != i);
1139 }
1140
1141 /* link the first chunk in */ 1347 /* link the first chunk in */
1142 pcpu_first_chunk = dchunk ?: schunk; 1348 pcpu_first_chunk = dchunk ?: schunk;
1143 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1349 pcpu_chunk_relocate(pcpu_first_chunk, -1);
@@ -1160,23 +1366,6 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
1160 return size_sum; 1366 return size_sum;
1161} 1367}
1162 1368
1163/*
1164 * Embedding first chunk setup helper.
1165 */
1166static void *pcpue_ptr __initdata;
1167static size_t pcpue_size __initdata;
1168static size_t pcpue_unit_size __initdata;
1169
1170static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1171{
1172 size_t off = (size_t)pageno << PAGE_SHIFT;
1173
1174 if (off >= pcpue_size)
1175 return NULL;
1176
1177 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
1178}
1179
1180/** 1369/**
1181 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1370 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1182 * @static_size: the size of static percpu area in bytes 1371 * @static_size: the size of static percpu area in bytes
@@ -1207,18 +1396,19 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1207ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, 1396ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1208 ssize_t dyn_size) 1397 ssize_t dyn_size)
1209{ 1398{
1210 size_t chunk_size; 1399 size_t size_sum, unit_size, chunk_size;
1400 void *base;
1211 unsigned int cpu; 1401 unsigned int cpu;
1212 1402
1213 /* determine parameters and allocate */ 1403 /* determine parameters and allocate */
1214 pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); 1404 size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
1215 1405
1216 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); 1406 unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1217 chunk_size = pcpue_unit_size * num_possible_cpus(); 1407 chunk_size = unit_size * num_possible_cpus();
1218 1408
1219 pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, 1409 base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
1220 __pa(MAX_DMA_ADDRESS)); 1410 __pa(MAX_DMA_ADDRESS));
1221 if (!pcpue_ptr) { 1411 if (!base) {
1222 pr_warning("PERCPU: failed to allocate %zu bytes for " 1412 pr_warning("PERCPU: failed to allocate %zu bytes for "
1223 "embedding\n", chunk_size); 1413 "embedding\n", chunk_size);
1224 return -ENOMEM; 1414 return -ENOMEM;
@@ -1226,33 +1416,18 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1226 1416
1227 /* return the leftover and copy */ 1417 /* return the leftover and copy */
1228 for_each_possible_cpu(cpu) { 1418 for_each_possible_cpu(cpu) {
1229 void *ptr = pcpue_ptr + cpu * pcpue_unit_size; 1419 void *ptr = base + cpu * unit_size;
1230 1420
1231 free_bootmem(__pa(ptr + pcpue_size), 1421 free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
1232 pcpue_unit_size - pcpue_size);
1233 memcpy(ptr, __per_cpu_load, static_size); 1422 memcpy(ptr, __per_cpu_load, static_size);
1234 } 1423 }
1235 1424
1236 /* we're ready, commit */ 1425 /* we're ready, commit */
1237 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", 1426 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
1238 pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); 1427 size_sum >> PAGE_SHIFT, base, static_size);
1239 1428
1240 return pcpu_setup_first_chunk(pcpue_get_page, static_size, 1429 return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
1241 reserved_size, dyn_size, 1430 unit_size, base);
1242 pcpue_unit_size, pcpue_ptr);
1243}
1244
1245/*
1246 * 4k page first chunk setup helper.
1247 */
1248static struct page **pcpu4k_pages __initdata;
1249static int pcpu4k_unit_pages __initdata;
1250
1251static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
1252{
1253 if (pageno < pcpu4k_unit_pages)
1254 return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno];
1255 return NULL;
1256} 1431}
1257 1432
1258/** 1433/**
@@ -1279,23 +1454,25 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
1279 pcpu_fc_populate_pte_fn_t populate_pte_fn) 1454 pcpu_fc_populate_pte_fn_t populate_pte_fn)
1280{ 1455{
1281 static struct vm_struct vm; 1456 static struct vm_struct vm;
1457 int unit_pages;
1282 size_t pages_size; 1458 size_t pages_size;
1459 struct page **pages;
1283 unsigned int cpu; 1460 unsigned int cpu;
1284 int i, j; 1461 int i, j;
1285 ssize_t ret; 1462 ssize_t ret;
1286 1463
1287 pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, 1464 unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
1288 PCPU_MIN_UNIT_SIZE)); 1465 PCPU_MIN_UNIT_SIZE));
1289 1466
1290 /* unaligned allocations can't be freed, round up to page size */ 1467 /* unaligned allocations can't be freed, round up to page size */
1291 pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() * 1468 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
1292 sizeof(pcpu4k_pages[0])); 1469 sizeof(pages[0]));
1293 pcpu4k_pages = alloc_bootmem(pages_size); 1470 pages = alloc_bootmem(pages_size);
1294 1471
1295 /* allocate pages */ 1472 /* allocate pages */
1296 j = 0; 1473 j = 0;
1297 for_each_possible_cpu(cpu) 1474 for_each_possible_cpu(cpu)
1298 for (i = 0; i < pcpu4k_unit_pages; i++) { 1475 for (i = 0; i < unit_pages; i++) {
1299 void *ptr; 1476 void *ptr;
1300 1477
1301 ptr = alloc_fn(cpu, PAGE_SIZE); 1478 ptr = alloc_fn(cpu, PAGE_SIZE);
@@ -1304,25 +1481,24 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
1304 "4k page for cpu%u\n", cpu); 1481 "4k page for cpu%u\n", cpu);
1305 goto enomem; 1482 goto enomem;
1306 } 1483 }
1307 pcpu4k_pages[j++] = virt_to_page(ptr); 1484 pages[j++] = virt_to_page(ptr);
1308 } 1485 }
1309 1486
1310 /* allocate vm area, map the pages and copy static data */ 1487 /* allocate vm area, map the pages and copy static data */
1311 vm.flags = VM_ALLOC; 1488 vm.flags = VM_ALLOC;
1312 vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT; 1489 vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
1313 vm_area_register_early(&vm, PAGE_SIZE); 1490 vm_area_register_early(&vm, PAGE_SIZE);
1314 1491
1315 for_each_possible_cpu(cpu) { 1492 for_each_possible_cpu(cpu) {
1316 unsigned long unit_addr = (unsigned long)vm.addr + 1493 unsigned long unit_addr = (unsigned long)vm.addr +
1317 (cpu * pcpu4k_unit_pages << PAGE_SHIFT); 1494 (cpu * unit_pages << PAGE_SHIFT);
1318 1495
1319 for (i = 0; i < pcpu4k_unit_pages; i++) 1496 for (i = 0; i < unit_pages; i++)
1320 populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); 1497 populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
1321 1498
1322 /* pte already populated, the following shouldn't fail */ 1499 /* pte already populated, the following shouldn't fail */
1323 ret = __pcpu_map_pages(unit_addr, 1500 ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
1324 &pcpu4k_pages[cpu * pcpu4k_unit_pages], 1501 unit_pages);
1325 pcpu4k_unit_pages);
1326 if (ret < 0) 1502 if (ret < 0)
1327 panic("failed to map percpu area, err=%zd\n", ret); 1503 panic("failed to map percpu area, err=%zd\n", ret);
1328 1504
@@ -1340,19 +1516,18 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
1340 1516
1341 /* we're ready, commit */ 1517 /* we're ready, commit */
1342 pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n", 1518 pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
1343 pcpu4k_unit_pages, static_size); 1519 unit_pages, static_size);
1344 1520
1345 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 1521 ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
1346 reserved_size, -1, 1522 unit_pages << PAGE_SHIFT, vm.addr);
1347 pcpu4k_unit_pages << PAGE_SHIFT, vm.addr);
1348 goto out_free_ar; 1523 goto out_free_ar;
1349 1524
1350enomem: 1525enomem:
1351 while (--j >= 0) 1526 while (--j >= 0)
1352 free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); 1527 free_fn(page_address(pages[j]), PAGE_SIZE);
1353 ret = -ENOMEM; 1528 ret = -ENOMEM;
1354out_free_ar: 1529out_free_ar:
1355 free_bootmem(__pa(pcpu4k_pages), pages_size); 1530 free_bootmem(__pa(pages), pages_size);
1356 return ret; 1531 return ret;
1357} 1532}
1358 1533
@@ -1370,16 +1545,6 @@ static size_t pcpul_unit_size;
1370static struct pcpul_ent *pcpul_map; 1545static struct pcpul_ent *pcpul_map;
1371static struct vm_struct pcpul_vm; 1546static struct vm_struct pcpul_vm;
1372 1547
1373static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
1374{
1375 size_t off = (size_t)pageno << PAGE_SHIFT;
1376
1377 if (off >= pcpul_size)
1378 return NULL;
1379
1380 return virt_to_page(pcpul_map[cpu].ptr + off);
1381}
1382
1383/** 1548/**
1384 * pcpu_lpage_first_chunk - remap the first percpu chunk using large page 1549 * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
1385 * @static_size: the size of static percpu area in bytes 1550 * @static_size: the size of static percpu area in bytes
@@ -1475,9 +1640,8 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
1475 pr_info("PERCPU: Remapped at %p with large pages, static data " 1640 pr_info("PERCPU: Remapped at %p with large pages, static data "
1476 "%zu bytes\n", pcpul_vm.addr, static_size); 1641 "%zu bytes\n", pcpul_vm.addr, static_size);
1477 1642
1478 ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, 1643 ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
1479 reserved_size, dyn_size, pcpul_unit_size, 1644 pcpul_unit_size, pcpul_vm.addr);
1480 pcpul_vm.addr);
1481 1645
1482 /* sort pcpul_map array for pcpu_lpage_remapped() */ 1646 /* sort pcpul_map array for pcpu_lpage_remapped() */
1483 for (i = 0; i < num_possible_cpus() - 1; i++) 1647 for (i = 0; i < num_possible_cpus() - 1; i++)