aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorWen Congyang <wency@cn.fujitsu.com>2013-02-22 19:33:04 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2013-02-23 20:50:12 -0500
commitae9aae9eda2db71bf4b592f15618b0160eb07731 (patch)
tree9d91b4cba95a464cc19afc33a54d807ec2f372fc
parentcd099682e4c786c3a866e462b37fcac6e3a44a68 (diff)
memory-hotplug: common APIs to support page tables hot-remove
When memory is removed, the corresponding pagetables should also be removed. This patch introduces some common APIs to support vmemmap pagetable and x86_64 architecture direct mapping pagetable removing. All pages of virtual mapping in removed memory cannot be freed if some pages used as PGD/PUD include not only removed memory but also other memory. So this patch uses the following way to check whether a page can be freed or not. 1) When removing memory, the page structs of the removed memory are filled with 0xFD. 2) When all page structs on a PT/PMD are filled with 0xFD, the PT/PMD can be cleared. In this case, the page used as PT/PMD can be freed. For direct mapping pages, update direct_pages_count[level] when we freed their pagetables. And do not free the pages again because they were freed when offlining. For vmemmap pages, free the pages and their pagetables. For larger pages, do not split them into smaller ones because there is no way to know if the larger page has been split. As a result, there is no way to decide when to split. We deal with the larger pages in the following way: 1) For direct mapped pages, all the pages were freed when they were offlined. And since memory offline is done section by section, all the memory ranges being removed are aligned to PAGE_SIZE. So we only need to deal with unaligned pages when freeing vmemmap pages. 2) For vmemmap pages being used to store page_struct, if part of the larger page is still in use, just fill the unused part with 0xFD. And when the whole page is filled with 0xFD, then free the larger page. 
[akpm@linux-foundation.org: fix typo in comment] [tangchen@cn.fujitsu.com: do not calculate direct mapping pages when freeing vmemmap pagetables] [tangchen@cn.fujitsu.com: do not free direct mapping pages twice] [tangchen@cn.fujitsu.com: do not free page split from hugepage one by one] [tangchen@cn.fujitsu.com: do not split pages when freeing pagetable pages] [akpm@linux-foundation.org: use pmd_page_vaddr()] [akpm@linux-foundation.org: fix used-uninitialised bug] Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Signed-off-by: Jianguo Wu <wujianguo@huawei.com> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com> Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Lai Jiangshan <laijs@cn.fujitsu.com> Cc: Wu Jianguo <wujianguo@huawei.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/mm/init_64.c304
-rw-r--r--arch/x86/mm/pageattr.c47
-rw-r--r--include/linux/bootmem.h1
4 files changed, 331 insertions, 22 deletions
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index e6423002c10b..567b5d0632b2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -351,6 +351,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
351 * as a pte too. 351 * as a pte too.
352 */ 352 */
353extern pte_t *lookup_address(unsigned long address, unsigned int *level); 353extern pte_t *lookup_address(unsigned long address, unsigned int *level);
354extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
354extern phys_addr_t slow_virt_to_phys(void *__address); 355extern phys_addr_t slow_virt_to_phys(void *__address);
355 356
356#endif /* !__ASSEMBLY__ */ 357#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f17aa76dc1ae..ca6cd403a275 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -707,6 +707,310 @@ int arch_add_memory(int nid, u64 start, u64 size)
707} 707}
708EXPORT_SYMBOL_GPL(arch_add_memory); 708EXPORT_SYMBOL_GPL(arch_add_memory);
709 709
710#define PAGE_INUSE 0xFD
711
/*
 * free_pagetable() - release a 2^order block of pages that backed a page
 * table or vmemmap storage.
 * @page:  first page of the block to free
 * @order: allocation order of the block
 *
 * Bootmem-allocated pages (identified by PageReserved) are returned through
 * the bootmem helpers and re-credited to the zone/RAM counters; ordinary
 * pages go back to the page allocator via free_pages().
 */
712static void __meminit free_pagetable(struct page *page, int order)
713{
714 struct zone *zone;
715 bool bootmem = false;
716 unsigned long magic;
717 unsigned int nr_pages = 1 << order;
718
719 /* bootmem page has reserved flag */
720 if (PageReserved(page)) {
721 __ClearPageReserved(page);
722 bootmem = true;
723
 /* magic stored in lru.next tags how the bootmem page was registered */
724 magic = (unsigned long)page->lru.next;
725 if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
 /*
 * NOTE(review): this loop exits with nr_pages wrapped to
 * UINT_MAX (post-decrement of 0) and with 'page' advanced
 * one past the block, yet both are reused in the accounting
 * below — looks wrong for this branch; verify upstream.
 */
726 while (nr_pages--)
727 put_page_bootmem(page++);
728 } else
729 __free_pages_bootmem(page, order);
730 } else
731 free_pages((unsigned long)page_address(page), order);
732
733 /*
734 * SECTION_INFO pages and MIX_SECTION_INFO pages
735 * are all allocated by bootmem.
736 */
737 if (bootmem) {
 /* re-credit the freed pages to the zone span and total RAM count */
738 zone = page_zone(page);
739 zone_span_writelock(zone);
740 zone->present_pages += nr_pages;
741 zone_span_writeunlock(zone);
742 totalram_pages += nr_pages;
743 }
744}
745
/*
 * free_pte_table() - free the PTE page referenced by @pmd, but only when
 * every entry in it is already clear; otherwise leave it in place.
 * @pte_start: kernel-virtual address of the PTE page to scan
 * @pmd:       the PMD entry that points at that PTE page
 */
746static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
747{
748 pte_t *pte;
749 int i;
750
 /* bail out if any entry is still live — the table is in use */
751 for (i = 0; i < PTRS_PER_PTE; i++) {
752 pte = pte_start + i;
753 if (pte_val(*pte))
754 return;
755 }
756
757 /* free a pte table */
758 free_pagetable(pmd_page(*pmd), 0);
 /* clear the PMD link under the kernel page-table lock */
759 spin_lock(&init_mm.page_table_lock);
760 pmd_clear(pmd);
761 spin_unlock(&init_mm.page_table_lock);
762}
763
/*
 * free_pmd_table() - free the PMD page referenced by @pud, but only when
 * every entry in it is already clear; otherwise leave it in place.
 * @pmd_start: kernel-virtual address of the PMD page to scan
 * @pud:       the PUD entry that points at that PMD page
 */
764static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
765{
766 pmd_t *pmd;
767 int i;
768
 /* bail out if any entry is still live — the table is in use */
769 for (i = 0; i < PTRS_PER_PMD; i++) {
770 pmd = pmd_start + i;
771 if (pmd_val(*pmd))
772 return;
773 }
774
775 /* free a pmd table */
776 free_pagetable(pud_page(*pud), 0);
 /* clear the PUD link under the kernel page-table lock */
777 spin_lock(&init_mm.page_table_lock);
778 pud_clear(pud);
779 spin_unlock(&init_mm.page_table_lock);
780}
781
/*
 * free_pud_table() - free the PUD page referenced by @pgd when all of its
 * entries are clear, and clear the PGD entry itself.
 * @pud_start: kernel-virtual address of the PUD page to scan
 * @pgd:       the PGD entry that points at that PUD page
 *
 * Return true if pgd is changed, otherwise return false.
 * (The caller uses this to decide whether sync_global_pgds() is needed.)
 */
783static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
784{
785 pud_t *pud;
786 int i;
787
 /* bail out if any entry is still live — the table is in use */
788 for (i = 0; i < PTRS_PER_PUD; i++) {
789 pud = pud_start + i;
790 if (pud_val(*pud))
791 return false;
792 }
793
794 /* free a pud table */
795 free_pagetable(pgd_page(*pgd), 0);
 /* clear the PGD link under the kernel page-table lock */
796 spin_lock(&init_mm.page_table_lock);
797 pgd_clear(pgd);
798 spin_unlock(&init_mm.page_table_lock);
799
800 return true;
801}
802
/*
 * remove_pte_table() - unmap [addr, end) at 4K-PTE granularity.
 * @pte_start: PTE page covering @addr
 * @addr:      virtual start of the range (inclusive)
 * @end:       virtual end of the range (exclusive)
 * @direct:    true when tearing down the direct (identity) mapping, false
 *             for vmemmap; direct-mapped backing pages are NOT freed here
 *             (they were freed at offline time), vmemmap pages are.
 */
803static void __meminit
804remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
805 bool direct)
806{
807 unsigned long next, pages = 0;
808 pte_t *pte;
809 void *page_addr;
810 phys_addr_t phys_addr;
811
812 pte = pte_start + pte_index(addr);
813 for (; addr < end; addr = next, pte++) {
 /* advance one page at a time, clamped to the range end */
814 next = (addr + PAGE_SIZE) & PAGE_MASK;
815 if (next > end)
816 next = end;
817
818 if (!pte_present(*pte))
819 continue;
820
821 /*
822 * We mapped [0,1G) memory as identity mapping when
823 * initializing, in arch/x86/kernel/head_64.S. These
824 * pagetables cannot be removed.
825 */
 /*
 * NOTE(review): pte_val() still carries the flag bits and
 * (addr & PAGE_MASK) is a virtual offset, so this phys_addr
 * is not an exact physical address — confirm the <1G check
 * is intentional with this computation.
 */
826 phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
827 if (phys_addr < (phys_addr_t)0x40000000)
828 return;
829
830 if (IS_ALIGNED(addr, PAGE_SIZE) &&
831 IS_ALIGNED(next, PAGE_SIZE)) {
832 /*
833 * Do not free direct mapping pages since they were
834 * freed when offlining, or simply not in use.
835 */
836 if (!direct)
837 free_pagetable(pte_page(*pte), 0);
838
839 spin_lock(&init_mm.page_table_lock);
840 pte_clear(&init_mm, addr, pte);
841 spin_unlock(&init_mm.page_table_lock);
842
843 /* For non-direct mapping, pages means nothing. */
844 pages++;
845 } else {
846 /*
847 * If we are here, we are freeing vmemmap pages since
848 * direct mapped memory ranges to be freed are aligned.
849 *
850 * If we are not removing the whole page, it means
851 * other page structs in this page are being used and
852 * we cannot remove them. So fill the unused page_structs
853 * with 0xFD, and remove the page when it is wholly
854 * filled with 0xFD.
855 */
856 memset((void *)addr, PAGE_INUSE, next - addr);
857
 /* whole page now poisoned with 0xFD? then nobody uses it */
858 page_addr = page_address(pte_page(*pte));
859 if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
860 free_pagetable(pte_page(*pte), 0);
861
862 spin_lock(&init_mm.page_table_lock);
863 pte_clear(&init_mm, addr, pte);
864 spin_unlock(&init_mm.page_table_lock);
865 }
866 }
867 }
868
869 /* Call free_pte_table() in remove_pmd_table(). */
870 flush_tlb_all();
871 if (direct)
872 update_page_count(PG_LEVEL_4K, -pages);
873}
874
/*
 * remove_pmd_table() - unmap [addr, end) at PMD granularity.
 * @pmd_start: PMD page covering @addr
 * @addr:      virtual start of the range (inclusive)
 * @end:       virtual end of the range (exclusive)
 * @direct:    true for the direct mapping, false for vmemmap
 *
 * 2M-mapped entries are cleared whole when the range covers them entirely;
 * partially-covered 2M vmemmap pages are 0xFD-poisoned and only freed once
 * wholly poisoned (large pages are never split — see the commit log).
 * Non-large entries recurse into remove_pte_table() and then try to free
 * the now-possibly-empty PTE page.
 */
875static void __meminit
876remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
877 bool direct)
878{
879 unsigned long next, pages = 0;
880 pte_t *pte_base;
881 pmd_t *pmd;
882 void *page_addr;
883
884 pmd = pmd_start + pmd_index(addr);
885 for (; addr < end; addr = next, pmd++) {
886 next = pmd_addr_end(addr, end);
887
888 if (!pmd_present(*pmd))
889 continue;
890
891 if (pmd_large(*pmd)) {
 /* fully covered 2M mapping: clear it as one unit */
892 if (IS_ALIGNED(addr, PMD_SIZE) &&
893 IS_ALIGNED(next, PMD_SIZE)) {
894 if (!direct)
895 free_pagetable(pmd_page(*pmd),
896 get_order(PMD_SIZE));
897
898 spin_lock(&init_mm.page_table_lock);
899 pmd_clear(pmd);
900 spin_unlock(&init_mm.page_table_lock);
901 pages++;
902 } else {
903 /* If here, we are freeing vmemmap pages. */
904 memset((void *)addr, PAGE_INUSE, next - addr);
905
 /* free the 2M page only once it is wholly 0xFD */
906 page_addr = page_address(pmd_page(*pmd));
907 if (!memchr_inv(page_addr, PAGE_INUSE,
908 PMD_SIZE)) {
909 free_pagetable(pmd_page(*pmd),
910 get_order(PMD_SIZE));
911
912 spin_lock(&init_mm.page_table_lock);
913 pmd_clear(pmd);
914 spin_unlock(&init_mm.page_table_lock);
915 }
916 }
917
918 continue;
919 }
920
 /* 4K-mapped: descend, then free the PTE page if now empty */
921 pte_base = (pte_t *)pmd_page_vaddr(*pmd);
922 remove_pte_table(pte_base, addr, next, direct);
923 free_pte_table(pte_base, pmd);
924 }
925
926 /* Call free_pmd_table() in remove_pud_table(). */
927 if (direct)
928 update_page_count(PG_LEVEL_2M, -pages);
929}
930
/*
 * remove_pud_table() - unmap [addr, end) at PUD granularity.
 * @pud_start: PUD page covering @addr
 * @addr:      virtual start of the range (inclusive)
 * @end:       virtual end of the range (exclusive)
 * @direct:    true for the direct mapping, false for vmemmap
 *
 * Mirrors remove_pmd_table() one level up: fully covered 1G mappings are
 * cleared whole; partially covered 1G vmemmap pages are 0xFD-poisoned and
 * freed only when wholly poisoned; otherwise descend into the PMD level
 * and then try to free the now-possibly-empty PMD page.
 */
931static void __meminit
932remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
933 bool direct)
934{
935 unsigned long next, pages = 0;
936 pmd_t *pmd_base;
937 pud_t *pud;
938 void *page_addr;
939
940 pud = pud_start + pud_index(addr);
941 for (; addr < end; addr = next, pud++) {
942 next = pud_addr_end(addr, end);
943
944 if (!pud_present(*pud))
945 continue;
946
947 if (pud_large(*pud)) {
 /* fully covered 1G mapping: clear it as one unit */
948 if (IS_ALIGNED(addr, PUD_SIZE) &&
949 IS_ALIGNED(next, PUD_SIZE)) {
950 if (!direct)
951 free_pagetable(pud_page(*pud),
952 get_order(PUD_SIZE));
953
954 spin_lock(&init_mm.page_table_lock);
955 pud_clear(pud);
956 spin_unlock(&init_mm.page_table_lock);
957 pages++;
958 } else {
959 /* If here, we are freeing vmemmap pages. */
960 memset((void *)addr, PAGE_INUSE, next - addr);
961
 /* free the 1G page only once it is wholly 0xFD */
962 page_addr = page_address(pud_page(*pud));
963 if (!memchr_inv(page_addr, PAGE_INUSE,
964 PUD_SIZE)) {
965 free_pagetable(pud_page(*pud),
966 get_order(PUD_SIZE));
967
968 spin_lock(&init_mm.page_table_lock);
969 pud_clear(pud);
970 spin_unlock(&init_mm.page_table_lock);
971 }
972 }
973
974 continue;
975 }
976
 /* smaller mappings: descend, then free the PMD page if now empty */
977 pmd_base = (pmd_t *)pud_page_vaddr(*pud);
978 remove_pmd_table(pmd_base, addr, next, direct);
979 free_pmd_table(pmd_base, pud);
980 }
981
982 if (direct)
983 update_page_count(PG_LEVEL_1G, -pages);
984}
985
/*
 * remove_pagetable() - top-level walk tearing down mappings in [start, end).
 * @start:  virtual start address (inclusive)
 * @end:    virtual end address (exclusive)
 * @direct: true for the direct (identity) mapping, false for vmemmap
 *
 * start and end are both virtual addresses.  Walks the kernel PGD,
 * delegates to remove_pud_table(), and frees emptied PUD pages; if any
 * PGD entry was cleared, other pgds are resynced before the final TLB flush.
 */
987static void __meminit
988remove_pagetable(unsigned long start, unsigned long end, bool direct)
989{
990 unsigned long next;
991 pgd_t *pgd;
992 pud_t *pud;
993 bool pgd_changed = false;
994
995 for (; start < end; start = next) {
996 next = pgd_addr_end(start, end);
997
998 pgd = pgd_offset_k(start);
999 if (!pgd_present(*pgd))
1000 continue;
1001
1002 pud = (pud_t *)pgd_page_vaddr(*pgd);
1003 remove_pud_table(pud, start, next, direct);
1004 if (free_pud_table(pud, pgd))
1005 pgd_changed = true;
1006 }
1007
 /*
 * NOTE(review): by this point the loop has advanced 'start' up to
 * 'end', so this syncs the empty range [end, end - 1] rather than
 * the range that was actually removed — verify against upstream.
 */
1008 if (pgd_changed)
1009 sync_global_pgds(start, end - 1);
1010
1011 flush_tlb_all();
1012}
1013
710#ifdef CONFIG_MEMORY_HOTREMOVE 1014#ifdef CONFIG_MEMORY_HOTREMOVE
711int __ref arch_remove_memory(u64 start, u64 size) 1015int __ref arch_remove_memory(u64 start, u64 size)
712{ 1016{
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a1b1c88f9caf..ca1f1c2bb7be 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -529,21 +529,13 @@ out_unlock:
529 return do_split; 529 return do_split;
530} 530}
531 531
532static int split_large_page(pte_t *kpte, unsigned long address) 532int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
533{ 533{
534 unsigned long pfn, pfninc = 1; 534 unsigned long pfn, pfninc = 1;
535 unsigned int i, level; 535 unsigned int i, level;
536 pte_t *pbase, *tmp; 536 pte_t *tmp;
537 pgprot_t ref_prot; 537 pgprot_t ref_prot;
538 struct page *base; 538 struct page *base = virt_to_page(pbase);
539
540 if (!debug_pagealloc)
541 spin_unlock(&cpa_lock);
542 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
543 if (!debug_pagealloc)
544 spin_lock(&cpa_lock);
545 if (!base)
546 return -ENOMEM;
547 539
548 spin_lock(&pgd_lock); 540 spin_lock(&pgd_lock);
549 /* 541 /*
@@ -551,10 +543,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
551 * up for us already: 543 * up for us already:
552 */ 544 */
553 tmp = lookup_address(address, &level); 545 tmp = lookup_address(address, &level);
554 if (tmp != kpte) 546 if (tmp != kpte) {
555 goto out_unlock; 547 spin_unlock(&pgd_lock);
548 return 1;
549 }
556 550
557 pbase = (pte_t *)page_address(base);
558 paravirt_alloc_pte(&init_mm, page_to_pfn(base)); 551 paravirt_alloc_pte(&init_mm, page_to_pfn(base));
559 ref_prot = pte_pgprot(pte_clrhuge(*kpte)); 552 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
560 /* 553 /*
@@ -601,17 +594,27 @@ static int split_large_page(pte_t *kpte, unsigned long address)
601 * going on. 594 * going on.
602 */ 595 */
603 __flush_tlb_all(); 596 __flush_tlb_all();
597 spin_unlock(&pgd_lock);
604 598
605 base = NULL; 599 return 0;
600}
606 601
607out_unlock: 602static int split_large_page(pte_t *kpte, unsigned long address)
608 /* 603{
609 * If we dropped out via the lookup_address check under 604 pte_t *pbase;
610 * pgd_lock then stick the page back into the pool: 605 struct page *base;
611 */ 606
612 if (base) 607 if (!debug_pagealloc)
608 spin_unlock(&cpa_lock);
609 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
610 if (!debug_pagealloc)
611 spin_lock(&cpa_lock);
612 if (!base)
613 return -ENOMEM;
614
615 pbase = (pte_t *)page_address(base);
616 if (__split_large_page(kpte, address, pbase))
613 __free_page(base); 617 __free_page(base);
614 spin_unlock(&pgd_lock);
615 618
616 return 0; 619 return 0;
617} 620}
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 3cd16ba82f15..cdc3bab01832 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
53 unsigned long size); 53 unsigned long size);
54extern void free_bootmem(unsigned long physaddr, unsigned long size); 54extern void free_bootmem(unsigned long physaddr, unsigned long size);
55extern void free_bootmem_late(unsigned long physaddr, unsigned long size); 55extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
56extern void __free_pages_bootmem(struct page *page, unsigned int order);
56 57
57/* 58/*
58 * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, 59 * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,