author     Dave Jones <davej@redhat.com>  2006-12-12 18:13:32 -0500
committer  Dave Jones <davej@redhat.com>  2006-12-12 18:13:32 -0500
commit     f0eef25339f92f7cd4aeea23d9ae97987a5a1e82 (patch)
tree       2472e94d39f43a9580a6d2d5d92de0b749023263 /mm/page_alloc.c
parent     0cfea5dd98205f2fa318836da664a7d7df1afbc1 (diff)
parent     e1036502e5263851259d147771226161e5ccc85a (diff)
Merge ../linus
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  1156
1 file changed, 1030 insertions, 126 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9810f0a60db7..e6b17b2989e0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,10 @@ | |||
37 | #include <linux/vmalloc.h> | 37 | #include <linux/vmalloc.h> |
38 | #include <linux/mempolicy.h> | 38 | #include <linux/mempolicy.h> |
39 | #include <linux/stop_machine.h> | 39 | #include <linux/stop_machine.h> |
40 | #include <linux/sort.h> | ||
41 | #include <linux/pfn.h> | ||
42 | #include <linux/backing-dev.h> | ||
43 | #include <linux/fault-inject.h> | ||
40 | 44 | ||
41 | #include <asm/tlbflush.h> | 45 | #include <asm/tlbflush.h> |
42 | #include <asm/div64.h> | 46 | #include <asm/div64.h> |
@@ -80,14 +84,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { | |||
80 | 84 | ||
81 | EXPORT_SYMBOL(totalram_pages); | 85 | EXPORT_SYMBOL(totalram_pages); |
82 | 86 | ||
83 | /* | 87 | static char * const zone_names[MAX_NR_ZONES] = { |
84 | * Used by page_zone() to look up the address of the struct zone whose | ||
85 | * id is encoded in the upper bits of page->flags | ||
86 | */ | ||
87 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; | ||
88 | EXPORT_SYMBOL(zone_table); | ||
89 | |||
90 | static char *zone_names[MAX_NR_ZONES] = { | ||
91 | "DMA", | 88 | "DMA", |
92 | #ifdef CONFIG_ZONE_DMA32 | 89 | #ifdef CONFIG_ZONE_DMA32 |
93 | "DMA32", | 90 | "DMA32", |
@@ -102,6 +99,38 @@ int min_free_kbytes = 1024; | |||
102 | 99 | ||
103 | unsigned long __meminitdata nr_kernel_pages; | 100 | unsigned long __meminitdata nr_kernel_pages; |
104 | unsigned long __meminitdata nr_all_pages; | 101 | unsigned long __meminitdata nr_all_pages; |
102 | static unsigned long __initdata dma_reserve; | ||
103 | |||
104 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
105 | /* | ||
106 | * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct | ||
107 | * ranges of memory (RAM) that may be registered with add_active_range(). | ||
108 | * Ranges passed to add_active_range() will be merged if possible | ||
109 | * so the number of times add_active_range() can be called is | ||
110 | * related to the number of nodes and the number of holes | ||
111 | */ | ||
112 | #ifdef CONFIG_MAX_ACTIVE_REGIONS | ||
113 | /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ | ||
114 | #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS | ||
115 | #else | ||
116 | #if MAX_NUMNODES >= 32 | ||
117 | /* If there can be many nodes, allow up to 50 holes per node */ | ||
118 | #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) | ||
119 | #else | ||
120 | /* By default, allow up to 256 distinct regions */ | ||
121 | #define MAX_ACTIVE_REGIONS 256 | ||
122 | #endif | ||
123 | #endif | ||
124 | |||
125 | struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; | ||
126 | int __initdata nr_nodemap_entries; | ||
127 | unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | ||
128 | unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | ||
129 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
130 | unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; | ||
131 | unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; | ||
132 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
133 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
105 | 134 | ||
106 | #ifdef CONFIG_DEBUG_VM | 135 | #ifdef CONFIG_DEBUG_VM |
107 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 136 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
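
A rough, userspace-only sketch of how the MAX_ACTIVE_REGIONS cascade added in the hunk above resolves for one configuration. The MAX_NUMNODES value and the commented-out CONFIG_MAX_ACTIVE_REGIONS override are stand-ins chosen for illustration, not values from this patch.

/* Toy mirror of the preprocessor cascade; build with any C compiler. */
#include <stdio.h>

#define MAX_NUMNODES 64				/* pretend: a large NUMA build */
/* #define CONFIG_MAX_ACTIVE_REGIONS 128 */	/* uncomment to emulate an arch override */

#ifdef CONFIG_MAX_ACTIVE_REGIONS
#define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
#else
#if MAX_NUMNODES >= 32
#define MAX_ACTIVE_REGIONS (MAX_NUMNODES * 50)	/* up to ~50 holes per node */
#else
#define MAX_ACTIVE_REGIONS 256
#endif
#endif

int main(void)
{
	printf("early_node_map[] would hold %d entries\n", MAX_ACTIVE_REGIONS);
	return 0;
}
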
@@ -202,7 +231,7 @@ static void prep_compound_page(struct page *page, unsigned long order) | |||
202 | int i; | 231 | int i; |
203 | int nr_pages = 1 << order; | 232 | int nr_pages = 1 << order; |
204 | 233 | ||
205 | page[1].lru.next = (void *)free_compound_page; /* set dtor */ | 234 | set_compound_page_dtor(page, free_compound_page); |
206 | page[1].lru.prev = (void *)order; | 235 | page[1].lru.prev = (void *)order; |
207 | for (i = 0; i < nr_pages; i++) { | 236 | for (i = 0; i < nr_pages; i++) { |
208 | struct page *p = page + i; | 237 | struct page *p = page + i; |
@@ -451,7 +480,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order) | |||
451 | spin_lock(&zone->lock); | 480 | spin_lock(&zone->lock); |
452 | zone->all_unreclaimable = 0; | 481 | zone->all_unreclaimable = 0; |
453 | zone->pages_scanned = 0; | 482 | zone->pages_scanned = 0; |
454 | __free_one_page(page, zone ,order); | 483 | __free_one_page(page, zone, order); |
455 | spin_unlock(&zone->lock); | 484 | spin_unlock(&zone->lock); |
456 | } | 485 | } |
457 | 486 | ||
@@ -461,17 +490,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
461 | int i; | 490 | int i; |
462 | int reserved = 0; | 491 | int reserved = 0; |
463 | 492 | ||
464 | arch_free_page(page, order); | ||
465 | if (!PageHighMem(page)) | ||
466 | debug_check_no_locks_freed(page_address(page), | ||
467 | PAGE_SIZE<<order); | ||
468 | |||
469 | for (i = 0 ; i < (1 << order) ; ++i) | 493 | for (i = 0 ; i < (1 << order) ; ++i) |
470 | reserved += free_pages_check(page + i); | 494 | reserved += free_pages_check(page + i); |
471 | if (reserved) | 495 | if (reserved) |
472 | return; | 496 | return; |
473 | 497 | ||
498 | if (!PageHighMem(page)) | ||
499 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); | ||
500 | arch_free_page(page, order); | ||
474 | kernel_map_pages(page, 1 << order, 0); | 501 | kernel_map_pages(page, 1 << order, 0); |
502 | |||
475 | local_irq_save(flags); | 503 | local_irq_save(flags); |
476 | __count_vm_events(PGFREE, 1 << order); | 504 | __count_vm_events(PGFREE, 1 << order); |
477 | free_one_page(page_zone(page), page, order); | 505 | free_one_page(page_zone(page), page, order); |
@@ -571,6 +599,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
571 | 1 << PG_checked | 1 << PG_mappedtodisk); | 599 | 1 << PG_checked | 1 << PG_mappedtodisk); |
572 | set_page_private(page, 0); | 600 | set_page_private(page, 0); |
573 | set_page_refcounted(page); | 601 | set_page_refcounted(page); |
602 | |||
603 | arch_alloc_page(page, order); | ||
574 | kernel_map_pages(page, 1 << order, 1); | 604 | kernel_map_pages(page, 1 << order, 1); |
575 | 605 | ||
576 | if (gfp_flags & __GFP_ZERO) | 606 | if (gfp_flags & __GFP_ZERO) |
@@ -656,9 +686,15 @@ void drain_node_pages(int nodeid) | |||
656 | 686 | ||
657 | pcp = &pset->pcp[i]; | 687 | pcp = &pset->pcp[i]; |
658 | if (pcp->count) { | 688 | if (pcp->count) { |
689 | int to_drain; | ||
690 | |||
659 | local_irq_save(flags); | 691 | local_irq_save(flags); |
660 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 692 | if (pcp->count >= pcp->batch) |
661 | pcp->count = 0; | 693 | to_drain = pcp->batch; |
694 | else | ||
695 | to_drain = pcp->count; | ||
696 | free_pages_bulk(zone, to_drain, &pcp->list, 0); | ||
697 | pcp->count -= to_drain; | ||
662 | local_irq_restore(flags); | 698 | local_irq_restore(flags); |
663 | } | 699 | } |
664 | } | 700 | } |
@@ -666,7 +702,6 @@ void drain_node_pages(int nodeid) | |||
666 | } | 702 | } |
667 | #endif | 703 | #endif |
668 | 704 | ||
669 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | ||
670 | static void __drain_pages(unsigned int cpu) | 705 | static void __drain_pages(unsigned int cpu) |
671 | { | 706 | { |
672 | unsigned long flags; | 707 | unsigned long flags; |
@@ -688,7 +723,6 @@ static void __drain_pages(unsigned int cpu) | |||
688 | } | 723 | } |
689 | } | 724 | } |
690 | } | 725 | } |
691 | #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ | ||
692 | 726 | ||
693 | #ifdef CONFIG_PM | 727 | #ifdef CONFIG_PM |
694 | 728 | ||
@@ -747,13 +781,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
747 | struct per_cpu_pages *pcp; | 781 | struct per_cpu_pages *pcp; |
748 | unsigned long flags; | 782 | unsigned long flags; |
749 | 783 | ||
750 | arch_free_page(page, 0); | ||
751 | |||
752 | if (PageAnon(page)) | 784 | if (PageAnon(page)) |
753 | page->mapping = NULL; | 785 | page->mapping = NULL; |
754 | if (free_pages_check(page)) | 786 | if (free_pages_check(page)) |
755 | return; | 787 | return; |
756 | 788 | ||
789 | if (!PageHighMem(page)) | ||
790 | debug_check_no_locks_freed(page_address(page), PAGE_SIZE); | ||
791 | arch_free_page(page, 0); | ||
757 | kernel_map_pages(page, 1, 0); | 792 | kernel_map_pages(page, 1, 0); |
758 | 793 | ||
759 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 794 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; |
@@ -818,7 +853,7 @@ again: | |||
818 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; | 853 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; |
819 | local_irq_save(flags); | 854 | local_irq_save(flags); |
820 | if (!pcp->count) { | 855 | if (!pcp->count) { |
821 | pcp->count += rmqueue_bulk(zone, 0, | 856 | pcp->count = rmqueue_bulk(zone, 0, |
822 | pcp->batch, &pcp->list); | 857 | pcp->batch, &pcp->list); |
823 | if (unlikely(!pcp->count)) | 858 | if (unlikely(!pcp->count)) |
824 | goto failed; | 859 | goto failed; |
@@ -858,6 +893,91 @@ failed: | |||
858 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 893 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
859 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 894 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
860 | 895 | ||
896 | #ifdef CONFIG_FAIL_PAGE_ALLOC | ||
897 | |||
898 | static struct fail_page_alloc_attr { | ||
899 | struct fault_attr attr; | ||
900 | |||
901 | u32 ignore_gfp_highmem; | ||
902 | u32 ignore_gfp_wait; | ||
903 | |||
904 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
905 | |||
906 | struct dentry *ignore_gfp_highmem_file; | ||
907 | struct dentry *ignore_gfp_wait_file; | ||
908 | |||
909 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
910 | |||
911 | } fail_page_alloc = { | ||
912 | .attr = FAULT_ATTR_INITIALIZER, | ||
913 | .ignore_gfp_wait = 1, | ||
914 | .ignore_gfp_highmem = 1, | ||
915 | }; | ||
916 | |||
917 | static int __init setup_fail_page_alloc(char *str) | ||
918 | { | ||
919 | return setup_fault_attr(&fail_page_alloc.attr, str); | ||
920 | } | ||
921 | __setup("fail_page_alloc=", setup_fail_page_alloc); | ||
922 | |||
923 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | ||
924 | { | ||
925 | if (gfp_mask & __GFP_NOFAIL) | ||
926 | return 0; | ||
927 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | ||
928 | return 0; | ||
929 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | ||
930 | return 0; | ||
931 | |||
932 | return should_fail(&fail_page_alloc.attr, 1 << order); | ||
933 | } | ||
934 | |||
935 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
936 | |||
937 | static int __init fail_page_alloc_debugfs(void) | ||
938 | { | ||
939 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | ||
940 | struct dentry *dir; | ||
941 | int err; | ||
942 | |||
943 | err = init_fault_attr_dentries(&fail_page_alloc.attr, | ||
944 | "fail_page_alloc"); | ||
945 | if (err) | ||
946 | return err; | ||
947 | dir = fail_page_alloc.attr.dentries.dir; | ||
948 | |||
949 | fail_page_alloc.ignore_gfp_wait_file = | ||
950 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
951 | &fail_page_alloc.ignore_gfp_wait); | ||
952 | |||
953 | fail_page_alloc.ignore_gfp_highmem_file = | ||
954 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, | ||
955 | &fail_page_alloc.ignore_gfp_highmem); | ||
956 | |||
957 | if (!fail_page_alloc.ignore_gfp_wait_file || | ||
958 | !fail_page_alloc.ignore_gfp_highmem_file) { | ||
959 | err = -ENOMEM; | ||
960 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); | ||
961 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); | ||
962 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); | ||
963 | } | ||
964 | |||
965 | return err; | ||
966 | } | ||
967 | |||
968 | late_initcall(fail_page_alloc_debugfs); | ||
969 | |||
970 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
971 | |||
972 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | ||
973 | |||
974 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | ||
975 | { | ||
976 | return 0; | ||
977 | } | ||
978 | |||
979 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | ||
980 | |||
861 | /* | 981 | /* |
862 | * Return 1 if free pages are above 'mark'. This takes into account the order | 982 | * Return 1 if free pages are above 'mark'. This takes into account the order |
863 | * of the allocation. | 983 | * of the allocation. |
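
The CONFIG_FAIL_PAGE_ALLOC block added above plugs the page allocator into the generic fault-injection framework. Below is a much-simplified userspace mock of the kind of decision should_fail() makes; the struct and field names are invented for the sketch and are not the kernel's struct fault_attr.

#include <stdio.h>
#include <stdlib.h>

struct fake_fault_attr {
	unsigned int probability;	/* percent of eligible calls to fail */
	unsigned int interval;		/* only every Nth call is eligible */
	int times;			/* remaining failures, -1 = unlimited */
	unsigned long count;		/* calls seen so far */
};

static int fake_should_fail(struct fake_fault_attr *attr)
{
	attr->count++;
	if (attr->times == 0)
		return 0;
	if (attr->interval > 1 && attr->count % attr->interval)
		return 0;
	if ((unsigned int)(rand() % 100) >= attr->probability)
		return 0;
	if (attr->times > 0)
		attr->times--;
	return 1;
}

int main(void)
{
	struct fake_fault_attr attr = { .probability = 10, .interval = 1,
					.times = 5, .count = 0 };
	int i, failures = 0;

	for (i = 0; i < 1000; i++)
		failures += fake_should_fail(&attr);
	printf("injected %d failures in 1000 calls\n", failures);
	return 0;
}
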
@@ -866,7 +986,8 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
866 | int classzone_idx, int alloc_flags) | 986 | int classzone_idx, int alloc_flags) |
867 | { | 987 | { |
868 | /* free_pages my go negative - that's OK */ | 988 | /* free_pages my go negative - that's OK */ |
869 | long min = mark, free_pages = z->free_pages - (1 << order) + 1; | 989 | unsigned long min = mark; |
990 | long free_pages = z->free_pages - (1 << order) + 1; | ||
870 | int o; | 991 | int o; |
871 | 992 | ||
872 | if (alloc_flags & ALLOC_HIGH) | 993 | if (alloc_flags & ALLOC_HIGH) |
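
For context on the zone_watermark_ok() change above, here is a simplified userspace sketch of the order-aware watermark test the comment describes. It ignores lowmem_reserve and the ALLOC_HIGH/ALLOC_HARDER adjustments, and the numbers in main() are made up.

#include <stdio.h>

#define MAX_ORDER 11

static int watermark_ok(long free_pages, int order, long min,
			const unsigned long *free_by_order)
{
	int o;

	free_pages -= (1L << order) - 1;	/* may go negative, that's OK */
	if (free_pages <= min)
		return 0;
	for (o = 0; o < order; o++) {
		/* blocks of this order cannot satisfy the request ... */
		free_pages -= free_by_order[o] << o;
		/* ... and progressively less slack is required */
		min >>= 1;
		if (free_pages <= min)
			return 0;
	}
	return 1;
}

int main(void)
{
	unsigned long free_by_order[MAX_ORDER] = { 128, 64, 32, 16 };
	long free_pages = 128 + 64 * 2 + 32 * 4 + 16 * 8;

	printf("order-3 allocation ok: %d\n",
	       watermark_ok(free_pages, 3, 64, free_by_order));
	return 0;
}
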
@@ -889,31 +1010,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
889 | return 1; | 1010 | return 1; |
890 | } | 1011 | } |
891 | 1012 | ||
1013 | #ifdef CONFIG_NUMA | ||
892 | /* | 1014 | /* |
893 | * get_page_from_freeliest goes through the zonelist trying to allocate | 1015 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to |
1016 | * skip over zones that are not allowed by the cpuset, or that have | ||
1017 | * been recently (in last second) found to be nearly full. See further | ||
1018 | * comments in mmzone.h. Reduces cache footprint of zonelist scans | ||
1019 | * that have to skip over alot of full or unallowed zones. | ||
1020 | * | ||
1021 | * If the zonelist cache is present in the passed in zonelist, then | ||
1022 | * returns a pointer to the allowed node mask (either the current | ||
1023 | * tasks mems_allowed, or node_online_map.) | ||
1024 | * | ||
1025 | * If the zonelist cache is not available for this zonelist, does | ||
1026 | * nothing and returns NULL. | ||
1027 | * | ||
1028 | * If the fullzones BITMAP in the zonelist cache is stale (more than | ||
1029 | * a second since last zap'd) then we zap it out (clear its bits.) | ||
1030 | * | ||
1031 | * We hold off even calling zlc_setup, until after we've checked the | ||
1032 | * first zone in the zonelist, on the theory that most allocations will | ||
1033 | * be satisfied from that first zone, so best to examine that zone as | ||
1034 | * quickly as we can. | ||
1035 | */ | ||
1036 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | ||
1037 | { | ||
1038 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1039 | nodemask_t *allowednodes; /* zonelist_cache approximation */ | ||
1040 | |||
1041 | zlc = zonelist->zlcache_ptr; | ||
1042 | if (!zlc) | ||
1043 | return NULL; | ||
1044 | |||
1045 | if (jiffies - zlc->last_full_zap > 1 * HZ) { | ||
1046 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1047 | zlc->last_full_zap = jiffies; | ||
1048 | } | ||
1049 | |||
1050 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | ||
1051 | &cpuset_current_mems_allowed : | ||
1052 | &node_online_map; | ||
1053 | return allowednodes; | ||
1054 | } | ||
1055 | |||
1056 | /* | ||
1057 | * Given 'z' scanning a zonelist, run a couple of quick checks to see | ||
1058 | * if it is worth looking at further for free memory: | ||
1059 | * 1) Check that the zone isn't thought to be full (doesn't have its | ||
1060 | * bit set in the zonelist_cache fullzones BITMAP). | ||
1061 | * 2) Check that the zones node (obtained from the zonelist_cache | ||
1062 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. | ||
1063 | * Return true (non-zero) if zone is worth looking at further, or | ||
1064 | * else return false (zero) if it is not. | ||
1065 | * | ||
1066 | * This check -ignores- the distinction between various watermarks, | ||
1067 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is | ||
1068 | * found to be full for any variation of these watermarks, it will | ||
1069 | * be considered full for up to one second by all requests, unless | ||
1070 | * we are so low on memory on all allowed nodes that we are forced | ||
1071 | * into the second scan of the zonelist. | ||
1072 | * | ||
1073 | * In the second scan we ignore this zonelist cache and exactly | ||
1074 | * apply the watermarks to all zones, even it is slower to do so. | ||
1075 | * We are low on memory in the second scan, and should leave no stone | ||
1076 | * unturned looking for a free page. | ||
1077 | */ | ||
1078 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | ||
1079 | nodemask_t *allowednodes) | ||
1080 | { | ||
1081 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1082 | int i; /* index of *z in zonelist zones */ | ||
1083 | int n; /* node that zone *z is on */ | ||
1084 | |||
1085 | zlc = zonelist->zlcache_ptr; | ||
1086 | if (!zlc) | ||
1087 | return 1; | ||
1088 | |||
1089 | i = z - zonelist->zones; | ||
1090 | n = zlc->z_to_n[i]; | ||
1091 | |||
1092 | /* This zone is worth trying if it is allowed but not full */ | ||
1093 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); | ||
1094 | } | ||
1095 | |||
1096 | /* | ||
1097 | * Given 'z' scanning a zonelist, set the corresponding bit in | ||
1098 | * zlc->fullzones, so that subsequent attempts to allocate a page | ||
1099 | * from that zone don't waste time re-examining it. | ||
1100 | */ | ||
1101 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | ||
1102 | { | ||
1103 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1104 | int i; /* index of *z in zonelist zones */ | ||
1105 | |||
1106 | zlc = zonelist->zlcache_ptr; | ||
1107 | if (!zlc) | ||
1108 | return; | ||
1109 | |||
1110 | i = z - zonelist->zones; | ||
1111 | |||
1112 | set_bit(i, zlc->fullzones); | ||
1113 | } | ||
1114 | |||
1115 | #else /* CONFIG_NUMA */ | ||
1116 | |||
1117 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | ||
1118 | { | ||
1119 | return NULL; | ||
1120 | } | ||
1121 | |||
1122 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | ||
1123 | nodemask_t *allowednodes) | ||
1124 | { | ||
1125 | return 1; | ||
1126 | } | ||
1127 | |||
1128 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | ||
1129 | { | ||
1130 | } | ||
1131 | #endif /* CONFIG_NUMA */ | ||
1132 | |||
1133 | /* | ||
1134 | * get_page_from_freelist goes through the zonelist trying to allocate | ||
894 | * a page. | 1135 | * a page. |
895 | */ | 1136 | */ |
896 | static struct page * | 1137 | static struct page * |
897 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | 1138 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, |
898 | struct zonelist *zonelist, int alloc_flags) | 1139 | struct zonelist *zonelist, int alloc_flags) |
899 | { | 1140 | { |
900 | struct zone **z = zonelist->zones; | 1141 | struct zone **z; |
901 | struct page *page = NULL; | 1142 | struct page *page = NULL; |
902 | int classzone_idx = zone_idx(*z); | 1143 | int classzone_idx = zone_idx(zonelist->zones[0]); |
903 | struct zone *zone; | 1144 | struct zone *zone; |
1145 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | ||
1146 | int zlc_active = 0; /* set if using zonelist_cache */ | ||
1147 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | ||
904 | 1148 | ||
1149 | zonelist_scan: | ||
905 | /* | 1150 | /* |
906 | * Go through the zonelist once, looking for a zone with enough free. | 1151 | * Scan zonelist, looking for a zone with enough free. |
907 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1152 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
908 | */ | 1153 | */ |
1154 | z = zonelist->zones; | ||
1155 | |||
909 | do { | 1156 | do { |
1157 | if (NUMA_BUILD && zlc_active && | ||
1158 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | ||
1159 | continue; | ||
910 | zone = *z; | 1160 | zone = *z; |
911 | if (unlikely((gfp_mask & __GFP_THISNODE) && | 1161 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && |
912 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | 1162 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) |
913 | break; | 1163 | break; |
914 | if ((alloc_flags & ALLOC_CPUSET) && | 1164 | if ((alloc_flags & ALLOC_CPUSET) && |
915 | !cpuset_zone_allowed(zone, gfp_mask)) | 1165 | !cpuset_zone_allowed(zone, gfp_mask)) |
916 | continue; | 1166 | goto try_next_zone; |
917 | 1167 | ||
918 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1168 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
919 | unsigned long mark; | 1169 | unsigned long mark; |
@@ -923,18 +1173,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
923 | mark = zone->pages_low; | 1173 | mark = zone->pages_low; |
924 | else | 1174 | else |
925 | mark = zone->pages_high; | 1175 | mark = zone->pages_high; |
926 | if (!zone_watermark_ok(zone , order, mark, | 1176 | if (!zone_watermark_ok(zone, order, mark, |
927 | classzone_idx, alloc_flags)) | 1177 | classzone_idx, alloc_flags)) { |
928 | if (!zone_reclaim_mode || | 1178 | if (!zone_reclaim_mode || |
929 | !zone_reclaim(zone, gfp_mask, order)) | 1179 | !zone_reclaim(zone, gfp_mask, order)) |
930 | continue; | 1180 | goto this_zone_full; |
1181 | } | ||
931 | } | 1182 | } |
932 | 1183 | ||
933 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); | 1184 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); |
934 | if (page) { | 1185 | if (page) |
935 | break; | 1186 | break; |
1187 | this_zone_full: | ||
1188 | if (NUMA_BUILD) | ||
1189 | zlc_mark_zone_full(zonelist, z); | ||
1190 | try_next_zone: | ||
1191 | if (NUMA_BUILD && !did_zlc_setup) { | ||
1192 | /* we do zlc_setup after the first zone is tried */ | ||
1193 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1194 | zlc_active = 1; | ||
1195 | did_zlc_setup = 1; | ||
936 | } | 1196 | } |
937 | } while (*(++z) != NULL); | 1197 | } while (*(++z) != NULL); |
1198 | |||
1199 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | ||
1200 | /* Disable zlc cache for second zonelist scan */ | ||
1201 | zlc_active = 0; | ||
1202 | goto zonelist_scan; | ||
1203 | } | ||
938 | return page; | 1204 | return page; |
939 | } | 1205 | } |
940 | 1206 | ||
@@ -956,6 +1222,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, | |||
956 | 1222 | ||
957 | might_sleep_if(wait); | 1223 | might_sleep_if(wait); |
958 | 1224 | ||
1225 | if (should_fail_alloc_page(gfp_mask, order)) | ||
1226 | return NULL; | ||
1227 | |||
959 | restart: | 1228 | restart: |
960 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 1229 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
961 | 1230 | ||
@@ -969,9 +1238,19 @@ restart: | |||
969 | if (page) | 1238 | if (page) |
970 | goto got_pg; | 1239 | goto got_pg; |
971 | 1240 | ||
972 | do { | 1241 | /* |
1242 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | ||
1243 | * __GFP_NOWARN set) should not cause reclaim since the subsystem | ||
1244 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim | ||
1245 | * using a larger set of nodes after it has established that the | ||
1246 | * allowed per node queues are empty and that nodes are | ||
1247 | * over allocated. | ||
1248 | */ | ||
1249 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
1250 | goto nopage; | ||
1251 | |||
1252 | for (z = zonelist->zones; *z; z++) | ||
973 | wakeup_kswapd(*z, order); | 1253 | wakeup_kswapd(*z, order); |
974 | } while (*(++z)); | ||
975 | 1254 | ||
976 | /* | 1255 | /* |
977 | * OK, we're below the kswapd watermark and have kicked background | 1256 | * OK, we're below the kswapd watermark and have kicked background |
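
The GFP_THISNODE early-exit added above keys off all three bits of the composite mask being set, as the comment spells out. A minimal standalone illustration of that mask test, with invented flag values, is:

#include <stdio.h>

#define FAKE__GFP_THISNODE 0x1u
#define FAKE__GFP_NORETRY  0x2u
#define FAKE__GFP_NOWARN   0x4u
#define FAKE_GFP_THISNODE  (FAKE__GFP_THISNODE | FAKE__GFP_NORETRY | FAKE__GFP_NOWARN)

static int is_gfp_thisnode(unsigned int gfp_mask)
{
	/* true only when every bit of the composite mask is set */
	return (gfp_mask & FAKE_GFP_THISNODE) == FAKE_GFP_THISNODE;
}

int main(void)
{
	printf("%d %d\n", is_gfp_thisnode(FAKE__GFP_THISNODE),
	       is_gfp_thisnode(FAKE_GFP_THISNODE));	/* prints 0 1 */
	return 0;
}
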
@@ -1005,6 +1284,7 @@ restart: | |||
1005 | 1284 | ||
1006 | /* This allocation should allow future memory freeing. */ | 1285 | /* This allocation should allow future memory freeing. */ |
1007 | 1286 | ||
1287 | rebalance: | ||
1008 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1288 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) |
1009 | && !in_interrupt()) { | 1289 | && !in_interrupt()) { |
1010 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1290 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
@@ -1015,7 +1295,7 @@ nofail_alloc: | |||
1015 | if (page) | 1295 | if (page) |
1016 | goto got_pg; | 1296 | goto got_pg; |
1017 | if (gfp_mask & __GFP_NOFAIL) { | 1297 | if (gfp_mask & __GFP_NOFAIL) { |
1018 | blk_congestion_wait(WRITE, HZ/50); | 1298 | congestion_wait(WRITE, HZ/50); |
1019 | goto nofail_alloc; | 1299 | goto nofail_alloc; |
1020 | } | 1300 | } |
1021 | } | 1301 | } |
@@ -1026,7 +1306,6 @@ nofail_alloc: | |||
1026 | if (!wait) | 1306 | if (!wait) |
1027 | goto nopage; | 1307 | goto nopage; |
1028 | 1308 | ||
1029 | rebalance: | ||
1030 | cond_resched(); | 1309 | cond_resched(); |
1031 | 1310 | ||
1032 | /* We now go into synchronous reclaim */ | 1311 | /* We now go into synchronous reclaim */ |
@@ -1078,7 +1357,7 @@ rebalance: | |||
1078 | do_retry = 1; | 1357 | do_retry = 1; |
1079 | } | 1358 | } |
1080 | if (do_retry) { | 1359 | if (do_retry) { |
1081 | blk_congestion_wait(WRITE, HZ/50); | 1360 | congestion_wait(WRITE, HZ/50); |
1082 | goto rebalance; | 1361 | goto rebalance; |
1083 | } | 1362 | } |
1084 | 1363 | ||
@@ -1222,14 +1501,12 @@ unsigned int nr_free_pagecache_pages(void) | |||
1222 | { | 1501 | { |
1223 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); | 1502 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); |
1224 | } | 1503 | } |
1225 | #ifdef CONFIG_NUMA | 1504 | |
1226 | static void show_node(struct zone *zone) | 1505 | static inline void show_node(struct zone *zone) |
1227 | { | 1506 | { |
1228 | printk("Node %ld ", zone_to_nid(zone)); | 1507 | if (NUMA_BUILD) |
1508 | printk("Node %d ", zone_to_nid(zone)); | ||
1229 | } | 1509 | } |
1230 | #else | ||
1231 | #define show_node(zone) do { } while (0) | ||
1232 | #endif | ||
1233 | 1510 | ||
1234 | void si_meminfo(struct sysinfo *val) | 1511 | void si_meminfo(struct sysinfo *val) |
1235 | { | 1512 | { |
@@ -1271,34 +1548,30 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
1271 | */ | 1548 | */ |
1272 | void show_free_areas(void) | 1549 | void show_free_areas(void) |
1273 | { | 1550 | { |
1274 | int cpu, temperature; | 1551 | int cpu; |
1275 | unsigned long active; | 1552 | unsigned long active; |
1276 | unsigned long inactive; | 1553 | unsigned long inactive; |
1277 | unsigned long free; | 1554 | unsigned long free; |
1278 | struct zone *zone; | 1555 | struct zone *zone; |
1279 | 1556 | ||
1280 | for_each_zone(zone) { | 1557 | for_each_zone(zone) { |
1281 | show_node(zone); | 1558 | if (!populated_zone(zone)) |
1282 | printk("%s per-cpu:", zone->name); | ||
1283 | |||
1284 | if (!populated_zone(zone)) { | ||
1285 | printk(" empty\n"); | ||
1286 | continue; | 1559 | continue; |
1287 | } else | 1560 | |
1288 | printk("\n"); | 1561 | show_node(zone); |
1562 | printk("%s per-cpu:\n", zone->name); | ||
1289 | 1563 | ||
1290 | for_each_online_cpu(cpu) { | 1564 | for_each_online_cpu(cpu) { |
1291 | struct per_cpu_pageset *pageset; | 1565 | struct per_cpu_pageset *pageset; |
1292 | 1566 | ||
1293 | pageset = zone_pcp(zone, cpu); | 1567 | pageset = zone_pcp(zone, cpu); |
1294 | 1568 | ||
1295 | for (temperature = 0; temperature < 2; temperature++) | 1569 | printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " |
1296 | printk("cpu %d %s: high %d, batch %d used:%d\n", | 1570 | "Cold: hi:%5d, btch:%4d usd:%4d\n", |
1297 | cpu, | 1571 | cpu, pageset->pcp[0].high, |
1298 | temperature ? "cold" : "hot", | 1572 | pageset->pcp[0].batch, pageset->pcp[0].count, |
1299 | pageset->pcp[temperature].high, | 1573 | pageset->pcp[1].high, pageset->pcp[1].batch, |
1300 | pageset->pcp[temperature].batch, | 1574 | pageset->pcp[1].count); |
1301 | pageset->pcp[temperature].count); | ||
1302 | } | 1575 | } |
1303 | } | 1576 | } |
1304 | 1577 | ||
@@ -1320,6 +1593,9 @@ void show_free_areas(void) | |||
1320 | for_each_zone(zone) { | 1593 | for_each_zone(zone) { |
1321 | int i; | 1594 | int i; |
1322 | 1595 | ||
1596 | if (!populated_zone(zone)) | ||
1597 | continue; | ||
1598 | |||
1323 | show_node(zone); | 1599 | show_node(zone); |
1324 | printk("%s" | 1600 | printk("%s" |
1325 | " free:%lukB" | 1601 | " free:%lukB" |
@@ -1352,12 +1628,11 @@ void show_free_areas(void) | |||
1352 | for_each_zone(zone) { | 1628 | for_each_zone(zone) { |
1353 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 1629 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
1354 | 1630 | ||
1631 | if (!populated_zone(zone)) | ||
1632 | continue; | ||
1633 | |||
1355 | show_node(zone); | 1634 | show_node(zone); |
1356 | printk("%s: ", zone->name); | 1635 | printk("%s: ", zone->name); |
1357 | if (!populated_zone(zone)) { | ||
1358 | printk("empty\n"); | ||
1359 | continue; | ||
1360 | } | ||
1361 | 1636 | ||
1362 | spin_lock_irqsave(&zone->lock, flags); | 1637 | spin_lock_irqsave(&zone->lock, flags); |
1363 | for (order = 0; order < MAX_ORDER; order++) { | 1638 | for (order = 0; order < MAX_ORDER; order++) { |
@@ -1510,6 +1785,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1510 | } | 1785 | } |
1511 | } | 1786 | } |
1512 | 1787 | ||
1788 | /* Construct the zonelist performance cache - see further mmzone.h */ | ||
1789 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | ||
1790 | { | ||
1791 | int i; | ||
1792 | |||
1793 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1794 | struct zonelist *zonelist; | ||
1795 | struct zonelist_cache *zlc; | ||
1796 | struct zone **z; | ||
1797 | |||
1798 | zonelist = pgdat->node_zonelists + i; | ||
1799 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; | ||
1800 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1801 | for (z = zonelist->zones; *z; z++) | ||
1802 | zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); | ||
1803 | } | ||
1804 | } | ||
1805 | |||
1513 | #else /* CONFIG_NUMA */ | 1806 | #else /* CONFIG_NUMA */ |
1514 | 1807 | ||
1515 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1808 | static void __meminit build_zonelists(pg_data_t *pgdat) |
@@ -1547,21 +1840,33 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1547 | } | 1840 | } |
1548 | } | 1841 | } |
1549 | 1842 | ||
1843 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | ||
1844 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | ||
1845 | { | ||
1846 | int i; | ||
1847 | |||
1848 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
1849 | pgdat->node_zonelists[i].zlcache_ptr = NULL; | ||
1850 | } | ||
1851 | |||
1550 | #endif /* CONFIG_NUMA */ | 1852 | #endif /* CONFIG_NUMA */ |
1551 | 1853 | ||
1552 | /* return values int ....just for stop_machine_run() */ | 1854 | /* return values int ....just for stop_machine_run() */ |
1553 | static int __meminit __build_all_zonelists(void *dummy) | 1855 | static int __meminit __build_all_zonelists(void *dummy) |
1554 | { | 1856 | { |
1555 | int nid; | 1857 | int nid; |
1556 | for_each_online_node(nid) | 1858 | |
1859 | for_each_online_node(nid) { | ||
1557 | build_zonelists(NODE_DATA(nid)); | 1860 | build_zonelists(NODE_DATA(nid)); |
1861 | build_zonelist_cache(NODE_DATA(nid)); | ||
1862 | } | ||
1558 | return 0; | 1863 | return 0; |
1559 | } | 1864 | } |
1560 | 1865 | ||
1561 | void __meminit build_all_zonelists(void) | 1866 | void __meminit build_all_zonelists(void) |
1562 | { | 1867 | { |
1563 | if (system_state == SYSTEM_BOOTING) { | 1868 | if (system_state == SYSTEM_BOOTING) { |
1564 | __build_all_zonelists(0); | 1869 | __build_all_zonelists(NULL); |
1565 | cpuset_init_current_mems_allowed(); | 1870 | cpuset_init_current_mems_allowed(); |
1566 | } else { | 1871 | } else { |
1567 | /* we have to stop all cpus to guaranntee there is no user | 1872 | /* we have to stop all cpus to guaranntee there is no user |
@@ -1642,25 +1947,6 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
1642 | 1947 | ||
1643 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 1948 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
1644 | 1949 | ||
1645 | static void __init calculate_zone_totalpages(struct pglist_data *pgdat, | ||
1646 | unsigned long *zones_size, unsigned long *zholes_size) | ||
1647 | { | ||
1648 | unsigned long realtotalpages, totalpages = 0; | ||
1649 | enum zone_type i; | ||
1650 | |||
1651 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
1652 | totalpages += zones_size[i]; | ||
1653 | pgdat->node_spanned_pages = totalpages; | ||
1654 | |||
1655 | realtotalpages = totalpages; | ||
1656 | if (zholes_size) | ||
1657 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
1658 | realtotalpages -= zholes_size[i]; | ||
1659 | pgdat->node_present_pages = realtotalpages; | ||
1660 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); | ||
1661 | } | ||
1662 | |||
1663 | |||
1664 | /* | 1950 | /* |
1665 | * Initially all pages are reserved - free ones are freed | 1951 | * Initially all pages are reserved - free ones are freed |
1666 | * up by free_all_bootmem() once the early boot process is | 1952 | * up by free_all_bootmem() once the early boot process is |
@@ -1676,6 +1962,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
1676 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 1962 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
1677 | if (!early_pfn_valid(pfn)) | 1963 | if (!early_pfn_valid(pfn)) |
1678 | continue; | 1964 | continue; |
1965 | if (!early_pfn_in_nid(pfn, nid)) | ||
1966 | continue; | ||
1679 | page = pfn_to_page(pfn); | 1967 | page = pfn_to_page(pfn); |
1680 | set_page_links(page, zone, nid, pfn); | 1968 | set_page_links(page, zone, nid, pfn); |
1681 | init_page_count(page); | 1969 | init_page_count(page); |
@@ -1700,20 +1988,6 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | |||
1700 | } | 1988 | } |
1701 | } | 1989 | } |
1702 | 1990 | ||
1703 | #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) | ||
1704 | void zonetable_add(struct zone *zone, int nid, enum zone_type zid, | ||
1705 | unsigned long pfn, unsigned long size) | ||
1706 | { | ||
1707 | unsigned long snum = pfn_to_section_nr(pfn); | ||
1708 | unsigned long end = pfn_to_section_nr(pfn + size); | ||
1709 | |||
1710 | if (FLAGS_HAS_NODE) | ||
1711 | zone_table[ZONETABLE_INDEX(nid, zid)] = zone; | ||
1712 | else | ||
1713 | for (; snum <= end; snum++) | ||
1714 | zone_table[ZONETABLE_INDEX(snum, zid)] = zone; | ||
1715 | } | ||
1716 | |||
1717 | #ifndef __HAVE_ARCH_MEMMAP_INIT | 1991 | #ifndef __HAVE_ARCH_MEMMAP_INIT |
1718 | #define memmap_init(size, nid, zone, start_pfn) \ | 1992 | #define memmap_init(size, nid, zone, start_pfn) \ |
1719 | memmap_init_zone((size), (nid), (zone), (start_pfn)) | 1993 | memmap_init_zone((size), (nid), (zone), (start_pfn)) |
@@ -1818,6 +2092,9 @@ static int __cpuinit process_zones(int cpu) | |||
1818 | 2092 | ||
1819 | for_each_zone(zone) { | 2093 | for_each_zone(zone) { |
1820 | 2094 | ||
2095 | if (!populated_zone(zone)) | ||
2096 | continue; | ||
2097 | |||
1821 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 2098 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
1822 | GFP_KERNEL, cpu_to_node(cpu)); | 2099 | GFP_KERNEL, cpu_to_node(cpu)); |
1823 | if (!zone_pcp(zone, cpu)) | 2100 | if (!zone_pcp(zone, cpu)) |
@@ -1863,16 +2140,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | |||
1863 | int ret = NOTIFY_OK; | 2140 | int ret = NOTIFY_OK; |
1864 | 2141 | ||
1865 | switch (action) { | 2142 | switch (action) { |
1866 | case CPU_UP_PREPARE: | 2143 | case CPU_UP_PREPARE: |
1867 | if (process_zones(cpu)) | 2144 | if (process_zones(cpu)) |
1868 | ret = NOTIFY_BAD; | 2145 | ret = NOTIFY_BAD; |
1869 | break; | 2146 | break; |
1870 | case CPU_UP_CANCELED: | 2147 | case CPU_UP_CANCELED: |
1871 | case CPU_DEAD: | 2148 | case CPU_DEAD: |
1872 | free_zone_pagesets(cpu); | 2149 | free_zone_pagesets(cpu); |
1873 | break; | 2150 | break; |
1874 | default: | 2151 | default: |
1875 | break; | 2152 | break; |
1876 | } | 2153 | } |
1877 | return ret; | 2154 | return ret; |
1878 | } | 2155 | } |
@@ -1977,6 +2254,349 @@ __meminit int init_currently_empty_zone(struct zone *zone, | |||
1977 | return 0; | 2254 | return 0; |
1978 | } | 2255 | } |
1979 | 2256 | ||
2257 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
2258 | /* | ||
2259 | * Basic iterator support. Return the first range of PFNs for a node | ||
2260 | * Note: nid == MAX_NUMNODES returns first region regardless of node | ||
2261 | */ | ||
2262 | static int __init first_active_region_index_in_nid(int nid) | ||
2263 | { | ||
2264 | int i; | ||
2265 | |||
2266 | for (i = 0; i < nr_nodemap_entries; i++) | ||
2267 | if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) | ||
2268 | return i; | ||
2269 | |||
2270 | return -1; | ||
2271 | } | ||
2272 | |||
2273 | /* | ||
2274 | * Basic iterator support. Return the next active range of PFNs for a node | ||
2275 | * Note: nid == MAX_NUMNODES returns next region regardles of node | ||
2276 | */ | ||
2277 | static int __init next_active_region_index_in_nid(int index, int nid) | ||
2278 | { | ||
2279 | for (index = index + 1; index < nr_nodemap_entries; index++) | ||
2280 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) | ||
2281 | return index; | ||
2282 | |||
2283 | return -1; | ||
2284 | } | ||
2285 | |||
2286 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | ||
2287 | /* | ||
2288 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. | ||
2289 | * Architectures may implement their own version but if add_active_range() | ||
2290 | * was used and there are no special requirements, this is a convenient | ||
2291 | * alternative | ||
2292 | */ | ||
2293 | int __init early_pfn_to_nid(unsigned long pfn) | ||
2294 | { | ||
2295 | int i; | ||
2296 | |||
2297 | for (i = 0; i < nr_nodemap_entries; i++) { | ||
2298 | unsigned long start_pfn = early_node_map[i].start_pfn; | ||
2299 | unsigned long end_pfn = early_node_map[i].end_pfn; | ||
2300 | |||
2301 | if (start_pfn <= pfn && pfn < end_pfn) | ||
2302 | return early_node_map[i].nid; | ||
2303 | } | ||
2304 | |||
2305 | return 0; | ||
2306 | } | ||
2307 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ | ||
2308 | |||
2309 | /* Basic iterator support to walk early_node_map[] */ | ||
2310 | #define for_each_active_range_index_in_nid(i, nid) \ | ||
2311 | for (i = first_active_region_index_in_nid(nid); i != -1; \ | ||
2312 | i = next_active_region_index_in_nid(i, nid)) | ||
2313 | |||
2314 | /** | ||
2315 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | ||
2316 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | ||
2317 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | ||
2318 | * | ||
2319 | * If an architecture guarantees that all ranges registered with | ||
2320 | * add_active_ranges() contain no holes and may be freed, this | ||
2321 | * this function may be used instead of calling free_bootmem() manually. | ||
2322 | */ | ||
2323 | void __init free_bootmem_with_active_regions(int nid, | ||
2324 | unsigned long max_low_pfn) | ||
2325 | { | ||
2326 | int i; | ||
2327 | |||
2328 | for_each_active_range_index_in_nid(i, nid) { | ||
2329 | unsigned long size_pages = 0; | ||
2330 | unsigned long end_pfn = early_node_map[i].end_pfn; | ||
2331 | |||
2332 | if (early_node_map[i].start_pfn >= max_low_pfn) | ||
2333 | continue; | ||
2334 | |||
2335 | if (end_pfn > max_low_pfn) | ||
2336 | end_pfn = max_low_pfn; | ||
2337 | |||
2338 | size_pages = end_pfn - early_node_map[i].start_pfn; | ||
2339 | free_bootmem_node(NODE_DATA(early_node_map[i].nid), | ||
2340 | PFN_PHYS(early_node_map[i].start_pfn), | ||
2341 | size_pages << PAGE_SHIFT); | ||
2342 | } | ||
2343 | } | ||
2344 | |||
2345 | /** | ||
2346 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | ||
2347 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. | ||
2348 | * | ||
2349 | * If an architecture guarantees that all ranges registered with | ||
2350 | * add_active_ranges() contain no holes and may be freed, this | ||
2351 | * function may be used instead of calling memory_present() manually. | ||
2352 | */ | ||
2353 | void __init sparse_memory_present_with_active_regions(int nid) | ||
2354 | { | ||
2355 | int i; | ||
2356 | |||
2357 | for_each_active_range_index_in_nid(i, nid) | ||
2358 | memory_present(early_node_map[i].nid, | ||
2359 | early_node_map[i].start_pfn, | ||
2360 | early_node_map[i].end_pfn); | ||
2361 | } | ||
2362 | |||
2363 | /** | ||
2364 | * push_node_boundaries - Push node boundaries to at least the requested boundary | ||
2365 | * @nid: The nid of the node to push the boundary for | ||
2366 | * @start_pfn: The start pfn of the node | ||
2367 | * @end_pfn: The end pfn of the node | ||
2368 | * | ||
2369 | * In reserve-based hot-add, mem_map is allocated that is unused until hotadd | ||
2370 | * time. Specifically, on x86_64, SRAT will report ranges that can potentially | ||
2371 | * be hotplugged even though no physical memory exists. This function allows | ||
2372 | * an arch to push out the node boundaries so mem_map is allocated that can | ||
2373 | * be used later. | ||
2374 | */ | ||
2375 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
2376 | void __init push_node_boundaries(unsigned int nid, | ||
2377 | unsigned long start_pfn, unsigned long end_pfn) | ||
2378 | { | ||
2379 | printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", | ||
2380 | nid, start_pfn, end_pfn); | ||
2381 | |||
2382 | /* Initialise the boundary for this node if necessary */ | ||
2383 | if (node_boundary_end_pfn[nid] == 0) | ||
2384 | node_boundary_start_pfn[nid] = -1UL; | ||
2385 | |||
2386 | /* Update the boundaries */ | ||
2387 | if (node_boundary_start_pfn[nid] > start_pfn) | ||
2388 | node_boundary_start_pfn[nid] = start_pfn; | ||
2389 | if (node_boundary_end_pfn[nid] < end_pfn) | ||
2390 | node_boundary_end_pfn[nid] = end_pfn; | ||
2391 | } | ||
2392 | |||
2393 | /* If necessary, push the node boundary out for reserve hotadd */ | ||
2394 | static void __init account_node_boundary(unsigned int nid, | ||
2395 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
2396 | { | ||
2397 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", | ||
2398 | nid, *start_pfn, *end_pfn); | ||
2399 | |||
2400 | /* Return if boundary information has not been provided */ | ||
2401 | if (node_boundary_end_pfn[nid] == 0) | ||
2402 | return; | ||
2403 | |||
2404 | /* Check the boundaries and update if necessary */ | ||
2405 | if (node_boundary_start_pfn[nid] < *start_pfn) | ||
2406 | *start_pfn = node_boundary_start_pfn[nid]; | ||
2407 | if (node_boundary_end_pfn[nid] > *end_pfn) | ||
2408 | *end_pfn = node_boundary_end_pfn[nid]; | ||
2409 | } | ||
2410 | #else | ||
2411 | void __init push_node_boundaries(unsigned int nid, | ||
2412 | unsigned long start_pfn, unsigned long end_pfn) {} | ||
2413 | |||
2414 | static void __init account_node_boundary(unsigned int nid, | ||
2415 | unsigned long *start_pfn, unsigned long *end_pfn) {} | ||
2416 | #endif | ||
2417 | |||
2418 | |||
2419 | /** | ||
2420 | * get_pfn_range_for_nid - Return the start and end page frames for a node | ||
2421 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. | ||
2422 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. | ||
2423 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. | ||
2424 | * | ||
2425 | * It returns the start and end page frame of a node based on information | ||
2426 | * provided by an arch calling add_active_range(). If called for a node | ||
2427 | * with no available memory, a warning is printed and the start and end | ||
2428 | * PFNs will be 0. | ||
2429 | */ | ||
2430 | void __init get_pfn_range_for_nid(unsigned int nid, | ||
2431 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
2432 | { | ||
2433 | int i; | ||
2434 | *start_pfn = -1UL; | ||
2435 | *end_pfn = 0; | ||
2436 | |||
2437 | for_each_active_range_index_in_nid(i, nid) { | ||
2438 | *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); | ||
2439 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); | ||
2440 | } | ||
2441 | |||
2442 | if (*start_pfn == -1UL) { | ||
2443 | printk(KERN_WARNING "Node %u active with no memory\n", nid); | ||
2444 | *start_pfn = 0; | ||
2445 | } | ||
2446 | |||
2447 | /* Push the node boundaries out if requested */ | ||
2448 | account_node_boundary(nid, start_pfn, end_pfn); | ||
2449 | } | ||
2450 | |||
2451 | /* | ||
2452 | * Return the number of pages a zone spans in a node, including holes | ||
2453 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | ||
2454 | */ | ||
2455 | unsigned long __init zone_spanned_pages_in_node(int nid, | ||
2456 | unsigned long zone_type, | ||
2457 | unsigned long *ignored) | ||
2458 | { | ||
2459 | unsigned long node_start_pfn, node_end_pfn; | ||
2460 | unsigned long zone_start_pfn, zone_end_pfn; | ||
2461 | |||
2462 | /* Get the start and end of the node and zone */ | ||
2463 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | ||
2464 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | ||
2465 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | ||
2466 | |||
2467 | /* Check that this node has pages within the zone's required range */ | ||
2468 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) | ||
2469 | return 0; | ||
2470 | |||
2471 | /* Move the zone boundaries inside the node if necessary */ | ||
2472 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); | ||
2473 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); | ||
2474 | |||
2475 | /* Return the spanned pages */ | ||
2476 | return zone_end_pfn - zone_start_pfn; | ||
2477 | } | ||
2478 | |||
2479 | /* | ||
2480 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | ||
2481 | * then all holes in the requested range will be accounted for. | ||
2482 | */ | ||
2483 | unsigned long __init __absent_pages_in_range(int nid, | ||
2484 | unsigned long range_start_pfn, | ||
2485 | unsigned long range_end_pfn) | ||
2486 | { | ||
2487 | int i = 0; | ||
2488 | unsigned long prev_end_pfn = 0, hole_pages = 0; | ||
2489 | unsigned long start_pfn; | ||
2490 | |||
2491 | /* Find the end_pfn of the first active range of pfns in the node */ | ||
2492 | i = first_active_region_index_in_nid(nid); | ||
2493 | if (i == -1) | ||
2494 | return 0; | ||
2495 | |||
2496 | /* Account for ranges before physical memory on this node */ | ||
2497 | if (early_node_map[i].start_pfn > range_start_pfn) | ||
2498 | hole_pages = early_node_map[i].start_pfn - range_start_pfn; | ||
2499 | |||
2500 | prev_end_pfn = early_node_map[i].start_pfn; | ||
2501 | |||
2502 | /* Find all holes for the zone within the node */ | ||
2503 | for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { | ||
2504 | |||
2505 | /* No need to continue if prev_end_pfn is outside the zone */ | ||
2506 | if (prev_end_pfn >= range_end_pfn) | ||
2507 | break; | ||
2508 | |||
2509 | /* Make sure the end of the zone is not within the hole */ | ||
2510 | start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); | ||
2511 | prev_end_pfn = max(prev_end_pfn, range_start_pfn); | ||
2512 | |||
2513 | /* Update the hole size cound and move on */ | ||
2514 | if (start_pfn > range_start_pfn) { | ||
2515 | BUG_ON(prev_end_pfn > start_pfn); | ||
2516 | hole_pages += start_pfn - prev_end_pfn; | ||
2517 | } | ||
2518 | prev_end_pfn = early_node_map[i].end_pfn; | ||
2519 | } | ||
2520 | |||
2521 | /* Account for ranges past physical memory on this node */ | ||
2522 | if (range_end_pfn > prev_end_pfn) | ||
2523 | hole_pages += range_end_pfn - | ||
2524 | max(range_start_pfn, prev_end_pfn); | ||
2525 | |||
2526 | return hole_pages; | ||
2527 | } | ||
2528 | |||
2529 | /** | ||
2530 | * absent_pages_in_range - Return number of page frames in holes within a range | ||
2531 | * @start_pfn: The start PFN to start searching for holes | ||
2532 | * @end_pfn: The end PFN to stop searching for holes | ||
2533 | * | ||
2534 | * It returns the number of pages frames in memory holes within a range. | ||
2535 | */ | ||
2536 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, | ||
2537 | unsigned long end_pfn) | ||
2538 | { | ||
2539 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); | ||
2540 | } | ||
2541 | |||
2542 | /* Return the number of page frames in holes in a zone on a node */ | ||
2543 | unsigned long __init zone_absent_pages_in_node(int nid, | ||
2544 | unsigned long zone_type, | ||
2545 | unsigned long *ignored) | ||
2546 | { | ||
2547 | unsigned long node_start_pfn, node_end_pfn; | ||
2548 | unsigned long zone_start_pfn, zone_end_pfn; | ||
2549 | |||
2550 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | ||
2551 | zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], | ||
2552 | node_start_pfn); | ||
2553 | zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], | ||
2554 | node_end_pfn); | ||
2555 | |||
2556 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | ||
2557 | } | ||
2558 | |||
2559 | #else | ||
2560 | static inline unsigned long zone_spanned_pages_in_node(int nid, | ||
2561 | unsigned long zone_type, | ||
2562 | unsigned long *zones_size) | ||
2563 | { | ||
2564 | return zones_size[zone_type]; | ||
2565 | } | ||
2566 | |||
2567 | static inline unsigned long zone_absent_pages_in_node(int nid, | ||
2568 | unsigned long zone_type, | ||
2569 | unsigned long *zholes_size) | ||
2570 | { | ||
2571 | if (!zholes_size) | ||
2572 | return 0; | ||
2573 | |||
2574 | return zholes_size[zone_type]; | ||
2575 | } | ||
2576 | |||
2577 | #endif | ||
2578 | |||
2579 | static void __init calculate_node_totalpages(struct pglist_data *pgdat, | ||
2580 | unsigned long *zones_size, unsigned long *zholes_size) | ||
2581 | { | ||
2582 | unsigned long realtotalpages, totalpages = 0; | ||
2583 | enum zone_type i; | ||
2584 | |||
2585 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
2586 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, | ||
2587 | zones_size); | ||
2588 | pgdat->node_spanned_pages = totalpages; | ||
2589 | |||
2590 | realtotalpages = totalpages; | ||
2591 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
2592 | realtotalpages -= | ||
2593 | zone_absent_pages_in_node(pgdat->node_id, i, | ||
2594 | zholes_size); | ||
2595 | pgdat->node_present_pages = realtotalpages; | ||
2596 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, | ||
2597 | realtotalpages); | ||
2598 | } | ||
2599 | |||
1980 | /* | 2600 | /* |
1981 | * Set up the zone data structures: | 2601 | * Set up the zone data structures: |
1982 | * - mark all pages reserved | 2602 | * - mark all pages reserved |
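
A worked userspace example, with made-up PFN ranges, of the accounting done by the zone_spanned_pages_in_node()/zone_absent_pages_in_node() helpers added above: clamp the zone's PFN range to the node's, then subtract whatever no registered active range covers to get present_pages.

#include <stdio.h>

struct range { unsigned long start_pfn, end_pfn; };

int main(void)
{
	/* two active ranges on one node, with a hole in between */
	struct range map[] = { { 0, 4096 }, { 8192, 16384 } };
	unsigned long zone_start = 0, zone_end = 16384;	/* zone's possible span */
	unsigned long spanned = zone_end - zone_start;
	unsigned long present = 0;
	unsigned int i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
		unsigned long s = map[i].start_pfn > zone_start ?
					map[i].start_pfn : zone_start;
		unsigned long e = map[i].end_pfn < zone_end ?
					map[i].end_pfn : zone_end;
		if (e > s)
			present += e - s;
	}

	printf("spanned %lu pages, absent %lu, present %lu\n",
	       spanned, spanned - present, present);
	return 0;
}
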
@@ -1998,11 +2618,34 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
1998 | 2618 | ||
1999 | for (j = 0; j < MAX_NR_ZONES; j++) { | 2619 | for (j = 0; j < MAX_NR_ZONES; j++) { |
2000 | struct zone *zone = pgdat->node_zones + j; | 2620 | struct zone *zone = pgdat->node_zones + j; |
2001 | unsigned long size, realsize; | 2621 | unsigned long size, realsize, memmap_pages; |
2002 | 2622 | ||
2003 | realsize = size = zones_size[j]; | 2623 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
2004 | if (zholes_size) | 2624 | realsize = size - zone_absent_pages_in_node(nid, j, |
2005 | realsize -= zholes_size[j]; | 2625 | zholes_size); |
2626 | |||
2627 | /* | ||
2628 | * Adjust realsize so that it accounts for how much memory | ||
2629 | * is used by this zone for memmap. This affects the watermark | ||
2630 | * and per-cpu initialisations | ||
2631 | */ | ||
2632 | memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; | ||
2633 | if (realsize >= memmap_pages) { | ||
2634 | realsize -= memmap_pages; | ||
2635 | printk(KERN_DEBUG | ||
2636 | " %s zone: %lu pages used for memmap\n", | ||
2637 | zone_names[j], memmap_pages); | ||
2638 | } else | ||
2639 | printk(KERN_WARNING | ||
2640 | " %s zone: %lu pages exceeds realsize %lu\n", | ||
2641 | zone_names[j], memmap_pages, realsize); | ||
2642 | |||
2643 | /* Account for reserved DMA pages */ | ||
2644 | if (j == ZONE_DMA && realsize > dma_reserve) { | ||
2645 | realsize -= dma_reserve; | ||
2646 | printk(KERN_DEBUG " DMA zone: %lu pages reserved\n", | ||
2647 | dma_reserve); | ||
2648 | } | ||
2006 | 2649 | ||
2007 | if (!is_highmem_idx(j)) | 2650 | if (!is_highmem_idx(j)) |
2008 | nr_kernel_pages += realsize; | 2651 | nr_kernel_pages += realsize; |
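
A worked example of the memmap_pages adjustment above. The page size and sizeof(struct page) used here are assumptions for illustration only; both vary by architecture and configuration.

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;			/* 4 KiB pages */
	unsigned long sizeof_struct_page = 56;		/* assumed */
	unsigned long size = 262144;			/* zone spans 1 GiB */
	unsigned long memmap_pages =
		(size * sizeof_struct_page) >> page_shift;

	/* 3584 pages (14 MiB) of the zone go to its own mem_map */
	printf("memmap_pages = %lu\n", memmap_pages);
	return 0;
}
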
@@ -2011,6 +2654,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2011 | zone->spanned_pages = size; | 2654 | zone->spanned_pages = size; |
2012 | zone->present_pages = realsize; | 2655 | zone->present_pages = realsize; |
2013 | #ifdef CONFIG_NUMA | 2656 | #ifdef CONFIG_NUMA |
2657 | zone->node = nid; | ||
2014 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 2658 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
2015 | / 100; | 2659 | / 100; |
2016 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | 2660 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; |
@@ -2022,7 +2666,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2022 | zone->zone_pgdat = pgdat; | 2666 | zone->zone_pgdat = pgdat; |
2023 | zone->free_pages = 0; | 2667 | zone->free_pages = 0; |
2024 | 2668 | ||
2025 | zone->temp_priority = zone->prev_priority = DEF_PRIORITY; | 2669 | zone->prev_priority = DEF_PRIORITY; |
2026 | 2670 | ||
2027 | zone_pcp_init(zone); | 2671 | zone_pcp_init(zone); |
2028 | INIT_LIST_HEAD(&zone->active_list); | 2672 | INIT_LIST_HEAD(&zone->active_list); |
@@ -2036,7 +2680,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2036 | if (!size) | 2680 | if (!size) |
2037 | continue; | 2681 | continue; |
2038 | 2682 | ||
2039 | zonetable_add(zone, nid, j, zone_start_pfn, size); | ||
2040 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); | 2683 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); |
2041 | BUG_ON(ret); | 2684 | BUG_ON(ret); |
2042 | zone_start_pfn += size; | 2685 | zone_start_pfn += size; |
@@ -2073,8 +2716,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) | |||
2073 | /* | 2716 | /* |
2074 | * With no DISCONTIG, the global mem_map is just set as node 0's | 2717 | * With no DISCONTIG, the global mem_map is just set as node 0's |
2075 | */ | 2718 | */ |
2076 | if (pgdat == NODE_DATA(0)) | 2719 | if (pgdat == NODE_DATA(0)) { |
2077 | mem_map = NODE_DATA(0)->node_mem_map; | 2720 | mem_map = NODE_DATA(0)->node_mem_map; |
2721 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
2722 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) | ||
2723 | mem_map -= pgdat->node_start_pfn; | ||
2724 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
2725 | } | ||
2078 | #endif | 2726 | #endif |
2079 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 2727 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
2080 | } | 2728 | } |
@@ -2085,13 +2733,254 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, | |||
2085 | { | 2733 | { |
2086 | pgdat->node_id = nid; | 2734 | pgdat->node_id = nid; |
2087 | pgdat->node_start_pfn = node_start_pfn; | 2735 | pgdat->node_start_pfn = node_start_pfn; |
2088 | calculate_zone_totalpages(pgdat, zones_size, zholes_size); | 2736 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
2089 | 2737 | ||
2090 | alloc_node_mem_map(pgdat); | 2738 | alloc_node_mem_map(pgdat); |
2091 | 2739 | ||
2092 | free_area_init_core(pgdat, zones_size, zholes_size); | 2740 | free_area_init_core(pgdat, zones_size, zholes_size); |
2093 | } | 2741 | } |
2094 | 2742 | ||
2743 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
2744 | /** | ||
2745 | * add_active_range - Register a range of PFNs backed by physical memory | ||
2746 | * @nid: The node ID the range resides on | ||
2747 | * @start_pfn: The start PFN of the available physical memory | ||
2748 | * @end_pfn: The end PFN of the available physical memory | ||
2749 | * | ||
2750 | * These ranges are stored in an early_node_map[] and later used by | ||
2751 | * free_area_init_nodes() to calculate zone sizes and holes. If the | ||
2752 | * range spans a memory hole, it is up to the architecture to ensure | ||
2753 | * the memory is not freed by the bootmem allocator. If possible | ||
2754 | * the range being registered will be merged with existing ranges. | ||
2755 | */ | ||
2756 | void __init add_active_range(unsigned int nid, unsigned long start_pfn, | ||
2757 | unsigned long end_pfn) | ||
2758 | { | ||
2759 | int i; | ||
2760 | |||
2761 | printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " | ||
2762 | "%d entries of %d used\n", | ||
2763 | nid, start_pfn, end_pfn, | ||
2764 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); | ||
2765 | |||
2766 | /* Merge with existing active regions if possible */ | ||
2767 | for (i = 0; i < nr_nodemap_entries; i++) { | ||
2768 | if (early_node_map[i].nid != nid) | ||
2769 | continue; | ||
2770 | |||
2771 | /* Skip if an existing region covers this new one */ | ||
2772 | if (start_pfn >= early_node_map[i].start_pfn && | ||
2773 | end_pfn <= early_node_map[i].end_pfn) | ||
2774 | return; | ||
2775 | |||
2776 | /* Merge forward if suitable */ | ||
2777 | if (start_pfn <= early_node_map[i].end_pfn && | ||
2778 | end_pfn > early_node_map[i].end_pfn) { | ||
2779 | early_node_map[i].end_pfn = end_pfn; | ||
2780 | return; | ||
2781 | } | ||
2782 | |||
2783 | /* Merge backward if suitable */ | ||
2784 | if (start_pfn < early_node_map[i].end_pfn && | ||
2785 | end_pfn >= early_node_map[i].start_pfn) { | ||
2786 | early_node_map[i].start_pfn = start_pfn; | ||
2787 | return; | ||
2788 | } | ||
2789 | } | ||
2790 | |||
2791 | /* Check that early_node_map is large enough */ | ||
2792 | if (i >= MAX_ACTIVE_REGIONS) { | ||
2793 | printk(KERN_CRIT "More than %d memory regions, truncating\n", | ||
2794 | MAX_ACTIVE_REGIONS); | ||
2795 | return; | ||
2796 | } | ||
2797 | |||
2798 | early_node_map[i].nid = nid; | ||
2799 | early_node_map[i].start_pfn = start_pfn; | ||
2800 | early_node_map[i].end_pfn = end_pfn; | ||
2801 | nr_nodemap_entries = i + 1; | ||
2802 | } | ||
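For orientation, a minimal sketch of how an architecture's early boot code might feed its firmware memory map into add_active_range(); fw_range[], nr_fw_ranges and FW_RAM are hypothetical stand-ins for whatever table the arch actually parses (e820, device tree, SRAT), not existing kernel symbols:

    /* Hypothetical early-boot helper: walk a firmware-provided memory
     * table and register every usable RAM range for node nid.
     * Adjacent or overlapping ranges are merged by add_active_range(). */
    static void __init register_ram_with_node_map(int nid)
    {
            int i;

            for (i = 0; i < nr_fw_ranges; i++) {
                    if (fw_range[i].type != FW_RAM)
                            continue;

                    add_active_range(nid,
                                     PFN_UP(fw_range[i].start),
                                     PFN_DOWN(fw_range[i].end));
            }
    }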
2803 | |||
2804 | /** | ||
2805 | * shrink_active_range - Shrink an existing registered range of PFNs | ||
2806 | * @nid: The node ID the range to be shrunk resides on | ||
2807 | * @old_end_pfn: The old end PFN of the range | ||
2808 | * @new_end_pfn: The new end PFN of the range | ||
2809 | * | ||
2810 | * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. | ||
2811 | * The map is kept at the end of the physical page range that has already been | ||
2812 | * registered with add_active_range(). This function allows an arch to shrink | ||
2813 | * an existing registered range. | ||
2814 | */ | ||
2815 | void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, | ||
2816 | unsigned long new_end_pfn) | ||
2817 | { | ||
2818 | int i; | ||
2819 | |||
2820 | /* Find the old active region end and shrink */ | ||
2821 | for_each_active_range_index_in_nid(i, nid) | ||
2822 | if (early_node_map[i].end_pfn == old_end_pfn) { | ||
2823 | early_node_map[i].end_pfn = new_end_pfn; | ||
2824 | break; | ||
2825 | } | ||
2826 | } | ||
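A sketch of the i386-style use described above, assuming the arch has just carved map_pages off the tail of a node's range for a locally remapped node_mem_map; node_end_pfn[] is an illustrative arch array, not a generic kernel interface:

    /* Hypothetical: trim a node's registered range after stealing its
     * tail pages for the remapped node_mem_map. */
    static void __init trim_node_for_memmap(int nid, unsigned long map_pages)
    {
            unsigned long old_end = node_end_pfn[nid];

            shrink_active_range(nid, old_end, old_end - map_pages);
    }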
2827 | |||
2828 | /** | ||
2829 | * remove_all_active_ranges - Remove all currently registered regions | ||
2830 | * | ||
2831 | * During discovery, it may be found that a table like SRAT is invalid | ||
2832 | * and an alternative discovery method must be used. This function removes | ||
2833 | * all currently registered regions. | ||
2834 | */ | ||
2835 | void __init remove_all_active_ranges(void) | ||
2836 | { | ||
2837 | memset(early_node_map, 0, sizeof(early_node_map)); | ||
2838 | nr_nodemap_entries = 0; | ||
2839 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
2840 | memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); | ||
2841 | memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); | ||
2842 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
2843 | } | ||
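The intended fallback pattern, sketched with hypothetical helpers (srat_parse_ok() and the flat single-node registration below are assumptions for illustration, not existing functions):

    /* Hypothetical discovery path: if the ACPI SRAT turns out to be bogus,
     * discard everything registered from it and fall back to one flat
     * node covering all of RAM. */
    static void __init setup_node_ranges(unsigned long max_ram_pfn)
    {
            if (!srat_parse_ok()) {
                    remove_all_active_ranges();
                    add_active_range(0, 0, max_ram_pfn);
            }
    }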
2844 | |||
2845 | /* Compare two active node_active_regions */ | ||
2846 | static int __init cmp_node_active_region(const void *a, const void *b) | ||
2847 | { | ||
2848 | struct node_active_region *arange = (struct node_active_region *)a; | ||
2849 | struct node_active_region *brange = (struct node_active_region *)b; | ||
2850 | |||
2851 | /* Done this way to avoid overflows */ | ||
2852 | if (arange->start_pfn > brange->start_pfn) | ||
2853 | return 1; | ||
2854 | if (arange->start_pfn < brange->start_pfn) | ||
2855 | return -1; | ||
2856 | |||
2857 | return 0; | ||
2858 | } | ||
2859 | |||
2860 | /* sort the node_map by start_pfn */ | ||
2861 | static void __init sort_node_map(void) | ||
2862 | { | ||
2863 | sort(early_node_map, (size_t)nr_nodemap_entries, | ||
2864 | sizeof(struct node_active_region), | ||
2865 | cmp_node_active_region, NULL); | ||
2866 | } | ||
2867 | |||
2868 | /* Find the lowest pfn for a node. This depends on a sorted early_node_map */ | ||
2869 | unsigned long __init find_min_pfn_for_node(unsigned long nid) | ||
2870 | { | ||
2871 | int i; | ||
2872 | |||
2873 | /* Regions in the early_node_map can be in any order */ | ||
2874 | sort_node_map(); | ||
2875 | |||
2876 | /* Assuming a sorted map, the first range found has the starting pfn */ | ||
2877 | for_each_active_range_index_in_nid(i, nid) | ||
2878 | return early_node_map[i].start_pfn; | ||
2879 | |||
2880 | printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid); | ||
2881 | return 0; | ||
2882 | } | ||
2883 | |||
2884 | /** | ||
2885 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | ||
2886 | * | ||
2887 | * It returns the minimum PFN based on information provided via | ||
2888 | * add_active_range(). | ||
2889 | */ | ||
2890 | unsigned long __init find_min_pfn_with_active_regions(void) | ||
2891 | { | ||
2892 | return find_min_pfn_for_node(MAX_NUMNODES); | ||
2893 | } | ||
2894 | |||
2895 | /** | ||
2896 | * find_max_pfn_with_active_regions - Find the maximum PFN registered | ||
2897 | * | ||
2898 | * It returns the maximum PFN based on information provided via | ||
2899 | * add_active_range(). | ||
2900 | */ | ||
2901 | unsigned long __init find_max_pfn_with_active_regions(void) | ||
2902 | { | ||
2903 | int i; | ||
2904 | unsigned long max_pfn = 0; | ||
2905 | |||
2906 | for (i = 0; i < nr_nodemap_entries; i++) | ||
2907 | max_pfn = max(max_pfn, early_node_map[i].end_pfn); | ||
2908 | |||
2909 | return max_pfn; | ||
2910 | } | ||
2911 | |||
2912 | /** | ||
2913 | * free_area_init_nodes - Initialise all pg_data_t and zone data | ||
2914 | * @max_zone_pfn: an array of max PFNs for each zone | ||
2915 | * | ||
2916 | * This will call free_area_init_node() for each active node in the system. | ||
2917 | * Using the page ranges provided by add_active_range(), the size of each | ||
2918 | * zone in each node, together with its holes, is calculated. If the maximum | ||
2919 | * PFNs of two adjacent zones match, the higher zone is assumed to be empty. | ||
2920 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed | ||
2921 | * that ZONE_DMA32 has no pages. It is also assumed that a zone | ||
2922 | * starts where the previous one ended. For example, ZONE_DMA32 starts | ||
2923 | * at arch_max_dma_pfn. | ||
2924 | */ | ||
2925 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) | ||
2926 | { | ||
2927 | unsigned long nid; | ||
2928 | enum zone_type i; | ||
2929 | |||
2930 | /* Record where the zone boundaries are */ | ||
2931 | memset(arch_zone_lowest_possible_pfn, 0, | ||
2932 | sizeof(arch_zone_lowest_possible_pfn)); | ||
2933 | memset(arch_zone_highest_possible_pfn, 0, | ||
2934 | sizeof(arch_zone_highest_possible_pfn)); | ||
2935 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); | ||
2936 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; | ||
2937 | for (i = 1; i < MAX_NR_ZONES; i++) { | ||
2938 | arch_zone_lowest_possible_pfn[i] = | ||
2939 | arch_zone_highest_possible_pfn[i-1]; | ||
2940 | arch_zone_highest_possible_pfn[i] = | ||
2941 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); | ||
2942 | } | ||
2943 | |||
2944 | /* Print out the zone ranges */ | ||
2945 | printk("Zone PFN ranges:\n"); | ||
2946 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
2947 | printk(" %-8s %8lu -> %8lu\n", | ||
2948 | zone_names[i], | ||
2949 | arch_zone_lowest_possible_pfn[i], | ||
2950 | arch_zone_highest_possible_pfn[i]); | ||
2951 | |||
2952 | /* Print out the early_node_map[] */ | ||
2953 | printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); | ||
2954 | for (i = 0; i < nr_nodemap_entries; i++) | ||
2955 | printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, | ||
2956 | early_node_map[i].start_pfn, | ||
2957 | early_node_map[i].end_pfn); | ||
2958 | |||
2959 | /* Initialise every node */ | ||
2960 | for_each_online_node(nid) { | ||
2961 | pg_data_t *pgdat = NODE_DATA(nid); | ||
2962 | free_area_init_node(nid, pgdat, NULL, | ||
2963 | find_min_pfn_for_node(nid), NULL); | ||
2964 | } | ||
2965 | } | ||
2966 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
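A minimal sketch of the calling convention the comment above describes: the architecture fills in only the maximum PFN of each zone and lets free_area_init_nodes() derive per-node zone sizes and holes from the registered active ranges. MAX_DMA_PFN, MAX_DMA32_PFN and end_pfn stand in for arch-specific limits and are assumptions here:

    static void __init arch_paging_init(void)
    {
            unsigned long max_zone_pfns[MAX_NR_ZONES];

            memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
            max_zone_pfns[ZONE_DMA]    = MAX_DMA_PFN;       /* arch-specific limit */
    #ifdef CONFIG_ZONE_DMA32
            max_zone_pfns[ZONE_DMA32]  = MAX_DMA32_PFN;     /* arch-specific limit */
    #endif
            max_zone_pfns[ZONE_NORMAL] = end_pfn;           /* highest RAM pfn, arch-provided */

            free_area_init_nodes(max_zone_pfns);
    }

Because a zone is assumed to start where the previous one ends, an empty zone is expressed simply by giving it the same maximum PFN as its predecessor.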
2967 | |||
2968 | /** | ||
2969 | * set_dma_reserve - set the specified number of pages reserved in the first zone | ||
2970 | * @new_dma_reserve: The number of pages to mark reserved | ||
2971 | * | ||
2972 | * The per-cpu batchsize and zone watermarks are determined by present_pages. | ||
2973 | * In the DMA zone, a significant percentage may be consumed by kernel image | ||
2974 | * and other unfreeable allocations which can skew the watermarks badly. This | ||
2975 | * function may optionally be used to account for unfreeable pages in the | ||
2976 | * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and | ||
2977 | * smaller per-cpu batchsize. | ||
2978 | */ | ||
2979 | void __init set_dma_reserve(unsigned long new_dma_reserve) | ||
2980 | { | ||
2981 | dma_reserve = new_dma_reserve; | ||
2982 | } | ||
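A sketch of how an architecture might use this; dma_unfreeable_pages() is a hypothetical helper that would count the kernel image and early reservations sitting in ZONE_DMA:

    static void __init account_dma_reservations(void)
    {
            /* Pages in ZONE_DMA the kernel will never free: image text/data,
             * early bootmem reservations, etc. (hypothetical helper). */
            set_dma_reserve(dma_unfreeable_pages());
    }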
2983 | |||
2095 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 2984 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
2096 | static bootmem_data_t contig_bootmem_data; | 2985 | static bootmem_data_t contig_bootmem_data; |
2097 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; | 2986 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; |
@@ -2105,7 +2994,6 @@ void __init free_area_init(unsigned long *zones_size) | |||
2105 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 2994 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
2106 | } | 2995 | } |
2107 | 2996 | ||
2108 | #ifdef CONFIG_HOTPLUG_CPU | ||
2109 | static int page_alloc_cpu_notify(struct notifier_block *self, | 2997 | static int page_alloc_cpu_notify(struct notifier_block *self, |
2110 | unsigned long action, void *hcpu) | 2998 | unsigned long action, void *hcpu) |
2111 | { | 2999 | { |
@@ -2120,7 +3008,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self, | |||
2120 | } | 3008 | } |
2121 | return NOTIFY_OK; | 3009 | return NOTIFY_OK; |
2122 | } | 3010 | } |
2123 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2124 | 3011 | ||
2125 | void __init page_alloc_init(void) | 3012 | void __init page_alloc_init(void) |
2126 | { | 3013 | { |
@@ -2198,10 +3085,11 @@ static void setup_per_zone_lowmem_reserve(void) | |||
2198 | calculate_totalreserve_pages(); | 3085 | calculate_totalreserve_pages(); |
2199 | } | 3086 | } |
2200 | 3087 | ||
2201 | /* | 3088 | /** |
2202 | * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures | 3089 | * setup_per_zone_pages_min - called when min_free_kbytes changes. |
2203 | * that the pages_{min,low,high} values for each zone are set correctly | 3090 | * |
2204 | * with respect to min_free_kbytes. | 3091 | * Ensures that the pages_{min,low,high} values for each zone are set correctly |
3092 | * with respect to min_free_kbytes. | ||
2205 | */ | 3093 | */ |
2206 | void setup_per_zone_pages_min(void) | 3094 | void setup_per_zone_pages_min(void) |
2207 | { | 3095 | { |
@@ -2423,7 +3311,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
2423 | /* allow the kernel cmdline to have a say */ | 3311 | /* allow the kernel cmdline to have a say */ |
2424 | if (!numentries) { | 3312 | if (!numentries) { |
2425 | /* round applicable memory size up to nearest megabyte */ | 3313 | /* round applicable memory size up to nearest megabyte */ |
2426 | numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; | 3314 | numentries = nr_kernel_pages; |
2427 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | 3315 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; |
2428 | numentries >>= 20 - PAGE_SHIFT; | 3316 | numentries >>= 20 - PAGE_SHIFT; |
2429 | numentries <<= 20 - PAGE_SHIFT; | 3317 | numentries <<= 20 - PAGE_SHIFT; |
@@ -2445,7 +3333,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
2445 | if (numentries > max) | 3333 | if (numentries > max) |
2446 | numentries = max; | 3334 | numentries = max; |
2447 | 3335 | ||
2448 | log2qty = long_log2(numentries); | 3336 | log2qty = ilog2(numentries); |
2449 | 3337 | ||
2450 | do { | 3338 | do { |
2451 | size = bucketsize << log2qty; | 3339 | size = bucketsize << log2qty; |
@@ -2467,7 +3355,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
2467 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", | 3355 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", |
2468 | tablename, | 3356 | tablename, |
2469 | (1U << log2qty), | 3357 | (1U << log2qty), |
2470 | long_log2(size) - PAGE_SHIFT, | 3358 | ilog2(size) - PAGE_SHIFT, |
2471 | size); | 3359 | size); |
2472 | 3360 | ||
2473 | if (_hash_shift) | 3361 | if (_hash_shift) |
@@ -2490,3 +3378,19 @@ unsigned long page_to_pfn(struct page *page) | |||
2490 | EXPORT_SYMBOL(pfn_to_page); | 3378 | EXPORT_SYMBOL(pfn_to_page); |
2491 | EXPORT_SYMBOL(page_to_pfn); | 3379 | EXPORT_SYMBOL(page_to_pfn); |
2492 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 3380 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
3381 | |||
3382 | #if MAX_NUMNODES > 1 | ||
3383 | /* | ||
3384 | * Find the highest possible node id. | ||
3385 | */ | ||
3386 | int highest_possible_node_id(void) | ||
3387 | { | ||
3388 | unsigned int node; | ||
3389 | unsigned int highest = 0; | ||
3390 | |||
3391 | for_each_node_mask(node, node_possible_map) | ||
3392 | highest = node; | ||
3393 | return highest; | ||
3394 | } | ||
3395 | EXPORT_SYMBOL(highest_possible_node_id); | ||
3396 | #endif | ||