Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	281
 1 file changed, 224 insertions, 57 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index aa6fcc7ca66f..cace22b3ac25 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -83,14 +83,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
 
 EXPORT_SYMBOL(totalram_pages);
 
-/*
- * Used by page_zone() to look up the address of the struct zone whose
- * id is encoded in the upper bits of page->flags
- */
-struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
-EXPORT_SYMBOL(zone_table);
-
-static char *zone_names[MAX_NR_ZONES] = {
+static char * const zone_names[MAX_NR_ZONES] = {
 	"DMA",
 #ifdef CONFIG_ZONE_DMA32
 	"DMA32",
@@ -237,7 +230,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	page[1].lru.next = (void *)free_compound_page;	/* set dtor */
+	set_compound_page_dtor(page, free_compound_page);
 	page[1].lru.prev = (void *)order;
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
@@ -486,7 +479,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order)
 	spin_lock(&zone->lock);
 	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
-	__free_one_page(page, zone ,order);
+	__free_one_page(page, zone, order);
 	spin_unlock(&zone->lock);
 }
 
@@ -605,6 +598,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 			1 << PG_checked | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
 	set_page_refcounted(page);
+
+	arch_alloc_page(page, order);
 	kernel_map_pages(page, 1 << order, 1);
 
 	if (gfp_flags & __GFP_ZERO)
@@ -690,9 +685,15 @@ void drain_node_pages(int nodeid)
 
 		pcp = &pset->pcp[i];
 		if (pcp->count) {
+			int to_drain;
+
 			local_irq_save(flags);
-			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-			pcp->count = 0;
+			if (pcp->count >= pcp->batch)
+				to_drain = pcp->batch;
+			else
+				to_drain = pcp->count;
+			free_pages_bulk(zone, to_drain, &pcp->list, 0);
+			pcp->count -= to_drain;
 			local_irq_restore(flags);
 		}
 	}
@@ -700,7 +701,6 @@ void drain_node_pages(int nodeid)
 }
 #endif
 
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
 	unsigned long flags;
@@ -722,7 +722,6 @@ static void __drain_pages(unsigned int cpu)
 		}
 	}
 }
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_PM
 
@@ -925,31 +924,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in last second) found to be nearly full.  See further
+ * comments in mmzone.h.  Reduces cache footprint of zonelist scans
+ * that have to skip over alot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * tasks mems_allowed, or node_online_map.)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	nodemask_t *allowednodes;	/* zonelist_cache approximation */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return NULL;
+
+	if (jiffies - zlc->last_full_zap > 1 * HZ) {
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		zlc->last_full_zap = jiffies;
+	}
+
+	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+					&cpuset_current_mems_allowed :
+					&node_online_map;
+	return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ *  1) Check that the zone isn't thought to be full (doesn't have its
+ *     bit set in the zonelist_cache fullzones BITMAP).
+ *  2) Check that the zones node (obtained from the zonelist_cache
+ *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+						nodemask_t *allowednodes)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+	int n;				/* node that zone *z is on */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return 1;
+
+	i = z - zonelist->zones;
+	n = zlc->z_to_n[i];
+
+	/* This zone is worth trying if it is allowed but not full */
+	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
 /*
- * get_page_from_freeliest goes through the zonelist trying to allocate
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return;
+
+	i = z - zonelist->zones;
+
+	set_bit(i, zlc->fullzones);
+}
+
+#else	/* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+				nodemask_t *allowednodes)
+{
+	return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif	/* CONFIG_NUMA */
+
+/*
+ * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
  */
 static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist, int alloc_flags)
 {
-	struct zone **z = zonelist->zones;
+	struct zone **z;
 	struct page *page = NULL;
-	int classzone_idx = zone_idx(*z);
+	int classzone_idx = zone_idx(zonelist->zones[0]);
 	struct zone *zone;
+	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+	int zlc_active = 0;		/* set if using zonelist_cache */
+	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 
+zonelist_scan:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
+	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
+	z = zonelist->zones;
+
 	do {
+		if (NUMA_BUILD && zlc_active &&
+			!zlc_zone_worth_trying(zonelist, z, allowednodes))
+				continue;
 		zone = *z;
 		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed(zone, gfp_mask))
-				continue;
+				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
 			unsigned long mark;
@@ -959,18 +1087,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				mark = zone->pages_low;
 			else
 				mark = zone->pages_high;
-			if (!zone_watermark_ok(zone , order, mark,
-				    classzone_idx, alloc_flags))
+			if (!zone_watermark_ok(zone, order, mark,
+				    classzone_idx, alloc_flags)) {
 				if (!zone_reclaim_mode ||
 				    !zone_reclaim(zone, gfp_mask, order))
-					continue;
+					goto this_zone_full;
+			}
 		}
 
 		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
-		if (page) {
+		if (page)
 			break;
+this_zone_full:
+		if (NUMA_BUILD)
+			zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+		if (NUMA_BUILD && !did_zlc_setup) {
+			/* we do zlc_setup after the first zone is tried */
+			allowednodes = zlc_setup(zonelist, alloc_flags);
+			zlc_active = 1;
+			did_zlc_setup = 1;
 		}
 	} while (*(++z) != NULL);
+
+	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		goto zonelist_scan;
+	}
 	return page;
 }
 
@@ -1005,9 +1149,19 @@ restart:
 	if (page)
 		goto got_pg;
 
-	do {
+	/*
+	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
+	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
+	 * using a larger set of nodes after it has established that the
+	 * allowed per node queues are empty and that nodes are
+	 * over allocated.
+	 */
+	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+		goto nopage;
+
+	for (z = zonelist->zones; *z; z++)
 		wakeup_kswapd(*z, order);
-	} while (*(++z));
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -1041,6 +1195,7 @@ restart:
 
 	/* This allocation should allow future memory freeing. */
 
+rebalance:
 	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
@@ -1062,7 +1217,6 @@ nofail_alloc:
 	if (!wait)
 		goto nopage;
 
-rebalance:
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
@@ -1262,7 +1416,7 @@ unsigned int nr_free_pagecache_pages(void)
 static inline void show_node(struct zone *zone)
 {
 	if (NUMA_BUILD)
-		printk("Node %ld ", zone_to_nid(zone));
+		printk("Node %d ", zone_to_nid(zone));
 }
 
 void si_meminfo(struct sysinfo *val)
@@ -1542,6 +1696,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zonelist *zonelist;
+		struct zonelist_cache *zlc;
+		struct zone **z;
+
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		for (z = zonelist->zones; *z; z++)
+			zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+	}
+}
+
 #else	/* CONFIG_NUMA */
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1579,14 +1751,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
 #endif	/* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
 static int __meminit __build_all_zonelists(void *dummy)
 {
 	int nid;
-	for_each_online_node(nid)
+
+	for_each_online_node(nid) {
 		build_zonelists(NODE_DATA(nid));
+		build_zonelist_cache(NODE_DATA(nid));
+	}
 	return 0;
 }
 
@@ -1715,20 +1899,6 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	}
 }
 
-#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
-		unsigned long pfn, unsigned long size)
-{
-	unsigned long snum = pfn_to_section_nr(pfn);
-	unsigned long end = pfn_to_section_nr(pfn + size);
-
-	if (FLAGS_HAS_NODE)
-		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
-	else
-		for (; snum <= end; snum++)
-			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -1881,16 +2051,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
 	int ret = NOTIFY_OK;
 
 	switch (action) {
-		case CPU_UP_PREPARE:
-			if (process_zones(cpu))
-				ret = NOTIFY_BAD;
-			break;
-		case CPU_UP_CANCELED:
-		case CPU_DEAD:
-			free_zone_pagesets(cpu);
-			break;
-		default:
-			break;
+	case CPU_UP_PREPARE:
+		if (process_zones(cpu))
+			ret = NOTIFY_BAD;
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		free_zone_pagesets(cpu);
+		break;
+	default:
+		break;
 	}
 	return ret;
 }
@@ -2421,7 +2591,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		if (!size)
 			continue;
 
-		zonetable_add(zone, nid, j, zone_start_pfn, size);
 		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
 		BUG_ON(ret);
 		zone_start_pfn += size;
@@ -2736,7 +2905,6 @@ void __init free_area_init(unsigned long *zones_size)
 			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int page_alloc_cpu_notify(struct notifier_block *self,
 		unsigned long action, void *hcpu)
 {
@@ -2751,7 +2919,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
 	}
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init page_alloc_init(void)
 {
@@ -3055,7 +3222,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	/* allow the kernel cmdline to have a say */
 	if (!numentries) {
 		/* round applicable memory size up to nearest megabyte */
-		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
+		numentries = nr_kernel_pages;
 		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
 		numentries >>= 20 - PAGE_SHIFT;
 		numentries <<= 20 - PAGE_SHIFT;