Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--   mm/page_alloc.c   184
1 file changed, 116 insertions, 68 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 253a450c400d..423db0db7c02 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,7 @@
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
 #include <linux/mempolicy.h>
+#include <linux/stop_machine.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -83,8 +84,8 @@ EXPORT_SYMBOL(zone_table);
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
-unsigned long __initdata nr_kernel_pages;
-unsigned long __initdata nr_all_pages;
+unsigned long __meminitdata nr_kernel_pages;
+unsigned long __meminitdata nr_all_pages;
 
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -286,22 +287,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * we can do coalesce a page and its buddy if
  * (a) the buddy is not in a hole &&
  * (b) the buddy is in the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (c) a page and its buddy have the same order &&
+ * (d) a page and its buddy are in the same zone.
  *
  * For recording whether a page is in the buddy system, we use PG_buddy.
  * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
  *
  * For recording page's order, we use page_private(page).
  */
-static inline int page_is_buddy(struct page *page, int order)
+static inline int page_is_buddy(struct page *page, struct page *buddy,
+                                                                int order)
 {
 #ifdef CONFIG_HOLES_IN_ZONE
-        if (!pfn_valid(page_to_pfn(page)))
+        if (!pfn_valid(page_to_pfn(buddy)))
                 return 0;
 #endif
 
-        if (PageBuddy(page) && page_order(page) == order) {
-                BUG_ON(page_count(page) != 0);
+        if (page_zone_id(page) != page_zone_id(buddy))
+                return 0;
+
+        if (PageBuddy(buddy) && page_order(buddy) == order) {
+                BUG_ON(page_count(buddy) != 0);
                 return 1;
         }
         return 0;
@@ -352,7 +358,7 @@ static inline void __free_one_page(struct page *page,
                 struct page *buddy;
 
                 buddy = __page_find_buddy(page, page_idx, order);
-                if (!page_is_buddy(buddy, order))
+                if (!page_is_buddy(page, buddy, order))
                         break;          /* Move the buddy up one level. */
 
                 list_del(&buddy->lru);
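
For readers new to the buddy allocator, the coalescing loop above relies on simple bit arithmetic: __page_find_buddy() flips one bit of the page index to locate the buddy, and __find_combined_index() clears that bit to find the start of the merged block. A standalone userspace sketch of that arithmetic (illustrative only, not kernel code; the function names below are made up):

#include <assert.h>
#include <stdio.h>

/* Within a MAX_ORDER-aligned block, the buddy of the page at page_idx
 * differs from it in exactly one bit, and the merged block starts at
 * the lower of the two indexes. */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);       /* flip bit 'order' */
}

static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
        return page_idx & ~(1UL << order);      /* clear bit 'order' */
}

int main(void)
{
        /* page 12 at order 2: its buddy is page 8, and the merged
         * order-3 block also starts at page 8 */
        assert(buddy_index(12, 2) == 8);
        assert(combined_index(12, 2) == 8);
        printf("buddy of 12 at order 2 is %lu\n", buddy_index(12, 2));
        return 0;
}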
@@ -1485,7 +1491,7 @@ void show_free_areas(void)
         }
 
         for_each_zone(zone) {
-                unsigned long nr, flags, order, total = 0;
+                unsigned long nr[MAX_ORDER], flags, order, total = 0;
 
                 show_node(zone);
                 printk("%s: ", zone->name);
@@ -1496,11 +1502,12 @@ void show_free_areas(void)
 
                 spin_lock_irqsave(&zone->lock, flags);
                 for (order = 0; order < MAX_ORDER; order++) {
-                        nr = zone->free_area[order].nr_free;
-                        total += nr << order;
-                        printk("%lu*%lukB ", nr, K(1UL) << order);
+                        nr[order] = zone->free_area[order].nr_free;
+                        total += nr[order] << order;
                 }
                 spin_unlock_irqrestore(&zone->lock, flags);
+                for (order = 0; order < MAX_ORDER; order++)
+                        printk("%lu*%lukB ", nr[order], K(1UL) << order);
                 printk("= %lukB\n", K(total));
         }
 
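
The point of the nr[MAX_ORDER] array is that the per-order free counts are now snapshotted while zone->lock is held, and the printk() calls run only after the lock is dropped, shortening the time spent with the lock held and interrupts disabled. The printed total is simply the per-order counts weighted by block size. A small userspace sketch of that accounting (MAX_ORDER, PAGE_SHIFT and the sample counts are assumptions for illustration):

#include <stdio.h>

#define PAGE_SHIFT 12                   /* 4K pages; an assumption */
#define MAX_ORDER 11                    /* typical value; an assumption */
#define K(x) ((x) << (PAGE_SHIFT - 10)) /* pages -> kilobytes */

int main(void)
{
        /* hypothetical per-order free counts, as show_free_areas()
         * would snapshot them under zone->lock */
        unsigned long nr[MAX_ORDER] = { 10, 4, 2, 1, 0, 0, 0, 0, 0, 0, 1 };
        unsigned long order, total = 0;

        for (order = 0; order < MAX_ORDER; order++)
                total += nr[order] << order;    /* an order-n block is 2^n pages */

        for (order = 0; order < MAX_ORDER; order++)
                printf("%lu*%lukB ", nr[order], K(1UL) << order);
        printf("= %lukB\n", K(total));
        return 0;
}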
@@ -1512,7 +1519,7 @@ void show_free_areas(void)
  *
  * Add all populated zones of a node to the zonelist.
  */
-static int __init build_zonelists_node(pg_data_t *pgdat,
+static int __meminit build_zonelists_node(pg_data_t *pgdat,
                         struct zonelist *zonelist, int nr_zones, int zone_type)
 {
         struct zone *zone;
@@ -1548,7 +1555,7 @@ static inline int highest_zone(int zone_bits)
 
 #ifdef CONFIG_NUMA
 #define MAX_NODE_LOAD (num_online_nodes())
-static int __initdata node_load[MAX_NUMNODES];
+static int __meminitdata node_load[MAX_NUMNODES];
 /**
  * find_next_best_node - find the next node that should appear in a given node's fallback list
  * @node: node whose fallback list we're appending
@@ -1563,7 +1570,7 @@ static int __initdata node_load[MAX_NUMNODES];
  * on them otherwise.
  * It returns -1 if no node is found.
  */
-static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
+static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
 {
         int n, val;
         int min_val = INT_MAX;
@@ -1609,7 +1616,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
         return best_node;
 }
 
-static void __init build_zonelists(pg_data_t *pgdat)
+static void __meminit build_zonelists(pg_data_t *pgdat)
 {
         int i, j, k, node, local_node;
         int prev_node, load;
@@ -1661,7 +1668,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
 
 #else   /* CONFIG_NUMA */
 
-static void __init build_zonelists(pg_data_t *pgdat)
+static void __meminit build_zonelists(pg_data_t *pgdat)
 {
         int i, j, k, node, local_node;
 
@@ -1699,14 +1706,29 @@ static void __init build_zonelists(pg_data_t *pgdat)
 
 #endif  /* CONFIG_NUMA */
 
-void __init build_all_zonelists(void)
+/* return values int ....just for stop_machine_run() */
+static int __meminit __build_all_zonelists(void *dummy)
+{
+        int nid;
+        for_each_online_node(nid)
+                build_zonelists(NODE_DATA(nid));
+        return 0;
+}
+
+void __meminit build_all_zonelists(void)
 {
-        int i;
-
-        for_each_online_node(i)
-                build_zonelists(NODE_DATA(i));
-        printk("Built %i zonelists\n", num_online_nodes());
-        cpuset_init_current_mems_allowed();
+        if (system_state == SYSTEM_BOOTING) {
+                __build_all_zonelists(0);
+                cpuset_init_current_mems_allowed();
+        } else {
+                /* we have to stop all cpus to guarantee there is no user
+                   of zonelist */
+                stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
+                /* cpuset refresh routine should be here */
+        }
+        vm_total_pages = nr_free_pagecache_pages();
+        printk("Built %i zonelists.  Total pages: %ld\n",
+                        num_online_nodes(), vm_total_pages);
 }
 
 /*
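
The split into __build_all_zonelists() plus a wrapper exists because the zonelists can now be rebuilt after boot, when memory is hot-added. At boot nothing else runs, so the rebuild happens directly; afterwards stop_machine_run() quiesces every CPU first, since zonelist readers take no lock. A toy userspace model of that dispatch (the names mirror the kernel's, but the program itself is only an illustration of the control flow):

#include <stdio.h>

enum system_state { SYSTEM_BOOTING, SYSTEM_RUNNING };

static int rebuild_zonelists(void *dummy)
{
        (void)dummy;
        printf("rebuilding per-node zonelists\n");
        return 0;
}

static void build_all(enum system_state state)
{
        if (state == SYSTEM_BOOTING) {
                rebuild_zonelists(NULL);                /* safe: single-threaded */
        } else {
                printf("quiescing all CPUs first\n");   /* stands in for stop_machine_run() */
                rebuild_zonelists(NULL);
        }
}

int main(void)
{
        build_all(SYSTEM_BOOTING);      /* early boot path */
        build_all(SYSTEM_RUNNING);      /* memory hot-add path */
        return 0;
}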
@@ -1722,7 +1744,8 @@ void __init build_all_zonelists(void)
  */
 #define PAGES_PER_WAITQUEUE     256
 
-static inline unsigned long wait_table_size(unsigned long pages)
+#ifndef CONFIG_MEMORY_HOTPLUG
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
 {
         unsigned long size = 1;
 
@@ -1740,6 +1763,29 @@ static inline unsigned long wait_table_size(unsigned long pages)
 
         return max(size, 4UL);
 }
+#else
+/*
+ * A zone's size might be changed by hot-add, so it is not possible to determine
+ * a suitable size for its wait_table.  So we use the maximum size now.
+ *
+ * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
+ *
+ *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
+ *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
+ *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
+ *
+ * The maximum entries are prepared when a zone's memory is (512K + 256) pages
+ * or more by the traditional way. (See above).  It equals:
+ *
+ *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
+ *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
+ *    powerpc (64K page size)             : =  (32G +16M)byte.
+ */
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+        return 4096UL;
+}
+#endif
 
 /*
  * This is an integer logarithm so that shifts can be used later
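
For comparison, the !CONFIG_MEMORY_HOTPLUG path keeps the traditional rule: roughly one waitqueue per PAGES_PER_WAITQUEUE (256) pages, rounded up to a power of two and clamped to [4, 4096]. That is why the comment above quotes "(512K + 256) pages" as the point where the 4096-entry maximum is first reached. A standalone sketch that reproduces the calculation (wait_table_entries() is a renamed userspace copy, not the kernel function):

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* one entry per 256 pages, rounded up to a power of two, clamped to [4, 4096] */
static unsigned long wait_table_entries(unsigned long pages)
{
        unsigned long size = 1;

        pages /= PAGES_PER_WAITQUEUE;
        while (size < pages)
                size <<= 1;
        if (size > 4096UL)
                size = 4096UL;
        return size < 4UL ? 4UL : size;
}

int main(void)
{
        /* the cap is first reached at 512K + 256 pages:
         * 524544 / 256 = 2049, which rounds up to 4096 */
        printf("%lu -> %lu\n", 524288UL + 255, wait_table_entries(524288UL + 255)); /* 2048 */
        printf("%lu -> %lu\n", 524288UL + 256, wait_table_entries(524288UL + 256)); /* 4096 */
        return 0;
}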
@@ -2005,23 +2051,46 @@ void __init setup_per_cpu_pageset(void)
 #endif
 
 static __meminit
-void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
         int i;
         struct pglist_data *pgdat = zone->zone_pgdat;
+        size_t alloc_size;
 
         /*
          * The per-page waitqueue mechanism uses hashed waitqueues
          * per zone.
          */
-        zone->wait_table_size = wait_table_size(zone_size_pages);
-        zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
-        zone->wait_table = (wait_queue_head_t *)
-                alloc_bootmem_node(pgdat, zone->wait_table_size
-                                        * sizeof(wait_queue_head_t));
+        zone->wait_table_hash_nr_entries =
+                 wait_table_hash_nr_entries(zone_size_pages);
+        zone->wait_table_bits =
+                 wait_table_bits(zone->wait_table_hash_nr_entries);
+        alloc_size = zone->wait_table_hash_nr_entries
+                                        * sizeof(wait_queue_head_t);
+
+        if (system_state == SYSTEM_BOOTING) {
+                zone->wait_table = (wait_queue_head_t *)
+                        alloc_bootmem_node(pgdat, alloc_size);
+        } else {
+                /*
+                 * This case means that a zone whose size was 0 gets new memory
+                 * via memory hot-add.
+                 * But it may be the case that a new node was hot-added.  In
+                 * this case vmalloc() will not be able to use this new node's
+                 * memory - this wait_table must be initialized to use this new
+                 * node itself as well.
+                 * To use this new node's memory, further consideration will be
+                 * necessary.
+                 */
+                zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
+        }
+        if (!zone->wait_table)
+                return -ENOMEM;
 
-        for(i = 0; i < zone->wait_table_size; ++i)
+        for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
                 init_waitqueue_head(zone->wait_table + i);
+
+        return 0;
 }
 
 static __meminit void zone_pcp_init(struct zone *zone)
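
The table itself is an array of wait_queue_head_t whose length is a power of two; wait_table_bits() stores its base-2 logarithm so a page can later be hashed into the table (at the time, page_waitqueue() in mm/filemap.c did this with hash_ptr(page, zone->wait_table_bits)). A simplified userspace model of that indexing (the hash value below is arbitrary):

#include <stdio.h>

/* integer log2 of a power-of-two entry count */
static unsigned int table_bits(unsigned long entries)
{
        unsigned int bits = 0;

        while ((1UL << bits) < entries)
                bits++;
        return bits;
}

int main(void)
{
        unsigned long entries = 4096;           /* hot-add maximum from the patch */
        unsigned int bits = table_bits(entries);
        unsigned long fake_hash = 0x9e3779b9UL; /* arbitrary stand-in for a page hash */

        /* fold the hash into the table size to pick a waitqueue slot */
        printf("bits=%u index=%lu\n", bits, fake_hash & ((1UL << bits) - 1));
        return 0;
}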
@@ -2043,12 +2112,15 @@ static __meminit void zone_pcp_init(struct zone *zone)
                 zone->name, zone->present_pages, batch);
 }
 
-static __meminit void init_currently_empty_zone(struct zone *zone,
-                unsigned long zone_start_pfn, unsigned long size)
+__meminit int init_currently_empty_zone(struct zone *zone,
+                                        unsigned long zone_start_pfn,
+                                        unsigned long size)
 {
         struct pglist_data *pgdat = zone->zone_pgdat;
-
-        zone_wait_table_init(zone, size);
+        int ret;
+        ret = zone_wait_table_init(zone, size);
+        if (ret)
+                return ret;
         pgdat->nr_zones = zone_idx(zone) + 1;
 
         zone->zone_start_pfn = zone_start_pfn;
@@ -2056,6 +2128,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
         memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
 
         zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+        return 0;
 }
 
 /*
@@ -2064,12 +2138,13 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
  *   - mark all memory queues empty
  *   - clear the memory bitmaps
  */
-static void __init free_area_init_core(struct pglist_data *pgdat,
+static void __meminit free_area_init_core(struct pglist_data *pgdat,
                 unsigned long *zones_size, unsigned long *zholes_size)
 {
         unsigned long j;
         int nid = pgdat->node_id;
         unsigned long zone_start_pfn = pgdat->node_start_pfn;
+        int ret;
 
         pgdat_resize_init(pgdat);
         pgdat->nr_zones = 0;
@@ -2111,7 +2186,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                         continue;
 
                 zonetable_add(zone, nid, j, zone_start_pfn, size);
-                init_currently_empty_zone(zone, zone_start_pfn, size);
+                ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+                BUG_ON(ret);
                 zone_start_pfn += size;
         }
 }
@@ -2152,7 +2228,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
 
-void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
                 unsigned long *zones_size, unsigned long node_start_pfn,
                 unsigned long *zholes_size)
 {
@@ -2804,42 +2880,14 @@ void *__init alloc_large_system_hash(const char *tablename,
 }
 
 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
-/*
- * pfn <-> page translation. out-of-line version.
- * (see asm-generic/memory_model.h)
- */
-#if defined(CONFIG_FLATMEM)
 struct page *pfn_to_page(unsigned long pfn)
 {
-        return mem_map + (pfn - ARCH_PFN_OFFSET);
+        return __pfn_to_page(pfn);
 }
 unsigned long page_to_pfn(struct page *page)
 {
-        return (page - mem_map) + ARCH_PFN_OFFSET;
-}
-#elif defined(CONFIG_DISCONTIGMEM)
-struct page *pfn_to_page(unsigned long pfn)
-{
-        int nid = arch_pfn_to_nid(pfn);
-        return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
-}
-unsigned long page_to_pfn(struct page *page)
-{
-        struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
-        return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
-}
-#elif defined(CONFIG_SPARSEMEM)
-struct page *pfn_to_page(unsigned long pfn)
-{
-        return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
-}
-
-unsigned long page_to_pfn(struct page *page)
-{
-        long section_id = page_to_section(page);
-        return page - __section_mem_map_addr(__nr_to_section(section_id));
+        return __page_to_pfn(page);
 }
-#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */
 EXPORT_SYMBOL(pfn_to_page);
 EXPORT_SYMBOL(page_to_pfn);
 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
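
The bodies removed here are not lost: __pfn_to_page() and __page_to_pfn() come from include/asm-generic/memory_model.h, so the out-of-line functions now follow whichever memory model is configured instead of duplicating it. Under FLATMEM the mapping reduces to an offset into a single mem_map array, as in this standalone sketch (ARCH_PFN_OFFSET and the array size are made-up values for illustration):

#include <assert.h>
#include <stdio.h>

/* userspace model of the FLATMEM pfn <-> page mapping: all pages live in
 * one array (mem_map) and ARCH_PFN_OFFSET is the pfn of its first entry */
struct page { int dummy; };

#define ARCH_PFN_OFFSET 0x100UL
static struct page mem_map[1024];

static struct page *pfn_to_page(unsigned long pfn)
{
        return mem_map + (pfn - ARCH_PFN_OFFSET);
}

static unsigned long page_to_pfn(struct page *page)
{
        return (unsigned long)(page - mem_map) + ARCH_PFN_OFFSET;
}

int main(void)
{
        unsigned long pfn = ARCH_PFN_OFFSET + 42;

        assert(page_to_pfn(pfn_to_page(pfn)) == pfn);   /* round trip */
        printf("pfn %lu maps to mem_map[%ld]\n",
               pfn, (long)(pfn_to_page(pfn) - mem_map));
        return 0;
}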