Diffstat (limited to 'mm/page_alloc.c'):

 -rw-r--r--   mm/page_alloc.c   184
 1 file changed, 116 insertions(+), 68 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 253a450c400d..423db0db7c02 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,7 @@
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
 #include <linux/mempolicy.h>
+#include <linux/stop_machine.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -83,8 +84,8 @@ EXPORT_SYMBOL(zone_table);
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
-unsigned long __initdata nr_kernel_pages;
-unsigned long __initdata nr_all_pages;
+unsigned long __meminitdata nr_kernel_pages;
+unsigned long __meminitdata nr_all_pages;
 
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -286,22 +287,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * we can do coalesce a page and its buddy if
  * (a) the buddy is not in a hole &&
  * (b) the buddy is in the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (c) a page and its buddy have the same order &&
+ * (d) a page and its buddy are in the same zone.
  *
  * For recording whether a page is in the buddy system, we use PG_buddy.
  * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
  *
  * For recording page's order, we use page_private(page).
  */
-static inline int page_is_buddy(struct page *page, int order)
+static inline int page_is_buddy(struct page *page, struct page *buddy,
+                                int order)
 {
 #ifdef CONFIG_HOLES_IN_ZONE
-        if (!pfn_valid(page_to_pfn(page)))
+        if (!pfn_valid(page_to_pfn(buddy)))
                 return 0;
 #endif
 
-        if (PageBuddy(page) && page_order(page) == order) {
-                BUG_ON(page_count(page) != 0);
+        if (page_zone_id(page) != page_zone_id(buddy))
+                return 0;
+
+        if (PageBuddy(buddy) && page_order(buddy) == order) {
+                BUG_ON(page_count(buddy) != 0);
                 return 1;
         }
         return 0;
@@ -352,7 +358,7 @@ static inline void __free_one_page(struct page *page,
                 struct page *buddy;
 
                 buddy = __page_find_buddy(page, page_idx, order);
-                if (!page_is_buddy(buddy, order))
+                if (!page_is_buddy(page, buddy, order))
                         break;          /* Move the buddy up one level. */
 
                 list_del(&buddy->lru);
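
Why the zone check is needed: __free_one_page() locates a buddy purely by index arithmetic, and nothing in that arithmetic respects zone boundaries, so on configurations where zones are not MAX_ORDER-aligned the computed buddy can land in a neighbouring zone. A minimal sketch of the index math behind __page_find_buddy() (not part of the patch; the real helper returns a struct page *):

        /*
         * Sketch: the buddy of the 2^order block starting at page_idx is
         * found by flipping bit 'order' of the index.  This computation
         * knows nothing about zones, hence the new
         * page_zone_id(page) != page_zone_id(buddy) test in page_is_buddy().
         */
        static unsigned long buddy_idx(unsigned long page_idx, unsigned int order)
        {
                return page_idx ^ (1UL << order);
        }
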
@@ -1485,7 +1491,7 @@ void show_free_areas(void)
 	}
 
 	for_each_zone(zone) {
-		unsigned long nr, flags, order, total = 0;
+		unsigned long nr[MAX_ORDER], flags, order, total = 0;
 
 		show_node(zone);
 		printk("%s: ", zone->name);
@@ -1496,11 +1502,12 @@ void show_free_areas(void)
 
 		spin_lock_irqsave(&zone->lock, flags);
 		for (order = 0; order < MAX_ORDER; order++) {
-			nr = zone->free_area[order].nr_free;
-			total += nr << order;
-			printk("%lu*%lukB ", nr, K(1UL) << order);
+			nr[order] = zone->free_area[order].nr_free;
+			total += nr[order] << order;
 		}
 		spin_unlock_irqrestore(&zone->lock, flags);
+		for (order = 0; order < MAX_ORDER; order++)
+			printk("%lu*%lukB ", nr[order], K(1UL) << order);
 		printk("= %lukB\n", K(total));
 	}
 
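
The show_free_areas() hunks above apply a snapshot-then-report pattern: the per-order counts are captured into nr[] while zone->lock is held with interrupts off, and the comparatively slow printk() calls run only after the unlock. The pattern in isolation (a sketch with hypothetical names; on a serial console each printk() can take milliseconds, far too long for a disabled-interrupt critical section):

        spin_lock_irqsave(&lock, flags);
        for (i = 0; i < n; i++)
                snap[i] = shared[i];            /* fast: plain copies */
        spin_unlock_irqrestore(&lock, flags);

        for (i = 0; i < n; i++)
                printk("%lu ", snap[i]);        /* slow: done unlocked */
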
@@ -1512,7 +1519,7 @@ void show_free_areas(void)
  *
  * Add all populated zones of a node to the zonelist.
  */
-static int __init build_zonelists_node(pg_data_t *pgdat,
+static int __meminit build_zonelists_node(pg_data_t *pgdat,
 		struct zonelist *zonelist, int nr_zones, int zone_type)
 {
 	struct zone *zone;
@@ -1548,7 +1555,7 @@ static inline int highest_zone(int zone_bits)
 
 #ifdef CONFIG_NUMA
 #define MAX_NODE_LOAD (num_online_nodes())
-static int __initdata node_load[MAX_NUMNODES];
+static int __meminitdata node_load[MAX_NUMNODES];
 /**
  * find_next_best_node - find the next node that should appear in a given node's fallback list
  * @node: node whose fallback list we're appending
@@ -1563,7 +1570,7 @@ static int __initdata node_load[MAX_NUMNODES];
  * on them otherwise.
  * It returns -1 if no node is found.
  */
-static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
+static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
 {
 	int n, val;
 	int min_val = INT_MAX;
@@ -1609,7 +1616,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
 	return best_node;
 }
 
-static void __init build_zonelists(pg_data_t *pgdat)
+static void __meminit build_zonelists(pg_data_t *pgdat)
 {
 	int i, j, k, node, local_node;
 	int prev_node, load;
@@ -1661,7 +1668,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
 
 #else	/* CONFIG_NUMA */
 
-static void __init build_zonelists(pg_data_t *pgdat)
+static void __meminit build_zonelists(pg_data_t *pgdat)
 {
 	int i, j, k, node, local_node;
 
@@ -1699,14 +1706,29 @@ static void __init build_zonelists(pg_data_t *pgdat)
 
 #endif	/* CONFIG_NUMA */
 
-void __init build_all_zonelists(void)
+/* return value is int just for stop_machine_run() */
+static int __meminit __build_all_zonelists(void *dummy)
 {
-	int i;
+	int nid;
+	for_each_online_node(nid)
+		build_zonelists(NODE_DATA(nid));
+	return 0;
+}
 
-	for_each_online_node(i)
-		build_zonelists(NODE_DATA(i));
-	printk("Built %i zonelists\n", num_online_nodes());
-	cpuset_init_current_mems_allowed();
+void __meminit build_all_zonelists(void)
+{
+	if (system_state == SYSTEM_BOOTING) {
+		__build_all_zonelists(0);
+		cpuset_init_current_mems_allowed();
+	} else {
+		/* we have to stop all cpus to guarantee there is no user
+		   of zonelist */
+		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
+		/* cpuset refresh routine should be here */
+	}
+	vm_total_pages = nr_free_pagecache_pages();
+	printk("Built %i zonelists. Total pages: %ld\n",
+			num_online_nodes(), vm_total_pages);
 }
 
 /*
@@ -1722,7 +1744,8 @@ void __init build_all_zonelists(void)
  */
 #define PAGES_PER_WAITQUEUE	256
 
-static inline unsigned long wait_table_size(unsigned long pages)
+#ifndef CONFIG_MEMORY_HOTPLUG
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
 {
 	unsigned long size = 1;
 
@@ -1740,6 +1763,29 @@ static inline unsigned long wait_table_size(unsigned long pages)
 
 	return max(size, 4UL);
 }
+#else
+/*
+ * A zone's size might be changed by hot-add, so it is not possible to determine
+ * a suitable size for its wait_table.  So we use the maximum size now.
+ *
+ * The max wait table size = 4096 x sizeof(wait_queue_head_t), i.e.:
+ *
+ *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
+ *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
+ *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
+ *
+ * The maximum entries are prepared when a zone's memory is (512K + 256) pages
+ * or more by the traditional way.  (See above.)  It equals:
+ *
+ *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
+ *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
+ *    powerpc (64K page size)             : =  (32G +16M)byte.
+ */
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+	return 4096UL;
+}
+#endif
 
 /*
  * This is an integer logarithm so that shifts can be used later
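
The comment's figures follow from the boot-time rule whose middle is elided between the two hunks above: one hash bucket per PAGES_PER_WAITQUEUE (256) pages, rounded up to a power of two and clamped to [4, 4096]. Worked example for the quoted threshold: a zone of 512K + 256 pages yields (524288 + 256) / 256 = 2049, which rounds up to the 4096-entry cap; at a 4K page size that zone is exactly 2G + 1M bytes, as the table says. A sketch assuming that elided body:

        /* Sketch of the traditional (boot-time) sizing, for reference: */
        static unsigned long sketch_wait_table_entries(unsigned long pages)
        {
                unsigned long size = 1;

                pages /= 256;                   /* PAGES_PER_WAITQUEUE */
                while (size < pages)
                        size <<= 1;             /* round up to a power of two */
                return max(min(size, 4096UL), 4UL);
        }
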
@@ -2005,23 +2051,46 @@ void __init setup_per_cpu_pageset(void)
 #endif
 
 static __meminit
-void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
 	int i;
 	struct pglist_data *pgdat = zone->zone_pgdat;
+	size_t alloc_size;
 
 	/*
 	 * The per-page waitqueue mechanism uses hashed waitqueues
 	 * per zone.
 	 */
-	zone->wait_table_size = wait_table_size(zone_size_pages);
-	zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
-	zone->wait_table = (wait_queue_head_t *)
-		alloc_bootmem_node(pgdat, zone->wait_table_size
-					* sizeof(wait_queue_head_t));
+	zone->wait_table_hash_nr_entries =
+		 wait_table_hash_nr_entries(zone_size_pages);
+	zone->wait_table_bits =
+		wait_table_bits(zone->wait_table_hash_nr_entries);
+	alloc_size = zone->wait_table_hash_nr_entries
+					* sizeof(wait_queue_head_t);
+
+	if (system_state == SYSTEM_BOOTING) {
+		zone->wait_table = (wait_queue_head_t *)
+			alloc_bootmem_node(pgdat, alloc_size);
+	} else {
+		/*
+		 * This case means that a zone whose size was 0 gets new memory
+		 * via memory hot-add.
+		 * But it may be the case that a new node was hot-added.  In
+		 * this case vmalloc() will not be able to use this new node's
+		 * memory - this wait_table must be initialized to use this new
+		 * node itself as well.
+		 * To use this new node's memory, further consideration will be
+		 * necessary.
+		 */
+		zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
+	}
+	if (!zone->wait_table)
+		return -ENOMEM;
 
-	for(i = 0; i < zone->wait_table_size; ++i)
+	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
 		init_waitqueue_head(zone->wait_table + i);
+
+	return 0;
 }
 
 static __meminit void zone_pcp_init(struct zone *zone)
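
The SYSTEM_BOOTING branch above exists because the bootmem allocator is torn down once boot completes: a zone that was empty at boot and is only populated later by hot-add has to take its wait table from vmalloc() instead. The split reduced to its essentials (table_alloc() is a hypothetical helper, not kernel API):

        /* Sketch: bootmem while booting, vmalloc for a hot-added zone. */
        static void *table_alloc(struct pglist_data *pgdat, size_t size)
        {
                if (system_state == SYSTEM_BOOTING)
                        return alloc_bootmem_node(pgdat, size);
                return vmalloc(size);   /* may fail: caller must check */
        }
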
@@ -2043,12 +2112,15 @@ static __meminit void zone_pcp_init(struct zone *zone)
 		zone->name, zone->present_pages, batch);
 }
 
-static __meminit void init_currently_empty_zone(struct zone *zone,
-		unsigned long zone_start_pfn, unsigned long size)
+__meminit int init_currently_empty_zone(struct zone *zone,
+					unsigned long zone_start_pfn,
+					unsigned long size)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
-
-	zone_wait_table_init(zone, size);
+	int ret;
+	ret = zone_wait_table_init(zone, size);
+	if (ret)
+		return ret;
 	pgdat->nr_zones = zone_idx(zone) + 1;
 
 	zone->zone_start_pfn = zone_start_pfn;
@@ -2056,6 +2128,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
 	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
 
 	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+	return 0;
 }
 
 /*
@@ -2064,12 +2138,13 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
  *   - mark all memory queues empty
  *   - clear the memory bitmaps
  */
-static void __init free_area_init_core(struct pglist_data *pgdat,
+static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
 	unsigned long j;
 	int nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
+	int ret;
 
 	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
@@ -2111,7 +2186,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 			continue;
 
 		zonetable_add(zone, nid, j, zone_start_pfn, size);
-		init_currently_empty_zone(zone, zone_start_pfn, size);
+		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+		BUG_ON(ret);
 		zone_start_pfn += size;
 	}
 }
@@ -2152,7 +2228,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
 
-void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long node_start_pfn,
 		unsigned long *zholes_size)
 {
@@ -2804,42 +2880,14 @@ void *__init alloc_large_system_hash(const char *tablename,
 }
 
 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
-/*
- * pfn <-> page translation. out-of-line version.
- * (see asm-generic/memory_model.h)
- */
-#if defined(CONFIG_FLATMEM)
 struct page *pfn_to_page(unsigned long pfn)
 {
-	return mem_map + (pfn - ARCH_PFN_OFFSET);
+	return __pfn_to_page(pfn);
 }
 unsigned long page_to_pfn(struct page *page)
 {
-	return (page - mem_map) + ARCH_PFN_OFFSET;
-}
-#elif defined(CONFIG_DISCONTIGMEM)
-struct page *pfn_to_page(unsigned long pfn)
-{
-	int nid = arch_pfn_to_nid(pfn);
-	return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
-}
-unsigned long page_to_pfn(struct page *page)
-{
-	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
-	return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
-}
-#elif defined(CONFIG_SPARSEMEM)
-struct page *pfn_to_page(unsigned long pfn)
-{
-	return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
-}
-
-unsigned long page_to_pfn(struct page *page)
-{
-	long section_id = page_to_section(page);
-	return page - __section_mem_map_addr(__nr_to_section(section_id));
+	return __page_to_pfn(page);
 }
-#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */
 EXPORT_SYMBOL(pfn_to_page);
 EXPORT_SYMBOL(page_to_pfn);
 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
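
With the #if/#elif ladder gone, the out-of-line wrappers defer to the generic __pfn_to_page()/__page_to_pfn() helpers, which asm-generic/memory_model.h selects per memory model at compile time. For FLATMEM they are expected to reduce to the same arithmetic the removed branch open-coded (a sketch, not the verbatim header):

        /* FLATMEM case of the generic helpers (sketch): */
        #define __pfn_to_page(pfn)      (mem_map + ((pfn) - ARCH_PFN_OFFSET))
        #define __page_to_pfn(page)     \
                ((unsigned long)((page) - mem_map) + ARCH_PFN_OFFSET)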