Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 207
 1 file changed, 125 insertions(+), 82 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94c864eac9c4..2dbdd98426fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -33,6 +33,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/memory_hotplug.h>
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
 
@@ -78,21 +79,44 @@ int min_free_kbytes = 1024;
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
+static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
+{
+        int ret = 0;
+        unsigned seq;
+        unsigned long pfn = page_to_pfn(page);
+
+        do {
+                seq = zone_span_seqbegin(zone);
+                if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
+                        ret = 1;
+                else if (pfn < zone->zone_start_pfn)
+                        ret = 1;
+        } while (zone_span_seqretry(zone, seq));
+
+        return ret;
+}
+
+static int page_is_consistent(struct zone *zone, struct page *page)
+{
+#ifdef CONFIG_HOLES_IN_ZONE
+        if (!pfn_valid(page_to_pfn(page)))
+                return 0;
+#endif
+        if (zone != page_zone(page))
+                return 0;
+
+        return 1;
+}
+
 /*
  * Temporary debugging check for pages not lying within a given zone.
  */
 static int bad_range(struct zone *zone, struct page *page)
 {
-        if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
-                return 1;
-        if (page_to_pfn(page) < zone->zone_start_pfn)
-                return 1;
-#ifdef CONFIG_HOLES_IN_ZONE
-        if (!pfn_valid(page_to_pfn(page)))
-                return 1;
-#endif
-        if (zone != page_zone(page))
+        if (page_outside_zone_boundaries(zone, page))
+                return 1;
+        if (!page_is_consistent(zone, page))
                 return 1;
+
         return 0;
 }
 
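The new page_outside_zone_boundaries() helper reads zone_start_pfn and spanned_pages under the zone span seqlock added for memory hotplug, retrying whenever a concurrent resize raced with the check, while the CONFIG_HOLES_IN_ZONE and page_zone() tests move into page_is_consistent(). The userspace sketch below models only the seqcount read-retry pattern; the names (span_seq, span_start, span_pages) and the C11 atomics are illustrative stand-ins, not the kernel's seqlock API.

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint span_seq;               /* even = stable, odd = writer active */
static unsigned long span_start = 4096;    /* zone_start_pfn stand-in */
static unsigned long span_pages = 1024;    /* spanned_pages stand-in */

static unsigned read_begin(void)
{
        unsigned s;

        do {
                s = atomic_load(&span_seq);
        } while (s & 1);                   /* writer mid-update: wait for even */
        return s;
}

static int read_retry(unsigned s)
{
        /* retry if the sequence moved while we were reading */
        return atomic_load(&span_seq) != s;
}

static int pfn_outside_span(unsigned long pfn)
{
        int ret;
        unsigned seq;

        do {
                seq = read_begin();
                ret = pfn < span_start || pfn >= span_start + span_pages;
        } while (read_retry(seq));         /* a concurrent resize invalidates the read */

        return ret;
}

int main(void)
{
        printf("pfn 100:  outside=%d\n", pfn_outside_span(100));
        printf("pfn 4200: outside=%d\n", pfn_outside_span(4200));
        return 0;
}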
@@ -114,7 +138,8 @@ static void bad_page(const char *function, struct page *page)
                        1 << PG_reclaim |
                        1 << PG_slab |
                        1 << PG_swapcache |
-                       1 << PG_writeback);
+                       1 << PG_writeback |
+                       1 << PG_reserved );
        set_page_count(page, 0);
        reset_page_mapcount(page);
        page->mapping = NULL;
@@ -153,7 +178,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
                struct page *p = page + i;
 
                SetPageCompound(p);
-               p->private = (unsigned long)page;
+               set_page_private(p, (unsigned long)page);
        }
 }
 
@@ -173,7 +198,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 
                if (!PageCompound(p))
                        bad_page(__FUNCTION__, page);
-               if (p->private != (unsigned long)page)
+               if (page_private(p) != (unsigned long)page)
                        bad_page(__FUNCTION__, page);
                ClearPageCompound(p);
        }
@@ -186,18 +211,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
  * So, we don't need atomic page->flags operations here.
  */
 static inline unsigned long page_order(struct page *page) {
-       return page->private;
+       return page_private(page);
 }
 
 static inline void set_page_order(struct page *page, int order) {
-       page->private = order;
+       set_page_private(page, order);
        __SetPagePrivate(page);
 }
 
 static inline void rmv_page_order(struct page *page)
 {
        __ClearPagePrivate(page);
-       page->private = 0;
+       set_page_private(page, 0);
 }
 
 /*
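Every direct page->private access in the hunks above and below now goes through page_private()/set_page_private(). A standalone toy model of that accessor pattern follows (the struct here is not the kernel's struct page); hiding the raw field behind one macro pair means the storage can later be moved or overlaid without touching the buddy-allocator callers.

#include <stdio.h>

struct page {
        unsigned long private;             /* buddy order, compound head, etc. */
};

/* accessor pair: callers never touch ->private directly */
#define page_private(page)              ((page)->private)
#define set_page_private(page, v)       ((page)->private = (v))

static unsigned long page_order(struct page *page)
{
        return page_private(page);
}

static void set_page_order(struct page *page, int order)
{
        set_page_private(page, order);
}

int main(void)
{
        struct page p;

        set_page_order(&p, 3);
        printf("order=%lu\n", page_order(&p));
        return 0;
}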
@@ -237,14 +262,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (a) the buddy is free &&
  * (b) the buddy is on the buddy system &&
  * (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
+ * for recording page's order, we use page_private(page) and PG_private.
  *
  */
 static inline int page_is_buddy(struct page *page, int order)
 {
        if (PagePrivate(page) &&
            (page_order(page) == order) &&
-           !PageReserved(page) &&
            page_count(page) == 0)
                return 1;
        return 0;
@@ -264,7 +288,7 @@ static inline int page_is_buddy(struct page *page, int order)
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
  * free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page->private field.
+ * order is recorded in page_private(page) field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other. That is, if we allocate a small block, and both were
  * free, the remainder of the region must be split into blocks.
@@ -327,7 +351,8 @@ static inline void free_pages_check(const char *function, struct page *page)
                        1 << PG_reclaim |
                        1 << PG_slab |
                        1 << PG_swapcache |
-                       1 << PG_writeback )))
+                       1 << PG_writeback |
+                       1 << PG_reserved )))
                bad_page(function, page);
        if (PageDirty(page))
                __ClearPageDirty(page);
@@ -455,13 +480,14 @@ static void prep_new_page(struct page *page, int order)
                        1 << PG_reclaim |
                        1 << PG_slab |
                        1 << PG_swapcache |
-                       1 << PG_writeback )))
+                       1 << PG_writeback |
+                       1 << PG_reserved )))
                bad_page(__FUNCTION__, page);
 
        page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
                        1 << PG_referenced | 1 << PG_arch_1 |
                        1 << PG_checked | 1 << PG_mappedtodisk);
-       page->private = 0;
+       set_page_private(page, 0);
        set_page_refs(page, order);
        kernel_map_pages(page, 1 << order, 1);
 }
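bad_page(), free_pages_check() and prep_new_page() now all include PG_reserved in their "must not be set" masks. The check itself is plain bit arithmetic: each PG_* constant is a bit index into page->flags, so OR-ing the shifted bits builds a single mask testable with one AND. A toy sketch with made-up flag indices rather than the kernel's:

#include <stdio.h>

/* toy flag indices, not the kernel's PG_* values */
enum { PG_lru, PG_private, PG_writeback, PG_reserved };

static int flags_bad(unsigned long flags)
{
        unsigned long bad_mask = 1UL << PG_lru |
                                 1UL << PG_private |
                                 1UL << PG_writeback |
                                 1UL << PG_reserved;

        return (flags & bad_mask) != 0;    /* any of these set => illegal page state */
}

int main(void)
{
        printf("%d\n", flags_bad(0));                      /* 0: clean page */
        printf("%d\n", flags_bad(1UL << PG_reserved));     /* 1: reserved page slipped in */
        return 0;
}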
@@ -1016,7 +1042,7 @@ void __pagevec_free(struct pagevec *pvec)
 
 fastcall void __free_pages(struct page *page, unsigned int order)
 {
-       if (!PageReserved(page) && put_page_testzero(page)) {
+       if (put_page_testzero(page)) {
                if (order == 0)
                        free_hot_page(page);
                else
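With the PageReserved special case dropped, __free_pages() relies on reference counting alone: only the caller that drops the last reference actually frees the page. A standalone model of that put_page_testzero() gate, using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

struct fake_page {
        atomic_int count;                  /* stand-in for the page refcount */
};

/* drop one reference; return 1 only if it was the last one */
static int put_page_testzero(struct fake_page *page)
{
        return atomic_fetch_sub(&page->count, 1) == 1;
}

static void free_pages_model(struct fake_page *page)
{
        if (put_page_testzero(page))
                printf("last reference dropped: page returned to the allocator\n");
        else
                printf("still referenced elsewhere: not freed\n");
}

int main(void)
{
        struct fake_page p = { .count = 2 };

        free_pages_model(&p);              /* count 2 -> 1: not freed */
        free_pages_model(&p);              /* count 1 -> 0: freed */
        return 0;
}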
@@ -1305,12 +1331,9 @@ void show_free_areas(void)
                } else
                        printk("\n");
 
-               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+               for_each_cpu(cpu) {
                        struct per_cpu_pageset *pageset;
 
-                       if (!cpu_possible(cpu))
-                               continue;
-
                        pageset = zone_pcp(zone, cpu);
 
                        for (temperature = 0; temperature < 2; temperature++)
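for_each_cpu() in this kernel generation walks the possible-CPU map, so the open-coded NR_CPUS loop plus cpu_possible() filter collapses into one macro. A userspace sketch of that iterate-and-skip macro shape (the map and macro name below are invented for illustration):

#include <stdio.h>

#define MAX_CPUS 8
static const int cpu_possible_map[MAX_CPUS] = { 1, 1, 1, 0, 1, 0, 0, 0 };

/* loop over possible CPUs only; the skip lives inside the macro */
#define for_each_possible_cpu(cpu)                              \
        for ((cpu) = 0; (cpu) < MAX_CPUS; (cpu)++)              \
                if (!cpu_possible_map[(cpu)]) continue; else

int main(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                printf("cpu %d is possible\n", cpu);
        return 0;
}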
@@ -1660,7 +1683,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
  */
-void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                unsigned long start_pfn)
 {
        struct page *page;
@@ -1674,7 +1697,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                        continue;
                page = pfn_to_page(pfn);
                set_page_links(page, zone, nid, pfn);
-               set_page_count(page, 0);
+               set_page_count(page, 1);
                reset_page_mapcount(page);
                SetPageReserved(page);
                INIT_LIST_HEAD(&page->lru);
@@ -1721,29 +1744,29 @@ static int __devinit zone_batchsize(struct zone *zone)
 
        /*
         * The per-cpu-pages pools are set to around 1000th of the
-        * size of the zone. But no more than 1/4 of a meg - there's
-        * no point in going beyond the size of L2 cache.
+        * size of the zone. But no more than 1/2 of a meg.
         *
         * OK, so we don't know how big the cache is. So guess.
         */
        batch = zone->present_pages / 1024;
-       if (batch * PAGE_SIZE > 256 * 1024)
-               batch = (256 * 1024) / PAGE_SIZE;
+       if (batch * PAGE_SIZE > 512 * 1024)
+               batch = (512 * 1024) / PAGE_SIZE;
        batch /= 4;             /* We effectively *= 4 below */
        if (batch < 1)
                batch = 1;
 
        /*
-        * Clamp the batch to a 2^n - 1 value. Having a power
-        * of 2 value was found to be more likely to have
-        * suboptimal cache aliasing properties in some cases.
+        * We will be trying to allocate bigger chunks of contiguous
+        * memory of the order of fls(batch). This should result in
+        * better cache coloring.
         *
-        * For example if 2 tasks are alternately allocating
-        * batches of pages, one task can end up with a lot
-        * of pages of one half of the possible page colors
-        * and the other with pages of the other colors.
+        * A sanity check also to ensure that batch is still in limits.
         */
-       batch = (1 << fls(batch + batch/2)) - 1;
+       batch = (1 << fls(batch + batch/2));
+
+       if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
+               batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
+
        return batch;
 }
 
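The reworked zone_batchsize() raises the per-cpu pool cap to 1/2 MB, rounds the batch up to a power of two instead of clamping it to 2^n - 1, and adds an upper sanity bound. The arithmetic is easy to check in userspace; the sketch below mirrors the new math with typical x86 constants hard-coded (PAGE_SIZE 4096, MAX_ORDER 11) and a local fls() stand-in:

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PAGE_SHIFT      12
#define MAX_ORDER       11

/* fls(): index of the most significant set bit, 1-based; 0 for x == 0 */
static int fls(unsigned long x)
{
        int r = 0;

        while (x) {
                r++;
                x >>= 1;
        }
        return r;
}

static unsigned long zone_batchsize(unsigned long present_pages)
{
        unsigned long batch = present_pages / 1024;

        if (batch * PAGE_SIZE > 512 * 1024)        /* cap the pool at ~1/2 MB */
                batch = (512 * 1024) / PAGE_SIZE;
        batch /= 4;                                /* effectively *= 4 below */
        if (batch < 1)
                batch = 1;

        /* round up to a power of two for better cache coloring */
        batch = 1UL << fls(batch + batch / 2);

        if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
                batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT) / 2);

        return batch;
}

int main(void)
{
        printf("32768-page (128MB) zone -> batch %lu\n", zone_batchsize(32768));
        printf("1M-page (4GB) zone      -> batch %lu\n", zone_batchsize(1048576));
        return 0;
}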
@@ -1755,7 +1778,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 
        pcp = &p->pcp[0];               /* hot */
        pcp->count = 0;
-       pcp->low = 2 * batch;
+       pcp->low = 0;
        pcp->high = 6 * batch;
        pcp->batch = max(1UL, 1 * batch);
        INIT_LIST_HEAD(&pcp->list);
@@ -1764,7 +1787,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
        pcp->count = 0;
        pcp->low = 0;
        pcp->high = 2 * batch;
-       pcp->batch = max(1UL, 1 * batch);
+       pcp->batch = max(1UL, batch/2);
        INIT_LIST_HEAD(&pcp->list);
 }
 
@@ -1873,6 +1896,60 @@ void __init setup_per_cpu_pageset()
 
 #endif
 
+static __devinit
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+       int i;
+       struct pglist_data *pgdat = zone->zone_pgdat;
+
+       /*
+        * The per-page waitqueue mechanism uses hashed waitqueues
+        * per zone.
+        */
+       zone->wait_table_size = wait_table_size(zone_size_pages);
+       zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
+       zone->wait_table = (wait_queue_head_t *)
+               alloc_bootmem_node(pgdat, zone->wait_table_size
+                                       * sizeof(wait_queue_head_t));
+
+       for(i = 0; i < zone->wait_table_size; ++i)
+               init_waitqueue_head(zone->wait_table + i);
+}
+
+static __devinit void zone_pcp_init(struct zone *zone)
+{
+       int cpu;
+       unsigned long batch = zone_batchsize(zone);
+
+       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+               /* Early boot. Slab allocator not functional yet */
+               zone->pageset[cpu] = &boot_pageset[cpu];
+               setup_pageset(&boot_pageset[cpu],0);
+#else
+               setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+       }
+       printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+               zone->name, zone->present_pages, batch);
+}
+
+static __devinit void init_currently_empty_zone(struct zone *zone,
+               unsigned long zone_start_pfn, unsigned long size)
+{
+       struct pglist_data *pgdat = zone->zone_pgdat;
+
+       zone_wait_table_init(zone, size);
+       pgdat->nr_zones = zone_idx(zone) + 1;
+
+       zone->zone_mem_map = pfn_to_page(zone_start_pfn);
+       zone->zone_start_pfn = zone_start_pfn;
+
+       memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+       zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+}
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
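zone_wait_table_init() above gives each zone one hashed array of waitqueue heads: wait_table_size() picks a power-of-two size scaled to the zone, wait_table_bits() is its log2, and a page hashes to a bucket with a multiply, shift and mask. The sizing policy and hash in the sketch below are illustrative stand-ins, not the kernel's exact wait_table_size()/page_waitqueue() code:

#include <stdio.h>

/* one waitqueue bucket per ~256 pages, power of two, capped: a stand-in policy */
static unsigned long wait_table_size(unsigned long zone_pages)
{
        unsigned long size = 1;

        while (size < zone_pages / 256 && size < 4096)
                size <<= 1;
        return size;
}

static unsigned wait_table_bits(unsigned long size)
{
        unsigned bits = 0;

        while ((1UL << bits) < size)
                bits++;
        return bits;
}

/* hash a page frame number into a bucket index with multiply + mask */
static unsigned long page_to_bucket(unsigned long pfn, unsigned long size)
{
        return (pfn * 2654435761UL) & (size - 1);   /* Knuth multiplicative hash */
}

int main(void)
{
        unsigned long pages = 1UL << 20;            /* a 4GB zone with 4K pages */
        unsigned long size = wait_table_size(pages);

        printf("table size %lu (%u bits)\n", size, wait_table_bits(size));
        printf("pfn 12345 -> bucket %lu\n", page_to_bucket(12345, size));
        return 0;
}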
@@ -1882,10 +1959,11 @@ void __init setup_per_cpu_pageset()
 static void __init free_area_init_core(struct pglist_data *pgdat,
                unsigned long *zones_size, unsigned long *zholes_size)
 {
-       unsigned long i, j;
-       int cpu, nid = pgdat->node_id;
+       unsigned long j;
+       int nid = pgdat->node_id;
        unsigned long zone_start_pfn = pgdat->node_start_pfn;
 
+       pgdat_resize_init(pgdat);
        pgdat->nr_zones = 0;
        init_waitqueue_head(&pgdat->kswapd_wait);
        pgdat->kswapd_max_order = 0;
@@ -1893,7 +1971,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
        for (j = 0; j < MAX_NR_ZONES; j++) {
                struct zone *zone = pgdat->node_zones + j;
                unsigned long size, realsize;
-               unsigned long batch;
 
                realsize = size = zones_size[j];
                if (zholes_size)
@@ -1908,24 +1985,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                zone->name = zone_names[j];
                spin_lock_init(&zone->lock);
                spin_lock_init(&zone->lru_lock);
+               zone_seqlock_init(zone);
                zone->zone_pgdat = pgdat;
                zone->free_pages = 0;
 
                zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-               batch = zone_batchsize(zone);
-
-               for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-                       /* Early boot. Slab allocator not functional yet */
-                       zone->pageset[cpu] = &boot_pageset[cpu];
-                       setup_pageset(&boot_pageset[cpu],0);
-#else
-                       setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
-               }
-               printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-                               zone_names[j], realsize, batch);
+               zone_pcp_init(zone);
                INIT_LIST_HEAD(&zone->active_list);
                INIT_LIST_HEAD(&zone->inactive_list);
                zone->nr_scan_active = 0;
@@ -1936,32 +2002,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                if (!size)
                        continue;
 
-               /*
-                * The per-page waitqueue mechanism uses hashed waitqueues
-                * per zone.
-                */
-               zone->wait_table_size = wait_table_size(size);
-               zone->wait_table_bits =
-                       wait_table_bits(zone->wait_table_size);
-               zone->wait_table = (wait_queue_head_t *)
-                       alloc_bootmem_node(pgdat, zone->wait_table_size
-                                               * sizeof(wait_queue_head_t));
-
-               for(i = 0; i < zone->wait_table_size; ++i)
-                       init_waitqueue_head(zone->wait_table + i);
-
-               pgdat->nr_zones = j+1;
-
-               zone->zone_mem_map = pfn_to_page(zone_start_pfn);
-               zone->zone_start_pfn = zone_start_pfn;
-
-               memmap_init(size, nid, j, zone_start_pfn);
-
                zonetable_add(zone, nid, j, zone_start_pfn, size);
-
+               init_currently_empty_zone(zone, zone_start_pfn, size);
                zone_start_pfn += size;
-
-               zone_init_free_lists(pgdat, zone, zone->spanned_pages);
        }
 }
 
@@ -2361,7 +2404,7 @@ static void setup_per_zone_lowmem_reserve(void)
  * that the pages_{min,low,high} values for each zone are set correctly
  * with respect to min_free_kbytes.
  */
-static void setup_per_zone_pages_min(void)
+void setup_per_zone_pages_min(void)
 {
        unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
        unsigned long lowmem_pages = 0;