path: root/mm/page_alloc.c
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 670
 1 file changed, 135 insertions(+), 535 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 253a450c400d..3e792a583f3b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -14,7 +14,6 @@
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17#include <linux/config.h>
18#include <linux/stddef.h> 17#include <linux/stddef.h>
19#include <linux/mm.h> 18#include <linux/mm.h>
20#include <linux/swap.h> 19#include <linux/swap.h>
@@ -37,6 +36,7 @@
37#include <linux/nodemask.h> 36#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 37#include <linux/vmalloc.h>
39#include <linux/mempolicy.h> 38#include <linux/mempolicy.h>
39#include <linux/stop_machine.h>
40 40
41#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
42#include <asm/div64.h> 42#include <asm/div64.h>
@@ -83,8 +83,8 @@ EXPORT_SYMBOL(zone_table);
83static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; 83static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
84int min_free_kbytes = 1024; 84int min_free_kbytes = 1024;
85 85
86unsigned long __initdata nr_kernel_pages; 86unsigned long __meminitdata nr_kernel_pages;
87unsigned long __initdata nr_all_pages; 87unsigned long __meminitdata nr_all_pages;
88 88
89#ifdef CONFIG_DEBUG_VM 89#ifdef CONFIG_DEBUG_VM
90static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 90static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -265,7 +265,7 @@ static inline void rmv_page_order(struct page *page)
265 * satisfies the following equation: 265 * satisfies the following equation:
266 * P = B & ~(1 << O) 266 * P = B & ~(1 << O)
267 * 267 *
268 * Assumption: *_mem_map is contigious at least up to MAX_ORDER 268 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
269 */ 269 */
270static inline struct page * 270static inline struct page *
271__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 271__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
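For illustration, the buddy relation spelled out in the comment above (buddy index B = P ^ (1 << O), combined index P = B & ~(1 << O)) can be exercised with plain integer arithmetic. A minimal userspace sketch of the index math behind __page_find_buddy() and __find_combined_index(), assuming block indices are naturally aligned to their order:

#include <assert.h>
#include <stdio.h>

/* Index of the buddy of the block starting at page_idx, at the given order. */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

/* Index of the combined (parent) block: P = B & ~(1 << O). */
static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
        return page_idx & ~(1UL << order);
}

int main(void)
{
        /* Order-2 blocks at indices 8 and 12 are buddies; they merge at 8. */
        assert(buddy_index(8, 2) == 12);
        assert(buddy_index(12, 2) == 8);
        assert(combined_index(12, 2) == 8);
        printf("buddy of 8 at order 2: %lu\n", buddy_index(8, 2));
        return 0;
}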
@@ -286,22 +286,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
286 * we can do coalesce a page and its buddy if 286 * we can do coalesce a page and its buddy if
287 * (a) the buddy is not in a hole && 287 * (a) the buddy is not in a hole &&
288 * (b) the buddy is in the buddy system && 288 * (b) the buddy is in the buddy system &&
289 * (c) a page and its buddy have the same order. 289 * (c) a page and its buddy have the same order &&
290 * (d) a page and its buddy are in the same zone.
290 * 291 *
291 * For recording whether a page is in the buddy system, we use PG_buddy. 292 * For recording whether a page is in the buddy system, we use PG_buddy.
292 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 293 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
293 * 294 *
294 * For recording page's order, we use page_private(page). 295 * For recording page's order, we use page_private(page).
295 */ 296 */
296static inline int page_is_buddy(struct page *page, int order) 297static inline int page_is_buddy(struct page *page, struct page *buddy,
298 int order)
297{ 299{
298#ifdef CONFIG_HOLES_IN_ZONE 300#ifdef CONFIG_HOLES_IN_ZONE
299 if (!pfn_valid(page_to_pfn(page))) 301 if (!pfn_valid(page_to_pfn(buddy)))
300 return 0; 302 return 0;
301#endif 303#endif
302 304
303 if (PageBuddy(page) && page_order(page) == order) { 305 if (page_zone_id(page) != page_zone_id(buddy))
304 BUG_ON(page_count(page) != 0); 306 return 0;
307
308 if (PageBuddy(buddy) && page_order(buddy) == order) {
309 BUG_ON(page_count(buddy) != 0);
305 return 1; 310 return 1;
306 } 311 }
307 return 0; 312 return 0;
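The four conditions (a)-(d) in the updated comment map directly onto the checks in the new page_is_buddy(). A simplified, self-contained model of that predicate; the struct fields are hypothetical stand-ins for pfn_valid(), PG_buddy, page_order() and page_zone_id():

#include <stdbool.h>

struct fake_page {
        bool     valid;      /* stands in for pfn_valid()    */
        bool     in_buddy;   /* stands in for PageBuddy()    */
        unsigned order;      /* stands in for page_order()   */
        unsigned zone_id;    /* stands in for page_zone_id() */
};

/* Mirror of the (a)-(d) checks: hole, same zone, buddy system, same order. */
static bool can_coalesce(const struct fake_page *page,
                         const struct fake_page *buddy, unsigned order)
{
        if (!buddy->valid)                      /* (a) buddy is in a hole */
                return false;
        if (page->zone_id != buddy->zone_id)    /* (d) different zones    */
                return false;
        return buddy->in_buddy && buddy->order == order;   /* (b) && (c)  */
}

int main(void)
{
        struct fake_page page  = { true, false, 0, 1 };
        struct fake_page buddy = { true, true,  3, 1 };
        return can_coalesce(&page, &buddy, 3) ? 0 : 1;
}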
@@ -352,7 +357,7 @@ static inline void __free_one_page(struct page *page,
352 struct page *buddy; 357 struct page *buddy;
353 358
354 buddy = __page_find_buddy(page, page_idx, order); 359 buddy = __page_find_buddy(page, page_idx, order);
355 if (!page_is_buddy(buddy, order)) 360 if (!page_is_buddy(page, buddy, order))
356 break; /* Move the buddy up one level. */ 361 break; /* Move the buddy up one level. */
357 362
358 list_del(&buddy->lru); 363 list_del(&buddy->lru);
@@ -440,8 +445,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
440 445
441 arch_free_page(page, order); 446 arch_free_page(page, order);
442 if (!PageHighMem(page)) 447 if (!PageHighMem(page))
443 mutex_debug_check_no_locks_freed(page_address(page), 448 debug_check_no_locks_freed(page_address(page),
444 PAGE_SIZE<<order); 449 PAGE_SIZE<<order);
445 450
446 for (i = 0 ; i < (1 << order) ; ++i) 451 for (i = 0 ; i < (1 << order) ; ++i)
447 reserved += free_pages_check(page + i); 452 reserved += free_pages_check(page + i);
@@ -450,7 +455,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
450 455
451 kernel_map_pages(page, 1 << order, 0); 456 kernel_map_pages(page, 1 << order, 0);
452 local_irq_save(flags); 457 local_irq_save(flags);
453 __mod_page_state(pgfree, 1 << order); 458 __count_vm_events(PGFREE, 1 << order);
454 free_one_page(page_zone(page), page, order); 459 free_one_page(page_zone(page), page, order);
455 local_irq_restore(flags); 460 local_irq_restore(flags);
456} 461}
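Freed-page accounting now goes through the VM event counters (__count_vm_events) instead of the old page_state machinery removed further down. A rough userspace model of the underlying idea, per-CPU counters that are cheap to bump locally and only summed when somebody reads them; this is only a sketch, not the kernel's actual vmstat implementation:

#include <stdio.h>

#define NR_CPUS       4
enum vm_event { PGALLOC, PGFREE, NR_VM_EVENTS };

/* One counter array per CPU; an update touches only the local CPU's slot. */
static unsigned long vm_events[NR_CPUS][NR_VM_EVENTS];

static void count_vm_events(int cpu, enum vm_event item, unsigned long delta)
{
        vm_events[cpu][item] += delta;      /* no cross-CPU synchronization */
}

/* Reading folds all CPUs together, as the removed __get_page_state() did. */
static unsigned long sum_vm_event(enum vm_event item)
{
        unsigned long sum = 0;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                sum += vm_events[cpu][item];
        return sum;
}

int main(void)
{
        count_vm_events(0, PGFREE, 1 << 3);  /* e.g. freeing an order-3 page */
        count_vm_events(1, PGFREE, 1);
        printf("PGFREE total: %lu\n", sum_vm_event(PGFREE));
        return 0;
}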
@@ -703,27 +708,6 @@ void drain_local_pages(void)
703} 708}
704#endif /* CONFIG_PM */ 709#endif /* CONFIG_PM */
705 710
706static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
707{
708#ifdef CONFIG_NUMA
709 pg_data_t *pg = z->zone_pgdat;
710 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
711 struct per_cpu_pageset *p;
712
713 p = zone_pcp(z, cpu);
714 if (pg == orig) {
715 p->numa_hit++;
716 } else {
717 p->numa_miss++;
718 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
719 }
720 if (pg == NODE_DATA(numa_node_id()))
721 p->local_node++;
722 else
723 p->other_node++;
724#endif
725}
726
727/* 711/*
728 * Free a 0-order page 712 * Free a 0-order page
729 */ 713 */
@@ -744,7 +728,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
744 728
745 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 729 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
746 local_irq_save(flags); 730 local_irq_save(flags);
747 __inc_page_state(pgfree); 731 __count_vm_event(PGFREE);
748 list_add(&page->lru, &pcp->list); 732 list_add(&page->lru, &pcp->list);
749 pcp->count++; 733 pcp->count++;
750 if (pcp->count >= pcp->high) { 734 if (pcp->count >= pcp->high) {
@@ -820,8 +804,8 @@ again:
820 goto failed; 804 goto failed;
821 } 805 }
822 806
823 __mod_page_state_zone(zone, pgalloc, 1 << order); 807 __count_zone_vm_events(PGALLOC, zone, 1 << order);
824 zone_statistics(zonelist, zone, cpu); 808 zone_statistics(zonelist, zone);
825 local_irq_restore(flags); 809 local_irq_restore(flags);
826 put_cpu(); 810 put_cpu();
827 811
@@ -951,8 +935,7 @@ restart:
951 goto got_pg; 935 goto got_pg;
952 936
953 do { 937 do {
954 if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL)) 938 wakeup_kswapd(*z, order);
955 wakeup_kswapd(*z, order);
956 } while (*(++z)); 939 } while (*(++z));
957 940
958 /* 941 /*
@@ -1226,141 +1209,6 @@ static void show_node(struct zone *zone)
1226#define show_node(zone) do { } while (0) 1209#define show_node(zone) do { } while (0)
1227#endif 1210#endif
1228 1211
1229/*
1230 * Accumulate the page_state information across all CPUs.
1231 * The result is unavoidably approximate - it can change
1232 * during and after execution of this function.
1233 */
1234static DEFINE_PER_CPU(struct page_state, page_states) = {0};
1235
1236atomic_t nr_pagecache = ATOMIC_INIT(0);
1237EXPORT_SYMBOL(nr_pagecache);
1238#ifdef CONFIG_SMP
1239DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1240#endif
1241
1242static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1243{
1244 unsigned cpu;
1245
1246 memset(ret, 0, nr * sizeof(unsigned long));
1247 cpus_and(*cpumask, *cpumask, cpu_online_map);
1248
1249 for_each_cpu_mask(cpu, *cpumask) {
1250 unsigned long *in;
1251 unsigned long *out;
1252 unsigned off;
1253 unsigned next_cpu;
1254
1255 in = (unsigned long *)&per_cpu(page_states, cpu);
1256
1257 next_cpu = next_cpu(cpu, *cpumask);
1258 if (likely(next_cpu < NR_CPUS))
1259 prefetch(&per_cpu(page_states, next_cpu));
1260
1261 out = (unsigned long *)ret;
1262 for (off = 0; off < nr; off++)
1263 *out++ += *in++;
1264 }
1265}
1266
1267void get_page_state_node(struct page_state *ret, int node)
1268{
1269 int nr;
1270 cpumask_t mask = node_to_cpumask(node);
1271
1272 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
1273 nr /= sizeof(unsigned long);
1274
1275 __get_page_state(ret, nr+1, &mask);
1276}
1277
1278void get_page_state(struct page_state *ret)
1279{
1280 int nr;
1281 cpumask_t mask = CPU_MASK_ALL;
1282
1283 nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
1284 nr /= sizeof(unsigned long);
1285
1286 __get_page_state(ret, nr + 1, &mask);
1287}
1288
1289void get_full_page_state(struct page_state *ret)
1290{
1291 cpumask_t mask = CPU_MASK_ALL;
1292
1293 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1294}
1295
1296unsigned long read_page_state_offset(unsigned long offset)
1297{
1298 unsigned long ret = 0;
1299 int cpu;
1300
1301 for_each_online_cpu(cpu) {
1302 unsigned long in;
1303
1304 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
1305 ret += *((unsigned long *)in);
1306 }
1307 return ret;
1308}
1309
1310void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1311{
1312 void *ptr;
1313
1314 ptr = &__get_cpu_var(page_states);
1315 *(unsigned long *)(ptr + offset) += delta;
1316}
1317EXPORT_SYMBOL(__mod_page_state_offset);
1318
1319void mod_page_state_offset(unsigned long offset, unsigned long delta)
1320{
1321 unsigned long flags;
1322 void *ptr;
1323
1324 local_irq_save(flags);
1325 ptr = &__get_cpu_var(page_states);
1326 *(unsigned long *)(ptr + offset) += delta;
1327 local_irq_restore(flags);
1328}
1329EXPORT_SYMBOL(mod_page_state_offset);
1330
1331void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1332 unsigned long *free, struct pglist_data *pgdat)
1333{
1334 struct zone *zones = pgdat->node_zones;
1335 int i;
1336
1337 *active = 0;
1338 *inactive = 0;
1339 *free = 0;
1340 for (i = 0; i < MAX_NR_ZONES; i++) {
1341 *active += zones[i].nr_active;
1342 *inactive += zones[i].nr_inactive;
1343 *free += zones[i].free_pages;
1344 }
1345}
1346
1347void get_zone_counts(unsigned long *active,
1348 unsigned long *inactive, unsigned long *free)
1349{
1350 struct pglist_data *pgdat;
1351
1352 *active = 0;
1353 *inactive = 0;
1354 *free = 0;
1355 for_each_online_pgdat(pgdat) {
1356 unsigned long l, m, n;
1357 __get_zone_counts(&l, &m, &n, pgdat);
1358 *active += l;
1359 *inactive += m;
1360 *free += n;
1361 }
1362}
1363
1364void si_meminfo(struct sysinfo *val) 1212void si_meminfo(struct sysinfo *val)
1365{ 1213{
1366 val->totalram = totalram_pages; 1214 val->totalram = totalram_pages;
@@ -1401,7 +1249,6 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1401 */ 1249 */
1402void show_free_areas(void) 1250void show_free_areas(void)
1403{ 1251{
1404 struct page_state ps;
1405 int cpu, temperature; 1252 int cpu, temperature;
1406 unsigned long active; 1253 unsigned long active;
1407 unsigned long inactive; 1254 unsigned long inactive;
@@ -1433,7 +1280,6 @@ void show_free_areas(void)
1433 } 1280 }
1434 } 1281 }
1435 1282
1436 get_page_state(&ps);
1437 get_zone_counts(&active, &inactive, &free); 1283 get_zone_counts(&active, &inactive, &free);
1438 1284
1439 printk("Free pages: %11ukB (%ukB HighMem)\n", 1285 printk("Free pages: %11ukB (%ukB HighMem)\n",
@@ -1444,13 +1290,13 @@ void show_free_areas(void)
1444 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1290 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1445 active, 1291 active,
1446 inactive, 1292 inactive,
1447 ps.nr_dirty, 1293 global_page_state(NR_FILE_DIRTY),
1448 ps.nr_writeback, 1294 global_page_state(NR_WRITEBACK),
1449 ps.nr_unstable, 1295 global_page_state(NR_UNSTABLE_NFS),
1450 nr_free_pages(), 1296 nr_free_pages(),
1451 ps.nr_slab, 1297 global_page_state(NR_SLAB),
1452 ps.nr_mapped, 1298 global_page_state(NR_FILE_MAPPED),
1453 ps.nr_page_table_pages); 1299 global_page_state(NR_PAGETABLE));
1454 1300
1455 for_each_zone(zone) { 1301 for_each_zone(zone) {
1456 int i; 1302 int i;
@@ -1485,7 +1331,7 @@ void show_free_areas(void)
1485 } 1331 }
1486 1332
1487 for_each_zone(zone) { 1333 for_each_zone(zone) {
1488 unsigned long nr, flags, order, total = 0; 1334 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1489 1335
1490 show_node(zone); 1336 show_node(zone);
1491 printk("%s: ", zone->name); 1337 printk("%s: ", zone->name);
@@ -1496,11 +1342,12 @@ void show_free_areas(void)
1496 1342
1497 spin_lock_irqsave(&zone->lock, flags); 1343 spin_lock_irqsave(&zone->lock, flags);
1498 for (order = 0; order < MAX_ORDER; order++) { 1344 for (order = 0; order < MAX_ORDER; order++) {
1499 nr = zone->free_area[order].nr_free; 1345 nr[order] = zone->free_area[order].nr_free;
1500 total += nr << order; 1346 total += nr[order] << order;
1501 printk("%lu*%lukB ", nr, K(1UL) << order);
1502 } 1347 }
1503 spin_unlock_irqrestore(&zone->lock, flags); 1348 spin_unlock_irqrestore(&zone->lock, flags);
1349 for (order = 0; order < MAX_ORDER; order++)
1350 printk("%lu*%lukB ", nr[order], K(1UL) << order);
1504 printk("= %lukB\n", K(total)); 1351 printk("= %lukB\n", K(total));
1505 } 1352 }
1506 1353
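The nr[MAX_ORDER] change above shrinks the region covered by zone->lock: the free counts are snapshotted while the lock is held and the slow printk() calls run after it is released. The same pattern in ordinary userspace C, with a pthread mutex standing in for the zone spinlock and 4 KiB pages assumed for the kB conversion (names are illustrative):

#include <pthread.h>
#include <stdio.h>

#define MAX_ORDER 11

static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long nr_free[MAX_ORDER];        /* protected by zone_lock */

static void show_free_counts(void)
{
        unsigned long nr[MAX_ORDER];
        unsigned long total = 0;

        /* Take a consistent snapshot while the lock is held... */
        pthread_mutex_lock(&zone_lock);
        for (int order = 0; order < MAX_ORDER; order++) {
                nr[order] = nr_free[order];
                total += nr[order] << order;
        }
        pthread_mutex_unlock(&zone_lock);

        /* ...and do the slow formatting/printing outside of it. */
        for (int order = 0; order < MAX_ORDER; order++)
                printf("%lu*%lukB ", nr[order], 4UL << order);
        printf("= %lukB\n", total * 4);
}

int main(void)
{
        nr_free[0] = 10;
        nr_free[3] = 2;
        show_free_counts();
        return 0;
}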
@@ -1512,7 +1359,7 @@ void show_free_areas(void)
1512 * 1359 *
1513 * Add all populated zones of a node to the zonelist. 1360 * Add all populated zones of a node to the zonelist.
1514 */ 1361 */
1515static int __init build_zonelists_node(pg_data_t *pgdat, 1362static int __meminit build_zonelists_node(pg_data_t *pgdat,
1516 struct zonelist *zonelist, int nr_zones, int zone_type) 1363 struct zonelist *zonelist, int nr_zones, int zone_type)
1517{ 1364{
1518 struct zone *zone; 1365 struct zone *zone;
@@ -1548,7 +1395,7 @@ static inline int highest_zone(int zone_bits)
1548 1395
1549#ifdef CONFIG_NUMA 1396#ifdef CONFIG_NUMA
1550#define MAX_NODE_LOAD (num_online_nodes()) 1397#define MAX_NODE_LOAD (num_online_nodes())
1551static int __initdata node_load[MAX_NUMNODES]; 1398static int __meminitdata node_load[MAX_NUMNODES];
1552/** 1399/**
1553 * find_next_best_node - find the next node that should appear in a given node's fallback list 1400 * find_next_best_node - find the next node that should appear in a given node's fallback list
1554 * @node: node whose fallback list we're appending 1401 * @node: node whose fallback list we're appending
@@ -1563,7 +1410,7 @@ static int __initdata node_load[MAX_NUMNODES];
1563 * on them otherwise. 1410 * on them otherwise.
1564 * It returns -1 if no node is found. 1411 * It returns -1 if no node is found.
1565 */ 1412 */
1566static int __init find_next_best_node(int node, nodemask_t *used_node_mask) 1413static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1567{ 1414{
1568 int n, val; 1415 int n, val;
1569 int min_val = INT_MAX; 1416 int min_val = INT_MAX;
@@ -1609,7 +1456,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
1609 return best_node; 1456 return best_node;
1610} 1457}
1611 1458
1612static void __init build_zonelists(pg_data_t *pgdat) 1459static void __meminit build_zonelists(pg_data_t *pgdat)
1613{ 1460{
1614 int i, j, k, node, local_node; 1461 int i, j, k, node, local_node;
1615 int prev_node, load; 1462 int prev_node, load;
@@ -1661,7 +1508,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
1661 1508
1662#else /* CONFIG_NUMA */ 1509#else /* CONFIG_NUMA */
1663 1510
1664static void __init build_zonelists(pg_data_t *pgdat) 1511static void __meminit build_zonelists(pg_data_t *pgdat)
1665{ 1512{
1666 int i, j, k, node, local_node; 1513 int i, j, k, node, local_node;
1667 1514
@@ -1699,14 +1546,29 @@ static void __init build_zonelists(pg_data_t *pgdat)
1699 1546
1700#endif /* CONFIG_NUMA */ 1547#endif /* CONFIG_NUMA */
1701 1548
1702void __init build_all_zonelists(void) 1549/* return values int ....just for stop_machine_run() */
1550static int __meminit __build_all_zonelists(void *dummy)
1703{ 1551{
1704 int i; 1552 int nid;
1553 for_each_online_node(nid)
1554 build_zonelists(NODE_DATA(nid));
1555 return 0;
1556}
1705 1557
1706 for_each_online_node(i) 1558void __meminit build_all_zonelists(void)
1707 build_zonelists(NODE_DATA(i)); 1559{
1708 printk("Built %i zonelists\n", num_online_nodes()); 1560 if (system_state == SYSTEM_BOOTING) {
1709 cpuset_init_current_mems_allowed(); 1561 __build_all_zonelists(0);
1562 cpuset_init_current_mems_allowed();
1563 } else {
1564 /* we have to stop all cpus to guaranntee there is no user
1565 of zonelist */
1566 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
1567 /* cpuset refresh routine should be here */
1568 }
1569 vm_total_pages = nr_free_pagecache_pages();
1570 printk("Built %i zonelists. Total pages: %ld\n",
1571 num_online_nodes(), vm_total_pages);
1710} 1572}
1711 1573
1712/* 1574/*
@@ -1722,7 +1584,8 @@ void __init build_all_zonelists(void)
1722 */ 1584 */
1723#define PAGES_PER_WAITQUEUE 256 1585#define PAGES_PER_WAITQUEUE 256
1724 1586
1725static inline unsigned long wait_table_size(unsigned long pages) 1587#ifndef CONFIG_MEMORY_HOTPLUG
1588static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1726{ 1589{
1727 unsigned long size = 1; 1590 unsigned long size = 1;
1728 1591
@@ -1740,6 +1603,29 @@ static inline unsigned long wait_table_size(unsigned long pages)
1740 1603
1741 return max(size, 4UL); 1604 return max(size, 4UL);
1742} 1605}
1606#else
1607/*
1608 * A zone's size might be changed by hot-add, so it is not possible to determine
1609 * a suitable size for its wait_table. So we use the maximum size now.
1610 *
1611 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
1612 *
1613 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
1614 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
1615 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
1616 *
1617 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
1618 * or more by the traditional way. (See above). It equals:
1619 *
1620 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
1621 * ia64(16K page size) : = ( 8G + 4M)byte.
1622 * powerpc (64K page size) : = (32G +16M)byte.
1623 */
1624static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1625{
1626 return 4096UL;
1627}
1628#endif
1743 1629
1744/* 1630/*
1745 * This is an integer logarithm so that shifts can be used later 1631 * This is an integer logarithm so that shifts can be used later
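For comparison with the fixed 4096-entry hot-plug case added above, the boot-time sizing rounds pages / PAGES_PER_WAITQUEUE up to a power of two and clamps the result, which is where the (512K + 256) pages figure in the new comment comes from. A standalone sketch of that heuristic (a paraphrase of the logic described here, not the kernel function verbatim):

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/*
 * One waitqueue per 256 pages, rounded up to a power of two and
 * clamped to [4, 4096] entries.
 */
static unsigned long wait_table_entries(unsigned long pages)
{
        unsigned long size = 1;

        pages /= PAGES_PER_WAITQUEUE;
        while (size < pages)
                size <<= 1;
        if (size > 4096UL)
                size = 4096UL;
        return size < 4UL ? 4UL : size;
}

int main(void)
{
        /* (512K + 256) pages is where the 4096-entry ceiling is reached. */
        printf("%lu\n", wait_table_entries((512UL << 10) + 256));  /* 4096 */
        printf("%lu\n", wait_table_entries(1UL << 10));            /* 4    */
        return 0;
}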
@@ -1964,7 +1850,7 @@ static inline void free_zone_pagesets(int cpu)
1964 } 1850 }
1965} 1851}
1966 1852
1967static int pageset_cpuup_callback(struct notifier_block *nfb, 1853static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
1968 unsigned long action, 1854 unsigned long action,
1969 void *hcpu) 1855 void *hcpu)
1970{ 1856{
@@ -1986,7 +1872,7 @@ static int pageset_cpuup_callback(struct notifier_block *nfb,
1986 return ret; 1872 return ret;
1987} 1873}
1988 1874
1989static struct notifier_block pageset_notifier = 1875static struct notifier_block __cpuinitdata pageset_notifier =
1990 { &pageset_cpuup_callback, NULL, 0 }; 1876 { &pageset_cpuup_callback, NULL, 0 };
1991 1877
1992void __init setup_per_cpu_pageset(void) 1878void __init setup_per_cpu_pageset(void)
@@ -2005,23 +1891,46 @@ void __init setup_per_cpu_pageset(void)
2005#endif 1891#endif
2006 1892
2007static __meminit 1893static __meminit
2008void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 1894int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2009{ 1895{
2010 int i; 1896 int i;
2011 struct pglist_data *pgdat = zone->zone_pgdat; 1897 struct pglist_data *pgdat = zone->zone_pgdat;
1898 size_t alloc_size;
2012 1899
2013 /* 1900 /*
2014 * The per-page waitqueue mechanism uses hashed waitqueues 1901 * The per-page waitqueue mechanism uses hashed waitqueues
2015 * per zone. 1902 * per zone.
2016 */ 1903 */
2017 zone->wait_table_size = wait_table_size(zone_size_pages); 1904 zone->wait_table_hash_nr_entries =
2018 zone->wait_table_bits = wait_table_bits(zone->wait_table_size); 1905 wait_table_hash_nr_entries(zone_size_pages);
2019 zone->wait_table = (wait_queue_head_t *) 1906 zone->wait_table_bits =
2020 alloc_bootmem_node(pgdat, zone->wait_table_size 1907 wait_table_bits(zone->wait_table_hash_nr_entries);
2021 * sizeof(wait_queue_head_t)); 1908 alloc_size = zone->wait_table_hash_nr_entries
1909 * sizeof(wait_queue_head_t);
1910
1911 if (system_state == SYSTEM_BOOTING) {
1912 zone->wait_table = (wait_queue_head_t *)
1913 alloc_bootmem_node(pgdat, alloc_size);
1914 } else {
1915 /*
1916 * This case means that a zone whose size was 0 gets new memory
1917 * via memory hot-add.
1918 * But it may be the case that a new node was hot-added. In
1919 * this case vmalloc() will not be able to use this new node's
1920 * memory - this wait_table must be initialized to use this new
1921 * node itself as well.
1922 * To use this new node's memory, further consideration will be
1923 * necessary.
1924 */
1925 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
1926 }
1927 if (!zone->wait_table)
1928 return -ENOMEM;
2022 1929
2023 for(i = 0; i < zone->wait_table_size; ++i) 1930 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
2024 init_waitqueue_head(zone->wait_table + i); 1931 init_waitqueue_head(zone->wait_table + i);
1932
1933 return 0;
2025} 1934}
2026 1935
2027static __meminit void zone_pcp_init(struct zone *zone) 1936static __meminit void zone_pcp_init(struct zone *zone)
@@ -2043,12 +1952,15 @@ static __meminit void zone_pcp_init(struct zone *zone)
2043 zone->name, zone->present_pages, batch); 1952 zone->name, zone->present_pages, batch);
2044} 1953}
2045 1954
2046static __meminit void init_currently_empty_zone(struct zone *zone, 1955__meminit int init_currently_empty_zone(struct zone *zone,
2047 unsigned long zone_start_pfn, unsigned long size) 1956 unsigned long zone_start_pfn,
1957 unsigned long size)
2048{ 1958{
2049 struct pglist_data *pgdat = zone->zone_pgdat; 1959 struct pglist_data *pgdat = zone->zone_pgdat;
2050 1960 int ret;
2051 zone_wait_table_init(zone, size); 1961 ret = zone_wait_table_init(zone, size);
1962 if (ret)
1963 return ret;
2052 pgdat->nr_zones = zone_idx(zone) + 1; 1964 pgdat->nr_zones = zone_idx(zone) + 1;
2053 1965
2054 zone->zone_start_pfn = zone_start_pfn; 1966 zone->zone_start_pfn = zone_start_pfn;
@@ -2056,6 +1968,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
2056 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 1968 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2057 1969
2058 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 1970 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
1971
1972 return 0;
2059} 1973}
2060 1974
2061/* 1975/*
@@ -2064,12 +1978,13 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
2064 * - mark all memory queues empty 1978 * - mark all memory queues empty
2065 * - clear the memory bitmaps 1979 * - clear the memory bitmaps
2066 */ 1980 */
2067static void __init free_area_init_core(struct pglist_data *pgdat, 1981static void __meminit free_area_init_core(struct pglist_data *pgdat,
2068 unsigned long *zones_size, unsigned long *zholes_size) 1982 unsigned long *zones_size, unsigned long *zholes_size)
2069{ 1983{
2070 unsigned long j; 1984 unsigned long j;
2071 int nid = pgdat->node_id; 1985 int nid = pgdat->node_id;
2072 unsigned long zone_start_pfn = pgdat->node_start_pfn; 1986 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1987 int ret;
2073 1988
2074 pgdat_resize_init(pgdat); 1989 pgdat_resize_init(pgdat);
2075 pgdat->nr_zones = 0; 1990 pgdat->nr_zones = 0;
@@ -2106,12 +2021,14 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
2106 zone->nr_scan_inactive = 0; 2021 zone->nr_scan_inactive = 0;
2107 zone->nr_active = 0; 2022 zone->nr_active = 0;
2108 zone->nr_inactive = 0; 2023 zone->nr_inactive = 0;
2024 zap_zone_vm_stats(zone);
2109 atomic_set(&zone->reclaim_in_progress, 0); 2025 atomic_set(&zone->reclaim_in_progress, 0);
2110 if (!size) 2026 if (!size)
2111 continue; 2027 continue;
2112 2028
2113 zonetable_add(zone, nid, j, zone_start_pfn, size); 2029 zonetable_add(zone, nid, j, zone_start_pfn, size);
2114 init_currently_empty_zone(zone, zone_start_pfn, size); 2030 ret = init_currently_empty_zone(zone, zone_start_pfn, size);
2031 BUG_ON(ret);
2115 zone_start_pfn += size; 2032 zone_start_pfn += size;
2116 } 2033 }
2117} 2034}
@@ -2152,7 +2069,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2152#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2069#endif /* CONFIG_FLAT_NODE_MEM_MAP */
2153} 2070}
2154 2071
2155void __init free_area_init_node(int nid, struct pglist_data *pgdat, 2072void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2156 unsigned long *zones_size, unsigned long node_start_pfn, 2073 unsigned long *zones_size, unsigned long node_start_pfn,
2157 unsigned long *zholes_size) 2074 unsigned long *zholes_size)
2158{ 2075{
@@ -2178,307 +2095,18 @@ void __init free_area_init(unsigned long *zones_size)
2178 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2095 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2179} 2096}
2180 2097
2181#ifdef CONFIG_PROC_FS
2182
2183#include <linux/seq_file.h>
2184
2185static void *frag_start(struct seq_file *m, loff_t *pos)
2186{
2187 pg_data_t *pgdat;
2188 loff_t node = *pos;
2189 for (pgdat = first_online_pgdat();
2190 pgdat && node;
2191 pgdat = next_online_pgdat(pgdat))
2192 --node;
2193
2194 return pgdat;
2195}
2196
2197static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
2198{
2199 pg_data_t *pgdat = (pg_data_t *)arg;
2200
2201 (*pos)++;
2202 return next_online_pgdat(pgdat);
2203}
2204
2205static void frag_stop(struct seq_file *m, void *arg)
2206{
2207}
2208
2209/*
2210 * This walks the free areas for each zone.
2211 */
2212static int frag_show(struct seq_file *m, void *arg)
2213{
2214 pg_data_t *pgdat = (pg_data_t *)arg;
2215 struct zone *zone;
2216 struct zone *node_zones = pgdat->node_zones;
2217 unsigned long flags;
2218 int order;
2219
2220 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2221 if (!populated_zone(zone))
2222 continue;
2223
2224 spin_lock_irqsave(&zone->lock, flags);
2225 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
2226 for (order = 0; order < MAX_ORDER; ++order)
2227 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
2228 spin_unlock_irqrestore(&zone->lock, flags);
2229 seq_putc(m, '\n');
2230 }
2231 return 0;
2232}
2233
2234struct seq_operations fragmentation_op = {
2235 .start = frag_start,
2236 .next = frag_next,
2237 .stop = frag_stop,
2238 .show = frag_show,
2239};
2240
2241/*
2242 * Output information about zones in @pgdat.
2243 */
2244static int zoneinfo_show(struct seq_file *m, void *arg)
2245{
2246 pg_data_t *pgdat = arg;
2247 struct zone *zone;
2248 struct zone *node_zones = pgdat->node_zones;
2249 unsigned long flags;
2250
2251 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2252 int i;
2253
2254 if (!populated_zone(zone))
2255 continue;
2256
2257 spin_lock_irqsave(&zone->lock, flags);
2258 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
2259 seq_printf(m,
2260 "\n pages free %lu"
2261 "\n min %lu"
2262 "\n low %lu"
2263 "\n high %lu"
2264 "\n active %lu"
2265 "\n inactive %lu"
2266 "\n scanned %lu (a: %lu i: %lu)"
2267 "\n spanned %lu"
2268 "\n present %lu",
2269 zone->free_pages,
2270 zone->pages_min,
2271 zone->pages_low,
2272 zone->pages_high,
2273 zone->nr_active,
2274 zone->nr_inactive,
2275 zone->pages_scanned,
2276 zone->nr_scan_active, zone->nr_scan_inactive,
2277 zone->spanned_pages,
2278 zone->present_pages);
2279 seq_printf(m,
2280 "\n protection: (%lu",
2281 zone->lowmem_reserve[0]);
2282 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
2283 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
2284 seq_printf(m,
2285 ")"
2286 "\n pagesets");
2287 for_each_online_cpu(i) {
2288 struct per_cpu_pageset *pageset;
2289 int j;
2290
2291 pageset = zone_pcp(zone, i);
2292 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2293 if (pageset->pcp[j].count)
2294 break;
2295 }
2296 if (j == ARRAY_SIZE(pageset->pcp))
2297 continue;
2298 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2299 seq_printf(m,
2300 "\n cpu: %i pcp: %i"
2301 "\n count: %i"
2302 "\n high: %i"
2303 "\n batch: %i",
2304 i, j,
2305 pageset->pcp[j].count,
2306 pageset->pcp[j].high,
2307 pageset->pcp[j].batch);
2308 }
2309#ifdef CONFIG_NUMA
2310 seq_printf(m,
2311 "\n numa_hit: %lu"
2312 "\n numa_miss: %lu"
2313 "\n numa_foreign: %lu"
2314 "\n interleave_hit: %lu"
2315 "\n local_node: %lu"
2316 "\n other_node: %lu",
2317 pageset->numa_hit,
2318 pageset->numa_miss,
2319 pageset->numa_foreign,
2320 pageset->interleave_hit,
2321 pageset->local_node,
2322 pageset->other_node);
2323#endif
2324 }
2325 seq_printf(m,
2326 "\n all_unreclaimable: %u"
2327 "\n prev_priority: %i"
2328 "\n temp_priority: %i"
2329 "\n start_pfn: %lu",
2330 zone->all_unreclaimable,
2331 zone->prev_priority,
2332 zone->temp_priority,
2333 zone->zone_start_pfn);
2334 spin_unlock_irqrestore(&zone->lock, flags);
2335 seq_putc(m, '\n');
2336 }
2337 return 0;
2338}
2339
2340struct seq_operations zoneinfo_op = {
2341 .start = frag_start, /* iterate over all zones. The same as in
2342 * fragmentation. */
2343 .next = frag_next,
2344 .stop = frag_stop,
2345 .show = zoneinfo_show,
2346};
2347
2348static char *vmstat_text[] = {
2349 "nr_dirty",
2350 "nr_writeback",
2351 "nr_unstable",
2352 "nr_page_table_pages",
2353 "nr_mapped",
2354 "nr_slab",
2355
2356 "pgpgin",
2357 "pgpgout",
2358 "pswpin",
2359 "pswpout",
2360
2361 "pgalloc_high",
2362 "pgalloc_normal",
2363 "pgalloc_dma32",
2364 "pgalloc_dma",
2365
2366 "pgfree",
2367 "pgactivate",
2368 "pgdeactivate",
2369
2370 "pgfault",
2371 "pgmajfault",
2372
2373 "pgrefill_high",
2374 "pgrefill_normal",
2375 "pgrefill_dma32",
2376 "pgrefill_dma",
2377
2378 "pgsteal_high",
2379 "pgsteal_normal",
2380 "pgsteal_dma32",
2381 "pgsteal_dma",
2382
2383 "pgscan_kswapd_high",
2384 "pgscan_kswapd_normal",
2385 "pgscan_kswapd_dma32",
2386 "pgscan_kswapd_dma",
2387
2388 "pgscan_direct_high",
2389 "pgscan_direct_normal",
2390 "pgscan_direct_dma32",
2391 "pgscan_direct_dma",
2392
2393 "pginodesteal",
2394 "slabs_scanned",
2395 "kswapd_steal",
2396 "kswapd_inodesteal",
2397 "pageoutrun",
2398 "allocstall",
2399
2400 "pgrotated",
2401 "nr_bounce",
2402};
2403
2404static void *vmstat_start(struct seq_file *m, loff_t *pos)
2405{
2406 struct page_state *ps;
2407
2408 if (*pos >= ARRAY_SIZE(vmstat_text))
2409 return NULL;
2410
2411 ps = kmalloc(sizeof(*ps), GFP_KERNEL);
2412 m->private = ps;
2413 if (!ps)
2414 return ERR_PTR(-ENOMEM);
2415 get_full_page_state(ps);
2416 ps->pgpgin /= 2; /* sectors -> kbytes */
2417 ps->pgpgout /= 2;
2418 return (unsigned long *)ps + *pos;
2419}
2420
2421static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
2422{
2423 (*pos)++;
2424 if (*pos >= ARRAY_SIZE(vmstat_text))
2425 return NULL;
2426 return (unsigned long *)m->private + *pos;
2427}
2428
2429static int vmstat_show(struct seq_file *m, void *arg)
2430{
2431 unsigned long *l = arg;
2432 unsigned long off = l - (unsigned long *)m->private;
2433
2434 seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
2435 return 0;
2436}
2437
2438static void vmstat_stop(struct seq_file *m, void *arg)
2439{
2440 kfree(m->private);
2441 m->private = NULL;
2442}
2443
2444struct seq_operations vmstat_op = {
2445 .start = vmstat_start,
2446 .next = vmstat_next,
2447 .stop = vmstat_stop,
2448 .show = vmstat_show,
2449};
2450
2451#endif /* CONFIG_PROC_FS */
2452
2453#ifdef CONFIG_HOTPLUG_CPU 2098#ifdef CONFIG_HOTPLUG_CPU
2454static int page_alloc_cpu_notify(struct notifier_block *self, 2099static int page_alloc_cpu_notify(struct notifier_block *self,
2455 unsigned long action, void *hcpu) 2100 unsigned long action, void *hcpu)
2456{ 2101{
2457 int cpu = (unsigned long)hcpu; 2102 int cpu = (unsigned long)hcpu;
2458 long *count;
2459 unsigned long *src, *dest;
2460 2103
2461 if (action == CPU_DEAD) { 2104 if (action == CPU_DEAD) {
2462 int i;
2463
2464 /* Drain local pagecache count. */
2465 count = &per_cpu(nr_pagecache_local, cpu);
2466 atomic_add(*count, &nr_pagecache);
2467 *count = 0;
2468 local_irq_disable(); 2105 local_irq_disable();
2469 __drain_pages(cpu); 2106 __drain_pages(cpu);
2470 2107 vm_events_fold_cpu(cpu);
2471 /* Add dead cpu's page_states to our own. */
2472 dest = (unsigned long *)&__get_cpu_var(page_states);
2473 src = (unsigned long *)&per_cpu(page_states, cpu);
2474
2475 for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
2476 i++) {
2477 dest[i] += src[i];
2478 src[i] = 0;
2479 }
2480
2481 local_irq_enable(); 2108 local_irq_enable();
2109 refresh_cpu_vm_stats(cpu);
2482 } 2110 }
2483 return NOTIFY_OK; 2111 return NOTIFY_OK;
2484} 2112}
@@ -2804,42 +2432,14 @@ void *__init alloc_large_system_hash(const char *tablename,
2804} 2432}
2805 2433
2806#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 2434#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
2807/*
2808 * pfn <-> page translation. out-of-line version.
2809 * (see asm-generic/memory_model.h)
2810 */
2811#if defined(CONFIG_FLATMEM)
2812struct page *pfn_to_page(unsigned long pfn)
2813{
2814 return mem_map + (pfn - ARCH_PFN_OFFSET);
2815}
2816unsigned long page_to_pfn(struct page *page)
2817{
2818 return (page - mem_map) + ARCH_PFN_OFFSET;
2819}
2820#elif defined(CONFIG_DISCONTIGMEM)
2821struct page *pfn_to_page(unsigned long pfn)
2822{
2823 int nid = arch_pfn_to_nid(pfn);
2824 return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
2825}
2826unsigned long page_to_pfn(struct page *page)
2827{
2828 struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
2829 return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
2830}
2831#elif defined(CONFIG_SPARSEMEM)
2832struct page *pfn_to_page(unsigned long pfn) 2435struct page *pfn_to_page(unsigned long pfn)
2833{ 2436{
2834 return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; 2437 return __pfn_to_page(pfn);
2835} 2438}
2836
2837unsigned long page_to_pfn(struct page *page) 2439unsigned long page_to_pfn(struct page *page)
2838{ 2440{
2839 long section_id = page_to_section(page); 2441 return __page_to_pfn(page);
2840 return page - __section_mem_map_addr(__nr_to_section(section_id));
2841} 2442}
2842#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */
2843EXPORT_SYMBOL(pfn_to_page); 2443EXPORT_SYMBOL(pfn_to_page);
2844EXPORT_SYMBOL(page_to_pfn); 2444EXPORT_SYMBOL(page_to_pfn);
2845#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 2445#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
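The out-of-line wrappers now simply forward to the generic __pfn_to_page()/__page_to_pfn() helpers from asm-generic/memory_model.h, which encode the same arithmetic the removed per-model branches open-coded. For the flat-memory case that arithmetic is just an offset into mem_map; a compilable stand-alone illustration, with stand-in definitions for struct page, mem_map and ARCH_PFN_OFFSET:

struct page { unsigned long flags; };           /* stand-in definition   */
static struct page mem_map[16];                 /* stand-in flat mem_map */
#define ARCH_PFN_OFFSET 0UL                     /* assumption: no offset */

/* Mirrors the removed CONFIG_FLATMEM branch: page <-> pfn is array math. */
static struct page *flat_pfn_to_page(unsigned long pfn)
{
        return mem_map + (pfn - ARCH_PFN_OFFSET);
}

static unsigned long flat_page_to_pfn(struct page *page)
{
        return (unsigned long)(page - mem_map) + ARCH_PFN_OFFSET;
}

int main(void)
{
        return flat_page_to_pfn(flat_pfn_to_page(5)) == 5 ? 0 : 1;
}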