From 295ab93497ec703f7d6eaf0787dd9768b83035fe Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Tue, 21 Jun 2005 17:14:38 -0700 Subject: [PATCH] mm: add /proc/zoneinfo Add /proc/zoneinfo file to display information about memory zones. Useful to analyze VM behaviour. Signed-off-by: Nikita Danilov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b1061b1962f8..40169f0b7e9e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1853,6 +1853,115 @@ struct seq_operations fragmentation_op = { .show = frag_show, }; +/* + * Output information about zones in @pgdat. + */ +static int zoneinfo_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { + int i; + + if (!zone->present_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); + seq_printf(m, + "\n pages free %lu" + "\n min %lu" + "\n low %lu" + "\n high %lu" + "\n active %lu" + "\n inactive %lu" + "\n scanned %lu (a: %lu i: %lu)" + "\n spanned %lu" + "\n present %lu", + zone->free_pages, + zone->pages_min, + zone->pages_low, + zone->pages_high, + zone->nr_active, + zone->nr_inactive, + zone->pages_scanned, + zone->nr_scan_active, zone->nr_scan_inactive, + zone->spanned_pages, + zone->present_pages); + seq_printf(m, + "\n protection: (%lu", + zone->lowmem_reserve[0]); + for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) + seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, + ")" + "\n pagesets"); + for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { + struct per_cpu_pageset *pageset; + int j; + + pageset = &zone->pageset[i]; + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + if (pageset->pcp[j].count) + break; + } + if (j == ARRAY_SIZE(pageset->pcp)) + continue; + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + seq_printf(m, + "\n cpu: %i pcp: %i" + "\n count: %i" + "\n low: %i" + "\n high: %i" + "\n batch: %i", + i, j, + pageset->pcp[j].count, + pageset->pcp[j].low, + pageset->pcp[j].high, + pageset->pcp[j].batch); + } +#ifdef CONFIG_NUMA + seq_printf(m, + "\n numa_hit: %lu" + "\n numa_miss: %lu" + "\n numa_foreign: %lu" + "\n interleave_hit: %lu" + "\n local_node: %lu" + "\n other_node: %lu", + pageset->numa_hit, + pageset->numa_miss, + pageset->numa_foreign, + pageset->interleave_hit, + pageset->local_node, + pageset->other_node); +#endif + } + seq_printf(m, + "\n all_unreclaimable: %u" + "\n prev_priority: %i" + "\n temp_priority: %i" + "\n start_pfn: %lu", + zone->all_unreclaimable, + zone->prev_priority, + zone->temp_priority, + zone->zone_start_pfn); + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations zoneinfo_op = { + .start = frag_start, /* iterate over all zones. The same as in + * fragmentation. */ + .next = frag_next, + .stop = frag_stop, + .show = zoneinfo_show, +}; + static char *vmstat_text[] = { "nr_dirty", "nr_writeback", @@ -2058,10 +2167,10 @@ static void setup_per_zone_pages_min(void) min_pages = 128; zone->pages_min = min_pages; } else { - /* if it's a lowmem zone, reserve a number of pages + /* if it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = (pages_min * zone->present_pages) / + zone->pages_min = (pages_min * zone->present_pages) / lowmem_pages; } -- cgit v1.2.2 From 753ee728964e5afb80c17659cc6c3a6fd0a42fe0 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Tue, 21 Jun 2005 17:14:41 -0700 Subject: [PATCH] VM: early zone reclaim This is the core of the (much simplified) early reclaim. The goal of this patch is to reclaim some easily-freed pages from a zone before falling back onto another zone. One of the major uses of this is NUMA machines. With the default allocator behavior the allocator would look for memory in another zone, which might be off-node, before trying to reclaim from the current zone. This adds a zone tuneable to enable early zone reclaim. It is selected on a per-zone basis and is turned on/off via syscall. Adding some extra throttling on the reclaim was also required (patch 4/4). Without the machine would grind to a crawl when doing a "make -j" kernel build. Even with this patch the System Time is higher on average, but it seems tolerable. Here are some numbers for kernbench runs on a 2-node, 4cpu, 8Gig RAM Altix in the "make -j" run: wall user sys %cpu ctx sw. sleeps ---- ---- --- ---- ------ ------ No patch 1009 1384 847 258 298170 504402 w/patch, no reclaim 880 1376 667 288 254064 396745 w/patch & reclaim 1079 1385 926 252 291625 548873 These numbers are the average of 2 runs of 3 "make -j" runs done right after system boot. Run-to-run variability for "make -j" is huge, so these numbers aren't terribly useful except to seee that with reclaim the benchmark still finishes in a reasonable amount of time. I also looked at the NUMA hit/miss stats for the "make -j" runs and the reclaim doesn't make any difference when the machine is thrashing away. Doing a "make -j8" on a single node that is filled with page cache pages takes 700 seconds with reclaim turned on and 735 seconds without reclaim (due to remote memory accesses). The simple zone_reclaim syscall program is at http://www.bork.org/~mort/sgi/zone_reclaim.c Signed-off-by: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40169f0b7e9e..3c0f69ded6b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -724,6 +724,14 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, return 1; } +static inline int +should_reclaim_zone(struct zone *z, unsigned int gfp_mask) +{ + if (!z->reclaim_pages) + return 0; + return 1; +} + /* * This is the 'heart' of the zoned buddy allocator. */ @@ -760,17 +768,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order, classzone_idx = zone_idx(zones[0]); - restart: +restart: /* Go through the zonelist once, looking for a zone with enough free */ for (i = 0; (z = zones[i]) != NULL; i++) { - - if (!zone_watermark_ok(z, order, z->pages_low, - classzone_idx, 0, 0)) - continue; + int do_reclaim = should_reclaim_zone(z, gfp_mask); if (!cpuset_zone_allowed(z)) continue; + /* + * If the zone is to attempt early page reclaim then this loop + * will try to reclaim pages and check the watermark a second + * time before giving up and falling back to the next zone. + */ +zone_reclaim_retry: + if (!zone_watermark_ok(z, order, z->pages_low, + classzone_idx, 0, 0)) { + if (!do_reclaim) + continue; + else { + zone_reclaim(z, gfp_mask, order); + /* Only try reclaim once */ + do_reclaim = 0; + goto zone_reclaim_retry; + } + } + page = buffered_rmqueue(z, order, gfp_mask); if (page) goto got_pg; -- cgit v1.2.2 From 0c35bbadc59f5ed105c34471143eceb4c0dd9c95 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Tue, 21 Jun 2005 17:14:42 -0700 Subject: [PATCH] VM: add __GFP_NORECLAIM When using the early zone reclaim, it was noticed that allocating new pages that should be spread across the whole system caused eviction of local pages. This adds a new GFP flag to prevent early reclaim from happening during certain allocation attempts. The example that is implemented here is for page cache pages. We want page cache pages to be spread across the whole system, and we don't want page cache pages to evict other pages to get local memory. Signed-off-by: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3c0f69ded6b5..a9da20bc2ed0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -729,6 +729,8 @@ should_reclaim_zone(struct zone *z, unsigned int gfp_mask) { if (!z->reclaim_pages) return 0; + if (gfp_mask & __GFP_NORECLAIM) + return 0; return 1; } -- cgit v1.2.2 From 1e7e5a9048b30c57ba1ddaa6cdf59b21b65cde99 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Tue, 21 Jun 2005 17:14:43 -0700 Subject: [PATCH] VM: rate limit early reclaim When early zone reclaim is turned on the LRU is scanned more frequently when a zone is low on memory. This limits when the zone reclaim can be called by skipping the scan if another thread (either via kswapd or sync reclaim) is already reclaiming from the zone. Signed-off-by: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a9da20bc2ed0..2019c1b19254 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1738,6 +1738,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->nr_scan_inactive = 0; zone->nr_active = 0; zone->nr_inactive = 0; + atomic_set(&zone->reclaim_in_progress, -1); if (!size) continue; -- cgit v1.2.2 From e7c8d5c9955a4d2e88e36b640563f5d6d5aba48a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 21 Jun 2005 17:14:47 -0700 Subject: [PATCH] node local per-cpu-pages This patch modifies the way pagesets in struct zone are managed. Each zone has a per-cpu array of pagesets. So any particular CPU has some memory in each zone structure which belongs to itself. Even if that CPU is not local to that zone. So the patch relocates the pagesets for each cpu to the node that is nearest to the cpu instead of allocating the pagesets in the (possibly remote) target zone. This means that the operations to manage pages on remote zone can be done with information available locally. We play a macro trick so that non-NUMA pmachines avoid the additional pointer chase on the page allocator fastpath. AIM7 benchmark on a 32 CPU SGI Altix w/o patches: Tasks jobs/min jti jobs/min/task real cpu 1 484.68 100 484.6769 12.01 1.97 Fri Mar 25 11:01:42 2005 100 27140.46 89 271.4046 21.44 148.71 Fri Mar 25 11:02:04 2005 200 30792.02 82 153.9601 37.80 296.72 Fri Mar 25 11:02:42 2005 300 32209.27 81 107.3642 54.21 451.34 Fri Mar 25 11:03:37 2005 400 34962.83 78 87.4071 66.59 588.97 Fri Mar 25 11:04:44 2005 500 31676.92 75 63.3538 91.87 742.71 Fri Mar 25 11:06:16 2005 600 36032.69 73 60.0545 96.91 885.44 Fri Mar 25 11:07:54 2005 700 35540.43 77 50.7720 114.63 1024.28 Fri Mar 25 11:09:49 2005 800 33906.70 74 42.3834 137.32 1181.65 Fri Mar 25 11:12:06 2005 900 34120.67 73 37.9119 153.51 1325.26 Fri Mar 25 11:14:41 2005 1000 34802.37 74 34.8024 167.23 1465.26 Fri Mar 25 11:17:28 2005 with slab API changes and pageset patch: Tasks jobs/min jti jobs/min/task real cpu 1 485.00 100 485.0000 12.00 1.96 Fri Mar 25 11:46:18 2005 100 28000.96 89 280.0096 20.79 150.45 Fri Mar 25 11:46:39 2005 200 32285.80 79 161.4290 36.05 293.37 Fri Mar 25 11:47:16 2005 300 40424.15 84 134.7472 43.19 438.42 Fri Mar 25 11:47:59 2005 400 39155.01 79 97.8875 59.46 590.05 Fri Mar 25 11:48:59 2005 500 37881.25 82 75.7625 76.82 730.19 Fri Mar 25 11:50:16 2005 600 39083.14 78 65.1386 89.35 872.79 Fri Mar 25 11:51:46 2005 700 38627.83 77 55.1826 105.47 1022.46 Fri Mar 25 11:53:32 2005 800 39631.94 78 49.5399 117.48 1169.94 Fri Mar 25 11:55:30 2005 900 36903.70 79 41.0041 141.94 1310.78 Fri Mar 25 11:57:53 2005 1000 36201.23 77 36.2012 160.77 1458.31 Fri Mar 25 12:00:34 2005 Signed-off-by: Christoph Lameter Signed-off-by: Shobhit Dayal Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 211 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 176 insertions(+), 35 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2019c1b19254..95cbd30a67b8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -71,6 +71,11 @@ EXPORT_SYMBOL(nr_swap_pages); struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; EXPORT_SYMBOL(zone_table); +#ifdef CONFIG_NUMA +static struct per_cpu_pageset + pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata; +#endif + static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; int min_free_kbytes = 1024; @@ -520,7 +525,7 @@ static void __drain_pages(unsigned int cpu) for_each_zone(zone) { struct per_cpu_pageset *pset; - pset = &zone->pageset[cpu]; + pset = zone_pcp(zone, cpu); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; @@ -583,12 +588,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) local_irq_save(flags); cpu = smp_processor_id(); - p = &z->pageset[cpu]; + p = zone_pcp(z,cpu); if (pg == orig) { - z->pageset[cpu].numa_hit++; + p->numa_hit++; } else { p->numa_miss++; - zonelist->zones[0]->pageset[cpu].numa_foreign++; + zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; } if (pg == NODE_DATA(numa_node_id())) p->local_node++; @@ -615,7 +620,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (PageAnon(page)) page->mapping = NULL; free_pages_check(__FUNCTION__, page); - pcp = &zone->pageset[get_cpu()].pcp[cold]; + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); if (pcp->count >= pcp->high) pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); @@ -659,7 +664,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags) if (order == 0) { struct per_cpu_pages *pcp; - pcp = &zone->pageset[get_cpu()].pcp[cold]; + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); if (pcp->count <= pcp->low) pcp->count += rmqueue_bulk(zone, 0, @@ -1262,7 +1267,7 @@ void show_free_areas(void) if (!cpu_possible(cpu)) continue; - pageset = zone->pageset + cpu; + pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) printk("cpu %d %s: low %d, high %d, batch %d\n", @@ -1645,6 +1650,157 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, memmap_init_zone((size), (nid), (zone), (start_pfn)) #endif +static int __devinit zone_batchsize(struct zone *zone) +{ + int batch; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/4 of a meg - there's + * no point in going beyond the size of L2 cache. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 256 * 1024) + batch = (256 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + /* + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. + * + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. + */ + batch = (1 << fls(batch + batch/2)) - 1; + return batch; +} + +#ifdef CONFIG_NUMA +/* + * Dynamicaly allocate memory for the + * per cpu pageset array in struct zone. + */ +static int __devinit process_zones(int cpu) +{ + struct zone *zone, *dzone; + int i; + + for_each_zone(zone) { + struct per_cpu_pageset *npageset = NULL; + + npageset = kmalloc_node(sizeof(struct per_cpu_pageset), + GFP_KERNEL, cpu_to_node(cpu)); + if (!npageset) { + zone->pageset[cpu] = NULL; + goto bad; + } + + if (zone->pageset[cpu]) { + memcpy(npageset, zone->pageset[cpu], + sizeof(struct per_cpu_pageset)); + + /* Relocate lists */ + for (i = 0; i < 2; i++) { + INIT_LIST_HEAD(&npageset->pcp[i].list); + list_splice(&zone->pageset[cpu]->pcp[i].list, + &npageset->pcp[i].list); + } + } else { + struct per_cpu_pages *pcp; + unsigned long batch; + + batch = zone_batchsize(zone); + + pcp = &npageset->pcp[0]; /* hot */ + pcp->count = 0; + pcp->low = 2 * batch; + pcp->high = 6 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + + pcp = &npageset->pcp[1]; /* cold*/ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + } + zone->pageset[cpu] = npageset; + } + + return 0; +bad: + for_each_zone(dzone) { + if (dzone == zone) + break; + kfree(dzone->pageset[cpu]); + dzone->pageset[cpu] = NULL; + } + return -ENOMEM; +} + +static inline void free_zone_pagesets(int cpu) +{ +#ifdef CONFIG_NUMA + struct zone *zone; + + for_each_zone(zone) { + struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + + zone_pcp(zone, cpu) = NULL; + kfree(pset); + } +#endif +} + +static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = NOTIFY_OK; + + switch (action) { + case CPU_UP_PREPARE: + if (process_zones(cpu)) + ret = NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + free_zone_pagesets(cpu); + break; +#endif + default: + break; + } + return ret; +} + +static struct notifier_block pageset_notifier = + { &pageset_cpuup_callback, NULL, 0 }; + +void __init setup_per_cpu_pageset() +{ + int err; + + /* Initialize per_cpu_pageset for cpu 0. + * A cpuup callback will do this for every cpu + * as it comes online + */ + err = process_zones(smp_processor_id()); + BUG_ON(err); + register_cpu_notifier(&pageset_notifier); +} + +#endif + /* * Set up the zone data structures: * - mark all pages reserved @@ -1687,43 +1843,28 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - /* - * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/4 of a meg - there's - * no point in going beyond the size of L2 cache. - * - * OK, so we don't know how big the cache is. So guess. - */ - batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 256 * 1024) - batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ - if (batch < 1) - batch = 1; - - /* - * Clamp the batch to a 2^n - 1 value. Having a power - * of 2 value was found to be more likely to have - * suboptimal cache aliasing properties in some cases. - * - * For example if 2 tasks are alternately allocating - * batches of pages, one task can end up with a lot - * of pages of one half of the possible page colors - * and the other with pages of the other colors. - */ - batch = (1 << fls(batch + batch/2)) - 1; + batch = zone_batchsize(zone); for (cpu = 0; cpu < NR_CPUS; cpu++) { struct per_cpu_pages *pcp; +#ifdef CONFIG_NUMA + struct per_cpu_pageset *pgset; + pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS + + (j * NR_CPUS) + cpu]; + + zone->pageset[cpu] = pgset; +#else + struct per_cpu_pageset *pgset = zone_pcp(zone, cpu); +#endif - pcp = &zone->pageset[cpu].pcp[0]; /* hot */ + pcp = &pgset->pcp[0]; /* hot */ pcp->count = 0; pcp->low = 2 * batch; pcp->high = 6 * batch; pcp->batch = 1 * batch; INIT_LIST_HEAD(&pcp->list); - pcp = &zone->pageset[cpu].pcp[1]; /* cold */ + pcp = &pgset->pcp[1]; /* cold */ pcp->count = 0; pcp->low = 0; pcp->high = 2 * batch; @@ -1929,7 +2070,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) struct per_cpu_pageset *pageset; int j; - pageset = &zone->pageset[i]; + pageset = zone_pcp(zone, i); for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { if (pageset->pcp[j].count) break; -- cgit v1.2.2 From 1ad539b2bd89bf2e129123eb24d5bcc4484a35de Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Tue, 21 Jun 2005 17:14:53 -0700 Subject: [PATCH] vm: try_to_free_pages unused argument try_to_free_pages accepts a third argument, order, but hasn't used it since before 2.6.0. The following patch removes the argument and updates all the calls to try_to_free_pages. Signed-off-by: Darren Hart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 95cbd30a67b8..70f710286ced 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -859,7 +859,7 @@ rebalance: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zones, gfp_mask, order); + did_some_progress = try_to_free_pages(zones, gfp_mask); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; -- cgit v1.2.2 From 83e5d8f7253cb7b14472385a6d57df1e9f848e8e Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 21 Jun 2005 17:14:54 -0700 Subject: [PATCH] __mod_page_state(): pass unsigned long instead of unsigned By making the offset argument of __mod_page_state an unsigned long instead of unsigned, we can avoid forcing the compiler to sign extend a usually constant argument. This saves 1 instruction on x86-64. Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 70f710286ced..c976628f9d95 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1158,7 +1158,7 @@ unsigned long __read_page_state(unsigned offset) return ret; } -void __mod_page_state(unsigned offset, unsigned long delta) +void __mod_page_state(unsigned long offset, unsigned long delta) { unsigned long flags; void* ptr; -- cgit v1.2.2 From c2f29ea111e3344ed48257c2a142c3db514e1529 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 21 Jun 2005 17:14:55 -0700 Subject: [PATCH] __read_page_state(): pass unsigned long instead of unsigned By making the offset argument of __read_page_state an unsigned long instead of unsigned, we can avoid forcing the compiler to sign extend a usually constant argument. This saves 1 instruction on x86-64. Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c976628f9d95..10446a646374 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1144,7 +1144,7 @@ void get_full_page_state(struct page_state *ret) __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); } -unsigned long __read_page_state(unsigned offset) +unsigned long __read_page_state(unsigned long offset) { unsigned long ret = 0; int cpu; -- cgit v1.2.2 From 578c2fd6a7f378434655e5c480e23152a3994404 Mon Sep 17 00:00:00 2001 From: Janet Morgan Date: Tue, 21 Jun 2005 17:14:56 -0700 Subject: [PATCH] add OOM debug This patch provides more debug info when the system is OOM. It displays memory stats (basically sysrq-m info) from __alloc_pages() when page allocation fails and during OOM kill. Thanks to Dave Jones for coming up with the idea. Signed-off-by: Janet Morgan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 10446a646374..be05d17bd7df 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -935,6 +935,7 @@ nopage: " order:%d, mode:0x%x\n", p->comm, order, gfp_mask); dump_stack(); + show_mem(); } return NULL; got_pg: -- cgit v1.2.2 From 4ae7c03943fca73f23bc0cdb938070f41b98101f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 21 Jun 2005 17:14:57 -0700 Subject: [PATCH] Periodically drain non local pagesets The pageset array can potentially acquire a huge amount of memory on large NUMA systems. F.e. on a system with 512 processors and 256 nodes there will be 256*512 pagesets. If each pageset only holds 5 pages then we are talking about 655360 pages.With a 16K page size on IA64 this results in potentially 10 Gigabytes of memory being trapped in pagesets. The typical cases are much less for smaller systems but there is still the potential of memory being trapped in off node pagesets. Off node memory may be rarely used if local memory is available and so we may potentially have memory in seldom used pagesets without this patch. The slab allocator flushes its per cpu caches every 2 seconds. The following patch flushes the off node pageset caches in the same way by tying into the slab flush. The patch also changes /proc/zoneinfo to include the number of pages currently in each pageset. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index be05d17bd7df..a95e72d7f945 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -516,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, return allocated; } +#ifdef CONFIG_NUMA +/* Called from the slab reaper to drain remote pagesets */ +void drain_remote_pages(void) +{ + struct zone *zone; + int i; + unsigned long flags; + + local_irq_save(flags); + for_each_zone(zone) { + struct per_cpu_pageset *pset; + + /* Do not drain local pagesets */ + if (zone->zone_pgdat->node_id == numa_node_id()) + continue; + + pset = zone->pageset[smp_processor_id()]; + for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { + struct per_cpu_pages *pcp; + + pcp = &pset->pcp[i]; + if (pcp->count) + pcp->count -= free_pages_bulk(zone, pcp->count, + &pcp->list, 0); + } + } + local_irq_restore(flags); +} +#endif + #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { @@ -1271,12 +1301,13 @@ void show_free_areas(void) pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d\n", + printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", cpu, temperature ? "cold" : "hot", pageset->pcp[temperature].low, pageset->pcp[temperature].high, - pageset->pcp[temperature].batch); + pageset->pcp[temperature].batch, + pageset->pcp[temperature].count); } } -- cgit v1.2.2 From 2caaad41e4aa8f5dd999695b4ddeaa0e7f3912a4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 21 Jun 2005 17:15:00 -0700 Subject: [PATCH] Reduce size of huge boot per_cpu_pageset Reduce size of the huge per_cpu_pageset structure in __initdata introduced into mm1 with the pageset localization patchset. Use one specially configured pageset per cpu for all zones and nodes during bootup. - Avoid duplication of pageset initialization code. - do the adding to the pageset list before potential free_pages_bulk in free_hot_cold_page (otherwise we would have to hold a page in a pageset during the period that the boot pagesets are in use). - remove mistaken __cpuinitdata attribute and revert back to __initdata for the boot pageset. A boot pageset is not necessary for cpu hotplug. Tested for UP SMP NUMA on x86_64 (2.6.12-rc6-mm1): UP SMP NUMA Tested on IA64 (2.6.12-rc5-mm2): NUMA (2.6.12-rc6-mm1 broken for IA64 because of sparsemem patches) Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 108 ++++++++++++++++++++++---------------------------------- 1 file changed, 42 insertions(+), 66 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a95e72d7f945..418102a02921 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -71,11 +71,6 @@ EXPORT_SYMBOL(nr_swap_pages); struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; EXPORT_SYMBOL(zone_table); -#ifdef CONFIG_NUMA -static struct per_cpu_pageset - pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata; -#endif - static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; int min_free_kbytes = 1024; @@ -652,10 +647,10 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) free_pages_check(__FUNCTION__, page); pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); list_add(&page->lru, &pcp->list); pcp->count++; + if (pcp->count >= pcp->high) + pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); local_irq_restore(flags); put_cpu(); } @@ -1714,57 +1709,55 @@ static int __devinit zone_batchsize(struct zone *zone) return batch; } +inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + struct per_cpu_pages *pcp; + + pcp = &p->pcp[0]; /* hot */ + pcp->count = 0; + pcp->low = 2 * batch; + pcp->high = 6 * batch; + pcp->batch = max(1UL, 1 * batch); + INIT_LIST_HEAD(&pcp->list); + + pcp = &p->pcp[1]; /* cold*/ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = max(1UL, 1 * batch); + INIT_LIST_HEAD(&pcp->list); +} + #ifdef CONFIG_NUMA /* - * Dynamicaly allocate memory for the + * Boot pageset table. One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * Some NUMA counter updates may also be caught by the boot pagesets. + * These will be discarded when bootup is complete. + */ +static struct per_cpu_pageset + boot_pageset[NR_CPUS] __initdata; + +/* + * Dynamically allocate memory for the * per cpu pageset array in struct zone. */ static int __devinit process_zones(int cpu) { struct zone *zone, *dzone; - int i; for_each_zone(zone) { - struct per_cpu_pageset *npageset = NULL; - npageset = kmalloc_node(sizeof(struct per_cpu_pageset), + zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, cpu_to_node(cpu)); - if (!npageset) { - zone->pageset[cpu] = NULL; + if (!zone->pageset[cpu]) goto bad; - } - if (zone->pageset[cpu]) { - memcpy(npageset, zone->pageset[cpu], - sizeof(struct per_cpu_pageset)); - - /* Relocate lists */ - for (i = 0; i < 2; i++) { - INIT_LIST_HEAD(&npageset->pcp[i].list); - list_splice(&zone->pageset[cpu]->pcp[i].list, - &npageset->pcp[i].list); - } - } else { - struct per_cpu_pages *pcp; - unsigned long batch; - - batch = zone_batchsize(zone); - - pcp = &npageset->pcp[0]; /* hot */ - pcp->count = 0; - pcp->low = 2 * batch; - pcp->high = 6 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - - pcp = &npageset->pcp[1]; /* cold*/ - pcp->count = 0; - pcp->low = 0; - pcp->high = 2 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - } - zone->pageset[cpu] = npageset; + setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); } return 0; @@ -1878,30 +1871,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat, batch = zone_batchsize(zone); for (cpu = 0; cpu < NR_CPUS; cpu++) { - struct per_cpu_pages *pcp; #ifdef CONFIG_NUMA - struct per_cpu_pageset *pgset; - pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS + - (j * NR_CPUS) + cpu]; - - zone->pageset[cpu] = pgset; + /* Early boot. Slab allocator not functional yet */ + zone->pageset[cpu] = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); #else - struct per_cpu_pageset *pgset = zone_pcp(zone, cpu); + setup_pageset(zone_pcp(zone,cpu), batch); #endif - - pcp = &pgset->pcp[0]; /* hot */ - pcp->count = 0; - pcp->low = 2 * batch; - pcp->high = 6 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - - pcp = &pgset->pcp[1]; /* cold */ - pcp->count = 0; - pcp->low = 0; - pcp->high = 2 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); } printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", zone_names[j], realsize, batch); -- cgit v1.2.2 From 334795eca421287c41c257992027d29659dc0f97 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 21 Jun 2005 17:15:08 -0700 Subject: [PATCH] bad_page: clear reclaim and slab Since free_pages_check complains if PG_reclaim or PG_slab is set, bad_page ought to clear them to avoid repetitive reports (Nikita noticed this too). Let prep_new_page check page_count and PG_slab as free_pages_check does. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 418102a02921..62b950901d6f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page) printk(KERN_EMERG "Backtrace:\n"); dump_stack(); printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); - page->flags &= ~(1 << PG_private | + page->flags &= ~(1 << PG_lru | + 1 << PG_private | 1 << PG_locked | - 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback); set_page_count(page, 0); @@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order) */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapcount(page) || - (page->flags & ( + if ( page_mapcount(page) || + page->mapping != NULL || + page_count(page) != 0 || + (page->flags & ( + 1 << PG_lru | 1 << PG_private | 1 << PG_locked | - 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | 1 << PG_reclaim | + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback ))) bad_page(__FUNCTION__, page); -- cgit v1.2.2 From c0d62219a48bd91ec40fb254c930914dccc77ff1 Mon Sep 17 00:00:00 2001 From: Denis Vlasenko Date: Tue, 21 Jun 2005 17:15:14 -0700 Subject: [PATCH] Kill stray newline OOM killer prints a stray newline. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62b950901d6f..206920796f5f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1314,7 +1314,7 @@ void show_free_areas(void) get_page_state(&ps); get_zone_counts(&active, &inactive, &free); - printk("\nFree pages: %11ukB (%ukB HighMem)\n", + printk("Free pages: %11ukB (%ukB HighMem)\n", K(nr_free_pages()), K(nr_free_highpages())); -- cgit v1.2.2