From 2ae88149a27cadf2840e0ab8155bef13be285c03 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 28 Oct 2006 10:38:23 -0700 Subject: [PATCH] mm: clean up pagecache allocation - Consolidate page_cache_alloc - Fix splice: only the pagecache pages and filesystem data need to use mapping_gfp_mask. - Fix grab_cache_page_nowait: same as splice, also honour NUMA placement. Signed-off-by: Nick Piggin Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index cb26e33fd0ff..7b84dc814347 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -467,25 +467,15 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, } #ifdef CONFIG_NUMA -struct page *page_cache_alloc(struct address_space *x) +struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, mapping_gfp_mask(x), 0); + return alloc_pages_node(n, gfp, 0); } - return alloc_pages(mapping_gfp_mask(x), 0); + return alloc_pages(gfp, 0); } -EXPORT_SYMBOL(page_cache_alloc); - -struct page *page_cache_alloc_cold(struct address_space *x) -{ - if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0); - } - return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); -} -EXPORT_SYMBOL(page_cache_alloc_cold); +EXPORT_SYMBOL(__page_cache_alloc); #endif static int __sleep_on_page_lock(void *word) @@ -826,7 +816,6 @@ struct page * grab_cache_page_nowait(struct address_space *mapping, unsigned long index) { struct page *page = find_get_page(mapping, index); - gfp_t gfp_mask; if (page) { if (!TestSetPageLocked(page)) @@ -834,9 +823,8 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) page_cache_release(page); return NULL; } - gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; - page = alloc_pages(gfp_mask, 0); - if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) { + page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); + if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { page_cache_release(page); page = NULL; } -- cgit v1.2.2 From 3bb1a852ab6c9cdf211a2f4a2f502340c8c38eca Mon Sep 17 00:00:00 2001 From: Martin Bligh Date: Sat, 28 Oct 2006 10:38:24 -0700 Subject: [PATCH] vmscan: Fix temp_priority race The temp_priority field in zone is racy, as we can walk through a reclaim path, and just before we copy it into prev_priority, it can be overwritten (say with DEF_PRIORITY) by another reclaimer. The same bug is contained in both try_to_free_pages and balance_pgdat, but it is fixed slightly differently. In balance_pgdat, we keep a separate priority record per zone in a local array. In try_to_free_pages there is no need to do this, as the priority level is the same for all zones that we reclaim from. Impact of this bug is that temp_priority is copied into prev_priority, and setting this artificially high causes reclaimers to set distress artificially low. They then fail to reclaim mapped pages, when they are, in fact, under severe memory pressure (their priority may be as low as 0). This causes the OOM killer to fire incorrectly. From: Andrew Morton __zone_reclaim() isn't modifying zone->prev_priority. But zone->prev_priority is used in the decision whether or not to bring mapped pages onto the inactive list. 
Hence there's a risk here that __zone_reclaim() will fail because zone->prev_priority is large (ie: low urgency) and lots of mapped pages end up stuck on the active list. Fix that up by decreasing (ie making more urgent) zone->prev_priority as __zone_reclaim() scans the zone's pages. This bug perhaps explains why ZONE_RECLAIM_PRIORITY was created. It should be possible to remove that now, and to just start out at DEF_PRIORITY? Cc: Nick Piggin Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- mm/vmscan.c | 55 +++++++++++++++++++++++++++++++++++++++++-------------- mm/vmstat.c | 2 -- 3 files changed, 42 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f5fc45472d5c..ecf853b5e30e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2407,7 +2407,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone->free_pages = 0; - zone->temp_priority = zone->prev_priority = DEF_PRIORITY; + zone->prev_priority = DEF_PRIORITY; zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); diff --git a/mm/vmscan.c b/mm/vmscan.c index f05527bf792b..b32560ead5c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -723,6 +723,20 @@ done: return nr_reclaimed; } +/* + * We are about to scan this zone at a certain priority level. If that priority + * level is smaller (ie: more urgent) than the previous priority, then note + * that priority level within the zone. This is done so that when the next + * process comes in to scan this zone, it will immediately start out at this + * priority level rather than having to build up its own scanning priority. + * Here, this priority affects only the reclaim-mapped threshold. + */ +static inline void note_zone_scanning_priority(struct zone *zone, int priority) +{ + if (priority < zone->prev_priority) + zone->prev_priority = priority; +} + static inline int zone_is_near_oom(struct zone *zone) { return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3; @@ -972,9 +986,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = priority; - if (zone->prev_priority > priority) - zone->prev_priority = priority; + note_zone_scanning_priority(zone, priority); if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ @@ -1024,7 +1036,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = DEF_PRIORITY; lru_pages += zone->nr_active + zone->nr_inactive; } @@ -1065,13 +1076,22 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) if (!sc.all_unreclaimable) ret = 1; out: + /* + * Now that we've scanned all the zones at this priority level, note + * that level within the zone so that the next thread which performs + * scanning of this zone will immediately start out at this priority + * level. This affects only the decision whether or not to bring + * mapped pages onto the inactive list. 
+ */ + if (priority < 0) + priority = 0; for (i = 0; zones[i] != 0; i++) { struct zone *zone = zones[i]; if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->prev_priority = zone->temp_priority; + zone->prev_priority = priority; } return ret; } @@ -1111,6 +1131,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, }; + /* + * temp_priority is used to remember the scanning priority at which + * this zone was successfully refilled to free_pages == pages_high. + */ + int temp_priority[MAX_NR_ZONES]; loop_again: total_scanned = 0; @@ -1118,11 +1143,8 @@ loop_again: sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->temp_priority = DEF_PRIORITY; - } + for (i = 0; i < pgdat->nr_zones; i++) + temp_priority[i] = DEF_PRIORITY; for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -1183,10 +1205,9 @@ scan: if (!zone_watermark_ok(zone, order, zone->pages_high, end_zone, 0)) all_zones_ok = 0; - zone->temp_priority = priority; - if (zone->prev_priority > priority) - zone->prev_priority = priority; + temp_priority[i] = priority; sc.nr_scanned = 0; + note_zone_scanning_priority(zone, priority); nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -1226,10 +1247,15 @@ scan: break; } out: + /* + * Note within each zone the priority level at which this zone was + * brought into a happy state. So that the next thread which scans this + * zone will start out at that priority level. + */ for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; - zone->prev_priority = zone->temp_priority; + zone->prev_priority = temp_priority[i]; } if (!all_zones_ok) { cond_resched(); @@ -1614,6 +1640,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ priority = ZONE_RECLAIM_PRIORITY; do { + note_zone_scanning_priority(zone, priority); nr_reclaimed += shrink_zone(priority, zone, &sc); priority--; } while (priority >= 0 && nr_reclaimed < nr_pages); diff --git a/mm/vmstat.c b/mm/vmstat.c index 45b124e012f5..8614e8f6743b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -587,11 +587,9 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, "\n all_unreclaimable: %u" "\n prev_priority: %i" - "\n temp_priority: %i" "\n start_pfn: %lu", zone->all_unreclaimable, zone->prev_priority, - zone->temp_priority, zone->zone_start_pfn); spin_unlock_irqrestore(&zone->lock, flags); seq_putc(m, '\n'); -- cgit v1.2.2 From bbdb396a60b2ebf7de3b717991e5d3e28c8b7bbd Mon Sep 17 00:00:00 2001 From: Martin Bligh Date: Sat, 28 Oct 2006 10:38:25 -0700 Subject: [PATCH] Use min of two prio settings in calculating distress for reclaim If try_to_free_pages / balance_pgdat are called with a gfp_mask specifying GFP_IO and/or GFP_FS, they will reclaim the requisite number of pages, and then reset prev_priority to DEF_PRIORITY (or to some other high (ie: non-urgent) value). However, another reclaimer without those gfp_mask flags set (say, GFP_NOIO) may still be struggling to reclaim pages. The concurrent overwrite of zone->prev_priority will cause this GFP_NOIO thread to unexpectedly cease deactivating mapped pages, thus causing reclaim difficulties. 
The fix is to key the distress calculation not off zone->prev_priority alone, but to also take the local caller's priority into account, using min(zone->prev_priority, priority). Signed-off-by: Martin J. Bligh Cc: Nick Piggin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index b32560ead5c0..518540a4a2a6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -760,7 +760,7 @@ static inline int zone_is_near_oom(struct zone *zone) * But we had to alter page->flags anyway. */ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc) + struct scan_control *sc, int priority) { unsigned long pgmoved; int pgdeactivate = 0; @@ -784,7 +784,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * `distress' is a measure of how much trouble we're having * reclaiming pages. 0 -> no problems. 100 -> great trouble. */ - distress = 100 >> zone->prev_priority; + distress = 100 >> min(zone->prev_priority, priority); /* * The point of this algorithm is to decide when to start @@ -936,7 +936,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone, nr_to_scan = min(nr_active, (unsigned long)sc->swap_cluster_max); nr_active -= nr_to_scan; - shrink_active_list(nr_to_scan, zone, sc); + shrink_active_list(nr_to_scan, zone, sc, priority); } if (nr_inactive) { @@ -1384,7 +1384,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, if (zone->nr_scan_active >= nr_pages || pass > 3) { zone->nr_scan_active = 0; nr_to_scan = min(nr_pages, zone->nr_active); - shrink_active_list(nr_to_scan, zone, sc); + shrink_active_list(nr_to_scan, zone, sc, prio); } } -- cgit v1.2.2 From f2d0aa5bf8d4f7ae4cb1a7feebf5b1afddd0b9b0 Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Sat, 28 Oct 2006 10:38:32 -0700 Subject: [PATCH] memory hotplug: __GFP_NOWARN is better for __kmalloc_section_memmap() Add the __GFP_NOWARN flag to the __alloc_pages() call in __kmalloc_section_memmap(). It reduces noisy failure messages. On ia64 the section size is 1 GB, which means an order-8 allocation is needed for each section's memmap. That is often a very hard requirement to satisfy under heavy memory pressure. So __alloc_pages() gives up and prints many noisy stack traces reporting that no pages were available for the sections. (My current environment shows the stack trace 32 times....) But __kmalloc_section_memmap() falls back to vmalloc() after that failure, and the memmap allocation can then succeed. So the stack trace warning is just noise and shouldn't be shown. 
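As a rough illustration of the pattern this change relies on (the helper name below is hypothetical, not the mm/sparse.c code; alloc_pages(), page_address(), get_order() and vmalloc() are real kernel interfaces), the high-order allocation is attempted quietly and vmalloc() is the expected fallback:

/*
 * Sketch only: try the high-order, physically contiguous allocation
 * without warning on failure, then fall back to vmalloc().
 */
static void *alloc_memmap_sketch(unsigned long memmap_size)
{
	struct page *page;

	/* __GFP_NOWARN suppresses the allocation-failure stack trace */
	page = alloc_pages(GFP_KERNEL | __GFP_NOWARN, get_order(memmap_size));
	if (page)
		return page_address(page);

	/* expected fallback under heavy memory pressure */
	return vmalloc(memmap_size);
}
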
Signed-off-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 86c52ab80878..b3c82ba30012 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -211,7 +211,7 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages) struct page *page, *ret; unsigned long memmap_size = sizeof(struct page) * nr_pages; - page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); + page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); if (page) goto got_map_page; -- cgit v1.2.2 From 52fd24ca1db3a741f144bbc229beefe044202cac Mon Sep 17 00:00:00 2001 From: Giridhar Pemmasani Date: Sat, 28 Oct 2006 10:38:34 -0700 Subject: [PATCH] __vmalloc with GFP_ATOMIC causes 'sleeping from invalid context' If __vmalloc is called to allocate memory with GFP_ATOMIC in atomic context, the chain of calls results in __get_vm_area_node allocating memory for vm_struct with GFP_KERNEL, causing the 'sleeping from invalid context' warning. This patch fixes it by passing the gfp flags along so __get_vm_area_node allocates memory for vm_struct with the same flags. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1133dd3aafcf..6d381df7c9b3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -160,13 +160,15 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) return err; } -struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end, int node) +static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end, + int node, gfp_t gfp_mask) { struct vm_struct **p, *tmp, *area; unsigned long align = 1; unsigned long addr; + BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) { int bit = fls(size); @@ -180,7 +182,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, addr = ALIGN(start, align); size = PAGE_ALIGN(size); - area = kmalloc_node(sizeof(*area), GFP_KERNEL, node); + area = kmalloc_node(sizeof(*area), gfp_mask, node); if (unlikely(!area)) return NULL; @@ -236,7 +238,7 @@ out: struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { - return __get_vm_area_node(size, flags, start, end, -1); + return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL); } /** @@ -253,9 +255,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); } -struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node) +struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, + int node, gfp_t gfp_mask) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, + gfp_mask); } /* Caller must hold vmlist_lock */ @@ -487,7 +491,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, if (!size || (size >> PAGE_SHIFT) > num_physpages) return NULL; - area = get_vm_area_node(size, VM_ALLOC, node); + area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask); if (!area) return NULL; -- cgit v1.2.2 From ebed4bfc8da8df5b6b0bc4a5064a949f04683509 Mon Sep 17 00:00:00 2001 From: 
Hugh Dickins Date: Sat, 28 Oct 2006 10:38:43 -0700 Subject: [PATCH] hugetlb: fix absurd HugePages_Rsvd If you truncated an mmap'ed hugetlbfs file, then faulted on the truncated area, /proc/meminfo's HugePages_Rsvd wrapped hugely "negative". Reinstate my preliminary i_size check before attempting to allocate the page (though this only fixes the most obvious case: more work will be needed here). Signed-off-by: Hugh Dickins Cc: Adam Litke Cc: David Gibson Cc: "Chen, Kenneth W" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2dbec90dc3ba..a088f593a807 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -478,6 +478,9 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, retry: page = find_lock_page(mapping, idx); if (!page) { + size = i_size_read(mapping->host) >> HPAGE_SHIFT; + if (idx >= size) + goto out; if (hugetlb_get_quota(mapping)) goto out; page = alloc_huge_page(vma, address); -- cgit v1.2.2 From 0c6cb974636dd29681b03f8eb0ae227decab01fb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sat, 28 Oct 2006 10:38:59 -0700 Subject: [PATCH] Calculation fix for memory holes beyond the end of physical memory absent_pages_in_range() made the assumption that users of the arch-independent zone-sizing API would not care about holes beyond the end of physical memory. This was not the case and was "fixed" in a patch called "Account for holes that are outside the range of physical memory". However, when given a range that started before a hole in "real" memory and ended beyond the end of memory, it would get the result wrong. The bug is in mainline but a patch is below. It has been tested successfully on a number of machines and architectures. Additional credit to Keith Mannthey for discovering the problem, helping identify the correct fix and confirming it Worked For Him. Signed-off-by: Mel Gorman Cc: keith mannthey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ecf853b5e30e..b55bb358b832 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2261,7 +2261,7 @@ unsigned long __init __absent_pages_in_range(int nid, /* Account for ranges past physical memory on this node */ if (range_end_pfn > prev_end_pfn) - hole_pages = range_end_pfn - + hole_pages += range_end_pfn - max(range_start_pfn, prev_end_pfn); return hole_pages; -- cgit v1.2.2 From 5211e6e6c671f0d4b1e1a1023384d20227d8ee65 Mon Sep 17 00:00:00 2001 From: Giridhar Pemmasani Date: Sun, 29 Oct 2006 04:46:55 -0800 Subject: [PATCH] Fix GFP_HIGHMEM slab panic As reported by Martin J. Bligh , we let through some non-slab bits to slab allocation through __get_vm_area_node when doing a vmalloc. I haven't been able to reproduce this, although I understand why it happens: vmalloc allocates memory with GFP_KERNEL | __GFP_HIGHMEM and commit 52fd24ca1db3a741f144bbc229beefe044202cac resulted in the same flags being passed down to cache_alloc_refill, causing the BUG. The following patch fixes it. Note that when calling kmalloc_node, I am masking off __GFP_HIGHMEM with GFP_LEVEL_MASK, whereas __vmalloc_area_node does the same with ~(__GFP_HIGHMEM | __GFP_ZERO). IMHO, using GFP_LEVEL_MASK is preferable, but either should fix this problem. Signed-off-by: Giridhar Pemmasani (pgiri@yahoo.com) Cc: Martin J. 
Bligh Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6d381df7c9b3..46606c133e82 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -182,7 +182,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl addr = ALIGN(start, align); size = PAGE_ALIGN(size); - area = kmalloc_node(sizeof(*area), gfp_mask, node); + area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); if (unlikely(!area)) return NULL; -- cgit v1.2.2 From 941c7105dc4f4961727acc518e18e00b9a03cbf3 Mon Sep 17 00:00:00 2001 From: nkalmala Date: Thu, 2 Nov 2006 22:07:04 -0800 Subject: [PATCH] mm: un-needed add-store operation wastes a few bytes Un-needed add-store operation wastes a few bytes. 8 bytes wasted with -O2, on a ppc. Signed-off-by: nkalmala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b55bb358b832..bf2f6cff1d6a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -853,7 +853,7 @@ again: pcp = &zone_pcp(zone, cpu)->pcp[cold]; local_irq_save(flags); if (!pcp->count) { - pcp->count += rmqueue_bulk(zone, 0, + pcp->count = rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); if (unlikely(!pcp->count)) goto failed; -- cgit v1.2.2 From 029e332ea717810172e965ec50f942755ad0c58a Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 2 Nov 2006 22:07:06 -0800 Subject: [PATCH] Cleanup read_pages() Currently, read_pages() assumes that ->readpages() frees the passed pages. This patch frees any pages still left on pages_list in read_pages(), so ->readpages() implementations can simply ignore whatever remains in pages_list. Signed-off-by: OGAWA Hirofumi Cc: Steven French Cc: Miklos Szeredi Cc: Steven Whitehouse Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 1ba736ac0367..23cb61a01c6e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -173,6 +173,8 @@ static int read_pages(struct address_space *mapping, struct file *filp, if (mapping->a_ops->readpages) { ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); + /* Clean up the remaining pages */ + put_pages_list(pages); goto out; } -- cgit v1.2.2 From 7f6b8876c7e66b0d15af134e2a5b87e55514eb6d Mon Sep 17 00:00:00 2001 From: Daniel Yeisley Date: Thu, 2 Nov 2006 22:07:14 -0800 Subject: [PATCH] init_reap_node() initialization fix It looks like there is a bug in init_reap_node() in slab.c that can cause multiple oopses on certain ES7000 configurations. The variable reap_node is defined per cpu, but only initialized on a single CPU. This causes an oops in next_reap_node() when __get_cpu_var(reap_node) returns the wrong value. Fix is below. 
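For illustration, a minimal sketch of the rule the fix applies (the names below are hypothetical, not the slab.c code): initialisation running on one CPU must write the target CPU's per-cpu copy explicitly rather than going through __get_cpu_var():

/*
 * Sketch only: per_cpu(var, cpu) addresses the copy belonging to 'cpu',
 * while __get_cpu_var(var) addresses the copy of the CPU executing the
 * code - the wrong one when a single CPU initialises all of them.
 */
static DEFINE_PER_CPU(int, demo_reap_node);

static void demo_init_reap_node(int cpu, int node)
{
	/*
	 * Wrong: only the initialising CPU's copy would ever be set up:
	 *	__get_cpu_var(demo_reap_node) = node;
	 */

	/* Right: set up the copy belonging to 'cpu' */
	per_cpu(demo_reap_node, cpu) = node;
}
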
Signed-off-by: Dan Yeisley Cc: Andi Kleen Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Manfred Spraul Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 84c631f30741..3c4a7e34eddc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -883,7 +883,7 @@ static void init_reap_node(int cpu) if (node == MAX_NUMNODES) node = first_node(node_online_map); - __get_cpu_var(reap_node) = node; + per_cpu(reap_node, cpu) = node; } static void next_reap_node(void) -- cgit v1.2.2 From 8ce08464d2c749610a52c4d6c7c11080a7eaaef1 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 2 Nov 2006 22:07:28 -0800 Subject: [PATCH] Fix sys_move_pages when a NULL node list is passed sys_move_pages() uses vmalloc() to allocate an array of structures that it fills with information passed from user mode and then passes to do_pages_stat() (in the case where the node list is NULL). do_pages_stat() depends on a marker in the node field of the structure to decide how large the array is, and this marker is correctly inserted into the last element of the array. However, vmalloc() doesn't zero the memory it allocates, and if the user passes NULL for the node list, the node fields are not filled in (except for the end marker). If the memory vmalloc() returned happened to have a word with the marker value in just the right place, do_pages_stat() will fail to fill the status field of part of the array and we will return (random) kernel data to user mode. Signed-off-by: Stephen Rothwell Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index ba2453f9483d..b4979d423d2b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -952,7 +952,8 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, goto out; pm[i].node = node; - } + } else + pm[i].node = 0; /* anything to not match MAX_NUMNODES */ } /* End marker */ pm[nr_pages].node = MAX_NUMNODES; -- cgit v1.2.2 From 2b4ac44e7c7e16cf9411b81693ff3e604f332bf1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Nov 2006 12:27:48 -0800 Subject: [PATCH] vmalloc: optimization, cleanup, bugfixes - reorder 'struct vm_struct' to speed up lookups on CPUs with small cache lines. The fields 'next,addr,size' should now be in the same cache line, to speed up lookups. - One minor cleanup in __get_vm_area_node() - Bugfixes in vmalloc_user() and vmalloc_32_user(): NULL returns from __vmalloc() and __find_vm_area() were not tested. [akpm@osdl.org: remove redundant BUG_ONs] Signed-off-by: Eric Dumazet Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 46606c133e82..7dc6aa745166 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -186,10 +186,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl if (unlikely(!area)) return NULL; - if (unlikely(!size)) { - kfree (area); + if (unlikely(!size)) return NULL; - } /* * We always allocate a guard page. 
@@ -532,11 +530,12 @@ void *vmalloc_user(unsigned long size) void *ret; ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); - write_lock(&vmlist_lock); - area = __find_vm_area(ret); - area->flags |= VM_USERMAP; - write_unlock(&vmlist_lock); - + if (ret) { + write_lock(&vmlist_lock); + area = __find_vm_area(ret); + area->flags |= VM_USERMAP; + write_unlock(&vmlist_lock); + } return ret; } EXPORT_SYMBOL(vmalloc_user); @@ -605,11 +604,12 @@ void *vmalloc_32_user(unsigned long size) void *ret; ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); - write_lock(&vmlist_lock); - area = __find_vm_area(ret); - area->flags |= VM_USERMAP; - write_unlock(&vmlist_lock); - + if (ret) { + write_lock(&vmlist_lock); + area = __find_vm_area(ret); + area->flags |= VM_USERMAP; + write_unlock(&vmlist_lock); + } return ret; } EXPORT_SYMBOL(vmalloc_32_user); -- cgit v1.2.2 From 68589bc353037f233fe510ad9ff432338c95db66 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 14 Nov 2006 02:03:32 -0800 Subject: [PATCH] hugetlb: prepare_hugepage_range check offset too (David:) If hugetlbfs_file_mmap() returns a failure to do_mmap_pgoff() - for example, because the given file offset is not hugepage aligned - then do_mmap_pgoff will go to the unmap_and_free_vma backout path. But at this stage the vma hasn't been marked as hugepage, and the backout path will call unmap_region() on it. That will eventually call down to the non-hugepage version of unmap_page_range(). On ppc64, at least, that will cause serious problems if there are any existing hugepage pagetable entries in the vicinity - for example if there are any other hugepage mappings under the same PUD. unmap_page_range() will trigger a bad_pud() on the hugepage pud entries. I suspect this will also cause bad problems on ia64, though I don't have a machine to test it on. (Hugh:) prepare_hugepage_range() should check file offset alignment when it checks virtual address and length, to stop MAP_FIXED with a bad huge offset from unmapping before it fails further down. PowerPC should apply the same prepare_hugepage_range alignment checks as ia64 and all the others do. Then none of the alignment checks in hugetlbfs_file_mmap are required (nor is the check for too small a mapping); but even so, move up setting of VM_HUGETLB and add a comment to warn of what David Gibson discovered - if hugetlbfs_file_mmap fails before setting it, do_mmap_pgoff's unmap_region when unwinding from error will go the non-huge way, which may cause bad behaviour on architectures (powerpc and ia64) which segregate their huge mappings into a separate region of the address space. Signed-off-by: Hugh Dickins Cc: "Luck, Tony" Cc: "David S. Miller" Acked-by: Adam Litke Acked-by: David Gibson Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 497e502dfd6b..bdace87d7c01 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1379,7 +1379,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, * Check if the given range is hugepage aligned, and * can be made suitable for hugepages. 
*/ - ret = prepare_hugepage_range(addr, len); + ret = prepare_hugepage_range(addr, len, pgoff); } else { /* * Ensure that a normal request is not falling in a -- cgit v1.2.2 From cb07c9a1864a8eac9f3123e428100d5b2a16e65a Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 14 Nov 2006 02:03:38 -0800 Subject: [PATCH] hugetlb: check for brk() entering a hugepage region Unlike mmap(), the codepath for brk() creates a vma without first checking that it doesn't touch a region exclusively reserved for hugepages. On powerpc, this can allow it to create a normal page vma in a hugepage region, causing oopses and other badness. Add a test to prevent this. With this patch, brk() will simply fail if it attempts to move the break into a hugepage reserved region. Signed-off-by: David Gibson Cc: Adam Litke Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index bdace87d7c01..2526463c99a7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1880,6 +1880,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if ((addr + len) > TASK_SIZE || (addr + len) < addr) return -EINVAL; + error = is_hugepage_only_range(current->mm, addr, len); + if (error) + return error; + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = arch_mmap_check(addr, len, flags); -- cgit v1.2.2 From cd2579d7aa7bfc966cc271a88e77f8cfc3b0b7ba Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 14 Nov 2006 13:43:38 +0000 Subject: [PATCH] hugetlb: fix error return for brk() entering a hugepage region Commit cb07c9a1864a8eac9f3123e428100d5b2a16e65a returns the wrong value. is_hugepage_only_range() is a boolean, so we should return -EINVAL rather than 1. Also - we can use "mm" instead of looking up "current->mm" again. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 2526463c99a7..7b40abd7cba2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1880,9 +1880,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if ((addr + len) > TASK_SIZE || (addr + len) < addr) return -EINVAL; - error = is_hugepage_only_range(current->mm, addr, len); - if (error) - return error; + if (is_hugepage_only_range(mm, addr, len)) + return -EINVAL; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; -- cgit v1.2.2 From 31be8309532a6743f301cb2e83bd12ca07988b09 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 16 Nov 2006 01:19:29 -0800 Subject: [PATCH] Fix strange size check in __get_vm_area_node() Recently, __get_vm_area_node() was changed as follows: if (unlikely(!area)) return NULL; - if (unlikely(!size)) { - kfree (area); + if (unlikely(!size)) return NULL; - } This leaks `area', and the original code already looked strange. The patch below is probably what was intended. 
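To make the ordering point concrete before the actual diff below, here is a minimal sketch (hypothetical names, not the mm/vmalloc.c code) of why validating the size before allocating the descriptor avoids both the leak and the extra kfree():

/*
 * Sketch only: check the arguments before allocating, so the error path
 * never has anything to free.
 */
struct demo_area {
	unsigned long size;
};

static struct demo_area *demo_get_area(unsigned long size, gfp_t gfp_mask)
{
	struct demo_area *area;

	size = PAGE_ALIGN(size);
	if (unlikely(!size))	/* reject before kmalloc(): nothing to leak */
		return NULL;

	area = kmalloc(sizeof(*area), gfp_mask);
	if (unlikely(!area))
		return NULL;

	area->size = size;
	return area;
}
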
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7dc6aa745166..86897ee792d6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -181,14 +181,13 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl } addr = ALIGN(start, align); size = PAGE_ALIGN(size); + if (unlikely(!size)) + return NULL; area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); if (unlikely(!area)) return NULL; - if (unlikely(!size)) - return NULL; - /* * We always allocate a guard page. */ -- cgit v1.2.2