From 676165a8af7167f488abdcce6851a9bc36e83254 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 10 Apr 2006 11:21:48 +1000 Subject: [PATCH] Fix buddy list race that could lead to page lru list corruptions Rohit found an obscure bug causing buddy list corruption. page_is_buddy is using a non-atomic test (PagePrivate && page_count == 0) to determine whether or not a free page's buddy is itself free and in the buddy lists. Each of the conjuncts may be true at different times due to unrelated conditions, so the non-atomic page_is_buddy test may find each conjunct to be true even if they were not both true at the same time (ie. the page was not on the buddy lists). Signed-off-by: Martin Bligh Signed-off-by: Rohit Seth Signed-off-by: Nick Piggin Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc523a1f270d..b8165e037dee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -151,7 +151,8 @@ static void bad_page(struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ); + 1 << PG_writeback | + 1 << PG_buddy ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -236,12 +237,12 @@ static inline unsigned long page_order(struct page *page) { static inline void set_page_order(struct page *page, int order) { set_page_private(page, order); - __SetPagePrivate(page); + __SetPageBuddy(page); } static inline void rmv_page_order(struct page *page) { - __ClearPagePrivate(page); + __ClearPageBuddy(page); set_page_private(page, 0); } @@ -280,11 +281,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if * (a) the buddy is not in a hole && - * (b) the buddy is free && - * (c) the buddy is on the buddy 
system && - * (d) a page and its buddy have the same order. - * for recording page's order, we use page_private(page) and PG_private. + * (b) the buddy is in the buddy system && + * (c) a page and its buddy have the same order. + * + * For recording whether a page is in the buddy system, we use PG_buddy. + * Setting, clearing, and testing PG_buddy is serialized by zone->lock. * + * For recording page's order, we use page_private(page). */ static inline int page_is_buddy(struct page *page, int order) { @@ -293,10 +296,10 @@ static inline int page_is_buddy(struct page *page, int order) return 0; #endif - if (PagePrivate(page) && - (page_order(page) == order) && - page_count(page) == 0) + if (PageBuddy(page) && page_order(page) == order) { + BUG_ON(page_count(page) != 0); return 1; + } return 0; } @@ -313,7 +316,7 @@ static inline int page_is_buddy(struct page *page, int order) * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_Private.Page's + * free pages of length of (1 << order) and marked with PG_buddy. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. 
That is, if we allocate a small block, and both were @@ -376,7 +379,8 @@ static inline int free_pages_check(struct page *page) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved )))) + 1 << PG_reserved | + 1 << PG_buddy )))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); @@ -524,7 +528,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved )))) + 1 << PG_reserved | + 1 << PG_buddy )))) bad_page(page); /* -- cgit v1.2.2 From cb45b0e966cbe747b6189c15b108901cc7d6c97c Mon Sep 17 00:00:00 2001 From: Hideo AOKI Date: Mon, 10 Apr 2006 22:52:59 -0700 Subject: [PATCH] overcommit: add calculate_totalreserve_pages() These patches are an enhancement of OVERCOMMIT_GUESS algorithm in __vm_enough_memory(). - why the kernel needed patching When the kernel can't allocate anonymous pages in practice, current OVERCOMMIT_GUESS could return success. This implementation might be the cause of oom kill in memory pressure situation. If the Linux runs with page reservation features like /proc/sys/vm/lowmem_reserve_ratio and without swap region, I think the oom kill occurs easily. - the overall design approach in the patch When the OVERCOMMIT_GUESS algorithm calculates number of free pages, the reserved free pages are regarded as non-free pages. This change helps to avoid the pitfall that the number of free pages becomes less than the number which the kernel tries to keep free. - testing results I tested the patches using my test kernel module. If the patches aren't applied to the kernel, __vm_enough_memory() returns success in the situation but actual page allocation fails. On the other hand, if the patches are applied to the kernel, memory allocation failure is avoided since __vm_enough_memory() returns failure in the situation. I checked that on i386 SMP 16GB memory machine. I haven't tested on nommu environment currently. 
This patch adds totalreserve_pages for __vm_enough_memory(). Calculate_totalreserve_pages() checks maximum lowmem_reserve pages and pages_high in each zone. Finally, the function stores the sum of each zone to totalreserve_pages. The totalreserve_pages is calculated when the VM is initialized. And the variable is updated when /proc/sys/vm/lowmem_reserve_ratio or /proc/sys/vm/min_free_kbytes are changed. Signed-off-by: Hideo Aoki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b8165e037dee..97d6827c7d66 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -51,6 +51,7 @@ nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; +unsigned long totalreserve_pages __read_mostly; long nr_swap_pages; int percpu_pagelist_fraction; @@ -2476,6 +2477,38 @@ void __init page_alloc_init(void) hotcpu_notifier(page_alloc_cpu_notify, 0); } +/* + * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio + * or min_free_kbytes changes. + */ +static void calculate_totalreserve_pages(void) +{ + struct pglist_data *pgdat; + unsigned long reserve_pages = 0; + int i, j; + + for_each_online_pgdat(pgdat) { + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + unsigned long max = 0; + + /* Find valid and maximum lowmem_reserve in the zone */ + for (j = i; j < MAX_NR_ZONES; j++) { + if (zone->lowmem_reserve[j] > max) + max = zone->lowmem_reserve[j]; + } + + /* we treat pages_high as reserved pages. 
*/ + max += zone->pages_high; + + if (max > zone->present_pages) + max = zone->present_pages; + reserve_pages += max; + } + } + totalreserve_pages = reserve_pages; +} + /* * setup_per_zone_lowmem_reserve - called whenever * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone @@ -2507,6 +2540,9 @@ static void setup_per_zone_lowmem_reserve(void) } } } + + /* update totalreserve_pages */ + calculate_totalreserve_pages(); } /* @@ -2561,6 +2597,9 @@ void setup_per_zone_pages_min(void) zone->pages_high = zone->pages_min + tmp / 2; spin_unlock_irqrestore(&zone->lru_lock, flags); } + + /* update totalreserve_pages */ + calculate_totalreserve_pages(); } /* -- cgit v1.2.2 From 6aa3001b239b387d98a7f945e4a51edeb59e4f2d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 18 Apr 2006 22:20:52 -0700 Subject: [PATCH] page_alloc.c: buddy handling cleanup Fix up some whitespace damage. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97d6827c7d66..123c60586740 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -232,11 +232,13 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) * zone->lock is already acquired when we use these. * So, we don't need atomic page->flags operations here. */ -static inline unsigned long page_order(struct page *page) { +static inline unsigned long page_order(struct page *page) +{ return page_private(page); } -static inline void set_page_order(struct page *page, int order) { +static inline void set_page_order(struct page *page, int order) +{ set_page_private(page, order); __SetPageBuddy(page); } @@ -299,9 +301,9 @@ static inline int page_is_buddy(struct page *page, int order) if (PageBuddy(page) && page_order(page) == order) { BUG_ON(page_count(page) != 0); - return 1; + return 1; } - return 0; + return 0; } /* -- cgit v1.2.2