diff options
author | Nick Piggin <piggin@cyberone.com.au> | 2006-04-09 21:21:48 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-04-10 13:16:37 -0400 |
commit | 676165a8af7167f488abdcce6851a9bc36e83254 (patch) | |
tree | a9b2b8dc155b48ce073b5ada31f2ac0694118e69 | |
parent | c3a9d6541f84ac3ff566982d08389b87c1c36b4e (diff) |
[PATCH] Fix buddy list race that could lead to page lru list corruptions
Rohit found an obscure bug causing buddy list corruption.
page_is_buddy() uses a non-atomic test (PagePrivate && page_count == 0)
to determine whether a free page's buddy is itself free and on the
buddy lists.
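Each of the conjuncts may be true at different times due to unrelated
conditions, so the non-atomic page_is_buddy test may find each conjunct to
be true even if they were not both true at the same time (i.e. the page was
not on the buddy lists).
The fix records buddy-list membership in a dedicated page flag, PG_buddy,
whose setting, clearing and testing are all serialized by zone->lock, so
page_is_buddy only needs to test PG_buddy and compare the page order.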
Signed-off-by: Martin Bligh <mbligh@google.com>
Signed-off-by: Rohit Seth <rohitseth@google.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | include/linux/mm.h | 5 | ||||
-rw-r--r-- | include/linux/page-flags.h | 8 | ||||
-rw-r--r-- | mm/page_alloc.c | 31 |
3 files changed, 27 insertions, 17 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h index 6aa016f1d3ae..1154684209a4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -229,10 +229,9 @@ struct page { | |||
229 | unsigned long private; /* Mapping-private opaque data: | 229 | unsigned long private; /* Mapping-private opaque data: |
230 | * usually used for buffer_heads | 230 | * usually used for buffer_heads |
231 | * if PagePrivate set; used for | 231 | * if PagePrivate set; used for |
232 | * swp_entry_t if PageSwapCache. | 232 | * swp_entry_t if PageSwapCache; |
233 | * When page is free, this | ||
234 | * indicates order in the buddy | 233 | * indicates order in the buddy |
235 | * system. | 234 | * system if PG_buddy is set. |
236 | */ | 235 | */ |
237 | struct address_space *mapping; /* If low bit clear, points to | 236 | struct address_space *mapping; /* If low bit clear, points to |
238 | * inode address_space, or NULL. | 237 | * inode address_space, or NULL. |
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9ea629c02a4b..547aac7696cd 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
@@ -74,7 +74,9 @@ | |||
74 | #define PG_mappedtodisk 16 /* Has blocks allocated on-disk */ | 74 | #define PG_mappedtodisk 16 /* Has blocks allocated on-disk */ |
75 | #define PG_reclaim 17 /* To be reclaimed asap */ | 75 | #define PG_reclaim 17 /* To be reclaimed asap */ |
76 | #define PG_nosave_free 18 /* Free, should not be written */ | 76 | #define PG_nosave_free 18 /* Free, should not be written */ |
77 | #define PG_uncached 19 /* Page has been mapped as uncached */ | 77 | #define PG_buddy 19 /* Page is free, on buddy lists */ |
78 | |||
79 | #define PG_uncached 20 /* Page has been mapped as uncached */ | ||
78 | 80 | ||
79 | /* | 81 | /* |
80 | * Global page accounting. One instance per CPU. Only unsigned longs are | 82 | * Global page accounting. One instance per CPU. Only unsigned longs are |
@@ -317,6 +319,10 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); | |||
317 | #define SetPageNosaveFree(page) set_bit(PG_nosave_free, &(page)->flags) | 319 | #define SetPageNosaveFree(page) set_bit(PG_nosave_free, &(page)->flags) |
318 | #define ClearPageNosaveFree(page) clear_bit(PG_nosave_free, &(page)->flags) | 320 | #define ClearPageNosaveFree(page) clear_bit(PG_nosave_free, &(page)->flags) |
319 | 321 | ||
322 | #define PageBuddy(page) test_bit(PG_buddy, &(page)->flags) | ||
323 | #define __SetPageBuddy(page) __set_bit(PG_buddy, &(page)->flags) | ||
324 | #define __ClearPageBuddy(page) __clear_bit(PG_buddy, &(page)->flags) | ||
325 | |||
320 | #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) | 326 | #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) |
321 | #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) | 327 | #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) |
322 | #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) | 328 | #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc523a1f270d..b8165e037dee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -151,7 +151,8 @@ static void bad_page(struct page *page) | |||
151 | 1 << PG_reclaim | | 151 | 1 << PG_reclaim | |
152 | 1 << PG_slab | | 152 | 1 << PG_slab | |
153 | 1 << PG_swapcache | | 153 | 1 << PG_swapcache | |
154 | 1 << PG_writeback ); | 154 | 1 << PG_writeback | |
155 | 1 << PG_buddy ); | ||
155 | set_page_count(page, 0); | 156 | set_page_count(page, 0); |
156 | reset_page_mapcount(page); | 157 | reset_page_mapcount(page); |
157 | page->mapping = NULL; | 158 | page->mapping = NULL; |
@@ -236,12 +237,12 @@ static inline unsigned long page_order(struct page *page) { | |||
236 | 237 | ||
237 | static inline void set_page_order(struct page *page, int order) { | 238 | static inline void set_page_order(struct page *page, int order) { |
238 | set_page_private(page, order); | 239 | set_page_private(page, order); |
239 | __SetPagePrivate(page); | 240 | __SetPageBuddy(page); |
240 | } | 241 | } |
241 | 242 | ||
242 | static inline void rmv_page_order(struct page *page) | 243 | static inline void rmv_page_order(struct page *page) |
243 | { | 244 | { |
244 | __ClearPagePrivate(page); | 245 | __ClearPageBuddy(page); |
245 | set_page_private(page, 0); | 246 | set_page_private(page, 0); |
246 | } | 247 | } |
247 | 248 | ||
@@ -280,11 +281,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
280 | * This function checks whether a page is free && is the buddy | 281 | * This function checks whether a page is free && is the buddy |
281 | * we can do coalesce a page and its buddy if | 282 | * we can do coalesce a page and its buddy if |
282 | * (a) the buddy is not in a hole && | 283 | * (a) the buddy is not in a hole && |
283 | * (b) the buddy is free && | 284 | * (b) the buddy is in the buddy system && |
284 | * (c) the buddy is on the buddy system && | 285 | * (c) a page and its buddy have the same order. |
285 | * (d) a page and its buddy have the same order. | 286 | * |
286 | * for recording page's order, we use page_private(page) and PG_private. | 287 | * For recording whether a page is in the buddy system, we use PG_buddy. |
288 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. | ||
287 | * | 289 | * |
290 | * For recording page's order, we use page_private(page). | ||
288 | */ | 291 | */ |
289 | static inline int page_is_buddy(struct page *page, int order) | 292 | static inline int page_is_buddy(struct page *page, int order) |
290 | { | 293 | { |
@@ -293,10 +296,10 @@ static inline int page_is_buddy(struct page *page, int order) | |||
293 | return 0; | 296 | return 0; |
294 | #endif | 297 | #endif |
295 | 298 | ||
296 | if (PagePrivate(page) && | 299 | if (PageBuddy(page) && page_order(page) == order) { |
297 | (page_order(page) == order) && | 300 | BUG_ON(page_count(page) != 0); |
298 | page_count(page) == 0) | ||
299 | return 1; | 301 | return 1; |
302 | } | ||
300 | return 0; | 303 | return 0; |
301 | } | 304 | } |
302 | 305 | ||
@@ -313,7 +316,7 @@ static inline int page_is_buddy(struct page *page, int order) | |||
313 | * as necessary, plus some accounting needed to play nicely with other | 316 | * as necessary, plus some accounting needed to play nicely with other |
314 | * parts of the VM system. | 317 | * parts of the VM system. |
315 | * At each level, we keep a list of pages, which are heads of continuous | 318 | * At each level, we keep a list of pages, which are heads of continuous |
316 | * free pages of length of (1 << order) and marked with PG_Private.Page's | 319 | * free pages of length of (1 << order) and marked with PG_buddy. Page's |
317 | * order is recorded in page_private(page) field. | 320 | * order is recorded in page_private(page) field. |
318 | * So when we are allocating or freeing one, we can derive the state of the | 321 | * So when we are allocating or freeing one, we can derive the state of the |
319 | * other. That is, if we allocate a small block, and both were | 322 | * other. That is, if we allocate a small block, and both were |
@@ -376,7 +379,8 @@ static inline int free_pages_check(struct page *page) | |||
376 | 1 << PG_slab | | 379 | 1 << PG_slab | |
377 | 1 << PG_swapcache | | 380 | 1 << PG_swapcache | |
378 | 1 << PG_writeback | | 381 | 1 << PG_writeback | |
379 | 1 << PG_reserved )))) | 382 | 1 << PG_reserved | |
383 | 1 << PG_buddy )))) | ||
380 | bad_page(page); | 384 | bad_page(page); |
381 | if (PageDirty(page)) | 385 | if (PageDirty(page)) |
382 | __ClearPageDirty(page); | 386 | __ClearPageDirty(page); |
@@ -524,7 +528,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
524 | 1 << PG_slab | | 528 | 1 << PG_slab | |
525 | 1 << PG_swapcache | | 529 | 1 << PG_swapcache | |
526 | 1 << PG_writeback | | 530 | 1 << PG_writeback | |
527 | 1 << PG_reserved )))) | 531 | 1 << PG_reserved | |
532 | 1 << PG_buddy )))) | ||
528 | bad_page(page); | 533 | bad_page(page); |
529 | 534 | ||
530 | /* | 535 | /* |