author		Dave Kleikamp <shaggy@austin.ibm.com>	2006-01-24 15:34:47 -0500
committer	Dave Kleikamp <shaggy@austin.ibm.com>	2006-01-24 15:34:47 -0500
commit		0a0fc0ddbe732779366ab6b1b879f62195e65967 (patch)
tree		7b42490a676cf39ae0691b6859ecf7fd410f229b /mm/page_alloc.c
parent		4d5dbd0945d9e0833dd7964a3d6ee33157f7cc7a (diff)
parent		3ee68c4af3fd7228c1be63254b9f884614f9ebb2 (diff)
Merge with /home/shaggy/git/linus-clean/
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	788
1 file changed, 450 insertions(+), 338 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 987225bdd661..df54e2fc8ee0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/memory_hotplug.h> | 36 | #include <linux/memory_hotplug.h> |
37 | #include <linux/nodemask.h> | 37 | #include <linux/nodemask.h> |
38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
39 | #include <linux/mempolicy.h> | ||
39 | 40 | ||
40 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
41 | #include "internal.h" | 42 | #include "internal.h" |
@@ -52,6 +53,9 @@ struct pglist_data *pgdat_list __read_mostly; | |||
52 | unsigned long totalram_pages __read_mostly; | 53 | unsigned long totalram_pages __read_mostly; |
53 | unsigned long totalhigh_pages __read_mostly; | 54 | unsigned long totalhigh_pages __read_mostly; |
54 | long nr_swap_pages; | 55 | long nr_swap_pages; |
56 | int percpu_pagelist_fraction; | ||
57 | |||
58 | static void fastcall free_hot_cold_page(struct page *page, int cold); | ||
55 | 59 | ||
56 | /* | 60 | /* |
57 | * results with 256, 32 in the lowmem_reserve sysctl: | 61 | * results with 256, 32 in the lowmem_reserve sysctl: |
@@ -60,8 +64,11 @@ long nr_swap_pages; | |||
60 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA | 64 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
61 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL | 65 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
62 | * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA | 66 | * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA |
67 | * | ||
68 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | ||
69 | * don't need any ZONE_NORMAL reservation | ||
63 | */ | 70 | */ |
64 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; | 71 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; |
65 | 72 | ||
66 | EXPORT_SYMBOL(totalram_pages); | 73 | EXPORT_SYMBOL(totalram_pages); |
67 | 74 | ||
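The ratio array above grows a slot for the new ZONE_DMA32: each lower zone withholds (pages of higher zones)/ratio pages from allocations that could have been satisfied by a higher zone. A rough sketch of that arithmetic, illustrative only and not part of the patch (the real work is done by setup_per_zone_lowmem_reserve() elsewhere in this file):

	/*
	 * Illustrative sketch: roughly the reserve a lower zone keeps
	 * against allocations that could have gone to a higher zone.
	 * With the comment's numbers, 784M of ZONE_NORMAL and ratio 256
	 * leave about 3M of ZONE_DMA held back.
	 */
	static unsigned long example_lowmem_reserve(unsigned long higher_zone_pages,
						    int ratio)
	{
		return higher_zone_pages / ratio;
	}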
@@ -72,12 +79,13 @@ EXPORT_SYMBOL(totalram_pages); | |||
72 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; | 79 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; |
73 | EXPORT_SYMBOL(zone_table); | 80 | EXPORT_SYMBOL(zone_table); |
74 | 81 | ||
75 | static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; | 82 | static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; |
76 | int min_free_kbytes = 1024; | 83 | int min_free_kbytes = 1024; |
77 | 84 | ||
78 | unsigned long __initdata nr_kernel_pages; | 85 | unsigned long __initdata nr_kernel_pages; |
79 | unsigned long __initdata nr_all_pages; | 86 | unsigned long __initdata nr_all_pages; |
80 | 87 | ||
88 | #ifdef CONFIG_DEBUG_VM | ||
81 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 89 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
82 | { | 90 | { |
83 | int ret = 0; | 91 | int ret = 0; |
@@ -119,16 +127,23 @@ static int bad_range(struct zone *zone, struct page *page) | |||
119 | return 0; | 127 | return 0; |
120 | } | 128 | } |
121 | 129 | ||
122 | static void bad_page(const char *function, struct page *page) | 130 | #else |
131 | static inline int bad_range(struct zone *zone, struct page *page) | ||
132 | { | ||
133 | return 0; | ||
134 | } | ||
135 | #endif | ||
136 | |||
137 | static void bad_page(struct page *page) | ||
123 | { | 138 | { |
124 | printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", | 139 | printk(KERN_EMERG "Bad page state in process '%s'\n" |
125 | function, current->comm, page); | 140 | KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" |
126 | printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 141 | KERN_EMERG "Trying to fix it up, but a reboot is needed\n" |
127 | (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, | 142 | KERN_EMERG "Backtrace:\n", |
128 | page->mapping, page_mapcount(page), page_count(page)); | 143 | current->comm, page, (int)(2*sizeof(unsigned long)), |
129 | printk(KERN_EMERG "Backtrace:\n"); | 144 | (unsigned long)page->flags, page->mapping, |
145 | page_mapcount(page), page_count(page)); | ||
130 | dump_stack(); | 146 | dump_stack(); |
131 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); | ||
132 | page->flags &= ~(1 << PG_lru | | 147 | page->flags &= ~(1 << PG_lru | |
133 | 1 << PG_private | | 148 | 1 << PG_private | |
134 | 1 << PG_locked | | 149 | 1 << PG_locked | |
@@ -137,18 +152,13 @@ static void bad_page(const char *function, struct page *page) | |||
137 | 1 << PG_reclaim | | 152 | 1 << PG_reclaim | |
138 | 1 << PG_slab | | 153 | 1 << PG_slab | |
139 | 1 << PG_swapcache | | 154 | 1 << PG_swapcache | |
140 | 1 << PG_writeback | | 155 | 1 << PG_writeback ); |
141 | 1 << PG_reserved ); | ||
142 | set_page_count(page, 0); | 156 | set_page_count(page, 0); |
143 | reset_page_mapcount(page); | 157 | reset_page_mapcount(page); |
144 | page->mapping = NULL; | 158 | page->mapping = NULL; |
145 | add_taint(TAINT_BAD_PAGE); | 159 | add_taint(TAINT_BAD_PAGE); |
146 | } | 160 | } |
147 | 161 | ||
148 | #ifndef CONFIG_HUGETLB_PAGE | ||
149 | #define prep_compound_page(page, order) do { } while (0) | ||
150 | #define destroy_compound_page(page, order) do { } while (0) | ||
151 | #else | ||
152 | /* | 162 | /* |
153 | * Higher-order pages are called "compound pages". They are structured thusly: | 163 | * Higher-order pages are called "compound pages". They are structured thusly: |
154 | * | 164 | * |
@@ -186,23 +196,18 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
186 | int i; | 196 | int i; |
187 | int nr_pages = 1 << order; | 197 | int nr_pages = 1 << order; |
188 | 198 | ||
189 | if (!PageCompound(page)) | 199 | if (unlikely(page[1].index != order)) |
190 | return; | 200 | bad_page(page); |
191 | |||
192 | if (page[1].index != order) | ||
193 | bad_page(__FUNCTION__, page); | ||
194 | 201 | ||
195 | for (i = 0; i < nr_pages; i++) { | 202 | for (i = 0; i < nr_pages; i++) { |
196 | struct page *p = page + i; | 203 | struct page *p = page + i; |
197 | 204 | ||
198 | if (!PageCompound(p)) | 205 | if (unlikely(!PageCompound(p) | |
199 | bad_page(__FUNCTION__, page); | 206 | (page_private(p) != (unsigned long)page))) |
200 | if (page_private(p) != (unsigned long)page) | 207 | bad_page(page); |
201 | bad_page(__FUNCTION__, page); | ||
202 | ClearPageCompound(p); | 208 | ClearPageCompound(p); |
203 | } | 209 | } |
204 | } | 210 | } |
205 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
206 | 211 | ||
207 | /* | 212 | /* |
208 | * function for dealing with page's order in buddy system. | 213 | * function for dealing with page's order in buddy system. |
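destroy_compound_page() now reports corruption through bad_page() when the recorded order or the back-pointer to the head page is wrong. A sketch of the invariants it checks, mirroring what prep_compound_page() sets up in this same file (a sketch of the 2.6-era layout, not a quotation of the patch):

	/*
	 * Sketch of the compound-page layout the checks above rely on:
	 * the first tail page records the order in ->index, and every
	 * page in the block carries PG_compound with page_private()
	 * pointing back at the head page.
	 */
	static void example_mark_compound(struct page *page, unsigned long order)
	{
		int i;

		page[1].index = order;
		for (i = 0; i < (1 << order); i++) {
			SetPageCompound(page + i);
			set_page_private(page + i, (unsigned long)page);
		}
	}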
@@ -258,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
258 | /* | 263 | /* |
259 | * This function checks whether a page is free && is the buddy | 264 | * This function checks whether a page is free && is the buddy |
260 | * we can do coalesce a page and its buddy if | 265 | * we can do coalesce a page and its buddy if |
261 | * (a) the buddy is free && | 266 | * (a) the buddy is not in a hole && |
262 | * (b) the buddy is on the buddy system && | 267 | * (b) the buddy is free && |
263 | * (c) a page and its buddy have the same order. | 268 | * (c) the buddy is on the buddy system && |
269 | * (d) a page and its buddy have the same order. | ||
264 | * for recording page's order, we use page_private(page) and PG_private. | 270 | * for recording page's order, we use page_private(page) and PG_private. |
265 | * | 271 | * |
266 | */ | 272 | */ |
267 | static inline int page_is_buddy(struct page *page, int order) | 273 | static inline int page_is_buddy(struct page *page, int order) |
268 | { | 274 | { |
275 | #ifdef CONFIG_HOLES_IN_ZONE | ||
276 | if (!pfn_valid(page_to_pfn(page))) | ||
277 | return 0; | ||
278 | #endif | ||
279 | |||
269 | if (PagePrivate(page) && | 280 | if (PagePrivate(page) && |
270 | (page_order(page) == order) && | 281 | (page_order(page) == order) && |
271 | page_count(page) == 0) | 282 | page_count(page) == 0) |
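page_is_buddy() only inspects the candidate page; the candidate itself is located elsewhere in this file by __page_find_buddy(), which flips bit 'order' of the page's index within its MAX_ORDER-aligned block. A small worked sketch of that index arithmetic (illustrative):

	/*
	 * Buddy index arithmetic: the buddy of a 2^order block is the
	 * block whose index differs only in bit 'order'.  For example,
	 * index 8 at order 3 has buddy index 0; if they merge, the
	 * combined order-4 block starts at index 0 (8 & ~(1 << 3)).
	 */
	static unsigned long example_buddy_index(unsigned long page_idx,
						 unsigned int order)
	{
		return page_idx ^ (1UL << order);
	}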
@@ -297,13 +308,13 @@ static inline int page_is_buddy(struct page *page, int order) | |||
297 | * -- wli | 308 | * -- wli |
298 | */ | 309 | */ |
299 | 310 | ||
300 | static inline void __free_pages_bulk (struct page *page, | 311 | static inline void __free_one_page(struct page *page, |
301 | struct zone *zone, unsigned int order) | 312 | struct zone *zone, unsigned int order) |
302 | { | 313 | { |
303 | unsigned long page_idx; | 314 | unsigned long page_idx; |
304 | int order_size = 1 << order; | 315 | int order_size = 1 << order; |
305 | 316 | ||
306 | if (unlikely(order)) | 317 | if (unlikely(PageCompound(page))) |
307 | destroy_compound_page(page, order); | 318 | destroy_compound_page(page, order); |
308 | 319 | ||
309 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 320 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
@@ -317,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page, | |||
317 | struct free_area *area; | 328 | struct free_area *area; |
318 | struct page *buddy; | 329 | struct page *buddy; |
319 | 330 | ||
320 | combined_idx = __find_combined_index(page_idx, order); | ||
321 | buddy = __page_find_buddy(page, page_idx, order); | 331 | buddy = __page_find_buddy(page, page_idx, order); |
322 | |||
323 | if (bad_range(zone, buddy)) | ||
324 | break; | ||
325 | if (!page_is_buddy(buddy, order)) | 332 | if (!page_is_buddy(buddy, order)) |
326 | break; /* Move the buddy up one level. */ | 333 | break; /* Move the buddy up one level. */ |
334 | |||
327 | list_del(&buddy->lru); | 335 | list_del(&buddy->lru); |
328 | area = zone->free_area + order; | 336 | area = zone->free_area + order; |
329 | area->nr_free--; | 337 | area->nr_free--; |
330 | rmv_page_order(buddy); | 338 | rmv_page_order(buddy); |
339 | combined_idx = __find_combined_index(page_idx, order); | ||
331 | page = page + (combined_idx - page_idx); | 340 | page = page + (combined_idx - page_idx); |
332 | page_idx = combined_idx; | 341 | page_idx = combined_idx; |
333 | order++; | 342 | order++; |
@@ -337,11 +346,11 @@ static inline void __free_pages_bulk (struct page *page, | |||
337 | zone->free_area[order].nr_free++; | 346 | zone->free_area[order].nr_free++; |
338 | } | 347 | } |
339 | 348 | ||
340 | static inline void free_pages_check(const char *function, struct page *page) | 349 | static inline int free_pages_check(struct page *page) |
341 | { | 350 | { |
342 | if ( page_mapcount(page) || | 351 | if (unlikely(page_mapcount(page) | |
343 | page->mapping != NULL || | 352 | (page->mapping != NULL) | |
344 | page_count(page) != 0 || | 353 | (page_count(page) != 0) | |
345 | (page->flags & ( | 354 | (page->flags & ( |
346 | 1 << PG_lru | | 355 | 1 << PG_lru | |
347 | 1 << PG_private | | 356 | 1 << PG_private | |
@@ -351,10 +360,16 @@ static inline void free_pages_check(const char *function, struct page *page) | |||
351 | 1 << PG_slab | | 360 | 1 << PG_slab | |
352 | 1 << PG_swapcache | | 361 | 1 << PG_swapcache | |
353 | 1 << PG_writeback | | 362 | 1 << PG_writeback | |
354 | 1 << PG_reserved ))) | 363 | 1 << PG_reserved )))) |
355 | bad_page(function, page); | 364 | bad_page(page); |
356 | if (PageDirty(page)) | 365 | if (PageDirty(page)) |
357 | __ClearPageDirty(page); | 366 | __ClearPageDirty(page); |
367 | /* | ||
368 | * For now, we report if PG_reserved was found set, but do not | ||
369 | * clear it, and do not free the page. But we shall soon need | ||
370 | * to do more, for when the ZERO_PAGE count wraps negative. | ||
371 | */ | ||
372 | return PageReserved(page); | ||
358 | } | 373 | } |
359 | 374 | ||
360 | /* | 375 | /* |
@@ -368,48 +383,90 @@ static inline void free_pages_check(const char *function, struct page *page) | |||
368 | * And clear the zone's pages_scanned counter, to hold off the "all pages are | 383 | * And clear the zone's pages_scanned counter, to hold off the "all pages are |
369 | * pinned" detection logic. | 384 | * pinned" detection logic. |
370 | */ | 385 | */ |
371 | static int | 386 | static void free_pages_bulk(struct zone *zone, int count, |
372 | free_pages_bulk(struct zone *zone, int count, | 387 | struct list_head *list, int order) |
373 | struct list_head *list, unsigned int order) | ||
374 | { | 388 | { |
375 | unsigned long flags; | 389 | spin_lock(&zone->lock); |
376 | struct page *page = NULL; | ||
377 | int ret = 0; | ||
378 | |||
379 | spin_lock_irqsave(&zone->lock, flags); | ||
380 | zone->all_unreclaimable = 0; | 390 | zone->all_unreclaimable = 0; |
381 | zone->pages_scanned = 0; | 391 | zone->pages_scanned = 0; |
382 | while (!list_empty(list) && count--) { | 392 | while (count--) { |
393 | struct page *page; | ||
394 | |||
395 | BUG_ON(list_empty(list)); | ||
383 | page = list_entry(list->prev, struct page, lru); | 396 | page = list_entry(list->prev, struct page, lru); |
384 | /* have to delete it as __free_pages_bulk list manipulates */ | 397 | /* have to delete it as __free_one_page list manipulates */ |
385 | list_del(&page->lru); | 398 | list_del(&page->lru); |
386 | __free_pages_bulk(page, zone, order); | 399 | __free_one_page(page, zone, order); |
387 | ret++; | ||
388 | } | 400 | } |
389 | spin_unlock_irqrestore(&zone->lock, flags); | 401 | spin_unlock(&zone->lock); |
390 | return ret; | ||
391 | } | 402 | } |
392 | 403 | ||
393 | void __free_pages_ok(struct page *page, unsigned int order) | 404 | static void free_one_page(struct zone *zone, struct page *page, int order) |
394 | { | 405 | { |
395 | LIST_HEAD(list); | 406 | LIST_HEAD(list); |
407 | list_add(&page->lru, &list); | ||
408 | free_pages_bulk(zone, 1, &list, order); | ||
409 | } | ||
410 | |||
411 | static void __free_pages_ok(struct page *page, unsigned int order) | ||
412 | { | ||
413 | unsigned long flags; | ||
396 | int i; | 414 | int i; |
415 | int reserved = 0; | ||
397 | 416 | ||
398 | arch_free_page(page, order); | 417 | arch_free_page(page, order); |
399 | 418 | if (!PageHighMem(page)) | |
400 | mod_page_state(pgfree, 1 << order); | 419 | mutex_debug_check_no_locks_freed(page_address(page), |
420 | PAGE_SIZE<<order); | ||
401 | 421 | ||
402 | #ifndef CONFIG_MMU | 422 | #ifndef CONFIG_MMU |
403 | if (order > 0) | 423 | for (i = 1 ; i < (1 << order) ; ++i) |
404 | for (i = 1 ; i < (1 << order) ; ++i) | 424 | __put_page(page + i); |
405 | __put_page(page + i); | ||
406 | #endif | 425 | #endif |
407 | 426 | ||
408 | for (i = 0 ; i < (1 << order) ; ++i) | 427 | for (i = 0 ; i < (1 << order) ; ++i) |
409 | free_pages_check(__FUNCTION__, page + i); | 428 | reserved += free_pages_check(page + i); |
410 | list_add(&page->lru, &list); | 429 | if (reserved) |
411 | kernel_map_pages(page, 1<<order, 0); | 430 | return; |
412 | free_pages_bulk(page_zone(page), 1, &list, order); | 431 | |
432 | kernel_map_pages(page, 1 << order, 0); | ||
433 | local_irq_save(flags); | ||
434 | __mod_page_state(pgfree, 1 << order); | ||
435 | free_one_page(page_zone(page), page, order); | ||
436 | local_irq_restore(flags); | ||
437 | } | ||
438 | |||
439 | /* | ||
440 | * permit the bootmem allocator to evade page validation on high-order frees | ||
441 | */ | ||
442 | void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | ||
443 | { | ||
444 | if (order == 0) { | ||
445 | __ClearPageReserved(page); | ||
446 | set_page_count(page, 0); | ||
447 | |||
448 | free_hot_cold_page(page, 0); | ||
449 | } else { | ||
450 | LIST_HEAD(list); | ||
451 | int loop; | ||
452 | |||
453 | for (loop = 0; loop < BITS_PER_LONG; loop++) { | ||
454 | struct page *p = &page[loop]; | ||
455 | |||
456 | if (loop + 16 < BITS_PER_LONG) | ||
457 | prefetchw(p + 16); | ||
458 | __ClearPageReserved(p); | ||
459 | set_page_count(p, 0); | ||
460 | } | ||
461 | |||
462 | arch_free_page(page, order); | ||
463 | |||
464 | mod_page_state(pgfree, 1 << order); | ||
465 | |||
466 | list_add(&page->lru, &list); | ||
467 | kernel_map_pages(page, 1 << order, 0); | ||
468 | free_pages_bulk(page_zone(page), 1, &list, order); | ||
469 | } | ||
413 | } | 470 | } |
414 | 471 | ||
415 | 472 | ||
@@ -427,8 +484,7 @@ void __free_pages_ok(struct page *page, unsigned int order) | |||
427 | * | 484 | * |
428 | * -- wli | 485 | * -- wli |
429 | */ | 486 | */ |
430 | static inline struct page * | 487 | static inline void expand(struct zone *zone, struct page *page, |
431 | expand(struct zone *zone, struct page *page, | ||
432 | int low, int high, struct free_area *area) | 488 | int low, int high, struct free_area *area) |
433 | { | 489 | { |
434 | unsigned long size = 1 << high; | 490 | unsigned long size = 1 << high; |
@@ -442,34 +498,16 @@ expand(struct zone *zone, struct page *page, | |||
442 | area->nr_free++; | 498 | area->nr_free++; |
443 | set_page_order(&page[size], high); | 499 | set_page_order(&page[size], high); |
444 | } | 500 | } |
445 | return page; | ||
446 | } | ||
447 | |||
448 | void set_page_refs(struct page *page, int order) | ||
449 | { | ||
450 | #ifdef CONFIG_MMU | ||
451 | set_page_count(page, 1); | ||
452 | #else | ||
453 | int i; | ||
454 | |||
455 | /* | ||
456 | * We need to reference all the pages for this order, otherwise if | ||
457 | * anyone accesses one of the pages with (get/put) it will be freed. | ||
458 | * - eg: access_process_vm() | ||
459 | */ | ||
460 | for (i = 0; i < (1 << order); i++) | ||
461 | set_page_count(page + i, 1); | ||
462 | #endif /* CONFIG_MMU */ | ||
463 | } | 501 | } |
464 | 502 | ||
465 | /* | 503 | /* |
466 | * This page is about to be returned from the page allocator | 504 | * This page is about to be returned from the page allocator |
467 | */ | 505 | */ |
468 | static void prep_new_page(struct page *page, int order) | 506 | static int prep_new_page(struct page *page, int order) |
469 | { | 507 | { |
470 | if ( page_mapcount(page) || | 508 | if (unlikely(page_mapcount(page) | |
471 | page->mapping != NULL || | 509 | (page->mapping != NULL) | |
472 | page_count(page) != 0 || | 510 | (page_count(page) != 0) | |
473 | (page->flags & ( | 511 | (page->flags & ( |
474 | 1 << PG_lru | | 512 | 1 << PG_lru | |
475 | 1 << PG_private | | 513 | 1 << PG_private | |
@@ -480,8 +518,15 @@ static void prep_new_page(struct page *page, int order) | |||
480 | 1 << PG_slab | | 518 | 1 << PG_slab | |
481 | 1 << PG_swapcache | | 519 | 1 << PG_swapcache | |
482 | 1 << PG_writeback | | 520 | 1 << PG_writeback | |
483 | 1 << PG_reserved ))) | 521 | 1 << PG_reserved )))) |
484 | bad_page(__FUNCTION__, page); | 522 | bad_page(page); |
523 | |||
524 | /* | ||
525 | * For now, we report if PG_reserved was found set, but do not | ||
526 | * clear it, and do not allocate the page: as a safety net. | ||
527 | */ | ||
528 | if (PageReserved(page)) | ||
529 | return 1; | ||
485 | 530 | ||
486 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | | 531 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | |
487 | 1 << PG_referenced | 1 << PG_arch_1 | | 532 | 1 << PG_referenced | 1 << PG_arch_1 | |
@@ -489,6 +534,7 @@ static void prep_new_page(struct page *page, int order) | |||
489 | set_page_private(page, 0); | 534 | set_page_private(page, 0); |
490 | set_page_refs(page, order); | 535 | set_page_refs(page, order); |
491 | kernel_map_pages(page, 1 << order, 1); | 536 | kernel_map_pages(page, 1 << order, 1); |
537 | return 0; | ||
492 | } | 538 | } |
493 | 539 | ||
494 | /* | 540 | /* |
@@ -511,7 +557,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) | |||
511 | rmv_page_order(page); | 557 | rmv_page_order(page); |
512 | area->nr_free--; | 558 | area->nr_free--; |
513 | zone->free_pages -= 1UL << order; | 559 | zone->free_pages -= 1UL << order; |
514 | return expand(zone, page, order, current_order, area); | 560 | expand(zone, page, order, current_order, area); |
561 | return page; | ||
515 | } | 562 | } |
516 | 563 | ||
517 | return NULL; | 564 | return NULL; |
@@ -525,21 +572,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) | |||
525 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 572 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
526 | unsigned long count, struct list_head *list) | 573 | unsigned long count, struct list_head *list) |
527 | { | 574 | { |
528 | unsigned long flags; | ||
529 | int i; | 575 | int i; |
530 | int allocated = 0; | ||
531 | struct page *page; | ||
532 | 576 | ||
533 | spin_lock_irqsave(&zone->lock, flags); | 577 | spin_lock(&zone->lock); |
534 | for (i = 0; i < count; ++i) { | 578 | for (i = 0; i < count; ++i) { |
535 | page = __rmqueue(zone, order); | 579 | struct page *page = __rmqueue(zone, order); |
536 | if (page == NULL) | 580 | if (unlikely(page == NULL)) |
537 | break; | 581 | break; |
538 | allocated++; | ||
539 | list_add_tail(&page->lru, list); | 582 | list_add_tail(&page->lru, list); |
540 | } | 583 | } |
541 | spin_unlock_irqrestore(&zone->lock, flags); | 584 | spin_unlock(&zone->lock); |
542 | return allocated; | 585 | return i; |
543 | } | 586 | } |
544 | 587 | ||
545 | #ifdef CONFIG_NUMA | 588 | #ifdef CONFIG_NUMA |
@@ -558,14 +601,13 @@ void drain_remote_pages(void) | |||
558 | if (zone->zone_pgdat->node_id == numa_node_id()) | 601 | if (zone->zone_pgdat->node_id == numa_node_id()) |
559 | continue; | 602 | continue; |
560 | 603 | ||
561 | pset = zone->pageset[smp_processor_id()]; | 604 | pset = zone_pcp(zone, smp_processor_id()); |
562 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 605 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { |
563 | struct per_cpu_pages *pcp; | 606 | struct per_cpu_pages *pcp; |
564 | 607 | ||
565 | pcp = &pset->pcp[i]; | 608 | pcp = &pset->pcp[i]; |
566 | if (pcp->count) | 609 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); |
567 | pcp->count -= free_pages_bulk(zone, pcp->count, | 610 | pcp->count = 0; |
568 | &pcp->list, 0); | ||
569 | } | 611 | } |
570 | } | 612 | } |
571 | local_irq_restore(flags); | 613 | local_irq_restore(flags); |
@@ -575,6 +617,7 @@ void drain_remote_pages(void) | |||
575 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | 617 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) |
576 | static void __drain_pages(unsigned int cpu) | 618 | static void __drain_pages(unsigned int cpu) |
577 | { | 619 | { |
620 | unsigned long flags; | ||
578 | struct zone *zone; | 621 | struct zone *zone; |
579 | int i; | 622 | int i; |
580 | 623 | ||
@@ -586,8 +629,10 @@ static void __drain_pages(unsigned int cpu) | |||
586 | struct per_cpu_pages *pcp; | 629 | struct per_cpu_pages *pcp; |
587 | 630 | ||
588 | pcp = &pset->pcp[i]; | 631 | pcp = &pset->pcp[i]; |
589 | pcp->count -= free_pages_bulk(zone, pcp->count, | 632 | local_irq_save(flags); |
590 | &pcp->list, 0); | 633 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); |
634 | pcp->count = 0; | ||
635 | local_irq_restore(flags); | ||
591 | } | 636 | } |
592 | } | 637 | } |
593 | } | 638 | } |
@@ -633,18 +678,14 @@ void drain_local_pages(void) | |||
633 | } | 678 | } |
634 | #endif /* CONFIG_PM */ | 679 | #endif /* CONFIG_PM */ |
635 | 680 | ||
636 | static void zone_statistics(struct zonelist *zonelist, struct zone *z) | 681 | static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) |
637 | { | 682 | { |
638 | #ifdef CONFIG_NUMA | 683 | #ifdef CONFIG_NUMA |
639 | unsigned long flags; | ||
640 | int cpu; | ||
641 | pg_data_t *pg = z->zone_pgdat; | 684 | pg_data_t *pg = z->zone_pgdat; |
642 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; | 685 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; |
643 | struct per_cpu_pageset *p; | 686 | struct per_cpu_pageset *p; |
644 | 687 | ||
645 | local_irq_save(flags); | 688 | p = zone_pcp(z, cpu); |
646 | cpu = smp_processor_id(); | ||
647 | p = zone_pcp(z,cpu); | ||
648 | if (pg == orig) { | 689 | if (pg == orig) { |
649 | p->numa_hit++; | 690 | p->numa_hit++; |
650 | } else { | 691 | } else { |
@@ -655,14 +696,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) | |||
655 | p->local_node++; | 696 | p->local_node++; |
656 | else | 697 | else |
657 | p->other_node++; | 698 | p->other_node++; |
658 | local_irq_restore(flags); | ||
659 | #endif | 699 | #endif |
660 | } | 700 | } |
661 | 701 | ||
662 | /* | 702 | /* |
663 | * Free a 0-order page | 703 | * Free a 0-order page |
664 | */ | 704 | */ |
665 | static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); | ||
666 | static void fastcall free_hot_cold_page(struct page *page, int cold) | 705 | static void fastcall free_hot_cold_page(struct page *page, int cold) |
667 | { | 706 | { |
668 | struct zone *zone = page_zone(page); | 707 | struct zone *zone = page_zone(page); |
@@ -671,17 +710,22 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
671 | 710 | ||
672 | arch_free_page(page, 0); | 711 | arch_free_page(page, 0); |
673 | 712 | ||
674 | kernel_map_pages(page, 1, 0); | ||
675 | inc_page_state(pgfree); | ||
676 | if (PageAnon(page)) | 713 | if (PageAnon(page)) |
677 | page->mapping = NULL; | 714 | page->mapping = NULL; |
678 | free_pages_check(__FUNCTION__, page); | 715 | if (free_pages_check(page)) |
716 | return; | ||
717 | |||
718 | kernel_map_pages(page, 1, 0); | ||
719 | |||
679 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 720 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; |
680 | local_irq_save(flags); | 721 | local_irq_save(flags); |
722 | __inc_page_state(pgfree); | ||
681 | list_add(&page->lru, &pcp->list); | 723 | list_add(&page->lru, &pcp->list); |
682 | pcp->count++; | 724 | pcp->count++; |
683 | if (pcp->count >= pcp->high) | 725 | if (pcp->count >= pcp->high) { |
684 | pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 726 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
727 | pcp->count -= pcp->batch; | ||
728 | } | ||
685 | local_irq_restore(flags); | 729 | local_irq_restore(flags); |
686 | put_cpu(); | 730 | put_cpu(); |
687 | } | 731 | } |
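With pcp->low gone, a per-cpu list is trimmed only when it reaches pcp->high, and then a full pcp->batch of pages goes back to the buddy lists in one pass. A sketch with assumed numbers:

	static void example_trim_pcp(struct zone *zone, struct per_cpu_pages *pcp)
	{
		/*
		 * Worked example (values assumed): batch = 31, so the hot
		 * list's high mark is 6 * 31 = 186.  Hitting it frees one
		 * batch, dropping count from 186 to 155.  Interrupts are
		 * already disabled here, as in free_hot_cold_page() above.
		 */
		if (pcp->count >= pcp->high) {
			free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
			pcp->count -= pcp->batch;
		}
	}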
@@ -710,64 +754,82 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
710 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 754 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
711 | * or two. | 755 | * or two. |
712 | */ | 756 | */ |
713 | static struct page * | 757 | static struct page *buffered_rmqueue(struct zonelist *zonelist, |
714 | buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) | 758 | struct zone *zone, int order, gfp_t gfp_flags) |
715 | { | 759 | { |
716 | unsigned long flags; | 760 | unsigned long flags; |
717 | struct page *page = NULL; | 761 | struct page *page; |
718 | int cold = !!(gfp_flags & __GFP_COLD); | 762 | int cold = !!(gfp_flags & __GFP_COLD); |
763 | int cpu; | ||
719 | 764 | ||
720 | if (order == 0) { | 765 | again: |
766 | cpu = get_cpu(); | ||
767 | if (likely(order == 0)) { | ||
721 | struct per_cpu_pages *pcp; | 768 | struct per_cpu_pages *pcp; |
722 | 769 | ||
723 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 770 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; |
724 | local_irq_save(flags); | 771 | local_irq_save(flags); |
725 | if (pcp->count <= pcp->low) | 772 | if (!pcp->count) { |
726 | pcp->count += rmqueue_bulk(zone, 0, | 773 | pcp->count += rmqueue_bulk(zone, 0, |
727 | pcp->batch, &pcp->list); | 774 | pcp->batch, &pcp->list); |
728 | if (pcp->count) { | 775 | if (unlikely(!pcp->count)) |
729 | page = list_entry(pcp->list.next, struct page, lru); | 776 | goto failed; |
730 | list_del(&page->lru); | ||
731 | pcp->count--; | ||
732 | } | 777 | } |
733 | local_irq_restore(flags); | 778 | page = list_entry(pcp->list.next, struct page, lru); |
734 | put_cpu(); | 779 | list_del(&page->lru); |
735 | } | 780 | pcp->count--; |
736 | 781 | } else { | |
737 | if (page == NULL) { | ||
738 | spin_lock_irqsave(&zone->lock, flags); | 782 | spin_lock_irqsave(&zone->lock, flags); |
739 | page = __rmqueue(zone, order); | 783 | page = __rmqueue(zone, order); |
740 | spin_unlock_irqrestore(&zone->lock, flags); | 784 | spin_unlock(&zone->lock); |
785 | if (!page) | ||
786 | goto failed; | ||
741 | } | 787 | } |
742 | 788 | ||
743 | if (page != NULL) { | 789 | __mod_page_state_zone(zone, pgalloc, 1 << order); |
744 | BUG_ON(bad_range(zone, page)); | 790 | zone_statistics(zonelist, zone, cpu); |
745 | mod_page_state_zone(zone, pgalloc, 1 << order); | 791 | local_irq_restore(flags); |
746 | prep_new_page(page, order); | 792 | put_cpu(); |
793 | |||
794 | BUG_ON(bad_range(zone, page)); | ||
795 | if (prep_new_page(page, order)) | ||
796 | goto again; | ||
747 | 797 | ||
748 | if (gfp_flags & __GFP_ZERO) | 798 | if (gfp_flags & __GFP_ZERO) |
749 | prep_zero_page(page, order, gfp_flags); | 799 | prep_zero_page(page, order, gfp_flags); |
750 | 800 | ||
751 | if (order && (gfp_flags & __GFP_COMP)) | 801 | if (order && (gfp_flags & __GFP_COMP)) |
752 | prep_compound_page(page, order); | 802 | prep_compound_page(page, order); |
753 | } | ||
754 | return page; | 803 | return page; |
804 | |||
805 | failed: | ||
806 | local_irq_restore(flags); | ||
807 | put_cpu(); | ||
808 | return NULL; | ||
755 | } | 809 | } |
756 | 810 | ||
811 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | ||
812 | #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ | ||
813 | #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ | ||
814 | #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ | ||
815 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | ||
816 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | ||
817 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | ||
818 | |||
757 | /* | 819 | /* |
758 | * Return 1 if free pages are above 'mark'. This takes into account the order | 820 | * Return 1 if free pages are above 'mark'. This takes into account the order |
759 | * of the allocation. | 821 | * of the allocation. |
760 | */ | 822 | */ |
761 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 823 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
762 | int classzone_idx, int can_try_harder, gfp_t gfp_high) | 824 | int classzone_idx, int alloc_flags) |
763 | { | 825 | { |
764 | /* free_pages my go negative - that's OK */ | 826 | /* free_pages my go negative - that's OK */ |
765 | long min = mark, free_pages = z->free_pages - (1 << order) + 1; | 827 | long min = mark, free_pages = z->free_pages - (1 << order) + 1; |
766 | int o; | 828 | int o; |
767 | 829 | ||
768 | if (gfp_high) | 830 | if (alloc_flags & ALLOC_HIGH) |
769 | min -= min / 2; | 831 | min -= min / 2; |
770 | if (can_try_harder) | 832 | if (alloc_flags & ALLOC_HARDER) |
771 | min -= min / 4; | 833 | min -= min / 4; |
772 | 834 | ||
773 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 835 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
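The old can_try_harder/gfp_high booleans are folded into alloc_flags, but the arithmetic is unchanged. Worked example with assumed numbers: for mark = 1024 pages with both ALLOC_HIGH and ALLOC_HARDER set, the effective minimum drops to 1024 - 512 = 512 and then 512 - 128 = 384, and the request succeeds only while free_pages stays above 384 plus the classzone's lowmem_reserve.

	/*
	 * Worked example of the min adjustment above (mark assumed 1024):
	 * ALLOC_HIGH   ->  1024 - 512 = 512
	 * ALLOC_HARDER ->   512 - 128 = 384
	 */
	static long example_effective_min(long mark, int alloc_flags)
	{
		long min = mark;

		if (alloc_flags & ALLOC_HIGH)
			min -= min / 2;
		if (alloc_flags & ALLOC_HARDER)
			min -= min / 4;
		return min;
	}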
@@ -785,14 +847,48 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
785 | return 1; | 847 | return 1; |
786 | } | 848 | } |
787 | 849 | ||
788 | static inline int | 850 | /* |
789 | should_reclaim_zone(struct zone *z, gfp_t gfp_mask) | 851 | * get_page_from_freeliest goes through the zonelist trying to allocate |
852 | * a page. | ||
853 | */ | ||
854 | static struct page * | ||
855 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | ||
856 | struct zonelist *zonelist, int alloc_flags) | ||
790 | { | 857 | { |
791 | if (!z->reclaim_pages) | 858 | struct zone **z = zonelist->zones; |
792 | return 0; | 859 | struct page *page = NULL; |
793 | if (gfp_mask & __GFP_NORECLAIM) | 860 | int classzone_idx = zone_idx(*z); |
794 | return 0; | 861 | |
795 | return 1; | 862 | /* |
863 | * Go through the zonelist once, looking for a zone with enough free. | ||
864 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | ||
865 | */ | ||
866 | do { | ||
867 | if ((alloc_flags & ALLOC_CPUSET) && | ||
868 | !cpuset_zone_allowed(*z, gfp_mask)) | ||
869 | continue; | ||
870 | |||
871 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | ||
872 | unsigned long mark; | ||
873 | if (alloc_flags & ALLOC_WMARK_MIN) | ||
874 | mark = (*z)->pages_min; | ||
875 | else if (alloc_flags & ALLOC_WMARK_LOW) | ||
876 | mark = (*z)->pages_low; | ||
877 | else | ||
878 | mark = (*z)->pages_high; | ||
879 | if (!zone_watermark_ok(*z, order, mark, | ||
880 | classzone_idx, alloc_flags)) | ||
881 | if (!zone_reclaim_mode || | ||
882 | !zone_reclaim(*z, gfp_mask, order)) | ||
883 | continue; | ||
884 | } | ||
885 | |||
886 | page = buffered_rmqueue(zonelist, *z, order, gfp_mask); | ||
887 | if (page) { | ||
888 | break; | ||
889 | } | ||
890 | } while (*(++z) != NULL); | ||
891 | return page; | ||
796 | } | 892 | } |
797 | 893 | ||
798 | /* | 894 | /* |
@@ -803,105 +899,76 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, | |||
803 | struct zonelist *zonelist) | 899 | struct zonelist *zonelist) |
804 | { | 900 | { |
805 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 901 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
806 | struct zone **zones, *z; | 902 | struct zone **z; |
807 | struct page *page; | 903 | struct page *page; |
808 | struct reclaim_state reclaim_state; | 904 | struct reclaim_state reclaim_state; |
809 | struct task_struct *p = current; | 905 | struct task_struct *p = current; |
810 | int i; | ||
811 | int classzone_idx; | ||
812 | int do_retry; | 906 | int do_retry; |
813 | int can_try_harder; | 907 | int alloc_flags; |
814 | int did_some_progress; | 908 | int did_some_progress; |
815 | 909 | ||
816 | might_sleep_if(wait); | 910 | might_sleep_if(wait); |
817 | 911 | ||
818 | /* | 912 | restart: |
819 | * The caller may dip into page reserves a bit more if the caller | 913 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
820 | * cannot run direct reclaim, or is the caller has realtime scheduling | ||
821 | * policy | ||
822 | */ | ||
823 | can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; | ||
824 | |||
825 | zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ | ||
826 | 914 | ||
827 | if (unlikely(zones[0] == NULL)) { | 915 | if (unlikely(*z == NULL)) { |
828 | /* Should this ever happen?? */ | 916 | /* Should this ever happen?? */ |
829 | return NULL; | 917 | return NULL; |
830 | } | 918 | } |
831 | 919 | ||
832 | classzone_idx = zone_idx(zones[0]); | 920 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, |
921 | zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); | ||
922 | if (page) | ||
923 | goto got_pg; | ||
924 | |||
925 | do { | ||
926 | wakeup_kswapd(*z, order); | ||
927 | } while (*(++z)); | ||
833 | 928 | ||
834 | restart: | ||
835 | /* | 929 | /* |
836 | * Go through the zonelist once, looking for a zone with enough free. | 930 | * OK, we're below the kswapd watermark and have kicked background |
837 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 931 | * reclaim. Now things get more complex, so set up alloc_flags according |
932 | * to how we want to proceed. | ||
933 | * | ||
934 | * The caller may dip into page reserves a bit more if the caller | ||
935 | * cannot run direct reclaim, or if the caller has realtime scheduling | ||
936 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | ||
937 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
838 | */ | 938 | */ |
839 | for (i = 0; (z = zones[i]) != NULL; i++) { | 939 | alloc_flags = ALLOC_WMARK_MIN; |
840 | int do_reclaim = should_reclaim_zone(z, gfp_mask); | 940 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) |
841 | 941 | alloc_flags |= ALLOC_HARDER; | |
842 | if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) | 942 | if (gfp_mask & __GFP_HIGH) |
843 | continue; | 943 | alloc_flags |= ALLOC_HIGH; |
844 | 944 | alloc_flags |= ALLOC_CPUSET; | |
845 | /* | ||
846 | * If the zone is to attempt early page reclaim then this loop | ||
847 | * will try to reclaim pages and check the watermark a second | ||
848 | * time before giving up and falling back to the next zone. | ||
849 | */ | ||
850 | zone_reclaim_retry: | ||
851 | if (!zone_watermark_ok(z, order, z->pages_low, | ||
852 | classzone_idx, 0, 0)) { | ||
853 | if (!do_reclaim) | ||
854 | continue; | ||
855 | else { | ||
856 | zone_reclaim(z, gfp_mask, order); | ||
857 | /* Only try reclaim once */ | ||
858 | do_reclaim = 0; | ||
859 | goto zone_reclaim_retry; | ||
860 | } | ||
861 | } | ||
862 | |||
863 | page = buffered_rmqueue(z, order, gfp_mask); | ||
864 | if (page) | ||
865 | goto got_pg; | ||
866 | } | ||
867 | |||
868 | for (i = 0; (z = zones[i]) != NULL; i++) | ||
869 | wakeup_kswapd(z, order); | ||
870 | 945 | ||
871 | /* | 946 | /* |
872 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 947 | * Go through the zonelist again. Let __GFP_HIGH and allocations |
873 | * coming from realtime tasks to go deeper into reserves | 948 | * coming from realtime tasks go deeper into reserves. |
874 | * | 949 | * |
875 | * This is the last chance, in general, before the goto nopage. | 950 | * This is the last chance, in general, before the goto nopage. |
876 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | 951 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
877 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 952 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
878 | */ | 953 | */ |
879 | for (i = 0; (z = zones[i]) != NULL; i++) { | 954 | page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); |
880 | if (!zone_watermark_ok(z, order, z->pages_min, | 955 | if (page) |
881 | classzone_idx, can_try_harder, | 956 | goto got_pg; |
882 | gfp_mask & __GFP_HIGH)) | ||
883 | continue; | ||
884 | |||
885 | if (wait && !cpuset_zone_allowed(z, gfp_mask)) | ||
886 | continue; | ||
887 | |||
888 | page = buffered_rmqueue(z, order, gfp_mask); | ||
889 | if (page) | ||
890 | goto got_pg; | ||
891 | } | ||
892 | 957 | ||
893 | /* This allocation should allow future memory freeing. */ | 958 | /* This allocation should allow future memory freeing. */ |
894 | 959 | ||
895 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 960 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) |
896 | && !in_interrupt()) { | 961 | && !in_interrupt()) { |
897 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 962 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
963 | nofail_alloc: | ||
898 | /* go through the zonelist yet again, ignoring mins */ | 964 | /* go through the zonelist yet again, ignoring mins */ |
899 | for (i = 0; (z = zones[i]) != NULL; i++) { | 965 | page = get_page_from_freelist(gfp_mask, order, |
900 | if (!cpuset_zone_allowed(z, gfp_mask)) | 966 | zonelist, ALLOC_NO_WATERMARKS); |
901 | continue; | 967 | if (page) |
902 | page = buffered_rmqueue(z, order, gfp_mask); | 968 | goto got_pg; |
903 | if (page) | 969 | if (gfp_mask & __GFP_NOFAIL) { |
904 | goto got_pg; | 970 | blk_congestion_wait(WRITE, HZ/50); |
971 | goto nofail_alloc; | ||
905 | } | 972 | } |
906 | } | 973 | } |
907 | goto nopage; | 974 | goto nopage; |
@@ -915,11 +982,12 @@ rebalance: | |||
915 | cond_resched(); | 982 | cond_resched(); |
916 | 983 | ||
917 | /* We now go into synchronous reclaim */ | 984 | /* We now go into synchronous reclaim */ |
985 | cpuset_memory_pressure_bump(); | ||
918 | p->flags |= PF_MEMALLOC; | 986 | p->flags |= PF_MEMALLOC; |
919 | reclaim_state.reclaimed_slab = 0; | 987 | reclaim_state.reclaimed_slab = 0; |
920 | p->reclaim_state = &reclaim_state; | 988 | p->reclaim_state = &reclaim_state; |
921 | 989 | ||
922 | did_some_progress = try_to_free_pages(zones, gfp_mask); | 990 | did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); |
923 | 991 | ||
924 | p->reclaim_state = NULL; | 992 | p->reclaim_state = NULL; |
925 | p->flags &= ~PF_MEMALLOC; | 993 | p->flags &= ~PF_MEMALLOC; |
@@ -927,19 +995,10 @@ rebalance: | |||
927 | cond_resched(); | 995 | cond_resched(); |
928 | 996 | ||
929 | if (likely(did_some_progress)) { | 997 | if (likely(did_some_progress)) { |
930 | for (i = 0; (z = zones[i]) != NULL; i++) { | 998 | page = get_page_from_freelist(gfp_mask, order, |
931 | if (!zone_watermark_ok(z, order, z->pages_min, | 999 | zonelist, alloc_flags); |
932 | classzone_idx, can_try_harder, | 1000 | if (page) |
933 | gfp_mask & __GFP_HIGH)) | 1001 | goto got_pg; |
934 | continue; | ||
935 | |||
936 | if (!cpuset_zone_allowed(z, gfp_mask)) | ||
937 | continue; | ||
938 | |||
939 | page = buffered_rmqueue(z, order, gfp_mask); | ||
940 | if (page) | ||
941 | goto got_pg; | ||
942 | } | ||
943 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 1002 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
944 | /* | 1003 | /* |
945 | * Go through the zonelist yet one more time, keep | 1004 | * Go through the zonelist yet one more time, keep |
@@ -947,18 +1006,10 @@ rebalance: | |||
947 | * a parallel oom killing, we must fail if we're still | 1006 | * a parallel oom killing, we must fail if we're still |
948 | * under heavy pressure. | 1007 | * under heavy pressure. |
949 | */ | 1008 | */ |
950 | for (i = 0; (z = zones[i]) != NULL; i++) { | 1009 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, |
951 | if (!zone_watermark_ok(z, order, z->pages_high, | 1010 | zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); |
952 | classzone_idx, 0, 0)) | 1011 | if (page) |
953 | continue; | 1012 | goto got_pg; |
954 | |||
955 | if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) | ||
956 | continue; | ||
957 | |||
958 | page = buffered_rmqueue(z, order, gfp_mask); | ||
959 | if (page) | ||
960 | goto got_pg; | ||
961 | } | ||
962 | 1013 | ||
963 | out_of_memory(gfp_mask, order); | 1014 | out_of_memory(gfp_mask, order); |
964 | goto restart; | 1015 | goto restart; |
@@ -991,9 +1042,7 @@ nopage: | |||
991 | dump_stack(); | 1042 | dump_stack(); |
992 | show_mem(); | 1043 | show_mem(); |
993 | } | 1044 | } |
994 | return NULL; | ||
995 | got_pg: | 1045 | got_pg: |
996 | zone_statistics(zonelist, z); | ||
997 | return page; | 1046 | return page; |
998 | } | 1047 | } |
999 | 1048 | ||
@@ -1160,7 +1209,7 @@ EXPORT_SYMBOL(nr_pagecache); | |||
1160 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; | 1209 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; |
1161 | #endif | 1210 | #endif |
1162 | 1211 | ||
1163 | void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | 1212 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) |
1164 | { | 1213 | { |
1165 | int cpu = 0; | 1214 | int cpu = 0; |
1166 | 1215 | ||
@@ -1213,7 +1262,7 @@ void get_full_page_state(struct page_state *ret) | |||
1213 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); | 1262 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); |
1214 | } | 1263 | } |
1215 | 1264 | ||
1216 | unsigned long __read_page_state(unsigned long offset) | 1265 | unsigned long read_page_state_offset(unsigned long offset) |
1217 | { | 1266 | { |
1218 | unsigned long ret = 0; | 1267 | unsigned long ret = 0; |
1219 | int cpu; | 1268 | int cpu; |
@@ -1227,18 +1276,26 @@ unsigned long __read_page_state(unsigned long offset) | |||
1227 | return ret; | 1276 | return ret; |
1228 | } | 1277 | } |
1229 | 1278 | ||
1230 | void __mod_page_state(unsigned long offset, unsigned long delta) | 1279 | void __mod_page_state_offset(unsigned long offset, unsigned long delta) |
1280 | { | ||
1281 | void *ptr; | ||
1282 | |||
1283 | ptr = &__get_cpu_var(page_states); | ||
1284 | *(unsigned long *)(ptr + offset) += delta; | ||
1285 | } | ||
1286 | EXPORT_SYMBOL(__mod_page_state_offset); | ||
1287 | |||
1288 | void mod_page_state_offset(unsigned long offset, unsigned long delta) | ||
1231 | { | 1289 | { |
1232 | unsigned long flags; | 1290 | unsigned long flags; |
1233 | void* ptr; | 1291 | void *ptr; |
1234 | 1292 | ||
1235 | local_irq_save(flags); | 1293 | local_irq_save(flags); |
1236 | ptr = &__get_cpu_var(page_states); | 1294 | ptr = &__get_cpu_var(page_states); |
1237 | *(unsigned long*)(ptr + offset) += delta; | 1295 | *(unsigned long *)(ptr + offset) += delta; |
1238 | local_irq_restore(flags); | 1296 | local_irq_restore(flags); |
1239 | } | 1297 | } |
1240 | 1298 | EXPORT_SYMBOL(mod_page_state_offset); | |
1241 | EXPORT_SYMBOL(__mod_page_state); | ||
1242 | 1299 | ||
1243 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | 1300 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, |
1244 | unsigned long *free, struct pglist_data *pgdat) | 1301 | unsigned long *free, struct pglist_data *pgdat) |
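The page-state helpers are renamed to *_offset() and now take a byte offset into struct page_state. Callers keep using field names; the translation to an offset is done by wrapper macros of roughly this shape (the exact macros live in the page-flags header and are an assumption here, not shown in this hunk):

	/* Rough shape of the wrappers that feed these functions: */
	#define example_mod_page_state(member, delta)				\
		mod_page_state_offset(offsetof(struct page_state, member), delta)
	#define example___mod_page_state(member, delta)				\
		__mod_page_state_offset(offsetof(struct page_state, member), delta)

	/* e.g. example_mod_page_state(pgfree, 1 << order); */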
@@ -1324,7 +1381,7 @@ void show_free_areas(void) | |||
1324 | show_node(zone); | 1381 | show_node(zone); |
1325 | printk("%s per-cpu:", zone->name); | 1382 | printk("%s per-cpu:", zone->name); |
1326 | 1383 | ||
1327 | if (!zone->present_pages) { | 1384 | if (!populated_zone(zone)) { |
1328 | printk(" empty\n"); | 1385 | printk(" empty\n"); |
1329 | continue; | 1386 | continue; |
1330 | } else | 1387 | } else |
@@ -1336,10 +1393,9 @@ void show_free_areas(void) | |||
1336 | pageset = zone_pcp(zone, cpu); | 1393 | pageset = zone_pcp(zone, cpu); |
1337 | 1394 | ||
1338 | for (temperature = 0; temperature < 2; temperature++) | 1395 | for (temperature = 0; temperature < 2; temperature++) |
1339 | printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", | 1396 | printk("cpu %d %s: high %d, batch %d used:%d\n", |
1340 | cpu, | 1397 | cpu, |
1341 | temperature ? "cold" : "hot", | 1398 | temperature ? "cold" : "hot", |
1342 | pageset->pcp[temperature].low, | ||
1343 | pageset->pcp[temperature].high, | 1399 | pageset->pcp[temperature].high, |
1344 | pageset->pcp[temperature].batch, | 1400 | pageset->pcp[temperature].batch, |
1345 | pageset->pcp[temperature].count); | 1401 | pageset->pcp[temperature].count); |
@@ -1402,7 +1458,7 @@ void show_free_areas(void) | |||
1402 | 1458 | ||
1403 | show_node(zone); | 1459 | show_node(zone); |
1404 | printk("%s: ", zone->name); | 1460 | printk("%s: ", zone->name); |
1405 | if (!zone->present_pages) { | 1461 | if (!populated_zone(zone)) { |
1406 | printk("empty\n"); | 1462 | printk("empty\n"); |
1407 | continue; | 1463 | continue; |
1408 | } | 1464 | } |
@@ -1422,32 +1478,29 @@ void show_free_areas(void) | |||
1422 | 1478 | ||
1423 | /* | 1479 | /* |
1424 | * Builds allocation fallback zone lists. | 1480 | * Builds allocation fallback zone lists. |
1481 | * | ||
1482 | * Add all populated zones of a node to the zonelist. | ||
1425 | */ | 1483 | */ |
1426 | static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) | 1484 | static int __init build_zonelists_node(pg_data_t *pgdat, |
1427 | { | 1485 | struct zonelist *zonelist, int nr_zones, int zone_type) |
1428 | switch (k) { | 1486 | { |
1429 | struct zone *zone; | 1487 | struct zone *zone; |
1430 | default: | 1488 | |
1431 | BUG(); | 1489 | BUG_ON(zone_type > ZONE_HIGHMEM); |
1432 | case ZONE_HIGHMEM: | 1490 | |
1433 | zone = pgdat->node_zones + ZONE_HIGHMEM; | 1491 | do { |
1434 | if (zone->present_pages) { | 1492 | zone = pgdat->node_zones + zone_type; |
1493 | if (populated_zone(zone)) { | ||
1435 | #ifndef CONFIG_HIGHMEM | 1494 | #ifndef CONFIG_HIGHMEM |
1436 | BUG(); | 1495 | BUG_ON(zone_type > ZONE_NORMAL); |
1437 | #endif | 1496 | #endif |
1438 | zonelist->zones[j++] = zone; | 1497 | zonelist->zones[nr_zones++] = zone; |
1498 | check_highest_zone(zone_type); | ||
1439 | } | 1499 | } |
1440 | case ZONE_NORMAL: | 1500 | zone_type--; |
1441 | zone = pgdat->node_zones + ZONE_NORMAL; | ||
1442 | if (zone->present_pages) | ||
1443 | zonelist->zones[j++] = zone; | ||
1444 | case ZONE_DMA: | ||
1445 | zone = pgdat->node_zones + ZONE_DMA; | ||
1446 | if (zone->present_pages) | ||
1447 | zonelist->zones[j++] = zone; | ||
1448 | } | ||
1449 | 1501 | ||
1450 | return j; | 1502 | } while (zone_type >= 0); |
1503 | return nr_zones; | ||
1451 | } | 1504 | } |
1452 | 1505 | ||
1453 | static inline int highest_zone(int zone_bits) | 1506 | static inline int highest_zone(int zone_bits) |
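The fall-through switch becomes a loop that walks from the requested zone_type down to ZONE_DMA, appending every populated zone. On a node with all zones present, a __GFP_HIGHMEM request therefore falls back in the order HighMem, Normal, DMA32, DMA. A small usage sketch (the wrapper is hypothetical; the NULL terminator matches what build_zonelists() adds):

	/* Hypothetical wrapper showing the call and the resulting order:
	 * zonelist->zones[] = { HighMem, Normal, DMA32, DMA, NULL } */
	static int example_fill_zonelist(pg_data_t *pgdat, struct zonelist *zonelist)
	{
		int n = build_zonelists_node(pgdat, zonelist, 0, ZONE_HIGHMEM);

		zonelist->zones[n] = NULL;
		return n;
	}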
@@ -1455,6 +1508,8 @@ static inline int highest_zone(int zone_bits) | |||
1455 | int res = ZONE_NORMAL; | 1508 | int res = ZONE_NORMAL; |
1456 | if (zone_bits & (__force int)__GFP_HIGHMEM) | 1509 | if (zone_bits & (__force int)__GFP_HIGHMEM) |
1457 | res = ZONE_HIGHMEM; | 1510 | res = ZONE_HIGHMEM; |
1511 | if (zone_bits & (__force int)__GFP_DMA32) | ||
1512 | res = ZONE_DMA32; | ||
1458 | if (zone_bits & (__force int)__GFP_DMA) | 1513 | if (zone_bits & (__force int)__GFP_DMA) |
1459 | res = ZONE_DMA; | 1514 | res = ZONE_DMA; |
1460 | return res; | 1515 | return res; |
@@ -1542,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat) | |||
1542 | prev_node = local_node; | 1597 | prev_node = local_node; |
1543 | nodes_clear(used_mask); | 1598 | nodes_clear(used_mask); |
1544 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 1599 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
1600 | int distance = node_distance(local_node, node); | ||
1601 | |||
1602 | /* | ||
1603 | * If another node is sufficiently far away then it is better | ||
1604 | * to reclaim pages in a zone before going off node. | ||
1605 | */ | ||
1606 | if (distance > RECLAIM_DISTANCE) | ||
1607 | zone_reclaim_mode = 1; | ||
1608 | |||
1545 | /* | 1609 | /* |
1546 | * We don't want to pressure a particular node. | 1610 | * We don't want to pressure a particular node. |
1547 | * So adding penalty to the first node in same | 1611 | * So adding penalty to the first node in same |
1548 | * distance group to make it round-robin. | 1612 | * distance group to make it round-robin. |
1549 | */ | 1613 | */ |
1550 | if (node_distance(local_node, node) != | 1614 | |
1551 | node_distance(local_node, prev_node)) | 1615 | if (distance != node_distance(local_node, prev_node)) |
1552 | node_load[node] += load; | 1616 | node_load[node] += load; |
1553 | prev_node = node; | 1617 | prev_node = node; |
1554 | load--; | 1618 | load--; |
@@ -1682,18 +1746,16 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, | |||
1682 | * up by free_all_bootmem() once the early boot process is | 1746 | * up by free_all_bootmem() once the early boot process is |
1683 | * done. Non-atomic initialization, single-pass. | 1747 | * done. Non-atomic initialization, single-pass. |
1684 | */ | 1748 | */ |
1685 | void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | 1749 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
1686 | unsigned long start_pfn) | 1750 | unsigned long start_pfn) |
1687 | { | 1751 | { |
1688 | struct page *page; | 1752 | struct page *page; |
1689 | unsigned long end_pfn = start_pfn + size; | 1753 | unsigned long end_pfn = start_pfn + size; |
1690 | unsigned long pfn; | 1754 | unsigned long pfn; |
1691 | 1755 | ||
1692 | for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { | 1756 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
1693 | if (!early_pfn_valid(pfn)) | 1757 | if (!early_pfn_valid(pfn)) |
1694 | continue; | 1758 | continue; |
1695 | if (!early_pfn_in_nid(pfn, nid)) | ||
1696 | continue; | ||
1697 | page = pfn_to_page(pfn); | 1759 | page = pfn_to_page(pfn); |
1698 | set_page_links(page, zone, nid, pfn); | 1760 | set_page_links(page, zone, nid, pfn); |
1699 | set_page_count(page, 1); | 1761 | set_page_count(page, 1); |
@@ -1737,7 +1799,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, | |||
1737 | memmap_init_zone((size), (nid), (zone), (start_pfn)) | 1799 | memmap_init_zone((size), (nid), (zone), (start_pfn)) |
1738 | #endif | 1800 | #endif |
1739 | 1801 | ||
1740 | static int __devinit zone_batchsize(struct zone *zone) | 1802 | static int __meminit zone_batchsize(struct zone *zone) |
1741 | { | 1803 | { |
1742 | int batch; | 1804 | int batch; |
1743 | 1805 | ||
@@ -1755,16 +1817,16 @@ static int __devinit zone_batchsize(struct zone *zone) | |||
1755 | batch = 1; | 1817 | batch = 1; |
1756 | 1818 | ||
1757 | /* | 1819 | /* |
1758 | * We will be trying to allcoate bigger chunks of contiguous | 1820 | * Clamp the batch to a 2^n - 1 value. Having a power |
1759 | * memory of the order of fls(batch). This should result in | 1821 | * of 2 value was found to be more likely to have |
1760 | * better cache coloring. | 1822 | * suboptimal cache aliasing properties in some cases. |
1761 | * | 1823 | * |
1762 | * A sanity check also to ensure that batch is still in limits. | 1824 | * For example if 2 tasks are alternately allocating |
1825 | * batches of pages, one task can end up with a lot | ||
1826 | * of pages of one half of the possible page colors | ||
1827 | * and the other with pages of the other colors. | ||
1763 | */ | 1828 | */ |
1764 | batch = (1 << fls(batch + batch/2)); | 1829 | batch = (1 << (fls(batch + batch/2)-1)) - 1; |
1765 | |||
1766 | if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) | ||
1767 | batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); | ||
1768 | 1830 | ||
1769 | return batch; | 1831 | return batch; |
1770 | } | 1832 | } |
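The rounding now lands on one less than a power of two instead of the next power of two. Worked example (batch value assumed): if the earlier sizing produced batch = 32, then fls(32 + 16) = 6, so the new code returns (1 << 5) - 1 = 31 where the old code would have returned 1 << 6 = 64.

	/* Worked example of the new rounding step (batch assumed 32):
	 * fls(48) = 6  ->  (1 << 5) - 1 = 31   (old code: 1 << 6 = 64) */
	static int example_round_batch(int batch)
	{
		return (1 << (fls(batch + batch/2) - 1)) - 1;
	}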
@@ -1777,19 +1839,35 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
1777 | 1839 | ||
1778 | pcp = &p->pcp[0]; /* hot */ | 1840 | pcp = &p->pcp[0]; /* hot */ |
1779 | pcp->count = 0; | 1841 | pcp->count = 0; |
1780 | pcp->low = 0; | ||
1781 | pcp->high = 6 * batch; | 1842 | pcp->high = 6 * batch; |
1782 | pcp->batch = max(1UL, 1 * batch); | 1843 | pcp->batch = max(1UL, 1 * batch); |
1783 | INIT_LIST_HEAD(&pcp->list); | 1844 | INIT_LIST_HEAD(&pcp->list); |
1784 | 1845 | ||
1785 | pcp = &p->pcp[1]; /* cold*/ | 1846 | pcp = &p->pcp[1]; /* cold*/ |
1786 | pcp->count = 0; | 1847 | pcp->count = 0; |
1787 | pcp->low = 0; | ||
1788 | pcp->high = 2 * batch; | 1848 | pcp->high = 2 * batch; |
1789 | pcp->batch = max(1UL, batch/2); | 1849 | pcp->batch = max(1UL, batch/2); |
1790 | INIT_LIST_HEAD(&pcp->list); | 1850 | INIT_LIST_HEAD(&pcp->list); |
1791 | } | 1851 | } |
1792 | 1852 | ||
1853 | /* | ||
1854 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist | ||
1855 | * to the value high for the pageset p. | ||
1856 | */ | ||
1857 | |||
1858 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, | ||
1859 | unsigned long high) | ||
1860 | { | ||
1861 | struct per_cpu_pages *pcp; | ||
1862 | |||
1863 | pcp = &p->pcp[0]; /* hot list */ | ||
1864 | pcp->high = high; | ||
1865 | pcp->batch = max(1UL, high/4); | ||
1866 | if ((high/4) > (PAGE_SHIFT * 8)) | ||
1867 | pcp->batch = PAGE_SHIFT * 8; | ||
1868 | } | ||
1869 | |||
1870 | |||
1793 | #ifdef CONFIG_NUMA | 1871 | #ifdef CONFIG_NUMA |
1794 | /* | 1872 | /* |
1795 | * Boot pageset table. One per cpu which is going to be used for all | 1873 | * Boot pageset table. One per cpu which is going to be used for all |
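setup_pagelist_highmark() backs the new percpu_pagelist_fraction sysctl: the hot list's high mark becomes present_pages/fraction and the batch a quarter of that, capped at PAGE_SHIFT * 8. Worked example (numbers assumed): a zone of 1,048,576 4K pages with the fraction set to 8 gives high = 131072 and a capped batch of 96 (PAGE_SHIFT = 12).

	/* Worked example (values assumed): present_pages = 1048576,
	 * percpu_pagelist_fraction = 8
	 *   high  = 1048576 / 8            = 131072
	 *   batch = min(high / 4, 12 * 8)  = 96
	 */
	static void example_set_highmark(struct per_cpu_pageset *p, struct zone *zone,
					 int fraction)
	{
		setup_pagelist_highmark(p, zone->present_pages / fraction);
	}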
@@ -1815,18 +1893,22 @@ static struct per_cpu_pageset | |||
1815 | * Dynamically allocate memory for the | 1893 | * Dynamically allocate memory for the |
1816 | * per cpu pageset array in struct zone. | 1894 | * per cpu pageset array in struct zone. |
1817 | */ | 1895 | */ |
1818 | static int __devinit process_zones(int cpu) | 1896 | static int __meminit process_zones(int cpu) |
1819 | { | 1897 | { |
1820 | struct zone *zone, *dzone; | 1898 | struct zone *zone, *dzone; |
1821 | 1899 | ||
1822 | for_each_zone(zone) { | 1900 | for_each_zone(zone) { |
1823 | 1901 | ||
1824 | zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), | 1902 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
1825 | GFP_KERNEL, cpu_to_node(cpu)); | 1903 | GFP_KERNEL, cpu_to_node(cpu)); |
1826 | if (!zone->pageset[cpu]) | 1904 | if (!zone_pcp(zone, cpu)) |
1827 | goto bad; | 1905 | goto bad; |
1828 | 1906 | ||
1829 | setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); | 1907 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); |
1908 | |||
1909 | if (percpu_pagelist_fraction) | ||
1910 | setup_pagelist_highmark(zone_pcp(zone, cpu), | ||
1911 | (zone->present_pages / percpu_pagelist_fraction)); | ||
1830 | } | 1912 | } |
1831 | 1913 | ||
1832 | return 0; | 1914 | return 0; |
@@ -1834,15 +1916,14 @@ bad: | |||
1834 | for_each_zone(dzone) { | 1916 | for_each_zone(dzone) { |
1835 | if (dzone == zone) | 1917 | if (dzone == zone) |
1836 | break; | 1918 | break; |
1837 | kfree(dzone->pageset[cpu]); | 1919 | kfree(zone_pcp(dzone, cpu)); |
1838 | dzone->pageset[cpu] = NULL; | 1920 | zone_pcp(dzone, cpu) = NULL; |
1839 | } | 1921 | } |
1840 | return -ENOMEM; | 1922 | return -ENOMEM; |
1841 | } | 1923 | } |
1842 | 1924 | ||
1843 | static inline void free_zone_pagesets(int cpu) | 1925 | static inline void free_zone_pagesets(int cpu) |
1844 | { | 1926 | { |
1845 | #ifdef CONFIG_NUMA | ||
1846 | struct zone *zone; | 1927 | struct zone *zone; |
1847 | 1928 | ||
1848 | for_each_zone(zone) { | 1929 | for_each_zone(zone) { |
@@ -1851,10 +1932,9 @@ static inline void free_zone_pagesets(int cpu) | |||
1851 | zone_pcp(zone, cpu) = NULL; | 1932 | zone_pcp(zone, cpu) = NULL; |
1852 | kfree(pset); | 1933 | kfree(pset); |
1853 | } | 1934 | } |
1854 | #endif | ||
1855 | } | 1935 | } |
1856 | 1936 | ||
1857 | static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, | 1937 | static int __meminit pageset_cpuup_callback(struct notifier_block *nfb, |
1858 | unsigned long action, | 1938 | unsigned long action, |
1859 | void *hcpu) | 1939 | void *hcpu) |
1860 | { | 1940 | { |
@@ -1866,11 +1946,10 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, | |||
1866 | if (process_zones(cpu)) | 1946 | if (process_zones(cpu)) |
1867 | ret = NOTIFY_BAD; | 1947 | ret = NOTIFY_BAD; |
1868 | break; | 1948 | break; |
1869 | #ifdef CONFIG_HOTPLUG_CPU | 1949 | case CPU_UP_CANCELED: |
1870 | case CPU_DEAD: | 1950 | case CPU_DEAD: |
1871 | free_zone_pagesets(cpu); | 1951 | free_zone_pagesets(cpu); |
1872 | break; | 1952 | break; |
1873 | #endif | ||
1874 | default: | 1953 | default: |
1875 | break; | 1954 | break; |
1876 | } | 1955 | } |
@@ -1880,7 +1959,7 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, | |||
1880 | static struct notifier_block pageset_notifier = | 1959 | static struct notifier_block pageset_notifier = |
1881 | { &pageset_cpuup_callback, NULL, 0 }; | 1960 | { &pageset_cpuup_callback, NULL, 0 }; |
1882 | 1961 | ||
1883 | void __init setup_per_cpu_pageset() | 1962 | void __init setup_per_cpu_pageset(void) |
1884 | { | 1963 | { |
1885 | int err; | 1964 | int err; |
1886 | 1965 | ||
@@ -1895,7 +1974,7 @@ void __init setup_per_cpu_pageset() | |||
1895 | 1974 | ||
1896 | #endif | 1975 | #endif |
1897 | 1976 | ||
1898 | static __devinit | 1977 | static __meminit |
1899 | void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 1978 | void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
1900 | { | 1979 | { |
1901 | int i; | 1980 | int i; |
@@ -1915,7 +1994,7 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
1915 | init_waitqueue_head(zone->wait_table + i); | 1994 | init_waitqueue_head(zone->wait_table + i); |
1916 | } | 1995 | } |
1917 | 1996 | ||
1918 | static __devinit void zone_pcp_init(struct zone *zone) | 1997 | static __meminit void zone_pcp_init(struct zone *zone) |
1919 | { | 1998 | { |
1920 | int cpu; | 1999 | int cpu; |
1921 | unsigned long batch = zone_batchsize(zone); | 2000 | unsigned long batch = zone_batchsize(zone); |
@@ -1923,7 +2002,7 @@ static __devinit void zone_pcp_init(struct zone *zone) | |||
1923 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 2002 | for (cpu = 0; cpu < NR_CPUS; cpu++) { |
1924 | #ifdef CONFIG_NUMA | 2003 | #ifdef CONFIG_NUMA |
1925 | /* Early boot. Slab allocator not functional yet */ | 2004 | /* Early boot. Slab allocator not functional yet */ |
1926 | zone->pageset[cpu] = &boot_pageset[cpu]; | 2005 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; |
1927 | setup_pageset(&boot_pageset[cpu],0); | 2006 | setup_pageset(&boot_pageset[cpu],0); |
1928 | #else | 2007 | #else |
1929 | setup_pageset(zone_pcp(zone,cpu), batch); | 2008 | setup_pageset(zone_pcp(zone,cpu), batch); |
@@ -1933,7 +2012,7 @@ static __devinit void zone_pcp_init(struct zone *zone) | |||
1933 | zone->name, zone->present_pages, batch); | 2012 | zone->name, zone->present_pages, batch); |
1934 | } | 2013 | } |
1935 | 2014 | ||
1936 | static __devinit void init_currently_empty_zone(struct zone *zone, | 2015 | static __meminit void init_currently_empty_zone(struct zone *zone, |
1937 | unsigned long zone_start_pfn, unsigned long size) | 2016 | unsigned long zone_start_pfn, unsigned long size) |
1938 | { | 2017 | { |
1939 | struct pglist_data *pgdat = zone->zone_pgdat; | 2018 | struct pglist_data *pgdat = zone->zone_pgdat; |
@@ -1975,7 +2054,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
1975 | if (zholes_size) | 2054 | if (zholes_size) |
1976 | realsize -= zholes_size[j]; | 2055 | realsize -= zholes_size[j]; |
1977 | 2056 | ||
1978 | if (j == ZONE_DMA || j == ZONE_NORMAL) | 2057 | if (j < ZONE_HIGHMEM) |
1979 | nr_kernel_pages += realsize; | 2058 | nr_kernel_pages += realsize; |
1980 | nr_all_pages += realsize; | 2059 | nr_all_pages += realsize; |
1981 | 2060 | ||
@@ -2100,7 +2179,7 @@ static int frag_show(struct seq_file *m, void *arg) | |||
2100 | int order; | 2179 | int order; |
2101 | 2180 | ||
2102 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 2181 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { |
2103 | if (!zone->present_pages) | 2182 | if (!populated_zone(zone)) |
2104 | continue; | 2183 | continue; |
2105 | 2184 | ||
2106 | spin_lock_irqsave(&zone->lock, flags); | 2185 | spin_lock_irqsave(&zone->lock, flags); |
@@ -2133,7 +2212,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
2133 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | 2212 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { |
2134 | int i; | 2213 | int i; |
2135 | 2214 | ||
2136 | if (!zone->present_pages) | 2215 | if (!populated_zone(zone)) |
2137 | continue; | 2216 | continue; |
2138 | 2217 | ||
2139 | spin_lock_irqsave(&zone->lock, flags); | 2218 | spin_lock_irqsave(&zone->lock, flags); |
@@ -2166,7 +2245,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
2166 | seq_printf(m, | 2245 | seq_printf(m, |
2167 | ")" | 2246 | ")" |
2168 | "\n pagesets"); | 2247 | "\n pagesets"); |
2169 | for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { | 2248 | for_each_online_cpu(i) { |
2170 | struct per_cpu_pageset *pageset; | 2249 | struct per_cpu_pageset *pageset; |
2171 | int j; | 2250 | int j; |
2172 | 2251 | ||
@@ -2181,12 +2260,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
2181 | seq_printf(m, | 2260 | seq_printf(m, |
2182 | "\n cpu: %i pcp: %i" | 2261 | "\n cpu: %i pcp: %i" |
2183 | "\n count: %i" | 2262 | "\n count: %i" |
2184 | "\n low: %i" | ||
2185 | "\n high: %i" | 2263 | "\n high: %i" |
2186 | "\n batch: %i", | 2264 | "\n batch: %i", |
2187 | i, j, | 2265 | i, j, |
2188 | pageset->pcp[j].count, | 2266 | pageset->pcp[j].count, |
2189 | pageset->pcp[j].low, | ||
2190 | pageset->pcp[j].high, | 2267 | pageset->pcp[j].high, |
2191 | pageset->pcp[j].batch); | 2268 | pageset->pcp[j].batch); |
2192 | } | 2269 | } |
@@ -2241,32 +2318,40 @@ static char *vmstat_text[] = { | |||
2241 | "pgpgout", | 2318 | "pgpgout", |
2242 | "pswpin", | 2319 | "pswpin", |
2243 | "pswpout", | 2320 | "pswpout", |
2244 | "pgalloc_high", | ||
2245 | 2321 | ||
2322 | "pgalloc_high", | ||
2246 | "pgalloc_normal", | 2323 | "pgalloc_normal", |
2324 | "pgalloc_dma32", | ||
2247 | "pgalloc_dma", | 2325 | "pgalloc_dma", |
2326 | |||
2248 | "pgfree", | 2327 | "pgfree", |
2249 | "pgactivate", | 2328 | "pgactivate", |
2250 | "pgdeactivate", | 2329 | "pgdeactivate", |
2251 | 2330 | ||
2252 | "pgfault", | 2331 | "pgfault", |
2253 | "pgmajfault", | 2332 | "pgmajfault", |
2333 | |||
2254 | "pgrefill_high", | 2334 | "pgrefill_high", |
2255 | "pgrefill_normal", | 2335 | "pgrefill_normal", |
2336 | "pgrefill_dma32", | ||
2256 | "pgrefill_dma", | 2337 | "pgrefill_dma", |
2257 | 2338 | ||
2258 | "pgsteal_high", | 2339 | "pgsteal_high", |
2259 | "pgsteal_normal", | 2340 | "pgsteal_normal", |
2341 | "pgsteal_dma32", | ||
2260 | "pgsteal_dma", | 2342 | "pgsteal_dma", |
2343 | |||
2261 | "pgscan_kswapd_high", | 2344 | "pgscan_kswapd_high", |
2262 | "pgscan_kswapd_normal", | 2345 | "pgscan_kswapd_normal", |
2263 | | 2346 | "pgscan_kswapd_dma32", |
2264 | "pgscan_kswapd_dma", | 2347 | "pgscan_kswapd_dma", |
2348 | |||
2265 | "pgscan_direct_high", | 2349 | "pgscan_direct_high", |
2266 | "pgscan_direct_normal", | 2350 | "pgscan_direct_normal", |
2351 | "pgscan_direct_dma32", | ||
2267 | "pgscan_direct_dma", | 2352 | "pgscan_direct_dma", |
2268 | "pginodesteal", | ||
2269 | 2353 | ||
2354 | "pginodesteal", | ||
2270 | "slabs_scanned", | 2355 | "slabs_scanned", |
2271 | "kswapd_steal", | 2356 | "kswapd_steal", |
2272 | "kswapd_inodesteal", | 2357 | "kswapd_inodesteal", |
@@ -2417,13 +2502,18 @@ void setup_per_zone_pages_min(void) | |||
2417 | } | 2502 | } |
2418 | 2503 | ||
2419 | for_each_zone(zone) { | 2504 | for_each_zone(zone) { |
2505 | unsigned long tmp; | ||
2420 | spin_lock_irqsave(&zone->lru_lock, flags); | 2506 | spin_lock_irqsave(&zone->lru_lock, flags); |
2507 | tmp = (pages_min * zone->present_pages) / lowmem_pages; | ||
2421 | if (is_highmem(zone)) { | 2508 | if (is_highmem(zone)) { |
2422 | /* | 2509 | /* |
2423 | * Often, highmem doesn't need to reserve any pages. | 2510 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't |
2424 | * But the pages_min/low/high values are also used for | 2511 | * need highmem pages, so cap pages_min to a small |
2425 | * batching up page reclaim activity so we need a | 2512 | * value here. |
2426 | * decent value here. | 2513 | * |
2514 | * The (pages_high-pages_low) and (pages_low-pages_min) | ||
2515 | * deltas controls asynch page reclaim, and so should | ||
2516 | * not be capped for highmem. | ||
2427 | */ | 2517 | */ |
2428 | int min_pages; | 2518 | int min_pages; |
2429 | 2519 | ||
@@ -2434,19 +2524,15 @@ void setup_per_zone_pages_min(void) | |||
2434 | min_pages = 128; | 2524 | min_pages = 128; |
2435 | zone->pages_min = min_pages; | 2525 | zone->pages_min = min_pages; |
2436 | } else { | 2526 | } else { |
2437 | /* if it's a lowmem zone, reserve a number of pages | 2527 | /* |
2528 | * If it's a lowmem zone, reserve a number of pages | ||
2438 | * proportionate to the zone's size. | 2529 | * proportionate to the zone's size. |
2439 | */ | 2530 | */ |
2440 | zone->pages_min = (pages_min * zone->present_pages) / | 2531 | zone->pages_min = tmp; |
2441 | lowmem_pages; | ||
2442 | } | 2532 | } |
2443 | 2533 | ||
2444 | /* | 2534 | zone->pages_low = zone->pages_min + tmp / 4; |
2445 | * When interpreting these watermarks, just keep in mind that: | 2535 | zone->pages_high = zone->pages_min + tmp / 2; |
2446 | * zone->pages_min == (zone->pages_min * 4) / 4; | ||
2447 | */ | ||
2448 | zone->pages_low = (zone->pages_min * 5) / 4; | ||
2449 | zone->pages_high = (zone->pages_min * 6) / 4; | ||
2450 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 2536 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
2451 | } | 2537 | } |
2452 | } | 2538 | } |
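Editor's note: the rewritten watermark setup computes a per-zone share tmp = (pages_min * present_pages) / lowmem_pages, uses it directly as pages_min for lowmem zones (highmem zones instead clamp pages_min to a small value, at most 128 pages), and derives pages_low and pages_high by adding tmp/4 and tmp/2. A worked example with made-up numbers, not taken from any real machine:

	#include <stdio.h>

	int main(void)
	{
		unsigned long pages_min = 256;		/* global target, assumption */
		unsigned long lowmem_pages = 262144;	/* 1GB of 4K pages, assumption */
		unsigned long zone_present = 131072;	/* this zone: half of lowmem */

		unsigned long tmp = (pages_min * zone_present) / lowmem_pages;
		unsigned long zone_min  = tmp;			/* lowmem case */
		unsigned long zone_low  = zone_min + tmp / 4;
		unsigned long zone_high = zone_min + tmp / 2;

		/* prints min=128 low=160 high=192 for this zone */
		printf("min=%lu low=%lu high=%lu\n", zone_min, zone_low, zone_high);
		return 0;
	}

Keeping the low and high deltas proportional to tmp rather than to the (possibly capped) pages_min is what preserves sensible asynchronous-reclaim behaviour for highmem zones.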
@@ -2522,6 +2608,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | |||
2522 | return 0; | 2608 | return 0; |
2523 | } | 2609 | } |
2524 | 2610 | ||
2611 | /* | ||
2612 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each | ||
2613 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist | ||
2614 | * can have before it gets flushed back to buddy allocator. | ||
2615 | */ | ||
2616 | |||
2617 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | ||
2618 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | ||
2619 | { | ||
2620 | struct zone *zone; | ||
2621 | unsigned int cpu; | ||
2622 | int ret; | ||
2623 | |||
2624 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | ||
2625 | if (!write || (ret == -EINVAL)) | ||
2626 | return ret; | ||
2627 | for_each_zone(zone) { | ||
2628 | for_each_online_cpu(cpu) { | ||
2629 | unsigned long high; | ||
2630 | high = zone->present_pages / percpu_pagelist_fraction; | ||
2631 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | ||
2632 | } | ||
2633 | } | ||
2634 | return 0; | ||
2635 | } | ||
2636 | |||
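Editor's note: writing to the new percpu_pagelist_fraction sysctl sets each zone's hot-list high mark to present_pages / fraction on every online CPU, with the batch then following from setup_pagelist_highmark() above. A rough model of the effect, using an assumed zone size of 262144 pages:

	#include <stdio.h>

	int main(void)
	{
		unsigned long present_pages = 262144;	/* zone size, assumption */
		int fractions[] = { 8, 16, 64, 256 };
		int i;

		for (i = 0; i < 4; i++) {
			unsigned long high = present_pages / fractions[i];

			/* e.g. fraction 8 lets each CPU cache up to 1/8 of the zone */
			printf("fraction %3d -> pcp high %6lu pages\n",
			       fractions[i], high);
		}
		return 0;
	}

A smaller fraction therefore means larger per-cpu lists (fewer trips to the buddy allocator, more memory parked per CPU), while a larger fraction keeps the per-cpu caches small.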
2525 | __initdata int hashdist = HASHDIST_DEFAULT; | 2637 | __initdata int hashdist = HASHDIST_DEFAULT; |
2526 | 2638 | ||
2527 | #ifdef CONFIG_NUMA | 2639 | #ifdef CONFIG_NUMA |