diff options
Diffstat (limited to 'mm/page_alloc.c')
| -rw-r--r-- | mm/page_alloc.c | 472 | 
1 files changed, 278 insertions, 194 deletions
| diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe14a8c87fc2..8c960b469593 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -36,6 +36,7 @@ | |||
| 36 | #include <linux/memory_hotplug.h> | 36 | #include <linux/memory_hotplug.h> | 
| 37 | #include <linux/nodemask.h> | 37 | #include <linux/nodemask.h> | 
| 38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> | 
| 39 | #include <linux/mempolicy.h> | ||
| 39 | 40 | ||
| 40 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> | 
| 41 | #include "internal.h" | 42 | #include "internal.h" | 
| @@ -52,6 +53,9 @@ struct pglist_data *pgdat_list __read_mostly; | |||
| 52 | unsigned long totalram_pages __read_mostly; | 53 | unsigned long totalram_pages __read_mostly; | 
| 53 | unsigned long totalhigh_pages __read_mostly; | 54 | unsigned long totalhigh_pages __read_mostly; | 
| 54 | long nr_swap_pages; | 55 | long nr_swap_pages; | 
| 56 | int percpu_pagelist_fraction; | ||
| 57 | |||
| 58 | static void fastcall free_hot_cold_page(struct page *page, int cold); | ||
| 55 | 59 | ||
| 56 | /* | 60 | /* | 
| 57 | * results with 256, 32 in the lowmem_reserve sysctl: | 61 | * results with 256, 32 in the lowmem_reserve sysctl: | 
| @@ -81,6 +85,7 @@ int min_free_kbytes = 1024; | |||
| 81 | unsigned long __initdata nr_kernel_pages; | 85 | unsigned long __initdata nr_kernel_pages; | 
| 82 | unsigned long __initdata nr_all_pages; | 86 | unsigned long __initdata nr_all_pages; | 
| 83 | 87 | ||
| 88 | #ifdef CONFIG_DEBUG_VM | ||
| 84 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 89 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 
| 85 | { | 90 | { | 
| 86 | int ret = 0; | 91 | int ret = 0; | 
| @@ -122,16 +127,23 @@ static int bad_range(struct zone *zone, struct page *page) | |||
| 122 | return 0; | 127 | return 0; | 
| 123 | } | 128 | } | 
| 124 | 129 | ||
| 125 | static void bad_page(const char *function, struct page *page) | 130 | #else | 
| 131 | static inline int bad_range(struct zone *zone, struct page *page) | ||
| 126 | { | 132 | { | 
| 127 | printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", | 133 | return 0; | 
| 128 | function, current->comm, page); | 134 | } | 
| 129 | printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 135 | #endif | 
| 130 | (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, | 136 | |
| 131 | page->mapping, page_mapcount(page), page_count(page)); | 137 | static void bad_page(struct page *page) | 
| 132 | printk(KERN_EMERG "Backtrace:\n"); | 138 | { | 
| 139 | printk(KERN_EMERG "Bad page state in process '%s'\n" | ||
| 140 | KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" | ||
| 141 | KERN_EMERG "Trying to fix it up, but a reboot is needed\n" | ||
| 142 | KERN_EMERG "Backtrace:\n", | ||
| 143 | current->comm, page, (int)(2*sizeof(unsigned long)), | ||
| 144 | (unsigned long)page->flags, page->mapping, | ||
| 145 | page_mapcount(page), page_count(page)); | ||
| 133 | dump_stack(); | 146 | dump_stack(); | 
| 134 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); | ||
| 135 | page->flags &= ~(1 << PG_lru | | 147 | page->flags &= ~(1 << PG_lru | | 
| 136 | 1 << PG_private | | 148 | 1 << PG_private | | 
| 137 | 1 << PG_locked | | 149 | 1 << PG_locked | | 
| @@ -184,19 +196,15 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
| 184 | int i; | 196 | int i; | 
| 185 | int nr_pages = 1 << order; | 197 | int nr_pages = 1 << order; | 
| 186 | 198 | ||
| 187 | if (!PageCompound(page)) | 199 | if (unlikely(page[1].index != order)) | 
| 188 | return; | 200 | bad_page(page); | 
| 189 | |||
| 190 | if (page[1].index != order) | ||
| 191 | bad_page(__FUNCTION__, page); | ||
| 192 | 201 | ||
| 193 | for (i = 0; i < nr_pages; i++) { | 202 | for (i = 0; i < nr_pages; i++) { | 
| 194 | struct page *p = page + i; | 203 | struct page *p = page + i; | 
| 195 | 204 | ||
| 196 | if (!PageCompound(p)) | 205 | if (unlikely(!PageCompound(p) | | 
| 197 | bad_page(__FUNCTION__, page); | 206 | (page_private(p) != (unsigned long)page))) | 
| 198 | if (page_private(p) != (unsigned long)page) | 207 | bad_page(page); | 
| 199 | bad_page(__FUNCTION__, page); | ||
| 200 | ClearPageCompound(p); | 208 | ClearPageCompound(p); | 
| 201 | } | 209 | } | 
| 202 | } | 210 | } | 
| @@ -255,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
| 255 | /* | 263 | /* | 
| 256 | * This function checks whether a page is free && is the buddy | 264 | * This function checks whether a page is free && is the buddy | 
| 257 | * we can do coalesce a page and its buddy if | 265 | * we can do coalesce a page and its buddy if | 
| 258 | * (a) the buddy is free && | 266 | * (a) the buddy is not in a hole && | 
| 259 | * (b) the buddy is on the buddy system && | 267 | * (b) the buddy is free && | 
| 260 | * (c) a page and its buddy have the same order. | 268 | * (c) the buddy is on the buddy system && | 
| 269 | * (d) a page and its buddy have the same order. | ||
| 261 | * for recording page's order, we use page_private(page) and PG_private. | 270 | * for recording page's order, we use page_private(page) and PG_private. | 
| 262 | * | 271 | * | 
| 263 | */ | 272 | */ | 
| 264 | static inline int page_is_buddy(struct page *page, int order) | 273 | static inline int page_is_buddy(struct page *page, int order) | 
| 265 | { | 274 | { | 
| 275 | #ifdef CONFIG_HOLES_IN_ZONE | ||
| 276 | if (!pfn_valid(page_to_pfn(page))) | ||
| 277 | return 0; | ||
| 278 | #endif | ||
| 279 | |||
| 266 | if (PagePrivate(page) && | 280 | if (PagePrivate(page) && | 
| 267 | (page_order(page) == order) && | 281 | (page_order(page) == order) && | 
| 268 | page_count(page) == 0) | 282 | page_count(page) == 0) | 
| @@ -294,13 +308,13 @@ static inline int page_is_buddy(struct page *page, int order) | |||
| 294 | * -- wli | 308 | * -- wli | 
| 295 | */ | 309 | */ | 
| 296 | 310 | ||
| 297 | static inline void __free_pages_bulk (struct page *page, | 311 | static inline void __free_one_page(struct page *page, | 
| 298 | struct zone *zone, unsigned int order) | 312 | struct zone *zone, unsigned int order) | 
| 299 | { | 313 | { | 
| 300 | unsigned long page_idx; | 314 | unsigned long page_idx; | 
| 301 | int order_size = 1 << order; | 315 | int order_size = 1 << order; | 
| 302 | 316 | ||
| 303 | if (unlikely(order)) | 317 | if (unlikely(PageCompound(page))) | 
| 304 | destroy_compound_page(page, order); | 318 | destroy_compound_page(page, order); | 
| 305 | 319 | ||
| 306 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 320 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 
| @@ -314,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page, | |||
| 314 | struct free_area *area; | 328 | struct free_area *area; | 
| 315 | struct page *buddy; | 329 | struct page *buddy; | 
| 316 | 330 | ||
| 317 | combined_idx = __find_combined_index(page_idx, order); | ||
| 318 | buddy = __page_find_buddy(page, page_idx, order); | 331 | buddy = __page_find_buddy(page, page_idx, order); | 
| 319 | |||
| 320 | if (bad_range(zone, buddy)) | ||
| 321 | break; | ||
| 322 | if (!page_is_buddy(buddy, order)) | 332 | if (!page_is_buddy(buddy, order)) | 
| 323 | break; /* Move the buddy up one level. */ | 333 | break; /* Move the buddy up one level. */ | 
| 334 | |||
| 324 | list_del(&buddy->lru); | 335 | list_del(&buddy->lru); | 
| 325 | area = zone->free_area + order; | 336 | area = zone->free_area + order; | 
| 326 | area->nr_free--; | 337 | area->nr_free--; | 
| 327 | rmv_page_order(buddy); | 338 | rmv_page_order(buddy); | 
| 339 | combined_idx = __find_combined_index(page_idx, order); | ||
| 328 | page = page + (combined_idx - page_idx); | 340 | page = page + (combined_idx - page_idx); | 
| 329 | page_idx = combined_idx; | 341 | page_idx = combined_idx; | 
| 330 | order++; | 342 | order++; | 
| @@ -334,11 +346,11 @@ static inline void __free_pages_bulk (struct page *page, | |||
| 334 | zone->free_area[order].nr_free++; | 346 | zone->free_area[order].nr_free++; | 
| 335 | } | 347 | } | 
| 336 | 348 | ||
| 337 | static inline int free_pages_check(const char *function, struct page *page) | 349 | static inline int free_pages_check(struct page *page) | 
| 338 | { | 350 | { | 
| 339 | if ( page_mapcount(page) || | 351 | if (unlikely(page_mapcount(page) | | 
| 340 | page->mapping != NULL || | 352 | (page->mapping != NULL) | | 
| 341 | page_count(page) != 0 || | 353 | (page_count(page) != 0) | | 
| 342 | (page->flags & ( | 354 | (page->flags & ( | 
| 343 | 1 << PG_lru | | 355 | 1 << PG_lru | | 
| 344 | 1 << PG_private | | 356 | 1 << PG_private | | 
| @@ -348,8 +360,8 @@ static inline int free_pages_check(const char *function, struct page *page) | |||
| 348 | 1 << PG_slab | | 360 | 1 << PG_slab | | 
| 349 | 1 << PG_swapcache | | 361 | 1 << PG_swapcache | | 
| 350 | 1 << PG_writeback | | 362 | 1 << PG_writeback | | 
| 351 | 1 << PG_reserved ))) | 363 | 1 << PG_reserved )))) | 
| 352 | bad_page(function, page); | 364 | bad_page(page); | 
| 353 | if (PageDirty(page)) | 365 | if (PageDirty(page)) | 
| 354 | __ClearPageDirty(page); | 366 | __ClearPageDirty(page); | 
| 355 | /* | 367 | /* | 
| @@ -371,51 +383,90 @@ static inline int free_pages_check(const char *function, struct page *page) | |||
| 371 | * And clear the zone's pages_scanned counter, to hold off the "all pages are | 383 | * And clear the zone's pages_scanned counter, to hold off the "all pages are | 
| 372 | * pinned" detection logic. | 384 | * pinned" detection logic. | 
| 373 | */ | 385 | */ | 
| 374 | static int | 386 | static void free_pages_bulk(struct zone *zone, int count, | 
| 375 | free_pages_bulk(struct zone *zone, int count, | 387 | struct list_head *list, int order) | 
| 376 | struct list_head *list, unsigned int order) | ||
| 377 | { | 388 | { | 
| 378 | unsigned long flags; | 389 | spin_lock(&zone->lock); | 
| 379 | struct page *page = NULL; | ||
| 380 | int ret = 0; | ||
| 381 | |||
| 382 | spin_lock_irqsave(&zone->lock, flags); | ||
| 383 | zone->all_unreclaimable = 0; | 390 | zone->all_unreclaimable = 0; | 
| 384 | zone->pages_scanned = 0; | 391 | zone->pages_scanned = 0; | 
| 385 | while (!list_empty(list) && count--) { | 392 | while (count--) { | 
| 393 | struct page *page; | ||
| 394 | |||
| 395 | BUG_ON(list_empty(list)); | ||
| 386 | page = list_entry(list->prev, struct page, lru); | 396 | page = list_entry(list->prev, struct page, lru); | 
| 387 | /* have to delete it as __free_pages_bulk list manipulates */ | 397 | /* have to delete it as __free_one_page list manipulates */ | 
| 388 | list_del(&page->lru); | 398 | list_del(&page->lru); | 
| 389 | __free_pages_bulk(page, zone, order); | 399 | __free_one_page(page, zone, order); | 
| 390 | ret++; | ||
| 391 | } | 400 | } | 
| 392 | spin_unlock_irqrestore(&zone->lock, flags); | 401 | spin_unlock(&zone->lock); | 
| 393 | return ret; | ||
| 394 | } | 402 | } | 
| 395 | 403 | ||
| 396 | void __free_pages_ok(struct page *page, unsigned int order) | 404 | static void free_one_page(struct zone *zone, struct page *page, int order) | 
| 397 | { | 405 | { | 
| 398 | LIST_HEAD(list); | 406 | LIST_HEAD(list); | 
| 407 | list_add(&page->lru, &list); | ||
| 408 | free_pages_bulk(zone, 1, &list, order); | ||
| 409 | } | ||
| 410 | |||
| 411 | static void __free_pages_ok(struct page *page, unsigned int order) | ||
| 412 | { | ||
| 413 | unsigned long flags; | ||
| 399 | int i; | 414 | int i; | 
| 400 | int reserved = 0; | 415 | int reserved = 0; | 
| 401 | 416 | ||
| 402 | arch_free_page(page, order); | 417 | arch_free_page(page, order); | 
| 418 | if (!PageHighMem(page)) | ||
| 419 | mutex_debug_check_no_locks_freed(page_address(page), | ||
| 420 | PAGE_SIZE<<order); | ||
| 403 | 421 | ||
| 404 | #ifndef CONFIG_MMU | 422 | #ifndef CONFIG_MMU | 
| 405 | if (order > 0) | 423 | for (i = 1 ; i < (1 << order) ; ++i) | 
| 406 | for (i = 1 ; i < (1 << order) ; ++i) | 424 | __put_page(page + i); | 
| 407 | __put_page(page + i); | ||
| 408 | #endif | 425 | #endif | 
| 409 | 426 | ||
| 410 | for (i = 0 ; i < (1 << order) ; ++i) | 427 | for (i = 0 ; i < (1 << order) ; ++i) | 
| 411 | reserved += free_pages_check(__FUNCTION__, page + i); | 428 | reserved += free_pages_check(page + i); | 
| 412 | if (reserved) | 429 | if (reserved) | 
| 413 | return; | 430 | return; | 
| 414 | 431 | ||
| 415 | list_add(&page->lru, &list); | 432 | kernel_map_pages(page, 1 << order, 0); | 
| 416 | mod_page_state(pgfree, 1 << order); | 433 | local_irq_save(flags); | 
| 417 | kernel_map_pages(page, 1<<order, 0); | 434 | __mod_page_state(pgfree, 1 << order); | 
| 418 | free_pages_bulk(page_zone(page), 1, &list, order); | 435 | free_one_page(page_zone(page), page, order); | 
| 436 | local_irq_restore(flags); | ||
| 437 | } | ||
| 438 | |||
| 439 | /* | ||
| 440 | * permit the bootmem allocator to evade page validation on high-order frees | ||
| 441 | */ | ||
| 442 | void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | ||
| 443 | { | ||
| 444 | if (order == 0) { | ||
| 445 | __ClearPageReserved(page); | ||
| 446 | set_page_count(page, 0); | ||
| 447 | |||
| 448 | free_hot_cold_page(page, 0); | ||
| 449 | } else { | ||
| 450 | LIST_HEAD(list); | ||
| 451 | int loop; | ||
| 452 | |||
| 453 | for (loop = 0; loop < BITS_PER_LONG; loop++) { | ||
| 454 | struct page *p = &page[loop]; | ||
| 455 | |||
| 456 | if (loop + 16 < BITS_PER_LONG) | ||
| 457 | prefetchw(p + 16); | ||
| 458 | __ClearPageReserved(p); | ||
| 459 | set_page_count(p, 0); | ||
| 460 | } | ||
| 461 | |||
| 462 | arch_free_page(page, order); | ||
| 463 | |||
| 464 | mod_page_state(pgfree, 1 << order); | ||
| 465 | |||
| 466 | list_add(&page->lru, &list); | ||
| 467 | kernel_map_pages(page, 1 << order, 0); | ||
| 468 | free_pages_bulk(page_zone(page), 1, &list, order); | ||
| 469 | } | ||
| 419 | } | 470 | } | 
| 420 | 471 | ||
| 421 | 472 | ||
| @@ -433,8 +484,7 @@ void __free_pages_ok(struct page *page, unsigned int order) | |||
| 433 | * | 484 | * | 
| 434 | * -- wli | 485 | * -- wli | 
| 435 | */ | 486 | */ | 
| 436 | static inline struct page * | 487 | static inline void expand(struct zone *zone, struct page *page, | 
| 437 | expand(struct zone *zone, struct page *page, | ||
| 438 | int low, int high, struct free_area *area) | 488 | int low, int high, struct free_area *area) | 
| 439 | { | 489 | { | 
| 440 | unsigned long size = 1 << high; | 490 | unsigned long size = 1 << high; | 
| @@ -448,24 +498,6 @@ expand(struct zone *zone, struct page *page, | |||
| 448 | area->nr_free++; | 498 | area->nr_free++; | 
| 449 | set_page_order(&page[size], high); | 499 | set_page_order(&page[size], high); | 
| 450 | } | 500 | } | 
| 451 | return page; | ||
| 452 | } | ||
| 453 | |||
| 454 | void set_page_refs(struct page *page, int order) | ||
| 455 | { | ||
| 456 | #ifdef CONFIG_MMU | ||
| 457 | set_page_count(page, 1); | ||
| 458 | #else | ||
| 459 | int i; | ||
| 460 | |||
| 461 | /* | ||
| 462 | * We need to reference all the pages for this order, otherwise if | ||
| 463 | * anyone accesses one of the pages with (get/put) it will be freed. | ||
| 464 | * - eg: access_process_vm() | ||
| 465 | */ | ||
| 466 | for (i = 0; i < (1 << order); i++) | ||
| 467 | set_page_count(page + i, 1); | ||
| 468 | #endif /* CONFIG_MMU */ | ||
| 469 | } | 501 | } | 
| 470 | 502 | ||
| 471 | /* | 503 | /* | 
| @@ -473,9 +505,9 @@ void set_page_refs(struct page *page, int order) | |||
| 473 | */ | 505 | */ | 
| 474 | static int prep_new_page(struct page *page, int order) | 506 | static int prep_new_page(struct page *page, int order) | 
| 475 | { | 507 | { | 
| 476 | if ( page_mapcount(page) || | 508 | if (unlikely(page_mapcount(page) | | 
| 477 | page->mapping != NULL || | 509 | (page->mapping != NULL) | | 
| 478 | page_count(page) != 0 || | 510 | (page_count(page) != 0) | | 
| 479 | (page->flags & ( | 511 | (page->flags & ( | 
| 480 | 1 << PG_lru | | 512 | 1 << PG_lru | | 
| 481 | 1 << PG_private | | 513 | 1 << PG_private | | 
| @@ -486,8 +518,8 @@ static int prep_new_page(struct page *page, int order) | |||
| 486 | 1 << PG_slab | | 518 | 1 << PG_slab | | 
| 487 | 1 << PG_swapcache | | 519 | 1 << PG_swapcache | | 
| 488 | 1 << PG_writeback | | 520 | 1 << PG_writeback | | 
| 489 | 1 << PG_reserved ))) | 521 | 1 << PG_reserved )))) | 
| 490 | bad_page(__FUNCTION__, page); | 522 | bad_page(page); | 
| 491 | 523 | ||
| 492 | /* | 524 | /* | 
| 493 | * For now, we report if PG_reserved was found set, but do not | 525 | * For now, we report if PG_reserved was found set, but do not | 
| @@ -525,7 +557,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) | |||
| 525 | rmv_page_order(page); | 557 | rmv_page_order(page); | 
| 526 | area->nr_free--; | 558 | area->nr_free--; | 
| 527 | zone->free_pages -= 1UL << order; | 559 | zone->free_pages -= 1UL << order; | 
| 528 | return expand(zone, page, order, current_order, area); | 560 | expand(zone, page, order, current_order, area); | 
| 561 | return page; | ||
| 529 | } | 562 | } | 
| 530 | 563 | ||
| 531 | return NULL; | 564 | return NULL; | 
| @@ -539,21 +572,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) | |||
| 539 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 572 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 
| 540 | unsigned long count, struct list_head *list) | 573 | unsigned long count, struct list_head *list) | 
| 541 | { | 574 | { | 
| 542 | unsigned long flags; | ||
| 543 | int i; | 575 | int i; | 
| 544 | int allocated = 0; | ||
| 545 | struct page *page; | ||
| 546 | 576 | ||
| 547 | spin_lock_irqsave(&zone->lock, flags); | 577 | spin_lock(&zone->lock); | 
| 548 | for (i = 0; i < count; ++i) { | 578 | for (i = 0; i < count; ++i) { | 
| 549 | page = __rmqueue(zone, order); | 579 | struct page *page = __rmqueue(zone, order); | 
| 550 | if (page == NULL) | 580 | if (unlikely(page == NULL)) | 
| 551 | break; | 581 | break; | 
| 552 | allocated++; | ||
| 553 | list_add_tail(&page->lru, list); | 582 | list_add_tail(&page->lru, list); | 
| 554 | } | 583 | } | 
| 555 | spin_unlock_irqrestore(&zone->lock, flags); | 584 | spin_unlock(&zone->lock); | 
| 556 | return allocated; | 585 | return i; | 
| 557 | } | 586 | } | 
| 558 | 587 | ||
| 559 | #ifdef CONFIG_NUMA | 588 | #ifdef CONFIG_NUMA | 
| @@ -572,14 +601,13 @@ void drain_remote_pages(void) | |||
| 572 | if (zone->zone_pgdat->node_id == numa_node_id()) | 601 | if (zone->zone_pgdat->node_id == numa_node_id()) | 
| 573 | continue; | 602 | continue; | 
| 574 | 603 | ||
| 575 | pset = zone->pageset[smp_processor_id()]; | 604 | pset = zone_pcp(zone, smp_processor_id()); | 
| 576 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 605 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 
| 577 | struct per_cpu_pages *pcp; | 606 | struct per_cpu_pages *pcp; | 
| 578 | 607 | ||
| 579 | pcp = &pset->pcp[i]; | 608 | pcp = &pset->pcp[i]; | 
| 580 | if (pcp->count) | 609 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 
| 581 | pcp->count -= free_pages_bulk(zone, pcp->count, | 610 | pcp->count = 0; | 
| 582 | &pcp->list, 0); | ||
| 583 | } | 611 | } | 
| 584 | } | 612 | } | 
| 585 | local_irq_restore(flags); | 613 | local_irq_restore(flags); | 
| @@ -589,6 +617,7 @@ void drain_remote_pages(void) | |||
| 589 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | 617 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | 
| 590 | static void __drain_pages(unsigned int cpu) | 618 | static void __drain_pages(unsigned int cpu) | 
| 591 | { | 619 | { | 
| 620 | unsigned long flags; | ||
| 592 | struct zone *zone; | 621 | struct zone *zone; | 
| 593 | int i; | 622 | int i; | 
| 594 | 623 | ||
| @@ -600,8 +629,10 @@ static void __drain_pages(unsigned int cpu) | |||
| 600 | struct per_cpu_pages *pcp; | 629 | struct per_cpu_pages *pcp; | 
| 601 | 630 | ||
| 602 | pcp = &pset->pcp[i]; | 631 | pcp = &pset->pcp[i]; | 
| 603 | pcp->count -= free_pages_bulk(zone, pcp->count, | 632 | local_irq_save(flags); | 
| 604 | &pcp->list, 0); | 633 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 
| 634 | pcp->count = 0; | ||
| 635 | local_irq_restore(flags); | ||
| 605 | } | 636 | } | 
| 606 | } | 637 | } | 
| 607 | } | 638 | } | 
| @@ -647,18 +678,14 @@ void drain_local_pages(void) | |||
| 647 | } | 678 | } | 
| 648 | #endif /* CONFIG_PM */ | 679 | #endif /* CONFIG_PM */ | 
| 649 | 680 | ||
| 650 | static void zone_statistics(struct zonelist *zonelist, struct zone *z) | 681 | static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) | 
| 651 | { | 682 | { | 
| 652 | #ifdef CONFIG_NUMA | 683 | #ifdef CONFIG_NUMA | 
| 653 | unsigned long flags; | ||
| 654 | int cpu; | ||
| 655 | pg_data_t *pg = z->zone_pgdat; | 684 | pg_data_t *pg = z->zone_pgdat; | 
| 656 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; | 685 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; | 
| 657 | struct per_cpu_pageset *p; | 686 | struct per_cpu_pageset *p; | 
| 658 | 687 | ||
| 659 | local_irq_save(flags); | 688 | p = zone_pcp(z, cpu); | 
| 660 | cpu = smp_processor_id(); | ||
| 661 | p = zone_pcp(z,cpu); | ||
| 662 | if (pg == orig) { | 689 | if (pg == orig) { | 
| 663 | p->numa_hit++; | 690 | p->numa_hit++; | 
| 664 | } else { | 691 | } else { | 
| @@ -669,14 +696,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) | |||
| 669 | p->local_node++; | 696 | p->local_node++; | 
| 670 | else | 697 | else | 
| 671 | p->other_node++; | 698 | p->other_node++; | 
| 672 | local_irq_restore(flags); | ||
| 673 | #endif | 699 | #endif | 
| 674 | } | 700 | } | 
| 675 | 701 | ||
| 676 | /* | 702 | /* | 
| 677 | * Free a 0-order page | 703 | * Free a 0-order page | 
| 678 | */ | 704 | */ | 
| 679 | static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); | ||
| 680 | static void fastcall free_hot_cold_page(struct page *page, int cold) | 705 | static void fastcall free_hot_cold_page(struct page *page, int cold) | 
| 681 | { | 706 | { | 
| 682 | struct zone *zone = page_zone(page); | 707 | struct zone *zone = page_zone(page); | 
| @@ -687,18 +712,20 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
| 687 | 712 | ||
| 688 | if (PageAnon(page)) | 713 | if (PageAnon(page)) | 
| 689 | page->mapping = NULL; | 714 | page->mapping = NULL; | 
| 690 | if (free_pages_check(__FUNCTION__, page)) | 715 | if (free_pages_check(page)) | 
| 691 | return; | 716 | return; | 
| 692 | 717 | ||
| 693 | inc_page_state(pgfree); | ||
| 694 | kernel_map_pages(page, 1, 0); | 718 | kernel_map_pages(page, 1, 0); | 
| 695 | 719 | ||
| 696 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 720 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 
| 697 | local_irq_save(flags); | 721 | local_irq_save(flags); | 
| 722 | __inc_page_state(pgfree); | ||
| 698 | list_add(&page->lru, &pcp->list); | 723 | list_add(&page->lru, &pcp->list); | 
| 699 | pcp->count++; | 724 | pcp->count++; | 
| 700 | if (pcp->count >= pcp->high) | 725 | if (pcp->count >= pcp->high) { | 
| 701 | pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 726 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 
| 727 | pcp->count -= pcp->batch; | ||
| 728 | } | ||
| 702 | local_irq_restore(flags); | 729 | local_irq_restore(flags); | 
| 703 | put_cpu(); | 730 | put_cpu(); | 
| 704 | } | 731 | } | 
| @@ -727,49 +754,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 727 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 754 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 
| 728 | * or two. | 755 | * or two. | 
| 729 | */ | 756 | */ | 
| 730 | static struct page * | 757 | static struct page *buffered_rmqueue(struct zonelist *zonelist, | 
| 731 | buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) | 758 | struct zone *zone, int order, gfp_t gfp_flags) | 
| 732 | { | 759 | { | 
| 733 | unsigned long flags; | 760 | unsigned long flags; | 
| 734 | struct page *page; | 761 | struct page *page; | 
| 735 | int cold = !!(gfp_flags & __GFP_COLD); | 762 | int cold = !!(gfp_flags & __GFP_COLD); | 
| 763 | int cpu; | ||
| 736 | 764 | ||
| 737 | again: | 765 | again: | 
| 738 | if (order == 0) { | 766 | cpu = get_cpu(); | 
| 767 | if (likely(order == 0)) { | ||
| 739 | struct per_cpu_pages *pcp; | 768 | struct per_cpu_pages *pcp; | 
| 740 | 769 | ||
| 741 | page = NULL; | 770 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; | 
| 742 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | ||
| 743 | local_irq_save(flags); | 771 | local_irq_save(flags); | 
| 744 | if (pcp->count <= pcp->low) | 772 | if (!pcp->count) { | 
| 745 | pcp->count += rmqueue_bulk(zone, 0, | 773 | pcp->count += rmqueue_bulk(zone, 0, | 
| 746 | pcp->batch, &pcp->list); | 774 | pcp->batch, &pcp->list); | 
| 747 | if (pcp->count) { | 775 | if (unlikely(!pcp->count)) | 
| 748 | page = list_entry(pcp->list.next, struct page, lru); | 776 | goto failed; | 
| 749 | list_del(&page->lru); | ||
| 750 | pcp->count--; | ||
| 751 | } | 777 | } | 
| 752 | local_irq_restore(flags); | 778 | page = list_entry(pcp->list.next, struct page, lru); | 
| 753 | put_cpu(); | 779 | list_del(&page->lru); | 
| 780 | pcp->count--; | ||
| 754 | } else { | 781 | } else { | 
| 755 | spin_lock_irqsave(&zone->lock, flags); | 782 | spin_lock_irqsave(&zone->lock, flags); | 
| 756 | page = __rmqueue(zone, order); | 783 | page = __rmqueue(zone, order); | 
| 757 | spin_unlock_irqrestore(&zone->lock, flags); | 784 | spin_unlock(&zone->lock); | 
| 785 | if (!page) | ||
| 786 | goto failed; | ||
| 758 | } | 787 | } | 
| 759 | 788 | ||
| 760 | if (page != NULL) { | 789 | __mod_page_state_zone(zone, pgalloc, 1 << order); | 
| 761 | BUG_ON(bad_range(zone, page)); | 790 | zone_statistics(zonelist, zone, cpu); | 
| 762 | mod_page_state_zone(zone, pgalloc, 1 << order); | 791 | local_irq_restore(flags); | 
| 763 | if (prep_new_page(page, order)) | 792 | put_cpu(); | 
| 764 | goto again; | ||
| 765 | 793 | ||
| 766 | if (gfp_flags & __GFP_ZERO) | 794 | BUG_ON(bad_range(zone, page)); | 
| 767 | prep_zero_page(page, order, gfp_flags); | 795 | if (prep_new_page(page, order)) | 
| 796 | goto again; | ||
| 768 | 797 | ||
| 769 | if (order && (gfp_flags & __GFP_COMP)) | 798 | if (gfp_flags & __GFP_ZERO) | 
| 770 | prep_compound_page(page, order); | 799 | prep_zero_page(page, order, gfp_flags); | 
| 771 | } | 800 | |
| 801 | if (order && (gfp_flags & __GFP_COMP)) | ||
| 802 | prep_compound_page(page, order); | ||
| 772 | return page; | 803 | return page; | 
| 804 | |||
| 805 | failed: | ||
| 806 | local_irq_restore(flags); | ||
| 807 | put_cpu(); | ||
| 808 | return NULL; | ||
| 773 | } | 809 | } | 
| 774 | 810 | ||
| 775 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | 811 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | 
| @@ -845,9 +881,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
| 845 | continue; | 881 | continue; | 
| 846 | } | 882 | } | 
| 847 | 883 | ||
| 848 | page = buffered_rmqueue(*z, order, gfp_mask); | 884 | page = buffered_rmqueue(zonelist, *z, order, gfp_mask); | 
| 849 | if (page) { | 885 | if (page) { | 
| 850 | zone_statistics(zonelist, *z); | ||
| 851 | break; | 886 | break; | 
| 852 | } | 887 | } | 
| 853 | } while (*(++z) != NULL); | 888 | } while (*(++z) != NULL); | 
| @@ -896,15 +931,15 @@ restart: | |||
| 896 | * | 931 | * | 
| 897 | * The caller may dip into page reserves a bit more if the caller | 932 | * The caller may dip into page reserves a bit more if the caller | 
| 898 | * cannot run direct reclaim, or if the caller has realtime scheduling | 933 | * cannot run direct reclaim, or if the caller has realtime scheduling | 
| 899 | * policy. | 934 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | 
| 935 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
| 900 | */ | 936 | */ | 
| 901 | alloc_flags = ALLOC_WMARK_MIN; | 937 | alloc_flags = ALLOC_WMARK_MIN; | 
| 902 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) | 938 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) | 
| 903 | alloc_flags |= ALLOC_HARDER; | 939 | alloc_flags |= ALLOC_HARDER; | 
| 904 | if (gfp_mask & __GFP_HIGH) | 940 | if (gfp_mask & __GFP_HIGH) | 
| 905 | alloc_flags |= ALLOC_HIGH; | 941 | alloc_flags |= ALLOC_HIGH; | 
| 906 | if (wait) | 942 | alloc_flags |= ALLOC_CPUSET; | 
| 907 | alloc_flags |= ALLOC_CPUSET; | ||
| 908 | 943 | ||
| 909 | /* | 944 | /* | 
| 910 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 945 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 
| @@ -926,7 +961,7 @@ restart: | |||
| 926 | nofail_alloc: | 961 | nofail_alloc: | 
| 927 | /* go through the zonelist yet again, ignoring mins */ | 962 | /* go through the zonelist yet again, ignoring mins */ | 
| 928 | page = get_page_from_freelist(gfp_mask, order, | 963 | page = get_page_from_freelist(gfp_mask, order, | 
| 929 | zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); | 964 | zonelist, ALLOC_NO_WATERMARKS); | 
| 930 | if (page) | 965 | if (page) | 
| 931 | goto got_pg; | 966 | goto got_pg; | 
| 932 | if (gfp_mask & __GFP_NOFAIL) { | 967 | if (gfp_mask & __GFP_NOFAIL) { | 
| @@ -945,6 +980,7 @@ rebalance: | |||
| 945 | cond_resched(); | 980 | cond_resched(); | 
| 946 | 981 | ||
| 947 | /* We now go into synchronous reclaim */ | 982 | /* We now go into synchronous reclaim */ | 
| 983 | cpuset_memory_pressure_bump(); | ||
| 948 | p->flags |= PF_MEMALLOC; | 984 | p->flags |= PF_MEMALLOC; | 
| 949 | reclaim_state.reclaimed_slab = 0; | 985 | reclaim_state.reclaimed_slab = 0; | 
| 950 | p->reclaim_state = &reclaim_state; | 986 | p->reclaim_state = &reclaim_state; | 
| @@ -1171,7 +1207,7 @@ EXPORT_SYMBOL(nr_pagecache); | |||
| 1171 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; | 1207 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; | 
| 1172 | #endif | 1208 | #endif | 
| 1173 | 1209 | ||
| 1174 | void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | 1210 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | 
| 1175 | { | 1211 | { | 
| 1176 | int cpu = 0; | 1212 | int cpu = 0; | 
| 1177 | 1213 | ||
| @@ -1224,7 +1260,7 @@ void get_full_page_state(struct page_state *ret) | |||
| 1224 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); | 1260 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); | 
| 1225 | } | 1261 | } | 
| 1226 | 1262 | ||
| 1227 | unsigned long __read_page_state(unsigned long offset) | 1263 | unsigned long read_page_state_offset(unsigned long offset) | 
| 1228 | { | 1264 | { | 
| 1229 | unsigned long ret = 0; | 1265 | unsigned long ret = 0; | 
| 1230 | int cpu; | 1266 | int cpu; | 
| @@ -1238,18 +1274,26 @@ unsigned long __read_page_state(unsigned long offset) | |||
| 1238 | return ret; | 1274 | return ret; | 
| 1239 | } | 1275 | } | 
| 1240 | 1276 | ||
| 1241 | void __mod_page_state(unsigned long offset, unsigned long delta) | 1277 | void __mod_page_state_offset(unsigned long offset, unsigned long delta) | 
| 1278 | { | ||
| 1279 | void *ptr; | ||
| 1280 | |||
| 1281 | ptr = &__get_cpu_var(page_states); | ||
| 1282 | *(unsigned long *)(ptr + offset) += delta; | ||
| 1283 | } | ||
| 1284 | EXPORT_SYMBOL(__mod_page_state_offset); | ||
| 1285 | |||
| 1286 | void mod_page_state_offset(unsigned long offset, unsigned long delta) | ||
| 1242 | { | 1287 | { | 
| 1243 | unsigned long flags; | 1288 | unsigned long flags; | 
| 1244 | void* ptr; | 1289 | void *ptr; | 
| 1245 | 1290 | ||
| 1246 | local_irq_save(flags); | 1291 | local_irq_save(flags); | 
| 1247 | ptr = &__get_cpu_var(page_states); | 1292 | ptr = &__get_cpu_var(page_states); | 
| 1248 | *(unsigned long*)(ptr + offset) += delta; | 1293 | *(unsigned long *)(ptr + offset) += delta; | 
| 1249 | local_irq_restore(flags); | 1294 | local_irq_restore(flags); | 
| 1250 | } | 1295 | } | 
| 1251 | 1296 | EXPORT_SYMBOL(mod_page_state_offset); | |
| 1252 | EXPORT_SYMBOL(__mod_page_state); | ||
| 1253 | 1297 | ||
| 1254 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | 1298 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | 
| 1255 | unsigned long *free, struct pglist_data *pgdat) | 1299 | unsigned long *free, struct pglist_data *pgdat) | 
| @@ -1335,7 +1379,7 @@ void show_free_areas(void) | |||
| 1335 | show_node(zone); | 1379 | show_node(zone); | 
| 1336 | printk("%s per-cpu:", zone->name); | 1380 | printk("%s per-cpu:", zone->name); | 
| 1337 | 1381 | ||
| 1338 | if (!zone->present_pages) { | 1382 | if (!populated_zone(zone)) { | 
| 1339 | printk(" empty\n"); | 1383 | printk(" empty\n"); | 
| 1340 | continue; | 1384 | continue; | 
| 1341 | } else | 1385 | } else | 
| @@ -1347,10 +1391,9 @@ void show_free_areas(void) | |||
| 1347 | pageset = zone_pcp(zone, cpu); | 1391 | pageset = zone_pcp(zone, cpu); | 
| 1348 | 1392 | ||
| 1349 | for (temperature = 0; temperature < 2; temperature++) | 1393 | for (temperature = 0; temperature < 2; temperature++) | 
| 1350 | printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", | 1394 | printk("cpu %d %s: high %d, batch %d used:%d\n", | 
| 1351 | cpu, | 1395 | cpu, | 
| 1352 | temperature ? "cold" : "hot", | 1396 | temperature ? "cold" : "hot", | 
| 1353 | pageset->pcp[temperature].low, | ||
| 1354 | pageset->pcp[temperature].high, | 1397 | pageset->pcp[temperature].high, | 
| 1355 | pageset->pcp[temperature].batch, | 1398 | pageset->pcp[temperature].batch, | 
| 1356 | pageset->pcp[temperature].count); | 1399 | pageset->pcp[temperature].count); | 
| @@ -1413,7 +1456,7 @@ void show_free_areas(void) | |||
| 1413 | 1456 | ||
| 1414 | show_node(zone); | 1457 | show_node(zone); | 
| 1415 | printk("%s: ", zone->name); | 1458 | printk("%s: ", zone->name); | 
| 1416 | if (!zone->present_pages) { | 1459 | if (!populated_zone(zone)) { | 
| 1417 | printk("empty\n"); | 1460 | printk("empty\n"); | 
| 1418 | continue; | 1461 | continue; | 
| 1419 | } | 1462 | } | 
| @@ -1433,36 +1476,29 @@ void show_free_areas(void) | |||
| 1433 | 1476 | ||
| 1434 | /* | 1477 | /* | 
| 1435 | * Builds allocation fallback zone lists. | 1478 | * Builds allocation fallback zone lists. | 
| 1479 | * | ||
| 1480 | * Add all populated zones of a node to the zonelist. | ||
| 1436 | */ | 1481 | */ | 
| 1437 | static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) | 1482 | static int __init build_zonelists_node(pg_data_t *pgdat, | 
| 1438 | { | 1483 | struct zonelist *zonelist, int nr_zones, int zone_type) | 
| 1439 | switch (k) { | 1484 | { | 
| 1440 | struct zone *zone; | 1485 | struct zone *zone; | 
| 1441 | default: | 1486 | |
| 1442 | BUG(); | 1487 | BUG_ON(zone_type > ZONE_HIGHMEM); | 
| 1443 | case ZONE_HIGHMEM: | 1488 | |
| 1444 | zone = pgdat->node_zones + ZONE_HIGHMEM; | 1489 | do { | 
| 1445 | if (zone->present_pages) { | 1490 | zone = pgdat->node_zones + zone_type; | 
| 1491 | if (populated_zone(zone)) { | ||
| 1446 | #ifndef CONFIG_HIGHMEM | 1492 | #ifndef CONFIG_HIGHMEM | 
| 1447 | BUG(); | 1493 | BUG_ON(zone_type > ZONE_NORMAL); | 
| 1448 | #endif | 1494 | #endif | 
| 1449 | zonelist->zones[j++] = zone; | 1495 | zonelist->zones[nr_zones++] = zone; | 
| 1496 | check_highest_zone(zone_type); | ||
| 1450 | } | 1497 | } | 
| 1451 | case ZONE_NORMAL: | 1498 | zone_type--; | 
| 1452 | zone = pgdat->node_zones + ZONE_NORMAL; | ||
| 1453 | if (zone->present_pages) | ||
| 1454 | zonelist->zones[j++] = zone; | ||
| 1455 | case ZONE_DMA32: | ||
| 1456 | zone = pgdat->node_zones + ZONE_DMA32; | ||
| 1457 | if (zone->present_pages) | ||
| 1458 | zonelist->zones[j++] = zone; | ||
| 1459 | case ZONE_DMA: | ||
| 1460 | zone = pgdat->node_zones + ZONE_DMA; | ||
| 1461 | if (zone->present_pages) | ||
| 1462 | zonelist->zones[j++] = zone; | ||
| 1463 | } | ||
| 1464 | 1499 | ||
| 1465 | return j; | 1500 | } while (zone_type >= 0); | 
| 1501 | return nr_zones; | ||
| 1466 | } | 1502 | } | 
| 1467 | 1503 | ||
| 1468 | static inline int highest_zone(int zone_bits) | 1504 | static inline int highest_zone(int zone_bits) | 
| @@ -1706,11 +1742,9 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
| 1706 | unsigned long end_pfn = start_pfn + size; | 1742 | unsigned long end_pfn = start_pfn + size; | 
| 1707 | unsigned long pfn; | 1743 | unsigned long pfn; | 
| 1708 | 1744 | ||
| 1709 | for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { | 1745 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 
| 1710 | if (!early_pfn_valid(pfn)) | 1746 | if (!early_pfn_valid(pfn)) | 
| 1711 | continue; | 1747 | continue; | 
| 1712 | if (!early_pfn_in_nid(pfn, nid)) | ||
| 1713 | continue; | ||
| 1714 | page = pfn_to_page(pfn); | 1748 | page = pfn_to_page(pfn); | 
| 1715 | set_page_links(page, zone, nid, pfn); | 1749 | set_page_links(page, zone, nid, pfn); | 
| 1716 | set_page_count(page, 1); | 1750 | set_page_count(page, 1); | 
| @@ -1794,19 +1828,35 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
| 1794 | 1828 | ||
| 1795 | pcp = &p->pcp[0]; /* hot */ | 1829 | pcp = &p->pcp[0]; /* hot */ | 
| 1796 | pcp->count = 0; | 1830 | pcp->count = 0; | 
| 1797 | pcp->low = 0; | ||
| 1798 | pcp->high = 6 * batch; | 1831 | pcp->high = 6 * batch; | 
| 1799 | pcp->batch = max(1UL, 1 * batch); | 1832 | pcp->batch = max(1UL, 1 * batch); | 
| 1800 | INIT_LIST_HEAD(&pcp->list); | 1833 | INIT_LIST_HEAD(&pcp->list); | 
| 1801 | 1834 | ||
| 1802 | pcp = &p->pcp[1]; /* cold*/ | 1835 | pcp = &p->pcp[1]; /* cold*/ | 
| 1803 | pcp->count = 0; | 1836 | pcp->count = 0; | 
| 1804 | pcp->low = 0; | ||
| 1805 | pcp->high = 2 * batch; | 1837 | pcp->high = 2 * batch; | 
| 1806 | pcp->batch = max(1UL, batch/2); | 1838 | pcp->batch = max(1UL, batch/2); | 
| 1807 | INIT_LIST_HEAD(&pcp->list); | 1839 | INIT_LIST_HEAD(&pcp->list); | 
| 1808 | } | 1840 | } | 
| 1809 | 1841 | ||
| 1842 | /* | ||
| 1843 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist | ||
| 1844 | * to the value high for the pageset p. | ||
| 1845 | */ | ||
| 1846 | |||
| 1847 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, | ||
| 1848 | unsigned long high) | ||
| 1849 | { | ||
| 1850 | struct per_cpu_pages *pcp; | ||
| 1851 | |||
| 1852 | pcp = &p->pcp[0]; /* hot list */ | ||
| 1853 | pcp->high = high; | ||
| 1854 | pcp->batch = max(1UL, high/4); | ||
| 1855 | if ((high/4) > (PAGE_SHIFT * 8)) | ||
| 1856 | pcp->batch = PAGE_SHIFT * 8; | ||
| 1857 | } | ||
| 1858 | |||
| 1859 | |||
| 1810 | #ifdef CONFIG_NUMA | 1860 | #ifdef CONFIG_NUMA | 
| 1811 | /* | 1861 | /* | 
| 1812 | * Boot pageset table. One per cpu which is going to be used for all | 1862 | * Boot pageset table. One per cpu which is going to be used for all | 
| @@ -1838,12 +1888,16 @@ static int __devinit process_zones(int cpu) | |||
| 1838 | 1888 | ||
| 1839 | for_each_zone(zone) { | 1889 | for_each_zone(zone) { | 
| 1840 | 1890 | ||
| 1841 | zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), | 1891 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 
| 1842 | GFP_KERNEL, cpu_to_node(cpu)); | 1892 | GFP_KERNEL, cpu_to_node(cpu)); | 
| 1843 | if (!zone->pageset[cpu]) | 1893 | if (!zone_pcp(zone, cpu)) | 
| 1844 | goto bad; | 1894 | goto bad; | 
| 1845 | 1895 | ||
| 1846 | setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); | 1896 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); | 
| 1897 | |||
| 1898 | if (percpu_pagelist_fraction) | ||
| 1899 | setup_pagelist_highmark(zone_pcp(zone, cpu), | ||
| 1900 | (zone->present_pages / percpu_pagelist_fraction)); | ||
| 1847 | } | 1901 | } | 
| 1848 | 1902 | ||
| 1849 | return 0; | 1903 | return 0; | 
| @@ -1851,15 +1905,14 @@ bad: | |||
| 1851 | for_each_zone(dzone) { | 1905 | for_each_zone(dzone) { | 
| 1852 | if (dzone == zone) | 1906 | if (dzone == zone) | 
| 1853 | break; | 1907 | break; | 
| 1854 | kfree(dzone->pageset[cpu]); | 1908 | kfree(zone_pcp(dzone, cpu)); | 
| 1855 | dzone->pageset[cpu] = NULL; | 1909 | zone_pcp(dzone, cpu) = NULL; | 
| 1856 | } | 1910 | } | 
| 1857 | return -ENOMEM; | 1911 | return -ENOMEM; | 
| 1858 | } | 1912 | } | 
| 1859 | 1913 | ||
| 1860 | static inline void free_zone_pagesets(int cpu) | 1914 | static inline void free_zone_pagesets(int cpu) | 
| 1861 | { | 1915 | { | 
| 1862 | #ifdef CONFIG_NUMA | ||
| 1863 | struct zone *zone; | 1916 | struct zone *zone; | 
| 1864 | 1917 | ||
| 1865 | for_each_zone(zone) { | 1918 | for_each_zone(zone) { | 
| @@ -1868,7 +1921,6 @@ static inline void free_zone_pagesets(int cpu) | |||
| 1868 | zone_pcp(zone, cpu) = NULL; | 1921 | zone_pcp(zone, cpu) = NULL; | 
| 1869 | kfree(pset); | 1922 | kfree(pset); | 
| 1870 | } | 1923 | } | 
| 1871 | #endif | ||
| 1872 | } | 1924 | } | 
| 1873 | 1925 | ||
| 1874 | static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, | 1926 | static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, | 
| @@ -1939,7 +1991,7 @@ static __devinit void zone_pcp_init(struct zone *zone) | |||
| 1939 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 1991 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 
| 1940 | #ifdef CONFIG_NUMA | 1992 | #ifdef CONFIG_NUMA | 
| 1941 | /* Early boot. Slab allocator not functional yet */ | 1993 | /* Early boot. Slab allocator not functional yet */ | 
| 1942 | zone->pageset[cpu] = &boot_pageset[cpu]; | 1994 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | 
| 1943 | setup_pageset(&boot_pageset[cpu],0); | 1995 | setup_pageset(&boot_pageset[cpu],0); | 
| 1944 | #else | 1996 | #else | 
| 1945 | setup_pageset(zone_pcp(zone,cpu), batch); | 1997 | setup_pageset(zone_pcp(zone,cpu), batch); | 
| @@ -2116,7 +2168,7 @@ static int frag_show(struct seq_file *m, void *arg) | |||
| 2116 | int order; | 2168 | int order; | 
| 2117 | 2169 | ||
| 2118 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 2170 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 
| 2119 | if (!zone->present_pages) | 2171 | if (!populated_zone(zone)) | 
| 2120 | continue; | 2172 | continue; | 
| 2121 | 2173 | ||
| 2122 | spin_lock_irqsave(&zone->lock, flags); | 2174 | spin_lock_irqsave(&zone->lock, flags); | 
| @@ -2149,7 +2201,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
| 2149 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | 2201 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | 
| 2150 | int i; | 2202 | int i; | 
| 2151 | 2203 | ||
| 2152 | if (!zone->present_pages) | 2204 | if (!populated_zone(zone)) | 
| 2153 | continue; | 2205 | continue; | 
| 2154 | 2206 | ||
| 2155 | spin_lock_irqsave(&zone->lock, flags); | 2207 | spin_lock_irqsave(&zone->lock, flags); | 
| @@ -2182,7 +2234,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
| 2182 | seq_printf(m, | 2234 | seq_printf(m, | 
| 2183 | ")" | 2235 | ")" | 
| 2184 | "\n pagesets"); | 2236 | "\n pagesets"); | 
| 2185 | for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { | 2237 | for_each_online_cpu(i) { | 
| 2186 | struct per_cpu_pageset *pageset; | 2238 | struct per_cpu_pageset *pageset; | 
| 2187 | int j; | 2239 | int j; | 
| 2188 | 2240 | ||
| @@ -2197,12 +2249,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
| 2197 | seq_printf(m, | 2249 | seq_printf(m, | 
| 2198 | "\n cpu: %i pcp: %i" | 2250 | "\n cpu: %i pcp: %i" | 
| 2199 | "\n count: %i" | 2251 | "\n count: %i" | 
| 2200 | "\n low: %i" | ||
| 2201 | "\n high: %i" | 2252 | "\n high: %i" | 
| 2202 | "\n batch: %i", | 2253 | "\n batch: %i", | 
| 2203 | i, j, | 2254 | i, j, | 
| 2204 | pageset->pcp[j].count, | 2255 | pageset->pcp[j].count, | 
| 2205 | pageset->pcp[j].low, | ||
| 2206 | pageset->pcp[j].high, | 2256 | pageset->pcp[j].high, | 
| 2207 | pageset->pcp[j].batch); | 2257 | pageset->pcp[j].batch); | 
| 2208 | } | 2258 | } | 
| @@ -2257,32 +2307,40 @@ static char *vmstat_text[] = { | |||
| 2257 | "pgpgout", | 2307 | "pgpgout", | 
| 2258 | "pswpin", | 2308 | "pswpin", | 
| 2259 | "pswpout", | 2309 | "pswpout", | 
| 2260 | "pgalloc_high", | ||
| 2261 | 2310 | ||
| 2311 | "pgalloc_high", | ||
| 2262 | "pgalloc_normal", | 2312 | "pgalloc_normal", | 
| 2313 | "pgalloc_dma32", | ||
| 2263 | "pgalloc_dma", | 2314 | "pgalloc_dma", | 
| 2315 | |||
| 2264 | "pgfree", | 2316 | "pgfree", | 
| 2265 | "pgactivate", | 2317 | "pgactivate", | 
| 2266 | "pgdeactivate", | 2318 | "pgdeactivate", | 
| 2267 | 2319 | ||
| 2268 | "pgfault", | 2320 | "pgfault", | 
| 2269 | "pgmajfault", | 2321 | "pgmajfault", | 
| 2322 | |||
| 2270 | "pgrefill_high", | 2323 | "pgrefill_high", | 
| 2271 | "pgrefill_normal", | 2324 | "pgrefill_normal", | 
| 2325 | "pgrefill_dma32", | ||
| 2272 | "pgrefill_dma", | 2326 | "pgrefill_dma", | 
| 2273 | 2327 | ||
| 2274 | "pgsteal_high", | 2328 | "pgsteal_high", | 
| 2275 | "pgsteal_normal", | 2329 | "pgsteal_normal", | 
| 2330 | "pgsteal_dma32", | ||
| 2276 | "pgsteal_dma", | 2331 | "pgsteal_dma", | 
| 2332 | |||
| 2277 | "pgscan_kswapd_high", | 2333 | "pgscan_kswapd_high", | 
| 2278 | "pgscan_kswapd_normal", | 2334 | "pgscan_kswapd_normal", | 
| 2279 | 2335 | "pgscan_kswapd_dma32", | |
| 2280 | "pgscan_kswapd_dma", | 2336 | "pgscan_kswapd_dma", | 
| 2337 | |||
| 2281 | "pgscan_direct_high", | 2338 | "pgscan_direct_high", | 
| 2282 | "pgscan_direct_normal", | 2339 | "pgscan_direct_normal", | 
| 2340 | "pgscan_direct_dma32", | ||
| 2283 | "pgscan_direct_dma", | 2341 | "pgscan_direct_dma", | 
| 2284 | "pginodesteal", | ||
| 2285 | 2342 | ||
| 2343 | "pginodesteal", | ||
| 2286 | "slabs_scanned", | 2344 | "slabs_scanned", | 
| 2287 | "kswapd_steal", | 2345 | "kswapd_steal", | 
| 2288 | "kswapd_inodesteal", | 2346 | "kswapd_inodesteal", | 
| @@ -2539,6 +2597,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | |||
| 2539 | return 0; | 2597 | return 0; | 
| 2540 | } | 2598 | } | 
| 2541 | 2599 | ||
| 2600 | /* | ||
| 2601 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each | ||
| 2602 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist | ||
| 2603 | * can have before it gets flushed back to buddy allocator. | ||
| 2604 | */ | ||
| 2605 | |||
| 2606 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | ||
| 2607 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | ||
| 2608 | { | ||
| 2609 | struct zone *zone; | ||
| 2610 | unsigned int cpu; | ||
| 2611 | int ret; | ||
| 2612 | |||
| 2613 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | ||
| 2614 | if (!write || (ret == -EINVAL)) | ||
| 2615 | return ret; | ||
| 2616 | for_each_zone(zone) { | ||
| 2617 | for_each_online_cpu(cpu) { | ||
| 2618 | unsigned long high; | ||
| 2619 | high = zone->present_pages / percpu_pagelist_fraction; | ||
| 2620 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | ||
| 2621 | } | ||
| 2622 | } | ||
| 2623 | return 0; | ||
| 2624 | } | ||
| 2625 | |||
| 2542 | __initdata int hashdist = HASHDIST_DEFAULT; | 2626 | __initdata int hashdist = HASHDIST_DEFAULT; | 
| 2543 | 2627 | ||
| 2544 | #ifdef CONFIG_NUMA | 2628 | #ifdef CONFIG_NUMA | 
