Diffstat (limited to 'mm/page_alloc.c')
| -rw-r--r-- | mm/page_alloc.c | 155 |
1 file changed, 120 insertions, 35 deletions
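At a glance, alongside smaller changes (page_cgroup initialisation, gigantic compound pages, mlocked-page freeing), the bulk of this diff switches each zone from a single active/inactive LRU pair to one list per page type (anon vs. file, plus an optional unevictable list) and adds the per-zone statistics the reclaim code keeps for them. The sketch below is only an orientation aid, not the kernel's actual definitions: the field names (lru[].list, lru[].nr_scan, recent_rotated[], recent_scanned[], inactive_ratio) are taken from the hunks that follow, while the enum values and struct layout are assumptions standing in for what lives in include/linux/mmzone.h.

/*
 * Orientation sketch only -- not the kernel's definitions.  Field names
 * follow the hunks below; the enum values and layout are illustrative,
 * and struct list_head is a stand-in for <linux/list.h>.
 */
struct list_head {
	struct list_head *next, *prev;
};

enum lru_list {
	LRU_INACTIVE_ANON,	/* anonymous pages, cold */
	LRU_ACTIVE_ANON,	/* anonymous pages, recently used */
	LRU_INACTIVE_FILE,	/* page-cache pages, cold */
	LRU_ACTIVE_FILE,	/* page-cache pages, recently used */
	LRU_UNEVICTABLE,	/* only with CONFIG_UNEVICTABLE_LRU */
	NR_LRU_LISTS
};

struct zone_lru_sketch {
	/* replaces zone->active_list/inactive_list and nr_scan_* */
	struct {
		struct list_head list;
		unsigned long nr_scan;
	} lru[NR_LRU_LISTS];

	/* reclaim feedback kept separately for anon and file pages */
	unsigned long recent_rotated[2];
	unsigned long recent_scanned[2];

	/*
	 * target active:inactive balance for anon pages; see the
	 * setup_per_zone_inactive_ratio() hunk near the end
	 */
	unsigned int inactive_ratio;
};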
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 27b8681139fd..d8ac01474563 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
| @@ -44,7 +44,7 @@ | |||
| 44 | #include <linux/backing-dev.h> | 44 | #include <linux/backing-dev.h> |
| 45 | #include <linux/fault-inject.h> | 45 | #include <linux/fault-inject.h> |
| 46 | #include <linux/page-isolation.h> | 46 | #include <linux/page-isolation.h> |
| 47 | #include <linux/memcontrol.h> | 47 | #include <linux/page_cgroup.h> |
| 48 | #include <linux/debugobjects.h> | 48 | #include <linux/debugobjects.h> |
| 49 | 49 | ||
| 50 | #include <asm/tlbflush.h> | 50 | #include <asm/tlbflush.h> |
| @@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page) | |||
| 223 | 223 | ||
| 224 | static void bad_page(struct page *page) | 224 | static void bad_page(struct page *page) |
| 225 | { | 225 | { |
| 226 | void *pc = page_get_page_cgroup(page); | ||
| 227 | |||
| 228 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG | 226 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG |
| 229 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 227 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", |
| 230 | current->comm, page, (int)(2*sizeof(unsigned long)), | 228 | current->comm, page, (int)(2*sizeof(unsigned long)), |
| 231 | (unsigned long)page->flags, page->mapping, | 229 | (unsigned long)page->flags, page->mapping, |
| 232 | page_mapcount(page), page_count(page)); | 230 | page_mapcount(page), page_count(page)); |
| 233 | if (pc) { | 231 | |
| 234 | printk(KERN_EMERG "cgroup:%p\n", pc); | ||
| 235 | page_reset_bad_cgroup(page); | ||
| 236 | } | ||
| 237 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" | 232 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" |
| 238 | KERN_EMERG "Backtrace:\n"); | 233 | KERN_EMERG "Backtrace:\n"); |
| 239 | dump_stack(); | 234 | dump_stack(); |
| @@ -268,24 +263,39 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
| 268 | { | 263 | { |
| 269 | int i; | 264 | int i; |
| 270 | int nr_pages = 1 << order; | 265 | int nr_pages = 1 << order; |
| 266 | |||
| 267 | set_compound_page_dtor(page, free_compound_page); | ||
| 268 | set_compound_order(page, order); | ||
| 269 | __SetPageHead(page); | ||
| 270 | for (i = 1; i < nr_pages; i++) { | ||
| 271 | struct page *p = page + i; | ||
| 272 | |||
| 273 | __SetPageTail(p); | ||
| 274 | p->first_page = page; | ||
| 275 | } | ||
| 276 | } | ||
| 277 | |||
| 278 | #ifdef CONFIG_HUGETLBFS | ||
| 279 | void prep_compound_gigantic_page(struct page *page, unsigned long order) | ||
| 280 | { | ||
| 281 | int i; | ||
| 282 | int nr_pages = 1 << order; | ||
| 271 | struct page *p = page + 1; | 283 | struct page *p = page + 1; |
| 272 | 284 | ||
| 273 | set_compound_page_dtor(page, free_compound_page); | 285 | set_compound_page_dtor(page, free_compound_page); |
| 274 | set_compound_order(page, order); | 286 | set_compound_order(page, order); |
| 275 | __SetPageHead(page); | 287 | __SetPageHead(page); |
| 276 | for (i = 1; i < nr_pages; i++, p++) { | 288 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { |
| 277 | if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) | ||
| 278 | p = pfn_to_page(page_to_pfn(page) + i); | ||
| 279 | __SetPageTail(p); | 289 | __SetPageTail(p); |
| 280 | p->first_page = page; | 290 | p->first_page = page; |
| 281 | } | 291 | } |
| 282 | } | 292 | } |
| 293 | #endif | ||
| 283 | 294 | ||
| 284 | static void destroy_compound_page(struct page *page, unsigned long order) | 295 | static void destroy_compound_page(struct page *page, unsigned long order) |
| 285 | { | 296 | { |
| 286 | int i; | 297 | int i; |
| 287 | int nr_pages = 1 << order; | 298 | int nr_pages = 1 << order; |
| 288 | struct page *p = page + 1; | ||
| 289 | 299 | ||
| 290 | if (unlikely(compound_order(page) != order)) | 300 | if (unlikely(compound_order(page) != order)) |
| 291 | bad_page(page); | 301 | bad_page(page); |
| @@ -293,9 +303,8 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
| 293 | if (unlikely(!PageHead(page))) | 303 | if (unlikely(!PageHead(page))) |
| 294 | bad_page(page); | 304 | bad_page(page); |
| 295 | __ClearPageHead(page); | 305 | __ClearPageHead(page); |
| 296 | for (i = 1; i < nr_pages; i++, p++) { | 306 | for (i = 1; i < nr_pages; i++) { |
| 297 | if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) | 307 | struct page *p = page + i; |
| 298 | p = pfn_to_page(page_to_pfn(page) + i); | ||
| 299 | 308 | ||
| 300 | if (unlikely(!PageTail(p) | | 309 | if (unlikely(!PageTail(p) | |
| 301 | (p->first_page != page))) | 310 | (p->first_page != page))) |
| @@ -454,14 +463,16 @@ static inline void __free_one_page(struct page *page, | |||
| 454 | 463 | ||
| 455 | static inline int free_pages_check(struct page *page) | 464 | static inline int free_pages_check(struct page *page) |
| 456 | { | 465 | { |
| 466 | free_page_mlock(page); | ||
| 457 | if (unlikely(page_mapcount(page) | | 467 | if (unlikely(page_mapcount(page) | |
| 458 | (page->mapping != NULL) | | 468 | (page->mapping != NULL) | |
| 459 | (page_get_page_cgroup(page) != NULL) | | ||
| 460 | (page_count(page) != 0) | | 469 | (page_count(page) != 0) | |
| 461 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) | 470 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) |
| 462 | bad_page(page); | 471 | bad_page(page); |
| 463 | if (PageDirty(page)) | 472 | if (PageDirty(page)) |
| 464 | __ClearPageDirty(page); | 473 | __ClearPageDirty(page); |
| 474 | if (PageSwapBacked(page)) | ||
| 475 | __ClearPageSwapBacked(page); | ||
| 465 | /* | 476 | /* |
| 466 | * For now, we report if PG_reserved was found set, but do not | 477 | * For now, we report if PG_reserved was found set, but do not |
| 467 | * clear it, and do not free the page. But we shall soon need | 478 | * clear it, and do not free the page. But we shall soon need |
| @@ -600,7 +611,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 600 | { | 611 | { |
| 601 | if (unlikely(page_mapcount(page) | | 612 | if (unlikely(page_mapcount(page) | |
| 602 | (page->mapping != NULL) | | 613 | (page->mapping != NULL) | |
| 603 | (page_get_page_cgroup(page) != NULL) | | ||
| 604 | (page_count(page) != 0) | | 614 | (page_count(page) != 0) | |
| 605 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) | 615 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) |
| 606 | bad_page(page); | 616 | bad_page(page); |
| @@ -614,7 +624,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 614 | 624 | ||
| 615 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | | 625 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | |
| 616 | 1 << PG_referenced | 1 << PG_arch_1 | | 626 | 1 << PG_referenced | 1 << PG_arch_1 | |
| 617 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); | 627 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk |
| 628 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 629 | | 1 << PG_mlocked | ||
| 630 | #endif | ||
| 631 | ); | ||
| 618 | set_page_private(page, 0); | 632 | set_page_private(page, 0); |
| 619 | set_page_refcounted(page); | 633 | set_page_refcounted(page); |
| 620 | 634 | ||
| @@ -1547,6 +1561,10 @@ nofail_alloc: | |||
| 1547 | 1561 | ||
| 1548 | /* We now go into synchronous reclaim */ | 1562 | /* We now go into synchronous reclaim */ |
| 1549 | cpuset_memory_pressure_bump(); | 1563 | cpuset_memory_pressure_bump(); |
| 1564 | /* | ||
| 1565 | * The task's cpuset might have expanded its set of allowable nodes | ||
| 1566 | */ | ||
| 1567 | cpuset_update_task_memory_state(); | ||
| 1550 | p->flags |= PF_MEMALLOC; | 1568 | p->flags |= PF_MEMALLOC; |
| 1551 | reclaim_state.reclaimed_slab = 0; | 1569 | reclaim_state.reclaimed_slab = 0; |
| 1552 | p->reclaim_state = &reclaim_state; | 1570 | p->reclaim_state = &reclaim_state; |
| @@ -1862,10 +1880,21 @@ void show_free_areas(void) | |||
| 1862 | } | 1880 | } |
| 1863 | } | 1881 | } |
| 1864 | 1882 | ||
| 1865 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" | 1883 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" |
| 1884 | " inactive_file:%lu" | ||
| 1885 | //TODO: check/adjust line lengths | ||
| 1886 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1887 | " unevictable:%lu" | ||
| 1888 | #endif | ||
| 1889 | " dirty:%lu writeback:%lu unstable:%lu\n" | ||
| 1866 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", | 1890 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", |
| 1867 | global_page_state(NR_ACTIVE), | 1891 | global_page_state(NR_ACTIVE_ANON), |
| 1868 | global_page_state(NR_INACTIVE), | 1892 | global_page_state(NR_ACTIVE_FILE), |
| 1893 | global_page_state(NR_INACTIVE_ANON), | ||
| 1894 | global_page_state(NR_INACTIVE_FILE), | ||
| 1895 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1896 | global_page_state(NR_UNEVICTABLE), | ||
| 1897 | #endif | ||
| 1869 | global_page_state(NR_FILE_DIRTY), | 1898 | global_page_state(NR_FILE_DIRTY), |
| 1870 | global_page_state(NR_WRITEBACK), | 1899 | global_page_state(NR_WRITEBACK), |
| 1871 | global_page_state(NR_UNSTABLE_NFS), | 1900 | global_page_state(NR_UNSTABLE_NFS), |
| @@ -1888,8 +1917,13 @@ void show_free_areas(void) | |||
| 1888 | " min:%lukB" | 1917 | " min:%lukB" |
| 1889 | " low:%lukB" | 1918 | " low:%lukB" |
| 1890 | " high:%lukB" | 1919 | " high:%lukB" |
| 1891 | " active:%lukB" | 1920 | " active_anon:%lukB" |
| 1892 | " inactive:%lukB" | 1921 | " inactive_anon:%lukB" |
| 1922 | " active_file:%lukB" | ||
| 1923 | " inactive_file:%lukB" | ||
| 1924 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1925 | " unevictable:%lukB" | ||
| 1926 | #endif | ||
| 1893 | " present:%lukB" | 1927 | " present:%lukB" |
| 1894 | " pages_scanned:%lu" | 1928 | " pages_scanned:%lu" |
| 1895 | " all_unreclaimable? %s" | 1929 | " all_unreclaimable? %s" |
| @@ -1899,8 +1933,13 @@ void show_free_areas(void) | |||
| 1899 | K(zone->pages_min), | 1933 | K(zone->pages_min), |
| 1900 | K(zone->pages_low), | 1934 | K(zone->pages_low), |
| 1901 | K(zone->pages_high), | 1935 | K(zone->pages_high), |
| 1902 | K(zone_page_state(zone, NR_ACTIVE)), | 1936 | K(zone_page_state(zone, NR_ACTIVE_ANON)), |
| 1903 | K(zone_page_state(zone, NR_INACTIVE)), | 1937 | K(zone_page_state(zone, NR_INACTIVE_ANON)), |
| 1938 | K(zone_page_state(zone, NR_ACTIVE_FILE)), | ||
| 1939 | K(zone_page_state(zone, NR_INACTIVE_FILE)), | ||
| 1940 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1941 | K(zone_page_state(zone, NR_UNEVICTABLE)), | ||
| 1942 | #endif | ||
| 1904 | K(zone->present_pages), | 1943 | K(zone->present_pages), |
| 1905 | zone->pages_scanned, | 1944 | zone->pages_scanned, |
| 1906 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 1945 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") |
| @@ -3410,10 +3449,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 3410 | pgdat->nr_zones = 0; | 3449 | pgdat->nr_zones = 0; |
| 3411 | init_waitqueue_head(&pgdat->kswapd_wait); | 3450 | init_waitqueue_head(&pgdat->kswapd_wait); |
| 3412 | pgdat->kswapd_max_order = 0; | 3451 | pgdat->kswapd_max_order = 0; |
| 3452 | pgdat_page_cgroup_init(pgdat); | ||
| 3413 | 3453 | ||
| 3414 | for (j = 0; j < MAX_NR_ZONES; j++) { | 3454 | for (j = 0; j < MAX_NR_ZONES; j++) { |
| 3415 | struct zone *zone = pgdat->node_zones + j; | 3455 | struct zone *zone = pgdat->node_zones + j; |
| 3416 | unsigned long size, realsize, memmap_pages; | 3456 | unsigned long size, realsize, memmap_pages; |
| 3457 | enum lru_list l; | ||
| 3417 | 3458 | ||
| 3418 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 3459 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
| 3419 | realsize = size - zone_absent_pages_in_node(nid, j, | 3460 | realsize = size - zone_absent_pages_in_node(nid, j, |
| @@ -3428,8 +3469,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 3428 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 3469 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; |
| 3429 | if (realsize >= memmap_pages) { | 3470 | if (realsize >= memmap_pages) { |
| 3430 | realsize -= memmap_pages; | 3471 | realsize -= memmap_pages; |
| 3431 | mminit_dprintk(MMINIT_TRACE, "memmap_init", | 3472 | printk(KERN_DEBUG |
| 3432 | "%s zone: %lu pages used for memmap\n", | 3473 | " %s zone: %lu pages used for memmap\n", |
| 3433 | zone_names[j], memmap_pages); | 3474 | zone_names[j], memmap_pages); |
| 3434 | } else | 3475 | } else |
| 3435 | printk(KERN_WARNING | 3476 | printk(KERN_WARNING |
| @@ -3439,8 +3480,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 3439 | /* Account for reserved pages */ | 3480 | /* Account for reserved pages */ |
| 3440 | if (j == 0 && realsize > dma_reserve) { | 3481 | if (j == 0 && realsize > dma_reserve) { |
| 3441 | realsize -= dma_reserve; | 3482 | realsize -= dma_reserve; |
| 3442 | mminit_dprintk(MMINIT_TRACE, "memmap_init", | 3483 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", |
| 3443 | "%s zone: %lu pages reserved\n", | ||
| 3444 | zone_names[0], dma_reserve); | 3484 | zone_names[0], dma_reserve); |
| 3445 | } | 3485 | } |
| 3446 | 3486 | ||
| @@ -3465,10 +3505,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 3465 | zone->prev_priority = DEF_PRIORITY; | 3505 | zone->prev_priority = DEF_PRIORITY; |
| 3466 | 3506 | ||
| 3467 | zone_pcp_init(zone); | 3507 | zone_pcp_init(zone); |
| 3468 | INIT_LIST_HEAD(&zone->active_list); | 3508 | for_each_lru(l) { |
| 3469 | INIT_LIST_HEAD(&zone->inactive_list); | 3509 | INIT_LIST_HEAD(&zone->lru[l].list); |
| 3470 | zone->nr_scan_active = 0; | 3510 | zone->lru[l].nr_scan = 0; |
| 3471 | zone->nr_scan_inactive = 0; | 3511 | } |
| 3512 | zone->recent_rotated[0] = 0; | ||
| 3513 | zone->recent_rotated[1] = 0; | ||
| 3514 | zone->recent_scanned[0] = 0; | ||
| 3515 | zone->recent_scanned[1] = 0; | ||
| 3472 | zap_zone_vm_stats(zone); | 3516 | zap_zone_vm_stats(zone); |
| 3473 | zone->flags = 0; | 3517 | zone->flags = 0; |
| 3474 | if (!size) | 3518 | if (!size) |
| @@ -3952,7 +3996,7 @@ static void check_for_regular_memory(pg_data_t *pgdat) | |||
| 3952 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) | 3996 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) |
| 3953 | { | 3997 | { |
| 3954 | unsigned long nid; | 3998 | unsigned long nid; |
| 3955 | enum zone_type i; | 3999 | int i; |
| 3956 | 4000 | ||
| 3957 | /* Sort early_node_map as initialisation assumes it is sorted */ | 4001 | /* Sort early_node_map as initialisation assumes it is sorted */ |
| 3958 | sort_node_map(); | 4002 | sort_node_map(); |
| @@ -4210,7 +4254,7 @@ void setup_per_zone_pages_min(void) | |||
| 4210 | for_each_zone(zone) { | 4254 | for_each_zone(zone) { |
| 4211 | u64 tmp; | 4255 | u64 tmp; |
| 4212 | 4256 | ||
| 4213 | spin_lock_irqsave(&zone->lru_lock, flags); | 4257 | spin_lock_irqsave(&zone->lock, flags); |
| 4214 | tmp = (u64)pages_min * zone->present_pages; | 4258 | tmp = (u64)pages_min * zone->present_pages; |
| 4215 | do_div(tmp, lowmem_pages); | 4259 | do_div(tmp, lowmem_pages); |
| 4216 | if (is_highmem(zone)) { | 4260 | if (is_highmem(zone)) { |
| @@ -4242,13 +4286,53 @@ void setup_per_zone_pages_min(void) | |||
| 4242 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4286 | zone->pages_low = zone->pages_min + (tmp >> 2); |
| 4243 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4287 | zone->pages_high = zone->pages_min + (tmp >> 1); |
| 4244 | setup_zone_migrate_reserve(zone); | 4288 | setup_zone_migrate_reserve(zone); |
| 4245 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 4289 | spin_unlock_irqrestore(&zone->lock, flags); |
| 4246 | } | 4290 | } |
| 4247 | 4291 | ||
| 4248 | /* update totalreserve_pages */ | 4292 | /* update totalreserve_pages */ |
| 4249 | calculate_totalreserve_pages(); | 4293 | calculate_totalreserve_pages(); |
| 4250 | } | 4294 | } |
| 4251 | 4295 | ||
| 4296 | /** | ||
| 4297 | * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. | ||
| 4298 | * | ||
| 4299 | * The inactive anon list should be small enough that the VM never has to | ||
| 4300 | * do too much work, but large enough that each inactive page has a chance | ||
| 4301 | * to be referenced again before it is swapped out. | ||
| 4302 | * | ||
| 4303 | * The inactive_anon ratio is the target ratio of ACTIVE_ANON to | ||
| 4304 | * INACTIVE_ANON pages on this zone's LRU, maintained by the | ||
| 4305 | * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of | ||
| 4306 | * the anonymous pages are kept on the inactive list. | ||
| 4307 | * | ||
| 4308 | * total target max | ||
| 4309 | * memory ratio inactive anon | ||
| 4310 | * ------------------------------------- | ||
| 4311 | * 10MB 1 5MB | ||
| 4312 | * 100MB 1 50MB | ||
| 4313 | * 1GB 3 250MB | ||
| 4314 | * 10GB 10 0.9GB | ||
| 4315 | * 100GB 31 3GB | ||
| 4316 | * 1TB 101 10GB | ||
| 4317 | * 10TB 320 32GB | ||
| 4318 | */ | ||
| 4319 | void setup_per_zone_inactive_ratio(void) | ||
| 4320 | { | ||
| 4321 | struct zone *zone; | ||
| 4322 | |||
| 4323 | for_each_zone(zone) { | ||
| 4324 | unsigned int gb, ratio; | ||
| 4325 | |||
| 4326 | /* Zone size in gigabytes */ | ||
| 4327 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | ||
| 4328 | ratio = int_sqrt(10 * gb); | ||
| 4329 | if (!ratio) | ||
| 4330 | ratio = 1; | ||
| 4331 | |||
| 4332 | zone->inactive_ratio = ratio; | ||
| 4333 | } | ||
| 4334 | } | ||
| 4335 | |||
| 4252 | /* | 4336 | /* |
| 4253 | * Initialise min_free_kbytes. | 4337 | * Initialise min_free_kbytes. |
| 4254 | * | 4338 | * |
| @@ -4286,6 +4370,7 @@ static int __init init_per_zone_pages_min(void) | |||
| 4286 | min_free_kbytes = 65536; | 4370 | min_free_kbytes = 65536; |
| 4287 | setup_per_zone_pages_min(); | 4371 | setup_per_zone_pages_min(); |
| 4288 | setup_per_zone_lowmem_reserve(); | 4372 | setup_per_zone_lowmem_reserve(); |
| 4373 | setup_per_zone_inactive_ratio(); | ||
| 4289 | return 0; | 4374 | return 0; |
| 4290 | } | 4375 | } |
| 4291 | module_init(init_per_zone_pages_min) | 4376 | module_init(init_per_zone_pages_min) |
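As a closing note on the new setup_per_zone_inactive_ratio(): the table in its comment block follows directly from ratio = int_sqrt(10 * gb), clamped to a minimum of 1 for zones smaller than a gigabyte. A small userspace sketch (plain C, not kernel code, with an ad-hoc isqrt() standing in for the kernel's int_sqrt()) reproduces those numbers:

/*
 * Userspace sketch (not kernel code) of the inactive_ratio calculation
 * added in setup_per_zone_inactive_ratio().  The loop reproduces the
 * table from the patch's comment block; isqrt() is an ad-hoc stand-in
 * for the kernel's int_sqrt().
 */
#include <stdio.h>

static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* zone sizes in gigabytes; sub-1GB zones clamp to a ratio of 1 */
	unsigned long gb[] = { 0, 1, 10, 100, 1024, 10240 };
	unsigned int i;

	for (i = 0; i < sizeof(gb) / sizeof(gb[0]); i++) {
		unsigned long ratio = isqrt(10 * gb[i]);

		if (!ratio)
			ratio = 1;
		printf("%6lu GB -> inactive_ratio %lu\n", gb[i], ratio);
	}
	return 0;
}

Running it gives ratio 1 for sub-gigabyte zones, 3 for 1 GB, 10 for 10 GB, 31 for 100 GB, 101 for 1 TB and 320 for 10 TB, matching the table in the new comment.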
