path: root/mm
author    Jiri Kosina <jkosina@suse.cz>  2010-10-30 16:16:56 -0400
committer Jiri Kosina <jkosina@suse.cz>  2010-10-30 16:16:56 -0400
commit    f1e095f1d206b81b44160f41278ce5c78641e9b7 (patch)
tree      bd293d46d2d3e4cdf435a22ddb2877c6ba1b8acc /mm
parent    b0438a1b71955c425c304a2a483765ef24841766 (diff)
parent    1792f17b7210280a3d7ff29da9614ba779cfcedb (diff)
Merge branch 'master' into for-next
Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c       74
-rw-r--r--  mm/dmapool.c            2
-rw-r--r--  mm/filemap.c           42
-rw-r--r--  mm/highmem.c           66
-rw-r--r--  mm/hugetlb.c          238
-rw-r--r--  mm/internal.h           2
-rw-r--r--  mm/maccess.c            2
-rw-r--r--  mm/memcontrol.c       406
-rw-r--r--  mm/memory-failure.c   176
-rw-r--r--  mm/memory.c            35
-rw-r--r--  mm/memory_hotplug.c    48
-rw-r--r--  mm/mempolicy.c         17
-rw-r--r--  mm/migrate.c          249
-rw-r--r--  mm/mmap.c               2
-rw-r--r--  mm/mremap.c             4
-rw-r--r--  mm/nommu.c             51
-rw-r--r--  mm/oom_kill.c          33
-rw-r--r--  mm/page-writeback.c    31
-rw-r--r--  mm/page_alloc.c        99
-rw-r--r--  mm/page_isolation.c     3
-rw-r--r--  mm/rmap.c              37
-rw-r--r--  mm/shmem.c             17
-rw-r--r--  mm/slab.c               2
-rw-r--r--  mm/swap.c               1
-rw-r--r--  mm/swapfile.c          49
-rw-r--r--  mm/vmalloc.c           56
-rw-r--r--  mm/vmscan.c           218
-rw-r--r--  mm/vmstat.c            44
28 files changed, 1450 insertions(+), 554 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 65d420499a61..027100d30227 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -74,11 +74,11 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 
         nr_wb = nr_dirty = nr_io = nr_more_io = 0;
         spin_lock(&inode_lock);
-        list_for_each_entry(inode, &wb->b_dirty, i_list)
+        list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                 nr_dirty++;
-        list_for_each_entry(inode, &wb->b_io, i_list)
+        list_for_each_entry(inode, &wb->b_io, i_wb_list)
                 nr_io++;
-        list_for_each_entry(inode, &wb->b_more_io, i_list)
+        list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
                 nr_more_io++;
         spin_unlock(&inode_lock);
 
@@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr)
 {
         struct bdi_writeback *me = ptr;
 
-        current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+        current->flags |= PF_SWAPWRITE;
         set_freezable();
 
         /*
@@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
         __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
         __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
 };
+static atomic_t nr_bdi_congested[2];
 
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
@@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
         wait_queue_head_t *wqh = &congestion_wqh[sync];
 
         bit = sync ? BDI_sync_congested : BDI_async_congested;
-        clear_bit(bit, &bdi->state);
+        if (test_and_clear_bit(bit, &bdi->state))
+                atomic_dec(&nr_bdi_congested[sync]);
         smp_mb__after_clear_bit();
         if (waitqueue_active(wqh))
                 wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
         enum bdi_state bit;
 
         bit = sync ? BDI_sync_congested : BDI_async_congested;
-        set_bit(bit, &bdi->state);
+        if (!test_and_set_bit(bit, &bdi->state))
+                atomic_inc(&nr_bdi_congested[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
 
@@ -764,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested);
 long congestion_wait(int sync, long timeout)
 {
         long ret;
+        unsigned long start = jiffies;
         DEFINE_WAIT(wait);
         wait_queue_head_t *wqh = &congestion_wqh[sync];
 
         prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
         ret = io_schedule_timeout(timeout);
         finish_wait(wqh, &wait);
+
+        trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
+                                        jiffies_to_usecs(jiffies - start));
+
         return ret;
 }
 EXPORT_SYMBOL(congestion_wait);
 
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+        long ret;
+        unsigned long start = jiffies;
+        DEFINE_WAIT(wait);
+        wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+        /*
+         * If there is no congestion, or heavy congestion is not being
+         * encountered in the current zone, yield if necessary instead
+         * of sleeping on the congestion queue
+         */
+        if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+                        !zone_is_reclaim_congested(zone)) {
+                cond_resched();
+
+                /* In case we scheduled, work out time remaining */
+                ret = timeout - (jiffies - start);
+                if (ret < 0)
+                        ret = 0;
+
+                goto out;
+        }
+
+        /* Sleep until uncongested or a write happens */
+        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+        ret = io_schedule_timeout(timeout);
+        finish_wait(wqh, &wait);
+
+out:
+        trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+                                           jiffies_to_usecs(jiffies - start));
+
+        return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
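
Note: the intended consumer of wait_iff_congested() is the page reclaim
path (the mm/vmscan.c side of this merge, which the diffstat lists but
which is not shown in this section). A minimal, illustrative sketch of the
expected calling pattern; the trigger condition here is hypothetical:

        /* direct-reclaim back-off sketch (not part of this diff) */
        if (too_many_pages_under_writeback)     /* hypothetical condition */
                /* Sleeps up to 100ms only if some BDI is congested AND this
                 * zone recently saw congestion; otherwise just yields. */
                wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);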
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 3df063706f53..4df2de77e069 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
         size_t offset;
         void *retval;
 
+        might_sleep_if(mem_flags & __GFP_WAIT);
+
         spin_lock_irqsave(&pool->lock, flags);
  restart:
         list_for_each_entry(page, &pool->page_list, page_list) {
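
Note: might_sleep_if() is a debug annotation. With __GFP_WAIT set the
allocation may block, so calling dma_pool_alloc() from atomic context now
warns deterministically under sleep-in-atomic debugging instead of
deadlocking rarely. An illustrative misuse the check is meant to catch
(lock and variable names hypothetical):

        spin_lock_irqsave(&dev_lock, flags);
        /* GFP_KERNEL includes __GFP_WAIT -- this now triggers a warning */
        buf = dma_pool_alloc(pool, GFP_KERNEL, &dma_handle);
        spin_unlock_irqrestore(&dev_lock, flags);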
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4df44e4221..75572b5f2374 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -612,6 +612,19 @@ void __lock_page_nosync(struct page *page)
                                 TASK_UNINTERRUPTIBLE);
 }
 
+int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+                         unsigned int flags)
+{
+        if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
+                __lock_page(page);
+                return 1;
+        } else {
+                up_read(&mm->mmap_sem);
+                wait_on_page_locked(page);
+                return 0;
+        }
+}
+
 /**
  * find_get_page - find and get a page reference
  * @mapping: the address_space to search
@@ -1539,25 +1552,28 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                  * waiting for the lock.
                  */
                 do_async_mmap_readahead(vma, ra, file, page, offset);
-                lock_page(page);
-
-                /* Did it get truncated? */
-                if (unlikely(page->mapping != mapping)) {
-                        unlock_page(page);
-                        put_page(page);
-                        goto no_cached_page;
-                }
         } else {
                 /* No page in the page cache at all */
                 do_sync_mmap_readahead(vma, ra, file, offset);
                 count_vm_event(PGMAJFAULT);
                 ret = VM_FAULT_MAJOR;
 retry_find:
-                page = find_lock_page(mapping, offset);
+                page = find_get_page(mapping, offset);
                 if (!page)
                         goto no_cached_page;
         }
 
+        if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+                return ret | VM_FAULT_RETRY;
+
+        /* Did it get truncated? */
+        if (unlikely(page->mapping != mapping)) {
+                unlock_page(page);
+                put_page(page);
+                goto retry_find;
+        }
+        VM_BUG_ON(page->index != offset);
+
         /*
          * We have a locked page in the page cache, now we need to check
          * that it's up-to-date. If not, it is going to be due to an error.
@@ -2177,12 +2193,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
         }
 
         if (written > 0) {
-                loff_t end = pos + written;
-                if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
-                        i_size_write(inode, end);
+                pos += written;
+                if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+                        i_size_write(inode, pos);
                         mark_inode_dirty(inode);
                 }
-                *ppos = end;
+                *ppos = pos;
         }
 out:
         return written;
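
Note on the retry protocol above: with FAULT_FLAG_ALLOW_RETRY set,
lock_page_or_retry() may drop mmap_sem and return 0, so filemap_fault()
returns ret | VM_FAULT_RETRY instead of sleeping on the page lock. The
architecture fault handler is then expected to retake mmap_sem and retry
once; an illustrative sketch, assuming the flag definitions from this
series:

        fault = handle_mm_fault(mm, vma, address, flags);
        if (fault & VM_FAULT_RETRY) {
                /* mmap_sem was already released while we waited on the page */
                down_read(&mm->mmap_sem);
                flags &= ~FAULT_FLAG_ALLOW_RETRY;   /* retry at most once */
                goto retry;                         /* re-find the vma first */
        }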
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a0aa1be4993..693394daa2ed 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,6 +29,11 @@
 #include <linux/kgdb.h>
 #include <asm/tlbflush.h>
 
+
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
+DEFINE_PER_CPU(int, __kmap_atomic_idx);
+#endif
+
 /*
  * Virtual_count is not a pure "count".
  * 0 means that it is not mapped, and has not been mapped
@@ -42,6 +47,9 @@
 unsigned long totalhigh_pages __read_mostly;
 EXPORT_SYMBOL(totalhigh_pages);
 
+
+EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
+
 unsigned int nr_free_highpages (void)
 {
         pg_data_t *pgdat;
@@ -422,61 +430,3 @@ void __init page_address_init(void)
 }
 
 #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-
-void debug_kmap_atomic(enum km_type type)
-{
-        static int warn_count = 10;
-
-        if (unlikely(warn_count < 0))
-                return;
-
-        if (unlikely(in_interrupt())) {
-                if (in_nmi()) {
-                        if (type != KM_NMI && type != KM_NMI_PTE) {
-                                WARN_ON(1);
-                                warn_count--;
-                        }
-                } else if (in_irq()) {
-                        if (type != KM_IRQ0 && type != KM_IRQ1 &&
-                            type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
-                            type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
-                                WARN_ON(1);
-                                warn_count--;
-                        }
-                } else if (!irqs_disabled()) {  /* softirq */
-                        if (type != KM_IRQ0 && type != KM_IRQ1 &&
-                            type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
-                            type != KM_SKB_SUNRPC_DATA &&
-                            type != KM_SKB_DATA_SOFTIRQ &&
-                            type != KM_BOUNCE_READ) {
-                                WARN_ON(1);
-                                warn_count--;
-                        }
-                }
-        }
-
-        if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
-            type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
-            type == KM_IRQ_PTE || type == KM_NMI ||
-            type == KM_NMI_PTE ) {
-                if (!irqs_disabled()) {
-                        WARN_ON(1);
-                        warn_count--;
-                }
-        } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
-                if (irq_count() == 0 && !irqs_disabled()) {
-                        WARN_ON(1);
-                        warn_count--;
-                }
-        }
-#ifdef CONFIG_KGDB_KDB
-        if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) {
-                WARN_ON(1);
-                warn_count--;
-        }
-#endif /* CONFIG_KGDB_KDB */
-}
-
-#endif
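
Note: __kmap_atomic_idx replaces the old KM_* "type" slots with a per-CPU
stack index, which is why the debug_kmap_atomic() type checks above can be
deleted wholesale: slots are now pushed and popped in strict LIFO order. A
rough sketch of the helpers this variable is meant to back (names and
exact form assumed, from <linux/highmem.h> in the same patch series):

        static inline int kmap_atomic_idx_push(void)
        {
                return __get_cpu_var(__kmap_atomic_idx)++;
        }

        static inline int kmap_atomic_idx_pop(void)
        {
                return --__get_cpu_var(__kmap_atomic_idx);
        }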
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..c4a3558589ab 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
         }
 }
 
-static void copy_gigantic_page(struct page *dst, struct page *src,
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
                            unsigned long addr, struct vm_area_struct *vma)
 {
         int i;
         struct hstate *h = hstate_vma(vma);
         struct page *dst_base = dst;
         struct page *src_base = src;
-        might_sleep();
+
         for (i = 0; i < pages_per_huge_page(h); ) {
                 cond_resched();
                 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
                 src = mem_map_next(src, src_base, i);
         }
 }
-static void copy_huge_page(struct page *dst, struct page *src,
+
+static void copy_user_huge_page(struct page *dst, struct page *src,
                            unsigned long addr, struct vm_area_struct *vma)
 {
         int i;
         struct hstate *h = hstate_vma(vma);
 
         if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
-                copy_gigantic_page(dst, src, addr, vma);
+                copy_user_gigantic_page(dst, src, addr, vma);
                 return;
         }
 
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
         }
 }
 
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+        int i;
+        struct hstate *h = page_hstate(src);
+        struct page *dst_base = dst;
+        struct page *src_base = src;
+
+        for (i = 0; i < pages_per_huge_page(h); ) {
+                cond_resched();
+                copy_highpage(dst, src);
+
+                i++;
+                dst = mem_map_next(dst, dst_base, i);
+                src = mem_map_next(src, src_base, i);
+        }
+}
+
+void copy_huge_page(struct page *dst, struct page *src)
+{
+        int i;
+        struct hstate *h = page_hstate(src);
+
+        if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+                copy_gigantic_page(dst, src);
+                return;
+        }
+
+        might_sleep();
+        for (i = 0; i < pages_per_huge_page(h); i++) {
+                cond_resched();
+                copy_highpage(dst + i, src + i);
+        }
+}
+
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
         int nid = page_to_nid(page);
@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
         h->free_huge_pages_node[nid]++;
 }
 
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+        struct page *page;
+
+        if (list_empty(&h->hugepage_freelists[nid]))
+                return NULL;
+        page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+        list_del(&page->lru);
+        set_page_refcounted(page);
+        h->free_huge_pages--;
+        h->free_huge_pages_node[nid]--;
+        return page;
+}
+
 static struct page *dequeue_huge_page_vma(struct hstate *h,
                                 struct vm_area_struct *vma,
                                 unsigned long address, int avoid_reserve)
 {
-        int nid;
         struct page *page = NULL;
         struct mempolicy *mpol;
         nodemask_t *nodemask;
@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 
         for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                         MAX_NR_ZONES - 1, nodemask) {
-                nid = zone_to_nid(zone);
-                if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
-                    !list_empty(&h->hugepage_freelists[nid])) {
-                        page = list_entry(h->hugepage_freelists[nid].next,
-                                          struct page, lru);
-                        list_del(&page->lru);
-                        h->free_huge_pages--;
-                        h->free_huge_pages_node[nid]--;
-
-                        if (!avoid_reserve)
-                                decrement_hugepage_resv_vma(h, vma);
-
-                        break;
+                if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+                        page = dequeue_huge_page_node(h, zone_to_nid(zone));
+                        if (page) {
+                                if (!avoid_reserve)
+                                        decrement_hugepage_resv_vma(h, vma);
+                                break;
+                        }
                 }
         }
 err:
@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
         return ret;
 }
 
-static struct page *alloc_buddy_huge_page(struct hstate *h,
-                        struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 {
         struct page *page;
-        unsigned int nid;
+        unsigned int r_nid;
 
         if (h->order >= MAX_ORDER)
                 return NULL;
@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
         }
         spin_unlock(&hugetlb_lock);
 
-        page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
-                                        __GFP_REPEAT|__GFP_NOWARN,
-                                        huge_page_order(h));
+        if (nid == NUMA_NO_NODE)
+                page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+                                   __GFP_REPEAT|__GFP_NOWARN,
+                                   huge_page_order(h));
+        else
+                page = alloc_pages_exact_node(nid,
+                        htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+                        __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
 
         if (page && arch_prepare_hugepage(page)) {
                 __free_pages(page, huge_page_order(h));
@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 
         spin_lock(&hugetlb_lock);
         if (page) {
-                /*
-                 * This page is now managed by the hugetlb allocator and has
-                 * no users -- drop the buddy allocator's reference.
-                 */
-                put_page_testzero(page);
-                VM_BUG_ON(page_count(page));
-                nid = page_to_nid(page);
+                r_nid = page_to_nid(page);
                 set_compound_page_dtor(page, free_huge_page);
                 /*
                  * We incremented the global counters already
                  */
-                h->nr_huge_pages_node[nid]++;
-                h->surplus_huge_pages_node[nid]++;
+                h->nr_huge_pages_node[r_nid]++;
+                h->surplus_huge_pages_node[r_nid]++;
                 __count_vm_event(HTLB_BUDDY_PGALLOC);
         } else {
                 h->nr_huge_pages--;
@@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 }
 
 /*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares about the
+ * physical address of the error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+        struct page *page;
+
+        spin_lock(&hugetlb_lock);
+        page = dequeue_huge_page_node(h, nid);
+        spin_unlock(&hugetlb_lock);
+
+        if (!page)
+                page = alloc_buddy_huge_page(h, nid);
+
+        return page;
+}
+
+/*
  * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
  */
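
Note: as the comment says, alloc_huge_page_node() exists for callers that
have a physical page but no vma. The expected soft-offline call site (in
mm/memory-failure.c, not shown in this section) looks roughly like:

        /* allocate a migration target on the same node as the bad page */
        new = alloc_huge_page_node(page_hstate(compound_head(p)),
                                   page_to_nid(p));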
@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
         spin_unlock(&hugetlb_lock);
         for (i = 0; i < needed; i++) {
-                page = alloc_buddy_huge_page(h, NULL, 0);
-                if (!page) {
+                page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+                if (!page)
                         /*
                          * We were not able to allocate enough pages to
                          * satisfy the entire reservation so we free what
                          * we've allocated so far.
                          */
-                        spin_lock(&hugetlb_lock);
-                        needed = 0;
                         goto free;
-                }
 
                 list_add(&page->lru, &surplus_list);
         }
@@ -908,31 +964,31 @@ retry:
         needed += allocated;
         h->resv_huge_pages += delta;
         ret = 0;
-free:
+
+        spin_unlock(&hugetlb_lock);
         /* Free the needed pages to the hugetlb pool */
         list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
                 if ((--needed) < 0)
                         break;
                 list_del(&page->lru);
+                /*
+                 * This page is now managed by the hugetlb allocator and has
+                 * no users -- drop the buddy allocator's reference.
+                 */
+                put_page_testzero(page);
+                VM_BUG_ON(page_count(page));
                 enqueue_huge_page(h, page);
         }
 
         /* Free unnecessary surplus pages to the buddy allocator */
+free:
         if (!list_empty(&surplus_list)) {
-                spin_unlock(&hugetlb_lock);
                 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
                         list_del(&page->lru);
-                        /*
-                         * The page has a reference count of zero already, so
-                         * call free_huge_page directly instead of using
-                         * put_page. This must be done with hugetlb_lock
-                         * unlocked which is safe because free_huge_page takes
-                         * hugetlb_lock before deciding how to free the page.
-                         */
-                        free_huge_page(page);
+                        put_page(page);
                 }
-                spin_lock(&hugetlb_lock);
         }
+        spin_lock(&hugetlb_lock);
 
         return ret;
 }
@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         spin_unlock(&hugetlb_lock);
 
         if (!page) {
-                page = alloc_buddy_huge_page(h, vma, addr);
+                page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
                 if (!page) {
                         hugetlb_put_quota(inode->i_mapping, chg);
                         return ERR_PTR(-VM_FAULT_SIGBUS);
                 }
         }
 
-        set_page_refcounted(page);
         set_page_private(page, (unsigned long) mapping);
 
         vma_commit_reservation(h, vma, addr);
@@ -2153,6 +2208,19 @@ nomem:
         return -ENOMEM;
 }
 
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+        swp_entry_t swp;
+
+        if (huge_pte_none(pte) || pte_present(pte))
+                return 0;
+        swp = pte_to_swp_entry(pte);
+        if (non_swap_entry(swp) && is_migration_entry(swp)) {
+                return 1;
+        } else
+                return 0;
+}
+
 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
         swp_entry_t swp;
@@ -2380,10 +2448,13 @@ retry_avoidcopy:
          * When the original hugepage is shared one, it does not have
          * anon_vma prepared.
          */
-        if (unlikely(anon_vma_prepare(vma)))
+        if (unlikely(anon_vma_prepare(vma))) {
+                /* Caller expects lock to be held */
+                spin_lock(&mm->page_table_lock);
                 return VM_FAULT_OOM;
+        }
 
-        copy_huge_page(new_page, old_page, address, vma);
+        copy_user_huge_page(new_page, old_page, address, vma);
         __SetPageUptodate(new_page);
 
         /*
@@ -2515,22 +2586,20 @@ retry:
                         hugepage_add_new_anon_rmap(page, vma, address);
                 }
         } else {
+                /*
+                 * If a memory error occurs between mmap() and fault, some
+                 * processes may not have a hwpoisoned swap entry for the
+                 * errored virtual address, so block hugepage faults with a
+                 * PG_hwpoison bit check.
+                 */
+                if (unlikely(PageHWPoison(page))) {
+                        ret = VM_FAULT_HWPOISON |
+                              VM_FAULT_SET_HINDEX(h - hstates);
+                        goto backout_unlocked;
+                }
                 page_dup_rmap(page);
         }
 
         /*
-         * Since memory error handler replaces pte into hwpoison swap entry
-         * at the time of error handling, a process which reserved but not have
-         * the mapping to the error hugepage does not have hwpoison swap entry.
-         * So we need to block accesses from such a process by checking
-         * PG_hwpoison bit here.
-         */
-        if (unlikely(PageHWPoison(page))) {
-                ret = VM_FAULT_HWPOISON;
-                goto backout_unlocked;
-        }
-
-        /*
          * If we are going to COW a private mapping later, we examine the
          * pending reservations for this page now. This will ensure that
          * any allocations necessary to record that reservation occur outside
@@ -2587,8 +2656,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         ptep = huge_pte_offset(mm, address);
         if (ptep) {
                 entry = huge_ptep_get(ptep);
-                if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
-                        return VM_FAULT_HWPOISON;
+                if (unlikely(is_hugetlb_entry_migration(entry))) {
+                        migration_entry_wait(mm, (pmd_t *)ptep, address);
+                        return 0;
+                } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+                        return VM_FAULT_HWPOISON_LARGE |
+                               VM_FAULT_SET_HINDEX(h - hstates);
         }
 
         ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2878,18 +2951,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
         hugetlb_acct_memory(h, -(chg - freed));
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Should be called in hugetlb_lock */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+        struct page *page;
+        struct page *tmp;
+        struct hstate *h = page_hstate(hpage);
+        int nid = page_to_nid(hpage);
+
+        list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+                if (page == hpage)
+                        return 1;
+        return 0;
+}
+
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
  */
-void __isolate_hwpoisoned_huge_page(struct page *hpage)
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
 {
         struct hstate *h = page_hstate(hpage);
         int nid = page_to_nid(hpage);
+        int ret = -EBUSY;
 
         spin_lock(&hugetlb_lock);
-        list_del(&hpage->lru);
-        h->free_huge_pages--;
-        h->free_huge_pages_node[nid]--;
+        if (is_hugepage_on_freelist(hpage)) {
+                list_del(&hpage->lru);
+                set_page_refcounted(hpage);
+                h->free_huge_pages--;
+                h->free_huge_pages_node[nid]--;
+                ret = 0;
+        }
         spin_unlock(&hugetlb_lock);
+        return ret;
 }
+#endif
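
Note: the void-to-int change matters to the memory-failure caller. A
return of 0 means the hugepage really was dequeued from the free list (and
is now refcounted), while -EBUSY means it was in use and must go through
the normal unmap/kill path instead. Illustrative caller sketch, assuming
the existing mce_bad_pages hwpoison counter:

        if (dequeue_hwpoisoned_huge_page(hpage) == 0)
                /* was free: poison is contained, just account the pages */
                atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);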
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb97fc5..dedb0aff673f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page);
  */
 static inline unsigned long page_order(struct page *page)
 {
-        VM_BUG_ON(!PageBuddy(page));
+        /* PageBuddy() must be checked by the caller */
         return page_private(page);
 }
 
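
Note: dropping the VM_BUG_ON moves the invariant to the call sites.
page_order() is only meaningful while PageBuddy() holds (under the zone
lock), so callers must check it themselves; a sketch of the required
pattern (the action taken is hypothetical):

        if (PageBuddy(buddy) && page_order(buddy) >= order)
                /* page_private() really holds a buddy order here */
                try_to_merge(page, buddy);      /* hypothetical action */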
diff --git a/mm/maccess.c b/mm/maccess.c
index 4e348dbaecd7..e2b6f5634e0d 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,9 +1,9 @@
 /*
  * Access kernel memory without faulting.
  */
-#include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/uaccess.h>
 
 /**
  * probe_kernel_read(): safely attempt to read from a location
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..9a99cfaf0a19 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,10 @@ enum mem_cgroup_stat_index {
         MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
         MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
         MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
-        MEM_CGROUP_EVENTS,      /* incremented at every pagein/pageout */
+        MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
+        /* incremented at every pagein/pageout */
+        MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
+        MEM_CGROUP_ON_MOVE,     /* someone is moving account between groups */
 
         MEM_CGROUP_STAT_NSTATS,
 };
@@ -254,6 +257,12 @@ struct mem_cgroup {
          * percpu counter.
          */
         struct mem_cgroup_stat_cpu *stat;
+        /*
+         * used when a cpu is offlined or other synchronizations
+         * See mem_cgroup_read_stat().
+         */
+        struct mem_cgroup_stat_cpu nocpu_base;
+        spinlock_t pcp_counter_lock;
 };
 
 /* Stuffs for move charges at task migration. */
@@ -530,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
         return mz;
 }
 
+/*
+ * Implementation Note: reading percpu statistics for memcg.
+ *
+ * Both vmstat[] and percpu_counter have thresholds and do periodic
+ * synchronization to implement "quick" reads. There is a trade-off between
+ * reading cost and precision of value. Then, we may have a chance to
+ * implement a periodic synchronization of counters in memcg's counter.
+ *
+ * But this _read() function is used for user interface now. The user accounts
+ * memory usage by memory cgroup and he _always_ requires exact value because
+ * he accounts memory. Even if we provide quick-and-fuzzy read, we always
+ * have to visit all online cpus and make the sum. So, for now, unnecessary
+ * synchronization is not implemented. (just implemented for cpu hotplug)
+ *
+ * If there are kernel internal actions which can make use of some not-exact
+ * value, and reading all cpu values can be a performance bottleneck in some
+ * common workload, thresholds and synchronization as in vmstat[] should be
+ * implemented.
+ */
 static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
                 enum mem_cgroup_stat_index idx)
 {
         int cpu;
         s64 val = 0;
 
-        for_each_possible_cpu(cpu)
+        get_online_cpus();
+        for_each_online_cpu(cpu)
                 val += per_cpu(mem->stat->count[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+        spin_lock(&mem->pcp_counter_lock);
+        val += mem->nocpu_base.count[idx];
+        spin_unlock(&mem->pcp_counter_lock);
+#endif
+        put_online_cpus();
         return val;
 }
 
@@ -659,40 +694,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
         return mem;
 }
 
-/*
- * Call callback function against all cgroup under hierarchy tree.
- */
-static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
-                        int (*func)(struct mem_cgroup *, void *))
+/* The caller has to guarantee "mem" exists before calling this */
+static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
 {
-        int found, ret, nextid;
         struct cgroup_subsys_state *css;
-        struct mem_cgroup *mem;
-
-        if (!root->use_hierarchy)
-                return (*func)(root, data);
+        int found;
 
-        nextid = 1;
-        do {
-                ret = 0;
+        if (!mem) /* ROOT cgroup has the smallest ID */
+                return root_mem_cgroup; /*css_put/get against root is ignored*/
+        if (!mem->use_hierarchy) {
+                if (css_tryget(&mem->css))
+                        return mem;
+                return NULL;
+        }
+        rcu_read_lock();
+        /*
+         * searching a memory cgroup which has the smallest ID under given
+         * ROOT cgroup. (ID >= 1)
+         */
+        css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
+        if (css && css_tryget(css))
+                mem = container_of(css, struct mem_cgroup, css);
+        else
                 mem = NULL;
+        rcu_read_unlock();
+        return mem;
+}
+
+static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
+                struct mem_cgroup *root,
+                bool cond)
+{
+        int nextid = css_id(&iter->css) + 1;
+        int found;
+        int hierarchy_used;
+        struct cgroup_subsys_state *css;
+
+        hierarchy_used = iter->use_hierarchy;
 
+        css_put(&iter->css);
+        /* If no ROOT, walk all, ignore hierarchy */
+        if (!cond || (root && !hierarchy_used))
+                return NULL;
+
+        if (!root)
+                root = root_mem_cgroup;
+
+        do {
+                iter = NULL;
                 rcu_read_lock();
-                css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
-                                &found);
+
+                css = css_get_next(&mem_cgroup_subsys, nextid,
+                                &root->css, &found);
                 if (css && css_tryget(css))
-                        mem = container_of(css, struct mem_cgroup, css);
+                        iter = container_of(css, struct mem_cgroup, css);
                 rcu_read_unlock();
-
-                if (mem) {
-                        ret = (*func)(mem, data);
-                        css_put(&mem->css);
-                }
+                /* If css is NULL, no more cgroups will be found */
                 nextid = found + 1;
-        } while (!ret && css);
+        } while (css && !iter);
 
-        return ret;
+        return iter;
 }
+/*
+ * for_each_mem_cgroup_tree() for visiting all cgroup under tree. Please
+ * be careful that a "break" from the loop is not allowed: we hold a
+ * reference count. Instead, set "cond" to false and "continue" to exit
+ * the loop.
+ */
+#define for_each_mem_cgroup_tree_cond(iter, root, cond)        \
+        for (iter = mem_cgroup_start_loop(root);\
+             iter != NULL;\
+             iter = mem_cgroup_get_next(iter, root, cond))
+
+#define for_each_mem_cgroup_tree(iter, root) \
+        for_each_mem_cgroup_tree_cond(iter, root, true)
+
+#define for_each_mem_cgroup_all(iter) \
+        for_each_mem_cgroup_tree_cond(iter, NULL, true)
+
 
 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
 {
@@ -1051,7 +1129,52 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
         return swappiness;
 }
 
-/* A routine for testing mem is not under move_account */
+static void mem_cgroup_start_move(struct mem_cgroup *mem)
+{
+        int cpu;
+
+        get_online_cpus();
+        spin_lock(&mem->pcp_counter_lock);
+        for_each_online_cpu(cpu)
+                per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+        mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
+        spin_unlock(&mem->pcp_counter_lock);
+        put_online_cpus();
+
+        synchronize_rcu();
+}
+
+static void mem_cgroup_end_move(struct mem_cgroup *mem)
+{
+        int cpu;
+
+        if (!mem)
+                return;
+        get_online_cpus();
+        spin_lock(&mem->pcp_counter_lock);
+        for_each_online_cpu(cpu)
+                per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+        mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
+        spin_unlock(&mem->pcp_counter_lock);
+        put_online_cpus();
+}
+/*
+ * 2 routines for checking "mem" is under move_account() or not.
+ *
+ * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
+ *                        for avoiding race in accounting. If true,
+ *                        pc->mem_cgroup may be overwritten.
+ *
+ * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
+ *                           under hierarchy of moving cgroups. This is for
+ *                           waiting at high-memory pressure caused by "move".
+ */
+
+static bool mem_cgroup_stealed(struct mem_cgroup *mem)
+{
+        VM_BUG_ON(!rcu_read_lock_held());
+        return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+}
 
 static bool mem_cgroup_under_move(struct mem_cgroup *mem)
 {
@@ -1092,13 +1215,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
         return false;
 }
 
-static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
-{
-        int *val = data;
-        (*val)++;
-        return 0;
-}
-
 /**
  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
  * @memcg: The memory cgroup that went over limit
@@ -1173,7 +1289,10 @@ done:
 static int mem_cgroup_count_children(struct mem_cgroup *mem)
 {
         int num = 0;
-        mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
+        struct mem_cgroup *iter;
+
+        for_each_mem_cgroup_tree(iter, mem)
+                num++;
         return num;
 }
 
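Note: the count_children conversion above is the canonical use of the new
iterator introduced earlier in this patch: every visited group holds a css
reference, so a bare "break" would leak it, and loops that must stop early
are expected to flip "cond" to false instead. The basic shape is simply:

        struct mem_cgroup *iter;
        int num = 0;

        for_each_mem_cgroup_tree(iter, mem)     /* mem and all descendants */
                num++;
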
@@ -1322,49 +1441,39 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
         return total;
 }
 
-static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
-{
-        int *val = (int *)data;
-        int x;
-        /*
-         * Logically, we can stop scanning immediately when we find
-         * a memcg is already locked. But considering unlock ops and
-         * creation/removal of memcg, scan-all is simple operation.
-         */
-        x = atomic_inc_return(&mem->oom_lock);
-        *val = max(x, *val);
-        return 0;
-}
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
  */
 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
 {
-        int lock_count = 0;
+        int x, lock_count = 0;
+        struct mem_cgroup *iter;
 
-        mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+        for_each_mem_cgroup_tree(iter, mem) {
+                x = atomic_inc_return(&iter->oom_lock);
+                lock_count = max(x, lock_count);
+        }
 
         if (lock_count == 1)
                 return true;
         return false;
 }
 
-static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
 {
+        struct mem_cgroup *iter;
+
         /*
          * When a new child is created while the hierarchy is under oom,
          * mem_cgroup_oom_lock() may not be called. We have to use
          * atomic_add_unless() here.
          */
-        atomic_add_unless(&mem->oom_lock, -1, 0);
+        for_each_mem_cgroup_tree(iter, mem)
+                atomic_add_unless(&iter->oom_lock, -1, 0);
         return 0;
 }
 
-static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
-{
-        mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
-}
 
 static DEFINE_MUTEX(memcg_oom_mutex);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1462,34 +1571,73 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 /*
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
+ *
+ * Notes: Race condition
+ *
+ * We usually use page_cgroup_lock() for accessing page_cgroup members, but
+ * it tends to be costly. Under some conditions, we don't need to do so
+ * _always_.
+ *
+ * Considering "charge", lock_page_cgroup() is not required because all
+ * file-stat operations happen after a page is attached to the radix-tree.
+ * There is no race with "charge".
+ *
+ * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
+ * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup
+ * even if there is a race with "uncharge". The statistics themselves are
+ * properly handled by flags.
+ *
+ * Considering "move", this is the only case where we see a race. To make
+ * the race window small, we check the MEM_CGROUP_ON_MOVE percpu value and
+ * detect the possibility of a race. If there is one, we take a lock.
  */
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+
+static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
 {
         struct mem_cgroup *mem;
-        struct page_cgroup *pc;
+        struct page_cgroup *pc = lookup_page_cgroup(page);
+        bool need_unlock = false;
 
-        pc = lookup_page_cgroup(page);
         if (unlikely(!pc))
                 return;
 
-        lock_page_cgroup(pc);
+        rcu_read_lock();
         mem = pc->mem_cgroup;
-        if (!mem || !PageCgroupUsed(pc))
-                goto done;
+        if (unlikely(!mem || !PageCgroupUsed(pc)))
+                goto out;
+        /* pc->mem_cgroup is unstable ? */
+        if (unlikely(mem_cgroup_stealed(mem))) {
+                /* take a lock to access pc->mem_cgroup safely */
+                lock_page_cgroup(pc);
+                need_unlock = true;
+                mem = pc->mem_cgroup;
+                if (!mem || !PageCgroupUsed(pc))
+                        goto out;
+        }
 
-        /*
-         * Preemption is already disabled. We can use __this_cpu_xxx
-         */
-        if (val > 0) {
-                __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-                SetPageCgroupFileMapped(pc);
-        } else {
-                __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-                ClearPageCgroupFileMapped(pc);
+        this_cpu_add(mem->stat->count[idx], val);
+
+        switch (idx) {
+        case MEM_CGROUP_STAT_FILE_MAPPED:
+                if (val > 0)
+                        SetPageCgroupFileMapped(pc);
+                else if (!page_mapped(page))
+                        ClearPageCgroupFileMapped(pc);
+                break;
+        default:
+                BUG();
         }
 
-done:
-        unlock_page_cgroup(pc);
+out:
+        if (unlikely(need_unlock))
+                unlock_page_cgroup(pc);
+        rcu_read_unlock();
+        return;
+}
+
+void mem_cgroup_update_file_mapped(struct page *page, int val)
+{
+        mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
 }
 
 /*
@@ -1605,15 +1753,55 @@ static void drain_all_stock_sync(void)
         atomic_dec(&memcg_drain_count);
 }
 
-static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+/*
+ * This function drains the percpu counter values of a DEAD cpu and
+ * moves them to the local cpu. Note that this function can be preempted.
+ */
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
+{
+        int i;
+
+        spin_lock(&mem->pcp_counter_lock);
+        for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
+                s64 x = per_cpu(mem->stat->count[i], cpu);
+
+                per_cpu(mem->stat->count[i], cpu) = 0;
+                mem->nocpu_base.count[i] += x;
+        }
+        /* need to clear ON_MOVE value, works as a kind of lock. */
+        per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
+        spin_unlock(&mem->pcp_counter_lock);
+}
+
+static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
+{
+        int idx = MEM_CGROUP_ON_MOVE;
+
+        spin_lock(&mem->pcp_counter_lock);
+        per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
+        spin_unlock(&mem->pcp_counter_lock);
+}
+
+static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
                                         unsigned long action,
                                         void *hcpu)
 {
         int cpu = (unsigned long)hcpu;
         struct memcg_stock_pcp *stock;
+        struct mem_cgroup *iter;
+
+        if ((action == CPU_ONLINE)) {
+                for_each_mem_cgroup_all(iter)
+                        synchronize_mem_cgroup_on_move(iter, cpu);
+                return NOTIFY_OK;
+        }
 
-        if (action != CPU_DEAD)
+        if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
                 return NOTIFY_OK;
+
+        for_each_mem_cgroup_all(iter)
+                mem_cgroup_drain_pcp_counter(iter, cpu);
+
         stock = &per_cpu(memcg_stock, cpu);
         drain_stock(stock);
         return NOTIFY_OK;
@@ -3038,6 +3226,7 @@ move_account:
         lru_add_drain_all();
         drain_all_stock_sync();
         ret = 0;
+        mem_cgroup_start_move(mem);
         for_each_node_state(node, N_HIGH_MEMORY) {
                 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
                         enum lru_list l;
@@ -3051,6 +3240,7 @@ move_account:
                 if (ret)
                         break;
         }
+        mem_cgroup_end_move(mem);
         memcg_oom_recover(mem);
         /* it seems parent cgroup doesn't have enough mem */
         if (ret == -ENOMEM)
@@ -3137,33 +3327,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
         return retval;
 }
 
-struct mem_cgroup_idx_data {
-        s64 val;
-        enum mem_cgroup_stat_index idx;
-};
 
-static int
-mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
+static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
+                enum mem_cgroup_stat_index idx)
 {
-        struct mem_cgroup_idx_data *d = data;
-        d->val += mem_cgroup_read_stat(mem, d->idx);
-        return 0;
-}
+        struct mem_cgroup *iter;
+        s64 val = 0;
 
-static void
-mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
-                enum mem_cgroup_stat_index idx, s64 *val)
-{
-        struct mem_cgroup_idx_data d;
-        d.idx = idx;
-        d.val = 0;
-        mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
-        *val = d.val;
+        /* each per-cpu value can be negative, so use s64 */
+        for_each_mem_cgroup_tree(iter, mem)
+                val += mem_cgroup_read_stat(iter, idx);
+
+        if (val < 0) /* race ? */
+                val = 0;
+        return val;
 }
 
 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
 {
-        u64 idx_val, val;
+        u64 val;
 
         if (!mem_cgroup_is_root(mem)) {
                 if (!swap)
@@ -3172,16 +3354,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
                         return res_counter_read_u64(&mem->memsw, RES_USAGE);
         }
 
-        mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
-        val = idx_val;
-        mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
-        val += idx_val;
+        val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
+        val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
 
-        if (swap) {
-                mem_cgroup_get_recursive_idx_stat(mem,
-                                MEM_CGROUP_STAT_SWAPOUT, &idx_val);
-                val += idx_val;
-        }
+        if (swap)
+                val += mem_cgroup_get_recursive_idx_stat(mem,
+                                MEM_CGROUP_STAT_SWAPOUT);
 
         return val << PAGE_SHIFT;
 }
@@ -3389,9 +3567,9 @@ struct {
 };
 
 
-static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
+static void
+mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
 {
-        struct mcs_total_stat *s = data;
         s64 val;
 
         /* per cpu stat */
@@ -3421,13 +3599,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
         s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
         val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
         s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
-        return 0;
 }
 
 static void
 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
 {
-        mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
+        struct mem_cgroup *iter;
+
+        for_each_mem_cgroup_tree(iter, mem)
+                mem_cgroup_get_local_stat(iter, s);
 }
 
 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
@@ -3604,7 +3784,7 @@ static int compare_thresholds(const void *a, const void *b)
         return _a->threshold - _b->threshold;
 }
 
-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
 {
         struct mem_cgroup_eventfd_list *ev;
 
@@ -3615,7 +3795,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
 
 static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
 {
-        mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
+        struct mem_cgroup *iter;
+
+        for_each_mem_cgroup_tree(iter, mem)
+                mem_cgroup_oom_notify_cb(iter);
 }
 
 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
@@ -4032,6 +4215,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
                 vfree(mem);
                 mem = NULL;
         }
+        spin_lock_init(&mem->pcp_counter_lock);
         return mem;
 }
 
@@ -4158,7 +4342,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
                         &per_cpu(memcg_stock, cpu);
                 INIT_WORK(&stock->work, drain_local_stock);
         }
-        hotcpu_notifier(memcg_stock_cpu_callback, 0);
+        hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
         } else {
                 parent = mem_cgroup_from_cont(cont->parent);
                 mem->use_hierarchy = parent->use_hierarchy;
@@ -4513,6 +4697,7 @@ static void mem_cgroup_clear_mc(void)
         mc.to = NULL;
         mc.moving_task = NULL;
         spin_unlock(&mc.lock);
+        mem_cgroup_end_move(from);
         memcg_oom_recover(from);
         memcg_oom_recover(to);
         wake_up_all(&mc.waitq);
@@ -4543,6 +4728,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
         VM_BUG_ON(mc.moved_charge);
         VM_BUG_ON(mc.moved_swap);
         VM_BUG_ON(mc.moving_task);
+        mem_cgroup_start_move(from);
         spin_lock(&mc.lock);
         mc.from = from;
         mc.to = mem;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe..124324134ff6 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
7 * Free Software Foundation. 7 * Free Software Foundation.
8 * 8 *
9 * High level machine check handler. Handles pages reported by the 9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted usually due to a 2bit ECC memory or cache 10 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
11 * failure. 11 * failure.
12 *
13 * In addition there is a "soft offline" entry point that allows stop using
14 * not-yet-corrupted-by-suspicious pages without killing anything.
12 * 15 *
13 * Handles page cache pages in various states. The tricky part 16 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronous to other VM 17 * here is that we can access any page asynchronously in respect to
15 * users, because memory failures could happen anytime and anywhere, 18 * other VM users, because memory failures could happen anytime and
16 * possibly violating some of their assumptions. This is why this code 19 * anywhere. This could violate some of their assumptions. This is why
17 * has to be extremely careful. Generally it tries to use normal locking 20 * this code has to be extremely careful. Generally it tries to use
18 * rules, as in get the standard locks, even if that means the 21 * normal locking rules, as in get the standard locks, even if that means
19 * error handling takes potentially a long time. 22 * the error handling takes potentially a long time.
20 * 23 *
21 * The operation to map back from RMAP chains to processes has to walk 24 * There are several operations here with exponential complexity because
22 * the complete process list and has non linear complexity with the number 25 * of unsuitable VM data structures. For example the operation to map back
23 * mappings. In short it can be quite slow. But since memory corruptions 26 * from RMAP chains to processes has to walk the complete process list and
24 * are rare we hope to get away with this. 27 * has non-linear complexity with the number of mappings. But since memory corruptions
28 * are rare we hope to get away with this. This avoids impacting the core
29 * VM.
25 */ 30 */
26 31
27/* 32/*
@@ -30,7 +35,6 @@
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages 35 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel 36 * - pass bad pages to kdump next kernel
32 */ 37 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h> 38#include <linux/kernel.h>
35#include <linux/mm.h> 39#include <linux/mm.h>
36#include <linux/page-flags.h> 40#include <linux/page-flags.h>
@@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p)
78 return 0; 82 return 0;
79 83
80 /* 84 /*
81 * page_mapping() does not accept slab page 85 * page_mapping() does not accept slab pages.
82 */ 86 */
83 if (PageSlab(p)) 87 if (PageSlab(p))
84 return -EINVAL; 88 return -EINVAL;
@@ -268,7 +272,7 @@ struct to_kill {
268 struct list_head nd; 272 struct list_head nd;
269 struct task_struct *tsk; 273 struct task_struct *tsk;
270 unsigned long addr; 274 unsigned long addr;
271 unsigned addr_valid:1; 275 char addr_valid;
272}; 276};
273 277
274/* 278/*
@@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
309 * a SIGKILL because the error is not contained anymore. 313 * a SIGKILL because the error is not contained anymore.
310 */ 314 */
311 if (tk->addr == -EFAULT) { 315 if (tk->addr == -EFAULT) {
312 pr_debug("MCE: Unable to find user space address %lx in %s\n", 316 pr_info("MCE: Unable to find user space address %lx in %s\n",
313 page_to_pfn(p), tsk->comm); 317 page_to_pfn(p), tsk->comm);
314 tk->addr_valid = 0; 318 tk->addr_valid = 0;
315 } 319 }
@@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
577 pfn, err); 581 pfn, err);
578 } else if (page_has_private(p) && 582 } else if (page_has_private(p) &&
579 !try_to_release_page(p, GFP_NOIO)) { 583 !try_to_release_page(p, GFP_NOIO)) {
580 pr_debug("MCE %#lx: failed to release buffers\n", pfn); 584 pr_info("MCE %#lx: failed to release buffers\n", pfn);
581 } else { 585 } else {
582 ret = RECOVERED; 586 ret = RECOVERED;
583 } 587 }
@@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
693 * Issues: 697 * Issues:
694 * - Error on hugepage is contained in hugepage unit (not in raw page unit.) 698 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
695 * To narrow down kill region to one page, we need to break up pmd. 699 * To narrow down kill region to one page, we need to break up pmd.
696 * - To support soft-offlining for hugepage, we need to support hugepage
697 * migration.
698 */ 700 */
699static int me_huge_page(struct page *p, unsigned long pfn) 701static int me_huge_page(struct page *p, unsigned long pfn)
700{ 702{
703 int res = 0;
701 struct page *hpage = compound_head(p); 704 struct page *hpage = compound_head(p);
702 /* 705 /*
703 * We can safely recover from error on free or reserved (i.e. 706 * We can safely recover from error on free or reserved (i.e.
@@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
710 * so there is no race between isolation and mapping/unmapping. 713 * so there is no race between isolation and mapping/unmapping.
711 */ 714 */
712 if (!(page_mapping(hpage) || PageAnon(hpage))) { 715 if (!(page_mapping(hpage) || PageAnon(hpage))) {
713 __isolate_hwpoisoned_huge_page(hpage); 716 res = dequeue_hwpoisoned_huge_page(hpage);
714 return RECOVERED; 717 if (!res)
718 return RECOVERED;
715 } 719 }
716 return DELAYED; 720 return DELAYED;
717} 721}
@@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p,
836 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; 840 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
837} 841}
838 842
839#define N_UNMAP_TRIES 5
840
841/* 843/*
842 * Do all that is necessary to remove user space mappings. Unmap 844 * Do all that is necessary to remove user space mappings. Unmap
843 * the pages and send SIGBUS to the processes if the data was dirty. 845 * the pages and send SIGBUS to the processes if the data was dirty.
@@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
849 struct address_space *mapping; 851 struct address_space *mapping;
850 LIST_HEAD(tokill); 852 LIST_HEAD(tokill);
851 int ret; 853 int ret;
852 int i;
853 int kill = 1; 854 int kill = 1;
854 struct page *hpage = compound_head(p); 855 struct page *hpage = compound_head(p);
855 856
@@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
903 if (kill) 904 if (kill)
904 collect_procs(hpage, &tokill); 905 collect_procs(hpage, &tokill);
905 906
906 /* 907 ret = try_to_unmap(hpage, ttu);
907 * try_to_unmap can fail temporarily due to races.
908 * Try a few times (RED-PEN better strategy?)
909 */
910 for (i = 0; i < N_UNMAP_TRIES; i++) {
911 ret = try_to_unmap(hpage, ttu);
912 if (ret == SWAP_SUCCESS)
913 break;
914 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
915 }
916
917 if (ret != SWAP_SUCCESS) 908 if (ret != SWAP_SUCCESS)
918 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 909 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
919 pfn, page_mapcount(hpage)); 910 pfn, page_mapcount(hpage));
@@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
981 * We need/can do nothing about count=0 pages. 972 * We need/can do nothing about count=0 pages.
982 * 1) it's a free page, and therefore in safe hand: 973 * 1) it's a free page, and therefore in safe hand:
983 * prep_new_page() will be the gate keeper. 974 * prep_new_page() will be the gate keeper.
984 * 2) it's part of a non-compound high order page. 975 * 2) it's a free hugepage, which is also safe:
976 * an affected hugepage will be dequeued from hugepage freelist,
977 * so there's no concern about reusing it ever after.
978 * 3) it's part of a non-compound high order page.
985 * Implies some kernel user: cannot stop them from 979 * Implies some kernel user: cannot stop them from
986 * R/W the page; let's pray that the page has been 980 * R/W the page; let's pray that the page has been
987 * used and will be freed some time later. 981 * used and will be freed some time later.
@@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
993 if (is_free_buddy_page(p)) { 987 if (is_free_buddy_page(p)) {
994 action_result(pfn, "free buddy", DELAYED); 988 action_result(pfn, "free buddy", DELAYED);
995 return 0; 989 return 0;
990 } else if (PageHuge(hpage)) {
991 /*
992 * Check "just unpoisoned", "filter hit", and
993 * "race with other subpage."
994 */
995 lock_page_nosync(hpage);
996 if (!PageHWPoison(hpage)
997 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
998 || (p != hpage && TestSetPageHWPoison(hpage))) {
999 atomic_long_sub(nr_pages, &mce_bad_pages);
1000 return 0;
1001 }
1002 set_page_hwpoison_huge_page(hpage);
1003 res = dequeue_hwpoisoned_huge_page(hpage);
1004 action_result(pfn, "free huge",
1005 res ? IGNORED : DELAYED);
1006 unlock_page(hpage);
1007 return res;
996 } else { 1008 } else {
997 action_result(pfn, "high order kernel", IGNORED); 1009 action_result(pfn, "high order kernel", IGNORED);
998 return -EBUSY; 1010 return -EBUSY;
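
The free-hugepage branch above leans on TestSetPageHWPoison()-style atomic bit operations to resolve races: whichever context sets the poison bit first owns the bookkeeping, and everyone else backs off. A minimal userspace sketch of that idiom, with a plain word standing in for page->flags (the names here are illustrative, not the kernel API):

#include <stdio.h>
#include <stdbool.h>

static unsigned long flags;             /* stands in for page->flags */
#define PG_HWPOISON_BIT 0UL             /* illustrative bit position */

/* Atomically set the bit and report its previous state; only the
 * caller that saw "false" may account the page as newly poisoned. */
static bool test_and_set_poison(unsigned long *word)
{
	unsigned long mask = 1UL << PG_HWPOISON_BIT;
	return __atomic_fetch_or(word, mask, __ATOMIC_SEQ_CST) & mask;
}

int main(void)
{
	if (!test_and_set_poison(&flags))
		printf("first caller: do the accounting\n");
	if (test_and_set_poison(&flags))
		printf("second caller: lost the race, back off\n");
	return 0;
}
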
@@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn)
1147 page = compound_head(p); 1159 page = compound_head(p);
1148 1160
1149 if (!PageHWPoison(p)) { 1161 if (!PageHWPoison(p)) {
1150 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); 1162 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1151 return 0; 1163 return 0;
1152 } 1164 }
1153 1165
1154 nr_pages = 1 << compound_order(page); 1166 nr_pages = 1 << compound_order(page);
1155 1167
1156 if (!get_page_unless_zero(page)) { 1168 if (!get_page_unless_zero(page)) {
1169 /*
1170 * Since HWPoisoned hugepage should have non-zero refcount,
1171 * race between memory failure and unpoison seems to happen.
1172 * In such case unpoison fails and memory failure runs
1173 * to the end.
1174 */
1175 if (PageHuge(page)) {
1176 pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1177 return 0;
1178 }
1157 if (TestClearPageHWPoison(p)) 1179 if (TestClearPageHWPoison(p))
1158 atomic_long_sub(nr_pages, &mce_bad_pages); 1180 atomic_long_sub(nr_pages, &mce_bad_pages);
1159 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1181 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1160 return 0; 1182 return 0;
1161 } 1183 }
1162 1184
@@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn)
1168 * the free buddy page pool. 1190 * the free buddy page pool.
1169 */ 1191 */
1170 if (TestClearPageHWPoison(page)) { 1192 if (TestClearPageHWPoison(page)) {
1171 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1193 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1172 atomic_long_sub(nr_pages, &mce_bad_pages); 1194 atomic_long_sub(nr_pages, &mce_bad_pages);
1173 freeit = 1; 1195 freeit = 1;
1196 if (PageHuge(page))
1197 clear_page_hwpoison_huge_page(page);
1174 } 1198 }
1175 if (PageHuge(p))
1176 clear_page_hwpoison_huge_page(page);
1177 unlock_page(page); 1199 unlock_page(page);
1178 1200
1179 put_page(page); 1201 put_page(page);
@@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory);
1187static struct page *new_page(struct page *p, unsigned long private, int **x) 1209static struct page *new_page(struct page *p, unsigned long private, int **x)
1188{ 1210{
1189 int nid = page_to_nid(p); 1211 int nid = page_to_nid(p);
1190 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1212 if (PageHuge(p))
1213 return alloc_huge_page_node(page_hstate(compound_head(p)),
1214 nid);
1215 else
1216 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1191} 1217}
1192 1218
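
new_page() above is only an allocation callback: the migration core calls it once per source page to obtain a destination, passing back the opaque private argument. A hedged userspace sketch of that callback-driven shape (the struct, function names, and loop are stand-ins for the kernel's types):

#include <stdio.h>
#include <stdlib.h>

struct fake_page { int id; };

/* Same shape as the kernel's new_page_t: source page in, opaque
 * private data in, replacement page out. */
typedef struct fake_page *(*new_page_fn)(struct fake_page *old,
					 unsigned long private);

static struct fake_page *alloc_replacement(struct fake_page *old,
					   unsigned long private)
{
	struct fake_page *new = malloc(sizeof(*new));
	if (new)
		new->id = old->id + (int)private;
	return new;
}

/* The "migration core": it owns the loop, the callback owns policy. */
static void migrate_all(struct fake_page *pages, int n,
			new_page_fn get_new_page, unsigned long private)
{
	for (int i = 0; i < n; i++) {
		struct fake_page *new = get_new_page(&pages[i], private);
		if (!new)
			continue;	/* kernel would report -ENOMEM */
		printf("migrated page %d -> %d\n", pages[i].id, new->id);
		free(new);
	}
}

int main(void)
{
	struct fake_page pages[] = { { 1 }, { 2 } };
	migrate_all(pages, 2, alloc_replacement, 100);
	return 0;
}
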
1193/* 1219/*
@@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1215 * was free. 1241 * was free.
1216 */ 1242 */
1217 set_migratetype_isolate(p); 1243 set_migratetype_isolate(p);
1244 /*
1245 * When the target page is a free hugepage, just remove it
1246 * from free hugepage list.
1247 */
1218 if (!get_page_unless_zero(compound_head(p))) { 1248 if (!get_page_unless_zero(compound_head(p))) {
1219 if (is_free_buddy_page(p)) { 1249 if (PageHuge(p)) {
1220 pr_debug("get_any_page: %#lx free buddy page\n", pfn); 1250 pr_info("get_any_page: %#lx free huge page\n", pfn);
1251 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1252 } else if (is_free_buddy_page(p)) {
1253 pr_info("get_any_page: %#lx free buddy page\n", pfn);
1221 /* Set hwpoison bit while page is still isolated */ 1254 /* Set hwpoison bit while page is still isolated */
1222 SetPageHWPoison(p); 1255 SetPageHWPoison(p);
1223 ret = 0; 1256 ret = 0;
1224 } else { 1257 } else {
1225 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1258 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1226 pfn, p->flags); 1259 pfn, p->flags);
1227 ret = -EIO; 1260 ret = -EIO;
1228 } 1261 }
@@ -1235,6 +1268,46 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1235 return ret; 1268 return ret;
1236} 1269}
1237 1270
1271static int soft_offline_huge_page(struct page *page, int flags)
1272{
1273 int ret;
1274 unsigned long pfn = page_to_pfn(page);
1275 struct page *hpage = compound_head(page);
1276 LIST_HEAD(pagelist);
1277
1278 ret = get_any_page(page, pfn, flags);
1279 if (ret < 0)
1280 return ret;
1281 if (ret == 0)
1282 goto done;
1283
1284 if (PageHWPoison(hpage)) {
1285 put_page(hpage);
1286 pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
1287 return -EBUSY;
1288 }
1289
1290 /* Keep page count to indicate a given hugepage is isolated. */
1291
1292 list_add(&hpage->lru, &pagelist);
1293 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1294 if (ret) {
1295 putback_lru_pages(&pagelist);
1296 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1297 pfn, ret, page->flags);
1298 if (ret > 0)
1299 ret = -EIO;
1300 return ret;
1301 }
1302done:
1303 if (!PageHWPoison(hpage))
1304 atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
1305 set_page_hwpoison_huge_page(hpage);
1306 dequeue_hwpoisoned_huge_page(hpage);
1307 /* keep elevated page count for bad page */
1308 return ret;
1309}
1310
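
The "keep elevated page count" comment is the whole containment strategy: a reference that is taken and deliberately never dropped means the allocator can never hand the bad page out again. A toy refcount sketch of the same idea (illustrative names, not kernel code):

#include <stdio.h>

struct object { int refs; int quarantined; };

static void get_obj(struct object *o) { o->refs++; }

static void put_obj(struct object *o)
{
	if (--o->refs == 0)
		printf("object freed and reusable\n");
}

/* Take a reference on behalf of nobody: the count can no longer
 * reach zero, so the object is pinned out of circulation forever. */
static void quarantine(struct object *o)
{
	get_obj(o);
	o->quarantined = 1;
}

int main(void)
{
	struct object o = { .refs = 1 };
	quarantine(&o);
	put_obj(&o);	/* last real user drops its ref... */
	printf("refs=%d, still pinned=%d\n", o.refs, o.quarantined);
	return 0;
}
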
1238/** 1311/**
1239 * soft_offline_page - Soft offline a page. 1312 * soft_offline_page - Soft offline a page.
1240 * @page: page to offline 1313 * @page: page to offline
@@ -1262,6 +1335,9 @@ int soft_offline_page(struct page *page, int flags)
1262 int ret; 1335 int ret;
1263 unsigned long pfn = page_to_pfn(page); 1336 unsigned long pfn = page_to_pfn(page);
1264 1337
1338 if (PageHuge(page))
1339 return soft_offline_huge_page(page, flags);
1340
1265 ret = get_any_page(page, pfn, flags); 1341 ret = get_any_page(page, pfn, flags);
1266 if (ret < 0) 1342 if (ret < 0)
1267 return ret; 1343 return ret;
@@ -1288,7 +1364,7 @@ int soft_offline_page(struct page *page, int flags)
1288 goto done; 1364 goto done;
1289 } 1365 }
1290 if (!PageLRU(page)) { 1366 if (!PageLRU(page)) {
1291 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", 1367 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1292 pfn, page->flags); 1368 pfn, page->flags);
1293 return -EIO; 1369 return -EIO;
1294 } 1370 }
@@ -1302,7 +1378,7 @@ int soft_offline_page(struct page *page, int flags)
1302 if (PageHWPoison(page)) { 1378 if (PageHWPoison(page)) {
1303 unlock_page(page); 1379 unlock_page(page);
1304 put_page(page); 1380 put_page(page);
1305 pr_debug("soft offline: %#lx page already poisoned\n", pfn); 1381 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1306 return -EBUSY; 1382 return -EBUSY;
1307 } 1383 }
1308 1384
@@ -1323,7 +1399,7 @@ int soft_offline_page(struct page *page, int flags)
1323 put_page(page); 1399 put_page(page);
1324 if (ret == 1) { 1400 if (ret == 1) {
1325 ret = 0; 1401 ret = 0;
1326 pr_debug("soft_offline: %#lx: invalidated\n", pfn); 1402 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1327 goto done; 1403 goto done;
1328 } 1404 }
1329 1405
@@ -1339,13 +1415,13 @@ int soft_offline_page(struct page *page, int flags)
1339 list_add(&page->lru, &pagelist); 1415 list_add(&page->lru, &pagelist);
1340 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1416 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1341 if (ret) { 1417 if (ret) {
1342 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1418 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1343 pfn, ret, page->flags); 1419 pfn, ret, page->flags);
1344 if (ret > 0) 1420 if (ret > 0)
1345 ret = -EIO; 1421 ret = -EIO;
1346 } 1422 }
1347 } else { 1423 } else {
1348 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1424 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1349 pfn, ret, page_count(page), page->flags); 1425 pfn, ret, page_count(page), page->flags);
1350 } 1426 }
1351 if (ret) 1427 if (ret)
diff --git a/mm/memory.c b/mm/memory.c
index 98b58fecedef..02e48aa0ed13 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -736,7 +736,7 @@ again:
736 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 736 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
737 if (!dst_pte) 737 if (!dst_pte)
738 return -ENOMEM; 738 return -ENOMEM;
739 src_pte = pte_offset_map_nested(src_pmd, addr); 739 src_pte = pte_offset_map(src_pmd, addr);
740 src_ptl = pte_lockptr(src_mm, src_pmd); 740 src_ptl = pte_lockptr(src_mm, src_pmd);
741 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 741 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
742 orig_src_pte = src_pte; 742 orig_src_pte = src_pte;
@@ -767,7 +767,7 @@ again:
767 767
768 arch_leave_lazy_mmu_mode(); 768 arch_leave_lazy_mmu_mode();
769 spin_unlock(src_ptl); 769 spin_unlock(src_ptl);
770 pte_unmap_nested(orig_src_pte); 770 pte_unmap(orig_src_pte);
771 add_mm_rss_vec(dst_mm, rss); 771 add_mm_rss_vec(dst_mm, rss);
772 pte_unmap_unlock(orig_dst_pte, dst_ptl); 772 pte_unmap_unlock(orig_dst_pte, dst_ptl);
773 cond_resched(); 773 cond_resched();
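
The surrounding code takes two page-table locks at once, with spin_lock_nested(..., SINGLE_DEPTH_NESTING) telling lockdep that the double acquisition is intentional (the exclusive mmap_sem is what actually rules out deadlock here). The general two-lock idiom, sketched with pthreads and a fixed address order in place of the kernel's annotation (purely illustrative):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Acquire two locks in a globally consistent (address) order so two
 * threads locking the same pair can never deadlock. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_t *first = (uintptr_t)a < (uintptr_t)b ? a : b;
	pthread_mutex_t *second = first == a ? b : a;

	pthread_mutex_lock(first);
	if (second != first)
		pthread_mutex_lock(second);
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_t *first = (uintptr_t)a < (uintptr_t)b ? a : b;
	pthread_mutex_t *second = first == a ? b : a;

	if (second != first)
		pthread_mutex_unlock(second);
	pthread_mutex_unlock(first);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&m1, &m2);
	printf("both locks held\n");
	unlock_pair(&m1, &m2);
	return 0;
}
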
@@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1450 if (ret & VM_FAULT_OOM) 1450 if (ret & VM_FAULT_OOM)
1451 return i ? i : -ENOMEM; 1451 return i ? i : -ENOMEM;
1452 if (ret & 1452 if (ret &
1453 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) 1453 (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
1454 VM_FAULT_SIGBUS))
1454 return i ? i : -EFAULT; 1455 return i ? i : -EFAULT;
1455 BUG(); 1456 BUG();
1456 } 1457 }
@@ -1590,7 +1591,7 @@ struct page *get_dump_page(unsigned long addr)
1590} 1591}
1591#endif /* CONFIG_ELF_CORE */ 1592#endif /* CONFIG_ELF_CORE */
1592 1593
1593pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1594pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1594 spinlock_t **ptl) 1595 spinlock_t **ptl)
1595{ 1596{
1596 pgd_t * pgd = pgd_offset(mm, addr); 1597 pgd_t * pgd = pgd_offset(mm, addr);
@@ -2079,7 +2080,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2079 * zeroes. 2080 * zeroes.
2080 */ 2081 */
2081 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 2082 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2082 memset(kaddr, 0, PAGE_SIZE); 2083 clear_page(kaddr);
2083 kunmap_atomic(kaddr, KM_USER0); 2084 kunmap_atomic(kaddr, KM_USER0);
2084 flush_dcache_page(dst); 2085 flush_dcache_page(dst);
2085 } else 2086 } else
@@ -2107,6 +2108,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2107static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2108static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2108 unsigned long address, pte_t *page_table, pmd_t *pmd, 2109 unsigned long address, pte_t *page_table, pmd_t *pmd,
2109 spinlock_t *ptl, pte_t orig_pte) 2110 spinlock_t *ptl, pte_t orig_pte)
2111 __releases(ptl)
2110{ 2112{
2111 struct page *old_page, *new_page; 2113 struct page *old_page, *new_page;
2112 pte_t entry; 2114 pte_t entry;
@@ -2626,6 +2628,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2626 struct page *page, *swapcache = NULL; 2628 struct page *page, *swapcache = NULL;
2627 swp_entry_t entry; 2629 swp_entry_t entry;
2628 pte_t pte; 2630 pte_t pte;
2631 int locked;
2629 struct mem_cgroup *ptr = NULL; 2632 struct mem_cgroup *ptr = NULL;
2630 int exclusive = 0; 2633 int exclusive = 0;
2631 int ret = 0; 2634 int ret = 0;
@@ -2676,8 +2679,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2676 goto out_release; 2679 goto out_release;
2677 } 2680 }
2678 2681
2679 lock_page(page); 2682 locked = lock_page_or_retry(page, mm, flags);
2680 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2683 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2684 if (!locked) {
2685 ret |= VM_FAULT_RETRY;
2686 goto out_release;
2687 }
2681 2688
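
lock_page_or_retry() turns the unconditional sleep on the page lock into a trylock: on contention it can drop mmap_sem, wait elsewhere, and report VM_FAULT_RETRY so the whole fault is simply redone. A hedged pthread sketch of that trylock-and-retry contract (the two mutexes are stand-ins for mmap_sem and the page lock):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* ~mmap_sem */
static pthread_mutex_t page  = PTHREAD_MUTEX_INITIALIZER; /* ~page lock */

/* Try the inner lock; on contention release the outer lock and tell
 * the caller to retry the whole operation from scratch. */
static bool lock_page_or_retry_sketch(void)
{
	if (pthread_mutex_trylock(&page) == 0)
		return true;
	pthread_mutex_unlock(&outer);	/* never sleep holding it */
	return false;			/* caller sees "RETRY" */
}

int main(void)
{
	pthread_mutex_lock(&outer);
	if (lock_page_or_retry_sketch()) {
		printf("fault completes normally\n");
		pthread_mutex_unlock(&page);
		pthread_mutex_unlock(&outer);
	} else {
		printf("fault returns RETRY and is redone\n");
	}
	return 0;
}
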
2682 /* 2689 /*
2683 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not 2690 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
@@ -2926,7 +2933,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2926 vmf.page = NULL; 2933 vmf.page = NULL;
2927 2934
2928 ret = vma->vm_ops->fault(vma, &vmf); 2935 ret = vma->vm_ops->fault(vma, &vmf);
2929 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2936 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
2937 VM_FAULT_RETRY)))
2930 return ret; 2938 return ret;
2931 2939
2932 if (unlikely(PageHWPoison(vmf.page))) { 2940 if (unlikely(PageHWPoison(vmf.page))) {
@@ -3343,7 +3351,7 @@ int in_gate_area_no_task(unsigned long addr)
3343 3351
3344#endif /* __HAVE_ARCH_GATE_AREA */ 3352#endif /* __HAVE_ARCH_GATE_AREA */
3345 3353
3346static int follow_pte(struct mm_struct *mm, unsigned long address, 3354static int __follow_pte(struct mm_struct *mm, unsigned long address,
3347 pte_t **ptepp, spinlock_t **ptlp) 3355 pte_t **ptepp, spinlock_t **ptlp)
3348{ 3356{
3349 pgd_t *pgd; 3357 pgd_t *pgd;
@@ -3380,6 +3388,17 @@ out:
3380 return -EINVAL; 3388 return -EINVAL;
3381} 3389}
3382 3390
3391static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3392 pte_t **ptepp, spinlock_t **ptlp)
3393{
3394 int res;
3395
3396 /* (void) is needed to make gcc happy */
3397 (void) __cond_lock(*ptlp,
3398 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3399 return res;
3400}
3401
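
The wrapper exists purely for sparse: a function that returns 0 with a lock held and nonzero without one cannot be annotated directly, so callers go through __cond_lock(). For reference, the compiler.h definition of that era looks roughly like the following (quoted from memory, so treat it as a sketch):

/* include/linux/compiler.h, approximately: */
#ifdef __CHECKER__
# define __cond_lock(x, c)	((c) ? ({ __acquire(x); 1; }) : 0)
#else
# define __cond_lock(x, c)	(c)
#endif

Under sparse (__CHECKER__) the macro tells the checker that lock x is acquired exactly when the condition is true; in a real build it compiles away to the bare expression, which is why the wrapper costs nothing at runtime.
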
3383/** 3402/**
3384 * follow_pfn - look up PFN at a user virtual address 3403 * follow_pfn - look up PFN at a user virtual address
3385 * @vma: memory mapping 3404 * @vma: memory mapping
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d4e940a26945..9260314a221e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -602,27 +602,14 @@ static struct page *next_active_pageblock(struct page *page)
602/* Checks if this range of memory is likely to be hot-removable. */ 602/* Checks if this range of memory is likely to be hot-removable. */
603int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 603int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
604{ 604{
605 int type;
606 struct page *page = pfn_to_page(start_pfn); 605 struct page *page = pfn_to_page(start_pfn);
607 struct page *end_page = page + nr_pages; 606 struct page *end_page = page + nr_pages;
608 607
609 /* Check the starting page of each pageblock within the range */ 608 /* Check the starting page of each pageblock within the range */
610 for (; page < end_page; page = next_active_pageblock(page)) { 609 for (; page < end_page; page = next_active_pageblock(page)) {
611 type = get_pageblock_migratetype(page); 610 if (!is_pageblock_removable_nolock(page))
612
613 /*
614 * A pageblock containing MOVABLE or free pages is considered
615 * removable
616 */
617 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
618 return 0;
619
620 /*
621 * A pageblock starting with a PageReserved page is not
622 * considered removable.
623 */
624 if (PageReserved(page))
625 return 0; 611 return 0;
612 cond_resched();
626 } 613 }
627 614
628 /* All pageblocks in the memory block are likely to be hot-removable */ 615 /* All pageblocks in the memory block are likely to be hot-removable */
@@ -659,7 +646,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
659 * Scanning pfn is much easier than scanning lru list. 646 * Scanning pfn is much easier than scanning lru list.
660 * Scan pfn from start to end and Find LRU page. 647 * Scan pfn from start to end and Find LRU page.
661 */ 648 */
662int scan_lru_pages(unsigned long start, unsigned long end) 649static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
663{ 650{
664 unsigned long pfn; 651 unsigned long pfn;
665 struct page *page; 652 struct page *page;
@@ -709,29 +696,30 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
709 page_is_file_cache(page)); 696 page_is_file_cache(page));
710 697
711 } else { 698 } else {
712 /* Because we don't have big zone->lock, we should
713 check this again here. */
714 if (page_count(page))
715 not_managed++;
716#ifdef CONFIG_DEBUG_VM 699#ifdef CONFIG_DEBUG_VM
717 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 700 printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
718 pfn); 701 pfn);
719 dump_page(page); 702 dump_page(page);
720#endif 703#endif
704 /* Because we don't have big zone->lock, we should
705 check this again here. */
706 if (page_count(page)) {
707 not_managed++;
708 ret = -EBUSY;
709 break;
710 }
721 } 711 }
722 } 712 }
723 ret = -EBUSY; 713 if (!list_empty(&source)) {
724 if (not_managed) { 714 if (not_managed) {
725 if (!list_empty(&source)) 715 putback_lru_pages(&source);
716 goto out;
717 }
718 /* this function returns # of failed pages */
719 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
720 if (ret)
726 putback_lru_pages(&source); 721 putback_lru_pages(&source);
727 goto out;
728 } 722 }
729 ret = 0;
730 if (list_empty(&source))
731 goto out;
732 /* this function returns # of failed pages */
733 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
734
735out: 723out:
736 return ret; 724 return ret;
737} 725}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f969da5dd8a2..4a57f135b76e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -924,15 +924,21 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
924 nodemask_t nmask; 924 nodemask_t nmask;
925 LIST_HEAD(pagelist); 925 LIST_HEAD(pagelist);
926 int err = 0; 926 int err = 0;
927 struct vm_area_struct *vma;
927 928
928 nodes_clear(nmask); 929 nodes_clear(nmask);
929 node_set(source, nmask); 930 node_set(source, nmask);
930 931
931 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 932 vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
932 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 933 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
934 if (IS_ERR(vma))
935 return PTR_ERR(vma);
933 936
934 if (!list_empty(&pagelist)) 937 if (!list_empty(&pagelist)) {
935 err = migrate_pages(&pagelist, new_node_page, dest, 0); 938 err = migrate_pages(&pagelist, new_node_page, dest, 0);
939 if (err)
940 putback_lru_pages(&pagelist);
941 }
936 942
937 return err; 943 return err;
938} 944}
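
check_range() now reports failure through the pointer itself, which is why the caller tests IS_ERR(vma) and unwraps with PTR_ERR(). The convention packs a small negative errno into otherwise-invalid pointer values; a self-contained userspace rendering of it (simplified from the kernel's err.h):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }

/* The top MAX_ERRNO addresses can never be valid objects, so any
 * pointer in that band is really an encoded -errno. */
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *lookup(int fail)
{
	static int object = 42;
	return fail ? ERR_PTR(-EFAULT) : &object;
}

int main(void)
{
	void *p = lookup(1);
	if (IS_ERR(p))
		printf("lookup failed: errno %ld\n", -PTR_ERR(p));
	return 0;
}
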
@@ -1147,9 +1153,12 @@ static long do_mbind(unsigned long start, unsigned long len,
1147 1153
1148 err = mbind_range(mm, start, end, new); 1154 err = mbind_range(mm, start, end, new);
1149 1155
1150 if (!list_empty(&pagelist)) 1156 if (!list_empty(&pagelist)) {
1151 nr_failed = migrate_pages(&pagelist, new_vma_page, 1157 nr_failed = migrate_pages(&pagelist, new_vma_page,
1152 (unsigned long)vma, 0); 1158 (unsigned long)vma, 0);
1159 if (nr_failed)
1160 putback_lru_pages(&pagelist);
1161 }
1153 1162
1154 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1163 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1155 err = -EIO; 1164 err = -EIO;
@@ -1588,7 +1597,7 @@ unsigned slab_node(struct mempolicy *policy)
1588 (void)first_zones_zonelist(zonelist, highest_zoneidx, 1597 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1589 &policy->v.nodes, 1598 &policy->v.nodes,
1590 &zone); 1599 &zone);
1591 return zone->node; 1600 return zone ? zone->node : numa_node_id();
1592 } 1601 }
1593 1602
1594 default: 1603 default:
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..fe5a3c6a5426 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h>
35#include <linux/gfp.h> 36#include <linux/gfp.h>
36 37
37#include "internal.h" 38#include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
95 pte_t *ptep, pte; 96 pte_t *ptep, pte;
96 spinlock_t *ptl; 97 spinlock_t *ptl;
97 98
98 pgd = pgd_offset(mm, addr); 99 if (unlikely(PageHuge(new))) {
99 if (!pgd_present(*pgd)) 100 ptep = huge_pte_offset(mm, addr);
100 goto out; 101 if (!ptep)
102 goto out;
103 ptl = &mm->page_table_lock;
104 } else {
105 pgd = pgd_offset(mm, addr);
106 if (!pgd_present(*pgd))
107 goto out;
101 108
102 pud = pud_offset(pgd, addr); 109 pud = pud_offset(pgd, addr);
103 if (!pud_present(*pud)) 110 if (!pud_present(*pud))
104 goto out; 111 goto out;
105 112
106 pmd = pmd_offset(pud, addr); 113 pmd = pmd_offset(pud, addr);
107 if (!pmd_present(*pmd)) 114 if (!pmd_present(*pmd))
108 goto out; 115 goto out;
109 116
110 ptep = pte_offset_map(pmd, addr); 117 ptep = pte_offset_map(pmd, addr);
111 118
112 if (!is_swap_pte(*ptep)) { 119 if (!is_swap_pte(*ptep)) {
113 pte_unmap(ptep); 120 pte_unmap(ptep);
114 goto out; 121 goto out;
115 } 122 }
123
124 ptl = pte_lockptr(mm, pmd);
125 }
116 126
117 ptl = pte_lockptr(mm, pmd);
118 spin_lock(ptl); 127 spin_lock(ptl);
119 pte = *ptep; 128 pte = *ptep;
120 if (!is_swap_pte(pte)) 129 if (!is_swap_pte(pte))
@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
130 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 139 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
131 if (is_write_migration_entry(entry)) 140 if (is_write_migration_entry(entry))
132 pte = pte_mkwrite(pte); 141 pte = pte_mkwrite(pte);
142#ifdef CONFIG_HUGETLB_PAGE
143 if (PageHuge(new))
144 pte = pte_mkhuge(pte);
145#endif
133 flush_cache_page(vma, addr, pte_pfn(pte)); 146 flush_cache_page(vma, addr, pte_pfn(pte));
134 set_pte_at(mm, addr, ptep, pte); 147 set_pte_at(mm, addr, ptep, pte);
135 148
136 if (PageAnon(new)) 149 if (PageHuge(new)) {
150 if (PageAnon(new))
151 hugepage_add_anon_rmap(new, vma, addr);
152 else
153 page_dup_rmap(new);
154 } else if (PageAnon(new))
137 page_add_anon_rmap(new, vma, addr); 155 page_add_anon_rmap(new, vma, addr);
138 else 156 else
139 page_add_file_rmap(new); 157 page_add_file_rmap(new);
@@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
276} 294}
277 295
278/* 296/*
297 * The expected number of remaining references is the same as that
298 * of migrate_page_move_mapping().
299 */
300int migrate_huge_page_move_mapping(struct address_space *mapping,
301 struct page *newpage, struct page *page)
302{
303 int expected_count;
304 void **pslot;
305
306 if (!mapping) {
307 if (page_count(page) != 1)
308 return -EAGAIN;
309 return 0;
310 }
311
312 spin_lock_irq(&mapping->tree_lock);
313
314 pslot = radix_tree_lookup_slot(&mapping->page_tree,
315 page_index(page));
316
317 expected_count = 2 + page_has_private(page);
318 if (page_count(page) != expected_count ||
319 (struct page *)radix_tree_deref_slot(pslot) != page) {
320 spin_unlock_irq(&mapping->tree_lock);
321 return -EAGAIN;
322 }
323
324 if (!page_freeze_refs(page, expected_count)) {
325 spin_unlock_irq(&mapping->tree_lock);
326 return -EAGAIN;
327 }
328
329 get_page(newpage);
330
331 radix_tree_replace_slot(pslot, newpage);
332
333 page_unfreeze_refs(page, expected_count);
334
335 __put_page(page);
336
337 spin_unlock_irq(&mapping->tree_lock);
338 return 0;
339}
340
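
page_freeze_refs() is the linchpin of the slot replacement above: a compare-and-swap that drops the refcount to zero only if it still equals the expected value, guaranteeing nobody holds a surprise reference while the radix-tree slot is swapped. A C11 atomics sketch of the freeze/replace/unfreeze sequence (illustrative, not the kernel implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int refs;		/* stands in for page->_count */
static void *slot;		/* stands in for the radix-tree slot */

/* Succeed only if the count is exactly what we expect, parking it
 * at zero so no concurrent get/put can slip in mid-replacement. */
static bool freeze_refs(int expected)
{
	int old = expected;
	return atomic_compare_exchange_strong(&refs, &old, 0);
}

static void unfreeze_refs(int expected)
{
	atomic_store(&refs, expected);
}

int main(void)
{
	int pages[2] = { 1, 2 };

	atomic_init(&refs, 2);
	slot = &pages[0];

	if (freeze_refs(2)) {		/* count held by us + the tree */
		slot = &pages[1];	/* radix_tree_replace_slot() */
		unfreeze_refs(2);
		printf("slot now holds page %d\n", *(int *)slot);
	}
	return 0;
}
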
341/*
279 * Copy the page to its new location 342 * Copy the page to its new location
280 */ 343 */
281static void migrate_page_copy(struct page *newpage, struct page *page) 344void migrate_page_copy(struct page *newpage, struct page *page)
282{ 345{
283 copy_highpage(newpage, page); 346 if (PageHuge(page))
347 copy_huge_page(newpage, page);
348 else
349 copy_highpage(newpage, page);
284 350
285 if (PageError(page)) 351 if (PageError(page))
286 SetPageError(newpage); 352 SetPageError(newpage);
@@ -431,7 +497,6 @@ static int writeout(struct address_space *mapping, struct page *page)
431 .nr_to_write = 1, 497 .nr_to_write = 1,
432 .range_start = 0, 498 .range_start = 0,
433 .range_end = LLONG_MAX, 499 .range_end = LLONG_MAX,
434 .nonblocking = 1,
435 .for_reclaim = 1 500 .for_reclaim = 1
436 }; 501 };
437 int rc; 502 int rc;
@@ -724,6 +789,92 @@ move_newpage:
724} 789}
725 790
726/* 791/*
792 * Counterpart of unmap_and_move() for hugepage migration.
793 *
794 * This function doesn't wait for the completion of hugepage I/O
795 * because there is no race between I/O and migration for hugepages.
796 * Note that currently hugepage I/O occurs only in direct I/O
797 * where no lock is held and PG_writeback is irrelevant,
798 * and the writeback status of all subpages is counted in the reference
799 * count of the head page (i.e. if all subpages of a 2MB hugepage are
800 * under direct I/O, the reference count of the head page is 512 and a bit more.)
801 * This means that when we try to migrate a hugepage whose subpages are
802 * doing direct I/O, some references remain after try_to_unmap() and
803 * hugepage migration fails without data corruption.
804 *
805 * There is also no race when direct I/O is issued on the page under migration,
806 * because then pte is replaced with migration swap entry and direct I/O code
807 * will wait in the page fault for migration to complete.
808 */
809static int unmap_and_move_huge_page(new_page_t get_new_page,
810 unsigned long private, struct page *hpage,
811 int force, int offlining)
812{
813 int rc = 0;
814 int *result = NULL;
815 struct page *new_hpage = get_new_page(hpage, private, &result);
816 int rcu_locked = 0;
817 struct anon_vma *anon_vma = NULL;
818
819 if (!new_hpage)
820 return -ENOMEM;
821
822 rc = -EAGAIN;
823
824 if (!trylock_page(hpage)) {
825 if (!force)
826 goto out;
827 lock_page(hpage);
828 }
829
830 if (PageAnon(hpage)) {
831 rcu_read_lock();
832 rcu_locked = 1;
833
834 if (page_mapped(hpage)) {
835 anon_vma = page_anon_vma(hpage);
836 atomic_inc(&anon_vma->external_refcount);
837 }
838 }
839
840 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
841
842 if (!page_mapped(hpage))
843 rc = move_to_new_page(new_hpage, hpage, 1);
844
845 if (rc)
846 remove_migration_ptes(hpage, hpage);
847
848 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
849 &anon_vma->lock)) {
850 int empty = list_empty(&anon_vma->head);
851 spin_unlock(&anon_vma->lock);
852 if (empty)
853 anon_vma_free(anon_vma);
854 }
855
856 if (rcu_locked)
857 rcu_read_unlock();
858out:
859 unlock_page(hpage);
860
861 if (rc != -EAGAIN) {
862 list_del(&hpage->lru);
863 put_page(hpage);
864 }
865
866 put_page(new_hpage);
867
868 if (result) {
869 if (rc)
870 *result = rc;
871 else
872 *result = page_to_nid(new_hpage);
873 }
874 return rc;
875}
876
877/*
727 * migrate_pages 878 * migrate_pages
728 * 879 *
729 * The function takes one list of pages to migrate and a function 880 * The function takes one list of pages to migrate and a function
@@ -732,8 +883,9 @@ move_newpage:
732 * 883 *
733 * The function returns after 10 attempts or if no pages 884 * The function returns after 10 attempts or if no pages
734 * are movable anymore because to has become empty 885 * are movable anymore because to has become empty
735 * or no retryable pages exist anymore. All pages will be 886 * or no retryable pages exist anymore.
736 * returned to the LRU or freed. 887 * Caller should call putback_lru_pages to return pages to the LRU
888 * or free list.
737 * 889 *
738 * Return: Number of pages not migrated or error code. 890 * Return: Number of pages not migrated or error code.
739 */ 891 */
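
With this change the migration core no longer cleans up after itself, so every caller in this series grows the same epilogue. A minimal kernel-style sketch of the new contract, mirroring the callers patched below:

	LIST_HEAD(pagelist);
	int err;

	/* ... isolate candidate pages onto &pagelist ... */

	err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm, 0);
	if (err)
		/* Migration gave up on these pages; hand them back to
		 * the LRU ourselves, since migrate_pages() no longer
		 * does it. */
		putback_lru_pages(&pagelist);
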
@@ -780,7 +932,51 @@ out:
780 if (!swapwrite) 932 if (!swapwrite)
781 current->flags &= ~PF_SWAPWRITE; 933 current->flags &= ~PF_SWAPWRITE;
782 934
783 putback_lru_pages(from); 935 if (rc)
936 return rc;
937
938 return nr_failed + retry;
939}
940
941int migrate_huge_pages(struct list_head *from,
942 new_page_t get_new_page, unsigned long private, int offlining)
943{
944 int retry = 1;
945 int nr_failed = 0;
946 int pass = 0;
947 struct page *page;
948 struct page *page2;
949 int rc;
950
951 for (pass = 0; pass < 10 && retry; pass++) {
952 retry = 0;
953
954 list_for_each_entry_safe(page, page2, from, lru) {
955 cond_resched();
956
957 rc = unmap_and_move_huge_page(get_new_page,
958 private, page, pass > 2, offlining);
959
960 switch(rc) {
961 case -ENOMEM:
962 goto out;
963 case -EAGAIN:
964 retry++;
965 break;
966 case 0:
967 break;
968 default:
969 /* Permanent failure */
970 nr_failed++;
971 break;
972 }
973 }
974 }
975 rc = 0;
976out:
977
978 list_for_each_entry_safe(page, page2, from, lru)
979 put_page(page);
784 980
785 if (rc) 981 if (rc)
786 return rc; 982 return rc;
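
migrate_huge_pages() reuses the pass structure of migrate_pages(): up to ten sweeps, with -EAGAIN meaning "try this one again next pass", -ENOMEM aborting outright, and anything else counting as a permanent failure. A userspace sketch of that error-triage loop over a plain array (the work items and return values are illustrative):

#include <errno.h>
#include <stdio.h>

/* Fake per-item work: item 0 needs one extra pass, item 1 never works. */
static int try_one(int id, int pass)
{
	if (id == 0)
		return pass == 0 ? -EAGAIN : 0;
	return -EIO;
}

int main(void)
{
	int done[2] = { 0, 0 };
	int nr_failed = 0, retry = 1;

	for (int pass = 0; pass < 10 && retry; pass++) {
		retry = 0;
		for (int id = 0; id < 2; id++) {
			if (done[id])
				continue;	/* off the work list */
			switch (try_one(id, pass)) {
			case -ENOMEM:
				return 1;	/* abort everything */
			case -EAGAIN:
				retry++;	/* transient: next pass */
				break;
			case 0:
				done[id] = 1;	/* migrated */
				break;
			default:
				done[id] = 1;	/* permanent failure */
				nr_failed++;
				break;
			}
		}
	}
	printf("permanent failures: %d\n", nr_failed);
	return 0;
}
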
@@ -841,7 +1037,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
841 1037
842 err = -EFAULT; 1038 err = -EFAULT;
843 vma = find_vma(mm, pp->addr); 1039 vma = find_vma(mm, pp->addr);
844 if (!vma || !vma_migratable(vma)) 1040 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
845 goto set_status; 1041 goto set_status;
846 1042
847 page = follow_page(vma, pp->addr, FOLL_GET); 1043 page = follow_page(vma, pp->addr, FOLL_GET);
@@ -890,9 +1086,12 @@ set_status:
890 } 1086 }
891 1087
892 err = 0; 1088 err = 0;
893 if (!list_empty(&pagelist)) 1089 if (!list_empty(&pagelist)) {
894 err = migrate_pages(&pagelist, new_page_node, 1090 err = migrate_pages(&pagelist, new_page_node,
895 (unsigned long)pm, 0); 1091 (unsigned long)pm, 0);
1092 if (err)
1093 putback_lru_pages(&pagelist);
1094 }
896 1095
897 up_read(&mm->mmap_sem); 1096 up_read(&mm->mmap_sem);
898 return err; 1097 return err;
@@ -1005,7 +1204,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1005 int err = -EFAULT; 1204 int err = -EFAULT;
1006 1205
1007 vma = find_vma(mm, addr); 1206 vma = find_vma(mm, addr);
1008 if (!vma) 1207 if (!vma || addr < vma->vm_start)
1009 goto set_status; 1208 goto set_status;
1010 1209
1011 page = follow_page(vma, addr, 0); 1210 page = follow_page(vma, addr, 0);
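
Both hunks here fix the same pitfall: find_vma() returns the first VMA whose end lies above the address, so a non-NULL result may still start beyond the address when it falls in a gap. A self-contained sketch of the lookup and the necessary guard (simplified, an array instead of the rbtree):

#include <stdio.h>

struct vma { unsigned long vm_start, vm_end; };

/* First region ending above addr -- exactly find_vma()'s contract,
 * which says nothing about addr being >= vm_start. */
static struct vma *find_vma_sketch(struct vma *v, int n, unsigned long addr)
{
	for (int i = 0; i < n; i++)
		if (addr < v[i].vm_end)
			return &v[i];
	return NULL;
}

int main(void)
{
	struct vma maps[] = {
		{ 0x1000, 0x2000 },
		{ 0x5000, 0x6000 },
	};
	unsigned long addr = 0x3000;	/* in the gap between the two */
	struct vma *vma = find_vma_sketch(maps, 2, addr);

	if (!vma || addr < vma->vm_start)	/* the added guard */
		printf("addr %#lx is unmapped\n", addr);
	return 0;
}
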
diff --git a/mm/mmap.c b/mm/mmap.c
index 00161a48a451..b179abb1474a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -1108,6 +1109,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1108 unsigned long retval = -EBADF; 1109 unsigned long retval = -EBADF;
1109 1110
1110 if (!(flags & MAP_ANONYMOUS)) { 1111 if (!(flags & MAP_ANONYMOUS)) {
1112 audit_mmap_fd(fd, flags);
1111 if (unlikely(flags & MAP_HUGETLB)) 1113 if (unlikely(flags & MAP_HUGETLB))
1112 return -EINVAL; 1114 return -EINVAL;
1113 file = fget(fd); 1115 file = fget(fd);
diff --git a/mm/mremap.c b/mm/mremap.c
index cde56ee51ef7..563fbdd6293a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -101,7 +101,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
101 * pte locks because exclusive mmap_sem prevents deadlock. 101 * pte locks because exclusive mmap_sem prevents deadlock.
102 */ 102 */
103 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); 103 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
104 new_pte = pte_offset_map_nested(new_pmd, new_addr); 104 new_pte = pte_offset_map(new_pmd, new_addr);
105 new_ptl = pte_lockptr(mm, new_pmd); 105 new_ptl = pte_lockptr(mm, new_pmd);
106 if (new_ptl != old_ptl) 106 if (new_ptl != old_ptl)
107 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 107 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
119 arch_leave_lazy_mmu_mode(); 119 arch_leave_lazy_mmu_mode();
120 if (new_ptl != old_ptl) 120 if (new_ptl != old_ptl)
121 spin_unlock(new_ptl); 121 spin_unlock(new_ptl);
122 pte_unmap_nested(new_pte - 1); 122 pte_unmap(new_pte - 1);
123 pte_unmap_unlock(old_pte - 1, old_ptl); 123 pte_unmap_unlock(old_pte - 1, old_ptl);
124 if (mapping) 124 if (mapping)
125 spin_unlock(&mapping->i_mmap_lock); 125 spin_unlock(&mapping->i_mmap_lock);
diff --git a/mm/nommu.c b/mm/nommu.c
index 88ff091eb07a..3613517c7592 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -29,6 +29,7 @@
29#include <linux/personality.h> 29#include <linux/personality.h>
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/audit.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/tlb.h> 35#include <asm/tlb.h>
@@ -293,11 +294,58 @@ void *vmalloc(unsigned long size)
293} 294}
294EXPORT_SYMBOL(vmalloc); 295EXPORT_SYMBOL(vmalloc);
295 296
297/*
298 * vzalloc - allocate virtually contiguous memory with zero fill
299 *
300 * @size: allocation size
301 *
302 * Allocate enough pages to cover @size from the page level
303 * allocator and map them into contiguous kernel virtual space.
304 * The memory allocated is set to zero.
305 *
306 * For tight control over page level allocator and protection flags
307 * use __vmalloc() instead.
308 */
309void *vzalloc(unsigned long size)
310{
311 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
312 PAGE_KERNEL);
313}
314EXPORT_SYMBOL(vzalloc);
315
316/**
317 * vmalloc_node - allocate memory on a specific node
318 * @size: allocation size
319 * @node: numa node
320 *
321 * Allocate enough pages to cover @size from the page level
322 * allocator and map them into contiguous kernel virtual space.
323 *
324 * For tight control over page level allocator and protection flags
325 * use __vmalloc() instead.
326 */
296void *vmalloc_node(unsigned long size, int node) 327void *vmalloc_node(unsigned long size, int node)
297{ 328{
298 return vmalloc(size); 329 return vmalloc(size);
299} 330}
300EXPORT_SYMBOL(vmalloc_node); 331
332/**
333 * vzalloc_node - allocate memory on a specific node with zero fill
334 * @size: allocation size
335 * @node: numa node
336 *
337 * Allocate enough pages to cover @size from the page level
338 * allocator and map them into contiguous kernel virtual space.
339 * The memory allocated is set to zero.
340 *
341 * For tight control over page level allocator and protection flags
342 * use __vmalloc() instead.
343 */
344void *vzalloc_node(unsigned long size, int node)
345{
346 return vzalloc(size);
347}
348EXPORT_SYMBOL(vzalloc_node);
301 349
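
The point of the new vzalloc()/vzalloc_node() entry points is to retire open-coded vmalloc()-plus-memset() pairs in callers. A hedged kernel-style before/after (the struct name is made up):

	struct big_table *t;

	/* before */
	t = vmalloc(sizeof(*t));
	if (!t)
		return -ENOMEM;
	memset(t, 0, sizeof(*t));

	/* after */
	t = vzalloc(sizeof(*t));
	if (!t)
		return -ENOMEM;

	/* ... */
	vfree(t);
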
302#ifndef PAGE_KERNEL_EXEC 350#ifndef PAGE_KERNEL_EXEC
303# define PAGE_KERNEL_EXEC PAGE_KERNEL 351# define PAGE_KERNEL_EXEC PAGE_KERNEL
@@ -1411,6 +1459,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1411 struct file *file = NULL; 1459 struct file *file = NULL;
1412 unsigned long retval = -EBADF; 1460 unsigned long retval = -EBADF;
1413 1461
1462 audit_mmap_fd(fd, flags);
1414 if (!(flags & MAP_ANONYMOUS)) { 1463 if (!(flags & MAP_ANONYMOUS)) {
1415 file = fget(fd); 1464 file = fget(fd);
1416 if (!file) 1465 if (!file)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4029583a1024..7dcca55ede7c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -162,10 +162,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
162 return 0; 162 return 0;
163 163
164 /* 164 /*
165 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't 165 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
166 * need to be executed for something that cannot be killed. 166 * so the entire heuristic doesn't need to be executed for something
167 * that cannot be killed.
167 */ 168 */
168 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { 169 if (atomic_read(&p->mm->oom_disable_count)) {
169 task_unlock(p); 170 task_unlock(p);
170 return 0; 171 return 0;
171 } 172 }
@@ -403,16 +404,40 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
403#define K(x) ((x) << (PAGE_SHIFT-10)) 404#define K(x) ((x) << (PAGE_SHIFT-10))
404static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) 405static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
405{ 406{
407 struct task_struct *q;
408 struct mm_struct *mm;
409
406 p = find_lock_task_mm(p); 410 p = find_lock_task_mm(p);
407 if (!p) 411 if (!p)
408 return 1; 412 return 1;
409 413
414 /* mm cannot be safely dereferenced after task_unlock(p) */
415 mm = p->mm;
416
410 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 417 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
411 task_pid_nr(p), p->comm, K(p->mm->total_vm), 418 task_pid_nr(p), p->comm, K(p->mm->total_vm),
412 K(get_mm_counter(p->mm, MM_ANONPAGES)), 419 K(get_mm_counter(p->mm, MM_ANONPAGES)),
413 K(get_mm_counter(p->mm, MM_FILEPAGES))); 420 K(get_mm_counter(p->mm, MM_FILEPAGES)));
414 task_unlock(p); 421 task_unlock(p);
415 422
423 /*
424 * Kill all processes sharing p->mm in other thread groups, if any.
425 * They don't get access to memory reserves or a higher scheduler
426 * priority, though, to avoid depletion of all memory or task
427 * starvation. This prevents mm->mmap_sem livelock when an oom killed
428 * task cannot exit because it requires the semaphore and it's contended
429 * by another thread trying to allocate memory itself. That thread will
430 * now get access to memory reserves since it has a pending fatal
431 * signal.
432 */
433 for_each_process(q)
434 if (q->mm == mm && !same_thread_group(q, p)) {
435 task_lock(q); /* Protect ->comm from prctl() */
436 pr_err("Kill process %d (%s) sharing same memory\n",
437 task_pid_nr(q), q->comm);
438 task_unlock(q);
439 force_sig(SIGKILL, q);
440 }
416 441
417 set_tsk_thread_flag(p, TIF_MEMDIE); 442 set_tsk_thread_flag(p, TIF_MEMDIE);
418 force_sig(SIGKILL, p); 443 force_sig(SIGKILL, p);
@@ -680,7 +705,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
680 read_lock(&tasklist_lock); 705 read_lock(&tasklist_lock);
681 if (sysctl_oom_kill_allocating_task && 706 if (sysctl_oom_kill_allocating_task &&
682 !oom_unkillable_task(current, NULL, nodemask) && 707 !oom_unkillable_task(current, NULL, nodemask) &&
683 (current->signal->oom_adj != OOM_DISABLE)) { 708 current->mm && !atomic_read(&current->mm->oom_disable_count)) {
684 /* 709 /*
685 * oom_kill_process() needs tasklist_lock held. If it returns 710 * oom_kill_process() needs tasklist_lock held. If it returns
686 * non-zero, current could not be killed so we must fallback to 711 * non-zero, current could not be killed so we must fallback to
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e3bccac1f025..b840afa89761 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -415,14 +415,8 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
415 415
416 if (vm_dirty_bytes) 416 if (vm_dirty_bytes)
417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
418 else { 418 else
419 int dirty_ratio; 419 dirty = (vm_dirty_ratio * available_memory) / 100;
420
421 dirty_ratio = vm_dirty_ratio;
422 if (dirty_ratio < 5)
423 dirty_ratio = 5;
424 dirty = (dirty_ratio * available_memory) / 100;
425 }
426 420
427 if (dirty_background_bytes) 421 if (dirty_background_bytes)
428 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); 422 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
@@ -510,7 +504,7 @@ static void balance_dirty_pages(struct address_space *mapping,
510 * catch-up. This avoids (excessively) small writeouts 504 * catch-up. This avoids (excessively) small writeouts
511 * when the bdi limits are ramping up. 505 * when the bdi limits are ramping up.
512 */ 506 */
513 if (nr_reclaimable + nr_writeback < 507 if (nr_reclaimable + nr_writeback <=
514 (background_thresh + dirty_thresh) / 2) 508 (background_thresh + dirty_thresh) / 2)
515 break; 509 break;
516 510
@@ -542,8 +536,8 @@ static void balance_dirty_pages(struct address_space *mapping,
542 * the last resort safeguard. 536 * the last resort safeguard.
543 */ 537 */
544 dirty_exceeded = 538 dirty_exceeded =
545 (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) 539 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
546 || (nr_reclaimable + nr_writeback >= dirty_thresh); 540 || (nr_reclaimable + nr_writeback > dirty_thresh);
547 541
548 if (!dirty_exceeded) 542 if (!dirty_exceeded)
549 break; 543 break;
@@ -1121,6 +1115,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1121{ 1115{
1122 if (mapping_cap_account_dirty(mapping)) { 1116 if (mapping_cap_account_dirty(mapping)) {
1123 __inc_zone_page_state(page, NR_FILE_DIRTY); 1117 __inc_zone_page_state(page, NR_FILE_DIRTY);
1118 __inc_zone_page_state(page, NR_DIRTIED);
1124 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1119 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1125 task_dirty_inc(current); 1120 task_dirty_inc(current);
1126 task_io_account_write(PAGE_CACHE_SIZE); 1121 task_io_account_write(PAGE_CACHE_SIZE);
@@ -1129,6 +1124,18 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1129EXPORT_SYMBOL(account_page_dirtied); 1124EXPORT_SYMBOL(account_page_dirtied);
1130 1125
1131/* 1126/*
1127 * Helper function for set_page_writeback family.
1128 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
1129 * wrt interrupts.
1130 */
1131void account_page_writeback(struct page *page)
1132{
1133 inc_zone_page_state(page, NR_WRITEBACK);
1134 inc_zone_page_state(page, NR_WRITTEN);
1135}
1136EXPORT_SYMBOL(account_page_writeback);
1137
1138/*
1132 * For address_spaces which do not use buffers. Just tag the page as dirty in 1139 * For address_spaces which do not use buffers. Just tag the page as dirty in
1133 * its radix tree. 1140 * its radix tree.
1134 * 1141 *
@@ -1366,7 +1373,7 @@ int test_set_page_writeback(struct page *page)
1366 ret = TestSetPageWriteback(page); 1373 ret = TestSetPageWriteback(page);
1367 } 1374 }
1368 if (!ret) 1375 if (!ret)
1369 inc_zone_page_state(page, NR_WRITEBACK); 1376 account_page_writeback(page);
1370 return ret; 1377 return ret;
1371 1378
1372} 1379}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2a362c52fdf4..07a654486f75 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -531,7 +531,7 @@ static inline void __free_one_page(struct page *page,
531 * so it's less likely to be used soon and more likely to be merged 531 * so it's less likely to be used soon and more likely to be merged
532 * as a higher order page 532 * as a higher order page
533 */ 533 */
534 if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { 534 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
535 struct page *higher_page, *higher_buddy; 535 struct page *higher_page, *higher_buddy;
536 combined_idx = __find_combined_index(page_idx, order); 536 combined_idx = __find_combined_index(page_idx, order);
537 higher_page = page + combined_idx - page_idx; 537 higher_page = page + combined_idx - page_idx;
@@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1907 preferred_zone, migratetype); 1907 preferred_zone, migratetype);
1908 1908
1909 if (!page && gfp_mask & __GFP_NOFAIL) 1909 if (!page && gfp_mask & __GFP_NOFAIL)
1910 congestion_wait(BLK_RW_ASYNC, HZ/50); 1910 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
1911 } while (!page && (gfp_mask & __GFP_NOFAIL)); 1911 } while (!page && (gfp_mask & __GFP_NOFAIL));
1912 1912
1913 return page; 1913 return page;
@@ -1932,7 +1932,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1932 const gfp_t wait = gfp_mask & __GFP_WAIT; 1932 const gfp_t wait = gfp_mask & __GFP_WAIT;
1933 1933
1934 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 1934 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1935 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); 1935 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
1936 1936
1937 /* 1937 /*
1938 * The caller may dip into page reserves a bit more if the caller 1938 * The caller may dip into page reserves a bit more if the caller
@@ -1940,7 +1940,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1940 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 1940 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1941 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 1941 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1942 */ 1942 */
1943 alloc_flags |= (gfp_mask & __GFP_HIGH); 1943 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1944 1944
1945 if (!wait) { 1945 if (!wait) {
1946 alloc_flags |= ALLOC_HARDER; 1946 alloc_flags |= ALLOC_HARDER;
@@ -2095,7 +2095,7 @@ rebalance:
2095 pages_reclaimed += did_some_progress; 2095 pages_reclaimed += did_some_progress;
2096 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 2096 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
2097 /* Wait for some write requests to complete then retry */ 2097 /* Wait for some write requests to complete then retry */
2098 congestion_wait(BLK_RW_ASYNC, HZ/50); 2098 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2099 goto rebalance; 2099 goto rebalance;
2100 } 2100 }
2101 2101
@@ -5297,12 +5297,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5297 * page allocator never allocates memory from an ISOLATE block. 5297
5298 */ 5298 */
5299 5299
5300static int
5301__count_immobile_pages(struct zone *zone, struct page *page, int count)
5302{
5303 unsigned long pfn, iter, found;
5304 /*
5305 * To avoid noisy data, lru_add_drain_all() should be called first.
5306 * If the zone is ZONE_MOVABLE, it never contains immobile pages.
5307 */
5308 if (zone_idx(zone) == ZONE_MOVABLE)
5309 return true;
5310
5311 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
5312 return true;
5313
5314 pfn = page_to_pfn(page);
5315 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5316 unsigned long check = pfn + iter;
5317
5318 if (!pfn_valid_within(check)) {
5319 iter++;
5320 continue;
5321 }
5322 page = pfn_to_page(check);
5323 if (!page_count(page)) {
5324 if (PageBuddy(page))
5325 iter += (1 << page_order(page)) - 1;
5326 continue;
5327 }
5328 if (!PageLRU(page))
5329 found++;
5330 /*
5331 * If there are RECLAIMABLE pages, we need to check it.
5332 * But now, memory offline itself doesn't call shrink_slab()
5333 * and it still to be fixed.
5334 */
5335 /*
5336 * If the page is not RAM, page_count() should be 0 and
5337 * we don't need any further check. This is a _used_, not-movable page.
5338 *
5339 * The problematic thing here is PG_reserved pages. PG_reserved
5340 * is set to both of a memory hole page and a _used_ kernel
5341 * page at boot.
5342 */
5343 if (found > count)
5344 return false;
5345 }
5346 return true;
5347}
5348
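
The scan above has one subtlety worth calling out: when it meets a free buddy page it advances iter by the whole 2^order block, since a buddy chunk is uniform and needs no per-page inspection. A userspace sketch of that skip-ahead scan over a toy array (the encoding of "free block of order k" is illustrative):

#include <stdio.h>

/* Each entry is either busy (-1) or the head of a free block whose
 * order is the stored value; non-head pages of a block are 0-filled
 * and never visited thanks to the skip. */
static int blocks[8] = { -1, 1, 0, -1, 2, 0, 0, 0 };

int main(void)
{
	int visited = 0;

	for (int i = 0; i < 8; i++) {
		visited++;
		if (blocks[i] >= 0) {
			/* free buddy block: skip its 2^order pages */
			i += (1 << blocks[i]) - 1;
			continue;
		}
		/* busy page: would be inspected for mobility here */
	}
	printf("inspected %d of 8 entries\n", visited);
	return 0;
}
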
5349bool is_pageblock_removable_nolock(struct page *page)
5350{
5351 struct zone *zone = page_zone(page);
5352 return __count_immobile_pages(zone, page, 0);
5353}
5354
5300int set_migratetype_isolate(struct page *page) 5355int set_migratetype_isolate(struct page *page)
5301{ 5356{
5302 struct zone *zone; 5357 struct zone *zone;
5303 struct page *curr_page; 5358 unsigned long flags, pfn;
5304 unsigned long flags, pfn, iter;
5305 unsigned long immobile = 0;
5306 struct memory_isolate_notify arg; 5359 struct memory_isolate_notify arg;
5307 int notifier_ret; 5360 int notifier_ret;
5308 int ret = -EBUSY; 5361 int ret = -EBUSY;
@@ -5312,11 +5365,6 @@ int set_migratetype_isolate(struct page *page)
5312 zone_idx = zone_idx(zone); 5365 zone_idx = zone_idx(zone);
5313 5366
5314 spin_lock_irqsave(&zone->lock, flags); 5367 spin_lock_irqsave(&zone->lock, flags);
5315 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5316 zone_idx == ZONE_MOVABLE) {
5317 ret = 0;
5318 goto out;
5319 }
5320 5368
5321 pfn = page_to_pfn(page); 5369 pfn = page_to_pfn(page);
5322 arg.start_pfn = pfn; 5370 arg.start_pfn = pfn;
@@ -5336,23 +5384,20 @@ int set_migratetype_isolate(struct page *page)
5336 */ 5384 */
5337 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); 5385 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5338 notifier_ret = notifier_to_errno(notifier_ret); 5386 notifier_ret = notifier_to_errno(notifier_ret);
5339 if (notifier_ret || !arg.pages_found) 5387 if (notifier_ret)
5340 goto out; 5388 goto out;
5341 5389 /*
5342 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { 5390 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5343 if (!pfn_valid_within(pfn)) 5391 * We just check MOVABLE pages.
5344 continue; 5392 */
5345 5393 if (__count_immobile_pages(zone, page, arg.pages_found))
5346 curr_page = pfn_to_page(iter);
5347 if (!page_count(curr_page) || PageLRU(curr_page))
5348 continue;
5349
5350 immobile++;
5351 }
5352
5353 if (arg.pages_found == immobile)
5354 ret = 0; 5394 ret = 0;
5355 5395
5396 /*
5397 * immobile means "not-on-lru" pages. If immobile is larger than
5398 * removable-by-driver pages reported by notifier, we'll fail.
5399 */
5400
5356out: 5401out:
5357 if (!ret) { 5402 if (!ret) {
5358 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5403 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5e0ffd967452..4ae42bb40892 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
86 * all pages in [start_pfn...end_pfn) must be in the same zone. 86 * all pages in [start_pfn...end_pfn) must be in the same zone.
87 * zone->lock must be held before call this. 87 * zone->lock must be held before call this.
88 * 88 *
89 * Returns 0 if all pages in the range are isolated. 89 * Returns 1 if all pages in the range are isolated.
90 */ 90 */
91static int 91static int
92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
@@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
119 struct zone *zone; 119 struct zone *zone;
120 int ret; 120 int ret;
121 121
122 pfn = start_pfn;
123 /* 122 /*
124 * Note: pageblock_nr_pages != MAX_ORDER, so a chunk of free pages 123 * Note: pageblock_nr_pages != MAX_ORDER, so a chunk of free pages
125 * may not be aligned to pageblock_nr_pages. 124 * may not be aligned to pageblock_nr_pages.
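That comment is the whole subtlety: a free buddy chunk can straddle a
pageblock boundary, so the checker walks pfn by pfn and skips whole free
chunks. A hedged sketch of that walk (the function body is not shown in this
hunk, and pfn_model / test_range_isolated are stand-in names):

#include <stdbool.h>
#include <stddef.h>

enum { MIGRATE_ISOLATE_MODEL = 1 };	/* stand-in for MIGRATE_ISOLATE */

struct pfn_model {
	bool free_buddy;   /* head of a free chunk */
	int order;         /* valid when free_buddy */
	int migratetype;   /* migratetype of the containing pageblock */
};

int test_range_isolated(const struct pfn_model *p, size_t start, size_t end)
{
	size_t pfn = start;

	while (pfn < end) {
		if (p[pfn].free_buddy)
			pfn += (size_t)1 << p[pfn].order; /* may pass 'end' */
		else if (p[pfn].migratetype == MIGRATE_ISOLATE_MODEL)
			pfn++;
		else
			return 0;
	}
	return 1;	/* matches the corrected comment: 1 when isolated */
}
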
diff --git a/mm/rmap.c b/mm/rmap.c
index 92e6757f196e..1a8bf76bfd03 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -80,7 +80,7 @@ static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
80 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); 80 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
81} 81}
82 82
83void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 83static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
84{ 84{
85 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 85 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
86} 86}
@@ -314,7 +314,7 @@ void __init anon_vma_init(void)
314 * Getting a lock on a stable anon_vma from a page off the LRU is 314 * Getting a lock on a stable anon_vma from a page off the LRU is
315 * tricky: page_lock_anon_vma relies on RCU to guard against races. 315 * tricky: page_lock_anon_vma relies on RCU to guard against races.
316 */ 316 */
317struct anon_vma *page_lock_anon_vma(struct page *page) 317struct anon_vma *__page_lock_anon_vma(struct page *page)
318{ 318{
319 struct anon_vma *anon_vma, *root_anon_vma; 319 struct anon_vma *anon_vma, *root_anon_vma;
320 unsigned long anon_mapping; 320 unsigned long anon_mapping;
@@ -348,6 +348,8 @@ out:
348} 348}
349 349
350void page_unlock_anon_vma(struct anon_vma *anon_vma) 350void page_unlock_anon_vma(struct anon_vma *anon_vma)
351 __releases(&anon_vma->root->lock)
352 __releases(RCU)
351{ 353{
352 anon_vma_unlock(anon_vma); 354 anon_vma_unlock(anon_vma);
353 rcu_read_unlock(); 355 rcu_read_unlock();
@@ -407,7 +409,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
407 * 409 *
408 * On success returns with pte mapped and locked. 410 * On success returns with pte mapped and locked.
409 */ 411 */
410pte_t *page_check_address(struct page *page, struct mm_struct *mm, 412pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
411 unsigned long address, spinlock_t **ptlp, int sync) 413 unsigned long address, spinlock_t **ptlp, int sync)
412{ 414{
413 pgd_t *pgd; 415 pgd_t *pgd;
@@ -745,7 +747,7 @@ int page_mkclean(struct page *page)
745 if (mapping) { 747 if (mapping) {
746 ret = page_mkclean_file(mapping, page); 748 ret = page_mkclean_file(mapping, page);
747 if (page_test_dirty(page)) { 749 if (page_test_dirty(page)) {
748 page_clear_dirty(page); 750 page_clear_dirty(page, 1);
749 ret = 1; 751 ret = 1;
750 } 752 }
751 } 753 }
@@ -780,10 +782,10 @@ void page_move_anon_rmap(struct page *page,
780} 782}
781 783
782/** 784/**
783 * __page_set_anon_rmap - setup new anonymous rmap 785 * __page_set_anon_rmap - set up new anonymous rmap
784 * @page: the page to add the mapping to 786 * @page: Page to add to rmap
785 * @vma: the vm area in which the mapping is added 787 * @vma: VM area to add page to
786 * @address: the user virtual address mapped 788 * @address: User virtual address of the mapping
787 * @exclusive: the page is exclusively owned by the current process 789 * @exclusive: the page is exclusively owned by the current process
788 */ 790 */
789static void __page_set_anon_rmap(struct page *page, 791static void __page_set_anon_rmap(struct page *page,
@@ -793,25 +795,16 @@ static void __page_set_anon_rmap(struct page *page,
793 795
794 BUG_ON(!anon_vma); 796 BUG_ON(!anon_vma);
795 797
798 if (PageAnon(page))
799 return;
800
796 /* 801 /*
797 * If the page isn't exclusively mapped into this vma, 802 * If the page isn't exclusively mapped into this vma,
798 * we must use the _oldest_ possible anon_vma for the 803 * we must use the _oldest_ possible anon_vma for the
799 * page mapping! 804 * page mapping!
800 */ 805 */
801 if (!exclusive) { 806 if (!exclusive)
802 if (PageAnon(page))
803 return;
804 anon_vma = anon_vma->root; 807 anon_vma = anon_vma->root;
805 } else {
806 /*
807 * In this case, swapped-out-but-not-discarded swap-cache
808 * is remapped. So, no need to update page->mapping here.
809 * We trust that the anon_vma pointed to by page->mapping is not
810 * obsolete because vma->anon_vma must belong to the same family.
811 */
812 if (PageAnon(page))
813 return;
814 }
815 808
816 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 809 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
817 page->mapping = (struct address_space *) anon_vma; 810 page->mapping = (struct address_space *) anon_vma;
@@ -942,7 +935,7 @@ void page_remove_rmap(struct page *page)
942 * containing the swap entry, but page not yet written to swap. 935 * containing the swap entry, but page not yet written to swap.
943 */ 936 */
944 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { 937 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
945 page_clear_dirty(page); 938 page_clear_dirty(page, 1);
946 set_page_dirty(page); 939 set_page_dirty(page);
947 } 940 }
948 /* 941 /*
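The __page_set_anon_rmap() hunk above hoists the PageAnon() early return out
of both branches and keeps a single !exclusive fallback to the root anon_vma.
A standalone model of the resulting control flow, with stand-in types
(anon_vma_model, anon_page_model, set_anon_rmap are illustrative names):

#include <stdbool.h>

struct anon_vma_model { struct anon_vma_model *root; };

struct anon_page_model {
	bool anon;                      /* models PageAnon() */
	struct anon_vma_model *mapping; /* anon_vma once mapped */
};

void set_anon_rmap(struct anon_page_model *page,
		   struct anon_vma_model *anon_vma, bool exclusive)
{
	if (page->anon)		/* already wired up, nothing to do */
		return;
	/*
	 * Not exclusively mapped into this vma: fall back to the
	 * _oldest_ possible anon_vma, i.e. the root of the chain.
	 */
	if (!exclusive)
		anon_vma = anon_vma->root;
	page->mapping = anon_vma;
	page->anon = true;
}
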
diff --git a/mm/shmem.c b/mm/shmem.c
index 080b09a57a8f..47fdeeb9d636 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1586,6 +1586,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1586 1586
1587 inode = new_inode(sb); 1587 inode = new_inode(sb);
1588 if (inode) { 1588 if (inode) {
1589 inode->i_ino = get_next_ino();
1589 inode_init_owner(inode, dir, mode); 1590 inode_init_owner(inode, dir, mode);
1590 inode->i_blocks = 0; 1591 inode->i_blocks = 0;
1591 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1592 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1903,7 +1904,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1903 dir->i_size += BOGO_DIRENT_SIZE; 1904 dir->i_size += BOGO_DIRENT_SIZE;
1904 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1905 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1905 inc_nlink(inode); 1906 inc_nlink(inode);
1906 atomic_inc(&inode->i_count); /* New dentry reference */ 1907 ihold(inode); /* New dentry reference */
1907 dget(dentry); /* Extra pinning count for the created dentry */ 1908 dget(dentry); /* Extra pinning count for the created dentry */
1908 d_instantiate(dentry, inode); 1909 d_instantiate(dentry, inode);
1909out: 1910out:
@@ -2146,7 +2147,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2146 if (*len < 3) 2147 if (*len < 3)
2147 return 255; 2148 return 255;
2148 2149
2149 if (hlist_unhashed(&inode->i_hash)) { 2150 if (inode_unhashed(inode)) {
2150 /* Unfortunately insert_inode_hash is not idempotent, 2151 /* Unfortunately insert_inode_hash is not idempotent,
2151 * so as we hash inodes here rather than at creation 2152 * so as we hash inodes here rather than at creation
2152 * time, we need a lock to ensure we only try 2153 * time, we need a lock to ensure we only try
@@ -2154,7 +2155,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2154 */ 2155 */
2155 static DEFINE_SPINLOCK(lock); 2156 static DEFINE_SPINLOCK(lock);
2156 spin_lock(&lock); 2157 spin_lock(&lock);
2157 if (hlist_unhashed(&inode->i_hash)) 2158 if (inode_unhashed(inode))
2158 __insert_inode_hash(inode, 2159 __insert_inode_hash(inode,
2159 inode->i_ino + inode->i_generation); 2160 inode->i_ino + inode->i_generation);
2160 spin_unlock(&lock); 2161 spin_unlock(&lock);
@@ -2537,16 +2538,16 @@ static const struct vm_operations_struct shmem_vm_ops = {
2537}; 2538};
2538 2539
2539 2540
2540static int shmem_get_sb(struct file_system_type *fs_type, 2541static struct dentry *shmem_mount(struct file_system_type *fs_type,
2541 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 2542 int flags, const char *dev_name, void *data)
2542{ 2543{
2543 return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); 2544 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2544} 2545}
2545 2546
2546static struct file_system_type tmpfs_fs_type = { 2547static struct file_system_type tmpfs_fs_type = {
2547 .owner = THIS_MODULE, 2548 .owner = THIS_MODULE,
2548 .name = "tmpfs", 2549 .name = "tmpfs",
2549 .get_sb = shmem_get_sb, 2550 .mount = shmem_mount,
2550 .kill_sb = kill_litter_super, 2551 .kill_sb = kill_litter_super,
2551}; 2552};
2552 2553
@@ -2642,7 +2643,7 @@ out:
2642 2643
2643static struct file_system_type tmpfs_fs_type = { 2644static struct file_system_type tmpfs_fs_type = {
2644 .name = "tmpfs", 2645 .name = "tmpfs",
2645 .get_sb = ramfs_get_sb, 2646 .mount = ramfs_mount,
2646 .kill_sb = kill_litter_super, 2647 .kill_sb = kill_litter_super,
2647}; 2648};
2648 2649
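Both tmpfs variants above move from .get_sb to the new .mount callback, which
returns the root dentry instead of filling in a vfsmount. A hedged skeleton of
that registration style, assuming a hypothetical "examplefs" (root-inode setup
and error handling elided):

#include <linux/fs.h>
#include <linux/module.h>

static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	/* set up sb->s_op and allocate the root inode/dentry here */
	return 0;
}

static struct dentry *example_mount(struct file_system_type *fs_type,
				    int flags, const char *dev_name,
				    void *data)
{
	/* mount_nodev() runs fill_super and hands back the root dentry */
	return mount_nodev(fs_type, flags, data, example_fill_super);
}

static struct file_system_type example_fs_type = {
	.owner   = THIS_MODULE,
	.name    = "examplefs",
	.mount   = example_mount,
	.kill_sb = kill_litter_super,
};
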
diff --git a/mm/slab.c b/mm/slab.c
index fcae9815d3b3..b1e40dafbab3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -901,7 +901,7 @@ static int transfer_objects(struct array_cache *to,
901 struct array_cache *from, unsigned int max) 901 struct array_cache *from, unsigned int max)
902{ 902{
903 /* Figure out how many entries to transfer */ 903 /* Figure out how many entries to transfer */
904 int nr = min(min(from->avail, max), to->limit - to->avail); 904 int nr = min3(from->avail, max, to->limit - to->avail);
905 905
906 if (!nr) 906 if (!nr)
907 return 0; 907 return 0;
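min3() collapses the nested min() pair into one readable call. A userspace
demo of the equivalence (plain macros here; the kernel's versions add type
checking):

#include <stdio.h>

#define min(a, b)     ((a) < (b) ? (a) : (b))
#define min3(a, b, c) min(min(a, b), (c))

int main(void)
{
	/* 7 objects available, caller wants 10, destination has room for 5 */
	unsigned int avail = 7, max = 10, room = 5;

	printf("%u %u\n", min(min(avail, max), room),
	       min3(avail, max, room));	/* prints "5 5" */
	return 0;
}
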
diff --git a/mm/swap.c b/mm/swap.c
index 3ce7bc373a52..3f4854205b16 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -378,6 +378,7 @@ void release_pages(struct page **pages, int nr, int cold)
378 378
379 pagevec_free(&pages_to_free); 379 pagevec_free(&pages_to_free);
380} 380}
381EXPORT_SYMBOL(release_pages);
381 382
382/* 383/*
383 * The pages which we're about to release may be in the deferred lru-addition 384 * The pages which we're about to release may be in the deferred lru-addition
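With release_pages() now exported, modular code can drop a whole batch of page
references in one call; the signature is the one in the hunk header above. A
hypothetical module fragment (drop_pinned_pages is an illustrative name, not a
kernel function):

#include <linux/mm.h>
#include <linux/pagemap.h>

static void drop_pinned_pages(struct page **pages, int nr)
{
	/* third argument is the pagevec "cold" hint; 0 for hot pages */
	release_pages(pages, nr, 0);
}
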
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9fc7bac7db0c..67ddaaf98c74 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -30,6 +30,7 @@
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/poll.h>
33 34
34#include <asm/pgtable.h> 35#include <asm/pgtable.h>
35#include <asm/tlbflush.h> 36#include <asm/tlbflush.h>
@@ -58,6 +59,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES];
58 59
59static DEFINE_MUTEX(swapon_mutex); 60static DEFINE_MUTEX(swapon_mutex);
60 61
62static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
63/* Activity counter to indicate that a swapon or swapoff has occurred */
64static atomic_t proc_poll_event = ATOMIC_INIT(0);
65
61static inline unsigned char swap_count(unsigned char ent) 66static inline unsigned char swap_count(unsigned char ent)
62{ 67{
63 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ 68 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -1680,6 +1685,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1680 } 1685 }
1681 filp_close(swap_file, NULL); 1686 filp_close(swap_file, NULL);
1682 err = 0; 1687 err = 0;
1688 atomic_inc(&proc_poll_event);
1689 wake_up_interruptible(&proc_poll_wait);
1683 1690
1684out_dput: 1691out_dput:
1685 filp_close(victim, NULL); 1692 filp_close(victim, NULL);
@@ -1688,6 +1695,25 @@ out:
1688} 1695}
1689 1696
1690#ifdef CONFIG_PROC_FS 1697#ifdef CONFIG_PROC_FS
1698struct proc_swaps {
1699 struct seq_file seq;
1700 int event;
1701};
1702
1703static unsigned swaps_poll(struct file *file, poll_table *wait)
1704{
1705 struct proc_swaps *s = file->private_data;
1706
1707 poll_wait(file, &proc_poll_wait, wait);
1708
1709 if (s->event != atomic_read(&proc_poll_event)) {
1710 s->event = atomic_read(&proc_poll_event);
1711 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1712 }
1713
1714 return POLLIN | POLLRDNORM;
1715}
1716
1691/* iterator */ 1717/* iterator */
1692static void *swap_start(struct seq_file *swap, loff_t *pos) 1718static void *swap_start(struct seq_file *swap, loff_t *pos)
1693{ 1719{
@@ -1771,7 +1797,24 @@ static const struct seq_operations swaps_op = {
1771 1797
1772static int swaps_open(struct inode *inode, struct file *file) 1798static int swaps_open(struct inode *inode, struct file *file)
1773{ 1799{
1774 return seq_open(file, &swaps_op); 1800 struct proc_swaps *s;
1801 int ret;
1802
1803 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1804 if (!s)
1805 return -ENOMEM;
1806
1807 file->private_data = s;
1808
1809 ret = seq_open(file, &swaps_op);
1810 if (ret) {
1811 kfree(s);
1812 return ret;
1813 }
1814
1815 s->seq.private = s;
1816 s->event = atomic_read(&proc_poll_event);
1817 return ret;
1775} 1818}
1776 1819
1777static const struct file_operations proc_swaps_operations = { 1820static const struct file_operations proc_swaps_operations = {
@@ -1779,6 +1822,7 @@ static const struct file_operations proc_swaps_operations = {
1779 .read = seq_read, 1822 .read = seq_read,
1780 .llseek = seq_lseek, 1823 .llseek = seq_lseek,
1781 .release = seq_release, 1824 .release = seq_release,
1825 .poll = swaps_poll,
1782}; 1826};
1783 1827
1784static int __init procswaps_init(void) 1828static int __init procswaps_init(void)
@@ -2084,6 +2128,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2084 swap_info[prev]->next = type; 2128 swap_info[prev]->next = type;
2085 spin_unlock(&swap_lock); 2129 spin_unlock(&swap_lock);
2086 mutex_unlock(&swapon_mutex); 2130 mutex_unlock(&swapon_mutex);
2131 atomic_inc(&proc_poll_event);
2132 wake_up_interruptible(&proc_poll_wait);
2133
2087 error = 0; 2134 error = 0;
2088 goto out; 2135 goto out;
2089bad_swap: 2136bad_swap:
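swaps_poll() raises POLLERR | POLLPRI on top of the usual POLLIN | POLLRDNORM
whenever proc_poll_event advances, so userspace can sleep until the swap
configuration changes instead of re-reading /proc/swaps in a loop. A small,
runnable monitor built on that behavior:

/* cc -o swapmon swapmon.c && ./swapmon */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/proc/swaps", O_RDONLY);

	if (fd < 0) {
		perror("open /proc/swaps");
		return 1;
	}
	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLPRI };

		while (read(fd, buf, sizeof(buf)) > 0)
			;	/* consume the current table */
		if (poll(&pfd, 1, -1) < 0) {
			perror("poll");
			return 1;
		}
		if (pfd.revents & (POLLERR | POLLPRI)) {
			puts("swap configuration changed");
			lseek(fd, 0, SEEK_SET);	/* re-read from the top */
		}
	}
}
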
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9f909622a25e..a3d66b3dc5cb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -293,13 +293,13 @@ static void __insert_vmap_area(struct vmap_area *va)
293 struct rb_node *tmp; 293 struct rb_node *tmp;
294 294
295 while (*p) { 295 while (*p) {
296 struct vmap_area *tmp; 296 struct vmap_area *tmp_va;
297 297
298 parent = *p; 298 parent = *p;
299 tmp = rb_entry(parent, struct vmap_area, rb_node); 299 tmp_va = rb_entry(parent, struct vmap_area, rb_node);
300 if (va->va_start < tmp->va_end) 300 if (va->va_start < tmp_va->va_end)
301 p = &(*p)->rb_left; 301 p = &(*p)->rb_left;
302 else if (va->va_end > tmp->va_start) 302 else if (va->va_end > tmp_va->va_start)
303 p = &(*p)->rb_right; 303 p = &(*p)->rb_right;
304 else 304 else
305 BUG(); 305 BUG();
@@ -1596,6 +1596,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1596} 1596}
1597EXPORT_SYMBOL(__vmalloc); 1597EXPORT_SYMBOL(__vmalloc);
1598 1598
1599static inline void *__vmalloc_node_flags(unsigned long size,
1600 int node, gfp_t flags)
1601{
1602 return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
1603 node, __builtin_return_address(0));
1604}
1605
1599/** 1606/**
1600 * vmalloc - allocate virtually contiguous memory 1607 * vmalloc - allocate virtually contiguous memory
1601 * @size: allocation size 1608 * @size: allocation size
@@ -1607,12 +1614,28 @@ EXPORT_SYMBOL(__vmalloc);
1607 */ 1614 */
1608void *vmalloc(unsigned long size) 1615void *vmalloc(unsigned long size)
1609{ 1616{
1610 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1617 return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
1611 -1, __builtin_return_address(0));
1612} 1618}
1613EXPORT_SYMBOL(vmalloc); 1619EXPORT_SYMBOL(vmalloc);
1614 1620
1615/** 1621/**
1622 * vzalloc - allocate virtually contiguous memory with zero fill
1623 * @size: allocation size
1624 * Allocate enough pages to cover @size from the page level
1625 * allocator and map them into contiguous kernel virtual space.
1626 * The memory allocated is set to zero.
1627 *
1628 * For tight control over the page level allocator and protection flags
1629 * use __vmalloc() instead.
1630 */
1631void *vzalloc(unsigned long size)
1632{
1633 return __vmalloc_node_flags(size, -1,
1634 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1635}
1636EXPORT_SYMBOL(vzalloc);
1637
1638/**
1616 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 1639 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
1617 * @size: allocation size 1640 * @size: allocation size
1618 * 1641 *
@@ -1653,6 +1676,25 @@ void *vmalloc_node(unsigned long size, int node)
1653} 1676}
1654EXPORT_SYMBOL(vmalloc_node); 1677EXPORT_SYMBOL(vmalloc_node);
1655 1678
1679/**
1680 * vzalloc_node - allocate memory on a specific node with zero fill
1681 * @size: allocation size
1682 * @node: numa node
1683 *
1684 * Allocate enough pages to cover @size from the page level
1685 * allocator and map them into contiguous kernel virtual space.
1686 * The memory allocated is set to zero.
1687 *
1688 * For tight control over the page level allocator and protection flags
1689 * use __vmalloc_node() instead.
1690 */
1691void *vzalloc_node(unsigned long size, int node)
1692{
1693 return __vmalloc_node_flags(size, node,
1694 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1695}
1696EXPORT_SYMBOL(vzalloc_node);
1697
1656#ifndef PAGE_KERNEL_EXEC 1698#ifndef PAGE_KERNEL_EXEC
1657# define PAGE_KERNEL_EXEC PAGE_KERNEL 1699# define PAGE_KERNEL_EXEC PAGE_KERNEL
1658#endif 1700#endif
@@ -2350,6 +2392,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2350 2392
2351#ifdef CONFIG_PROC_FS 2393#ifdef CONFIG_PROC_FS
2352static void *s_start(struct seq_file *m, loff_t *pos) 2394static void *s_start(struct seq_file *m, loff_t *pos)
2395 __acquires(&vmlist_lock)
2353{ 2396{
2354 loff_t n = *pos; 2397 loff_t n = *pos;
2355 struct vm_struct *v; 2398 struct vm_struct *v;
@@ -2376,6 +2419,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2376} 2419}
2377 2420
2378static void s_stop(struct seq_file *m, void *p) 2421static void s_stop(struct seq_file *m, void *p)
2422 __releases(&vmlist_lock)
2379{ 2423{
2380 read_unlock(&vmlist_lock); 2424 read_unlock(&vmlist_lock);
2381} 2425}
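vzalloc()/vzalloc_node() exist to replace the common vmalloc()-then-memset()
pair, routing through __vmalloc_node_flags() with __GFP_ZERO. A hedged
before/after sketch (big_table and both callers are hypothetical):

#include <linux/string.h>
#include <linux/vmalloc.h>

struct big_table { unsigned long slot[1024]; };

static struct big_table *alloc_table_old(int nr)
{
	struct big_table *t = vmalloc(nr * sizeof(struct big_table));

	if (t)
		memset(t, 0, nr * sizeof(struct big_table));
	return t;
}

static struct big_table *alloc_table_new(int nr)
{
	/* zeroed in one call via __GFP_ZERO */
	return vzalloc(nr * sizeof(struct big_table));
}
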
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94c9464f262..b8a6fdc21312 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -51,6 +51,12 @@
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h> 52#include <trace/events/vmscan.h>
53 53
54enum lumpy_mode {
55 LUMPY_MODE_NONE,
56 LUMPY_MODE_ASYNC,
57 LUMPY_MODE_SYNC,
58};
59
54struct scan_control { 60struct scan_control {
55 /* Incremented by the number of inactive pages that were scanned */ 61 /* Incremented by the number of inactive pages that were scanned */
56 unsigned long nr_scanned; 62 unsigned long nr_scanned;
@@ -82,7 +88,7 @@ struct scan_control {
82 * Intend to reclaim enough contiguous memory rather than just 88 * Intend to reclaim enough contiguous memory rather than just
83 * enough memory, i.e. the mode for high-order allocations. 89 * enough memory, i.e. the mode for high-order allocations.
84 */ 90 */
85 bool lumpy_reclaim_mode; 91 enum lumpy_mode lumpy_reclaim_mode;
86 92
87 /* Which cgroup do we reclaim from */ 93 /* Which cgroup do we reclaim from */
88 struct mem_cgroup *mem_cgroup; 94 struct mem_cgroup *mem_cgroup;
@@ -265,6 +271,36 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
265 return ret; 271 return ret;
266} 272}
267 273
274static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
275 bool sync)
276{
277 enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
278
279 /*
280 * Some reclaim attempts have already failed; it is not worth trying
281 * synchronous lumpy reclaim.
282 */
283 if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
284 return;
285
286 /*
287 * If we need a large contiguous chunk of memory, or have
288 * trouble getting a small set of contiguous pages, we
289 * will reclaim both active and inactive pages.
290 */
291 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
292 sc->lumpy_reclaim_mode = mode;
293 else if (sc->order && priority < DEF_PRIORITY - 2)
294 sc->lumpy_reclaim_mode = mode;
295 else
296 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
297}
298
299static void disable_lumpy_reclaim_mode(struct scan_control *sc)
300{
301 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
302}
303
268static inline int is_page_cache_freeable(struct page *page) 304static inline int is_page_cache_freeable(struct page *page)
269{ 305{
270 /* 306 /*
@@ -275,7 +311,8 @@ static inline int is_page_cache_freeable(struct page *page)
275 return page_count(page) - page_has_private(page) == 2; 311 return page_count(page) - page_has_private(page) == 2;
276} 312}
277 313
278static int may_write_to_queue(struct backing_dev_info *bdi) 314static int may_write_to_queue(struct backing_dev_info *bdi,
315 struct scan_control *sc)
279{ 316{
280 if (current->flags & PF_SWAPWRITE) 317 if (current->flags & PF_SWAPWRITE)
281 return 1; 318 return 1;
@@ -283,6 +320,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
283 return 1; 320 return 1;
284 if (bdi == current->backing_dev_info) 321 if (bdi == current->backing_dev_info)
285 return 1; 322 return 1;
323
324 /* lumpy reclaim for hugepages often needs a lot of writes */
325 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
326 return 1;
286 return 0; 327 return 0;
287} 328}
288 329
@@ -307,12 +348,6 @@ static void handle_write_error(struct address_space *mapping,
307 unlock_page(page); 348 unlock_page(page);
308} 349}
309 350
310/* Request for sync pageout. */
311enum pageout_io {
312 PAGEOUT_IO_ASYNC,
313 PAGEOUT_IO_SYNC,
314};
315
316/* possible outcome of pageout() */ 351/* possible outcome of pageout() */
317typedef enum { 352typedef enum {
318 /* failed to write page out, page is locked */ 353 /* failed to write page out, page is locked */
@@ -330,7 +365,7 @@ typedef enum {
330 * Calls ->writepage(). 365 * Calls ->writepage().
331 */ 366 */
332static pageout_t pageout(struct page *page, struct address_space *mapping, 367static pageout_t pageout(struct page *page, struct address_space *mapping,
333 enum pageout_io sync_writeback) 368 struct scan_control *sc)
334{ 369{
335 /* 370 /*
336 * If the page is dirty, only perform writeback if that write 371 * If the page is dirty, only perform writeback if that write
@@ -366,7 +401,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
366 } 401 }
367 if (mapping->a_ops->writepage == NULL) 402 if (mapping->a_ops->writepage == NULL)
368 return PAGE_ACTIVATE; 403 return PAGE_ACTIVATE;
369 if (!may_write_to_queue(mapping->backing_dev_info)) 404 if (!may_write_to_queue(mapping->backing_dev_info, sc))
370 return PAGE_KEEP; 405 return PAGE_KEEP;
371 406
372 if (clear_page_dirty_for_io(page)) { 407 if (clear_page_dirty_for_io(page)) {
@@ -376,7 +411,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
376 .nr_to_write = SWAP_CLUSTER_MAX, 411 .nr_to_write = SWAP_CLUSTER_MAX,
377 .range_start = 0, 412 .range_start = 0,
378 .range_end = LLONG_MAX, 413 .range_end = LLONG_MAX,
379 .nonblocking = 1,
380 .for_reclaim = 1, 414 .for_reclaim = 1,
381 }; 415 };
382 416
@@ -394,7 +428,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
394 * direct reclaiming a large contiguous area and the 428 * direct reclaiming a large contiguous area and the
395 * first attempt to free a range of pages fails. 429 * first attempt to free a range of pages fails.
396 */ 430 */
397 if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) 431 if (PageWriteback(page) &&
432 sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC)
398 wait_on_page_writeback(page); 433 wait_on_page_writeback(page);
399 434
400 if (!PageWriteback(page)) { 435 if (!PageWriteback(page)) {
@@ -402,7 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
402 ClearPageReclaim(page); 437 ClearPageReclaim(page);
403 } 438 }
404 trace_mm_vmscan_writepage(page, 439 trace_mm_vmscan_writepage(page,
405 trace_reclaim_flags(page, sync_writeback)); 440 trace_reclaim_flags(page, sc->lumpy_reclaim_mode));
406 inc_zone_page_state(page, NR_VMSCAN_WRITE); 441 inc_zone_page_state(page, NR_VMSCAN_WRITE);
407 return PAGE_SUCCESS; 442 return PAGE_SUCCESS;
408 } 443 }
@@ -580,7 +615,7 @@ static enum page_references page_check_references(struct page *page,
580 referenced_page = TestClearPageReferenced(page); 615 referenced_page = TestClearPageReferenced(page);
581 616
582 /* Lumpy reclaim - ignore references */ 617 /* Lumpy reclaim - ignore references */
583 if (sc->lumpy_reclaim_mode) 618 if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE)
584 return PAGEREF_RECLAIM; 619 return PAGEREF_RECLAIM;
585 620
586 /* 621 /*
@@ -616,7 +651,7 @@ static enum page_references page_check_references(struct page *page,
616 } 651 }
617 652
618 /* Reclaim if clean, defer dirty pages to writeback */ 653 /* Reclaim if clean, defer dirty pages to writeback */
619 if (referenced_page) 654 if (referenced_page && !PageSwapBacked(page))
620 return PAGEREF_RECLAIM_CLEAN; 655 return PAGEREF_RECLAIM_CLEAN;
621 656
622 return PAGEREF_RECLAIM; 657 return PAGEREF_RECLAIM;
@@ -644,12 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
644 * shrink_page_list() returns the number of reclaimed pages 679 * shrink_page_list() returns the number of reclaimed pages
645 */ 680 */
646static unsigned long shrink_page_list(struct list_head *page_list, 681static unsigned long shrink_page_list(struct list_head *page_list,
647 struct scan_control *sc, 682 struct zone *zone,
648 enum pageout_io sync_writeback) 683 struct scan_control *sc)
649{ 684{
650 LIST_HEAD(ret_pages); 685 LIST_HEAD(ret_pages);
651 LIST_HEAD(free_pages); 686 LIST_HEAD(free_pages);
652 int pgactivate = 0; 687 int pgactivate = 0;
688 unsigned long nr_dirty = 0;
689 unsigned long nr_congested = 0;
653 unsigned long nr_reclaimed = 0; 690 unsigned long nr_reclaimed = 0;
654 691
655 cond_resched(); 692 cond_resched();
@@ -669,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
669 goto keep; 706 goto keep;
670 707
671 VM_BUG_ON(PageActive(page)); 708 VM_BUG_ON(PageActive(page));
709 VM_BUG_ON(page_zone(page) != zone);
672 710
673 sc->nr_scanned++; 711 sc->nr_scanned++;
674 712
@@ -694,10 +732,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
694 * for any page for which writeback has already 732 * for any page for which writeback has already
695 * started. 733 * started.
696 */ 734 */
697 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) 735 if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC &&
736 may_enter_fs)
698 wait_on_page_writeback(page); 737 wait_on_page_writeback(page);
699 else 738 else {
700 goto keep_locked; 739 unlock_page(page);
740 goto keep_lumpy;
741 }
701 } 742 }
702 743
703 references = page_check_references(page, sc); 744 references = page_check_references(page, sc);
@@ -743,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
743 } 784 }
744 785
745 if (PageDirty(page)) { 786 if (PageDirty(page)) {
787 nr_dirty++;
788
746 if (references == PAGEREF_RECLAIM_CLEAN) 789 if (references == PAGEREF_RECLAIM_CLEAN)
747 goto keep_locked; 790 goto keep_locked;
748 if (!may_enter_fs) 791 if (!may_enter_fs)
@@ -751,14 +794,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
751 goto keep_locked; 794 goto keep_locked;
752 795
753 /* Page is dirty, try to write it out here */ 796 /* Page is dirty, try to write it out here */
754 switch (pageout(page, mapping, sync_writeback)) { 797 switch (pageout(page, mapping, sc)) {
755 case PAGE_KEEP: 798 case PAGE_KEEP:
799 nr_congested++;
756 goto keep_locked; 800 goto keep_locked;
757 case PAGE_ACTIVATE: 801 case PAGE_ACTIVATE:
758 goto activate_locked; 802 goto activate_locked;
759 case PAGE_SUCCESS: 803 case PAGE_SUCCESS:
760 if (PageWriteback(page) || PageDirty(page)) 804 if (PageWriteback(page))
805 goto keep_lumpy;
806 if (PageDirty(page))
761 goto keep; 807 goto keep;
808
762 /* 809 /*
763 * A synchronous write - probably a ramdisk. Go 810 * A synchronous write - probably a ramdisk. Go
764 * ahead and try to reclaim the page. 811 * ahead and try to reclaim the page.
@@ -841,6 +888,7 @@ cull_mlocked:
841 try_to_free_swap(page); 888 try_to_free_swap(page);
842 unlock_page(page); 889 unlock_page(page);
843 putback_lru_page(page); 890 putback_lru_page(page);
891 disable_lumpy_reclaim_mode(sc);
844 continue; 892 continue;
845 893
846activate_locked: 894activate_locked:
@@ -853,10 +901,21 @@ activate_locked:
853keep_locked: 901keep_locked:
854 unlock_page(page); 902 unlock_page(page);
855keep: 903keep:
904 disable_lumpy_reclaim_mode(sc);
905keep_lumpy:
856 list_add(&page->lru, &ret_pages); 906 list_add(&page->lru, &ret_pages);
857 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 907 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
858 } 908 }
859 909
910 /*
911 * Tag a zone as congested if all the dirty pages encountered were
912 * backed by a congested BDI. In this case, reclaimers should just
913 * back off and wait for congestion to clear because further reclaim
914 * will encounter the same problem
915 */
916 if (nr_dirty == nr_congested)
917 zone_set_flag(zone, ZONE_CONGESTED);
918
860 free_page_list(&free_pages); 919 free_page_list(&free_pages);
861 920
862 list_splice(&ret_pages, page_list); 921 list_splice(&ret_pages, page_list);
@@ -1006,7 +1065,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1006 1065
1007 /* Check that we have not crossed a zone boundary. */ 1066 /* Check that we have not crossed a zone boundary. */
1008 if (unlikely(page_zone_id(cursor_page) != zone_id)) 1067 if (unlikely(page_zone_id(cursor_page) != zone_id))
1009 continue; 1068 break;
1010 1069
1011 /* 1070 /*
1012 * If we don't have enough swap space, reclaiming of 1071 * If we don't have enough swap space, reclaiming of
@@ -1014,8 +1073,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1014 * pointless. 1073 * pointless.
1015 */ 1074 */
1016 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1075 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
1017 !PageSwapCache(cursor_page)) 1076 !PageSwapCache(cursor_page))
1018 continue; 1077 break;
1019 1078
1020 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1079 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1021 list_move(&cursor_page->lru, dst); 1080 list_move(&cursor_page->lru, dst);
@@ -1026,11 +1085,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1026 nr_lumpy_dirty++; 1085 nr_lumpy_dirty++;
1027 scan++; 1086 scan++;
1028 } else { 1087 } else {
1029 if (mode == ISOLATE_BOTH && 1088 /* the page is freed already. */
1030 page_count(cursor_page)) 1089 if (!page_count(cursor_page))
1031 nr_lumpy_failed++; 1090 continue;
1091 break;
1032 } 1092 }
1033 } 1093 }
1094
1095 /* If we break out of the loop above, lumpy reclaim failed */
1096 if (pfn < end_pfn)
1097 nr_lumpy_failed++;
1034 } 1098 }
1035 1099
1036 *scanned = scan; 1100 *scanned = scan;
@@ -1253,7 +1317,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1253 return false; 1317 return false;
1254 1318
1255 /* Only stall on lumpy reclaim */ 1319 /* Only stall on lumpy reclaim */
1256 if (!sc->lumpy_reclaim_mode) 1320 if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
1257 return false; 1321 return false;
1258 1322
1259 /* If we have reclaimed everything on the isolated list, no stall */ 1323 /* If we have reclaimed everything on the isolated list, no stall */
@@ -1286,7 +1350,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1286 unsigned long nr_scanned; 1350 unsigned long nr_scanned;
1287 unsigned long nr_reclaimed = 0; 1351 unsigned long nr_reclaimed = 0;
1288 unsigned long nr_taken; 1352 unsigned long nr_taken;
1289 unsigned long nr_active;
1290 unsigned long nr_anon; 1353 unsigned long nr_anon;
1291 unsigned long nr_file; 1354 unsigned long nr_file;
1292 1355
@@ -1298,15 +1361,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1298 return SWAP_CLUSTER_MAX; 1361 return SWAP_CLUSTER_MAX;
1299 } 1362 }
1300 1363
1301 1364 set_lumpy_reclaim_mode(priority, sc, false);
1302 lru_add_drain(); 1365 lru_add_drain();
1303 spin_lock_irq(&zone->lru_lock); 1366 spin_lock_irq(&zone->lru_lock);
1304 1367
1305 if (scanning_global_lru(sc)) { 1368 if (scanning_global_lru(sc)) {
1306 nr_taken = isolate_pages_global(nr_to_scan, 1369 nr_taken = isolate_pages_global(nr_to_scan,
1307 &page_list, &nr_scanned, sc->order, 1370 &page_list, &nr_scanned, sc->order,
1308 sc->lumpy_reclaim_mode ? 1371 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
1309 ISOLATE_BOTH : ISOLATE_INACTIVE, 1372 ISOLATE_INACTIVE : ISOLATE_BOTH,
1310 zone, 0, file); 1373 zone, 0, file);
1311 zone->pages_scanned += nr_scanned; 1374 zone->pages_scanned += nr_scanned;
1312 if (current_is_kswapd()) 1375 if (current_is_kswapd())
@@ -1318,8 +1381,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1318 } else { 1381 } else {
1319 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1382 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1320 &page_list, &nr_scanned, sc->order, 1383 &page_list, &nr_scanned, sc->order,
1321 sc->lumpy_reclaim_mode ? 1384 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
1322 ISOLATE_BOTH : ISOLATE_INACTIVE, 1385 ISOLATE_INACTIVE : ISOLATE_BOTH,
1323 zone, sc->mem_cgroup, 1386 zone, sc->mem_cgroup,
1324 0, file); 1387 0, file);
1325 /* 1388 /*
@@ -1337,20 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1337 1400
1338 spin_unlock_irq(&zone->lru_lock); 1401 spin_unlock_irq(&zone->lru_lock);
1339 1402
1340 nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1403 nr_reclaimed = shrink_page_list(&page_list, zone, sc);
1341 1404
1342 /* Check if we should synchronously wait for writeback */ 1405 /* Check if we should synchronously wait for writeback */
1343 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1406 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1344 congestion_wait(BLK_RW_ASYNC, HZ/10); 1407 set_lumpy_reclaim_mode(priority, sc, true);
1345 1408 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1346 /*
1347 * The attempt at page out may have made some
1348 * of the pages active, mark them inactive again.
1349 */
1350 nr_active = clear_active_flags(&page_list, NULL);
1351 count_vm_events(PGDEACTIVATE, nr_active);
1352
1353 nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
1354 } 1409 }
1355 1410
1356 local_irq_disable(); 1411 local_irq_disable();
@@ -1359,6 +1414,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1359 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1414 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1360 1415
1361 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1416 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1417
1418 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1419 zone_idx(zone),
1420 nr_scanned, nr_reclaimed,
1421 priority,
1422 trace_shrink_flags(file, sc->lumpy_reclaim_mode));
1362 return nr_reclaimed; 1423 return nr_reclaimed;
1363} 1424}
1364 1425
@@ -1506,6 +1567,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1506 spin_unlock_irq(&zone->lru_lock); 1567 spin_unlock_irq(&zone->lru_lock);
1507} 1568}
1508 1569
1570#ifdef CONFIG_SWAP
1509static int inactive_anon_is_low_global(struct zone *zone) 1571static int inactive_anon_is_low_global(struct zone *zone)
1510{ 1572{
1511 unsigned long active, inactive; 1573 unsigned long active, inactive;
@@ -1531,12 +1593,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1531{ 1593{
1532 int low; 1594 int low;
1533 1595
1596 /*
1597 * If we don't have swap space, anonymous page deactivation
1598 * is pointless.
1599 */
1600 if (!total_swap_pages)
1601 return 0;
1602
1534 if (scanning_global_lru(sc)) 1603 if (scanning_global_lru(sc))
1535 low = inactive_anon_is_low_global(zone); 1604 low = inactive_anon_is_low_global(zone);
1536 else 1605 else
1537 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1606 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1538 return low; 1607 return low;
1539} 1608}
1609#else
1610static inline int inactive_anon_is_low(struct zone *zone,
1611 struct scan_control *sc)
1612{
1613 return 0;
1614}
1615#endif
1540 1616
1541static int inactive_file_is_low_global(struct zone *zone) 1617static int inactive_file_is_low_global(struct zone *zone)
1542{ 1618{
@@ -1721,21 +1797,6 @@ out:
1721 } 1797 }
1722} 1798}
1723 1799
1724static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
1725{
1726 /*
1727 * If we need a large contiguous chunk of memory, or have
1728 * trouble getting a small set of contiguous pages, we
1729 * will reclaim both active and inactive pages.
1730 */
1731 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1732 sc->lumpy_reclaim_mode = 1;
1733 else if (sc->order && priority < DEF_PRIORITY - 2)
1734 sc->lumpy_reclaim_mode = 1;
1735 else
1736 sc->lumpy_reclaim_mode = 0;
1737}
1738
1739/* 1800/*
1740 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1801 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1741 */ 1802 */
@@ -1750,8 +1811,6 @@ static void shrink_zone(int priority, struct zone *zone,
1750 1811
1751 get_scan_count(zone, sc, nr, priority); 1812 get_scan_count(zone, sc, nr, priority);
1752 1813
1753 set_lumpy_reclaim_mode(priority, sc);
1754
1755 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1814 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1756 nr[LRU_INACTIVE_FILE]) { 1815 nr[LRU_INACTIVE_FILE]) {
1757 for_each_evictable_lru(l) { 1816 for_each_evictable_lru(l) {
@@ -1782,7 +1841,7 @@ static void shrink_zone(int priority, struct zone *zone,
1782 * Even if we did not try to evict anon pages at all, we want to 1841 * Even if we did not try to evict anon pages at all, we want to
1783 * rebalance the anon lru active/inactive ratio. 1842 * rebalance the anon lru active/inactive ratio.
1784 */ 1843 */
1785 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) 1844 if (inactive_anon_is_low(zone, sc))
1786 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1845 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1787 1846
1788 throttle_vm_writeout(sc->gfp_mask); 1847 throttle_vm_writeout(sc->gfp_mask);
@@ -1937,21 +1996,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1937 1996
1938 /* Take a nap, wait for some writeback to complete */ 1997 /* Take a nap, wait for some writeback to complete */
1939 if (!sc->hibernation_mode && sc->nr_scanned && 1998 if (!sc->hibernation_mode && sc->nr_scanned &&
1940 priority < DEF_PRIORITY - 2) 1999 priority < DEF_PRIORITY - 2) {
1941 congestion_wait(BLK_RW_ASYNC, HZ/10); 2000 struct zone *preferred_zone;
2001
2002 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2003 NULL, &preferred_zone);
2004 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2005 }
1942 } 2006 }
1943 2007
1944out: 2008out:
1945 /*
1946 * Now that we've scanned all the zones at this priority level, note
1947 * that level within the zone so that the next thread which performs
1948 * scanning of this zone will immediately start out at this priority
1949 * level. This affects only the decision whether or not to bring
1950 * mapped pages onto the inactive list.
1951 */
1952 if (priority < 0)
1953 priority = 0;
1954
1955 delayacct_freepages_end(); 2009 delayacct_freepages_end();
1956 put_mems_allowed(); 2010 put_mems_allowed();
1957 2011
@@ -2247,6 +2301,15 @@ loop_again:
2247 if (!zone_watermark_ok(zone, order, 2301 if (!zone_watermark_ok(zone, order,
2248 min_wmark_pages(zone), end_zone, 0)) 2302 min_wmark_pages(zone), end_zone, 0))
2249 has_under_min_watermark_zone = 1; 2303 has_under_min_watermark_zone = 1;
2304 } else {
2305 /*
2306 * If a zone reaches its high watermark,
2307 * consider it to be no longer congested. It's
2308 * possible there are dirty pages backed by
2309 * congested BDIs but as pressure is relieved,
2310 * speculatively avoid congestion waits
2311 */
2312 zone_clear_flag(zone, ZONE_CONGESTED);
2250 } 2313 }
2251 2314
2252 } 2315 }
@@ -2987,6 +3050,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
2987 return 0; 3050 return 0;
2988} 3051}
2989 3052
3053#ifdef CONFIG_NUMA
2990/* 3054/*
2991 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3055 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2992 * a specified node's per zone unevictable lists for evictable pages. 3056 * a specified node's per zone unevictable lists for evictable pages.
@@ -3033,4 +3097,4 @@ void scan_unevictable_unregister_node(struct node *node)
3033{ 3097{
3034 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 3098 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
3035} 3099}
3036 3100#endif
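The three-state lumpy machinery above replaces the old boolean plus
PAGEOUT_IO_SYNC pair: a scan starts ASYNC, may escalate to SYNC when a stall
is warranted, and any per-page failure path drops back to NONE so a later sync
pass knows not to bother. A compilable userspace model of those transitions
(sc_model is a stand-in for scan_control; the PAGE_ALLOC_COSTLY_ORDER and
DEF_PRIORITY values match the kernel's):

#include <stdbool.h>

enum lumpy_mode { LUMPY_MODE_NONE, LUMPY_MODE_ASYNC, LUMPY_MODE_SYNC };

#define PAGE_ALLOC_COSTLY_ORDER 3
#define DEF_PRIORITY 12

struct sc_model {
	int order;		/* allocation order driving reclaim */
	enum lumpy_mode mode;
};

void set_lumpy_mode(int priority, struct sc_model *sc, bool sync)
{
	enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;

	/* an earlier pass already gave up: don't escalate to sync */
	if (sync && sc->mode == LUMPY_MODE_NONE)
		return;

	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
		sc->mode = mode;	/* large contiguous chunk needed */
	else if (sc->order && priority < DEF_PRIORITY - 2)
		sc->mode = mode;	/* small order, but struggling */
	else
		sc->mode = LUMPY_MODE_NONE;
}

void disable_lumpy_mode(struct sc_model *sc)
{
	sc->mode = LUMPY_MODE_NONE;
}
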
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 355a9e669aaa..cd2e42be7b68 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -17,6 +17,8 @@
17#include <linux/vmstat.h> 17#include <linux/vmstat.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/math64.h> 19#include <linux/math64.h>
20#include <linux/writeback.h>
21#include <linux/compaction.h>
20 22
21#ifdef CONFIG_VM_EVENT_COUNTERS 23#ifdef CONFIG_VM_EVENT_COUNTERS
22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 24DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -394,6 +396,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
394#endif 396#endif
395 397
396#ifdef CONFIG_COMPACTION 398#ifdef CONFIG_COMPACTION
399
397struct contig_page_info { 400struct contig_page_info {
398 unsigned long free_pages; 401 unsigned long free_pages;
399 unsigned long free_blocks_total; 402 unsigned long free_blocks_total;
@@ -745,6 +748,11 @@ static const char * const vmstat_text[] = {
745 "nr_isolated_anon", 748 "nr_isolated_anon",
746 "nr_isolated_file", 749 "nr_isolated_file",
747 "nr_shmem", 750 "nr_shmem",
751 "nr_dirtied",
752 "nr_written",
753 "nr_dirty_threshold",
754 "nr_dirty_background_threshold",
755
748#ifdef CONFIG_NUMA 756#ifdef CONFIG_NUMA
749 "numa_hit", 757 "numa_hit",
750 "numa_miss", 758 "numa_miss",
@@ -904,36 +912,44 @@ static const struct file_operations proc_zoneinfo_file_operations = {
904 .release = seq_release, 912 .release = seq_release,
905}; 913};
906 914
915enum writeback_stat_item {
916 NR_DIRTY_THRESHOLD,
917 NR_DIRTY_BG_THRESHOLD,
918 NR_VM_WRITEBACK_STAT_ITEMS,
919};
920
907static void *vmstat_start(struct seq_file *m, loff_t *pos) 921static void *vmstat_start(struct seq_file *m, loff_t *pos)
908{ 922{
909 unsigned long *v; 923 unsigned long *v;
910#ifdef CONFIG_VM_EVENT_COUNTERS 924 int i, stat_items_size;
911 unsigned long *e;
912#endif
913 int i;
914 925
915 if (*pos >= ARRAY_SIZE(vmstat_text)) 926 if (*pos >= ARRAY_SIZE(vmstat_text))
916 return NULL; 927 return NULL;
928 stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
929 NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
917 930
918#ifdef CONFIG_VM_EVENT_COUNTERS 931#ifdef CONFIG_VM_EVENT_COUNTERS
919 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) 932 stat_items_size += sizeof(struct vm_event_state);
920 + sizeof(struct vm_event_state), GFP_KERNEL);
921#else
922 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
923 GFP_KERNEL);
924#endif 933#endif
934
935 v = kmalloc(stat_items_size, GFP_KERNEL);
925 m->private = v; 936 m->private = v;
926 if (!v) 937 if (!v)
927 return ERR_PTR(-ENOMEM); 938 return ERR_PTR(-ENOMEM);
928 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 939 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
929 v[i] = global_page_state(i); 940 v[i] = global_page_state(i);
941 v += NR_VM_ZONE_STAT_ITEMS;
942
943 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
944 v + NR_DIRTY_THRESHOLD);
945 v += NR_VM_WRITEBACK_STAT_ITEMS;
946
930#ifdef CONFIG_VM_EVENT_COUNTERS 947#ifdef CONFIG_VM_EVENT_COUNTERS
931 e = v + NR_VM_ZONE_STAT_ITEMS; 948 all_vm_events(v);
932 all_vm_events(e); 949 v[PGPGIN] /= 2; /* sectors -> kbytes */
933 e[PGPGIN] /= 2; /* sectors -> kbytes */ 950 v[PGPGOUT] /= 2;
934 e[PGPGOUT] /= 2;
935#endif 951#endif
936 return v + *pos; 952 return m->private + *pos;
937} 953}
938 954
939static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 955static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
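vmstat_start() now sizes one flat buffer for the zone counters, the two
writeback thresholds, and (when configured) the event counters, bumping a
cursor past each group as it fills them. A userspace model of that layout
with illustrative sizes (build_stats and the item counts are stand-ins):

#include <stdlib.h>

#define NR_ZONE_ITEMS      4	/* stands in for NR_VM_ZONE_STAT_ITEMS */
#define NR_WRITEBACK_ITEMS 2	/* dirty + background dirty thresholds */
#define NR_EVENT_ITEMS     3	/* stands in for the vm_event counters */

unsigned long *build_stats(void)
{
	unsigned long *v = calloc(NR_ZONE_ITEMS + NR_WRITEBACK_ITEMS +
				  NR_EVENT_ITEMS, sizeof(*v));
	unsigned long *zones, *wb, *events;

	if (!v)
		return NULL;
	zones  = v;				/* group 1: zone counters */
	wb     = zones + NR_ZONE_ITEMS;		/* group 2: thresholds */
	events = wb + NR_WRITEBACK_ITEMS;	/* group 3: event counters */

	zones[0]  = 123;	/* global_page_state(i) fills these */
	wb[0]     = 456;	/* NR_DIRTY_THRESHOLD slot */
	wb[1]     = 78;		/* NR_DIRTY_BG_THRESHOLD slot */
	events[0] = 2048 / 2;	/* PGPGIN: sectors -> kbytes, as above */

	return v;	/* the seq_file indexes this flat array by *pos */
}
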