author    Takashi Iwai <tiwai@suse.de>    2010-11-03 10:51:26 -0400
committer Takashi Iwai <tiwai@suse.de>    2010-11-03 10:51:26 -0400
commit    69dbdd819599e2f3b77c172e83af512845bca5ad (patch)
tree      49939d8b80ec2115a801eae2aebc21f23867c876 /mm
parent    87232dd49aeb6b7d1af291edca8bd129a82ef4b5 (diff)
parent    75e3f3137cb570661c2ad3035a139dda671fbb63 (diff)
Merge branch 'fix/asoc' into for-linus
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    8
-rw-r--r--  mm/Makefile           |    7
-rw-r--r--  mm/backing-dev.c      |   74
-rw-r--r--  mm/bootmem.c          |   13
-rw-r--r--  mm/dmapool.c          |    2
-rw-r--r--  mm/filemap.c          |   42
-rw-r--r--  mm/highmem.c          |   66
-rw-r--r--  mm/hugetlb.c          |  238
-rw-r--r--  mm/internal.h         |    2
-rw-r--r--  mm/maccess.c          |    2
-rw-r--r--  mm/memblock.c         |  837
-rw-r--r--  mm/memcontrol.c       |  406
-rw-r--r--  mm/memory-failure.c   |  176
-rw-r--r--  mm/memory.c           |   37
-rw-r--r--  mm/memory_hotplug.c   |   50
-rw-r--r--  mm/mempolicy.c        |   17
-rw-r--r--  mm/migrate.c          |  249
-rw-r--r--  mm/mmap.c             |    2
-rw-r--r--  mm/mremap.c           |    4
-rw-r--r--  mm/nommu.c            |   51
-rw-r--r--  mm/oom_kill.c         |   33
-rw-r--r--  mm/page-writeback.c   |   31
-rw-r--r--  mm/page_alloc.c       |  185
-rw-r--r--  mm/page_isolation.c   |    3
-rw-r--r--  mm/percpu-km.c        |    8
-rw-r--r--  mm/percpu.c           |  403
-rw-r--r--  mm/percpu_up.c        |   30
-rw-r--r--  mm/rmap.c             |   37
-rw-r--r--  mm/shmem.c            |   17
-rw-r--r--  mm/slab.c             |    2
-rw-r--r--  mm/slob.c             |    4
-rw-r--r--  mm/slub.c             |  788
-rw-r--r--  mm/sparse-vmemmap.c   |   11
-rw-r--r--  mm/swap.c             |    1
-rw-r--r--  mm/swapfile.c         |   55
-rw-r--r--  mm/util.c             |   13
-rw-r--r--  mm/vmalloc.c          |   67
-rw-r--r--  mm/vmscan.c           |  222
-rw-r--r--  mm/vmstat.c           |   44
39 files changed, 2779 insertions, 1458 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f0fb9124e410..c2c8a4a11898 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -301,3 +301,11 @@ config NOMMU_INITIAL_TRIM_EXCESS
301 of 1 says that all excess pages should be trimmed. 301 of 1 says that all excess pages should be trimmed.
302 302
303 See Documentation/nommu-mmap.txt for more information. 303 See Documentation/nommu-mmap.txt for more information.
304
305#
306# UP and nommu archs use km based percpu allocator
307#
308config NEED_PER_CPU_KM
309 depends on !SMP
310 bool
311 default y
diff --git a/mm/Makefile b/mm/Makefile
index 34b2546a9e37..f73f75a29f82 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o mmu_context.o \ 14 page_isolation.o mm_init.o mmu_context.o percpu.o \
15 $(mmu-y) 15 $(mmu-y)
16obj-y += init-mm.o 16obj-y += init-mm.o
17 17
@@ -36,11 +36,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
36obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 36obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
37obj-$(CONFIG_FS_XIP) += filemap_xip.o 37obj-$(CONFIG_FS_XIP) += filemap_xip.o
38obj-$(CONFIG_MIGRATION) += migrate.o 38obj-$(CONFIG_MIGRATION) += migrate.o
39ifdef CONFIG_SMP
40obj-y += percpu.o
41else
42obj-y += percpu_up.o
43endif
44obj-$(CONFIG_QUICKLIST) += quicklist.o 39obj-$(CONFIG_QUICKLIST) += quicklist.o
45obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
46obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 41obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 65d420499a61..027100d30227 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -74,11 +74,11 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
74 74
75 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 75 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
76 spin_lock(&inode_lock); 76 spin_lock(&inode_lock);
77 list_for_each_entry(inode, &wb->b_dirty, i_list) 77 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
78 nr_dirty++; 78 nr_dirty++;
79 list_for_each_entry(inode, &wb->b_io, i_list) 79 list_for_each_entry(inode, &wb->b_io, i_wb_list)
80 nr_io++; 80 nr_io++;
81 list_for_each_entry(inode, &wb->b_more_io, i_list) 81 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
82 nr_more_io++; 82 nr_more_io++;
83 spin_unlock(&inode_lock); 83 spin_unlock(&inode_lock);
84 84
@@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr)
362{ 362{
363 struct bdi_writeback *me = ptr; 363 struct bdi_writeback *me = ptr;
364 364
365 current->flags |= PF_FLUSHER | PF_SWAPWRITE; 365 current->flags |= PF_SWAPWRITE;
366 set_freezable(); 366 set_freezable();
367 367
368 /* 368 /*
@@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
729 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 729 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
730 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 730 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
731 }; 731 };
732static atomic_t nr_bdi_congested[2];
732 733
733void clear_bdi_congested(struct backing_dev_info *bdi, int sync) 734void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
734{ 735{
@@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
736 wait_queue_head_t *wqh = &congestion_wqh[sync]; 737 wait_queue_head_t *wqh = &congestion_wqh[sync];
737 738
738 bit = sync ? BDI_sync_congested : BDI_async_congested; 739 bit = sync ? BDI_sync_congested : BDI_async_congested;
739 clear_bit(bit, &bdi->state); 740 if (test_and_clear_bit(bit, &bdi->state))
741 atomic_dec(&nr_bdi_congested[sync]);
740 smp_mb__after_clear_bit(); 742 smp_mb__after_clear_bit();
741 if (waitqueue_active(wqh)) 743 if (waitqueue_active(wqh))
742 wake_up(wqh); 744 wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
748 enum bdi_state bit; 750 enum bdi_state bit;
749 751
750 bit = sync ? BDI_sync_congested : BDI_async_congested; 752 bit = sync ? BDI_sync_congested : BDI_async_congested;
751 set_bit(bit, &bdi->state); 753 if (!test_and_set_bit(bit, &bdi->state))
754 atomic_inc(&nr_bdi_congested[sync]);
752} 755}
753EXPORT_SYMBOL(set_bdi_congested); 756EXPORT_SYMBOL(set_bdi_congested);
754 757
@@ -764,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested);
764long congestion_wait(int sync, long timeout) 767long congestion_wait(int sync, long timeout)
765{ 768{
766 long ret; 769 long ret;
770 unsigned long start = jiffies;
767 DEFINE_WAIT(wait); 771 DEFINE_WAIT(wait);
768 wait_queue_head_t *wqh = &congestion_wqh[sync]; 772 wait_queue_head_t *wqh = &congestion_wqh[sync];
769 773
770 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 774 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
771 ret = io_schedule_timeout(timeout); 775 ret = io_schedule_timeout(timeout);
772 finish_wait(wqh, &wait); 776 finish_wait(wqh, &wait);
777
778 trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
779 jiffies_to_usecs(jiffies - start));
780
773 return ret; 781 return ret;
774} 782}
775EXPORT_SYMBOL(congestion_wait); 783EXPORT_SYMBOL(congestion_wait);
776 784
785/**
786 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
787 * @zone: A zone to check if it is heavily congested
788 * @sync: SYNC or ASYNC IO
789 * @timeout: timeout in jiffies
790 *
791 * In the event of a congested backing_dev (any backing_dev) and the given
792 * @zone has experienced recent congestion, this waits for up to @timeout
793 * jiffies for either a BDI to exit congestion of the given @sync queue
794 * or a write to complete.
795 *
796 * In the absense of zone congestion, cond_resched() is called to yield
797 * the processor if necessary but otherwise does not sleep.
798 *
799 * The return value is 0 if the sleep is for the full timeout. Otherwise,
800 * it is the number of jiffies that were still remaining when the function
801 * returned. return_value == timeout implies the function did not sleep.
802 */
803long wait_iff_congested(struct zone *zone, int sync, long timeout)
804{
805 long ret;
806 unsigned long start = jiffies;
807 DEFINE_WAIT(wait);
808 wait_queue_head_t *wqh = &congestion_wqh[sync];
809
810 /*
811 * If there is no congestion, or heavy congestion is not being
812 * encountered in the current zone, yield if necessary instead
813 * of sleeping on the congestion queue
814 */
815 if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
816 !zone_is_reclaim_congested(zone)) {
817 cond_resched();
818
819 /* In case we scheduled, work out time remaining */
820 ret = timeout - (jiffies - start);
821 if (ret < 0)
822 ret = 0;
823
824 goto out;
825 }
826
827 /* Sleep until uncongested or a write happens */
828 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
829 ret = io_schedule_timeout(timeout);
830 finish_wait(wqh, &wait);
831
832out:
833 trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
834 jiffies_to_usecs(jiffies - start));
835
836 return ret;
837}
838EXPORT_SYMBOL(wait_iff_congested);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 142c84a54993..13b0caa9793c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
17#include <linux/range.h> 17#include <linux/range.h>
18#include <linux/memblock.h>
18 19
19#include <asm/bug.h> 20#include <asm/bug.h>
20#include <asm/io.h> 21#include <asm/io.h>
@@ -434,7 +435,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
434 unsigned long size) 435 unsigned long size)
435{ 436{
436#ifdef CONFIG_NO_BOOTMEM 437#ifdef CONFIG_NO_BOOTMEM
437 free_early(physaddr, physaddr + size); 438 kmemleak_free_part(__va(physaddr), size);
439 memblock_x86_free_range(physaddr, physaddr + size);
438#else 440#else
439 unsigned long start, end; 441 unsigned long start, end;
440 442
@@ -459,7 +461,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
459void __init free_bootmem(unsigned long addr, unsigned long size) 461void __init free_bootmem(unsigned long addr, unsigned long size)
460{ 462{
461#ifdef CONFIG_NO_BOOTMEM 463#ifdef CONFIG_NO_BOOTMEM
462 free_early(addr, addr + size); 464 kmemleak_free_part(__va(addr), size);
465 memblock_x86_free_range(addr, addr + size);
463#else 466#else
464 unsigned long start, end; 467 unsigned long start, end;
465 468
@@ -526,6 +529,12 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
526} 529}
527 530
528#ifndef CONFIG_NO_BOOTMEM 531#ifndef CONFIG_NO_BOOTMEM
532int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
533 int flags)
534{
535 return reserve_bootmem(phys, len, flags);
536}
537
529static unsigned long __init align_idx(struct bootmem_data *bdata, 538static unsigned long __init align_idx(struct bootmem_data *bdata,
530 unsigned long idx, unsigned long step) 539 unsigned long idx, unsigned long step)
531{ 540{
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 3df063706f53..4df2de77e069 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
311 size_t offset; 311 size_t offset;
312 void *retval; 312 void *retval;
313 313
314 might_sleep_if(mem_flags & __GFP_WAIT);
315
314 spin_lock_irqsave(&pool->lock, flags); 316 spin_lock_irqsave(&pool->lock, flags);
315 restart: 317 restart:
316 list_for_each_entry(page, &pool->page_list, page_list) { 318 list_for_each_entry(page, &pool->page_list, page_list) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4df44e4221..75572b5f2374 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -612,6 +612,19 @@ void __lock_page_nosync(struct page *page)
612 TASK_UNINTERRUPTIBLE); 612 TASK_UNINTERRUPTIBLE);
613} 613}
614 614
615int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
616 unsigned int flags)
617{
618 if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
619 __lock_page(page);
620 return 1;
621 } else {
622 up_read(&mm->mmap_sem);
623 wait_on_page_locked(page);
624 return 0;
625 }
626}
627
615/** 628/**
616 * find_get_page - find and get a page reference 629 * find_get_page - find and get a page reference
617 * @mapping: the address_space to search 630 * @mapping: the address_space to search
@@ -1539,25 +1552,28 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1539 * waiting for the lock. 1552 * waiting for the lock.
1540 */ 1553 */
1541 do_async_mmap_readahead(vma, ra, file, page, offset); 1554 do_async_mmap_readahead(vma, ra, file, page, offset);
1542 lock_page(page);
1543
1544 /* Did it get truncated? */
1545 if (unlikely(page->mapping != mapping)) {
1546 unlock_page(page);
1547 put_page(page);
1548 goto no_cached_page;
1549 }
1550 } else { 1555 } else {
1551 /* No page in the page cache at all */ 1556 /* No page in the page cache at all */
1552 do_sync_mmap_readahead(vma, ra, file, offset); 1557 do_sync_mmap_readahead(vma, ra, file, offset);
1553 count_vm_event(PGMAJFAULT); 1558 count_vm_event(PGMAJFAULT);
1554 ret = VM_FAULT_MAJOR; 1559 ret = VM_FAULT_MAJOR;
1555retry_find: 1560retry_find:
1556 page = find_lock_page(mapping, offset); 1561 page = find_get_page(mapping, offset);
1557 if (!page) 1562 if (!page)
1558 goto no_cached_page; 1563 goto no_cached_page;
1559 } 1564 }
1560 1565
1566 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
1567 return ret | VM_FAULT_RETRY;
1568
1569 /* Did it get truncated? */
1570 if (unlikely(page->mapping != mapping)) {
1571 unlock_page(page);
1572 put_page(page);
1573 goto retry_find;
1574 }
1575 VM_BUG_ON(page->index != offset);
1576
1561 /* 1577 /*
1562 * We have a locked page in the page cache, now we need to check 1578 * We have a locked page in the page cache, now we need to check
1563 * that it's up-to-date. If not, it is going to be due to an error. 1579 * that it's up-to-date. If not, it is going to be due to an error.
@@ -2177,12 +2193,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2177 } 2193 }
2178 2194
2179 if (written > 0) { 2195 if (written > 0) {
2180 loff_t end = pos + written; 2196 pos += written;
2181 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2197 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2182 i_size_write(inode, end); 2198 i_size_write(inode, pos);
2183 mark_inode_dirty(inode); 2199 mark_inode_dirty(inode);
2184 } 2200 }
2185 *ppos = end; 2201 *ppos = pos;
2186 } 2202 }
2187out: 2203out:
2188 return written; 2204 return written;
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a0aa1be4993..693394daa2ed 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,6 +29,11 @@
29#include <linux/kgdb.h> 29#include <linux/kgdb.h>
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31 31
32
33#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
34DEFINE_PER_CPU(int, __kmap_atomic_idx);
35#endif
36
32/* 37/*
33 * Virtual_count is not a pure "count". 38 * Virtual_count is not a pure "count".
34 * 0 means that it is not mapped, and has not been mapped 39 * 0 means that it is not mapped, and has not been mapped
@@ -42,6 +47,9 @@
42unsigned long totalhigh_pages __read_mostly; 47unsigned long totalhigh_pages __read_mostly;
43EXPORT_SYMBOL(totalhigh_pages); 48EXPORT_SYMBOL(totalhigh_pages);
44 49
50
51EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
52
45unsigned int nr_free_highpages (void) 53unsigned int nr_free_highpages (void)
46{ 54{
47 pg_data_t *pgdat; 55 pg_data_t *pgdat;
@@ -422,61 +430,3 @@ void __init page_address_init(void)
422} 430}
423 431
424#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 432#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
425
426#ifdef CONFIG_DEBUG_HIGHMEM
427
428void debug_kmap_atomic(enum km_type type)
429{
430 static int warn_count = 10;
431
432 if (unlikely(warn_count < 0))
433 return;
434
435 if (unlikely(in_interrupt())) {
436 if (in_nmi()) {
437 if (type != KM_NMI && type != KM_NMI_PTE) {
438 WARN_ON(1);
439 warn_count--;
440 }
441 } else if (in_irq()) {
442 if (type != KM_IRQ0 && type != KM_IRQ1 &&
443 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
444 type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
445 WARN_ON(1);
446 warn_count--;
447 }
448 } else if (!irqs_disabled()) { /* softirq */
449 if (type != KM_IRQ0 && type != KM_IRQ1 &&
450 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
451 type != KM_SKB_SUNRPC_DATA &&
452 type != KM_SKB_DATA_SOFTIRQ &&
453 type != KM_BOUNCE_READ) {
454 WARN_ON(1);
455 warn_count--;
456 }
457 }
458 }
459
460 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
461 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
462 type == KM_IRQ_PTE || type == KM_NMI ||
463 type == KM_NMI_PTE ) {
464 if (!irqs_disabled()) {
465 WARN_ON(1);
466 warn_count--;
467 }
468 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
469 if (irq_count() == 0 && !irqs_disabled()) {
470 WARN_ON(1);
471 warn_count--;
472 }
473 }
474#ifdef CONFIG_KGDB_KDB
475 if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) {
476 WARN_ON(1);
477 warn_count--;
478 }
479#endif /* CONFIG_KGDB_KDB */
480}
481
482#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..c4a3558589ab 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
423 } 423 }
424} 424}
425 425
426static void copy_gigantic_page(struct page *dst, struct page *src, 426static void copy_user_gigantic_page(struct page *dst, struct page *src,
427 unsigned long addr, struct vm_area_struct *vma) 427 unsigned long addr, struct vm_area_struct *vma)
428{ 428{
429 int i; 429 int i;
430 struct hstate *h = hstate_vma(vma); 430 struct hstate *h = hstate_vma(vma);
431 struct page *dst_base = dst; 431 struct page *dst_base = dst;
432 struct page *src_base = src; 432 struct page *src_base = src;
433 might_sleep(); 433
434 for (i = 0; i < pages_per_huge_page(h); ) { 434 for (i = 0; i < pages_per_huge_page(h); ) {
435 cond_resched(); 435 cond_resched();
436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); 436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
440 src = mem_map_next(src, src_base, i); 440 src = mem_map_next(src, src_base, i);
441 } 441 }
442} 442}
443static void copy_huge_page(struct page *dst, struct page *src, 443
444static void copy_user_huge_page(struct page *dst, struct page *src,
444 unsigned long addr, struct vm_area_struct *vma) 445 unsigned long addr, struct vm_area_struct *vma)
445{ 446{
446 int i; 447 int i;
447 struct hstate *h = hstate_vma(vma); 448 struct hstate *h = hstate_vma(vma);
448 449
449 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { 450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
450 copy_gigantic_page(dst, src, addr, vma); 451 copy_user_gigantic_page(dst, src, addr, vma);
451 return; 452 return;
452 } 453 }
453 454
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
458 } 459 }
459} 460}
460 461
462static void copy_gigantic_page(struct page *dst, struct page *src)
463{
464 int i;
465 struct hstate *h = page_hstate(src);
466 struct page *dst_base = dst;
467 struct page *src_base = src;
468
469 for (i = 0; i < pages_per_huge_page(h); ) {
470 cond_resched();
471 copy_highpage(dst, src);
472
473 i++;
474 dst = mem_map_next(dst, dst_base, i);
475 src = mem_map_next(src, src_base, i);
476 }
477}
478
479void copy_huge_page(struct page *dst, struct page *src)
480{
481 int i;
482 struct hstate *h = page_hstate(src);
483
484 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
485 copy_gigantic_page(dst, src);
486 return;
487 }
488
489 might_sleep();
490 for (i = 0; i < pages_per_huge_page(h); i++) {
491 cond_resched();
492 copy_highpage(dst + i, src + i);
493 }
494}
495
461static void enqueue_huge_page(struct hstate *h, struct page *page) 496static void enqueue_huge_page(struct hstate *h, struct page *page)
462{ 497{
463 int nid = page_to_nid(page); 498 int nid = page_to_nid(page);
@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
466 h->free_huge_pages_node[nid]++; 501 h->free_huge_pages_node[nid]++;
467} 502}
468 503
504static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
505{
506 struct page *page;
507
508 if (list_empty(&h->hugepage_freelists[nid]))
509 return NULL;
510 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
511 list_del(&page->lru);
512 set_page_refcounted(page);
513 h->free_huge_pages--;
514 h->free_huge_pages_node[nid]--;
515 return page;
516}
517
469static struct page *dequeue_huge_page_vma(struct hstate *h, 518static struct page *dequeue_huge_page_vma(struct hstate *h,
470 struct vm_area_struct *vma, 519 struct vm_area_struct *vma,
471 unsigned long address, int avoid_reserve) 520 unsigned long address, int avoid_reserve)
472{ 521{
473 int nid;
474 struct page *page = NULL; 522 struct page *page = NULL;
475 struct mempolicy *mpol; 523 struct mempolicy *mpol;
476 nodemask_t *nodemask; 524 nodemask_t *nodemask;
@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
496 544
497 for_each_zone_zonelist_nodemask(zone, z, zonelist, 545 for_each_zone_zonelist_nodemask(zone, z, zonelist,
498 MAX_NR_ZONES - 1, nodemask) { 546 MAX_NR_ZONES - 1, nodemask) {
499 nid = zone_to_nid(zone); 547 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
500 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 548 page = dequeue_huge_page_node(h, zone_to_nid(zone));
501 !list_empty(&h->hugepage_freelists[nid])) { 549 if (page) {
502 page = list_entry(h->hugepage_freelists[nid].next, 550 if (!avoid_reserve)
503 struct page, lru); 551 decrement_hugepage_resv_vma(h, vma);
504 list_del(&page->lru); 552 break;
505 h->free_huge_pages--; 553 }
506 h->free_huge_pages_node[nid]--;
507
508 if (!avoid_reserve)
509 decrement_hugepage_resv_vma(h, vma);
510
511 break;
512 } 554 }
513 } 555 }
514err: 556err:
@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
770 return ret; 812 return ret;
771} 813}
772 814
773static struct page *alloc_buddy_huge_page(struct hstate *h, 815static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
774 struct vm_area_struct *vma, unsigned long address)
775{ 816{
776 struct page *page; 817 struct page *page;
777 unsigned int nid; 818 unsigned int r_nid;
778 819
779 if (h->order >= MAX_ORDER) 820 if (h->order >= MAX_ORDER)
780 return NULL; 821 return NULL;
@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
812 } 853 }
813 spin_unlock(&hugetlb_lock); 854 spin_unlock(&hugetlb_lock);
814 855
815 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 856 if (nid == NUMA_NO_NODE)
816 __GFP_REPEAT|__GFP_NOWARN, 857 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
817 huge_page_order(h)); 858 __GFP_REPEAT|__GFP_NOWARN,
859 huge_page_order(h));
860 else
861 page = alloc_pages_exact_node(nid,
862 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
863 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
818 864
819 if (page && arch_prepare_hugepage(page)) { 865 if (page && arch_prepare_hugepage(page)) {
820 __free_pages(page, huge_page_order(h)); 866 __free_pages(page, huge_page_order(h));
@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
823 869
824 spin_lock(&hugetlb_lock); 870 spin_lock(&hugetlb_lock);
825 if (page) { 871 if (page) {
826 /* 872 r_nid = page_to_nid(page);
827 * This page is now managed by the hugetlb allocator and has
828 * no users -- drop the buddy allocator's reference.
829 */
830 put_page_testzero(page);
831 VM_BUG_ON(page_count(page));
832 nid = page_to_nid(page);
833 set_compound_page_dtor(page, free_huge_page); 873 set_compound_page_dtor(page, free_huge_page);
834 /* 874 /*
835 * We incremented the global counters already 875 * We incremented the global counters already
836 */ 876 */
837 h->nr_huge_pages_node[nid]++; 877 h->nr_huge_pages_node[r_nid]++;
838 h->surplus_huge_pages_node[nid]++; 878 h->surplus_huge_pages_node[r_nid]++;
839 __count_vm_event(HTLB_BUDDY_PGALLOC); 879 __count_vm_event(HTLB_BUDDY_PGALLOC);
840 } else { 880 } else {
841 h->nr_huge_pages--; 881 h->nr_huge_pages--;
@@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
848} 888}
849 889
850/* 890/*
891 * This allocation function is useful in the context where vma is irrelevant.
892 * E.g. soft-offlining uses this function because it only cares physical
893 * address of error page.
894 */
895struct page *alloc_huge_page_node(struct hstate *h, int nid)
896{
897 struct page *page;
898
899 spin_lock(&hugetlb_lock);
900 page = dequeue_huge_page_node(h, nid);
901 spin_unlock(&hugetlb_lock);
902
903 if (!page)
904 page = alloc_buddy_huge_page(h, nid);
905
906 return page;
907}
908
909/*
851 * Increase the hugetlb pool such that it can accomodate a reservation 910 * Increase the hugetlb pool such that it can accomodate a reservation
852 * of size 'delta'. 911 * of size 'delta'.
853 */ 912 */
@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
871retry: 930retry:
872 spin_unlock(&hugetlb_lock); 931 spin_unlock(&hugetlb_lock);
873 for (i = 0; i < needed; i++) { 932 for (i = 0; i < needed; i++) {
874 page = alloc_buddy_huge_page(h, NULL, 0); 933 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
875 if (!page) { 934 if (!page)
876 /* 935 /*
877 * We were not able to allocate enough pages to 936 * We were not able to allocate enough pages to
878 * satisfy the entire reservation so we free what 937 * satisfy the entire reservation so we free what
879 * we've allocated so far. 938 * we've allocated so far.
880 */ 939 */
881 spin_lock(&hugetlb_lock);
882 needed = 0;
883 goto free; 940 goto free;
884 }
885 941
886 list_add(&page->lru, &surplus_list); 942 list_add(&page->lru, &surplus_list);
887 } 943 }
@@ -908,31 +964,31 @@ retry:
908 needed += allocated; 964 needed += allocated;
909 h->resv_huge_pages += delta; 965 h->resv_huge_pages += delta;
910 ret = 0; 966 ret = 0;
911free: 967
968 spin_unlock(&hugetlb_lock);
912 /* Free the needed pages to the hugetlb pool */ 969 /* Free the needed pages to the hugetlb pool */
913 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 970 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
914 if ((--needed) < 0) 971 if ((--needed) < 0)
915 break; 972 break;
916 list_del(&page->lru); 973 list_del(&page->lru);
974 /*
975 * This page is now managed by the hugetlb allocator and has
976 * no users -- drop the buddy allocator's reference.
977 */
978 put_page_testzero(page);
979 VM_BUG_ON(page_count(page));
917 enqueue_huge_page(h, page); 980 enqueue_huge_page(h, page);
918 } 981 }
919 982
920 /* Free unnecessary surplus pages to the buddy allocator */ 983 /* Free unnecessary surplus pages to the buddy allocator */
984free:
921 if (!list_empty(&surplus_list)) { 985 if (!list_empty(&surplus_list)) {
922 spin_unlock(&hugetlb_lock);
923 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 986 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
924 list_del(&page->lru); 987 list_del(&page->lru);
925 /* 988 put_page(page);
926 * The page has a reference count of zero already, so
927 * call free_huge_page directly instead of using
928 * put_page. This must be done with hugetlb_lock
929 * unlocked which is safe because free_huge_page takes
930 * hugetlb_lock before deciding how to free the page.
931 */
932 free_huge_page(page);
933 } 989 }
934 spin_lock(&hugetlb_lock);
935 } 990 }
991 spin_lock(&hugetlb_lock);
936 992
937 return ret; 993 return ret;
938} 994}
@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1052 spin_unlock(&hugetlb_lock); 1108 spin_unlock(&hugetlb_lock);
1053 1109
1054 if (!page) { 1110 if (!page) {
1055 page = alloc_buddy_huge_page(h, vma, addr); 1111 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1056 if (!page) { 1112 if (!page) {
1057 hugetlb_put_quota(inode->i_mapping, chg); 1113 hugetlb_put_quota(inode->i_mapping, chg);
1058 return ERR_PTR(-VM_FAULT_SIGBUS); 1114 return ERR_PTR(-VM_FAULT_SIGBUS);
1059 } 1115 }
1060 } 1116 }
1061 1117
1062 set_page_refcounted(page);
1063 set_page_private(page, (unsigned long) mapping); 1118 set_page_private(page, (unsigned long) mapping);
1064 1119
1065 vma_commit_reservation(h, vma, addr); 1120 vma_commit_reservation(h, vma, addr);
@@ -2153,6 +2208,19 @@ nomem:
2153 return -ENOMEM; 2208 return -ENOMEM;
2154} 2209}
2155 2210
2211static int is_hugetlb_entry_migration(pte_t pte)
2212{
2213 swp_entry_t swp;
2214
2215 if (huge_pte_none(pte) || pte_present(pte))
2216 return 0;
2217 swp = pte_to_swp_entry(pte);
2218 if (non_swap_entry(swp) && is_migration_entry(swp)) {
2219 return 1;
2220 } else
2221 return 0;
2222}
2223
2156static int is_hugetlb_entry_hwpoisoned(pte_t pte) 2224static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2157{ 2225{
2158 swp_entry_t swp; 2226 swp_entry_t swp;
@@ -2380,10 +2448,13 @@ retry_avoidcopy:
2380 * When the original hugepage is shared one, it does not have 2448 * When the original hugepage is shared one, it does not have
2381 * anon_vma prepared. 2449 * anon_vma prepared.
2382 */ 2450 */
2383 if (unlikely(anon_vma_prepare(vma))) 2451 if (unlikely(anon_vma_prepare(vma))) {
2452 /* Caller expects lock to be held */
2453 spin_lock(&mm->page_table_lock);
2384 return VM_FAULT_OOM; 2454 return VM_FAULT_OOM;
2455 }
2385 2456
2386 copy_huge_page(new_page, old_page, address, vma); 2457 copy_user_huge_page(new_page, old_page, address, vma);
2387 __SetPageUptodate(new_page); 2458 __SetPageUptodate(new_page);
2388 2459
2389 /* 2460 /*
@@ -2515,22 +2586,20 @@ retry:
2515 hugepage_add_new_anon_rmap(page, vma, address); 2586 hugepage_add_new_anon_rmap(page, vma, address);
2516 } 2587 }
2517 } else { 2588 } else {
2589 /*
2590 * If memory error occurs between mmap() and fault, some process
2591 * don't have hwpoisoned swap entry for errored virtual address.
2592 * So we need to block hugepage fault by PG_hwpoison bit check.
2593 */
2594 if (unlikely(PageHWPoison(page))) {
2595 ret = VM_FAULT_HWPOISON |
2596 VM_FAULT_SET_HINDEX(h - hstates);
2597 goto backout_unlocked;
2598 }
2518 page_dup_rmap(page); 2599 page_dup_rmap(page);
2519 } 2600 }
2520 2601
2521 /* 2602 /*
2522 * Since memory error handler replaces pte into hwpoison swap entry
2523 * at the time of error handling, a process which reserved but not have
2524 * the mapping to the error hugepage does not have hwpoison swap entry.
2525 * So we need to block accesses from such a process by checking
2526 * PG_hwpoison bit here.
2527 */
2528 if (unlikely(PageHWPoison(page))) {
2529 ret = VM_FAULT_HWPOISON;
2530 goto backout_unlocked;
2531 }
2532
2533 /*
2534 * If we are going to COW a private mapping later, we examine the 2603 * If we are going to COW a private mapping later, we examine the
2535 * pending reservations for this page now. This will ensure that 2604 * pending reservations for this page now. This will ensure that
2536 * any allocations necessary to record that reservation occur outside 2605 * any allocations necessary to record that reservation occur outside
@@ -2587,8 +2656,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2587 ptep = huge_pte_offset(mm, address); 2656 ptep = huge_pte_offset(mm, address);
2588 if (ptep) { 2657 if (ptep) {
2589 entry = huge_ptep_get(ptep); 2658 entry = huge_ptep_get(ptep);
2590 if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2659 if (unlikely(is_hugetlb_entry_migration(entry))) {
2591 return VM_FAULT_HWPOISON; 2660 migration_entry_wait(mm, (pmd_t *)ptep, address);
2661 return 0;
2662 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2663 return VM_FAULT_HWPOISON_LARGE |
2664 VM_FAULT_SET_HINDEX(h - hstates);
2592 } 2665 }
2593 2666
2594 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2667 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2878,18 +2951,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2878 hugetlb_acct_memory(h, -(chg - freed)); 2951 hugetlb_acct_memory(h, -(chg - freed));
2879} 2952}
2880 2953
2954#ifdef CONFIG_MEMORY_FAILURE
2955
2956/* Should be called in hugetlb_lock */
2957static int is_hugepage_on_freelist(struct page *hpage)
2958{
2959 struct page *page;
2960 struct page *tmp;
2961 struct hstate *h = page_hstate(hpage);
2962 int nid = page_to_nid(hpage);
2963
2964 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
2965 if (page == hpage)
2966 return 1;
2967 return 0;
2968}
2969
2881/* 2970/*
2882 * This function is called from memory failure code. 2971 * This function is called from memory failure code.
2883 * Assume the caller holds page lock of the head page. 2972 * Assume the caller holds page lock of the head page.
2884 */ 2973 */
2885void __isolate_hwpoisoned_huge_page(struct page *hpage) 2974int dequeue_hwpoisoned_huge_page(struct page *hpage)
2886{ 2975{
2887 struct hstate *h = page_hstate(hpage); 2976 struct hstate *h = page_hstate(hpage);
2888 int nid = page_to_nid(hpage); 2977 int nid = page_to_nid(hpage);
2978 int ret = -EBUSY;
2889 2979
2890 spin_lock(&hugetlb_lock); 2980 spin_lock(&hugetlb_lock);
2891 list_del(&hpage->lru); 2981 if (is_hugepage_on_freelist(hpage)) {
2892 h->free_huge_pages--; 2982 list_del(&hpage->lru);
2893 h->free_huge_pages_node[nid]--; 2983 set_page_refcounted(hpage);
2984 h->free_huge_pages--;
2985 h->free_huge_pages_node[nid]--;
2986 ret = 0;
2987 }
2894 spin_unlock(&hugetlb_lock); 2988 spin_unlock(&hugetlb_lock);
2989 return ret;
2895} 2990}
2991#endif
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb97fc5..dedb0aff673f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page);
62 */ 62 */
63static inline unsigned long page_order(struct page *page) 63static inline unsigned long page_order(struct page *page)
64{ 64{
65 VM_BUG_ON(!PageBuddy(page)); 65 /* PageBuddy() must be checked by the caller */
66 return page_private(page); 66 return page_private(page);
67} 67}
68 68
diff --git a/mm/maccess.c b/mm/maccess.c
index 4e348dbaecd7..e2b6f5634e0d 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,9 +1,9 @@
1/* 1/*
2 * Access kernel memory without faulting. 2 * Access kernel memory without faulting.
3 */ 3 */
4#include <linux/uaccess.h>
5#include <linux/module.h> 4#include <linux/module.h>
6#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/uaccess.h>
7 7
8/** 8/**
9 * probe_kernel_read(): safely attempt to read from a location 9 * probe_kernel_read(): safely attempt to read from a location
diff --git a/mm/memblock.c b/mm/memblock.c
index 43840b305ecb..400dc62697d7 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -11,237 +11,423 @@
11 */ 11 */
12 12
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/bitops.h> 16#include <linux/bitops.h>
17#include <linux/poison.h>
18#include <linux/pfn.h>
19#include <linux/debugfs.h>
20#include <linux/seq_file.h>
16#include <linux/memblock.h> 21#include <linux/memblock.h>
17 22
18#define MEMBLOCK_ALLOC_ANYWHERE 0 23struct memblock memblock __initdata_memblock;
19 24
20struct memblock memblock; 25int memblock_debug __initdata_memblock;
26int memblock_can_resize __initdata_memblock;
27static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
28static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
21 29
22static int memblock_debug; 30/* inline so we don't get a warning when pr_debug is compiled out */
31static inline const char *memblock_type_name(struct memblock_type *type)
32{
33 if (type == &memblock.memory)
34 return "memory";
35 else if (type == &memblock.reserved)
36 return "reserved";
37 else
38 return "unknown";
39}
23 40
24static int __init early_memblock(char *p) 41/*
42 * Address comparison utilities
43 */
44
45static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
25{ 46{
26 if (p && strstr(p, "debug")) 47 return addr & ~(size - 1);
27 memblock_debug = 1; 48}
49
50static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
51{
52 return (addr + (size - 1)) & ~(size - 1);
53}
54
55static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
56 phys_addr_t base2, phys_addr_t size2)
57{
58 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
59}
60
61static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1,
62 phys_addr_t base2, phys_addr_t size2)
63{
64 if (base2 == base1 + size1)
65 return 1;
66 else if (base1 == base2 + size2)
67 return -1;
68
28 return 0; 69 return 0;
29} 70}
30early_param("memblock", early_memblock);
31 71
32static void memblock_dump(struct memblock_region *region, char *name) 72static long __init_memblock memblock_regions_adjacent(struct memblock_type *type,
73 unsigned long r1, unsigned long r2)
33{ 74{
34 unsigned long long base, size; 75 phys_addr_t base1 = type->regions[r1].base;
35 int i; 76 phys_addr_t size1 = type->regions[r1].size;
77 phys_addr_t base2 = type->regions[r2].base;
78 phys_addr_t size2 = type->regions[r2].size;
36 79
37 pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); 80 return memblock_addrs_adjacent(base1, size1, base2, size2);
81}
38 82
39 for (i = 0; i < region->cnt; i++) { 83long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
40 base = region->region[i].base; 84{
41 size = region->region[i].size; 85 unsigned long i;
42 86
43 pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n", 87 for (i = 0; i < type->cnt; i++) {
44 name, i, base, base + size - 1, size); 88 phys_addr_t rgnbase = type->regions[i].base;
89 phys_addr_t rgnsize = type->regions[i].size;
90 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
91 break;
45 } 92 }
93
94 return (i < type->cnt) ? i : -1;
46} 95}
47 96
48void memblock_dump_all(void) 97/*
98 * Find, allocate, deallocate or reserve unreserved regions. All allocations
99 * are top-down.
100 */
101
102static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end,
103 phys_addr_t size, phys_addr_t align)
49{ 104{
50 if (!memblock_debug) 105 phys_addr_t base, res_base;
51 return; 106 long j;
52 107
53 pr_info("MEMBLOCK configuration:\n"); 108 /* In case, huge size is requested */
54 pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size); 109 if (end < size)
55 pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size); 110 return MEMBLOCK_ERROR;
56 111
57 memblock_dump(&memblock.memory, "memory"); 112 base = memblock_align_down((end - size), align);
58 memblock_dump(&memblock.reserved, "reserved"); 113
114 /* Prevent allocations returning 0 as it's also used to
115 * indicate an allocation failure
116 */
117 if (start == 0)
118 start = PAGE_SIZE;
119
120 while (start <= base) {
121 j = memblock_overlaps_region(&memblock.reserved, base, size);
122 if (j < 0)
123 return base;
124 res_base = memblock.reserved.regions[j].base;
125 if (res_base < size)
126 break;
127 base = memblock_align_down(res_base - size, align);
128 }
129
130 return MEMBLOCK_ERROR;
59} 131}
60 132
61static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2, 133static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
62 u64 size2) 134 phys_addr_t align, phys_addr_t start, phys_addr_t end)
63{ 135{
64 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); 136 long i;
137
138 BUG_ON(0 == size);
139
140 size = memblock_align_up(size, align);
141
142 /* Pump up max_addr */
143 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
144 end = memblock.current_limit;
145
146 /* We do a top-down search, this tends to limit memory
147 * fragmentation by keeping early boot allocs near the
148 * top of memory
149 */
150 for (i = memblock.memory.cnt - 1; i >= 0; i--) {
151 phys_addr_t memblockbase = memblock.memory.regions[i].base;
152 phys_addr_t memblocksize = memblock.memory.regions[i].size;
153 phys_addr_t bottom, top, found;
154
155 if (memblocksize < size)
156 continue;
157 if ((memblockbase + memblocksize) <= start)
158 break;
159 bottom = max(memblockbase, start);
160 top = min(memblockbase + memblocksize, end);
161 if (bottom >= top)
162 continue;
163 found = memblock_find_region(bottom, top, size, align);
164 if (found != MEMBLOCK_ERROR)
165 return found;
166 }
167 return MEMBLOCK_ERROR;
65} 168}
66 169
67static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2) 170/*
171 * Find a free area with specified alignment in a specific range.
172 */
173u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align)
68{ 174{
69 if (base2 == base1 + size1) 175 return memblock_find_base(size, align, start, end);
70 return 1; 176}
71 else if (base1 == base2 + size2)
72 return -1;
73 177
74 return 0; 178/*
179 * Free memblock.reserved.regions
180 */
181int __init_memblock memblock_free_reserved_regions(void)
182{
183 if (memblock.reserved.regions == memblock_reserved_init_regions)
184 return 0;
185
186 return memblock_free(__pa(memblock.reserved.regions),
187 sizeof(struct memblock_region) * memblock.reserved.max);
75} 188}
76 189
77static long memblock_regions_adjacent(struct memblock_region *rgn, 190/*
78 unsigned long r1, unsigned long r2) 191 * Reserve memblock.reserved.regions
192 */
193int __init_memblock memblock_reserve_reserved_regions(void)
79{ 194{
80 u64 base1 = rgn->region[r1].base; 195 if (memblock.reserved.regions == memblock_reserved_init_regions)
81 u64 size1 = rgn->region[r1].size; 196 return 0;
82 u64 base2 = rgn->region[r2].base;
83 u64 size2 = rgn->region[r2].size;
84 197
85 return memblock_addrs_adjacent(base1, size1, base2, size2); 198 return memblock_reserve(__pa(memblock.reserved.regions),
199 sizeof(struct memblock_region) * memblock.reserved.max);
86} 200}
87 201
88static void memblock_remove_region(struct memblock_region *rgn, unsigned long r) 202static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
89{ 203{
90 unsigned long i; 204 unsigned long i;
91 205
92 for (i = r; i < rgn->cnt - 1; i++) { 206 for (i = r; i < type->cnt - 1; i++) {
93 rgn->region[i].base = rgn->region[i + 1].base; 207 type->regions[i].base = type->regions[i + 1].base;
94 rgn->region[i].size = rgn->region[i + 1].size; 208 type->regions[i].size = type->regions[i + 1].size;
95 } 209 }
96 rgn->cnt--; 210 type->cnt--;
97} 211}
98 212
99/* Assumption: base addr of region 1 < base addr of region 2 */ 213/* Assumption: base addr of region 1 < base addr of region 2 */
100static void memblock_coalesce_regions(struct memblock_region *rgn, 214static void __init_memblock memblock_coalesce_regions(struct memblock_type *type,
101 unsigned long r1, unsigned long r2) 215 unsigned long r1, unsigned long r2)
102{ 216{
103 rgn->region[r1].size += rgn->region[r2].size; 217 type->regions[r1].size += type->regions[r2].size;
104 memblock_remove_region(rgn, r2); 218 memblock_remove_region(type, r2);
105} 219}
106 220
107void __init memblock_init(void) 221/* Defined below but needed now */
222static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size);
223
224static int __init_memblock memblock_double_array(struct memblock_type *type)
108{ 225{
109 /* Create a dummy zero size MEMBLOCK which will get coalesced away later. 226 struct memblock_region *new_array, *old_array;
110 * This simplifies the memblock_add() code below... 227 phys_addr_t old_size, new_size, addr;
228 int use_slab = slab_is_available();
229
230 /* We don't allow resizing until we know about the reserved regions
231 * of memory that aren't suitable for allocation
111 */ 232 */
112 memblock.memory.region[0].base = 0; 233 if (!memblock_can_resize)
113 memblock.memory.region[0].size = 0; 234 return -1;
114 memblock.memory.cnt = 1;
115 235
116 /* Ditto. */ 236 /* Calculate new doubled size */
117 memblock.reserved.region[0].base = 0; 237 old_size = type->max * sizeof(struct memblock_region);
118 memblock.reserved.region[0].size = 0; 238 new_size = old_size << 1;
119 memblock.reserved.cnt = 1; 239
120} 240 /* Try to find some space for it.
241 *
242 * WARNING: We assume that either slab_is_available() and we use it or
243 * we use MEMBLOCK for allocations. That means that this is unsafe to use
244 * when bootmem is currently active (unless bootmem itself is implemented
245 * on top of MEMBLOCK which isn't the case yet)
246 *
247 * This should however not be an issue for now, as we currently only
248 * call into MEMBLOCK while it's still active, or much later when slab is
249 * active for memory hotplug operations
250 */
251 if (use_slab) {
252 new_array = kmalloc(new_size, GFP_KERNEL);
253 addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array);
254 } else
255 addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE);
256 if (addr == MEMBLOCK_ERROR) {
257 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
258 memblock_type_name(type), type->max, type->max * 2);
259 return -1;
260 }
261 new_array = __va(addr);
121 262
122void __init memblock_analyze(void) 263 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
123{ 264 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
124 int i;
125 265
126 memblock.memory.size = 0; 266 /* Found space, we now need to move the array over before
267 * we add the reserved region since it may be our reserved
268 * array itself that is full.
269 */
270 memcpy(new_array, type->regions, old_size);
271 memset(new_array + type->max, 0, old_size);
272 old_array = type->regions;
273 type->regions = new_array;
274 type->max <<= 1;
275
276 /* If we use SLAB that's it, we are done */
277 if (use_slab)
278 return 0;
127 279
128 for (i = 0; i < memblock.memory.cnt; i++) 280 /* Add the new reserved region now. Should not fail ! */
129 memblock.memory.size += memblock.memory.region[i].size; 281 BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0);
282
283 /* If the array wasn't our static init one, then free it. We only do
284 * that before SLAB is available as later on, we don't know whether
285 * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
286 * anyways
287 */
288 if (old_array != memblock_memory_init_regions &&
289 old_array != memblock_reserved_init_regions)
290 memblock_free(__pa(old_array), old_size);
291
292 return 0;
130} 293}
131 294
132static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size) 295extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
296 phys_addr_t addr2, phys_addr_t size2)
297{
298 return 1;
299}
300
301static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
133{ 302{
134 unsigned long coalesced = 0; 303 unsigned long coalesced = 0;
135 long adjacent, i; 304 long adjacent, i;
136 305
137 if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) { 306 if ((type->cnt == 1) && (type->regions[0].size == 0)) {
138 rgn->region[0].base = base; 307 type->regions[0].base = base;
139 rgn->region[0].size = size; 308 type->regions[0].size = size;
140 return 0; 309 return 0;
141 } 310 }
142 311
143 /* First try and coalesce this MEMBLOCK with another. */ 312 /* First try and coalesce this MEMBLOCK with another. */
144 for (i = 0; i < rgn->cnt; i++) { 313 for (i = 0; i < type->cnt; i++) {
145 u64 rgnbase = rgn->region[i].base; 314 phys_addr_t rgnbase = type->regions[i].base;
146 u64 rgnsize = rgn->region[i].size; 315 phys_addr_t rgnsize = type->regions[i].size;
147 316
148 if ((rgnbase == base) && (rgnsize == size)) 317 if ((rgnbase == base) && (rgnsize == size))
149 /* Already have this region, so we're done */ 318 /* Already have this region, so we're done */
150 return 0; 319 return 0;
151 320
152 adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); 321 adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize);
322 /* Check if arch allows coalescing */
323 if (adjacent != 0 && type == &memblock.memory &&
324 !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize))
325 break;
153 if (adjacent > 0) { 326 if (adjacent > 0) {
154 rgn->region[i].base -= size; 327 type->regions[i].base -= size;
155 rgn->region[i].size += size; 328 type->regions[i].size += size;
156 coalesced++; 329 coalesced++;
157 break; 330 break;
158 } else if (adjacent < 0) { 331 } else if (adjacent < 0) {
159 rgn->region[i].size += size; 332 type->regions[i].size += size;
160 coalesced++; 333 coalesced++;
161 break; 334 break;
162 } 335 }
163 } 336 }
164 337
165 if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) { 338 /* If we plugged a hole, we may want to also coalesce with the
166 memblock_coalesce_regions(rgn, i, i+1); 339 * next region
340 */
341 if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) &&
342 ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base,
343 type->regions[i].size,
344 type->regions[i+1].base,
345 type->regions[i+1].size)))) {
346 memblock_coalesce_regions(type, i, i+1);
167 coalesced++; 347 coalesced++;
168 } 348 }
169 349
170 if (coalesced) 350 if (coalesced)
171 return coalesced; 351 return coalesced;
172 if (rgn->cnt >= MAX_MEMBLOCK_REGIONS) 352
353 /* If we are out of space, we fail. It's too late to resize the array
354 * but then this shouldn't have happened in the first place.
355 */
356 if (WARN_ON(type->cnt >= type->max))
173 return -1; 357 return -1;
174 358
175 /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ 359 /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
176 for (i = rgn->cnt - 1; i >= 0; i--) { 360 for (i = type->cnt - 1; i >= 0; i--) {
177 if (base < rgn->region[i].base) { 361 if (base < type->regions[i].base) {
178 rgn->region[i+1].base = rgn->region[i].base; 362 type->regions[i+1].base = type->regions[i].base;
179 rgn->region[i+1].size = rgn->region[i].size; 363 type->regions[i+1].size = type->regions[i].size;
180 } else { 364 } else {
181 rgn->region[i+1].base = base; 365 type->regions[i+1].base = base;
182 rgn->region[i+1].size = size; 366 type->regions[i+1].size = size;
183 break; 367 break;
184 } 368 }
185 } 369 }
186 370
187 if (base < rgn->region[0].base) { 371 if (base < type->regions[0].base) {
188 rgn->region[0].base = base; 372 type->regions[0].base = base;
189 rgn->region[0].size = size; 373 type->regions[0].size = size;
374 }
375 type->cnt++;
376
377 /* The array is full ? Try to resize it. If that fails, we undo
378 * our allocation and return an error
379 */
380 if (type->cnt == type->max && memblock_double_array(type)) {
381 type->cnt--;
382 return -1;
190 } 383 }
191 rgn->cnt++;
192 384
193 return 0; 385 return 0;
194} 386}
195 387
196long memblock_add(u64 base, u64 size) 388long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
197{ 389{
198 struct memblock_region *_rgn = &memblock.memory; 390 return memblock_add_region(&memblock.memory, base, size);
199
200 /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */
201 if (base == 0)
202 memblock.rmo_size = size;
203
204 return memblock_add_region(_rgn, base, size);
205 391
206} 392}
207 393
208static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size) 394static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
209{ 395{
210 u64 rgnbegin, rgnend; 396 phys_addr_t rgnbegin, rgnend;
211 u64 end = base + size; 397 phys_addr_t end = base + size;
212 int i; 398 int i;
213 399
214 rgnbegin = rgnend = 0; /* supress gcc warnings */ 400 rgnbegin = rgnend = 0; /* supress gcc warnings */
215 401
216 /* Find the region where (base, size) belongs to */ 402 /* Find the region where (base, size) belongs to */
217 for (i=0; i < rgn->cnt; i++) { 403 for (i=0; i < type->cnt; i++) {
218 rgnbegin = rgn->region[i].base; 404 rgnbegin = type->regions[i].base;
219 rgnend = rgnbegin + rgn->region[i].size; 405 rgnend = rgnbegin + type->regions[i].size;
220 406
221 if ((rgnbegin <= base) && (end <= rgnend)) 407 if ((rgnbegin <= base) && (end <= rgnend))
222 break; 408 break;
223 } 409 }
224 410
225 /* Didn't find the region */ 411 /* Didn't find the region */
226 if (i == rgn->cnt) 412 if (i == type->cnt)
227 return -1; 413 return -1;
228 414
229 /* Check to see if we are removing entire region */ 415 /* Check to see if we are removing entire region */
230 if ((rgnbegin == base) && (rgnend == end)) { 416 if ((rgnbegin == base) && (rgnend == end)) {
231 memblock_remove_region(rgn, i); 417 memblock_remove_region(type, i);
232 return 0; 418 return 0;
233 } 419 }
234 420
235 /* Check to see if region is matching at the front */ 421 /* Check to see if region is matching at the front */
236 if (rgnbegin == base) { 422 if (rgnbegin == base) {
237 rgn->region[i].base = end; 423 type->regions[i].base = end;
238 rgn->region[i].size -= size; 424 type->regions[i].size -= size;
239 return 0; 425 return 0;
240 } 426 }
241 427
242 /* Check to see if the region is matching at the end */ 428 /* Check to see if the region is matching at the end */
243 if (rgnend == end) { 429 if (rgnend == end) {
244 rgn->region[i].size -= size; 430 type->regions[i].size -= size;
245 return 0; 431 return 0;
246 } 432 }
247 433
@@ -249,208 +435,189 @@ static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size)
249 * We need to split the entry - adjust the current one to the 435 * We need to split the entry - adjust the current one to the
250 * beginging of the hole and add the region after hole. 436 * beginging of the hole and add the region after hole.
251 */ 437 */
252 rgn->region[i].size = base - rgn->region[i].base; 438 type->regions[i].size = base - type->regions[i].base;
253 return memblock_add_region(rgn, end, rgnend - end); 439 return memblock_add_region(type, end, rgnend - end);
254} 440}
255 441
256long memblock_remove(u64 base, u64 size) 442long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
257{ 443{
258 return __memblock_remove(&memblock.memory, base, size); 444 return __memblock_remove(&memblock.memory, base, size);
259} 445}
260 446
261long __init memblock_free(u64 base, u64 size) 447long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
262{ 448{
263 return __memblock_remove(&memblock.reserved, base, size); 449 return __memblock_remove(&memblock.reserved, base, size);
264} 450}
265 451
266long __init memblock_reserve(u64 base, u64 size) 452long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
267{ 453{
268 struct memblock_region *_rgn = &memblock.reserved; 454 struct memblock_type *_rgn = &memblock.reserved;
269 455
270 BUG_ON(0 == size); 456 BUG_ON(0 == size);
271 457
272 return memblock_add_region(_rgn, base, size); 458 return memblock_add_region(_rgn, base, size);
273} 459}
274 460
275long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size) 461phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
276{ 462{
277 unsigned long i; 463 phys_addr_t found;
278 464
279 for (i = 0; i < rgn->cnt; i++) { 465 /* We align the size to limit fragmentation. Without this, a lot of
280 u64 rgnbase = rgn->region[i].base; 466 * small allocs quickly eat up the whole reserve array on sparc
281 u64 rgnsize = rgn->region[i].size; 467 */
282 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) 468 size = memblock_align_up(size, align);
283 break;
284 }
285 469
286 return (i < rgn->cnt) ? i : -1; 470 found = memblock_find_base(size, align, 0, max_addr);
471 if (found != MEMBLOCK_ERROR &&
472 memblock_add_region(&memblock.reserved, found, size) >= 0)
473 return found;
474
475 return 0;
287} 476}
288 477
289static u64 memblock_align_down(u64 addr, u64 size) 478phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
290{ 479{
291 return addr & ~(size - 1); 480 phys_addr_t alloc;
481
482 alloc = __memblock_alloc_base(size, align, max_addr);
483
484 if (alloc == 0)
485 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
486 (unsigned long long) size, (unsigned long long) max_addr);
487
488 return alloc;
292} 489}
293 490
294static u64 memblock_align_up(u64 addr, u64 size) 491phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
295{ 492{
296 return (addr + (size - 1)) & ~(size - 1); 493 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
297} 494}
298 495
299static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end, 496
300 u64 size, u64 align) 497/*
498 * Additional node-local allocators. Search for node memory is bottom up
499 * and walks memblock regions within that node bottom-up as well, but allocation
500 * within a memblock region is top-down. XXX I plan to fix that at some stage
501 *
502 * WARNING: Only available after early_node_map[] has been populated,
503 * on some architectures, that is after all the calls to add_active_range()
504 * have been done to populate it.
505 */
506
507phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid)
301{ 508{
302 u64 base, res_base; 509#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
303 long j; 510 /*
511 * This code originates from sparc which really wants us to walk by addresses
512 * and returns the nid. This is not very convenient for early_pfn_map[] users
513 * as the map isn't sorted yet, and it really wants to be walked by nid.
514 *
515 * For now, I implement the inefficient method below which walks the early
516 * map multiple times. Eventually we may want to use an ARCH config option
517 * to implement a completely different method for both cases.
518 */
519 unsigned long start_pfn, end_pfn;
520 int i;
304 521
305 base = memblock_align_down((end - size), align); 522 for (i = 0; i < MAX_NUMNODES; i++) {
306 while (start <= base) { 523 get_pfn_range_for_nid(i, &start_pfn, &end_pfn);
307 j = memblock_overlaps_region(&memblock.reserved, base, size); 524 if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn))
308 if (j < 0) { 525 continue;
309 /* this area isn't reserved, take it */ 526 *nid = i;
310 if (memblock_add_region(&memblock.reserved, base, size) < 0) 527 return min(end, PFN_PHYS(end_pfn));
311 base = ~(u64)0;
312 return base;
313 }
314 res_base = memblock.reserved.region[j].base;
315 if (res_base < size)
316 break;
317 base = memblock_align_down(res_base - size, align);
318 } 528 }
529#endif
530 *nid = 0;
319 531
320 return ~(u64)0; 532 return end;
321} 533}
322 534
323static u64 __init memblock_alloc_nid_region(struct memblock_property *mp, 535static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
324 u64 (*nid_range)(u64, u64, int *), 536 phys_addr_t size,
325 u64 size, u64 align, int nid) 537 phys_addr_t align, int nid)
326{ 538{
327 u64 start, end; 539 phys_addr_t start, end;
328 540
329 start = mp->base; 541 start = mp->base;
330 end = start + mp->size; 542 end = start + mp->size;
331 543
332 start = memblock_align_up(start, align); 544 start = memblock_align_up(start, align);
333 while (start < end) { 545 while (start < end) {
334 u64 this_end; 546 phys_addr_t this_end;
335 int this_nid; 547 int this_nid;
336 548
337 this_end = nid_range(start, end, &this_nid); 549 this_end = memblock_nid_range(start, end, &this_nid);
338 if (this_nid == nid) { 550 if (this_nid == nid) {
339 u64 ret = memblock_alloc_nid_unreserved(start, this_end, 551 phys_addr_t ret = memblock_find_region(start, this_end, size, align);
340 size, align); 552 if (ret != MEMBLOCK_ERROR &&
341 if (ret != ~(u64)0) 553 memblock_add_region(&memblock.reserved, ret, size) >= 0)
342 return ret; 554 return ret;
343 } 555 }
344 start = this_end; 556 start = this_end;
345 } 557 }
346 558
347 return ~(u64)0; 559 return MEMBLOCK_ERROR;
348} 560}
349 561
350u64 __init memblock_alloc_nid(u64 size, u64 align, int nid, 562phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
351 u64 (*nid_range)(u64 start, u64 end, int *nid))
352{ 563{
353 struct memblock_region *mem = &memblock.memory; 564 struct memblock_type *mem = &memblock.memory;
354 int i; 565 int i;
355 566
356 BUG_ON(0 == size); 567 BUG_ON(0 == size);
357 568
569 /* We align the size to limit fragmentation. Without this, a lot of
570 * small allocs quickly eat up the whole reserve array on sparc
571 */
358 size = memblock_align_up(size, align); 572 size = memblock_align_up(size, align);
359 573
574 /* We do a bottom-up search for a region with the right
575 * nid since that's easier considering how memblock_nid_range()
576 * works
577 */
360 for (i = 0; i < mem->cnt; i++) { 578 for (i = 0; i < mem->cnt; i++) {
361 u64 ret = memblock_alloc_nid_region(&mem->region[i], 579 phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
362 nid_range,
363 size, align, nid); 580 size, align, nid);
364 if (ret != ~(u64)0) 581 if (ret != MEMBLOCK_ERROR)
365 return ret; 582 return ret;
366 } 583 }
367 584
368 return memblock_alloc(size, align); 585 return 0;
369}
370
371u64 __init memblock_alloc(u64 size, u64 align)
372{
373 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
374} 586}
375 587
376u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr) 588phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
377{ 589{
378 u64 alloc; 590 phys_addr_t res = memblock_alloc_nid(size, align, nid);
379
380 alloc = __memblock_alloc_base(size, align, max_addr);
381 591
382 if (alloc == 0) 592 if (res)
383 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", 593 return res;
384 (unsigned long long) size, (unsigned long long) max_addr); 594 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
385
386 return alloc;
387} 595}
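
The node-aware entry points above differ on failure: memblock_alloc_nid() returns 0 when the requested node has nothing suitable, while memblock_alloc_try_nid() falls back to MEMBLOCK_ALLOC_ANYWHERE. A hedged sketch of a per-node early allocation follows; the scratch array and helper are hypothetical, and it assumes early_node_map[] has already been populated, per the warning earlier in this hunk.

/* Sketch only: one page of per-node scratch space, preferring node-local
 * memory but falling back to any node via memblock_alloc_try_nid(). */
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/nodemask.h>

static phys_addr_t node_scratch[MAX_NUMNODES] __initdata;

static void __init alloc_node_scratch(void)
{
	int nid;

	for_each_online_node(nid)
		node_scratch[nid] = memblock_alloc_try_nid(PAGE_SIZE,
							   PAGE_SIZE, nid);
}
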
388 596
389u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr)
390{
391 long i, j;
392 u64 base = 0;
393 u64 res_base;
394
395 BUG_ON(0 == size);
396 597
397 size = memblock_align_up(size, align); 598/*
398 599 * Remaining API functions
399 /* On some platforms, make sure we allocate lowmem */ 600 */
400 /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */
401 if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
402 max_addr = MEMBLOCK_REAL_LIMIT;
403
404 for (i = memblock.memory.cnt - 1; i >= 0; i--) {
405 u64 memblockbase = memblock.memory.region[i].base;
406 u64 memblocksize = memblock.memory.region[i].size;
407
408 if (memblocksize < size)
409 continue;
410 if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
411 base = memblock_align_down(memblockbase + memblocksize - size, align);
412 else if (memblockbase < max_addr) {
413 base = min(memblockbase + memblocksize, max_addr);
414 base = memblock_align_down(base - size, align);
415 } else
416 continue;
417
418 while (base && memblockbase <= base) {
419 j = memblock_overlaps_region(&memblock.reserved, base, size);
420 if (j < 0) {
421 /* this area isn't reserved, take it */
422 if (memblock_add_region(&memblock.reserved, base, size) < 0)
423 return 0;
424 return base;
425 }
426 res_base = memblock.reserved.region[j].base;
427 if (res_base < size)
428 break;
429 base = memblock_align_down(res_base - size, align);
430 }
431 }
432 return 0;
433}
434 601
435/* You must call memblock_analyze() before this. */ 602/* You must call memblock_analyze() before this. */
436u64 __init memblock_phys_mem_size(void) 603phys_addr_t __init memblock_phys_mem_size(void)
437{ 604{
438 return memblock.memory.size; 605 return memblock.memory_size;
439} 606}
440 607
441u64 memblock_end_of_DRAM(void) 608phys_addr_t __init_memblock memblock_end_of_DRAM(void)
442{ 609{
443 int idx = memblock.memory.cnt - 1; 610 int idx = memblock.memory.cnt - 1;
444 611
445 return (memblock.memory.region[idx].base + memblock.memory.region[idx].size); 612 return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
446} 613}
447 614
448/* You must call memblock_analyze() after this. */ 615/* You must call memblock_analyze() after this. */
449void __init memblock_enforce_memory_limit(u64 memory_limit) 616void __init memblock_enforce_memory_limit(phys_addr_t memory_limit)
450{ 617{
451 unsigned long i; 618 unsigned long i;
452 u64 limit; 619 phys_addr_t limit;
453 struct memblock_property *p; 620 struct memblock_region *p;
454 621
455 if (!memory_limit) 622 if (!memory_limit)
456 return; 623 return;
@@ -458,24 +625,21 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
458 /* Truncate the memblock regions to satisfy the memory limit. */ 625 /* Truncate the memblock regions to satisfy the memory limit. */
459 limit = memory_limit; 626 limit = memory_limit;
460 for (i = 0; i < memblock.memory.cnt; i++) { 627 for (i = 0; i < memblock.memory.cnt; i++) {
461 if (limit > memblock.memory.region[i].size) { 628 if (limit > memblock.memory.regions[i].size) {
462 limit -= memblock.memory.region[i].size; 629 limit -= memblock.memory.regions[i].size;
463 continue; 630 continue;
464 } 631 }
465 632
466 memblock.memory.region[i].size = limit; 633 memblock.memory.regions[i].size = limit;
467 memblock.memory.cnt = i + 1; 634 memblock.memory.cnt = i + 1;
468 break; 635 break;
469 } 636 }
470 637
471 if (memblock.memory.region[0].size < memblock.rmo_size)
472 memblock.rmo_size = memblock.memory.region[0].size;
473
474 memory_limit = memblock_end_of_DRAM(); 638 memory_limit = memblock_end_of_DRAM();
475 639
476 /* And truncate any reserves above the limit also. */ 640 /* And truncate any reserves above the limit also. */
477 for (i = 0; i < memblock.reserved.cnt; i++) { 641 for (i = 0; i < memblock.reserved.cnt; i++) {
478 p = &memblock.reserved.region[i]; 642 p = &memblock.reserved.regions[i];
479 643
480 if (p->base > memory_limit) 644 if (p->base > memory_limit)
481 p->size = 0; 645 p->size = 0;
@@ -489,53 +653,190 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
489 } 653 }
490} 654}
491 655
492int __init memblock_is_reserved(u64 addr) 656static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
657{
658 unsigned int left = 0, right = type->cnt;
659
660 do {
661 unsigned int mid = (right + left) / 2;
662
663 if (addr < type->regions[mid].base)
664 right = mid;
665 else if (addr >= (type->regions[mid].base +
666 type->regions[mid].size))
667 left = mid + 1;
668 else
669 return mid;
670 } while (left < right);
671 return -1;
672}
673
674int __init memblock_is_reserved(phys_addr_t addr)
675{
676 return memblock_search(&memblock.reserved, addr) != -1;
677}
678
679int __init_memblock memblock_is_memory(phys_addr_t addr)
680{
681 return memblock_search(&memblock.memory, addr) != -1;
682}
683
684int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
685{
686 int idx = memblock_search(&memblock.memory, base);
687
688 if (idx == -1)
689 return 0;
690 return memblock.memory.regions[idx].base <= base &&
691 (memblock.memory.regions[idx].base +
692 memblock.memory.regions[idx].size) >= (base + size);
693}
694
695int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
696{
697 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
698}
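
Together with memblock_reserve(), the two region predicates support a simple claim-if-free pattern during early setup. A hedged sketch follows; the helper name and error codes are illustrative, only the memblock calls come from this patch.

/* Sketch only: reserve a firmware-described range when it is covered by
 * RAM and does not overlap an existing reservation. */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/memblock.h>

static int __init claim_fw_region(phys_addr_t base, phys_addr_t size)
{
	if (!memblock_is_region_memory(base, size))
		return -EINVAL;		/* not backed by known memory */
	if (memblock_is_region_reserved(base, size))
		return -EBUSY;		/* already claimed by someone else */

	return memblock_reserve(base, size) < 0 ? -ENOMEM : 0;
}
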
699
700
701void __init_memblock memblock_set_current_limit(phys_addr_t limit)
493{ 702{
703 memblock.current_limit = limit;
704}
705
706static void __init_memblock memblock_dump(struct memblock_type *region, char *name)
707{
708 unsigned long long base, size;
494 int i; 709 int i;
495 710
496 for (i = 0; i < memblock.reserved.cnt; i++) { 711 pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
497 u64 upper = memblock.reserved.region[i].base + 712
498 memblock.reserved.region[i].size - 1; 713 for (i = 0; i < region->cnt; i++) {
499 if ((addr >= memblock.reserved.region[i].base) && (addr <= upper)) 714 base = region->regions[i].base;
500 return 1; 715 size = region->regions[i].size;
716
717 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n",
718 name, i, base, base + size - 1, size);
501 } 719 }
502 return 0;
503} 720}
504 721
505int memblock_is_region_reserved(u64 base, u64 size) 722void __init_memblock memblock_dump_all(void)
506{ 723{
507 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; 724 if (!memblock_debug)
725 return;
726
727 pr_info("MEMBLOCK configuration:\n");
728 pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size);
729
730 memblock_dump(&memblock.memory, "memory");
731 memblock_dump(&memblock.reserved, "reserved");
508} 732}
509 733
510/* 734void __init memblock_analyze(void)
511 * Given a <base, len>, find which memory regions belong to this range.
512 * Adjust the request and return a contiguous chunk.
513 */
514int memblock_find(struct memblock_property *res)
515{ 735{
516 int i; 736 int i;
517 u64 rstart, rend;
518 737
519 rstart = res->base; 738 /* Check marker in the unused last array entry */
520 rend = rstart + res->size - 1; 739 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
740 != (phys_addr_t)RED_INACTIVE);
741 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
742 != (phys_addr_t)RED_INACTIVE);
743
744 memblock.memory_size = 0;
745
746 for (i = 0; i < memblock.memory.cnt; i++)
747 memblock.memory_size += memblock.memory.regions[i].size;
748
749 /* We allow resizing from there */
750 memblock_can_resize = 1;
751}
752
753void __init memblock_init(void)
754{
755 static int init_done __initdata = 0;
756
757 if (init_done)
758 return;
759 init_done = 1;
760
761 /* Hookup the initial arrays */
762 memblock.memory.regions = memblock_memory_init_regions;
763 memblock.memory.max = INIT_MEMBLOCK_REGIONS;
764 memblock.reserved.regions = memblock_reserved_init_regions;
765 memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
766
767 /* Write a marker in the unused last array entry */
768 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
769 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
770
771 /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
772 * This simplifies the memblock_add() code below...
773 */
774 memblock.memory.regions[0].base = 0;
775 memblock.memory.regions[0].size = 0;
776 memblock.memory.cnt = 1;
777
778 /* Ditto. */
779 memblock.reserved.regions[0].base = 0;
780 memblock.reserved.regions[0].size = 0;
781 memblock.reserved.cnt = 1;
782
783 memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
784}
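
For orientation, the expected call order on an architecture using this allocator is memblock_init(), a series of memblock_add()/memblock_reserve() calls describing RAM and early reservations, then memblock_analyze() before the first allocation. A hedged sketch follows; the function name and all base/size values are invented, memblock_add() is defined earlier in this file.

/* Sketch only: hypothetical arch setup sequence with made-up addresses. */
void __init hypothetical_arch_bootmem_init(void)
{
	memblock_init();

	memblock_add(0, 0x40000000);			/* 1 GiB of RAM */
	memblock_reserve(0, 0x00100000);		/* kernel + low firmware */

	memblock_analyze();				/* sums memory_size, allows resizing */
	memblock_set_current_limit(0x40000000);		/* default allocation ceiling */
}
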
785
786static int __init early_memblock(char *p)
787{
788 if (p && strstr(p, "debug"))
789 memblock_debug = 1;
790 return 0;
791}
792early_param("memblock", early_memblock);
793
794#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK)
795
796static int memblock_debug_show(struct seq_file *m, void *private)
797{
798 struct memblock_type *type = m->private;
799 struct memblock_region *reg;
800 int i;
801
802 for (i = 0; i < type->cnt; i++) {
803 reg = &type->regions[i];
804 seq_printf(m, "%4d: ", i);
805 if (sizeof(phys_addr_t) == 4)
806 seq_printf(m, "0x%08lx..0x%08lx\n",
807 (unsigned long)reg->base,
808 (unsigned long)(reg->base + reg->size - 1));
809 else
810 seq_printf(m, "0x%016llx..0x%016llx\n",
811 (unsigned long long)reg->base,
812 (unsigned long long)(reg->base + reg->size - 1));
521 813
522 for (i = 0; i < memblock.memory.cnt; i++) {
523 u64 start = memblock.memory.region[i].base;
524 u64 end = start + memblock.memory.region[i].size - 1;
525
526 if (start > rend)
527 return -1;
528
529 if ((end >= rstart) && (start < rend)) {
530 /* adjust the request */
531 if (rstart < start)
532 rstart = start;
533 if (rend > end)
534 rend = end;
535 res->base = rstart;
536 res->size = rend - rstart + 1;
537 return 0;
538 }
539 } 814 }
540 return -1; 815 return 0;
816}
817
818static int memblock_debug_open(struct inode *inode, struct file *file)
819{
820 return single_open(file, memblock_debug_show, inode->i_private);
541} 821}
822
823static const struct file_operations memblock_debug_fops = {
824 .open = memblock_debug_open,
825 .read = seq_read,
826 .llseek = seq_lseek,
827 .release = single_release,
828};
829
830static int __init memblock_init_debugfs(void)
831{
832 struct dentry *root = debugfs_create_dir("memblock", NULL);
833 if (!root)
834 return -ENXIO;
835 debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
836 debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
837
838 return 0;
839}
840__initcall(memblock_init_debugfs);
841
842#endif /* CONFIG_DEBUG_FS */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..9a99cfaf0a19 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,10 @@ enum mem_cgroup_stat_index {
89 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 89 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
92 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ 92 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
93 /* incremented at every pagein/pageout */
94 MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
95 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
93 96
94 MEM_CGROUP_STAT_NSTATS, 97 MEM_CGROUP_STAT_NSTATS,
95}; 98};
@@ -254,6 +257,12 @@ struct mem_cgroup {
254 * percpu counter. 257 * percpu counter.
255 */ 258 */
256 struct mem_cgroup_stat_cpu *stat; 259 struct mem_cgroup_stat_cpu *stat;
260 /*
261 * used when a cpu is offlined or other synchronizations
262 * See mem_cgroup_read_stat().
263 */
264 struct mem_cgroup_stat_cpu nocpu_base;
265 spinlock_t pcp_counter_lock;
257}; 266};
258 267
259/* Stuffs for move charges at task migration. */ 268/* Stuffs for move charges at task migration. */
@@ -530,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
530 return mz; 539 return mz;
531} 540}
532 541
542/*
543 * Implementation Note: reading percpu statistics for memcg.
544 *
545 * Both vmstat[] and percpu_counter use thresholds and periodic
546 * synchronization to implement a "quick" read. There is a trade-off between
547 * reading cost and precision of the value, so we may eventually implement
548 * a similar periodic synchronization for memcg's counters.
549 *
550 * But this _read() function is used for the user interface right now. Users
551 * account memory usage per memory cgroup and _always_ require an exact value
552 * because they are accounting memory. Even with a quick-and-fuzzy read we
553 * would still have to visit all online cpus and sum the values. So, for now,
554 * the extra synchronization is not implemented (it exists only for cpu hotplug).
555 *
556 * If kernel-internal users appear that can live with a not-exact value, and
557 * reading all cpu values turns out to be a performance bottleneck in some
558 * common workload, thresholds and synchronization as in vmstat[] should be
559 * implemented.
560 */
533static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, 561static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
534 enum mem_cgroup_stat_index idx) 562 enum mem_cgroup_stat_index idx)
535{ 563{
536 int cpu; 564 int cpu;
537 s64 val = 0; 565 s64 val = 0;
538 566
539 for_each_possible_cpu(cpu) 567 get_online_cpus();
568 for_each_online_cpu(cpu)
540 val += per_cpu(mem->stat->count[idx], cpu); 569 val += per_cpu(mem->stat->count[idx], cpu);
570#ifdef CONFIG_HOTPLUG_CPU
571 spin_lock(&mem->pcp_counter_lock);
572 val += mem->nocpu_base.count[idx];
573 spin_unlock(&mem->pcp_counter_lock);
574#endif
575 put_online_cpus();
541 return val; 576 return val;
542} 577}
543 578
@@ -659,40 +694,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
659 return mem; 694 return mem;
660} 695}
661 696
662/* 697/* The caller has to guarantee "mem" exists before calling this */
663 * Call callback function against all cgroup under hierarchy tree. 698static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
664 */
665static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
666 int (*func)(struct mem_cgroup *, void *))
667{ 699{
668 int found, ret, nextid;
669 struct cgroup_subsys_state *css; 700 struct cgroup_subsys_state *css;
670 struct mem_cgroup *mem; 701 int found;
671
672 if (!root->use_hierarchy)
673 return (*func)(root, data);
674 702
675 nextid = 1; 703 if (!mem) /* ROOT cgroup has the smallest ID */
676 do { 704 return root_mem_cgroup; /*css_put/get against root is ignored*/
677 ret = 0; 705 if (!mem->use_hierarchy) {
706 if (css_tryget(&mem->css))
707 return mem;
708 return NULL;
709 }
710 rcu_read_lock();
711 /*
712 * searching a memory cgroup which has the smallest ID under given
713 * ROOT cgroup. (ID >= 1)
714 */
715 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
716 if (css && css_tryget(css))
717 mem = container_of(css, struct mem_cgroup, css);
718 else
678 mem = NULL; 719 mem = NULL;
720 rcu_read_unlock();
721 return mem;
722}
723
724static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
725 struct mem_cgroup *root,
726 bool cond)
727{
728 int nextid = css_id(&iter->css) + 1;
729 int found;
730 int hierarchy_used;
731 struct cgroup_subsys_state *css;
732
733 hierarchy_used = iter->use_hierarchy;
679 734
735 css_put(&iter->css);
736 /* If no ROOT, walk all, ignore hierarchy */
737 if (!cond || (root && !hierarchy_used))
738 return NULL;
739
740 if (!root)
741 root = root_mem_cgroup;
742
743 do {
744 iter = NULL;
680 rcu_read_lock(); 745 rcu_read_lock();
681 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, 746
682 &found); 747 css = css_get_next(&mem_cgroup_subsys, nextid,
748 &root->css, &found);
683 if (css && css_tryget(css)) 749 if (css && css_tryget(css))
684 mem = container_of(css, struct mem_cgroup, css); 750 iter = container_of(css, struct mem_cgroup, css);
685 rcu_read_unlock(); 751 rcu_read_unlock();
686 752 /* If css is NULL, no more cgroups will be found */
687 if (mem) {
688 ret = (*func)(mem, data);
689 css_put(&mem->css);
690 }
691 nextid = found + 1; 753 nextid = found + 1;
692 } while (!ret && css); 754 } while (css && !iter);
693 755
694 return ret; 756 return iter;
695} 757}
758/*
759 * for_each_mem_cgroup_tree() visits every cgroup under a tree. Be careful:
760 * breaking out of the loop is not allowed because we hold a reference count.
761 * Instead, set "cond" to false and "continue" to exit the loop.
762 */
763#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
764 for (iter = mem_cgroup_start_loop(root);\
765 iter != NULL;\
766 iter = mem_cgroup_get_next(iter, root, cond))
767
768#define for_each_mem_cgroup_tree(iter, root) \
769 for_each_mem_cgroup_tree_cond(iter, root, true)
770
771#define for_each_mem_cgroup_all(iter) \
772 for_each_mem_cgroup_tree_cond(iter, NULL, true)
773
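
The new iterators replace the callback-based mem_cgroup_walk_tree() removed below, so converted call sites become plain loops. A hedged sketch of the pattern follows; the helper name is illustrative, and the shape matches mem_cgroup_get_recursive_idx_stat() later in this diff.

/* Sketch only: sum one statistic over an entire hierarchy with the new
 * iterator; reference counting is handled inside the macro. */
static s64 sum_stat_in_tree(struct mem_cgroup *root,
			    enum mem_cgroup_stat_index idx)
{
	struct mem_cgroup *iter;
	s64 total = 0;

	for_each_mem_cgroup_tree(iter, root)
		total += mem_cgroup_read_stat(iter, idx);

	return total;
}
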
696 774
697static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 775static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
698{ 776{
@@ -1051,7 +1129,52 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
1051 return swappiness; 1129 return swappiness;
1052} 1130}
1053 1131
1054/* A routine for testing mem is not under move_account */ 1132static void mem_cgroup_start_move(struct mem_cgroup *mem)
1133{
1134 int cpu;
1135
1136 get_online_cpus();
1137 spin_lock(&mem->pcp_counter_lock);
1138 for_each_online_cpu(cpu)
1139 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1140 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1141 spin_unlock(&mem->pcp_counter_lock);
1142 put_online_cpus();
1143
1144 synchronize_rcu();
1145}
1146
1147static void mem_cgroup_end_move(struct mem_cgroup *mem)
1148{
1149 int cpu;
1150
1151 if (!mem)
1152 return;
1153 get_online_cpus();
1154 spin_lock(&mem->pcp_counter_lock);
1155 for_each_online_cpu(cpu)
1156 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1157 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1158 spin_unlock(&mem->pcp_counter_lock);
1159 put_online_cpus();
1160}
1161/*
1162 * Two routines for checking whether "mem" is under move_account() or not.
1163 *
1164 * mem_cgroup_stealed() - checks whether a cgroup is mc.from. This is used
1165 * to avoid races in accounting. If true,
1166 * pc->mem_cgroup may be overwritten.
1167 *
1168 * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to or
1169 * under the hierarchy of moving cgroups. This is
1170 * used when waiting at high memory pressure caused by "move".
1171 */
1172
1173static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1174{
1175 VM_BUG_ON(!rcu_read_lock_held());
1176 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1177}
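
In practice the start/end pair brackets any account-moving section, which is what lets mem_cgroup_stealed() act as a cheap race detector; the later hunks in mem_cgroup_force_empty and the can_attach/clear_mc paths show exactly this. A hedged sketch of the bracketing follows; the wrapper function is hypothetical, only the two calls come from this patch.

/* Sketch only: wrap account-moving work so readers of the per-cpu
 * MEM_CGROUP_ON_MOVE counter can detect that a move is in flight. */
static void hypothetical_move_section(struct mem_cgroup *from)
{
	mem_cgroup_start_move(from);

	/* ... move charges/pages away from "from" ... */

	mem_cgroup_end_move(from);
}
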
1055 1178
1056static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1179static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1057{ 1180{
@@ -1092,13 +1215,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1092 return false; 1215 return false;
1093} 1216}
1094 1217
1095static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1096{
1097 int *val = data;
1098 (*val)++;
1099 return 0;
1100}
1101
1102/** 1218/**
1103 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1219 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1104 * @memcg: The memory cgroup that went over limit 1220 * @memcg: The memory cgroup that went over limit
@@ -1173,7 +1289,10 @@ done:
1173static int mem_cgroup_count_children(struct mem_cgroup *mem) 1289static int mem_cgroup_count_children(struct mem_cgroup *mem)
1174{ 1290{
1175 int num = 0; 1291 int num = 0;
1176 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); 1292 struct mem_cgroup *iter;
1293
1294 for_each_mem_cgroup_tree(iter, mem)
1295 num++;
1177 return num; 1296 return num;
1178} 1297}
1179 1298
@@ -1322,49 +1441,39 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1322 return total; 1441 return total;
1323} 1442}
1324 1443
1325static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1326{
1327 int *val = (int *)data;
1328 int x;
1329 /*
1330 * Logically, we can stop scanning immediately when we find
1331 * a memcg is already locked. But condidering unlock ops and
1332 * creation/removal of memcg, scan-all is simple operation.
1333 */
1334 x = atomic_inc_return(&mem->oom_lock);
1335 *val = max(x, *val);
1336 return 0;
1337}
1338/* 1444/*
1339 * Check OOM-Killer is already running under our hierarchy. 1445 * Check OOM-Killer is already running under our hierarchy.
1340 * If someone is running, return false. 1446 * If someone is running, return false.
1341 */ 1447 */
1342static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1448static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1343{ 1449{
1344 int lock_count = 0; 1450 int x, lock_count = 0;
1451 struct mem_cgroup *iter;
1345 1452
1346 mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); 1453 for_each_mem_cgroup_tree(iter, mem) {
1454 x = atomic_inc_return(&iter->oom_lock);
1455 lock_count = max(x, lock_count);
1456 }
1347 1457
1348 if (lock_count == 1) 1458 if (lock_count == 1)
1349 return true; 1459 return true;
1350 return false; 1460 return false;
1351} 1461}
1352 1462
1353static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) 1463static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1354{ 1464{
1465 struct mem_cgroup *iter;
1466
1355 /* 1467 /*
1356 * When a new child is created while the hierarchy is under oom, 1468 * When a new child is created while the hierarchy is under oom,
1357 * mem_cgroup_oom_lock() may not be called. We have to use 1469 * mem_cgroup_oom_lock() may not be called. We have to use
1358 * atomic_add_unless() here. 1470 * atomic_add_unless() here.
1359 */ 1471 */
1360 atomic_add_unless(&mem->oom_lock, -1, 0); 1472 for_each_mem_cgroup_tree(iter, mem)
1473 atomic_add_unless(&iter->oom_lock, -1, 0);
1361 return 0; 1474 return 0;
1362} 1475}
1363 1476
1364static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1365{
1366 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1367}
1368 1477
1369static DEFINE_MUTEX(memcg_oom_mutex); 1478static DEFINE_MUTEX(memcg_oom_mutex);
1370static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1479static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1462,34 +1571,73 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1462/* 1571/*
1463 * Currently used to update mapped file statistics, but the routine can be 1572 * Currently used to update mapped file statistics, but the routine can be
1464 * generalized to update other statistics as well. 1573 * generalized to update other statistics as well.
1574 *
1575 * Notes: Race condition
1576 *
1577 * We usually use page_cgroup_lock() for accessing page_cgroup members but
1578 * it tends to be costly. Under some conditions, however, we don't need
1579 * to do so _always_.
1580 *
1581 * Considering "charge", lock_page_cgroup() is not required because all
1582 * file-stat operations happen after a page is attached to the radix-tree.
1583 * There is no race with "charge".
1584 *
1585 * Considering "uncharge", we know that memcg intentionally doesn't clear
1586 * pc->mem_cgroup at "uncharge". So we always see a valid pc->mem_cgroup even
1587 * if we race with "uncharge". The statistics themselves are properly handled
1588 * by flags.
1589 *
1590 * Considering "move", this is the only case where we see a race. To make the
1591 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value to detect
1592 * whether a race is possible. If it is, we take the lock.
1465 */ 1593 */
1466void mem_cgroup_update_file_mapped(struct page *page, int val) 1594
1595static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
1467{ 1596{
1468 struct mem_cgroup *mem; 1597 struct mem_cgroup *mem;
1469 struct page_cgroup *pc; 1598 struct page_cgroup *pc = lookup_page_cgroup(page);
1599 bool need_unlock = false;
1470 1600
1471 pc = lookup_page_cgroup(page);
1472 if (unlikely(!pc)) 1601 if (unlikely(!pc))
1473 return; 1602 return;
1474 1603
1475 lock_page_cgroup(pc); 1604 rcu_read_lock();
1476 mem = pc->mem_cgroup; 1605 mem = pc->mem_cgroup;
1477 if (!mem || !PageCgroupUsed(pc)) 1606 if (unlikely(!mem || !PageCgroupUsed(pc)))
1478 goto done; 1607 goto out;
1608 /* pc->mem_cgroup is unstable ? */
1609 if (unlikely(mem_cgroup_stealed(mem))) {
1610 /* take a lock against to access pc->mem_cgroup */
1611 lock_page_cgroup(pc);
1612 need_unlock = true;
1613 mem = pc->mem_cgroup;
1614 if (!mem || !PageCgroupUsed(pc))
1615 goto out;
1616 }
1479 1617
1480 /* 1618 this_cpu_add(mem->stat->count[idx], val);
1481 * Preemption is already disabled. We can use __this_cpu_xxx 1619
1482 */ 1620 switch (idx) {
1483 if (val > 0) { 1621 case MEM_CGROUP_STAT_FILE_MAPPED:
1484 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1622 if (val > 0)
1485 SetPageCgroupFileMapped(pc); 1623 SetPageCgroupFileMapped(pc);
1486 } else { 1624 else if (!page_mapped(page))
1487 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1625 ClearPageCgroupFileMapped(pc);
1488 ClearPageCgroupFileMapped(pc); 1626 break;
1627 default:
1628 BUG();
1489 } 1629 }
1490 1630
1491done: 1631out:
1492 unlock_page_cgroup(pc); 1632 if (unlikely(need_unlock))
1633 unlock_page_cgroup(pc);
1634 rcu_read_unlock();
1635 return;
1636}
1637
1638void mem_cgroup_update_file_mapped(struct page *page, int val)
1639{
1640 mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
1493} 1641}
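
Existing callers are unaffected by this refactoring; the rmap code keeps feeding +1/-1 deltas into mem_cgroup_update_file_mapped(). A hedged, abridged reminder of the caller side follows, written roughly as in mm/rmap.c of this kernel; treat the exact body as illustrative.

/* Sketch (abridged): the file-mapped statistic follows the 0<->1 mapcount
 * transition of a file page. */
void page_add_file_rmap(struct page *page)
{
	if (atomic_inc_and_test(&page->_mapcount)) {
		__inc_zone_page_state(page, NR_FILE_MAPPED);
		mem_cgroup_update_file_mapped(page, 1);
	}
}
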
1494 1642
1495/* 1643/*
@@ -1605,15 +1753,55 @@ static void drain_all_stock_sync(void)
1605 atomic_dec(&memcg_drain_count); 1753 atomic_dec(&memcg_drain_count);
1606} 1754}
1607 1755
1608static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, 1756/*
1757 * This function drains percpu counter value from DEAD cpu and
1758 * move it to local cpu. Note that this function can be preempted.
1759 */
1760static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
1761{
1762 int i;
1763
1764 spin_lock(&mem->pcp_counter_lock);
1765 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
1766 s64 x = per_cpu(mem->stat->count[i], cpu);
1767
1768 per_cpu(mem->stat->count[i], cpu) = 0;
1769 mem->nocpu_base.count[i] += x;
1770 }
1771 /* need to clear ON_MOVE value, works as a kind of lock. */
1772 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
1773 spin_unlock(&mem->pcp_counter_lock);
1774}
1775
1776static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
1777{
1778 int idx = MEM_CGROUP_ON_MOVE;
1779
1780 spin_lock(&mem->pcp_counter_lock);
1781 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
1782 spin_unlock(&mem->pcp_counter_lock);
1783}
1784
1785static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
1609 unsigned long action, 1786 unsigned long action,
1610 void *hcpu) 1787 void *hcpu)
1611{ 1788{
1612 int cpu = (unsigned long)hcpu; 1789 int cpu = (unsigned long)hcpu;
1613 struct memcg_stock_pcp *stock; 1790 struct memcg_stock_pcp *stock;
1791 struct mem_cgroup *iter;
1792
1793 if ((action == CPU_ONLINE)) {
1794 for_each_mem_cgroup_all(iter)
1795 synchronize_mem_cgroup_on_move(iter, cpu);
1796 return NOTIFY_OK;
1797 }
1614 1798
1615 if (action != CPU_DEAD) 1799 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1616 return NOTIFY_OK; 1800 return NOTIFY_OK;
1801
1802 for_each_mem_cgroup_all(iter)
1803 mem_cgroup_drain_pcp_counter(iter, cpu);
1804
1617 stock = &per_cpu(memcg_stock, cpu); 1805 stock = &per_cpu(memcg_stock, cpu);
1618 drain_stock(stock); 1806 drain_stock(stock);
1619 return NOTIFY_OK; 1807 return NOTIFY_OK;
@@ -3038,6 +3226,7 @@ move_account:
3038 lru_add_drain_all(); 3226 lru_add_drain_all();
3039 drain_all_stock_sync(); 3227 drain_all_stock_sync();
3040 ret = 0; 3228 ret = 0;
3229 mem_cgroup_start_move(mem);
3041 for_each_node_state(node, N_HIGH_MEMORY) { 3230 for_each_node_state(node, N_HIGH_MEMORY) {
3042 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3231 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3043 enum lru_list l; 3232 enum lru_list l;
@@ -3051,6 +3240,7 @@ move_account:
3051 if (ret) 3240 if (ret)
3052 break; 3241 break;
3053 } 3242 }
3243 mem_cgroup_end_move(mem);
3054 memcg_oom_recover(mem); 3244 memcg_oom_recover(mem);
3055 /* it seems parent cgroup doesn't have enough mem */ 3245 /* it seems parent cgroup doesn't have enough mem */
3056 if (ret == -ENOMEM) 3246 if (ret == -ENOMEM)
@@ -3137,33 +3327,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3137 return retval; 3327 return retval;
3138} 3328}
3139 3329
3140struct mem_cgroup_idx_data {
3141 s64 val;
3142 enum mem_cgroup_stat_index idx;
3143};
3144 3330
3145static int 3331static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
3146mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 3332 enum mem_cgroup_stat_index idx)
3147{ 3333{
3148 struct mem_cgroup_idx_data *d = data; 3334 struct mem_cgroup *iter;
3149 d->val += mem_cgroup_read_stat(mem, d->idx); 3335 s64 val = 0;
3150 return 0;
3151}
3152 3336
3153static void 3337 /* each per-cpu value can be negative, so use s64 */
3154mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3338 for_each_mem_cgroup_tree(iter, mem)
3155 enum mem_cgroup_stat_index idx, s64 *val) 3339 val += mem_cgroup_read_stat(iter, idx);
3156{ 3340
3157 struct mem_cgroup_idx_data d; 3341 if (val < 0) /* race ? */
3158 d.idx = idx; 3342 val = 0;
3159 d.val = 0; 3343 return val;
3160 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
3161 *val = d.val;
3162} 3344}
3163 3345
3164static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3346static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3165{ 3347{
3166 u64 idx_val, val; 3348 u64 val;
3167 3349
3168 if (!mem_cgroup_is_root(mem)) { 3350 if (!mem_cgroup_is_root(mem)) {
3169 if (!swap) 3351 if (!swap)
@@ -3172,16 +3354,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3172 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3354 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3173 } 3355 }
3174 3356
3175 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); 3357 val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
3176 val = idx_val; 3358 val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
3177 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
3178 val += idx_val;
3179 3359
3180 if (swap) { 3360 if (swap)
3181 mem_cgroup_get_recursive_idx_stat(mem, 3361 val += mem_cgroup_get_recursive_idx_stat(mem,
3182 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 3362 MEM_CGROUP_STAT_SWAPOUT);
3183 val += idx_val;
3184 }
3185 3363
3186 return val << PAGE_SHIFT; 3364 return val << PAGE_SHIFT;
3187} 3365}
@@ -3389,9 +3567,9 @@ struct {
3389}; 3567};
3390 3568
3391 3569
3392static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 3570static void
3571mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3393{ 3572{
3394 struct mcs_total_stat *s = data;
3395 s64 val; 3573 s64 val;
3396 3574
3397 /* per cpu stat */ 3575 /* per cpu stat */
@@ -3421,13 +3599,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3421 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3599 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3422 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3600 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3423 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3601 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3424 return 0;
3425} 3602}
3426 3603
3427static void 3604static void
3428mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3605mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3429{ 3606{
3430 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 3607 struct mem_cgroup *iter;
3608
3609 for_each_mem_cgroup_tree(iter, mem)
3610 mem_cgroup_get_local_stat(iter, s);
3431} 3611}
3432 3612
3433static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3613static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
@@ -3604,7 +3784,7 @@ static int compare_thresholds(const void *a, const void *b)
3604 return _a->threshold - _b->threshold; 3784 return _a->threshold - _b->threshold;
3605} 3785}
3606 3786
3607static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) 3787static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
3608{ 3788{
3609 struct mem_cgroup_eventfd_list *ev; 3789 struct mem_cgroup_eventfd_list *ev;
3610 3790
@@ -3615,7 +3795,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3615 3795
3616static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 3796static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3617{ 3797{
3618 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); 3798 struct mem_cgroup *iter;
3799
3800 for_each_mem_cgroup_tree(iter, mem)
3801 mem_cgroup_oom_notify_cb(iter);
3619} 3802}
3620 3803
3621static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 3804static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
@@ -4032,6 +4215,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4032 vfree(mem); 4215 vfree(mem);
4033 mem = NULL; 4216 mem = NULL;
4034 } 4217 }
4218 spin_lock_init(&mem->pcp_counter_lock);
4035 return mem; 4219 return mem;
4036} 4220}
4037 4221
@@ -4158,7 +4342,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4158 &per_cpu(memcg_stock, cpu); 4342 &per_cpu(memcg_stock, cpu);
4159 INIT_WORK(&stock->work, drain_local_stock); 4343 INIT_WORK(&stock->work, drain_local_stock);
4160 } 4344 }
4161 hotcpu_notifier(memcg_stock_cpu_callback, 0); 4345 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4162 } else { 4346 } else {
4163 parent = mem_cgroup_from_cont(cont->parent); 4347 parent = mem_cgroup_from_cont(cont->parent);
4164 mem->use_hierarchy = parent->use_hierarchy; 4348 mem->use_hierarchy = parent->use_hierarchy;
@@ -4513,6 +4697,7 @@ static void mem_cgroup_clear_mc(void)
4513 mc.to = NULL; 4697 mc.to = NULL;
4514 mc.moving_task = NULL; 4698 mc.moving_task = NULL;
4515 spin_unlock(&mc.lock); 4699 spin_unlock(&mc.lock);
4700 mem_cgroup_end_move(from);
4516 memcg_oom_recover(from); 4701 memcg_oom_recover(from);
4517 memcg_oom_recover(to); 4702 memcg_oom_recover(to);
4518 wake_up_all(&mc.waitq); 4703 wake_up_all(&mc.waitq);
@@ -4543,6 +4728,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4543 VM_BUG_ON(mc.moved_charge); 4728 VM_BUG_ON(mc.moved_charge);
4544 VM_BUG_ON(mc.moved_swap); 4729 VM_BUG_ON(mc.moved_swap);
4545 VM_BUG_ON(mc.moving_task); 4730 VM_BUG_ON(mc.moving_task);
4731 mem_cgroup_start_move(from);
4546 spin_lock(&mc.lock); 4732 spin_lock(&mc.lock);
4547 mc.from = from; 4733 mc.from = from;
4548 mc.to = mem; 4734 mc.to = mem;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe..124324134ff6 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
7 * Free Software Foundation. 7 * Free Software Foundation.
8 * 8 *
9 * High level machine check handler. Handles pages reported by the 9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted usually due to a 2bit ECC memory or cache 10 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
11 * failure. 11 * failure.
12 *
13 * In addition there is a "soft offline" entry point that allows stopping the
14 * use of pages that are suspicious but not yet corrupted, without killing anything.
12 * 15 *
13 * Handles page cache pages in various states. The tricky part 16 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronous to other VM 17 * here is that we can access any page asynchronously with respect to
15 * users, because memory failures could happen anytime and anywhere, 18 * other VM users, because memory failures could happen anytime and
16 * possibly violating some of their assumptions. This is why this code 19 * anywhere. This could violate some of their assumptions. This is why
17 * has to be extremely careful. Generally it tries to use normal locking 20 * this code has to be extremely careful. Generally it tries to use
18 * rules, as in get the standard locks, even if that means the 21 * normal locking rules, as in get the standard locks, even if that means
19 * error handling takes potentially a long time. 22 * the error handling takes potentially a long time.
20 * 23 *
21 * The operation to map back from RMAP chains to processes has to walk 24 * There are several operations here with exponential complexity because
22 * the complete process list and has non linear complexity with the number 25 * of unsuitable VM data structures. For example the operation to map back
23 * mappings. In short it can be quite slow. But since memory corruptions 26 * from RMAP chains to processes has to walk the complete process list and
24 * are rare we hope to get away with this. 27 * has non-linear complexity in the number of mappings. But since memory corruptions
28 * are rare we hope to get away with this. This avoids impacting the core
29 * VM.
25 */ 30 */
26 31
27/* 32/*
@@ -30,7 +35,6 @@
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages 35 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel 36 * - pass bad pages to kdump next kernel
32 */ 37 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h> 38#include <linux/kernel.h>
35#include <linux/mm.h> 39#include <linux/mm.h>
36#include <linux/page-flags.h> 40#include <linux/page-flags.h>
@@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p)
78 return 0; 82 return 0;
79 83
80 /* 84 /*
81 * page_mapping() does not accept slab page 85 * page_mapping() does not accept slab pages.
82 */ 86 */
83 if (PageSlab(p)) 87 if (PageSlab(p))
84 return -EINVAL; 88 return -EINVAL;
@@ -268,7 +272,7 @@ struct to_kill {
268 struct list_head nd; 272 struct list_head nd;
269 struct task_struct *tsk; 273 struct task_struct *tsk;
270 unsigned long addr; 274 unsigned long addr;
271 unsigned addr_valid:1; 275 char addr_valid;
272}; 276};
273 277
274/* 278/*
@@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
309 * a SIGKILL because the error is not contained anymore. 313 * a SIGKILL because the error is not contained anymore.
310 */ 314 */
311 if (tk->addr == -EFAULT) { 315 if (tk->addr == -EFAULT) {
312 pr_debug("MCE: Unable to find user space address %lx in %s\n", 316 pr_info("MCE: Unable to find user space address %lx in %s\n",
313 page_to_pfn(p), tsk->comm); 317 page_to_pfn(p), tsk->comm);
314 tk->addr_valid = 0; 318 tk->addr_valid = 0;
315 } 319 }
@@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
577 pfn, err); 581 pfn, err);
578 } else if (page_has_private(p) && 582 } else if (page_has_private(p) &&
579 !try_to_release_page(p, GFP_NOIO)) { 583 !try_to_release_page(p, GFP_NOIO)) {
580 pr_debug("MCE %#lx: failed to release buffers\n", pfn); 584 pr_info("MCE %#lx: failed to release buffers\n", pfn);
581 } else { 585 } else {
582 ret = RECOVERED; 586 ret = RECOVERED;
583 } 587 }
@@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
693 * Issues: 697 * Issues:
694 * - Error on hugepage is contained in hugepage unit (not in raw page unit.) 698 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
695 * To narrow down kill region to one page, we need to break up pmd. 699 * To narrow down kill region to one page, we need to break up pmd.
696 * - To support soft-offlining for hugepage, we need to support hugepage
697 * migration.
698 */ 700 */
699static int me_huge_page(struct page *p, unsigned long pfn) 701static int me_huge_page(struct page *p, unsigned long pfn)
700{ 702{
703 int res = 0;
701 struct page *hpage = compound_head(p); 704 struct page *hpage = compound_head(p);
702 /* 705 /*
703 * We can safely recover from error on free or reserved (i.e. 706 * We can safely recover from error on free or reserved (i.e.
@@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
710 * so there is no race between isolation and mapping/unmapping. 713 * so there is no race between isolation and mapping/unmapping.
711 */ 714 */
712 if (!(page_mapping(hpage) || PageAnon(hpage))) { 715 if (!(page_mapping(hpage) || PageAnon(hpage))) {
713 __isolate_hwpoisoned_huge_page(hpage); 716 res = dequeue_hwpoisoned_huge_page(hpage);
714 return RECOVERED; 717 if (!res)
718 return RECOVERED;
715 } 719 }
716 return DELAYED; 720 return DELAYED;
717} 721}
@@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p,
836 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; 840 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
837} 841}
838 842
839#define N_UNMAP_TRIES 5
840
841/* 843/*
842 * Do all that is necessary to remove user space mappings. Unmap 844 * Do all that is necessary to remove user space mappings. Unmap
843 * the pages and send SIGBUS to the processes if the data was dirty. 845 * the pages and send SIGBUS to the processes if the data was dirty.
@@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
849 struct address_space *mapping; 851 struct address_space *mapping;
850 LIST_HEAD(tokill); 852 LIST_HEAD(tokill);
851 int ret; 853 int ret;
852 int i;
853 int kill = 1; 854 int kill = 1;
854 struct page *hpage = compound_head(p); 855 struct page *hpage = compound_head(p);
855 856
@@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
903 if (kill) 904 if (kill)
904 collect_procs(hpage, &tokill); 905 collect_procs(hpage, &tokill);
905 906
906 /* 907 ret = try_to_unmap(hpage, ttu);
907 * try_to_unmap can fail temporarily due to races.
908 * Try a few times (RED-PEN better strategy?)
909 */
910 for (i = 0; i < N_UNMAP_TRIES; i++) {
911 ret = try_to_unmap(hpage, ttu);
912 if (ret == SWAP_SUCCESS)
913 break;
914 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
915 }
916
917 if (ret != SWAP_SUCCESS) 908 if (ret != SWAP_SUCCESS)
918 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 909 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
919 pfn, page_mapcount(hpage)); 910 pfn, page_mapcount(hpage));
@@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
981 * We need/can do nothing about count=0 pages. 972 * We need/can do nothing about count=0 pages.
982 * 1) it's a free page, and therefore in safe hand: 973 * 1) it's a free page, and therefore in safe hand:
983 * prep_new_page() will be the gate keeper. 974 * prep_new_page() will be the gate keeper.
984 * 2) it's part of a non-compound high order page. 975 * 2) it's a free hugepage, which is also safe:
976 * an affected hugepage will be dequeued from hugepage freelist,
977 * so there's no concern about reusing it ever after.
978 * 3) it's part of a non-compound high order page.
985 * Implies some kernel user: cannot stop them from 979 * Implies some kernel user: cannot stop them from
986 * R/W the page; let's pray that the page has been 980 * R/W the page; let's pray that the page has been
987 * used and will be freed some time later. 981 * used and will be freed some time later.
@@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
993 if (is_free_buddy_page(p)) { 987 if (is_free_buddy_page(p)) {
994 action_result(pfn, "free buddy", DELAYED); 988 action_result(pfn, "free buddy", DELAYED);
995 return 0; 989 return 0;
990 } else if (PageHuge(hpage)) {
991 /*
992 * Check "just unpoisoned", "filter hit", and
993 * "race with other subpage."
994 */
995 lock_page_nosync(hpage);
996 if (!PageHWPoison(hpage)
997 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
998 || (p != hpage && TestSetPageHWPoison(hpage))) {
999 atomic_long_sub(nr_pages, &mce_bad_pages);
1000 return 0;
1001 }
1002 set_page_hwpoison_huge_page(hpage);
1003 res = dequeue_hwpoisoned_huge_page(hpage);
1004 action_result(pfn, "free huge",
1005 res ? IGNORED : DELAYED);
1006 unlock_page(hpage);
1007 return res;
996 } else { 1008 } else {
997 action_result(pfn, "high order kernel", IGNORED); 1009 action_result(pfn, "high order kernel", IGNORED);
998 return -EBUSY; 1010 return -EBUSY;
@@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn)
1147 page = compound_head(p); 1159 page = compound_head(p);
1148 1160
1149 if (!PageHWPoison(p)) { 1161 if (!PageHWPoison(p)) {
1150 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); 1162 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1151 return 0; 1163 return 0;
1152 } 1164 }
1153 1165
1154 nr_pages = 1 << compound_order(page); 1166 nr_pages = 1 << compound_order(page);
1155 1167
1156 if (!get_page_unless_zero(page)) { 1168 if (!get_page_unless_zero(page)) {
1169 /*
1170 * Since a HWPoisoned hugepage should have a non-zero refcount,
1171 * getting here means a race between memory failure and unpoison
1172 * has happened. In that case unpoison fails and memory failure
1173 * runs to the end.
1174 */
1175 if (PageHuge(page)) {
1176 pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1177 return 0;
1178 }
1157 if (TestClearPageHWPoison(p)) 1179 if (TestClearPageHWPoison(p))
1158 atomic_long_sub(nr_pages, &mce_bad_pages); 1180 atomic_long_sub(nr_pages, &mce_bad_pages);
1159 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1181 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1160 return 0; 1182 return 0;
1161 } 1183 }
1162 1184
@@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn)
1168 * the free buddy page pool. 1190 * the free buddy page pool.
1169 */ 1191 */
1170 if (TestClearPageHWPoison(page)) { 1192 if (TestClearPageHWPoison(page)) {
1171 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1193 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1172 atomic_long_sub(nr_pages, &mce_bad_pages); 1194 atomic_long_sub(nr_pages, &mce_bad_pages);
1173 freeit = 1; 1195 freeit = 1;
1196 if (PageHuge(page))
1197 clear_page_hwpoison_huge_page(page);
1174 } 1198 }
1175 if (PageHuge(p))
1176 clear_page_hwpoison_huge_page(page);
1177 unlock_page(page); 1199 unlock_page(page);
1178 1200
1179 put_page(page); 1201 put_page(page);
@@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory);
1187static struct page *new_page(struct page *p, unsigned long private, int **x) 1209static struct page *new_page(struct page *p, unsigned long private, int **x)
1188{ 1210{
1189 int nid = page_to_nid(p); 1211 int nid = page_to_nid(p);
1190 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1212 if (PageHuge(p))
1213 return alloc_huge_page_node(page_hstate(compound_head(p)),
1214 nid);
1215 else
1216 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1191} 1217}
1192 1218
1193/* 1219/*
@@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1215 * was free. 1241 * was free.
1216 */ 1242 */
1217 set_migratetype_isolate(p); 1243 set_migratetype_isolate(p);
1244 /*
1245 * When the target page is a free hugepage, just remove it
1246 * from free hugepage list.
1247 */
1218 if (!get_page_unless_zero(compound_head(p))) { 1248 if (!get_page_unless_zero(compound_head(p))) {
1219 if (is_free_buddy_page(p)) { 1249 if (PageHuge(p)) {
1220 pr_debug("get_any_page: %#lx free buddy page\n", pfn); 1250 pr_info("get_any_page: %#lx free huge page\n", pfn);
1251 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1252 } else if (is_free_buddy_page(p)) {
1253 pr_info("get_any_page: %#lx free buddy page\n", pfn);
1221 /* Set hwpoison bit while page is still isolated */ 1254 /* Set hwpoison bit while page is still isolated */
1222 SetPageHWPoison(p); 1255 SetPageHWPoison(p);
1223 ret = 0; 1256 ret = 0;
1224 } else { 1257 } else {
1225 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1258 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1226 pfn, p->flags); 1259 pfn, p->flags);
1227 ret = -EIO; 1260 ret = -EIO;
1228 } 1261 }
@@ -1235,6 +1268,46 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1235 return ret; 1268 return ret;
1236} 1269}
1237 1270
1271static int soft_offline_huge_page(struct page *page, int flags)
1272{
1273 int ret;
1274 unsigned long pfn = page_to_pfn(page);
1275 struct page *hpage = compound_head(page);
1276 LIST_HEAD(pagelist);
1277
1278 ret = get_any_page(page, pfn, flags);
1279 if (ret < 0)
1280 return ret;
1281 if (ret == 0)
1282 goto done;
1283
1284 if (PageHWPoison(hpage)) {
1285 put_page(hpage);
1286 pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
1287 return -EBUSY;
1288 }
1289
1290 /* Keep page count to indicate a given hugepage is isolated. */
1291
1292 list_add(&hpage->lru, &pagelist);
1293 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1294 if (ret) {
1295 putback_lru_pages(&pagelist);
1296 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1297 pfn, ret, page->flags);
1298 if (ret > 0)
1299 ret = -EIO;
1300 return ret;
1301 }
1302done:
1303 if (!PageHWPoison(hpage))
1304 atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
1305 set_page_hwpoison_huge_page(hpage);
1306 dequeue_hwpoisoned_huge_page(hpage);
1307 /* keep elevated page count for bad page */
1308 return ret;
1309}
1310
1238/** 1311/**
1239 * soft_offline_page - Soft offline a page. 1312 * soft_offline_page - Soft offline a page.
1240 * @page: page to offline 1313 * @page: page to offline
@@ -1262,6 +1335,9 @@ int soft_offline_page(struct page *page, int flags)
1262 int ret; 1335 int ret;
1263 unsigned long pfn = page_to_pfn(page); 1336 unsigned long pfn = page_to_pfn(page);
1264 1337
1338 if (PageHuge(page))
1339 return soft_offline_huge_page(page, flags);
1340
1265 ret = get_any_page(page, pfn, flags); 1341 ret = get_any_page(page, pfn, flags);
1266 if (ret < 0) 1342 if (ret < 0)
1267 return ret; 1343 return ret;
@@ -1288,7 +1364,7 @@ int soft_offline_page(struct page *page, int flags)
1288 goto done; 1364 goto done;
1289 } 1365 }
1290 if (!PageLRU(page)) { 1366 if (!PageLRU(page)) {
1291 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", 1367 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1292 pfn, page->flags); 1368 pfn, page->flags);
1293 return -EIO; 1369 return -EIO;
1294 } 1370 }
@@ -1302,7 +1378,7 @@ int soft_offline_page(struct page *page, int flags)
1302 if (PageHWPoison(page)) { 1378 if (PageHWPoison(page)) {
1303 unlock_page(page); 1379 unlock_page(page);
1304 put_page(page); 1380 put_page(page);
1305 pr_debug("soft offline: %#lx page already poisoned\n", pfn); 1381 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1306 return -EBUSY; 1382 return -EBUSY;
1307 } 1383 }
1308 1384
@@ -1323,7 +1399,7 @@ int soft_offline_page(struct page *page, int flags)
1323 put_page(page); 1399 put_page(page);
1324 if (ret == 1) { 1400 if (ret == 1) {
1325 ret = 0; 1401 ret = 0;
1326 pr_debug("soft_offline: %#lx: invalidated\n", pfn); 1402 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1327 goto done; 1403 goto done;
1328 } 1404 }
1329 1405
@@ -1339,13 +1415,13 @@ int soft_offline_page(struct page *page, int flags)
1339 list_add(&page->lru, &pagelist); 1415 list_add(&page->lru, &pagelist);
1340 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1416 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1341 if (ret) { 1417 if (ret) {
1342 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1418 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1343 pfn, ret, page->flags); 1419 pfn, ret, page->flags);
1344 if (ret > 0) 1420 if (ret > 0)
1345 ret = -EIO; 1421 ret = -EIO;
1346 } 1422 }
1347 } else { 1423 } else {
1348 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1424 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1349 pfn, ret, page_count(page), page->flags); 1425 pfn, ret, page_count(page), page->flags);
1350 } 1426 }
1351 if (ret) 1427 if (ret)
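
The memory-failure.c hunks above wire hugetlb pages into the soft-offline path: soft_offline_page() now dispatches PageHuge() pages to soft_offline_huge_page(), which pins the compound head, migrates the whole hugepage via migrate_huge_pages(), and only then marks it hwpoisoned. The user-space sketch below is illustrative only and is not part of the patch; it assumes CONFIG_MEMORY_FAILURE, a populated 2MB hugepage pool (vm.nr_hugepages > 0), CAP_SYS_ADMIN, and x86 constants for MAP_HUGETLB.

    /* illustrative sketch, not from the patch: exercise the new hugepage
     * soft-offline path through madvise(MADV_SOFT_OFFLINE) */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB 0x40000          /* x86 value, arch-specific */
    #endif
    #ifndef MADV_SOFT_OFFLINE
    #define MADV_SOFT_OFFLINE 101        /* asm-generic/mman-common.h */
    #endif

    int main(void)
    {
            size_t len = 2UL << 20;      /* assume a 2MB default hugepage */
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
            if (p == MAP_FAILED)
                    return perror("mmap"), 1;
            memset(p, 0xaa, len);        /* fault the hugepage in */

            /* kernel side: soft_offline_page() sees PageHuge() and calls
             * soft_offline_huge_page(), migrating the whole hugepage */
            if (madvise(p, len, MADV_SOFT_OFFLINE))
                    perror("madvise(MADV_SOFT_OFFLINE)");
            else
                    printf("hugepage at %p soft-offlined, contents migrated\n",
                           (void *)p);
            munmap(p, len);
            return 0;
    }
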
diff --git a/mm/memory.c b/mm/memory.c
index 0e18b4d649ec..02e48aa0ed13 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -736,7 +736,7 @@ again:
736 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 736 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
737 if (!dst_pte) 737 if (!dst_pte)
738 return -ENOMEM; 738 return -ENOMEM;
739 src_pte = pte_offset_map_nested(src_pmd, addr); 739 src_pte = pte_offset_map(src_pmd, addr);
740 src_ptl = pte_lockptr(src_mm, src_pmd); 740 src_ptl = pte_lockptr(src_mm, src_pmd);
741 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 741 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
742 orig_src_pte = src_pte; 742 orig_src_pte = src_pte;
@@ -767,7 +767,7 @@ again:
767 767
768 arch_leave_lazy_mmu_mode(); 768 arch_leave_lazy_mmu_mode();
769 spin_unlock(src_ptl); 769 spin_unlock(src_ptl);
770 pte_unmap_nested(orig_src_pte); 770 pte_unmap(orig_src_pte);
771 add_mm_rss_vec(dst_mm, rss); 771 add_mm_rss_vec(dst_mm, rss);
772 pte_unmap_unlock(orig_dst_pte, dst_ptl); 772 pte_unmap_unlock(orig_dst_pte, dst_ptl);
773 cond_resched(); 773 cond_resched();
@@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1450 if (ret & VM_FAULT_OOM) 1450 if (ret & VM_FAULT_OOM)
1451 return i ? i : -ENOMEM; 1451 return i ? i : -ENOMEM;
1452 if (ret & 1452 if (ret &
1453 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) 1453 (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
1454 VM_FAULT_SIGBUS))
1454 return i ? i : -EFAULT; 1455 return i ? i : -EFAULT;
1455 BUG(); 1456 BUG();
1456 } 1457 }
@@ -1590,7 +1591,7 @@ struct page *get_dump_page(unsigned long addr)
1590} 1591}
1591#endif /* CONFIG_ELF_CORE */ 1592#endif /* CONFIG_ELF_CORE */
1592 1593
1593pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1594pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1594 spinlock_t **ptl) 1595 spinlock_t **ptl)
1595{ 1596{
1596 pgd_t * pgd = pgd_offset(mm, addr); 1597 pgd_t * pgd = pgd_offset(mm, addr);
@@ -2079,7 +2080,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2079 * zeroes. 2080 * zeroes.
2080 */ 2081 */
2081 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 2082 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2082 memset(kaddr, 0, PAGE_SIZE); 2083 clear_page(kaddr);
2083 kunmap_atomic(kaddr, KM_USER0); 2084 kunmap_atomic(kaddr, KM_USER0);
2084 flush_dcache_page(dst); 2085 flush_dcache_page(dst);
2085 } else 2086 } else
@@ -2107,6 +2108,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2107static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2108static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2108 unsigned long address, pte_t *page_table, pmd_t *pmd, 2109 unsigned long address, pte_t *page_table, pmd_t *pmd,
2109 spinlock_t *ptl, pte_t orig_pte) 2110 spinlock_t *ptl, pte_t orig_pte)
2111 __releases(ptl)
2110{ 2112{
2111 struct page *old_page, *new_page; 2113 struct page *old_page, *new_page;
2112 pte_t entry; 2114 pte_t entry;
@@ -2626,6 +2628,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2626 struct page *page, *swapcache = NULL; 2628 struct page *page, *swapcache = NULL;
2627 swp_entry_t entry; 2629 swp_entry_t entry;
2628 pte_t pte; 2630 pte_t pte;
2631 int locked;
2629 struct mem_cgroup *ptr = NULL; 2632 struct mem_cgroup *ptr = NULL;
2630 int exclusive = 0; 2633 int exclusive = 0;
2631 int ret = 0; 2634 int ret = 0;
@@ -2676,8 +2679,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2676 goto out_release; 2679 goto out_release;
2677 } 2680 }
2678 2681
2679 lock_page(page); 2682 locked = lock_page_or_retry(page, mm, flags);
2680 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2683 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2684 if (!locked) {
2685 ret |= VM_FAULT_RETRY;
2686 goto out_release;
2687 }
2681 2688
2682 /* 2689 /*
2683 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not 2690 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
@@ -2926,7 +2933,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2926 vmf.page = NULL; 2933 vmf.page = NULL;
2927 2934
2928 ret = vma->vm_ops->fault(vma, &vmf); 2935 ret = vma->vm_ops->fault(vma, &vmf);
2929 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2936 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
2937 VM_FAULT_RETRY)))
2930 return ret; 2938 return ret;
2931 2939
2932 if (unlikely(PageHWPoison(vmf.page))) { 2940 if (unlikely(PageHWPoison(vmf.page))) {
@@ -3185,7 +3193,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
3185 * with threads. 3193 * with threads.
3186 */ 3194 */
3187 if (flags & FAULT_FLAG_WRITE) 3195 if (flags & FAULT_FLAG_WRITE)
3188 flush_tlb_page(vma, address); 3196 flush_tlb_fix_spurious_fault(vma, address);
3189 } 3197 }
3190unlock: 3198unlock:
3191 pte_unmap_unlock(pte, ptl); 3199 pte_unmap_unlock(pte, ptl);
@@ -3343,7 +3351,7 @@ int in_gate_area_no_task(unsigned long addr)
3343 3351
3344#endif /* __HAVE_ARCH_GATE_AREA */ 3352#endif /* __HAVE_ARCH_GATE_AREA */
3345 3353
3346static int follow_pte(struct mm_struct *mm, unsigned long address, 3354static int __follow_pte(struct mm_struct *mm, unsigned long address,
3347 pte_t **ptepp, spinlock_t **ptlp) 3355 pte_t **ptepp, spinlock_t **ptlp)
3348{ 3356{
3349 pgd_t *pgd; 3357 pgd_t *pgd;
@@ -3380,6 +3388,17 @@ out:
3380 return -EINVAL; 3388 return -EINVAL;
3381} 3389}
3382 3390
3391static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3392 pte_t **ptepp, spinlock_t **ptlp)
3393{
3394 int res;
3395
3396 /* (void) is needed to make gcc happy */
3397 (void) __cond_lock(*ptlp,
3398 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3399 return res;
3400}
3401
3383/** 3402/**
3384 * follow_pfn - look up PFN at a user virtual address 3403 * follow_pfn - look up PFN at a user virtual address
3385 * @vma: memory mapping 3404 * @vma: memory mapping
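
The __follow_pte()/follow_pte() split above is purely a sparse annotation trick: the worker does the lookup, and the inline wrapper uses __cond_lock() so the checker knows *ptlp is held exactly when 0 is returned. Below is a minimal sketch of the same idiom with made-up names (struct bucket, bucket_get_locked); it is not part of the patch.

    #include <linux/compiler.h>
    #include <linux/errno.h>
    #include <linux/spinlock.h>

    struct bucket {
            spinlock_t lock;
            int value;
    };

    /* worker: returns 0 with b->lock held, -ENOENT with it released */
    static int __bucket_get_locked(struct bucket *b, spinlock_t **ptlp)
    {
            spin_lock(&b->lock);
            if (b->value) {
                    *ptlp = &b->lock;
                    return 0;
            }
            spin_unlock(&b->lock);
            return -ENOENT;
    }

    /* wrapper: the (void) __cond_lock() tells sparse that the lock is
     * acquired exactly when the worker returned 0, mirroring follow_pte() */
    static inline int bucket_get_locked(struct bucket *b, spinlock_t **ptlp)
    {
            int res;

            (void) __cond_lock(*ptlp, !(res = __bucket_get_locked(b, ptlp)));
            return res;
    }
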
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index dd186c1a5d53..9260314a221e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -602,27 +602,14 @@ static struct page *next_active_pageblock(struct page *page)
602/* Checks if this range of memory is likely to be hot-removable. */ 602/* Checks if this range of memory is likely to be hot-removable. */
603int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 603int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
604{ 604{
605 int type;
606 struct page *page = pfn_to_page(start_pfn); 605 struct page *page = pfn_to_page(start_pfn);
607 struct page *end_page = page + nr_pages; 606 struct page *end_page = page + nr_pages;
608 607
609 /* Check the starting page of each pageblock within the range */ 608 /* Check the starting page of each pageblock within the range */
610 for (; page < end_page; page = next_active_pageblock(page)) { 609 for (; page < end_page; page = next_active_pageblock(page)) {
611 type = get_pageblock_migratetype(page); 610 if (!is_pageblock_removable_nolock(page))
612
613 /*
614 * A pageblock containing MOVABLE or free pages is considered
615 * removable
616 */
617 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
618 return 0;
619
620 /*
621 * A pageblock starting with a PageReserved page is not
622 * considered removable.
623 */
624 if (PageReserved(page))
625 return 0; 611 return 0;
612 cond_resched();
626 } 613 }
627 614
628 /* All pageblocks in the memory block are likely to be hot-removable */ 615 /* All pageblocks in the memory block are likely to be hot-removable */
@@ -659,7 +646,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
659 * Scanning pfn is much easier than scanning lru list. 646 * Scanning pfn is much easier than scanning lru list.
660 * Scan pfn from start to end and Find LRU page. 647 * Scan pfn from start to end and Find LRU page.
661 */ 648 */
662int scan_lru_pages(unsigned long start, unsigned long end) 649static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
663{ 650{
664 unsigned long pfn; 651 unsigned long pfn;
665 struct page *page; 652 struct page *page;
@@ -709,29 +696,30 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
709 page_is_file_cache(page)); 696 page_is_file_cache(page));
710 697
711 } else { 698 } else {
712 /* Becasue we don't have big zone->lock. we should
713 check this again here. */
714 if (page_count(page))
715 not_managed++;
716#ifdef CONFIG_DEBUG_VM 699#ifdef CONFIG_DEBUG_VM
717 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 700 printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
718 pfn); 701 pfn);
719 dump_page(page); 702 dump_page(page);
720#endif 703#endif
 704 /* Because we don't have the big zone->lock, we should
 705 check this again here. */
706 if (page_count(page)) {
707 not_managed++;
708 ret = -EBUSY;
709 break;
710 }
721 } 711 }
722 } 712 }
723 ret = -EBUSY; 713 if (!list_empty(&source)) {
724 if (not_managed) { 714 if (not_managed) {
725 if (!list_empty(&source)) 715 putback_lru_pages(&source);
716 goto out;
717 }
718 /* this function returns # of failed pages */
719 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
720 if (ret)
726 putback_lru_pages(&source); 721 putback_lru_pages(&source);
727 goto out;
728 } 722 }
729 ret = 0;
730 if (list_empty(&source))
731 goto out;
732 /* this function returns # of failed pages */
733 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
734
735out: 723out:
736 return ret; 724 return ret;
737} 725}
@@ -840,7 +828,6 @@ repeat:
840 ret = 0; 828 ret = 0;
841 if (drain) { 829 if (drain) {
842 lru_add_drain_all(); 830 lru_add_drain_all();
843 flush_scheduled_work();
844 cond_resched(); 831 cond_resched();
845 drain_all_pages(); 832 drain_all_pages();
846 } 833 }
@@ -862,7 +849,6 @@ repeat:
862 } 849 }
 863 /* drain all zone's lru pagevec, this is asynchronous... */ 850 /* drain all zone's lru pagevec, this is asynchronous... */
864 lru_add_drain_all(); 851 lru_add_drain_all();
865 flush_scheduled_work();
866 yield(); 852 yield();
 867 /* drain pcp pages, this is synchronous. */ 853 /* drain pcp pages, this is synchronous. */
868 drain_all_pages(); 854 drain_all_pages();
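
is_mem_section_removable() now delegates the per-pageblock decision to is_pageblock_removable_nolock() (added to page_alloc.c later in this diff) and adds a cond_resched() per pageblock, while do_migrate_range() reports -EBUSY for pinned pages and puts migration failures back itself. The answer surfaces in the memory block's sysfs "removable" attribute; the reader below is only an illustration, and the memory block number in the path is an arbitrary assumption.

    /* illustrative user-space check, not from the patch */
    #include <stdio.h>

    int main(void)
    {
            const char *path = "/sys/devices/system/memory/memory32/removable";
            FILE *f = fopen(path, "r");
            int removable = 0;

            if (!f)
                    return perror(path), 1;
            if (fscanf(f, "%d", &removable) != 1)
                    removable = 0;           /* treat parse failure as "no" */
            fclose(f);

            printf("%s -> %s\n", path,
                   removable ? "likely hot-removable" : "not removable");
            return 0;
    }
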
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f969da5dd8a2..4a57f135b76e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -924,15 +924,21 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
924 nodemask_t nmask; 924 nodemask_t nmask;
925 LIST_HEAD(pagelist); 925 LIST_HEAD(pagelist);
926 int err = 0; 926 int err = 0;
927 struct vm_area_struct *vma;
927 928
928 nodes_clear(nmask); 929 nodes_clear(nmask);
929 node_set(source, nmask); 930 node_set(source, nmask);
930 931
931 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 932 vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
932 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 933 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
934 if (IS_ERR(vma))
935 return PTR_ERR(vma);
933 936
934 if (!list_empty(&pagelist)) 937 if (!list_empty(&pagelist)) {
935 err = migrate_pages(&pagelist, new_node_page, dest, 0); 938 err = migrate_pages(&pagelist, new_node_page, dest, 0);
939 if (err)
940 putback_lru_pages(&pagelist);
941 }
936 942
937 return err; 943 return err;
938} 944}
@@ -1147,9 +1153,12 @@ static long do_mbind(unsigned long start, unsigned long len,
1147 1153
1148 err = mbind_range(mm, start, end, new); 1154 err = mbind_range(mm, start, end, new);
1149 1155
1150 if (!list_empty(&pagelist)) 1156 if (!list_empty(&pagelist)) {
1151 nr_failed = migrate_pages(&pagelist, new_vma_page, 1157 nr_failed = migrate_pages(&pagelist, new_vma_page,
1152 (unsigned long)vma, 0); 1158 (unsigned long)vma, 0);
1159 if (nr_failed)
1160 putback_lru_pages(&pagelist);
1161 }
1153 1162
1154 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1163 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1155 err = -EIO; 1164 err = -EIO;
@@ -1588,7 +1597,7 @@ unsigned slab_node(struct mempolicy *policy)
1588 (void)first_zones_zonelist(zonelist, highest_zoneidx, 1597 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1589 &policy->v.nodes, 1598 &policy->v.nodes,
1590 &zone); 1599 &zone);
1591 return zone->node; 1600 return zone ? zone->node : numa_node_id();
1592 } 1601 }
1593 1602
1594 default: 1603 default:
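
Both mempolicy.c call sites above adopt the new contract introduced by the migrate.c change later in this diff: migrate_pages() no longer drains its list on return, so pages that could not be migrated must be handed back with putback_lru_pages() by the caller. A condensed sketch of that caller-side pattern, using a hypothetical wrapper name:

    #include <linux/errno.h>
    #include <linux/list.h>
    #include <linux/migrate.h>

    /* hypothetical helper showing the post-series calling convention */
    static int migrate_isolated_list(struct list_head *pages,
                                     new_page_t get_new_page, unsigned long priv)
    {
            int nr_failed;

            if (list_empty(pages))
                    return 0;

            /* returns the number of pages not migrated, or -errno */
            nr_failed = migrate_pages(pages, get_new_page, priv, 0);
            if (nr_failed)
                    putback_lru_pages(pages);    /* now the caller's job */

            return nr_failed > 0 ? -EBUSY : nr_failed;
    }

migrate_to_node() and do_mbind() above, and do_move_page_to_node_array() later in this diff, all follow this shape.
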
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..fe5a3c6a5426 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h>
35#include <linux/gfp.h> 36#include <linux/gfp.h>
36 37
37#include "internal.h" 38#include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
95 pte_t *ptep, pte; 96 pte_t *ptep, pte;
96 spinlock_t *ptl; 97 spinlock_t *ptl;
97 98
98 pgd = pgd_offset(mm, addr); 99 if (unlikely(PageHuge(new))) {
99 if (!pgd_present(*pgd)) 100 ptep = huge_pte_offset(mm, addr);
100 goto out; 101 if (!ptep)
102 goto out;
103 ptl = &mm->page_table_lock;
104 } else {
105 pgd = pgd_offset(mm, addr);
106 if (!pgd_present(*pgd))
107 goto out;
101 108
102 pud = pud_offset(pgd, addr); 109 pud = pud_offset(pgd, addr);
103 if (!pud_present(*pud)) 110 if (!pud_present(*pud))
104 goto out; 111 goto out;
105 112
106 pmd = pmd_offset(pud, addr); 113 pmd = pmd_offset(pud, addr);
107 if (!pmd_present(*pmd)) 114 if (!pmd_present(*pmd))
108 goto out; 115 goto out;
109 116
110 ptep = pte_offset_map(pmd, addr); 117 ptep = pte_offset_map(pmd, addr);
111 118
112 if (!is_swap_pte(*ptep)) { 119 if (!is_swap_pte(*ptep)) {
113 pte_unmap(ptep); 120 pte_unmap(ptep);
114 goto out; 121 goto out;
115 } 122 }
123
124 ptl = pte_lockptr(mm, pmd);
125 }
116 126
117 ptl = pte_lockptr(mm, pmd);
118 spin_lock(ptl); 127 spin_lock(ptl);
119 pte = *ptep; 128 pte = *ptep;
120 if (!is_swap_pte(pte)) 129 if (!is_swap_pte(pte))
@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
130 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 139 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
131 if (is_write_migration_entry(entry)) 140 if (is_write_migration_entry(entry))
132 pte = pte_mkwrite(pte); 141 pte = pte_mkwrite(pte);
142#ifdef CONFIG_HUGETLB_PAGE
143 if (PageHuge(new))
144 pte = pte_mkhuge(pte);
145#endif
133 flush_cache_page(vma, addr, pte_pfn(pte)); 146 flush_cache_page(vma, addr, pte_pfn(pte));
134 set_pte_at(mm, addr, ptep, pte); 147 set_pte_at(mm, addr, ptep, pte);
135 148
136 if (PageAnon(new)) 149 if (PageHuge(new)) {
150 if (PageAnon(new))
151 hugepage_add_anon_rmap(new, vma, addr);
152 else
153 page_dup_rmap(new);
154 } else if (PageAnon(new))
137 page_add_anon_rmap(new, vma, addr); 155 page_add_anon_rmap(new, vma, addr);
138 else 156 else
139 page_add_file_rmap(new); 157 page_add_file_rmap(new);
@@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
276} 294}
277 295
278/* 296/*
297 * The expected number of remaining references is the same as that
298 * of migrate_page_move_mapping().
299 */
300int migrate_huge_page_move_mapping(struct address_space *mapping,
301 struct page *newpage, struct page *page)
302{
303 int expected_count;
304 void **pslot;
305
306 if (!mapping) {
307 if (page_count(page) != 1)
308 return -EAGAIN;
309 return 0;
310 }
311
312 spin_lock_irq(&mapping->tree_lock);
313
314 pslot = radix_tree_lookup_slot(&mapping->page_tree,
315 page_index(page));
316
317 expected_count = 2 + page_has_private(page);
318 if (page_count(page) != expected_count ||
319 (struct page *)radix_tree_deref_slot(pslot) != page) {
320 spin_unlock_irq(&mapping->tree_lock);
321 return -EAGAIN;
322 }
323
324 if (!page_freeze_refs(page, expected_count)) {
325 spin_unlock_irq(&mapping->tree_lock);
326 return -EAGAIN;
327 }
328
329 get_page(newpage);
330
331 radix_tree_replace_slot(pslot, newpage);
332
333 page_unfreeze_refs(page, expected_count);
334
335 __put_page(page);
336
337 spin_unlock_irq(&mapping->tree_lock);
338 return 0;
339}
340
341/*
279 * Copy the page to its new location 342 * Copy the page to its new location
280 */ 343 */
281static void migrate_page_copy(struct page *newpage, struct page *page) 344void migrate_page_copy(struct page *newpage, struct page *page)
282{ 345{
283 copy_highpage(newpage, page); 346 if (PageHuge(page))
347 copy_huge_page(newpage, page);
348 else
349 copy_highpage(newpage, page);
284 350
285 if (PageError(page)) 351 if (PageError(page))
286 SetPageError(newpage); 352 SetPageError(newpage);
@@ -431,7 +497,6 @@ static int writeout(struct address_space *mapping, struct page *page)
431 .nr_to_write = 1, 497 .nr_to_write = 1,
432 .range_start = 0, 498 .range_start = 0,
433 .range_end = LLONG_MAX, 499 .range_end = LLONG_MAX,
434 .nonblocking = 1,
435 .for_reclaim = 1 500 .for_reclaim = 1
436 }; 501 };
437 int rc; 502 int rc;
@@ -724,6 +789,92 @@ move_newpage:
724} 789}
725 790
726/* 791/*
792 * Counterpart of unmap_and_move_page() for hugepage migration.
793 *
 794 * This function doesn't wait for the completion of hugepage I/O
795 * because there is no race between I/O and migration for hugepage.
796 * Note that currently hugepage I/O occurs only in direct I/O
797 * where no lock is held and PG_writeback is irrelevant,
 798 * and the writeback status of all subpages is counted in the reference
799 * count of the head page (i.e. if all subpages of a 2MB hugepage are
800 * under direct I/O, the reference of the head page is 512 and a bit more.)
 801 * This means that when we try to migrate a hugepage whose subpages are
802 * doing direct I/O, some references remain after try_to_unmap() and
803 * hugepage migration fails without data corruption.
804 *
805 * There is also no race when direct I/O is issued on the page under migration,
806 * because then pte is replaced with migration swap entry and direct I/O code
807 * will wait in the page fault for migration to complete.
808 */
809static int unmap_and_move_huge_page(new_page_t get_new_page,
810 unsigned long private, struct page *hpage,
811 int force, int offlining)
812{
813 int rc = 0;
814 int *result = NULL;
815 struct page *new_hpage = get_new_page(hpage, private, &result);
816 int rcu_locked = 0;
817 struct anon_vma *anon_vma = NULL;
818
819 if (!new_hpage)
820 return -ENOMEM;
821
822 rc = -EAGAIN;
823
824 if (!trylock_page(hpage)) {
825 if (!force)
826 goto out;
827 lock_page(hpage);
828 }
829
830 if (PageAnon(hpage)) {
831 rcu_read_lock();
832 rcu_locked = 1;
833
834 if (page_mapped(hpage)) {
835 anon_vma = page_anon_vma(hpage);
836 atomic_inc(&anon_vma->external_refcount);
837 }
838 }
839
840 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
841
842 if (!page_mapped(hpage))
843 rc = move_to_new_page(new_hpage, hpage, 1);
844
845 if (rc)
846 remove_migration_ptes(hpage, hpage);
847
848 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
849 &anon_vma->lock)) {
850 int empty = list_empty(&anon_vma->head);
851 spin_unlock(&anon_vma->lock);
852 if (empty)
853 anon_vma_free(anon_vma);
854 }
855
856 if (rcu_locked)
857 rcu_read_unlock();
858out:
859 unlock_page(hpage);
860
861 if (rc != -EAGAIN) {
862 list_del(&hpage->lru);
863 put_page(hpage);
864 }
865
866 put_page(new_hpage);
867
868 if (result) {
869 if (rc)
870 *result = rc;
871 else
872 *result = page_to_nid(new_hpage);
873 }
874 return rc;
875}
876
877/*
727 * migrate_pages 878 * migrate_pages
728 * 879 *
729 * The function takes one list of pages to migrate and a function 880 * The function takes one list of pages to migrate and a function
@@ -732,8 +883,9 @@ move_newpage:
732 * 883 *
733 * The function returns after 10 attempts or if no pages 884 * The function returns after 10 attempts or if no pages
734 * are movable anymore because to has become empty 885 * are movable anymore because to has become empty
735 * or no retryable pages exist anymore. All pages will be 886 * or no retryable pages exist anymore.
736 * returned to the LRU or freed. 887 * Caller should call putback_lru_pages to return pages to the LRU
888 * or free list.
737 * 889 *
738 * Return: Number of pages not migrated or error code. 890 * Return: Number of pages not migrated or error code.
739 */ 891 */
@@ -780,7 +932,51 @@ out:
780 if (!swapwrite) 932 if (!swapwrite)
781 current->flags &= ~PF_SWAPWRITE; 933 current->flags &= ~PF_SWAPWRITE;
782 934
783 putback_lru_pages(from); 935 if (rc)
936 return rc;
937
938 return nr_failed + retry;
939}
940
941int migrate_huge_pages(struct list_head *from,
942 new_page_t get_new_page, unsigned long private, int offlining)
943{
944 int retry = 1;
945 int nr_failed = 0;
946 int pass = 0;
947 struct page *page;
948 struct page *page2;
949 int rc;
950
951 for (pass = 0; pass < 10 && retry; pass++) {
952 retry = 0;
953
954 list_for_each_entry_safe(page, page2, from, lru) {
955 cond_resched();
956
957 rc = unmap_and_move_huge_page(get_new_page,
958 private, page, pass > 2, offlining);
959
960 switch(rc) {
961 case -ENOMEM:
962 goto out;
963 case -EAGAIN:
964 retry++;
965 break;
966 case 0:
967 break;
968 default:
969 /* Permanent failure */
970 nr_failed++;
971 break;
972 }
973 }
974 }
975 rc = 0;
976out:
977
978 list_for_each_entry_safe(page, page2, from, lru)
979 put_page(page);
784 980
785 if (rc) 981 if (rc)
786 return rc; 982 return rc;
@@ -841,7 +1037,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
841 1037
842 err = -EFAULT; 1038 err = -EFAULT;
843 vma = find_vma(mm, pp->addr); 1039 vma = find_vma(mm, pp->addr);
844 if (!vma || !vma_migratable(vma)) 1040 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
845 goto set_status; 1041 goto set_status;
846 1042
847 page = follow_page(vma, pp->addr, FOLL_GET); 1043 page = follow_page(vma, pp->addr, FOLL_GET);
@@ -890,9 +1086,12 @@ set_status:
890 } 1086 }
891 1087
892 err = 0; 1088 err = 0;
893 if (!list_empty(&pagelist)) 1089 if (!list_empty(&pagelist)) {
894 err = migrate_pages(&pagelist, new_page_node, 1090 err = migrate_pages(&pagelist, new_page_node,
895 (unsigned long)pm, 0); 1091 (unsigned long)pm, 0);
1092 if (err)
1093 putback_lru_pages(&pagelist);
1094 }
896 1095
897 up_read(&mm->mmap_sem); 1096 up_read(&mm->mmap_sem);
898 return err; 1097 return err;
@@ -1005,7 +1204,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1005 int err = -EFAULT; 1204 int err = -EFAULT;
1006 1205
1007 vma = find_vma(mm, addr); 1206 vma = find_vma(mm, addr);
1008 if (!vma) 1207 if (!vma || addr < vma->vm_start)
1009 goto set_status; 1208 goto set_status;
1010 1209
1011 page = follow_page(vma, addr, 0); 1210 page = follow_page(vma, addr, 0);
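
migrate_huge_pages() mirrors migrate_pages() for a list of hugepages but never touches the LRU; its only in-tree caller at this point is soft_offline_huge_page() earlier in this diff. The sketch below shows the calling convention with hypothetical helper names and assumes the alloc_huge_page_node() and page_hstate() helpers this series relies on; it is not a copy of the in-tree code.

    #include <linux/hugetlb.h>
    #include <linux/list.h>
    #include <linux/migrate.h>
    #include <linux/mm.h>

    /* new_page_t callback: allocate a replacement hugepage on a fixed node */
    static struct page *new_hugepage_on_node(struct page *page,
                                             unsigned long private, int **result)
    {
            return alloc_huge_page_node(page_hstate(compound_head(page)),
                                        (int)private);
    }

    /* hypothetical wrapper: migrate one isolated hugepage to node @nid */
    static int migrate_one_hugepage_to(struct page *hpage, int nid)
    {
            LIST_HEAD(pagelist);
            int ret;

            list_add(&hpage->lru, &pagelist);
            ret = migrate_huge_pages(&pagelist, new_hugepage_on_node,
                                     (unsigned long)nid, 0);
            if (ret)
                    putback_lru_pages(&pagelist);  /* caller cleans up on failure */
            return ret;
    }
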
diff --git a/mm/mmap.c b/mm/mmap.c
index 00161a48a451..b179abb1474a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -1108,6 +1109,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1108 unsigned long retval = -EBADF; 1109 unsigned long retval = -EBADF;
1109 1110
1110 if (!(flags & MAP_ANONYMOUS)) { 1111 if (!(flags & MAP_ANONYMOUS)) {
1112 audit_mmap_fd(fd, flags);
1111 if (unlikely(flags & MAP_HUGETLB)) 1113 if (unlikely(flags & MAP_HUGETLB))
1112 return -EINVAL; 1114 return -EINVAL;
1113 file = fget(fd); 1115 file = fget(fd);
diff --git a/mm/mremap.c b/mm/mremap.c
index cde56ee51ef7..563fbdd6293a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -101,7 +101,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
101 * pte locks because exclusive mmap_sem prevents deadlock. 101 * pte locks because exclusive mmap_sem prevents deadlock.
102 */ 102 */
103 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); 103 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
104 new_pte = pte_offset_map_nested(new_pmd, new_addr); 104 new_pte = pte_offset_map(new_pmd, new_addr);
105 new_ptl = pte_lockptr(mm, new_pmd); 105 new_ptl = pte_lockptr(mm, new_pmd);
106 if (new_ptl != old_ptl) 106 if (new_ptl != old_ptl)
107 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 107 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
119 arch_leave_lazy_mmu_mode(); 119 arch_leave_lazy_mmu_mode();
120 if (new_ptl != old_ptl) 120 if (new_ptl != old_ptl)
121 spin_unlock(new_ptl); 121 spin_unlock(new_ptl);
122 pte_unmap_nested(new_pte - 1); 122 pte_unmap(new_pte - 1);
123 pte_unmap_unlock(old_pte - 1, old_ptl); 123 pte_unmap_unlock(old_pte - 1, old_ptl);
124 if (mapping) 124 if (mapping)
125 spin_unlock(&mapping->i_mmap_lock); 125 spin_unlock(&mapping->i_mmap_lock);
diff --git a/mm/nommu.c b/mm/nommu.c
index 88ff091eb07a..3613517c7592 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -29,6 +29,7 @@
29#include <linux/personality.h> 29#include <linux/personality.h>
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/audit.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/tlb.h> 35#include <asm/tlb.h>
@@ -293,11 +294,58 @@ void *vmalloc(unsigned long size)
293} 294}
294EXPORT_SYMBOL(vmalloc); 295EXPORT_SYMBOL(vmalloc);
295 296
297/*
 298 * vzalloc - allocate virtually contiguous memory with zero fill
299 *
300 * @size: allocation size
301 *
302 * Allocate enough pages to cover @size from the page level
 303 * allocator and map them into contiguous kernel virtual space.
304 * The memory allocated is set to zero.
305 *
306 * For tight control over page level allocator and protection flags
307 * use __vmalloc() instead.
308 */
309void *vzalloc(unsigned long size)
310{
311 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
312 PAGE_KERNEL);
313}
314EXPORT_SYMBOL(vzalloc);
315
316/**
317 * vmalloc_node - allocate memory on a specific node
318 * @size: allocation size
319 * @node: numa node
320 *
321 * Allocate enough pages to cover @size from the page level
322 * allocator and map them into contiguous kernel virtual space.
323 *
324 * For tight control over page level allocator and protection flags
325 * use __vmalloc() instead.
326 */
296void *vmalloc_node(unsigned long size, int node) 327void *vmalloc_node(unsigned long size, int node)
297{ 328{
298 return vmalloc(size); 329 return vmalloc(size);
299} 330}
300EXPORT_SYMBOL(vmalloc_node); 331
332/**
333 * vzalloc_node - allocate memory on a specific node with zero fill
334 * @size: allocation size
335 * @node: numa node
336 *
337 * Allocate enough pages to cover @size from the page level
338 * allocator and map them into contiguous kernel virtual space.
339 * The memory allocated is set to zero.
340 *
341 * For tight control over page level allocator and protection flags
342 * use __vmalloc() instead.
343 */
344void *vzalloc_node(unsigned long size, int node)
345{
346 return vzalloc(size);
347}
348EXPORT_SYMBOL(vzalloc_node);
301 349
302#ifndef PAGE_KERNEL_EXEC 350#ifndef PAGE_KERNEL_EXEC
303# define PAGE_KERNEL_EXEC PAGE_KERNEL 351# define PAGE_KERNEL_EXEC PAGE_KERNEL
@@ -1411,6 +1459,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1411 struct file *file = NULL; 1459 struct file *file = NULL;
1412 unsigned long retval = -EBADF; 1460 unsigned long retval = -EBADF;
1413 1461
1462 audit_mmap_fd(fd, flags);
1414 if (!(flags & MAP_ANONYMOUS)) { 1463 if (!(flags & MAP_ANONYMOUS)) {
1415 file = fget(fd); 1464 file = fget(fd);
1416 if (!file) 1465 if (!file)
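
vzalloc() and vzalloc_node() are thin convenience wrappers that replace the common vmalloc()-plus-memset() pair; the nommu variants above simply ignore the node hint. A minimal, hypothetical usage sketch:

    #include <linux/vmalloc.h>

    struct stats_table {
            unsigned long counters[4096];
    };

    static struct stats_table *alloc_stats(int nid)
    {
            /* zero-filled, virtually contiguous, may sleep */
            return vzalloc_node(sizeof(struct stats_table), nid);
    }

    static void free_stats(struct stats_table *t)
    {
            vfree(t);                    /* vfree(NULL) is a no-op */
    }
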
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4029583a1024..7dcca55ede7c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -162,10 +162,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
162 return 0; 162 return 0;
163 163
164 /* 164 /*
165 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't 165 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
166 * need to be executed for something that cannot be killed. 166 * so the entire heuristic doesn't need to be executed for something
167 * that cannot be killed.
167 */ 168 */
168 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { 169 if (atomic_read(&p->mm->oom_disable_count)) {
169 task_unlock(p); 170 task_unlock(p);
170 return 0; 171 return 0;
171 } 172 }
@@ -403,16 +404,40 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
403#define K(x) ((x) << (PAGE_SHIFT-10)) 404#define K(x) ((x) << (PAGE_SHIFT-10))
404static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) 405static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
405{ 406{
407 struct task_struct *q;
408 struct mm_struct *mm;
409
406 p = find_lock_task_mm(p); 410 p = find_lock_task_mm(p);
407 if (!p) 411 if (!p)
408 return 1; 412 return 1;
409 413
414 /* mm cannot be safely dereferenced after task_unlock(p) */
415 mm = p->mm;
416
410 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 417 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
411 task_pid_nr(p), p->comm, K(p->mm->total_vm), 418 task_pid_nr(p), p->comm, K(p->mm->total_vm),
412 K(get_mm_counter(p->mm, MM_ANONPAGES)), 419 K(get_mm_counter(p->mm, MM_ANONPAGES)),
413 K(get_mm_counter(p->mm, MM_FILEPAGES))); 420 K(get_mm_counter(p->mm, MM_FILEPAGES)));
414 task_unlock(p); 421 task_unlock(p);
415 422
423 /*
424 * Kill all processes sharing p->mm in other thread groups, if any.
425 * They don't get access to memory reserves or a higher scheduler
426 * priority, though, to avoid depletion of all memory or task
427 * starvation. This prevents mm->mmap_sem livelock when an oom killed
 428 * task cannot exit because it requires the semaphore and it's contended
429 * by another thread trying to allocate memory itself. That thread will
430 * now get access to memory reserves since it has a pending fatal
431 * signal.
432 */
433 for_each_process(q)
434 if (q->mm == mm && !same_thread_group(q, p)) {
435 task_lock(q); /* Protect ->comm from prctl() */
436 pr_err("Kill process %d (%s) sharing same memory\n",
437 task_pid_nr(q), q->comm);
438 task_unlock(q);
439 force_sig(SIGKILL, q);
440 }
416 441
417 set_tsk_thread_flag(p, TIF_MEMDIE); 442 set_tsk_thread_flag(p, TIF_MEMDIE);
418 force_sig(SIGKILL, p); 443 force_sig(SIGKILL, p);
@@ -680,7 +705,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
680 read_lock(&tasklist_lock); 705 read_lock(&tasklist_lock);
681 if (sysctl_oom_kill_allocating_task && 706 if (sysctl_oom_kill_allocating_task &&
682 !oom_unkillable_task(current, NULL, nodemask) && 707 !oom_unkillable_task(current, NULL, nodemask) &&
683 (current->signal->oom_adj != OOM_DISABLE)) { 708 current->mm && !atomic_read(&current->mm->oom_disable_count)) {
684 /* 709 /*
685 * oom_kill_process() needs tasklist_lock held. If it returns 710 * oom_kill_process() needs tasklist_lock held. If it returns
686 * non-zero, current could not be killed so we must fallback to 711 * non-zero, current could not be killed so we must fallback to
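
oom_badness() and out_of_memory() now key off mm->oom_disable_count, a per-mm counter (maintained outside mm/ by the companion patch) of threads whose oom_score_adj is OOM_SCORE_ADJ_MIN, so an mm shared with an unkillable thread is skipped even if the candidate task itself looks killable. From user space the counter is driven through /proc/<pid>/oom_score_adj; the sketch below is illustrative only, and lowering the value needs CAP_SYS_RESOURCE.

    /* illustrative sketch, not from the patch */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/self/oom_score_adj", "w");

            if (!f)
                    return perror("oom_score_adj"), 1;

            /* -1000 == OOM_SCORE_ADJ_MIN: this mm is never chosen by the
             * OOM killer, and every thread sharing it is protected too */
            fprintf(f, "%d\n", -1000);
            fclose(f);

            /* ... run the work that must survive memory pressure ... */
            return 0;
    }
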
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e3bccac1f025..b840afa89761 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -415,14 +415,8 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
415 415
416 if (vm_dirty_bytes) 416 if (vm_dirty_bytes)
417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
418 else { 418 else
419 int dirty_ratio; 419 dirty = (vm_dirty_ratio * available_memory) / 100;
420
421 dirty_ratio = vm_dirty_ratio;
422 if (dirty_ratio < 5)
423 dirty_ratio = 5;
424 dirty = (dirty_ratio * available_memory) / 100;
425 }
426 420
427 if (dirty_background_bytes) 421 if (dirty_background_bytes)
428 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); 422 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
@@ -510,7 +504,7 @@ static void balance_dirty_pages(struct address_space *mapping,
510 * catch-up. This avoids (excessively) small writeouts 504 * catch-up. This avoids (excessively) small writeouts
511 * when the bdi limits are ramping up. 505 * when the bdi limits are ramping up.
512 */ 506 */
513 if (nr_reclaimable + nr_writeback < 507 if (nr_reclaimable + nr_writeback <=
514 (background_thresh + dirty_thresh) / 2) 508 (background_thresh + dirty_thresh) / 2)
515 break; 509 break;
516 510
@@ -542,8 +536,8 @@ static void balance_dirty_pages(struct address_space *mapping,
542 * the last resort safeguard. 536 * the last resort safeguard.
543 */ 537 */
544 dirty_exceeded = 538 dirty_exceeded =
545 (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) 539 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
546 || (nr_reclaimable + nr_writeback >= dirty_thresh); 540 || (nr_reclaimable + nr_writeback > dirty_thresh);
547 541
548 if (!dirty_exceeded) 542 if (!dirty_exceeded)
549 break; 543 break;
@@ -1121,6 +1115,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1121{ 1115{
1122 if (mapping_cap_account_dirty(mapping)) { 1116 if (mapping_cap_account_dirty(mapping)) {
1123 __inc_zone_page_state(page, NR_FILE_DIRTY); 1117 __inc_zone_page_state(page, NR_FILE_DIRTY);
1118 __inc_zone_page_state(page, NR_DIRTIED);
1124 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1119 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1125 task_dirty_inc(current); 1120 task_dirty_inc(current);
1126 task_io_account_write(PAGE_CACHE_SIZE); 1121 task_io_account_write(PAGE_CACHE_SIZE);
@@ -1129,6 +1124,18 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1129EXPORT_SYMBOL(account_page_dirtied); 1124EXPORT_SYMBOL(account_page_dirtied);
1130 1125
1131/* 1126/*
1127 * Helper function for set_page_writeback family.
1128 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
1129 * wrt interrupts.
1130 */
1131void account_page_writeback(struct page *page)
1132{
1133 inc_zone_page_state(page, NR_WRITEBACK);
1134 inc_zone_page_state(page, NR_WRITTEN);
1135}
1136EXPORT_SYMBOL(account_page_writeback);
1137
1138/*
1132 * For address_spaces which do not use buffers. Just tag the page as dirty in 1139 * For address_spaces which do not use buffers. Just tag the page as dirty in
1133 * its radix tree. 1140 * its radix tree.
1134 * 1141 *
@@ -1366,7 +1373,7 @@ int test_set_page_writeback(struct page *page)
1366 ret = TestSetPageWriteback(page); 1373 ret = TestSetPageWriteback(page);
1367 } 1374 }
1368 if (!ret) 1375 if (!ret)
1369 inc_zone_page_state(page, NR_WRITEBACK); 1376 account_page_writeback(page);
1370 return ret; 1377 return ret;
1371 1378
1372} 1379}
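
global_dirty_limits() above drops the silent 5% floor on vm.dirty_ratio, so small ratios finally take effect, and the comparison changes (<= for the early break, strict > for dirty_exceeded) let a zero threshold still throttle writers. The tiny user-space calculation below just illustrates the before/after arithmetic for an assumed 4GB of dirtyable memory; it is not kernel code.

    #include <stdio.h>

    int main(void)
    {
            unsigned long available_pages = 1UL << 20;  /* ~4GB of 4KB pages */
            unsigned int dirty_ratio = 1;               /* vm.dirty_ratio = 1 */

            /* old behaviour: ratio silently clamped to at least 5% */
            unsigned long old_limit =
                    ((dirty_ratio < 5 ? 5 : dirty_ratio) * available_pages) / 100;
            /* new behaviour: the configured ratio is used as-is */
            unsigned long new_limit = (dirty_ratio * available_pages) / 100;

            printf("old dirty limit: %lu pages, new dirty limit: %lu pages\n",
                   old_limit, new_limit);
            return 0;
    }
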
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f12ad1836abe..07a654486f75 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24#include <linux/compiler.h> 25#include <linux/compiler.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/kmemcheck.h> 27#include <linux/kmemcheck.h>
@@ -530,7 +531,7 @@ static inline void __free_one_page(struct page *page,
530 * so it's less likely to be used soon and more likely to be merged 531 * so it's less likely to be used soon and more likely to be merged
531 * as a higher order page 532 * as a higher order page
532 */ 533 */
533 if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { 534 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
534 struct page *higher_page, *higher_buddy; 535 struct page *higher_page, *higher_buddy;
535 combined_idx = __find_combined_index(page_idx, order); 536 combined_idx = __find_combined_index(page_idx, order);
536 higher_page = page + combined_idx - page_idx; 537 higher_page = page + combined_idx - page_idx;
@@ -1906,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1906 preferred_zone, migratetype); 1907 preferred_zone, migratetype);
1907 1908
1908 if (!page && gfp_mask & __GFP_NOFAIL) 1909 if (!page && gfp_mask & __GFP_NOFAIL)
1909 congestion_wait(BLK_RW_ASYNC, HZ/50); 1910 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
1910 } while (!page && (gfp_mask & __GFP_NOFAIL)); 1911 } while (!page && (gfp_mask & __GFP_NOFAIL));
1911 1912
1912 return page; 1913 return page;
@@ -1931,7 +1932,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1931 const gfp_t wait = gfp_mask & __GFP_WAIT; 1932 const gfp_t wait = gfp_mask & __GFP_WAIT;
1932 1933
1933 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 1934 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1934 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); 1935 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
1935 1936
1936 /* 1937 /*
1937 * The caller may dip into page reserves a bit more if the caller 1938 * The caller may dip into page reserves a bit more if the caller
@@ -1939,7 +1940,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1939 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 1940 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1940 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 1941 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1941 */ 1942 */
1942 alloc_flags |= (gfp_mask & __GFP_HIGH); 1943 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1943 1944
1944 if (!wait) { 1945 if (!wait) {
1945 alloc_flags |= ALLOC_HARDER; 1946 alloc_flags |= ALLOC_HARDER;
@@ -2094,7 +2095,7 @@ rebalance:
2094 pages_reclaimed += did_some_progress; 2095 pages_reclaimed += did_some_progress;
2095 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 2096 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
2096 /* Wait for some write requests to complete then retry */ 2097 /* Wait for some write requests to complete then retry */
2097 congestion_wait(BLK_RW_ASYNC, HZ/50); 2098 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2098 goto rebalance; 2099 goto rebalance;
2099 } 2100 }
2100 2101
@@ -3636,6 +3637,41 @@ void __init free_bootmem_with_active_regions(int nid,
3636 } 3637 }
3637} 3638}
3638 3639
3640#ifdef CONFIG_HAVE_MEMBLOCK
3641u64 __init find_memory_core_early(int nid, u64 size, u64 align,
3642 u64 goal, u64 limit)
3643{
3644 int i;
3645
3646 /* Need to go over early_node_map to find out good range for node */
3647 for_each_active_range_index_in_nid(i, nid) {
3648 u64 addr;
3649 u64 ei_start, ei_last;
3650 u64 final_start, final_end;
3651
3652 ei_last = early_node_map[i].end_pfn;
3653 ei_last <<= PAGE_SHIFT;
3654 ei_start = early_node_map[i].start_pfn;
3655 ei_start <<= PAGE_SHIFT;
3656
3657 final_start = max(ei_start, goal);
3658 final_end = min(ei_last, limit);
3659
3660 if (final_start >= final_end)
3661 continue;
3662
3663 addr = memblock_find_in_range(final_start, final_end, size, align);
3664
3665 if (addr == MEMBLOCK_ERROR)
3666 continue;
3667
3668 return addr;
3669 }
3670
3671 return MEMBLOCK_ERROR;
3672}
3673#endif
3674
3639int __init add_from_early_node_map(struct range *range, int az, 3675int __init add_from_early_node_map(struct range *range, int az,
3640 int nr_range, int nid) 3676 int nr_range, int nid)
3641{ 3677{
@@ -3655,46 +3691,26 @@ int __init add_from_early_node_map(struct range *range, int az,
3655void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, 3691void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3656 u64 goal, u64 limit) 3692 u64 goal, u64 limit)
3657{ 3693{
3658 int i;
3659 void *ptr; 3694 void *ptr;
3695 u64 addr;
3660 3696
3661 if (limit > get_max_mapped()) 3697 if (limit > memblock.current_limit)
3662 limit = get_max_mapped(); 3698 limit = memblock.current_limit;
3663 3699
3664 /* need to go over early_node_map to find out good range for node */ 3700 addr = find_memory_core_early(nid, size, align, goal, limit);
3665 for_each_active_range_index_in_nid(i, nid) {
3666 u64 addr;
3667 u64 ei_start, ei_last;
3668 3701
3669 ei_last = early_node_map[i].end_pfn; 3702 if (addr == MEMBLOCK_ERROR)
3670 ei_last <<= PAGE_SHIFT; 3703 return NULL;
3671 ei_start = early_node_map[i].start_pfn;
3672 ei_start <<= PAGE_SHIFT;
3673 addr = find_early_area(ei_start, ei_last,
3674 goal, limit, size, align);
3675
3676 if (addr == -1ULL)
3677 continue;
3678
3679#if 0
3680 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3681 nid,
3682 ei_start, ei_last, goal, limit, size,
3683 align, addr);
3684#endif
3685
3686 ptr = phys_to_virt(addr);
3687 memset(ptr, 0, size);
3688 reserve_early_without_check(addr, addr + size, "BOOTMEM");
3689 /*
3690 * The min_count is set to 0 so that bootmem allocated blocks
3691 * are never reported as leaks.
3692 */
3693 kmemleak_alloc(ptr, size, 0, 0);
3694 return ptr;
3695 }
3696 3704
3697 return NULL; 3705 ptr = phys_to_virt(addr);
3706 memset(ptr, 0, size);
3707 memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
3708 /*
3709 * The min_count is set to 0 so that bootmem allocated blocks
3710 * are never reported as leaks.
3711 */
3712 kmemleak_alloc(ptr, size, 0, 0);
3713 return ptr;
3698} 3714}
3699#endif 3715#endif
3700 3716
@@ -5281,12 +5297,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 5281 * page allocator never allocates memory from an ISOLATE block. 5297 * page allocator never allocates memory from an ISOLATE block.
5282 */ 5298 */
5283 5299
5300static int
5301__count_immobile_pages(struct zone *zone, struct page *page, int count)
5302{
5303 unsigned long pfn, iter, found;
5304 /*
5305 * For avoiding noise data, lru_add_drain_all() should be called
5306 * If ZONE_MOVABLE, the zone never contains immobile pages
5307 */
5308 if (zone_idx(zone) == ZONE_MOVABLE)
5309 return true;
5310
5311 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
5312 return true;
5313
5314 pfn = page_to_pfn(page);
5315 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5316 unsigned long check = pfn + iter;
5317
5318 if (!pfn_valid_within(check)) {
5319 iter++;
5320 continue;
5321 }
5322 page = pfn_to_page(check);
5323 if (!page_count(page)) {
5324 if (PageBuddy(page))
5325 iter += (1 << page_order(page)) - 1;
5326 continue;
5327 }
5328 if (!PageLRU(page))
5329 found++;
5330 /*
 5331 * If there are RECLAIMABLE pages, we need to check them.
 5332 * But now, memory offline itself doesn't call shrink_slab()
 5333 * and that still needs to be fixed.
5334 */
5335 /*
 5336 * If the page is not RAM, page_count() should be 0.
 5337 * We don't need any further check. This is a _used_, not-movable page.
5338 *
5339 * The problematic thing here is PG_reserved pages. PG_reserved
5340 * is set to both of a memory hole page and a _used_ kernel
5341 * page at boot.
5342 */
5343 if (found > count)
5344 return false;
5345 }
5346 return true;
5347}
5348
5349bool is_pageblock_removable_nolock(struct page *page)
5350{
5351 struct zone *zone = page_zone(page);
5352 return __count_immobile_pages(zone, page, 0);
5353}
5354
5284int set_migratetype_isolate(struct page *page) 5355int set_migratetype_isolate(struct page *page)
5285{ 5356{
5286 struct zone *zone; 5357 struct zone *zone;
5287 struct page *curr_page; 5358 unsigned long flags, pfn;
5288 unsigned long flags, pfn, iter;
5289 unsigned long immobile = 0;
5290 struct memory_isolate_notify arg; 5359 struct memory_isolate_notify arg;
5291 int notifier_ret; 5360 int notifier_ret;
5292 int ret = -EBUSY; 5361 int ret = -EBUSY;
@@ -5296,11 +5365,6 @@ int set_migratetype_isolate(struct page *page)
5296 zone_idx = zone_idx(zone); 5365 zone_idx = zone_idx(zone);
5297 5366
5298 spin_lock_irqsave(&zone->lock, flags); 5367 spin_lock_irqsave(&zone->lock, flags);
5299 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5300 zone_idx == ZONE_MOVABLE) {
5301 ret = 0;
5302 goto out;
5303 }
5304 5368
5305 pfn = page_to_pfn(page); 5369 pfn = page_to_pfn(page);
5306 arg.start_pfn = pfn; 5370 arg.start_pfn = pfn;
@@ -5320,23 +5384,20 @@ int set_migratetype_isolate(struct page *page)
5320 */ 5384 */
5321 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); 5385 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5322 notifier_ret = notifier_to_errno(notifier_ret); 5386 notifier_ret = notifier_to_errno(notifier_ret);
5323 if (notifier_ret || !arg.pages_found) 5387 if (notifier_ret)
5324 goto out; 5388 goto out;
5325 5389 /*
5326 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { 5390 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5327 if (!pfn_valid_within(pfn)) 5391 * We just check MOVABLE pages.
5328 continue; 5392 */
5329 5393 if (__count_immobile_pages(zone, page, arg.pages_found))
5330 curr_page = pfn_to_page(iter);
5331 if (!page_count(curr_page) || PageLRU(curr_page))
5332 continue;
5333
5334 immobile++;
5335 }
5336
5337 if (arg.pages_found == immobile)
5338 ret = 0; 5394 ret = 0;
5339 5395
5396 /*
5397 * immobile means "not-on-lru" paes. If immobile is larger than
5398 * removable-by-driver pages reported by notifier, we'll fail.
5399 */
5400
5340out: 5401out:
5341 if (!ret) { 5402 if (!ret) {
5342 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5403 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
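
Beyond the wait_iff_congested() and buddy-merge changes, the two __force casts in gfp_to_alloc_flags() are sparse hygiene: gfp_t is a __bitwise type, so comparing or OR-ing it with plain int flags needs an explicit cast once the numeric identity has been asserted. A standalone sketch of the idiom with a hypothetical flag name (not from the patch):

    #include <linux/gfp.h>
    #include <linux/kernel.h>

    #define MY_ALLOC_HIGH 0x20           /* deliberately equal to __GFP_HIGH */

    static int my_gfp_to_flags(gfp_t gfp_mask)
    {
            int flags = 0;

            /* compile-time proof that the bit values really line up */
            BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) MY_ALLOC_HIGH);

            /* __force: the representation is known to match, keep sparse quiet */
            flags |= (__force int) (gfp_mask & __GFP_HIGH);
            return flags;
    }
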
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5e0ffd967452..4ae42bb40892 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
86 * all pages in [start_pfn...end_pfn) must be in the same zone. 86 * all pages in [start_pfn...end_pfn) must be in the same zone.
87 * zone->lock must be held before call this. 87 * zone->lock must be held before call this.
88 * 88 *
89 * Returns 0 if all pages in the range is isolated. 89 * Returns 1 if all pages in the range is isolated.
90 */ 90 */
91static int 91static int
92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
@@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
119 struct zone *zone; 119 struct zone *zone;
120 int ret; 120 int ret;
121 121
122 pfn = start_pfn;
123 /* 122 /*
124 * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page 123 * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page
125 * is not aligned to pageblock_nr_pages. 124 * is not aligned to pageblock_nr_pages.
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index df680855540a..89633fefc6a2 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -27,7 +27,7 @@
27 * chunk size is not aligned. percpu-km code will whine about it. 27 * chunk size is not aligned. percpu-km code will whine about it.
28 */ 28 */
29 29
30#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 30#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
31#error "contiguous percpu allocation is incompatible with paged first chunk" 31#error "contiguous percpu allocation is incompatible with paged first chunk"
32#endif 32#endif
33 33
@@ -35,7 +35,11 @@
35 35
36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
37{ 37{
38 /* noop */ 38 unsigned int cpu;
39
40 for_each_possible_cpu(cpu)
41 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
42
39 return 0; 43 return 0;
40} 44}
41 45
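
With pcpu_populate_chunk() clearing the area for every possible CPU, the km backend now matches the vmalloc-based one, and the updated __alloc_percpu() kerneldoc later in this diff can promise zero-filled allocations even when a chunk is reused. A hypothetical usage sketch relying on that guarantee:

    #include <linux/errno.h>
    #include <linux/percpu.h>

    struct hit_stats {
            unsigned long hits;
            unsigned long misses;
    };

    static struct hit_stats __percpu *stats;

    static int stats_init(void)
    {
            stats = alloc_percpu(struct hit_stats);  /* zero-filled per this series */
            return stats ? 0 : -ENOMEM;
    }

    static void stats_note_hit(void)
    {
            this_cpu_inc(stats->hits);               /* no explicit memset needed */
    }
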
diff --git a/mm/percpu.c b/mm/percpu.c
index c76ef3891e0d..efe816856a9d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -31,7 +31,7 @@
31 * as small as 4 bytes. The allocator organizes chunks into lists 31 * as small as 4 bytes. The allocator organizes chunks into lists
32 * according to free size and tries to allocate from the fullest one. 32 * according to free size and tries to allocate from the fullest one.
33 * Each chunk keeps the maximum contiguous area size hint which is 33 * Each chunk keeps the maximum contiguous area size hint which is
34 * guaranteed to be eqaul to or larger than the maximum contiguous 34 * guaranteed to be equal to or larger than the maximum contiguous
35 * area in the chunk. This helps the allocator not to iterate the 35 * area in the chunk. This helps the allocator not to iterate the
36 * chunk maps unnecessarily. 36 * chunk maps unnecessarily.
37 * 37 *
@@ -76,6 +76,7 @@
76#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 76#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
77#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 77#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
78 78
79#ifdef CONFIG_SMP
79/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
80#ifndef __addr_to_pcpu_ptr 81#ifndef __addr_to_pcpu_ptr
81#define __addr_to_pcpu_ptr(addr) \ 82#define __addr_to_pcpu_ptr(addr) \
@@ -89,6 +90,11 @@
89 (unsigned long)pcpu_base_addr - \ 90 (unsigned long)pcpu_base_addr - \
90 (unsigned long)__per_cpu_start) 91 (unsigned long)__per_cpu_start)
91#endif 92#endif
93#else /* CONFIG_SMP */
94/* on UP, it's always identity mapped */
95#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
96#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
97#endif /* CONFIG_SMP */
92 98
93struct pcpu_chunk { 99struct pcpu_chunk {
94 struct list_head list; /* linked to pcpu_slot lists */ 100 struct list_head list; /* linked to pcpu_slot lists */
@@ -820,8 +826,8 @@ fail_unlock_mutex:
820 * @size: size of area to allocate in bytes 826 * @size: size of area to allocate in bytes
821 * @align: alignment of area (max PAGE_SIZE) 827 * @align: alignment of area (max PAGE_SIZE)
822 * 828 *
823 * Allocate percpu area of @size bytes aligned at @align. Might 829 * Allocate zero-filled percpu area of @size bytes aligned at @align.
824 * sleep. Might trigger writeouts. 830 * Might sleep. Might trigger writeouts.
825 * 831 *
826 * CONTEXT: 832 * CONTEXT:
827 * Does GFP_KERNEL allocation. 833 * Does GFP_KERNEL allocation.
@@ -840,9 +846,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
840 * @size: size of area to allocate in bytes 846 * @size: size of area to allocate in bytes
841 * @align: alignment of area (max PAGE_SIZE) 847 * @align: alignment of area (max PAGE_SIZE)
842 * 848 *
843 * Allocate percpu area of @size bytes aligned at @align from reserved 849 * Allocate zero-filled percpu area of @size bytes aligned at @align
844 * percpu area if arch has set it up; otherwise, allocation is served 850 * from reserved percpu area if arch has set it up; otherwise,
845 * from the same dynamic area. Might sleep. Might trigger writeouts. 851 * allocation is served from the same dynamic area. Might sleep.
852 * Might trigger writeouts.
846 * 853 *
847 * CONTEXT: 854 * CONTEXT:
848 * Does GFP_KERNEL allocation. 855 * Does GFP_KERNEL allocation.
@@ -949,6 +956,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
949 */ 956 */
950bool is_kernel_percpu_address(unsigned long addr) 957bool is_kernel_percpu_address(unsigned long addr)
951{ 958{
959#ifdef CONFIG_SMP
952 const size_t static_size = __per_cpu_end - __per_cpu_start; 960 const size_t static_size = __per_cpu_end - __per_cpu_start;
953 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); 961 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
954 unsigned int cpu; 962 unsigned int cpu;
@@ -959,6 +967,8 @@ bool is_kernel_percpu_address(unsigned long addr)
959 if ((void *)addr >= start && (void *)addr < start + static_size) 967 if ((void *)addr >= start && (void *)addr < start + static_size)
960 return true; 968 return true;
961 } 969 }
970#endif
971 /* on UP, can't distinguish from other static vars, always false */
962 return false; 972 return false;
963} 973}
964 974
@@ -1067,161 +1077,6 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1067} 1077}
1068 1078
1069/** 1079/**
1070 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1071 * @reserved_size: the size of reserved percpu area in bytes
1072 * @dyn_size: minimum free size for dynamic allocation in bytes
1073 * @atom_size: allocation atom size
1074 * @cpu_distance_fn: callback to determine distance between cpus, optional
1075 *
1076 * This function determines grouping of units, their mappings to cpus
1077 * and other parameters considering needed percpu size, allocation
1078 * atom size and distances between CPUs.
1079 *
1080 * Groups are always mutliples of atom size and CPUs which are of
1081 * LOCAL_DISTANCE both ways are grouped together and share space for
1082 * units in the same group. The returned configuration is guaranteed
1083 * to have CPUs on different nodes on different groups and >=75% usage
1084 * of allocated virtual address space.
1085 *
1086 * RETURNS:
1087 * On success, pointer to the new allocation_info is returned. On
1088 * failure, ERR_PTR value is returned.
1089 */
1090static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1091 size_t reserved_size, size_t dyn_size,
1092 size_t atom_size,
1093 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1094{
1095 static int group_map[NR_CPUS] __initdata;
1096 static int group_cnt[NR_CPUS] __initdata;
1097 const size_t static_size = __per_cpu_end - __per_cpu_start;
1098 int nr_groups = 1, nr_units = 0;
1099 size_t size_sum, min_unit_size, alloc_size;
1100 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1101 int last_allocs, group, unit;
1102 unsigned int cpu, tcpu;
1103 struct pcpu_alloc_info *ai;
1104 unsigned int *cpu_map;
1105
1106 /* this function may be called multiple times */
1107 memset(group_map, 0, sizeof(group_map));
1108 memset(group_cnt, 0, sizeof(group_cnt));
1109
1110 /* calculate size_sum and ensure dyn_size is enough for early alloc */
1111 size_sum = PFN_ALIGN(static_size + reserved_size +
1112 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1113 dyn_size = size_sum - static_size - reserved_size;
1114
1115 /*
1116 * Determine min_unit_size, alloc_size and max_upa such that
1117 * alloc_size is multiple of atom_size and is the smallest
1118 * which can accomodate 4k aligned segments which are equal to
1119 * or larger than min_unit_size.
1120 */
1121 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1122
1123 alloc_size = roundup(min_unit_size, atom_size);
1124 upa = alloc_size / min_unit_size;
1125 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1126 upa--;
1127 max_upa = upa;
1128
1129 /* group cpus according to their proximity */
1130 for_each_possible_cpu(cpu) {
1131 group = 0;
1132 next_group:
1133 for_each_possible_cpu(tcpu) {
1134 if (cpu == tcpu)
1135 break;
1136 if (group_map[tcpu] == group && cpu_distance_fn &&
1137 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1138 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1139 group++;
1140 nr_groups = max(nr_groups, group + 1);
1141 goto next_group;
1142 }
1143 }
1144 group_map[cpu] = group;
1145 group_cnt[group]++;
1146 }
1147
1148 /*
1149 * Expand unit size until address space usage goes over 75%
1150 * and then as much as possible without using more address
1151 * space.
1152 */
1153 last_allocs = INT_MAX;
1154 for (upa = max_upa; upa; upa--) {
1155 int allocs = 0, wasted = 0;
1156
1157 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1158 continue;
1159
1160 for (group = 0; group < nr_groups; group++) {
1161 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1162 allocs += this_allocs;
1163 wasted += this_allocs * upa - group_cnt[group];
1164 }
1165
1166 /*
1167 * Don't accept if wastage is over 1/3. The
1168 * greater-than comparison ensures upa==1 always
1169 * passes the following check.
1170 */
1171 if (wasted > num_possible_cpus() / 3)
1172 continue;
1173
1174 /* and then don't consume more memory */
1175 if (allocs > last_allocs)
1176 break;
1177 last_allocs = allocs;
1178 best_upa = upa;
1179 }
1180 upa = best_upa;
1181
1182 /* allocate and fill alloc_info */
1183 for (group = 0; group < nr_groups; group++)
1184 nr_units += roundup(group_cnt[group], upa);
1185
1186 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1187 if (!ai)
1188 return ERR_PTR(-ENOMEM);
1189 cpu_map = ai->groups[0].cpu_map;
1190
1191 for (group = 0; group < nr_groups; group++) {
1192 ai->groups[group].cpu_map = cpu_map;
1193 cpu_map += roundup(group_cnt[group], upa);
1194 }
1195
1196 ai->static_size = static_size;
1197 ai->reserved_size = reserved_size;
1198 ai->dyn_size = dyn_size;
1199 ai->unit_size = alloc_size / upa;
1200 ai->atom_size = atom_size;
1201 ai->alloc_size = alloc_size;
1202
1203 for (group = 0, unit = 0; group_cnt[group]; group++) {
1204 struct pcpu_group_info *gi = &ai->groups[group];
1205
1206 /*
1207 * Initialize base_offset as if all groups are located
1208 * back-to-back. The caller should update this to
1209 * reflect actual allocation.
1210 */
1211 gi->base_offset = unit * ai->unit_size;
1212
1213 for_each_possible_cpu(cpu)
1214 if (group_map[cpu] == group)
1215 gi->cpu_map[gi->nr_units++] = cpu;
1216 gi->nr_units = roundup(gi->nr_units, upa);
1217 unit += gi->nr_units;
1218 }
1219 BUG_ON(unit != nr_units);
1220
1221 return ai;
1222}
1223
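The grouping pass above is the whole NUMA story in this function: two CPUs share a group only if they are LOCAL_DISTANCE from each other in both directions. A minimal userspace sketch of that loop follows, with an invented 4-CPU, two-node distance table standing in for the arch's pcpu_fc_cpu_distance_fn_t callback; everything in it is illustrative, not kernel code.

/*
 * Toy model of the CPU grouping loop in pcpu_build_alloc_info():
 * CPUs 0-1 sit on one node, CPUs 2-3 on another, and the distance
 * table below stands in for the arch's cpu_distance_fn callback.
 * 10 plays the role of LOCAL_DISTANCE, 20 is "remote".
 */
#include <stdio.h>

#define NR_CPUS		4
#define LOCAL_DISTANCE	10

static const int distance[NR_CPUS][NR_CPUS] = {
	{ 10, 10, 20, 20 },
	{ 10, 10, 20, 20 },
	{ 20, 20, 10, 10 },
	{ 20, 20, 10, 10 },
};

int main(void)
{
	int group_map[NR_CPUS] = { 0 };
	int group_cnt[NR_CPUS] = { 0 };
	int nr_groups = 1;

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		int group = 0;
next_group:
		for (int tcpu = 0; tcpu < cpu; tcpu++) {
			/* CPUs must be mutually local to share a group */
			if (group_map[tcpu] == group &&
			    (distance[cpu][tcpu] > LOCAL_DISTANCE ||
			     distance[tcpu][cpu] > LOCAL_DISTANCE)) {
				group++;
				if (group + 1 > nr_groups)
					nr_groups = group + 1;
				goto next_group;
			}
		}
		group_map[cpu] = group;
		group_cnt[group]++;
	}

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d -> group %d\n", cpu, group_map[cpu]);
	for (int g = 0; g < nr_groups; g++)
		printf("group %d holds %d cpu(s)\n", g, group_cnt[g]);
	return 0;
}

Running it prints two groups, {0,1} and {2,3}, which is what the kernel loop would compute for a two-node machine whose nodes sit farther than LOCAL_DISTANCE apart.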
1224/**
1225 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info 1080 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1226 * @lvl: loglevel 1081 * @lvl: loglevel
1227 * @ai: allocation info to dump 1082 * @ai: allocation info to dump
@@ -1363,7 +1218,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1363 1218
1364 /* sanity checks */ 1219 /* sanity checks */
1365 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); 1220 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1221#ifdef CONFIG_SMP
1366 PCPU_SETUP_BUG_ON(!ai->static_size); 1222 PCPU_SETUP_BUG_ON(!ai->static_size);
1223#endif
1367 PCPU_SETUP_BUG_ON(!base_addr); 1224 PCPU_SETUP_BUG_ON(!base_addr);
1368 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); 1225 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1369 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); 1226 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
@@ -1488,6 +1345,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1488 return 0; 1345 return 0;
1489} 1346}
1490 1347
1348#ifdef CONFIG_SMP
1349
1491const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { 1350const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
1492 [PCPU_FC_AUTO] = "auto", 1351 [PCPU_FC_AUTO] = "auto",
1493 [PCPU_FC_EMBED] = "embed", 1352 [PCPU_FC_EMBED] = "embed",
@@ -1515,8 +1374,180 @@ static int __init percpu_alloc_setup(char *str)
1515} 1374}
1516early_param("percpu_alloc", percpu_alloc_setup); 1375early_param("percpu_alloc", percpu_alloc_setup);
1517 1376
1377/*
1378 * pcpu_embed_first_chunk() is used by the generic percpu setup.
1379 * Build it if needed by the arch config or the generic setup is going
1380 * to be used.
1381 */
1518#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ 1382#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1519 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) 1383 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1384#define BUILD_EMBED_FIRST_CHUNK
1385#endif
1386
1387/* build pcpu_page_first_chunk() iff needed by the arch config */
1388#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
1389#define BUILD_PAGE_FIRST_CHUNK
1390#endif
1391
1392/* pcpu_build_alloc_info() is used by both embed and page first chunk */
1393#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
1394/**
1395 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1396 * @reserved_size: the size of reserved percpu area in bytes
1397 * @dyn_size: minimum free size for dynamic allocation in bytes
1398 * @atom_size: allocation atom size
1399 * @cpu_distance_fn: callback to determine distance between cpus, optional
1400 *
1401 * This function determines grouping of units, their mappings to cpus
1402 * and other parameters considering needed percpu size, allocation
1403 * atom size and distances between CPUs.
1404 *
1405 * Groups are always multiples of atom size and CPUs which are of
1406 * LOCAL_DISTANCE both ways are grouped together and share space for
1407 * units in the same group. The returned configuration is guaranteed
1408 * to have CPUs on different nodes on different groups and >=75% usage
1409 * of allocated virtual address space.
1410 *
1411 * RETURNS:
1412 * On success, pointer to the new allocation_info is returned. On
1413 * failure, ERR_PTR value is returned.
1414 */
1415static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1416 size_t reserved_size, size_t dyn_size,
1417 size_t atom_size,
1418 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1419{
1420 static int group_map[NR_CPUS] __initdata;
1421 static int group_cnt[NR_CPUS] __initdata;
1422 const size_t static_size = __per_cpu_end - __per_cpu_start;
1423 int nr_groups = 1, nr_units = 0;
1424 size_t size_sum, min_unit_size, alloc_size;
1425 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1426 int last_allocs, group, unit;
1427 unsigned int cpu, tcpu;
1428 struct pcpu_alloc_info *ai;
1429 unsigned int *cpu_map;
1430
1431 /* this function may be called multiple times */
1432 memset(group_map, 0, sizeof(group_map));
1433 memset(group_cnt, 0, sizeof(group_cnt));
1434
1435 /* calculate size_sum and ensure dyn_size is enough for early alloc */
1436 size_sum = PFN_ALIGN(static_size + reserved_size +
1437 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1438 dyn_size = size_sum - static_size - reserved_size;
1439
1440 /*
1441 * Determine min_unit_size, alloc_size and max_upa such that
1442 * alloc_size is multiple of atom_size and is the smallest
1443 * which can accommodate 4k aligned segments which are equal to
1444 * or larger than min_unit_size.
1445 */
1446 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1447
1448 alloc_size = roundup(min_unit_size, atom_size);
1449 upa = alloc_size / min_unit_size;
1450 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1451 upa--;
1452 max_upa = upa;
1453
1454 /* group cpus according to their proximity */
1455 for_each_possible_cpu(cpu) {
1456 group = 0;
1457 next_group:
1458 for_each_possible_cpu(tcpu) {
1459 if (cpu == tcpu)
1460 break;
1461 if (group_map[tcpu] == group && cpu_distance_fn &&
1462 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1463 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1464 group++;
1465 nr_groups = max(nr_groups, group + 1);
1466 goto next_group;
1467 }
1468 }
1469 group_map[cpu] = group;
1470 group_cnt[group]++;
1471 }
1472
1473 /*
1474 * Expand unit size until address space usage goes over 75%
1475 * and then as much as possible without using more address
1476 * space.
1477 */
1478 last_allocs = INT_MAX;
1479 for (upa = max_upa; upa; upa--) {
1480 int allocs = 0, wasted = 0;
1481
1482 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1483 continue;
1484
1485 for (group = 0; group < nr_groups; group++) {
1486 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1487 allocs += this_allocs;
1488 wasted += this_allocs * upa - group_cnt[group];
1489 }
1490
1491 /*
1492 * Don't accept if wastage is over 1/3. The
1493 * greater-than comparison ensures upa==1 always
1494 * passes the following check.
1495 */
1496 if (wasted > num_possible_cpus() / 3)
1497 continue;
1498
1499 /* and then don't consume more memory */
1500 if (allocs > last_allocs)
1501 break;
1502 last_allocs = allocs;
1503 best_upa = upa;
1504 }
1505 upa = best_upa;
1506
1507 /* allocate and fill alloc_info */
1508 for (group = 0; group < nr_groups; group++)
1509 nr_units += roundup(group_cnt[group], upa);
1510
1511 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1512 if (!ai)
1513 return ERR_PTR(-ENOMEM);
1514 cpu_map = ai->groups[0].cpu_map;
1515
1516 for (group = 0; group < nr_groups; group++) {
1517 ai->groups[group].cpu_map = cpu_map;
1518 cpu_map += roundup(group_cnt[group], upa);
1519 }
1520
1521 ai->static_size = static_size;
1522 ai->reserved_size = reserved_size;
1523 ai->dyn_size = dyn_size;
1524 ai->unit_size = alloc_size / upa;
1525 ai->atom_size = atom_size;
1526 ai->alloc_size = alloc_size;
1527
1528 for (group = 0, unit = 0; group_cnt[group]; group++) {
1529 struct pcpu_group_info *gi = &ai->groups[group];
1530
1531 /*
1532 * Initialize base_offset as if all groups are located
1533 * back-to-back. The caller should update this to
1534 * reflect actual allocation.
1535 */
1536 gi->base_offset = unit * ai->unit_size;
1537
1538 for_each_possible_cpu(cpu)
1539 if (group_map[cpu] == group)
1540 gi->cpu_map[gi->nr_units++] = cpu;
1541 gi->nr_units = roundup(gi->nr_units, upa);
1542 unit += gi->nr_units;
1543 }
1544 BUG_ON(unit != nr_units);
1545
1546 return ai;
1547}
1548#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
1549
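To make the address-space heuristic concrete, here is a userspace walk-through of the same upa (units-per-alloc) search for one hypothetical layout: 2 MB allocation atoms, a 64 KB per-CPU size_sum, and two NUMA groups holding 6 and 2 CPUs. All numbers are invented; only the selection rules are copied from the function above, and for this input they settle on best_upa = 2, i.e. 1 MB units.

/*
 * Toy walk-through of the units_per_alloc (upa) search in
 * pcpu_build_alloc_info(): start from the largest page-aligned upa,
 * skip values that waste more than a third of the possible CPUs in
 * idle units, and stop once a smaller upa would need more atom-sized
 * allocations.  All sizes below are invented.
 */
#include <stdio.h>
#include <limits.h>
#include <stddef.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static size_t round_up(size_t x, size_t y)
{
	return (x + y - 1) / y * y;
}

int main(void)
{
	const size_t atom_size = 2UL << 20;		/* hypothetical: 2 MB large page */
	const size_t min_unit_size = 64UL << 10;	/* hypothetical size_sum: 64 KB */
	const int group_cnt[] = { 6, 2 };		/* two NUMA groups */
	const int nr_groups = 2, nr_cpus = 8;
	size_t alloc_size = round_up(min_unit_size, atom_size);	/* 2 MB */
	int upa, max_upa, best_upa = 1, last_allocs = INT_MAX;

	/* largest upa that divides alloc_size into page-aligned units */
	upa = alloc_size / min_unit_size;				/* 32 */
	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
		upa--;
	max_upa = upa;

	for (upa = max_upa; upa; upa--) {
		int allocs = 0, wasted = 0;

		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
			continue;
		for (int g = 0; g < nr_groups; g++) {
			int this_allocs = (group_cnt[g] + upa - 1) / upa;

			allocs += this_allocs;
			wasted += this_allocs * upa - group_cnt[g];
		}
		if (wasted > nr_cpus / 3)	/* more than 2 idle units: too wasteful */
			continue;
		if (allocs > last_allocs)	/* would need extra atoms: stop */
			break;
		last_allocs = allocs;
		best_upa = upa;
	}

	/* for this layout: max_upa=32, best_upa=2, unit_size=1048576 */
	printf("max_upa=%d best_upa=%d unit_size=%zu\n",
	       max_upa, best_upa, alloc_size / best_upa);
	return 0;
}

Landing on best_upa = 2 means each 2 MB atom backs two 1 MB units, so the 6-CPU group needs three atoms and the 2-CPU group one, with no unit left idle.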
1550#if defined(BUILD_EMBED_FIRST_CHUNK)
1520/** 1551/**
1521 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1552 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1522 * @reserved_size: the size of reserved percpu area in bytes 1553 * @reserved_size: the size of reserved percpu area in bytes
@@ -1645,10 +1676,9 @@ out_free:
1645 free_bootmem(__pa(areas), areas_size); 1676 free_bootmem(__pa(areas), areas_size);
1646 return rc; 1677 return rc;
1647} 1678}
1648#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || 1679#endif /* BUILD_EMBED_FIRST_CHUNK */
1649 !CONFIG_HAVE_SETUP_PER_CPU_AREA */
1650 1680
1651#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1681#ifdef BUILD_PAGE_FIRST_CHUNK
1652/** 1682/**
1653 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages 1683 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
1654 * @reserved_size: the size of reserved percpu area in bytes 1684 * @reserved_size: the size of reserved percpu area in bytes
@@ -1756,10 +1786,11 @@ out_free_ar:
1756 pcpu_free_alloc_info(ai); 1786 pcpu_free_alloc_info(ai);
1757 return rc; 1787 return rc;
1758} 1788}
1759#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ 1789#endif /* BUILD_PAGE_FIRST_CHUNK */
1760 1790
1791#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1761/* 1792/*
1762 * Generic percpu area setup. 1793 * Generic SMP percpu area setup.
1763 * 1794 *
1764 * The embedding helper is used because its behavior closely resembles 1795 * The embedding helper is used because its behavior closely resembles
1765 * the original non-dynamic generic percpu area setup. This is 1796 * the original non-dynamic generic percpu area setup. This is
@@ -1770,7 +1801,6 @@ out_free_ar:
1770 * on the physical linear memory mapping which uses large page 1801 * on the physical linear memory mapping which uses large page
1771 * mappings on applicable archs. 1802 * mappings on applicable archs.
1772 */ 1803 */
1773#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1774unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 1804unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
1775EXPORT_SYMBOL(__per_cpu_offset); 1805EXPORT_SYMBOL(__per_cpu_offset);
1776 1806
@@ -1799,13 +1829,48 @@ void __init setup_per_cpu_areas(void)
1799 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, 1829 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
1800 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); 1830 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
1801 if (rc < 0) 1831 if (rc < 0)
1802 panic("Failed to initialized percpu areas."); 1832 panic("Failed to initialize percpu areas.");
1803 1833
1804 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 1834 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
1805 for_each_possible_cpu(cpu) 1835 for_each_possible_cpu(cpu)
1806 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; 1836 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
1807} 1837}
1808#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ 1838#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
1839
1840#else /* CONFIG_SMP */
1841
1842/*
1843 * UP percpu area setup.
1844 *
1845 * UP always uses km-based percpu allocator with identity mapping.
1846 * Static percpu variables are indistinguishable from the usual static
1847 * variables and don't require any special preparation.
1848 */
1849void __init setup_per_cpu_areas(void)
1850{
1851 const size_t unit_size =
1852 roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
1853 PERCPU_DYNAMIC_RESERVE));
1854 struct pcpu_alloc_info *ai;
1855 void *fc;
1856
1857 ai = pcpu_alloc_alloc_info(1, 1);
1858 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
1859 if (!ai || !fc)
1860 panic("Failed to allocate memory for percpu areas.");
1861
1862 ai->dyn_size = unit_size;
1863 ai->unit_size = unit_size;
1864 ai->atom_size = unit_size;
1865 ai->alloc_size = unit_size;
1866 ai->groups[0].nr_units = 1;
1867 ai->groups[0].cpu_map[0] = 0;
1868
1869 if (pcpu_setup_first_chunk(ai, fc) < 0)
1870 panic("Failed to initialize percpu areas.");
1871}
1872
1873#endif /* CONFIG_SMP */
1809 1874
1810/* 1875/*
1811 * First and reserved chunks are initialized with temporary allocation 1876 * First and reserved chunks are initialized with temporary allocation
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
deleted file mode 100644
index db884fae5721..000000000000
--- a/mm/percpu_up.c
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * mm/percpu_up.c - dummy percpu memory allocator implementation for UP
3 */
4
5#include <linux/module.h>
6#include <linux/percpu.h>
7#include <linux/slab.h>
8
9void __percpu *__alloc_percpu(size_t size, size_t align)
10{
11 /*
12 * Can't easily make larger alignment work with kmalloc. WARN
13 * on it. Larger alignment should only be used for module
14 * percpu sections on SMP for which this path isn't used.
15 */
16 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
17 return (void __percpu __force *)kzalloc(size, GFP_KERNEL);
18}
19EXPORT_SYMBOL_GPL(__alloc_percpu);
20
21void free_percpu(void __percpu *p)
22{
23 kfree(this_cpu_ptr(p));
24}
25EXPORT_SYMBOL_GPL(free_percpu);
26
27phys_addr_t per_cpu_ptr_to_phys(void *addr)
28{
29 return __pa(addr);
30}
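With this UP stub removed, UP kernels are served by the same dynamic percpu allocator as SMP (backed by the km-based allocator mentioned in the UP setup_per_cpu_areas() comment above), so callers keep using one per-CPU API on both configurations. A minimal kernel-style sketch of that usage follows; struct foo_stats and the foo_* functions are hypothetical, while alloc_percpu(), per_cpu_ptr(), this_cpu_inc() and free_percpu() are the real interfaces.

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

/* hypothetical per-CPU counter block, purely for illustration */
struct foo_stats {
	unsigned long	hits;
};

static struct foo_stats __percpu *foo_stats;

static int foo_stats_init(void)
{
	/* identical call on SMP and UP; UP is now served by percpu-km */
	foo_stats = alloc_percpu(struct foo_stats);
	return foo_stats ? 0 : -ENOMEM;
}

static void foo_record_hit(void)
{
	/* updates only this CPU's copy, no cross-CPU locking needed */
	this_cpu_inc(foo_stats->hits);
}

static unsigned long foo_total_hits(void)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(foo_stats, cpu)->hits;
	return sum;
}

static void foo_stats_exit(void)
{
	free_percpu(foo_stats);
}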
diff --git a/mm/rmap.c b/mm/rmap.c
index 92e6757f196e..1a8bf76bfd03 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -80,7 +80,7 @@ static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
80 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); 80 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
81} 81}
82 82
83void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 83static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
84{ 84{
85 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 85 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
86} 86}
@@ -314,7 +314,7 @@ void __init anon_vma_init(void)
314 * Getting a lock on a stable anon_vma from a page off the LRU is 314 * Getting a lock on a stable anon_vma from a page off the LRU is
315 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 315 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
316 */ 316 */
317struct anon_vma *page_lock_anon_vma(struct page *page) 317struct anon_vma *__page_lock_anon_vma(struct page *page)
318{ 318{
319 struct anon_vma *anon_vma, *root_anon_vma; 319 struct anon_vma *anon_vma, *root_anon_vma;
320 unsigned long anon_mapping; 320 unsigned long anon_mapping;
@@ -348,6 +348,8 @@ out:
348} 348}
349 349
350void page_unlock_anon_vma(struct anon_vma *anon_vma) 350void page_unlock_anon_vma(struct anon_vma *anon_vma)
351 __releases(&anon_vma->root->lock)
352 __releases(RCU)
351{ 353{
352 anon_vma_unlock(anon_vma); 354 anon_vma_unlock(anon_vma);
353 rcu_read_unlock(); 355 rcu_read_unlock();
@@ -407,7 +409,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
407 * 409 *
408 * On success returns with pte mapped and locked. 410 * On success returns with pte mapped and locked.
409 */ 411 */
410pte_t *page_check_address(struct page *page, struct mm_struct *mm, 412pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
411 unsigned long address, spinlock_t **ptlp, int sync) 413 unsigned long address, spinlock_t **ptlp, int sync)
412{ 414{
413 pgd_t *pgd; 415 pgd_t *pgd;
@@ -745,7 +747,7 @@ int page_mkclean(struct page *page)
745 if (mapping) { 747 if (mapping) {
746 ret = page_mkclean_file(mapping, page); 748 ret = page_mkclean_file(mapping, page);
747 if (page_test_dirty(page)) { 749 if (page_test_dirty(page)) {
748 page_clear_dirty(page); 750 page_clear_dirty(page, 1);
749 ret = 1; 751 ret = 1;
750 } 752 }
751 } 753 }
@@ -780,10 +782,10 @@ void page_move_anon_rmap(struct page *page,
780} 782}
781 783
782/** 784/**
783 * __page_set_anon_rmap - setup new anonymous rmap 785 * __page_set_anon_rmap - set up new anonymous rmap
784 * @page: the page to add the mapping to 786 * @page: Page to add to rmap
785 * @vma: the vm area in which the mapping is added 787 * @vma: VM area to add page to.
786 * @address: the user virtual address mapped 788 * @address: User virtual address of the mapping
787 * @exclusive: the page is exclusively owned by the current process 789 * @exclusive: the page is exclusively owned by the current process
788 */ 790 */
789static void __page_set_anon_rmap(struct page *page, 791static void __page_set_anon_rmap(struct page *page,
@@ -793,25 +795,16 @@ static void __page_set_anon_rmap(struct page *page,
793 795
794 BUG_ON(!anon_vma); 796 BUG_ON(!anon_vma);
795 797
798 if (PageAnon(page))
799 return;
800
796 /* 801 /*
797 * If the page isn't exclusively mapped into this vma, 802 * If the page isn't exclusively mapped into this vma,
798 * we must use the _oldest_ possible anon_vma for the 803 * we must use the _oldest_ possible anon_vma for the
799 * page mapping! 804 * page mapping!
800 */ 805 */
801 if (!exclusive) { 806 if (!exclusive)
802 if (PageAnon(page))
803 return;
804 anon_vma = anon_vma->root; 807 anon_vma = anon_vma->root;
805 } else {
806 /*
807 * In this case, swapped-out-but-not-discarded swap-cache
808 * is remapped. So, no need to update page->mapping here.
 809 * We are convinced the anon_vma pointed to by page->mapping is not obsolete
 810 * because vma->anon_vma is necessarily part of the same anon_vma family.
811 */
812 if (PageAnon(page))
813 return;
814 }
815 808
816 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 809 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
817 page->mapping = (struct address_space *) anon_vma; 810 page->mapping = (struct address_space *) anon_vma;
@@ -942,7 +935,7 @@ void page_remove_rmap(struct page *page)
942 * containing the swap entry, but page not yet written to swap. 935 * containing the swap entry, but page not yet written to swap.
943 */ 936 */
944 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { 937 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
945 page_clear_dirty(page); 938 page_clear_dirty(page, 1);
946 set_page_dirty(page); 939 set_page_dirty(page);
947 } 940 }
948 /* 941 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 080b09a57a8f..47fdeeb9d636 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1586,6 +1586,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1586 1586
1587 inode = new_inode(sb); 1587 inode = new_inode(sb);
1588 if (inode) { 1588 if (inode) {
1589 inode->i_ino = get_next_ino();
1589 inode_init_owner(inode, dir, mode); 1590 inode_init_owner(inode, dir, mode);
1590 inode->i_blocks = 0; 1591 inode->i_blocks = 0;
1591 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1592 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1903,7 +1904,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1903 dir->i_size += BOGO_DIRENT_SIZE; 1904 dir->i_size += BOGO_DIRENT_SIZE;
1904 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1905 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1905 inc_nlink(inode); 1906 inc_nlink(inode);
1906 atomic_inc(&inode->i_count); /* New dentry reference */ 1907 ihold(inode); /* New dentry reference */
1907 dget(dentry); /* Extra pinning count for the created dentry */ 1908 dget(dentry); /* Extra pinning count for the created dentry */
1908 d_instantiate(dentry, inode); 1909 d_instantiate(dentry, inode);
1909out: 1910out:
@@ -2146,7 +2147,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2146 if (*len < 3) 2147 if (*len < 3)
2147 return 255; 2148 return 255;
2148 2149
2149 if (hlist_unhashed(&inode->i_hash)) { 2150 if (inode_unhashed(inode)) {
2150 /* Unfortunately insert_inode_hash is not idempotent, 2151 /* Unfortunately insert_inode_hash is not idempotent,
2151 * so as we hash inodes here rather than at creation 2152 * so as we hash inodes here rather than at creation
2152 * time, we need a lock to ensure we only try 2153 * time, we need a lock to ensure we only try
@@ -2154,7 +2155,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2154 */ 2155 */
2155 static DEFINE_SPINLOCK(lock); 2156 static DEFINE_SPINLOCK(lock);
2156 spin_lock(&lock); 2157 spin_lock(&lock);
2157 if (hlist_unhashed(&inode->i_hash)) 2158 if (inode_unhashed(inode))
2158 __insert_inode_hash(inode, 2159 __insert_inode_hash(inode,
2159 inode->i_ino + inode->i_generation); 2160 inode->i_ino + inode->i_generation);
2160 spin_unlock(&lock); 2161 spin_unlock(&lock);
@@ -2537,16 +2538,16 @@ static const struct vm_operations_struct shmem_vm_ops = {
2537}; 2538};
2538 2539
2539 2540
2540static int shmem_get_sb(struct file_system_type *fs_type, 2541static struct dentry *shmem_mount(struct file_system_type *fs_type,
2541 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 2542 int flags, const char *dev_name, void *data)
2542{ 2543{
2543 return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); 2544 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2544} 2545}
2545 2546
2546static struct file_system_type tmpfs_fs_type = { 2547static struct file_system_type tmpfs_fs_type = {
2547 .owner = THIS_MODULE, 2548 .owner = THIS_MODULE,
2548 .name = "tmpfs", 2549 .name = "tmpfs",
2549 .get_sb = shmem_get_sb, 2550 .mount = shmem_mount,
2550 .kill_sb = kill_litter_super, 2551 .kill_sb = kill_litter_super,
2551}; 2552};
2552 2553
@@ -2642,7 +2643,7 @@ out:
2642 2643
2643static struct file_system_type tmpfs_fs_type = { 2644static struct file_system_type tmpfs_fs_type = {
2644 .name = "tmpfs", 2645 .name = "tmpfs",
2645 .get_sb = ramfs_get_sb, 2646 .mount = ramfs_mount,
2646 .kill_sb = kill_litter_super, 2647 .kill_sb = kill_litter_super,
2647}; 2648};
2648 2649
diff --git a/mm/slab.c b/mm/slab.c
index fcae9815d3b3..b1e40dafbab3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -901,7 +901,7 @@ static int transfer_objects(struct array_cache *to,
901 struct array_cache *from, unsigned int max) 901 struct array_cache *from, unsigned int max)
902{ 902{
903 /* Figure out how many entries to transfer */ 903 /* Figure out how many entries to transfer */
904 int nr = min(min(from->avail, max), to->limit - to->avail); 904 int nr = min3(from->avail, max, to->limit - to->avail);
905 905
906 if (!nr) 906 if (!nr)
907 return 0; 907 return 0;
diff --git a/mm/slob.c b/mm/slob.c
index d582171c8101..617b6d6c42c7 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -500,7 +500,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
500 } else { 500 } else {
501 unsigned int order = get_order(size); 501 unsigned int order = get_order(size);
502 502
503 ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); 503 if (likely(order))
504 gfp |= __GFP_COMP;
505 ret = slob_new_pages(gfp, order, node);
504 if (ret) { 506 if (ret) {
505 struct page *page; 507 struct page *page;
506 page = virt_to_page(ret); 508 page = virt_to_page(ret);
diff --git a/mm/slub.c b/mm/slub.c
index 13fffe1f0f3d..8fd5401bb071 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -168,7 +168,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
168 168
169/* Internal SLUB flags */ 169/* Internal SLUB flags */
170#define __OBJECT_POISON 0x80000000UL /* Poison object */ 170#define __OBJECT_POISON 0x80000000UL /* Poison object */
171#define __SYSFS_ADD_DEFERRED 0x40000000UL /* Not yet visible via sysfs */
172 171
173static int kmem_size = sizeof(struct kmem_cache); 172static int kmem_size = sizeof(struct kmem_cache);
174 173
@@ -178,7 +177,7 @@ static struct notifier_block slab_notifier;
178 177
179static enum { 178static enum {
180 DOWN, /* No slab functionality available */ 179 DOWN, /* No slab functionality available */
181 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 180 PARTIAL, /* Kmem_cache_node works */
182 UP, /* Everything works but does not show up in sysfs */ 181 UP, /* Everything works but does not show up in sysfs */
183 SYSFS /* Sysfs up */ 182 SYSFS /* Sysfs up */
184} slab_state = DOWN; 183} slab_state = DOWN;
@@ -199,7 +198,7 @@ struct track {
199 198
200enum track_item { TRACK_ALLOC, TRACK_FREE }; 199enum track_item { TRACK_ALLOC, TRACK_FREE };
201 200
202#ifdef CONFIG_SLUB_DEBUG 201#ifdef CONFIG_SYSFS
203static int sysfs_slab_add(struct kmem_cache *); 202static int sysfs_slab_add(struct kmem_cache *);
204static int sysfs_slab_alias(struct kmem_cache *, const char *); 203static int sysfs_slab_alias(struct kmem_cache *, const char *);
205static void sysfs_slab_remove(struct kmem_cache *); 204static void sysfs_slab_remove(struct kmem_cache *);
@@ -210,6 +209,7 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
210 { return 0; } 209 { return 0; }
211static inline void sysfs_slab_remove(struct kmem_cache *s) 210static inline void sysfs_slab_remove(struct kmem_cache *s)
212{ 211{
212 kfree(s->name);
213 kfree(s); 213 kfree(s);
214} 214}
215 215
@@ -233,11 +233,7 @@ int slab_is_available(void)
233 233
234static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 234static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
235{ 235{
236#ifdef CONFIG_NUMA
237 return s->node[node]; 236 return s->node[node];
238#else
239 return &s->local_node;
240#endif
241} 237}
242 238
243/* Verify that a pointer has an address that is valid within a slab page */ 239/* Verify that a pointer has an address that is valid within a slab page */
@@ -494,7 +490,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
494 dump_stack(); 490 dump_stack();
495} 491}
496 492
497static void init_object(struct kmem_cache *s, void *object, int active) 493static void init_object(struct kmem_cache *s, void *object, u8 val)
498{ 494{
499 u8 *p = object; 495 u8 *p = object;
500 496
@@ -504,9 +500,7 @@ static void init_object(struct kmem_cache *s, void *object, int active)
504 } 500 }
505 501
506 if (s->flags & SLAB_RED_ZONE) 502 if (s->flags & SLAB_RED_ZONE)
507 memset(p + s->objsize, 503 memset(p + s->objsize, val, s->inuse - s->objsize);
508 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
509 s->inuse - s->objsize);
510} 504}
511 505
512static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) 506static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
@@ -641,17 +635,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
641} 635}
642 636
643static int check_object(struct kmem_cache *s, struct page *page, 637static int check_object(struct kmem_cache *s, struct page *page,
644 void *object, int active) 638 void *object, u8 val)
645{ 639{
646 u8 *p = object; 640 u8 *p = object;
647 u8 *endobject = object + s->objsize; 641 u8 *endobject = object + s->objsize;
648 642
649 if (s->flags & SLAB_RED_ZONE) { 643 if (s->flags & SLAB_RED_ZONE) {
650 unsigned int red =
651 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
652
653 if (!check_bytes_and_report(s, page, object, "Redzone", 644 if (!check_bytes_and_report(s, page, object, "Redzone",
654 endobject, red, s->inuse - s->objsize)) 645 endobject, val, s->inuse - s->objsize))
655 return 0; 646 return 0;
656 } else { 647 } else {
657 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { 648 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
@@ -661,7 +652,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
661 } 652 }
662 653
663 if (s->flags & SLAB_POISON) { 654 if (s->flags & SLAB_POISON) {
664 if (!active && (s->flags & __OBJECT_POISON) && 655 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
665 (!check_bytes_and_report(s, page, p, "Poison", p, 656 (!check_bytes_and_report(s, page, p, "Poison", p,
666 POISON_FREE, s->objsize - 1) || 657 POISON_FREE, s->objsize - 1) ||
667 !check_bytes_and_report(s, page, p, "Poison", 658 !check_bytes_and_report(s, page, p, "Poison",
@@ -673,7 +664,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
673 check_pad_bytes(s, page, p); 664 check_pad_bytes(s, page, p);
674 } 665 }
675 666
676 if (!s->offset && active) 667 if (!s->offset && val == SLUB_RED_ACTIVE)
677 /* 668 /*
678 * Object and freepointer overlap. Cannot check 669 * Object and freepointer overlap. Cannot check
679 * freepointer while object is allocated. 670 * freepointer while object is allocated.
@@ -792,6 +783,39 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
792} 783}
793 784
794/* 785/*
786 * Hooks for other subsystems that check memory allocations. In a typical
787 * production configuration these hooks all should produce no code at all.
788 */
789static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
790{
791 flags &= gfp_allowed_mask;
792 lockdep_trace_alloc(flags);
793 might_sleep_if(flags & __GFP_WAIT);
794
795 return should_failslab(s->objsize, flags, s->flags);
796}
797
798static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
799{
800 flags &= gfp_allowed_mask;
801 kmemcheck_slab_alloc(s, flags, object, s->objsize);
802 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
803}
804
805static inline void slab_free_hook(struct kmem_cache *s, void *x)
806{
807 kmemleak_free_recursive(x, s->flags);
808}
809
810static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
811{
812 kmemcheck_slab_free(s, object, s->objsize);
813 debug_check_no_locks_freed(object, s->objsize);
814 if (!(s->flags & SLAB_DEBUG_OBJECTS))
815 debug_check_no_obj_freed(object, s->objsize);
816}
817
818/*
795 * Tracking of fully allocated slabs for debugging purposes. 819 * Tracking of fully allocated slabs for debugging purposes.
796 */ 820 */
797static void add_full(struct kmem_cache_node *n, struct page *page) 821static void add_full(struct kmem_cache_node *n, struct page *page)
@@ -838,7 +862,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
838 * dilemma by deferring the increment of the count during 862 * dilemma by deferring the increment of the count during
839 * bootstrap (see early_kmem_cache_node_alloc). 863 * bootstrap (see early_kmem_cache_node_alloc).
840 */ 864 */
841 if (!NUMA_BUILD || n) { 865 if (n) {
842 atomic_long_inc(&n->nr_slabs); 866 atomic_long_inc(&n->nr_slabs);
843 atomic_long_add(objects, &n->total_objects); 867 atomic_long_add(objects, &n->total_objects);
844 } 868 }
@@ -858,11 +882,11 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
858 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 882 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
859 return; 883 return;
860 884
861 init_object(s, object, 0); 885 init_object(s, object, SLUB_RED_INACTIVE);
862 init_tracking(s, object); 886 init_tracking(s, object);
863} 887}
864 888
865static int alloc_debug_processing(struct kmem_cache *s, struct page *page, 889static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
866 void *object, unsigned long addr) 890 void *object, unsigned long addr)
867{ 891{
868 if (!check_slab(s, page)) 892 if (!check_slab(s, page))
@@ -878,14 +902,14 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
878 goto bad; 902 goto bad;
879 } 903 }
880 904
881 if (!check_object(s, page, object, 0)) 905 if (!check_object(s, page, object, SLUB_RED_INACTIVE))
882 goto bad; 906 goto bad;
883 907
884 /* Success perform special debug activities for allocs */ 908 /* Success perform special debug activities for allocs */
885 if (s->flags & SLAB_STORE_USER) 909 if (s->flags & SLAB_STORE_USER)
886 set_track(s, object, TRACK_ALLOC, addr); 910 set_track(s, object, TRACK_ALLOC, addr);
887 trace(s, page, object, 1); 911 trace(s, page, object, 1);
888 init_object(s, object, 1); 912 init_object(s, object, SLUB_RED_ACTIVE);
889 return 1; 913 return 1;
890 914
891bad: 915bad:
@@ -902,8 +926,8 @@ bad:
902 return 0; 926 return 0;
903} 927}
904 928
905static int free_debug_processing(struct kmem_cache *s, struct page *page, 929static noinline int free_debug_processing(struct kmem_cache *s,
906 void *object, unsigned long addr) 930 struct page *page, void *object, unsigned long addr)
907{ 931{
908 if (!check_slab(s, page)) 932 if (!check_slab(s, page))
909 goto fail; 933 goto fail;
@@ -918,7 +942,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
918 goto fail; 942 goto fail;
919 } 943 }
920 944
921 if (!check_object(s, page, object, 1)) 945 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
922 return 0; 946 return 0;
923 947
924 if (unlikely(s != page->slab)) { 948 if (unlikely(s != page->slab)) {
@@ -942,7 +966,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
942 if (s->flags & SLAB_STORE_USER) 966 if (s->flags & SLAB_STORE_USER)
943 set_track(s, object, TRACK_FREE, addr); 967 set_track(s, object, TRACK_FREE, addr);
944 trace(s, page, object, 0); 968 trace(s, page, object, 0);
945 init_object(s, object, 0); 969 init_object(s, object, SLUB_RED_INACTIVE);
946 return 1; 970 return 1;
947 971
948fail: 972fail:
@@ -1046,7 +1070,7 @@ static inline int free_debug_processing(struct kmem_cache *s,
1046static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1070static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1047 { return 1; } 1071 { return 1; }
1048static inline int check_object(struct kmem_cache *s, struct page *page, 1072static inline int check_object(struct kmem_cache *s, struct page *page,
1049 void *object, int active) { return 1; } 1073 void *object, u8 val) { return 1; }
1050static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1074static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1051static inline unsigned long kmem_cache_flags(unsigned long objsize, 1075static inline unsigned long kmem_cache_flags(unsigned long objsize,
1052 unsigned long flags, const char *name, 1076 unsigned long flags, const char *name,
@@ -1066,7 +1090,19 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
1066 int objects) {} 1090 int objects) {}
1067static inline void dec_slabs_node(struct kmem_cache *s, int node, 1091static inline void dec_slabs_node(struct kmem_cache *s, int node,
1068 int objects) {} 1092 int objects) {}
1069#endif 1093
1094static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1095 { return 0; }
1096
1097static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1098 void *object) {}
1099
1100static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
1101
1102static inline void slab_free_hook_irq(struct kmem_cache *s,
1103 void *object) {}
1104
1105#endif /* CONFIG_SLUB_DEBUG */
1070 1106
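The !CONFIG_SLUB_DEBUG stubs above are what the earlier "should produce no code at all" remark relies on: with debugging off, every hook collapses to an empty inline that the compiler removes from the fast paths, and the pre-alloc hook is the only one allowed to veto an allocation. A userspace toy of that compile-away pattern; the TOY_DEBUG switch and all names are invented, this is not the kernel code.

/*
 * Toy of the "hooks compile to nothing" pattern: with TOY_DEBUG unset
 * the hooks are empty inlines and the allocation fast path pays no
 * cost; with it set they become checking/tracing points and the pre
 * hook can abort the allocation.  Everything here is illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

#ifdef TOY_DEBUG
static inline int pre_alloc_hook(size_t size)
{
	printf("about to allocate %zu bytes\n", size);
	return 0;			/* non-zero would abort the allocation */
}
static inline void post_alloc_hook(void *obj)	{ printf("allocated %p\n", obj); }
static inline void free_hook(void *obj)		{ printf("freeing %p\n", obj); }
#else
static inline int pre_alloc_hook(size_t size)	{ (void)size; return 0; }
static inline void post_alloc_hook(void *obj)	{ (void)obj; }
static inline void free_hook(void *obj)		{ (void)obj; }
#endif

static void *toy_alloc(size_t size)
{
	void *obj;

	if (pre_alloc_hook(size))	/* e.g. failslab-style failure injection */
		return NULL;
	obj = malloc(size);		/* stands in for the per-CPU freelist pop */
	post_alloc_hook(obj);
	return obj;
}

static void toy_free(void *obj)
{
	free_hook(obj);
	free(obj);
}

int main(void)
{
	void *p = toy_alloc(64);

	toy_free(p);
	return 0;
}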
1071/* 1107/*
1072 * Slab allocation and freeing 1108 * Slab allocation and freeing
@@ -1194,7 +1230,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1194 slab_pad_check(s, page); 1230 slab_pad_check(s, page);
1195 for_each_object(p, s, page_address(page), 1231 for_each_object(p, s, page_address(page),
1196 page->objects) 1232 page->objects)
1197 check_object(s, page, p, 0); 1233 check_object(s, page, p, SLUB_RED_INACTIVE);
1198 } 1234 }
1199 1235
1200 kmemcheck_free_shadow(page, compound_order(page)); 1236 kmemcheck_free_shadow(page, compound_order(page));
@@ -1274,13 +1310,19 @@ static void add_partial(struct kmem_cache_node *n,
1274 spin_unlock(&n->list_lock); 1310 spin_unlock(&n->list_lock);
1275} 1311}
1276 1312
1313static inline void __remove_partial(struct kmem_cache_node *n,
1314 struct page *page)
1315{
1316 list_del(&page->lru);
1317 n->nr_partial--;
1318}
1319
1277static void remove_partial(struct kmem_cache *s, struct page *page) 1320static void remove_partial(struct kmem_cache *s, struct page *page)
1278{ 1321{
1279 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1322 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1280 1323
1281 spin_lock(&n->list_lock); 1324 spin_lock(&n->list_lock);
1282 list_del(&page->lru); 1325 __remove_partial(n, page);
1283 n->nr_partial--;
1284 spin_unlock(&n->list_lock); 1326 spin_unlock(&n->list_lock);
1285} 1327}
1286 1328
@@ -1293,8 +1335,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1293 struct page *page) 1335 struct page *page)
1294{ 1336{
1295 if (slab_trylock(page)) { 1337 if (slab_trylock(page)) {
1296 list_del(&page->lru); 1338 __remove_partial(n, page);
1297 n->nr_partial--;
1298 __SetPageSlubFrozen(page); 1339 __SetPageSlubFrozen(page);
1299 return 1; 1340 return 1;
1300 } 1341 }
@@ -1405,6 +1446,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1405 * On exit the slab lock will have been dropped. 1446 * On exit the slab lock will have been dropped.
1406 */ 1447 */
1407static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1448static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1449 __releases(bitlock)
1408{ 1450{
1409 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1451 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1410 1452
@@ -1447,6 +1489,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1447 * Remove the cpu slab 1489 * Remove the cpu slab
1448 */ 1490 */
1449static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1491static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1492 __releases(bitlock)
1450{ 1493{
1451 struct page *page = c->page; 1494 struct page *page = c->page;
1452 int tail = 1; 1495 int tail = 1;
@@ -1647,6 +1690,7 @@ new_slab:
1647 goto load_freelist; 1690 goto load_freelist;
1648 } 1691 }
1649 1692
1693 gfpflags &= gfp_allowed_mask;
1650 if (gfpflags & __GFP_WAIT) 1694 if (gfpflags & __GFP_WAIT)
1651 local_irq_enable(); 1695 local_irq_enable();
1652 1696
@@ -1674,7 +1718,7 @@ debug:
1674 1718
1675 c->page->inuse++; 1719 c->page->inuse++;
1676 c->page->freelist = get_freepointer(s, object); 1720 c->page->freelist = get_freepointer(s, object);
1677 c->node = -1; 1721 c->node = NUMA_NO_NODE;
1678 goto unlock_out; 1722 goto unlock_out;
1679} 1723}
1680 1724
@@ -1695,12 +1739,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1695 struct kmem_cache_cpu *c; 1739 struct kmem_cache_cpu *c;
1696 unsigned long flags; 1740 unsigned long flags;
1697 1741
1698 gfpflags &= gfp_allowed_mask; 1742 if (slab_pre_alloc_hook(s, gfpflags))
1699
1700 lockdep_trace_alloc(gfpflags);
1701 might_sleep_if(gfpflags & __GFP_WAIT);
1702
1703 if (should_failslab(s->objsize, gfpflags, s->flags))
1704 return NULL; 1743 return NULL;
1705 1744
1706 local_irq_save(flags); 1745 local_irq_save(flags);
@@ -1719,8 +1758,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1719 if (unlikely(gfpflags & __GFP_ZERO) && object) 1758 if (unlikely(gfpflags & __GFP_ZERO) && object)
1720 memset(object, 0, s->objsize); 1759 memset(object, 0, s->objsize);
1721 1760
1722 kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); 1761 slab_post_alloc_hook(s, gfpflags, object);
1723 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
1724 1762
1725 return object; 1763 return object;
1726} 1764}
@@ -1754,7 +1792,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1754 return ret; 1792 return ret;
1755} 1793}
1756EXPORT_SYMBOL(kmem_cache_alloc_node); 1794EXPORT_SYMBOL(kmem_cache_alloc_node);
1757#endif
1758 1795
1759#ifdef CONFIG_TRACING 1796#ifdef CONFIG_TRACING
1760void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, 1797void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
@@ -1765,6 +1802,7 @@ void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
1765} 1802}
1766EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 1803EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1767#endif 1804#endif
1805#endif
1768 1806
1769/* 1807/*
1770 * Slow path handling. This may still be called frequently since objects 1808
@@ -1850,14 +1888,14 @@ static __always_inline void slab_free(struct kmem_cache *s,
1850 struct kmem_cache_cpu *c; 1888 struct kmem_cache_cpu *c;
1851 unsigned long flags; 1889 unsigned long flags;
1852 1890
1853 kmemleak_free_recursive(x, s->flags); 1891 slab_free_hook(s, x);
1892
1854 local_irq_save(flags); 1893 local_irq_save(flags);
1855 c = __this_cpu_ptr(s->cpu_slab); 1894 c = __this_cpu_ptr(s->cpu_slab);
1856 kmemcheck_slab_free(s, object, s->objsize); 1895
1857 debug_check_no_locks_freed(object, s->objsize); 1896 slab_free_hook_irq(s, x);
1858 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1897
1859 debug_check_no_obj_freed(object, s->objsize); 1898 if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
1860 if (likely(page == c->page && c->node >= 0)) {
1861 set_freepointer(s, object, c->freelist); 1899 set_freepointer(s, object, c->freelist);
1862 c->freelist = object; 1900 c->freelist = object;
1863 stat(s, FREE_FASTPATH); 1901 stat(s, FREE_FASTPATH);
@@ -2062,26 +2100,18 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2062#endif 2100#endif
2063} 2101}
2064 2102
2065static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); 2103static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2066
2067static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2068{ 2104{
2069 if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) 2105 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2070 /* 2106 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2071 * Boot time creation of the kmalloc array. Use static per cpu data
2072 * since the per cpu allocator is not available yet.
2073 */
2074 s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches);
2075 else
2076 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2077 2107
2078 if (!s->cpu_slab) 2108 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2079 return 0;
2080 2109
2081 return 1; 2110 return s->cpu_slab != NULL;
2082} 2111}
2083 2112
2084#ifdef CONFIG_NUMA 2113static struct kmem_cache *kmem_cache_node;
2114
2085/* 2115/*
2086 * No kmalloc_node yet so do it by hand. We know that this is the first 2116 * No kmalloc_node yet so do it by hand. We know that this is the first
2087 * slab on the node for this slabcache. There are no concurrent accesses 2117 * slab on the node for this slabcache. There are no concurrent accesses
@@ -2091,15 +2121,15 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2091 * when allocating for the kmalloc_node_cache. This is used for bootstrapping 2121 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
2092 * memory on a fresh node that has no slab structures yet. 2122 * memory on a fresh node that has no slab structures yet.
2093 */ 2123 */
2094static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) 2124static void early_kmem_cache_node_alloc(int node)
2095{ 2125{
2096 struct page *page; 2126 struct page *page;
2097 struct kmem_cache_node *n; 2127 struct kmem_cache_node *n;
2098 unsigned long flags; 2128 unsigned long flags;
2099 2129
2100 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 2130 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2101 2131
2102 page = new_slab(kmalloc_caches, gfpflags, node); 2132 page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
2103 2133
2104 BUG_ON(!page); 2134 BUG_ON(!page);
2105 if (page_to_nid(page) != node) { 2135 if (page_to_nid(page) != node) {
@@ -2111,15 +2141,15 @@ static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
2111 2141
2112 n = page->freelist; 2142 n = page->freelist;
2113 BUG_ON(!n); 2143 BUG_ON(!n);
2114 page->freelist = get_freepointer(kmalloc_caches, n); 2144 page->freelist = get_freepointer(kmem_cache_node, n);
2115 page->inuse++; 2145 page->inuse++;
2116 kmalloc_caches->node[node] = n; 2146 kmem_cache_node->node[node] = n;
2117#ifdef CONFIG_SLUB_DEBUG 2147#ifdef CONFIG_SLUB_DEBUG
2118 init_object(kmalloc_caches, n, 1); 2148 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2119 init_tracking(kmalloc_caches, n); 2149 init_tracking(kmem_cache_node, n);
2120#endif 2150#endif
2121 init_kmem_cache_node(n, kmalloc_caches); 2151 init_kmem_cache_node(n, kmem_cache_node);
2122 inc_slabs_node(kmalloc_caches, node, page->objects); 2152 inc_slabs_node(kmem_cache_node, node, page->objects);
2123 2153
2124 /* 2154 /*
2125 * lockdep requires consistent irq usage for each lock 2155 * lockdep requires consistent irq usage for each lock
@@ -2137,13 +2167,15 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
2137 2167
2138 for_each_node_state(node, N_NORMAL_MEMORY) { 2168 for_each_node_state(node, N_NORMAL_MEMORY) {
2139 struct kmem_cache_node *n = s->node[node]; 2169 struct kmem_cache_node *n = s->node[node];
2170
2140 if (n) 2171 if (n)
2141 kmem_cache_free(kmalloc_caches, n); 2172 kmem_cache_free(kmem_cache_node, n);
2173
2142 s->node[node] = NULL; 2174 s->node[node] = NULL;
2143 } 2175 }
2144} 2176}
2145 2177
2146static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2178static int init_kmem_cache_nodes(struct kmem_cache *s)
2147{ 2179{
2148 int node; 2180 int node;
2149 2181
@@ -2151,11 +2183,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2151 struct kmem_cache_node *n; 2183 struct kmem_cache_node *n;
2152 2184
2153 if (slab_state == DOWN) { 2185 if (slab_state == DOWN) {
2154 early_kmem_cache_node_alloc(gfpflags, node); 2186 early_kmem_cache_node_alloc(node);
2155 continue; 2187 continue;
2156 } 2188 }
2157 n = kmem_cache_alloc_node(kmalloc_caches, 2189 n = kmem_cache_alloc_node(kmem_cache_node,
2158 gfpflags, node); 2190 GFP_KERNEL, node);
2159 2191
2160 if (!n) { 2192 if (!n) {
2161 free_kmem_cache_nodes(s); 2193 free_kmem_cache_nodes(s);
@@ -2167,17 +2199,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2167 } 2199 }
2168 return 1; 2200 return 1;
2169} 2201}
2170#else
2171static void free_kmem_cache_nodes(struct kmem_cache *s)
2172{
2173}
2174
2175static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2176{
2177 init_kmem_cache_node(&s->local_node, s);
2178 return 1;
2179}
2180#endif
2181 2202
2182static void set_min_partial(struct kmem_cache *s, unsigned long min) 2203static void set_min_partial(struct kmem_cache *s, unsigned long min)
2183{ 2204{
@@ -2312,7 +2333,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2312 2333
2313} 2334}
2314 2335
2315static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2336static int kmem_cache_open(struct kmem_cache *s,
2316 const char *name, size_t size, 2337 const char *name, size_t size,
2317 size_t align, unsigned long flags, 2338 size_t align, unsigned long flags,
2318 void (*ctor)(void *)) 2339 void (*ctor)(void *))
@@ -2348,10 +2369,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2348#ifdef CONFIG_NUMA 2369#ifdef CONFIG_NUMA
2349 s->remote_node_defrag_ratio = 1000; 2370 s->remote_node_defrag_ratio = 1000;
2350#endif 2371#endif
2351 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2372 if (!init_kmem_cache_nodes(s))
2352 goto error; 2373 goto error;
2353 2374
2354 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2375 if (alloc_kmem_cache_cpus(s))
2355 return 1; 2376 return 1;
2356 2377
2357 free_kmem_cache_nodes(s); 2378 free_kmem_cache_nodes(s);
@@ -2414,9 +2435,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
2414#ifdef CONFIG_SLUB_DEBUG 2435#ifdef CONFIG_SLUB_DEBUG
2415 void *addr = page_address(page); 2436 void *addr = page_address(page);
2416 void *p; 2437 void *p;
2417 long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long), 2438 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
2418 GFP_ATOMIC); 2439 sizeof(long), GFP_ATOMIC);
2419
2420 if (!map) 2440 if (!map)
2421 return; 2441 return;
2422 slab_err(s, page, "%s", text); 2442 slab_err(s, page, "%s", text);
@@ -2448,9 +2468,8 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2448 spin_lock_irqsave(&n->list_lock, flags); 2468 spin_lock_irqsave(&n->list_lock, flags);
2449 list_for_each_entry_safe(page, h, &n->partial, lru) { 2469 list_for_each_entry_safe(page, h, &n->partial, lru) {
2450 if (!page->inuse) { 2470 if (!page->inuse) {
2451 list_del(&page->lru); 2471 __remove_partial(n, page);
2452 discard_slab(s, page); 2472 discard_slab(s, page);
2453 n->nr_partial--;
2454 } else { 2473 } else {
2455 list_slab_objects(s, page, 2474 list_slab_objects(s, page,
2456 "Objects remaining on kmem_cache_close()"); 2475 "Objects remaining on kmem_cache_close()");
@@ -2507,9 +2526,15 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2507 * Kmalloc subsystem 2526 * Kmalloc subsystem
2508 *******************************************************************/ 2527 *******************************************************************/
2509 2528
2510struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; 2529struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
2511EXPORT_SYMBOL(kmalloc_caches); 2530EXPORT_SYMBOL(kmalloc_caches);
2512 2531
2532static struct kmem_cache *kmem_cache;
2533
2534#ifdef CONFIG_ZONE_DMA
2535static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
2536#endif
2537
2513static int __init setup_slub_min_order(char *str) 2538static int __init setup_slub_min_order(char *str)
2514{ 2539{
2515 get_option(&str, &slub_min_order); 2540 get_option(&str, &slub_min_order);
@@ -2546,116 +2571,29 @@ static int __init setup_slub_nomerge(char *str)
2546 2571
2547__setup("slub_nomerge", setup_slub_nomerge); 2572__setup("slub_nomerge", setup_slub_nomerge);
2548 2573
2549static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 2574static struct kmem_cache *__init create_kmalloc_cache(const char *name,
2550 const char *name, int size, gfp_t gfp_flags) 2575 int size, unsigned int flags)
2551{ 2576{
2552 unsigned int flags = 0; 2577 struct kmem_cache *s;
2553 2578
2554 if (gfp_flags & SLUB_DMA) 2579 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
2555 flags = SLAB_CACHE_DMA;
2556 2580
2557 /* 2581 /*
2558 * This function is called with IRQs disabled during early-boot on 2582 * This function is called with IRQs disabled during early-boot on
2559 * single CPU so there's no need to take slub_lock here. 2583 * single CPU so there's no need to take slub_lock here.
2560 */ 2584 */
2561 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2585 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
2562 flags, NULL)) 2586 flags, NULL))
2563 goto panic; 2587 goto panic;
2564 2588
2565 list_add(&s->list, &slab_caches); 2589 list_add(&s->list, &slab_caches);
2566
2567 if (sysfs_slab_add(s))
2568 goto panic;
2569 return s; 2590 return s;
2570 2591
2571panic: 2592panic:
2572 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 2593 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2594 return NULL;
2573} 2595}
2574 2596
2575#ifdef CONFIG_ZONE_DMA
2576static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
2577
2578static void sysfs_add_func(struct work_struct *w)
2579{
2580 struct kmem_cache *s;
2581
2582 down_write(&slub_lock);
2583 list_for_each_entry(s, &slab_caches, list) {
2584 if (s->flags & __SYSFS_ADD_DEFERRED) {
2585 s->flags &= ~__SYSFS_ADD_DEFERRED;
2586 sysfs_slab_add(s);
2587 }
2588 }
2589 up_write(&slub_lock);
2590}
2591
2592static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
2593
2594static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2595{
2596 struct kmem_cache *s;
2597 char *text;
2598 size_t realsize;
2599 unsigned long slabflags;
2600 int i;
2601
2602 s = kmalloc_caches_dma[index];
2603 if (s)
2604 return s;
2605
2606 /* Dynamically create dma cache */
2607 if (flags & __GFP_WAIT)
2608 down_write(&slub_lock);
2609 else {
2610 if (!down_write_trylock(&slub_lock))
2611 goto out;
2612 }
2613
2614 if (kmalloc_caches_dma[index])
2615 goto unlock_out;
2616
2617 realsize = kmalloc_caches[index].objsize;
2618 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2619 (unsigned int)realsize);
2620
2621 s = NULL;
2622 for (i = 0; i < KMALLOC_CACHES; i++)
2623 if (!kmalloc_caches[i].size)
2624 break;
2625
2626 BUG_ON(i >= KMALLOC_CACHES);
2627 s = kmalloc_caches + i;
2628
2629 /*
2630 * Must defer sysfs creation to a workqueue because we don't know
2631 * what context we are called from. Before sysfs comes up, we don't
2632 * need to do anything because our sysfs initcall will start by
2633 * adding all existing slabs to sysfs.
2634 */
2635 slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
2636 if (slab_state >= SYSFS)
2637 slabflags |= __SYSFS_ADD_DEFERRED;
2638
2639 if (!text || !kmem_cache_open(s, flags, text,
2640 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2641 s->size = 0;
2642 kfree(text);
2643 goto unlock_out;
2644 }
2645
2646 list_add(&s->list, &slab_caches);
2647 kmalloc_caches_dma[index] = s;
2648
2649 if (slab_state >= SYSFS)
2650 schedule_work(&sysfs_add_work);
2651
2652unlock_out:
2653 up_write(&slub_lock);
2654out:
2655 return kmalloc_caches_dma[index];
2656}
2657#endif
2658
2659/* 2597/*
2660 * Conversion table for small slabs sizes / 8 to the index in the 2598 * Conversion table for small slabs sizes / 8 to the index in the
2661 * kmalloc array. This is necessary for slabs < 192 since we have non power 2599 * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -2708,10 +2646,10 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2708 2646
2709#ifdef CONFIG_ZONE_DMA 2647#ifdef CONFIG_ZONE_DMA
2710 if (unlikely((flags & SLUB_DMA))) 2648 if (unlikely((flags & SLUB_DMA)))
2711 return dma_kmalloc_cache(index, flags); 2649 return kmalloc_dma_caches[index];
2712 2650
2713#endif 2651#endif
2714 return &kmalloc_caches[index]; 2652 return kmalloc_caches[index];
2715} 2653}
2716 2654
2717void *__kmalloc(size_t size, gfp_t flags) 2655void *__kmalloc(size_t size, gfp_t flags)
@@ -2735,6 +2673,7 @@ void *__kmalloc(size_t size, gfp_t flags)
2735} 2673}
2736EXPORT_SYMBOL(__kmalloc); 2674EXPORT_SYMBOL(__kmalloc);
2737 2675
2676#ifdef CONFIG_NUMA
2738static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2677static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2739{ 2678{
2740 struct page *page; 2679 struct page *page;
@@ -2749,7 +2688,6 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2749 return ptr; 2688 return ptr;
2750} 2689}
2751 2690
2752#ifdef CONFIG_NUMA
2753void *__kmalloc_node(size_t size, gfp_t flags, int node) 2691void *__kmalloc_node(size_t size, gfp_t flags, int node)
2754{ 2692{
2755 struct kmem_cache *s; 2693 struct kmem_cache *s;
@@ -2889,8 +2827,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2889 * may have freed the last object and be 2827 * may have freed the last object and be
2890 * waiting to release the slab. 2828 * waiting to release the slab.
2891 */ 2829 */
2892 list_del(&page->lru); 2830 __remove_partial(n, page);
2893 n->nr_partial--;
2894 slab_unlock(page); 2831 slab_unlock(page);
2895 discard_slab(s, page); 2832 discard_slab(s, page);
2896 } else { 2833 } else {
@@ -2914,7 +2851,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2914} 2851}
2915EXPORT_SYMBOL(kmem_cache_shrink); 2852EXPORT_SYMBOL(kmem_cache_shrink);
2916 2853
2917#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) 2854#if defined(CONFIG_MEMORY_HOTPLUG)
2918static int slab_mem_going_offline_callback(void *arg) 2855static int slab_mem_going_offline_callback(void *arg)
2919{ 2856{
2920 struct kmem_cache *s; 2857 struct kmem_cache *s;
@@ -2956,7 +2893,7 @@ static void slab_mem_offline_callback(void *arg)
2956 BUG_ON(slabs_node(s, offline_node)); 2893 BUG_ON(slabs_node(s, offline_node));
2957 2894
2958 s->node[offline_node] = NULL; 2895 s->node[offline_node] = NULL;
2959 kmem_cache_free(kmalloc_caches, n); 2896 kmem_cache_free(kmem_cache_node, n);
2960 } 2897 }
2961 } 2898 }
2962 up_read(&slub_lock); 2899 up_read(&slub_lock);
@@ -2989,7 +2926,7 @@ static int slab_mem_going_online_callback(void *arg)
2989 * since memory is not yet available from the node that 2926 * since memory is not yet available from the node that
2990 * is brought up. 2927 * is brought up.
2991 */ 2928 */
2992 n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); 2929 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
2993 if (!n) { 2930 if (!n) {
2994 ret = -ENOMEM; 2931 ret = -ENOMEM;
2995 goto out; 2932 goto out;
@@ -3035,46 +2972,92 @@ static int slab_memory_callback(struct notifier_block *self,
3035 * Basic setup of slabs 2972 * Basic setup of slabs
3036 *******************************************************************/ 2973 *******************************************************************/
3037 2974
2975/*
2976 * Used for early kmem_cache structures that were allocated using
2977 * the page allocator
2978 */
2979
2980static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
2981{
2982 int node;
2983
2984 list_add(&s->list, &slab_caches);
2985 s->refcount = -1;
2986
2987 for_each_node_state(node, N_NORMAL_MEMORY) {
2988 struct kmem_cache_node *n = get_node(s, node);
2989 struct page *p;
2990
2991 if (n) {
2992 list_for_each_entry(p, &n->partial, lru)
2993 p->slab = s;
2994
2995#ifdef CONFIG_SLUB_DEBUG
2996 list_for_each_entry(p, &n->full, lru)
2997 p->slab = s;
2998#endif
2999 }
3000 }
3001}
3002
3038void __init kmem_cache_init(void) 3003void __init kmem_cache_init(void)
3039{ 3004{
3040 int i; 3005 int i;
3041 int caches = 0; 3006 int caches = 0;
3007 struct kmem_cache *temp_kmem_cache;
3008 int order;
3009 struct kmem_cache *temp_kmem_cache_node;
3010 unsigned long kmalloc_size;
3011
3012 kmem_size = offsetof(struct kmem_cache, node) +
3013 nr_node_ids * sizeof(struct kmem_cache_node *);
3014
3015 /* Allocate two kmem_caches from the page allocator */
3016 kmalloc_size = ALIGN(kmem_size, cache_line_size());
3017 order = get_order(2 * kmalloc_size);
3018 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
3042 3019
3043#ifdef CONFIG_NUMA
3044 /* 3020 /*
3045 * Must first have the slab cache available for the allocations of the 3021 * Must first have the slab cache available for the allocations of the
3046 * struct kmem_cache_node's. There is special bootstrap code in 3022 * struct kmem_cache_node's. There is special bootstrap code in
3047 * kmem_cache_open for slab_state == DOWN. 3023 * kmem_cache_open for slab_state == DOWN.
3048 */ 3024 */
3049 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 3025 kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3050 sizeof(struct kmem_cache_node), GFP_NOWAIT); 3026
3051 kmalloc_caches[0].refcount = -1; 3027 kmem_cache_open(kmem_cache_node, "kmem_cache_node",
3052 caches++; 3028 sizeof(struct kmem_cache_node),
3029 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3053 3030
3054 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3031 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3055#endif
3056 3032
3057 /* Able to allocate the per node structures */ 3033 /* Able to allocate the per node structures */
3058 slab_state = PARTIAL; 3034 slab_state = PARTIAL;
3059 3035
3060 /* Caches that are not of the two-to-the-power-of size */ 3036 temp_kmem_cache = kmem_cache;
3061 if (KMALLOC_MIN_SIZE <= 32) { 3037 kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
3062 create_kmalloc_cache(&kmalloc_caches[1], 3038 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3063 "kmalloc-96", 96, GFP_NOWAIT); 3039 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3064 caches++; 3040 memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3065 }
3066 if (KMALLOC_MIN_SIZE <= 64) {
3067 create_kmalloc_cache(&kmalloc_caches[2],
3068 "kmalloc-192", 192, GFP_NOWAIT);
3069 caches++;
3070 }
3071 3041
3072 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3042 /*
3073 create_kmalloc_cache(&kmalloc_caches[i], 3043 * Allocate kmem_cache_node properly from the kmem_cache slab.
3074 "kmalloc", 1 << i, GFP_NOWAIT); 3044 * kmem_cache_node is separately allocated so no need to
3075 caches++; 3045 * update any list pointers.
3076 } 3046 */
3047 temp_kmem_cache_node = kmem_cache_node;
3048
3049 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3050 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
3051
3052 kmem_cache_bootstrap_fixup(kmem_cache_node);
3077 3053
3054 caches++;
3055 kmem_cache_bootstrap_fixup(kmem_cache);
3056 caches++;
3057 /* Free temporary boot structure */
3058 free_pages((unsigned long)temp_kmem_cache, order);
3059
3060 /* Now we can use the kmem_cache to allocate kmalloc slabs */
3078 3061
3079 /* 3062 /*
3080 * Patch up the size_index table if we have strange large alignment 3063 * Patch up the size_index table if we have strange large alignment
@@ -3114,26 +3097,60 @@ void __init kmem_cache_init(void)
3114 size_index[size_index_elem(i)] = 8; 3097 size_index[size_index_elem(i)] = 8;
3115 } 3098 }
3116 3099
3100 /* Caches that are not of the two-to-the-power-of size */
3101 if (KMALLOC_MIN_SIZE <= 32) {
3102 kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0);
3103 caches++;
3104 }
3105
3106 if (KMALLOC_MIN_SIZE <= 64) {
3107 kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0);
3108 caches++;
3109 }
3110
3111 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3112 kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0);
3113 caches++;
3114 }
3115
3117 slab_state = UP; 3116 slab_state = UP;
3118 3117
3119 /* Provide the correct kmalloc names now that the caches are up */ 3118 /* Provide the correct kmalloc names now that the caches are up */
3119 if (KMALLOC_MIN_SIZE <= 32) {
3120 kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT);
3121 BUG_ON(!kmalloc_caches[1]->name);
3122 }
3123
3124 if (KMALLOC_MIN_SIZE <= 64) {
3125 kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT);
3126 BUG_ON(!kmalloc_caches[2]->name);
3127 }
3128
3120 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3129 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3121 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); 3130 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
3122 3131
3123 BUG_ON(!s); 3132 BUG_ON(!s);
3124 kmalloc_caches[i].name = s; 3133 kmalloc_caches[i]->name = s;
3125 } 3134 }
3126 3135
3127#ifdef CONFIG_SMP 3136#ifdef CONFIG_SMP
3128 register_cpu_notifier(&slab_notifier); 3137 register_cpu_notifier(&slab_notifier);
3129#endif 3138#endif
3130#ifdef CONFIG_NUMA
3131 kmem_size = offsetof(struct kmem_cache, node) +
3132 nr_node_ids * sizeof(struct kmem_cache_node *);
3133#else
3134 kmem_size = sizeof(struct kmem_cache);
3135#endif
3136 3139
3140#ifdef CONFIG_ZONE_DMA
3141 for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
3142 struct kmem_cache *s = kmalloc_caches[i];
3143
3144 if (s && s->size) {
3145 char *name = kasprintf(GFP_NOWAIT,
3146 "dma-kmalloc-%d", s->objsize);
3147
3148 BUG_ON(!name);
3149 kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3150 s->objsize, SLAB_CACHE_DMA);
3151 }
3152 }
3153#endif
3137 printk(KERN_INFO 3154 printk(KERN_INFO
3138 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 3155 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
3139 " CPUs=%d, Nodes=%d\n", 3156 " CPUs=%d, Nodes=%d\n",
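
The reworked kmem_cache_init() above bootstraps SLUB in two passes: kmem_cache and kmem_cache_node are first carved out of raw pages from __get_free_pages(), the allocator is brought up with those temporary descriptors, and then each descriptor is re-allocated from the now-working kmem_cache slab, copied over, and its page->slab back-pointers are patched by kmem_cache_bootstrap_fixup() before the boot pages are freed. The userspace sketch below shows only the general "allocate the allocator from itself" pattern with an invented toy cache; it is not SLUB code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy stand-in for struct kmem_cache (names invented for this sketch). */
struct toy_cache {
	const char *name;
	size_t object_size;
};

/* Toy allocation; the real code hands out objects carved from slab pages. */
static void *toy_cache_alloc(struct toy_cache *c)
{
	return malloc(c->object_size);
}

/* The cache that all struct toy_cache objects come from. */
static struct toy_cache *cache_of_caches;

static void toy_cache_init(void)
{
	/* Step 1: bring the cache up using a temporary descriptor. */
	struct toy_cache boot = {
		.name = "toy_cache",
		.object_size = sizeof(struct toy_cache),
	};

	/* Step 2: the allocator works now, so allocate the descriptor
	 * from itself and copy the bootstrap contents over.  SLUB also
	 * walks the partial/full lists here to repoint page->slab at
	 * the relocated descriptor; the toy cache has nothing to fix. */
	cache_of_caches = toy_cache_alloc(&boot);
	memcpy(cache_of_caches, &boot, sizeof(boot));
}

int main(void)
{
	toy_cache_init();
	printf("%s: %zu-byte objects\n",
	       cache_of_caches->name, cache_of_caches->object_size);
	free(cache_of_caches);
	return 0;
}
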
@@ -3211,6 +3228,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3211 size_t align, unsigned long flags, void (*ctor)(void *)) 3228 size_t align, unsigned long flags, void (*ctor)(void *))
3212{ 3229{
3213 struct kmem_cache *s; 3230 struct kmem_cache *s;
3231 char *n;
3214 3232
3215 if (WARN_ON(!name)) 3233 if (WARN_ON(!name))
3216 return NULL; 3234 return NULL;
@@ -3234,19 +3252,25 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3234 return s; 3252 return s;
3235 } 3253 }
3236 3254
3255 n = kstrdup(name, GFP_KERNEL);
3256 if (!n)
3257 goto err;
3258
3237 s = kmalloc(kmem_size, GFP_KERNEL); 3259 s = kmalloc(kmem_size, GFP_KERNEL);
3238 if (s) { 3260 if (s) {
3239 if (kmem_cache_open(s, GFP_KERNEL, name, 3261 if (kmem_cache_open(s, n,
3240 size, align, flags, ctor)) { 3262 size, align, flags, ctor)) {
3241 list_add(&s->list, &slab_caches); 3263 list_add(&s->list, &slab_caches);
3242 if (sysfs_slab_add(s)) { 3264 if (sysfs_slab_add(s)) {
3243 list_del(&s->list); 3265 list_del(&s->list);
3266 kfree(n);
3244 kfree(s); 3267 kfree(s);
3245 goto err; 3268 goto err;
3246 } 3269 }
3247 up_write(&slub_lock); 3270 up_write(&slub_lock);
3248 return s; 3271 return s;
3249 } 3272 }
3273 kfree(n);
3250 kfree(s); 3274 kfree(s);
3251 } 3275 }
3252 up_write(&slub_lock); 3276 up_write(&slub_lock);
@@ -3318,6 +3342,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3318 return ret; 3342 return ret;
3319} 3343}
3320 3344
3345#ifdef CONFIG_NUMA
3321void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3346void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3322 int node, unsigned long caller) 3347 int node, unsigned long caller)
3323{ 3348{
@@ -3346,8 +3371,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3346 3371
3347 return ret; 3372 return ret;
3348} 3373}
3374#endif
3349 3375
3350#ifdef CONFIG_SLUB_DEBUG 3376#ifdef CONFIG_SYSFS
3351static int count_inuse(struct page *page) 3377static int count_inuse(struct page *page)
3352{ 3378{
3353 return page->inuse; 3379 return page->inuse;
@@ -3357,7 +3383,9 @@ static int count_total(struct page *page)
3357{ 3383{
3358 return page->objects; 3384 return page->objects;
3359} 3385}
3386#endif
3360 3387
3388#ifdef CONFIG_SLUB_DEBUG
3361static int validate_slab(struct kmem_cache *s, struct page *page, 3389static int validate_slab(struct kmem_cache *s, struct page *page,
3362 unsigned long *map) 3390 unsigned long *map)
3363{ 3391{
@@ -3448,65 +3476,6 @@ static long validate_slab_cache(struct kmem_cache *s)
3448 kfree(map); 3476 kfree(map);
3449 return count; 3477 return count;
3450} 3478}
3451
3452#ifdef SLUB_RESILIENCY_TEST
3453static void resiliency_test(void)
3454{
3455 u8 *p;
3456
3457 printk(KERN_ERR "SLUB resiliency testing\n");
3458 printk(KERN_ERR "-----------------------\n");
3459 printk(KERN_ERR "A. Corruption after allocation\n");
3460
3461 p = kzalloc(16, GFP_KERNEL);
3462 p[16] = 0x12;
3463 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3464 " 0x12->0x%p\n\n", p + 16);
3465
3466 validate_slab_cache(kmalloc_caches + 4);
3467
3468 /* Hmmm... The next two are dangerous */
3469 p = kzalloc(32, GFP_KERNEL);
3470 p[32 + sizeof(void *)] = 0x34;
3471 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3472 " 0x34 -> -0x%p\n", p);
3473 printk(KERN_ERR
3474 "If allocated object is overwritten then not detectable\n\n");
3475
3476 validate_slab_cache(kmalloc_caches + 5);
3477 p = kzalloc(64, GFP_KERNEL);
3478 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3479 *p = 0x56;
3480 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3481 p);
3482 printk(KERN_ERR
3483 "If allocated object is overwritten then not detectable\n\n");
3484 validate_slab_cache(kmalloc_caches + 6);
3485
3486 printk(KERN_ERR "\nB. Corruption after free\n");
3487 p = kzalloc(128, GFP_KERNEL);
3488 kfree(p);
3489 *p = 0x78;
3490 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3491 validate_slab_cache(kmalloc_caches + 7);
3492
3493 p = kzalloc(256, GFP_KERNEL);
3494 kfree(p);
3495 p[50] = 0x9a;
3496 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3497 p);
3498 validate_slab_cache(kmalloc_caches + 8);
3499
3500 p = kzalloc(512, GFP_KERNEL);
3501 kfree(p);
3502 p[512] = 0xab;
3503 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3504 validate_slab_cache(kmalloc_caches + 9);
3505}
3506#else
3507static void resiliency_test(void) {};
3508#endif
3509
3510/* 3479/*
3511 * Generate lists of code addresses where slabcache objects are allocated 3480 * Generate lists of code addresses where slabcache objects are allocated
3512 * and freed. 3481 * and freed.
@@ -3635,7 +3604,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
3635 3604
3636static void process_slab(struct loc_track *t, struct kmem_cache *s, 3605static void process_slab(struct loc_track *t, struct kmem_cache *s,
3637 struct page *page, enum track_item alloc, 3606 struct page *page, enum track_item alloc,
3638 long *map) 3607 unsigned long *map)
3639{ 3608{
3640 void *addr = page_address(page); 3609 void *addr = page_address(page);
3641 void *p; 3610 void *p;
@@ -3735,7 +3704,71 @@ static int list_locations(struct kmem_cache *s, char *buf,
3735 len += sprintf(buf, "No data\n"); 3704 len += sprintf(buf, "No data\n");
3736 return len; 3705 return len;
3737} 3706}
3707#endif
3708
3709#ifdef SLUB_RESILIENCY_TEST
3710static void resiliency_test(void)
3711{
3712 u8 *p;
3738 3713
3714 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10);
3715
3716 printk(KERN_ERR "SLUB resiliency testing\n");
3717 printk(KERN_ERR "-----------------------\n");
3718 printk(KERN_ERR "A. Corruption after allocation\n");
3719
3720 p = kzalloc(16, GFP_KERNEL);
3721 p[16] = 0x12;
3722 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3723 " 0x12->0x%p\n\n", p + 16);
3724
3725 validate_slab_cache(kmalloc_caches[4]);
3726
3727 /* Hmmm... The next two are dangerous */
3728 p = kzalloc(32, GFP_KERNEL);
3729 p[32 + sizeof(void *)] = 0x34;
3730 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3731 " 0x34 -> -0x%p\n", p);
3732 printk(KERN_ERR
3733 "If allocated object is overwritten then not detectable\n\n");
3734
3735 validate_slab_cache(kmalloc_caches[5]);
3736 p = kzalloc(64, GFP_KERNEL);
3737 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3738 *p = 0x56;
3739 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3740 p);
3741 printk(KERN_ERR
3742 "If allocated object is overwritten then not detectable\n\n");
3743 validate_slab_cache(kmalloc_caches[6]);
3744
3745 printk(KERN_ERR "\nB. Corruption after free\n");
3746 p = kzalloc(128, GFP_KERNEL);
3747 kfree(p);
3748 *p = 0x78;
3749 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3750 validate_slab_cache(kmalloc_caches[7]);
3751
3752 p = kzalloc(256, GFP_KERNEL);
3753 kfree(p);
3754 p[50] = 0x9a;
3755 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3756 p);
3757 validate_slab_cache(kmalloc_caches[8]);
3758
3759 p = kzalloc(512, GFP_KERNEL);
3760 kfree(p);
3761 p[512] = 0xab;
3762 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3763 validate_slab_cache(kmalloc_caches[9]);
3764}
3765#else
3766#ifdef CONFIG_SYSFS
3767static void resiliency_test(void) {};
3768#endif
3769#endif
3770
3771#ifdef CONFIG_SYSFS
3739enum slab_stat_type { 3772enum slab_stat_type {
3740 SL_ALL, /* All slabs */ 3773 SL_ALL, /* All slabs */
3741 SL_PARTIAL, /* Only partially allocated slabs */ 3774 SL_PARTIAL, /* Only partially allocated slabs */
@@ -3788,6 +3821,8 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3788 } 3821 }
3789 } 3822 }
3790 3823
3824 down_read(&slub_lock);
3825#ifdef CONFIG_SLUB_DEBUG
3791 if (flags & SO_ALL) { 3826 if (flags & SO_ALL) {
3792 for_each_node_state(node, N_NORMAL_MEMORY) { 3827 for_each_node_state(node, N_NORMAL_MEMORY) {
3793 struct kmem_cache_node *n = get_node(s, node); 3828 struct kmem_cache_node *n = get_node(s, node);
@@ -3804,7 +3839,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3804 nodes[node] += x; 3839 nodes[node] += x;
3805 } 3840 }
3806 3841
3807 } else if (flags & SO_PARTIAL) { 3842 } else
3843#endif
3844 if (flags & SO_PARTIAL) {
3808 for_each_node_state(node, N_NORMAL_MEMORY) { 3845 for_each_node_state(node, N_NORMAL_MEMORY) {
3809 struct kmem_cache_node *n = get_node(s, node); 3846 struct kmem_cache_node *n = get_node(s, node);
3810 3847
@@ -3829,6 +3866,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3829 return x + sprintf(buf + x, "\n"); 3866 return x + sprintf(buf + x, "\n");
3830} 3867}
3831 3868
3869#ifdef CONFIG_SLUB_DEBUG
3832static int any_slab_objects(struct kmem_cache *s) 3870static int any_slab_objects(struct kmem_cache *s)
3833{ 3871{
3834 int node; 3872 int node;
@@ -3844,6 +3882,7 @@ static int any_slab_objects(struct kmem_cache *s)
3844 } 3882 }
3845 return 0; 3883 return 0;
3846} 3884}
3885#endif
3847 3886
3848#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 3887#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3849#define to_slab(n) container_of(n, struct kmem_cache, kobj); 3888#define to_slab(n) container_of(n, struct kmem_cache, kobj);
@@ -3945,12 +3984,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3945} 3984}
3946SLAB_ATTR_RO(aliases); 3985SLAB_ATTR_RO(aliases);
3947 3986
3948static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3949{
3950 return show_slab_objects(s, buf, SO_ALL);
3951}
3952SLAB_ATTR_RO(slabs);
3953
3954static ssize_t partial_show(struct kmem_cache *s, char *buf) 3987static ssize_t partial_show(struct kmem_cache *s, char *buf)
3955{ 3988{
3956 return show_slab_objects(s, buf, SO_PARTIAL); 3989 return show_slab_objects(s, buf, SO_PARTIAL);
@@ -3975,93 +4008,83 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
3975} 4008}
3976SLAB_ATTR_RO(objects_partial); 4009SLAB_ATTR_RO(objects_partial);
3977 4010
3978static ssize_t total_objects_show(struct kmem_cache *s, char *buf) 4011static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3979{
3980 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
3981}
3982SLAB_ATTR_RO(total_objects);
3983
3984static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
3985{ 4012{
3986 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4013 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3987} 4014}
3988 4015
3989static ssize_t sanity_checks_store(struct kmem_cache *s, 4016static ssize_t reclaim_account_store(struct kmem_cache *s,
3990 const char *buf, size_t length) 4017 const char *buf, size_t length)
3991{ 4018{
3992 s->flags &= ~SLAB_DEBUG_FREE; 4019 s->flags &= ~SLAB_RECLAIM_ACCOUNT;
3993 if (buf[0] == '1') 4020 if (buf[0] == '1')
3994 s->flags |= SLAB_DEBUG_FREE; 4021 s->flags |= SLAB_RECLAIM_ACCOUNT;
3995 return length; 4022 return length;
3996} 4023}
3997SLAB_ATTR(sanity_checks); 4024SLAB_ATTR(reclaim_account);
3998 4025
3999static ssize_t trace_show(struct kmem_cache *s, char *buf) 4026static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
4000{ 4027{
4001 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 4028 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
4002} 4029}
4030SLAB_ATTR_RO(hwcache_align);
4003 4031
4004static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4032#ifdef CONFIG_ZONE_DMA
4005 size_t length) 4033static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4006{ 4034{
4007 s->flags &= ~SLAB_TRACE; 4035 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
4008 if (buf[0] == '1')
4009 s->flags |= SLAB_TRACE;
4010 return length;
4011} 4036}
4012SLAB_ATTR(trace); 4037SLAB_ATTR_RO(cache_dma);
4038#endif
4013 4039
4014#ifdef CONFIG_FAILSLAB 4040static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4015static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4016{ 4041{
4017 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); 4042 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
4018} 4043}
4044SLAB_ATTR_RO(destroy_by_rcu);
4019 4045
4020static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4046#ifdef CONFIG_SLUB_DEBUG
4021 size_t length) 4047static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4022{ 4048{
4023 s->flags &= ~SLAB_FAILSLAB; 4049 return show_slab_objects(s, buf, SO_ALL);
4024 if (buf[0] == '1')
4025 s->flags |= SLAB_FAILSLAB;
4026 return length;
4027} 4050}
4028SLAB_ATTR(failslab); 4051SLAB_ATTR_RO(slabs);
4029#endif
4030 4052
4031static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4053static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
4032{ 4054{
4033 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4055 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
4034} 4056}
4057SLAB_ATTR_RO(total_objects);
4035 4058
4036static ssize_t reclaim_account_store(struct kmem_cache *s, 4059static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4037 const char *buf, size_t length)
4038{ 4060{
4039 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 4061 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
4040 if (buf[0] == '1')
4041 s->flags |= SLAB_RECLAIM_ACCOUNT;
4042 return length;
4043} 4062}
4044SLAB_ATTR(reclaim_account);
4045 4063
4046static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 4064static ssize_t sanity_checks_store(struct kmem_cache *s,
4065 const char *buf, size_t length)
4047{ 4066{
4048 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 4067 s->flags &= ~SLAB_DEBUG_FREE;
4068 if (buf[0] == '1')
4069 s->flags |= SLAB_DEBUG_FREE;
4070 return length;
4049} 4071}
4050SLAB_ATTR_RO(hwcache_align); 4072SLAB_ATTR(sanity_checks);
4051 4073
4052#ifdef CONFIG_ZONE_DMA 4074static ssize_t trace_show(struct kmem_cache *s, char *buf)
4053static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4054{ 4075{
4055 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 4076 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
4056} 4077}
4057SLAB_ATTR_RO(cache_dma);
4058#endif
4059 4078
4060static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 4079static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4080 size_t length)
4061{ 4081{
4062 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 4082 s->flags &= ~SLAB_TRACE;
4083 if (buf[0] == '1')
4084 s->flags |= SLAB_TRACE;
4085 return length;
4063} 4086}
4064SLAB_ATTR_RO(destroy_by_rcu); 4087SLAB_ATTR(trace);
4065 4088
4066static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 4089static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
4067{ 4090{
@@ -4139,6 +4162,40 @@ static ssize_t validate_store(struct kmem_cache *s,
4139} 4162}
4140SLAB_ATTR(validate); 4163SLAB_ATTR(validate);
4141 4164
4165static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4166{
4167 if (!(s->flags & SLAB_STORE_USER))
4168 return -ENOSYS;
4169 return list_locations(s, buf, TRACK_ALLOC);
4170}
4171SLAB_ATTR_RO(alloc_calls);
4172
4173static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4174{
4175 if (!(s->flags & SLAB_STORE_USER))
4176 return -ENOSYS;
4177 return list_locations(s, buf, TRACK_FREE);
4178}
4179SLAB_ATTR_RO(free_calls);
4180#endif /* CONFIG_SLUB_DEBUG */
4181
4182#ifdef CONFIG_FAILSLAB
4183static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4184{
4185 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4186}
4187
4188static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4189 size_t length)
4190{
4191 s->flags &= ~SLAB_FAILSLAB;
4192 if (buf[0] == '1')
4193 s->flags |= SLAB_FAILSLAB;
4194 return length;
4195}
4196SLAB_ATTR(failslab);
4197#endif
4198
4142static ssize_t shrink_show(struct kmem_cache *s, char *buf) 4199static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4143{ 4200{
4144 return 0; 4201 return 0;
@@ -4158,22 +4215,6 @@ static ssize_t shrink_store(struct kmem_cache *s,
4158} 4215}
4159SLAB_ATTR(shrink); 4216SLAB_ATTR(shrink);
4160 4217
4161static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4162{
4163 if (!(s->flags & SLAB_STORE_USER))
4164 return -ENOSYS;
4165 return list_locations(s, buf, TRACK_ALLOC);
4166}
4167SLAB_ATTR_RO(alloc_calls);
4168
4169static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4170{
4171 if (!(s->flags & SLAB_STORE_USER))
4172 return -ENOSYS;
4173 return list_locations(s, buf, TRACK_FREE);
4174}
4175SLAB_ATTR_RO(free_calls);
4176
4177#ifdef CONFIG_NUMA 4218#ifdef CONFIG_NUMA
4178static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 4219static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
4179{ 4220{
@@ -4279,25 +4320,27 @@ static struct attribute *slab_attrs[] = {
4279 &min_partial_attr.attr, 4320 &min_partial_attr.attr,
4280 &objects_attr.attr, 4321 &objects_attr.attr,
4281 &objects_partial_attr.attr, 4322 &objects_partial_attr.attr,
4282 &total_objects_attr.attr,
4283 &slabs_attr.attr,
4284 &partial_attr.attr, 4323 &partial_attr.attr,
4285 &cpu_slabs_attr.attr, 4324 &cpu_slabs_attr.attr,
4286 &ctor_attr.attr, 4325 &ctor_attr.attr,
4287 &aliases_attr.attr, 4326 &aliases_attr.attr,
4288 &align_attr.attr, 4327 &align_attr.attr,
4289 &sanity_checks_attr.attr,
4290 &trace_attr.attr,
4291 &hwcache_align_attr.attr, 4328 &hwcache_align_attr.attr,
4292 &reclaim_account_attr.attr, 4329 &reclaim_account_attr.attr,
4293 &destroy_by_rcu_attr.attr, 4330 &destroy_by_rcu_attr.attr,
4331 &shrink_attr.attr,
4332#ifdef CONFIG_SLUB_DEBUG
4333 &total_objects_attr.attr,
4334 &slabs_attr.attr,
4335 &sanity_checks_attr.attr,
4336 &trace_attr.attr,
4294 &red_zone_attr.attr, 4337 &red_zone_attr.attr,
4295 &poison_attr.attr, 4338 &poison_attr.attr,
4296 &store_user_attr.attr, 4339 &store_user_attr.attr,
4297 &validate_attr.attr, 4340 &validate_attr.attr,
4298 &shrink_attr.attr,
4299 &alloc_calls_attr.attr, 4341 &alloc_calls_attr.attr,
4300 &free_calls_attr.attr, 4342 &free_calls_attr.attr,
4343#endif
4301#ifdef CONFIG_ZONE_DMA 4344#ifdef CONFIG_ZONE_DMA
4302 &cache_dma_attr.attr, 4345 &cache_dma_attr.attr,
4303#endif 4346#endif
@@ -4377,6 +4420,7 @@ static void kmem_cache_release(struct kobject *kobj)
4377{ 4420{
4378 struct kmem_cache *s = to_slab(kobj); 4421 struct kmem_cache *s = to_slab(kobj);
4379 4422
4423 kfree(s->name);
4380 kfree(s); 4424 kfree(s);
4381} 4425}
4382 4426
@@ -4579,7 +4623,7 @@ static int __init slab_sysfs_init(void)
4579} 4623}
4580 4624
4581__initcall(slab_sysfs_init); 4625__initcall(slab_sysfs_init);
4582#endif 4626#endif /* CONFIG_SYSFS */
4583 4627
4584/* 4628/*
4585 * The /proc/slabinfo ABI 4629 * The /proc/slabinfo ABI
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index aa33fd67fa41..29d6cbffb283 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -220,18 +220,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
220 220
221 if (vmemmap_buf_start) { 221 if (vmemmap_buf_start) {
222 /* need to free left buf */ 222 /* need to free left buf */
223#ifdef CONFIG_NO_BOOTMEM
224 free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
225 if (vmemmap_buf_start < vmemmap_buf) {
226 char name[15];
227
228 snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
229 reserve_early_without_check(__pa(vmemmap_buf_start),
230 __pa(vmemmap_buf), name);
231 }
232#else
233 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); 223 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
234#endif
235 vmemmap_buf = NULL; 224 vmemmap_buf = NULL;
236 vmemmap_buf_end = NULL; 225 vmemmap_buf_end = NULL;
237 } 226 }
diff --git a/mm/swap.c b/mm/swap.c
index 3ce7bc373a52..3f4854205b16 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -378,6 +378,7 @@ void release_pages(struct page **pages, int nr, int cold)
378 378
379 pagevec_free(&pages_to_free); 379 pagevec_free(&pages_to_free);
380} 380}
381EXPORT_SYMBOL(release_pages);
381 382
382/* 383/*
383 * The pages which we're about to release may be in the deferred lru-addition 384 * The pages which we're about to release may be in the deferred lru-addition
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7c703ff2f36f..67ddaaf98c74 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -30,6 +30,7 @@
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/poll.h>
33 34
34#include <asm/pgtable.h> 35#include <asm/pgtable.h>
35#include <asm/tlbflush.h> 36#include <asm/tlbflush.h>
@@ -58,6 +59,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES];
58 59
59static DEFINE_MUTEX(swapon_mutex); 60static DEFINE_MUTEX(swapon_mutex);
60 61
62static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
63/* Activity counter to indicate that a swapon or swapoff has occurred */
64static atomic_t proc_poll_event = ATOMIC_INIT(0);
65
61static inline unsigned char swap_count(unsigned char ent) 66static inline unsigned char swap_count(unsigned char ent)
62{ 67{
63 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ 68 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -139,7 +144,7 @@ static int discard_swap(struct swap_info_struct *si)
139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 144 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140 if (nr_blocks) { 145 if (nr_blocks) {
141 err = blkdev_issue_discard(si->bdev, start_block, 146 err = blkdev_issue_discard(si->bdev, start_block,
142 nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); 147 nr_blocks, GFP_KERNEL, 0);
143 if (err) 148 if (err)
144 return err; 149 return err;
145 cond_resched(); 150 cond_resched();
@@ -150,7 +155,7 @@ static int discard_swap(struct swap_info_struct *si)
150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 155 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
151 156
152 err = blkdev_issue_discard(si->bdev, start_block, 157 err = blkdev_issue_discard(si->bdev, start_block,
153 nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); 158 nr_blocks, GFP_KERNEL, 0);
154 if (err) 159 if (err)
155 break; 160 break;
156 161
@@ -189,7 +194,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
189 start_block <<= PAGE_SHIFT - 9; 194 start_block <<= PAGE_SHIFT - 9;
190 nr_blocks <<= PAGE_SHIFT - 9; 195 nr_blocks <<= PAGE_SHIFT - 9;
191 if (blkdev_issue_discard(si->bdev, start_block, 196 if (blkdev_issue_discard(si->bdev, start_block,
192 nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT)) 197 nr_blocks, GFP_NOIO, 0))
193 break; 198 break;
194 } 199 }
195 200
@@ -1680,6 +1685,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1680 } 1685 }
1681 filp_close(swap_file, NULL); 1686 filp_close(swap_file, NULL);
1682 err = 0; 1687 err = 0;
1688 atomic_inc(&proc_poll_event);
1689 wake_up_interruptible(&proc_poll_wait);
1683 1690
1684out_dput: 1691out_dput:
1685 filp_close(victim, NULL); 1692 filp_close(victim, NULL);
@@ -1688,6 +1695,25 @@ out:
1688} 1695}
1689 1696
1690#ifdef CONFIG_PROC_FS 1697#ifdef CONFIG_PROC_FS
1698struct proc_swaps {
1699 struct seq_file seq;
1700 int event;
1701};
1702
1703static unsigned swaps_poll(struct file *file, poll_table *wait)
1704{
1705 struct proc_swaps *s = file->private_data;
1706
1707 poll_wait(file, &proc_poll_wait, wait);
1708
1709 if (s->event != atomic_read(&proc_poll_event)) {
1710 s->event = atomic_read(&proc_poll_event);
1711 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1712 }
1713
1714 return POLLIN | POLLRDNORM;
1715}
1716
1691/* iterator */ 1717/* iterator */
1692static void *swap_start(struct seq_file *swap, loff_t *pos) 1718static void *swap_start(struct seq_file *swap, loff_t *pos)
1693{ 1719{
@@ -1771,7 +1797,24 @@ static const struct seq_operations swaps_op = {
1771 1797
1772static int swaps_open(struct inode *inode, struct file *file) 1798static int swaps_open(struct inode *inode, struct file *file)
1773{ 1799{
1774 return seq_open(file, &swaps_op); 1800 struct proc_swaps *s;
1801 int ret;
1802
1803 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1804 if (!s)
1805 return -ENOMEM;
1806
1807 file->private_data = s;
1808
1809 ret = seq_open(file, &swaps_op);
1810 if (ret) {
1811 kfree(s);
1812 return ret;
1813 }
1814
1815 s->seq.private = s;
1816 s->event = atomic_read(&proc_poll_event);
1817 return ret;
1775} 1818}
1776 1819
1777static const struct file_operations proc_swaps_operations = { 1820static const struct file_operations proc_swaps_operations = {
@@ -1779,6 +1822,7 @@ static const struct file_operations proc_swaps_operations = {
1779 .read = seq_read, 1822 .read = seq_read,
1780 .llseek = seq_lseek, 1823 .llseek = seq_lseek,
1781 .release = seq_release, 1824 .release = seq_release,
1825 .poll = swaps_poll,
1782}; 1826};
1783 1827
1784static int __init procswaps_init(void) 1828static int __init procswaps_init(void)
@@ -2084,6 +2128,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2084 swap_info[prev]->next = type; 2128 swap_info[prev]->next = type;
2085 spin_unlock(&swap_lock); 2129 spin_unlock(&swap_lock);
2086 mutex_unlock(&swapon_mutex); 2130 mutex_unlock(&swapon_mutex);
2131 atomic_inc(&proc_poll_event);
2132 wake_up_interruptible(&proc_poll_wait);
2133
2087 error = 0; 2134 error = 0;
2088 goto out; 2135 goto out;
2089bad_swap: 2136bad_swap:
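
The swapfile.c changes above add an event counter and a wait queue so that /proc/swaps becomes pollable: every swapon()/swapoff() bumps proc_poll_event and wakes the queue, and swaps_poll() reports POLLERR | POLLPRI to any opener whose recorded event count is stale. A minimal userspace watcher along those lines (a sketch with error handling trimmed; it relies only on standard open/poll/lseek/read semantics):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/proc/swaps", O_RDONLY);

	if (fd < 0)
		return 1;

	for (;;) {
		/* Dump the current swap table (truncated at 4 KiB here). */
		lseek(fd, 0, SEEK_SET);
		ssize_t n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);
		}

		/* Block until swaps_poll() flags a swapon/swapoff event. */
		struct pollfd pfd = { .fd = fd, .events = POLLPRI };
		if (poll(&pfd, 1, -1) < 0)
			break;
	}
	close(fd);
	return 0;
}
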
diff --git a/mm/util.c b/mm/util.c
index 4735ea481816..73dac81e9f78 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -245,6 +245,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
245} 245}
246#endif 246#endif
247 247
248/*
249 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
250 * back to the regular GUP.
251 * If the architecture not support this fucntion, simply return with no
252 * page pinned
253 */
254int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
255 int nr_pages, int write, struct page **pages)
256{
257 return 0;
258}
259EXPORT_SYMBOL_GPL(__get_user_pages_fast);
260
248/** 261/**
249 * get_user_pages_fast() - pin user pages in memory 262 * get_user_pages_fast() - pin user pages in memory
250 * @start: starting user address 263 * @start: starting user address
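
The mm/util.c hunk above adds __get_user_pages_fast() as a weak symbol: architectures without a lockless fast-GUP path inherit a stub that pins nothing, so callers fall back to the regular get_user_pages() route, while architectures that do implement it override the stub at link time. The weak/strong override mechanism itself is easy to see in a userspace sketch (function name invented; GCC/Clang __attribute__((weak)) assumed):

#include <stdio.h>

/* Weak default: reports that zero pages were pinned. */
__attribute__((weak)) int fast_pin_pages(int nr_pages)
{
	return 0;
}

/*
 * A build that provides the fast path would define a strong symbol with the
 * same signature in another object file, and the linker would prefer it:
 *
 *	int fast_pin_pages(int nr_pages) { ... return nr_pages; }
 */

int main(void)
{
	if (fast_pin_pages(8) == 0)
		printf("fast path unavailable, falling back to the slow path\n");
	return 0;
}
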
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b8889da69a6..a3d66b3dc5cb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -293,13 +293,13 @@ static void __insert_vmap_area(struct vmap_area *va)
293 struct rb_node *tmp; 293 struct rb_node *tmp;
294 294
295 while (*p) { 295 while (*p) {
296 struct vmap_area *tmp; 296 struct vmap_area *tmp_va;
297 297
298 parent = *p; 298 parent = *p;
299 tmp = rb_entry(parent, struct vmap_area, rb_node); 299 tmp_va = rb_entry(parent, struct vmap_area, rb_node);
300 if (va->va_start < tmp->va_end) 300 if (va->va_start < tmp_va->va_end)
301 p = &(*p)->rb_left; 301 p = &(*p)->rb_left;
302 else if (va->va_end > tmp->va_start) 302 else if (va->va_end > tmp_va->va_start)
303 p = &(*p)->rb_right; 303 p = &(*p)->rb_right;
304 else 304 else
305 BUG(); 305 BUG();
@@ -517,6 +517,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
517static void purge_fragmented_blocks_allcpus(void); 517static void purge_fragmented_blocks_allcpus(void);
518 518
519/* 519/*
520 * called before a call to iounmap() if the caller wants vm_area_struct's
521 * immediately freed.
522 */
523void set_iounmap_nonlazy(void)
524{
525 atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
526}
527
528/*
520 * Purges all lazily-freed vmap areas. 529 * Purges all lazily-freed vmap areas.
521 * 530 *
522 * If sync is 0 then don't purge if there is already a purge in progress. 531 * If sync is 0 then don't purge if there is already a purge in progress.
@@ -1587,6 +1596,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1587} 1596}
1588EXPORT_SYMBOL(__vmalloc); 1597EXPORT_SYMBOL(__vmalloc);
1589 1598
1599static inline void *__vmalloc_node_flags(unsigned long size,
1600 int node, gfp_t flags)
1601{
1602 return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
1603 node, __builtin_return_address(0));
1604}
1605
1590/** 1606/**
1591 * vmalloc - allocate virtually contiguous memory 1607 * vmalloc - allocate virtually contiguous memory
1592 * @size: allocation size 1608 * @size: allocation size
@@ -1598,12 +1614,28 @@ EXPORT_SYMBOL(__vmalloc);
1598 */ 1614 */
1599void *vmalloc(unsigned long size) 1615void *vmalloc(unsigned long size)
1600{ 1616{
1601 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1617 return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
1602 -1, __builtin_return_address(0));
1603} 1618}
1604EXPORT_SYMBOL(vmalloc); 1619EXPORT_SYMBOL(vmalloc);
1605 1620
1606/** 1621/**
1622 * vzalloc - allocate virtually contiguous memory with zero fill
1623 * @size: allocation size
1624 * Allocate enough pages to cover @size from the page level
1625 * allocator and map them into contiguous kernel virtual space.
1626 * The memory allocated is set to zero.
1627 *
1628 * For tight control over page level allocator and protection flags
1629 * use __vmalloc() instead.
1630 */
1631void *vzalloc(unsigned long size)
1632{
1633 return __vmalloc_node_flags(size, -1,
1634 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1635}
1636EXPORT_SYMBOL(vzalloc);
1637
1638/**
1607 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 1639 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
1608 * @size: allocation size 1640 * @size: allocation size
1609 * 1641 *
@@ -1644,6 +1676,25 @@ void *vmalloc_node(unsigned long size, int node)
1644} 1676}
1645EXPORT_SYMBOL(vmalloc_node); 1677EXPORT_SYMBOL(vmalloc_node);
1646 1678
1679/**
1680 * vzalloc_node - allocate memory on a specific node with zero fill
1681 * @size: allocation size
1682 * @node: numa node
1683 *
1684 * Allocate enough pages to cover @size from the page level
1685 * allocator and map them into contiguous kernel virtual space.
1686 * The memory allocated is set to zero.
1687 *
1688 * For tight control over page level allocator and protection flags
1689 * use __vmalloc_node() instead.
1690 */
1691void *vzalloc_node(unsigned long size, int node)
1692{
1693 return __vmalloc_node_flags(size, node,
1694 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1695}
1696EXPORT_SYMBOL(vzalloc_node);
1697
1647#ifndef PAGE_KERNEL_EXEC 1698#ifndef PAGE_KERNEL_EXEC
1648# define PAGE_KERNEL_EXEC PAGE_KERNEL 1699# define PAGE_KERNEL_EXEC PAGE_KERNEL
1649#endif 1700#endif
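
vzalloc() and vzalloc_node() above are thin wrappers that route through __vmalloc_node_flags() with __GFP_ZERO set, replacing the common vmalloc()-followed-by-memset() pattern in callers. An uncompiled kernel-module-style sketch of how a caller might use the new helper (module name and allocation size invented for illustration):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

static void *table;

static int __init vzalloc_demo_init(void)
{
	/* One virtually contiguous, pre-zeroed megabyte; previously this
	 * would have been vmalloc(1UL << 20) plus an explicit memset(). */
	table = vzalloc(1UL << 20);
	if (!table)
		return -ENOMEM;
	return 0;
}

static void __exit vzalloc_demo_exit(void)
{
	vfree(table);
}

module_init(vzalloc_demo_init);
module_exit(vzalloc_demo_exit);
MODULE_LICENSE("GPL");
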
@@ -2056,6 +2107,7 @@ void free_vm_area(struct vm_struct *area)
2056} 2107}
2057EXPORT_SYMBOL_GPL(free_vm_area); 2108EXPORT_SYMBOL_GPL(free_vm_area);
2058 2109
2110#ifdef CONFIG_SMP
2059static struct vmap_area *node_to_va(struct rb_node *n) 2111static struct vmap_area *node_to_va(struct rb_node *n)
2060{ 2112{
2061 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; 2113 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
@@ -2336,9 +2388,11 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2336 free_vm_area(vms[i]); 2388 free_vm_area(vms[i]);
2337 kfree(vms); 2389 kfree(vms);
2338} 2390}
2391#endif /* CONFIG_SMP */
2339 2392
2340#ifdef CONFIG_PROC_FS 2393#ifdef CONFIG_PROC_FS
2341static void *s_start(struct seq_file *m, loff_t *pos) 2394static void *s_start(struct seq_file *m, loff_t *pos)
2395 __acquires(&vmlist_lock)
2342{ 2396{
2343 loff_t n = *pos; 2397 loff_t n = *pos;
2344 struct vm_struct *v; 2398 struct vm_struct *v;
@@ -2365,6 +2419,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2365} 2419}
2366 2420
2367static void s_stop(struct seq_file *m, void *p) 2421static void s_stop(struct seq_file *m, void *p)
2422 __releases(&vmlist_lock)
2368{ 2423{
2369 read_unlock(&vmlist_lock); 2424 read_unlock(&vmlist_lock);
2370} 2425}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf25f11..b8a6fdc21312 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -51,6 +51,12 @@
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h> 52#include <trace/events/vmscan.h>
53 53
54enum lumpy_mode {
55 LUMPY_MODE_NONE,
56 LUMPY_MODE_ASYNC,
57 LUMPY_MODE_SYNC,
58};
59
54struct scan_control { 60struct scan_control {
55 /* Incremented by the number of inactive pages that were scanned */ 61 /* Incremented by the number of inactive pages that were scanned */
56 unsigned long nr_scanned; 62 unsigned long nr_scanned;
@@ -79,10 +85,10 @@ struct scan_control {
79 int order; 85 int order;
80 86
81 /* 87 /*
82 * Intend to reclaim enough contenious memory rather than to reclaim 88 * Intend to reclaim enough continuous memory rather than reclaim
83 * enough amount memory. I.e, it's the mode for high order allocation. 89 * enough amount of memory. i.e, mode for high order allocation.
84 */ 90 */
85 bool lumpy_reclaim_mode; 91 enum lumpy_mode lumpy_reclaim_mode;
86 92
87 /* Which cgroup do we reclaim from */ 93 /* Which cgroup do we reclaim from */
88 struct mem_cgroup *mem_cgroup; 94 struct mem_cgroup *mem_cgroup;
@@ -265,6 +271,36 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
265 return ret; 271 return ret;
266} 272}
267 273
274static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
275 bool sync)
276{
277 enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
278
279 /*
280	 * Some reclaim attempts have already failed. It is not worth trying synchronous
281 * lumpy reclaim.
282 */
283 if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
284 return;
285
286 /*
287 * If we need a large contiguous chunk of memory, or have
288 * trouble getting a small set of contiguous pages, we
289 * will reclaim both active and inactive pages.
290 */
291 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
292 sc->lumpy_reclaim_mode = mode;
293 else if (sc->order && priority < DEF_PRIORITY - 2)
294 sc->lumpy_reclaim_mode = mode;
295 else
296 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
297}
298
299static void disable_lumpy_reclaim_mode(struct scan_control *sc)
300{
301 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
302}
303
268static inline int is_page_cache_freeable(struct page *page) 304static inline int is_page_cache_freeable(struct page *page)
269{ 305{
270 /* 306 /*
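
The lumpy_mode enum introduced above replaces the old boolean flag: async lumpy reclaim is engaged when the allocation being serviced is a costly high-order one, or when a smaller high-order request is still failing at low priority, and the sync variant is only ever an escalation of an already-active async mode. A standalone sketch of that decision, with PAGE_ALLOC_COSTLY_ORDER = 3 and DEF_PRIORITY = 12 assumed as the usual kernel values:

#include <stdio.h>

enum lumpy_mode { LUMPY_MODE_NONE, LUMPY_MODE_ASYNC, LUMPY_MODE_SYNC };

#define PAGE_ALLOC_COSTLY_ORDER	3	/* assumed kernel value */
#define DEF_PRIORITY		12	/* assumed kernel value */

static enum lumpy_mode pick_mode(int order, int priority, int sync,
				 enum lumpy_mode current_mode)
{
	enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;

	/* Sync is only an escalation: if async lumpy reclaim was never
	 * engaged, a synchronous retry is not worth it. */
	if (sync && current_mode == LUMPY_MODE_NONE)
		return current_mode;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return mode;		/* large contiguous chunk needed */
	if (order && priority < DEF_PRIORITY - 2)
		return mode;		/* small order, but reclaim is struggling */
	return LUMPY_MODE_NONE;
}

int main(void)
{
	printf("order 0, prio 12: mode %d\n", pick_mode(0, 12, 0, LUMPY_MODE_NONE));
	printf("order 4, prio 12: mode %d\n", pick_mode(4, 12, 0, LUMPY_MODE_NONE));
	printf("order 2, prio  8: mode %d\n", pick_mode(2, 8, 0, LUMPY_MODE_NONE));
	printf("order 2, prio  8, sync retry: mode %d\n",
	       pick_mode(2, 8, 1, LUMPY_MODE_ASYNC));
	return 0;
}
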
@@ -275,7 +311,8 @@ static inline int is_page_cache_freeable(struct page *page)
275 return page_count(page) - page_has_private(page) == 2; 311 return page_count(page) - page_has_private(page) == 2;
276} 312}
277 313
278static int may_write_to_queue(struct backing_dev_info *bdi) 314static int may_write_to_queue(struct backing_dev_info *bdi,
315 struct scan_control *sc)
279{ 316{
280 if (current->flags & PF_SWAPWRITE) 317 if (current->flags & PF_SWAPWRITE)
281 return 1; 318 return 1;
@@ -283,6 +320,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
283 return 1; 320 return 1;
284 if (bdi == current->backing_dev_info) 321 if (bdi == current->backing_dev_info)
285 return 1; 322 return 1;
323
324 /* lumpy reclaim for hugepage often need a lot of write */
325 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
326 return 1;
286 return 0; 327 return 0;
287} 328}
288 329
@@ -307,12 +348,6 @@ static void handle_write_error(struct address_space *mapping,
307 unlock_page(page); 348 unlock_page(page);
308} 349}
309 350
310/* Request for sync pageout. */
311enum pageout_io {
312 PAGEOUT_IO_ASYNC,
313 PAGEOUT_IO_SYNC,
314};
315
316/* possible outcome of pageout() */ 351/* possible outcome of pageout() */
317typedef enum { 352typedef enum {
318 /* failed to write page out, page is locked */ 353 /* failed to write page out, page is locked */
@@ -330,7 +365,7 @@ typedef enum {
330 * Calls ->writepage(). 365 * Calls ->writepage().
331 */ 366 */
332static pageout_t pageout(struct page *page, struct address_space *mapping, 367static pageout_t pageout(struct page *page, struct address_space *mapping,
333 enum pageout_io sync_writeback) 368 struct scan_control *sc)
334{ 369{
335 /* 370 /*
336 * If the page is dirty, only perform writeback if that write 371 * If the page is dirty, only perform writeback if that write
@@ -366,7 +401,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
366 } 401 }
367 if (mapping->a_ops->writepage == NULL) 402 if (mapping->a_ops->writepage == NULL)
368 return PAGE_ACTIVATE; 403 return PAGE_ACTIVATE;
369 if (!may_write_to_queue(mapping->backing_dev_info)) 404 if (!may_write_to_queue(mapping->backing_dev_info, sc))
370 return PAGE_KEEP; 405 return PAGE_KEEP;
371 406
372 if (clear_page_dirty_for_io(page)) { 407 if (clear_page_dirty_for_io(page)) {
@@ -376,7 +411,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
376 .nr_to_write = SWAP_CLUSTER_MAX, 411 .nr_to_write = SWAP_CLUSTER_MAX,
377 .range_start = 0, 412 .range_start = 0,
378 .range_end = LLONG_MAX, 413 .range_end = LLONG_MAX,
379 .nonblocking = 1,
380 .for_reclaim = 1, 414 .for_reclaim = 1,
381 }; 415 };
382 416
@@ -394,7 +428,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
394 * direct reclaiming a large contiguous area and the 428 * direct reclaiming a large contiguous area and the
395 * first attempt to free a range of pages fails. 429 * first attempt to free a range of pages fails.
396 */ 430 */
397 if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) 431 if (PageWriteback(page) &&
432 sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC)
398 wait_on_page_writeback(page); 433 wait_on_page_writeback(page);
399 434
400 if (!PageWriteback(page)) { 435 if (!PageWriteback(page)) {
@@ -402,7 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
402 ClearPageReclaim(page); 437 ClearPageReclaim(page);
403 } 438 }
404 trace_mm_vmscan_writepage(page, 439 trace_mm_vmscan_writepage(page,
405 trace_reclaim_flags(page, sync_writeback)); 440 trace_reclaim_flags(page, sc->lumpy_reclaim_mode));
406 inc_zone_page_state(page, NR_VMSCAN_WRITE); 441 inc_zone_page_state(page, NR_VMSCAN_WRITE);
407 return PAGE_SUCCESS; 442 return PAGE_SUCCESS;
408 } 443 }
@@ -580,7 +615,7 @@ static enum page_references page_check_references(struct page *page,
580 referenced_page = TestClearPageReferenced(page); 615 referenced_page = TestClearPageReferenced(page);
581 616
582 /* Lumpy reclaim - ignore references */ 617 /* Lumpy reclaim - ignore references */
583 if (sc->lumpy_reclaim_mode) 618 if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE)
584 return PAGEREF_RECLAIM; 619 return PAGEREF_RECLAIM;
585 620
586 /* 621 /*
@@ -616,7 +651,7 @@ static enum page_references page_check_references(struct page *page,
616 } 651 }
617 652
618 /* Reclaim if clean, defer dirty pages to writeback */ 653 /* Reclaim if clean, defer dirty pages to writeback */
619 if (referenced_page) 654 if (referenced_page && !PageSwapBacked(page))
620 return PAGEREF_RECLAIM_CLEAN; 655 return PAGEREF_RECLAIM_CLEAN;
621 656
622 return PAGEREF_RECLAIM; 657 return PAGEREF_RECLAIM;
@@ -644,12 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
644 * shrink_page_list() returns the number of reclaimed pages 679 * shrink_page_list() returns the number of reclaimed pages
645 */ 680 */
646static unsigned long shrink_page_list(struct list_head *page_list, 681static unsigned long shrink_page_list(struct list_head *page_list,
647 struct scan_control *sc, 682 struct zone *zone,
648 enum pageout_io sync_writeback) 683 struct scan_control *sc)
649{ 684{
650 LIST_HEAD(ret_pages); 685 LIST_HEAD(ret_pages);
651 LIST_HEAD(free_pages); 686 LIST_HEAD(free_pages);
652 int pgactivate = 0; 687 int pgactivate = 0;
688 unsigned long nr_dirty = 0;
689 unsigned long nr_congested = 0;
653 unsigned long nr_reclaimed = 0; 690 unsigned long nr_reclaimed = 0;
654 691
655 cond_resched(); 692 cond_resched();
@@ -669,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
669 goto keep; 706 goto keep;
670 707
671 VM_BUG_ON(PageActive(page)); 708 VM_BUG_ON(PageActive(page));
709 VM_BUG_ON(page_zone(page) != zone);
672 710
673 sc->nr_scanned++; 711 sc->nr_scanned++;
674 712
@@ -694,10 +732,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
694 * for any page for which writeback has already 732 * for any page for which writeback has already
695 * started. 733 * started.
696 */ 734 */
697 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) 735 if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC &&
736 may_enter_fs)
698 wait_on_page_writeback(page); 737 wait_on_page_writeback(page);
699 else 738 else {
700 goto keep_locked; 739 unlock_page(page);
740 goto keep_lumpy;
741 }
701 } 742 }
702 743
703 references = page_check_references(page, sc); 744 references = page_check_references(page, sc);
@@ -743,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
743 } 784 }
744 785
745 if (PageDirty(page)) { 786 if (PageDirty(page)) {
787 nr_dirty++;
788
746 if (references == PAGEREF_RECLAIM_CLEAN) 789 if (references == PAGEREF_RECLAIM_CLEAN)
747 goto keep_locked; 790 goto keep_locked;
748 if (!may_enter_fs) 791 if (!may_enter_fs)
@@ -751,14 +794,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
751 goto keep_locked; 794 goto keep_locked;
752 795
753 /* Page is dirty, try to write it out here */ 796 /* Page is dirty, try to write it out here */
754 switch (pageout(page, mapping, sync_writeback)) { 797 switch (pageout(page, mapping, sc)) {
755 case PAGE_KEEP: 798 case PAGE_KEEP:
799 nr_congested++;
756 goto keep_locked; 800 goto keep_locked;
757 case PAGE_ACTIVATE: 801 case PAGE_ACTIVATE:
758 goto activate_locked; 802 goto activate_locked;
759 case PAGE_SUCCESS: 803 case PAGE_SUCCESS:
760 if (PageWriteback(page) || PageDirty(page)) 804 if (PageWriteback(page))
805 goto keep_lumpy;
806 if (PageDirty(page))
761 goto keep; 807 goto keep;
808
762 /* 809 /*
763 * A synchronous write - probably a ramdisk. Go 810 * A synchronous write - probably a ramdisk. Go
764 * ahead and try to reclaim the page. 811 * ahead and try to reclaim the page.
@@ -841,6 +888,7 @@ cull_mlocked:
841 try_to_free_swap(page); 888 try_to_free_swap(page);
842 unlock_page(page); 889 unlock_page(page);
843 putback_lru_page(page); 890 putback_lru_page(page);
891 disable_lumpy_reclaim_mode(sc);
844 continue; 892 continue;
845 893
846activate_locked: 894activate_locked:
@@ -853,10 +901,21 @@ activate_locked:
853keep_locked: 901keep_locked:
854 unlock_page(page); 902 unlock_page(page);
855keep: 903keep:
904 disable_lumpy_reclaim_mode(sc);
905keep_lumpy:
856 list_add(&page->lru, &ret_pages); 906 list_add(&page->lru, &ret_pages);
857 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 907 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
858 } 908 }
859 909
910 /*
911 * Tag a zone as congested if all the dirty pages encountered were
912 * backed by a congested BDI. In this case, reclaimers should just
913 * back off and wait for congestion to clear because further reclaim
914 * will encounter the same problem
915 */
916 if (nr_dirty == nr_congested)
917 zone_set_flag(zone, ZONE_CONGESTED);
918
860 free_page_list(&free_pages); 919 free_page_list(&free_pages);
861 920
862 list_splice(&ret_pages, page_list); 921 list_splice(&ret_pages, page_list);
@@ -1006,7 +1065,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1006 1065
1007 /* Check that we have not crossed a zone boundary. */ 1066 /* Check that we have not crossed a zone boundary. */
1008 if (unlikely(page_zone_id(cursor_page) != zone_id)) 1067 if (unlikely(page_zone_id(cursor_page) != zone_id))
1009 continue; 1068 break;
1010 1069
1011 /* 1070 /*
1012 * If we don't have enough swap space, reclaiming of 1071 * If we don't have enough swap space, reclaiming of
@@ -1014,8 +1073,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1014 * pointless. 1073 * pointless.
1015 */ 1074 */
1016 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1075 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
1017 !PageSwapCache(cursor_page)) 1076 !PageSwapCache(cursor_page))
1018 continue; 1077 break;
1019 1078
1020 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1079 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1021 list_move(&cursor_page->lru, dst); 1080 list_move(&cursor_page->lru, dst);
@@ -1026,11 +1085,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1026 nr_lumpy_dirty++; 1085 nr_lumpy_dirty++;
1027 scan++; 1086 scan++;
1028 } else { 1087 } else {
1029 if (mode == ISOLATE_BOTH && 1088 /* the page is freed already. */
1030 page_count(cursor_page)) 1089 if (!page_count(cursor_page))
1031 nr_lumpy_failed++; 1090 continue;
1091 break;
1032 } 1092 }
1033 } 1093 }
1094
1095 /* If we break out of the loop above, lumpy reclaim failed */
1096 if (pfn < end_pfn)
1097 nr_lumpy_failed++;
1034 } 1098 }
1035 1099
1036 *scanned = scan; 1100 *scanned = scan;
@@ -1253,7 +1317,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1253 return false; 1317 return false;
1254 1318
1255 /* Only stall on lumpy reclaim */ 1319 /* Only stall on lumpy reclaim */
1256 if (!sc->lumpy_reclaim_mode) 1320 if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
1257 return false; 1321 return false;
1258 1322
1259	/* If we have reclaimed everything on the isolated list, no stall */ 1323
@@ -1286,7 +1350,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1286 unsigned long nr_scanned; 1350 unsigned long nr_scanned;
1287 unsigned long nr_reclaimed = 0; 1351 unsigned long nr_reclaimed = 0;
1288 unsigned long nr_taken; 1352 unsigned long nr_taken;
1289 unsigned long nr_active;
1290 unsigned long nr_anon; 1353 unsigned long nr_anon;
1291 unsigned long nr_file; 1354 unsigned long nr_file;
1292 1355
@@ -1298,15 +1361,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1298 return SWAP_CLUSTER_MAX; 1361 return SWAP_CLUSTER_MAX;
1299 } 1362 }
1300 1363
1301 1364 set_lumpy_reclaim_mode(priority, sc, false);
1302 lru_add_drain(); 1365 lru_add_drain();
1303 spin_lock_irq(&zone->lru_lock); 1366 spin_lock_irq(&zone->lru_lock);
1304 1367
1305 if (scanning_global_lru(sc)) { 1368 if (scanning_global_lru(sc)) {
1306 nr_taken = isolate_pages_global(nr_to_scan, 1369 nr_taken = isolate_pages_global(nr_to_scan,
1307 &page_list, &nr_scanned, sc->order, 1370 &page_list, &nr_scanned, sc->order,
1308 sc->lumpy_reclaim_mode ? 1371 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
1309 ISOLATE_BOTH : ISOLATE_INACTIVE, 1372 ISOLATE_INACTIVE : ISOLATE_BOTH,
1310 zone, 0, file); 1373 zone, 0, file);
1311 zone->pages_scanned += nr_scanned; 1374 zone->pages_scanned += nr_scanned;
1312 if (current_is_kswapd()) 1375 if (current_is_kswapd())
@@ -1318,8 +1381,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1318 } else { 1381 } else {
1319 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1382 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1320 &page_list, &nr_scanned, sc->order, 1383 &page_list, &nr_scanned, sc->order,
1321 sc->lumpy_reclaim_mode ? 1384 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
1322 ISOLATE_BOTH : ISOLATE_INACTIVE, 1385 ISOLATE_INACTIVE : ISOLATE_BOTH,
1323 zone, sc->mem_cgroup, 1386 zone, sc->mem_cgroup,
1324 0, file); 1387 0, file);
1325 /* 1388 /*
@@ -1337,20 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1337 1400
1338 spin_unlock_irq(&zone->lru_lock); 1401 spin_unlock_irq(&zone->lru_lock);
1339 1402
1340 nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1403 nr_reclaimed = shrink_page_list(&page_list, zone, sc);
1341 1404
1342	/* Check if we should synchronously wait for writeback */ 1405
1343 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1406 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1344 congestion_wait(BLK_RW_ASYNC, HZ/10); 1407 set_lumpy_reclaim_mode(priority, sc, true);
1345 1408 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1346 /*
1347 * The attempt at page out may have made some
1348 * of the pages active, mark them inactive again.
1349 */
1350 nr_active = clear_active_flags(&page_list, NULL);
1351 count_vm_events(PGDEACTIVATE, nr_active);
1352
1353 nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
1354 } 1409 }
1355 1410
1356 local_irq_disable(); 1411 local_irq_disable();
@@ -1359,6 +1414,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1359 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1414 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1360 1415
1361 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1416 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1417
1418 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1419 zone_idx(zone),
1420 nr_scanned, nr_reclaimed,
1421 priority,
1422 trace_shrink_flags(file, sc->lumpy_reclaim_mode));
1362 return nr_reclaimed; 1423 return nr_reclaimed;
1363} 1424}
1364 1425
@@ -1506,6 +1567,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1506 spin_unlock_irq(&zone->lru_lock); 1567 spin_unlock_irq(&zone->lru_lock);
1507} 1568}
1508 1569
1570#ifdef CONFIG_SWAP
1509static int inactive_anon_is_low_global(struct zone *zone) 1571static int inactive_anon_is_low_global(struct zone *zone)
1510{ 1572{
1511 unsigned long active, inactive; 1573 unsigned long active, inactive;
@@ -1531,12 +1593,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1531{ 1593{
1532 int low; 1594 int low;
1533 1595
1596 /*
1597 * If we don't have swap space, anonymous page deactivation
1598 * is pointless.
1599 */
1600 if (!total_swap_pages)
1601 return 0;
1602
1534 if (scanning_global_lru(sc)) 1603 if (scanning_global_lru(sc))
1535 low = inactive_anon_is_low_global(zone); 1604 low = inactive_anon_is_low_global(zone);
1536 else 1605 else
1537 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1606 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1538 return low; 1607 return low;
1539} 1608}
1609#else
1610static inline int inactive_anon_is_low(struct zone *zone,
1611 struct scan_control *sc)
1612{
1613 return 0;
1614}
1615#endif
1540 1616
1541static int inactive_file_is_low_global(struct zone *zone) 1617static int inactive_file_is_low_global(struct zone *zone)
1542{ 1618{
@@ -1721,21 +1797,6 @@ out:
1721 } 1797 }
1722} 1798}
1723 1799
1724static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
1725{
1726 /*
1727 * If we need a large contiguous chunk of memory, or have
1728 * trouble getting a small set of contiguous pages, we
1729 * will reclaim both active and inactive pages.
1730 */
1731 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1732 sc->lumpy_reclaim_mode = 1;
1733 else if (sc->order && priority < DEF_PRIORITY - 2)
1734 sc->lumpy_reclaim_mode = 1;
1735 else
1736 sc->lumpy_reclaim_mode = 0;
1737}
1738
1739/* 1800/*
1740 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1801 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1741 */ 1802 */
@@ -1750,8 +1811,6 @@ static void shrink_zone(int priority, struct zone *zone,
1750 1811
1751 get_scan_count(zone, sc, nr, priority); 1812 get_scan_count(zone, sc, nr, priority);
1752 1813
1753 set_lumpy_reclaim_mode(priority, sc);
1754
1755 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1814 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1756 nr[LRU_INACTIVE_FILE]) { 1815 nr[LRU_INACTIVE_FILE]) {
1757 for_each_evictable_lru(l) { 1816 for_each_evictable_lru(l) {
@@ -1782,7 +1841,7 @@ static void shrink_zone(int priority, struct zone *zone,
1782 * Even if we did not try to evict anon pages at all, we want to 1841 * Even if we did not try to evict anon pages at all, we want to
1783 * rebalance the anon lru active/inactive ratio. 1842 * rebalance the anon lru active/inactive ratio.
1784 */ 1843 */
1785 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) 1844 if (inactive_anon_is_low(zone, sc))
1786 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1845 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1787 1846
1788 throttle_vm_writeout(sc->gfp_mask); 1847 throttle_vm_writeout(sc->gfp_mask);
@@ -1937,21 +1996,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1937 1996
1938 /* Take a nap, wait for some writeback to complete */ 1997 /* Take a nap, wait for some writeback to complete */
1939 if (!sc->hibernation_mode && sc->nr_scanned && 1998 if (!sc->hibernation_mode && sc->nr_scanned &&
1940 priority < DEF_PRIORITY - 2) 1999 priority < DEF_PRIORITY - 2) {
1941 congestion_wait(BLK_RW_ASYNC, HZ/10); 2000 struct zone *preferred_zone;
2001
2002 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2003 NULL, &preferred_zone);
2004 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2005 }
1942 } 2006 }
1943 2007
1944out: 2008out:
1945 /*
1946 * Now that we've scanned all the zones at this priority level, note
1947 * that level within the zone so that the next thread which performs
1948 * scanning of this zone will immediately start out at this priority
1949 * level. This affects only the decision whether or not to bring
1950 * mapped pages onto the inactive list.
1951 */
1952 if (priority < 0)
1953 priority = 0;
1954
1955 delayacct_freepages_end(); 2009 delayacct_freepages_end();
1956 put_mems_allowed(); 2010 put_mems_allowed();
1957 2011
@@ -2247,6 +2301,15 @@ loop_again:
2247 if (!zone_watermark_ok(zone, order, 2301 if (!zone_watermark_ok(zone, order,
2248 min_wmark_pages(zone), end_zone, 0)) 2302 min_wmark_pages(zone), end_zone, 0))
2249 has_under_min_watermark_zone = 1; 2303 has_under_min_watermark_zone = 1;
2304 } else {
2305 /*
2306 * If a zone reaches its high watermark,
2307 * consider it to be no longer congested. It's
2308 * possible there are dirty pages backed by
2309 * congested BDIs but as pressure is relieved,
2310 * speculatively avoid congestion waits
2311 */
2312 zone_clear_flag(zone, ZONE_CONGESTED);
2250 } 2313 }
2251 2314
2252 } 2315 }
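The do_try_to_free_pages() and kswapd hunks above work as a pair: the reclaim nap becomes wait_iff_congested() against the preferred zone rather than an unconditional congestion_wait(), and kswapd clears ZONE_CONGESTED once a zone reaches its high watermark so the wait is skipped when pressure has already eased. Read from the new column, the nap path now looks like this (a sketch assuming the enclosing priority loop is unchanged):

	/* Take a nap, wait for some writeback to complete */
	if (!sc->hibernation_mode && sc->nr_scanned &&
	    priority < DEF_PRIORITY - 2) {
		struct zone *preferred_zone;

		first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
					NULL, &preferred_zone);
		/* Sleeps only while preferred_zone is still marked congested */
		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
	}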
@@ -2987,6 +3050,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
2987 return 0; 3050 return 0;
2988} 3051}
2989 3052
3053#ifdef CONFIG_NUMA
2990/* 3054/*
2991 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3055 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2992 * a specified node's per zone unevictable lists for evictable pages. 3056 * a specified node's per zone unevictable lists for evictable pages.
@@ -3033,4 +3097,4 @@ void scan_unevictable_unregister_node(struct node *node)
3033{ 3097{
3034 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 3098 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
3035} 3099}
3036 3100#endif
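The boolean set_lumpy_reclaim_mode() removed above gives way to a tristate mode: the isolation hunks now test sc->lumpy_reclaim_mode against LUMPY_MODE_NONE, and the stall path calls set_lumpy_reclaim_mode(priority, sc, true) to escalate to a synchronous pass. The new helper's definition sits earlier in this file's diff and is not part of this excerpt, so the sketch below is only a plausible shape for it; the LUMPY_MODE_ASYNC and LUMPY_MODE_SYNC names are assumptions, not taken from the hunks above.

static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
				   bool sync)
{
	/* Assumed members; only LUMPY_MODE_NONE appears in this excerpt */
	enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;

	/* Do not escalate to sync if lumpy reclaim is not in use */
	if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
		return;

	/* Same policy as the removed helper, storing a mode rather than 0/1 */
	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
		sc->lumpy_reclaim_mode = mode;
	else if (sc->order && priority < DEF_PRIORITY - 2)
		sc->lumpy_reclaim_mode = mode;
	else
		sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
}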
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 355a9e669aaa..cd2e42be7b68 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -17,6 +17,8 @@
17#include <linux/vmstat.h> 17#include <linux/vmstat.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/math64.h> 19#include <linux/math64.h>
20#include <linux/writeback.h>
21#include <linux/compaction.h>
20 22
21#ifdef CONFIG_VM_EVENT_COUNTERS 23#ifdef CONFIG_VM_EVENT_COUNTERS
22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 24DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -394,6 +396,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
394#endif 396#endif
395 397
396#ifdef CONFIG_COMPACTION 398#ifdef CONFIG_COMPACTION
399
397struct contig_page_info { 400struct contig_page_info {
398 unsigned long free_pages; 401 unsigned long free_pages;
399 unsigned long free_blocks_total; 402 unsigned long free_blocks_total;
@@ -745,6 +748,11 @@ static const char * const vmstat_text[] = {
745 "nr_isolated_anon", 748 "nr_isolated_anon",
746 "nr_isolated_file", 749 "nr_isolated_file",
747 "nr_shmem", 750 "nr_shmem",
751 "nr_dirtied",
752 "nr_written",
753 "nr_dirty_threshold",
754 "nr_dirty_background_threshold",
755
748#ifdef CONFIG_NUMA 756#ifdef CONFIG_NUMA
749 "numa_hit", 757 "numa_hit",
750 "numa_miss", 758 "numa_miss",
@@ -904,36 +912,44 @@ static const struct file_operations proc_zoneinfo_file_operations = {
904 .release = seq_release, 912 .release = seq_release,
905}; 913};
906 914
915enum writeback_stat_item {
916 NR_DIRTY_THRESHOLD,
917 NR_DIRTY_BG_THRESHOLD,
918 NR_VM_WRITEBACK_STAT_ITEMS,
919};
920
907static void *vmstat_start(struct seq_file *m, loff_t *pos) 921static void *vmstat_start(struct seq_file *m, loff_t *pos)
908{ 922{
909 unsigned long *v; 923 unsigned long *v;
910#ifdef CONFIG_VM_EVENT_COUNTERS 924 int i, stat_items_size;
911 unsigned long *e;
912#endif
913 int i;
914 925
915 if (*pos >= ARRAY_SIZE(vmstat_text)) 926 if (*pos >= ARRAY_SIZE(vmstat_text))
916 return NULL; 927 return NULL;
928 stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
929 NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
917 930
918#ifdef CONFIG_VM_EVENT_COUNTERS 931#ifdef CONFIG_VM_EVENT_COUNTERS
919 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) 932 stat_items_size += sizeof(struct vm_event_state);
920 + sizeof(struct vm_event_state), GFP_KERNEL);
921#else
922 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
923 GFP_KERNEL);
924#endif 933#endif
934
935 v = kmalloc(stat_items_size, GFP_KERNEL);
925 m->private = v; 936 m->private = v;
926 if (!v) 937 if (!v)
927 return ERR_PTR(-ENOMEM); 938 return ERR_PTR(-ENOMEM);
928 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 939 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
929 v[i] = global_page_state(i); 940 v[i] = global_page_state(i);
941 v += NR_VM_ZONE_STAT_ITEMS;
942
943 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
944 v + NR_DIRTY_THRESHOLD);
945 v += NR_VM_WRITEBACK_STAT_ITEMS;
946
930#ifdef CONFIG_VM_EVENT_COUNTERS 947#ifdef CONFIG_VM_EVENT_COUNTERS
931 e = v + NR_VM_ZONE_STAT_ITEMS; 948 all_vm_events(v);
932 all_vm_events(e); 949 v[PGPGIN] /= 2; /* sectors -> kbytes */
933 e[PGPGIN] /= 2; /* sectors -> kbytes */ 950 v[PGPGOUT] /= 2;
934 e[PGPGOUT] /= 2;
935#endif 951#endif
936 return v + *pos; 952 return m->private + *pos;
937} 953}
938 954
939static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 955static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)