Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              3
-rw-r--r--  mm/Makefile             1
-rw-r--r--  mm/allocpercpu.c       20
-rw-r--r--  mm/bootmem.c           37
-rw-r--r--  mm/bounce.c             2
-rw-r--r--  mm/filemap.c          265
-rw-r--r--  mm/filemap_xip.c       70
-rw-r--r--  mm/fremap.c             3
-rw-r--r--  mm/highmem.c            5
-rw-r--r--  mm/hugetlb.c           97
-rw-r--r--  mm/madvise.c            4
-rw-r--r--  mm/memcontrol.c        23
-rw-r--r--  mm/memory.c            79
-rw-r--r--  mm/mempolicy.c          1
-rw-r--r--  mm/migrate.c           33
-rw-r--r--  mm/mlock.c              2
-rw-r--r--  mm/mm_init.c           10
-rw-r--r--  mm/mmap.c             172
-rw-r--r--  mm/mmu_notifier.c     277
-rw-r--r--  mm/mmzone.c             2
-rw-r--r--  mm/mprotect.c           3
-rw-r--r--  mm/mremap.c             6
-rw-r--r--  mm/nommu.c             25
-rw-r--r--  mm/oom_kill.c           6
-rw-r--r--  mm/page-writeback.c    12
-rw-r--r--  mm/page_alloc.c        45
-rw-r--r--  mm/page_isolation.c    13
-rw-r--r--  mm/quicklist.c          9
-rw-r--r--  mm/readahead.c          6
-rw-r--r--  mm/rmap.c              55
-rw-r--r--  mm/shmem.c             15
-rw-r--r--  mm/shmem_acl.c          2
-rw-r--r--  mm/slab.c              12
-rw-r--r--  mm/slob.c              16
-rw-r--r--  mm/slub.c              45
-rw-r--r--  mm/sparse.c             3
-rw-r--r--  mm/swap.c               9
-rw-r--r--  mm/swap_state.c        40
-rw-r--r--  mm/swapfile.c          16
-rw-r--r--  mm/tiny-shmem.c        26
-rw-r--r--  mm/truncate.c          16
-rw-r--r--  mm/util.c              70
-rw-r--r--  mm/vmalloc.c           13
-rw-r--r--  mm/vmscan.c            88
-rw-r--r--  mm/vmstat.c            19
45 files changed, 1304 insertions, 372 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index aa799007a11b..0bd9c2dbb2a0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -205,3 +205,6 @@ config NR_QUICK
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+	bool
diff --git a/mm/Makefile b/mm/Makefile
index 06ca2381fef1..da4ccf015aea 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 843364594e23..4297bc41bfd2 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -18,27 +18,28 @@
  * Depopulating per-cpu data for a cpu going offline would be a typical
  * use case. You need to register a cpu hotplug handler for that purpose.
  */
-void percpu_depopulate(void *__pdata, int cpu)
+static void percpu_depopulate(void *__pdata, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
 
 	kfree(pdata->ptrs[cpu]);
 	pdata->ptrs[cpu] = NULL;
 }
-EXPORT_SYMBOL_GPL(percpu_depopulate);
 
 /**
  * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
  * @__pdata: per-cpu data to depopulate
  * @mask: depopulate per-cpu data for cpu's selected through mask bits
  */
-void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
 {
 	int cpu;
 	for_each_cpu_mask_nr(cpu, *mask)
 		percpu_depopulate(__pdata, cpu);
 }
-EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
+
+#define percpu_depopulate_mask(__pdata, mask) \
+	__percpu_depopulate_mask((__pdata), &(mask))
 
 /**
  * percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
  * use case. You need to register a cpu hotplug handler for that purpose.
  * Per-cpu object is populated with zeroed buffer.
  */
-void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
 	int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 		pdata->ptrs[cpu] = kzalloc(size, gfp);
 	return pdata->ptrs[cpu];
 }
-EXPORT_SYMBOL_GPL(percpu_populate);
 
 /**
  * percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,8 +79,8 @@ EXPORT_SYMBOL_GPL(percpu_populate);
  *
  * Per-cpu objects are populated with zeroed buffers.
  */
-int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
 			cpumask_t *mask)
 {
 	cpumask_t populated;
 	int cpu;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
 		cpu_set(cpu, populated);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(__percpu_populate_mask);
+
+#define percpu_populate_mask(__pdata, size, gfp, mask) \
+	__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 
 /**
  * percpu_alloc_mask - initial setup of per-cpu data
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 4af15d0340ad..ad8eec6e44a8 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -405,6 +405,29 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
 }
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
+static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
+			unsigned long step)
+{
+	unsigned long base = bdata->node_min_pfn;
+
+	/*
+	 * Align the index with respect to the node start so that the
+	 * combination of both satisfies the requested alignment.
+	 */
+
+	return ALIGN(base + idx, step) - base;
+}
+
+static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
+			unsigned long align)
+{
+	unsigned long base = PFN_PHYS(bdata->node_min_pfn);
+
+	/* Same as align_idx for byte offsets */
+
+	return ALIGN(base + off, align) - base;
+}
+
 static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 				unsigned long size, unsigned long align,
 				unsigned long goal, unsigned long limit)
@@ -441,7 +464,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	else
 		start = ALIGN(min, step);
 
-	sidx = start - bdata->node_min_pfn;;
+	sidx = start - bdata->node_min_pfn;
 	midx = max - bdata->node_min_pfn;
 
 	if (bdata->hint_idx > sidx) {
@@ -450,7 +473,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 		 * catch the fallback below.
 		 */
 		fallback = sidx + 1;
-		sidx = ALIGN(bdata->hint_idx, step);
+		sidx = align_idx(bdata, bdata->hint_idx, step);
 	}
 
 	while (1) {
@@ -459,7 +482,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 		unsigned long eidx, i, start_off, end_off;
 find_block:
 		sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
-		sidx = ALIGN(sidx, step);
+		sidx = align_idx(bdata, sidx, step);
 		eidx = sidx + PFN_UP(size);
 
 		if (sidx >= midx || eidx > midx)
@@ -467,15 +490,15 @@ find_block:
 
 		for (i = sidx; i < eidx; i++)
 			if (test_bit(i, bdata->node_bootmem_map)) {
-				sidx = ALIGN(i, step);
+				sidx = align_idx(bdata, i, step);
 				if (sidx == i)
 					sidx += step;
 				goto find_block;
 			}
 
-		if (bdata->last_end_off &&
+		if (bdata->last_end_off & (PAGE_SIZE - 1) &&
 				PFN_DOWN(bdata->last_end_off) + 1 == sidx)
-			start_off = ALIGN(bdata->last_end_off, align);
+			start_off = align_off(bdata, bdata->last_end_off, align);
 		else
 			start_off = PFN_PHYS(sidx);
 
@@ -499,7 +522,7 @@ find_block:
 	}
 
 	if (fallback) {
-		sidx = ALIGN(fallback - 1, step);
+		sidx = align_idx(bdata, fallback - 1, step);
 		fallback = 0;
 		goto find_block;
 	}
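
A quick note on the two helpers this file gains: ALIGN() on a node-relative index only produces an aligned PFN if the node itself starts at an aligned PFN, which is exactly what the hunks above fix by routing every ALIGN() call through align_idx()/align_off(). A minimal, compilable userspace sketch of the arithmetic (illustrative only; ALIGN is reproduced here and assumes a power-of-two step):

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

static unsigned long align_idx(unsigned long base, unsigned long idx,
			unsigned long step)
{
	return ALIGN(base + idx, step) - base;
}

int main(void)
{
	unsigned long base = 5;	/* node_min_pfn: node starts at PFN 5 */
	unsigned long idx = 2;	/* bitmap index, i.e. PFN 7 */
	unsigned long step = 4;	/* allocation wants 4-page alignment */

	/* naive: ALIGN(2, 4) = 4, i.e. PFN 9 - not 4-page aligned */
	printf("naive : PFN %lu\n", base + ALIGN(idx, step));
	/* biased: align_idx() = 3, i.e. PFN 8 - properly aligned */
	printf("biased: PFN %lu\n", base + align_idx(base, idx, step));
	return 0;
}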
diff --git a/mm/bounce.c b/mm/bounce.c
index b6d2d0f1019b..06722c403058 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	/*
 	 * Data-less bio, nothing to bounce
 	 */
-	if (bio_empty_barrier(*bio_orig))
+	if (!bio_has_data(*bio_orig))
 		return;
 
 	/*
diff --git a/mm/filemap.c b/mm/filemap.c
index 2d3ec1ffc66e..876bc595d0f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -109,7 +109,7 @@
 /*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold a write_lock on the mapping's tree_lock.
+ * is safe. The caller must hold the mapping's tree_lock.
  */
 void __remove_from_page_cache(struct page *page)
 {
@@ -141,9 +141,9 @@ void remove_from_page_cache(struct page *page)
 
 	BUG_ON(!PageLocked(page));
 
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 }
 
 static int sync_page(void *word)
@@ -442,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 }
 
 /**
- * add_to_page_cache - add newly allocated pagecache pages
+ * add_to_page_cache_locked - add a locked page to the pagecache
  * @page: page to add
  * @mapping: the page's address_space
  * @offset: page index
  * @gfp_mask: page allocation mode
  *
- * This function is used to add newly allocated pagecache pages;
- * the page is new, so we can just run SetPageLocked() against it.
- * The other page state flags were set by rmqueue().
- *
+ * This function is used to add a page to the pagecache. It must be locked.
  * This function does not add the page to the LRU. The caller must do that.
  */
-int add_to_page_cache(struct page *page, struct address_space *mapping,
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
 {
-	int error = mem_cgroup_cache_charge(page, current->mm,
+	int error;
+
+	VM_BUG_ON(!PageLocked(page));
+
+	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & ~__GFP_HIGHMEM);
 	if (error)
 		goto out;
 
 	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
-		write_lock_irq(&mapping->tree_lock);
+		page_cache_get(page);
+		page->mapping = mapping;
+		page->index = offset;
+
+		spin_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (!error) {
-			page_cache_get(page);
-			SetPageLocked(page);
-			page->mapping = mapping;
-			page->index = offset;
+		if (likely(!error)) {
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
-		} else
+		} else {
+			page->mapping = NULL;
 			mem_cgroup_uncharge_cache_page(page);
+			page_cache_release(page);
+		}
 
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	} else
 		mem_cgroup_uncharge_cache_page(page);
 out:
 	return error;
 }
-EXPORT_SYMBOL(add_to_page_cache);
+EXPORT_SYMBOL(add_to_page_cache_locked);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
@@ -554,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
  *
  * The first mb is necessary to safely close the critical section opened by the
- * TestSetPageLocked(), the second mb is necessary to enforce ordering between
- * the clear_bit and the read of the waitqueue (to avoid SMP races with a
- * parallel wait_on_page_locked()).
+ * test_and_set_bit() to lock the page; the second mb is necessary to enforce
+ * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
+ * races with a parallel wait_on_page_locked()).
  */
 void unlock_page(struct page *page)
 {
 	smp_mb__before_clear_bit();
-	if (!TestClearPageLocked(page))
+	if (!test_and_clear_bit(PG_locked, &page->flags))
 		BUG();
 	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_locked);
@@ -633,15 +637,35 @@ void __lock_page_nosync(struct page *page)
  * Is there a pagecache struct page at the given (mapping, offset) tuple?
  * If yes, increment its refcount and return it; if no, return NULL.
  */
-struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
 {
+	void **pagep;
 	struct page *page;
 
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
-	read_unlock_irq(&mapping->tree_lock);
+	rcu_read_lock();
+repeat:
+	page = NULL;
+	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
+	if (pagep) {
+		page = radix_tree_deref_slot(pagep);
+		if (unlikely(!page || page == RADIX_TREE_RETRY))
+			goto repeat;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/*
+		 * Has the page moved?
+		 * This is part of the lockless pagecache protocol. See
+		 * include/linux/pagemap.h for details.
+		 */
+		if (unlikely(page != *pagep)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+
 	return page;
 }
 EXPORT_SYMBOL(find_get_page);
@@ -656,32 +680,22 @@ EXPORT_SYMBOL(find_get_page);
  *
  * Returns zero if the page was not present. find_lock_page() may sleep.
  */
-struct page *find_lock_page(struct address_space *mapping,
-				pgoff_t offset)
+struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 {
 	struct page *page;
 
 repeat:
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = find_get_page(mapping, offset);
 	if (page) {
-		page_cache_get(page);
-		if (TestSetPageLocked(page)) {
-			read_unlock_irq(&mapping->tree_lock);
-			__lock_page(page);
-
-			/* Has the page been truncated while we slept? */
-			if (unlikely(page->mapping != mapping)) {
-				unlock_page(page);
-				page_cache_release(page);
-				goto repeat;
-			}
-			VM_BUG_ON(page->index != offset);
-			goto out;
+		lock_page(page);
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
 		}
+		VM_BUG_ON(page->index != offset);
 	}
-	read_unlock_irq(&mapping->tree_lock);
-out:
 	return page;
 }
 EXPORT_SYMBOL(find_lock_page);
@@ -747,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+				(void ***)pages, start, nr_pages);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-				(void **)pages, start, nr_pages);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
-	read_unlock_irq(&mapping->tree_lock);
+		pages[ret] = page;
+		ret++;
+	}
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -774,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+				(void ***)pages, index, nr_pages);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-			(void **)pages, index, nr_pages);
-	for (i = 0; i < ret; i++) {
-		if (pages[i]->mapping == NULL || pages[i]->index != index)
+		if (page->mapping == NULL || page->index != index)
 			break;
 
-		page_cache_get(pages[i]);
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		pages[ret] = page;
+		ret++;
 		index++;
 	}
-	read_unlock_irq(&mapping->tree_lock);
-	return i;
+	rcu_read_unlock();
+	return ret;
 }
 EXPORT_SYMBOL(find_get_pages_contig);
 
@@ -806,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 {
 	unsigned int i;
 	unsigned int ret;
+	unsigned int nr_found;
+
+	rcu_read_lock();
+restart:
+	nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
+				(void ***)pages, *index, nr_pages, tag);
+	ret = 0;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot((void **)pages[i]);
+		if (unlikely(!page))
+			continue;
+		/*
+		 * this can only trigger if nr_found == 1, making livelock
+		 * a non issue.
+		 */
+		if (unlikely(page == RADIX_TREE_RETRY))
+			goto restart;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *((void **)pages[i]))) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		pages[ret] = page;
+		ret++;
+	}
+	rcu_read_unlock();
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
-				(void **)pages, *index, nr_pages, tag);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
 	if (ret)
 		*index = pages[ret - 1]->index + 1;
-	read_unlock_irq(&mapping->tree_lock);
+
 	return ret;
 }
 EXPORT_SYMBOL(find_get_pages_tag);
@@ -838,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 	struct page *page = find_get_page(mapping, index);
 
 	if (page) {
-		if (!TestSetPageLocked(page))
+		if (trylock_page(page))
 			return page;
 		page_cache_release(page);
 		return NULL;
@@ -930,8 +1023,17 @@ find_page:
 					ra, filp, page,
 					index, last_index - index);
 		}
-		if (!PageUptodate(page))
-			goto page_not_up_to_date;
+		if (!PageUptodate(page)) {
+			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
+					!mapping->a_ops->is_partially_uptodate)
+				goto page_not_up_to_date;
+			if (!trylock_page(page))
+				goto page_not_up_to_date;
+			if (!mapping->a_ops->is_partially_uptodate(page,
+								desc, offset))
+				goto page_not_up_to_date_locked;
+			unlock_page(page);
+		}
 page_ok:
 		/*
 		 * i_size must be checked after we know the page is Uptodate.
@@ -1001,6 +1103,7 @@ page_not_up_to_date:
 		if (lock_page_killable(page))
 			goto readpage_eio;
 
+page_not_up_to_date_locked:
 		/* Did it get truncated before we got the lock? */
 		if (!page->mapping) {
 			unlock_page(page);
@@ -1665,8 +1768,9 @@ static int __remove_suid(struct dentry *dentry, int kill)
 	return notify_change(dentry, &newattrs);
 }
 
-int remove_suid(struct dentry *dentry)
+int file_remove_suid(struct file *file)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	int killsuid = should_remove_suid(dentry);
 	int killpriv = security_inode_need_killpriv(dentry);
 	int error = 0;
@@ -1680,7 +1784,7 @@ int remove_suid(struct dentry *dentry)
 
 	return error;
 }
-EXPORT_SYMBOL(remove_suid);
+EXPORT_SYMBOL(file_remove_suid);
 
 static size_t __iovec_copy_from_user_inatomic(char *vaddr,
 			const struct iovec *iov, size_t base, size_t bytes)
@@ -1775,7 +1879,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
 	 * The !iov->iov_len check ensures we skip over unlikely
 	 * zero-length segments (without overruning the iovec).
 	 */
-	while (bytes || unlikely(!iov->iov_len && i->count)) {
+	while (bytes || unlikely(i->count && !iov->iov_len)) {
 		int copy;
 
 		copy = min(bytes, iov->iov_len - base);
@@ -2025,13 +2129,20 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * After a write we want buffered reads to be sure to go to disk to get
 	 * the new data. We invalidate clean cached page from the region we're
 	 * about to write. We do this *before* the write so that we can return
-	 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
+	 * without clobbering -EIOCBQUEUED from ->direct_IO().
 	 */
 	if (mapping->nrpages) {
 		written = invalidate_inode_pages2_range(mapping,
 					pos >> PAGE_CACHE_SHIFT, end);
-		if (written)
+		/*
+		 * If a page can not be invalidated, return 0 to fall back
+		 * to buffered write.
+		 */
+		if (written) {
+			if (written == -EBUSY)
+				return 0;
 			goto out;
+		}
 	}
 
 	written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
@@ -2436,7 +2547,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	if (count == 0)
 		goto out;
 
-	err = remove_suid(file->f_path.dentry);
+	err = file_remove_suid(file);
 	if (err)
 		goto out;
 
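
All of the find_get_page()/find_get_pages*() rewrites above share one shape, the lockless pagecache protocol: a racy RCU read of the tree slot, a speculative refcount grab that may fail against a concurrent free, then a re-check that the slot still holds the same page. A compilable userspace C11 analogue, cut down to a single slot (try_get() and lookup() are illustrative names standing in for page_cache_get_speculative() and the radix-tree lookup; they are not kernel API):

#include <stdatomic.h>
#include <stddef.h>

struct obj {
	atomic_int refcount;	/* 0 means the object is being freed */
};

/* stand-in for page_cache_get_speculative(): take a ref only if live */
static int try_get(struct obj *o)
{
	int c = atomic_load(&o->refcount);

	while (c > 0)
		if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
			return 1;
	return 0;	/* lost the race with the final put */
}

struct obj *lookup(struct obj *_Atomic *slot)
{
	struct obj *o;

repeat:
	o = atomic_load(slot);		/* racy, RCU-style read */
	if (!o)
		return NULL;
	if (!try_get(o))
		goto repeat;	/* terminates: the protocol clears the slot
				 * before the final reference is dropped */
	if (o != atomic_load(slot)) {	/* "has the page moved?" re-check */
		atomic_fetch_sub(&o->refcount, 1);
		goto repeat;
	}
	return o;	/* caller now holds a stable reference */
}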
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 3e744abcce9d..b5167dfb2f2d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,7 +13,10 @@
 #include <linux/module.h>
 #include <linux/uio.h>
 #include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
 #include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
@@ -21,22 +24,18 @@
  * We do use our own empty page to avoid interference with other users
  * of ZERO_PAGE(), such as /dev/zero
  */
+static DEFINE_MUTEX(xip_sparse_mutex);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
 static struct page *__xip_sparse_page;
 
+/* called under xip_sparse_mutex */
 static struct page *xip_sparse_page(void)
 {
 	if (!__xip_sparse_page) {
 		struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
 
-		if (page) {
-			static DEFINE_SPINLOCK(xip_alloc_lock);
-			spin_lock(&xip_alloc_lock);
-			if (!__xip_sparse_page)
-				__xip_sparse_page = page;
-			else
-				__free_page(page);
-			spin_unlock(&xip_alloc_lock);
-		}
+		if (page)
+			__xip_sparse_page = page;
 	}
 	return __xip_sparse_page;
 }
@@ -173,22 +172,27 @@ __xip_unmap (struct address_space * mapping,
 	pte_t pteval;
 	spinlock_t *ptl;
 	struct page *page;
+	unsigned count;
+	int locked = 0;
+
+	count = read_seqcount_begin(&xip_sparse_seq);
 
 	page = __xip_sparse_page;
 	if (!page)
 		return;
 
+retry:
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-		pte = page_check_address(page, mm, address, &ptl);
+		pte = page_check_address(page, mm, address, &ptl, 1);
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
-			pteval = ptep_clear_flush(vma, address, pte);
+			pteval = ptep_clear_flush_notify(vma, address, pte);
 			page_remove_rmap(page, vma);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
@@ -197,6 +201,14 @@ __xip_unmap (struct address_space * mapping,
 		}
 	}
 	spin_unlock(&mapping->i_mmap_lock);
+
+	if (locked) {
+		mutex_unlock(&xip_sparse_mutex);
+	} else if (read_seqcount_retry(&xip_sparse_seq, count)) {
+		mutex_lock(&xip_sparse_mutex);
+		locked = 1;
+		goto retry;
+	}
 }
 
 /*
@@ -217,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	int error;
 
 	/* XXX: are VM_FAULT_ codes OK? */
-
+again:
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
 		return VM_FAULT_SIGBUS;
@@ -236,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		int err;
 
 		/* maybe shared writable, allocate new block */
+		mutex_lock(&xip_sparse_mutex);
 		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
 							&xip_mem, &xip_pfn);
+		mutex_unlock(&xip_sparse_mutex);
 		if (error)
 			return VM_FAULT_SIGBUS;
 		/* unmap sparse mappings at pgoff from all other vmas */
@@ -251,14 +265,34 @@ found:
 		BUG_ON(err);
 		return VM_FAULT_NOPAGE;
 	} else {
+		int err, ret = VM_FAULT_OOM;
+
+		mutex_lock(&xip_sparse_mutex);
+		write_seqcount_begin(&xip_sparse_seq);
+		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+							&xip_mem, &xip_pfn);
+		if (unlikely(!error)) {
+			write_seqcount_end(&xip_sparse_seq);
+			mutex_unlock(&xip_sparse_mutex);
+			goto again;
+		}
+		if (error != -ENODATA)
+			goto out;
 		/* not shared and writable, use xip_sparse_page() */
 		page = xip_sparse_page();
 		if (!page)
-			return VM_FAULT_OOM;
+			goto out;
+		err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
+							page);
+		if (err == -ENOMEM)
+			goto out;
 
-		page_cache_get(page);
-		vmf->page = page;
-		return 0;
+		ret = VM_FAULT_NOPAGE;
+out:
+		write_seqcount_end(&xip_sparse_seq);
+		mutex_unlock(&xip_sparse_mutex);
+
+		return ret;
 	}
 }
 
@@ -307,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf,
 						&xip_mem, &xip_pfn);
 		if (status == -ENODATA) {
 			/* we allocate a new page unmap it */
+			mutex_lock(&xip_sparse_mutex);
 			status = a_ops->get_xip_mem(mapping, index, 1,
 							&xip_mem, &xip_pfn);
+			mutex_unlock(&xip_sparse_mutex);
 			if (!status)
 				/* unmap page at pgoff from all other vmas */
 				__xip_unmap(mapping, index);
@@ -380,7 +416,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
 	if (count == 0)
 		goto out_backing;
 
-	ret = remove_suid(filp->f_path.dentry);
+	ret = file_remove_suid(filp);
 	if (ret)
 		goto out_backing;
 
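
__xip_unmap() above is an instance of an optimistic-read pattern: the common case walks the mappings unlocked under a sequence counter, and only a detected race with a writer pays for the mutex and a second, locked pass. A stripped-down userspace sketch of that control flow (illustrative only; the write side, which increments the counter around each update, is omitted here just as it lives in the fault path above):

#include <pthread.h>
#include <stdatomic.h>

static atomic_uint seq;		/* even: quiescent, odd: writer active */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static unsigned read_begin(void)
{
	unsigned s;

	while ((s = atomic_load(&seq)) & 1)
		;		/* spin while a writer is mid-update */
	return s;
}

static int read_retry(unsigned s)
{
	atomic_thread_fence(memory_order_acquire);
	return atomic_load(&seq) != s;	/* changed => a writer raced us */
}

void unmap_all(void)
{
	unsigned s = read_begin();
	int locked = 0;

retry:
	/* ... walk and clear the mappings, as __xip_unmap() does ... */

	if (locked) {
		pthread_mutex_unlock(&lock);
	} else if (read_retry(s)) {
		pthread_mutex_lock(&lock);	/* race seen: redo it locked */
		locked = 1;
		goto retry;
	}
}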
diff --git a/mm/fremap.c b/mm/fremap.c
index 07a9c82ce1a3..7881638e4a12 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 
+	mmu_notifier_invalidate_range_start(mm, start, start + size);
 	err = populate_range(mm, vma, start, size, pgoff);
+	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
 		if (unlikely(has_write_lock)) {
 			downgrade_write(&mm->mmap_sem);
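
This is the first of several hunks in the series that bracket pte changes with mmu notifier calls; hugetlb.c and memory.c below repeat it. The convention, condensed (kernel-style fragment, illustrative rather than complete):

	/*
	 * Before touching ptes in [start, end), tell any secondary MMUs
	 * (e.g. a hypervisor's shadow page tables registered through
	 * mmu_notifier_register()) so they can drop their cached
	 * translations, and tell them again once the primary page
	 * tables are done:
	 */
	mmu_notifier_invalidate_range_start(mm, start, end);
	/* ... modify the page tables for [start, end) ... */
	mmu_notifier_invalidate_range_end(mm, start, end);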
diff --git a/mm/highmem.c b/mm/highmem.c
index e16e1523b688..b36b83b920ff 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -70,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
 static void flush_all_zero_pkmaps(void)
 {
 	int i;
+	int need_flush = 0;
 
 	flush_cache_kmaps();
 
@@ -101,8 +102,10 @@ static void flush_all_zero_pkmaps(void)
 			  &pkmap_page_table[i]);
 
 		set_page_address(page, NULL);
+		need_flush = 1;
 	}
-	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+	if (need_flush)
+		flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
 }
 
 /**
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a8bf4ab01f86..67a71191136e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/sysctl.h>
 #include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
@@ -19,6 +20,7 @@
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <asm/io.h>
 
 #include <linux/hugetlb.h>
 #include "internal.h"
@@ -563,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 					huge_page_order(h));
 	if (page) {
 		if (arch_prepare_hugepage(page)) {
-			__free_pages(page, HUGETLB_PAGE_ORDER);
+			__free_pages(page, huge_page_order(h));
 			return NULL;
 		}
 		prep_new_huge_page(h, page, nid);
@@ -663,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 					__GFP_REPEAT|__GFP_NOWARN,
 					huge_page_order(h));
 
+	if (page && arch_prepare_hugepage(page)) {
+		__free_pages(page, huge_page_order(h));
+		return NULL;
+	}
+
 	spin_lock(&hugetlb_lock);
 	if (page) {
 		/*
@@ -1026,18 +1033,6 @@ static void __init report_hugepages(void)
 	}
 }
 
-static unsigned int cpuset_mems_nr(unsigned int *array)
-{
-	int node;
-	unsigned int nr = 0;
-
-	for_each_node_mask(node, cpuset_current_mems_allowed)
-		nr += array[node];
-
-	return nr;
-}
-
-#ifdef CONFIG_SYSCTL
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low(struct hstate *h, unsigned long count)
 {
@@ -1293,7 +1288,12 @@ module_exit(hugetlb_exit);
 
 static int __init hugetlb_init(void)
 {
-	BUILD_BUG_ON(HPAGE_SHIFT == 0);
+	/* Some platform decide whether they support huge pages at boot
+	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
+	 * there is no such support
+	 */
+	if (HPAGE_SHIFT == 0)
+		return 0;
 
 	if (!size_to_hstate(default_hstate_size)) {
 		default_hstate_size = HPAGE_SIZE;
@@ -1386,6 +1386,18 @@ static int __init hugetlb_default_setup(char *s)
 }
 __setup("default_hugepagesz=", hugetlb_default_setup);
 
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+	int node;
+	unsigned int nr = 0;
+
+	for_each_node_mask(node, cpuset_current_mems_allowed)
+		nr += array[node];
+
+	return nr;
+}
+
+#ifdef CONFIG_SYSCTL
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 			   struct file *file, void __user *buffer,
 			   size_t *length, loff_t *ppos)
@@ -1672,6 +1684,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
+	mmu_notifier_invalidate_range_start(mm, start, end);
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
@@ -1713,6 +1726,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	}
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
 		put_page(page);
@@ -1928,6 +1942,18 @@ retry:
 		lock_page(page);
 	}
 
+	/*
+	 * If we are going to COW a private mapping later, we examine the
+	 * pending reservations for this page now. This will ensure that
+	 * any allocations necessary to record that reservation occur outside
+	 * the spinlock.
+	 */
+	if (write_access && !(vma->vm_flags & VM_SHARED))
+		if (vma_needs_reservation(h, vma, address) < 0) {
+			ret = VM_FAULT_OOM;
+			goto backout_unlocked;
+		}
+
 	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	if (idx >= size)
@@ -1953,6 +1979,7 @@ out:
 
 backout:
 	spin_unlock(&mm->page_table_lock);
+backout_unlocked:
 	unlock_page(page);
 	put_page(page);
 	goto out;
@@ -1964,6 +1991,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	struct page *pagecache_page = NULL;
 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 	struct hstate *h = hstate_vma(vma);
 
@@ -1980,25 +2008,44 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = huge_ptep_get(ptep);
 	if (huge_pte_none(entry)) {
 		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
-		mutex_unlock(&hugetlb_instantiation_mutex);
-		return ret;
+		goto out_unlock;
 	}
 
 	ret = 0;
 
+	/*
+	 * If we are going to COW the mapping later, we examine the pending
+	 * reservations for this page now. This will ensure that any
+	 * allocations necessary to record that reservation occur outside the
+	 * spinlock. For private mappings, we also lookup the pagecache
+	 * page now as it is used to determine if a reservation has been
+	 * consumed.
+	 */
+	if (write_access && !pte_write(entry)) {
+		if (vma_needs_reservation(h, vma, address) < 0) {
+			ret = VM_FAULT_OOM;
+			goto out_unlock;
+		}
+
+		if (!(vma->vm_flags & VM_SHARED))
+			pagecache_page = hugetlbfs_pagecache_page(h,
+								vma, address);
+	}
+
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (likely(pte_same(entry, huge_ptep_get(ptep))))
-		if (write_access && !pte_write(entry)) {
-			struct page *page;
-			page = hugetlbfs_pagecache_page(h, vma, address);
-			ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
-			if (page) {
-				unlock_page(page);
-				put_page(page);
-			}
-		}
+		if (write_access && !pte_write(entry))
+			ret = hugetlb_cow(mm, vma, address, ptep, entry,
+							pagecache_page);
 	spin_unlock(&mm->page_table_lock);
+
+	if (pagecache_page) {
+		unlock_page(pagecache_page);
+		put_page(pagecache_page);
+	}
+
+out_unlock:
 	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
index 23a0ec3e0ea0..f9349c18a1b5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
  * Application no longer needs these pages. If the pages are dirty,
  * it's OK to just throw them away. The app will be more careful about
  * data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for refill_inactive to actually free
+ * zap_page_range call sets things up for shrink_active_list to actually free
  * these pages later if no one else has touched them in the meantime,
  * although we could add these pages to a global reuse list for
- * refill_inactive to pick up before reclaiming other pages.
+ * shrink_active_list to pick up before reclaiming other pages.
  *
  * NB: This interface discards data rather than pushes it out to swap,
  * as some implementations do. This has performance implications for
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fba566c51322..36896f3eb7f5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
+	/*
+	 * mm_update_next_owner() may clear mm->owner to NULL
+	 * if it races with swapoff, page migration, etc.
+	 * So this can be called with p == NULL.
+	 */
+	if (unlikely(!p))
+		return NULL;
+
 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 				struct mem_cgroup, css);
 }
@@ -549,6 +557,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	if (likely(!memcg)) {
 		rcu_read_lock();
 		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+		if (unlikely(!mem)) {
+			rcu_read_unlock();
+			kmem_cache_free(page_cgroup_cache, pc);
+			return 0;
+		}
 		/*
 		 * For every charge from the cgroup, increment reference count
 		 */
@@ -796,14 +809,21 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 
 	if (mem_cgroup_subsys.disabled)
 		return 0;
+	if (!mm)
+		return 0;
 
 	rcu_read_lock();
 	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	if (unlikely(!mem)) {
+		rcu_read_unlock();
+		return 0;
+	}
 	css_get(&mem->css);
 	rcu_read_unlock();
 
 	do {
 		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
+		progress += res_counter_check_under_limit(&mem->res);
 	} while (!progress && --retry);
 
 	css_put(&mem->css);
@@ -1168,9 +1188,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 	mem = mem_cgroup_from_cont(cont);
 	old_mem = mem_cgroup_from_cont(old_cont);
 
-	if (mem == old_mem)
-		goto out;
-
 	/*
 	 * Only thread group leaders are allowed to migrate, the mm_struct is
 	 * in effect owned by the leader
diff --git a/mm/memory.c b/mm/memory.c
index 262e3eb6601a..1002f473f497 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@
 #include <linux/init.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -374,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
  *
  * The calling function must still handle the error.
  */
-void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
+			  unsigned long vaddr)
 {
 	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
 		"vm_flags = %lx, vaddr = %lx\n",
@@ -651,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	unsigned long next;
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
+	int ret;
 
 	/*
 	 * Don't copy ptes where a page fault will fill them correctly.
@@ -666,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
+	/*
+	 * We need to invalidate the secondary MMU mappings only when
+	 * there could be a permission downgrade on the ptes of the
+	 * parent mm. And a permission downgrade will only happen if
+	 * is_cow_mapping() returns true.
+	 */
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier_invalidate_range_start(src_mm, addr, end);
+
+	ret = 0;
 	dst_pgd = pgd_offset(dst_mm, addr);
 	src_pgd = pgd_offset(src_mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(src_pgd))
 			continue;
-		if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
-				vma, addr, next))
-			return -ENOMEM;
+		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+					    vma, addr, next))) {
+			ret = -ENOMEM;
+			break;
+		}
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
-	return 0;
+
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier_invalidate_range_end(src_mm,
+						  vma->vm_start, end);
+	return ret;
 }
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -880,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 	unsigned long start = start_addr;
 	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
 	int fullmm = (*tlbp)->fullmm;
+	struct mm_struct *mm = vma->vm_mm;
 
+	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long end;
 
@@ -945,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 		}
 	}
 out:
+	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
 	return start;	/* which is now the end (or restart) address */
 }
 
@@ -972,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 	return end;
 }
 
+/**
+ * zap_vma_ptes - remove ptes mapping the vma
+ * @vma: vm_area_struct holding ptes to be zapped
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ *
+ * This function only unmaps ptes assigned to VM_PFNMAP vmas.
+ *
+ * The entire address range must be fully contained within the vma.
+ *
+ * Returns 0 if successful.
+ */
+int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
+		unsigned long size)
+{
+	if (address < vma->vm_start || address + size > vma->vm_end ||
+			!(vma->vm_flags & VM_PFNMAP))
+		return -1;
+	zap_page_range(vma, address, size, NULL);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(zap_vma_ptes);
+
 /*
  * Do a quick page-table lookup for a single page.
  */
@@ -1615,10 +1660,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long end = addr + size;
+	unsigned long start = addr, end = addr + size;
 	int err;
 
 	BUG_ON(addr >= end);
+	mmu_notifier_invalidate_range_start(mm, start, end);
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1626,6 +1672,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	return err;
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1742,7 +1789,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * not dirty accountable.
 	 */
 	if (PageAnon(old_page)) {
-		if (!TestSetPageLocked(old_page)) {
+		if (trylock_page(old_page)) {
 			reuse = can_share_swap_page(old_page);
 			unlock_page(old_page);
 		}
@@ -1838,7 +1885,7 @@ gotten:
 	 * seen in the presence of one thread doing SMC and another
 	 * thread doing COW.
 	 */
-	ptep_clear_flush(vma, address, page_table);
+	ptep_clear_flush_notify(vma, address, page_table);
 	set_pte_at(mm, address, page_table, entry);
 	update_mmu_cache(vma, address, entry);
 	lru_cache_add_active(new_page);
@@ -2718,16 +2765,26 @@ int make_pages_present(unsigned long addr, unsigned long end)
2718 2765
2719 vma = find_vma(current->mm, addr); 2766 vma = find_vma(current->mm, addr);
2720 if (!vma) 2767 if (!vma)
2721 return -1; 2768 return -ENOMEM;
2722 write = (vma->vm_flags & VM_WRITE) != 0; 2769 write = (vma->vm_flags & VM_WRITE) != 0;
2723 BUG_ON(addr >= end); 2770 BUG_ON(addr >= end);
2724 BUG_ON(end > vma->vm_end); 2771 BUG_ON(end > vma->vm_end);
2725 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 2772 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
2726 ret = get_user_pages(current, current->mm, addr, 2773 ret = get_user_pages(current, current->mm, addr,
2727 len, write, 0, NULL, NULL); 2774 len, write, 0, NULL, NULL);
2728 if (ret < 0) 2775 if (ret < 0) {
2776 /*
 2777 SUS requires strange return values for mlock:
 2778 - an invalid address generates ENOMEM.
 2779 - being out of memory should generate EAGAIN.
2780 */
2781 if (ret == -EFAULT)
2782 ret = -ENOMEM;
2783 else if (ret == -ENOMEM)
2784 ret = -EAGAIN;
2729 return ret; 2785 return ret;
2730 return ret == len ? 0 : -1; 2786 }
2787 return ret == len ? 0 : -ENOMEM;
2731} 2788}
2732 2789
2733#if !defined(__HAVE_ARCH_GATE_AREA) 2790#if !defined(__HAVE_ARCH_GATE_AREA)
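
The new zap_vma_ptes() above is a thin exported wrapper that lets a driver tear down the ptes of a VM_PFNMAP mapping it owns. A hedged usage sketch follows; the device structure and its vma bookkeeping are hypothetical, and only zap_vma_ptes() itself comes from this patch:

	/* Revoke the pages a device previously mapped into userspace.
	 * 'dev->user_vma' is an assumed field recorded at mmap time. */
	static void mydev_revoke_user_mapping(struct mydev *dev)
	{
		struct vm_area_struct *vma = dev->user_vma;

		if (!vma)
			return;
		/* Legal only on VM_PFNMAP vmas; the range must lie inside the vma. */
		if (zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start))
			printk(KERN_WARNING "mydev: zap_vma_ptes failed\n");
	}

Subsequent user accesses to the range then fault back into the driver's fault handler, or take SIGBUS if there is none.
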
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e550bec20582..83369058ec13 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
803int do_migrate_pages(struct mm_struct *mm, 803int do_migrate_pages(struct mm_struct *mm,
804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
805{ 805{
806 LIST_HEAD(pagelist);
807 int busy = 0; 806 int busy = 0;
808 int err = 0; 807 int err = 0;
809 nodemask_t tmp; 808 nodemask_t tmp;
diff --git a/mm/migrate.c b/mm/migrate.c
index d8c65a65c61d..2a80136b23bb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -285,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
285 285
286 page = migration_entry_to_page(entry); 286 page = migration_entry_to_page(entry);
287 287
288 get_page(page); 288 /*
 289 * Once the radix-tree replacement step of page migration has
 290 * started, page_count *must* be zero. And we don't want to call
 291 * wait_on_page_locked() against a page we hold no reference to.
 292 * So we use get_page_unless_zero() here. Even if it fails, the
 293 * page fault will simply occur again.
294 */
295 if (!get_page_unless_zero(page))
296 goto out;
289 pte_unmap_unlock(ptep, ptl); 297 pte_unmap_unlock(ptep, ptl);
290 wait_on_page_locked(page); 298 wait_on_page_locked(page);
291 put_page(page); 299 put_page(page);
@@ -305,6 +313,7 @@ out:
305static int migrate_page_move_mapping(struct address_space *mapping, 313static int migrate_page_move_mapping(struct address_space *mapping,
306 struct page *newpage, struct page *page) 314 struct page *newpage, struct page *page)
307{ 315{
316 int expected_count;
308 void **pslot; 317 void **pslot;
309 318
310 if (!mapping) { 319 if (!mapping) {
@@ -314,14 +323,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
314 return 0; 323 return 0;
315 } 324 }
316 325
317 write_lock_irq(&mapping->tree_lock); 326 spin_lock_irq(&mapping->tree_lock);
318 327
319 pslot = radix_tree_lookup_slot(&mapping->page_tree, 328 pslot = radix_tree_lookup_slot(&mapping->page_tree,
320 page_index(page)); 329 page_index(page));
321 330
322 if (page_count(page) != 2 + !!PagePrivate(page) || 331 expected_count = 2 + !!PagePrivate(page);
332 if (page_count(page) != expected_count ||
323 (struct page *)radix_tree_deref_slot(pslot) != page) { 333 (struct page *)radix_tree_deref_slot(pslot) != page) {
324 write_unlock_irq(&mapping->tree_lock); 334 spin_unlock_irq(&mapping->tree_lock);
335 return -EAGAIN;
336 }
337
338 if (!page_freeze_refs(page, expected_count)) {
339 spin_unlock_irq(&mapping->tree_lock);
325 return -EAGAIN; 340 return -EAGAIN;
326 } 341 }
327 342
@@ -338,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
338 353
339 radix_tree_replace_slot(pslot, newpage); 354 radix_tree_replace_slot(pslot, newpage);
340 355
356 page_unfreeze_refs(page, expected_count);
341 /* 357 /*
342 * Drop cache reference from old page. 358 * Drop cache reference from old page.
343 * We know this isn't the last reference. 359 * We know this isn't the last reference.
@@ -357,10 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
357 __dec_zone_page_state(page, NR_FILE_PAGES); 373 __dec_zone_page_state(page, NR_FILE_PAGES);
358 __inc_zone_page_state(newpage, NR_FILE_PAGES); 374 __inc_zone_page_state(newpage, NR_FILE_PAGES);
359 375
360 write_unlock_irq(&mapping->tree_lock); 376 spin_unlock_irq(&mapping->tree_lock);
361 if (!PageSwapCache(newpage)) { 377 if (!PageSwapCache(newpage))
362 mem_cgroup_uncharge_cache_page(page); 378 mem_cgroup_uncharge_cache_page(page);
363 }
364 379
365 return 0; 380 return 0;
366} 381}
@@ -590,7 +605,7 @@ static int move_to_new_page(struct page *newpage, struct page *page)
590 * establishing additional references. We are the only one 605 * establishing additional references. We are the only one
591 * holding a reference to the new page at this point. 606 * holding a reference to the new page at this point.
592 */ 607 */
593 if (TestSetPageLocked(newpage)) 608 if (!trylock_page(newpage))
594 BUG(); 609 BUG();
595 610
596 /* Prepare mapping for the new page.*/ 611 /* Prepare mapping for the new page.*/
@@ -652,7 +667,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
652 BUG_ON(charge); 667 BUG_ON(charge);
653 668
654 rc = -EAGAIN; 669 rc = -EAGAIN;
655 if (TestSetPageLocked(page)) { 670 if (!trylock_page(page)) {
656 if (!force) 671 if (!force)
657 goto move_newpage; 672 goto move_newpage;
658 lock_page(page); 673 lock_page(page);
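
The page_freeze_refs()/page_unfreeze_refs() pair introduced above is what makes the radix-tree slot replacement safe against lockless pagecache lookups: the reference count is atomically switched to zero only if it still equals expected_count, so a concurrent speculative get_page_unless_zero() either completed beforehand (making the cmpxchg fail, hence the -EAGAIN) or sees zero and backs off. A standalone sketch of the semantics, using a bare C11 atomic instead of the real page->_count machinery:

	#include <stdatomic.h>

	/* Succeeds, returning nonzero, only if nobody holds an extra
	 * (speculative) reference beyond the expected ones. */
	static int freeze_refs(atomic_int *count, int expected)
	{
		int old = expected;
		return atomic_compare_exchange_strong(count, &old, 0);
	}

	static void unfreeze_refs(atomic_int *count, int expected)
	{
		atomic_store(count, expected);	/* refs now belong to the new page */
	}
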
diff --git a/mm/mlock.c b/mm/mlock.c
index 7b2656055d6a..01fbe93eff5c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -78,8 +78,6 @@ success:
78 78
79 mm->locked_vm -= pages; 79 mm->locked_vm -= pages;
80out: 80out:
81 if (ret == -ENOMEM)
82 ret = -EAGAIN;
83 return ret; 81 return ret;
84} 82}
85 83
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c6af41ea9994..4e0e26591dfa 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -12,7 +12,11 @@
12#include "internal.h" 12#include "internal.h"
13 13
14#ifdef CONFIG_DEBUG_MEMORY_INIT 14#ifdef CONFIG_DEBUG_MEMORY_INIT
15int __meminitdata mminit_loglevel; 15int mminit_loglevel;
16
17#ifndef SECTIONS_SHIFT
18#define SECTIONS_SHIFT 0
19#endif
16 20
17/* The zonelists are simply reported, validation is manual. */ 21/* The zonelists are simply reported, validation is manual. */
18void mminit_verify_zonelist(void) 22void mminit_verify_zonelist(void)
@@ -74,11 +78,7 @@ void __init mminit_verify_pageflags_layout(void)
74 NR_PAGEFLAGS); 78 NR_PAGEFLAGS);
75 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", 79 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
76 "Section %d Node %d Zone %d\n", 80 "Section %d Node %d Zone %d\n",
77#ifdef SECTIONS_SHIFT
78 SECTIONS_SHIFT, 81 SECTIONS_SHIFT,
79#else
80 0,
81#endif
82 NODES_SHIFT, 82 NODES_SHIFT,
83 ZONES_SHIFT); 83 ZONES_SHIFT);
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", 84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
diff --git a/mm/mmap.c b/mm/mmap.c
index 5e0cc99e9cd5..e7a5a68a9c2e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,6 +26,7 @@
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/cacheflush.h> 32#include <asm/cacheflush.h>
@@ -369,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
369 if (vma_tmp->vm_end > addr) { 370 if (vma_tmp->vm_end > addr) {
370 vma = vma_tmp; 371 vma = vma_tmp;
371 if (vma_tmp->vm_start <= addr) 372 if (vma_tmp->vm_start <= addr)
372 return vma; 373 break;
373 __rb_link = &__rb_parent->rb_left; 374 __rb_link = &__rb_parent->rb_left;
374 } else { 375 } else {
375 rb_prev = __rb_parent; 376 rb_prev = __rb_parent;
@@ -1029,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
1029 } else { 1030 } else {
1030 switch (flags & MAP_TYPE) { 1031 switch (flags & MAP_TYPE) {
1031 case MAP_SHARED: 1032 case MAP_SHARED:
1033 /*
1034 * Ignore pgoff.
1035 */
1036 pgoff = 0;
1032 vm_flags |= VM_SHARED | VM_MAYSHARE; 1037 vm_flags |= VM_SHARED | VM_MAYSHARE;
1033 break; 1038 break;
1034 case MAP_PRIVATE: 1039 case MAP_PRIVATE:
@@ -2061,6 +2066,7 @@ void exit_mmap(struct mm_struct *mm)
2061 2066
2062 /* mm's last user has gone, and its about to be pulled down */ 2067 /* mm's last user has gone, and its about to be pulled down */
2063 arch_exit_mmap(mm); 2068 arch_exit_mmap(mm);
2069 mmu_notifier_release(mm);
2064 2070
2065 lru_add_drain(); 2071 lru_add_drain();
2066 flush_cache_mm(mm); 2072 flush_cache_mm(mm);
@@ -2268,3 +2274,167 @@ int install_special_mapping(struct mm_struct *mm,
2268 2274
2269 return 0; 2275 return 0;
2270} 2276}
2277
2278static DEFINE_MUTEX(mm_all_locks_mutex);
2279
2280static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2281{
2282 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2283 /*
2284 * The LSB of head.next can't change from under us
2285 * because we hold the mm_all_locks_mutex.
2286 */
2287 spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
2288 /*
2289 * We can safely modify head.next after taking the
2290 * anon_vma->lock. If some other vma in this mm shares
2291 * the same anon_vma we won't take it again.
2292 *
2293 * No need of atomic instructions here, head.next
2294 * can't change from under us thanks to the
2295 * anon_vma->lock.
2296 */
2297 if (__test_and_set_bit(0, (unsigned long *)
2298 &anon_vma->head.next))
2299 BUG();
2300 }
2301}
2302
2303static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2304{
2305 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2306 /*
2307 * AS_MM_ALL_LOCKS can't change from under us because
2308 * we hold the mm_all_locks_mutex.
2309 *
2310 * Operations on ->flags have to be atomic because
2311 * even if AS_MM_ALL_LOCKS is stable thanks to the
2312 * mm_all_locks_mutex, there may be other cpus
2313 * changing other bitflags in parallel to us.
2314 */
2315 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2316 BUG();
2317 spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
2318 }
2319}
2320
2321/*
2322 * This operation locks against the VM for all pte/vma/mm related
2323 * operations that could ever happen on a certain mm. This includes
2324 * vmtruncate, try_to_unmap, and all page faults.
2325 *
2326 * The caller must take the mmap_sem in write mode before calling
2327 * mm_take_all_locks(). The caller isn't allowed to release the
2328 * mmap_sem until mm_drop_all_locks() returns.
2329 *
2330 * mmap_sem in write mode is required in order to block all operations
2331 * that could modify pagetables and free pages without need of
2332 * altering the vma layout (for example populate_range() with
2334 * nonlinear vmas). It's also needed in write mode to prevent new
2335 * anon_vmas from being associated with existing vmas.
2335 *
2336 * A single task can't take more than one mm_take_all_locks() in a row
2337 * or it would deadlock.
2338 *
2339 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
2340 * mapping->flags avoid taking the same lock twice, if more than one
2341 * vma in this mm is backed by the same anon_vma or address_space.
2342 *
2343 * We can take all the locks in random order because the VM code
2344 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
2345 * takes more than one of them in a row. Secondly we're protected
2346 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2347 *
2348 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
2349 * that may have to take thousands of locks.
2350 *
2351 * mm_take_all_locks() can fail if it's interrupted by signals.
2352 */
2353int mm_take_all_locks(struct mm_struct *mm)
2354{
2355 struct vm_area_struct *vma;
2356 int ret = -EINTR;
2357
2358 BUG_ON(down_read_trylock(&mm->mmap_sem));
2359
2360 mutex_lock(&mm_all_locks_mutex);
2361
2362 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2363 if (signal_pending(current))
2364 goto out_unlock;
2365 if (vma->vm_file && vma->vm_file->f_mapping)
2366 vm_lock_mapping(mm, vma->vm_file->f_mapping);
2367 }
2368
2369 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2370 if (signal_pending(current))
2371 goto out_unlock;
2372 if (vma->anon_vma)
2373 vm_lock_anon_vma(mm, vma->anon_vma);
2374 }
2375
2376 ret = 0;
2377
2378out_unlock:
2379 if (ret)
2380 mm_drop_all_locks(mm);
2381
2382 return ret;
2383}
2384
2385static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2386{
2387 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2388 /*
2389 * The LSB of head.next can't change to 0 from under
2390 * us because we hold the mm_all_locks_mutex.
2391 *
2392 * We must however clear the bitflag before unlocking
2393 * the vma so the users using the anon_vma->head will
2394 * never see our bitflag.
2395 *
2396 * No need of atomic instructions here, head.next
2397 * can't change from under us until we release the
2398 * anon_vma->lock.
2399 */
2400 if (!__test_and_clear_bit(0, (unsigned long *)
2401 &anon_vma->head.next))
2402 BUG();
2403 spin_unlock(&anon_vma->lock);
2404 }
2405}
2406
2407static void vm_unlock_mapping(struct address_space *mapping)
2408{
2409 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2410 /*
2411 * AS_MM_ALL_LOCKS can't change to 0 from under us
2412 * because we hold the mm_all_locks_mutex.
2413 */
2414 spin_unlock(&mapping->i_mmap_lock);
2415 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2416 &mapping->flags))
2417 BUG();
2418 }
2419}
2420
2421/*
2422 * The mmap_sem cannot be released by the caller until
2423 * mm_drop_all_locks() returns.
2424 */
2425void mm_drop_all_locks(struct mm_struct *mm)
2426{
2427 struct vm_area_struct *vma;
2428
2429 BUG_ON(down_read_trylock(&mm->mmap_sem));
2430 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2431
2432 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2433 if (vma->anon_vma)
2434 vm_unlock_anon_vma(vma->anon_vma);
2435 if (vma->vm_file && vma->vm_file->f_mapping)
2436 vm_unlock_mapping(vma->vm_file->f_mapping);
2437 }
2438
2439 mutex_unlock(&mm_all_locks_mutex);
2440}
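
mm_take_all_locks() exists so a subsystem can quiesce every pte/vma/mm operation on an mm while it publishes new state; the mmu notifier registration in the new file below is its first caller. The calling convention, restated as a sketch (this mirrors do_mmu_notifier_register() below rather than adding any new API):

	down_write(&mm->mmap_sem);
	ret = mm_take_all_locks(mm);	/* -EINTR if a signal is pending */
	if (!ret) {
		/* ... publish state that faults and rmap walks must not race with ... */
		mm_drop_all_locks(mm);
	}
	up_write(&mm->mmap_sem);
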
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 000000000000..5f4ef0250bee
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@
1/*
2 * linux/mm/mmu_notifier.c
3 *
4 * Copyright (C) 2008 Qumranet, Inc.
5 * Copyright (C) 2008 SGI
6 * Christoph Lameter <clameter@sgi.com>
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2. See
9 * the COPYING file in the top-level directory.
10 */
11
12#include <linux/rculist.h>
13#include <linux/mmu_notifier.h>
14#include <linux/module.h>
15#include <linux/mm.h>
16#include <linux/err.h>
17#include <linux/rcupdate.h>
18#include <linux/sched.h>
19
20/*
21 * This function can't run concurrently with mmu_notifier_register
22 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
23 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
24 * in parallel despite there being no task using this mm any more,
25 * through the vmas outside of the exit_mmap context, such as with
26 * vmtruncate. This serializes against mmu_notifier_unregister with
27 * the mmu_notifier_mm->lock in addition to RCU, and it serializes
28 * against the other mmu notifiers with RCU. struct mmu_notifier_mm
29 * can't go away from under us as exit_mmap holds an mm_count pin
30 * itself.
31 */
32void __mmu_notifier_release(struct mm_struct *mm)
33{
34 struct mmu_notifier *mn;
35
36 spin_lock(&mm->mmu_notifier_mm->lock);
37 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
38 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
39 struct mmu_notifier,
40 hlist);
41 /*
42 * We arrived before mmu_notifier_unregister, so
43 * mmu_notifier_unregister will do nothing other than
44 * wait for ->release to finish and then
45 * return.
46 */
47 hlist_del_init_rcu(&mn->hlist);
48 /*
49 * RCU here will block mmu_notifier_unregister until
50 * ->release returns.
51 */
52 rcu_read_lock();
53 spin_unlock(&mm->mmu_notifier_mm->lock);
54 /*
55 * If ->release runs before mmu_notifier_unregister, it
56 * must be handled here, as it's the only way for the
57 * driver to flush all existing sptes and to stop
58 * establishing any more sptes before all the pages in
59 * the mm are freed.
60 */
61 if (mn->ops->release)
62 mn->ops->release(mn, mm);
63 rcu_read_unlock();
64 spin_lock(&mm->mmu_notifier_mm->lock);
65 }
66 spin_unlock(&mm->mmu_notifier_mm->lock);
67
68 /*
69 * synchronize_rcu here prevents mmu_notifier_release from
70 * returning to exit_mmap (which would proceed to free all pages
71 * in the mm) until the ->release method returns, if it was
72 * invoked by mmu_notifier_unregister.
73 *
74 * The mmu_notifier_mm can't go away from under us because one
75 * mm_count reference is held by exit_mmap.
76 */
77 synchronize_rcu();
78}
79
80/*
81 * If the hardware does not support a young bitflag, ->clear_flush_young can
82 * unmap the address and return 1 or 0 depending on whether the mapping
83 * previously existed or not.
84 */
85int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
86 unsigned long address)
87{
88 struct mmu_notifier *mn;
89 struct hlist_node *n;
90 int young = 0;
91
92 rcu_read_lock();
93 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
94 if (mn->ops->clear_flush_young)
95 young |= mn->ops->clear_flush_young(mn, mm, address);
96 }
97 rcu_read_unlock();
98
99 return young;
100}
101
102void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->invalidate_page)
111 mn->ops->invalidate_page(mn, mm, address);
112 }
113 rcu_read_unlock();
114}
115
116void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
117 unsigned long start, unsigned long end)
118{
119 struct mmu_notifier *mn;
120 struct hlist_node *n;
121
122 rcu_read_lock();
123 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
124 if (mn->ops->invalidate_range_start)
125 mn->ops->invalidate_range_start(mn, mm, start, end);
126 }
127 rcu_read_unlock();
128}
129
130void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
131 unsigned long start, unsigned long end)
132{
133 struct mmu_notifier *mn;
134 struct hlist_node *n;
135
136 rcu_read_lock();
137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
138 if (mn->ops->invalidate_range_end)
139 mn->ops->invalidate_range_end(mn, mm, start, end);
140 }
141 rcu_read_unlock();
142}
143
144static int do_mmu_notifier_register(struct mmu_notifier *mn,
145 struct mm_struct *mm,
146 int take_mmap_sem)
147{
148 struct mmu_notifier_mm *mmu_notifier_mm;
149 int ret;
150
151 BUG_ON(atomic_read(&mm->mm_users) <= 0);
152
153 ret = -ENOMEM;
154 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
155 if (unlikely(!mmu_notifier_mm))
156 goto out;
157
158 if (take_mmap_sem)
159 down_write(&mm->mmap_sem);
160 ret = mm_take_all_locks(mm);
161 if (unlikely(ret))
162 goto out_cleanup;
163
164 if (!mm_has_notifiers(mm)) {
165 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
166 spin_lock_init(&mmu_notifier_mm->lock);
167 mm->mmu_notifier_mm = mmu_notifier_mm;
168 mmu_notifier_mm = NULL;
169 }
170 atomic_inc(&mm->mm_count);
171
172 /*
173 * Serialize the update against mmu_notifier_unregister. A
174 * side note: mmu_notifier_release can't run concurrently with
175 * us because we hold the mm_users pin (either implicitly as
176 * current->mm or explicitly with get_task_mm() or similar).
177 * We can't race against any other mmu notifier method either
178 * thanks to mm_take_all_locks().
179 */
180 spin_lock(&mm->mmu_notifier_mm->lock);
181 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
182 spin_unlock(&mm->mmu_notifier_mm->lock);
183
184 mm_drop_all_locks(mm);
185out_cleanup:
186 if (take_mmap_sem)
187 up_write(&mm->mmap_sem);
188 /* kfree() does nothing if mmu_notifier_mm is NULL */
189 kfree(mmu_notifier_mm);
190out:
191 BUG_ON(atomic_read(&mm->mm_users) <= 0);
192 return ret;
193}
194
195/*
196 * Must not hold mmap_sem nor any other VM related lock when calling
197 * this registration function. Must also ensure mm_users can't go down
198 * to zero while this runs to avoid races with mmu_notifier_release,
199 * so mm has to be current->mm or the mm should be pinned safely such
200 * as with get_task_mm(). If the mm is not current->mm, the mm_users
201 * pin should be released by calling mmput after mmu_notifier_register
202 * returns. mmu_notifier_unregister must always be called to
203 * unregister the notifier. mm_count is automatically pinned to allow
204 * mmu_notifier_unregister to safely run at any time later, before or
205 * after exit_mmap. ->release will always be called before exit_mmap
206 * frees the pages.
207 */
208int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
209{
210 return do_mmu_notifier_register(mn, mm, 1);
211}
212EXPORT_SYMBOL_GPL(mmu_notifier_register);
213
214/*
215 * Same as mmu_notifier_register but here the caller must hold the
216 * mmap_sem in write mode.
217 */
218int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
219{
220 return do_mmu_notifier_register(mn, mm, 0);
221}
222EXPORT_SYMBOL_GPL(__mmu_notifier_register);
223
224/* this is called after the last mmu_notifier_unregister() returned */
225void __mmu_notifier_mm_destroy(struct mm_struct *mm)
226{
227 BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
228 kfree(mm->mmu_notifier_mm);
229 mm->mmu_notifier_mm = LIST_POISON1; /* debug */
230}
231
232/*
233 * This releases the mm_count pin automatically and frees the mm
234 * structure if it was the last user of it. It serializes against
235 * running mmu notifiers with RCU and against mmu_notifier_unregister
236 * with the unregister lock + RCU. All sptes must be dropped before
237 * calling mmu_notifier_unregister. ->release or any other notifier
238 * method may be invoked concurrently with mmu_notifier_unregister,
239 * and only after mmu_notifier_unregister has returned are we
240 * guaranteed that ->release or any other method can't run anymore.
241 */
242void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
243{
244 BUG_ON(atomic_read(&mm->mm_count) <= 0);
245
246 spin_lock(&mm->mmu_notifier_mm->lock);
247 if (!hlist_unhashed(&mn->hlist)) {
248 hlist_del_rcu(&mn->hlist);
249
250 /*
251 * RCU here will force exit_mmap to wait for ->release to finish
252 * before freeing the pages.
253 */
254 rcu_read_lock();
255 spin_unlock(&mm->mmu_notifier_mm->lock);
256 /*
257 * exit_mmap will block in mmu_notifier_release to
258 * guarantee ->release is called before freeing the
259 * pages.
260 */
261 if (mn->ops->release)
262 mn->ops->release(mn, mm);
263 rcu_read_unlock();
264 } else
265 spin_unlock(&mm->mmu_notifier_mm->lock);
266
267 /*
268 * Wait for any running method to finish, of course including
269 * ->release if it was run by mmu_notifier_release instead of us.
270 */
271 synchronize_rcu();
272
273 BUG_ON(atomic_read(&mm->mm_count) <= 0);
274
275 mmdrop(mm);
276}
277EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
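
For context, a client of this new file supplies a struct mmu_notifier whose ops the hooks above invoke. A minimal, hypothetical registration; the ops structure and the register call match this patch, while my_flush_secondary_tlb() stands in for whatever the driver uses to drop its sptes:

	static void my_invalidate_page(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
	{
		my_flush_secondary_tlb(mn, address);	/* hypothetical helper */
	}

	static const struct mmu_notifier_ops my_ops = {
		.invalidate_page = my_invalidate_page,
	};

	static struct mmu_notifier my_mn = { .ops = &my_ops };

	/* current->mm is implicitly pinned; no VM locks may be held here. */
	err = mmu_notifier_register(&my_mn, current->mm);

The notifier must later be torn down with mmu_notifier_unregister(), which also drops the mm_count pin taken at registration.
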
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 486ed595ee6f..16ce8b955dcf 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
69 (z->zone && !zref_in_nodemask(z, nodes))) 69 (z->zone && !zref_in_nodemask(z, nodes)))
70 z++; 70 z++;
71 71
72 *zone = zonelist_zone(z++); 72 *zone = zonelist_zone(z);
73 return z; 73 return z;
74} 74}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index abd645a3b0a0..fded06f923f4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include <asm/pgtable.h> 26#include <asm/pgtable.h>
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
@@ -203,10 +204,12 @@ success:
203 dirty_accountable = 1; 204 dirty_accountable = 1;
204 } 205 }
205 206
207 mmu_notifier_invalidate_range_start(mm, start, end);
206 if (is_vm_hugetlb_page(vma)) 208 if (is_vm_hugetlb_page(vma))
207 hugetlb_change_protection(vma, start, end, vma->vm_page_prot); 209 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
208 else 210 else
209 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); 211 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
212 mmu_notifier_invalidate_range_end(mm, start, end);
210 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 213 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
211 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 214 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
212 return 0; 215 return 0;
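
This is the same bracketing pattern the patch applies in memory.c above and mremap.c below: every batched pte modification is wrapped as

	mmu_notifier_invalidate_range_start(mm, start, end);
	/* ... change or remove ptes in [start, end) ... */
	mmu_notifier_invalidate_range_end(mm, start, end);

so a secondary MMU must drop its mappings for the range at _start and may not re-establish them until _end, which keeps batched teardown correct without a notifier call per pte.
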
diff --git a/mm/mremap.c b/mm/mremap.c
index 08e3c7f2bd15..1a7743923c8c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,6 +18,7 @@
18#include <linux/highmem.h> 18#include <linux/highmem.h>
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/mmu_notifier.h>
21 22
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/cacheflush.h> 24#include <asm/cacheflush.h>
@@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
74 struct mm_struct *mm = vma->vm_mm; 75 struct mm_struct *mm = vma->vm_mm;
75 pte_t *old_pte, *new_pte, pte; 76 pte_t *old_pte, *new_pte, pte;
76 spinlock_t *old_ptl, *new_ptl; 77 spinlock_t *old_ptl, *new_ptl;
78 unsigned long old_start;
77 79
80 old_start = old_addr;
81 mmu_notifier_invalidate_range_start(vma->vm_mm,
82 old_start, old_end);
78 if (vma->vm_file) { 83 if (vma->vm_file) {
79 /* 84 /*
80 * Subtle point from Rajesh Venkatasubramanian: before 85 * Subtle point from Rajesh Venkatasubramanian: before
@@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
116 pte_unmap_unlock(old_pte - 1, old_ptl); 121 pte_unmap_unlock(old_pte - 1, old_ptl);
117 if (mapping) 122 if (mapping)
118 spin_unlock(&mapping->i_mmap_lock); 123 spin_unlock(&mapping->i_mmap_lock);
124 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
119} 125}
120 126
121#define LATENCY_LIMIT (64 * PAGE_SIZE) 127#define LATENCY_LIMIT (64 * PAGE_SIZE)
diff --git a/mm/nommu.c b/mm/nommu.c
index 4462b6a3fcb9..ed75bc962fbe 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,7 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/ptrace.h> 25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
@@ -266,6 +266,27 @@ void *vmalloc_node(unsigned long size, int node)
266} 266}
267EXPORT_SYMBOL(vmalloc_node); 267EXPORT_SYMBOL(vmalloc_node);
268 268
269#ifndef PAGE_KERNEL_EXEC
270# define PAGE_KERNEL_EXEC PAGE_KERNEL
271#endif
272
273/**
274 * vmalloc_exec - allocate virtually contiguous, executable memory
275 * @size: allocation size
276 *
277 * Kernel-internal function to allocate enough pages to cover @size
278 * from the page level allocator and map them into contiguous and
279 * executable kernel virtual space.
280 *
281 * For tight control over page level allocator and protection flags
282 * use __vmalloc() instead.
283 */
284
285void *vmalloc_exec(unsigned long size)
286{
287 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
288}
289
269/** 290/**
270 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 291 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
271 * @size: allocation size 292 * @size: allocation size
@@ -745,7 +766,7 @@ static unsigned long determine_vm_flags(struct file *file,
745 * it's being traced - otherwise breakpoints set in it may interfere 766 * it's being traced - otherwise breakpoints set in it may interfere
746 * with another untraced process 767 * with another untraced process
747 */ 768 */
748 if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) 769 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
749 vm_flags &= ~VM_MAYSHARE; 770 vm_flags &= ~VM_MAYSHARE;
750 771
751 return vm_flags; 772 return vm_flags;
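
The nommu vmalloc_exec() is simply __vmalloc() with PAGE_KERNEL_EXEC protections, mirroring the MMU version in mm/vmalloc.c. An illustrative caller (the image and len inputs are assumptions) would be a loader placing generated or relocated code:

	void *code = vmalloc_exec(len);
	if (!code)
		return -ENOMEM;
	memcpy(code, image, len);
	flush_icache_range((unsigned long)code, (unsigned long)code + len);
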
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8a5467ee6265..64e5b4bcd964 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/notifier.h> 27#include <linux/notifier.h>
28#include <linux/memcontrol.h> 28#include <linux/memcontrol.h>
29#include <linux/security.h>
29 30
30int sysctl_panic_on_oom; 31int sysctl_panic_on_oom;
31int sysctl_oom_kill_allocating_task; 32int sysctl_oom_kill_allocating_task;
@@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
128 * Superuser processes are usually more important, so we make it 129 * Superuser processes are usually more important, so we make it
129 * less likely that we kill those. 130 * less likely that we kill those.
130 */ 131 */
131 if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) 132 if (has_capability(p, CAP_SYS_ADMIN) ||
133 has_capability(p, CAP_SYS_RESOURCE))
132 points /= 4; 134 points /= 4;
133 135
134 /* 136 /*
@@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
137 * tend to only have this flag set on applications they think 139 * tend to only have this flag set on applications they think
138 * of as important. 140 * of as important.
139 */ 141 */
140 if (__capable(p, CAP_SYS_RAWIO)) 142 if (has_capability(p, CAP_SYS_RAWIO))
141 points /= 4; 143 points /= 4;
142 144
143 /* 145 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 94c6d8988ab3..24de8b65fdbd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1088 if (!mapping) 1088 if (!mapping)
1089 return 1; 1089 return 1;
1090 1090
1091 write_lock_irq(&mapping->tree_lock); 1091 spin_lock_irq(&mapping->tree_lock);
1092 mapping2 = page_mapping(page); 1092 mapping2 = page_mapping(page);
1093 if (mapping2) { /* Race with truncate? */ 1093 if (mapping2) { /* Race with truncate? */
1094 BUG_ON(mapping2 != mapping); 1094 BUG_ON(mapping2 != mapping);
@@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1102 radix_tree_tag_set(&mapping->page_tree, 1102 radix_tree_tag_set(&mapping->page_tree,
1103 page_index(page), PAGECACHE_TAG_DIRTY); 1103 page_index(page), PAGECACHE_TAG_DIRTY);
1104 } 1104 }
1105 write_unlock_irq(&mapping->tree_lock); 1105 spin_unlock_irq(&mapping->tree_lock);
1106 if (mapping->host) { 1106 if (mapping->host) {
1107 /* !PageAnon && !swapper_space */ 1107 /* !PageAnon && !swapper_space */
1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page)
1258 struct backing_dev_info *bdi = mapping->backing_dev_info; 1258 struct backing_dev_info *bdi = mapping->backing_dev_info;
1259 unsigned long flags; 1259 unsigned long flags;
1260 1260
1261 write_lock_irqsave(&mapping->tree_lock, flags); 1261 spin_lock_irqsave(&mapping->tree_lock, flags);
1262 ret = TestClearPageWriteback(page); 1262 ret = TestClearPageWriteback(page);
1263 if (ret) { 1263 if (ret) {
1264 radix_tree_tag_clear(&mapping->page_tree, 1264 radix_tree_tag_clear(&mapping->page_tree,
@@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page)
1269 __bdi_writeout_inc(bdi); 1269 __bdi_writeout_inc(bdi);
1270 } 1270 }
1271 } 1271 }
1272 write_unlock_irqrestore(&mapping->tree_lock, flags); 1272 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1273 } else { 1273 } else {
1274 ret = TestClearPageWriteback(page); 1274 ret = TestClearPageWriteback(page);
1275 } 1275 }
@@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page)
1287 struct backing_dev_info *bdi = mapping->backing_dev_info; 1287 struct backing_dev_info *bdi = mapping->backing_dev_info;
1288 unsigned long flags; 1288 unsigned long flags;
1289 1289
1290 write_lock_irqsave(&mapping->tree_lock, flags); 1290 spin_lock_irqsave(&mapping->tree_lock, flags);
1291 ret = TestSetPageWriteback(page); 1291 ret = TestSetPageWriteback(page);
1292 if (!ret) { 1292 if (!ret) {
1293 radix_tree_tag_set(&mapping->page_tree, 1293 radix_tree_tag_set(&mapping->page_tree,
@@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page)
1300 radix_tree_tag_clear(&mapping->page_tree, 1300 radix_tree_tag_clear(&mapping->page_tree,
1301 page_index(page), 1301 page_index(page),
1302 PAGECACHE_TAG_DIRTY); 1302 PAGECACHE_TAG_DIRTY);
1303 write_unlock_irqrestore(&mapping->tree_lock, flags); 1303 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1304 } else { 1304 } else {
1305 ret = TestSetPageWriteback(page); 1305 ret = TestSetPageWriteback(page);
1306 } 1306 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6da667274df5..27b8681139fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -268,13 +268,14 @@ void prep_compound_page(struct page *page, unsigned long order)
268{ 268{
269 int i; 269 int i;
270 int nr_pages = 1 << order; 270 int nr_pages = 1 << order;
271 struct page *p = page + 1;
271 272
272 set_compound_page_dtor(page, free_compound_page); 273 set_compound_page_dtor(page, free_compound_page);
273 set_compound_order(page, order); 274 set_compound_order(page, order);
274 __SetPageHead(page); 275 __SetPageHead(page);
275 for (i = 1; i < nr_pages; i++) { 276 for (i = 1; i < nr_pages; i++, p++) {
276 struct page *p = page + i; 277 if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
277 278 p = pfn_to_page(page_to_pfn(page) + i);
278 __SetPageTail(p); 279 __SetPageTail(p);
279 p->first_page = page; 280 p->first_page = page;
280 } 281 }
@@ -284,6 +285,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
284{ 285{
285 int i; 286 int i;
286 int nr_pages = 1 << order; 287 int nr_pages = 1 << order;
288 struct page *p = page + 1;
287 289
288 if (unlikely(compound_order(page) != order)) 290 if (unlikely(compound_order(page) != order))
289 bad_page(page); 291 bad_page(page);
@@ -291,8 +293,9 @@ static void destroy_compound_page(struct page *page, unsigned long order)
291 if (unlikely(!PageHead(page))) 293 if (unlikely(!PageHead(page)))
292 bad_page(page); 294 bad_page(page);
293 __ClearPageHead(page); 295 __ClearPageHead(page);
294 for (i = 1; i < nr_pages; i++) { 296 for (i = 1; i < nr_pages; i++, p++) {
295 struct page *p = page + i; 297 if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
298 p = pfn_to_page(page_to_pfn(page) + i);
296 299
297 if (unlikely(!PageTail(p) | 300 if (unlikely(!PageTail(p) |
298 (p->first_page != page))) 301 (p->first_page != page)))
@@ -694,6 +697,9 @@ static int move_freepages(struct zone *zone,
694#endif 697#endif
695 698
696 for (page = start_page; page <= end_page;) { 699 for (page = start_page; page <= end_page;) {
700 /* Make sure we are not inadvertently changing nodes */
701 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
702
697 if (!pfn_valid_within(page_to_pfn(page))) { 703 if (!pfn_valid_within(page_to_pfn(page))) {
698 page++; 704 page++;
699 continue; 705 continue;
@@ -2372,7 +2378,7 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2372 2378
2373#endif /* CONFIG_NUMA */ 2379#endif /* CONFIG_NUMA */
2374 2380
2375/* return values int ....just for stop_machine_run() */ 2381/* the return value is int just for the sake of stop_machine() */
2376static int __build_all_zonelists(void *dummy) 2382static int __build_all_zonelists(void *dummy)
2377{ 2383{
2378 int nid; 2384 int nid;
@@ -2397,7 +2403,7 @@ void build_all_zonelists(void)
2397 } else { 2403 } else {
2398 /* we have to stop all cpus to guarantee there is no user 2404 /* we have to stop all cpus to guarantee there is no user
2399 of zonelist */ 2405 of zonelist */
2400 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2406 stop_machine(__build_all_zonelists, NULL, NULL);
2401 /* cpuset refresh routine should be here */ 2407 /* cpuset refresh routine should be here */
2402 } 2408 }
2403 vm_total_pages = nr_free_pagecache_pages(); 2409 vm_total_pages = nr_free_pagecache_pages();
@@ -2516,6 +2522,10 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2516 continue; 2522 continue;
2517 page = pfn_to_page(pfn); 2523 page = pfn_to_page(pfn);
2518 2524
2525 /* Watch out for overlapping nodes */
2526 if (page_to_nid(page) != zone_to_nid(zone))
2527 continue;
2528
2519 /* Blocks with reserved pages will never free, skip them. */ 2529 /* Blocks with reserved pages will never free, skip them. */
2520 if (PageReserved(page)) 2530 if (PageReserved(page))
2521 continue; 2531 continue;
@@ -3753,23 +3763,6 @@ unsigned long __init find_min_pfn_with_active_regions(void)
3753 return find_min_pfn_for_node(MAX_NUMNODES); 3763 return find_min_pfn_for_node(MAX_NUMNODES);
3754} 3764}
3755 3765
3756/**
3757 * find_max_pfn_with_active_regions - Find the maximum PFN registered
3758 *
3759 * It returns the maximum PFN based on information provided via
3760 * add_active_range().
3761 */
3762unsigned long __init find_max_pfn_with_active_regions(void)
3763{
3764 int i;
3765 unsigned long max_pfn = 0;
3766
3767 for (i = 0; i < nr_nodemap_entries; i++)
3768 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
3769
3770 return max_pfn;
3771}
3772
3773/* 3766/*
3774 * early_calculate_totalpages() 3767 * early_calculate_totalpages()
3775 * Sum pages in active regions for movable zone. 3768 * Sum pages in active regions for movable zone.
@@ -4081,7 +4074,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4081} 4074}
4082 4075
4083#ifndef CONFIG_NEED_MULTIPLE_NODES 4076#ifndef CONFIG_NEED_MULTIPLE_NODES
4084struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] }; 4077struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
4085EXPORT_SYMBOL(contig_page_data); 4078EXPORT_SYMBOL(contig_page_data);
4086#endif 4079#endif
4087 4080
@@ -4454,7 +4447,7 @@ void *__init alloc_large_system_hash(const char *tablename,
4454 do { 4447 do {
4455 size = bucketsize << log2qty; 4448 size = bucketsize << log2qty;
4456 if (flags & HASH_EARLY) 4449 if (flags & HASH_EARLY)
4457 table = alloc_bootmem(size); 4450 table = alloc_bootmem_nopanic(size);
4458 else if (hashdist) 4451 else if (hashdist)
4459 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4452 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4460 else { 4453 else {
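
The prep_compound_page()/destroy_compound_page() hunks fix a subtle pointer-arithmetic assumption: with SPARSEMEM (and no VMEMMAP) the memmap is only virtually contiguous within one MAX_ORDER block, so the tail-page pointer must be re-derived from the pfn whenever the loop index crosses a MAX_ORDER_NR_PAGES boundary:

	/* Wrong once 'i' leaves the head page's MAX_ORDER block:
	 *	struct page *p = page + i;
	 * Safe, because pfns stay linear even when the memmap does not:
	 */
	struct page *p = pfn_to_page(page_to_pfn(page) + i);

The loops above only pay the pfn_to_page() cost at block boundaries and keep the plain p++ everywhere else.
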
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 3444b58033c8..b70a7fec1ff6 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -2,7 +2,6 @@
2 * linux/mm/page_isolation.c 2 * linux/mm/page_isolation.c
3 */ 3 */
4 4
5#include <stddef.h>
6#include <linux/mm.h> 5#include <linux/mm.h>
7#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
8#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
@@ -115,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
115 114
116int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 115int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
117{ 116{
118 unsigned long pfn; 117 unsigned long pfn, flags;
119 struct page *page; 118 struct page *page;
119 struct zone *zone;
120 int ret;
120 121
121 pfn = start_pfn; 122 pfn = start_pfn;
122 /* 123 /*
@@ -132,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
132 if (pfn < end_pfn) 133 if (pfn < end_pfn)
133 return -EBUSY; 134 return -EBUSY;
134 /* Check all pages are free or Marked as ISOLATED */ 135 /* Check all pages are free or Marked as ISOLATED */
135 if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) 136 zone = page_zone(pfn_to_page(pfn));
136 return 0; 137 spin_lock_irqsave(&zone->lock, flags);
137 return -EBUSY; 138 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
139 spin_unlock_irqrestore(&zone->lock, flags);
140 return ret ? 0 : -EBUSY;
138} 141}
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 3f703f7cb398..8dbb6805ef35 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
26static unsigned long max_pages(unsigned long min_pages) 26static unsigned long max_pages(unsigned long min_pages)
27{ 27{
28 unsigned long node_free_pages, max; 28 unsigned long node_free_pages, max;
29 struct zone *zones = NODE_DATA(numa_node_id())->node_zones; 29 int node = numa_node_id();
30 struct zone *zones = NODE_DATA(node)->node_zones;
31 int num_cpus_on_node;
32 node_to_cpumask_ptr(cpumask_on_node, node);
30 33
31 node_free_pages = 34 node_free_pages =
32#ifdef CONFIG_ZONE_DMA 35#ifdef CONFIG_ZONE_DMA
@@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages)
38 zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); 41 zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
39 42
40 max = node_free_pages / FRACTION_OF_NODE_MEM; 43 max = node_free_pages / FRACTION_OF_NODE_MEM;
44
45 num_cpus_on_node = cpus_weight_nr(*cpumask_on_node);
46 max /= num_cpus_on_node;
47
41 return max(max, min_pages); 48 return max(max, min_pages);
42} 49}
43 50
diff --git a/mm/readahead.c b/mm/readahead.c
index d8723a5f6496..77e8ddf945e9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping,
382 if (hit_readahead_marker) { 382 if (hit_readahead_marker) {
383 pgoff_t start; 383 pgoff_t start;
384 384
385 read_lock_irq(&mapping->tree_lock); 385 rcu_read_lock();
 386 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); 386 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1);
387 read_unlock_irq(&mapping->tree_lock); 387 rcu_read_unlock();
388 388
389 if (!start || start - offset > max) 389 if (!start || start - offset > max)
390 return 0; 390 return 0;
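
Dropping tree_lock in favour of rcu_read_lock() is safe here because radix_tree_next_hole() only performs RCU-protected lookups and never dereferences the pages it finds; a result made stale by a concurrent insertion merely yields a slightly off readahead decision. The same shape as a sketch, with the caveat spelled out:

	rcu_read_lock();
	start = radix_tree_next_hole(&mapping->page_tree, offset, max + 1);
	rcu_read_unlock();
	/* 'start' may already be stale - harmless, since it only tunes
	 * the readahead window and no page is ever touched through it. */
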
diff --git a/mm/rmap.c b/mm/rmap.c
index abbd29f7c43f..0383acfcb068 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h> 50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 51#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
54 55
@@ -138,7 +139,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
138 anon_vma_free(anon_vma); 139 anon_vma_free(anon_vma);
139} 140}
140 141
141static void anon_vma_ctor(struct kmem_cache *cachep, void *data) 142static void anon_vma_ctor(void *data)
142{ 143{
143 struct anon_vma *anon_vma = data; 144 struct anon_vma *anon_vma = data;
144 145
@@ -223,10 +224,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
223/* 224/*
224 * Check that @page is mapped at @address into @mm. 225 * Check that @page is mapped at @address into @mm.
225 * 226 *
227 * If @sync is false, page_check_address may perform a racy check to avoid
228 * the page table lock when the pte is not present (helpful when reclaiming
229 * highly shared pages).
230 *
226 * On success returns with pte mapped and locked. 231 * On success returns with pte mapped and locked.
227 */ 232 */
228pte_t *page_check_address(struct page *page, struct mm_struct *mm, 233pte_t *page_check_address(struct page *page, struct mm_struct *mm,
229 unsigned long address, spinlock_t **ptlp) 234 unsigned long address, spinlock_t **ptlp, int sync)
230{ 235{
231 pgd_t *pgd; 236 pgd_t *pgd;
232 pud_t *pud; 237 pud_t *pud;
@@ -248,7 +253,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
248 253
249 pte = pte_offset_map(pmd, address); 254 pte = pte_offset_map(pmd, address);
250 /* Make a quick check before getting the lock */ 255 /* Make a quick check before getting the lock */
251 if (!pte_present(*pte)) { 256 if (!sync && !pte_present(*pte)) {
252 pte_unmap(pte); 257 pte_unmap(pte);
253 return NULL; 258 return NULL;
254 } 259 }
@@ -280,14 +285,14 @@ static int page_referenced_one(struct page *page,
280 if (address == -EFAULT) 285 if (address == -EFAULT)
281 goto out; 286 goto out;
282 287
283 pte = page_check_address(page, mm, address, &ptl); 288 pte = page_check_address(page, mm, address, &ptl, 0);
284 if (!pte) 289 if (!pte)
285 goto out; 290 goto out;
286 291
287 if (vma->vm_flags & VM_LOCKED) { 292 if (vma->vm_flags & VM_LOCKED) {
288 referenced++; 293 referenced++;
289 *mapcount = 1; /* break early from loop */ 294 *mapcount = 1; /* break early from loop */
290 } else if (ptep_clear_flush_young(vma, address, pte)) 295 } else if (ptep_clear_flush_young_notify(vma, address, pte))
291 referenced++; 296 referenced++;
292 297
293 /* Pretend the page is referenced if the task has the 298 /* Pretend the page is referenced if the task has the
@@ -421,7 +426,7 @@ int page_referenced(struct page *page, int is_locked,
421 referenced += page_referenced_anon(page, mem_cont); 426 referenced += page_referenced_anon(page, mem_cont);
422 else if (is_locked) 427 else if (is_locked)
423 referenced += page_referenced_file(page, mem_cont); 428 referenced += page_referenced_file(page, mem_cont);
424 else if (TestSetPageLocked(page)) 429 else if (!trylock_page(page))
425 referenced++; 430 referenced++;
426 else { 431 else {
427 if (page->mapping) 432 if (page->mapping)
@@ -449,7 +454,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
449 if (address == -EFAULT) 454 if (address == -EFAULT)
450 goto out; 455 goto out;
451 456
452 pte = page_check_address(page, mm, address, &ptl); 457 pte = page_check_address(page, mm, address, &ptl, 1);
453 if (!pte) 458 if (!pte)
454 goto out; 459 goto out;
455 460
@@ -457,7 +462,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
457 pte_t entry; 462 pte_t entry;
458 463
459 flush_cache_page(vma, address, pte_pfn(*pte)); 464 flush_cache_page(vma, address, pte_pfn(*pte));
460 entry = ptep_clear_flush(vma, address, pte); 465 entry = ptep_clear_flush_notify(vma, address, pte);
461 entry = pte_wrprotect(entry); 466 entry = pte_wrprotect(entry);
462 entry = pte_mkclean(entry); 467 entry = pte_mkclean(entry);
463 set_pte_at(mm, address, pte, entry); 468 set_pte_at(mm, address, pte, entry);
@@ -658,6 +663,22 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
658 } 663 }
659 664
660 /* 665 /*
666 * Now that the last pte has gone, s390 must transfer dirty
 667 * flag from storage key to struct page. We can usually skip
 668 * this if the page is anon and so about to be freed; but perhaps
 669 * not if it's in swapcache - there might be another pte slot
 670 * containing the swap entry, but the page not yet written to swap.
671 */
672 if ((!PageAnon(page) || PageSwapCache(page)) &&
673 page_test_dirty(page)) {
674 page_clear_dirty(page);
675 set_page_dirty(page);
676 }
677
678 mem_cgroup_uncharge_page(page);
679 __dec_zone_page_state(page,
680 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
681 /*
661 * It would be tidy to reset the PageAnon mapping here, 682 * It would be tidy to reset the PageAnon mapping here,
662 * but that might overwrite a racing page_add_anon_rmap 683 * but that might overwrite a racing page_add_anon_rmap
663 * which increments mapcount after us but sets mapping 684 * which increments mapcount after us but sets mapping
@@ -666,14 +687,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
666 * Leaving it set also helps swapoff to reinstate ptes 687 * Leaving it set also helps swapoff to reinstate ptes
667 * faster for those pages still in swapcache. 688 * faster for those pages still in swapcache.
668 */ 689 */
669 if (page_test_dirty(page)) {
670 page_clear_dirty(page);
671 set_page_dirty(page);
672 }
673 mem_cgroup_uncharge_page(page);
674
675 __dec_zone_page_state(page,
676 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
677 } 690 }
678} 691}
679 692
@@ -695,7 +708,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
695 if (address == -EFAULT) 708 if (address == -EFAULT)
696 goto out; 709 goto out;
697 710
698 pte = page_check_address(page, mm, address, &ptl); 711 pte = page_check_address(page, mm, address, &ptl, 0);
699 if (!pte) 712 if (!pte)
700 goto out; 713 goto out;
701 714
@@ -705,14 +718,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
705 * skipped over this mm) then we should reactivate it. 718 * skipped over this mm) then we should reactivate it.
706 */ 719 */
707 if (!migration && ((vma->vm_flags & VM_LOCKED) || 720 if (!migration && ((vma->vm_flags & VM_LOCKED) ||
708 (ptep_clear_flush_young(vma, address, pte)))) { 721 (ptep_clear_flush_young_notify(vma, address, pte)))) {
709 ret = SWAP_FAIL; 722 ret = SWAP_FAIL;
710 goto out_unmap; 723 goto out_unmap;
711 } 724 }
712 725
713 /* Nuke the page table entry. */ 726 /* Nuke the page table entry. */
714 flush_cache_page(vma, address, page_to_pfn(page)); 727 flush_cache_page(vma, address, page_to_pfn(page));
715 pteval = ptep_clear_flush(vma, address, pte); 728 pteval = ptep_clear_flush_notify(vma, address, pte);
716 729
717 /* Move the dirty bit to the physical page now the pte is gone. */ 730 /* Move the dirty bit to the physical page now the pte is gone. */
718 if (pte_dirty(pteval)) 731 if (pte_dirty(pteval))
@@ -837,12 +850,12 @@ static void try_to_unmap_cluster(unsigned long cursor,
837 page = vm_normal_page(vma, address, *pte); 850 page = vm_normal_page(vma, address, *pte);
838 BUG_ON(!page || PageAnon(page)); 851 BUG_ON(!page || PageAnon(page));
839 852
840 if (ptep_clear_flush_young(vma, address, pte)) 853 if (ptep_clear_flush_young_notify(vma, address, pte))
841 continue; 854 continue;
842 855
843 /* Nuke the page table entry. */ 856 /* Nuke the page table entry. */
844 flush_cache_page(vma, address, pte_pfn(*pte)); 857 flush_cache_page(vma, address, pte_pfn(*pte));
845 pteval = ptep_clear_flush(vma, address, pte); 858 pteval = ptep_clear_flush_notify(vma, address, pte);
846 859
847 /* If nonlinear, store the file page offset in the pte. */ 860 /* If nonlinear, store the file page offset in the pte. */
848 if (page->index != linear_page_index(vma, address)) 861 if (page->index != linear_page_index(vma, address))
diff --git a/mm/shmem.c b/mm/shmem.c
index f92fea94d037..04fb4f1ab88e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -936,7 +936,7 @@ found:
936 spin_lock(&info->lock); 936 spin_lock(&info->lock);
937 ptr = shmem_swp_entry(info, idx, NULL); 937 ptr = shmem_swp_entry(info, idx, NULL);
938 if (ptr && ptr->val == entry.val) { 938 if (ptr && ptr->val == entry.val) {
939 error = add_to_page_cache(page, inode->i_mapping, 939 error = add_to_page_cache_locked(page, inode->i_mapping,
940 idx, GFP_NOWAIT); 940 idx, GFP_NOWAIT);
941 /* does mem_cgroup_uncharge_cache_page on error */ 941 /* does mem_cgroup_uncharge_cache_page on error */
942 } else /* we must compensate for our precharge above */ 942 } else /* we must compensate for our precharge above */
@@ -1265,7 +1265,7 @@ repeat:
1265 } 1265 }
1266 1266
1267 /* We have to do this with page locked to prevent races */ 1267 /* We have to do this with page locked to prevent races */
1268 if (TestSetPageLocked(swappage)) { 1268 if (!trylock_page(swappage)) {
1269 shmem_swp_unmap(entry); 1269 shmem_swp_unmap(entry);
1270 spin_unlock(&info->lock); 1270 spin_unlock(&info->lock);
1271 wait_on_page_locked(swappage); 1271 wait_on_page_locked(swappage);
@@ -1301,8 +1301,8 @@ repeat:
1301 SetPageUptodate(filepage); 1301 SetPageUptodate(filepage);
1302 set_page_dirty(filepage); 1302 set_page_dirty(filepage);
1303 swap_free(swap); 1303 swap_free(swap);
1304 } else if (!(error = add_to_page_cache( 1304 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1305 swappage, mapping, idx, GFP_NOWAIT))) { 1305 idx, GFP_NOWAIT))) {
1306 info->flags |= SHMEM_PAGEIN; 1306 info->flags |= SHMEM_PAGEIN;
1307 shmem_swp_set(info, entry, 0); 1307 shmem_swp_set(info, entry, 0);
1308 shmem_swp_unmap(entry); 1308 shmem_swp_unmap(entry);
@@ -1329,7 +1329,7 @@ repeat:
1329 shmem_swp_unmap(entry); 1329 shmem_swp_unmap(entry);
1330 filepage = find_get_page(mapping, idx); 1330 filepage = find_get_page(mapping, idx);
1331 if (filepage && 1331 if (filepage &&
1332 (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { 1332 (!PageUptodate(filepage) || !trylock_page(filepage))) {
1333 spin_unlock(&info->lock); 1333 spin_unlock(&info->lock);
1334 wait_on_page_locked(filepage); 1334 wait_on_page_locked(filepage);
1335 page_cache_release(filepage); 1335 page_cache_release(filepage);
@@ -1513,7 +1513,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1513 inode->i_uid = current->fsuid; 1513 inode->i_uid = current->fsuid;
1514 inode->i_gid = current->fsgid; 1514 inode->i_gid = current->fsgid;
1515 inode->i_blocks = 0; 1515 inode->i_blocks = 0;
1516 inode->i_mapping->a_ops = &shmem_aops;
1517 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1516 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1518 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1517 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1519 inode->i_generation = get_seconds(); 1518 inode->i_generation = get_seconds();
@@ -1528,6 +1527,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1528 init_special_inode(inode, mode, dev); 1527 init_special_inode(inode, mode, dev);
1529 break; 1528 break;
1530 case S_IFREG: 1529 case S_IFREG:
1530 inode->i_mapping->a_ops = &shmem_aops;
1531 inode->i_op = &shmem_inode_operations; 1531 inode->i_op = &shmem_inode_operations;
1532 inode->i_fop = &shmem_file_operations; 1532 inode->i_fop = &shmem_file_operations;
1533 mpol_shared_policy_init(&info->policy, 1533 mpol_shared_policy_init(&info->policy,
@@ -1929,6 +1929,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1929 return error; 1929 return error;
1930 } 1930 }
1931 unlock_page(page); 1931 unlock_page(page);
1932 inode->i_mapping->a_ops = &shmem_aops;
1932 inode->i_op = &shmem_symlink_inode_operations; 1933 inode->i_op = &shmem_symlink_inode_operations;
1933 kaddr = kmap_atomic(page, KM_USER0); 1934 kaddr = kmap_atomic(page, KM_USER0);
1934 memcpy(kaddr, symname, len); 1935 memcpy(kaddr, symname, len);
@@ -2352,7 +2353,7 @@ static void shmem_destroy_inode(struct inode *inode)
2352 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2353 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2353} 2354}
2354 2355
2355static void init_once(struct kmem_cache *cachep, void *foo) 2356static void init_once(void *foo)
2356{ 2357{
2357 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2358 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2358 2359
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index f5664c5b9eb1..8e5aadd7dcd6 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask)
191 * shmem_permission - permission() inode operation 191 * shmem_permission - permission() inode operation
192 */ 192 */
193int 193int
194shmem_permission(struct inode *inode, int mask, struct nameidata *nd) 194shmem_permission(struct inode *inode, int mask)
195{ 195{
196 return generic_permission(inode, mask, shmem_check_acl); 196 return generic_permission(inode, mask, shmem_check_acl);
197} 197}
diff --git a/mm/slab.c b/mm/slab.c
index 052e7d64537e..e76eee466886 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,7 +406,7 @@ struct kmem_cache {
406 unsigned int dflags; /* dynamic flags */ 406 unsigned int dflags; /* dynamic flags */
407 407
408 /* constructor func */ 408 /* constructor func */
409 void (*ctor)(struct kmem_cache *, void *); 409 void (*ctor)(void *obj);
410 410
411/* 5) cache creation/removal */ 411/* 5) cache creation/removal */
412 const char *name; 412 const char *name;
@@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2137 */ 2137 */
2138struct kmem_cache * 2138struct kmem_cache *
2139kmem_cache_create (const char *name, size_t size, size_t align, 2139kmem_cache_create (const char *name, size_t size, size_t align,
2140 unsigned long flags, 2140 unsigned long flags, void (*ctor)(void *))
2141 void (*ctor)(struct kmem_cache *, void *))
2142{ 2141{
2143 size_t left_over, slab_size, ralign; 2142 size_t left_over, slab_size, ralign;
2144 struct kmem_cache *cachep = NULL, *pc; 2143 struct kmem_cache *cachep = NULL, *pc;
@@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2653 * They must also be threaded. 2652 * They must also be threaded.
2654 */ 2653 */
2655 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2654 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2656 cachep->ctor(cachep, objp + obj_offset(cachep)); 2655 cachep->ctor(objp + obj_offset(cachep));
2657 2656
2658 if (cachep->flags & SLAB_RED_ZONE) { 2657 if (cachep->flags & SLAB_RED_ZONE) {
2659 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2658 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2669 cachep->buffer_size / PAGE_SIZE, 0); 2668 cachep->buffer_size / PAGE_SIZE, 0);
2670#else 2669#else
2671 if (cachep->ctor) 2670 if (cachep->ctor)
2672 cachep->ctor(cachep, objp); 2671 cachep->ctor(objp);
2673#endif 2672#endif
2674 slab_bufctl(slabp)[i] = i + 1; 2673 slab_bufctl(slabp)[i] = i + 1;
2675 } 2674 }
@@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3093#endif 3092#endif
3094 objp += obj_offset(cachep); 3093 objp += obj_offset(cachep);
3095 if (cachep->ctor && cachep->flags & SLAB_POISON) 3094 if (cachep->ctor && cachep->flags & SLAB_POISON)
3096 cachep->ctor(cachep, objp); 3095 cachep->ctor(objp);
3097#if ARCH_SLAB_MINALIGN 3096#if ARCH_SLAB_MINALIGN
3098 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3097 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3099 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3098 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
@@ -4473,4 +4472,3 @@ size_t ksize(const void *objp)
4473 4472
4474 return obj_size(virt_to_cache(objp)); 4473 return obj_size(virt_to_cache(objp));
4475} 4474}
4476EXPORT_SYMBOL(ksize);
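With this series, slab constructors receive only the object pointer; the struct kmem_cache * argument is gone from every ctor in this diff. A minimal sketch of a cache set up against the new prototype (the struct, names, and flags below are illustrative, not from this patch):

	#include <linux/init.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct foo {
		spinlock_t lock;
		int refcnt;
	};

	static void foo_ctor(void *obj)		/* one argument only now */
	{
		struct foo *f = obj;

		spin_lock_init(&f->lock);
		f->refcnt = 0;
	}

	static struct kmem_cache *foo_cachep;

	static int __init foo_cache_init(void)
	{
		/* signature as changed by this patch */
		foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
					       SLAB_HWCACHE_ALIGN, foo_ctor);
		return foo_cachep ? 0 : -ENOMEM;
	}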
diff --git a/mm/slob.c b/mm/slob.c
index de268eb7ac70..cb675d126791 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -514,23 +514,23 @@ size_t ksize(const void *block)
514 return 0; 514 return 0;
515 515
516 sp = (struct slob_page *)virt_to_page(block); 516 sp = (struct slob_page *)virt_to_page(block);
517 if (slob_page(sp)) 517 if (slob_page(sp)) {
518 return ((slob_t *)block - 1)->units + SLOB_UNIT; 518 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
519 else 519 unsigned int *m = (unsigned int *)(block - align);
520 return SLOB_UNITS(*m) * SLOB_UNIT;
521 } else
520 return sp->page.private; 522 return sp->page.private;
521} 523}
522EXPORT_SYMBOL(ksize);
523 524
524struct kmem_cache { 525struct kmem_cache {
525 unsigned int size, align; 526 unsigned int size, align;
526 unsigned long flags; 527 unsigned long flags;
527 const char *name; 528 const char *name;
528 void (*ctor)(struct kmem_cache *, void *); 529 void (*ctor)(void *);
529}; 530};
530 531
531struct kmem_cache *kmem_cache_create(const char *name, size_t size, 532struct kmem_cache *kmem_cache_create(const char *name, size_t size,
532 size_t align, unsigned long flags, 533 size_t align, unsigned long flags, void (*ctor)(void *))
533 void (*ctor)(struct kmem_cache *, void *))
534{ 534{
535 struct kmem_cache *c; 535 struct kmem_cache *c;
536 536
@@ -575,7 +575,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
575 b = slob_new_page(flags, get_order(c->size), node); 575 b = slob_new_page(flags, get_order(c->size), node);
576 576
577 if (c->ctor) 577 if (c->ctor)
578 c->ctor(c, b); 578 c->ctor(b);
579 579
580 return b; 580 return b;
581} 581}
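The new SLOB ksize() works because slob's kmalloc() stores the requested size in a word just before the pointer it hands out, 'align' bytes back. A toy userspace model of that layout (illustrative only; real SLOB stores the byte size and ksize() rounds it up through SLOB_UNITS):

	#include <stdlib.h>
	#include <string.h>

	#define ALIGN_SZ 8	/* stands in for max(ARCH_KMALLOC_MINALIGN,
				 * ARCH_SLAB_MINALIGN) */

	static void *toy_kmalloc(size_t size)
	{
		char *m = malloc(size + ALIGN_SZ);

		if (!m)
			return NULL;
		memcpy(m, &size, sizeof(size));	/* size prefix, as SLOB does */
		return m + ALIGN_SZ;
	}

	static size_t toy_ksize(const void *block)
	{
		size_t size;

		memcpy(&size, (const char *)block - ALIGN_SZ, sizeof(size));
		return size;
	}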
diff --git a/mm/slub.c b/mm/slub.c
index 77c21cf53ff9..0c83e6afe7b2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1012,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug);
1012 1012
1013static unsigned long kmem_cache_flags(unsigned long objsize, 1013static unsigned long kmem_cache_flags(unsigned long objsize,
1014 unsigned long flags, const char *name, 1014 unsigned long flags, const char *name,
1015 void (*ctor)(struct kmem_cache *, void *)) 1015 void (*ctor)(void *))
1016{ 1016{
1017 /* 1017 /*
1018 * Enable debugging if selected on the kernel commandline. 1018 * Enable debugging if selected on the kernel commandline.
@@ -1040,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1040static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1040static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1041static inline unsigned long kmem_cache_flags(unsigned long objsize, 1041static inline unsigned long kmem_cache_flags(unsigned long objsize,
1042 unsigned long flags, const char *name, 1042 unsigned long flags, const char *name,
1043 void (*ctor)(struct kmem_cache *, void *)) 1043 void (*ctor)(void *))
1044{ 1044{
1045 return flags; 1045 return flags;
1046} 1046}
@@ -1103,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
1103{ 1103{
1104 setup_object_debug(s, page, object); 1104 setup_object_debug(s, page, object);
1105 if (unlikely(s->ctor)) 1105 if (unlikely(s->ctor))
1106 s->ctor(s, object); 1106 s->ctor(object);
1107} 1107}
1108 1108
1109static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1109static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1329,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1329 n = get_node(s, zone_to_nid(zone)); 1329 n = get_node(s, zone_to_nid(zone));
1330 1330
1331 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1331 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1332 n->nr_partial > MIN_PARTIAL) { 1332 n->nr_partial > n->min_partial) {
1333 page = get_partial_node(n); 1333 page = get_partial_node(n);
1334 if (page) 1334 if (page)
1335 return page; 1335 return page;
@@ -1381,7 +1381,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1381 slab_unlock(page); 1381 slab_unlock(page);
1382 } else { 1382 } else {
1383 stat(c, DEACTIVATE_EMPTY); 1383 stat(c, DEACTIVATE_EMPTY);
1384 if (n->nr_partial < MIN_PARTIAL) { 1384 if (n->nr_partial < n->min_partial) {
1385 /* 1385 /*
1386 * Adding an empty slab to the partial slabs in order 1386 * Adding an empty slab to the partial slabs in order
1387 * to avoid page allocator overhead. This slab needs 1387 * to avoid page allocator overhead. This slab needs
@@ -1913,13 +1913,26 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
1913#endif 1913#endif
1914} 1914}
1915 1915
1916static void init_kmem_cache_node(struct kmem_cache_node *n) 1916static void
1917init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1917{ 1918{
1918 n->nr_partial = 0; 1919 n->nr_partial = 0;
1920
1921 /*
1922 * The larger the object size is, the more pages we want on the partial
1923 * list to avoid pounding the page allocator excessively.
1924 */
1925 n->min_partial = ilog2(s->size);
1926 if (n->min_partial < MIN_PARTIAL)
1927 n->min_partial = MIN_PARTIAL;
1928 else if (n->min_partial > MAX_PARTIAL)
1929 n->min_partial = MAX_PARTIAL;
1930
1919 spin_lock_init(&n->list_lock); 1931 spin_lock_init(&n->list_lock);
1920 INIT_LIST_HEAD(&n->partial); 1932 INIT_LIST_HEAD(&n->partial);
1921#ifdef CONFIG_SLUB_DEBUG 1933#ifdef CONFIG_SLUB_DEBUG
1922 atomic_long_set(&n->nr_slabs, 0); 1934 atomic_long_set(&n->nr_slabs, 0);
1935 atomic_long_set(&n->total_objects, 0);
1923 INIT_LIST_HEAD(&n->full); 1936 INIT_LIST_HEAD(&n->full);
1924#endif 1937#endif
1925} 1938}
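The partial-list floor stops being one global constant here: it scales as ilog2() of the object size, clamped between MIN_PARTIAL and MAX_PARTIAL (5 and 10 in slub.c of this vintage; treat those values as an assumption). The same arithmetic as a standalone helper, with the clamp worked through:

	#include <linux/log2.h>

	#define MIN_PARTIAL 5	/* assumed, per contemporary slub.c */
	#define MAX_PARTIAL 10

	static unsigned long min_partial_for(unsigned long object_size)
	{
		unsigned long n = ilog2(object_size);

		if (n < MIN_PARTIAL)
			n = MIN_PARTIAL;	/* 16-byte objects: ilog2 = 4, floored to 5 */
		else if (n > MAX_PARTIAL)
			n = MAX_PARTIAL;	/* 4096-byte objects: ilog2 = 12, capped at 10 */
		return n;			/* 256-byte objects: ilog2 = 8, used as-is */
	}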
@@ -2087,7 +2100,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
2087 init_object(kmalloc_caches, n, 1); 2100 init_object(kmalloc_caches, n, 1);
2088 init_tracking(kmalloc_caches, n); 2101 init_tracking(kmalloc_caches, n);
2089#endif 2102#endif
2090 init_kmem_cache_node(n); 2103 init_kmem_cache_node(n, kmalloc_caches);
2091 inc_slabs_node(kmalloc_caches, node, page->objects); 2104 inc_slabs_node(kmalloc_caches, node, page->objects);
2092 2105
2093 /* 2106 /*
@@ -2144,7 +2157,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2144 2157
2145 } 2158 }
2146 s->node[node] = n; 2159 s->node[node] = n;
2147 init_kmem_cache_node(n); 2160 init_kmem_cache_node(n, s);
2148 } 2161 }
2149 return 1; 2162 return 1;
2150} 2163}
@@ -2155,7 +2168,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
2155 2168
2156static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2169static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2157{ 2170{
2158 init_kmem_cache_node(&s->local_node); 2171 init_kmem_cache_node(&s->local_node, s);
2159 return 1; 2172 return 1;
2160} 2173}
2161#endif 2174#endif
@@ -2286,7 +2299,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2286static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2299static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2287 const char *name, size_t size, 2300 const char *name, size_t size,
2288 size_t align, unsigned long flags, 2301 size_t align, unsigned long flags,
2289 void (*ctor)(struct kmem_cache *, void *)) 2302 void (*ctor)(void *))
2290{ 2303{
2291 memset(s, 0, kmem_size); 2304 memset(s, 0, kmem_size);
2292 s->name = name; 2305 s->name = name;
@@ -2300,7 +2313,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2300 2313
2301 s->refcount = 1; 2314 s->refcount = 1;
2302#ifdef CONFIG_NUMA 2315#ifdef CONFIG_NUMA
2303 s->remote_node_defrag_ratio = 100; 2316 s->remote_node_defrag_ratio = 1000;
2304#endif 2317#endif
2305 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2318 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2306 goto error; 2319 goto error;
@@ -2715,7 +2728,6 @@ size_t ksize(const void *object)
2715 */ 2728 */
2716 return s->size; 2729 return s->size;
2717} 2730}
2718EXPORT_SYMBOL(ksize);
2719 2731
2720void kfree(const void *x) 2732void kfree(const void *x)
2721{ 2733{
@@ -2890,7 +2902,7 @@ static int slab_mem_going_online_callback(void *arg)
2890 ret = -ENOMEM; 2902 ret = -ENOMEM;
2891 goto out; 2903 goto out;
2892 } 2904 }
2893 init_kmem_cache_node(n); 2905 init_kmem_cache_node(n, s);
2894 s->node[nid] = n; 2906 s->node[nid] = n;
2895 } 2907 }
2896out: 2908out:
@@ -3042,7 +3054,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3042 3054
3043static struct kmem_cache *find_mergeable(size_t size, 3055static struct kmem_cache *find_mergeable(size_t size,
3044 size_t align, unsigned long flags, const char *name, 3056 size_t align, unsigned long flags, const char *name,
3045 void (*ctor)(struct kmem_cache *, void *)) 3057 void (*ctor)(void *))
3046{ 3058{
3047 struct kmem_cache *s; 3059 struct kmem_cache *s;
3048 3060
@@ -3082,8 +3094,7 @@ static struct kmem_cache *find_mergeable(size_t size,
3082} 3094}
3083 3095
3084struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3096struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3085 size_t align, unsigned long flags, 3097 size_t align, unsigned long flags, void (*ctor)(void *))
3086 void (*ctor)(struct kmem_cache *, void *))
3087{ 3098{
3088 struct kmem_cache *s; 3099 struct kmem_cache *s;
3089 3100
@@ -4048,7 +4059,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4048 if (err) 4059 if (err)
4049 return err; 4060 return err;
4050 4061
4051 if (ratio < 100) 4062 if (ratio <= 100)
4052 s->remote_node_defrag_ratio = ratio * 10; 4063 s->remote_node_defrag_ratio = ratio * 10;
4053 4064
4054 return length; 4065 return length;
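Two changes in this file move remote_node_defrag_ratio from percent to tenths of a percent: the default becomes 1000 (always try remote nodes) and the sysfs store now accepts 0..100 and scales by 10. A hedged model of how the stored value gates cross-node partial-slab stealing, patterned on the existing check in get_any_partial() (not the verbatim kernel code):

	#include <asm/timex.h>	/* get_cycles() */

	/* ratio_tenths is 0..1000 after this patch; comparing it against a
	 * roughly uniform 0..1023 sample approximates probability
	 * ratio_tenths/1000 of allowing a remote-node allocation */
	static int may_defrag_remote(unsigned int ratio_tenths)
	{
		return ratio_tenths && get_cycles() % 1024 <= ratio_tenths;
	}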
diff --git a/mm/sparse.c b/mm/sparse.c
index 8ffc08990008..39db301b920d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -12,7 +12,6 @@
12#include <asm/dma.h> 12#include <asm/dma.h>
13#include <asm/pgalloc.h> 13#include <asm/pgalloc.h>
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include "internal.h"
16 15
17/* 16/*
18 * Permanent SPARSEMEM data: 17 * Permanent SPARSEMEM data:
@@ -377,7 +376,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
377} 376}
378#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 377#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
379 378
380struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 379static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
381{ 380{
382 struct page *map; 381 struct page *map;
383 struct mem_section *ms = __nr_to_section(pnum); 382 struct mem_section *ms = __nr_to_section(pnum);
diff --git a/mm/swap.c b/mm/swap.c
index dd89234ee51f..9e0cb3118079 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -278,9 +278,10 @@ int lru_add_drain_all(void)
278 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it 278 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
279 * for the remainder of the operation. 279 * for the remainder of the operation.
280 * 280 *
281 * The locking in this function is against shrink_cache(): we recheck the 281 * The locking in this function is against shrink_inactive_list(): we recheck
282 * page count inside the lock to see whether shrink_cache grabbed the page 282 * the page count inside the lock to see whether shrink_inactive_list()
283 * via the LRU. If it did, give up: shrink_cache will free it. 283 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
284 * will free it.
284 */ 285 */
285void release_pages(struct page **pages, int nr, int cold) 286void release_pages(struct page **pages, int nr, int cold)
286{ 287{
@@ -443,7 +444,7 @@ void pagevec_strip(struct pagevec *pvec)
443 for (i = 0; i < pagevec_count(pvec); i++) { 444 for (i = 0; i < pagevec_count(pvec); i++) {
444 struct page *page = pvec->pages[i]; 445 struct page *page = pvec->pages[i];
445 446
446 if (PagePrivate(page) && !TestSetPageLocked(page)) { 447 if (PagePrivate(page) && trylock_page(page)) {
447 if (PagePrivate(page)) 448 if (PagePrivate(page))
448 try_to_release_page(page, 0); 449 try_to_release_page(page, 0);
449 unlock_page(page); 450 unlock_page(page);
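trylock_page() returns non-zero when the lock was acquired, the exact inverse of the old TestSetPageLocked(), so every converted call site in this series also flips its test. The lock-or-skip idiom after the conversion (sketch):

	#include <linux/pagemap.h>

	static int lock_or_skip(struct page *page)
	{
		if (!trylock_page(page))	/* old: if (TestSetPageLocked(page)) */
			return 0;		/* somebody else holds the lock */
		/* ... operate on the locked page ... */
		unlock_page(page);
		return 1;
	}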
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..797c3831cbec 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = {
39 39
40struct address_space swapper_space = { 40struct address_space swapper_space = {
41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), 42 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
43 .a_ops = &swap_aops, 43 .a_ops = &swap_aops,
44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
45 .backing_dev_info = &swap_backing_dev_info, 45 .backing_dev_info = &swap_backing_dev_info,
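With the lockless pagecache, lookups stop taking the mapping lock altogether (they use rcu_read_lock() plus a speculative page reference, added by the companion patches), so tree_lock no longer needs a reader side and becomes a plain spinlock; every write_lock_irq(&...tree_lock) in this series turns into spin_lock_irq(). Code that still needs the lock now takes it the same way whether it reads or writes the tree (sketch, not from this patch):

	#include <linux/pagemap.h>
	#include <linux/radix-tree.h>

	static struct page *lookup_get(struct address_space *mapping, pgoff_t idx)
	{
		struct page *page;

		spin_lock_irq(&mapping->tree_lock);	/* was write_lock_irq() */
		page = radix_tree_lookup(&mapping->page_tree, idx);
		if (page)
			page_cache_get(page);
		spin_unlock_irq(&mapping->tree_lock);
		return page;
	}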
@@ -56,15 +56,16 @@ static struct {
56 56
57void show_swap_cache_info(void) 57void show_swap_cache_info(void)
58{ 58{
59 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", 59 printk("%lu pages in swap cache\n", total_swapcache_pages);
60 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 61 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 62 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 64 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
64} 65}
65 66
66/* 67/*
67 * add_to_swap_cache resembles add_to_page_cache on swapper_space, 68 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index. 69 * but sets SwapCache flag and private instead of mapping and index.
69 */ 70 */
70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 71int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -76,19 +77,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
76 BUG_ON(PagePrivate(page)); 77 BUG_ON(PagePrivate(page));
77 error = radix_tree_preload(gfp_mask); 78 error = radix_tree_preload(gfp_mask);
78 if (!error) { 79 if (!error) {
79 write_lock_irq(&swapper_space.tree_lock); 80 page_cache_get(page);
81 SetPageSwapCache(page);
82 set_page_private(page, entry.val);
83
84 spin_lock_irq(&swapper_space.tree_lock);
80 error = radix_tree_insert(&swapper_space.page_tree, 85 error = radix_tree_insert(&swapper_space.page_tree,
81 entry.val, page); 86 entry.val, page);
82 if (!error) { 87 if (likely(!error)) {
83 page_cache_get(page);
84 SetPageSwapCache(page);
85 set_page_private(page, entry.val);
86 total_swapcache_pages++; 88 total_swapcache_pages++;
87 __inc_zone_page_state(page, NR_FILE_PAGES); 89 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total); 90 INC_CACHE_INFO(add_total);
89 } 91 }
90 write_unlock_irq(&swapper_space.tree_lock); 92 spin_unlock_irq(&swapper_space.tree_lock);
91 radix_tree_preload_end(); 93 radix_tree_preload_end();
94
95 if (unlikely(error)) {
96 set_page_private(page, 0UL);
97 ClearPageSwapCache(page);
98 page_cache_release(page);
99 }
92 } 100 }
93 return error; 101 return error;
94} 102}
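The reordering matters once lookups are lockless: the instant the radix-tree insert succeeds, a concurrent lookup may find the page without taking tree_lock, so its extra reference, SwapCache flag, and private field must already be in place; on failure they are unwound outside the lock. The shape of that publish-with-rollback pattern (sketch; accounting elided, helper names hypothetical):

	#include <linux/pagemap.h>
	#include <linux/swap.h>

	static void prepare(struct page *page, unsigned long val)
	{
		page_cache_get(page);		/* pin before the page is visible */
		SetPageSwapCache(page);
		set_page_private(page, val);
	}

	static void unwind(struct page *page)
	{
		set_page_private(page, 0UL);
		ClearPageSwapCache(page);
		page_cache_release(page);
	}

	static int publish(struct page *page, swp_entry_t entry)
	{
		int error;

		prepare(page, entry.val);
		spin_lock_irq(&swapper_space.tree_lock);
		error = radix_tree_insert(&swapper_space.page_tree,
					  entry.val, page);
		spin_unlock_irq(&swapper_space.tree_lock);
		if (unlikely(error))
			unwind(page);		/* never became visible */
		return error;
	}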
@@ -175,9 +183,9 @@ void delete_from_swap_cache(struct page *page)
175 183
176 entry.val = page_private(page); 184 entry.val = page_private(page);
177 185
178 write_lock_irq(&swapper_space.tree_lock); 186 spin_lock_irq(&swapper_space.tree_lock);
179 __delete_from_swap_cache(page); 187 __delete_from_swap_cache(page);
180 write_unlock_irq(&swapper_space.tree_lock); 188 spin_unlock_irq(&swapper_space.tree_lock);
181 189
182 swap_free(entry); 190 swap_free(entry);
183 page_cache_release(page); 191 page_cache_release(page);
@@ -193,7 +201,7 @@ void delete_from_swap_cache(struct page *page)
193 */ 201 */
194static inline void free_swap_cache(struct page *page) 202static inline void free_swap_cache(struct page *page)
195{ 203{
196 if (PageSwapCache(page) && !TestSetPageLocked(page)) { 204 if (PageSwapCache(page) && trylock_page(page)) {
197 remove_exclusive_swap_page(page); 205 remove_exclusive_swap_page(page);
198 unlock_page(page); 206 unlock_page(page);
199 } 207 }
@@ -294,9 +302,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
294 * re-using the just freed swap entry for an existing page. 302 * re-using the just freed swap entry for an existing page.
295 * May fail (-ENOMEM) if radix-tree node allocation failed. 303 * May fail (-ENOMEM) if radix-tree node allocation failed.
296 */ 304 */
297 SetPageLocked(new_page); 305 set_page_locked(new_page);
298 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 306 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
299 if (!err) { 307 if (likely(!err)) {
300 /* 308 /*
301 * Initiate read into locked page and return. 309 * Initiate read into locked page and return.
302 */ 310 */
@@ -304,7 +312,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
304 swap_readpage(NULL, new_page); 312 swap_readpage(NULL, new_page);
305 return new_page; 313 return new_page;
306 } 314 }
307 ClearPageLocked(new_page); 315 clear_page_locked(new_page);
308 swap_free(entry); 316 swap_free(entry);
309 } while (err != -ENOMEM); 317 } while (err != -ENOMEM);
310 318
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2f33edb8bee9..1e330f2998fa 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,8 +33,8 @@
33#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
34#include <linux/swapops.h> 34#include <linux/swapops.h>
35 35
36DEFINE_SPINLOCK(swap_lock); 36static DEFINE_SPINLOCK(swap_lock);
37unsigned int nr_swapfiles; 37static unsigned int nr_swapfiles;
38long total_swap_pages; 38long total_swap_pages;
39static int swap_overflow; 39static int swap_overflow;
40static int least_priority; 40static int least_priority;
@@ -44,7 +44,7 @@ static const char Unused_file[] = "Unused swap file entry ";
44static const char Bad_offset[] = "Bad swap offset entry "; 44static const char Bad_offset[] = "Bad swap offset entry ";
45static const char Unused_offset[] = "Unused swap offset entry "; 45static const char Unused_offset[] = "Unused swap offset entry ";
46 46
47struct swap_list_t swap_list = {-1, -1}; 47static struct swap_list_t swap_list = {-1, -1};
48 48
49static struct swap_info_struct swap_info[MAX_SWAPFILES]; 49static struct swap_info_struct swap_info[MAX_SWAPFILES];
50 50
@@ -369,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page)
369 retval = 0; 369 retval = 0;
370 if (p->swap_map[swp_offset(entry)] == 1) { 370 if (p->swap_map[swp_offset(entry)] == 1) {
371 /* Recheck the page count with the swapcache lock held.. */ 371 /* Recheck the page count with the swapcache lock held.. */
372 write_lock_irq(&swapper_space.tree_lock); 372 spin_lock_irq(&swapper_space.tree_lock);
373 if ((page_count(page) == 2) && !PageWriteback(page)) { 373 if ((page_count(page) == 2) && !PageWriteback(page)) {
374 __delete_from_swap_cache(page); 374 __delete_from_swap_cache(page);
375 SetPageDirty(page); 375 SetPageDirty(page);
376 retval = 1; 376 retval = 1;
377 } 377 }
378 write_unlock_irq(&swapper_space.tree_lock); 378 spin_unlock_irq(&swapper_space.tree_lock);
379 } 379 }
380 spin_unlock(&swap_lock); 380 spin_unlock(&swap_lock);
381 381
@@ -403,7 +403,7 @@ void free_swap_and_cache(swp_entry_t entry)
403 if (p) { 403 if (p) {
404 if (swap_entry_free(p, swp_offset(entry)) == 1) { 404 if (swap_entry_free(p, swp_offset(entry)) == 1) {
405 page = find_get_page(&swapper_space, entry.val); 405 page = find_get_page(&swapper_space, entry.val);
406 if (page && unlikely(TestSetPageLocked(page))) { 406 if (page && unlikely(!trylock_page(page))) {
407 page_cache_release(page); 407 page_cache_release(page);
408 page = NULL; 408 page = NULL;
409 } 409 }
@@ -656,8 +656,8 @@ static int unuse_mm(struct mm_struct *mm,
656 656
657 if (!down_read_trylock(&mm->mmap_sem)) { 657 if (!down_read_trylock(&mm->mmap_sem)) {
658 /* 658 /*
659 * Activate page so shrink_cache is unlikely to unmap its 659 * Activate page so shrink_inactive_list is unlikely to unmap
660 * ptes while lock is dropped, so swapoff can make progress. 660 * its ptes while lock is dropped, so swapoff can make progress.
661 */ 661 */
662 activate_page(page); 662 activate_page(page);
663 unlock_page(page); 663 unlock_page(page);
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index ae532f501943..8d7a27a6335c 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -65,31 +65,31 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
65 if (!dentry) 65 if (!dentry)
66 goto put_memory; 66 goto put_memory;
67 67
68 error = -ENFILE;
69 file = get_empty_filp();
70 if (!file)
71 goto put_dentry;
72
68 error = -ENOSPC; 73 error = -ENOSPC;
69 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); 74 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
70 if (!inode) 75 if (!inode)
71 goto put_dentry; 76 goto close_file;
72 77
73 d_instantiate(dentry, inode); 78 d_instantiate(dentry, inode);
74 error = -ENFILE; 79 inode->i_size = size;
75 file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
76 &ramfs_file_operations);
77 if (!file)
78 goto put_dentry;
79
80 inode->i_nlink = 0; /* It is unlinked */ 80 inode->i_nlink = 0; /* It is unlinked */
81 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
82 &ramfs_file_operations);
81 83
82 /* notify everyone as to the change of file size */ 84#ifndef CONFIG_MMU
83 error = do_truncate(dentry, size, 0, file); 85 error = ramfs_nommu_expand_for_mapping(inode, size);
84 if (error < 0) 86 if (error)
85 goto close_file; 87 goto close_file;
86 88#endif
87 return file; 89 return file;
88 90
89close_file: 91close_file:
90 put_filp(file); 92 put_filp(file);
91 return ERR_PTR(error);
92
93put_dentry: 93put_dentry:
94 dput(dentry); 94 dput(dentry);
95put_memory: 95put_memory:
diff --git a/mm/truncate.c b/mm/truncate.c
index b8961cb63414..6650c1d878b4 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -104,7 +104,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
104 cancel_dirty_page(page, PAGE_CACHE_SIZE); 104 cancel_dirty_page(page, PAGE_CACHE_SIZE);
105 105
106 remove_from_page_cache(page); 106 remove_from_page_cache(page);
107 ClearPageUptodate(page);
108 ClearPageMappedToDisk(page); 107 ClearPageMappedToDisk(page);
109 page_cache_release(page); /* pagecache ref */ 108 page_cache_release(page); /* pagecache ref */
110} 109}
@@ -188,7 +187,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
188 if (page_index > next) 187 if (page_index > next)
189 next = page_index; 188 next = page_index;
190 next++; 189 next++;
191 if (TestSetPageLocked(page)) 190 if (!trylock_page(page))
192 continue; 191 continue;
193 if (PageWriteback(page)) { 192 if (PageWriteback(page)) {
194 unlock_page(page); 193 unlock_page(page);
@@ -281,7 +280,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping,
281 pgoff_t index; 280 pgoff_t index;
282 int lock_failed; 281 int lock_failed;
283 282
284 lock_failed = TestSetPageLocked(page); 283 lock_failed = !trylock_page(page);
285 284
286 /* 285 /*
287 * We really shouldn't be looking at the ->index of an 286 * We really shouldn't be looking at the ->index of an
@@ -349,18 +348,17 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
349 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 348 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
350 return 0; 349 return 0;
351 350
352 write_lock_irq(&mapping->tree_lock); 351 spin_lock_irq(&mapping->tree_lock);
353 if (PageDirty(page)) 352 if (PageDirty(page))
354 goto failed; 353 goto failed;
355 354
356 BUG_ON(PagePrivate(page)); 355 BUG_ON(PagePrivate(page));
357 __remove_from_page_cache(page); 356 __remove_from_page_cache(page);
358 write_unlock_irq(&mapping->tree_lock); 357 spin_unlock_irq(&mapping->tree_lock);
359 ClearPageUptodate(page);
360 page_cache_release(page); /* pagecache ref */ 358 page_cache_release(page); /* pagecache ref */
361 return 1; 359 return 1;
362failed: 360failed:
363 write_unlock_irq(&mapping->tree_lock); 361 spin_unlock_irq(&mapping->tree_lock);
364 return 0; 362 return 0;
365} 363}
366 364
@@ -382,7 +380,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
382 * Any pages which are found to be mapped into pagetables are unmapped prior to 380 * Any pages which are found to be mapped into pagetables are unmapped prior to
383 * invalidation. 381 * invalidation.
384 * 382 *
385 * Returns -EIO if any pages could not be invalidated. 383 * Returns -EBUSY if any pages could not be invalidated.
386 */ 384 */
387int invalidate_inode_pages2_range(struct address_space *mapping, 385int invalidate_inode_pages2_range(struct address_space *mapping,
388 pgoff_t start, pgoff_t end) 386 pgoff_t start, pgoff_t end)
@@ -442,7 +440,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
442 ret2 = do_launder_page(mapping, page); 440 ret2 = do_launder_page(mapping, page);
443 if (ret2 == 0) { 441 if (ret2 == 0) {
444 if (!invalidate_complete_page2(mapping, page)) 442 if (!invalidate_complete_page2(mapping, page))
445 ret2 = -EIO; 443 ret2 = -EBUSY;
446 } 444 }
447 if (ret2 < 0) 445 if (ret2 < 0)
448 ret = ret2; 446 ret = ret2;
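Besides dropping the ClearPageUptodate() calls (the data in these pages is still valid), the failure code for pages that resist invalidation changes from -EIO to -EBUSY, which is the honest signal: nothing is wrong with the data, the page is simply still in use. A caller-side sketch of treating the two cases differently (illustrative, not from this patch):

	#include <linux/fs.h>
	#include <linux/pagemap.h>

	static int drop_cached_range(struct address_space *mapping,
				     pgoff_t start, pgoff_t end)
	{
		int err = invalidate_inode_pages2_range(mapping, start, end);

		if (err == -EBUSY)
			return 0;	/* page still mapped/dirty: retry later */
		return err;		/* anything else is a real failure */
	}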
diff --git a/mm/util.c b/mm/util.c
index 8f18683825bc..cb00b748ce47 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,9 @@
1#include <linux/mm.h>
1#include <linux/slab.h> 2#include <linux/slab.h>
2#include <linux/string.h> 3#include <linux/string.h>
3#include <linux/module.h> 4#include <linux/module.h>
4#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h>
5#include <asm/uaccess.h> 7#include <asm/uaccess.h>
6 8
7/** 9/**
@@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
68EXPORT_SYMBOL(kmemdup); 70EXPORT_SYMBOL(kmemdup);
69 71
70/** 72/**
71 * krealloc - reallocate memory. The contents will remain unchanged. 73 * __krealloc - like krealloc() but don't free @p.
72 * @p: object to reallocate memory for. 74 * @p: object to reallocate memory for.
73 * @new_size: how many bytes of memory are required. 75 * @new_size: how many bytes of memory are required.
74 * @flags: the type of memory to allocate. 76 * @flags: the type of memory to allocate.
75 * 77 *
76 * The contents of the object pointed to are preserved up to the 78 * This function is like krealloc() except it never frees the originally
77 * lesser of the new and old sizes. If @p is %NULL, krealloc() 79 * allocated buffer. Use this if you don't want to free the buffer immediately
78 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 80 * like, for example, with RCU.
79 * %NULL pointer, the object pointed to is freed.
80 */ 81 */
81void *krealloc(const void *p, size_t new_size, gfp_t flags) 82void *__krealloc(const void *p, size_t new_size, gfp_t flags)
82{ 83{
83 void *ret; 84 void *ret;
84 size_t ks = 0; 85 size_t ks = 0;
85 86
86 if (unlikely(!new_size)) { 87 if (unlikely(!new_size))
87 kfree(p);
88 return ZERO_SIZE_PTR; 88 return ZERO_SIZE_PTR;
89 }
90 89
91 if (p) 90 if (p)
92 ks = ksize(p); 91 ks = ksize(p);
@@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
95 return (void *)p; 94 return (void *)p;
96 95
97 ret = kmalloc_track_caller(new_size, flags); 96 ret = kmalloc_track_caller(new_size, flags);
98 if (ret && p) { 97 if (ret && p)
99 memcpy(ret, p, ks); 98 memcpy(ret, p, ks);
99
100 return ret;
101}
102EXPORT_SYMBOL(__krealloc);
103
104/**
105 * krealloc - reallocate memory. The contents will remain unchanged.
106 * @p: object to reallocate memory for.
107 * @new_size: how many bytes of memory are required.
108 * @flags: the type of memory to allocate.
109 *
110 * The contents of the object pointed to are preserved up to the
111 * lesser of the new and old sizes. If @p is %NULL, krealloc()
112 * behaves exactly like kmalloc(). If @size is 0 and @p is not a
113 * %NULL pointer, the object pointed to is freed.
114 */
115void *krealloc(const void *p, size_t new_size, gfp_t flags)
116{
117 void *ret;
118
119 if (unlikely(!new_size)) {
100 kfree(p); 120 kfree(p);
121 return ZERO_SIZE_PTR;
101 } 122 }
123
124 ret = __krealloc(p, new_size, flags);
125 if (ret && p != ret)
126 kfree(p);
127
102 return ret; 128 return ret;
103} 129}
104EXPORT_SYMBOL(krealloc); 130EXPORT_SYMBOL(krealloc);
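krealloc() is now a thin wrapper over __krealloc(), which never frees the original buffer; the RCU case named in the new comment is the motivation, since the old copy must outlive a grace period. A hedged sketch of that use (the table pointer and its update discipline are illustrative, not from this patch):

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	static void *table;	/* readers access it under rcu_read_lock() */

	static int grow_table(size_t new_bytes)
	{
		void *old = table;
		void *new = __krealloc(old, new_bytes, GFP_KERNEL);

		if (!new)
			return -ENOMEM;
		rcu_assign_pointer(table, new);
		if (old && old != new) {
			synchronize_rcu();	/* readers may still hold 'old' */
			kfree(old);
		}
		return 0;
	}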
@@ -136,3 +162,27 @@ char *strndup_user(const char __user *s, long n)
136 return p; 162 return p;
137} 163}
138EXPORT_SYMBOL(strndup_user); 164EXPORT_SYMBOL(strndup_user);
165
166#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
167void arch_pick_mmap_layout(struct mm_struct *mm)
168{
169 mm->mmap_base = TASK_UNMAPPED_BASE;
170 mm->get_unmapped_area = arch_get_unmapped_area;
171 mm->unmap_area = arch_unmap_area;
172}
173#endif
174
175int __attribute__((weak)) get_user_pages_fast(unsigned long start,
176 int nr_pages, int write, struct page **pages)
177{
178 struct mm_struct *mm = current->mm;
179 int ret;
180
181 down_read(&mm->mmap_sem);
182 ret = get_user_pages(current, mm, start, nr_pages,
183 write, 0, pages, NULL);
184 up_read(&mm->mmap_sem);
185
186 return ret;
187}
188EXPORT_SYMBOL_GPL(get_user_pages_fast);
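The weak default simply brackets get_user_pages() with mmap_sem; architectures with a suitable lockless walker (x86 at this point) override it. Caller's-eye sketch (the buffer and page count are illustrative):

	#include <linux/mm.h>

	static int pin_user_buffer(unsigned long uaddr, int nr_pages,
				   struct page **pages)
	{
		/* the caller takes no mmap_sem itself */
		int got = get_user_pages_fast(uaddr & PAGE_MASK, nr_pages,
					      1 /* write */, pages);

		if (got < 0)
			return got;	/* e.g. -EFAULT */
		/* ... use pages[0..got-1], then release each reference ... */
		return got;
	}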
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 35f293816294..bba06c41fc59 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -180,6 +180,13 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
180 pmd_t *pmd; 180 pmd_t *pmd;
181 pte_t *ptep, pte; 181 pte_t *ptep, pte;
182 182
183 /*
184 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
185 * architectures that do not vmalloc module space
186 */
187 VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) &&
188 !is_module_address(addr));
189
183 if (!pgd_none(*pgd)) { 190 if (!pgd_none(*pgd)) {
184 pud = pud_offset(pgd, addr); 191 pud = pud_offset(pgd, addr);
185 if (!pud_none(*pud)) { 192 if (!pud_none(*pud)) {
@@ -381,16 +388,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
381 return; 388 return;
382 389
383 if ((PAGE_SIZE-1) & (unsigned long)addr) { 390 if ((PAGE_SIZE-1) & (unsigned long)addr) {
384 printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 391 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
385 WARN_ON(1);
386 return; 392 return;
387 } 393 }
388 394
389 area = remove_vm_area(addr); 395 area = remove_vm_area(addr);
390 if (unlikely(!area)) { 396 if (unlikely(!area)) {
391 printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 397 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
392 addr); 398 addr);
393 WARN_ON(1);
394 return; 399 return;
395 } 400 }
396 401
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26672c6cd3ce..1ff1a58e7c10 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -391,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
391} 391}
392 392
393/* 393/*
394 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 394 * Same as remove_mapping, but if the page is removed from the mapping, it
395 * someone else has a ref on the page, abort and return 0. If it was 395 * gets returned with a refcount of 0.
396 * successfully detached, return 1. Assumes the caller has a single ref on
397 * this page.
398 */ 396 */
399int remove_mapping(struct address_space *mapping, struct page *page) 397static int __remove_mapping(struct address_space *mapping, struct page *page)
400{ 398{
401 BUG_ON(!PageLocked(page)); 399 BUG_ON(!PageLocked(page));
402 BUG_ON(mapping != page_mapping(page)); 400 BUG_ON(mapping != page_mapping(page));
403 401
404 write_lock_irq(&mapping->tree_lock); 402 spin_lock_irq(&mapping->tree_lock);
405 /* 403 /*
406 * The non racy check for a busy page. 404 * The non racy check for a busy page.
407 * 405 *
@@ -427,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page)
427 * Note that if SetPageDirty is always performed via set_page_dirty, 425 * Note that if SetPageDirty is always performed via set_page_dirty,
428 * and thus under tree_lock, then this ordering is not required. 426 * and thus under tree_lock, then this ordering is not required.
429 */ 427 */
430 if (unlikely(page_count(page) != 2)) 428 if (!page_freeze_refs(page, 2))
431 goto cannot_free; 429 goto cannot_free;
432 smp_rmb(); 430 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
433 if (unlikely(PageDirty(page))) 431 if (unlikely(PageDirty(page))) {
432 page_unfreeze_refs(page, 2);
434 goto cannot_free; 433 goto cannot_free;
434 }
435 435
436 if (PageSwapCache(page)) { 436 if (PageSwapCache(page)) {
437 swp_entry_t swap = { .val = page_private(page) }; 437 swp_entry_t swap = { .val = page_private(page) };
438 __delete_from_swap_cache(page); 438 __delete_from_swap_cache(page);
439 write_unlock_irq(&mapping->tree_lock); 439 spin_unlock_irq(&mapping->tree_lock);
440 swap_free(swap); 440 swap_free(swap);
441 __put_page(page); /* The pagecache ref */ 441 } else {
442 return 1; 442 __remove_from_page_cache(page);
443 spin_unlock_irq(&mapping->tree_lock);
443 } 444 }
444 445
445 __remove_from_page_cache(page);
446 write_unlock_irq(&mapping->tree_lock);
447 __put_page(page);
448 return 1; 446 return 1;
449 447
450cannot_free: 448cannot_free:
451 write_unlock_irq(&mapping->tree_lock); 449 spin_unlock_irq(&mapping->tree_lock);
450 return 0;
451}
452
453/*
454 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
455 * someone else has a ref on the page, abort and return 0. If it was
456 * successfully detached, return 1. Assumes the caller has a single ref on
457 * this page.
458 */
459int remove_mapping(struct address_space *mapping, struct page *page)
460{
461 if (__remove_mapping(mapping, page)) {
462 /*
463 * Unfreezing the refcount with 1 rather than 2 effectively
464 * drops the pagecache ref for us without requiring another
465 * atomic operation.
466 */
467 page_unfreeze_refs(page, 1);
468 return 1;
469 }
452 return 0; 470 return 0;
453} 471}
454 472
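page_freeze_refs() is what replaces the old page_count() test plus smp_rmb(): it cmpxchgs the refcount from the expected value straight to zero, so a single atomic both verifies that the caller holds the only references and prevents new speculative references from succeeding. Its definition is roughly the following (from the companion lockless-pagecache patch to include/linux/pagemap.h, quoted from memory, so treat it as a sketch):

	static inline int page_freeze_refs(struct page *page, int count)
	{
		return likely(atomic_cmpxchg(&page->_count, count, 0) == count);
	}

	static inline void page_unfreeze_refs(struct page *page, int count)
	{
		VM_BUG_ON(page_count(page) != 0);
		atomic_set(&page->_count, count);
	}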
@@ -478,7 +496,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
478 page = lru_to_page(page_list); 496 page = lru_to_page(page_list);
479 list_del(&page->lru); 497 list_del(&page->lru);
480 498
481 if (TestSetPageLocked(page)) 499 if (!trylock_page(page))
482 goto keep; 500 goto keep;
483 501
484 VM_BUG_ON(PageActive(page)); 502 VM_BUG_ON(PageActive(page));
@@ -564,7 +582,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
564 * A synchronous write - probably a ramdisk. Go 582 * A synchronous write - probably a ramdisk. Go
565 * ahead and try to reclaim the page. 583 * ahead and try to reclaim the page.
566 */ 584 */
567 if (TestSetPageLocked(page)) 585 if (!trylock_page(page))
568 goto keep; 586 goto keep;
569 if (PageDirty(page) || PageWriteback(page)) 587 if (PageDirty(page) || PageWriteback(page))
570 goto keep_locked; 588 goto keep_locked;
@@ -598,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
598 if (PagePrivate(page)) { 616 if (PagePrivate(page)) {
599 if (!try_to_release_page(page, sc->gfp_mask)) 617 if (!try_to_release_page(page, sc->gfp_mask))
600 goto activate_locked; 618 goto activate_locked;
601 if (!mapping && page_count(page) == 1) 619 if (!mapping && page_count(page) == 1) {
602 goto free_it; 620 unlock_page(page);
621 if (put_page_testzero(page))
622 goto free_it;
623 else {
624 /*
625 * rare race with speculative reference.
626 * the speculative reference will free
627 * this page shortly, so we may
628 * increment nr_reclaimed here (and
629 * leave it off the LRU).
630 */
631 nr_reclaimed++;
632 continue;
633 }
634 }
603 } 635 }
604 636
605 if (!mapping || !remove_mapping(mapping, page)) 637 if (!mapping || !__remove_mapping(mapping, page))
606 goto keep_locked; 638 goto keep_locked;
607 639
608free_it:
609 unlock_page(page); 640 unlock_page(page);
641free_it:
610 nr_reclaimed++; 642 nr_reclaimed++;
611 if (!pagevec_add(&freed_pvec, page)) 643 if (!pagevec_add(&freed_pvec, page)) {
612 __pagevec_release_nonlru(&freed_pvec); 644 __pagevec_free(&freed_pvec);
645 pagevec_reinit(&freed_pvec);
646 }
613 continue; 647 continue;
614 648
615activate_locked: 649activate_locked:
@@ -623,7 +657,7 @@ keep:
623 } 657 }
624 list_splice(&ret_pages, page_list); 658 list_splice(&ret_pages, page_list);
625 if (pagevec_count(&freed_pvec)) 659 if (pagevec_count(&freed_pvec))
626 __pagevec_release_nonlru(&freed_pvec); 660 __pagevec_free(&freed_pvec);
627 count_vm_events(PGACTIVATE, pgactivate); 661 count_vm_events(PGACTIVATE, pgactivate);
628 return nr_reclaimed; 662 return nr_reclaimed;
629} 663}
@@ -1374,7 +1408,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1374 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1408 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1375 congestion_wait(WRITE, HZ/10); 1409 congestion_wait(WRITE, HZ/10);
1376 } 1410 }
1377 /* top priority shrink_caches still had more to do? don't OOM, then */ 1411 /* top priority shrink_zones still had more to do? don't OOM, then */
1378 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1412 if (!sc->all_unreclaimable && scan_global_lru(sc))
1379 ret = nr_reclaimed; 1413 ret = nr_reclaimed;
1380out: 1414out:
@@ -1945,7 +1979,7 @@ module_init(kswapd_init)
1945int zone_reclaim_mode __read_mostly; 1979int zone_reclaim_mode __read_mostly;
1946 1980
1947#define RECLAIM_OFF 0 1981#define RECLAIM_OFF 0
1948#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 1982#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
1949#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 1983#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1950#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 1984#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1951 1985
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b0d08e667ece..d7826af2fb07 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -516,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
516 continue; 516 continue;
517 517
518 page = pfn_to_page(pfn); 518 page = pfn_to_page(pfn);
519#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
520 /*
521 * Ordinarily, memory holes in flatmem still have a valid
522 * memmap for the PFN range. However, an architecture for
523 * embedded systems (e.g. ARM) can free up the memmap backing
524 * holes to save memory on the assumption the memmap is
525 * never used. The page_zone linkages are then broken even
526 * though pfn_valid() returns true. Skip the page if the
527 * linkages are broken. Even if this test passed, the impact
528 * is that the counters for the movable type are off but
529 * fragmentation monitoring is likely meaningless on small
530 * systems.
531 */
532 if (page_zone(page) != zone)
533 continue;
534#endif
519 mtype = get_pageblock_migratetype(page); 535 mtype = get_pageblock_migratetype(page);
520 536
521 count[mtype]++; 537 if (mtype < MIGRATE_TYPES)
538 count[mtype]++;
522 } 539 }
523 540
524 /* Print counts */ 541 /* Print counts */
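Both vmstat guards are defensive reads of possibly-stale memmap: the flatmem check discards pages whose zone linkage went away with the freed hole, and the MIGRATE_TYPES bound keeps a garbage pageblock value from indexing past the counters array. Folded into one predicate (sketch, not from this patch):

	#include <linux/mmzone.h>

	static int pfn_counts_for(unsigned long pfn, struct zone *zone)
	{
		if (!pfn_valid(pfn))
			return 0;
	#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
		/* memmap hole: the page_zone() linkage may be garbage */
		if (page_zone(pfn_to_page(pfn)) != zone)
			return 0;
	#endif
		return 1;
	}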