author    Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /mm
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8
parent    6a00f206debf8a5c8899055726ad127dbeeed098

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |   71
-rw-r--r--  mm/Kconfig.debug     |   25
-rw-r--r--  mm/Makefile          |   19
-rw-r--r--  mm/backing-dev.c     |   94
-rw-r--r--  mm/bootmem.c         |  193
-rw-r--r--  mm/cleancache.c      |  244
-rw-r--r--  mm/compaction.c      |  259
-rw-r--r--  mm/dmapool.c         |   18
-rw-r--r--  mm/filemap.c         |  370
-rw-r--r--  mm/filemap_xip.c     |    4
-rw-r--r--  mm/fremap.c          |    6
-rw-r--r--  mm/highmem.c         |   66
-rw-r--r--  mm/huge_memory.c     | 2391
-rw-r--r--  mm/hugetlb.c         |  338
-rw-r--r--  mm/hwpoison-inject.c |    2
-rw-r--r--  mm/init-mm.c         |    1
-rw-r--r--  mm/internal.h        |   16
-rw-r--r--  mm/kmemleak-test.c   |    6
-rw-r--r--  mm/kmemleak.c        |   26
-rw-r--r--  mm/ksm.c             |  126
-rw-r--r--  mm/maccess.c         |   10
-rw-r--r--  mm/madvise.c         |   10
-rw-r--r--  mm/memblock.c        |  944
-rw-r--r--  mm/memcontrol.c      | 1809
-rw-r--r--  mm/memory-failure.c  |  370
-rw-r--r--  mm/memory.c          | 1030
-rw-r--r--  mm/memory_hotplug.c  |  131
-rw-r--r--  mm/mempolicy.c       |  226
-rw-r--r--  mm/migrate.c         |  378
-rw-r--r--  mm/mincore.c         |    7
-rw-r--r--  mm/mlock.c           |  188
-rw-r--r--  mm/mmap.c            |  200
-rw-r--r--  mm/mmu_notifier.c    |   20
-rw-r--r--  mm/mmzone.c          |   21
-rw-r--r--  mm/mprotect.c        |   22
-rw-r--r--  mm/mremap.c          |   31
-rw-r--r--  mm/nobootmem.c       |  404
-rw-r--r--  mm/nommu.c           |  259
-rw-r--r--  mm/oom_kill.c        |  167
-rw-r--r--  mm/page-writeback.c  |   67
-rw-r--r--  mm/page_alloc.c      |  629
-rw-r--r--  mm/page_cgroup.c     |  235
-rw-r--r--  mm/page_io.c         |    2
-rw-r--r--  mm/page_isolation.c  |    3
-rw-r--r--  mm/pagewalk.c        |   28
-rw-r--r--  mm/percpu-km.c       |    8
-rw-r--r--  mm/percpu-vm.c       |    2
-rw-r--r--  mm/percpu.c          |  432
-rw-r--r--  mm/percpu_up.c       |   30
-rw-r--r--  mm/pgtable-generic.c |  121
-rw-r--r--  mm/prio_tree.c       |    1
-rw-r--r--  mm/readahead.c       |   20
-rw-r--r--  mm/rmap.c            |  523
-rw-r--r--  mm/shmem.c           |  607
-rw-r--r--  mm/slab.c            |  155
-rw-r--r--  mm/slob.c            |   15
-rw-r--r--  mm/slub.c            | 1283
-rw-r--r--  mm/sparse-vmemmap.c  |   13
-rw-r--r--  mm/sparse.c          |    6
-rw-r--r--  mm/swap.c            |  376
-rw-r--r--  mm/swap_state.c      |   11
-rw-r--r--  mm/swapfile.c        |  479
-rw-r--r--  mm/thrash.c          |  105
-rw-r--r--  mm/truncate.c        |   74
-rw-r--r--  mm/util.c            |   58
-rw-r--r--  mm/vmalloc.c         |  356
-rw-r--r--  mm/vmscan.c          |  853
-rw-r--r--  mm/vmstat.c          |  501
68 files changed, 12466 insertions(+), 5029 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index f0fb9124e410..8ca47a5ee9c8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
179config COMPACTION 179config COMPACTION
180 bool "Allow for memory compaction" 180 bool "Allow for memory compaction"
181 select MIGRATION 181 select MIGRATION
182 depends on EXPERIMENTAL && HUGETLB_PAGE && MMU 182 depends on MMU
183 help 183 help
184 Allows the compaction of memory for the allocation of huge pages. 184 Allows the compaction of memory for the allocation of huge pages.
185 185
@@ -301,3 +301,72 @@ config NOMMU_INITIAL_TRIM_EXCESS
301 of 1 says that all excess pages should be trimmed. 301 of 1 says that all excess pages should be trimmed.
302 302
303 See Documentation/nommu-mmap.txt for more information. 303 See Documentation/nommu-mmap.txt for more information.
304
305config TRANSPARENT_HUGEPAGE
306 bool "Transparent Hugepage Support"
307 depends on X86 && MMU
308 select COMPACTION
309 help
310 Transparent Hugepages allows the kernel to use huge pages and
311 huge tlb transparently to the applications whenever possible.
312 This feature can improve computing performance to certain
313 applications by speeding up page faults during memory
314 allocation, by reducing the number of tlb misses and by speeding
315 up the pagetable walking.
316
317 If memory constrained on embedded, you may want to say N.
318
319choice
320 prompt "Transparent Hugepage Support sysfs defaults"
321 depends on TRANSPARENT_HUGEPAGE
322 default TRANSPARENT_HUGEPAGE_ALWAYS
323 help
324 Selects the sysfs defaults for Transparent Hugepage Support.
325
326 config TRANSPARENT_HUGEPAGE_ALWAYS
327 bool "always"
328 help
329 Enabling Transparent Hugepage always, can increase the
330 memory footprint of applications without a guaranteed
331 benefit but it will work automatically for all applications.
332
333 config TRANSPARENT_HUGEPAGE_MADVISE
334 bool "madvise"
335 help
336 Enabling Transparent Hugepage madvise, will only provide a
337 performance improvement benefit to the applications using
338 madvise(MADV_HUGEPAGE) but it won't risk to increase the
339 memory footprint of applications without a guaranteed
340 benefit.
341endchoice
342
343#
344# UP and nommu archs use km based percpu allocator
345#
346config NEED_PER_CPU_KM
347 depends on !SMP
348 bool
349 default y
350
351config CLEANCACHE
352 bool "Enable cleancache driver to cache clean pages if tmem is present"
353 default n
354 help
355 Cleancache can be thought of as a page-granularity victim cache
356 for clean pages that the kernel's pageframe replacement algorithm
357 (PFRA) would like to keep around, but can't since there isn't enough
358 memory. So when the PFRA "evicts" a page, it first attempts to use
359 cleancache code to put the data contained in that page into
360 "transcendent memory", memory that is not directly accessible or
361 addressable by the kernel and is of unknown and possibly
362 time-varying size. And when a cleancache-enabled
363 filesystem wishes to access a page in a file on disk, it first
364 checks cleancache to see if it already contains it; if it does,
365 the page is copied into the kernel and a disk access is avoided.
366 When a transcendent memory driver is available (such as zcache or
367 Xen transcendent memory), a significant I/O reduction
368 may be achieved. When none is available, all cleancache calls
369 are reduced to a single pointer-compare-against-NULL resulting
370 in a negligible performance hit.
371
372 If unsure, say Y to enable cleancache
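
The TRANSPARENT_HUGEPAGE_MADVISE default described above only applies to ranges an application has hinted with madvise(MADV_HUGEPAGE). A minimal user-space sketch of that hint, assuming an x86 2 MiB huge page size (illustrative only, not part of this commit):

	#include <stdlib.h>
	#include <sys/mman.h>

	#ifndef MADV_HUGEPAGE
	#define MADV_HUGEPAGE 14	/* older libc headers may lack this */
	#endif

	int main(void)
	{
		size_t len = 64 * 1024 * 1024;	/* hugepage-aligned working buffer */
		void *buf = NULL;

		if (posix_memalign(&buf, 2 * 1024 * 1024, len))
			return 1;
		/* Mark the range as a huge page candidate; with the "madvise"
		 * sysfs default, only hinted ranges get transparent hugepages. */
		madvise(buf, len, MADV_HUGEPAGE);
		/* ... touch and use buf ... */
		free(buf);
		return 0;
	}
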
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index af7cfb43d2f0..8b1a477162dc 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,27 +1,24 @@
1config DEBUG_PAGEALLOC 1config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations" 2 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC 3 depends on DEBUG_KERNEL
4 depends on !HIBERNATION || !PPC && !SPARC 4 depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
5 depends on !KMEMCHECK 5 depends on !KMEMCHECK
6 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
6 ---help--- 7 ---help---
7 Unmap pages from the kernel linear mapping after free_pages(). 8 Unmap pages from the kernel linear mapping after free_pages().
8 This results in a large slowdown, but helps to find certain types 9 This results in a large slowdown, but helps to find certain types
9 of memory corruption. 10 of memory corruption.
10 11
12 For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
13 fill the pages with poison patterns after free_pages() and verify
14 the patterns before alloc_pages(). Additionally,
15 this option cannot be enabled in combination with hibernation as
16 that would result in incorrect warnings of memory corruption after
17 a resume because free pages are not saved to the suspend image.
18
11config WANT_PAGE_DEBUG_FLAGS 19config WANT_PAGE_DEBUG_FLAGS
12 bool 20 bool
13 21
14config PAGE_POISONING 22config PAGE_POISONING
15 bool "Debug page memory allocations" 23 bool
16 depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
17 depends on !HIBERNATION
18 select DEBUG_PAGEALLOC
19 select WANT_PAGE_DEBUG_FLAGS 24 select WANT_PAGE_DEBUG_FLAGS
20 ---help---
21 Fill the pages with poison patterns after free_pages() and verify
22 the patterns before alloc_pages(). This results in a large slowdown,
23 but helps to find certain types of memory corruption.
24
25 This option cannot be enabled in combination with hibernation as
26 that would result in incorrect warnings of memory corruption after
27 a resume because free pages are not saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index 34b2546a9e37..836e4163c1bf 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,16 +5,22 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o 8 vmalloc.o pagewalk.o pgtable-generic.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o mmu_context.o \ 14 page_isolation.o mm_init.o mmu_context.o percpu.o \
15 $(mmu-y) 15 $(mmu-y)
16obj-y += init-mm.o 16obj-y += init-mm.o
17 17
18ifdef CONFIG_NO_BOOTMEM
19 obj-y += nobootmem.o
20else
21 obj-y += bootmem.o
22endif
23
18obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 24obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
19 25
20obj-$(CONFIG_BOUNCE) += bounce.o 26obj-$(CONFIG_BOUNCE) += bounce.o
@@ -36,14 +42,11 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
36obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 42obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
37obj-$(CONFIG_FS_XIP) += filemap_xip.o 43obj-$(CONFIG_FS_XIP) += filemap_xip.o
38obj-$(CONFIG_MIGRATION) += migrate.o 44obj-$(CONFIG_MIGRATION) += migrate.o
39ifdef CONFIG_SMP
40obj-y += percpu.o
41else
42obj-y += percpu_up.o
43endif
44obj-$(CONFIG_QUICKLIST) += quicklist.o 45obj-$(CONFIG_QUICKLIST) += quicklist.o
46obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
45obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 47obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
46obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 48obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
47obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 49obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
48obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 50obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
49obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 51obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
52obj-$(CONFIG_CLEANCACHE) += cleancache.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 65d420499a61..f032e6e1e09a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,17 +14,11 @@
14 14
15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
16 16
17void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
18{
19}
20EXPORT_SYMBOL(default_unplug_io_fn);
21
22struct backing_dev_info default_backing_dev_info = { 17struct backing_dev_info default_backing_dev_info = {
23 .name = "default", 18 .name = "default",
24 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 19 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
25 .state = 0, 20 .state = 0,
26 .capabilities = BDI_CAP_MAP_COPY, 21 .capabilities = BDI_CAP_MAP_COPY,
27 .unplug_io_fn = default_unplug_io_fn,
28}; 22};
29EXPORT_SYMBOL_GPL(default_backing_dev_info); 23EXPORT_SYMBOL_GPL(default_backing_dev_info);
30 24
@@ -69,18 +63,18 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
69 unsigned long background_thresh; 63 unsigned long background_thresh;
70 unsigned long dirty_thresh; 64 unsigned long dirty_thresh;
71 unsigned long bdi_thresh; 65 unsigned long bdi_thresh;
72 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; 66 unsigned long nr_dirty, nr_io, nr_more_io;
73 struct inode *inode; 67 struct inode *inode;
74 68
75 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 69 nr_dirty = nr_io = nr_more_io = 0;
76 spin_lock(&inode_lock); 70 spin_lock(&inode_wb_list_lock);
77 list_for_each_entry(inode, &wb->b_dirty, i_list) 71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
78 nr_dirty++; 72 nr_dirty++;
79 list_for_each_entry(inode, &wb->b_io, i_list) 73 list_for_each_entry(inode, &wb->b_io, i_wb_list)
80 nr_io++; 74 nr_io++;
81 list_for_each_entry(inode, &wb->b_more_io, i_list) 75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
82 nr_more_io++; 76 nr_more_io++;
83 spin_unlock(&inode_lock); 77 spin_unlock(&inode_wb_list_lock);
84 78
85 global_dirty_limits(&background_thresh, &dirty_thresh); 79 global_dirty_limits(&background_thresh, &dirty_thresh);
86 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -362,7 +356,7 @@ static int bdi_forker_thread(void *ptr)
362{ 356{
363 struct bdi_writeback *me = ptr; 357 struct bdi_writeback *me = ptr;
364 358
365 current->flags |= PF_FLUSHER | PF_SWAPWRITE; 359 current->flags |= PF_SWAPWRITE;
366 set_freezable(); 360 set_freezable();
367 361
368 /* 362 /*
@@ -604,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
604 spin_lock(&sb_lock); 598 spin_lock(&sb_lock);
605 list_for_each_entry(sb, &super_blocks, s_list) { 599 list_for_each_entry(sb, &super_blocks, s_list) {
606 if (sb->s_bdi == bdi) 600 if (sb->s_bdi == bdi)
607 sb->s_bdi = NULL; 601 sb->s_bdi = &default_backing_dev_info;
608 } 602 }
609 spin_unlock(&sb_lock); 603 spin_unlock(&sb_lock);
610} 604}
@@ -682,11 +676,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
682 if (bdi_has_dirty_io(bdi)) { 676 if (bdi_has_dirty_io(bdi)) {
683 struct bdi_writeback *dst = &default_backing_dev_info.wb; 677 struct bdi_writeback *dst = &default_backing_dev_info.wb;
684 678
685 spin_lock(&inode_lock); 679 spin_lock(&inode_wb_list_lock);
686 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
687 list_splice(&bdi->wb.b_io, &dst->b_io); 681 list_splice(&bdi->wb.b_io, &dst->b_io);
688 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
689 spin_unlock(&inode_lock); 683 spin_unlock(&inode_wb_list_lock);
690 } 684 }
691 685
692 bdi_unregister(bdi); 686 bdi_unregister(bdi);
@@ -729,6 +723,7 @@ static wait_queue_head_t congestion_wqh[2] = {
729 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 723 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
730 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 724 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
731 }; 725 };
726static atomic_t nr_bdi_congested[2];
732 727
733void clear_bdi_congested(struct backing_dev_info *bdi, int sync) 728void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
734{ 729{
@@ -736,7 +731,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
736 wait_queue_head_t *wqh = &congestion_wqh[sync]; 731 wait_queue_head_t *wqh = &congestion_wqh[sync];
737 732
738 bit = sync ? BDI_sync_congested : BDI_async_congested; 733 bit = sync ? BDI_sync_congested : BDI_async_congested;
739 clear_bit(bit, &bdi->state); 734 if (test_and_clear_bit(bit, &bdi->state))
735 atomic_dec(&nr_bdi_congested[sync]);
740 smp_mb__after_clear_bit(); 736 smp_mb__after_clear_bit();
741 if (waitqueue_active(wqh)) 737 if (waitqueue_active(wqh))
742 wake_up(wqh); 738 wake_up(wqh);
@@ -748,7 +744,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
748 enum bdi_state bit; 744 enum bdi_state bit;
749 745
750 bit = sync ? BDI_sync_congested : BDI_async_congested; 746 bit = sync ? BDI_sync_congested : BDI_async_congested;
751 set_bit(bit, &bdi->state); 747 if (!test_and_set_bit(bit, &bdi->state))
748 atomic_inc(&nr_bdi_congested[sync]);
752} 749}
753EXPORT_SYMBOL(set_bdi_congested); 750EXPORT_SYMBOL(set_bdi_congested);
754 751
@@ -764,13 +761,72 @@ EXPORT_SYMBOL(set_bdi_congested);
764long congestion_wait(int sync, long timeout) 761long congestion_wait(int sync, long timeout)
765{ 762{
766 long ret; 763 long ret;
764 unsigned long start = jiffies;
767 DEFINE_WAIT(wait); 765 DEFINE_WAIT(wait);
768 wait_queue_head_t *wqh = &congestion_wqh[sync]; 766 wait_queue_head_t *wqh = &congestion_wqh[sync];
769 767
770 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 768 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
771 ret = io_schedule_timeout(timeout); 769 ret = io_schedule_timeout(timeout);
772 finish_wait(wqh, &wait); 770 finish_wait(wqh, &wait);
771
772 trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
773 jiffies_to_usecs(jiffies - start));
774
773 return ret; 775 return ret;
774} 776}
775EXPORT_SYMBOL(congestion_wait); 777EXPORT_SYMBOL(congestion_wait);
776 778
779/**
780 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
781 * @zone: A zone to check if it is heavily congested
782 * @sync: SYNC or ASYNC IO
783 * @timeout: timeout in jiffies
784 *
785 * In the event of a congested backing_dev (any backing_dev) and the given
786 * @zone has experienced recent congestion, this waits for up to @timeout
787 * jiffies for either a BDI to exit congestion of the given @sync queue
788 * or a write to complete.
789 *
790 * In the absence of zone congestion, cond_resched() is called to yield
791 * the processor if necessary but otherwise does not sleep.
792 *
793 * The return value is 0 if the sleep is for the full timeout. Otherwise,
794 * it is the number of jiffies that were still remaining when the function
795 * returned. return_value == timeout implies the function did not sleep.
796 */
797long wait_iff_congested(struct zone *zone, int sync, long timeout)
798{
799 long ret;
800 unsigned long start = jiffies;
801 DEFINE_WAIT(wait);
802 wait_queue_head_t *wqh = &congestion_wqh[sync];
803
804 /*
805 * If there is no congestion, or heavy congestion is not being
806 * encountered in the current zone, yield if necessary instead
807 * of sleeping on the congestion queue
808 */
809 if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
810 !zone_is_reclaim_congested(zone)) {
811 cond_resched();
812
813 /* In case we scheduled, work out time remaining */
814 ret = timeout - (jiffies - start);
815 if (ret < 0)
816 ret = 0;
817
818 goto out;
819 }
820
821 /* Sleep until uncongested or a write happens */
822 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
823 ret = io_schedule_timeout(timeout);
824 finish_wait(wqh, &wait);
825
826out:
827 trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
828 jiffies_to_usecs(jiffies - start));
829
830 return ret;
831}
832EXPORT_SYMBOL(wait_iff_congested);
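
As the comment above describes, wait_iff_congested() is a refinement of congestion_wait() for reclaim-style throttling: it sleeps only when some BDI is congested and the given zone recently saw congested writeback, and otherwise just yields. A hedged sketch of a caller (the helper name and call site are hypothetical; only the wait_iff_congested() signature comes from this hunk):

	#include <linux/backing-dev.h>
	#include <linux/mmzone.h>
	#include <linux/jiffies.h>

	/* Back off for up to HZ/10 jiffies while writeback to this zone is
	 * congested; otherwise return immediately after a cond_resched().
	 * A return value equal to the timeout means no sleep occurred. */
	static void throttle_zone_reclaim(struct zone *zone)
	{
		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
	}
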
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 142c84a54993..01d5a4b3dd0c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
17#include <linux/range.h> 17#include <linux/range.h>
18#include <linux/memblock.h>
18 19
19#include <asm/bug.h> 20#include <asm/bug.h>
20#include <asm/io.h> 21#include <asm/io.h>
@@ -22,19 +23,17 @@
22 23
23#include "internal.h" 24#include "internal.h"
24 25
26#ifndef CONFIG_NEED_MULTIPLE_NODES
27struct pglist_data __refdata contig_page_data = {
28 .bdata = &bootmem_node_data[0]
29};
30EXPORT_SYMBOL(contig_page_data);
31#endif
32
25unsigned long max_low_pfn; 33unsigned long max_low_pfn;
26unsigned long min_low_pfn; 34unsigned long min_low_pfn;
27unsigned long max_pfn; 35unsigned long max_pfn;
28 36
29#ifdef CONFIG_CRASH_DUMP
30/*
31 * If we have booted due to a crash, max_pfn will be a very low value. We need
32 * to know the amount of memory that the previous kernel used.
33 */
34unsigned long saved_max_pfn;
35#endif
36
37#ifndef CONFIG_NO_BOOTMEM
38bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; 37bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
39 38
40static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); 39static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -145,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
145 min_low_pfn = start; 144 min_low_pfn = start;
146 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); 145 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
147} 146}
148#endif 147
149/* 148/*
150 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
151 * @addr: starting address of the range 150 * @addr: starting address of the range
@@ -170,53 +169,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
170 } 169 }
171} 170}
172 171
173#ifdef CONFIG_NO_BOOTMEM
174static void __init __free_pages_memory(unsigned long start, unsigned long end)
175{
176 int i;
177 unsigned long start_aligned, end_aligned;
178 int order = ilog2(BITS_PER_LONG);
179
180 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
181 end_aligned = end & ~(BITS_PER_LONG - 1);
182
183 if (end_aligned <= start_aligned) {
184 for (i = start; i < end; i++)
185 __free_pages_bootmem(pfn_to_page(i), 0);
186
187 return;
188 }
189
190 for (i = start; i < start_aligned; i++)
191 __free_pages_bootmem(pfn_to_page(i), 0);
192
193 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
194 __free_pages_bootmem(pfn_to_page(i), order);
195
196 for (i = end_aligned; i < end; i++)
197 __free_pages_bootmem(pfn_to_page(i), 0);
198}
199
200unsigned long __init free_all_memory_core_early(int nodeid)
201{
202 int i;
203 u64 start, end;
204 unsigned long count = 0;
205 struct range *range = NULL;
206 int nr_range;
207
208 nr_range = get_free_all_memory_range(&range, nodeid);
209
210 for (i = 0; i < nr_range; i++) {
211 start = range[i].start;
212 end = range[i].end;
213 count += end - start;
214 __free_pages_memory(start, end);
215 }
216
217 return count;
218}
219#else
220static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 172static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
221{ 173{
222 int aligned; 174 int aligned;
@@ -277,7 +229,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
277 229
278 return count; 230 return count;
279} 231}
280#endif
281 232
282/** 233/**
283 * free_all_bootmem_node - release a node's free pages to the buddy allocator 234 * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -288,12 +239,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
288unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 239unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
289{ 240{
290 register_page_bootmem_info_node(pgdat); 241 register_page_bootmem_info_node(pgdat);
291#ifdef CONFIG_NO_BOOTMEM
292 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
293 return 0;
294#else
295 return free_all_bootmem_core(pgdat->bdata); 242 return free_all_bootmem_core(pgdat->bdata);
296#endif
297} 243}
298 244
299/** 245/**
@@ -303,16 +249,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
303 */ 249 */
304unsigned long __init free_all_bootmem(void) 250unsigned long __init free_all_bootmem(void)
305{ 251{
306#ifdef CONFIG_NO_BOOTMEM
307 /*
308 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
309 * because in some case like Node0 doesnt have RAM installed
310 * low ram will be on Node1
311 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
312 * will be used instead of only Node0 related
313 */
314 return free_all_memory_core_early(MAX_NUMNODES);
315#else
316 unsigned long total_pages = 0; 252 unsigned long total_pages = 0;
317 bootmem_data_t *bdata; 253 bootmem_data_t *bdata;
318 254
@@ -320,10 +256,8 @@ unsigned long __init free_all_bootmem(void)
320 total_pages += free_all_bootmem_core(bdata); 256 total_pages += free_all_bootmem_core(bdata);
321 257
322 return total_pages; 258 return total_pages;
323#endif
324} 259}
325 260
326#ifndef CONFIG_NO_BOOTMEM
327static void __init __free(bootmem_data_t *bdata, 261static void __init __free(bootmem_data_t *bdata,
328 unsigned long sidx, unsigned long eidx) 262 unsigned long sidx, unsigned long eidx)
329{ 263{
@@ -418,7 +352,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
418 } 352 }
419 BUG(); 353 BUG();
420} 354}
421#endif
422 355
423/** 356/**
424 * free_bootmem_node - mark a page range as usable 357 * free_bootmem_node - mark a page range as usable
@@ -433,9 +366,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
433void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 366void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
434 unsigned long size) 367 unsigned long size)
435{ 368{
436#ifdef CONFIG_NO_BOOTMEM
437 free_early(physaddr, physaddr + size);
438#else
439 unsigned long start, end; 369 unsigned long start, end;
440 370
441 kmemleak_free_part(__va(physaddr), size); 371 kmemleak_free_part(__va(physaddr), size);
@@ -444,7 +374,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
444 end = PFN_DOWN(physaddr + size); 374 end = PFN_DOWN(physaddr + size);
445 375
446 mark_bootmem_node(pgdat->bdata, start, end, 0, 0); 376 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
447#endif
448} 377}
449 378
450/** 379/**
@@ -458,9 +387,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
458 */ 387 */
459void __init free_bootmem(unsigned long addr, unsigned long size) 388void __init free_bootmem(unsigned long addr, unsigned long size)
460{ 389{
461#ifdef CONFIG_NO_BOOTMEM
462 free_early(addr, addr + size);
463#else
464 unsigned long start, end; 390 unsigned long start, end;
465 391
466 kmemleak_free_part(__va(addr), size); 392 kmemleak_free_part(__va(addr), size);
@@ -469,7 +395,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
469 end = PFN_DOWN(addr + size); 395 end = PFN_DOWN(addr + size);
470 396
471 mark_bootmem(start, end, 0, 0); 397 mark_bootmem(start, end, 0, 0);
472#endif
473} 398}
474 399
475/** 400/**
@@ -486,17 +411,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
486int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 411int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
487 unsigned long size, int flags) 412 unsigned long size, int flags)
488{ 413{
489#ifdef CONFIG_NO_BOOTMEM
490 panic("no bootmem");
491 return 0;
492#else
493 unsigned long start, end; 414 unsigned long start, end;
494 415
495 start = PFN_DOWN(physaddr); 416 start = PFN_DOWN(physaddr);
496 end = PFN_UP(physaddr + size); 417 end = PFN_UP(physaddr + size);
497 418
498 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); 419 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
499#endif
500} 420}
501 421
502/** 422/**
@@ -512,20 +432,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
512int __init reserve_bootmem(unsigned long addr, unsigned long size, 432int __init reserve_bootmem(unsigned long addr, unsigned long size,
513 int flags) 433 int flags)
514{ 434{
515#ifdef CONFIG_NO_BOOTMEM
516 panic("no bootmem");
517 return 0;
518#else
519 unsigned long start, end; 435 unsigned long start, end;
520 436
521 start = PFN_DOWN(addr); 437 start = PFN_DOWN(addr);
522 end = PFN_UP(addr + size); 438 end = PFN_UP(addr + size);
523 439
524 return mark_bootmem(start, end, 1, flags); 440 return mark_bootmem(start, end, 1, flags);
525#endif
526} 441}
527 442
528#ifndef CONFIG_NO_BOOTMEM 443int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
444 int flags)
445{
446 return reserve_bootmem(phys, len, flags);
447}
448
529static unsigned long __init align_idx(struct bootmem_data *bdata, 449static unsigned long __init align_idx(struct bootmem_data *bdata,
530 unsigned long idx, unsigned long step) 450 unsigned long idx, unsigned long step)
531{ 451{
@@ -676,33 +596,12 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
676#endif 596#endif
677 return NULL; 597 return NULL;
678} 598}
679#endif
680 599
681static void * __init ___alloc_bootmem_nopanic(unsigned long size, 600static void * __init ___alloc_bootmem_nopanic(unsigned long size,
682 unsigned long align, 601 unsigned long align,
683 unsigned long goal, 602 unsigned long goal,
684 unsigned long limit) 603 unsigned long limit)
685{ 604{
686#ifdef CONFIG_NO_BOOTMEM
687 void *ptr;
688
689 if (WARN_ON_ONCE(slab_is_available()))
690 return kzalloc(size, GFP_NOWAIT);
691
692restart:
693
694 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
695
696 if (ptr)
697 return ptr;
698
699 if (goal != 0) {
700 goal = 0;
701 goto restart;
702 }
703
704 return NULL;
705#else
706 bootmem_data_t *bdata; 605 bootmem_data_t *bdata;
707 void *region; 606 void *region;
708 607
@@ -728,7 +627,6 @@ restart:
728 } 627 }
729 628
730 return NULL; 629 return NULL;
731#endif
732} 630}
733 631
734/** 632/**
@@ -749,10 +647,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
749{ 647{
750 unsigned long limit = 0; 648 unsigned long limit = 0;
751 649
752#ifdef CONFIG_NO_BOOTMEM
753 limit = -1UL;
754#endif
755
756 return ___alloc_bootmem_nopanic(size, align, goal, limit); 650 return ___alloc_bootmem_nopanic(size, align, goal, limit);
757} 651}
758 652
@@ -789,14 +683,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
789{ 683{
790 unsigned long limit = 0; 684 unsigned long limit = 0;
791 685
792#ifdef CONFIG_NO_BOOTMEM
793 limit = -1UL;
794#endif
795
796 return ___alloc_bootmem(size, align, goal, limit); 686 return ___alloc_bootmem(size, align, goal, limit);
797} 687}
798 688
799#ifndef CONFIG_NO_BOOTMEM
800static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 689static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
801 unsigned long size, unsigned long align, 690 unsigned long size, unsigned long align,
802 unsigned long goal, unsigned long limit) 691 unsigned long goal, unsigned long limit)
@@ -813,7 +702,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
813 702
814 return ___alloc_bootmem(size, align, goal, limit); 703 return ___alloc_bootmem(size, align, goal, limit);
815} 704}
816#endif
817 705
818/** 706/**
819 * __alloc_bootmem_node - allocate boot memory from a specific node 707 * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -833,24 +721,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
833void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 721void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
834 unsigned long align, unsigned long goal) 722 unsigned long align, unsigned long goal)
835{ 723{
836 void *ptr;
837
838 if (WARN_ON_ONCE(slab_is_available())) 724 if (WARN_ON_ONCE(slab_is_available()))
839 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 725 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
840 726
841#ifdef CONFIG_NO_BOOTMEM 727 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
842 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
843 goal, -1ULL);
844 if (ptr)
845 return ptr;
846
847 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
848 goal, -1ULL);
849#else
850 ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
851#endif
852
853 return ptr;
854} 728}
855 729
856void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 730void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -871,13 +745,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
871 unsigned long new_goal; 745 unsigned long new_goal;
872 746
873 new_goal = MAX_DMA32_PFN << PAGE_SHIFT; 747 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
874#ifdef CONFIG_NO_BOOTMEM
875 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
876 new_goal, -1ULL);
877#else
878 ptr = alloc_bootmem_core(pgdat->bdata, size, align, 748 ptr = alloc_bootmem_core(pgdat->bdata, size, align,
879 new_goal, 0); 749 new_goal, 0);
880#endif
881 if (ptr) 750 if (ptr)
882 return ptr; 751 return ptr;
883 } 752 }
@@ -898,16 +767,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
898void * __init alloc_bootmem_section(unsigned long size, 767void * __init alloc_bootmem_section(unsigned long size,
899 unsigned long section_nr) 768 unsigned long section_nr)
900{ 769{
901#ifdef CONFIG_NO_BOOTMEM
902 unsigned long pfn, goal, limit;
903
904 pfn = section_nr_to_pfn(section_nr);
905 goal = pfn << PAGE_SHIFT;
906 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
907
908 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
909 SMP_CACHE_BYTES, goal, limit);
910#else
911 bootmem_data_t *bdata; 770 bootmem_data_t *bdata;
912 unsigned long pfn, goal, limit; 771 unsigned long pfn, goal, limit;
913 772
@@ -917,7 +776,6 @@ void * __init alloc_bootmem_section(unsigned long size,
917 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 776 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
918 777
919 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 778 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
920#endif
921} 779}
922#endif 780#endif
923 781
@@ -929,16 +787,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
929 if (WARN_ON_ONCE(slab_is_available())) 787 if (WARN_ON_ONCE(slab_is_available()))
930 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 788 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
931 789
932#ifdef CONFIG_NO_BOOTMEM
933 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
934 goal, -1ULL);
935#else
936 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 790 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
937 if (ptr) 791 if (ptr)
938 return ptr; 792 return ptr;
939 793
940 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 794 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
941#endif
942 if (ptr) 795 if (ptr)
943 return ptr; 796 return ptr;
944 797
@@ -986,21 +839,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
986void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 839void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
987 unsigned long align, unsigned long goal) 840 unsigned long align, unsigned long goal)
988{ 841{
989 void *ptr;
990
991 if (WARN_ON_ONCE(slab_is_available())) 842 if (WARN_ON_ONCE(slab_is_available()))
992 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 843 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
993 844
994#ifdef CONFIG_NO_BOOTMEM 845 return ___alloc_bootmem_node(pgdat->bdata, size, align,
995 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
996 goal, ARCH_LOW_ADDRESS_LIMIT); 846 goal, ARCH_LOW_ADDRESS_LIMIT);
997 if (ptr)
998 return ptr;
999 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
1000 goal, ARCH_LOW_ADDRESS_LIMIT);
1001#else
1002 ptr = ___alloc_bootmem_node(pgdat->bdata, size, align,
1003 goal, ARCH_LOW_ADDRESS_LIMIT);
1004#endif
1005 return ptr;
1006} 847}
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 000000000000..bcaae4c2a770
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,244 @@
1/*
2 * Cleancache frontend
3 *
4 * This code provides the generic "frontend" layer to call a matching
5 * "backend" driver implementation of cleancache. See
6 * Documentation/vm/cleancache.txt for more information.
7 *
8 * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
9 * Author: Dan Magenheimer
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */
13
14#include <linux/module.h>
15#include <linux/fs.h>
16#include <linux/exportfs.h>
17#include <linux/mm.h>
18#include <linux/cleancache.h>
19
20/*
21 * This global enablement flag may be read thousands of times per second
22 * by cleancache_get/put/flush even on systems where cleancache_ops
23 * is not claimed (e.g. cleancache is config'ed on but remains
24 * disabled), so is preferred to the slower alternative: a function
25 * call that checks a non-global.
26 */
27int cleancache_enabled;
28EXPORT_SYMBOL(cleancache_enabled);
29
30/*
31 * cleancache_ops is set by cleancache_ops_register to contain the pointers
32 * to the cleancache "backend" implementation functions.
33 */
34static struct cleancache_ops cleancache_ops;
35
36/* useful stats available in /sys/kernel/mm/cleancache */
37static unsigned long cleancache_succ_gets;
38static unsigned long cleancache_failed_gets;
39static unsigned long cleancache_puts;
40static unsigned long cleancache_flushes;
41
42/*
43 * register operations for cleancache, returning previous thus allowing
44 * detection of multiple backends and possible nesting
45 */
46struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
47{
48 struct cleancache_ops old = cleancache_ops;
49
50 cleancache_ops = *ops;
51 cleancache_enabled = 1;
52 return old;
53}
54EXPORT_SYMBOL(cleancache_register_ops);
55
56/* Called by a cleancache-enabled filesystem at time of mount */
57void __cleancache_init_fs(struct super_block *sb)
58{
59 sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
60}
61EXPORT_SYMBOL(__cleancache_init_fs);
62
63/* Called by a cleancache-enabled clustered filesystem at time of mount */
64void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
65{
66 sb->cleancache_poolid =
67 (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
68}
69EXPORT_SYMBOL(__cleancache_init_shared_fs);
70
71/*
72 * If the filesystem uses exportable filehandles, use the filehandle as
73 * the key, else use the inode number.
74 */
75static int cleancache_get_key(struct inode *inode,
76 struct cleancache_filekey *key)
77{
78 int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
79 int len = 0, maxlen = CLEANCACHE_KEY_MAX;
80 struct super_block *sb = inode->i_sb;
81
82 key->u.ino = inode->i_ino;
83 if (sb->s_export_op != NULL) {
84 fhfn = sb->s_export_op->encode_fh;
85 if (fhfn) {
86 struct dentry d;
87 d.d_inode = inode;
88 len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
89 if (len <= 0 || len == 255)
90 return -1;
91 if (maxlen > CLEANCACHE_KEY_MAX)
92 return -1;
93 }
94 }
95 return 0;
96}
97
98/*
99 * "Get" data from cleancache associated with the poolid/inode/index
100 * that were specified when the data was put to cleanache and, if
101 * successful, use it to fill the specified page with data and return 0.
102 * The pageframe is unchanged and returns -1 if the get fails.
103 * Page must be locked by caller.
104 */
105int __cleancache_get_page(struct page *page)
106{
107 int ret = -1;
108 int pool_id;
109 struct cleancache_filekey key = { .u.key = { 0 } };
110
111 VM_BUG_ON(!PageLocked(page));
112 pool_id = page->mapping->host->i_sb->cleancache_poolid;
113 if (pool_id < 0)
114 goto out;
115
116 if (cleancache_get_key(page->mapping->host, &key) < 0)
117 goto out;
118
119 ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
120 if (ret == 0)
121 cleancache_succ_gets++;
122 else
123 cleancache_failed_gets++;
124out:
125 return ret;
126}
127EXPORT_SYMBOL(__cleancache_get_page);
128
129/*
130 * "Put" data from a page to cleancache and associate it with the
131 * (previously-obtained per-filesystem) poolid and the page's,
132 * inode and page index. Page must be locked. Note that a put_page
133 * always "succeeds", though a subsequent get_page may succeed or fail.
134 */
135void __cleancache_put_page(struct page *page)
136{
137 int pool_id;
138 struct cleancache_filekey key = { .u.key = { 0 } };
139
140 VM_BUG_ON(!PageLocked(page));
141 pool_id = page->mapping->host->i_sb->cleancache_poolid;
142 if (pool_id >= 0 &&
143 cleancache_get_key(page->mapping->host, &key) >= 0) {
144 (*cleancache_ops.put_page)(pool_id, key, page->index, page);
145 cleancache_puts++;
146 }
147}
148EXPORT_SYMBOL(__cleancache_put_page);
149
150/*
151 * Flush any data from cleancache associated with the poolid and the
152 * page's inode and page index so that a subsequent "get" will fail.
153 */
154void __cleancache_flush_page(struct address_space *mapping, struct page *page)
155{
156 /* careful... page->mapping is NULL sometimes when this is called */
157 int pool_id = mapping->host->i_sb->cleancache_poolid;
158 struct cleancache_filekey key = { .u.key = { 0 } };
159
160 if (pool_id >= 0) {
161 VM_BUG_ON(!PageLocked(page));
162 if (cleancache_get_key(mapping->host, &key) >= 0) {
163 (*cleancache_ops.flush_page)(pool_id, key, page->index);
164 cleancache_flushes++;
165 }
166 }
167}
168EXPORT_SYMBOL(__cleancache_flush_page);
169
170/*
171 * Flush all data from cleancache associated with the poolid and the
172 * mappings's inode so that all subsequent gets to this poolid/inode
173 * will fail.
174 */
175void __cleancache_flush_inode(struct address_space *mapping)
176{
177 int pool_id = mapping->host->i_sb->cleancache_poolid;
178 struct cleancache_filekey key = { .u.key = { 0 } };
179
180 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
181 (*cleancache_ops.flush_inode)(pool_id, key);
182}
183EXPORT_SYMBOL(__cleancache_flush_inode);
184
185/*
186 * Called by any cleancache-enabled filesystem at time of unmount;
187 * note that pool_id is surrendered and may be returned by a subsequent
188 * cleancache_init_fs or cleancache_init_shared_fs
189 */
190void __cleancache_flush_fs(struct super_block *sb)
191{
192 if (sb->cleancache_poolid >= 0) {
193 int old_poolid = sb->cleancache_poolid;
194 sb->cleancache_poolid = -1;
195 (*cleancache_ops.flush_fs)(old_poolid);
196 }
197}
198EXPORT_SYMBOL(__cleancache_flush_fs);
199
200#ifdef CONFIG_SYSFS
201
202/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
203
204#define CLEANCACHE_SYSFS_RO(_name) \
205 static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
206 struct kobj_attribute *attr, char *buf) \
207 { \
208 return sprintf(buf, "%lu\n", cleancache_##_name); \
209 } \
210 static struct kobj_attribute cleancache_##_name##_attr = { \
211 .attr = { .name = __stringify(_name), .mode = 0444 }, \
212 .show = cleancache_##_name##_show, \
213 }
214
215CLEANCACHE_SYSFS_RO(succ_gets);
216CLEANCACHE_SYSFS_RO(failed_gets);
217CLEANCACHE_SYSFS_RO(puts);
218CLEANCACHE_SYSFS_RO(flushes);
219
220static struct attribute *cleancache_attrs[] = {
221 &cleancache_succ_gets_attr.attr,
222 &cleancache_failed_gets_attr.attr,
223 &cleancache_puts_attr.attr,
224 &cleancache_flushes_attr.attr,
225 NULL,
226};
227
228static struct attribute_group cleancache_attr_group = {
229 .attrs = cleancache_attrs,
230 .name = "cleancache",
231};
232
233#endif /* CONFIG_SYSFS */
234
235static int __init init_cleancache(void)
236{
237#ifdef CONFIG_SYSFS
238 int err;
239
240 err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
241#endif /* CONFIG_SYSFS */
242 return 0;
243}
244module_init(init_cleancache)
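
The indirect calls above define the backend contract that cleancache_register_ops() expects: init_fs, init_shared_fs, get_page, put_page, flush_page, flush_inode and flush_fs. A minimal sketch of a no-op backend, with prototypes assumed to match include/linux/cleancache.h and the call sites in this file (a real backend such as zcache or Xen tmem would store and look up page contents here):

	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/cleancache.h>

	static int noop_init_fs(size_t pagesize)
	{
		return -1;		/* no pool: the frontend skips later calls */
	}

	static int noop_init_shared_fs(char *uuid, size_t pagesize)
	{
		return -1;
	}

	static int noop_get_page(int pool, struct cleancache_filekey key,
				 pgoff_t index, struct page *page)
	{
		return -1;		/* always a miss */
	}

	static void noop_put_page(int pool, struct cleancache_filekey key,
				  pgoff_t index, struct page *page)
	{
	}

	static void noop_flush_page(int pool, struct cleancache_filekey key,
				    pgoff_t index)
	{
	}

	static void noop_flush_inode(int pool, struct cleancache_filekey key)
	{
	}

	static void noop_flush_fs(int pool)
	{
	}

	static struct cleancache_ops noop_cleancache_ops = {
		.init_fs	= noop_init_fs,
		.init_shared_fs	= noop_init_shared_fs,
		.get_page	= noop_get_page,
		.put_page	= noop_put_page,
		.flush_page	= noop_flush_page,
		.flush_inode	= noop_flush_inode,
		.flush_fs	= noop_flush_fs,
	};

	static int __init noop_cleancache_init(void)
	{
		/* returns the previously registered ops, which a nesting
		 * backend could chain to; discarded here */
		cleancache_register_ops(&noop_cleancache_ops);
		return 0;
	}
	module_init(noop_cleancache_init);
	MODULE_LICENSE("GPL");
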
diff --git a/mm/compaction.c b/mm/compaction.c
index 4d709ee59013..6cc604bd5649 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include "internal.h" 17#include "internal.h"
18 18
19#define CREATE_TRACE_POINTS
20#include <trace/events/compaction.h>
21
19/* 22/*
20 * compact_control is used to track pages being migrated and the free pages 23 * compact_control is used to track pages being migrated and the free pages
21 * they are being migrated to during memory compaction. The free_pfn starts 24 * they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
30 unsigned long nr_migratepages; /* Number of pages to migrate */ 33 unsigned long nr_migratepages; /* Number of pages to migrate */
31 unsigned long free_pfn; /* isolate_freepages search base */ 34 unsigned long free_pfn; /* isolate_freepages search base */
32 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */
33 37
34 /* Account for isolated anon and file pages */ 38 /* Account for isolated anon and file pages */
35 unsigned long nr_anon; 39 unsigned long nr_anon;
@@ -60,7 +64,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
60 struct list_head *freelist) 64 struct list_head *freelist)
61{ 65{
62 unsigned long zone_end_pfn, end_pfn; 66 unsigned long zone_end_pfn, end_pfn;
63 int total_isolated = 0; 67 int nr_scanned = 0, total_isolated = 0;
64 struct page *cursor; 68 struct page *cursor;
65 69
66 /* Get the last PFN we should scan for free pages at */ 70 /* Get the last PFN we should scan for free pages at */
@@ -81,6 +85,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
81 85
82 if (!pfn_valid_within(blockpfn)) 86 if (!pfn_valid_within(blockpfn))
83 continue; 87 continue;
88 nr_scanned++;
84 89
85 if (!PageBuddy(page)) 90 if (!PageBuddy(page))
86 continue; 91 continue;
@@ -100,6 +105,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
100 } 105 }
101 } 106 }
102 107
108 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
103 return total_isolated; 109 return total_isolated;
104} 110}
105 111
@@ -138,16 +144,26 @@ static void isolate_freepages(struct zone *zone,
138 int nr_freepages = cc->nr_freepages; 144 int nr_freepages = cc->nr_freepages;
139 struct list_head *freelist = &cc->freepages; 145 struct list_head *freelist = &cc->freepages;
140 146
147 /*
148 * Initialise the free scanner. The starting point is where we last
149 * scanned from (or the end of the zone if starting). The low point
150 * is the end of the pageblock the migration scanner is using.
151 */
141 pfn = cc->free_pfn; 152 pfn = cc->free_pfn;
142 low_pfn = cc->migrate_pfn + pageblock_nr_pages; 153 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
143 high_pfn = low_pfn; 154
155 /*
156 * Take care that if the migration scanner is at the end of the zone
157 * that the free scanner does not accidentally move to the next zone
158 * in the next isolation cycle.
159 */
160 high_pfn = min(low_pfn, pfn);
144 161
145 /* 162 /*
146 * Isolate free pages until enough are available to migrate the 163 * Isolate free pages until enough are available to migrate the
147 * pages on cc->migratepages. We stop searching if the migrate 164 * pages on cc->migratepages. We stop searching if the migrate
148 * and free page scanners meet or enough free pages are isolated. 165 * and free page scanners meet or enough free pages are isolated.
149 */ 166 */
150 spin_lock_irqsave(&zone->lock, flags);
151 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 167 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
152 pfn -= pageblock_nr_pages) { 168 pfn -= pageblock_nr_pages) {
153 unsigned long isolated; 169 unsigned long isolated;
@@ -170,9 +186,19 @@ static void isolate_freepages(struct zone *zone,
170 if (!suitable_migration_target(page)) 186 if (!suitable_migration_target(page))
171 continue; 187 continue;
172 188
173 /* Found a block suitable for isolating free pages from */ 189 /*
174 isolated = isolate_freepages_block(zone, pfn, freelist); 190 * Found a block suitable for isolating free pages from. Now
175 nr_freepages += isolated; 191 * we disabled interrupts, double check things are ok and
192 * isolate the pages. This is to minimise the time IRQs
193 * are disabled
194 */
195 isolated = 0;
196 spin_lock_irqsave(&zone->lock, flags);
197 if (suitable_migration_target(page)) {
198 isolated = isolate_freepages_block(zone, pfn, freelist);
199 nr_freepages += isolated;
200 }
201 spin_unlock_irqrestore(&zone->lock, flags);
176 202
177 /* 203 /*
178 * Record the highest PFN we isolated pages from. When next 204 * Record the highest PFN we isolated pages from. When next
@@ -182,7 +208,6 @@ static void isolate_freepages(struct zone *zone,
182 if (isolated) 208 if (isolated)
183 high_pfn = max(high_pfn, pfn); 209 high_pfn = max(high_pfn, pfn);
184 } 210 }
185 spin_unlock_irqrestore(&zone->lock, flags);
186 211
187 /* split_free_page does not map the pages */ 212 /* split_free_page does not map the pages */
188 list_for_each_entry(page, freelist, lru) { 213 list_for_each_entry(page, freelist, lru) {
@@ -226,14 +251,23 @@ static bool too_many_isolated(struct zone *zone)
226 return isolated > (inactive + active) / 2; 251 return isolated > (inactive + active) / 2;
227} 252}
228 253
254/* possible outcome of isolate_migratepages */
255typedef enum {
256 ISOLATE_ABORT, /* Abort compaction now */
257 ISOLATE_NONE, /* No pages isolated, continue scanning */
258 ISOLATE_SUCCESS, /* Pages isolated, migrate */
259} isolate_migrate_t;
260
229/* 261/*
230 * Isolate all pages that can be migrated from the block pointed to by 262 * Isolate all pages that can be migrated from the block pointed to by
231 * the migrate scanner within compact_control. 263 * the migrate scanner within compact_control.
232 */ 264 */
233static unsigned long isolate_migratepages(struct zone *zone, 265static isolate_migrate_t isolate_migratepages(struct zone *zone,
234 struct compact_control *cc) 266 struct compact_control *cc)
235{ 267{
236 unsigned long low_pfn, end_pfn; 268 unsigned long low_pfn, end_pfn;
269 unsigned long last_pageblock_nr = 0, pageblock_nr;
270 unsigned long nr_scanned = 0, nr_isolated = 0;
237 struct list_head *migratelist = &cc->migratepages; 271 struct list_head *migratelist = &cc->migratepages;
238 272
239 /* Do not scan outside zone boundaries */ 273 /* Do not scan outside zone boundaries */
@@ -245,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
245 /* Do not cross the free scanner or scan within a memory hole */ 279 /* Do not cross the free scanner or scan within a memory hole */
246 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 280 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
247 cc->migrate_pfn = end_pfn; 281 cc->migrate_pfn = end_pfn;
248 return 0; 282 return ISOLATE_NONE;
249 } 283 }
250 284
251 /* 285 /*
@@ -254,33 +288,85 @@ static unsigned long isolate_migratepages(struct zone *zone,
254 * delay for some time until fewer pages are isolated 288 * delay for some time until fewer pages are isolated
255 */ 289 */
256 while (unlikely(too_many_isolated(zone))) { 290 while (unlikely(too_many_isolated(zone))) {
291 /* async migration should just abort */
292 if (!cc->sync)
293 return ISOLATE_ABORT;
294
257 congestion_wait(BLK_RW_ASYNC, HZ/10); 295 congestion_wait(BLK_RW_ASYNC, HZ/10);
258 296
259 if (fatal_signal_pending(current)) 297 if (fatal_signal_pending(current))
260 return 0; 298 return ISOLATE_ABORT;
261 } 299 }
262 300
263 /* Time to isolate some pages for migration */ 301 /* Time to isolate some pages for migration */
302 cond_resched();
264 spin_lock_irq(&zone->lru_lock); 303 spin_lock_irq(&zone->lru_lock);
265 for (; low_pfn < end_pfn; low_pfn++) { 304 for (; low_pfn < end_pfn; low_pfn++) {
266 struct page *page; 305 struct page *page;
306 bool locked = true;
307
308 /* give a chance to irqs before checking need_resched() */
309 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
310 spin_unlock_irq(&zone->lru_lock);
311 locked = false;
312 }
313 if (need_resched() || spin_is_contended(&zone->lru_lock)) {
314 if (locked)
315 spin_unlock_irq(&zone->lru_lock);
316 cond_resched();
317 spin_lock_irq(&zone->lru_lock);
318 if (fatal_signal_pending(current))
319 break;
320 } else if (!locked)
321 spin_lock_irq(&zone->lru_lock);
322
267 if (!pfn_valid_within(low_pfn)) 323 if (!pfn_valid_within(low_pfn))
268 continue; 324 continue;
325 nr_scanned++;
269 326
270 /* Get the page and skip if free */ 327 /* Get the page and skip if free */
271 page = pfn_to_page(low_pfn); 328 page = pfn_to_page(low_pfn);
272 if (PageBuddy(page)) 329 if (PageBuddy(page))
273 continue; 330 continue;
274 331
332 /*
333 * For async migration, also only scan in MOVABLE blocks. Async
334 * migration is optimistic to see if the minimum amount of work
335 * satisfies the allocation
336 */
337 pageblock_nr = low_pfn >> pageblock_order;
338 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
339 get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
340 low_pfn += pageblock_nr_pages;
341 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
342 last_pageblock_nr = pageblock_nr;
343 continue;
344 }
345
346 if (!PageLRU(page))
347 continue;
348
349 /*
350 * PageLRU is set, and lru_lock excludes isolation,
351 * splitting and collapsing (collapsing has already
352 * happened if PageLRU is set).
353 */
354 if (PageTransHuge(page)) {
355 low_pfn += (1 << compound_order(page)) - 1;
356 continue;
357 }
358
275 /* Try isolate the page */ 359 /* Try isolate the page */
276 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) 360 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
277 continue; 361 continue;
278 362
363 VM_BUG_ON(PageTransCompound(page));
364
279 /* Successfully isolated */ 365 /* Successfully isolated */
280 del_page_from_lru_list(zone, page, page_lru(page)); 366 del_page_from_lru_list(zone, page, page_lru(page));
281 list_add(&page->lru, migratelist); 367 list_add(&page->lru, migratelist);
282 mem_cgroup_del_lru(page);
283 cc->nr_migratepages++; 368 cc->nr_migratepages++;
369 nr_isolated++;
284 370
285 /* Avoid isolating too much */ 371 /* Avoid isolating too much */
286 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) 372 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -292,7 +378,9 @@ static unsigned long isolate_migratepages(struct zone *zone,
292 spin_unlock_irq(&zone->lru_lock); 378 spin_unlock_irq(&zone->lru_lock);
293 cc->migrate_pfn = low_pfn; 379 cc->migrate_pfn = low_pfn;
294 380
295 return cc->nr_migratepages; 381 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
382
383 return ISOLATE_SUCCESS;
296} 384}
297 385
298/* 386/*
@@ -342,10 +430,10 @@ static void update_nr_listpages(struct compact_control *cc)
342} 430}
343 431
344static int compact_finished(struct zone *zone, 432static int compact_finished(struct zone *zone,
345 struct compact_control *cc) 433 struct compact_control *cc)
346{ 434{
347 unsigned int order; 435 unsigned int order;
348 unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); 436 unsigned long watermark;
349 437
350 if (fatal_signal_pending(current)) 438 if (fatal_signal_pending(current))
351 return COMPACT_PARTIAL; 439 return COMPACT_PARTIAL;
@@ -354,11 +442,18 @@ static int compact_finished(struct zone *zone,
354 if (cc->free_pfn <= cc->migrate_pfn) 442 if (cc->free_pfn <= cc->migrate_pfn)
355 return COMPACT_COMPLETE; 443 return COMPACT_COMPLETE;
356 444
357 /* Compaction run is not finished if the watermark is not met */ 445 /*
358 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 446 * order == -1 is expected when compacting via
447 * /proc/sys/vm/compact_memory
448 */
449 if (cc->order == -1)
359 return COMPACT_CONTINUE; 450 return COMPACT_CONTINUE;
360 451
361 if (cc->order == -1) 452 /* Compaction run is not finished if the watermark is not met */
453 watermark = low_wmark_pages(zone);
454 watermark += (1 << cc->order);
455
456 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
362 return COMPACT_CONTINUE; 457 return COMPACT_CONTINUE;
363 458
364 /* Direct compactor: Is a suitable page free? */ 459 /* Direct compactor: Is a suitable page free? */
@@ -375,10 +470,71 @@ static int compact_finished(struct zone *zone,
375 return COMPACT_CONTINUE; 470 return COMPACT_CONTINUE;
376} 471}
377 472
473/*
474 * compaction_suitable: Is this suitable to run compaction on this zone now?
475 * Returns
476 * COMPACT_SKIPPED - If there are too few free pages for compaction
477 * COMPACT_PARTIAL - If the allocation would succeed without compaction
478 * COMPACT_CONTINUE - If compaction should run now
479 */
480unsigned long compaction_suitable(struct zone *zone, int order)
481{
482 int fragindex;
483 unsigned long watermark;
484
485 /*
486 * order == -1 is expected when compacting via
487 * /proc/sys/vm/compact_memory
488 */
489 if (order == -1)
490 return COMPACT_CONTINUE;
491
492 /*
493 * Watermarks for order-0 must be met for compaction. Note the 2UL.
494 * This is because during migration, copies of pages need to be
495 * allocated and for a short time, the footprint is higher
496 */
497 watermark = low_wmark_pages(zone) + (2UL << order);
498 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
499 return COMPACT_SKIPPED;
500
501 /*
502 * fragmentation index determines if allocation failures are due to
503 * low memory or external fragmentation
504 *
505 * index of -1000 implies allocations might succeed depending on
506 * watermarks
507 * index towards 0 implies failure is due to lack of memory
508 * index towards 1000 implies failure is due to fragmentation
509 *
510 * Only compact if a failure would be due to fragmentation.
511 */
512 fragindex = fragmentation_index(zone, order);
513 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
514 return COMPACT_SKIPPED;
515
516 if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
517 0, 0))
518 return COMPACT_PARTIAL;
519
520 return COMPACT_CONTINUE;
521}
522
378static int compact_zone(struct zone *zone, struct compact_control *cc) 523static int compact_zone(struct zone *zone, struct compact_control *cc)
379{ 524{
380 int ret; 525 int ret;
381 526
527 ret = compaction_suitable(zone, cc->order);
528 switch (ret) {
529 case COMPACT_PARTIAL:
530 case COMPACT_SKIPPED:
531 /* Compaction is likely to fail */
532 return ret;
533 case COMPACT_CONTINUE:
534 /* Fall through to compaction */
535 ;
536 }
537
382 /* Setup to move all movable pages to the end of the zone */ 538 /* Setup to move all movable pages to the end of the zone */
383 cc->migrate_pfn = zone->zone_start_pfn; 539 cc->migrate_pfn = zone->zone_start_pfn;
384 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 540 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
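compaction_suitable() is the newly exported gate that compact_zone() consults above before doing any work; the 2UL << order headroom accounts for the extra pages temporarily needed while migration copies data (for an order-9 request with 4 KiB pages, 1024 pages, roughly 4 MiB). A minimal caller sketch, illustrative only and not part of the patch, assuming <linux/compaction.h>:

	/* Illustrative: decide whether a compaction attempt is worthwhile. */
	static bool should_compact(struct zone *zone, int order)
	{
		switch (compaction_suitable(zone, order)) {
		case COMPACT_SKIPPED:	/* too few free pages to migrate into */
		case COMPACT_PARTIAL:	/* the allocation would already succeed */
			return false;
		case COMPACT_CONTINUE:	/* failure would be due to fragmentation */
		default:
			return true;
		}
	}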
@@ -388,13 +544,22 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
388 544
389 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 545 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
390 unsigned long nr_migrate, nr_remaining; 546 unsigned long nr_migrate, nr_remaining;
547 int err;
391 548
392 if (!isolate_migratepages(zone, cc)) 549 switch (isolate_migratepages(zone, cc)) {
550 case ISOLATE_ABORT:
551 ret = COMPACT_PARTIAL;
552 goto out;
553 case ISOLATE_NONE:
393 continue; 554 continue;
555 case ISOLATE_SUCCESS:
556 ;
557 }
394 558
395 nr_migrate = cc->nr_migratepages; 559 nr_migrate = cc->nr_migratepages;
396 migrate_pages(&cc->migratepages, compaction_alloc, 560 err = migrate_pages(&cc->migratepages, compaction_alloc,
397 (unsigned long)cc, 0); 561 (unsigned long)cc, false,
562 cc->sync);
398 update_nr_listpages(cc); 563 update_nr_listpages(cc);
399 nr_remaining = cc->nr_migratepages; 564 nr_remaining = cc->nr_migratepages;
400 565
@@ -402,15 +567,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
402 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); 567 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
403 if (nr_remaining) 568 if (nr_remaining)
404 count_vm_events(COMPACTPAGEFAILED, nr_remaining); 569 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
570 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
571 nr_remaining);
405 572
406 /* Release LRU pages not migrated */ 573 /* Release LRU pages not migrated */
407 if (!list_empty(&cc->migratepages)) { 574 if (err) {
408 putback_lru_pages(&cc->migratepages); 575 putback_lru_pages(&cc->migratepages);
409 cc->nr_migratepages = 0; 576 cc->nr_migratepages = 0;
410 } 577 }
411 578
412 } 579 }
413 580
581out:
414 /* Release free pages and check accounting */ 582 /* Release free pages and check accounting */
415 cc->nr_freepages -= release_freepages(&cc->freepages); 583 cc->nr_freepages -= release_freepages(&cc->freepages);
416 VM_BUG_ON(cc->nr_freepages != 0); 584 VM_BUG_ON(cc->nr_freepages != 0);
@@ -418,8 +586,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
418 return ret; 586 return ret;
419} 587}
420 588
421static unsigned long compact_zone_order(struct zone *zone, 589unsigned long compact_zone_order(struct zone *zone,
422 int order, gfp_t gfp_mask) 590 int order, gfp_t gfp_mask,
591 bool sync)
423{ 592{
424 struct compact_control cc = { 593 struct compact_control cc = {
425 .nr_freepages = 0, 594 .nr_freepages = 0,
@@ -427,6 +596,7 @@ static unsigned long compact_zone_order(struct zone *zone,
427 .order = order, 596 .order = order,
428 .migratetype = allocflags_to_migratetype(gfp_mask), 597 .migratetype = allocflags_to_migratetype(gfp_mask),
429 .zone = zone, 598 .zone = zone,
599 .sync = sync,
430 }; 600 };
431 INIT_LIST_HEAD(&cc.freepages); 601 INIT_LIST_HEAD(&cc.freepages);
432 INIT_LIST_HEAD(&cc.migratepages); 602 INIT_LIST_HEAD(&cc.migratepages);
@@ -442,16 +612,17 @@ int sysctl_extfrag_threshold = 500;
442 * @order: The order of the current allocation 612 * @order: The order of the current allocation
443 * @gfp_mask: The GFP mask of the current allocation 613 * @gfp_mask: The GFP mask of the current allocation
444 * @nodemask: The allowed nodes to allocate from 614 * @nodemask: The allowed nodes to allocate from
615 * @sync: Whether migration is synchronous or not
445 * 616 *
446 * This is the main entry point for direct page compaction. 617 * This is the main entry point for direct page compaction.
447 */ 618 */
448unsigned long try_to_compact_pages(struct zonelist *zonelist, 619unsigned long try_to_compact_pages(struct zonelist *zonelist,
449 int order, gfp_t gfp_mask, nodemask_t *nodemask) 620 int order, gfp_t gfp_mask, nodemask_t *nodemask,
621 bool sync)
450{ 622{
451 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 623 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
452 int may_enter_fs = gfp_mask & __GFP_FS; 624 int may_enter_fs = gfp_mask & __GFP_FS;
453 int may_perform_io = gfp_mask & __GFP_IO; 625 int may_perform_io = gfp_mask & __GFP_IO;
454 unsigned long watermark;
455 struct zoneref *z; 626 struct zoneref *z;
456 struct zone *zone; 627 struct zone *zone;
457 int rc = COMPACT_SKIPPED; 628 int rc = COMPACT_SKIPPED;
@@ -461,7 +632,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
461 * made because an assumption is made that the page allocator can satisfy 632 * made because an assumption is made that the page allocator can satisfy
462 * the "cheaper" orders without taking special steps 633 * the "cheaper" orders without taking special steps
463 */ 634 */
464 if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) 635 if (!order || !may_enter_fs || !may_perform_io)
465 return rc; 636 return rc;
466 637
467 count_vm_event(COMPACTSTALL); 638 count_vm_event(COMPACTSTALL);
@@ -469,43 +640,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
469 /* Compact each zone in the list */ 640 /* Compact each zone in the list */
470 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 641 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
471 nodemask) { 642 nodemask) {
472 int fragindex;
473 int status; 643 int status;
474 644
475 /* 645 status = compact_zone_order(zone, order, gfp_mask, sync);
476 * Watermarks for order-0 must be met for compaction. Note
477 * the 2UL. This is because during migration, copies of
478 * pages need to be allocated and for a short time, the
479 * footprint is higher
480 */
481 watermark = low_wmark_pages(zone) + (2UL << order);
482 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
483 continue;
484
485 /*
486 * fragmentation index determines if allocation failures are
487 * due to low memory or external fragmentation
488 *
489 * index of -1 implies allocations might succeed depending
490 * on watermarks
491 * index towards 0 implies failure is due to lack of memory
492 * index towards 1000 implies failure is due to fragmentation
493 *
494 * Only compact if a failure would be due to fragmentation.
495 */
496 fragindex = fragmentation_index(zone, order);
497 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
498 continue;
499
500 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
501 rc = COMPACT_PARTIAL;
502 break;
503 }
504
505 status = compact_zone_order(zone, order, gfp_mask);
506 rc = max(status, rc); 646 rc = max(status, rc);
507 647
508 if (zone_watermark_ok(zone, order, watermark, 0, 0)) 648 /* If a normal allocation would succeed, stop compacting */
649 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
509 break; 650 break;
510 } 651 }
511 652
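With the watermark and fragmentation-index logic moved out of try_to_compact_pages() and into compaction_suitable(), the per-zone decision now reads, following the comments above: fragmentation_index(zone, order) returns -1000 when the request could already be satisfied from free pages, otherwise a value between 0 and 1000 where larger values mean the failure is mostly due to fragmentation. With the default sysctl_extfrag_threshold of 500 (visible in the hunk header above), an index of, say, 300 skips compaction because the zone is simply short of memory, an index of 650 lets compaction run, and an index of -1000 with the watermark check passing returns COMPACT_PARTIAL so the allocator just retries.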
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 3df063706f53..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
311 size_t offset; 311 size_t offset;
312 void *retval; 312 void *retval;
313 313
314 might_sleep_if(mem_flags & __GFP_WAIT);
315
314 spin_lock_irqsave(&pool->lock, flags); 316 spin_lock_irqsave(&pool->lock, flags);
315 restart: 317 restart:
316 list_for_each_entry(page, &pool->page_list, page_list) { 318 list_for_each_entry(page, &pool->page_list, page_list) {
@@ -322,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
322 if (mem_flags & __GFP_WAIT) { 324 if (mem_flags & __GFP_WAIT) {
323 DECLARE_WAITQUEUE(wait, current); 325 DECLARE_WAITQUEUE(wait, current);
324 326
325 __set_current_state(TASK_INTERRUPTIBLE); 327 __set_current_state(TASK_UNINTERRUPTIBLE);
326 __add_wait_queue(&pool->waitq, &wait); 328 __add_wait_queue(&pool->waitq, &wait);
327 spin_unlock_irqrestore(&pool->lock, flags); 329 spin_unlock_irqrestore(&pool->lock, flags);
328 330
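The new might_sleep_if(mem_flags & __GFP_WAIT) annotation documents that dma_pool_alloc() may block whenever the caller permits it, and turns a mistaken sleeping allocation from atomic context into an immediate warning rather than an occasional hang in the wait path below. A hedged usage sketch, not taken from this patch, assuming an already-created struct dma_pool:

	static void *grab_descriptor(struct dma_pool *pool, dma_addr_t *dma, bool atomic_ctx)
	{
		/*
		 * GFP_KERNEL includes __GFP_WAIT, so the annotation warns if
		 * this is reached from atomic context; such callers must pass
		 * GFP_ATOMIC instead.
		 */
		return dma_pool_alloc(pool, atomic_ctx ? GFP_ATOMIC : GFP_KERNEL, dma);
	}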
@@ -353,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
353 355
354static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) 356static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
355{ 357{
356 unsigned long flags;
357 struct dma_page *page; 358 struct dma_page *page;
358 359
359 spin_lock_irqsave(&pool->lock, flags);
360 list_for_each_entry(page, &pool->page_list, page_list) { 360 list_for_each_entry(page, &pool->page_list, page_list) {
361 if (dma < page->dma) 361 if (dma < page->dma)
362 continue; 362 continue;
363 if (dma < (page->dma + pool->allocation)) 363 if (dma < (page->dma + pool->allocation))
364 goto done; 364 return page;
365 } 365 }
366 page = NULL; 366 return NULL;
367 done:
368 spin_unlock_irqrestore(&pool->lock, flags);
369 return page;
370} 367}
371 368
372/** 369/**
@@ -384,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
384 unsigned long flags; 381 unsigned long flags;
385 unsigned int offset; 382 unsigned int offset;
386 383
384 spin_lock_irqsave(&pool->lock, flags);
387 page = pool_find_page(pool, dma); 385 page = pool_find_page(pool, dma);
388 if (!page) { 386 if (!page) {
387 spin_unlock_irqrestore(&pool->lock, flags);
389 if (pool->dev) 388 if (pool->dev)
390 dev_err(pool->dev, 389 dev_err(pool->dev,
391 "dma_pool_free %s, %p/%lx (bad dma)\n", 390 "dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -399,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
399 offset = vaddr - page->vaddr; 398 offset = vaddr - page->vaddr;
400#ifdef DMAPOOL_DEBUG 399#ifdef DMAPOOL_DEBUG
401 if ((dma - page->dma) != offset) { 400 if ((dma - page->dma) != offset) {
401 spin_unlock_irqrestore(&pool->lock, flags);
402 if (pool->dev) 402 if (pool->dev)
403 dev_err(pool->dev, 403 dev_err(pool->dev,
404 "dma_pool_free %s, %p (bad vaddr)/%Lx\n", 404 "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -416,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
416 chain = *(int *)(page->vaddr + chain); 416 chain = *(int *)(page->vaddr + chain);
417 continue; 417 continue;
418 } 418 }
419 spin_unlock_irqrestore(&pool->lock, flags);
419 if (pool->dev) 420 if (pool->dev)
420 dev_err(pool->dev, "dma_pool_free %s, dma %Lx " 421 dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
421 "already free\n", pool->name, 422 "already free\n", pool->name,
@@ -430,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
430 memset(vaddr, POOL_POISON_FREED, pool->size); 431 memset(vaddr, POOL_POISON_FREED, pool->size);
431#endif 432#endif
432 433
433 spin_lock_irqsave(&pool->lock, flags);
434 page->in_use--; 434 page->in_use--;
435 *(int *)vaddr = page->offset; 435 *(int *)vaddr = page->offset;
436 page->offset = offset; 436 page->offset = offset;
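The dmapool hunks also move the pool->lock acquisition out of pool_find_page() and into dma_pool_free(), so the lookup and the subsequent bookkeeping happen under one critical section instead of two. The resulting convention, distilled for illustration (the error paths, which each unlock before printing, are omitted):

	spin_lock_irqsave(&pool->lock, flags);
	page = pool_find_page(pool, dma);	/* lookup under pool->lock ...        */
	if (page)
		page->in_use--;			/* ... and update under the same lock */
	spin_unlock_irqrestore(&pool->lock, flags);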
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4df44e4221..a8251a8d3457 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */ 36#include <linux/mm_inline.h> /* for page_is_file_cache() */
37#include <linux/cleancache.h>
37#include "internal.h" 38#include "internal.h"
38 39
39/* 40/*
@@ -58,16 +59,16 @@
58/* 59/*
59 * Lock ordering: 60 * Lock ordering:
60 * 61 *
61 * ->i_mmap_lock (truncate_pagecache) 62 * ->i_mmap_mutex (truncate_pagecache)
62 * ->private_lock (__free_pte->__set_page_dirty_buffers) 63 * ->private_lock (__free_pte->__set_page_dirty_buffers)
63 * ->swap_lock (exclusive_swap_page, others) 64 * ->swap_lock (exclusive_swap_page, others)
64 * ->mapping->tree_lock 65 * ->mapping->tree_lock
65 * 66 *
66 * ->i_mutex 67 * ->i_mutex
67 * ->i_mmap_lock (truncate->unmap_mapping_range) 68 * ->i_mmap_mutex (truncate->unmap_mapping_range)
68 * 69 *
69 * ->mmap_sem 70 * ->mmap_sem
70 * ->i_mmap_lock 71 * ->i_mmap_mutex
71 * ->page_table_lock or pte_lock (various, mainly in memory.c) 72 * ->page_table_lock or pte_lock (various, mainly in memory.c)
72 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 73 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
73 * 74 *
@@ -80,11 +81,11 @@
80 * ->i_mutex 81 * ->i_mutex
81 * ->i_alloc_sem (various) 82 * ->i_alloc_sem (various)
82 * 83 *
83 * ->inode_lock 84 * inode_wb_list_lock
84 * ->sb_lock (fs/fs-writeback.c) 85 * sb_lock (fs/fs-writeback.c)
85 * ->mapping->tree_lock (__sync_single_inode) 86 * ->mapping->tree_lock (__sync_single_inode)
86 * 87 *
87 * ->i_mmap_lock 88 * ->i_mmap_mutex
88 * ->anon_vma.lock (vma_adjust) 89 * ->anon_vma.lock (vma_adjust)
89 * 90 *
90 * ->anon_vma.lock 91 * ->anon_vma.lock
@@ -98,27 +99,36 @@
98 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 99 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
99 * ->private_lock (page_remove_rmap->set_page_dirty) 100 * ->private_lock (page_remove_rmap->set_page_dirty)
100 * ->tree_lock (page_remove_rmap->set_page_dirty) 101 * ->tree_lock (page_remove_rmap->set_page_dirty)
101 * ->inode_lock (page_remove_rmap->set_page_dirty) 102 * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
102 * ->inode_lock (zap_pte_range->set_page_dirty) 103 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
104 * inode_wb_list_lock (zap_pte_range->set_page_dirty)
105 * ->inode->i_lock (zap_pte_range->set_page_dirty)
103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 106 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 * 107 *
105 * ->task->proc_lock
106 * ->dcache_lock (proc_pid_lookup)
107 *
108 * (code doesn't rely on that order, so you could switch it around) 108 * (code doesn't rely on that order, so you could switch it around)
109 * ->tasklist_lock (memory_failure, collect_procs_ao) 109 * ->tasklist_lock (memory_failure, collect_procs_ao)
110 * ->i_mmap_lock 110 * ->i_mmap_mutex
111 */ 111 */
112 112
113/* 113/*
114 * Remove a page from the page cache and free it. Caller has to make 114 * Delete a page from the page cache and free it. Caller has to make
115 * sure the page is locked and that nobody else uses it - or that usage 115 * sure the page is locked and that nobody else uses it - or that usage
116 * is safe. The caller must hold the mapping's tree_lock. 116 * is safe. The caller must hold the mapping's tree_lock.
117 */ 117 */
118void __remove_from_page_cache(struct page *page) 118void __delete_from_page_cache(struct page *page)
119{ 119{
120 struct address_space *mapping = page->mapping; 120 struct address_space *mapping = page->mapping;
121 121
122 /*
123 * if we're uptodate, flush out into the cleancache, otherwise
124 * invalidate any existing cleancache entries. We can't leave
125 * stale data around in the cleancache once our page is gone
126 */
127 if (PageUptodate(page) && PageMappedToDisk(page))
128 cleancache_put_page(page);
129 else
130 cleancache_flush_page(mapping, page);
131
122 radix_tree_delete(&mapping->page_tree, page->index); 132 radix_tree_delete(&mapping->page_tree, page->index);
123 page->mapping = NULL; 133 page->mapping = NULL;
124 mapping->nrpages--; 134 mapping->nrpages--;
@@ -140,58 +150,42 @@ void __remove_from_page_cache(struct page *page)
140 } 150 }
141} 151}
142 152
143void remove_from_page_cache(struct page *page) 153/**
154 * delete_from_page_cache - delete page from page cache
155 * @page: the page which the kernel is trying to remove from page cache
156 *
157 * This must be called only on pages that have been verified to be in the page
158 * cache and locked. It will never put the page into the free list, the caller
159 * has a reference on the page.
160 */
161void delete_from_page_cache(struct page *page)
144{ 162{
145 struct address_space *mapping = page->mapping; 163 struct address_space *mapping = page->mapping;
164 void (*freepage)(struct page *);
146 165
147 BUG_ON(!PageLocked(page)); 166 BUG_ON(!PageLocked(page));
148 167
168 freepage = mapping->a_ops->freepage;
149 spin_lock_irq(&mapping->tree_lock); 169 spin_lock_irq(&mapping->tree_lock);
150 __remove_from_page_cache(page); 170 __delete_from_page_cache(page);
151 spin_unlock_irq(&mapping->tree_lock); 171 spin_unlock_irq(&mapping->tree_lock);
152 mem_cgroup_uncharge_cache_page(page); 172 mem_cgroup_uncharge_cache_page(page);
173
174 if (freepage)
175 freepage(page);
176 page_cache_release(page);
153} 177}
154EXPORT_SYMBOL(remove_from_page_cache); 178EXPORT_SYMBOL(delete_from_page_cache);
155 179
156static int sync_page(void *word) 180static int sleep_on_page(void *word)
157{ 181{
158 struct address_space *mapping;
159 struct page *page;
160
161 page = container_of((unsigned long *)word, struct page, flags);
162
163 /*
164 * page_mapping() is being called without PG_locked held.
165 * Some knowledge of the state and use of the page is used to
166 * reduce the requirements down to a memory barrier.
167 * The danger here is of a stale page_mapping() return value
168 * indicating a struct address_space different from the one it's
169 * associated with when it is associated with one.
170 * After smp_mb(), it's either the correct page_mapping() for
171 * the page, or an old page_mapping() and the page's own
172 * page_mapping() has gone NULL.
173 * The ->sync_page() address_space operation must tolerate
174 * page_mapping() going NULL. By an amazing coincidence,
175 * this comes about because none of the users of the page
176 * in the ->sync_page() methods make essential use of the
177 * page_mapping(), merely passing the page down to the backing
178 * device's unplug functions when it's non-NULL, which in turn
179 * ignore it for all cases but swap, where only page_private(page) is
180 * of interest. When page_mapping() does go NULL, the entire
181 * call stack gracefully ignores the page and returns.
182 * -- wli
183 */
184 smp_mb();
185 mapping = page_mapping(page);
186 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
187 mapping->a_ops->sync_page(page);
188 io_schedule(); 182 io_schedule();
189 return 0; 183 return 0;
190} 184}
191 185
192static int sync_page_killable(void *word) 186static int sleep_on_page_killable(void *word)
193{ 187{
194 sync_page(word); 188 sleep_on_page(word);
195 return fatal_signal_pending(current) ? -EINTR : 0; 189 return fatal_signal_pending(current) ? -EINTR : 0;
196} 190}
197 191
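delete_from_page_cache(), the renamed remove_from_page_cache(), now also calls the mapping's ->freepage() hook and drops the page cache's own reference. A minimal caller sketch following the kernel-doc above (page locked, verified to still belong to the mapping, caller holding its own reference); illustrative only:

	lock_page(page);
	if (page->mapping == mapping)		/* not truncated meanwhile? */
		delete_from_page_cache(page);	/* drops the cache's reference */
	unlock_page(page);
	page_cache_release(page);		/* drop the caller's own reference */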
@@ -296,7 +290,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
296 continue; 290 continue;
297 291
298 wait_on_page_writeback(page); 292 wait_on_page_writeback(page);
299 if (PageError(page)) 293 if (TestClearPageError(page))
300 ret = -EIO; 294 ret = -EIO;
301 } 295 }
302 pagevec_release(&pvec); 296 pagevec_release(&pvec);
@@ -385,6 +379,76 @@ int filemap_write_and_wait_range(struct address_space *mapping,
385EXPORT_SYMBOL(filemap_write_and_wait_range); 379EXPORT_SYMBOL(filemap_write_and_wait_range);
386 380
387/** 381/**
382 * replace_page_cache_page - replace a pagecache page with a new one
383 * @old: page to be replaced
384 * @new: page to replace with
385 * @gfp_mask: allocation mode
386 *
387 * This function replaces a page in the pagecache with a new one. On
388 * success it acquires the pagecache reference for the new page and
389 * drops it for the old page. Both the old and new pages must be
390 * locked. This function does not add the new page to the LRU, the
391 * caller must do that.
392 *
393 * The remove + add is atomic. The only way this function can fail is
394 * memory allocation failure.
395 */
396int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
397{
398 int error;
399 struct mem_cgroup *memcg = NULL;
400
401 VM_BUG_ON(!PageLocked(old));
402 VM_BUG_ON(!PageLocked(new));
403 VM_BUG_ON(new->mapping);
404
405 /*
406 * This is not page migration, but prepare_migration and
407 * end_migration does enough work for charge replacement.
408 *
409 * In the longer term we probably want a specialized function
410 * for moving the charge from old to new in a more efficient
411 * manner.
412 */
413 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
414 if (error)
415 return error;
416
417 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
418 if (!error) {
419 struct address_space *mapping = old->mapping;
420 void (*freepage)(struct page *);
421
422 pgoff_t offset = old->index;
423 freepage = mapping->a_ops->freepage;
424
425 page_cache_get(new);
426 new->mapping = mapping;
427 new->index = offset;
428
429 spin_lock_irq(&mapping->tree_lock);
430 __delete_from_page_cache(old);
431 error = radix_tree_insert(&mapping->page_tree, offset, new);
432 BUG_ON(error);
433 mapping->nrpages++;
434 __inc_zone_page_state(new, NR_FILE_PAGES);
435 if (PageSwapBacked(new))
436 __inc_zone_page_state(new, NR_SHMEM);
437 spin_unlock_irq(&mapping->tree_lock);
438 radix_tree_preload_end();
439 if (freepage)
440 freepage(old);
441 page_cache_release(old);
442 mem_cgroup_end_migration(memcg, old, new, true);
443 } else {
444 mem_cgroup_end_migration(memcg, old, new, false);
445 }
446
447 return error;
448}
449EXPORT_SYMBOL_GPL(replace_page_cache_page);
450
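replace_page_cache_page() swaps one pagecache page for another atomically but deliberately leaves the LRU to the caller; a page-stealing filesystem path (fuse's splice support is the kind of user this was written for) would use it roughly as follows, sketched here for illustration:

	/* Both pages are locked; newpage is not yet visible anywhere. */
	err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
	if (!err) {
		lru_cache_add_file(newpage);	/* LRU insertion is the caller's job */
		unlock_page(oldpage);		/* oldpage has left the mapping */
	}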
451/**
388 * add_to_page_cache_locked - add a locked page to the pagecache 452 * add_to_page_cache_locked - add a locked page to the pagecache
389 * @page: page to add 453 * @page: page to add
390 * @mapping: the page's address_space 454 * @mapping: the page's address_space
@@ -477,12 +541,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
477EXPORT_SYMBOL(__page_cache_alloc); 541EXPORT_SYMBOL(__page_cache_alloc);
478#endif 542#endif
479 543
480static int __sleep_on_page_lock(void *word)
481{
482 io_schedule();
483 return 0;
484}
485
486/* 544/*
487 * In order to wait for pages to become available there must be 545 * In order to wait for pages to become available there must be
488 * waitqueues associated with pages. By using a hash table of 546 * waitqueues associated with pages. By using a hash table of
@@ -510,11 +568,22 @@ void wait_on_page_bit(struct page *page, int bit_nr)
510 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 568 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
511 569
512 if (test_bit(bit_nr, &page->flags)) 570 if (test_bit(bit_nr, &page->flags))
513 __wait_on_bit(page_waitqueue(page), &wait, sync_page, 571 __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
514 TASK_UNINTERRUPTIBLE); 572 TASK_UNINTERRUPTIBLE);
515} 573}
516EXPORT_SYMBOL(wait_on_page_bit); 574EXPORT_SYMBOL(wait_on_page_bit);
517 575
576int wait_on_page_bit_killable(struct page *page, int bit_nr)
577{
578 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
579
580 if (!test_bit(bit_nr, &page->flags))
581 return 0;
582
583 return __wait_on_bit(page_waitqueue(page), &wait,
584 sleep_on_page_killable, TASK_KILLABLE);
585}
586
518/** 587/**
519 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 588 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
520 * @page: Page defining the wait queue of interest 589 * @page: Page defining the wait queue of interest
@@ -574,17 +643,12 @@ EXPORT_SYMBOL(end_page_writeback);
574/** 643/**
575 * __lock_page - get a lock on the page, assuming we need to sleep to get it 644 * __lock_page - get a lock on the page, assuming we need to sleep to get it
576 * @page: the page to lock 645 * @page: the page to lock
577 *
578 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
579 * random driver's requestfn sets TASK_RUNNING, we could busywait. However
580 * chances are that on the second loop, the block layer's plug list is empty,
581 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
582 */ 646 */
583void __lock_page(struct page *page) 647void __lock_page(struct page *page)
584{ 648{
585 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 649 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
586 650
587 __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, 651 __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
588 TASK_UNINTERRUPTIBLE); 652 TASK_UNINTERRUPTIBLE);
589} 653}
590EXPORT_SYMBOL(__lock_page); 654EXPORT_SYMBOL(__lock_page);
@@ -594,22 +658,40 @@ int __lock_page_killable(struct page *page)
594 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 658 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
595 659
596 return __wait_on_bit_lock(page_waitqueue(page), &wait, 660 return __wait_on_bit_lock(page_waitqueue(page), &wait,
597 sync_page_killable, TASK_KILLABLE); 661 sleep_on_page_killable, TASK_KILLABLE);
598} 662}
599EXPORT_SYMBOL_GPL(__lock_page_killable); 663EXPORT_SYMBOL_GPL(__lock_page_killable);
600 664
601/** 665int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
602 * __lock_page_nosync - get a lock on the page, without calling sync_page() 666 unsigned int flags)
603 * @page: the page to lock
604 *
605 * Variant of lock_page that does not require the caller to hold a reference
606 * on the page's mapping.
607 */
608void __lock_page_nosync(struct page *page)
609{ 667{
610 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 668 if (flags & FAULT_FLAG_ALLOW_RETRY) {
611 __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, 669 /*
612 TASK_UNINTERRUPTIBLE); 670 * CAUTION! In this case, mmap_sem is not released
 671 * even though 0 is returned.
672 */
673 if (flags & FAULT_FLAG_RETRY_NOWAIT)
674 return 0;
675
676 up_read(&mm->mmap_sem);
677 if (flags & FAULT_FLAG_KILLABLE)
678 wait_on_page_locked_killable(page);
679 else
680 wait_on_page_locked(page);
681 return 0;
682 } else {
683 if (flags & FAULT_FLAG_KILLABLE) {
684 int ret;
685
686 ret = __lock_page_killable(page);
687 if (ret) {
688 up_read(&mm->mmap_sem);
689 return 0;
690 }
691 } else
692 __lock_page(page);
693 return 1;
694 }
613} 695}
614 696
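__lock_page_or_retry() lets a page-fault handler wait for a page lock without holding mmap_sem across the sleep; the filemap_fault() conversion further down in this file is the user. Its contract, restated as a sketch:

	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
		/* 0: the lock was not taken; mmap_sem has been dropped
		 * (unless FAULT_FLAG_RETRY_NOWAIT was set). */
		page_cache_release(page);
		return ret | VM_FAULT_RETRY;	/* the fault will be retried later */
	}
	/* 1: page is locked and mmap_sem is still held. */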
615/** 697/**
@@ -631,7 +713,9 @@ repeat:
631 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); 713 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
632 if (pagep) { 714 if (pagep) {
633 page = radix_tree_deref_slot(pagep); 715 page = radix_tree_deref_slot(pagep);
634 if (unlikely(!page || page == RADIX_TREE_RETRY)) 716 if (unlikely(!page))
717 goto out;
718 if (radix_tree_deref_retry(page))
635 goto repeat; 719 goto repeat;
636 720
637 if (!page_cache_get_speculative(page)) 721 if (!page_cache_get_speculative(page))
@@ -647,6 +731,7 @@ repeat:
647 goto repeat; 731 goto repeat;
648 } 732 }
649 } 733 }
734out:
650 rcu_read_unlock(); 735 rcu_read_unlock();
651 736
652 return page; 737 return page;
@@ -764,12 +849,15 @@ repeat:
764 page = radix_tree_deref_slot((void **)pages[i]); 849 page = radix_tree_deref_slot((void **)pages[i]);
765 if (unlikely(!page)) 850 if (unlikely(!page))
766 continue; 851 continue;
852
767 /* 853 /*
768 * this can only trigger if nr_found == 1, making livelock 854 * This can only trigger when the entry at index 0 moves out
769 * a non issue. 855 * of or back to the root: none yet gotten, safe to restart.
770 */ 856 */
771 if (unlikely(page == RADIX_TREE_RETRY)) 857 if (radix_tree_deref_retry(page)) {
858 WARN_ON(start | i);
772 goto restart; 859 goto restart;
860 }
773 861
774 if (!page_cache_get_speculative(page)) 862 if (!page_cache_get_speculative(page))
775 goto repeat; 863 goto repeat;
@@ -783,6 +871,13 @@ repeat:
783 pages[ret] = page; 871 pages[ret] = page;
784 ret++; 872 ret++;
785 } 873 }
874
875 /*
876 * If all entries were removed before we could secure them,
877 * try again, because callers stop trying once 0 is returned.
878 */
879 if (unlikely(!ret && nr_found))
880 goto restart;
786 rcu_read_unlock(); 881 rcu_read_unlock();
787 return ret; 882 return ret;
788} 883}
@@ -817,16 +912,14 @@ repeat:
817 page = radix_tree_deref_slot((void **)pages[i]); 912 page = radix_tree_deref_slot((void **)pages[i]);
818 if (unlikely(!page)) 913 if (unlikely(!page))
819 continue; 914 continue;
915
820 /* 916 /*
821 * this can only trigger if nr_found == 1, making livelock 917 * This can only trigger when the entry at index 0 moves out
822 * a non issue. 918 * of or back to the root: none yet gotten, safe to restart.
823 */ 919 */
824 if (unlikely(page == RADIX_TREE_RETRY)) 920 if (radix_tree_deref_retry(page))
825 goto restart; 921 goto restart;
826 922
827 if (page->mapping == NULL || page->index != index)
828 break;
829
830 if (!page_cache_get_speculative(page)) 923 if (!page_cache_get_speculative(page))
831 goto repeat; 924 goto repeat;
832 925
@@ -836,6 +929,16 @@ repeat:
836 goto repeat; 929 goto repeat;
837 } 930 }
838 931
932 /*
933 * must check mapping and index after taking the ref.
934 * otherwise we can get both false positives and false
935 * negatives, which is just confusing to the caller.
936 */
937 if (page->mapping == NULL || page->index != index) {
938 page_cache_release(page);
939 break;
940 }
941
839 pages[ret] = page; 942 pages[ret] = page;
840 ret++; 943 ret++;
841 index++; 944 index++;
@@ -874,11 +977,12 @@ repeat:
874 page = radix_tree_deref_slot((void **)pages[i]); 977 page = radix_tree_deref_slot((void **)pages[i]);
875 if (unlikely(!page)) 978 if (unlikely(!page))
876 continue; 979 continue;
980
877 /* 981 /*
878 * this can only trigger if nr_found == 1, making livelock 982 * This can only trigger when the entry at index 0 moves out
879 * a non issue. 983 * of or back to the root: none yet gotten, safe to restart.
880 */ 984 */
881 if (unlikely(page == RADIX_TREE_RETRY)) 985 if (radix_tree_deref_retry(page))
882 goto restart; 986 goto restart;
883 987
884 if (!page_cache_get_speculative(page)) 988 if (!page_cache_get_speculative(page))
@@ -893,6 +997,13 @@ repeat:
893 pages[ret] = page; 997 pages[ret] = page;
894 ret++; 998 ret++;
895 } 999 }
1000
1001 /*
1002 * If all entries were removed before we could secure them,
1003 * try again, because callers stop trying once 0 is returned.
1004 */
1005 if (unlikely(!ret && nr_found))
1006 goto restart;
896 rcu_read_unlock(); 1007 rcu_read_unlock();
897 1008
898 if (ret) 1009 if (ret)
@@ -1016,6 +1127,9 @@ find_page:
1016 goto page_not_up_to_date; 1127 goto page_not_up_to_date;
1017 if (!trylock_page(page)) 1128 if (!trylock_page(page))
1018 goto page_not_up_to_date; 1129 goto page_not_up_to_date;
1130 /* Did it get truncated before we got the lock? */
1131 if (!page->mapping)
1132 goto page_not_up_to_date_locked;
1019 if (!mapping->a_ops->is_partially_uptodate(page, 1133 if (!mapping->a_ops->is_partially_uptodate(page,
1020 desc, offset)) 1134 desc, offset))
1021 goto page_not_up_to_date_locked; 1135 goto page_not_up_to_date_locked;
@@ -1279,12 +1393,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1279 unsigned long seg = 0; 1393 unsigned long seg = 0;
1280 size_t count; 1394 size_t count;
1281 loff_t *ppos = &iocb->ki_pos; 1395 loff_t *ppos = &iocb->ki_pos;
1396 struct blk_plug plug;
1282 1397
1283 count = 0; 1398 count = 0;
1284 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1399 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1285 if (retval) 1400 if (retval)
1286 return retval; 1401 return retval;
1287 1402
1403 blk_start_plug(&plug);
1404
1288 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1405 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1289 if (filp->f_flags & O_DIRECT) { 1406 if (filp->f_flags & O_DIRECT) {
1290 loff_t size; 1407 loff_t size;
@@ -1357,6 +1474,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1357 break; 1474 break;
1358 } 1475 }
1359out: 1476out:
1477 blk_finish_plug(&plug);
1360 return retval; 1478 return retval;
1361} 1479}
1362EXPORT_SYMBOL(generic_file_aio_read); 1480EXPORT_SYMBOL(generic_file_aio_read);
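With ->sync_page() gone (see the sleep_on_page() changes earlier in this file), readers batch their I/O with the on-stack block plug added above rather than relying on per-queue unplugging. The general pattern, for illustration:

	struct blk_plug plug;

	blk_start_plug(&plug);		/* queue submitted bios on this task */
	/* submit a batch of page reads, e.g. readahead or O_DIRECT segments */
	blk_finish_plug(&plug);		/* flush the whole batch to the device */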
@@ -1449,15 +1567,17 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1449 /* If we don't want any read-ahead, don't bother */ 1567 /* If we don't want any read-ahead, don't bother */
1450 if (VM_RandomReadHint(vma)) 1568 if (VM_RandomReadHint(vma))
1451 return; 1569 return;
1570 if (!ra->ra_pages)
1571 return;
1452 1572
1453 if (VM_SequentialReadHint(vma) || 1573 if (VM_SequentialReadHint(vma)) {
1454 offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
1455 page_cache_sync_readahead(mapping, ra, file, offset, 1574 page_cache_sync_readahead(mapping, ra, file, offset,
1456 ra->ra_pages); 1575 ra->ra_pages);
1457 return; 1576 return;
1458 } 1577 }
1459 1578
1460 if (ra->mmap_miss < INT_MAX) 1579 /* Avoid banging the cache line if not needed */
1580 if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
1461 ra->mmap_miss++; 1581 ra->mmap_miss++;
1462 1582
1463 /* 1583 /*
@@ -1471,12 +1591,10 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1471 * mmap read-around 1591 * mmap read-around
1472 */ 1592 */
1473 ra_pages = max_sane_readahead(ra->ra_pages); 1593 ra_pages = max_sane_readahead(ra->ra_pages);
1474 if (ra_pages) { 1594 ra->start = max_t(long, 0, offset - ra_pages / 2);
1475 ra->start = max_t(long, 0, offset - ra_pages/2); 1595 ra->size = ra_pages;
1476 ra->size = ra_pages; 1596 ra->async_size = ra_pages / 4;
1477 ra->async_size = 0; 1597 ra_submit(ra, mapping, file);
1478 ra_submit(ra, mapping, file);
1479 }
1480} 1598}
1481 1599
1482/* 1600/*
@@ -1539,25 +1657,31 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1539 * waiting for the lock. 1657 * waiting for the lock.
1540 */ 1658 */
1541 do_async_mmap_readahead(vma, ra, file, page, offset); 1659 do_async_mmap_readahead(vma, ra, file, page, offset);
1542 lock_page(page);
1543
1544 /* Did it get truncated? */
1545 if (unlikely(page->mapping != mapping)) {
1546 unlock_page(page);
1547 put_page(page);
1548 goto no_cached_page;
1549 }
1550 } else { 1660 } else {
1551 /* No page in the page cache at all */ 1661 /* No page in the page cache at all */
1552 do_sync_mmap_readahead(vma, ra, file, offset); 1662 do_sync_mmap_readahead(vma, ra, file, offset);
1553 count_vm_event(PGMAJFAULT); 1663 count_vm_event(PGMAJFAULT);
1664 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1554 ret = VM_FAULT_MAJOR; 1665 ret = VM_FAULT_MAJOR;
1555retry_find: 1666retry_find:
1556 page = find_lock_page(mapping, offset); 1667 page = find_get_page(mapping, offset);
1557 if (!page) 1668 if (!page)
1558 goto no_cached_page; 1669 goto no_cached_page;
1559 } 1670 }
1560 1671
1672 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
1673 page_cache_release(page);
1674 return ret | VM_FAULT_RETRY;
1675 }
1676
1677 /* Did it get truncated? */
1678 if (unlikely(page->mapping != mapping)) {
1679 unlock_page(page);
1680 put_page(page);
1681 goto retry_find;
1682 }
1683 VM_BUG_ON(page->index != offset);
1684
1561 /* 1685 /*
1562 * We have a locked page in the page cache, now we need to check 1686 * We have a locked page in the page cache, now we need to check
1563 * that it's up-to-date. If not, it is going to be due to an error. 1687 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1576,7 +1700,6 @@ retry_find:
1576 return VM_FAULT_SIGBUS; 1700 return VM_FAULT_SIGBUS;
1577 } 1701 }
1578 1702
1579 ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1580 vmf->page = page; 1703 vmf->page = page;
1581 return ret | VM_FAULT_LOCKED; 1704 return ret | VM_FAULT_LOCKED;
1582 1705
@@ -1859,16 +1982,26 @@ static int __remove_suid(struct dentry *dentry, int kill)
1859int file_remove_suid(struct file *file) 1982int file_remove_suid(struct file *file)
1860{ 1983{
1861 struct dentry *dentry = file->f_path.dentry; 1984 struct dentry *dentry = file->f_path.dentry;
1862 int killsuid = should_remove_suid(dentry); 1985 struct inode *inode = dentry->d_inode;
1863 int killpriv = security_inode_need_killpriv(dentry); 1986 int killsuid;
1987 int killpriv;
1864 int error = 0; 1988 int error = 0;
1865 1989
1990 /* Fast path for nothing security related */
1991 if (IS_NOSEC(inode))
1992 return 0;
1993
1994 killsuid = should_remove_suid(dentry);
1995 killpriv = security_inode_need_killpriv(dentry);
1996
1866 if (killpriv < 0) 1997 if (killpriv < 0)
1867 return killpriv; 1998 return killpriv;
1868 if (killpriv) 1999 if (killpriv)
1869 error = security_inode_killpriv(dentry); 2000 error = security_inode_killpriv(dentry);
1870 if (!error && killsuid) 2001 if (!error && killsuid)
1871 error = __remove_suid(dentry, killsuid); 2002 error = __remove_suid(dentry, killsuid);
2003 if (!error && (inode->i_sb->s_flags & MS_NOSEC))
2004 inode->i_flags |= S_NOSEC;
1872 2005
1873 return error; 2006 return error;
1874} 2007}
@@ -2177,12 +2310,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2177 } 2310 }
2178 2311
2179 if (written > 0) { 2312 if (written > 0) {
2180 loff_t end = pos + written; 2313 pos += written;
2181 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2314 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2182 i_size_write(inode, end); 2315 i_size_write(inode, pos);
2183 mark_inode_dirty(inode); 2316 mark_inode_dirty(inode);
2184 } 2317 }
2185 *ppos = end; 2318 *ppos = pos;
2186 } 2319 }
2187out: 2320out:
2188 return written; 2321 return written;
@@ -2203,8 +2336,8 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
2203 gfp_notmask = __GFP_FS; 2336 gfp_notmask = __GFP_FS;
2204repeat: 2337repeat:
2205 page = find_lock_page(mapping, index); 2338 page = find_lock_page(mapping, index);
2206 if (likely(page)) 2339 if (page)
2207 return page; 2340 goto found;
2208 2341
2209 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); 2342 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
2210 if (!page) 2343 if (!page)
@@ -2217,6 +2350,8 @@ repeat:
2217 goto repeat; 2350 goto repeat;
2218 return NULL; 2351 return NULL;
2219 } 2352 }
2353found:
2354 wait_on_page_writeback(page);
2220 return page; 2355 return page;
2221} 2356}
2222EXPORT_SYMBOL(grab_cache_page_write_begin); 2357EXPORT_SYMBOL(grab_cache_page_write_begin);
@@ -2463,11 +2598,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2463{ 2598{
2464 struct file *file = iocb->ki_filp; 2599 struct file *file = iocb->ki_filp;
2465 struct inode *inode = file->f_mapping->host; 2600 struct inode *inode = file->f_mapping->host;
2601 struct blk_plug plug;
2466 ssize_t ret; 2602 ssize_t ret;
2467 2603
2468 BUG_ON(iocb->ki_pos != pos); 2604 BUG_ON(iocb->ki_pos != pos);
2469 2605
2470 mutex_lock(&inode->i_mutex); 2606 mutex_lock(&inode->i_mutex);
2607 blk_start_plug(&plug);
2471 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 2608 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2472 mutex_unlock(&inode->i_mutex); 2609 mutex_unlock(&inode->i_mutex);
2473 2610
@@ -2478,6 +2615,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2478 if (err < 0 && ret > 0) 2615 if (err < 0 && ret > 0)
2479 ret = err; 2616 ret = err;
2480 } 2617 }
2618 blk_finish_plug(&plug);
2481 return ret; 2619 return ret;
2482} 2620}
2483EXPORT_SYMBOL(generic_file_aio_write); 2621EXPORT_SYMBOL(generic_file_aio_write);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 83364df74a33..93356cd12828 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -183,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
183 return; 183 return;
184 184
185retry: 185retry:
186 spin_lock(&mapping->i_mmap_lock); 186 mutex_lock(&mapping->i_mmap_mutex);
187 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 187 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
188 mm = vma->vm_mm; 188 mm = vma->vm_mm;
189 address = vma->vm_start + 189 address = vma->vm_start +
@@ -201,7 +201,7 @@ retry:
201 page_cache_release(page); 201 page_cache_release(page);
202 } 202 }
203 } 203 }
204 spin_unlock(&mapping->i_mmap_lock); 204 mutex_unlock(&mapping->i_mmap_mutex);
205 205
206 if (locked) { 206 if (locked) {
207 mutex_unlock(&xip_sparse_mutex); 207 mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/fremap.c b/mm/fremap.c
index ec520c7b28df..b8e0e2d468af 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -211,20 +211,20 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
211 } 211 }
212 goto out; 212 goto out;
213 } 213 }
214 spin_lock(&mapping->i_mmap_lock); 214 mutex_lock(&mapping->i_mmap_mutex);
215 flush_dcache_mmap_lock(mapping); 215 flush_dcache_mmap_lock(mapping);
216 vma->vm_flags |= VM_NONLINEAR; 216 vma->vm_flags |= VM_NONLINEAR;
217 vma_prio_tree_remove(vma, &mapping->i_mmap); 217 vma_prio_tree_remove(vma, &mapping->i_mmap);
218 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 218 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
219 flush_dcache_mmap_unlock(mapping); 219 flush_dcache_mmap_unlock(mapping);
220 spin_unlock(&mapping->i_mmap_lock); 220 mutex_unlock(&mapping->i_mmap_mutex);
221 } 221 }
222 222
223 if (vma->vm_flags & VM_LOCKED) { 223 if (vma->vm_flags & VM_LOCKED) {
224 /* 224 /*
225 * drop PG_Mlocked flag for over-mapped range 225 * drop PG_Mlocked flag for over-mapped range
226 */ 226 */
227 unsigned int saved_flags = vma->vm_flags; 227 vm_flags_t saved_flags = vma->vm_flags;
228 munlock_vma_pages_range(vma, start, start + size); 228 munlock_vma_pages_range(vma, start, start + size);
229 vma->vm_flags = saved_flags; 229 vma->vm_flags = saved_flags;
230 } 230 }
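filemap_xip.c and fremap.c above show the same mechanical conversion applied throughout mm/ in this merge: the address_space's i_mmap_lock spinlock becomes the sleeping i_mmap_mutex, so code walking the i_mmap prio tree may now block. The shape of the conversion, for illustration:

	mutex_lock(&mapping->i_mmap_mutex);	/* was: spin_lock(&mapping->i_mmap_lock) */
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		/* the body may now sleep, which the spinlock forbade */
	}
	mutex_unlock(&mapping->i_mmap_mutex);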
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a0aa1be4993..693394daa2ed 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,6 +29,11 @@
29#include <linux/kgdb.h> 29#include <linux/kgdb.h>
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31 31
32
33#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
34DEFINE_PER_CPU(int, __kmap_atomic_idx);
35#endif
36
32/* 37/*
33 * Virtual_count is not a pure "count". 38 * Virtual_count is not a pure "count".
34 * 0 means that it is not mapped, and has not been mapped 39 * 0 means that it is not mapped, and has not been mapped
@@ -42,6 +47,9 @@
42unsigned long totalhigh_pages __read_mostly; 47unsigned long totalhigh_pages __read_mostly;
43EXPORT_SYMBOL(totalhigh_pages); 48EXPORT_SYMBOL(totalhigh_pages);
44 49
50
51EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
52
45unsigned int nr_free_highpages (void) 53unsigned int nr_free_highpages (void)
46{ 54{
47 pg_data_t *pgdat; 55 pg_data_t *pgdat;
@@ -422,61 +430,3 @@ void __init page_address_init(void)
422} 430}
423 431
424#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 432#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
425
426#ifdef CONFIG_DEBUG_HIGHMEM
427
428void debug_kmap_atomic(enum km_type type)
429{
430 static int warn_count = 10;
431
432 if (unlikely(warn_count < 0))
433 return;
434
435 if (unlikely(in_interrupt())) {
436 if (in_nmi()) {
437 if (type != KM_NMI && type != KM_NMI_PTE) {
438 WARN_ON(1);
439 warn_count--;
440 }
441 } else if (in_irq()) {
442 if (type != KM_IRQ0 && type != KM_IRQ1 &&
443 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
444 type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
445 WARN_ON(1);
446 warn_count--;
447 }
448 } else if (!irqs_disabled()) { /* softirq */
449 if (type != KM_IRQ0 && type != KM_IRQ1 &&
450 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
451 type != KM_SKB_SUNRPC_DATA &&
452 type != KM_SKB_DATA_SOFTIRQ &&
453 type != KM_BOUNCE_READ) {
454 WARN_ON(1);
455 warn_count--;
456 }
457 }
458 }
459
460 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
461 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
462 type == KM_IRQ_PTE || type == KM_NMI ||
463 type == KM_NMI_PTE ) {
464 if (!irqs_disabled()) {
465 WARN_ON(1);
466 warn_count--;
467 }
468 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
469 if (irq_count() == 0 && !irqs_disabled()) {
470 WARN_ON(1);
471 warn_count--;
472 }
473 }
474#ifdef CONFIG_KGDB_KDB
475 if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) {
476 WARN_ON(1);
477 warn_count--;
478 }
479#endif /* CONFIG_KGDB_KDB */
480}
481
482#endif
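The highmem.c hunks export the per-CPU __kmap_atomic_idx that backs the stack-based atomic kmap implementation and delete debug_kmap_atomic(), whose per-KM_* slot checking no longer applies. Under the stack discipline, nested atomic mappings simply have to be released in reverse order; a hedged sketch (in this era the KM_* argument is still accepted but no longer selects a slot):

	char *src = kmap_atomic(src_page, KM_USER0);
	char *dst = kmap_atomic(dst_page, KM_USER1);

	memcpy(dst, src, PAGE_SIZE);

	kunmap_atomic(dst, KM_USER1);	/* last mapped, first unmapped */
	kunmap_atomic(src, KM_USER0);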
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..81532f297fd2
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2391 @@
1/*
2 * Copyright (C) 2009 Red Hat, Inc.
3 *
4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory.
6 */
7
8#include <linux/mm.h>
9#include <linux/sched.h>
10#include <linux/highmem.h>
11#include <linux/hugetlb.h>
12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h>
14#include <linux/swap.h>
15#include <linux/mm_inline.h>
16#include <linux/kthread.h>
17#include <linux/khugepaged.h>
18#include <linux/freezer.h>
19#include <linux/mman.h>
20#include <asm/tlb.h>
21#include <asm/pgalloc.h>
22#include "internal.h"
23
24/*
25 * By default transparent hugepage support is enabled for all mappings
26 * and khugepaged scans all mappings. Defrag is only invoked by
27 * khugepaged hugepage allocations and by page faults inside
28 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
29 * allocations.
30 */
31unsigned long transparent_hugepage_flags __read_mostly =
32#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
33 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
34#endif
35#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
36 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
37#endif
38 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
39 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
40
41/* default scan 8*512 pte (or vmas) every 30 second */
42static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
43static unsigned int khugepaged_pages_collapsed;
44static unsigned int khugepaged_full_scans;
45static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
46/* during fragmentation poll the hugepage allocator once every minute */
47static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
48static struct task_struct *khugepaged_thread __read_mostly;
49static DEFINE_MUTEX(khugepaged_mutex);
50static DEFINE_SPINLOCK(khugepaged_mm_lock);
51static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
52/*
53 * default collapse hugepages if there is at least one pte mapped like
54 * it would have happened if the vma was large enough during page
55 * fault.
56 */
57static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
58
59static int khugepaged(void *none);
60static int mm_slots_hash_init(void);
61static int khugepaged_slab_init(void);
62static void khugepaged_slab_free(void);
63
64#define MM_SLOTS_HASH_HEADS 1024
65static struct hlist_head *mm_slots_hash __read_mostly;
66static struct kmem_cache *mm_slot_cache __read_mostly;
67
68/**
69 * struct mm_slot - hash lookup from mm to mm_slot
70 * @hash: hash collision list
71 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
72 * @mm: the mm that this information is valid for
73 */
74struct mm_slot {
75 struct hlist_node hash;
76 struct list_head mm_node;
77 struct mm_struct *mm;
78};
79
80/**
81 * struct khugepaged_scan - cursor for scanning
82 * @mm_head: the head of the mm list to scan
83 * @mm_slot: the current mm_slot we are scanning
84 * @address: the next address inside that to be scanned
85 *
86 * There is only the one khugepaged_scan instance of this cursor structure.
87 */
88struct khugepaged_scan {
89 struct list_head mm_head;
90 struct mm_slot *mm_slot;
91 unsigned long address;
92} khugepaged_scan = {
93 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
94};
95
96
97static int set_recommended_min_free_kbytes(void)
98{
99 struct zone *zone;
100 int nr_zones = 0;
101 unsigned long recommended_min;
102 extern int min_free_kbytes;
103
104 if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
105 &transparent_hugepage_flags) &&
106 !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
107 &transparent_hugepage_flags))
108 return 0;
109
110 for_each_populated_zone(zone)
111 nr_zones++;
112
113 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
114 recommended_min = pageblock_nr_pages * nr_zones * 2;
115
116 /*
117 * Make sure that on average at least two pageblocks are almost free
118 * of another type, one for a migratetype to fall back to and a
 119 * second to avoid subsequent fallbacks of other types. There are 3
120 * MIGRATE_TYPES we care about.
121 */
122 recommended_min += pageblock_nr_pages * nr_zones *
123 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
124
125 /* don't ever allow to reserve more than 5% of the lowmem */
126 recommended_min = min(recommended_min,
127 (unsigned long) nr_free_buffer_pages() / 20);
128 recommended_min <<= (PAGE_SHIFT-10);
129
130 if (recommended_min > min_free_kbytes)
131 min_free_kbytes = recommended_min;
132 setup_per_zone_wmarks();
133 return 0;
134}
135late_initcall(set_recommended_min_free_kbytes);
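As a rough worked example of set_recommended_min_free_kbytes(), assuming 4 KiB pages, 2 MiB pageblocks (pageblock_nr_pages = 512), three populated zones and MIGRATE_PCPTYPES = 3: the MIGRATE_RESERVE term contributes 512 * 3 * 2 = 3072 pages and the anti-fallback term 512 * 3 * 3 * 3 = 13824 pages, for 16896 pages in total; that total is capped at 5% of lowmem, converted to kilobytes by the << (PAGE_SHIFT-10) shift (67584 kB, about 66 MiB here), and raises min_free_kbytes only if it exceeds the current value.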
136
137static int start_khugepaged(void)
138{
139 int err = 0;
140 if (khugepaged_enabled()) {
141 int wakeup;
142 if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
143 err = -ENOMEM;
144 goto out;
145 }
146 mutex_lock(&khugepaged_mutex);
147 if (!khugepaged_thread)
148 khugepaged_thread = kthread_run(khugepaged, NULL,
149 "khugepaged");
150 if (unlikely(IS_ERR(khugepaged_thread))) {
151 printk(KERN_ERR
152 "khugepaged: kthread_run(khugepaged) failed\n");
153 err = PTR_ERR(khugepaged_thread);
154 khugepaged_thread = NULL;
155 }
156 wakeup = !list_empty(&khugepaged_scan.mm_head);
157 mutex_unlock(&khugepaged_mutex);
158 if (wakeup)
159 wake_up_interruptible(&khugepaged_wait);
160
161 set_recommended_min_free_kbytes();
162 } else
163 /* wakeup to exit */
164 wake_up_interruptible(&khugepaged_wait);
165out:
166 return err;
167}
168
169#ifdef CONFIG_SYSFS
170
171static ssize_t double_flag_show(struct kobject *kobj,
172 struct kobj_attribute *attr, char *buf,
173 enum transparent_hugepage_flag enabled,
174 enum transparent_hugepage_flag req_madv)
175{
176 if (test_bit(enabled, &transparent_hugepage_flags)) {
177 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
178 return sprintf(buf, "[always] madvise never\n");
179 } else if (test_bit(req_madv, &transparent_hugepage_flags))
180 return sprintf(buf, "always [madvise] never\n");
181 else
182 return sprintf(buf, "always madvise [never]\n");
183}
184static ssize_t double_flag_store(struct kobject *kobj,
185 struct kobj_attribute *attr,
186 const char *buf, size_t count,
187 enum transparent_hugepage_flag enabled,
188 enum transparent_hugepage_flag req_madv)
189{
190 if (!memcmp("always", buf,
191 min(sizeof("always")-1, count))) {
192 set_bit(enabled, &transparent_hugepage_flags);
193 clear_bit(req_madv, &transparent_hugepage_flags);
194 } else if (!memcmp("madvise", buf,
195 min(sizeof("madvise")-1, count))) {
196 clear_bit(enabled, &transparent_hugepage_flags);
197 set_bit(req_madv, &transparent_hugepage_flags);
198 } else if (!memcmp("never", buf,
199 min(sizeof("never")-1, count))) {
200 clear_bit(enabled, &transparent_hugepage_flags);
201 clear_bit(req_madv, &transparent_hugepage_flags);
202 } else
203 return -EINVAL;
204
205 return count;
206}
207
208static ssize_t enabled_show(struct kobject *kobj,
209 struct kobj_attribute *attr, char *buf)
210{
211 return double_flag_show(kobj, attr, buf,
212 TRANSPARENT_HUGEPAGE_FLAG,
213 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
214}
215static ssize_t enabled_store(struct kobject *kobj,
216 struct kobj_attribute *attr,
217 const char *buf, size_t count)
218{
219 ssize_t ret;
220
221 ret = double_flag_store(kobj, attr, buf, count,
222 TRANSPARENT_HUGEPAGE_FLAG,
223 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
224
225 if (ret > 0) {
226 int err = start_khugepaged();
227 if (err)
228 ret = err;
229 }
230
231 if (ret > 0 &&
232 (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
233 &transparent_hugepage_flags) ||
234 test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
235 &transparent_hugepage_flags)))
236 set_recommended_min_free_kbytes();
237
238 return ret;
239}
240static struct kobj_attribute enabled_attr =
241 __ATTR(enabled, 0644, enabled_show, enabled_store);
242
243static ssize_t single_flag_show(struct kobject *kobj,
244 struct kobj_attribute *attr, char *buf,
245 enum transparent_hugepage_flag flag)
246{
247 return sprintf(buf, "%d\n",
248 !!test_bit(flag, &transparent_hugepage_flags));
249}
250
251static ssize_t single_flag_store(struct kobject *kobj,
252 struct kobj_attribute *attr,
253 const char *buf, size_t count,
254 enum transparent_hugepage_flag flag)
255{
256 unsigned long value;
257 int ret;
258
259 ret = kstrtoul(buf, 10, &value);
260 if (ret < 0)
261 return ret;
262 if (value > 1)
263 return -EINVAL;
264
265 if (value)
266 set_bit(flag, &transparent_hugepage_flags);
267 else
268 clear_bit(flag, &transparent_hugepage_flags);
269
270 return count;
271}
272
273/*
274 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
275 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
276 * memory just to allocate one more hugepage.
277 */
278static ssize_t defrag_show(struct kobject *kobj,
279 struct kobj_attribute *attr, char *buf)
280{
281 return double_flag_show(kobj, attr, buf,
282 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
283 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
284}
285static ssize_t defrag_store(struct kobject *kobj,
286 struct kobj_attribute *attr,
287 const char *buf, size_t count)
288{
289 return double_flag_store(kobj, attr, buf, count,
290 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
291 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
292}
293static struct kobj_attribute defrag_attr =
294 __ATTR(defrag, 0644, defrag_show, defrag_store);
295
296#ifdef CONFIG_DEBUG_VM
297static ssize_t debug_cow_show(struct kobject *kobj,
298 struct kobj_attribute *attr, char *buf)
299{
300 return single_flag_show(kobj, attr, buf,
301 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
302}
303static ssize_t debug_cow_store(struct kobject *kobj,
304 struct kobj_attribute *attr,
305 const char *buf, size_t count)
306{
307 return single_flag_store(kobj, attr, buf, count,
308 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
309}
310static struct kobj_attribute debug_cow_attr =
311 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
312#endif /* CONFIG_DEBUG_VM */
313
314static struct attribute *hugepage_attr[] = {
315 &enabled_attr.attr,
316 &defrag_attr.attr,
317#ifdef CONFIG_DEBUG_VM
318 &debug_cow_attr.attr,
319#endif
320 NULL,
321};
322
323static struct attribute_group hugepage_attr_group = {
324 .attrs = hugepage_attr,
325};
326
327static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
328 struct kobj_attribute *attr,
329 char *buf)
330{
331 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
332}
333
334static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
335 struct kobj_attribute *attr,
336 const char *buf, size_t count)
337{
338 unsigned long msecs;
339 int err;
340
341 err = strict_strtoul(buf, 10, &msecs);
342 if (err || msecs > UINT_MAX)
343 return -EINVAL;
344
345 khugepaged_scan_sleep_millisecs = msecs;
346 wake_up_interruptible(&khugepaged_wait);
347
348 return count;
349}
350static struct kobj_attribute scan_sleep_millisecs_attr =
351 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
352 scan_sleep_millisecs_store);
353
354static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
355 struct kobj_attribute *attr,
356 char *buf)
357{
358 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
359}
360
361static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
362 struct kobj_attribute *attr,
363 const char *buf, size_t count)
364{
365 unsigned long msecs;
366 int err;
367
368 err = strict_strtoul(buf, 10, &msecs);
369 if (err || msecs > UINT_MAX)
370 return -EINVAL;
371
372 khugepaged_alloc_sleep_millisecs = msecs;
373 wake_up_interruptible(&khugepaged_wait);
374
375 return count;
376}
377static struct kobj_attribute alloc_sleep_millisecs_attr =
378 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
379 alloc_sleep_millisecs_store);
380
381static ssize_t pages_to_scan_show(struct kobject *kobj,
382 struct kobj_attribute *attr,
383 char *buf)
384{
385 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
386}
387static ssize_t pages_to_scan_store(struct kobject *kobj,
388 struct kobj_attribute *attr,
389 const char *buf, size_t count)
390{
391 int err;
392 unsigned long pages;
393
394 err = strict_strtoul(buf, 10, &pages);
395 if (err || !pages || pages > UINT_MAX)
396 return -EINVAL;
397
398 khugepaged_pages_to_scan = pages;
399
400 return count;
401}
402static struct kobj_attribute pages_to_scan_attr =
403 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
404 pages_to_scan_store);
405
406static ssize_t pages_collapsed_show(struct kobject *kobj,
407 struct kobj_attribute *attr,
408 char *buf)
409{
410 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
411}
412static struct kobj_attribute pages_collapsed_attr =
413 __ATTR_RO(pages_collapsed);
414
415static ssize_t full_scans_show(struct kobject *kobj,
416 struct kobj_attribute *attr,
417 char *buf)
418{
419 return sprintf(buf, "%u\n", khugepaged_full_scans);
420}
421static struct kobj_attribute full_scans_attr =
422 __ATTR_RO(full_scans);
423
424static ssize_t khugepaged_defrag_show(struct kobject *kobj,
425 struct kobj_attribute *attr, char *buf)
426{
427 return single_flag_show(kobj, attr, buf,
428 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
429}
430static ssize_t khugepaged_defrag_store(struct kobject *kobj,
431 struct kobj_attribute *attr,
432 const char *buf, size_t count)
433{
434 return single_flag_store(kobj, attr, buf, count,
435 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
436}
437static struct kobj_attribute khugepaged_defrag_attr =
438 __ATTR(defrag, 0644, khugepaged_defrag_show,
439 khugepaged_defrag_store);
440
441/*
442 * max_ptes_none controls whether khugepaged should collapse hugepages
443 * over unmapped ptes, which in turn potentially increases the memory
444 * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
445 * reduce the available free memory in the system as it
446 * runs. Increasing max_ptes_none will instead potentially reduce the
447 * free memory in the system during the khugepaged scan.
448 */
449static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
450 struct kobj_attribute *attr,
451 char *buf)
452{
453 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
454}
455static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
456 struct kobj_attribute *attr,
457 const char *buf, size_t count)
458{
459 int err;
460 unsigned long max_ptes_none;
461
462 err = strict_strtoul(buf, 10, &max_ptes_none);
463 if (err || max_ptes_none > HPAGE_PMD_NR-1)
464 return -EINVAL;
465
466 khugepaged_max_ptes_none = max_ptes_none;
467
468 return count;
469}
470static struct kobj_attribute khugepaged_max_ptes_none_attr =
471 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
472 khugepaged_max_ptes_none_store);
473
474static struct attribute *khugepaged_attr[] = {
475 &khugepaged_defrag_attr.attr,
476 &khugepaged_max_ptes_none_attr.attr,
477 &pages_to_scan_attr.attr,
478 &pages_collapsed_attr.attr,
479 &full_scans_attr.attr,
480 &scan_sleep_millisecs_attr.attr,
481 &alloc_sleep_millisecs_attr.attr,
482 NULL,
483};
484
485static struct attribute_group khugepaged_attr_group = {
486 .attrs = khugepaged_attr,
487 .name = "khugepaged",
488};
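/*
 * Tuning sketch, same sysfs assumption: because this group is named
 * "khugepaged", its attributes land one directory deeper, e.g.:
 *
 *   echo 4096  > /sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
 *   echo 10000 > /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
 *   cat /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
 *
 * Per the store helpers above, pages_to_scan must be a non-zero
 * integer and the sleep values are plain milliseconds.
 */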
489#endif /* CONFIG_SYSFS */
490
491static int __init hugepage_init(void)
492{
493 int err;
494#ifdef CONFIG_SYSFS
495 static struct kobject *hugepage_kobj;
496#endif
497
498 err = -EINVAL;
499 if (!has_transparent_hugepage()) {
500 transparent_hugepage_flags = 0;
501 goto out;
502 }
503
504#ifdef CONFIG_SYSFS
505 err = -ENOMEM;
506 hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
507 if (unlikely(!hugepage_kobj)) {
508 printk(KERN_ERR "hugepage: failed to create kobject\n");
509 goto out;
510 }
511
512 err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
513 if (err) {
514 printk(KERN_ERR "hugepage: failed to register hugepage group\n");
515 goto out;
516 }
517
518 err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
519 if (err) {
520 printk(KERN_ERR "hugepage: failed to register khugepaged group\n");
521 goto out;
522 }
523#endif
524
525 err = khugepaged_slab_init();
526 if (err)
527 goto out;
528
529 err = mm_slots_hash_init();
530 if (err) {
531 khugepaged_slab_free();
532 goto out;
533 }
534
535 /*
536 * By default disable transparent hugepages on smaller systems,
537 * where the extra memory used could hurt more than TLB overhead
538 * is likely to save. The admin can still enable it through /sys.
539 */
540 if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
541 transparent_hugepage_flags = 0;
542
543 start_khugepaged();
544
545 set_recommended_min_free_kbytes();
546
547out:
548 return err;
549}
550module_init(hugepage_init)
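/*
 * Worked example for the threshold in hugepage_init(): the expression
 * (512 << (20 - PAGE_SHIFT)) is 512MB expressed in pages. With 4KB
 * pages (PAGE_SHIFT == 12) that is 512 << 8 == 131072 pages, so THP
 * starts out disabled on machines with less than 512MB of RAM and has
 * to be re-enabled explicitly through /sys.
 */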
551
552static int __init setup_transparent_hugepage(char *str)
553{
554 int ret = 0;
555 if (!str)
556 goto out;
557 if (!strcmp(str, "always")) {
558 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
559 &transparent_hugepage_flags);
560 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
561 &transparent_hugepage_flags);
562 ret = 1;
563 } else if (!strcmp(str, "madvise")) {
564 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
565 &transparent_hugepage_flags);
566 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
567 &transparent_hugepage_flags);
568 ret = 1;
569 } else if (!strcmp(str, "never")) {
570 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
571 &transparent_hugepage_flags);
572 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
573 &transparent_hugepage_flags);
574 ret = 1;
575 }
576out:
577 if (!ret)
578 printk(KERN_WARNING
579 "transparent_hugepage= cannot parse, ignored\n");
580 return ret;
581}
582__setup("transparent_hugepage=", setup_transparent_hugepage);
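/*
 * Boot-time sketch: the same policy can be chosen on the kernel
 * command line before userspace is up, e.g.:
 *
 *   transparent_hugepage=madvise
 *
 * Only "always", "madvise" and "never" parse; anything else trips the
 * warning above and leaves the compiled-in default untouched.
 */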
583
584static void prepare_pmd_huge_pte(pgtable_t pgtable,
585 struct mm_struct *mm)
586{
587 assert_spin_locked(&mm->page_table_lock);
588
589 /* FIFO */
590 if (!mm->pmd_huge_pte)
591 INIT_LIST_HEAD(&pgtable->lru);
592 else
593 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
594 mm->pmd_huge_pte = pgtable;
595}
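/*
 * The pte page queued above is a deposit: it sits unused on the FIFO
 * list at mm->pmd_huge_pte until the huge pmd is split or zapped, at
 * which point get_pmd_huge_pte() below withdraws it. The split path
 * therefore never has to allocate memory.
 */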
596
597static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
598{
599 if (likely(vma->vm_flags & VM_WRITE))
600 pmd = pmd_mkwrite(pmd);
601 return pmd;
602}
603
604static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
605 struct vm_area_struct *vma,
606 unsigned long haddr, pmd_t *pmd,
607 struct page *page)
608{
609 int ret = 0;
610 pgtable_t pgtable;
611
612 VM_BUG_ON(!PageCompound(page));
613 pgtable = pte_alloc_one(mm, haddr);
614 if (unlikely(!pgtable)) {
615 mem_cgroup_uncharge_page(page);
616 put_page(page);
617 return VM_FAULT_OOM;
618 }
619
620 clear_huge_page(page, haddr, HPAGE_PMD_NR);
621 __SetPageUptodate(page);
622
623 spin_lock(&mm->page_table_lock);
624 if (unlikely(!pmd_none(*pmd))) {
625 spin_unlock(&mm->page_table_lock);
626 mem_cgroup_uncharge_page(page);
627 put_page(page);
628 pte_free(mm, pgtable);
629 } else {
630 pmd_t entry;
631 entry = mk_pmd(page, vma->vm_page_prot);
632 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
633 entry = pmd_mkhuge(entry);
634 /*
635 * The spinlocking to take the lru_lock inside
636 * page_add_new_anon_rmap() acts as a full memory
637 * barrier to be sure the clear_huge_page writes become
638 * visible before the set_pmd_at() write.
639 */
640 page_add_new_anon_rmap(page, vma, haddr);
641 set_pmd_at(mm, haddr, pmd, entry);
642 prepare_pmd_huge_pte(pgtable, mm);
643 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
644 spin_unlock(&mm->page_table_lock);
645 }
646
647 return ret;
648}
649
650static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
651{
652 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
653}
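/*
 * Reading the expression above: with defrag set nothing is masked off,
 * so the full GFP_TRANSHUGE mask is used and the allocation may block
 * in reclaim/compaction; with defrag clear __GFP_WAIT is removed and
 * the huge page is only taken if it is readily available. extra_gfp is
 * simply OR-ed on top in either case.
 */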
654
655static inline struct page *alloc_hugepage_vma(int defrag,
656 struct vm_area_struct *vma,
657 unsigned long haddr, int nd,
658 gfp_t extra_gfp)
659{
660 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
661 HPAGE_PMD_ORDER, vma, haddr, nd);
662}
663
664#ifndef CONFIG_NUMA
665static inline struct page *alloc_hugepage(int defrag)
666{
667 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
668 HPAGE_PMD_ORDER);
669}
670#endif
671
672int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
673 unsigned long address, pmd_t *pmd,
674 unsigned int flags)
675{
676 struct page *page;
677 unsigned long haddr = address & HPAGE_PMD_MASK;
678 pte_t *pte;
679
680 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
681 if (unlikely(anon_vma_prepare(vma)))
682 return VM_FAULT_OOM;
683 if (unlikely(khugepaged_enter(vma)))
684 return VM_FAULT_OOM;
685 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
686 vma, haddr, numa_node_id(), 0);
687 if (unlikely(!page)) {
688 count_vm_event(THP_FAULT_FALLBACK);
689 goto out;
690 }
691 count_vm_event(THP_FAULT_ALLOC);
692 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
693 put_page(page);
694 goto out;
695 }
696
697 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
698 }
699out:
700 /*
701 * Use __pte_alloc instead of pte_alloc_map, because we can't
702 * run pte_offset_map on the pmd, if a huge pmd could
703 * materialize from under us in a different thread.
704 */
705 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
706 return VM_FAULT_OOM;
707 /* if a huge pmd materialized from under us, just retry later */
708 if (unlikely(pmd_trans_huge(*pmd)))
709 return 0;
710 /*
711 * A regular pmd is established and it can't morph into a huge pmd
712 * from under us anymore at this point because we hold the mmap_sem
713 * read mode and khugepaged takes it in write mode. So now it's
714 * safe to run pte_offset_map().
715 */
716 pte = pte_offset_map(pmd, address);
717 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
718}
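/*
 * In short: the huge fault above is only attempted when the aligned
 * haddr .. haddr + HPAGE_PMD_SIZE range lies entirely inside the vma
 * and a huge page can actually be allocated and charged; every other
 * case falls back to the regular pte fault path via handle_pte_fault().
 */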
719
720int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
721 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
722 struct vm_area_struct *vma)
723{
724 struct page *src_page;
725 pmd_t pmd;
726 pgtable_t pgtable;
727 int ret;
728
729 ret = -ENOMEM;
730 pgtable = pte_alloc_one(dst_mm, addr);
731 if (unlikely(!pgtable))
732 goto out;
733
734 spin_lock(&dst_mm->page_table_lock);
735 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
736
737 ret = -EAGAIN;
738 pmd = *src_pmd;
739 if (unlikely(!pmd_trans_huge(pmd))) {
740 pte_free(dst_mm, pgtable);
741 goto out_unlock;
742 }
743 if (unlikely(pmd_trans_splitting(pmd))) {
744 /* split huge page running from under us */
745 spin_unlock(&src_mm->page_table_lock);
746 spin_unlock(&dst_mm->page_table_lock);
747 pte_free(dst_mm, pgtable);
748
749 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
750 goto out;
751 }
752 src_page = pmd_page(pmd);
753 VM_BUG_ON(!PageHead(src_page));
754 get_page(src_page);
755 page_dup_rmap(src_page);
756 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
757
758 pmdp_set_wrprotect(src_mm, addr, src_pmd);
759 pmd = pmd_mkold(pmd_wrprotect(pmd));
760 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
761 prepare_pmd_huge_pte(pgtable, dst_mm);
762
763 ret = 0;
764out_unlock:
765 spin_unlock(&src_mm->page_table_lock);
766 spin_unlock(&dst_mm->page_table_lock);
767out:
768 return ret;
769}
770
771/* no "address" argument, so this destroys page coloring on some archs */
772pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
773{
774 pgtable_t pgtable;
775
776 assert_spin_locked(&mm->page_table_lock);
777
778 /* FIFO */
779 pgtable = mm->pmd_huge_pte;
780 if (list_empty(&pgtable->lru))
781 mm->pmd_huge_pte = NULL;
782 else {
783 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
784 struct page, lru);
785 list_del(&pgtable->lru);
786 }
787 return pgtable;
788}
789
790static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
791 struct vm_area_struct *vma,
792 unsigned long address,
793 pmd_t *pmd, pmd_t orig_pmd,
794 struct page *page,
795 unsigned long haddr)
796{
797 pgtable_t pgtable;
798 pmd_t _pmd;
799 int ret = 0, i;
800 struct page **pages;
801
802 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
803 GFP_KERNEL);
804 if (unlikely(!pages)) {
805 ret |= VM_FAULT_OOM;
806 goto out;
807 }
808
809 for (i = 0; i < HPAGE_PMD_NR; i++) {
810 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
811 __GFP_OTHER_NODE,
812 vma, address, page_to_nid(page));
813 if (unlikely(!pages[i] ||
814 mem_cgroup_newpage_charge(pages[i], mm,
815 GFP_KERNEL))) {
816 if (pages[i])
817 put_page(pages[i]);
818 mem_cgroup_uncharge_start();
819 while (--i >= 0) {
820 mem_cgroup_uncharge_page(pages[i]);
821 put_page(pages[i]);
822 }
823 mem_cgroup_uncharge_end();
824 kfree(pages);
825 ret |= VM_FAULT_OOM;
826 goto out;
827 }
828 }
829
830 for (i = 0; i < HPAGE_PMD_NR; i++) {
831 copy_user_highpage(pages[i], page + i,
832 haddr + PAGE_SHIFT*i, vma);
833 __SetPageUptodate(pages[i]);
834 cond_resched();
835 }
836
837 spin_lock(&mm->page_table_lock);
838 if (unlikely(!pmd_same(*pmd, orig_pmd)))
839 goto out_free_pages;
840 VM_BUG_ON(!PageHead(page));
841
842 pmdp_clear_flush_notify(vma, haddr, pmd);
843 /* leave pmd empty until pte is filled */
844
845 pgtable = get_pmd_huge_pte(mm);
846 pmd_populate(mm, &_pmd, pgtable);
847
848 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
849 pte_t *pte, entry;
850 entry = mk_pte(pages[i], vma->vm_page_prot);
851 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
852 page_add_new_anon_rmap(pages[i], vma, haddr);
853 pte = pte_offset_map(&_pmd, haddr);
854 VM_BUG_ON(!pte_none(*pte));
855 set_pte_at(mm, haddr, pte, entry);
856 pte_unmap(pte);
857 }
858 kfree(pages);
859
860 mm->nr_ptes++;
861 smp_wmb(); /* make pte visible before pmd */
862 pmd_populate(mm, pmd, pgtable);
863 page_remove_rmap(page);
864 spin_unlock(&mm->page_table_lock);
865
866 ret |= VM_FAULT_WRITE;
867 put_page(page);
868
869out:
870 return ret;
871
872out_free_pages:
873 spin_unlock(&mm->page_table_lock);
874 mem_cgroup_uncharge_start();
875 for (i = 0; i < HPAGE_PMD_NR; i++) {
876 mem_cgroup_uncharge_page(pages[i]);
877 put_page(pages[i]);
878 }
879 mem_cgroup_uncharge_end();
880 kfree(pages);
881 goto out;
882}
883
884int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
885 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
886{
887 int ret = 0;
888 struct page *page, *new_page;
889 unsigned long haddr;
890
891 VM_BUG_ON(!vma->anon_vma);
892 spin_lock(&mm->page_table_lock);
893 if (unlikely(!pmd_same(*pmd, orig_pmd)))
894 goto out_unlock;
895
896 page = pmd_page(orig_pmd);
897 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
898 haddr = address & HPAGE_PMD_MASK;
899 if (page_mapcount(page) == 1) {
900 pmd_t entry;
901 entry = pmd_mkyoung(orig_pmd);
902 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
903 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
904 update_mmu_cache(vma, address, entry);
905 ret |= VM_FAULT_WRITE;
906 goto out_unlock;
907 }
908 get_page(page);
909 spin_unlock(&mm->page_table_lock);
910
911 if (transparent_hugepage_enabled(vma) &&
912 !transparent_hugepage_debug_cow())
913 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
914 vma, haddr, numa_node_id(), 0);
915 else
916 new_page = NULL;
917
918 if (unlikely(!new_page)) {
919 count_vm_event(THP_FAULT_FALLBACK);
920 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
921 pmd, orig_pmd, page, haddr);
922 put_page(page);
923 goto out;
924 }
925 count_vm_event(THP_FAULT_ALLOC);
926
927 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
928 put_page(new_page);
929 put_page(page);
930 ret |= VM_FAULT_OOM;
931 goto out;
932 }
933
934 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
935 __SetPageUptodate(new_page);
936
937 spin_lock(&mm->page_table_lock);
938 put_page(page);
939 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
940 mem_cgroup_uncharge_page(new_page);
941 put_page(new_page);
942 } else {
943 pmd_t entry;
944 VM_BUG_ON(!PageHead(page));
945 entry = mk_pmd(new_page, vma->vm_page_prot);
946 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
947 entry = pmd_mkhuge(entry);
948 pmdp_clear_flush_notify(vma, haddr, pmd);
949 page_add_new_anon_rmap(new_page, vma, haddr);
950 set_pmd_at(mm, haddr, pmd, entry);
951 update_mmu_cache(vma, address, entry);
952 page_remove_rmap(page);
953 put_page(page);
954 ret |= VM_FAULT_WRITE;
955 }
956out_unlock:
957 spin_unlock(&mm->page_table_lock);
958out:
959 return ret;
960}
961
962struct page *follow_trans_huge_pmd(struct mm_struct *mm,
963 unsigned long addr,
964 pmd_t *pmd,
965 unsigned int flags)
966{
967 struct page *page = NULL;
968
969 assert_spin_locked(&mm->page_table_lock);
970
971 if (flags & FOLL_WRITE && !pmd_write(*pmd))
972 goto out;
973
974 page = pmd_page(*pmd);
975 VM_BUG_ON(!PageHead(page));
976 if (flags & FOLL_TOUCH) {
977 pmd_t _pmd;
978 /*
979 * We should set the dirty bit only for FOLL_WRITE but
980 * for now the dirty bit in the pmd is meaningless.
981 * If the dirty bit ever becomes meaningful and we
982 * only set it with FOLL_WRITE, an atomic set_bit
983 * will be required on the pmd to set the young bit,
984 * instead of the current set_pmd_at.
985 */
986 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
987 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
988 }
989 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
990 VM_BUG_ON(!PageCompound(page));
991 if (flags & FOLL_GET)
992 get_page(page);
993
994out:
995 return page;
996}
997
998int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
999 pmd_t *pmd)
1000{
1001 int ret = 0;
1002
1003 spin_lock(&tlb->mm->page_table_lock);
1004 if (likely(pmd_trans_huge(*pmd))) {
1005 if (unlikely(pmd_trans_splitting(*pmd))) {
1006 spin_unlock(&tlb->mm->page_table_lock);
1007 wait_split_huge_page(vma->anon_vma,
1008 pmd);
1009 } else {
1010 struct page *page;
1011 pgtable_t pgtable;
1012 pgtable = get_pmd_huge_pte(tlb->mm);
1013 page = pmd_page(*pmd);
1014 pmd_clear(pmd);
1015 page_remove_rmap(page);
1016 VM_BUG_ON(page_mapcount(page) < 0);
1017 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1018 VM_BUG_ON(!PageHead(page));
1019 spin_unlock(&tlb->mm->page_table_lock);
1020 tlb_remove_page(tlb, page);
1021 pte_free(tlb->mm, pgtable);
1022 ret = 1;
1023 }
1024 } else
1025 spin_unlock(&tlb->mm->page_table_lock);
1026
1027 return ret;
1028}
1029
1030int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1031 unsigned long addr, unsigned long end,
1032 unsigned char *vec)
1033{
1034 int ret = 0;
1035
1036 spin_lock(&vma->vm_mm->page_table_lock);
1037 if (likely(pmd_trans_huge(*pmd))) {
1038 ret = !pmd_trans_splitting(*pmd);
1039 spin_unlock(&vma->vm_mm->page_table_lock);
1040 if (unlikely(!ret))
1041 wait_split_huge_page(vma->anon_vma, pmd);
1042 else {
1043 /*
1044 * All logical pages in the range are present
1045 * if backed by a huge page.
1046 */
1047 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1048 }
1049 } else
1050 spin_unlock(&vma->vm_mm->page_table_lock);
1051
1052 return ret;
1053}
1054
1055int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1056 unsigned long addr, pgprot_t newprot)
1057{
1058 struct mm_struct *mm = vma->vm_mm;
1059 int ret = 0;
1060
1061 spin_lock(&mm->page_table_lock);
1062 if (likely(pmd_trans_huge(*pmd))) {
1063 if (unlikely(pmd_trans_splitting(*pmd))) {
1064 spin_unlock(&mm->page_table_lock);
1065 wait_split_huge_page(vma->anon_vma, pmd);
1066 } else {
1067 pmd_t entry;
1068
1069 entry = pmdp_get_and_clear(mm, addr, pmd);
1070 entry = pmd_modify(entry, newprot);
1071 set_pmd_at(mm, addr, pmd, entry);
1072 spin_unlock(&vma->vm_mm->page_table_lock);
1073 flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
1074 ret = 1;
1075 }
1076 } else
1077 spin_unlock(&vma->vm_mm->page_table_lock);
1078
1079 return ret;
1080}
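/*
 * zap_huge_pmd(), mincore_huge_pmd() and change_huge_pmd() above all
 * follow the same pattern: take page_table_lock and, if the pmd is
 * still huge, either wait out a concurrent split (pmd_trans_splitting)
 * and return 0 so the caller falls back to the pte code, or handle the
 * whole HPAGE_PMD_SIZE worth of mapping in one shot and return 1.
 */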
1081
1082pmd_t *page_check_address_pmd(struct page *page,
1083 struct mm_struct *mm,
1084 unsigned long address,
1085 enum page_check_address_pmd_flag flag)
1086{
1087 pgd_t *pgd;
1088 pud_t *pud;
1089 pmd_t *pmd, *ret = NULL;
1090
1091 if (address & ~HPAGE_PMD_MASK)
1092 goto out;
1093
1094 pgd = pgd_offset(mm, address);
1095 if (!pgd_present(*pgd))
1096 goto out;
1097
1098 pud = pud_offset(pgd, address);
1099 if (!pud_present(*pud))
1100 goto out;
1101
1102 pmd = pmd_offset(pud, address);
1103 if (pmd_none(*pmd))
1104 goto out;
1105 if (pmd_page(*pmd) != page)
1106 goto out;
1107 /*
1108 * split_vma() may create temporary aliased mappings. There is
1109 * no risk as long as all huge pmds are found and have their
1110 * splitting bit set before __split_huge_page_refcount
1111 * runs. Finding the same huge pmd more than once during the
1112 * same rmap walk is not a problem.
1113 */
1114 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1115 pmd_trans_splitting(*pmd))
1116 goto out;
1117 if (pmd_trans_huge(*pmd)) {
1118 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1119 !pmd_trans_splitting(*pmd));
1120 ret = pmd;
1121 }
1122out:
1123 return ret;
1124}
1125
1126static int __split_huge_page_splitting(struct page *page,
1127 struct vm_area_struct *vma,
1128 unsigned long address)
1129{
1130 struct mm_struct *mm = vma->vm_mm;
1131 pmd_t *pmd;
1132 int ret = 0;
1133
1134 spin_lock(&mm->page_table_lock);
1135 pmd = page_check_address_pmd(page, mm, address,
1136 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1137 if (pmd) {
1138 /*
1139 * We can't temporarily set the pmd to null in order
1140 * to split it; the pmd must remain marked huge at all
1141 * times or the VM won't take the pmd_trans_huge paths
1142 * and it won't wait on the anon_vma->root->mutex to
1143 * serialize against split_huge_page*.
1144 */
1145 pmdp_splitting_flush_notify(vma, address, pmd);
1146 ret = 1;
1147 }
1148 spin_unlock(&mm->page_table_lock);
1149
1150 return ret;
1151}
1152
1153static void __split_huge_page_refcount(struct page *page)
1154{
1155 int i;
1156 unsigned long head_index = page->index;
1157 struct zone *zone = page_zone(page);
1158 int zonestat;
1159
1160 /* prevent PageLRU to go away from under us, and freeze lru stats */
1161 spin_lock_irq(&zone->lru_lock);
1162 compound_lock(page);
1163
1164 for (i = 1; i < HPAGE_PMD_NR; i++) {
1165 struct page *page_tail = page + i;
1166
1167 /* tail_page->_count cannot change */
1168 atomic_sub(atomic_read(&page_tail->_count), &page->_count);
1169 BUG_ON(page_count(page) <= 0);
1170 atomic_add(page_mapcount(page) + 1, &page_tail->_count);
1171 BUG_ON(atomic_read(&page_tail->_count) <= 0);
1172
1173 /* after clearing PageTail the gup refcount can be released */
1174 smp_mb();
1175
1176 /*
1177 * retain the hwpoison flag of the poisoned tail page:
1178 * this prevents memory-failure from killing the wrong
1179 * process on a guest machine (KVM).
1180 */
1181 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
1182 page_tail->flags |= (page->flags &
1183 ((1L << PG_referenced) |
1184 (1L << PG_swapbacked) |
1185 (1L << PG_mlocked) |
1186 (1L << PG_uptodate)));
1187 page_tail->flags |= (1L << PG_dirty);
1188
1189 /*
1190 * 1) clear PageTail before overwriting first_page
1191 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1192 */
1193 smp_wmb();
1194
1195 /*
1196 * __split_huge_page_splitting() already set the
1197 * splitting bit in all pmd that could map this
1198 * hugepage, that will ensure no CPU can alter the
1199 * mapcount on the head page. The mapcount is only
1200 * accounted in the head page and it has to be
1201 * transferred to all tail pages in the code below. So
1202 * for this code to be safe, the mapcount can't change
1203 * during the split. But that doesn't mean userland can't
1204 * keep changing and reading the page contents while
1205 * we transfer the mapcount, so the pmd splitting
1206 * status is achieved setting a reserved bit in the
1207 * pmd, not by clearing the present bit.
1208 */
1209 BUG_ON(page_mapcount(page_tail));
1210 page_tail->_mapcount = page->_mapcount;
1211
1212 BUG_ON(page_tail->mapping);
1213 page_tail->mapping = page->mapping;
1214
1215 page_tail->index = ++head_index;
1216
1217 BUG_ON(!PageAnon(page_tail));
1218 BUG_ON(!PageUptodate(page_tail));
1219 BUG_ON(!PageDirty(page_tail));
1220 BUG_ON(!PageSwapBacked(page_tail));
1221
1222 mem_cgroup_split_huge_fixup(page, page_tail);
1223
1224 lru_add_page_tail(zone, page, page_tail);
1225 }
1226
1227 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1228 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1229
1230 /*
1231 * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
1232 * so adjust those appropriately if this page is on the LRU.
1233 */
1234 if (PageLRU(page)) {
1235 zonestat = NR_LRU_BASE + page_lru(page);
1236 __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
1237 }
1238
1239 ClearPageCompound(page);
1240 compound_unlock(page);
1241 spin_unlock_irq(&zone->lru_lock);
1242
1243 for (i = 1; i < HPAGE_PMD_NR; i++) {
1244 struct page *page_tail = page + i;
1245 BUG_ON(page_count(page_tail) <= 0);
1246 /*
1247 * Tail pages may be freed if there wasn't any mapping,
1248 * for example if add_to_swap() is running on an lru page
1249 * that had its mapping zapped. And freeing these pages
1250 * requires taking the lru_lock, so we do the put_page
1251 * of the tail pages after the split is complete.
1252 */
1253 put_page(page_tail);
1254 }
1255
1256 /*
1257 * Only the head page (now become a regular page) is required
1258 * to be pinned by the caller.
1259 */
1260 BUG_ON(page_count(page) <= 0);
1261}
1262
1263static int __split_huge_page_map(struct page *page,
1264 struct vm_area_struct *vma,
1265 unsigned long address)
1266{
1267 struct mm_struct *mm = vma->vm_mm;
1268 pmd_t *pmd, _pmd;
1269 int ret = 0, i;
1270 pgtable_t pgtable;
1271 unsigned long haddr;
1272
1273 spin_lock(&mm->page_table_lock);
1274 pmd = page_check_address_pmd(page, mm, address,
1275 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1276 if (pmd) {
1277 pgtable = get_pmd_huge_pte(mm);
1278 pmd_populate(mm, &_pmd, pgtable);
1279
1280 for (i = 0, haddr = address; i < HPAGE_PMD_NR;
1281 i++, haddr += PAGE_SIZE) {
1282 pte_t *pte, entry;
1283 BUG_ON(PageCompound(page+i));
1284 entry = mk_pte(page + i, vma->vm_page_prot);
1285 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1286 if (!pmd_write(*pmd))
1287 entry = pte_wrprotect(entry);
1288 else
1289 BUG_ON(page_mapcount(page) != 1);
1290 if (!pmd_young(*pmd))
1291 entry = pte_mkold(entry);
1292 pte = pte_offset_map(&_pmd, haddr);
1293 BUG_ON(!pte_none(*pte));
1294 set_pte_at(mm, haddr, pte, entry);
1295 pte_unmap(pte);
1296 }
1297
1298 mm->nr_ptes++;
1299 smp_wmb(); /* make pte visible before pmd */
1300 /*
1301 * Up to this point the pmd is present and huge and
1302 * userland has the whole access to the hugepage
1303 * during the split (which happens in place). If we
1304 * overwrite the pmd with the not-huge version
1305 * pointing to the pte here (which of course we could
1306 * if all CPUs were bug free), userland could trigger
1307 * a small page size TLB miss on the small sized TLB
1308 * while the hugepage TLB entry is still established
1309 * in the huge TLB. Some CPUs don't like that. See
1310 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1311 * Erratum 383 on page 93. Intel should be safe but it
1312 * also warns that it's only safe if the permission
1313 * and cache attributes of the two entries loaded in
1314 * the two TLBs are identical (which should be the case
1315 * here). But it is generally safer to never allow
1316 * small and huge TLB entries for the same virtual
1317 * address to be loaded simultaneously. So instead of
1318 * doing "pmd_populate(); flush_tlb_range();" we first
1319 * mark the current pmd notpresent (atomically because
1320 * here the pmd_trans_huge and pmd_trans_splitting
1321 * must remain set at all times on the pmd until the
1322 * split is complete for this pmd), then we flush the
1323 * SMP TLB and finally we write the non-huge version
1324 * of the pmd entry with pmd_populate.
1325 */
1326 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
1327 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1328 pmd_populate(mm, pmd, pgtable);
1329 ret = 1;
1330 }
1331 spin_unlock(&mm->page_table_lock);
1332
1333 return ret;
1334}
1335
1336/* must be called with anon_vma->root->mutex held */
1337static void __split_huge_page(struct page *page,
1338 struct anon_vma *anon_vma)
1339{
1340 int mapcount, mapcount2;
1341 struct anon_vma_chain *avc;
1342
1343 BUG_ON(!PageHead(page));
1344 BUG_ON(PageTail(page));
1345
1346 mapcount = 0;
1347 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1348 struct vm_area_struct *vma = avc->vma;
1349 unsigned long addr = vma_address(page, vma);
1350 BUG_ON(is_vma_temporary_stack(vma));
1351 if (addr == -EFAULT)
1352 continue;
1353 mapcount += __split_huge_page_splitting(page, vma, addr);
1354 }
1355 /*
1356 * It is critical that new vmas are added to the tail of the
1357 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1358 * and establishes a child pmd before
1359 * __split_huge_page_splitting() freezes the parent pmd (so if
1360 * we fail to prevent copy_huge_pmd() from running until the
1361 * whole __split_huge_page() is complete), we will still see
1362 * the newly established pmd of the child later during the
1363 * walk, to be able to set it as pmd_trans_splitting too.
1364 */
1365 if (mapcount != page_mapcount(page))
1366 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1367 mapcount, page_mapcount(page));
1368 BUG_ON(mapcount != page_mapcount(page));
1369
1370 __split_huge_page_refcount(page);
1371
1372 mapcount2 = 0;
1373 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1374 struct vm_area_struct *vma = avc->vma;
1375 unsigned long addr = vma_address(page, vma);
1376 BUG_ON(is_vma_temporary_stack(vma));
1377 if (addr == -EFAULT)
1378 continue;
1379 mapcount2 += __split_huge_page_map(page, vma, addr);
1380 }
1381 if (mapcount != mapcount2)
1382 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1383 mapcount, mapcount2, page_mapcount(page));
1384 BUG_ON(mapcount != mapcount2);
1385}
1386
1387int split_huge_page(struct page *page)
1388{
1389 struct anon_vma *anon_vma;
1390 int ret = 1;
1391
1392 BUG_ON(!PageAnon(page));
1393 anon_vma = page_lock_anon_vma(page);
1394 if (!anon_vma)
1395 goto out;
1396 ret = 0;
1397 if (!PageCompound(page))
1398 goto out_unlock;
1399
1400 BUG_ON(!PageSwapBacked(page));
1401 __split_huge_page(page, anon_vma);
1402 count_vm_event(THP_SPLIT);
1403
1404 BUG_ON(PageCompound(page));
1405out_unlock:
1406 page_unlock_anon_vma(anon_vma);
1407out:
1408 return ret;
1409}
1410
1411#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
1412 VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1413
1414int hugepage_madvise(struct vm_area_struct *vma,
1415 unsigned long *vm_flags, int advice)
1416{
1417 switch (advice) {
1418 case MADV_HUGEPAGE:
1419 /*
1420 * Be somewhat over-protective like KSM for now!
1421 */
1422 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1423 return -EINVAL;
1424 *vm_flags &= ~VM_NOHUGEPAGE;
1425 *vm_flags |= VM_HUGEPAGE;
1426 /*
1427 * If the vma become good for khugepaged to scan,
1428 * register it here without waiting a page fault that
1429 * may not happen any time soon.
1430 */
1431 if (unlikely(khugepaged_enter_vma_merge(vma)))
1432 return -ENOMEM;
1433 break;
1434 case MADV_NOHUGEPAGE:
1435 /*
1436 * Be somewhat over-protective like KSM for now!
1437 */
1438 if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
1439 return -EINVAL;
1440 *vm_flags &= ~VM_HUGEPAGE;
1441 *vm_flags |= VM_NOHUGEPAGE;
1442 /*
1443 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1444 * this vma even if we leave the mm registered in khugepaged if
1445 * it got registered before VM_NOHUGEPAGE was set.
1446 */
1447 break;
1448 }
1449
1450 return 0;
1451}
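/*
 * Userspace view, as an illustrative sketch: an application opts a
 * mapping in or out of THP with madvise(2), e.g.
 *
 *   madvise(addr, length, MADV_HUGEPAGE);    allow/encourage THP here
 *   madvise(addr, length, MADV_NOHUGEPAGE);  forbid THP for this range
 *
 * Both advices end up in hugepage_madvise() above and fail with
 * -EINVAL on the special mappings excluded by VM_NO_THP.
 */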
1452
1453static int __init khugepaged_slab_init(void)
1454{
1455 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1456 sizeof(struct mm_slot),
1457 __alignof__(struct mm_slot), 0, NULL);
1458 if (!mm_slot_cache)
1459 return -ENOMEM;
1460
1461 return 0;
1462}
1463
1464static void __init khugepaged_slab_free(void)
1465{
1466 kmem_cache_destroy(mm_slot_cache);
1467 mm_slot_cache = NULL;
1468}
1469
1470static inline struct mm_slot *alloc_mm_slot(void)
1471{
1472 if (!mm_slot_cache) /* initialization failed */
1473 return NULL;
1474 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1475}
1476
1477static inline void free_mm_slot(struct mm_slot *mm_slot)
1478{
1479 kmem_cache_free(mm_slot_cache, mm_slot);
1480}
1481
1482static int __init mm_slots_hash_init(void)
1483{
1484 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1485 GFP_KERNEL);
1486 if (!mm_slots_hash)
1487 return -ENOMEM;
1488 return 0;
1489}
1490
1491#if 0
1492static void __init mm_slots_hash_free(void)
1493{
1494 kfree(mm_slots_hash);
1495 mm_slots_hash = NULL;
1496}
1497#endif
1498
1499static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1500{
1501 struct mm_slot *mm_slot;
1502 struct hlist_head *bucket;
1503 struct hlist_node *node;
1504
1505 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1506 % MM_SLOTS_HASH_HEADS];
1507 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1508 if (mm == mm_slot->mm)
1509 return mm_slot;
1510 }
1511 return NULL;
1512}
1513
1514static void insert_to_mm_slots_hash(struct mm_struct *mm,
1515 struct mm_slot *mm_slot)
1516{
1517 struct hlist_head *bucket;
1518
1519 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1520 % MM_SLOTS_HASH_HEADS];
1521 mm_slot->mm = mm;
1522 hlist_add_head(&mm_slot->hash, bucket);
1523}
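/*
 * A note on the hashing used by get_mm_slot()/insert_to_mm_slots_hash():
 * mm_structs come from the slab, so distinct mms are at least
 * sizeof(struct mm_struct) apart. Dividing the pointer by that size
 * before the modulo therefore spreads them roughly evenly across the
 * MM_SLOTS_HASH_HEADS buckets instead of clustering on alignment.
 */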
1524
1525static inline int khugepaged_test_exit(struct mm_struct *mm)
1526{
1527 return atomic_read(&mm->mm_users) == 0;
1528}
1529
1530int __khugepaged_enter(struct mm_struct *mm)
1531{
1532 struct mm_slot *mm_slot;
1533 int wakeup;
1534
1535 mm_slot = alloc_mm_slot();
1536 if (!mm_slot)
1537 return -ENOMEM;
1538
1539 /* __khugepaged_exit() must not run from under us */
1540 VM_BUG_ON(khugepaged_test_exit(mm));
1541 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1542 free_mm_slot(mm_slot);
1543 return 0;
1544 }
1545
1546 spin_lock(&khugepaged_mm_lock);
1547 insert_to_mm_slots_hash(mm, mm_slot);
1548 /*
1549 * Insert just behind the scanning cursor, to let the area settle
1550 * down a little.
1551 */
1552 wakeup = list_empty(&khugepaged_scan.mm_head);
1553 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1554 spin_unlock(&khugepaged_mm_lock);
1555
1556 atomic_inc(&mm->mm_count);
1557 if (wakeup)
1558 wake_up_interruptible(&khugepaged_wait);
1559
1560 return 0;
1561}
1562
1563int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1564{
1565 unsigned long hstart, hend;
1566 if (!vma->anon_vma)
1567 /*
1568 * Not yet faulted in so we will register later in the
1569 * page fault if needed.
1570 */
1571 return 0;
1572 if (vma->vm_ops)
1573 /* khugepaged not yet working on file or special mappings */
1574 return 0;
1575 /*
1576 * If is_pfn_mapping() is true, is_linear_pfn_mapping() must be
1577 * true too; verify it here.
1578 */
1579 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1580 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1581 hend = vma->vm_end & HPAGE_PMD_MASK;
1582 if (hstart < hend)
1583 return khugepaged_enter(vma);
1584 return 0;
1585}
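/*
 * Worked example for the rounding above, assuming HPAGE_PMD_SIZE is
 * 2MB (x86 with 4KB pages): a vma spanning 0x00201000 - 0x00a00000
 * gives hstart = 0x00400000 (vm_start rounded up) and hend =
 * 0x00a00000 (vm_end rounded down), i.e. three aligned 2MB ranges
 * khugepaged could collapse. If the rounding leaves hstart >= hend
 * the vma simply cannot hold a hugepage and is not registered here.
 */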
1586
1587void __khugepaged_exit(struct mm_struct *mm)
1588{
1589 struct mm_slot *mm_slot;
1590 int free = 0;
1591
1592 spin_lock(&khugepaged_mm_lock);
1593 mm_slot = get_mm_slot(mm);
1594 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1595 hlist_del(&mm_slot->hash);
1596 list_del(&mm_slot->mm_node);
1597 free = 1;
1598 }
1599
1600 if (free) {
1601 spin_unlock(&khugepaged_mm_lock);
1602 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1603 free_mm_slot(mm_slot);
1604 mmdrop(mm);
1605 } else if (mm_slot) {
1606 spin_unlock(&khugepaged_mm_lock);
1607 /*
1608 * This is required to serialize against
1609 * khugepaged_test_exit() (which is guaranteed to run
1610 * under mmap_sem read mode). Stop here (after we
1611 * return, all pagetables will be destroyed) until
1612 * khugepaged has finished working on the pagetables
1613 * under the mmap_sem.
1614 */
1615 down_write(&mm->mmap_sem);
1616 up_write(&mm->mmap_sem);
1617 } else
1618 spin_unlock(&khugepaged_mm_lock);
1619}
1620
1621static void release_pte_page(struct page *page)
1622{
1623 /* 0 stands for page_is_file_cache(page) == false */
1624 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1625 unlock_page(page);
1626 putback_lru_page(page);
1627}
1628
1629static void release_pte_pages(pte_t *pte, pte_t *_pte)
1630{
1631 while (--_pte >= pte) {
1632 pte_t pteval = *_pte;
1633 if (!pte_none(pteval))
1634 release_pte_page(pte_page(pteval));
1635 }
1636}
1637
1638static void release_all_pte_pages(pte_t *pte)
1639{
1640 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1641}
1642
1643static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1644 unsigned long address,
1645 pte_t *pte)
1646{
1647 struct page *page;
1648 pte_t *_pte;
1649 int referenced = 0, isolated = 0, none = 0;
1650 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1651 _pte++, address += PAGE_SIZE) {
1652 pte_t pteval = *_pte;
1653 if (pte_none(pteval)) {
1654 if (++none <= khugepaged_max_ptes_none)
1655 continue;
1656 else {
1657 release_pte_pages(pte, _pte);
1658 goto out;
1659 }
1660 }
1661 if (!pte_present(pteval) || !pte_write(pteval)) {
1662 release_pte_pages(pte, _pte);
1663 goto out;
1664 }
1665 page = vm_normal_page(vma, address, pteval);
1666 if (unlikely(!page)) {
1667 release_pte_pages(pte, _pte);
1668 goto out;
1669 }
1670 VM_BUG_ON(PageCompound(page));
1671 BUG_ON(!PageAnon(page));
1672 VM_BUG_ON(!PageSwapBacked(page));
1673
1674 /* cannot use mapcount: can't collapse if there's a gup pin */
1675 if (page_count(page) != 1) {
1676 release_pte_pages(pte, _pte);
1677 goto out;
1678 }
1679 /*
1680 * We can do it before isolate_lru_page because the
1681 * page can't be freed from under us. NOTE: PG_lock
1682 * is needed to serialize against split_huge_page
1683 * when invoked from the VM.
1684 */
1685 if (!trylock_page(page)) {
1686 release_pte_pages(pte, _pte);
1687 goto out;
1688 }
1689 /*
1690 * Isolate the page to avoid collapsing a hugepage
1691 * currently in use by the VM.
1692 */
1693 if (isolate_lru_page(page)) {
1694 unlock_page(page);
1695 release_pte_pages(pte, _pte);
1696 goto out;
1697 }
1698 /* 0 stands for page_is_file_cache(page) == false */
1699 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
1700 VM_BUG_ON(!PageLocked(page));
1701 VM_BUG_ON(PageLRU(page));
1702
1703 /* If there is no mapped pte young don't collapse the page */
1704 if (pte_young(pteval) || PageReferenced(page) ||
1705 mmu_notifier_test_young(vma->vm_mm, address))
1706 referenced = 1;
1707 }
1708 if (unlikely(!referenced))
1709 release_all_pte_pages(pte);
1710 else
1711 isolated = 1;
1712out:
1713 return isolated;
1714}
1715
1716static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1717 struct vm_area_struct *vma,
1718 unsigned long address,
1719 spinlock_t *ptl)
1720{
1721 pte_t *_pte;
1722 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
1723 pte_t pteval = *_pte;
1724 struct page *src_page;
1725
1726 if (pte_none(pteval)) {
1727 clear_user_highpage(page, address);
1728 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
1729 } else {
1730 src_page = pte_page(pteval);
1731 copy_user_highpage(page, src_page, address, vma);
1732 VM_BUG_ON(page_mapcount(src_page) != 1);
1733 VM_BUG_ON(page_count(src_page) != 2);
1734 release_pte_page(src_page);
1735 /*
1736 * ptl mostly unnecessary, but preempt has to
1737 * be disabled to update the per-cpu stats
1738 * inside page_remove_rmap().
1739 */
1740 spin_lock(ptl);
1741 /*
1742 * paravirt calls inside pte_clear here are
1743 * superfluous.
1744 */
1745 pte_clear(vma->vm_mm, address, _pte);
1746 page_remove_rmap(src_page);
1747 spin_unlock(ptl);
1748 free_page_and_swap_cache(src_page);
1749 }
1750
1751 address += PAGE_SIZE;
1752 page++;
1753 }
1754}
1755
1756static void collapse_huge_page(struct mm_struct *mm,
1757 unsigned long address,
1758 struct page **hpage,
1759 struct vm_area_struct *vma,
1760 int node)
1761{
1762 pgd_t *pgd;
1763 pud_t *pud;
1764 pmd_t *pmd, _pmd;
1765 pte_t *pte;
1766 pgtable_t pgtable;
1767 struct page *new_page;
1768 spinlock_t *ptl;
1769 int isolated;
1770 unsigned long hstart, hend;
1771
1772 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1773#ifndef CONFIG_NUMA
1774 up_read(&mm->mmap_sem);
1775 VM_BUG_ON(!*hpage);
1776 new_page = *hpage;
1777#else
1778 VM_BUG_ON(*hpage);
1779 /*
1780 * Allocate the page while the vma is still valid and under
1781 * the mmap_sem read mode so there is no memory allocation
1782 * later when we take the mmap_sem in write mode. This is more
1783 * friendly behavior (OTOH it may actually hide bugs) towards
1784 * userland filesystems whose daemons allocate memory in
1785 * the userland I/O paths. Allocating memory with the
1786 * mmap_sem held only in read mode is also a good idea to allow greater
1787 * scalability.
1788 */
1789 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1790 node, __GFP_OTHER_NODE);
1791
1792 /*
1793 * After allocating the hugepage, release the mmap_sem read lock in
1794 * preparation for taking it in write mode.
1795 */
1796 up_read(&mm->mmap_sem);
1797 if (unlikely(!new_page)) {
1798 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1799 *hpage = ERR_PTR(-ENOMEM);
1800 return;
1801 }
1802#endif
1803
1804 count_vm_event(THP_COLLAPSE_ALLOC);
1805 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1806#ifdef CONFIG_NUMA
1807 put_page(new_page);
1808#endif
1809 return;
1810 }
1811
1812 /*
1813 * Prevent all access to pagetables with the exception of
1814 * gup_fast, later handled by the ptep_clear_flush, and the VM,
1815 * handled by the anon_vma lock + PG_lock.
1816 */
1817 down_write(&mm->mmap_sem);
1818 if (unlikely(khugepaged_test_exit(mm)))
1819 goto out;
1820
1821 vma = find_vma(mm, address);
1822 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1823 hend = vma->vm_end & HPAGE_PMD_MASK;
1824 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1825 goto out;
1826
1827 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1828 (vma->vm_flags & VM_NOHUGEPAGE))
1829 goto out;
1830
1831 if (!vma->anon_vma || vma->vm_ops)
1832 goto out;
1833 if (is_vma_temporary_stack(vma))
1834 goto out;
1835 /*
1836 * If is_pfn_mapping() is true, is_linear_pfn_mapping() must be
1837 * true too; verify it here.
1838 */
1839 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1840
1841 pgd = pgd_offset(mm, address);
1842 if (!pgd_present(*pgd))
1843 goto out;
1844
1845 pud = pud_offset(pgd, address);
1846 if (!pud_present(*pud))
1847 goto out;
1848
1849 pmd = pmd_offset(pud, address);
1850 /* pmd can't go away or become huge under us */
1851 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1852 goto out;
1853
1854 anon_vma_lock(vma->anon_vma);
1855
1856 pte = pte_offset_map(pmd, address);
1857 ptl = pte_lockptr(mm, pmd);
1858
1859 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1860 /*
1861 * After this gup_fast can't run anymore. This also removes
1862 * any huge TLB entry from the CPU so we won't allow
1863 * huge and small TLB entries for the same virtual address
1864 * to avoid the risk of CPU bugs in that area.
1865 */
1866 _pmd = pmdp_clear_flush_notify(vma, address, pmd);
1867 spin_unlock(&mm->page_table_lock);
1868
1869 spin_lock(ptl);
1870 isolated = __collapse_huge_page_isolate(vma, address, pte);
1871 spin_unlock(ptl);
1872
1873 if (unlikely(!isolated)) {
1874 pte_unmap(pte);
1875 spin_lock(&mm->page_table_lock);
1876 BUG_ON(!pmd_none(*pmd));
1877 set_pmd_at(mm, address, pmd, _pmd);
1878 spin_unlock(&mm->page_table_lock);
1879 anon_vma_unlock(vma->anon_vma);
1880 goto out;
1881 }
1882
1883 /*
1884 * All pages are isolated and locked so anon_vma rmap
1885 * can't run anymore.
1886 */
1887 anon_vma_unlock(vma->anon_vma);
1888
1889 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
1890 pte_unmap(pte);
1891 __SetPageUptodate(new_page);
1892 pgtable = pmd_pgtable(_pmd);
1893 VM_BUG_ON(page_count(pgtable) != 1);
1894 VM_BUG_ON(page_mapcount(pgtable) != 0);
1895
1896 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1897 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1898 _pmd = pmd_mkhuge(_pmd);
1899
1900 /*
1901 * spin_lock() below is not the equivalent of smp_wmb(), so
1902 * this is needed to avoid the copy_huge_page writes becoming
1903 * visible after the set_pmd_at() write.
1904 */
1905 smp_wmb();
1906
1907 spin_lock(&mm->page_table_lock);
1908 BUG_ON(!pmd_none(*pmd));
1909 page_add_new_anon_rmap(new_page, vma, address);
1910 set_pmd_at(mm, address, pmd, _pmd);
1911 update_mmu_cache(vma, address, entry);
1912 prepare_pmd_huge_pte(pgtable, mm);
1913 mm->nr_ptes--;
1914 spin_unlock(&mm->page_table_lock);
1915
1916#ifndef CONFIG_NUMA
1917 *hpage = NULL;
1918#endif
1919 khugepaged_pages_collapsed++;
1920out_up_write:
1921 up_write(&mm->mmap_sem);
1922 return;
1923
1924out:
1925 mem_cgroup_uncharge_page(new_page);
1926#ifdef CONFIG_NUMA
1927 put_page(new_page);
1928#endif
1929 goto out_up_write;
1930}
1931
1932static int khugepaged_scan_pmd(struct mm_struct *mm,
1933 struct vm_area_struct *vma,
1934 unsigned long address,
1935 struct page **hpage)
1936{
1937 pgd_t *pgd;
1938 pud_t *pud;
1939 pmd_t *pmd;
1940 pte_t *pte, *_pte;
1941 int ret = 0, referenced = 0, none = 0;
1942 struct page *page;
1943 unsigned long _address;
1944 spinlock_t *ptl;
1945 int node = -1;
1946
1947 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1948
1949 pgd = pgd_offset(mm, address);
1950 if (!pgd_present(*pgd))
1951 goto out;
1952
1953 pud = pud_offset(pgd, address);
1954 if (!pud_present(*pud))
1955 goto out;
1956
1957 pmd = pmd_offset(pud, address);
1958 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1959 goto out;
1960
1961 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1962 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1963 _pte++, _address += PAGE_SIZE) {
1964 pte_t pteval = *_pte;
1965 if (pte_none(pteval)) {
1966 if (++none <= khugepaged_max_ptes_none)
1967 continue;
1968 else
1969 goto out_unmap;
1970 }
1971 if (!pte_present(pteval) || !pte_write(pteval))
1972 goto out_unmap;
1973 page = vm_normal_page(vma, _address, pteval);
1974 if (unlikely(!page))
1975 goto out_unmap;
1976 /*
1977 * Choose the node of the first page. This could
1978 * be more sophisticated and look at more pages,
1979 * but isn't for now.
1980 */
1981 if (node == -1)
1982 node = page_to_nid(page);
1983 VM_BUG_ON(PageCompound(page));
1984 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1985 goto out_unmap;
1986 /* cannot use mapcount: can't collapse if there's a gup pin */
1987 if (page_count(page) != 1)
1988 goto out_unmap;
1989 if (pte_young(pteval) || PageReferenced(page) ||
1990 mmu_notifier_test_young(vma->vm_mm, address))
1991 referenced = 1;
1992 }
1993 if (referenced)
1994 ret = 1;
1995out_unmap:
1996 pte_unmap_unlock(pte, ptl);
1997 if (ret)
1998 /* collapse_huge_page will return with the mmap_sem released */
1999 collapse_huge_page(mm, address, hpage, vma, node);
2000out:
2001 return ret;
2002}
2003
2004static void collect_mm_slot(struct mm_slot *mm_slot)
2005{
2006 struct mm_struct *mm = mm_slot->mm;
2007
2008 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
2009
2010 if (khugepaged_test_exit(mm)) {
2011 /* free mm_slot */
2012 hlist_del(&mm_slot->hash);
2013 list_del(&mm_slot->mm_node);
2014
2015 /*
2016 * Not strictly needed because the mm exited already.
2017 *
2018 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
2019 */
2020
2021 /* khugepaged_mm_lock actually not necessary for the below */
2022 free_mm_slot(mm_slot);
2023 mmdrop(mm);
2024 }
2025}
2026
2027static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2028 struct page **hpage)
2029{
2030 struct mm_slot *mm_slot;
2031 struct mm_struct *mm;
2032 struct vm_area_struct *vma;
2033 int progress = 0;
2034
2035 VM_BUG_ON(!pages);
2036 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
2037
2038 if (khugepaged_scan.mm_slot)
2039 mm_slot = khugepaged_scan.mm_slot;
2040 else {
2041 mm_slot = list_entry(khugepaged_scan.mm_head.next,
2042 struct mm_slot, mm_node);
2043 khugepaged_scan.address = 0;
2044 khugepaged_scan.mm_slot = mm_slot;
2045 }
2046 spin_unlock(&khugepaged_mm_lock);
2047
2048 mm = mm_slot->mm;
2049 down_read(&mm->mmap_sem);
2050 if (unlikely(khugepaged_test_exit(mm)))
2051 vma = NULL;
2052 else
2053 vma = find_vma(mm, khugepaged_scan.address);
2054
2055 progress++;
2056 for (; vma; vma = vma->vm_next) {
2057 unsigned long hstart, hend;
2058
2059 cond_resched();
2060 if (unlikely(khugepaged_test_exit(mm))) {
2061 progress++;
2062 break;
2063 }
2064
2065 if ((!(vma->vm_flags & VM_HUGEPAGE) &&
2066 !khugepaged_always()) ||
2067 (vma->vm_flags & VM_NOHUGEPAGE)) {
2068 skip:
2069 progress++;
2070 continue;
2071 }
2072 if (!vma->anon_vma || vma->vm_ops)
2073 goto skip;
2074 if (is_vma_temporary_stack(vma))
2075 goto skip;
2076 /*
2077 * If is_pfn_mapping() is true, is_linear_pfn_mapping()
2078 * must be true too; verify it here.
2079 */
2080 VM_BUG_ON(is_linear_pfn_mapping(vma) ||
2081 vma->vm_flags & VM_NO_THP);
2082
2083 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2084 hend = vma->vm_end & HPAGE_PMD_MASK;
2085 if (hstart >= hend)
2086 goto skip;
2087 if (khugepaged_scan.address > hend)
2088 goto skip;
2089 if (khugepaged_scan.address < hstart)
2090 khugepaged_scan.address = hstart;
2091 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2092
2093 while (khugepaged_scan.address < hend) {
2094 int ret;
2095 cond_resched();
2096 if (unlikely(khugepaged_test_exit(mm)))
2097 goto breakouterloop;
2098
2099 VM_BUG_ON(khugepaged_scan.address < hstart ||
2100 khugepaged_scan.address + HPAGE_PMD_SIZE >
2101 hend);
2102 ret = khugepaged_scan_pmd(mm, vma,
2103 khugepaged_scan.address,
2104 hpage);
2105 /* move to next address */
2106 khugepaged_scan.address += HPAGE_PMD_SIZE;
2107 progress += HPAGE_PMD_NR;
2108 if (ret)
2109 /* we released mmap_sem so break loop */
2110 goto breakouterloop_mmap_sem;
2111 if (progress >= pages)
2112 goto breakouterloop;
2113 }
2114 }
2115breakouterloop:
2116 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2117breakouterloop_mmap_sem:
2118
2119 spin_lock(&khugepaged_mm_lock);
2120 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2121 /*
2122 * Release the current mm_slot if this mm is about to die, or
2123 * if we scanned all vmas of this mm.
2124 */
2125 if (khugepaged_test_exit(mm) || !vma) {
2126 /*
2127 * Make sure that if mm_users is reaching zero while
2128 * khugepaged runs here, khugepaged_exit will find
2129 * mm_slot not pointing to the exiting mm.
2130 */
2131 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2132 khugepaged_scan.mm_slot = list_entry(
2133 mm_slot->mm_node.next,
2134 struct mm_slot, mm_node);
2135 khugepaged_scan.address = 0;
2136 } else {
2137 khugepaged_scan.mm_slot = NULL;
2138 khugepaged_full_scans++;
2139 }
2140
2141 collect_mm_slot(mm_slot);
2142 }
2143
2144 return progress;
2145}
2146
2147static int khugepaged_has_work(void)
2148{
2149 return !list_empty(&khugepaged_scan.mm_head) &&
2150 khugepaged_enabled();
2151}
2152
2153static int khugepaged_wait_event(void)
2154{
2155 return !list_empty(&khugepaged_scan.mm_head) ||
2156 !khugepaged_enabled();
2157}
2158
2159static void khugepaged_do_scan(struct page **hpage)
2160{
2161 unsigned int progress = 0, pass_through_head = 0;
2162 unsigned int pages = khugepaged_pages_to_scan;
2163
2164 barrier(); /* write khugepaged_pages_to_scan to local stack */
2165
2166 while (progress < pages) {
2167 cond_resched();
2168
2169#ifndef CONFIG_NUMA
2170 if (!*hpage) {
2171 *hpage = alloc_hugepage(khugepaged_defrag());
2172 if (unlikely(!*hpage)) {
2173 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2174 break;
2175 }
2176 count_vm_event(THP_COLLAPSE_ALLOC);
2177 }
2178#else
2179 if (IS_ERR(*hpage))
2180 break;
2181#endif
2182
2183 if (unlikely(kthread_should_stop() || freezing(current)))
2184 break;
2185
2186 spin_lock(&khugepaged_mm_lock);
2187 if (!khugepaged_scan.mm_slot)
2188 pass_through_head++;
2189 if (khugepaged_has_work() &&
2190 pass_through_head < 2)
2191 progress += khugepaged_scan_mm_slot(pages - progress,
2192 hpage);
2193 else
2194 progress = pages;
2195 spin_unlock(&khugepaged_mm_lock);
2196 }
2197}
2198
2199static void khugepaged_alloc_sleep(void)
2200{
2201 DEFINE_WAIT(wait);
2202 add_wait_queue(&khugepaged_wait, &wait);
2203 schedule_timeout_interruptible(
2204 msecs_to_jiffies(
2205 khugepaged_alloc_sleep_millisecs));
2206 remove_wait_queue(&khugepaged_wait, &wait);
2207}
2208
2209#ifndef CONFIG_NUMA
2210static struct page *khugepaged_alloc_hugepage(void)
2211{
2212 struct page *hpage;
2213
2214 do {
2215 hpage = alloc_hugepage(khugepaged_defrag());
2216 if (!hpage) {
2217 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2218 khugepaged_alloc_sleep();
2219 } else
2220 count_vm_event(THP_COLLAPSE_ALLOC);
2221 } while (unlikely(!hpage) &&
2222 likely(khugepaged_enabled()));
2223 return hpage;
2224}
2225#endif
2226
2227static void khugepaged_loop(void)
2228{
2229 struct page *hpage;
2230
2231#ifdef CONFIG_NUMA
2232 hpage = NULL;
2233#endif
2234 while (likely(khugepaged_enabled())) {
2235#ifndef CONFIG_NUMA
2236 hpage = khugepaged_alloc_hugepage();
2237 if (unlikely(!hpage))
2238 break;
2239#else
2240 if (IS_ERR(hpage)) {
2241 khugepaged_alloc_sleep();
2242 hpage = NULL;
2243 }
2244#endif
2245
2246 khugepaged_do_scan(&hpage);
2247#ifndef CONFIG_NUMA
2248 if (hpage)
2249 put_page(hpage);
2250#endif
2251 try_to_freeze();
2252 if (unlikely(kthread_should_stop()))
2253 break;
2254 if (khugepaged_has_work()) {
2255 DEFINE_WAIT(wait);
2256 if (!khugepaged_scan_sleep_millisecs)
2257 continue;
2258 add_wait_queue(&khugepaged_wait, &wait);
2259 schedule_timeout_interruptible(
2260 msecs_to_jiffies(
2261 khugepaged_scan_sleep_millisecs));
2262 remove_wait_queue(&khugepaged_wait, &wait);
2263 } else if (khugepaged_enabled())
2264 wait_event_freezable(khugepaged_wait,
2265 khugepaged_wait_event());
2266 }
2267}
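/*
 * Putting khugepaged_do_scan() and khugepaged_loop() together: each
 * pass scans up to pages_to_scan worth of ptes across the registered
 * mms, then the daemon sleeps for scan_sleep_millisecs before the next
 * pass (or for alloc_sleep_millisecs after a failed hugepage
 * allocation), so the sysfs knobs directly bound how much work
 * khugepaged does per wakeup.
 */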
2268
2269static int khugepaged(void *none)
2270{
2271 struct mm_slot *mm_slot;
2272
2273 set_freezable();
2274 set_user_nice(current, 19);
2275
2276 /* serialize with start_khugepaged() */
2277 mutex_lock(&khugepaged_mutex);
2278
2279 for (;;) {
2280 mutex_unlock(&khugepaged_mutex);
2281 VM_BUG_ON(khugepaged_thread != current);
2282 khugepaged_loop();
2283 VM_BUG_ON(khugepaged_thread != current);
2284
2285 mutex_lock(&khugepaged_mutex);
2286 if (!khugepaged_enabled())
2287 break;
2288 if (unlikely(kthread_should_stop()))
2289 break;
2290 }
2291
2292 spin_lock(&khugepaged_mm_lock);
2293 mm_slot = khugepaged_scan.mm_slot;
2294 khugepaged_scan.mm_slot = NULL;
2295 if (mm_slot)
2296 collect_mm_slot(mm_slot);
2297 spin_unlock(&khugepaged_mm_lock);
2298
2299 khugepaged_thread = NULL;
2300 mutex_unlock(&khugepaged_mutex);
2301
2302 return 0;
2303}
2304
2305void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2306{
2307 struct page *page;
2308
2309 spin_lock(&mm->page_table_lock);
2310 if (unlikely(!pmd_trans_huge(*pmd))) {
2311 spin_unlock(&mm->page_table_lock);
2312 return;
2313 }
2314 page = pmd_page(*pmd);
2315 VM_BUG_ON(!page_count(page));
2316 get_page(page);
2317 spin_unlock(&mm->page_table_lock);
2318
2319 split_huge_page(page);
2320
2321 put_page(page);
2322 BUG_ON(pmd_trans_huge(*pmd));
2323}
2324
2325static void split_huge_page_address(struct mm_struct *mm,
2326 unsigned long address)
2327{
2328 pgd_t *pgd;
2329 pud_t *pud;
2330 pmd_t *pmd;
2331
2332 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2333
2334 pgd = pgd_offset(mm, address);
2335 if (!pgd_present(*pgd))
2336 return;
2337
2338 pud = pud_offset(pgd, address);
2339 if (!pud_present(*pud))
2340 return;
2341
2342 pmd = pmd_offset(pud, address);
2343 if (!pmd_present(*pmd))
2344 return;
2345 /*
2346 * Caller holds the mmap_sem in write mode, so a huge pmd cannot
2347 * materialize from under us.
2348 */
2349 split_huge_page_pmd(mm, pmd);
2350}
2351
2352void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2353 unsigned long start,
2354 unsigned long end,
2355 long adjust_next)
2356{
2357 /*
2358 * If the new start address isn't hpage aligned and it could
2359 * previously contain a hugepage: check if we need to split
2360 * a huge pmd.
2361 */
2362 if (start & ~HPAGE_PMD_MASK &&
2363 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2364 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2365 split_huge_page_address(vma->vm_mm, start);
2366
2367 /*
2368 * If the new end address isn't hpage aligned and it could
2369 * previously contain a hugepage: check if we need to split
2370 * a huge pmd.
2371 */
2372 if (end & ~HPAGE_PMD_MASK &&
2373 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2374 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2375 split_huge_page_address(vma->vm_mm, end);
2376
2377 /*
2378 * If we're also updating vma->vm_next->vm_start, and the new
2379 * vm_next->vm_start isn't hpage aligned and it could previously
2380 * contain a hugepage: check if we need to split a huge pmd.
2381 */
2382 if (adjust_next > 0) {
2383 struct vm_area_struct *next = vma->vm_next;
2384 unsigned long nstart = next->vm_start;
2385 nstart += adjust_next << PAGE_SHIFT;
2386 if (nstart & ~HPAGE_PMD_MASK &&
2387 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2388 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2389 split_huge_page_address(next->vm_mm, nstart);
2390 }
2391}
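The three checks in __vma_adjust_trans_huge() above all apply the same test: an address that is not HPAGE_PMD-aligned, but whose surrounding PMD-sized range still fits inside the VMA, may sit in the middle of a huge pmd that has to be split before the VMA is resized. A minimal restatement of that test as a hypothetical helper (not part of the patch, which open-codes it three times) could look like:

static bool addr_splits_huge_pmd(struct vm_area_struct *vma,
				 unsigned long addr)
{
	unsigned long haddr = addr & HPAGE_PMD_MASK;

	/* only an unaligned address can cut a huge pmd in half */
	if (!(addr & ~HPAGE_PMD_MASK))
		return false;
	/* the whole PMD-sized range must lie inside this vma */
	return haddr >= vma->vm_start &&
	       haddr + HPAGE_PMD_SIZE <= vma->vm_end;
}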
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..bfcf153bc829 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -146,7 +146,7 @@ static long region_chg(struct list_head *head, long f, long t)
146 if (rg->from > t) 146 if (rg->from > t)
147 return chg; 147 return chg;
148 148
149 /* We overlap with this area, if it extends futher than 149 /* We overlap with this area, if it extends further than
150 * us then we must extend ourselves. Account for its 150 * us then we must extend ourselves. Account for its
151 * existing reservation. */ 151 * existing reservation. */
152 if (rg->to > t) { 152 if (rg->to > t) {
@@ -394,67 +394,37 @@ static int vma_has_reserves(struct vm_area_struct *vma)
394 return 0; 394 return 0;
395} 395}
396 396
397static void clear_gigantic_page(struct page *page, 397static void copy_gigantic_page(struct page *dst, struct page *src)
398 unsigned long addr, unsigned long sz)
399{ 398{
400 int i; 399 int i;
401 struct page *p = page; 400 struct hstate *h = page_hstate(src);
402
403 might_sleep();
404 for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
405 cond_resched();
406 clear_user_highpage(p, addr + i * PAGE_SIZE);
407 }
408}
409static void clear_huge_page(struct page *page,
410 unsigned long addr, unsigned long sz)
411{
412 int i;
413
414 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
415 clear_gigantic_page(page, addr, sz);
416 return;
417 }
418
419 might_sleep();
420 for (i = 0; i < sz/PAGE_SIZE; i++) {
421 cond_resched();
422 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
423 }
424}
425
426static void copy_gigantic_page(struct page *dst, struct page *src,
427 unsigned long addr, struct vm_area_struct *vma)
428{
429 int i;
430 struct hstate *h = hstate_vma(vma);
431 struct page *dst_base = dst; 401 struct page *dst_base = dst;
432 struct page *src_base = src; 402 struct page *src_base = src;
433 might_sleep(); 403
434 for (i = 0; i < pages_per_huge_page(h); ) { 404 for (i = 0; i < pages_per_huge_page(h); ) {
435 cond_resched(); 405 cond_resched();
436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); 406 copy_highpage(dst, src);
437 407
438 i++; 408 i++;
439 dst = mem_map_next(dst, dst_base, i); 409 dst = mem_map_next(dst, dst_base, i);
440 src = mem_map_next(src, src_base, i); 410 src = mem_map_next(src, src_base, i);
441 } 411 }
442} 412}
443static void copy_huge_page(struct page *dst, struct page *src, 413
444 unsigned long addr, struct vm_area_struct *vma) 414void copy_huge_page(struct page *dst, struct page *src)
445{ 415{
446 int i; 416 int i;
447 struct hstate *h = hstate_vma(vma); 417 struct hstate *h = page_hstate(src);
448 418
449 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { 419 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
450 copy_gigantic_page(dst, src, addr, vma); 420 copy_gigantic_page(dst, src);
451 return; 421 return;
452 } 422 }
453 423
454 might_sleep(); 424 might_sleep();
455 for (i = 0; i < pages_per_huge_page(h); i++) { 425 for (i = 0; i < pages_per_huge_page(h); i++) {
456 cond_resched(); 426 cond_resched();
457 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 427 copy_highpage(dst + i, src + i);
458 } 428 }
459} 429}
460 430
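The user-copy variants clear_huge_page() and copy_user_huge_page() disappear from this file; later hunks call them with a pages_per_huge_page(h) argument, so they presumably now live in common code. The retained copy_huge_page() derives the hstate from the source page and needs only the two pages, e.g. (illustrative call, not taken from this hunk):

	/* dst_hpage and src_hpage are huge pages of the same hstate */
	copy_huge_page(dst_hpage, src_hpage);	/* kernel-to-kernel copy, no vma or address */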
@@ -466,11 +436,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
466 h->free_huge_pages_node[nid]++; 436 h->free_huge_pages_node[nid]++;
467} 437}
468 438
439static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
440{
441 struct page *page;
442
443 if (list_empty(&h->hugepage_freelists[nid]))
444 return NULL;
445 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
446 list_del(&page->lru);
447 set_page_refcounted(page);
448 h->free_huge_pages--;
449 h->free_huge_pages_node[nid]--;
450 return page;
451}
452
469static struct page *dequeue_huge_page_vma(struct hstate *h, 453static struct page *dequeue_huge_page_vma(struct hstate *h,
470 struct vm_area_struct *vma, 454 struct vm_area_struct *vma,
471 unsigned long address, int avoid_reserve) 455 unsigned long address, int avoid_reserve)
472{ 456{
473 int nid;
474 struct page *page = NULL; 457 struct page *page = NULL;
475 struct mempolicy *mpol; 458 struct mempolicy *mpol;
476 nodemask_t *nodemask; 459 nodemask_t *nodemask;
@@ -492,23 +475,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
492 475
493 /* If reserves cannot be used, ensure enough pages are in the pool */ 476 /* If reserves cannot be used, ensure enough pages are in the pool */
494 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 477 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
495 goto err;; 478 goto err;
496 479
497 for_each_zone_zonelist_nodemask(zone, z, zonelist, 480 for_each_zone_zonelist_nodemask(zone, z, zonelist,
498 MAX_NR_ZONES - 1, nodemask) { 481 MAX_NR_ZONES - 1, nodemask) {
499 nid = zone_to_nid(zone); 482 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
500 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 483 page = dequeue_huge_page_node(h, zone_to_nid(zone));
501 !list_empty(&h->hugepage_freelists[nid])) { 484 if (page) {
502 page = list_entry(h->hugepage_freelists[nid].next, 485 if (!avoid_reserve)
503 struct page, lru); 486 decrement_hugepage_resv_vma(h, vma);
504 list_del(&page->lru); 487 break;
505 h->free_huge_pages--; 488 }
506 h->free_huge_pages_node[nid]--;
507
508 if (!avoid_reserve)
509 decrement_hugepage_resv_vma(h, vma);
510
511 break;
512 } 489 }
513 } 490 }
514err: 491err:
@@ -770,11 +747,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
770 return ret; 747 return ret;
771} 748}
772 749
773static struct page *alloc_buddy_huge_page(struct hstate *h, 750static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
774 struct vm_area_struct *vma, unsigned long address)
775{ 751{
776 struct page *page; 752 struct page *page;
777 unsigned int nid; 753 unsigned int r_nid;
778 754
779 if (h->order >= MAX_ORDER) 755 if (h->order >= MAX_ORDER)
780 return NULL; 756 return NULL;
@@ -812,9 +788,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
812 } 788 }
813 spin_unlock(&hugetlb_lock); 789 spin_unlock(&hugetlb_lock);
814 790
815 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 791 if (nid == NUMA_NO_NODE)
816 __GFP_REPEAT|__GFP_NOWARN, 792 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
817 huge_page_order(h)); 793 __GFP_REPEAT|__GFP_NOWARN,
794 huge_page_order(h));
795 else
796 page = alloc_pages_exact_node(nid,
797 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
798 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
818 799
819 if (page && arch_prepare_hugepage(page)) { 800 if (page && arch_prepare_hugepage(page)) {
820 __free_pages(page, huge_page_order(h)); 801 __free_pages(page, huge_page_order(h));
@@ -823,19 +804,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
823 804
824 spin_lock(&hugetlb_lock); 805 spin_lock(&hugetlb_lock);
825 if (page) { 806 if (page) {
826 /* 807 r_nid = page_to_nid(page);
827 * This page is now managed by the hugetlb allocator and has
828 * no users -- drop the buddy allocator's reference.
829 */
830 put_page_testzero(page);
831 VM_BUG_ON(page_count(page));
832 nid = page_to_nid(page);
833 set_compound_page_dtor(page, free_huge_page); 808 set_compound_page_dtor(page, free_huge_page);
834 /* 809 /*
835 * We incremented the global counters already 810 * We incremented the global counters already
836 */ 811 */
837 h->nr_huge_pages_node[nid]++; 812 h->nr_huge_pages_node[r_nid]++;
838 h->surplus_huge_pages_node[nid]++; 813 h->surplus_huge_pages_node[r_nid]++;
839 __count_vm_event(HTLB_BUDDY_PGALLOC); 814 __count_vm_event(HTLB_BUDDY_PGALLOC);
840 } else { 815 } else {
841 h->nr_huge_pages--; 816 h->nr_huge_pages--;
@@ -848,7 +823,26 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
848} 823}
849 824
850/* 825/*
851 * Increase the hugetlb pool such that it can accomodate a reservation 826 * This allocation function is useful in the context where vma is irrelevant.
827 * E.g. soft-offlining uses this function because it only cares physical
828 * address of error page.
829 */
830struct page *alloc_huge_page_node(struct hstate *h, int nid)
831{
832 struct page *page;
833
834 spin_lock(&hugetlb_lock);
835 page = dequeue_huge_page_node(h, nid);
836 spin_unlock(&hugetlb_lock);
837
838 if (!page)
839 page = alloc_buddy_huge_page(h, nid);
840
841 return page;
842}
843
844/*
845 * Increase the hugetlb pool such that it can accommodate a reservation
852 * of size 'delta'. 846 * of size 'delta'.
853 */ 847 */
854static int gather_surplus_pages(struct hstate *h, int delta) 848static int gather_surplus_pages(struct hstate *h, int delta)
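alloc_huge_page_node() gives vma-less callers such as soft-offlining a way to get a huge page on a specific node: it first tries that node's free list and only then falls back to the buddy allocator. A hedged sketch of a caller, assuming hpage is the page being handled (variable names are placeholders, not from the patch):

	int nid = page_to_nid(hpage);
	struct page *new_hpage;

	/* illustrative only: real callers also decide on retry policy */
	new_hpage = alloc_huge_page_node(page_hstate(hpage), nid);
	if (!new_hpage)
		return -ENOMEM;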
@@ -871,17 +865,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
871retry: 865retry:
872 spin_unlock(&hugetlb_lock); 866 spin_unlock(&hugetlb_lock);
873 for (i = 0; i < needed; i++) { 867 for (i = 0; i < needed; i++) {
874 page = alloc_buddy_huge_page(h, NULL, 0); 868 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
875 if (!page) { 869 if (!page)
876 /* 870 /*
877 * We were not able to allocate enough pages to 871 * We were not able to allocate enough pages to
878 * satisfy the entire reservation so we free what 872 * satisfy the entire reservation so we free what
879 * we've allocated so far. 873 * we've allocated so far.
880 */ 874 */
881 spin_lock(&hugetlb_lock);
882 needed = 0;
883 goto free; 875 goto free;
884 }
885 876
886 list_add(&page->lru, &surplus_list); 877 list_add(&page->lru, &surplus_list);
887 } 878 }
@@ -899,7 +890,7 @@ retry:
899 890
900 /* 891 /*
901 * The surplus_list now contains _at_least_ the number of extra pages 892 * The surplus_list now contains _at_least_ the number of extra pages
902 * needed to accomodate the reservation. Add the appropriate number 893 * needed to accommodate the reservation. Add the appropriate number
903 * of pages to the hugetlb pool and free the extras back to the buddy 894 * of pages to the hugetlb pool and free the extras back to the buddy
904 * allocator. Commit the entire reservation here to prevent another 895 * allocator. Commit the entire reservation here to prevent another
905 * process from stealing the pages as they are added to the pool but 896 * process from stealing the pages as they are added to the pool but
@@ -908,31 +899,31 @@ retry:
908 needed += allocated; 899 needed += allocated;
909 h->resv_huge_pages += delta; 900 h->resv_huge_pages += delta;
910 ret = 0; 901 ret = 0;
911free: 902
903 spin_unlock(&hugetlb_lock);
912 /* Free the needed pages to the hugetlb pool */ 904 /* Free the needed pages to the hugetlb pool */
913 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 905 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
914 if ((--needed) < 0) 906 if ((--needed) < 0)
915 break; 907 break;
916 list_del(&page->lru); 908 list_del(&page->lru);
909 /*
910 * This page is now managed by the hugetlb allocator and has
911 * no users -- drop the buddy allocator's reference.
912 */
913 put_page_testzero(page);
914 VM_BUG_ON(page_count(page));
917 enqueue_huge_page(h, page); 915 enqueue_huge_page(h, page);
918 } 916 }
919 917
920 /* Free unnecessary surplus pages to the buddy allocator */ 918 /* Free unnecessary surplus pages to the buddy allocator */
919free:
921 if (!list_empty(&surplus_list)) { 920 if (!list_empty(&surplus_list)) {
922 spin_unlock(&hugetlb_lock);
923 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 921 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
924 list_del(&page->lru); 922 list_del(&page->lru);
925 /* 923 put_page(page);
926 * The page has a reference count of zero already, so
927 * call free_huge_page directly instead of using
928 * put_page. This must be done with hugetlb_lock
929 * unlocked which is safe because free_huge_page takes
930 * hugetlb_lock before deciding how to free the page.
931 */
932 free_huge_page(page);
933 } 924 }
934 spin_lock(&hugetlb_lock);
935 } 925 }
926 spin_lock(&hugetlb_lock);
936 927
937 return ret; 928 return ret;
938} 929}
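Because alloc_buddy_huge_page() no longer drops the buddy allocator's reference itself, gather_surplus_pages() is now the point where ownership is handed to the hugetlb pool, and leftover surplus pages can be released with a plain put_page(). The hand-off idiom, restated as a standalone fragment for clarity (same calls as in the hunk above):

	list_del(&page->lru);
	/* page arrived from the buddy allocator with one reference held */
	put_page_testzero(page);		/* drop it; the count must reach zero */
	VM_BUG_ON(page_count(page));
	enqueue_huge_page(h, page);		/* the pool now owns the page */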
@@ -1042,24 +1033,23 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1042 */ 1033 */
1043 chg = vma_needs_reservation(h, vma, addr); 1034 chg = vma_needs_reservation(h, vma, addr);
1044 if (chg < 0) 1035 if (chg < 0)
1045 return ERR_PTR(chg); 1036 return ERR_PTR(-VM_FAULT_OOM);
1046 if (chg) 1037 if (chg)
1047 if (hugetlb_get_quota(inode->i_mapping, chg)) 1038 if (hugetlb_get_quota(inode->i_mapping, chg))
1048 return ERR_PTR(-ENOSPC); 1039 return ERR_PTR(-VM_FAULT_SIGBUS);
1049 1040
1050 spin_lock(&hugetlb_lock); 1041 spin_lock(&hugetlb_lock);
1051 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1042 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
1052 spin_unlock(&hugetlb_lock); 1043 spin_unlock(&hugetlb_lock);
1053 1044
1054 if (!page) { 1045 if (!page) {
1055 page = alloc_buddy_huge_page(h, vma, addr); 1046 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1056 if (!page) { 1047 if (!page) {
1057 hugetlb_put_quota(inode->i_mapping, chg); 1048 hugetlb_put_quota(inode->i_mapping, chg);
1058 return ERR_PTR(-VM_FAULT_SIGBUS); 1049 return ERR_PTR(-VM_FAULT_SIGBUS);
1059 } 1050 }
1060 } 1051 }
1061 1052
1062 set_page_refcounted(page);
1063 set_page_private(page, (unsigned long) mapping); 1053 set_page_private(page, (unsigned long) mapping);
1064 1054
1065 vma_commit_reservation(h, vma, addr); 1055 vma_commit_reservation(h, vma, addr);
@@ -1121,6 +1111,14 @@ static void __init gather_bootmem_prealloc(void)
1121 WARN_ON(page_count(page) != 1); 1111 WARN_ON(page_count(page) != 1);
1122 prep_compound_huge_page(page, h->order); 1112 prep_compound_huge_page(page, h->order);
1123 prep_new_huge_page(h, page, page_to_nid(page)); 1113 prep_new_huge_page(h, page, page_to_nid(page));
1114 /*
1115 * If we had gigantic hugepages allocated at boot time, we need
1116 * to restore the 'stolen' pages to totalram_pages in order to
1117 * fix confusing memory reports from free(1) and another
1118 * side-effects, like CommitLimit going negative.
1119 */
1120 if (h->order > (MAX_ORDER - 1))
1121 totalram_pages += 1 << h->order;
1124 } 1122 }
1125} 1123}
1126 1124
@@ -1373,6 +1371,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1373 1371
1374 return sprintf(buf, "%lu\n", nr_huge_pages); 1372 return sprintf(buf, "%lu\n", nr_huge_pages);
1375} 1373}
1374
1376static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 1375static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1377 struct kobject *kobj, struct kobj_attribute *attr, 1376 struct kobject *kobj, struct kobj_attribute *attr,
1378 const char *buf, size_t len) 1377 const char *buf, size_t len)
@@ -1385,9 +1384,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1385 1384
1386 err = strict_strtoul(buf, 10, &count); 1385 err = strict_strtoul(buf, 10, &count);
1387 if (err) 1386 if (err)
1388 return 0; 1387 goto out;
1389 1388
1390 h = kobj_to_hstate(kobj, &nid); 1389 h = kobj_to_hstate(kobj, &nid);
1390 if (h->order >= MAX_ORDER) {
1391 err = -EINVAL;
1392 goto out;
1393 }
1394
1391 if (nid == NUMA_NO_NODE) { 1395 if (nid == NUMA_NO_NODE) {
1392 /* 1396 /*
1393 * global hstate attribute 1397 * global hstate attribute
@@ -1413,6 +1417,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1413 NODEMASK_FREE(nodes_allowed); 1417 NODEMASK_FREE(nodes_allowed);
1414 1418
1415 return len; 1419 return len;
1420out:
1421 NODEMASK_FREE(nodes_allowed);
1422 return err;
1416} 1423}
1417 1424
1418static ssize_t nr_hugepages_show(struct kobject *kobj, 1425static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1455,6 +1462,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1455 struct hstate *h = kobj_to_hstate(kobj, NULL); 1462 struct hstate *h = kobj_to_hstate(kobj, NULL);
1456 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1463 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1457} 1464}
1465
1458static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1466static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1459 struct kobj_attribute *attr, const char *buf, size_t count) 1467 struct kobj_attribute *attr, const char *buf, size_t count)
1460{ 1468{
@@ -1462,9 +1470,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1462 unsigned long input; 1470 unsigned long input;
1463 struct hstate *h = kobj_to_hstate(kobj, NULL); 1471 struct hstate *h = kobj_to_hstate(kobj, NULL);
1464 1472
1473 if (h->order >= MAX_ORDER)
1474 return -EINVAL;
1475
1465 err = strict_strtoul(buf, 10, &input); 1476 err = strict_strtoul(buf, 10, &input);
1466 if (err) 1477 if (err)
1467 return 0; 1478 return err;
1468 1479
1469 spin_lock(&hugetlb_lock); 1480 spin_lock(&hugetlb_lock);
1470 h->nr_overcommit_huge_pages = input; 1481 h->nr_overcommit_huge_pages = input;
@@ -1867,13 +1878,18 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1867{ 1878{
1868 struct hstate *h = &default_hstate; 1879 struct hstate *h = &default_hstate;
1869 unsigned long tmp; 1880 unsigned long tmp;
1881 int ret;
1882
1883 tmp = h->max_huge_pages;
1870 1884
1871 if (!write) 1885 if (write && h->order >= MAX_ORDER)
1872 tmp = h->max_huge_pages; 1886 return -EINVAL;
1873 1887
1874 table->data = &tmp; 1888 table->data = &tmp;
1875 table->maxlen = sizeof(unsigned long); 1889 table->maxlen = sizeof(unsigned long);
1876 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1890 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1891 if (ret)
1892 goto out;
1877 1893
1878 if (write) { 1894 if (write) {
1879 NODEMASK_ALLOC(nodemask_t, nodes_allowed, 1895 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1888,8 +1904,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1888 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1904 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1889 NODEMASK_FREE(nodes_allowed); 1905 NODEMASK_FREE(nodes_allowed);
1890 } 1906 }
1891 1907out:
1892 return 0; 1908 return ret;
1893} 1909}
1894 1910
1895int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1911int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1927,21 +1943,26 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1927{ 1943{
1928 struct hstate *h = &default_hstate; 1944 struct hstate *h = &default_hstate;
1929 unsigned long tmp; 1945 unsigned long tmp;
1946 int ret;
1930 1947
1931 if (!write) 1948 tmp = h->nr_overcommit_huge_pages;
1932 tmp = h->nr_overcommit_huge_pages; 1949
1950 if (write && h->order >= MAX_ORDER)
1951 return -EINVAL;
1933 1952
1934 table->data = &tmp; 1953 table->data = &tmp;
1935 table->maxlen = sizeof(unsigned long); 1954 table->maxlen = sizeof(unsigned long);
1936 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1955 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1956 if (ret)
1957 goto out;
1937 1958
1938 if (write) { 1959 if (write) {
1939 spin_lock(&hugetlb_lock); 1960 spin_lock(&hugetlb_lock);
1940 h->nr_overcommit_huge_pages = tmp; 1961 h->nr_overcommit_huge_pages = tmp;
1941 spin_unlock(&hugetlb_lock); 1962 spin_unlock(&hugetlb_lock);
1942 } 1963 }
1943 1964out:
1944 return 0; 1965 return ret;
1945} 1966}
1946 1967
1947#endif /* CONFIG_SYSCTL */ 1968#endif /* CONFIG_SYSCTL */
@@ -2030,7 +2051,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2030 * This new VMA should share its siblings reservation map if present. 2051 * This new VMA should share its siblings reservation map if present.
2031 * The VMA will only ever have a valid reservation map pointer where 2052 * The VMA will only ever have a valid reservation map pointer where
2032 * it is being copied for another still existing VMA. As that VMA 2053 * it is being copied for another still existing VMA. As that VMA
2033 * has a reference to the reservation map it cannot dissappear until 2054 * has a reference to the reservation map it cannot disappear until
2034 * after this open call completes. It is therefore safe to take a 2055 * after this open call completes. It is therefore safe to take a
2035 * new reference here without additional locking. 2056 * new reference here without additional locking.
2036 */ 2057 */
@@ -2153,6 +2174,19 @@ nomem:
2153 return -ENOMEM; 2174 return -ENOMEM;
2154} 2175}
2155 2176
2177static int is_hugetlb_entry_migration(pte_t pte)
2178{
2179 swp_entry_t swp;
2180
2181 if (huge_pte_none(pte) || pte_present(pte))
2182 return 0;
2183 swp = pte_to_swp_entry(pte);
2184 if (non_swap_entry(swp) && is_migration_entry(swp)) {
2185 return 1;
2186 } else
2187 return 0;
2188}
2189
2156static int is_hugetlb_entry_hwpoisoned(pte_t pte) 2190static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2157{ 2191{
2158 swp_entry_t swp; 2192 swp_entry_t swp;
@@ -2179,7 +2213,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2179 unsigned long sz = huge_page_size(h); 2213 unsigned long sz = huge_page_size(h);
2180 2214
2181 /* 2215 /*
2182 * A page gathering list, protected by per file i_mmap_lock. The 2216 * A page gathering list, protected by per file i_mmap_mutex. The
2183 * lock is used to avoid list corruption from multiple unmapping 2217 * lock is used to avoid list corruption from multiple unmapping
2184 * of the same page since we are using page->lru. 2218 * of the same page since we are using page->lru.
2185 */ 2219 */
@@ -2248,9 +2282,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2248void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2282void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2249 unsigned long end, struct page *ref_page) 2283 unsigned long end, struct page *ref_page)
2250{ 2284{
2251 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2285 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
2252 __unmap_hugepage_range(vma, start, end, ref_page); 2286 __unmap_hugepage_range(vma, start, end, ref_page);
2253 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 2287 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2254} 2288}
2255 2289
2256/* 2290/*
@@ -2282,7 +2316,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2282 * this mapping should be shared between all the VMAs, 2316 * this mapping should be shared between all the VMAs,
2283 * __unmap_hugepage_range() is called as the lock is already held 2317 * __unmap_hugepage_range() is called as the lock is already held
2284 */ 2318 */
2285 spin_lock(&mapping->i_mmap_lock); 2319 mutex_lock(&mapping->i_mmap_mutex);
2286 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2320 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
2287 /* Do not unmap the current VMA */ 2321 /* Do not unmap the current VMA */
2288 if (iter_vma == vma) 2322 if (iter_vma == vma)
@@ -2300,7 +2334,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2300 address, address + huge_page_size(h), 2334 address, address + huge_page_size(h),
2301 page); 2335 page);
2302 } 2336 }
2303 spin_unlock(&mapping->i_mmap_lock); 2337 mutex_unlock(&mapping->i_mmap_mutex);
2304 2338
2305 return 1; 2339 return 1;
2306} 2340}
@@ -2380,10 +2414,14 @@ retry_avoidcopy:
2380 * When the original hugepage is shared one, it does not have 2414 * When the original hugepage is shared one, it does not have
2381 * anon_vma prepared. 2415 * anon_vma prepared.
2382 */ 2416 */
2383 if (unlikely(anon_vma_prepare(vma))) 2417 if (unlikely(anon_vma_prepare(vma))) {
2418 /* Caller expects lock to be held */
2419 spin_lock(&mm->page_table_lock);
2384 return VM_FAULT_OOM; 2420 return VM_FAULT_OOM;
2421 }
2385 2422
2386 copy_huge_page(new_page, old_page, address, vma); 2423 copy_user_huge_page(new_page, old_page, address, vma,
2424 pages_per_huge_page(h));
2387 __SetPageUptodate(new_page); 2425 __SetPageUptodate(new_page);
2388 2426
2389 /* 2427 /*
@@ -2460,7 +2498,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2460 /* 2498 /*
2461 * Currently, we are forced to kill the process in the event the 2499 * Currently, we are forced to kill the process in the event the
2462 * original mapper has unmapped pages from the child due to a failed 2500 * original mapper has unmapped pages from the child due to a failed
2463 * COW. Warn that such a situation has occured as it may not be obvious 2501 * COW. Warn that such a situation has occurred as it may not be obvious
2464 */ 2502 */
2465 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 2503 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2466 printk(KERN_WARNING 2504 printk(KERN_WARNING
@@ -2487,7 +2525,7 @@ retry:
2487 ret = -PTR_ERR(page); 2525 ret = -PTR_ERR(page);
2488 goto out; 2526 goto out;
2489 } 2527 }
2490 clear_huge_page(page, address, huge_page_size(h)); 2528 clear_huge_page(page, address, pages_per_huge_page(h));
2491 __SetPageUptodate(page); 2529 __SetPageUptodate(page);
2492 2530
2493 if (vma->vm_flags & VM_MAYSHARE) { 2531 if (vma->vm_flags & VM_MAYSHARE) {
@@ -2515,22 +2553,20 @@ retry:
2515 hugepage_add_new_anon_rmap(page, vma, address); 2553 hugepage_add_new_anon_rmap(page, vma, address);
2516 } 2554 }
2517 } else { 2555 } else {
2556 /*
2557 * If memory error occurs between mmap() and fault, some process
2558 * don't have hwpoisoned swap entry for errored virtual address.
2559 * So we need to block hugepage fault by PG_hwpoison bit check.
2560 */
2561 if (unlikely(PageHWPoison(page))) {
2562 ret = VM_FAULT_HWPOISON |
2563 VM_FAULT_SET_HINDEX(h - hstates);
2564 goto backout_unlocked;
2565 }
2518 page_dup_rmap(page); 2566 page_dup_rmap(page);
2519 } 2567 }
2520 2568
2521 /* 2569 /*
2522 * Since memory error handler replaces pte into hwpoison swap entry
2523 * at the time of error handling, a process which reserved but not have
2524 * the mapping to the error hugepage does not have hwpoison swap entry.
2525 * So we need to block accesses from such a process by checking
2526 * PG_hwpoison bit here.
2527 */
2528 if (unlikely(PageHWPoison(page))) {
2529 ret = VM_FAULT_HWPOISON;
2530 goto backout_unlocked;
2531 }
2532
2533 /*
2534 * If we are going to COW a private mapping later, we examine the 2570 * If we are going to COW a private mapping later, we examine the
2535 * pending reservations for this page now. This will ensure that 2571 * pending reservations for this page now. This will ensure that
2536 * any allocations necessary to record that reservation occur outside 2572 * any allocations necessary to record that reservation occur outside
@@ -2587,8 +2623,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2587 ptep = huge_pte_offset(mm, address); 2623 ptep = huge_pte_offset(mm, address);
2588 if (ptep) { 2624 if (ptep) {
2589 entry = huge_ptep_get(ptep); 2625 entry = huge_ptep_get(ptep);
2590 if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2626 if (unlikely(is_hugetlb_entry_migration(entry))) {
2591 return VM_FAULT_HWPOISON; 2627 migration_entry_wait(mm, (pmd_t *)ptep, address);
2628 return 0;
2629 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2630 return VM_FAULT_HWPOISON_LARGE |
2631 VM_FAULT_SET_HINDEX(h - hstates);
2592 } 2632 }
2593 2633
2594 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2634 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2665,7 +2705,8 @@ out_page_table_lock:
2665 unlock_page(pagecache_page); 2705 unlock_page(pagecache_page);
2666 put_page(pagecache_page); 2706 put_page(pagecache_page);
2667 } 2707 }
2668 unlock_page(page); 2708 if (page != pagecache_page)
2709 unlock_page(page);
2669 2710
2670out_mutex: 2711out_mutex:
2671 mutex_unlock(&hugetlb_instantiation_mutex); 2712 mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2777,7 +2818,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2777 BUG_ON(address >= end); 2818 BUG_ON(address >= end);
2778 flush_cache_range(vma, address, end); 2819 flush_cache_range(vma, address, end);
2779 2820
2780 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2821 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
2781 spin_lock(&mm->page_table_lock); 2822 spin_lock(&mm->page_table_lock);
2782 for (; address < end; address += huge_page_size(h)) { 2823 for (; address < end; address += huge_page_size(h)) {
2783 ptep = huge_pte_offset(mm, address); 2824 ptep = huge_pte_offset(mm, address);
@@ -2792,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2792 } 2833 }
2793 } 2834 }
2794 spin_unlock(&mm->page_table_lock); 2835 spin_unlock(&mm->page_table_lock);
2795 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 2836 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2796 2837
2797 flush_tlb_range(vma, start, end); 2838 flush_tlb_range(vma, start, end);
2798} 2839}
@@ -2800,7 +2841,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2800int hugetlb_reserve_pages(struct inode *inode, 2841int hugetlb_reserve_pages(struct inode *inode,
2801 long from, long to, 2842 long from, long to,
2802 struct vm_area_struct *vma, 2843 struct vm_area_struct *vma,
2803 int acctflag) 2844 vm_flags_t vm_flags)
2804{ 2845{
2805 long ret, chg; 2846 long ret, chg;
2806 struct hstate *h = hstate_inode(inode); 2847 struct hstate *h = hstate_inode(inode);
@@ -2810,7 +2851,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2810 * attempt will be made for VM_NORESERVE to allocate a page 2851 * attempt will be made for VM_NORESERVE to allocate a page
2811 * and filesystem quota without using reserves 2852 * and filesystem quota without using reserves
2812 */ 2853 */
2813 if (acctflag & VM_NORESERVE) 2854 if (vm_flags & VM_NORESERVE)
2814 return 0; 2855 return 0;
2815 2856
2816 /* 2857 /*
@@ -2878,18 +2919,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2878 hugetlb_acct_memory(h, -(chg - freed)); 2919 hugetlb_acct_memory(h, -(chg - freed));
2879} 2920}
2880 2921
2922#ifdef CONFIG_MEMORY_FAILURE
2923
2924/* Should be called in hugetlb_lock */
2925static int is_hugepage_on_freelist(struct page *hpage)
2926{
2927 struct page *page;
2928 struct page *tmp;
2929 struct hstate *h = page_hstate(hpage);
2930 int nid = page_to_nid(hpage);
2931
2932 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
2933 if (page == hpage)
2934 return 1;
2935 return 0;
2936}
2937
2881/* 2938/*
2882 * This function is called from memory failure code. 2939 * This function is called from memory failure code.
2883 * Assume the caller holds page lock of the head page. 2940 * Assume the caller holds page lock of the head page.
2884 */ 2941 */
2885void __isolate_hwpoisoned_huge_page(struct page *hpage) 2942int dequeue_hwpoisoned_huge_page(struct page *hpage)
2886{ 2943{
2887 struct hstate *h = page_hstate(hpage); 2944 struct hstate *h = page_hstate(hpage);
2888 int nid = page_to_nid(hpage); 2945 int nid = page_to_nid(hpage);
2946 int ret = -EBUSY;
2889 2947
2890 spin_lock(&hugetlb_lock); 2948 spin_lock(&hugetlb_lock);
2891 list_del(&hpage->lru); 2949 if (is_hugepage_on_freelist(hpage)) {
2892 h->free_huge_pages--; 2950 list_del(&hpage->lru);
2893 h->free_huge_pages_node[nid]--; 2951 set_page_refcounted(hpage);
2952 h->free_huge_pages--;
2953 h->free_huge_pages_node[nid]--;
2954 ret = 0;
2955 }
2894 spin_unlock(&hugetlb_lock); 2956 spin_unlock(&hugetlb_lock);
2957 return ret;
2895} 2958}
2959#endif
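dequeue_hwpoisoned_huge_page() now reports whether the poisoned huge page was actually sitting on a free list, returning 0 if it was isolated and -EBUSY otherwise, so the memory-failure code can tell a contained error from one whose page still has users. A hedged sketch of a caller, assuming hpage is the locked head page:

	if (!dequeue_hwpoisoned_huge_page(hpage)) {
		/* page was free: it is now off the pool and isolated */
	} else {
		/* -EBUSY: the huge page is still in use, needs further handling */
	}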
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 0948f1072d6b..c7fc7fd00e32 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -1,4 +1,4 @@
1/* Inject a hwpoison memory failure on a arbitary pfn */ 1/* Inject a hwpoison memory failure on a arbitrary pfn */
2#include <linux/module.h> 2#include <linux/module.h>
3#include <linux/debugfs.h> 3#include <linux/debugfs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 1d29cdfe8ebb..4019979b2637 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -21,6 +21,5 @@ struct mm_struct init_mm = {
21 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), 21 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
22 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), 22 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
23 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 23 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
24 .cpu_vm_mask = CPU_MASK_ALL,
25 INIT_MM_CONTEXT(init_mm) 24 INIT_MM_CONTEXT(init_mm)
26}; 25};
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb97fc5..d071d380fb49 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,10 +62,14 @@ extern bool is_free_buddy_page(struct page *page);
62 */ 62 */
63static inline unsigned long page_order(struct page *page) 63static inline unsigned long page_order(struct page *page)
64{ 64{
65 VM_BUG_ON(!PageBuddy(page)); 65 /* PageBuddy() must be checked by the caller */
66 return page_private(page); 66 return page_private(page);
67} 67}
68 68
69/* mm/util.c */
70void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
71 struct vm_area_struct *prev, struct rb_node *rb_parent);
72
69#ifdef CONFIG_MMU 73#ifdef CONFIG_MMU
70extern long mlock_vma_pages_range(struct vm_area_struct *vma, 74extern long mlock_vma_pages_range(struct vm_area_struct *vma,
71 unsigned long start, unsigned long end); 75 unsigned long start, unsigned long end);
@@ -134,6 +138,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
134 } 138 }
135} 139}
136 140
141#ifdef CONFIG_TRANSPARENT_HUGEPAGE
142extern unsigned long vma_address(struct page *page,
143 struct vm_area_struct *vma);
144#endif
137#else /* !CONFIG_MMU */ 145#else /* !CONFIG_MMU */
138static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 146static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
139{ 147{
@@ -158,7 +166,7 @@ static inline struct page *mem_map_offset(struct page *base, int offset)
158} 166}
159 167
160/* 168/*
161 * Iterator over all subpages withing the maximally aligned gigantic 169 * Iterator over all subpages within the maximally aligned gigantic
162 * page 'base'. Handle any discontiguity in the mem_map. 170 * page 'base'. Handle any discontiguity in the mem_map.
163 */ 171 */
164static inline struct page *mem_map_next(struct page *iter, 172static inline struct page *mem_map_next(struct page *iter,
@@ -241,10 +249,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
241} 249}
242#endif /* CONFIG_SPARSEMEM */ 250#endif /* CONFIG_SPARSEMEM */
243 251
244int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
245 unsigned long start, int len, unsigned int foll_flags,
246 struct page **pages, struct vm_area_struct **vmas);
247
248#define ZONE_RECLAIM_NOSCAN -2 252#define ZONE_RECLAIM_NOSCAN -2
249#define ZONE_RECLAIM_FULL -1 253#define ZONE_RECLAIM_FULL -1
250#define ZONE_RECLAIM_SOME 0 254#define ZONE_RECLAIM_SOME 0
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index 177a5169bbde..ff0d9779cec8 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void)
75 * after the module is removed. 75 * after the module is removed.
76 */ 76 */
77 for (i = 0; i < 10; i++) { 77 for (i = 0; i < 10; i++) {
78 elem = kmalloc(sizeof(*elem), GFP_KERNEL); 78 elem = kzalloc(sizeof(*elem), GFP_KERNEL);
79 pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); 79 pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
80 if (!elem) 80 if (!elem)
81 return -ENOMEM; 81 return -ENOMEM;
82 memset(elem, 0, sizeof(*elem));
83 INIT_LIST_HEAD(&elem->list); 82 INIT_LIST_HEAD(&elem->list);
84
85 list_add_tail(&elem->list, &test_list); 83 list_add_tail(&elem->list, &test_list);
86 } 84 }
87 85
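The kmemleak-test change from kmalloc() plus memset() to kzalloc() is purely a simplification; the two forms are equivalent. For reference (illustrative, not from the patch):

	/* before: allocate, then zero */
	elem = kmalloc(sizeof(*elem), GFP_KERNEL);
	if (elem)
		memset(elem, 0, sizeof(*elem));

	/* after: one zeroing allocation */
	elem = kzalloc(sizeof(*elem), GFP_KERNEL);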
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bd9bc214091b..aacee45616fc 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -113,7 +113,9 @@
113#define BYTES_PER_POINTER sizeof(void *) 113#define BYTES_PER_POINTER sizeof(void *)
114 114
115/* GFP bitmask for kmemleak internal allocations */ 115/* GFP bitmask for kmemleak internal allocations */
116#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) 116#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
117 __GFP_NORETRY | __GFP_NOMEMALLOC | \
118 __GFP_NOWARN)
117 119
118/* scanning area inside a memory block */ 120/* scanning area inside a memory block */
119struct kmemleak_scan_area { 121struct kmemleak_scan_area {
@@ -263,7 +265,7 @@ static void kmemleak_disable(void);
263} while (0) 265} while (0)
264 266
265/* 267/*
266 * Macro invoked when a serious kmemleak condition occured and cannot be 268 * Macro invoked when a serious kmemleak condition occurred and cannot be
267 * recovered from. Kmemleak will be disabled and further allocation/freeing 269 * recovered from. Kmemleak will be disabled and further allocation/freeing
268 * tracing no longer available. 270 * tracing no longer available.
269 */ 271 */
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
511 struct kmemleak_object *object; 513 struct kmemleak_object *object;
512 struct prio_tree_node *node; 514 struct prio_tree_node *node;
513 515
514 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); 516 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
515 if (!object) { 517 if (!object) {
516 kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); 518 pr_warning("Cannot allocate a kmemleak_object structure\n");
519 kmemleak_disable();
517 return NULL; 520 return NULL;
518 } 521 }
519 522
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
734 return; 737 return;
735 } 738 }
736 739
737 area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); 740 area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
738 if (!area) { 741 if (!area) {
739 kmemleak_warn("Cannot allocate a scan area\n"); 742 pr_warning("Cannot allocate a scan area\n");
740 goto out; 743 goto out;
741 } 744 }
742 745
@@ -1003,7 +1006,7 @@ static bool update_checksum(struct kmemleak_object *object)
1003 1006
1004/* 1007/*
1005 * Memory scanning is a long process and it needs to be interruptable. This 1008 * Memory scanning is a long process and it needs to be interruptable. This
1006 * function checks whether such interrupt condition occured. 1009 * function checks whether such interrupt condition occurred.
1007 */ 1010 */
1008static int scan_should_stop(void) 1011static int scan_should_stop(void)
1009{ 1012{
@@ -1411,9 +1414,12 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1411 ++(*pos); 1414 ++(*pos);
1412 1415
1413 list_for_each_continue_rcu(n, &object_list) { 1416 list_for_each_continue_rcu(n, &object_list) {
1414 next_obj = list_entry(n, struct kmemleak_object, object_list); 1417 struct kmemleak_object *obj =
1415 if (get_object(next_obj)) 1418 list_entry(n, struct kmemleak_object, object_list);
1419 if (get_object(obj)) {
1420 next_obj = obj;
1416 break; 1421 break;
1422 }
1417 } 1423 }
1418 1424
1419 put_object(prev_obj); 1425 put_object(prev_obj);
@@ -1730,7 +1736,7 @@ static int __init kmemleak_late_init(void)
1730 1736
1731 if (atomic_read(&kmemleak_error)) { 1737 if (atomic_read(&kmemleak_error)) {
1732 /* 1738 /*
1733 * Some error occured and kmemleak was disabled. There is a 1739 * Some error occurred and kmemleak was disabled. There is a
1734 * small chance that kmemleak_disable() was called immediately 1740 * small chance that kmemleak_disable() was called immediately
1735 * after setting kmemleak_initialized and we may end up with 1741 * after setting kmemleak_initialized and we may end up with
1736 * two clean-up threads but serialized by scan_mutex. 1742 * two clean-up threads but serialized by scan_mutex.
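gfp_kmemleak_mask() keeps the caller's GFP_KERNEL/GFP_ATOMIC bits but forces __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, so kmemleak's metadata allocations neither retry aggressively nor dip into emergency reserves under memory pressure; a failed allocation now just disables kmemleak via pr_warning() and kmemleak_disable() instead of the fatal kmemleak_stop() path. Usage inside kmemleak, as in the hunks above:

	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
	if (!object) {
		pr_warning("Cannot allocate a kmemleak_object structure\n");
		kmemleak_disable();
		return NULL;
	}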
diff --git a/mm/ksm.c b/mm/ksm.c
index 65ab5c7067d9..9a68b0cf0a1c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -34,6 +34,8 @@
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/freezer.h>
38#include <linux/oom.h>
37 39
38#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
39#include "internal.h" 41#include "internal.h"
@@ -300,20 +302,6 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
300 return rmap_item->address & STABLE_FLAG; 302 return rmap_item->address & STABLE_FLAG;
301} 303}
302 304
303static void hold_anon_vma(struct rmap_item *rmap_item,
304 struct anon_vma *anon_vma)
305{
306 rmap_item->anon_vma = anon_vma;
307 get_anon_vma(anon_vma);
308}
309
310static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
311{
312 struct anon_vma *anon_vma = rmap_item->anon_vma;
313
314 drop_anon_vma(anon_vma);
315}
316
317/* 305/*
318 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's 306 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
319 * page tables after it has passed through ksm_exit() - which, if necessary, 307 * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -396,7 +384,7 @@ static void break_cow(struct rmap_item *rmap_item)
396 * It is not an accident that whenever we want to break COW 384 * It is not an accident that whenever we want to break COW
397 * to undo, we also need to drop a reference to the anon_vma. 385 * to undo, we also need to drop a reference to the anon_vma.
398 */ 386 */
399 ksm_drop_anon_vma(rmap_item); 387 put_anon_vma(rmap_item->anon_vma);
400 388
401 down_read(&mm->mmap_sem); 389 down_read(&mm->mmap_sem);
402 if (ksm_test_exit(mm)) 390 if (ksm_test_exit(mm))
@@ -411,6 +399,20 @@ out:
411 up_read(&mm->mmap_sem); 399 up_read(&mm->mmap_sem);
412} 400}
413 401
402static struct page *page_trans_compound_anon(struct page *page)
403{
404 if (PageTransCompound(page)) {
405 struct page *head = compound_trans_head(page);
406 /*
407 * head may actually be splitted and freed from under
408 * us but it's ok here.
409 */
410 if (PageAnon(head))
411 return head;
412 }
413 return NULL;
414}
415
414static struct page *get_mergeable_page(struct rmap_item *rmap_item) 416static struct page *get_mergeable_page(struct rmap_item *rmap_item)
415{ 417{
416 struct mm_struct *mm = rmap_item->mm; 418 struct mm_struct *mm = rmap_item->mm;
@@ -430,7 +432,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
430 page = follow_page(vma, addr, FOLL_GET); 432 page = follow_page(vma, addr, FOLL_GET);
431 if (IS_ERR_OR_NULL(page)) 433 if (IS_ERR_OR_NULL(page))
432 goto out; 434 goto out;
433 if (PageAnon(page)) { 435 if (PageAnon(page) || page_trans_compound_anon(page)) {
434 flush_anon_page(vma, page, addr); 436 flush_anon_page(vma, page, addr);
435 flush_dcache_page(page); 437 flush_dcache_page(page);
436 } else { 438 } else {
@@ -451,7 +453,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
451 ksm_pages_sharing--; 453 ksm_pages_sharing--;
452 else 454 else
453 ksm_pages_shared--; 455 ksm_pages_shared--;
454 ksm_drop_anon_vma(rmap_item); 456 put_anon_vma(rmap_item->anon_vma);
455 rmap_item->address &= PAGE_MASK; 457 rmap_item->address &= PAGE_MASK;
456 cond_resched(); 458 cond_resched();
457 } 459 }
@@ -539,7 +541,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
539 else 541 else
540 ksm_pages_shared--; 542 ksm_pages_shared--;
541 543
542 ksm_drop_anon_vma(rmap_item); 544 put_anon_vma(rmap_item->anon_vma);
543 rmap_item->address &= PAGE_MASK; 545 rmap_item->address &= PAGE_MASK;
544 546
545 } else if (rmap_item->address & UNSTABLE_FLAG) { 547 } else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -708,6 +710,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
708 if (addr == -EFAULT) 710 if (addr == -EFAULT)
709 goto out; 711 goto out;
710 712
713 BUG_ON(PageTransCompound(page));
711 ptep = page_check_address(page, mm, addr, &ptl, 0); 714 ptep = page_check_address(page, mm, addr, &ptl, 0);
712 if (!ptep) 715 if (!ptep)
713 goto out; 716 goto out;
@@ -718,7 +721,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
718 swapped = PageSwapCache(page); 721 swapped = PageSwapCache(page);
719 flush_cache_page(vma, addr, page_to_pfn(page)); 722 flush_cache_page(vma, addr, page_to_pfn(page));
720 /* 723 /*
721 * Ok this is tricky, when get_user_pages_fast() run it doesnt 724 * Ok this is tricky, when get_user_pages_fast() run it doesn't
722 * take any lock, therefore the check that we are going to make 725 * take any lock, therefore the check that we are going to make
723 * with the pagecount against the mapcount is racey and 726 * with the pagecount against the mapcount is racey and
724 * O_DIRECT can happen right after the check. 727 * O_DIRECT can happen right after the check.
@@ -783,6 +786,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
783 goto out; 786 goto out;
784 787
785 pmd = pmd_offset(pud, addr); 788 pmd = pmd_offset(pud, addr);
789 BUG_ON(pmd_trans_huge(*pmd));
786 if (!pmd_present(*pmd)) 790 if (!pmd_present(*pmd))
787 goto out; 791 goto out;
788 792
@@ -800,6 +804,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
800 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 804 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
801 805
802 page_remove_rmap(page); 806 page_remove_rmap(page);
807 if (!page_mapped(page))
808 try_to_free_swap(page);
803 put_page(page); 809 put_page(page);
804 810
805 pte_unmap_unlock(ptep, ptl); 811 pte_unmap_unlock(ptep, ptl);
@@ -808,6 +814,33 @@ out:
808 return err; 814 return err;
809} 815}
810 816
817static int page_trans_compound_anon_split(struct page *page)
818{
819 int ret = 0;
820 struct page *transhuge_head = page_trans_compound_anon(page);
821 if (transhuge_head) {
822 /* Get the reference on the head to split it. */
823 if (get_page_unless_zero(transhuge_head)) {
824 /*
825 * Recheck we got the reference while the head
826 * was still anonymous.
827 */
828 if (PageAnon(transhuge_head))
829 ret = split_huge_page(transhuge_head);
830 else
831 /*
832 * Retry later if split_huge_page run
833 * from under us.
834 */
835 ret = 1;
836 put_page(transhuge_head);
837 } else
838 /* Retry later if split_huge_page run from under us. */
839 ret = 1;
840 }
841 return ret;
842}
843
811/* 844/*
812 * try_to_merge_one_page - take two pages and merge them into one 845 * try_to_merge_one_page - take two pages and merge them into one
813 * @vma: the vma that holds the pte pointing to page 846 * @vma: the vma that holds the pte pointing to page
@@ -828,6 +861,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
828 861
829 if (!(vma->vm_flags & VM_MERGEABLE)) 862 if (!(vma->vm_flags & VM_MERGEABLE))
830 goto out; 863 goto out;
864 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
865 goto out;
866 BUG_ON(PageTransCompound(page));
831 if (!PageAnon(page)) 867 if (!PageAnon(page))
832 goto out; 868 goto out;
833 869
@@ -900,7 +936,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
900 goto out; 936 goto out;
901 937
902 /* Must get reference to anon_vma while still holding mmap_sem */ 938 /* Must get reference to anon_vma while still holding mmap_sem */
903 hold_anon_vma(rmap_item, vma->anon_vma); 939 rmap_item->anon_vma = vma->anon_vma;
940 get_anon_vma(vma->anon_vma);
904out: 941out:
905 up_read(&mm->mmap_sem); 942 up_read(&mm->mmap_sem);
906 return err; 943 return err;
@@ -1247,12 +1284,30 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1247 1284
1248 slot = ksm_scan.mm_slot; 1285 slot = ksm_scan.mm_slot;
1249 if (slot == &ksm_mm_head) { 1286 if (slot == &ksm_mm_head) {
1287 /*
1288 * A number of pages can hang around indefinitely on per-cpu
1289 * pagevecs, raised page count preventing write_protect_page
1290 * from merging them. Though it doesn't really matter much,
1291 * it is puzzling to see some stuck in pages_volatile until
1292 * other activity jostles them out, and they also prevented
1293 * LTP's KSM test from succeeding deterministically; so drain
1294 * them here (here rather than on entry to ksm_do_scan(),
1295 * so we don't IPI too often when pages_to_scan is set low).
1296 */
1297 lru_add_drain_all();
1298
1250 root_unstable_tree = RB_ROOT; 1299 root_unstable_tree = RB_ROOT;
1251 1300
1252 spin_lock(&ksm_mmlist_lock); 1301 spin_lock(&ksm_mmlist_lock);
1253 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); 1302 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
1254 ksm_scan.mm_slot = slot; 1303 ksm_scan.mm_slot = slot;
1255 spin_unlock(&ksm_mmlist_lock); 1304 spin_unlock(&ksm_mmlist_lock);
1305 /*
1306 * Although we tested list_empty() above, a racing __ksm_exit
1307 * of the last mm on the list may have removed it since then.
1308 */
1309 if (slot == &ksm_mm_head)
1310 return NULL;
1256next_mm: 1311next_mm:
1257 ksm_scan.address = 0; 1312 ksm_scan.address = 0;
1258 ksm_scan.rmap_list = &slot->rmap_list; 1313 ksm_scan.rmap_list = &slot->rmap_list;
@@ -1277,7 +1332,13 @@ next_mm:
1277 if (ksm_test_exit(mm)) 1332 if (ksm_test_exit(mm))
1278 break; 1333 break;
1279 *page = follow_page(vma, ksm_scan.address, FOLL_GET); 1334 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1280 if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { 1335 if (IS_ERR_OR_NULL(*page)) {
1336 ksm_scan.address += PAGE_SIZE;
1337 cond_resched();
1338 continue;
1339 }
1340 if (PageAnon(*page) ||
1341 page_trans_compound_anon(*page)) {
1281 flush_anon_page(vma, *page, ksm_scan.address); 1342 flush_anon_page(vma, *page, ksm_scan.address);
1282 flush_dcache_page(*page); 1343 flush_dcache_page(*page);
1283 rmap_item = get_next_rmap_item(slot, 1344 rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1352,7 @@ next_mm:
1291 up_read(&mm->mmap_sem); 1352 up_read(&mm->mmap_sem);
1292 return rmap_item; 1353 return rmap_item;
1293 } 1354 }
1294 if (!IS_ERR_OR_NULL(*page)) 1355 put_page(*page);
1295 put_page(*page);
1296 ksm_scan.address += PAGE_SIZE; 1356 ksm_scan.address += PAGE_SIZE;
1297 cond_resched(); 1357 cond_resched();
1298 } 1358 }
@@ -1352,7 +1412,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1352 struct rmap_item *rmap_item; 1412 struct rmap_item *rmap_item;
1353 struct page *uninitialized_var(page); 1413 struct page *uninitialized_var(page);
1354 1414
1355 while (scan_npages--) { 1415 while (scan_npages-- && likely(!freezing(current))) {
1356 cond_resched(); 1416 cond_resched();
1357 rmap_item = scan_get_next_rmap_item(&page); 1417 rmap_item = scan_get_next_rmap_item(&page);
1358 if (!rmap_item) 1418 if (!rmap_item)
@@ -1370,6 +1430,7 @@ static int ksmd_should_run(void)
1370 1430
1371static int ksm_scan_thread(void *nothing) 1431static int ksm_scan_thread(void *nothing)
1372{ 1432{
1433 set_freezable();
1373 set_user_nice(current, 5); 1434 set_user_nice(current, 5);
1374 1435
1375 while (!kthread_should_stop()) { 1436 while (!kthread_should_stop()) {
@@ -1378,11 +1439,13 @@ static int ksm_scan_thread(void *nothing)
1378 ksm_do_scan(ksm_thread_pages_to_scan); 1439 ksm_do_scan(ksm_thread_pages_to_scan);
1379 mutex_unlock(&ksm_thread_mutex); 1440 mutex_unlock(&ksm_thread_mutex);
1380 1441
1442 try_to_freeze();
1443
1381 if (ksmd_should_run()) { 1444 if (ksmd_should_run()) {
1382 schedule_timeout_interruptible( 1445 schedule_timeout_interruptible(
1383 msecs_to_jiffies(ksm_thread_sleep_millisecs)); 1446 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1384 } else { 1447 } else {
1385 wait_event_interruptible(ksm_thread_wait, 1448 wait_event_freezable(ksm_thread_wait,
1386 ksmd_should_run() || kthread_should_stop()); 1449 ksmd_should_run() || kthread_should_stop());
1387 } 1450 }
1388 } 1451 }
@@ -1724,8 +1787,13 @@ static int ksm_memory_callback(struct notifier_block *self,
1724 /* 1787 /*
1725 * Keep it very simple for now: just lock out ksmd and 1788 * Keep it very simple for now: just lock out ksmd and
1726 * MADV_UNMERGEABLE while any memory is going offline. 1789 * MADV_UNMERGEABLE while any memory is going offline.
1790 * mutex_lock_nested() is necessary because lockdep was alarmed
1791 * that here we take ksm_thread_mutex inside notifier chain
1792 * mutex, and later take notifier chain mutex inside
1793 * ksm_thread_mutex to unlock it. But that's safe because both
1794 * are inside mem_hotplug_mutex.
1727 */ 1795 */
1728 mutex_lock(&ksm_thread_mutex); 1796 mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING);
1729 break; 1797 break;
1730 1798
1731 case MEM_OFFLINE: 1799 case MEM_OFFLINE:
@@ -1833,9 +1901,11 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1833 if (ksm_run != flags) { 1901 if (ksm_run != flags) {
1834 ksm_run = flags; 1902 ksm_run = flags;
1835 if (flags & KSM_RUN_UNMERGE) { 1903 if (flags & KSM_RUN_UNMERGE) {
1836 current->flags |= PF_OOM_ORIGIN; 1904 int oom_score_adj;
1905
1906 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1837 err = unmerge_and_remove_all_rmap_items(); 1907 err = unmerge_and_remove_all_rmap_items();
1838 current->flags &= ~PF_OOM_ORIGIN; 1908 test_set_oom_score_adj(oom_score_adj);
1839 if (err) { 1909 if (err) {
1840 ksm_run = KSM_RUN_STOP; 1910 ksm_run = KSM_RUN_STOP;
1841 count = err; 1911 count = err;
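ksmd now cooperates with system suspend: set_freezable() marks the kthread, try_to_freeze() gives the freezer a safe point after each batch, and wait_event_freezable() replaces wait_event_interruptible() so an idle ksmd does not block freezing. The generic shape of such a freezable scanner thread, sketched independently of KSM's details (work_available(), do_one_batch() and scan_wait are placeholders, not real symbols):

static int scanner_thread(void *unused)
{
	set_freezable();

	while (!kthread_should_stop()) {
		if (work_available())
			do_one_batch();		/* hypothetical helpers */

		try_to_freeze();

		wait_event_freezable(scan_wait,
				     work_available() || kthread_should_stop());
	}
	return 0;
}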
diff --git a/mm/maccess.c b/mm/maccess.c
index 4e348dbaecd7..4cee182ab5f3 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,9 +1,9 @@
1/* 1/*
2 * Access kernel memory without faulting. 2 * Access kernel memory without faulting.
3 */ 3 */
4#include <linux/uaccess.h>
5#include <linux/module.h> 4#include <linux/module.h>
6#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/uaccess.h>
7 7
8/** 8/**
9 * probe_kernel_read(): safely attempt to read from a location 9 * probe_kernel_read(): safely attempt to read from a location
@@ -15,10 +15,10 @@
15 * happens, handle that and return -EFAULT. 15 * happens, handle that and return -EFAULT.
16 */ 16 */
17 17
18long __weak probe_kernel_read(void *dst, void *src, size_t size) 18long __weak probe_kernel_read(void *dst, const void *src, size_t size)
19 __attribute__((alias("__probe_kernel_read"))); 19 __attribute__((alias("__probe_kernel_read")));
20 20
21long __probe_kernel_read(void *dst, void *src, size_t size) 21long __probe_kernel_read(void *dst, const void *src, size_t size)
22{ 22{
23 long ret; 23 long ret;
24 mm_segment_t old_fs = get_fs(); 24 mm_segment_t old_fs = get_fs();
@@ -43,10 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
43 * Safely write to address @dst from the buffer at @src. If a kernel fault 43 * Safely write to address @dst from the buffer at @src. If a kernel fault
44 * happens, handle that and return -EFAULT. 44 * happens, handle that and return -EFAULT.
45 */ 45 */
46long __weak probe_kernel_write(void *dst, void *src, size_t size) 46long __weak probe_kernel_write(void *dst, const void *src, size_t size)
47 __attribute__((alias("__probe_kernel_write"))); 47 __attribute__((alias("__probe_kernel_write")));
48 48
49long __probe_kernel_write(void *dst, void *src, size_t size) 49long __probe_kernel_write(void *dst, const void *src, size_t size)
50{ 50{
51 long ret; 51 long ret;
52 mm_segment_t old_fs = get_fs(); 52 mm_segment_t old_fs = get_fs();
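Constifying the source pointer lets probe_kernel_read()/probe_kernel_write() accept pointers to const data without casts; behaviour is unchanged: they return 0 on success and -EFAULT if the access faults. Typical (illustrative) use:

	unsigned long val;

	if (probe_kernel_read(&val, (const void *)addr, sizeof(val)))
		return -EFAULT;		/* address was not safely readable */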
diff --git a/mm/madvise.c b/mm/madvise.c
index 319528b8db74..2221491ed503 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
71 if (error) 71 if (error)
72 goto out; 72 goto out;
73 break; 73 break;
74 case MADV_HUGEPAGE:
75 case MADV_NOHUGEPAGE:
76 error = hugepage_madvise(vma, &new_flags, behavior);
77 if (error)
78 goto out;
79 break;
74 } 80 }
75 81
76 if (new_flags == vma->vm_flags) { 82 if (new_flags == vma->vm_flags) {
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior)
283 case MADV_MERGEABLE: 289 case MADV_MERGEABLE:
284 case MADV_UNMERGEABLE: 290 case MADV_UNMERGEABLE:
285#endif 291#endif
292#ifdef CONFIG_TRANSPARENT_HUGEPAGE
293 case MADV_HUGEPAGE:
294 case MADV_NOHUGEPAGE:
295#endif
286 return 1; 296 return 1;
287 297
288 default: 298 default:
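With MADV_HUGEPAGE and MADV_NOHUGEPAGE accepted by madvise_behavior(), userspace can opt a mapping in or out of transparent hugepages per VMA. A minimal userspace sketch, assuming the installed headers export MADV_HUGEPAGE (otherwise the constant would have to come from the kernel's uapi headers):

#include <stdlib.h>
#include <sys/mman.h>

int request_thp(void *buf, size_t len)
{
	/* advisory only: the kernel may still decline to use huge pages */
	return madvise(buf, len, MADV_HUGEPAGE);
}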
diff --git a/mm/memblock.c b/mm/memblock.c
index 43840b305ecb..a0562d1a6ad4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -11,446 +11,634 @@
11 */ 11 */
12 12
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/bitops.h> 16#include <linux/bitops.h>
17#include <linux/poison.h>
18#include <linux/pfn.h>
19#include <linux/debugfs.h>
20#include <linux/seq_file.h>
16#include <linux/memblock.h> 21#include <linux/memblock.h>
17 22
18#define MEMBLOCK_ALLOC_ANYWHERE 0 23struct memblock memblock __initdata_memblock;
19 24
20struct memblock memblock; 25int memblock_debug __initdata_memblock;
26int memblock_can_resize __initdata_memblock;
27static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
28static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
21 29
22static int memblock_debug; 30/* inline so we don't get a warning when pr_debug is compiled out */
31static inline const char *memblock_type_name(struct memblock_type *type)
32{
33 if (type == &memblock.memory)
34 return "memory";
35 else if (type == &memblock.reserved)
36 return "reserved";
37 else
38 return "unknown";
39}
23 40
24static int __init early_memblock(char *p) 41/*
42 * Address comparison utilities
43 */
44
45static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
25{ 46{
26 if (p && strstr(p, "debug")) 47 return addr & ~(size - 1);
27 memblock_debug = 1;
28 return 0;
29} 48}
30early_param("memblock", early_memblock);
31 49
32static void memblock_dump(struct memblock_region *region, char *name) 50static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
33{ 51{
34 unsigned long long base, size; 52 return (addr + (size - 1)) & ~(size - 1);
35 int i; 53}
36 54
37 pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); 55static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
56 phys_addr_t base2, phys_addr_t size2)
57{
58 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
59}
38 60
39 for (i = 0; i < region->cnt; i++) { 61long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
40 base = region->region[i].base; 62{
41 size = region->region[i].size; 63 unsigned long i;
42 64
43 pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n", 65 for (i = 0; i < type->cnt; i++) {
44 name, i, base, base + size - 1, size); 66 phys_addr_t rgnbase = type->regions[i].base;
67 phys_addr_t rgnsize = type->regions[i].size;
68 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
69 break;
45 } 70 }
71
72 return (i < type->cnt) ? i : -1;
46} 73}
47 74
48void memblock_dump_all(void) 75/*
76 * Find, allocate, deallocate or reserve unreserved regions. All allocations
77 * are top-down.
78 */
79
80static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end,
81 phys_addr_t size, phys_addr_t align)
49{ 82{
50 if (!memblock_debug) 83 phys_addr_t base, res_base;
51 return; 84 long j;
52 85
53 pr_info("MEMBLOCK configuration:\n"); 86 /* In case, huge size is requested */
54 pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size); 87 if (end < size)
55 pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size); 88 return MEMBLOCK_ERROR;
56 89
57 memblock_dump(&memblock.memory, "memory"); 90 base = memblock_align_down((end - size), align);
58 memblock_dump(&memblock.reserved, "reserved"); 91
92 /* Prevent allocations returning 0 as it's also used to
93 * indicate an allocation failure
94 */
95 if (start == 0)
96 start = PAGE_SIZE;
97
98 while (start <= base) {
99 j = memblock_overlaps_region(&memblock.reserved, base, size);
100 if (j < 0)
101 return base;
102 res_base = memblock.reserved.regions[j].base;
103 if (res_base < size)
104 break;
105 base = memblock_align_down(res_base - size, align);
106 }
107
108 return MEMBLOCK_ERROR;
59} 109}
60 110
61static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2, 111static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
62 u64 size2) 112 phys_addr_t align, phys_addr_t start, phys_addr_t end)
63{ 113{
64 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); 114 long i;
115
116 BUG_ON(0 == size);
117
118 /* Pump up max_addr */
119 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
120 end = memblock.current_limit;
121
122 /* We do a top-down search, this tends to limit memory
123 * fragmentation by keeping early boot allocs near the
124 * top of memory
125 */
126 for (i = memblock.memory.cnt - 1; i >= 0; i--) {
127 phys_addr_t memblockbase = memblock.memory.regions[i].base;
128 phys_addr_t memblocksize = memblock.memory.regions[i].size;
129 phys_addr_t bottom, top, found;
130
131 if (memblocksize < size)
132 continue;
133 if ((memblockbase + memblocksize) <= start)
134 break;
135 bottom = max(memblockbase, start);
136 top = min(memblockbase + memblocksize, end);
137 if (bottom >= top)
138 continue;
139 found = memblock_find_region(bottom, top, size, align);
140 if (found != MEMBLOCK_ERROR)
141 return found;
142 }
143 return MEMBLOCK_ERROR;
65} 144}
66 145
67static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2) 146/*
147 * Find a free area with specified alignment in a specific range.
148 */
149u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align)
68{ 150{
69 if (base2 == base1 + size1) 151 return memblock_find_base(size, align, start, end);
70 return 1; 152}
71 else if (base1 == base2 + size2)
72 return -1;
73 153
74 return 0; 154/*
155 * Free memblock.reserved.regions
156 */
157int __init_memblock memblock_free_reserved_regions(void)
158{
159 if (memblock.reserved.regions == memblock_reserved_init_regions)
160 return 0;
161
162 return memblock_free(__pa(memblock.reserved.regions),
163 sizeof(struct memblock_region) * memblock.reserved.max);
75} 164}
76 165
77static long memblock_regions_adjacent(struct memblock_region *rgn, 166/*
78 unsigned long r1, unsigned long r2) 167 * Reserve memblock.reserved.regions
168 */
169int __init_memblock memblock_reserve_reserved_regions(void)
79{ 170{
80 u64 base1 = rgn->region[r1].base; 171 if (memblock.reserved.regions == memblock_reserved_init_regions)
81 u64 size1 = rgn->region[r1].size; 172 return 0;
82 u64 base2 = rgn->region[r2].base;
83 u64 size2 = rgn->region[r2].size;
84 173
85 return memblock_addrs_adjacent(base1, size1, base2, size2); 174 return memblock_reserve(__pa(memblock.reserved.regions),
175 sizeof(struct memblock_region) * memblock.reserved.max);
86} 176}
87 177
88static void memblock_remove_region(struct memblock_region *rgn, unsigned long r) 178static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
89{ 179{
90 unsigned long i; 180 unsigned long i;
91 181
92 for (i = r; i < rgn->cnt - 1; i++) { 182 for (i = r; i < type->cnt - 1; i++) {
93 rgn->region[i].base = rgn->region[i + 1].base; 183 type->regions[i].base = type->regions[i + 1].base;
94 rgn->region[i].size = rgn->region[i + 1].size; 184 type->regions[i].size = type->regions[i + 1].size;
95 } 185 }
96 rgn->cnt--; 186 type->cnt--;
97}
98 187
99/* Assumption: base addr of region 1 < base addr of region 2 */ 188 /* Special case for empty arrays */
100static void memblock_coalesce_regions(struct memblock_region *rgn, 189 if (type->cnt == 0) {
101 unsigned long r1, unsigned long r2) 190 type->cnt = 1;
102{ 191 type->regions[0].base = 0;
103 rgn->region[r1].size += rgn->region[r2].size; 192 type->regions[0].size = 0;
104 memblock_remove_region(rgn, r2); 193 }
105} 194}
106 195
107void __init memblock_init(void) 196/* Defined below but needed now */
197static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size);
198
199static int __init_memblock memblock_double_array(struct memblock_type *type)
108{ 200{
109 /* Create a dummy zero size MEMBLOCK which will get coalesced away later. 201 struct memblock_region *new_array, *old_array;
110 * This simplifies the memblock_add() code below... 202 phys_addr_t old_size, new_size, addr;
203 int use_slab = slab_is_available();
204
205 /* We don't allow resizing until we know about the reserved regions
206 * of memory that aren't suitable for allocation
111 */ 207 */
112 memblock.memory.region[0].base = 0; 208 if (!memblock_can_resize)
113 memblock.memory.region[0].size = 0; 209 return -1;
114 memblock.memory.cnt = 1;
115 210
116 /* Ditto. */ 211 /* Calculate new doubled size */
117 memblock.reserved.region[0].base = 0; 212 old_size = type->max * sizeof(struct memblock_region);
118 memblock.reserved.region[0].size = 0; 213 new_size = old_size << 1;
119 memblock.reserved.cnt = 1; 214
120} 215 /* Try to find some space for it.
216 *
 217 * WARNING: We assume that we either use slab (once slab_is_available()) or
 218 * we use MEMBLOCK for allocations. That means that this is unsafe to use
219 * when bootmem is currently active (unless bootmem itself is implemented
220 * on top of MEMBLOCK which isn't the case yet)
221 *
222 * This should however not be an issue for now, as we currently only
223 * call into MEMBLOCK while it's still active, or much later when slab is
224 * active for memory hotplug operations
225 */
226 if (use_slab) {
227 new_array = kmalloc(new_size, GFP_KERNEL);
228 addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array);
229 } else
230 addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE);
231 if (addr == MEMBLOCK_ERROR) {
232 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
233 memblock_type_name(type), type->max, type->max * 2);
234 return -1;
235 }
236 new_array = __va(addr);
121 237
122void __init memblock_analyze(void) 238 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
123{ 239 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
124 int i;
125 240
126 memblock.memory.size = 0; 241 /* Found space, we now need to move the array over before
242 * we add the reserved region since it may be our reserved
243 * array itself that is full.
244 */
245 memcpy(new_array, type->regions, old_size);
246 memset(new_array + type->max, 0, old_size);
247 old_array = type->regions;
248 type->regions = new_array;
249 type->max <<= 1;
250
251 /* If we use SLAB that's it, we are done */
252 if (use_slab)
253 return 0;
127 254
128 for (i = 0; i < memblock.memory.cnt; i++) 255 /* Add the new reserved region now. Should not fail ! */
129 memblock.memory.size += memblock.memory.region[i].size; 256 BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size));
257
258 /* If the array wasn't our static init one, then free it. We only do
259 * that before SLAB is available as later on, we don't know whether
260 * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
261 * anyways
262 */
263 if (old_array != memblock_memory_init_regions &&
264 old_array != memblock_reserved_init_regions)
265 memblock_free(__pa(old_array), old_size);
266
267 return 0;
268}
269
270extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
271 phys_addr_t addr2, phys_addr_t size2)
272{
273 return 1;
130} 274}
131 275
132static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size) 276static long __init_memblock memblock_add_region(struct memblock_type *type,
277 phys_addr_t base, phys_addr_t size)
133{ 278{
134 unsigned long coalesced = 0; 279 phys_addr_t end = base + size;
135 long adjacent, i; 280 int i, slot = -1;
136 281
137 if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) { 282 /* First try and coalesce this MEMBLOCK with others */
138 rgn->region[0].base = base; 283 for (i = 0; i < type->cnt; i++) {
139 rgn->region[0].size = size; 284 struct memblock_region *rgn = &type->regions[i];
140 return 0; 285 phys_addr_t rend = rgn->base + rgn->size;
141 }
142 286
143 /* First try and coalesce this MEMBLOCK with another. */ 287 /* Exit if there's no possible hits */
144 for (i = 0; i < rgn->cnt; i++) { 288 if (rgn->base > end || rgn->size == 0)
145 u64 rgnbase = rgn->region[i].base; 289 break;
146 u64 rgnsize = rgn->region[i].size;
147 290
148 if ((rgnbase == base) && (rgnsize == size)) 291 /* Check if we are fully enclosed within an existing
149 /* Already have this region, so we're done */ 292 * block
293 */
294 if (rgn->base <= base && rend >= end)
150 return 0; 295 return 0;
151 296
152 adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); 297 /* Check if we overlap or are adjacent with the bottom
153 if (adjacent > 0) { 298 * of a block.
154 rgn->region[i].base -= size; 299 */
155 rgn->region[i].size += size; 300 if (base < rgn->base && end >= rgn->base) {
156 coalesced++; 301 /* If we can't coalesce, create a new block */
157 break; 302 if (!memblock_memory_can_coalesce(base, size,
158 } else if (adjacent < 0) { 303 rgn->base,
159 rgn->region[i].size += size; 304 rgn->size)) {
160 coalesced++; 305 /* Overlap & can't coalesce are mutually
161 break; 306 * exclusive, if you do that, be prepared
307 * for trouble
308 */
309 WARN_ON(end != rgn->base);
310 goto new_block;
311 }
312 /* We extend the bottom of the block down to our
313 * base
314 */
315 rgn->base = base;
316 rgn->size = rend - base;
317
318 /* Return if we have nothing else to allocate
319 * (fully coalesced)
320 */
321 if (rend >= end)
322 return 0;
323
324 /* We continue processing from the end of the
325 * coalesced block.
326 */
327 base = rend;
328 size = end - base;
329 }
330
331 /* Now check if we overlap or are adjacent with the
332 * top of a block
333 */
334 if (base <= rend && end >= rend) {
335 /* If we can't coalesce, create a new block */
336 if (!memblock_memory_can_coalesce(rgn->base,
337 rgn->size,
338 base, size)) {
339 /* Overlap & can't coalesce are mutually
340 * exclusive, if you do that, be prepared
341 * for trouble
342 */
343 WARN_ON(rend != base);
344 goto new_block;
345 }
346 /* We adjust our base down to enclose the
347 * original block and destroy it. It will be
348 * part of our new allocation. Since we've
349 * freed an entry, we know we won't fail
350 * to allocate one later, so we won't risk
351 * losing the original block allocation.
352 */
353 size += (base - rgn->base);
354 base = rgn->base;
355 memblock_remove_region(type, i--);
162 } 356 }
163 } 357 }
164 358
165 if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) { 359 /* If the array is empty, special case, replace the fake
166 memblock_coalesce_regions(rgn, i, i+1); 360 * filler region and return
167 coalesced++; 361 */
362 if ((type->cnt == 1) && (type->regions[0].size == 0)) {
363 type->regions[0].base = base;
364 type->regions[0].size = size;
365 return 0;
168 } 366 }
169 367
170 if (coalesced) 368 new_block:
171 return coalesced; 369 /* If we are out of space, we fail. It's too late to resize the array
172 if (rgn->cnt >= MAX_MEMBLOCK_REGIONS) 370 * but then this shouldn't have happened in the first place.
371 */
372 if (WARN_ON(type->cnt >= type->max))
173 return -1; 373 return -1;
174 374
175 /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ 375 /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
176 for (i = rgn->cnt - 1; i >= 0; i--) { 376 for (i = type->cnt - 1; i >= 0; i--) {
177 if (base < rgn->region[i].base) { 377 if (base < type->regions[i].base) {
178 rgn->region[i+1].base = rgn->region[i].base; 378 type->regions[i+1].base = type->regions[i].base;
179 rgn->region[i+1].size = rgn->region[i].size; 379 type->regions[i+1].size = type->regions[i].size;
180 } else { 380 } else {
181 rgn->region[i+1].base = base; 381 type->regions[i+1].base = base;
182 rgn->region[i+1].size = size; 382 type->regions[i+1].size = size;
383 slot = i + 1;
183 break; 384 break;
184 } 385 }
185 } 386 }
387 if (base < type->regions[0].base) {
388 type->regions[0].base = base;
389 type->regions[0].size = size;
390 slot = 0;
391 }
392 type->cnt++;
186 393
187 if (base < rgn->region[0].base) { 394 /* The array is full ? Try to resize it. If that fails, we undo
188 rgn->region[0].base = base; 395 * our allocation and return an error
189 rgn->region[0].size = size; 396 */
397 if (type->cnt == type->max && memblock_double_array(type)) {
398 BUG_ON(slot < 0);
399 memblock_remove_region(type, slot);
400 return -1;
190 } 401 }
191 rgn->cnt++;
192 402
193 return 0; 403 return 0;
194} 404}
195 405
196long memblock_add(u64 base, u64 size) 406long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
197{ 407{
198 struct memblock_region *_rgn = &memblock.memory; 408 return memblock_add_region(&memblock.memory, base, size);
199
200 /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */
201 if (base == 0)
202 memblock.rmo_size = size;
203
204 return memblock_add_region(_rgn, base, size);
205 409
206} 410}
207 411
208static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size) 412static long __init_memblock __memblock_remove(struct memblock_type *type,
413 phys_addr_t base, phys_addr_t size)
209{ 414{
210 u64 rgnbegin, rgnend; 415 phys_addr_t end = base + size;
211 u64 end = base + size;
212 int i; 416 int i;
213 417
214 rgnbegin = rgnend = 0; /* supress gcc warnings */ 418 /* Walk through the array for collisions */
215 419 for (i = 0; i < type->cnt; i++) {
216 /* Find the region where (base, size) belongs to */ 420 struct memblock_region *rgn = &type->regions[i];
217 for (i=0; i < rgn->cnt; i++) { 421 phys_addr_t rend = rgn->base + rgn->size;
218 rgnbegin = rgn->region[i].base;
219 rgnend = rgnbegin + rgn->region[i].size;
220 422
221 if ((rgnbegin <= base) && (end <= rgnend)) 423 /* Nothing more to do, exit */
424 if (rgn->base > end || rgn->size == 0)
222 break; 425 break;
223 }
224 426
225 /* Didn't find the region */ 427 /* If we fully enclose the block, drop it */
226 if (i == rgn->cnt) 428 if (base <= rgn->base && end >= rend) {
227 return -1; 429 memblock_remove_region(type, i--);
430 continue;
431 }
228 432
229 /* Check to see if we are removing entire region */ 433 /* If we are fully enclosed within a block
230 if ((rgnbegin == base) && (rgnend == end)) { 434 * then we need to split it and we are done
231 memblock_remove_region(rgn, i); 435 */
232 return 0; 436 if (base > rgn->base && end < rend) {
233 } 437 rgn->size = base - rgn->base;
438 if (!memblock_add_region(type, end, rend - end))
439 return 0;
440 /* Failure to split is bad, we at least
441 * restore the block before erroring
442 */
443 rgn->size = rend - rgn->base;
444 WARN_ON(1);
445 return -1;
446 }
234 447
235 /* Check to see if region is matching at the front */ 448 /* Check if we need to trim the bottom of a block */
236 if (rgnbegin == base) { 449 if (rgn->base < end && rend > end) {
237 rgn->region[i].base = end; 450 rgn->size -= end - rgn->base;
238 rgn->region[i].size -= size; 451 rgn->base = end;
239 return 0; 452 break;
240 } 453 }
241 454
242 /* Check to see if the region is matching at the end */ 455 /* And check if we need to trim the top of a block */
243 if (rgnend == end) { 456 if (base < rend)
244 rgn->region[i].size -= size; 457 rgn->size -= rend - base;
245 return 0;
246 }
247 458
248 /* 459 }
249 * We need to split the entry - adjust the current one to the 460 return 0;
250 * beginging of the hole and add the region after hole.
251 */
252 rgn->region[i].size = base - rgn->region[i].base;
253 return memblock_add_region(rgn, end, rgnend - end);
254} 461}
255 462
256long memblock_remove(u64 base, u64 size) 463long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
257{ 464{
258 return __memblock_remove(&memblock.memory, base, size); 465 return __memblock_remove(&memblock.memory, base, size);
259} 466}
260 467
261long __init memblock_free(u64 base, u64 size) 468long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
262{ 469{
263 return __memblock_remove(&memblock.reserved, base, size); 470 return __memblock_remove(&memblock.reserved, base, size);
264} 471}
265 472
266long __init memblock_reserve(u64 base, u64 size) 473long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
267{ 474{
268 struct memblock_region *_rgn = &memblock.reserved; 475 struct memblock_type *_rgn = &memblock.reserved;
269 476
270 BUG_ON(0 == size); 477 BUG_ON(0 == size);
271 478
272 return memblock_add_region(_rgn, base, size); 479 return memblock_add_region(_rgn, base, size);
273} 480}
274 481
275long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size) 482phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
276{ 483{
277 unsigned long i; 484 phys_addr_t found;
278 485
279 for (i = 0; i < rgn->cnt; i++) { 486 /* We align the size to limit fragmentation. Without this, a lot of
280 u64 rgnbase = rgn->region[i].base; 487 * small allocs quickly eat up the whole reserve array on sparc
281 u64 rgnsize = rgn->region[i].size; 488 */
282 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) 489 size = memblock_align_up(size, align);
283 break;
284 }
285 490
286 return (i < rgn->cnt) ? i : -1; 491 found = memblock_find_base(size, align, 0, max_addr);
492 if (found != MEMBLOCK_ERROR &&
493 !memblock_add_region(&memblock.reserved, found, size))
494 return found;
495
496 return 0;
287} 497}
288 498
289static u64 memblock_align_down(u64 addr, u64 size) 499phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
290{ 500{
291 return addr & ~(size - 1); 501 phys_addr_t alloc;
502
503 alloc = __memblock_alloc_base(size, align, max_addr);
504
505 if (alloc == 0)
506 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
507 (unsigned long long) size, (unsigned long long) max_addr);
508
509 return alloc;
292} 510}
293 511
294static u64 memblock_align_up(u64 addr, u64 size) 512phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
295{ 513{
296 return (addr + (size - 1)) & ~(size - 1); 514 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
297} 515}
298 516
299static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end, 517
300 u64 size, u64 align) 518/*
519 * Additional node-local allocators. Search for node memory is bottom up
520 * and walks memblock regions within that node bottom-up as well, but allocation
 521 * within a memblock region is top-down. XXX I plan to fix that at some stage
522 *
523 * WARNING: Only available after early_node_map[] has been populated,
524 * on some architectures, that is after all the calls to add_active_range()
525 * have been done to populate it.
526 */
527
528phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid)
301{ 529{
302 u64 base, res_base; 530#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
303 long j; 531 /*
 532 * This code originates from sparc which really wants us to walk by addresses
533 * and returns the nid. This is not very convenient for early_pfn_map[] users
534 * as the map isn't sorted yet, and it really wants to be walked by nid.
535 *
536 * For now, I implement the inefficient method below which walks the early
537 * map multiple times. Eventually we may want to use an ARCH config option
 538 * to implement a completely different method for both cases.
539 */
540 unsigned long start_pfn, end_pfn;
541 int i;
304 542
305 base = memblock_align_down((end - size), align); 543 for (i = 0; i < MAX_NUMNODES; i++) {
306 while (start <= base) { 544 get_pfn_range_for_nid(i, &start_pfn, &end_pfn);
307 j = memblock_overlaps_region(&memblock.reserved, base, size); 545 if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn))
308 if (j < 0) { 546 continue;
309 /* this area isn't reserved, take it */ 547 *nid = i;
310 if (memblock_add_region(&memblock.reserved, base, size) < 0) 548 return min(end, PFN_PHYS(end_pfn));
311 base = ~(u64)0;
312 return base;
313 }
314 res_base = memblock.reserved.region[j].base;
315 if (res_base < size)
316 break;
317 base = memblock_align_down(res_base - size, align);
318 } 549 }
550#endif
551 *nid = 0;
319 552
320 return ~(u64)0; 553 return end;
321} 554}
322 555
323static u64 __init memblock_alloc_nid_region(struct memblock_property *mp, 556static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
324 u64 (*nid_range)(u64, u64, int *), 557 phys_addr_t size,
325 u64 size, u64 align, int nid) 558 phys_addr_t align, int nid)
326{ 559{
327 u64 start, end; 560 phys_addr_t start, end;
328 561
329 start = mp->base; 562 start = mp->base;
330 end = start + mp->size; 563 end = start + mp->size;
331 564
332 start = memblock_align_up(start, align); 565 start = memblock_align_up(start, align);
333 while (start < end) { 566 while (start < end) {
334 u64 this_end; 567 phys_addr_t this_end;
335 int this_nid; 568 int this_nid;
336 569
337 this_end = nid_range(start, end, &this_nid); 570 this_end = memblock_nid_range(start, end, &this_nid);
338 if (this_nid == nid) { 571 if (this_nid == nid) {
339 u64 ret = memblock_alloc_nid_unreserved(start, this_end, 572 phys_addr_t ret = memblock_find_region(start, this_end, size, align);
340 size, align); 573 if (ret != MEMBLOCK_ERROR &&
341 if (ret != ~(u64)0) 574 !memblock_add_region(&memblock.reserved, ret, size))
342 return ret; 575 return ret;
343 } 576 }
344 start = this_end; 577 start = this_end;
345 } 578 }
346 579
347 return ~(u64)0; 580 return MEMBLOCK_ERROR;
348} 581}
349 582
350u64 __init memblock_alloc_nid(u64 size, u64 align, int nid, 583phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
351 u64 (*nid_range)(u64 start, u64 end, int *nid))
352{ 584{
353 struct memblock_region *mem = &memblock.memory; 585 struct memblock_type *mem = &memblock.memory;
354 int i; 586 int i;
355 587
356 BUG_ON(0 == size); 588 BUG_ON(0 == size);
357 589
590 /* We align the size to limit fragmentation. Without this, a lot of
591 * small allocs quickly eat up the whole reserve array on sparc
592 */
358 size = memblock_align_up(size, align); 593 size = memblock_align_up(size, align);
359 594
595 /* We do a bottom-up search for a region with the right
596 * nid since that's easier considering how memblock_nid_range()
597 * works
598 */
360 for (i = 0; i < mem->cnt; i++) { 599 for (i = 0; i < mem->cnt; i++) {
361 u64 ret = memblock_alloc_nid_region(&mem->region[i], 600 phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
362 nid_range,
363 size, align, nid); 601 size, align, nid);
364 if (ret != ~(u64)0) 602 if (ret != MEMBLOCK_ERROR)
365 return ret; 603 return ret;
366 } 604 }
367 605
368 return memblock_alloc(size, align); 606 return 0;
369}
370
371u64 __init memblock_alloc(u64 size, u64 align)
372{
373 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
374} 607}
375 608
376u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr) 609phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
377{ 610{
378 u64 alloc; 611 phys_addr_t res = memblock_alloc_nid(size, align, nid);
379 612
380 alloc = __memblock_alloc_base(size, align, max_addr); 613 if (res)
381 614 return res;
382 if (alloc == 0) 615 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
383 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
384 (unsigned long long) size, (unsigned long long) max_addr);
385
386 return alloc;
387} 616}
388 617
389u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr)
390{
391 long i, j;
392 u64 base = 0;
393 u64 res_base;
394
395 BUG_ON(0 == size);
396
397 size = memblock_align_up(size, align);
398
399 /* On some platforms, make sure we allocate lowmem */
400 /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */
401 if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
402 max_addr = MEMBLOCK_REAL_LIMIT;
403 618
404 for (i = memblock.memory.cnt - 1; i >= 0; i--) { 619/*
405 u64 memblockbase = memblock.memory.region[i].base; 620 * Remaining API functions
406 u64 memblocksize = memblock.memory.region[i].size; 621 */
407
408 if (memblocksize < size)
409 continue;
410 if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
411 base = memblock_align_down(memblockbase + memblocksize - size, align);
412 else if (memblockbase < max_addr) {
413 base = min(memblockbase + memblocksize, max_addr);
414 base = memblock_align_down(base - size, align);
415 } else
416 continue;
417
418 while (base && memblockbase <= base) {
419 j = memblock_overlaps_region(&memblock.reserved, base, size);
420 if (j < 0) {
421 /* this area isn't reserved, take it */
422 if (memblock_add_region(&memblock.reserved, base, size) < 0)
423 return 0;
424 return base;
425 }
426 res_base = memblock.reserved.region[j].base;
427 if (res_base < size)
428 break;
429 base = memblock_align_down(res_base - size, align);
430 }
431 }
432 return 0;
433}
434 622
435/* You must call memblock_analyze() before this. */ 623/* You must call memblock_analyze() before this. */
436u64 __init memblock_phys_mem_size(void) 624phys_addr_t __init memblock_phys_mem_size(void)
437{ 625{
438 return memblock.memory.size; 626 return memblock.memory_size;
439} 627}
440 628
441u64 memblock_end_of_DRAM(void) 629phys_addr_t __init_memblock memblock_end_of_DRAM(void)
442{ 630{
443 int idx = memblock.memory.cnt - 1; 631 int idx = memblock.memory.cnt - 1;
444 632
445 return (memblock.memory.region[idx].base + memblock.memory.region[idx].size); 633 return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
446} 634}
447 635
448/* You must call memblock_analyze() after this. */ 636/* You must call memblock_analyze() after this. */
449void __init memblock_enforce_memory_limit(u64 memory_limit) 637void __init memblock_enforce_memory_limit(phys_addr_t memory_limit)
450{ 638{
451 unsigned long i; 639 unsigned long i;
452 u64 limit; 640 phys_addr_t limit;
453 struct memblock_property *p; 641 struct memblock_region *p;
454 642
455 if (!memory_limit) 643 if (!memory_limit)
456 return; 644 return;
@@ -458,24 +646,21 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
458 /* Truncate the memblock regions to satisfy the memory limit. */ 646 /* Truncate the memblock regions to satisfy the memory limit. */
459 limit = memory_limit; 647 limit = memory_limit;
460 for (i = 0; i < memblock.memory.cnt; i++) { 648 for (i = 0; i < memblock.memory.cnt; i++) {
461 if (limit > memblock.memory.region[i].size) { 649 if (limit > memblock.memory.regions[i].size) {
462 limit -= memblock.memory.region[i].size; 650 limit -= memblock.memory.regions[i].size;
463 continue; 651 continue;
464 } 652 }
465 653
466 memblock.memory.region[i].size = limit; 654 memblock.memory.regions[i].size = limit;
467 memblock.memory.cnt = i + 1; 655 memblock.memory.cnt = i + 1;
468 break; 656 break;
469 } 657 }
470 658
471 if (memblock.memory.region[0].size < memblock.rmo_size)
472 memblock.rmo_size = memblock.memory.region[0].size;
473
474 memory_limit = memblock_end_of_DRAM(); 659 memory_limit = memblock_end_of_DRAM();
475 660
476 /* And truncate any reserves above the limit also. */ 661 /* And truncate any reserves above the limit also. */
477 for (i = 0; i < memblock.reserved.cnt; i++) { 662 for (i = 0; i < memblock.reserved.cnt; i++) {
478 p = &memblock.reserved.region[i]; 663 p = &memblock.reserved.regions[i];
479 664
480 if (p->base > memory_limit) 665 if (p->base > memory_limit)
481 p->size = 0; 666 p->size = 0;
@@ -489,53 +674,190 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
489 } 674 }
490} 675}
491 676
492int __init memblock_is_reserved(u64 addr) 677static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
678{
679 unsigned int left = 0, right = type->cnt;
680
681 do {
682 unsigned int mid = (right + left) / 2;
683
684 if (addr < type->regions[mid].base)
685 right = mid;
686 else if (addr >= (type->regions[mid].base +
687 type->regions[mid].size))
688 left = mid + 1;
689 else
690 return mid;
691 } while (left < right);
692 return -1;
693}
694
695int __init memblock_is_reserved(phys_addr_t addr)
696{
697 return memblock_search(&memblock.reserved, addr) != -1;
698}
699
700int __init_memblock memblock_is_memory(phys_addr_t addr)
701{
702 return memblock_search(&memblock.memory, addr) != -1;
703}
704
705int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
706{
707 int idx = memblock_search(&memblock.memory, base);
708
709 if (idx == -1)
710 return 0;
711 return memblock.memory.regions[idx].base <= base &&
712 (memblock.memory.regions[idx].base +
713 memblock.memory.regions[idx].size) >= (base + size);
714}
715
716int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
717{
718 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
719}
720
721
722void __init_memblock memblock_set_current_limit(phys_addr_t limit)
723{
724 memblock.current_limit = limit;
725}
726
727static void __init_memblock memblock_dump(struct memblock_type *region, char *name)
493{ 728{
729 unsigned long long base, size;
494 int i; 730 int i;
495 731
496 for (i = 0; i < memblock.reserved.cnt; i++) { 732 pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
497 u64 upper = memblock.reserved.region[i].base + 733
498 memblock.reserved.region[i].size - 1; 734 for (i = 0; i < region->cnt; i++) {
499 if ((addr >= memblock.reserved.region[i].base) && (addr <= upper)) 735 base = region->regions[i].base;
500 return 1; 736 size = region->regions[i].size;
737
738 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n",
739 name, i, base, base + size - 1, size);
501 } 740 }
502 return 0;
503} 741}
504 742
505int memblock_is_region_reserved(u64 base, u64 size) 743void __init_memblock memblock_dump_all(void)
506{ 744{
507 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; 745 if (!memblock_debug)
746 return;
747
748 pr_info("MEMBLOCK configuration:\n");
749 pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size);
750
751 memblock_dump(&memblock.memory, "memory");
752 memblock_dump(&memblock.reserved, "reserved");
508} 753}
509 754
510/* 755void __init memblock_analyze(void)
511 * Given a <base, len>, find which memory regions belong to this range.
512 * Adjust the request and return a contiguous chunk.
513 */
514int memblock_find(struct memblock_property *res)
515{ 756{
516 int i; 757 int i;
517 u64 rstart, rend;
518 758
519 rstart = res->base; 759 /* Check marker in the unused last array entry */
520 rend = rstart + res->size - 1; 760 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
761 != (phys_addr_t)RED_INACTIVE);
762 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
763 != (phys_addr_t)RED_INACTIVE);
521 764
522 for (i = 0; i < memblock.memory.cnt; i++) { 765 memblock.memory_size = 0;
523 u64 start = memblock.memory.region[i].base;
524 u64 end = start + memblock.memory.region[i].size - 1;
525 766
526 if (start > rend) 767 for (i = 0; i < memblock.memory.cnt; i++)
527 return -1; 768 memblock.memory_size += memblock.memory.regions[i].size;
769
770 /* We allow resizing from there */
771 memblock_can_resize = 1;
772}
773
774void __init memblock_init(void)
775{
776 static int init_done __initdata = 0;
777
778 if (init_done)
779 return;
780 init_done = 1;
781
782 /* Hookup the initial arrays */
783 memblock.memory.regions = memblock_memory_init_regions;
784 memblock.memory.max = INIT_MEMBLOCK_REGIONS;
785 memblock.reserved.regions = memblock_reserved_init_regions;
786 memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
787
788 /* Write a marker in the unused last array entry */
789 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
790 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
791
792 /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
793 * This simplifies the memblock_add() code below...
794 */
795 memblock.memory.regions[0].base = 0;
796 memblock.memory.regions[0].size = 0;
797 memblock.memory.cnt = 1;
798
799 /* Ditto. */
800 memblock.reserved.regions[0].base = 0;
801 memblock.reserved.regions[0].size = 0;
802 memblock.reserved.cnt = 1;
803
804 memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
805}
806
807static int __init early_memblock(char *p)
808{
809 if (p && strstr(p, "debug"))
810 memblock_debug = 1;
811 return 0;
812}
813early_param("memblock", early_memblock);
814
815#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK)
816
817static int memblock_debug_show(struct seq_file *m, void *private)
818{
819 struct memblock_type *type = m->private;
820 struct memblock_region *reg;
821 int i;
822
823 for (i = 0; i < type->cnt; i++) {
824 reg = &type->regions[i];
825 seq_printf(m, "%4d: ", i);
826 if (sizeof(phys_addr_t) == 4)
827 seq_printf(m, "0x%08lx..0x%08lx\n",
828 (unsigned long)reg->base,
829 (unsigned long)(reg->base + reg->size - 1));
830 else
831 seq_printf(m, "0x%016llx..0x%016llx\n",
832 (unsigned long long)reg->base,
833 (unsigned long long)(reg->base + reg->size - 1));
528 834
529 if ((end >= rstart) && (start < rend)) {
530 /* adjust the request */
531 if (rstart < start)
532 rstart = start;
533 if (rend > end)
534 rend = end;
535 res->base = rstart;
536 res->size = rend - rstart + 1;
537 return 0;
538 }
539 } 835 }
540 return -1; 836 return 0;
837}
838
839static int memblock_debug_open(struct inode *inode, struct file *file)
840{
841 return single_open(file, memblock_debug_show, inode->i_private);
541} 842}
843
844static const struct file_operations memblock_debug_fops = {
845 .open = memblock_debug_open,
846 .read = seq_read,
847 .llseek = seq_lseek,
848 .release = single_release,
849};
850
851static int __init memblock_init_debugfs(void)
852{
853 struct dentry *root = debugfs_create_dir("memblock", NULL);
854 if (!root)
855 return -ENXIO;
856 debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
857 debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
858
859 return 0;
860}
861__initcall(memblock_init_debugfs);
862
863#endif /* CONFIG_DEBUG_FS */
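Editor's note: the lookup helpers added above (memblock_is_region_memory()/memblock_is_region_reserved()) give callers a cheap validity check over the sorted region arrays. A hypothetical driver-side use, not taken from this patch:

#include <linux/errno.h>
#include <linux/memblock.h>

/* Accept a platform-provided buffer only if it lies entirely in RAM and
 * does not collide with an early reservation. */
static int validate_phys_buffer(phys_addr_t base, phys_addr_t size)
{
	if (!memblock_is_region_memory(base, size))
		return -EINVAL;		/* not fully covered by memblock.memory */
	if (memblock_is_region_reserved(base, size))
		return -EBUSY;		/* overlaps memblock.reserved */
	return 0;
}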
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..e013b8e57d25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
35#include <linux/limits.h> 35#include <linux/limits.h>
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/rbtree.h> 37#include <linux/rbtree.h>
38#include <linux/shmem_fs.h>
38#include <linux/slab.h> 39#include <linux/slab.h>
39#include <linux/swap.h> 40#include <linux/swap.h>
40#include <linux/swapops.h> 41#include <linux/swapops.h>
@@ -61,20 +62,18 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
61#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 62#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
62/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 63/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
63int do_swap_account __read_mostly; 64int do_swap_account __read_mostly;
64static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 65
66/* for remembering the boot option */
67#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
68static int really_do_swap_account __initdata = 1;
69#else
70static int really_do_swap_account __initdata = 0;
71#endif
72
65#else 73#else
66#define do_swap_account (0) 74#define do_swap_account (0)
67#endif 75#endif
68 76
69/*
70 * Per memcg event counter is incremented at every pagein/pageout. This counter
71 * is used for trigger some periodic events. This is straightforward and better
72 * than using jiffies etc. to handle periodic memcg event.
73 *
74 * These values will be used as !((event) & ((1 <<(thresh)) - 1))
75 */
76#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
77#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
78 77
79/* 78/*
80 * Statistics for memory cgroup. 79 * Statistics for memory cgroup.
@@ -86,16 +85,40 @@ enum mem_cgroup_stat_index {
86 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 85 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
87 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 86 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
88 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 87 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
89 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 88 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
92 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ 89 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
93 90 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
94 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
95}; 92};
96 93
94enum mem_cgroup_events_index {
95 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
96 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
97 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
98 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
99 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
100 MEM_CGROUP_EVENTS_NSTATS,
101};
102/*
103 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 104 * it will be incremented by the number of pages. This counter is used
 105 * to trigger some periodic events. This is straightforward and better
106 * than using jiffies etc. to handle periodic memcg event.
107 */
108enum mem_cgroup_events_target {
109 MEM_CGROUP_TARGET_THRESH,
110 MEM_CGROUP_TARGET_SOFTLIMIT,
111 MEM_CGROUP_TARGET_NUMAINFO,
112 MEM_CGROUP_NTARGETS,
113};
114#define THRESHOLDS_EVENTS_TARGET (128)
115#define SOFTLIMIT_EVENTS_TARGET (1024)
116#define NUMAINFO_EVENTS_TARGET (1024)
117
97struct mem_cgroup_stat_cpu { 118struct mem_cgroup_stat_cpu {
98 s64 count[MEM_CGROUP_STAT_NSTATS]; 119 long count[MEM_CGROUP_STAT_NSTATS];
120 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
121 unsigned long targets[MEM_CGROUP_NTARGETS];
99}; 122};
100 123
101/* 124/*
@@ -208,17 +231,17 @@ struct mem_cgroup {
208 * per zone LRU lists. 231 * per zone LRU lists.
209 */ 232 */
210 struct mem_cgroup_lru_info info; 233 struct mem_cgroup_lru_info info;
211
212 /*
213 protect against reclaim related member.
214 */
215 spinlock_t reclaim_param_lock;
216
217 /* 234 /*
218 * While reclaiming in a hierarchy, we cache the last child we 235 * While reclaiming in a hierarchy, we cache the last child we
219 * reclaimed from. 236 * reclaimed from.
220 */ 237 */
221 int last_scanned_child; 238 int last_scanned_child;
239 int last_scanned_node;
240#if MAX_NUMNODES > 1
241 nodemask_t scan_nodes;
242 atomic_t numainfo_events;
243 atomic_t numainfo_updating;
244#endif
222 /* 245 /*
223 * Should the accounting and control be hierarchical, per subtree? 246 * Should the accounting and control be hierarchical, per subtree?
224 */ 247 */
@@ -254,6 +277,12 @@ struct mem_cgroup {
254 * percpu counter. 277 * percpu counter.
255 */ 278 */
256 struct mem_cgroup_stat_cpu *stat; 279 struct mem_cgroup_stat_cpu *stat;
280 /*
281 * used when a cpu is offlined or other synchronizations
282 * See mem_cgroup_read_stat().
283 */
284 struct mem_cgroup_stat_cpu nocpu_base;
285 spinlock_t pcp_counter_lock;
257}; 286};
258 287
259/* Stuffs for move charges at task migration. */ 288/* Stuffs for move charges at task migration. */
@@ -269,7 +298,7 @@ enum move_type {
269 298
270/* "mc" and its members are protected by cgroup_mutex */ 299/* "mc" and its members are protected by cgroup_mutex */
271static struct move_charge_struct { 300static struct move_charge_struct {
272 spinlock_t lock; /* for from, to, moving_task */ 301 spinlock_t lock; /* for from, to */
273 struct mem_cgroup *from; 302 struct mem_cgroup *from;
274 struct mem_cgroup *to; 303 struct mem_cgroup *to;
275 unsigned long precharge; 304 unsigned long precharge;
@@ -311,13 +340,6 @@ enum charge_type {
311 NR_CHARGE_TYPE, 340 NR_CHARGE_TYPE,
312}; 341};
313 342
314/* only for here (for easy reading.) */
315#define PCGF_CACHE (1UL << PCG_CACHE)
316#define PCGF_USED (1UL << PCG_USED)
317#define PCGF_LOCK (1UL << PCG_LOCK)
318/* Not used, but added here for completeness */
319#define PCGF_ACCT (1UL << PCG_ACCT)
320
321/* for encoding cft->private value on file */ 343/* for encoding cft->private value on file */
322#define _MEM (0) 344#define _MEM (0)
323#define _MEMSWAP (1) 345#define _MEMSWAP (1)
@@ -341,7 +363,7 @@ enum charge_type {
341static void mem_cgroup_get(struct mem_cgroup *mem); 363static void mem_cgroup_get(struct mem_cgroup *mem);
342static void mem_cgroup_put(struct mem_cgroup *mem); 364static void mem_cgroup_put(struct mem_cgroup *mem);
343static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 365static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
344static void drain_all_stock_async(void); 366static void drain_all_stock_async(struct mem_cgroup *mem);
345 367
346static struct mem_cgroup_per_zone * 368static struct mem_cgroup_per_zone *
347mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 369mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -355,14 +377,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
355} 377}
356 378
357static struct mem_cgroup_per_zone * 379static struct mem_cgroup_per_zone *
358page_cgroup_zoneinfo(struct page_cgroup *pc) 380page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
359{ 381{
360 struct mem_cgroup *mem = pc->mem_cgroup; 382 int nid = page_to_nid(page);
361 int nid = page_cgroup_nid(pc); 383 int zid = page_zonenum(page);
362 int zid = page_cgroup_zid(pc);
363
364 if (!mem)
365 return NULL;
366 384
367 return mem_cgroup_zoneinfo(mem, nid, zid); 385 return mem_cgroup_zoneinfo(mem, nid, zid);
368} 386}
@@ -488,11 +506,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
488 } 506 }
489} 507}
490 508
491static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
492{
493 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
494}
495
496static struct mem_cgroup_per_zone * 509static struct mem_cgroup_per_zone *
497__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 510__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
498{ 511{
@@ -530,26 +543,43 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
530 return mz; 543 return mz;
531} 544}
532 545
533static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, 546/*
534 enum mem_cgroup_stat_index idx) 547 * Implementation Note: reading percpu statistics for memcg.
548 *
549 * Both of vmstat[] and percpu_counter has threshold and do periodic
 550 * synchronization to implement "quick" reads. There is a trade-off between
 551 * reading cost and precision of value. Then, we may have a chance to implement
 552 * a periodic synchronization of the counter in memcg's counter.
553 *
554 * But this _read() function is used for user interface now. The user accounts
 555 * memory usage by memory cgroup and _always_ requires an exact value because
 556 * they account memory. Even if we provide a quick-and-fuzzy read, we always
 557 * have to visit all online cpus and make the sum. So, for now, unnecessary
558 * synchronization is not implemented. (just implemented for cpu hotplug)
559 *
560 * If there are kernel internal actions which can make use of some not-exact
 561 * value, and reading all cpu values can be a performance bottleneck in some
 562 * common workloads, a threshold and synchronization as in vmstat[] should be
563 * implemented.
564 */
565static long mem_cgroup_read_stat(struct mem_cgroup *mem,
566 enum mem_cgroup_stat_index idx)
535{ 567{
568 long val = 0;
536 int cpu; 569 int cpu;
537 s64 val = 0;
538 570
539 for_each_possible_cpu(cpu) 571 get_online_cpus();
572 for_each_online_cpu(cpu)
540 val += per_cpu(mem->stat->count[idx], cpu); 573 val += per_cpu(mem->stat->count[idx], cpu);
574#ifdef CONFIG_HOTPLUG_CPU
575 spin_lock(&mem->pcp_counter_lock);
576 val += mem->nocpu_base.count[idx];
577 spin_unlock(&mem->pcp_counter_lock);
578#endif
579 put_online_cpus();
541 return val; 580 return val;
542} 581}
543 582
544static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
545{
546 s64 ret;
547
548 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
549 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
550 return ret;
551}
552
553static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 583static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
554 bool charge) 584 bool charge)
555{ 585{
@@ -557,50 +587,110 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
557 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 587 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
558} 588}
559 589
560static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 590void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
561 struct page_cgroup *pc,
562 bool charge)
563{ 591{
564 int val = (charge) ? 1 : -1; 592 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
593}
594
595void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
596{
597 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
598}
599
600static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
601 enum mem_cgroup_events_index idx)
602{
603 unsigned long val = 0;
604 int cpu;
605
606 for_each_online_cpu(cpu)
607 val += per_cpu(mem->stat->events[idx], cpu);
608#ifdef CONFIG_HOTPLUG_CPU
609 spin_lock(&mem->pcp_counter_lock);
610 val += mem->nocpu_base.events[idx];
611 spin_unlock(&mem->pcp_counter_lock);
612#endif
613 return val;
614}
565 615
616static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
617 bool file, int nr_pages)
618{
566 preempt_disable(); 619 preempt_disable();
567 620
568 if (PageCgroupCache(pc)) 621 if (file)
569 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); 622 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
570 else 623 else
571 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); 624 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
625
626 /* pagein of a big page is an event. So, ignore page size */
627 if (nr_pages > 0)
628 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
629 else {
630 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
631 nr_pages = -nr_pages; /* for event */
632 }
572 633
573 if (charge) 634 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
574 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
575 else
576 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
577 __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
578 635
579 preempt_enable(); 636 preempt_enable();
580} 637}
581 638
639static unsigned long
640mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
641{
642 struct mem_cgroup_per_zone *mz;
643 u64 total = 0;
644 int zid;
645
646 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
647 mz = mem_cgroup_zoneinfo(mem, nid, zid);
648 total += MEM_CGROUP_ZSTAT(mz, idx);
649 }
650 return total;
651}
582static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 652static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
583 enum lru_list idx) 653 enum lru_list idx)
584{ 654{
585 int nid, zid; 655 int nid;
586 struct mem_cgroup_per_zone *mz;
587 u64 total = 0; 656 u64 total = 0;
588 657
589 for_each_online_node(nid) 658 for_each_online_node(nid)
590 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 659 total += mem_cgroup_get_zonestat_node(mem, nid, idx);
591 mz = mem_cgroup_zoneinfo(mem, nid, zid);
592 total += MEM_CGROUP_ZSTAT(mz, idx);
593 }
594 return total; 660 return total;
595} 661}
596 662
597static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) 663static bool __memcg_event_check(struct mem_cgroup *mem, int target)
598{ 664{
599 s64 val; 665 unsigned long val, next;
666
667 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
668 next = this_cpu_read(mem->stat->targets[target]);
669 /* from time_after() in jiffies.h */
670 return ((long)next - (long)val < 0);
671}
672
673static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
674{
675 unsigned long val, next;
600 676
601 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); 677 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
602 678
603 return !(val & ((1 << event_mask_shift) - 1)); 679 switch (target) {
680 case MEM_CGROUP_TARGET_THRESH:
681 next = val + THRESHOLDS_EVENTS_TARGET;
682 break;
683 case MEM_CGROUP_TARGET_SOFTLIMIT:
684 next = val + SOFTLIMIT_EVENTS_TARGET;
685 break;
686 case MEM_CGROUP_TARGET_NUMAINFO:
687 next = val + NUMAINFO_EVENTS_TARGET;
688 break;
689 default:
690 return;
691 }
692
693 this_cpu_write(mem->stat->targets[target], next);
604} 694}
605 695
606/* 696/*
@@ -610,10 +700,23 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
610static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 700static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
611{ 701{
612 /* threshold event is triggered in finer grain than soft limit */ 702 /* threshold event is triggered in finer grain than soft limit */
613 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { 703 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
614 mem_cgroup_threshold(mem); 704 mem_cgroup_threshold(mem);
615 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) 705 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
706 if (unlikely(__memcg_event_check(mem,
707 MEM_CGROUP_TARGET_SOFTLIMIT))) {
616 mem_cgroup_update_tree(mem, page); 708 mem_cgroup_update_tree(mem, page);
709 __mem_cgroup_target_update(mem,
710 MEM_CGROUP_TARGET_SOFTLIMIT);
711 }
712#if MAX_NUMNODES > 1
713 if (unlikely(__memcg_event_check(mem,
714 MEM_CGROUP_TARGET_NUMAINFO))) {
715 atomic_inc(&mem->numainfo_events);
716 __mem_cgroup_target_update(mem,
717 MEM_CGROUP_TARGET_NUMAINFO);
718 }
719#endif
617 } 720 }
618} 721}
619 722
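Editor's note: the __memcg_event_check() test above borrows the time_after() trick from jiffies.h, so the comparison stays correct after the unsigned event counter wraps. A small standalone illustration (userspace C, values made up, assuming the usual two's-complement targets):

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned long target = ULONG_MAX - 5;	/* threshold target set near the wrap point */
	unsigned long events = target + 128;	/* running counter has since wrapped past it */

	/* Same test shape as __memcg_event_check(): a negative signed
	 * difference means the counter has passed the target. */
	bool due = ((long)target - (long)events < 0);

	printf("threshold due: %s\n", due ? "yes" : "no");	/* prints "yes" */
	return 0;
}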
@@ -638,7 +741,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
638 struct mem_cgroup, css); 741 struct mem_cgroup, css);
639} 742}
640 743
641static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 744struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
642{ 745{
643 struct mem_cgroup *mem = NULL; 746 struct mem_cgroup *mem = NULL;
644 747
@@ -659,46 +762,116 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
659 return mem; 762 return mem;
660} 763}
661 764
662/* 765/* The caller has to guarantee "mem" exists before calling this */
663 * Call callback function against all cgroup under hierarchy tree. 766static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
664 */
665static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
666 int (*func)(struct mem_cgroup *, void *))
667{ 767{
668 int found, ret, nextid;
669 struct cgroup_subsys_state *css; 768 struct cgroup_subsys_state *css;
670 struct mem_cgroup *mem; 769 int found;
671
672 if (!root->use_hierarchy)
673 return (*func)(root, data);
674 770
675 nextid = 1; 771 if (!mem) /* ROOT cgroup has the smallest ID */
676 do { 772 return root_mem_cgroup; /*css_put/get against root is ignored*/
677 ret = 0; 773 if (!mem->use_hierarchy) {
774 if (css_tryget(&mem->css))
775 return mem;
776 return NULL;
777 }
778 rcu_read_lock();
779 /*
780 * searching a memory cgroup which has the smallest ID under given
781 * ROOT cgroup. (ID >= 1)
782 */
783 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
784 if (css && css_tryget(css))
785 mem = container_of(css, struct mem_cgroup, css);
786 else
678 mem = NULL; 787 mem = NULL;
788 rcu_read_unlock();
789 return mem;
790}
791
792static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
793 struct mem_cgroup *root,
794 bool cond)
795{
796 int nextid = css_id(&iter->css) + 1;
797 int found;
798 int hierarchy_used;
799 struct cgroup_subsys_state *css;
800
801 hierarchy_used = iter->use_hierarchy;
679 802
803 css_put(&iter->css);
804 /* If no ROOT, walk all, ignore hierarchy */
805 if (!cond || (root && !hierarchy_used))
806 return NULL;
807
808 if (!root)
809 root = root_mem_cgroup;
810
811 do {
812 iter = NULL;
680 rcu_read_lock(); 813 rcu_read_lock();
681 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, 814
682 &found); 815 css = css_get_next(&mem_cgroup_subsys, nextid,
816 &root->css, &found);
683 if (css && css_tryget(css)) 817 if (css && css_tryget(css))
684 mem = container_of(css, struct mem_cgroup, css); 818 iter = container_of(css, struct mem_cgroup, css);
685 rcu_read_unlock(); 819 rcu_read_unlock();
686 820 /* If css is NULL, no more cgroups will be found */
687 if (mem) {
688 ret = (*func)(mem, data);
689 css_put(&mem->css);
690 }
691 nextid = found + 1; 821 nextid = found + 1;
692 } while (!ret && css); 822 } while (css && !iter);
693 823
694 return ret; 824 return iter;
695} 825}
826/*
 827 * for_each_mem_cgroup_tree() for visiting all cgroups under a tree. Please
 828 * be careful that breaking out of the loop is not allowed; we hold a reference count.
 829 * Instead, modify "cond" to be false and "continue" to exit the loop.
830 */
831#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
832 for (iter = mem_cgroup_start_loop(root);\
833 iter != NULL;\
834 iter = mem_cgroup_get_next(iter, root, cond))
835
836#define for_each_mem_cgroup_tree(iter, root) \
837 for_each_mem_cgroup_tree_cond(iter, root, true)
838
839#define for_each_mem_cgroup_all(iter) \
840 for_each_mem_cgroup_tree_cond(iter, NULL, true)
841
696 842
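As a rough illustration of the iteration contract these macros impose (never break; drop "cond" and continue so the reference taken on the current iterator is still released), here is a minimal userspace model; struct group, next_group_by_id() and the refcnt field are made-up stand-ins for the css/css_get_next() machinery, not the kernel API.

/* Minimal userspace model of the for_each_mem_cgroup_tree_cond() contract. */
#include <stdbool.h>
#include <stdio.h>

struct group { int id; int refcnt; };

static struct group groups[] = { {1, 0}, {2, 0}, {3, 0}, {4, 0} };
#define NGROUPS (sizeof(groups) / sizeof(groups[0]))

static struct group *next_group_by_id(int nextid)
{
	for (size_t i = 0; i < NGROUPS; i++)
		if (groups[i].id >= nextid) {
			groups[i].refcnt++;	/* models css_tryget() */
			return &groups[i];
		}
	return NULL;
}

static struct group *get_next(struct group *iter, bool cond)
{
	int nextid = iter->id + 1;

	iter->refcnt--;				/* models css_put() */
	return cond ? next_group_by_id(nextid) : NULL;
}

#define for_each_group_cond(iter, cond) \
	for (iter = next_group_by_id(1); iter; iter = get_next(iter, cond))

int main(void)
{
	struct group *it;
	bool cond = true;

	for_each_group_cond(it, cond) {
		printf("visiting %d\n", it->id);
		if (it->id == 3)
			cond = false;	/* never "break": flip cond and continue */
	}
	return 0;
}

Because "cond" is re-read by the macro on every step, the reference on the last visited group is still dropped before the walk stops, which is exactly why the comment forbids a plain break.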
697static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 843static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
698{ 844{
699 return (mem == root_mem_cgroup); 845 return (mem == root_mem_cgroup);
700} 846}
701 847
848void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
849{
850 struct mem_cgroup *mem;
851
852 if (!mm)
853 return;
854
855 rcu_read_lock();
856 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
857 if (unlikely(!mem))
858 goto out;
859
860 switch (idx) {
861 case PGMAJFAULT:
862 mem_cgroup_pgmajfault(mem, 1);
863 break;
864 case PGFAULT:
865 mem_cgroup_pgfault(mem, 1);
866 break;
867 default:
868 BUG();
869 }
870out:
871 rcu_read_unlock();
872}
873EXPORT_SYMBOL(mem_cgroup_count_vm_event);
874
702/* 875/*
703 * Following LRU functions are allowed to be used without PCG_LOCK. 876 * Following LRU functions are allowed to be used without PCG_LOCK.
704 * Operations are called by routine of global LRU independently from memcg. 877 * Operations are called by routine of global LRU independently from memcg.
@@ -729,13 +902,13 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
729 * We don't check PCG_USED bit. It's cleared when the "page" is finally 902 * We don't check PCG_USED bit. It's cleared when the "page" is finally
730 * removed from global LRU. 903 * removed from global LRU.
731 */ 904 */
732 mz = page_cgroup_zoneinfo(pc); 905 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
733 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 906 /* huge page split is done under lru_lock. so, we have no races. */
907 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
734 if (mem_cgroup_is_root(pc->mem_cgroup)) 908 if (mem_cgroup_is_root(pc->mem_cgroup))
735 return; 909 return;
736 VM_BUG_ON(list_empty(&pc->lru)); 910 VM_BUG_ON(list_empty(&pc->lru));
737 list_del_init(&pc->lru); 911 list_del_init(&pc->lru);
738 return;
739} 912}
740 913
741void mem_cgroup_del_lru(struct page *page) 914void mem_cgroup_del_lru(struct page *page)
@@ -743,24 +916,49 @@ void mem_cgroup_del_lru(struct page *page)
743 mem_cgroup_del_lru_list(page, page_lru(page)); 916 mem_cgroup_del_lru_list(page, page_lru(page));
744} 917}
745 918
746void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 919/*
920 * Writeback is about to end against a page which has been marked for immediate
921 * reclaim. If it still appears to be reclaimable, move it to the tail of the
922 * inactive list.
923 */
924void mem_cgroup_rotate_reclaimable_page(struct page *page)
747{ 925{
748 struct mem_cgroup_per_zone *mz; 926 struct mem_cgroup_per_zone *mz;
749 struct page_cgroup *pc; 927 struct page_cgroup *pc;
928 enum lru_list lru = page_lru(page);
750 929
751 if (mem_cgroup_disabled()) 930 if (mem_cgroup_disabled())
752 return; 931 return;
753 932
754 pc = lookup_page_cgroup(page); 933 pc = lookup_page_cgroup(page);
755 /* 934 /* unused or root page is not rotated. */
756 * Used bit is set without atomic ops but after smp_wmb(). 935 if (!PageCgroupUsed(pc))
757 * For making pc->mem_cgroup visible, insert smp_rmb() here. 936 return;
758 */ 937 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
759 smp_rmb(); 938 smp_rmb();
939 if (mem_cgroup_is_root(pc->mem_cgroup))
940 return;
941 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
942 list_move_tail(&pc->lru, &mz->lists[lru]);
943}
944
945void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
946{
947 struct mem_cgroup_per_zone *mz;
948 struct page_cgroup *pc;
949
950 if (mem_cgroup_disabled())
951 return;
952
953 pc = lookup_page_cgroup(page);
760 /* unused or root page is not rotated. */ 954 /* unused or root page is not rotated. */
761 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) 955 if (!PageCgroupUsed(pc))
956 return;
957 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
958 smp_rmb();
959 if (mem_cgroup_is_root(pc->mem_cgroup))
762 return; 960 return;
763 mz = page_cgroup_zoneinfo(pc); 961 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
764 list_move(&pc->lru, &mz->lists[lru]); 962 list_move(&pc->lru, &mz->lists[lru]);
765} 963}
766 964
@@ -773,16 +971,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
773 return; 971 return;
774 pc = lookup_page_cgroup(page); 972 pc = lookup_page_cgroup(page);
775 VM_BUG_ON(PageCgroupAcctLRU(pc)); 973 VM_BUG_ON(PageCgroupAcctLRU(pc));
776 /*
777 * Used bit is set without atomic ops but after smp_wmb().
778 * For making pc->mem_cgroup visible, insert smp_rmb() here.
779 */
780 smp_rmb();
781 if (!PageCgroupUsed(pc)) 974 if (!PageCgroupUsed(pc))
782 return; 975 return;
783 976 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
784 mz = page_cgroup_zoneinfo(pc); 977 smp_rmb();
785 MEM_CGROUP_ZSTAT(mz, lru) += 1; 978 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
979 /* huge page split is done under lru_lock. so, we have no races. */
980 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
786 SetPageCgroupAcctLRU(pc); 981 SetPageCgroupAcctLRU(pc);
787 if (mem_cgroup_is_root(pc->mem_cgroup)) 982 if (mem_cgroup_is_root(pc->mem_cgroup))
788 return; 983 return;
@@ -790,18 +985,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
790} 985}
791 986
792/* 987/*
793 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to 988 * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
794 * lru because the page may.be reused after it's fully uncharged (because of 989 * while it's linked to lru because the page may be reused after it's fully
795 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge 990 * uncharged. To handle that, unlink page_cgroup from LRU when charging it again.
796 * it again. This function is only used to charge SwapCache. It's done under 991 * It's done under lock_page and expected that zone->lru_lock is never held.
797 * lock_page and expected that zone->lru_lock is never held.
798 */ 992 */
799static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) 993static void mem_cgroup_lru_del_before_commit(struct page *page)
800{ 994{
801 unsigned long flags; 995 unsigned long flags;
802 struct zone *zone = page_zone(page); 996 struct zone *zone = page_zone(page);
803 struct page_cgroup *pc = lookup_page_cgroup(page); 997 struct page_cgroup *pc = lookup_page_cgroup(page);
804 998
999 /*
1000 * Doing this check without taking ->lru_lock seems wrong but this
1001 * is safe. Because if page_cgroup's USED bit is unset, the page
1002 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
1003 * set, the commit after this will fail, anyway.
1004 * All of this charge/uncharge is done under some mutual exclusion.
1005 * So, we don't need to take care of changes in the USED bit.
1006 */
1007 if (likely(!PageLRU(page)))
1008 return;
1009
805 spin_lock_irqsave(&zone->lru_lock, flags); 1010 spin_lock_irqsave(&zone->lru_lock, flags);
806 /* 1011 /*
807 * Forget old LRU when this page_cgroup is *not* used. This Used bit 1012 * Forget old LRU when this page_cgroup is *not* used. This Used bit
@@ -812,12 +1017,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
812 spin_unlock_irqrestore(&zone->lru_lock, flags); 1017 spin_unlock_irqrestore(&zone->lru_lock, flags);
813} 1018}
814 1019
815static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) 1020static void mem_cgroup_lru_add_after_commit(struct page *page)
816{ 1021{
817 unsigned long flags; 1022 unsigned long flags;
818 struct zone *zone = page_zone(page); 1023 struct zone *zone = page_zone(page);
819 struct page_cgroup *pc = lookup_page_cgroup(page); 1024 struct page_cgroup *pc = lookup_page_cgroup(page);
820 1025
1026 /* take care of the case where the page is added to the LRU while we commit it */
1027 if (likely(!PageLRU(page)))
1028 return;
821 spin_lock_irqsave(&zone->lru_lock, flags); 1029 spin_lock_irqsave(&zone->lru_lock, flags);
822 /* link when the page is linked to LRU but page_cgroup isn't */ 1030 /* link when the page is linked to LRU but page_cgroup isn't */
823 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 1031 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
@@ -915,9 +1123,9 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
915 return (active > inactive); 1123 return (active > inactive);
916} 1124}
917 1125
918unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 1126unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
919 struct zone *zone, 1127 struct zone *zone,
920 enum lru_list lru) 1128 enum lru_list lru)
921{ 1129{
922 int nid = zone_to_nid(zone); 1130 int nid = zone_to_nid(zone);
923 int zid = zone_idx(zone); 1131 int zid = zone_idx(zone);
@@ -926,6 +1134,92 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
926 return MEM_CGROUP_ZSTAT(mz, lru); 1134 return MEM_CGROUP_ZSTAT(mz, lru);
927} 1135}
928 1136
1137static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
1138 int nid)
1139{
1140 unsigned long ret;
1141
1142 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
1143 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
1144
1145 return ret;
1146}
1147
1148static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
1149 int nid)
1150{
1151 unsigned long ret;
1152
1153 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
1154 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
1155 return ret;
1156}
1157
1158#if MAX_NUMNODES > 1
1159static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
1160{
1161 u64 total = 0;
1162 int nid;
1163
1164 for_each_node_state(nid, N_HIGH_MEMORY)
1165 total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
1166
1167 return total;
1168}
1169
1170static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
1171{
1172 u64 total = 0;
1173 int nid;
1174
1175 for_each_node_state(nid, N_HIGH_MEMORY)
1176 total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
1177
1178 return total;
1179}
1180
1181static unsigned long
1182mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
1183{
1184 return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
1185}
1186
1187static unsigned long
1188mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
1189{
1190 u64 total = 0;
1191 int nid;
1192
1193 for_each_node_state(nid, N_HIGH_MEMORY)
1194 total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
1195
1196 return total;
1197}
1198
1199static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
1200 int nid)
1201{
1202 enum lru_list l;
1203 u64 total = 0;
1204
1205 for_each_lru(l)
1206 total += mem_cgroup_get_zonestat_node(memcg, nid, l);
1207
1208 return total;
1209}
1210
1211static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
1212{
1213 u64 total = 0;
1214 int nid;
1215
1216 for_each_node_state(nid, N_HIGH_MEMORY)
1217 total += mem_cgroup_node_nr_lru_pages(memcg, nid);
1218
1219 return total;
1220}
1221#endif /* MAX_NUMNODES > 1 */
1222
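The per-node helpers above just sum per-zone LRU counters along one axis or the other; a rough self-contained sketch of that aggregation, with a plain 2-D array standing in for the memcg's per-zone statistics.

/* Illustrative aggregation of per-node, per-LRU page counts (not kernel code). */
#include <stdio.h>

enum lru { INACTIVE_ANON, ACTIVE_ANON, INACTIVE_FILE, ACTIVE_FILE, UNEVICTABLE, NR_LRU };
#define NR_NODES 2

static unsigned long zstat[NR_NODES][NR_LRU] = {
	{ 10, 5, 100, 40, 0 },
	{  2, 1,  30, 10, 3 },
};

static unsigned long node_nr_file(int nid)
{
	return zstat[nid][INACTIVE_FILE] + zstat[nid][ACTIVE_FILE];
}

static unsigned long nr_lru_pages(void)
{
	unsigned long total = 0;

	for (int nid = 0; nid < NR_NODES; nid++)
		for (int l = 0; l < NR_LRU; l++)
			total += zstat[nid][l];
	return total;
}

int main(void)
{
	printf("file pages on node 0: %lu\n", node_nr_file(0));
	printf("all LRU pages:        %lu\n", nr_lru_pages());
	return 0;
}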
929struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1223struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
930 struct zone *zone) 1224 struct zone *zone)
931{ 1225{
@@ -946,18 +1240,11 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
946 return NULL; 1240 return NULL;
947 1241
948 pc = lookup_page_cgroup(page); 1242 pc = lookup_page_cgroup(page);
949 /*
950 * Used bit is set without atomic ops but after smp_wmb().
951 * For making pc->mem_cgroup visible, insert smp_rmb() here.
952 */
953 smp_rmb();
954 if (!PageCgroupUsed(pc)) 1243 if (!PageCgroupUsed(pc))
955 return NULL; 1244 return NULL;
956 1245 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
957 mz = page_cgroup_zoneinfo(pc); 1246 smp_rmb();
958 if (!mz) 1247 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
959 return NULL;
960
961 return &mz->reclaim_stat; 1248 return &mz->reclaim_stat;
962} 1249}
963 1250
@@ -989,9 +1276,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
989 if (scan >= nr_to_scan) 1276 if (scan >= nr_to_scan)
990 break; 1277 break;
991 1278
992 page = pc->page;
993 if (unlikely(!PageCgroupUsed(pc))) 1279 if (unlikely(!PageCgroupUsed(pc)))
994 continue; 1280 continue;
1281
1282 page = lookup_cgroup_page(pc);
1283
995 if (unlikely(!PageLRU(page))) 1284 if (unlikely(!PageLRU(page)))
996 continue; 1285 continue;
997 1286
@@ -1001,7 +1290,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1001 case 0: 1290 case 0:
1002 list_move(&page->lru, dst); 1291 list_move(&page->lru, dst);
1003 mem_cgroup_del_lru(page); 1292 mem_cgroup_del_lru(page);
1004 nr_taken++; 1293 nr_taken += hpage_nr_pages(page);
1005 break; 1294 break;
1006 case -EBUSY: 1295 case -EBUSY:
1007 /* we don't affect global LRU but rotate in our LRU */ 1296 /* we don't affect global LRU but rotate in our LRU */
@@ -1023,35 +1312,80 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1023#define mem_cgroup_from_res_counter(counter, member) \ 1312#define mem_cgroup_from_res_counter(counter, member) \
1024 container_of(counter, struct mem_cgroup, member) 1313 container_of(counter, struct mem_cgroup, member)
1025 1314
1026static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 1315/**
1316 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1317 * @mem: the memory cgroup
1318 *
1319 * Returns the maximum amount of memory @mem can be charged with, in
1320 * pages.
1321 */
1322static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1027{ 1323{
1028 if (do_swap_account) { 1324 unsigned long long margin;
1029 if (res_counter_check_under_limit(&mem->res) && 1325
1030 res_counter_check_under_limit(&mem->memsw)) 1326 margin = res_counter_margin(&mem->res);
1031 return true; 1327 if (do_swap_account)
1032 } else 1328 margin = min(margin, res_counter_margin(&mem->memsw));
1033 if (res_counter_check_under_limit(&mem->res)) 1329 return margin >> PAGE_SHIFT;
1034 return true;
1035 return false;
1036} 1330}
1037 1331
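In byte terms the margin is min(limit - usage) over the res and, when swap accounting is on, the memsw counters, then converted to pages. A minimal sketch with made-up counter values; counter_margin() plays the role of res_counter_margin() (limit minus usage, clamped at zero).

/* Sketch of the margin calculation in pages; the counter values are hypothetical. */
#include <stdio.h>

#define PAGE_SHIFT 12

struct counter { unsigned long long usage, limit; };

static unsigned long long counter_margin(const struct counter *c)
{
	return c->limit > c->usage ? c->limit - c->usage : 0;
}

int main(void)
{
	struct counter res   = { .usage = 200ULL << 20, .limit = 256ULL << 20 };
	struct counter memsw = { .usage = 220ULL << 20, .limit = 240ULL << 20 };
	int do_swap_account = 1;

	unsigned long long margin = counter_margin(&res);
	if (do_swap_account && counter_margin(&memsw) < margin)
		margin = counter_margin(&memsw);

	/* 20 MiB of headroom -> 5120 pages with 4 KiB pages */
	printf("margin: %llu pages\n", margin >> PAGE_SHIFT);
	return 0;
}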
1038static unsigned int get_swappiness(struct mem_cgroup *memcg) 1332static unsigned int get_swappiness(struct mem_cgroup *memcg)
1039{ 1333{
1040 struct cgroup *cgrp = memcg->css.cgroup; 1334 struct cgroup *cgrp = memcg->css.cgroup;
1041 unsigned int swappiness;
1042 1335
1043 /* root ? */ 1336 /* root ? */
1044 if (cgrp->parent == NULL) 1337 if (cgrp->parent == NULL)
1045 return vm_swappiness; 1338 return vm_swappiness;
1046 1339
1047 spin_lock(&memcg->reclaim_param_lock); 1340 return memcg->swappiness;
1048 swappiness = memcg->swappiness; 1341}
1049 spin_unlock(&memcg->reclaim_param_lock); 1342
1343static void mem_cgroup_start_move(struct mem_cgroup *mem)
1344{
1345 int cpu;
1346
1347 get_online_cpus();
1348 spin_lock(&mem->pcp_counter_lock);
1349 for_each_online_cpu(cpu)
1350 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1351 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1352 spin_unlock(&mem->pcp_counter_lock);
1353 put_online_cpus();
1050 1354
1051 return swappiness; 1355 synchronize_rcu();
1052} 1356}
1053 1357
1054/* A routine for testing mem is not under move_account */ 1358static void mem_cgroup_end_move(struct mem_cgroup *mem)
1359{
1360 int cpu;
1361
1362 if (!mem)
1363 return;
1364 get_online_cpus();
1365 spin_lock(&mem->pcp_counter_lock);
1366 for_each_online_cpu(cpu)
1367 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1368 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1369 spin_unlock(&mem->pcp_counter_lock);
1370 put_online_cpus();
1371}
1372/*
1373 * 2 routines for checking "mem" is under move_account() or not.
1374 *
1375 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
1376 * for avoiding race in accounting. If true,
1377 * pc->mem_cgroup may be overwritten.
1378 *
1379 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
1380 * under hierarchy of moving cgroups. This is for
1381 * waiting at hith-memory prressure caused by "move".
1382 */
1383
1384static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1385{
1386 VM_BUG_ON(!rcu_read_lock_held());
1387 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1388}
1055 1389
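The start/end pair above effectively raises a per-cpu "an account move may be in flight" counter on every cpu so that readers only have to look at their local copy. A rough single-file model of that shape, with plain arrays in place of the percpu variables and without the spinlock or synchronize_rcu(); it is an illustration of the pattern, not the kernel code.

/* Userspace model of the MEM_CGROUP_ON_MOVE pattern; arrays replace percpu data. */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static long on_move[NR_CPUS];	/* per-cpu counter, raised while a move is running */

static void start_move(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		on_move[cpu] += 1;
	/* the kernel also waits for readers here (synchronize_rcu()) */
}

static void end_move(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		on_move[cpu] -= 1;
}

static bool maybe_stolen(int this_cpu)
{
	/* cheap local read: pc->mem_cgroup may be rewritten if this is nonzero */
	return on_move[this_cpu] > 0;
}

int main(void)
{
	printf("before move: %d\n", maybe_stolen(0));
	start_move();
	printf("during move: %d\n", maybe_stolen(0));
	end_move();
	printf("after move:  %d\n", maybe_stolen(0));
	return 0;
}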
1056static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1390static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1057{ 1391{
@@ -1092,13 +1426,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1092 return false; 1426 return false;
1093} 1427}
1094 1428
1095static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1096{
1097 int *val = data;
1098 (*val)++;
1099 return 0;
1100}
1101
1102/** 1429/**
1103 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1430 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1104 * @memcg: The memory cgroup that went over limit 1431 * @memcg: The memory cgroup that went over limit
@@ -1173,7 +1500,10 @@ done:
1173static int mem_cgroup_count_children(struct mem_cgroup *mem) 1500static int mem_cgroup_count_children(struct mem_cgroup *mem)
1174{ 1501{
1175 int num = 0; 1502 int num = 0;
1176 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); 1503 struct mem_cgroup *iter;
1504
1505 for_each_mem_cgroup_tree(iter, mem)
1506 num++;
1177 return num; 1507 return num;
1178} 1508}
1179 1509
@@ -1185,8 +1515,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1185 u64 limit; 1515 u64 limit;
1186 u64 memsw; 1516 u64 memsw;
1187 1517
1188 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + 1518 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1189 total_swap_pages; 1519 limit += total_swap_pages << PAGE_SHIFT;
1520
1190 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1521 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1191 /* 1522 /*
1192 * If memsw is finite and limits the amount of swap space available 1523 * If memsw is finite and limits the amount of swap space available
@@ -1222,18 +1553,153 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1222 1553
1223 rcu_read_unlock(); 1554 rcu_read_unlock();
1224 /* Updates scanning parameter */ 1555 /* Updates scanning parameter */
1225 spin_lock(&root_mem->reclaim_param_lock);
1226 if (!css) { 1556 if (!css) {
1227 /* this means start scan from ID:1 */ 1557 /* this means start scan from ID:1 */
1228 root_mem->last_scanned_child = 0; 1558 root_mem->last_scanned_child = 0;
1229 } else 1559 } else
1230 root_mem->last_scanned_child = found; 1560 root_mem->last_scanned_child = found;
1231 spin_unlock(&root_mem->reclaim_param_lock);
1232 } 1561 }
1233 1562
1234 return ret; 1563 return ret;
1235} 1564}
1236 1565
1566/**
1567 * test_mem_cgroup_node_reclaimable
1568 * @mem: the target memcg
1569 * @nid: the node ID to be checked.
1570 * @noswap: specify true here if the caller wants file-only information.
1571 *
1572 * This function returns whether the specified memcg contains any
1573 * reclaimable pages on a node. Returns true if there are any reclaimable
1574 * pages in the node.
1575 */
1576static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1577 int nid, bool noswap)
1578{
1579 if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
1580 return true;
1581 if (noswap || !total_swap_pages)
1582 return false;
1583 if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
1584 return true;
1585 return false;
1586
1587}
1588#if MAX_NUMNODES > 1
1589
1590/*
1591 * Always updating the nodemask is not very good - even if we have an empty
1592 * list or the wrong list here, we can start from some node and traverse all
1593 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1594 *
1595 */
1596static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
1597{
1598 int nid;
1599 /*
1600 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1601 * pagein/pageout changes since the last update.
1602 */
1603 if (!atomic_read(&mem->numainfo_events))
1604 return;
1605 if (atomic_inc_return(&mem->numainfo_updating) > 1)
1606 return;
1607
1608 /* make a nodemask where this memcg uses memory from */
1609 mem->scan_nodes = node_states[N_HIGH_MEMORY];
1610
1611 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1612
1613 if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
1614 node_clear(nid, mem->scan_nodes);
1615 }
1616
1617 atomic_set(&mem->numainfo_events, 0);
1618 atomic_set(&mem->numainfo_updating, 0);
1619}
1620
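A rough model of the lazy rebuild gate: do nothing unless enough events have accumulated, let only the first caller through the atomic "updating" flag, and clear nodes with nothing reclaimable from the mask. The C11 atomics and node_reclaimable() stub are simplified stand-ins for the kernel primitives.

/* Sketch of the lazy nodemask refresh gate using C11 atomics (illustrative only). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

static atomic_int numainfo_events;	/* bumped after enough pagein/pageout events */
static atomic_int numainfo_updating;
static bool scan_node[NR_NODES];

static bool node_reclaimable(int nid)
{
	return nid != 2;		/* pretend node 2 has nothing to reclaim */
}

static void may_update_nodemask(void)
{
	if (!atomic_load(&numainfo_events))
		return;			/* nothing changed since the last rebuild */
	if (atomic_fetch_add(&numainfo_updating, 1) > 0)
		return;			/* someone else is already rebuilding */

	for (int nid = 0; nid < NR_NODES; nid++)
		scan_node[nid] = node_reclaimable(nid);

	atomic_store(&numainfo_events, 0);
	atomic_store(&numainfo_updating, 0);
}

int main(void)
{
	atomic_store(&numainfo_events, 1);
	may_update_nodemask();
	for (int nid = 0; nid < NR_NODES; nid++)
		printf("node %d: %s\n", nid, scan_node[nid] ? "scan" : "skip");
	return 0;
}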
1621/*
1622 * Selecting a node to start reclaim from. Because we only need to reduce the
1623 * usage counter, starting anywhere is OK. Reclaiming memory from the current
1624 * node has both pros and cons.
1625 *
1626 * Freeing memory from the current node means freeing memory from a node which
1627 * we'll use or have used. So, it may make the LRU bad. And if several threads
1628 * hit their limits, they will contend on one node. But freeing from a remote
1629 * node means more costs for memory reclaim because of memory latency.
1630 *
1631 * Now, we use round-robin. Better algorithms are welcome.
1632 */
1633int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1634{
1635 int node;
1636
1637 mem_cgroup_may_update_nodemask(mem);
1638 node = mem->last_scanned_node;
1639
1640 node = next_node(node, mem->scan_nodes);
1641 if (node == MAX_NUMNODES)
1642 node = first_node(mem->scan_nodes);
1643 /*
1644 * We call this when we hit the limit, not when pages are added to the LRU.
1645 * No LRU may hold pages because all pages are UNEVICTABLE, or
1646 * the memcg is too small and all pages are not on the LRU. In that case,
1647 * we use the current node.
1648 */
1649 if (unlikely(node == MAX_NUMNODES))
1650 node = numa_node_id();
1651
1652 mem->last_scanned_node = node;
1653 return node;
1654}
1655
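A minimal sketch of the round-robin step: advance to the next set node after the last one scanned, wrap to the first set node, and fall back to the local node when the mask turns out to be empty. The bitmask and helpers below are simplified stand-ins for nodemask_t, next_node() and first_node().

/* Round-robin selection over a node mask, falling back to the local node. */
#include <stdio.h>

#define MAX_NODES 8

static int next_node_in(int prev, unsigned mask)
{
	for (int nid = prev + 1; nid < MAX_NODES; nid++)
		if (mask & (1u << nid))
			return nid;
	return MAX_NODES;		/* ran off the end of the mask */
}

static int first_node_in(unsigned mask)
{
	return next_node_in(-1, mask);
}

static int select_victim_node(int *last_scanned, unsigned scan_mask, int local_node)
{
	int node = next_node_in(*last_scanned, scan_mask);

	if (node == MAX_NODES)
		node = first_node_in(scan_mask);
	if (node == MAX_NODES)		/* empty mask: everything unevictable, etc. */
		node = local_node;
	*last_scanned = node;
	return node;
}

int main(void)
{
	unsigned mask = (1u << 0) | (1u << 3);	/* nodes 0 and 3 have reclaimable pages */
	int last = MAX_NODES;

	for (int i = 0; i < 4; i++)
		printf("victim: %d\n", select_victim_node(&last, mask, 0));
	return 0;
}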
1656/*
1657 * Check all nodes for reclaimable pages.
1658 * For a quick scan, we make use of scan_nodes. This allows us to skip
1659 * unused nodes. But scan_nodes is lazily updated and may not contain
1660 * enough new information. We need to double check.
1661 */
1662bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1663{
1664 int nid;
1665
1666 /*
1667 * quick check...making use of scan_node.
1668 * We can skip unused nodes.
1669 */
1670 if (!nodes_empty(mem->scan_nodes)) {
1671 for (nid = first_node(mem->scan_nodes);
1672 nid < MAX_NUMNODES;
1673 nid = next_node(nid, mem->scan_nodes)) {
1674
1675 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1676 return true;
1677 }
1678 }
1679 /*
1680 * Check rest of nodes.
1681 */
1682 for_each_node_state(nid, N_HIGH_MEMORY) {
1683 if (node_isset(nid, mem->scan_nodes))
1684 continue;
1685 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1686 return true;
1687 }
1688 return false;
1689}
1690
1691#else
1692int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1693{
1694 return 0;
1695}
1696
1697bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1698{
1699 return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
1700}
1701#endif
1702
1237/* 1703/*
1238 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1704 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1239 * we reclaimed from, so that we don't end up penalizing one child extensively 1705 * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1249,7 +1715,8 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1249static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1715static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1250 struct zone *zone, 1716 struct zone *zone,
1251 gfp_t gfp_mask, 1717 gfp_t gfp_mask,
1252 unsigned long reclaim_options) 1718 unsigned long reclaim_options,
1719 unsigned long *total_scanned)
1253{ 1720{
1254 struct mem_cgroup *victim; 1721 struct mem_cgroup *victim;
1255 int ret, total = 0; 1722 int ret, total = 0;
@@ -1257,18 +1724,27 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1257 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1724 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1258 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1725 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1259 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1726 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1260 unsigned long excess = mem_cgroup_get_excess(root_mem); 1727 unsigned long excess;
1728 unsigned long nr_scanned;
1729
1730 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1261 1731
1262 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1732 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1263 if (root_mem->memsw_is_minimum) 1733 if (!check_soft && root_mem->memsw_is_minimum)
1264 noswap = true; 1734 noswap = true;
1265 1735
1266 while (1) { 1736 while (1) {
1267 victim = mem_cgroup_select_victim(root_mem); 1737 victim = mem_cgroup_select_victim(root_mem);
1268 if (victim == root_mem) { 1738 if (victim == root_mem) {
1269 loop++; 1739 loop++;
1270 if (loop >= 1) 1740 /*
1271 drain_all_stock_async(); 1741 * We are not draining per cpu cached charges during
1742 * soft limit reclaim because global reclaim doesn't
1743 * care about charges. It tries to free some memory and
1744 * charges will not give any.
1745 */
1746 if (!check_soft && loop >= 1)
1747 drain_all_stock_async(root_mem);
1272 if (loop >= 2) { 1748 if (loop >= 2) {
1273 /* 1749 /*
1274 * If we have not been able to reclaim 1750 * If we have not been able to reclaim
@@ -1280,7 +1756,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1280 break; 1756 break;
1281 } 1757 }
1282 /* 1758 /*
1283 * We want to do more targetted reclaim. 1759 * We want to do more targeted reclaim.
1284 * excess >> 2 is not too excessive so as to 1760 * excess >> 2 is not too excessive so as to
1285 * reclaim too much, nor too little so that we keep 1761 * reclaim too much, nor too little so that we keep
1286 * coming back to reclaim from this cgroup 1762 * coming back to reclaim from this cgroup
@@ -1292,16 +1768,18 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1292 } 1768 }
1293 } 1769 }
1294 } 1770 }
1295 if (!mem_cgroup_local_usage(victim)) { 1771 if (!mem_cgroup_reclaimable(victim, noswap)) {
1296 /* this cgroup's local usage == 0 */ 1772 /* this cgroup's local usage == 0 */
1297 css_put(&victim->css); 1773 css_put(&victim->css);
1298 continue; 1774 continue;
1299 } 1775 }
1300 /* we use swappiness of local cgroup */ 1776 /* we use swappiness of local cgroup */
1301 if (check_soft) 1777 if (check_soft) {
1302 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1778 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1303 noswap, get_swappiness(victim), zone); 1779 noswap, get_swappiness(victim), zone,
1304 else 1780 &nr_scanned);
1781 *total_scanned += nr_scanned;
1782 } else
1305 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1783 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1306 noswap, get_swappiness(victim)); 1784 noswap, get_swappiness(victim));
1307 css_put(&victim->css); 1785 css_put(&victim->css);
@@ -1314,57 +1792,47 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1314 return ret; 1792 return ret;
1315 total += ret; 1793 total += ret;
1316 if (check_soft) { 1794 if (check_soft) {
1317 if (res_counter_check_under_soft_limit(&root_mem->res)) 1795 if (!res_counter_soft_limit_excess(&root_mem->res))
1318 return total; 1796 return total;
1319 } else if (mem_cgroup_check_under_limit(root_mem)) 1797 } else if (mem_cgroup_margin(root_mem))
1320 return 1 + total; 1798 return total;
1321 } 1799 }
1322 return total; 1800 return total;
1323} 1801}
1324 1802
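A stripped-down model of the reclaim loop's control flow: walk victims round robin, skip those with nothing reclaimable, accumulate freed pages, and stop once the root has margin again or after enough futile passes. Every helper here is a stub standing in for the memcg machinery (victim selection, reclaimability checks, the actual shrinkers), so this only mirrors the shape of the loop, not its behaviour.

/* Control-flow sketch of hierarchical reclaim (stubs, not the kernel routines). */
#include <stdio.h>

#define NR_GROUPS 3
#define MAX_LOOPS 2

static unsigned long usage[NR_GROUPS] = { 50, 0, 30 };	/* pages charged per group */
static const unsigned long limit = 60;			/* shared (root) limit */

static unsigned long total_usage(void)
{
	unsigned long sum = 0;

	for (int i = 0; i < NR_GROUPS; i++)
		sum += usage[i];
	return sum;
}

static unsigned long margin(void)
{
	unsigned long u = total_usage();

	return u < limit ? limit - u : 0;
}

static unsigned long reclaim_from(int victim)
{
	unsigned long freed = usage[victim] < 10 ? usage[victim] : 10;

	usage[victim] -= freed;
	return freed;
}

int main(void)
{
	unsigned long total = 0;
	int victim = -1, loops = 0;

	while (!margin() && loops < MAX_LOOPS * NR_GROUPS) {
		victim = (victim + 1) % NR_GROUPS;	/* round robin over the tree */
		loops++;
		if (!usage[victim])
			continue;			/* nothing reclaimable here */
		total += reclaim_from(victim);
	}
	printf("reclaimed %lu pages, margin now %lu\n", total, margin());
	return 0;
}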
1325static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1326{
1327 int *val = (int *)data;
1328 int x;
1329 /*
1330 * Logically, we can stop scanning immediately when we find
1331 * a memcg is already locked. But condidering unlock ops and
1332 * creation/removal of memcg, scan-all is simple operation.
1333 */
1334 x = atomic_inc_return(&mem->oom_lock);
1335 *val = max(x, *val);
1336 return 0;
1337}
1338/* 1803/*
1339 * Check OOM-Killer is already running under our hierarchy. 1804 * Check OOM-Killer is already running under our hierarchy.
1340 * If someone is running, return false. 1805 * If someone is running, return false.
1341 */ 1806 */
1342static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1807static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1343{ 1808{
1344 int lock_count = 0; 1809 int x, lock_count = 0;
1810 struct mem_cgroup *iter;
1345 1811
1346 mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); 1812 for_each_mem_cgroup_tree(iter, mem) {
1813 x = atomic_inc_return(&iter->oom_lock);
1814 lock_count = max(x, lock_count);
1815 }
1347 1816
1348 if (lock_count == 1) 1817 if (lock_count == 1)
1349 return true; 1818 return true;
1350 return false; 1819 return false;
1351} 1820}
1352 1821
1353static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) 1822static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1354{ 1823{
1824 struct mem_cgroup *iter;
1825
1355 /* 1826 /*
1356 * When a new child is created while the hierarchy is under oom, 1827 * When a new child is created while the hierarchy is under oom,
1357 * mem_cgroup_oom_lock() may not be called. We have to use 1828 * mem_cgroup_oom_lock() may not be called. We have to use
1358 * atomic_add_unless() here. 1829 * atomic_add_unless() here.
1359 */ 1830 */
1360 atomic_add_unless(&mem->oom_lock, -1, 0); 1831 for_each_mem_cgroup_tree(iter, mem)
1832 atomic_add_unless(&iter->oom_lock, -1, 0);
1361 return 0; 1833 return 0;
1362} 1834}
1363 1835
1364static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1365{
1366 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1367}
1368 1836
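The lock is taken by incrementing every memcg's oom_lock in the subtree; it only succeeds when the largest resulting value is 1, i.e. nobody in the hierarchy already held it. A small model with an array in place of the cgroup tree and plain increments in place of the atomics.

/* Model of the hierarchical OOM lock: succeed only if no member was locked. */
#include <stdbool.h>
#include <stdio.h>

#define NR_MEMBERS 3

static int oom_lock[NR_MEMBERS];	/* one counter per memcg in the subtree */

static bool oom_trylock(void)
{
	int lock_count = 0;

	for (int i = 0; i < NR_MEMBERS; i++) {
		int x = ++oom_lock[i];		/* atomic_inc_return() in the kernel */
		if (x > lock_count)
			lock_count = x;
	}
	return lock_count == 1;			/* someone else holds it if > 1 */
}

static void oom_unlock(void)
{
	for (int i = 0; i < NR_MEMBERS; i++)
		if (oom_lock[i] > 0)		/* mirrors atomic_add_unless(.., -1, 0) */
			oom_lock[i]--;
}

int main(void)
{
	printf("first lock:  %s\n", oom_trylock() ? "got it" : "busy");
	printf("second lock: %s\n", oom_trylock() ? "got it" : "busy");
	oom_unlock();
	oom_unlock();
	printf("after unlocks: %s\n", oom_trylock() ? "got it" : "busy");
	return 0;
}

Scanning the whole subtree even when one member is already locked looks wasteful, but as the original comment notes it keeps the unlock path trivially symmetric.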
1369static DEFINE_MUTEX(memcg_oom_mutex); 1837static DEFINE_MUTEX(memcg_oom_mutex);
1370static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1838static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1462,51 +1930,91 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1462/* 1930/*
1463 * Currently used to update mapped file statistics, but the routine can be 1931 * Currently used to update mapped file statistics, but the routine can be
1464 * generalized to update other statistics as well. 1932 * generalized to update other statistics as well.
1933 *
1934 * Notes: Race condition
1935 *
1936 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
1937 * it tends to be costly. Under some conditions, we don't need
1938 * to do so _always_.
1939 *
1940 * Considering "charge", lock_page_cgroup() is not required because all
1941 * file-stat operations happen after a page is attached to the radix-tree. There
1942 * is no race with "charge".
1943 *
1944 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
1945 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even
1946 * if there is a race with "uncharge". The statistics themselves are properly
1947 * handled by flags.
1948 *
1949 * Considering "move", this is the only case where we see a race. To make the
1950 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect
1951 * the possibility of a race condition. If there is one, we take a lock.
1465 */ 1952 */
1466void mem_cgroup_update_file_mapped(struct page *page, int val) 1953
1954void mem_cgroup_update_page_stat(struct page *page,
1955 enum mem_cgroup_page_stat_item idx, int val)
1467{ 1956{
1468 struct mem_cgroup *mem; 1957 struct mem_cgroup *mem;
1469 struct page_cgroup *pc; 1958 struct page_cgroup *pc = lookup_page_cgroup(page);
1959 bool need_unlock = false;
1960 unsigned long uninitialized_var(flags);
1470 1961
1471 pc = lookup_page_cgroup(page);
1472 if (unlikely(!pc)) 1962 if (unlikely(!pc))
1473 return; 1963 return;
1474 1964
1475 lock_page_cgroup(pc); 1965 rcu_read_lock();
1476 mem = pc->mem_cgroup; 1966 mem = pc->mem_cgroup;
1477 if (!mem || !PageCgroupUsed(pc)) 1967 if (unlikely(!mem || !PageCgroupUsed(pc)))
1478 goto done; 1968 goto out;
1969 /* pc->mem_cgroup is unstable ? */
1970 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
1971 /* take a lock to safely access pc->mem_cgroup */
1972 move_lock_page_cgroup(pc, &flags);
1973 need_unlock = true;
1974 mem = pc->mem_cgroup;
1975 if (!mem || !PageCgroupUsed(pc))
1976 goto out;
1977 }
1479 1978
1480 /* 1979 switch (idx) {
1481 * Preemption is already disabled. We can use __this_cpu_xxx 1980 case MEMCG_NR_FILE_MAPPED:
1482 */ 1981 if (val > 0)
1483 if (val > 0) { 1982 SetPageCgroupFileMapped(pc);
1484 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1983 else if (!page_mapped(page))
1485 SetPageCgroupFileMapped(pc); 1984 ClearPageCgroupFileMapped(pc);
1486 } else { 1985 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1487 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1986 break;
1488 ClearPageCgroupFileMapped(pc); 1987 default:
1988 BUG();
1489 } 1989 }
1490 1990
1491done: 1991 this_cpu_add(mem->stat->count[idx], val);
1492 unlock_page_cgroup(pc); 1992
1993out:
1994 if (unlikely(need_unlock))
1995 move_unlock_page_cgroup(pc, &flags);
1996 rcu_read_unlock();
1997 return;
1493} 1998}
1999EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1494 2000
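The update path is optimistic: under rcu_read_lock() it uses pc->mem_cgroup directly, and only when a move might be in flight (or the page is huge) does it fall back to the per-page move lock and re-read the owner. A rough userspace model of that "fast path unless a mover is active" shape, with a global flag in place of the percpu ON_MOVE counter and a mutex in place of move_lock_page_cgroup(); build with -pthread.

/* Sketch of an optimistic stat update with a locked fallback (illustrative). */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct page_info {
	pthread_mutex_t move_lock;
	int owner;			/* stands in for pc->mem_cgroup */
	long file_mapped;		/* the statistic being updated */
};

static bool move_in_flight;		/* stands in for MEM_CGROUP_ON_MOVE > 0 */

static void update_file_mapped(struct page_info *pi, int val)
{
	bool locked = false;

	if (move_in_flight) {		/* owner may be rewritten: take the lock */
		pthread_mutex_lock(&pi->move_lock);
		locked = true;
	}
	if (pi->owner >= 0)		/* re-checked under the lock when needed */
		pi->file_mapped += val;
	if (locked)
		pthread_mutex_unlock(&pi->move_lock);
}

int main(void)
{
	struct page_info pi = { .owner = 1 };

	pthread_mutex_init(&pi.move_lock, NULL);
	update_file_mapped(&pi, 1);	/* fast path */
	move_in_flight = true;
	update_file_mapped(&pi, 1);	/* slow path under the move lock */
	printf("file_mapped = %ld\n", pi.file_mapped);
	return 0;
}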
1495/* 2001/*
1496 * size of first charge trial. "32" comes from vmscan.c's magic value. 2002 * size of first charge trial. "32" comes from vmscan.c's magic value.
1497 * TODO: maybe necessary to use big numbers in big irons. 2003 * TODO: maybe necessary to use big numbers in big irons.
1498 */ 2004 */
1499#define CHARGE_SIZE (32 * PAGE_SIZE) 2005#define CHARGE_BATCH 32U
1500struct memcg_stock_pcp { 2006struct memcg_stock_pcp {
1501 struct mem_cgroup *cached; /* this never be root cgroup */ 2007 struct mem_cgroup *cached; /* this never be root cgroup */
1502 int charge; 2008 unsigned int nr_pages;
1503 struct work_struct work; 2009 struct work_struct work;
2010 unsigned long flags;
2011#define FLUSHING_CACHED_CHARGE (0)
1504}; 2012};
1505static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2013static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1506static atomic_t memcg_drain_count; 2014static DEFINE_MUTEX(percpu_charge_mutex);
1507 2015
1508/* 2016/*
1509 * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed 2017 * Try to consume stocked charge on this cpu. If success, one page is consumed
1510 * from local stock and true is returned. If the stock is 0 or charges from a 2018 * from local stock and true is returned. If the stock is 0 or charges from a
1511 * cgroup which is not current target, returns false. This stock will be 2019 * cgroup which is not current target, returns false. This stock will be
1512 * refilled. 2020 * refilled.
@@ -1517,8 +2025,8 @@ static bool consume_stock(struct mem_cgroup *mem)
1517 bool ret = true; 2025 bool ret = true;
1518 2026
1519 stock = &get_cpu_var(memcg_stock); 2027 stock = &get_cpu_var(memcg_stock);
1520 if (mem == stock->cached && stock->charge) 2028 if (mem == stock->cached && stock->nr_pages)
1521 stock->charge -= PAGE_SIZE; 2029 stock->nr_pages--;
1522 else /* need to call res_counter_charge */ 2030 else /* need to call res_counter_charge */
1523 ret = false; 2031 ret = false;
1524 put_cpu_var(memcg_stock); 2032 put_cpu_var(memcg_stock);
@@ -1532,13 +2040,15 @@ static void drain_stock(struct memcg_stock_pcp *stock)
1532{ 2040{
1533 struct mem_cgroup *old = stock->cached; 2041 struct mem_cgroup *old = stock->cached;
1534 2042
1535 if (stock->charge) { 2043 if (stock->nr_pages) {
1536 res_counter_uncharge(&old->res, stock->charge); 2044 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2045
2046 res_counter_uncharge(&old->res, bytes);
1537 if (do_swap_account) 2047 if (do_swap_account)
1538 res_counter_uncharge(&old->memsw, stock->charge); 2048 res_counter_uncharge(&old->memsw, bytes);
2049 stock->nr_pages = 0;
1539 } 2050 }
1540 stock->cached = NULL; 2051 stock->cached = NULL;
1541 stock->charge = 0;
1542} 2052}
1543 2053
1544/* 2054/*
@@ -1549,13 +2059,14 @@ static void drain_local_stock(struct work_struct *dummy)
1549{ 2059{
1550 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2060 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1551 drain_stock(stock); 2061 drain_stock(stock);
2062 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1552} 2063}
1553 2064
1554/* 2065/*
1555 * Cache charges(val) which is from res_counter, to local per_cpu area. 2066 * Cache charges(val) which is from res_counter, to local per_cpu area.
1556 * This will be consumed by consume_stock() function, later. 2067 * This will be consumed by consume_stock() function, later.
1557 */ 2068 */
1558static void refill_stock(struct mem_cgroup *mem, int val) 2069static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
1559{ 2070{
1560 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2071 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1561 2072
@@ -1563,7 +2074,7 @@ static void refill_stock(struct mem_cgroup *mem, int val)
1563 drain_stock(stock); 2074 drain_stock(stock);
1564 stock->cached = mem; 2075 stock->cached = mem;
1565 } 2076 }
1566 stock->charge += val; 2077 stock->nr_pages += nr_pages;
1567 put_cpu_var(memcg_stock); 2078 put_cpu_var(memcg_stock);
1568} 2079}
1569 2080
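The stock is a per-cpu cache of pre-charged pages: a single-page charge first tries to take one page from the local stock, and any surplus charged as part of a batch is parked back in it for later. A compact single-cpu model with plain variables in place of the percpu machinery and the res_counter; the numbers and the charge_one_page() helper are only for illustration.

/* Single-cpu model of the memcg charge stock (consume/refill), not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32U

static struct {
	int cached;			/* which group the stock belongs to */
	unsigned int nr_pages;		/* pre-charged pages available locally */
} stock = { .cached = -1 };

static unsigned long res_usage;		/* stands in for the res_counter */

static bool consume_stock(int group)
{
	if (group == stock.cached && stock.nr_pages) {
		stock.nr_pages--;
		return true;		/* charged without touching the counter */
	}
	return false;
}

static void refill_stock(int group, unsigned int nr_pages)
{
	if (stock.cached != group) {
		res_usage -= stock.nr_pages;	/* drain_stock(): return old charges */
		stock.nr_pages = 0;
		stock.cached = group;
	}
	stock.nr_pages += nr_pages;
}

static void charge_one_page(int group)
{
	if (consume_stock(group))
		return;
	res_usage += CHARGE_BATCH;		/* charge a whole batch at once */
	refill_stock(group, CHARGE_BATCH - 1);	/* keep the surplus for later */
}

int main(void)
{
	for (int i = 0; i < 40; i++)
		charge_one_page(0);
	printf("res_usage=%lu stock=%u\n", res_usage, stock.nr_pages);
	return 0;
}

Forty single-page charges end up as two counter updates of 32 pages each, with the unused 24 pages left in the stock; that amortisation is the whole point of the batching.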
@@ -1573,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, int val)
1573 * expects some charges will be back to res_counter later but cannot wait for 2084 * expects some charges will be back to res_counter later but cannot wait for
1574 * it. 2085 * it.
1575 */ 2086 */
1576static void drain_all_stock_async(void) 2087static void drain_all_stock_async(struct mem_cgroup *root_mem)
1577{ 2088{
1578 int cpu; 2089 int cpu, curcpu;
1579 /* This function is for scheduling "drain" in asynchronous way. 2090 /*
1580 * The result of "drain" is not directly handled by callers. Then, 2091 * If someone calls draining, avoid adding more kworker runs.
1581 * if someone is calling drain, we don't have to call drain more.
1582 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
1583 * there is a race. We just do loose check here.
1584 */ 2092 */
1585 if (atomic_read(&memcg_drain_count)) 2093 if (!mutex_trylock(&percpu_charge_mutex))
1586 return; 2094 return;
1587 /* Notify other cpus that system-wide "drain" is running */ 2095 /* Notify other cpus that system-wide "drain" is running */
1588 atomic_inc(&memcg_drain_count);
1589 get_online_cpus(); 2096 get_online_cpus();
2097 /*
2098 * Get a hint for avoiding draining charges on the current cpu,
2099 * which must be exhausted by our charging. It is not required that
2100 * this be a precise check, so we use raw_smp_processor_id() instead of
2101 * getcpu()/putcpu().
2102 */
2103 curcpu = raw_smp_processor_id();
1590 for_each_online_cpu(cpu) { 2104 for_each_online_cpu(cpu) {
1591 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2105 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1592 schedule_work_on(cpu, &stock->work); 2106 struct mem_cgroup *mem;
2107
2108 if (cpu == curcpu)
2109 continue;
2110
2111 mem = stock->cached;
2112 if (!mem)
2113 continue;
2114 if (mem != root_mem) {
2115 if (!root_mem->use_hierarchy)
2116 continue;
2117 /* check whether "mem" is under tree of "root_mem" */
2118 if (!css_is_ancestor(&mem->css, &root_mem->css))
2119 continue;
2120 }
2121 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2122 schedule_work_on(cpu, &stock->work);
1593 } 2123 }
1594 put_online_cpus(); 2124 put_online_cpus();
1595 atomic_dec(&memcg_drain_count); 2125 mutex_unlock(&percpu_charge_mutex);
1596 /* We don't wait for flush_work */ 2126 /* We don't wait for flush_work */
1597} 2127}
1598 2128
@@ -1600,20 +2130,66 @@ static void drain_all_stock_async(void)
1600static void drain_all_stock_sync(void) 2130static void drain_all_stock_sync(void)
1601{ 2131{
1602 /* called when force_empty is called */ 2132 /* called when force_empty is called */
1603 atomic_inc(&memcg_drain_count); 2133 mutex_lock(&percpu_charge_mutex);
1604 schedule_on_each_cpu(drain_local_stock); 2134 schedule_on_each_cpu(drain_local_stock);
1605 atomic_dec(&memcg_drain_count); 2135 mutex_unlock(&percpu_charge_mutex);
1606} 2136}
1607 2137
1608static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, 2138/*
2139 * This function drains percpu counter value from DEAD cpu and
2140 * move it to local cpu. Note that this function can be preempted.
2141 */
2142static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
2143{
2144 int i;
2145
2146 spin_lock(&mem->pcp_counter_lock);
2147 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2148 long x = per_cpu(mem->stat->count[i], cpu);
2149
2150 per_cpu(mem->stat->count[i], cpu) = 0;
2151 mem->nocpu_base.count[i] += x;
2152 }
2153 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2154 unsigned long x = per_cpu(mem->stat->events[i], cpu);
2155
2156 per_cpu(mem->stat->events[i], cpu) = 0;
2157 mem->nocpu_base.events[i] += x;
2158 }
2159 /* need to clear ON_MOVE value, works as a kind of lock. */
2160 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2161 spin_unlock(&mem->pcp_counter_lock);
2162}
2163
2164static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
2165{
2166 int idx = MEM_CGROUP_ON_MOVE;
2167
2168 spin_lock(&mem->pcp_counter_lock);
2169 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
2170 spin_unlock(&mem->pcp_counter_lock);
2171}
2172
2173static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
1609 unsigned long action, 2174 unsigned long action,
1610 void *hcpu) 2175 void *hcpu)
1611{ 2176{
1612 int cpu = (unsigned long)hcpu; 2177 int cpu = (unsigned long)hcpu;
1613 struct memcg_stock_pcp *stock; 2178 struct memcg_stock_pcp *stock;
2179 struct mem_cgroup *iter;
1614 2180
1615 if (action != CPU_DEAD) 2181 if ((action == CPU_ONLINE)) {
2182 for_each_mem_cgroup_all(iter)
2183 synchronize_mem_cgroup_on_move(iter, cpu);
1616 return NOTIFY_OK; 2184 return NOTIFY_OK;
2185 }
2186
2187 if ((action != CPU_DEAD) && (action != CPU_DEAD_FROZEN))
2188 return NOTIFY_OK;
2189
2190 for_each_mem_cgroup_all(iter)
2191 mem_cgroup_drain_pcp_counter(iter, cpu);
2192
1617 stock = &per_cpu(memcg_stock, cpu); 2193 stock = &per_cpu(memcg_stock, cpu);
1618 drain_stock(stock); 2194 drain_stock(stock);
1619 return NOTIFY_OK; 2195 return NOTIFY_OK;
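When a cpu goes away its per-cpu statistics are folded into a "nocpu_base" accumulator so nothing is lost from the totals; a small model of that folding with a 2-D array in place of the percpu counters (the ON_MOVE hand-back on CPU_ONLINE is not modelled here).

/* Model of folding a dead cpu's counters into a fallback accumulator. */
#include <stdio.h>

#define NR_CPUS 2
#define NR_STATS 3

static long percpu[NR_CPUS][NR_STATS] = {
	{ 5, 7, 1 },
	{ 2, 3, 4 },
};
static long nocpu_base[NR_STATS];

static void drain_pcp_counter(int cpu)
{
	for (int i = 0; i < NR_STATS; i++) {
		nocpu_base[i] += percpu[cpu][i];
		percpu[cpu][i] = 0;	/* the dead cpu no longer contributes */
	}
}

static long read_stat(int i)
{
	long sum = nocpu_base[i];

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += percpu[cpu][i];
	return sum;
}

int main(void)
{
	printf("stat 0 before: %ld\n", read_stat(0));
	drain_pcp_counter(1);		/* cpu 1 went offline */
	printf("stat 0 after:  %ld\n", read_stat(0));	/* total is unchanged */
	return 0;
}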
@@ -1629,9 +2205,10 @@ enum {
1629 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2205 CHARGE_OOM_DIE, /* the current is killed because of OOM */
1630}; 2206};
1631 2207
1632static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 2208static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1633 int csize, bool oom_check) 2209 unsigned int nr_pages, bool oom_check)
1634{ 2210{
2211 unsigned long csize = nr_pages * PAGE_SIZE;
1635 struct mem_cgroup *mem_over_limit; 2212 struct mem_cgroup *mem_over_limit;
1636 struct res_counter *fail_res; 2213 struct res_counter *fail_res;
1637 unsigned long flags = 0; 2214 unsigned long flags = 0;
@@ -1646,27 +2223,38 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1646 if (likely(!ret)) 2223 if (likely(!ret))
1647 return CHARGE_OK; 2224 return CHARGE_OK;
1648 2225
2226 res_counter_uncharge(&mem->res, csize);
1649 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2227 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1650 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2228 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1651 } else 2229 } else
1652 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2230 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1653 2231 /*
1654 if (csize > PAGE_SIZE) /* change csize and retry */ 2232 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2233 * of regular pages (CHARGE_BATCH), or a single regular page (1).
2234 *
2235 * Never reclaim on behalf of optional batching, retry with a
2236 * single page instead.
2237 */
2238 if (nr_pages == CHARGE_BATCH)
1655 return CHARGE_RETRY; 2239 return CHARGE_RETRY;
1656 2240
1657 if (!(gfp_mask & __GFP_WAIT)) 2241 if (!(gfp_mask & __GFP_WAIT))
1658 return CHARGE_WOULDBLOCK; 2242 return CHARGE_WOULDBLOCK;
1659 2243
1660 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 2244 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1661 gfp_mask, flags); 2245 gfp_mask, flags, NULL);
2246 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2247 return CHARGE_RETRY;
1662 /* 2248 /*
1663 * try_to_free_mem_cgroup_pages() might not give us a full 2249 * Even though the limit is exceeded at this point, reclaim
1664 * picture of reclaim. Some pages are reclaimed and might be 2250 * may have been able to free some pages. Retry the charge
1665 * moved to swap cache or just unmapped from the cgroup. 2251 * before killing the task.
1666 * Check the limit again to see if the reclaim reduced the 2252 *
1667 * current usage of the cgroup before giving up 2253 * Only for regular pages, though: huge pages are rather
2254 * unlikely to succeed so close to the limit, and we fall back
2255 * to regular pages anyway in case of failure.
1668 */ 2256 */
1669 if (ret || mem_cgroup_check_under_limit(mem_over_limit)) 2257 if (nr_pages == 1 && ret)
1670 return CHARGE_RETRY; 2258 return CHARGE_RETRY;
1671 2259
1672 /* 2260 /*
@@ -1691,12 +2279,15 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1691 * oom-killer can be invoked. 2279 * oom-killer can be invoked.
1692 */ 2280 */
1693static int __mem_cgroup_try_charge(struct mm_struct *mm, 2281static int __mem_cgroup_try_charge(struct mm_struct *mm,
1694 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) 2282 gfp_t gfp_mask,
2283 unsigned int nr_pages,
2284 struct mem_cgroup **memcg,
2285 bool oom)
1695{ 2286{
2287 unsigned int batch = max(CHARGE_BATCH, nr_pages);
1696 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2288 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1697 struct mem_cgroup *mem = NULL; 2289 struct mem_cgroup *mem = NULL;
1698 int ret; 2290 int ret;
1699 int csize = CHARGE_SIZE;
1700 2291
1701 /* 2292 /*
1702 * Unlike the global VM's OOM-kill, we're not in memory shortage 2293 * Unlike the global VM's OOM-kill, we're not in memory shortage
@@ -1721,7 +2312,7 @@ again:
1721 VM_BUG_ON(css_is_removed(&mem->css)); 2312 VM_BUG_ON(css_is_removed(&mem->css));
1722 if (mem_cgroup_is_root(mem)) 2313 if (mem_cgroup_is_root(mem))
1723 goto done; 2314 goto done;
1724 if (consume_stock(mem)) 2315 if (nr_pages == 1 && consume_stock(mem))
1725 goto done; 2316 goto done;
1726 css_get(&mem->css); 2317 css_get(&mem->css);
1727 } else { 2318 } else {
@@ -1729,23 +2320,22 @@ again:
1729 2320
1730 rcu_read_lock(); 2321 rcu_read_lock();
1731 p = rcu_dereference(mm->owner); 2322 p = rcu_dereference(mm->owner);
1732 VM_BUG_ON(!p);
1733 /* 2323 /*
1734 * because we don't have task_lock(), "p" can exit while 2324 * Because we don't have task_lock(), "p" can exit.
1735 * we're here. In that case, "mem" can point to root 2325 * In that case, "mem" can point to root or p can be NULL with
1736 * cgroup but never be NULL. (and task_struct itself is freed 2326 * race with swapoff. Then, we have small risk of mis-accounting.
1737 * by RCU, cgroup itself is RCU safe.) Then, we have small 2327 * But such kind of mis-account by race always happens because
1738 * risk here to get wrong cgroup. But such kind of mis-account 2328 * we don't have cgroup_mutex(). It's overkill and we allow that
1739 * by race always happens because we don't have cgroup_mutex(). 2329 * small race, here.
1740 * It's overkill and we allow that small race, here. 2330 * (*) swapoff et al. will charge against mm-struct not against
2331 * task-struct. So, mm->owner can be NULL.
1741 */ 2332 */
1742 mem = mem_cgroup_from_task(p); 2333 mem = mem_cgroup_from_task(p);
1743 VM_BUG_ON(!mem); 2334 if (!mem || mem_cgroup_is_root(mem)) {
1744 if (mem_cgroup_is_root(mem)) {
1745 rcu_read_unlock(); 2335 rcu_read_unlock();
1746 goto done; 2336 goto done;
1747 } 2337 }
1748 if (consume_stock(mem)) { 2338 if (nr_pages == 1 && consume_stock(mem)) {
1749 /* 2339 /*
1750 * It seems dangerous to access memcg without css_get(). 2340 * It seems dangerous to access memcg without css_get().
1751 * But considering how consume_stock works, it's not 2341 * But considering how consume_stock works, it's not
@@ -1780,13 +2370,12 @@ again:
1780 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2370 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1781 } 2371 }
1782 2372
1783 ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); 2373 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
1784
1785 switch (ret) { 2374 switch (ret) {
1786 case CHARGE_OK: 2375 case CHARGE_OK:
1787 break; 2376 break;
1788 case CHARGE_RETRY: /* not in OOM situation but retry */ 2377 case CHARGE_RETRY: /* not in OOM situation but retry */
1789 csize = PAGE_SIZE; 2378 batch = nr_pages;
1790 css_put(&mem->css); 2379 css_put(&mem->css);
1791 mem = NULL; 2380 mem = NULL;
1792 goto again; 2381 goto again;
@@ -1807,8 +2396,8 @@ again:
1807 } 2396 }
1808 } while (ret != CHARGE_OK); 2397 } while (ret != CHARGE_OK);
1809 2398
1810 if (csize > PAGE_SIZE) 2399 if (batch > nr_pages)
1811 refill_stock(mem, csize - PAGE_SIZE); 2400 refill_stock(mem, batch - nr_pages);
1812 css_put(&mem->css); 2401 css_put(&mem->css);
1813done: 2402done:
1814 *memcg = mem; 2403 *memcg = mem;
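The batch size is simply the larger of the normal batch and the request, so a single-page fault charges 32 pages and parks 31 in the stock, while a request for a 512-page huge page (e.g. a 2 MiB page with 4 KiB base pages) charges exactly 512 and parks nothing. A two-line check of that arithmetic; the show() helper is just for illustration.

/* Batch sizing: charge max(CHARGE_BATCH, nr_pages), park the surplus in the stock. */
#include <stdio.h>

#define CHARGE_BATCH 32U

static void show(unsigned int nr_pages)
{
	unsigned int batch = nr_pages > CHARGE_BATCH ? nr_pages : CHARGE_BATCH;

	printf("request %3u -> charge %3u, refill stock with %3u\n",
	       nr_pages, batch, batch - nr_pages);
}

int main(void)
{
	show(1);	/* ordinary page fault */
	show(512);	/* e.g. a 2 MiB huge page with 4 KiB base pages */
	return 0;
}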
@@ -1827,20 +2416,17 @@ bypass:
1827 * gotten by try_charge(). 2416 * gotten by try_charge().
1828 */ 2417 */
1829static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2418static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1830 unsigned long count) 2419 unsigned int nr_pages)
1831{ 2420{
1832 if (!mem_cgroup_is_root(mem)) { 2421 if (!mem_cgroup_is_root(mem)) {
1833 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 2422 unsigned long bytes = nr_pages * PAGE_SIZE;
2423
2424 res_counter_uncharge(&mem->res, bytes);
1834 if (do_swap_account) 2425 if (do_swap_account)
1835 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); 2426 res_counter_uncharge(&mem->memsw, bytes);
1836 } 2427 }
1837} 2428}
1838 2429
1839static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1840{
1841 __mem_cgroup_cancel_charge(mem, 1);
1842}
1843
1844/* 2430/*
1845 * A helper function to get mem_cgroup from ID. must be called under 2431 * A helper function to get mem_cgroup from ID. must be called under
1846 * rcu_read_lock(). The caller must check css_is_removed() or some if 2432 * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1888,26 +2474,22 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1888 return mem; 2474 return mem;
1889} 2475}
1890 2476
1891/*
1892 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
1893 * USED state. If already USED, uncharge and return.
1894 */
1895
1896static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2477static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1897 struct page_cgroup *pc, 2478 struct page *page,
1898 enum charge_type ctype) 2479 unsigned int nr_pages,
2480 struct page_cgroup *pc,
2481 enum charge_type ctype)
1899{ 2482{
1900 /* try_charge() can return NULL to *memcg, taking care of it. */
1901 if (!mem)
1902 return;
1903
1904 lock_page_cgroup(pc); 2483 lock_page_cgroup(pc);
1905 if (unlikely(PageCgroupUsed(pc))) { 2484 if (unlikely(PageCgroupUsed(pc))) {
1906 unlock_page_cgroup(pc); 2485 unlock_page_cgroup(pc);
1907 mem_cgroup_cancel_charge(mem); 2486 __mem_cgroup_cancel_charge(mem, nr_pages);
1908 return; 2487 return;
1909 } 2488 }
1910 2489 /*
2490 * we don't need lock_page_cgroup() on tail pages, because they are not
2491 * accessed by any other context at this point.
2492 */
1911 pc->mem_cgroup = mem; 2493 pc->mem_cgroup = mem;
1912 /* 2494 /*
1913 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2495 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -1931,19 +2513,62 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1931 break; 2513 break;
1932 } 2514 }
1933 2515
1934 mem_cgroup_charge_statistics(mem, pc, true); 2516 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
1935
1936 unlock_page_cgroup(pc); 2517 unlock_page_cgroup(pc);
1937 /* 2518 /*
1938 * "charge_statistics" updated event counter. Then, check it. 2519 * "charge_statistics" updated event counter. Then, check it.
1939 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2520 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1940 * if they exceeds softlimit. 2521 * if they exceeds softlimit.
1941 */ 2522 */
1942 memcg_check_events(mem, pc->page); 2523 memcg_check_events(mem, page);
2524}
2525
2526#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2527
2528#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2529 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2530/*
2531 * Because tail pages are not marked as "used", set them here. We're under
2532 * zone->lru_lock, 'splitting on pmd' and compound_lock.
2533 */
2534void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2535{
2536 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2537 struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2538 unsigned long flags;
2539
2540 if (mem_cgroup_disabled())
2541 return;
2542 /*
2543 * We have no races with charge/uncharge but will have races with
2544 * page state accounting.
2545 */
2546 move_lock_page_cgroup(head_pc, &flags);
2547
2548 tail_pc->mem_cgroup = head_pc->mem_cgroup;
2549 smp_wmb(); /* see __commit_charge() */
2550 if (PageCgroupAcctLRU(head_pc)) {
2551 enum lru_list lru;
2552 struct mem_cgroup_per_zone *mz;
2553
2554 /*
2555 * LRU flags cannot be copied because we need to add the tail
2556 * page to the LRU by a generic call, and our hook will be called.
2557 * We hold lru_lock, so reduce the counter directly.
2558 */
2559 lru = page_lru(head);
2560 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2561 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2562 }
2563 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2564 move_unlock_page_cgroup(head_pc, &flags);
1943} 2565}
2566#endif
1944 2567
1945/** 2568/**
1946 * __mem_cgroup_move_account - move account of the page 2569 * mem_cgroup_move_account - move account of the page
2570 * @page: the page
2571 * @nr_pages: number of regular pages (>1 for huge pages)
1947 * @pc: page_cgroup of the page. 2572 * @pc: page_cgroup of the page.
1948 * @from: mem_cgroup which the page is moved from. 2573 * @from: mem_cgroup which the page is moved from.
1949 * @to: mem_cgroup which the page is moved to. @from != @to. 2574 * @to: mem_cgroup which the page is moved to. @from != @to.
@@ -1951,22 +2576,42 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1951 * 2576 *
1952 * The caller must confirm following. 2577 * The caller must confirm following.
1953 * - page is not on LRU (isolate_page() is useful.) 2578 * - page is not on LRU (isolate_page() is useful.)
1954 * - the pc is locked, used, and ->mem_cgroup points to @from. 2579 * - compound_lock is held when nr_pages > 1
1955 * 2580 *
1956 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2581 * This function doesn't do "charge" nor css_get to new cgroup. It should be
1957 * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is 2582 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is
1958 * true, this function does "uncharge" from old cgroup, but it doesn't if 2583 * true, this function does "uncharge" from old cgroup, but it doesn't if
1959 * @uncharge is false, so a caller should do "uncharge". 2584 * @uncharge is false, so a caller should do "uncharge".
1960 */ 2585 */
1961 2586static int mem_cgroup_move_account(struct page *page,
1962static void __mem_cgroup_move_account(struct page_cgroup *pc, 2587 unsigned int nr_pages,
1963 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 2588 struct page_cgroup *pc,
2589 struct mem_cgroup *from,
2590 struct mem_cgroup *to,
2591 bool uncharge)
1964{ 2592{
2593 unsigned long flags;
2594 int ret;
2595
1965 VM_BUG_ON(from == to); 2596 VM_BUG_ON(from == to);
1966 VM_BUG_ON(PageLRU(pc->page)); 2597 VM_BUG_ON(PageLRU(page));
1967 VM_BUG_ON(!PageCgroupLocked(pc)); 2598 /*
1968 VM_BUG_ON(!PageCgroupUsed(pc)); 2599 * The page is isolated from LRU. So, collapse function
1969 VM_BUG_ON(pc->mem_cgroup != from); 2600 * will not handle this page. But page splitting can happen.
2601 * Do this check under compound_page_lock(). The caller should
2602 * hold it.
2603 */
2604 ret = -EBUSY;
2605 if (nr_pages > 1 && !PageTransHuge(page))
2606 goto out;
2607
2608 lock_page_cgroup(pc);
2609
2610 ret = -EINVAL;
2611 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2612 goto unlock;
2613
2614 move_lock_page_cgroup(pc, &flags);
1970 2615
1971 if (PageCgroupFileMapped(pc)) { 2616 if (PageCgroupFileMapped(pc)) {
1972 /* Update mapped_file data for mem_cgroup */ 2617 /* Update mapped_file data for mem_cgroup */
@@ -1975,42 +2620,31 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
1975 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2620 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1976 preempt_enable(); 2621 preempt_enable();
1977 } 2622 }
1978 mem_cgroup_charge_statistics(from, pc, false); 2623 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
1979 if (uncharge) 2624 if (uncharge)
1980 /* This is not "cancel", but cancel_charge does all we need. */ 2625 /* This is not "cancel", but cancel_charge does all we need. */
1981 mem_cgroup_cancel_charge(from); 2626 __mem_cgroup_cancel_charge(from, nr_pages);
1982 2627
1983 /* caller should have done css_get */ 2628 /* caller should have done css_get */
1984 pc->mem_cgroup = to; 2629 pc->mem_cgroup = to;
1985 mem_cgroup_charge_statistics(to, pc, true); 2630 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
1986 /* 2631 /*
1987 * We charge against "to" which may not have any tasks. Then, "to" 2632 * We charge against "to" which may not have any tasks. Then, "to"
1988 * can be under rmdir(). But in current implementation, caller of 2633 * can be under rmdir(). But in current implementation, caller of
1989 * this function is just force_empty() and move charge, so it's 2634 * this function is just force_empty() and move charge, so it's
1990 * garanteed that "to" is never removed. So, we don't check rmdir 2635 * guaranteed that "to" is never removed. So, we don't check rmdir
1991 * status here. 2636 * status here.
1992 */ 2637 */
1993} 2638 move_unlock_page_cgroup(pc, &flags);
1994 2639 ret = 0;
1995/* 2640unlock:
1996 * check whether the @pc is valid for moving account and call
1997 * __mem_cgroup_move_account()
1998 */
1999static int mem_cgroup_move_account(struct page_cgroup *pc,
2000 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
2001{
2002 int ret = -EINVAL;
2003 lock_page_cgroup(pc);
2004 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
2005 __mem_cgroup_move_account(pc, from, to, uncharge);
2006 ret = 0;
2007 }
2008 unlock_page_cgroup(pc); 2641 unlock_page_cgroup(pc);
2009 /* 2642 /*
2010 * check events 2643 * check events
2011 */ 2644 */
2012 memcg_check_events(to, pc->page); 2645 memcg_check_events(to, page);
2013 memcg_check_events(from, pc->page); 2646 memcg_check_events(from, page);
2647out:
2014 return ret; 2648 return ret;
2015} 2649}
2016 2650
@@ -2018,14 +2652,16 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
2018 * move charges to its parent. 2652 * move charges to its parent.
2019 */ 2653 */
2020 2654
2021static int mem_cgroup_move_parent(struct page_cgroup *pc, 2655static int mem_cgroup_move_parent(struct page *page,
2656 struct page_cgroup *pc,
2022 struct mem_cgroup *child, 2657 struct mem_cgroup *child,
2023 gfp_t gfp_mask) 2658 gfp_t gfp_mask)
2024{ 2659{
2025 struct page *page = pc->page;
2026 struct cgroup *cg = child->css.cgroup; 2660 struct cgroup *cg = child->css.cgroup;
2027 struct cgroup *pcg = cg->parent; 2661 struct cgroup *pcg = cg->parent;
2028 struct mem_cgroup *parent; 2662 struct mem_cgroup *parent;
2663 unsigned int nr_pages;
2664 unsigned long uninitialized_var(flags);
2029 int ret; 2665 int ret;
2030 2666
2031 /* Is ROOT ? */ 2667 /* Is ROOT ? */
@@ -2038,14 +2674,22 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2038 if (isolate_lru_page(page)) 2674 if (isolate_lru_page(page))
2039 goto put; 2675 goto put;
2040 2676
2677 nr_pages = hpage_nr_pages(page);
2678
2041 parent = mem_cgroup_from_cont(pcg); 2679 parent = mem_cgroup_from_cont(pcg);
2042 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 2680 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2043 if (ret || !parent) 2681 if (ret || !parent)
2044 goto put_back; 2682 goto put_back;
2045 2683
2046 ret = mem_cgroup_move_account(pc, child, parent, true); 2684 if (nr_pages > 1)
2685 flags = compound_lock_irqsave(page);
2686
2687 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2047 if (ret) 2688 if (ret)
2048 mem_cgroup_cancel_charge(parent); 2689 __mem_cgroup_cancel_charge(parent, nr_pages);
2690
2691 if (nr_pages > 1)
2692 compound_unlock_irqrestore(page, flags);
2049put_back: 2693put_back:
2050 putback_lru_page(page); 2694 putback_lru_page(page);
2051put: 2695put:
@@ -2064,20 +2708,29 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2064 gfp_t gfp_mask, enum charge_type ctype) 2708 gfp_t gfp_mask, enum charge_type ctype)
2065{ 2709{
2066 struct mem_cgroup *mem = NULL; 2710 struct mem_cgroup *mem = NULL;
2711 unsigned int nr_pages = 1;
2067 struct page_cgroup *pc; 2712 struct page_cgroup *pc;
2713 bool oom = true;
2068 int ret; 2714 int ret;
2069 2715
2716 if (PageTransHuge(page)) {
2717 nr_pages <<= compound_order(page);
2718 VM_BUG_ON(!PageTransHuge(page));
2719 /*
2720 * Never OOM-kill a process for a huge page. The
2721 * fault handler will fall back to regular pages.
2722 */
2723 oom = false;
2724 }
2725
2070 pc = lookup_page_cgroup(page); 2726 pc = lookup_page_cgroup(page);
2071 /* can happen at boot */ 2727 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2072 if (unlikely(!pc))
2073 return 0;
2074 prefetchw(pc);
2075 2728
2076 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 2729 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
2077 if (ret || !mem) 2730 if (ret || !mem)
2078 return ret; 2731 return ret;
2079 2732
2080 __mem_cgroup_commit_charge(mem, pc, ctype); 2733 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
2081 return 0; 2734 return 0;
2082} 2735}
2083 2736
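To make the nr_pages arithmetic in mem_cgroup_charge_common() concrete, here is a worked sketch under the usual x86 assumptions (4 KiB base pages, 2 MiB transparent huge pages); the helper name and numbers are illustrative only:

	/* Sketch: how many base pages one charge covers for a THP head page. */
	static unsigned int thp_charge_pages(struct page *page)
	{
		unsigned int nr_pages = 1;

		if (PageTransHuge(page))
			nr_pages <<= compound_order(page);	/* 1 << 9 == 512 */
		return nr_pages;	/* 512 * PAGE_SIZE == 2 MiB charged at once */
	}

Because oom is forced to false for the huge case, a failed 2 MiB charge does not OOM-kill anything; the fault handler simply retries with regular 4 KiB pages.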
@@ -2086,8 +2739,6 @@ int mem_cgroup_newpage_charge(struct page *page,
2086{ 2739{
2087 if (mem_cgroup_disabled()) 2740 if (mem_cgroup_disabled())
2088 return 0; 2741 return 0;
2089 if (PageCompound(page))
2090 return 0;
2091 /* 2742 /*
2092 * If already mapped, we don't have to account. 2743 * If already mapped, we don't have to account.
2093 * If page cache, page->mapping has address_space. 2744 * If page cache, page->mapping has address_space.
@@ -2107,9 +2758,26 @@ static void
2107__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2758__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2108 enum charge_type ctype); 2759 enum charge_type ctype);
2109 2760
2761static void
2762__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2763 enum charge_type ctype)
2764{
2765 struct page_cgroup *pc = lookup_page_cgroup(page);
2766 /*
2767 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
2768 * is already on LRU. It means the page may on some other page_cgroup's
2769 * LRU. Take care of it.
2770 */
2771 mem_cgroup_lru_del_before_commit(page);
2772 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2773 mem_cgroup_lru_add_after_commit(page);
2774 return;
2775}
2776
2110int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2777int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2111 gfp_t gfp_mask) 2778 gfp_t gfp_mask)
2112{ 2779{
2780 struct mem_cgroup *mem = NULL;
2113 int ret; 2781 int ret;
2114 2782
2115 if (mem_cgroup_disabled()) 2783 if (mem_cgroup_disabled())
@@ -2144,14 +2812,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2144 if (unlikely(!mm)) 2812 if (unlikely(!mm))
2145 mm = &init_mm; 2813 mm = &init_mm;
2146 2814
2147 if (page_is_file_cache(page)) 2815 if (page_is_file_cache(page)) {
2148 return mem_cgroup_charge_common(page, mm, gfp_mask, 2816 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
2149 MEM_CGROUP_CHARGE_TYPE_CACHE); 2817 if (ret || !mem)
2818 return ret;
2150 2819
2820 /*
2821 * FUSE reuses pages without going through the final
2822 * put that would remove them from the LRU list, make
2823 * sure that they get relinked properly.
2824 */
2825 __mem_cgroup_commit_charge_lrucare(page, mem,
2826 MEM_CGROUP_CHARGE_TYPE_CACHE);
2827 return ret;
2828 }
2151 /* shmem */ 2829 /* shmem */
2152 if (PageSwapCache(page)) { 2830 if (PageSwapCache(page)) {
2153 struct mem_cgroup *mem = NULL;
2154
2155 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2831 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2156 if (!ret) 2832 if (!ret)
2157 __mem_cgroup_commit_charge_swapin(page, mem, 2833 __mem_cgroup_commit_charge_swapin(page, mem,
@@ -2176,6 +2852,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2176 struct mem_cgroup *mem; 2852 struct mem_cgroup *mem;
2177 int ret; 2853 int ret;
2178 2854
2855 *ptr = NULL;
2856
2179 if (mem_cgroup_disabled()) 2857 if (mem_cgroup_disabled())
2180 return 0; 2858 return 0;
2181 2859
@@ -2193,30 +2871,26 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2193 if (!mem) 2871 if (!mem)
2194 goto charge_cur_mm; 2872 goto charge_cur_mm;
2195 *ptr = mem; 2873 *ptr = mem;
2196 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2874 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2197 css_put(&mem->css); 2875 css_put(&mem->css);
2198 return ret; 2876 return ret;
2199charge_cur_mm: 2877charge_cur_mm:
2200 if (unlikely(!mm)) 2878 if (unlikely(!mm))
2201 mm = &init_mm; 2879 mm = &init_mm;
2202 return __mem_cgroup_try_charge(mm, mask, ptr, true); 2880 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
2203} 2881}
2204 2882
2205static void 2883static void
2206__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2884__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2207 enum charge_type ctype) 2885 enum charge_type ctype)
2208{ 2886{
2209 struct page_cgroup *pc;
2210
2211 if (mem_cgroup_disabled()) 2887 if (mem_cgroup_disabled())
2212 return; 2888 return;
2213 if (!ptr) 2889 if (!ptr)
2214 return; 2890 return;
2215 cgroup_exclude_rmdir(&ptr->css); 2891 cgroup_exclude_rmdir(&ptr->css);
2216 pc = lookup_page_cgroup(page); 2892
2217 mem_cgroup_lru_del_before_commit_swapcache(page); 2893 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2218 __mem_cgroup_commit_charge(ptr, pc, ctype);
2219 mem_cgroup_lru_add_after_commit_swapcache(page);
2220 /* 2894 /*
2221 * Now swap is on-memory. This means this page may be 2895 * Now swap is on-memory. This means this page may be
2222 * counted both as mem and swap....double count. 2896 * counted both as mem and swap....double count.
@@ -2264,14 +2938,16 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2264 return; 2938 return;
2265 if (!mem) 2939 if (!mem)
2266 return; 2940 return;
2267 mem_cgroup_cancel_charge(mem); 2941 __mem_cgroup_cancel_charge(mem, 1);
2268} 2942}
2269 2943
2270static void 2944static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
2271__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) 2945 unsigned int nr_pages,
2946 const enum charge_type ctype)
2272{ 2947{
2273 struct memcg_batch_info *batch = NULL; 2948 struct memcg_batch_info *batch = NULL;
2274 bool uncharge_memsw = true; 2949 bool uncharge_memsw = true;
2950
2275 /* If swapout, usage of swap doesn't decrease */ 2951 /* If swapout, usage of swap doesn't decrease */
2276 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2952 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2277 uncharge_memsw = false; 2953 uncharge_memsw = false;
@@ -2286,7 +2962,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2286 batch->memcg = mem; 2962 batch->memcg = mem;
2287 /* 2963 /*
2288 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2964 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2289 * In those cases, all pages freed continously can be expected to be in 2965 * In those cases, all pages freed continuously can be expected to be in
2290 * the same cgroup and we have chance to coalesce uncharges. 2966 * the same cgroup and we have chance to coalesce uncharges.
2291 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 2967 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
2292 * because we want to do uncharge as soon as possible. 2968 * because we want to do uncharge as soon as possible.
@@ -2295,6 +2971,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2295 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2971 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2296 goto direct_uncharge; 2972 goto direct_uncharge;
2297 2973
2974 if (nr_pages > 1)
2975 goto direct_uncharge;
2976
2298 /* 2977 /*
2299 * In typical case, batch->memcg == mem. This means we can 2978 * In typical case, batch->memcg == mem. This means we can
2300 * merge a series of uncharges to an uncharge of res_counter. 2979 * merge a series of uncharges to an uncharge of res_counter.
@@ -2303,14 +2982,14 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2303 if (batch->memcg != mem) 2982 if (batch->memcg != mem)
2304 goto direct_uncharge; 2983 goto direct_uncharge;
2305 /* remember freed charge and uncharge it later */ 2984 /* remember freed charge and uncharge it later */
2306 batch->bytes += PAGE_SIZE; 2985 batch->nr_pages++;
2307 if (uncharge_memsw) 2986 if (uncharge_memsw)
2308 batch->memsw_bytes += PAGE_SIZE; 2987 batch->memsw_nr_pages++;
2309 return; 2988 return;
2310direct_uncharge: 2989direct_uncharge:
2311 res_counter_uncharge(&mem->res, PAGE_SIZE); 2990 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
2312 if (uncharge_memsw) 2991 if (uncharge_memsw)
2313 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 2992 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
2314 if (unlikely(batch->memcg != mem)) 2993 if (unlikely(batch->memcg != mem))
2315 memcg_oom_recover(mem); 2994 memcg_oom_recover(mem);
2316 return; 2995 return;
@@ -2322,8 +3001,9 @@ direct_uncharge:
2322static struct mem_cgroup * 3001static struct mem_cgroup *
2323__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 3002__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2324{ 3003{
2325 struct page_cgroup *pc;
2326 struct mem_cgroup *mem = NULL; 3004 struct mem_cgroup *mem = NULL;
3005 unsigned int nr_pages = 1;
3006 struct page_cgroup *pc;
2327 3007
2328 if (mem_cgroup_disabled()) 3008 if (mem_cgroup_disabled())
2329 return NULL; 3009 return NULL;
@@ -2331,6 +3011,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2331 if (PageSwapCache(page)) 3011 if (PageSwapCache(page))
2332 return NULL; 3012 return NULL;
2333 3013
3014 if (PageTransHuge(page)) {
3015 nr_pages <<= compound_order(page);
3016 VM_BUG_ON(!PageTransHuge(page));
3017 }
2334 /* 3018 /*
2335 * Check if our page_cgroup is valid 3019 * Check if our page_cgroup is valid
2336 */ 3020 */
@@ -2363,7 +3047,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2363 break; 3047 break;
2364 } 3048 }
2365 3049
2366 mem_cgroup_charge_statistics(mem, pc, false); 3050 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
2367 3051
2368 ClearPageCgroupUsed(pc); 3052 ClearPageCgroupUsed(pc);
2369 /* 3053 /*
@@ -2384,7 +3068,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2384 mem_cgroup_get(mem); 3068 mem_cgroup_get(mem);
2385 } 3069 }
2386 if (!mem_cgroup_is_root(mem)) 3070 if (!mem_cgroup_is_root(mem))
2387 __do_uncharge(mem, ctype); 3071 mem_cgroup_do_uncharge(mem, nr_pages, ctype);
2388 3072
2389 return mem; 3073 return mem;
2390 3074
@@ -2424,8 +3108,8 @@ void mem_cgroup_uncharge_start(void)
2424 /* We can do nest. */ 3108 /* We can do nest. */
2425 if (current->memcg_batch.do_batch == 1) { 3109 if (current->memcg_batch.do_batch == 1) {
2426 current->memcg_batch.memcg = NULL; 3110 current->memcg_batch.memcg = NULL;
2427 current->memcg_batch.bytes = 0; 3111 current->memcg_batch.nr_pages = 0;
2428 current->memcg_batch.memsw_bytes = 0; 3112 current->memcg_batch.memsw_nr_pages = 0;
2429 } 3113 }
2430} 3114}
2431 3115
@@ -2446,10 +3130,12 @@ void mem_cgroup_uncharge_end(void)
2446 * This "batch->memcg" is valid without any css_get/put etc... 3130 * This "batch->memcg" is valid without any css_get/put etc...
2447 * because we hide charges behind us. 3131 * because we hide charges behind us.
2448 */ 3132 */
2449 if (batch->bytes) 3133 if (batch->nr_pages)
2450 res_counter_uncharge(&batch->memcg->res, batch->bytes); 3134 res_counter_uncharge(&batch->memcg->res,
2451 if (batch->memsw_bytes) 3135 batch->nr_pages * PAGE_SIZE);
2452 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); 3136 if (batch->memsw_nr_pages)
3137 res_counter_uncharge(&batch->memcg->memsw,
3138 batch->memsw_nr_pages * PAGE_SIZE);
2453 memcg_oom_recover(batch->memcg); 3139 memcg_oom_recover(batch->memcg);
2454 /* forget this pointer (for sanity check) */ 3140 /* forget this pointer (for sanity check) */
2455 batch->memcg = NULL; 3141 batch->memcg = NULL;
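The nr_pages/memsw_nr_pages renaming is easiest to read next to the calling pattern. A rough sketch of how callers such as the truncate and unmap paths are expected to drive the batching (illustrative, not part of this hunk; note that huge pages bypass the batch and take the direct_uncharge path):

	/* Illustrative helper, assuming an array of already-unmapped anon pages. */
	static void uncharge_many(struct page **pages, int nr)
	{
		int i;

		mem_cgroup_uncharge_start();	/* current->memcg_batch.do_batch++ */
		for (i = 0; i < nr; i++)
			mem_cgroup_uncharge_page(pages[i]);	/* bumps batch->nr_pages */
		mem_cgroup_uncharge_end();	/* one res_counter_uncharge() of
						 * batch->nr_pages * PAGE_SIZE */
	}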
@@ -2572,13 +3258,16 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2572 * page belongs to. 3258 * page belongs to.
2573 */ 3259 */
2574int mem_cgroup_prepare_migration(struct page *page, 3260int mem_cgroup_prepare_migration(struct page *page,
2575 struct page *newpage, struct mem_cgroup **ptr) 3261 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
2576{ 3262{
2577 struct page_cgroup *pc;
2578 struct mem_cgroup *mem = NULL; 3263 struct mem_cgroup *mem = NULL;
3264 struct page_cgroup *pc;
2579 enum charge_type ctype; 3265 enum charge_type ctype;
2580 int ret = 0; 3266 int ret = 0;
2581 3267
3268 *ptr = NULL;
3269
3270 VM_BUG_ON(PageTransHuge(page));
2582 if (mem_cgroup_disabled()) 3271 if (mem_cgroup_disabled())
2583 return 0; 3272 return 0;
2584 3273
@@ -2628,7 +3317,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2628 return 0; 3317 return 0;
2629 3318
2630 *ptr = mem; 3319 *ptr = mem;
2631 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); 3320 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
2632 css_put(&mem->css);/* drop extra refcnt */ 3321 css_put(&mem->css);/* drop extra refcnt */
2633 if (ret || *ptr == NULL) { 3322 if (ret || *ptr == NULL) {
2634 if (PageAnon(page)) { 3323 if (PageAnon(page)) {
@@ -2655,13 +3344,13 @@ int mem_cgroup_prepare_migration(struct page *page,
2655 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3344 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2656 else 3345 else
2657 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3346 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2658 __mem_cgroup_commit_charge(mem, pc, ctype); 3347 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2659 return ret; 3348 return ret;
2660} 3349}
2661 3350
2662/* remove redundant charge if migration failed*/ 3351/* remove redundant charge if migration failed*/
2663void mem_cgroup_end_migration(struct mem_cgroup *mem, 3352void mem_cgroup_end_migration(struct mem_cgroup *mem,
2664 struct page *oldpage, struct page *newpage) 3353 struct page *oldpage, struct page *newpage, bool migration_ok)
2665{ 3354{
2666 struct page *used, *unused; 3355 struct page *used, *unused;
2667 struct page_cgroup *pc; 3356 struct page_cgroup *pc;
@@ -2670,8 +3359,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
2670 return; 3359 return;
2671 /* blocks rmdir() */ 3360 /* blocks rmdir() */
2672 cgroup_exclude_rmdir(&mem->css); 3361 cgroup_exclude_rmdir(&mem->css);
2673 /* at migration success, oldpage->mapping is NULL. */ 3362 if (!migration_ok) {
2674 if (oldpage->mapping) {
2675 used = oldpage; 3363 used = oldpage;
2676 unused = newpage; 3364 unused = newpage;
2677 } else { 3365 } else {
@@ -2721,7 +3409,7 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
2721 struct mm_struct *mm, 3409 struct mm_struct *mm,
2722 gfp_t gfp_mask) 3410 gfp_t gfp_mask)
2723{ 3411{
2724 struct mem_cgroup *mem = NULL; 3412 struct mem_cgroup *mem;
2725 int ret; 3413 int ret;
2726 3414
2727 if (mem_cgroup_disabled()) 3415 if (mem_cgroup_disabled())
@@ -2734,6 +3422,52 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
2734 return ret; 3422 return ret;
2735} 3423}
2736 3424
3425#ifdef CONFIG_DEBUG_VM
3426static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3427{
3428 struct page_cgroup *pc;
3429
3430 pc = lookup_page_cgroup(page);
3431 if (likely(pc) && PageCgroupUsed(pc))
3432 return pc;
3433 return NULL;
3434}
3435
3436bool mem_cgroup_bad_page_check(struct page *page)
3437{
3438 if (mem_cgroup_disabled())
3439 return false;
3440
3441 return lookup_page_cgroup_used(page) != NULL;
3442}
3443
3444void mem_cgroup_print_bad_page(struct page *page)
3445{
3446 struct page_cgroup *pc;
3447
3448 pc = lookup_page_cgroup_used(page);
3449 if (pc) {
3450 int ret = -1;
3451 char *path;
3452
3453 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3454 pc, pc->flags, pc->mem_cgroup);
3455
3456 path = kmalloc(PATH_MAX, GFP_KERNEL);
3457 if (path) {
3458 rcu_read_lock();
3459 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3460 path, PATH_MAX);
3461 rcu_read_unlock();
3462 }
3463
3464 printk(KERN_CONT "(%s)\n",
3465 (ret < 0) ? "cannot get the path" : path);
3466 kfree(path);
3467 }
3468}
3469#endif
3470
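For illustration only (addresses and cgroup path are invented), a hit in mem_cgroup_print_bad_page() would print a single line of roughly this shape:

	pc:ffff88007f9f2a30 pc->flags:45 pc->mem_cgroup:ffff880079c32000(/user/test)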
2737static DEFINE_MUTEX(set_limit_mutex); 3471static DEFINE_MUTEX(set_limit_mutex);
2738 3472
2739static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3473static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
@@ -2791,7 +3525,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2791 break; 3525 break;
2792 3526
2793 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3527 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2794 MEM_CGROUP_RECLAIM_SHRINK); 3528 MEM_CGROUP_RECLAIM_SHRINK,
3529 NULL);
2795 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3530 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2796 /* Usage is reduced ? */ 3531 /* Usage is reduced ? */
2797 if (curusage >= oldusage) 3532 if (curusage >= oldusage)
@@ -2851,7 +3586,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2851 3586
2852 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3587 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2853 MEM_CGROUP_RECLAIM_NOSWAP | 3588 MEM_CGROUP_RECLAIM_NOSWAP |
2854 MEM_CGROUP_RECLAIM_SHRINK); 3589 MEM_CGROUP_RECLAIM_SHRINK,
3590 NULL);
2855 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3591 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2856 /* Usage is reduced ? */ 3592 /* Usage is reduced ? */
2857 if (curusage >= oldusage) 3593 if (curusage >= oldusage)
@@ -2865,7 +3601,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2865} 3601}
2866 3602
2867unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3603unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2868 gfp_t gfp_mask) 3604 gfp_t gfp_mask,
3605 unsigned long *total_scanned)
2869{ 3606{
2870 unsigned long nr_reclaimed = 0; 3607 unsigned long nr_reclaimed = 0;
2871 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3608 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2873,6 +3610,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2873 int loop = 0; 3610 int loop = 0;
2874 struct mem_cgroup_tree_per_zone *mctz; 3611 struct mem_cgroup_tree_per_zone *mctz;
2875 unsigned long long excess; 3612 unsigned long long excess;
3613 unsigned long nr_scanned;
2876 3614
2877 if (order > 0) 3615 if (order > 0)
2878 return 0; 3616 return 0;
@@ -2891,10 +3629,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2891 if (!mz) 3629 if (!mz)
2892 break; 3630 break;
2893 3631
3632 nr_scanned = 0;
2894 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3633 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2895 gfp_mask, 3634 gfp_mask,
2896 MEM_CGROUP_RECLAIM_SOFT); 3635 MEM_CGROUP_RECLAIM_SOFT,
3636 &nr_scanned);
2897 nr_reclaimed += reclaimed; 3637 nr_reclaimed += reclaimed;
3638 *total_scanned += nr_scanned;
2898 spin_lock(&mctz->lock); 3639 spin_lock(&mctz->lock);
2899 3640
2900 /* 3641 /*
@@ -2917,10 +3658,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2917 */ 3658 */
2918 next_mz = 3659 next_mz =
2919 __mem_cgroup_largest_soft_limit_node(mctz); 3660 __mem_cgroup_largest_soft_limit_node(mctz);
2920 if (next_mz == mz) { 3661 if (next_mz == mz)
2921 css_put(&next_mz->mem->css); 3662 css_put(&next_mz->mem->css);
2922 next_mz = NULL; 3663 else /* next_mz == NULL or other memcg */
2923 } else /* next_mz == NULL or other memcg */
2924 break; 3664 break;
2925 } while (1); 3665 } while (1);
2926 } 3666 }
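The new total_scanned argument only pays off together with its caller. The matching change on the mm/vmscan.c side (not shown in this hunk) is expected to look roughly like the following, feeding the soft-limit scan count back into the scan-control bookkeeping:

	nr_soft_scanned = 0;
	nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, order,
						sc.gfp_mask, &nr_soft_scanned);
	sc.nr_reclaimed += nr_soft_reclaimed;
	sc.nr_scanned += nr_soft_scanned;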
@@ -2977,6 +3717,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2977 loop += 256; 3717 loop += 256;
2978 busy = NULL; 3718 busy = NULL;
2979 while (loop--) { 3719 while (loop--) {
3720 struct page *page;
3721
2980 ret = 0; 3722 ret = 0;
2981 spin_lock_irqsave(&zone->lru_lock, flags); 3723 spin_lock_irqsave(&zone->lru_lock, flags);
2982 if (list_empty(list)) { 3724 if (list_empty(list)) {
@@ -2992,7 +3734,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2992 } 3734 }
2993 spin_unlock_irqrestore(&zone->lru_lock, flags); 3735 spin_unlock_irqrestore(&zone->lru_lock, flags);
2994 3736
2995 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 3737 page = lookup_cgroup_page(pc);
3738
3739 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
2996 if (ret == -ENOMEM) 3740 if (ret == -ENOMEM)
2997 break; 3741 break;
2998 3742
@@ -3038,6 +3782,7 @@ move_account:
3038 lru_add_drain_all(); 3782 lru_add_drain_all();
3039 drain_all_stock_sync(); 3783 drain_all_stock_sync();
3040 ret = 0; 3784 ret = 0;
3785 mem_cgroup_start_move(mem);
3041 for_each_node_state(node, N_HIGH_MEMORY) { 3786 for_each_node_state(node, N_HIGH_MEMORY) {
3042 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3787 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3043 enum lru_list l; 3788 enum lru_list l;
@@ -3051,6 +3796,7 @@ move_account:
3051 if (ret) 3796 if (ret)
3052 break; 3797 break;
3053 } 3798 }
3799 mem_cgroup_end_move(mem);
3054 memcg_oom_recover(mem); 3800 memcg_oom_recover(mem);
3055 /* it seems parent cgroup doesn't have enough mem */ 3801 /* it seems parent cgroup doesn't have enough mem */
3056 if (ret == -ENOMEM) 3802 if (ret == -ENOMEM)
@@ -3137,33 +3883,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3137 return retval; 3883 return retval;
3138} 3884}
3139 3885
3140struct mem_cgroup_idx_data {
3141 s64 val;
3142 enum mem_cgroup_stat_index idx;
3143};
3144 3886
3145static int 3887static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3146mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 3888 enum mem_cgroup_stat_index idx)
3147{ 3889{
3148 struct mem_cgroup_idx_data *d = data; 3890 struct mem_cgroup *iter;
3149 d->val += mem_cgroup_read_stat(mem, d->idx); 3891 long val = 0;
3150 return 0;
3151}
3152 3892
3153static void 3893 /* Per-cpu values can be negative, use a signed accumulator */
3154mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3894 for_each_mem_cgroup_tree(iter, mem)
3155 enum mem_cgroup_stat_index idx, s64 *val) 3895 val += mem_cgroup_read_stat(iter, idx);
3156{ 3896
3157 struct mem_cgroup_idx_data d; 3897 if (val < 0) /* race ? */
3158 d.idx = idx; 3898 val = 0;
3159 d.val = 0; 3899 return val;
3160 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
3161 *val = d.val;
3162} 3900}
3163 3901
3164static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3902static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3165{ 3903{
3166 u64 idx_val, val; 3904 u64 val;
3167 3905
3168 if (!mem_cgroup_is_root(mem)) { 3906 if (!mem_cgroup_is_root(mem)) {
3169 if (!swap) 3907 if (!swap)
@@ -3172,16 +3910,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3172 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3910 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3173 } 3911 }
3174 3912
3175 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); 3913 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
3176 val = idx_val; 3914 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
3177 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
3178 val += idx_val;
3179 3915
3180 if (swap) { 3916 if (swap)
3181 mem_cgroup_get_recursive_idx_stat(mem, 3917 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3182 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
3183 val += idx_val;
3184 }
3185 3918
3186 return val << PAGE_SHIFT; 3919 return val << PAGE_SHIFT;
3187} 3920}
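Since the recursive statistics are now kept in pages, the final shift converts pages to bytes. A quick worked example, assuming x86 with PAGE_SHIFT == 12 and invented counter values:

	/* cache = 300 pages, rss = 200 pages, swap accounting disabled */
	val = 300 + 200;		/* 500 pages */
	return val << PAGE_SHIFT;	/* 500 << 12 == 2048000 bytes */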
@@ -3359,6 +4092,8 @@ enum {
3359 MCS_PGPGIN, 4092 MCS_PGPGIN,
3360 MCS_PGPGOUT, 4093 MCS_PGPGOUT,
3361 MCS_SWAP, 4094 MCS_SWAP,
4095 MCS_PGFAULT,
4096 MCS_PGMAJFAULT,
3362 MCS_INACTIVE_ANON, 4097 MCS_INACTIVE_ANON,
3363 MCS_ACTIVE_ANON, 4098 MCS_ACTIVE_ANON,
3364 MCS_INACTIVE_FILE, 4099 MCS_INACTIVE_FILE,
@@ -3381,6 +4116,8 @@ struct {
3381 {"pgpgin", "total_pgpgin"}, 4116 {"pgpgin", "total_pgpgin"},
3382 {"pgpgout", "total_pgpgout"}, 4117 {"pgpgout", "total_pgpgout"},
3383 {"swap", "total_swap"}, 4118 {"swap", "total_swap"},
4119 {"pgfault", "total_pgfault"},
4120 {"pgmajfault", "total_pgmajfault"},
3384 {"inactive_anon", "total_inactive_anon"}, 4121 {"inactive_anon", "total_inactive_anon"},
3385 {"active_anon", "total_active_anon"}, 4122 {"active_anon", "total_active_anon"},
3386 {"inactive_file", "total_inactive_file"}, 4123 {"inactive_file", "total_inactive_file"},
@@ -3389,9 +4126,9 @@ struct {
3389}; 4126};
3390 4127
3391 4128
3392static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 4129static void
4130mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3393{ 4131{
3394 struct mcs_total_stat *s = data;
3395 s64 val; 4132 s64 val;
3396 4133
3397 /* per cpu stat */ 4134 /* per cpu stat */
@@ -3401,14 +4138,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3401 s->stat[MCS_RSS] += val * PAGE_SIZE; 4138 s->stat[MCS_RSS] += val * PAGE_SIZE;
3402 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 4139 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3403 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 4140 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3404 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); 4141 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
3405 s->stat[MCS_PGPGIN] += val; 4142 s->stat[MCS_PGPGIN] += val;
3406 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); 4143 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
3407 s->stat[MCS_PGPGOUT] += val; 4144 s->stat[MCS_PGPGOUT] += val;
3408 if (do_swap_account) { 4145 if (do_swap_account) {
3409 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4146 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3410 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4147 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3411 } 4148 }
4149 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
4150 s->stat[MCS_PGFAULT] += val;
4151 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
4152 s->stat[MCS_PGMAJFAULT] += val;
3412 4153
3413 /* per zone stat */ 4154 /* per zone stat */
3414 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 4155 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -3421,15 +4162,62 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3421 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4162 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3422 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 4163 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3423 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4164 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3424 return 0;
3425} 4165}
3426 4166
3427static void 4167static void
3428mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4168mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3429{ 4169{
3430 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 4170 struct mem_cgroup *iter;
4171
4172 for_each_mem_cgroup_tree(iter, mem)
4173 mem_cgroup_get_local_stat(iter, s);
3431} 4174}
3432 4175
4176#ifdef CONFIG_NUMA
4177static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4178{
4179 int nid;
4180 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4181 unsigned long node_nr;
4182 struct cgroup *cont = m->private;
4183 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4184
4185 total_nr = mem_cgroup_nr_lru_pages(mem_cont);
4186 seq_printf(m, "total=%lu", total_nr);
4187 for_each_node_state(nid, N_HIGH_MEMORY) {
4188 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid);
4189 seq_printf(m, " N%d=%lu", nid, node_nr);
4190 }
4191 seq_putc(m, '\n');
4192
4193 file_nr = mem_cgroup_nr_file_lru_pages(mem_cont);
4194 seq_printf(m, "file=%lu", file_nr);
4195 for_each_node_state(nid, N_HIGH_MEMORY) {
4196 node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid);
4197 seq_printf(m, " N%d=%lu", nid, node_nr);
4198 }
4199 seq_putc(m, '\n');
4200
4201 anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont);
4202 seq_printf(m, "anon=%lu", anon_nr);
4203 for_each_node_state(nid, N_HIGH_MEMORY) {
4204 node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid);
4205 seq_printf(m, " N%d=%lu", nid, node_nr);
4206 }
4207 seq_putc(m, '\n');
4208
4209 unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont);
4210 seq_printf(m, "unevictable=%lu", unevictable_nr);
4211 for_each_node_state(nid, N_HIGH_MEMORY) {
4212 node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont,
4213 nid);
4214 seq_printf(m, " N%d=%lu", nid, node_nr);
4215 }
4216 seq_putc(m, '\n');
4217 return 0;
4218}
4219#endif /* CONFIG_NUMA */
4220
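Given the seq_printf() calls above, reading the new numa_stat file should yield output of the following shape on a two-node machine; all counts are invented and the cgroupfs mount point is an assumption:

	# cat /cgroup/memory/<group>/memory.numa_stat
	total=2048 N0=1024 N1=1024
	file=1536 N0=768 N1=768
	anon=512 N0=256 N1=256
	unevictable=0 N0=0 N1=0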
3433static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4221static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3434 struct cgroup_map_cb *cb) 4222 struct cgroup_map_cb *cb)
3435{ 4223{
@@ -3440,6 +4228,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3440 memset(&mystat, 0, sizeof(mystat)); 4228 memset(&mystat, 0, sizeof(mystat));
3441 mem_cgroup_get_local_stat(mem_cont, &mystat); 4229 mem_cgroup_get_local_stat(mem_cont, &mystat);
3442 4230
4231
3443 for (i = 0; i < NR_MCS_STAT; i++) { 4232 for (i = 0; i < NR_MCS_STAT; i++) {
3444 if (i == MCS_SWAP && !do_swap_account) 4233 if (i == MCS_SWAP && !do_swap_account)
3445 continue; 4234 continue;
@@ -3525,9 +4314,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3525 return -EINVAL; 4314 return -EINVAL;
3526 } 4315 }
3527 4316
3528 spin_lock(&memcg->reclaim_param_lock);
3529 memcg->swappiness = val; 4317 memcg->swappiness = val;
3530 spin_unlock(&memcg->reclaim_param_lock);
3531 4318
3532 cgroup_unlock(); 4319 cgroup_unlock();
3533 4320
@@ -3604,7 +4391,7 @@ static int compare_thresholds(const void *a, const void *b)
3604 return _a->threshold - _b->threshold; 4391 return _a->threshold - _b->threshold;
3605} 4392}
3606 4393
3607static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) 4394static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
3608{ 4395{
3609 struct mem_cgroup_eventfd_list *ev; 4396 struct mem_cgroup_eventfd_list *ev;
3610 4397
@@ -3615,7 +4402,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3615 4402
3616static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 4403static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3617{ 4404{
3618 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); 4405 struct mem_cgroup *iter;
4406
4407 for_each_mem_cgroup_tree(iter, mem)
4408 mem_cgroup_oom_notify_cb(iter);
3619} 4409}
3620 4410
3621static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 4411static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
@@ -3862,6 +4652,22 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
3862 return 0; 4652 return 0;
3863} 4653}
3864 4654
4655#ifdef CONFIG_NUMA
4656static const struct file_operations mem_control_numa_stat_file_operations = {
4657 .read = seq_read,
4658 .llseek = seq_lseek,
4659 .release = single_release,
4660};
4661
4662static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4663{
4664 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4665
4666 file->f_op = &mem_control_numa_stat_file_operations;
4667 return single_open(file, mem_control_numa_stat_show, cont);
4668}
4669#endif /* CONFIG_NUMA */
4670
3865static struct cftype mem_cgroup_files[] = { 4671static struct cftype mem_cgroup_files[] = {
3866 { 4672 {
3867 .name = "usage_in_bytes", 4673 .name = "usage_in_bytes",
@@ -3925,6 +4731,13 @@ static struct cftype mem_cgroup_files[] = {
3925 .unregister_event = mem_cgroup_oom_unregister_event, 4731 .unregister_event = mem_cgroup_oom_unregister_event,
3926 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4732 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3927 }, 4733 },
4734#ifdef CONFIG_NUMA
4735 {
4736 .name = "numa_stat",
4737 .open = mem_control_numa_stat_open,
4738 .mode = S_IRUGO,
4739 },
4740#endif
3928}; 4741};
3929 4742
3930#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4743#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -3986,13 +4799,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3986 */ 4799 */
3987 if (!node_state(node, N_NORMAL_MEMORY)) 4800 if (!node_state(node, N_NORMAL_MEMORY))
3988 tmp = -1; 4801 tmp = -1;
3989 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4802 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
3990 if (!pn) 4803 if (!pn)
3991 return 1; 4804 return 1;
3992 4805
3993 mem->info.nodeinfo[node] = pn; 4806 mem->info.nodeinfo[node] = pn;
3994 memset(pn, 0, sizeof(*pn));
3995
3996 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4807 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3997 mz = &pn->zoneinfo[zone]; 4808 mz = &pn->zoneinfo[zone];
3998 for_each_lru(l) 4809 for_each_lru(l)
@@ -4016,23 +4827,25 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4016 4827
4017 /* Can be very big if MAX_NUMNODES is very big */ 4828 /* Can be very big if MAX_NUMNODES is very big */
4018 if (size < PAGE_SIZE) 4829 if (size < PAGE_SIZE)
4019 mem = kmalloc(size, GFP_KERNEL); 4830 mem = kzalloc(size, GFP_KERNEL);
4020 else 4831 else
4021 mem = vmalloc(size); 4832 mem = vzalloc(size);
4022 4833
4023 if (!mem) 4834 if (!mem)
4024 return NULL; 4835 return NULL;
4025 4836
4026 memset(mem, 0, size);
4027 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4837 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4028 if (!mem->stat) { 4838 if (!mem->stat)
4029 if (size < PAGE_SIZE) 4839 goto out_free;
4030 kfree(mem); 4840 spin_lock_init(&mem->pcp_counter_lock);
4031 else
4032 vfree(mem);
4033 mem = NULL;
4034 }
4035 return mem; 4841 return mem;
4842
4843out_free:
4844 if (size < PAGE_SIZE)
4845 kfree(mem);
4846 else
4847 vfree(mem);
4848 return NULL;
4036} 4849}
4037 4850
4038/* 4851/*
@@ -4158,7 +4971,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4158 &per_cpu(memcg_stock, cpu); 4971 &per_cpu(memcg_stock, cpu);
4159 INIT_WORK(&stock->work, drain_local_stock); 4972 INIT_WORK(&stock->work, drain_local_stock);
4160 } 4973 }
4161 hotcpu_notifier(memcg_stock_cpu_callback, 0); 4974 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4162 } else { 4975 } else {
4163 parent = mem_cgroup_from_cont(cont->parent); 4976 parent = mem_cgroup_from_cont(cont->parent);
4164 mem->use_hierarchy = parent->use_hierarchy; 4977 mem->use_hierarchy = parent->use_hierarchy;
@@ -4180,7 +4993,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4180 res_counter_init(&mem->memsw, NULL); 4993 res_counter_init(&mem->memsw, NULL);
4181 } 4994 }
4182 mem->last_scanned_child = 0; 4995 mem->last_scanned_child = 0;
4183 spin_lock_init(&mem->reclaim_param_lock); 4996 mem->last_scanned_node = MAX_NUMNODES;
4184 INIT_LIST_HEAD(&mem->oom_notify); 4997 INIT_LIST_HEAD(&mem->oom_notify);
4185 4998
4186 if (parent) 4999 if (parent)
@@ -4268,7 +5081,7 @@ one_by_one:
4268 batch_count = PRECHARGE_COUNT_AT_ONCE; 5081 batch_count = PRECHARGE_COUNT_AT_ONCE;
4269 cond_resched(); 5082 cond_resched();
4270 } 5083 }
4271 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 5084 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
4272 if (ret || !mem) 5085 if (ret || !mem)
4273 /* mem_cgroup_clear_mc() will do uncharge later */ 5086 /* mem_cgroup_clear_mc() will do uncharge later */
4274 return -ENOMEM; 5087 return -ENOMEM;
@@ -4430,6 +5243,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4430 pte_t *pte; 5243 pte_t *pte;
4431 spinlock_t *ptl; 5244 spinlock_t *ptl;
4432 5245
5246 split_huge_page_pmd(walk->mm, pmd);
5247
4433 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5248 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4434 for (; addr != end; pte++, addr += PAGE_SIZE) 5249 for (; addr != end; pte++, addr += PAGE_SIZE)
4435 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 5250 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4467,10 +5282,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4467 5282
4468static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5283static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4469{ 5284{
4470 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); 5285 unsigned long precharge = mem_cgroup_count_precharge(mm);
5286
5287 VM_BUG_ON(mc.moving_task);
5288 mc.moving_task = current;
5289 return mem_cgroup_do_precharge(precharge);
4471} 5290}
4472 5291
4473static void mem_cgroup_clear_mc(void) 5292/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5293static void __mem_cgroup_clear_mc(void)
4474{ 5294{
4475 struct mem_cgroup *from = mc.from; 5295 struct mem_cgroup *from = mc.from;
4476 struct mem_cgroup *to = mc.to; 5296 struct mem_cgroup *to = mc.to;
@@ -4505,23 +5325,33 @@ static void mem_cgroup_clear_mc(void)
4505 PAGE_SIZE * mc.moved_swap); 5325 PAGE_SIZE * mc.moved_swap);
4506 } 5326 }
4507 /* we've already done mem_cgroup_get(mc.to) */ 5327 /* we've already done mem_cgroup_get(mc.to) */
4508
4509 mc.moved_swap = 0; 5328 mc.moved_swap = 0;
4510 } 5329 }
5330 memcg_oom_recover(from);
5331 memcg_oom_recover(to);
5332 wake_up_all(&mc.waitq);
5333}
5334
5335static void mem_cgroup_clear_mc(void)
5336{
5337 struct mem_cgroup *from = mc.from;
5338
5339 /*
5340 * we must clear moving_task before waking up waiters at the end of
5341 * task migration.
5342 */
5343 mc.moving_task = NULL;
5344 __mem_cgroup_clear_mc();
4511 spin_lock(&mc.lock); 5345 spin_lock(&mc.lock);
4512 mc.from = NULL; 5346 mc.from = NULL;
4513 mc.to = NULL; 5347 mc.to = NULL;
4514 mc.moving_task = NULL;
4515 spin_unlock(&mc.lock); 5348 spin_unlock(&mc.lock);
4516 memcg_oom_recover(from); 5349 mem_cgroup_end_move(from);
4517 memcg_oom_recover(to);
4518 wake_up_all(&mc.waitq);
4519} 5350}
4520 5351
4521static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5352static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4522 struct cgroup *cgroup, 5353 struct cgroup *cgroup,
4523 struct task_struct *p, 5354 struct task_struct *p)
4524 bool threadgroup)
4525{ 5355{
4526 int ret = 0; 5356 int ret = 0;
4527 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 5357 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
@@ -4542,15 +5372,12 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4542 VM_BUG_ON(mc.precharge); 5372 VM_BUG_ON(mc.precharge);
4543 VM_BUG_ON(mc.moved_charge); 5373 VM_BUG_ON(mc.moved_charge);
4544 VM_BUG_ON(mc.moved_swap); 5374 VM_BUG_ON(mc.moved_swap);
4545 VM_BUG_ON(mc.moving_task); 5375 mem_cgroup_start_move(from);
4546 spin_lock(&mc.lock); 5376 spin_lock(&mc.lock);
4547 mc.from = from; 5377 mc.from = from;
4548 mc.to = mem; 5378 mc.to = mem;
4549 mc.precharge = 0;
4550 mc.moved_charge = 0;
4551 mc.moved_swap = 0;
4552 mc.moving_task = current;
4553 spin_unlock(&mc.lock); 5379 spin_unlock(&mc.lock);
5380 /* We set mc.moving_task later */
4554 5381
4555 ret = mem_cgroup_precharge_mc(mm); 5382 ret = mem_cgroup_precharge_mc(mm);
4556 if (ret) 5383 if (ret)
@@ -4563,8 +5390,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4563 5390
4564static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5391static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4565 struct cgroup *cgroup, 5392 struct cgroup *cgroup,
4566 struct task_struct *p, 5393 struct task_struct *p)
4567 bool threadgroup)
4568{ 5394{
4569 mem_cgroup_clear_mc(); 5395 mem_cgroup_clear_mc();
4570} 5396}
@@ -4578,6 +5404,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4578 pte_t *pte; 5404 pte_t *pte;
4579 spinlock_t *ptl; 5405 spinlock_t *ptl;
4580 5406
5407 split_huge_page_pmd(walk->mm, pmd);
4581retry: 5408retry:
4582 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5409 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4583 for (; addr != end; addr += PAGE_SIZE) { 5410 for (; addr != end; addr += PAGE_SIZE) {
@@ -4598,8 +5425,8 @@ retry:
4598 if (isolate_lru_page(page)) 5425 if (isolate_lru_page(page))
4599 goto put; 5426 goto put;
4600 pc = lookup_page_cgroup(page); 5427 pc = lookup_page_cgroup(page);
4601 if (!mem_cgroup_move_account(pc, 5428 if (!mem_cgroup_move_account(page, 1, pc,
4602 mc.from, mc.to, false)) { 5429 mc.from, mc.to, false)) {
4603 mc.precharge--; 5430 mc.precharge--;
4604 /* we uncharge from mc.from later. */ 5431 /* we uncharge from mc.from later. */
4605 mc.moved_charge++; 5432 mc.moved_charge++;
@@ -4644,7 +5471,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4644 struct vm_area_struct *vma; 5471 struct vm_area_struct *vma;
4645 5472
4646 lru_add_drain_all(); 5473 lru_add_drain_all();
4647 down_read(&mm->mmap_sem); 5474retry:
5475 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5476 /*
5477 * Someone who is holding the mmap_sem might be waiting in
5478 * waitq. So we cancel all extra charges, wake up all waiters,
5479 * and retry. Because we cancel precharges, we might not be able
5480 * to move enough charges, but moving charge is a best-effort
5481 * feature anyway, so it wouldn't be a big problem.
5482 */
5483 __mem_cgroup_clear_mc();
5484 cond_resched();
5485 goto retry;
5486 }
4648 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5487 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4649 int ret; 5488 int ret;
4650 struct mm_walk mem_cgroup_move_charge_walk = { 5489 struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4669,41 +5508,35 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4669static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5508static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4670 struct cgroup *cont, 5509 struct cgroup *cont,
4671 struct cgroup *old_cont, 5510 struct cgroup *old_cont,
4672 struct task_struct *p, 5511 struct task_struct *p)
4673 bool threadgroup)
4674{ 5512{
4675 struct mm_struct *mm; 5513 struct mm_struct *mm = get_task_mm(p);
4676 5514
4677 if (!mc.to)
4678 /* no need to move charge */
4679 return;
4680
4681 mm = get_task_mm(p);
4682 if (mm) { 5515 if (mm) {
4683 mem_cgroup_move_charge(mm); 5516 if (mc.to)
5517 mem_cgroup_move_charge(mm);
5518 put_swap_token(mm);
4684 mmput(mm); 5519 mmput(mm);
4685 } 5520 }
4686 mem_cgroup_clear_mc(); 5521 if (mc.to)
5522 mem_cgroup_clear_mc();
4687} 5523}
4688#else /* !CONFIG_MMU */ 5524#else /* !CONFIG_MMU */
4689static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5525static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4690 struct cgroup *cgroup, 5526 struct cgroup *cgroup,
4691 struct task_struct *p, 5527 struct task_struct *p)
4692 bool threadgroup)
4693{ 5528{
4694 return 0; 5529 return 0;
4695} 5530}
4696static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5531static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4697 struct cgroup *cgroup, 5532 struct cgroup *cgroup,
4698 struct task_struct *p, 5533 struct task_struct *p)
4699 bool threadgroup)
4700{ 5534{
4701} 5535}
4702static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5536static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4703 struct cgroup *cont, 5537 struct cgroup *cont,
4704 struct cgroup *old_cont, 5538 struct cgroup *old_cont,
4705 struct task_struct *p, 5539 struct task_struct *p)
4706 bool threadgroup)
4707{ 5540{
4708} 5541}
4709#endif 5542#endif
@@ -4723,11 +5556,15 @@ struct cgroup_subsys mem_cgroup_subsys = {
4723}; 5556};
4724 5557
4725#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5558#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4726 5559static int __init enable_swap_account(char *s)
4727static int __init disable_swap_account(char *s)
4728{ 5560{
4729 really_do_swap_account = 0; 5561 /* consider enabled if no parameter or 1 is given */
5562 if (!strcmp(s, "1"))
5563 really_do_swap_account = 1;
5564 else if (!strcmp(s, "0"))
5565 really_do_swap_account = 0;
4730 return 1; 5566 return 1;
4731} 5567}
4732__setup("noswapaccount", disable_swap_account); 5568__setup("swapaccount=", enable_swap_account);
5569
4733#endif 5570#endif
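With the old noswapaccount parameter gone, the strcmp() checks above mean swap accounting is now toggled from the kernel command line roughly as follows (illustrative):

	swapaccount=1	# force memsw accounting on
	swapaccount=0	# disable memsw accounting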
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe..740c4f52059c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
7 * Free Software Foundation. 7 * Free Software Foundation.
8 * 8 *
9 * High level machine check handler. Handles pages reported by the 9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted usually due to a 2bit ECC memory or cache 10 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
11 * failure. 11 * failure.
12 *
13 * In addition there is a "soft offline" entry point that allows us to stop
14 * using suspicious, not-yet-corrupted pages without killing anything.
12 * 15 *
13 * Handles page cache pages in various states. The tricky part 16 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronous to other VM 17 * here is that we can access any page asynchronously with respect to
15 * users, because memory failures could happen anytime and anywhere, 18 * other VM users, because memory failures could happen anytime and
16 * possibly violating some of their assumptions. This is why this code 19 * anywhere. This could violate some of their assumptions. This is why
17 * has to be extremely careful. Generally it tries to use normal locking 20 * this code has to be extremely careful. Generally it tries to use
18 * rules, as in get the standard locks, even if that means the 21 * normal locking rules, as in get the standard locks, even if that means
19 * error handling takes potentially a long time. 22 * the error handling takes potentially a long time.
20 * 23 *
21 * The operation to map back from RMAP chains to processes has to walk 24 * There are several operations here with exponential complexity because
22 * the complete process list and has non linear complexity with the number 25 * of unsuitable VM data structures. For example the operation to map back
23 * mappings. In short it can be quite slow. But since memory corruptions 26 * from RMAP chains to processes has to walk the complete process list and
24 * are rare we hope to get away with this. 27 * has non linear complexity with the number of mappings. But since memory corruptions
28 * are rare we hope to get away with this. This avoids impacting the core
29 * VM.
25 */ 30 */
26 31
27/* 32/*
@@ -30,7 +35,6 @@
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages 35 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel 36 * - pass bad pages to kdump next kernel
32 */ 37 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h> 38#include <linux/kernel.h>
35#include <linux/mm.h> 39#include <linux/mm.h>
36#include <linux/page-flags.h> 40#include <linux/page-flags.h>
@@ -47,6 +51,8 @@
47#include <linux/slab.h> 51#include <linux/slab.h>
48#include <linux/swapops.h> 52#include <linux/swapops.h>
49#include <linux/hugetlb.h> 53#include <linux/hugetlb.h>
54#include <linux/memory_hotplug.h>
55#include <linux/mm_inline.h>
50#include "internal.h" 56#include "internal.h"
51 57
52int sysctl_memory_failure_early_kill __read_mostly = 0; 58int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -78,7 +84,7 @@ static int hwpoison_filter_dev(struct page *p)
78 return 0; 84 return 0;
79 85
80 /* 86 /*
81 * page_mapping() does not accept slab page 87 * page_mapping() does not accept slab pages.
82 */ 88 */
83 if (PageSlab(p)) 89 if (PageSlab(p))
84 return -EINVAL; 90 return -EINVAL;
@@ -198,12 +204,12 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
198#ifdef __ARCH_SI_TRAPNO 204#ifdef __ARCH_SI_TRAPNO
199 si.si_trapno = trapno; 205 si.si_trapno = trapno;
200#endif 206#endif
201 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; 207 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
202 /* 208 /*
203 * Don't use force here, it's convenient if the signal 209 * Don't use force here, it's convenient if the signal
204 * can be temporarily blocked. 210 * can be temporarily blocked.
205 * This could cause a loop when the user sets SIGBUS 211 * This could cause a loop when the user sets SIGBUS
206 * to SIG_IGN, but hopefully noone will do that? 212 * to SIG_IGN, but hopefully no one will do that?
207 */ 213 */
208 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ 214 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
209 if (ret < 0) 215 if (ret < 0)
@@ -228,13 +234,17 @@ void shake_page(struct page *p, int access)
228 } 234 }
229 235
230 /* 236 /*
231 * Only all shrink_slab here (which would also 237 * Only call shrink_slab here (which would also shrink other caches) if
232 * shrink other caches) if access is not potentially fatal. 238 * access is not potentially fatal.
233 */ 239 */
234 if (access) { 240 if (access) {
235 int nr; 241 int nr;
236 do { 242 do {
237 nr = shrink_slab(1000, GFP_KERNEL, 1000); 243 struct shrink_control shrink = {
244 .gfp_mask = GFP_KERNEL,
245 };
246
247 nr = shrink_slab(&shrink, 1000, 1000);
238 if (page_count(p) == 1) 248 if (page_count(p) == 1)
239 break; 249 break;
240 } while (nr > 10); 250 } while (nr > 10);
@@ -268,7 +278,7 @@ struct to_kill {
268 struct list_head nd; 278 struct list_head nd;
269 struct task_struct *tsk; 279 struct task_struct *tsk;
270 unsigned long addr; 280 unsigned long addr;
271 unsigned addr_valid:1; 281 char addr_valid;
272}; 282};
273 283
274/* 284/*
@@ -309,7 +319,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
309 * a SIGKILL because the error is not contained anymore. 319 * a SIGKILL because the error is not contained anymore.
310 */ 320 */
311 if (tk->addr == -EFAULT) { 321 if (tk->addr == -EFAULT) {
312 pr_debug("MCE: Unable to find user space address %lx in %s\n", 322 pr_info("MCE: Unable to find user space address %lx in %s\n",
313 page_to_pfn(p), tsk->comm); 323 page_to_pfn(p), tsk->comm);
314 tk->addr_valid = 0; 324 tk->addr_valid = 0;
315 } 325 }
@@ -381,10 +391,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
381 struct task_struct *tsk; 391 struct task_struct *tsk;
382 struct anon_vma *av; 392 struct anon_vma *av;
383 393
384 read_lock(&tasklist_lock);
385 av = page_lock_anon_vma(page); 394 av = page_lock_anon_vma(page);
386 if (av == NULL) /* Not actually mapped anymore */ 395 if (av == NULL) /* Not actually mapped anymore */
387 goto out; 396 return;
397
398 read_lock(&tasklist_lock);
388 for_each_process (tsk) { 399 for_each_process (tsk) {
389 struct anon_vma_chain *vmac; 400 struct anon_vma_chain *vmac;
390 401
@@ -398,9 +409,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
398 add_to_kill(tsk, page, vma, to_kill, tkc); 409 add_to_kill(tsk, page, vma, to_kill, tkc);
399 } 410 }
400 } 411 }
401 page_unlock_anon_vma(av);
402out:
403 read_unlock(&tasklist_lock); 412 read_unlock(&tasklist_lock);
413 page_unlock_anon_vma(av);
404} 414}
405 415
406/* 416/*
@@ -414,17 +424,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
414 struct prio_tree_iter iter; 424 struct prio_tree_iter iter;
415 struct address_space *mapping = page->mapping; 425 struct address_space *mapping = page->mapping;
416 426
417 /* 427 mutex_lock(&mapping->i_mmap_mutex);
418 * A note on the locking order between the two locks.
419 * We don't rely on this particular order.
420 * If you have some other code that needs a different order
421 * feel free to switch them around. Or add a reverse link
422 * from mm_struct to task_struct, then this could be all
423 * done without taking tasklist_lock and looping over all tasks.
424 */
425
426 read_lock(&tasklist_lock); 428 read_lock(&tasklist_lock);
427 spin_lock(&mapping->i_mmap_lock);
428 for_each_process(tsk) { 429 for_each_process(tsk) {
429 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 430 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
430 431
@@ -444,8 +445,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
444 add_to_kill(tsk, page, vma, to_kill, tkc); 445 add_to_kill(tsk, page, vma, to_kill, tkc);
445 } 446 }
446 } 447 }
447 spin_unlock(&mapping->i_mmap_lock);
448 read_unlock(&tasklist_lock); 448 read_unlock(&tasklist_lock);
449 mutex_unlock(&mapping->i_mmap_mutex);
449} 450}
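The hunks above also settle the nesting between the mapping-level lock (now i_mmap_mutex here, or the anon_vma lock in collect_procs_anon()) and tasklist_lock: the mapping lock is taken first and released last. A condensed sketch of the resulting pattern, with the per-task work elided:

#include <linux/fs.h>
#include <linux/sched.h>

/* Sketch: walk every task while the mapping's vma tree is stabilized. */
static void walk_mappers_sketch(struct address_space *mapping)
{
	struct task_struct *tsk;

	mutex_lock(&mapping->i_mmap_mutex);	/* outer: pins the vma tree */
	read_lock(&tasklist_lock);		/* inner: pins the task list */
	for_each_process(tsk) {
		/* ... vma lookup and add_to_kill() as in the hunk ... */
	}
	read_unlock(&tasklist_lock);
	mutex_unlock(&mapping->i_mmap_mutex);
}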
450 451
451/* 452/*
@@ -577,7 +578,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
577 pfn, err); 578 pfn, err);
578 } else if (page_has_private(p) && 579 } else if (page_has_private(p) &&
579 !try_to_release_page(p, GFP_NOIO)) { 580 !try_to_release_page(p, GFP_NOIO)) {
580 pr_debug("MCE %#lx: failed to release buffers\n", pfn); 581 pr_info("MCE %#lx: failed to release buffers\n", pfn);
581 } else { 582 } else {
582 ret = RECOVERED; 583 ret = RECOVERED;
583 } 584 }
@@ -629,7 +630,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
629 * when the page is reread or dropped. If an 630 * when the page is reread or dropped. If an
630 * application assumes it will always get error on 631 * application assumes it will always get error on
631 * fsync, but does other operations on the fd before 632 * fsync, but does other operations on the fd before
632 * and the page is dropped inbetween then the error 633 * and the page is dropped between then the error
633 * will not be properly reported. 634 * will not be properly reported.
634 * 635 *
635 * This can already happen even without hwpoisoned 636 * This can already happen even without hwpoisoned
@@ -693,11 +694,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
693 * Issues: 694 * Issues:
694 * - Error on hugepage is contained in hugepage unit (not in raw page unit.) 695 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
695 * To narrow down kill region to one page, we need to break up pmd. 696 * To narrow down kill region to one page, we need to break up pmd.
696 * - To support soft-offlining for hugepage, we need to support hugepage
697 * migration.
698 */ 697 */
699static int me_huge_page(struct page *p, unsigned long pfn) 698static int me_huge_page(struct page *p, unsigned long pfn)
700{ 699{
700 int res = 0;
701 struct page *hpage = compound_head(p); 701 struct page *hpage = compound_head(p);
702 /* 702 /*
703 * We can safely recover from error on free or reserved (i.e. 703 * We can safely recover from error on free or reserved (i.e.
@@ -710,8 +710,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
710 * so there is no race between isolation and mapping/unmapping. 710 * so there is no race between isolation and mapping/unmapping.
711 */ 711 */
712 if (!(page_mapping(hpage) || PageAnon(hpage))) { 712 if (!(page_mapping(hpage) || PageAnon(hpage))) {
713 __isolate_hwpoisoned_huge_page(hpage); 713 res = dequeue_hwpoisoned_huge_page(hpage);
714 return RECOVERED; 714 if (!res)
715 return RECOVERED;
715 } 716 }
716 return DELAYED; 717 return DELAYED;
717} 718}
@@ -723,7 +724,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
723 * The table matches them in order and calls the right handler. 724 * The table matches them in order and calls the right handler.
724 * 725 *
725 * This is quite tricky because we can access page at any time 726 * This is quite tricky because we can access page at any time
726 * in its live cycle, so all accesses have to be extremly careful. 727 * in its live cycle, so all accesses have to be extremely careful.
727 * 728 *
728 * This is not complete. More states could be added. 729 * This is not complete. More states could be added.
729 * For any missing state don't attempt recovery. 730 * For any missing state don't attempt recovery.
@@ -836,8 +837,6 @@ static int page_action(struct page_state *ps, struct page *p,
836 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; 837 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
837} 838}
838 839
839#define N_UNMAP_TRIES 5
840
841/* 840/*
842 * Do all that is necessary to remove user space mappings. Unmap 841 * Do all that is necessary to remove user space mappings. Unmap
843 * the pages and send SIGBUS to the processes if the data was dirty. 842 * the pages and send SIGBUS to the processes if the data was dirty.
@@ -849,9 +848,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
849 struct address_space *mapping; 848 struct address_space *mapping;
850 LIST_HEAD(tokill); 849 LIST_HEAD(tokill);
851 int ret; 850 int ret;
852 int i;
853 int kill = 1; 851 int kill = 1;
854 struct page *hpage = compound_head(p); 852 struct page *hpage = compound_head(p);
853 struct page *ppage;
855 854
856 if (PageReserved(p) || PageSlab(p)) 855 if (PageReserved(p) || PageSlab(p))
857 return SWAP_SUCCESS; 856 return SWAP_SUCCESS;
@@ -893,6 +892,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
893 } 892 }
894 893
895 /* 894 /*
 895 * ppage: the poisoned page we actually unmap.
 896 * If p is a regular (4k) page,
 897 * ppage == the real poisoned page;
 898 * otherwise p is hugetlb or THP, and ppage == the head page.
899 */
900 ppage = hpage;
901
902 if (PageTransHuge(hpage)) {
903 /*
 904 * Verify that this isn't a hugetlbfs head page; the check for
 905 * PageAnon is just to avoid tripping a split_huge_page
 906 * internal debug check, as split_huge_page refuses to deal with
 907 * anything that isn't an anon page. PageAnon can't go away from
 908 * under us because we hold a refcount on the hpage. Without a
 909 * refcount on the hpage, split_huge_page can't be safely called
 910 * in the first place, and having a refcount on a tail page isn't
 911 * enough to be safe.
912 */
913 if (!PageHuge(hpage) && PageAnon(hpage)) {
914 if (unlikely(split_huge_page(hpage))) {
915 /*
 916 * FIXME: if splitting the THP fails, it would be
 917 * better to stop the following operation rather
 918 * than cause a panic by unmapping. The system might
 919 * survive if the page is freed later.
920 */
921 printk(KERN_INFO
922 "MCE %#lx: failed to split THP\n", pfn);
923
924 BUG_ON(!PageHWPoison(p));
925 return SWAP_FAIL;
926 }
927 /* THP is split, so ppage should be the real poisoned page. */
928 ppage = p;
929 }
930 }
931
932 /*
896 * First collect all the processes that have the page 933 * First collect all the processes that have the page
897 * mapped in dirty form. This has to be done before try_to_unmap, 934 * mapped in dirty form. This has to be done before try_to_unmap,
898 * because ttu takes the rmap data structures down. 935 * because ttu takes the rmap data structures down.
@@ -901,22 +938,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
901 * there's nothing that can be done. 938 * there's nothing that can be done.
902 */ 939 */
903 if (kill) 940 if (kill)
904 collect_procs(hpage, &tokill); 941 collect_procs(ppage, &tokill);
905 942
906 /* 943 if (hpage != ppage)
907 * try_to_unmap can fail temporarily due to races. 944 lock_page(ppage);
908 * Try a few times (RED-PEN better strategy?)
909 */
910 for (i = 0; i < N_UNMAP_TRIES; i++) {
911 ret = try_to_unmap(hpage, ttu);
912 if (ret == SWAP_SUCCESS)
913 break;
914 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
915 }
916 945
946 ret = try_to_unmap(ppage, ttu);
917 if (ret != SWAP_SUCCESS) 947 if (ret != SWAP_SUCCESS)
918 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 948 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
919 pfn, page_mapcount(hpage)); 949 pfn, page_mapcount(ppage));
950
951 if (hpage != ppage)
952 unlock_page(ppage);
920 953
921 /* 954 /*
922 * Now that the dirty bit has been propagated to the 955 * Now that the dirty bit has been propagated to the
@@ -927,7 +960,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
927 * use a more force-full uncatchable kill to prevent 960 * use a more force-full uncatchable kill to prevent
928 * any accesses to the poisoned memory. 961 * any accesses to the poisoned memory.
929 */ 962 */
930 kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, 963 kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
931 ret != SWAP_SUCCESS, p, pfn); 964 ret != SWAP_SUCCESS, p, pfn);
932 965
933 return ret; 966 return ret;
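To restate the new transparent-hugepage handling above in one place: the page that is collected against and unmapped is normally the compound head, but an error inside an anonymous THP first splits the huge page so that only the affected 4k page stays poisoned. A sketch of that selection, with names borrowed from the hunk and error handling reduced to a NULL return:

#include <linux/mm.h>
#include <linux/hugetlb.h>

/* Sketch: choose the page hwpoison_user_mappings() should unmap. */
static struct page *pick_unmap_target(struct page *p)
{
	struct page *hpage = compound_head(p);

	if (PageTransHuge(hpage) && !PageHuge(hpage) && PageAnon(hpage)) {
		if (unlikely(split_huge_page(hpage)))
			return NULL;	/* split failed: caller gives up */
		return p;		/* split done: p is a plain 4k page */
	}
	return hpage;			/* 4k page or hugetlb head page */
}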
@@ -936,7 +969,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
936static void set_page_hwpoison_huge_page(struct page *hpage) 969static void set_page_hwpoison_huge_page(struct page *hpage)
937{ 970{
938 int i; 971 int i;
939 int nr_pages = 1 << compound_order(hpage); 972 int nr_pages = 1 << compound_trans_order(hpage);
940 for (i = 0; i < nr_pages; i++) 973 for (i = 0; i < nr_pages; i++)
941 SetPageHWPoison(hpage + i); 974 SetPageHWPoison(hpage + i);
942} 975}
@@ -944,7 +977,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
944static void clear_page_hwpoison_huge_page(struct page *hpage) 977static void clear_page_hwpoison_huge_page(struct page *hpage)
945{ 978{
946 int i; 979 int i;
947 int nr_pages = 1 << compound_order(hpage); 980 int nr_pages = 1 << compound_trans_order(hpage);
948 for (i = 0; i < nr_pages; i++) 981 for (i = 0; i < nr_pages; i++)
949 ClearPageHWPoison(hpage + i); 982 ClearPageHWPoison(hpage + i);
950} 983}
@@ -974,14 +1007,17 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
974 return 0; 1007 return 0;
975 } 1008 }
976 1009
977 nr_pages = 1 << compound_order(hpage); 1010 nr_pages = 1 << compound_trans_order(hpage);
978 atomic_long_add(nr_pages, &mce_bad_pages); 1011 atomic_long_add(nr_pages, &mce_bad_pages);
979 1012
980 /* 1013 /*
981 * We need/can do nothing about count=0 pages. 1014 * We need/can do nothing about count=0 pages.
982 * 1) it's a free page, and therefore in safe hand: 1015 * 1) it's a free page, and therefore in safe hand:
983 * prep_new_page() will be the gate keeper. 1016 * prep_new_page() will be the gate keeper.
984 * 2) it's part of a non-compound high order page. 1017 * 2) it's a free hugepage, which is also safe:
1018 * an affected hugepage will be dequeued from hugepage freelist,
1019 * so there's no concern about reusing it ever after.
1020 * 3) it's part of a non-compound high order page.
985 * Implies some kernel user: cannot stop them from 1021 * Implies some kernel user: cannot stop them from
986 * R/W the page; let's pray that the page has been 1022 * R/W the page; let's pray that the page has been
987 * used and will be freed some time later. 1023 * used and will be freed some time later.
@@ -993,6 +1029,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
993 if (is_free_buddy_page(p)) { 1029 if (is_free_buddy_page(p)) {
994 action_result(pfn, "free buddy", DELAYED); 1030 action_result(pfn, "free buddy", DELAYED);
995 return 0; 1031 return 0;
1032 } else if (PageHuge(hpage)) {
1033 /*
1034 * Check "just unpoisoned", "filter hit", and
1035 * "race with other subpage."
1036 */
1037 lock_page(hpage);
1038 if (!PageHWPoison(hpage)
1039 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1040 || (p != hpage && TestSetPageHWPoison(hpage))) {
1041 atomic_long_sub(nr_pages, &mce_bad_pages);
1042 return 0;
1043 }
1044 set_page_hwpoison_huge_page(hpage);
1045 res = dequeue_hwpoisoned_huge_page(hpage);
1046 action_result(pfn, "free huge",
1047 res ? IGNORED : DELAYED);
1048 unlock_page(hpage);
1049 return res;
996 } else { 1050 } else {
997 action_result(pfn, "high order kernel", IGNORED); 1051 action_result(pfn, "high order kernel", IGNORED);
998 return -EBUSY; 1052 return -EBUSY;
@@ -1007,19 +1061,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1007 * The check (unnecessarily) ignores LRU pages being isolated and 1061 * The check (unnecessarily) ignores LRU pages being isolated and
1008 * walked by the page reclaim code, however that's not a big loss. 1062 * walked by the page reclaim code, however that's not a big loss.
1009 */ 1063 */
1010 if (!PageLRU(p) && !PageHuge(p)) 1064 if (!PageHuge(p) && !PageTransCompound(p)) {
1011 shake_page(p, 0); 1065 if (!PageLRU(p))
1012 if (!PageLRU(p) && !PageHuge(p)) { 1066 shake_page(p, 0);
1013 /* 1067 if (!PageLRU(p)) {
1014 * shake_page could have turned it free. 1068 /*
1015 */ 1069 * shake_page could have turned it free.
1016 if (is_free_buddy_page(p)) { 1070 */
1017 action_result(pfn, "free buddy, 2nd try", DELAYED); 1071 if (is_free_buddy_page(p)) {
1018 return 0; 1072 action_result(pfn, "free buddy, 2nd try",
1073 DELAYED);
1074 return 0;
1075 }
1076 action_result(pfn, "non LRU", IGNORED);
1077 put_page(p);
1078 return -EBUSY;
1019 } 1079 }
1020 action_result(pfn, "non LRU", IGNORED);
1021 put_page(p);
1022 return -EBUSY;
1023 } 1080 }
1024 1081
1025 /* 1082 /*
@@ -1027,7 +1084,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1027 * It's very difficult to mess with pages currently under IO 1084 * It's very difficult to mess with pages currently under IO
1028 * and in many cases impossible, so we just avoid it here. 1085 * and in many cases impossible, so we just avoid it here.
1029 */ 1086 */
1030 lock_page_nosync(hpage); 1087 lock_page(hpage);
1031 1088
1032 /* 1089 /*
1033 * unpoison always clear PG_hwpoison inside page lock 1090 * unpoison always clear PG_hwpoison inside page lock
@@ -1049,7 +1106,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1049 * For error on the tail page, we should set PG_hwpoison 1106 * For error on the tail page, we should set PG_hwpoison
1050 * on the head page to show that the hugepage is hwpoisoned 1107 * on the head page to show that the hugepage is hwpoisoned
1051 */ 1108 */
1052 if (PageTail(p) && TestSetPageHWPoison(hpage)) { 1109 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1053 action_result(pfn, "hugepage already hardware poisoned", 1110 action_result(pfn, "hugepage already hardware poisoned",
1054 IGNORED); 1111 IGNORED);
1055 unlock_page(hpage); 1112 unlock_page(hpage);
@@ -1069,7 +1126,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1069 1126
1070 /* 1127 /*
1071 * Now take care of user space mappings. 1128 * Now take care of user space mappings.
1072 * Abort on fail: __remove_from_page_cache() assumes unmapped page. 1129 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1073 */ 1130 */
1074 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { 1131 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1075 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 1132 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
@@ -1147,20 +1204,30 @@ int unpoison_memory(unsigned long pfn)
1147 page = compound_head(p); 1204 page = compound_head(p);
1148 1205
1149 if (!PageHWPoison(p)) { 1206 if (!PageHWPoison(p)) {
1150 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); 1207 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1151 return 0; 1208 return 0;
1152 } 1209 }
1153 1210
1154 nr_pages = 1 << compound_order(page); 1211 nr_pages = 1 << compound_trans_order(page);
1155 1212
1156 if (!get_page_unless_zero(page)) { 1213 if (!get_page_unless_zero(page)) {
1214 /*
 1215 * Since a hwpoisoned hugepage should have a non-zero refcount,
 1216 * failing to get one here means memory failure is racing with
 1217 * unpoison. In that case unpoison fails and memory failure runs
 1218 * to the end.
1219 */
1220 if (PageHuge(page)) {
1221 pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1222 return 0;
1223 }
1157 if (TestClearPageHWPoison(p)) 1224 if (TestClearPageHWPoison(p))
1158 atomic_long_sub(nr_pages, &mce_bad_pages); 1225 atomic_long_sub(nr_pages, &mce_bad_pages);
1159 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1226 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1160 return 0; 1227 return 0;
1161 } 1228 }
1162 1229
1163 lock_page_nosync(page); 1230 lock_page(page);
1164 /* 1231 /*
1165 * This test is racy because PG_hwpoison is set outside of page lock. 1232 * This test is racy because PG_hwpoison is set outside of page lock.
1166 * That's acceptable because that won't trigger kernel panic. Instead, 1233 * That's acceptable because that won't trigger kernel panic. Instead,
@@ -1168,12 +1235,12 @@ int unpoison_memory(unsigned long pfn)
1168 * the free buddy page pool. 1235 * the free buddy page pool.
1169 */ 1236 */
1170 if (TestClearPageHWPoison(page)) { 1237 if (TestClearPageHWPoison(page)) {
1171 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1238 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1172 atomic_long_sub(nr_pages, &mce_bad_pages); 1239 atomic_long_sub(nr_pages, &mce_bad_pages);
1173 freeit = 1; 1240 freeit = 1;
1241 if (PageHuge(page))
1242 clear_page_hwpoison_huge_page(page);
1174 } 1243 }
1175 if (PageHuge(p))
1176 clear_page_hwpoison_huge_page(page);
1177 unlock_page(page); 1244 unlock_page(page);
1178 1245
1179 put_page(page); 1246 put_page(page);
@@ -1187,7 +1254,11 @@ EXPORT_SYMBOL(unpoison_memory);
1187static struct page *new_page(struct page *p, unsigned long private, int **x) 1254static struct page *new_page(struct page *p, unsigned long private, int **x)
1188{ 1255{
1189 int nid = page_to_nid(p); 1256 int nid = page_to_nid(p);
1190 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1257 if (PageHuge(p))
1258 return alloc_huge_page_node(page_hstate(compound_head(p)),
1259 nid);
1260 else
1261 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1191} 1262}
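new_page() is the allocation callback the migration core invokes for every page it moves; soft_offline_huge_page() below and the LRU path further down both pass it in. A minimal sketch of that contract for a single already-isolated base page (the helper is hypothetical and error handling is trimmed):

#include <linux/migrate.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>

/* Sketch: move one isolated LRU page away from its current page frame. */
static int migrate_one_sketch(struct page *page)
{
	LIST_HEAD(pagelist);
	int ret;

	list_add(&page->lru, &pagelist);
	/* new_page() above supplies the destination; sync=true as in the hunk */
	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true);
	if (ret)
		putback_lru_pages(&pagelist);	/* failed: return pages to the LRU */
	return ret;
}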
1192 1263
1193/* 1264/*
@@ -1204,25 +1275,31 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1204 return 1; 1275 return 1;
1205 1276
1206 /* 1277 /*
1207 * The lock_system_sleep prevents a race with memory hotplug, 1278 * The lock_memory_hotplug prevents a race with memory hotplug.
1208 * because the isolation assumes there's only a single user.
 1209 * This is a big hammer; something finer-grained would be nicer. 1279 * This is a big hammer; something finer-grained would be nicer.
1210 */ 1280 */
1211 lock_system_sleep(); 1281 lock_memory_hotplug();
1212 1282
1213 /* 1283 /*
1214 * Isolate the page, so that it doesn't get reallocated if it 1284 * Isolate the page, so that it doesn't get reallocated if it
1215 * was free. 1285 * was free.
1216 */ 1286 */
1217 set_migratetype_isolate(p); 1287 set_migratetype_isolate(p);
1288 /*
1289 * When the target page is a free hugepage, just remove it
1290 * from free hugepage list.
1291 */
1218 if (!get_page_unless_zero(compound_head(p))) { 1292 if (!get_page_unless_zero(compound_head(p))) {
1219 if (is_free_buddy_page(p)) { 1293 if (PageHuge(p)) {
1220 pr_debug("get_any_page: %#lx free buddy page\n", pfn); 1294 pr_info("get_any_page: %#lx free huge page\n", pfn);
1295 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1296 } else if (is_free_buddy_page(p)) {
1297 pr_info("get_any_page: %#lx free buddy page\n", pfn);
1221 /* Set hwpoison bit while page is still isolated */ 1298 /* Set hwpoison bit while page is still isolated */
1222 SetPageHWPoison(p); 1299 SetPageHWPoison(p);
1223 ret = 0; 1300 ret = 0;
1224 } else { 1301 } else {
1225 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1302 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1226 pfn, p->flags); 1303 pfn, p->flags);
1227 ret = -EIO; 1304 ret = -EIO;
1228 } 1305 }
@@ -1231,7 +1308,51 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1231 ret = 1; 1308 ret = 1;
1232 } 1309 }
1233 unset_migratetype_isolate(p); 1310 unset_migratetype_isolate(p);
1234 unlock_system_sleep(); 1311 unlock_memory_hotplug();
1312 return ret;
1313}
1314
1315static int soft_offline_huge_page(struct page *page, int flags)
1316{
1317 int ret;
1318 unsigned long pfn = page_to_pfn(page);
1319 struct page *hpage = compound_head(page);
1320 LIST_HEAD(pagelist);
1321
1322 ret = get_any_page(page, pfn, flags);
1323 if (ret < 0)
1324 return ret;
1325 if (ret == 0)
1326 goto done;
1327
1328 if (PageHWPoison(hpage)) {
1329 put_page(hpage);
1330 pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
1331 return -EBUSY;
1332 }
1333
1334 /* Keep page count to indicate a given hugepage is isolated. */
1335
1336 list_add(&hpage->lru, &pagelist);
1337 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1338 true);
1339 if (ret) {
1340 struct page *page1, *page2;
1341 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1342 put_page(page1);
1343
1344 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1345 pfn, ret, page->flags);
1346 if (ret > 0)
1347 ret = -EIO;
1348 return ret;
1349 }
1350done:
1351 if (!PageHWPoison(hpage))
1352 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
1353 set_page_hwpoison_huge_page(hpage);
1354 dequeue_hwpoisoned_huge_page(hpage);
1355 /* keep elevated page count for bad page */
1235 return ret; 1356 return ret;
1236} 1357}
1237 1358
@@ -1262,6 +1383,9 @@ int soft_offline_page(struct page *page, int flags)
1262 int ret; 1383 int ret;
1263 unsigned long pfn = page_to_pfn(page); 1384 unsigned long pfn = page_to_pfn(page);
1264 1385
1386 if (PageHuge(page))
1387 return soft_offline_huge_page(page, flags);
1388
1265 ret = get_any_page(page, pfn, flags); 1389 ret = get_any_page(page, pfn, flags);
1266 if (ret < 0) 1390 if (ret < 0)
1267 return ret; 1391 return ret;
@@ -1288,7 +1412,7 @@ int soft_offline_page(struct page *page, int flags)
1288 goto done; 1412 goto done;
1289 } 1413 }
1290 if (!PageLRU(page)) { 1414 if (!PageLRU(page)) {
1291 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", 1415 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1292 pfn, page->flags); 1416 pfn, page->flags);
1293 return -EIO; 1417 return -EIO;
1294 } 1418 }
@@ -1302,7 +1426,7 @@ int soft_offline_page(struct page *page, int flags)
1302 if (PageHWPoison(page)) { 1426 if (PageHWPoison(page)) {
1303 unlock_page(page); 1427 unlock_page(page);
1304 put_page(page); 1428 put_page(page);
1305 pr_debug("soft offline: %#lx page already poisoned\n", pfn); 1429 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1306 return -EBUSY; 1430 return -EBUSY;
1307 } 1431 }
1308 1432
@@ -1312,18 +1436,14 @@ int soft_offline_page(struct page *page, int flags)
1312 */ 1436 */
1313 ret = invalidate_inode_page(page); 1437 ret = invalidate_inode_page(page);
1314 unlock_page(page); 1438 unlock_page(page);
1315
1316 /* 1439 /*
1317 * Drop count because page migration doesn't like raised
1318 * counts. The page could get re-allocated, but if it becomes
1319 * LRU the isolation will just fail.
1320 * RED-PEN would be better to keep it isolated here, but we 1440 * RED-PEN would be better to keep it isolated here, but we
1321 * would need to fix isolation locking first. 1441 * would need to fix isolation locking first.
1322 */ 1442 */
1323 put_page(page);
1324 if (ret == 1) { 1443 if (ret == 1) {
1444 put_page(page);
1325 ret = 0; 1445 ret = 0;
1326 pr_debug("soft_offline: %#lx: invalidated\n", pfn); 1446 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1327 goto done; 1447 goto done;
1328 } 1448 }
1329 1449
@@ -1333,19 +1453,27 @@ int soft_offline_page(struct page *page, int flags)
1333 * handles a large number of cases for us. 1453 * handles a large number of cases for us.
1334 */ 1454 */
1335 ret = isolate_lru_page(page); 1455 ret = isolate_lru_page(page);
1456 /*
 1457 * Drop the page reference taken by get_any_page();
 1458 * a successful isolate_lru_page() already took another one.
1459 */
1460 put_page(page);
1336 if (!ret) { 1461 if (!ret) {
1337 LIST_HEAD(pagelist); 1462 LIST_HEAD(pagelist);
1338 1463 inc_zone_page_state(page, NR_ISOLATED_ANON +
1464 page_is_file_cache(page));
1339 list_add(&page->lru, &pagelist); 1465 list_add(&page->lru, &pagelist);
1340 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1466 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1467 0, true);
1341 if (ret) { 1468 if (ret) {
1342 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1469 putback_lru_pages(&pagelist);
1470 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1343 pfn, ret, page->flags); 1471 pfn, ret, page->flags);
1344 if (ret > 0) 1472 if (ret > 0)
1345 ret = -EIO; 1473 ret = -EIO;
1346 } 1474 }
1347 } else { 1475 } else {
1348 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1476 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1349 pfn, ret, page_count(page), page->flags); 1477 pfn, ret, page_count(page), page->flags);
1350 } 1478 }
1351 if (ret) 1479 if (ret)
@@ -1357,35 +1485,3 @@ done:
1357 /* keep elevated page count for bad page */ 1485 /* keep elevated page count for bad page */
1358 return ret; 1486 return ret;
1359} 1487}
1360
1361/*
1362 * The caller must hold current->mm->mmap_sem in read mode.
1363 */
1364int is_hwpoison_address(unsigned long addr)
1365{
1366 pgd_t *pgdp;
1367 pud_t pud, *pudp;
1368 pmd_t pmd, *pmdp;
1369 pte_t pte, *ptep;
1370 swp_entry_t entry;
1371
1372 pgdp = pgd_offset(current->mm, addr);
1373 if (!pgd_present(*pgdp))
1374 return 0;
1375 pudp = pud_offset(pgdp, addr);
1376 pud = *pudp;
1377 if (!pud_present(pud) || pud_large(pud))
1378 return 0;
1379 pmdp = pmd_offset(pudp, addr);
1380 pmd = *pmdp;
1381 if (!pmd_present(pmd) || pmd_large(pmd))
1382 return 0;
1383 ptep = pte_offset_map(pmdp, addr);
1384 pte = *ptep;
1385 pte_unmap(ptep);
1386 if (!is_swap_pte(pte))
1387 return 0;
1388 entry = pte_to_swp_entry(pte);
1389 return is_hwpoison_entry(entry);
1390}
1391EXPORT_SYMBOL_GPL(is_hwpoison_address);
diff --git a/mm/memory.c b/mm/memory.c
index 0e18b4d649ec..9b8a01d941cb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -182,7 +182,7 @@ void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{ 182{
183 __sync_task_rss_stat(task, mm); 183 __sync_task_rss_stat(task, mm);
184} 184}
185#else 185#else /* SPLIT_RSS_COUNTING */
186 186
187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) 187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) 188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
@@ -191,8 +191,206 @@ static void check_sync_rss_stat(struct task_struct *task)
191{ 191{
192} 192}
193 193
194#endif /* SPLIT_RSS_COUNTING */
195
196#ifdef HAVE_GENERIC_MMU_GATHER
197
198static int tlb_next_batch(struct mmu_gather *tlb)
199{
200 struct mmu_gather_batch *batch;
201
202 batch = tlb->active;
203 if (batch->next) {
204 tlb->active = batch->next;
205 return 1;
206 }
207
208 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
209 if (!batch)
210 return 0;
211
212 batch->next = NULL;
213 batch->nr = 0;
214 batch->max = MAX_GATHER_BATCH;
215
216 tlb->active->next = batch;
217 tlb->active = batch;
218
219 return 1;
220}
221
222/* tlb_gather_mmu
223 * Called to initialize an (on-stack) mmu_gather structure for page-table
224 * tear-down from @mm. The @fullmm argument is used when @mm is without
225 * users and we're going to destroy the full address space (exit/execve).
226 */
227void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
228{
229 tlb->mm = mm;
230
231 tlb->fullmm = fullmm;
232 tlb->need_flush = 0;
233 tlb->fast_mode = (num_possible_cpus() == 1);
234 tlb->local.next = NULL;
235 tlb->local.nr = 0;
236 tlb->local.max = ARRAY_SIZE(tlb->__pages);
237 tlb->active = &tlb->local;
238
239#ifdef CONFIG_HAVE_RCU_TABLE_FREE
240 tlb->batch = NULL;
241#endif
242}
243
244void tlb_flush_mmu(struct mmu_gather *tlb)
245{
246 struct mmu_gather_batch *batch;
247
248 if (!tlb->need_flush)
249 return;
250 tlb->need_flush = 0;
251 tlb_flush(tlb);
252#ifdef CONFIG_HAVE_RCU_TABLE_FREE
253 tlb_table_flush(tlb);
194#endif 254#endif
195 255
256 if (tlb_fast_mode(tlb))
257 return;
258
259 for (batch = &tlb->local; batch; batch = batch->next) {
260 free_pages_and_swap_cache(batch->pages, batch->nr);
261 batch->nr = 0;
262 }
263 tlb->active = &tlb->local;
264}
265
266/* tlb_finish_mmu
267 * Called at the end of the shootdown operation to free up any resources
268 * that were required.
269 */
270void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
271{
272 struct mmu_gather_batch *batch, *next;
273
274 tlb_flush_mmu(tlb);
275
276 /* keep the page table cache within bounds */
277 check_pgt_cache();
278
279 for (batch = tlb->local.next; batch; batch = next) {
280 next = batch->next;
281 free_pages((unsigned long)batch, 0);
282 }
283 tlb->local.next = NULL;
284}
285
286/* __tlb_remove_page
287 * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
288 * handling the additional races in SMP caused by other CPUs caching valid
289 * mappings in their TLBs. Returns the number of free page slots left.
290 * When out of page slots we must call tlb_flush_mmu().
291 */
292int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
293{
294 struct mmu_gather_batch *batch;
295
296 tlb->need_flush = 1;
297
298 if (tlb_fast_mode(tlb)) {
299 free_page_and_swap_cache(page);
300 return 1; /* avoid calling tlb_flush_mmu() */
301 }
302
303 batch = tlb->active;
304 batch->pages[batch->nr++] = page;
305 if (batch->nr == batch->max) {
306 if (!tlb_next_batch(tlb))
307 return 0;
308 batch = tlb->active;
309 }
310 VM_BUG_ON(batch->nr > batch->max);
311
312 return batch->max - batch->nr;
313}
314
315#endif /* HAVE_GENERIC_MMU_GATHER */
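Putting the pieces above together, the generic gather is used in three steps: initialize an on-stack mmu_gather, feed it each page whose PTE has been cleared, and finish, which performs the final TLB flush and frees the batched pages. A minimal caller sketch under that assumption (the helper and its page array are hypothetical; real callers walk page tables as zap_pte_range() does below):

#include <linux/mm.h>
#include <asm/tlb.h>

/* Sketch: release a set of pages whose PTEs the caller already cleared. */
static void teardown_sketch(struct mm_struct *mm, struct page **pages, int nr,
			    unsigned long start, unsigned long end)
{
	struct mmu_gather tlb;
	int i;

	tlb_gather_mmu(&tlb, mm, 0);		/* 0: not a full-mm teardown */
	for (i = 0; i < nr; i++) {
		/* returns 0 once the batch is full: flush before continuing */
		if (!__tlb_remove_page(&tlb, pages[i]))
			tlb_flush_mmu(&tlb);
	}
	tlb_finish_mmu(&tlb, start, end);	/* final flush + frees the batches */
}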
316
317#ifdef CONFIG_HAVE_RCU_TABLE_FREE
318
319/*
320 * See the comment near struct mmu_table_batch.
321 */
322
323static void tlb_remove_table_smp_sync(void *arg)
324{
325 /* Simply deliver the interrupt */
326}
327
328static void tlb_remove_table_one(void *table)
329{
330 /*
331 * This isn't an RCU grace period and hence the page-tables cannot be
332 * assumed to be actually RCU-freed.
333 *
334 * It is however sufficient for software page-table walkers that rely on
335 * IRQ disabling. See the comment near struct mmu_table_batch.
336 */
337 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
338 __tlb_remove_table(table);
339}
340
341static void tlb_remove_table_rcu(struct rcu_head *head)
342{
343 struct mmu_table_batch *batch;
344 int i;
345
346 batch = container_of(head, struct mmu_table_batch, rcu);
347
348 for (i = 0; i < batch->nr; i++)
349 __tlb_remove_table(batch->tables[i]);
350
351 free_page((unsigned long)batch);
352}
353
354void tlb_table_flush(struct mmu_gather *tlb)
355{
356 struct mmu_table_batch **batch = &tlb->batch;
357
358 if (*batch) {
359 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
360 *batch = NULL;
361 }
362}
363
364void tlb_remove_table(struct mmu_gather *tlb, void *table)
365{
366 struct mmu_table_batch **batch = &tlb->batch;
367
368 tlb->need_flush = 1;
369
370 /*
 371 * When there are fewer than two users of this mm, there cannot be a
 372 * concurrent page-table walk.
373 */
374 if (atomic_read(&tlb->mm->mm_users) < 2) {
375 __tlb_remove_table(table);
376 return;
377 }
378
379 if (*batch == NULL) {
380 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
381 if (*batch == NULL) {
382 tlb_remove_table_one(table);
383 return;
384 }
385 (*batch)->nr = 0;
386 }
387 (*batch)->tables[(*batch)->nr++] = table;
388 if ((*batch)->nr == MAX_TABLE_BATCH)
389 tlb_table_flush(tlb);
390}
391
392#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
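The comments above lean on one invariant: a software page-table walker running with interrupts disabled cannot see a table freed under it, because tlb_remove_table() either waits for an RCU-sched grace period or, in the fallback path, IPIs every CPU before freeing. A sketch of such a walker, loosely modeled on the lockless gup_fast() style; it assumes the mm is live and the page-table helpers behave as on x86-like configurations:

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Sketch: peek at a PTE without mmap_sem or the page table lock. */
static int pte_present_lockless(struct mm_struct *mm, unsigned long addr)
{
	unsigned long flags;
	int present = 0;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	local_irq_save(flags);	/* blocks the IPI / marks an RCU-sched read side */
	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;
	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;
	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
		goto out;		/* huge pmd: there is no pte table to walk */
	pte = pte_offset_map(pmd, addr);
	present = pte_present(*pte);
	pte_unmap(pte);
out:
	local_irq_restore(flags);
	return present;
}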
393
196/* 394/*
197 * If a p?d_bad entry is found while walking page tables, report 395 * If a p?d_bad entry is found while walking page tables, report
198 * the error, before resetting entry to p?d_none. Usually (but 396 * the error, before resetting entry to p?d_none. Usually (but
@@ -394,9 +592,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
394 } 592 }
395} 593}
396 594
397int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 595int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
596 pmd_t *pmd, unsigned long address)
398{ 597{
399 pgtable_t new = pte_alloc_one(mm, address); 598 pgtable_t new = pte_alloc_one(mm, address);
599 int wait_split_huge_page;
400 if (!new) 600 if (!new)
401 return -ENOMEM; 601 return -ENOMEM;
402 602
@@ -416,14 +616,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
416 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 616 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
417 617
418 spin_lock(&mm->page_table_lock); 618 spin_lock(&mm->page_table_lock);
419 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 619 wait_split_huge_page = 0;
620 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
420 mm->nr_ptes++; 621 mm->nr_ptes++;
421 pmd_populate(mm, pmd, new); 622 pmd_populate(mm, pmd, new);
422 new = NULL; 623 new = NULL;
423 } 624 } else if (unlikely(pmd_trans_splitting(*pmd)))
625 wait_split_huge_page = 1;
424 spin_unlock(&mm->page_table_lock); 626 spin_unlock(&mm->page_table_lock);
425 if (new) 627 if (new)
426 pte_free(mm, new); 628 pte_free(mm, new);
629 if (wait_split_huge_page)
630 wait_split_huge_page(vma->anon_vma, pmd);
427 return 0; 631 return 0;
428} 632}
429 633
@@ -436,10 +640,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
436 smp_wmb(); /* See comment in __pte_alloc */ 640 smp_wmb(); /* See comment in __pte_alloc */
437 641
438 spin_lock(&init_mm.page_table_lock); 642 spin_lock(&init_mm.page_table_lock);
439 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 643 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
440 pmd_populate_kernel(&init_mm, pmd, new); 644 pmd_populate_kernel(&init_mm, pmd, new);
441 new = NULL; 645 new = NULL;
442 } 646 } else
647 VM_BUG_ON(pmd_trans_splitting(*pmd));
443 spin_unlock(&init_mm.page_table_lock); 648 spin_unlock(&init_mm.page_table_lock);
444 if (new) 649 if (new)
445 pte_free_kernel(&init_mm, new); 650 pte_free_kernel(&init_mm, new);
@@ -526,7 +731,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
526 add_taint(TAINT_BAD_PAGE); 731 add_taint(TAINT_BAD_PAGE);
527} 732}
528 733
529static inline int is_cow_mapping(unsigned int flags) 734static inline int is_cow_mapping(vm_flags_t flags)
530{ 735{
531 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 736 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
532} 737}
@@ -719,9 +924,9 @@ out_set_pte:
719 return 0; 924 return 0;
720} 925}
721 926
722static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 927int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
723 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 928 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
724 unsigned long addr, unsigned long end) 929 unsigned long addr, unsigned long end)
725{ 930{
726 pte_t *orig_src_pte, *orig_dst_pte; 931 pte_t *orig_src_pte, *orig_dst_pte;
727 pte_t *src_pte, *dst_pte; 932 pte_t *src_pte, *dst_pte;
@@ -736,7 +941,7 @@ again:
736 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 941 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
737 if (!dst_pte) 942 if (!dst_pte)
738 return -ENOMEM; 943 return -ENOMEM;
739 src_pte = pte_offset_map_nested(src_pmd, addr); 944 src_pte = pte_offset_map(src_pmd, addr);
740 src_ptl = pte_lockptr(src_mm, src_pmd); 945 src_ptl = pte_lockptr(src_mm, src_pmd);
741 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 946 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
742 orig_src_pte = src_pte; 947 orig_src_pte = src_pte;
@@ -767,7 +972,7 @@ again:
767 972
768 arch_leave_lazy_mmu_mode(); 973 arch_leave_lazy_mmu_mode();
769 spin_unlock(src_ptl); 974 spin_unlock(src_ptl);
770 pte_unmap_nested(orig_src_pte); 975 pte_unmap(orig_src_pte);
771 add_mm_rss_vec(dst_mm, rss); 976 add_mm_rss_vec(dst_mm, rss);
772 pte_unmap_unlock(orig_dst_pte, dst_ptl); 977 pte_unmap_unlock(orig_dst_pte, dst_ptl);
773 cond_resched(); 978 cond_resched();
@@ -795,6 +1000,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
795 src_pmd = pmd_offset(src_pud, addr); 1000 src_pmd = pmd_offset(src_pud, addr);
796 do { 1001 do {
797 next = pmd_addr_end(addr, end); 1002 next = pmd_addr_end(addr, end);
1003 if (pmd_trans_huge(*src_pmd)) {
1004 int err;
1005 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
1006 err = copy_huge_pmd(dst_mm, src_mm,
1007 dst_pmd, src_pmd, addr, vma);
1008 if (err == -ENOMEM)
1009 return -ENOMEM;
1010 if (!err)
1011 continue;
1012 /* fall through */
1013 }
798 if (pmd_none_or_clear_bad(src_pmd)) 1014 if (pmd_none_or_clear_bad(src_pmd))
799 continue; 1015 continue;
800 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 1016 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -891,26 +1107,26 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
891static unsigned long zap_pte_range(struct mmu_gather *tlb, 1107static unsigned long zap_pte_range(struct mmu_gather *tlb,
892 struct vm_area_struct *vma, pmd_t *pmd, 1108 struct vm_area_struct *vma, pmd_t *pmd,
893 unsigned long addr, unsigned long end, 1109 unsigned long addr, unsigned long end,
894 long *zap_work, struct zap_details *details) 1110 struct zap_details *details)
895{ 1111{
896 struct mm_struct *mm = tlb->mm; 1112 struct mm_struct *mm = tlb->mm;
897 pte_t *pte; 1113 int force_flush = 0;
898 spinlock_t *ptl;
899 int rss[NR_MM_COUNTERS]; 1114 int rss[NR_MM_COUNTERS];
1115 spinlock_t *ptl;
1116 pte_t *start_pte;
1117 pte_t *pte;
900 1118
1119again:
901 init_rss_vec(rss); 1120 init_rss_vec(rss);
902 1121 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
903 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 1122 pte = start_pte;
904 arch_enter_lazy_mmu_mode(); 1123 arch_enter_lazy_mmu_mode();
905 do { 1124 do {
906 pte_t ptent = *pte; 1125 pte_t ptent = *pte;
907 if (pte_none(ptent)) { 1126 if (pte_none(ptent)) {
908 (*zap_work)--;
909 continue; 1127 continue;
910 } 1128 }
911 1129
912 (*zap_work) -= PAGE_SIZE;
913
914 if (pte_present(ptent)) { 1130 if (pte_present(ptent)) {
915 struct page *page; 1131 struct page *page;
916 1132
@@ -956,7 +1172,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
956 page_remove_rmap(page); 1172 page_remove_rmap(page);
957 if (unlikely(page_mapcount(page) < 0)) 1173 if (unlikely(page_mapcount(page) < 0))
958 print_bad_pte(vma, addr, ptent, page); 1174 print_bad_pte(vma, addr, ptent, page);
959 tlb_remove_page(tlb, page); 1175 force_flush = !__tlb_remove_page(tlb, page);
1176 if (force_flush)
1177 break;
960 continue; 1178 continue;
961 } 1179 }
962 /* 1180 /*
@@ -977,11 +1195,23 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
977 print_bad_pte(vma, addr, ptent, NULL); 1195 print_bad_pte(vma, addr, ptent, NULL);
978 } 1196 }
979 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 1197 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
980 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 1198 } while (pte++, addr += PAGE_SIZE, addr != end);
981 1199
982 add_mm_rss_vec(mm, rss); 1200 add_mm_rss_vec(mm, rss);
983 arch_leave_lazy_mmu_mode(); 1201 arch_leave_lazy_mmu_mode();
984 pte_unmap_unlock(pte - 1, ptl); 1202 pte_unmap_unlock(start_pte, ptl);
1203
1204 /*
 1205 * mmu_gather ran out of room to batch pages; we break out of
 1206 * the PTE lock to avoid doing the potentially expensive TLB invalidate
 1207 * and page-free while holding it.
1208 */
1209 if (force_flush) {
1210 force_flush = 0;
1211 tlb_flush_mmu(tlb);
1212 if (addr != end)
1213 goto again;
1214 }
985 1215
986 return addr; 1216 return addr;
987} 1217}
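The force_flush handling above is the other half of the __tlb_remove_page() contract: a zero return means the gather is full, so the loop stops, drops the PTE lock, and only then pays for the TLB invalidate and page freeing. Reduced to a skeleton; page_for() is a hypothetical stand-in for "the page this PTE mapped":

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <asm/tlb.h>

/* Sketch: batch under the PTE lock, do the expensive flush after dropping it. */
static void zap_batching_sketch(struct mmu_gather *tlb, spinlock_t *ptl,
				unsigned long addr, unsigned long end)
{
	int force_flush;

again:
	force_flush = 0;
	spin_lock(ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		struct page *page = page_for(addr);	/* hypothetical helper */

		if (!__tlb_remove_page(tlb, page)) {
			force_flush = 1;		/* batch full, page queued */
			addr += PAGE_SIZE;
			break;
		}
	}
	spin_unlock(ptl);

	if (force_flush) {
		tlb_flush_mmu(tlb);	/* the expensive part, done unlocked */
		if (addr != end)
			goto again;	/* resume where the batch filled up */
	}
}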
@@ -989,7 +1219,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
989static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, 1219static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
990 struct vm_area_struct *vma, pud_t *pud, 1220 struct vm_area_struct *vma, pud_t *pud,
991 unsigned long addr, unsigned long end, 1221 unsigned long addr, unsigned long end,
992 long *zap_work, struct zap_details *details) 1222 struct zap_details *details)
993{ 1223{
994 pmd_t *pmd; 1224 pmd_t *pmd;
995 unsigned long next; 1225 unsigned long next;
@@ -997,13 +1227,19 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
997 pmd = pmd_offset(pud, addr); 1227 pmd = pmd_offset(pud, addr);
998 do { 1228 do {
999 next = pmd_addr_end(addr, end); 1229 next = pmd_addr_end(addr, end);
1000 if (pmd_none_or_clear_bad(pmd)) { 1230 if (pmd_trans_huge(*pmd)) {
1001 (*zap_work)--; 1231 if (next-addr != HPAGE_PMD_SIZE) {
1002 continue; 1232 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1233 split_huge_page_pmd(vma->vm_mm, pmd);
1234 } else if (zap_huge_pmd(tlb, vma, pmd))
1235 continue;
1236 /* fall through */
1003 } 1237 }
1004 next = zap_pte_range(tlb, vma, pmd, addr, next, 1238 if (pmd_none_or_clear_bad(pmd))
1005 zap_work, details); 1239 continue;
1006 } while (pmd++, addr = next, (addr != end && *zap_work > 0)); 1240 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1241 cond_resched();
1242 } while (pmd++, addr = next, addr != end);
1007 1243
1008 return addr; 1244 return addr;
1009} 1245}
@@ -1011,7 +1247,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1011static inline unsigned long zap_pud_range(struct mmu_gather *tlb, 1247static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1012 struct vm_area_struct *vma, pgd_t *pgd, 1248 struct vm_area_struct *vma, pgd_t *pgd,
1013 unsigned long addr, unsigned long end, 1249 unsigned long addr, unsigned long end,
1014 long *zap_work, struct zap_details *details) 1250 struct zap_details *details)
1015{ 1251{
1016 pud_t *pud; 1252 pud_t *pud;
1017 unsigned long next; 1253 unsigned long next;
@@ -1019,13 +1255,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1019 pud = pud_offset(pgd, addr); 1255 pud = pud_offset(pgd, addr);
1020 do { 1256 do {
1021 next = pud_addr_end(addr, end); 1257 next = pud_addr_end(addr, end);
1022 if (pud_none_or_clear_bad(pud)) { 1258 if (pud_none_or_clear_bad(pud))
1023 (*zap_work)--;
1024 continue; 1259 continue;
1025 } 1260 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1026 next = zap_pmd_range(tlb, vma, pud, addr, next, 1261 } while (pud++, addr = next, addr != end);
1027 zap_work, details);
1028 } while (pud++, addr = next, (addr != end && *zap_work > 0));
1029 1262
1030 return addr; 1263 return addr;
1031} 1264}
@@ -1033,7 +1266,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1033static unsigned long unmap_page_range(struct mmu_gather *tlb, 1266static unsigned long unmap_page_range(struct mmu_gather *tlb,
1034 struct vm_area_struct *vma, 1267 struct vm_area_struct *vma,
1035 unsigned long addr, unsigned long end, 1268 unsigned long addr, unsigned long end,
1036 long *zap_work, struct zap_details *details) 1269 struct zap_details *details)
1037{ 1270{
1038 pgd_t *pgd; 1271 pgd_t *pgd;
1039 unsigned long next; 1272 unsigned long next;
@@ -1047,13 +1280,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1047 pgd = pgd_offset(vma->vm_mm, addr); 1280 pgd = pgd_offset(vma->vm_mm, addr);
1048 do { 1281 do {
1049 next = pgd_addr_end(addr, end); 1282 next = pgd_addr_end(addr, end);
1050 if (pgd_none_or_clear_bad(pgd)) { 1283 if (pgd_none_or_clear_bad(pgd))
1051 (*zap_work)--;
1052 continue; 1284 continue;
1053 } 1285 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1054 next = zap_pud_range(tlb, vma, pgd, addr, next, 1286 } while (pgd++, addr = next, addr != end);
1055 zap_work, details);
1056 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
1057 tlb_end_vma(tlb, vma); 1287 tlb_end_vma(tlb, vma);
1058 mem_cgroup_uncharge_end(); 1288 mem_cgroup_uncharge_end();
1059 1289
@@ -1069,7 +1299,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1069 1299
1070/** 1300/**
1071 * unmap_vmas - unmap a range of memory covered by a list of vma's 1301 * unmap_vmas - unmap a range of memory covered by a list of vma's
1072 * @tlbp: address of the caller's struct mmu_gather 1302 * @tlb: address of the caller's struct mmu_gather
1073 * @vma: the starting vma 1303 * @vma: the starting vma
1074 * @start_addr: virtual address at which to start unmapping 1304 * @start_addr: virtual address at which to start unmapping
1075 * @end_addr: virtual address at which to end unmapping 1305 * @end_addr: virtual address at which to end unmapping
@@ -1093,17 +1323,12 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1093 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1323 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1094 * drops the lock and schedules. 1324 * drops the lock and schedules.
1095 */ 1325 */
1096unsigned long unmap_vmas(struct mmu_gather **tlbp, 1326unsigned long unmap_vmas(struct mmu_gather *tlb,
1097 struct vm_area_struct *vma, unsigned long start_addr, 1327 struct vm_area_struct *vma, unsigned long start_addr,
1098 unsigned long end_addr, unsigned long *nr_accounted, 1328 unsigned long end_addr, unsigned long *nr_accounted,
1099 struct zap_details *details) 1329 struct zap_details *details)
1100{ 1330{
1101 long zap_work = ZAP_BLOCK_SIZE;
1102 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
1103 int tlb_start_valid = 0;
1104 unsigned long start = start_addr; 1331 unsigned long start = start_addr;
1105 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1106 int fullmm = (*tlbp)->fullmm;
1107 struct mm_struct *mm = vma->vm_mm; 1332 struct mm_struct *mm = vma->vm_mm;
1108 1333
1109 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1334 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1124,11 +1349,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1124 untrack_pfn_vma(vma, 0, 0); 1349 untrack_pfn_vma(vma, 0, 0);
1125 1350
1126 while (start != end) { 1351 while (start != end) {
1127 if (!tlb_start_valid) {
1128 tlb_start = start;
1129 tlb_start_valid = 1;
1130 }
1131
1132 if (unlikely(is_vm_hugetlb_page(vma))) { 1352 if (unlikely(is_vm_hugetlb_page(vma))) {
1133 /* 1353 /*
1134 * It is undesirable to test vma->vm_file as it 1354 * It is undesirable to test vma->vm_file as it
@@ -1141,39 +1361,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1141 * Since no pte has actually been setup, it is 1361 * Since no pte has actually been setup, it is
1142 * safe to do nothing in this case. 1362 * safe to do nothing in this case.
1143 */ 1363 */
1144 if (vma->vm_file) { 1364 if (vma->vm_file)
1145 unmap_hugepage_range(vma, start, end, NULL); 1365 unmap_hugepage_range(vma, start, end, NULL);
1146 zap_work -= (end - start) /
1147 pages_per_huge_page(hstate_vma(vma));
1148 }
1149 1366
1150 start = end; 1367 start = end;
1151 } else 1368 } else
1152 start = unmap_page_range(*tlbp, vma, 1369 start = unmap_page_range(tlb, vma, start, end, details);
1153 start, end, &zap_work, details);
1154
1155 if (zap_work > 0) {
1156 BUG_ON(start != end);
1157 break;
1158 }
1159
1160 tlb_finish_mmu(*tlbp, tlb_start, start);
1161
1162 if (need_resched() ||
1163 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1164 if (i_mmap_lock) {
1165 *tlbp = NULL;
1166 goto out;
1167 }
1168 cond_resched();
1169 }
1170
1171 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
1172 tlb_start_valid = 0;
1173 zap_work = ZAP_BLOCK_SIZE;
1174 } 1370 }
1175 } 1371 }
1176out: 1372
1177 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1373 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1178 return start; /* which is now the end (or restart) address */ 1374 return start; /* which is now the end (or restart) address */
1179} 1375}
@@ -1189,16 +1385,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1189 unsigned long size, struct zap_details *details) 1385 unsigned long size, struct zap_details *details)
1190{ 1386{
1191 struct mm_struct *mm = vma->vm_mm; 1387 struct mm_struct *mm = vma->vm_mm;
1192 struct mmu_gather *tlb; 1388 struct mmu_gather tlb;
1193 unsigned long end = address + size; 1389 unsigned long end = address + size;
1194 unsigned long nr_accounted = 0; 1390 unsigned long nr_accounted = 0;
1195 1391
1196 lru_add_drain(); 1392 lru_add_drain();
1197 tlb = tlb_gather_mmu(mm, 0); 1393 tlb_gather_mmu(&tlb, mm, 0);
1198 update_hiwater_rss(mm); 1394 update_hiwater_rss(mm);
1199 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1395 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1200 if (tlb) 1396 tlb_finish_mmu(&tlb, address, end);
1201 tlb_finish_mmu(tlb, address, end);
1202 return end; 1397 return end;
1203} 1398}
1204 1399
@@ -1262,7 +1457,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1262 pud = pud_offset(pgd, address); 1457 pud = pud_offset(pgd, address);
1263 if (pud_none(*pud)) 1458 if (pud_none(*pud))
1264 goto no_page_table; 1459 goto no_page_table;
1265 if (pud_huge(*pud)) { 1460 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1266 BUG_ON(flags & FOLL_GET); 1461 BUG_ON(flags & FOLL_GET);
1267 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1462 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1268 goto out; 1463 goto out;
@@ -1273,11 +1468,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1273 pmd = pmd_offset(pud, address); 1468 pmd = pmd_offset(pud, address);
1274 if (pmd_none(*pmd)) 1469 if (pmd_none(*pmd))
1275 goto no_page_table; 1470 goto no_page_table;
1276 if (pmd_huge(*pmd)) { 1471 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1277 BUG_ON(flags & FOLL_GET); 1472 BUG_ON(flags & FOLL_GET);
1278 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1473 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1279 goto out; 1474 goto out;
1280 } 1475 }
1476 if (pmd_trans_huge(*pmd)) {
1477 if (flags & FOLL_SPLIT) {
1478 split_huge_page_pmd(mm, pmd);
1479 goto split_fallthrough;
1480 }
1481 spin_lock(&mm->page_table_lock);
1482 if (likely(pmd_trans_huge(*pmd))) {
1483 if (unlikely(pmd_trans_splitting(*pmd))) {
1484 spin_unlock(&mm->page_table_lock);
1485 wait_split_huge_page(vma->anon_vma, pmd);
1486 } else {
1487 page = follow_trans_huge_pmd(mm, address,
1488 pmd, flags);
1489 spin_unlock(&mm->page_table_lock);
1490 goto out;
1491 }
1492 } else
1493 spin_unlock(&mm->page_table_lock);
1494 /* fall through */
1495 }
1496split_fallthrough:
1281 if (unlikely(pmd_bad(*pmd))) 1497 if (unlikely(pmd_bad(*pmd)))
1282 goto no_page_table; 1498 goto no_page_table;
1283 1499
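With the branch above, follow_page() has three outcomes for a huge pmd: split it when the caller passed FOLL_SPLIT, wait for a concurrent split to finish, or return the page via follow_trans_huge_pmd(). A caller-side sketch for code that only wants base pages; the wrapper is hypothetical, and FOLL_SPLIT, FOLL_GET and FOLL_WRITE are the flags consulted in the hunk:

#include <linux/mm.h>

/* Sketch: look up a 4k page, forcing any THP covering it to be split.
 * Caller must hold mm->mmap_sem. */
static struct page *follow_small_page(struct vm_area_struct *vma,
				      unsigned long addr, int write)
{
	unsigned int flags = FOLL_GET | FOLL_SPLIT;

	if (write)
		flags |= FOLL_WRITE;
	/* returns a referenced 4k page, or NULL if nothing is mapped there */
	return follow_page(vma, addr, flags);
}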
@@ -1310,6 +1526,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1310 */ 1526 */
1311 mark_page_accessed(page); 1527 mark_page_accessed(page);
1312 } 1528 }
1529 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1530 /*
1531 * The preliminary mapping check is mainly to avoid the
1532 * pointless overhead of lock_page on the ZERO_PAGE
1533 * which might bounce very badly if there is contention.
1534 *
1535 * If the page is already locked, we don't need to
1536 * handle it now - vmscan will handle it later if and
1537 * when it attempts to reclaim the page.
1538 */
1539 if (page->mapping && trylock_page(page)) {
1540 lru_add_drain(); /* push cached pages to LRU */
1541 /*
1542 * Because we lock page here and migration is
1543 * blocked by the pte's page reference, we need
1544 * only check for file-cache page truncation.
1545 */
1546 if (page->mapping)
1547 mlock_vma_page(page);
1548 unlock_page(page);
1549 }
1550 }
1313unlock: 1551unlock:
1314 pte_unmap_unlock(ptep, ptl); 1552 pte_unmap_unlock(ptep, ptl);
1315out: 1553out:
@@ -1339,9 +1577,65 @@ no_page_table:
1339 return page; 1577 return page;
1340} 1578}
1341 1579
1580static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1581{
1582 return stack_guard_page_start(vma, addr) ||
1583 stack_guard_page_end(vma, addr+PAGE_SIZE);
1584}
1585
1586/**
1587 * __get_user_pages() - pin user pages in memory
1588 * @tsk: task_struct of target task
1589 * @mm: mm_struct of target mm
1590 * @start: starting user address
1591 * @nr_pages: number of pages from start to pin
1592 * @gup_flags: flags modifying pin behaviour
1593 * @pages: array that receives pointers to the pages pinned.
1594 * Should be at least nr_pages long. Or NULL, if caller
1595 * only intends to ensure the pages are faulted in.
1596 * @vmas: array of pointers to vmas corresponding to each page.
1597 * Or NULL if the caller does not require them.
1598 * @nonblocking: whether waiting for disk IO or mmap_sem contention
1599 *
1600 * Returns number of pages pinned. This may be fewer than the number
1601 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1602 * were pinned, returns -errno. Each page returned must be released
1603 * with a put_page() call when it is finished with. vmas will only
1604 * remain valid while mmap_sem is held.
1605 *
1606 * Must be called with mmap_sem held for read or write.
1607 *
1608 * __get_user_pages walks a process's page tables and takes a reference to
1609 * each struct page that each user address corresponds to at a given
1610 * instant. That is, it takes the page that would be accessed if a user
1611 * thread accesses the given user virtual address at that instant.
1612 *
1613 * This does not guarantee that the page exists in the user mappings when
1614 * __get_user_pages returns, and there may even be a completely different
 1615 * page there in some cases (e.g. if mmapped pagecache has been
 1616 * invalidated and subsequently re-faulted). However, it does guarantee that the page
 1617 * won't be freed completely. Most callers simply care that the page
1618 * contains data that was valid *at some point in time*. Typically, an IO
1619 * or similar operation cannot guarantee anything stronger anyway because
1620 * locks can't be held over the syscall boundary.
1621 *
1622 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
1623 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
1624 * appropriate) must be called after the page is finished with, and
1625 * before put_page is called.
1626 *
1627 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
1628 * or mmap_sem contention, and if waiting is needed to pin all pages,
1629 * *@nonblocking will be set to 0.
1630 *
1631 * In most cases, get_user_pages or get_user_pages_fast should be used
1632 * instead of __get_user_pages. __get_user_pages should be used only if
1633 * you need some special @gup_flags.
1634 */
1342int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1635int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1343 unsigned long start, int nr_pages, unsigned int gup_flags, 1636 unsigned long start, int nr_pages, unsigned int gup_flags,
1344 struct page **pages, struct vm_area_struct **vmas) 1637 struct page **pages, struct vm_area_struct **vmas,
1638 int *nonblocking)
1345{ 1639{
1346 int i; 1640 int i;
1347 unsigned long vm_flags; 1641 unsigned long vm_flags;
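The kernel-doc above spells out the pinning contract; the sketch below follows it using the public get_user_pages() wrapper documented right after this hunk: take mmap_sem, pin, use the pages, mark them dirty if written, then drop every reference. The helper itself is illustrative, not part of the patch:

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>

/* Sketch: pin nr_pages starting at a page-aligned user address and dirty them. */
static int pin_and_touch_sketch(struct task_struct *tsk, struct mm_struct *mm,
				unsigned long start, int nr_pages)
{
	struct page **pages;
	int i, got;

	pages = kmalloc(nr_pages * sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	down_read(&mm->mmap_sem);
	got = get_user_pages(tsk, mm, start, nr_pages, 1 /* write */,
			     0 /* force */, pages, NULL);
	up_read(&mm->mmap_sem);

	for (i = 0; i < got; i++) {
		/* ... kmap() and write into the page here ... */
		set_page_dirty_lock(pages[i]);	/* we wrote to it */
		put_page(pages[i]);		/* drop the pin */
	}
	kfree(pages);
	return got < 0 ? got : 0;
}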
@@ -1365,9 +1659,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1365 struct vm_area_struct *vma; 1659 struct vm_area_struct *vma;
1366 1660
1367 vma = find_extend_vma(mm, start); 1661 vma = find_extend_vma(mm, start);
1368 if (!vma && in_gate_area(tsk, start)) { 1662 if (!vma && in_gate_area(mm, start)) {
1369 unsigned long pg = start & PAGE_MASK; 1663 unsigned long pg = start & PAGE_MASK;
1370 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
1371 pgd_t *pgd; 1664 pgd_t *pgd;
1372 pud_t *pud; 1665 pud_t *pud;
1373 pmd_t *pmd; 1666 pmd_t *pmd;
@@ -1386,15 +1679,17 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1386 pmd = pmd_offset(pud, pg); 1679 pmd = pmd_offset(pud, pg);
1387 if (pmd_none(*pmd)) 1680 if (pmd_none(*pmd))
1388 return i ? : -EFAULT; 1681 return i ? : -EFAULT;
1682 VM_BUG_ON(pmd_trans_huge(*pmd));
1389 pte = pte_offset_map(pmd, pg); 1683 pte = pte_offset_map(pmd, pg);
1390 if (pte_none(*pte)) { 1684 if (pte_none(*pte)) {
1391 pte_unmap(pte); 1685 pte_unmap(pte);
1392 return i ? : -EFAULT; 1686 return i ? : -EFAULT;
1393 } 1687 }
1688 vma = get_gate_vma(mm);
1394 if (pages) { 1689 if (pages) {
1395 struct page *page; 1690 struct page *page;
1396 1691
1397 page = vm_normal_page(gate_vma, start, *pte); 1692 page = vm_normal_page(vma, start, *pte);
1398 if (!page) { 1693 if (!page) {
1399 if (!(gup_flags & FOLL_DUMP) && 1694 if (!(gup_flags & FOLL_DUMP) &&
1400 is_zero_pfn(pte_pfn(*pte))) 1695 is_zero_pfn(pte_pfn(*pte)))
@@ -1408,12 +1703,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1408 get_page(page); 1703 get_page(page);
1409 } 1704 }
1410 pte_unmap(pte); 1705 pte_unmap(pte);
1411 if (vmas) 1706 goto next_page;
1412 vmas[i] = gate_vma;
1413 i++;
1414 start += PAGE_SIZE;
1415 nr_pages--;
1416 continue;
1417 } 1707 }
1418 1708
1419 if (!vma || 1709 if (!vma ||
@@ -1441,23 +1731,52 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1441 cond_resched(); 1731 cond_resched();
1442 while (!(page = follow_page(vma, start, foll_flags))) { 1732 while (!(page = follow_page(vma, start, foll_flags))) {
1443 int ret; 1733 int ret;
1734 unsigned int fault_flags = 0;
1735
1736 /* For mlock, just skip the stack guard page. */
1737 if (foll_flags & FOLL_MLOCK) {
1738 if (stack_guard_page(vma, start))
1739 goto next_page;
1740 }
1741 if (foll_flags & FOLL_WRITE)
1742 fault_flags |= FAULT_FLAG_WRITE;
1743 if (nonblocking)
1744 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1745 if (foll_flags & FOLL_NOWAIT)
1746 fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
1444 1747
1445 ret = handle_mm_fault(mm, vma, start, 1748 ret = handle_mm_fault(mm, vma, start,
1446 (foll_flags & FOLL_WRITE) ? 1749 fault_flags);
1447 FAULT_FLAG_WRITE : 0);
1448 1750
1449 if (ret & VM_FAULT_ERROR) { 1751 if (ret & VM_FAULT_ERROR) {
1450 if (ret & VM_FAULT_OOM) 1752 if (ret & VM_FAULT_OOM)
1451 return i ? i : -ENOMEM; 1753 return i ? i : -ENOMEM;
1452 if (ret & 1754 if (ret & (VM_FAULT_HWPOISON |
1453 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) 1755 VM_FAULT_HWPOISON_LARGE)) {
1756 if (i)
1757 return i;
1758 else if (gup_flags & FOLL_HWPOISON)
1759 return -EHWPOISON;
1760 else
1761 return -EFAULT;
1762 }
1763 if (ret & VM_FAULT_SIGBUS)
1454 return i ? i : -EFAULT; 1764 return i ? i : -EFAULT;
1455 BUG(); 1765 BUG();
1456 } 1766 }
1457 if (ret & VM_FAULT_MAJOR) 1767
1458 tsk->maj_flt++; 1768 if (tsk) {
1459 else 1769 if (ret & VM_FAULT_MAJOR)
1460 tsk->min_flt++; 1770 tsk->maj_flt++;
1771 else
1772 tsk->min_flt++;
1773 }
1774
1775 if (ret & VM_FAULT_RETRY) {
1776 if (nonblocking)
1777 *nonblocking = 0;
1778 return i;
1779 }
1461 1780
1462 /* 1781 /*
1463 * The VM_FAULT_WRITE bit tells us that 1782 * The VM_FAULT_WRITE bit tells us that
@@ -1485,6 +1804,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1485 flush_anon_page(vma, page, start); 1804 flush_anon_page(vma, page, start);
1486 flush_dcache_page(page); 1805 flush_dcache_page(page);
1487 } 1806 }
1807next_page:
1488 if (vmas) 1808 if (vmas)
1489 vmas[i] = vma; 1809 vmas[i] = vma;
1490 i++; 1810 i++;
@@ -1494,10 +1814,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1494 } while (nr_pages); 1814 } while (nr_pages);
1495 return i; 1815 return i;
1496} 1816}
1817EXPORT_SYMBOL(__get_user_pages);
1497 1818
1498/** 1819/**
1499 * get_user_pages() - pin user pages in memory 1820 * get_user_pages() - pin user pages in memory
1500 * @tsk: task_struct of target task 1821 * @tsk: the task_struct to use for page fault accounting, or
1822 * NULL if faults are not to be recorded.
1501 * @mm: mm_struct of target mm 1823 * @mm: mm_struct of target mm
1502 * @start: starting user address 1824 * @start: starting user address
1503 * @nr_pages: number of pages from start to pin 1825 * @nr_pages: number of pages from start to pin
@@ -1558,7 +1880,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1558 if (force) 1880 if (force)
1559 flags |= FOLL_FORCE; 1881 flags |= FOLL_FORCE;
1560 1882
1561 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 1883 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1884 NULL);
1562} 1885}
1563EXPORT_SYMBOL(get_user_pages); 1886EXPORT_SYMBOL(get_user_pages);
1564 1887
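For context, the calling contract spelled out in the kernel-doc above (pin under mmap_sem, mark the page dirty before releasing it, drop the reference with put_page) looks roughly like this from a caller's side. This is a hedged sketch, not part of the commit: it assumes in-kernel (driver/module) context with the usual mm headers, and the eight-argument get_user_pages() of this kernel series; zero_user_page_pinned() and its user_addr parameter are hypothetical names used only for illustration.

/* needs <linux/mm.h>, <linux/highmem.h>, <linux/sched.h> in a real module */
static int zero_user_page_pinned(unsigned long user_addr)
{
        struct page *page;
        void *kaddr;
        int ret;

        down_read(&current->mm->mmap_sem);
        ret = get_user_pages(current, current->mm, user_addr & PAGE_MASK,
                             1 /* nr_pages */, 1 /* write */, 0 /* force */,
                             &page, NULL);
        up_read(&current->mm->mmap_sem);
        if (ret < 1)
                return ret < 0 ? ret : -EFAULT;

        kaddr = kmap(page);
        memset(kaddr, 0, PAGE_SIZE);    /* use the pinned page */
        kunmap(page);

        set_page_dirty_lock(page);      /* dirty it before dropping the pin */
        put_page(page);                 /* release the reference taken by GUP */
        return 0;
}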
@@ -1583,22 +1906,25 @@ struct page *get_dump_page(unsigned long addr)
1583 struct page *page; 1906 struct page *page;
1584 1907
1585 if (__get_user_pages(current, current->mm, addr, 1, 1908 if (__get_user_pages(current, current->mm, addr, 1,
1586 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) 1909 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1910 NULL) < 1)
1587 return NULL; 1911 return NULL;
1588 flush_cache_page(vma, addr, page_to_pfn(page)); 1912 flush_cache_page(vma, addr, page_to_pfn(page));
1589 return page; 1913 return page;
1590} 1914}
1591#endif /* CONFIG_ELF_CORE */ 1915#endif /* CONFIG_ELF_CORE */
1592 1916
1593pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1917pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1594 spinlock_t **ptl) 1918 spinlock_t **ptl)
1595{ 1919{
1596 pgd_t * pgd = pgd_offset(mm, addr); 1920 pgd_t * pgd = pgd_offset(mm, addr);
1597 pud_t * pud = pud_alloc(mm, pgd, addr); 1921 pud_t * pud = pud_alloc(mm, pgd, addr);
1598 if (pud) { 1922 if (pud) {
1599 pmd_t * pmd = pmd_alloc(mm, pud, addr); 1923 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1600 if (pmd) 1924 if (pmd) {
1925 VM_BUG_ON(pmd_trans_huge(*pmd));
1601 return pte_alloc_map_lock(mm, pmd, addr, ptl); 1926 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1927 }
1602 } 1928 }
1603 return NULL; 1929 return NULL;
1604} 1930}
@@ -1817,6 +2143,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1817 pmd = pmd_alloc(mm, pud, addr); 2143 pmd = pmd_alloc(mm, pud, addr);
1818 if (!pmd) 2144 if (!pmd)
1819 return -ENOMEM; 2145 return -ENOMEM;
2146 VM_BUG_ON(pmd_trans_huge(*pmd));
1820 do { 2147 do {
1821 next = pmd_addr_end(addr, end); 2148 next = pmd_addr_end(addr, end);
1822 if (remap_pte_range(mm, pmd, addr, next, 2149 if (remap_pte_range(mm, pmd, addr, next,
@@ -2026,10 +2353,10 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
2026 * handle_pte_fault chooses page fault handler according to an entry 2353 * handle_pte_fault chooses page fault handler according to an entry
2027 * which was read non-atomically. Before making any commitment, on 2354 * which was read non-atomically. Before making any commitment, on
2028 * those architectures or configurations (e.g. i386 with PAE) which 2355 * those architectures or configurations (e.g. i386 with PAE) which
2029 * might give a mix of unmatched parts, do_swap_page and do_file_page 2356 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
2030 * must check under lock before unmapping the pte and proceeding 2357 * must check under lock before unmapping the pte and proceeding
2031 * (but do_wp_page is only called after already making such a check; 2358 * (but do_wp_page is only called after already making such a check;
2032 * and do_anonymous_page and do_no_page can safely check later on). 2359 * and do_anonymous_page can safely check later on).
2033 */ 2360 */
2034static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, 2361static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2035 pte_t *page_table, pte_t orig_pte) 2362 pte_t *page_table, pte_t orig_pte)
@@ -2047,19 +2374,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2047 return same; 2374 return same;
2048} 2375}
2049 2376
2050/*
2051 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
2052 * servicing faults for write access. In the normal case, do always want
2053 * pte_mkwrite. But get_user_pages can cause write faults for mappings
2054 * that do not have writing enabled, when used by access_process_vm.
2055 */
2056static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
2057{
2058 if (likely(vma->vm_flags & VM_WRITE))
2059 pte = pte_mkwrite(pte);
2060 return pte;
2061}
2062
2063static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2377static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2064{ 2378{
2065 /* 2379 /*
@@ -2079,7 +2393,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2079 * zeroes. 2393 * zeroes.
2080 */ 2394 */
2081 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 2395 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2082 memset(kaddr, 0, PAGE_SIZE); 2396 clear_page(kaddr);
2083 kunmap_atomic(kaddr, KM_USER0); 2397 kunmap_atomic(kaddr, KM_USER0);
2084 flush_dcache_page(dst); 2398 flush_dcache_page(dst);
2085 } else 2399 } else
@@ -2107,10 +2421,11 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2107static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2421static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2108 unsigned long address, pte_t *page_table, pmd_t *pmd, 2422 unsigned long address, pte_t *page_table, pmd_t *pmd,
2109 spinlock_t *ptl, pte_t orig_pte) 2423 spinlock_t *ptl, pte_t orig_pte)
2424 __releases(ptl)
2110{ 2425{
2111 struct page *old_page, *new_page; 2426 struct page *old_page, *new_page;
2112 pte_t entry; 2427 pte_t entry;
2113 int reuse = 0, ret = 0; 2428 int ret = 0;
2114 int page_mkwrite = 0; 2429 int page_mkwrite = 0;
2115 struct page *dirty_page = NULL; 2430 struct page *dirty_page = NULL;
2116 2431
@@ -2142,19 +2457,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2142 &ptl); 2457 &ptl);
2143 if (!pte_same(*page_table, orig_pte)) { 2458 if (!pte_same(*page_table, orig_pte)) {
2144 unlock_page(old_page); 2459 unlock_page(old_page);
2145 page_cache_release(old_page);
2146 goto unlock; 2460 goto unlock;
2147 } 2461 }
2148 page_cache_release(old_page); 2462 page_cache_release(old_page);
2149 } 2463 }
2150 reuse = reuse_swap_page(old_page); 2464 if (reuse_swap_page(old_page)) {
2151 if (reuse)
2152 /* 2465 /*
2153 * The page is all ours. Move it to our anon_vma so 2466 * The page is all ours. Move it to our anon_vma so
2154 * the rmap code will not search our parent or siblings. 2467 * the rmap code will not search our parent or siblings.
2155 * Protected against the rmap code by the page lock. 2468 * Protected against the rmap code by the page lock.
2156 */ 2469 */
2157 page_move_anon_rmap(old_page, vma, address); 2470 page_move_anon_rmap(old_page, vma, address);
2471 unlock_page(old_page);
2472 goto reuse;
2473 }
2158 unlock_page(old_page); 2474 unlock_page(old_page);
2159 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2475 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2160 (VM_WRITE|VM_SHARED))) { 2476 (VM_WRITE|VM_SHARED))) {
@@ -2210,7 +2526,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2210 &ptl); 2526 &ptl);
2211 if (!pte_same(*page_table, orig_pte)) { 2527 if (!pte_same(*page_table, orig_pte)) {
2212 unlock_page(old_page); 2528 unlock_page(old_page);
2213 page_cache_release(old_page);
2214 goto unlock; 2529 goto unlock;
2215 } 2530 }
2216 2531
@@ -2218,18 +2533,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2218 } 2533 }
2219 dirty_page = old_page; 2534 dirty_page = old_page;
2220 get_page(dirty_page); 2535 get_page(dirty_page);
2221 reuse = 1;
2222 }
2223 2536
2224 if (reuse) {
2225reuse: 2537reuse:
2226 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2538 flush_cache_page(vma, address, pte_pfn(orig_pte));
2227 entry = pte_mkyoung(orig_pte); 2539 entry = pte_mkyoung(orig_pte);
2228 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2540 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2229 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2541 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2230 update_mmu_cache(vma, address, page_table); 2542 update_mmu_cache(vma, address, page_table);
2543 pte_unmap_unlock(page_table, ptl);
2231 ret |= VM_FAULT_WRITE; 2544 ret |= VM_FAULT_WRITE;
2232 goto unlock; 2545
2546 if (!dirty_page)
2547 return ret;
2548
2549 /*
2550 * Yes, Virginia, this is actually required to prevent a race
2551 * with clear_page_dirty_for_io() from clearing the page dirty
2552 * bit after it clears all dirty ptes, but before a racing
2553 * do_wp_page installs a dirty pte.
2554 *
2555 * __do_fault is protected similarly.
2556 */
2557 if (!page_mkwrite) {
2558 wait_on_page_locked(dirty_page);
2559 set_page_dirty_balance(dirty_page, page_mkwrite);
2560 }
2561 put_page(dirty_page);
2562 if (page_mkwrite) {
2563 struct address_space *mapping = dirty_page->mapping;
2564
2565 set_page_dirty(dirty_page);
2566 unlock_page(dirty_page);
2567 page_cache_release(dirty_page);
2568 if (mapping) {
2569 /*
2570 * Some device drivers do not set page.mapping
2571 * but still dirty their pages
2572 */
2573 balance_dirty_pages_ratelimited(mapping);
2574 }
2575 }
2576
2577 /* file_update_time outside page_lock */
2578 if (vma->vm_file)
2579 file_update_time(vma->vm_file);
2580
2581 return ret;
2233 } 2582 }
2234 2583
2235 /* 2584 /*
@@ -2254,16 +2603,6 @@ gotten:
2254 } 2603 }
2255 __SetPageUptodate(new_page); 2604 __SetPageUptodate(new_page);
2256 2605
2257 /*
2258 * Don't let another task, with possibly unlocked vma,
2259 * keep the mlocked page.
2260 */
2261 if ((vma->vm_flags & VM_LOCKED) && old_page) {
2262 lock_page(old_page); /* for LRU manipulation */
2263 clear_page_mlock(old_page);
2264 unlock_page(old_page);
2265 }
2266
2267 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2606 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2268 goto oom_free_new; 2607 goto oom_free_new;
2269 2608
@@ -2331,42 +2670,19 @@ gotten:
2331 2670
2332 if (new_page) 2671 if (new_page)
2333 page_cache_release(new_page); 2672 page_cache_release(new_page);
2334 if (old_page)
2335 page_cache_release(old_page);
2336unlock: 2673unlock:
2337 pte_unmap_unlock(page_table, ptl); 2674 pte_unmap_unlock(page_table, ptl);
2338 if (dirty_page) { 2675 if (old_page) {
2339 /* 2676 /*
2340 * Yes, Virginia, this is actually required to prevent a race 2677 * Don't let another task, with possibly unlocked vma,
2341 * with clear_page_dirty_for_io() from clearing the page dirty 2678 * keep the mlocked page.
2342 * bit after it clear all dirty ptes, but before a racing
2343 * do_wp_page installs a dirty pte.
2344 *
2345 * do_no_page is protected similarly.
2346 */ 2679 */
2347 if (!page_mkwrite) { 2680 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2348 wait_on_page_locked(dirty_page); 2681 lock_page(old_page); /* LRU manipulation */
2349 set_page_dirty_balance(dirty_page, page_mkwrite); 2682 munlock_vma_page(old_page);
2350 } 2683 unlock_page(old_page);
2351 put_page(dirty_page);
2352 if (page_mkwrite) {
2353 struct address_space *mapping = dirty_page->mapping;
2354
2355 set_page_dirty(dirty_page);
2356 unlock_page(dirty_page);
2357 page_cache_release(dirty_page);
2358 if (mapping) {
2359 /*
2360 * Some device drivers do not set page.mapping
2361 * but still dirty their pages
2362 */
2363 balance_dirty_pages_ratelimited(mapping);
2364 }
2365 } 2684 }
2366 2685 page_cache_release(old_page);
2367 /* file_update_time outside page_lock */
2368 if (vma->vm_file)
2369 file_update_time(vma->vm_file);
2370 } 2686 }
2371 return ret; 2687 return ret;
2372oom_free_new: 2688oom_free_new:
@@ -2386,96 +2702,11 @@ unwritable_page:
2386 return ret; 2702 return ret;
2387} 2703}
2388 2704
2389/* 2705static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2390 * Helper functions for unmap_mapping_range().
2391 *
2392 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
2393 *
2394 * We have to restart searching the prio_tree whenever we drop the lock,
2395 * since the iterator is only valid while the lock is held, and anyway
2396 * a later vma might be split and reinserted earlier while lock dropped.
2397 *
2398 * The list of nonlinear vmas could be handled more efficiently, using
2399 * a placeholder, but handle it in the same way until a need is shown.
2400 * It is important to search the prio_tree before nonlinear list: a vma
2401 * may become nonlinear and be shifted from prio_tree to nonlinear list
2402 * while the lock is dropped; but never shifted from list to prio_tree.
2403 *
2404 * In order to make forward progress despite restarting the search,
2405 * vm_truncate_count is used to mark a vma as now dealt with, so we can
2406 * quickly skip it next time around. Since the prio_tree search only
2407 * shows us those vmas affected by unmapping the range in question, we
2408 * can't efficiently keep all vmas in step with mapping->truncate_count:
2409 * so instead reset them all whenever it wraps back to 0 (then go to 1).
2410 * mapping->truncate_count and vma->vm_truncate_count are protected by
2411 * i_mmap_lock.
2412 *
2413 * In order to make forward progress despite repeatedly restarting some
2414 * large vma, note the restart_addr from unmap_vmas when it breaks out:
2415 * and restart from that address when we reach that vma again. It might
2416 * have been split or merged, shrunk or extended, but never shifted: so
2417 * restart_addr remains valid so long as it remains in the vma's range.
2418 * unmap_mapping_range forces truncate_count to leap over page-aligned
2419 * values so we can save vma's restart_addr in its truncate_count field.
2420 */
2421#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2422
2423static void reset_vma_truncate_counts(struct address_space *mapping)
2424{
2425 struct vm_area_struct *vma;
2426 struct prio_tree_iter iter;
2427
2428 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2429 vma->vm_truncate_count = 0;
2430 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2431 vma->vm_truncate_count = 0;
2432}
2433
2434static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2435 unsigned long start_addr, unsigned long end_addr, 2706 unsigned long start_addr, unsigned long end_addr,
2436 struct zap_details *details) 2707 struct zap_details *details)
2437{ 2708{
2438 unsigned long restart_addr; 2709 zap_page_range(vma, start_addr, end_addr - start_addr, details);
2439 int need_break;
2440
2441 /*
2442 * files that support invalidating or truncating portions of the
2443 * file from under mmaped areas must have their ->fault function
2444 * return a locked page (and set VM_FAULT_LOCKED in the return).
2445 * This provides synchronisation against concurrent unmapping here.
2446 */
2447
2448again:
2449 restart_addr = vma->vm_truncate_count;
2450 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2451 start_addr = restart_addr;
2452 if (start_addr >= end_addr) {
2453 /* Top of vma has been split off since last time */
2454 vma->vm_truncate_count = details->truncate_count;
2455 return 0;
2456 }
2457 }
2458
2459 restart_addr = zap_page_range(vma, start_addr,
2460 end_addr - start_addr, details);
2461 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2462
2463 if (restart_addr >= end_addr) {
2464 /* We have now completed this vma: mark it so */
2465 vma->vm_truncate_count = details->truncate_count;
2466 if (!need_break)
2467 return 0;
2468 } else {
2469 /* Note restart_addr in vma's truncate_count field */
2470 vma->vm_truncate_count = restart_addr;
2471 if (!need_break)
2472 goto again;
2473 }
2474
2475 spin_unlock(details->i_mmap_lock);
2476 cond_resched();
2477 spin_lock(details->i_mmap_lock);
2478 return -EINTR;
2479} 2710}
2480 2711
2481static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2712static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -2485,12 +2716,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2485 struct prio_tree_iter iter; 2716 struct prio_tree_iter iter;
2486 pgoff_t vba, vea, zba, zea; 2717 pgoff_t vba, vea, zba, zea;
2487 2718
2488restart:
2489 vma_prio_tree_foreach(vma, &iter, root, 2719 vma_prio_tree_foreach(vma, &iter, root,
2490 details->first_index, details->last_index) { 2720 details->first_index, details->last_index) {
2491 /* Skip quickly over those we have already dealt with */
2492 if (vma->vm_truncate_count == details->truncate_count)
2493 continue;
2494 2721
2495 vba = vma->vm_pgoff; 2722 vba = vma->vm_pgoff;
2496 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 2723 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
@@ -2502,11 +2729,10 @@ restart:
2502 if (zea > vea) 2729 if (zea > vea)
2503 zea = vea; 2730 zea = vea;
2504 2731
2505 if (unmap_mapping_range_vma(vma, 2732 unmap_mapping_range_vma(vma,
2506 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, 2733 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2507 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, 2734 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2508 details) < 0) 2735 details);
2509 goto restart;
2510 } 2736 }
2511} 2737}
2512 2738
@@ -2521,15 +2747,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
2521 * across *all* the pages in each nonlinear VMA, not just the pages 2747 * across *all* the pages in each nonlinear VMA, not just the pages
2522 * whose virtual address lies outside the file truncation point. 2748 * whose virtual address lies outside the file truncation point.
2523 */ 2749 */
2524restart:
2525 list_for_each_entry(vma, head, shared.vm_set.list) { 2750 list_for_each_entry(vma, head, shared.vm_set.list) {
2526 /* Skip quickly over those we have already dealt with */
2527 if (vma->vm_truncate_count == details->truncate_count)
2528 continue;
2529 details->nonlinear_vma = vma; 2751 details->nonlinear_vma = vma;
2530 if (unmap_mapping_range_vma(vma, vma->vm_start, 2752 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2531 vma->vm_end, details) < 0)
2532 goto restart;
2533 } 2753 }
2534} 2754}
2535 2755
@@ -2568,51 +2788,17 @@ void unmap_mapping_range(struct address_space *mapping,
2568 details.last_index = hba + hlen - 1; 2788 details.last_index = hba + hlen - 1;
2569 if (details.last_index < details.first_index) 2789 if (details.last_index < details.first_index)
2570 details.last_index = ULONG_MAX; 2790 details.last_index = ULONG_MAX;
2571 details.i_mmap_lock = &mapping->i_mmap_lock;
2572
2573 spin_lock(&mapping->i_mmap_lock);
2574 2791
2575 /* Protect against endless unmapping loops */
2576 mapping->truncate_count++;
2577 if (unlikely(is_restart_addr(mapping->truncate_count))) {
2578 if (mapping->truncate_count == 0)
2579 reset_vma_truncate_counts(mapping);
2580 mapping->truncate_count++;
2581 }
2582 details.truncate_count = mapping->truncate_count;
2583 2792
2793 mutex_lock(&mapping->i_mmap_mutex);
2584 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 2794 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2585 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2795 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2586 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2796 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2587 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2797 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2588 spin_unlock(&mapping->i_mmap_lock); 2798 mutex_unlock(&mapping->i_mmap_mutex);
2589} 2799}
2590EXPORT_SYMBOL(unmap_mapping_range); 2800EXPORT_SYMBOL(unmap_mapping_range);
2591 2801
2592int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2593{
2594 struct address_space *mapping = inode->i_mapping;
2595
2596 /*
2597 * If the underlying filesystem is not going to provide
2598 * a way to truncate a range of blocks (punch a hole) -
2599 * we should return failure right now.
2600 */
2601 if (!inode->i_op->truncate_range)
2602 return -ENOSYS;
2603
2604 mutex_lock(&inode->i_mutex);
2605 down_write(&inode->i_alloc_sem);
2606 unmap_mapping_range(mapping, offset, (end - offset), 1);
2607 truncate_inode_pages_range(mapping, offset, end);
2608 unmap_mapping_range(mapping, offset, (end - offset), 1);
2609 inode->i_op->truncate_range(inode, offset, end);
2610 up_write(&inode->i_alloc_sem);
2611 mutex_unlock(&inode->i_mutex);
2612
2613 return 0;
2614}
2615
2616/* 2802/*
2617 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2803 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2618 * but allow concurrent faults), and pte mapped but not yet locked. 2804 * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2626,7 +2812,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2626 struct page *page, *swapcache = NULL; 2812 struct page *page, *swapcache = NULL;
2627 swp_entry_t entry; 2813 swp_entry_t entry;
2628 pte_t pte; 2814 pte_t pte;
2629 struct mem_cgroup *ptr = NULL; 2815 int locked;
2816 struct mem_cgroup *ptr;
2630 int exclusive = 0; 2817 int exclusive = 0;
2631 int ret = 0; 2818 int ret = 0;
2632 2819
@@ -2666,6 +2853,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2666 /* Had to read the page from swap area: Major fault */ 2853 /* Had to read the page from swap area: Major fault */
2667 ret = VM_FAULT_MAJOR; 2854 ret = VM_FAULT_MAJOR;
2668 count_vm_event(PGMAJFAULT); 2855 count_vm_event(PGMAJFAULT);
2856 mem_cgroup_count_vm_event(mm, PGMAJFAULT);
2669 } else if (PageHWPoison(page)) { 2857 } else if (PageHWPoison(page)) {
2670 /* 2858 /*
2671 * hwpoisoned dirty swapcache pages are kept for killing 2859 * hwpoisoned dirty swapcache pages are kept for killing
@@ -2676,8 +2864,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2676 goto out_release; 2864 goto out_release;
2677 } 2865 }
2678 2866
2679 lock_page(page); 2867 locked = lock_page_or_retry(page, mm, flags);
2680 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2868 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2869 if (!locked) {
2870 ret |= VM_FAULT_RETRY;
2871 goto out_release;
2872 }
2681 2873
2682 /* 2874 /*
2683 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not 2875 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
@@ -2810,7 +3002,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
2810 if (prev && prev->vm_end == address) 3002 if (prev && prev->vm_end == address)
2811 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; 3003 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2812 3004
2813 expand_stack(vma, address - PAGE_SIZE); 3005 expand_downwards(vma, address - PAGE_SIZE);
2814 } 3006 }
2815 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { 3007 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
2816 struct vm_area_struct *next = vma->vm_next; 3008 struct vm_area_struct *next = vma->vm_next;
@@ -2926,7 +3118,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2926 vmf.page = NULL; 3118 vmf.page = NULL;
2927 3119
2928 ret = vma->vm_ops->fault(vma, &vmf); 3120 ret = vma->vm_ops->fault(vma, &vmf);
2929 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 3121 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3122 VM_FAULT_RETRY)))
2930 return ret; 3123 return ret;
2931 3124
2932 if (unlikely(PageHWPoison(vmf.page))) { 3125 if (unlikely(PageHWPoison(vmf.page))) {
@@ -2967,12 +3160,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2967 goto out; 3160 goto out;
2968 } 3161 }
2969 charged = 1; 3162 charged = 1;
2970 /*
2971 * Don't let another task, with possibly unlocked vma,
2972 * keep the mlocked page.
2973 */
2974 if (vma->vm_flags & VM_LOCKED)
2975 clear_page_mlock(vmf.page);
2976 copy_user_highpage(page, vmf.page, address, vma); 3163 copy_user_highpage(page, vmf.page, address, vma);
2977 __SetPageUptodate(page); 3164 __SetPageUptodate(page);
2978 } else { 3165 } else {
@@ -3139,9 +3326,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3139 * but allow concurrent faults), and pte mapped but not yet locked. 3326 * but allow concurrent faults), and pte mapped but not yet locked.
3140 * We return with mmap_sem still held, but pte unmapped and unlocked. 3327 * We return with mmap_sem still held, but pte unmapped and unlocked.
3141 */ 3328 */
3142static inline int handle_pte_fault(struct mm_struct *mm, 3329int handle_pte_fault(struct mm_struct *mm,
3143 struct vm_area_struct *vma, unsigned long address, 3330 struct vm_area_struct *vma, unsigned long address,
3144 pte_t *pte, pmd_t *pmd, unsigned int flags) 3331 pte_t *pte, pmd_t *pmd, unsigned int flags)
3145{ 3332{
3146 pte_t entry; 3333 pte_t entry;
3147 spinlock_t *ptl; 3334 spinlock_t *ptl;
@@ -3185,7 +3372,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
3185 * with threads. 3372 * with threads.
3186 */ 3373 */
3187 if (flags & FAULT_FLAG_WRITE) 3374 if (flags & FAULT_FLAG_WRITE)
3188 flush_tlb_page(vma, address); 3375 flush_tlb_fix_spurious_fault(vma, address);
3189 } 3376 }
3190unlock: 3377unlock:
3191 pte_unmap_unlock(pte, ptl); 3378 pte_unmap_unlock(pte, ptl);
@@ -3206,6 +3393,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3206 __set_current_state(TASK_RUNNING); 3393 __set_current_state(TASK_RUNNING);
3207 3394
3208 count_vm_event(PGFAULT); 3395 count_vm_event(PGFAULT);
3396 mem_cgroup_count_vm_event(mm, PGFAULT);
3209 3397
3210 /* do counter updates before entering really critical section. */ 3398 /* do counter updates before entering really critical section. */
3211 check_sync_rss_stat(current); 3399 check_sync_rss_stat(current);
@@ -3220,9 +3408,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3220 pmd = pmd_alloc(mm, pud, address); 3408 pmd = pmd_alloc(mm, pud, address);
3221 if (!pmd) 3409 if (!pmd)
3222 return VM_FAULT_OOM; 3410 return VM_FAULT_OOM;
3223 pte = pte_alloc_map(mm, pmd, address); 3411 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3224 if (!pte) 3412 if (!vma->vm_ops)
3413 return do_huge_pmd_anonymous_page(mm, vma, address,
3414 pmd, flags);
3415 } else {
3416 pmd_t orig_pmd = *pmd;
3417 barrier();
3418 if (pmd_trans_huge(orig_pmd)) {
3419 if (flags & FAULT_FLAG_WRITE &&
3420 !pmd_write(orig_pmd) &&
3421 !pmd_trans_splitting(orig_pmd))
3422 return do_huge_pmd_wp_page(mm, vma, address,
3423 pmd, orig_pmd);
3424 return 0;
3425 }
3426 }
3427
3428 /*
3429 * Use __pte_alloc instead of pte_alloc_map, because we can't
3430 * run pte_offset_map on the pmd, if a huge pmd could
3431 * materialize from under us from a different thread.
3432 */
3433 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
3225 return VM_FAULT_OOM; 3434 return VM_FAULT_OOM;
3435 /* if a huge pmd materialized from under us just retry later */
3436 if (unlikely(pmd_trans_huge(*pmd)))
3437 return 0;
3438 /*
3439 * A regular pmd is established and it can't morph into a huge pmd
3440 * from under us anymore at this point because we hold the mmap_sem
3441 * read mode and khugepaged takes it in write mode. So now it's
3442 * safe to run pte_offset_map().
3443 */
3444 pte = pte_offset_map(pmd, address);
3226 3445
3227 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3446 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3228} 3447}
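The transparent-hugepage branch added to handle_mm_fault() above is taken when an anonymous, pmd-aligned range faults with THP enabled. A minimal userspace sketch of exercising it (hypothetical, not part of this commit; it assumes a kernel built with CONFIG_TRANSPARENT_HUGEPAGE and a libc that exposes MADV_HUGEPAGE):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 4UL << 20;   /* 4 MiB: room for a couple of 2 MiB huge pages */
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        /* Hint that this range may be backed by huge pages; the first write
         * fault can then be served by do_huge_pmd_anonymous_page(). */
        if (madvise(buf, len, MADV_HUGEPAGE))
                perror("madvise(MADV_HUGEPAGE)");   /* non-fatal on older kernels */

        memset(buf, 0xab, len);   /* trigger the anonymous write faults */
        printf("check AnonHugePages in /proc/self/smaps while this sleeps\n");
        sleep(30);
        munmap(buf, len);
        return 0;
}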
@@ -3288,7 +3507,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
3288 vma = find_vma(current->mm, addr); 3507 vma = find_vma(current->mm, addr);
3289 if (!vma) 3508 if (!vma)
3290 return -ENOMEM; 3509 return -ENOMEM;
3291 write = (vma->vm_flags & VM_WRITE) != 0; 3510 /*
3511 * We want to touch writable mappings with a write fault in order
3512 * to break COW, except for shared mappings because these don't COW
3513 * and we would not want to dirty them for nothing.
3514 */
3515 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3292 BUG_ON(addr >= end); 3516 BUG_ON(addr >= end);
3293 BUG_ON(end > vma->vm_end); 3517 BUG_ON(end > vma->vm_end);
3294 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 3518 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3323,7 +3547,7 @@ static int __init gate_vma_init(void)
3323__initcall(gate_vma_init); 3547__initcall(gate_vma_init);
3324#endif 3548#endif
3325 3549
3326struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 3550struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
3327{ 3551{
3328#ifdef AT_SYSINFO_EHDR 3552#ifdef AT_SYSINFO_EHDR
3329 return &gate_vma; 3553 return &gate_vma;
@@ -3332,7 +3556,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
3332#endif 3556#endif
3333} 3557}
3334 3558
3335int in_gate_area_no_task(unsigned long addr) 3559int in_gate_area_no_mm(unsigned long addr)
3336{ 3560{
3337#ifdef AT_SYSINFO_EHDR 3561#ifdef AT_SYSINFO_EHDR
3338 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) 3562 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
@@ -3343,7 +3567,7 @@ int in_gate_area_no_task(unsigned long addr)
3343 3567
3344#endif /* __HAVE_ARCH_GATE_AREA */ 3568#endif /* __HAVE_ARCH_GATE_AREA */
3345 3569
3346static int follow_pte(struct mm_struct *mm, unsigned long address, 3570static int __follow_pte(struct mm_struct *mm, unsigned long address,
3347 pte_t **ptepp, spinlock_t **ptlp) 3571 pte_t **ptepp, spinlock_t **ptlp)
3348{ 3572{
3349 pgd_t *pgd; 3573 pgd_t *pgd;
@@ -3360,6 +3584,7 @@ static int follow_pte(struct mm_struct *mm, unsigned long address,
3360 goto out; 3584 goto out;
3361 3585
3362 pmd = pmd_offset(pud, address); 3586 pmd = pmd_offset(pud, address);
3587 VM_BUG_ON(pmd_trans_huge(*pmd));
3363 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3588 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3364 goto out; 3589 goto out;
3365 3590
@@ -3380,6 +3605,17 @@ out:
3380 return -EINVAL; 3605 return -EINVAL;
3381} 3606}
3382 3607
3608static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3609 pte_t **ptepp, spinlock_t **ptlp)
3610{
3611 int res;
3612
3613 /* (void) is needed to make gcc happy */
3614 (void) __cond_lock(*ptlp,
3615 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3616 return res;
3617}
3618
3383/** 3619/**
3384 * follow_pfn - look up PFN at a user virtual address 3620 * follow_pfn - look up PFN at a user virtual address
3385 * @vma: memory mapping 3621 * @vma: memory mapping
@@ -3461,20 +3697,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3461#endif 3697#endif
3462 3698
3463/* 3699/*
3464 * Access another process' address space. 3700 * Access another process' address space as given in mm. If non-NULL, use the
3465 * Source/target buffer must be kernel space, 3701 * given task for page fault accounting.
3466 * Do not walk the page table directly, use get_user_pages
3467 */ 3702 */
3468int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) 3703static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3704 unsigned long addr, void *buf, int len, int write)
3469{ 3705{
3470 struct mm_struct *mm;
3471 struct vm_area_struct *vma; 3706 struct vm_area_struct *vma;
3472 void *old_buf = buf; 3707 void *old_buf = buf;
3473 3708
3474 mm = get_task_mm(tsk);
3475 if (!mm)
3476 return 0;
3477
3478 down_read(&mm->mmap_sem); 3709 down_read(&mm->mmap_sem);
3479 /* ignore errors, just check how much was successfully transferred */ 3710 /* ignore errors, just check how much was successfully transferred */
3480 while (len) { 3711 while (len) {
@@ -3491,7 +3722,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
3491 */ 3722 */
3492#ifdef CONFIG_HAVE_IOREMAP_PROT 3723#ifdef CONFIG_HAVE_IOREMAP_PROT
3493 vma = find_vma(mm, addr); 3724 vma = find_vma(mm, addr);
3494 if (!vma) 3725 if (!vma || vma->vm_start > addr)
3495 break; 3726 break;
3496 if (vma->vm_ops && vma->vm_ops->access) 3727 if (vma->vm_ops && vma->vm_ops->access)
3497 ret = vma->vm_ops->access(vma, addr, buf, 3728 ret = vma->vm_ops->access(vma, addr, buf,
@@ -3523,11 +3754,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
3523 addr += bytes; 3754 addr += bytes;
3524 } 3755 }
3525 up_read(&mm->mmap_sem); 3756 up_read(&mm->mmap_sem);
3526 mmput(mm);
3527 3757
3528 return buf - old_buf; 3758 return buf - old_buf;
3529} 3759}
3530 3760
3761/**
3762 * access_remote_vm - access another process' address space
3763 * @mm: the mm_struct of the target address space
3764 * @addr: start address to access
3765 * @buf: source or destination buffer
3766 * @len: number of bytes to transfer
3767 * @write: whether the access is a write
3768 *
3769 * The caller must hold a reference on @mm.
3770 */
3771int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3772 void *buf, int len, int write)
3773{
3774 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3775}
3776
3777/*
3778 * Access another process' address space.
3779 * Source/target buffer must be kernel space,
3780 * Do not walk the page table directly, use get_user_pages
3781 */
3782int access_process_vm(struct task_struct *tsk, unsigned long addr,
3783 void *buf, int len, int write)
3784{
3785 struct mm_struct *mm;
3786 int ret;
3787
3788 mm = get_task_mm(tsk);
3789 if (!mm)
3790 return 0;
3791
3792 ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
3793 mmput(mm);
3794
3795 return ret;
3796}
3797
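access_process_vm() (and the new access_remote_vm()) is the path behind ptrace data access and reads of /proc/<pid>/mem. As a rough userspace illustration (hypothetical, not part of this commit), a process can read its own address space through that interface, which ends up walking the mm with get_user_pages() as above:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char secret[] = "read back via access_process_vm";
        char copy[sizeof(secret)];
        int fd = open("/proc/self/mem", O_RDONLY);

        if (fd < 0) {
                perror("open /proc/self/mem");
                return 1;
        }
        /* The pread() offset is the virtual address to read; the kernel
         * services it with access_remote_vm() on our own mm. */
        if (pread(fd, copy, sizeof(copy), (off_t)(uintptr_t)secret) < 0) {
                perror("pread");
                close(fd);
                return 1;
        }
        printf("%s\n", copy);
        close(fd);
        return 0;
}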
3531/* 3798/*
3532 * Print the name of a VMA. 3799 * Print the name of a VMA.
3533 */ 3800 */
@@ -3589,3 +3856,74 @@ void might_fault(void)
3589} 3856}
3590EXPORT_SYMBOL(might_fault); 3857EXPORT_SYMBOL(might_fault);
3591#endif 3858#endif
3859
3860#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3861static void clear_gigantic_page(struct page *page,
3862 unsigned long addr,
3863 unsigned int pages_per_huge_page)
3864{
3865 int i;
3866 struct page *p = page;
3867
3868 might_sleep();
3869 for (i = 0; i < pages_per_huge_page;
3870 i++, p = mem_map_next(p, page, i)) {
3871 cond_resched();
3872 clear_user_highpage(p, addr + i * PAGE_SIZE);
3873 }
3874}
3875void clear_huge_page(struct page *page,
3876 unsigned long addr, unsigned int pages_per_huge_page)
3877{
3878 int i;
3879
3880 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3881 clear_gigantic_page(page, addr, pages_per_huge_page);
3882 return;
3883 }
3884
3885 might_sleep();
3886 for (i = 0; i < pages_per_huge_page; i++) {
3887 cond_resched();
3888 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3889 }
3890}
3891
3892static void copy_user_gigantic_page(struct page *dst, struct page *src,
3893 unsigned long addr,
3894 struct vm_area_struct *vma,
3895 unsigned int pages_per_huge_page)
3896{
3897 int i;
3898 struct page *dst_base = dst;
3899 struct page *src_base = src;
3900
3901 for (i = 0; i < pages_per_huge_page; ) {
3902 cond_resched();
3903 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3904
3905 i++;
3906 dst = mem_map_next(dst, dst_base, i);
3907 src = mem_map_next(src, src_base, i);
3908 }
3909}
3910
3911void copy_user_huge_page(struct page *dst, struct page *src,
3912 unsigned long addr, struct vm_area_struct *vma,
3913 unsigned int pages_per_huge_page)
3914{
3915 int i;
3916
3917 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3918 copy_user_gigantic_page(dst, src, addr, vma,
3919 pages_per_huge_page);
3920 return;
3921 }
3922
3923 might_sleep();
3924 for (i = 0; i < pages_per_huge_page; i++) {
3925 cond_resched();
3926 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3927 }
3928}
3929#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index dd186c1a5d53..c46887b5a11e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,23 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37DEFINE_MUTEX(mem_hotplug_mutex);
38
39void lock_memory_hotplug(void)
40{
41 mutex_lock(&mem_hotplug_mutex);
42
43 /* for exclusive hibernation if CONFIG_HIBERNATION=y */
44 lock_system_sleep();
45}
46
47void unlock_memory_hotplug(void)
48{
49 unlock_system_sleep();
50 mutex_unlock(&mem_hotplug_mutex);
51}
52
53
37/* add this memory to iomem resource */ 54/* add this memory to iomem resource */
38static struct resource *register_memory_resource(u64 start, u64 size) 55static struct resource *register_memory_resource(u64 start, u64 size)
39{ 56{
@@ -65,9 +82,10 @@ static void release_memory_resource(struct resource *res)
65 82
66#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 83#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
67#ifndef CONFIG_SPARSEMEM_VMEMMAP 84#ifndef CONFIG_SPARSEMEM_VMEMMAP
68static void get_page_bootmem(unsigned long info, struct page *page, int type) 85static void get_page_bootmem(unsigned long info, struct page *page,
86 unsigned long type)
69{ 87{
70 atomic_set(&page->_mapcount, type); 88 page->lru.next = (struct list_head *) type;
71 SetPagePrivate(page); 89 SetPagePrivate(page);
72 set_page_private(page, info); 90 set_page_private(page, info);
73 atomic_inc(&page->_count); 91 atomic_inc(&page->_count);
@@ -77,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
77 * so use __ref to tell modpost not to generate a warning */ 95 * so use __ref to tell modpost not to generate a warning */
78void __ref put_page_bootmem(struct page *page) 96void __ref put_page_bootmem(struct page *page)
79{ 97{
80 int type; 98 unsigned long type;
81 99
82 type = atomic_read(&page->_mapcount); 100 type = (unsigned long) page->lru.next;
83 BUG_ON(type >= -1); 101 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
102 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
84 103
85 if (atomic_dec_return(&page->_count) == 1) { 104 if (atomic_dec_return(&page->_count) == 1) {
86 ClearPagePrivate(page); 105 ClearPagePrivate(page);
87 set_page_private(page, 0); 106 set_page_private(page, 0);
88 reset_page_mapcount(page); 107 INIT_LIST_HEAD(&page->lru);
89 __free_pages_bootmem(page, 0); 108 __free_pages_bootmem(page, 0);
90 } 109 }
91 110
@@ -355,10 +374,6 @@ void online_page(struct page *page)
355 totalhigh_pages++; 374 totalhigh_pages++;
356#endif 375#endif
357 376
358#ifdef CONFIG_FLATMEM
359 max_mapnr = max(page_to_pfn(page), max_mapnr);
360#endif
361
362 ClearPageReserved(page); 377 ClearPageReserved(page);
363 init_page_count(page); 378 init_page_count(page);
364 __free_page(page); 379 __free_page(page);
@@ -381,7 +396,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
381} 396}
382 397
383 398
384int online_pages(unsigned long pfn, unsigned long nr_pages) 399int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
385{ 400{
386 unsigned long onlined_pages = 0; 401 unsigned long onlined_pages = 0;
387 struct zone *zone; 402 struct zone *zone;
@@ -390,6 +405,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
390 int ret; 405 int ret;
391 struct memory_notify arg; 406 struct memory_notify arg;
392 407
408 lock_memory_hotplug();
393 arg.start_pfn = pfn; 409 arg.start_pfn = pfn;
394 arg.nr_pages = nr_pages; 410 arg.nr_pages = nr_pages;
395 arg.status_change_nid = -1; 411 arg.status_change_nid = -1;
@@ -402,6 +418,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
402 ret = notifier_to_errno(ret); 418 ret = notifier_to_errno(ret);
403 if (ret) { 419 if (ret) {
404 memory_notify(MEM_CANCEL_ONLINE, &arg); 420 memory_notify(MEM_CANCEL_ONLINE, &arg);
421 unlock_memory_hotplug();
405 return ret; 422 return ret;
406 } 423 }
407 /* 424 /*
@@ -426,6 +443,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
426 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 443 printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
427 nr_pages, pfn); 444 nr_pages, pfn);
428 memory_notify(MEM_CANCEL_ONLINE, &arg); 445 memory_notify(MEM_CANCEL_ONLINE, &arg);
446 unlock_memory_hotplug();
429 return ret; 447 return ret;
430 } 448 }
431 449
@@ -437,8 +455,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
437 zone_pcp_update(zone); 455 zone_pcp_update(zone);
438 456
439 mutex_unlock(&zonelists_mutex); 457 mutex_unlock(&zonelists_mutex);
440 setup_per_zone_wmarks(); 458
441 calculate_zone_inactive_ratio(zone); 459 init_per_zone_wmark_min();
460
442 if (onlined_pages) { 461 if (onlined_pages) {
443 kswapd_run(zone_to_nid(zone)); 462 kswapd_run(zone_to_nid(zone));
444 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 463 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -450,6 +469,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
450 469
451 if (onlined_pages) 470 if (onlined_pages)
452 memory_notify(MEM_ONLINE, &arg); 471 memory_notify(MEM_ONLINE, &arg);
472 unlock_memory_hotplug();
453 473
454 return 0; 474 return 0;
455} 475}
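online_pages() is normally reached from the memory sysfs interface when a memory block is switched online, which is where the new lock_memory_hotplug()/unlock_memory_hotplug() pair now serializes against hibernation and other hotplug operations. A hedged userspace sketch of triggering it (not part of this commit; "memory32" is a placeholder block name, and the write needs root on a CONFIG_MEMORY_HOTPLUG_SPARSE kernel):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        /* equivalent to: echo online > /sys/devices/system/memory/memoryN/state */
        const char *path = argc > 1 ? argv[1]
                : "/sys/devices/system/memory/memory32/state";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "online", 6) != 6)   /* ends up in online_pages() */
                perror("write");
        close(fd);
        return 0;
}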
@@ -474,6 +494,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
474 /* init node's zones as empty zones, we don't have any present pages.*/ 494 /* init node's zones as empty zones, we don't have any present pages.*/
475 free_area_init_node(nid, zones_size, start_pfn, zholes_size); 495 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
476 496
497 /*
498 * The node we allocated has no zone fallback lists. To avoid
499 * accessing an uninitialized zonelist, build one here.
500 */
501 mutex_lock(&zonelists_mutex);
502 build_all_zonelists(NULL);
503 mutex_unlock(&zonelists_mutex);
504
477 return pgdat; 505 return pgdat;
478} 506}
479 507
@@ -493,9 +521,9 @@ int mem_online_node(int nid)
493 pg_data_t *pgdat; 521 pg_data_t *pgdat;
494 int ret; 522 int ret;
495 523
496 lock_system_sleep(); 524 lock_memory_hotplug();
497 pgdat = hotadd_new_pgdat(nid, 0); 525 pgdat = hotadd_new_pgdat(nid, 0);
498 if (pgdat) { 526 if (!pgdat) {
499 ret = -ENOMEM; 527 ret = -ENOMEM;
500 goto out; 528 goto out;
501 } 529 }
@@ -504,7 +532,7 @@ int mem_online_node(int nid)
504 BUG_ON(ret); 532 BUG_ON(ret);
505 533
506out: 534out:
507 unlock_system_sleep(); 535 unlock_memory_hotplug();
508 return ret; 536 return ret;
509} 537}
510 538
@@ -516,7 +544,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
516 struct resource *res; 544 struct resource *res;
517 int ret; 545 int ret;
518 546
519 lock_system_sleep(); 547 lock_memory_hotplug();
520 548
521 res = register_memory_resource(start, size); 549 res = register_memory_resource(start, size);
522 ret = -EEXIST; 550 ret = -EEXIST;
@@ -563,7 +591,7 @@ error:
563 release_memory_resource(res); 591 release_memory_resource(res);
564 592
565out: 593out:
566 unlock_system_sleep(); 594 unlock_memory_hotplug();
567 return ret; 595 return ret;
568} 596}
569EXPORT_SYMBOL_GPL(add_memory); 597EXPORT_SYMBOL_GPL(add_memory);
@@ -602,27 +630,14 @@ static struct page *next_active_pageblock(struct page *page)
602/* Checks if this range of memory is likely to be hot-removable. */ 630/* Checks if this range of memory is likely to be hot-removable. */
603int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 631int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
604{ 632{
605 int type;
606 struct page *page = pfn_to_page(start_pfn); 633 struct page *page = pfn_to_page(start_pfn);
607 struct page *end_page = page + nr_pages; 634 struct page *end_page = page + nr_pages;
608 635
609 /* Check the starting page of each pageblock within the range */ 636 /* Check the starting page of each pageblock within the range */
610 for (; page < end_page; page = next_active_pageblock(page)) { 637 for (; page < end_page; page = next_active_pageblock(page)) {
611 type = get_pageblock_migratetype(page); 638 if (!is_pageblock_removable_nolock(page))
612
613 /*
614 * A pageblock containing MOVABLE or free pages is considered
615 * removable
616 */
617 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
618 return 0;
619
620 /*
621 * A pageblock starting with a PageReserved page is not
622 * considered removable.
623 */
624 if (PageReserved(page))
625 return 0; 639 return 0;
640 cond_resched();
626 } 641 }
627 642
628 /* All pageblocks in the memory block are likely to be hot-removable */ 643 /* All pageblocks in the memory block are likely to be hot-removable */
@@ -659,7 +674,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
659 * Scanning pfn is much easier than scanning lru list. 674 * Scanning pfn is much easier than scanning lru list.
660 * Scan pfn from start to end and Find LRU page. 675 * Scan pfn from start to end and Find LRU page.
661 */ 676 */
662int scan_lru_pages(unsigned long start, unsigned long end) 677static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
663{ 678{
664 unsigned long pfn; 679 unsigned long pfn;
665 struct page *page; 680 struct page *page;
@@ -695,7 +710,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
695 if (!pfn_valid(pfn)) 710 if (!pfn_valid(pfn))
696 continue; 711 continue;
697 page = pfn_to_page(pfn); 712 page = pfn_to_page(pfn);
698 if (!page_count(page)) 713 if (!get_page_unless_zero(page))
699 continue; 714 continue;
700 /* 715 /*
701 * We can skip free pages. And we can only deal with pages on 716 * We can skip free pages. And we can only deal with pages on
@@ -703,35 +718,39 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
703 */ 718 */
704 ret = isolate_lru_page(page); 719 ret = isolate_lru_page(page);
705 if (!ret) { /* Success */ 720 if (!ret) { /* Success */
721 put_page(page);
706 list_add_tail(&page->lru, &source); 722 list_add_tail(&page->lru, &source);
707 move_pages--; 723 move_pages--;
708 inc_zone_page_state(page, NR_ISOLATED_ANON + 724 inc_zone_page_state(page, NR_ISOLATED_ANON +
709 page_is_file_cache(page)); 725 page_is_file_cache(page));
710 726
711 } else { 727 } else {
712 /* Becasue we don't have big zone->lock. we should
713 check this again here. */
714 if (page_count(page))
715 not_managed++;
716#ifdef CONFIG_DEBUG_VM 728#ifdef CONFIG_DEBUG_VM
717 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 729 printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
718 pfn); 730 pfn);
719 dump_page(page); 731 dump_page(page);
720#endif 732#endif
733 put_page(page);
734 /* Because we don't have the big zone->lock, we should
735 check this again here. */
736 if (page_count(page)) {
737 not_managed++;
738 ret = -EBUSY;
739 break;
740 }
721 } 741 }
722 } 742 }
723 ret = -EBUSY; 743 if (!list_empty(&source)) {
724 if (not_managed) { 744 if (not_managed) {
725 if (!list_empty(&source)) 745 putback_lru_pages(&source);
746 goto out;
747 }
748 /* this function returns # of failed pages */
749 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
750 true, true);
751 if (ret)
726 putback_lru_pages(&source); 752 putback_lru_pages(&source);
727 goto out;
728 } 753 }
729 ret = 0;
730 if (list_empty(&source))
731 goto out;
732 /* this function returns # of failed pages */
733 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
734
735out: 754out:
736 return ret; 755 return ret;
737} 756}
@@ -783,7 +802,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
783 return offlined; 802 return offlined;
784} 803}
785 804
786static int offline_pages(unsigned long start_pfn, 805static int __ref offline_pages(unsigned long start_pfn,
787 unsigned long end_pfn, unsigned long timeout) 806 unsigned long end_pfn, unsigned long timeout)
788{ 807{
789 unsigned long pfn, nr_pages, expire; 808 unsigned long pfn, nr_pages, expire;
@@ -803,7 +822,7 @@ static int offline_pages(unsigned long start_pfn,
803 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 822 if (!test_pages_in_a_zone(start_pfn, end_pfn))
804 return -EINVAL; 823 return -EINVAL;
805 824
806 lock_system_sleep(); 825 lock_memory_hotplug();
807 826
808 zone = page_zone(pfn_to_page(start_pfn)); 827 zone = page_zone(pfn_to_page(start_pfn));
809 node = zone_to_nid(zone); 828 node = zone_to_nid(zone);
@@ -840,7 +859,6 @@ repeat:
840 ret = 0; 859 ret = 0;
841 if (drain) { 860 if (drain) {
842 lru_add_drain_all(); 861 lru_add_drain_all();
843 flush_scheduled_work();
844 cond_resched(); 862 cond_resched();
845 drain_all_pages(); 863 drain_all_pages();
846 } 864 }
@@ -862,7 +880,6 @@ repeat:
862 } 880 }
863 /* drain all zone's lru pagevec, this is asyncronous... */ 881 /* drain all zone's lru pagevec, this is asyncronous... */
864 lru_add_drain_all(); 882 lru_add_drain_all();
865 flush_scheduled_work();
866 yield(); 883 yield();
867 /* drain pcp pages , this is synchrouns. */ 884 /* drain pcp pages , this is synchrouns. */
868 drain_all_pages(); 885 drain_all_pages();
@@ -883,8 +900,8 @@ repeat:
883 zone->zone_pgdat->node_present_pages -= offlined_pages; 900 zone->zone_pgdat->node_present_pages -= offlined_pages;
884 totalram_pages -= offlined_pages; 901 totalram_pages -= offlined_pages;
885 902
886 setup_per_zone_wmarks(); 903 init_per_zone_wmark_min();
887 calculate_zone_inactive_ratio(zone); 904
888 if (!node_present_pages(node)) { 905 if (!node_present_pages(node)) {
889 node_clear_state(node, N_HIGH_MEMORY); 906 node_clear_state(node, N_HIGH_MEMORY);
890 kswapd_stop(node); 907 kswapd_stop(node);
@@ -894,7 +911,7 @@ repeat:
894 writeback_set_ratelimit(); 911 writeback_set_ratelimit();
895 912
896 memory_notify(MEM_OFFLINE, &arg); 913 memory_notify(MEM_OFFLINE, &arg);
897 unlock_system_sleep(); 914 unlock_memory_hotplug();
898 return 0; 915 return 0;
899 916
900failed_removal: 917failed_removal:
@@ -905,7 +922,7 @@ failed_removal:
905 undo_isolate_page_range(start_pfn, end_pfn); 922 undo_isolate_page_range(start_pfn, end_pfn);
906 923
907out: 924out:
908 unlock_system_sleep(); 925 unlock_memory_hotplug();
909 return ret; 926 return ret;
910} 927}
911 928
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f969da5dd8a2..e7fb9d25c54e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -99,7 +99,6 @@
99/* Internal flags */ 99/* Internal flags */
100#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 100#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
101#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 101#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
102#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
103 102
104static struct kmem_cache *policy_cache; 103static struct kmem_cache *policy_cache;
105static struct kmem_cache *sn_cache; 104static struct kmem_cache *sn_cache;
@@ -457,7 +456,6 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
457 }, 456 },
458}; 457};
459 458
460static void gather_stats(struct page *, void *, int pte_dirty);
461static void migrate_page_add(struct page *page, struct list_head *pagelist, 459static void migrate_page_add(struct page *page, struct list_head *pagelist,
462 unsigned long flags); 460 unsigned long flags);
463 461
@@ -492,9 +490,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
492 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 490 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
493 continue; 491 continue;
494 492
495 if (flags & MPOL_MF_STATS) 493 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
496 gather_stats(page, private, pte_dirty(*pte));
497 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
498 migrate_page_add(page, private, flags); 494 migrate_page_add(page, private, flags);
499 else 495 else
500 break; 496 break;
@@ -514,6 +510,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
514 pmd = pmd_offset(pud, addr); 510 pmd = pmd_offset(pud, addr);
515 do { 511 do {
516 next = pmd_addr_end(addr, end); 512 next = pmd_addr_end(addr, end);
513 split_huge_page_pmd(vma->vm_mm, pmd);
517 if (pmd_none_or_clear_bad(pmd)) 514 if (pmd_none_or_clear_bad(pmd))
518 continue; 515 continue;
519 if (check_pte_range(vma, pmd, addr, next, nodes, 516 if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -924,15 +921,22 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
924 nodemask_t nmask; 921 nodemask_t nmask;
925 LIST_HEAD(pagelist); 922 LIST_HEAD(pagelist);
926 int err = 0; 923 int err = 0;
924 struct vm_area_struct *vma;
927 925
928 nodes_clear(nmask); 926 nodes_clear(nmask);
929 node_set(source, nmask); 927 node_set(source, nmask);
930 928
931 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 929 vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
932 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 930 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
931 if (IS_ERR(vma))
932 return PTR_ERR(vma);
933 933
934 if (!list_empty(&pagelist)) 934 if (!list_empty(&pagelist)) {
935 err = migrate_pages(&pagelist, new_node_page, dest, 0); 935 err = migrate_pages(&pagelist, new_node_page, dest,
936 false, true);
937 if (err)
938 putback_lru_pages(&pagelist);
939 }
936 940
937 return err; 941 return err;
938} 942}
@@ -985,7 +989,7 @@ int do_migrate_pages(struct mm_struct *mm,
985 * most recent <s, d> pair that moved (s != d). If we find a pair 989 * most recent <s, d> pair that moved (s != d). If we find a pair
986 * that not only moved, but what's better, moved to an empty slot 990 * that not only moved, but what's better, moved to an empty slot
987 * (d is not set in tmp), then we break out then, with that pair. 991 * (d is not set in tmp), then we break out then, with that pair.
988 * Otherwise when we finish scannng from_tmp, we at least have the 992 * Otherwise when we finish scanning from_tmp, we at least have the
989 * most recent <s, d> pair that moved. If we get all the way through 993 * most recent <s, d> pair that moved. If we get all the way through
990 * the scan of tmp without finding any node that moved, much less 994 * the scan of tmp without finding any node that moved, much less
991 * moved to an empty node, then there is nothing left worth migrating. 995 * moved to an empty node, then there is nothing left worth migrating.
@@ -1147,9 +1151,13 @@ static long do_mbind(unsigned long start, unsigned long len,
1147 1151
1148 err = mbind_range(mm, start, end, new); 1152 err = mbind_range(mm, start, end, new);
1149 1153
1150 if (!list_empty(&pagelist)) 1154 if (!list_empty(&pagelist)) {
1151 nr_failed = migrate_pages(&pagelist, new_vma_page, 1155 nr_failed = migrate_pages(&pagelist, new_vma_page,
1152 (unsigned long)vma, 0); 1156 (unsigned long)vma,
1157 false, true);
1158 if (nr_failed)
1159 putback_lru_pages(&pagelist);
1160 }
1153 1161
1154 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1162 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1155 err = -EIO; 1163 err = -EIO;
@@ -1298,15 +1306,15 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1298 goto out; 1306 goto out;
1299 1307
1300 /* Find the mm_struct */ 1308 /* Find the mm_struct */
1301 read_lock(&tasklist_lock); 1309 rcu_read_lock();
1302 task = pid ? find_task_by_vpid(pid) : current; 1310 task = pid ? find_task_by_vpid(pid) : current;
1303 if (!task) { 1311 if (!task) {
1304 read_unlock(&tasklist_lock); 1312 rcu_read_unlock();
1305 err = -ESRCH; 1313 err = -ESRCH;
1306 goto out; 1314 goto out;
1307 } 1315 }
1308 mm = get_task_mm(task); 1316 mm = get_task_mm(task);
1309 read_unlock(&tasklist_lock); 1317 rcu_read_unlock();
1310 1318
1311 err = -EINVAL; 1319 err = -EINVAL;
1312 if (!mm) 1320 if (!mm)
@@ -1477,7 +1485,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1477 * freeing by another task. It is the caller's responsibility to free the 1485 * freeing by another task. It is the caller's responsibility to free the
1478 * extra reference for shared policies. 1486 * extra reference for shared policies.
1479 */ 1487 */
1480static struct mempolicy *get_vma_policy(struct task_struct *task, 1488struct mempolicy *get_vma_policy(struct task_struct *task,
1481 struct vm_area_struct *vma, unsigned long addr) 1489 struct vm_area_struct *vma, unsigned long addr)
1482{ 1490{
1483 struct mempolicy *pol = task->mempolicy; 1491 struct mempolicy *pol = task->mempolicy;
@@ -1512,10 +1520,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1512} 1520}
1513 1521
1514/* Return a zonelist indicated by gfp for node representing a mempolicy */ 1522/* Return a zonelist indicated by gfp for node representing a mempolicy */
1515static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) 1523static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1524 int nd)
1516{ 1525{
1517 int nd = numa_node_id();
1518
1519 switch (policy->mode) { 1526 switch (policy->mode) {
1520 case MPOL_PREFERRED: 1527 case MPOL_PREFERRED:
1521 if (!(policy->flags & MPOL_F_LOCAL)) 1528 if (!(policy->flags & MPOL_F_LOCAL))
@@ -1588,7 +1595,7 @@ unsigned slab_node(struct mempolicy *policy)
1588 (void)first_zones_zonelist(zonelist, highest_zoneidx, 1595 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1589 &policy->v.nodes, 1596 &policy->v.nodes,
1590 &zone); 1597 &zone);
1591 return zone->node; 1598 return zone ? zone->node : numa_node_id();
1592 } 1599 }
1593 1600
1594 default: 1601 default:
@@ -1667,7 +1674,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1667 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1674 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1668 huge_page_shift(hstate_vma(vma))), gfp_flags); 1675 huge_page_shift(hstate_vma(vma))), gfp_flags);
1669 } else { 1676 } else {
1670 zl = policy_zonelist(gfp_flags, *mpol); 1677 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1671 if ((*mpol)->mode == MPOL_BIND) 1678 if ((*mpol)->mode == MPOL_BIND)
1672 *nodemask = &(*mpol)->v.nodes; 1679 *nodemask = &(*mpol)->v.nodes;
1673 } 1680 }
@@ -1784,7 +1791,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1784} 1791}
1785 1792
1786/** 1793/**
1787 * alloc_page_vma - Allocate a page for a VMA. 1794 * alloc_pages_vma - Allocate a page for a VMA.
1788 * 1795 *
1789 * @gfp: 1796 * @gfp:
1790 * %GFP_USER user allocation. 1797 * %GFP_USER user allocation.
@@ -1793,6 +1800,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1793 * %GFP_FS allocation should not call back into a file system. 1800 * %GFP_FS allocation should not call back into a file system.
1794 * %GFP_ATOMIC don't sleep. 1801 * %GFP_ATOMIC don't sleep.
1795 * 1802 *
1803 * @order:Order of the GFP allocation.
1796 * @vma: Pointer to VMA or NULL if not available. 1804 * @vma: Pointer to VMA or NULL if not available.
1797 * @addr: Virtual Address of the allocation. Must be inside the VMA. 1805 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1798 * 1806 *
@@ -1806,7 +1814,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1806 * Should be called with the mm_sem of the vma hold. 1814 * Should be called with the mm_sem of the vma hold.
1807 */ 1815 */
1808struct page * 1816struct page *
1809alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) 1817alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1818 unsigned long addr, int node)
1810{ 1819{
1811 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1820 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1812 struct zonelist *zl; 1821 struct zonelist *zl;
@@ -1816,18 +1825,18 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1816 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1825 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1817 unsigned nid; 1826 unsigned nid;
1818 1827
1819 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1828 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1820 mpol_cond_put(pol); 1829 mpol_cond_put(pol);
1821 page = alloc_page_interleave(gfp, 0, nid); 1830 page = alloc_page_interleave(gfp, order, nid);
1822 put_mems_allowed(); 1831 put_mems_allowed();
1823 return page; 1832 return page;
1824 } 1833 }
1825 zl = policy_zonelist(gfp, pol); 1834 zl = policy_zonelist(gfp, pol, node);
1826 if (unlikely(mpol_needs_cond_ref(pol))) { 1835 if (unlikely(mpol_needs_cond_ref(pol))) {
1827 /* 1836 /*
1828 * slow path: ref counted shared policy 1837 * slow path: ref counted shared policy
1829 */ 1838 */
1830 struct page *page = __alloc_pages_nodemask(gfp, 0, 1839 struct page *page = __alloc_pages_nodemask(gfp, order,
1831 zl, policy_nodemask(gfp, pol)); 1840 zl, policy_nodemask(gfp, pol));
1832 __mpol_put(pol); 1841 __mpol_put(pol);
1833 put_mems_allowed(); 1842 put_mems_allowed();
@@ -1836,7 +1845,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1836 /* 1845 /*
1837 * fast path: default or task policy 1846 * fast path: default or task policy
1838 */ 1847 */
1839 page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); 1848 page = __alloc_pages_nodemask(gfp, order, zl,
1849 policy_nodemask(gfp, pol));
1840 put_mems_allowed(); 1850 put_mems_allowed();
1841 return page; 1851 return page;
1842} 1852}
@@ -1877,7 +1887,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1877 page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1887 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1878 else 1888 else
1879 page = __alloc_pages_nodemask(gfp, order, 1889 page = __alloc_pages_nodemask(gfp, order,
1880 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); 1890 policy_zonelist(gfp, pol, numa_node_id()),
1891 policy_nodemask(gfp, pol));
1881 put_mems_allowed(); 1892 put_mems_allowed();
1882 return page; 1893 return page;
1883} 1894}
@@ -1964,8 +1975,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1964 case MPOL_INTERLEAVE: 1975 case MPOL_INTERLEAVE:
1965 return nodes_equal(a->v.nodes, b->v.nodes); 1976 return nodes_equal(a->v.nodes, b->v.nodes);
1966 case MPOL_PREFERRED: 1977 case MPOL_PREFERRED:
1967 return a->v.preferred_node == b->v.preferred_node && 1978 return a->v.preferred_node == b->v.preferred_node;
1968 a->flags == b->flags;
1969 default: 1979 default:
1970 BUG(); 1980 BUG();
1971 return 0; 1981 return 0;
@@ -2515,159 +2525,3 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2515 } 2525 }
2516 return p - buffer; 2526 return p - buffer;
2517} 2527}
2518
2519struct numa_maps {
2520 unsigned long pages;
2521 unsigned long anon;
2522 unsigned long active;
2523 unsigned long writeback;
2524 unsigned long mapcount_max;
2525 unsigned long dirty;
2526 unsigned long swapcache;
2527 unsigned long node[MAX_NUMNODES];
2528};
2529
2530static void gather_stats(struct page *page, void *private, int pte_dirty)
2531{
2532 struct numa_maps *md = private;
2533 int count = page_mapcount(page);
2534
2535 md->pages++;
2536 if (pte_dirty || PageDirty(page))
2537 md->dirty++;
2538
2539 if (PageSwapCache(page))
2540 md->swapcache++;
2541
2542 if (PageActive(page) || PageUnevictable(page))
2543 md->active++;
2544
2545 if (PageWriteback(page))
2546 md->writeback++;
2547
2548 if (PageAnon(page))
2549 md->anon++;
2550
2551 if (count > md->mapcount_max)
2552 md->mapcount_max = count;
2553
2554 md->node[page_to_nid(page)]++;
2555}
2556
2557#ifdef CONFIG_HUGETLB_PAGE
2558static void check_huge_range(struct vm_area_struct *vma,
2559 unsigned long start, unsigned long end,
2560 struct numa_maps *md)
2561{
2562 unsigned long addr;
2563 struct page *page;
2564 struct hstate *h = hstate_vma(vma);
2565 unsigned long sz = huge_page_size(h);
2566
2567 for (addr = start; addr < end; addr += sz) {
2568 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2569 addr & huge_page_mask(h));
2570 pte_t pte;
2571
2572 if (!ptep)
2573 continue;
2574
2575 pte = *ptep;
2576 if (pte_none(pte))
2577 continue;
2578
2579 page = pte_page(pte);
2580 if (!page)
2581 continue;
2582
2583 gather_stats(page, md, pte_dirty(*ptep));
2584 }
2585}
2586#else
2587static inline void check_huge_range(struct vm_area_struct *vma,
2588 unsigned long start, unsigned long end,
2589 struct numa_maps *md)
2590{
2591}
2592#endif
2593
2594/*
2595 * Display pages allocated per node and memory policy via /proc.
2596 */
2597int show_numa_map(struct seq_file *m, void *v)
2598{
2599 struct proc_maps_private *priv = m->private;
2600 struct vm_area_struct *vma = v;
2601 struct numa_maps *md;
2602 struct file *file = vma->vm_file;
2603 struct mm_struct *mm = vma->vm_mm;
2604 struct mempolicy *pol;
2605 int n;
2606 char buffer[50];
2607
2608 if (!mm)
2609 return 0;
2610
2611 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2612 if (!md)
2613 return 0;
2614
2615 pol = get_vma_policy(priv->task, vma, vma->vm_start);
2616 mpol_to_str(buffer, sizeof(buffer), pol, 0);
2617 mpol_cond_put(pol);
2618
2619 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2620
2621 if (file) {
2622 seq_printf(m, " file=");
2623 seq_path(m, &file->f_path, "\n\t= ");
2624 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2625 seq_printf(m, " heap");
2626 } else if (vma->vm_start <= mm->start_stack &&
2627 vma->vm_end >= mm->start_stack) {
2628 seq_printf(m, " stack");
2629 }
2630
2631 if (is_vm_hugetlb_page(vma)) {
2632 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2633 seq_printf(m, " huge");
2634 } else {
2635 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2636 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2637 }
2638
2639 if (!md->pages)
2640 goto out;
2641
2642 if (md->anon)
2643 seq_printf(m," anon=%lu",md->anon);
2644
2645 if (md->dirty)
2646 seq_printf(m," dirty=%lu",md->dirty);
2647
2648 if (md->pages != md->anon && md->pages != md->dirty)
2649 seq_printf(m, " mapped=%lu", md->pages);
2650
2651 if (md->mapcount_max > 1)
2652 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2653
2654 if (md->swapcache)
2655 seq_printf(m," swapcache=%lu", md->swapcache);
2656
2657 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2658 seq_printf(m," active=%lu", md->active);
2659
2660 if (md->writeback)
2661 seq_printf(m," writeback=%lu", md->writeback);
2662
2663 for_each_node_state(n, N_HIGH_MEMORY)
2664 if (md->node[n])
2665 seq_printf(m, " N%d=%lu", n, md->node[n]);
2666out:
2667 seq_putc(m, '\n');
2668 kfree(md);
2669
2670 if (m->count < m->size)
2671 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2672 return 0;
2673}
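
The mempolicy.c hunks above widen the page-allocation entry point: alloc_page_vma() becomes alloc_pages_vma() with an explicit order and target node, and policy_zonelist() now takes the node id from its caller instead of always using numa_node_id(). The header-side shim for the rename is not part of this diff; the sketch below is a hedged guess at how the old single-page interface is kept and how a new-style caller passes order and node (the wrapper macro and the helper function are assumptions, not lines from this patch).

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/mempolicy.h>

/* assumed compatibility wrapper: old name = order 0 on the local node */
#define alloc_page_vma(gfp_mask, vma, addr)				\
	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())

/*
 * New-style caller: request a higher-order page on an explicit node,
 * which is what the transparent-hugepage fault path needs.  'order'
 * and 'node' here are plain parameters, not values from this diff.
 */
static struct page *alloc_order_on_node(gfp_t gfp, int order,
					struct vm_area_struct *vma,
					unsigned long addr, int node)
{
	return alloc_pages_vma(gfp, order, vma, addr, node);
}
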
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..666e4e677414 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,8 +32,11 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h>
35#include <linux/gfp.h> 36#include <linux/gfp.h>
36 37
38#include <asm/tlbflush.h>
39
37#include "internal.h" 40#include "internal.h"
38 41
39#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 42#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -95,26 +98,36 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
95 pte_t *ptep, pte; 98 pte_t *ptep, pte;
96 spinlock_t *ptl; 99 spinlock_t *ptl;
97 100
98 pgd = pgd_offset(mm, addr); 101 if (unlikely(PageHuge(new))) {
99 if (!pgd_present(*pgd)) 102 ptep = huge_pte_offset(mm, addr);
100 goto out; 103 if (!ptep)
104 goto out;
105 ptl = &mm->page_table_lock;
106 } else {
107 pgd = pgd_offset(mm, addr);
108 if (!pgd_present(*pgd))
109 goto out;
101 110
102 pud = pud_offset(pgd, addr); 111 pud = pud_offset(pgd, addr);
103 if (!pud_present(*pud)) 112 if (!pud_present(*pud))
104 goto out; 113 goto out;
105 114
106 pmd = pmd_offset(pud, addr); 115 pmd = pmd_offset(pud, addr);
107 if (!pmd_present(*pmd)) 116 if (pmd_trans_huge(*pmd))
108 goto out; 117 goto out;
118 if (!pmd_present(*pmd))
119 goto out;
109 120
110 ptep = pte_offset_map(pmd, addr); 121 ptep = pte_offset_map(pmd, addr);
111 122
112 if (!is_swap_pte(*ptep)) { 123 if (!is_swap_pte(*ptep)) {
113 pte_unmap(ptep); 124 pte_unmap(ptep);
114 goto out; 125 goto out;
115 } 126 }
127
128 ptl = pte_lockptr(mm, pmd);
129 }
116 130
117 ptl = pte_lockptr(mm, pmd);
118 spin_lock(ptl); 131 spin_lock(ptl);
119 pte = *ptep; 132 pte = *ptep;
120 if (!is_swap_pte(pte)) 133 if (!is_swap_pte(pte))
@@ -130,10 +143,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
130 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 143 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
131 if (is_write_migration_entry(entry)) 144 if (is_write_migration_entry(entry))
132 pte = pte_mkwrite(pte); 145 pte = pte_mkwrite(pte);
146#ifdef CONFIG_HUGETLB_PAGE
147 if (PageHuge(new))
148 pte = pte_mkhuge(pte);
149#endif
133 flush_cache_page(vma, addr, pte_pfn(pte)); 150 flush_cache_page(vma, addr, pte_pfn(pte));
134 set_pte_at(mm, addr, ptep, pte); 151 set_pte_at(mm, addr, ptep, pte);
135 152
136 if (PageAnon(new)) 153 if (PageHuge(new)) {
154 if (PageAnon(new))
155 hugepage_add_anon_rmap(new, vma, addr);
156 else
157 page_dup_rmap(new);
158 } else if (PageAnon(new))
137 page_add_anon_rmap(new, vma, addr); 159 page_add_anon_rmap(new, vma, addr);
138 else 160 else
139 page_add_file_rmap(new); 161 page_add_file_rmap(new);
@@ -226,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
226 248
227 expected_count = 2 + page_has_private(page); 249 expected_count = 2 + page_has_private(page);
228 if (page_count(page) != expected_count || 250 if (page_count(page) != expected_count ||
229 (struct page *)radix_tree_deref_slot(pslot) != page) { 251 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
230 spin_unlock_irq(&mapping->tree_lock); 252 spin_unlock_irq(&mapping->tree_lock);
231 return -EAGAIN; 253 return -EAGAIN;
232 } 254 }
@@ -266,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
266 */ 288 */
267 __dec_zone_page_state(page, NR_FILE_PAGES); 289 __dec_zone_page_state(page, NR_FILE_PAGES);
268 __inc_zone_page_state(newpage, NR_FILE_PAGES); 290 __inc_zone_page_state(newpage, NR_FILE_PAGES);
269 if (PageSwapBacked(page)) { 291 if (!PageSwapCache(page) && PageSwapBacked(page)) {
270 __dec_zone_page_state(page, NR_SHMEM); 292 __dec_zone_page_state(page, NR_SHMEM);
271 __inc_zone_page_state(newpage, NR_SHMEM); 293 __inc_zone_page_state(newpage, NR_SHMEM);
272 } 294 }
@@ -276,11 +298,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
276} 298}
277 299
278/* 300/*
301 * The expected number of remaining references is the same as that
302 * of migrate_page_move_mapping().
303 */
304int migrate_huge_page_move_mapping(struct address_space *mapping,
305 struct page *newpage, struct page *page)
306{
307 int expected_count;
308 void **pslot;
309
310 if (!mapping) {
311 if (page_count(page) != 1)
312 return -EAGAIN;
313 return 0;
314 }
315
316 spin_lock_irq(&mapping->tree_lock);
317
318 pslot = radix_tree_lookup_slot(&mapping->page_tree,
319 page_index(page));
320
321 expected_count = 2 + page_has_private(page);
322 if (page_count(page) != expected_count ||
323 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
324 spin_unlock_irq(&mapping->tree_lock);
325 return -EAGAIN;
326 }
327
328 if (!page_freeze_refs(page, expected_count)) {
329 spin_unlock_irq(&mapping->tree_lock);
330 return -EAGAIN;
331 }
332
333 get_page(newpage);
334
335 radix_tree_replace_slot(pslot, newpage);
336
337 page_unfreeze_refs(page, expected_count);
338
339 __put_page(page);
340
341 spin_unlock_irq(&mapping->tree_lock);
342 return 0;
343}
344
345/*
279 * Copy the page to its new location 346 * Copy the page to its new location
280 */ 347 */
281static void migrate_page_copy(struct page *newpage, struct page *page) 348void migrate_page_copy(struct page *newpage, struct page *page)
282{ 349{
283 copy_highpage(newpage, page); 350 if (PageHuge(page))
351 copy_huge_page(newpage, page);
352 else
353 copy_highpage(newpage, page);
284 354
285 if (PageError(page)) 355 if (PageError(page))
286 SetPageError(newpage); 356 SetPageError(newpage);
@@ -305,7 +375,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
305 * redo the accounting that clear_page_dirty_for_io undid, 375 * redo the accounting that clear_page_dirty_for_io undid,
306 * but we can't use set_page_dirty because that function 376 * but we can't use set_page_dirty because that function
307 * is actually a signal that all of the page has become dirty. 377 * is actually a signal that all of the page has become dirty.
308 * Wheras only part of our page may be dirty. 378 * Whereas only part of our page may be dirty.
309 */ 379 */
310 __set_page_dirty_nobuffers(newpage); 380 __set_page_dirty_nobuffers(newpage);
311 } 381 }
@@ -431,7 +501,6 @@ static int writeout(struct address_space *mapping, struct page *page)
431 .nr_to_write = 1, 501 .nr_to_write = 1,
432 .range_start = 0, 502 .range_start = 0,
433 .range_end = LLONG_MAX, 503 .range_end = LLONG_MAX,
434 .nonblocking = 1,
435 .for_reclaim = 1 504 .for_reclaim = 1
436 }; 505 };
437 int rc; 506 int rc;
@@ -495,7 +564,7 @@ static int fallback_migrate_page(struct address_space *mapping,
495 * == 0 - success 564 * == 0 - success
496 */ 565 */
497static int move_to_new_page(struct page *newpage, struct page *page, 566static int move_to_new_page(struct page *newpage, struct page *page,
498 int remap_swapcache) 567 int remap_swapcache, bool sync)
499{ 568{
500 struct address_space *mapping; 569 struct address_space *mapping;
501 int rc; 570 int rc;
@@ -517,18 +586,28 @@ static int move_to_new_page(struct page *newpage, struct page *page,
517 mapping = page_mapping(page); 586 mapping = page_mapping(page);
518 if (!mapping) 587 if (!mapping)
519 rc = migrate_page(mapping, newpage, page); 588 rc = migrate_page(mapping, newpage, page);
520 else if (mapping->a_ops->migratepage) 589 else {
521 /* 590 /*
522 * Most pages have a mapping and most filesystems 591 * Do not writeback pages if !sync and migratepage is
523 * should provide a migration function. Anonymous 592 * not pointing to migrate_page() which is nonblocking
524 * pages are part of swap space which also has its 593 * (swapcache/tmpfs uses migratepage = migrate_page).
525 * own migration function. This is the most common
526 * path for page migration.
527 */ 594 */
528 rc = mapping->a_ops->migratepage(mapping, 595 if (PageDirty(page) && !sync &&
529 newpage, page); 596 mapping->a_ops->migratepage != migrate_page)
530 else 597 rc = -EBUSY;
531 rc = fallback_migrate_page(mapping, newpage, page); 598 else if (mapping->a_ops->migratepage)
599 /*
600 * Most pages have a mapping and most filesystems
601 * should provide a migration function. Anonymous
602 * pages are part of swap space which also has its
603 * own migration function. This is the most common
604 * path for page migration.
605 */
606 rc = mapping->a_ops->migratepage(mapping,
607 newpage, page);
608 else
609 rc = fallback_migrate_page(mapping, newpage, page);
610 }
532 611
533 if (rc) { 612 if (rc) {
534 newpage->mapping = NULL; 613 newpage->mapping = NULL;
@@ -547,15 +626,14 @@ static int move_to_new_page(struct page *newpage, struct page *page,
547 * to the newly allocated page in newpage. 626 * to the newly allocated page in newpage.
548 */ 627 */
549static int unmap_and_move(new_page_t get_new_page, unsigned long private, 628static int unmap_and_move(new_page_t get_new_page, unsigned long private,
550 struct page *page, int force, int offlining) 629 struct page *page, int force, bool offlining, bool sync)
551{ 630{
552 int rc = 0; 631 int rc = 0;
553 int *result = NULL; 632 int *result = NULL;
554 struct page *newpage = get_new_page(page, private, &result); 633 struct page *newpage = get_new_page(page, private, &result);
555 int remap_swapcache = 1; 634 int remap_swapcache = 1;
556 int rcu_locked = 0;
557 int charge = 0; 635 int charge = 0;
558 struct mem_cgroup *mem = NULL; 636 struct mem_cgroup *mem;
559 struct anon_vma *anon_vma = NULL; 637 struct anon_vma *anon_vma = NULL;
560 638
561 if (!newpage) 639 if (!newpage)
@@ -565,13 +643,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
565 /* page was freed from under us. So we are done. */ 643 /* page was freed from under us. So we are done. */
566 goto move_newpage; 644 goto move_newpage;
567 } 645 }
646 if (unlikely(PageTransHuge(page)))
647 if (unlikely(split_huge_page(page)))
648 goto move_newpage;
568 649
569 /* prepare cgroup just returns 0 or -ENOMEM */ 650 /* prepare cgroup just returns 0 or -ENOMEM */
570 rc = -EAGAIN; 651 rc = -EAGAIN;
571 652
572 if (!trylock_page(page)) { 653 if (!trylock_page(page)) {
573 if (!force) 654 if (!force || !sync)
655 goto move_newpage;
656
657 /*
658 * It's not safe for direct compaction to call lock_page.
659 * For example, during page readahead pages are added locked
660 * to the LRU. Later, when the IO completes the pages are
661 * marked uptodate and unlocked. However, the queueing
662 * could be merging multiple pages for one bio (e.g.
663 * mpage_readpages). If an allocation happens for the
664 * second or third page, the process can end up locking
665 * the same page twice and deadlocking. Rather than
666 * trying to be clever about what pages can be locked,
667 * avoid the use of lock_page for direct compaction
668 * altogether.
669 */
670 if (current->flags & PF_MEMALLOC)
574 goto move_newpage; 671 goto move_newpage;
672
575 lock_page(page); 673 lock_page(page);
576 } 674 }
577 675
@@ -590,7 +688,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
590 } 688 }
591 689
592 /* charge against new page */ 690 /* charge against new page */
593 charge = mem_cgroup_prepare_migration(page, newpage, &mem); 691 charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
594 if (charge == -ENOMEM) { 692 if (charge == -ENOMEM) {
595 rc = -ENOMEM; 693 rc = -ENOMEM;
596 goto unlock; 694 goto unlock;
@@ -598,6 +696,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
598 BUG_ON(charge); 696 BUG_ON(charge);
599 697
600 if (PageWriteback(page)) { 698 if (PageWriteback(page)) {
699 /*
700 * For !sync, there is no point retrying as the retry loop
701 * is expected to be too short for PageWriteback to be cleared
702 */
703 if (!sync) {
704 rc = -EBUSY;
705 goto uncharge;
706 }
601 if (!force) 707 if (!force)
602 goto uncharge; 708 goto uncharge;
603 wait_on_page_writeback(page); 709 wait_on_page_writeback(page);
@@ -605,20 +711,22 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
605 /* 711 /*
606 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, 712 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
607 * we cannot notice that anon_vma is freed while we migrates a page. 713 * we cannot notice that anon_vma is freed while we migrates a page.
608 * This rcu_read_lock() delays freeing anon_vma pointer until the end 714 * This get_anon_vma() delays freeing anon_vma pointer until the end
609 * of migration. File cache pages are no problem because of page_lock() 715 * of migration. File cache pages are no problem because of page_lock()
610 * File Caches may use write_page() or lock_page() in migration, then, 716 * File Caches may use write_page() or lock_page() in migration, then,
611 * just care Anon page here. 717 * just care Anon page here.
612 */ 718 */
613 if (PageAnon(page)) { 719 if (PageAnon(page)) {
614 rcu_read_lock(); 720 /*
615 rcu_locked = 1; 721 * Only page_lock_anon_vma() understands the subtleties of
616 722 * getting a hold on an anon_vma from outside one of its mms.
617 /* Determine how to safely use anon_vma */ 723 */
618 if (!page_mapped(page)) { 724 anon_vma = page_get_anon_vma(page);
619 if (!PageSwapCache(page)) 725 if (anon_vma) {
620 goto rcu_unlock; 726 /*
621 727 * Anon page
728 */
729 } else if (PageSwapCache(page)) {
622 /* 730 /*
623 * We cannot be sure that the anon_vma of an unmapped 731 * We cannot be sure that the anon_vma of an unmapped
624 * swapcache page is safe to use because we don't 732 * swapcache page is safe to use because we don't
@@ -633,13 +741,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
633 */ 741 */
634 remap_swapcache = 0; 742 remap_swapcache = 0;
635 } else { 743 } else {
636 /* 744 goto uncharge;
637 * Take a reference count on the anon_vma if the
638 * page is mapped so that it is guaranteed to
639 * exist when the page is remapped later
640 */
641 anon_vma = page_anon_vma(page);
642 get_anon_vma(anon_vma);
643 } 745 }
644 } 746 }
645 747
@@ -656,16 +758,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
656 * free the metadata, so the page can be freed. 758 * free the metadata, so the page can be freed.
657 */ 759 */
658 if (!page->mapping) { 760 if (!page->mapping) {
659 if (!PageAnon(page) && page_has_private(page)) { 761 VM_BUG_ON(PageAnon(page));
660 /* 762 if (page_has_private(page)) {
661 * Go direct to try_to_free_buffers() here because
662 * a) that's what try_to_release_page() would do anyway
663 * b) we may be under rcu_read_lock() here, so we can't
664 * use GFP_KERNEL which is what try_to_release_page()
665 * needs to be effective.
666 */
667 try_to_free_buffers(page); 763 try_to_free_buffers(page);
668 goto rcu_unlock; 764 goto uncharge;
669 } 765 }
670 goto skip_unmap; 766 goto skip_unmap;
671 } 767 }
@@ -675,24 +771,22 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
675 771
676skip_unmap: 772skip_unmap:
677 if (!page_mapped(page)) 773 if (!page_mapped(page))
678 rc = move_to_new_page(newpage, page, remap_swapcache); 774 rc = move_to_new_page(newpage, page, remap_swapcache, sync);
679 775
680 if (rc && remap_swapcache) 776 if (rc && remap_swapcache)
681 remove_migration_ptes(page, page); 777 remove_migration_ptes(page, page);
682rcu_unlock:
683 778
684 /* Drop an anon_vma reference if we took one */ 779 /* Drop an anon_vma reference if we took one */
685 if (anon_vma) 780 if (anon_vma)
686 drop_anon_vma(anon_vma); 781 put_anon_vma(anon_vma);
687 782
688 if (rcu_locked)
689 rcu_read_unlock();
690uncharge: 783uncharge:
691 if (!charge) 784 if (!charge)
692 mem_cgroup_end_migration(mem, page, newpage); 785 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
693unlock: 786unlock:
694 unlock_page(page); 787 unlock_page(page);
695 788
789move_newpage:
696 if (rc != -EAGAIN) { 790 if (rc != -EAGAIN) {
697 /* 791 /*
698 * A page that has been migrated has all references 792 * A page that has been migrated has all references
@@ -706,8 +800,6 @@ unlock:
706 putback_lru_page(page); 800 putback_lru_page(page);
707 } 801 }
708 802
709move_newpage:
710
711 /* 803 /*
712 * Move the new page to the LRU. If migration was not successful 804 * Move the new page to the LRU. If migration was not successful
713 * then this will free the page. 805 * then this will free the page.
@@ -724,6 +816,76 @@ move_newpage:
724} 816}
725 817
726/* 818/*
819 * Counterpart of unmap_and_move_page() for hugepage migration.
820 *
821 * This function doesn't wait the completion of hugepage I/O
822 * because there is no race between I/O and migration for hugepage.
823 * Note that currently hugepage I/O occurs only in direct I/O
824 * where no lock is held and PG_writeback is irrelevant,
825 * and writeback status of all subpages are counted in the reference
826 * count of the head page (i.e. if all subpages of a 2MB hugepage are
827 * under direct I/O, the reference of the head page is 512 and a bit more.)
828 * This means that when we try to migrate hugepage whose subpages are
829 * doing direct I/O, some references remain after try_to_unmap() and
830 * hugepage migration fails without data corruption.
831 *
832 * There is also no race when direct I/O is issued on the page under migration,
833 * because then pte is replaced with migration swap entry and direct I/O code
834 * will wait in the page fault for migration to complete.
835 */
836static int unmap_and_move_huge_page(new_page_t get_new_page,
837 unsigned long private, struct page *hpage,
838 int force, bool offlining, bool sync)
839{
840 int rc = 0;
841 int *result = NULL;
842 struct page *new_hpage = get_new_page(hpage, private, &result);
843 struct anon_vma *anon_vma = NULL;
844
845 if (!new_hpage)
846 return -ENOMEM;
847
848 rc = -EAGAIN;
849
850 if (!trylock_page(hpage)) {
851 if (!force || !sync)
852 goto out;
853 lock_page(hpage);
854 }
855
856 if (PageAnon(hpage))
857 anon_vma = page_get_anon_vma(hpage);
858
859 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
860
861 if (!page_mapped(hpage))
862 rc = move_to_new_page(new_hpage, hpage, 1, sync);
863
864 if (rc)
865 remove_migration_ptes(hpage, hpage);
866
867 if (anon_vma)
868 put_anon_vma(anon_vma);
869out:
870 unlock_page(hpage);
871
872 if (rc != -EAGAIN) {
873 list_del(&hpage->lru);
874 put_page(hpage);
875 }
876
877 put_page(new_hpage);
878
879 if (result) {
880 if (rc)
881 *result = rc;
882 else
883 *result = page_to_nid(new_hpage);
884 }
885 return rc;
886}
887
888/*
727 * migrate_pages 889 * migrate_pages
728 * 890 *
729 * The function takes one list of pages to migrate and a function 891 * The function takes one list of pages to migrate and a function
@@ -732,13 +894,15 @@ move_newpage:
732 * 894 *
733 * The function returns after 10 attempts or if no pages 895 * The function returns after 10 attempts or if no pages
734 * are movable anymore because to has become empty 896 * are movable anymore because to has become empty
735 * or no retryable pages exist anymore. All pages will be 897 * or no retryable pages exist anymore.
736 * returned to the LRU or freed. 898 * Caller should call putback_lru_pages to return pages to the LRU
899 * or free list only if ret != 0.
737 * 900 *
738 * Return: Number of pages not migrated or error code. 901 * Return: Number of pages not migrated or error code.
739 */ 902 */
740int migrate_pages(struct list_head *from, 903int migrate_pages(struct list_head *from,
741 new_page_t get_new_page, unsigned long private, int offlining) 904 new_page_t get_new_page, unsigned long private, bool offlining,
905 bool sync)
742{ 906{
743 int retry = 1; 907 int retry = 1;
744 int nr_failed = 0; 908 int nr_failed = 0;
@@ -758,7 +922,8 @@ int migrate_pages(struct list_head *from,
758 cond_resched(); 922 cond_resched();
759 923
760 rc = unmap_and_move(get_new_page, private, 924 rc = unmap_and_move(get_new_page, private,
761 page, pass > 2, offlining); 925 page, pass > 2, offlining,
926 sync);
762 927
763 switch(rc) { 928 switch(rc) {
764 case -ENOMEM: 929 case -ENOMEM:
@@ -780,8 +945,50 @@ out:
780 if (!swapwrite) 945 if (!swapwrite)
781 current->flags &= ~PF_SWAPWRITE; 946 current->flags &= ~PF_SWAPWRITE;
782 947
783 putback_lru_pages(from); 948 if (rc)
949 return rc;
950
951 return nr_failed + retry;
952}
953
954int migrate_huge_pages(struct list_head *from,
955 new_page_t get_new_page, unsigned long private, bool offlining,
956 bool sync)
957{
958 int retry = 1;
959 int nr_failed = 0;
960 int pass = 0;
961 struct page *page;
962 struct page *page2;
963 int rc;
964
965 for (pass = 0; pass < 10 && retry; pass++) {
966 retry = 0;
967
968 list_for_each_entry_safe(page, page2, from, lru) {
969 cond_resched();
970
971 rc = unmap_and_move_huge_page(get_new_page,
972 private, page, pass > 2, offlining,
973 sync);
784 974
975 switch(rc) {
976 case -ENOMEM:
977 goto out;
978 case -EAGAIN:
979 retry++;
980 break;
981 case 0:
982 break;
983 default:
984 /* Permanent failure */
985 nr_failed++;
986 break;
987 }
988 }
989 }
990 rc = 0;
991out:
785 if (rc) 992 if (rc)
786 return rc; 993 return rc;
787 994
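
migrate_huge_pages(), added in the hunk above, follows the same ten-pass retry loop as migrate_pages() but never puts leftover pages back on the LRU, since hugepages do not live there; per the earlier comment in this file, writeback is also a non-issue because hugepage I/O is direct I/O accounted in the head page's reference count. Below is a hedged sketch of a caller in the spirit of the soft-offline path: the alloc_huge_page_node() callback and the simplified clean-up loop are assumptions, not code from this diff.

#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/mm.h>

/* new_page_t callback: allocate a destination hugepage of the same size */
static struct page *new_hugepage(struct page *hpage, unsigned long private,
				 int **result)
{
	return alloc_huge_page_node(page_hstate(compound_head(hpage)),
				    numa_node_id());
}

/* caller is assumed to already hold a reference on hpage */
static int migrate_one_hugepage(struct page *hpage)
{
	LIST_HEAD(pagelist);
	struct page *p, *p2;
	int ret;

	list_add(&hpage->lru, &pagelist);
	ret = migrate_huge_pages(&pagelist, new_hugepage, 0,
				 false /* offlining */, true /* sync */);
	if (ret) {
		/* simplified: drop our reference on anything left over */
		list_for_each_entry_safe(p, p2, &pagelist, lru) {
			list_del(&p->lru);
			put_page(p);
		}
	}
	return ret;
}
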
@@ -841,10 +1048,10 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
841 1048
842 err = -EFAULT; 1049 err = -EFAULT;
843 vma = find_vma(mm, pp->addr); 1050 vma = find_vma(mm, pp->addr);
844 if (!vma || !vma_migratable(vma)) 1051 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
845 goto set_status; 1052 goto set_status;
846 1053
847 page = follow_page(vma, pp->addr, FOLL_GET); 1054 page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
848 1055
849 err = PTR_ERR(page); 1056 err = PTR_ERR(page);
850 if (IS_ERR(page)) 1057 if (IS_ERR(page))
@@ -890,9 +1097,12 @@ set_status:
890 } 1097 }
891 1098
892 err = 0; 1099 err = 0;
893 if (!list_empty(&pagelist)) 1100 if (!list_empty(&pagelist)) {
894 err = migrate_pages(&pagelist, new_page_node, 1101 err = migrate_pages(&pagelist, new_page_node,
895 (unsigned long)pm, 0); 1102 (unsigned long)pm, 0, true);
1103 if (err)
1104 putback_lru_pages(&pagelist);
1105 }
896 1106
897 up_read(&mm->mmap_sem); 1107 up_read(&mm->mmap_sem);
898 return err; 1108 return err;
@@ -1005,7 +1215,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1005 int err = -EFAULT; 1215 int err = -EFAULT;
1006 1216
1007 vma = find_vma(mm, addr); 1217 vma = find_vma(mm, addr);
1008 if (!vma) 1218 if (!vma || addr < vma->vm_start)
1009 goto set_status; 1219 goto set_status;
1010 1220
1011 page = follow_page(vma, addr, 0); 1221 page = follow_page(vma, addr, 0);
@@ -1086,14 +1296,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1086 return -EPERM; 1296 return -EPERM;
1087 1297
1088 /* Find the mm_struct */ 1298 /* Find the mm_struct */
1089 read_lock(&tasklist_lock); 1299 rcu_read_lock();
1090 task = pid ? find_task_by_vpid(pid) : current; 1300 task = pid ? find_task_by_vpid(pid) : current;
1091 if (!task) { 1301 if (!task) {
1092 read_unlock(&tasklist_lock); 1302 rcu_read_unlock();
1093 return -ESRCH; 1303 return -ESRCH;
1094 } 1304 }
1095 mm = get_task_mm(task); 1305 mm = get_task_mm(task);
1096 read_unlock(&tasklist_lock); 1306 rcu_read_unlock();
1097 1307
1098 if (!mm) 1308 if (!mm)
1099 return -EINVAL; 1309 return -EINVAL;
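
Taken together, the migrate.c hunks change the migrate_pages() contract: it gains explicit offlining and sync booleans and no longer calls putback_lru_pages() on the leftover list itself, so every caller that can see a non-zero return now does that, as the mempolicy.c and do_move_page_to_node_array() hunks above already show. A minimal caller sketch under those assumptions; new_page_on_node() is a placeholder modelled on the node-targeted allocation pattern, not a function from this diff.

#include <linux/migrate.h>
#include <linux/mm.h>

/* placeholder new_page_t callback: allocate on a caller-chosen node */
static struct page *new_page_on_node(struct page *page, unsigned long node,
				     int **result)
{
	return alloc_pages_exact_node((int)node,
				      GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}

static int move_isolated_pages(struct list_head *pagelist, int node)
{
	int err;

	if (list_empty(pagelist))
		return 0;

	err = migrate_pages(pagelist, new_page_on_node, node,
			    false,	/* offlining: not a hot-remove caller */
			    true);	/* sync: may wait on writeback/locks */
	if (err)
		/* new rule: leftover pages are the caller's to put back */
		putback_lru_pages(pagelist);

	return err;
}
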
diff --git a/mm/mincore.c b/mm/mincore.c
index 9ac42dc6d7b6..a4e6b9d75c76 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
154 pmd = pmd_offset(pud, addr); 154 pmd = pmd_offset(pud, addr);
155 do { 155 do {
156 next = pmd_addr_end(addr, end); 156 next = pmd_addr_end(addr, end);
157 if (pmd_trans_huge(*pmd)) {
158 if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
159 vec += (next - addr) >> PAGE_SHIFT;
160 continue;
161 }
162 /* fall through */
163 }
157 if (pmd_none_or_clear_bad(pmd)) 164 if (pmd_none_or_clear_bad(pmd))
158 mincore_unmapped_range(vma, addr, next, vec); 165 mincore_unmapped_range(vma, addr, next, vec);
159 else 166 else
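
The mempolicy.c and mincore.c hunks above illustrate the two ways a pmd walker is taught about transparent huge pages: check_pmd_range() simply splits any huge pmd and keeps walking ptes as before, while mincore_pmd_range() handles the huge pmd in place and only falls through to the pte loop if it was split underneath. A schematic walker showing both options side by side; handle_huge_pmd() and the per-pte body are placeholders, not code from this patch.

#include <linux/mm.h>
#include <linux/huge_mm.h>

static void walk_pmd_range(struct vm_area_struct *vma, pud_t *pud,
			   unsigned long addr, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	unsigned long next;

	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			/* Option A (mempolicy style): force a split, then
			 * walk the resulting ptes below. */
			split_huge_page_pmd(vma->vm_mm, pmd);
			/* Option B (mincore style) would instead be:
			 *   if (handle_huge_pmd(vma, pmd, addr, next))
			 *           continue;
			 * falling through only on a racing split. */
		}
		if (pmd_none_or_clear_bad(pmd))
			continue;
		/* ... per-pte work for [addr, next) goes here ... */
	} while (pmd++, addr = next, addr != end);
}
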
diff --git a/mm/mlock.c b/mm/mlock.c
index b70919ce4f72..048260c4e02e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,13 +135,6 @@ void munlock_vma_page(struct page *page)
135 } 135 }
136} 136}
137 137
138static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
139{
140 return (vma->vm_flags & VM_GROWSDOWN) &&
141 (vma->vm_start == addr) &&
142 !vma_stack_continue(vma->vm_prev, addr);
143}
144
145/** 138/**
146 * __mlock_vma_pages_range() - mlock a range of pages in the vma. 139 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
147 * @vma: target vma 140 * @vma: target vma
@@ -155,13 +148,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
155 * vma->vm_mm->mmap_sem must be held for at least read. 148 * vma->vm_mm->mmap_sem must be held for at least read.
156 */ 149 */
157static long __mlock_vma_pages_range(struct vm_area_struct *vma, 150static long __mlock_vma_pages_range(struct vm_area_struct *vma,
158 unsigned long start, unsigned long end) 151 unsigned long start, unsigned long end,
152 int *nonblocking)
159{ 153{
160 struct mm_struct *mm = vma->vm_mm; 154 struct mm_struct *mm = vma->vm_mm;
161 unsigned long addr = start; 155 unsigned long addr = start;
162 struct page *pages[16]; /* 16 gives a reasonable batch */
163 int nr_pages = (end - start) / PAGE_SIZE; 156 int nr_pages = (end - start) / PAGE_SIZE;
164 int ret = 0;
165 int gup_flags; 157 int gup_flags;
166 158
167 VM_BUG_ON(start & ~PAGE_MASK); 159 VM_BUG_ON(start & ~PAGE_MASK);
@@ -170,73 +162,24 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
170 VM_BUG_ON(end > vma->vm_end); 162 VM_BUG_ON(end > vma->vm_end);
171 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 163 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
172 164
173 gup_flags = FOLL_TOUCH | FOLL_GET; 165 gup_flags = FOLL_TOUCH | FOLL_MLOCK;
174 if (vma->vm_flags & VM_WRITE) 166 /*
167 * We want to touch writable mappings with a write fault in order
168 * to break COW, except for shared mappings because these don't COW
169 * and we would not want to dirty them for nothing.
170 */
171 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
175 gup_flags |= FOLL_WRITE; 172 gup_flags |= FOLL_WRITE;
176 173
177 /* We don't try to access the guard page of a stack vma */ 174 /*
178 if (stack_guard_page(vma, start)) { 175 * We want mlock to succeed for regions that have any permissions
179 addr += PAGE_SIZE; 176 * other than PROT_NONE.
180 nr_pages--; 177 */
181 } 178 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
182 179 gup_flags |= FOLL_FORCE;
183 while (nr_pages > 0) {
184 int i;
185
186 cond_resched();
187
188 /*
189 * get_user_pages makes pages present if we are
190 * setting mlock. and this extra reference count will
191 * disable migration of this page. However, page may
192 * still be truncated out from under us.
193 */
194 ret = __get_user_pages(current, mm, addr,
195 min_t(int, nr_pages, ARRAY_SIZE(pages)),
196 gup_flags, pages, NULL);
197 /*
198 * This can happen for, e.g., VM_NONLINEAR regions before
199 * a page has been allocated and mapped at a given offset,
200 * or for addresses that map beyond end of a file.
201 * We'll mlock the pages if/when they get faulted in.
202 */
203 if (ret < 0)
204 break;
205
206 lru_add_drain(); /* push cached pages to LRU */
207
208 for (i = 0; i < ret; i++) {
209 struct page *page = pages[i];
210
211 if (page->mapping) {
212 /*
213 * That preliminary check is mainly to avoid
214 * the pointless overhead of lock_page on the
215 * ZERO_PAGE: which might bounce very badly if
216 * there is contention. However, we're still
217 * dirtying its cacheline with get/put_page:
218 * we'll add another __get_user_pages flag to
219 * avoid it if that case turns out to matter.
220 */
221 lock_page(page);
222 /*
223 * Because we lock page here and migration is
224 * blocked by the elevated reference, we need
225 * only check for file-cache page truncation.
226 */
227 if (page->mapping)
228 mlock_vma_page(page);
229 unlock_page(page);
230 }
231 put_page(page); /* ref from get_user_pages() */
232 }
233
234 addr += ret * PAGE_SIZE;
235 nr_pages -= ret;
236 ret = 0;
237 }
238 180
239 return ret; /* 0 or negative error code */ 181 return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
182 NULL, NULL, nonblocking);
240} 183}
241 184
242/* 185/*
@@ -278,9 +221,9 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
278 221
279 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 222 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
280 is_vm_hugetlb_page(vma) || 223 is_vm_hugetlb_page(vma) ||
281 vma == get_gate_vma(current))) { 224 vma == get_gate_vma(current->mm))) {
282 225
283 __mlock_vma_pages_range(vma, start, end); 226 __mlock_vma_pages_range(vma, start, end, NULL);
284 227
285 /* Hide errors from mmap() and other callers */ 228 /* Hide errors from mmap() and other callers */
286 return 0; 229 return 0;
@@ -364,26 +307,18 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
364 * For vmas that pass the filters, merge/split as appropriate. 307 * For vmas that pass the filters, merge/split as appropriate.
365 */ 308 */
366static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 309static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
367 unsigned long start, unsigned long end, unsigned int newflags) 310 unsigned long start, unsigned long end, vm_flags_t newflags)
368{ 311{
369 struct mm_struct *mm = vma->vm_mm; 312 struct mm_struct *mm = vma->vm_mm;
370 pgoff_t pgoff; 313 pgoff_t pgoff;
371 int nr_pages; 314 int nr_pages;
372 int ret = 0; 315 int ret = 0;
373 int lock = newflags & VM_LOCKED; 316 int lock = !!(newflags & VM_LOCKED);
374 317
375 if (newflags == vma->vm_flags || 318 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
376 (vma->vm_flags & (VM_IO | VM_PFNMAP))) 319 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
377 goto out; /* don't set VM_LOCKED, don't count */ 320 goto out; /* don't set VM_LOCKED, don't count */
378 321
379 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
380 is_vm_hugetlb_page(vma) ||
381 vma == get_gate_vma(current)) {
382 if (lock)
383 make_pages_present(start, end);
384 goto out; /* don't set VM_LOCKED, don't count */
385 }
386
387 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 322 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
388 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, 323 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
389 vma->vm_file, pgoff, vma_policy(vma)); 324 vma->vm_file, pgoff, vma_policy(vma));
@@ -419,14 +354,10 @@ success:
419 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 354 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
420 */ 355 */
421 356
422 if (lock) { 357 if (lock)
423 vma->vm_flags = newflags; 358 vma->vm_flags = newflags;
424 ret = __mlock_vma_pages_range(vma, start, end); 359 else
425 if (ret < 0)
426 ret = __mlock_posix_error_return(ret);
427 } else {
428 munlock_vma_pages_range(vma, start, end); 360 munlock_vma_pages_range(vma, start, end);
429 }
430 361
431out: 362out:
432 *prev = vma; 363 *prev = vma;
@@ -439,7 +370,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
439 struct vm_area_struct * vma, * prev; 370 struct vm_area_struct * vma, * prev;
440 int error; 371 int error;
441 372
442 len = PAGE_ALIGN(len); 373 VM_BUG_ON(start & ~PAGE_MASK);
374 VM_BUG_ON(len != PAGE_ALIGN(len));
443 end = start + len; 375 end = start + len;
444 if (end < start) 376 if (end < start)
445 return -EINVAL; 377 return -EINVAL;
@@ -453,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
453 prev = vma; 385 prev = vma;
454 386
455 for (nstart = start ; ; ) { 387 for (nstart = start ; ; ) {
456 unsigned int newflags; 388 vm_flags_t newflags;
457 389
458 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 390 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
459 391
@@ -482,6 +414,62 @@ static int do_mlock(unsigned long start, size_t len, int on)
482 return error; 414 return error;
483} 415}
484 416
417static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
418{
419 struct mm_struct *mm = current->mm;
420 unsigned long end, nstart, nend;
421 struct vm_area_struct *vma = NULL;
422 int locked = 0;
423 int ret = 0;
424
425 VM_BUG_ON(start & ~PAGE_MASK);
426 VM_BUG_ON(len != PAGE_ALIGN(len));
427 end = start + len;
428
429 for (nstart = start; nstart < end; nstart = nend) {
430 /*
431 * We want to fault in pages for [nstart; end) address range.
432 * Find first corresponding VMA.
433 */
434 if (!locked) {
435 locked = 1;
436 down_read(&mm->mmap_sem);
437 vma = find_vma(mm, nstart);
438 } else if (nstart >= vma->vm_end)
439 vma = vma->vm_next;
440 if (!vma || vma->vm_start >= end)
441 break;
442 /*
443 * Set [nstart; nend) to intersection of desired address
444 * range with the first VMA. Also, skip undesirable VMA types.
445 */
446 nend = min(end, vma->vm_end);
447 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
448 continue;
449 if (nstart < vma->vm_start)
450 nstart = vma->vm_start;
451 /*
452 * Now fault in a range of pages. __mlock_vma_pages_range()
453 * double checks the vma flags, so that it won't mlock pages
454 * if the vma was already munlocked.
455 */
456 ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
457 if (ret < 0) {
458 if (ignore_errors) {
459 ret = 0;
460 continue; /* continue at next VMA */
461 }
462 ret = __mlock_posix_error_return(ret);
463 break;
464 }
465 nend = nstart + ret * PAGE_SIZE;
466 ret = 0;
467 }
468 if (locked)
469 up_read(&mm->mmap_sem);
470 return ret; /* 0 or negative error code */
471}
472
485SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) 473SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
486{ 474{
487 unsigned long locked; 475 unsigned long locked;
@@ -507,6 +495,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
507 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 495 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
508 error = do_mlock(start, len, 1); 496 error = do_mlock(start, len, 1);
509 up_write(&current->mm->mmap_sem); 497 up_write(&current->mm->mmap_sem);
498 if (!error)
499 error = do_mlock_pages(start, len, 0);
510 return error; 500 return error;
511} 501}
512 502
@@ -534,7 +524,7 @@ static int do_mlockall(int flags)
534 goto out; 524 goto out;
535 525
536 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { 526 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
537 unsigned int newflags; 527 vm_flags_t newflags;
538 528
539 newflags = vma->vm_flags | VM_LOCKED; 529 newflags = vma->vm_flags | VM_LOCKED;
540 if (!(flags & MCL_CURRENT)) 530 if (!(flags & MCL_CURRENT))
@@ -571,6 +561,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
571 capable(CAP_IPC_LOCK)) 561 capable(CAP_IPC_LOCK))
572 ret = do_mlockall(flags); 562 ret = do_mlockall(flags);
573 up_write(&current->mm->mmap_sem); 563 up_write(&current->mm->mmap_sem);
564 if (!ret && (flags & MCL_CURRENT)) {
565 /* Ignore errors */
566 do_mlock_pages(0, TASK_SIZE, 1);
567 }
574out: 568out:
575 return ret; 569 return ret;
576} 570}
diff --git a/mm/mmap.c b/mm/mmap.c
index 00161a48a451..d49736ff8a8d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,8 @@
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h>
32#include <linux/khugepaged.h>
31 33
32#include <asm/uaccess.h> 34#include <asm/uaccess.h>
33#include <asm/cacheflush.h> 35#include <asm/cacheflush.h>
@@ -82,10 +84,14 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)
82} 84}
83EXPORT_SYMBOL(vm_get_page_prot); 85EXPORT_SYMBOL(vm_get_page_prot);
84 86
85int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
86int sysctl_overcommit_ratio = 50; /* default is 50% */ 88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
87int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 89int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
88struct percpu_counter vm_committed_as; 90/*
91 * Make sure vm_committed_as in one cacheline and not cacheline shared with
92 * other variables. It can be updated by several CPUs frequently.
93 */
94struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
89 95
90/* 96/*
91 * Check that a process has enough memory to allocate a new virtual 97 * Check that a process has enough memory to allocate a new virtual
@@ -188,7 +194,7 @@ error:
188} 194}
189 195
190/* 196/*
191 * Requires inode->i_mapping->i_mmap_lock 197 * Requires inode->i_mapping->i_mmap_mutex
192 */ 198 */
193static void __remove_shared_vm_struct(struct vm_area_struct *vma, 199static void __remove_shared_vm_struct(struct vm_area_struct *vma,
194 struct file *file, struct address_space *mapping) 200 struct file *file, struct address_space *mapping)
@@ -216,9 +222,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
216 222
217 if (file) { 223 if (file) {
218 struct address_space *mapping = file->f_mapping; 224 struct address_space *mapping = file->f_mapping;
219 spin_lock(&mapping->i_mmap_lock); 225 mutex_lock(&mapping->i_mmap_mutex);
220 __remove_shared_vm_struct(vma, file, mapping); 226 __remove_shared_vm_struct(vma, file, mapping);
221 spin_unlock(&mapping->i_mmap_lock); 227 mutex_unlock(&mapping->i_mmap_mutex);
222 } 228 }
223} 229}
224 230
@@ -252,7 +258,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
252 down_write(&mm->mmap_sem); 258 down_write(&mm->mmap_sem);
253 259
254#ifdef CONFIG_COMPAT_BRK 260#ifdef CONFIG_COMPAT_BRK
255 min_brk = mm->end_code; 261 /*
262 * CONFIG_COMPAT_BRK can still be overridden by setting
263 * randomize_va_space to 2, which will still cause mm->start_brk
264 * to be arbitrarily shifted
265 */
266 if (current->brk_randomized)
267 min_brk = mm->start_brk;
268 else
269 min_brk = mm->end_data;
256#else 270#else
257 min_brk = mm->start_brk; 271 min_brk = mm->start_brk;
258#endif 272#endif
@@ -384,29 +398,6 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
384 return vma; 398 return vma;
385} 399}
386 400
387static inline void
388__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
389 struct vm_area_struct *prev, struct rb_node *rb_parent)
390{
391 struct vm_area_struct *next;
392
393 vma->vm_prev = prev;
394 if (prev) {
395 next = prev->vm_next;
396 prev->vm_next = vma;
397 } else {
398 mm->mmap = vma;
399 if (rb_parent)
400 next = rb_entry(rb_parent,
401 struct vm_area_struct, vm_rb);
402 else
403 next = NULL;
404 }
405 vma->vm_next = next;
406 if (next)
407 next->vm_prev = vma;
408}
409
410void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 401void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
411 struct rb_node **rb_link, struct rb_node *rb_parent) 402 struct rb_node **rb_link, struct rb_node *rb_parent)
412{ 403{
@@ -454,16 +445,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
454 if (vma->vm_file) 445 if (vma->vm_file)
455 mapping = vma->vm_file->f_mapping; 446 mapping = vma->vm_file->f_mapping;
456 447
457 if (mapping) { 448 if (mapping)
458 spin_lock(&mapping->i_mmap_lock); 449 mutex_lock(&mapping->i_mmap_mutex);
459 vma->vm_truncate_count = mapping->truncate_count;
460 }
461 450
462 __vma_link(mm, vma, prev, rb_link, rb_parent); 451 __vma_link(mm, vma, prev, rb_link, rb_parent);
463 __vma_link_file(vma); 452 __vma_link_file(vma);
464 453
465 if (mapping) 454 if (mapping)
466 spin_unlock(&mapping->i_mmap_lock); 455 mutex_unlock(&mapping->i_mmap_mutex);
467 456
468 mm->map_count++; 457 mm->map_count++;
469 validate_mm(mm); 458 validate_mm(mm);
@@ -566,17 +555,8 @@ again: remove_next = 1 + (end > next->vm_end);
566 mapping = file->f_mapping; 555 mapping = file->f_mapping;
567 if (!(vma->vm_flags & VM_NONLINEAR)) 556 if (!(vma->vm_flags & VM_NONLINEAR))
568 root = &mapping->i_mmap; 557 root = &mapping->i_mmap;
569 spin_lock(&mapping->i_mmap_lock); 558 mutex_lock(&mapping->i_mmap_mutex);
570 if (importer &&
571 vma->vm_truncate_count != next->vm_truncate_count) {
572 /*
573 * unmap_mapping_range might be in progress:
574 * ensure that the expanding vma is rescanned.
575 */
576 importer->vm_truncate_count = 0;
577 }
578 if (insert) { 559 if (insert) {
579 insert->vm_truncate_count = vma->vm_truncate_count;
580 /* 560 /*
581 * Put into prio_tree now, so instantiated pages 561 * Put into prio_tree now, so instantiated pages
582 * are visible to arm/parisc __flush_dcache_page 562 * are visible to arm/parisc __flush_dcache_page
@@ -587,13 +567,15 @@ again: remove_next = 1 + (end > next->vm_end);
587 } 567 }
588 } 568 }
589 569
570 vma_adjust_trans_huge(vma, start, end, adjust_next);
571
590 /* 572 /*
591 * When changing only vma->vm_end, we don't really need anon_vma 573 * When changing only vma->vm_end, we don't really need anon_vma
592 * lock. This is a fairly rare case by itself, but the anon_vma 574 * lock. This is a fairly rare case by itself, but the anon_vma
593 * lock may be shared between many sibling processes. Skipping 575 * lock may be shared between many sibling processes. Skipping
594 * the lock for brk adjustments makes a difference sometimes. 576 * the lock for brk adjustments makes a difference sometimes.
595 */ 577 */
596 if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { 578 if (vma->anon_vma && (importer || start != vma->vm_start)) {
597 anon_vma = vma->anon_vma; 579 anon_vma = vma->anon_vma;
598 anon_vma_lock(anon_vma); 580 anon_vma_lock(anon_vma);
599 } 581 }
@@ -640,7 +622,7 @@ again: remove_next = 1 + (end > next->vm_end);
640 if (anon_vma) 622 if (anon_vma)
641 anon_vma_unlock(anon_vma); 623 anon_vma_unlock(anon_vma);
642 if (mapping) 624 if (mapping)
643 spin_unlock(&mapping->i_mmap_lock); 625 mutex_unlock(&mapping->i_mmap_mutex);
644 626
645 if (remove_next) { 627 if (remove_next) {
646 if (file) { 628 if (file) {
@@ -687,9 +669,17 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
687} 669}
688 670
689static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, 671static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
690 struct anon_vma *anon_vma2) 672 struct anon_vma *anon_vma2,
673 struct vm_area_struct *vma)
691{ 674{
692 return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); 675 /*
676 * The list_is_singular() test is to avoid merging VMA cloned from
677 * parents. This can improve scalability caused by anon_vma lock.
678 */
679 if ((!anon_vma1 || !anon_vma2) && (!vma ||
680 list_is_singular(&vma->anon_vma_chain)))
681 return 1;
682 return anon_vma1 == anon_vma2;
693} 683}
694 684
695/* 685/*
@@ -708,7 +698,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
708 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 698 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
709{ 699{
710 if (is_mergeable_vma(vma, file, vm_flags) && 700 if (is_mergeable_vma(vma, file, vm_flags) &&
711 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { 701 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
712 if (vma->vm_pgoff == vm_pgoff) 702 if (vma->vm_pgoff == vm_pgoff)
713 return 1; 703 return 1;
714 } 704 }
@@ -727,7 +717,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
727 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 717 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
728{ 718{
729 if (is_mergeable_vma(vma, file, vm_flags) && 719 if (is_mergeable_vma(vma, file, vm_flags) &&
730 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { 720 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
731 pgoff_t vm_pglen; 721 pgoff_t vm_pglen;
732 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 722 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
733 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 723 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
@@ -805,7 +795,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
805 can_vma_merge_before(next, vm_flags, 795 can_vma_merge_before(next, vm_flags,
806 anon_vma, file, pgoff+pglen) && 796 anon_vma, file, pgoff+pglen) &&
807 is_mergeable_anon_vma(prev->anon_vma, 797 is_mergeable_anon_vma(prev->anon_vma,
808 next->anon_vma)) { 798 next->anon_vma, NULL)) {
809 /* cases 1, 6 */ 799 /* cases 1, 6 */
810 err = vma_adjust(prev, prev->vm_start, 800 err = vma_adjust(prev, prev->vm_start,
811 next->vm_end, prev->vm_pgoff, NULL); 801 next->vm_end, prev->vm_pgoff, NULL);
@@ -814,6 +804,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
814 end, prev->vm_pgoff, NULL); 804 end, prev->vm_pgoff, NULL);
815 if (err) 805 if (err)
816 return NULL; 806 return NULL;
807 khugepaged_enter_vma_merge(prev);
817 return prev; 808 return prev;
818 } 809 }
819 810
@@ -832,6 +823,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
832 next->vm_pgoff - pglen, NULL); 823 next->vm_pgoff - pglen, NULL);
833 if (err) 824 if (err)
834 return NULL; 825 return NULL;
826 khugepaged_enter_vma_merge(area);
835 return area; 827 return area;
836 } 828 }
837 829
@@ -914,14 +906,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
914 if (anon_vma) 906 if (anon_vma)
915 return anon_vma; 907 return anon_vma;
916try_prev: 908try_prev:
917 /* 909 near = vma->vm_prev;
918 * It is potentially slow to have to call find_vma_prev here.
919 * But it's only on the first write fault on the vma, not
920 * every time, and we could devise a way to avoid it later
921 * (e.g. stash info in next's anon_vma_node when assigning
922 * an anon_vma, or when trying vma_merge). Another time.
923 */
924 BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
925 if (!near) 910 if (!near)
926 goto none; 911 goto none;
927 912
@@ -968,7 +953,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
968{ 953{
969 struct mm_struct * mm = current->mm; 954 struct mm_struct * mm = current->mm;
970 struct inode *inode; 955 struct inode *inode;
971 unsigned int vm_flags; 956 vm_flags_t vm_flags;
972 int error; 957 int error;
973 unsigned long reqprot = prot; 958 unsigned long reqprot = prot;
974 959
@@ -1108,6 +1093,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1108 unsigned long retval = -EBADF; 1093 unsigned long retval = -EBADF;
1109 1094
1110 if (!(flags & MAP_ANONYMOUS)) { 1095 if (!(flags & MAP_ANONYMOUS)) {
1096 audit_mmap_fd(fd, flags);
1111 if (unlikely(flags & MAP_HUGETLB)) 1097 if (unlikely(flags & MAP_HUGETLB))
1112 return -EINVAL; 1098 return -EINVAL;
1113 file = fget(fd); 1099 file = fget(fd);
@@ -1172,7 +1158,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1172 */ 1158 */
1173int vma_wants_writenotify(struct vm_area_struct *vma) 1159int vma_wants_writenotify(struct vm_area_struct *vma)
1174{ 1160{
1175 unsigned int vm_flags = vma->vm_flags; 1161 vm_flags_t vm_flags = vma->vm_flags;
1176 1162
1177 /* If it was private or non-writable, the write bit is already clear */ 1163 /* If it was private or non-writable, the write bit is already clear */
1178 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) 1164 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1200,7 +1186,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1200 * We account for memory if it's a private writeable mapping, 1186 * We account for memory if it's a private writeable mapping,
1201 * not hugepages and VM_NORESERVE wasn't set. 1187 * not hugepages and VM_NORESERVE wasn't set.
1202 */ 1188 */
1203static inline int accountable_mapping(struct file *file, unsigned int vm_flags) 1189static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1204{ 1190{
1205 /* 1191 /*
1206 * hugetlb has its own accounting separate from the core VM 1192 * hugetlb has its own accounting separate from the core VM
@@ -1214,7 +1200,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
1214 1200
1215unsigned long mmap_region(struct file *file, unsigned long addr, 1201unsigned long mmap_region(struct file *file, unsigned long addr,
1216 unsigned long len, unsigned long flags, 1202 unsigned long len, unsigned long flags,
1217 unsigned int vm_flags, unsigned long pgoff) 1203 vm_flags_t vm_flags, unsigned long pgoff)
1218{ 1204{
1219 struct mm_struct *mm = current->mm; 1205 struct mm_struct *mm = current->mm;
1220 struct vm_area_struct *vma, *prev; 1206 struct vm_area_struct *vma, *prev;
@@ -1752,13 +1738,17 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1752 size = address - vma->vm_start; 1738 size = address - vma->vm_start;
1753 grow = (address - vma->vm_end) >> PAGE_SHIFT; 1739 grow = (address - vma->vm_end) >> PAGE_SHIFT;
1754 1740
1755 error = acct_stack_growth(vma, size, grow); 1741 error = -ENOMEM;
1756 if (!error) { 1742 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
1757 vma->vm_end = address; 1743 error = acct_stack_growth(vma, size, grow);
1758 perf_event_mmap(vma); 1744 if (!error) {
1745 vma->vm_end = address;
1746 perf_event_mmap(vma);
1747 }
1759 } 1748 }
1760 } 1749 }
1761 vma_unlock_anon_vma(vma); 1750 vma_unlock_anon_vma(vma);
1751 khugepaged_enter_vma_merge(vma);
1762 return error; 1752 return error;
1763} 1753}
1764#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1754#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1766,7 +1756,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1766/* 1756/*
1767 * vma is the first one with address < vma->vm_start. Have to extend vma. 1757 * vma is the first one with address < vma->vm_start. Have to extend vma.
1768 */ 1758 */
1769static int expand_downwards(struct vm_area_struct *vma, 1759int expand_downwards(struct vm_area_struct *vma,
1770 unsigned long address) 1760 unsigned long address)
1771{ 1761{
1772 int error; 1762 int error;
@@ -1798,22 +1788,21 @@ static int expand_downwards(struct vm_area_struct *vma,
1798 size = vma->vm_end - address; 1788 size = vma->vm_end - address;
1799 grow = (vma->vm_start - address) >> PAGE_SHIFT; 1789 grow = (vma->vm_start - address) >> PAGE_SHIFT;
1800 1790
1801 error = acct_stack_growth(vma, size, grow); 1791 error = -ENOMEM;
1802 if (!error) { 1792 if (grow <= vma->vm_pgoff) {
1803 vma->vm_start = address; 1793 error = acct_stack_growth(vma, size, grow);
1804 vma->vm_pgoff -= grow; 1794 if (!error) {
1805 perf_event_mmap(vma); 1795 vma->vm_start = address;
1796 vma->vm_pgoff -= grow;
1797 perf_event_mmap(vma);
1798 }
1806 } 1799 }
1807 } 1800 }
1808 vma_unlock_anon_vma(vma); 1801 vma_unlock_anon_vma(vma);
1802 khugepaged_enter_vma_merge(vma);
1809 return error; 1803 return error;
1810} 1804}
1811 1805
1812int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
1813{
1814 return expand_downwards(vma, address);
1815}
1816
1817#ifdef CONFIG_STACK_GROWSUP 1806#ifdef CONFIG_STACK_GROWSUP
1818int expand_stack(struct vm_area_struct *vma, unsigned long address) 1807int expand_stack(struct vm_area_struct *vma, unsigned long address)
1819{ 1808{
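Both growth paths now refuse adjustments whose page count would wrap the VMA's page offset. A standalone sketch of the two wrap checks (the function names here are illustrative, only the arithmetic mirrors the hunks above):

    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* expand_upwards-style check: pgoff plus the size in pages must not wrap */
    static int grow_up_ok(unsigned long pgoff, unsigned long size)
    {
        return pgoff + (size >> PAGE_SHIFT) >= pgoff;
    }

    /* expand_downwards-style check: cannot grow down by more pages than pgoff holds */
    static int grow_down_ok(unsigned long pgoff, unsigned long grow)
    {
        return grow <= pgoff;
    }

    int main(void)
    {
        printf("%d %d\n", grow_up_ok(~0UL, 1UL << PAGE_SHIFT),  /* 0: would wrap   */
                          grow_down_ok(4, 8));                  /* 0: grows too far */
        return 0;
    }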
@@ -1896,17 +1885,17 @@ static void unmap_region(struct mm_struct *mm,
1896 unsigned long start, unsigned long end) 1885 unsigned long start, unsigned long end)
1897{ 1886{
1898 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 1887 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1899 struct mmu_gather *tlb; 1888 struct mmu_gather tlb;
1900 unsigned long nr_accounted = 0; 1889 unsigned long nr_accounted = 0;
1901 1890
1902 lru_add_drain(); 1891 lru_add_drain();
1903 tlb = tlb_gather_mmu(mm, 0); 1892 tlb_gather_mmu(&tlb, mm, 0);
1904 update_hiwater_rss(mm); 1893 update_hiwater_rss(mm);
1905 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1894 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1906 vm_unacct_memory(nr_accounted); 1895 vm_unacct_memory(nr_accounted);
1907 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1896 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
1908 next? next->vm_start: 0); 1897 next ? next->vm_start : 0);
1909 tlb_finish_mmu(tlb, start, end); 1898 tlb_finish_mmu(&tlb, start, end);
1910} 1899}
1911 1900
1912/* 1901/*
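tlb_gather_mmu() now fills in a caller-provided struct mmu_gather instead of returning a per-CPU pointer. A hedged sketch of the resulting call shape, following the calls visible in unmap_region() above (kernel context only, not a standalone program):

    struct mmu_gather tlb;

    lru_add_drain();
    tlb_gather_mmu(&tlb, mm, 0);          /* initialise caller-owned gather state */
    unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
    free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
                  next ? next->vm_start : 0);
    tlb_finish_mmu(&tlb, start, end);     /* flush TLBs and release gathered pages */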
@@ -2048,9 +2037,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2048 return -EINVAL; 2037 return -EINVAL;
2049 2038
2050 /* Find the first overlapping VMA */ 2039 /* Find the first overlapping VMA */
2051 vma = find_vma_prev(mm, start, &prev); 2040 vma = find_vma(mm, start);
2052 if (!vma) 2041 if (!vma)
2053 return 0; 2042 return 0;
2043 prev = vma->vm_prev;
2054 /* we have start < vma->vm_end */ 2044 /* we have start < vma->vm_end */
2055 2045
2056 /* if it doesn't overlap, we have nothing.. */ 2046 /* if it doesn't overlap, we have nothing.. */
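Because every VMA now carries ->vm_prev, callers no longer need find_vma_prev(); the substitution used in do_munmap() above is simply the following (kernel context, shown only to make the pattern explicit):

    vma = find_vma(mm, start);
    if (!vma)
        return 0;
    prev = vma->vm_prev;      /* may be NULL when vma is the first mapping */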
@@ -2248,7 +2238,7 @@ EXPORT_SYMBOL(do_brk);
2248/* Release all mmaps. */ 2238/* Release all mmaps. */
2249void exit_mmap(struct mm_struct *mm) 2239void exit_mmap(struct mm_struct *mm)
2250{ 2240{
2251 struct mmu_gather *tlb; 2241 struct mmu_gather tlb;
2252 struct vm_area_struct *vma; 2242 struct vm_area_struct *vma;
2253 unsigned long nr_accounted = 0; 2243 unsigned long nr_accounted = 0;
2254 unsigned long end; 2244 unsigned long end;
@@ -2273,14 +2263,14 @@ void exit_mmap(struct mm_struct *mm)
2273 2263
2274 lru_add_drain(); 2264 lru_add_drain();
2275 flush_cache_mm(mm); 2265 flush_cache_mm(mm);
2276 tlb = tlb_gather_mmu(mm, 1); 2266 tlb_gather_mmu(&tlb, mm, 1);
2277 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2267 /* update_hiwater_rss(mm) here? but nobody should be looking */
2278 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2268 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2279 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2269 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2280 vm_unacct_memory(nr_accounted); 2270 vm_unacct_memory(nr_accounted);
2281 2271
2282 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); 2272 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2283 tlb_finish_mmu(tlb, 0, end); 2273 tlb_finish_mmu(&tlb, 0, end);
2284 2274
2285 /* 2275 /*
2286 * Walk the list again, actually closing and freeing it, 2276 * Walk the list again, actually closing and freeing it,
@@ -2294,7 +2284,7 @@ void exit_mmap(struct mm_struct *mm)
2294 2284
2295/* Insert vm structure into process list sorted by address 2285/* Insert vm structure into process list sorted by address
2296 * and into the inode's i_mmap tree. If vm_file is non-NULL 2286 * and into the inode's i_mmap tree. If vm_file is non-NULL
2297 * then i_mmap_lock is taken here. 2287 * then i_mmap_mutex is taken here.
2298 */ 2288 */
2299int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) 2289int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2300{ 2290{
@@ -2460,6 +2450,7 @@ int install_special_mapping(struct mm_struct *mm,
2460 unsigned long addr, unsigned long len, 2450 unsigned long addr, unsigned long len,
2461 unsigned long vm_flags, struct page **pages) 2451 unsigned long vm_flags, struct page **pages)
2462{ 2452{
2453 int ret;
2463 struct vm_area_struct *vma; 2454 struct vm_area_struct *vma;
2464 2455
2465 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2456 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
@@ -2477,16 +2468,23 @@ int install_special_mapping(struct mm_struct *mm,
2477 vma->vm_ops = &special_mapping_vmops; 2468 vma->vm_ops = &special_mapping_vmops;
2478 vma->vm_private_data = pages; 2469 vma->vm_private_data = pages;
2479 2470
2480 if (unlikely(insert_vm_struct(mm, vma))) { 2471 ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
2481 kmem_cache_free(vm_area_cachep, vma); 2472 if (ret)
2482 return -ENOMEM; 2473 goto out;
2483 } 2474
2475 ret = insert_vm_struct(mm, vma);
2476 if (ret)
2477 goto out;
2484 2478
2485 mm->total_vm += len >> PAGE_SHIFT; 2479 mm->total_vm += len >> PAGE_SHIFT;
2486 2480
2487 perf_event_mmap(vma); 2481 perf_event_mmap(vma);
2488 2482
2489 return 0; 2483 return 0;
2484
2485out:
2486 kmem_cache_free(vm_area_cachep, vma);
2487 return ret;
2490} 2488}
2491 2489
2492static DEFINE_MUTEX(mm_all_locks_mutex); 2490static DEFINE_MUTEX(mm_all_locks_mutex);
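install_special_mapping() now funnels both failure cases through a single cleanup label. A small self-contained sketch of that goto-unwind idiom, with malloc standing in for the slab cache and the failure checks as placeholders (none of these names are the kernel's):

    #include <stdlib.h>

    static int install_thing(int fail_security, int fail_insert)
    {
        int ret;
        void *obj = calloc(1, 64);        /* stands in for kmem_cache_zalloc() */
        if (!obj)
            return -12;                   /* -ENOMEM */

        ret = fail_security ? -13 : 0;    /* stands in for security_file_mmap() */
        if (ret)
            goto out;

        ret = fail_insert ? -12 : 0;      /* stands in for insert_vm_struct() */
        if (ret)
            goto out;

        return 0;                         /* obj stays installed on success */
    out:
        free(obj);                        /* single unwind path for all failures */
        return ret;
    }

    int main(void) { return install_thing(0, 0); }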
@@ -2498,15 +2496,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2498 * The LSB of head.next can't change from under us 2496 * The LSB of head.next can't change from under us
2499 * because we hold the mm_all_locks_mutex. 2497 * because we hold the mm_all_locks_mutex.
2500 */ 2498 */
2501 spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem); 2499 mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
2502 /* 2500 /*
2503 * We can safely modify head.next after taking the 2501 * We can safely modify head.next after taking the
2504 * anon_vma->root->lock. If some other vma in this mm shares 2502 * anon_vma->root->mutex. If some other vma in this mm shares
2505 * the same anon_vma we won't take it again. 2503 * the same anon_vma we won't take it again.
2506 * 2504 *
2507 * No need of atomic instructions here, head.next 2505 * No need of atomic instructions here, head.next
2508 * can't change from under us thanks to the 2506 * can't change from under us thanks to the
2509 * anon_vma->root->lock. 2507 * anon_vma->root->mutex.
2510 */ 2508 */
2511 if (__test_and_set_bit(0, (unsigned long *) 2509 if (__test_and_set_bit(0, (unsigned long *)
2512 &anon_vma->root->head.next)) 2510 &anon_vma->root->head.next))
@@ -2528,7 +2526,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2528 */ 2526 */
2529 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 2527 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2530 BUG(); 2528 BUG();
2531 spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); 2529 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
2532 } 2530 }
2533} 2531}
2534 2532
@@ -2555,7 +2553,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2555 * vma in this mm is backed by the same anon_vma or address_space. 2553 * vma in this mm is backed by the same anon_vma or address_space.
2556 * 2554 *
2557 * We can take all the locks in random order because the VM code 2555 * We can take all the locks in random order because the VM code
2558 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never 2556 * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never
2559 * takes more than one of them in a row. Secondly we're protected 2557 * takes more than one of them in a row. Secondly we're protected
2560 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 2558 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2561 * 2559 *
@@ -2611,7 +2609,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2611 * 2609 *
2612 * No need of atomic instructions here, head.next 2610 * No need of atomic instructions here, head.next
2613 * can't change from under us until we release the 2611 * can't change from under us until we release the
2614 * anon_vma->root->lock. 2612 * anon_vma->root->mutex.
2615 */ 2613 */
2616 if (!__test_and_clear_bit(0, (unsigned long *) 2614 if (!__test_and_clear_bit(0, (unsigned long *)
2617 &anon_vma->root->head.next)) 2615 &anon_vma->root->head.next))
@@ -2627,7 +2625,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
2627 * AS_MM_ALL_LOCKS can't change to 0 from under us 2625 * AS_MM_ALL_LOCKS can't change to 0 from under us
2628 * because we hold the mm_all_locks_mutex. 2626 * because we hold the mm_all_locks_mutex.
2629 */ 2627 */
2630 spin_unlock(&mapping->i_mmap_lock); 2628 mutex_unlock(&mapping->i_mmap_mutex);
2631 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 2629 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2632 &mapping->flags)) 2630 &mapping->flags))
2633 BUG(); 2631 BUG();
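With i_mmap_lock and the anon_vma root lock converted to sleeping mutexes, the mm_take_all_locks() paths switch to mutex_lock_nest_lock(), keeping mmap_sem as the lockdep "nest" lock. The call shape, copied from the hunks above as a hedged kernel-context sketch:

    mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
    mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
    /* ... walk or modify the rmap structures, possibly sleeping ... */
    mutex_unlock(&mapping->i_mmap_mutex);
    mutex_unlock(&anon_vma->root->mutex);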
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 438951d366f2..8d032de4088e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
100 return young; 100 return young;
101} 101}
102 102
103int __mmu_notifier_test_young(struct mm_struct *mm,
104 unsigned long address)
105{
106 struct mmu_notifier *mn;
107 struct hlist_node *n;
108 int young = 0;
109
110 rcu_read_lock();
111 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
112 if (mn->ops->test_young) {
113 young = mn->ops->test_young(mn, mm, address);
114 if (young)
115 break;
116 }
117 }
118 rcu_read_unlock();
119
120 return young;
121}
122
103void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, 123void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
104 pte_t pte) 124 pte_t pte)
105{ 125{
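The new hook lets a secondary MMU report whether a page was referenced without clearing the access state. A hedged sketch of wiring up the callback: the ops layout matches the loop above, while my_test_young and my_notifier are illustrative names and the handler body is a placeholder.

    #include <linux/mmu_notifier.h>

    static int my_test_young(struct mmu_notifier *mn, struct mm_struct *mm,
                             unsigned long address)
    {
        /* Return non-zero if the secondary MMU observed an access at address. */
        return 0;
    }

    static const struct mmu_notifier_ops my_ops = {
        .test_young = my_test_young,
    };

    static struct mmu_notifier my_notifier = { .ops = &my_ops };

    /* mmu_notifier_register(&my_notifier, mm) would add this notifier to the
     * hlist walked by __mmu_notifier_test_young() above. */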
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
87 return 1; 87 return 1;
88} 88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
90
91#ifdef CONFIG_SMP
92/* Called when a more accurate view of NR_FREE_PAGES is needed */
93unsigned long zone_nr_free_pages(struct zone *zone)
94{
95 unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
96
97 /*
98 * While kswapd is awake, it is considered the zone is under some
99 * memory pressure. Under pressure, there is a risk that
100 * per-cpu-counter-drift will allow the min watermark to be breached
101 * potentially causing a live-lock. While kswapd is awake and
102 * free pages are low, get a better estimate for free pages
103 */
104 if (nr_free_pages < zone->percpu_drift_mark &&
105 !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
106 return zone_page_state_snapshot(zone, NR_FREE_PAGES);
107
108 return nr_free_pages;
109}
110#endif /* CONFIG_SMP */
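zone_nr_free_pages() disappears because the drift handling moved into the generic zone_page_state helpers; the underlying idea is "global counter plus unsynchronised per-CPU deltas". A standalone toy model of that arithmetic (names are illustrative, cf. zone_page_state_snapshot()):

    #include <stdio.h>

    #define NR_CPUS 4

    struct toy_zone {
        long global_free;
        signed char cpu_delta[NR_CPUS];   /* batched, not yet folded in */
    };

    static long snapshot_free(const struct toy_zone *z)
    {
        long x = z->global_free;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            x += z->cpu_delta[cpu];       /* fold in the per-CPU drift */
        return x < 0 ? 0 : x;
    }

    int main(void)
    {
        struct toy_zone z = { .global_free = 100,
                              .cpu_delta = { -3, 5, 0, -1 } };
        printf("cheap=%ld exact=%ld\n", z.global_free, snapshot_free(&z));
        return 0;
    }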
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2d1bf7cf8851..5a688a2756be 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
78 pte_unmap_unlock(pte - 1, ptl); 78 pte_unmap_unlock(pte - 1, ptl);
79} 79}
80 80
81static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, 81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
82 unsigned long addr, unsigned long end, pgprot_t newprot, 82 unsigned long addr, unsigned long end, pgprot_t newprot,
83 int dirty_accountable) 83 int dirty_accountable)
84{ 84{
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
88 pmd = pmd_offset(pud, addr); 88 pmd = pmd_offset(pud, addr);
89 do { 89 do {
90 next = pmd_addr_end(addr, end); 90 next = pmd_addr_end(addr, end);
91 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma->vm_mm, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot))
95 continue;
96 /* fall through */
97 }
91 if (pmd_none_or_clear_bad(pmd)) 98 if (pmd_none_or_clear_bad(pmd))
92 continue; 99 continue;
93 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); 100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
101 dirty_accountable);
94 } while (pmd++, addr = next, addr != end); 102 } while (pmd++, addr = next, addr != end);
95} 103}
96 104
97static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, 105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
98 unsigned long addr, unsigned long end, pgprot_t newprot, 106 unsigned long addr, unsigned long end, pgprot_t newprot,
99 int dirty_accountable) 107 int dirty_accountable)
100{ 108{
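Every pagetable walker that can encounter a transparent huge pmd must now either handle it as a unit or split it and fall through. The recurring shape of that check, copied from the hunk above as a kernel-context fragment (it lives inside the pmd walk loop and is not standalone):

    if (pmd_trans_huge(*pmd)) {
        if (next - addr != HPAGE_PMD_SIZE)
            split_huge_page_pmd(vma->vm_mm, pmd);     /* partial range: split first */
        else if (change_huge_pmd(vma, pmd, addr, newprot))
            continue;                                 /* handled the whole huge pmd */
        /* fall through to the pte-level loop */
    }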
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
106 next = pud_addr_end(addr, end); 114 next = pud_addr_end(addr, end);
107 if (pud_none_or_clear_bad(pud)) 115 if (pud_none_or_clear_bad(pud))
108 continue; 116 continue;
109 change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); 117 change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable);
110 } while (pud++, addr = next, addr != end); 119 } while (pud++, addr = next, addr != end);
111} 120}
112 121
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma,
126 next = pgd_addr_end(addr, end); 135 next = pgd_addr_end(addr, end);
127 if (pgd_none_or_clear_bad(pgd)) 136 if (pgd_none_or_clear_bad(pgd))
128 continue; 137 continue;
129 change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); 138 change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable);
130 } while (pgd++, addr = next, addr != end); 140 } while (pgd++, addr = next, addr != end);
131 flush_tlb_range(vma, start, end); 141 flush_tlb_range(vma, start, end);
132} 142}
@@ -211,6 +221,7 @@ success:
211 mmu_notifier_invalidate_range_end(mm, start, end); 221 mmu_notifier_invalidate_range_end(mm, start, end);
212 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 222 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
213 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 223 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
224 perf_event_mmap(vma);
214 return 0; 225 return 0;
215 226
216fail: 227fail:
@@ -299,7 +310,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
299 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); 310 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
300 if (error) 311 if (error)
301 goto out; 312 goto out;
302 perf_event_mmap(vma);
303 nstart = tmp; 313 nstart = tmp;
304 314
305 if (nstart < prev->vm_end) 315 if (nstart < prev->vm_end)
diff --git a/mm/mremap.c b/mm/mremap.c
index cde56ee51ef7..506fa44403df 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
41 return NULL; 41 return NULL;
42 42
43 pmd = pmd_offset(pud, addr); 43 pmd = pmd_offset(pud, addr);
44 split_huge_page_pmd(mm, pmd);
44 if (pmd_none_or_clear_bad(pmd)) 45 if (pmd_none_or_clear_bad(pmd))
45 return NULL; 46 return NULL;
46 47
47 return pmd; 48 return pmd;
48} 49}
49 50
50static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) 51static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
52 unsigned long addr)
51{ 53{
52 pgd_t *pgd; 54 pgd_t *pgd;
53 pud_t *pud; 55 pud_t *pud;
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
62 if (!pmd) 64 if (!pmd)
63 return NULL; 65 return NULL;
64 66
65 if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) 67 VM_BUG_ON(pmd_trans_huge(*pmd));
68 if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
66 return NULL; 69 return NULL;
67 70
68 return pmd; 71 return pmd;
@@ -90,10 +93,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
90 * and we propagate stale pages into the dst afterward. 93 * and we propagate stale pages into the dst afterward.
91 */ 94 */
92 mapping = vma->vm_file->f_mapping; 95 mapping = vma->vm_file->f_mapping;
93 spin_lock(&mapping->i_mmap_lock); 96 mutex_lock(&mapping->i_mmap_mutex);
94 if (new_vma->vm_truncate_count &&
95 new_vma->vm_truncate_count != vma->vm_truncate_count)
96 new_vma->vm_truncate_count = 0;
97 } 97 }
98 98
99 /* 99 /*
@@ -101,7 +101,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
101 * pte locks because exclusive mmap_sem prevents deadlock. 101 * pte locks because exclusive mmap_sem prevents deadlock.
102 */ 102 */
103 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); 103 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
104 new_pte = pte_offset_map_nested(new_pmd, new_addr); 104 new_pte = pte_offset_map(new_pmd, new_addr);
105 new_ptl = pte_lockptr(mm, new_pmd); 105 new_ptl = pte_lockptr(mm, new_pmd);
106 if (new_ptl != old_ptl) 106 if (new_ptl != old_ptl)
107 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 107 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -119,10 +119,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
119 arch_leave_lazy_mmu_mode(); 119 arch_leave_lazy_mmu_mode();
120 if (new_ptl != old_ptl) 120 if (new_ptl != old_ptl)
121 spin_unlock(new_ptl); 121 spin_unlock(new_ptl);
122 pte_unmap_nested(new_pte - 1); 122 pte_unmap(new_pte - 1);
123 pte_unmap_unlock(old_pte - 1, old_ptl); 123 pte_unmap_unlock(old_pte - 1, old_ptl);
124 if (mapping) 124 if (mapping)
125 spin_unlock(&mapping->i_mmap_lock); 125 mutex_unlock(&mapping->i_mmap_mutex);
126 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); 126 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
127} 127}
128 128
@@ -147,7 +147,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
147 old_pmd = get_old_pmd(vma->vm_mm, old_addr); 147 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
148 if (!old_pmd) 148 if (!old_pmd)
149 continue; 149 continue;
150 new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); 150 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
151 if (!new_pmd) 151 if (!new_pmd)
152 break; 152 break;
153 next = (new_addr + PMD_SIZE) & PMD_MASK; 153 next = (new_addr + PMD_SIZE) & PMD_MASK;
@@ -276,9 +276,16 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
276 if (old_len > vma->vm_end - addr) 276 if (old_len > vma->vm_end - addr)
277 goto Efault; 277 goto Efault;
278 278
279 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { 279 /* Need to be careful about a growing mapping */
280 if (new_len > old_len) 280 if (new_len > old_len) {
281 unsigned long pgoff;
282
283 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
281 goto Efault; 284 goto Efault;
285 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
286 pgoff += vma->vm_pgoff;
287 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
288 goto Einval;
282 } 289 }
283 290
284 if (vma->vm_flags & VM_LOCKED) { 291 if (vma->vm_flags & VM_LOCKED) {
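The growth path in vma_to_resize() now guards against the grown mapping's last page offset wrapping around. A standalone sketch of that arithmetic (PAGE_SHIFT and the field names are the only kernel assumptions; the function name is illustrative):

    #include <stdio.h>

    #define PAGE_SHIFT 12

    static int grow_would_wrap(unsigned long vm_start, unsigned long vm_pgoff,
                               unsigned long addr, unsigned long new_len)
    {
        unsigned long pgoff = (addr - vm_start) >> PAGE_SHIFT;

        pgoff += vm_pgoff;
        /* true if the file offset of the last page overflows */
        return pgoff + (new_len >> PAGE_SHIFT) < pgoff;
    }

    int main(void)
    {
        printf("%d\n", grow_would_wrap(0x1000, ~0UL - 1, 0x1000,
                                       16UL << PAGE_SHIFT)); /* 1: wraps */
        return 0;
    }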
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
new file mode 100644
index 000000000000..6e93dc7f2586
--- /dev/null
+++ b/mm/nobootmem.c
@@ -0,0 +1,404 @@
1/*
2 * bootmem - A boot-time physical memory allocator and configurator
3 *
4 * Copyright (C) 1999 Ingo Molnar
5 * 1999 Kanoj Sarcar, SGI
6 * 2008 Johannes Weiner
7 *
8 * Access to this subsystem has to be serialized externally (which is true
9 * for the boot process anyway).
10 */
11#include <linux/init.h>
12#include <linux/pfn.h>
13#include <linux/slab.h>
14#include <linux/bootmem.h>
15#include <linux/module.h>
16#include <linux/kmemleak.h>
17#include <linux/range.h>
18#include <linux/memblock.h>
19
20#include <asm/bug.h>
21#include <asm/io.h>
22#include <asm/processor.h>
23
24#include "internal.h"
25
26#ifndef CONFIG_NEED_MULTIPLE_NODES
27struct pglist_data __refdata contig_page_data;
28EXPORT_SYMBOL(contig_page_data);
29#endif
30
31unsigned long max_low_pfn;
32unsigned long min_low_pfn;
33unsigned long max_pfn;
34
35static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
36 u64 goal, u64 limit)
37{
38 void *ptr;
39 u64 addr;
40
41 if (limit > memblock.current_limit)
42 limit = memblock.current_limit;
43
44 addr = find_memory_core_early(nid, size, align, goal, limit);
45
46 if (addr == MEMBLOCK_ERROR)
47 return NULL;
48
49 ptr = phys_to_virt(addr);
50 memset(ptr, 0, size);
51 memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
52 /*
53 * The min_count is set to 0 so that bootmem allocated blocks
54 * are never reported as leaks.
55 */
56 kmemleak_alloc(ptr, size, 0, 0);
57 return ptr;
58}
59
60/*
61 * free_bootmem_late - free bootmem pages directly to page allocator
62 * @addr: starting address of the range
63 * @size: size of the range in bytes
64 *
65 * This is only useful when the bootmem allocator has already been torn
66 * down, but we are still initializing the system. Pages are given directly
67 * to the page allocator, no bootmem metadata is updated because it is gone.
68 */
69void __init free_bootmem_late(unsigned long addr, unsigned long size)
70{
71 unsigned long cursor, end;
72
73 kmemleak_free_part(__va(addr), size);
74
75 cursor = PFN_UP(addr);
76 end = PFN_DOWN(addr + size);
77
78 for (; cursor < end; cursor++) {
79 __free_pages_bootmem(pfn_to_page(cursor), 0);
80 totalram_pages++;
81 }
82}
83
84static void __init __free_pages_memory(unsigned long start, unsigned long end)
85{
86 int i;
87 unsigned long start_aligned, end_aligned;
88 int order = ilog2(BITS_PER_LONG);
89
90 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
91 end_aligned = end & ~(BITS_PER_LONG - 1);
92
93 if (end_aligned <= start_aligned) {
94 for (i = start; i < end; i++)
95 __free_pages_bootmem(pfn_to_page(i), 0);
96
97 return;
98 }
99
100 for (i = start; i < start_aligned; i++)
101 __free_pages_bootmem(pfn_to_page(i), 0);
102
103 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
104 __free_pages_bootmem(pfn_to_page(i), order);
105
106 for (i = end_aligned; i < end; i++)
107 __free_pages_bootmem(pfn_to_page(i), 0);
108}
109
110unsigned long __init free_all_memory_core_early(int nodeid)
111{
112 int i;
113 u64 start, end;
114 unsigned long count = 0;
115 struct range *range = NULL;
116 int nr_range;
117
118 nr_range = get_free_all_memory_range(&range, nodeid);
119
120 for (i = 0; i < nr_range; i++) {
121 start = range[i].start;
122 end = range[i].end;
123 count += end - start;
124 __free_pages_memory(start, end);
125 }
126
127 return count;
128}
129
130/**
131 * free_all_bootmem_node - release a node's free pages to the buddy allocator
132 * @pgdat: node to be released
133 *
134 * Returns the number of pages actually released.
135 */
136unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
137{
138 register_page_bootmem_info_node(pgdat);
139
140 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
141 return 0;
142}
143
144/**
145 * free_all_bootmem - release free pages to the buddy allocator
146 *
147 * Returns the number of pages actually released.
148 */
149unsigned long __init free_all_bootmem(void)
150{
151 /*
152 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
 153 * because in some cases, such as when Node0 has no RAM installed,
 154 * low RAM will be on Node1.
 155 * Using MAX_NUMNODES makes sure all ranges in early_node_map[]
 156 * are used, instead of only the Node0-related ones.
157 */
158 return free_all_memory_core_early(MAX_NUMNODES);
159}
160
161/**
162 * free_bootmem_node - mark a page range as usable
163 * @pgdat: node the range resides on
164 * @physaddr: starting address of the range
165 * @size: size of the range in bytes
166 *
167 * Partial pages will be considered reserved and left as they are.
168 *
169 * The range must reside completely on the specified node.
170 */
171void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
172 unsigned long size)
173{
174 kmemleak_free_part(__va(physaddr), size);
175 memblock_x86_free_range(physaddr, physaddr + size);
176}
177
178/**
179 * free_bootmem - mark a page range as usable
180 * @addr: starting address of the range
181 * @size: size of the range in bytes
182 *
183 * Partial pages will be considered reserved and left as they are.
184 *
185 * The range must be contiguous but may span node boundaries.
186 */
187void __init free_bootmem(unsigned long addr, unsigned long size)
188{
189 kmemleak_free_part(__va(addr), size);
190 memblock_x86_free_range(addr, addr + size);
191}
192
193static void * __init ___alloc_bootmem_nopanic(unsigned long size,
194 unsigned long align,
195 unsigned long goal,
196 unsigned long limit)
197{
198 void *ptr;
199
200 if (WARN_ON_ONCE(slab_is_available()))
201 return kzalloc(size, GFP_NOWAIT);
202
203restart:
204
205 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
206
207 if (ptr)
208 return ptr;
209
210 if (goal != 0) {
211 goal = 0;
212 goto restart;
213 }
214
215 return NULL;
216}
217
218/**
219 * __alloc_bootmem_nopanic - allocate boot memory without panicking
220 * @size: size of the request in bytes
221 * @align: alignment of the region
222 * @goal: preferred starting address of the region
223 *
224 * The goal is dropped if it can not be satisfied and the allocation will
225 * fall back to memory below @goal.
226 *
227 * Allocation may happen on any node in the system.
228 *
229 * Returns NULL on failure.
230 */
231void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
232 unsigned long goal)
233{
234 unsigned long limit = -1UL;
235
236 return ___alloc_bootmem_nopanic(size, align, goal, limit);
237}
238
239static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
240 unsigned long goal, unsigned long limit)
241{
242 void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
243
244 if (mem)
245 return mem;
246 /*
247 * Whoops, we cannot satisfy the allocation request.
248 */
249 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
250 panic("Out of memory");
251 return NULL;
252}
253
254/**
255 * __alloc_bootmem - allocate boot memory
256 * @size: size of the request in bytes
257 * @align: alignment of the region
258 * @goal: preferred starting address of the region
259 *
260 * The goal is dropped if it can not be satisfied and the allocation will
261 * fall back to memory below @goal.
262 *
263 * Allocation may happen on any node in the system.
264 *
265 * The function panics if the request can not be satisfied.
266 */
267void * __init __alloc_bootmem(unsigned long size, unsigned long align,
268 unsigned long goal)
269{
270 unsigned long limit = -1UL;
271
272 return ___alloc_bootmem(size, align, goal, limit);
273}
274
275/**
276 * __alloc_bootmem_node - allocate boot memory from a specific node
277 * @pgdat: node to allocate from
278 * @size: size of the request in bytes
279 * @align: alignment of the region
280 * @goal: preferred starting address of the region
281 *
282 * The goal is dropped if it can not be satisfied and the allocation will
283 * fall back to memory below @goal.
284 *
285 * Allocation may fall back to any node in the system if the specified node
286 * can not hold the requested memory.
287 *
288 * The function panics if the request can not be satisfied.
289 */
290void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
291 unsigned long align, unsigned long goal)
292{
293 void *ptr;
294
295 if (WARN_ON_ONCE(slab_is_available()))
296 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
297
298 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
299 goal, -1ULL);
300 if (ptr)
301 return ptr;
302
303 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
304 goal, -1ULL);
305}
306
307void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
308 unsigned long align, unsigned long goal)
309{
310 return __alloc_bootmem_node(pgdat, size, align, goal);
311}
312
313#ifdef CONFIG_SPARSEMEM
314/**
315 * alloc_bootmem_section - allocate boot memory from a specific section
316 * @size: size of the request in bytes
317 * @section_nr: sparse map section to allocate from
318 *
319 * Return NULL on failure.
320 */
321void * __init alloc_bootmem_section(unsigned long size,
322 unsigned long section_nr)
323{
324 unsigned long pfn, goal, limit;
325
326 pfn = section_nr_to_pfn(section_nr);
327 goal = pfn << PAGE_SHIFT;
328 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
329
330 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
331 SMP_CACHE_BYTES, goal, limit);
332}
333#endif
334
335void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
336 unsigned long align, unsigned long goal)
337{
338 void *ptr;
339
340 if (WARN_ON_ONCE(slab_is_available()))
341 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
342
343 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
344 goal, -1ULL);
345 if (ptr)
346 return ptr;
347
348 return __alloc_bootmem_nopanic(size, align, goal);
349}
350
351#ifndef ARCH_LOW_ADDRESS_LIMIT
352#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
353#endif
354
355/**
356 * __alloc_bootmem_low - allocate low boot memory
357 * @size: size of the request in bytes
358 * @align: alignment of the region
359 * @goal: preferred starting address of the region
360 *
361 * The goal is dropped if it can not be satisfied and the allocation will
362 * fall back to memory below @goal.
363 *
364 * Allocation may happen on any node in the system.
365 *
366 * The function panics if the request can not be satisfied.
367 */
368void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
369 unsigned long goal)
370{
371 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
372}
373
374/**
375 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
376 * @pgdat: node to allocate from
377 * @size: size of the request in bytes
378 * @align: alignment of the region
379 * @goal: preferred starting address of the region
380 *
381 * The goal is dropped if it can not be satisfied and the allocation will
382 * fall back to memory below @goal.
383 *
384 * Allocation may fall back to any node in the system if the specified node
385 * can not hold the requested memory.
386 *
387 * The function panics if the request can not be satisfied.
388 */
389void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
390 unsigned long align, unsigned long goal)
391{
392 void *ptr;
393
394 if (WARN_ON_ONCE(slab_is_available()))
395 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
396
397 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
398 goal, ARCH_LOW_ADDRESS_LIMIT);
399 if (ptr)
400 return ptr;
401
402 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
403 goal, ARCH_LOW_ADDRESS_LIMIT);
404}
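All of the bootmem entry points above reduce to one memblock-backed core helper with a "retry without the goal" fallback. A hedged standalone sketch of that retry shape; find_region() is a toy stand-in for find_memory_core_early() and always fails while a goal is set:

    #include <stdio.h>

    static unsigned long find_region(unsigned long size, unsigned long goal)
    {
        (void)size;                       /* unused in this toy model */
        return goal ? 0 : 0x100000;       /* 0 means "not found" here */
    }

    static unsigned long alloc_nopanic(unsigned long size, unsigned long goal)
    {
        unsigned long addr;
    restart:
        addr = find_region(size, goal);
        if (addr)
            return addr;
        if (goal) {                       /* drop the goal and retry once */
            goal = 0;
            goto restart;
        }
        return 0;
    }

    int main(void)
    {
        printf("%#lx\n", alloc_nopanic(4096, 0x1000000));
        return 0;
    }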
diff --git a/mm/nommu.c b/mm/nommu.c
index 88ff091eb07a..9edc897a3970 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> 10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> 11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> 12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
13 * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> 13 * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
@@ -29,6 +29,7 @@
29#include <linux/personality.h> 29#include <linux/personality.h>
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/audit.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/tlb.h> 35#include <asm/tlb.h>
@@ -126,7 +127,8 @@ unsigned int kobjsize(const void *objp)
126 127
127int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 128int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
128 unsigned long start, int nr_pages, unsigned int foll_flags, 129 unsigned long start, int nr_pages, unsigned int foll_flags,
129 struct page **pages, struct vm_area_struct **vmas) 130 struct page **pages, struct vm_area_struct **vmas,
131 int *retry)
130{ 132{
131 struct vm_area_struct *vma; 133 struct vm_area_struct *vma;
132 unsigned long vm_flags; 134 unsigned long vm_flags;
@@ -184,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
184 if (force) 186 if (force)
185 flags |= FOLL_FORCE; 187 flags |= FOLL_FORCE;
186 188
187 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 189 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
190 NULL);
188} 191}
189EXPORT_SYMBOL(get_user_pages); 192EXPORT_SYMBOL(get_user_pages);
190 193
@@ -293,12 +296,60 @@ void *vmalloc(unsigned long size)
293} 296}
294EXPORT_SYMBOL(vmalloc); 297EXPORT_SYMBOL(vmalloc);
295 298
299/*
 300 * vzalloc - allocate virtually contiguous memory with zero fill
301 *
302 * @size: allocation size
303 *
304 * Allocate enough pages to cover @size from the page level
 305 * allocator and map them into contiguous kernel virtual space.
306 * The memory allocated is set to zero.
307 *
308 * For tight control over page level allocator and protection flags
309 * use __vmalloc() instead.
310 */
311void *vzalloc(unsigned long size)
312{
313 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
314 PAGE_KERNEL);
315}
316EXPORT_SYMBOL(vzalloc);
317
318/**
319 * vmalloc_node - allocate memory on a specific node
320 * @size: allocation size
321 * @node: numa node
322 *
323 * Allocate enough pages to cover @size from the page level
324 * allocator and map them into contiguous kernel virtual space.
325 *
326 * For tight control over page level allocator and protection flags
327 * use __vmalloc() instead.
328 */
296void *vmalloc_node(unsigned long size, int node) 329void *vmalloc_node(unsigned long size, int node)
297{ 330{
298 return vmalloc(size); 331 return vmalloc(size);
299} 332}
300EXPORT_SYMBOL(vmalloc_node); 333EXPORT_SYMBOL(vmalloc_node);
301 334
335/**
336 * vzalloc_node - allocate memory on a specific node with zero fill
337 * @size: allocation size
338 * @node: numa node
339 *
340 * Allocate enough pages to cover @size from the page level
341 * allocator and map them into contiguous kernel virtual space.
342 * The memory allocated is set to zero.
343 *
344 * For tight control over page level allocator and protection flags
345 * use __vmalloc() instead.
346 */
347void *vzalloc_node(unsigned long size, int node)
348{
349 return vzalloc(size);
350}
351EXPORT_SYMBOL(vzalloc_node);
352
302#ifndef PAGE_KERNEL_EXEC 353#ifndef PAGE_KERNEL_EXEC
303# define PAGE_KERNEL_EXEC PAGE_KERNEL 354# define PAGE_KERNEL_EXEC PAGE_KERNEL
304#endif 355#endif
@@ -392,6 +443,31 @@ void __attribute__((weak)) vmalloc_sync_all(void)
392{ 443{
393} 444}
394 445
446/**
447 * alloc_vm_area - allocate a range of kernel address space
448 * @size: size of the area
449 *
450 * Returns: NULL on failure, vm_struct on success
451 *
452 * This function reserves a range of kernel address space, and
453 * allocates pagetables to map that range. No actual mappings
454 * are created. If the kernel address space is not shared
455 * between processes, it syncs the pagetable across all
456 * processes.
457 */
458struct vm_struct *alloc_vm_area(size_t size)
459{
460 BUG();
461 return NULL;
462}
463EXPORT_SYMBOL_GPL(alloc_vm_area);
464
465void free_vm_area(struct vm_struct *area)
466{
467 BUG();
468}
469EXPORT_SYMBOL_GPL(free_vm_area);
470
395int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 471int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
396 struct page *page) 472 struct page *page)
397{ 473{
@@ -604,9 +680,9 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
604 */ 680 */
605static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) 681static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
606{ 682{
607 struct vm_area_struct *pvma, **pp, *next; 683 struct vm_area_struct *pvma, *prev;
608 struct address_space *mapping; 684 struct address_space *mapping;
609 struct rb_node **p, *parent; 685 struct rb_node **p, *parent, *rb_prev;
610 686
611 kenter(",%p", vma); 687 kenter(",%p", vma);
612 688
@@ -627,7 +703,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
627 } 703 }
628 704
629 /* add the VMA to the tree */ 705 /* add the VMA to the tree */
630 parent = NULL; 706 parent = rb_prev = NULL;
631 p = &mm->mm_rb.rb_node; 707 p = &mm->mm_rb.rb_node;
632 while (*p) { 708 while (*p) {
633 parent = *p; 709 parent = *p;
@@ -637,17 +713,20 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
637 * (the latter is necessary as we may get identical VMAs) */ 713 * (the latter is necessary as we may get identical VMAs) */
638 if (vma->vm_start < pvma->vm_start) 714 if (vma->vm_start < pvma->vm_start)
639 p = &(*p)->rb_left; 715 p = &(*p)->rb_left;
640 else if (vma->vm_start > pvma->vm_start) 716 else if (vma->vm_start > pvma->vm_start) {
717 rb_prev = parent;
641 p = &(*p)->rb_right; 718 p = &(*p)->rb_right;
642 else if (vma->vm_end < pvma->vm_end) 719 } else if (vma->vm_end < pvma->vm_end)
643 p = &(*p)->rb_left; 720 p = &(*p)->rb_left;
644 else if (vma->vm_end > pvma->vm_end) 721 else if (vma->vm_end > pvma->vm_end) {
722 rb_prev = parent;
645 p = &(*p)->rb_right; 723 p = &(*p)->rb_right;
646 else if (vma < pvma) 724 } else if (vma < pvma)
647 p = &(*p)->rb_left; 725 p = &(*p)->rb_left;
648 else if (vma > pvma) 726 else if (vma > pvma) {
727 rb_prev = parent;
649 p = &(*p)->rb_right; 728 p = &(*p)->rb_right;
650 else 729 } else
651 BUG(); 730 BUG();
652 } 731 }
653 732
@@ -655,20 +734,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
655 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 734 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
656 735
657 /* add VMA to the VMA list also */ 736 /* add VMA to the VMA list also */
658 for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { 737 prev = NULL;
659 if (pvma->vm_start > vma->vm_start) 738 if (rb_prev)
660 break; 739 prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
661 if (pvma->vm_start < vma->vm_start)
662 continue;
663 if (pvma->vm_end < vma->vm_end)
664 break;
665 }
666 740
667 next = *pp; 741 __vma_link_list(mm, vma, prev, parent);
668 *pp = vma;
669 vma->vm_next = next;
670 if (next)
671 next->vm_prev = vma;
672} 742}
673 743
674/* 744/*
@@ -676,7 +746,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
676 */ 746 */
677static void delete_vma_from_mm(struct vm_area_struct *vma) 747static void delete_vma_from_mm(struct vm_area_struct *vma)
678{ 748{
679 struct vm_area_struct **pp;
680 struct address_space *mapping; 749 struct address_space *mapping;
681 struct mm_struct *mm = vma->vm_mm; 750 struct mm_struct *mm = vma->vm_mm;
682 751
@@ -699,12 +768,14 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
699 768
700 /* remove from the MM's tree and list */ 769 /* remove from the MM's tree and list */
701 rb_erase(&vma->vm_rb, &mm->mm_rb); 770 rb_erase(&vma->vm_rb, &mm->mm_rb);
702 for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { 771
703 if (*pp == vma) { 772 if (vma->vm_prev)
704 *pp = vma->vm_next; 773 vma->vm_prev->vm_next = vma->vm_next;
705 break; 774 else
706 } 775 mm->mmap = vma->vm_next;
707 } 776
777 if (vma->vm_next)
778 vma->vm_next->vm_prev = vma->vm_prev;
708 779
709 vma->vm_mm = NULL; 780 vma->vm_mm = NULL;
710} 781}
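delete_vma_from_mm() now unlinks from the doubly-linked VMA list in O(1) via ->vm_prev instead of rescanning the list. A minimal standalone sketch of that unlink; the node type is illustrative and only mirrors the two pointer fields:

    struct toy_vma { struct toy_vma *vm_next, *vm_prev; };

    static void unlink_vma(struct toy_vma **mmap, struct toy_vma *vma)
    {
        if (vma->vm_prev)
            vma->vm_prev->vm_next = vma->vm_next;
        else
            *mmap = vma->vm_next;          /* vma was the list head */

        if (vma->vm_next)
            vma->vm_next->vm_prev = vma->vm_prev;
    }

    int main(void)
    {
        struct toy_vma a = { 0 }, b = { 0 };
        struct toy_vma *mmap = &a;
        a.vm_next = &b;
        b.vm_prev = &a;
        unlink_vma(&mmap, &a);             /* mmap now points at b */
        return mmap == &b ? 0 : 1;
    }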
@@ -733,17 +804,15 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
733struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 804struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
734{ 805{
735 struct vm_area_struct *vma; 806 struct vm_area_struct *vma;
736 struct rb_node *n = mm->mm_rb.rb_node;
737 807
738 /* check the cache first */ 808 /* check the cache first */
739 vma = mm->mmap_cache; 809 vma = mm->mmap_cache;
740 if (vma && vma->vm_start <= addr && vma->vm_end > addr) 810 if (vma && vma->vm_start <= addr && vma->vm_end > addr)
741 return vma; 811 return vma;
742 812
743 /* trawl the tree (there may be multiple mappings in which addr 813 /* trawl the list (there may be multiple mappings in which addr
744 * resides) */ 814 * resides) */
745 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { 815 for (vma = mm->mmap; vma; vma = vma->vm_next) {
746 vma = rb_entry(n, struct vm_area_struct, vm_rb);
747 if (vma->vm_start > addr) 816 if (vma->vm_start > addr)
748 return NULL; 817 return NULL;
749 if (vma->vm_end > addr) { 818 if (vma->vm_end > addr) {
@@ -783,7 +852,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
783 unsigned long len) 852 unsigned long len)
784{ 853{
785 struct vm_area_struct *vma; 854 struct vm_area_struct *vma;
786 struct rb_node *n = mm->mm_rb.rb_node;
787 unsigned long end = addr + len; 855 unsigned long end = addr + len;
788 856
789 /* check the cache first */ 857 /* check the cache first */
@@ -791,10 +859,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
791 if (vma && vma->vm_start == addr && vma->vm_end == end) 859 if (vma && vma->vm_start == addr && vma->vm_end == end)
792 return vma; 860 return vma;
793 861
794 /* trawl the tree (there may be multiple mappings in which addr 862 /* trawl the list (there may be multiple mappings in which addr
795 * resides) */ 863 * resides) */
796 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { 864 for (vma = mm->mmap; vma; vma = vma->vm_next) {
797 vma = rb_entry(n, struct vm_area_struct, vm_rb);
798 if (vma->vm_start < addr) 865 if (vma->vm_start < addr)
799 continue; 866 continue;
800 if (vma->vm_start > addr) 867 if (vma->vm_start > addr)
@@ -1057,7 +1124,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1057 unsigned long capabilities) 1124 unsigned long capabilities)
1058{ 1125{
1059 struct page *pages; 1126 struct page *pages;
1060 unsigned long total, point, n, rlen; 1127 unsigned long total, point, n;
1061 void *base; 1128 void *base;
1062 int ret, order; 1129 int ret, order;
1063 1130
@@ -1081,13 +1148,12 @@ static int do_mmap_private(struct vm_area_struct *vma,
1081 * make a private copy of the data and map that instead */ 1148 * make a private copy of the data and map that instead */
1082 } 1149 }
1083 1150
1084 rlen = PAGE_ALIGN(len);
1085 1151
1086 /* allocate some memory to hold the mapping 1152 /* allocate some memory to hold the mapping
1087 * - note that this may not return a page-aligned address if the object 1153 * - note that this may not return a page-aligned address if the object
1088 * we're allocating is smaller than a page 1154 * we're allocating is smaller than a page
1089 */ 1155 */
1090 order = get_order(rlen); 1156 order = get_order(len);
1091 kdebug("alloc order %d for %lx", order, len); 1157 kdebug("alloc order %d for %lx", order, len);
1092 1158
1093 pages = alloc_pages(GFP_KERNEL, order); 1159 pages = alloc_pages(GFP_KERNEL, order);
@@ -1097,7 +1163,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1097 total = 1 << order; 1163 total = 1 << order;
1098 atomic_long_add(total, &mmap_pages_allocated); 1164 atomic_long_add(total, &mmap_pages_allocated);
1099 1165
1100 point = rlen >> PAGE_SHIFT; 1166 point = len >> PAGE_SHIFT;
1101 1167
1102 /* we allocated a power-of-2 sized page set, so we may want to trim off 1168 /* we allocated a power-of-2 sized page set, so we may want to trim off
1103 * the excess */ 1169 * the excess */
@@ -1119,7 +1185,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1119 base = page_address(pages); 1185 base = page_address(pages);
1120 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; 1186 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1121 region->vm_start = (unsigned long) base; 1187 region->vm_start = (unsigned long) base;
1122 region->vm_end = region->vm_start + rlen; 1188 region->vm_end = region->vm_start + len;
1123 region->vm_top = region->vm_start + (total << PAGE_SHIFT); 1189 region->vm_top = region->vm_start + (total << PAGE_SHIFT);
1124 1190
1125 vma->vm_start = region->vm_start; 1191 vma->vm_start = region->vm_start;
@@ -1135,22 +1201,22 @@ static int do_mmap_private(struct vm_area_struct *vma,
1135 1201
1136 old_fs = get_fs(); 1202 old_fs = get_fs();
1137 set_fs(KERNEL_DS); 1203 set_fs(KERNEL_DS);
1138 ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); 1204 ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
1139 set_fs(old_fs); 1205 set_fs(old_fs);
1140 1206
1141 if (ret < 0) 1207 if (ret < 0)
1142 goto error_free; 1208 goto error_free;
1143 1209
1144 /* clear the last little bit */ 1210 /* clear the last little bit */
1145 if (ret < rlen) 1211 if (ret < len)
1146 memset(base + ret, 0, rlen - ret); 1212 memset(base + ret, 0, len - ret);
1147 1213
1148 } 1214 }
1149 1215
1150 return 0; 1216 return 0;
1151 1217
1152error_free: 1218error_free:
1153 free_page_series(region->vm_start, region->vm_end); 1219 free_page_series(region->vm_start, region->vm_top);
1154 region->vm_start = vma->vm_start = 0; 1220 region->vm_start = vma->vm_start = 0;
1155 region->vm_end = vma->vm_end = 0; 1221 region->vm_end = vma->vm_end = 0;
1156 region->vm_top = 0; 1222 region->vm_top = 0;
@@ -1159,7 +1225,7 @@ error_free:
1159enomem: 1225enomem:
1160 printk("Allocation of length %lu from process %d (%s) failed\n", 1226 printk("Allocation of length %lu from process %d (%s) failed\n",
1161 len, current->pid, current->comm); 1227 len, current->pid, current->comm);
1162 show_free_areas(); 1228 show_free_areas(0);
1163 return -ENOMEM; 1229 return -ENOMEM;
1164} 1230}
1165 1231
@@ -1192,6 +1258,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1192 1258
1193 /* we ignore the address hint */ 1259 /* we ignore the address hint */
1194 addr = 0; 1260 addr = 0;
1261 len = PAGE_ALIGN(len);
1195 1262
1196 /* we've determined that we can make the mapping, now translate what we 1263 /* we've determined that we can make the mapping, now translate what we
1197 * now know into VMA flags */ 1264 * now know into VMA flags */
@@ -1309,15 +1376,15 @@ unsigned long do_mmap_pgoff(struct file *file,
1309 if (capabilities & BDI_CAP_MAP_DIRECT) { 1376 if (capabilities & BDI_CAP_MAP_DIRECT) {
1310 addr = file->f_op->get_unmapped_area(file, addr, len, 1377 addr = file->f_op->get_unmapped_area(file, addr, len,
1311 pgoff, flags); 1378 pgoff, flags);
1312 if (IS_ERR((void *) addr)) { 1379 if (IS_ERR_VALUE(addr)) {
1313 ret = addr; 1380 ret = addr;
1314 if (ret != (unsigned long) -ENOSYS) 1381 if (ret != -ENOSYS)
1315 goto error_just_free; 1382 goto error_just_free;
1316 1383
1317 /* the driver refused to tell us where to site 1384 /* the driver refused to tell us where to site
1318 * the mapping so we'll have to attempt to copy 1385 * the mapping so we'll have to attempt to copy
1319 * it */ 1386 * it */
1320 ret = (unsigned long) -ENODEV; 1387 ret = -ENODEV;
1321 if (!(capabilities & BDI_CAP_MAP_COPY)) 1388 if (!(capabilities & BDI_CAP_MAP_COPY))
1322 goto error_just_free; 1389 goto error_just_free;
1323 1390
@@ -1392,14 +1459,14 @@ error_getting_vma:
1392 printk(KERN_WARNING "Allocation of vma for %lu byte allocation" 1459 printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
1393 " from process %d failed\n", 1460 " from process %d failed\n",
1394 len, current->pid); 1461 len, current->pid);
1395 show_free_areas(); 1462 show_free_areas(0);
1396 return -ENOMEM; 1463 return -ENOMEM;
1397 1464
1398error_getting_region: 1465error_getting_region:
1399 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" 1466 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
1400 " from process %d failed\n", 1467 " from process %d failed\n",
1401 len, current->pid); 1468 len, current->pid);
1402 show_free_areas(); 1469 show_free_areas(0);
1403 return -ENOMEM; 1470 return -ENOMEM;
1404} 1471}
1405EXPORT_SYMBOL(do_mmap_pgoff); 1472EXPORT_SYMBOL(do_mmap_pgoff);
@@ -1411,6 +1478,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1411 struct file *file = NULL; 1478 struct file *file = NULL;
1412 unsigned long retval = -EBADF; 1479 unsigned long retval = -EBADF;
1413 1480
1481 audit_mmap_fd(fd, flags);
1414 if (!(flags & MAP_ANONYMOUS)) { 1482 if (!(flags & MAP_ANONYMOUS)) {
1415 file = fget(fd); 1483 file = fget(fd);
1416 if (!file) 1484 if (!file)
@@ -1567,15 +1635,17 @@ static int shrink_vma(struct mm_struct *mm,
1567int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 1635int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1568{ 1636{
1569 struct vm_area_struct *vma; 1637 struct vm_area_struct *vma;
1570 struct rb_node *rb; 1638 unsigned long end;
1571 unsigned long end = start + len;
1572 int ret; 1639 int ret;
1573 1640
1574 kenter(",%lx,%zx", start, len); 1641 kenter(",%lx,%zx", start, len);
1575 1642
1643 len = PAGE_ALIGN(len);
1576 if (len == 0) 1644 if (len == 0)
1577 return -EINVAL; 1645 return -EINVAL;
1578 1646
1647 end = start + len;
1648
1579 /* find the first potentially overlapping VMA */ 1649 /* find the first potentially overlapping VMA */
1580 vma = find_vma(mm, start); 1650 vma = find_vma(mm, start);
1581 if (!vma) { 1651 if (!vma) {
@@ -1600,9 +1670,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1600 } 1670 }
1601 if (end == vma->vm_end) 1671 if (end == vma->vm_end)
1602 goto erase_whole_vma; 1672 goto erase_whole_vma;
1603 rb = rb_next(&vma->vm_rb); 1673 vma = vma->vm_next;
1604 vma = rb_entry(rb, struct vm_area_struct, vm_rb); 1674 } while (vma);
1605 } while (rb);
1606 kleave(" = -EINVAL [split file]"); 1675 kleave(" = -EINVAL [split file]");
1607 return -EINVAL; 1676 return -EINVAL;
1608 } else { 1677 } else {
@@ -1668,6 +1737,7 @@ void exit_mmap(struct mm_struct *mm)
1668 mm->mmap = vma->vm_next; 1737 mm->mmap = vma->vm_next;
1669 delete_vma_from_mm(vma); 1738 delete_vma_from_mm(vma);
1670 delete_vma(mm, vma); 1739 delete_vma(mm, vma);
1740 cond_resched();
1671 } 1741 }
1672 1742
1673 kleave(""); 1743 kleave("");
@@ -1695,6 +1765,8 @@ unsigned long do_mremap(unsigned long addr,
1695 struct vm_area_struct *vma; 1765 struct vm_area_struct *vma;
1696 1766
1697 /* insanity checks first */ 1767 /* insanity checks first */
1768 old_len = PAGE_ALIGN(old_len);
1769 new_len = PAGE_ALIGN(new_len);
1698 if (old_len == 0 || new_len == 0) 1770 if (old_len == 0 || new_len == 0)
1699 return (unsigned long) -EINVAL; 1771 return (unsigned long) -EINVAL;
1700 1772
@@ -1741,10 +1813,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1741 return NULL; 1813 return NULL;
1742} 1814}
1743 1815
1744int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, 1816int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1745 unsigned long to, unsigned long size, pgprot_t prot) 1817 unsigned long pfn, unsigned long size, pgprot_t prot)
1746{ 1818{
1747 vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; 1819 if (addr != (pfn << PAGE_SHIFT))
1820 return -EINVAL;
1821
1822 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1748 return 0; 1823 return 0;
1749} 1824}
1750EXPORT_SYMBOL(remap_pfn_range); 1825EXPORT_SYMBOL(remap_pfn_range);
@@ -1764,10 +1839,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1764} 1839}
1765EXPORT_SYMBOL(remap_vmalloc_range); 1840EXPORT_SYMBOL(remap_vmalloc_range);
1766 1841
1767void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1768{
1769}
1770
1771unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, 1842unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1772 unsigned long len, unsigned long pgoff, unsigned long flags) 1843 unsigned long len, unsigned long pgoff, unsigned long flags)
1773{ 1844{
@@ -1885,7 +1956,7 @@ error:
1885 return -ENOMEM; 1956 return -ENOMEM;
1886} 1957}
1887 1958
1888int in_gate_area_no_task(unsigned long addr) 1959int in_gate_area_no_mm(unsigned long addr)
1889{ 1960{
1890 return 0; 1961 return 0;
1891} 1962}
@@ -1897,21 +1968,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1897} 1968}
1898EXPORT_SYMBOL(filemap_fault); 1969EXPORT_SYMBOL(filemap_fault);
1899 1970
1900/* 1971static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1901 * Access another process' address space. 1972 unsigned long addr, void *buf, int len, int write)
1902 * - source/target buffer must be kernel space
1903 */
1904int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
1905{ 1973{
1906 struct vm_area_struct *vma; 1974 struct vm_area_struct *vma;
1907 struct mm_struct *mm;
1908
1909 if (addr + len < addr)
1910 return 0;
1911
1912 mm = get_task_mm(tsk);
1913 if (!mm)
1914 return 0;
1915 1975
1916 down_read(&mm->mmap_sem); 1976 down_read(&mm->mmap_sem);
1917 1977
@@ -1936,6 +1996,43 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1936 } 1996 }
1937 1997
1938 up_read(&mm->mmap_sem); 1998 up_read(&mm->mmap_sem);
1999
2000 return len;
2001}
2002
2003/**
2004 * @access_remote_vm - access another process' address space
2005 * @mm: the mm_struct of the target address space
2006 * @addr: start address to access
2007 * @buf: source or destination buffer
2008 * @len: number of bytes to transfer
2009 * @write: whether the access is a write
2010 *
2011 * The caller must hold a reference on @mm.
2012 */
2013int access_remote_vm(struct mm_struct *mm, unsigned long addr,
2014 void *buf, int len, int write)
2015{
2016 return __access_remote_vm(NULL, mm, addr, buf, len, write);
2017}
2018
2019/*
2020 * Access another process' address space.
2021 * - source/target buffer must be kernel space
2022 */
2023int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2024{
2025 struct mm_struct *mm;
2026
2027 if (addr + len < addr)
2028 return 0;
2029
2030 mm = get_task_mm(tsk);
2031 if (!mm)
2032 return 0;
2033
2034 len = __access_remote_vm(tsk, mm, addr, buf, len, write);
2035
1939 mmput(mm); 2036 mmput(mm);
1940 return len; 2037 return len;
1941} 2038}
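
The nommu version now mirrors the MMU side: a single worker, __access_remote_vm(), does the copy against a caller-supplied mm, access_remote_vm() is a thin wrapper for callers that already hold an mm reference, and access_process_vm() keeps the old behaviour of looking the mm up with get_task_mm()/mmput(). The following userspace sketch models only that wrapper structure; the struct definitions and the byte-copy body are stand-ins, not kernel code.

#include <stdio.h>
#include <string.h>

/* Stand-ins for the kernel structures; only what the sketch needs. */
struct mm_struct { char space[256]; int users; };
struct task_struct { struct mm_struct *mm; };

/* Common worker: copies against an mm the caller has already pinned. */
static int fake_access_remote_vm(struct mm_struct *mm, unsigned long addr,
                                 void *buf, int len, int write)
{
    if (addr >= sizeof(mm->space))
        return 0;
    if (addr + len > sizeof(mm->space))
        len = (int)(sizeof(mm->space) - addr);
    if (write)
        memcpy(mm->space + addr, buf, len);
    else
        memcpy(buf, mm->space + addr, len);
    return len;                              /* bytes transferred */
}

/* Entry point 1: the caller already holds a reference on mm. */
static int fake_access_remote(struct mm_struct *mm, unsigned long addr,
                              void *buf, int len, int write)
{
    return fake_access_remote_vm(mm, addr, buf, len, write);
}

/* Entry point 2: resolve and pin the task's mm, copy, then drop it. */
static int fake_access_process(struct task_struct *tsk, unsigned long addr,
                               void *buf, int len, int write)
{
    struct mm_struct *mm = tsk->mm;          /* get_task_mm() stand-in */

    if (!mm)
        return 0;
    mm->users++;
    len = fake_access_remote_vm(mm, addr, buf, len, write);
    mm->users--;                             /* mmput() stand-in */
    return len;
}

int main(void)
{
    struct mm_struct mm = { .users = 1 };
    struct task_struct tsk = { .mm = &mm };
    char msg[] = "hello";
    char out[8] = "";

    fake_access_process(&tsk, 16, msg, sizeof(msg), 1);  /* write via the task */
    fake_access_remote(&mm, 16, out, sizeof(msg), 0);    /* read via the mm */
    printf("%s\n", out);
    return 0;
}
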
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4029583a1024..e4b0991ca351 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,12 +31,40 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
33#include <linux/security.h> 33#include <linux/security.h>
34#include <linux/ptrace.h>
34 35
35int sysctl_panic_on_oom; 36int sysctl_panic_on_oom;
36int sysctl_oom_kill_allocating_task; 37int sysctl_oom_kill_allocating_task;
37int sysctl_oom_dump_tasks = 1; 38int sysctl_oom_dump_tasks = 1;
38static DEFINE_SPINLOCK(zone_scan_lock); 39static DEFINE_SPINLOCK(zone_scan_lock);
39 40
41/**
42 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
43 * @new_val: new oom_score_adj value
44 *
45 * Sets the oom_score_adj value for current to @new_val with proper
46 * synchronization and returns the old value. Usually used to temporarily
47 * set a value, save the old value in the caller, and then reinstate it later.
48 */
49int test_set_oom_score_adj(int new_val)
50{
51 struct sighand_struct *sighand = current->sighand;
52 int old_val;
53
54 spin_lock_irq(&sighand->siglock);
55 old_val = current->signal->oom_score_adj;
56 if (new_val != old_val) {
57 if (new_val == OOM_SCORE_ADJ_MIN)
58 atomic_inc(&current->mm->oom_disable_count);
59 else if (old_val == OOM_SCORE_ADJ_MIN)
60 atomic_dec(&current->mm->oom_disable_count);
61 current->signal->oom_score_adj = new_val;
62 }
63 spin_unlock_irq(&sighand->siglock);
64
65 return old_val;
66}
67
40#ifdef CONFIG_NUMA 68#ifdef CONFIG_NUMA
41/** 69/**
42 * has_intersects_mems_allowed() - check task eligiblity for kill 70 * has_intersects_mems_allowed() - check task eligiblity for kill
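
test_set_oom_score_adj() exists for the set-temporarily-then-restore idiom its kernel-doc describes: callers stash the returned old value and write it back later, and the siglock keeps oom_score_adj and the mm's oom_disable_count in step. A small userspace model of that idiom follows; the mutex, globals and helper name are placeholders rather than kernel interfaces.

#include <stdio.h>
#include <pthread.h>

#define OOM_SCORE_ADJ_MIN (-1000)

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* siglock stand-in */
static int oom_score_adj;        /* models current->signal->oom_score_adj */
static int oom_disable_count;    /* models current->mm->oom_disable_count */

/* Set a new value, keep the disable counter in sync, return the old value. */
static int test_set_adj(int new_val)
{
    int old_val;

    pthread_mutex_lock(&lock);
    old_val = oom_score_adj;
    if (new_val != old_val) {
        if (new_val == OOM_SCORE_ADJ_MIN)
            oom_disable_count++;
        else if (old_val == OOM_SCORE_ADJ_MIN)
            oom_disable_count--;
        oom_score_adj = new_val;
    }
    pthread_mutex_unlock(&lock);
    return old_val;
}

int main(void)
{
    /* Temporarily make the task unkillable, then restore the saved value. */
    int saved = test_set_adj(OOM_SCORE_ADJ_MIN);

    printf("disabled: adj=%d count=%d\n", oom_score_adj, oom_disable_count);
    test_set_adj(saved);
    printf("restored: adj=%d count=%d\n", oom_score_adj, oom_disable_count);
    return 0;
}
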
@@ -83,24 +111,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
83#endif /* CONFIG_NUMA */ 111#endif /* CONFIG_NUMA */
84 112
85/* 113/*
86 * If this is a system OOM (not a memcg OOM) and the task selected to be
87 * killed is not already running at high (RT) priorities, speed up the
88 * recovery by boosting the dying task to the lowest FIFO priority.
89 * That helps with the recovery and avoids interfering with RT tasks.
90 */
91static void boost_dying_task_prio(struct task_struct *p,
92 struct mem_cgroup *mem)
93{
94 struct sched_param param = { .sched_priority = 1 };
95
96 if (mem)
97 return;
98
99 if (!rt_task(p))
100 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
101}
102
103/*
104 * The process p may have detached its own ->mm while exiting or through 114 * The process p may have detached its own ->mm while exiting or through
105 * use_mm(), but one or more of its subthreads may still have a valid 115 * use_mm(), but one or more of its subthreads may still have a valid
106 * pointer. Return p, or any of its subthreads with a valid ->mm, with 116 * pointer. Return p, or any of its subthreads with a valid ->mm, with
@@ -162,24 +172,16 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
162 return 0; 172 return 0;
163 173
164 /* 174 /*
165 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't 175 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
166 * need to be executed for something that cannot be killed. 176 * so the entire heuristic doesn't need to be executed for something
177 * that cannot be killed.
167 */ 178 */
168 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { 179 if (atomic_read(&p->mm->oom_disable_count)) {
169 task_unlock(p); 180 task_unlock(p);
170 return 0; 181 return 0;
171 } 182 }
172 183
173 /* 184 /*
174 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
175 * priority for oom killing.
176 */
177 if (p->flags & PF_OOM_ORIGIN) {
178 task_unlock(p);
179 return 1000;
180 }
181
182 /*
183 * The memory controller may have a limit of 0 bytes, so avoid a divide 185 * The memory controller may have a limit of 0 bytes, so avoid a divide
184 * by zero, if necessary. 186 * by zero, if necessary.
185 */ 187 */
@@ -188,10 +190,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
188 190
189 /* 191 /*
190 * The baseline for the badness score is the proportion of RAM that each 192 * The baseline for the badness score is the proportion of RAM that each
191 * task's rss and swap space use. 193 * task's rss, pagetable and swap space use.
192 */ 194 */
193 points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / 195 points = get_mm_rss(p->mm) + p->mm->nr_ptes;
194 totalpages; 196 points += get_mm_counter(p->mm, MM_SWAPENTS);
197
198 points *= 1000;
199 points /= totalpages;
195 task_unlock(p); 200 task_unlock(p);
196 201
197 /* 202 /*
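
With the shortcut and PF_OOM_ORIGIN special cases gone, the score is a plain proportion: rss, page-table pages and swap entries are summed, scaled to 0..1000 and divided by the pages available to the task. A worked example with invented numbers (roughly 611 of 1000 for the values below):

#include <stdio.h>

/* Proportional badness: (rss + nr_ptes + swapents) scaled to 0..1000. */
static unsigned int badness(unsigned long rss, unsigned long nr_ptes,
                            unsigned long swapents, unsigned long totalpages)
{
    unsigned long points = rss + nr_ptes + swapents;

    points *= 1000;
    points /= totalpages ? totalpages : 1;  /* mirror the divide-by-zero guard */
    return (unsigned int)points;
}

int main(void)
{
    /* A task with 300000 RSS pages, 600 page-table pages and 20000 swap
     * entries on a 2 GiB (524288-page) machine: illustrative values only. */
    printf("score = %u / 1000\n", badness(300000, 600, 20000, 524288));
    return 0;
}
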
@@ -291,13 +296,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
291 unsigned long totalpages, struct mem_cgroup *mem, 296 unsigned long totalpages, struct mem_cgroup *mem,
292 const nodemask_t *nodemask) 297 const nodemask_t *nodemask)
293{ 298{
294 struct task_struct *p; 299 struct task_struct *g, *p;
295 struct task_struct *chosen = NULL; 300 struct task_struct *chosen = NULL;
296 *ppoints = 0; 301 *ppoints = 0;
297 302
298 for_each_process(p) { 303 do_each_thread(g, p) {
299 unsigned int points; 304 unsigned int points;
300 305
306 if (!p->mm)
307 continue;
301 if (oom_unkillable_task(p, mem, nodemask)) 308 if (oom_unkillable_task(p, mem, nodemask))
302 continue; 309 continue;
303 310
@@ -313,22 +320,29 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
313 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 320 if (test_tsk_thread_flag(p, TIF_MEMDIE))
314 return ERR_PTR(-1UL); 321 return ERR_PTR(-1UL);
315 322
316 /* 323 if (p->flags & PF_EXITING) {
317 * This is in the process of releasing memory so wait for it 324 /*
318 * to finish before killing some other task by mistake. 325 * If p is the current task and is in the process of
319 * 326 * releasing memory, we allow the "kill" to set
320 * However, if p is the current task, we allow the 'kill' to 327 * TIF_MEMDIE, which will allow it to gain access to
321 * go ahead if it is exiting: this will simply set TIF_MEMDIE, 328 * memory reserves. Otherwise, it may stall forever.
322 * which will allow it to gain access to memory reserves in 329 *
323 * the process of exiting and releasing its resources. 330 * The loop isn't broken here, however, in case other
324 * Otherwise we could get an easy OOM deadlock. 331 * threads are found to have already been oom killed.
325 */ 332 */
326 if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { 333 if (p == current) {
327 if (p != current) 334 chosen = p;
328 return ERR_PTR(-1UL); 335 *ppoints = 1000;
329 336 } else {
330 chosen = p; 337 /*
331 *ppoints = 1000; 338 * If this task is not being ptraced on exit,
339 * then wait for it to finish before killing
340 * some other task unnecessarily.
341 */
342 if (!(task_ptrace(p->group_leader) &
343 PT_TRACE_EXIT))
344 return ERR_PTR(-1UL);
345 }
332 } 346 }
333 347
334 points = oom_badness(p, mem, nodemask, totalpages); 348 points = oom_badness(p, mem, nodemask, totalpages);
@@ -336,7 +350,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
336 chosen = p; 350 chosen = p;
337 *ppoints = points; 351 *ppoints = points;
338 } 352 }
339 } 353 } while_each_thread(g, p);
340 354
341 return chosen; 355 return chosen;
342} 356}
@@ -395,7 +409,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
395 task_unlock(current); 409 task_unlock(current);
396 dump_stack(); 410 dump_stack();
397 mem_cgroup_print_oom_info(mem, p); 411 mem_cgroup_print_oom_info(mem, p);
398 show_mem(); 412 show_mem(SHOW_MEM_FILTER_NODES);
399 if (sysctl_oom_dump_tasks) 413 if (sysctl_oom_dump_tasks)
400 dump_tasks(mem, nodemask); 414 dump_tasks(mem, nodemask);
401} 415}
@@ -403,27 +417,44 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
403#define K(x) ((x) << (PAGE_SHIFT-10)) 417#define K(x) ((x) << (PAGE_SHIFT-10))
404static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) 418static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
405{ 419{
420 struct task_struct *q;
421 struct mm_struct *mm;
422
406 p = find_lock_task_mm(p); 423 p = find_lock_task_mm(p);
407 if (!p) 424 if (!p)
408 return 1; 425 return 1;
409 426
427 /* mm cannot be safely dereferenced after task_unlock(p) */
428 mm = p->mm;
429
410 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 430 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
411 task_pid_nr(p), p->comm, K(p->mm->total_vm), 431 task_pid_nr(p), p->comm, K(p->mm->total_vm),
412 K(get_mm_counter(p->mm, MM_ANONPAGES)), 432 K(get_mm_counter(p->mm, MM_ANONPAGES)),
413 K(get_mm_counter(p->mm, MM_FILEPAGES))); 433 K(get_mm_counter(p->mm, MM_FILEPAGES)));
414 task_unlock(p); 434 task_unlock(p);
415 435
436 /*
437 * Kill all processes sharing p->mm in other thread groups, if any.
438 * They don't get access to memory reserves or a higher scheduler
439 * priority, though, to avoid depletion of all memory or task
440 * starvation. This prevents mm->mmap_sem livelock when an oom killed
441 * task cannot exit because it requires the semaphore and its contended
442 * by another thread trying to allocate memory itself. That thread will
443 * now get access to memory reserves since it has a pending fatal
444 * signal.
445 */
446 for_each_process(q)
447 if (q->mm == mm && !same_thread_group(q, p)) {
448 task_lock(q); /* Protect ->comm from prctl() */
449 pr_err("Kill process %d (%s) sharing same memory\n",
450 task_pid_nr(q), q->comm);
451 task_unlock(q);
452 force_sig(SIGKILL, q);
453 }
416 454
417 set_tsk_thread_flag(p, TIF_MEMDIE); 455 set_tsk_thread_flag(p, TIF_MEMDIE);
418 force_sig(SIGKILL, p); 456 force_sig(SIGKILL, p);
419 457
420 /*
421 * We give our sacrificial lamb high priority and access to
422 * all the memory it needs. That way it should be able to
423 * exit() and clear out its resources quickly...
424 */
425 boost_dying_task_prio(p, mem);
426
427 return 0; 458 return 0;
428} 459}
429#undef K 460#undef K
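
In place of the priority boost, oom_kill_task() now sweeps the process list and SIGKILLs every task outside the victim's thread group that shares its mm, so mmap_sem cannot stay held by a sibling that was never told to exit. A userspace model of that sweep over a fake task table (the struct and helpers are stand-ins):

#include <stdio.h>

struct fake_task {
    int pid;
    int tgid;          /* thread group id */
    const void *mm;    /* shared address space identity */
    int killed;
};

/* Kill every task sharing the victim's mm that is not in its thread group. */
static void kill_mm_sharers(struct fake_task *tasks, int n,
                            const struct fake_task *victim)
{
    for (int i = 0; i < n; i++) {
        struct fake_task *q = &tasks[i];

        if (q->mm == victim->mm && q->tgid != victim->tgid) {
            q->killed = 1;   /* force_sig(SIGKILL, q) stand-in */
            printf("Kill process %d sharing same memory\n", q->pid);
        }
    }
}

int main(void)
{
    static int mm_a, mm_b;   /* addresses used only as mm identities */
    struct fake_task tasks[] = {
        { 100, 100, &mm_a, 0 },  /* victim's group leader */
        { 101, 100, &mm_a, 0 },  /* same thread group: skipped */
        { 200, 200, &mm_a, 0 },  /* shares the mm (CLONE_VM): killed */
        { 300, 300, &mm_b, 0 },  /* unrelated: untouched */
    };

    kill_mm_sharers(tasks, 4, &tasks[0]);
    return 0;
}
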
@@ -447,7 +478,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
447 */ 478 */
448 if (p->flags & PF_EXITING) { 479 if (p->flags & PF_EXITING) {
449 set_tsk_thread_flag(p, TIF_MEMDIE); 480 set_tsk_thread_flag(p, TIF_MEMDIE);
450 boost_dying_task_prio(p, mem);
451 return 0; 481 return 0;
452 } 482 }
453 483
@@ -466,6 +496,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
466 list_for_each_entry(child, &t->children, sibling) { 496 list_for_each_entry(child, &t->children, sibling) {
467 unsigned int child_points; 497 unsigned int child_points;
468 498
499 if (child->mm == p->mm)
500 continue;
469 /* 501 /*
470 * oom_badness() returns 0 if the thread is unkillable 502 * oom_badness() returns 0 if the thread is unkillable
471 */ 503 */
@@ -512,6 +544,16 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
512 unsigned int points = 0; 544 unsigned int points = 0;
513 struct task_struct *p; 545 struct task_struct *p;
514 546
547 /*
548 * If current has a pending SIGKILL, then automatically select it. The
549 * goal is to allow it to allocate so that it may quickly exit and free
550 * its memory.
551 */
552 if (fatal_signal_pending(current)) {
553 set_thread_flag(TIF_MEMDIE);
554 return;
555 }
556
515 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); 557 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
516 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; 558 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
517 read_lock(&tasklist_lock); 559 read_lock(&tasklist_lock);
@@ -664,7 +706,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
664 */ 706 */
665 if (fatal_signal_pending(current)) { 707 if (fatal_signal_pending(current)) {
666 set_thread_flag(TIF_MEMDIE); 708 set_thread_flag(TIF_MEMDIE);
667 boost_dying_task_prio(current, NULL);
668 return; 709 return;
669 } 710 }
670 711
@@ -680,7 +721,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
680 read_lock(&tasklist_lock); 721 read_lock(&tasklist_lock);
681 if (sysctl_oom_kill_allocating_task && 722 if (sysctl_oom_kill_allocating_task &&
682 !oom_unkillable_task(current, NULL, nodemask) && 723 !oom_unkillable_task(current, NULL, nodemask) &&
683 (current->signal->oom_adj != OOM_DISABLE)) { 724 current->mm && !atomic_read(&current->mm->oom_disable_count)) {
684 /* 725 /*
685 * oom_kill_process() needs tasklist_lock held. If it returns 726 * oom_kill_process() needs tasklist_lock held. If it returns
686 * non-zero, current could not be killed so we must fallback to 727 * non-zero, current could not be killed so we must fallback to
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e3bccac1f025..31f698862420 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -404,25 +404,22 @@ unsigned long determine_dirtyable_memory(void)
404 * - vm.dirty_background_ratio or vm.dirty_background_bytes 404 * - vm.dirty_background_ratio or vm.dirty_background_bytes
405 * - vm.dirty_ratio or vm.dirty_bytes 405 * - vm.dirty_ratio or vm.dirty_bytes
406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and 406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
407 * runtime tasks. 407 * real-time tasks.
408 */ 408 */
409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) 409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
410{ 410{
411 unsigned long background; 411 unsigned long background;
412 unsigned long dirty; 412 unsigned long dirty;
413 unsigned long available_memory = determine_dirtyable_memory(); 413 unsigned long uninitialized_var(available_memory);
414 struct task_struct *tsk; 414 struct task_struct *tsk;
415 415
416 if (!vm_dirty_bytes || !dirty_background_bytes)
417 available_memory = determine_dirtyable_memory();
418
416 if (vm_dirty_bytes) 419 if (vm_dirty_bytes)
417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 420 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
418 else { 421 else
419 int dirty_ratio; 422 dirty = (vm_dirty_ratio * available_memory) / 100;
420
421 dirty_ratio = vm_dirty_ratio;
422 if (dirty_ratio < 5)
423 dirty_ratio = 5;
424 dirty = (dirty_ratio * available_memory) / 100;
425 }
426 423
427 if (dirty_background_bytes) 424 if (dirty_background_bytes)
428 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); 425 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
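
determine_dirtyable_memory() is now consulted only when a ratio-based limit is in force; a byte-based limit is simply rounded up to pages, and the old 5% floor on vm_dirty_ratio is gone. The dirty threshold the function ends up with can be reproduced in a few lines (sample values, not taken from any real system):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Byte-based limit wins if set; otherwise take a percentage of the
 * dirtyable memory, exactly as global_dirty_limits() does for 'dirty'. */
static unsigned long dirty_limit(unsigned long vm_dirty_bytes,
                                 unsigned long vm_dirty_ratio,
                                 unsigned long available_memory)
{
    if (vm_dirty_bytes)
        return DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
    return vm_dirty_ratio * available_memory / 100;
}

int main(void)
{
    unsigned long available = 2000000;  /* dirtyable pages, ~7.6 GiB */

    printf("ratio 20%%     -> %lu pages\n", dirty_limit(0, 20, available));
    printf("bytes 256 MiB -> %lu pages\n",
           dirty_limit(256UL << 20, 20, available));
    return 0;
}
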
@@ -510,7 +507,7 @@ static void balance_dirty_pages(struct address_space *mapping,
510 * catch-up. This avoids (excessively) small writeouts 507 * catch-up. This avoids (excessively) small writeouts
511 * when the bdi limits are ramping up. 508 * when the bdi limits are ramping up.
512 */ 509 */
513 if (nr_reclaimable + nr_writeback < 510 if (nr_reclaimable + nr_writeback <=
514 (background_thresh + dirty_thresh) / 2) 511 (background_thresh + dirty_thresh) / 2)
515 break; 512 break;
516 513
@@ -542,8 +539,8 @@ static void balance_dirty_pages(struct address_space *mapping,
542 * the last resort safeguard. 539 * the last resort safeguard.
543 */ 540 */
544 dirty_exceeded = 541 dirty_exceeded =
545 (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) 542 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
546 || (nr_reclaimable + nr_writeback >= dirty_thresh); 543 || (nr_reclaimable + nr_writeback > dirty_thresh);
547 544
548 if (!dirty_exceeded) 545 if (!dirty_exceeded)
549 break; 546 break;
@@ -569,7 +566,7 @@ static void balance_dirty_pages(struct address_space *mapping,
569 break; /* We've done our duty */ 566 break; /* We've done our duty */
570 } 567 }
571 trace_wbc_balance_dirty_wait(&wbc, bdi); 568 trace_wbc_balance_dirty_wait(&wbc, bdi);
572 __set_current_state(TASK_INTERRUPTIBLE); 569 __set_current_state(TASK_UNINTERRUPTIBLE);
573 io_schedule_timeout(pause); 570 io_schedule_timeout(pause);
574 571
575 /* 572 /*
@@ -930,7 +927,7 @@ retry:
930 break; 927 break;
931 } 928 }
932 929
933 done_index = page->index + 1; 930 done_index = page->index;
934 931
935 lock_page(page); 932 lock_page(page);
936 933
@@ -980,6 +977,7 @@ continue_unlock:
980 * not be suitable for data integrity 977 * not be suitable for data integrity
981 * writeout). 978 * writeout).
982 */ 979 */
980 done_index = page->index + 1;
983 done = 1; 981 done = 1;
984 break; 982 break;
985 } 983 }
@@ -1042,11 +1040,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
1042int generic_writepages(struct address_space *mapping, 1040int generic_writepages(struct address_space *mapping,
1043 struct writeback_control *wbc) 1041 struct writeback_control *wbc)
1044{ 1042{
1043 struct blk_plug plug;
1044 int ret;
1045
1045 /* deal with chardevs and other special file */ 1046 /* deal with chardevs and other special file */
1046 if (!mapping->a_ops->writepage) 1047 if (!mapping->a_ops->writepage)
1047 return 0; 1048 return 0;
1048 1049
1049 return write_cache_pages(mapping, wbc, __writepage, mapping); 1050 blk_start_plug(&plug);
1051 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
1052 blk_finish_plug(&plug);
1053 return ret;
1050} 1054}
1051 1055
1052EXPORT_SYMBOL(generic_writepages); 1056EXPORT_SYMBOL(generic_writepages);
@@ -1109,7 +1113,7 @@ EXPORT_SYMBOL(write_one_page);
1109int __set_page_dirty_no_writeback(struct page *page) 1113int __set_page_dirty_no_writeback(struct page *page)
1110{ 1114{
1111 if (!PageDirty(page)) 1115 if (!PageDirty(page))
1112 SetPageDirty(page); 1116 return !TestSetPageDirty(page);
1113 return 0; 1117 return 0;
1114} 1118}
1115 1119
@@ -1121,6 +1125,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1121{ 1125{
1122 if (mapping_cap_account_dirty(mapping)) { 1126 if (mapping_cap_account_dirty(mapping)) {
1123 __inc_zone_page_state(page, NR_FILE_DIRTY); 1127 __inc_zone_page_state(page, NR_FILE_DIRTY);
1128 __inc_zone_page_state(page, NR_DIRTIED);
1124 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1129 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1125 task_dirty_inc(current); 1130 task_dirty_inc(current);
1126 task_io_account_write(PAGE_CACHE_SIZE); 1131 task_io_account_write(PAGE_CACHE_SIZE);
@@ -1129,6 +1134,18 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1129EXPORT_SYMBOL(account_page_dirtied); 1134EXPORT_SYMBOL(account_page_dirtied);
1130 1135
1131/* 1136/*
1137 * Helper function for set_page_writeback family.
1138 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
1139 * wrt interrupts.
1140 */
1141void account_page_writeback(struct page *page)
1142{
1143 inc_zone_page_state(page, NR_WRITEBACK);
1144 inc_zone_page_state(page, NR_WRITTEN);
1145}
1146EXPORT_SYMBOL(account_page_writeback);
1147
1148/*
1132 * For address_spaces which do not use buffers. Just tag the page as dirty in 1149 * For address_spaces which do not use buffers. Just tag the page as dirty in
1133 * its radix tree. 1150 * its radix tree.
1134 * 1151 *
@@ -1201,6 +1218,17 @@ int set_page_dirty(struct page *page)
1201 1218
1202 if (likely(mapping)) { 1219 if (likely(mapping)) {
1203 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; 1220 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
1221 /*
1222 * readahead/lru_deactivate_page could remain
1223 * PG_readahead/PG_reclaim due to race with end_page_writeback
1224 * About readahead, if the page is written, the flags would be
1225 * reset. So no problem.
1226 * About lru_deactivate_page, if the page is redirty, the flag
1227 * will be reset. So no problem. but if the page is used by readahead
1228 * it will confuse readahead and make it restart the size rampup
1229 * process. But it's a trivial problem.
1230 */
1231 ClearPageReclaim(page);
1204#ifdef CONFIG_BLOCK 1232#ifdef CONFIG_BLOCK
1205 if (!spd) 1233 if (!spd)
1206 spd = __set_page_dirty_buffers; 1234 spd = __set_page_dirty_buffers;
@@ -1229,7 +1257,7 @@ int set_page_dirty_lock(struct page *page)
1229{ 1257{
1230 int ret; 1258 int ret;
1231 1259
1232 lock_page_nosync(page); 1260 lock_page(page);
1233 ret = set_page_dirty(page); 1261 ret = set_page_dirty(page);
1234 unlock_page(page); 1262 unlock_page(page);
1235 return ret; 1263 return ret;
@@ -1256,7 +1284,6 @@ int clear_page_dirty_for_io(struct page *page)
1256 1284
1257 BUG_ON(!PageLocked(page)); 1285 BUG_ON(!PageLocked(page));
1258 1286
1259 ClearPageReclaim(page);
1260 if (mapping && mapping_cap_account_dirty(mapping)) { 1287 if (mapping && mapping_cap_account_dirty(mapping)) {
1261 /* 1288 /*
1262 * Yes, Virginia, this is indeed insane. 1289 * Yes, Virginia, this is indeed insane.
@@ -1366,7 +1393,7 @@ int test_set_page_writeback(struct page *page)
1366 ret = TestSetPageWriteback(page); 1393 ret = TestSetPageWriteback(page);
1367 } 1394 }
1368 if (!ret) 1395 if (!ret)
1369 inc_zone_page_state(page, NR_WRITEBACK); 1396 account_page_writeback(page);
1370 return ret; 1397 return ret;
1371 1398
1372} 1399}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f12ad1836abe..4e8985acdab8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24#include <linux/compiler.h> 25#include <linux/compiler.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/kmemcheck.h> 27#include <linux/kmemcheck.h>
@@ -29,6 +30,7 @@
29#include <linux/pagevec.h> 30#include <linux/pagevec.h>
30#include <linux/blkdev.h> 31#include <linux/blkdev.h>
31#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/ratelimit.h>
32#include <linux/oom.h> 34#include <linux/oom.h>
33#include <linux/notifier.h> 35#include <linux/notifier.h>
34#include <linux/topology.h> 36#include <linux/topology.h>
@@ -38,6 +40,7 @@
38#include <linux/memory_hotplug.h> 40#include <linux/memory_hotplug.h>
39#include <linux/nodemask.h> 41#include <linux/nodemask.h>
40#include <linux/vmalloc.h> 42#include <linux/vmalloc.h>
43#include <linux/vmstat.h>
41#include <linux/mempolicy.h> 44#include <linux/mempolicy.h>
42#include <linux/stop_machine.h> 45#include <linux/stop_machine.h>
43#include <linux/sort.h> 46#include <linux/sort.h>
@@ -52,6 +55,8 @@
52#include <linux/compaction.h> 55#include <linux/compaction.h>
53#include <trace/events/kmem.h> 56#include <trace/events/kmem.h>
54#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
58#include <linux/memcontrol.h>
59#include <linux/prefetch.h>
55 60
56#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
57#include <asm/div64.h> 62#include <asm/div64.h>
@@ -103,19 +108,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
103 * only be modified with pm_mutex held, unless the suspend/hibernate code is 108 * only be modified with pm_mutex held, unless the suspend/hibernate code is
104 * guaranteed not to run in parallel with that modification). 109 * guaranteed not to run in parallel with that modification).
105 */ 110 */
106void set_gfp_allowed_mask(gfp_t mask) 111
112static gfp_t saved_gfp_mask;
113
114void pm_restore_gfp_mask(void)
107{ 115{
108 WARN_ON(!mutex_is_locked(&pm_mutex)); 116 WARN_ON(!mutex_is_locked(&pm_mutex));
109 gfp_allowed_mask = mask; 117 if (saved_gfp_mask) {
118 gfp_allowed_mask = saved_gfp_mask;
119 saved_gfp_mask = 0;
120 }
110} 121}
111 122
112gfp_t clear_gfp_allowed_mask(gfp_t mask) 123void pm_restrict_gfp_mask(void)
113{ 124{
114 gfp_t ret = gfp_allowed_mask;
115
116 WARN_ON(!mutex_is_locked(&pm_mutex)); 125 WARN_ON(!mutex_is_locked(&pm_mutex));
117 gfp_allowed_mask &= ~mask; 126 WARN_ON(saved_gfp_mask);
118 return ret; 127 saved_gfp_mask = gfp_allowed_mask;
128 gfp_allowed_mask &= ~GFP_IOFS;
119} 129}
120#endif /* CONFIG_PM_SLEEP */ 130#endif /* CONFIG_PM_SLEEP */
121 131
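
Instead of handing a mask around, the suspend code now calls pm_restrict_gfp_mask() to stash gfp_allowed_mask and strip GFP_IOFS, and pm_restore_gfp_mask() to put the saved value back only if a restrict actually happened. A userspace sketch of that save/strip/restore pattern, with made-up flag bits standing in for the real GFP values:

#include <stdio.h>
#include <assert.h>

/* Placeholder flag bits; the real values live in <linux/gfp.h>. */
#define FAKE_GFP_IO   0x1u
#define FAKE_GFP_FS   0x2u
#define FAKE_GFP_WAIT 0x4u
#define FAKE_GFP_IOFS (FAKE_GFP_IO | FAKE_GFP_FS)

static unsigned int gfp_allowed_mask = FAKE_GFP_IO | FAKE_GFP_FS | FAKE_GFP_WAIT;
static unsigned int saved_gfp_mask;

static void pm_restrict(void)
{
    assert(!saved_gfp_mask);            /* analogue of WARN_ON(saved_gfp_mask) */
    saved_gfp_mask = gfp_allowed_mask;  /* remember what was allowed */
    gfp_allowed_mask &= ~FAKE_GFP_IOFS; /* no IO/FS while devices are asleep */
}

static void pm_restore(void)
{
    if (saved_gfp_mask) {               /* restore only if a restrict happened */
        gfp_allowed_mask = saved_gfp_mask;
        saved_gfp_mask = 0;
    }
}

int main(void)
{
    printf("before suspend: %#x\n", gfp_allowed_mask);
    pm_restrict();
    printf("during suspend: %#x (no IO/FS)\n", gfp_allowed_mask);
    pm_restore();
    printf("after resume:   %#x\n", gfp_allowed_mask);
    return 0;
}
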
@@ -280,7 +290,7 @@ static void bad_page(struct page *page)
280 290
281 /* Don't complain about poisoned pages */ 291 /* Don't complain about poisoned pages */
282 if (PageHWPoison(page)) { 292 if (PageHWPoison(page)) {
283 __ClearPageBuddy(page); 293 reset_page_mapcount(page); /* remove PageBuddy */
284 return; 294 return;
285 } 295 }
286 296
@@ -311,7 +321,7 @@ static void bad_page(struct page *page)
311 dump_stack(); 321 dump_stack();
312out: 322out:
313 /* Leave bad fields for debug, except PageBuddy could make trouble */ 323 /* Leave bad fields for debug, except PageBuddy could make trouble */
314 __ClearPageBuddy(page); 324 reset_page_mapcount(page); /* remove PageBuddy */
315 add_taint(TAINT_BAD_PAGE); 325 add_taint(TAINT_BAD_PAGE);
316} 326}
317 327
@@ -351,6 +361,7 @@ void prep_compound_page(struct page *page, unsigned long order)
351 } 361 }
352} 362}
353 363
364/* update __split_huge_page_refcount if you change this function */
354static int destroy_compound_page(struct page *page, unsigned long order) 365static int destroy_compound_page(struct page *page, unsigned long order)
355{ 366{
356 int i; 367 int i;
@@ -420,18 +431,10 @@ static inline void rmv_page_order(struct page *page)
420 * 431 *
421 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 432 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
422 */ 433 */
423static inline struct page *
424__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
425{
426 unsigned long buddy_idx = page_idx ^ (1 << order);
427
428 return page + (buddy_idx - page_idx);
429}
430
431static inline unsigned long 434static inline unsigned long
432__find_combined_index(unsigned long page_idx, unsigned int order) 435__find_buddy_index(unsigned long page_idx, unsigned int order)
433{ 436{
434 return (page_idx & ~(1 << order)); 437 return page_idx ^ (1 << order);
435} 438}
436 439
437/* 440/*
@@ -442,8 +445,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
442 * (c) a page and its buddy have the same order && 445 * (c) a page and its buddy have the same order &&
443 * (d) a page and its buddy are in the same zone. 446 * (d) a page and its buddy are in the same zone.
444 * 447 *
445 * For recording whether a page is in the buddy system, we use PG_buddy. 448 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
446 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 449 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
447 * 450 *
448 * For recording page's order, we use page_private(page). 451 * For recording page's order, we use page_private(page).
449 */ 452 */
@@ -476,7 +479,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
476 * as necessary, plus some accounting needed to play nicely with other 479 * as necessary, plus some accounting needed to play nicely with other
477 * parts of the VM system. 480 * parts of the VM system.
478 * At each level, we keep a list of pages, which are heads of continuous 481 * At each level, we keep a list of pages, which are heads of continuous
479 * free pages of length of (1 << order) and marked with PG_buddy. Page's 482 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
480 * order is recorded in page_private(page) field. 483 * order is recorded in page_private(page) field.
481 * So when we are allocating or freeing one, we can derive the state of the 484 * So when we are allocating or freeing one, we can derive the state of the
482 * other. That is, if we allocate a small block, and both were 485 * other. That is, if we allocate a small block, and both were
@@ -493,6 +496,7 @@ static inline void __free_one_page(struct page *page,
493{ 496{
494 unsigned long page_idx; 497 unsigned long page_idx;
495 unsigned long combined_idx; 498 unsigned long combined_idx;
499 unsigned long uninitialized_var(buddy_idx);
496 struct page *buddy; 500 struct page *buddy;
497 501
498 if (unlikely(PageCompound(page))) 502 if (unlikely(PageCompound(page)))
@@ -507,7 +511,8 @@ static inline void __free_one_page(struct page *page,
507 VM_BUG_ON(bad_range(zone, page)); 511 VM_BUG_ON(bad_range(zone, page));
508 512
509 while (order < MAX_ORDER-1) { 513 while (order < MAX_ORDER-1) {
510 buddy = __page_find_buddy(page, page_idx, order); 514 buddy_idx = __find_buddy_index(page_idx, order);
515 buddy = page + (buddy_idx - page_idx);
511 if (!page_is_buddy(page, buddy, order)) 516 if (!page_is_buddy(page, buddy, order))
512 break; 517 break;
513 518
@@ -515,7 +520,7 @@ static inline void __free_one_page(struct page *page,
515 list_del(&buddy->lru); 520 list_del(&buddy->lru);
516 zone->free_area[order].nr_free--; 521 zone->free_area[order].nr_free--;
517 rmv_page_order(buddy); 522 rmv_page_order(buddy);
518 combined_idx = __find_combined_index(page_idx, order); 523 combined_idx = buddy_idx & page_idx;
519 page = page + (combined_idx - page_idx); 524 page = page + (combined_idx - page_idx);
520 page_idx = combined_idx; 525 page_idx = combined_idx;
521 order++; 526 order++;
@@ -530,11 +535,12 @@ static inline void __free_one_page(struct page *page,
530 * so it's less likely to be used soon and more likely to be merged 535 * so it's less likely to be used soon and more likely to be merged
531 * as a higher order page 536 * as a higher order page
532 */ 537 */
533 if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { 538 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
534 struct page *higher_page, *higher_buddy; 539 struct page *higher_page, *higher_buddy;
535 combined_idx = __find_combined_index(page_idx, order); 540 combined_idx = buddy_idx & page_idx;
536 higher_page = page + combined_idx - page_idx; 541 higher_page = page + (combined_idx - page_idx);
537 higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); 542 buddy_idx = __find_buddy_index(combined_idx, order + 1);
543 higher_buddy = page + (buddy_idx - combined_idx);
538 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 544 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
539 list_add_tail(&page->lru, 545 list_add_tail(&page->lru,
540 &zone->free_area[order].free_list[migratetype]); 546 &zone->free_area[order].free_list[migratetype]);
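
Both of these hunks lean on the same two identities: a block's buddy differs from it only in bit 'order', so buddy_idx = page_idx ^ (1 << order), and the merged block starts at the lower of the two indices, which is buddy_idx & page_idx. A standalone demonstration of the arithmetic on plain integers (no struct page involved):

#include <stdio.h>

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
    return page_idx ^ (1UL << order);          /* flip the 'order' bit */
}

int main(void)
{
    unsigned long page_idx = 12;               /* a free block starting at index 12 */

    for (unsigned int order = 2; order < 5; order++) {
        unsigned long buddy_idx = find_buddy_index(page_idx, order);
        unsigned long combined  = buddy_idx & page_idx;  /* start of merged block */

        printf("order %u: page %2lu buddy %2lu -> merged block at %2lu\n",
               order, page_idx, buddy_idx, combined);
        page_idx = combined;                   /* keep merging upwards */
    }
    return 0;
}

With page_idx 12, the order-2 merge lands at index 8, the order-3 merge at index 0, and so on up the free lists, which is exactly the walk __free_one_page() performs.
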
@@ -563,7 +569,8 @@ static inline int free_pages_check(struct page *page)
563 if (unlikely(page_mapcount(page) | 569 if (unlikely(page_mapcount(page) |
564 (page->mapping != NULL) | 570 (page->mapping != NULL) |
565 (atomic_read(&page->_count) != 0) | 571 (atomic_read(&page->_count) != 0) |
566 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 572 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
573 (mem_cgroup_bad_page_check(page)))) {
567 bad_page(page); 574 bad_page(page);
568 return 1; 575 return 1;
569 } 576 }
@@ -612,6 +619,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
612 list = &pcp->lists[migratetype]; 619 list = &pcp->lists[migratetype];
613 } while (list_empty(list)); 620 } while (list_empty(list));
614 621
622 /* This is the only non-empty list. Free them all. */
623 if (batch_free == MIGRATE_PCPTYPES)
624 batch_free = to_free;
625
615 do { 626 do {
616 page = list_entry(list->prev, struct page, lru); 627 page = list_entry(list->prev, struct page, lru);
617 /* must delete as __free_one_page list manipulates */ 628 /* must delete as __free_one_page list manipulates */
@@ -645,13 +656,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
645 trace_mm_page_free_direct(page, order); 656 trace_mm_page_free_direct(page, order);
646 kmemcheck_free_shadow(page, order); 657 kmemcheck_free_shadow(page, order);
647 658
648 for (i = 0; i < (1 << order); i++) { 659 if (PageAnon(page))
649 struct page *pg = page + i; 660 page->mapping = NULL;
650 661 for (i = 0; i < (1 << order); i++)
651 if (PageAnon(pg)) 662 bad += free_pages_check(page + i);
652 pg->mapping = NULL;
653 bad += free_pages_check(pg);
654 }
655 if (bad) 663 if (bad)
656 return false; 664 return false;
657 665
@@ -751,7 +759,8 @@ static inline int check_new_page(struct page *page)
751 if (unlikely(page_mapcount(page) | 759 if (unlikely(page_mapcount(page) |
752 (page->mapping != NULL) | 760 (page->mapping != NULL) |
753 (atomic_read(&page->_count) != 0) | 761 (atomic_read(&page->_count) != 0) |
754 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 762 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
763 (mem_cgroup_bad_page_check(page)))) {
755 bad_page(page); 764 bad_page(page);
756 return 1; 765 return 1;
757 } 766 }
@@ -864,9 +873,8 @@ static int move_freepages(struct zone *zone,
864 } 873 }
865 874
866 order = page_order(page); 875 order = page_order(page);
867 list_del(&page->lru); 876 list_move(&page->lru,
868 list_add(&page->lru, 877 &zone->free_area[order].free_list[migratetype]);
869 &zone->free_area[order].free_list[migratetype]);
870 page += 1 << order; 878 page += 1 << order;
871 pages_moved += 1 << order; 879 pages_moved += 1 << order;
872 } 880 }
@@ -937,7 +945,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
937 * If breaking a large block of pages, move all free 945 * If breaking a large block of pages, move all free
938 * pages to the preferred allocation list. If falling 946 * pages to the preferred allocation list. If falling
939 * back for a reclaimable kernel allocation, be more 947 * back for a reclaimable kernel allocation, be more
940 * agressive about taking ownership of free pages 948 * aggressive about taking ownership of free pages
941 */ 949 */
942 if (unlikely(current_order >= (pageblock_order >> 1)) || 950 if (unlikely(current_order >= (pageblock_order >> 1)) ||
943 start_migratetype == MIGRATE_RECLAIMABLE || 951 start_migratetype == MIGRATE_RECLAIMABLE ||
@@ -1089,8 +1097,10 @@ static void drain_pages(unsigned int cpu)
1089 pset = per_cpu_ptr(zone->pageset, cpu); 1097 pset = per_cpu_ptr(zone->pageset, cpu);
1090 1098
1091 pcp = &pset->pcp; 1099 pcp = &pset->pcp;
1092 free_pcppages_bulk(zone, pcp->count, pcp); 1100 if (pcp->count) {
1093 pcp->count = 0; 1101 free_pcppages_bulk(zone, pcp->count, pcp);
1102 pcp->count = 0;
1103 }
1094 local_irq_restore(flags); 1104 local_irq_restore(flags);
1095 } 1105 }
1096} 1106}
@@ -1332,7 +1342,7 @@ again:
1332 } 1342 }
1333 1343
1334 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1344 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1335 zone_statistics(preferred_zone, zone); 1345 zone_statistics(preferred_zone, zone, gfp_flags);
1336 local_irq_restore(flags); 1346 local_irq_restore(flags);
1337 1347
1338 VM_BUG_ON(bad_range(zone, page)); 1348 VM_BUG_ON(bad_range(zone, page));
@@ -1454,24 +1464,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1454#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1464#endif /* CONFIG_FAIL_PAGE_ALLOC */
1455 1465
1456/* 1466/*
1457 * Return 1 if free pages are above 'mark'. This takes into account the order 1467 * Return true if free pages are above 'mark'. This takes into account the order
1458 * of the allocation. 1468 * of the allocation.
1459 */ 1469 */
1460int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1470static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1461 int classzone_idx, int alloc_flags) 1471 int classzone_idx, int alloc_flags, long free_pages)
1462{ 1472{
1463 /* free_pages my go negative - that's OK */ 1473 /* free_pages my go negative - that's OK */
1464 long min = mark; 1474 long min = mark;
1465 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1466 int o; 1475 int o;
1467 1476
1477 free_pages -= (1 << order) + 1;
1468 if (alloc_flags & ALLOC_HIGH) 1478 if (alloc_flags & ALLOC_HIGH)
1469 min -= min / 2; 1479 min -= min / 2;
1470 if (alloc_flags & ALLOC_HARDER) 1480 if (alloc_flags & ALLOC_HARDER)
1471 min -= min / 4; 1481 min -= min / 4;
1472 1482
1473 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1483 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1474 return 0; 1484 return false;
1475 for (o = 0; o < order; o++) { 1485 for (o = 0; o < order; o++) {
1476 /* At the next order, this order's pages become unavailable */ 1486 /* At the next order, this order's pages become unavailable */
1477 free_pages -= z->free_area[o].nr_free << o; 1487 free_pages -= z->free_area[o].nr_free << o;
@@ -1480,9 +1490,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1480 min >>= 1; 1490 min >>= 1;
1481 1491
1482 if (free_pages <= min) 1492 if (free_pages <= min)
1483 return 0; 1493 return false;
1484 } 1494 }
1485 return 1; 1495 return true;
1496}
1497
1498bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1499 int classzone_idx, int alloc_flags)
1500{
1501 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1502 zone_page_state(z, NR_FREE_PAGES));
1503}
1504
1505bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1506 int classzone_idx, int alloc_flags)
1507{
1508 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1509
1510 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1511 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1512
1513 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1514 free_pages);
1486} 1515}
1487 1516
1488#ifdef CONFIG_NUMA 1517#ifdef CONFIG_NUMA
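
zone_watermark_ok() and zone_watermark_ok_safe() now share __zone_watermark_ok(), which takes the free-page count as an argument so the _safe variant can pass a drift-corrected snapshot. The check itself is unchanged: set aside the requested block, then for each lower order subtract the pages that are too small to help and insist the remainder still clears a halving minimum. The sketch below reproduces that loop in userspace, with the block reservation simplified to a plain 2^order subtraction and made-up free-area counts:

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER 11

/* 'mark' is the watermark in pages; free_area[o] counts free blocks of
 * order o; free_pages is the zone-wide free page count. */
static bool watermark_ok(unsigned int order, long mark, long lowmem_reserve,
                         long free_pages, const long free_area[MAX_ORDER])
{
    long min = mark;

    free_pages -= 1L << order;           /* the block this allocation would take */
    if (free_pages <= min + lowmem_reserve)
        return false;

    for (unsigned int o = 0; o < order; o++) {
        free_pages -= free_area[o] << o; /* blocks of order o cannot serve us */
        min >>= 1;                       /* the bar halves at each order */
        if (free_pages <= min)
            return false;
    }
    return true;
}

int main(void)
{
    /* Mostly order-0 pages: plenty of memory, little of it contiguous. */
    long free_area[MAX_ORDER] = { 900, 200, 40, 8, 2, 1 };
    long free_pages = 900 + 200*2 + 40*4 + 8*8 + 2*16 + 1*32;   /* 1588 */

    printf("order-0 request passes: %d\n",
           watermark_ok(0, 1024, 0, free_pages, free_area));
    printf("order-3 request passes: %d\n",
           watermark_ok(3, 1024, 0, free_pages, free_area));
    return 0;
}

The point of the example is that an order-3 request can fail the watermark even though the zone holds plenty of order-0 pages.
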
@@ -1694,6 +1723,59 @@ try_next_zone:
1694 return page; 1723 return page;
1695} 1724}
1696 1725
1726/*
1727 * Large machines with many possible nodes should not always dump per-node
1728 * meminfo in irq context.
1729 */
1730static inline bool should_suppress_show_mem(void)
1731{
1732 bool ret = false;
1733
1734#if NODES_SHIFT > 8
1735 ret = in_interrupt();
1736#endif
1737 return ret;
1738}
1739
1740static DEFINE_RATELIMIT_STATE(nopage_rs,
1741 DEFAULT_RATELIMIT_INTERVAL,
1742 DEFAULT_RATELIMIT_BURST);
1743
1744void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1745{
1746 va_list args;
1747 unsigned int filter = SHOW_MEM_FILTER_NODES;
1748
1749 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
1750 return;
1751
1752 /*
1753 * This documents exceptions given to allocations in certain
1754 * contexts that are allowed to allocate outside current's set
1755 * of allowed nodes.
1756 */
1757 if (!(gfp_mask & __GFP_NOMEMALLOC))
1758 if (test_thread_flag(TIF_MEMDIE) ||
1759 (current->flags & (PF_MEMALLOC | PF_EXITING)))
1760 filter &= ~SHOW_MEM_FILTER_NODES;
1761 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
1762 filter &= ~SHOW_MEM_FILTER_NODES;
1763
1764 if (fmt) {
1765 printk(KERN_WARNING);
1766 va_start(args, fmt);
1767 vprintk(fmt, args);
1768 va_end(args);
1769 }
1770
1771 pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
1772 current->comm, order, gfp_mask);
1773
1774 dump_stack();
1775 if (!should_suppress_show_mem())
1776 show_mem(filter);
1777}
1778
1697static inline int 1779static inline int
1698should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1780should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1699 unsigned long pages_reclaimed) 1781 unsigned long pages_reclaimed)
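
Allocation-failure reporting now funnels through warn_alloc_failed(), which honours __GFP_NOWARN, rate-limits the message, and filters show_mem() to the allowed nodes unless the context may legitimately allocate elsewhere. The interval-plus-burst gate it relies on comes from <linux/ratelimit.h>; the userspace model below only approximates that behaviour and is not the kernel implementation:

#include <stdio.h>
#include <time.h>

/* Allow at most 'burst' messages per 'interval' seconds. */
struct ratelimit {
    time_t interval;
    int    burst;
    time_t window_start;
    int    printed;
    int    missed;
};

static int ratelimit_ok(struct ratelimit *rs)
{
    time_t now = time(NULL);

    if (now - rs->window_start >= rs->interval) {
        if (rs->missed)
            fprintf(stderr, "%d callbacks suppressed\n", rs->missed);
        rs->window_start = now;   /* open a new window */
        rs->printed = 0;
        rs->missed = 0;
    }
    if (rs->printed < rs->burst) {
        rs->printed++;
        return 1;                 /* caller may print */
    }
    rs->missed++;
    return 0;                     /* suppress this one */
}

int main(void)
{
    struct ratelimit rs = { .interval = 5, .burst = 3 };

    for (int i = 0; i < 10; i++)
        if (ratelimit_ok(&rs))
            printf("page allocation failure: order:%d\n", i);
    return 0;
}
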
@@ -1787,15 +1869,18 @@ static struct page *
1787__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1869__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1788 struct zonelist *zonelist, enum zone_type high_zoneidx, 1870 struct zonelist *zonelist, enum zone_type high_zoneidx,
1789 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1871 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1790 int migratetype, unsigned long *did_some_progress) 1872 int migratetype, unsigned long *did_some_progress,
1873 bool sync_migration)
1791{ 1874{
1792 struct page *page; 1875 struct page *page;
1793 1876
1794 if (!order || compaction_deferred(preferred_zone)) 1877 if (!order || compaction_deferred(preferred_zone))
1795 return NULL; 1878 return NULL;
1796 1879
1880 current->flags |= PF_MEMALLOC;
1797 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1881 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1798 nodemask); 1882 nodemask, sync_migration);
1883 current->flags &= ~PF_MEMALLOC;
1799 if (*did_some_progress != COMPACT_SKIPPED) { 1884 if (*did_some_progress != COMPACT_SKIPPED) {
1800 1885
1801 /* Page migration frees to the PCP lists but we want merging */ 1886 /* Page migration frees to the PCP lists but we want merging */
@@ -1831,7 +1916,8 @@ static inline struct page *
1831__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1916__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1832 struct zonelist *zonelist, enum zone_type high_zoneidx, 1917 struct zonelist *zonelist, enum zone_type high_zoneidx,
1833 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1918 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1834 int migratetype, unsigned long *did_some_progress) 1919 int migratetype, unsigned long *did_some_progress,
1920 bool sync_migration)
1835{ 1921{
1836 return NULL; 1922 return NULL;
1837} 1923}
@@ -1846,23 +1932,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1846{ 1932{
1847 struct page *page = NULL; 1933 struct page *page = NULL;
1848 struct reclaim_state reclaim_state; 1934 struct reclaim_state reclaim_state;
1849 struct task_struct *p = current;
1850 bool drained = false; 1935 bool drained = false;
1851 1936
1852 cond_resched(); 1937 cond_resched();
1853 1938
1854 /* We now go into synchronous reclaim */ 1939 /* We now go into synchronous reclaim */
1855 cpuset_memory_pressure_bump(); 1940 cpuset_memory_pressure_bump();
1856 p->flags |= PF_MEMALLOC; 1941 current->flags |= PF_MEMALLOC;
1857 lockdep_set_current_reclaim_state(gfp_mask); 1942 lockdep_set_current_reclaim_state(gfp_mask);
1858 reclaim_state.reclaimed_slab = 0; 1943 reclaim_state.reclaimed_slab = 0;
1859 p->reclaim_state = &reclaim_state; 1944 current->reclaim_state = &reclaim_state;
1860 1945
1861 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 1946 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1862 1947
1863 p->reclaim_state = NULL; 1948 current->reclaim_state = NULL;
1864 lockdep_clear_current_reclaim_state(); 1949 lockdep_clear_current_reclaim_state();
1865 p->flags &= ~PF_MEMALLOC; 1950 current->flags &= ~PF_MEMALLOC;
1866 1951
1867 cond_resched(); 1952 cond_resched();
1868 1953
@@ -1906,7 +1991,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1906 preferred_zone, migratetype); 1991 preferred_zone, migratetype);
1907 1992
1908 if (!page && gfp_mask & __GFP_NOFAIL) 1993 if (!page && gfp_mask & __GFP_NOFAIL)
1909 congestion_wait(BLK_RW_ASYNC, HZ/50); 1994 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
1910 } while (!page && (gfp_mask & __GFP_NOFAIL)); 1995 } while (!page && (gfp_mask & __GFP_NOFAIL));
1911 1996
1912 return page; 1997 return page;
@@ -1914,24 +1999,24 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1914 1999
1915static inline 2000static inline
1916void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2001void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1917 enum zone_type high_zoneidx) 2002 enum zone_type high_zoneidx,
2003 enum zone_type classzone_idx)
1918{ 2004{
1919 struct zoneref *z; 2005 struct zoneref *z;
1920 struct zone *zone; 2006 struct zone *zone;
1921 2007
1922 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2008 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1923 wakeup_kswapd(zone, order); 2009 wakeup_kswapd(zone, order, classzone_idx);
1924} 2010}
1925 2011
1926static inline int 2012static inline int
1927gfp_to_alloc_flags(gfp_t gfp_mask) 2013gfp_to_alloc_flags(gfp_t gfp_mask)
1928{ 2014{
1929 struct task_struct *p = current;
1930 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2015 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1931 const gfp_t wait = gfp_mask & __GFP_WAIT; 2016 const gfp_t wait = gfp_mask & __GFP_WAIT;
1932 2017
1933 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2018 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1934 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); 2019 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
1935 2020
1936 /* 2021 /*
1937 * The caller may dip into page reserves a bit more if the caller 2022 * The caller may dip into page reserves a bit more if the caller
@@ -1939,21 +2024,26 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1939 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2024 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1940 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 2025 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1941 */ 2026 */
1942 alloc_flags |= (gfp_mask & __GFP_HIGH); 2027 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1943 2028
1944 if (!wait) { 2029 if (!wait) {
1945 alloc_flags |= ALLOC_HARDER; 2030 /*
2031 * Not worth trying to allocate harder for
2032 * __GFP_NOMEMALLOC even if it can't schedule.
2033 */
2034 if (!(gfp_mask & __GFP_NOMEMALLOC))
2035 alloc_flags |= ALLOC_HARDER;
1946 /* 2036 /*
1947 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 2037 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1948 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 2038 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1949 */ 2039 */
1950 alloc_flags &= ~ALLOC_CPUSET; 2040 alloc_flags &= ~ALLOC_CPUSET;
1951 } else if (unlikely(rt_task(p)) && !in_interrupt()) 2041 } else if (unlikely(rt_task(current)) && !in_interrupt())
1952 alloc_flags |= ALLOC_HARDER; 2042 alloc_flags |= ALLOC_HARDER;
1953 2043
1954 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2044 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1955 if (!in_interrupt() && 2045 if (!in_interrupt() &&
1956 ((p->flags & PF_MEMALLOC) || 2046 ((current->flags & PF_MEMALLOC) ||
1957 unlikely(test_thread_flag(TIF_MEMDIE)))) 2047 unlikely(test_thread_flag(TIF_MEMDIE))))
1958 alloc_flags |= ALLOC_NO_WATERMARKS; 2048 alloc_flags |= ALLOC_NO_WATERMARKS;
1959 } 2049 }
@@ -1972,7 +2062,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1972 int alloc_flags; 2062 int alloc_flags;
1973 unsigned long pages_reclaimed = 0; 2063 unsigned long pages_reclaimed = 0;
1974 unsigned long did_some_progress; 2064 unsigned long did_some_progress;
1975 struct task_struct *p = current; 2065 bool sync_migration = false;
1976 2066
1977 /* 2067 /*
1978 * In the slowpath, we sanity check order to avoid ever trying to 2068 * In the slowpath, we sanity check order to avoid ever trying to
@@ -1997,7 +2087,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1997 goto nopage; 2087 goto nopage;
1998 2088
1999restart: 2089restart:
2000 wake_all_kswapd(order, zonelist, high_zoneidx); 2090 if (!(gfp_mask & __GFP_NO_KSWAPD))
2091 wake_all_kswapd(order, zonelist, high_zoneidx,
2092 zone_idx(preferred_zone));
2001 2093
2002 /* 2094 /*
2003 * OK, we're below the kswapd watermark and have kicked background 2095 * OK, we're below the kswapd watermark and have kicked background
@@ -2006,6 +2098,15 @@ restart:
2006 */ 2098 */
2007 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2099 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2008 2100
2101 /*
2102 * Find the true preferred zone if the allocation is unconstrained by
2103 * cpusets.
2104 */
2105 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2106 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2107 &preferred_zone);
2108
2109rebalance:
2009 /* This is the last chance, in general, before the goto nopage. */ 2110 /* This is the last chance, in general, before the goto nopage. */
2010 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2111 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2011 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2112 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2013,7 +2114,6 @@ restart:
2013 if (page) 2114 if (page)
2014 goto got_pg; 2115 goto got_pg;
2015 2116
2016rebalance:
2017 /* Allocate without watermarks if the context allows */ 2117 /* Allocate without watermarks if the context allows */
2018 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2118 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2019 page = __alloc_pages_high_priority(gfp_mask, order, 2119 page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2028,21 +2128,26 @@ rebalance:
2028 goto nopage; 2128 goto nopage;
2029 2129
2030 /* Avoid recursion of direct reclaim */ 2130 /* Avoid recursion of direct reclaim */
2031 if (p->flags & PF_MEMALLOC) 2131 if (current->flags & PF_MEMALLOC)
2032 goto nopage; 2132 goto nopage;
2033 2133
2034 /* Avoid allocations with no watermarks from looping endlessly */ 2134 /* Avoid allocations with no watermarks from looping endlessly */
2035 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2135 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2036 goto nopage; 2136 goto nopage;
2037 2137
2038 /* Try direct compaction */ 2138 /*
2139 * Try direct compaction. The first pass is asynchronous. Subsequent
2140 * attempts after direct reclaim are synchronous
2141 */
2039 page = __alloc_pages_direct_compact(gfp_mask, order, 2142 page = __alloc_pages_direct_compact(gfp_mask, order,
2040 zonelist, high_zoneidx, 2143 zonelist, high_zoneidx,
2041 nodemask, 2144 nodemask,
2042 alloc_flags, preferred_zone, 2145 alloc_flags, preferred_zone,
2043 migratetype, &did_some_progress); 2146 migratetype, &did_some_progress,
2147 sync_migration);
2044 if (page) 2148 if (page)
2045 goto got_pg; 2149 goto got_pg;
2150 sync_migration = true;
2046 2151
2047 /* Try direct reclaim and then allocating */ 2152 /* Try direct reclaim and then allocating */
2048 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2153 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2094,18 +2199,26 @@ rebalance:
2094 pages_reclaimed += did_some_progress; 2199 pages_reclaimed += did_some_progress;
2095 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 2200 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
2096 /* Wait for some write requests to complete then retry */ 2201 /* Wait for some write requests to complete then retry */
2097 congestion_wait(BLK_RW_ASYNC, HZ/50); 2202 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2098 goto rebalance; 2203 goto rebalance;
2204 } else {
2205 /*
2206 * High-order allocations do not necessarily loop after
2207 * direct reclaim and reclaim/compaction depends on compaction
2208 * being called after reclaim so call directly if necessary
2209 */
2210 page = __alloc_pages_direct_compact(gfp_mask, order,
2211 zonelist, high_zoneidx,
2212 nodemask,
2213 alloc_flags, preferred_zone,
2214 migratetype, &did_some_progress,
2215 sync_migration);
2216 if (page)
2217 goto got_pg;
2099 } 2218 }
2100 2219
2101nopage: 2220nopage:
2102 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2221 warn_alloc_failed(gfp_mask, order, NULL);
2103 printk(KERN_WARNING "%s: page allocation failure."
2104 " order:%d, mode:0x%x\n",
2105 p->comm, order, gfp_mask);
2106 dump_stack();
2107 show_mem();
2108 }
2109 return page; 2222 return page;
2110got_pg: 2223got_pg:
2111 if (kmemcheck_enabled) 2224 if (kmemcheck_enabled)
@@ -2145,7 +2258,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2145 2258
2146 get_mems_allowed(); 2259 get_mems_allowed();
2147 /* The preferred zone is used for statistics later */ 2260 /* The preferred zone is used for statistics later */
2148 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); 2261 first_zones_zonelist(zonelist, high_zoneidx,
2262 nodemask ? : &cpuset_current_mems_allowed,
2263 &preferred_zone);
2149 if (!preferred_zone) { 2264 if (!preferred_zone) {
2150 put_mems_allowed(); 2265 put_mems_allowed();
2151 return NULL; 2266 return NULL;
@@ -2224,6 +2339,21 @@ void free_pages(unsigned long addr, unsigned int order)
2224 2339
2225EXPORT_SYMBOL(free_pages); 2340EXPORT_SYMBOL(free_pages);
2226 2341
2342static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2343{
2344 if (addr) {
2345 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2346 unsigned long used = addr + PAGE_ALIGN(size);
2347
2348 split_page(virt_to_page((void *)addr), order);
2349 while (used < alloc_end) {
2350 free_page(used);
2351 used += PAGE_SIZE;
2352 }
2353 }
2354 return (void *)addr;
2355}
2356
2227/** 2357/**
2228 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 2358 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
2229 * @size: the number of bytes to allocate 2359 * @size: the number of bytes to allocate
@@ -2243,22 +2373,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2243 unsigned long addr; 2373 unsigned long addr;
2244 2374
2245 addr = __get_free_pages(gfp_mask, order); 2375 addr = __get_free_pages(gfp_mask, order);
2246 if (addr) { 2376 return make_alloc_exact(addr, order, size);
2247 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2248 unsigned long used = addr + PAGE_ALIGN(size);
2249
2250 split_page(virt_to_page((void *)addr), order);
2251 while (used < alloc_end) {
2252 free_page(used);
2253 used += PAGE_SIZE;
2254 }
2255 }
2256
2257 return (void *)addr;
2258} 2377}
2259EXPORT_SYMBOL(alloc_pages_exact); 2378EXPORT_SYMBOL(alloc_pages_exact);
2260 2379
2261/** 2380/**
2381 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2382 * pages on a node.
2383 * @nid: the preferred node ID where memory should be allocated
2384 * @size: the number of bytes to allocate
2385 * @gfp_mask: GFP flags for the allocation
2386 *
2387 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2388 * back.
2389 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2390 * but is not exact.
2391 */
2392void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2393{
2394 unsigned order = get_order(size);
2395 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2396 if (!p)
2397 return NULL;
2398 return make_alloc_exact((unsigned long)page_address(p), order, size);
2399}
2400EXPORT_SYMBOL(alloc_pages_exact_nid);
2401
2402/**
2262 * free_pages_exact - release memory allocated via alloc_pages_exact() 2403 * free_pages_exact - release memory allocated via alloc_pages_exact()
2263 * @virt: the value returned by alloc_pages_exact. 2404 * @virt: the value returned by alloc_pages_exact.
2264 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2405 * @size: size of allocation, same value as passed to alloc_pages_exact().
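
A short usage sketch of the helper added above; the node id, buffer size and error handling are illustrative only, not taken from the patch.

    #include <linux/errno.h>
    #include <linux/gfp.h>

    static void *example_buf;

    static int example_alloc(void)
    {
        /* 20 KB of physically contiguous memory, preferring node 0;
         * the pages beyond PAGE_ALIGN(20 KB) are trimmed off. */
        example_buf = alloc_pages_exact_nid(0, 20 * 1024, GFP_KERNEL);
        if (!example_buf)
            return -ENOMEM;
        return 0;
    }

    static void example_release(void)
    {
        /* must pass the same size that was requested */
        free_pages_exact(example_buf, 20 * 1024);
    }
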
@@ -2352,19 +2493,41 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2352} 2493}
2353#endif 2494#endif
2354 2495
2496/*
2497 * Determine whether the node should be displayed or not, depending on whether
2498 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2499 */
2500bool skip_free_areas_node(unsigned int flags, int nid)
2501{
2502 bool ret = false;
2503
2504 if (!(flags & SHOW_MEM_FILTER_NODES))
2505 goto out;
2506
2507 get_mems_allowed();
2508 ret = !node_isset(nid, cpuset_current_mems_allowed);
2509 put_mems_allowed();
2510out:
2511 return ret;
2512}
2513
2355#define K(x) ((x) << (PAGE_SHIFT-10)) 2514#define K(x) ((x) << (PAGE_SHIFT-10))
2356 2515
2357/* 2516/*
2358 * Show free area list (used inside shift_scroll-lock stuff) 2517 * Show free area list (used inside shift_scroll-lock stuff)
2359 * We also calculate the percentage fragmentation. We do this by counting the 2518 * We also calculate the percentage fragmentation. We do this by counting the
2360 * memory on each free list with the exception of the first item on the list. 2519 * memory on each free list with the exception of the first item on the list.
2520 * Suppresses nodes that are not allowed by current's cpuset if
2521 * SHOW_MEM_FILTER_NODES is passed.
2361 */ 2522 */
2362void show_free_areas(void) 2523void show_free_areas(unsigned int filter)
2363{ 2524{
2364 int cpu; 2525 int cpu;
2365 struct zone *zone; 2526 struct zone *zone;
2366 2527
2367 for_each_populated_zone(zone) { 2528 for_each_populated_zone(zone) {
2529 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2530 continue;
2368 show_node(zone); 2531 show_node(zone);
2369 printk("%s per-cpu:\n", zone->name); 2532 printk("%s per-cpu:\n", zone->name);
2370 2533
@@ -2406,6 +2569,8 @@ void show_free_areas(void)
2406 for_each_populated_zone(zone) { 2569 for_each_populated_zone(zone) {
2407 int i; 2570 int i;
2408 2571
2572 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2573 continue;
2409 show_node(zone); 2574 show_node(zone);
2410 printk("%s" 2575 printk("%s"
2411 " free:%lukB" 2576 " free:%lukB"
@@ -2436,7 +2601,7 @@ void show_free_areas(void)
2436 " all_unreclaimable? %s" 2601 " all_unreclaimable? %s"
2437 "\n", 2602 "\n",
2438 zone->name, 2603 zone->name,
2439 K(zone_nr_free_pages(zone)), 2604 K(zone_page_state(zone, NR_FREE_PAGES)),
2440 K(min_wmark_pages(zone)), 2605 K(min_wmark_pages(zone)),
2441 K(low_wmark_pages(zone)), 2606 K(low_wmark_pages(zone)),
2442 K(high_wmark_pages(zone)), 2607 K(high_wmark_pages(zone)),
@@ -2473,6 +2638,8 @@ void show_free_areas(void)
2473 for_each_populated_zone(zone) { 2638 for_each_populated_zone(zone) {
2474 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2639 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2475 2640
2641 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2642 continue;
2476 show_node(zone); 2643 show_node(zone);
2477 printk("%s: ", zone->name); 2644 printk("%s: ", zone->name);
2478 2645
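
With the new argument, callers decide whether the dump is filtered by the current task's cpuset; a minimal illustration of the two call styles (the surrounding error path is assumed, not shown in the patch):

    /* behave like the old show_free_areas(): report every populated zone */
    show_free_areas(0);

    /* report only zones on nodes allowed by current's cpuset, e.g. when
     * warning about an allocation that was confined to that cpuset */
    show_free_areas(SHOW_MEM_FILTER_NODES);
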
@@ -2579,9 +2746,16 @@ static int __parse_numa_zonelist_order(char *s)
2579 2746
2580static __init int setup_numa_zonelist_order(char *s) 2747static __init int setup_numa_zonelist_order(char *s)
2581{ 2748{
2582 if (s) 2749 int ret;
2583 return __parse_numa_zonelist_order(s); 2750
2584 return 0; 2751 if (!s)
2752 return 0;
2753
2754 ret = __parse_numa_zonelist_order(s);
2755 if (ret == 0)
2756 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
2757
2758 return ret;
2585} 2759}
2586early_param("numa_zonelist_order", setup_numa_zonelist_order); 2760early_param("numa_zonelist_order", setup_numa_zonelist_order);
2587 2761
@@ -3007,14 +3181,6 @@ static __init_refok int __build_all_zonelists(void *data)
3007 build_zonelist_cache(pgdat); 3181 build_zonelist_cache(pgdat);
3008 } 3182 }
3009 3183
3010#ifdef CONFIG_MEMORY_HOTPLUG
3011 /* Setup real pagesets for the new zone */
3012 if (data) {
3013 struct zone *zone = data;
3014 setup_zone_pageset(zone);
3015 }
3016#endif
3017
3018 /* 3184 /*
3019 * Initialize the boot_pagesets that are going to be used 3185 * Initialize the boot_pagesets that are going to be used
3020 * for bootstrapping processors. The real pagesets for 3186 * for bootstrapping processors. The real pagesets for
@@ -3052,7 +3218,7 @@ static __init_refok int __build_all_zonelists(void *data)
3052 * Called with zonelists_mutex held always 3218 * Called with zonelists_mutex held always
3053 * unless system_state == SYSTEM_BOOTING. 3219 * unless system_state == SYSTEM_BOOTING.
3054 */ 3220 */
3055void build_all_zonelists(void *data) 3221void __ref build_all_zonelists(void *data)
3056{ 3222{
3057 set_zonelist_order(); 3223 set_zonelist_order();
3058 3224
@@ -3063,7 +3229,11 @@ void build_all_zonelists(void *data)
3063 } else { 3229 } else {
3064 /* we have to stop all cpus to guarantee there is no user 3230 /* we have to stop all cpus to guarantee there is no user
3065 of zonelist */ 3231 of zonelist */
3066 stop_machine(__build_all_zonelists, data, NULL); 3232#ifdef CONFIG_MEMORY_HOTPLUG
3233 if (data)
3234 setup_zone_pageset((struct zone *)data);
3235#endif
3236 stop_machine(__build_all_zonelists, NULL, NULL);
3067 /* cpuset refresh routine should be here */ 3237 /* cpuset refresh routine should be here */
3068 } 3238 }
3069 vm_total_pages = nr_free_pagecache_pages(); 3239 vm_total_pages = nr_free_pagecache_pages();
@@ -3159,6 +3329,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
3159#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3329#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3160 3330
3161/* 3331/*
3332 * Check if a pageblock contains reserved pages
3333 */
3334static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3335{
3336 unsigned long pfn;
3337
3338 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3339 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3340 return 1;
3341 }
3342 return 0;
3343}
3344
3345/*
3162 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3346 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3163 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3347 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3164 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3348 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
@@ -3167,7 +3351,7 @@ static inline unsigned long wait_table_bits(unsigned long size)
3167 */ 3351 */
3168static void setup_zone_migrate_reserve(struct zone *zone) 3352static void setup_zone_migrate_reserve(struct zone *zone)
3169{ 3353{
3170 unsigned long start_pfn, pfn, end_pfn; 3354 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3171 struct page *page; 3355 struct page *page;
3172 unsigned long block_migratetype; 3356 unsigned long block_migratetype;
3173 int reserve; 3357 int reserve;
@@ -3197,7 +3381,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3197 continue; 3381 continue;
3198 3382
3199 /* Blocks with reserved pages will never free, skip them. */ 3383 /* Blocks with reserved pages will never free, skip them. */
3200 if (PageReserved(page)) 3384 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3385 if (pageblock_is_reserved(pfn, block_end_pfn))
3201 continue; 3386 continue;
3202 3387
3203 block_migratetype = get_pageblock_migratetype(page); 3388 block_migratetype = get_pageblock_migratetype(page);
@@ -3386,7 +3571,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3386 pcp->batch = PAGE_SHIFT * 8; 3571 pcp->batch = PAGE_SHIFT * 8;
3387} 3572}
3388 3573
3389static __meminit void setup_zone_pageset(struct zone *zone) 3574static void setup_zone_pageset(struct zone *zone)
3390{ 3575{
3391 int cpu; 3576 int cpu;
3392 3577
@@ -3436,7 +3621,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3436 3621
3437 if (!slab_is_available()) { 3622 if (!slab_is_available()) {
3438 zone->wait_table = (wait_queue_head_t *) 3623 zone->wait_table = (wait_queue_head_t *)
3439 alloc_bootmem_node(pgdat, alloc_size); 3624 alloc_bootmem_node_nopanic(pgdat, alloc_size);
3440 } else { 3625 } else {
3441 /* 3626 /*
3442 * This case means that a zone whose size was 0 gets new memory 3627 * This case means that a zone whose size was 0 gets new memory
@@ -3636,68 +3821,87 @@ void __init free_bootmem_with_active_regions(int nid,
3636 } 3821 }
3637} 3822}
3638 3823
3639int __init add_from_early_node_map(struct range *range, int az, 3824#ifdef CONFIG_HAVE_MEMBLOCK
3640 int nr_range, int nid) 3825/*
3826 * Basic iterator support. Return the last range of PFNs for a node
3827 * Note: nid == MAX_NUMNODES returns last region regardless of node
3828 */
3829static int __meminit last_active_region_index_in_nid(int nid)
3641{ 3830{
3642 int i; 3831 int i;
3643 u64 start, end;
3644 3832
3645 /* need to go over early_node_map to find out good range for node */ 3833 for (i = nr_nodemap_entries - 1; i >= 0; i--)
3646 for_each_active_range_index_in_nid(i, nid) { 3834 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3647 start = early_node_map[i].start_pfn; 3835 return i;
3648 end = early_node_map[i].end_pfn; 3836
3649 nr_range = add_range(range, az, nr_range, start, end); 3837 return -1;
3650 }
3651 return nr_range;
3652} 3838}
3653 3839
3654#ifdef CONFIG_NO_BOOTMEM 3840/*
3655void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, 3841 * Basic iterator support. Return the previous active range of PFNs for a node
3842 * Note: nid == MAX_NUMNODES returns next region regardless of node
3843 */
3844static int __meminit previous_active_region_index_in_nid(int index, int nid)
3845{
3846 for (index = index - 1; index >= 0; index--)
3847 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3848 return index;
3849
3850 return -1;
3851}
3852
3853#define for_each_active_range_index_in_nid_reverse(i, nid) \
3854 for (i = last_active_region_index_in_nid(nid); i != -1; \
3855 i = previous_active_region_index_in_nid(i, nid))
3856
3857u64 __init find_memory_core_early(int nid, u64 size, u64 align,
3656 u64 goal, u64 limit) 3858 u64 goal, u64 limit)
3657{ 3859{
3658 int i; 3860 int i;
3659 void *ptr;
3660
3661 if (limit > get_max_mapped())
3662 limit = get_max_mapped();
3663 3861
3664 /* need to go over early_node_map to find out good range for node */ 3862 /* Need to go over early_node_map to find out good range for node */
3665 for_each_active_range_index_in_nid(i, nid) { 3863 for_each_active_range_index_in_nid_reverse(i, nid) {
3666 u64 addr; 3864 u64 addr;
3667 u64 ei_start, ei_last; 3865 u64 ei_start, ei_last;
3866 u64 final_start, final_end;
3668 3867
3669 ei_last = early_node_map[i].end_pfn; 3868 ei_last = early_node_map[i].end_pfn;
3670 ei_last <<= PAGE_SHIFT; 3869 ei_last <<= PAGE_SHIFT;
3671 ei_start = early_node_map[i].start_pfn; 3870 ei_start = early_node_map[i].start_pfn;
3672 ei_start <<= PAGE_SHIFT; 3871 ei_start <<= PAGE_SHIFT;
3673 addr = find_early_area(ei_start, ei_last,
3674 goal, limit, size, align);
3675 3872
3676 if (addr == -1ULL) 3873 final_start = max(ei_start, goal);
3874 final_end = min(ei_last, limit);
3875
3876 if (final_start >= final_end)
3677 continue; 3877 continue;
3678 3878
3679#if 0 3879 addr = memblock_find_in_range(final_start, final_end, size, align);
3680 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3681 nid,
3682 ei_start, ei_last, goal, limit, size,
3683 align, addr);
3684#endif
3685 3880
3686 ptr = phys_to_virt(addr); 3881 if (addr == MEMBLOCK_ERROR)
3687 memset(ptr, 0, size); 3882 continue;
3688 reserve_early_without_check(addr, addr + size, "BOOTMEM"); 3883
3689 /* 3884 return addr;
3690 * The min_count is set to 0 so that bootmem allocated blocks
3691 * are never reported as leaks.
3692 */
3693 kmemleak_alloc(ptr, size, 0, 0);
3694 return ptr;
3695 } 3885 }
3696 3886
3697 return NULL; 3887 return MEMBLOCK_ERROR;
3698} 3888}
3699#endif 3889#endif
3700 3890
3891int __init add_from_early_node_map(struct range *range, int az,
3892 int nr_range, int nid)
3893{
3894 int i;
3895 u64 start, end;
3896
3897 /* need to go over early_node_map to find out good range for node */
3898 for_each_active_range_index_in_nid(i, nid) {
3899 start = early_node_map[i].start_pfn;
3900 end = early_node_map[i].end_pfn;
3901 nr_range = add_range(range, az, nr_range, start, end);
3902 }
3903 return nr_range;
3904}
3701 3905
3702void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3906void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3703{ 3907{
@@ -3779,7 +3983,7 @@ static void __init find_usable_zone_for_movable(void)
3779 3983
3780/* 3984/*
3781 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 3985 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
3782 * because it is sized independant of architecture. Unlike the other zones, 3986 * because it is sized independent of architecture. Unlike the other zones,
3783 * the starting point for ZONE_MOVABLE is not fixed. It may be different 3987 * the starting point for ZONE_MOVABLE is not fixed. It may be different
3784 * in each node depending on the size of each node and how evenly kernelcore 3988 * in each node depending on the size of each node and how evenly kernelcore
3785 * is distributed. This helper function adjusts the zone ranges 3989 * is distributed. This helper function adjusts the zone ranges
@@ -3994,10 +4198,11 @@ static void __init setup_usemap(struct pglist_data *pgdat,
3994 unsigned long usemapsize = usemap_size(zonesize); 4198 unsigned long usemapsize = usemap_size(zonesize);
3995 zone->pageblock_flags = NULL; 4199 zone->pageblock_flags = NULL;
3996 if (usemapsize) 4200 if (usemapsize)
3997 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 4201 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4202 usemapsize);
3998} 4203}
3999#else 4204#else
4000static void inline setup_usemap(struct pglist_data *pgdat, 4205static inline void setup_usemap(struct pglist_data *pgdat,
4001 struct zone *zone, unsigned long zonesize) {} 4206 struct zone *zone, unsigned long zonesize) {}
4002#endif /* CONFIG_SPARSEMEM */ 4207#endif /* CONFIG_SPARSEMEM */
4003 4208
@@ -4114,10 +4319,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4114 zone->zone_pgdat = pgdat; 4319 zone->zone_pgdat = pgdat;
4115 4320
4116 zone_pcp_init(zone); 4321 zone_pcp_init(zone);
4117 for_each_lru(l) { 4322 for_each_lru(l)
4118 INIT_LIST_HEAD(&zone->lru[l].list); 4323 INIT_LIST_HEAD(&zone->lru[l].list);
4119 zone->reclaim_stat.nr_saved_scan[l] = 0;
4120 }
4121 zone->reclaim_stat.recent_rotated[0] = 0; 4324 zone->reclaim_stat.recent_rotated[0] = 0;
4122 zone->reclaim_stat.recent_rotated[1] = 0; 4325 zone->reclaim_stat.recent_rotated[1] = 0;
4123 zone->reclaim_stat.recent_scanned[0] = 0; 4326 zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4160,7 +4363,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4160 size = (end - start) * sizeof(struct page); 4363 size = (end - start) * sizeof(struct page);
4161 map = alloc_remap(pgdat->node_id, size); 4364 map = alloc_remap(pgdat->node_id, size);
4162 if (!map) 4365 if (!map)
4163 map = alloc_bootmem_node(pgdat, size); 4366 map = alloc_bootmem_node_nopanic(pgdat, size);
4164 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4367 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4165 } 4368 }
4166#ifndef CONFIG_NEED_MULTIPLE_NODES 4369#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -4732,15 +4935,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4732 dma_reserve = new_dma_reserve; 4935 dma_reserve = new_dma_reserve;
4733} 4936}
4734 4937
4735#ifndef CONFIG_NEED_MULTIPLE_NODES
4736struct pglist_data __refdata contig_page_data = {
4737#ifndef CONFIG_NO_BOOTMEM
4738 .bdata = &bootmem_node_data[0]
4739#endif
4740 };
4741EXPORT_SYMBOL(contig_page_data);
4742#endif
4743
4744void __init free_area_init(unsigned long *zones_size) 4938void __init free_area_init(unsigned long *zones_size)
4745{ 4939{
4746 free_area_init_node(0, zones_size, 4940 free_area_init_node(0, zones_size,
@@ -4934,7 +5128,7 @@ void setup_per_zone_wmarks(void)
4934 * 1TB 101 10GB 5128 * 1TB 101 10GB
4935 * 10TB 320 32GB 5129 * 10TB 320 32GB
4936 */ 5130 */
4937void calculate_zone_inactive_ratio(struct zone *zone) 5131static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
4938{ 5132{
4939 unsigned int gb, ratio; 5133 unsigned int gb, ratio;
4940 5134
@@ -4948,7 +5142,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
4948 zone->inactive_ratio = ratio; 5142 zone->inactive_ratio = ratio;
4949} 5143}
4950 5144
4951static void __init setup_per_zone_inactive_ratio(void) 5145static void __meminit setup_per_zone_inactive_ratio(void)
4952{ 5146{
4953 struct zone *zone; 5147 struct zone *zone;
4954 5148
@@ -4980,7 +5174,7 @@ static void __init setup_per_zone_inactive_ratio(void)
4980 * 8192MB: 11584k 5174 * 8192MB: 11584k
4981 * 16384MB: 16384k 5175 * 16384MB: 16384k
4982 */ 5176 */
4983static int __init init_per_zone_wmark_min(void) 5177int __meminit init_per_zone_wmark_min(void)
4984{ 5178{
4985 unsigned long lowmem_kbytes; 5179 unsigned long lowmem_kbytes;
4986 5180
@@ -4992,6 +5186,7 @@ static int __init init_per_zone_wmark_min(void)
4992 if (min_free_kbytes > 65536) 5186 if (min_free_kbytes > 65536)
4993 min_free_kbytes = 65536; 5187 min_free_kbytes = 65536;
4994 setup_per_zone_wmarks(); 5188 setup_per_zone_wmarks();
5189 refresh_zone_stat_thresholds();
4995 setup_per_zone_lowmem_reserve(); 5190 setup_per_zone_lowmem_reserve();
4996 setup_per_zone_inactive_ratio(); 5191 setup_per_zone_inactive_ratio();
4997 return 0; 5192 return 0;
@@ -5281,26 +5476,71 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5281 * page allocater never alloc memory from ISOLATE block. 5476 * page allocater never alloc memory from ISOLATE block.
5282 */ 5477 */
5283 5478
5479static int
5480__count_immobile_pages(struct zone *zone, struct page *page, int count)
5481{
5482 unsigned long pfn, iter, found;
5483 /*
5484 * For avoiding noise data, lru_add_drain_all() should be called
5485 * If ZONE_MOVABLE, the zone never contains immobile pages
5486 */
5487 if (zone_idx(zone) == ZONE_MOVABLE)
5488 return true;
5489
5490 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
5491 return true;
5492
5493 pfn = page_to_pfn(page);
5494 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5495 unsigned long check = pfn + iter;
5496
5497 if (!pfn_valid_within(check))
5498 continue;
5499
5500 page = pfn_to_page(check);
5501 if (!page_count(page)) {
5502 if (PageBuddy(page))
5503 iter += (1 << page_order(page)) - 1;
5504 continue;
5505 }
5506 if (!PageLRU(page))
5507 found++;
5508 /*
5509 * If there are RECLAIMABLE pages, we need to check it.
5510 * But now, memory offline itself doesn't call shrink_slab()
 5511 * and this still needs to be fixed.
5512 */
5513 /*
 5514 * If the page is not RAM, page_count() should be 0.
 5515 * We don't need any further check. This is a _used_ not-movable page.
5516 *
5517 * The problematic thing here is PG_reserved pages. PG_reserved
5518 * is set to both of a memory hole page and a _used_ kernel
5519 * page at boot.
5520 */
5521 if (found > count)
5522 return false;
5523 }
5524 return true;
5525}
5526
5527bool is_pageblock_removable_nolock(struct page *page)
5528{
5529 struct zone *zone = page_zone(page);
5530 return __count_immobile_pages(zone, page, 0);
5531}
5532
5284int set_migratetype_isolate(struct page *page) 5533int set_migratetype_isolate(struct page *page)
5285{ 5534{
5286 struct zone *zone; 5535 struct zone *zone;
5287 struct page *curr_page; 5536 unsigned long flags, pfn;
5288 unsigned long flags, pfn, iter;
5289 unsigned long immobile = 0;
5290 struct memory_isolate_notify arg; 5537 struct memory_isolate_notify arg;
5291 int notifier_ret; 5538 int notifier_ret;
5292 int ret = -EBUSY; 5539 int ret = -EBUSY;
5293 int zone_idx;
5294 5540
5295 zone = page_zone(page); 5541 zone = page_zone(page);
5296 zone_idx = zone_idx(zone);
5297 5542
5298 spin_lock_irqsave(&zone->lock, flags); 5543 spin_lock_irqsave(&zone->lock, flags);
5299 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5300 zone_idx == ZONE_MOVABLE) {
5301 ret = 0;
5302 goto out;
5303 }
5304 5544
5305 pfn = page_to_pfn(page); 5545 pfn = page_to_pfn(page);
5306 arg.start_pfn = pfn; 5546 arg.start_pfn = pfn;
@@ -5320,23 +5560,20 @@ int set_migratetype_isolate(struct page *page)
5320 */ 5560 */
5321 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); 5561 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5322 notifier_ret = notifier_to_errno(notifier_ret); 5562 notifier_ret = notifier_to_errno(notifier_ret);
5323 if (notifier_ret || !arg.pages_found) 5563 if (notifier_ret)
5324 goto out; 5564 goto out;
5325 5565 /*
5326 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { 5566 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5327 if (!pfn_valid_within(pfn)) 5567 * We just check MOVABLE pages.
5328 continue; 5568 */
5329 5569 if (__count_immobile_pages(zone, page, arg.pages_found))
5330 curr_page = pfn_to_page(iter);
5331 if (!page_count(curr_page) || PageLRU(curr_page))
5332 continue;
5333
5334 immobile++;
5335 }
5336
5337 if (arg.pages_found == immobile)
5338 ret = 0; 5570 ret = 0;
5339 5571
5572 /*
 5573 * immobile means "not-on-lru" pages. If immobile is larger than
5574 * removable-by-driver pages reported by notifier, we'll fail.
5575 */
5576
5340out: 5577out:
5341 if (!ret) { 5578 if (!ret) {
5342 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5579 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
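
The __count_immobile_pages()/is_pageblock_removable_nolock() pair lets code outside the allocator, such as memory offlining, probe whether a pageblock contains pinned non-LRU pages without taking zone->lock. A rough, illustrative caller; the range walk and return convention are assumptions, not the real hotplug code:

    static bool range_looks_removable(unsigned long start_pfn,
                                      unsigned long end_pfn)
    {
        unsigned long pfn;

        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
            if (!pfn_valid(pfn))
                continue;
            if (!is_pageblock_removable_nolock(pfn_to_page(pfn)))
                return false;   /* pinned, non-LRU page found */
        }
        return true;            /* everything movable or free */
    }
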
@@ -5455,7 +5692,6 @@ static struct trace_print_flags pageflag_names[] = {
5455 {1UL << PG_swapcache, "swapcache" }, 5692 {1UL << PG_swapcache, "swapcache" },
5456 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5693 {1UL << PG_mappedtodisk, "mappedtodisk" },
5457 {1UL << PG_reclaim, "reclaim" }, 5694 {1UL << PG_reclaim, "reclaim" },
5458 {1UL << PG_buddy, "buddy" },
5459 {1UL << PG_swapbacked, "swapbacked" }, 5695 {1UL << PG_swapbacked, "swapbacked" },
5460 {1UL << PG_unevictable, "unevictable" }, 5696 {1UL << PG_unevictable, "unevictable" },
5461#ifdef CONFIG_MMU 5697#ifdef CONFIG_MMU
@@ -5503,7 +5739,8 @@ void dump_page(struct page *page)
5503{ 5739{
5504 printk(KERN_ALERT 5740 printk(KERN_ALERT
5505 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 5741 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5506 page, page_count(page), page_mapcount(page), 5742 page, atomic_read(&page->_count), page_mapcount(page),
5507 page->mapping, page->index); 5743 page->mapping, page->index);
5508 dump_page_flags(page->flags); 5744 dump_page_flags(page->flags);
5745 mem_cgroup_print_bad_page(page);
5509} 5746}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5bffada7cde1..53bffc6c293e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -11,12 +11,11 @@
11#include <linux/swapops.h> 11#include <linux/swapops.h>
12#include <linux/kmemleak.h> 12#include <linux/kmemleak.h>
13 13
14static void __meminit 14static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
15__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
16{ 15{
17 pc->flags = 0; 16 pc->flags = 0;
17 set_page_cgroup_array_id(pc, id);
18 pc->mem_cgroup = NULL; 18 pc->mem_cgroup = NULL;
19 pc->page = pfn_to_page(pfn);
20 INIT_LIST_HEAD(&pc->lru); 19 INIT_LIST_HEAD(&pc->lru);
21} 20}
22static unsigned long total_usage; 21static unsigned long total_usage;
@@ -43,6 +42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
43 return base + offset; 42 return base + offset;
44} 43}
45 44
45struct page *lookup_cgroup_page(struct page_cgroup *pc)
46{
47 unsigned long pfn;
48 struct page *page;
49 pg_data_t *pgdat;
50
51 pgdat = NODE_DATA(page_cgroup_array_id(pc));
52 pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
53 page = pfn_to_page(pfn);
54 VM_BUG_ON(pc != lookup_page_cgroup(page));
55 return page;
56}
57
46static int __init alloc_node_page_cgroup(int nid) 58static int __init alloc_node_page_cgroup(int nid)
47{ 59{
48 struct page_cgroup *base, *pc; 60 struct page_cgroup *base, *pc;
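
Both the flat and the sparsemem variants of the new lookup_cgroup_page() recover the owning struct page from a page_cgroup through the array id recorded by set_page_cgroup_array_id(), so the two lookups are exact inverses. A minimal, illustrative round trip for some page known to have a page_cgroup:

    struct page_cgroup *pc = lookup_page_cgroup(page);
    struct page *back = lookup_cgroup_page(pc);

    /* holds by construction; the VM_BUG_ON() inside lookup_cgroup_page()
     * asserts exactly this invariant */
    BUG_ON(back != page);
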
@@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid)
63 return -ENOMEM; 75 return -ENOMEM;
64 for (index = 0; index < nr_pages; index++) { 76 for (index = 0; index < nr_pages; index++) {
65 pc = base + index; 77 pc = base + index;
66 __init_page_cgroup(pc, start_pfn + index); 78 init_page_cgroup(pc, nid);
67 } 79 }
68 NODE_DATA(nid)->node_page_cgroup = base; 80 NODE_DATA(nid)->node_page_cgroup = base;
69 total_usage += table_size; 81 total_usage += table_size;
@@ -105,46 +117,74 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
105 return section->page_cgroup + pfn; 117 return section->page_cgroup + pfn;
106} 118}
107 119
108/* __alloc_bootmem...() is protected by !slab_available() */ 120struct page *lookup_cgroup_page(struct page_cgroup *pc)
109static int __init_refok init_section_page_cgroup(unsigned long pfn)
110{ 121{
111 struct mem_section *section = __pfn_to_section(pfn); 122 struct mem_section *section;
112 struct page_cgroup *base, *pc; 123 struct page *page;
113 unsigned long table_size; 124 unsigned long nr;
114 int nid, index; 125
115 126 nr = page_cgroup_array_id(pc);
116 if (!section->page_cgroup) { 127 section = __nr_to_section(nr);
117 nid = page_to_nid(pfn_to_page(pfn)); 128 page = pfn_to_page(pc - section->page_cgroup);
118 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 129 VM_BUG_ON(pc != lookup_page_cgroup(page));
119 VM_BUG_ON(!slab_is_available()); 130 return page;
120 if (node_state(nid, N_HIGH_MEMORY)) { 131}
121 base = kmalloc_node(table_size, 132
122 GFP_KERNEL | __GFP_NOWARN, nid); 133static void *__meminit alloc_page_cgroup(size_t size, int nid)
123 if (!base) 134{
124 base = vmalloc_node(table_size, nid); 135 void *addr = NULL;
125 } else { 136
126 base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); 137 addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
127 if (!base) 138 if (addr)
128 base = vmalloc(table_size); 139 return addr;
129 } 140
130 /* 141 if (node_state(nid, N_HIGH_MEMORY))
131 * The value stored in section->page_cgroup is (base - pfn) 142 addr = vmalloc_node(size, nid);
132 * and it does not point to the memory block allocated above, 143 else
133 * causing kmemleak false positives. 144 addr = vmalloc(size);
134 */ 145
135 kmemleak_not_leak(base); 146 return addr;
147}
148
149#ifdef CONFIG_MEMORY_HOTPLUG
150static void free_page_cgroup(void *addr)
151{
152 if (is_vmalloc_addr(addr)) {
153 vfree(addr);
136 } else { 154 } else {
137 /* 155 struct page *page = virt_to_page(addr);
138 * We don't have to allocate page_cgroup again, but 156 size_t table_size =
139 * address of memmap may be changed. So, we have to initialize 157 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
140 * again. 158
141 */ 159 BUG_ON(PageReserved(page));
142 base = section->page_cgroup + pfn; 160 free_pages_exact(addr, table_size);
143 table_size = 0;
144 /* check address of memmap is changed or not. */
145 if (base->page == pfn_to_page(pfn))
146 return 0;
147 } 161 }
162}
163#endif
164
165static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
166{
167 struct page_cgroup *base, *pc;
168 struct mem_section *section;
169 unsigned long table_size;
170 unsigned long nr;
171 int index;
172
173 nr = pfn_to_section_nr(pfn);
174 section = __nr_to_section(nr);
175
176 if (section->page_cgroup)
177 return 0;
178
179 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
180 base = alloc_page_cgroup(table_size, nid);
181
182 /*
183 * The value stored in section->page_cgroup is (base - pfn)
184 * and it does not point to the memory block allocated above,
185 * causing kmemleak false positives.
186 */
187 kmemleak_not_leak(base);
148 188
149 if (!base) { 189 if (!base) {
150 printk(KERN_ERR "page cgroup allocation failure\n"); 190 printk(KERN_ERR "page cgroup allocation failure\n");
@@ -153,9 +193,13 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
153 193
154 for (index = 0; index < PAGES_PER_SECTION; index++) { 194 for (index = 0; index < PAGES_PER_SECTION; index++) {
155 pc = base + index; 195 pc = base + index;
156 __init_page_cgroup(pc, pfn + index); 196 init_page_cgroup(pc, nr);
157 } 197 }
158 198 /*
199 * The passed "pfn" may not be aligned to SECTION. For the calculation
200 * we need to apply a mask.
201 */
202 pfn &= PAGE_SECTION_MASK;
159 section->page_cgroup = base - pfn; 203 section->page_cgroup = base - pfn;
160 total_usage += table_size; 204 total_usage += table_size;
161 return 0; 205 return 0;
@@ -170,16 +214,8 @@ void __free_page_cgroup(unsigned long pfn)
170 if (!ms || !ms->page_cgroup) 214 if (!ms || !ms->page_cgroup)
171 return; 215 return;
172 base = ms->page_cgroup + pfn; 216 base = ms->page_cgroup + pfn;
173 if (is_vmalloc_addr(base)) { 217 free_page_cgroup(base);
174 vfree(base); 218 ms->page_cgroup = NULL;
175 ms->page_cgroup = NULL;
176 } else {
177 struct page *page = virt_to_page(base);
178 if (!PageReserved(page)) { /* Is bootmem ? */
179 kfree(base);
180 ms->page_cgroup = NULL;
181 }
182 }
183} 219}
184 220
185int __meminit online_page_cgroup(unsigned long start_pfn, 221int __meminit online_page_cgroup(unsigned long start_pfn,
@@ -192,10 +228,20 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
192 start = start_pfn & ~(PAGES_PER_SECTION - 1); 228 start = start_pfn & ~(PAGES_PER_SECTION - 1);
193 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 229 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
194 230
231 if (nid == -1) {
232 /*
233 * In this case, "nid" already exists and contains valid memory.
234 * "start_pfn" passed to us is a pfn which is an arg for
235 * online__pages(), and start_pfn should exist.
236 */
237 nid = pfn_to_nid(start_pfn);
238 VM_BUG_ON(!node_state(nid, N_ONLINE));
239 }
240
195 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { 241 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
196 if (!pfn_present(pfn)) 242 if (!pfn_present(pfn))
197 continue; 243 continue;
198 fail = init_section_page_cgroup(pfn); 244 fail = init_section_page_cgroup(pfn, nid);
199 } 245 }
200 if (!fail) 246 if (!fail)
201 return 0; 247 return 0;
@@ -243,12 +289,7 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
243 break; 289 break;
244 } 290 }
245 291
246 if (ret) 292 return notifier_from_errno(ret);
247 ret = notifier_from_errno(ret);
248 else
249 ret = NOTIFY_OK;
250
251 return ret;
252} 293}
253 294
254#endif 295#endif
@@ -256,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
256void __init page_cgroup_init(void) 297void __init page_cgroup_init(void)
257{ 298{
258 unsigned long pfn; 299 unsigned long pfn;
259 int fail = 0; 300 int nid;
260 301
261 if (mem_cgroup_disabled()) 302 if (mem_cgroup_disabled())
262 return; 303 return;
263 304
264 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { 305 for_each_node_state(nid, N_HIGH_MEMORY) {
265 if (!pfn_present(pfn)) 306 unsigned long start_pfn, end_pfn;
266 continue; 307
267 fail = init_section_page_cgroup(pfn); 308 start_pfn = node_start_pfn(nid);
268 } 309 end_pfn = node_end_pfn(nid);
269 if (fail) { 310 /*
270 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); 311 * start_pfn and end_pfn may not be aligned to SECTION and the
271 panic("Out of memory"); 312 * page->flags of out of node pages are not initialized. So we
272 } else { 313 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
273 hotplug_memory_notifier(page_cgroup_callback, 0); 314 */
315 for (pfn = start_pfn;
316 pfn < end_pfn;
317 pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
318
319 if (!pfn_valid(pfn))
320 continue;
321 /*
 322 * Nodes' pfns can overlap.
 323 * We know some architectures can have a node layout such as
324 * -------------pfn-------------->
325 * N0 | N1 | N2 | N0 | N1 | N2|....
326 */
327 if (pfn_to_nid(pfn) != nid)
328 continue;
329 if (init_section_page_cgroup(pfn, nid))
330 goto oom;
331 }
274 } 332 }
333 hotplug_memory_notifier(page_cgroup_callback, 0);
275 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); 334 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
276 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" 335 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
277 " want memory cgroups\n"); 336 "don't want memory cgroups\n");
337 return;
338oom:
339 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
340 panic("Out of memory");
278} 341}
279 342
280void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) 343void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
@@ -349,7 +412,7 @@ not_enough_page:
349 * @new: new id 412 * @new: new id
350 * 413 *
351 * Returns old id at success, 0 at failure. 414 * Returns old id at success, 0 at failure.
352 * (There is no mem_cgroup useing 0 as its id) 415 * (There is no mem_cgroup using 0 as its id)
353 */ 416 */
354unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, 417unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
355 unsigned short old, unsigned short new) 418 unsigned short old, unsigned short new)
@@ -447,7 +510,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
447 if (!do_swap_account) 510 if (!do_swap_account)
448 return 0; 511 return 0;
449 512
450 length = ((max_pages/SC_PER_PAGE) + 1); 513 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
451 array_size = length * sizeof(void *); 514 array_size = length * sizeof(void *);
452 515
453 array = vmalloc(array_size); 516 array = vmalloc(array_size);
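
The switch to DIV_ROUND_UP() stops the map from growing by one extra page whenever max_pages is an exact multiple of SC_PER_PAGE. Assuming 4 KiB pages and a two-byte struct swap_cgroup, so SC_PER_PAGE == 2048, the two formulas compare as follows (numbers are for illustration only):

    /* max_pages == 4096 (exact multiple):                      */
    /*   old: (4096 / 2048) + 1        == 3 pages, one wasted   */
    /*   new: DIV_ROUND_UP(4096, 2048) == 2 pages               */
    /* max_pages == 4097:                                       */
    /*   old: (4097 / 2048) + 1        == 3 pages               */
    /*   new: DIV_ROUND_UP(4097, 2048) == 3 pages               */
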
@@ -464,8 +527,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
464 /* memory shortage */ 527 /* memory shortage */
465 ctrl->map = NULL; 528 ctrl->map = NULL;
466 ctrl->length = 0; 529 ctrl->length = 0;
467 vfree(array);
468 mutex_unlock(&swap_cgroup_mutex); 530 mutex_unlock(&swap_cgroup_mutex);
531 vfree(array);
469 goto nomem; 532 goto nomem;
470 } 533 }
471 mutex_unlock(&swap_cgroup_mutex); 534 mutex_unlock(&swap_cgroup_mutex);
@@ -480,7 +543,8 @@ nomem:
480 543
481void swap_cgroup_swapoff(int type) 544void swap_cgroup_swapoff(int type)
482{ 545{
483 int i; 546 struct page **map;
547 unsigned long i, length;
484 struct swap_cgroup_ctrl *ctrl; 548 struct swap_cgroup_ctrl *ctrl;
485 549
486 if (!do_swap_account) 550 if (!do_swap_account)
@@ -488,17 +552,20 @@ void swap_cgroup_swapoff(int type)
488 552
489 mutex_lock(&swap_cgroup_mutex); 553 mutex_lock(&swap_cgroup_mutex);
490 ctrl = &swap_cgroup_ctrl[type]; 554 ctrl = &swap_cgroup_ctrl[type];
491 if (ctrl->map) { 555 map = ctrl->map;
492 for (i = 0; i < ctrl->length; i++) { 556 length = ctrl->length;
493 struct page *page = ctrl->map[i]; 557 ctrl->map = NULL;
558 ctrl->length = 0;
559 mutex_unlock(&swap_cgroup_mutex);
560
561 if (map) {
562 for (i = 0; i < length; i++) {
563 struct page *page = map[i];
494 if (page) 564 if (page)
495 __free_page(page); 565 __free_page(page);
496 } 566 }
497 vfree(ctrl->map); 567 vfree(map);
498 ctrl->map = NULL;
499 ctrl->length = 0;
500 } 568 }
501 mutex_unlock(&swap_cgroup_mutex);
502} 569}
503 570
504#endif 571#endif
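
The reworked swapoff path detaches ctrl->map and ctrl->length while holding swap_cgroup_mutex and frees the pages only after dropping it, keeping the expensive loop out of the locked region. A generic sketch of that detach-then-free pattern; lock, obj and release() are placeholders, not the real swap_cgroup types:

    mutex_lock(&lock);
    tmp = obj->data;        /* detach while other users are excluded */
    obj->data = NULL;
    obj->len = 0;
    mutex_unlock(&lock);

    release(tmp);           /* heavy freeing done without the lock held */
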
diff --git a/mm/page_io.c b/mm/page_io.c
index 2dee975bf469..dc76b4d0611e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
106 goto out; 106 goto out;
107 } 107 }
108 if (wbc->sync_mode == WB_SYNC_ALL) 108 if (wbc->sync_mode == WB_SYNC_ALL)
109 rw |= REQ_SYNC | REQ_UNPLUG; 109 rw |= REQ_SYNC;
110 count_vm_event(PSWPOUT); 110 count_vm_event(PSWPOUT);
111 set_page_writeback(page); 111 set_page_writeback(page);
112 unlock_page(page); 112 unlock_page(page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5e0ffd967452..4ae42bb40892 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
86 * all pages in [start_pfn...end_pfn) must be in the same zone. 86 * all pages in [start_pfn...end_pfn) must be in the same zone.
87 * zone->lock must be held before call this. 87 * zone->lock must be held before call this.
88 * 88 *
89 * Returns 0 if all pages in the range is isolated. 89 * Returns 1 if all pages in the range is isolated.
90 */ 90 */
91static int 91static int
92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
@@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
119 struct zone *zone; 119 struct zone *zone;
120 int ret; 120 int ret;
121 121
122 pfn = start_pfn;
123 /* 122 /*
124 * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page 123 * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page
125 * is not aligned to pageblock_nr_pages. 124 * is not aligned to pageblock_nr_pages.
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 8b1a2ce21ee5..c3450d533611 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -33,18 +33,35 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
33 33
34 pmd = pmd_offset(pud, addr); 34 pmd = pmd_offset(pud, addr);
35 do { 35 do {
36again:
36 next = pmd_addr_end(addr, end); 37 next = pmd_addr_end(addr, end);
37 if (pmd_none_or_clear_bad(pmd)) { 38 if (pmd_none(*pmd)) {
38 if (walk->pte_hole) 39 if (walk->pte_hole)
39 err = walk->pte_hole(addr, next, walk); 40 err = walk->pte_hole(addr, next, walk);
40 if (err) 41 if (err)
41 break; 42 break;
42 continue; 43 continue;
43 } 44 }
45 /*
46 * This implies that each ->pmd_entry() handler
47 * needs to know about pmd_trans_huge() pmds
48 */
44 if (walk->pmd_entry) 49 if (walk->pmd_entry)
45 err = walk->pmd_entry(pmd, addr, next, walk); 50 err = walk->pmd_entry(pmd, addr, next, walk);
46 if (!err && walk->pte_entry) 51 if (err)
47 err = walk_pte_range(pmd, addr, next, walk); 52 break;
53
54 /*
55 * Check this here so we only break down trans_huge
56 * pages when we _need_ to
57 */
58 if (!walk->pte_entry)
59 continue;
60
61 split_huge_page_pmd(walk->mm, pmd);
62 if (pmd_none_or_clear_bad(pmd))
63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk);
48 if (err) 65 if (err)
49 break; 66 break;
50 } while (pmd++, addr = next, addr != end); 67 } while (pmd++, addr = next, addr != end);
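
As the new comment says, a handler registered through ->pmd_entry is now expected to recognise transparent-huge pmds itself, because the walker no longer clears them before calling it. A minimal, hypothetical handler:

    static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
                                 unsigned long next, struct mm_walk *walk)
    {
        if (pmd_trans_huge(*pmd)) {
            /* either account the whole huge mapping here, or split
             * it first: split_huge_page_pmd(walk->mm, pmd); */
            return 0;
        }
        /* regular pmd: the pte walk (->pte_entry) runs as before */
        return 0;
    }
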
@@ -139,7 +156,6 @@ int walk_page_range(unsigned long addr, unsigned long end,
139 pgd_t *pgd; 156 pgd_t *pgd;
140 unsigned long next; 157 unsigned long next;
141 int err = 0; 158 int err = 0;
142 struct vm_area_struct *vma;
143 159
144 if (addr >= end) 160 if (addr >= end)
145 return err; 161 return err;
@@ -149,15 +165,17 @@ int walk_page_range(unsigned long addr, unsigned long end,
149 165
150 pgd = pgd_offset(walk->mm, addr); 166 pgd = pgd_offset(walk->mm, addr);
151 do { 167 do {
168 struct vm_area_struct *uninitialized_var(vma);
169
152 next = pgd_addr_end(addr, end); 170 next = pgd_addr_end(addr, end);
153 171
172#ifdef CONFIG_HUGETLB_PAGE
154 /* 173 /*
155 * handle hugetlb vma individually because pagetable walk for 174 * handle hugetlb vma individually because pagetable walk for
156 * the hugetlb page is dependent on the architecture and 175 * the hugetlb page is dependent on the architecture and
157 * we can't handled it in the same manner as non-huge pages. 176 * we can't handled it in the same manner as non-huge pages.
158 */ 177 */
159 vma = find_vma(walk->mm, addr); 178 vma = find_vma(walk->mm, addr);
160#ifdef CONFIG_HUGETLB_PAGE
161 if (vma && is_vm_hugetlb_page(vma)) { 179 if (vma && is_vm_hugetlb_page(vma)) {
162 if (vma->vm_end < next) 180 if (vma->vm_end < next)
163 next = vma->vm_end; 181 next = vma->vm_end;
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index df680855540a..89633fefc6a2 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -27,7 +27,7 @@
27 * chunk size is not aligned. percpu-km code will whine about it. 27 * chunk size is not aligned. percpu-km code will whine about it.
28 */ 28 */
29 29
30#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 30#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
31#error "contiguous percpu allocation is incompatible with paged first chunk" 31#error "contiguous percpu allocation is incompatible with paged first chunk"
32#endif 32#endif
33 33
@@ -35,7 +35,11 @@
35 35
36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
37{ 37{
38 /* noop */ 38 unsigned int cpu;
39
40 for_each_possible_cpu(cpu)
41 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
42
39 return 0; 43 return 0;
40} 44}
41 45
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 7d9c1d0ebd3f..ea534960a04b 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
421 return NULL; 421 return NULL;
422 422
423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, 423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
424 pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); 424 pcpu_nr_groups, pcpu_atom_size);
425 if (!vms) { 425 if (!vms) {
426 pcpu_free_chunk(chunk); 426 pcpu_free_chunk(chunk);
427 return NULL; 427 return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index c76ef3891e0d..bf80e55dbed7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -31,7 +31,7 @@
31 * as small as 4 bytes. The allocator organizes chunks into lists 31 * as small as 4 bytes. The allocator organizes chunks into lists
32 * according to free size and tries to allocate from the fullest one. 32 * according to free size and tries to allocate from the fullest one.
33 * Each chunk keeps the maximum contiguous area size hint which is 33 * Each chunk keeps the maximum contiguous area size hint which is
34 * guaranteed to be eqaul to or larger than the maximum contiguous 34 * guaranteed to be equal to or larger than the maximum contiguous
35 * area in the chunk. This helps the allocator not to iterate the 35 * area in the chunk. This helps the allocator not to iterate the
36 * chunk maps unnecessarily. 36 * chunk maps unnecessarily.
37 * 37 *
@@ -76,6 +76,7 @@
76#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 76#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
77#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 77#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
78 78
79#ifdef CONFIG_SMP
79/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
80#ifndef __addr_to_pcpu_ptr 81#ifndef __addr_to_pcpu_ptr
81#define __addr_to_pcpu_ptr(addr) \ 82#define __addr_to_pcpu_ptr(addr) \
@@ -89,6 +90,11 @@
89 (unsigned long)pcpu_base_addr - \ 90 (unsigned long)pcpu_base_addr - \
90 (unsigned long)__per_cpu_start) 91 (unsigned long)__per_cpu_start)
91#endif 92#endif
93#else /* CONFIG_SMP */
94/* on UP, it's always identity mapped */
95#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
96#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
97#endif /* CONFIG_SMP */
92 98
93struct pcpu_chunk { 99struct pcpu_chunk {
94 struct list_head list; /* linked to pcpu_slot lists */ 100 struct list_head list; /* linked to pcpu_slot lists */
@@ -252,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
252 258
253/* 259/*
254 * (Un)populated page region iterators. Iterate over (un)populated 260 * (Un)populated page region iterators. Iterate over (un)populated
255 * page regions betwen @start and @end in @chunk. @rs and @re should 261 * page regions between @start and @end in @chunk. @rs and @re should
256 * be integer variables and will be set to start and end page index of 262 * be integer variables and will be set to start and end page index of
257 * the current region. 263 * the current region.
258 */ 264 */
@@ -287,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size)
287 293
288 if (size <= PAGE_SIZE) 294 if (size <= PAGE_SIZE)
289 return kzalloc(size, GFP_KERNEL); 295 return kzalloc(size, GFP_KERNEL);
290 else { 296 else
291 void *ptr = vmalloc(size); 297 return vzalloc(size);
292 if (ptr)
293 memset(ptr, 0, size);
294 return ptr;
295 }
296} 298}
297 299
298/** 300/**
@@ -340,7 +342,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
340 * @chunk: chunk of interest 342 * @chunk: chunk of interest
341 * 343 *
342 * Determine whether area map of @chunk needs to be extended to 344 * Determine whether area map of @chunk needs to be extended to
343 * accomodate a new allocation. 345 * accommodate a new allocation.
344 * 346 *
345 * CONTEXT: 347 * CONTEXT:
346 * pcpu_lock. 348 * pcpu_lock.
@@ -429,7 +431,7 @@ out_unlock:
429 * depending on @head, is reduced by @tail bytes and @tail byte block 431 * depending on @head, is reduced by @tail bytes and @tail byte block
430 * is inserted after the target block. 432 * is inserted after the target block.
431 * 433 *
432 * @chunk->map must have enough free slots to accomodate the split. 434 * @chunk->map must have enough free slots to accommodate the split.
433 * 435 *
434 * CONTEXT: 436 * CONTEXT:
435 * pcpu_lock. 437 * pcpu_lock.
@@ -820,8 +822,8 @@ fail_unlock_mutex:
820 * @size: size of area to allocate in bytes 822 * @size: size of area to allocate in bytes
821 * @align: alignment of area (max PAGE_SIZE) 823 * @align: alignment of area (max PAGE_SIZE)
822 * 824 *
823 * Allocate percpu area of @size bytes aligned at @align. Might 825 * Allocate zero-filled percpu area of @size bytes aligned at @align.
824 * sleep. Might trigger writeouts. 826 * Might sleep. Might trigger writeouts.
825 * 827 *
826 * CONTEXT: 828 * CONTEXT:
827 * Does GFP_KERNEL allocation. 829 * Does GFP_KERNEL allocation.
@@ -840,9 +842,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
840 * @size: size of area to allocate in bytes 842 * @size: size of area to allocate in bytes
841 * @align: alignment of area (max PAGE_SIZE) 843 * @align: alignment of area (max PAGE_SIZE)
842 * 844 *
843 * Allocate percpu area of @size bytes aligned at @align from reserved 845 * Allocate zero-filled percpu area of @size bytes aligned at @align
844 * percpu area if arch has set it up; otherwise, allocation is served 846 * from reserved percpu area if arch has set it up; otherwise,
845 * from the same dynamic area. Might sleep. Might trigger writeouts. 847 * allocation is served from the same dynamic area. Might sleep.
848 * Might trigger writeouts.
846 * 849 *
847 * CONTEXT: 850 * CONTEXT:
848 * Does GFP_KERNEL allocation. 851 * Does GFP_KERNEL allocation.
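
The updated comments make the zero-fill guarantee of dynamic percpu allocations explicit. A minimal usage sketch that relies on it; the counter type and error handling are illustrative:

    static int counters_example(void)
    {
        unsigned long __percpu *counters;

        counters = alloc_percpu(unsigned long); /* returned zero-filled */
        if (!counters)
            return -ENOMEM;

        this_cpu_inc(*counters);        /* no explicit initialisation needed */

        free_percpu(counters);
        return 0;
    }
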
@@ -949,6 +952,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
949 */ 952 */
950bool is_kernel_percpu_address(unsigned long addr) 953bool is_kernel_percpu_address(unsigned long addr)
951{ 954{
955#ifdef CONFIG_SMP
952 const size_t static_size = __per_cpu_end - __per_cpu_start; 956 const size_t static_size = __per_cpu_end - __per_cpu_start;
953 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); 957 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
954 unsigned int cpu; 958 unsigned int cpu;
@@ -959,6 +963,8 @@ bool is_kernel_percpu_address(unsigned long addr)
959 if ((void *)addr >= start && (void *)addr < start + static_size) 963 if ((void *)addr >= start && (void *)addr < start + static_size)
960 return true; 964 return true;
961 } 965 }
966#endif
967 /* on UP, can't distinguish from other static vars, always false */
962 return false; 968 return false;
963} 969}
964 970
@@ -1002,8 +1008,7 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
1002 } 1008 }
1003 1009
1004 if (in_first_chunk) { 1010 if (in_first_chunk) {
1005 if ((unsigned long)addr < VMALLOC_START || 1011 if (!is_vmalloc_addr(addr))
1006 (unsigned long)addr >= VMALLOC_END)
1007 return __pa(addr); 1012 return __pa(addr);
1008 else 1013 else
1009 return page_to_phys(vmalloc_to_page(addr)); 1014 return page_to_phys(vmalloc_to_page(addr));
@@ -1067,161 +1072,6 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1067} 1072}
1068 1073
1069/** 1074/**
1070 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1071 * @reserved_size: the size of reserved percpu area in bytes
1072 * @dyn_size: minimum free size for dynamic allocation in bytes
1073 * @atom_size: allocation atom size
1074 * @cpu_distance_fn: callback to determine distance between cpus, optional
1075 *
1076 * This function determines grouping of units, their mappings to cpus
1077 * and other parameters considering needed percpu size, allocation
1078 * atom size and distances between CPUs.
1079 *
1080 * Groups are always mutliples of atom size and CPUs which are of
1081 * LOCAL_DISTANCE both ways are grouped together and share space for
1082 * units in the same group. The returned configuration is guaranteed
1083 * to have CPUs on different nodes on different groups and >=75% usage
1084 * of allocated virtual address space.
1085 *
1086 * RETURNS:
1087 * On success, pointer to the new allocation_info is returned. On
1088 * failure, ERR_PTR value is returned.
1089 */
1090static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1091 size_t reserved_size, size_t dyn_size,
1092 size_t atom_size,
1093 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1094{
1095 static int group_map[NR_CPUS] __initdata;
1096 static int group_cnt[NR_CPUS] __initdata;
1097 const size_t static_size = __per_cpu_end - __per_cpu_start;
1098 int nr_groups = 1, nr_units = 0;
1099 size_t size_sum, min_unit_size, alloc_size;
1100 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1101 int last_allocs, group, unit;
1102 unsigned int cpu, tcpu;
1103 struct pcpu_alloc_info *ai;
1104 unsigned int *cpu_map;
1105
1106 /* this function may be called multiple times */
1107 memset(group_map, 0, sizeof(group_map));
1108 memset(group_cnt, 0, sizeof(group_cnt));
1109
1110 /* calculate size_sum and ensure dyn_size is enough for early alloc */
1111 size_sum = PFN_ALIGN(static_size + reserved_size +
1112 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1113 dyn_size = size_sum - static_size - reserved_size;
1114
1115 /*
1116 * Determine min_unit_size, alloc_size and max_upa such that
1117 * alloc_size is multiple of atom_size and is the smallest
1118 * which can accomodate 4k aligned segments which are equal to
1119 * or larger than min_unit_size.
1120 */
1121 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1122
1123 alloc_size = roundup(min_unit_size, atom_size);
1124 upa = alloc_size / min_unit_size;
1125 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1126 upa--;
1127 max_upa = upa;
1128
1129 /* group cpus according to their proximity */
1130 for_each_possible_cpu(cpu) {
1131 group = 0;
1132 next_group:
1133 for_each_possible_cpu(tcpu) {
1134 if (cpu == tcpu)
1135 break;
1136 if (group_map[tcpu] == group && cpu_distance_fn &&
1137 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1138 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1139 group++;
1140 nr_groups = max(nr_groups, group + 1);
1141 goto next_group;
1142 }
1143 }
1144 group_map[cpu] = group;
1145 group_cnt[group]++;
1146 }
1147
1148 /*
1149 * Expand unit size until address space usage goes over 75%
1150 * and then as much as possible without using more address
1151 * space.
1152 */
1153 last_allocs = INT_MAX;
1154 for (upa = max_upa; upa; upa--) {
1155 int allocs = 0, wasted = 0;
1156
1157 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1158 continue;
1159
1160 for (group = 0; group < nr_groups; group++) {
1161 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1162 allocs += this_allocs;
1163 wasted += this_allocs * upa - group_cnt[group];
1164 }
1165
1166 /*
1167 * Don't accept if wastage is over 1/3. The
1168 * greater-than comparison ensures upa==1 always
1169 * passes the following check.
1170 */
1171 if (wasted > num_possible_cpus() / 3)
1172 continue;
1173
1174 /* and then don't consume more memory */
1175 if (allocs > last_allocs)
1176 break;
1177 last_allocs = allocs;
1178 best_upa = upa;
1179 }
1180 upa = best_upa;
1181
1182 /* allocate and fill alloc_info */
1183 for (group = 0; group < nr_groups; group++)
1184 nr_units += roundup(group_cnt[group], upa);
1185
1186 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1187 if (!ai)
1188 return ERR_PTR(-ENOMEM);
1189 cpu_map = ai->groups[0].cpu_map;
1190
1191 for (group = 0; group < nr_groups; group++) {
1192 ai->groups[group].cpu_map = cpu_map;
1193 cpu_map += roundup(group_cnt[group], upa);
1194 }
1195
1196 ai->static_size = static_size;
1197 ai->reserved_size = reserved_size;
1198 ai->dyn_size = dyn_size;
1199 ai->unit_size = alloc_size / upa;
1200 ai->atom_size = atom_size;
1201 ai->alloc_size = alloc_size;
1202
1203 for (group = 0, unit = 0; group_cnt[group]; group++) {
1204 struct pcpu_group_info *gi = &ai->groups[group];
1205
1206 /*
1207 * Initialize base_offset as if all groups are located
1208 * back-to-back. The caller should update this to
1209 * reflect actual allocation.
1210 */
1211 gi->base_offset = unit * ai->unit_size;
1212
1213 for_each_possible_cpu(cpu)
1214 if (group_map[cpu] == group)
1215 gi->cpu_map[gi->nr_units++] = cpu;
1216 gi->nr_units = roundup(gi->nr_units, upa);
1217 unit += gi->nr_units;
1218 }
1219 BUG_ON(unit != nr_units);
1220
1221 return ai;
1222}
1223
1224/**
1225 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info 1075 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1226 * @lvl: loglevel 1076 * @lvl: loglevel
1227 * @ai: allocation info to dump 1077 * @ai: allocation info to dump
@@ -1363,8 +1213,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1363 1213
1364 /* sanity checks */ 1214 /* sanity checks */
1365 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); 1215 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1216#ifdef CONFIG_SMP
1366 PCPU_SETUP_BUG_ON(!ai->static_size); 1217 PCPU_SETUP_BUG_ON(!ai->static_size);
1218 PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
1219#endif
1367 PCPU_SETUP_BUG_ON(!base_addr); 1220 PCPU_SETUP_BUG_ON(!base_addr);
1221 PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
1368 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); 1222 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1369 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); 1223 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1370 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); 1224 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
@@ -1411,7 +1265,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1411 1265
1412 /* we're done parsing the input, undefine BUG macro and dump config */ 1266 /* we're done parsing the input, undefine BUG macro and dump config */
1413#undef PCPU_SETUP_BUG_ON 1267#undef PCPU_SETUP_BUG_ON
1414 pcpu_dump_alloc_info(KERN_INFO, ai); 1268 pcpu_dump_alloc_info(KERN_DEBUG, ai);
1415 1269
1416 pcpu_nr_groups = ai->nr_groups; 1270 pcpu_nr_groups = ai->nr_groups;
1417 pcpu_group_offsets = group_offsets; 1271 pcpu_group_offsets = group_offsets;
@@ -1488,6 +1342,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1488 return 0; 1342 return 0;
1489} 1343}
1490 1344
1345#ifdef CONFIG_SMP
1346
1491const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { 1347const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
1492 [PCPU_FC_AUTO] = "auto", 1348 [PCPU_FC_AUTO] = "auto",
1493 [PCPU_FC_EMBED] = "embed", 1349 [PCPU_FC_EMBED] = "embed",
@@ -1515,8 +1371,180 @@ static int __init percpu_alloc_setup(char *str)
1515} 1371}
1516early_param("percpu_alloc", percpu_alloc_setup); 1372early_param("percpu_alloc", percpu_alloc_setup);
1517 1373
1374/*
1375 * pcpu_embed_first_chunk() is used by the generic percpu setup.
1376 * Build it if needed by the arch config or if the generic setup is
1377 * going to be used.
1378 */
1518#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ 1379#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1519 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) 1380 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1381#define BUILD_EMBED_FIRST_CHUNK
1382#endif
1383
1384/* build pcpu_page_first_chunk() iff needed by the arch config */
1385#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
1386#define BUILD_PAGE_FIRST_CHUNK
1387#endif
1388
1389/* pcpu_build_alloc_info() is used by both embed and page first chunk */
1390#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
1391/**
1392 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1393 * @reserved_size: the size of reserved percpu area in bytes
1394 * @dyn_size: minimum free size for dynamic allocation in bytes
1395 * @atom_size: allocation atom size
1396 * @cpu_distance_fn: callback to determine distance between cpus, optional
1397 *
1398 * This function determines grouping of units, their mappings to cpus
1399 * and other parameters considering needed percpu size, allocation
1400 * atom size and distances between CPUs.
1401 *
1402 * Groups are always multiples of atom size and CPUs which are of
1403 * LOCAL_DISTANCE both ways are grouped together and share space for
1404 * units in the same group. The returned configuration is guaranteed
1405 * to have CPUs on different nodes in different groups and >=75% usage
1406 * of allocated virtual address space.
1407 *
1408 * RETURNS:
1409 * On success, pointer to the new allocation_info is returned. On
1410 * failure, ERR_PTR value is returned.
1411 */
1412static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1413 size_t reserved_size, size_t dyn_size,
1414 size_t atom_size,
1415 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1416{
1417 static int group_map[NR_CPUS] __initdata;
1418 static int group_cnt[NR_CPUS] __initdata;
1419 const size_t static_size = __per_cpu_end - __per_cpu_start;
1420 int nr_groups = 1, nr_units = 0;
1421 size_t size_sum, min_unit_size, alloc_size;
1422 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1423 int last_allocs, group, unit;
1424 unsigned int cpu, tcpu;
1425 struct pcpu_alloc_info *ai;
1426 unsigned int *cpu_map;
1427
1428 /* this function may be called multiple times */
1429 memset(group_map, 0, sizeof(group_map));
1430 memset(group_cnt, 0, sizeof(group_cnt));
1431
1432 /* calculate size_sum and ensure dyn_size is enough for early alloc */
1433 size_sum = PFN_ALIGN(static_size + reserved_size +
1434 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1435 dyn_size = size_sum - static_size - reserved_size;
1436
1437 /*
1438 * Determine min_unit_size, alloc_size and max_upa such that
1439 * alloc_size is multiple of atom_size and is the smallest
1440 * which can accommodate 4k aligned segments which are equal to
1441 * or larger than min_unit_size.
1442 */
1443 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1444
1445 alloc_size = roundup(min_unit_size, atom_size);
1446 upa = alloc_size / min_unit_size;
1447 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1448 upa--;
1449 max_upa = upa;
1450
1451 /* group cpus according to their proximity */
1452 for_each_possible_cpu(cpu) {
1453 group = 0;
1454 next_group:
1455 for_each_possible_cpu(tcpu) {
1456 if (cpu == tcpu)
1457 break;
1458 if (group_map[tcpu] == group && cpu_distance_fn &&
1459 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1460 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1461 group++;
1462 nr_groups = max(nr_groups, group + 1);
1463 goto next_group;
1464 }
1465 }
1466 group_map[cpu] = group;
1467 group_cnt[group]++;
1468 }
1469
1470 /*
1471 * Expand unit size until address space usage goes over 75%
1472 * and then as much as possible without using more address
1473 * space.
1474 */
1475 last_allocs = INT_MAX;
1476 for (upa = max_upa; upa; upa--) {
1477 int allocs = 0, wasted = 0;
1478
1479 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1480 continue;
1481
1482 for (group = 0; group < nr_groups; group++) {
1483 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1484 allocs += this_allocs;
1485 wasted += this_allocs * upa - group_cnt[group];
1486 }
1487
1488 /*
1489 * Don't accept if wastage is over 1/3. The
1490 * greater-than comparison ensures upa==1 always
1491 * passes the following check.
1492 */
1493 if (wasted > num_possible_cpus() / 3)
1494 continue;
1495
1496 /* and then don't consume more memory */
1497 if (allocs > last_allocs)
1498 break;
1499 last_allocs = allocs;
1500 best_upa = upa;
1501 }
1502 upa = best_upa;
1503
1504 /* allocate and fill alloc_info */
1505 for (group = 0; group < nr_groups; group++)
1506 nr_units += roundup(group_cnt[group], upa);
1507
1508 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1509 if (!ai)
1510 return ERR_PTR(-ENOMEM);
1511 cpu_map = ai->groups[0].cpu_map;
1512
1513 for (group = 0; group < nr_groups; group++) {
1514 ai->groups[group].cpu_map = cpu_map;
1515 cpu_map += roundup(group_cnt[group], upa);
1516 }
1517
1518 ai->static_size = static_size;
1519 ai->reserved_size = reserved_size;
1520 ai->dyn_size = dyn_size;
1521 ai->unit_size = alloc_size / upa;
1522 ai->atom_size = atom_size;
1523 ai->alloc_size = alloc_size;
1524
1525 for (group = 0, unit = 0; group_cnt[group]; group++) {
1526 struct pcpu_group_info *gi = &ai->groups[group];
1527
1528 /*
1529 * Initialize base_offset as if all groups are located
1530 * back-to-back. The caller should update this to
1531 * reflect actual allocation.
1532 */
1533 gi->base_offset = unit * ai->unit_size;
1534
1535 for_each_possible_cpu(cpu)
1536 if (group_map[cpu] == group)
1537 gi->cpu_map[gi->nr_units++] = cpu;
1538 gi->nr_units = roundup(gi->nr_units, upa);
1539 unit += gi->nr_units;
1540 }
1541 BUG_ON(unit != nr_units);
1542
1543 return ai;
1544}
1545#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
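Before sizing units, pcpu_build_alloc_info() partitions the possible CPUs so that only CPUs within LOCAL_DISTANCE of each other (both directions) share a group. A rough user-space model of that grouping pass follows; the 4x4 distance table and the cpu_distance() helper are invented example data, not real topology.

/*
 * User-space sketch of the CPU grouping step in pcpu_build_alloc_info():
 * CPUs whose pairwise distance stays at LOCAL_DISTANCE land in the same
 * group, anything further away starts or joins another group.  The
 * distance table is an invented two-node example and cpu_distance()
 * merely stands in for cpu_distance_fn.
 */
#include <stdio.h>

#define NR_CPUS        4
#define LOCAL_DISTANCE 10

static const int distance[NR_CPUS][NR_CPUS] = {
	{ 10, 10, 20, 20 },
	{ 10, 10, 20, 20 },
	{ 20, 20, 10, 10 },
	{ 20, 20, 10, 10 },
};

static int cpu_distance(int a, int b) { return distance[a][b]; }

int main(void)
{
	int group_map[NR_CPUS] = { 0 };
	int group_cnt[NR_CPUS] = { 0 };
	int nr_groups = 1, cpu, tcpu, group;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		group = 0;
	next_group:
		for (tcpu = 0; tcpu < NR_CPUS; tcpu++) {
			if (cpu == tcpu)
				break;
			/* an earlier CPU already sits in this group but is remote */
			if (group_map[tcpu] == group &&
			    (cpu_distance(cpu, tcpu) > LOCAL_DISTANCE ||
			     cpu_distance(tcpu, cpu) > LOCAL_DISTANCE)) {
				group++;
				if (group + 1 > nr_groups)
					nr_groups = group + 1;
				goto next_group;
			}
		}
		group_map[cpu] = group;
		group_cnt[group]++;
	}

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d -> group %d\n", cpu, group_map[cpu]);
	printf("nr_groups=%d (sizes: %d, %d)\n",
	       nr_groups, group_cnt[0], group_cnt[1]);
	return 0;
}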
1546
1547#if defined(BUILD_EMBED_FIRST_CHUNK)
1520/** 1548/**
1521 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1549 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1522 * @reserved_size: the size of reserved percpu area in bytes 1550 * @reserved_size: the size of reserved percpu area in bytes
@@ -1524,7 +1552,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
1524 * @atom_size: allocation atom size 1552 * @atom_size: allocation atom size
1525 * @cpu_distance_fn: callback to determine distance between cpus, optional 1553 * @cpu_distance_fn: callback to determine distance between cpus, optional
1526 * @alloc_fn: function to allocate percpu page 1554 * @alloc_fn: function to allocate percpu page
1527 * @free_fn: funtion to free percpu page 1555 * @free_fn: function to free percpu page
1528 * 1556 *
1529 * This is a helper to ease setting up embedded first percpu chunk and 1557 * This is a helper to ease setting up embedded first percpu chunk and
1530 * can be called where pcpu_setup_first_chunk() is expected. 1558 * can be called where pcpu_setup_first_chunk() is expected.
@@ -1619,8 +1647,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1619 /* warn if maximum distance is further than 75% of vmalloc space */ 1647 /* warn if maximum distance is further than 75% of vmalloc space */
1620 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { 1648 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
1621 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " 1649 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
1622 "space 0x%lx\n", 1650 "space 0x%lx\n", max_distance,
1623 max_distance, VMALLOC_END - VMALLOC_START); 1651 (unsigned long)(VMALLOC_END - VMALLOC_START));
1624#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1652#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1625 /* and fail if we have fallback */ 1653 /* and fail if we have fallback */
1626 rc = -EINVAL; 1654 rc = -EINVAL;
@@ -1645,15 +1673,14 @@ out_free:
1645 free_bootmem(__pa(areas), areas_size); 1673 free_bootmem(__pa(areas), areas_size);
1646 return rc; 1674 return rc;
1647} 1675}
1648#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || 1676#endif /* BUILD_EMBED_FIRST_CHUNK */
1649 !CONFIG_HAVE_SETUP_PER_CPU_AREA */
1650 1677
1651#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1678#ifdef BUILD_PAGE_FIRST_CHUNK
1652/** 1679/**
1653 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages 1680 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
1654 * @reserved_size: the size of reserved percpu area in bytes 1681 * @reserved_size: the size of reserved percpu area in bytes
1655 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE 1682 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
1656 * @free_fn: funtion to free percpu page, always called with PAGE_SIZE 1683 * @free_fn: function to free percpu page, always called with PAGE_SIZE
1657 * @populate_pte_fn: function to populate pte 1684 * @populate_pte_fn: function to populate pte
1658 * 1685 *
1659 * This is a helper to ease setting up page-remapped first percpu 1686 * This is a helper to ease setting up page-remapped first percpu
@@ -1756,10 +1783,11 @@ out_free_ar:
1756 pcpu_free_alloc_info(ai); 1783 pcpu_free_alloc_info(ai);
1757 return rc; 1784 return rc;
1758} 1785}
1759#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ 1786#endif /* BUILD_PAGE_FIRST_CHUNK */
1760 1787
1788#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1761/* 1789/*
1762 * Generic percpu area setup. 1790 * Generic SMP percpu area setup.
1763 * 1791 *
1764 * The embedding helper is used because its behavior closely resembles 1792 * The embedding helper is used because its behavior closely resembles
1765 * the original non-dynamic generic percpu area setup. This is 1793 * the original non-dynamic generic percpu area setup. This is
@@ -1770,7 +1798,6 @@ out_free_ar:
1770 * on the physical linear memory mapping which uses large page 1798 * on the physical linear memory mapping which uses large page
1771 * mappings on applicable archs. 1799 * mappings on applicable archs.
1772 */ 1800 */
1773#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1774unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 1801unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
1775EXPORT_SYMBOL(__per_cpu_offset); 1802EXPORT_SYMBOL(__per_cpu_offset);
1776 1803
@@ -1799,13 +1826,48 @@ void __init setup_per_cpu_areas(void)
1799 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, 1826 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
1800 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); 1827 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
1801 if (rc < 0) 1828 if (rc < 0)
1802 panic("Failed to initialized percpu areas."); 1829 panic("Failed to initialize percpu areas.");
1803 1830
1804 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 1831 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
1805 for_each_possible_cpu(cpu) 1832 for_each_possible_cpu(cpu)
1806 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; 1833 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
1807} 1834}
1808#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ 1835#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
1836
1837#else /* CONFIG_SMP */
1838
1839/*
1840 * UP percpu area setup.
1841 *
1842 * UP always uses km-based percpu allocator with identity mapping.
1843 * Static percpu variables are indistinguishable from the usual static
1844 * variables and don't require any special preparation.
1845 */
1846void __init setup_per_cpu_areas(void)
1847{
1848 const size_t unit_size =
1849 roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
1850 PERCPU_DYNAMIC_RESERVE));
1851 struct pcpu_alloc_info *ai;
1852 void *fc;
1853
1854 ai = pcpu_alloc_alloc_info(1, 1);
1855 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
1856 if (!ai || !fc)
1857 panic("Failed to allocate memory for percpu areas.");
1858
1859 ai->dyn_size = unit_size;
1860 ai->unit_size = unit_size;
1861 ai->atom_size = unit_size;
1862 ai->alloc_size = unit_size;
1863 ai->groups[0].nr_units = 1;
1864 ai->groups[0].cpu_map[0] = 0;
1865
1866 if (pcpu_setup_first_chunk(ai, fc) < 0)
1867 panic("Failed to initialize percpu areas.");
1868}
1869
1870#endif /* CONFIG_SMP */
1809 1871
1810/* 1872/*
1811 * First and reserved chunks are initialized with temporary allocation 1873 * First and reserved chunks are initialized with temporary allocation
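Whether the SMP path or the new UP setup_per_cpu_areas() above runs, the resulting addressing model is the same: a per-cpu variable lives at its base address plus a per-CPU offset, which on UP collapses to a single zero-offset unit. The sketch below models that scheme in user space; NR_CPUS_DEMO, UNIT_SIZE and the per_cpu_counter() accessor are invented for the example and do not reflect the kernel implementation.

/*
 * User-space model of the addressing scheme set up by
 * setup_per_cpu_areas(): one unit per possible CPU carved out of a
 * single first-chunk allocation, and a per-CPU offset added to a
 * variable's base address.  All names and sizes here are invented; on
 * UP the scheme degenerates to one unit with offset 0, which is why the
 * dedicated UP allocator could be dropped.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS_DEMO 4
#define UNIT_SIZE    4096

static char *base;                       /* first chunk */
static long unit_off[NR_CPUS_DEMO];      /* pcpu_unit_offsets[] stand-in */

/* "static per-cpu variable": a fixed offset inside each unit */
static const size_t counter_off = 128;

static long *per_cpu_counter(int cpu)
{
	return (long *)(base + unit_off[cpu] + counter_off);
}

int main(void)
{
	int cpu;

	base = calloc(NR_CPUS_DEMO, UNIT_SIZE);
	if (!base)
		return 1;
	for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		unit_off[cpu] = (long)cpu * UNIT_SIZE;   /* back-to-back units */

	for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		*per_cpu_counter(cpu) = 100 + cpu;

	for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		printf("cpu%d counter=%ld\n", cpu, *per_cpu_counter(cpu));
	free(base);
	return 0;
}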
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
deleted file mode 100644
index db884fae5721..000000000000
--- a/mm/percpu_up.c
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * mm/percpu_up.c - dummy percpu memory allocator implementation for UP
3 */
4
5#include <linux/module.h>
6#include <linux/percpu.h>
7#include <linux/slab.h>
8
9void __percpu *__alloc_percpu(size_t size, size_t align)
10{
11 /*
12 * Can't easily make larger alignment work with kmalloc. WARN
13 * on it. Larger alignment should only be used for module
14 * percpu sections on SMP for which this path isn't used.
15 */
16 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
17 return (void __percpu __force *)kzalloc(size, GFP_KERNEL);
18}
19EXPORT_SYMBOL_GPL(__alloc_percpu);
20
21void free_percpu(void __percpu *p)
22{
23 kfree(this_cpu_ptr(p));
24}
25EXPORT_SYMBOL_GPL(free_percpu);
26
27phys_addr_t per_cpu_ptr_to_phys(void *addr)
28{
29 return __pa(addr);
30}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 000000000000..eb663fb533e0
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,121 @@
1/*
2 * mm/pgtable-generic.c
3 *
4 * Generic pgtable methods declared in asm-generic/pgtable.h
5 *
6 * Copyright (C) 2010 Linus Torvalds
7 */
8
9#include <linux/pagemap.h>
10#include <asm/tlb.h>
11#include <asm-generic/pgtable.h>
12
13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
14/*
15 * Only sets the access flags (dirty, accessed, and
16 * writable). Furthermore, we know it always gets set to a "more
17 * permissive" setting, which allows most architectures to optimize
18 * this. We return whether the PTE actually changed, which in turn
19 * instructs the caller to do things like update_mmu_cache. This
20 * used to be done in the caller, but sparc needs minor faults to
21 * force that call on sun4c so we changed this macro slightly
22 */
23int ptep_set_access_flags(struct vm_area_struct *vma,
24 unsigned long address, pte_t *ptep,
25 pte_t entry, int dirty)
26{
27 int changed = !pte_same(*ptep, entry);
28 if (changed) {
29 set_pte_at(vma->vm_mm, address, ptep, entry);
30 flush_tlb_page(vma, address);
31 }
32 return changed;
33}
34#endif
35
36#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
37int pmdp_set_access_flags(struct vm_area_struct *vma,
38 unsigned long address, pmd_t *pmdp,
39 pmd_t entry, int dirty)
40{
41#ifdef CONFIG_TRANSPARENT_HUGEPAGE
42 int changed = !pmd_same(*pmdp, entry);
43 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
44 if (changed) {
45 set_pmd_at(vma->vm_mm, address, pmdp, entry);
46 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
47 }
48 return changed;
49#else /* CONFIG_TRANSPARENT_HUGEPAGE */
50 BUG();
51 return 0;
52#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
53}
54#endif
55
56#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
57int ptep_clear_flush_young(struct vm_area_struct *vma,
58 unsigned long address, pte_t *ptep)
59{
60 int young;
61 young = ptep_test_and_clear_young(vma, address, ptep);
62 if (young)
63 flush_tlb_page(vma, address);
64 return young;
65}
66#endif
67
68#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
69int pmdp_clear_flush_young(struct vm_area_struct *vma,
70 unsigned long address, pmd_t *pmdp)
71{
72 int young;
73#ifndef CONFIG_TRANSPARENT_HUGEPAGE
74 BUG();
75#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
76 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
77 young = pmdp_test_and_clear_young(vma, address, pmdp);
78 if (young)
79 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
80 return young;
81}
82#endif
83
84#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
85pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
86 pte_t *ptep)
87{
88 pte_t pte;
89 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
90 flush_tlb_page(vma, address);
91 return pte;
92}
93#endif
94
95#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
96#ifdef CONFIG_TRANSPARENT_HUGEPAGE
97pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
98 pmd_t *pmdp)
99{
100 pmd_t pmd;
101 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
102 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
103 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
104 return pmd;
105}
106#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
107#endif
108
109#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
110#ifdef CONFIG_TRANSPARENT_HUGEPAGE
111pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
112 pmd_t *pmdp)
113{
114 pmd_t pmd = pmd_mksplitting(*pmdp);
115 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
116 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
117 /* tlb flush only to serialize against gup-fast */
118 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
119}
120#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
121#endif
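The new file above supplies generic page-table helpers that are compiled out whenever an architecture defines the matching __HAVE_ARCH_* macro and provides its own version. A minimal user-space sketch of that override pattern follows; ARCH_FAST, HAVE_ARCH_FLUSH_ONE and flush_one() are made-up names used only to show the preprocessor structure (build with and without -DARCH_FAST to exercise both paths).

/*
 * Sketch of the override pattern used by mm/pgtable-generic.c: a generic
 * helper is compiled only when the "architecture" has not announced its
 * own via a HAVE_ARCH-style macro.  All names here are invented for
 * illustration.
 */
#include <stdio.h>

#ifdef ARCH_FAST
#define HAVE_ARCH_FLUSH_ONE
static void flush_one(unsigned long addr)
{
	printf("arch-specific flush of %#lx\n", addr);
}
#endif

#ifndef HAVE_ARCH_FLUSH_ONE
/* fallback used by every arch that did not provide its own */
static void flush_one(unsigned long addr)
{
	printf("generic flush of %#lx\n", addr);
}
#endif

int main(void)
{
	flush_one(0x1000UL);
	return 0;
}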
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
index 603ae98d9694..799dcfd7cd8c 100644
--- a/mm/prio_tree.c
+++ b/mm/prio_tree.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/prio_tree.h> 15#include <linux/prio_tree.h>
16#include <linux/prefetch.h>
16 17
17/* 18/*
18 * See lib/prio_tree.c for details on the general radix priority search tree 19 * See lib/prio_tree.c for details on the general radix priority search tree
diff --git a/mm/readahead.c b/mm/readahead.c
index 77506a291a2d..867f9dd82dcd 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages);
109static int read_pages(struct address_space *mapping, struct file *filp, 109static int read_pages(struct address_space *mapping, struct file *filp,
110 struct list_head *pages, unsigned nr_pages) 110 struct list_head *pages, unsigned nr_pages)
111{ 111{
112 struct blk_plug plug;
112 unsigned page_idx; 113 unsigned page_idx;
113 int ret; 114 int ret;
114 115
116 blk_start_plug(&plug);
117
115 if (mapping->a_ops->readpages) { 118 if (mapping->a_ops->readpages) {
116 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 119 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
117 /* Clean up the remaining pages */ 120 /* Clean up the remaining pages */
@@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp,
129 page_cache_release(page); 132 page_cache_release(page);
130 } 133 }
131 ret = 0; 134 ret = 0;
135
132out: 136out:
137 blk_finish_plug(&plug);
138
133 return ret; 139 return ret;
134} 140}
135 141
@@ -174,7 +180,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
174 if (page) 180 if (page)
175 continue; 181 continue;
176 182
177 page = page_cache_alloc_cold(mapping); 183 page = page_cache_alloc_readahead(mapping);
178 if (!page) 184 if (!page)
179 break; 185 break;
180 page->index = page_offset; 186 page->index = page_offset;
@@ -554,17 +560,5 @@ page_cache_async_readahead(struct address_space *mapping,
554 560
555 /* do read-ahead */ 561 /* do read-ahead */
556 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 562 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
557
558#ifdef CONFIG_BLOCK
559 /*
560 * Normally the current page is !uptodate and lock_page() will be
561 * immediately called to implicitly unplug the device. However this
562 * is not always true for RAID configurations, where data arrives
563 * not strictly in their submission order. In this case we need to
564 * explicitly kick off the IO.
565 */
566 if (PageUptodate(page))
567 blk_run_backing_dev(mapping->backing_dev_info, NULL);
568#endif
569} 563}
570EXPORT_SYMBOL_GPL(page_cache_async_readahead); 564EXPORT_SYMBOL_GPL(page_cache_async_readahead);
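With the explicit blk_run_backing_dev() kick removed, read_pages() above relies on plugging instead: requests issued between blk_start_plug() and blk_finish_plug() are batched and dispatched together. The user-space sketch below models only that collect-then-dispatch idea; the demo_plug structure, queue_io() and submit_io() are invented stand-ins, not block-layer API.

/*
 * User-space model of the plugging idea behind the readahead change:
 * work issued inside a plug scope is only queued, and the whole batch is
 * dispatched when the plug is finished.  Everything here is an invented
 * stand-in for the real plug machinery.
 */
#include <stdio.h>

#define MAX_BATCH 16

struct demo_plug {
	int nr;
	int reqs[MAX_BATCH];
};

static void plug_start(struct demo_plug *p) { p->nr = 0; }

static void submit_io(int sector)
{
	printf("dispatching I/O for sector %d\n", sector);
}

static void plug_finish(struct demo_plug *p)
{
	int i;

	/* one dispatch pass for everything queued while plugged */
	for (i = 0; i < p->nr; i++)
		submit_io(p->reqs[i]);
	p->nr = 0;
}

static void queue_io(struct demo_plug *p, int sector)
{
	if (p->nr < MAX_BATCH)
		p->reqs[p->nr++] = sector;
	else
		submit_io(sector);      /* batch full: dispatch directly */
}

int main(void)
{
	struct demo_plug plug;
	int i;

	plug_start(&plug);
	for (i = 0; i < 8; i++)         /* "readpage" for 8 pages */
		queue_io(&plug, 1000 + i);
	plug_finish(&plug);             /* single dispatch for the batch */
	return 0;
}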
diff --git a/mm/rmap.c b/mm/rmap.c
index 92e6757f196e..23295f65ae43 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,22 +24,22 @@
24 * inode->i_alloc_sem (vmtruncate_range) 24 * inode->i_alloc_sem (vmtruncate_range)
25 * mm->mmap_sem 25 * mm->mmap_sem
26 * page->flags PG_locked (lock_page) 26 * page->flags PG_locked (lock_page)
27 * mapping->i_mmap_lock 27 * mapping->i_mmap_mutex
28 * anon_vma->lock 28 * anon_vma->mutex
29 * mm->page_table_lock or pte_lock 29 * mm->page_table_lock or pte_lock
30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
31 * swap_lock (in swap_duplicate, swap_info_get) 31 * swap_lock (in swap_duplicate, swap_info_get)
32 * mmlist_lock (in mmput, drain_mmlist and others) 32 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->private_lock (in __set_page_dirty_buffers) 33 * mapping->private_lock (in __set_page_dirty_buffers)
34 * inode_lock (in set_page_dirty's __mark_inode_dirty) 34 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
35 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
35 * sb_lock (within inode_lock in fs/fs-writeback.c) 36 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 * mapping->tree_lock (widely used, in set_page_dirty, 37 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 38 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 39 * within inode_wb_list_lock in __sync_single_inode)
39 * 40 *
40 * (code doesn't rely on that order so it could be switched around) 41 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
41 * ->tasklist_lock 42 * ->tasklist_lock
42 * anon_vma->lock (memory_failure, collect_procs_anon)
43 * pte map lock 43 * pte map lock
44 */ 44 */
45 45
@@ -67,20 +67,56 @@ static struct kmem_cache *anon_vma_chain_cachep;
67 67
68static inline struct anon_vma *anon_vma_alloc(void) 68static inline struct anon_vma *anon_vma_alloc(void)
69{ 69{
70 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 70 struct anon_vma *anon_vma;
71
72 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
73 if (anon_vma) {
74 atomic_set(&anon_vma->refcount, 1);
75 /*
76 * Initialise the anon_vma root to point to itself. If called
77 * from fork, the root will be reset to the parent's anon_vma.
78 */
79 anon_vma->root = anon_vma;
80 }
81
82 return anon_vma;
71} 83}
72 84
73void anon_vma_free(struct anon_vma *anon_vma) 85static inline void anon_vma_free(struct anon_vma *anon_vma)
74{ 86{
87 VM_BUG_ON(atomic_read(&anon_vma->refcount));
88
89 /*
90 * Synchronize against page_lock_anon_vma() such that
91 * we can safely hold the lock without the anon_vma getting
92 * freed.
93 *
94 * Relies on the full mb implied by the atomic_dec_and_test() from
95 * put_anon_vma() against the acquire barrier implied by
96 * mutex_trylock() from page_lock_anon_vma(). This orders:
97 *
98 * page_lock_anon_vma() VS put_anon_vma()
99 * mutex_trylock() atomic_dec_and_test()
100 * LOCK MB
101 * atomic_read() mutex_is_locked()
102 *
103 * LOCK should suffice since the actual taking of the lock must
104 * happen _before_ what follows.
105 */
106 if (mutex_is_locked(&anon_vma->root->mutex)) {
107 anon_vma_lock(anon_vma);
108 anon_vma_unlock(anon_vma);
109 }
110
75 kmem_cache_free(anon_vma_cachep, anon_vma); 111 kmem_cache_free(anon_vma_cachep, anon_vma);
76} 112}
77 113
78static inline struct anon_vma_chain *anon_vma_chain_alloc(void) 114static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
79{ 115{
80 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); 116 return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
81} 117}
82 118
83void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 119static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
84{ 120{
85 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 121 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
86} 122}
@@ -94,7 +130,7 @@ void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
94 * anonymous pages mapped into it with that anon_vma. 130 * anonymous pages mapped into it with that anon_vma.
95 * 131 *
96 * The common case will be that we already have one, but if 132 * The common case will be that we already have one, but if
97 * if not we either need to find an adjacent mapping that we 133 * not we either need to find an adjacent mapping that we
98 * can re-use the anon_vma from (very common when the only 134 * can re-use the anon_vma from (very common when the only
99 * reason for splitting a vma has been mprotect()), or we 135 * reason for splitting a vma has been mprotect()), or we
100 * allocate a new one. 136 * allocate a new one.
@@ -122,7 +158,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
122 struct mm_struct *mm = vma->vm_mm; 158 struct mm_struct *mm = vma->vm_mm;
123 struct anon_vma *allocated; 159 struct anon_vma *allocated;
124 160
125 avc = anon_vma_chain_alloc(); 161 avc = anon_vma_chain_alloc(GFP_KERNEL);
126 if (!avc) 162 if (!avc)
127 goto out_enomem; 163 goto out_enomem;
128 164
@@ -133,11 +169,6 @@ int anon_vma_prepare(struct vm_area_struct *vma)
133 if (unlikely(!anon_vma)) 169 if (unlikely(!anon_vma))
134 goto out_enomem_free_avc; 170 goto out_enomem_free_avc;
135 allocated = anon_vma; 171 allocated = anon_vma;
136 /*
137 * This VMA had no anon_vma yet. This anon_vma is
138 * the root of any anon_vma tree that might form.
139 */
140 anon_vma->root = anon_vma;
141 } 172 }
142 173
143 anon_vma_lock(anon_vma); 174 anon_vma_lock(anon_vma);
@@ -156,7 +187,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
156 anon_vma_unlock(anon_vma); 187 anon_vma_unlock(anon_vma);
157 188
158 if (unlikely(allocated)) 189 if (unlikely(allocated))
159 anon_vma_free(allocated); 190 put_anon_vma(allocated);
160 if (unlikely(avc)) 191 if (unlikely(avc))
161 anon_vma_chain_free(avc); 192 anon_vma_chain_free(avc);
162 } 193 }
@@ -168,6 +199,32 @@ int anon_vma_prepare(struct vm_area_struct *vma)
168 return -ENOMEM; 199 return -ENOMEM;
169} 200}
170 201
202/*
203 * This is a useful helper function for locking the anon_vma root as
204 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
205 * have the same vma.
206 *
207 * Such anon_vma's should have the same root, so you'd expect to see
208 * just a single mutex_lock for the whole traversal.
209 */
210static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
211{
212 struct anon_vma *new_root = anon_vma->root;
213 if (new_root != root) {
214 if (WARN_ON_ONCE(root))
215 mutex_unlock(&root->mutex);
216 root = new_root;
217 mutex_lock(&root->mutex);
218 }
219 return root;
220}
221
222static inline void unlock_anon_vma_root(struct anon_vma *root)
223{
224 if (root)
225 mutex_unlock(&root->mutex);
226}
227
171static void anon_vma_chain_link(struct vm_area_struct *vma, 228static void anon_vma_chain_link(struct vm_area_struct *vma,
172 struct anon_vma_chain *avc, 229 struct anon_vma_chain *avc,
173 struct anon_vma *anon_vma) 230 struct anon_vma *anon_vma)
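lock_anon_vma_root()/unlock_anon_vma_root() above exist so that a chain whose elements usually share one root does not re-take the same mutex for every element: the lock is only dropped and retaken when the root actually changes. The sketch below models that batching in user space with pthread mutexes (build with -pthread); the root/item structures and their names are invented for the example.

/*
 * User-space model of "only relock when the root changes" while walking
 * a chain.  All types and names are invented stand-ins for the anon_vma
 * root mutex batching.
 */
#include <pthread.h>
#include <stdio.h>
#include <stddef.h>

struct root {
	pthread_mutex_t mutex;
	const char *name;
};

struct item {
	struct root *root;
};

static struct root *lock_root(struct root *held, struct root *wanted)
{
	if (wanted != held) {
		if (held)
			pthread_mutex_unlock(&held->mutex);
		pthread_mutex_lock(&wanted->mutex);
		printf("took lock on %s\n", wanted->name);
		held = wanted;
	}
	return held;
}

static void unlock_root(struct root *held)
{
	if (held)
		pthread_mutex_unlock(&held->mutex);
}

int main(void)
{
	static struct root a = { PTHREAD_MUTEX_INITIALIZER, "root A" };
	static struct root b = { PTHREAD_MUTEX_INITIALIZER, "root B" };
	struct item chain[] = { { &a }, { &a }, { &a }, { &b } };
	struct root *held = NULL;
	size_t i;

	for (i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
		held = lock_root(held, chain[i].root);
		/* ... work on chain[i] under its root lock ... */
	}
	unlock_root(held);      /* only two lock acquisitions for four items */
	return 0;
}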
@@ -176,9 +233,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
176 avc->anon_vma = anon_vma; 233 avc->anon_vma = anon_vma;
177 list_add(&avc->same_vma, &vma->anon_vma_chain); 234 list_add(&avc->same_vma, &vma->anon_vma_chain);
178 235
179 anon_vma_lock(anon_vma); 236 /*
237 * It's critical to add new vmas to the tail of the anon_vma,
238 * see comment in huge_memory.c:__split_huge_page().
239 */
180 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 240 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
181 anon_vma_unlock(anon_vma);
182} 241}
183 242
184/* 243/*
@@ -188,13 +247,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
188int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) 247int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
189{ 248{
190 struct anon_vma_chain *avc, *pavc; 249 struct anon_vma_chain *avc, *pavc;
250 struct anon_vma *root = NULL;
191 251
192 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { 252 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
193 avc = anon_vma_chain_alloc(); 253 struct anon_vma *anon_vma;
194 if (!avc) 254
195 goto enomem_failure; 255 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
196 anon_vma_chain_link(dst, avc, pavc->anon_vma); 256 if (unlikely(!avc)) {
257 unlock_anon_vma_root(root);
258 root = NULL;
259 avc = anon_vma_chain_alloc(GFP_KERNEL);
260 if (!avc)
261 goto enomem_failure;
262 }
263 anon_vma = pavc->anon_vma;
264 root = lock_anon_vma_root(root, anon_vma);
265 anon_vma_chain_link(dst, avc, anon_vma);
197 } 266 }
267 unlock_anon_vma_root(root);
198 return 0; 268 return 0;
199 269
200 enomem_failure: 270 enomem_failure:
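anon_vma_clone() above first tries a GFP_NOWAIT | __GFP_NOWARN allocation so it can stay under the batched root lock, and only when that fails drops the lock and retries with a blocking GFP_KERNEL allocation. A user-space model of the same try-fast-then-block-without-the-lock pattern is sketched below; alloc_fast(), alloc_slow() and the failure counter are invented stand-ins for the GFP flags and allocator behaviour.

/*
 * Sketch of "opportunistic non-blocking allocation under a lock, with a
 * blocking fallback after dropping it".  The allocators and the lock
 * here are invented stand-ins.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t root_lock = PTHREAD_MUTEX_INITIALIZER;
static int fast_failures_left = 2;      /* pretend the fast pool is short */

static void *alloc_fast(size_t sz)      /* may fail, never "sleeps" */
{
	if (fast_failures_left > 0) {
		fast_failures_left--;
		return NULL;
	}
	return malloc(sz);
}

static void *alloc_slow(size_t sz)      /* may "sleep", expected to succeed */
{
	return malloc(sz);
}

int main(void)
{
	int i;

	pthread_mutex_lock(&root_lock);
	for (i = 0; i < 4; i++) {
		void *p = alloc_fast(64);

		if (!p) {
			/* can't block with the lock held: drop, alloc, retake */
			pthread_mutex_unlock(&root_lock);
			p = alloc_slow(64);
			if (!p)
				return 1;
			pthread_mutex_lock(&root_lock);
			printf("item %d: used blocking fallback\n", i);
		} else {
			printf("item %d: fast allocation\n", i);
		}
		free(p);
	}
	pthread_mutex_unlock(&root_lock);
	return 0;
}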
@@ -227,7 +297,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
227 anon_vma = anon_vma_alloc(); 297 anon_vma = anon_vma_alloc();
228 if (!anon_vma) 298 if (!anon_vma)
229 goto out_error; 299 goto out_error;
230 avc = anon_vma_chain_alloc(); 300 avc = anon_vma_chain_alloc(GFP_KERNEL);
231 if (!avc) 301 if (!avc)
232 goto out_error_free_anon_vma; 302 goto out_error_free_anon_vma;
233 303
@@ -237,58 +307,63 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
237 */ 307 */
238 anon_vma->root = pvma->anon_vma->root; 308 anon_vma->root = pvma->anon_vma->root;
239 /* 309 /*
240 * With KSM refcounts, an anon_vma can stay around longer than the 310 * With refcounts, an anon_vma can stay around longer than the
241 * process it belongs to. The root anon_vma needs to be pinned 311 * process it belongs to. The root anon_vma needs to be pinned until
242 * until this anon_vma is freed, because the lock lives in the root. 312 * this anon_vma is freed, because the lock lives in the root.
243 */ 313 */
244 get_anon_vma(anon_vma->root); 314 get_anon_vma(anon_vma->root);
245 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 315 /* Mark this anon_vma as the one where our new (COWed) pages go. */
246 vma->anon_vma = anon_vma; 316 vma->anon_vma = anon_vma;
317 anon_vma_lock(anon_vma);
247 anon_vma_chain_link(vma, avc, anon_vma); 318 anon_vma_chain_link(vma, avc, anon_vma);
319 anon_vma_unlock(anon_vma);
248 320
249 return 0; 321 return 0;
250 322
251 out_error_free_anon_vma: 323 out_error_free_anon_vma:
252 anon_vma_free(anon_vma); 324 put_anon_vma(anon_vma);
253 out_error: 325 out_error:
254 unlink_anon_vmas(vma); 326 unlink_anon_vmas(vma);
255 return -ENOMEM; 327 return -ENOMEM;
256} 328}
257 329
258static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) 330void unlink_anon_vmas(struct vm_area_struct *vma)
259{ 331{
260 struct anon_vma *anon_vma = anon_vma_chain->anon_vma; 332 struct anon_vma_chain *avc, *next;
261 int empty; 333 struct anon_vma *root = NULL;
262 334
263 /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ 335 /*
264 if (!anon_vma) 336 * Unlink each anon_vma chained to the VMA. This list is ordered
265 return; 337 * from newest to oldest, ensuring the root anon_vma gets freed last.
338 */
339 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
340 struct anon_vma *anon_vma = avc->anon_vma;
266 341
267 anon_vma_lock(anon_vma); 342 root = lock_anon_vma_root(root, anon_vma);
268 list_del(&anon_vma_chain->same_anon_vma); 343 list_del(&avc->same_anon_vma);
269 344
270 /* We must garbage collect the anon_vma if it's empty */ 345 /*
271 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); 346 * Leave empty anon_vmas on the list - we'll need
272 anon_vma_unlock(anon_vma); 347 * to free them outside the lock.
348 */
349 if (list_empty(&anon_vma->head))
350 continue;
273 351
274 if (empty) { 352 list_del(&avc->same_vma);
275 /* We no longer need the root anon_vma */ 353 anon_vma_chain_free(avc);
276 if (anon_vma->root != anon_vma)
277 drop_anon_vma(anon_vma->root);
278 anon_vma_free(anon_vma);
279 } 354 }
280} 355 unlock_anon_vma_root(root);
281
282void unlink_anon_vmas(struct vm_area_struct *vma)
283{
284 struct anon_vma_chain *avc, *next;
285 356
286 /* 357 /*
287 * Unlink each anon_vma chained to the VMA. This list is ordered 358 * Iterate the list once more, it now only contains empty and unlinked
288 * from newest to oldest, ensuring the root anon_vma gets freed last. 359 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
360 * needing to acquire the anon_vma->root->mutex.
289 */ 361 */
290 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 362 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
291 anon_vma_unlink(avc); 363 struct anon_vma *anon_vma = avc->anon_vma;
364
365 put_anon_vma(anon_vma);
366
292 list_del(&avc->same_vma); 367 list_del(&avc->same_vma);
293 anon_vma_chain_free(avc); 368 anon_vma_chain_free(avc);
294 } 369 }
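The reworked unlink_anon_vmas() above tears the chain down in two passes: entries are unlinked while the root mutex is held, but anon_vmas whose final put could itself need that mutex are only freed after the lock is dropped. A rough user-space sketch of that split follows; the node list, the needs_late_free flag and free_node() are invented to show the shape of the two passes, not the kernel logic itself.

/*
 * Sketch of a two-pass teardown: unlink under a shared lock, defer any
 * freeing that itself needs locking until after the lock is dropped.
 * All names here are invented for the example.
 */
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

struct node {
	struct node *next;
	int needs_late_free;    /* e.g. the final put may take the lock */
	int id;
};

static pthread_mutex_t shared_lock = PTHREAD_MUTEX_INITIALIZER;

static void free_node(struct node *n)
{
	printf("freeing node %d\n", n->id);
	free(n);
}

int main(void)
{
	struct node *head = NULL, *n, *next, *deferred = NULL;
	int i;

	for (i = 0; i < 4; i++) {
		n = malloc(sizeof(*n));
		if (!n)
			return 1;
		n->id = i;
		n->needs_late_free = (i % 2);   /* pretend half need deferral */
		n->next = head;
		head = n;
	}

	/* pass 1: unlink under the lock, defer the awkward frees */
	pthread_mutex_lock(&shared_lock);
	for (n = head; n; n = next) {
		next = n->next;
		if (n->needs_late_free) {
			n->next = deferred;
			deferred = n;
		} else {
			free_node(n);
		}
	}
	pthread_mutex_unlock(&shared_lock);

	/* pass 2: with the lock dropped, free what was deferred */
	for (n = deferred; n; n = next) {
		next = n->next;
		free_node(n);
	}
	return 0;
}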
@@ -298,8 +373,8 @@ static void anon_vma_ctor(void *data)
298{ 373{
299 struct anon_vma *anon_vma = data; 374 struct anon_vma *anon_vma = data;
300 375
301 spin_lock_init(&anon_vma->lock); 376 mutex_init(&anon_vma->mutex);
302 anonvma_external_refcount_init(anon_vma); 377 atomic_set(&anon_vma->refcount, 0);
303 INIT_LIST_HEAD(&anon_vma->head); 378 INIT_LIST_HEAD(&anon_vma->head);
304} 379}
305 380
@@ -311,12 +386,31 @@ void __init anon_vma_init(void)
311} 386}
312 387
313/* 388/*
314 * Getting a lock on a stable anon_vma from a page off the LRU is 389 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
315 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 390 *
391 * Since there is no serialization whatsoever against page_remove_rmap()
392 * the best this function can do is return a locked anon_vma that might
393 * have been relevant to this page.
394 *
395 * The page might have been remapped to a different anon_vma or the anon_vma
396 * returned may already be freed (and even reused).
397 *
398 * In case it was remapped to a different anon_vma, the new anon_vma will be a
399 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
400 * ensure that any anon_vma obtained from the page will still be valid for as
401 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
402 *
403 * All users of this function must be very careful when walking the anon_vma
404 * chain and verify that the page in question is indeed mapped in it
405 * [ something equivalent to page_mapped_in_vma() ].
406 *
407 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
408 * that the anon_vma pointer from page->mapping is valid if there is a
409 * mapcount, we can dereference the anon_vma after observing those.
316 */ 410 */
317struct anon_vma *page_lock_anon_vma(struct page *page) 411struct anon_vma *page_get_anon_vma(struct page *page)
318{ 412{
319 struct anon_vma *anon_vma, *root_anon_vma; 413 struct anon_vma *anon_vma = NULL;
320 unsigned long anon_mapping; 414 unsigned long anon_mapping;
321 415
322 rcu_read_lock(); 416 rcu_read_lock();
@@ -327,30 +421,100 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
327 goto out; 421 goto out;
328 422
329 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 423 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
330 root_anon_vma = ACCESS_ONCE(anon_vma->root); 424 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
331 spin_lock(&root_anon_vma->lock); 425 anon_vma = NULL;
426 goto out;
427 }
332 428
333 /* 429 /*
334 * If this page is still mapped, then its anon_vma cannot have been 430 * If this page is still mapped, then its anon_vma cannot have been
335 * freed. But if it has been unmapped, we have no security against 431 * freed. But if it has been unmapped, we have no security against the
336 * the anon_vma structure being freed and reused (for another anon_vma: 432 * anon_vma structure being freed and reused (for another anon_vma:
337 * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot 433 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
338 * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting 434 * above cannot corrupt).
339 * anon_vma->root before page_unlock_anon_vma() is called to unlock.
340 */ 435 */
341 if (page_mapped(page)) 436 if (!page_mapped(page)) {
342 return anon_vma; 437 put_anon_vma(anon_vma);
438 anon_vma = NULL;
439 }
440out:
441 rcu_read_unlock();
442
443 return anon_vma;
444}
445
446/*
447 * Similar to page_get_anon_vma() except it locks the anon_vma.
448 *
449 * It's a little more complex as it tries to keep the fast path to a single
450 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
451 * reference like with page_get_anon_vma() and then block on the mutex.
452 */
453struct anon_vma *page_lock_anon_vma(struct page *page)
454{
455 struct anon_vma *anon_vma = NULL;
456 struct anon_vma *root_anon_vma;
457 unsigned long anon_mapping;
458
459 rcu_read_lock();
460 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
461 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
462 goto out;
463 if (!page_mapped(page))
464 goto out;
465
466 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
467 root_anon_vma = ACCESS_ONCE(anon_vma->root);
468 if (mutex_trylock(&root_anon_vma->mutex)) {
469 /*
470 * If the page is still mapped, then this anon_vma is still
471 * its anon_vma, and holding the mutex ensures that it will
472 * not go away, see anon_vma_free().
473 */
474 if (!page_mapped(page)) {
475 mutex_unlock(&root_anon_vma->mutex);
476 anon_vma = NULL;
477 }
478 goto out;
479 }
480
481 /* trylock failed, we got to sleep */
482 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
483 anon_vma = NULL;
484 goto out;
485 }
486
487 if (!page_mapped(page)) {
488 put_anon_vma(anon_vma);
489 anon_vma = NULL;
490 goto out;
491 }
492
493 /* we pinned the anon_vma, it's safe to sleep */
494 rcu_read_unlock();
495 anon_vma_lock(anon_vma);
496
497 if (atomic_dec_and_test(&anon_vma->refcount)) {
498 /*
499 * Oops, we held the last refcount, release the lock
500 * and bail -- can't simply use put_anon_vma() because
501 * we'll deadlock on the anon_vma_lock() recursion.
502 */
503 anon_vma_unlock(anon_vma);
504 __put_anon_vma(anon_vma);
505 anon_vma = NULL;
506 }
507
508 return anon_vma;
343 509
344 spin_unlock(&root_anon_vma->lock);
345out: 510out:
346 rcu_read_unlock(); 511 rcu_read_unlock();
347 return NULL; 512 return anon_vma;
348} 513}
349 514
350void page_unlock_anon_vma(struct anon_vma *anon_vma) 515void page_unlock_anon_vma(struct anon_vma *anon_vma)
351{ 516{
352 anon_vma_unlock(anon_vma); 517 anon_vma_unlock(anon_vma);
353 rcu_read_unlock();
354} 518}
355 519
356/* 520/*
@@ -358,7 +522,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
358 * Returns virtual address or -EFAULT if page's index/offset is not 522 * Returns virtual address or -EFAULT if page's index/offset is not
359 * within the range mapped the @vma. 523 * within the range mapped the @vma.
360 */ 524 */
361static inline unsigned long 525inline unsigned long
362vma_address(struct page *page, struct vm_area_struct *vma) 526vma_address(struct page *page, struct vm_area_struct *vma)
363{ 527{
364 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 528 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -407,7 +571,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
407 * 571 *
408 * On success returns with pte mapped and locked. 572 * On success returns with pte mapped and locked.
409 */ 573 */
410pte_t *page_check_address(struct page *page, struct mm_struct *mm, 574pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
411 unsigned long address, spinlock_t **ptlp, int sync) 575 unsigned long address, spinlock_t **ptlp, int sync)
412{ 576{
413 pgd_t *pgd; 577 pgd_t *pgd;
@@ -433,6 +597,8 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
433 pmd = pmd_offset(pud, address); 597 pmd = pmd_offset(pud, address);
434 if (!pmd_present(*pmd)) 598 if (!pmd_present(*pmd))
435 return NULL; 599 return NULL;
600 if (pmd_trans_huge(*pmd))
601 return NULL;
436 602
437 pte = pte_offset_map(pmd, address); 603 pte = pte_offset_map(pmd, address);
438 /* Make a quick check before getting the lock */ 604 /* Make a quick check before getting the lock */
@@ -487,35 +653,65 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
487 unsigned long *vm_flags) 653 unsigned long *vm_flags)
488{ 654{
489 struct mm_struct *mm = vma->vm_mm; 655 struct mm_struct *mm = vma->vm_mm;
490 pte_t *pte;
491 spinlock_t *ptl;
492 int referenced = 0; 656 int referenced = 0;
493 657
494 pte = page_check_address(page, mm, address, &ptl, 0); 658 if (unlikely(PageTransHuge(page))) {
495 if (!pte) 659 pmd_t *pmd;
496 goto out;
497
498 /*
499 * Don't want to elevate referenced for mlocked page that gets this far,
500 * in order that it progresses to try_to_unmap and is moved to the
501 * unevictable list.
502 */
503 if (vma->vm_flags & VM_LOCKED) {
504 *mapcount = 1; /* break early from loop */
505 *vm_flags |= VM_LOCKED;
506 goto out_unmap;
507 }
508 660
509 if (ptep_clear_flush_young_notify(vma, address, pte)) { 661 spin_lock(&mm->page_table_lock);
510 /* 662 /*
511 * Don't treat a reference through a sequentially read 663 * rmap might return false positives; we must filter
512 * mapping as such. If the page has been used in 664 * these out using page_check_address_pmd().
513 * another mapping, we will catch it; if this other
514 * mapping is already gone, the unmap path will have
515 * set PG_referenced or activated the page.
516 */ 665 */
517 if (likely(!VM_SequentialReadHint(vma))) 666 pmd = page_check_address_pmd(page, mm, address,
667 PAGE_CHECK_ADDRESS_PMD_FLAG);
668 if (!pmd) {
669 spin_unlock(&mm->page_table_lock);
670 goto out;
671 }
672
673 if (vma->vm_flags & VM_LOCKED) {
674 spin_unlock(&mm->page_table_lock);
675 *mapcount = 0; /* break early from loop */
676 *vm_flags |= VM_LOCKED;
677 goto out;
678 }
679
680 /* go ahead even if the pmd is pmd_trans_splitting() */
681 if (pmdp_clear_flush_young_notify(vma, address, pmd))
518 referenced++; 682 referenced++;
683 spin_unlock(&mm->page_table_lock);
684 } else {
685 pte_t *pte;
686 spinlock_t *ptl;
687
688 /*
689 * rmap might return false positives; we must filter
690 * these out using page_check_address().
691 */
692 pte = page_check_address(page, mm, address, &ptl, 0);
693 if (!pte)
694 goto out;
695
696 if (vma->vm_flags & VM_LOCKED) {
697 pte_unmap_unlock(pte, ptl);
698 *mapcount = 0; /* break early from loop */
699 *vm_flags |= VM_LOCKED;
700 goto out;
701 }
702
703 if (ptep_clear_flush_young_notify(vma, address, pte)) {
704 /*
705 * Don't treat a reference through a sequentially read
706 * mapping as such. If the page has been used in
707 * another mapping, we will catch it; if this other
708 * mapping is already gone, the unmap path will have
709 * set PG_referenced or activated the page.
710 */
711 if (likely(!VM_SequentialReadHint(vma)))
712 referenced++;
713 }
714 pte_unmap_unlock(pte, ptl);
519 } 715 }
520 716
521 /* Pretend the page is referenced if the task has the 717 /* Pretend the page is referenced if the task has the
@@ -524,9 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
524 rwsem_is_locked(&mm->mmap_sem)) 720 rwsem_is_locked(&mm->mmap_sem))
525 referenced++; 721 referenced++;
526 722
527out_unmap:
528 (*mapcount)--; 723 (*mapcount)--;
529 pte_unmap_unlock(pte, ptl);
530 724
531 if (referenced) 725 if (referenced)
532 *vm_flags |= vma->vm_flags; 726 *vm_flags |= vma->vm_flags;
@@ -605,14 +799,14 @@ static int page_referenced_file(struct page *page,
605 * The page lock not only makes sure that page->mapping cannot 799 * The page lock not only makes sure that page->mapping cannot
606 * suddenly be NULLified by truncation, it makes sure that the 800 * suddenly be NULLified by truncation, it makes sure that the
607 * structure at mapping cannot be freed and reused yet, 801 * structure at mapping cannot be freed and reused yet,
608 * so we can safely take mapping->i_mmap_lock. 802 * so we can safely take mapping->i_mmap_mutex.
609 */ 803 */
610 BUG_ON(!PageLocked(page)); 804 BUG_ON(!PageLocked(page));
611 805
612 spin_lock(&mapping->i_mmap_lock); 806 mutex_lock(&mapping->i_mmap_mutex);
613 807
614 /* 808 /*
615 * i_mmap_lock does not stabilize mapcount at all, but mapcount 809 * i_mmap_mutex does not stabilize mapcount at all, but mapcount
616 * is more likely to be accurate if we note it after spinning. 810 * is more likely to be accurate if we note it after spinning.
617 */ 811 */
618 mapcount = page_mapcount(page); 812 mapcount = page_mapcount(page);
@@ -634,7 +828,7 @@ static int page_referenced_file(struct page *page,
634 break; 828 break;
635 } 829 }
636 830
637 spin_unlock(&mapping->i_mmap_lock); 831 mutex_unlock(&mapping->i_mmap_mutex);
638 return referenced; 832 return referenced;
639} 833}
640 834
@@ -678,7 +872,7 @@ int page_referenced(struct page *page,
678 unlock_page(page); 872 unlock_page(page);
679 } 873 }
680out: 874out:
681 if (page_test_and_clear_young(page)) 875 if (page_test_and_clear_young(page_to_pfn(page)))
682 referenced++; 876 referenced++;
683 877
684 return referenced; 878 return referenced;
@@ -721,7 +915,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
721 915
722 BUG_ON(PageAnon(page)); 916 BUG_ON(PageAnon(page));
723 917
724 spin_lock(&mapping->i_mmap_lock); 918 mutex_lock(&mapping->i_mmap_mutex);
725 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 919 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
726 if (vma->vm_flags & VM_SHARED) { 920 if (vma->vm_flags & VM_SHARED) {
727 unsigned long address = vma_address(page, vma); 921 unsigned long address = vma_address(page, vma);
@@ -730,7 +924,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
730 ret += page_mkclean_one(page, vma, address); 924 ret += page_mkclean_one(page, vma, address);
731 } 925 }
732 } 926 }
733 spin_unlock(&mapping->i_mmap_lock); 927 mutex_unlock(&mapping->i_mmap_mutex);
734 return ret; 928 return ret;
735} 929}
736 930
@@ -744,10 +938,8 @@ int page_mkclean(struct page *page)
744 struct address_space *mapping = page_mapping(page); 938 struct address_space *mapping = page_mapping(page);
745 if (mapping) { 939 if (mapping) {
746 ret = page_mkclean_file(mapping, page); 940 ret = page_mkclean_file(mapping, page);
747 if (page_test_dirty(page)) { 941 if (page_test_and_clear_dirty(page_to_pfn(page), 1))
748 page_clear_dirty(page);
749 ret = 1; 942 ret = 1;
750 }
751 } 943 }
752 } 944 }
753 945
@@ -780,10 +972,10 @@ void page_move_anon_rmap(struct page *page,
780} 972}
781 973
782/** 974/**
783 * __page_set_anon_rmap - setup new anonymous rmap 975 * __page_set_anon_rmap - set up new anonymous rmap
784 * @page: the page to add the mapping to 976 * @page: Page to add to rmap
785 * @vma: the vm area in which the mapping is added 977 * @vma: VM area to add page to.
786 * @address: the user virtual address mapped 978 * @address: User virtual address of the mapping
787 * @exclusive: the page is exclusively owned by the current process 979 * @exclusive: the page is exclusively owned by the current process
788 */ 980 */
789static void __page_set_anon_rmap(struct page *page, 981static void __page_set_anon_rmap(struct page *page,
@@ -793,25 +985,16 @@ static void __page_set_anon_rmap(struct page *page,
793 985
794 BUG_ON(!anon_vma); 986 BUG_ON(!anon_vma);
795 987
988 if (PageAnon(page))
989 return;
990
796 /* 991 /*
797 * If the page isn't exclusively mapped into this vma, 992 * If the page isn't exclusively mapped into this vma,
798 * we must use the _oldest_ possible anon_vma for the 993 * we must use the _oldest_ possible anon_vma for the
799 * page mapping! 994 * page mapping!
800 */ 995 */
801 if (!exclusive) { 996 if (!exclusive)
802 if (PageAnon(page))
803 return;
804 anon_vma = anon_vma->root; 997 anon_vma = anon_vma->root;
805 } else {
806 /*
807 * In this case, swapped-out-but-not-discarded swap-cache
808 * is remapped. So, no need to update page->mapping here.
809 * We convice anon_vma poitned by page->mapping is not obsolete
810 * because vma->anon_vma is necessary to be a family of it.
811 */
812 if (PageAnon(page))
813 return;
814 }
815 998
816 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 999 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
817 page->mapping = (struct address_space *) anon_vma; 1000 page->mapping = (struct address_space *) anon_vma;
@@ -871,13 +1054,18 @@ void do_page_add_anon_rmap(struct page *page,
871 struct vm_area_struct *vma, unsigned long address, int exclusive) 1054 struct vm_area_struct *vma, unsigned long address, int exclusive)
872{ 1055{
873 int first = atomic_inc_and_test(&page->_mapcount); 1056 int first = atomic_inc_and_test(&page->_mapcount);
874 if (first) 1057 if (first) {
875 __inc_zone_page_state(page, NR_ANON_PAGES); 1058 if (!PageTransHuge(page))
1059 __inc_zone_page_state(page, NR_ANON_PAGES);
1060 else
1061 __inc_zone_page_state(page,
1062 NR_ANON_TRANSPARENT_HUGEPAGES);
1063 }
876 if (unlikely(PageKsm(page))) 1064 if (unlikely(PageKsm(page)))
877 return; 1065 return;
878 1066
879 VM_BUG_ON(!PageLocked(page)); 1067 VM_BUG_ON(!PageLocked(page));
880 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1068 /* address might be in next vma when migration races vma_adjust */
881 if (first) 1069 if (first)
882 __page_set_anon_rmap(page, vma, address, exclusive); 1070 __page_set_anon_rmap(page, vma, address, exclusive);
883 else 1071 else
@@ -900,7 +1088,10 @@ void page_add_new_anon_rmap(struct page *page,
900 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1088 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
901 SetPageSwapBacked(page); 1089 SetPageSwapBacked(page);
902 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 1090 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
903 __inc_zone_page_state(page, NR_ANON_PAGES); 1091 if (!PageTransHuge(page))
1092 __inc_zone_page_state(page, NR_ANON_PAGES);
1093 else
1094 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
904 __page_set_anon_rmap(page, vma, address, 1); 1095 __page_set_anon_rmap(page, vma, address, 1);
905 if (page_evictable(page, vma)) 1096 if (page_evictable(page, vma))
906 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1097 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -918,7 +1109,7 @@ void page_add_file_rmap(struct page *page)
918{ 1109{
919 if (atomic_inc_and_test(&page->_mapcount)) { 1110 if (atomic_inc_and_test(&page->_mapcount)) {
920 __inc_zone_page_state(page, NR_FILE_MAPPED); 1111 __inc_zone_page_state(page, NR_FILE_MAPPED);
921 mem_cgroup_update_file_mapped(page, 1); 1112 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
922 } 1113 }
923} 1114}
924 1115
@@ -941,10 +1132,9 @@ void page_remove_rmap(struct page *page)
941 * not if it's in swapcache - there might be another pte slot 1132 * not if it's in swapcache - there might be another pte slot
942 * containing the swap entry, but page not yet written to swap. 1133 * containing the swap entry, but page not yet written to swap.
943 */ 1134 */
944 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { 1135 if ((!PageAnon(page) || PageSwapCache(page)) &&
945 page_clear_dirty(page); 1136 page_test_and_clear_dirty(page_to_pfn(page), 1))
946 set_page_dirty(page); 1137 set_page_dirty(page);
947 }
948 /* 1138 /*
949 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED 1139 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
950 * and not charged by memcg for now. 1140 * and not charged by memcg for now.
@@ -953,10 +1143,14 @@ void page_remove_rmap(struct page *page)
953 return; 1143 return;
954 if (PageAnon(page)) { 1144 if (PageAnon(page)) {
955 mem_cgroup_uncharge_page(page); 1145 mem_cgroup_uncharge_page(page);
956 __dec_zone_page_state(page, NR_ANON_PAGES); 1146 if (!PageTransHuge(page))
1147 __dec_zone_page_state(page, NR_ANON_PAGES);
1148 else
1149 __dec_zone_page_state(page,
1150 NR_ANON_TRANSPARENT_HUGEPAGES);
957 } else { 1151 } else {
958 __dec_zone_page_state(page, NR_FILE_MAPPED); 1152 __dec_zone_page_state(page, NR_FILE_MAPPED);
959 mem_cgroup_update_file_mapped(page, -1); 1153 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
960 } 1154 }
961 /* 1155 /*
962 * It would be tidy to reset the PageAnon mapping here, 1156 * It would be tidy to reset the PageAnon mapping here,
@@ -1078,7 +1272,7 @@ out_mlock:
1078 /* 1272 /*
1079 * We need mmap_sem locking, Otherwise VM_LOCKED check makes 1273 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
1080 * unstable result and race. Plus, We can't wait here because 1274 * unstable result and race. Plus, We can't wait here because
1081 * we now hold anon_vma->lock or mapping->i_mmap_lock. 1275 * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
1082 * if trylock failed, the page remain in evictable lru and later 1276 * if trylock failed, the page remain in evictable lru and later
1083 * vmscan could retry to move the page to unevictable lru if the 1277 * vmscan could retry to move the page to unevictable lru if the
1084 * page is actually mlocked. 1278 * page is actually mlocked.
@@ -1209,7 +1403,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1209 return ret; 1403 return ret;
1210} 1404}
1211 1405
1212static bool is_vma_temporary_stack(struct vm_area_struct *vma) 1406bool is_vma_temporary_stack(struct vm_area_struct *vma)
1213{ 1407{
1214 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1408 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1215 1409
@@ -1304,7 +1498,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1304 unsigned long max_nl_size = 0; 1498 unsigned long max_nl_size = 0;
1305 unsigned int mapcount; 1499 unsigned int mapcount;
1306 1500
1307 spin_lock(&mapping->i_mmap_lock); 1501 mutex_lock(&mapping->i_mmap_mutex);
1308 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1502 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1309 unsigned long address = vma_address(page, vma); 1503 unsigned long address = vma_address(page, vma);
1310 if (address == -EFAULT) 1504 if (address == -EFAULT)
@@ -1350,7 +1544,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1350 mapcount = page_mapcount(page); 1544 mapcount = page_mapcount(page);
1351 if (!mapcount) 1545 if (!mapcount)
1352 goto out; 1546 goto out;
1353 cond_resched_lock(&mapping->i_mmap_lock); 1547 cond_resched();
1354 1548
1355 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 1549 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
1356 if (max_nl_cursor == 0) 1550 if (max_nl_cursor == 0)
@@ -1372,7 +1566,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1372 } 1566 }
1373 vma->vm_private_data = (void *) max_nl_cursor; 1567 vma->vm_private_data = (void *) max_nl_cursor;
1374 } 1568 }
1375 cond_resched_lock(&mapping->i_mmap_lock); 1569 cond_resched();
1376 max_nl_cursor += CLUSTER_SIZE; 1570 max_nl_cursor += CLUSTER_SIZE;
1377 } while (max_nl_cursor <= max_nl_size); 1571 } while (max_nl_cursor <= max_nl_size);
1378 1572
@@ -1384,7 +1578,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1384 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 1578 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1385 vma->vm_private_data = NULL; 1579 vma->vm_private_data = NULL;
1386out: 1580out:
1387 spin_unlock(&mapping->i_mmap_lock); 1581 mutex_unlock(&mapping->i_mmap_mutex);
1388 return ret; 1582 return ret;
1389} 1583}
1390 1584
@@ -1407,6 +1601,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1407 int ret; 1601 int ret;
1408 1602
1409 BUG_ON(!PageLocked(page)); 1603 BUG_ON(!PageLocked(page));
1604 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
1410 1605
1411 if (unlikely(PageKsm(page))) 1606 if (unlikely(PageKsm(page)))
1412 ret = try_to_unmap_ksm(page, flags); 1607 ret = try_to_unmap_ksm(page, flags);
@@ -1446,41 +1641,15 @@ int try_to_munlock(struct page *page)
1446 return try_to_unmap_file(page, TTU_MUNLOCK); 1641 return try_to_unmap_file(page, TTU_MUNLOCK);
1447} 1642}
1448 1643
1449#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) 1644void __put_anon_vma(struct anon_vma *anon_vma)
1450/*
1451 * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
1452 * if necessary. Be careful to do all the tests under the lock. Once
1453 * we know we are the last user, nobody else can get a reference and we
1454 * can do the freeing without the lock.
1455 */
1456void drop_anon_vma(struct anon_vma *anon_vma)
1457{ 1645{
1458 BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); 1646 struct anon_vma *root = anon_vma->root;
1459 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
1460 struct anon_vma *root = anon_vma->root;
1461 int empty = list_empty(&anon_vma->head);
1462 int last_root_user = 0;
1463 int root_empty = 0;
1464 1647
1465 /* 1648 if (root != anon_vma && atomic_dec_and_test(&root->refcount))
1466 * The refcount on a non-root anon_vma got dropped. Drop 1649 anon_vma_free(root);
1467 * the refcount on the root and check if we need to free it.
1468 */
1469 if (empty && anon_vma != root) {
1470 BUG_ON(atomic_read(&root->external_refcount) <= 0);
1471 last_root_user = atomic_dec_and_test(&root->external_refcount);
1472 root_empty = list_empty(&root->head);
1473 }
1474 anon_vma_unlock(anon_vma);
1475 1650
1476 if (empty) { 1651 anon_vma_free(anon_vma);
1477 anon_vma_free(anon_vma);
1478 if (root_empty && last_root_user)
1479 anon_vma_free(root);
1480 }
1481 }
1482} 1652}
1483#endif
1484 1653
1485#ifdef CONFIG_MIGRATION 1654#ifdef CONFIG_MIGRATION
1486/* 1655/*
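For orientation: the hunk above replaces the old drop_anon_vma() external_refcount dance with a single refcount per anon_vma, where the final put frees the anon_vma and, if it is not its own root, also drops one reference on the root. A minimal user-space sketch of that shape, using C11 atomics and made-up names rather than the kernel API:

#include <stdatomic.h>
#include <stdlib.h>

struct node {
        atomic_int refcount;
        struct node *root;              /* a root node points at itself */
};

static void node_free(struct node *n)
{
        free(n);
}

/* Mirrors the shape of __put_anon_vma(): called once n's own count hit zero. */
static void __put_node(struct node *n)
{
        struct node *root = n->root;

        if (root != n && atomic_fetch_sub(&root->refcount, 1) == 1)
                node_free(root);        /* that was the root's last reference */
        node_free(n);
}

static void put_node(struct node *n)
{
        if (atomic_fetch_sub(&n->refcount, 1) == 1)
                __put_node(n);
}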
@@ -1528,7 +1697,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1528 1697
1529 if (!mapping) 1698 if (!mapping)
1530 return ret; 1699 return ret;
1531 spin_lock(&mapping->i_mmap_lock); 1700 mutex_lock(&mapping->i_mmap_mutex);
1532 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1701 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1533 unsigned long address = vma_address(page, vma); 1702 unsigned long address = vma_address(page, vma);
1534 if (address == -EFAULT) 1703 if (address == -EFAULT)
@@ -1542,7 +1711,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1542 * never contain migration ptes. Decide what to do about this 1711 * never contain migration ptes. Decide what to do about this
1543 * limitation to linear when we need rmap_walk() on nonlinear. 1712 * limitation to linear when we need rmap_walk() on nonlinear.
1544 */ 1713 */
1545 spin_unlock(&mapping->i_mmap_lock); 1714 mutex_unlock(&mapping->i_mmap_mutex);
1546 return ret; 1715 return ret;
1547} 1716}
1548 1717
@@ -1591,7 +1760,7 @@ void hugepage_add_anon_rmap(struct page *page,
1591 1760
1592 BUG_ON(!PageLocked(page)); 1761 BUG_ON(!PageLocked(page));
1593 BUG_ON(!anon_vma); 1762 BUG_ON(!anon_vma);
1594 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1763 /* address might be in next vma when migration races vma_adjust */
1595 first = atomic_inc_and_test(&page->_mapcount); 1764 first = atomic_inc_and_test(&page->_mapcount);
1596 if (first) 1765 if (first)
1597 __hugepage_set_anon_rmap(page, vma, address, 0); 1766 __hugepage_set_anon_rmap(page, vma, address, 0);
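A theme running through this file is the conversion of mapping->i_mmap_lock (a spinlock) to mapping->i_mmap_mutex (a sleeping lock), which is also why the walkers switch from cond_resched_lock() to a bare cond_resched(). Roughly, in user-space pthread terms (illustrative only, not the kernel API):

#include <pthread.h>
#include <sched.h>

static pthread_mutex_t i_mmap_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Under a spinlock the walker had to drop the lock to let others run;
 * under a sleeping mutex it can simply yield while still holding it.
 */
static void walk_mappings(void (*visit)(int index), int nr)
{
        int i;

        pthread_mutex_lock(&i_mmap_mutex);
        for (i = 0; i < nr; i++) {
                visit(i);
                sched_yield();          /* analogue of cond_resched() */
        }
        pthread_mutex_unlock(&i_mmap_mutex);
}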
diff --git a/mm/shmem.c b/mm/shmem.c
index 080b09a57a8f..fcedf5464eb7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -99,6 +99,13 @@ static struct vfsmount *shm_mnt;
99/* Pretend that each entry is of this size in directory's i_size */ 99/* Pretend that each entry is of this size in directory's i_size */
100#define BOGO_DIRENT_SIZE 20 100#define BOGO_DIRENT_SIZE 20
101 101
102struct shmem_xattr {
103 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
104 char *name; /* xattr name */
105 size_t size;
106 char value[0];
107};
108
102/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 109/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
103enum sgp_type { 110enum sgp_type {
104 SGP_READ, /* don't exceed i_size, don't allocate page */ 111 SGP_READ, /* don't exceed i_size, don't allocate page */
@@ -224,7 +231,6 @@ static const struct vm_operations_struct shmem_vm_ops;
224static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 231static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
225 .ra_pages = 0, /* No readahead */ 232 .ra_pages = 0, /* No readahead */
226 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 233 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
227 .unplug_io_fn = default_unplug_io_fn,
228}; 234};
229 235
230static LIST_HEAD(shmem_swaplist); 236static LIST_HEAD(shmem_swaplist);
@@ -422,7 +428,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
422 * a waste to allocate index if we cannot allocate data. 428 * a waste to allocate index if we cannot allocate data.
423 */ 429 */
424 if (sbinfo->max_blocks) { 430 if (sbinfo->max_blocks) {
425 if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0) 431 if (percpu_counter_compare(&sbinfo->used_blocks,
432 sbinfo->max_blocks - 1) >= 0)
426 return ERR_PTR(-ENOSPC); 433 return ERR_PTR(-ENOSPC);
427 percpu_counter_inc(&sbinfo->used_blocks); 434 percpu_counter_inc(&sbinfo->used_blocks);
428 spin_lock(&inode->i_lock); 435 spin_lock(&inode->i_lock);
@@ -532,7 +539,7 @@ static void shmem_free_pages(struct list_head *next)
532 } while (next); 539 } while (next);
533} 540}
534 541
535static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 542void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
536{ 543{
537 struct shmem_inode_info *info = SHMEM_I(inode); 544 struct shmem_inode_info *info = SHMEM_I(inode);
538 unsigned long idx; 545 unsigned long idx;
@@ -555,6 +562,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
555 spinlock_t *punch_lock; 562 spinlock_t *punch_lock;
556 unsigned long upper_limit; 563 unsigned long upper_limit;
557 564
565 truncate_inode_pages_range(inode->i_mapping, start, end);
566
558 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 567 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
559 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 568 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
560 if (idx >= info->next_index) 569 if (idx >= info->next_index)
@@ -731,16 +740,8 @@ done2:
731 * lowered next_index. Also, though shmem_getpage checks 740 * lowered next_index. Also, though shmem_getpage checks
732 * i_size before adding to cache, no recheck after: so fix the 741 * i_size before adding to cache, no recheck after: so fix the
733 * narrow window there too. 742 * narrow window there too.
734 *
735 * Recalling truncate_inode_pages_range and unmap_mapping_range
736 * every time for punch_hole (which never got a chance to clear
737 * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
738 * yet hardly ever necessary: try to optimize them out later.
739 */ 743 */
740 truncate_inode_pages_range(inode->i_mapping, start, end); 744 truncate_inode_pages_range(inode->i_mapping, start, end);
741 if (punch_hole)
742 unmap_mapping_range(inode->i_mapping, start,
743 end - start, 1);
744 } 745 }
745 746
746 spin_lock(&info->lock); 747 spin_lock(&info->lock);
@@ -759,27 +760,28 @@ done2:
759 shmem_free_pages(pages_to_free.next); 760 shmem_free_pages(pages_to_free.next);
760 } 761 }
761} 762}
763EXPORT_SYMBOL_GPL(shmem_truncate_range);
762 764
763static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) 765static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
764{ 766{
765 struct inode *inode = dentry->d_inode; 767 struct inode *inode = dentry->d_inode;
766 loff_t newsize = attr->ia_size;
767 int error; 768 int error;
768 769
769 error = inode_change_ok(inode, attr); 770 error = inode_change_ok(inode, attr);
770 if (error) 771 if (error)
771 return error; 772 return error;
772 773
773 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) 774 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
774 && newsize != inode->i_size) { 775 loff_t oldsize = inode->i_size;
776 loff_t newsize = attr->ia_size;
775 struct page *page = NULL; 777 struct page *page = NULL;
776 778
777 if (newsize < inode->i_size) { 779 if (newsize < oldsize) {
778 /* 780 /*
779 * If truncating down to a partial page, then 781 * If truncating down to a partial page, then
780 * if that page is already allocated, hold it 782 * if that page is already allocated, hold it
781 * in memory until the truncation is over, so 783 * in memory until the truncation is over, so
782 * truncate_partial_page cannnot miss it were 784 * truncate_partial_page cannot miss it were
783 * it assigned to swap. 785 * it assigned to swap.
784 */ 786 */
785 if (newsize & (PAGE_CACHE_SIZE-1)) { 787 if (newsize & (PAGE_CACHE_SIZE-1)) {
@@ -803,12 +805,19 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
803 spin_unlock(&info->lock); 805 spin_unlock(&info->lock);
804 } 806 }
805 } 807 }
806 808 if (newsize != oldsize) {
807 /* XXX(truncate): truncate_setsize should be called last */ 809 i_size_write(inode, newsize);
808 truncate_setsize(inode, newsize); 810 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
811 }
812 if (newsize < oldsize) {
813 loff_t holebegin = round_up(newsize, PAGE_SIZE);
814 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
815 shmem_truncate_range(inode, newsize, (loff_t)-1);
816 /* unmap again to remove racily COWed private pages */
817 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
818 }
809 if (page) 819 if (page)
810 page_cache_release(page); 820 page_cache_release(page);
811 shmem_truncate_range(inode, newsize, (loff_t)-1);
812 } 821 }
813 822
814 setattr_copy(inode, attr); 823 setattr_copy(inode, attr);
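The rewritten shmem_setattr() above orders a size reduction as: publish the new i_size, unmap the truncated tail, free the backing pages, then unmap once more to catch private pages COWed racily in between. A stub sketch of that ordering (the helpers are placeholders, not kernel calls):

#include <stdio.h>

static long i_size;

static void unmap_tail(long from)
{
        printf("unmap mappings beyond %ld\n", from);
}

static void free_pages(long from)
{
        printf("free pages beyond %ld\n", from);
}

static void truncate_down(long newsize)
{
        i_size = newsize;       /* publish the new size first */
        unmap_tail(newsize);    /* drop existing mappings of the tail */
        free_pages(newsize);    /* then release the backing pages */
        unmap_tail(newsize);    /* again: catch racily COWed private pages */
}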
@@ -822,9 +831,9 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
822static void shmem_evict_inode(struct inode *inode) 831static void shmem_evict_inode(struct inode *inode)
823{ 832{
824 struct shmem_inode_info *info = SHMEM_I(inode); 833 struct shmem_inode_info *info = SHMEM_I(inode);
834 struct shmem_xattr *xattr, *nxattr;
825 835
826 if (inode->i_mapping->a_ops == &shmem_aops) { 836 if (inode->i_mapping->a_ops == &shmem_aops) {
827 truncate_inode_pages(inode->i_mapping, 0);
828 shmem_unacct_size(info->flags, inode->i_size); 837 shmem_unacct_size(info->flags, inode->i_size);
829 inode->i_size = 0; 838 inode->i_size = 0;
830 shmem_truncate_range(inode, 0, (loff_t)-1); 839 shmem_truncate_range(inode, 0, (loff_t)-1);
@@ -834,6 +843,11 @@ static void shmem_evict_inode(struct inode *inode)
834 mutex_unlock(&shmem_swaplist_mutex); 843 mutex_unlock(&shmem_swaplist_mutex);
835 } 844 }
836 } 845 }
846
847 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
848 kfree(xattr->name);
849 kfree(xattr);
850 }
837 BUG_ON(inode->i_blocks); 851 BUG_ON(inode->i_blocks);
838 shmem_free_inode(inode->i_sb); 852 shmem_free_inode(inode->i_sb);
839 end_writeback(inode); 853 end_writeback(inode);
@@ -852,7 +866,7 @@ static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_
852 866
853static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) 867static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
854{ 868{
855 struct inode *inode; 869 struct address_space *mapping;
856 unsigned long idx; 870 unsigned long idx;
857 unsigned long size; 871 unsigned long size;
858 unsigned long limit; 872 unsigned long limit;
@@ -875,8 +889,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
875 if (size > SHMEM_NR_DIRECT) 889 if (size > SHMEM_NR_DIRECT)
876 size = SHMEM_NR_DIRECT; 890 size = SHMEM_NR_DIRECT;
877 offset = shmem_find_swp(entry, ptr, ptr+size); 891 offset = shmem_find_swp(entry, ptr, ptr+size);
878 if (offset >= 0) 892 if (offset >= 0) {
893 shmem_swp_balance_unmap();
879 goto found; 894 goto found;
895 }
880 if (!info->i_indirect) 896 if (!info->i_indirect)
881 goto lost2; 897 goto lost2;
882 898
@@ -917,6 +933,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
917 shmem_swp_unmap(ptr); 933 shmem_swp_unmap(ptr);
918 if (offset >= 0) { 934 if (offset >= 0) {
919 shmem_dir_unmap(dir); 935 shmem_dir_unmap(dir);
936 ptr = shmem_swp_map(subdir);
920 goto found; 937 goto found;
921 } 938 }
922 } 939 }
@@ -928,8 +945,7 @@ lost2:
928 return 0; 945 return 0;
929found: 946found:
930 idx += offset; 947 idx += offset;
931 inode = igrab(&info->vfs_inode); 948 ptr += offset;
932 spin_unlock(&info->lock);
933 949
934 /* 950 /*
935 * Move _head_ to start search for next from here. 951 * Move _head_ to start search for next from here.
@@ -940,37 +956,18 @@ found:
940 */ 956 */
941 if (shmem_swaplist.next != &info->swaplist) 957 if (shmem_swaplist.next != &info->swaplist)
942 list_move_tail(&shmem_swaplist, &info->swaplist); 958 list_move_tail(&shmem_swaplist, &info->swaplist);
943 mutex_unlock(&shmem_swaplist_mutex);
944 959
945 error = 1;
946 if (!inode)
947 goto out;
948 /* 960 /*
949 * Charge page using GFP_KERNEL while we can wait. 961 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
950 * Charged back to the user(not to caller) when swap account is used. 962 * but also to hold up shmem_evict_inode(): so inode cannot be freed
951 * add_to_page_cache() will be called with GFP_NOWAIT. 963 * beneath us (pagelock doesn't help until the page is in pagecache).
952 */ 964 */
953 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 965 mapping = info->vfs_inode.i_mapping;
954 if (error) 966 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
955 goto out; 967 /* which does mem_cgroup_uncharge_cache_page on error */
956 error = radix_tree_preload(GFP_KERNEL);
957 if (error) {
958 mem_cgroup_uncharge_cache_page(page);
959 goto out;
960 }
961 error = 1;
962
963 spin_lock(&info->lock);
964 ptr = shmem_swp_entry(info, idx, NULL);
965 if (ptr && ptr->val == entry.val) {
966 error = add_to_page_cache_locked(page, inode->i_mapping,
967 idx, GFP_NOWAIT);
968 /* does mem_cgroup_uncharge_cache_page on error */
969 } else /* we must compensate for our precharge above */
970 mem_cgroup_uncharge_cache_page(page);
971 968
972 if (error == -EEXIST) { 969 if (error == -EEXIST) {
973 struct page *filepage = find_get_page(inode->i_mapping, idx); 970 struct page *filepage = find_get_page(mapping, idx);
974 error = 1; 971 error = 1;
975 if (filepage) { 972 if (filepage) {
976 /* 973 /*
@@ -990,14 +987,8 @@ found:
990 swap_free(entry); 987 swap_free(entry);
991 error = 1; /* not an error, but entry was found */ 988 error = 1; /* not an error, but entry was found */
992 } 989 }
993 if (ptr) 990 shmem_swp_unmap(ptr);
994 shmem_swp_unmap(ptr);
995 spin_unlock(&info->lock); 991 spin_unlock(&info->lock);
996 radix_tree_preload_end();
997out:
998 unlock_page(page);
999 page_cache_release(page);
1000 iput(inode); /* allows for NULL */
1001 return error; 992 return error;
1002} 993}
1003 994
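The comment added in the hunk above states the new lifetime rule: shmem_unuse_inode() no longer pins the inode with igrab()/iput(), it relies on shmem_swaplist_mutex, which shmem_evict_inode() must also take, to keep the inode alive for the duration of the walk. The same pattern in a small user-space model with made-up names:

#include <pthread.h>
#include <stdlib.h>

struct obj {
        struct obj *next;
        int swapped;
};

static struct obj *list_head;
static pthread_mutex_t list_mutex = PTHREAD_MUTEX_INITIALIZER;

/* The "evict" side: must take the same mutex before freeing, so a walker
 * holding the mutex can never see a freed object. */
static void destroy(struct obj *victim)
{
        struct obj **p;

        pthread_mutex_lock(&list_mutex);
        for (p = &list_head; *p; p = &(*p)->next) {
                if (*p == victim) {
                        *p = victim->next;
                        break;
                }
        }
        pthread_mutex_unlock(&list_mutex);
        free(victim);
}

/* The "shmem_unuse" side: no reference taken, the mutex alone keeps every
 * object on the list alive while it is being visited. */
static int walk(int (*fn)(struct obj *))
{
        struct obj *o;
        int found = 0;

        pthread_mutex_lock(&list_mutex);
        for (o = list_head; o && !found; o = o->next)
                found = fn(o);
        pthread_mutex_unlock(&list_mutex);
        return found;
}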
@@ -1009,6 +1000,26 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1009 struct list_head *p, *next; 1000 struct list_head *p, *next;
1010 struct shmem_inode_info *info; 1001 struct shmem_inode_info *info;
1011 int found = 0; 1002 int found = 0;
1003 int error;
1004
1005 /*
1006 * Charge page using GFP_KERNEL while we can wait, before taking
1007 * the shmem_swaplist_mutex which might hold up shmem_writepage().
1008 * Charged back to the user (not to caller) when swap account is used.
1009 * add_to_page_cache() will be called with GFP_NOWAIT.
1010 */
1011 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
1012 if (error)
1013 goto out;
1014 /*
1015 * Try to preload while we can wait, to not make a habit of
1016 * draining atomic reserves; but don't latch on to this cpu,
1017 * it's okay if sometimes we get rescheduled after this.
1018 */
1019 error = radix_tree_preload(GFP_KERNEL);
1020 if (error)
1021 goto uncharge;
1022 radix_tree_preload_end();
1012 1023
1013 mutex_lock(&shmem_swaplist_mutex); 1024 mutex_lock(&shmem_swaplist_mutex);
1014 list_for_each_safe(p, next, &shmem_swaplist) { 1025 list_for_each_safe(p, next, &shmem_swaplist) {
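The block added to shmem_unuse() above performs the work that may sleep (the memcg charge and the radix-tree preload, both GFP_KERNEL) before shmem_swaplist_mutex is taken, leaving only the non-blocking GFP_NOWAIT insertion for the locked region. The general shape, modeled in user space with placeholder helpers:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

static pthread_mutex_t swaplist_mutex = PTHREAD_MUTEX_INITIALIZER;

/* May block and allocate: done up front, like the GFP_KERNEL charge/preload. */
static void *reserve(size_t size)
{
        return malloc(size);
}

/* Must not block: done under the lock, like add_to_page_cache(GFP_NOWAIT). */
static bool insert_nowait(void **table, int idx, void *slot)
{
        if (table[idx])
                return false;           /* already occupied */
        table[idx] = slot;
        return true;
}

static int add_entry(void **table, int idx)
{
        void *slot = reserve(64);       /* all blocking work before the lock */
        bool ok;

        if (!slot)
                return -1;

        pthread_mutex_lock(&swaplist_mutex);
        ok = insert_nowait(table, idx, slot);
        pthread_mutex_unlock(&swaplist_mutex);

        if (!ok)
                free(slot);             /* undo the reservation, like the uncharge */
        return ok ? 0 : -1;
}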
@@ -1016,17 +1027,19 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1016 found = shmem_unuse_inode(info, entry, page); 1027 found = shmem_unuse_inode(info, entry, page);
1017 cond_resched(); 1028 cond_resched();
1018 if (found) 1029 if (found)
1019 goto out; 1030 break;
1020 } 1031 }
1021 mutex_unlock(&shmem_swaplist_mutex); 1032 mutex_unlock(&shmem_swaplist_mutex);
1022 /* 1033
1023 * Can some race bring us here? We've been holding page lock, 1034uncharge:
1024 * so I think not; but would rather try again later than BUG() 1035 if (!found)
1025 */ 1036 mem_cgroup_uncharge_cache_page(page);
1037 if (found < 0)
1038 error = found;
1039out:
1026 unlock_page(page); 1040 unlock_page(page);
1027 page_cache_release(page); 1041 page_cache_release(page);
1028out: 1042 return error;
1029 return (found < 0) ? found : 0;
1030} 1043}
1031 1044
1032/* 1045/*
@@ -1064,7 +1077,25 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1064 else 1077 else
1065 swap.val = 0; 1078 swap.val = 0;
1066 1079
1080 /*
1081 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1082 * if it's not already there. Do it now because we cannot take
1083 * mutex while holding spinlock, and must do so before the page
1084 * is moved to swap cache, when its pagelock no longer protects
1085 * the inode from eviction. But don't unlock the mutex until
1086 * we've taken the spinlock, because shmem_unuse_inode() will
1087 * prune a !swapped inode from the swaplist under both locks.
1088 */
1089 if (swap.val) {
1090 mutex_lock(&shmem_swaplist_mutex);
1091 if (list_empty(&info->swaplist))
1092 list_add_tail(&info->swaplist, &shmem_swaplist);
1093 }
1094
1067 spin_lock(&info->lock); 1095 spin_lock(&info->lock);
1096 if (swap.val)
1097 mutex_unlock(&shmem_swaplist_mutex);
1098
1068 if (index >= info->next_index) { 1099 if (index >= info->next_index) {
1069 BUG_ON(!(info->flags & SHMEM_TRUNCATE)); 1100 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1070 goto unlock; 1101 goto unlock;
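The hunk above takes shmem_swaplist_mutex before info->lock and drops it only once the spinlock is held: a mutex cannot be acquired under a spinlock, and releasing it any earlier would let shmem_unuse_inode() prune a !swapped inode in the gap. The ordering, modeled with a pthread mutex and spinlock (illustrative only; pthread_spin_init() is assumed to have run once at start-up):

#include <pthread.h>

static pthread_mutex_t swaplist_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_spinlock_t info_lock;

static void writepage_path(int adding_to_swaplist)
{
        if (adding_to_swaplist)
                pthread_mutex_lock(&swaplist_mutex);    /* may sleep: take it first */

        pthread_spin_lock(&info_lock);                  /* no sleeping from here on */
        if (adding_to_swaplist)
                pthread_mutex_unlock(&swaplist_mutex);  /* unlocking never sleeps */

        /* ... work that must see a stable swaplist membership ... */

        pthread_spin_unlock(&info_lock);
}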
@@ -1081,25 +1112,13 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1081 shmem_recalc_inode(inode); 1112 shmem_recalc_inode(inode);
1082 1113
1083 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 1114 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1084 remove_from_page_cache(page); 1115 delete_from_page_cache(page);
1085 shmem_swp_set(info, entry, swap.val); 1116 shmem_swp_set(info, entry, swap.val);
1086 shmem_swp_unmap(entry); 1117 shmem_swp_unmap(entry);
1087 if (list_empty(&info->swaplist))
1088 inode = igrab(inode);
1089 else
1090 inode = NULL;
1091 spin_unlock(&info->lock);
1092 swap_shmem_alloc(swap); 1118 swap_shmem_alloc(swap);
1119 spin_unlock(&info->lock);
1093 BUG_ON(page_mapped(page)); 1120 BUG_ON(page_mapped(page));
1094 page_cache_release(page); /* pagecache ref */
1095 swap_writepage(page, wbc); 1121 swap_writepage(page, wbc);
1096 if (inode) {
1097 mutex_lock(&shmem_swaplist_mutex);
1098 /* move instead of add in case we're racing */
1099 list_move_tail(&info->swaplist, &shmem_swaplist);
1100 mutex_unlock(&shmem_swaplist_mutex);
1101 iput(inode);
1102 }
1103 return 0; 1122 return 0;
1104 } 1123 }
1105 1124
@@ -1287,12 +1306,10 @@ repeat:
1287 swappage = lookup_swap_cache(swap); 1306 swappage = lookup_swap_cache(swap);
1288 if (!swappage) { 1307 if (!swappage) {
1289 shmem_swp_unmap(entry); 1308 shmem_swp_unmap(entry);
1309 spin_unlock(&info->lock);
1290 /* here we actually do the io */ 1310 /* here we actually do the io */
1291 if (type && !(*type & VM_FAULT_MAJOR)) { 1311 if (type)
1292 __count_vm_event(PGMAJFAULT);
1293 *type |= VM_FAULT_MAJOR; 1312 *type |= VM_FAULT_MAJOR;
1294 }
1295 spin_unlock(&info->lock);
1296 swappage = shmem_swapin(swap, gfp, info, idx); 1313 swappage = shmem_swapin(swap, gfp, info, idx);
1297 if (!swappage) { 1314 if (!swappage) {
1298 spin_lock(&info->lock); 1315 spin_lock(&info->lock);
@@ -1399,21 +1416,16 @@ repeat:
1399 shmem_swp_unmap(entry); 1416 shmem_swp_unmap(entry);
1400 sbinfo = SHMEM_SB(inode->i_sb); 1417 sbinfo = SHMEM_SB(inode->i_sb);
1401 if (sbinfo->max_blocks) { 1418 if (sbinfo->max_blocks) {
1402 if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) || 1419 if (percpu_counter_compare(&sbinfo->used_blocks,
1403 shmem_acct_block(info->flags)) { 1420 sbinfo->max_blocks) >= 0 ||
1404 spin_unlock(&info->lock); 1421 shmem_acct_block(info->flags))
1405 error = -ENOSPC; 1422 goto nospace;
1406 goto failed;
1407 }
1408 percpu_counter_inc(&sbinfo->used_blocks); 1423 percpu_counter_inc(&sbinfo->used_blocks);
1409 spin_lock(&inode->i_lock); 1424 spin_lock(&inode->i_lock);
1410 inode->i_blocks += BLOCKS_PER_PAGE; 1425 inode->i_blocks += BLOCKS_PER_PAGE;
1411 spin_unlock(&inode->i_lock); 1426 spin_unlock(&inode->i_lock);
1412 } else if (shmem_acct_block(info->flags)) { 1427 } else if (shmem_acct_block(info->flags))
1413 spin_unlock(&info->lock); 1428 goto nospace;
1414 error = -ENOSPC;
1415 goto failed;
1416 }
1417 1429
1418 if (!filepage) { 1430 if (!filepage) {
1419 int ret; 1431 int ret;
@@ -1493,6 +1505,24 @@ done:
1493 error = 0; 1505 error = 0;
1494 goto out; 1506 goto out;
1495 1507
1508nospace:
1509 /*
1510 * Perhaps the page was brought in from swap between find_lock_page
1511 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1512 * but must also avoid reporting a spurious ENOSPC while working on a
1513 * full tmpfs. (When filepage has been passed in to shmem_getpage, it
1514 * is already in page cache, which prevents this race from occurring.)
1515 */
1516 if (!filepage) {
1517 struct page *page = find_get_page(mapping, idx);
1518 if (page) {
1519 spin_unlock(&info->lock);
1520 page_cache_release(page);
1521 goto repeat;
1522 }
1523 }
1524 spin_unlock(&info->lock);
1525 error = -ENOSPC;
1496failed: 1526failed:
1497 if (*pagep != filepage) { 1527 if (*pagep != filepage) {
1498 unlock_page(filepage); 1528 unlock_page(filepage);
@@ -1518,7 +1548,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1518 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1548 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1519 if (error) 1549 if (error)
1520 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1550 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1521 1551 if (ret & VM_FAULT_MAJOR) {
1552 count_vm_event(PGMAJFAULT);
1553 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1554 }
1522 return ret | VM_FAULT_LOCKED; 1555 return ret | VM_FAULT_LOCKED;
1523} 1556}
1524 1557
@@ -1586,6 +1619,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1586 1619
1587 inode = new_inode(sb); 1620 inode = new_inode(sb);
1588 if (inode) { 1621 if (inode) {
1622 inode->i_ino = get_next_ino();
1589 inode_init_owner(inode, dir, mode); 1623 inode_init_owner(inode, dir, mode);
1590 inode->i_blocks = 0; 1624 inode->i_blocks = 0;
1591 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1625 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1596,6 +1630,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1596 spin_lock_init(&info->lock); 1630 spin_lock_init(&info->lock);
1597 info->flags = flags & VM_NORESERVE; 1631 info->flags = flags & VM_NORESERVE;
1598 INIT_LIST_HEAD(&info->swaplist); 1632 INIT_LIST_HEAD(&info->swaplist);
1633 INIT_LIST_HEAD(&info->xattr_list);
1599 cache_no_acl(inode); 1634 cache_no_acl(inode);
1600 1635
1601 switch (mode & S_IFMT) { 1636 switch (mode & S_IFMT) {
@@ -1842,8 +1877,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1842 1877
1843 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1878 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1844 if (inode) { 1879 if (inode) {
1845 error = security_inode_init_security(inode, dir, NULL, NULL, 1880 error = security_inode_init_security(inode, dir,
1846 NULL); 1881 &dentry->d_name, NULL,
1882 NULL, NULL);
1847 if (error) { 1883 if (error) {
1848 if (error != -EOPNOTSUPP) { 1884 if (error != -EOPNOTSUPP) {
1849 iput(inode); 1885 iput(inode);
@@ -1903,7 +1939,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1903 dir->i_size += BOGO_DIRENT_SIZE; 1939 dir->i_size += BOGO_DIRENT_SIZE;
1904 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1940 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1905 inc_nlink(inode); 1941 inc_nlink(inode);
1906 atomic_inc(&inode->i_count); /* New dentry reference */ 1942 ihold(inode); /* New dentry reference */
1907 dget(dentry); /* Extra pinning count for the created dentry */ 1943 dget(dentry); /* Extra pinning count for the created dentry */
1908 d_instantiate(dentry, inode); 1944 d_instantiate(dentry, inode);
1909out: 1945out:
@@ -1982,8 +2018,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1982 if (!inode) 2018 if (!inode)
1983 return -ENOSPC; 2019 return -ENOSPC;
1984 2020
1985 error = security_inode_init_security(inode, dir, NULL, NULL, 2021 error = security_inode_init_security(inode, dir, &dentry->d_name, NULL,
1986 NULL); 2022 NULL, NULL);
1987 if (error) { 2023 if (error) {
1988 if (error != -EOPNOTSUPP) { 2024 if (error != -EOPNOTSUPP) {
1989 iput(inode); 2025 iput(inode);
@@ -1994,9 +2030,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1994 2030
1995 info = SHMEM_I(inode); 2031 info = SHMEM_I(inode);
1996 inode->i_size = len-1; 2032 inode->i_size = len-1;
1997 if (len <= (char *)inode - (char *)info) { 2033 if (len <= SHMEM_SYMLINK_INLINE_LEN) {
1998 /* do it inline */ 2034 /* do it inline */
1999 memcpy(info, symname, len); 2035 memcpy(info->inline_symlink, symname, len);
2000 inode->i_op = &shmem_symlink_inline_operations; 2036 inode->i_op = &shmem_symlink_inline_operations;
2001 } else { 2037 } else {
2002 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 2038 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
@@ -2022,7 +2058,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2022 2058
2023static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 2059static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
2024{ 2060{
2025 nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode)); 2061 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink);
2026 return NULL; 2062 return NULL;
2027} 2063}
2028 2064
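With the change above, a short symlink target is copied into a dedicated info->inline_symlink buffer of SHMEM_SYMLINK_INLINE_LEN bytes rather than being laid over the tail of the inode, while longer targets still go to a page. The decision, sketched with stand-in types and a made-up inline size:

#include <stdlib.h>
#include <string.h>

#define SYMLINK_INLINE_LEN 64           /* stand-in for SHMEM_SYMLINK_INLINE_LEN */

struct link_info {
        char inline_symlink[SYMLINK_INLINE_LEN];
        char *out_of_line;              /* used when the target does not fit */
};

static int store_symlink(struct link_info *info, const char *target)
{
        size_t len = strlen(target) + 1;        /* include the trailing NUL */

        if (len <= SYMLINK_INLINE_LEN) {
                memcpy(info->inline_symlink, target, len);
                info->out_of_line = NULL;
                return 0;
        }
        info->out_of_line = malloc(len);
        if (!info->out_of_line)
                return -1;
        memcpy(info->out_of_line, target, len);
        return 0;
}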
@@ -2046,63 +2082,253 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
2046 } 2082 }
2047} 2083}
2048 2084
2049static const struct inode_operations shmem_symlink_inline_operations = { 2085#ifdef CONFIG_TMPFS_XATTR
2050 .readlink = generic_readlink,
2051 .follow_link = shmem_follow_link_inline,
2052};
2053
2054static const struct inode_operations shmem_symlink_inode_operations = {
2055 .readlink = generic_readlink,
2056 .follow_link = shmem_follow_link,
2057 .put_link = shmem_put_link,
2058};
2059
2060#ifdef CONFIG_TMPFS_POSIX_ACL
2061/* 2086/*
2062 * Superblocks without xattr inode operations will get security.* xattr 2087 * Superblocks without xattr inode operations may get some security.* xattr
2063 * support from the VFS "for free". As soon as we have any other xattrs 2088 * support from the LSM "for free". As soon as we have any other xattrs
2064 * like ACLs, we also need to implement the security.* handlers at 2089 * like ACLs, we also need to implement the security.* handlers at
2065 * filesystem level, though. 2090 * filesystem level, though.
2066 */ 2091 */
2067 2092
2068static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, 2093static int shmem_xattr_get(struct dentry *dentry, const char *name,
2069 size_t list_len, const char *name, 2094 void *buffer, size_t size)
2070 size_t name_len, int handler_flags)
2071{ 2095{
2072 return security_inode_listsecurity(dentry->d_inode, list, list_len); 2096 struct shmem_inode_info *info;
2073} 2097 struct shmem_xattr *xattr;
2098 int ret = -ENODATA;
2074 2099
2075static int shmem_xattr_security_get(struct dentry *dentry, const char *name, 2100 info = SHMEM_I(dentry->d_inode);
2076 void *buffer, size_t size, int handler_flags) 2101
2077{ 2102 spin_lock(&info->lock);
2078 if (strcmp(name, "") == 0) 2103 list_for_each_entry(xattr, &info->xattr_list, list) {
2079 return -EINVAL; 2104 if (strcmp(name, xattr->name))
2080 return xattr_getsecurity(dentry->d_inode, name, buffer, size); 2105 continue;
2106
2107 ret = xattr->size;
2108 if (buffer) {
2109 if (size < xattr->size)
2110 ret = -ERANGE;
2111 else
2112 memcpy(buffer, xattr->value, xattr->size);
2113 }
2114 break;
2115 }
2116 spin_unlock(&info->lock);
2117 return ret;
2081} 2118}
2082 2119
2083static int shmem_xattr_security_set(struct dentry *dentry, const char *name, 2120static int shmem_xattr_set(struct dentry *dentry, const char *name,
2084 const void *value, size_t size, int flags, int handler_flags) 2121 const void *value, size_t size, int flags)
2085{ 2122{
2086 if (strcmp(name, "") == 0) 2123 struct inode *inode = dentry->d_inode;
2087 return -EINVAL; 2124 struct shmem_inode_info *info = SHMEM_I(inode);
2088 return security_inode_setsecurity(dentry->d_inode, name, value, 2125 struct shmem_xattr *xattr;
2089 size, flags); 2126 struct shmem_xattr *new_xattr = NULL;
2127 size_t len;
2128 int err = 0;
2129
2130 /* value == NULL means remove */
2131 if (value) {
2132 /* wrap around? */
2133 len = sizeof(*new_xattr) + size;
2134 if (len <= sizeof(*new_xattr))
2135 return -ENOMEM;
2136
2137 new_xattr = kmalloc(len, GFP_KERNEL);
2138 if (!new_xattr)
2139 return -ENOMEM;
2140
2141 new_xattr->name = kstrdup(name, GFP_KERNEL);
2142 if (!new_xattr->name) {
2143 kfree(new_xattr);
2144 return -ENOMEM;
2145 }
2146
2147 new_xattr->size = size;
2148 memcpy(new_xattr->value, value, size);
2149 }
2150
2151 spin_lock(&info->lock);
2152 list_for_each_entry(xattr, &info->xattr_list, list) {
2153 if (!strcmp(name, xattr->name)) {
2154 if (flags & XATTR_CREATE) {
2155 xattr = new_xattr;
2156 err = -EEXIST;
2157 } else if (new_xattr) {
2158 list_replace(&xattr->list, &new_xattr->list);
2159 } else {
2160 list_del(&xattr->list);
2161 }
2162 goto out;
2163 }
2164 }
2165 if (flags & XATTR_REPLACE) {
2166 xattr = new_xattr;
2167 err = -ENODATA;
2168 } else {
2169 list_add(&new_xattr->list, &info->xattr_list);
2170 xattr = NULL;
2171 }
2172out:
2173 spin_unlock(&info->lock);
2174 if (xattr)
2175 kfree(xattr->name);
2176 kfree(xattr);
2177 return err;
2090} 2178}
2091 2179
2092static const struct xattr_handler shmem_xattr_security_handler = {
2093 .prefix = XATTR_SECURITY_PREFIX,
2094 .list = shmem_xattr_security_list,
2095 .get = shmem_xattr_security_get,
2096 .set = shmem_xattr_security_set,
2097};
2098 2180
2099static const struct xattr_handler *shmem_xattr_handlers[] = { 2181static const struct xattr_handler *shmem_xattr_handlers[] = {
2182#ifdef CONFIG_TMPFS_POSIX_ACL
2100 &generic_acl_access_handler, 2183 &generic_acl_access_handler,
2101 &generic_acl_default_handler, 2184 &generic_acl_default_handler,
2102 &shmem_xattr_security_handler, 2185#endif
2103 NULL 2186 NULL
2104}; 2187};
2188
2189static int shmem_xattr_validate(const char *name)
2190{
2191 struct { const char *prefix; size_t len; } arr[] = {
2192 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
2193 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
2194 };
2195 int i;
2196
2197 for (i = 0; i < ARRAY_SIZE(arr); i++) {
2198 size_t preflen = arr[i].len;
2199 if (strncmp(name, arr[i].prefix, preflen) == 0) {
2200 if (!name[preflen])
2201 return -EINVAL;
2202 return 0;
2203 }
2204 }
2205 return -EOPNOTSUPP;
2206}
2207
2208static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2209 void *buffer, size_t size)
2210{
2211 int err;
2212
2213 /*
2214 * If this is a request for a synthetic attribute in the system.*
2215 * namespace use the generic infrastructure to resolve a handler
2216 * for it via sb->s_xattr.
2217 */
2218 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2219 return generic_getxattr(dentry, name, buffer, size);
2220
2221 err = shmem_xattr_validate(name);
2222 if (err)
2223 return err;
2224
2225 return shmem_xattr_get(dentry, name, buffer, size);
2226}
2227
2228static int shmem_setxattr(struct dentry *dentry, const char *name,
2229 const void *value, size_t size, int flags)
2230{
2231 int err;
2232
2233 /*
2234 * If this is a request for a synthetic attribute in the system.*
2235 * namespace use the generic infrastructure to resolve a handler
2236 * for it via sb->s_xattr.
2237 */
2238 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2239 return generic_setxattr(dentry, name, value, size, flags);
2240
2241 err = shmem_xattr_validate(name);
2242 if (err)
2243 return err;
2244
2245 if (size == 0)
2246 value = ""; /* empty EA, do not remove */
2247
2248 return shmem_xattr_set(dentry, name, value, size, flags);
2249
2250}
2251
2252static int shmem_removexattr(struct dentry *dentry, const char *name)
2253{
2254 int err;
2255
2256 /*
2257 * If this is a request for a synthetic attribute in the system.*
2258 * namespace use the generic infrastructure to resolve a handler
2259 * for it via sb->s_xattr.
2260 */
2261 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2262 return generic_removexattr(dentry, name);
2263
2264 err = shmem_xattr_validate(name);
2265 if (err)
2266 return err;
2267
2268 return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE);
2269}
2270
2271static bool xattr_is_trusted(const char *name)
2272{
2273 return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
2274}
2275
2276static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2277{
2278 bool trusted = capable(CAP_SYS_ADMIN);
2279 struct shmem_xattr *xattr;
2280 struct shmem_inode_info *info;
2281 size_t used = 0;
2282
2283 info = SHMEM_I(dentry->d_inode);
2284
2285 spin_lock(&info->lock);
2286 list_for_each_entry(xattr, &info->xattr_list, list) {
2287 size_t len;
2288
2289 /* skip "trusted." attributes for unprivileged callers */
2290 if (!trusted && xattr_is_trusted(xattr->name))
2291 continue;
2292
2293 len = strlen(xattr->name) + 1;
2294 used += len;
2295 if (buffer) {
2296 if (size < used) {
2297 used = -ERANGE;
2298 break;
2299 }
2300 memcpy(buffer, xattr->name, len);
2301 buffer += len;
2302 }
2303 }
2304 spin_unlock(&info->lock);
2305
2306 return used;
2307}
2308#endif /* CONFIG_TMPFS_XATTR */
2309
2310static const struct inode_operations shmem_symlink_inline_operations = {
2311 .readlink = generic_readlink,
2312 .follow_link = shmem_follow_link_inline,
2313#ifdef CONFIG_TMPFS_XATTR
2314 .setxattr = shmem_setxattr,
2315 .getxattr = shmem_getxattr,
2316 .listxattr = shmem_listxattr,
2317 .removexattr = shmem_removexattr,
2318#endif
2319};
2320
2321static const struct inode_operations shmem_symlink_inode_operations = {
2322 .readlink = generic_readlink,
2323 .follow_link = shmem_follow_link,
2324 .put_link = shmem_put_link,
2325#ifdef CONFIG_TMPFS_XATTR
2326 .setxattr = shmem_setxattr,
2327 .getxattr = shmem_getxattr,
2328 .listxattr = shmem_listxattr,
2329 .removexattr = shmem_removexattr,
2105#endif 2330#endif
2331};
2106 2332
2107static struct dentry *shmem_get_parent(struct dentry *child) 2333static struct dentry *shmem_get_parent(struct dentry *child)
2108{ 2334{
@@ -2143,10 +2369,12 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2143{ 2369{
2144 struct inode *inode = dentry->d_inode; 2370 struct inode *inode = dentry->d_inode;
2145 2371
2146 if (*len < 3) 2372 if (*len < 3) {
2373 *len = 3;
2147 return 255; 2374 return 255;
2375 }
2148 2376
2149 if (hlist_unhashed(&inode->i_hash)) { 2377 if (inode_unhashed(inode)) {
2150 /* Unfortunately insert_inode_hash is not idempotent, 2378 /* Unfortunately insert_inode_hash is not idempotent,
2151 * so as we hash inodes here rather than at creation 2379 * so as we hash inodes here rather than at creation
2152 * time, we need a lock to ensure we only try 2380 * time, we need a lock to ensure we only try
@@ -2154,7 +2382,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2154 */ 2382 */
2155 static DEFINE_SPINLOCK(lock); 2383 static DEFINE_SPINLOCK(lock);
2156 spin_lock(&lock); 2384 spin_lock(&lock);
2157 if (hlist_unhashed(&inode->i_hash)) 2385 if (inode_unhashed(inode))
2158 __insert_inode_hash(inode, 2386 __insert_inode_hash(inode,
2159 inode->i_ino + inode->i_generation); 2387 inode->i_ino + inode->i_generation);
2160 spin_unlock(&lock); 2388 spin_unlock(&lock);
@@ -2380,8 +2608,10 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2380 sb->s_magic = TMPFS_MAGIC; 2608 sb->s_magic = TMPFS_MAGIC;
2381 sb->s_op = &shmem_ops; 2609 sb->s_op = &shmem_ops;
2382 sb->s_time_gran = 1; 2610 sb->s_time_gran = 1;
2383#ifdef CONFIG_TMPFS_POSIX_ACL 2611#ifdef CONFIG_TMPFS_XATTR
2384 sb->s_xattr = shmem_xattr_handlers; 2612 sb->s_xattr = shmem_xattr_handlers;
2613#endif
2614#ifdef CONFIG_TMPFS_POSIX_ACL
2385 sb->s_flags |= MS_POSIXACL; 2615 sb->s_flags |= MS_POSIXACL;
2386#endif 2616#endif
2387 2617
@@ -2414,13 +2644,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
2414 return &p->vfs_inode; 2644 return &p->vfs_inode;
2415} 2645}
2416 2646
2647static void shmem_i_callback(struct rcu_head *head)
2648{
2649 struct inode *inode = container_of(head, struct inode, i_rcu);
2650 INIT_LIST_HEAD(&inode->i_dentry);
2651 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2652}
2653
2417static void shmem_destroy_inode(struct inode *inode) 2654static void shmem_destroy_inode(struct inode *inode)
2418{ 2655{
2419 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2656 if ((inode->i_mode & S_IFMT) == S_IFREG) {
2420 /* only struct inode is valid if it's an inline symlink */ 2657 /* only struct inode is valid if it's an inline symlink */
2421 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2658 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2422 } 2659 }
2423 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2660 call_rcu(&inode->i_rcu, shmem_i_callback);
2424} 2661}
2425 2662
2426static void init_once(void *foo) 2663static void init_once(void *foo)
@@ -2470,13 +2707,15 @@ static const struct file_operations shmem_file_operations = {
2470}; 2707};
2471 2708
2472static const struct inode_operations shmem_inode_operations = { 2709static const struct inode_operations shmem_inode_operations = {
2473 .setattr = shmem_notify_change, 2710 .setattr = shmem_setattr,
2474 .truncate_range = shmem_truncate_range, 2711 .truncate_range = shmem_truncate_range,
2712#ifdef CONFIG_TMPFS_XATTR
2713 .setxattr = shmem_setxattr,
2714 .getxattr = shmem_getxattr,
2715 .listxattr = shmem_listxattr,
2716 .removexattr = shmem_removexattr,
2717#endif
2475#ifdef CONFIG_TMPFS_POSIX_ACL 2718#ifdef CONFIG_TMPFS_POSIX_ACL
2476 .setxattr = generic_setxattr,
2477 .getxattr = generic_getxattr,
2478 .listxattr = generic_listxattr,
2479 .removexattr = generic_removexattr,
2480 .check_acl = generic_check_acl, 2719 .check_acl = generic_check_acl,
2481#endif 2720#endif
2482 2721
@@ -2494,23 +2733,27 @@ static const struct inode_operations shmem_dir_inode_operations = {
2494 .mknod = shmem_mknod, 2733 .mknod = shmem_mknod,
2495 .rename = shmem_rename, 2734 .rename = shmem_rename,
2496#endif 2735#endif
2736#ifdef CONFIG_TMPFS_XATTR
2737 .setxattr = shmem_setxattr,
2738 .getxattr = shmem_getxattr,
2739 .listxattr = shmem_listxattr,
2740 .removexattr = shmem_removexattr,
2741#endif
2497#ifdef CONFIG_TMPFS_POSIX_ACL 2742#ifdef CONFIG_TMPFS_POSIX_ACL
2498 .setattr = shmem_notify_change, 2743 .setattr = shmem_setattr,
2499 .setxattr = generic_setxattr,
2500 .getxattr = generic_getxattr,
2501 .listxattr = generic_listxattr,
2502 .removexattr = generic_removexattr,
2503 .check_acl = generic_check_acl, 2744 .check_acl = generic_check_acl,
2504#endif 2745#endif
2505}; 2746};
2506 2747
2507static const struct inode_operations shmem_special_inode_operations = { 2748static const struct inode_operations shmem_special_inode_operations = {
2749#ifdef CONFIG_TMPFS_XATTR
2750 .setxattr = shmem_setxattr,
2751 .getxattr = shmem_getxattr,
2752 .listxattr = shmem_listxattr,
2753 .removexattr = shmem_removexattr,
2754#endif
2508#ifdef CONFIG_TMPFS_POSIX_ACL 2755#ifdef CONFIG_TMPFS_POSIX_ACL
2509 .setattr = shmem_notify_change, 2756 .setattr = shmem_setattr,
2510 .setxattr = generic_setxattr,
2511 .getxattr = generic_getxattr,
2512 .listxattr = generic_listxattr,
2513 .removexattr = generic_removexattr,
2514 .check_acl = generic_check_acl, 2757 .check_acl = generic_check_acl,
2515#endif 2758#endif
2516}; 2759};
@@ -2537,16 +2780,16 @@ static const struct vm_operations_struct shmem_vm_ops = {
2537}; 2780};
2538 2781
2539 2782
2540static int shmem_get_sb(struct file_system_type *fs_type, 2783static struct dentry *shmem_mount(struct file_system_type *fs_type,
2541 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 2784 int flags, const char *dev_name, void *data)
2542{ 2785{
2543 return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); 2786 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2544} 2787}
2545 2788
2546static struct file_system_type tmpfs_fs_type = { 2789static struct file_system_type tmpfs_fs_type = {
2547 .owner = THIS_MODULE, 2790 .owner = THIS_MODULE,
2548 .name = "tmpfs", 2791 .name = "tmpfs",
2549 .get_sb = shmem_get_sb, 2792 .mount = shmem_mount,
2550 .kill_sb = kill_litter_super, 2793 .kill_sb = kill_litter_super,
2551}; 2794};
2552 2795
@@ -2642,7 +2885,7 @@ out:
2642 2885
2643static struct file_system_type tmpfs_fs_type = { 2886static struct file_system_type tmpfs_fs_type = {
2644 .name = "tmpfs", 2887 .name = "tmpfs",
2645 .get_sb = ramfs_get_sb, 2888 .mount = ramfs_mount,
2646 .kill_sb = kill_litter_super, 2889 .kill_sb = kill_litter_super,
2647}; 2890};
2648 2891
@@ -2666,6 +2909,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2666 return 0; 2909 return 0;
2667} 2910}
2668 2911
2912void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
2913{
2914 truncate_inode_pages_range(inode->i_mapping, start, end);
2915}
2916EXPORT_SYMBOL_GPL(shmem_truncate_range);
2917
2669#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2918#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2670/** 2919/**
2671 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file 2920 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
@@ -2783,5 +3032,29 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2783 fput(vma->vm_file); 3032 fput(vma->vm_file);
2784 vma->vm_file = file; 3033 vma->vm_file = file;
2785 vma->vm_ops = &shmem_vm_ops; 3034 vma->vm_ops = &shmem_vm_ops;
3035 vma->vm_flags |= VM_CAN_NONLINEAR;
2786 return 0; 3036 return 0;
2787} 3037}
3038
3039/**
3040 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
3041 * @mapping: the page's address_space
3042 * @index: the page index
3043 * @gfp: the page allocator flags to use if allocating
3044 *
3045 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
3046 * with any new page allocations done using the specified allocation flags.
3047 * But read_cache_page_gfp() uses the ->readpage() method: which does not
3048 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3049 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3050 *
3051 * Provide a stub for those callers to start using now, then later
3052 * flesh it out to call shmem_getpage() with additional gfp mask, when
3053 * shmem_file_splice_read() is added and shmem_readpage() is removed.
3054 */
3055struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3056 pgoff_t index, gfp_t gfp)
3057{
3058 return read_cache_page_gfp(mapping, index, gfp);
3059}
3060EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
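The CONFIG_TMPFS_XATTR code added above keeps extended attributes on a per-inode in-memory list of struct shmem_xattr entries guarded by info->lock, accepts only security.* and trusted.* names directly (system.* is routed to the generic handlers), and hides trusted.* from unprivileged listxattr callers. A user-space model of the list-backed set path, with the locking and the XATTR_CREATE/XATTR_REPLACE flag handling left out and made-up names throughout:

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct xattr {
        struct xattr *next;
        char *name;
        size_t size;
        char value[];                   /* flexible array, like value[0] above */
};

static struct xattr *xattr_list;        /* per inode in the kernel code */

static int xattr_set(const char *name, const void *value, size_t size)
{
        struct xattr **p, *old, *new = NULL;

        if (value) {                    /* value == NULL means remove */
                new = malloc(sizeof(*new) + size);
                if (!new)
                        return -ENOMEM;
                new->name = strdup(name);
                if (!new->name) {
                        free(new);
                        return -ENOMEM;
                }
                new->size = size;
                memcpy(new->value, value, size);
        }

        for (p = &xattr_list; (old = *p) != NULL; p = &old->next) {
                if (strcmp(old->name, name))
                        continue;
                if (new) {              /* replace the existing entry */
                        new->next = old->next;
                        *p = new;
                } else {                /* remove it */
                        *p = old->next;
                }
                free(old->name);
                free(old);
                return 0;
        }

        if (!new)
                return -ENODATA;        /* removing an attribute that is not there */
        new->next = xattr_list;         /* brand-new attribute: add to the list */
        xattr_list = new;
        return 0;
}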
diff --git a/mm/slab.c b/mm/slab.c
index fcae9815d3b3..d96e223de775 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
115#include <linux/debugobjects.h> 115#include <linux/debugobjects.h>
116#include <linux/kmemcheck.h> 116#include <linux/kmemcheck.h>
117#include <linux/memory.h> 117#include <linux/memory.h>
118#include <linux/prefetch.h>
118 119
119#include <asm/cacheflush.h> 120#include <asm/cacheflush.h>
120#include <asm/tlbflush.h> 121#include <asm/tlbflush.h>
@@ -191,22 +192,6 @@ typedef unsigned int kmem_bufctl_t;
191#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) 192#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
192 193
193/* 194/*
194 * struct slab
195 *
196 * Manages the objs in a slab. Placed either at the beginning of mem allocated
197 * for a slab, or allocated from an general cache.
198 * Slabs are chained into three list: fully used, partial, fully free slabs.
199 */
200struct slab {
201 struct list_head list;
202 unsigned long colouroff;
203 void *s_mem; /* including colour offset */
204 unsigned int inuse; /* num of objs active in slab */
205 kmem_bufctl_t free;
206 unsigned short nodeid;
207};
208
209/*
210 * struct slab_rcu 195 * struct slab_rcu
211 * 196 *
212 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to 197 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
@@ -219,8 +204,6 @@ struct slab {
219 * 204 *
220 * rcu_read_lock before reading the address, then rcu_read_unlock after 205 * rcu_read_lock before reading the address, then rcu_read_unlock after
221 * taking the spinlock within the structure expected at that address. 206 * taking the spinlock within the structure expected at that address.
222 *
223 * We assume struct slab_rcu can overlay struct slab when destroying.
224 */ 207 */
225struct slab_rcu { 208struct slab_rcu {
226 struct rcu_head head; 209 struct rcu_head head;
@@ -229,6 +212,27 @@ struct slab_rcu {
229}; 212};
230 213
231/* 214/*
215 * struct slab
216 *
217 * Manages the objs in a slab. Placed either at the beginning of mem allocated
218 * for a slab, or allocated from an general cache.
219 * Slabs are chained into three list: fully used, partial, fully free slabs.
220 */
221struct slab {
222 union {
223 struct {
224 struct list_head list;
225 unsigned long colouroff;
226 void *s_mem; /* including colour offset */
227 unsigned int inuse; /* num of objs active in slab */
228 kmem_bufctl_t free;
229 unsigned short nodeid;
230 };
231 struct slab_rcu __slab_cover_slab_rcu;
232 };
233};
234
235/*
232 * struct array_cache 236 * struct array_cache
233 * 237 *
234 * Purpose: 238 * Purpose:
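Moving struct slab below struct slab_rcu and wrapping its fields in an anonymous union (above) replaces the old "we assume struct slab_rcu can overlay struct slab" comment with something the compiler enforces. The same trick in miniature, with simplified field types:

struct rcu_variant {
        void (*func)(struct rcu_variant *head);
        void *cachep;
        void *addr;
};

struct descriptor {
        union {
                struct {                        /* the normal descriptor fields */
                        struct descriptor *next;
                        unsigned long colouroff;
                        void *s_mem;
                        unsigned int inuse;
                };
                struct rcu_variant rcu;         /* reuses the same storage for RCU freeing */
        };
};

/* The union guarantees the RCU variant fits; no size assumption left to document. */
_Static_assert(sizeof(struct descriptor) >= sizeof(struct rcu_variant),
               "rcu variant must fit inside the descriptor");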
@@ -284,7 +288,7 @@ struct kmem_list3 {
284 * Need this for bootstrapping a per node allocator. 288 * Need this for bootstrapping a per node allocator.
285 */ 289 */
286#define NUM_INIT_LISTS (3 * MAX_NUMNODES) 290#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
287struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; 291static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
288#define CACHE_CACHE 0 292#define CACHE_CACHE 0
289#define SIZE_AC MAX_NUMNODES 293#define SIZE_AC MAX_NUMNODES
290#define SIZE_L3 (2 * MAX_NUMNODES) 294#define SIZE_L3 (2 * MAX_NUMNODES)
@@ -829,12 +833,12 @@ static void init_reap_node(int cpu)
829 833
830static void next_reap_node(void) 834static void next_reap_node(void)
831{ 835{
832 int node = __get_cpu_var(slab_reap_node); 836 int node = __this_cpu_read(slab_reap_node);
833 837
834 node = next_node(node, node_online_map); 838 node = next_node(node, node_online_map);
835 if (unlikely(node >= MAX_NUMNODES)) 839 if (unlikely(node >= MAX_NUMNODES))
836 node = first_node(node_online_map); 840 node = first_node(node_online_map);
837 __get_cpu_var(slab_reap_node) = node; 841 __this_cpu_write(slab_reap_node, node);
838} 842}
839 843
840#else 844#else
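The __get_cpu_var() read and write above become __this_cpu_read()/__this_cpu_write(), which operate on the per-CPU slot directly instead of first forming its address. A rough user-space analogue using thread-local storage (the wrap-around helper stands in for next_node()/first_node()):

static __thread int reap_node;                  /* stands in for the per-CPU variable */

static int next_node_wrapping(int node, int nr_nodes)
{
        return (node + 1) % nr_nodes;
}

static void next_reap_node(int nr_nodes)
{
        int node = reap_node;                           /* like __this_cpu_read()  */

        reap_node = next_node_wrapping(node, nr_nodes); /* like __this_cpu_write() */
}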
@@ -875,7 +879,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
875 nc = kmalloc_node(memsize, gfp, node); 879 nc = kmalloc_node(memsize, gfp, node);
876 /* 880 /*
877 * The array_cache structures contain pointers to free object. 881 * The array_cache structures contain pointers to free object.
878 * However, when such objects are allocated or transfered to another 882 * However, when such objects are allocated or transferred to another
879 * cache the pointers are not cleared and they could be counted as 883 * cache the pointers are not cleared and they could be counted as
880 * valid references during a kmemleak scan. Therefore, kmemleak must 884 * valid references during a kmemleak scan. Therefore, kmemleak must
881 * not scan such objects. 885 * not scan such objects.
@@ -901,7 +905,7 @@ static int transfer_objects(struct array_cache *to,
901 struct array_cache *from, unsigned int max) 905 struct array_cache *from, unsigned int max)
902{ 906{
903 /* Figure out how many entries to transfer */ 907 /* Figure out how many entries to transfer */
904 int nr = min(min(from->avail, max), to->limit - to->avail); 908 int nr = min3(from->avail, max, to->limit - to->avail);
905 909
906 if (!nr) 910 if (!nr)
907 return 0; 911 return 0;
@@ -1012,7 +1016,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1012 */ 1016 */
1013static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1017static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1014{ 1018{
1015 int node = __get_cpu_var(slab_reap_node); 1019 int node = __this_cpu_read(slab_reap_node);
1016 1020
1017 if (l3->alien) { 1021 if (l3->alien) {
1018 struct array_cache *ac = l3->alien[node]; 1022 struct array_cache *ac = l3->alien[node];
@@ -1293,7 +1297,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1293 * anything expensive but will only modify reap_work 1297 * anything expensive but will only modify reap_work
1294 * and reschedule the timer. 1298 * and reschedule the timer.
1295 */ 1299 */
1296 cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); 1300 cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1297 /* Now the cache_reaper is guaranteed to be not running. */ 1301 /* Now the cache_reaper is guaranteed to be not running. */
1298 per_cpu(slab_reap_work, cpu).work.func = NULL; 1302 per_cpu(slab_reap_work, cpu).work.func = NULL;
1299 break; 1303 break;
@@ -1387,7 +1391,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
1387 break; 1391 break;
1388 } 1392 }
1389out: 1393out:
1390 return ret ? notifier_from_errno(ret) : NOTIFY_OK; 1394 return notifier_from_errno(ret);
1391} 1395}
1392#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ 1396#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1393 1397
@@ -2147,8 +2151,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2147 * 2151 *
2148 * @name must be valid until the cache is destroyed. This implies that 2152 * @name must be valid until the cache is destroyed. This implies that
2149 * the module calling this has to destroy the cache before getting unloaded. 2153 * the module calling this has to destroy the cache before getting unloaded.
2150 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
2151 * therefore applications must manage it themselves.
2152 * 2154 *
2153 * The flags are 2155 * The flags are
2154 * 2156 *
@@ -2288,8 +2290,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2288 if (ralign < align) { 2290 if (ralign < align) {
2289 ralign = align; 2291 ralign = align;
2290 } 2292 }
2291 /* disable debug if not aligning with REDZONE_ALIGN */ 2293 /* disable debug if necessary */
2292 if (ralign & (__alignof__(unsigned long long) - 1)) 2294 if (ralign > __alignof__(unsigned long long))
2293 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2295 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2294 /* 2296 /*
2295 * 4) Store it. 2297 * 4) Store it.
@@ -2315,8 +2317,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2315 */ 2317 */
2316 if (flags & SLAB_RED_ZONE) { 2318 if (flags & SLAB_RED_ZONE) {
2317 /* add space for red zone words */ 2319 /* add space for red zone words */
2318 cachep->obj_offset += align; 2320 cachep->obj_offset += sizeof(unsigned long long);
2319 size += align + sizeof(unsigned long long); 2321 size += 2 * sizeof(unsigned long long);
2320 } 2322 }
2321 if (flags & SLAB_STORE_USER) { 2323 if (flags & SLAB_STORE_USER) {
2322 /* user store requires one word storage behind the end of 2324 /* user store requires one word storage behind the end of
@@ -2605,7 +2607,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2605 * 2607 *
2606 * The cache must be empty before calling this function. 2608 * The cache must be empty before calling this function.
2607 * 2609 *
2608 * The caller must guarantee that noone will allocate memory from the cache 2610 * The caller must guarantee that no one will allocate memory from the cache
2609 * during the kmem_cache_destroy(). 2611 * during the kmem_cache_destroy().
2610 */ 2612 */
2611void kmem_cache_destroy(struct kmem_cache *cachep) 2613void kmem_cache_destroy(struct kmem_cache *cachep)
@@ -2781,7 +2783,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2781/* 2783/*
2782 * Map pages beginning at addr to the given cache and slab. This is required 2784 * Map pages beginning at addr to the given cache and slab. This is required
2783 * for the slab allocator to be able to lookup the cache and slab of a 2785 * for the slab allocator to be able to lookup the cache and slab of a
2784 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. 2786 * virtual address for kfree, ksize, and slab debugging.
2785 */ 2787 */
2786static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, 2788static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2787 void *addr) 2789 void *addr)
@@ -3602,13 +3604,14 @@ free_done:
3602 * Release an obj back to its cache. If the obj has a constructed state, it must 3604 * Release an obj back to its cache. If the obj has a constructed state, it must
3603 * be in this state _before_ it is released. Called with disabled ints. 3605 * be in this state _before_ it is released. Called with disabled ints.
3604 */ 3606 */
3605static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3607static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3608 void *caller)
3606{ 3609{
3607 struct array_cache *ac = cpu_cache_get(cachep); 3610 struct array_cache *ac = cpu_cache_get(cachep);
3608 3611
3609 check_irq_off(); 3612 check_irq_off();
3610 kmemleak_free_recursive(objp, cachep->flags); 3613 kmemleak_free_recursive(objp, cachep->flags);
3611 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3614 objp = cache_free_debugcheck(cachep, objp, caller);
3612 3615
3613 kmemcheck_slab_free(cachep, objp, obj_size(cachep)); 3616 kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3614 3617
@@ -3653,42 +3656,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3653EXPORT_SYMBOL(kmem_cache_alloc); 3656EXPORT_SYMBOL(kmem_cache_alloc);
3654 3657
3655#ifdef CONFIG_TRACING 3658#ifdef CONFIG_TRACING
3656void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) 3659void *
3660kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
3657{ 3661{
3658 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3662 void *ret;
3659}
3660EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3661#endif
3662 3663
3663/** 3664 ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3664 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
3665 * @cachep: the cache we're checking against
3666 * @ptr: pointer to validate
3667 *
3668 * This verifies that the untrusted pointer looks sane;
3669 * it is _not_ a guarantee that the pointer is actually
3670 * part of the slab cache in question, but it at least
3671 * validates that the pointer can be dereferenced and
3672 * looks half-way sane.
3673 *
3674 * Currently only used for dentry validation.
3675 */
3676int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3677{
3678 unsigned long size = cachep->buffer_size;
3679 struct page *page;
3680 3665
3681 if (unlikely(!kern_ptr_validate(ptr, size))) 3666 trace_kmalloc(_RET_IP_, ret,
3682 goto out; 3667 size, slab_buffer_size(cachep), flags);
3683 page = virt_to_page(ptr); 3668 return ret;
3684 if (unlikely(!PageSlab(page)))
3685 goto out;
3686 if (unlikely(page_get_cache(page) != cachep))
3687 goto out;
3688 return 1;
3689out:
3690 return 0;
3691} 3669}
3670EXPORT_SYMBOL(kmem_cache_alloc_trace);
3671#endif
3692 3672
3693#ifdef CONFIG_NUMA 3673#ifdef CONFIG_NUMA
3694void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3674void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
@@ -3705,31 +3685,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3705EXPORT_SYMBOL(kmem_cache_alloc_node); 3685EXPORT_SYMBOL(kmem_cache_alloc_node);
3706 3686
3707#ifdef CONFIG_TRACING 3687#ifdef CONFIG_TRACING
3708void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, 3688void *kmem_cache_alloc_node_trace(size_t size,
3709 gfp_t flags, 3689 struct kmem_cache *cachep,
3710 int nodeid) 3690 gfp_t flags,
3691 int nodeid)
3711{ 3692{
3712 return __cache_alloc_node(cachep, flags, nodeid, 3693 void *ret;
3694
3695 ret = __cache_alloc_node(cachep, flags, nodeid,
3713 __builtin_return_address(0)); 3696 __builtin_return_address(0));
3697 trace_kmalloc_node(_RET_IP_, ret,
3698 size, slab_buffer_size(cachep),
3699 flags, nodeid);
3700 return ret;
3714} 3701}
3715EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 3702EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3716#endif 3703#endif
3717 3704
3718static __always_inline void * 3705static __always_inline void *
3719__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3706__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3720{ 3707{
3721 struct kmem_cache *cachep; 3708 struct kmem_cache *cachep;
3722 void *ret;
3723 3709
3724 cachep = kmem_find_general_cachep(size, flags); 3710 cachep = kmem_find_general_cachep(size, flags);
3725 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3711 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3726 return cachep; 3712 return cachep;
3727 ret = kmem_cache_alloc_node_notrace(cachep, flags, node); 3713 return kmem_cache_alloc_node_trace(size, cachep, flags, node);
3728
3729 trace_kmalloc_node((unsigned long) caller, ret,
3730 size, cachep->buffer_size, flags, node);
3731
3732 return ret;
3733} 3714}
3734 3715
3735#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3716#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
@@ -3821,7 +3802,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3821 debug_check_no_locks_freed(objp, obj_size(cachep)); 3802 debug_check_no_locks_freed(objp, obj_size(cachep));
3822 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3803 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3823 debug_check_no_obj_freed(objp, obj_size(cachep)); 3804 debug_check_no_obj_freed(objp, obj_size(cachep));
3824 __cache_free(cachep, objp); 3805 __cache_free(cachep, objp, __builtin_return_address(0));
3825 local_irq_restore(flags); 3806 local_irq_restore(flags);
3826 3807
3827 trace_kmem_cache_free(_RET_IP_, objp); 3808 trace_kmem_cache_free(_RET_IP_, objp);
@@ -3851,7 +3832,7 @@ void kfree(const void *objp)
3851 c = virt_to_cache(objp); 3832 c = virt_to_cache(objp);
3852 debug_check_no_locks_freed(objp, obj_size(c)); 3833 debug_check_no_locks_freed(objp, obj_size(c));
3853 debug_check_no_obj_freed(objp, obj_size(c)); 3834 debug_check_no_obj_freed(objp, obj_size(c));
3854 __cache_free(c, (void *)objp); 3835 __cache_free(c, (void *)objp, __builtin_return_address(0));
3855 local_irq_restore(flags); 3836 local_irq_restore(flags);
3856} 3837}
3857EXPORT_SYMBOL(kfree); 3838EXPORT_SYMBOL(kfree);
@@ -3862,12 +3843,6 @@ unsigned int kmem_cache_size(struct kmem_cache *cachep)
3862} 3843}
3863EXPORT_SYMBOL(kmem_cache_size); 3844EXPORT_SYMBOL(kmem_cache_size);
3864 3845
3865const char *kmem_cache_name(struct kmem_cache *cachep)
3866{
3867 return cachep->name;
3868}
3869EXPORT_SYMBOL_GPL(kmem_cache_name);
3870
3871/* 3846/*
3872 * This initializes kmem_list3 or resizes various caches for all nodes. 3847 * This initializes kmem_list3 or resizes various caches for all nodes.
3873 */ 3848 */
@@ -4075,7 +4050,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4075 * necessary. Note that the l3 listlock also protects the array_cache 4050 * necessary. Note that the l3 listlock also protects the array_cache
4076 * if drain_array() is used on the shared array. 4051 * if drain_array() is used on the shared array.
4077 */ 4052 */
4078void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 4053static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4079 struct array_cache *ac, int force, int node) 4054 struct array_cache *ac, int force, int node)
4080{ 4055{
4081 int tofree; 4056 int tofree;
@@ -4339,7 +4314,7 @@ static const struct seq_operations slabinfo_op = {
4339 * @count: data length 4314 * @count: data length
4340 * @ppos: unused 4315 * @ppos: unused
4341 */ 4316 */
4342ssize_t slabinfo_write(struct file *file, const char __user * buffer, 4317static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4343 size_t count, loff_t *ppos) 4318 size_t count, loff_t *ppos)
4344{ 4319{
4345 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4320 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
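The slab.c hunks above fold the old *_notrace entry points into *_trace wrappers and thread a caller pointer through __cache_free(), so the debug checks report the real call site rather than kfree() itself. As a loose userspace sketch of that pattern (not kernel code; free_debug() and my_free() are made-up names for illustration), the public entry point captures __builtin_return_address(0) once and hands it down:

#include <stdio.h>
#include <stdlib.h>

/* Inner helper: receives the address of whoever called the public API. */
static void free_debug(void *obj, void *caller)
{
	printf("freeing %p, called from %p\n", obj, caller);
	free(obj);
}

/* Public entry point: record the caller exactly once, at the API boundary. */
void my_free(void *obj)
{
	free_debug(obj, __builtin_return_address(0));
}

int main(void)
{
	void *p = malloc(32);
	my_free(p);	/* the report points at main(), not at my_free() */
	return 0;
}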
diff --git a/mm/slob.c b/mm/slob.c
index d582171c8101..46e0aee33a23 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -500,7 +500,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
500 } else { 500 } else {
501 unsigned int order = get_order(size); 501 unsigned int order = get_order(size);
502 502
503 ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); 503 if (likely(order))
504 gfp |= __GFP_COMP;
505 ret = slob_new_pages(gfp, order, node);
504 if (ret) { 506 if (ret) {
505 struct page *page; 507 struct page *page;
506 page = virt_to_page(ret); 508 page = virt_to_page(ret);
@@ -664,23 +666,12 @@ unsigned int kmem_cache_size(struct kmem_cache *c)
664} 666}
665EXPORT_SYMBOL(kmem_cache_size); 667EXPORT_SYMBOL(kmem_cache_size);
666 668
667const char *kmem_cache_name(struct kmem_cache *c)
668{
669 return c->name;
670}
671EXPORT_SYMBOL(kmem_cache_name);
672
673int kmem_cache_shrink(struct kmem_cache *d) 669int kmem_cache_shrink(struct kmem_cache *d)
674{ 670{
675 return 0; 671 return 0;
676} 672}
677EXPORT_SYMBOL(kmem_cache_shrink); 673EXPORT_SYMBOL(kmem_cache_shrink);
678 674
679int kmem_ptr_validate(struct kmem_cache *a, const void *b)
680{
681 return 0;
682}
683
684static unsigned int slob_ready __read_mostly; 675static unsigned int slob_ready __read_mostly;
685 676
686int slab_is_available(void) 677int slab_is_available(void)
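The slob.c change above only sets __GFP_COMP when the allocation actually spans more than one page (order > 0), since compound-page metadata is pointless for a single page. A minimal userspace sketch of the same conditional-flag idiom, with assumed constants (PAGE_SHIFT 12 and FLAG_COMP are stand-ins, not the kernel's values):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define FLAG_COMP  0x4000u		/* stand-in for __GFP_COMP */

/* Smallest order such that (PAGE_SIZE << order) >= size, like get_order(). */
static unsigned int order_for(unsigned long size)
{
	unsigned int order = 0;

	while ((PAGE_SIZE << order) < size)
		order++;
	return order;
}

int main(void)
{
	unsigned long sizes[] = { 3000, 5000, 70000 };

	for (int i = 0; i < 3; i++) {
		unsigned int order = order_for(sizes[i]);
		unsigned int flags = 0;

		if (order)		/* only multi-page allocations */
			flags |= FLAG_COMP;
		printf("size %lu -> order %u flags %#x\n",
		       sizes[i], order, flags);
	}
	return 0;
}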
diff --git a/mm/slub.c b/mm/slub.c
index 13fffe1f0f3d..35f351f26193 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -28,6 +28,8 @@
28#include <linux/math64.h> 28#include <linux/math64.h>
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30 30
31#include <trace/events/kmem.h>
32
31/* 33/*
32 * Lock order: 34 * Lock order:
33 * 1. slab_lock(page) 35 * 1. slab_lock(page)
@@ -62,7 +64,7 @@
62 * we must stay away from it for a while since we may cause a bouncing 64 * we must stay away from it for a while since we may cause a bouncing
63 * cacheline if we try to acquire the lock. So go onto the next slab. 65 * cacheline if we try to acquire the lock. So go onto the next slab.
64 * If all pages are busy then we may allocate a new slab instead of reusing 66 * If all pages are busy then we may allocate a new slab instead of reusing
65 * a partial slab. A new slab has noone operating on it and thus there is 67 * a partial slab. A new slab has no one operating on it and thus there is
66 * no danger of cacheline contention. 68 * no danger of cacheline contention.
67 * 69 *
68 * Interrupts are disabled during allocation and deallocation in order to 70 * Interrupts are disabled during allocation and deallocation in order to
@@ -168,7 +170,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
168 170
169/* Internal SLUB flags */ 171/* Internal SLUB flags */
170#define __OBJECT_POISON 0x80000000UL /* Poison object */ 172#define __OBJECT_POISON 0x80000000UL /* Poison object */
171#define __SYSFS_ADD_DEFERRED 0x40000000UL /* Not yet visible via sysfs */
172 173
173static int kmem_size = sizeof(struct kmem_cache); 174static int kmem_size = sizeof(struct kmem_cache);
174 175
@@ -178,7 +179,7 @@ static struct notifier_block slab_notifier;
178 179
179static enum { 180static enum {
180 DOWN, /* No slab functionality available */ 181 DOWN, /* No slab functionality available */
181 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 182 PARTIAL, /* Kmem_cache_node works */
182 UP, /* Everything works but does not show up in sysfs */ 183 UP, /* Everything works but does not show up in sysfs */
183 SYSFS /* Sysfs up */ 184 SYSFS /* Sysfs up */
184} slab_state = DOWN; 185} slab_state = DOWN;
@@ -199,7 +200,7 @@ struct track {
199 200
200enum track_item { TRACK_ALLOC, TRACK_FREE }; 201enum track_item { TRACK_ALLOC, TRACK_FREE };
201 202
202#ifdef CONFIG_SLUB_DEBUG 203#ifdef CONFIG_SYSFS
203static int sysfs_slab_add(struct kmem_cache *); 204static int sysfs_slab_add(struct kmem_cache *);
204static int sysfs_slab_alias(struct kmem_cache *, const char *); 205static int sysfs_slab_alias(struct kmem_cache *, const char *);
205static void sysfs_slab_remove(struct kmem_cache *); 206static void sysfs_slab_remove(struct kmem_cache *);
@@ -210,12 +211,13 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
210 { return 0; } 211 { return 0; }
211static inline void sysfs_slab_remove(struct kmem_cache *s) 212static inline void sysfs_slab_remove(struct kmem_cache *s)
212{ 213{
214 kfree(s->name);
213 kfree(s); 215 kfree(s);
214} 216}
215 217
216#endif 218#endif
217 219
218static inline void stat(struct kmem_cache *s, enum stat_item si) 220static inline void stat(const struct kmem_cache *s, enum stat_item si)
219{ 221{
220#ifdef CONFIG_SLUB_STATS 222#ifdef CONFIG_SLUB_STATS
221 __this_cpu_inc(s->cpu_slab->stat[si]); 223 __this_cpu_inc(s->cpu_slab->stat[si]);
@@ -233,11 +235,7 @@ int slab_is_available(void)
233 235
234static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 236static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
235{ 237{
236#ifdef CONFIG_NUMA
237 return s->node[node]; 238 return s->node[node];
238#else
239 return &s->local_node;
240#endif
241} 239}
242 240
243/* Verify that a pointer has an address that is valid within a slab page */ 241/* Verify that a pointer has an address that is valid within a slab page */
@@ -263,6 +261,18 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
263 return *(void **)(object + s->offset); 261 return *(void **)(object + s->offset);
264} 262}
265 263
264static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
265{
266 void *p;
267
268#ifdef CONFIG_DEBUG_PAGEALLOC
269 probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
270#else
271 p = get_freepointer(s, object);
272#endif
273 return p;
274}
275
266static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 276static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
267{ 277{
268 *(void **)(object + s->offset) = fp; 278 *(void **)(object + s->offset) = fp;
@@ -273,21 +283,46 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
273 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 283 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
274 __p += (__s)->size) 284 __p += (__s)->size)
275 285
276/* Scan freelist */
277#define for_each_free_object(__p, __s, __free) \
278 for (__p = (__free); __p; __p = get_freepointer((__s), __p))
279
280/* Determine object index from a given position */ 286/* Determine object index from a given position */
281static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 287static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
282{ 288{
283 return (p - addr) / s->size; 289 return (p - addr) / s->size;
284} 290}
285 291
292static inline size_t slab_ksize(const struct kmem_cache *s)
293{
294#ifdef CONFIG_SLUB_DEBUG
295 /*
296 * Debugging requires use of the padding between object
297 * and whatever may come after it.
298 */
299 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
300 return s->objsize;
301
302#endif
303 /*
304 * If we have the need to store the freelist pointer
305 * back there or track user information then we can
306 * only use the space before that information.
307 */
308 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
309 return s->inuse;
310 /*
311 * Else we can use all the padding etc for the allocation
312 */
313 return s->size;
314}
315
316static inline int order_objects(int order, unsigned long size, int reserved)
317{
318 return ((PAGE_SIZE << order) - reserved) / size;
319}
320
286static inline struct kmem_cache_order_objects oo_make(int order, 321static inline struct kmem_cache_order_objects oo_make(int order,
287 unsigned long size) 322 unsigned long size, int reserved)
288{ 323{
289 struct kmem_cache_order_objects x = { 324 struct kmem_cache_order_objects x = {
290 (order << OO_SHIFT) + (PAGE_SIZE << order) / size 325 (order << OO_SHIFT) + order_objects(order, size, reserved)
291 }; 326 };
292 327
293 return x; 328 return x;
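The hunk above introduces order_objects(), which subtracts a per-slab reserved area before dividing by the object size, and oo_make(), which packs order and object count into one word. A small arithmetic sketch under assumed values (PAGE_SIZE 4096 and OO_SHIFT 16 are assumptions for the demo, not taken from the source):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define OO_SHIFT  16
#define OO_MASK   ((1UL << OO_SHIFT) - 1)

static unsigned long order_objects(int order, unsigned long size, int reserved)
{
	return ((PAGE_SIZE << order) - reserved) / size;
}

/* Pack order and object count into a single word, as oo_make() does. */
static unsigned long oo_make(int order, unsigned long size, int reserved)
{
	return ((unsigned long)order << OO_SHIFT) |
	       order_objects(order, size, reserved);
}

int main(void)
{
	/* e.g. a 256-byte object cache with 16 bytes reserved at the slab end */
	unsigned long oo = oo_make(1, 256, 16);

	printf("order=%lu objects=%lu\n", oo >> OO_SHIFT, oo & OO_MASK);
	return 0;
}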
@@ -305,6 +340,21 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
305 340
306#ifdef CONFIG_SLUB_DEBUG 341#ifdef CONFIG_SLUB_DEBUG
307/* 342/*
343 * Determine a map of object in use on a page.
344 *
345 * Slab lock or node listlock must be held to guarantee that the page does
346 * not vanish from under us.
347 */
348static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
349{
350 void *p;
351 void *addr = page_address(page);
352
353 for (p = page->freelist; p; p = get_freepointer(s, p))
354 set_bit(slab_index(p, s, addr), map);
355}
356
357/*
308 * Debug settings: 358 * Debug settings:
309 */ 359 */
310#ifdef CONFIG_SLUB_DEBUG_ON 360#ifdef CONFIG_SLUB_DEBUG_ON
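get_map(), added above, turns the per-page freelist into a bitmap: every object reachable from page->freelist gets its index bit set, so "bit clear" later means "object in use". A self-contained userspace sketch of the same walk over a toy slab (all sizes and names here are invented for the demo):

#include <stdio.h>
#include <string.h>

#define NOBJ 8
#define SIZE 32				/* object size in the toy slab */

static unsigned char slab[NOBJ * SIZE];

/* Index of an object from its address, like slab_index(). */
static int obj_index(void *p) { return (int)(((unsigned char *)p - slab) / SIZE); }

/* First word of a free object stores the next free object, like SLUB's freelist. */
static void *get_free(void *p) { void *n; memcpy(&n, p, sizeof(n)); return n; }
static void set_free(void *p, void *n) { memcpy(p, &n, sizeof(n)); }

int main(void)
{
	void *freelist = NULL;
	unsigned long map = 0;		/* one bit per object, like get_map()'s bitmap */
	int free_idx[] = { 1, 4, 6 };

	/* Pretend objects 1, 4 and 6 are free: push them on the freelist. */
	for (int i = 0; i < 3; i++) {
		void *p = slab + free_idx[i] * SIZE;
		set_free(p, freelist);
		freelist = p;
	}

	/* The get_map() step: walk the freelist and set a bit per free object. */
	for (void *p = freelist; p; p = get_free(p))
		map |= 1UL << obj_index(p);

	for (int i = 0; i < NOBJ; i++)
		printf("object %d: %s\n", i, (map >> i) & 1 ? "free" : "in use");
	return 0;
}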
@@ -494,7 +544,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
494 dump_stack(); 544 dump_stack();
495} 545}
496 546
497static void init_object(struct kmem_cache *s, void *object, int active) 547static void init_object(struct kmem_cache *s, void *object, u8 val)
498{ 548{
499 u8 *p = object; 549 u8 *p = object;
500 550
@@ -504,9 +554,7 @@ static void init_object(struct kmem_cache *s, void *object, int active)
504 } 554 }
505 555
506 if (s->flags & SLAB_RED_ZONE) 556 if (s->flags & SLAB_RED_ZONE)
507 memset(p + s->objsize, 557 memset(p + s->objsize, val, s->inuse - s->objsize);
508 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
509 s->inuse - s->objsize);
510} 558}
511 559
512static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) 560static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
@@ -621,7 +669,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
621 return 1; 669 return 1;
622 670
623 start = page_address(page); 671 start = page_address(page);
624 length = (PAGE_SIZE << compound_order(page)); 672 length = (PAGE_SIZE << compound_order(page)) - s->reserved;
625 end = start + length; 673 end = start + length;
626 remainder = length % s->size; 674 remainder = length % s->size;
627 if (!remainder) 675 if (!remainder)
@@ -641,17 +689,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
641} 689}
642 690
643static int check_object(struct kmem_cache *s, struct page *page, 691static int check_object(struct kmem_cache *s, struct page *page,
644 void *object, int active) 692 void *object, u8 val)
645{ 693{
646 u8 *p = object; 694 u8 *p = object;
647 u8 *endobject = object + s->objsize; 695 u8 *endobject = object + s->objsize;
648 696
649 if (s->flags & SLAB_RED_ZONE) { 697 if (s->flags & SLAB_RED_ZONE) {
650 unsigned int red =
651 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
652
653 if (!check_bytes_and_report(s, page, object, "Redzone", 698 if (!check_bytes_and_report(s, page, object, "Redzone",
654 endobject, red, s->inuse - s->objsize)) 699 endobject, val, s->inuse - s->objsize))
655 return 0; 700 return 0;
656 } else { 701 } else {
657 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { 702 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
@@ -661,7 +706,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
661 } 706 }
662 707
663 if (s->flags & SLAB_POISON) { 708 if (s->flags & SLAB_POISON) {
664 if (!active && (s->flags & __OBJECT_POISON) && 709 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
665 (!check_bytes_and_report(s, page, p, "Poison", p, 710 (!check_bytes_and_report(s, page, p, "Poison", p,
666 POISON_FREE, s->objsize - 1) || 711 POISON_FREE, s->objsize - 1) ||
667 !check_bytes_and_report(s, page, p, "Poison", 712 !check_bytes_and_report(s, page, p, "Poison",
@@ -673,7 +718,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
673 check_pad_bytes(s, page, p); 718 check_pad_bytes(s, page, p);
674 } 719 }
675 720
676 if (!s->offset && active) 721 if (!s->offset && val == SLUB_RED_ACTIVE)
677 /* 722 /*
678 * Object and freepointer overlap. Cannot check 723 * Object and freepointer overlap. Cannot check
679 * freepointer while object is allocated. 724 * freepointer while object is allocated.
@@ -705,7 +750,7 @@ static int check_slab(struct kmem_cache *s, struct page *page)
705 return 0; 750 return 0;
706 } 751 }
707 752
708 maxobj = (PAGE_SIZE << compound_order(page)) / s->size; 753 maxobj = order_objects(compound_order(page), s->size, s->reserved);
709 if (page->objects > maxobj) { 754 if (page->objects > maxobj) {
710 slab_err(s, page, "objects %u > max %u", 755 slab_err(s, page, "objects %u > max %u",
711 s->name, page->objects, maxobj); 756 s->name, page->objects, maxobj);
@@ -755,7 +800,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
755 nr++; 800 nr++;
756 } 801 }
757 802
758 max_objects = (PAGE_SIZE << compound_order(page)) / s->size; 803 max_objects = order_objects(compound_order(page), s->size, s->reserved);
759 if (max_objects > MAX_OBJS_PER_PAGE) 804 if (max_objects > MAX_OBJS_PER_PAGE)
760 max_objects = MAX_OBJS_PER_PAGE; 805 max_objects = MAX_OBJS_PER_PAGE;
761 806
@@ -792,6 +837,49 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
792} 837}
793 838
794/* 839/*
840 * Hooks for other subsystems that check memory allocations. In a typical
 841 * production configuration these hooks should all produce no code at all.
842 */
843static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
844{
845 flags &= gfp_allowed_mask;
846 lockdep_trace_alloc(flags);
847 might_sleep_if(flags & __GFP_WAIT);
848
849 return should_failslab(s->objsize, flags, s->flags);
850}
851
852static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
853{
854 flags &= gfp_allowed_mask;
855 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
856 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
857}
858
859static inline void slab_free_hook(struct kmem_cache *s, void *x)
860{
861 kmemleak_free_recursive(x, s->flags);
862
863 /*
 863 * Trouble is that we may no longer disable interrupts in the fast path
865 * So in order to make the debug calls that expect irqs to be
866 * disabled we need to disable interrupts temporarily.
867 */
868#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
869 {
870 unsigned long flags;
871
872 local_irq_save(flags);
873 kmemcheck_slab_free(s, x, s->objsize);
874 debug_check_no_locks_freed(x, s->objsize);
875 local_irq_restore(flags);
876 }
877#endif
878 if (!(s->flags & SLAB_DEBUG_OBJECTS))
879 debug_check_no_obj_freed(x, s->objsize);
880}
881
882/*
795 * Tracking of fully allocated slabs for debugging purposes. 883 * Tracking of fully allocated slabs for debugging purposes.
796 */ 884 */
797static void add_full(struct kmem_cache_node *n, struct page *page) 885static void add_full(struct kmem_cache_node *n, struct page *page)
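The hooks added above gather the debug-only work (failslab, kmemcheck, kmemleak, lockdep checks) into slab_pre_alloc_hook()/slab_post_alloc_hook()/slab_free_hook(); with debugging disabled they become empty inlines and vanish from the build. A minimal sketch of that compile-out idiom, with made-up hook names:

#include <stdio.h>
#include <stdlib.h>

/* Toggle this to see the hooks disappear from the build entirely. */
/* #define MY_DEBUG 1 */

#ifdef MY_DEBUG
static inline int pre_alloc_hook(size_t size)
{
	printf("about to allocate %zu bytes\n", size);
	return 0;			/* non-zero would mean "fail this allocation" */
}
static inline void post_alloc_hook(void *obj)
{
	printf("allocated %p\n", obj);
}
#else
/* Empty inlines: in a production build these produce no code at all. */
static inline int pre_alloc_hook(size_t size) { (void)size; return 0; }
static inline void post_alloc_hook(void *obj) { (void)obj; }
#endif

int main(void)
{
	if (pre_alloc_hook(64))
		return 1;
	void *p = malloc(64);
	post_alloc_hook(p);
	free(p);
	return 0;
}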
@@ -838,7 +926,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
838 * dilemma by deferring the increment of the count during 926 * dilemma by deferring the increment of the count during
839 * bootstrap (see early_kmem_cache_node_alloc). 927 * bootstrap (see early_kmem_cache_node_alloc).
840 */ 928 */
841 if (!NUMA_BUILD || n) { 929 if (n) {
842 atomic_long_inc(&n->nr_slabs); 930 atomic_long_inc(&n->nr_slabs);
843 atomic_long_add(objects, &n->total_objects); 931 atomic_long_add(objects, &n->total_objects);
844 } 932 }
@@ -858,11 +946,11 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
858 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 946 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
859 return; 947 return;
860 948
861 init_object(s, object, 0); 949 init_object(s, object, SLUB_RED_INACTIVE);
862 init_tracking(s, object); 950 init_tracking(s, object);
863} 951}
864 952
865static int alloc_debug_processing(struct kmem_cache *s, struct page *page, 953static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
866 void *object, unsigned long addr) 954 void *object, unsigned long addr)
867{ 955{
868 if (!check_slab(s, page)) 956 if (!check_slab(s, page))
@@ -878,14 +966,14 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
878 goto bad; 966 goto bad;
879 } 967 }
880 968
881 if (!check_object(s, page, object, 0)) 969 if (!check_object(s, page, object, SLUB_RED_INACTIVE))
882 goto bad; 970 goto bad;
883 971
884 /* Success perform special debug activities for allocs */ 972 /* Success perform special debug activities for allocs */
885 if (s->flags & SLAB_STORE_USER) 973 if (s->flags & SLAB_STORE_USER)
886 set_track(s, object, TRACK_ALLOC, addr); 974 set_track(s, object, TRACK_ALLOC, addr);
887 trace(s, page, object, 1); 975 trace(s, page, object, 1);
888 init_object(s, object, 1); 976 init_object(s, object, SLUB_RED_ACTIVE);
889 return 1; 977 return 1;
890 978
891bad: 979bad:
@@ -902,8 +990,8 @@ bad:
902 return 0; 990 return 0;
903} 991}
904 992
905static int free_debug_processing(struct kmem_cache *s, struct page *page, 993static noinline int free_debug_processing(struct kmem_cache *s,
906 void *object, unsigned long addr) 994 struct page *page, void *object, unsigned long addr)
907{ 995{
908 if (!check_slab(s, page)) 996 if (!check_slab(s, page))
909 goto fail; 997 goto fail;
@@ -918,7 +1006,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
918 goto fail; 1006 goto fail;
919 } 1007 }
920 1008
921 if (!check_object(s, page, object, 1)) 1009 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
922 return 0; 1010 return 0;
923 1011
924 if (unlikely(s != page->slab)) { 1012 if (unlikely(s != page->slab)) {
@@ -942,7 +1030,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
942 if (s->flags & SLAB_STORE_USER) 1030 if (s->flags & SLAB_STORE_USER)
943 set_track(s, object, TRACK_FREE, addr); 1031 set_track(s, object, TRACK_FREE, addr);
944 trace(s, page, object, 0); 1032 trace(s, page, object, 0);
945 init_object(s, object, 0); 1033 init_object(s, object, SLUB_RED_INACTIVE);
946 return 1; 1034 return 1;
947 1035
948fail: 1036fail:
@@ -1046,7 +1134,7 @@ static inline int free_debug_processing(struct kmem_cache *s,
1046static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1134static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1047 { return 1; } 1135 { return 1; }
1048static inline int check_object(struct kmem_cache *s, struct page *page, 1136static inline int check_object(struct kmem_cache *s, struct page *page,
1049 void *object, int active) { return 1; } 1137 void *object, u8 val) { return 1; }
1050static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1138static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1051static inline unsigned long kmem_cache_flags(unsigned long objsize, 1139static inline unsigned long kmem_cache_flags(unsigned long objsize,
1052 unsigned long flags, const char *name, 1140 unsigned long flags, const char *name,
@@ -1066,7 +1154,16 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
1066 int objects) {} 1154 int objects) {}
1067static inline void dec_slabs_node(struct kmem_cache *s, int node, 1155static inline void dec_slabs_node(struct kmem_cache *s, int node,
1068 int objects) {} 1156 int objects) {}
1069#endif 1157
1158static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1159 { return 0; }
1160
1161static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1162 void *object) {}
1163
1164static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
1165
1166#endif /* CONFIG_SLUB_DEBUG */
1070 1167
1071/* 1168/*
1072 * Slab allocation and freeing 1169 * Slab allocation and freeing
@@ -1194,7 +1291,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1194 slab_pad_check(s, page); 1291 slab_pad_check(s, page);
1195 for_each_object(p, s, page_address(page), 1292 for_each_object(p, s, page_address(page),
1196 page->objects) 1293 page->objects)
1197 check_object(s, page, p, 0); 1294 check_object(s, page, p, SLUB_RED_INACTIVE);
1198 } 1295 }
1199 1296
1200 kmemcheck_free_shadow(page, compound_order(page)); 1297 kmemcheck_free_shadow(page, compound_order(page));
@@ -1211,21 +1308,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1211 __free_pages(page, order); 1308 __free_pages(page, order);
1212} 1309}
1213 1310
1311#define need_reserve_slab_rcu \
1312 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
1313
1214static void rcu_free_slab(struct rcu_head *h) 1314static void rcu_free_slab(struct rcu_head *h)
1215{ 1315{
1216 struct page *page; 1316 struct page *page;
1217 1317
1218 page = container_of((struct list_head *)h, struct page, lru); 1318 if (need_reserve_slab_rcu)
1319 page = virt_to_head_page(h);
1320 else
1321 page = container_of((struct list_head *)h, struct page, lru);
1322
1219 __free_slab(page->slab, page); 1323 __free_slab(page->slab, page);
1220} 1324}
1221 1325
1222static void free_slab(struct kmem_cache *s, struct page *page) 1326static void free_slab(struct kmem_cache *s, struct page *page)
1223{ 1327{
1224 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1328 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1225 /* 1329 struct rcu_head *head;
1226 * RCU free overloads the RCU head over the LRU 1330
1227 */ 1331 if (need_reserve_slab_rcu) {
1228 struct rcu_head *head = (void *)&page->lru; 1332 int order = compound_order(page);
1333 int offset = (PAGE_SIZE << order) - s->reserved;
1334
1335 VM_BUG_ON(s->reserved != sizeof(*head));
1336 head = page_address(page) + offset;
1337 } else {
1338 /*
1339 * RCU free overloads the RCU head over the LRU
1340 */
1341 head = (void *)&page->lru;
1342 }
1229 1343
1230 call_rcu(head, rcu_free_slab); 1344 call_rcu(head, rcu_free_slab);
1231 } else 1345 } else
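The RCU changes above handle the case where struct rcu_head no longer fits inside page->lru: the cache then reserves sizeof(struct rcu_head) bytes at the end of every slab and places the callback head there, and rcu_free_slab() recovers the page with virt_to_head_page(). A userspace sketch of carving that footer out of a page-sized buffer (fake_rcu_head and the 4096-byte page are assumptions for the demo):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

struct fake_rcu_head { void *next; void (*func)(void *); };

int main(void)
{
	int order = 0;
	unsigned long reserved = sizeof(struct fake_rcu_head);
	unsigned char *page = malloc(PAGE_SIZE << order);

	if (!page)
		return 1;

	/* Place the callback head in the reserved tail of the slab itself,
	 * mirroring: head = page_address(page) + (PAGE_SIZE << order) - s->reserved */
	unsigned long offset = (PAGE_SIZE << order) - reserved;
	struct fake_rcu_head *head = (struct fake_rcu_head *)(page + offset);

	printf("page %p, head at %p, usable bytes %lu\n",
	       (void *)page, (void *)head, offset);
	free(page);
	return 0;
}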
@@ -1274,13 +1388,19 @@ static void add_partial(struct kmem_cache_node *n,
1274 spin_unlock(&n->list_lock); 1388 spin_unlock(&n->list_lock);
1275} 1389}
1276 1390
1391static inline void __remove_partial(struct kmem_cache_node *n,
1392 struct page *page)
1393{
1394 list_del(&page->lru);
1395 n->nr_partial--;
1396}
1397
1277static void remove_partial(struct kmem_cache *s, struct page *page) 1398static void remove_partial(struct kmem_cache *s, struct page *page)
1278{ 1399{
1279 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1400 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1280 1401
1281 spin_lock(&n->list_lock); 1402 spin_lock(&n->list_lock);
1282 list_del(&page->lru); 1403 __remove_partial(n, page);
1283 n->nr_partial--;
1284 spin_unlock(&n->list_lock); 1404 spin_unlock(&n->list_lock);
1285} 1405}
1286 1406
@@ -1293,8 +1413,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1293 struct page *page) 1413 struct page *page)
1294{ 1414{
1295 if (slab_trylock(page)) { 1415 if (slab_trylock(page)) {
1296 list_del(&page->lru); 1416 __remove_partial(n, page);
1297 n->nr_partial--;
1298 __SetPageSlubFrozen(page); 1417 __SetPageSlubFrozen(page);
1299 return 1; 1418 return 1;
1300 } 1419 }
@@ -1391,7 +1510,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1391 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1510 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1392 1511
1393 page = get_partial_node(get_node(s, searchnode)); 1512 page = get_partial_node(get_node(s, searchnode));
1394 if (page || node != -1) 1513 if (page || node != NUMA_NO_NODE)
1395 return page; 1514 return page;
1396 1515
1397 return get_any_partial(s, flags); 1516 return get_any_partial(s, flags);
@@ -1405,6 +1524,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1405 * On exit the slab lock will have been dropped. 1524 * On exit the slab lock will have been dropped.
1406 */ 1525 */
1407static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1526static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1527 __releases(bitlock)
1408{ 1528{
1409 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1529 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1410 1530
@@ -1443,10 +1563,77 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1443 } 1563 }
1444} 1564}
1445 1565
1566#ifdef CONFIG_PREEMPT
1567/*
 1568 * Calculate the next globally unique transaction for disambiguation
1569 * during cmpxchg. The transactions start with the cpu number and are then
1570 * incremented by CONFIG_NR_CPUS.
1571 */
1572#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
1573#else
1574/*
1575 * No preemption supported therefore also no need to check for
1576 * different cpus.
1577 */
1578#define TID_STEP 1
1579#endif
1580
1581static inline unsigned long next_tid(unsigned long tid)
1582{
1583 return tid + TID_STEP;
1584}
1585
1586static inline unsigned int tid_to_cpu(unsigned long tid)
1587{
1588 return tid % TID_STEP;
1589}
1590
1591static inline unsigned long tid_to_event(unsigned long tid)
1592{
1593 return tid / TID_STEP;
1594}
1595
1596static inline unsigned int init_tid(int cpu)
1597{
1598 return cpu;
1599}
1600
1601static inline void note_cmpxchg_failure(const char *n,
1602 const struct kmem_cache *s, unsigned long tid)
1603{
1604#ifdef SLUB_DEBUG_CMPXCHG
1605 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
1606
1607 printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
1608
1609#ifdef CONFIG_PREEMPT
1610 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
1611 printk("due to cpu change %d -> %d\n",
1612 tid_to_cpu(tid), tid_to_cpu(actual_tid));
1613 else
1614#endif
1615 if (tid_to_event(tid) != tid_to_event(actual_tid))
1616 printk("due to cpu running other code. Event %ld->%ld\n",
1617 tid_to_event(tid), tid_to_event(actual_tid));
1618 else
1619 printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
1620 actual_tid, tid, next_tid(tid));
1621#endif
1622 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1623}
1624
1625void init_kmem_cache_cpus(struct kmem_cache *s)
1626{
1627 int cpu;
1628
1629 for_each_possible_cpu(cpu)
1630 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1631}
1446/* 1632/*
1447 * Remove the cpu slab 1633 * Remove the cpu slab
1448 */ 1634 */
1449static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1635static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1636 __releases(bitlock)
1450{ 1637{
1451 struct page *page = c->page; 1638 struct page *page = c->page;
1452 int tail = 1; 1639 int tail = 1;
@@ -1473,6 +1660,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1473 page->inuse--; 1660 page->inuse--;
1474 } 1661 }
1475 c->page = NULL; 1662 c->page = NULL;
1663 c->tid = next_tid(c->tid);
1476 unfreeze_slab(s, page, tail); 1664 unfreeze_slab(s, page, tail);
1477} 1665}
1478 1666
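With preemption enabled, each cpu's transaction id starts at the cpu number and advances in steps of roundup_pow_of_two(CONFIG_NR_CPUS), so the originating cpu and the event count can be recovered by modulo and division, exactly as tid_to_cpu() and tid_to_event() do above. A short arithmetic demo (NR_CPUS == 6 is just an example value):

#include <stdio.h>

#define NR_CPUS 6

/* Round NR_CPUS up to a power of two, like roundup_pow_of_two(). */
static unsigned long tid_step(void)
{
	unsigned long step = 1;

	while (step < NR_CPUS)
		step <<= 1;
	return step;
}

int main(void)
{
	unsigned long step = tid_step();	/* 8 for NR_CPUS == 6 */
	unsigned long tid = 3;			/* init_tid() for cpu 3 */

	for (int i = 0; i < 4; i++) {
		printf("tid=%lu cpu=%lu event=%lu\n",
		       tid, tid % step, tid / step);
		tid += step;			/* next_tid() */
	}
	return 0;
}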
@@ -1606,33 +1794,46 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1606 unsigned long addr, struct kmem_cache_cpu *c) 1794 unsigned long addr, struct kmem_cache_cpu *c)
1607{ 1795{
1608 void **object; 1796 void **object;
1609 struct page *new; 1797 struct page *page;
1798 unsigned long flags;
1799
1800 local_irq_save(flags);
1801#ifdef CONFIG_PREEMPT
1802 /*
1803 * We may have been preempted and rescheduled on a different
1804 * cpu before disabling interrupts. Need to reload cpu area
1805 * pointer.
1806 */
1807 c = this_cpu_ptr(s->cpu_slab);
1808#endif
1610 1809
1611 /* We handle __GFP_ZERO in the caller */ 1810 /* We handle __GFP_ZERO in the caller */
1612 gfpflags &= ~__GFP_ZERO; 1811 gfpflags &= ~__GFP_ZERO;
1613 1812
1614 if (!c->page) 1813 page = c->page;
1814 if (!page)
1615 goto new_slab; 1815 goto new_slab;
1616 1816
1617 slab_lock(c->page); 1817 slab_lock(page);
1618 if (unlikely(!node_match(c, node))) 1818 if (unlikely(!node_match(c, node)))
1619 goto another_slab; 1819 goto another_slab;
1620 1820
1621 stat(s, ALLOC_REFILL); 1821 stat(s, ALLOC_REFILL);
1622 1822
1623load_freelist: 1823load_freelist:
1624 object = c->page->freelist; 1824 object = page->freelist;
1625 if (unlikely(!object)) 1825 if (unlikely(!object))
1626 goto another_slab; 1826 goto another_slab;
1627 if (kmem_cache_debug(s)) 1827 if (kmem_cache_debug(s))
1628 goto debug; 1828 goto debug;
1629 1829
1630 c->freelist = get_freepointer(s, object); 1830 c->freelist = get_freepointer(s, object);
1631 c->page->inuse = c->page->objects; 1831 page->inuse = page->objects;
1632 c->page->freelist = NULL; 1832 page->freelist = NULL;
1633 c->node = page_to_nid(c->page); 1833
1634unlock_out: 1834 slab_unlock(page);
1635 slab_unlock(c->page); 1835 c->tid = next_tid(c->tid);
1836 local_irq_restore(flags);
1636 stat(s, ALLOC_SLOWPATH); 1837 stat(s, ALLOC_SLOWPATH);
1637 return object; 1838 return object;
1638 1839
@@ -1640,42 +1841,50 @@ another_slab:
1640 deactivate_slab(s, c); 1841 deactivate_slab(s, c);
1641 1842
1642new_slab: 1843new_slab:
1643 new = get_partial(s, gfpflags, node); 1844 page = get_partial(s, gfpflags, node);
1644 if (new) { 1845 if (page) {
1645 c->page = new;
1646 stat(s, ALLOC_FROM_PARTIAL); 1846 stat(s, ALLOC_FROM_PARTIAL);
1847 c->node = page_to_nid(page);
1848 c->page = page;
1647 goto load_freelist; 1849 goto load_freelist;
1648 } 1850 }
1649 1851
1852 gfpflags &= gfp_allowed_mask;
1650 if (gfpflags & __GFP_WAIT) 1853 if (gfpflags & __GFP_WAIT)
1651 local_irq_enable(); 1854 local_irq_enable();
1652 1855
1653 new = new_slab(s, gfpflags, node); 1856 page = new_slab(s, gfpflags, node);
1654 1857
1655 if (gfpflags & __GFP_WAIT) 1858 if (gfpflags & __GFP_WAIT)
1656 local_irq_disable(); 1859 local_irq_disable();
1657 1860
1658 if (new) { 1861 if (page) {
1659 c = __this_cpu_ptr(s->cpu_slab); 1862 c = __this_cpu_ptr(s->cpu_slab);
1660 stat(s, ALLOC_SLAB); 1863 stat(s, ALLOC_SLAB);
1661 if (c->page) 1864 if (c->page)
1662 flush_slab(s, c); 1865 flush_slab(s, c);
1663 slab_lock(new); 1866
1664 __SetPageSlubFrozen(new); 1867 slab_lock(page);
1665 c->page = new; 1868 __SetPageSlubFrozen(page);
1869 c->node = page_to_nid(page);
1870 c->page = page;
1666 goto load_freelist; 1871 goto load_freelist;
1667 } 1872 }
1668 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 1873 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1669 slab_out_of_memory(s, gfpflags, node); 1874 slab_out_of_memory(s, gfpflags, node);
1875 local_irq_restore(flags);
1670 return NULL; 1876 return NULL;
1671debug: 1877debug:
1672 if (!alloc_debug_processing(s, c->page, object, addr)) 1878 if (!alloc_debug_processing(s, page, object, addr))
1673 goto another_slab; 1879 goto another_slab;
1674 1880
1675 c->page->inuse++; 1881 page->inuse++;
1676 c->page->freelist = get_freepointer(s, object); 1882 page->freelist = get_freepointer(s, object);
1677 c->node = -1; 1883 deactivate_slab(s, c);
1678 goto unlock_out; 1884 c->page = NULL;
1885 c->node = NUMA_NO_NODE;
1886 local_irq_restore(flags);
1887 return object;
1679} 1888}
1680 1889
1681/* 1890/*
@@ -1693,34 +1902,63 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1693{ 1902{
1694 void **object; 1903 void **object;
1695 struct kmem_cache_cpu *c; 1904 struct kmem_cache_cpu *c;
1696 unsigned long flags; 1905 unsigned long tid;
1697
1698 gfpflags &= gfp_allowed_mask;
1699 1906
1700 lockdep_trace_alloc(gfpflags); 1907 if (slab_pre_alloc_hook(s, gfpflags))
1701 might_sleep_if(gfpflags & __GFP_WAIT);
1702
1703 if (should_failslab(s->objsize, gfpflags, s->flags))
1704 return NULL; 1908 return NULL;
1705 1909
1706 local_irq_save(flags); 1910redo:
1911
1912 /*
1913 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
1914 * enabled. We may switch back and forth between cpus while
1915 * reading from one cpu area. That does not matter as long
1916 * as we end up on the original cpu again when doing the cmpxchg.
1917 */
1707 c = __this_cpu_ptr(s->cpu_slab); 1918 c = __this_cpu_ptr(s->cpu_slab);
1919
1920 /*
1921 * The transaction ids are globally unique per cpu and per operation on
1922 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
1923 * occurs on the right processor and that there was no operation on the
1924 * linked list in between.
1925 */
1926 tid = c->tid;
1927 barrier();
1928
1708 object = c->freelist; 1929 object = c->freelist;
1709 if (unlikely(!object || !node_match(c, node))) 1930 if (unlikely(!object || !node_match(c, node)))
1710 1931
1711 object = __slab_alloc(s, gfpflags, node, addr, c); 1932 object = __slab_alloc(s, gfpflags, node, addr, c);
1712 1933
1713 else { 1934 else {
1714 c->freelist = get_freepointer(s, object); 1935 /*
1936 * The cmpxchg will only match if there was no additional
1937 * operation and if we are on the right processor.
1938 *
1939 * The cmpxchg does the following atomically (without lock semantics!)
1940 * 1. Relocate first pointer to the current per cpu area.
1941 * 2. Verify that tid and freelist have not been changed
1942 * 3. If they were not changed replace tid and freelist
1943 *
1944 * Since this is without lock semantics the protection is only against
1945 * code executing on this cpu *not* from access by other cpus.
1946 */
1947 if (unlikely(!irqsafe_cpu_cmpxchg_double(
1948 s->cpu_slab->freelist, s->cpu_slab->tid,
1949 object, tid,
1950 get_freepointer_safe(s, object), next_tid(tid)))) {
1951
1952 note_cmpxchg_failure("slab_alloc", s, tid);
1953 goto redo;
1954 }
1715 stat(s, ALLOC_FASTPATH); 1955 stat(s, ALLOC_FASTPATH);
1716 } 1956 }
1717 local_irq_restore(flags);
1718 1957
1719 if (unlikely(gfpflags & __GFP_ZERO) && object) 1958 if (unlikely(gfpflags & __GFP_ZERO) && object)
1720 memset(object, 0, s->objsize); 1959 memset(object, 0, s->objsize);
1721 1960
1722 kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); 1961 slab_post_alloc_hook(s, gfpflags, object);
1723 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
1724 1962
1725 return object; 1963 return object;
1726} 1964}
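The reworked slab_alloc() fast path above no longer disables interrupts: it reads the per-cpu freelist and tid, then commits with a double-word cmpxchg that only succeeds if neither changed, retrying otherwise. Since cmpxchg_double is architecture specific, the following is only a loose userspace analogue of the idea: pack a small freelist index and a version tag into one 64-bit word so a single compare-and-swap checks both, which is what defeats ABA on the freelist head. All names and sizes here are invented for the sketch.

#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

#define NIL 0xffffffffu

static _Atomic uint64_t slot;

static uint64_t pack(uint32_t idx, uint32_t tid) { return ((uint64_t)tid << 32) | idx; }
static uint32_t idx_of(uint64_t v) { return (uint32_t)v; }
static uint32_t tid_of(uint64_t v) { return (uint32_t)(v >> 32); }

static uint32_t next[8] = { 1, 2, 3, 4, 5, 6, 7, NIL };	/* toy freelist chain */

static uint32_t alloc_index(void)
{
	uint64_t old, want;

	do {
		old = atomic_load(&slot);
		uint32_t head = idx_of(old);

		if (head == NIL)
			return NIL;		/* empty: fall back to a slow path */
		/* advance the head and bump the tag in one atomic step */
		want = pack(next[head], tid_of(old) + 1);
	} while (!atomic_compare_exchange_weak(&slot, &old, want));

	return idx_of(old);
}

int main(void)
{
	atomic_store(&slot, pack(0, 0));
	for (int i = 0; i < 3; i++)
		printf("allocated object %u\n", (unsigned)alloc_index());
	return 0;
}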
@@ -1736,11 +1974,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1736EXPORT_SYMBOL(kmem_cache_alloc); 1974EXPORT_SYMBOL(kmem_cache_alloc);
1737 1975
1738#ifdef CONFIG_TRACING 1976#ifdef CONFIG_TRACING
1739void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) 1977void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
1740{ 1978{
1741 return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 1979 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
1980 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
1981 return ret;
1982}
1983EXPORT_SYMBOL(kmem_cache_alloc_trace);
1984
1985void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
1986{
1987 void *ret = kmalloc_order(size, flags, order);
1988 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
1989 return ret;
1742} 1990}
1743EXPORT_SYMBOL(kmem_cache_alloc_notrace); 1991EXPORT_SYMBOL(kmalloc_order_trace);
1744#endif 1992#endif
1745 1993
1746#ifdef CONFIG_NUMA 1994#ifdef CONFIG_NUMA
@@ -1754,16 +2002,20 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1754 return ret; 2002 return ret;
1755} 2003}
1756EXPORT_SYMBOL(kmem_cache_alloc_node); 2004EXPORT_SYMBOL(kmem_cache_alloc_node);
1757#endif
1758 2005
1759#ifdef CONFIG_TRACING 2006#ifdef CONFIG_TRACING
1760void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, 2007void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
1761 gfp_t gfpflags, 2008 gfp_t gfpflags,
1762 int node) 2009 int node, size_t size)
1763{ 2010{
1764 return slab_alloc(s, gfpflags, node, _RET_IP_); 2011 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2012
2013 trace_kmalloc_node(_RET_IP_, ret,
2014 size, s->size, gfpflags, node);
2015 return ret;
1765} 2016}
1766EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 2017EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
2018#endif
1767#endif 2019#endif
1768 2020
1769/* 2021/*
@@ -1779,14 +2031,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1779{ 2031{
1780 void *prior; 2032 void *prior;
1781 void **object = (void *)x; 2033 void **object = (void *)x;
2034 unsigned long flags;
1782 2035
1783 stat(s, FREE_SLOWPATH); 2036 local_irq_save(flags);
1784 slab_lock(page); 2037 slab_lock(page);
2038 stat(s, FREE_SLOWPATH);
1785 2039
1786 if (kmem_cache_debug(s)) 2040 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
1787 goto debug; 2041 goto out_unlock;
1788 2042
1789checks_ok:
1790 prior = page->freelist; 2043 prior = page->freelist;
1791 set_freepointer(s, object, prior); 2044 set_freepointer(s, object, prior);
1792 page->freelist = object; 2045 page->freelist = object;
@@ -1811,6 +2064,7 @@ checks_ok:
1811 2064
1812out_unlock: 2065out_unlock:
1813 slab_unlock(page); 2066 slab_unlock(page);
2067 local_irq_restore(flags);
1814 return; 2068 return;
1815 2069
1816slab_empty: 2070slab_empty:
@@ -1822,14 +2076,9 @@ slab_empty:
1822 stat(s, FREE_REMOVE_PARTIAL); 2076 stat(s, FREE_REMOVE_PARTIAL);
1823 } 2077 }
1824 slab_unlock(page); 2078 slab_unlock(page);
2079 local_irq_restore(flags);
1825 stat(s, FREE_SLAB); 2080 stat(s, FREE_SLAB);
1826 discard_slab(s, page); 2081 discard_slab(s, page);
1827 return;
1828
1829debug:
1830 if (!free_debug_processing(s, page, x, addr))
1831 goto out_unlock;
1832 goto checks_ok;
1833} 2082}
1834 2083
1835/* 2084/*
@@ -1848,23 +2097,38 @@ static __always_inline void slab_free(struct kmem_cache *s,
1848{ 2097{
1849 void **object = (void *)x; 2098 void **object = (void *)x;
1850 struct kmem_cache_cpu *c; 2099 struct kmem_cache_cpu *c;
1851 unsigned long flags; 2100 unsigned long tid;
1852 2101
1853 kmemleak_free_recursive(x, s->flags); 2102 slab_free_hook(s, x);
1854 local_irq_save(flags); 2103
2104redo:
2105
2106 /*
 2107 * Determine the current cpu's per cpu slab.
 2108 * The cpu may change afterward. However, that does not matter since
 2109 * data is retrieved via this pointer. If we are on the same cpu
 2110 * during the cmpxchg then the free will succeed.
2111 */
1855 c = __this_cpu_ptr(s->cpu_slab); 2112 c = __this_cpu_ptr(s->cpu_slab);
1856 kmemcheck_slab_free(s, object, s->objsize); 2113
1857 debug_check_no_locks_freed(object, s->objsize); 2114 tid = c->tid;
1858 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 2115 barrier();
1859 debug_check_no_obj_freed(object, s->objsize); 2116
1860 if (likely(page == c->page && c->node >= 0)) { 2117 if (likely(page == c->page)) {
1861 set_freepointer(s, object, c->freelist); 2118 set_freepointer(s, object, c->freelist);
1862 c->freelist = object; 2119
2120 if (unlikely(!irqsafe_cpu_cmpxchg_double(
2121 s->cpu_slab->freelist, s->cpu_slab->tid,
2122 c->freelist, tid,
2123 object, next_tid(tid)))) {
2124
2125 note_cmpxchg_failure("slab_free", s, tid);
2126 goto redo;
2127 }
1863 stat(s, FREE_FASTPATH); 2128 stat(s, FREE_FASTPATH);
1864 } else 2129 } else
1865 __slab_free(s, page, x, addr); 2130 __slab_free(s, page, x, addr);
1866 2131
1867 local_irq_restore(flags);
1868} 2132}
1869 2133
1870void kmem_cache_free(struct kmem_cache *s, void *x) 2134void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -1879,17 +2143,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
1879} 2143}
1880EXPORT_SYMBOL(kmem_cache_free); 2144EXPORT_SYMBOL(kmem_cache_free);
1881 2145
1882/* Figure out on which slab page the object resides */
1883static struct page *get_object_page(const void *x)
1884{
1885 struct page *page = virt_to_head_page(x);
1886
1887 if (!PageSlab(page))
1888 return NULL;
1889
1890 return page;
1891}
1892
1893/* 2146/*
1894 * Object placement in a slab is made very easy because we always start at 2147 * Object placement in a slab is made very easy because we always start at
1895 * offset 0. If we tune the size of the object to the alignment then we can 2148 * offset 0. If we tune the size of the object to the alignment then we can
@@ -1945,13 +2198,13 @@ static int slub_nomerge;
1945 * the smallest order which will fit the object. 2198 * the smallest order which will fit the object.
1946 */ 2199 */
1947static inline int slab_order(int size, int min_objects, 2200static inline int slab_order(int size, int min_objects,
1948 int max_order, int fract_leftover) 2201 int max_order, int fract_leftover, int reserved)
1949{ 2202{
1950 int order; 2203 int order;
1951 int rem; 2204 int rem;
1952 int min_order = slub_min_order; 2205 int min_order = slub_min_order;
1953 2206
1954 if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) 2207 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
1955 return get_order(size * MAX_OBJS_PER_PAGE) - 1; 2208 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
1956 2209
1957 for (order = max(min_order, 2210 for (order = max(min_order,
@@ -1960,10 +2213,10 @@ static inline int slab_order(int size, int min_objects,
1960 2213
1961 unsigned long slab_size = PAGE_SIZE << order; 2214 unsigned long slab_size = PAGE_SIZE << order;
1962 2215
1963 if (slab_size < min_objects * size) 2216 if (slab_size < min_objects * size + reserved)
1964 continue; 2217 continue;
1965 2218
1966 rem = slab_size % size; 2219 rem = (slab_size - reserved) % size;
1967 2220
1968 if (rem <= slab_size / fract_leftover) 2221 if (rem <= slab_size / fract_leftover)
1969 break; 2222 break;
@@ -1973,7 +2226,7 @@ static inline int slab_order(int size, int min_objects,
1973 return order; 2226 return order;
1974} 2227}
1975 2228
1976static inline int calculate_order(int size) 2229static inline int calculate_order(int size, int reserved)
1977{ 2230{
1978 int order; 2231 int order;
1979 int min_objects; 2232 int min_objects;
@@ -1991,14 +2244,14 @@ static inline int calculate_order(int size)
1991 min_objects = slub_min_objects; 2244 min_objects = slub_min_objects;
1992 if (!min_objects) 2245 if (!min_objects)
1993 min_objects = 4 * (fls(nr_cpu_ids) + 1); 2246 min_objects = 4 * (fls(nr_cpu_ids) + 1);
1994 max_objects = (PAGE_SIZE << slub_max_order)/size; 2247 max_objects = order_objects(slub_max_order, size, reserved);
1995 min_objects = min(min_objects, max_objects); 2248 min_objects = min(min_objects, max_objects);
1996 2249
1997 while (min_objects > 1) { 2250 while (min_objects > 1) {
1998 fraction = 16; 2251 fraction = 16;
1999 while (fraction >= 4) { 2252 while (fraction >= 4) {
2000 order = slab_order(size, min_objects, 2253 order = slab_order(size, min_objects,
2001 slub_max_order, fraction); 2254 slub_max_order, fraction, reserved);
2002 if (order <= slub_max_order) 2255 if (order <= slub_max_order)
2003 return order; 2256 return order;
2004 fraction /= 2; 2257 fraction /= 2;
@@ -2010,14 +2263,14 @@ static inline int calculate_order(int size)
2010 * We were unable to place multiple objects in a slab. Now 2263 * We were unable to place multiple objects in a slab. Now
2011 * lets see if we can place a single object there. 2264 * lets see if we can place a single object there.
2012 */ 2265 */
2013 order = slab_order(size, 1, slub_max_order, 1); 2266 order = slab_order(size, 1, slub_max_order, 1, reserved);
2014 if (order <= slub_max_order) 2267 if (order <= slub_max_order)
2015 return order; 2268 return order;
2016 2269
2017 /* 2270 /*
2018 * Doh this slab cannot be placed using slub_max_order. 2271 * Doh this slab cannot be placed using slub_max_order.
2019 */ 2272 */
2020 order = slab_order(size, 1, MAX_ORDER, 1); 2273 order = slab_order(size, 1, MAX_ORDER, 1, reserved);
2021 if (order < MAX_ORDER) 2274 if (order < MAX_ORDER)
2022 return order; 2275 return order;
2023 return -ENOSYS; 2276 return -ENOSYS;
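slab_order() above now subtracts the reserved bytes before computing the leftover, and accepts an order when the waste is at most slab_size/fract_leftover; calculate_order() then relaxes that fraction (16, 8, 4) at decreasing object counts until something fits. A simplified sketch of that search, starting the order scan at 0 and capping it at an assumed MAX_ORDER of 3:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define MAX_ORDER 3

static int slab_order(unsigned long size, int min_objects,
		      int max_order, int fract_leftover, int reserved)
{
	for (int order = 0; order <= max_order; order++) {
		unsigned long slab_size = PAGE_SIZE << order;

		if (slab_size < min_objects * size + reserved)
			continue;

		unsigned long rem = (slab_size - reserved) % size;

		if (rem <= slab_size / fract_leftover)
			return order;
	}
	return max_order + 1;	/* nothing acceptable at these constraints */
}

int main(void)
{
	unsigned long size = 700;
	int reserved = 16;

	/* Relax the waste limit (1/16 -> 1/8 -> 1/4) until some order is
	 * acceptable, mirroring the fraction loop in calculate_order(). */
	for (int fraction = 16; fraction >= 4; fraction /= 2) {
		int order = slab_order(size, 4, MAX_ORDER, fraction, reserved);

		printf("fraction 1/%d -> order %d\n", fraction, order);
		if (order <= MAX_ORDER)
			break;
	}
	return 0;
}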
@@ -2062,26 +2315,28 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2062#endif 2315#endif
2063} 2316}
2064 2317
2065static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); 2318static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2066
2067static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2068{ 2319{
2069 if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) 2320 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2070 /* 2321 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2071 * Boot time creation of the kmalloc array. Use static per cpu data 2322
2072 * since the per cpu allocator is not available yet. 2323 /*
2073 */ 2324 * Must align to double word boundary for the double cmpxchg
2074 s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches); 2325 * instructions to work; see __pcpu_double_call_return_bool().
2075 else 2326 */
2076 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); 2327 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
2328 2 * sizeof(void *));
2077 2329
2078 if (!s->cpu_slab) 2330 if (!s->cpu_slab)
2079 return 0; 2331 return 0;
2080 2332
2333 init_kmem_cache_cpus(s);
2334
2081 return 1; 2335 return 1;
2082} 2336}
2083 2337
2084#ifdef CONFIG_NUMA 2338static struct kmem_cache *kmem_cache_node;
2339
2085/* 2340/*
2086 * No kmalloc_node yet so do it by hand. We know that this is the first 2341 * No kmalloc_node yet so do it by hand. We know that this is the first
2087 * slab on the node for this slabcache. There are no concurrent accesses 2342 * slab on the node for this slabcache. There are no concurrent accesses
@@ -2091,15 +2346,15 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2091 * when allocating for the kmalloc_node_cache. This is used for bootstrapping 2346 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
2092 * memory on a fresh node that has no slab structures yet. 2347 * memory on a fresh node that has no slab structures yet.
2093 */ 2348 */
2094static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) 2349static void early_kmem_cache_node_alloc(int node)
2095{ 2350{
2096 struct page *page; 2351 struct page *page;
2097 struct kmem_cache_node *n; 2352 struct kmem_cache_node *n;
2098 unsigned long flags; 2353 unsigned long flags;
2099 2354
2100 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 2355 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2101 2356
2102 page = new_slab(kmalloc_caches, gfpflags, node); 2357 page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
2103 2358
2104 BUG_ON(!page); 2359 BUG_ON(!page);
2105 if (page_to_nid(page) != node) { 2360 if (page_to_nid(page) != node) {
@@ -2111,15 +2366,15 @@ static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
2111 2366
2112 n = page->freelist; 2367 n = page->freelist;
2113 BUG_ON(!n); 2368 BUG_ON(!n);
2114 page->freelist = get_freepointer(kmalloc_caches, n); 2369 page->freelist = get_freepointer(kmem_cache_node, n);
2115 page->inuse++; 2370 page->inuse++;
2116 kmalloc_caches->node[node] = n; 2371 kmem_cache_node->node[node] = n;
2117#ifdef CONFIG_SLUB_DEBUG 2372#ifdef CONFIG_SLUB_DEBUG
2118 init_object(kmalloc_caches, n, 1); 2373 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2119 init_tracking(kmalloc_caches, n); 2374 init_tracking(kmem_cache_node, n);
2120#endif 2375#endif
2121 init_kmem_cache_node(n, kmalloc_caches); 2376 init_kmem_cache_node(n, kmem_cache_node);
2122 inc_slabs_node(kmalloc_caches, node, page->objects); 2377 inc_slabs_node(kmem_cache_node, node, page->objects);
2123 2378
2124 /* 2379 /*
2125 * lockdep requires consistent irq usage for each lock 2380 * lockdep requires consistent irq usage for each lock
@@ -2137,13 +2392,15 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
2137 2392
2138 for_each_node_state(node, N_NORMAL_MEMORY) { 2393 for_each_node_state(node, N_NORMAL_MEMORY) {
2139 struct kmem_cache_node *n = s->node[node]; 2394 struct kmem_cache_node *n = s->node[node];
2395
2140 if (n) 2396 if (n)
2141 kmem_cache_free(kmalloc_caches, n); 2397 kmem_cache_free(kmem_cache_node, n);
2398
2142 s->node[node] = NULL; 2399 s->node[node] = NULL;
2143 } 2400 }
2144} 2401}
2145 2402
2146static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2403static int init_kmem_cache_nodes(struct kmem_cache *s)
2147{ 2404{
2148 int node; 2405 int node;
2149 2406
@@ -2151,11 +2408,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2151 struct kmem_cache_node *n; 2408 struct kmem_cache_node *n;
2152 2409
2153 if (slab_state == DOWN) { 2410 if (slab_state == DOWN) {
2154 early_kmem_cache_node_alloc(gfpflags, node); 2411 early_kmem_cache_node_alloc(node);
2155 continue; 2412 continue;
2156 } 2413 }
2157 n = kmem_cache_alloc_node(kmalloc_caches, 2414 n = kmem_cache_alloc_node(kmem_cache_node,
2158 gfpflags, node); 2415 GFP_KERNEL, node);
2159 2416
2160 if (!n) { 2417 if (!n) {
2161 free_kmem_cache_nodes(s); 2418 free_kmem_cache_nodes(s);
@@ -2167,17 +2424,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2167 } 2424 }
2168 return 1; 2425 return 1;
2169} 2426}
2170#else
2171static void free_kmem_cache_nodes(struct kmem_cache *s)
2172{
2173}
2174
2175static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2176{
2177 init_kmem_cache_node(&s->local_node, s);
2178 return 1;
2179}
2180#endif
2181 2427
2182static void set_min_partial(struct kmem_cache *s, unsigned long min) 2428static void set_min_partial(struct kmem_cache *s, unsigned long min)
2183{ 2429{
@@ -2285,7 +2531,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2285 if (forced_order >= 0) 2531 if (forced_order >= 0)
2286 order = forced_order; 2532 order = forced_order;
2287 else 2533 else
2288 order = calculate_order(size); 2534 order = calculate_order(size, s->reserved);
2289 2535
2290 if (order < 0) 2536 if (order < 0)
2291 return 0; 2537 return 0;
@@ -2303,8 +2549,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2303 /* 2549 /*
2304 * Determine the number of objects per slab 2550 * Determine the number of objects per slab
2305 */ 2551 */
2306 s->oo = oo_make(order, size); 2552 s->oo = oo_make(order, size, s->reserved);
2307 s->min = oo_make(get_order(size), size); 2553 s->min = oo_make(get_order(size), size, s->reserved);
2308 if (oo_objects(s->oo) > oo_objects(s->max)) 2554 if (oo_objects(s->oo) > oo_objects(s->max))
2309 s->max = s->oo; 2555 s->max = s->oo;
2310 2556
@@ -2312,7 +2558,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2312 2558
2313} 2559}
2314 2560
2315static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2561static int kmem_cache_open(struct kmem_cache *s,
2316 const char *name, size_t size, 2562 const char *name, size_t size,
2317 size_t align, unsigned long flags, 2563 size_t align, unsigned long flags,
2318 void (*ctor)(void *)) 2564 void (*ctor)(void *))
@@ -2323,6 +2569,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2323 s->objsize = size; 2569 s->objsize = size;
2324 s->align = align; 2570 s->align = align;
2325 s->flags = kmem_cache_flags(size, flags, name, ctor); 2571 s->flags = kmem_cache_flags(size, flags, name, ctor);
2572 s->reserved = 0;
2573
2574 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
2575 s->reserved = sizeof(struct rcu_head);
2326 2576
2327 if (!calculate_sizes(s, -1)) 2577 if (!calculate_sizes(s, -1))
2328 goto error; 2578 goto error;
@@ -2348,10 +2598,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2348#ifdef CONFIG_NUMA 2598#ifdef CONFIG_NUMA
2349 s->remote_node_defrag_ratio = 1000; 2599 s->remote_node_defrag_ratio = 1000;
2350#endif 2600#endif
2351 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2601 if (!init_kmem_cache_nodes(s))
2352 goto error; 2602 goto error;
2353 2603
2354 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2604 if (alloc_kmem_cache_cpus(s))
2355 return 1; 2605 return 1;
2356 2606
2357 free_kmem_cache_nodes(s); 2607 free_kmem_cache_nodes(s);
@@ -2365,35 +2615,6 @@ error:
2365} 2615}
2366 2616
2367/* 2617/*
2368 * Check if a given pointer is valid
2369 */
2370int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2371{
2372 struct page *page;
2373
2374 if (!kern_ptr_validate(object, s->size))
2375 return 0;
2376
2377 page = get_object_page(object);
2378
2379 if (!page || s != page->slab)
2380 /* No slab or wrong slab */
2381 return 0;
2382
2383 if (!check_valid_pointer(s, page, object))
2384 return 0;
2385
2386 /*
2387 * We could also check if the object is on the slabs freelist.
2388 * But this would be too expensive and it seems that the main
2389 * purpose of kmem_ptr_valid() is to check if the object belongs
2390 * to a certain slab.
2391 */
2392 return 1;
2393}
2394EXPORT_SYMBOL(kmem_ptr_validate);
2395
2396/*
2397 * Determine the size of a slab object 2618 * Determine the size of a slab object
2398 */ 2619 */
2399unsigned int kmem_cache_size(struct kmem_cache *s) 2620unsigned int kmem_cache_size(struct kmem_cache *s)
@@ -2402,28 +2623,20 @@ unsigned int kmem_cache_size(struct kmem_cache *s)
2402} 2623}
2403EXPORT_SYMBOL(kmem_cache_size); 2624EXPORT_SYMBOL(kmem_cache_size);
2404 2625
2405const char *kmem_cache_name(struct kmem_cache *s)
2406{
2407 return s->name;
2408}
2409EXPORT_SYMBOL(kmem_cache_name);
2410
2411static void list_slab_objects(struct kmem_cache *s, struct page *page, 2626static void list_slab_objects(struct kmem_cache *s, struct page *page,
2412 const char *text) 2627 const char *text)
2413{ 2628{
2414#ifdef CONFIG_SLUB_DEBUG 2629#ifdef CONFIG_SLUB_DEBUG
2415 void *addr = page_address(page); 2630 void *addr = page_address(page);
2416 void *p; 2631 void *p;
2417 long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long), 2632 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
2418 GFP_ATOMIC); 2633 sizeof(long), GFP_ATOMIC);
2419
2420 if (!map) 2634 if (!map)
2421 return; 2635 return;
2422 slab_err(s, page, "%s", text); 2636 slab_err(s, page, "%s", text);
2423 slab_lock(page); 2637 slab_lock(page);
2424 for_each_free_object(p, s, page->freelist)
2425 set_bit(slab_index(p, s, addr), map);
2426 2638
2639 get_map(s, page, map);
2427 for_each_object(p, s, addr, page->objects) { 2640 for_each_object(p, s, addr, page->objects) {
2428 2641
2429 if (!test_bit(slab_index(p, s, addr), map)) { 2642 if (!test_bit(slab_index(p, s, addr), map)) {
@@ -2448,9 +2661,8 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2448 spin_lock_irqsave(&n->list_lock, flags); 2661 spin_lock_irqsave(&n->list_lock, flags);
2449 list_for_each_entry_safe(page, h, &n->partial, lru) { 2662 list_for_each_entry_safe(page, h, &n->partial, lru) {
2450 if (!page->inuse) { 2663 if (!page->inuse) {
2451 list_del(&page->lru); 2664 __remove_partial(n, page);
2452 discard_slab(s, page); 2665 discard_slab(s, page);
2453 n->nr_partial--;
2454 } else { 2666 } else {
2455 list_slab_objects(s, page, 2667 list_slab_objects(s, page,
2456 "Objects remaining on kmem_cache_close()"); 2668 "Objects remaining on kmem_cache_close()");
@@ -2507,9 +2719,15 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2507 * Kmalloc subsystem 2719 * Kmalloc subsystem
2508 *******************************************************************/ 2720 *******************************************************************/
2509 2721
2510struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; 2722struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
2511EXPORT_SYMBOL(kmalloc_caches); 2723EXPORT_SYMBOL(kmalloc_caches);
2512 2724
2725static struct kmem_cache *kmem_cache;
2726
2727#ifdef CONFIG_ZONE_DMA
2728static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
2729#endif
2730
2513static int __init setup_slub_min_order(char *str) 2731static int __init setup_slub_min_order(char *str)
2514{ 2732{
2515 get_option(&str, &slub_min_order); 2733 get_option(&str, &slub_min_order);
@@ -2546,116 +2764,29 @@ static int __init setup_slub_nomerge(char *str)
2546 2764
2547__setup("slub_nomerge", setup_slub_nomerge); 2765__setup("slub_nomerge", setup_slub_nomerge);
2548 2766
2549static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 2767static struct kmem_cache *__init create_kmalloc_cache(const char *name,
2550 const char *name, int size, gfp_t gfp_flags) 2768 int size, unsigned int flags)
2551{ 2769{
2552 unsigned int flags = 0; 2770 struct kmem_cache *s;
2553 2771
2554 if (gfp_flags & SLUB_DMA) 2772 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
2555 flags = SLAB_CACHE_DMA;
2556 2773
2557 /* 2774 /*
2558 * This function is called with IRQs disabled during early-boot on 2775 * This function is called with IRQs disabled during early-boot on
2559 * single CPU so there's no need to take slub_lock here. 2776 * single CPU so there's no need to take slub_lock here.
2560 */ 2777 */
2561 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2778 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
2562 flags, NULL)) 2779 flags, NULL))
2563 goto panic; 2780 goto panic;
2564 2781
2565 list_add(&s->list, &slab_caches); 2782 list_add(&s->list, &slab_caches);
2566
2567 if (sysfs_slab_add(s))
2568 goto panic;
2569 return s; 2783 return s;
2570 2784
2571panic: 2785panic:
2572 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 2786 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2787 return NULL;
2573} 2788}
2574 2789
2575#ifdef CONFIG_ZONE_DMA
2576static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
2577
2578static void sysfs_add_func(struct work_struct *w)
2579{
2580 struct kmem_cache *s;
2581
2582 down_write(&slub_lock);
2583 list_for_each_entry(s, &slab_caches, list) {
2584 if (s->flags & __SYSFS_ADD_DEFERRED) {
2585 s->flags &= ~__SYSFS_ADD_DEFERRED;
2586 sysfs_slab_add(s);
2587 }
2588 }
2589 up_write(&slub_lock);
2590}
2591
2592static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
2593
2594static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2595{
2596 struct kmem_cache *s;
2597 char *text;
2598 size_t realsize;
2599 unsigned long slabflags;
2600 int i;
2601
2602 s = kmalloc_caches_dma[index];
2603 if (s)
2604 return s;
2605
2606 /* Dynamically create dma cache */
2607 if (flags & __GFP_WAIT)
2608 down_write(&slub_lock);
2609 else {
2610 if (!down_write_trylock(&slub_lock))
2611 goto out;
2612 }
2613
2614 if (kmalloc_caches_dma[index])
2615 goto unlock_out;
2616
2617 realsize = kmalloc_caches[index].objsize;
2618 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2619 (unsigned int)realsize);
2620
2621 s = NULL;
2622 for (i = 0; i < KMALLOC_CACHES; i++)
2623 if (!kmalloc_caches[i].size)
2624 break;
2625
2626 BUG_ON(i >= KMALLOC_CACHES);
2627 s = kmalloc_caches + i;
2628
2629 /*
2630 * Must defer sysfs creation to a workqueue because we don't know
2631 * what context we are called from. Before sysfs comes up, we don't
2632 * need to do anything because our sysfs initcall will start by
2633 * adding all existing slabs to sysfs.
2634 */
2635 slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
2636 if (slab_state >= SYSFS)
2637 slabflags |= __SYSFS_ADD_DEFERRED;
2638
2639 if (!text || !kmem_cache_open(s, flags, text,
2640 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2641 s->size = 0;
2642 kfree(text);
2643 goto unlock_out;
2644 }
2645
2646 list_add(&s->list, &slab_caches);
2647 kmalloc_caches_dma[index] = s;
2648
2649 if (slab_state >= SYSFS)
2650 schedule_work(&sysfs_add_work);
2651
2652unlock_out:
2653 up_write(&slub_lock);
2654out:
2655 return kmalloc_caches_dma[index];
2656}
2657#endif
2658
2659/* 2790/*
2660 * Conversion table for small slabs sizes / 8 to the index in the 2791 * Conversion table for small slabs sizes / 8 to the index in the
2661 * kmalloc array. This is necessary for slabs < 192 since we have non power 2792 * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -2708,10 +2839,10 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2708 2839
2709#ifdef CONFIG_ZONE_DMA 2840#ifdef CONFIG_ZONE_DMA
2710 if (unlikely((flags & SLUB_DMA))) 2841 if (unlikely((flags & SLUB_DMA)))
2711 return dma_kmalloc_cache(index, flags); 2842 return kmalloc_dma_caches[index];
2712 2843
2713#endif 2844#endif
2714 return &kmalloc_caches[index]; 2845 return kmalloc_caches[index];
2715} 2846}
2716 2847
2717void *__kmalloc(size_t size, gfp_t flags) 2848void *__kmalloc(size_t size, gfp_t flags)
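For readers tracing the get_slab() change just above: the DMA variants are no longer created lazily, so the size-class lookup collapses into indexing one of two pre-populated arrays of cache pointers. Below is a minimal userspace sketch of that lookup pattern; the names (struct cache, lookup_cache, WANT_DMA, NUM_CLASSES) are illustrative stand-ins rather than the kernel API, and the power-of-two size-class mapping is simplified compared to SLUB's size_index table.

#include <stdio.h>
#include <stddef.h>

#define NUM_CLASSES 12
#define WANT_DMA    0x1u

struct cache {
	const char *name;
	size_t object_size;
};

static struct cache *normal_caches[NUM_CLASSES];
static struct cache *dma_caches[NUM_CLASSES];

/* Map an allocation size to a power-of-two size class (simplified). */
static int size_to_index(size_t size)
{
	int i;

	for (i = 3; i < NUM_CLASSES; i++)
		if (size <= ((size_t)1 << i))
			return i;
	return -1;
}

/* Analogue of the reworked get_slab(): a plain table lookup, no lazy creation. */
static struct cache *lookup_cache(size_t size, unsigned int flags)
{
	int index = size_to_index(size);

	if (index < 0)
		return NULL;
	if (flags & WANT_DMA)
		return dma_caches[index];
	return normal_caches[index];
}

int main(void)
{
	static struct cache k64 = { "kmalloc-64", 64 };
	struct cache *c;

	normal_caches[6] = &k64;	/* 2^6 == 64-byte class */
	c = lookup_cache(40, 0);	/* rounds up to the 64-byte class */
	printf("%s\n", c ? c->name : "(none)");
	return 0;
}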
@@ -2735,6 +2866,7 @@ void *__kmalloc(size_t size, gfp_t flags)
2735} 2866}
2736EXPORT_SYMBOL(__kmalloc); 2867EXPORT_SYMBOL(__kmalloc);
2737 2868
2869#ifdef CONFIG_NUMA
2738static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2870static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2739{ 2871{
2740 struct page *page; 2872 struct page *page;
@@ -2749,7 +2881,6 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2749 return ptr; 2881 return ptr;
2750} 2882}
2751 2883
2752#ifdef CONFIG_NUMA
2753void *__kmalloc_node(size_t size, gfp_t flags, int node) 2884void *__kmalloc_node(size_t size, gfp_t flags, int node)
2754{ 2885{
2755 struct kmem_cache *s; 2886 struct kmem_cache *s;
@@ -2782,7 +2913,6 @@ EXPORT_SYMBOL(__kmalloc_node);
2782size_t ksize(const void *object) 2913size_t ksize(const void *object)
2783{ 2914{
2784 struct page *page; 2915 struct page *page;
2785 struct kmem_cache *s;
2786 2916
2787 if (unlikely(object == ZERO_SIZE_PTR)) 2917 if (unlikely(object == ZERO_SIZE_PTR))
2788 return 0; 2918 return 0;
@@ -2793,28 +2923,8 @@ size_t ksize(const void *object)
2793 WARN_ON(!PageCompound(page)); 2923 WARN_ON(!PageCompound(page));
2794 return PAGE_SIZE << compound_order(page); 2924 return PAGE_SIZE << compound_order(page);
2795 } 2925 }
2796 s = page->slab;
2797 2926
2798#ifdef CONFIG_SLUB_DEBUG 2927 return slab_ksize(page->slab);
2799 /*
2800 * Debugging requires use of the padding between object
2801 * and whatever may come after it.
2802 */
2803 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2804 return s->objsize;
2805
2806#endif
2807 /*
2808 * If we have the need to store the freelist pointer
2809 * back there or track user information then we can
2810 * only use the space before that information.
2811 */
2812 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2813 return s->inuse;
2814 /*
2815 * Else we can use all the padding etc for the allocation
2816 */
2817 return s->size;
2818} 2928}
2819EXPORT_SYMBOL(ksize); 2929EXPORT_SYMBOL(ksize);
2820 2930
@@ -2889,8 +2999,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2889 * may have freed the last object and be 2999 * may have freed the last object and be
2890 * waiting to release the slab. 3000 * waiting to release the slab.
2891 */ 3001 */
2892 list_del(&page->lru); 3002 __remove_partial(n, page);
2893 n->nr_partial--;
2894 slab_unlock(page); 3003 slab_unlock(page);
2895 discard_slab(s, page); 3004 discard_slab(s, page);
2896 } else { 3005 } else {
@@ -2914,7 +3023,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2914} 3023}
2915EXPORT_SYMBOL(kmem_cache_shrink); 3024EXPORT_SYMBOL(kmem_cache_shrink);
2916 3025
2917#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) 3026#if defined(CONFIG_MEMORY_HOTPLUG)
2918static int slab_mem_going_offline_callback(void *arg) 3027static int slab_mem_going_offline_callback(void *arg)
2919{ 3028{
2920 struct kmem_cache *s; 3029 struct kmem_cache *s;
@@ -2956,7 +3065,7 @@ static void slab_mem_offline_callback(void *arg)
2956 BUG_ON(slabs_node(s, offline_node)); 3065 BUG_ON(slabs_node(s, offline_node));
2957 3066
2958 s->node[offline_node] = NULL; 3067 s->node[offline_node] = NULL;
2959 kmem_cache_free(kmalloc_caches, n); 3068 kmem_cache_free(kmem_cache_node, n);
2960 } 3069 }
2961 } 3070 }
2962 up_read(&slub_lock); 3071 up_read(&slub_lock);
@@ -2989,7 +3098,7 @@ static int slab_mem_going_online_callback(void *arg)
2989 * since memory is not yet available from the node that 3098 * since memory is not yet available from the node that
2990 * is brought up. 3099 * is brought up.
2991 */ 3100 */
2992 n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); 3101 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
2993 if (!n) { 3102 if (!n) {
2994 ret = -ENOMEM; 3103 ret = -ENOMEM;
2995 goto out; 3104 goto out;
@@ -3035,46 +3144,92 @@ static int slab_memory_callback(struct notifier_block *self,
3035 * Basic setup of slabs 3144 * Basic setup of slabs
3036 *******************************************************************/ 3145 *******************************************************************/
3037 3146
3147/*
3148 * Used for early kmem_cache structures that were allocated using
3149 * the page allocator
3150 */
3151
3152static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
3153{
3154 int node;
3155
3156 list_add(&s->list, &slab_caches);
3157 s->refcount = -1;
3158
3159 for_each_node_state(node, N_NORMAL_MEMORY) {
3160 struct kmem_cache_node *n = get_node(s, node);
3161 struct page *p;
3162
3163 if (n) {
3164 list_for_each_entry(p, &n->partial, lru)
3165 p->slab = s;
3166
3167#ifdef CONFIG_SLUB_DEBUG
3168 list_for_each_entry(p, &n->full, lru)
3169 p->slab = s;
3170#endif
3171 }
3172 }
3173}
3174
3038void __init kmem_cache_init(void) 3175void __init kmem_cache_init(void)
3039{ 3176{
3040 int i; 3177 int i;
3041 int caches = 0; 3178 int caches = 0;
3179 struct kmem_cache *temp_kmem_cache;
3180 int order;
3181 struct kmem_cache *temp_kmem_cache_node;
3182 unsigned long kmalloc_size;
3183
3184 kmem_size = offsetof(struct kmem_cache, node) +
3185 nr_node_ids * sizeof(struct kmem_cache_node *);
3186
3187 /* Allocate two kmem_caches from the page allocator */
3188 kmalloc_size = ALIGN(kmem_size, cache_line_size());
3189 order = get_order(2 * kmalloc_size);
3190 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
3042 3191
3043#ifdef CONFIG_NUMA
3044 /* 3192 /*
3045 * Must first have the slab cache available for the allocations of the 3193 * Must first have the slab cache available for the allocations of the
3046 * struct kmem_cache_node's. There is special bootstrap code in 3194 * struct kmem_cache_node's. There is special bootstrap code in
3047 * kmem_cache_open for slab_state == DOWN. 3195 * kmem_cache_open for slab_state == DOWN.
3048 */ 3196 */
3049 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 3197 kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3050 sizeof(struct kmem_cache_node), GFP_NOWAIT); 3198
3051 kmalloc_caches[0].refcount = -1; 3199 kmem_cache_open(kmem_cache_node, "kmem_cache_node",
3052 caches++; 3200 sizeof(struct kmem_cache_node),
3201 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3053 3202
3054 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3203 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3055#endif
3056 3204
3057 /* Able to allocate the per node structures */ 3205 /* Able to allocate the per node structures */
3058 slab_state = PARTIAL; 3206 slab_state = PARTIAL;
3059 3207
3060 /* Caches that are not of the two-to-the-power-of size */ 3208 temp_kmem_cache = kmem_cache;
3061 if (KMALLOC_MIN_SIZE <= 32) { 3209 kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
3062 create_kmalloc_cache(&kmalloc_caches[1], 3210 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3063 "kmalloc-96", 96, GFP_NOWAIT); 3211 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3064 caches++; 3212 memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3065 }
3066 if (KMALLOC_MIN_SIZE <= 64) {
3067 create_kmalloc_cache(&kmalloc_caches[2],
3068 "kmalloc-192", 192, GFP_NOWAIT);
3069 caches++;
3070 }
3071 3213
3072 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3214 /*
3073 create_kmalloc_cache(&kmalloc_caches[i], 3215 * Allocate kmem_cache_node properly from the kmem_cache slab.
3074 "kmalloc", 1 << i, GFP_NOWAIT); 3216 * kmem_cache_node is separately allocated so no need to
3075 caches++; 3217 * update any list pointers.
3076 } 3218 */
3219 temp_kmem_cache_node = kmem_cache_node;
3220
3221 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3222 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
3223
3224 kmem_cache_bootstrap_fixup(kmem_cache_node);
3225
3226 caches++;
3227 kmem_cache_bootstrap_fixup(kmem_cache);
3228 caches++;
3229 /* Free temporary boot structure */
3230 free_pages((unsigned long)temp_kmem_cache, order);
3077 3231
3232 /* Now we can use the kmem_cache to allocate kmalloc slabs */
3078 3233
3079 /* 3234 /*
3080 * Patch up the size_index table if we have strange large alignment 3235 * Patch up the size_index table if we have strange large alignment
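The kmem_cache_init() rework above resolves a chicken-and-egg problem: the descriptors for kmem_cache and kmem_cache_node are first carved out of raw pages, used to open the caches, and then replaced by copies allocated from the now-working kmem_cache before the temporary pages are freed. A rough userspace analogy of that sequence follows; struct allocator and alloc_from are invented names, and plain malloc stands in for the page allocator.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct allocator {
	const char *name;
	size_t object_size;
};

/* Stand-in for allocating one object from a working allocator. */
static void *alloc_from(const struct allocator *a)
{
	return malloc(a->object_size);
}

int main(void)
{
	/* Step 1: temporary descriptor in raw ("page allocator") memory. */
	struct allocator *tmp = malloc(sizeof(*tmp));
	struct allocator *real;

	if (!tmp)
		return 1;
	tmp->name = "allocator-cache";
	tmp->object_size = sizeof(struct allocator);

	/*
	 * Step 2: let the temporary descriptor allocate the permanent one,
	 * then copy the metadata across (the kernel additionally patches the
	 * page->slab back-pointers in kmem_cache_bootstrap_fixup()).
	 */
	real = alloc_from(tmp);
	if (!real)
		return 1;
	memcpy(real, tmp, sizeof(*real));

	/* Step 3: release the bootstrap memory; only "real" remains. */
	free(tmp);

	printf("%s manages %zu-byte objects\n", real->name, real->object_size);
	free(real);
	return 0;
}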
@@ -3114,26 +3269,60 @@ void __init kmem_cache_init(void)
3114 size_index[size_index_elem(i)] = 8; 3269 size_index[size_index_elem(i)] = 8;
3115 } 3270 }
3116 3271
3272 /* Caches that are not of the two-to-the-power-of size */
3273 if (KMALLOC_MIN_SIZE <= 32) {
3274 kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0);
3275 caches++;
3276 }
3277
3278 if (KMALLOC_MIN_SIZE <= 64) {
3279 kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0);
3280 caches++;
3281 }
3282
3283 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3284 kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0);
3285 caches++;
3286 }
3287
3117 slab_state = UP; 3288 slab_state = UP;
3118 3289
3119 /* Provide the correct kmalloc names now that the caches are up */ 3290 /* Provide the correct kmalloc names now that the caches are up */
3291 if (KMALLOC_MIN_SIZE <= 32) {
3292 kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT);
3293 BUG_ON(!kmalloc_caches[1]->name);
3294 }
3295
3296 if (KMALLOC_MIN_SIZE <= 64) {
3297 kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT);
3298 BUG_ON(!kmalloc_caches[2]->name);
3299 }
3300
3120 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3301 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3121 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); 3302 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
3122 3303
3123 BUG_ON(!s); 3304 BUG_ON(!s);
3124 kmalloc_caches[i].name = s; 3305 kmalloc_caches[i]->name = s;
3125 } 3306 }
3126 3307
3127#ifdef CONFIG_SMP 3308#ifdef CONFIG_SMP
3128 register_cpu_notifier(&slab_notifier); 3309 register_cpu_notifier(&slab_notifier);
3129#endif 3310#endif
3130#ifdef CONFIG_NUMA
3131 kmem_size = offsetof(struct kmem_cache, node) +
3132 nr_node_ids * sizeof(struct kmem_cache_node *);
3133#else
3134 kmem_size = sizeof(struct kmem_cache);
3135#endif
3136 3311
3312#ifdef CONFIG_ZONE_DMA
3313 for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
3314 struct kmem_cache *s = kmalloc_caches[i];
3315
3316 if (s && s->size) {
3317 char *name = kasprintf(GFP_NOWAIT,
3318 "dma-kmalloc-%d", s->objsize);
3319
3320 BUG_ON(!name);
3321 kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3322 s->objsize, SLAB_CACHE_DMA);
3323 }
3324 }
3325#endif
3137 printk(KERN_INFO 3326 printk(KERN_INFO
3138 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 3327 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
3139 " CPUs=%d, Nodes=%d\n", 3328 " CPUs=%d, Nodes=%d\n",
@@ -3211,6 +3400,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3211 size_t align, unsigned long flags, void (*ctor)(void *)) 3400 size_t align, unsigned long flags, void (*ctor)(void *))
3212{ 3401{
3213 struct kmem_cache *s; 3402 struct kmem_cache *s;
3403 char *n;
3214 3404
3215 if (WARN_ON(!name)) 3405 if (WARN_ON(!name))
3216 return NULL; 3406 return NULL;
@@ -3234,24 +3424,30 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3234 return s; 3424 return s;
3235 } 3425 }
3236 3426
3427 n = kstrdup(name, GFP_KERNEL);
3428 if (!n)
3429 goto err;
3430
3237 s = kmalloc(kmem_size, GFP_KERNEL); 3431 s = kmalloc(kmem_size, GFP_KERNEL);
3238 if (s) { 3432 if (s) {
3239 if (kmem_cache_open(s, GFP_KERNEL, name, 3433 if (kmem_cache_open(s, n,
3240 size, align, flags, ctor)) { 3434 size, align, flags, ctor)) {
3241 list_add(&s->list, &slab_caches); 3435 list_add(&s->list, &slab_caches);
3242 if (sysfs_slab_add(s)) { 3436 if (sysfs_slab_add(s)) {
3243 list_del(&s->list); 3437 list_del(&s->list);
3438 kfree(n);
3244 kfree(s); 3439 kfree(s);
3245 goto err; 3440 goto err;
3246 } 3441 }
3247 up_write(&slub_lock); 3442 up_write(&slub_lock);
3248 return s; 3443 return s;
3249 } 3444 }
3445 kfree(n);
3250 kfree(s); 3446 kfree(s);
3251 } 3447 }
3448err:
3252 up_write(&slub_lock); 3449 up_write(&slub_lock);
3253 3450
3254err:
3255 if (flags & SLAB_PANIC) 3451 if (flags & SLAB_PANIC)
3256 panic("Cannot create slabcache %s\n", name); 3452 panic("Cannot create slabcache %s\n", name);
3257 else 3453 else
@@ -3312,12 +3508,13 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3312 3508
3313 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); 3509 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
3314 3510
3315 /* Honor the call site pointer we recieved. */ 3511 /* Honor the call site pointer we received. */
3316 trace_kmalloc(caller, ret, size, s->size, gfpflags); 3512 trace_kmalloc(caller, ret, size, s->size, gfpflags);
3317 3513
3318 return ret; 3514 return ret;
3319} 3515}
3320 3516
3517#ifdef CONFIG_NUMA
3321void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3518void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3322 int node, unsigned long caller) 3519 int node, unsigned long caller)
3323{ 3520{
@@ -3341,13 +3538,14 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3341 3538
3342 ret = slab_alloc(s, gfpflags, node, caller); 3539 ret = slab_alloc(s, gfpflags, node, caller);
3343 3540
3344 /* Honor the call site pointer we recieved. */ 3541 /* Honor the call site pointer we received. */
3345 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 3542 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
3346 3543
3347 return ret; 3544 return ret;
3348} 3545}
3546#endif
3349 3547
3350#ifdef CONFIG_SLUB_DEBUG 3548#ifdef CONFIG_SYSFS
3351static int count_inuse(struct page *page) 3549static int count_inuse(struct page *page)
3352{ 3550{
3353 return page->inuse; 3551 return page->inuse;
@@ -3357,7 +3555,9 @@ static int count_total(struct page *page)
3357{ 3555{
3358 return page->objects; 3556 return page->objects;
3359} 3557}
3558#endif
3360 3559
3560#ifdef CONFIG_SLUB_DEBUG
3361static int validate_slab(struct kmem_cache *s, struct page *page, 3561static int validate_slab(struct kmem_cache *s, struct page *page,
3362 unsigned long *map) 3562 unsigned long *map)
3363{ 3563{
@@ -3371,15 +3571,16 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3371 /* Now we know that a valid freelist exists */ 3571 /* Now we know that a valid freelist exists */
3372 bitmap_zero(map, page->objects); 3572 bitmap_zero(map, page->objects);
3373 3573
3374 for_each_free_object(p, s, page->freelist) { 3574 get_map(s, page, map);
3375 set_bit(slab_index(p, s, addr), map); 3575 for_each_object(p, s, addr, page->objects) {
3376 if (!check_object(s, page, p, 0)) 3576 if (test_bit(slab_index(p, s, addr), map))
3377 return 0; 3577 if (!check_object(s, page, p, SLUB_RED_INACTIVE))
3578 return 0;
3378 } 3579 }
3379 3580
3380 for_each_object(p, s, addr, page->objects) 3581 for_each_object(p, s, addr, page->objects)
3381 if (!test_bit(slab_index(p, s, addr), map)) 3582 if (!test_bit(slab_index(p, s, addr), map))
3382 if (!check_object(s, page, p, 1)) 3583 if (!check_object(s, page, p, SLUB_RED_ACTIVE))
3383 return 0; 3584 return 0;
3384 return 1; 3585 return 1;
3385} 3586}
@@ -3448,65 +3649,6 @@ static long validate_slab_cache(struct kmem_cache *s)
3448 kfree(map); 3649 kfree(map);
3449 return count; 3650 return count;
3450} 3651}
3451
3452#ifdef SLUB_RESILIENCY_TEST
3453static void resiliency_test(void)
3454{
3455 u8 *p;
3456
3457 printk(KERN_ERR "SLUB resiliency testing\n");
3458 printk(KERN_ERR "-----------------------\n");
3459 printk(KERN_ERR "A. Corruption after allocation\n");
3460
3461 p = kzalloc(16, GFP_KERNEL);
3462 p[16] = 0x12;
3463 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3464 " 0x12->0x%p\n\n", p + 16);
3465
3466 validate_slab_cache(kmalloc_caches + 4);
3467
3468 /* Hmmm... The next two are dangerous */
3469 p = kzalloc(32, GFP_KERNEL);
3470 p[32 + sizeof(void *)] = 0x34;
3471 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3472 " 0x34 -> -0x%p\n", p);
3473 printk(KERN_ERR
3474 "If allocated object is overwritten then not detectable\n\n");
3475
3476 validate_slab_cache(kmalloc_caches + 5);
3477 p = kzalloc(64, GFP_KERNEL);
3478 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3479 *p = 0x56;
3480 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3481 p);
3482 printk(KERN_ERR
3483 "If allocated object is overwritten then not detectable\n\n");
3484 validate_slab_cache(kmalloc_caches + 6);
3485
3486 printk(KERN_ERR "\nB. Corruption after free\n");
3487 p = kzalloc(128, GFP_KERNEL);
3488 kfree(p);
3489 *p = 0x78;
3490 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3491 validate_slab_cache(kmalloc_caches + 7);
3492
3493 p = kzalloc(256, GFP_KERNEL);
3494 kfree(p);
3495 p[50] = 0x9a;
3496 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3497 p);
3498 validate_slab_cache(kmalloc_caches + 8);
3499
3500 p = kzalloc(512, GFP_KERNEL);
3501 kfree(p);
3502 p[512] = 0xab;
3503 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3504 validate_slab_cache(kmalloc_caches + 9);
3505}
3506#else
3507static void resiliency_test(void) {};
3508#endif
3509
3510/* 3652/*
3511 * Generate lists of code addresses where slabcache objects are allocated 3653 * Generate lists of code addresses where slabcache objects are allocated
3512 * and freed. 3654 * and freed.
@@ -3635,14 +3777,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
3635 3777
3636static void process_slab(struct loc_track *t, struct kmem_cache *s, 3778static void process_slab(struct loc_track *t, struct kmem_cache *s,
3637 struct page *page, enum track_item alloc, 3779 struct page *page, enum track_item alloc,
3638 long *map) 3780 unsigned long *map)
3639{ 3781{
3640 void *addr = page_address(page); 3782 void *addr = page_address(page);
3641 void *p; 3783 void *p;
3642 3784
3643 bitmap_zero(map, page->objects); 3785 bitmap_zero(map, page->objects);
3644 for_each_free_object(p, s, page->freelist) 3786 get_map(s, page, map);
3645 set_bit(slab_index(p, s, addr), map);
3646 3787
3647 for_each_object(p, s, addr, page->objects) 3788 for_each_object(p, s, addr, page->objects)
3648 if (!test_bit(slab_index(p, s, addr), map)) 3789 if (!test_bit(slab_index(p, s, addr), map))
@@ -3691,7 +3832,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3691 len += sprintf(buf + len, "%7ld ", l->count); 3832 len += sprintf(buf + len, "%7ld ", l->count);
3692 3833
3693 if (l->addr) 3834 if (l->addr)
3694 len += sprint_symbol(buf + len, (unsigned long)l->addr); 3835 len += sprintf(buf + len, "%pS", (void *)l->addr);
3695 else 3836 else
3696 len += sprintf(buf + len, "<not-available>"); 3837 len += sprintf(buf + len, "<not-available>");
3697 3838
@@ -3735,7 +3876,71 @@ static int list_locations(struct kmem_cache *s, char *buf,
3735 len += sprintf(buf, "No data\n"); 3876 len += sprintf(buf, "No data\n");
3736 return len; 3877 return len;
3737} 3878}
3879#endif
3880
3881#ifdef SLUB_RESILIENCY_TEST
3882static void resiliency_test(void)
3883{
3884 u8 *p;
3885
3886 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10);
3887
3888 printk(KERN_ERR "SLUB resiliency testing\n");
3889 printk(KERN_ERR "-----------------------\n");
3890 printk(KERN_ERR "A. Corruption after allocation\n");
3891
3892 p = kzalloc(16, GFP_KERNEL);
3893 p[16] = 0x12;
3894 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3895 " 0x12->0x%p\n\n", p + 16);
3896
3897 validate_slab_cache(kmalloc_caches[4]);
3898
3899 /* Hmmm... The next two are dangerous */
3900 p = kzalloc(32, GFP_KERNEL);
3901 p[32 + sizeof(void *)] = 0x34;
3902 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3903 " 0x34 -> -0x%p\n", p);
3904 printk(KERN_ERR
3905 "If allocated object is overwritten then not detectable\n\n");
3906
3907 validate_slab_cache(kmalloc_caches[5]);
3908 p = kzalloc(64, GFP_KERNEL);
3909 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3910 *p = 0x56;
3911 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3912 p);
3913 printk(KERN_ERR
3914 "If allocated object is overwritten then not detectable\n\n");
3915 validate_slab_cache(kmalloc_caches[6]);
3916
3917 printk(KERN_ERR "\nB. Corruption after free\n");
3918 p = kzalloc(128, GFP_KERNEL);
3919 kfree(p);
3920 *p = 0x78;
3921 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3922 validate_slab_cache(kmalloc_caches[7]);
3923
3924 p = kzalloc(256, GFP_KERNEL);
3925 kfree(p);
3926 p[50] = 0x9a;
3927 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3928 p);
3929 validate_slab_cache(kmalloc_caches[8]);
3930
3931 p = kzalloc(512, GFP_KERNEL);
3932 kfree(p);
3933 p[512] = 0xab;
3934 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3935 validate_slab_cache(kmalloc_caches[9]);
3936}
3937#else
3938#ifdef CONFIG_SYSFS
3939static void resiliency_test(void) {};
3940#endif
3941#endif
3738 3942
3943#ifdef CONFIG_SYSFS
3739enum slab_stat_type { 3944enum slab_stat_type {
3740 SL_ALL, /* All slabs */ 3945 SL_ALL, /* All slabs */
3741 SL_PARTIAL, /* Only partially allocated slabs */ 3946 SL_PARTIAL, /* Only partially allocated slabs */
@@ -3788,6 +3993,8 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3788 } 3993 }
3789 } 3994 }
3790 3995
3996 lock_memory_hotplug();
3997#ifdef CONFIG_SLUB_DEBUG
3791 if (flags & SO_ALL) { 3998 if (flags & SO_ALL) {
3792 for_each_node_state(node, N_NORMAL_MEMORY) { 3999 for_each_node_state(node, N_NORMAL_MEMORY) {
3793 struct kmem_cache_node *n = get_node(s, node); 4000 struct kmem_cache_node *n = get_node(s, node);
@@ -3804,7 +4011,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3804 nodes[node] += x; 4011 nodes[node] += x;
3805 } 4012 }
3806 4013
3807 } else if (flags & SO_PARTIAL) { 4014 } else
4015#endif
4016 if (flags & SO_PARTIAL) {
3808 for_each_node_state(node, N_NORMAL_MEMORY) { 4017 for_each_node_state(node, N_NORMAL_MEMORY) {
3809 struct kmem_cache_node *n = get_node(s, node); 4018 struct kmem_cache_node *n = get_node(s, node);
3810 4019
@@ -3825,10 +4034,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3825 x += sprintf(buf + x, " N%d=%lu", 4034 x += sprintf(buf + x, " N%d=%lu",
3826 node, nodes[node]); 4035 node, nodes[node]);
3827#endif 4036#endif
4037 unlock_memory_hotplug();
3828 kfree(nodes); 4038 kfree(nodes);
3829 return x + sprintf(buf + x, "\n"); 4039 return x + sprintf(buf + x, "\n");
3830} 4040}
3831 4041
4042#ifdef CONFIG_SLUB_DEBUG
3832static int any_slab_objects(struct kmem_cache *s) 4043static int any_slab_objects(struct kmem_cache *s)
3833{ 4044{
3834 int node; 4045 int node;
@@ -3844,6 +4055,7 @@ static int any_slab_objects(struct kmem_cache *s)
3844 } 4055 }
3845 return 0; 4056 return 0;
3846} 4057}
4058#endif
3847 4059
3848#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4060#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3849#define to_slab(n) container_of(n, struct kmem_cache, kobj); 4061#define to_slab(n) container_of(n, struct kmem_cache, kobj);
@@ -3930,12 +4142,9 @@ SLAB_ATTR(min_partial);
3930 4142
3931static ssize_t ctor_show(struct kmem_cache *s, char *buf) 4143static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3932{ 4144{
3933 if (s->ctor) { 4145 if (!s->ctor)
3934 int n = sprint_symbol(buf, (unsigned long)s->ctor); 4146 return 0;
3935 4147 return sprintf(buf, "%pS\n", s->ctor);
3936 return n + sprintf(buf + n, "\n");
3937 }
3938 return 0;
3939} 4148}
3940SLAB_ATTR_RO(ctor); 4149SLAB_ATTR_RO(ctor);
3941 4150
@@ -3945,12 +4154,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3945} 4154}
3946SLAB_ATTR_RO(aliases); 4155SLAB_ATTR_RO(aliases);
3947 4156
3948static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3949{
3950 return show_slab_objects(s, buf, SO_ALL);
3951}
3952SLAB_ATTR_RO(slabs);
3953
3954static ssize_t partial_show(struct kmem_cache *s, char *buf) 4157static ssize_t partial_show(struct kmem_cache *s, char *buf)
3955{ 4158{
3956 return show_slab_objects(s, buf, SO_PARTIAL); 4159 return show_slab_objects(s, buf, SO_PARTIAL);
@@ -3975,93 +4178,89 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
3975} 4178}
3976SLAB_ATTR_RO(objects_partial); 4179SLAB_ATTR_RO(objects_partial);
3977 4180
3978static ssize_t total_objects_show(struct kmem_cache *s, char *buf) 4181static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3979{
3980 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
3981}
3982SLAB_ATTR_RO(total_objects);
3983
3984static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
3985{ 4182{
3986 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4183 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3987} 4184}
3988 4185
3989static ssize_t sanity_checks_store(struct kmem_cache *s, 4186static ssize_t reclaim_account_store(struct kmem_cache *s,
3990 const char *buf, size_t length) 4187 const char *buf, size_t length)
3991{ 4188{
3992 s->flags &= ~SLAB_DEBUG_FREE; 4189 s->flags &= ~SLAB_RECLAIM_ACCOUNT;
3993 if (buf[0] == '1') 4190 if (buf[0] == '1')
3994 s->flags |= SLAB_DEBUG_FREE; 4191 s->flags |= SLAB_RECLAIM_ACCOUNT;
3995 return length; 4192 return length;
3996} 4193}
3997SLAB_ATTR(sanity_checks); 4194SLAB_ATTR(reclaim_account);
3998 4195
3999static ssize_t trace_show(struct kmem_cache *s, char *buf) 4196static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
4000{ 4197{
4001 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 4198 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
4002} 4199}
4200SLAB_ATTR_RO(hwcache_align);
4003 4201
4004static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4202#ifdef CONFIG_ZONE_DMA
4005 size_t length) 4203static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4006{ 4204{
4007 s->flags &= ~SLAB_TRACE; 4205 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
4008 if (buf[0] == '1')
4009 s->flags |= SLAB_TRACE;
4010 return length;
4011} 4206}
4012SLAB_ATTR(trace); 4207SLAB_ATTR_RO(cache_dma);
4208#endif
4013 4209
4014#ifdef CONFIG_FAILSLAB 4210static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4015static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4016{ 4211{
4017 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); 4212 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
4018} 4213}
4214SLAB_ATTR_RO(destroy_by_rcu);
4019 4215
4020static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4216static ssize_t reserved_show(struct kmem_cache *s, char *buf)
4021 size_t length)
4022{ 4217{
4023 s->flags &= ~SLAB_FAILSLAB; 4218 return sprintf(buf, "%d\n", s->reserved);
4024 if (buf[0] == '1')
4025 s->flags |= SLAB_FAILSLAB;
4026 return length;
4027} 4219}
4028SLAB_ATTR(failslab); 4220SLAB_ATTR_RO(reserved);
4029#endif
4030 4221
4031static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4222#ifdef CONFIG_SLUB_DEBUG
4223static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4032{ 4224{
4033 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4225 return show_slab_objects(s, buf, SO_ALL);
4034} 4226}
4227SLAB_ATTR_RO(slabs);
4035 4228
4036static ssize_t reclaim_account_store(struct kmem_cache *s, 4229static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
4037 const char *buf, size_t length)
4038{ 4230{
4039 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 4231 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
4040 if (buf[0] == '1')
4041 s->flags |= SLAB_RECLAIM_ACCOUNT;
4042 return length;
4043} 4232}
4044SLAB_ATTR(reclaim_account); 4233SLAB_ATTR_RO(total_objects);
4045 4234
4046static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 4235static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4047{ 4236{
4048 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 4237 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
4049} 4238}
4050SLAB_ATTR_RO(hwcache_align);
4051 4239
4052#ifdef CONFIG_ZONE_DMA 4240static ssize_t sanity_checks_store(struct kmem_cache *s,
4053static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 4241 const char *buf, size_t length)
4054{ 4242{
4055 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 4243 s->flags &= ~SLAB_DEBUG_FREE;
4244 if (buf[0] == '1')
4245 s->flags |= SLAB_DEBUG_FREE;
4246 return length;
4056} 4247}
4057SLAB_ATTR_RO(cache_dma); 4248SLAB_ATTR(sanity_checks);
4058#endif
4059 4249
4060static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 4250static ssize_t trace_show(struct kmem_cache *s, char *buf)
4061{ 4251{
4062 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 4252 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
4063} 4253}
4064SLAB_ATTR_RO(destroy_by_rcu); 4254
4255static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4256 size_t length)
4257{
4258 s->flags &= ~SLAB_TRACE;
4259 if (buf[0] == '1')
4260 s->flags |= SLAB_TRACE;
4261 return length;
4262}
4263SLAB_ATTR(trace);
4065 4264
4066static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 4265static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
4067{ 4266{
@@ -4139,6 +4338,40 @@ static ssize_t validate_store(struct kmem_cache *s,
4139} 4338}
4140SLAB_ATTR(validate); 4339SLAB_ATTR(validate);
4141 4340
4341static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4342{
4343 if (!(s->flags & SLAB_STORE_USER))
4344 return -ENOSYS;
4345 return list_locations(s, buf, TRACK_ALLOC);
4346}
4347SLAB_ATTR_RO(alloc_calls);
4348
4349static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4350{
4351 if (!(s->flags & SLAB_STORE_USER))
4352 return -ENOSYS;
4353 return list_locations(s, buf, TRACK_FREE);
4354}
4355SLAB_ATTR_RO(free_calls);
4356#endif /* CONFIG_SLUB_DEBUG */
4357
4358#ifdef CONFIG_FAILSLAB
4359static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4360{
4361 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4362}
4363
4364static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4365 size_t length)
4366{
4367 s->flags &= ~SLAB_FAILSLAB;
4368 if (buf[0] == '1')
4369 s->flags |= SLAB_FAILSLAB;
4370 return length;
4371}
4372SLAB_ATTR(failslab);
4373#endif
4374
4142static ssize_t shrink_show(struct kmem_cache *s, char *buf) 4375static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4143{ 4376{
4144 return 0; 4377 return 0;
@@ -4158,22 +4391,6 @@ static ssize_t shrink_store(struct kmem_cache *s,
4158} 4391}
4159SLAB_ATTR(shrink); 4392SLAB_ATTR(shrink);
4160 4393
4161static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4162{
4163 if (!(s->flags & SLAB_STORE_USER))
4164 return -ENOSYS;
4165 return list_locations(s, buf, TRACK_ALLOC);
4166}
4167SLAB_ATTR_RO(alloc_calls);
4168
4169static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4170{
4171 if (!(s->flags & SLAB_STORE_USER))
4172 return -ENOSYS;
4173 return list_locations(s, buf, TRACK_FREE);
4174}
4175SLAB_ATTR_RO(free_calls);
4176
4177#ifdef CONFIG_NUMA 4394#ifdef CONFIG_NUMA
4178static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 4395static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
4179{ 4396{
@@ -4279,25 +4496,28 @@ static struct attribute *slab_attrs[] = {
4279 &min_partial_attr.attr, 4496 &min_partial_attr.attr,
4280 &objects_attr.attr, 4497 &objects_attr.attr,
4281 &objects_partial_attr.attr, 4498 &objects_partial_attr.attr,
4282 &total_objects_attr.attr,
4283 &slabs_attr.attr,
4284 &partial_attr.attr, 4499 &partial_attr.attr,
4285 &cpu_slabs_attr.attr, 4500 &cpu_slabs_attr.attr,
4286 &ctor_attr.attr, 4501 &ctor_attr.attr,
4287 &aliases_attr.attr, 4502 &aliases_attr.attr,
4288 &align_attr.attr, 4503 &align_attr.attr,
4289 &sanity_checks_attr.attr,
4290 &trace_attr.attr,
4291 &hwcache_align_attr.attr, 4504 &hwcache_align_attr.attr,
4292 &reclaim_account_attr.attr, 4505 &reclaim_account_attr.attr,
4293 &destroy_by_rcu_attr.attr, 4506 &destroy_by_rcu_attr.attr,
4507 &shrink_attr.attr,
4508 &reserved_attr.attr,
4509#ifdef CONFIG_SLUB_DEBUG
4510 &total_objects_attr.attr,
4511 &slabs_attr.attr,
4512 &sanity_checks_attr.attr,
4513 &trace_attr.attr,
4294 &red_zone_attr.attr, 4514 &red_zone_attr.attr,
4295 &poison_attr.attr, 4515 &poison_attr.attr,
4296 &store_user_attr.attr, 4516 &store_user_attr.attr,
4297 &validate_attr.attr, 4517 &validate_attr.attr,
4298 &shrink_attr.attr,
4299 &alloc_calls_attr.attr, 4518 &alloc_calls_attr.attr,
4300 &free_calls_attr.attr, 4519 &free_calls_attr.attr,
4520#endif
4301#ifdef CONFIG_ZONE_DMA 4521#ifdef CONFIG_ZONE_DMA
4302 &cache_dma_attr.attr, 4522 &cache_dma_attr.attr,
4303#endif 4523#endif
@@ -4377,6 +4597,7 @@ static void kmem_cache_release(struct kobject *kobj)
4377{ 4597{
4378 struct kmem_cache *s = to_slab(kobj); 4598 struct kmem_cache *s = to_slab(kobj);
4379 4599
4600 kfree(s->name);
4380 kfree(s); 4601 kfree(s);
4381} 4602}
4382 4603
@@ -4579,7 +4800,7 @@ static int __init slab_sysfs_init(void)
4579} 4800}
4580 4801
4581__initcall(slab_sysfs_init); 4802__initcall(slab_sysfs_init);
4582#endif 4803#endif /* CONFIG_SYSFS */
4583 4804
4584/* 4805/*
4585 * The /proc/slabinfo ABI 4806 * The /proc/slabinfo ABI
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index aa33fd67fa41..64b984091edb 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * However, virtual mappings need a page table and TLBs. Many Linux 10 * However, virtual mappings need a page table and TLBs. Many Linux
11 * architectures already map their physical space using 1-1 mappings 11 * architectures already map their physical space using 1-1 mappings
12 * via TLBs. For those arches the virtual memmory map is essentially 12 * via TLBs. For those arches the virtual memory map is essentially
13 * for free if we use the same page size as the 1-1 mappings. In that 13 * for free if we use the same page size as the 1-1 mappings. In that
14 * case the overhead consists of a few additional pages that are 14 * case the overhead consists of a few additional pages that are
15 * allocated to create a view of memory for vmemmap. 15 * allocated to create a view of memory for vmemmap.
@@ -220,18 +220,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
220 220
221 if (vmemmap_buf_start) { 221 if (vmemmap_buf_start) {
222 /* need to free left buf */ 222 /* need to free left buf */
223#ifdef CONFIG_NO_BOOTMEM
224 free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
225 if (vmemmap_buf_start < vmemmap_buf) {
226 char name[15];
227
228 snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
229 reserve_early_without_check(__pa(vmemmap_buf_start),
230 __pa(vmemmap_buf), name);
231 }
232#else
233 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); 223 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
234#endif
235 vmemmap_buf = NULL; 224 vmemmap_buf = NULL;
236 vmemmap_buf_end = NULL; 225 vmemmap_buf_end = NULL;
237 } 226 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 95ac219af379..aa64b12831a2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -500,7 +500,7 @@ void __init sparse_init(void)
500 * so alloc 2M (with 2M align) and 24 bytes in turn will 500 * so alloc 2M (with 2M align) and 24 bytes in turn will
501 * make next 2M slip to one more 2M later. 501 * make next 2M slip to one more 2M later.
502 * then in big system, the memory will have a lot of holes... 502 * then in big system, the memory will have a lot of holes...
503 * here try to allocate 2M pages continously. 503 * here try to allocate 2M pages continuously.
504 * 504 *
505 * powerpc need to call sparse_init_one_section right after each 505 * powerpc need to call sparse_init_one_section right after each
506 * sparse_early_mem_map_alloc, so allocate usemap_map at first. 506 * sparse_early_mem_map_alloc, so allocate usemap_map at first.
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
671static void free_map_bootmem(struct page *page, unsigned long nr_pages) 671static void free_map_bootmem(struct page *page, unsigned long nr_pages)
672{ 672{
673 unsigned long maps_section_nr, removing_section_nr, i; 673 unsigned long maps_section_nr, removing_section_nr, i;
674 int magic; 674 unsigned long magic;
675 675
676 for (i = 0; i < nr_pages; i++, page++) { 676 for (i = 0; i < nr_pages; i++, page++) {
677 magic = atomic_read(&page->_mapcount); 677 magic = (unsigned long) page->lru.next;
678 678
679 BUG_ON(magic == NODE_INFO); 679 BUG_ON(magic == NODE_INFO);
680 680
diff --git a/mm/swap.c b/mm/swap.c
index 3ce7bc373a52..3a442f18b0b3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,6 +39,7 @@ int page_cluster;
39 39
40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); 40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
42static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
42 43
43/* 44/*
44 * This path almost never happens for VM activity - pages are normally 45 * This path almost never happens for VM activity - pages are normally
@@ -56,17 +57,97 @@ static void __page_cache_release(struct page *page)
56 del_page_from_lru(zone, page); 57 del_page_from_lru(zone, page);
57 spin_unlock_irqrestore(&zone->lru_lock, flags); 58 spin_unlock_irqrestore(&zone->lru_lock, flags);
58 } 59 }
60}
61
62static void __put_single_page(struct page *page)
63{
64 __page_cache_release(page);
59 free_hot_cold_page(page, 0); 65 free_hot_cold_page(page, 0);
60} 66}
61 67
62static void put_compound_page(struct page *page) 68static void __put_compound_page(struct page *page)
63{ 69{
64 page = compound_head(page); 70 compound_page_dtor *dtor;
65 if (put_page_testzero(page)) {
66 compound_page_dtor *dtor;
67 71
68 dtor = get_compound_page_dtor(page); 72 __page_cache_release(page);
69 (*dtor)(page); 73 dtor = get_compound_page_dtor(page);
74 (*dtor)(page);
75}
76
77static void put_compound_page(struct page *page)
78{
79 if (unlikely(PageTail(page))) {
80 /* __split_huge_page_refcount can run under us */
81 struct page *page_head = page->first_page;
82 smp_rmb();
83 /*
84 * If PageTail is still set after smp_rmb() we can be sure
85 * that the page->first_page we read wasn't a dangling pointer.
86 * See __split_huge_page_refcount() smp_wmb().
87 */
88 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
89 unsigned long flags;
90 /*
91 * Verify that our page_head wasn't converted
92 * to a regular page before we got a
93 * reference on it.
94 */
95 if (unlikely(!PageHead(page_head))) {
96 /* PageHead is cleared after PageTail */
97 smp_rmb();
98 VM_BUG_ON(PageTail(page));
99 goto out_put_head;
100 }
101 /*
102 * Only run compound_lock on a valid PageHead,
103 * after having it pinned with
104 * get_page_unless_zero() above.
105 */
106 smp_mb();
107 /* page_head wasn't a dangling pointer */
108 flags = compound_lock_irqsave(page_head);
109 if (unlikely(!PageTail(page))) {
110 /* __split_huge_page_refcount run before us */
111 compound_unlock_irqrestore(page_head, flags);
112 VM_BUG_ON(PageHead(page_head));
113 out_put_head:
114 if (put_page_testzero(page_head))
115 __put_single_page(page_head);
116 out_put_single:
117 if (put_page_testzero(page))
118 __put_single_page(page);
119 return;
120 }
121 VM_BUG_ON(page_head != page->first_page);
122 /*
123 * We can release the refcount taken by
124 * get_page_unless_zero now that
125 * split_huge_page_refcount is blocked on the
126 * compound_lock.
127 */
128 if (put_page_testzero(page_head))
129 VM_BUG_ON(1);
130 /* __split_huge_page_refcount will wait now */
131 VM_BUG_ON(atomic_read(&page->_count) <= 0);
132 atomic_dec(&page->_count);
133 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
134 compound_unlock_irqrestore(page_head, flags);
135 if (put_page_testzero(page_head)) {
136 if (PageHead(page_head))
137 __put_compound_page(page_head);
138 else
139 __put_single_page(page_head);
140 }
141 } else {
142 /* page_head is a dangling pointer */
143 VM_BUG_ON(PageTail(page));
144 goto out_put_single;
145 }
146 } else if (put_page_testzero(page)) {
147 if (PageHead(page))
148 __put_compound_page(page);
149 else
150 __put_single_page(page);
70 } 151 }
71} 152}
72 153
@@ -75,7 +156,7 @@ void put_page(struct page *page)
75 if (unlikely(PageCompound(page))) 156 if (unlikely(PageCompound(page)))
76 put_compound_page(page); 157 put_compound_page(page);
77 else if (put_page_testzero(page)) 158 else if (put_page_testzero(page))
78 __page_cache_release(page); 159 __put_single_page(page);
79} 160}
80EXPORT_SYMBOL(put_page); 161EXPORT_SYMBOL(put_page);
81 162
@@ -98,15 +179,13 @@ void put_pages_list(struct list_head *pages)
98} 179}
99EXPORT_SYMBOL(put_pages_list); 180EXPORT_SYMBOL(put_pages_list);
100 181
101/* 182static void pagevec_lru_move_fn(struct pagevec *pvec,
102 * pagevec_move_tail() must be called with IRQ disabled. 183 void (*move_fn)(struct page *page, void *arg),
103 * Otherwise this may cause nasty races. 184 void *arg)
104 */
105static void pagevec_move_tail(struct pagevec *pvec)
106{ 185{
107 int i; 186 int i;
108 int pgmoved = 0;
109 struct zone *zone = NULL; 187 struct zone *zone = NULL;
188 unsigned long flags = 0;
110 189
111 for (i = 0; i < pagevec_count(pvec); i++) { 190 for (i = 0; i < pagevec_count(pvec); i++) {
112 struct page *page = pvec->pages[i]; 191 struct page *page = pvec->pages[i];
@@ -114,29 +193,50 @@ static void pagevec_move_tail(struct pagevec *pvec)
114 193
115 if (pagezone != zone) { 194 if (pagezone != zone) {
116 if (zone) 195 if (zone)
117 spin_unlock(&zone->lru_lock); 196 spin_unlock_irqrestore(&zone->lru_lock, flags);
118 zone = pagezone; 197 zone = pagezone;
119 spin_lock(&zone->lru_lock); 198 spin_lock_irqsave(&zone->lru_lock, flags);
120 }
121 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
122 int lru = page_lru_base_type(page);
123 list_move_tail(&page->lru, &zone->lru[lru].list);
124 pgmoved++;
125 } 199 }
200
201 (*move_fn)(page, arg);
126 } 202 }
127 if (zone) 203 if (zone)
128 spin_unlock(&zone->lru_lock); 204 spin_unlock_irqrestore(&zone->lru_lock, flags);
129 __count_vm_events(PGROTATED, pgmoved);
130 release_pages(pvec->pages, pvec->nr, pvec->cold); 205 release_pages(pvec->pages, pvec->nr, pvec->cold);
131 pagevec_reinit(pvec); 206 pagevec_reinit(pvec);
132} 207}
133 208
209static void pagevec_move_tail_fn(struct page *page, void *arg)
210{
211 int *pgmoved = arg;
212 struct zone *zone = page_zone(page);
213
214 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
215 enum lru_list lru = page_lru_base_type(page);
216 list_move_tail(&page->lru, &zone->lru[lru].list);
217 mem_cgroup_rotate_reclaimable_page(page);
218 (*pgmoved)++;
219 }
220}
221
222/*
223 * pagevec_move_tail() must be called with IRQ disabled.
224 * Otherwise this may cause nasty races.
225 */
226static void pagevec_move_tail(struct pagevec *pvec)
227{
228 int pgmoved = 0;
229
230 pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
231 __count_vm_events(PGROTATED, pgmoved);
232}
233
134/* 234/*
135 * Writeback is about to end against a page which has been marked for immediate 235 * Writeback is about to end against a page which has been marked for immediate
136 * reclaim. If it still appears to be reclaimable, move it to the tail of the 236 * reclaim. If it still appears to be reclaimable, move it to the tail of the
137 * inactive list. 237 * inactive list.
138 */ 238 */
139void rotate_reclaimable_page(struct page *page) 239void rotate_reclaimable_page(struct page *page)
140{ 240{
141 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 241 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
142 !PageUnevictable(page) && PageLRU(page)) { 242 !PageUnevictable(page) && PageLRU(page)) {
@@ -172,14 +272,10 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page,
172 memcg_reclaim_stat->recent_rotated[file]++; 272 memcg_reclaim_stat->recent_rotated[file]++;
173} 273}
174 274
175/* 275static void __activate_page(struct page *page, void *arg)
176 * FIXME: speed this up?
177 */
178void activate_page(struct page *page)
179{ 276{
180 struct zone *zone = page_zone(page); 277 struct zone *zone = page_zone(page);
181 278
182 spin_lock_irq(&zone->lru_lock);
183 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 279 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
184 int file = page_is_file_cache(page); 280 int file = page_is_file_cache(page);
185 int lru = page_lru_base_type(page); 281 int lru = page_lru_base_type(page);
@@ -192,8 +288,45 @@ void activate_page(struct page *page)
192 288
193 update_page_reclaim_stat(zone, page, file, 1); 289 update_page_reclaim_stat(zone, page, file, 1);
194 } 290 }
291}
292
293#ifdef CONFIG_SMP
294static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
295
296static void activate_page_drain(int cpu)
297{
298 struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
299
300 if (pagevec_count(pvec))
301 pagevec_lru_move_fn(pvec, __activate_page, NULL);
302}
303
304void activate_page(struct page *page)
305{
306 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
307 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
308
309 page_cache_get(page);
310 if (!pagevec_add(pvec, page))
311 pagevec_lru_move_fn(pvec, __activate_page, NULL);
312 put_cpu_var(activate_page_pvecs);
313 }
314}
315
316#else
317static inline void activate_page_drain(int cpu)
318{
319}
320
321void activate_page(struct page *page)
322{
323 struct zone *zone = page_zone(page);
324
325 spin_lock_irq(&zone->lru_lock);
326 __activate_page(page, NULL);
195 spin_unlock_irq(&zone->lru_lock); 327 spin_unlock_irq(&zone->lru_lock);
196} 328}
329#endif
197 330
198/* 331/*
199 * Mark a page as having seen activity. 332 * Mark a page as having seen activity.
@@ -267,6 +400,74 @@ void add_page_to_unevictable_list(struct page *page)
267} 400}
268 401
269/* 402/*
403 * If the page can not be invalidated, it is moved to the
404 * inactive list to speed up its reclaim. It is moved to the
405 * head of the list, rather than the tail, to give the flusher
406 * threads some time to write it out, as this is much more
407 * effective than the single-page writeout from reclaim.
408 *
409 * If the page isn't page_mapped and is dirty/writeback, the page
410 * can be reclaimed asap using PG_reclaim.
411 *
412 * 1. active, mapped page -> none
413 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
414 * 3. inactive, mapped page -> none
415 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
416 * 5. inactive, clean -> inactive, tail
417 * 6. Others -> none
418 *
419 * In case 4, the page is moved to the head of the inactive list because
420 * the VM expects flusher threads to write it out, which is much more
421 * effective than the single-page writeout from reclaim.
422 */
423static void lru_deactivate_fn(struct page *page, void *arg)
424{
425 int lru, file;
426 bool active;
427 struct zone *zone = page_zone(page);
428
429 if (!PageLRU(page))
430 return;
431
432 if (PageUnevictable(page))
433 return;
434
435 /* Some processes are using the page */
436 if (page_mapped(page))
437 return;
438
439 active = PageActive(page);
440
441 file = page_is_file_cache(page);
442 lru = page_lru_base_type(page);
443 del_page_from_lru_list(zone, page, lru + active);
444 ClearPageActive(page);
445 ClearPageReferenced(page);
446 add_page_to_lru_list(zone, page, lru);
447
448 if (PageWriteback(page) || PageDirty(page)) {
449 /*
450 * PG_reclaim can race with end_page_writeback().
451 * That can confuse readahead, but the race window
452 * is _really_ small and it's a non-critical problem.
453 */
454 SetPageReclaim(page);
455 } else {
456 /*
457 * The page's writeback finished while it sat in the pagevec,
458 * so move the page to the tail of the inactive list.
459 */
460 list_move_tail(&page->lru, &zone->lru[lru].list);
461 mem_cgroup_rotate_reclaimable_page(page);
462 __count_vm_event(PGROTATED);
463 }
464
465 if (active)
466 __count_vm_event(PGDEACTIVATE);
467 update_page_reclaim_stat(zone, page, file, 0);
468}
469
470/*
270 * Drain pages out of the cpu's pagevecs. 471 * Drain pages out of the cpu's pagevecs.
271 * Either "cpu" is the current CPU, and preemption has already been 472 * Either "cpu" is the current CPU, and preemption has already been
272 * disabled; or "cpu" is being hot-unplugged, and is already dead. 473 * disabled; or "cpu" is being hot-unplugged, and is already dead.
@@ -292,6 +493,38 @@ static void drain_cpu_pagevecs(int cpu)
292 pagevec_move_tail(pvec); 493 pagevec_move_tail(pvec);
293 local_irq_restore(flags); 494 local_irq_restore(flags);
294 } 495 }
496
497 pvec = &per_cpu(lru_deactivate_pvecs, cpu);
498 if (pagevec_count(pvec))
499 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
500
501 activate_page_drain(cpu);
502}
503
504/**
505 * deactivate_page - forcefully deactivate a page
506 * @page: page to deactivate
507 *
508 * This function hints to the VM that @page is a good reclaim candidate,
509 * for example if its invalidation fails due to the page being dirty
510 * or under writeback.
511 */
512void deactivate_page(struct page *page)
513{
514 /*
515 * In a workload with many unevictable pages (such as mprotect), deactivating
516 * unevictable pages to accelerate reclaim is pointless.
517 */
518 if (PageUnevictable(page))
519 return;
520
521 if (likely(get_page_unless_zero(page))) {
522 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
523
524 if (!pagevec_add(pvec, page))
525 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
526 put_cpu_var(lru_deactivate_pvecs);
527 }
295} 528}
296 529
297void lru_add_drain(void) 530void lru_add_drain(void)
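deactivate_page() only queues a page after get_page_unless_zero() succeeds, i.e. it takes a reference only while the refcount is still non-zero and leaves pages that are already on their way to being freed alone. A sketch of that "increment unless zero" primitive using C11 atomics (illustrative, not the kernel's implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only if the object is still live (count > 0). */
static bool get_unless_zero(atomic_int *refcount)
{
	int old = atomic_load(refcount);

	while (old != 0) {
		/* try old -> old + 1; on failure, old is reloaded and rechecked */
		if (atomic_compare_exchange_weak(refcount, &old, old + 1))
			return true;
	}
	return false;   /* already zero: the object is being freed, hands off */
}

int main(void)
{
	atomic_int live = 2, dying = 0;

	printf("live:  %s\n", get_unless_zero(&live) ? "got ref" : "refused");
	printf("dying: %s\n", get_unless_zero(&dying) ? "got ref" : "refused");
	return 0;
}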
@@ -378,6 +611,7 @@ void release_pages(struct page **pages, int nr, int cold)
378 611
379 pagevec_free(&pages_to_free); 612 pagevec_free(&pages_to_free);
380} 613}
614EXPORT_SYMBOL(release_pages);
381 615
382/* 616/*
383 * The pages which we're about to release may be in the deferred lru-addition 617 * The pages which we're about to release may be in the deferred lru-addition
@@ -398,44 +632,70 @@ void __pagevec_release(struct pagevec *pvec)
398 632
399EXPORT_SYMBOL(__pagevec_release); 633EXPORT_SYMBOL(__pagevec_release);
400 634
635/* used by __split_huge_page_refcount() */
636void lru_add_page_tail(struct zone* zone,
637 struct page *page, struct page *page_tail)
638{
639 int active;
640 enum lru_list lru;
641 const int file = 0;
642 struct list_head *head;
643
644 VM_BUG_ON(!PageHead(page));
645 VM_BUG_ON(PageCompound(page_tail));
646 VM_BUG_ON(PageLRU(page_tail));
647 VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
648
649 SetPageLRU(page_tail);
650
651 if (page_evictable(page_tail, NULL)) {
652 if (PageActive(page)) {
653 SetPageActive(page_tail);
654 active = 1;
655 lru = LRU_ACTIVE_ANON;
656 } else {
657 active = 0;
658 lru = LRU_INACTIVE_ANON;
659 }
660 update_page_reclaim_stat(zone, page_tail, file, active);
661 if (likely(PageLRU(page)))
662 head = page->lru.prev;
663 else
664 head = &zone->lru[lru].list;
665 __add_page_to_lru_list(zone, page_tail, lru, head);
666 } else {
667 SetPageUnevictable(page_tail);
668 add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
669 }
670}
671
672static void ____pagevec_lru_add_fn(struct page *page, void *arg)
673{
674 enum lru_list lru = (enum lru_list)arg;
675 struct zone *zone = page_zone(page);
676 int file = is_file_lru(lru);
677 int active = is_active_lru(lru);
678
679 VM_BUG_ON(PageActive(page));
680 VM_BUG_ON(PageUnevictable(page));
681 VM_BUG_ON(PageLRU(page));
682
683 SetPageLRU(page);
684 if (active)
685 SetPageActive(page);
686 update_page_reclaim_stat(zone, page, file, active);
687 add_page_to_lru_list(zone, page, lru);
688}
689
401/* 690/*
402 * Add the passed pages to the LRU, then drop the caller's refcount 691 * Add the passed pages to the LRU, then drop the caller's refcount
403 * on them. Reinitialises the caller's pagevec. 692 * on them. Reinitialises the caller's pagevec.
404 */ 693 */
405void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 694void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
406{ 695{
407 int i;
408 struct zone *zone = NULL;
409
410 VM_BUG_ON(is_unevictable_lru(lru)); 696 VM_BUG_ON(is_unevictable_lru(lru));
411 697
412 for (i = 0; i < pagevec_count(pvec); i++) { 698 pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
413 struct page *page = pvec->pages[i];
414 struct zone *pagezone = page_zone(page);
415 int file;
416 int active;
417
418 if (pagezone != zone) {
419 if (zone)
420 spin_unlock_irq(&zone->lru_lock);
421 zone = pagezone;
422 spin_lock_irq(&zone->lru_lock);
423 }
424 VM_BUG_ON(PageActive(page));
425 VM_BUG_ON(PageUnevictable(page));
426 VM_BUG_ON(PageLRU(page));
427 SetPageLRU(page);
428 active = is_active_lru(lru);
429 file = is_file_lru(lru);
430 if (active)
431 SetPageActive(page);
432 update_page_reclaim_stat(zone, page, file, active);
433 add_page_to_lru_list(zone, page, lru);
434 }
435 if (zone)
436 spin_unlock_irq(&zone->lru_lock);
437 release_pages(pvec->pages, pvec->nr, pvec->cold);
438 pagevec_reinit(pvec);
439} 699}
440 700
441EXPORT_SYMBOL(____pagevec_lru_add); 701EXPORT_SYMBOL(____pagevec_lru_add);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e10f5833167f..46680461785b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -24,12 +24,10 @@
24 24
25/* 25/*
26 * swapper_space is a fiction, retained to simplify the path through 26 * swapper_space is a fiction, retained to simplify the path through
27 * vmscan's shrink_page_list, to make sync_page look nicer, and to allow 27 * vmscan's shrink_page_list.
28 * future use of radix_tree tags in the swap cache.
29 */ 28 */
30static const struct address_space_operations swap_aops = { 29static const struct address_space_operations swap_aops = {
31 .writepage = swap_writepage, 30 .writepage = swap_writepage,
32 .sync_page = block_sync_page,
33 .set_page_dirty = __set_page_dirty_nobuffers, 31 .set_page_dirty = __set_page_dirty_nobuffers,
34 .migratepage = migrate_page, 32 .migratepage = migrate_page,
35}; 33};
@@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = {
37static struct backing_dev_info swap_backing_dev_info = { 35static struct backing_dev_info swap_backing_dev_info = {
38 .name = "swap", 36 .name = "swap",
39 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
40 .unplug_io_fn = swap_unplug_io_fn,
41}; 38};
42 39
43struct address_space swapper_space = { 40struct address_space swapper_space = {
@@ -157,6 +154,12 @@ int add_to_swap(struct page *page)
157 if (!entry.val) 154 if (!entry.val)
158 return 0; 155 return 0;
159 156
157 if (unlikely(PageTransHuge(page)))
158 if (unlikely(split_huge_page(page))) {
159 swapcache_free(entry, NULL);
160 return 0;
161 }
162
160 /* 163 /*
161 * Radix-tree node allocations from PF_MEMALLOC contexts could 164 * Radix-tree node allocations from PF_MEMALLOC contexts could
162 * completely exhaust the page allocator. __GFP_NOMEMALLOC 165 * completely exhaust the page allocator. __GFP_NOMEMALLOC
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7c703ff2f36f..ff8dc1a18cb4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -14,7 +14,7 @@
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/shm.h> 17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/random.h> 19#include <linux/random.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
@@ -30,6 +30,8 @@
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/poll.h>
34#include <linux/oom.h>
33 35
34#include <asm/pgtable.h> 36#include <asm/pgtable.h>
35#include <asm/tlbflush.h> 37#include <asm/tlbflush.h>
@@ -58,6 +60,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES];
58 60
59static DEFINE_MUTEX(swapon_mutex); 61static DEFINE_MUTEX(swapon_mutex);
60 62
63static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
64/* Activity counter to indicate that a swapon or swapoff has occurred */
65static atomic_t proc_poll_event = ATOMIC_INIT(0);
66
61static inline unsigned char swap_count(unsigned char ent) 67static inline unsigned char swap_count(unsigned char ent)
62{ 68{
63 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ 69 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -90,39 +96,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
90} 96}
91 97
92/* 98/*
93 * We need this because the bdev->unplug_fn can sleep and we cannot
94 * hold swap_lock while calling the unplug_fn. And swap_lock
95 * cannot be turned into a mutex.
96 */
97static DECLARE_RWSEM(swap_unplug_sem);
98
99void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
100{
101 swp_entry_t entry;
102
103 down_read(&swap_unplug_sem);
104 entry.val = page_private(page);
105 if (PageSwapCache(page)) {
106 struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
107 struct backing_dev_info *bdi;
108
109 /*
110 * If the page is removed from swapcache from under us (with a
111 * racy try_to_unuse/swapoff) we need an additional reference
112 * count to avoid reading garbage from page_private(page) above.
113 * If the WARN_ON triggers during a swapoff it maybe the race
114 * condition and it's harmless. However if it triggers without
115 * swapoff it signals a problem.
116 */
117 WARN_ON(page_count(page) <= 1);
118
119 bdi = bdev->bd_inode->i_mapping->backing_dev_info;
120 blk_run_backing_dev(bdi, page);
121 }
122 up_read(&swap_unplug_sem);
123}
124
125/*
126 * swapon tell device that all the old swap contents can be discarded, 99 * swapon tell device that all the old swap contents can be discarded,
127 * to allow the swap device to optimize its wear-levelling. 100 * to allow the swap device to optimize its wear-levelling.
128 */ 101 */
@@ -139,7 +112,7 @@ static int discard_swap(struct swap_info_struct *si)
139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 112 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140 if (nr_blocks) { 113 if (nr_blocks) {
141 err = blkdev_issue_discard(si->bdev, start_block, 114 err = blkdev_issue_discard(si->bdev, start_block,
142 nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); 115 nr_blocks, GFP_KERNEL, 0);
143 if (err) 116 if (err)
144 return err; 117 return err;
145 cond_resched(); 118 cond_resched();
@@ -150,7 +123,7 @@ static int discard_swap(struct swap_info_struct *si)
150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 123 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
151 124
152 err = blkdev_issue_discard(si->bdev, start_block, 125 err = blkdev_issue_discard(si->bdev, start_block,
153 nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); 126 nr_blocks, GFP_KERNEL, 0);
154 if (err) 127 if (err)
155 break; 128 break;
156 129
@@ -189,7 +162,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
189 start_block <<= PAGE_SHIFT - 9; 162 start_block <<= PAGE_SHIFT - 9;
190 nr_blocks <<= PAGE_SHIFT - 9; 163 nr_blocks <<= PAGE_SHIFT - 9;
191 if (blkdev_issue_discard(si->bdev, start_block, 164 if (blkdev_issue_discard(si->bdev, start_block,
192 nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT)) 165 nr_blocks, GFP_NOIO, 0))
193 break; 166 break;
194 } 167 }
195 168
@@ -207,8 +180,8 @@ static int wait_for_discard(void *word)
207#define SWAPFILE_CLUSTER 256 180#define SWAPFILE_CLUSTER 256
208#define LATENCY_LIMIT 256 181#define LATENCY_LIMIT 256
209 182
210static inline unsigned long scan_swap_map(struct swap_info_struct *si, 183static unsigned long scan_swap_map(struct swap_info_struct *si,
211 unsigned char usage) 184 unsigned char usage)
212{ 185{
213 unsigned long offset; 186 unsigned long offset;
214 unsigned long scan_base; 187 unsigned long scan_base;
@@ -875,7 +848,7 @@ unsigned int count_swap_pages(int type, int free)
875static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 848static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
876 unsigned long addr, swp_entry_t entry, struct page *page) 849 unsigned long addr, swp_entry_t entry, struct page *page)
877{ 850{
878 struct mem_cgroup *ptr = NULL; 851 struct mem_cgroup *ptr;
879 spinlock_t *ptl; 852 spinlock_t *ptl;
880 pte_t *pte; 853 pte_t *pte;
881 int ret = 1; 854 int ret = 1;
@@ -959,6 +932,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
959 pmd = pmd_offset(pud, addr); 932 pmd = pmd_offset(pud, addr);
960 do { 933 do {
961 next = pmd_addr_end(addr, end); 934 next = pmd_addr_end(addr, end);
935 if (unlikely(pmd_trans_huge(*pmd)))
936 continue;
962 if (pmd_none_or_clear_bad(pmd)) 937 if (pmd_none_or_clear_bad(pmd))
963 continue; 938 continue;
964 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 939 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
@@ -1543,6 +1518,36 @@ bad_bmap:
1543 goto out; 1518 goto out;
1544} 1519}
1545 1520
1521static void enable_swap_info(struct swap_info_struct *p, int prio,
1522 unsigned char *swap_map)
1523{
1524 int i, prev;
1525
1526 spin_lock(&swap_lock);
1527 if (prio >= 0)
1528 p->prio = prio;
1529 else
1530 p->prio = --least_priority;
1531 p->swap_map = swap_map;
1532 p->flags |= SWP_WRITEOK;
1533 nr_swap_pages += p->pages;
1534 total_swap_pages += p->pages;
1535
1536 /* insert swap space into swap_list: */
1537 prev = -1;
1538 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1539 if (p->prio >= swap_info[i]->prio)
1540 break;
1541 prev = i;
1542 }
1543 p->next = i;
1544 if (prev < 0)
1545 swap_list.head = swap_list.next = p->type;
1546 else
1547 swap_info[prev]->next = p->type;
1548 spin_unlock(&swap_lock);
1549}
1550
1546SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1551SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1547{ 1552{
1548 struct swap_info_struct *p = NULL; 1553 struct swap_info_struct *p = NULL;
@@ -1551,6 +1556,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1551 struct address_space *mapping; 1556 struct address_space *mapping;
1552 struct inode *inode; 1557 struct inode *inode;
1553 char *pathname; 1558 char *pathname;
1559 int oom_score_adj;
1554 int i, type, prev; 1560 int i, type, prev;
1555 int err; 1561 int err;
1556 1562
@@ -1609,37 +1615,22 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1609 p->flags &= ~SWP_WRITEOK; 1615 p->flags &= ~SWP_WRITEOK;
1610 spin_unlock(&swap_lock); 1616 spin_unlock(&swap_lock);
1611 1617
1612 current->flags |= PF_OOM_ORIGIN; 1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1613 err = try_to_unuse(type); 1619 err = try_to_unuse(type);
1614 current->flags &= ~PF_OOM_ORIGIN; 1620 test_set_oom_score_adj(oom_score_adj);
1615 1621
1616 if (err) { 1622 if (err) {
1623 /*
1624 * reading p->prio and p->swap_map outside the lock is
1625 * safe here because only sys_swapon and sys_swapoff
1626 * change them, and there can be no other sys_swapon or
1627 * sys_swapoff for this swap_info_struct at this point.
1628 */
1617 /* re-insert swap space back into swap_list */ 1629 /* re-insert swap space back into swap_list */
1618 spin_lock(&swap_lock); 1630 enable_swap_info(p, p->prio, p->swap_map);
1619 if (p->prio < 0)
1620 p->prio = --least_priority;
1621 prev = -1;
1622 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1623 if (p->prio >= swap_info[i]->prio)
1624 break;
1625 prev = i;
1626 }
1627 p->next = i;
1628 if (prev < 0)
1629 swap_list.head = swap_list.next = type;
1630 else
1631 swap_info[prev]->next = type;
1632 nr_swap_pages += p->pages;
1633 total_swap_pages += p->pages;
1634 p->flags |= SWP_WRITEOK;
1635 spin_unlock(&swap_lock);
1636 goto out_dput; 1631 goto out_dput;
1637 } 1632 }
1638 1633
1639 /* wait for any unplug function to finish */
1640 down_write(&swap_unplug_sem);
1641 up_write(&swap_unplug_sem);
1642
1643 destroy_swap_extents(p); 1634 destroy_swap_extents(p);
1644 if (p->flags & SWP_CONTINUED) 1635 if (p->flags & SWP_CONTINUED)
1645 free_swap_count_continuations(p); 1636 free_swap_count_continuations(p);
@@ -1672,7 +1663,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1672 if (S_ISBLK(inode->i_mode)) { 1663 if (S_ISBLK(inode->i_mode)) {
1673 struct block_device *bdev = I_BDEV(inode); 1664 struct block_device *bdev = I_BDEV(inode);
1674 set_blocksize(bdev, p->old_block_size); 1665 set_blocksize(bdev, p->old_block_size);
1675 bd_release(bdev); 1666 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1676 } else { 1667 } else {
1677 mutex_lock(&inode->i_mutex); 1668 mutex_lock(&inode->i_mutex);
1678 inode->i_flags &= ~S_SWAPFILE; 1669 inode->i_flags &= ~S_SWAPFILE;
@@ -1680,6 +1671,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1680 } 1671 }
1681 filp_close(swap_file, NULL); 1672 filp_close(swap_file, NULL);
1682 err = 0; 1673 err = 0;
1674 atomic_inc(&proc_poll_event);
1675 wake_up_interruptible(&proc_poll_wait);
1683 1676
1684out_dput: 1677out_dput:
1685 filp_close(victim, NULL); 1678 filp_close(victim, NULL);
@@ -1688,6 +1681,25 @@ out:
1688} 1681}
1689 1682
1690#ifdef CONFIG_PROC_FS 1683#ifdef CONFIG_PROC_FS
1684struct proc_swaps {
1685 struct seq_file seq;
1686 int event;
1687};
1688
1689static unsigned swaps_poll(struct file *file, poll_table *wait)
1690{
1691 struct proc_swaps *s = file->private_data;
1692
1693 poll_wait(file, &proc_poll_wait, wait);
1694
1695 if (s->event != atomic_read(&proc_poll_event)) {
1696 s->event = atomic_read(&proc_poll_event);
1697 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1698 }
1699
1700 return POLLIN | POLLRDNORM;
1701}
1702
1691/* iterator */ 1703/* iterator */
1692static void *swap_start(struct seq_file *swap, loff_t *pos) 1704static void *swap_start(struct seq_file *swap, loff_t *pos)
1693{ 1705{
@@ -1771,7 +1783,24 @@ static const struct seq_operations swaps_op = {
1771 1783
1772static int swaps_open(struct inode *inode, struct file *file) 1784static int swaps_open(struct inode *inode, struct file *file)
1773{ 1785{
1774 return seq_open(file, &swaps_op); 1786 struct proc_swaps *s;
1787 int ret;
1788
1789 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1790 if (!s)
1791 return -ENOMEM;
1792
1793 file->private_data = s;
1794
1795 ret = seq_open(file, &swaps_op);
1796 if (ret) {
1797 kfree(s);
1798 return ret;
1799 }
1800
1801 s->seq.private = s;
1802 s->event = atomic_read(&proc_poll_event);
1803 return ret;
1775} 1804}
1776 1805
1777static const struct file_operations proc_swaps_operations = { 1806static const struct file_operations proc_swaps_operations = {
@@ -1779,6 +1808,7 @@ static const struct file_operations proc_swaps_operations = {
1779 .read = seq_read, 1808 .read = seq_read,
1780 .llseek = seq_lseek, 1809 .llseek = seq_lseek,
1781 .release = seq_release, 1810 .release = seq_release,
1811 .poll = swaps_poll,
1782}; 1812};
1783 1813
1784static int __init procswaps_init(void) 1814static int __init procswaps_init(void)
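With swaps_poll() wired up, both swapoff (above) and swapon bump proc_poll_event and wake proc_poll_wait, so a monitor can sleep in poll(2) on /proc/swaps and re-read it only when the swap configuration changes. A hedged userspace example:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	struct pollfd pfd;

	pfd.fd = open("/proc/swaps", O_RDONLY);
	if (pfd.fd < 0) {
		perror("open /proc/swaps");
		return 1;
	}
	pfd.events = POLLPRI;   /* swaps_poll() raises POLLERR|POLLPRI on change */

	for (;;) {
		if (poll(&pfd, 1, -1) < 0) {
			perror("poll");
			return 1;
		}
		if (pfd.revents & (POLLERR | POLLPRI)) {
			lseek(pfd.fd, 0, SEEK_SET);
			n = read(pfd.fd, buf, sizeof(buf) - 1);
			if (n > 0) {
				buf[n] = '\0';
				printf("swap configuration changed:\n%s", buf);
			}
		}
	}
}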
@@ -1798,49 +1828,24 @@ static int __init max_swapfiles_check(void)
1798late_initcall(max_swapfiles_check); 1828late_initcall(max_swapfiles_check);
1799#endif 1829#endif
1800 1830
1801/* 1831static struct swap_info_struct *alloc_swap_info(void)
1802 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1803 *
1804 * The swapon system call
1805 */
1806SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1807{ 1832{
1808 struct swap_info_struct *p; 1833 struct swap_info_struct *p;
1809 char *name = NULL;
1810 struct block_device *bdev = NULL;
1811 struct file *swap_file = NULL;
1812 struct address_space *mapping;
1813 unsigned int type; 1834 unsigned int type;
1814 int i, prev;
1815 int error;
1816 union swap_header *swap_header;
1817 unsigned int nr_good_pages;
1818 int nr_extents = 0;
1819 sector_t span;
1820 unsigned long maxpages;
1821 unsigned long swapfilepages;
1822 unsigned char *swap_map = NULL;
1823 struct page *page = NULL;
1824 struct inode *inode = NULL;
1825 int did_down = 0;
1826
1827 if (!capable(CAP_SYS_ADMIN))
1828 return -EPERM;
1829 1835
1830 p = kzalloc(sizeof(*p), GFP_KERNEL); 1836 p = kzalloc(sizeof(*p), GFP_KERNEL);
1831 if (!p) 1837 if (!p)
1832 return -ENOMEM; 1838 return ERR_PTR(-ENOMEM);
1833 1839
1834 spin_lock(&swap_lock); 1840 spin_lock(&swap_lock);
1835 for (type = 0; type < nr_swapfiles; type++) { 1841 for (type = 0; type < nr_swapfiles; type++) {
1836 if (!(swap_info[type]->flags & SWP_USED)) 1842 if (!(swap_info[type]->flags & SWP_USED))
1837 break; 1843 break;
1838 } 1844 }
1839 error = -EPERM;
1840 if (type >= MAX_SWAPFILES) { 1845 if (type >= MAX_SWAPFILES) {
1841 spin_unlock(&swap_lock); 1846 spin_unlock(&swap_lock);
1842 kfree(p); 1847 kfree(p);
1843 goto out; 1848 return ERR_PTR(-EPERM);
1844 } 1849 }
1845 if (type >= nr_swapfiles) { 1850 if (type >= nr_swapfiles) {
1846 p->type = type; 1851 p->type = type;
@@ -1865,80 +1870,49 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1865 p->next = -1; 1870 p->next = -1;
1866 spin_unlock(&swap_lock); 1871 spin_unlock(&swap_lock);
1867 1872
1868 name = getname(specialfile); 1873 return p;
1869 error = PTR_ERR(name); 1874}
1870 if (IS_ERR(name)) {
1871 name = NULL;
1872 goto bad_swap_2;
1873 }
1874 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
1875 error = PTR_ERR(swap_file);
1876 if (IS_ERR(swap_file)) {
1877 swap_file = NULL;
1878 goto bad_swap_2;
1879 }
1880
1881 p->swap_file = swap_file;
1882 mapping = swap_file->f_mapping;
1883 inode = mapping->host;
1884
1885 error = -EBUSY;
1886 for (i = 0; i < nr_swapfiles; i++) {
1887 struct swap_info_struct *q = swap_info[i];
1888 1875
1889 if (i == type || !q->swap_file) 1876static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1890 continue; 1877{
1891 if (mapping == q->swap_file->f_mapping) 1878 int error;
1892 goto bad_swap;
1893 }
1894 1879
1895 error = -EINVAL;
1896 if (S_ISBLK(inode->i_mode)) { 1880 if (S_ISBLK(inode->i_mode)) {
1897 bdev = I_BDEV(inode); 1881 p->bdev = bdgrab(I_BDEV(inode));
1898 error = bd_claim(bdev, sys_swapon); 1882 error = blkdev_get(p->bdev,
1883 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1884 sys_swapon);
1899 if (error < 0) { 1885 if (error < 0) {
1900 bdev = NULL; 1886 p->bdev = NULL;
1901 error = -EINVAL; 1887 return -EINVAL;
1902 goto bad_swap;
1903 } 1888 }
1904 p->old_block_size = block_size(bdev); 1889 p->old_block_size = block_size(p->bdev);
1905 error = set_blocksize(bdev, PAGE_SIZE); 1890 error = set_blocksize(p->bdev, PAGE_SIZE);
1906 if (error < 0) 1891 if (error < 0)
1907 goto bad_swap; 1892 return error;
1908 p->bdev = bdev;
1909 p->flags |= SWP_BLKDEV; 1893 p->flags |= SWP_BLKDEV;
1910 } else if (S_ISREG(inode->i_mode)) { 1894 } else if (S_ISREG(inode->i_mode)) {
1911 p->bdev = inode->i_sb->s_bdev; 1895 p->bdev = inode->i_sb->s_bdev;
1912 mutex_lock(&inode->i_mutex); 1896 mutex_lock(&inode->i_mutex);
1913 did_down = 1; 1897 if (IS_SWAPFILE(inode))
1914 if (IS_SWAPFILE(inode)) { 1898 return -EBUSY;
1915 error = -EBUSY; 1899 } else
1916 goto bad_swap; 1900 return -EINVAL;
1917 }
1918 } else {
1919 goto bad_swap;
1920 }
1921 1901
1922 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 1902 return 0;
1903}
1923 1904
1924 /* 1905static unsigned long read_swap_header(struct swap_info_struct *p,
1925 * Read the swap header. 1906 union swap_header *swap_header,
1926 */ 1907 struct inode *inode)
1927 if (!mapping->a_ops->readpage) { 1908{
1928 error = -EINVAL; 1909 int i;
1929 goto bad_swap; 1910 unsigned long maxpages;
1930 } 1911 unsigned long swapfilepages;
1931 page = read_mapping_page(mapping, 0, swap_file);
1932 if (IS_ERR(page)) {
1933 error = PTR_ERR(page);
1934 goto bad_swap;
1935 }
1936 swap_header = kmap(page);
1937 1912
1938 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 1913 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1939 printk(KERN_ERR "Unable to find swap-space signature\n"); 1914 printk(KERN_ERR "Unable to find swap-space signature\n");
1940 error = -EINVAL; 1915 return 0;
1941 goto bad_swap;
1942 } 1916 }
1943 1917
1944 /* swap partition endianess hack... */ 1918 /* swap partition endianess hack... */
@@ -1954,8 +1928,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1954 printk(KERN_WARNING 1928 printk(KERN_WARNING
1955 "Unable to handle swap header version %d\n", 1929 "Unable to handle swap header version %d\n",
1956 swap_header->info.version); 1930 swap_header->info.version);
1957 error = -EINVAL; 1931 return 0;
1958 goto bad_swap;
1959 } 1932 }
1960 1933
1961 p->lowest_bit = 1; 1934 p->lowest_bit = 1;
@@ -1986,62 +1959,156 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1986 } 1959 }
1987 p->highest_bit = maxpages - 1; 1960 p->highest_bit = maxpages - 1;
1988 1961
1989 error = -EINVAL;
1990 if (!maxpages) 1962 if (!maxpages)
1991 goto bad_swap; 1963 return 0;
1964 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1992 if (swapfilepages && maxpages > swapfilepages) { 1965 if (swapfilepages && maxpages > swapfilepages) {
1993 printk(KERN_WARNING 1966 printk(KERN_WARNING
1994 "Swap area shorter than signature indicates\n"); 1967 "Swap area shorter than signature indicates\n");
1995 goto bad_swap; 1968 return 0;
1996 } 1969 }
1997 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1970 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1998 goto bad_swap; 1971 return 0;
1999 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1972 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2000 goto bad_swap; 1973 return 0;
2001 1974
2002 /* OK, set up the swap map and apply the bad block list */ 1975 return maxpages;
2003 swap_map = vmalloc(maxpages); 1976}
2004 if (!swap_map) { 1977
2005 error = -ENOMEM; 1978static int setup_swap_map_and_extents(struct swap_info_struct *p,
2006 goto bad_swap; 1979 union swap_header *swap_header,
2007 } 1980 unsigned char *swap_map,
1981 unsigned long maxpages,
1982 sector_t *span)
1983{
1984 int i;
1985 unsigned int nr_good_pages;
1986 int nr_extents;
2008 1987
2009 memset(swap_map, 0, maxpages);
2010 nr_good_pages = maxpages - 1; /* omit header page */ 1988 nr_good_pages = maxpages - 1; /* omit header page */
2011 1989
2012 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1990 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2013 unsigned int page_nr = swap_header->info.badpages[i]; 1991 unsigned int page_nr = swap_header->info.badpages[i];
2014 if (page_nr == 0 || page_nr > swap_header->info.last_page) { 1992 if (page_nr == 0 || page_nr > swap_header->info.last_page)
2015 error = -EINVAL; 1993 return -EINVAL;
2016 goto bad_swap;
2017 }
2018 if (page_nr < maxpages) { 1994 if (page_nr < maxpages) {
2019 swap_map[page_nr] = SWAP_MAP_BAD; 1995 swap_map[page_nr] = SWAP_MAP_BAD;
2020 nr_good_pages--; 1996 nr_good_pages--;
2021 } 1997 }
2022 } 1998 }
2023 1999
2024 error = swap_cgroup_swapon(type, maxpages);
2025 if (error)
2026 goto bad_swap;
2027
2028 if (nr_good_pages) { 2000 if (nr_good_pages) {
2029 swap_map[0] = SWAP_MAP_BAD; 2001 swap_map[0] = SWAP_MAP_BAD;
2030 p->max = maxpages; 2002 p->max = maxpages;
2031 p->pages = nr_good_pages; 2003 p->pages = nr_good_pages;
2032 nr_extents = setup_swap_extents(p, &span); 2004 nr_extents = setup_swap_extents(p, span);
2033 if (nr_extents < 0) { 2005 if (nr_extents < 0)
2034 error = nr_extents; 2006 return nr_extents;
2035 goto bad_swap;
2036 }
2037 nr_good_pages = p->pages; 2007 nr_good_pages = p->pages;
2038 } 2008 }
2039 if (!nr_good_pages) { 2009 if (!nr_good_pages) {
2040 printk(KERN_WARNING "Empty swap-file\n"); 2010 printk(KERN_WARNING "Empty swap-file\n");
2011 return -EINVAL;
2012 }
2013
2014 return nr_extents;
2015}
2016
2017SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2018{
2019 struct swap_info_struct *p;
2020 char *name;
2021 struct file *swap_file = NULL;
2022 struct address_space *mapping;
2023 int i;
2024 int prio;
2025 int error;
2026 union swap_header *swap_header;
2027 int nr_extents;
2028 sector_t span;
2029 unsigned long maxpages;
2030 unsigned char *swap_map = NULL;
2031 struct page *page = NULL;
2032 struct inode *inode = NULL;
2033
2034 if (!capable(CAP_SYS_ADMIN))
2035 return -EPERM;
2036
2037 p = alloc_swap_info();
2038 if (IS_ERR(p))
2039 return PTR_ERR(p);
2040
2041 name = getname(specialfile);
2042 if (IS_ERR(name)) {
2043 error = PTR_ERR(name);
2044 name = NULL;
2045 goto bad_swap;
2046 }
2047 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
2048 if (IS_ERR(swap_file)) {
2049 error = PTR_ERR(swap_file);
2050 swap_file = NULL;
2051 goto bad_swap;
2052 }
2053
2054 p->swap_file = swap_file;
2055 mapping = swap_file->f_mapping;
2056
2057 for (i = 0; i < nr_swapfiles; i++) {
2058 struct swap_info_struct *q = swap_info[i];
2059
2060 if (q == p || !q->swap_file)
2061 continue;
2062 if (mapping == q->swap_file->f_mapping) {
2063 error = -EBUSY;
2064 goto bad_swap;
2065 }
2066 }
2067
2068 inode = mapping->host;
2069 /* If S_ISREG(inode->i_mode), claim_swapfile() will do mutex_lock(&inode->i_mutex) */
2070 error = claim_swapfile(p, inode);
2071 if (unlikely(error))
2072 goto bad_swap;
2073
2074 /*
2075 * Read the swap header.
2076 */
2077 if (!mapping->a_ops->readpage) {
2078 error = -EINVAL;
2079 goto bad_swap;
2080 }
2081 page = read_mapping_page(mapping, 0, swap_file);
2082 if (IS_ERR(page)) {
2083 error = PTR_ERR(page);
2084 goto bad_swap;
2085 }
2086 swap_header = kmap(page);
2087
2088 maxpages = read_swap_header(p, swap_header, inode);
2089 if (unlikely(!maxpages)) {
2041 error = -EINVAL; 2090 error = -EINVAL;
2042 goto bad_swap; 2091 goto bad_swap;
2043 } 2092 }
2044 2093
2094 /* OK, set up the swap map and apply the bad block list */
2095 swap_map = vzalloc(maxpages);
2096 if (!swap_map) {
2097 error = -ENOMEM;
2098 goto bad_swap;
2099 }
2100
2101 error = swap_cgroup_swapon(p->type, maxpages);
2102 if (error)
2103 goto bad_swap;
2104
2105 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2106 maxpages, &span);
2107 if (unlikely(nr_extents < 0)) {
2108 error = nr_extents;
2109 goto bad_swap;
2110 }
2111
2045 if (p->bdev) { 2112 if (p->bdev) {
2046 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2113 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2047 p->flags |= SWP_SOLIDSTATE; 2114 p->flags |= SWP_SOLIDSTATE;
@@ -2052,55 +2119,46 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2052 } 2119 }
2053 2120
2054 mutex_lock(&swapon_mutex); 2121 mutex_lock(&swapon_mutex);
2055 spin_lock(&swap_lock); 2122 prio = -1;
2056 if (swap_flags & SWAP_FLAG_PREFER) 2123 if (swap_flags & SWAP_FLAG_PREFER)
2057 p->prio = 2124 prio =
2058 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2125 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2059 else 2126 enable_swap_info(p, prio, swap_map);
2060 p->prio = --least_priority;
2061 p->swap_map = swap_map;
2062 p->flags |= SWP_WRITEOK;
2063 nr_swap_pages += nr_good_pages;
2064 total_swap_pages += nr_good_pages;
2065 2127
2066 printk(KERN_INFO "Adding %uk swap on %s. " 2128 printk(KERN_INFO "Adding %uk swap on %s. "
2067 "Priority:%d extents:%d across:%lluk %s%s\n", 2129 "Priority:%d extents:%d across:%lluk %s%s\n",
2068 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, 2130 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2069 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2131 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2070 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2132 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2071 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2133 (p->flags & SWP_DISCARDABLE) ? "D" : "");
2072 2134
2073 /* insert swap space into swap_list: */
2074 prev = -1;
2075 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
2076 if (p->prio >= swap_info[i]->prio)
2077 break;
2078 prev = i;
2079 }
2080 p->next = i;
2081 if (prev < 0)
2082 swap_list.head = swap_list.next = type;
2083 else
2084 swap_info[prev]->next = type;
2085 spin_unlock(&swap_lock);
2086 mutex_unlock(&swapon_mutex); 2135 mutex_unlock(&swapon_mutex);
2136 atomic_inc(&proc_poll_event);
2137 wake_up_interruptible(&proc_poll_wait);
2138
2139 if (S_ISREG(inode->i_mode))
2140 inode->i_flags |= S_SWAPFILE;
2087 error = 0; 2141 error = 0;
2088 goto out; 2142 goto out;
2089bad_swap: 2143bad_swap:
2090 if (bdev) { 2144 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2091 set_blocksize(bdev, p->old_block_size); 2145 set_blocksize(p->bdev, p->old_block_size);
2092 bd_release(bdev); 2146 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2093 } 2147 }
2094 destroy_swap_extents(p); 2148 destroy_swap_extents(p);
2095 swap_cgroup_swapoff(type); 2149 swap_cgroup_swapoff(p->type);
2096bad_swap_2:
2097 spin_lock(&swap_lock); 2150 spin_lock(&swap_lock);
2098 p->swap_file = NULL; 2151 p->swap_file = NULL;
2099 p->flags = 0; 2152 p->flags = 0;
2100 spin_unlock(&swap_lock); 2153 spin_unlock(&swap_lock);
2101 vfree(swap_map); 2154 vfree(swap_map);
2102 if (swap_file) 2155 if (swap_file) {
2156 if (inode && S_ISREG(inode->i_mode)) {
2157 mutex_unlock(&inode->i_mutex);
2158 inode = NULL;
2159 }
2103 filp_close(swap_file, NULL); 2160 filp_close(swap_file, NULL);
2161 }
2104out: 2162out:
2105 if (page && !IS_ERR(page)) { 2163 if (page && !IS_ERR(page)) {
2106 kunmap(page); 2164 kunmap(page);
@@ -2108,11 +2166,8 @@ out:
2108 } 2166 }
2109 if (name) 2167 if (name)
2110 putname(name); 2168 putname(name);
2111 if (did_down) { 2169 if (inode && S_ISREG(inode->i_mode))
2112 if (!error)
2113 inode->i_flags |= S_SWAPFILE;
2114 mutex_unlock(&inode->i_mutex); 2170 mutex_unlock(&inode->i_mutex);
2115 }
2116 return error; 2171 return error;
2117} 2172}
2118 2173
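The prio value decoded from swap_flags above comes straight from the swapon(2) flags argument: userspace selects an explicit priority by OR-ing SWAP_FLAG_PREFER with a shifted priority, otherwise the kernel falls back to --least_priority in enable_swap_info(). A small example (requires CAP_SYS_ADMIN and an already mkswap'ed file; error handling kept minimal):

#include <stdio.h>
#include <sys/swap.h>   /* swapon(), SWAP_FLAG_PREFER, SWAP_FLAG_PRIO_* */

int main(int argc, char **argv)
{
	int prio = 5;   /* illustrative priority, 0..SWAP_FLAG_PRIO_MASK */
	int flags = SWAP_FLAG_PREFER |
		    ((prio << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

	if (argc != 2) {
		fprintf(stderr, "usage: %s <swapfile>\n", argv[0]);
		return 1;
	}
	/* lands in SYSCALL_DEFINE2(swapon, ...) above; without
	 * SWAP_FLAG_PREFER the kernel assigns --least_priority instead */
	if (swapon(argv[1], flags) < 0) {
		perror("swapon");
		return 1;
	}
	printf("%s enabled at priority %d\n", argv[1], prio);
	return 0;
}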
diff --git a/mm/thrash.c b/mm/thrash.c
index 2372d4ed5dd8..fabf2d0f5169 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -21,14 +21,40 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/memcontrol.h>
25
26#include <trace/events/vmscan.h>
27
28#define TOKEN_AGING_INTERVAL (0xFF)
24 29
25static DEFINE_SPINLOCK(swap_token_lock); 30static DEFINE_SPINLOCK(swap_token_lock);
26struct mm_struct *swap_token_mm; 31struct mm_struct *swap_token_mm;
32struct mem_cgroup *swap_token_memcg;
27static unsigned int global_faults; 33static unsigned int global_faults;
34static unsigned int last_aging;
35
36#ifdef CONFIG_CGROUP_MEM_RES_CTLR
37static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
38{
39 struct mem_cgroup *memcg;
40
41 memcg = try_get_mem_cgroup_from_mm(mm);
42 if (memcg)
43 css_put(mem_cgroup_css(memcg));
44
45 return memcg;
46}
47#else
48static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
49{
50 return NULL;
51}
52#endif
28 53
29void grab_swap_token(struct mm_struct *mm) 54void grab_swap_token(struct mm_struct *mm)
30{ 55{
31 int current_interval; 56 int current_interval;
57 unsigned int old_prio = mm->token_priority;
32 58
33 global_faults++; 59 global_faults++;
34 60
@@ -38,40 +64,81 @@ void grab_swap_token(struct mm_struct *mm)
38 return; 64 return;
39 65
40 /* First come first served */ 66 /* First come first served */
41 if (swap_token_mm == NULL) { 67 if (!swap_token_mm)
42 mm->token_priority = mm->token_priority + 2; 68 goto replace_token;
43 swap_token_mm = mm; 69
44 goto out; 70 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
71 swap_token_mm->token_priority /= 2;
72 last_aging = global_faults;
45 } 73 }
46 74
47 if (mm != swap_token_mm) { 75 if (mm == swap_token_mm) {
48 if (current_interval < mm->last_interval)
49 mm->token_priority++;
50 else {
51 if (likely(mm->token_priority > 0))
52 mm->token_priority--;
53 }
54 /* Check if we deserve the token */
55 if (mm->token_priority > swap_token_mm->token_priority) {
56 mm->token_priority += 2;
57 swap_token_mm = mm;
58 }
59 } else {
60 /* Token holder came in again! */
61 mm->token_priority += 2; 76 mm->token_priority += 2;
77 goto update_priority;
78 }
79
80 if (current_interval < mm->last_interval)
81 mm->token_priority++;
82 else {
83 if (likely(mm->token_priority > 0))
84 mm->token_priority--;
62 } 85 }
63 86
87 /* Check if we deserve the token */
88 if (mm->token_priority > swap_token_mm->token_priority)
89 goto replace_token;
90
91update_priority:
92 trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
93
64out: 94out:
65 mm->faultstamp = global_faults; 95 mm->faultstamp = global_faults;
66 mm->last_interval = current_interval; 96 mm->last_interval = current_interval;
67 spin_unlock(&swap_token_lock); 97 spin_unlock(&swap_token_lock);
98 return;
99
100replace_token:
101 mm->token_priority += 2;
102 trace_replace_swap_token(swap_token_mm, mm);
103 swap_token_mm = mm;
104 swap_token_memcg = swap_token_memcg_from_mm(mm);
105 last_aging = global_faults;
106 goto out;
68} 107}
69 108
70/* Called on process exit. */ 109/* Called on process exit. */
71void __put_swap_token(struct mm_struct *mm) 110void __put_swap_token(struct mm_struct *mm)
72{ 111{
73 spin_lock(&swap_token_lock); 112 spin_lock(&swap_token_lock);
74 if (likely(mm == swap_token_mm)) 113 if (likely(mm == swap_token_mm)) {
114 trace_put_swap_token(swap_token_mm);
75 swap_token_mm = NULL; 115 swap_token_mm = NULL;
116 swap_token_memcg = NULL;
117 }
76 spin_unlock(&swap_token_lock); 118 spin_unlock(&swap_token_lock);
77} 119}
120
121static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
122{
123 if (!a)
124 return true;
125 if (!b)
126 return true;
127 if (a == b)
128 return true;
129 return false;
130}
131
132void disable_swap_token(struct mem_cgroup *memcg)
133{
134 /* memcg reclaim doesn't disable an unrelated mm's token. */
135 if (match_memcg(memcg, swap_token_memcg)) {
136 spin_lock(&swap_token_lock);
137 if (match_memcg(memcg, swap_token_memcg)) {
138 trace_disable_swap_token(swap_token_mm);
139 swap_token_mm = NULL;
140 swap_token_memcg = NULL;
141 }
142 spin_unlock(&swap_token_lock);
143 }
144}
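The aging added above halves the token holder's priority every TOKEN_AGING_INTERVAL (0xFF) global faults, so an idle holder eventually loses the token to a task that keeps faulting. A toy simulation of that policy (the faultstamp/last_interval bookkeeping is omitted and names are illustrative, so this is only an approximation of the kernel logic):

#include <stdio.h>

#define TOKEN_AGING_INTERVAL 0xFF

struct task { const char *name; unsigned prio; };

static struct task a = { "A", 0 }, b = { "B", 0 };
static struct task *holder;
static unsigned global_faults, last_aging;

/* toy version of grab_swap_token(): age the holder, maybe steal the token */
static void fault(struct task *t)
{
	global_faults++;
	if (!holder)
		goto replace;
	if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
		holder->prio /= 2;              /* holder decays over time */
		last_aging = global_faults;
	}
	if (t == holder) {
		t->prio += 2;                   /* holder reinforces itself */
		return;
	}
	t->prio++;                              /* contender builds priority */
	if (t->prio > holder->prio)
		goto replace;
	return;
replace:
	t->prio += 2;
	holder = t;
	last_aging = global_faults;
	printf("fault %5u: token -> %s (prio %u)\n",
	       global_faults, t->name, t->prio);
}

int main(void)
{
	/* A faults heavily first, then goes quiet while B keeps faulting */
	for (int i = 0; i < 2000; i++)
		fault(&a);
	for (int i = 0; i < 20000; i++)
		fault(&b);
	return 0;
}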
diff --git a/mm/truncate.c b/mm/truncate.c
index ba887bff48c5..e13f22efaad7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,6 +19,7 @@
19#include <linux/task_io_accounting_ops.h> 19#include <linux/task_io_accounting_ops.h>
20#include <linux/buffer_head.h> /* grr. try_to_release_page, 20#include <linux/buffer_head.h> /* grr. try_to_release_page,
21 do_invalidatepage */ 21 do_invalidatepage */
22#include <linux/cleancache.h>
22#include "internal.h" 23#include "internal.h"
23 24
24 25
@@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
51static inline void truncate_partial_page(struct page *page, unsigned partial) 52static inline void truncate_partial_page(struct page *page, unsigned partial)
52{ 53{
53 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 54 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
55 cleancache_flush_page(page->mapping, page);
54 if (page_has_private(page)) 56 if (page_has_private(page))
55 do_invalidatepage(page, partial); 57 do_invalidatepage(page, partial);
56} 58}
@@ -106,9 +108,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
106 cancel_dirty_page(page, PAGE_CACHE_SIZE); 108 cancel_dirty_page(page, PAGE_CACHE_SIZE);
107 109
108 clear_page_mlock(page); 110 clear_page_mlock(page);
109 remove_from_page_cache(page);
110 ClearPageMappedToDisk(page); 111 ClearPageMappedToDisk(page);
111 page_cache_release(page); /* pagecache ref */ 112 delete_from_page_cache(page);
112 return 0; 113 return 0;
113} 114}
114 115
@@ -215,6 +216,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
215 pgoff_t next; 216 pgoff_t next;
216 int i; 217 int i;
217 218
219 cleancache_flush_inode(mapping);
218 if (mapping->nrpages == 0) 220 if (mapping->nrpages == 0)
219 return; 221 return;
220 222
@@ -225,6 +227,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
225 next = start; 227 next = start;
226 while (next <= end && 228 while (next <= end &&
227 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 229 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
230 mem_cgroup_uncharge_start();
228 for (i = 0; i < pagevec_count(&pvec); i++) { 231 for (i = 0; i < pagevec_count(&pvec); i++) {
229 struct page *page = pvec.pages[i]; 232 struct page *page = pvec.pages[i];
230 pgoff_t page_index = page->index; 233 pgoff_t page_index = page->index;
@@ -247,6 +250,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
247 unlock_page(page); 250 unlock_page(page);
248 } 251 }
249 pagevec_release(&pvec); 252 pagevec_release(&pvec);
253 mem_cgroup_uncharge_end();
250 cond_resched(); 254 cond_resched();
251 } 255 }
252 256
@@ -290,6 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
290 pagevec_release(&pvec); 294 pagevec_release(&pvec);
291 mem_cgroup_uncharge_end(); 295 mem_cgroup_uncharge_end();
292 } 296 }
297 cleancache_flush_inode(mapping);
293} 298}
294EXPORT_SYMBOL(truncate_inode_pages_range); 299EXPORT_SYMBOL(truncate_inode_pages_range);
295 300
@@ -299,6 +304,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
299 * @lstart: offset from which to truncate 304 * @lstart: offset from which to truncate
300 * 305 *
301 * Called under (and serialised by) inode->i_mutex. 306 * Called under (and serialised by) inode->i_mutex.
307 *
308 * Note: When this function returns, there can be a page in the process of
309 * deletion (inside __delete_from_page_cache()) in the specified range. Thus
310 * mapping->nrpages can be non-zero when this function returns even after
311 * truncation of the whole mapping.
302 */ 312 */
303void truncate_inode_pages(struct address_space *mapping, loff_t lstart) 313void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
304{ 314{
@@ -320,11 +330,12 @@ EXPORT_SYMBOL(truncate_inode_pages);
320 * pagetables. 330 * pagetables.
321 */ 331 */
322unsigned long invalidate_mapping_pages(struct address_space *mapping, 332unsigned long invalidate_mapping_pages(struct address_space *mapping,
323 pgoff_t start, pgoff_t end) 333 pgoff_t start, pgoff_t end)
324{ 334{
325 struct pagevec pvec; 335 struct pagevec pvec;
326 pgoff_t next = start; 336 pgoff_t next = start;
327 unsigned long ret = 0; 337 unsigned long ret;
338 unsigned long count = 0;
328 int i; 339 int i;
329 340
330 pagevec_init(&pvec, 0); 341 pagevec_init(&pvec, 0);
@@ -351,9 +362,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
351 if (lock_failed) 362 if (lock_failed)
352 continue; 363 continue;
353 364
354 ret += invalidate_inode_page(page); 365 ret = invalidate_inode_page(page);
355
356 unlock_page(page); 366 unlock_page(page);
367 /*
368 * Invalidation is a hint that the page is no longer
369 * of interest, so try to speed up its reclaim.
370 */
371 if (!ret)
372 deactivate_page(page);
373 count += ret;
357 if (next > end) 374 if (next > end)
358 break; 375 break;
359 } 376 }
@@ -361,7 +378,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
361 mem_cgroup_uncharge_end(); 378 mem_cgroup_uncharge_end();
362 cond_resched(); 379 cond_resched();
363 } 380 }
364 return ret; 381 return count;
365} 382}
366EXPORT_SYMBOL(invalidate_mapping_pages); 383EXPORT_SYMBOL(invalidate_mapping_pages);
367 384
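invalidate_mapping_pages() is the path behind hints such as posix_fadvise(POSIX_FADV_DONTNEED); with this change, pages that cannot be invalidated (dirty or under writeback) are at least pushed toward reclaim via deactivate_page(). A hedged example of issuing that hint from userspace:

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct stat st;
	int fd, err;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0) {
		perror(argv[1]);
		return 1;
	}
	/* Ask the kernel to drop this file's clean page cache; dirty or
	 * writeback pages can't be invalidated and are only deactivated. */
	err = posix_fadvise(fd, 0, st.st_size, POSIX_FADV_DONTNEED);
	if (err)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
	return err ? 1 : 0;
}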
@@ -387,9 +404,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
387 404
388 clear_page_mlock(page); 405 clear_page_mlock(page);
389 BUG_ON(page_has_private(page)); 406 BUG_ON(page_has_private(page));
390 __remove_from_page_cache(page); 407 __delete_from_page_cache(page);
391 spin_unlock_irq(&mapping->tree_lock); 408 spin_unlock_irq(&mapping->tree_lock);
392 mem_cgroup_uncharge_cache_page(page); 409 mem_cgroup_uncharge_cache_page(page);
410
411 if (mapping->a_ops->freepage)
412 mapping->a_ops->freepage(page);
413
393 page_cache_release(page); /* pagecache ref */ 414 page_cache_release(page); /* pagecache ref */
394 return 1; 415 return 1;
395failed: 416failed:
@@ -428,6 +449,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
428 int did_range_unmap = 0; 449 int did_range_unmap = 0;
429 int wrapped = 0; 450 int wrapped = 0;
430 451
452 cleancache_flush_inode(mapping);
431 pagevec_init(&pvec, 0); 453 pagevec_init(&pvec, 0);
432 next = start; 454 next = start;
433 while (next <= end && !wrapped && 455 while (next <= end && !wrapped &&
@@ -486,6 +508,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
486 mem_cgroup_uncharge_end(); 508 mem_cgroup_uncharge_end();
487 cond_resched(); 509 cond_resched();
488 } 510 }
511 cleancache_flush_inode(mapping);
489 return ret; 512 return ret;
490} 513}
491EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 514EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
@@ -545,13 +568,12 @@ EXPORT_SYMBOL(truncate_pagecache);
545 * @inode: inode 568 * @inode: inode
546 * @newsize: new file size 569 * @newsize: new file size
547 * 570 *
548 * truncate_setsize updastes i_size update and performs pagecache 571 * truncate_setsize updates i_size and performs pagecache truncation (if
549 * truncation (if necessary) for a file size updates. It will be 572 * necessary) to @newsize. It will be typically be called from the filesystem's
550 * typically be called from the filesystem's setattr function when 573 * setattr function when ATTR_SIZE is passed in.
551 * ATTR_SIZE is passed in.
552 * 574 *
553 * Must be called with inode_mutex held and after all filesystem 575 * Must be called with inode_mutex held and before all filesystem specific
554 * specific block truncation has been performed. 576 * block truncation has been performed.
555 */ 577 */
556void truncate_setsize(struct inode *inode, loff_t newsize) 578void truncate_setsize(struct inode *inode, loff_t newsize)
557{ 579{
@@ -586,3 +608,27 @@ int vmtruncate(struct inode *inode, loff_t offset)
586 return 0; 608 return 0;
587} 609}
588EXPORT_SYMBOL(vmtruncate); 610EXPORT_SYMBOL(vmtruncate);
611
612int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
613{
614 struct address_space *mapping = inode->i_mapping;
615
616 /*
617 * If the underlying filesystem is not going to provide
618 * a way to truncate a range of blocks (punch a hole) -
619 * we should return failure right now.
620 */
621 if (!inode->i_op->truncate_range)
622 return -ENOSYS;
623
624 mutex_lock(&inode->i_mutex);
625 down_write(&inode->i_alloc_sem);
626 unmap_mapping_range(mapping, offset, (end - offset), 1);
627 inode->i_op->truncate_range(inode, offset, end);
628 /* unmap again to remove racily COWed private pages */
629 unmap_mapping_range(mapping, offset, (end - offset), 1);
630 up_write(&inode->i_alloc_sem);
631 mutex_unlock(&inode->i_mutex);
632
633 return 0;
634}
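vmtruncate_range() is the hole-punching backend for filesystems that implement ->truncate_range (shmem/tmpfs at this point); the usual userspace trigger in this era is madvise(MADV_REMOVE) on a shared mapping. A hedged example that punches a hole in shmem-backed memory:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	size_t len = 16 * pg;
	char *p;

	/* Anonymous shared memory is shmem-backed, so ->truncate_range exists */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, len);                   /* fault in and dirty the pages */

	/* Punch out pages 4..7: frees the backing store, later reads see zeroes */
	if (madvise(p + 4 * pg, 4 * pg, MADV_REMOVE) != 0) {
		perror("madvise(MADV_REMOVE)");  /* EOPNOTSUPP if unsupported */
		return 1;
	}
	printf("byte after punch: %#x (expect 0)\n", p[4 * pg]);
	return 0;
}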
diff --git a/mm/util.c b/mm/util.c
index 4735ea481816..88ea1bd661c0 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,6 +6,8 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <asm/uaccess.h> 7#include <asm/uaccess.h>
8 8
9#include "internal.h"
10
9#define CREATE_TRACE_POINTS 11#define CREATE_TRACE_POINTS
10#include <trace/events/kmem.h> 12#include <trace/events/kmem.h>
11 13
@@ -186,27 +188,6 @@ void kzfree(const void *p)
186} 188}
187EXPORT_SYMBOL(kzfree); 189EXPORT_SYMBOL(kzfree);
188 190
189int kern_ptr_validate(const void *ptr, unsigned long size)
190{
191 unsigned long addr = (unsigned long)ptr;
192 unsigned long min_addr = PAGE_OFFSET;
193 unsigned long align_mask = sizeof(void *) - 1;
194
195 if (unlikely(addr < min_addr))
196 goto out;
197 if (unlikely(addr > (unsigned long)high_memory - size))
198 goto out;
199 if (unlikely(addr & align_mask))
200 goto out;
201 if (unlikely(!kern_addr_valid(addr)))
202 goto out;
203 if (unlikely(!kern_addr_valid(addr + size - 1)))
204 goto out;
205 return 1;
206out:
207 return 0;
208}
209
210/* 191/*
211 * strndup_user - duplicate an existing string from user space 192 * strndup_user - duplicate an existing string from user space
212 * @s: The string to duplicate 193 * @s: The string to duplicate
@@ -236,6 +217,28 @@ char *strndup_user(const char __user *s, long n)
236} 217}
237EXPORT_SYMBOL(strndup_user); 218EXPORT_SYMBOL(strndup_user);
238 219
220void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
221 struct vm_area_struct *prev, struct rb_node *rb_parent)
222{
223 struct vm_area_struct *next;
224
225 vma->vm_prev = prev;
226 if (prev) {
227 next = prev->vm_next;
228 prev->vm_next = vma;
229 } else {
230 mm->mmap = vma;
231 if (rb_parent)
232 next = rb_entry(rb_parent,
233 struct vm_area_struct, vm_rb);
234 else
235 next = NULL;
236 }
237 vma->vm_next = next;
238 if (next)
239 next->vm_prev = vma;
240}
241
239#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 242#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
240void arch_pick_mmap_layout(struct mm_struct *mm) 243void arch_pick_mmap_layout(struct mm_struct *mm)
241{ 244{
@@ -245,6 +248,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
245} 248}
246#endif 249#endif
247 250
251/*
252 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
253 * back to the regular GUP.
254 * If the architecture does not support this function, simply return with no
255 * pages pinned.
256 */
257int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
258 int nr_pages, int write, struct page **pages)
259{
260 return 0;
261}
262EXPORT_SYMBOL_GPL(__get_user_pages_fast);
263
248/** 264/**
249 * get_user_pages_fast() - pin user pages in memory 265 * get_user_pages_fast() - pin user pages in memory
250 * @start: starting user address 266 * @start: starting user address
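The __attribute__((weak)) stub gives every architecture a default __get_user_pages_fast() that pins nothing, and an architecture with a real fast-GUP simply provides a strong definition that overrides it at link time. The same pattern in a standalone form (GCC/Clang on ELF; names are illustrative):

#include <stdio.h>

/* Weak default: used only if no other object file defines fast_pin(). */
int __attribute__((weak)) fast_pin(int nr_pages)
{
	return 0;       /* "architecture doesn't support it": nothing pinned */
}

int main(void)
{
	int pinned = fast_pin(8);

	if (pinned < 8)
		printf("fast path pinned %d/8, falling back to slow path\n",
		       pinned);
	return 0;
}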
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b8889da69a6..1d34d75366a7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,8 +31,6 @@
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h> 32#include <asm/shmparam.h>
33 33
34bool vmap_lazy_unmap __read_mostly = true;
35
36/*** Page table manipulation functions ***/ 34/*** Page table manipulation functions ***/
37 35
38static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 36static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
@@ -263,8 +261,15 @@ struct vmap_area {
263}; 261};
264 262
265static DEFINE_SPINLOCK(vmap_area_lock); 263static DEFINE_SPINLOCK(vmap_area_lock);
266static struct rb_root vmap_area_root = RB_ROOT;
267static LIST_HEAD(vmap_area_list); 264static LIST_HEAD(vmap_area_list);
265static struct rb_root vmap_area_root = RB_ROOT;
266
267/* The vmap cache globals are protected by vmap_area_lock */
268static struct rb_node *free_vmap_cache;
269static unsigned long cached_hole_size;
270static unsigned long cached_vstart;
271static unsigned long cached_align;
272
268static unsigned long vmap_area_pcpu_hole; 273static unsigned long vmap_area_pcpu_hole;
269 274
270static struct vmap_area *__find_vmap_area(unsigned long addr) 275static struct vmap_area *__find_vmap_area(unsigned long addr)
@@ -293,13 +298,13 @@ static void __insert_vmap_area(struct vmap_area *va)
293 struct rb_node *tmp; 298 struct rb_node *tmp;
294 299
295 while (*p) { 300 while (*p) {
296 struct vmap_area *tmp; 301 struct vmap_area *tmp_va;
297 302
298 parent = *p; 303 parent = *p;
299 tmp = rb_entry(parent, struct vmap_area, rb_node); 304 tmp_va = rb_entry(parent, struct vmap_area, rb_node);
300 if (va->va_start < tmp->va_end) 305 if (va->va_start < tmp_va->va_end)
301 p = &(*p)->rb_left; 306 p = &(*p)->rb_left;
302 else if (va->va_end > tmp->va_start) 307 else if (va->va_end > tmp_va->va_start)
303 p = &(*p)->rb_right; 308 p = &(*p)->rb_right;
304 else 309 else
305 BUG(); 310 BUG();
@@ -333,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
333 struct rb_node *n; 338 struct rb_node *n;
334 unsigned long addr; 339 unsigned long addr;
335 int purged = 0; 340 int purged = 0;
341 struct vmap_area *first;
336 342
337 BUG_ON(!size); 343 BUG_ON(!size);
338 BUG_ON(size & ~PAGE_MASK); 344 BUG_ON(size & ~PAGE_MASK);
345 BUG_ON(!is_power_of_2(align));
339 346
340 va = kmalloc_node(sizeof(struct vmap_area), 347 va = kmalloc_node(sizeof(struct vmap_area),
341 gfp_mask & GFP_RECLAIM_MASK, node); 348 gfp_mask & GFP_RECLAIM_MASK, node);
@@ -343,79 +350,106 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
343 return ERR_PTR(-ENOMEM); 350 return ERR_PTR(-ENOMEM);
344 351
345retry: 352retry:
346 addr = ALIGN(vstart, align);
347
348 spin_lock(&vmap_area_lock); 353 spin_lock(&vmap_area_lock);
349 if (addr + size - 1 < addr) 354 /*
350 goto overflow; 355 * Invalidate cache if we have more permissive parameters.
356 * cached_hole_size notes the largest hole noticed _below_
357 * the vmap_area cached in free_vmap_cache: if size fits
358 * into that hole, we want to scan from vstart to reuse
359 * the hole instead of allocating above free_vmap_cache.
360 * Note that __free_vmap_area may update free_vmap_cache
361 * without updating cached_hole_size or cached_align.
362 */
363 if (!free_vmap_cache ||
364 size < cached_hole_size ||
365 vstart < cached_vstart ||
366 align < cached_align) {
367nocache:
368 cached_hole_size = 0;
369 free_vmap_cache = NULL;
370 }
371 /* record if we encounter less permissive parameters */
372 cached_vstart = vstart;
373 cached_align = align;
374
375 /* find starting point for our search */
376 if (free_vmap_cache) {
377 first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
378 addr = ALIGN(first->va_end, align);
379 if (addr < vstart)
380 goto nocache;
381 if (addr + size - 1 < addr)
382 goto overflow;
383
384 } else {
385 addr = ALIGN(vstart, align);
386 if (addr + size - 1 < addr)
387 goto overflow;
351 388
352 /* XXX: could have a last_hole cache */ 389 n = vmap_area_root.rb_node;
353 n = vmap_area_root.rb_node; 390 first = NULL;
354 if (n) {
355 struct vmap_area *first = NULL;
356 391
357 do { 392 while (n) {
358 struct vmap_area *tmp; 393 struct vmap_area *tmp;
359 tmp = rb_entry(n, struct vmap_area, rb_node); 394 tmp = rb_entry(n, struct vmap_area, rb_node);
360 if (tmp->va_end >= addr) { 395 if (tmp->va_end >= addr) {
361 if (!first && tmp->va_start < addr + size)
362 first = tmp;
363 n = n->rb_left;
364 } else {
365 first = tmp; 396 first = tmp;
397 if (tmp->va_start <= addr)
398 break;
399 n = n->rb_left;
400 } else
366 n = n->rb_right; 401 n = n->rb_right;
367 } 402 }
368 } while (n);
369 403
370 if (!first) 404 if (!first)
371 goto found; 405 goto found;
372
373 if (first->va_end < addr) {
374 n = rb_next(&first->rb_node);
375 if (n)
376 first = rb_entry(n, struct vmap_area, rb_node);
377 else
378 goto found;
379 }
380
381 while (addr + size > first->va_start && addr + size <= vend) {
382 addr = ALIGN(first->va_end + PAGE_SIZE, align);
383 if (addr + size - 1 < addr)
384 goto overflow;
385
386 n = rb_next(&first->rb_node);
387 if (n)
388 first = rb_entry(n, struct vmap_area, rb_node);
389 else
390 goto found;
391 }
392 } 406 }
393found: 407
394 if (addr + size > vend) { 408 /* from the starting point, walk areas until a suitable hole is found */
395overflow: 409 while (addr + size > first->va_start && addr + size <= vend) {
396 spin_unlock(&vmap_area_lock); 410 if (addr + cached_hole_size < first->va_start)
397 if (!purged) { 411 cached_hole_size = first->va_start - addr;
398 purge_vmap_area_lazy(); 412 addr = ALIGN(first->va_end, align);
399 purged = 1; 413 if (addr + size - 1 < addr)
400 goto retry; 414 goto overflow;
401 } 415
402 if (printk_ratelimit()) 416 n = rb_next(&first->rb_node);
403 printk(KERN_WARNING 417 if (n)
404 "vmap allocation for size %lu failed: " 418 first = rb_entry(n, struct vmap_area, rb_node);
405 "use vmalloc=<size> to increase size.\n", size); 419 else
406 kfree(va); 420 goto found;
407 return ERR_PTR(-EBUSY);
408 } 421 }
409 422
410 BUG_ON(addr & (align-1)); 423found:
424 if (addr + size > vend)
425 goto overflow;
411 426
412 va->va_start = addr; 427 va->va_start = addr;
413 va->va_end = addr + size; 428 va->va_end = addr + size;
414 va->flags = 0; 429 va->flags = 0;
415 __insert_vmap_area(va); 430 __insert_vmap_area(va);
431 free_vmap_cache = &va->rb_node;
416 spin_unlock(&vmap_area_lock); 432 spin_unlock(&vmap_area_lock);
417 433
434 BUG_ON(va->va_start & (align-1));
435 BUG_ON(va->va_start < vstart);
436 BUG_ON(va->va_end > vend);
437
418 return va; 438 return va;
439
440overflow:
441 spin_unlock(&vmap_area_lock);
442 if (!purged) {
443 purge_vmap_area_lazy();
444 purged = 1;
445 goto retry;
446 }
447 if (printk_ratelimit())
448 printk(KERN_WARNING
449 "vmap allocation for size %lu failed: "
450 "use vmalloc=<size> to increase size.\n", size);
451 kfree(va);
452 return ERR_PTR(-EBUSY);
419} 453}
420 454
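The cache test at the top of alloc_vmap_area() above boils down to one rule: the cached search start is only reusable when the new request is at least as restrictive as the one that populated the cache. A minimal standalone sketch of that predicate (a hypothetical helper; the parameter names mirror the variables in the hunk above):

/* Illustrative only: restates the nocache test in alloc_vmap_area(). */
static int can_reuse_free_vmap_cache(unsigned long size, unsigned long align,
				     unsigned long vstart,
				     unsigned long cached_hole_size,
				     unsigned long cached_vstart,
				     unsigned long cached_align,
				     const void *free_vmap_cache)
{
	if (!free_vmap_cache)
		return 0;	/* nothing cached yet */
	if (size < cached_hole_size)
		return 0;	/* a known hole below the cache could fit this */
	if (vstart < cached_vstart)
		return 0;	/* caller wants to search lower than last time */
	if (align < cached_align)
		return 0;	/* looser alignment may fit in gaps we skipped */
	return 1;
}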
421static void rcu_free_va(struct rcu_head *head) 455static void rcu_free_va(struct rcu_head *head)
@@ -428,6 +462,22 @@ static void rcu_free_va(struct rcu_head *head)
428static void __free_vmap_area(struct vmap_area *va) 462static void __free_vmap_area(struct vmap_area *va)
429{ 463{
430 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 464 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
465
466 if (free_vmap_cache) {
467 if (va->va_end < cached_vstart) {
468 free_vmap_cache = NULL;
469 } else {
470 struct vmap_area *cache;
471 cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
472 if (va->va_start <= cache->va_start) {
473 free_vmap_cache = rb_prev(&va->rb_node);
474 /*
475 * We don't try to update cached_hole_size or
476 * cached_align, but it won't go very wrong.
477 */
478 }
479 }
480 }
431 rb_erase(&va->rb_node, &vmap_area_root); 481 rb_erase(&va->rb_node, &vmap_area_root);
432 RB_CLEAR_NODE(&va->rb_node); 482 RB_CLEAR_NODE(&va->rb_node);
433 list_del_rcu(&va->list); 483 list_del_rcu(&va->list);
@@ -503,9 +553,6 @@ static unsigned long lazy_max_pages(void)
503{ 553{
504 unsigned int log; 554 unsigned int log;
505 555
506 if (!vmap_lazy_unmap)
507 return 0;
508
509 log = fls(num_online_cpus()); 556 log = fls(num_online_cpus());
510 557
511 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 558 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
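As a quick sanity check of the arithmetic above, a standalone user-space model (assuming 4 KiB pages and treating fls() as the index of the highest set bit):

#include <stdio.h>

/* Illustrative only: user-space model of lazy_max_pages(). */
static unsigned long model_lazy_max_pages(unsigned int online_cpus,
					  unsigned long page_size)
{
	unsigned int log = 0;

	while (online_cpus) {		/* fls(): index of the highest set bit */
		log++;
		online_cpus >>= 1;
	}
	return log * (32UL * 1024 * 1024 / page_size);
}

int main(void)
{
	/* 8 CPUs, 4 KiB pages: fls(8) = 4, so 4 * 32 MiB = 32768 pages */
	printf("%lu\n", model_lazy_max_pages(8, 4096));
	return 0;
}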
@@ -517,6 +564,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
517static void purge_fragmented_blocks_allcpus(void); 564static void purge_fragmented_blocks_allcpus(void);
518 565
519/* 566/*
567 * called before a call to iounmap() if the caller wants vm_area_struct's
568 * immediately freed.
569 */
570void set_iounmap_nonlazy(void)
571{
572 atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
573}
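A caller that needs the mapping torn down right away (the crash-dump read path is the motivating user) would pair the two calls roughly like this hypothetical sketch:

/* Illustrative only: force the next unmap to purge lazily-freed areas. */
static void unmap_regs_now(void __iomem *regs)
{
	set_iounmap_nonlazy();
	iounmap(regs);
}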
574
575/*
520 * Purges all lazily-freed vmap areas. 576 * Purges all lazily-freed vmap areas.
521 * 577 *
522 * If sync is 0 then don't purge if there is already a purge in progress. 578 * If sync is 0 then don't purge if there is already a purge in progress.
@@ -557,7 +613,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
557 if (va->va_end > *end) 613 if (va->va_end > *end)
558 *end = va->va_end; 614 *end = va->va_end;
559 nr += (va->va_end - va->va_start) >> PAGE_SHIFT; 615 nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
560 unmap_vmap_area(va);
561 list_add_tail(&va->purge_list, &valist); 616 list_add_tail(&va->purge_list, &valist);
562 va->flags |= VM_LAZY_FREEING; 617 va->flags |= VM_LAZY_FREEING;
563 va->flags &= ~VM_LAZY_FREE; 618 va->flags &= ~VM_LAZY_FREE;
@@ -602,10 +657,11 @@ static void purge_vmap_area_lazy(void)
602} 657}
603 658
604/* 659/*
605 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been 660 * Free a vmap area, caller ensuring that the area has been unmapped
606 * called for the correct range previously. 661 * and flush_cache_vunmap had been called for the correct range
662 * previously.
607 */ 663 */
608static void free_unmap_vmap_area_noflush(struct vmap_area *va) 664static void free_vmap_area_noflush(struct vmap_area *va)
609{ 665{
610 va->flags |= VM_LAZY_FREE; 666 va->flags |= VM_LAZY_FREE;
611 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); 667 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
@@ -614,6 +670,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va)
614} 670}
615 671
616/* 672/*
673 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
674 * called for the correct range previously.
675 */
676static void free_unmap_vmap_area_noflush(struct vmap_area *va)
677{
678 unmap_vmap_area(va);
679 free_vmap_area_noflush(va);
680}
681
682/*
617 * Free and unmap a vmap area 683 * Free and unmap a vmap area
618 */ 684 */
619static void free_unmap_vmap_area(struct vmap_area *va) 685static void free_unmap_vmap_area(struct vmap_area *va)
@@ -734,7 +800,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
734 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 800 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
735 VMALLOC_START, VMALLOC_END, 801 VMALLOC_START, VMALLOC_END,
736 node, gfp_mask); 802 node, gfp_mask);
737 if (unlikely(IS_ERR(va))) { 803 if (IS_ERR(va)) {
738 kfree(vb); 804 kfree(vb);
739 return ERR_CAST(va); 805 return ERR_CAST(va);
740 } 806 }
@@ -789,7 +855,7 @@ static void free_vmap_block(struct vmap_block *vb)
789 spin_unlock(&vmap_block_tree_lock); 855 spin_unlock(&vmap_block_tree_lock);
790 BUG_ON(tmp != vb); 856 BUG_ON(tmp != vb);
791 857
792 free_unmap_vmap_area_noflush(vb->va); 858 free_vmap_area_noflush(vb->va);
793 call_rcu(&vb->rcu_head, rcu_free_vb); 859 call_rcu(&vb->rcu_head, rcu_free_vb);
794} 860}
795 861
@@ -927,6 +993,8 @@ static void vb_free(const void *addr, unsigned long size)
927 rcu_read_unlock(); 993 rcu_read_unlock();
928 BUG_ON(!vb); 994 BUG_ON(!vb);
929 995
996 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
997
930 spin_lock(&vb->lock); 998 spin_lock(&vb->lock);
931 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); 999 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
932 1000
@@ -979,7 +1047,6 @@ void vm_unmap_aliases(void)
979 1047
980 s = vb->va->va_start + (i << PAGE_SHIFT); 1048 s = vb->va->va_start + (i << PAGE_SHIFT);
981 e = vb->va->va_start + (j << PAGE_SHIFT); 1049 e = vb->va->va_start + (j << PAGE_SHIFT);
982 vunmap_page_range(s, e);
983 flush = 1; 1050 flush = 1;
984 1051
985 if (s < start) 1052 if (s < start)
@@ -1160,6 +1227,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
1160{ 1227{
1161 vunmap_page_range(addr, addr + size); 1228 vunmap_page_range(addr, addr + size);
1162} 1229}
1230EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
1163 1231
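With the symbol exported, a module can tear down page tables itself and do the cache/TLB maintenance around it; a hedged sketch (the flush calls follow the usual contract for the _noflush variants, they are not something this patch adds):

/* Illustrative only: caller handles cache and TLB flushing around the unmap. */
static void unmap_region_explicit_flush(unsigned long addr, unsigned long size)
{
	flush_cache_vunmap(addr, addr + size);
	unmap_kernel_range_noflush(addr, size);
	flush_tlb_kernel_range(addr, addr + size);
}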
1164/** 1232/**
1165 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB 1233 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
@@ -1300,13 +1368,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1300 -1, GFP_KERNEL, caller); 1368 -1, GFP_KERNEL, caller);
1301} 1369}
1302 1370
1303struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1304 int node, gfp_t gfp_mask)
1305{
1306 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1307 node, gfp_mask, __builtin_return_address(0));
1308}
1309
1310static struct vm_struct *find_vm_area(const void *addr) 1371static struct vm_struct *find_vm_area(const void *addr)
1311{ 1372{
1312 struct vmap_area *va; 1373 struct vmap_area *va;
@@ -1473,6 +1534,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1473static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1534static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1474 pgprot_t prot, int node, void *caller) 1535 pgprot_t prot, int node, void *caller)
1475{ 1536{
1537 const int order = 0;
1476 struct page **pages; 1538 struct page **pages;
1477 unsigned int nr_pages, array_size, i; 1539 unsigned int nr_pages, array_size, i;
1478 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1540 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
@@ -1499,11 +1561,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1499 1561
1500 for (i = 0; i < area->nr_pages; i++) { 1562 for (i = 0; i < area->nr_pages; i++) {
1501 struct page *page; 1563 struct page *page;
1564 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
1502 1565
1503 if (node < 0) 1566 if (node < 0)
1504 page = alloc_page(gfp_mask); 1567 page = alloc_page(tmp_mask);
1505 else 1568 else
1506 page = alloc_pages_node(node, gfp_mask, 0); 1569 page = alloc_pages_node(node, tmp_mask, order);
1507 1570
1508 if (unlikely(!page)) { 1571 if (unlikely(!page)) {
1509 /* Successfully allocated i pages, free them in __vunmap() */ 1572 /* Successfully allocated i pages, free them in __vunmap() */
@@ -1518,29 +1581,19 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1518 return area->addr; 1581 return area->addr;
1519 1582
1520fail: 1583fail:
1584 warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, "
1585 "allocated %ld of %ld bytes\n",
1586 (area->nr_pages*PAGE_SIZE), area->size);
1521 vfree(area->addr); 1587 vfree(area->addr);
1522 return NULL; 1588 return NULL;
1523} 1589}
1524 1590
1525void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1526{
1527 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
1528 __builtin_return_address(0));
1529
1530 /*
1531 * A ref_count = 3 is needed because the vm_struct and vmap_area
1532 * structures allocated in the __get_vm_area_node() function contain
1533 * references to the virtual address of the vmalloc'ed block.
1534 */
1535 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
1536
1537 return addr;
1538}
1539
1540/** 1591/**
1541 * __vmalloc_node - allocate virtually contiguous memory 1592 * __vmalloc_node_range - allocate virtually contiguous memory
1542 * @size: allocation size 1593 * @size: allocation size
1543 * @align: desired alignment 1594 * @align: desired alignment
1595 * @start: vm area range start
1596 * @end: vm area range end
1544 * @gfp_mask: flags for the page level allocator 1597 * @gfp_mask: flags for the page level allocator
1545 * @prot: protection mask for the allocated pages 1598 * @prot: protection mask for the allocated pages
1546 * @node: node to use for allocation or -1 1599 * @node: node to use for allocation or -1
@@ -1550,9 +1603,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1550 * allocator with @gfp_mask flags. Map them into contiguous 1603 * allocator with @gfp_mask flags. Map them into contiguous
1551 * kernel virtual space, using a pagetable protection of @prot. 1604 * kernel virtual space, using a pagetable protection of @prot.
1552 */ 1605 */
1553static void *__vmalloc_node(unsigned long size, unsigned long align, 1606void *__vmalloc_node_range(unsigned long size, unsigned long align,
1554 gfp_t gfp_mask, pgprot_t prot, 1607 unsigned long start, unsigned long end, gfp_t gfp_mask,
1555 int node, void *caller) 1608 pgprot_t prot, int node, void *caller)
1556{ 1609{
1557 struct vm_struct *area; 1610 struct vm_struct *area;
1558 void *addr; 1611 void *addr;
@@ -1562,8 +1615,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1562 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1615 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1563 return NULL; 1616 return NULL;
1564 1617
1565 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, 1618 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
1566 VMALLOC_END, node, gfp_mask, caller); 1619 gfp_mask, caller);
1567 1620
1568 if (!area) 1621 if (!area)
1569 return NULL; 1622 return NULL;
@@ -1580,6 +1633,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1580 return addr; 1633 return addr;
1581} 1634}
1582 1635
1636/**
1637 * __vmalloc_node - allocate virtually contiguous memory
1638 * @size: allocation size
1639 * @align: desired alignment
1640 * @gfp_mask: flags for the page level allocator
1641 * @prot: protection mask for the allocated pages
1642 * @node: node to use for allocation or -1
1643 * @caller: caller's return address
1644 *
1645 * Allocate enough pages to cover @size from the page level
1646 * allocator with @gfp_mask flags. Map them into contiguous
1647 * kernel virtual space, using a pagetable protection of @prot.
1648 */
1649static void *__vmalloc_node(unsigned long size, unsigned long align,
1650 gfp_t gfp_mask, pgprot_t prot,
1651 int node, void *caller)
1652{
1653 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1654 gfp_mask, prot, node, caller);
1655}
1656
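One intended user of the range-aware entry point is arch code that must keep allocations inside a dedicated window, e.g. module space. A hedged sketch (MODULES_VADDR/MODULES_END stand in for whatever window the architecture defines):

/* Illustrative only: place an allocation inside an arch-specific window. */
static void *alloc_in_module_space(unsigned long size)
{
	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
				    GFP_KERNEL, PAGE_KERNEL_EXEC, -1,
				    __builtin_return_address(0));
}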
1583void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1657void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1584{ 1658{
1585 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1659 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
@@ -1587,6 +1661,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1587} 1661}
1588EXPORT_SYMBOL(__vmalloc); 1662EXPORT_SYMBOL(__vmalloc);
1589 1663
1664static inline void *__vmalloc_node_flags(unsigned long size,
1665 int node, gfp_t flags)
1666{
1667 return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
1668 node, __builtin_return_address(0));
1669}
1670
1590/** 1671/**
1591 * vmalloc - allocate virtually contiguous memory 1672 * vmalloc - allocate virtually contiguous memory
1592 * @size: allocation size 1673 * @size: allocation size
@@ -1598,12 +1679,28 @@ EXPORT_SYMBOL(__vmalloc);
1598 */ 1679 */
1599void *vmalloc(unsigned long size) 1680void *vmalloc(unsigned long size)
1600{ 1681{
1601 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1682 return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
1602 -1, __builtin_return_address(0));
1603} 1683}
1604EXPORT_SYMBOL(vmalloc); 1684EXPORT_SYMBOL(vmalloc);
1605 1685
1606/** 1686/**
1687 * vzalloc - allocate virtually contiguous memory with zero fill
1688 * @size: allocation size
1689 * Allocate enough pages to cover @size from the page level
1690 * allocator and map them into contiguous kernel virtual space.
1691 * The memory allocated is set to zero.
1692 *
1693 * For tight control over page level allocator and protection flags
1694 * use __vmalloc() instead.
1695 */
1696void *vzalloc(unsigned long size)
1697{
1698 return __vmalloc_node_flags(size, -1,
1699 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1700}
1701EXPORT_SYMBOL(vzalloc);
1702
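vzalloc() is simply the vmalloc()-plus-memset() idiom folded into one call; a minimal sketch of a hypothetical caller:

/* Illustrative only: zeroed, virtually contiguous allocation. */
static unsigned long *alloc_counters(unsigned long nr)
{
	return vzalloc(nr * sizeof(unsigned long));	/* freed later with vfree() */
}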
1703/**
1607 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 1704 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
1608 * @size: allocation size 1705 * @size: allocation size
1609 * 1706 *
@@ -1644,6 +1741,25 @@ void *vmalloc_node(unsigned long size, int node)
1644} 1741}
1645EXPORT_SYMBOL(vmalloc_node); 1742EXPORT_SYMBOL(vmalloc_node);
1646 1743
1744/**
1745 * vzalloc_node - allocate memory on a specific node with zero fill
1746 * @size: allocation size
1747 * @node: numa node
1748 *
1749 * Allocate enough pages to cover @size from the page level
1750 * allocator and map them into contiguous kernel virtual space.
1751 * The memory allocated is set to zero.
1752 *
1753 * For tight control over page level allocator and protection flags
1754 * use __vmalloc_node() instead.
1755 */
1756void *vzalloc_node(unsigned long size, int node)
1757{
1758 return __vmalloc_node_flags(size, node,
1759 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1760}
1761EXPORT_SYMBOL(vzalloc_node);
1762
1647#ifndef PAGE_KERNEL_EXEC 1763#ifndef PAGE_KERNEL_EXEC
1648# define PAGE_KERNEL_EXEC PAGE_KERNEL 1764# define PAGE_KERNEL_EXEC PAGE_KERNEL
1649#endif 1765#endif
@@ -1892,8 +2008,6 @@ finished:
1892 * should know vmalloc() area is valid and can use memcpy(). 2008 * should know vmalloc() area is valid and can use memcpy().
1893 * This is for routines which have to access vmalloc area without 2009 * This is for routines which have to access vmalloc area without
1894 * any informaion, as /dev/kmem. 2010 * any informaion, as /dev/kmem.
1895 *
1896 * The caller should guarantee KM_USER1 is not used.
1897 */ 2011 */
1898 2012
1899long vwrite(char *buf, char *addr, unsigned long count) 2013long vwrite(char *buf, char *addr, unsigned long count)
@@ -2039,10 +2153,6 @@ struct vm_struct *alloc_vm_area(size_t size)
2039 return NULL; 2153 return NULL;
2040 } 2154 }
2041 2155
2042 /* Make sure the pagetables are constructed in process kernel
2043 mappings */
2044 vmalloc_sync_all();
2045
2046 return area; 2156 return area;
2047} 2157}
2048EXPORT_SYMBOL_GPL(alloc_vm_area); 2158EXPORT_SYMBOL_GPL(alloc_vm_area);
@@ -2056,6 +2166,7 @@ void free_vm_area(struct vm_struct *area)
2056} 2166}
2057EXPORT_SYMBOL_GPL(free_vm_area); 2167EXPORT_SYMBOL_GPL(free_vm_area);
2058 2168
2169#ifdef CONFIG_SMP
2059static struct vmap_area *node_to_va(struct rb_node *n) 2170static struct vmap_area *node_to_va(struct rb_node *n)
2060{ 2171{
2061 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; 2172 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
@@ -2145,17 +2256,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2145 * @sizes: array containing size of each area 2256 * @sizes: array containing size of each area
2146 * @nr_vms: the number of areas to allocate 2257 * @nr_vms: the number of areas to allocate
2147 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 2258 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2148 * @gfp_mask: allocation mask
2149 * 2259 *
2150 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 2260 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2151 * vm_structs on success, %NULL on failure 2261 * vm_structs on success, %NULL on failure
2152 * 2262 *
2153 * Percpu allocator wants to use congruent vm areas so that it can 2263 * Percpu allocator wants to use congruent vm areas so that it can
2154 * maintain the offsets among percpu areas. This function allocates 2264 * maintain the offsets among percpu areas. This function allocates
2155 * congruent vmalloc areas for it. These areas tend to be scattered 2265 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
2156 * pretty far, distance between two areas easily going up to 2266 * be scattered pretty far, distance between two areas easily going up
2157 * gigabytes. To avoid interacting with regular vmallocs, these areas 2267 * to gigabytes. To avoid interacting with regular vmallocs, these
2158 * are allocated from top. 2268 * areas are allocated from top.
2159 * 2269 *
2160 * Despite its complicated look, this allocator is rather simple. It 2270 * Despite its complicated look, this allocator is rather simple. It
2161 * does everything top-down and scans areas from the end looking for 2271 * does everything top-down and scans areas from the end looking for
@@ -2166,7 +2276,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2166 */ 2276 */
2167struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 2277struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2168 const size_t *sizes, int nr_vms, 2278 const size_t *sizes, int nr_vms,
2169 size_t align, gfp_t gfp_mask) 2279 size_t align)
2170{ 2280{
2171 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 2281 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2172 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2282 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2176,8 +2286,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2176 unsigned long base, start, end, last_end; 2286 unsigned long base, start, end, last_end;
2177 bool purged = false; 2287 bool purged = false;
2178 2288
2179 gfp_mask &= GFP_RECLAIM_MASK;
2180
2181 /* verify parameters and allocate data structures */ 2289 /* verify parameters and allocate data structures */
2182 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); 2290 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
2183 for (last_area = 0, area = 0; area < nr_vms; area++) { 2291 for (last_area = 0, area = 0; area < nr_vms; area++) {
@@ -2210,14 +2318,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2210 return NULL; 2318 return NULL;
2211 } 2319 }
2212 2320
2213 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); 2321 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
2214 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); 2322 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
2215 if (!vas || !vms) 2323 if (!vas || !vms)
2216 goto err_free; 2324 goto err_free;
2217 2325
2218 for (area = 0; area < nr_vms; area++) { 2326 for (area = 0; area < nr_vms; area++) {
2219 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); 2327 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
2220 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); 2328 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
2221 if (!vas[area] || !vms[area]) 2329 if (!vas[area] || !vms[area])
2222 goto err_free; 2330 goto err_free;
2223 } 2331 }
@@ -2336,9 +2444,11 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2336 free_vm_area(vms[i]); 2444 free_vm_area(vms[i]);
2337 kfree(vms); 2445 kfree(vms);
2338} 2446}
2447#endif /* CONFIG_SMP */
2339 2448
2340#ifdef CONFIG_PROC_FS 2449#ifdef CONFIG_PROC_FS
2341static void *s_start(struct seq_file *m, loff_t *pos) 2450static void *s_start(struct seq_file *m, loff_t *pos)
2451 __acquires(&vmlist_lock)
2342{ 2452{
2343 loff_t n = *pos; 2453 loff_t n = *pos;
2344 struct vm_struct *v; 2454 struct vm_struct *v;
@@ -2365,6 +2475,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2365} 2475}
2366 2476
2367static void s_stop(struct seq_file *m, void *p) 2477static void s_stop(struct seq_file *m, void *p)
2478 __releases(&vmlist_lock)
2368{ 2479{
2369 read_unlock(&vmlist_lock); 2480 read_unlock(&vmlist_lock);
2370} 2481}
@@ -2395,13 +2506,8 @@ static int s_show(struct seq_file *m, void *p)
2395 seq_printf(m, "0x%p-0x%p %7ld", 2506 seq_printf(m, "0x%p-0x%p %7ld",
2396 v->addr, v->addr + v->size, v->size); 2507 v->addr, v->addr + v->size, v->size);
2397 2508
2398 if (v->caller) { 2509 if (v->caller)
2399 char buff[KSYM_SYMBOL_LEN]; 2510 seq_printf(m, " %pS", v->caller);
2400
2401 seq_putc(m, ' ');
2402 sprint_symbol(buff, (unsigned long)v->caller);
2403 seq_puts(m, buff);
2404 }
2405 2511
2406 if (v->nr_pages) 2512 if (v->nr_pages)
2407 seq_printf(m, " pages=%d", v->nr_pages); 2513 seq_printf(m, " pages=%d", v->nr_pages);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf25f11..d036e59d302b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
32#include <linux/topology.h> 32#include <linux/topology.h>
33#include <linux/cpu.h> 33#include <linux/cpu.h>
34#include <linux/cpuset.h> 34#include <linux/cpuset.h>
35#include <linux/compaction.h>
35#include <linux/notifier.h> 36#include <linux/notifier.h>
36#include <linux/rwsem.h> 37#include <linux/rwsem.h>
37#include <linux/delay.h> 38#include <linux/delay.h>
@@ -40,6 +41,8 @@
40#include <linux/memcontrol.h> 41#include <linux/memcontrol.h>
41#include <linux/delayacct.h> 42#include <linux/delayacct.h>
42#include <linux/sysctl.h> 43#include <linux/sysctl.h>
44#include <linux/oom.h>
45#include <linux/prefetch.h>
43 46
44#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
45#include <asm/div64.h> 48#include <asm/div64.h>
@@ -51,6 +54,24 @@
51#define CREATE_TRACE_POINTS 54#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h> 55#include <trace/events/vmscan.h>
53 56
57/*
58 * reclaim_mode determines how the inactive list is shrunk
59 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
60 * RECLAIM_MODE_ASYNC: Do not block
61 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
62 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
63 * page from the LRU and reclaim all pages within a
64 * naturally aligned range
65 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
66 * order-0 pages and then compact the zone
67 */
68typedef unsigned __bitwise__ reclaim_mode_t;
69#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
70#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
71#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
72#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
73#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
74
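Because reclaim_mode is a bitmask, a mode is always one "what" flag (SINGLE, LUMPYRECLAIM or COMPACTION) combined with one "how" flag (ASYNC or SYNC), and callers test it with a bitwise AND, for example:

/* Illustrative only: testing the combined mode bits. */
static int reclaim_may_block(reclaim_mode_t mode)
{
	/* e.g. RECLAIM_MODE_COMPACTION | RECLAIM_MODE_SYNC for costly orders */
	return (mode & RECLAIM_MODE_SYNC) != 0;
}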
54struct scan_control { 75struct scan_control {
55 /* Incremented by the number of inactive pages that were scanned */ 76 /* Incremented by the number of inactive pages that were scanned */
56 unsigned long nr_scanned; 77 unsigned long nr_scanned;
@@ -79,10 +100,10 @@ struct scan_control {
79 int order; 100 int order;
80 101
81 /* 102 /*
82 * Intend to reclaim enough contenious memory rather than to reclaim 103 * Intend to reclaim enough continuous memory rather than reclaim
 83 * enough amount memory. I.e, it's the mode for high order allocation. 104 * enough amount of memory, i.e., the mode for high-order allocation.
84 */ 105 */
85 bool lumpy_reclaim_mode; 106 reclaim_mode_t reclaim_mode;
86 107
87 /* Which cgroup do we reclaim from */ 108 /* Which cgroup do we reclaim from */
88 struct mem_cgroup *mem_cgroup; 109 struct mem_cgroup *mem_cgroup;
@@ -152,7 +173,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
152 struct scan_control *sc, enum lru_list lru) 173 struct scan_control *sc, enum lru_list lru)
153{ 174{
154 if (!scanning_global_lru(sc)) 175 if (!scanning_global_lru(sc))
155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); 176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
156 177
157 return zone_page_state(zone, NR_LRU_BASE + lru); 178 return zone_page_state(zone, NR_LRU_BASE + lru);
158} 179}
@@ -181,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker)
181} 202}
182EXPORT_SYMBOL(unregister_shrinker); 203EXPORT_SYMBOL(unregister_shrinker);
183 204
205static inline int do_shrinker_shrink(struct shrinker *shrinker,
206 struct shrink_control *sc,
207 unsigned long nr_to_scan)
208{
209 sc->nr_to_scan = nr_to_scan;
210 return (*shrinker->shrink)(shrinker, sc);
211}
212
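do_shrinker_shrink() wraps the new callback convention: the ->shrink hook is handed a struct shrink_control, and a zero nr_to_scan means "just report how many objects you have". A hedged sketch of a hypothetical cache using it (my_cache_count()/my_cache_evict() are made-up helpers):

/* Illustrative only: hypothetical shrinker on the shrink_control API. */
static int my_cache_shrink(struct shrinker *s, struct shrink_control *sc)
{
	if (!sc->nr_to_scan)
		return my_cache_count();	/* query pass: report pool size */

	my_cache_evict(sc->nr_to_scan);		/* try to drop that many objects */
	return my_cache_count();		/* objects left (or -1 on no progress) */
}

static struct shrinker my_cache_shrinker = {
	.shrink	= my_cache_shrink,
	.seeks	= DEFAULT_SEEKS,		/* registered via register_shrinker() */
};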
184#define SHRINK_BATCH 128 213#define SHRINK_BATCH 128
185/* 214/*
186 * Call the shrink functions to age shrinkable caches 215 * Call the shrink functions to age shrinkable caches
@@ -201,25 +230,29 @@ EXPORT_SYMBOL(unregister_shrinker);
201 * 230 *
202 * Returns the number of slab objects which we shrunk. 231 * Returns the number of slab objects which we shrunk.
203 */ 232 */
204unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, 233unsigned long shrink_slab(struct shrink_control *shrink,
205 unsigned long lru_pages) 234 unsigned long nr_pages_scanned,
235 unsigned long lru_pages)
206{ 236{
207 struct shrinker *shrinker; 237 struct shrinker *shrinker;
208 unsigned long ret = 0; 238 unsigned long ret = 0;
209 239
210 if (scanned == 0) 240 if (nr_pages_scanned == 0)
211 scanned = SWAP_CLUSTER_MAX; 241 nr_pages_scanned = SWAP_CLUSTER_MAX;
212 242
213 if (!down_read_trylock(&shrinker_rwsem)) 243 if (!down_read_trylock(&shrinker_rwsem)) {
214 return 1; /* Assume we'll be able to shrink next time */ 244 /* Assume we'll be able to shrink next time */
245 ret = 1;
246 goto out;
247 }
215 248
216 list_for_each_entry(shrinker, &shrinker_list, list) { 249 list_for_each_entry(shrinker, &shrinker_list, list) {
217 unsigned long long delta; 250 unsigned long long delta;
218 unsigned long total_scan; 251 unsigned long total_scan;
219 unsigned long max_pass; 252 unsigned long max_pass;
220 253
221 max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); 254 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
222 delta = (4 * scanned) / shrinker->seeks; 255 delta = (4 * nr_pages_scanned) / shrinker->seeks;
223 delta *= max_pass; 256 delta *= max_pass;
224 do_div(delta, lru_pages + 1); 257 do_div(delta, lru_pages + 1);
225 shrinker->nr += delta; 258 shrinker->nr += delta;
@@ -246,9 +279,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
246 int shrink_ret; 279 int shrink_ret;
247 int nr_before; 280 int nr_before;
248 281
249 nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); 282 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
250 shrink_ret = (*shrinker->shrink)(shrinker, this_scan, 283 shrink_ret = do_shrinker_shrink(shrinker, shrink,
251 gfp_mask); 284 this_scan);
252 if (shrink_ret == -1) 285 if (shrink_ret == -1)
253 break; 286 break;
254 if (shrink_ret < nr_before) 287 if (shrink_ret < nr_before)
@@ -262,9 +295,44 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
262 shrinker->nr += total_scan; 295 shrinker->nr += total_scan;
263 } 296 }
264 up_read(&shrinker_rwsem); 297 up_read(&shrinker_rwsem);
298out:
299 cond_resched();
265 return ret; 300 return ret;
266} 301}
267 302
303static void set_reclaim_mode(int priority, struct scan_control *sc,
304 bool sync)
305{
306 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
307
308 /*
309 * Initially assume we are entering either lumpy reclaim or
 310 * reclaim/compaction. Depending on the order, we will either set the
311 * sync mode or just reclaim order-0 pages later.
312 */
313 if (COMPACTION_BUILD)
314 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
315 else
316 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
317
318 /*
319 * Avoid using lumpy reclaim or reclaim/compaction if possible by
 320 * restricting when it's set to either costly allocations or when
 321 * under memory pressure.
322 */
323 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
324 sc->reclaim_mode |= syncmode;
325 else if (sc->order && priority < DEF_PRIORITY - 2)
326 sc->reclaim_mode |= syncmode;
327 else
328 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
329}
330
331static void reset_reclaim_mode(struct scan_control *sc)
332{
333 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
334}
335
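Spelled out, set_reclaim_mode() resolves to three outcomes; a standalone model of the decision (assuming the usual PAGE_ALLOC_COSTLY_ORDER of 3 and DEF_PRIORITY of 12):

/* Illustrative only: user-space model of the set_reclaim_mode() decision. */
#define MODEL_COSTLY_ORDER	3		/* PAGE_ALLOC_COSTLY_ORDER */
#define MODEL_DEF_PRIORITY	12		/* DEF_PRIORITY */

static unsigned int model_reclaim_mode(int order, int priority,
				       int compaction_build, int sync)
{
	unsigned int base = compaction_build ? 0x10u /* COMPACTION */
					     : 0x08u /* LUMPYRECLAIM */;
	unsigned int how = sync ? 0x04u /* SYNC */ : 0x02u /* ASYNC */;

	if (order > MODEL_COSTLY_ORDER)
		return base | how;		/* costly allocation */
	if (order && priority < MODEL_DEF_PRIORITY - 2)
		return base | how;		/* under sustained pressure */
	return 0x01u | 0x02u;			/* SINGLE | ASYNC */
}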
268static inline int is_page_cache_freeable(struct page *page) 336static inline int is_page_cache_freeable(struct page *page)
269{ 337{
270 /* 338 /*
@@ -275,7 +343,8 @@ static inline int is_page_cache_freeable(struct page *page)
275 return page_count(page) - page_has_private(page) == 2; 343 return page_count(page) - page_has_private(page) == 2;
276} 344}
277 345
278static int may_write_to_queue(struct backing_dev_info *bdi) 346static int may_write_to_queue(struct backing_dev_info *bdi,
347 struct scan_control *sc)
279{ 348{
280 if (current->flags & PF_SWAPWRITE) 349 if (current->flags & PF_SWAPWRITE)
281 return 1; 350 return 1;
@@ -283,6 +352,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
283 return 1; 352 return 1;
284 if (bdi == current->backing_dev_info) 353 if (bdi == current->backing_dev_info)
285 return 1; 354 return 1;
355
 356 /* lumpy reclaim for hugepages often needs a lot of writes */
357 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
358 return 1;
286 return 0; 359 return 0;
287} 360}
288 361
@@ -301,18 +374,12 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
301static void handle_write_error(struct address_space *mapping, 374static void handle_write_error(struct address_space *mapping,
302 struct page *page, int error) 375 struct page *page, int error)
303{ 376{
304 lock_page_nosync(page); 377 lock_page(page);
305 if (page_mapping(page) == mapping) 378 if (page_mapping(page) == mapping)
306 mapping_set_error(mapping, error); 379 mapping_set_error(mapping, error);
307 unlock_page(page); 380 unlock_page(page);
308} 381}
309 382
310/* Request for sync pageout. */
311enum pageout_io {
312 PAGEOUT_IO_ASYNC,
313 PAGEOUT_IO_SYNC,
314};
315
316/* possible outcome of pageout() */ 383/* possible outcome of pageout() */
317typedef enum { 384typedef enum {
318 /* failed to write page out, page is locked */ 385 /* failed to write page out, page is locked */
@@ -330,7 +397,7 @@ typedef enum {
330 * Calls ->writepage(). 397 * Calls ->writepage().
331 */ 398 */
332static pageout_t pageout(struct page *page, struct address_space *mapping, 399static pageout_t pageout(struct page *page, struct address_space *mapping,
333 enum pageout_io sync_writeback) 400 struct scan_control *sc)
334{ 401{
335 /* 402 /*
336 * If the page is dirty, only perform writeback if that write 403 * If the page is dirty, only perform writeback if that write
@@ -366,7 +433,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
366 } 433 }
367 if (mapping->a_ops->writepage == NULL) 434 if (mapping->a_ops->writepage == NULL)
368 return PAGE_ACTIVATE; 435 return PAGE_ACTIVATE;
369 if (!may_write_to_queue(mapping->backing_dev_info)) 436 if (!may_write_to_queue(mapping->backing_dev_info, sc))
370 return PAGE_KEEP; 437 return PAGE_KEEP;
371 438
372 if (clear_page_dirty_for_io(page)) { 439 if (clear_page_dirty_for_io(page)) {
@@ -376,7 +443,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
376 .nr_to_write = SWAP_CLUSTER_MAX, 443 .nr_to_write = SWAP_CLUSTER_MAX,
377 .range_start = 0, 444 .range_start = 0,
378 .range_end = LLONG_MAX, 445 .range_end = LLONG_MAX,
379 .nonblocking = 1,
380 .for_reclaim = 1, 446 .for_reclaim = 1,
381 }; 447 };
382 448
@@ -394,7 +460,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
394 * direct reclaiming a large contiguous area and the 460 * direct reclaiming a large contiguous area and the
395 * first attempt to free a range of pages fails. 461 * first attempt to free a range of pages fails.
396 */ 462 */
397 if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) 463 if (PageWriteback(page) &&
464 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
398 wait_on_page_writeback(page); 465 wait_on_page_writeback(page);
399 466
400 if (!PageWriteback(page)) { 467 if (!PageWriteback(page)) {
@@ -402,7 +469,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
402 ClearPageReclaim(page); 469 ClearPageReclaim(page);
403 } 470 }
404 trace_mm_vmscan_writepage(page, 471 trace_mm_vmscan_writepage(page,
405 trace_reclaim_flags(page, sync_writeback)); 472 trace_reclaim_flags(page, sc->reclaim_mode));
406 inc_zone_page_state(page, NR_VMSCAN_WRITE); 473 inc_zone_page_state(page, NR_VMSCAN_WRITE);
407 return PAGE_SUCCESS; 474 return PAGE_SUCCESS;
408 } 475 }
@@ -459,9 +526,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
459 spin_unlock_irq(&mapping->tree_lock); 526 spin_unlock_irq(&mapping->tree_lock);
460 swapcache_free(swap, page); 527 swapcache_free(swap, page);
461 } else { 528 } else {
462 __remove_from_page_cache(page); 529 void (*freepage)(struct page *);
530
531 freepage = mapping->a_ops->freepage;
532
533 __delete_from_page_cache(page);
463 spin_unlock_irq(&mapping->tree_lock); 534 spin_unlock_irq(&mapping->tree_lock);
464 mem_cgroup_uncharge_cache_page(page); 535 mem_cgroup_uncharge_cache_page(page);
536
537 if (freepage != NULL)
538 freepage(page);
465 } 539 }
466 540
467 return 1; 541 return 1;
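The a_ops->freepage hook used above lets a filesystem drop per-page metadata once the page has left the page cache; note that it is called only after tree_lock has been released. A hypothetical sketch (the myfs_* names are made up):

/* Illustrative only: hypothetical ->freepage implementation. */
static void myfs_freepage(struct page *page)
{
	myfs_drop_page_metadata(page);		/* fs-private bookkeeping */
}

static const struct address_space_operations myfs_aops = {
	.freepage	= myfs_freepage,
	/* .readpage, .writepage, ... */
};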
@@ -580,7 +654,7 @@ static enum page_references page_check_references(struct page *page,
580 referenced_page = TestClearPageReferenced(page); 654 referenced_page = TestClearPageReferenced(page);
581 655
582 /* Lumpy reclaim - ignore references */ 656 /* Lumpy reclaim - ignore references */
583 if (sc->lumpy_reclaim_mode) 657 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
584 return PAGEREF_RECLAIM; 658 return PAGEREF_RECLAIM;
585 659
586 /* 660 /*
@@ -616,7 +690,7 @@ static enum page_references page_check_references(struct page *page,
616 } 690 }
617 691
618 /* Reclaim if clean, defer dirty pages to writeback */ 692 /* Reclaim if clean, defer dirty pages to writeback */
619 if (referenced_page) 693 if (referenced_page && !PageSwapBacked(page))
620 return PAGEREF_RECLAIM_CLEAN; 694 return PAGEREF_RECLAIM_CLEAN;
621 695
622 return PAGEREF_RECLAIM; 696 return PAGEREF_RECLAIM;
@@ -644,12 +718,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
644 * shrink_page_list() returns the number of reclaimed pages 718 * shrink_page_list() returns the number of reclaimed pages
645 */ 719 */
646static unsigned long shrink_page_list(struct list_head *page_list, 720static unsigned long shrink_page_list(struct list_head *page_list,
647 struct scan_control *sc, 721 struct zone *zone,
648 enum pageout_io sync_writeback) 722 struct scan_control *sc)
649{ 723{
650 LIST_HEAD(ret_pages); 724 LIST_HEAD(ret_pages);
651 LIST_HEAD(free_pages); 725 LIST_HEAD(free_pages);
652 int pgactivate = 0; 726 int pgactivate = 0;
727 unsigned long nr_dirty = 0;
728 unsigned long nr_congested = 0;
653 unsigned long nr_reclaimed = 0; 729 unsigned long nr_reclaimed = 0;
654 730
655 cond_resched(); 731 cond_resched();
@@ -669,6 +745,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
669 goto keep; 745 goto keep;
670 746
671 VM_BUG_ON(PageActive(page)); 747 VM_BUG_ON(PageActive(page));
748 VM_BUG_ON(page_zone(page) != zone);
672 749
673 sc->nr_scanned++; 750 sc->nr_scanned++;
674 751
@@ -694,10 +771,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
694 * for any page for which writeback has already 771 * for any page for which writeback has already
695 * started. 772 * started.
696 */ 773 */
697 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) 774 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
775 may_enter_fs)
698 wait_on_page_writeback(page); 776 wait_on_page_writeback(page);
699 else 777 else {
700 goto keep_locked; 778 unlock_page(page);
779 goto keep_lumpy;
780 }
701 } 781 }
702 782
703 references = page_check_references(page, sc); 783 references = page_check_references(page, sc);
@@ -743,6 +823,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
743 } 823 }
744 824
745 if (PageDirty(page)) { 825 if (PageDirty(page)) {
826 nr_dirty++;
827
746 if (references == PAGEREF_RECLAIM_CLEAN) 828 if (references == PAGEREF_RECLAIM_CLEAN)
747 goto keep_locked; 829 goto keep_locked;
748 if (!may_enter_fs) 830 if (!may_enter_fs)
@@ -751,14 +833,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
751 goto keep_locked; 833 goto keep_locked;
752 834
753 /* Page is dirty, try to write it out here */ 835 /* Page is dirty, try to write it out here */
754 switch (pageout(page, mapping, sync_writeback)) { 836 switch (pageout(page, mapping, sc)) {
755 case PAGE_KEEP: 837 case PAGE_KEEP:
838 nr_congested++;
756 goto keep_locked; 839 goto keep_locked;
757 case PAGE_ACTIVATE: 840 case PAGE_ACTIVATE:
758 goto activate_locked; 841 goto activate_locked;
759 case PAGE_SUCCESS: 842 case PAGE_SUCCESS:
760 if (PageWriteback(page) || PageDirty(page)) 843 if (PageWriteback(page))
844 goto keep_lumpy;
845 if (PageDirty(page))
761 goto keep; 846 goto keep;
847
762 /* 848 /*
763 * A synchronous write - probably a ramdisk. Go 849 * A synchronous write - probably a ramdisk. Go
764 * ahead and try to reclaim the page. 850 * ahead and try to reclaim the page.
@@ -841,6 +927,7 @@ cull_mlocked:
841 try_to_free_swap(page); 927 try_to_free_swap(page);
842 unlock_page(page); 928 unlock_page(page);
843 putback_lru_page(page); 929 putback_lru_page(page);
930 reset_reclaim_mode(sc);
844 continue; 931 continue;
845 932
846activate_locked: 933activate_locked:
@@ -853,10 +940,21 @@ activate_locked:
853keep_locked: 940keep_locked:
854 unlock_page(page); 941 unlock_page(page);
855keep: 942keep:
943 reset_reclaim_mode(sc);
944keep_lumpy:
856 list_add(&page->lru, &ret_pages); 945 list_add(&page->lru, &ret_pages);
857 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 946 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
858 } 947 }
859 948
949 /*
950 * Tag a zone as congested if all the dirty pages encountered were
951 * backed by a congested BDI. In this case, reclaimers should just
952 * back off and wait for congestion to clear because further reclaim
953 * will encounter the same problem
954 */
955 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
956 zone_set_flag(zone, ZONE_CONGESTED);
957
860 free_page_list(&free_pages); 958 free_page_list(&free_pages);
861 959
862 list_splice(&ret_pages, page_list); 960 list_splice(&ret_pages, page_list);
@@ -962,7 +1060,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
962 case 0: 1060 case 0:
963 list_move(&page->lru, dst); 1061 list_move(&page->lru, dst);
964 mem_cgroup_del_lru(page); 1062 mem_cgroup_del_lru(page);
965 nr_taken++; 1063 nr_taken += hpage_nr_pages(page);
966 break; 1064 break;
967 1065
968 case -EBUSY: 1066 case -EBUSY:
@@ -983,7 +1081,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
983 * surrounding the tag page. Only take those pages of 1081 * surrounding the tag page. Only take those pages of
984 * the same active state as that tag page. We may safely 1082 * the same active state as that tag page. We may safely
985 * round the target page pfn down to the requested order 1083 * round the target page pfn down to the requested order
986 * as the mem_map is guarenteed valid out to MAX_ORDER, 1084 * as the mem_map is guaranteed valid out to MAX_ORDER,
987 * where that page is in a different zone we will detect 1085 * where that page is in a different zone we will detect
988 * it from its zone id and abort this block scan. 1086 * it from its zone id and abort this block scan.
989 */ 1087 */
@@ -1006,7 +1104,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1006 1104
1007 /* Check that we have not crossed a zone boundary. */ 1105 /* Check that we have not crossed a zone boundary. */
1008 if (unlikely(page_zone_id(cursor_page) != zone_id)) 1106 if (unlikely(page_zone_id(cursor_page) != zone_id))
1009 continue; 1107 break;
1010 1108
1011 /* 1109 /*
1012 * If we don't have enough swap space, reclaiming of 1110 * If we don't have enough swap space, reclaiming of
@@ -1014,23 +1112,40 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1014 * pointless. 1112 * pointless.
1015 */ 1113 */
1016 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1114 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
1017 !PageSwapCache(cursor_page)) 1115 !PageSwapCache(cursor_page))
1018 continue; 1116 break;
1019 1117
1020 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1118 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1021 list_move(&cursor_page->lru, dst); 1119 list_move(&cursor_page->lru, dst);
1022 mem_cgroup_del_lru(cursor_page); 1120 mem_cgroup_del_lru(cursor_page);
1023 nr_taken++; 1121 nr_taken += hpage_nr_pages(page);
1024 nr_lumpy_taken++; 1122 nr_lumpy_taken++;
1025 if (PageDirty(cursor_page)) 1123 if (PageDirty(cursor_page))
1026 nr_lumpy_dirty++; 1124 nr_lumpy_dirty++;
1027 scan++; 1125 scan++;
1028 } else { 1126 } else {
1029 if (mode == ISOLATE_BOTH && 1127 /*
1030 page_count(cursor_page)) 1128 * Check if the page is freed already.
1031 nr_lumpy_failed++; 1129 *
1130 * We can't use page_count() as that
1131 * requires compound_head and we don't
1132 * have a pin on the page here. If a
1133 * page is tail, we may or may not
1134 * have isolated the head, so assume
1135 * it's not free, it'd be tricky to
1136 * track the head status without a
1137 * page pin.
1138 */
1139 if (!PageTail(cursor_page) &&
1140 !atomic_read(&cursor_page->_count))
1141 continue;
1142 break;
1032 } 1143 }
1033 } 1144 }
1145
1146 /* If we break out of the loop above, lumpy reclaim failed */
1147 if (pfn < end_pfn)
1148 nr_lumpy_failed++;
1034 } 1149 }
1035 1150
1036 *scanned = scan; 1151 *scanned = scan;
@@ -1070,14 +1185,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1070 struct page *page; 1185 struct page *page;
1071 1186
1072 list_for_each_entry(page, page_list, lru) { 1187 list_for_each_entry(page, page_list, lru) {
1188 int numpages = hpage_nr_pages(page);
1073 lru = page_lru_base_type(page); 1189 lru = page_lru_base_type(page);
1074 if (PageActive(page)) { 1190 if (PageActive(page)) {
1075 lru += LRU_ACTIVE; 1191 lru += LRU_ACTIVE;
1076 ClearPageActive(page); 1192 ClearPageActive(page);
1077 nr_active++; 1193 nr_active += numpages;
1078 } 1194 }
1079 if (count) 1195 if (count)
1080 count[lru]++; 1196 count[lru] += numpages;
1081 } 1197 }
1082 1198
1083 return nr_active; 1199 return nr_active;
@@ -1112,13 +1228,16 @@ int isolate_lru_page(struct page *page)
1112{ 1228{
1113 int ret = -EBUSY; 1229 int ret = -EBUSY;
1114 1230
1231 VM_BUG_ON(!page_count(page));
1232
1115 if (PageLRU(page)) { 1233 if (PageLRU(page)) {
1116 struct zone *zone = page_zone(page); 1234 struct zone *zone = page_zone(page);
1117 1235
1118 spin_lock_irq(&zone->lru_lock); 1236 spin_lock_irq(&zone->lru_lock);
1119 if (PageLRU(page) && get_page_unless_zero(page)) { 1237 if (PageLRU(page)) {
1120 int lru = page_lru(page); 1238 int lru = page_lru(page);
1121 ret = 0; 1239 ret = 0;
1240 get_page(page);
1122 ClearPageLRU(page); 1241 ClearPageLRU(page);
1123 1242
1124 del_page_from_lru_list(zone, page, lru); 1243 del_page_from_lru_list(zone, page, lru);
@@ -1187,7 +1306,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1187 add_page_to_lru_list(zone, page, lru); 1306 add_page_to_lru_list(zone, page, lru);
1188 if (is_active_lru(lru)) { 1307 if (is_active_lru(lru)) {
1189 int file = is_file_lru(lru); 1308 int file = is_file_lru(lru);
1190 reclaim_stat->recent_rotated[file]++; 1309 int numpages = hpage_nr_pages(page);
1310 reclaim_stat->recent_rotated[file] += numpages;
1191 } 1311 }
1192 if (!pagevec_add(&pvec, page)) { 1312 if (!pagevec_add(&pvec, page)) {
1193 spin_unlock_irq(&zone->lru_lock); 1313 spin_unlock_irq(&zone->lru_lock);
@@ -1253,7 +1373,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1253 return false; 1373 return false;
1254 1374
1255 /* Only stall on lumpy reclaim */ 1375 /* Only stall on lumpy reclaim */
1256 if (!sc->lumpy_reclaim_mode) 1376 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1257 return false; 1377 return false;
1258 1378
1259 /* If we have relaimed everything on the isolated list, no stall */ 1379 /* If we have relaimed everything on the isolated list, no stall */
@@ -1286,7 +1406,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1286 unsigned long nr_scanned; 1406 unsigned long nr_scanned;
1287 unsigned long nr_reclaimed = 0; 1407 unsigned long nr_reclaimed = 0;
1288 unsigned long nr_taken; 1408 unsigned long nr_taken;
1289 unsigned long nr_active;
1290 unsigned long nr_anon; 1409 unsigned long nr_anon;
1291 unsigned long nr_file; 1410 unsigned long nr_file;
1292 1411
@@ -1298,15 +1417,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1298 return SWAP_CLUSTER_MAX; 1417 return SWAP_CLUSTER_MAX;
1299 } 1418 }
1300 1419
1301 1420 set_reclaim_mode(priority, sc, false);
1302 lru_add_drain(); 1421 lru_add_drain();
1303 spin_lock_irq(&zone->lru_lock); 1422 spin_lock_irq(&zone->lru_lock);
1304 1423
1305 if (scanning_global_lru(sc)) { 1424 if (scanning_global_lru(sc)) {
1306 nr_taken = isolate_pages_global(nr_to_scan, 1425 nr_taken = isolate_pages_global(nr_to_scan,
1307 &page_list, &nr_scanned, sc->order, 1426 &page_list, &nr_scanned, sc->order,
1308 sc->lumpy_reclaim_mode ? 1427 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1309 ISOLATE_BOTH : ISOLATE_INACTIVE, 1428 ISOLATE_BOTH : ISOLATE_INACTIVE,
1310 zone, 0, file); 1429 zone, 0, file);
1311 zone->pages_scanned += nr_scanned; 1430 zone->pages_scanned += nr_scanned;
1312 if (current_is_kswapd()) 1431 if (current_is_kswapd())
@@ -1318,8 +1437,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1318 } else { 1437 } else {
1319 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1438 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1320 &page_list, &nr_scanned, sc->order, 1439 &page_list, &nr_scanned, sc->order,
1321 sc->lumpy_reclaim_mode ? 1440 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1322 ISOLATE_BOTH : ISOLATE_INACTIVE, 1441 ISOLATE_BOTH : ISOLATE_INACTIVE,
1323 zone, sc->mem_cgroup, 1442 zone, sc->mem_cgroup,
1324 0, file); 1443 0, file);
1325 /* 1444 /*
@@ -1337,20 +1456,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1337 1456
1338 spin_unlock_irq(&zone->lru_lock); 1457 spin_unlock_irq(&zone->lru_lock);
1339 1458
1340 nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1459 nr_reclaimed = shrink_page_list(&page_list, zone, sc);
1341 1460
1342 /* Check if we should syncronously wait for writeback */ 1461 /* Check if we should syncronously wait for writeback */
1343 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1462 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1344 congestion_wait(BLK_RW_ASYNC, HZ/10); 1463 set_reclaim_mode(priority, sc, true);
1345 1464 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1346 /*
1347 * The attempt at page out may have made some
1348 * of the pages active, mark them inactive again.
1349 */
1350 nr_active = clear_active_flags(&page_list, NULL);
1351 count_vm_events(PGDEACTIVATE, nr_active);
1352
1353 nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
1354 } 1465 }
1355 1466
1356 local_irq_disable(); 1467 local_irq_disable();
@@ -1359,6 +1470,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1359 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1470 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1360 1471
1361 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1472 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1473
1474 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1475 zone_idx(zone),
1476 nr_scanned, nr_reclaimed,
1477 priority,
1478 trace_shrink_flags(file, sc->reclaim_mode));
1362 return nr_reclaimed; 1479 return nr_reclaimed;
1363} 1480}
1364 1481
@@ -1398,7 +1515,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1398 1515
1399 list_move(&page->lru, &zone->lru[lru].list); 1516 list_move(&page->lru, &zone->lru[lru].list);
1400 mem_cgroup_add_lru_list(page, lru); 1517 mem_cgroup_add_lru_list(page, lru);
1401 pgmoved++; 1518 pgmoved += hpage_nr_pages(page);
1402 1519
1403 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1520 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1404 spin_unlock_irq(&zone->lru_lock); 1521 spin_unlock_irq(&zone->lru_lock);
@@ -1466,7 +1583,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1466 } 1583 }
1467 1584
1468 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1585 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1469 nr_rotated++; 1586 nr_rotated += hpage_nr_pages(page);
1470 /* 1587 /*
1471 * Identify referenced, file-backed active pages and 1588 * Identify referenced, file-backed active pages and
1472 * give them one more trip around the active list. So 1589 * give them one more trip around the active list. So
@@ -1506,6 +1623,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1506 spin_unlock_irq(&zone->lru_lock); 1623 spin_unlock_irq(&zone->lru_lock);
1507} 1624}
1508 1625
1626#ifdef CONFIG_SWAP
1509static int inactive_anon_is_low_global(struct zone *zone) 1627static int inactive_anon_is_low_global(struct zone *zone)
1510{ 1628{
1511 unsigned long active, inactive; 1629 unsigned long active, inactive;
@@ -1531,12 +1649,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1531{ 1649{
1532 int low; 1650 int low;
1533 1651
1652 /*
1653 * If we don't have swap space, anonymous page deactivation
1654 * is pointless.
1655 */
1656 if (!total_swap_pages)
1657 return 0;
1658
1534 if (scanning_global_lru(sc)) 1659 if (scanning_global_lru(sc))
1535 low = inactive_anon_is_low_global(zone); 1660 low = inactive_anon_is_low_global(zone);
1536 else 1661 else
1537 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1662 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1538 return low; 1663 return low;
1539} 1664}
1665#else
1666static inline int inactive_anon_is_low(struct zone *zone,
1667 struct scan_control *sc)
1668{
1669 return 0;
1670}
1671#endif
1540 1672
1541static int inactive_file_is_low_global(struct zone *zone) 1673static int inactive_file_is_low_global(struct zone *zone)
1542{ 1674{
@@ -1598,26 +1730,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1598} 1730}
1599 1731
1600/* 1732/*
1601 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1602 * until we collected @swap_cluster_max pages to scan.
1603 */
1604static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1605 unsigned long *nr_saved_scan)
1606{
1607 unsigned long nr;
1608
1609 *nr_saved_scan += nr_to_scan;
1610 nr = *nr_saved_scan;
1611
1612 if (nr >= SWAP_CLUSTER_MAX)
1613 *nr_saved_scan = 0;
1614 else
1615 nr = 0;
1616
1617 return nr;
1618}
1619
1620/*
1621 * Determine how aggressively the anon and file LRU lists should be 1733 * Determine how aggressively the anon and file LRU lists should be
1622 * scanned. The relative value of each set of LRU lists is determined 1734 * scanned. The relative value of each set of LRU lists is determined
1623 * by looking at the fraction of the pages scanned we did rotate back 1735 * by looking at the fraction of the pages scanned we did rotate back
@@ -1635,6 +1747,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1635 u64 fraction[2], denominator; 1747 u64 fraction[2], denominator;
1636 enum lru_list l; 1748 enum lru_list l;
1637 int noswap = 0; 1749 int noswap = 0;
1750 int force_scan = 0;
1751
1752
1753 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1754 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1755 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1756 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1757
1758 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
1759 /* kswapd does zone balancing and need to scan this zone */
1760 if (scanning_global_lru(sc) && current_is_kswapd())
1761 force_scan = 1;
1762 /* memcg may have small limit and need to avoid priority drop */
1763 if (!scanning_global_lru(sc))
1764 force_scan = 1;
1765 }
1638 1766
1639 /* If we have no swap space, do not bother scanning anon pages. */ 1767 /* If we have no swap space, do not bother scanning anon pages. */
1640 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1768 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1645,11 +1773,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1645 goto out; 1773 goto out;
1646 } 1774 }
1647 1775
1648 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1649 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1650 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1651 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1652
1653 if (scanning_global_lru(sc)) { 1776 if (scanning_global_lru(sc)) {
1654 free = zone_page_state(zone, NR_FREE_PAGES); 1777 free = zone_page_state(zone, NR_FREE_PAGES);
1655 /* If we have very few page cache pages, 1778 /* If we have very few page cache pages,
@@ -1716,24 +1839,87 @@ out:
1716 scan >>= priority; 1839 scan >>= priority;
1717 scan = div64_u64(scan * fraction[file], denominator); 1840 scan = div64_u64(scan * fraction[file], denominator);
1718 } 1841 }
1719 nr[l] = nr_scan_try_batch(scan, 1842
1720 &reclaim_stat->nr_saved_scan[l]); 1843 /*
1844 * If zone is small or memcg is small, nr[l] can be 0.
 1845 * This results in no scanning at this priority and the priority dropping.
 1846 * Global direct reclaim can simply visit the next zone, so this tends
 1847 * not to be a problem there. Global kswapd, however, is doing zone
 1848 * balancing and needs to scan small amounts. When using
 1849 * memcg, a priority drop can cause large latency, so it is better
 1850 * to scan a small amount. See force_scan above.
1851 */
1852 if (!scan && force_scan) {
1853 if (file)
1854 scan = SWAP_CLUSTER_MAX;
1855 else if (!noswap)
1856 scan = SWAP_CLUSTER_MAX;
1857 }
1858 nr[l] = scan;
1721 } 1859 }
1722} 1860}
1723 1861
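Worked example of the fallback above, assuming SWAP_CLUSTER_MAX is its usual value of 32: a memcg holding 1000 LRU pages scanned at priority 12 gives (1000 >> 12) = 0, well below 32, so force_scan is set; when the proportional calculation then yields scan == 0, it is bumped to SWAP_CLUSTER_MAX so the small group still makes progress instead of dropping priority.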
1724static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) 1862/*
1863 * Reclaim/compaction depends on a number of pages being freed. To avoid
1864 * disruption to the system, a small number of order-0 pages continue to be
1865 * rotated and reclaimed in the normal fashion. However, by the time we get
1866 * back to the allocator and call try_to_compact_zone(), we ensure that
1867 * there are enough free pages for it to be likely successful
1868 */
1869static inline bool should_continue_reclaim(struct zone *zone,
1870 unsigned long nr_reclaimed,
1871 unsigned long nr_scanned,
1872 struct scan_control *sc)
1725{ 1873{
1874 unsigned long pages_for_compaction;
1875 unsigned long inactive_lru_pages;
1876
1877 /* If not in reclaim/compaction mode, stop */
1878 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1879 return false;
1880
1881 /* Consider stopping depending on scan and reclaim activity */
1882 if (sc->gfp_mask & __GFP_REPEAT) {
1883 /*
1884 * For __GFP_REPEAT allocations, stop reclaiming if the
1885 * full LRU list has been scanned and we are still failing
1886 * to reclaim pages. This full LRU scan is potentially
1887 * expensive but a __GFP_REPEAT caller really wants to succeed
1888 */
1889 if (!nr_reclaimed && !nr_scanned)
1890 return false;
1891 } else {
1892 /*
1893 * For non-__GFP_REPEAT allocations which can presumably
1894 * fail without consequence, stop if we failed to reclaim
1895 * any pages from the last SWAP_CLUSTER_MAX number of
1896 * pages that were scanned. This will return to the
1897 * caller faster, at the risk that reclaim/compaction and
1898 * the resulting allocation attempt fail
1899 */
1900 if (!nr_reclaimed)
1901 return false;
1902 }
1903
1726 /* 1904 /*
1727 * If we need a large contiguous chunk of memory, or have 1905 * If we have not reclaimed enough pages for compaction and the
1728 * trouble getting a small set of contiguous pages, we 1906 * inactive lists are large enough, continue reclaiming
1729 * will reclaim both active and inactive pages.
1730 */ 1907 */
1731 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 1908 pages_for_compaction = (2UL << sc->order);
1732 sc->lumpy_reclaim_mode = 1; 1909 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
1733 else if (sc->order && priority < DEF_PRIORITY - 2) 1910 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1734 sc->lumpy_reclaim_mode = 1; 1911 if (sc->nr_reclaimed < pages_for_compaction &&
1735 else 1912 inactive_lru_pages > pages_for_compaction)
1736 sc->lumpy_reclaim_mode = 0; 1913 return true;
1914
1915 /* If compaction would go ahead or the allocation would succeed, stop */
1916 switch (compaction_suitable(zone, sc->order)) {
1917 case COMPACT_PARTIAL:
1918 case COMPACT_CONTINUE:
1919 return false;
1920 default:
1921 return true;
1922 }
1737} 1923}
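
A rough sketch of the continuation test introduced above, simplified for illustration: the __GFP_REPEAT/scan-activity checks are omitted and compaction_suitable() is replaced by a plain flag, so only the pages_for_compaction arithmetic mirrors the kernel.

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Keep reclaiming for compaction until roughly 2^(order+1) pages have
     * been freed, unless the inactive lists are already too small to make
     * progress or compaction reports it could run right now.
     */
    static bool continue_reclaim(int order, unsigned long nr_reclaimed,
                                 unsigned long inactive_lru_pages,
                                 bool compaction_ready)
    {
            unsigned long pages_for_compaction = 2UL << order;

            if (compaction_ready)
                    return false;
            return nr_reclaimed < pages_for_compaction &&
                   inactive_lru_pages > pages_for_compaction;
    }

    int main(void)
    {
            /* order-9 request (2MB huge page on x86): target is 1024 pages */
            printf("%d\n", continue_reclaim(9, 300, 5000, false));  /* 1: keep going */
            printf("%d\n", continue_reclaim(9, 1200, 5000, false)); /* 0: enough reclaimed */
            printf("%d\n", continue_reclaim(9, 300, 5000, true));   /* 0: compaction ready */
            return 0;
    }
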
1738 1924
1739/* 1925/*
@@ -1745,13 +1931,14 @@ static void shrink_zone(int priority, struct zone *zone,
1745 unsigned long nr[NR_LRU_LISTS]; 1931 unsigned long nr[NR_LRU_LISTS];
1746 unsigned long nr_to_scan; 1932 unsigned long nr_to_scan;
1747 enum lru_list l; 1933 enum lru_list l;
1748 unsigned long nr_reclaimed = sc->nr_reclaimed; 1934 unsigned long nr_reclaimed, nr_scanned;
1749 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1935 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1750 1936
1937restart:
1938 nr_reclaimed = 0;
1939 nr_scanned = sc->nr_scanned;
1751 get_scan_count(zone, sc, nr, priority); 1940 get_scan_count(zone, sc, nr, priority);
1752 1941
1753 set_lumpy_reclaim_mode(priority, sc);
1754
1755 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1942 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1756 nr[LRU_INACTIVE_FILE]) { 1943 nr[LRU_INACTIVE_FILE]) {
1757 for_each_evictable_lru(l) { 1944 for_each_evictable_lru(l) {
@@ -1775,16 +1962,20 @@ static void shrink_zone(int priority, struct zone *zone,
1775 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 1962 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1776 break; 1963 break;
1777 } 1964 }
1778 1965 sc->nr_reclaimed += nr_reclaimed;
1779 sc->nr_reclaimed = nr_reclaimed;
1780 1966
1781 /* 1967 /*
1782 * Even if we did not try to evict anon pages at all, we want to 1968 * Even if we did not try to evict anon pages at all, we want to
1783 * rebalance the anon lru active/inactive ratio. 1969 * rebalance the anon lru active/inactive ratio.
1784 */ 1970 */
1785 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) 1971 if (inactive_anon_is_low(zone, sc))
1786 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1972 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1787 1973
1974 /* reclaim/compaction might need reclaim to continue */
1975 if (should_continue_reclaim(zone, nr_reclaimed,
1976 sc->nr_scanned - nr_scanned, sc))
1977 goto restart;
1978
1788 throttle_vm_writeout(sc->gfp_mask); 1979 throttle_vm_writeout(sc->gfp_mask);
1789} 1980}
1790 1981
@@ -1809,6 +2000,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1809{ 2000{
1810 struct zoneref *z; 2001 struct zoneref *z;
1811 struct zone *zone; 2002 struct zone *zone;
2003 unsigned long nr_soft_reclaimed;
2004 unsigned long nr_soft_scanned;
1812 2005
1813 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2006 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1814 gfp_zone(sc->gfp_mask), sc->nodemask) { 2007 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1823,6 +2016,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1823 continue; 2016 continue;
1824 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2017 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1825 continue; /* Let kswapd poll it */ 2018 continue; /* Let kswapd poll it */
2019 /*
2020 * This steals pages from memory cgroups over their soft limit
2021 * and returns the number of reclaimed pages and
2022 * scanned pages. This works for global memory pressure
2023 * and balancing, not for a memcg's limit.
2024 */
2025 nr_soft_scanned = 0;
2026 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2027 sc->order, sc->gfp_mask,
2028 &nr_soft_scanned);
2029 sc->nr_reclaimed += nr_soft_reclaimed;
2030 sc->nr_scanned += nr_soft_scanned;
2031 /* need some check to avoid more shrink_zone() calls */
1826 } 2032 }
1827 2033
1828 shrink_zone(priority, zone, sc); 2034 shrink_zone(priority, zone, sc);
@@ -1834,17 +2040,12 @@ static bool zone_reclaimable(struct zone *zone)
1834 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 2040 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
1835} 2041}
1836 2042
1837/* 2043/* All zones in zonelist are unreclaimable? */
1838 * As hibernation is going on, kswapd is freezed so that it can't mark
1839 * the zone into all_unreclaimable. It can't handle OOM during hibernation.
1840 * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
1841 */
1842static bool all_unreclaimable(struct zonelist *zonelist, 2044static bool all_unreclaimable(struct zonelist *zonelist,
1843 struct scan_control *sc) 2045 struct scan_control *sc)
1844{ 2046{
1845 struct zoneref *z; 2047 struct zoneref *z;
1846 struct zone *zone; 2048 struct zone *zone;
1847 bool all_unreclaimable = true;
1848 2049
1849 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2050 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1850 gfp_zone(sc->gfp_mask), sc->nodemask) { 2051 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1852,13 +2053,11 @@ static bool all_unreclaimable(struct zonelist *zonelist,
1852 continue; 2053 continue;
1853 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2054 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1854 continue; 2055 continue;
1855 if (zone_reclaimable(zone)) { 2056 if (!zone->all_unreclaimable)
1856 all_unreclaimable = false; 2057 return false;
1857 break;
1858 }
1859 } 2058 }
1860 2059
1861 return all_unreclaimable; 2060 return true;
1862} 2061}
1863 2062
1864/* 2063/*
@@ -1878,7 +2077,8 @@ static bool all_unreclaimable(struct zonelist *zonelist,
1878 * else, the number of pages reclaimed 2077 * else, the number of pages reclaimed
1879 */ 2078 */
1880static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2079static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1881 struct scan_control *sc) 2080 struct scan_control *sc,
2081 struct shrink_control *shrink)
1882{ 2082{
1883 int priority; 2083 int priority;
1884 unsigned long total_scanned = 0; 2084 unsigned long total_scanned = 0;
@@ -1896,7 +2096,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1896 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2096 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1897 sc->nr_scanned = 0; 2097 sc->nr_scanned = 0;
1898 if (!priority) 2098 if (!priority)
1899 disable_swap_token(); 2099 disable_swap_token(sc->mem_cgroup);
1900 shrink_zones(priority, zonelist, sc); 2100 shrink_zones(priority, zonelist, sc);
1901 /* 2101 /*
1902 * Don't shrink slabs when reclaiming memory from 2102 * Don't shrink slabs when reclaiming memory from
@@ -1912,7 +2112,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1912 lru_pages += zone_reclaimable_pages(zone); 2112 lru_pages += zone_reclaimable_pages(zone);
1913 } 2113 }
1914 2114
1915 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 2115 shrink_slab(shrink, sc->nr_scanned, lru_pages);
1916 if (reclaim_state) { 2116 if (reclaim_state) {
1917 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2117 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
1918 reclaim_state->reclaimed_slab = 0; 2118 reclaim_state->reclaimed_slab = 0;
@@ -1937,27 +2137,31 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1937 2137
1938 /* Take a nap, wait for some writeback to complete */ 2138 /* Take a nap, wait for some writeback to complete */
1939 if (!sc->hibernation_mode && sc->nr_scanned && 2139 if (!sc->hibernation_mode && sc->nr_scanned &&
1940 priority < DEF_PRIORITY - 2) 2140 priority < DEF_PRIORITY - 2) {
1941 congestion_wait(BLK_RW_ASYNC, HZ/10); 2141 struct zone *preferred_zone;
2142
2143 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2144 &cpuset_current_mems_allowed,
2145 &preferred_zone);
2146 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2147 }
1942 } 2148 }
1943 2149
1944out: 2150out:
1945 /*
1946 * Now that we've scanned all the zones at this priority level, note
1947 * that level within the zone so that the next thread which performs
1948 * scanning of this zone will immediately start out at this priority
1949 * level. This affects only the decision whether or not to bring
1950 * mapped pages onto the inactive list.
1951 */
1952 if (priority < 0)
1953 priority = 0;
1954
1955 delayacct_freepages_end(); 2151 delayacct_freepages_end();
1956 put_mems_allowed(); 2152 put_mems_allowed();
1957 2153
1958 if (sc->nr_reclaimed) 2154 if (sc->nr_reclaimed)
1959 return sc->nr_reclaimed; 2155 return sc->nr_reclaimed;
1960 2156
2157 /*
2158 * While hibernation is in progress, kswapd is frozen and cannot mark
2159 * zones all_unreclaimable, so the all_unreclaimable check is bypassed
2160 * here.
2161 */
2162 if (oom_killer_disabled)
2163 return 0;
2164
1961 /* top priority shrink_zones still had more to do? don't OOM, then */ 2165 /* top priority shrink_zones still had more to do? don't OOM, then */
1962 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2166 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
1963 return 1; 2167 return 1;
@@ -1980,12 +2184,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1980 .mem_cgroup = NULL, 2184 .mem_cgroup = NULL,
1981 .nodemask = nodemask, 2185 .nodemask = nodemask,
1982 }; 2186 };
2187 struct shrink_control shrink = {
2188 .gfp_mask = sc.gfp_mask,
2189 };
1983 2190
1984 trace_mm_vmscan_direct_reclaim_begin(order, 2191 trace_mm_vmscan_direct_reclaim_begin(order,
1985 sc.may_writepage, 2192 sc.may_writepage,
1986 gfp_mask); 2193 gfp_mask);
1987 2194
1988 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2195 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
1989 2196
1990 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 2197 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
1991 2198
@@ -1997,9 +2204,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1997unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2204unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1998 gfp_t gfp_mask, bool noswap, 2205 gfp_t gfp_mask, bool noswap,
1999 unsigned int swappiness, 2206 unsigned int swappiness,
2000 struct zone *zone) 2207 struct zone *zone,
2208 unsigned long *nr_scanned)
2001{ 2209{
2002 struct scan_control sc = { 2210 struct scan_control sc = {
2211 .nr_scanned = 0,
2003 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2212 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2004 .may_writepage = !laptop_mode, 2213 .may_writepage = !laptop_mode,
2005 .may_unmap = 1, 2214 .may_unmap = 1,
@@ -2008,6 +2217,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2008 .order = 0, 2217 .order = 0,
2009 .mem_cgroup = mem, 2218 .mem_cgroup = mem,
2010 }; 2219 };
2220
2011 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2221 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2012 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2222 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2013 2223
@@ -2026,6 +2236,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2026 2236
2027 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2237 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2028 2238
2239 *nr_scanned = sc.nr_scanned;
2029 return sc.nr_reclaimed; 2240 return sc.nr_reclaimed;
2030} 2241}
2031 2242
@@ -2036,6 +2247,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2036{ 2247{
2037 struct zonelist *zonelist; 2248 struct zonelist *zonelist;
2038 unsigned long nr_reclaimed; 2249 unsigned long nr_reclaimed;
2250 int nid;
2039 struct scan_control sc = { 2251 struct scan_control sc = {
2040 .may_writepage = !laptop_mode, 2252 .may_writepage = !laptop_mode,
2041 .may_unmap = 1, 2253 .may_unmap = 1,
@@ -2045,17 +2257,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2045 .order = 0, 2257 .order = 0,
2046 .mem_cgroup = mem_cont, 2258 .mem_cgroup = mem_cont,
2047 .nodemask = NULL, /* we don't care the placement */ 2259 .nodemask = NULL, /* we don't care the placement */
2260 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2261 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2262 };
2263 struct shrink_control shrink = {
2264 .gfp_mask = sc.gfp_mask,
2048 }; 2265 };
2049 2266
2050 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2267 /*
2051 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2268 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
2052 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 2269 * take care of from where we get pages. So the node where we start the
2270 * scan does not need to be the current node.
2271 */
2272 nid = mem_cgroup_select_victim_node(mem_cont);
2273
2274 zonelist = NODE_DATA(nid)->node_zonelists;
2053 2275
2054 trace_mm_vmscan_memcg_reclaim_begin(0, 2276 trace_mm_vmscan_memcg_reclaim_begin(0,
2055 sc.may_writepage, 2277 sc.may_writepage,
2056 sc.gfp_mask); 2278 sc.gfp_mask);
2057 2279
2058 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2280 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2059 2281
2060 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2282 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2061 2283
@@ -2063,38 +2285,88 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2063} 2285}
2064#endif 2286#endif
2065 2287
2288/*
2289 * pgdat_balanced is used when checking if a node is balanced for high-order
2290 * allocations. Only zones that meet watermarks and are in a zone allowed
2291 * by the callers classzone_idx are added to balanced_pages. The total of
2292 * balanced pages must be at least 25% of the zones allowed by classzone_idx
2293 * for the node to be considered balanced. Forcing all zones to be balanced
2294 * for high orders can cause excessive reclaim when there are imbalanced zones.
2295 * The choice of 25% is due to
2296 * o a balanced 16M DMA zone will not on its own balance a node on any
2297 * reasonably sized machine
2298 * o On all other machines, the top zone must be at least a reasonable
2299 * percentage of the middle zones. For example, on 32-bit x86, highmem
2300 * would need to be at least 256M for it to balance a whole node.
2301 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2302 * to balance a node on its own. These seemed like reasonable ratios.
2303 */
2304static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2305 int classzone_idx)
2306{
2307 unsigned long present_pages = 0;
2308 int i;
2309
2310 for (i = 0; i <= classzone_idx; i++)
2311 present_pages += pgdat->node_zones[i].present_pages;
2312
2313 /* A special case: if the zone has no pages, consider it balanced */
2314 return balanced_pages >= (present_pages >> 2);
2315}
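
A minimal userspace sketch of the 25% rule described above, using made-up zone sizes; only the shift-by-two test mirrors the kernel function, and node_balanced() is an invented name.

    #include <stdbool.h>
    #include <stdio.h>

    /* Sum the present pages of zones 0..classzone_idx and require that
     * the balanced zones cover at least a quarter of them. */
    static bool node_balanced(const unsigned long *present, int classzone_idx,
                              unsigned long balanced_pages)
    {
            unsigned long present_pages = 0;
            int i;

            for (i = 0; i <= classzone_idx; i++)
                    present_pages += present[i];

            return balanced_pages >= (present_pages >> 2);
    }

    int main(void)
    {
            /* hypothetical node: DMA, DMA32, Normal sizes in pages */
            unsigned long present[] = { 4096, 262144, 786432 };

            /* a balanced 16M DMA zone alone cannot balance the node... */
            printf("%d\n", node_balanced(present, 2, 4096));   /* 0 */
            /* ...but a balanced Normal zone can */
            printf("%d\n", node_balanced(present, 2, 786432)); /* 1 */
            return 0;
    }
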
2316
2066/* is kswapd sleeping prematurely? */ 2317/* is kswapd sleeping prematurely? */
2067static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) 2318static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2319 int classzone_idx)
2068{ 2320{
2069 int i; 2321 int i;
2322 unsigned long balanced = 0;
2323 bool all_zones_ok = true;
2070 2324
2071 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2325 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2072 if (remaining) 2326 if (remaining)
2073 return 1; 2327 return true;
2074 2328
2075 /* If after HZ/10, a zone is below the high mark, it's premature */ 2329 /* Check the watermark levels */
2076 for (i = 0; i < pgdat->nr_zones; i++) { 2330 for (i = 0; i <= classzone_idx; i++) {
2077 struct zone *zone = pgdat->node_zones + i; 2331 struct zone *zone = pgdat->node_zones + i;
2078 2332
2079 if (!populated_zone(zone)) 2333 if (!populated_zone(zone))
2080 continue; 2334 continue;
2081 2335
2082 if (zone->all_unreclaimable) 2336 /*
2337 * balance_pgdat() skips over all_unreclaimable zones after
2338 * DEF_PRIORITY. Effectively, it considers them balanced so
2339 * they must be considered balanced here as well if kswapd
2340 * is to sleep
2341 */
2342 if (zone->all_unreclaimable) {
2343 balanced += zone->present_pages;
2083 continue; 2344 continue;
2345 }
2084 2346
2085 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 2347 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2086 0, 0)) 2348 i, 0))
2087 return 1; 2349 all_zones_ok = false;
2350 else
2351 balanced += zone->present_pages;
2088 } 2352 }
2089 2353
2090 return 0; 2354 /*
2355 * For high-order requests, the balanced zones must contain at least
2356 * 25% of the node's pages for kswapd to sleep. For order-0, all zones
2357 * must be balanced
2358 */
2359 if (order)
2360 return !pgdat_balanced(pgdat, balanced, classzone_idx);
2361 else
2362 return !all_zones_ok;
2091} 2363}
2092 2364
2093/* 2365/*
2094 * For kswapd, balance_pgdat() will work across all this node's zones until 2366 * For kswapd, balance_pgdat() will work across all this node's zones until
2095 * they are all at high_wmark_pages(zone). 2367 * they are all at high_wmark_pages(zone).
2096 * 2368 *
2097 * Returns the number of pages which were actually freed. 2369 * Returns the final order kswapd was reclaiming at
2098 * 2370 *
2099 * There is special handling here for zones which are full of pinned pages. 2371 * There is special handling here for zones which are full of pinned pages.
2100 * This can happen if the pages are all mlocked, or if they are all used by 2372 * This can happen if the pages are all mlocked, or if they are all used by
@@ -2111,13 +2383,18 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
2111 * interoperates with the page allocator fallback scheme to ensure that aging 2383 * interoperates with the page allocator fallback scheme to ensure that aging
2112 * of pages is balanced across the zones. 2384 * of pages is balanced across the zones.
2113 */ 2385 */
2114static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 2386static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2387 int *classzone_idx)
2115{ 2388{
2116 int all_zones_ok; 2389 int all_zones_ok;
2390 unsigned long balanced;
2117 int priority; 2391 int priority;
2118 int i; 2392 int i;
2393 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2119 unsigned long total_scanned; 2394 unsigned long total_scanned;
2120 struct reclaim_state *reclaim_state = current->reclaim_state; 2395 struct reclaim_state *reclaim_state = current->reclaim_state;
2396 unsigned long nr_soft_reclaimed;
2397 unsigned long nr_soft_scanned;
2121 struct scan_control sc = { 2398 struct scan_control sc = {
2122 .gfp_mask = GFP_KERNEL, 2399 .gfp_mask = GFP_KERNEL,
2123 .may_unmap = 1, 2400 .may_unmap = 1,
@@ -2131,6 +2408,9 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2131 .order = order, 2408 .order = order,
2132 .mem_cgroup = NULL, 2409 .mem_cgroup = NULL,
2133 }; 2410 };
2411 struct shrink_control shrink = {
2412 .gfp_mask = sc.gfp_mask,
2413 };
2134loop_again: 2414loop_again:
2135 total_scanned = 0; 2415 total_scanned = 0;
2136 sc.nr_reclaimed = 0; 2416 sc.nr_reclaimed = 0;
@@ -2138,15 +2418,15 @@ loop_again:
2138 count_vm_event(PAGEOUTRUN); 2418 count_vm_event(PAGEOUTRUN);
2139 2419
2140 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2420 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2141 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2142 unsigned long lru_pages = 0; 2421 unsigned long lru_pages = 0;
2143 int has_under_min_watermark_zone = 0; 2422 int has_under_min_watermark_zone = 0;
2144 2423
2145 /* The swap token gets in the way of swapout... */ 2424 /* The swap token gets in the way of swapout... */
2146 if (!priority) 2425 if (!priority)
2147 disable_swap_token(); 2426 disable_swap_token(NULL);
2148 2427
2149 all_zones_ok = 1; 2428 all_zones_ok = 1;
2429 balanced = 0;
2150 2430
2151 /* 2431 /*
2152 * Scan in the highmem->dma direction for the highest 2432 * Scan in the highmem->dma direction for the highest
@@ -2169,7 +2449,7 @@ loop_again:
2169 shrink_active_list(SWAP_CLUSTER_MAX, zone, 2449 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2170 &sc, priority, 0); 2450 &sc, priority, 0);
2171 2451
2172 if (!zone_watermark_ok(zone, order, 2452 if (!zone_watermark_ok_safe(zone, order,
2173 high_wmark_pages(zone), 0, 0)) { 2453 high_wmark_pages(zone), 0, 0)) {
2174 end_zone = i; 2454 end_zone = i;
2175 break; 2455 break;
@@ -2196,6 +2476,7 @@ loop_again:
2196 for (i = 0; i <= end_zone; i++) { 2476 for (i = 0; i <= end_zone; i++) {
2197 struct zone *zone = pgdat->node_zones + i; 2477 struct zone *zone = pgdat->node_zones + i;
2198 int nr_slab; 2478 int nr_slab;
2479 unsigned long balance_gap;
2199 2480
2200 if (!populated_zone(zone)) 2481 if (!populated_zone(zone))
2201 continue; 2482 continue;
@@ -2205,28 +2486,42 @@ loop_again:
2205 2486
2206 sc.nr_scanned = 0; 2487 sc.nr_scanned = 0;
2207 2488
2489 nr_soft_scanned = 0;
2208 /* 2490 /*
2209 * Call soft limit reclaim before calling shrink_zone. 2491 * Call soft limit reclaim before calling shrink_zone.
2210 * For now we ignore the return value
2211 */ 2492 */
2212 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); 2493 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2494 order, sc.gfp_mask,
2495 &nr_soft_scanned);
2496 sc.nr_reclaimed += nr_soft_reclaimed;
2497 total_scanned += nr_soft_scanned;
2213 2498
2214 /* 2499 /*
2215 * We put equal pressure on every zone, unless one 2500 * We put equal pressure on every zone, unless
2216 * zone has way too many pages free already. 2501 * one zone has way too many pages free
2502 * already. The "too many pages" is defined
2503 * as the high wmark plus a "gap" where the
2504 * gap is either the low watermark or 1%
2505 * of the zone, whichever is smaller.
2217 */ 2506 */
2218 if (!zone_watermark_ok(zone, order, 2507 balance_gap = min(low_wmark_pages(zone),
2219 8*high_wmark_pages(zone), end_zone, 0)) 2508 (zone->present_pages +
2509 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2510 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2511 if (!zone_watermark_ok_safe(zone, order,
2512 high_wmark_pages(zone) + balance_gap,
2513 end_zone, 0)) {
2220 shrink_zone(priority, zone, &sc); 2514 shrink_zone(priority, zone, &sc);
2221 reclaim_state->reclaimed_slab = 0; 2515
2222 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 2516 reclaim_state->reclaimed_slab = 0;
2223 lru_pages); 2517 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2224 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2518 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2225 total_scanned += sc.nr_scanned; 2519 total_scanned += sc.nr_scanned;
2226 if (zone->all_unreclaimable) 2520
2227 continue; 2521 if (nr_slab == 0 && !zone_reclaimable(zone))
2228 if (nr_slab == 0 && !zone_reclaimable(zone)) 2522 zone->all_unreclaimable = 1;
2229 zone->all_unreclaimable = 1; 2523 }
2524
2230 /* 2525 /*
2231 * If we've done a decent amount of scanning and 2526 * If we've done a decent amount of scanning and
2232 * the reclaim ratio is low, start doing writepage 2527 * the reclaim ratio is low, start doing writepage
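
The balance gap arithmetic in the hunk above can be checked with a few lines of plain C. The sketch below assumes KSWAPD_ZONE_BALANCE_GAP_RATIO is 100, consistent with the "1% of the zone" wording in the comment; treat it as an illustration, not the kernel's definition.

    #include <stdio.h>

    #define KSWAPD_ZONE_BALANCE_GAP_RATIO 100   /* assumed: gap is 1% of the zone */

    /* Gap added on top of the high watermark before kswapd leaves a zone
     * alone: the low watermark or 1% of the zone, whichever is smaller. */
    static unsigned long balance_gap(unsigned long low_wmark,
                                     unsigned long present_pages)
    {
            unsigned long one_percent =
                    (present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
                    KSWAPD_ZONE_BALANCE_GAP_RATIO;

            return low_wmark < one_percent ? low_wmark : one_percent;
    }

    int main(void)
    {
            /* ~4GB zone with a 4000-page low watermark: the watermark wins */
            printf("%lu\n", balance_gap(4000, 1048576));   /* 4000 */
            /* tiny 16MB zone: 1% of the zone (41 pages) wins instead */
            printf("%lu\n", balance_gap(4000, 4096));      /* 41 */
            return 0;
    }
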
@@ -2236,7 +2531,13 @@ loop_again:
2236 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2531 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2237 sc.may_writepage = 1; 2532 sc.may_writepage = 1;
2238 2533
2239 if (!zone_watermark_ok(zone, order, 2534 if (zone->all_unreclaimable) {
2535 if (end_zone && end_zone == i)
2536 end_zone--;
2537 continue;
2538 }
2539
2540 if (!zone_watermark_ok_safe(zone, order,
2240 high_wmark_pages(zone), end_zone, 0)) { 2541 high_wmark_pages(zone), end_zone, 0)) {
2241 all_zones_ok = 0; 2542 all_zones_ok = 0;
2242 /* 2543 /*
@@ -2244,13 +2545,24 @@ loop_again:
2244 * means that we have a GFP_ATOMIC allocation 2545 * means that we have a GFP_ATOMIC allocation
2245 * failure risk. Hurry up! 2546 * failure risk. Hurry up!
2246 */ 2547 */
2247 if (!zone_watermark_ok(zone, order, 2548 if (!zone_watermark_ok_safe(zone, order,
2248 min_wmark_pages(zone), end_zone, 0)) 2549 min_wmark_pages(zone), end_zone, 0))
2249 has_under_min_watermark_zone = 1; 2550 has_under_min_watermark_zone = 1;
2551 } else {
2552 /*
2553 * If a zone reaches its high watermark,
2554 * consider it to be no longer congested. It's
2555 * possible there are dirty pages backed by
2556 * congested BDIs but as pressure is relieved,
2557 * speculatively avoid congestion waits
2558 */
2559 zone_clear_flag(zone, ZONE_CONGESTED);
2560 if (i <= *classzone_idx)
2561 balanced += zone->present_pages;
2250 } 2562 }
2251 2563
2252 } 2564 }
2253 if (all_zones_ok) 2565 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2254 break; /* kswapd: all done */ 2566 break; /* kswapd: all done */
2255 /* 2567 /*
2256 * OK, kswapd is getting into trouble. Take a nap, then take 2568 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2273,7 +2585,13 @@ loop_again:
2273 break; 2585 break;
2274 } 2586 }
2275out: 2587out:
2276 if (!all_zones_ok) { 2588
2589 /*
2590 * order-0: All zones must meet high watermark for a balanced node
2591 * high-order: Balanced zones must make up at least 25% of the node
2592 * for the node to be balanced
2593 */
2594 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2277 cond_resched(); 2595 cond_resched();
2278 2596
2279 try_to_freeze(); 2597 try_to_freeze();
@@ -2298,7 +2616,88 @@ out:
2298 goto loop_again; 2616 goto loop_again;
2299 } 2617 }
2300 2618
2301 return sc.nr_reclaimed; 2619 /*
2620 * If kswapd was reclaiming at a higher order, it has the option of
2621 * sleeping without all zones being balanced. Before it does, it must
2622 * ensure that the watermarks for order-0 on *all* zones are met and
2623 * that the congestion flags are cleared. The congestion flag must
2624 * be cleared as kswapd is the only mechanism that clears the flag
2625 * and it is potentially going to sleep here.
2626 */
2627 if (order) {
2628 for (i = 0; i <= end_zone; i++) {
2629 struct zone *zone = pgdat->node_zones + i;
2630
2631 if (!populated_zone(zone))
2632 continue;
2633
2634 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2635 continue;
2636
2637 /* Confirm the zone is balanced for order-0 */
2638 if (!zone_watermark_ok(zone, 0,
2639 high_wmark_pages(zone), 0, 0)) {
2640 order = sc.order = 0;
2641 goto loop_again;
2642 }
2643
2644 /* If balanced, clear the congested flag */
2645 zone_clear_flag(zone, ZONE_CONGESTED);
2646 }
2647 }
2648
2649 /*
2650 * Return the order we were reclaiming at so sleeping_prematurely()
2651 * makes a decision on the order we were last reclaiming at. However,
2652 * if another caller entered the allocator slow path while kswapd
2653 * was awake, order will remain at the higher level
2654 */
2655 *classzone_idx = end_zone;
2656 return order;
2657}
2658
2659static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2660{
2661 long remaining = 0;
2662 DEFINE_WAIT(wait);
2663
2664 if (freezing(current) || kthread_should_stop())
2665 return;
2666
2667 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2668
2669 /* Try to sleep for a short interval */
2670 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2671 remaining = schedule_timeout(HZ/10);
2672 finish_wait(&pgdat->kswapd_wait, &wait);
2673 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2674 }
2675
2676 /*
2677 * After a short sleep, check if it was a premature sleep. If not, then
2678 * go fully to sleep until explicitly woken up.
2679 */
2680 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2681 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2682
2683 /*
2684 * vmstat counters are not perfectly accurate and the estimated
2685 * value for counters such as NR_FREE_PAGES can deviate from the
2686 * true value by nr_online_cpus * threshold. To avoid the zone
2687 * watermarks being breached while under pressure, we reduce the
2688 * per-cpu vmstat threshold while kswapd is awake and restore
2689 * them before going back to sleep.
2690 */
2691 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2692 schedule();
2693 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2694 } else {
2695 if (remaining)
2696 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2697 else
2698 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2699 }
2700 finish_wait(&pgdat->kswapd_wait, &wait);
2302} 2701}
2303 2702
2304/* 2703/*
@@ -2316,10 +2715,11 @@ out:
2316 */ 2715 */
2317static int kswapd(void *p) 2716static int kswapd(void *p)
2318{ 2717{
2319 unsigned long order; 2718 unsigned long order, new_order;
2719 int classzone_idx, new_classzone_idx;
2320 pg_data_t *pgdat = (pg_data_t*)p; 2720 pg_data_t *pgdat = (pg_data_t*)p;
2321 struct task_struct *tsk = current; 2721 struct task_struct *tsk = current;
2322 DEFINE_WAIT(wait); 2722
2323 struct reclaim_state reclaim_state = { 2723 struct reclaim_state reclaim_state = {
2324 .reclaimed_slab = 0, 2724 .reclaimed_slab = 0,
2325 }; 2725 };
@@ -2346,50 +2746,37 @@ static int kswapd(void *p)
2346 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 2746 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2347 set_freezable(); 2747 set_freezable();
2348 2748
2349 order = 0; 2749 order = new_order = 0;
2750 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2350 for ( ; ; ) { 2751 for ( ; ; ) {
2351 unsigned long new_order;
2352 int ret; 2752 int ret;
2353 2753
2354 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2754 /*
2355 new_order = pgdat->kswapd_max_order; 2755 * If the last balance_pgdat was unsuccessful it's unlikely a
2356 pgdat->kswapd_max_order = 0; 2756 * new request of a similar or harder type will succeed soon
2357 if (order < new_order) { 2757 * so consider going to sleep on the basis of the order we reclaimed at
2758 */
2759 if (classzone_idx >= new_classzone_idx && order == new_order) {
2760 new_order = pgdat->kswapd_max_order;
2761 new_classzone_idx = pgdat->classzone_idx;
2762 pgdat->kswapd_max_order = 0;
2763 pgdat->classzone_idx = pgdat->nr_zones - 1;
2764 }
2765
2766 if (order < new_order || classzone_idx > new_classzone_idx) {
2358 /* 2767 /*
2359 * Don't sleep if someone wants a larger 'order' 2768 * Don't sleep if someone wants a larger 'order'
2360 * allocation 2769 * allocation or has tighter zone constraints
2361 */ 2770 */
2362 order = new_order; 2771 order = new_order;
2772 classzone_idx = new_classzone_idx;
2363 } else { 2773 } else {
2364 if (!freezing(current) && !kthread_should_stop()) { 2774 kswapd_try_to_sleep(pgdat, order, classzone_idx);
2365 long remaining = 0;
2366
2367 /* Try to sleep for a short interval */
2368 if (!sleeping_prematurely(pgdat, order, remaining)) {
2369 remaining = schedule_timeout(HZ/10);
2370 finish_wait(&pgdat->kswapd_wait, &wait);
2371 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2372 }
2373
2374 /*
2375 * After a short sleep, check if it was a
2376 * premature sleep. If not, then go fully
2377 * to sleep until explicitly woken up
2378 */
2379 if (!sleeping_prematurely(pgdat, order, remaining)) {
2380 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2381 schedule();
2382 } else {
2383 if (remaining)
2384 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2385 else
2386 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2387 }
2388 }
2389
2390 order = pgdat->kswapd_max_order; 2775 order = pgdat->kswapd_max_order;
2776 classzone_idx = pgdat->classzone_idx;
2777 pgdat->kswapd_max_order = 0;
2778 pgdat->classzone_idx = pgdat->nr_zones - 1;
2391 } 2779 }
2392 finish_wait(&pgdat->kswapd_wait, &wait);
2393 2780
2394 ret = try_to_freeze(); 2781 ret = try_to_freeze();
2395 if (kthread_should_stop()) 2782 if (kthread_should_stop())
@@ -2401,7 +2788,7 @@ static int kswapd(void *p)
2401 */ 2788 */
2402 if (!ret) { 2789 if (!ret) {
2403 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2790 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2404 balance_pgdat(pgdat, order); 2791 order = balance_pgdat(pgdat, order, &classzone_idx);
2405 } 2792 }
2406 } 2793 }
2407 return 0; 2794 return 0;
@@ -2410,23 +2797,26 @@ static int kswapd(void *p)
2410/* 2797/*
2411 * A zone is low on free memory, so wake its kswapd task to service it. 2798 * A zone is low on free memory, so wake its kswapd task to service it.
2412 */ 2799 */
2413void wakeup_kswapd(struct zone *zone, int order) 2800void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2414{ 2801{
2415 pg_data_t *pgdat; 2802 pg_data_t *pgdat;
2416 2803
2417 if (!populated_zone(zone)) 2804 if (!populated_zone(zone))
2418 return; 2805 return;
2419 2806
2420 pgdat = zone->zone_pgdat;
2421 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2422 return;
2423 if (pgdat->kswapd_max_order < order)
2424 pgdat->kswapd_max_order = order;
2425 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2426 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2807 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2427 return; 2808 return;
2809 pgdat = zone->zone_pgdat;
2810 if (pgdat->kswapd_max_order < order) {
2811 pgdat->kswapd_max_order = order;
2812 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2813 }
2428 if (!waitqueue_active(&pgdat->kswapd_wait)) 2814 if (!waitqueue_active(&pgdat->kswapd_wait))
2429 return; 2815 return;
2816 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2817 return;
2818
2819 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2430 wake_up_interruptible(&pgdat->kswapd_wait); 2820 wake_up_interruptible(&pgdat->kswapd_wait);
2431} 2821}
2432 2822
@@ -2487,7 +2877,10 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2487 .swappiness = vm_swappiness, 2877 .swappiness = vm_swappiness,
2488 .order = 0, 2878 .order = 0,
2489 }; 2879 };
2490 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 2880 struct shrink_control shrink = {
2881 .gfp_mask = sc.gfp_mask,
2882 };
2883 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2491 struct task_struct *p = current; 2884 struct task_struct *p = current;
2492 unsigned long nr_reclaimed; 2885 unsigned long nr_reclaimed;
2493 2886
@@ -2496,7 +2889,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2496 reclaim_state.reclaimed_slab = 0; 2889 reclaim_state.reclaimed_slab = 0;
2497 p->reclaim_state = &reclaim_state; 2890 p->reclaim_state = &reclaim_state;
2498 2891
2499 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2892 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2500 2893
2501 p->reclaim_state = NULL; 2894 p->reclaim_state = NULL;
2502 lockdep_clear_current_reclaim_state(); 2895 lockdep_clear_current_reclaim_state();
@@ -2671,6 +3064,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2671 .swappiness = vm_swappiness, 3064 .swappiness = vm_swappiness,
2672 .order = order, 3065 .order = order,
2673 }; 3066 };
3067 struct shrink_control shrink = {
3068 .gfp_mask = sc.gfp_mask,
3069 };
2674 unsigned long nr_slab_pages0, nr_slab_pages1; 3070 unsigned long nr_slab_pages0, nr_slab_pages1;
2675 3071
2676 cond_resched(); 3072 cond_resched();
@@ -2712,7 +3108,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2712 unsigned long lru_pages = zone_reclaimable_pages(zone); 3108 unsigned long lru_pages = zone_reclaimable_pages(zone);
2713 3109
2714 /* No reclaimable slab or very low memory pressure */ 3110 /* No reclaimable slab or very low memory pressure */
2715 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) 3111 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
2716 break; 3112 break;
2717 3113
2718 /* Freed enough memory */ 3114 /* Freed enough memory */
@@ -2987,6 +3383,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
2987 return 0; 3383 return 0;
2988} 3384}
2989 3385
3386#ifdef CONFIG_NUMA
2990/* 3387/*
2991 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3388 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2992 * a specified node's per zone unevictable lists for evictable pages. 3389 * a specified node's per zone unevictable lists for evictable pages.
@@ -3033,4 +3430,4 @@ void scan_unevictable_unregister_node(struct node *node)
3033{ 3430{
3034 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 3431 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
3035} 3432}
3036 3433#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 355a9e669aaa..20c18b7694b2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -17,6 +17,8 @@
17#include <linux/vmstat.h> 17#include <linux/vmstat.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/math64.h> 19#include <linux/math64.h>
20#include <linux/writeback.h>
21#include <linux/compaction.h>
20 22
21#ifdef CONFIG_VM_EVENT_COUNTERS 23#ifdef CONFIG_VM_EVENT_COUNTERS
22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 24DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -81,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
81 83
82#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
83 85
84static int calculate_threshold(struct zone *zone) 86int calculate_pressure_threshold(struct zone *zone)
87{
88 int threshold;
89 int watermark_distance;
90
91 /*
92 * As vmstats are not up to date, there is drift between the estimated
93 * and real values. For high thresholds and a high number of CPUs, it
94 * is possible for the min watermark to be breached while the estimated
95 * value looks fine. The pressure threshold is a reduced value such
96 * that even the maximum amount of drift will not accidentally breach
97 * the min watermark
98 */
99 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
100 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
101
102 /*
103 * Maximum threshold is 125
104 */
105 threshold = min(125, threshold);
106
107 return threshold;
108}
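
A small sketch of the arithmetic above, with made-up watermark values; num_online_cpus() is replaced by a plain parameter, so this only demonstrates the clamping, not the real per-zone inputs.

    #include <stdio.h>

    /* Reduced per-cpu stat threshold used while kswapd is awake: spread the
     * low-to-min watermark distance across CPUs so that even maximum drift
     * cannot silently breach the min watermark; never below 1, capped at 125. */
    static int pressure_threshold(int low_wmark, int min_wmark, int online_cpus)
    {
            int watermark_distance = low_wmark - min_wmark;
            int threshold = watermark_distance / online_cpus;

            if (threshold < 1)
                    threshold = 1;
            if (threshold > 125)
                    threshold = 125;
            return threshold;
    }

    int main(void)
    {
            printf("%d\n", pressure_threshold(4000, 3000, 8));   /* 125 (capped) */
            printf("%d\n", pressure_threshold(400, 300, 64));    /* 1 (floor) */
            printf("%d\n", pressure_threshold(4000, 3000, 16));  /* 62 */
            return 0;
    }
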
109
110int calculate_normal_threshold(struct zone *zone)
85{ 111{
86 int threshold; 112 int threshold;
87 int mem; /* memory in 128 MB units */ 113 int mem; /* memory in 128 MB units */
@@ -131,7 +157,7 @@ static int calculate_threshold(struct zone *zone)
131/* 157/*
132 * Refresh the thresholds for each zone. 158 * Refresh the thresholds for each zone.
133 */ 159 */
134static void refresh_zone_stat_thresholds(void) 160void refresh_zone_stat_thresholds(void)
135{ 161{
136 struct zone *zone; 162 struct zone *zone;
137 int cpu; 163 int cpu;
@@ -140,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
140 for_each_populated_zone(zone) { 166 for_each_populated_zone(zone) {
141 unsigned long max_drift, tolerate_drift; 167 unsigned long max_drift, tolerate_drift;
142 168
143 threshold = calculate_threshold(zone); 169 threshold = calculate_normal_threshold(zone);
144 170
145 for_each_online_cpu(cpu) 171 for_each_online_cpu(cpu)
146 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 172 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -159,42 +185,50 @@ static void refresh_zone_stat_thresholds(void)
159 } 185 }
160} 186}
161 187
188void set_pgdat_percpu_threshold(pg_data_t *pgdat,
189 int (*calculate_pressure)(struct zone *))
190{
191 struct zone *zone;
192 int cpu;
193 int threshold;
194 int i;
195
196 for (i = 0; i < pgdat->nr_zones; i++) {
197 zone = &pgdat->node_zones[i];
198 if (!zone->percpu_drift_mark)
199 continue;
200
201 threshold = (*calculate_pressure)(zone);
202 for_each_possible_cpu(cpu)
203 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
204 = threshold;
205 }
206}
207
162/* 208/*
163 * For use when we know that interrupts are disabled. 209 * For use when we know that interrupts are disabled.
164 */ 210 */
165void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 211void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
166 int delta) 212 int delta)
167{ 213{
168 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 214 struct per_cpu_pageset __percpu *pcp = zone->pageset;
169 215 s8 __percpu *p = pcp->vm_stat_diff + item;
170 s8 *p = pcp->vm_stat_diff + item;
171 long x; 216 long x;
217 long t;
172 218
173 x = delta + *p; 219 x = delta + __this_cpu_read(*p);
174 220
175 if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { 221 t = __this_cpu_read(pcp->stat_threshold);
222
223 if (unlikely(x > t || x < -t)) {
176 zone_page_state_add(x, zone, item); 224 zone_page_state_add(x, zone, item);
177 x = 0; 225 x = 0;
178 } 226 }
179 *p = x; 227 __this_cpu_write(*p, x);
180} 228}
181EXPORT_SYMBOL(__mod_zone_page_state); 229EXPORT_SYMBOL(__mod_zone_page_state);
182 230
183/* 231/*
184 * For an unknown interrupt state
185 */
186void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
187 int delta)
188{
189 unsigned long flags;
190
191 local_irq_save(flags);
192 __mod_zone_page_state(zone, item, delta);
193 local_irq_restore(flags);
194}
195EXPORT_SYMBOL(mod_zone_page_state);
196
197/*
198 * Optimized increment and decrement functions. 232 * Optimized increment and decrement functions.
199 * 233 *
200 * These are only for a single page and therefore can take a struct page * 234 * These are only for a single page and therefore can take a struct page *
@@ -219,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
219 */ 253 */
220void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 254void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
221{ 255{
222 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 256 struct per_cpu_pageset __percpu *pcp = zone->pageset;
223 s8 *p = pcp->vm_stat_diff + item; 257 s8 __percpu *p = pcp->vm_stat_diff + item;
224 258 s8 v, t;
225 (*p)++;
226 259
227 if (unlikely(*p > pcp->stat_threshold)) { 260 v = __this_cpu_inc_return(*p);
228 int overstep = pcp->stat_threshold / 2; 261 t = __this_cpu_read(pcp->stat_threshold);
262 if (unlikely(v > t)) {
263 s8 overstep = t >> 1;
229 264
230 zone_page_state_add(*p + overstep, zone, item); 265 zone_page_state_add(v + overstep, zone, item);
231 *p = -overstep; 266 __this_cpu_write(*p, -overstep);
232 } 267 }
233} 268}
234 269
@@ -240,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);
240 275
241void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 276void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
242{ 277{
243 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 278 struct per_cpu_pageset __percpu *pcp = zone->pageset;
244 s8 *p = pcp->vm_stat_diff + item; 279 s8 __percpu *p = pcp->vm_stat_diff + item;
280 s8 v, t;
245 281
246 (*p)--; 282 v = __this_cpu_dec_return(*p);
283 t = __this_cpu_read(pcp->stat_threshold);
284 if (unlikely(v < - t)) {
285 s8 overstep = t >> 1;
247 286
248 if (unlikely(*p < - pcp->stat_threshold)) { 287 zone_page_state_add(v - overstep, zone, item);
249 int overstep = pcp->stat_threshold / 2; 288 __this_cpu_write(*p, overstep);
250
251 zone_page_state_add(*p - overstep, zone, item);
252 *p = overstep;
253 } 289 }
254} 290}
255 291
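
The __inc/__dec conversion above keeps the long-standing batching scheme: each CPU accumulates a small signed diff and only folds it into the global counter once the diff crosses the threshold, overstepping by half the threshold so the next fold is pushed further away. Below is a single-threaded userspace model of that folding (all names invented, no per-cpu semantics):

    #include <stdio.h>

    static long global_count;       /* stands in for the zone-wide counter */
    static signed char cpu_diff;    /* stands in for the per-cpu vm_stat_diff */

    /* Fold the per-cpu diff into the global counter once it exceeds the
     * threshold, leaving -overstep behind so increments keep batching. */
    static void counter_inc(signed char threshold)
    {
            signed char v = ++cpu_diff;

            if (v > threshold) {
                    signed char overstep = threshold / 2;

                    global_count += v + overstep;
                    cpu_diff = -overstep;
            }
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 100; i++)
                    counter_inc(32);
            /* global + local diff always equals the true count */
            printf("global=%ld diff=%d total=%ld\n",
                   global_count, cpu_diff, global_count + cpu_diff);
            return 0;
    }
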
@@ -259,6 +295,95 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
259} 295}
260EXPORT_SYMBOL(__dec_zone_page_state); 296EXPORT_SYMBOL(__dec_zone_page_state);
261 297
298#ifdef CONFIG_CMPXCHG_LOCAL
299/*
300 * If we have cmpxchg_local support then we do not need to incur the overhead
301 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
302 *
303 * mod_state() modifies the zone counter state through atomic per cpu
304 * operations.
305 *
307 * Overstep mode specifies how overstep should be handled:
307 * 0 No overstepping
308 * 1 Overstepping half of threshold
309 * -1 Overstepping minus half of threshold
310*/
311static inline void mod_state(struct zone *zone,
312 enum zone_stat_item item, int delta, int overstep_mode)
313{
314 struct per_cpu_pageset __percpu *pcp = zone->pageset;
315 s8 __percpu *p = pcp->vm_stat_diff + item;
316 long o, n, t, z;
317
318 do {
319 z = 0; /* overflow to zone counters */
320
321 /*
322 * The fetching of the stat_threshold is racy. We may apply
323 * a counter threshold to the wrong cpu if we get
324 * rescheduled while executing here. However, the next
325 * counter update will apply the threshold again and
326 * therefore bring the counter under the threshold again.
327 *
328 * Most of the time the thresholds are the same anyway
329 * for all cpus in a zone.
330 */
331 t = this_cpu_read(pcp->stat_threshold);
332
333 o = this_cpu_read(*p);
334 n = delta + o;
335
336 if (n > t || n < -t) {
337 int os = overstep_mode * (t >> 1) ;
338
339 /* Overflow must be added to zone counters */
340 z = n + os;
341 n = -os;
342 }
343 } while (this_cpu_cmpxchg(*p, o, n) != o);
344
345 if (z)
346 zone_page_state_add(z, zone, item);
347}
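
A hedged sketch of the lockless update loop above, using GCC's __sync_val_compare_and_swap on an ordinary variable in place of this_cpu_cmpxchg; it only models the retry-and-spill logic, not the per-cpu addressing or preemption behaviour, and mod_counter() is an invented name.

    #include <stdio.h>

    static long global_count;
    static long cpu_diff;           /* models one CPU's vm_stat_diff slot */

    /* Retry-with-cmpxchg update: compute the new diff, publish it only if
     * nobody changed the slot in between, and spill to the global counter
     * when the threshold is crossed (overstep_mode as described above). */
    static void mod_counter(int delta, long threshold, int overstep_mode)
    {
            long o, n, z;

            do {
                    z = 0;
                    o = cpu_diff;
                    n = delta + o;
                    if (n > threshold || n < -threshold) {
                            long os = overstep_mode * (threshold / 2);

                            z = n + os;     /* overflow goes to the global counter */
                            n = -os;
                    }
            } while (__sync_val_compare_and_swap(&cpu_diff, o, n) != o);

            if (z)
                    global_count += z;
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 1000; i++)
                    mod_counter(+1, 125, 1);
            printf("global=%ld diff=%ld total=%ld\n",
                   global_count, cpu_diff, global_count + cpu_diff);
            return 0;
    }
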
348
349void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
350 int delta)
351{
352 mod_state(zone, item, delta, 0);
353}
354EXPORT_SYMBOL(mod_zone_page_state);
355
356void inc_zone_state(struct zone *zone, enum zone_stat_item item)
357{
358 mod_state(zone, item, 1, 1);
359}
360
361void inc_zone_page_state(struct page *page, enum zone_stat_item item)
362{
363 mod_state(page_zone(page), item, 1, 1);
364}
365EXPORT_SYMBOL(inc_zone_page_state);
366
367void dec_zone_page_state(struct page *page, enum zone_stat_item item)
368{
369 mod_state(page_zone(page), item, -1, -1);
370}
371EXPORT_SYMBOL(dec_zone_page_state);
372#else
373/*
374 * Use interrupt disable to serialize counter updates
375 */
376void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
377 int delta)
378{
379 unsigned long flags;
380
381 local_irq_save(flags);
382 __mod_zone_page_state(zone, item, delta);
383 local_irq_restore(flags);
384}
385EXPORT_SYMBOL(mod_zone_page_state);
386
262void inc_zone_state(struct zone *zone, enum zone_stat_item item) 387void inc_zone_state(struct zone *zone, enum zone_stat_item item)
263{ 388{
264 unsigned long flags; 389 unsigned long flags;
@@ -289,6 +414,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
289 local_irq_restore(flags); 414 local_irq_restore(flags);
290} 415}
291EXPORT_SYMBOL(dec_zone_page_state); 416EXPORT_SYMBOL(dec_zone_page_state);
417#endif
292 418
293/* 419/*
294 * Update the zone counters for one cpu. 420 * Update the zone counters for one cpu.
@@ -377,8 +503,12 @@ void refresh_cpu_vm_stats(int cpu)
377 * z = the zone from which the allocation occurred. 503 * z = the zone from which the allocation occurred.
378 * 504 *
379 * Must be called with interrupts disabled. 505 * Must be called with interrupts disabled.
506 *
507 * When __GFP_OTHER_NODE is set assume the node of the preferred
508 * zone is the local node. This is useful for daemons that allocate
509 * memory on behalf of other processes.
380 */ 510 */
381void zone_statistics(struct zone *preferred_zone, struct zone *z) 511void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
382{ 512{
383 if (z->zone_pgdat == preferred_zone->zone_pgdat) { 513 if (z->zone_pgdat == preferred_zone->zone_pgdat) {
384 __inc_zone_state(z, NUMA_HIT); 514 __inc_zone_state(z, NUMA_HIT);
@@ -386,7 +516,8 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
386 __inc_zone_state(z, NUMA_MISS); 516 __inc_zone_state(z, NUMA_MISS);
387 __inc_zone_state(preferred_zone, NUMA_FOREIGN); 517 __inc_zone_state(preferred_zone, NUMA_FOREIGN);
388 } 518 }
389 if (z->node == numa_node_id()) 519 if (z->node == ((flags & __GFP_OTHER_NODE) ?
520 preferred_zone->node : numa_node_id()))
390 __inc_zone_state(z, NUMA_LOCAL); 521 __inc_zone_state(z, NUMA_LOCAL);
391 else 522 else
392 __inc_zone_state(z, NUMA_OTHER); 523 __inc_zone_state(z, NUMA_OTHER);
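
A compact sketch of the hit/miss/local/other classification after the __GFP_OTHER_NODE change; node ids and the gfp flag are plain ints here, the flag bit value is made up, and the counter names are printed rather than incremented per zone.

    #include <stdio.h>

    #define GFP_OTHER_NODE 0x1      /* stand-in for the real __GFP_OTHER_NODE bit */

    /*
     * Classify one allocation: hit/miss depends on whether the zone we got
     * belongs to the preferred node, local/other on whether it belongs to
     * the node doing the work - which, with the other-node flag set, is
     * taken to be the preferred node rather than the calling CPU's node.
     */
    static void classify(int preferred_node, int zone_node, int current_node,
                         unsigned flags)
    {
            int local_node = (flags & GFP_OTHER_NODE) ? preferred_node
                                                      : current_node;

            printf("%s, %s\n",
                   zone_node == preferred_node ? "NUMA_HIT" : "NUMA_MISS",
                   zone_node == local_node ? "NUMA_LOCAL" : "NUMA_OTHER");
    }

    int main(void)
    {
            /* kswapd on node 0 allocating on behalf of a task on node 1 */
            classify(1, 1, 0, 0);              /* NUMA_HIT, NUMA_OTHER */
            classify(1, 1, 0, GFP_OTHER_NODE); /* NUMA_HIT, NUMA_LOCAL */
            return 0;
    }
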
@@ -394,6 +525,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
394#endif 525#endif
395 526
396#ifdef CONFIG_COMPACTION 527#ifdef CONFIG_COMPACTION
528
397struct contig_page_info { 529struct contig_page_info {
398 unsigned long free_pages; 530 unsigned long free_pages;
399 unsigned long free_blocks_total; 531 unsigned long free_blocks_total;
@@ -527,6 +659,138 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
527} 659}
528#endif 660#endif
529 661
662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS)
663#ifdef CONFIG_ZONE_DMA
664#define TEXT_FOR_DMA(xx) xx "_dma",
665#else
666#define TEXT_FOR_DMA(xx)
667#endif
668
669#ifdef CONFIG_ZONE_DMA32
670#define TEXT_FOR_DMA32(xx) xx "_dma32",
671#else
672#define TEXT_FOR_DMA32(xx)
673#endif
674
675#ifdef CONFIG_HIGHMEM
676#define TEXT_FOR_HIGHMEM(xx) xx "_high",
677#else
678#define TEXT_FOR_HIGHMEM(xx)
679#endif
680
681#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
682 TEXT_FOR_HIGHMEM(xx) xx "_movable",
683
684const char * const vmstat_text[] = {
685 /* Zoned VM counters */
686 "nr_free_pages",
687 "nr_inactive_anon",
688 "nr_active_anon",
689 "nr_inactive_file",
690 "nr_active_file",
691 "nr_unevictable",
692 "nr_mlock",
693 "nr_anon_pages",
694 "nr_mapped",
695 "nr_file_pages",
696 "nr_dirty",
697 "nr_writeback",
698 "nr_slab_reclaimable",
699 "nr_slab_unreclaimable",
700 "nr_page_table_pages",
701 "nr_kernel_stack",
702 "nr_unstable",
703 "nr_bounce",
704 "nr_vmscan_write",
705 "nr_writeback_temp",
706 "nr_isolated_anon",
707 "nr_isolated_file",
708 "nr_shmem",
709 "nr_dirtied",
710 "nr_written",
711
712#ifdef CONFIG_NUMA
713 "numa_hit",
714 "numa_miss",
715 "numa_foreign",
716 "numa_interleave",
717 "numa_local",
718 "numa_other",
719#endif
720 "nr_anon_transparent_hugepages",
721 "nr_dirty_threshold",
722 "nr_dirty_background_threshold",
723
724#ifdef CONFIG_VM_EVENT_COUNTERS
725 "pgpgin",
726 "pgpgout",
727 "pswpin",
728 "pswpout",
729
730 TEXTS_FOR_ZONES("pgalloc")
731
732 "pgfree",
733 "pgactivate",
734 "pgdeactivate",
735
736 "pgfault",
737 "pgmajfault",
738
739 TEXTS_FOR_ZONES("pgrefill")
740 TEXTS_FOR_ZONES("pgsteal")
741 TEXTS_FOR_ZONES("pgscan_kswapd")
742 TEXTS_FOR_ZONES("pgscan_direct")
743
744#ifdef CONFIG_NUMA
745 "zone_reclaim_failed",
746#endif
747 "pginodesteal",
748 "slabs_scanned",
749 "kswapd_steal",
750 "kswapd_inodesteal",
751 "kswapd_low_wmark_hit_quickly",
752 "kswapd_high_wmark_hit_quickly",
753 "kswapd_skip_congestion_wait",
754 "pageoutrun",
755 "allocstall",
756
757 "pgrotated",
758
759#ifdef CONFIG_COMPACTION
760 "compact_blocks_moved",
761 "compact_pages_moved",
762 "compact_pagemigrate_failed",
763 "compact_stall",
764 "compact_fail",
765 "compact_success",
766#endif
767
768#ifdef CONFIG_HUGETLB_PAGE
769 "htlb_buddy_alloc_success",
770 "htlb_buddy_alloc_fail",
771#endif
772 "unevictable_pgs_culled",
773 "unevictable_pgs_scanned",
774 "unevictable_pgs_rescued",
775 "unevictable_pgs_mlocked",
776 "unevictable_pgs_munlocked",
777 "unevictable_pgs_cleared",
778 "unevictable_pgs_stranded",
779 "unevictable_pgs_mlockfreed",
780
781#ifdef CONFIG_TRANSPARENT_HUGEPAGE
782 "thp_fault_alloc",
783 "thp_fault_fallback",
784 "thp_collapse_alloc",
785 "thp_collapse_alloc_failed",
786 "thp_split",
787#endif
788
789#endif /* CONFIG_VM_EVENT_COUNTERS */
790};
791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */
792
793
530#ifdef CONFIG_PROC_FS 794#ifdef CONFIG_PROC_FS
531static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 795static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
532 struct zone *zone) 796 struct zone *zone)
@@ -699,120 +963,6 @@ static const struct file_operations pagetypeinfo_file_ops = {
699 .release = seq_release, 963 .release = seq_release,
700}; 964};
701 965
702#ifdef CONFIG_ZONE_DMA
703#define TEXT_FOR_DMA(xx) xx "_dma",
704#else
705#define TEXT_FOR_DMA(xx)
706#endif
707
708#ifdef CONFIG_ZONE_DMA32
709#define TEXT_FOR_DMA32(xx) xx "_dma32",
710#else
711#define TEXT_FOR_DMA32(xx)
712#endif
713
714#ifdef CONFIG_HIGHMEM
715#define TEXT_FOR_HIGHMEM(xx) xx "_high",
716#else
717#define TEXT_FOR_HIGHMEM(xx)
718#endif
719
720#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
721 TEXT_FOR_HIGHMEM(xx) xx "_movable",
722
723static const char * const vmstat_text[] = {
724 /* Zoned VM counters */
725 "nr_free_pages",
726 "nr_inactive_anon",
727 "nr_active_anon",
728 "nr_inactive_file",
729 "nr_active_file",
730 "nr_unevictable",
731 "nr_mlock",
732 "nr_anon_pages",
733 "nr_mapped",
734 "nr_file_pages",
735 "nr_dirty",
736 "nr_writeback",
737 "nr_slab_reclaimable",
738 "nr_slab_unreclaimable",
739 "nr_page_table_pages",
740 "nr_kernel_stack",
741 "nr_unstable",
742 "nr_bounce",
743 "nr_vmscan_write",
744 "nr_writeback_temp",
745 "nr_isolated_anon",
746 "nr_isolated_file",
747 "nr_shmem",
748#ifdef CONFIG_NUMA
749 "numa_hit",
750 "numa_miss",
751 "numa_foreign",
752 "numa_interleave",
753 "numa_local",
754 "numa_other",
755#endif
756
757#ifdef CONFIG_VM_EVENT_COUNTERS
758 "pgpgin",
759 "pgpgout",
760 "pswpin",
761 "pswpout",
762
763 TEXTS_FOR_ZONES("pgalloc")
764
765 "pgfree",
766 "pgactivate",
767 "pgdeactivate",
768
769 "pgfault",
770 "pgmajfault",
771
772 TEXTS_FOR_ZONES("pgrefill")
773 TEXTS_FOR_ZONES("pgsteal")
774 TEXTS_FOR_ZONES("pgscan_kswapd")
775 TEXTS_FOR_ZONES("pgscan_direct")
776
777#ifdef CONFIG_NUMA
778 "zone_reclaim_failed",
779#endif
780 "pginodesteal",
781 "slabs_scanned",
782 "kswapd_steal",
783 "kswapd_inodesteal",
784 "kswapd_low_wmark_hit_quickly",
785 "kswapd_high_wmark_hit_quickly",
786 "kswapd_skip_congestion_wait",
787 "pageoutrun",
788 "allocstall",
789
790 "pgrotated",
791
792#ifdef CONFIG_COMPACTION
793 "compact_blocks_moved",
794 "compact_pages_moved",
795 "compact_pagemigrate_failed",
796 "compact_stall",
797 "compact_fail",
798 "compact_success",
799#endif
800
801#ifdef CONFIG_HUGETLB_PAGE
802 "htlb_buddy_alloc_success",
803 "htlb_buddy_alloc_fail",
804#endif
805 "unevictable_pgs_culled",
806 "unevictable_pgs_scanned",
807 "unevictable_pgs_rescued",
808 "unevictable_pgs_mlocked",
809 "unevictable_pgs_munlocked",
810 "unevictable_pgs_cleared",
811 "unevictable_pgs_stranded",
812 "unevictable_pgs_mlockfreed",
813#endif
814};
815
816static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 966static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
817 struct zone *zone) 967 struct zone *zone)
818{ 968{
@@ -826,7 +976,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
826 "\n scanned %lu" 976 "\n scanned %lu"
827 "\n spanned %lu" 977 "\n spanned %lu"
828 "\n present %lu", 978 "\n present %lu",
829 zone_nr_free_pages(zone), 979 zone_page_state(zone, NR_FREE_PAGES),
830 min_wmark_pages(zone), 980 min_wmark_pages(zone),
831 low_wmark_pages(zone), 981 low_wmark_pages(zone),
832 high_wmark_pages(zone), 982 high_wmark_pages(zone),
@@ -904,36 +1054,44 @@ static const struct file_operations proc_zoneinfo_file_operations = {
904 .release = seq_release, 1054 .release = seq_release,
905}; 1055};
906 1056
1057enum writeback_stat_item {
1058 NR_DIRTY_THRESHOLD,
1059 NR_DIRTY_BG_THRESHOLD,
1060 NR_VM_WRITEBACK_STAT_ITEMS,
1061};
1062
907static void *vmstat_start(struct seq_file *m, loff_t *pos) 1063static void *vmstat_start(struct seq_file *m, loff_t *pos)
908{ 1064{
909 unsigned long *v; 1065 unsigned long *v;
910#ifdef CONFIG_VM_EVENT_COUNTERS 1066 int i, stat_items_size;
911 unsigned long *e;
912#endif
913 int i;
914 1067
915 if (*pos >= ARRAY_SIZE(vmstat_text)) 1068 if (*pos >= ARRAY_SIZE(vmstat_text))
916 return NULL; 1069 return NULL;
1070 stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
1071 NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
917 1072
918#ifdef CONFIG_VM_EVENT_COUNTERS 1073#ifdef CONFIG_VM_EVENT_COUNTERS
919 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) 1074 stat_items_size += sizeof(struct vm_event_state);
920 + sizeof(struct vm_event_state), GFP_KERNEL);
921#else
922 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
923 GFP_KERNEL);
924#endif 1075#endif
1076
1077 v = kmalloc(stat_items_size, GFP_KERNEL);
925 m->private = v; 1078 m->private = v;
926 if (!v) 1079 if (!v)
927 return ERR_PTR(-ENOMEM); 1080 return ERR_PTR(-ENOMEM);
928 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1081 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
929 v[i] = global_page_state(i); 1082 v[i] = global_page_state(i);
1083 v += NR_VM_ZONE_STAT_ITEMS;
1084
1085 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1086 v + NR_DIRTY_THRESHOLD);
1087 v += NR_VM_WRITEBACK_STAT_ITEMS;
1088
930#ifdef CONFIG_VM_EVENT_COUNTERS 1089#ifdef CONFIG_VM_EVENT_COUNTERS
931 e = v + NR_VM_ZONE_STAT_ITEMS; 1090 all_vm_events(v);
932 all_vm_events(e); 1091 v[PGPGIN] /= 2; /* sectors -> kbytes */
933 e[PGPGIN] /= 2; /* sectors -> kbytes */ 1092 v[PGPGOUT] /= 2;
934 e[PGPGOUT] /= 2;
935#endif 1093#endif
936 return v + *pos; 1094 return (unsigned long *)m->private + *pos;
937} 1095}
938 1096
939static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 1097static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
@@ -1017,7 +1175,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
1017 break; 1175 break;
1018 case CPU_DOWN_PREPARE: 1176 case CPU_DOWN_PREPARE:
1019 case CPU_DOWN_PREPARE_FROZEN: 1177 case CPU_DOWN_PREPARE_FROZEN:
1020 cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); 1178 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1021 per_cpu(vmstat_work, cpu).work.func = NULL; 1179 per_cpu(vmstat_work, cpu).work.func = NULL;
1022 break; 1180 break;
1023 case CPU_DOWN_FAILED: 1181 case CPU_DOWN_FAILED:
@@ -1043,7 +1201,6 @@ static int __init setup_vmstat(void)
1043#ifdef CONFIG_SMP 1201#ifdef CONFIG_SMP
1044 int cpu; 1202 int cpu;
1045 1203
1046 refresh_zone_stat_thresholds();
1047 register_cpu_notifier(&vmstat_notifier); 1204 register_cpu_notifier(&vmstat_notifier);
1048 1205
1049 for_each_online_cpu(cpu) 1206 for_each_online_cpu(cpu)